Example #1

import random
import re
import time
from collections import OrderedDict
from datetime import datetime

import pandas
from lxml import etree
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist

# `Utils` below (Utils.get_num / Utils.get_date) is a project-local helper
# whose source is not shown here.

class WeiboSpider(object):
    page_count = 0  # number of posts scraped from the current page
    all_count = 0  # total number of posts scraped
    save_data = OrderedDict({  # scraped data, keyed by output column name
        "昵称": [],  # nickname
        "微博正文": [],  # post text
        "微博链接": [],  # post URL
        "时间": [],  # timestamp
        "收藏数": [],  # favorites
        "转发数": [],  # reposts
        "评论数": [],  # comments
        "点赞数": [],  # likes
        "设备": []  # client device
    })

    xpath_dict = {  # XPath expressions used to parse each field
        '昵称': '//div[@class="info"]//a[@class="name"]/text()',
        '微博正文':
        '//div[@class="content"]/p[@node-type="feed_list_content"]//text()',
        '微博链接': '//div[@class="content"]/p[@class="from"]/a[1]/@href',
        '时间': '//div[@class="content"]/p[@class="from"]/a[1]/text()',
        '收藏数': '//a[@action-type="feed_list_favorite"]/text()',
        '转发数': '//a[@action-type="feed_list_forward"]/text()',
        '评论数': '//a[@action-type="feed_list_comment"]/text()',
        '点赞数':
        '//div[@class="card-act"]//a[@action-type="feed_list_like"]//em/text()',
        '设备': '//div[@class="content"]/p/a[@rel="nofollow"]/text()'
    }

    def __init__(self,
                 keyword,
                 start_time,
                 end_time,
                 sleep_time=10,
                 username=None,
                 password=None):
        self.username = username  # Weibo username
        self.password = password  # Weibo password

        self.browser = None
        self.browser_name = "firefox"  # browser name
        self.driver_path = "../driver/firefoxdriver.exe"  # path to the browser driver

        self.base_url = "https://s.weibo.com/"  # Weibo search home page
        self.search_url = 'https://s.weibo.com/weibo/{keyword}' \
                          '&timescope=custom:{start_time}:{end_time}&refer=g'  # search results URL template
        self.keyword = keyword  # search keyword
        self.sleep_time = sleep_time  # base pause between "next page" clicks (seconds)
        self.start_time = start_time  # start of the search time range
        self.end_time = end_time  # end of the search time range

        self.base_save_path = '../files/'  # directory for output files
        self.save_file_name = self.keyword + self.start_time + "~" + self.end_time  # output file name

    def refactor_date(self, start_time, end_time):  # normalize the search start/end times
        self.start_time = datetime.strptime(
            start_time, "%Y-%m-%d-%H").strftime("%Y-%m-%d-%H")
        if end_time:
            self.end_time = datetime.strptime(
                end_time, "%Y-%m-%d-%H").strftime("%Y-%m-%d-%H")
        else:
            self.end_time = datetime.now().strftime('%Y-%m-%d-%H')

    def get_search_url(self):  # build the search URL
        self.refactor_date(self.start_time, self.end_time)  # normalize the input times into the expected format

        return self.search_url.format(keyword=self.keyword,
                                      start_time=self.start_time,
                                      end_time=self.end_time)

    def login(self):
        """
        用于登录微博
        """
        self.browser = Browser(driver_name=self.browser_name,
                               executable_path=self.driver_path,
                               service_log_path='../files/log.log')  # launch the browser

        self.browser.visit(self.base_url)  # open the Weibo search page
        self.browser.click_link_by_text('登录')  # click the "登录" (log in) link

        # pre-fill the username and password
        if self.username is not None:
            self.browser.fill("username", self.username)
        if self.password is not None:
            self.browser.fill("password", self.password)

        print("请在打开的浏览器中登录........")

        time.sleep(2)  # 暂停两秒,等待浏览器加载完成
        logining_url = self.browser.url  # 获取正在登录时的 url

        # 防止网络不好时获取不到正在登录时的 url
        while logining_url == self.base_url:
            time.sleep(2)
            logining_url = self.browser.url

        # 通过验证 url 保证已经登录
        while 1:
            if self.browser.url != logining_url:
                break
            time.sleep(2)
        print("已成功登录,开始抓取信息.......")

    def search(self):
        """
        通过构造的搜索 url,跳转到搜索结果页面
        """

        self.browser.visit(self.get_search_url())

    def get_card_data(self, card, xpath_dict: dict):
        """
        Extract the data of a single post.
        """

        etree_html = etree.HTML(card.html)
        number = ['收藏数', '转发数', '评论数', '点赞数']  # numeric fields, default '0'

        self.page_count += 1  # count posts scraped on the current page

        for key, xpath in xpath_dict.items():
            data = etree_html.xpath(xpath)

            if data:
                if key in number:
                    self.save_data[key].append(Utils.get_num(data[0]))

                elif key == '时间':
                    self.save_data[key].append(
                        Utils.get_date(data[0]).strftime('%Y-%m-%d'))

                elif key == '微博正文':
                    content = ''.join(data).replace(' ', '').replace('\n', '')
                    self.save_data[key].append(content)

                else:
                    self.save_data[key].append(data[0])

            else:
                if key in number:
                    self.save_data[key].append('0')

                else:
                    self.save_data[key].append('')

    def download_data(self):
        self.browser.execute_script(
            "window.scrollTo(0,document.body.scrollHeight)")  # scroll to the bottom of the page

        try:
            self.browser.click_link_by_text('查看全部搜索结果')  # "view all search results"
        except ElementDoesNotExist:
            pass

        page_index = 1  # current page number
        while page_index <= 50:  # Weibo search results are capped at 50 pages

            # read the real page number from the URL
            try:
                # clicking "next page" past the last page sometimes loops back
                # to page one; stop if the URL's page number falls behind our counter
                real_page = re.findall(r'page=(\d+)', self.browser.url)
                if real_page:
                    real_page = int(real_page[0])
                else:
                    real_page = 1
                if real_page < page_index:
                    break

                print('Scraping page %s: ' % page_index, end='')
                self.browser.execute_script(
                    "window.scrollTo(0,document.body.scrollHeight)")  # scroll to the bottom of the page

                cards = self.browser.find_by_xpath(
                    '//div[@class="card"]')  # collect every post card on the page

            except KeyboardInterrupt:  # press Ctrl+C to stop early; data scraped so far is still saved
                print('Scraping interrupted, saving collected data.....')
                break

            for card in cards:  # extract data from every card
                try:
                    self.get_card_data(card, self.xpath_dict)
                except KeyboardInterrupt:
                    pass

            try:
                print('Scraped %s posts on this page, waiting.....' % self.page_count)
                self.all_count += self.page_count  # add to the running total
                self.page_count = 0

                # randomize the sleep around the configured interval
                sleep_time = random.randint(self.sleep_time,
                                            self.sleep_time + 5)

                time.sleep(sleep_time)  # mimic a human reading the page

                try:
                    self.browser.click_link_by_text('下一页')  # click "下一页" (next page)
                    page_index += 1

                except ElementDoesNotExist:
                    break

            except KeyboardInterrupt:
                print('Scraping interrupted, saving collected data.....')
                break

    def save(self):
        """
        保存数据
        """

        print('--------------------------------------------')
        print('Scraped %s posts in total' % self.all_count)

        try:
            data = pandas.DataFrame(self.save_data)
            file_path = self.base_save_path + self.save_file_name + '.xlsx'

            data.to_excel(file_path, index=False)  # write the data to an Excel file
            print('File saved...', end='\n\n')

        except Exception as e:
            print('Failed to save the file!!!', end='\n\n')
            print(e)

    def close(self):
        self.browser.quit()  # close the browser

    @staticmethod
    def test():
        """
        用于测试的方法,实际运行时不执行
        """

        browser = Browser(executable_path="../driver/firefox.exe")
        browser.visit(
            "https://s.weibo.com/weibo/%25E5%25B0%25B1%25E5%25"
            "BC%2580%25E5%25A7%258B%25E5%25A4%25A7%25E5%25B9%2585?topnav=1&wvr=6&b=1"
        )

        cards = browser.find_by_xpath('//div[@class="card"]')
        for c in cards:
            etree_html = etree.HTML(c.html)
            a = etree_html.xpath(
                '//div[@class="card-act"]//a[@title="赞"]/em/text()')

            print(a[0])
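
A minimal sketch of how this class might be driven end to end; the keyword, time range, and credentials below are placeholder values, not part of the original example:

if __name__ == '__main__':
    spider = WeiboSpider(keyword='天气',  # placeholder keyword
                         start_time='2020-01-01-0',
                         end_time='2020-01-07-0',
                         sleep_time=10,
                         username='user@example.com',  # placeholder credentials
                         password='secret')
    spider.login()          # complete the login in the opened browser
    spider.search()         # jump to the search results page
    spider.download_data()  # page through up to 50 result pages
    spider.save()           # write ../files/<keyword><start>~<end>.xlsx
    spider.close()
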
Example #2

# Fragment: assumes a splinter `browser` and a `citys` list of
# (value, label) pairs are already set up (see Example #4).
    for city in citys:
        # keep retrying until the city dropdown can be selected
        while True:
            try:
                while not browser.is_element_present_by_css(
                        '.talent_rankinglist .tab .cur a', wait_time=10):
                    print('prov and city page reload')
                    browser.reload()
                browser.select('scity', city[0])
                break
            except Exception:
                continue
        while not browser.is_element_present_by_css(
                '.talent_rankinglist .tab .cur a', wait_time=10):
            print('prov and city page reload')
            browser.reload()
        browser.click_link_by_text('查找')  # click the "查找" (search) button
        while not browser.is_element_present_by_css(
                '.talent_rankinglist .tab .cur a', wait_time=10):
            print('prov and city page reload')
            browser.reload()

        baseurl = browser.url
        for i in range(1, 11):  # walk result pages 1-10
            url = baseurl + '&page=' + str(i) + '&'
            print(url)
            # retry the visit until it succeeds
            while True:
                try:
                    browser.visit(url)
                    break
                except Exception:
                    continue
Example #3

from splinter import Browser
from Products.PloneTestCase import PloneTestCase as ptc
from Products.PloneTestCase.PloneTestCase import portal_owner, default_password
from Testing.ZopeTestCase.utils import startZServer


class TestCase(ptc.PloneTestCase):

    def __init__(self, *args, **kwargs):
        super(TestCase, self).__init__(*args, **kwargs)
        self.browser = Browser(driver_name='firefox')
        self.host, self.port = startZServer()

    def afterSetUp(self):
        self.browser.visit('http://%s:%s/plone' % (self.host, self.port))

    def beforeTearDown(self):
        self.browser.quit()

    def portal_visit(self, url):
        self.browser.visit('http://%s:%s/plone/%s' % (self.host, self.port, url))

    def portal_home(self):
        self.browser.visit('http://%s:%s/plone/' % (self.host, self.port))

    def portal_login(self, user, password):
        self.portal_visit('login_form')
        self.browser.fill('__ac_name', user)
        self.browser.fill('__ac_password', password)
        self.browser.find_by_name('submit').first.click()

    def portal_login_as_owner(self):
        self.portal_login(user=portal_owner, password=default_password)

    def portal_logout(self):
        self.portal_visit('logout')

    def portal_search(self, search_word):
        self.browser.fill('SearchableText', search_word)
        self.browser.find_by_css('.searchButton').first.click()

    def portal_navigate_submenu(self, option):
        self.browser.find_by_xpath("//li[contains(@id, 'contentview')]/a[text()='%s']" % option).first.click()

    def portal_click_a_personaltool(self, personaltool):
        self.browser.click_link_by_href('http://%s:%s/plone/dashboard' % (self.host, self.port))
        self.browser.click_link_by_text(personaltool)

    def portal_add_user(self, fullname, username, email, password,
                        as_manager=False):
        self.portal_click_a_personaltool('Site Setup')
        self.browser.click_link_by_text('Users and Groups')
        self.browser.find_by_name('form.button.AddUser').first.click()
        self.browser.fill('form.fullname', fullname)
        self.browser.fill('form.username', username)
        self.browser.fill('form.email', email)
        self.browser.fill('form.password', password)
        self.browser.fill('form.password_ctl', password)
        if as_manager:
            self.browser.find_by_id('form.groups.0').first.click()
        self.browser.find_by_id('form.actions.register').first.click()

    def portal_add_user_as_manager(self, fullname, username, email, password):
        self.portal_add_user(fullname, username, email, password,
                             as_manager=True)

    def portal_change_user_role(self, username, new_role):
        self.portal_click_a_personaltool('Site Setup')
        self.browser.click_link_by_text('Users and Groups')
        self.browser.find_by_xpath("//tr[*/input[@value='%s']]//input[@value='%s']" % (username, new_role)).first.click()
        self.browser.find_by_name('form.button.Modify').first.click()

    def portal_click_enable_content_types(self):
        self.browser.find_by_css('a[title="Add new items inside this item"]').first.click()

    def portal_add_content_type(self, content_type):
        self.portal_click_enable_content_types()
        self.browser.click_link_by_text(content_type)

    def portal_click_content_item_action(self):
        self.browser.find_by_css('a[title="Actions for the current content item"]').first.click()

    def portal_add_item_action(self, action):
        self.portal_click_content_item_action()
        self.browser.click_link_by_text(action)

    def portal_list_states(self):
        self.browser.find_by_css('a[title="Change the state of this item"]').first.click()

    def portal_modify_state_to(self, state):
        self.portal_list_states()
        self.browser.click_link_by_text(state)
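
A hypothetical test built on these helpers might look like the following; the test name and the 'Page'/'Publish' link texts are illustrative, not taken from the example above:

class TestDocumentWorkflow(TestCase):

    def test_publish_page(self):
        self.portal_login_as_owner()
        self.portal_add_content_type('Page')    # add a new page
        self.portal_modify_state_to('Publish')  # publish it from the state menu
        self.portal_logout()
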
Example #4

# Fragment: assumes a splinter `browser` already on the search page and an
# initial `soup = BeautifulSoup(browser.html, 'html.parser')` from bs4.
    citys = [[j.get('value'), j.string] for j in soup.find_all('option')]  # (value, label) per city

    for city in citys:
        # keep retrying until the city dropdown can be selected
        while True:
            try:
                while not browser.is_element_present_by_css('.talent_rankinglist .tab .cur a', wait_time=10):
                    print('prov and city page reload')
                    browser.reload()
                browser.select('scity', city[0])
                break
            except Exception:
                continue
        while not browser.is_element_present_by_css('.talent_rankinglist .tab .cur a', wait_time=10):
            print('prov and city page reload')
            browser.reload()
        browser.click_link_by_text('查找')  # click the "查找" (search) button
        while not browser.is_element_present_by_css('.talent_rankinglist .tab .cur a', wait_time=10):
            print('prov and city page reload')
            browser.reload()

        baseurl = browser.url
        for i in range(1, 11):  # walk result pages 1-10
            url = baseurl + '&page=' + str(i) + '&'
            print(url)
            # retry the visit until it succeeds
            while True:
                try:
                    browser.visit(url)
                    break
                except Exception:
                    continue
            soup = BeautifulSoup(browser.html, 'html.parser')  # re-parse the loaded page
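
The reload-until-present and retry-until-success patterns above repeat several times; a small sketch of how they could be factored into helpers (the helper names are my own, the splinter calls are real):

def reload_until_present(browser, css, wait_time=10):
    # reload the page until the given CSS selector shows up
    while not browser.is_element_present_by_css(css, wait_time=wait_time):
        print('prov and city page reload')
        browser.reload()


def retry(action):
    # repeat an action until it completes without raising
    while True:
        try:
            return action()
        except Exception:
            continue

With those, the inner loops collapse to reload_until_present(browser, '.talent_rankinglist .tab .cur a') and retry(lambda: browser.visit(url)).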