class WeiboSpider(object):
    """Scrape Weibo search results (s.weibo.com) for a keyword inside a
    time window using a splinter-driven browser, then save the collected
    posts to an Excel file.

    Typical flow: login() -> search() -> download_data() -> save() -> close().
    """

    # Class-level defaults.  __init__ below creates per-instance copies so
    # separate spiders never share counters or data buffers (the original
    # code mutated these shared class attributes in place — a latent bug as
    # soon as two spiders live in one process).
    page_count = 0  # posts scraped from the current result page
    all_count = 0   # posts scraped in total
    save_data = OrderedDict({  # scraped data: column header -> value list
        "昵称": [],      # nickname
        "微博正文": [],  # post body text
        "微博链接": [],  # post link
        "时间": [],      # post time
        "收藏数": [],    # favourite count
        "转发数": [],    # repost count
        "评论数": [],    # comment count
        "点赞数": [],    # like count
        "设备": []       # posting device/client
    })
    xpath_dict = {  # XPaths used to pull each field out of a result card
        '昵称': '//div[@class="info"]//a[@class="name"]/text()',
        '微博正文': '//div[@class="content"]/p[@node-type="feed_list_content"]//text()',
        '微博链接': '//div[@class="content"]/p[@class="from"]/a[1]/@href',
        '时间': '//div[@class="content"]/p[@class="from"]/a[1]/text()',
        '收藏数': '//a[@action-type="feed_list_favorite"]/text()',
        '转发数': '//a[@action-type="feed_list_forward"]/text()',
        '评论数': '//a[@action-type="feed_list_comment"]/text()',
        '点赞数': '//div[@class="card-act"]//a[@action-type="feed_list_like"]//em/text()',
        '设备': '//div[@class="content"]/p/a[@rel="nofollow"]/text()'
    }

    def __init__(self, keyword, start_time, end_time, sleep_time=10,
                 username=None, password=None):
        """
        :param keyword: search keyword
        :param start_time: start of the search window, "%Y-%m-%d-%H"
        :param end_time: end of the search window, "%Y-%m-%d-%H";
                         a falsy value means "now" (resolved in refactor_date)
        :param sleep_time: base delay (seconds) between page turns
        :param username: Weibo user name (optional; pre-fills the login form)
        :param password: Weibo password (optional; pre-fills the login form)
        """
        self.username = username
        self.password = password
        self.browser = None               # splinter Browser, created in login()
        self.browser_name = "firefox"
        self.driver_path = "../driver/firefoxdriver.exe"  # webdriver binary
        self.base_url = "https://s.weibo.com/"            # Weibo search home page
        # BUGFIX: the original template contained '×cope=' — an HTML-entity
        # mangling of '&timescope=' — which produced an unusable search URL.
        self.search_url = 'https://s.weibo.com/weibo/{keyword}' \
                          '&timescope=custom:{start_time}:{end_time}&refer=g'
        self.keyword = keyword
        self.sleep_time = sleep_time
        self.start_time = start_time
        self.end_time = end_time
        self.base_sava_path = '../files/'  # output directory (attr name kept for compat)
        self.save_file_name = self.keyword + self.start_time + "~" + self.end_time
        # BUGFIX: per-instance counters and buffers (see class-attribute note).
        self.page_count = 0
        self.all_count = 0
        self.save_data = OrderedDict((column, []) for column in self.save_data)

    def refactor_date(self, start_time, end_time):
        """Validate and normalise the search window ("%Y-%m-%d-%H" strings);
        an empty end_time means "now"."""
        self.start_time = datetime.strptime(
            start_time, "%Y-%m-%d-%H").strftime("%Y-%m-%d-%H")
        if end_time:
            self.end_time = datetime.strptime(
                end_time, "%Y-%m-%d-%H").strftime("%Y-%m-%d-%H")
        else:
            self.end_time = datetime.now().strftime('%Y-%m-%d-%H')

    def get_search_url(self):
        """Build and return the search-results URL for the configured
        keyword and time window."""
        self.refactor_date(self.start_time, self.end_time)
        return self.search_url.format(keyword=self.keyword,
                                      start_time=self.start_time,
                                      end_time=self.end_time)

    def login(self):
        """Open the browser on the search home page, pre-fill credentials
        when supplied, and block until the user has finished logging in
        manually in the opened browser window."""
        self.browser = Browser(driver_name=self.browser_name,
                               executable_path=self.driver_path,
                               service_log_path='../files/log.log')
        self.browser.visit(self.base_url)
        self.browser.click_link_by_text('登录')
        # Pre-fill credentials when provided.
        if self.username is not None:
            self.browser.fill("username", self.username)
        if self.password is not None:
            self.browser.fill("password", self.password)
        print("请在打开的浏览器中登录........")
        time.sleep(2)  # give the page a moment to load
        logining_url = self.browser.url  # URL of the login page
        # On slow networks the login URL may not be reachable yet.
        while logining_url == self.base_url:
            time.sleep(2)
            logining_url = self.browser.url
        # The URL changes away from the login page once login succeeds.
        while 1:
            if self.browser.url != logining_url:
                break
            time.sleep(2)
        print("已成功登录,开始抓取信息.......")

    def search(self):
        """Jump to the search-results page built by get_search_url()."""
        self.browser.visit(self.get_search_url())

    def get_card_data(self, card, xapth_dict: dict):
        """Extract one post's fields from a result card and append them to
        save_data; missing fields become '0' (counts) or '' (text)."""
        etree_html = etree.HTML(card.html)
        number = ['收藏数', '转发数', '评论数', '点赞数']
        self.page_count += 1  # count every card scraped on this page
        for key in xapth_dict.keys():
            xpath = xapth_dict.get(key)
            data = etree_html.xpath(xpath)
            if data:
                if key in number:
                    self.save_data[key].append(Utils.get_num(data[0]))
                elif key == '时间':
                    self.save_data[key].append(
                        Utils.get_date(data[0]).strftime('%Y-%m-%d'))
                elif key == '微博正文':
                    content = ''.join(data).replace(' ', '').replace('\n', '')
                    self.save_data[key].append(content)
                else:
                    self.save_data[key].append(data[0])
            else:
                if key in number:
                    self.save_data[key].append('0')
                else:
                    self.save_data[key].append('')

    def download_data(self):
        """Walk the result pages (Weibo caps them at 50) and scrape every
        post card.  Ctrl-C aborts early so data gathered so far can still
        be saved."""
        self.browser.execute_script(
            "window.scrollTo(0,document.body.scrollHeight)")  # scroll to bottom
        try:
            self.browser.click_link_by_text('查看全部搜索结果')
        except ElementDoesNotExist:
            pass
        page_index = 1  # our own page counter
        while page_index <= 50:  # Weibo search shows at most 50 pages
            try:
                # After the last page Weibo sometimes wraps back to page 1;
                # compare the page number in the URL against our counter.
                real_page = re.findall(r'page=(\d+)', self.browser.url)
                if real_page:
                    real_page = int(real_page[0])
                else:
                    real_page = 1
                if real_page < page_index:
                    break
                print('正在抓取第%s页内容:' % page_index, end='')
                self.browser.execute_script(
                    "window.scrollTo(0,document.body.scrollHeight)")
                cards = self.browser.find_by_xpath(
                    '//div[@class="card"]')  # every post card on the page
            except KeyboardInterrupt:
                # Allow the user to abort a long scrape and keep the data.
                print('中途退出抓取,正则保存中.....')
                break
            for card in cards:
                try:
                    self.get_card_data(card, self.xpath_dict)
                except KeyboardInterrupt:
                    pass
            try:
                print('本页抓取了%s条数据,模拟等待中.....' % self.page_count)
                self.all_count += self.page_count  # running total
                self.page_count = 0
                # Randomise the pause to mimic a human reader.
                sleep_time = random.randint(self.sleep_time, self.sleep_time + 5)
                time.sleep(sleep_time)
                try:
                    self.browser.click_link_by_text('下一页')  # next page
                    page_index += 1
                except ElementDoesNotExist:
                    break
            except KeyboardInterrupt:
                print('中途退出抓取,正则保存中.....')
                break

    def save(self):
        """Dump everything scraped so far to
        <base_sava_path><keyword><start>~<end>.xlsx."""
        print('--------------------------------------------')
        print('本次共抓取了%s条数据' % self.all_count)
        try:
            data = pandas.DataFrame(self.save_data)
            file_path = self.base_sava_path + self.save_file_name + '.xlsx'
            data.to_excel(file_path, index=False)  # write the Excel file
            print('文件正在保存...', end='\n\n')
        except Exception as e:
            print('文件保存失败!!!', end='\n\n')
            print(e)

    def close(self):
        """Quit the browser."""
        self.browser.quit()

    @staticmethod
    def test():
        """Ad-hoc manual test; not executed during normal runs."""
        browser = Browser(executable_path="../driver/firefox.exe")
        browser.visit(
            "https://s.weibo.com/weibo/%25E5%25B0%25B1%25E5%25"
            "BC%2580%25E5%25A7%258B%25E5%25A4%25A7%25E5%25B9%2585?topnav=1&wvr=6&b=1"
        )
        cards = browser.find_by_xpath('//div[@class="card"]')
        for c in cards:
            etree_html = etree.HTML(c.html)
            a = etree_html.xpath(
                '//div[@class="card-act"]//a[@title="赞"]/em/text()')
            print(a[0])
# For every city: select it in the search form, run the search, and visit
# the first 10 result pages.  `browser` (splinter Browser) and `citys`
# ([[value, label], ...]) are defined elsewhere in this script.
for city in citys:
    # Retry until the city <select> is present and the selection succeeds.
    while True:
        try:
            # Wait (up to 10s per probe) for the ranking list to render;
            # reload the page while it is missing.
            while not browser.is_element_present_by_css(
                    '.talent_rankinglist .tab .cur a', wait_time=10):
                print('prov and city page reload')
                browser.reload()
            browser.select('scity', city[0])
            break
        # BUGFIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the loop unkillable.
        except Exception:
            continue
    while not browser.is_element_present_by_css(
            '.talent_rankinglist .tab .cur a', wait_time=10):
        print('prov and city page reload')
        browser.reload()
    browser.click_link_by_text('查找')
    while not browser.is_element_present_by_css(
            '.talent_rankinglist .tab .cur a', wait_time=10):
        print('prov and city page reload')
        browser.reload()
    baseurl = browser.url
    # Pages 1..10 (the original spelled this `range(11)[1:]`).
    for i in range(1, 11):
        url = baseurl + '&page=' + str(i) + '&'
        print(url)
        # Retry the visit until it succeeds.
        while True:
            try:
                browser.visit(url)
                break
            except Exception:
                continue
class TestCase(ptc.PloneTestCase):
    """splinter-driven browser helpers for functional tests against a
    Plone site served by a freshly started test ZServer.

    NOTE(review): overriding __init__ on a TestCase without accepting and
    forwarding the unittest methodName argument (and without calling
    super().__init__) is unusual — confirm how the test runner
    instantiates this class.
    """

    def __init__(self):
        # One Firefox-driven splinter browser plus a test ZServer;
        # host/port locate the live Plone instance under test.
        self.browser = Browser(driver_name='firefox')
        self.host, self.port = startZServer()

    def afterSetUp(self):
        # PloneTestCase hook: open the site root before each test.
        self.browser.visit('http://%s:%s/plone' % (self.host, self.port))

    def beforeTearDown(self):
        # PloneTestCase hook: shut the browser down after each test.
        self.browser.quit()

    def portal_visit(self, url):
        # Visit a path relative to the Plone site root.
        self.browser.visit('http://%s:%s/plone/%s' % (self.host, self.port, url))

    def portal_home(self):
        # Return to the site root.
        self.browser.visit('http://%s:%s/plone/' % (self.host, self.port))

    def portal_login(self, user, password):
        # Fill and submit the standard Plone login form.
        self.portal_visit('login_form')
        self.browser.fill('__ac_name', user)
        self.browser.fill('__ac_password', password)
        self.browser.find_by_name('submit').first.click()

    def portal_login_as_owner(self):
        # Log in with the owner credentials provided by the Plone test layer.
        self.portal_login(user=portal_owner, password=default_password)

    def portal_logout(self):
        # Hit the logout view.
        self.portal_visit('logout')

    def portal_search(self, search_word):
        # Type into the search box and press the search button.
        self.browser.fill('SearchableText', '%s' % (search_word))
        self.browser.find_by_css('.searchButton').first.click()

    def portal_navigate_submenu(self, option):
        # Click the named tab in the content-view menu (e.g. Edit, View).
        self.browser.find_by_xpath("//li[contains(@id, 'contentview')]/a[text()='%s']" % (option)).first.click()

    def portal_click_a_personaltool(self, personaltool):
        # Open the dashboard personal menu, then click one of its entries.
        self.browser.click_link_by_href('http://%s:%s/plone/dashboard' % (self.host, self.port))
        self.browser.click_link_by_text('%s' % (personaltool))

    def portal_add_user(self, fullname, username, email, password):
        # Create a user via Site Setup -> Users and Groups -> Add New User.
        self.portal_click_a_personaltool('Site Setup')
        self.browser.click_link_by_text('Users and Groups')
        self.browser.find_by_name('form.button.AddUser').first.click()
        self.browser.fill('form.fullname', '%s' % (fullname))
        self.browser.fill('form.username', '%s' % (username))
        self.browser.fill('form.email', '%s' % (email))
        self.browser.fill('form.password', '%s' % (password))
        self.browser.fill('form.password_ctl', '%s' % (password))
        self.browser.find_by_id('form.actions.register').first.click()

    def portal_add_user_as_manager(self, fullname, username, email, password):
        # Same as portal_add_user, but also ticks the first group checkbox
        # (form.groups.0) before registering.
        self.portal_click_a_personaltool('Site Setup')
        self.browser.click_link_by_text('Users and Groups')
        self.browser.find_by_name('form.button.AddUser').first.click()
        self.browser.fill('form.fullname', '%s' % (fullname))
        self.browser.fill('form.username', '%s' % (username))
        self.browser.fill('form.email', '%s' % (email))
        self.browser.fill('form.password', '%s' % (password))
        self.browser.fill('form.password_ctl', '%s' % (password))
        self.browser.find_by_id('form.groups.0').first.click()
        self.browser.find_by_id('form.actions.register').first.click()

    def portal_change_user_role(self, username, new_role):
        # Toggle the checkbox for new_role in username's row, then save.
        self.portal_click_a_personaltool('Site Setup')
        self.browser.click_link_by_text('Users and Groups')
        self.browser.find_by_xpath("//tr[*/input[@value='%s']]//input[@value='%s']" % (username, new_role)).first.click()
        self.browser.find_by_name('form.button.Modify').first.click()

    def portal_click_enable_content_types(self):
        # Open the "Add new..." menu on the current container.
        self.browser.find_by_css('a[title="Add new items inside this item"]').first.click()

    def portal_add_content_type(self, type):
        # Add a content item of the given type.  NOTE: the parameter name
        # shadows the builtin `type`; kept for caller compatibility.
        self.portal_click_enable_content_types()
        self.browser.click_link_by_text('%s' % (type))

    def portal_click_content_item_action(self):
        # Open the "Actions" dropdown for the current content item.
        self.browser.find_by_css('a[title="Actions for the current content item"]').first.click()

    def portal_add_item_action(self, type):
        # Trigger the named action from the item's Actions dropdown.
        self.portal_click_content_item_action()
        self.browser.click_link_by_text('%s' % (type))

    def portal_list_states(self):
        # Open the workflow-state dropdown.
        self.browser.find_by_css('a[title="Change the state of this item"]').first.click()

    def portal_modify_state_to(self, state):
        # Transition the current item to the named workflow state.
        self.portal_list_states()
        self.browser.click_link_by_text('%s' % (state))
# Collect [value, label] pairs for every <option> in the city <select>,
# then scrape the first 10 result pages for each city.  `browser`
# (splinter Browser), `soup` and BeautifulSoup come from earlier in this
# script.
citys = [[j.get('value'), j.string] for j in soup.findAll('option')]
for city in citys:
    # Retry until the city <select> is present and the selection succeeds.
    while True:
        try:
            # Wait (up to 10s per probe) for the ranking list to render;
            # reload the page while it is missing.
            while not browser.is_element_present_by_css(
                    '.talent_rankinglist .tab .cur a', wait_time=10):
                print('prov and city page reload')
                browser.reload()
            browser.select('scity', city[0])
            break
        # BUGFIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the loop unkillable.
        except Exception:
            continue
    while not browser.is_element_present_by_css(
            '.talent_rankinglist .tab .cur a', wait_time=10):
        print('prov and city page reload')
        browser.reload()
    browser.click_link_by_text('查找')
    while not browser.is_element_present_by_css(
            '.talent_rankinglist .tab .cur a', wait_time=10):
        print('prov and city page reload')
        browser.reload()
    baseurl = browser.url
    # Pages 1..10 (the original spelled this `range(11)[1:]`).
    for i in range(1, 11):
        url = baseurl + '&page=' + str(i) + '&'
        print(url)
        # Retry the visit until it succeeds.
        while True:
            try:
                browser.visit(url)
                break
            except Exception:
                continue
        # Re-parse the freshly loaded page for downstream processing.
        # NOTE(review): the original indentation of this line was lost in
        # the mangled source; it is placed here, inside the per-page loop —
        # confirm against the upstream script.
        soup = BeautifulSoup(browser.html)