# -*- coding: utf-8 -*-
import time
import logging
from random import randint
from io import StringIO

from lxml import etree
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

# Project-local imports; the module paths below are assumptions, adjust to
# the actual layout:
# from .models import Topic, Wechat
# from .utils import get_uniqueid, stringify_children
# from .constants import KIND_NORMAL, KIND_DETAIL, KIND_KEYWORD

logger = logging.getLogger(__name__)


def download_wechat_topics(self, wechat_id, process_topic):
    """On a WeChat account's article list page, open each article in turn and crawl it."""
    browser = self.browser
    js = """
    return document.documentElement.innerHTML;
    """
    body = browser.execute_script(js)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(StringIO(body), htmlparser)
    elems = [item.strip()
             for item in tree.xpath("//h4[@class='weui_media_title']/text()")
             if item.strip()]
    # WeChat profile pages store the article link in a custom `hrefs` attribute.
    hrefs = ['http://mp.weixin.qq.com%s' % item
             for item in tree.xpath("//h4[@class='weui_media_title']/@hrefs")]
    elems_avatars = tree.xpath("//div[@class='weui_media_box appmsg']/span/@style")
    # Strip the surrounding `background-image:url("...")` to keep only the URL.
    avatars = [item[21:-1] for item in elems_avatars]
    elems_abstracts = tree.xpath("//p[@class='weui_media_desc']")
    abstracts = [item.text.strip() if item.text else '' for item in elems_abstracts]
    links = []
    for idx, title in enumerate(elems[:10]):
        if not title:
            continue
        uniqueid = get_uniqueid('%s:%s' % (wechat_id, title))
        try:
            Topic.objects.get(uniqueid=uniqueid)
        except Topic.DoesNotExist:
            links.append((title, hrefs[idx], avatars[idx], abstracts[idx]))
            logger.debug('Topic not stored yet, title=%s, uniqueid=%s' % (title, uniqueid))
    # Visit in reverse list order so older articles are stored first.
    for title, link, avatar, abstract in reversed(links):
        browser.get(link)
        time.sleep(3)
        if 'antispider' in browser.current_url:
            # The crawler has been detected; back off for a while.
            self.log_antispider()
            time.sleep(randint(1, 5))
        else:
            # Lazy-loaded images keep their real URL in `data-src`; copy it
            # into `src` before grabbing the page source.
            js = """
            var imgs = document.getElementsByTagName('img');
            for (var i = 0; i < imgs.length; i++) {
                var dataSrc = imgs[i].getAttribute('data-src');
                if (dataSrc) {
                    imgs[i].setAttribute('src', dataSrc);
                }
            }
            return document.documentElement.innerHTML;
            """
            body = browser.execute_script(js)
            process_topic({
                'url': browser.current_url,
                'body': body,
                'avatar': avatar,
                'title': title,
                'abstract': abstract,
                'kind': KIND_NORMAL,
            })
            time.sleep(randint(1, 5))
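# The dedup key above comes from get_uniqueid, which this section does not
# define. A minimal sketch of one plausible implementation, assuming the key
# is simply a stable hash of '<wechat_id>:<title>' (the real helper may differ):
import hashlib


def get_uniqueid(key):
    # Hash the composite key so it fits a fixed-width unique column.
    if isinstance(key, unicode):
        key = key.encode('utf-8')
    return hashlib.md5(key).hexdigest()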
def download_wechat_keyword_topics(self, word, process_topic):
    """On a keyword's article list page, open each article in turn and crawl it."""
    browser = self.browser
    js = """
    return document.documentElement.innerHTML;
    """
    body = browser.execute_script(js)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(StringIO(body), htmlparser)
    # Strip Sogou's keyword-highlight markers from the titles.
    elems = [stringify_children(item).replace('red_beg', '').replace('red_end', '')
             for item in tree.xpath("//div[@class='txt-box']/h4/a")]
    hrefs = tree.xpath("//div[@class='txt-box']/h4/a/@href")
    avatars = tree.xpath("//div[@class='img_box2']/a/img/@src")
    elems_abstracts = tree.xpath("//div[@class='txt-box']/p")
    abstracts = [item.text.strip() if item.text else '' for item in elems_abstracts]
    links = []
    for idx, title in enumerate(elems[:10]):
        if not title:
            continue
        uniqueid = get_uniqueid('%s:%s' % (word, title))
        try:
            Topic.objects.get(uniqueid=uniqueid)
        except Topic.DoesNotExist:
            links.append((title, hrefs[idx], avatars[idx], abstracts[idx]))
            logger.debug('Topic not stored yet, title=%s, uniqueid=%s' % (title, uniqueid))
    for title, link, avatar, abstract in reversed(links):
        browser.get(link)
        time.sleep(3)
        if 'antispider' in browser.current_url:
            # The crawler has been detected; back off for a while.
            self.log_antispider()
            time.sleep(randint(1, 5))
        else:
            # Copy lazy-load `data-src` URLs into `src` before saving the page.
            js = """
            var imgs = document.getElementsByTagName('img');
            for (var i = 0; i < imgs.length; i++) {
                var dataSrc = imgs[i].getAttribute('data-src');
                if (dataSrc) {
                    imgs[i].setAttribute('src', dataSrc);
                }
            }
            return document.documentElement.innerHTML;
            """
            body = browser.execute_script(js)
            process_topic({
                'url': browser.current_url,
                'body': body,
                'avatar': avatar,
                'title': title,
                'abstract': abstract,
                'kind': KIND_KEYWORD,
            })
            time.sleep(randint(1, 5))
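# The keyword crawls rely on stringify_children, also not defined in this
# section. A minimal sketch, assuming it flattens an lxml element (including
# text nested inside highlight tags) into one plain string; the real helper
# may differ:
def stringify_children(node):
    # itertext() walks the element and all its descendants, yielding text chunks.
    return ''.join(node.itertext()).strip()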
def process(self, params):
    C = self._class
    # Skip pages that were blocked or redirected away from WeChat.
    if 'mp.weixin.qq.com' not in params.get('url', ''):
        return
    # Skip pages fetched through a failed proxy.
    if 'wx.qq.com' not in params.get('source', ''):
        return
    # Store the data.
    if params.get('kind') in [KIND_DETAIL, KIND_KEYWORD]:
        params.pop('kind', None)
        params.pop('retry', None)
        # Save the WeChat account first.
        wechatid = params.pop('wechatid', '')
        name = params.pop('name', '')
        intro = params.pop('intro', '')
        qrcode = params.pop('qrcode', '')
        wechat, created = Wechat.objects.get_or_create(
            wechatid=wechatid,
            defaults={
                'name': name,
                'intro': intro,
                'qrcode': qrcode,
                'status': Wechat.STATUS_DISABLE,
            })
        # If the account has been marked deleted, do not save this article.
        if wechat.status == Wechat.STATUS_DELETE:
            return
        params['wechat_id'] = wechat.id
    else:
        params.pop('kind', None)
        params.pop('retry', None)
    # Save the article, keyed by a hash of '<wechat_id>:<title>'.
    params['uniqueid'] = get_uniqueid(
        '%s:%s' % (params['wechat_id'], params['title']))
    C.objects.update_or_create(uniqueid=params['uniqueid'], defaults=params)
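# For orientation: process() checks a 'source' field that the crawl methods
# above never set, so some transport layer presumably injects it. A
# hypothetical wiring sketch (names here are illustrative, not from the source):
def make_process_topic(processor):
    def process_topic(params, data=None):
        # Assumed to be filled in by the proxy/transport layer in the real code.
        params.setdefault('source', 'wx.qq.com')
        processor.process(params)
    return process_topic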
def download_xb_wechat_keyword_topics(self, word, process_topic, data):
    """On a Newrank (xinbang) article list page, open each article in turn and download it."""
    browser = self.browser
    try:
        WebDriverWait(browser, 10).until(
            EC.visibility_of_element_located((By.CLASS_NAME, 'inside-ul-li')))
        js = """
        return document.documentElement.innerHTML;
        """
        body = browser.execute_script(js)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(StringIO(body), htmlparser)
        elems = [stringify_children(item)
                 for item in tree.xpath("//div[@class='inside-li-top']/p")]
        hrefs = tree.xpath("//ul[@class='inside-left-file']/li/@data-url")
        # Newrank list pages expose neither avatars nor abstracts.
        avatars = [''] * len(elems)
        abstracts = [''] * len(elems)
        links = []
        for idx, title in enumerate(elems):
            if not title:
                continue
            uniqueid = get_uniqueid('%s:%s' % (word, title))
            try:
                Topic.objects.get(uniqueid=uniqueid)
            except Topic.DoesNotExist:
                links.append((title, hrefs[idx], avatars[idx], abstracts[idx]))
                logger.debug('Topic not stored yet, title=%s, uniqueid=%s' % (title, uniqueid))
        for title, link, avatar, abstract in reversed(links):
            browser.get(link)
            time.sleep(3)
            if 'antispider' in browser.current_url:
                # The crawler has been detected; back off for a while.
                self.log_antispider()
                time.sleep(randint(1, 5))
            elif browser.title == '':
                # An empty page title means the article has been deleted.
                logger.debug('Topic has been deleted')
                continue
            else:
                # Copy lazy-load `data-src` URLs into `src` before saving the page.
                js = """
                var imgs = document.getElementsByTagName('img');
                for (var i = 0; i < imgs.length; i++) {
                    var dataSrc = imgs[i].getAttribute('data-src');
                    if (dataSrc) {
                        imgs[i].setAttribute('src', dataSrc);
                    }
                }
                return document.documentElement.innerHTML;
                """
                body = browser.execute_script(js)
                process_topic({
                    'url': browser.current_url,
                    'body': body,
                    'avatar': avatar,
                    'title': title,
                    'abstract': abstract,
                    'kind': KIND_KEYWORD,
                }, data)
                time.sleep(randint(1, 5))
    except TimeoutException as ex:
        logger.warning('Article list not found: %s' % ex)
        browser.close()
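# The same lazy-image JavaScript is pasted into every method above, and
# copy-paste drift is easy (a stray space in an attribute name breaks it
# silently). A suggested refactor, not part of the original source, hoists it
# into one module-level constant:
FIX_LAZY_IMAGES_JS = """
var imgs = document.getElementsByTagName('img');
for (var i = 0; i < imgs.length; i++) {
    var dataSrc = imgs[i].getAttribute('data-src');
    if (dataSrc) {
        imgs[i].setAttribute('src', dataSrc);
    }
}
return document.documentElement.innerHTML;
"""
# Each method can then call browser.execute_script(FIX_LAZY_IMAGES_JS).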
def download_wechat_keyword_topics(self, word, process_topic):
    """On a keyword's article list page, open each article in turn and crawl it.

    Variant of the keyword crawl above, for list pages where titles sit under
    h3 and no avatar or abstract is available.
    """
    browser = self.browser
    js = """
    return document.documentElement.innerHTML;
    """
    body = browser.execute_script(js)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(StringIO(body), htmlparser)
    # Strip Sogou's keyword-highlight markers from the titles.
    elems = [stringify_children(item).replace('red_beg', '').replace('red_end', '')
             for item in tree.xpath("//div[@class='txt-box']/h3/a")]
    hrefs = tree.xpath("//div[@class='txt-box']/h3/a/@href")
    # The selectors from the older layout no longer match here:
    # avatars = tree.xpath("//div[@class='img-box']/a/img/@src")
    # abstracts = tree.xpath("//div[@class='txt-box']/p")
    avatars = [''] * len(elems)
    abstracts = [''] * len(elems)
    links = []
    for idx, title in enumerate(elems):
        if not title:
            continue
        uniqueid = get_uniqueid('%s:%s' % (word, title))
        try:
            Topic.objects.get(uniqueid=uniqueid)
        except Topic.DoesNotExist:
            links.append((title, hrefs[idx], avatars[idx], abstracts[idx]))
            logger.debug('Topic not stored yet, title=%s, uniqueid=%s' % (title, uniqueid))
    for title, link, avatar, abstract in reversed(links):
        browser.get(link)
        time.sleep(3)
        if 'antispider' in browser.current_url:
            # The crawler has been detected; back off for a while.
            self.log_antispider()
            time.sleep(randint(1, 5))
        else:
            # Copy lazy-load `data-src` URLs into `src` before saving the page.
            js = """
            var imgs = document.getElementsByTagName('img');
            for (var i = 0; i < imgs.length; i++) {
                var dataSrc = imgs[i].getAttribute('data-src');
                if (dataSrc) {
                    imgs[i].setAttribute('src', dataSrc);
                }
            }
            return document.documentElement.innerHTML;
            """
            body = browser.execute_script(js)
            process_topic({
                'url': browser.current_url,
                'body': body,
                'avatar': avatar,
                'title': title,
                'abstract': abstract,
                'kind': KIND_KEYWORD,
            })
            time.sleep(randint(1, 5))
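# Minimal smoke test for the keyword crawl, with a callback that only logs
# what would be stored. WechatCrawler is a hypothetical name for the class
# that owns these methods and self.browser:
def log_topic(params):
    logger.info('crawled: %s (%s)' % (params['title'], params['url']))


crawler = WechatCrawler()
crawler.download_wechat_keyword_topics(u'python', log_topic)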