Example #1
    def download_wechat_topics(self, wechat_id, process_topic):
        """ 在微信号的文章列表页面,逐一点击打开每一篇文章,并爬取 """
        browser = self.browser
        js = """ return document.documentElement.innerHTML; """
        body = browser.execute_script(js)

        htmlparser = etree.HTMLParser()
        tree = etree.parse(StringIO(body), htmlparser)

        # Do not filter out empty titles here: hrefs/avatars/abstracts come from
        # parallel XPath queries, and dropping items would desynchronize the indices;
        # empty titles are skipped in the loop below instead.
        elems = [item.strip() for item in tree.xpath("//h4[@class='weui_media_title']/text()")]
        # "hrefs" (not "href") is the non-standard attribute WeChat puts on these h4 nodes
        hrefs = ['http://mp.weixin.qq.com%s' % item for item in tree.xpath("//h4[@class='weui_media_title']/@hrefs")]
        elems_avatars = tree.xpath("//div[@class='weui_media_box appmsg']/span/@style")
        # Strip the leading "background-image:url(" (21 chars) and the trailing ")"
        avatars = [item[21:-1] for item in elems_avatars]
        elems_abstracts = tree.xpath("//p[@class='weui_media_desc']")
        abstracts = [item.text.strip() if item.text else '' for item in elems_abstracts]
        links = []
        for idx, item in enumerate(elems[:10]):
            title = item
            print(title)
            if not title:
                continue
            uniqueid = get_uniqueid('%s:%s' % (wechat_id, title))
            try:
                Topic.objects.get(uniqueid=uniqueid)
            except Topic.DoesNotExist:
                links.append((title, hrefs[idx], avatars[idx], abstracts[idx]))
                logger.debug('Article not found in DB, title=%s, uniqueid=%s' % (title, uniqueid))
        for title, link, avatar, abstract in reversed(links):
            # OK to visit now
            browser.get(link)
            time.sleep(3)

            if 'antispider' in browser.current_url:
                # Detected as a crawler by the anti-spider check
                self.log_antispider()
                time.sleep(randint(1, 5))
            else:
                js = """
                    var imgs = document.getElementsByTagName('img');

                    for(var i = 0; i < imgs.length; i++) {
                      var dataSrc = imgs[i].getAttribute('data-src');
                      if (dataSrc){
                        imgs[i].setAttribute('src', dataSrc);
                      }
                    }
                    return document.documentElement.innerHTML;
                """
                body = browser.execute_script(js)
                process_topic({
                    'url': browser.current_url,
                    'body': body,
                    'avatar': avatar,
                    'title': title,
                    'abstract': abstract,
                    'kind': KIND_NORMAL
                })
                time.sleep(randint(1, 5))
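
The helper get_uniqueid() is not shown in any of these examples. Judging by its use as a deduplication key against Topic.uniqueid, it presumably returns a stable hash of the '<wechat_id>:<title>' string. A minimal sketch under that assumption (the real implementation may differ):

import hashlib

def get_uniqueid(key):
    # Assumed behavior: stable MD5 hex digest of the key string.
    if isinstance(key, str):
        key = key.encode('utf-8')
    return hashlib.md5(key).hexdigest()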
Example #2
    def download_wechat_keyword_topics(self, word, process_topic):
        """ 在关键词下的文章列表页面,逐一点击打开每一篇文章,并爬取 """
        browser = self.browser
        js = """ return document.documentElement.innerHTML; """
        body = browser.execute_script(js)

        htmlparser = etree.HTMLParser()
        tree = etree.parse(StringIO(body), htmlparser)

        elems = [stringify_children(item).replace('red_beg', '').replace('red_end', '') for item in tree.xpath("//div[@class='txt-box']/h4/a")]
        hrefs = tree.xpath("//div[@class='txt-box']/h4/a/@href")
        avatars = tree.xpath("//div[@class='img_box2']/a/img/@src")
        elems_abstracts = tree.xpath("//div[@class='txt-box']/p")
        abstracts = [item.text.strip() if item.text else '' for item in elems_abstracts]
        links = []
        for idx, item in enumerate(elems[:10]):
            title = item
            print(title)
            if not title:
                continue
            uniqueid = get_uniqueid('%s:%s' % (word, title))
            try:
                Topic.objects.get(uniqueid=uniqueid)
            except Topic.DoesNotExist:
                print(elems, hrefs, avatars, abstracts)
                links.append((title, hrefs[idx], avatars[idx], abstracts[idx]))
                logger.debug('Article not found in DB, title=%s, uniqueid=%s' % (title, uniqueid))
        for title, link, avatar, abstract in reversed(links):
            # OK to visit now
            browser.get(link)
            time.sleep(3)

            if 'antispider' in browser.current_url:
                # Detected as a crawler by the anti-spider check
                self.log_antispider()
                time.sleep(randint(1, 5))
            else:
                js = """
                    var imgs = document.getElementsByTagName('img');

                    for(var i = 0; i < imgs.length; i++) {
                      var dataSrc = imgs[i].getAttribute('data-src');
                      if (dataSrc){
                        imgs[i].setAttribute('src', dataSrc);
                      }
                    }
                    return document.documentElement.innerHTML;
                """
                body = browser.execute_script(js)
                process_topic({
                    'url': browser.current_url,
                    'body': body,
                    'avatar': avatar,
                    'title': title,
                    'abstract': abstract,
                    'kind': KIND_KEYWORD
                })
                time.sleep(randint(1, 5))
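
stringify_children() is another project helper the keyword examples rely on; it evidently serializes an element's full text content, child nodes included (the search page wraps keyword hits in red_beg/red_end markers, which the callers strip afterwards). A sketch of the common lxml recipe, offered as an assumption, not the project's actual code:

def stringify_children(node):
    # Concatenate the node's own text with the text of all nested children;
    # lxml iteration also yields comment nodes, whose .text surfaces markers
    # such as red_beg/red_end.
    parts = [node.text or '']
    for child in node:
        parts.append(stringify_children(child))
        parts.append(child.tail or '')
    return ''.join(parts)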
Example #3
    def process(self, params):
        C = self._class
        # Skip articles that were blocked (URL no longer on mp.weixin.qq.com)
        if 'mp.weixin.qq.com' not in params.get('url'):
            return
        # Skip proxy failures
        if 'wx.qq.com' not in params.get('source'):
            return
        # Store the data
        if params.get('kind') in [KIND_DETAIL, KIND_KEYWORD]:
            params.pop('kind', None)
            params.pop('retry', None)
            # Save the WeChat account
            wechatid = params.pop('wechatid', '')
            name = params.pop('name', '')
            intro = params.pop('intro', '')
            qrcode = params.pop('qrcode', '')
            wechat, created = Wechat.objects.get_or_create(
                wechatid=wechatid,
                defaults={
                    "wechatid": wechatid,
                    "name": name,
                    "intro": intro,
                    "qrcode": qrcode,
                    "status": Wechat.STATUS_DISABLE
                })
            # If the account has been marked deleted, do not save this article
            if wechat.status == Wechat.STATUS_DELETE:
                return

            # Save the article
            params['wechat_id'] = wechat.id
            params['uniqueid'] = get_uniqueid(
                '%s:%s' % (params['wechat_id'], params['title']))
            C.objects.update_or_create(uniqueid=params['uniqueid'],
                                       defaults=params)

        else:
            params.pop('kind', None)
            params.pop('retry', None)
            params['uniqueid'] = get_uniqueid(
                '%s:%s' % (params['wechat_id'], params['title']))
            C.objects.update_or_create(uniqueid=params['uniqueid'],
                                       defaults=params)
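
Both branches key the upsert on uniqueid via Django's update_or_create(). The models themselves are not shown; a hypothetical minimal sketch covering only the fields the example touches (field names and status values are guesses from the code):

from django.db import models

class Wechat(models.Model):
    STATUS_DISABLE, STATUS_ENABLE, STATUS_DELETE = 0, 1, 2  # guessed values

    wechatid = models.CharField(max_length=64, unique=True)
    name = models.CharField(max_length=255, blank=True)
    intro = models.TextField(blank=True)
    qrcode = models.URLField(blank=True)
    status = models.IntegerField(default=STATUS_DISABLE)

class Topic(models.Model):
    wechat = models.ForeignKey(Wechat, on_delete=models.CASCADE)
    uniqueid = models.CharField(max_length=32, unique=True)  # dedup key
    url = models.URLField()
    title = models.CharField(max_length=255)
    abstract = models.TextField(blank=True)
    avatar = models.URLField(blank=True)
    body = models.TextField(blank=True)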
Example #4
    def download_xb_wechat_keyword_topics(self, word, process_topic, data):
        """ On the Newrank (xinbang) article list page, click each article one by one and download it. """
        browser = self.browser
        try:
            WebDriverWait(browser, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, 'inside-ul-li')))
            js = """ return document.documentElement.innerHTML; """
            body = browser.execute_script(js)
            htmlparser = etree.HTMLParser()
            tree = etree.parse(StringIO(body), htmlparser)

            elems = [stringify_children(item) for item in tree.xpath("//div[@class='inside-li-top']/p")]
            hrefs = tree.xpath("//ul[@class='inside-left-file']/li/@data-url")
            avatars = [''] * len(elems)
            abstracts = [''] * len(elems)
            links = []
            for idx, item in enumerate(elems):
                title = item
                print(title)
                if not title:
                    continue
                uniqueid = get_uniqueid('%s:%s' % (word, title))
                try:
                    Topic.objects.get(uniqueid=uniqueid)
                except Topic.DoesNotExist:
                    print(elems, hrefs, avatars, abstracts)
                    links.append((title, hrefs[idx], avatars[idx], abstracts[idx]))
                    logger.debug('Article not found in DB, title=%s, uniqueid=%s' % (title, uniqueid))
            for title, link, avatar, abstract in reversed(links):
                # OK to visit now
                browser.get(link)
                time.sleep(3)

                if 'antispider' in browser.current_url:
                    # Detected as a crawler by the anti-spider check
                    self.log_antispider()
                    time.sleep(randint(1, 5))
                elif browser.title == '':
                    # The article has been deleted
                    logger.debug('Article has been deleted')
                    continue
                else:
                    js = """
                                var imgs = document.getElementsByTagName('img');

                                for(var i = 0; i < imgs.length; i++) {
                                  var dataSrc = imgs[i].getAttribute('  data-src');
                                  if (dataSrc){
                                    imgs[i].setAttribute('src', dataSrc);
                                  }
                                }
                                return document.documentElement.innerHTML;
                            """
                    body = browser.execute_script(js)
                    process_topic({
                        'url': browser.current_url,
                        'body': body,
                        'avatar': avatar,
                        'title': title,
                        'abstract': abstract,
                        'kind': KIND_KEYWORD
                    }, data)
                    time.sleep(randint(1, 5))
        except TimeoutException as ex:
            print("未找到文章列表" + str(ex))
            browser.close()
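
This variant uses Selenium's explicit-wait helpers, which the snippet does not import. These are the standard imports it needs:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException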
Example #5
    def download_wechat_keyword_topics(self, word, process_topic):
        """ 在关键词下的文章列表页面,逐一点击打开每一篇文章,并爬取 """
        browser = self.browser
        js = """ return document.documentElement.innerHTML; """
        body = browser.execute_script(js)

        htmlparser = etree.HTMLParser()
        tree = etree.parse(StringIO(body), htmlparser)

        elems = [stringify_children(item).replace('red_beg', '').replace('red_end', '') for item in tree.xpath("//div[@class='txt-box']/h3/a")]
        hrefs = tree.xpath("//div[@class='txt-box']/h3/a/@href")
        #avatars = tree.xpath("//div[@class='img-box']/a/img/@src")
        #elems_abstracts = tree.xpath("//div[@class='txt-box']/p")
        #abstracts = [item.text.strip() if item.text else '' for item in elems_abstracts]
        avatars = [''] * len(elems)
        abstracts = [''] * len(elems)
        links = []
        for idx, item in enumerate(elems):
            title = item
            print(title)
            if not title:
                continue
            uniqueid = get_uniqueid('%s:%s' % (word, title))
            try:
                Topic.objects.get(uniqueid=uniqueid)
            except Topic.DoesNotExist:
                print(elems, hrefs, avatars, abstracts)
                links.append((title, hrefs[idx], avatars[idx], abstracts[idx]))
                logger.debug('Article not found in DB, title=%s, uniqueid=%s' % (title, uniqueid))
        for title, link, avatar, abstract in reversed(links):
            # OK to visit now
            browser.get(link)
            time.sleep(3)

            if 'antispider' in browser.current_url:
                # Detected as a crawler by the anti-spider check
                self.log_antispider()
                time.sleep(randint(1, 5))
            else:
                js = """
                    var imgs = document.getElementsByTagName('img');

                    for(var i = 0; i < imgs.length; i++) {
                      var dataSrc = imgs[i].getAttribute('data-src');
                      if (dataSrc){
                        imgs[i].setAttribute('src', dataSrc);
                      }
                    }
                    return document.documentElement.innerHTML;
                """
                body = browser.execute_script(js)
                process_topic({
                    'url': browser.current_url,
                    'body': body,
                    'avatar': avatar,
                    'title': title,
                    'abstract': abstract,
                    'kind': KIND_KEYWORD
                })
                time.sleep(randint(1, 5))
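
For completeness, the module-level imports all of these examples assume; the project-local names are not defined anywhere in the source and are listed here only as names:

import time
import logging
from io import StringIO  # Python 2 sources used: from StringIO import StringIO
from random import randint

from lxml import etree

logger = logging.getLogger(__name__)

# Project-local names the examples reference but do not define here:
# get_uniqueid, stringify_children, Topic, Wechat,
# KIND_NORMAL, KIND_KEYWORD, KIND_DETAIL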