Example #1
def parse_roll(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    try:
        # Extract article links from the list panel.
        link = LinkExtractor(
            restrict_xpaths='//div[@class="paneT"]//ul[@class="iconBoxT14"]//li/a')
        links = link.extract_links(response)
        for link in links:
            url = link.url
            if url:
                # str.find returns -1 when not found, so compare explicitly;
                # skip index pages such as '.../index.html'.
                if url.find('index.html') > -1:
                    continue
                data['link_text'] = link.text
                data['link_url'] = link.url
                data['page_function'] = 'parse_article'
                request = Request(url,
                                  dont_filter=True,
                                  priority=10,
                                  meta=data)
                yield request
    except:
        r.sadd('article:crawl:news:error_page', page_url)

    article_util.remove_page_setting(page_url)
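These snippets appear to be excerpted from a larger Scrapy project, so the names they rely on (r, article_util, Request, LinkExtractor, json, traceback, re, etree) are not defined in the excerpt. Below is a minimal sketch of the module-level setup they assume; the Redis connection parameters and the article_util interface are illustrative guesses, not the project's actual wiring.

# Sketch of the assumed module-level setup; connection details are illustrative.
import json
import re
import traceback

import redis
from lxml import etree
from scrapy import Request
from scrapy.linkextractors import LinkExtractor

import article_util  # assumed helper: get_page_setting(url) / remove_page_setting(url)

# Shared Redis client used for dedup sets and error bookkeeping.
r = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)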
Example #2
def parse_roll(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    # Parse the JSON response.
    text1 = response.text
    try:
        json1 = json.loads(text1)
    except:
        traceback.print_exc()
        r.sadd('article:crawl:news:error_page', response.url)
        return

    # Process the parsed data.
    if 'result' in json1 and 'status' in json1['result'] and 'code' in json1['result']['status'] \
            and json1['result']['status']['code'] == 0:
        if 'data' in json1['result']:
            # News list.
            items = json1['result']['data']
            # Crawl each article's detail page.
            for item in items:
                url = item['url']
                data['link_text'] = item['title']
                data['link_url'] = item['url']
                data['page_function'] = 'parse_article'
                request = Request(url,
                                  dont_filter=True,
                                  priority=10,
                                  meta=data)
                yield request
    else:
        r.sadd('article:crawl:news:error_page', response.url)

    article_util.remove_page_setting(page_url)
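For reference, the parser in Example #2 assumes a JSON payload shaped roughly as below. Only the key names come from the code above; the values are invented.

# Illustrative response shape assumed by Example #2 (values are invented):
sample = {
    'result': {
        'status': {'code': 0},
        'data': [
            {'url': 'https://example.com/article/1.html', 'title': 'Sample headline'},
        ],
    },
}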
Example #3
def parse_roll(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    # Parse the JSON response.
    text1 = response.text
    try:
        json1 = json.loads(text1)
        datas = json1['data']
    except:
        traceback.print_exc()
        r.sadd('article:crawl:news:error_page', page_url)
        return
    # Process the parsed data (check the parsed list, not the page setting).
    if datas:
        # Crawl each article's detail page.
        for item in datas:
            date = item['focus_date']
            url = item['url']
            data['link_text'] = item['title']
            data['link_url'] = item['url']
            data['page_function'] = 'parse_article'
            data['page_name'] = parse_page_name(page_url)
            data['title'] = item['title']
            data['original_source'] = item['source']
            data['publish_time_str'] = date
            data['publish_time'] = parse_time(date)
            request = Request(url, dont_filter=True, priority=10, meta=data)
            yield request
    else:
        r.sadd('article:crawl:news:error_page', response.url)

    article_util.remove_page_setting(page_url)
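Example #3 also calls parse_page_name and parse_time, which are not included in the excerpt. A hypothetical sketch of what they might do, assuming focus_date is a 'YYYY-MM-DD HH:MM:SS' string (both the date format and the implementations are assumptions, not the project's actual code):

import time
from urllib.parse import urlparse

def parse_page_name(page_url):
    # Hypothetical: derive a page name from the URL's host name.
    return urlparse(page_url).netloc

def parse_time(date_str):
    # Hypothetical: convert 'YYYY-MM-DD HH:MM:SS' to a Unix timestamp.
    return int(time.mktime(time.strptime(date_str, '%Y-%m-%d %H:%M:%S')))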
Example #4
def parse_mil_ent(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    # Strip the JSONP-style parentheses wrapping the JSON body.
    text1 = re.sub(u"\\(|\\)", '', response.text)
    try:
        json1 = json.loads(text1)
    except:
        traceback.print_exc()
        r.sadd('article:crawl:news:error_page', response.url)
        return

    # Process the parsed data.
    if 'data' in json1 and 'status' in json1 and json1['status'] == 0:
        # News list.
        items = json1['data']['list']
        # Crawl each article's detail page.
        for item in items:
            url = item['LinkUrl']
            data['link_text'] = item['Title']
            data['link_url'] = item['LinkUrl']
            data['page_function'] = 'parse_article'
            request = Request(url, dont_filter=True, priority=10, meta=data)
            yield request
    else:
        r.sadd('article:crawl:news:error_page', response.url)

    article_util.remove_page_setting(page_url)
Example #5
def parse_china(response):
    link = LinkExtractor(restrict_xpaths=['//li'])
    links = link.extract_links(response)
    page_url = response.url
    data = article_util.get_page_setting(page_url)

    # Set request metadata for each matching link.
    for link in links:
        url = link.url
        if url.startswith('https://news.sina.com.cn/c'):
            data['link_text'] = link.text
            data['link_url'] = link.url
            data['page_function'] = 'parse_china_article'
            request = Request(url, dont_filter=True, priority=10, meta=data)
            yield request
Example #6
def parse_roll(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    tree_node = etree.HTML(response.text,
                           parser=etree.HTMLParser(encoding='utf-8'))
    c_node = tree_node.xpath(
        '//ul[contains(@class,"fin_newsList") and contains(@class,"cfix")]')
    # Use a relative XPath ('.//') so the search stays inside the matched <ul>;
    # a bare '//li' would search the whole document. Guard against an empty match.
    cc_node = c_node[0].xpath('.//li[@class="cfix"]/h2') if c_node else []

    if cc_node:
        for ccc_node in cc_node:
            cc_url = ccc_node.xpath('a/@href')[0]
            data['link_text'] = ccc_node.xpath('a/text()')[0]
            data['link_url'] = cc_url
            data['page_function'] = 'parse_article'
            yield Request(cc_url, dont_filter=True, priority=10, meta=data)
    else:
        r.sadd('article:crawl:news:error_page', response.url)

    article_util.remove_page_setting(page_url)
Example #7
def parse_roll_keji(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    try:
        # Extract article links from the content list.
        link = LinkExtractor(
            restrict_xpaths='//div[@class="right_content"]/div[2]/ul/li')
        links = link.extract_links(response)
        for link in links:
            url = link.url
            if url:
                data['link_text'] = link.text
                data['link_url'] = link.url
                data['page_function'] = 'parse_article'
                request = Request(url,
                                  dont_filter=True,
                                  priority=10,
                                  meta=data)
                yield request
    except:
        r.sadd('article:crawl:news:error_page', page_url)

    article_util.remove_page_setting(page_url)
Example #8
def parse_roll(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    try:
        link = LinkExtractor(restrict_xpaths='//dd[@class="dd6401"]/a')
        links = link.extract_links(response)
        for link in links:
            url = link.url
            if url:
                # Skip WeChat (mp.weixin.qq.com) article links.
                if url.find('https://mp.weixin.qq.com') > -1:
                    continue
                data['link_text'] = link.text
                data['link_url'] = link.url
                data['page_function'] = 'parse_article'
                request = Request(url,
                                  dont_filter=True,
                                  priority=10,
                                  meta=data)
                yield request
    except:
        r.sadd('article:crawl:news:error_page', page_url)

    article_util.remove_page_setting(page_url)
Example #9
    def parse(self, response):
        page_url = response.url

        data = article_util.get_page_setting(page_url)

        # Return if no page setting is configured for this URL.
        if not data:
            return None
        print(data['page_module'], data['page_package'], data['page_function'])
        # Dynamically import the module that handles this page.
        lib = importlib.import_module('.' + data['page_module'],
                                      data['page_package'])

        # Look up the page's parse function and iterate over what it yields.
        parse_func = getattr(lib, data['page_function'])
        for item in parse_func(response):
            # The page function yielded a Request to follow.
            if isinstance(item, Request):
                request = item
                data = request.meta
                url = data['link_url']
                main_url = data['page_url']
                dupe_key = main_url + ':' + url
                dupe_key = hashlib.md5(dupe_key.encode('utf-8')).hexdigest()
                # Skip links that were already crawled or previously failed.
                if self.r.sismember('article:crawl:news:urls',
                                    dupe_key) or self.r.sismember(
                                        'article:crawl:news:error_page', url):
                    print('Already crawled --> ' + url)
                    continue
                # Mark this link as crawled.
                request.meta['dupe_key'] = dupe_key
                self.r.sadd('article:crawl:news:urls', dupe_key)
                # Configure the request callbacks.
                request.callback = self.parse_article
                request.errback = self.errback
                yield request
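The dedup key used above is simply the md5 of 'page_url:link_url'. A standalone illustration with made-up URLs (only the Redis set names are taken from the code):

import hashlib

main_url = 'https://news.example.com/roll/'        # listing page (made up)
url = 'https://news.example.com/article/1.html'    # article link (made up)
dupe_key = hashlib.md5((main_url + ':' + url).encode('utf-8')).hexdigest()
# The spider checks sismember('article:crawl:news:urls', dupe_key) to skip
# duplicates and calls sadd() once the request is scheduled.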