Example #1
    def test_results_are_cached_across_multiple_items(self):
        rsp1 = Response('http://url1')
        req1 = Request('http://url1', meta=dict(response=rsp1))
        item = dict(requests=req1)
        new_item = yield self.pipe.process_item(item, self.spider)
        self.assertTrue(new_item is item)
        self.assertEqual(new_item['results'], [(True, rsp1)])

        # rsp2 is ignored, rsp1 must be in results because request fingerprints are the same
        req2 = Request(req1.url, meta=dict(response=Response('http://donot.download.me')))
        item = dict(requests=req2)
        new_item = yield self.pipe.process_item(item, self.spider)
        self.assertTrue(new_item is item)
        self.assertEqual(request_fingerprint(req1), request_fingerprint(req2))
        self.assertEqual(new_item['results'], [(True, rsp1)])
Example #2
    def parse(self, response):
        cate = response.meta['cate']
        # Collect all news links on this page
        node_list = response.xpath('//div[@class="list_left"]/ul/li')
        for node in node_list:
            link_list = node.xpath('./a/@href').extract_first()
            if link_list:
                item = ScrapyHuodianbjxV101Item()
                item['content_url'] = link_list
                item['issue_time'] = node.xpath('./span/text()').extract_first()
                req = scrapy.Request(url=link_list, callback=self.parse2, meta={'item': item, 'cate': cate})
                item["id"] = request.request_fingerprint(req)
                yield req
                # print(link_list)

        # Link to the next page of news
        next_page1 = response.xpath('//div[@class="page"]/a[@title="下一页"]/@href').extract_first()
        next_page2 = response.xpath('//div[@class="tempa page btemp"]/a[@title="下一页"]/@href').extract_first()

        if next_page1:
            next_url = f'http://huodian.bjx.com.cn{next_page1}'
            # print(next_url)
            yield scrapy.Request(url=next_url, callback=self.parse, meta={'cate': cate})
        elif next_page2:
            next_url = f'http://huodian.bjx.com.cn{next_page2}'
            # print(next_url)
            yield scrapy.Request(url=next_url, callback=self.parse, meta={'cate': cate})
Example #3
 def request_seen(self, request):
     fp = request_fingerprint(request)
     if fp in self.fingerprints:
         return True
     self.fingerprints.add(fp)
     if self.file:
         self.file.write(fp + os.linesep)
Example #4
    def request_seen(self, request):
        fp = request_fingerprint(request)
        if self.exists(fp):
            return True

        self.insert(fp)
        return False
Example #5
    def parse(self, response):
        config_list = response.xpath('//div[@class="wonderful"]/ul/li')
        for config in config_list:
            item = ScrapyCcbuildV1Item()
            link = config.xpath('./a/@href').extract_first()
            # print(link)
            req = scrapy.Request(url=link,
                                 callback=self.parse_peple,
                                 dont_filter=True,
                                 meta={'item': item})

            item['content_url'] = link
            item['id'] = request.request_fingerprint(req)
            title_images = config.xpath('./a/img/@src').extract_first()
            if title_images:
                if 'http' in title_images:
                    item['title_images'] = title_images
                else:
                    item['title_images'] = self.base_url + title_images
            else:
                item['title_images'] = None

            item['issue_time'] = config.xpath(
                './a/article/div/span[2]/text()').extract_first()[:9]
            item['tags'] = config.xpath(
                './a/article/div/span[1]/text()').extract_first()

            yield req
Example #6
File: qtfy3.py, Project: kingweiliu/dban
    def parse_item(self, response):
        #i = DoubancrawlItem()
        print "ljw: *****************"
        print response.url
        item = qtfyItem()
        #tp = md5.new(response.url)
        #item["id"] = tp.hexdigest()
        item["id"] = request_fingerprint(response.request)
        item["url"] = response.url
        item["name"] = response.xpath("//h2/text()").extract()[0]
        entry = response.xpath('//div[@class="entry"]/p')
        
        item["desc"] = entry[0].xpath("text()").extract()[0]
         
        infos = entry[1].xpath("node()").extract()
        if len(infos) < 3:
            infos = entry[2].xpath("node()").extract()

        item["info"] = reduce((lambda x, y: x+y), infos)
        item["img"] = entry.xpath('img/@src').extract()[0]
        
        lnks = entry.xpath('a')
        lkret = []
        for ln in lnks:
            href = ln.xpath("@href").extract()[0]
            
            if not self.isDownloadLink(href):
                continue
            title = ln.xpath("text()").extract()[0]
            lkret.append({"title":title, "href":href})
        item["links"] = lkret
        #print item
        #inspect_response(response, self)
        yield item
Example #7
 def request_seen(self, request):
     fingerprint = request_fingerprint(request)
     if fingerprint in self.record:
         print(request.url, '已经访问过')  # "already visited"
         return True
     else:
         self.record.add(fingerprint)
Example #8
 def process_response(self, request, response, spider):
     if type(response) is HtmlResponse and len(response.body) > 4000 and response.status != 403:
         times = request.meta.get('retry_times2', 0)
         if times > 0:
             log.msg('retry %s time(s) saved %s' % (times, response.url), level=log.ERROR)
             
     retry = False
     if response.status == 403:
         retry = True   
     if type(response) is HtmlResponse:
         if len(response.body) < 4000:
             retry = True
         elif response_is_invalid(response):
             retry = True
     if retry:
         retries = request.meta.get('retry_times2', 0) + 1
         if retries < 10:
             key = request_fingerprint(request)
             rpath = os.path.join(self.cachedir, spider.name, key[0:2], key)
             metapath = os.path.join(rpath, 'pickled_meta')
             if os.path.exists(metapath):
                 os.unlink(metapath)
             retryreq = request.copy()
             retryreq.meta['retry_times2'] = retries
             retryreq.dont_filter = True
             log.msg('retry %s time(s) %s' % (retries, request.url), level=log.ERROR)
             return retryreq
         log.msg('give up %s' % request.url, level=log.ERROR)
         raise IgnoreRequest

     return response
Example #9
    def save_response(self, response, spider):
        if isinstance(response, TextResponse):
            fp = request_fingerprint(response.request)
            payload = {
                "_key": fp,
                "_jobid": self.hsref.job.key,
                "_type": "_pageitem",
                "_encoding": response.encoding,
                "url": response.url,
            }
            self._set_cookies(payload, response)

            if response.request.method == 'POST':
                payload["postdata"] = dict(parse_qsl(response.request.body.decode()))

            payload["body"] = response.body_as_unicode()
            if self.trim_html:
                payload['body'] = payload['body'].strip(' \r\n\0')

            if len(payload['body']) > self._writer.maxitemsize:
                spider.logger.warning("Page not saved, body too large: <%s>" %
                                      response.url)
                return

            try:
                self._writer.write(payload)
            except ValueTooLarge as exc:
                spider.logger.warning("Page not saved, %s: <%s>" %
                                      (exc, response.url))
Example #10
 def parse(self, response):
     self.logger.info("Parse")
     soup = BeautifulSoup(response.body, 'lxml')
     all_img_tags = filter(self.tagfilter, soup.find_all('img'))
     candidate_img_tags = filter(self.maybe_ad_img, all_img_tags)
     for candidate in candidate_img_tags:
         ad_url = self.get_img_ad_url(candidate)
         ad_src = candidate.attrs.get('src', None)
         ad_url = fix_url(ad_url)
         ad_src = fix_url(ad_src)
         if not ad_url or not ad_src:
             continue
         item = AdcrawlerDataTaskItem()
         item['url'] = ad_url
         item['fingerprint'] = request_fingerprint(response.request)
         item['ad_img_urls'] = [ad_src]
         yield item
     all_a_tags = soup.find_all('a')
     if self.server.llen(self.redis_key) > self.settings.get('MAX_URL_TASKS', self.DEFAULT_MAX_URL_TASKS):
         return
     for at in all_a_tags:
         a_url = at.attrs.get('href', None)
         a_url = fix_url(a_url)
         if not a_url:
             continue
         next_task = AdcrawlerUrlTaskItem()
         next_task['url'] = a_url
         next_task['cur_depth'] = response.meta['cur_depth'] + 1
         yield next_task
Example #11
    def process_spider_output(self, response, result, spider):
        f = self.outputs[spider]
        fp = request_fingerprint(response.request)
        tracetime = time.time()
        data = self._objtodict(self.RESPONSE_ATTRS, response)
        data['request'] = self._objtodict(self.REQUEST_ATTRS, response.request)
        self._write(f, fp, tracetime, 'response', data)

        for item in result:
            if isinstance(item, Request):
                data = self._objtodict(self.REQUEST_ATTRS, item)
                data['fp'] = request_fingerprint(item)
                self._write(f, fp, tracetime, 'request', data)
            else:
                self._write(f, fp, tracetime, 'item', dict(item))
            yield item
Example #12
    def parse(self, response):

        cate = response.meta['cate']

        config_list = response.xpath('//div[@class="catlist"]/ul/li')
        m = [6, 12, 18, 24]
        n = 1
        for config in config_list:
            # print(config)
            if n not in m:
                item = ScrapyJc123V101Item()
                # title_img = config.xpath('./a/img/@src').extract_first()
                title = config.xpath('./a/text()').extract_first()
                link = config.xpath('./a/@href').extract_first()
                issue_time = config.xpath('./span/text()').extract_first()[:10]

                item['title'] = title
                item['issue_time'] = issue_time
                item['content_url'] = link
                item['information_categories'] = cate
                item['title_images'] = None
                # print(item)

                req = scrapy.Request(url=link, callback=self.parse2,
                                     meta={'item': item},
                                     dont_filter=True)
                item['id'] = request.request_fingerprint(req)
                # print(time, title, link)
                yield req
            n += 1
Example #13
 def _check_downloading(response):
     fp = request_fingerprint(req1)
     self.assertTrue(fp in self.info.downloading)
     self.assertTrue(fp in self.info.waiting)
     self.assertTrue(fp not in self.info.downloaded)
     self.assertEqual(len(self.info.waiting[fp]), 2)
     return response
Example #14
    def parse(self, response):
        # pass
        cate = response.meta['cate']

        config_list = response.xpath('//ul[@class="pagelist"]/li')
        # print(len(config_list))
        n = [10, 21, 32]
        m = 0
        for config in config_list:
            # print(config)
            if m not in n:
                # print(m)
                item = ScrapyChinabaogaoV101Item()
                title = config.xpath('./h3/a/text()').extract_first()
                link = config.xpath('./h3/a/@href').extract_first()
                issue_time = config.xpath('./span/text()').extract_first()

                item['title'] = title
                item['issue_time'] = issue_time
                item['content_url'] = link
                item['information_categories'] = cate
                item['title_images'] = None
                # print(item)

                req = scrapy.Request(url=link, callback=self.parse2,
                                     meta={'item': item},
                                     dont_filter=True)
                item['id'] = request.request_fingerprint(req)
                yield req
            m += 1
Example #15
    def parse(self, response):
        config_list = response.xpath('//div[@class="catlist"]/ul/li')
        num = [5, 11, 17, 23]
        for i in range(len(config_list)):
            if i not in num:
                item = Scrapy9ToV1Item()
                link = config_list[i].xpath('./a/@href').extract_first()
                title = config_list[i].xpath('./a/text()').extract_first()
                issue_time = config_list[i].xpath('./i/text()').extract_first()
                # print(title, link, issue_time)
                req = scrapy.Request(url=link,
                                     callback=self.parse_detail,
                                     meta={'item': item})

                item['id'] = request.request_fingerprint(req)
                item['title'] = title
                item['title_images'] = None
                item['content_url'] = link
                item['issue_time'] = issue_time[:10] if issue_time else None
                item['industry_categories'] = 'E'
                item['industry_Lcategories'] = response.meta[
                    'industry_Lcategories'][:2]
                item['industry_Mcategories'] = response.meta[
                    'industry_Lcategories']
                item['industry_Scategories'] = None
                item['information_categories'] = response.meta[
                    'information_categories']

                yield req
Example #16
File: httpcache.py, Project: st-li/rg_chn3
    def store_response(self, spider, request, response):
        """Store the given response in the mongo."""
        key = request_fingerprint(request)
        response_headers = headers_dict_to_raw(response.headers)
        response_body = self._get_body(response.headers, response.body)
        request_headers = headers_dict_to_raw(request.headers)
        request_body = self._get_body(request.headers, request.body)
        stored_data = {
            'metadata': {
                'url': request.url,
                'method': request.method,
                'status': response.status,
                'response_url': response.url,
                'timestamp': time(),
            },
            'response_headers': response_headers,
            'response_body': response_body,
            'request_headers': request_headers,
            'request_body': request_body,
        }
        #print stored_data

        collection_index = int(key, 16) % 1000
        collection_name = 'collection' + str(collection_index)
        collection = self.db[collection_name]

        try:
            collection.insert({"_id": key, "value": stored_data})
            print "-----------------Write cache %s------------------" % collection_name
        except Exception, e:
            print e.message
            pass
Example #17
class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        self.file = None
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)

    def request_fingerprint(self, request):
        return request_fingerprint(request)
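
This class is Scrapy's stock duplicate filter; the customized request_seen / request_fingerprint variants in the surrounding examples are normally enabled through the DUPEFILTER_CLASS setting. A minimal sketch of that wiring, assuming a hypothetical myproject/dupefilters.py that holds the subclass:

# settings.py (sketch; the module path and class name are placeholders)
DUPEFILTER_CLASS = 'myproject.dupefilters.CustomDupeFilter'
# DUPEFILTER_DEBUG makes Scrapy log every filtered duplicate, not just the first one
DUPEFILTER_DEBUG = True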
Example #18
File: qtfy3.py, Project: kingweiliu/dban
    def parse_item(self, response):
        #i = DoubancrawlItem()
        print "ljw: *****************"
        print response.url
        item = qtfyItem()
        #tp = md5.new(response.url)
        #item["id"] = tp.hexdigest()
        item["id"] = request_fingerprint(response.request)
        item["url"] = response.url
        item["name"] = response.xpath("//h2/text()").extract()[0]
        entry = response.xpath('//div[@class="entry"]/p')

        item["desc"] = entry[0].xpath("text()").extract()[0]

        infos = entry[1].xpath("node()").extract()
        if len(infos) < 3:
            infos = entry[2].xpath("node()").extract()

        item["info"] = reduce((lambda x, y: x + y), infos)
        item["img"] = entry.xpath('img/@src').extract()[0]

        lnks = entry.xpath('a')
        lkret = []
        for ln in lnks:
            href = ln.xpath("@href").extract()[0]

            if not self.isDownloadLink(href):
                continue
            title = ln.xpath("text()").extract()[0]
            lkret.append({"title": title, "href": href})
        item["links"] = lkret
        #print item
        #inspect_response(response, self)
        yield item
Example #19
def splash_request_fingerprint(request, include_headers=None):
    """ Request fingerprint which takes 'splash' meta key into account """

    fp = request_fingerprint(request, include_headers=include_headers)
    if 'splash' not in request.meta:
        return fp
    return dict_hash(request.meta['splash'], fp)
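
A short usage sketch of the function above (an illustration only, assuming splash_request_fingerprint is in scope, e.g. imported from scrapy-splash): the same URL yields different fingerprints once 'splash' arguments are attached, which is why a splash-aware dupefilter needs this instead of plain request_fingerprint.

from scrapy import Request

plain = Request('http://example.com')
rendered = Request('http://example.com', meta={'splash': {'args': {'wait': 0.5}}})
# dict_hash folds the splash arguments into the second fingerprint,
# so the two requests are not collapsed into one.
assert splash_request_fingerprint(plain) != splash_request_fingerprint(rendered)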
Example #20
    def parse(self, response):
        config_list = response.xpath('//div[@class="morenews"]/ul/li')
        for con in config_list:
            item = ScrapyNewsbuildhrV1Item()
            title = con.xpath('./h1/a/text()').extract_first()
            link = 'http:' + con.xpath('./h1/a/@href').extract_first()
            source = con.xpath('./h1/span/address/text()').extract_first()
            issue_time = con.xpath(
                './h1/span/b/text()').extract_first().replace(
                    '年', '-').replace('月', '-').replace('日', '')
            # print(title, link, issue_time, source)

            req = scrapy.Request(url=link,
                                 callback=self.parse_detail,
                                 dont_filter=True,
                                 meta={'item': item})

            item['id'] = request.request_fingerprint(req)
            item['title'] = title
            item['title_images'] = None
            item['content_url'] = link
            item['issue_time'] = issue_time
            item['source'] = source[3:] if source else '建筑英才网'

            yield req
Example #21
    def _process_request(self, request, info):
        fp = request_fingerprint(request)
        cb = request.callback or (lambda _: _)
        eb = request.errback
        request.callback = None
        request.errback = None

        # Return cached result if request was already seen
        if fp in info.downloaded:
            return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

        # Otherwise, wait for result
        wad = Deferred().addCallbacks(cb, eb)
        info.waiting[fp].append(wad)

        # Check if request is downloading right now to avoid doing it twice
        if fp in info.downloading:
            return wad

        # Download request checking media_to_download hook output first
        info.downloading.add(fp)
        dfd = mustbe_deferred(self.media_to_download, request, info)
        dfd.addCallback(self._check_media_to_download, request, info)
        dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
        dfd.addErrback(log.err, spider=info.spider)
        return dfd.addBoth(lambda _: wad)  # it must return wad at last
Example #22
 def request_reseen(self, response, request, spider):
     if 200 != response.status:
         fp = request_fingerprint(request)
         if self.server.sismember(self.key, fp):
             self.server.srem(self.key, fp)
             print "reseen [sts]%d [url]%s!" % (response.status,
                                                request.url)
Example #23
File: media.py, Project: zhangcheng/scrapy
    def _process_request(self, request, info):
        fp = request_fingerprint(request)
        cb = request.callback or (lambda _: _)
        eb = request.errback
        request.callback = None
        request.errback = None

        # Return cached result if request was already seen
        if fp in info.downloaded:
            return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

        # Otherwise, wait for result
        wad = Deferred().addCallbacks(cb, eb)
        info.waiting[fp].append(wad)

        # Check if request is downloading right now to avoid doing it twice
        if fp in info.downloading:
            return wad

        # Download request checking media_to_download hook output first
        info.downloading.add(fp)
        dfd = mustbe_deferred(self.media_to_download, request, info)
        dfd.addCallback(self._check_media_to_download, request, info)
        dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
        dfd.addErrback(log.err, spider=info.spider)
        return dfd.addBoth(lambda _: wad) # it must return wad at last
Example #24
    def test_get_media_requests(self):
        # returns single Request (without callback)
        req = Request('http://url')
        item = dict(requests=req) # pass a single item
        new_item = yield self.pipe.process_item(item, self.spider)
        assert new_item is item
        assert request_fingerprint(req) in self.info.downloaded

        # returns iterable of Requests
        req1 = Request('http://url1')
        req2 = Request('http://url2')
        item = dict(requests=iter([req1, req2]))
        new_item = yield self.pipe.process_item(item, self.spider)
        assert new_item is item
        assert request_fingerprint(req1) in self.info.downloaded
        assert request_fingerprint(req2) in self.info.downloaded
Example #25
 def _check_downloading(response):
     fp = request_fingerprint(req1)
     self.assertTrue(fp in self.info.downloading)
     self.assertTrue(fp in self.info.waiting)
     self.assertTrue(fp not in self.info.downloaded)
     self.assertEqual(len(self.info.waiting[fp]), 2)
     return response
Example #26
    def test_process_spider_output(self):
        fake_response = mock.Mock()
        fake_response.request = Request('http://source-request')

        def sort_requests_and_items(val):
            return val.__class__.__name__

        fake_result = sorted([
            Request('ftp://req1'),
            Request('https://req2'),
            Response('http://source-request'),
            DictItem(), {
                'field1': 'value1'
            }
        ],
                             key=sort_requests_and_items)
        results = self.instance.process_spider_output(fake_response,
                                                      fake_result, self.spider)
        assert isinstance(results, types.GeneratorType)
        for r in sorted(results, key=sort_requests_and_items):
            assert isinstance(r, type(fake_result.pop(0)))
            if isinstance(r, (DictItem, dict)):
                self.assertEqual(r["_cached_page_id"],
                                 request_fingerprint(fake_response.request))
        bad_fake_request = DictItem()
        bad_fake_request._values = None
        self.instance.process_spider_exception = mock.Mock()
        with self.assertRaises(TypeError):
            for _ in self.instance.process_spider_output(
                    fake_response, [bad_fake_request], self.spider):
                pass
        assert self.instance.process_spider_exception.called
Example #27
 def test_get_key(self):
     mw = self.mwcls(self.temp_dir, reset=True)
     test_req1 = Request('http://url1')
     self.assertEqual(mw._get_key(test_req1),
                      to_bytes(request_fingerprint(test_req1)))
     test_req2 = Request('http://url2', meta={'deltafetch_key': 'dfkey1'})
     self.assertEqual(mw._get_key(test_req2), b'dfkey1')
Example #28
File: dupefilter.py, Project: tml/scrapy
 def request_seen(self, request):
     fp = request_fingerprint(request)
     if fp in self.fingerprints:
         return True
     self.fingerprints.add(fp)
     if self.file:
         self.file.write(fp + os.linesep)
Example #29
    def delete_cached_request(self, request):
        ''' Delete a cached request from FilesystemCacheStorage. Returns True if successful. '''

        import os.path
        from shutil import rmtree
        from scrapy.utils.request import request_fingerprint

        if not settings[
                'HTTPCACHE_ENABLED'] and not 'httpcache.FilesystemCacheStorage' in settings[
                    'HTTPCACHE_STORAGE']:
            self.log(
                'HTTPCACHE is disabled or HTTPCACHE_STORAGE is not FilesystemCacheStorage.',
                log.ERROR)
            return False

        from scrapy.http import Request
        # isinstance needs the class itself, not a dotted-path string
        if not request or not isinstance(request, Request):
            raise TypeError('Invalid argument "request"')
        req_fp = request_fingerprint(request)
        req_dir = os.path.join('.scrapy', settings['HTTPCACHE_DIR'], self.name,
                               req_fp[:2], req_fp)
        if not os.path.exists(req_dir):
            self.log('Path does not exist or permission denied %s' % req_dir,
                     log.ERROR)
            return False

        try:
            rmtree(req_dir)
            self.log(
                'Deleted cached request %s, url %s' % (req_dir, request.url),
                log.DEBUG)
            return True
        except Exception:
            self.log('Error deleting %s' % req_dir, log.ERROR)
            return False
Example #30
    def request_seen(self, request):
        fp = request_fingerprint(request)
        if self.bf.exists(fp):  # bf is an instance of a Bloom filter
            return True

        self.bf.insert(fp)
        return False
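
The bf object is only assumed to expose exists/insert; a real Bloom filter keeps memory bounded at the cost of occasional false positives (a never-seen request can be reported as seen and skipped). A hypothetical set-backed stand-in with the same interface, handy for testing the dupefilter without a Bloom filter library:

class SetBackedFilter:
    """Exact stand-in for the Bloom filter above: same interface, unbounded memory."""

    def __init__(self):
        self._seen = set()

    def exists(self, fp):
        return fp in self._seen

    def insert(self, fp):
        self._seen.add(fp)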
Example #31
    def process_spider_output(self, response, result, spider):
        f = self.outputs[spider]
        fp = request_fingerprint(response.request)
        tracetime = time.time()
        data = self._objtodict(self.RESPONSE_ATTRS, response)
        data['request'] = self._objtodict(self.REQUEST_ATTRS, response.request)
        self._write(f, fp, tracetime, 'response', data)

        for item in result:
            if isinstance(item, Request):
                data = self._objtodict(self.REQUEST_ATTRS, item)
                data['fp'] = request_fingerprint(item)
                self._write(f, fp, tracetime, 'request', data)
            else:
                self._write(f, fp, tracetime, 'item', dict(item))
            yield item
Example #32
 def request_seen(self, request):
     fp = request_fingerprint(request)
     if self.bf.isContains(fp):  # already seen
         return True
     else:
         self.bf.insert(fp)
         return False
Example #33
 def parse(self, response):
     cate = response.meta['cate']
     print(response.status)
     new_urls = response.xpath('//div[@class="main_left"]/div')
     a = 0
     for new_url in new_urls:
         if a != 0 and a != 21:
             item = ScrapyPowerofweekV101Item()
             item['content_url'] = new_url.xpath(
                 './/h3/a/@href').extract_first()
             item['tags'] = new_url.xpath(
                 './/div[@class="tag"]/span[1]/a/text()').extract_first(
                 ).strip()
             item['title_images'] = None
             item['industry_categories'] = 'D'
             item['industry_Lcategories'] = '44'
             item['industry_Mcategories'] = '441'
             item['industry_Scategories'] = None
             item['information_categories'] = cate
             req = scrapy.Request(url=item['content_url'],
                                  callback=self.parse2,
                                  meta={'item': item})
             item["id"] = request.request_fingerprint(req)
             yield req
         a += 1
Example #34
File: RGSpider1.py, Project: st-li/rg_chn
    def parse_candidate_overview(self, response):
        if response.status == 429:
            lostitem_str = 'lost overview: ' + response.url
            self.lostitem_file.write(lostitem_str)
            self.lostitem_file.close()
            raise CloseSpider(reason='被封了,准备切换ip')  # reason: "got blocked, about to switch IP"
        print '-----------start to process: ' + response.url
        headers = response.request.headers
        headers["referer"] = response.url

        item = RGPersonItem()

        featured_researches = response.xpath('//div[contains(@class, "profile-highlights-publications")]').extract()
        address = DataFilter.simple_format(response.xpath('//div[contains(@class, "institution-location")]/text()').extract())
        add_list = address.split(',')
        add_len = len(add_list)
        if add_len == 3:
            city = add_list[0].strip()
            province = add_list[1].strip()
            country = add_list[2].strip()
        elif add_len == 2:
            city = add_list[0].strip()
            province = ''
            country = add_list[1].strip()
        elif add_len == 1:
            city = add_list[0].strip()
            province = ''
            country = ''
        else:
            city = address
            province = ''
            country = ''
        person_key = request_fingerprint(response.request)
        item['person_key'] = person_key
        item['fullname'] = DataFilter.simple_format(response.xpath('//a[@class = "ga-profile-header-name"]/text()').extract())
        item['target_sciences'] = DataFilter.simple_format(response.xpath('//*[@id="target-sciences"]/text()').extract())
        item['title'] = DataFilter.simple_format(response.xpath('//*[contains(@class,"profile-degree")]/div[@class="title"]/text()').extract())
        item['score'] = DataFilter.simple_format(response.xpath('//span[starts-with(@class, "score-link")]').extract())

        top_coauthors = response.xpath('//div[starts-with(@class, "authors-block")]//ul/li//h5[@class="ga-top-coauthor-name"]/a')
        item['co_authors'] = parse_text_by_multi_content(top_coauthors, "|")
        
        skills_expertise = response.xpath('//div[starts-with(@class, "profile-skills")]/ul/li//a[starts-with(@class, "keyword-list-token-text")]')
        item['skills'] = parse_text_by_multi_content(skills_expertise, "|")

        topics = response.xpath('//ul[@class="keyword-list clearfix"]/li//a[starts-with(@class, "keyword-list-token-text")]')
        item['topics'] = parse_text_by_multi_content(topics, "|")

        item['institution'] = DataFilter.simple_format(response.xpath('//div[starts-with(@class, "institution-name")]').extract())
        item['department'] = DataFilter.simple_format(response.xpath('//div[@class = "institution-dept"]').extract())
        
        item['city'] = city
        item['province'] = province
        item['country'] = country
        if featured_researches and country != 'China': 
            url = response.url + "/publications"
            yield item
            yield Request(url, headers=headers, callback=self.parse_contribution, dont_filter=True, meta={"person_key":person_key})
        else:
            print "--------Nothing to return, it is invalid--------"
Example #35
    def test_get_media_requests(self):
        # returns single Request (without callback)
        req = Request('http://url')
        item = dict(requests=req)  # pass a single item
        new_item = yield self.pipe.process_item(item, self.spider)
        assert new_item is item
        assert request_fingerprint(req) in self.info.downloaded

        # returns iterable of Requests
        req1 = Request('http://url1')
        req2 = Request('http://url2')
        item = dict(requests=iter([req1, req2]))
        new_item = yield self.pipe.process_item(item, self.spider)
        assert new_item is item
        assert request_fingerprint(req1) in self.info.downloaded
        assert request_fingerprint(req2) in self.info.downloaded
Example #36
    def parse(self, response):
        config_list = response.xpath('//div[@class="show2 left"]/div')
        num = [0, 1, 12]
        for i in range(len(config_list)):

            if i not in num:
                item = ScrapyCnbridgeV1Item()
                title = config_list[i].xpath(
                    './/div[@class="list5"]/a/text()').extract_first()
                link = self.base_url + config_list[i].xpath(
                    './/div[@class="list5"]/a/@href').extract_first()
                title_images = config_list[i].xpath(
                    './/img/@src').extract_first()
                # print(link, title, title_images)

                req = scrapy.Request(url=link,
                                     callback=self.parse_detail,
                                     meta={'item': item})

                item['id'] = request.request_fingerprint(req)
                item['title'] = title
                item['title_images'] = title_images
                item['content_url'] = link
                item['industry_categories'] = 'E'
                item['industry_Lcategories'] = '48'
                item['industry_Mcategories'] = '481'
                item['industry_Scategories'] = None
                item['information_categories'] = '行业资讯'

                yield req
Example #37
    def test_results_are_cached_across_multiple_items(self):
        rsp1 = Response('http://url1')
        req1 = Request('http://url1', meta=dict(response=rsp1))
        item = dict(requests=req1)
        new_item = yield self.pipe.process_item(item, self.spider)
        self.assertTrue(new_item is item)
        self.assertEqual(new_item['results'], [(True, rsp1)])

        # rsp2 is ignored, rsp1 must be in results because request fingerprints are the same
        req2 = Request(
            req1.url, meta=dict(response=Response('http://donot.download.me')))
        item = dict(requests=req2)
        new_item = yield self.pipe.process_item(item, self.spider)
        self.assertTrue(new_item is item)
        self.assertEqual(request_fingerprint(req1), request_fingerprint(req2))
        self.assertEqual(new_item['results'], [(True, rsp1)])
Example #38
 def request_seen(self, request):
     fp = request_fingerprint(request)
     if self.bf.isContains(fp):
         return True
     else:
         self.bf.insert(fp)
         return False
Example #39
    def test_get_media_requests(self):
        # returns single Request (without callback)
        info = self.pipe.spiderinfo[self.spider]
        req = Request("http://media.com/2.gif")
        item = dict(requests=req)  # pass a single item
        new_item = yield self.pipe.process_item(self.spider, item)
        assert new_item is item
        assert request_fingerprint(req) in info.downloaded

        # returns iterable of Requests
        req1 = Request("http://media.com/1.gif")
        req2 = Request("http://media.com/1.jpg")
        item = dict(requests=iter([req1, req2]))
        new_item = yield self.pipe.process_item(self.spider, item)
        assert new_item is item
        assert info.downloaded.get(request_fingerprint(req1)) is None
        assert info.downloaded.get(request_fingerprint(req2)) is None
Example #40
    def request_seen(self, request):
        fp = request_fingerprint(request)
        crawlid = request.meta['crawlid']
        key = self.key + ':' + crawlid

        added = self.server.sadd(key, fp)
        self.server.expire(key, self.timeout)
        return not added
Example #41
 def _request_key(self, spider, request):
     rfp = request_fingerprint(request)
     # We could disable the namespacing in sharded mode (old behaviour),
     # but keeping it allows us to merge collections later without
     # worrying about key conflicts.
     #if self.sharded:
     #    return rfp
     return '%s/%s' % (spider.name, rfp)
Example #42
    def request_seen(self, request):
        fp = request_fingerprint(request)
        added = self.server.sadd(self.key, fp)

        if self.ttl > 0:
            self.server.expire(self.key, self.ttl)

        return not added
Example #43
    def request_seen(self, request):
        fp = request_fingerprint(request)
        c_id = request.meta['crawlid']
        
        added = self.server.sadd(self.key + ":" + c_id, fp)
        self.server.expire(self.key + ":" + c_id, self.timeout)

        return not added
Example #44
def get_request_finger(url):
    """
    Get the fingerprint of a URL (query parameter order does not matter).
    :param url:
    :return:
    """
    req = Request(url=url)
    return request.request_fingerprint(req)
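
A quick check of the docstring's claim (a sketch, assuming get_request_finger and its `request` module import are in scope): request_fingerprint canonicalizes the URL, which sorts the query string, so reordering parameters yields the same fingerprint.

fp_a = get_request_finger('http://example.com/page?a=1&b=2')
fp_b = get_request_finger('http://example.com/page?b=2&a=1')
assert fp_a == fp_b  # same canonical URL, hence the same fingerprint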
Example #45
 def request_seen(self, request):
     fp = request_fingerprint(request)
     q = 'INSERT INTO crawl_history VALUES (?, ?)'
     args = (fp, request.url)
     try:
         self.conn.execute(q, args)
         self.conn.commit()
     except sqlite3.IntegrityError:
         return True
Example #46
 def request_seen(self, request):
     level = self.determine_level(request)
     if level == 5:
         fp = request_fingerprint(request)
         if fp in self.fingerprints:
             return True
         self.fingerprints.add(fp)
         if self.file:
             self.file.write(fp + os.linesep)
Example #47
 def request_fingerprint(self, request):
   # Order matters to lizard-rc.se.
   if urlparse.urlparse(request.url).netloc == "lizard-rc.se":
     fp = hashlib.sha1()
     fp.update(request.method)
     fp.update(request.url)
     fp.update(request.body or b'')
     return fp.hexdigest()
   return request_fingerprint(request)
Example #48
File: dupefilter.py, Project: seaify/chuck
 def request_seen(self, request):
     fp = request_fingerprint(request)
     if fp in self.fingerprints:
         return True
     self.fingerprints.add(fp)
     if 'not_write_file' in request.meta and request.meta['not_write_file']:
         return
     if self.file:
         self.file.write(fp + os.linesep)
Example #49
 def request_seen(self, request):
     fp = request_fingerprint(request)
     q = "INSERT INTO seen VALUES (?)"
     args = (fp,)
     try:
         self.conn.execute(q, args)
         self.conn.commit()
     except sqlite3.IntegrityError:
         return True
Example #50
File: filter.py, Project: songrgg/crawljobs
 def request_seen(self, request):
     """
         Use sismember to judge whether fp is a duplicate.
     """
     fp = request_fingerprint(request)
     if self.server.sismember(self.key, fp):
         return True
     self.server.sadd(self.key, fp)
     return False
Example #51
    def request_seen(self, request):
        fp = request_fingerprint(request)

        added = self.server.basic_publish(
            exchange='',
            routing_key=self.key,
            body=fp
        )

        return not added
Example #52
    def request_seen(self, request):
        added = True

        fp = request_fingerprint(request)
        result = self.collection.find({'fp': fp}).limit(1)
        if not result.count():
            self.collection.insert({'fp': fp})
            added = False

        return added
Example #53
 def process_spider_input(self, response, spider):
     self.pipe_writer.write_request(
         url=response.url,
         status=response.status,
         method=response.request.method,
         rs=len(response.body),
         duration=response.meta.get('download_latency', 0) * 1000,
         parent=response.meta.get(HS_PARENT_ID_KEY),
         fp=request_fingerprint(response.request),
     )
     self._seen[response] = next(self.request_id_sequence)
Example #54
def splash_request_fingerprint(request, include_headers=None):
    """ Request fingerprint that takes 'splash' meta key into account """

    fp = request_fingerprint(request, include_headers=include_headers)
    if 'splash' not in request.meta:
        return fp

    h = hashlib.sha1(fp)
    for key, value in sorted(request.meta['splash'].items()):
        h.update(key)
        h.update(str(value))
    return h.hexdigest()
Example #55
 def request_seen(self, request):
     """
         Use sismember to judge whether fp is a duplicate.
     """
     
     fp = request_fingerprint(request)
     if self.server.sismember(self.key,fp):
         return True
     self.server.sadd(self.key, fp)
     with open('sinxin_debug', 'a') as f:
         f.write("[url]%s [url2]%s [mode]%s\n" % (request.url, canonicalize_url(request.url), request.method))
     return False
Example #56
    def request_fingerprint(self, request):
        """Returns a fingerprint for a given request.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str

        """
        return request_fingerprint(request)
Example #57
    def request_seen(self, request):
        """
            Use sismember to judge whether fp is a duplicate.
        """
        thisurl = request.url
        if 'page' in thisurl:
            return False

        fp = request_fingerprint(request)
        if self.server.sismember(self.key,fp):
            return True
        self.server.sadd(self.key, fp)
        return False
Example #58
 def process_spider_input(self, response, spider):
     parent = response.meta.get('_hsparent')
     riq = hsref.job.requests.add(
         parent=parent,
         url=response.url,
         status=response.status,
         method=response.request.method,
         rs=len(response.body),
         duration=response.meta.get('download_latency', 0) * 1000,
         ts=time.time() * 1000,
         fp=request_fingerprint(response.request),
     )
     self._seen[response] = riq