def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Yield the record built by the PoS spider from the sample XML.

    Pushes the parsed item through ``InspireCeleryPushPipeline`` and
    yields ``crawl_result['record']``; ``clean_dir()`` runs on teardown.
    """
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    # ``spider.parse`` yields scrape requests; take the first one.
    # Use the ``next()`` builtin rather than the Python-2-only
    # ``.next()`` method so the fixture also runs on Python 3.
    request = next(
        spider.parse(
            fake_response_from_file(
                file_name=str('pos/sample_pos_record.xml'),
            )
        )
    )
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        **{'encoding': 'utf-8'}
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    # The request callback yields parsed items; advance it once.
    parsed_item = next(request.callback(response))
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()
def results(): """Return results generator from the arxiv spider. All fields, one record. """ def _get_processed_item(item, spider): record = pipeline.process_item(item, spider) validate(record, 'hep') assert record return record crawler = Crawler(spidercls=arxiv_spider.ArxivSpider) spider = arxiv_spider.ArxivSpider.from_crawler(crawler) fake_response = fake_response_from_file( 'arxiv/sample_arxiv_record0.xml', response_type=TextResponse, ) test_selectors = fake_response.xpath('.//record') parsed_items = [spider.parse_record(sel) for sel in test_selectors] pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) yield [ _get_processed_item(parsed_item, spider) for parsed_item in parsed_items ] clean_dir()
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Yield the record produced by the PoS spider from the sample XML.

    The parsed item is pushed through ``InspireCeleryPushPipeline``
    before being yielded; ``clean_dir()`` runs on teardown.
    """
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    # ``spider.parse`` yields scrape requests; take the first one.
    # ``next()`` builtin replaces the Python-2-only ``.next()`` method
    # so the fixture also runs on Python 3.
    request = next(
        spider.parse(
            fake_response_from_file(
                file_name=str('pos/sample_pos_record.xml'),
            )
        )
    )
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        **{'encoding': 'utf-8'}
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    # The request callback yields parsed items; advance it once.
    parsed_item = next(request.callback(response))
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    yield parsed_record

    clean_dir()
def many_results(spider): """Return results generator from the arxiv spider. Tricky fields, many records. """ def _get_processed_record(item, spider): record = pipeline.process_item(item, spider) return record parsed_items = list( spider.parse( fake_response_from_file( 'arxiv/sample_arxiv_record.xml', response_type=TextResponse, ))) assert parsed_items pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) yield [ _get_processed_record(parsed_item, spider) for parsed_item in parsed_items ] clean_dir()
def results(): """Return results generator from the arxiv spider. All fields, one record. """ def _get_processed_item(item, spider): record = pipeline.process_item(item, spider) validate(record, 'hep') assert record return record crawler = Crawler(spidercls=arxiv_spider.ArxivSpider) spider = arxiv_spider.ArxivSpider.from_crawler(crawler) parsed_items = list( spider.parse( fake_response_from_file( 'arxiv/sample_arxiv_record0.xml', response_type=TextResponse, ))) pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) return [ _get_processed_item(parsed_item, spider) for parsed_item in parsed_items ]
def many_results(spider): """Return results generator from the arxiv spider. Tricky fields, many records. """ from scrapy.http import TextResponse records = list( spider.parse( fake_response_from_file('arxiv/sample_arxiv_record.xml', response_type=TextResponse))) assert records pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) return [pipeline.process_item(record, spider) for record in records]
def record(scrape_pos_page_body): """Return results generator from the PoS spider.""" crawler = Crawler(spidercls=pos_spider.POSSpider) spider = pos_spider.POSSpider.from_crawler(crawler) request = spider.parse( fake_response_from_file('pos/sample_pos_record.xml')).next() response = HtmlResponse(url=request.url, request=request, body=scrape_pos_page_body, **{'encoding': 'utf-8'}) assert response pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) record = request.callback(response) return pipeline.process_item(record, spider)
def get_records(response_file_name):
    """Return a generator of WSP spider records run through the pipeline.

    Spider setup and the env-var payload happen eagerly at call time;
    only the per-record pipeline processing is lazy.
    """
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    parsed = spider.parse(
        fake_response_from_file(
            file_name=response_file_name,
            response_type=TextResponse,
        )
    )

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (
        pipeline.process_item(item, spider).record
        for item in parsed
    )
def get_records(response_file_name):
    """Return a generator of WSP spider records run through the pipeline.

    Spider setup and the env-var payload happen eagerly at call time;
    only the per-record pipeline processing is lazy.
    """
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    parsed = spider.parse(
        fake_response_from_file(
            file_name=response_file_name,
            response_type=TextResponse,
        )
    )

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (
        pipeline.process_item(item, spider)['record']
        for item in parsed
    )
def record(): """Return results generator from the crossref spider. All fields, one record. """ def _get_record_from_processed_item(item, spider): crawl_result = pipeline.process_item(item, spider) validate(crawl_result['record'], 'hep') assert crawl_result return crawl_result['record'] crawler = Crawler(spidercls=crossref_spider.CrossrefSpider) spider = crossref_spider.CrossrefSpider.from_crawler(crawler, 'fakedoi') fake_response = fake_response_from_file( 'crossref/sample_crossref_record.json', response_type=TextResponse, ) parsed_items = spider.parse(fake_response) pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) yield _get_record_from_processed_item(parsed_items, spider) clean_dir()
def many_results(spider): """Return results generator from the arxiv spider. Tricky fields, many records. """ def _get_processed_record(item, spider): record = pipeline.process_item(item, spider) return record fake_response = fake_response_from_file( 'arxiv/sample_arxiv_record.xml', response_type=TextResponse, ) test_selectors = fake_response.xpath('.//record') parsed_items = [spider.parse_record(sel) for sel in test_selectors] pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) yield [ _get_processed_record(parsed_item, spider) for parsed_item in parsed_items ] clean_dir()
def many_results(spider): """Return results generator from the arxiv spider. Tricky fields, many records. """ def _get_processed_record(item, spider): crawl_result = pipeline.process_item(item, spider) return crawl_result['record'] fake_response = fake_response_from_file( 'arxiv/sample_arxiv_record.xml', response_type=TextResponse, ) test_selectors = fake_response.xpath('.//record') parsed_items = [spider.parse_record(sel) for sel in test_selectors] pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) yield [ _get_processed_record(parsed_item, spider) for parsed_item in parsed_items ] clean_dir()
def results(): """Return results generator from the arxiv spider. All fields, one record. """ def _get_record_from_processed_item(item, spider): crawl_result = pipeline.process_item(item, spider) validate(crawl_result['record'], 'hep') assert crawl_result return crawl_result['record'] crawler = Crawler(spidercls=arxiv_spider.ArxivSpider) spider = arxiv_spider.ArxivSpider.from_crawler(crawler) fake_response = fake_response_from_file( 'arxiv/sample_arxiv_record0.xml', response_type=TextResponse, ) test_selectors = fake_response.xpath('.//record') parsed_items = [spider.parse_record(sel) for sel in test_selectors] pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) yield [_get_record_from_processed_item(parsed_item, spider) for parsed_item in parsed_items] clean_dir()