Example #1
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = spider.parse(
        fake_response_from_file(
            file_name=str('pos/sample_pos_record.xml'),
        )
    ).next()
    response = HtmlResponse(url=request.url,
                            request=request,
                            body=scrape_pos_conference_paper_page_body,
                            **{'encoding': 'utf-8'})
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = request.callback(response).next()
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    yield parsed_record

    clean_dir()
Example #2
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = spider.parse(
        fake_response_from_file(
            file_name=str('pos/sample_pos_record.xml'),
        )
    ).next()
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        **{'encoding': 'utf-8'}
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = request.callback(response).next()
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()
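
A minimal usage sketch of the `generated_conference_paper` fixture defined above, assuming it is registered with `@pytest.fixture` and that the processed record is dict-like; the `titles` key is illustrative, not taken from the sample file:

def test_generated_conference_paper_has_titles(generated_conference_paper):
    # 'titles' is a hypothetical field; substitute one known to exist in
    # pos/sample_pos_record.xml.
    assert generated_conference_paper.get('titles')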
Example #3
def authors():
    """Returns get_authors() output."""
    spider = phil_spider.PhilSpider()
    response = fake_response_from_file('phil/test_thesis.json')
    jsonrecord = json.loads(response.body_as_unicode())
    response.meta["jsonrecord"] = jsonrecord[0]
    return spider.get_authors(jsonrecord[0]['authors'])
Example #4
def many_results(spider):
    """Return results generator from the arxiv spider. Tricky fields, many
    records.
    """
    def _get_processed_record(item, spider):
        record = pipeline.process_item(item, spider)
        return record

    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'arxiv/sample_arxiv_record.xml',
                response_type=TextResponse,
            )))

    assert parsed_items
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
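
The `many_results` fixture above expects a `spider` argument that pytest resolves from another fixture; a plausible sketch of that companion fixture, assuming the arXiv Crawler/from_crawler setup used in the neighbouring examples:

def spider():
    # Hypothetical companion fixture mirroring the ArxivSpider setup shown
    # in the other arXiv examples; register it with @pytest.fixture.
    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    return arxiv_spider.ArxivSpider.from_crawler(crawler)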
Example #5
def results():
    """Return results generator from the arxiv spider. All fields, one record.
    """
    def _get_processed_item(item, spider):
        record = pipeline.process_item(item, spider)
        validate(record, 'hep')
        assert record
        return record

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'arxiv/sample_arxiv_record0.xml',
                response_type=TextResponse,
            )))

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return [
        _get_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]
Example #6
def urls():
    spider = base_spider.BaseSpider()
    response = fake_response_from_file('base/test_1.xml')
    selector = Selector(response, type='xml')
    spider._register_namespaces(selector)
    nodes = selector.xpath('.//%s' % spider.itertag)
    return spider.get_urls_in_record(nodes[0])
Example #7
def record(scrape_pos_page_body):
    """Return the results of the spider."""
    spider = dnb_spider.DNBSpider()

    with requests_mock.Mocker() as mock:
        mock.head('http://nbn-resolving.de/urn:nbn:de:hebis:30:3-386257',
                  headers={
                      'Content-Type': 'text/html',
                  })
        mock.head('http://d-nb.info/1079912991/34',
                  headers={
                      'Content-Type': 'application/pdf;charset=base64',
                  })
        mock.head(
            'http://publikationen.ub.uni-frankfurt.de/frontdoor/index/index/docId/38625',
            headers={
                'Content-Type': 'text/html',
            })
        request = spider.parse(
            fake_response_from_file('dnb/test_1.xml')).next()
        response = HtmlResponse(url=request.url,
                                request=request,
                                body=scrape_pos_page_body,
                                **{'encoding': 'utf-8'})

        parsed_item = request.callback(response)
        assert parsed_item
        assert parsed_item.record

        return parsed_item.record
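
The `requests_mock` pattern above can be shown in isolation: inside a `Mocker` context, HEAD requests to registered URLs return the canned headers instead of touching the network (the URL below is illustrative):

import requests
import requests_mock

with requests_mock.Mocker() as mock:
    # Register a canned HEAD response carrying only a Content-Type header.
    mock.head('http://example.org/some.pdf',
              headers={'Content-Type': 'application/pdf;charset=base64'})
    response = requests.head('http://example.org/some.pdf')
    assert response.headers['Content-Type'].startswith('application/pdf')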
Example #8
def results():
    """Return results generator from the arxiv spider. All fields, one record.
    """
    def _get_processed_item(item, spider):
        record = pipeline.process_item(item, spider)
        validate(record, 'hep')
        assert record
        return record

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #9
def parse_requests():
    """Returns a fake request to the record file.

    With links.
    """
    spider = phil_spider.PhilSpider()
    response = fake_response_from_file('phil/test_thesis.json')
    return spider.parse(response)
Example #10
def test_results_jats_parser_handle_date_absence():
    from scrapy.http import XmlResponse

    spider = aps_spider.APSSpider(aps_token="secret")
    fake_response = fake_response_from_file(
        'aps/PhysRevD.96.095036_no_date_nodes.xml',
        response_type=XmlResponse,
    )
    record = spider._parse_jats(fake_response).record
    assert validate(record, 'hep') is None
Example #11
def record():
    """Return scraping results from the MIT spider."""
    spider = mit_spider.MITSpider()
    response = fake_response_from_file('mit/test_splash.html')

    parsed_item = spider.build_item(response)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #12
def record():
    """Return scraping results from the INFN spider."""
    spider = infn_spider.InfnSpider()
    response = fake_response_from_file('infn/test_splash.html')

    parsed_item = spider.scrape_splash(response)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #13
def results():
    """Return results generator from the Alpha spider."""
    spider = alpha_spider.AlphaSpider()
    parsed_items = list(
        spider.parse(fake_response_from_file('alpha/test_1.htm')))

    records = [parsed_item.record for parsed_item in parsed_items]
    assert records

    return records
Example #14
def json_spider_record(tmpdir):
    from scrapy.http import TextResponse
    spider = arxiv_spider.ArxivSpider()
    items = spider.parse(
        fake_response_from_file(
            'arxiv/sample_arxiv_record10.xml',
            response_type=TextResponse,
        ), )
    parsed_record = items.next()
    assert parsed_record
    return spider, parsed_record
Example #15
def test_results_from_jats():
    """Get and validate results from mocking a JATS response."""
    from scrapy.http import XmlResponse

    spider = aps_spider.APSSpider()
    fake_response = fake_response_from_file(
        'aps/PhysRevD.96.095036.xml',
        response_type=XmlResponse,
    )
    record = spider._parse_jats(fake_response).record
    assert validate(record, 'hep') is None
Example #16
def parsed_node():
    """Call parse_node and return its request call."""
    spider = mit_spider.MITSpider()
    response = fake_response_from_file('mit/test_list.html')
    tag = spider.itertag
    node = get_node(spider, tag, response, rtype="html")

    parsed_item = spider.parse_node(response, node).next()
    assert parsed_item

    return parsed_item
Example #17
def record():
    """Return results from the MAGIC spider. First parse node, then scrape,
    and finally build the record.
    """
    spider = magic_spider.MagicSpider()
    response = fake_response_from_file('magic/test_1.html')
    selector = Selector(response, type='html')
    node = selector.xpath('//%s' % spider.itertag)[0]
    spider.domain = "file:///tests/responses/magic/"
    parsed_node = spider.parse_node(response, node)

    splash_response = fake_response_from_file('magic/test_splash.html')
    splash_response.meta["date"] = parsed_node.meta["date"]
    splash_response.meta["urls"] = parsed_node.meta["urls"]

    parsed_item = spider.scrape_for_pdf(splash_response).next()
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #18
def test_results_from_jats():
    """Get and validate results from mocking a JATS response."""
    from scrapy.http import XmlResponse

    spider = aps_spider.APSSpider(aps_token="secret")
    fake_response = fake_response_from_file(
        'aps/PhysRevD.96.095036.xml',
        response_type=XmlResponse,
    )
    record = spider._parse_jats(fake_response).record
    assert validate(record, 'hep') is None
Example #19
def record():
    """Return the results from the Hindawi spider."""
    spider = hindawi_spider.HindawiSpider()
    response = fake_response_from_file("hindawi/test_1.xml")
    nodes = get_node(spider, "//marc:record", response)

    parsed_item = spider.parse_node(response, nodes[0])
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #20
def test_error_handler(crawler):
    """Test ErrorHandler extension."""
    handler = ErrorHandler.from_crawler(crawler)
    response = crawler.spider.parse(
        fake_response_from_file('world_scientific/sample_ws_record.xml'))
    assert 'errors' not in crawler.spider.state
    handler.spider_error("Some failure", response, crawler.spider)

    assert 'errors' in crawler.spider.state
    assert crawler.spider.state['errors'][0]["exception"] == "Some failure"
    assert crawler.spider.state['errors'][0]["sender"] == response
Example #21
def record():
    """Return the results from the Hindawi spider."""
    spider = hindawi_spider.HindawiSpider()
    response = fake_response_from_file("hindawi/test_1.xml")
    nodes = get_node(spider, "//marc:record", response)

    parsed_item = spider.parse_node(response, nodes[0])
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #22
def record():
    """Return results from the T2K spider."""
    spider = t2k_spider.T2kSpider()
    response = fake_response_from_file('t2k/test_1.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)
    spider.domain = "file:///tests/responses/t2k/"
    parsed_node = spider.parse_node(response, nodes[0])

    splash_response = fake_response_from_file('t2k/001.html')
    splash_response.meta["date"] = parsed_node.meta["date"]
    splash_response.meta["title"] = parsed_node.meta["title"]
    splash_response.meta["urls"] = parsed_node.meta["urls"]
    splash_response.meta["authors"] = parsed_node.meta["authors"]

    parsed_item = spider.scrape_for_pdf(splash_response).next()
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #23
def test_error_handler(crawler):
    """Test ErrorHandler extension."""
    handler = ErrorHandler.from_crawler(crawler)
    response = crawler.spider.parse(fake_response_from_file(
        'world_scientific/sample_ws_record.xml'
    ))
    assert 'errors' not in crawler.spider.state
    handler.spider_error("Some failure", response, crawler.spider)

    assert 'errors' in crawler.spider.state
    assert crawler.spider.state['errors'][0]["exception"] == "Some failure"
    assert crawler.spider.state['errors'][0]["sender"] == response
Example #24
def record():
    """Return results generator from the Alpha spider."""
    spider = phenix_spider.PhenixSpider()
    response = fake_response_from_file('phenix/test_1.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)

    parsed_item = spider.parse_node(response, nodes[0])
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #25
def record():
    """Return results generator from the WSP spider."""
    spider = iop_spider.IOPSpider()
    response = fake_response_from_file('iop/xml/test_standard.xml')
    node = get_node(spider, "Article", response)
    spider.pdf_files = TEST_PDF_DIR

    parsed_item = spider.parse_node(response, node)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #26
def record():
    """Return results generator from the WSP spider."""
    spider = iop_spider.IOPSpider()
    response = fake_response_from_file('iop/xml/test_standard.xml')
    node = get_node(spider, "Article", response)
    spider.pdf_files = TEST_PDF_DIR

    parsed_item = spider.parse_node(response, node)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #27
def test_parse_node_nolink():
    """Test parse_node function. This time there is no splash page link.
    The result should be a HEPRecord with minimal data.
    """
    spider = infn_spider.InfnSpider()
    response = fake_response_from_file('infn/test_1_nolink.html')
    selector = Selector(response, type='html')
    node = selector.xpath('//%s' % spider.itertag)[0]
    parsed_item = spider.parse_node(response, node).next()
    assert parsed_item
    assert parsed_item.record

    assert isinstance(parsed_item.record, hepcrawl.items.HEPRecord)
Example #28
def record():
    """Return results from the Brown spider."""
    spider = brown_spider.BrownSpider()
    with requests_mock.Mocker() as mock:
        mock.head(
            'http://www.example.com/studio/item/bdr:11303/PDF/',
            headers={
                'Content-Type': 'text/html',
            },
        )
        response = fake_response_from_file('brown/test_1.json')
        jsonresponse = json.loads(response.body_as_unicode())
        jsonrecord = jsonresponse["items"]["docs"][0]
        jsonrecord["uri"] = "brown/test_splash.html"

        splash_response = fake_response_from_file('brown/test_splash.html')
        splash_response.meta["jsonrecord"] = jsonrecord

        parsed_item = spider.scrape_splash(splash_response)
        assert parsed_item
        assert parsed_item.record

        return parsed_item.record
Example #29
def splash():
    """Call web scraper function, return final HEPRecord."""
    spider = base_spider.BaseSpider()
    splash_response = fake_response_from_file('base/test_1_splash.htm')
    response = fake_response_from_file('base/test_1.xml')
    selector = Selector(response, type='xml')
    spider._register_namespaces(selector)
    nodes = selector.xpath('.//%s' % spider.itertag)
    splash_response.meta["record"] = nodes[0].extract()

    with requests_mock.Mocker() as mock:
        mock.head(
            'http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf',
            headers={
                'Content-Type': 'text/html',
            },
        )

        parsed_item = spider.scrape_for_pdf(splash_response)
        assert parsed_item
        assert parsed_item.record

        return parsed_item.record
Example #30
def non_url():
    """Parse the node without any links. Should
    take straight to `build_item` and build the HEPRecord.
    """
    spider = t2k_spider.T2kSpider()
    response = fake_response_from_file('t2k/test_1_nourl.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)

    parsed_item = spider.parse_node(response, nodes[0]).next()
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #31
def json_spider_record(tmpdir):
    from scrapy.http import TextResponse
    spider = arxiv_spider.ArxivSpider()
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record10.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    items = (spider.parse_record(sel) for sel in test_selectors)
    parsed_record = items.next()
    assert parsed_record
    yield spider, parsed_record

    clean_dir()
Example #32
def journal():
    """Return results generator from the Phil spider.

    Journal specific.
    """
    spider = phil_spider.PhilSpider()
    response = fake_response_from_file('phil/test_journal.json')
    jsonrecord = json.loads(response.body_as_unicode())
    response.meta["jsonrecord"] = jsonrecord[0]

    parsed_item = spider.build_item(response)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #33
def record_rich(package_rich):
    """Return results from the EDP spider with 'rich' format.

    This is not an open access journal, so no splash scraping.
    """
    spider = edp_spider.EDPSpider()
    xml_path = package_rich.url.replace("file://", "")
    fake_resp = fake_response_from_file(xml_path)
    fake_resp.meta["rich"] = True
    node = get_node(spider, "//EDPSArticle", fake_resp)[0]

    parsed_item = spider.parse_node(fake_resp, node)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #34
def get_records(response_file_name):
    """Return all results generator from the WSP spider via pipelines."""
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
        fake_response_from_file(file_name=response_file_name,
                                response_type=TextResponse))

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (pipeline.process_item(record, spider).record for record in records)
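
A possible consuming test for `get_records`, assuming it is importable as a plain helper; the sample file name is reused from the ErrorHandler examples, and the `validate(..., 'hep') is None` convention follows the APS tests above:

def test_wsp_records_are_valid_hep():
    for record in get_records('world_scientific/sample_ws_record.xml'):
        assert validate(record, 'hep') is None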
Example #35
def record_rich(package_rich):
    """Return results from the EDP spider with 'rich' format.

    This is not an open access journal, so no splash scraping.
    """
    spider = edp_spider.EDPSpider()
    xml_path = package_rich.url.replace("file://", "")
    fake_resp = fake_response_from_file(xml_path)
    fake_resp.meta["rich"] = True
    node = get_node(spider, "//EDPSArticle", fake_resp)[0]

    parsed_item = spider.parse_node(fake_resp, node)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #36
def record():
    """Return built HEPRecord from the BASE spider."""
    spider = base_spider.BaseSpider()
    response = fake_response_from_file('base/test_1.xml')

    selector = Selector(response, type='xml')
    spider._register_namespaces(selector)
    nodes = selector.xpath('.//%s' % spider.itertag)
    response.meta["record"] = nodes[0].extract()
    response.meta["urls"] = ["http://hdl.handle.net/1885/10005"]

    parsed_item = spider.build_item(response)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example #37
def record_jats(package_jats, scrape_pos_page_body):
    """Return results from the EDP spider with JATS format.

    This is an open access journal, so we can scrape the splash page.
    """
    spider = edp_spider.EDPSpider()
    xml_path = package_jats.url.replace("file://", "")
    fake_resp = fake_response_from_file(xml_path)
    node = get_node(spider, "//article", fake_resp)[0]
    request = spider.parse_node(fake_resp, node)
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_page_body,
        **{'encoding': 'utf-8'}
    )

    parsed_item = request.callback(response)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
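
Several fixtures repeat the same step: answer a request produced by the spider with a canned `HtmlResponse`, then drive `request.callback(response)`. A small helper that could capture that pattern (not part of the original tests):

def fake_html_response_for(request, body):
    # Build an HtmlResponse that pretends to be the answer to `request`,
    # so its callback can be exercised with local test data.
    return HtmlResponse(url=request.url, request=request, body=body,
                        encoding='utf-8')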
Example #38
def get_records(response_file_name):
    """Return all results generator from the WSP spider via pipelines."""
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
        fake_response_from_file(
            file_name=response_file_name,
            response_type=TextResponse
        )
    )

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (
        pipeline.process_item(record, spider)['record']
        for record in records
    )
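
The three `SCRAPY_*` environment variables set at the top of the pipeline-based fixtures could also be supplied through pytest's `monkeypatch`, which undoes the changes after each test; a sketch of that alternative (the fixture name is hypothetical):

def scrapy_env(monkeypatch):
    # Hypothetical fixture: same payload variables as above, restored
    # automatically by monkeypatch after each test.
    monkeypatch.setenv('SCRAPY_JOB', 'scrapy_job')
    monkeypatch.setenv('SCRAPY_FEED_URI', 'scrapy_feed_uri')
    monkeypatch.setenv('SCRAPY_LOG_FILE', 'scrapy_log_file')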
Example #39
def results_from_json():
    """Return results by parsing a JSON file."""
    from scrapy.http import TextResponse

    spider = aps_spider.APSSpider()
    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'aps/aps_single_response.json',
                response_type=TextResponse,
            )
        )
    )

    class MockFailure:
        """Mock twisted.python.failure.Failure, failure on JATS request."""
        def __init__(self):
            self.request = parsed_items[0]

    records = [spider._parse_json_on_failure(MockFailure()).record]

    assert records
    return records
Example #40
def many_results(spider):
    """Return results generator from the arxiv spider. Tricky fields, many
    records.
    """
    def _get_processed_record(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        return crawl_result['record']

    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #41
def results():
    """Return results generator from the arxiv spider. All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_record_from_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
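
Finally, a sketch of a test consuming the `results` fixture above, assuming it is registered with `@pytest.fixture`; the expected count follows the fixture docstring ("all fields, one record"):

def test_arxiv_results_single_record(results):
    # One record is expected per the fixture docstring; field-level checks
    # would depend on the contents of sample_arxiv_record0.xml.
    assert len(results) == 1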