def parsed_node_missing_scheme():
    """Call parse_node function with a link missing a http identifier.

    The record's link lacks a scheme, so an HTTP HEAD against the
    scheme-qualified URL is mocked to let the spider probe the content type.
    """
    spider = base_spider.BaseSpider()
    body = """
    <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
             http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
        <record>
            <metadata>
                <base_dc:dc xmlns:base_dc="http://oai.base-search.net/base_dc/"
                            xmlns:dc="http://purl.org/dc/elements/1.1/"
                            xsi:schemaLocation="http://oai.base-search.net/base_dc/
                            http://oai.base-search.net/base_dc/base_dc.xsd">
                    <base_dc:link>www.example.com</base_dc:link>
                </base_dc:dc>
            </metadata>
        </record>
    </OAI-PMH>
    """
    fake_response = fake_response_from_string(text=body)
    record_node = get_node(spider, 'OAI-PMH:record', text=body)
    fake_response.meta["record"] = record_node.extract_first()
    with requests_mock.Mocker() as http_mock:
        http_mock.head(
            'http://www.example.com',
            headers={
                'Content-Type': 'text/html',
            },
        )
        return spider.parse_node(fake_response, record_node)
def urls():
    """Return the URLs found in the first record of the XML test fixture."""
    spider = base_spider.BaseSpider()
    fixture_response = fake_response_from_file('base/test_1.xml')
    sel = Selector(fixture_response, type='xml')
    spider._register_namespaces(sel)
    record_nodes = sel.xpath('.//%s' % spider.itertag)
    first_record = record_nodes[0]
    return spider.get_urls_in_record(first_record)
def parsed_node():
    """Call parse_node function with a direct link.

    The link points straight at a PDF; a mocked HEAD response advertises
    ``application/pdf`` so the spider can classify it without network access.
    """
    # NOTE(review): assumes a @responses.activate (or equivalent) decorator is
    # applied where this helper runs -- confirm against the caller.
    pdf_url = "http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf"
    responses.add(
        responses.HEAD,
        pdf_url,
        status=200,
        content_type='application/pdf',
    )
    spider = base_spider.BaseSpider()
    body = """
    <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
             http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
        <record>
            <metadata>
                <base_dc:dc xmlns:base_dc="http://oai.base-search.net/base_dc/"
                            xmlns:dc="http://purl.org/dc/elements/1.1/"
                            xsi:schemaLocation="http://oai.base-search.net/base_dc/
                            http://oai.base-search.net/base_dc/base_dc.xsd">
                    <base_dc:link>http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf</base_dc:link>
                </base_dc:dc>
            </metadata>
        </record>
    </OAI-PMH>
    """
    fake_response = fake_response_from_string(text=body)
    record_nodes = get_node(spider, 'OAI-PMH:record', text=body)
    fake_response.meta["record"] = record_nodes[0].extract()
    parsed_item = spider.parse_node(fake_response, record_nodes[0])
    assert parsed_item
    assert parsed_item.record
    return parsed_item.record
def splash():
    """Call web scraper function, return final HEPRecord.

    Loads the XML fixture to obtain a record node, attaches it to a fake
    splash-page response, and hands that to ``scrape_for_pdf``.
    """
    spider = base_spider.BaseSpider()
    splash_page = fake_response_from_file('base/test_1_splash.htm')
    xml_response = fake_response_from_file('base/test_1.xml')
    sel = Selector(xml_response, type='xml')
    spider._register_namespaces(sel)
    record_nodes = sel.xpath('.//%s' % spider.itertag)
    splash_page.meta["record"] = record_nodes[0].extract()
    return spider.scrape_for_pdf(splash_page)
def record():
    """Return built HEPRecord from the BASE spider.

    Seeds ``response.meta`` with the first record node and a known handle
    URL, then asserts the spider builds a non-empty item from it.
    """
    spider = base_spider.BaseSpider()
    fixture_response = fake_response_from_file('base/test_1.xml')
    sel = Selector(fixture_response, type='xml')
    spider._register_namespaces(sel)
    record_nodes = sel.xpath('.//%s' % spider.itertag)
    fixture_response.meta["record"] = record_nodes[0].extract()
    fixture_response.meta["urls"] = ["http://hdl.handle.net/1885/10005"]
    built = spider.build_item(fixture_response)
    assert built
    return built
def parsed_node_without_link():
    """Call parse_node with a record whose link is not a direct document link."""
    spider = base_spider.BaseSpider()
    body = """
    <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
             http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
        <record>
            <metadata>
                <base_dc:dc xmlns:base_dc="http://oai.base-search.net/base_dc/"
                            xmlns:dc="http://purl.org/dc/elements/1.1/"
                            xsi:schemaLocation="http://oai.base-search.net/base_dc/
                            http://oai.base-search.net/base_dc/base_dc.xsd">
                    <base_dc:link>http://www.example.com</base_dc:link>
                </base_dc:dc>
            </metadata>
        </record>
    </OAI-PMH>
    """
    fake_response = fake_response_from_string(text=body)
    record_node = get_node(spider, 'OAI-PMH:record', text=body)
    # NOTE(review): .extract() yields a list here while sibling helpers store a
    # single string (node[0].extract()) -- confirm the spider accepts both.
    fake_response.meta["record"] = record_node.extract()
    return spider.parse_node(fake_response, record_node)
def splash():
    """Call web scraper function, return final HEPRecord.

    Mocks a HEAD request for the PDF link found on the splash page so the
    spider's content-type probe succeeds, then runs ``scrape_for_pdf``.
    """
    spider = base_spider.BaseSpider()
    splash_page = fake_response_from_file('base/test_1_splash.htm')
    xml_response = fake_response_from_file('base/test_1.xml')
    sel = Selector(xml_response, type='xml')
    spider._register_namespaces(sel)
    record_nodes = sel.xpath('.//%s' % spider.itertag)
    splash_page.meta["record"] = record_nodes[0].extract()
    with requests_mock.Mocker() as http_mock:
        http_mock.head(
            'http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf',
            headers={
                'Content-Type': 'text/html',
            },
        )
        parsed_item = spider.scrape_for_pdf(splash_page)
    assert parsed_item
    assert parsed_item.record
    return parsed_item.record
def direct_links():
    """Run ``find_direct_links`` on a single handle.net URL and return the result."""
    spider = base_spider.BaseSpider()
    candidate_urls = ["http://hdl.handle.net/1885/10005"]
    return spider.find_direct_links(candidate_urls)