def parsed_node_missing_scheme():
    """Call parse_node function with a link missing a http identifier.

    The record's link lacks a scheme, so an HTTP HEAD against the
    scheme-qualified URL is mocked to let the spider probe the content type.
    """
    spider = base_spider.BaseSpider()
    body = """
    <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
             http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
        <record>
            <metadata>
                <base_dc:dc xmlns:base_dc="http://oai.base-search.net/base_dc/"
                            xmlns:dc="http://purl.org/dc/elements/1.1/"
                            xsi:schemaLocation="http://oai.base-search.net/base_dc/
                            http://oai.base-search.net/base_dc/base_dc.xsd">
                    <base_dc:link>www.example.com</base_dc:link>
                </base_dc:dc>
            </metadata>
        </record>
    </OAI-PMH>
    """
    fake_response = fake_response_from_string(text=body)
    record_node = get_node(spider, 'OAI-PMH:record', text=body)
    fake_response.meta["record"] = record_node.extract_first()
    with requests_mock.Mocker() as http_mock:
        http_mock.head(
            'http://www.example.com',
            headers={
                'Content-Type': 'text/html',
            },
        )
        return spider.parse_node(fake_response, record_node)
def urls():
    """Return the URLs found in the first record of the XML test fixture."""
    spider = base_spider.BaseSpider()
    fixture_response = fake_response_from_file('base/test_1.xml')
    sel = Selector(fixture_response, type='xml')
    spider._register_namespaces(sel)
    record_nodes = sel.xpath('.//%s' % spider.itertag)
    first_record = record_nodes[0]
    return spider.get_urls_in_record(first_record)
def parsed_node():
    """Call parse_node function with a direct link.

    The link points straight at a PDF; a mocked HEAD response advertises
    ``application/pdf`` so the spider can classify it without network access.
    """
    # NOTE(review): assumes a @responses.activate (or equivalent) decorator is
    # applied where this helper runs -- confirm against the caller.
    pdf_url = "http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf"
    responses.add(
        responses.HEAD,
        pdf_url,
        status=200,
        content_type='application/pdf',
    )
    spider = base_spider.BaseSpider()
    body = """
    <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
             http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
        <record>
            <metadata>
                <base_dc:dc xmlns:base_dc="http://oai.base-search.net/base_dc/"
                            xmlns:dc="http://purl.org/dc/elements/1.1/"
                            xsi:schemaLocation="http://oai.base-search.net/base_dc/
                            http://oai.base-search.net/base_dc/base_dc.xsd">
                    <base_dc:link>http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf</base_dc:link>
                </base_dc:dc>
            </metadata>
        </record>
    </OAI-PMH>
    """
    fake_response = fake_response_from_string(text=body)
    record_nodes = get_node(spider, 'OAI-PMH:record', text=body)
    fake_response.meta["record"] = record_nodes[0].extract()
    parsed_item = spider.parse_node(fake_response, record_nodes[0])
    assert parsed_item
    assert parsed_item.record
    return parsed_item.record
def splash():
    """Call web scraper function, return final HEPRecord.

    Loads the XML fixture to obtain a record node, attaches it to a fake
    splash-page response, and hands that to ``scrape_for_pdf``.
    """
    spider = base_spider.BaseSpider()
    splash_page = fake_response_from_file('base/test_1_splash.htm')
    xml_response = fake_response_from_file('base/test_1.xml')
    sel = Selector(xml_response, type='xml')
    spider._register_namespaces(sel)
    record_nodes = sel.xpath('.//%s' % spider.itertag)
    splash_page.meta["record"] = record_nodes[0].extract()
    return spider.scrape_for_pdf(splash_page)
def record():
    """Return built HEPRecord from the BASE spider.

    Seeds ``response.meta`` with the first record node and a known handle
    URL, then asserts the spider builds a non-empty item from it.
    """
    spider = base_spider.BaseSpider()
    fixture_response = fake_response_from_file('base/test_1.xml')
    sel = Selector(fixture_response, type='xml')
    spider._register_namespaces(sel)
    record_nodes = sel.xpath('.//%s' % spider.itertag)
    fixture_response.meta["record"] = record_nodes[0].extract()
    fixture_response.meta["urls"] = ["http://hdl.handle.net/1885/10005"]
    built = spider.build_item(fixture_response)
    assert built
    return built
def parsed_node_without_link():
    """Call parse_node with a record whose link is not a direct document link."""
    spider = base_spider.BaseSpider()
    body = """
    <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
             http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
        <record>
            <metadata>
                <base_dc:dc xmlns:base_dc="http://oai.base-search.net/base_dc/"
                            xmlns:dc="http://purl.org/dc/elements/1.1/"
                            xsi:schemaLocation="http://oai.base-search.net/base_dc/
                            http://oai.base-search.net/base_dc/base_dc.xsd">
                    <base_dc:link>http://www.example.com</base_dc:link>
                </base_dc:dc>
            </metadata>
        </record>
    </OAI-PMH>
    """
    fake_response = fake_response_from_string(text=body)
    record_node = get_node(spider, 'OAI-PMH:record', text=body)
    # NOTE(review): .extract() yields a list here while sibling helpers store a
    # single string (node[0].extract()) -- confirm the spider accepts both.
    fake_response.meta["record"] = record_node.extract()
    return spider.parse_node(fake_response, record_node)
def splash():
    """Call web scraper function, return final HEPRecord.

    Mocks a HEAD request for the PDF link found on the splash page so the
    spider's content-type probe succeeds, then runs ``scrape_for_pdf``.
    """
    spider = base_spider.BaseSpider()
    splash_page = fake_response_from_file('base/test_1_splash.htm')
    xml_response = fake_response_from_file('base/test_1.xml')
    sel = Selector(xml_response, type='xml')
    spider._register_namespaces(sel)
    record_nodes = sel.xpath('.//%s' % spider.itertag)
    splash_page.meta["record"] = record_nodes[0].extract()
    with requests_mock.Mocker() as http_mock:
        http_mock.head(
            'http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf',
            headers={
                'Content-Type': 'text/html',
            },
        )
        parsed_item = spider.scrape_for_pdf(splash_page)
    assert parsed_item
    assert parsed_item.record
    return parsed_item.record
def direct_links():
    """Run ``find_direct_links`` on a single handle.net URL and return the result."""
    spider = base_spider.BaseSpider()
    candidate_urls = ["http://hdl.handle.net/1885/10005"]
    return spider.find_direct_links(candidate_urls)