def record():
    """Scrape the stored INFN splash page and return the parsed record.

    The HTML sample ``infn/test_splash.html`` is loaded as a fake response
    and passed to ``InfnSpider.scrape_splash``. The scraped result must be
    truthy, otherwise the assertion fails before anything is returned.
    """
    scraper = infn_spider.InfnSpider()
    splash_page = fake_response_from_file('infn/test_splash.html')
    result = scraper.scrape_splash(splash_page)
    assert result
    return result
def test_parse_node_nolink():
    """Test parse_node function.

    This time there is no splash page link. The result should be a
    HEPRecord with minimal data.
    """
    spider = infn_spider.InfnSpider()
    response = fake_response_from_file('infn/test_1_nolink.html')
    selector = Selector(response, type='html')
    node = selector.xpath('//%s' % spider.itertag)[0]

    # Use the builtin next() rather than the Python-2-only ``.next()``
    # method so the test runs on both Python 2.6+ and Python 3.
    # The result is not named ``record`` to avoid shadowing the
    # module-level ``record`` callable.
    parsed_item = next(spider.parse_node(response, node))

    assert isinstance(parsed_item, hepcrawl.items.HEPRecord)
def test_parse_node():
    """Test parse_node function.

    This should be a scrapy Request object. The object should have both
    the splash and pdf links in its meta-dictionary.
    """
    spider = infn_spider.InfnSpider()
    response = fake_response_from_file('infn/test_1.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)

    # Use the builtin next() rather than the Python-2-only ``.next()``
    # method so the test runs on both Python 2.6+ and Python 3.
    # The result is not named ``record`` to avoid shadowing the
    # module-level ``record`` callable.
    request = next(spider.parse_node(response, nodes[0]))

    splash_link = "http://www.infn.it/thesis/thesis_dettaglio.php?tid=10136"
    pdf_link = "http://www.infn.it/thesis/PDF/getfile.php?filename=10136-Fedon-dottorato.pdf"

    assert isinstance(request, Request)
    assert request.meta["splash_link"] == splash_link
    assert request.meta["pdf_links"][0] == pdf_link
def test_non_thesis():
    """Check that a Master's thesis splash page is skipped.

    A splash page whose "Tipo" field reads "Magister" is fed to
    ``scrape_splash``; the expected outcome is ``None`` because such
    theses are not wanted.
    """
    body = """ <html> <body> <tr valign="top"> <td align="left" class="intest"> Tipo</td> <td align="left" class="bordo">Magister</td> </tr> </body> </html> """
    infn = infn_spider.InfnSpider()
    splash_response = fake_response_from_string(body)

    result = infn.scrape_splash(splash_response)

    assert result is None