示例#1
0
def record():
    """Scrape the INFN splash-page fixture and return the parsed record.

    The ``infn/test_splash.html`` fixture is loaded as a fake response and
    passed to ``InfnSpider.scrape_splash``. The result is asserted to be
    truthy before it is handed back to the caller.
    """
    scraped = infn_spider.InfnSpider().scrape_splash(
        fake_response_from_file('infn/test_splash.html')
    )
    assert scraped
    return scraped
示例#2
0
def test_parse_node_nolink():
    """Test parse_node when the node carries no splash page link.

    The first item produced by ``parse_node`` for the first matching node of
    the ``infn/test_1_nolink.html`` fixture should be a HEPRecord with
    minimal data.
    """
    spider = infn_spider.InfnSpider()
    response = fake_response_from_file('infn/test_1_nolink.html')
    selector = Selector(response, type='html')
    node = selector.xpath('//%s' % spider.itertag)[0]
    # Use the built-in next(): the iterator's .next() method exists only on
    # Python 2, whereas next() works on Python 2.6+ and Python 3 alike.
    record = next(spider.parse_node(response, node))

    assert isinstance(record, hepcrawl.items.HEPRecord)
示例#3
0
def test_parse_node():
    """Test parse_node when the node links to a splash page.

    The first item produced by ``parse_node`` should be a scrapy Request
    whose meta-dictionary holds both the splash link and the pdf links.
    """
    spider = infn_spider.InfnSpider()
    response = fake_response_from_file('infn/test_1.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)
    # Use the built-in next(): the iterator's .next() method exists only on
    # Python 2, whereas next() works on Python 2.6+ and Python 3 alike.
    record = next(spider.parse_node(response, nodes[0]))

    splash_link = "http://www.infn.it/thesis/thesis_dettaglio.php?tid=10136"
    pdf_link = "http://www.infn.it/thesis/PDF/getfile.php?filename=10136-Fedon-dottorato.pdf"

    assert isinstance(record, Request)
    assert record.meta["splash_link"] == splash_link
    assert record.meta["pdf_links"][0] == pdf_link
示例#4
0
def test_non_thesis():
    """Check that a Master's thesis splash page is skipped.

    ``scrape_splash`` is given a minimal page whose "Tipo" cell reads
    "Magister" and is expected to return None, because such theses are not
    wanted.
    """
    magister_page = """
    <html>
    <body>
    <tr valign="top">
      <td align="left" class="intest"> Tipo</td>
      <td align="left" class="bordo">Magister</td>
    </tr>
    </body>
    </html>
    """
    scraped = infn_spider.InfnSpider().scrape_splash(
        fake_response_from_string(magister_page)
    )

    assert scraped is None