def test_no_dois_jats():
    """Test parsing when no DOI in record. JATS format."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0"> <front> <article-meta> <article-id pub-id-type="publisher-id">aa14485-10</article-id><article-id pub-id-type="other">2010A%26A...516A..97N</article-id> <title-group> <article-title xml:lang="en">Dielectronic recombination of argon-like ions</article-title> </title-group> </article-meta> </front> </article> """
    response = fake_response_from_string(body)
    article_node = get_node(spider, "//article", response)[0]
    item = spider.parse_node(response, article_node)
    assert item
    assert item.record
    hep_record = item.record
    # A record without any <article-id pub-id-type="doi"> must not grow a
    # "dois" field but must still build a valid HEPRecord.
    assert isinstance(hep_record, HEPRecord)
    assert "dois" not in hep_record
def parsed_node():
    """Call parse_node function with a direct link"""
    pdf_url = "http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf"
    # Stub the HEAD request so the spider sees a direct PDF link.
    responses.add(
        responses.HEAD, pdf_url, status=200, content_type='application/pdf')
    spider = base_spider.BaseSpider()
    body = """ <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"> <record> <metadata> <base_dc:dc xmlns:base_dc="http://oai.base-search.net/base_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://oai.base-search.net/base_dc/ http://oai.base-search.net/base_dc/base_dc.xsd"> <base_dc:link>http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf</base_dc:link> </base_dc:dc> </metadata> </record> </OAI-PMH> """
    response = fake_response_from_string(text=body)
    record_nodes = get_node(spider, 'OAI-PMH:record', text=body)
    first_record = record_nodes[0]
    response.meta["record"] = first_record.extract()
    item = spider.parse_node(response, first_record)
    assert item
    assert item.record
    return item.record
def parsed_node_missing_scheme():
    """Call parse_node function with a link missing a http identifier."""
    spider = base_spider.BaseSpider()
    body = """ <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"> <record> <metadata> <base_dc:dc xmlns:base_dc="http://oai.base-search.net/base_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://oai.base-search.net/base_dc/ http://oai.base-search.net/base_dc/base_dc.xsd"> <base_dc:link>www.example.com</base_dc:link> </base_dc:dc> </metadata> </record> </OAI-PMH> """
    response = fake_response_from_string(text=body)
    record_node = get_node(spider, 'OAI-PMH:record', text=body)
    response.meta["record"] = record_node.extract_first()
    # Mock the HEAD request the spider issues once it has prepended the
    # missing "http://" scheme to the bare host name.
    with requests_mock.Mocker() as head_mock:
        head_mock.head(
            'http://www.example.com',
            headers={'Content-Type': 'text/html'},
        )
        return spider.parse_node(response, record_node)
def test_no_author_no_date_no_url():
    """Parse the node in the listing without author, date, or url.

    Should take straight to `build_item` and build the HEPRecord.
    """
    spider = magic_spider.MagicSpider()
    body = """ <html> <body id="f1d"> <table class="list" style="margin-left: 20px; width: 920px;"> <tr class="odd"> <td><a>Limits to the violation of...</a></td> </tr> </table> </body> </html> """
    response = fake_response_from_string(body)
    node = get_node(spider, spider.itertag, text=body)
    # Use the ``next()`` builtin instead of the Python 2-only generator
    # ``.next()`` method so the test also runs on Python 3.
    parsed_item = next(spider.parse_node(response, node))
    assert parsed_item
    assert parsed_item.record
    record = parsed_item.record
    assert isinstance(record, hepcrawl.items.HEPRecord)
    # Neither author nor date information exists in the listing snippet.
    assert "date" not in record
    assert "authors" not in record
def erratum_open_access_record():
    """Return results generator from the WSP spider."""
    spider = iop_spider.IOPSpider()
    body = """ <ArticleSet> <Article> <Journal> <PublisherName>Institute of Physics</PublisherName> <JournalTitle>J. Phys.: Conf. Ser.</JournalTitle> <Volume>143</Volume> <Issue>3</Issue> </Journal> <FirstPage LZero="save">336</FirstPage> <PublicationType>Published Erratum</PublicationType> </Article> </ArticleSet> """
    response = fake_response_from_string(body)
    article_node = get_node(spider, "Article", response)
    # Point the spider at the bundled test PDFs before parsing.
    spider.pdf_files = get_test_suite_path(
        'responses',
        'iop',
        'pdf',
    )
    item = spider.parse_node(response, article_node)
    assert item
    assert item.record
    return item.record
def test_no_spash_page():
    """Test that when url was found but could not be reached, build the
    record with the available data.
    """
    spider = magic_spider.MagicSpider()
    body = """ <html> <body id="f1d"> <table class="list" style="margin-left: 20px; width: 920px;"> <tr class="odd"> <td> <a href="http://non_reachable_url/">Limits to the violation of...</a> </td> </tr> </table> </body> </html> """
    response = fake_response_from_string(body)
    node = get_node(spider, spider.itertag, text=body)
    parsed_node = spider.parse_node(response, node)
    # Simulate the splash page being unreachable while keeping the metadata
    # the first parse step already collected.
    response.status = 404
    response.meta["title"] = parsed_node.meta["title"]
    response.meta["urls"] = parsed_node.meta["urls"]
    # Use the ``next()`` builtin instead of the Python 2-only generator
    # ``.next()`` method so the test also runs on Python 3.
    parsed_item = next(spider.scrape_for_pdf(response))
    assert parsed_item
    assert parsed_item.record
    record = parsed_item.record
    assert isinstance(record, hepcrawl.items.HEPRecord)
    assert "urls" in record
    assert "title" in record
    assert record["urls"][0]["value"] == "http://non_reachable_url/"
    assert record["title"] == "Limits to the violation of..."
def record_references_only():
    """Parse references."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0"> <back> <ref-list> <title>References</title> <ref id="R5"><label>5.</label><mixed-citation publication-type="journal" id="a"><string-name><given-names>R.V.</given-names> <surname>Krishnan</surname></string-name>, <string-name><given-names>G.</given-names> <surname>Panneerselvam</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Manikandan</surname></string-name> <string-name><given-names>M.P.</given-names> <surname>Antony</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Nagarajan</surname></string-name>, <source>J. Nucl. Radiochem. Sci.</source>, <volume>10</volume>.<issue>1</issue>, <fpage>19</fpage>–<lpage>26</lpage> (<year>2009</year>).</mixed-citation></ref> <ref id="R44"><label>44.</label><mixed-citation publication-type="journal"><string-name><given-names>L.</given-names> <surname>Cronin</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Sojka</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Lefebvre</surname></string-name>, <source>SAE Technical Paper</source>, DOI: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.4271/852086">10.4271/852086</ext-link>, (<year>1985</year>)</mixed-citation></ref> <ref id="R3"><label>3.</label><mixed-citation publication-type="book"><string-name><given-names>T.</given-names> <surname>Aliyev</surname></string-name>, <string-name><given-names>Т.</given-names> <surname>Belyaev</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Gallagher</surname></string-name> <article-title>Simulation in ANSYS flow to the gas purification section of the multicomponent gas mixture through the 
dust cyclone CKBN GP-628</article-title>. <source>Mechanical engineering</source>, <publisher-loc>Moscow</publisher-loc>, №<issue>10</issue>, (<year>2014</year>).</mixed-citation></ref> </ref-list> </back> </article> """
    response = fake_response_from_string(body)
    article_node = get_node(spider, "//article", response)[0]
    # The article carries only a <back>/<ref-list>; the fixture exists to
    # exercise reference extraction in isolation.
    item = spider.parse_node(response, article_node)
    assert item
    assert item.record
    return item.record
def test_addendum_jats():
    """Test parsing when article type is addendum. JATS format."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="addendum" xml:lang="en" dtd-version="3.0"> <front> <article-meta> <article-id pub-id-type="publisher-id">aa14485-10</article-id><article-id pub-id-type="other">2010A%26A...516A..97N</article-id> <title-group> <article-title xml:lang="en">Dielectronic recombination of argon-like ions</article-title> </title-group> <related-article ext-link-type="doi" href="10.1051/0004-6361/201014485"> </related-article> </article-meta> </front> </article> """
    response = fake_response_from_string(body)
    article_node = get_node(spider, "//article", response)[0]
    item = spider.parse_node(response, article_node)
    assert item
    assert item.record
    hep_record = item.record
    # An addendum should link back to the parent article via its DOI.
    assert "related_article_doi" in hep_record
    related = hep_record["related_article_doi"][0]
    assert related["value"] == "10.1051/0004-6361/201014485"
def test_no_dois_jats():
    """Test parsing when no DOI in record. JATS format."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0"> <front> <article-meta> <article-id pub-id-type="publisher-id">aa14485-10</article-id><article-id pub-id-type="other">2010A%26A...516A..97N</article-id> <title-group> <article-title xml:lang="en">Dielectronic recombination of argon-like ions</article-title> </title-group> </article-meta> </front> </article> """
    response = fake_response_from_string(body)
    article = get_node(spider, "//article", response)[0]
    parsed = spider.parse_node(response, article)
    assert parsed
    assert parsed.record
    built = parsed.record
    # No DOI article-id is present, so no "dois" field may be produced.
    assert "dois" not in built
    assert isinstance(built, HEPRecord)
def record_references_only():
    """Parse references."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0"> <back> <ref-list> <title>References</title> <ref id="R5"><label>5.</label><mixed-citation publication-type="journal" id="a"><string-name><given-names>R.V.</given-names> <surname>Krishnan</surname></string-name>, <string-name><given-names>G.</given-names> <surname>Panneerselvam</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Manikandan</surname></string-name> <string-name><given-names>M.P.</given-names> <surname>Antony</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Nagarajan</surname></string-name>, <source>J. Nucl. Radiochem. Sci.</source>, <volume>10</volume>.<issue>1</issue>, <fpage>19</fpage>–<lpage>26</lpage> (<year>2009</year>).</mixed-citation></ref> <ref id="R44"><label>44.</label><mixed-citation publication-type="journal"><string-name><given-names>L.</given-names> <surname>Cronin</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Sojka</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Lefebvre</surname></string-name>, <source>SAE Technical Paper</source>, DOI: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.4271/852086">10.4271/852086</ext-link>, (<year>1985</year>)</mixed-citation></ref> <ref id="R3"><label>3.</label><mixed-citation publication-type="book"><string-name><given-names>T.</given-names> <surname>Aliyev</surname></string-name>, <string-name><given-names>Т.</given-names> <surname>Belyaev</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Gallagher</surname></string-name> <article-title>Simulation in ANSYS flow to the gas purification section of the multicomponent gas mixture through the 
dust cyclone CKBN GP-628</article-title>. <source>Mechanical engineering</source>, <publisher-loc>Moscow</publisher-loc>, №<issue>10</issue>, (<year>2014</year>).</mixed-citation></ref> </ref-list> </back> </article> """
    response = fake_response_from_string(body)
    first_article = get_node(spider, "//article", response)[0]
    parsed = spider.parse_node(response, first_article)
    assert parsed
    assert parsed.record
    # Fixture with only a reference list; front matter is intentionally absent.
    return parsed.record
def test_addendum_jats():
    """Test parsing when article type is addendum. JATS format."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="addendum" xml:lang="en" dtd-version="3.0"> <front> <article-meta> <article-id pub-id-type="publisher-id">aa14485-10</article-id><article-id pub-id-type="other">2010A%26A...516A..97N</article-id> <title-group> <article-title xml:lang="en">Dielectronic recombination of argon-like ions</article-title> </title-group> <related-article ext-link-type="doi" href="10.1051/0004-6361/201014485"> </related-article> </article-meta> </front> </article> """
    response = fake_response_from_string(body)
    node = get_node(spider, "//article", response)[0]
    parsed = spider.parse_node(response, node)
    assert parsed
    assert parsed.record
    built = parsed.record
    # The addendum must reference the original article through its DOI.
    assert "related_article_doi" in built
    assert built["related_article_doi"][0]["value"] == \
        "10.1051/0004-6361/201014485"
def parse_without_splash():
    """Test parsing the XML without splash page links."""
    spider = dnb_spider.DNBSpider()
    body = """ <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"> <ListRecords xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"> <record> <metadata> <slim:record xmlns:slim="http://www.loc.gov/MARC21/slim" type="Bibliographic"> <slim:datafield tag="856" ind1=" " ind2="0"> <slim:subfield code="u">http://d-nb.info/1079912991/34</slim:subfield> </slim:datafield> </slim:record> </metadata> </record> </ListRecords> </OAI-PMH> """
    response = fake_response_from_string(body)
    record_nodes = get_node(spider, "//" + spider.itertag, response)
    # Mock the HEAD request the spider makes to classify the 856$u link
    # as a direct PDF (so no splash-page scraping is attempted).
    with requests_mock.Mocker() as head_mock:
        head_mock.head(
            'http://d-nb.info/1079912991/34',
            headers={'Content-Type': 'application/pdf;charset=base64'},
        )
        item = spider.parse_node(response, record_nodes[0])
    assert item
    assert item.record
    return item.record
def erratum_open_access_record():
    """Return results generator from the WSP spider."""
    spider = iop_spider.IOPSpider()
    body = """ <ArticleSet> <Article> <Journal> <PublisherName>Institute of Physics</PublisherName> <JournalTitle>J. Phys.: Conf. Ser.</JournalTitle> <Volume>143</Volume> <Issue>3</Issue> </Journal> <FirstPage LZero="save">336</FirstPage> <PublicationType>Published Erratum</PublicationType> </Article> </ArticleSet> """
    response = fake_response_from_string(body)
    node = get_node(spider, "Article", response)
    # The spider needs a local PDF directory to resolve full-text files.
    spider.pdf_files = get_test_suite_path('responses', 'iop', 'pdf')
    parsed = spider.parse_node(response, node)
    assert parsed
    assert parsed.record
    return parsed.record
def record():
    """Return the results from the Hindawi spider."""
    spider = hindawi_spider.HindawiSpider()
    response = fake_response_from_file("hindawi/test_1.xml")
    marc_records = get_node(spider, "//marc:record", response)
    item = spider.parse_node(response, marc_records[0])
    assert item
    assert item.record
    return item.record
def parsed_node():
    """Call parse_node and return its request call."""
    spider = mit_spider.MITSpider()
    response = fake_response_from_file('mit/test_list.html')
    tag = spider.itertag
    node = get_node(spider, tag, response, rtype="html")
    # Use the ``next()`` builtin instead of the Python 2-only generator
    # ``.next()`` method so the fixture also runs on Python 3.
    parsed_item = next(spider.parse_node(response, node))
    assert parsed_item
    return parsed_item
def record():
    """Return the results from the Hindawi spider."""
    spider = hindawi_spider.HindawiSpider()
    response = fake_response_from_file("hindawi/test_1.xml")
    nodes = get_node(spider, "//marc:record", response)
    first_node = nodes[0]
    parsed = spider.parse_node(response, first_node)
    assert parsed
    assert parsed.record
    return parsed.record
def record():
    """Return results generator from the WSP spider."""
    spider = iop_spider.IOPSpider()
    response = fake_response_from_file('iop/xml/test_standard.xml')
    article_node = get_node(spider, "Article", response)
    # Point the spider at the bundled test PDF directory.
    spider.pdf_files = TEST_PDF_DIR
    item = spider.parse_node(response, article_node)
    assert item
    assert item.record
    return item.record
def test_no_valid_article():
    """Test parsing when filtering out non-interesting article types."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="lecture" xml:lang="en" dtd-version="3.0"> </article> """
    response = fake_response_from_string(body)
    article_node = get_node(spider, "//article", response)[0]
    # article-type="lecture" is not harvested, so parse_node yields nothing.
    result = spider.parse_node(response, article_node)
    assert result is None
def record():
    """Return results generator from the WSP spider."""
    spider = iop_spider.IOPSpider()
    response = fake_response_from_file('iop/xml/test_standard.xml')
    node = get_node(spider, "Article", response)
    spider.pdf_files = TEST_PDF_DIR
    parsed = spider.parse_node(response, node)
    assert parsed
    assert parsed.record
    return parsed.record
def test_no_valid_article():
    """Test parsing when filtering out non-interesting article types."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="lecture" xml:lang="en" dtd-version="3.0"> </article> """
    response = fake_response_from_string(body)
    lecture_node = get_node(spider, "//article", response)[0]
    # Lectures are filtered out entirely: no item may be built.
    assert spider.parse_node(response, lecture_node) is None
def non_thesis():
    """Return a heprecord for a Master's thesis (should be None as we don't
    want them)."""
    spider = phenix_spider.PhenixSpider()
    body = """ <ul> <li><b>M.Sc. Author</b>: "This is an Master's thesis, not a PhD", M.Sc. thesis at Master Science University, 2016, <br><br> </ul> """
    response = fake_response_from_string(body)
    listing_node = get_node(spider, '//li', text=body)
    # M.Sc. theses are skipped by the spider, so this is expected to be None.
    return spider.parse_node(response, listing_node)
def record_rich(package_rich):
    """Return results from the EDP spider with 'rich' format.

    This is not an open access journal, so no splash scraping.
    """
    spider = edp_spider.EDPSpider()
    xml_path = package_rich.url.replace("file://", "")
    response = fake_response_from_file(xml_path)
    # Flag the response so the spider takes the 'rich' parsing branch.
    response.meta["rich"] = True
    article_node = get_node(spider, "//EDPSArticle", response)[0]
    item = spider.parse_node(response, article_node)
    assert item
    assert item.record
    return item.record
def record_rich(package_rich):
    """Return results from the EDP spider with 'rich' format.

    This is not an open access journal, so no splash scraping.
    """
    spider = edp_spider.EDPSpider()
    local_path = package_rich.url.replace("file://", "")
    fake_resp = fake_response_from_file(local_path)
    fake_resp.meta["rich"] = True
    first_article = get_node(spider, "//EDPSArticle", fake_resp)[0]
    parsed = spider.parse_node(fake_resp, first_article)
    assert parsed
    assert parsed.record
    return parsed.record
def test_collections_review():
    """Test collections when doctype is review. JATS format."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="review-article" xml:lang="en" dtd-version="3.0"> </article> """
    response = fake_response_from_string(body)
    review_node = get_node(spider, "//article", response)[0]
    item = spider.parse_node(response, review_node)
    assert item
    assert item.record
    built = item.record
    # Review articles get the extra 'Review' collection next to 'HEP'.
    assert "collections" in built
    assert built["collections"] == [{'primary': 'HEP'}, {'primary': 'Review'}]
def test_collections_review():
    """Test collections when doctype is review. JATS format."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="review-article" xml:lang="en" dtd-version="3.0"> </article> """
    response = fake_response_from_string(body)
    node = get_node(spider, "//article", response)[0]
    parsed = spider.parse_node(response, node)
    assert parsed
    assert parsed.record
    record = parsed.record
    assert "collections" in record
    expected_collections = [{'primary': 'HEP'}, {'primary': 'Review'}]
    assert record["collections"] == expected_collections
def record_jats(package_jats, scrape_pos_page_body):
    """Return results from the EDP spider with JATS format.

    This is an open access journal, so we can scrape the splash page.
    """
    spider = edp_spider.EDPSpider()
    xml_path = package_jats.url.replace("file://", "")
    fake_resp = fake_response_from_file(xml_path)
    article_node = get_node(spider, "//article", fake_resp)[0]
    # parse_node returns a Request for the splash page; answer it with the
    # canned page body and invoke its callback by hand.
    splash_request = spider.parse_node(fake_resp, article_node)
    splash_response = HtmlResponse(
        url=splash_request.url,
        request=splash_request,
        body=scrape_pos_page_body,
        encoding='utf-8',
    )
    item = splash_request.callback(splash_response)
    assert item
    assert item.record
    return item.record
def test_no_dois_rich():
    """Test parsing when no DOI in record. 'Rich' format."""
    spider = edp_spider.EDPSpider()
    body = """ <EDPSArticle> <ArticleID Type="Article"> <EDPSRef>ds1691</EDPSRef> </ArticleID> </EDPSArticle> """
    response = fake_response_from_string(body)
    # Flag the response so the spider takes the 'rich' parsing branch.
    response.meta["rich"] = True
    article_node = get_node(spider, "//EDPSArticle", response)[0]
    item = spider.parse_node(response, article_node)
    assert item
    assert item.record
    built = item.record
    assert isinstance(built, HEPRecord)
    assert "dois" not in built
def test_no_dois_rich():
    """Test parsing when no DOI in record. 'Rich' format."""
    spider = edp_spider.EDPSpider()
    body = """ <EDPSArticle> <ArticleID Type="Article"> <EDPSRef>ds1691</EDPSRef> </ArticleID> </EDPSArticle> """
    response = fake_response_from_string(body)
    response.meta["rich"] = True
    node = get_node(spider, "//EDPSArticle", response)[0]
    parsed = spider.parse_node(response, node)
    assert parsed
    assert parsed.record
    record = parsed.record
    # No DOI present in the 'rich' fixture: the field must be absent.
    assert "dois" not in record
    assert isinstance(record, HEPRecord)
def record_jats(package_jats, scrape_pos_page_body):
    """Return results from the EDP spider with JATS format.

    This is an open access journal, so we can scrape the splash page.
    """
    spider = edp_spider.EDPSpider()
    local_path = package_jats.url.replace("file://", "")
    package_response = fake_response_from_file(local_path)
    article = get_node(spider, "//article", package_response)[0]
    request = spider.parse_node(package_response, article)
    # Feed the canned splash page into the request's callback directly.
    splash = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_page_body,
        encoding='utf-8',
    )
    parsed = request.callback(splash)
    assert parsed
    assert parsed.record
    return parsed.record
def test_aff_with_email():
    """Test popping email from the affiliation string. JATS format."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0"> <front> <article-meta> <contrib-group> <contrib contrib-type="author"> <name> <surname>Gorczyca</surname> <given-names>T. W.</given-names> </name> <xref ref-type="aff" rid="AFF1">1</xref> </contrib> <aff id="AFF1"> <label>1</label> <addr-line>Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA e-mail: [email protected] </addr-line> </aff> <contrib-group> </article-meta> </front> </article> """
    response = fake_response_from_string(body)
    article_node = get_node(spider, "//article", response)[0]
    item = spider.parse_node(response, article_node)
    assert item
    assert item.record
    record = item.record
    affiliation = "Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA"
    first_author = record['authors'][0]
    # The e-mail embedded in <addr-line> is stripped from the affiliation
    # text and must not leak into the author's email field either.
    assert 'affiliations' in first_author
    assert first_author['affiliations'][0]['value'] == affiliation
    assert "e-mail" not in first_author['affiliations'][0]['value']
    assert first_author['email'] is None
def test_aff_with_email():
    """Test popping email from the affiliation string. JATS format."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0"> <front> <article-meta> <contrib-group> <contrib contrib-type="author"> <name> <surname>Gorczyca</surname> <given-names>T. W.</given-names> </name> <xref ref-type="aff" rid="AFF1">1</xref> </contrib> <aff id="AFF1"> <label>1</label> <addr-line>Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA e-mail: [email protected] </addr-line> </aff> <contrib-group> </article-meta> </front> </article> """
    response = fake_response_from_string(body)
    node = get_node(spider, "//article", response)[0]
    parsed = spider.parse_node(response, node)
    assert parsed
    assert parsed.record
    built = parsed.record
    affiliation = "Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA"
    author = built['authors'][0]
    aff_value = author['affiliations'][0]['value']
    assert 'affiliations' in author
    assert aff_value == affiliation
    assert "e-mail" not in aff_value
    assert author['email'] is None
def test_not_published_record():
    """Not-published paper should result in nothing."""
    spider = iop_spider.IOPSpider()
    body = """ <ArticleSet> <Article> <Journal> <PubDate PubStatus="aheadofprint"> <Year>2015</Year> <Month>03</Month> </PubDate> </Journal> </Article> </ArticleSet> """
    response = fake_response_from_string(body)
    article_node = get_node(spider, "Article", response)
    spider.pdf_files = get_test_suite_path('responses', 'iop', 'pdf')
    # PubStatus "aheadofprint" means not yet published: nothing is built.
    assert spider.parse_node(response, article_node) is None
def test_author_with_email():
    """Test getting author email. JATS format."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="addendum" xml:lang="en" dtd-version="3.0"> <front> <article-meta> <contrib-group content-type="authors"> <contrib contrib-type="author" corresp="yes"><name><surname>Sname</surname><given-names>Fname</given-names></name><email>[email protected]</email><xref ref-type="aff" rid="AFF1"/><xref ref-type="corresp" rid="FN1">a</xref></contrib> </contrib-group> </article-meta> </front> </article> """
    response = fake_response_from_string(body)
    article_node = get_node(spider, "//article", response)[0]
    item = spider.parse_node(response, article_node)
    assert item
    assert item.record
    first_author = item.record['authors'][0]
    # The <email> element on the contrib is picked up for the author.
    assert 'email' in first_author
    assert first_author['email'] == "*****@*****.**"
def test_author_with_email():
    """Test getting author email. JATS format."""
    spider = edp_spider.EDPSpider()
    body = """ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="addendum" xml:lang="en" dtd-version="3.0"> <front> <article-meta> <contrib-group content-type="authors"> <contrib contrib-type="author" corresp="yes"><name><surname>Sname</surname><given-names>Fname</given-names></name><email>[email protected]</email><xref ref-type="aff" rid="AFF1"/><xref ref-type="corresp" rid="FN1">a</xref></contrib> </contrib-group> </article-meta> </front> </article> """
    response = fake_response_from_string(body)
    node = get_node(spider, "//article", response)[0]
    parsed = spider.parse_node(response, node)
    assert parsed
    assert parsed.record
    author = parsed.record['authors'][0]
    assert 'email' in author
    assert author['email'] == "*****@*****.**"
def test_not_published_record():
    """Not-published paper should result in nothing."""
    spider = iop_spider.IOPSpider()
    body = """ <ArticleSet> <Article> <Journal> <PubDate PubStatus="aheadofprint"> <Year>2015</Year> <Month>03</Month> </PubDate> </Journal> </Article> </ArticleSet> """
    response = fake_response_from_string(body)
    node = get_node(spider, "Article", response)
    spider.pdf_files = get_test_suite_path(
        'responses',
        'iop',
        'pdf',
    )
    result = spider.parse_node(response, node)
    # An ahead-of-print article must not produce any record.
    assert result is None