def parse_article_xml(self, document):
    """
    Given article XML, parse it and return an object representation.

    Populates DOI-derived attributes (doi_id, doi_url, lens_url,
    tweet_url), publication dates, title, type, authors, related
    articles, PoA status and display channel on self.

    :param document: article XML, passed to parser.parse_document
    :returns: True on success, False if parsing raised any exception
    """
    try:
        soup = parser.parse_document(document)
        self.doi = parser.doi(soup)
        if self.doi:
            self.doi_id = self.get_doi_id(self.doi)
            self.doi_url = self.get_doi_url(self.doi)
            self.lens_url = self.get_lens_url(self.doi)
            self.tweet_url = self.get_tweet_url(self.doi)

        self.pub_date = parser.pub_date(soup)
        self.pub_date_timestamp = parser.pub_date_timestamp(soup)
        self.article_title = parser.title(soup)
        self.article_type = parser.article_type(soup)
        self.authors = parser.authors(soup)
        # Call the get_authors_string() helper; the previous code called
        # self.authors_string(...) and then rebound that same attribute
        # name to the result, clobbering the method on the instance.
        self.authors_string = self.get_authors_string(self.authors)
        self.related_articles = parser.related_article(soup)
        self.is_poa = parser.is_poa(soup)
        #self.subject_area = self.parse_subject_area(soup)
        self.display_channel = parser.display_channel(soup)
        return True
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed; any parse failure still returns False.
        return False
def profile_article(self, document):
    """
    Temporary, profile the article by folder names in test data set
    In real code we still want this to return the same values
    """
    # Temporary setting of version values from directory names
    soup = self.article_soup(self.article_xml_file())

    # elife id / doi id / manuscript id: last dot-separated token of the DOI
    fid = parser.doi(soup).split('.')[-1]

    # article status: PoA articles are 'poa', everything else is 'vor'
    status = 'poa' if parser.is_poa(soup) is True else 'vor'

    # version, derived from the document name
    version = self.version_number(document)

    # volume, from the parsed XML
    volume = parser.volume(soup)

    return (fid, status, version, volume)
def parse_article_xml(self, document):
    """
    Given article XML, parse it and return an object representation.

    Populates DOI-derived attributes (doi_id, doi_url, lens_url,
    tweet_url), publication dates, title, type, authors, related
    articles, PoA status and display channel on self.

    :param document: article XML, passed to parser.parse_document
    :returns: True on success, False if parsing raised any exception
    """
    try:
        soup = parser.parse_document(document)
        self.doi = parser.doi(soup)
        if self.doi:
            self.doi_id = self.get_doi_id(self.doi)
            self.doi_url = self.get_doi_url(self.doi)
            self.lens_url = self.get_lens_url(self.doi)
            self.tweet_url = self.get_tweet_url(self.doi)

        self.pub_date = parser.pub_date(soup)
        self.pub_date_timestamp = parser.pub_date_timestamp(soup)
        self.article_title = parser.title(soup)
        self.article_type = parser.article_type(soup)
        self.authors = parser.authors(soup)
        self.authors_string = self.get_authors_string(self.authors)
        self.related_articles = parser.related_article(soup)
        self.is_poa = parser.is_poa(soup)
        #self.subject_area = self.parse_subject_area(soup)
        self.display_channel = parser.display_channel(soup)
        return True
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed; any parse failure still returns False.
        return False
def test_basic_fetching_of_common_attributes(self):
    "basic extraction of common values from a JATS-NLM XML article"
    # Expected values hoisted into locals so each assertion reads on one line.
    expected_title = (
        u"Bacterial regulation of colony development in the closest living\n relatives of animals"
    )
    expected_doi = u"10.7554/eLife.00013"
    expected_keywords = [
        u"\nSalpingoeca rosetta\n",
        u"Algoriphagus",
        u"bacterial sulfonolipid",
        u"multicellular development",
    ]
    self.assertEqual(parser.title(self.soup), expected_title)
    self.assertEqual(parser.doi(self.soup), expected_doi)
    self.assertEqual(parser.keywords(self.soup), expected_keywords)
def doi(item):
    """Return the DOI for *item*, delegating to parseJATS.doi."""
    result = parseJATS.doi(item)
    return result
def build_article_from_xml(article_xml_filename, detail="brief"):
    """
    Parse JATS XML with elifetools parser, and populate an
    eLifePOA article object.

    Basic data crossref needs: article_id, doi, title, contributors
    with names set.

    detail="brief" is normally enough; detail="full" will populate all
    the contributor affiliations that are linked by xref tags.

    :param article_xml_filename: path to (or content of) the JATS XML,
        passed to parser.parse_document
    :param detail: "brief" or "full" contributor detail level
    :returns: tuple of (article, error_count)
    """
    error_count = 0

    soup = parser.parse_document(article_xml_filename)

    # Get DOI
    doi = parser.doi(soup)

    # Create the article object
    article = eLifePOA(doi, title=None)

    # Related articles
    article.related_articles = build_related_articles(parser.related_article(soup))

    # Get publisher_id and set object manuscript value
    article.manuscript = parser.publisher_id(soup)

    # Set the articleType only when the XML provides one
    article_type = parser.article_type(soup)
    if article_type:
        article.articleType = article_type

    # title
    article.title = parser.full_title(soup)

    # abstract
    article.abstract = clean_abstract(parser.full_abstract(soup))

    # digest
    article.digest = clean_abstract(parser.full_digest(soup))

    # elocation-id
    article.elocation_id = parser.elocation_id(soup)

    # contributors: byline authors first, then non-byline group members.
    # A list comprehension replaces filter(lambda ...), which returns a
    # lazy iterator on Python 3; a concrete list is safe on both versions.
    all_contributors = parser.contributors(soup, detail)
    author_contributors = [
        con for con in all_contributors
        if con.get('type') in ['author', 'on-behalf-of']
    ]
    contributors = build_contributors(author_contributors, "author")
    authors = parser.authors_non_byline(soup, detail)
    contributors_non_byline = build_contributors(authors, "author non-byline")
    article.contributors = contributors + contributors_non_byline

    # license href; local renamed so it no longer shadows the builtin `license`
    article_license = eLifeLicense()
    article_license.href = parser.license_url(soup)
    article.license = article_license

    # article_category
    article.article_categories = parser.category(soup)

    # keywords
    article.author_keywords = parser.keywords(soup)

    # research organisms
    article.research_organisms = parser.research_organism(soup)

    # funding awards
    article.funding_awards = build_funding(parser.full_award_groups(soup))

    # references or citations
    article.ref_list = build_ref_list(parser.refs(soup))

    # components with component DOI
    article.component_list = build_components(parser.components(soup))

    # History dates, added only when present in the XML
    for date_type in ["received", "accepted"]:
        history_date = parser.history_date(soup, date_type)
        if history_date:
            article.add_date(eLifeDate(date_type, history_date))

    # Pub date
    pub_date = parser.pub_date(soup)
    if pub_date:
        article.add_date(eLifeDate("pub", pub_date))

    # Set the volume if present
    volume = parser.volume(soup)
    if volume:
        article.volume = volume

    article.is_poa = parser.is_poa(soup)

    return article, error_count