def convert_xml(self, doi_id, xml_file, filenames, new_filenames):

    # Register namespaces
    xmlio.register_xmlns()

    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)
    soup = self.article_soup(xml_file)

    if parser.is_poa(soup):
        # Capitalise subject group values in article categories
        root = self.subject_group_convert_in_xml(root)

        pub_date = None
        if parser.pub_date(soup) is None:
            # add the published date to the XML
            pub_date = self.get_pub_date_if_missing(doi_id)
            root = self.add_pub_date_to_xml(doi_id, pub_date, root)
        else:
            pub_date = parser.pub_date(soup)

        if parser.volume(soup) is None:
            # Get the pub-date year to calculate the volume
            year = pub_date[0]
            volume = year - 2011
            self.add_volume_to_xml(doi_id, volume, root)

        # set the article-id, to overwrite the v2, v3 value if present
        root = self.set_article_id_xml(doi_id, root)

        # if pdf file then add self-uri tag
        if parser.self_uri(soup) is not None and len(parser.self_uri(soup)) == 0:
            for filename in new_filenames:
                if filename.endswith('.pdf'):
                    root = self.add_self_uri_to_xml(doi_id, filename, root)

        # if ds.zip file is there, then add it to the xml
        poa_ds_zip_file = None
        for f in new_filenames:
            if f.endswith('.zip'):
                poa_ds_zip_file = f
        if poa_ds_zip_file:
            root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)

    # Start the file output
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)

    # Remove extra whitespace here for PoA articles to clean up and one VoR file too
    reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')

    f = open(xml_file, 'wb')
    f.write(reparsed_string)
    f.close()
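# A minimal round-trip sketch of the xmlio calls used in convert_xml above.
# Assumptions: xmlio is the elifetools xmlio module (as suggested by the
# surrounding code), and 'elife_poa_e12345.xml' is a hypothetical local JATS file.
from elifetools import xmlio

xmlio.register_xmlns()
root, doctype_dict = xmlio.parse('elife_poa_e12345.xml', return_doctype_dict=True)
# re-serialise without modifying the tree, keeping the original doctype
reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)
print(reparsed_string[:80])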
def parse_article_xml(self, document):
    """
    Given article XML, parse it and return an object representation
    """

    try:
        soup = parser.parse_document(document)
        self.doi = parser.doi(soup)
        if self.doi:
            self.doi_id = self.get_doi_id(self.doi)
            self.doi_url = self.get_doi_url(self.doi)
            self.lens_url = self.get_lens_url(self.doi)
            self.tweet_url = self.get_tweet_url(self.doi)

        self.pub_date = parser.pub_date(soup)
        self.pub_date_timestamp = parser.pub_date_timestamp(soup)
        self.article_title = parser.title(soup)
        self.article_type = parser.article_type(soup)

        self.authors = parser.authors(soup)
        self.authors_string = self.get_authors_string(self.authors)

        self.related_articles = parser.related_article(soup)
        self.is_poa = parser.is_poa(soup)
        #self.subject_area = self.parse_subject_area(soup)
        self.display_channel = parser.display_channel(soup)

        return True
    except:
        return False
def convert_xml(self, doi_id, xml_file, filenames, new_filenames):

    # Register namespaces
    xmlio.register_xmlns()

    root = xmlio.parse(xml_file)
    soup = self.article_soup(xml_file)

    if parser.is_poa(soup):
        # Capitalise subject group values in article categories
        root = self.subject_group_convert_in_xml(root)

        if parser.pub_date(soup) is None:
            # add the published date to the XML
            root = self.add_pub_date_to_xml(doi_id, root)

        # set the article-id, to overwrite the v2, v3 value if present
        root = self.set_article_id_xml(doi_id, root)

        # if ds.zip file is there, then add it to the xml
        poa_ds_zip_file = None
        for f in new_filenames:
            if f.endswith('.zip'):
                poa_ds_zip_file = f
        if poa_ds_zip_file:
            root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)

    # Start the file output
    reparsed_string = xmlio.output(root)

    # Remove extra whitespace here for PoA articles to clean up and one VoR file too
    reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')

    f = open(xml_file, 'wb')
    f.write(reparsed_string)
    f.close()
def parse_jats_pub_date(soup):
    "extract the pub date from the soup"
    pub_date = parser.pub_date(soup)
    return pub_date
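# A minimal usage sketch for parse_jats_pub_date. Assumptions: parser is the
# elifetools parser module (as stated in build_article_from_xml below), and
# 'elife_poa_e12345.xml' is a hypothetical local JATS file. The pub-date year
# is at index 0 of the returned value, as used in convert_xml above.
from elifetools import parser

soup = parser.parse_document('elife_poa_e12345.xml')
pub_date = parse_jats_pub_date(soup)
if pub_date:
    print(pub_date[0])  # pub-date year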
def build_article_from_xml(article_xml_filename, detail="brief"): """ Parse JATS XML with elifetools parser, and populate an eLifePOA article object Basic data crossref needs: article_id, doi, title, contributors with names set detail="brief" is normally enough, detail="full" will populate all the contributor affiliations that are linked by xref tags """ error_count = 0 soup = parser.parse_document(article_xml_filename) # Get DOI doi = parser.doi(soup) # Create the article object article = eLifePOA(doi, title=None) # Related articles article.related_articles = build_related_articles(parser.related_article(soup)) # Get publisher_id and set object manuscript value publisher_id = parser.publisher_id(soup) article.manuscript = publisher_id # Set the articleType article_type = parser.article_type(soup) if article_type: article.articleType = article_type # title article.title = parser.full_title(soup) #print article.title # abstract article.abstract = clean_abstract(parser.full_abstract(soup)) # digest article.digest = clean_abstract(parser.full_digest(soup)) # elocation-id article.elocation_id = parser.elocation_id(soup) # contributors all_contributors = parser.contributors(soup, detail) author_contributors = filter(lambda con: con.get('type') in ['author', 'on-behalf-of'], all_contributors) contrib_type = "author" contributors = build_contributors(author_contributors, contrib_type) contrib_type = "author non-byline" authors = parser.authors_non_byline(soup, detail) contributors_non_byline = build_contributors(authors, contrib_type) article.contributors = contributors + contributors_non_byline # license href license = eLifeLicense() license.href = parser.license_url(soup) article.license = license # article_category article.article_categories = parser.category(soup) # keywords article.author_keywords = parser.keywords(soup) # research organisms article.research_organisms = parser.research_organism(soup) # funding awards article.funding_awards = build_funding(parser.full_award_groups(soup)) # references or citations article.ref_list = build_ref_list(parser.refs(soup)) # components with component DOI article.component_list = build_components(parser.components(soup)) # History dates date_types = ["received", "accepted"] for date_type in date_types: history_date = parser.history_date(soup, date_type) if history_date: date_instance = eLifeDate(date_type, history_date) article.add_date(date_instance) # Pub date pub_date = parser.pub_date(soup) if pub_date: date_instance = eLifeDate("pub", pub_date) article.add_date(date_instance) # Set the volume if present volume = parser.volume(soup) if volume: article.volume = volume article.is_poa = parser.is_poa(soup) return article, error_count