def parse_article_xml(self, document):
    """
    Given article XML, parse it and return an object representation
    """

    try:
        soup = parser.parse_document(document)
        self.doi = parser.doi(soup)
        if self.doi:
            self.doi_id = self.get_doi_id(self.doi)
            self.doi_url = self.get_doi_url(self.doi)
            self.lens_url = self.get_lens_url(self.doi)
            self.tweet_url = self.get_tweet_url(self.doi)

        self.pub_date = parser.pub_date(soup)
        self.pub_date_timestamp = parser.pub_date_timestamp(soup)

        self.article_title = parser.title(soup)
        self.article_type = parser.article_type(soup)

        self.authors = parser.authors(soup)
        self.authors_string = self.get_authors_string(self.authors)

        self.related_articles = parser.related_article(soup)

        self.is_poa = parser.is_poa(soup)

        #self.subject_area = self.parse_subject_area(soup)

        self.display_channel = parser.display_channel(soup)

        return True
    except Exception:
        return False
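A short usage sketch for the method above. This assumes the method is defined on an article-style class (here a hypothetical `Article`) and that "elife00013.xml" is a JATS file on disk; note the method returns False rather than raising when parsing fails:

# hypothetical container class holding the parsed fields
article = Article()
if article.parse_article_xml("elife00013.xml"):
    print(article.doi)             # e.g. "10.7554/eLife.00013"
    print(article.authors_string)
else:
    print("unable to parse article XML")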
def repackage_archive_zip_to_pmc_zip(self, doi_id):
    "repackage the zip file in the TMP_DIR to a PMC zip format"
    # unzip contents
    zip_input_dir = os.path.join(self.get_tmp_dir(), self.TMP_DIR)
    zip_extracted_dir = os.path.join(self.get_tmp_dir(), self.JUNK_DIR)
    zip_renamed_files_dir = os.path.join(self.get_tmp_dir(), self.RENAME_DIR)
    pmc_zip_output_dir = os.path.join(self.get_tmp_dir(), self.INPUT_DIR)
    archive_zip_name = glob.glob(zip_input_dir + "/*.zip")[0]
    with zipfile.ZipFile(archive_zip_name, 'r') as myzip:
        myzip.extractall(zip_extracted_dir)

    # rename the files and profile the files
    file_name_map = article_processing.rename_files_remove_version_number(
        files_dir=zip_extracted_dir,
        output_dir=zip_renamed_files_dir
    )
    if self.logger:
        self.logger.info("FTPArticle running %s workflow for article %s, file_name_map"
                         % (self.workflow, self.doi_id))
        self.logger.info(file_name_map)

    # convert the XML
    article_xml_file = glob.glob(zip_renamed_files_dir + "/*.xml")[0]
    article_processing.convert_xml(xml_file=article_xml_file,
                                   file_name_map=file_name_map)

    # rezip the files into PMC zip format
    soup = parser.parse_document(article_xml_file)
    volume = parser.volume(soup)
    pmc_zip_file_name = article_processing.new_pmc_zip_filename(self.journal, volume, doi_id)
    with zipfile.ZipFile(os.path.join(pmc_zip_output_dir, pmc_zip_file_name), 'w',
                         zipfile.ZIP_DEFLATED, allowZip64=True) as new_zipfile:
        dirfiles = article_processing.file_list(zip_renamed_files_dir)
        for df in dirfiles:
            filename = df.split(os.sep)[-1]
            new_zipfile.write(df, filename)
    return True
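One detail worth noting in the rezip step above: `new_zipfile.write(df, filename)` writes each file into the archive under its base name, so the resulting PMC zip is flat regardless of how the renamed files were laid out on disk. A standalone sketch of that pattern, with hypothetical paths in place of the helper calls:

import os
import zipfile

# hypothetical file list; in the method above this comes from
# article_processing.file_list(zip_renamed_files_dir)
dirfiles = ["/tmp/rename/elife-00013.xml", "/tmp/rename/elife-00013.pdf"]

with zipfile.ZipFile("/tmp/input/elife-05-00013.zip", "w",
                     zipfile.ZIP_DEFLATED, allowZip64=True) as new_zipfile:
    for df in dirfiles:
        # the second argument is the archive name: base name only, no directories
        new_zipfile.write(df, df.split(os.sep)[-1])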
def to_soup(doc):
    if isinstance(doc, basestring):
        if os.path.exists(doc):
            return parseJATS.parse_document(doc)
        return parseJATS.parse_xml(doc)
    # assume it's a file-like object and attempt to .read() its contents
    return parseJATS.parse_xml(doc.read())
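Because `to_soup` dispatches on its argument type, it can be called three ways. A quick sketch, assuming "elife-kitchen-sink.xml" exists locally (the `basestring` check marks this as Python 2 code; under Python 3 the equivalent test would be `isinstance(doc, str)`):

# from a path on disk
soup = to_soup("elife-kitchen-sink.xml")

# from a raw XML string (not an existing path, so it falls through to parse_xml)
soup = to_soup("<article><front/></article>")

# from an open file-like object, whose contents are read and parsed
with open("elife-kitchen-sink.xml") as doc:
    soup = to_soup(doc)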
def setUp(self):
    kitchen_sink_xml = sample_xml("elife-kitchen-sink.xml")
    # all of these methods are equivalent:
    # self.soup = bss(open(kitchen_sink_xml, 'r').read())
    # self.soup = parser.parse_xml(open(kitchen_sink_xml, 'r'))
    self.soup = parser.parse_document(kitchen_sink_xml)
def transform_xml(xml_asset_path, identifier):
    "modify the XML"
    # remove history tags from XML for certain article types
    root = parse.parse_article_xml(xml_asset_path)
    soup = parser.parse_document(xml_asset_path)
    root = transform_xml_history_tags(root, soup, identifier)
    write_xml_file(root, xml_asset_path, identifier)
def test_quickly(self):
    struct = [
        (parser.doi, u"10.7554/eLife.00013"),
        (parser.journal_id, u"eLife"),
        (parser.journal_title, u"eLife"),
        (parser.journal_issn, u"2050-084X"),
        (parser.publisher, u"eLife Sciences Publications, Ltd"),
    ]
    for func, expected in struct:
        soup = parser.parse_document(self.kitchen_sink_xml)
        got = func(soup)
        try:
            self.assertEqual(got, expected)
        except AssertionError:
            print("failed on", func, "expected", expected, "got", got)
            raise
    soup = parser.parse_document(self.kitchen_sink_xml)
    self.assertEqual(parser.journal_issn(soup, pub_format="electronic"), u"2050-084X")
def article_wrapper(path, version=None):
    soup = parser.parse_document(path)
    # return a wrapper around the parser module that injects the soup when a function is called
    return ParserWrapper(soup, path, version)
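`ParserWrapper` is not shown in this listing; judging by the comment, it proxies calls through to the parser functions and supplies the stored soup automatically. A speculative usage sketch under that assumption:

wrapped = article_wrapper("elife-kitchen-sink.xml")
# assuming attribute access is forwarded to the matching parser function,
# with the soup parsed above injected as its first argument:
doi = wrapped.doi()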
def test_sub_article(self, filename, expected_len):
    soup = parser.parse_document(sample_xml(filename))
    self.assertEqual(len(raw_parser.sub_article(soup)), expected_len)

def test_article_body(self, filename, expected_type):
    soup = parser.parse_document(sample_xml(filename))
    self.assertEqual(type(raw_parser.article_body(soup)), expected_type)

def test_abstract(self, filename, expected_len):
    soup = parser.parse_document(sample_xml(filename))
    self.assertEqual(len(raw_parser.abstract(soup)), expected_len)

def test_decision_letter(self, filename, expected_type):
    soup = parser.parse_document(sample_xml(filename))
    self.assertEqual(type(raw_parser.decision_letter(soup)), expected_type)

def test_author_response(self, filename, expected_type):
    soup = parser.parse_document(sample_xml(filename))
    self.assertEqual(type(raw_parser.author_response(soup)), expected_type)

def parse_jats_file(jats_file_name):
    "parse the jats file into a BeautifulSoup object"
    return parser.parse_document(jats_file_name)

def test_funding_group(self, filename, expected_len):
    soup = parser.parse_document(sample_xml(filename))
    self.assertEqual(len(raw_parser.funding_group(soup)), expected_len)

def test_fn_group(self, filename, content_type, expected_len):
    soup = parser.parse_document(sample_xml(filename))
    self.assertEqual(len(raw_parser.fn_group(soup, content_type)), expected_len)

def test_pub_id_uri(self, filename, ext_link_type, expected_len):
    soup = parser.parse_document(sample_xml(filename))
    self.assertEqual(len(raw_parser.ext_link(soup, ext_link_type)), expected_len)

def test_disp_formula(self, filename, expected_len):
    soup = parser.parse_document(sample_xml(filename))
    self.assertEqual(len(raw_parser.disp_formula(soup)), expected_len)

def test_table_wrap_foot(self, filename, expected_len):
    soup = parser.parse_document(sample_xml(filename))
    self.assertEqual(len(raw_parser.table_wrap_foot(soup)), expected_len)

def article_soup(self, xml_file):
    return parser.parse_document(xml_file)

def test_principal_award_recipient(self, filename, expected_len):
    soup = parser.parse_document(sample_xml(filename))
    self.assertEqual(len(raw_parser.principal_award_recipient(soup)), expected_len)

def setUp(self):
    self.kitchen_sink_xml = sample_xml("elife-kitchen-sink.xml")
    self.xml = sample_xml("elife00013.xml")
    self.soup = parser.parse_document(self.kitchen_sink_xml)
def build_article_from_xml(article_xml_filename, detail="brief"):
    """
    Parse JATS XML with elifetools parser, and populate an
    eLifePOA article object
    Basic data crossref needs: article_id, doi, title, contributors with names set
    detail="brief" is normally enough,
    detail="full" will populate all the contributor affiliations that are
    linked by xref tags
    """

    error_count = 0

    soup = parser.parse_document(article_xml_filename)

    # Get DOI
    doi = parser.doi(soup)

    # Create the article object
    article = eLifePOA(doi, title=None)

    # Related articles
    article.related_articles = build_related_articles(parser.related_article(soup))

    # Get publisher_id and set object manuscript value
    publisher_id = parser.publisher_id(soup)
    article.manuscript = publisher_id

    # Set the articleType
    article_type = parser.article_type(soup)
    if article_type:
        article.articleType = article_type

    # title
    article.title = parser.full_title(soup)
    #print article.title

    # abstract
    article.abstract = clean_abstract(parser.full_abstract(soup))

    # digest
    article.digest = clean_abstract(parser.full_digest(soup))

    # elocation-id
    article.elocation_id = parser.elocation_id(soup)

    # contributors
    all_contributors = parser.contributors(soup, detail)
    author_contributors = filter(lambda con: con.get('type')
                                 in ['author', 'on-behalf-of'], all_contributors)
    contrib_type = "author"
    contributors = build_contributors(author_contributors, contrib_type)

    contrib_type = "author non-byline"
    authors = parser.authors_non_byline(soup, detail)
    contributors_non_byline = build_contributors(authors, contrib_type)
    article.contributors = contributors + contributors_non_byline

    # license href
    license = eLifeLicense()
    license.href = parser.license_url(soup)
    article.license = license

    # article_category
    article.article_categories = parser.category(soup)

    # keywords
    article.author_keywords = parser.keywords(soup)

    # research organisms
    article.research_organisms = parser.research_organism(soup)

    # funding awards
    article.funding_awards = build_funding(parser.full_award_groups(soup))

    # references or citations
    article.ref_list = build_ref_list(parser.refs(soup))

    # components with component DOI
    article.component_list = build_components(parser.components(soup))

    # History dates
    date_types = ["received", "accepted"]
    for date_type in date_types:
        history_date = parser.history_date(soup, date_type)
        if history_date:
            date_instance = eLifeDate(date_type, history_date)
            article.add_date(date_instance)

    # Pub date
    pub_date = parser.pub_date(soup)
    if pub_date:
        date_instance = eLifeDate("pub", pub_date)
        article.add_date(date_instance)

    # Set the volume if present
    volume = parser.volume(soup)
    if volume:
        article.volume = volume

    article.is_poa = parser.is_poa(soup)

    return article, error_count
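A usage sketch for `build_article_from_xml`, assuming "elife-00013.xml" (a hypothetical filename) is a JATS file on disk; the function returns the populated article object together with an error count:

article, error_count = build_article_from_xml("elife-00013.xml", detail="full")
print(article.doi, article.title)
print(len(article.contributors), "contributors")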
def test_quickly2(self):
    soup = parser.parse_document(self.xml)
    self.assertEqual(raw_parser.article_type(soup), "research-article")
def test_mixed_citations(self):
    self.soup = parser.parse_document(sample_xml("elife-kitchen-sink.xml"))
    self.assertEqual(1, len(raw_parser.mixed_citations(self.soup)))