Example #1
    def parse_article_xml(self, document):
        """
        Given article XML, parse
        it and return an object representation
        """

        try:
            soup = parser.parse_document(document)
            self.doi = parser.doi(soup)
            if self.doi:
                self.doi_id = self.get_doi_id(self.doi)
                self.doi_url = self.get_doi_url(self.doi)
                self.lens_url = self.get_lens_url(self.doi)
                self.tweet_url = self.get_tweet_url(self.doi)

            self.pub_date = parser.pub_date(soup)
            self.pub_date_timestamp = parser.pub_date_timestamp(soup)

            self.article_title = parser.title(soup)
            self.article_type = parser.article_type(soup)

            self.authors = parser.authors(soup)
            self.authors_string = self.get_authors_string(self.authors)

            self.related_articles = parser.related_article(soup)

            self.is_poa = parser.is_poa(soup)

            #self.subject_area = self.parse_subject_area(soup)

            self.display_channel = parser.display_channel(soup)

            return True
        except Exception:
            return False
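A minimal usage sketch of the method above, assuming a hypothetical Article class that defines it together with the get_* helpers (not shown in this example):

    article = Article()  # hypothetical container class
    if article.parse_article_xml("elife-kitchen-sink.xml"):
        print(article.doi, article.article_title)
    else:
        print("parsing failed")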
Example #2
 def repackage_archive_zip_to_pmc_zip(self, doi_id):
     "repackage the zip file in the TMP_DIR to a PMC zip format"
     # unzip contents
     zip_input_dir = os.path.join(self.get_tmp_dir(), self.TMP_DIR)
     zip_extracted_dir = os.path.join(self.get_tmp_dir(), self.JUNK_DIR)
     zip_renamed_files_dir = os.path.join(self.get_tmp_dir(), self.RENAME_DIR)
     pmc_zip_output_dir = os.path.join(self.get_tmp_dir(), self.INPUT_DIR)
     archive_zip_name = glob.glob(zip_input_dir + "/*.zip")[0]
     with zipfile.ZipFile(archive_zip_name, 'r') as myzip:
         myzip.extractall(zip_extracted_dir)
     # rename the files and profile the files
     file_name_map = article_processing.rename_files_remove_version_number(
         files_dir=zip_extracted_dir,
         output_dir=zip_renamed_files_dir
     )
     if self.logger:
         self.logger.info("FTPArticle running %s workflow for article %s, file_name_map"
                          % (self.workflow, self.doi_id))
         self.logger.info(file_name_map)
     # convert the XML
     article_xml_file = glob.glob(zip_renamed_files_dir + "/*.xml")[0]
     article_processing.convert_xml(xml_file=article_xml_file,
                                    file_name_map=file_name_map)
     # rezip the files into PMC zip format
     soup = parser.parse_document(article_xml_file)
     volume = parser.volume(soup)
     pmc_zip_file_name = article_processing.new_pmc_zip_filename(self.journal, volume, doi_id)
     with zipfile.ZipFile(os.path.join(pmc_zip_output_dir, pmc_zip_file_name), 'w',
                          zipfile.ZIP_DEFLATED, allowZip64=True) as new_zipfile:
         dirfiles = article_processing.file_list(zip_renamed_files_dir)
         for df in dirfiles:
             filename = df.split(os.sep)[-1]
             new_zipfile.write(df, filename)
     return True
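One detail worth noting above: df.split(os.sep)[-1] strips the directory portion so each entry lands at the root of the PMC zip. A sketch of an equivalent, arguably clearer spelling (not from the source):

    import os.path
    for df in dirfiles:
        # same arcname as df.split(os.sep)[-1]
        new_zipfile.write(df, os.path.basename(df))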
Example #3
def to_soup(doc):
    if isinstance(doc, basestring):
        if os.path.exists(doc):
            return parseJATS.parse_document(doc)
        return parseJATS.parse_xml(doc)
    # assume it's a file-like object and attempt to .read() its contents
    return parseJATS.parse_xml(doc.read())
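Because of the isinstance checks, to_soup accepts three input shapes. A usage sketch (file names are illustrative):

    soup = to_soup("/path/to/article.xml")  # existing file path -> parse_document
    soup = to_soup("<article/>")            # raw XML string -> parse_xml
    with open("article.xml") as fh:
        soup = to_soup(fh)                  # file-like object -> parse_xml(doc.read())

Note that basestring exists only in Python 2; a Python 3 port would test isinstance(doc, str) instead.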
Example #4
    def setUp(self):
        kitchen_sink_xml = sample_xml("elife-kitchen-sink.xml")

        # all of these methods are equivalent:
        # self.soup = bss(open(kitchen_sink_xml, 'r').read())
        # self.soup = parser.parse_xml(open(kitchen_sink_xml, 'r'))
        self.soup = parser.parse_document(kitchen_sink_xml)
Example #5
def transform_xml(xml_asset_path, identifier):
    "modify the XML"
    # remove history tags from XML for certain article types
    root = parse.parse_article_xml(xml_asset_path)
    soup = parser.parse_document(xml_asset_path)
    root = transform_xml_history_tags(root, soup, identifier)
    write_xml_file(root, xml_asset_path, identifier)
Example #6
    def test_quickly(self):
        struct = [
            (parser.doi, u"10.7554/eLife.00013"),
            (parser.journal_id, u"eLife"),
            (parser.journal_title, u"eLife"),
            (parser.journal_issn, u"2050-084X"),
            (parser.publisher, u"eLife Sciences Publications, Ltd"),
        ]
        for func, expected in struct:
            soup = parser.parse_document(self.kitchen_sink_xml)
            got = func(soup)
            try:
                self.assertEqual(got, expected)
            except AssertionError:
                print("failed on", func, "expected", expected, "got", got)
                raise
        soup = parser.parse_document(self.kitchen_sink_xml)
        self.assertEqual(parser.journal_issn(soup, pub_format="electronic"),
                         u"2050-084X")
Example #7
File: feeds.py Project: gnott/jats-scraper
def article_wrapper(path, version=None):
    soup = parser.parse_document(path)
    # return a wrapper around the parser module that injects the soup when a function is called
    return ParserWrapper(soup, path, version)
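A usage sketch, assuming ParserWrapper forwards attribute access to the parser module with the soup pre-bound (the wrapper's internals are not shown in this example):

    wrapped = article_wrapper("elife-kitchen-sink.xml")
    doi = wrapped.doi()  # hypothetically equivalent to parser.doi(soup)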
Example #8
 def test_sub_article(self, filename, expected_len):
     soup = parser.parse_document(sample_xml(filename))
     self.assertEqual(len(raw_parser.sub_article(soup)), expected_len)
Example #9
 def test_article_body(self, filename, expected_type):
     soup = parser.parse_document(sample_xml(filename))
     self.assertEqual(type(raw_parser.article_body(soup)), expected_type)
Example #10
 def test_abstract(self, filename, expected_len):
     soup = parser.parse_document(sample_xml(filename))
     self.assertEqual(len(raw_parser.abstract(soup)), expected_len)
Example #11
 def test_decision_letter(self, filename, expected_type):
     soup = parser.parse_document(sample_xml(filename))
     self.assertEqual(type(raw_parser.decision_letter(soup)), expected_type)
Example #12
 def test_author_response(self, filename, expected_type):
     soup = parser.parse_document(sample_xml(filename))
     self.assertEqual(type(raw_parser.author_response(soup)), expected_type)
Example #13
def parse_jats_file(jats_file_name):
    "parse the jats file into a BeautifulSoup object"
    return parser.parse_document(jats_file_name)
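A usage sketch combining this helper with the parser accessors seen in the examples above (file name illustrative):

    soup = parse_jats_file("elife-kitchen-sink.xml")
    print(parser.doi(soup), parser.title(soup))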
Example #14
 def test_funding_group(self, filename, expected_len):
     soup = parser.parse_document(sample_xml(filename))
     self.assertEqual(len(raw_parser.funding_group(soup)), expected_len)
Example #15
 def test_fn_group(self, filename, content_type, expected_len):
     soup = parser.parse_document(sample_xml(filename))
     self.assertEqual(len(raw_parser.fn_group(soup, content_type)),
                      expected_len)
Example #16
 def test_pub_id_uri(self, filename, ext_link_type, expected_len):
     soup = parser.parse_document(sample_xml(filename))
     self.assertEqual(len(raw_parser.ext_link(soup, ext_link_type)),
                      expected_len)
Example #17
 def test_disp_formula(self, filename, expected_len):
     soup = parser.parse_document(sample_xml(filename))
     self.assertEqual(len(raw_parser.disp_formula(soup)), expected_len)
Example #18
 def test_table_wrap_foot(self, filename, expected_len):
     soup = parser.parse_document(sample_xml(filename))
     self.assertEqual(len(raw_parser.table_wrap_foot(soup)), expected_len)
Example #19
 def article_soup(self, xml_file):
     return parser.parse_document(xml_file)
Example #20
 def test_principal_award_recipient(self, filename, expected_len):
     soup = parser.parse_document(sample_xml(filename))
     self.assertEqual(len(raw_parser.principal_award_recipient(soup)),
                      expected_len)
Example #21
 def setUp(self):
     self.kitchen_sink_xml = sample_xml("elife-kitchen-sink.xml")
     self.xml = sample_xml("elife00013.xml")
     self.soup = parser.parse_document(self.kitchen_sink_xml)
Example #22
def build_article_from_xml(article_xml_filename, detail="brief"):
    """
    Parse JATS XML with elifetools parser, and populate an
    eLifePOA article object
    Basic data crossref needs: article_id, doi, title, contributors with names set
    detail="brief" is normally enough,
    detail="full" will populate all the contributor affiliations that are linked by xref tags
    """

    error_count = 0

    soup = parser.parse_document(article_xml_filename)

    # Get DOI
    doi = parser.doi(soup)

    # Create the article object
    article = eLifePOA(doi, title=None)

    # Related articles
    article.related_articles = build_related_articles(parser.related_article(soup))

    # Get publisher_id and set object manuscript value
    publisher_id = parser.publisher_id(soup)
    article.manuscript = publisher_id

    # Set the articleType
    article_type = parser.article_type(soup)
    if article_type:
        article.articleType = article_type

    # title
    article.title = parser.full_title(soup)
    #print article.title

    # abstract
    article.abstract = clean_abstract(parser.full_abstract(soup))

    # digest
    article.digest = clean_abstract(parser.full_digest(soup))

    # elocation-id
    article.elocation_id = parser.elocation_id(soup)

    # contributors
    all_contributors = parser.contributors(soup, detail)
    author_contributors = filter(lambda con: con.get('type')
                                 in ['author', 'on-behalf-of'], all_contributors)
    contrib_type = "author"
    contributors = build_contributors(author_contributors, contrib_type)

    contrib_type = "author non-byline"
    authors = parser.authors_non_byline(soup, detail)
    contributors_non_byline = build_contributors(authors, contrib_type)
    article.contributors = contributors + contributors_non_byline

    # license href
    license = eLifeLicense()
    license.href = parser.license_url(soup)
    article.license = license

    # article_category
    article.article_categories = parser.category(soup)

    # keywords
    article.author_keywords = parser.keywords(soup)

    # research organisms
    article.research_organisms = parser.research_organism(soup)

    # funding awards
    article.funding_awards = build_funding(parser.full_award_groups(soup))

    # references or citations
    article.ref_list = build_ref_list(parser.refs(soup))

    # components with component DOI
    article.component_list = build_components(parser.components(soup))

    # History dates
    date_types = ["received", "accepted"]
    for date_type in date_types:
        history_date = parser.history_date(soup, date_type)
        if history_date:
            date_instance = eLifeDate(date_type, history_date)
            article.add_date(date_instance)

    # Pub date
    pub_date = parser.pub_date(soup)
    if pub_date:
        date_instance = eLifeDate("pub", pub_date)
        article.add_date(date_instance)

    # Set the volume if present
    volume = parser.volume(soup)
    if volume:
        article.volume = volume

    article.is_poa = parser.is_poa(soup)

    return article, error_count
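A usage sketch for the builder above (file name illustrative; the attributes read back are the ones set during the build):

    article, error_count = build_article_from_xml("elife-kitchen-sink.xml", detail="full")
    print(article.doi, article.title, len(article.contributors))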
Example #23
 def test_quickly2(self):
     soup = parser.parse_document(self.xml)
     self.assertEqual(raw_parser.article_type(soup), "research-article")
Example #24
 def test_mixed_citations(self):
     self.soup = parser.parse_document(sample_xml("elife-kitchen-sink.xml"))
     self.assertEqual(1, len(raw_parser.mixed_citations(self.soup)))