Example #1
    def convert_xml(self, doi_id, xml_file, filenames, new_filenames):

        # Register namespaces
        xmlio.register_xmlns()

        root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

        soup = self.article_soup(xml_file)

        if parser.is_poa(soup):
            # Capitalise subject group values in article categories
            root = self.subject_group_convert_in_xml(root)

            pub_date = None
            if parser.pub_date(soup) is None:
                # add the published date to the XML
                pub_date = self.get_pub_date_if_missing(doi_id)
                root = self.add_pub_date_to_xml(doi_id, pub_date, root)
            else:
                pub_date = parser.pub_date(soup)

            if parser.volume(soup) is None:
                # Get the pub-date year to calculate the volume
                year = pub_date[0]
                volume = year - 2011
                self.add_volume_to_xml(doi_id, volume, root)

            # set the article-id, to overwrite the v2, v3 value if present
            root = self.set_article_id_xml(doi_id, root)

            # if no self-uri tag is present yet, add one pointing at the pdf file
            self_uri = parser.self_uri(soup)
            if self_uri is not None and len(self_uri) == 0:
                for filename in new_filenames:
                    if filename.endswith('.pdf'):
                        root = self.add_self_uri_to_xml(doi_id, filename, root)

            # if ds.zip file is there, then add it to the xml
            poa_ds_zip_file = None
            for f in new_filenames:
                if f.endswith('.zip'):
                    poa_ds_zip_file = f
            if poa_ds_zip_file:
                root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file,
                                                  root)

        # Start the file output
        reparsed_string = xmlio.output(root,
                                       type=None,
                                       doctype_dict=doctype_dict)

        # Remove extra whitespace here to clean up PoA articles, and one VoR file too
        reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')

        with open(xml_file, 'wb') as open_file:
            open_file.write(reparsed_string)
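A minimal sketch of the volume arithmetic used above, assuming pub_date is a
(year, month, day)-style tuple as the pub_date[0] indexing suggests; volume 1
corresponds to 2012:

# Hypothetical publication date; only the year is used
pub_date = (2015, 6, 8)
volume = pub_date[0] - 2011
assert volume == 4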
Example #3
    def parse_article_xml(self, document):
        """
        Given article XML, parse
        it and return an object representation
        """

        try:
            soup = parser.parse_document(document)
            self.doi = parser.doi(soup)
            if self.doi:
                self.doi_id = self.get_doi_id(self.doi)
                self.doi_url = self.get_doi_url(self.doi)
                self.lens_url = self.get_lens_url(self.doi)
                self.tweet_url = self.get_tweet_url(self.doi)

            self.pub_date = parser.pub_date(soup)
            self.pub_date_timestamp = parser.pub_date_timestamp(soup)

            self.article_title = parser.title(soup)
            self.article_type = parser.article_type(soup)

            self.authors = parser.authors(soup)
            self.authors_string = self.get_authors_string(self.authors)

            self.related_articles = parser.related_article(soup)

            self.is_poa = parser.is_poa(soup)

            #self.subject_area = self.parse_subject_area(soup)

            self.display_channel = parser.display_channel(soup)

            return True
        except Exception:
            return False
Example #4
    def parse_article_xml(self, document):
        """
        Given article XML, parse
        it and return an object representation
        """

        try:
            soup = parser.parse_document(document)
            self.doi = parser.doi(soup)
            if self.doi:
                self.doi_id = self.get_doi_id(self.doi)
                self.doi_url = self.get_doi_url(self.doi)
                self.lens_url = self.get_lens_url(self.doi)
                self.tweet_url = self.get_tweet_url(self.doi)

            self.pub_date = parser.pub_date(soup)
            self.pub_date_timestamp = parser.pub_date_timestamp(soup)

            self.article_title = parser.title(soup)
            self.article_type = parser.article_type(soup)

            self.authors = parser.authors(soup)
            self.authors_string = self.get_authors_string(self.authors)

            self.related_articles = parser.related_article(soup)

            self.is_poa = parser.is_poa(soup)

            #self.subject_area = self.parse_subject_area(soup)

            self.display_channel = parser.display_channel(soup)

            return True
        except Exception:
            return False
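Both parse_article_xml examples delegate to a get_authors_string helper that
is not shown here. A hedged sketch of what such a helper might look like,
assuming the parsed authors are dicts with JATS-style name keys:

def get_authors_string(authors):
    # Hypothetical sketch only; the real helper is not shown in these examples
    names = []
    for author in authors:
        given = author.get("given-names", "")
        surname = author.get("surname", "")
        names.append(" ".join(part for part in (given, surname) if part))
    return ", ".join(names)

# e.g. get_authors_string([{"given-names": "Jane", "surname": "Doe"}]) == "Jane Doe"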
Example #5
    def convert_xml(self, doi_id, xml_file, filenames, new_filenames):

        # Register namespaces
        xmlio.register_xmlns()
        
        root = xmlio.parse(xml_file)

        soup = self.article_soup(xml_file)

        if parser.is_poa(soup):
            # Capitalise subject group values in article categories
            root = self.subject_group_convert_in_xml(root)
            
            if parser.pub_date(soup) is None:
                # add the published date to the XML
                root = self.add_pub_date_to_xml(doi_id, root)
            
            # set the article-id, to overwrite the v2, v3 value if present
            root = self.set_article_id_xml(doi_id, root)
            
            # if ds.zip file is there, then add it to the xml
            poa_ds_zip_file = None
            for f in new_filenames:
                if f.endswith('.zip'):
                    poa_ds_zip_file = f
            if poa_ds_zip_file:
                root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)
            
                
        # Start the file output
        reparsed_string = xmlio.output(root)

        # Remove extra whitespace here to clean up PoA articles, and one VoR file too
        reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')

        with open(xml_file, 'wb') as open_file:
            open_file.write(reparsed_string)
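This variant round-trips the file through xmlio.parse and xmlio.output without
the doctype handling seen in Example #1. For orientation, a rough
standard-library approximation of that round-trip; a sketch only, since xmlio
also manages namespace registration and the doctype:

from xml.etree import ElementTree

tree = ElementTree.parse("article.xml")  # hypothetical input file
root = tree.getroot()
reparsed_string = ElementTree.tostring(root, encoding="utf-8")
with open("article.xml", "wb") as open_file:
    open_file.write(reparsed_string)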
Example #6
def parse_jats_pub_date(soup):
    "extract the pub date from the soup"
    pub_date = parser.pub_date(soup)
    return pub_date
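A hedged usage sketch; the file name is hypothetical, and pub_date is whatever
structure parser.pub_date returns (the examples above index it as pub_date[0]
for the year):

from elifetools import parser

soup = parser.parse_document("elife_poa_e00013.xml")  # hypothetical path
pub_date = parse_jats_pub_date(soup)
if pub_date:
    print(pub_date[0])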
Example #7
def build_article_from_xml(article_xml_filename, detail="brief"):
    """
    Parse JATS XML with elifetools parser, and populate an
    eLifePOA article object
    Basic data crossref needs: article_id, doi, title, contributors with names set
    detail="brief" is normally enough,
    detail="full" will populate all the contributor affiliations that are linked by xref tags
    """

    error_count = 0

    soup = parser.parse_document(article_xml_filename)

    # Get DOI
    doi = parser.doi(soup)

    # Create the article object
    article = eLifePOA(doi, title=None)

    # Related articles
    article.related_articles = build_related_articles(parser.related_article(soup))

    # Get publisher_id and set object manuscript value
    publisher_id = parser.publisher_id(soup)
    article.manuscript = publisher_id

    # Set the articleType
    article_type = parser.article_type(soup)
    if article_type:
        article.articleType = article_type

    # title
    article.title = parser.full_title(soup)

    # abstract
    article.abstract = clean_abstract(parser.full_abstract(soup))

    # digest
    article.digest = clean_abstract(parser.full_digest(soup))

    # elocation-id
    article.elocation_id = parser.elocation_id(soup)

    # contributors
    all_contributors = parser.contributors(soup, detail)
    author_contributors = [con for con in all_contributors
                           if con.get('type') in ['author', 'on-behalf-of']]
    contrib_type = "author"
    contributors = build_contributors(author_contributors, contrib_type)

    contrib_type = "author non-byline"
    authors = parser.authors_non_byline(soup, detail)
    contributors_non_byline = build_contributors(authors, contrib_type)
    article.contributors = contributors + contributors_non_byline

    # license href
    license = eLifeLicense()
    license.href = parser.license_url(soup)
    article.license = license

    # article_category
    article.article_categories = parser.category(soup)

    # keywords
    article.author_keywords = parser.keywords(soup)

    # research organisms
    article.research_organisms = parser.research_organism(soup)

    # funding awards
    article.funding_awards = build_funding(parser.full_award_groups(soup))

    # references or citations
    article.ref_list = build_ref_list(parser.refs(soup))

    # components with component DOI
    article.component_list = build_components(parser.components(soup))

    # History dates
    date_types = ["received", "accepted"]
    for date_type in date_types:
        history_date = parser.history_date(soup, date_type)
        if history_date:
            date_instance = eLifeDate(date_type, history_date)
            article.add_date(date_instance)

    # Pub date
    pub_date = parser.pub_date(soup)
    if pub_date:
        date_instance = eLifeDate("pub", pub_date)
        article.add_date(date_instance)

    # Set the volume if present
    volume = parser.volume(soup)
    if volume:
        article.volume = volume

    article.is_poa = parser.is_poa(soup)

    return article, error_count
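A usage sketch following the docstring: detail="brief" covers the basic
Crossref data, while detail="full" also resolves xref-linked contributor
affiliations. The file name here is hypothetical:

article, error_count = build_article_from_xml("elife_poa_e00013.xml")
print(article.doi, article.manuscript, article.is_poa)

article_full, _ = build_article_from_xml("elife_poa_e00013.xml", detail="full")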