Example #1
 def repackage_archive_zip_to_pmc_zip(self, doi_id):
     "repackage the zip file in the TMP_DIR to a PMC zip format"
     # unzip contents
     zip_input_dir = os.path.join(self.get_tmp_dir(), self.TMP_DIR)
     zip_extracted_dir = os.path.join(self.get_tmp_dir(), self.JUNK_DIR)
     zip_renamed_files_dir = os.path.join(self.get_tmp_dir(), self.RENAME_DIR)
     pmc_zip_output_dir = os.path.join(self.get_tmp_dir(), self.INPUT_DIR)
     archive_zip_name = glob.glob(os.path.join(zip_input_dir, "*.zip"))[0]
     with zipfile.ZipFile(archive_zip_name, 'r') as myzip:
         myzip.extractall(zip_extracted_dir)
     # rename the files and profile the files
     file_name_map = article_processing.rename_files_remove_version_number(
         files_dir=zip_extracted_dir,
         output_dir=zip_renamed_files_dir
     )
     if self.logger:
         self.logger.info("FTPArticle running %s workflow for article %s, file_name_map"
                          % (self.workflow, self.doi_id))
         self.logger.info(file_name_map)
     # convert the XML
     article_xml_file = glob.glob(os.path.join(zip_renamed_files_dir, "*.xml"))[0]
     article_processing.convert_xml(xml_file=article_xml_file,
                                    file_name_map=file_name_map)
     # rezip the files into PMC zip format
     soup = parser.parse_document(article_xml_file)
     volume = parser.volume(soup)
     pmc_zip_file_name = article_processing.new_pmc_zip_filename(self.journal, volume, doi_id)
     with zipfile.ZipFile(os.path.join(pmc_zip_output_dir, pmc_zip_file_name), 'w',
                          zipfile.ZIP_DEFLATED, allowZip64=True) as new_zipfile:
         dirfiles = article_processing.file_list(zip_renamed_files_dir)
         for df in dirfiles:
             filename = os.path.basename(df)
             new_zipfile.write(df, filename)
     return True
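
As an aside, the rename step above returns a map from original to renamed file names. A rough standalone sketch of what such a helper might look like (illustrative only, not the actual article_processing implementation):

import os
import re
import shutil

def rename_files_remove_version_number(files_dir, output_dir):
    # Illustrative sketch: copy each file into output_dir, stripping a
    # "-v1" / "-v2" style version suffix, and map old names to new names.
    file_name_map = {}
    for filename in os.listdir(files_dir):
        renamed = re.sub(r'-v[0-9]+[.]', '.', filename)
        file_name_map[filename] = renamed
        shutil.copyfile(os.path.join(files_dir, filename),
                        os.path.join(output_dir, renamed))
    return file_name_map
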
 def profile_article(self, document):
     """
     Temporary, profile the article by folder names in test data set
     In real code we still want this to return the same values
     """
     # Temporary setting of version values from directory names
     
     soup = self.article_soup(self.article_xml_file())
     
     # elife id / doi id / manuscript id
     fid = parser.doi(soup).split('.')[-1]
 
     # article status
     if parser.is_poa(soup):
         status = 'poa'
     else:
         status = 'vor'
     
     # version
     version = self.version_number(document)
 
     # volume
     volume = parser.volume(soup)
         
     return (fid, status, version, volume)
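
The fid computed above is just the numeric suffix of the article DOI. A quick standalone illustration of that extraction (the DOI value is hypothetical):

doi = "10.7554/eLife.00013"  # hypothetical eLife-style DOI
fid = doi.split('.')[-1]
print(fid)  # prints "00013"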
Example #4
    def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
        """Amend the article XML in place; for PoA articles, add any missing metadata."""

        # Register namespaces
        xmlio.register_xmlns()

        root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

        soup = self.article_soup(xml_file)

        if parser.is_poa(soup):
            # Capitalise subject group values in article categories
            root = self.subject_group_convert_in_xml(root)

            pub_date = parser.pub_date(soup)
            if pub_date is None:
                # add the published date to the XML
                pub_date = self.get_pub_date_if_missing(doi_id)
                root = self.add_pub_date_to_xml(doi_id, pub_date, root)

            if parser.volume(soup) is None:
                # Get the pub-date year to calculate the volume
                year = pub_date[0]
                volume = year - 2011
                self.add_volume_to_xml(doi_id, volume, root)

            # set the article-id, to overwrite the v2, v3 value if present
            root = self.set_article_id_xml(doi_id, root)

            # if there is a pdf file but no self-uri tag yet, add one
            if parser.self_uri(soup) is not None and len(parser.self_uri(soup)) == 0:
                for filename in new_filenames:
                    if filename.endswith('.pdf'):
                        root = self.add_self_uri_to_xml(doi_id, filename, root)

            # if a ds.zip file is present, add it to the XML
            poa_ds_zip_file = None
            for new_filename in new_filenames:
                if new_filename.endswith('.zip'):
                    poa_ds_zip_file = new_filename
            if poa_ds_zip_file:
                root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)

        # Start the file output
        reparsed_string = xmlio.output(root,
                                       type=None,
                                       doctype_dict=doctype_dict)

        # Remove extra whitespace to clean up PoA articles (and one VoR file too)
        reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')

        with open(xml_file, 'wb') as open_file:
            open_file.write(reparsed_string)
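
The volume arithmetic in convert_xml assumes eLife volume numbers track the publication year, with volume 1 corresponding to 2012. A standalone illustration, assuming pub_date is a year-first date tuple as in the code above:

pub_date = (2015, 3, 26)  # hypothetical (year, month, day) tuple
year = pub_date[0]
volume = year - 2011
print(volume)  # prints 4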
Example #6
def build_article_from_xml(article_xml_filename, detail="brief"):
    """
    Parse JATS XML with elifetools parser, and populate an
    eLifePOA article object
    Basic data crossref needs: article_id, doi, title, contributors with names set
    detail="brief" is normally enough,
    detail="full" will populate all the contributor affiliations that are linked by xref tags
    """

    error_count = 0

    soup = parser.parse_document(article_xml_filename)

    # Get DOI
    doi = parser.doi(soup)

    # Create the article object
    article = eLifePOA(doi, title=None)

    # Related articles
    article.related_articles = build_related_articles(parser.related_article(soup))

    # Get publisher_id and set object manuscript value
    publisher_id = parser.publisher_id(soup)
    article.manuscript = publisher_id

    # Set the articleType
    article_type = parser.article_type(soup)
    if article_type:
        article.articleType = article_type

    # title
    article.title = parser.full_title(soup)

    # abstract
    article.abstract = clean_abstract(parser.full_abstract(soup))

    # digest
    article.digest = clean_abstract(parser.full_digest(soup))

    # elocation-id
    article.elocation_id = parser.elocation_id(soup)

    # contributors
    all_contributors = parser.contributors(soup, detail)
    author_contributors = [con for con in all_contributors
                           if con.get('type') in ['author', 'on-behalf-of']]
    contrib_type = "author"
    contributors = build_contributors(author_contributors, contrib_type)

    contrib_type = "author non-byline"
    authors = parser.authors_non_byline(soup, detail)
    contributors_non_byline = build_contributors(authors, contrib_type)
    article.contributors = contributors + contributors_non_byline

    # license href
    license = eLifeLicense()
    license.href = parser.license_url(soup)
    article.license = license

    # article_category
    article.article_categories = parser.category(soup)

    # keywords
    article.author_keywords = parser.keywords(soup)

    # research organisms
    article.research_organisms = parser.research_organism(soup)

    # funding awards
    article.funding_awards = build_funding(parser.full_award_groups(soup))

    # references or citations
    article.ref_list = build_ref_list(parser.refs(soup))

    # components with component DOI
    article.component_list = build_components(parser.components(soup))

    # History dates
    date_types = ["received", "accepted"]
    for date_type in date_types:
        history_date = parser.history_date(soup, date_type)
        if history_date:
            date_instance = eLifeDate(date_type, history_date)
            article.add_date(date_instance)

    # Pub date
    pub_date = parser.pub_date(soup)
    if pub_date:
        date_instance = eLifeDate("pub", pub_date)
        article.add_date(date_instance)

    # Set the volume if present
    volume = parser.volume(soup)
    if volume:
        article.volume = volume

    article.is_poa = parser.is_poa(soup)

    return article, error_count
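
A hedged usage sketch for this function: given the module it lives in, a caller might build and inspect an article object like this (the file name is hypothetical):

article, error_count = build_article_from_xml("elife-00013-v1.xml", detail="full")
print(article.doi)
print(article.title)
print(len(article.contributors), "contributors")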