Example #1
    def preprocess(self,
                   xml_file_name='dblp.xml.gz',
                   process_name=True,
                   num_file_lines=10**6,
                   show_progress=True):
        """
        Bulk preprocess of the DBLP raw data.

        Parameters
        ----------
        :param process_name: bool, default True
            If True, then when processing the raw file, the package `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
            will be used to split author FullNames.

        :param xml_file_name: str, default 'dblp.xml.gz'
            The xml file name.

        :param num_file_lines: int, default 10**6
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

        :param show_progress: bool, default True
            Show progress with processing of the data.

        """

        ACCEPT_DOCTYPES = set([
            'article', 'inproceedings', 'proceedings', 'book', 'incollection',
            'phdthesis', 'mastersthesis'
        ])
        REJECT_DOCTYPES = set(['www'])
        DATA_ITEMS = [
            'title', 'booktitle', 'year', 'journal', 'ee', 'url', 'month',
            'mdate', 'isbn', 'publisher'
        ]
        SKIP_FIELDS = [
            'note', 'cite', 'cdrom', 'crossref', 'editor', 'series', 'tt',
            'school', 'chapter', 'address'
        ]

        doctype = {
            'article': 'j',
            'book': 'b',
            '': '',
            'phdthesis': 'phd',
            'proceedings': 'c',
            'inproceedings': 'c',
            'mastersthesis': 'ms',
            'incollection': 'c'
        }

        html_format_keys = [
            '<sub>', '</sub>', '<sup>', '</sup>', '<i>', '</i>'
        ]

        if show_progress:
            print("Starting to preprocess the DBLP database.")

        if not os.path.exists(os.path.join(self.path2database, 'publication')):
            os.mkdir(os.path.join(self.path2database, 'publication'))

        if not os.path.exists(os.path.join(self.path2database, 'author')):
            os.mkdir(os.path.join(self.path2database, 'author'))

        if not os.path.exists(
                os.path.join(self.path2database, 'publicationauthor')):
            os.mkdir(os.path.join(self.path2database, 'publicationauthor'))

        publication_df = []
        author_df = []
        author2pub_df = []
        journal_df = []

        PublicationId = 1
        AuthorId = 1
        aname2aid = {}
        author_columns = ['AuthorId', 'FullName']
        if process_name:
            author_columns += ['LastName', 'FirstName', 'MiddleName']
        JournalId = 1
        jname2jid = {}

        pub_record = self._blank_dblp_publication(PublicationId)
        pub_authors = []
        AuthorCount = 0

        ifile = 0

        # read the DTD; alias the path locally so the nested resolver class can use it
        path2database = self.path2database

        class DTDResolver(etree.Resolver):
            def resolve(self, system_url, public_id, context):
                return self.resolve_filename(
                    os.path.join(path2database, system_url), context)

        if '.gz' in xml_file_name:
            with gzip.open(os.path.join(self.path2database, xml_file_name),
                           'r') as infile:
                xml_file = infile.read()

        else:
            with open(os.path.join(self.path2database, xml_file_name),
                      'r') as infile:
                xml_file = infile.read().encode('latin1')

        # extract the desired fields from the XML tree  #
        bytesxml = BytesIO(xml_file)
        xmltree = etree.iterparse(bytesxml,
                                  load_dtd=True,
                                  resolve_entities=True)
        xmltree.resolvers.add(DTDResolver())

        if show_progress:
            print("Xml tree parsed, iterating through elements.")

        last_position = 0
        xml_size = bytesxml.getbuffer().nbytes
        with tqdm(total=xml_size,
                  unit='iB',
                  unit_scale=True,
                  desc='dblp.xml',
                  leave=True,
                  disable=not show_progress) as pbar:
            for event, elem in xmltree:
                if elem.tag == 'title' or elem.tag == 'booktitle':
                    pub_record['Title'] = load_html_str(elem.text)

                elif elem.tag == 'year':
                    pub_record['Year'] = load_int(elem.text)

                elif elem.tag == 'month':
                    pub_record['Month'] = load_int(elem.text)

                elif elem.tag == 'volume':
                    pub_record['Volume'] = load_int(elem.text)

                elif elem.tag == 'number':
                    pub_record['Number'] = load_html_str(elem.text)

                elif elem.tag == 'pages':
                    pub_record['Pages'] = load_html_str(elem.text)

                elif elem.tag == 'journal':
                    pub_record['JournalId'] = load_html_str(elem.text)

                elif elem.tag == 'url':
                    pub_record['URL'] = load_html_str(elem.text)

                elif elem.tag == 'ee':
                    pub_record['EE'] = load_html_str(elem.text)

                elif elem.tag == 'author':
                    AuthorCount += 1
                    fullname = load_html_str(elem.text)
                    if aname2aid.get(fullname, None) is None:
                        if process_name:
                            fullname = ''.join([
                                i for i in fullname if not i.isdigit()
                            ]).strip()
                            hname = HumanName(fullname)
                            author_df.append([
                                AuthorId, fullname, hname.last, hname.first,
                                hname.middle
                            ])
                        else:
                            author_df.append([AuthorId, fullname])
                        aname2aid[fullname] = AuthorId
                        AuthorId += 1

                    pub_authors.append(
                        [PublicationId, aname2aid[fullname], AuthorCount])

                elif elem.tag in ACCEPT_DOCTYPES:
                    pub_record['TeamSize'] = AuthorCount
                    pub_record['DocType'] = doctype[load_html_str(elem.tag)]

                    publication_df.append(pub_record)
                    author2pub_df.extend(pub_authors)
                    PublicationId += 1
                    pub_record = self._blank_dblp_publication(PublicationId)
                    AuthorCount = 0
                    pub_authors = []

                    # update progress bar
                    pbar.update(bytesxml.tell() - last_position)
                    last_position = bytesxml.tell()

                    if num_file_lines > 0 and (PublicationId %
                                               num_file_lines) == 0:

                        self._save_dataframes(ifile, publication_df, author_df,
                                              author_columns, author2pub_df)

                        ifile += 1

                        publication_df = []
                        author_df = []
                        author2pub_df = []

                elif elem.tag in REJECT_DOCTYPES:
                    # the record was from a rejected category so reset record
                    pub_record = self._blank_dblp_publication(PublicationId)
                    AuthorCount = 0
                    pub_authors = []

                elif elem.tag in SKIP_FIELDS:
                    pass

        del xmltree

        self._save_dataframes(ifile, publication_df, author_df, author_columns,
                              author2pub_df)
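
A minimal usage sketch for the method above (not part of the snippet): it assumes the surrounding class is pyscisci's DBLP interface and that `path2database` points at a directory containing `dblp.xml.gz`; the import path and constructor argument are illustrative rather than confirmed by this snippet.

from pyscisci.datasource.DBLP import DBLP  # hypothetical import path

# point the object at the directory holding dblp.xml.gz
mydblp = DBLP(path2database='/data/DBLP/')

# bulk preprocess: splits author names with nameparser and writes chunked
# publication, author and publication-author tables to disk
mydblp.preprocess(xml_file_name='dblp.xml.gz',
                  process_name=True,
                  num_file_lines=10**6,
                  show_progress=True)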
Example #2
    def preprocess(self, xml_directory='RawXML', process_name=True, num_file_lines=10**6, show_progress=True, rewrite_existing=False):
        """
        Bulk preprocess of the PubMed raw data.

        Parameters
        ----------
        process_name: bool, default True
            If True, then when processing the raw file, the package `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
            will be used to split author FullNames.

        num_file_lines: int, default 10**6
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

        show_progress: bool, default True
            Show progress with processing of the data.

        rewrite_existing: bool, default False
            If True, rewrites the files in the data directory
        """

        if show_progress:
            print("Starting to preprocess the PubMed database.")

        for hier_dir_type in [self.path2pub_df, self.path2paa_df, self.path2pub2field_df, self.path2pub2ref_df, self.path2fieldinfo_df]:

            if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
                os.mkdir(os.path.join(self.path2database, hier_dir_type))


        xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

        # read the DTD; alias the path locally so the nested resolver class can use it
        path2database = self.path2database
        class DTDResolver(etree.Resolver):
            def resolve(self, system_url, public_id, context):
                return self.resolve_filename(os.path.join(path2database, system_url), context)
        parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

        pub2year = {}
        fieldinfo = {}

        ifile = 0
        for xml_file_name in tqdm(xmlfiles, desc='PubMed xml files', leave=True, disable=not show_progress):

            # check if the xml file was already parsed
            dest_file_name = os.path.join(self.path2database, self.path2pub_df,'publication{}.hdf'.format(ifile))
            if not rewrite_existing and os.path.isfile(dest_file_name):
                ifile+=1
                continue

            publication_df = []
            paa_df = []
            pub2field_df = []
            pub2ref_df = []

            xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)

            all_pubmed_articles = xmltree.findall("/PubmedArticle")

            for article_bucket in all_pubmed_articles:

                medline = article_bucket.find("MedlineCitation")

                # scrape the publication information
                PublicationId = load_int(load_xml_text(medline.find('PMID')))
                pub_record = self._blank_pubmed_publication(PublicationId)

                article = medline.find("Article")
                pub_record['Title'] = load_html_str(load_xml_text(article.find('ArticleTitle')))
                if article.find('Pagination') == None:
                    pub_record['Pages'] = None
                else:
                    pub_record['Pages'] = load_html_str(load_xml_text(article.find('Pagination').find("MedlinePgn")))

                journal = article.find("Journal")
                pub_record['JournalId'] = load_html_str(load_xml_text(journal.find("Title")))
                pub_record['Volume'] = load_int(load_xml_text(journal.find("JournalIssue").find("Volume")))
                pub_record['Issue'] = load_int(load_xml_text(journal.find("JournalIssue").find("Issue")))
                pub_record['ISSN'] = load_html_str(load_xml_text(journal.find("ISSN")))

                history = article_bucket.find("PubmedData/History")
                if not history is None:
                    pdate = history.find('PubMedPubDate')
                    if not pdate is None:
                        pub_record['Year'] = load_int(load_xml_text(pdate.find("Year")))
                        pub_record['Month'] = load_int(load_xml_text(pdate.find("Month")))
                        pub_record['Day'] = load_int(load_xml_text(pdate.find("Day")))


                if pub_record['Year'] > 0:
                    pub2year[PublicationId] = pub_record['Year']

                article_ids = article_bucket.find("PubmedData/ArticleIdList")
                if article_ids is not None:
                    doi = article_ids.find('ArticleId[@IdType="doi"]')
                    pub_record['Doi'] = load_xml_text(doi)


                author_list = article.find('AuthorList')

                if not author_list is None:
                    for seq, author in enumerate(author_list.findall('Author')):
                        author_record = self._blank_pubmed_author()

                        author_record['PublicationId'] = PublicationId
                        author_record['FirstName'] = load_html_str(load_xml_text(author.find("ForeName")))
                        author_record['LastName'] = load_html_str(load_xml_text(author.find("LastName")))
                        author_record['FullName'] = author_record['FirstName'] + ' ' + author_record['LastName']

                        if author.find("AffiliationInfo/Affiliation") is not None:
                            author_record['Affiliations'] = load_html_str(load_xml_text(author.find("AffiliationInfo/Affiliation")))
                            author_record['Affiliations'] = author_record['Affiliations'].replace("For a full list of the authors' affiliations please see the Acknowledgements section.","")

                        author_record['AuthorSequence'] = seq+1

                        paa_df.append(author_record)

                    pub_record['TeamSize'] = seq + 1

                meshterms = medline.find("MeshHeadingList")

                if meshterms is not None:
                    for term in meshterms.getchildren():
                        ui = term.find("DescriptorName").attrib.get("UI", "")
                        if len(ui)>0:
                            pub2field_df.append([PublicationId, ui])
                            fieldinfo[ui] = [load_xml_text(term.find("DescriptorName")), 'mesh']

                chemicals = medline.find("ChemicalList")
                if chemicals is not None:
                    for chemical in chemicals.findall("Chemical"):
                        ui = chemical.find("NameOfSubstance").attrib.get("UI", "")
                        if len(ui)>0:
                            pub2field_df.append([PublicationId, ui])
                            fieldinfo[ui] = [load_xml_text(chemical.find("NameOfSubstance")), 'chem']

                references = article_bucket.find("PubmedData/ReferenceList")
                if not references is None:
                    for ref in references.findall("Reference"):
                        citation = load_xml_text(ref.find("Citation"))
                        if not ref.find('ArticleIdList') is None:
                            pmid = load_int(load_xml_text(ref.find('ArticleIdList').find('ArticleId[@IdType="pubmed"]')))
                        else:
                            pmid = ""
                        pub2ref_df.append([PublicationId, pmid, citation])

                publication_df.append(pub_record)

            self._save_dataframes(ifile, publication_df, paa_df, pub2ref_df, pub2field_df)
            ifile += 1

        # save the field info dictionary (skip only if it already exists and we are not rewriting)
        dest_file_name = os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf')
        if rewrite_existing or not os.path.isfile(dest_file_name):
            mesh_id_df_list = list(fieldinfo.values())
            for i, j in enumerate(fieldinfo.keys()):
                mesh_id_df_list[i].insert(0, j)

            fieldinfo_df = pd.DataFrame(mesh_id_df_list, columns=['FieldId', 'FieldName', 'FieldType'])
            fieldinfo_df.to_hdf(dest_file_name, key='fieldinfo', mode='w')

        with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
            outfile.write(json.dumps(pub2year).encode('utf8'))
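
The parser above leans on three small helpers (`load_xml_text`, `load_int`, `load_html_str`) imported from elsewhere in the package. Their real implementations are not shown in this snippet; the following is only a sketch of the behaviour the calling code appears to assume, namely that each tolerates missing elements and unparseable values (the actual package may use different defaults or sentinels).

import html

def load_xml_text(branch, default=''):
    # return the text of an XML element, or a default when the element is missing
    if branch is None or branch.text is None:
        return default
    return branch.text

def load_int(value):
    # best-effort integer conversion; fall back to None on failure
    try:
        return int(value)
    except (TypeError, ValueError):
        return None

def load_html_str(value):
    # unescape HTML entities in a text field, passing non-strings through unchanged
    return html.unescape(value) if isinstance(value, str) else value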
Example #3
    def parse_publications(self,
                           preprocess=False,
                           preprocess_dicts=True,
                           pubid2int=False,
                           archive_name='aps-dataset-metadata-2019.zip',
                           show_progress=False):

        archive = zipfile.ZipFile(
            os.path.join(self.path2database, archive_name), 'r')
        metadata_files = [
            fname for fname in archive.namelist()
            if 'aps-dataset-metadata' in fname and '.json' in fname
        ]

        # check that the archive contains the expected metadata directory
        if len(metadata_files) > 0:

            if preprocess:
                if not os.path.exists(
                        os.path.join(self.path2database, 'publication')):
                    os.mkdir(os.path.join(self.path2database, 'publication'))

                if not os.path.exists(
                        os.path.join(self.path2database, 'journal')):
                    os.mkdir(os.path.join(self.path2database, 'journal'))

                if not os.path.exists(
                        os.path.join(self.path2database, 'affiliation')):
                    os.mkdir(os.path.join(self.path2database, 'affiliation'))

                if not os.path.exists(
                        os.path.join(self.path2database,
                                     'publicationauthoraffiliation')):
                    os.mkdir(
                        os.path.join(self.path2database,
                                     'publicationauthoraffiliation'))

                if not os.path.exists(
                        os.path.join(self.path2database, 'pub2field')):
                    os.mkdir(os.path.join(self.path2database, 'pub2field'))

                if not os.path.exists(
                        os.path.join(self.path2database, 'fieldinfo')):
                    os.mkdir(os.path.join(self.path2database, 'fieldinfo'))

            journal_dict = {}
            journal_column_names = [
                'JournalId', 'FullName', 'AbbreviatedName', 'Publisher'
            ]

            pub_column_names = [
                'PublicationId', 'Title', 'Date', 'Year', 'Doi', 'JournalId',
                'Volume', 'Issue', 'PageStart', 'PageEnd', 'DocType',
                'TeamSize'
            ]

            pub_df = []
            pub2year = {}
            pub2doctype = {}
            pub2int = {}
            ipub = 0
            if pubid2int:
                pubintcol = ['PublicationId']
            else:
                pubintcol = []

            iaff = 0
            affil_dict = {}
            paa_df = []

            field_dict = {}
            pub2field_df = []

            for fname in tqdm(metadata_files,
                              desc='aps-metadata',
                              leave=True,
                              disable=not show_progress):
                # load pub json
                pubjson = json.loads(archive.read(fname).decode('utf-8'))
                ipub += 1

                # start parsing publication information
                if pubid2int:
                    pubid = ipub
                    pub2int[pubjson.get('id', '')] = pubid
                else:
                    pubid = pubjson.get('id', '')
                pubinfo = [pubid]
                pubinfo.append(pubjson.get('title', {}).get('value', ''))
                pubinfo.append(pubjson.get('date', ''))
                pubinfo.append(load_int(pubjson.get('date', '').split('-')[0]))
                pub2year[pubid] = pubinfo[-1]
                pubinfo.append(pubjson.get('id', ''))

                # journal of publication
                journalid = pubjson.get('journal', {}).get('id', '')
                pubinfo.append(journalid)
                pubinfo.append(
                    load_int(pubjson.get('volume', {}).get('number', '')))
                pubinfo.append(
                    load_int(pubjson.get('issue', {}).get('number', '')))

                # add pagenumber info
                pubinfo.append(load_int(pubjson.get('pageStart', '')))
                if not pubjson.get('pageEnd', None) is None:
                    pubinfo.append(load_int(pubjson.get('pageEnd', '')))
                elif not (pubjson.get('numPages', None) is None
                          or pubjson.get('pageStart', None) is None):
                    pubinfo.append(pubinfo[-1] +
                                   load_int(pubjson.get('numPages', '')))
                else:
                    pubinfo.append(None)

                # add the doctype
                pubinfo.append(pubjson.get('articleType', ''))
                pub2doctype[pubid] = pubinfo[-1]

                # calculate TeamSize
                pubinfo.append(len(pubjson.get('authors', [])))

                # finish publication info
                pub_df.append(pubinfo)

                # check if we need to save journal information
                if journal_dict.get(journalid, None) is None:
                    journal_dict[journalid] = pubjson.get('journal', {})
                    journal_dict[journalid]['Publisher'] = pubjson.get(
                        'rights', {}).get('copyrightHolders', [{
                            'name': ''
                        }])[0].get('name', '')

                # start parsing affiliation information
                pub_affid_map = {}
                for pubaffdict in pubjson.get('affiliations', []):
                    # check if the affiliation has been used before (only using string match)
                    # ToDo: add disambiguation
                    if affil_dict.get(pubaffdict.get('name', ''),
                                      None) is None:
                        affil_dict[pubaffdict.get('name', '')] = iaff
                        iaff += 1

                    # map the affiliation to the AffiliationId
                    pub_affid_map[pubaffdict.get(
                        'id', '')] = affil_dict[pubaffdict.get('name', '')]

                authorseq = 1
                # now start parsing author information
                for authordict in pubjson.get('authors', []):
                    for affid in authordict.get('affiliationIds', [None]):
                        paa_df.append([
                            pubid,
                            authordict.get('name', ''),
                            pub_affid_map.get(affid, None), authorseq
                        ])

                    authorseq += 1

                # now do the subject classifications
                for subjectdict in pubjson.get('classificationSchemes',
                                               {}).get('subjectAreas', []):
                    pub2field_df.append([pubid, subjectdict.get('id', None)])

                    if field_dict.get(subjectdict.get('id', None),
                                      None) is None:
                        field_dict[subjectdict.get('id',
                                                   None)] = subjectdict.get(
                                                       'label', None)

                # ToDo: parse concepts

            if show_progress:
                print("Parsing Complete\nSaving Publication DataFrames")

            pub_df = pd.DataFrame(pub_df, columns=pub_column_names)
            for intcol in pubintcol + ['Year']:
                pub_df[intcol] = pub_df[intcol].astype(int)

            journal_rename_dict = {
                'name': 'FullName',
                'id': 'JournalId',
                'abbreviatedName': 'AbbreviatedName'
            }
            journal_df = pd.DataFrame(
                journal_dict.values()).rename(columns=journal_rename_dict)

            affiliation_df = pd.DataFrame(
                [[affid, name] for name, affid in affil_dict.items()],
                columns=['AffiliationId', 'Address'])

            paa_df = pd.DataFrame(paa_df,
                                  columns=[
                                      'PublicationId', 'OrigAuthorName',
                                      'AffiliationId', 'AuthorSequence'
                                  ])
            for intcol in pubintcol + ['AuthorSequence']:
                paa_df[intcol] = paa_df[intcol].astype(int)

            pub2field_df = pd.DataFrame(pub2field_df,
                                        columns=['PublicationId', 'FieldId'])
            for intcol in pubintcol:
                pub2field_df[intcol] = pub2field_df[intcol].astype(int)

            field_df = pd.DataFrame(
                [[fieldid, fieldname]
                 for fieldid, fieldname in field_dict.items()],
                columns=['FieldId', 'FullName'])

            if preprocess:
                pub_df.to_hdf(os.path.join(self.path2database, 'publication',
                                           'publication0.hdf'),
                              mode='w',
                              key='publication')

                if pubid2int:
                    with gzip.open(
                            os.path.join(self.path2database,
                                         'pub2int.json.gz'), 'w') as outfile:
                        outfile.write(json.dumps(pub2int).encode('utf8'))

                if preprocess_dicts:
                    with gzip.open(
                            os.path.join(self.path2database,
                                         'pub2year.json.gz'), 'w') as outfile:
                        outfile.write(json.dumps(pub2year).encode('utf8'))

                    with gzip.open(
                            os.path.join(self.path2database,
                                         'pub2doctype.json.gz'),
                            'w') as outfile:
                        outfile.write(json.dumps(pub2doctype).encode('utf8'))

                journal_df.to_hdf(os.path.join(self.path2database, 'journal',
                                               'journal0.hdf'),
                                  mode='w',
                                  key='journal')

                affiliation_df.to_hdf(os.path.join(self.path2database,
                                                   'affiliation',
                                                   'affiliation0.hdf'),
                                      mode='w',
                                      key='affiliation')

                paa_df.to_hdf(os.path.join(
                    self.path2database, 'publicationauthoraffiliation',
                    'publicationauthoraffiliation0.hdf'),
                              mode='w',
                              key='publicationauthoraffiliation')

                pub2field_df.to_hdf(os.path.join(self.path2database,
                                                 'pub2field',
                                                 'pub2field0.hdf'),
                                    mode='w',
                                    key='pub2field')

                field_df.to_hdf(os.path.join(self.path2database, 'fieldinfo',
                                             'fieldinfo0.hdf'),
                                mode='w',
                                key='fieldinfo')

        else:
            raise FileNotFoundError(
                'The archive {0} does not contain a metadata directory: {1}.'.
                format(archive_name, 'aps-dataset-metadata'))
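
A minimal usage sketch (not part of the snippet above), assuming the surrounding class is pyscisci's APS interface; the import path and constructor argument are illustrative. With `preprocess=True` the parsed tables are written under the database path, so they can be read back afterwards.

import os
import pandas as pd
from pyscisci.datasource.APS import APS  # hypothetical import path

myaps = APS(path2database='/data/APS/')

# parse the zipped metadata archive and write the publication, journal,
# affiliation, author and field tables to HDF files
myaps.parse_publications(preprocess=True,
                         preprocess_dicts=True,
                         pubid2int=False,
                         archive_name='aps-dataset-metadata-2019.zip',
                         show_progress=True)

# read one of the saved tables back
pub_df = pd.read_hdf(os.path.join('/data/APS/', 'publication', 'publication0.hdf'))
print(pub_df[['PublicationId', 'Year', 'JournalId', 'TeamSize']].head())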
Example #4
    def parse_publicationauthoraffiliation(self, xml_directory='RawXML', preprocess=True, num_file_lines=10**7, rewrite_existing=False, show_progress=True):
        """
        Parse the PubMed publication-author raw data.

        Parameters
        ----------
        xml_directory: str, default 'RawXML'
            The subdirectory of the database path that contains the raw PubMed xml files.
        preprocess: bool, default True
            Save the processed data in new DataFrames.
        num_file_lines: int, default 10**7
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.
        rewrite_existing: bool, default False
            If True, rewrite existing processed files in the data directory.
        show_progress: bool, default True
            Show progress with processing of the data.

        Returns
        ----------
        DataFrame
            Publication-Author DataFrame.
        """

        # process author files through xml
        if preprocess:
            if not os.path.exists(os.path.join(self.path2database, 'publicationauthoraffiliation')):
                os.mkdir(os.path.join(self.path2database, 'publicationauthoraffiliation'))

            xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

            # read the DTD; alias the path locally so the nested resolver class can use it
            path2database = self.path2database
            class DTDResolver(etree.Resolver):
                def resolve(self, system_url, public_id, context):
                    return self.resolve_filename(os.path.join(path2database, system_url), context)

            parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

            ifile = 0
            for xml_file_name in tqdm(xmlfiles, desc='PubMed author xml files', leave=True, disable=not show_progress):

                # check if the xml file was already parsed
                dest_file_name = os.path.join(self.path2database, self.path2paa_df,'publicationauthoraffiliation{}.hdf'.format(ifile))
                if not rewrite_existing and os.path.isfile(dest_file_name):
                    ifile+=1
                    continue

                paa_df = []

                xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
                all_pubmed_articles = xmltree.findall("/PubmedArticle")

                for article_bucket in all_pubmed_articles:

                    medline = article_bucket.find("MedlineCitation")

                    # scrape the publication information
                    PublicationId = load_int(load_xml_text(medline.find('PMID')))
                    article = medline.find("Article")

                    author_list = article.find('AuthorList')

                    if not author_list is None:
                        for seq, author in enumerate(author_list.findall('Author')):
                            author_record = self._blank_pubmed_author()

                            author_record['PublicationId'] = PublicationId
                            author_record['FirstName'] = load_html_str(load_xml_text(author.find("ForeName")))
                            author_record['LastName'] = load_html_str(load_xml_text(author.find("LastName")))
                            author_record['FullName'] = author_record['FirstName'] + ' ' + author_record['LastName']

                            if author.find("AffiliationInfo/Affiliation") is not None:
                                author_record['Affiliations'] = load_html_str(load_xml_text(author.find("AffiliationInfo/Affiliation")))
                                author_record['Affiliations'] = author_record['Affiliations'].replace("For a full list of the authors' affiliations please see the Acknowledgements section.","")

                            author_record['AuthorSequence'] = seq+1

                            paa_df.append(author_record)
                # save the author table for this xml file
                paa_df = pd.DataFrame(paa_df)
                paa_df['AuthorSequence'] = paa_df['AuthorSequence'].astype(int)
                paa_df.to_hdf(os.path.join(self.path2database, self.path2paa_df, 'publicationauthoraffiliation{}.hdf'.format(ifile)), key='paa', mode='w')
                ifile += 1


        ## load publication author dataframe into a large file
        paa_files_list = glob.glob(os.path.join(self.path2database, self.path2paa_df, 'publicationauthoraffiliation*.hdf'))

        paa_df = pd.DataFrame()

        print("Parsing files...")
        for tmp_paa_df in tqdm(paa_files_list, desc='PubMed author files', leave=True, disable=not show_progress):
            paa_df = paa_df.append(pd.read_hdf(tmp_paa_df), ignore_index = True)

        return paa_df
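
`_blank_pubmed_author()` is defined elsewhere in the class and not shown in these snippets. Judging from the keys assigned in the loop above, it presumably returns one dictionary per author; the sketch below is an assumption about its shape, with default values chosen for illustration only.

    def _blank_pubmed_author(self):
        # one blank record per author; keys match the assignments made while parsing
        return {
            'PublicationId': None,
            'FullName': '',
            'FirstName': '',
            'LastName': '',
            'Affiliations': '',
            'AuthorSequence': None,
        }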
Example #5
    def parse_fields(self, preprocess=True, num_file_lines=10**7, rewrite_existing=False, xml_directory='RawXML', show_progress=True):
        """
        Parse the PubMed field (MeSH term) raw data.

        Parameters
        ----------
        preprocess: bool, default True
            Save the processed data in new DataFrames.
        num_file_lines: int, default 10**7
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.
        rewrite_existing: bool, default False
            If True, rewrite existing processed files in the data directory.
        xml_directory: str, default 'RawXML'
            The subdirectory of the database path that contains the raw PubMed xml files.
        show_progress: bool, default True
            Show progress with processing of the data.

        Returns
        ----------
        DataFrame
            Publication-Term ID DataFrame and Term ID - Term DataFrame
        """

        if preprocess:
            for hier_dir_type in [self.path2pub2field_df, self.path2fieldinfo_df]:
                if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
                    os.mkdir(os.path.join(self.path2database, hier_dir_type))

            xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

            # read the DTD; alias the path locally so the nested resolver class can use it
            path2database = self.path2database
            class DTDResolver(etree.Resolver):
                def resolve(self, system_url, public_id, context):
                    return self.resolve_filename(os.path.join(path2database, system_url), context)
            parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

            # global id to term mapping
            fieldinfo = {}

            ifile = 0
            for xml_file_name in tqdm(xmlfiles, desc='PubMed xml files', leave=True, disable=not show_progress):

                # check if the xml file was already parsed
                dest_file_name = os.path.join(self.path2database, self.path2pub2field_df,'pub2field{}.hdf'.format(ifile))
                if not rewrite_existing and os.path.isfile(dest_file_name):
                    ifile+=1
                    continue

                pub2field_df = []

                xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
                all_pubmed_articles = xmltree.findall("/PubmedArticle")

                for article_bucket in all_pubmed_articles:

                    medline = article_bucket.find("MedlineCitation")

                    # scrape the publication information
                    PublicationId = load_int(load_xml_text(medline.find('PMID')))

                    meshterms = medline.find("MeshHeadingList")

                    if meshterms is not None:
                        for term in meshterms.getchildren():
                            ui = term.find("DescriptorName").attrib.get("UI", "")
                            if len(ui) > 0:
                                pub2field_df.append([PublicationId, ui])
                                fieldinfo[ui] = [load_xml_text(term.find("DescriptorName")), 'mesh']

                    chemicals = medline.find("ChemicalList")
                    if chemicals is not None:
                        for chemical in chemicals.findall("Chemical"):
                            ui = chemical.find("NameOfSubstance").attrib.get("UI", "")
                            if len(ui) > 0:
                                pub2field_df.append([PublicationId, ui])
                                fieldinfo[ui] = [load_xml_text(chemical.find("NameOfSubstance")), 'chem']

                # save the publication-field mapping for this xml file
                pub2field_df = pd.DataFrame(pub2field_df, columns=['PublicationId', 'FieldId'])
                pub2field_df.to_hdf(os.path.join(self.path2database, self.path2pub2field_df, 'pub2field{}.hdf'.format(ifile)), key='pub2field', mode='w')
                ifile += 1


            # save the field info dictionary (skip only if it already exists and we are not rewriting)
            dest_file_name = os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf')
            if rewrite_existing or not os.path.isfile(dest_file_name):
                mesh_id_df_list = list(fieldinfo.values())
                for i, j in enumerate(fieldinfo.keys()):
                    mesh_id_df_list[i].insert(0, j)

                fieldinfo_df = pd.DataFrame(mesh_id_df_list, columns=['FieldId', 'FieldName', 'FieldType'])
                fieldinfo_df.to_hdf(dest_file_name, key='fieldinfo', mode='w')


        # load the dataframes

        # pub2field
        pub2field_files = glob.glob(os.path.join(self.path2database, self.path2pub2field_df, 'pub2field*.hdf'))
        pub2field_df = pd.DataFrame()

        for pub2field_tmp_file in tqdm(pub2field_files, desc='PubMed pub2field files', leave=True, disable=not show_progress):
            pub2field_df = pub2field_df.append(pd.read_hdf(pub2field_tmp_file), ignore_index=True)

        # field info map
        fieldinfo_df = pd.read_hdf(os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf'))

        return pub2field_df, fieldinfo_df
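
A minimal usage sketch (not part of the snippet above), assuming the surrounding class is pyscisci's PubMed interface; the import path and constructor argument are illustrative.

from pyscisci.datasource.PubMed import PubMed  # hypothetical import path

mypubmed = PubMed(path2database='/data/PubMed/')

# parse MeSH descriptors and chemical substances into two tables
pub2field_df, fieldinfo_df = mypubmed.parse_fields(preprocess=True,
                                                   rewrite_existing=False,
                                                   xml_directory='RawXML')

# count how many distinct publications are tagged with each field
field_counts = pub2field_df.groupby('FieldId')['PublicationId'].nunique()
print(field_counts.sort_values(ascending=False).head())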
Example #6
    def parse_publications(self, xml_directory='RawXML', preprocess=True, num_file_lines=10**7, rewrite_existing=False, show_progress=True):
        """
        Parse the PubMed publication raw data.

        Parameters
        ----------
        xml_directory: str, default 'RawXML'
            The subdirectory of the database path that contains the raw PubMed xml files.
        preprocess: bool, default True
            Save the processed data in new DataFrames.
        num_file_lines: int, default 10**7
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.
        rewrite_existing: bool, default False
            If True, rewrite existing processed files in the data directory.
        show_progress: bool, default True
            Show progress with processing of the data.

        Returns
        ----------
        DataFrame
            Publication metadata DataFrame.
        """

        # process publication files through xml
        if preprocess:
            if not os.path.exists(os.path.join(self.path2database, 'publication')):
                os.mkdir(os.path.join(self.path2database, 'publication'))

            xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

            # read the DTD; alias the path locally so the nested resolver class can use it
            path2database = self.path2database
            class DTDResolver(etree.Resolver):
                def resolve(self, system_url, public_id, context):
                    return self.resolve_filename(os.path.join(path2database, system_url), context)

            parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

            ifile = 0
            for xml_file_name in tqdm(xmlfiles, desc='PubMed publication xml files', leave=True, disable=not show_progress):

                # check if the xml file was already parsed
                dest_file_name = os.path.join(self.path2database, self.path2pub_df, 'publication{}.hdf'.format(ifile))
                if not rewrite_existing and os.path.isfile(dest_file_name):
                    ifile+=1
                    continue

                publication_df = []

                xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
                all_pubmed_articles = xmltree.findall("/PubmedArticle")

                for article_bucket in all_pubmed_articles:

                    medline = article_bucket.find("MedlineCitation")

                    # scrape the publication information
                    PublicationId = load_int(load_xml_text(medline.find('PMID')))

                    pub_record = self._blank_pubmed_publication(PublicationId)

                    article = medline.find("Article")
                    pub_record['Title'] = load_html_str(load_xml_text(article.find('ArticleTitle')))
                    if article.find('Pagination') == None:
                        pub_record['Pages'] = None
                    else:
                        pub_record['Pages'] = load_html_str(load_xml_text(article.find('Pagination').find("MedlinePgn")))

                    journal = article.find("Journal")
                    pub_record['JournalId'] = load_html_str(load_xml_text(journal.find("Title")))
                    pub_record['Volume'] = load_int(load_xml_text(journal.find("JournalIssue").find("Volume")))
                    pub_record['Issue'] = load_int(load_xml_text(journal.find("JournalIssue").find("Issue")))
                    pub_record['ISSN'] = load_html_str(load_xml_text(journal.find("ISSN")))

                    history = article_bucket.find("PubmedData/History")
                    if not history is None:
                        pdate = history.find('PubMedPubDate')
                        if not pdate is None:
                            pub_record['Year'] = load_int(load_xml_text(pdate.find("Year")))
                            pub_record['Month'] = load_int(load_xml_text(pdate.find("Month")))
                            pub_record['Day'] = load_int(load_xml_text(pdate.find("Day")))

                    article_ids = article_bucket.find("PubmedData/ArticleIdList")
                    if article_ids is not None:
                        doi = article_ids.find('ArticleId[@IdType="doi"]')
                        pub_record['Doi'] = load_xml_text(doi)


                    author_list = article.find('AuthorList')

                    if not author_list is None:
                        pub_record['TeamSize'] = len(author_list.findall('Author'))

                    publication_df.append(pub_record)

                # save publication dataframe
                publication_df = pd.DataFrame(publication_df)
                publication_df['PublicationId'] = publication_df['PublicationId'].astype(int)
                publication_df['Year'] = publication_df['Year'].astype(int)
                publication_df['Month'] = publication_df['Month'].astype(int)
                publication_df['Day'] = publication_df['Day'].astype(int)
                publication_df['Volume'] = pd.to_numeric(publication_df['Volume'])
                publication_df['TeamSize'] = publication_df['TeamSize'].astype(int)
                publication_df.to_hdf(os.path.join(self.path2database, self.path2pub_df, 'publication{}.hdf'.format(ifile)), key='pub', mode='w')
                ifile += 1

        ## load publication dataframe into a large file
        pub_files_list = glob.glob(os.path.join(self.path2database, self.path2pub_df, 'publication*.hdf'))

        pub_df = pd.DataFrame()

        print("Parsing files...")
        for tmp_pub_df in tqdm(pub_files_list, desc='PubMed publication files', leave=True, disable=not show_progress):
            pub_df = pub_df.append(pd.read_hdf(tmp_pub_df), ignore_index = True)

        return pub_df
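
`_blank_pubmed_publication(PublicationId)` is defined elsewhere in the class and not shown here. Based on the fields assigned above and the integer casts applied before saving, it presumably returns a dictionary keyed by the publication fields with numeric defaults; the sketch below is an assumption for illustration, not the package's actual implementation.

    def _blank_pubmed_publication(self, PublicationId):
        # keys match the fields filled in while parsing; numeric fields default to 0
        # so the later .astype(int) casts do not fail on missing values
        return {
            'PublicationId': PublicationId,
            'Title': '', 'Year': 0, 'Month': 0, 'Day': 0,
            'JournalId': '', 'Volume': 0, 'Issue': 0, 'Pages': '', 'ISSN': '',
            'Doi': '', 'TeamSize': 0,
        }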
Example #7
    def parse_references(self, xml_directory='RawXML', preprocess=True, num_file_lines=10**7, rewrite_existing=False, show_progress=True):
        """
        Parse the PubMed References raw data.
        
        Parameters
        ----------
        preprocess: bool, default True
            Save the processed data in new DataFrames.
        num_file_lines: int, default 10**7
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.
        rewrite_existing: bool, default False
            If True, rewrite existing processed files in the data directory.
        show_progress: bool, default True
            Show progress with processing of the data.
        
        Returns
        ----------
        DataFrame
            Citations DataFrame.
        """

        # process author files through xml
        if preprocess:
            if not os.path.exists(os.path.join(self.path2database, 'pub2ref')):
                os.mkdir(os.path.join(self.path2database, 'pub2ref'))

            xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

            # read the DTD; alias the path locally so the nested resolver class can use it
            path2database = self.path2database
            class DTDResolver(etree.Resolver):
                def resolve(self, system_url, public_id, context):
                    return self.resolve_filename(os.path.join(path2database, system_url), context)

            parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

            ifile = 0
            for xml_file_name in tqdm(xmlfiles, desc='PubMed reference xml files', leave=True, disable=not show_progress):

                xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)

                # check if the xml file was already parsed
                dest_file_name = os.path.join(self.path2database, self.path2pub2ref_df,'pub2ref{}.hdf'.format(ifile))
                if not rewrite_existing and os.path.isfile(dest_file_name):
                    ifile+=1
                    continue

                pub2ref_df = []

                all_pubmed_articles = xmltree.findall("/PubmedArticle")

                for article_bucket in all_pubmed_articles:

                    medline = article_bucket.find("MedlineCitation")

                    # scrape the publication information
                    PublicationId = load_int(load_xml_text(medline.find('PMID')))


                    references = article_bucket.find("PubmedData/ReferenceList")
                    if not references is None:
                        for ref in references.findall("Reference"):
                            citation = load_xml_text(ref.find("Citation"))
                            if not ref.find('ArticleIdList') is None:
                                pmid = load_int(load_xml_text(ref.find('ArticleIdList').find('ArticleId[@IdType="pubmed"]')))
                            else:
                                pmid = ""
                            pub2ref_df.append([PublicationId, pmid, citation])

                # save file
                pub2ref_df = pd.DataFrame(pub2ref_df, columns=['CitingPublicationId', 'CitedPublicationId', 'Citation'])
                pub2ref_df.to_hdf(os.path.join(self.path2database, self.path2pub2ref_df, 'pub2ref{}.hdf'.format(ifile)), key='pub2ref', mode='w')
                ifile += 1


        # load the citations into a large dataframe

        pub2ref_files = glob.glob(os.path.join(self.path2database, self.path2pub2ref_df, 'pub2ref*.hdf'))

        pub2ref_df = pd.DataFrame()

        print("parsing citation data...")
        for pub2ref_tmp in tqdm(pub2ref_files,desc='PubMed citation xml files', leave=True, disable=not show_progress):
            pub2ref_df = pub2ref_df.append(pd.read_hdf(pub2ref_tmp), ignore_index=True)

        return pub2ref_df
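
A minimal usage sketch (not part of the snippet above), assuming the surrounding class is pyscisci's PubMed interface; the import path and constructor argument are illustrative.

from pyscisci.datasource.PubMed import PubMed  # hypothetical import path

mypubmed = PubMed(path2database='/data/PubMed/')
pub2ref_df = mypubmed.parse_references(preprocess=True, rewrite_existing=False)

# number of distinct citing publications for each cited publication
citation_counts = pub2ref_df.groupby('CitedPublicationId')['CitingPublicationId'].nunique()
print(citation_counts.sort_values(ascending=False).head())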
Example #8
File: MAG.py Project: jisungyoon/pyscisci
    def parse_fields(self,
                     preprocess=False,
                     num_file_lines=10**7,
                     show_progress=True):
        """
        Parse the MAG Paper Field raw data.

        Parameters
        ----------
        :param preprocess: bool, default False
            Save the processed data in new DataFrames.

        :param num_file_lines: int, default 10**7
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

        :param show_progress: bool, default True
            Show progress with processing of the data.

        Returns
        ----------
        DataFrame
            Pub2Field DataFrame.
        """
        field2get = [0, 5, 6]
        fieldnames = [
            'FieldId', 'FieldLevel', 'NumberPublications', 'FieldName'
        ]

        if preprocess:
            if not os.path.exists(os.path.join(self.path2database,
                                               'fieldinfo')):
                os.mkdir(os.path.join(self.path2database, 'fieldinfo'))

        fieldinfo = []
        with open(
                os.path.join(self.path2database, 'advanced',
                             'FieldsOfStudy.txt'), 'r') as infile:

            for line in infile:
                sline = line.split('\t')
                fielddata = [load_int(sline[ip])
                             for ip in field2get] + [sline[2]]
                fieldinfo.append(fielddata)

        field_df = pd.DataFrame(fieldinfo, columns=fieldnames)
        if preprocess:
            field_df.to_hdf(os.path.join(self.path2database, 'fieldinfo',
                                         'fieldinfo0.hdf'),
                            key='field',
                            mode='w')

        # and now do pub2field
        paperfields = [0, 1]
        paperfieldnames = ['PublicationId', 'FieldId']

        if preprocess:
            if not os.path.exists(os.path.join(self.path2database,
                                               'pub2field')):
                os.mkdir(os.path.join(self.path2database, 'pub2field'))

        file_name = os.path.join(self.path2database, 'advanced',
                                 'PaperFieldsOfStudy.txt')

        ipaper = 0
        ifile = 0
        fieldinfo = []
        with tqdm(total=os.path.getsize(file_name),
                  unit='iB',
                  unit_scale=True,
                  desc='PaperFieldsOfStudy',
                  leave=True,
                  disable=not show_progress) as pbar:
            with open(file_name, 'r') as infile:

                for line in infile:
                    sline = line.split('\t')
                    fielddata = [int(sline[ip]) for ip in paperfields]
                    fieldinfo.append(fielddata)
                    ipaper += 1

                    # update progress bar
                    pbar.update(sys.getsizeof(line))

                    if preprocess and ipaper % num_file_lines == 0:
                        pd.DataFrame(
                            fieldinfo,
                            columns=paperfieldnames).to_hdf(os.path.join(
                                self.path2database, 'pub2field',
                                'pub2field' + str(ifile) + '.hdf'),
                                                            key='pub2field',
                                                            mode='w')

                        ifile += 1
                        fieldinfo = []

        pub2field_df = pd.DataFrame(fieldinfo, columns=paperfieldnames)
        if preprocess:
            pub2field_df.to_hdf(
                os.path.join(self.path2database, 'pub2field',
                             'pub2field' + str(ifile) + '.hdf'),
                key='pub2field',
                mode='w')
        return pub2field_df
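
A minimal usage sketch (not part of the snippet above), assuming the surrounding class is pyscisci's MAG interface; the import path and constructor argument are illustrative.

import os
import pandas as pd
from pyscisci.datasource.MAG import MAG  # hypothetical import path

mymag = MAG(path2database='/data/MAG/')

# build the publication-to-field links (and, with preprocess=True, save them in chunks)
pub2field_df = mymag.parse_fields(preprocess=True, num_file_lines=10**7)

# the field hierarchy itself was written to fieldinfo0.hdf; e.g. keep only top-level fields
field_df = pd.read_hdf(os.path.join('/data/MAG/', 'fieldinfo', 'fieldinfo0.hdf'))
top_level_fields = field_df[field_df['FieldLevel'] == 0]
print(top_level_fields[['FieldId', 'FieldName']].head())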
Example #9
File: MAG.py Project: jisungyoon/pyscisci
    def parse_affiliations(self, preprocess=True, show_progress=True):
        """
        Parse the MAG Affiliation raw data.

        Parameters
        ----------
        :param preprocess: bool, default True
            Save the processed data in new DataFrames.

        :param show_progress: bool, default True
            Show progress with processing of the data.


        Returns
        ----------
        DataFrame
            Affiliation DataFrame.
        """

        affil_int_columns = [0, 7, 8]
        affil_str_columns = [3, 4, 5, 6]
        affil_float_columns = [9, 10]

        affil_column_names = [
            'AffiliationId', 'NumberPublications', 'NumberCitations',
            'FullName', 'GridId', 'OfficialPage', 'WikiPage', 'Latitude',
            'Longitude'
        ]

        file_name = os.path.join(self.path2database, 'mag', 'Affiliations.txt')

        affiliation_info = []
        with tqdm(total=os.path.getsize(file_name),
                  unit='iB',
                  unit_scale=True,
                  desc='Affiliations',
                  leave=True,
                  disable=not show_progress) as pbar:
            with open(file_name, 'r') as infile:
                for line in infile:
                    sline = line.replace('\n', '').split('\t')
                    affline = [load_int(sline[i]) for i in affil_int_columns]
                    affline += [sline[i] for i in affil_str_columns]
                    affline += [
                        load_float(sline[i]) for i in affil_float_columns
                    ]
                    affiliation_info.append(affline)

                    # update progress bar
                    pbar.update(sys.getsizeof(line))

        aff_df = pd.DataFrame(affiliation_info, columns=affil_column_names)

        if preprocess:
            if not os.path.exists(
                    os.path.join(self.path2database, 'affiliation')):
                os.mkdir(os.path.join(self.path2database, 'affiliation'))
            aff_df.to_hdf(os.path.join(self.path2database, 'affiliation',
                                       'affiliation0.hdf'),
                          key='affiliation',
                          mode='w')

        return aff_df
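
A minimal usage sketch (not part of the snippet above), assuming the surrounding class is pyscisci's MAG interface; the import path and constructor argument are illustrative.

from pyscisci.datasource.MAG import MAG  # hypothetical import path

mymag = MAG(path2database='/data/MAG/')
aff_df = mymag.parse_affiliations(preprocess=True)

# affiliations that come with geographic coordinates
located = aff_df.dropna(subset=['Latitude', 'Longitude'])
print(located[['AffiliationId', 'FullName', 'Latitude', 'Longitude']].head())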
Example #10
File: MAG.py Project: jisungyoon/pyscisci
    def parse_publicationauthoraffiliation(self,
                                           preprocess=False,
                                           num_file_lines=10**7,
                                           show_progress=True):
        """
        Parse the MAG PublicationAuthorAffiliation raw data.

        Parameters
        ----------
        :param preprocess: bool, default False
            Save the processed data in new DataFrames.

        :param num_file_lines: int, default 10**7
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

        :param show_progress: bool, default True
            Show progress with processing of the data.

        Returns
        ----------
        DataFrame
            PublicationAuthorAffiliation DataFrame.
        """
        pubauthaff_int_columns = [0, 1, 2, 3]
        pubauthaff_str_columns = [4, 5]
        pub_column_names = [
            'PublicationId', 'AuthorId', 'AffiliationId', 'AuthorSequence',
            'OrigAuthorName', 'OrigAffiliationName'
        ]

        if preprocess:
            if not os.path.exists(
                    os.path.join(self.path2database,
                                 'publicationauthoraffiliation')):
                os.mkdir(
                    os.path.join(self.path2database,
                                 'publicationauthoraffiliation'))

        file_name = os.path.join(self.path2database, 'mag',
                                 'PaperAuthorAffiliations.txt')

        iref = 0
        ifile = 0
        pubauthaff_info = []
        with tqdm(total=os.path.getsize(file_name),
                  unit='iB',
                  unit_scale=True,
                  desc='PaperAuthorAffiliations',
                  leave=True,
                  disable=not show_progress) as pbar:
            with open(file_name, 'r') as infile:
                for line in infile:
                    sline = line.replace('\n', '').split('\t')
                    pubauthaff_info.append(
                        [load_int(sline[ip])
                         for ip in pubauthaff_int_columns] + [
                             sline[ip] if len(sline) > ip else ''
                             for ip in pubauthaff_str_columns
                         ])
                    iref += 1

                    # update progress bar
                    pbar.update(sys.getsizeof(line))

                    if preprocess and iref % num_file_lines == 0:
                        pd.DataFrame(
                            pubauthaff_info, columns=pub_column_names).to_hdf(
                                os.path.join(
                                    self.path2database,
                                    'publicationauthoraffiliation',
                                    'publicationauthoraffiliation{}.hdf'.
                                    format(ifile)),
                                key='publicationauthoraffiliation',
                                mode='w')

                        ifile += 1
                        pubauthaff_info = []

        paa_df = pd.DataFrame(pubauthaff_info, columns=pub_column_names)
        if preprocess:
            paa_df.to_hdf(os.path.join(
                self.path2database, 'publicationauthoraffiliation',
                'publicationauthoraffiliation{}.hdf'.format(ifile)),
                          key='publicationauthoraffiliation',
                          mode='w')
        return paa_df
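
A minimal usage sketch (not part of the snippet above), assuming the surrounding class is pyscisci's MAG interface; the import path and constructor argument are illustrative.

from pyscisci.datasource.MAG import MAG  # hypothetical import path

mymag = MAG(path2database='/data/MAG/')
paa_df = mymag.parse_publicationauthoraffiliation(preprocess=True)

# recover team sizes from the author sequence numbers
team_size = paa_df.groupby('PublicationId')['AuthorSequence'].max()
print(team_size.describe())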
Example #11
File: MAG.py Project: jisungyoon/pyscisci
    def parse_references(self,
                         preprocess=False,
                         num_file_lines=10**7,
                         show_progress=True):
        """
        Parse the MAG References raw data.

        Parameters
        ----------
        :param preprocess: bool, default False
            Save the processed data in new DataFrames.

        :param num_file_lines: int, default 10**7
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

        :param show_progress: bool, default True
            Show progress with processing of the data.

        Returns
        ----------
        DataFrame
            Pub2Ref DataFrame.
        """
        if preprocess:
            if not os.path.exists(os.path.join(self.path2database, 'pub2ref')):
                os.mkdir(os.path.join(self.path2database, 'pub2ref'))

        file_name = os.path.join(self.path2database, 'mag',
                                 'PaperReferences.txt')

        iref = 0
        ifile = 0
        pub2ref_info = []
        with tqdm(total=os.path.getsize(file_name),
                  unit='iB',
                  unit_scale=True,
                  unit_divisor=1024,
                  desc='References',
                  leave=True,
                  disable=not show_progress) as pbar:
            with open(file_name, 'r') as infile:
                for line in infile:
                    # split the line and keep only the relevant columns
                    sline = line.replace('\n', '').split('\t')
                    pub2ref_info.append(
                        [load_int(sline[ip]) for ip in range(2)])
                    iref += 1

                    # update progress bar
                    pbar.update(sys.getsizeof(line))

                    if preprocess and iref % num_file_lines == 0:
                        pd.DataFrame(pub2ref_info,
                                     columns=[
                                         'CitingPublicationId',
                                         'CitedPublicationId'
                                     ]).to_hdf(os.path.join(
                                         self.path2database, 'pub2ref',
                                         'pub2ref{}.hdf'.format(ifile)),
                                               key='pub2ref',
                                               mode='w')

                        ifile += 1
                        pub2ref_info = []

        pub2ref_df = pd.DataFrame(
            pub2ref_info,
            columns=['CitingPublicationId', 'CitedPublicationId'])

        if preprocess:
            pub2ref_df.to_hdf(os.path.join(self.path2database, 'pub2ref',
                                           'pub2ref{}.hdf'.format(ifile)),
                              key='pub2ref',
                              mode='w')

        return pub2ref_df
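Each of these parsers coerces raw text fields with a small load_int helper. The real helper ships with pyscisci; the stand-in below is only a sketch of the behaviour assumed here, namely that empty or malformed fields become None instead of raising.

def load_int(value):
    # sketch only; the actual pyscisci implementation may differ
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


print(load_int('42'), load_int(''), load_int(None))  # 42 None None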
Example #12
0
File: MAG.py Project: jisungyoon/pyscisci
    def parse_publications(self,
                           preprocess=True,
                           num_file_lines=5 * 10**6,
                           preprocess_dicts=True,
                           show_progress=True):
        """
        Parse the MAG Publication and Journal raw data.

        Parameters
        ----------
        :param preprocess: bool, default True
            Save the processed data in new DataFrames.

        :param num_file_lines: int, default 5*10**6
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

        :param preprocess_dicts: bool, default True
            Save the processed Year and DocType data as dictionaries.

        :param show_progress: bool, default True
            Show progress with processing of the data.

        Returns
        ----------
        DataFrame
            Publication DataFrame.
        """

        # first do the journal information
        journal_str_col = [2, 4, 5, 6]
        journal_column_names = [
            'JournalId', 'FullName', 'Issn', 'Publisher', 'Webpage'
        ]

        if preprocess:
            if not os.path.exists(os.path.join(self.path2database, 'journal')):
                os.mkdir(os.path.join(self.path2database, 'journal'))

        file_name = os.path.join(self.path2database, 'mag', 'Journals.txt')

        journal_info = []
        with tqdm(total=os.path.getsize(file_name),
                  unit='iB',
                  unit_scale=True,
                  desc='Journals',
                  leave=True,
                  disable=not show_progress) as pbar:
            with open(file_name, 'r') as infile:
                for line in infile:
                    # split the line and keep only the relevant columns
                    sline = line.replace('\n', '').split('\t')
                    jline = [load_int(sline[0])
                             ] + [sline[i] for i in journal_str_col]
                    journal_info.append(jline)

                    # update progress bar
                    pbar.update(sys.getsizeof(line))

        journal_df = pd.DataFrame(journal_info, columns=journal_column_names)
        if preprocess:
            journal_df.to_hdf(os.path.join(self.path2database, 'journal',
                                           'journal.hdf'),
                              key='journal',
                              mode='w')

        # now let's do the publication information

        doctype = {
            'Journal': 'j',
            'Book': 'b',
            '': '',
            'BookChapter': 'bc',
            'Conference': 'c',
            'Dataset': 'd',
            'Patent': 'p',
            'Repository': 'r'
        }

        pub_int_columns = [0, 7, 10, 21]
        pub_str_columns = [2, 4, 8, 13, 14]
        pub_column_names = [
            'PublicationId', 'Year', 'JournalId', 'FamilyId', 'Doi', 'Title',
            'Date', 'Volume', 'Issue', 'DocType'
        ]

        if preprocess:
            if not os.path.exists(
                    os.path.join(self.path2database, 'publication')):
                os.mkdir(os.path.join(self.path2database, 'publication'))

        file_name = os.path.join(self.path2database, 'mag', 'Papers.txt')

        ipub = 0
        ifile = 0
        pubinfo = []

        pub2year = {}
        pub2doctype = {}
        with tqdm(total=os.path.getsize(file_name),
                  unit='iB',
                  unit_scale=True,
                  desc='Publications',
                  leave=True,
                  disable=not show_progress) as pbar:
            with open(file_name, 'r') as infile:
                for line in infile:
                    # split the line and keep only the relevant columns
                    sline = line.replace('\n', '').split('\t')
                    pline = [load_int(sline[ip]) for ip in pub_int_columns] + [
                        sline[ip] for ip in pub_str_columns
                    ] + [doctype[sline[3]]]
                    pub2year[pline[0]] = pline[1]
                    if doctype[sline[3]] != '':
                        pub2doctype[pline[0]] = doctype[sline[3]]

                    pubinfo.append(pline)
                    ipub += 1

                    # update progress bar
                    pbar.update(sys.getsizeof(line))

                    if preprocess and ipub % num_file_lines == 0:
                        pd.DataFrame(pubinfo, columns=pub_column_names).to_hdf(
                            os.path.join(self.path2database, 'publication',
                                         'publication{}.hdf'.format(ifile)),
                            key='publication',
                            mode='w')

                        ifile += 1
                        pubinfo = []

            pub_df = pd.DataFrame(pubinfo, columns=pub_column_names)
            if preprocess:
                pub_df.to_hdf(os.path.join(self.path2database, 'publication',
                                           'publication{}.hdf'.format(ifile)),
                              key='publication',
                              mode='w')

                if preprocess_dicts:
                    with gzip.open(
                            os.path.join(self.path2database,
                                         'pub2year.json.gz'), 'w') as outfile:
                        outfile.write(json.dumps(pub2year).encode('utf8'))

                    with gzip.open(
                            os.path.join(self.path2database,
                                         'pub2doctype.json.gz'),
                            'w') as outfile:
                        outfile.write(json.dumps(pub2doctype).encode('utf8'))

        return pub_df
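A matching read-back sketch for the pub2year and pub2doctype dictionaries written above, assuming only the gzip + JSON encoding used by the parser; the helper name load_json_gz is hypothetical.

import gzip
import json
import os


def load_json_gz(path2database, file_name):
    # reverse of the json.dumps(...).encode('utf8') write used above;
    # note that JSON serialization turns the integer PublicationIds into string keys
    with gzip.open(os.path.join(path2database, file_name), 'r') as infile:
        return json.loads(infile.read().decode('utf8'))


# pub2year = load_json_gz('/path/to/MAG', 'pub2year.json.gz')
# pub2doctype = load_json_gz('/path/to/MAG', 'pub2doctype.json.gz')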
Example #13
0
File: MAG.py Project: jisungyoon/pyscisci
    def parse_authors(self,
                      preprocess=False,
                      process_name=True,
                      num_file_lines=5 * 10**6,
                      show_progress=True):
        """
        Parse the MAG Author raw data.

        Parameters
        ----------
        :param preprocess: bool, default False
            Save the processed data in new DataFrames.

        :param process_name: bool, default True
            If True, then when processing the raw file, the package `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
            will be used to split author FullNames.

        :param num_file_lines: int, default 5*10**6
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

        :param show_progress: bool, default True
            Show progress with processing of the data.

        Returns
        ----------
        DataFrame
            Author DataFrame.
        """

        author_int_columns = [0, 4, 5, 6]

        author_column_names = [
            'AuthorId', 'LastKnownAffiliationId', 'NumberPublications',
            'NumberCitations', 'FullName'
        ]
        if process_name:
            author_column_names += ['LastName', 'FirstName', 'MiddleName']

        if preprocess:
            if not os.path.exists(os.path.join(self.path2database, 'author')):
                os.mkdir(os.path.join(self.path2database, 'author'))

        file_name = os.path.join(self.path2database, 'mag', 'Authors.txt')

        iauthor = 0
        ifile = 0
        authorinfo = []
        with tqdm(total=os.path.getsize(file_name),
                  unit='iB',
                  unit_scale=True,
                  desc='Authors',
                  disable=not show_progress,
                  leave=True) as pbar:
            with open(file_name, 'r') as infile:

                for line in infile:

                    # split the line and keep only the relevant columns
                    sline = line.split('\t')
                    adata = [load_int(sline[ip])
                             for ip in author_int_columns] + [sline[2]]

                    # process the first, middle, and last names for the author
                    if process_name:
                        hname = HumanName(
                            unicodedata.normalize('NFD', sline[2]))
                        adata += [hname.last, hname.first, hname.middle]

                    authorinfo.append(adata)
                    iauthor += 1

                    # update progress bar
                    pbar.update(sys.getsizeof(line))

                    # time to save
                    if preprocess and iauthor % num_file_lines == 0:
                        pd.DataFrame(authorinfo,
                                     columns=author_column_names).to_hdf(
                                         os.path.join(
                                             self.path2database, 'author',
                                             'author{}.hdf'.format(ifile)),
                                         key='author',
                                         mode='w')

                        ifile += 1
                        authorinfo = []

                author_df = pd.DataFrame(authorinfo,
                                         columns=author_column_names)
                if preprocess:
                    author_df.to_hdf(
                        os.path.join(self.path2database, 'author',
                                     'author{}.hdf'.format(ifile)),
                        key='author',
                        mode='w')

        return author_df
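The process_name option above splits author FullNames with nameparser's HumanName after NFD normalization. A tiny standalone sketch of that step, using a made-up name:

import unicodedata

from nameparser import HumanName

full_name = 'Jorge Enrique Hirsch'
hname = HumanName(unicodedata.normalize('NFD', full_name))
print(hname.last, '|', hname.first, '|', hname.middle)  # expected: Hirsch | Jorge | Enrique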
Example #14
0
    def preprocess(
            self,
            xml_directory='RawXML',
            name_space='http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord',
            process_name=True,
            num_file_lines=10**6,
            show_progress=True):
        """
        Bulk preprocess of the Web of Science raw data.

        Parameters
        ----------
        process_name: bool, default True
            If True, then when processing the raw file, the package `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
            will be used to split author FullNames.

        xml_directory: str, default 'RawXML'
            The directory within path2database that contains the raw Web of Science XML files.

        name_space: str, default 'http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord'
            The XML namespace used by the Web of Science records.

        num_file_lines: int, default 10**6
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

        show_progress: bool, default True
            Show progress with processing of the data.

        """

        pub_column_names = [
            'PublicationId', 'Year', 'JournalId', 'Doi', 'ISSN', 'Title',
            'Date', 'Volume', 'Issue', 'Pages', 'DocType', 'TeamSize'
        ]
        author_column_names = ['AuthorId', 'FullName', 'FirstName', 'LastName']

        if show_progress:
            print("Starting to preprocess the WOS database.")

        for hier_dir_type in [
                'publication', 'author', 'publicationauthoraffiliation',
                'pub2field', 'pub2ref', 'affiliation'
        ]:

            if not os.path.exists(
                    os.path.join(self.path2database, hier_dir_type)):
                os.mkdir(os.path.join(self.path2database, hier_dir_type))

        pub2year = {}
        pub2doctype = {}

        found_aids = set([])

        found_affiliations = {}

        ns = {"ns": name_space}
        xmlfiles = sorted([
            fname for fname in os.listdir(
                os.path.join(self.path2database, xml_directory))
            if '.xml' in fname
        ])

        ifile = 0
        for xml_file_name in tqdm(xmlfiles,
                                  desc='WOS xml files',
                                  leave=True,
                                  disable=not show_progress):

            publication_df = []
            author_df = []
            paa_df = []
            pub2field_df = []
            pub2ref_df = []
            affiliation_df = []
            field_df = []

            name, extension = os.path.splitext(xml_file_name)

            if extension == '.gz':
                with gzip.open(
                        os.path.join(self.path2database, xml_directory,
                                     xml_file_name), 'r') as infile:
                    xml_file = infile.read()
                bytesxml = BytesIO(xml_file)

            elif extension == '.xml':
                with open(
                        os.path.join(self.path2database, xml_directory,
                                     xml_file_name), 'rb') as infile:
                    xml_file = infile.read()
                bytesxml = BytesIO(xml_file)

            # extract the desired fields from the XML tree  #

            xmltree = etree.iterparse(bytesxml,
                                      events=('end', ),
                                      tag="{{{0}}}REC".format(name_space))

            if show_progress:
                print("{} Xml tree parsed, iterating through elements.".format(
                    xml_file_name))

            last_position = 0

            for event, elem in xmltree:

                # scrape the publication information
                PublicationId = load_html_str(
                    elem.xpath('./ns:UID', namespaces=ns)[0].text)

                pub_record = self._blank_wos_publication(PublicationId)

                pub_record['Title'] = load_html_str(
                    load_xml_text(
                        elem.xpath(
                            './ns:static_data/ns:summary/ns:titles/ns:title[@type="item"]',
                            namespaces=ns)))
                pub_record['JournalId'] = load_html_str(
                    load_xml_text(
                        elem.xpath(
                            './ns:static_data/ns:summary/ns:titles/ns:title[@type="source"]',
                            namespaces=ns)))

                pub_info = elem.xpath(
                    './ns:static_data/ns:summary/ns:pub_info',
                    namespaces=ns)[0]
                pub_record['Year'] = load_int(pub_info.get('pubyear', ''))
                pub_record['Date'] = load_html_str(pub_info.get(
                    'sortdate', ''))
                pub_record['Volume'] = load_int(pub_info.get('vol', ''))
                pub_record['Issue'] = load_int(pub_info.get('issue', ''))

                pub2year[PublicationId] = pub_record['Year']

                pub_record['Pages'] = load_html_str(
                    load_xml_text(elem.xpath(
                        './ns:static_data/ns:summary/ns:pub_info/ns:page',
                        namespaces=ns),
                                  default=''))

                for ident in ['ISSN', 'Doi']:
                    identobject = elem.xpath(
                        './ns:dynamic_data/ns:cluster_related/ns:identifiers/ns:identifier[@type="{}"]'
                        .format(ident.lower()),
                        namespaces=ns)
                    if len(identobject) > 0:
                        pub_record[ident] = load_html_str(identobject[0].get(
                            'value', ''))

                #load_html_str(load_xml_text(elem.xpath('./ns:dynamic_data/ns:cluster_related/ns:identifiers/ns:identifier[@type="doi"]', namespaces=ns)))

                pub_record['DocType'] = load_html_str(
                    load_xml_text(
                        elem.xpath(
                            './ns:static_data/ns:summary/ns:doctypes/ns:doctype',
                            namespaces=ns)))

                pub2doctype[PublicationId] = pub_record['DocType']

                # now scrape the authors
                pub_authors = {}
                author_objects = elem.xpath(
                    './ns:static_data/ns:summary/ns:names/ns:name[@role="author"]',
                    namespaces=ns)
                pub_record['TeamSize'] = len(author_objects)

                for author_obj in author_objects:
                    author_record = self._blank_wos_author(None)
                    author_record['AuthorId'] = author_obj.get('dais_id', None)

                    author_record['FullName'] = load_html_str(
                        load_xml_text(
                            author_obj.xpath('./ns:full_name', namespaces=ns)))
                    author_record['FirstName'] = load_html_str(
                        load_xml_text(
                            author_obj.xpath('./ns:first_name',
                                             namespaces=ns)))
                    author_record['LastName'] = load_html_str(
                        load_xml_text(
                            author_obj.xpath('./ns:last_name', namespaces=ns)))

                    author_record['Affiliations'] = author_obj.get(
                        'addr_no', '')
                    author_record['Affiliations'] = [
                        int(single_addr_no) for single_addr_no in
                        author_record['Affiliations'].split(' ')
                        if len(single_addr_no) > 0
                    ]

                    author_record['AuthorOrder'] = int(
                        author_obj.get('seq_no', None))

                    pub_authors[author_record['AuthorOrder']] = author_record

                #contributor_objects = elem.xpath('./ns:static_data/ns:contributors/ns:contributor/ns:name[@role="researcher_id"]', namespaces=ns)

                address_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:addresses/ns:address_name/ns:address_spec',
                    namespaces=ns)
                for addr_obj in address_objects:
                    addr_record = self._blank_wos_affiliation()

                    organization_objects = addr_obj.xpath(
                        './ns:organizations/ns:organization[@pref="Y"]',
                        namespaces=ns)
                    if len(organization_objects) == 0:
                        organization_objects = addr_obj.xpath(
                            './ns:organizations/ns:organization',
                            namespaces=ns)

                    if len(organization_objects) == 0:
                        orgtext = ''
                    else:
                        orgtext = organization_objects[0].text

                    address_no = int(addr_obj.get('addr_no'))

                    affiliation_df.append([PublicationId, address_no, orgtext])

                    #if found_affiliations

                    #article['addresses'][address_no] = address_info

                field_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:headings/ns:heading',
                    namespaces=ns)
                field_df.extend([[PublicationId, field_obj.text, 'heading']
                                 for field_obj in field_objects
                                 if field_obj is not None])

                field_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subheadings/ns:subheading',
                    namespaces=ns)
                field_df.extend([[PublicationId, field_obj.text, 'subheading']
                                 for field_obj in field_objects
                                 if field_obj is not None])

                field_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="traditional"]',
                    namespaces=ns)
                field_df.extend([[
                    PublicationId, field_obj.text, 'ASCA traditional subject'
                ] for field_obj in field_objects if field_obj is not None])

                field_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="extended"]',
                    namespaces=ns)
                field_df.extend(
                    [[PublicationId, field_obj.text, 'ASCA extended subject']
                     for field_obj in field_objects if field_obj is not None])

                field_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:keywords/ns:keyword',
                    namespaces=ns)
                field_df.extend([[PublicationId, field_obj.text, 'keyword']
                                 for field_obj in field_objects
                                 if field_obj is not None])

                field_objects = elem.xpath(
                    './ns:static_data/ns:item/ns:keywords_plus/ns:keyword',
                    namespaces=ns)
                field_df.extend(
                    [[PublicationId, field_obj.text, 'keyword plus']
                     for field_obj in field_objects if field_obj is not None])

                reference_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:references/ns:reference',
                    namespaces=ns)
                for ref_obj in reference_objects:
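                    # note: the uid child is assumed to appear before the year child
                    # within each reference, so refid is already set when it is reused below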
                    for ref_elem in ref_obj:
                        if ref_elem.tag == "{{{0}}}uid".format(name_space):
                            refid = load_html_str(
                                ref_elem.text.replace('WOS:', ''))
                            pub2ref_df.append([PublicationId, refid])
                        elif ref_elem.tag == "{{{0}}}year".format(name_space):
                            pub2year[refid] = load_int(ref_elem.text)

                publication_df.append(
                    [pub_record[k] for k in pub_column_names])

                for aorder, author_record in pub_authors.items():
                    if (author_record['AuthorId'] is not None
                            and author_record['AuthorId'] not in found_aids):
                        found_aids.add(author_record['AuthorId'])
                        author_df.append(
                            [author_record[k] for k in author_column_names])

                    paa_df.append([
                        PublicationId, author_record['AuthorId'], aorder,
                        author_record['FullName']
                    ])

            self._save_dataframes(ifile, publication_df, pub_column_names,
                                  author_df, author_column_names, paa_df,
                                  pub2ref_df, affiliation_df, field_df)
            ifile += 1

        with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'),
                       'w') as outfile:
            outfile.write(json.dumps(pub2year).encode('utf8'))

        with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'),
                       'w') as outfile:
            outfile.write(json.dumps(pub2doctype).encode('utf8'))
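The record loop above relies on lxml's iterparse combined with namespaced XPath. The self-contained sketch below replays that pattern on a one-record stand-in document; the sample UID is made up.

from io import BytesIO

from lxml import etree

ns_uri = 'http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord'
ns = {'ns': ns_uri}
sample = ('<records xmlns="{0}"><REC><UID>WOS:000000000000001</UID></REC>'
          '</records>'.format(ns_uri)).encode('utf8')

# iterate only over REC elements, mirroring the WOS preprocess loop
for _, elem in etree.iterparse(BytesIO(sample), events=('end',),
                               tag='{{{0}}}REC'.format(ns_uri)):
    print(elem.xpath('./ns:UID', namespaces=ns)[0].text)  # WOS:000000000000001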