def preprocess(self, xml_file_name='dblp.xml.gz', process_name=True,
               num_file_lines=10**6, show_progress=True):
    """
    Bulk preprocess of the DBLP raw data.

    Parameters
    ----------
    :param process_name: bool, default True
        If True, then when processing the raw file, the package
        `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
        will be used to split author FullNames.

    :param xml_file_name: str, default 'dblp.xml.gz'
        The xml file name.

    :param num_file_lines: int, default 10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    :param show_progress: bool, default True
        Show progress with processing of the data.
    """
    ACCEPT_DOCTYPES = set(['article', 'inproceedings', 'proceedings', 'book',
                           'incollection', 'phdthesis', 'mastersthesis'])
    REJECT_DOCTYPES = set(['www'])
    DATA_ITEMS = ['title', 'booktitle', 'year', 'journal', 'ee', 'url',
                  'month', 'mdate', 'isbn', 'publisher']
    SKIP_FIELDS = ['note', 'cite', 'cdrom', 'crossref', 'editor', 'series',
                   'tt', 'school', 'chapter', 'address']

    doctype = {'article': 'j', 'book': 'b', '': '', 'phdthesis': 'phd',
               'proceedings': 'c', 'inproceedings': 'c', 'mastersthesis': 'ms',
               'incollection': 'c'}

    html_format_keys = ['<sub>', '</sub>', '<sup>', '</sup>', '<i>', '</i>']

    if show_progress:
        print("Starting to preprocess the DBLP database.")

    for hier_dir_type in ['publication', 'author', 'publicationauthor']:
        if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
            os.mkdir(os.path.join(self.path2database, hier_dir_type))

    publication_df = []
    author_df = []
    author2pub_df = []
    journal_df = []

    PublicationId = 1
    AuthorId = 1
    aname2aid = {}
    author_columns = ['AuthorId', 'FullName']
    if process_name:
        author_columns += ['LastName', 'FirstName', 'MiddleName']
    JournalId = 1
    jname2jid = {}

    pub_record = self._blank_dblp_publication(PublicationId)
    pub_authors = []
    AuthorCount = 0

    ifile = 0

    # read the dtd so the xml entities can be resolved
    path2database = self.path2database  # avoid closing over self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    if '.gz' in xml_file_name:
        with gzip.open(os.path.join(self.path2database, xml_file_name), 'r') as infile:
            xml_file = infile.read()
    else:
        with open(os.path.join(self.path2database, xml_file_name), 'r') as infile:
            xml_file = infile.read().encode('latin1')

    # extract the desired fields from the XML tree
    bytesxml = BytesIO(xml_file)
    xmltree = etree.iterparse(bytesxml, load_dtd=True, resolve_entities=True)
    xmltree.resolvers.add(DTDResolver())

    if show_progress:
        print("Xml tree parsed, iterating through elements.")

    last_position = 0
    xml_size = bytesxml.getbuffer().nbytes
    with tqdm(total=xml_size, unit='iB', unit_scale=True, desc='dblp.xml',
              leave=True, disable=not show_progress) as pbar:
        for event, elem in xmltree:
            if elem.tag == 'title' or elem.tag == 'booktitle':
                pub_record['Title'] = load_html_str(elem.text)
            elif elem.tag == 'year':
                pub_record['Year'] = load_int(elem.text)
            elif elem.tag == 'month':
                pub_record['Month'] = load_int(elem.text)
            elif elem.tag == 'volume':
                pub_record['Volume'] = load_int(elem.text)
            elif elem.tag == 'number':
                pub_record['Number'] = load_html_str(elem.text)
            elif elem.tag == 'pages':
                pub_record['Pages'] = load_html_str(elem.text)
            elif elem.tag == 'journal':
                pub_record['JournalId'] = load_html_str(elem.text)
            elif elem.tag == 'url':
                pub_record['URL'] = load_html_str(elem.text)
            elif elem.tag == 'ee':
                pub_record['EE'] = load_html_str(elem.text)
            elif elem.tag == 'author':
                AuthorCount += 1
                fullname = load_html_str(elem.text)
                if aname2aid.get(fullname, None) is None:
                    if process_name:
                        fullname = ''.join([i for i in fullname if not i.isdigit()]).strip()
                        hname = HumanName(fullname)
                        author_df.append([AuthorId, fullname, hname.last, hname.first, hname.middle])
                    else:
                        author_df.append([AuthorId, fullname])
                    aname2aid[fullname] = AuthorId
                    AuthorId += 1
                pub_authors.append([PublicationId, aname2aid[fullname], AuthorCount])
            elif elem.tag in ACCEPT_DOCTYPES:
                pub_record['TeamSize'] = AuthorCount
                pub_record['DocType'] = doctype[load_html_str(elem.tag)]
                publication_df.append(pub_record)
                author2pub_df.extend(pub_authors)

                PublicationId += 1
                pub_record = self._blank_dblp_publication(PublicationId)
                AuthorCount = 0
                pub_authors = []

                # update progress bar
                pbar.update(bytesxml.tell() - last_position)
                last_position = bytesxml.tell()

                if num_file_lines > 0 and (PublicationId % num_file_lines) == 0:
                    self._save_dataframes(ifile, publication_df, author_df,
                                          author_columns, author2pub_df)
                    ifile += 1
                    publication_df = []
                    author_df = []
                    author2pub_df = []
            elif elem.tag in REJECT_DOCTYPES:
                # the record was from a rejected category so reset the record
                pub_record = self._blank_dblp_publication(PublicationId)
                AuthorCount = 0
                pub_authors = []
            elif elem.tag in SKIP_FIELDS:
                pass

    del xmltree

    self._save_dataframes(ifile, publication_df, author_df, author_columns, author2pub_df)
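# Usage sketch (hypothetical names): assuming this method lives on a DBLP
# database class constructed with the directory that holds 'dblp.xml.gz',
# a typical bulk preprocess call would look like:
#
#     mydblp = DBLP(path2database='/path/to/DBLP/')   # hypothetical constructor
#     mydblp.preprocess(xml_file_name='dblp.xml.gz', process_name=True,
#                       num_file_lines=10**6, show_progress=True)
#
# The call writes 'publication', 'author', and 'publicationauthor' shards under
# path2database, each shard holding at most num_file_lines records.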
def preprocess(self, xml_directory='RawXML', process_name=True,
               num_file_lines=10**6, show_progress=True, rewrite_existing=False):
    """
    Bulk preprocess of the PubMed raw data.

    Parameters
    ----------
    process_name: bool, default True
        If True, then when processing the raw file, the package
        `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
        will be used to split author FullNames.

    num_file_lines: int, default 10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    show_progress: bool, default True
        Show progress with processing of the data.

    rewrite_existing: bool, default False
        If True, rewrites the files in the data directory.
    """
    if show_progress:
        print("Starting to preprocess the PubMed database.")

    for hier_dir_type in [self.path2pub_df, self.path2paa_df, self.path2pub2field_df,
                          self.path2pub2ref_df, self.path2fieldinfo_df]:
        if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
            os.mkdir(os.path.join(self.path2database, hier_dir_type))

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    # read the dtd so the xml entities can be resolved
    path2database = self.path2database  # avoid closing over self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

    pub2year = {}
    fieldinfo = {}

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='PubMed xml files', leave=True,
                              disable=not show_progress):

        # check if the xml file was already parsed
        dest_file_name = os.path.join(self.path2database, self.path2pub_df,
                                      'publication{}.hdf'.format(ifile))
        if not rewrite_existing and os.path.isfile(dest_file_name):
            ifile += 1
            continue

        publication_df = []
        paa_df = []
        pub2field_df = []
        pub2ref_df = []

        xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)

        all_pubmed_articles = xmltree.findall("/PubmedArticle")

        for article_bucket in all_pubmed_articles:

            medline = article_bucket.find("MedlineCitation")

            # scrape the publication information
            PublicationId = load_int(load_xml_text(medline.find('PMID')))

            pub_record = self._blank_pubmed_publication(PublicationId)

            article = medline.find("Article")
            pub_record['Title'] = load_html_str(load_xml_text(article.find('ArticleTitle')))
            if article.find('Pagination') is None:
                pub_record['Pages'] = None
            else:
                pub_record['Pages'] = load_html_str(load_xml_text(article.find('Pagination').find("MedlinePgn")))

            journal = article.find("Journal")
            pub_record['JournalId'] = load_html_str(load_xml_text(journal.find("Title")))
            pub_record['Volume'] = load_int(load_xml_text(journal.find("JournalIssue").find("Volume")))
            pub_record['Issue'] = load_int(load_xml_text(journal.find("JournalIssue").find("Issue")))
            pub_record['ISSN'] = load_html_str(load_xml_text(journal.find("ISSN")))

            history = article_bucket.find("PubmedData/History")
            if history is not None:
                pdate = history.find('PubMedPubDate')
                if pdate is not None:
                    pub_record['Year'] = load_int(load_xml_text(pdate.find("Year")))
                    pub_record['Month'] = load_int(load_xml_text(pdate.find("Month")))
                    pub_record['Day'] = load_int(load_xml_text(pdate.find("Day")))
                    if pub_record['Year'] > 0:
                        pub2year[PublicationId] = pub_record['Year']

            article_ids = article_bucket.find("PubmedData/ArticleIdList")
            if article_ids is not None:
                doi = article_ids.find('ArticleId[@IdType="doi"]')
                pub_record['Doi'] = load_xml_text(doi)

            author_list = article.find('AuthorList')
            if author_list is not None:
                for seq, author in enumerate(author_list.findall('Author')):
                    author_record = self._blank_pubmed_author()

                    author_record['PublicationId'] = PublicationId
                    author_record['FirstName'] = load_html_str(load_xml_text(author.find("ForeName")))
                    author_record['LastName'] = load_html_str(load_xml_text(author.find("LastName")))
                    author_record['FullName'] = author_record['FirstName'] + ' ' + author_record['LastName']

                    if author.find("AffiliationInfo/Affiliation") is not None:
                        author_record['Affiliations'] = load_html_str(load_xml_text(author.find("AffiliationInfo/Affiliation")))
                        author_record['Affiliations'] = author_record['Affiliations'].replace(
                            "For a full list of the authors' affiliations please see the Acknowledgements section.", "")

                    author_record['AuthorSequence'] = seq + 1

                    paa_df.append(author_record)
                pub_record['TeamSize'] = seq + 1

            meshterms = medline.find("MeshHeadingList")
            if meshterms is not None:
                for term in meshterms.getchildren():
                    ui = term.find("DescriptorName").attrib.get("UI", "")
                    if len(ui) > 0:
                        pub2field_df.append([PublicationId, ui])
                        fieldinfo[ui] = [load_xml_text(term.find("DescriptorName")), 'mesh']

            chemicals = medline.find("ChemicalList")
            if chemicals is not None:
                for chemical in chemicals.findall("Chemical"):
                    ui = chemical.find("NameOfSubstance").attrib.get("UI", "")
                    if len(ui) > 0:
                        pub2field_df.append([PublicationId, ui])
                        fieldinfo[ui] = [load_xml_text(chemical.find("NameOfSubstance")), 'chem']

            references = article_bucket.find("PubmedData/ReferenceList")
            if references is not None:
                for ref in references.findall("Reference"):
                    citation = load_xml_text(ref.find("Citation"))
                    if ref.find('ArticleIdList') is not None:
                        pmid = load_int(load_xml_text(ref.find('ArticleIdList').find('ArticleId[@IdType="pubmed"]')))
                    else:
                        pmid = ""
                    pub2ref_df.append([PublicationId, pmid, citation])

            publication_df.append(pub_record)

        self._save_dataframes(ifile, publication_df, paa_df, pub2ref_df, pub2field_df)
        ifile += 1

    # save the field info dictionary
    dest_file_name = os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf')
    if rewrite_existing or not os.path.isfile(dest_file_name):
        mesh_id_df_list = list(fieldinfo.values())
        for i, ui in enumerate(fieldinfo.keys()):
            mesh_id_df_list[i].insert(0, ui)

        fieldinfo_df = pd.DataFrame(mesh_id_df_list, columns=['FieldId', 'FieldName', 'FieldType'])
        fieldinfo_df.to_hdf(dest_file_name, key='fieldinfo', mode='w')

    with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2year).encode('utf8'))
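# A minimal sketch of reading back the pub2year map written above; the path is
# a placeholder for your own database directory. Note that JSON round-tripping
# turns the integer PMID keys into strings.
#
#     import gzip, json
#     with gzip.open('/path/to/PubMed/pub2year.json.gz', 'r') as infile:
#         pub2year = json.loads(infile.read().decode('utf8'))
#     # pub2year maps PMID (str) -> publication year (int)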
def parse_publications(self, preprocess=False, preprocess_dicts=True, pubid2int=False,
                       archive_name='aps-dataset-metadata-2019.zip', show_progress=False):
    """
    Parse the APS publication raw metadata.

    Parameters
    ----------
    preprocess: bool, default False
        Save the processed data in new DataFrames.

    preprocess_dicts: bool, default True
        Save the processed Year and DocType data as dictionaries.

    pubid2int: bool, default False
        If True, remap the DOI-based publication ids to integers.

    archive_name: str, default 'aps-dataset-metadata-2019.zip'
        The zip archive containing the raw metadata.

    show_progress: bool, default False
        Show progress with processing of the data.
    """
    archive = zipfile.ZipFile(os.path.join(self.path2database, archive_name), 'r')
    metadata_files = [fname for fname in archive.namelist()
                      if 'aps-dataset-metadata' in fname and '.json' in fname]

    # check that the archive contains the expected directory
    if len(metadata_files) > 0:
        if preprocess:
            for hier_dir_type in ['publication', 'journal', 'affiliation',
                                  'publicationauthoraffiliation', 'pub2field', 'fieldinfo']:
                if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
                    os.mkdir(os.path.join(self.path2database, hier_dir_type))

        journal_dict = {}
        journal_column_names = ['JournalId', 'FullName', 'AbbreviatedName', 'Publisher']

        pub_column_names = ['PublicationId', 'Title', 'Date', 'Year', 'Doi', 'JournalId',
                            'Volume', 'Issue', 'PageStart', 'PageEnd', 'DocType', 'TeamSize']
        pub_df = []
        pub2year = {}
        pub2doctype = {}
        pub2int = {}
        ipub = 0
        if pubid2int:
            pubintcol = ['PublicationId']
        else:
            pubintcol = []

        iaff = 0
        affil_dict = {}
        paa_df = []

        field_dict = {}
        pub2field_df = []

        for fname in tqdm(metadata_files, desc='aps-metadata', leave=True,
                          disable=not show_progress):

            # load the publication json
            pubjson = json.loads(archive.read(fname).decode('utf-8'))
            ipub += 1

            # start parsing publication information
            if pubid2int:
                pubid = ipub
                pub2int[pubjson.get('id', '')] = pubid
            else:
                pubid = pubjson.get('id', '')
            pubinfo = [pubid]
            pubinfo.append(pubjson.get('title', {}).get('value', ''))
            pubinfo.append(pubjson.get('date', ''))
            pubinfo.append(load_int(pubjson.get('date', '').split('-')[0]))
            pub2year[pubid] = pubinfo[-1]
            pubinfo.append(pubjson.get('id', ''))  # for APS the raw publication id is the doi

            # journal of publication
            journalid = pubjson.get('journal', {}).get('id', '')
            pubinfo.append(journalid)
            pubinfo.append(load_int(pubjson.get('volume', {}).get('number', '')))
            pubinfo.append(load_int(pubjson.get('issue', {}).get('number', '')))

            # add page number info
            pubinfo.append(load_int(pubjson.get('pageStart', '')))
            if not pubjson.get('pageEnd', None) is None:
                pubinfo.append(load_int(pubjson.get('pageEnd', '')))
            elif not (pubjson.get('numPages', None) is None or pubjson.get('pageStart', None) is None):
                pubinfo.append(pubinfo[-1] + load_int(pubjson.get('numPages', '')))
            else:
                pubinfo.append(None)

            # add the doctype
            pubinfo.append(pubjson.get('articleType', ''))
            pub2doctype[pubid] = pubinfo[-1]

            # calculate TeamSize
            pubinfo.append(len(pubjson.get('authors', [])))

            # finish the publication info
            pub_df.append(pubinfo)

            # check if we need to save journal information
            if journal_dict.get(journalid, None) is None:
                journal_dict[journalid] = pubjson.get('journal', {})
                journal_dict[journalid]['Publisher'] = pubjson.get('rights', {}).get(
                    'copyrightHolders', [{'name': ''}])[0].get('name', '')

            # start parsing affiliation information
            pub_affid_map = {}
            for pubaffdict in pubjson.get('affiliations', []):
                # check if the affiliation has been used before (only using string match)
                # ToDo: add disambiguation
                if affil_dict.get(pubaffdict.get('name', ''), None) is None:
                    affil_dict[pubaffdict.get('name', '')] = iaff
                    iaff += 1

                # map the affiliation to the AffiliationId
                pub_affid_map[pubaffdict.get('id', '')] = affil_dict[pubaffdict.get('name', '')]

            authorseq = 1
            # now start parsing author information
            for authordict in pubjson.get('authors', []):
                for affid in authordict.get('affiliationIds', [None]):
                    paa_df.append([pubid, authordict.get('name', ''),
                                   pub_affid_map.get(affid, None), authorseq])
                authorseq += 1

            # now do the subject classifications
            for subjectdict in pubjson.get('classificationSchemes', {}).get('subjectAreas', []):
                pub2field_df.append([pubid, subjectdict.get('id', None)])
                if field_dict.get(subjectdict.get('id', None), None) is None:
                    field_dict[subjectdict.get('id', None)] = subjectdict.get('label', None)

            # ToDo: parse concepts

        if show_progress:
            print("Parsing Complete\nSaving Publication DataFrames")

        pub_df = pd.DataFrame(pub_df, columns=pub_column_names)
        for intcol in pubintcol + ['Year']:
            pub_df[intcol] = pub_df[intcol].astype(int)

        journal_rename_dict = {'name': 'FullName', 'id': 'JournalId',
                               'abbreviatedName': 'AbbreviatedName'}
        journal_df = pd.DataFrame(journal_dict.values()).rename(columns=journal_rename_dict)

        affiliation_df = pd.DataFrame([[affid, name] for name, affid in affil_dict.items()],
                                      columns=['AffiliationId', 'Address'])

        paa_df = pd.DataFrame(paa_df, columns=['PublicationId', 'OrigAuthorName',
                                               'AffiliationId', 'AuthorSequence'])
        for intcol in pubintcol + ['AuthorSequence']:
            paa_df[intcol] = paa_df[intcol].astype(int)

        pub2field_df = pd.DataFrame(pub2field_df, columns=['PublicationId', 'FieldId'])
        for intcol in pubintcol:
            pub2field_df[intcol] = pub2field_df[intcol].astype(int)

        field_df = pd.DataFrame([[fieldid, fieldname] for fieldid, fieldname in field_dict.items()],
                                columns=['FieldId', 'FullName'])

        if preprocess:
            pub_df.to_hdf(os.path.join(self.path2database, 'publication', 'publication0.hdf'),
                          mode='w', key='publication')

            if pubid2int:
                with gzip.open(os.path.join(self.path2database, 'pub2int.json.gz'), 'w') as outfile:
                    outfile.write(json.dumps(pub2int).encode('utf8'))

            if preprocess_dicts:
                with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
                    outfile.write(json.dumps(pub2year).encode('utf8'))

                with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'), 'w') as outfile:
                    outfile.write(json.dumps(pub2doctype).encode('utf8'))

            journal_df.to_hdf(os.path.join(self.path2database, 'journal', 'journal0.hdf'),
                              mode='w', key='journal')

            affiliation_df.to_hdf(os.path.join(self.path2database, 'affiliation', 'affiliation0.hdf'),
                                  mode='w', key='affiliation')

            paa_df.to_hdf(os.path.join(self.path2database, 'publicationauthoraffiliation',
                                       'publicationauthoraffiliation0.hdf'),
                          mode='w', key='publicationauthoraffiliation')

            pub2field_df.to_hdf(os.path.join(self.path2database, 'pub2field', 'pub2field0.hdf'),
                                mode='w', key='pub2field')

            field_df.to_hdf(os.path.join(self.path2database, 'fieldinfo', 'fieldinfo0.hdf'),
                            mode='w', key='fieldinfo')

    else:
        raise FileNotFoundError(
            'The archive {0} does not contain a metadata directory: {1}.'.format(
                archive_name, 'aps-dataset-metadata'))
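# Sketch of reloading the tables saved above with pandas; the path is a
# placeholder and assumes the default directory layout created by this method.
#
#     import os
#     import pandas as pd
#     path2database = '/path/to/APS/'   # placeholder
#     pub_df = pd.read_hdf(os.path.join(path2database, 'publication', 'publication0.hdf'))
#     journal_df = pd.read_hdf(os.path.join(path2database, 'journal', 'journal0.hdf'))
#     # e.g. number of distinct publications per journal:
#     counts = pub_df.groupby('JournalId')['PublicationId'].nunique()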
def parse_publicationauthoraffiliation(self, xml_directory='RawXML', preprocess=True,
                                       num_file_lines=10**7, rewrite_existing=False,
                                       show_progress=True):
    """
    Parse the PubMed publication-author raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        The subdirectory of the database directory containing the raw xml files.

    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrites the files in the data directory.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication-Author DataFrame.
    """
    # process author files through xml
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, self.path2paa_df)):
            os.mkdir(os.path.join(self.path2database, self.path2paa_df))

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    # read the dtd so the xml entities can be resolved
    path2database = self.path2database  # avoid closing over self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='PubMed author xml files', leave=True,
                              disable=not show_progress):

        # check if the xml file was already parsed
        dest_file_name = os.path.join(self.path2database, self.path2paa_df,
                                      'publicationauthoraffiliation{}.hdf'.format(ifile))
        if not rewrite_existing and os.path.isfile(dest_file_name):
            ifile += 1
            continue

        paa_df = []

        xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
        all_pubmed_articles = xmltree.findall("/PubmedArticle")

        for article_bucket in all_pubmed_articles:

            medline = article_bucket.find("MedlineCitation")

            # scrape the publication information
            PublicationId = load_int(load_xml_text(medline.find('PMID')))

            article = medline.find("Article")
            author_list = article.find('AuthorList')

            if author_list is not None:
                for seq, author in enumerate(author_list.findall('Author')):
                    author_record = self._blank_pubmed_author()

                    author_record['PublicationId'] = PublicationId
                    author_record['FirstName'] = load_html_str(load_xml_text(author.find("ForeName")))
                    author_record['LastName'] = load_html_str(load_xml_text(author.find("LastName")))
                    author_record['FullName'] = author_record['FirstName'] + ' ' + author_record['LastName']

                    if author.find("AffiliationInfo/Affiliation") is not None:
                        author_record['Affiliations'] = load_html_str(load_xml_text(author.find("AffiliationInfo/Affiliation")))
                        author_record['Affiliations'] = author_record['Affiliations'].replace(
                            "For a full list of the authors' affiliations please see the Acknowledgements section.", "")

                    author_record['AuthorSequence'] = seq + 1

                    paa_df.append(author_record)

        paa_df = pd.DataFrame(paa_df)
        paa_df['AuthorSequence'] = paa_df['AuthorSequence'].astype(int)
        paa_df.to_hdf(dest_file_name, key='paa', mode='w')
        ifile += 1

    # load the publication-author dataframes into one large file
    paa_files_list = glob.glob(os.path.join(self.path2database, self.path2paa_df,
                                            'publicationauthoraffiliation*.hdf'))

    paa_df = pd.DataFrame()

    print("Parsing files...")
    for tmp_paa_df in tqdm(paa_files_list, desc='PubMed author files', leave=True,
                           disable=not show_progress):
        paa_df = paa_df.append(pd.read_hdf(tmp_paa_df), ignore_index=True)

    return paa_df
def parse_fields(self, preprocess=True, num_file_lines=10**7, rewrite_existing=False,
                 xml_directory='RawXML', show_progress=True):
    """
    Parse the PubMed field (mesh term) raw data.

    Parameters
    ----------
    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrites the files in the data directory.

    xml_directory: str, default 'RawXML'
        The subdirectory of the database directory containing the raw xml files.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication-Term ID DataFrame and Term ID - Term DataFrame.
    """
    if preprocess:
        for hier_dir_type in [self.path2pub2field_df, self.path2fieldinfo_df]:
            if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
                os.mkdir(os.path.join(self.path2database, hier_dir_type))

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    # read the dtd so the xml entities can be resolved
    path2database = self.path2database  # avoid closing over self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

    # global id to term mapping
    fieldinfo = {}

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='PubMed xml files', leave=True,
                              disable=not show_progress):

        # check if the xml file was already parsed
        dest_file_name = os.path.join(self.path2database, self.path2pub2field_df,
                                      'pub2field{}.hdf'.format(ifile))
        if not rewrite_existing and os.path.isfile(dest_file_name):
            ifile += 1
            continue

        pub2field_df = []

        xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
        all_pubmed_articles = xmltree.findall("/PubmedArticle")

        for article_bucket in all_pubmed_articles:

            medline = article_bucket.find("MedlineCitation")

            # scrape the publication information
            PublicationId = load_int(load_xml_text(medline.find('PMID')))

            meshterms = medline.find("MeshHeadingList")
            if meshterms is not None:
                for term in meshterms.getchildren():
                    ui = term.find("DescriptorName").attrib.get("UI", "")
                    if len(ui) > 0:
                        pub2field_df.append([PublicationId, ui])
                        fieldinfo[ui] = [load_xml_text(term.find("DescriptorName")), 'mesh']

            chemicals = medline.find("ChemicalList")
            if chemicals is not None:
                for chemical in chemicals.findall("Chemical"):
                    ui = chemical.find("NameOfSubstance").attrib.get("UI", "")
                    if len(ui) > 0:
                        pub2field_df.append([PublicationId, ui])
                        fieldinfo[ui] = [load_xml_text(chemical.find("NameOfSubstance")), 'chem']

        # save the pub-field links
        pub2field_df = pd.DataFrame(pub2field_df, columns=['PublicationId', 'FieldId'])
        pub2field_df['PublicationId'] = pub2field_df['PublicationId'].astype(int)
        pub2field_df.to_hdf(dest_file_name, key='pub2field', mode='w')
        ifile += 1

    # save the field info dictionary
    dest_file_name = os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf')
    if rewrite_existing or not os.path.isfile(dest_file_name):
        mesh_id_df_list = list(fieldinfo.values())
        for i, ui in enumerate(fieldinfo.keys()):
            mesh_id_df_list[i].insert(0, ui)

        fieldinfo_df = pd.DataFrame(mesh_id_df_list, columns=['FieldId', 'FieldName', 'FieldType'])
        fieldinfo_df.to_hdf(dest_file_name, key='fieldinfo', mode='w')

    # load the dataframes
    # pub2field
    pub2field_files = glob.glob(os.path.join(self.path2database, self.path2pub2field_df,
                                             'pub2field*.hdf'))
    pub2field_df = pd.DataFrame()
    for pub2field_tmp_file in tqdm(pub2field_files, desc='PubMed pub2field files', leave=True,
                                   disable=not show_progress):
        pub2field_df = pub2field_df.append(pd.read_hdf(pub2field_tmp_file), ignore_index=True)

    # field info map
    fieldinfo_df = pd.read_hdf(os.path.join(self.path2database, self.path2fieldinfo_df,
                                            'fieldinfo.hdf'))

    return pub2field_df, fieldinfo_df
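# Sketch: the two returned frames join on 'FieldId', e.g. to attach readable
# mesh/chemical names to each publication-field link ('pubmed' is a
# hypothetical instance of this class).
#
#     pub2field_df, fieldinfo_df = pubmed.parse_fields()
#     named = pub2field_df.merge(fieldinfo_df, on='FieldId', how='left')
#     # named has columns: PublicationId, FieldId, FieldName, FieldType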
def parse_publications(self, xml_directory='RawXML', preprocess=True,
                       num_file_lines=10**7, rewrite_existing=False, show_progress=True):
    """
    Parse the PubMed publication raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        The subdirectory of the database directory containing the raw xml files.

    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrites the files in the data directory.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication metadata DataFrame.
    """
    # process publication files through xml
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, self.path2pub_df)):
            os.mkdir(os.path.join(self.path2database, self.path2pub_df))

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    # read the dtd so the xml entities can be resolved
    path2database = self.path2database  # avoid closing over self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='PubMed publication xml files', leave=True,
                              disable=not show_progress):

        # check if the xml file was already parsed
        dest_file_name = os.path.join(self.path2database, self.path2pub_df,
                                      'publication{}.hdf'.format(ifile))
        if not rewrite_existing and os.path.isfile(dest_file_name):
            ifile += 1
            continue

        publication_df = []

        xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
        all_pubmed_articles = xmltree.findall("/PubmedArticle")

        for article_bucket in all_pubmed_articles:

            medline = article_bucket.find("MedlineCitation")

            # scrape the publication information
            PublicationId = load_int(load_xml_text(medline.find('PMID')))

            pub_record = self._blank_pubmed_publication(PublicationId)

            article = medline.find("Article")
            pub_record['Title'] = load_html_str(load_xml_text(article.find('ArticleTitle')))
            if article.find('Pagination') is None:
                pub_record['Pages'] = None
            else:
                pub_record['Pages'] = load_html_str(load_xml_text(article.find('Pagination').find("MedlinePgn")))

            journal = article.find("Journal")
            pub_record['JournalId'] = load_html_str(load_xml_text(journal.find("Title")))
            pub_record['Volume'] = load_int(load_xml_text(journal.find("JournalIssue").find("Volume")))
            pub_record['Issue'] = load_int(load_xml_text(journal.find("JournalIssue").find("Issue")))
            pub_record['ISSN'] = load_html_str(load_xml_text(journal.find("ISSN")))

            history = article_bucket.find("PubmedData/History")
            if history is not None:
                pdate = history.find('PubMedPubDate')
                if pdate is not None:
                    pub_record['Year'] = load_int(load_xml_text(pdate.find("Year")))
                    pub_record['Month'] = load_int(load_xml_text(pdate.find("Month")))
                    pub_record['Day'] = load_int(load_xml_text(pdate.find("Day")))

            article_ids = article_bucket.find("PubmedData/ArticleIdList")
            if article_ids is not None:
                doi = article_ids.find('ArticleId[@IdType="doi"]')
                pub_record['Doi'] = load_xml_text(doi)

            author_list = article.find('AuthorList')
            if author_list is not None:
                pub_record['TeamSize'] = len(author_list.findall('Author'))

            publication_df.append(pub_record)

        # save the publication dataframe
        publication_df = pd.DataFrame(publication_df)
        publication_df['PublicationId'] = publication_df['PublicationId'].astype(int)
        publication_df['Year'] = publication_df['Year'].astype(int)
        publication_df['Month'] = publication_df['Month'].astype(int)
        publication_df['Day'] = publication_df['Day'].astype(int)
        publication_df['Volume'] = pd.to_numeric(publication_df['Volume'])
        publication_df['TeamSize'] = publication_df['TeamSize'].astype(int)
        publication_df.to_hdf(dest_file_name, key='pub', mode='w')
        ifile += 1

    # load the publication dataframes into one large file
    pub_files_list = glob.glob(os.path.join(self.path2database, self.path2pub_df,
                                            'publication*.hdf'))

    pub_df = pd.DataFrame()

    print("Parsing files...")
    for tmp_pub_df in tqdm(pub_files_list, desc='PubMed publication files', leave=True,
                           disable=not show_progress):
        pub_df = pub_df.append(pd.read_hdf(tmp_pub_df), ignore_index=True)

    return pub_df
def parse_references(self, xml_directory='RawXML', preprocess=True, num_file_lines=10**7,
                     rewrite_existing=False, show_progress=True):
    """
    Parse the PubMed References raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        The subdirectory of the database directory containing the raw xml files.

    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrites the files in the data directory.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Citations DataFrame.
    """
    # process reference files through xml
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, self.path2pub2ref_df)):
            os.mkdir(os.path.join(self.path2database, self.path2pub2ref_df))

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    # read the dtd so the xml entities can be resolved
    path2database = self.path2database  # avoid closing over self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='PubMed reference xml files', leave=True,
                              disable=not show_progress):

        # check if the xml file was already parsed
        dest_file_name = os.path.join(self.path2database, self.path2pub2ref_df,
                                      'pub2ref{}.hdf'.format(ifile))
        if not rewrite_existing and os.path.isfile(dest_file_name):
            ifile += 1
            continue

        xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)

        pub2ref_df = []

        all_pubmed_articles = xmltree.findall("/PubmedArticle")

        for article_bucket in all_pubmed_articles:

            medline = article_bucket.find("MedlineCitation")

            # scrape the publication information
            PublicationId = load_int(load_xml_text(medline.find('PMID')))

            references = article_bucket.find("PubmedData/ReferenceList")
            if references is not None:
                for ref in references.findall("Reference"):
                    citation = load_xml_text(ref.find("Citation"))
                    if ref.find('ArticleIdList') is not None:
                        pmid = load_int(load_xml_text(ref.find('ArticleIdList').find('ArticleId[@IdType="pubmed"]')))
                    else:
                        pmid = ""
                    pub2ref_df.append([PublicationId, pmid, citation])

        # save the file; the parsed article is the citing side, the referenced pmid the cited side
        pub2ref_df = pd.DataFrame(pub2ref_df,
                                  columns=['CitingPublicationId', 'CitedPublicationId', 'Citation'])
        pub2ref_df.to_hdf(dest_file_name, key='pub2ref', mode='w')
        ifile += 1

    # load the citations into one large dataframe
    pub2ref_files = glob.glob(os.path.join(self.path2database, self.path2pub2ref_df,
                                           'pub2ref*.hdf'))

    pub2ref_df = pd.DataFrame()

    print("Parsing citation data...")
    for pub2ref_tmp in tqdm(pub2ref_files, desc='PubMed citation files', leave=True,
                            disable=not show_progress):
        pub2ref_df = pub2ref_df.append(pd.read_hdf(pub2ref_tmp), ignore_index=True)

    return pub2ref_df
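# Sketch: raw citation counts can be read straight off the returned edge list
# ('pubmed' is a hypothetical instance of this class).
#
#     pub2ref_df = pubmed.parse_references(show_progress=True)
#     cite_counts = pub2ref_df.groupby('CitedPublicationId')['CitingPublicationId'].nunique()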
def parse_fields(self, preprocess=False, num_file_lines=10**7, show_progress=True):
    """
    Parse the MAG Paper Field raw data.

    Parameters
    ----------
    :param preprocess: bool, default False
        Save the processed data in new DataFrames.

    :param num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    :param show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Pub2Field DataFrame.
    """
    field2get = [0, 5, 6]
    fieldnames = ['FieldId', 'FieldLevel', 'NumberPublications', 'FieldName']

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'fieldinfo')):
            os.mkdir(os.path.join(self.path2database, 'fieldinfo'))

    fieldinfo = []
    with open(os.path.join(self.path2database, 'advanced', 'FieldsOfStudy.txt'), 'r') as infile:
        for line in infile:
            sline = line.split('\t')
            fielddata = [load_int(sline[ip]) for ip in field2get] + [sline[2]]
            fieldinfo.append(fielddata)

    field_df = pd.DataFrame(fieldinfo, columns=fieldnames)
    if preprocess:
        field_df.to_hdf(os.path.join(self.path2database, 'fieldinfo', 'fieldinfo0.hdf'),
                        key='field', mode='w')

    # and now do pub2field
    paperfields = [0, 1]
    paperfieldnames = ['PublicationId', 'FieldId']

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'pub2field')):
            os.mkdir(os.path.join(self.path2database, 'pub2field'))

    file_name = os.path.join(self.path2database, 'advanced', 'PaperFieldsOfStudy.txt')

    ipaper = 0
    ifile = 0
    pub2field_info = []
    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True,
              desc='PaperFieldsOfStudy', leave=True, disable=not show_progress) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                sline = line.split('\t')
                pub2field_info.append([int(sline[ip]) for ip in paperfields])
                ipaper += 1

                # update the progress bar (sys.getsizeof approximates the bytes read)
                pbar.update(sys.getsizeof(line))

                if preprocess and ipaper % num_file_lines == 0:
                    pd.DataFrame(pub2field_info, columns=paperfieldnames).to_hdf(
                        os.path.join(self.path2database, 'pub2field',
                                     'pub2field' + str(ifile) + '.hdf'),
                        key='pub2field', mode='w')
                    ifile += 1
                    pub2field_info = []

    pub2field_df = pd.DataFrame(pub2field_info, columns=paperfieldnames)
    if preprocess:
        pub2field_df.to_hdf(os.path.join(self.path2database, 'pub2field',
                                         'pub2field' + str(ifile) + '.hdf'),
                            key='pub2field', mode='w')

    return pub2field_df
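# Sketch: in MAG's field hierarchy, FieldLevel 0 marks the top-level
# disciplines; e.g. to keep only top-level fields from the saved table
# (placeholder path):
#
#     import pandas as pd
#     fieldinfo = pd.read_hdf('/path/to/MAG/fieldinfo/fieldinfo0.hdf')
#     top_level = fieldinfo[fieldinfo['FieldLevel'] == 0]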
def parse_affiliations(self, preprocess=True, show_progress=True):
    """
    Parse the MAG Affiliation raw data.

    Parameters
    ----------
    :param preprocess: bool, default True
        Save the processed data in new DataFrames.

    :param show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Affiliation DataFrame.
    """
    affil_int_columns = [0, 7, 8]
    affil_str_columns = [3, 4, 5, 6]
    affil_float_columns = [9, 10]

    affil_column_names = ['AffiliationId', 'NumberPublications', 'NumberCitations',
                          'FullName', 'GridId', 'OfficialPage', 'WikiPage',
                          'Latitude', 'Longitude']

    file_name = os.path.join(self.path2database, 'mag', 'Affiliations.txt')

    affiliation_info = []
    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True,
              desc='Affiliations', leave=True, disable=not show_progress) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                sline = line.replace('\n', '').split('\t')
                affline = [load_int(sline[i]) for i in affil_int_columns]
                affline += [sline[i] for i in affil_str_columns]
                affline += [load_float(sline[i]) for i in affil_float_columns]
                affiliation_info.append(affline)

                # update progress bar
                pbar.update(sys.getsizeof(line))

    aff_df = pd.DataFrame(affiliation_info, columns=affil_column_names)

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'affiliation')):
            os.mkdir(os.path.join(self.path2database, 'affiliation'))

        aff_df.to_hdf(os.path.join(self.path2database, 'affiliation', 'affiliation0.hdf'),
                      key='affiliation', mode='w')

    return aff_df
def parse_publicationauthoraffiliation(self, preprocess=False, num_file_lines=10**7,
                                       show_progress=True):
    """
    Parse the MAG PublicationAuthorAffiliation raw data.

    Parameters
    ----------
    :param preprocess: bool, default False
        Save the processed data in new DataFrames.

    :param num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    :param show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        PublicationAuthorAffiliation DataFrame.
    """
    pubauthaff_int_columns = [0, 1, 2, 3]
    pubauthaff_str_columns = [4, 5]
    pub_column_names = ['PublicationId', 'AuthorId', 'AffiliationId', 'AuthorSequence',
                        'OrigAuthorName', 'OrigAffiliationName']

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'publicationauthoraffiliation')):
            os.mkdir(os.path.join(self.path2database, 'publicationauthoraffiliation'))

    file_name = os.path.join(self.path2database, 'mag', 'PaperAuthorAffiliations.txt')

    iref = 0
    ifile = 0
    pubauthaff_info = []
    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True,
              desc='PaperAuthorAffiliations', leave=True, disable=not show_progress) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                sline = line.replace('\n', '').split('\t')
                pubauthaff_info.append(
                    [load_int(sline[ip]) for ip in pubauthaff_int_columns]
                    + [sline[ip] if len(sline) > ip else '' for ip in pubauthaff_str_columns])
                iref += 1

                # update progress bar
                pbar.update(sys.getsizeof(line))

                if preprocess and iref % num_file_lines == 0:
                    pd.DataFrame(pubauthaff_info, columns=pub_column_names).to_hdf(
                        os.path.join(self.path2database, 'publicationauthoraffiliation',
                                     'publicationauthoraffiliation{}.hdf'.format(ifile)),
                        key='publicationauthoraffiliation', mode='w')
                    ifile += 1
                    pubauthaff_info = []

    paa_df = pd.DataFrame(pubauthaff_info, columns=pub_column_names)
    if preprocess:
        paa_df.to_hdf(os.path.join(self.path2database, 'publicationauthoraffiliation',
                                   'publicationauthoraffiliation{}.hdf'.format(ifile)),
                      key='publicationauthoraffiliation', mode='w')

    return paa_df
def parse_references(self, preprocess=False, num_file_lines=10**7, show_progress=True):
    """
    Parse the MAG References raw data.

    Parameters
    ----------
    :param preprocess: bool, default False
        Save the processed data in new DataFrames.

    :param num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    :param show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Pub2Ref DataFrame.
    """
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'pub2ref')):
            os.mkdir(os.path.join(self.path2database, 'pub2ref'))

    file_name = os.path.join(self.path2database, 'mag', 'PaperReferences.txt')

    iref = 0
    ifile = 0
    pub2ref_info = []
    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True, unit_divisor=1024,
              desc='References', leave=True, disable=not show_progress) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                # split the line and keep only the relevant columns
                sline = line.replace('\n', '').split('\t')
                pub2ref_info.append([load_int(sline[ip]) for ip in range(2)])
                iref += 1

                # update progress bar
                pbar.update(sys.getsizeof(line))

                if preprocess and iref % num_file_lines == 0:
                    pd.DataFrame(pub2ref_info,
                                 columns=['CitingPublicationId', 'CitedPublicationId']).to_hdf(
                        os.path.join(self.path2database, 'pub2ref',
                                     'pub2ref{}.hdf'.format(ifile)),
                        key='pub2ref', mode='w')
                    ifile += 1
                    pub2ref_info = []

    pub2ref_df = pd.DataFrame(pub2ref_info, columns=['CitingPublicationId', 'CitedPublicationId'])
    if preprocess:
        pub2ref_df.to_hdf(os.path.join(self.path2database, 'pub2ref',
                                       'pub2ref{}.hdf'.format(ifile)),
                          key='pub2ref', mode='w')

    return pub2ref_df
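# Sketch of reloading all pub2ref shards written above into one frame; the
# path is a placeholder for your MAG directory.
#
#     import glob, os
#     import pandas as pd
#     shards = glob.glob(os.path.join('/path/to/MAG/', 'pub2ref', 'pub2ref*.hdf'))
#     pub2ref_df = pd.concat([pd.read_hdf(f) for f in shards], ignore_index=True)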
def parse_publications(self, preprocess=True, num_file_lines=5 * 10**6,
                       preprocess_dicts=True, show_progress=True):
    """
    Parse the MAG Publication and Journal raw data.

    Parameters
    ----------
    :param preprocess: bool, default True
        Save the processed data in new DataFrames.

    :param num_file_lines: int, default 5*10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    :param preprocess_dicts: bool, default True
        Save the processed Year and DocType data as dictionaries.

    :param show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication DataFrame.
    """
    # first do the journal information
    journal_str_col = [2, 4, 5, 6]
    journal_column_names = ['JournalId', 'FullName', 'Issn', 'Publisher', 'Webpage']

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'journal')):
            os.mkdir(os.path.join(self.path2database, 'journal'))

    file_name = os.path.join(self.path2database, 'mag', 'Journals.txt')

    journal_info = []
    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True,
              desc='Journals', leave=True, disable=not show_progress) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                # split the line and keep only the relevant columns
                sline = line.replace('\n', '').split('\t')
                jline = [load_int(sline[0])] + [sline[i] for i in journal_str_col]
                journal_info.append(jline)

                # update progress bar
                pbar.update(sys.getsizeof(line))

    journal_df = pd.DataFrame(journal_info, columns=journal_column_names)
    if preprocess:
        journal_df.to_hdf(os.path.join(self.path2database, 'journal', 'journal.hdf'),
                          key='journal', mode='w')

    # now let's do the publication information
    doctype = {'Journal': 'j', 'Book': 'b', '': '', 'BookChapter': 'bc',
               'Conference': 'c', 'Dataset': 'd', 'Patent': 'p', 'Repository': 'r'}

    pub_int_columns = [0, 7, 10, 21]
    pub_str_columns = [2, 4, 8, 13, 14]
    pub_column_names = ['PublicationId', 'Year', 'JournalId', 'FamilyId', 'Doi',
                        'Title', 'Date', 'Volume', 'Issue', 'DocType']

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'publication')):
            os.mkdir(os.path.join(self.path2database, 'publication'))

    file_name = os.path.join(self.path2database, 'mag', 'Papers.txt')

    ipub = 0
    ifile = 0
    pubinfo = []

    pub2year = {}
    pub2doctype = {}

    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True,
              desc='Publications', leave=True, disable=not show_progress) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                # split the line and keep only the relevant columns
                sline = line.replace('\n', '').split('\t')
                pline = ([load_int(sline[ip]) for ip in pub_int_columns]
                         + [sline[ip] for ip in pub_str_columns]
                         + [doctype[sline[3]]])
                pub2year[pline[0]] = pline[1]
                if doctype[sline[3]] != '':
                    pub2doctype[pline[0]] = doctype[sline[3]]

                pubinfo.append(pline)
                ipub += 1

                # update progress bar
                pbar.update(sys.getsizeof(line))

                if preprocess and ipub % num_file_lines == 0:
                    pd.DataFrame(pubinfo, columns=pub_column_names).to_hdf(
                        os.path.join(self.path2database, 'publication',
                                     'publication{}.hdf'.format(ifile)),
                        key='publication', mode='w')
                    ifile += 1
                    pubinfo = []

    pub_df = pd.DataFrame(pubinfo, columns=pub_column_names)
    if preprocess:
        pub_df.to_hdf(os.path.join(self.path2database, 'publication',
                                   'publication{}.hdf'.format(ifile)),
                      key='publication', mode='w')

        if preprocess_dicts:
            with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
                outfile.write(json.dumps(pub2year).encode('utf8'))

            with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'), 'w') as outfile:
                outfile.write(json.dumps(pub2doctype).encode('utf8'))

    return pub_df
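# Sketch: the saved pub2doctype map inverts to the single-letter codes defined
# above ('j', 'b', 'bc', 'c', 'd', 'p', 'r'); e.g. to keep journal articles
# only (placeholder path; JSON turns the integer ids into string keys):
#
#     import gzip, json
#     with gzip.open('/path/to/MAG/pub2doctype.json.gz', 'r') as infile:
#         pub2doctype = json.loads(infile.read().decode('utf8'))
#     journal_pubs = [pid for pid, dt in pub2doctype.items() if dt == 'j']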
def parse_authors(self, preprocess=False, process_name=True, num_file_lines=5 * 10**6,
                  show_progress=True):
    """
    Parse the MAG Author raw data.

    Parameters
    ----------
    :param preprocess: bool, default False
        Save the processed data in new DataFrames.

    :param process_name: bool, default True
        If True, then when processing the raw file, the package
        `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
        will be used to split author FullNames.

    :param num_file_lines: int, default 5*10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    :param show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Author DataFrame.
    """
    author_int_columns = [0, 4, 5, 6]

    author_column_names = ['AuthorId', 'LastKnownAffiliationId', 'NumberPublications',
                           'NumberCitations', 'FullName']
    if process_name:
        author_column_names += ['LastName', 'FirstName', 'MiddleName']

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'author')):
            os.mkdir(os.path.join(self.path2database, 'author'))

    file_name = os.path.join(self.path2database, 'mag', 'Authors.txt')

    iauthor = 0
    ifile = 0
    authorinfo = []
    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True,
              desc='Authors', disable=not show_progress, leave=True) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                # split the line and keep only the relevant columns
                sline = line.split('\t')
                adata = [load_int(sline[ip]) for ip in author_int_columns] + [sline[2]]

                # process the first, middle, and last names for the author
                if process_name:
                    hname = HumanName(unicodedata.normalize('NFD', sline[2]))
                    adata += [hname.last, hname.first, hname.middle]

                authorinfo.append(adata)
                iauthor += 1

                # update progress bar
                pbar.update(sys.getsizeof(line))

                # time to save
                if preprocess and iauthor % num_file_lines == 0:
                    pd.DataFrame(authorinfo, columns=author_column_names).to_hdf(
                        os.path.join(self.path2database, 'author',
                                     'author{}.hdf'.format(ifile)),
                        key='author', mode='w')
                    ifile += 1
                    authorinfo = []

    author_df = pd.DataFrame(authorinfo, columns=author_column_names)
    if preprocess:
        author_df.to_hdf(os.path.join(self.path2database, 'author',
                                      'author{}.hdf'.format(ifile)),
                         key='author', mode='w')

    return author_df
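# Sketch of the name splitting performed above with nameparser's HumanName,
# shown on a standalone string:
#
#     from nameparser import HumanName
#     hname = HumanName('Jorge Luis Borges')
#     hname.first, hname.middle, hname.last   # -> ('Jorge', 'Luis', 'Borges')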
def preprocess(self, xml_directory='RawXML',
               name_space='http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord',
               process_name=True, num_file_lines=10**6, show_progress=True):
    """
    Bulk preprocess of the Web of Science raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        The subdirectory of the database directory containing the raw xml files.

    name_space: str, default 'http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord'
        The xml namespace used by the WOS records.

    process_name: bool, default True
        If True, then when processing the raw file, the package
        `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
        will be used to split author FullNames.

    num_file_lines: int, default 10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    show_progress: bool, default True
        Show progress with processing of the data.
    """
    pub_column_names = ['PublicationId', 'Year', 'JournalId', 'Doi', 'ISSN', 'Title',
                        'Date', 'Volume', 'Issue', 'Pages', 'DocType', 'TeamSize']
    author_column_names = ['AuthorId', 'FullName', 'FirstName', 'LastName']

    if show_progress:
        print("Starting to preprocess the WOS database.")

    for hier_dir_type in ['publication', 'author', 'publicationauthoraffiliation',
                          'pub2field', 'pub2ref', 'affiliation']:
        if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
            os.mkdir(os.path.join(self.path2database, hier_dir_type))

    pub2year = {}
    pub2doctype = {}

    found_aids = set([])
    found_affiliations = {}

    ns = {"ns": name_space}
    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='WOS xml files', leave=True,
                              disable=not show_progress):

        publication_df = []
        author_df = []
        paa_df = []
        pub2field_df = []
        pub2ref_df = []
        affiliation_df = []
        field_df = []

        name, extension = os.path.splitext(xml_file_name)

        if extension == '.gz':
            with gzip.open(os.path.join(self.path2database, xml_directory, xml_file_name), 'r') as infile:
                xml_file = infile.read()
        elif extension == '.xml':
            with open(os.path.join(self.path2database, xml_directory, xml_file_name), 'rb') as infile:
                xml_file = infile.read()
        bytesxml = BytesIO(xml_file)

        # extract the desired fields from the XML tree
        xmltree = etree.iterparse(bytesxml, events=('end',), tag="{{{0}}}REC".format(name_space))

        if show_progress:
            print("{} Xml tree parsed, iterating through elements.".format(xml_file_name))

        for event, elem in xmltree:
            # scrape the publication information
            PublicationId = load_html_str(elem.xpath('./ns:UID', namespaces=ns)[0].text)
            pub_record = self._blank_wos_publication(PublicationId)

            pub_record['Title'] = load_html_str(load_xml_text(elem.xpath(
                './ns:static_data/ns:summary/ns:titles/ns:title[@type="item"]', namespaces=ns)))
            pub_record['JournalId'] = load_html_str(load_xml_text(elem.xpath(
                './ns:static_data/ns:summary/ns:titles/ns:title[@type="source"]', namespaces=ns)))

            pub_info = elem.xpath('./ns:static_data/ns:summary/ns:pub_info', namespaces=ns)[0]
            pub_record['Year'] = load_int(pub_info.get('pubyear', ''))
            pub_record['Date'] = load_html_str(pub_info.get('sortdate', ''))
            pub_record['Volume'] = load_int(pub_info.get('vol', ''))
            pub_record['Issue'] = load_int(pub_info.get('issue', ''))
            pub2year[PublicationId] = pub_record['Year']

            pub_record['Pages'] = load_html_str(load_xml_text(elem.xpath(
                './ns:static_data/ns:summary/ns:pub_info/ns:page', namespaces=ns), default=''))

            for ident in ['ISSN', 'Doi']:
                identobject = elem.xpath(
                    './ns:dynamic_data/ns:cluster_related/ns:identifiers/ns:identifier[@type="{}"]'.format(ident.lower()),
                    namespaces=ns)
                if len(identobject) > 0:
                    pub_record[ident] = load_html_str(identobject[0].get('value', ''))

            pub_record['DocType'] = load_html_str(load_xml_text(elem.xpath(
                './ns:static_data/ns:summary/ns:doctypes/ns:doctype', namespaces=ns)))
            pub2doctype[PublicationId] = pub_record['DocType']

            # now scrape the authors
            pub_authors = {}
            author_objects = elem.xpath(
                './ns:static_data/ns:summary/ns:names/ns:name[@role="author"]', namespaces=ns)
            pub_record['TeamSize'] = len(author_objects)

            for author_obj in author_objects:
                author_record = self._blank_wos_author(None)
                author_record['AuthorId'] = author_obj.get('dais_id', None)

                author_record['FullName'] = load_html_str(load_xml_text(
                    author_obj.xpath('./ns:full_name', namespaces=ns)))
                author_record['FirstName'] = load_html_str(load_xml_text(
                    author_obj.xpath('./ns:first_name', namespaces=ns)))
                author_record['LastName'] = load_html_str(load_xml_text(
                    author_obj.xpath('./ns:last_name', namespaces=ns)))

                author_record['Affiliations'] = author_obj.get('addr_no', '')
                author_record['Affiliations'] = [int(single_addr_no) for single_addr_no
                                                 in author_record['Affiliations'].split(' ')
                                                 if len(single_addr_no) > 0]

                author_record['AuthorOrder'] = int(author_obj.get('seq_no', None))
                pub_authors[author_record['AuthorOrder']] = author_record

            #contributor_objects = elem.xpath('./ns:static_data/ns:contributors/ns:contributor/ns:name[@role="researcher_id"]', namespaces=ns)

            address_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:addresses/ns:address_name/ns:address_spec',
                namespaces=ns)
            for addr_obj in address_objects:
                addr_record = self._blank_wos_affiliation()

                organization_objects = addr_obj.xpath(
                    './ns:organizations/ns:organization[@pref="Y"]', namespaces=ns)
                if len(organization_objects) == 0:
                    organization_objects = addr_obj.xpath(
                        './ns:organizations/ns:organization', namespaces=ns)
                if len(organization_objects) == 0:
                    orgtext = ''
                else:
                    orgtext = organization_objects[0].text

                address_no = int(addr_obj.get('addr_no'))
                affiliation_df.append([PublicationId, address_no, orgtext])
                #if found_affiliations
                #article['addresses'][address_no] = address_info

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:headings/ns:heading',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'heading']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subheadings/ns:subheading',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'subheading']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="traditional"]',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'ASCA traditional subject']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="extended"]',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'ASCA extended subject']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:keywords/ns:keyword', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'keyword']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:item/ns:keywords_plus/ns:keyword', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'keyword plus']
                             for field_obj in field_objects if field_obj is not None])

            reference_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:references/ns:reference', namespaces=ns)
            for ref_obj in reference_objects:
                refid = None
                for ref_elem in ref_obj:
                    if ref_elem.tag == "{{{0}}}uid".format(name_space):
                        refid = load_html_str(ref_elem.text.replace('WOS:', ''))
                        pub2ref_df.append([PublicationId, refid])
                    elif ref_elem.tag == "{{{0}}}year".format(name_space) and refid is not None:
                        pub2year[refid] = load_int(ref_elem.text)

            publication_df.append([pub_record[k] for k in pub_column_names])

            for aorder, author_record in pub_authors.items():
                if author_record['AuthorId'] is not None and author_record['AuthorId'] not in found_aids:
                    found_aids.add(author_record['AuthorId'])
                    author_df.append([author_record[k] for k in author_column_names])

                paa_df.append([PublicationId, author_record['AuthorId'], aorder,
                               author_record['FullName']])

        self._save_dataframes(ifile, publication_df, pub_column_names,
                              author_df, author_column_names,
                              paa_df, pub2ref_df, affiliation_df, field_df)
        ifile += 1

    with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2year).encode('utf8'))

    with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2doctype).encode('utf8'))
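# Minimal standalone sketch of the namespace-qualified xpath pattern used
# above, on a toy record (toy namespace chosen for brevity):
#
#     from lxml import etree
#     ns = {'ns': 'http://example.com/wok'}   # toy namespace
#     rec = etree.fromstring(
#         '<REC xmlns="http://example.com/wok"><UID>WOS:1</UID></REC>')
#     uid = rec.xpath('./ns:UID', namespaces=ns)[0].text   # -> 'WOS:1'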