def preprocess(self, xml_directory='RawXML', process_name=True, num_file_lines=10**6, show_progress=True, rewrite_existing=False):
    """
    Bulk preprocess of the PubMed raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        The subdirectory of the data directory that contains the raw PubMed XML files.

    process_name: bool, default True
        If True, then when processing the raw file, the package `NameParser <https://nameparser.readthedocs.io/en/latest/>`_ will be used to split author FullNames.

    num_file_lines: int, default 10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    show_progress: bool, default True
        Show progress with processing of the data.

    rewrite_existing: bool, default False
        If True, rewrite the files in the data directory.
    """
    if show_progress:
        print("Starting to preprocess the PubMed database.")

    # create the output directories if they do not yet exist
    for hier_dir_type in [self.path2pub_df, self.path2paa_df, self.path2pub2field_df, self.path2pub2ref_df, self.path2fieldinfo_df]:
        if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
            os.mkdir(os.path.join(self.path2database, hier_dir_type))

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

    # read the DTD from the local database directory
    path2database = self.path2database  # avoid referencing self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

    pub2year = {}
    fieldinfo = {}

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='PubMed xml files', leave=True, disable=not show_progress):

        # check if the xml file was already parsed
        dest_file_name = os.path.join(self.path2database, self.path2pub_df, 'publication{}.hdf'.format(ifile))
        if not rewrite_existing and os.path.isfile(dest_file_name):
            ifile += 1
            continue

        publication_df = []
        paa_df = []
        pub2field_df = []
        pub2ref_df = []

        xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)

        all_pubmed_articles = xmltree.findall("/PubmedArticle")

        for article_bucket in all_pubmed_articles:

            medline = article_bucket.find("MedlineCitation")

            # scrape the publication information
            PublicationId = load_int(load_xml_text(medline.find('PMID')))

            pub_record = self._blank_pubmed_publication(PublicationId)

            article = medline.find("Article")
            pub_record['Title'] = load_html_str(load_xml_text(article.find('ArticleTitle')))
            if article.find('Pagination') is None:
                pub_record['Pages'] = None
            else:
                pub_record['Pages'] = load_html_str(load_xml_text(article.find('Pagination').find("MedlinePgn")))

            journal = article.find("Journal")
            pub_record['JournalId'] = load_html_str(load_xml_text(journal.find("Title")))
            pub_record['Volume'] = load_int(load_xml_text(journal.find("JournalIssue").find("Volume")))
            pub_record['Issue'] = load_int(load_xml_text(journal.find("JournalIssue").find("Issue")))
            pub_record['ISSN'] = load_html_str(load_xml_text(journal.find("ISSN")))

            history = article_bucket.find("PubmedData/History")
            if history is not None:
                pdate = history.find('PubMedPubDate')
                if pdate is not None:
                    pub_record['Year'] = load_int(load_xml_text(pdate.find("Year")))
                    pub_record['Month'] = load_int(load_xml_text(pdate.find("Month")))
                    pub_record['Day'] = load_int(load_xml_text(pdate.find("Day")))

            if pub_record['Year'] > 0:
                pub2year[PublicationId] = pub_record['Year']

            article_ids = article_bucket.find("PubmedData/ArticleIdList")
            if article_ids is not None:
                doi = article_ids.find('ArticleId[@IdType="doi"]')
                pub_record['Doi'] = load_xml_text(doi)

            # scrape the author information
            author_list = article.find('AuthorList')
            if author_list is not None:
                for seq, author in enumerate(author_list.findall('Author')):
                    author_record = self._blank_pubmed_author()

                    author_record['PublicationId'] = PublicationId
                    author_record['FirstName'] = load_html_str(load_xml_text(author.find("ForeName")))
                    author_record['LastName'] = load_html_str(load_xml_text(author.find("LastName")))
                    author_record['FullName'] = author_record['FirstName'] + ' ' + author_record['LastName']

                    if author.find("AffiliationInfo/Affiliation") is not None:
                        author_record['Affiliations'] = load_html_str(load_xml_text(author.find("AffiliationInfo/Affiliation")))
                        author_record['Affiliations'] = author_record['Affiliations'].replace("For a full list of the authors' affiliations please see the Acknowledgements section.", "")

                    author_record['AuthorSequence'] = seq + 1

                    paa_df.append(author_record)

                pub_record['TeamSize'] = seq + 1

            # scrape the field information (MeSH terms and chemicals)
            meshterms = medline.find("MeshHeadingList")
            if meshterms is not None:
                for term in meshterms.getchildren():
                    ui = term.find("DescriptorName").attrib.get("UI", "")
                    if len(ui) > 0:
                        pub2field_df.append([PublicationId, ui])
                        fieldinfo[ui] = [load_xml_text(term.find("DescriptorName")), 'mesh']

            chemicals = medline.find("ChemicalList")
            if chemicals is not None:
                for chemical in chemicals.findall("Chemical"):
                    ui = chemical.find("NameOfSubstance").attrib.get("UI", "")
                    if len(ui) > 0:
                        pub2field_df.append([PublicationId, ui])
                        fieldinfo[ui] = [load_xml_text(chemical.find("NameOfSubstance")), 'chem']

            # scrape the references
            references = article_bucket.find("PubmedData/ReferenceList")
            if references is not None:
                for ref in references.findall("Reference"):
                    citation = load_xml_text(ref.find("Citation"))
                    if ref.find('ArticleIdList') is not None:
                        pmid = load_int(load_xml_text(ref.find('ArticleIdList').find('ArticleId[@IdType="pubmed"]')))
                    else:
                        pmid = ""
                    pub2ref_df.append([PublicationId, pmid, citation])

            publication_df.append(pub_record)

        self._save_dataframes(ifile, publication_df, paa_df, pub2ref_df, pub2field_df)
        ifile += 1

    # save the field info dictionary if it is new or is being rewritten
    dest_file_name = os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf')
    if rewrite_existing or not os.path.isfile(dest_file_name):
        mesh_id_df_list = list(fieldinfo.values())
        for i, fieldid in enumerate(fieldinfo.keys()):
            mesh_id_df_list[i].insert(0, fieldid)

        fieldinfo_df = pd.DataFrame(mesh_id_df_list, columns=['FieldId', 'FieldName', 'FieldType'])
        fieldinfo_df.to_hdf(dest_file_name, key='fieldinfo', mode='w')

    with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2year).encode('utf8'))
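# A minimal sketch (hypothetical helper, not part of the class) showing how the
# pub2year mapping written by preprocess() above can be read back; it assumes the
# same gzip/json layout used when the file was saved.
def load_pub2year(path2database):
    import gzip, json, os
    with gzip.open(os.path.join(path2database, 'pub2year.json.gz'), 'r') as infile:
        # keys are PMIDs (strings after the JSON round-trip), values are publication years
        return json.loads(infile.read().decode('utf8'))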
def parse_fields(self, preprocess=True, num_file_lines=10**7, rewrite_existing=False, xml_directory='RawXML', show_progress=True):
    """
    Parse the PubMed field (MeSH term) raw data.

    Parameters
    ----------
    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrite the files in the data directory.

    xml_directory: str, default 'RawXML'
        The subdirectory of the data directory that contains the raw PubMed XML files.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication-Term ID DataFrame and Term ID - Term DataFrame
    """
    if preprocess:
        for hier_dir_type in [self.path2pub2field_df, self.path2fieldinfo_df]:
            if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
                os.mkdir(os.path.join(self.path2database, hier_dir_type))

        xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

        # read the DTD from the local database directory
        path2database = self.path2database  # avoid referencing self inside the resolver class

        class DTDResolver(etree.Resolver):
            def resolve(self, system_url, public_id, context):
                return self.resolve_filename(os.path.join(path2database, system_url), context)

        parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

        # global id to term mapping
        fieldinfo = {}

        ifile = 0
        for xml_file_name in tqdm(xmlfiles, desc='PubMed xml files', leave=True, disable=not show_progress):

            # check if the xml file was already parsed
            dest_file_name = os.path.join(self.path2database, self.path2pub2field_df, 'pub2field{}.hdf'.format(ifile))
            if not rewrite_existing and os.path.isfile(dest_file_name):
                ifile += 1
                continue

            pub2field_df = []

            xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
            all_pubmed_articles = xmltree.findall("/PubmedArticle")

            for article_bucket in all_pubmed_articles:

                medline = article_bucket.find("MedlineCitation")

                # scrape the publication information
                PublicationId = load_int(load_xml_text(medline.find('PMID')))

                meshterms = medline.find("MeshHeadingList")
                if meshterms is not None:
                    for term in meshterms.getchildren():
                        ui = term.find("DescriptorName").attrib.get("UI", "")
                        if len(ui) > 0:
                            pub2field_df.append([PublicationId, ui])
                            fieldinfo[ui] = [load_xml_text(term.find("DescriptorName")), 'mesh']

                chemicals = medline.find("ChemicalList")
                if chemicals is not None:
                    for chemical in chemicals.findall("Chemical"):
                        ui = chemical.find("NameOfSubstance").attrib.get("UI", "")
                        if len(ui) > 0:
                            pub2field_df.append([PublicationId, ui])
                            fieldinfo[ui] = [load_xml_text(chemical.find("NameOfSubstance")), 'chem']

            # save the publication-field id links
            pub2field_df = pd.DataFrame(pub2field_df, columns=['PublicationId', 'FieldId'])
            pub2field_df['PublicationId'] = pub2field_df['PublicationId'].astype(int)
            pub2field_df.to_hdf(os.path.join(self.path2database, self.path2pub2field_df, 'pub2field{}.hdf'.format(ifile)), key='pub2field', mode='w')

            ifile += 1

        # save the field info dictionary if it is new or is being rewritten
        dest_file_name = os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf')
        if rewrite_existing or not os.path.isfile(dest_file_name):
            mesh_id_df_list = list(fieldinfo.values())
            for i, fieldid in enumerate(fieldinfo.keys()):
                mesh_id_df_list[i].insert(0, fieldid)

            fieldinfo_df = pd.DataFrame(mesh_id_df_list, columns=['FieldId', 'FieldName', 'FieldType'])
            fieldinfo_df.to_hdf(os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf'), key='fieldinfo', mode='w')

    # load the dataframes

    # pub2field
    pub2field_files = glob.glob(os.path.join(self.path2database, self.path2pub2field_df, 'pub2field*.hdf'))
    pub2field_df = pd.DataFrame()
    for pub2field_tmp_file in tqdm(pub2field_files, desc='PubMed pub2field files', leave=True, disable=not show_progress):
        pub2field_df = pub2field_df.append(pd.read_hdf(pub2field_tmp_file), ignore_index=True)

    # field info map
    fieldinfo_df = pd.read_hdf(os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf'))

    return pub2field_df, fieldinfo_df
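# Usage sketch (hypothetical helper, not part of the class): attach the field names
# returned by parse_fields() to the publication-field links; column names follow the
# DataFrames constructed above.
def attach_field_names(pub2field_df, fieldinfo_df):
    # left-join the field metadata onto each (PublicationId, FieldId) pair
    return pub2field_df.merge(fieldinfo_df, on='FieldId', how='left')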
def parse_publicationauthoraffiliation(self, xml_directory='RawXML', preprocess=True, num_file_lines=10**7, rewrite_existing=False, show_progress=True):
    """
    Parse the PubMed publication-author raw data.

    Parameters
    ----------
    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrite the files in the data directory.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication-Author DataFrame.
    """
    # process author files through xml
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'publicationauthoraffiliation')):
            os.mkdir(os.path.join(self.path2database, 'publicationauthoraffiliation'))

        xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

        # read the DTD from the local database directory
        path2database = self.path2database  # avoid referencing self inside the resolver class

        class DTDResolver(etree.Resolver):
            def resolve(self, system_url, public_id, context):
                return self.resolve_filename(os.path.join(path2database, system_url), context)

        parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

        ifile = 0
        for xml_file_name in tqdm(xmlfiles, desc='PubMed author xml files', leave=True, disable=not show_progress):

            # check if the xml file was already parsed
            dest_file_name = os.path.join(self.path2database, self.path2paa_df, 'publicationauthoraffiliation{}.hdf'.format(ifile))
            if not rewrite_existing and os.path.isfile(dest_file_name):
                ifile += 1
                continue

            paa_df = []

            xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
            all_pubmed_articles = xmltree.findall("/PubmedArticle")

            for article_bucket in all_pubmed_articles:

                medline = article_bucket.find("MedlineCitation")

                # scrape the publication information
                PublicationId = load_int(load_xml_text(medline.find('PMID')))

                article = medline.find("Article")
                author_list = article.find('AuthorList')
                if author_list is not None:
                    for seq, author in enumerate(author_list.findall('Author')):
                        author_record = self._blank_pubmed_author()

                        author_record['PublicationId'] = PublicationId
                        author_record['FirstName'] = load_html_str(load_xml_text(author.find("ForeName")))
                        author_record['LastName'] = load_html_str(load_xml_text(author.find("LastName")))
                        author_record['FullName'] = author_record['FirstName'] + ' ' + author_record['LastName']

                        if author.find("AffiliationInfo/Affiliation") is not None:
                            author_record['Affiliations'] = load_html_str(load_xml_text(author.find("AffiliationInfo/Affiliation")))
                            author_record['Affiliations'] = author_record['Affiliations'].replace("For a full list of the authors' affiliations please see the Acknowledgements section.", "")

                        author_record['AuthorSequence'] = seq + 1

                        paa_df.append(author_record)

            paa_df = pd.DataFrame(paa_df)
            paa_df['AuthorSequence'] = paa_df['AuthorSequence'].astype(int)
            paa_df.to_hdf(os.path.join(self.path2database, self.path2paa_df, 'publicationauthoraffiliation{}.hdf'.format(ifile)), key='paa', mode='w')

            ifile += 1

    ## load publication author dataframe into a large file
    paa_files_list = glob.glob(os.path.join(self.path2database, self.path2paa_df, 'publicationauthoraffiliation*.hdf'))

    paa_df = pd.DataFrame()
    print("Parsing files...")
    for tmp_paa_df in tqdm(paa_files_list, desc='PubMed author files', leave=True, disable=not show_progress):
        paa_df = paa_df.append(pd.read_hdf(tmp_paa_df), ignore_index=True)

    return paa_df
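# Usage sketch (hypothetical helper, not part of the class): recover each publication's
# team size from the publication-author DataFrame returned above; AuthorSequence is
# 1-based, so its maximum per publication equals the number of listed authors.
def team_size_from_paa(paa_df):
    return paa_df.groupby('PublicationId')['AuthorSequence'].max()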
def parse_references(self, xml_directory='RawXML', preprocess=True, num_file_lines=10**7, rewrite_existing=False, show_progress=True):
    """
    Parse the PubMed References raw data.

    Parameters
    ----------
    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrite the files in the data directory.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Citations DataFrame.
    """
    # process reference files through xml
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'pub2ref')):
            os.mkdir(os.path.join(self.path2database, 'pub2ref'))

        xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

        # read the DTD from the local database directory
        path2database = self.path2database  # avoid referencing self inside the resolver class

        class DTDResolver(etree.Resolver):
            def resolve(self, system_url, public_id, context):
                return self.resolve_filename(os.path.join(path2database, system_url), context)

        parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

        ifile = 0
        for xml_file_name in tqdm(xmlfiles, desc='PubMed reference xml files', leave=True, disable=not show_progress):

            # check if the xml file was already parsed
            dest_file_name = os.path.join(self.path2database, self.path2pub2ref_df, 'pub2ref{}.hdf'.format(ifile))
            if not rewrite_existing and os.path.isfile(dest_file_name):
                ifile += 1
                continue

            pub2ref_df = []

            xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
            all_pubmed_articles = xmltree.findall("/PubmedArticle")

            for article_bucket in all_pubmed_articles:

                medline = article_bucket.find("MedlineCitation")

                # scrape the publication information
                PublicationId = load_int(load_xml_text(medline.find('PMID')))

                # the article's ReferenceList contains the publications it cites
                references = article_bucket.find("PubmedData/ReferenceList")
                if references is not None:
                    for ref in references.findall("Reference"):
                        citation = load_xml_text(ref.find("Citation"))
                        if ref.find('ArticleIdList') is not None:
                            pmid = load_int(load_xml_text(ref.find('ArticleIdList').find('ArticleId[@IdType="pubmed"]')))
                        else:
                            pmid = ""
                        pub2ref_df.append([PublicationId, pmid, citation])

            # save file
            pub2ref_df = pd.DataFrame(pub2ref_df, columns=['CitingPublicationId', 'CitedPublicationId', 'Citation'])
            pub2ref_df.to_hdf(os.path.join(self.path2database, self.path2pub2ref_df, 'pub2ref{}.hdf'.format(ifile)), key='pub2ref', mode='w')

            ifile += 1

    # load the citations into a large dataframe
    pub2ref_files = glob.glob(os.path.join(self.path2database, self.path2pub2ref_df, 'pub2ref*.hdf'))

    pub2ref_df = pd.DataFrame()
    print("parsing citation data...")
    for pub2ref_tmp in tqdm(pub2ref_files, desc='PubMed citation xml files', leave=True, disable=not show_progress):
        pub2ref_df = pub2ref_df.append(pd.read_hdf(pub2ref_tmp), ignore_index=True)

    return pub2ref_df
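# Usage sketch (hypothetical helper, not part of the class): count how many distinct
# citing publications reference each cited publication in the pub2ref DataFrame
# returned above; column names follow the schema saved in parse_references().
def citation_counts(pub2ref_df):
    return pub2ref_df.groupby('CitedPublicationId')['CitingPublicationId'].nunique()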
def parse_publications(self, xml_directory='RawXML', preprocess=True, num_file_lines=10**7, rewrite_existing=False, show_progress=True):
    """
    Parse the PubMed publication raw data.

    Parameters
    ----------
    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrite the files in the data directory.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication metadata DataFrame.
    """
    # process publication files through xml
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'publication')):
            os.mkdir(os.path.join(self.path2database, 'publication'))

        xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

        # read the DTD from the local database directory
        path2database = self.path2database  # avoid referencing self inside the resolver class

        class DTDResolver(etree.Resolver):
            def resolve(self, system_url, public_id, context):
                return self.resolve_filename(os.path.join(path2database, system_url), context)

        parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

        ifile = 0
        for xml_file_name in tqdm(xmlfiles, desc='PubMed publication xml files', leave=True, disable=not show_progress):

            # check if the xml file was already parsed
            dest_file_name = os.path.join(self.path2database, self.path2pub_df, 'publication{}.hdf'.format(ifile))
            if not rewrite_existing and os.path.isfile(dest_file_name):
                ifile += 1
                continue

            publication_df = []

            xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
            all_pubmed_articles = xmltree.findall("/PubmedArticle")

            for article_bucket in all_pubmed_articles:

                medline = article_bucket.find("MedlineCitation")

                # scrape the publication information
                PublicationId = load_int(load_xml_text(medline.find('PMID')))

                pub_record = self._blank_pubmed_publication(PublicationId)

                article = medline.find("Article")
                pub_record['Title'] = load_html_str(load_xml_text(article.find('ArticleTitle')))
                if article.find('Pagination') is None:
                    pub_record['Pages'] = None
                else:
                    pub_record['Pages'] = load_html_str(load_xml_text(article.find('Pagination').find("MedlinePgn")))

                journal = article.find("Journal")
                pub_record['JournalId'] = load_html_str(load_xml_text(journal.find("Title")))
                pub_record['Volume'] = load_int(load_xml_text(journal.find("JournalIssue").find("Volume")))
                pub_record['Issue'] = load_int(load_xml_text(journal.find("JournalIssue").find("Issue")))
                pub_record['ISSN'] = load_html_str(load_xml_text(journal.find("ISSN")))

                history = article_bucket.find("PubmedData/History")
                if history is not None:
                    pdate = history.find('PubMedPubDate')
                    if pdate is not None:
                        pub_record['Year'] = load_int(load_xml_text(pdate.find("Year")))
                        pub_record['Month'] = load_int(load_xml_text(pdate.find("Month")))
                        pub_record['Day'] = load_int(load_xml_text(pdate.find("Day")))

                article_ids = article_bucket.find("PubmedData/ArticleIdList")
                if article_ids is not None:
                    doi = article_ids.find('ArticleId[@IdType="doi"]')
                    pub_record['Doi'] = load_xml_text(doi)

                author_list = article.find('AuthorList')
                if author_list is not None:
                    pub_record['TeamSize'] = len(author_list.findall('Author'))

                publication_df.append(pub_record)

            # save publication dataframe
            publication_df = pd.DataFrame(publication_df)
            publication_df['PublicationId'] = publication_df['PublicationId'].astype(int)
            publication_df['Year'] = publication_df['Year'].astype(int)
            publication_df['Month'] = publication_df['Month'].astype(int)
            publication_df['Day'] = publication_df['Day'].astype(int)
            publication_df['Volume'] = pd.to_numeric(publication_df['Volume'])
            publication_df['TeamSize'] = publication_df['TeamSize'].astype(int)
            publication_df.to_hdf(os.path.join(self.path2database, self.path2pub_df, 'publication{}.hdf'.format(ifile)), key='pub', mode='w')

            ifile += 1

    ## load publication dataframe into a large file
    pub_files_list = glob.glob(os.path.join(self.path2database, self.path2pub_df, 'publication*.hdf'))

    pub_df = pd.DataFrame()
    print("Parsing files...")
    for tmp_pub_df in tqdm(pub_files_list, desc='PubMed publication files', leave=True, disable=not show_progress):
        pub_df = pub_df.append(pd.read_hdf(tmp_pub_df), ignore_index=True)

    return pub_df
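# Usage sketch (hypothetical helper, not part of the class): restrict the combined
# publication DataFrame returned above to a year range using the 'Year' column.
def publications_in_years(pub_df, start_year, end_year):
    return pub_df[(pub_df['Year'] >= start_year) & (pub_df['Year'] <= end_year)]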
def preprocess(self, xml_directory='RawXML', name_space='http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord', process_name=True, num_file_lines=10**6, show_progress=True):
    """
    Bulk preprocess of the Web of Science raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        The subdirectory of the data directory that contains the raw Web of Science XML files.

    name_space: str, default 'http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord'
        The XML namespace of the Web of Science full records.

    process_name: bool, default True
        If True, then when processing the raw file, the package `NameParser <https://nameparser.readthedocs.io/en/latest/>`_ will be used to split author FullNames.

    num_file_lines: int, default 10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    show_progress: bool, default True
        Show progress with processing of the data.
    """
    pub_column_names = ['PublicationId', 'Year', 'JournalId', 'Doi', 'ISSN', 'Title', 'Date', 'Volume', 'Issue', 'Pages', 'DocType', 'TeamSize']
    author_column_names = ['AuthorId', 'FullName', 'FirstName', 'LastName']

    if show_progress:
        print("Starting to preprocess the WOS database.")

    for hier_dir_type in ['publication', 'author', 'publicationauthoraffiliation', 'pub2field', 'pub2ref', 'affiliation']:
        if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
            os.mkdir(os.path.join(self.path2database, hier_dir_type))

    pub2year = {}
    pub2doctype = {}

    found_aids = set([])
    found_affiliations = {}

    ns = {"ns": name_space}

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='WOS xml files', leave=True, disable=not show_progress):

        publication_df = []
        author_df = []
        paa_df = []
        pub2field_df = []
        pub2ref_df = []
        affiliation_df = []
        field_df = []

        name, extension = os.path.splitext(xml_file_name)

        if extension == '.gz':
            with gzip.open(os.path.join(self.path2database, xml_directory, xml_file_name), 'r') as infile:
                xml_file = infile.read()
            bytesxml = BytesIO(xml_file)

        elif extension == '.xml':
            with open(os.path.join(self.path2database, xml_directory, xml_file_name), 'rb') as infile:
                xml_file = infile.read()
            bytesxml = BytesIO(xml_file)

        # extract the desired fields from the XML tree
        xmltree = etree.iterparse(bytesxml, events=('end', ), tag="{{{0}}}REC".format(name_space))

        if show_progress:
            print("{} Xml tree parsed, iterating through elements.".format(xml_file_name))

        for event, elem in xmltree:

            # scrape the publication information
            PublicationId = load_html_str(elem.xpath('./ns:UID', namespaces=ns)[0].text)

            pub_record = self._blank_wos_publication(PublicationId)

            pub_record['Title'] = load_html_str(load_xml_text(elem.xpath('./ns:static_data/ns:summary/ns:titles/ns:title[@type="item"]', namespaces=ns)))
            pub_record['JournalId'] = load_html_str(load_xml_text(elem.xpath('./ns:static_data/ns:summary/ns:titles/ns:title[@type="source"]', namespaces=ns)))

            pub_info = elem.xpath('./ns:static_data/ns:summary/ns:pub_info', namespaces=ns)[0]
            pub_record['Year'] = load_int(pub_info.get('pubyear', ''))
            pub_record['Date'] = load_html_str(pub_info.get('sortdate', ''))
            pub_record['Volume'] = load_int(pub_info.get('vol', ''))
            pub_record['Issue'] = load_int(pub_info.get('issue', ''))

            pub2year[PublicationId] = pub_record['Year']

            pub_record['Pages'] = load_html_str(load_xml_text(elem.xpath('./ns:static_data/ns:summary/ns:pub_info/ns:page', namespaces=ns), default=''))

            for ident in ['ISSN', 'Doi']:
                identobject = elem.xpath('./ns:dynamic_data/ns:cluster_related/ns:identifiers/ns:identifier[@type="{}"]'.format(ident.lower()), namespaces=ns)
                if len(identobject) > 0:
                    pub_record[ident] = load_html_str(identobject[0].get('value', ''))

            pub_record['DocType'] = load_html_str(load_xml_text(elem.xpath('./ns:static_data/ns:summary/ns:doctypes/ns:doctype', namespaces=ns)))
            pub2doctype[PublicationId] = pub_record['DocType']

            # now scrape the authors
            pub_authors = {}
            author_objects = elem.xpath('./ns:static_data/ns:summary/ns:names/ns:name[@role="author"]', namespaces=ns)
            pub_record['TeamSize'] = len(author_objects)

            for author_obj in author_objects:
                author_record = self._blank_wos_author(None)
                author_record['AuthorId'] = author_obj.get('dais_id', None)

                author_record['FullName'] = load_html_str(load_xml_text(author_obj.xpath('./ns:full_name', namespaces=ns)))
                author_record['FirstName'] = load_html_str(load_xml_text(author_obj.xpath('./ns:first_name', namespaces=ns)))
                author_record['LastName'] = load_html_str(load_xml_text(author_obj.xpath('./ns:last_name', namespaces=ns)))

                author_record['Affiliations'] = author_obj.get('addr_no', '')
                author_record['Affiliations'] = [int(single_addr_no) for single_addr_no in author_record['Affiliations'].split(' ') if len(single_addr_no) > 0]

                author_record['AuthorOrder'] = int(author_obj.get('seq_no', None))
                pub_authors[author_record['AuthorOrder']] = author_record

            #contributor_objects = elem.xpath('./ns:static_data/ns:contributors/ns:contributor/ns:name[@role="researcher_id"]', namespaces=ns)

            # scrape the affiliation addresses
            address_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:addresses/ns:address_name/ns:address_spec', namespaces=ns)
            for addr_obj in address_objects:
                addr_record = self._blank_wos_affiliation()

                organization_objects = addr_obj.xpath('./ns:organizations/ns:organization[@pref="Y"]', namespaces=ns)
                if len(organization_objects) == 0:
                    organization_objects = addr_obj.xpath('./ns:organizations/ns:organization', namespaces=ns)

                if len(organization_objects) == 0:
                    orgtext = ''
                else:
                    orgtext = organization_objects[0].text

                address_no = int(addr_obj.get('addr_no'))
                affiliation_df.append([PublicationId, address_no, orgtext])

                #if found_affiliations
                #article['addresses'][address_no] = address_info

            # scrape the subject headings and keywords
            field_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:headings/ns:heading', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'heading'] for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subheadings/ns:subheading', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'subheading'] for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="traditional"]', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'ASCA traditional subject'] for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="extended"]', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'ASCA extended subject'] for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:keywords/ns:keyword', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'keyword'] for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath('./ns:static_data/ns:item/ns:keywords_plus/ns:keyword', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'keyword plus'] for field_obj in field_objects if field_obj is not None])

            # scrape the references
            reference_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:references/ns:reference', namespaces=ns)
            for ref_obj in reference_objects:
                for ref_elem in ref_obj:
                    if ref_elem.tag == "{{{0}}}uid".format(name_space):
                        refid = load_html_str(ref_elem.text.replace('WOS:', ''))
                        pub2ref_df.append([PublicationId, refid])
                    elif ref_elem.tag == "{{{0}}}year".format(name_space):
                        pub2year[refid] = load_int(ref_elem.text)

            publication_df.append([pub_record[k] for k in pub_column_names])

            for aorder, author_record in pub_authors.items():
                if author_record['AuthorId'] is not None and author_record['AuthorId'] not in found_aids:
                    found_aids.add(author_record['AuthorId'])
                    author_df.append([author_record[k] for k in author_column_names])

                paa_df.append([PublicationId, author_record['AuthorId'], aorder, author_record['FullName']])

        self._save_dataframes(ifile, publication_df, pub_column_names, author_df, author_column_names, paa_df, pub2ref_df, affiliation_df, field_df)
        ifile += 1

    with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2year).encode('utf8'))

    with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2doctype).encode('utf8'))
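# A minimal sketch (hypothetical helper, not part of the class) for reading back the
# pub2doctype mapping written by preprocess() above; it assumes the same gzip/json
# layout used when the file was saved.
def load_pub2doctype(path2database):
    import gzip, json, os
    with gzip.open(os.path.join(path2database, 'pub2doctype.json.gz'), 'r') as infile:
        # keys are WOS publication ids, values are document type strings
        return json.loads(infile.read().decode('utf8'))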