def preprocess(self, xml_file_name='dblp.xml.gz', process_name=True,
               num_file_lines=10**6, show_progress=True):
    """
    Bulk preprocess of the DBLP raw data.

    Parameters
    ----------
    :param process_name: bool, default True
        If True, then when processing the raw file, the package
        `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
        will be used to split author FullNames.

    :param xml_file_name: str, default 'dblp.xml.gz'
        The xml file name.

    :param num_file_lines: int, default 10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    :param show_progress: bool, default True
        Show progress with processing of the data.
    """
    ACCEPT_DOCTYPES = set(['article', 'inproceedings', 'proceedings', 'book',
                           'incollection', 'phdthesis', 'mastersthesis'])
    REJECT_DOCTYPES = set(['www'])
    DATA_ITEMS = ['title', 'booktitle', 'year', 'journal', 'ee', 'url',
                  'month', 'mdate', 'isbn', 'publisher']
    SKIP_FIELDS = ['note', 'cite', 'cdrom', 'crossref', 'editor', 'series',
                   'tt', 'school', 'chapter', 'address']

    doctype = {'article': 'j', 'book': 'b', '': '', 'phdthesis': 'phd',
               'proceedings': 'c', 'inproceedings': 'c', 'mastersthesis': 'ms',
               'incollection': 'c'}

    html_format_keys = ['<sub>', '</sub>', '<sup>', '</sup>', '<i>', '</i>']

    if show_progress:
        print("Starting to preprocess the DBLP database.")

    for hier_dir_type in ['publication', 'author', 'publicationauthor']:
        if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
            os.mkdir(os.path.join(self.path2database, hier_dir_type))

    publication_df = []
    author_df = []
    author2pub_df = []
    journal_df = []

    PublicationId = 1
    AuthorId = 1
    aname2aid = {}
    author_columns = ['AuthorId', 'FullName']
    if process_name:
        author_columns += ['LastName', 'FirstName', 'MiddleName']
    JournalId = 1
    jname2jid = {}

    pub_record = self._blank_dblp_publication(PublicationId)
    pub_authors = []
    AuthorCount = 0

    ifile = 0

    # read the dtd so the xml entities can be resolved
    path2database = self.path2database  # avoid closing over self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    if '.gz' in xml_file_name:
        with gzip.open(os.path.join(self.path2database, xml_file_name), 'r') as infile:
            xml_file = infile.read()
    else:
        with open(os.path.join(self.path2database, xml_file_name), 'r') as infile:
            xml_file = infile.read().encode('latin1')

    # extract the desired fields from the XML tree
    bytesxml = BytesIO(xml_file)
    xmltree = etree.iterparse(bytesxml, load_dtd=True, resolve_entities=True)
    xmltree.resolvers.add(DTDResolver())

    if show_progress:
        print("Xml tree parsed, iterating through elements.")

    last_position = 0
    xml_size = bytesxml.getbuffer().nbytes
    with tqdm(total=xml_size, unit='iB', unit_scale=True, desc='dblp.xml',
              leave=True, disable=not show_progress) as pbar:
        for event, elem in xmltree:
            if elem.tag == 'title' or elem.tag == 'booktitle':
                pub_record['Title'] = load_html_str(elem.text)
            elif elem.tag == 'year':
                pub_record['Year'] = load_int(elem.text)
            elif elem.tag == 'month':
                pub_record['Month'] = load_int(elem.text)
            elif elem.tag == 'volume':
                pub_record['Volume'] = load_int(elem.text)
            elif elem.tag == 'number':
                pub_record['Number'] = load_html_str(elem.text)
            elif elem.tag == 'pages':
                pub_record['Pages'] = load_html_str(elem.text)
            elif elem.tag == 'journal':
                pub_record['JournalId'] = load_html_str(elem.text)
            elif elem.tag == 'url':
                pub_record['URL'] = load_html_str(elem.text)
            elif elem.tag == 'ee':
                pub_record['EE'] = load_html_str(elem.text)
            elif elem.tag == 'author':
                AuthorCount += 1
                fullname = load_html_str(elem.text)
                if aname2aid.get(fullname, None) is None:
                    if process_name:
                        fullname = ''.join([i for i in fullname if not i.isdigit()]).strip()
                        hname = HumanName(fullname)
                        author_df.append([AuthorId, fullname, hname.last, hname.first, hname.middle])
                    else:
                        author_df.append([AuthorId, fullname])
                    aname2aid[fullname] = AuthorId
                    AuthorId += 1
                pub_authors.append([PublicationId, aname2aid[fullname], AuthorCount])
            elif elem.tag in ACCEPT_DOCTYPES:
                pub_record['TeamSize'] = AuthorCount
                pub_record['DocType'] = doctype[load_html_str(elem.tag)]
                publication_df.append(pub_record)
                author2pub_df.extend(pub_authors)

                PublicationId += 1
                pub_record = self._blank_dblp_publication(PublicationId)
                AuthorCount = 0
                pub_authors = []

                # update progress bar
                pbar.update(bytesxml.tell() - last_position)
                last_position = bytesxml.tell()

                if num_file_lines > 0 and (PublicationId % num_file_lines) == 0:
                    self._save_dataframes(ifile, publication_df, author_df,
                                          author_columns, author2pub_df)
                    ifile += 1
                    publication_df = []
                    author_df = []
                    author2pub_df = []
            elif elem.tag in REJECT_DOCTYPES:
                # the record was from a rejected category so reset the record
                pub_record = self._blank_dblp_publication(PublicationId)
                AuthorCount = 0
                pub_authors = []
            elif elem.tag in SKIP_FIELDS:
                pass

    del xmltree

    self._save_dataframes(ifile, publication_df, author_df, author_columns, author2pub_df)
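# Usage sketch (hypothetical names): assuming this method lives on a DBLP
# database class constructed with the directory that holds 'dblp.xml.gz',
# a typical bulk preprocess call would look like:
#
#     mydblp = DBLP(path2database='/path/to/DBLP/')   # hypothetical constructor
#     mydblp.preprocess(xml_file_name='dblp.xml.gz', process_name=True,
#                       num_file_lines=10**6, show_progress=True)
#
# The call writes 'publication', 'author', and 'publicationauthor' shards under
# path2database, each shard holding at most num_file_lines records.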
def preprocess(self, xml_directory='RawXML', process_name=True,
               num_file_lines=10**6, show_progress=True, rewrite_existing=False):
    """
    Bulk preprocess of the PubMed raw data.

    Parameters
    ----------
    process_name: bool, default True
        If True, then when processing the raw file, the package
        `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
        will be used to split author FullNames.

    num_file_lines: int, default 10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    show_progress: bool, default True
        Show progress with processing of the data.

    rewrite_existing: bool, default False
        If True, rewrites the files in the data directory.
    """
    if show_progress:
        print("Starting to preprocess the PubMed database.")

    for hier_dir_type in [self.path2pub_df, self.path2paa_df, self.path2pub2field_df,
                          self.path2pub2ref_df, self.path2fieldinfo_df]:
        if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
            os.mkdir(os.path.join(self.path2database, hier_dir_type))

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    # read the dtd so the xml entities can be resolved
    path2database = self.path2database  # avoid closing over self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

    pub2year = {}
    fieldinfo = {}

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='PubMed xml files', leave=True,
                              disable=not show_progress):

        # check if the xml file was already parsed
        dest_file_name = os.path.join(self.path2database, self.path2pub_df,
                                      'publication{}.hdf'.format(ifile))
        if not rewrite_existing and os.path.isfile(dest_file_name):
            ifile += 1
            continue

        publication_df = []
        paa_df = []
        pub2field_df = []
        pub2ref_df = []

        xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)

        all_pubmed_articles = xmltree.findall("/PubmedArticle")

        for article_bucket in all_pubmed_articles:

            medline = article_bucket.find("MedlineCitation")

            # scrape the publication information
            PublicationId = load_int(load_xml_text(medline.find('PMID')))

            pub_record = self._blank_pubmed_publication(PublicationId)

            article = medline.find("Article")
            pub_record['Title'] = load_html_str(load_xml_text(article.find('ArticleTitle')))
            if article.find('Pagination') is None:
                pub_record['Pages'] = None
            else:
                pub_record['Pages'] = load_html_str(load_xml_text(article.find('Pagination').find("MedlinePgn")))

            journal = article.find("Journal")
            pub_record['JournalId'] = load_html_str(load_xml_text(journal.find("Title")))
            pub_record['Volume'] = load_int(load_xml_text(journal.find("JournalIssue").find("Volume")))
            pub_record['Issue'] = load_int(load_xml_text(journal.find("JournalIssue").find("Issue")))
            pub_record['ISSN'] = load_html_str(load_xml_text(journal.find("ISSN")))

            history = article_bucket.find("PubmedData/History")
            if history is not None:
                pdate = history.find('PubMedPubDate')
                if pdate is not None:
                    pub_record['Year'] = load_int(load_xml_text(pdate.find("Year")))
                    pub_record['Month'] = load_int(load_xml_text(pdate.find("Month")))
                    pub_record['Day'] = load_int(load_xml_text(pdate.find("Day")))
                    if pub_record['Year'] > 0:
                        pub2year[PublicationId] = pub_record['Year']

            article_ids = article_bucket.find("PubmedData/ArticleIdList")
            if article_ids is not None:
                doi = article_ids.find('ArticleId[@IdType="doi"]')
                pub_record['Doi'] = load_xml_text(doi)

            author_list = article.find('AuthorList')
            if author_list is not None:
                for seq, author in enumerate(author_list.findall('Author')):
                    author_record = self._blank_pubmed_author()

                    author_record['PublicationId'] = PublicationId
                    author_record['FirstName'] = load_html_str(load_xml_text(author.find("ForeName")))
                    author_record['LastName'] = load_html_str(load_xml_text(author.find("LastName")))
                    author_record['FullName'] = author_record['FirstName'] + ' ' + author_record['LastName']

                    if author.find("AffiliationInfo/Affiliation") is not None:
                        author_record['Affiliations'] = load_html_str(load_xml_text(author.find("AffiliationInfo/Affiliation")))
                        author_record['Affiliations'] = author_record['Affiliations'].replace(
                            "For a full list of the authors' affiliations please see the Acknowledgements section.", "")

                    author_record['AuthorSequence'] = seq + 1

                    paa_df.append(author_record)
                pub_record['TeamSize'] = seq + 1

            meshterms = medline.find("MeshHeadingList")
            if meshterms is not None:
                for term in meshterms.getchildren():
                    ui = term.find("DescriptorName").attrib.get("UI", "")
                    if len(ui) > 0:
                        pub2field_df.append([PublicationId, ui])
                        fieldinfo[ui] = [load_xml_text(term.find("DescriptorName")), 'mesh']

            chemicals = medline.find("ChemicalList")
            if chemicals is not None:
                for chemical in chemicals.findall("Chemical"):
                    ui = chemical.find("NameOfSubstance").attrib.get("UI", "")
                    if len(ui) > 0:
                        pub2field_df.append([PublicationId, ui])
                        fieldinfo[ui] = [load_xml_text(chemical.find("NameOfSubstance")), 'chem']

            references = article_bucket.find("PubmedData/ReferenceList")
            if references is not None:
                for ref in references.findall("Reference"):
                    citation = load_xml_text(ref.find("Citation"))
                    if ref.find('ArticleIdList') is not None:
                        pmid = load_int(load_xml_text(ref.find('ArticleIdList').find('ArticleId[@IdType="pubmed"]')))
                    else:
                        pmid = ""
                    pub2ref_df.append([PublicationId, pmid, citation])

            publication_df.append(pub_record)

        self._save_dataframes(ifile, publication_df, paa_df, pub2ref_df, pub2field_df)
        ifile += 1

    # save the field info dictionary
    dest_file_name = os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf')
    if rewrite_existing or not os.path.isfile(dest_file_name):
        mesh_id_df_list = list(fieldinfo.values())
        for i, ui in enumerate(fieldinfo.keys()):
            mesh_id_df_list[i].insert(0, ui)

        fieldinfo_df = pd.DataFrame(mesh_id_df_list, columns=['FieldId', 'FieldName', 'FieldType'])
        fieldinfo_df.to_hdf(dest_file_name, key='fieldinfo', mode='w')

    with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2year).encode('utf8'))
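# A minimal sketch of reading back the pub2year map written above; the path is
# a placeholder for your own database directory. Note that JSON round-tripping
# turns the integer PMID keys into strings.
#
#     import gzip, json
#     with gzip.open('/path/to/PubMed/pub2year.json.gz', 'r') as infile:
#         pub2year = json.loads(infile.read().decode('utf8'))
#     # pub2year maps PMID (str) -> publication year (int)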
def parse_publications(self, preprocess=False, preprocess_dicts=True, pubid2int=False,
                       archive_name='aps-dataset-metadata-2019.zip', show_progress=False):
    """
    Parse the APS publication raw metadata.

    Parameters
    ----------
    preprocess: bool, default False
        Save the processed data in new DataFrames.

    preprocess_dicts: bool, default True
        Save the processed Year and DocType data as dictionaries.

    pubid2int: bool, default False
        If True, remap the DOI-based publication ids to integers.

    archive_name: str, default 'aps-dataset-metadata-2019.zip'
        The zip archive containing the raw metadata.

    show_progress: bool, default False
        Show progress with processing of the data.
    """
    archive = zipfile.ZipFile(os.path.join(self.path2database, archive_name), 'r')
    metadata_files = [fname for fname in archive.namelist()
                      if 'aps-dataset-metadata' in fname and '.json' in fname]

    # check that the archive contains the expected directory
    if len(metadata_files) > 0:
        if preprocess:
            for hier_dir_type in ['publication', 'journal', 'affiliation',
                                  'publicationauthoraffiliation', 'pub2field', 'fieldinfo']:
                if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
                    os.mkdir(os.path.join(self.path2database, hier_dir_type))

        journal_dict = {}
        journal_column_names = ['JournalId', 'FullName', 'AbbreviatedName', 'Publisher']

        pub_column_names = ['PublicationId', 'Title', 'Date', 'Year', 'Doi', 'JournalId',
                            'Volume', 'Issue', 'PageStart', 'PageEnd', 'DocType', 'TeamSize']
        pub_df = []
        pub2year = {}
        pub2doctype = {}
        pub2int = {}
        ipub = 0
        if pubid2int:
            pubintcol = ['PublicationId']
        else:
            pubintcol = []

        iaff = 0
        affil_dict = {}
        paa_df = []

        field_dict = {}
        pub2field_df = []

        for fname in tqdm(metadata_files, desc='aps-metadata', leave=True,
                          disable=not show_progress):

            # load the publication json
            pubjson = json.loads(archive.read(fname).decode('utf-8'))
            ipub += 1

            # start parsing publication information
            if pubid2int:
                pubid = ipub
                pub2int[pubjson.get('id', '')] = pubid
            else:
                pubid = pubjson.get('id', '')
            pubinfo = [pubid]
            pubinfo.append(pubjson.get('title', {}).get('value', ''))
            pubinfo.append(pubjson.get('date', ''))
            pubinfo.append(load_int(pubjson.get('date', '').split('-')[0]))
            pub2year[pubid] = pubinfo[-1]
            pubinfo.append(pubjson.get('id', ''))  # for APS the raw publication id is the doi

            # journal of publication
            journalid = pubjson.get('journal', {}).get('id', '')
            pubinfo.append(journalid)
            pubinfo.append(load_int(pubjson.get('volume', {}).get('number', '')))
            pubinfo.append(load_int(pubjson.get('issue', {}).get('number', '')))

            # add page number info
            pubinfo.append(load_int(pubjson.get('pageStart', '')))
            if not pubjson.get('pageEnd', None) is None:
                pubinfo.append(load_int(pubjson.get('pageEnd', '')))
            elif not (pubjson.get('numPages', None) is None or pubjson.get('pageStart', None) is None):
                pubinfo.append(pubinfo[-1] + load_int(pubjson.get('numPages', '')))
            else:
                pubinfo.append(None)

            # add the doctype
            pubinfo.append(pubjson.get('articleType', ''))
            pub2doctype[pubid] = pubinfo[-1]

            # calculate TeamSize
            pubinfo.append(len(pubjson.get('authors', [])))

            # finish the publication info
            pub_df.append(pubinfo)

            # check if we need to save journal information
            if journal_dict.get(journalid, None) is None:
                journal_dict[journalid] = pubjson.get('journal', {})
                journal_dict[journalid]['Publisher'] = pubjson.get('rights', {}).get(
                    'copyrightHolders', [{'name': ''}])[0].get('name', '')

            # start parsing affiliation information
            pub_affid_map = {}
            for pubaffdict in pubjson.get('affiliations', []):
                # check if the affiliation has been used before (only using string match)
                # ToDo: add disambiguation
                if affil_dict.get(pubaffdict.get('name', ''), None) is None:
                    affil_dict[pubaffdict.get('name', '')] = iaff
                    iaff += 1

                # map the affiliation to the AffiliationId
                pub_affid_map[pubaffdict.get('id', '')] = affil_dict[pubaffdict.get('name', '')]

            authorseq = 1
            # now start parsing author information
            for authordict in pubjson.get('authors', []):
                for affid in authordict.get('affiliationIds', [None]):
                    paa_df.append([pubid, authordict.get('name', ''),
                                   pub_affid_map.get(affid, None), authorseq])
                authorseq += 1

            # now do the subject classifications
            for subjectdict in pubjson.get('classificationSchemes', {}).get('subjectAreas', []):
                pub2field_df.append([pubid, subjectdict.get('id', None)])
                if field_dict.get(subjectdict.get('id', None), None) is None:
                    field_dict[subjectdict.get('id', None)] = subjectdict.get('label', None)

            # ToDo: parse concepts

        if show_progress:
            print("Parsing Complete\nSaving Publication DataFrames")

        pub_df = pd.DataFrame(pub_df, columns=pub_column_names)
        for intcol in pubintcol + ['Year']:
            pub_df[intcol] = pub_df[intcol].astype(int)

        journal_rename_dict = {'name': 'FullName', 'id': 'JournalId',
                               'abbreviatedName': 'AbbreviatedName'}
        journal_df = pd.DataFrame(journal_dict.values()).rename(columns=journal_rename_dict)

        affiliation_df = pd.DataFrame([[affid, name] for name, affid in affil_dict.items()],
                                      columns=['AffiliationId', 'Address'])

        paa_df = pd.DataFrame(paa_df, columns=['PublicationId', 'OrigAuthorName',
                                               'AffiliationId', 'AuthorSequence'])
        for intcol in pubintcol + ['AuthorSequence']:
            paa_df[intcol] = paa_df[intcol].astype(int)

        pub2field_df = pd.DataFrame(pub2field_df, columns=['PublicationId', 'FieldId'])
        for intcol in pubintcol:
            pub2field_df[intcol] = pub2field_df[intcol].astype(int)

        field_df = pd.DataFrame([[fieldid, fieldname] for fieldid, fieldname in field_dict.items()],
                                columns=['FieldId', 'FullName'])

        if preprocess:
            pub_df.to_hdf(os.path.join(self.path2database, 'publication', 'publication0.hdf'),
                          mode='w', key='publication')

            if pubid2int:
                with gzip.open(os.path.join(self.path2database, 'pub2int.json.gz'), 'w') as outfile:
                    outfile.write(json.dumps(pub2int).encode('utf8'))

            if preprocess_dicts:
                with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
                    outfile.write(json.dumps(pub2year).encode('utf8'))

                with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'), 'w') as outfile:
                    outfile.write(json.dumps(pub2doctype).encode('utf8'))

            journal_df.to_hdf(os.path.join(self.path2database, 'journal', 'journal0.hdf'),
                              mode='w', key='journal')

            affiliation_df.to_hdf(os.path.join(self.path2database, 'affiliation', 'affiliation0.hdf'),
                                  mode='w', key='affiliation')

            paa_df.to_hdf(os.path.join(self.path2database, 'publicationauthoraffiliation',
                                       'publicationauthoraffiliation0.hdf'),
                          mode='w', key='publicationauthoraffiliation')

            pub2field_df.to_hdf(os.path.join(self.path2database, 'pub2field', 'pub2field0.hdf'),
                                mode='w', key='pub2field')

            field_df.to_hdf(os.path.join(self.path2database, 'fieldinfo', 'fieldinfo0.hdf'),
                            mode='w', key='fieldinfo')

    else:
        raise FileNotFoundError(
            'The archive {0} does not contain a metadata directory: {1}.'.format(
                archive_name, 'aps-dataset-metadata'))
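# Sketch of reloading the tables saved above with pandas; the path is a
# placeholder and assumes the default directory layout created by this method.
#
#     import os
#     import pandas as pd
#     path2database = '/path/to/APS/'   # placeholder
#     pub_df = pd.read_hdf(os.path.join(path2database, 'publication', 'publication0.hdf'))
#     journal_df = pd.read_hdf(os.path.join(path2database, 'journal', 'journal0.hdf'))
#     # e.g. number of distinct publications per journal:
#     counts = pub_df.groupby('JournalId')['PublicationId'].nunique()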
def parse_publicationauthoraffiliation(self, xml_directory='RawXML', preprocess=True,
                                       num_file_lines=10**7, rewrite_existing=False,
                                       show_progress=True):
    """
    Parse the PubMed publication-author raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        The subdirectory of the database directory containing the raw xml files.

    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrites the files in the data directory.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication-Author DataFrame.
    """
    # process author files through xml
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, self.path2paa_df)):
            os.mkdir(os.path.join(self.path2database, self.path2paa_df))

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    # read the dtd so the xml entities can be resolved
    path2database = self.path2database  # avoid closing over self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='PubMed author xml files', leave=True,
                              disable=not show_progress):

        # check if the xml file was already parsed
        dest_file_name = os.path.join(self.path2database, self.path2paa_df,
                                      'publicationauthoraffiliation{}.hdf'.format(ifile))
        if not rewrite_existing and os.path.isfile(dest_file_name):
            ifile += 1
            continue

        paa_df = []

        xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
        all_pubmed_articles = xmltree.findall("/PubmedArticle")

        for article_bucket in all_pubmed_articles:

            medline = article_bucket.find("MedlineCitation")

            # scrape the publication information
            PublicationId = load_int(load_xml_text(medline.find('PMID')))

            article = medline.find("Article")
            author_list = article.find('AuthorList')

            if author_list is not None:
                for seq, author in enumerate(author_list.findall('Author')):
                    author_record = self._blank_pubmed_author()

                    author_record['PublicationId'] = PublicationId
                    author_record['FirstName'] = load_html_str(load_xml_text(author.find("ForeName")))
                    author_record['LastName'] = load_html_str(load_xml_text(author.find("LastName")))
                    author_record['FullName'] = author_record['FirstName'] + ' ' + author_record['LastName']

                    if author.find("AffiliationInfo/Affiliation") is not None:
                        author_record['Affiliations'] = load_html_str(load_xml_text(author.find("AffiliationInfo/Affiliation")))
                        author_record['Affiliations'] = author_record['Affiliations'].replace(
                            "For a full list of the authors' affiliations please see the Acknowledgements section.", "")

                    author_record['AuthorSequence'] = seq + 1

                    paa_df.append(author_record)

        paa_df = pd.DataFrame(paa_df)
        paa_df['AuthorSequence'] = paa_df['AuthorSequence'].astype(int)
        paa_df.to_hdf(dest_file_name, key='paa', mode='w')
        ifile += 1

    # load the publication-author dataframes into one large file
    paa_files_list = glob.glob(os.path.join(self.path2database, self.path2paa_df,
                                            'publicationauthoraffiliation*.hdf'))

    paa_df = pd.DataFrame()

    print("Parsing files...")
    for tmp_paa_df in tqdm(paa_files_list, desc='PubMed author files', leave=True,
                           disable=not show_progress):
        paa_df = paa_df.append(pd.read_hdf(tmp_paa_df), ignore_index=True)

    return paa_df
def parse_fields(self, preprocess=True, num_file_lines=10**7, rewrite_existing=False,
                 xml_directory='RawXML', show_progress=True):
    """
    Parse the PubMed field (mesh term) raw data.

    Parameters
    ----------
    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrites the files in the data directory.

    xml_directory: str, default 'RawXML'
        The subdirectory of the database directory containing the raw xml files.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication-Term ID DataFrame and Term ID - Term DataFrame.
    """
    if preprocess:
        for hier_dir_type in [self.path2pub2field_df, self.path2fieldinfo_df]:
            if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
                os.mkdir(os.path.join(self.path2database, hier_dir_type))

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    # read the dtd so the xml entities can be resolved
    path2database = self.path2database  # avoid closing over self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

    # global id to term mapping
    fieldinfo = {}

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='PubMed xml files', leave=True,
                              disable=not show_progress):

        # check if the xml file was already parsed
        dest_file_name = os.path.join(self.path2database, self.path2pub2field_df,
                                      'pub2field{}.hdf'.format(ifile))
        if not rewrite_existing and os.path.isfile(dest_file_name):
            ifile += 1
            continue

        pub2field_df = []

        xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
        all_pubmed_articles = xmltree.findall("/PubmedArticle")

        for article_bucket in all_pubmed_articles:

            medline = article_bucket.find("MedlineCitation")

            # scrape the publication information
            PublicationId = load_int(load_xml_text(medline.find('PMID')))

            meshterms = medline.find("MeshHeadingList")
            if meshterms is not None:
                for term in meshterms.getchildren():
                    ui = term.find("DescriptorName").attrib.get("UI", "")
                    if len(ui) > 0:
                        pub2field_df.append([PublicationId, ui])
                        fieldinfo[ui] = [load_xml_text(term.find("DescriptorName")), 'mesh']

            chemicals = medline.find("ChemicalList")
            if chemicals is not None:
                for chemical in chemicals.findall("Chemical"):
                    ui = chemical.find("NameOfSubstance").attrib.get("UI", "")
                    if len(ui) > 0:
                        pub2field_df.append([PublicationId, ui])
                        fieldinfo[ui] = [load_xml_text(chemical.find("NameOfSubstance")), 'chem']

        # save the pub-field links
        pub2field_df = pd.DataFrame(pub2field_df, columns=['PublicationId', 'FieldId'])
        pub2field_df['PublicationId'] = pub2field_df['PublicationId'].astype(int)
        pub2field_df.to_hdf(dest_file_name, key='pub2field', mode='w')
        ifile += 1

    # save the field info dictionary
    dest_file_name = os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf')
    if rewrite_existing or not os.path.isfile(dest_file_name):
        mesh_id_df_list = list(fieldinfo.values())
        for i, ui in enumerate(fieldinfo.keys()):
            mesh_id_df_list[i].insert(0, ui)

        fieldinfo_df = pd.DataFrame(mesh_id_df_list, columns=['FieldId', 'FieldName', 'FieldType'])
        fieldinfo_df.to_hdf(dest_file_name, key='fieldinfo', mode='w')

    # load the dataframes
    # pub2field
    pub2field_files = glob.glob(os.path.join(self.path2database, self.path2pub2field_df,
                                             'pub2field*.hdf'))
    pub2field_df = pd.DataFrame()
    for pub2field_tmp_file in tqdm(pub2field_files, desc='PubMed pub2field files', leave=True,
                                   disable=not show_progress):
        pub2field_df = pub2field_df.append(pd.read_hdf(pub2field_tmp_file), ignore_index=True)

    # field info map
    fieldinfo_df = pd.read_hdf(os.path.join(self.path2database, self.path2fieldinfo_df,
                                            'fieldinfo.hdf'))

    return pub2field_df, fieldinfo_df
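# Sketch: the two returned frames join on 'FieldId', e.g. to attach readable
# mesh/chemical names to each publication-field link ('pubmed' is a
# hypothetical instance of this class).
#
#     pub2field_df, fieldinfo_df = pubmed.parse_fields()
#     named = pub2field_df.merge(fieldinfo_df, on='FieldId', how='left')
#     # named has columns: PublicationId, FieldId, FieldName, FieldType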
def parse_publications(self, xml_directory='RawXML', preprocess=True,
                       num_file_lines=10**7, rewrite_existing=False, show_progress=True):
    """
    Parse the PubMed publication raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        The subdirectory of the database directory containing the raw xml files.

    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrites the files in the data directory.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication metadata DataFrame.
    """
    # process publication files through xml
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, self.path2pub_df)):
            os.mkdir(os.path.join(self.path2database, self.path2pub_df))

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    # read the dtd so the xml entities can be resolved
    path2database = self.path2database  # avoid closing over self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='PubMed publication xml files', leave=True,
                              disable=not show_progress):

        # check if the xml file was already parsed
        dest_file_name = os.path.join(self.path2database, self.path2pub_df,
                                      'publication{}.hdf'.format(ifile))
        if not rewrite_existing and os.path.isfile(dest_file_name):
            ifile += 1
            continue

        publication_df = []

        xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
        all_pubmed_articles = xmltree.findall("/PubmedArticle")

        for article_bucket in all_pubmed_articles:

            medline = article_bucket.find("MedlineCitation")

            # scrape the publication information
            PublicationId = load_int(load_xml_text(medline.find('PMID')))

            pub_record = self._blank_pubmed_publication(PublicationId)

            article = medline.find("Article")
            pub_record['Title'] = load_html_str(load_xml_text(article.find('ArticleTitle')))
            if article.find('Pagination') is None:
                pub_record['Pages'] = None
            else:
                pub_record['Pages'] = load_html_str(load_xml_text(article.find('Pagination').find("MedlinePgn")))

            journal = article.find("Journal")
            pub_record['JournalId'] = load_html_str(load_xml_text(journal.find("Title")))
            pub_record['Volume'] = load_int(load_xml_text(journal.find("JournalIssue").find("Volume")))
            pub_record['Issue'] = load_int(load_xml_text(journal.find("JournalIssue").find("Issue")))
            pub_record['ISSN'] = load_html_str(load_xml_text(journal.find("ISSN")))

            history = article_bucket.find("PubmedData/History")
            if history is not None:
                pdate = history.find('PubMedPubDate')
                if pdate is not None:
                    pub_record['Year'] = load_int(load_xml_text(pdate.find("Year")))
                    pub_record['Month'] = load_int(load_xml_text(pdate.find("Month")))
                    pub_record['Day'] = load_int(load_xml_text(pdate.find("Day")))

            article_ids = article_bucket.find("PubmedData/ArticleIdList")
            if article_ids is not None:
                doi = article_ids.find('ArticleId[@IdType="doi"]')
                pub_record['Doi'] = load_xml_text(doi)

            author_list = article.find('AuthorList')
            if author_list is not None:
                pub_record['TeamSize'] = len(author_list.findall('Author'))

            publication_df.append(pub_record)

        # save the publication dataframe
        publication_df = pd.DataFrame(publication_df)
        publication_df['PublicationId'] = publication_df['PublicationId'].astype(int)
        publication_df['Year'] = publication_df['Year'].astype(int)
        publication_df['Month'] = publication_df['Month'].astype(int)
        publication_df['Day'] = publication_df['Day'].astype(int)
        publication_df['Volume'] = pd.to_numeric(publication_df['Volume'])
        publication_df['TeamSize'] = publication_df['TeamSize'].astype(int)
        publication_df.to_hdf(dest_file_name, key='pub', mode='w')
        ifile += 1

    # load the publication dataframes into one large file
    pub_files_list = glob.glob(os.path.join(self.path2database, self.path2pub_df,
                                            'publication*.hdf'))

    pub_df = pd.DataFrame()

    print("Parsing files...")
    for tmp_pub_df in tqdm(pub_files_list, desc='PubMed publication files', leave=True,
                           disable=not show_progress):
        pub_df = pub_df.append(pd.read_hdf(tmp_pub_df), ignore_index=True)

    return pub_df
def parse_references(self, xml_directory='RawXML', preprocess=True, num_file_lines=10**7,
                     rewrite_existing=False, show_progress=True):
    """
    Parse the PubMed References raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        The subdirectory of the database directory containing the raw xml files.

    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrites the files in the data directory.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Citations DataFrame.
    """
    # process reference files through xml
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, self.path2pub2ref_df)):
            os.mkdir(os.path.join(self.path2database, self.path2pub2ref_df))

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    # read the dtd so the xml entities can be resolved
    path2database = self.path2database  # avoid closing over self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='PubMed reference xml files', leave=True,
                              disable=not show_progress):

        # check if the xml file was already parsed
        dest_file_name = os.path.join(self.path2database, self.path2pub2ref_df,
                                      'pub2ref{}.hdf'.format(ifile))
        if not rewrite_existing and os.path.isfile(dest_file_name):
            ifile += 1
            continue

        xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)

        pub2ref_df = []

        all_pubmed_articles = xmltree.findall("/PubmedArticle")

        for article_bucket in all_pubmed_articles:

            medline = article_bucket.find("MedlineCitation")

            # scrape the publication information
            PublicationId = load_int(load_xml_text(medline.find('PMID')))

            references = article_bucket.find("PubmedData/ReferenceList")
            if references is not None:
                for ref in references.findall("Reference"):
                    citation = load_xml_text(ref.find("Citation"))
                    if ref.find('ArticleIdList') is not None:
                        pmid = load_int(load_xml_text(ref.find('ArticleIdList').find('ArticleId[@IdType="pubmed"]')))
                    else:
                        pmid = ""
                    pub2ref_df.append([PublicationId, pmid, citation])

        # save the file; the parsed article is the citing side, the referenced pmid the cited side
        pub2ref_df = pd.DataFrame(pub2ref_df,
                                  columns=['CitingPublicationId', 'CitedPublicationId', 'Citation'])
        pub2ref_df.to_hdf(dest_file_name, key='pub2ref', mode='w')
        ifile += 1

    # load the citations into one large dataframe
    pub2ref_files = glob.glob(os.path.join(self.path2database, self.path2pub2ref_df,
                                           'pub2ref*.hdf'))

    pub2ref_df = pd.DataFrame()

    print("Parsing citation data...")
    for pub2ref_tmp in tqdm(pub2ref_files, desc='PubMed citation files', leave=True,
                            disable=not show_progress):
        pub2ref_df = pub2ref_df.append(pd.read_hdf(pub2ref_tmp), ignore_index=True)

    return pub2ref_df
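# Sketch: raw citation counts can be read straight off the returned edge list
# ('pubmed' is a hypothetical instance of this class).
#
#     pub2ref_df = pubmed.parse_references(show_progress=True)
#     cite_counts = pub2ref_df.groupby('CitedPublicationId')['CitingPublicationId'].nunique()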
def parse_fields(self, preprocess=False, num_file_lines=10**7, show_progress=True):
    """
    Parse the MAG Paper Field raw data.

    Parameters
    ----------
    :param preprocess: bool, default False
        Save the processed data in new DataFrames.

    :param num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    :param show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Pub2Field DataFrame.
    """
    field2get = [0, 5, 6]
    fieldnames = ['FieldId', 'FieldLevel', 'NumberPublications', 'FieldName']

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'fieldinfo')):
            os.mkdir(os.path.join(self.path2database, 'fieldinfo'))

    fieldinfo = []
    with open(os.path.join(self.path2database, 'advanced', 'FieldsOfStudy.txt'), 'r') as infile:
        for line in infile:
            sline = line.split('\t')
            fielddata = [load_int(sline[ip]) for ip in field2get] + [sline[2]]
            fieldinfo.append(fielddata)

    field_df = pd.DataFrame(fieldinfo, columns=fieldnames)
    if preprocess:
        field_df.to_hdf(os.path.join(self.path2database, 'fieldinfo', 'fieldinfo0.hdf'),
                        key='field', mode='w')

    # and now do pub2field
    paperfields = [0, 1]
    paperfieldnames = ['PublicationId', 'FieldId']

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'pub2field')):
            os.mkdir(os.path.join(self.path2database, 'pub2field'))

    file_name = os.path.join(self.path2database, 'advanced', 'PaperFieldsOfStudy.txt')

    ipaper = 0
    ifile = 0
    pub2field_info = []
    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True,
              desc='PaperFieldsOfStudy', leave=True, disable=not show_progress) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                sline = line.split('\t')
                pub2field_info.append([int(sline[ip]) for ip in paperfields])
                ipaper += 1

                # update the progress bar (sys.getsizeof approximates the bytes read)
                pbar.update(sys.getsizeof(line))

                if preprocess and ipaper % num_file_lines == 0:
                    pd.DataFrame(pub2field_info, columns=paperfieldnames).to_hdf(
                        os.path.join(self.path2database, 'pub2field',
                                     'pub2field' + str(ifile) + '.hdf'),
                        key='pub2field', mode='w')
                    ifile += 1
                    pub2field_info = []

    pub2field_df = pd.DataFrame(pub2field_info, columns=paperfieldnames)
    if preprocess:
        pub2field_df.to_hdf(os.path.join(self.path2database, 'pub2field',
                                         'pub2field' + str(ifile) + '.hdf'),
                            key='pub2field', mode='w')

    return pub2field_df
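# Sketch: in MAG's field hierarchy, FieldLevel 0 marks the top-level
# disciplines; e.g. to keep only top-level fields from the saved table
# (placeholder path):
#
#     import pandas as pd
#     fieldinfo = pd.read_hdf('/path/to/MAG/fieldinfo/fieldinfo0.hdf')
#     top_level = fieldinfo[fieldinfo['FieldLevel'] == 0]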
def parse_affiliations(self, preprocess=True, show_progress=True):
    """
    Parse the MAG Affiliation raw data.

    Parameters
    ----------
    :param preprocess: bool, default True
        Save the processed data in new DataFrames.

    :param show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Affiliation DataFrame.
    """
    affil_int_columns = [0, 7, 8]
    affil_str_columns = [3, 4, 5, 6]
    affil_float_columns = [9, 10]

    affil_column_names = ['AffiliationId', 'NumberPublications', 'NumberCitations',
                          'FullName', 'GridId', 'OfficialPage', 'WikiPage',
                          'Latitude', 'Longitude']

    file_name = os.path.join(self.path2database, 'mag', 'Affiliations.txt')

    affiliation_info = []
    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True,
              desc='Affiliations', leave=True, disable=not show_progress) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                sline = line.replace('\n', '').split('\t')
                affline = [load_int(sline[i]) for i in affil_int_columns]
                affline += [sline[i] for i in affil_str_columns]
                affline += [load_float(sline[i]) for i in affil_float_columns]
                affiliation_info.append(affline)

                # update progress bar
                pbar.update(sys.getsizeof(line))

    aff_df = pd.DataFrame(affiliation_info, columns=affil_column_names)

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'affiliation')):
            os.mkdir(os.path.join(self.path2database, 'affiliation'))

        aff_df.to_hdf(os.path.join(self.path2database, 'affiliation', 'affiliation0.hdf'),
                      key='affiliation', mode='w')

    return aff_df
def parse_publicationauthoraffiliation(self, preprocess=False, num_file_lines=10**7,
                                       show_progress=True):
    """
    Parse the MAG PublicationAuthorAffiliation raw data.

    Parameters
    ----------
    :param preprocess: bool, default False
        Save the processed data in new DataFrames.

    :param num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    :param show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        PublicationAuthorAffiliation DataFrame.
    """
    pubauthaff_int_columns = [0, 1, 2, 3]
    pubauthaff_str_columns = [4, 5]
    pub_column_names = ['PublicationId', 'AuthorId', 'AffiliationId', 'AuthorSequence',
                        'OrigAuthorName', 'OrigAffiliationName']

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'publicationauthoraffiliation')):
            os.mkdir(os.path.join(self.path2database, 'publicationauthoraffiliation'))

    file_name = os.path.join(self.path2database, 'mag', 'PaperAuthorAffiliations.txt')

    iref = 0
    ifile = 0
    pubauthaff_info = []
    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True,
              desc='PaperAuthorAffiliations', leave=True, disable=not show_progress) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                sline = line.replace('\n', '').split('\t')
                pubauthaff_info.append(
                    [load_int(sline[ip]) for ip in pubauthaff_int_columns]
                    + [sline[ip] if len(sline) > ip else '' for ip in pubauthaff_str_columns])
                iref += 1

                # update progress bar
                pbar.update(sys.getsizeof(line))

                if preprocess and iref % num_file_lines == 0:
                    pd.DataFrame(pubauthaff_info, columns=pub_column_names).to_hdf(
                        os.path.join(self.path2database, 'publicationauthoraffiliation',
                                     'publicationauthoraffiliation{}.hdf'.format(ifile)),
                        key='publicationauthoraffiliation', mode='w')
                    ifile += 1
                    pubauthaff_info = []

    paa_df = pd.DataFrame(pubauthaff_info, columns=pub_column_names)
    if preprocess:
        paa_df.to_hdf(os.path.join(self.path2database, 'publicationauthoraffiliation',
                                   'publicationauthoraffiliation{}.hdf'.format(ifile)),
                      key='publicationauthoraffiliation', mode='w')

    return paa_df
def parse_references(self, preprocess=False, num_file_lines=10**7, show_progress=True):
    """
    Parse the MAG References raw data.

    Parameters
    ----------
    :param preprocess: bool, default False
        Save the processed data in new DataFrames.

    :param num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    :param show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Pub2Ref DataFrame.
    """
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'pub2ref')):
            os.mkdir(os.path.join(self.path2database, 'pub2ref'))

    file_name = os.path.join(self.path2database, 'mag', 'PaperReferences.txt')

    iref = 0
    ifile = 0
    pub2ref_info = []
    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True, unit_divisor=1024,
              desc='References', leave=True, disable=not show_progress) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                # split the line and keep only the relevant columns
                sline = line.replace('\n', '').split('\t')
                pub2ref_info.append([load_int(sline[ip]) for ip in range(2)])
                iref += 1

                # update progress bar
                pbar.update(sys.getsizeof(line))

                if preprocess and iref % num_file_lines == 0:
                    pd.DataFrame(pub2ref_info,
                                 columns=['CitingPublicationId', 'CitedPublicationId']).to_hdf(
                        os.path.join(self.path2database, 'pub2ref',
                                     'pub2ref{}.hdf'.format(ifile)),
                        key='pub2ref', mode='w')
                    ifile += 1
                    pub2ref_info = []

    pub2ref_df = pd.DataFrame(pub2ref_info, columns=['CitingPublicationId', 'CitedPublicationId'])
    if preprocess:
        pub2ref_df.to_hdf(os.path.join(self.path2database, 'pub2ref',
                                       'pub2ref{}.hdf'.format(ifile)),
                          key='pub2ref', mode='w')

    return pub2ref_df
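# Sketch of reloading all pub2ref shards written above into one frame; the
# path is a placeholder for your MAG directory.
#
#     import glob, os
#     import pandas as pd
#     shards = glob.glob(os.path.join('/path/to/MAG/', 'pub2ref', 'pub2ref*.hdf'))
#     pub2ref_df = pd.concat([pd.read_hdf(f) for f in shards], ignore_index=True)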
def parse_publications(self, preprocess=True, num_file_lines=5 * 10**6,
                       preprocess_dicts=True, show_progress=True):
    """
    Parse the MAG Publication and Journal raw data.

    Parameters
    ----------
    :param preprocess: bool, default True
        Save the processed data in new DataFrames.

    :param num_file_lines: int, default 5*10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    :param preprocess_dicts: bool, default True
        Save the processed Year and DocType data as dictionaries.

    :param show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication DataFrame.
    """
    # first do the journal information
    journal_str_col = [2, 4, 5, 6]
    journal_column_names = ['JournalId', 'FullName', 'Issn', 'Publisher', 'Webpage']

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'journal')):
            os.mkdir(os.path.join(self.path2database, 'journal'))

    file_name = os.path.join(self.path2database, 'mag', 'Journals.txt')

    journal_info = []
    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True,
              desc='Journals', leave=True, disable=not show_progress) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                # split the line and keep only the relevant columns
                sline = line.replace('\n', '').split('\t')
                jline = [load_int(sline[0])] + [sline[i] for i in journal_str_col]
                journal_info.append(jline)

                # update progress bar
                pbar.update(sys.getsizeof(line))

    journal_df = pd.DataFrame(journal_info, columns=journal_column_names)
    if preprocess:
        journal_df.to_hdf(os.path.join(self.path2database, 'journal', 'journal.hdf'),
                          key='journal', mode='w')

    # now let's do the publication information
    doctype = {'Journal': 'j', 'Book': 'b', '': '', 'BookChapter': 'bc',
               'Conference': 'c', 'Dataset': 'd', 'Patent': 'p', 'Repository': 'r'}

    pub_int_columns = [0, 7, 10, 21]
    pub_str_columns = [2, 4, 8, 13, 14]
    pub_column_names = ['PublicationId', 'Year', 'JournalId', 'FamilyId', 'Doi',
                        'Title', 'Date', 'Volume', 'Issue', 'DocType']

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'publication')):
            os.mkdir(os.path.join(self.path2database, 'publication'))

    file_name = os.path.join(self.path2database, 'mag', 'Papers.txt')

    ipub = 0
    ifile = 0
    pubinfo = []

    pub2year = {}
    pub2doctype = {}

    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True,
              desc='Publications', leave=True, disable=not show_progress) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                # split the line and keep only the relevant columns
                sline = line.replace('\n', '').split('\t')
                pline = ([load_int(sline[ip]) for ip in pub_int_columns]
                         + [sline[ip] for ip in pub_str_columns]
                         + [doctype[sline[3]]])
                pub2year[pline[0]] = pline[1]
                if doctype[sline[3]] != '':
                    pub2doctype[pline[0]] = doctype[sline[3]]

                pubinfo.append(pline)
                ipub += 1

                # update progress bar
                pbar.update(sys.getsizeof(line))

                if preprocess and ipub % num_file_lines == 0:
                    pd.DataFrame(pubinfo, columns=pub_column_names).to_hdf(
                        os.path.join(self.path2database, 'publication',
                                     'publication{}.hdf'.format(ifile)),
                        key='publication', mode='w')
                    ifile += 1
                    pubinfo = []

    pub_df = pd.DataFrame(pubinfo, columns=pub_column_names)
    if preprocess:
        pub_df.to_hdf(os.path.join(self.path2database, 'publication',
                                   'publication{}.hdf'.format(ifile)),
                      key='publication', mode='w')

        if preprocess_dicts:
            with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
                outfile.write(json.dumps(pub2year).encode('utf8'))

            with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'), 'w') as outfile:
                outfile.write(json.dumps(pub2doctype).encode('utf8'))

    return pub_df
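# Sketch: the saved pub2doctype map inverts to the single-letter codes defined
# above ('j', 'b', 'bc', 'c', 'd', 'p', 'r'); e.g. to keep journal articles
# only (placeholder path; JSON turns the integer ids into string keys):
#
#     import gzip, json
#     with gzip.open('/path/to/MAG/pub2doctype.json.gz', 'r') as infile:
#         pub2doctype = json.loads(infile.read().decode('utf8'))
#     journal_pubs = [pid for pid, dt in pub2doctype.items() if dt == 'j']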
def parse_authors(self, preprocess=False, process_name=True, num_file_lines=5 * 10**6,
                  show_progress=True):
    """
    Parse the MAG Author raw data.

    Parameters
    ----------
    :param preprocess: bool, default False
        Save the processed data in new DataFrames.

    :param process_name: bool, default True
        If True, then when processing the raw file, the package
        `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
        will be used to split author FullNames.

    :param num_file_lines: int, default 5*10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    :param show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Author DataFrame.
    """
    author_int_columns = [0, 4, 5, 6]

    author_column_names = ['AuthorId', 'LastKnownAffiliationId', 'NumberPublications',
                           'NumberCitations', 'FullName']
    if process_name:
        author_column_names += ['LastName', 'FirstName', 'MiddleName']

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'author')):
            os.mkdir(os.path.join(self.path2database, 'author'))

    file_name = os.path.join(self.path2database, 'mag', 'Authors.txt')

    iauthor = 0
    ifile = 0
    authorinfo = []
    with tqdm(total=os.path.getsize(file_name), unit='iB', unit_scale=True,
              desc='Authors', disable=not show_progress, leave=True) as pbar:
        with open(file_name, 'r') as infile:
            for line in infile:
                # split the line and keep only the relevant columns
                sline = line.split('\t')
                adata = [load_int(sline[ip]) for ip in author_int_columns] + [sline[2]]

                # process the first, middle, and last names for the author
                if process_name:
                    hname = HumanName(unicodedata.normalize('NFD', sline[2]))
                    adata += [hname.last, hname.first, hname.middle]

                authorinfo.append(adata)
                iauthor += 1

                # update progress bar
                pbar.update(sys.getsizeof(line))

                # time to save
                if preprocess and iauthor % num_file_lines == 0:
                    pd.DataFrame(authorinfo, columns=author_column_names).to_hdf(
                        os.path.join(self.path2database, 'author',
                                     'author{}.hdf'.format(ifile)),
                        key='author', mode='w')
                    ifile += 1
                    authorinfo = []

    author_df = pd.DataFrame(authorinfo, columns=author_column_names)
    if preprocess:
        author_df.to_hdf(os.path.join(self.path2database, 'author',
                                      'author{}.hdf'.format(ifile)),
                         key='author', mode='w')

    return author_df
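# Sketch of the name splitting performed above with nameparser's HumanName,
# shown on a standalone string:
#
#     from nameparser import HumanName
#     hname = HumanName('Jorge Luis Borges')
#     hname.first, hname.middle, hname.last   # -> ('Jorge', 'Luis', 'Borges')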
def preprocess(self, xml_directory='RawXML',
               name_space='http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord',
               process_name=True, num_file_lines=10**6, show_progress=True):
    """
    Bulk preprocess of the Web of Science raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        The subdirectory of the database directory containing the raw xml files.

    name_space: str, default 'http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord'
        The xml namespace used by the WOS records.

    process_name: bool, default True
        If True, then when processing the raw file, the package
        `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
        will be used to split author FullNames.

    num_file_lines: int, default 10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    show_progress: bool, default True
        Show progress with processing of the data.
    """
    pub_column_names = ['PublicationId', 'Year', 'JournalId', 'Doi', 'ISSN', 'Title',
                        'Date', 'Volume', 'Issue', 'Pages', 'DocType', 'TeamSize']
    author_column_names = ['AuthorId', 'FullName', 'FirstName', 'LastName']

    if show_progress:
        print("Starting to preprocess the WOS database.")

    for hier_dir_type in ['publication', 'author', 'publicationauthoraffiliation',
                          'pub2field', 'pub2ref', 'affiliation']:
        if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
            os.mkdir(os.path.join(self.path2database, hier_dir_type))

    pub2year = {}
    pub2doctype = {}

    found_aids = set([])
    found_affiliations = {}

    ns = {"ns": name_space}
    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='WOS xml files', leave=True,
                              disable=not show_progress):

        publication_df = []
        author_df = []
        paa_df = []
        pub2field_df = []
        pub2ref_df = []
        affiliation_df = []
        field_df = []

        name, extension = os.path.splitext(xml_file_name)

        if extension == '.gz':
            with gzip.open(os.path.join(self.path2database, xml_directory, xml_file_name), 'r') as infile:
                xml_file = infile.read()
        elif extension == '.xml':
            with open(os.path.join(self.path2database, xml_directory, xml_file_name), 'rb') as infile:
                xml_file = infile.read()
        bytesxml = BytesIO(xml_file)

        # extract the desired fields from the XML tree
        xmltree = etree.iterparse(bytesxml, events=('end',), tag="{{{0}}}REC".format(name_space))

        if show_progress:
            print("{} Xml tree parsed, iterating through elements.".format(xml_file_name))

        for event, elem in xmltree:
            # scrape the publication information
            PublicationId = load_html_str(elem.xpath('./ns:UID', namespaces=ns)[0].text)
            pub_record = self._blank_wos_publication(PublicationId)

            pub_record['Title'] = load_html_str(load_xml_text(elem.xpath(
                './ns:static_data/ns:summary/ns:titles/ns:title[@type="item"]', namespaces=ns)))
            pub_record['JournalId'] = load_html_str(load_xml_text(elem.xpath(
                './ns:static_data/ns:summary/ns:titles/ns:title[@type="source"]', namespaces=ns)))

            pub_info = elem.xpath('./ns:static_data/ns:summary/ns:pub_info', namespaces=ns)[0]
            pub_record['Year'] = load_int(pub_info.get('pubyear', ''))
            pub_record['Date'] = load_html_str(pub_info.get('sortdate', ''))
            pub_record['Volume'] = load_int(pub_info.get('vol', ''))
            pub_record['Issue'] = load_int(pub_info.get('issue', ''))
            pub2year[PublicationId] = pub_record['Year']

            pub_record['Pages'] = load_html_str(load_xml_text(elem.xpath(
                './ns:static_data/ns:summary/ns:pub_info/ns:page', namespaces=ns), default=''))

            for ident in ['ISSN', 'Doi']:
                identobject = elem.xpath(
                    './ns:dynamic_data/ns:cluster_related/ns:identifiers/ns:identifier[@type="{}"]'.format(ident.lower()),
                    namespaces=ns)
                if len(identobject) > 0:
                    pub_record[ident] = load_html_str(identobject[0].get('value', ''))

            pub_record['DocType'] = load_html_str(load_xml_text(elem.xpath(
                './ns:static_data/ns:summary/ns:doctypes/ns:doctype', namespaces=ns)))
            pub2doctype[PublicationId] = pub_record['DocType']

            # now scrape the authors
            pub_authors = {}
            author_objects = elem.xpath(
                './ns:static_data/ns:summary/ns:names/ns:name[@role="author"]', namespaces=ns)
            pub_record['TeamSize'] = len(author_objects)

            for author_obj in author_objects:
                author_record = self._blank_wos_author(None)
                author_record['AuthorId'] = author_obj.get('dais_id', None)

                author_record['FullName'] = load_html_str(load_xml_text(
                    author_obj.xpath('./ns:full_name', namespaces=ns)))
                author_record['FirstName'] = load_html_str(load_xml_text(
                    author_obj.xpath('./ns:first_name', namespaces=ns)))
                author_record['LastName'] = load_html_str(load_xml_text(
                    author_obj.xpath('./ns:last_name', namespaces=ns)))

                author_record['Affiliations'] = author_obj.get('addr_no', '')
                author_record['Affiliations'] = [int(single_addr_no) for single_addr_no
                                                 in author_record['Affiliations'].split(' ')
                                                 if len(single_addr_no) > 0]

                author_record['AuthorOrder'] = int(author_obj.get('seq_no', None))
                pub_authors[author_record['AuthorOrder']] = author_record

            #contributor_objects = elem.xpath('./ns:static_data/ns:contributors/ns:contributor/ns:name[@role="researcher_id"]', namespaces=ns)

            address_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:addresses/ns:address_name/ns:address_spec',
                namespaces=ns)
            for addr_obj in address_objects:
                addr_record = self._blank_wos_affiliation()

                organization_objects = addr_obj.xpath(
                    './ns:organizations/ns:organization[@pref="Y"]', namespaces=ns)
                if len(organization_objects) == 0:
                    organization_objects = addr_obj.xpath(
                        './ns:organizations/ns:organization', namespaces=ns)
                if len(organization_objects) == 0:
                    orgtext = ''
                else:
                    orgtext = organization_objects[0].text

                address_no = int(addr_obj.get('addr_no'))
                affiliation_df.append([PublicationId, address_no, orgtext])
                #if found_affiliations
                #article['addresses'][address_no] = address_info

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:headings/ns:heading',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'heading']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subheadings/ns:subheading',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'subheading']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="traditional"]',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'ASCA traditional subject']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="extended"]',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'ASCA extended subject']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:keywords/ns:keyword', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'keyword']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:item/ns:keywords_plus/ns:keyword', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'keyword plus']
                             for field_obj in field_objects if field_obj is not None])

            reference_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:references/ns:reference', namespaces=ns)
            for ref_obj in reference_objects:
                refid = None
                for ref_elem in ref_obj:
                    if ref_elem.tag == "{{{0}}}uid".format(name_space):
                        refid = load_html_str(ref_elem.text.replace('WOS:', ''))
                        pub2ref_df.append([PublicationId, refid])
                    elif ref_elem.tag == "{{{0}}}year".format(name_space) and refid is not None:
                        pub2year[refid] = load_int(ref_elem.text)

            publication_df.append([pub_record[k] for k in pub_column_names])

            for aorder, author_record in pub_authors.items():
                if author_record['AuthorId'] is not None and author_record['AuthorId'] not in found_aids:
                    found_aids.add(author_record['AuthorId'])
                    author_df.append([author_record[k] for k in author_column_names])

                paa_df.append([PublicationId, author_record['AuthorId'], aorder,
                               author_record['FullName']])

        self._save_dataframes(ifile, publication_df, pub_column_names,
                              author_df, author_column_names,
                              paa_df, pub2ref_df, affiliation_df, field_df)
        ifile += 1

    with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2year).encode('utf8'))

    with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2doctype).encode('utf8'))
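# Minimal standalone sketch of the namespace-qualified xpath pattern used
# above, on a toy record (toy namespace chosen for brevity):
#
#     from lxml import etree
#     ns = {'ns': 'http://example.com/wok'}   # toy namespace
#     rec = etree.fromstring(
#         '<REC xmlns="http://example.com/wok"><UID>WOS:1</UID></REC>')
#     uid = rec.xpath('./ns:UID', namespaces=ns)[0].text   # -> 'WOS:1'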