def preprocess(self, xml_directory='RawXML', process_name=True, num_file_lines=10**6, show_progress=True, rewrite_existing=False):
    """
    Bulk preprocess of the PubMed raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        The subdirectory of the data directory that contains the raw PubMed XML files.

    process_name: bool, default True
        If True, then when processing the raw file, the package `NameParser <https://nameparser.readthedocs.io/en/latest/>`_ will be used to split author FullNames.

    num_file_lines: int, default 10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    show_progress: bool, default True
        Show progress with processing of the data.

    rewrite_existing: bool, default False
        If True, rewrite the files in the data directory.
    """
    if show_progress:
        print("Starting to preprocess the PubMed database.")

    # create the output directories if they do not yet exist
    for hier_dir_type in [self.path2pub_df, self.path2paa_df, self.path2pub2field_df, self.path2pub2ref_df, self.path2fieldinfo_df]:
        if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
            os.mkdir(os.path.join(self.path2database, hier_dir_type))

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

    # read the DTD from the local database directory
    path2database = self.path2database  # avoid referencing self inside the resolver class

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            return self.resolve_filename(os.path.join(path2database, system_url), context)

    parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

    pub2year = {}
    fieldinfo = {}

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='PubMed xml files', leave=True, disable=not show_progress):

        # check if the xml file was already parsed
        dest_file_name = os.path.join(self.path2database, self.path2pub_df, 'publication{}.hdf'.format(ifile))
        if not rewrite_existing and os.path.isfile(dest_file_name):
            ifile += 1
            continue

        publication_df = []
        paa_df = []
        pub2field_df = []
        pub2ref_df = []

        xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)

        all_pubmed_articles = xmltree.findall("/PubmedArticle")

        for article_bucket in all_pubmed_articles:

            medline = article_bucket.find("MedlineCitation")

            # scrape the publication information
            PublicationId = load_int(load_xml_text(medline.find('PMID')))

            pub_record = self._blank_pubmed_publication(PublicationId)

            article = medline.find("Article")
            pub_record['Title'] = load_html_str(load_xml_text(article.find('ArticleTitle')))
            if article.find('Pagination') is None:
                pub_record['Pages'] = None
            else:
                pub_record['Pages'] = load_html_str(load_xml_text(article.find('Pagination').find("MedlinePgn")))

            journal = article.find("Journal")
            pub_record['JournalId'] = load_html_str(load_xml_text(journal.find("Title")))
            pub_record['Volume'] = load_int(load_xml_text(journal.find("JournalIssue").find("Volume")))
            pub_record['Issue'] = load_int(load_xml_text(journal.find("JournalIssue").find("Issue")))
            pub_record['ISSN'] = load_html_str(load_xml_text(journal.find("ISSN")))

            history = article_bucket.find("PubmedData/History")
            if history is not None:
                pdate = history.find('PubMedPubDate')
                if pdate is not None:
                    pub_record['Year'] = load_int(load_xml_text(pdate.find("Year")))
                    pub_record['Month'] = load_int(load_xml_text(pdate.find("Month")))
                    pub_record['Day'] = load_int(load_xml_text(pdate.find("Day")))

            if pub_record['Year'] > 0:
                pub2year[PublicationId] = pub_record['Year']

            article_ids = article_bucket.find("PubmedData/ArticleIdList")
            if article_ids is not None:
                doi = article_ids.find('ArticleId[@IdType="doi"]')
                pub_record['Doi'] = load_xml_text(doi)

            # scrape the author information
            author_list = article.find('AuthorList')
            if author_list is not None:
                for seq, author in enumerate(author_list.findall('Author')):
                    author_record = self._blank_pubmed_author()

                    author_record['PublicationId'] = PublicationId
                    author_record['FirstName'] = load_html_str(load_xml_text(author.find("ForeName")))
                    author_record['LastName'] = load_html_str(load_xml_text(author.find("LastName")))
                    author_record['FullName'] = author_record['FirstName'] + ' ' + author_record['LastName']

                    if author.find("AffiliationInfo/Affiliation") is not None:
                        author_record['Affiliations'] = load_html_str(load_xml_text(author.find("AffiliationInfo/Affiliation")))
                        author_record['Affiliations'] = author_record['Affiliations'].replace("For a full list of the authors' affiliations please see the Acknowledgements section.", "")

                    author_record['AuthorSequence'] = seq + 1

                    paa_df.append(author_record)

                pub_record['TeamSize'] = seq + 1

            # scrape the field information (MeSH terms and chemicals)
            meshterms = medline.find("MeshHeadingList")
            if meshterms is not None:
                for term in meshterms.getchildren():
                    ui = term.find("DescriptorName").attrib.get("UI", "")
                    if len(ui) > 0:
                        pub2field_df.append([PublicationId, ui])
                        fieldinfo[ui] = [load_xml_text(term.find("DescriptorName")), 'mesh']

            chemicals = medline.find("ChemicalList")
            if chemicals is not None:
                for chemical in chemicals.findall("Chemical"):
                    ui = chemical.find("NameOfSubstance").attrib.get("UI", "")
                    if len(ui) > 0:
                        pub2field_df.append([PublicationId, ui])
                        fieldinfo[ui] = [load_xml_text(chemical.find("NameOfSubstance")), 'chem']

            # scrape the references
            references = article_bucket.find("PubmedData/ReferenceList")
            if references is not None:
                for ref in references.findall("Reference"):
                    citation = load_xml_text(ref.find("Citation"))
                    if ref.find('ArticleIdList') is not None:
                        pmid = load_int(load_xml_text(ref.find('ArticleIdList').find('ArticleId[@IdType="pubmed"]')))
                    else:
                        pmid = ""
                    pub2ref_df.append([PublicationId, pmid, citation])

            publication_df.append(pub_record)

        self._save_dataframes(ifile, publication_df, paa_df, pub2ref_df, pub2field_df)
        ifile += 1

    # save the field info dictionary if it is new or is being rewritten
    dest_file_name = os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf')
    if rewrite_existing or not os.path.isfile(dest_file_name):
        mesh_id_df_list = list(fieldinfo.values())
        for i, fieldid in enumerate(fieldinfo.keys()):
            mesh_id_df_list[i].insert(0, fieldid)

        fieldinfo_df = pd.DataFrame(mesh_id_df_list, columns=['FieldId', 'FieldName', 'FieldType'])
        fieldinfo_df.to_hdf(dest_file_name, key='fieldinfo', mode='w')

    with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2year).encode('utf8'))
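# A minimal sketch (hypothetical helper, not part of the class) showing how the
# pub2year mapping written by preprocess() above can be read back; it assumes the
# same gzip/json layout used when the file was saved.
def load_pub2year(path2database):
    import gzip, json, os
    with gzip.open(os.path.join(path2database, 'pub2year.json.gz'), 'r') as infile:
        # keys are PMIDs (strings after the JSON round-trip), values are publication years
        return json.loads(infile.read().decode('utf8'))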
def parse_fields(self, preprocess=True, num_file_lines=10**7, rewrite_existing=False, xml_directory='RawXML', show_progress=True):
    """
    Parse the PubMed field (MeSH term) raw data.

    Parameters
    ----------
    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrite the files in the data directory.

    xml_directory: str, default 'RawXML'
        The subdirectory of the data directory that contains the raw PubMed XML files.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication-Term ID DataFrame and Term ID - Term DataFrame
    """
    if preprocess:
        for hier_dir_type in [self.path2pub2field_df, self.path2fieldinfo_df]:
            if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
                os.mkdir(os.path.join(self.path2database, hier_dir_type))

        xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

        # read the DTD from the local database directory
        path2database = self.path2database  # avoid referencing self inside the resolver class

        class DTDResolver(etree.Resolver):
            def resolve(self, system_url, public_id, context):
                return self.resolve_filename(os.path.join(path2database, system_url), context)

        parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

        # global id to term mapping
        fieldinfo = {}

        ifile = 0
        for xml_file_name in tqdm(xmlfiles, desc='PubMed xml files', leave=True, disable=not show_progress):

            # check if the xml file was already parsed
            dest_file_name = os.path.join(self.path2database, self.path2pub2field_df, 'pub2field{}.hdf'.format(ifile))
            if not rewrite_existing and os.path.isfile(dest_file_name):
                ifile += 1
                continue

            pub2field_df = []

            xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
            all_pubmed_articles = xmltree.findall("/PubmedArticle")

            for article_bucket in all_pubmed_articles:

                medline = article_bucket.find("MedlineCitation")

                # scrape the publication information
                PublicationId = load_int(load_xml_text(medline.find('PMID')))

                meshterms = medline.find("MeshHeadingList")
                if meshterms is not None:
                    for term in meshterms.getchildren():
                        ui = term.find("DescriptorName").attrib.get("UI", "")
                        if len(ui) > 0:
                            pub2field_df.append([PublicationId, ui])
                            fieldinfo[ui] = [load_xml_text(term.find("DescriptorName")), 'mesh']

                chemicals = medline.find("ChemicalList")
                if chemicals is not None:
                    for chemical in chemicals.findall("Chemical"):
                        ui = chemical.find("NameOfSubstance").attrib.get("UI", "")
                        if len(ui) > 0:
                            pub2field_df.append([PublicationId, ui])
                            fieldinfo[ui] = [load_xml_text(chemical.find("NameOfSubstance")), 'chem']

            # save the publication-field id links
            pub2field_df = pd.DataFrame(pub2field_df, columns=['PublicationId', 'FieldId'])
            pub2field_df['PublicationId'] = pub2field_df['PublicationId'].astype(int)
            pub2field_df.to_hdf(os.path.join(self.path2database, self.path2pub2field_df, 'pub2field{}.hdf'.format(ifile)), key='pub2field', mode='w')

            ifile += 1

        # save the field info dictionary if it is new or is being rewritten
        dest_file_name = os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf')
        if rewrite_existing or not os.path.isfile(dest_file_name):
            mesh_id_df_list = list(fieldinfo.values())
            for i, fieldid in enumerate(fieldinfo.keys()):
                mesh_id_df_list[i].insert(0, fieldid)

            fieldinfo_df = pd.DataFrame(mesh_id_df_list, columns=['FieldId', 'FieldName', 'FieldType'])
            fieldinfo_df.to_hdf(os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf'), key='fieldinfo', mode='w')

    # load the dataframes

    # pub2field
    pub2field_files = glob.glob(os.path.join(self.path2database, self.path2pub2field_df, 'pub2field*.hdf'))
    pub2field_df = pd.DataFrame()
    for pub2field_tmp_file in tqdm(pub2field_files, desc='PubMed pub2field files', leave=True, disable=not show_progress):
        pub2field_df = pub2field_df.append(pd.read_hdf(pub2field_tmp_file), ignore_index=True)

    # field info map
    fieldinfo_df = pd.read_hdf(os.path.join(self.path2database, self.path2fieldinfo_df, 'fieldinfo.hdf'))

    return pub2field_df, fieldinfo_df
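# Usage sketch (hypothetical helper, not part of the class): attach the field names
# returned by parse_fields() to the publication-field links; column names follow the
# DataFrames constructed above.
def attach_field_names(pub2field_df, fieldinfo_df):
    # left-join the field metadata onto each (PublicationId, FieldId) pair
    return pub2field_df.merge(fieldinfo_df, on='FieldId', how='left')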
def parse_publicationauthoraffiliation(self, xml_directory='RawXML', preprocess=True, num_file_lines=10**7, rewrite_existing=False, show_progress=True):
    """
    Parse the PubMed publication-author raw data.

    Parameters
    ----------
    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrite the files in the data directory.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication-Author DataFrame.
    """
    # process author files through xml
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'publicationauthoraffiliation')):
            os.mkdir(os.path.join(self.path2database, 'publicationauthoraffiliation'))

        xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

        # read the DTD from the local database directory
        path2database = self.path2database  # avoid referencing self inside the resolver class

        class DTDResolver(etree.Resolver):
            def resolve(self, system_url, public_id, context):
                return self.resolve_filename(os.path.join(path2database, system_url), context)

        parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

        ifile = 0
        for xml_file_name in tqdm(xmlfiles, desc='PubMed author xml files', leave=True, disable=not show_progress):

            # check if the xml file was already parsed
            dest_file_name = os.path.join(self.path2database, self.path2paa_df, 'publicationauthoraffiliation{}.hdf'.format(ifile))
            if not rewrite_existing and os.path.isfile(dest_file_name):
                ifile += 1
                continue

            paa_df = []

            xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
            all_pubmed_articles = xmltree.findall("/PubmedArticle")

            for article_bucket in all_pubmed_articles:

                medline = article_bucket.find("MedlineCitation")

                # scrape the publication information
                PublicationId = load_int(load_xml_text(medline.find('PMID')))

                article = medline.find("Article")
                author_list = article.find('AuthorList')
                if author_list is not None:
                    for seq, author in enumerate(author_list.findall('Author')):
                        author_record = self._blank_pubmed_author()

                        author_record['PublicationId'] = PublicationId
                        author_record['FirstName'] = load_html_str(load_xml_text(author.find("ForeName")))
                        author_record['LastName'] = load_html_str(load_xml_text(author.find("LastName")))
                        author_record['FullName'] = author_record['FirstName'] + ' ' + author_record['LastName']

                        if author.find("AffiliationInfo/Affiliation") is not None:
                            author_record['Affiliations'] = load_html_str(load_xml_text(author.find("AffiliationInfo/Affiliation")))
                            author_record['Affiliations'] = author_record['Affiliations'].replace("For a full list of the authors' affiliations please see the Acknowledgements section.", "")

                        author_record['AuthorSequence'] = seq + 1

                        paa_df.append(author_record)

            paa_df = pd.DataFrame(paa_df)
            paa_df['AuthorSequence'] = paa_df['AuthorSequence'].astype(int)
            paa_df.to_hdf(os.path.join(self.path2database, self.path2paa_df, 'publicationauthoraffiliation{}.hdf'.format(ifile)), key='paa', mode='w')

            ifile += 1

    ## load publication author dataframe into a large file
    paa_files_list = glob.glob(os.path.join(self.path2database, self.path2paa_df, 'publicationauthoraffiliation*.hdf'))

    paa_df = pd.DataFrame()
    print("Parsing files...")
    for tmp_paa_df in tqdm(paa_files_list, desc='PubMed author files', leave=True, disable=not show_progress):
        paa_df = paa_df.append(pd.read_hdf(tmp_paa_df), ignore_index=True)

    return paa_df
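# Usage sketch (hypothetical helper, not part of the class): recover each publication's
# team size from the publication-author DataFrame returned above; AuthorSequence is
# 1-based, so its maximum per publication equals the number of listed authors.
def team_size_from_paa(paa_df):
    return paa_df.groupby('PublicationId')['AuthorSequence'].max()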
def parse_references(self, xml_directory='RawXML', preprocess=True, num_file_lines=10**7, rewrite_existing=False, show_progress=True):
    """
    Parse the PubMed References raw data.

    Parameters
    ----------
    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrite the files in the data directory.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Citations DataFrame.
    """
    # process reference files through xml
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'pub2ref')):
            os.mkdir(os.path.join(self.path2database, 'pub2ref'))

        xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

        # read the DTD from the local database directory
        path2database = self.path2database  # avoid referencing self inside the resolver class

        class DTDResolver(etree.Resolver):
            def resolve(self, system_url, public_id, context):
                return self.resolve_filename(os.path.join(path2database, system_url), context)

        parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

        ifile = 0
        for xml_file_name in tqdm(xmlfiles, desc='PubMed reference xml files', leave=True, disable=not show_progress):

            # check if the xml file was already parsed
            dest_file_name = os.path.join(self.path2database, self.path2pub2ref_df, 'pub2ref{}.hdf'.format(ifile))
            if not rewrite_existing and os.path.isfile(dest_file_name):
                ifile += 1
                continue

            pub2ref_df = []

            xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
            all_pubmed_articles = xmltree.findall("/PubmedArticle")

            for article_bucket in all_pubmed_articles:

                medline = article_bucket.find("MedlineCitation")

                # scrape the publication information
                PublicationId = load_int(load_xml_text(medline.find('PMID')))

                # the article's ReferenceList contains the publications it cites
                references = article_bucket.find("PubmedData/ReferenceList")
                if references is not None:
                    for ref in references.findall("Reference"):
                        citation = load_xml_text(ref.find("Citation"))
                        if ref.find('ArticleIdList') is not None:
                            pmid = load_int(load_xml_text(ref.find('ArticleIdList').find('ArticleId[@IdType="pubmed"]')))
                        else:
                            pmid = ""
                        pub2ref_df.append([PublicationId, pmid, citation])

            # save file
            pub2ref_df = pd.DataFrame(pub2ref_df, columns=['CitingPublicationId', 'CitedPublicationId', 'Citation'])
            pub2ref_df.to_hdf(os.path.join(self.path2database, self.path2pub2ref_df, 'pub2ref{}.hdf'.format(ifile)), key='pub2ref', mode='w')

            ifile += 1

    # load the citations into a large dataframe
    pub2ref_files = glob.glob(os.path.join(self.path2database, self.path2pub2ref_df, 'pub2ref*.hdf'))

    pub2ref_df = pd.DataFrame()
    print("parsing citation data...")
    for pub2ref_tmp in tqdm(pub2ref_files, desc='PubMed citation xml files', leave=True, disable=not show_progress):
        pub2ref_df = pub2ref_df.append(pd.read_hdf(pub2ref_tmp), ignore_index=True)

    return pub2ref_df
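# Usage sketch (hypothetical helper, not part of the class): count how many distinct
# citing publications reference each cited publication in the pub2ref DataFrame
# returned above; column names follow the schema saved in parse_references().
def citation_counts(pub2ref_df):
    return pub2ref_df.groupby('CitedPublicationId')['CitingPublicationId'].nunique()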
def parse_publications(self, xml_directory='RawXML', preprocess=True, num_file_lines=10**7, rewrite_existing=False, show_progress=True):
    """
    Parse the PubMed publication raw data.

    Parameters
    ----------
    preprocess: bool, default True
        Save the processed data in new DataFrames.

    num_file_lines: int, default 10**7
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    rewrite_existing: bool, default False
        If True, rewrite the files in the data directory.

    show_progress: bool, default True
        Show progress with processing of the data.

    Returns
    ----------
    DataFrame
        Publication metadata DataFrame.
    """
    # process publication files through xml
    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'publication')):
            os.mkdir(os.path.join(self.path2database, 'publication'))

        xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

        # read the DTD from the local database directory
        path2database = self.path2database  # avoid referencing self inside the resolver class

        class DTDResolver(etree.Resolver):
            def resolve(self, system_url, public_id, context):
                return self.resolve_filename(os.path.join(path2database, system_url), context)

        parser = etree.XMLParser(load_dtd=True, resolve_entities=True)

        ifile = 0
        for xml_file_name in tqdm(xmlfiles, desc='PubMed publication xml files', leave=True, disable=not show_progress):

            # check if the xml file was already parsed
            dest_file_name = os.path.join(self.path2database, self.path2pub_df, 'publication{}.hdf'.format(ifile))
            if not rewrite_existing and os.path.isfile(dest_file_name):
                ifile += 1
                continue

            publication_df = []

            xmltree = etree.parse(os.path.join(self.path2database, xml_directory, xml_file_name), parser)
            all_pubmed_articles = xmltree.findall("/PubmedArticle")

            for article_bucket in all_pubmed_articles:

                medline = article_bucket.find("MedlineCitation")

                # scrape the publication information
                PublicationId = load_int(load_xml_text(medline.find('PMID')))

                pub_record = self._blank_pubmed_publication(PublicationId)

                article = medline.find("Article")
                pub_record['Title'] = load_html_str(load_xml_text(article.find('ArticleTitle')))
                if article.find('Pagination') is None:
                    pub_record['Pages'] = None
                else:
                    pub_record['Pages'] = load_html_str(load_xml_text(article.find('Pagination').find("MedlinePgn")))

                journal = article.find("Journal")
                pub_record['JournalId'] = load_html_str(load_xml_text(journal.find("Title")))
                pub_record['Volume'] = load_int(load_xml_text(journal.find("JournalIssue").find("Volume")))
                pub_record['Issue'] = load_int(load_xml_text(journal.find("JournalIssue").find("Issue")))
                pub_record['ISSN'] = load_html_str(load_xml_text(journal.find("ISSN")))

                history = article_bucket.find("PubmedData/History")
                if history is not None:
                    pdate = history.find('PubMedPubDate')
                    if pdate is not None:
                        pub_record['Year'] = load_int(load_xml_text(pdate.find("Year")))
                        pub_record['Month'] = load_int(load_xml_text(pdate.find("Month")))
                        pub_record['Day'] = load_int(load_xml_text(pdate.find("Day")))

                article_ids = article_bucket.find("PubmedData/ArticleIdList")
                if article_ids is not None:
                    doi = article_ids.find('ArticleId[@IdType="doi"]')
                    pub_record['Doi'] = load_xml_text(doi)

                author_list = article.find('AuthorList')
                if author_list is not None:
                    pub_record['TeamSize'] = len(author_list.findall('Author'))

                publication_df.append(pub_record)

            # save publication dataframe
            publication_df = pd.DataFrame(publication_df)
            publication_df['PublicationId'] = publication_df['PublicationId'].astype(int)
            publication_df['Year'] = publication_df['Year'].astype(int)
            publication_df['Month'] = publication_df['Month'].astype(int)
            publication_df['Day'] = publication_df['Day'].astype(int)
            publication_df['Volume'] = pd.to_numeric(publication_df['Volume'])
            publication_df['TeamSize'] = publication_df['TeamSize'].astype(int)
            publication_df.to_hdf(os.path.join(self.path2database, self.path2pub_df, 'publication{}.hdf'.format(ifile)), key='pub', mode='w')

            ifile += 1

    ## load publication dataframe into a large file
    pub_files_list = glob.glob(os.path.join(self.path2database, self.path2pub_df, 'publication*.hdf'))

    pub_df = pd.DataFrame()
    print("Parsing files...")
    for tmp_pub_df in tqdm(pub_files_list, desc='PubMed publication files', leave=True, disable=not show_progress):
        pub_df = pub_df.append(pd.read_hdf(tmp_pub_df), ignore_index=True)

    return pub_df
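# Usage sketch (hypothetical helper, not part of the class): restrict the combined
# publication DataFrame returned above to a year range using the 'Year' column.
def publications_in_years(pub_df, start_year, end_year):
    return pub_df[(pub_df['Year'] >= start_year) & (pub_df['Year'] <= end_year)]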
def preprocess(self, xml_directory='RawXML', name_space='http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord', process_name=True, num_file_lines=10**6, show_progress=True):
    """
    Bulk preprocess of the Web of Science raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        The subdirectory of the data directory that contains the raw Web of Science XML files.

    name_space: str, default 'http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord'
        The XML namespace of the Web of Science full records.

    process_name: bool, default True
        If True, then when processing the raw file, the package `NameParser <https://nameparser.readthedocs.io/en/latest/>`_ will be used to split author FullNames.

    num_file_lines: int, default 10**6
        The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

    show_progress: bool, default True
        Show progress with processing of the data.
    """
    pub_column_names = ['PublicationId', 'Year', 'JournalId', 'Doi', 'ISSN', 'Title', 'Date', 'Volume', 'Issue', 'Pages', 'DocType', 'TeamSize']
    author_column_names = ['AuthorId', 'FullName', 'FirstName', 'LastName']

    if show_progress:
        print("Starting to preprocess the WOS database.")

    for hier_dir_type in ['publication', 'author', 'publicationauthoraffiliation', 'pub2field', 'pub2ref', 'affiliation']:
        if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
            os.mkdir(os.path.join(self.path2database, hier_dir_type))

    pub2year = {}
    pub2doctype = {}

    found_aids = set([])
    found_affiliations = {}

    ns = {"ns": name_space}

    xmlfiles = sorted([fname for fname in os.listdir(os.path.join(self.path2database, xml_directory)) if '.xml' in fname])

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='WOS xml files', leave=True, disable=not show_progress):

        publication_df = []
        author_df = []
        paa_df = []
        pub2field_df = []
        pub2ref_df = []
        affiliation_df = []
        field_df = []

        name, extension = os.path.splitext(xml_file_name)

        if extension == '.gz':
            with gzip.open(os.path.join(self.path2database, xml_directory, xml_file_name), 'r') as infile:
                xml_file = infile.read()
            bytesxml = BytesIO(xml_file)

        elif extension == '.xml':
            with open(os.path.join(self.path2database, xml_directory, xml_file_name), 'rb') as infile:
                xml_file = infile.read()
            bytesxml = BytesIO(xml_file)

        # extract the desired fields from the XML tree
        xmltree = etree.iterparse(bytesxml, events=('end', ), tag="{{{0}}}REC".format(name_space))

        if show_progress:
            print("{} Xml tree parsed, iterating through elements.".format(xml_file_name))

        for event, elem in xmltree:

            # scrape the publication information
            PublicationId = load_html_str(elem.xpath('./ns:UID', namespaces=ns)[0].text)

            pub_record = self._blank_wos_publication(PublicationId)

            pub_record['Title'] = load_html_str(load_xml_text(elem.xpath('./ns:static_data/ns:summary/ns:titles/ns:title[@type="item"]', namespaces=ns)))
            pub_record['JournalId'] = load_html_str(load_xml_text(elem.xpath('./ns:static_data/ns:summary/ns:titles/ns:title[@type="source"]', namespaces=ns)))

            pub_info = elem.xpath('./ns:static_data/ns:summary/ns:pub_info', namespaces=ns)[0]
            pub_record['Year'] = load_int(pub_info.get('pubyear', ''))
            pub_record['Date'] = load_html_str(pub_info.get('sortdate', ''))
            pub_record['Volume'] = load_int(pub_info.get('vol', ''))
            pub_record['Issue'] = load_int(pub_info.get('issue', ''))

            pub2year[PublicationId] = pub_record['Year']

            pub_record['Pages'] = load_html_str(load_xml_text(elem.xpath('./ns:static_data/ns:summary/ns:pub_info/ns:page', namespaces=ns), default=''))

            for ident in ['ISSN', 'Doi']:
                identobject = elem.xpath('./ns:dynamic_data/ns:cluster_related/ns:identifiers/ns:identifier[@type="{}"]'.format(ident.lower()), namespaces=ns)
                if len(identobject) > 0:
                    pub_record[ident] = load_html_str(identobject[0].get('value', ''))

            pub_record['DocType'] = load_html_str(load_xml_text(elem.xpath('./ns:static_data/ns:summary/ns:doctypes/ns:doctype', namespaces=ns)))
            pub2doctype[PublicationId] = pub_record['DocType']

            # now scrape the authors
            pub_authors = {}
            author_objects = elem.xpath('./ns:static_data/ns:summary/ns:names/ns:name[@role="author"]', namespaces=ns)
            pub_record['TeamSize'] = len(author_objects)

            for author_obj in author_objects:
                author_record = self._blank_wos_author(None)
                author_record['AuthorId'] = author_obj.get('dais_id', None)

                author_record['FullName'] = load_html_str(load_xml_text(author_obj.xpath('./ns:full_name', namespaces=ns)))
                author_record['FirstName'] = load_html_str(load_xml_text(author_obj.xpath('./ns:first_name', namespaces=ns)))
                author_record['LastName'] = load_html_str(load_xml_text(author_obj.xpath('./ns:last_name', namespaces=ns)))

                author_record['Affiliations'] = author_obj.get('addr_no', '')
                author_record['Affiliations'] = [int(single_addr_no) for single_addr_no in author_record['Affiliations'].split(' ') if len(single_addr_no) > 0]

                author_record['AuthorOrder'] = int(author_obj.get('seq_no', None))
                pub_authors[author_record['AuthorOrder']] = author_record

            #contributor_objects = elem.xpath('./ns:static_data/ns:contributors/ns:contributor/ns:name[@role="researcher_id"]', namespaces=ns)

            # scrape the affiliation addresses
            address_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:addresses/ns:address_name/ns:address_spec', namespaces=ns)
            for addr_obj in address_objects:
                addr_record = self._blank_wos_affiliation()

                organization_objects = addr_obj.xpath('./ns:organizations/ns:organization[@pref="Y"]', namespaces=ns)
                if len(organization_objects) == 0:
                    organization_objects = addr_obj.xpath('./ns:organizations/ns:organization', namespaces=ns)

                if len(organization_objects) == 0:
                    orgtext = ''
                else:
                    orgtext = organization_objects[0].text

                address_no = int(addr_obj.get('addr_no'))
                affiliation_df.append([PublicationId, address_no, orgtext])

                #if found_affiliations
                #article['addresses'][address_no] = address_info

            # scrape the subject headings and keywords
            field_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:headings/ns:heading', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'heading'] for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subheadings/ns:subheading', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'subheading'] for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="traditional"]', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'ASCA traditional subject'] for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="extended"]', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'ASCA extended subject'] for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:keywords/ns:keyword', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'keyword'] for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath('./ns:static_data/ns:item/ns:keywords_plus/ns:keyword', namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'keyword plus'] for field_obj in field_objects if field_obj is not None])

            # scrape the references
            reference_objects = elem.xpath('./ns:static_data/ns:fullrecord_metadata/ns:references/ns:reference', namespaces=ns)
            for ref_obj in reference_objects:
                for ref_elem in ref_obj:
                    if ref_elem.tag == "{{{0}}}uid".format(name_space):
                        refid = load_html_str(ref_elem.text.replace('WOS:', ''))
                        pub2ref_df.append([PublicationId, refid])
                    elif ref_elem.tag == "{{{0}}}year".format(name_space):
                        pub2year[refid] = load_int(ref_elem.text)

            publication_df.append([pub_record[k] for k in pub_column_names])

            for aorder, author_record in pub_authors.items():
                if author_record['AuthorId'] is not None and author_record['AuthorId'] not in found_aids:
                    found_aids.add(author_record['AuthorId'])
                    author_df.append([author_record[k] for k in author_column_names])

                paa_df.append([PublicationId, author_record['AuthorId'], aorder, author_record['FullName']])

        self._save_dataframes(ifile, publication_df, pub_column_names, author_df, author_column_names, paa_df, pub2ref_df, affiliation_df, field_df)
        ifile += 1

    with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2year).encode('utf8'))

    with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2doctype).encode('utf8'))
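# A minimal sketch (hypothetical helper, not part of the class) for reading back the
# pub2doctype mapping written by preprocess() above; it assumes the same gzip/json
# layout used when the file was saved.
def load_pub2doctype(path2database):
    import gzip, json, os
    with gzip.open(os.path.join(path2database, 'pub2doctype.json.gz'), 'r') as infile:
        # keys are WOS publication ids, values are document type strings
        return json.loads(infile.read().decode('utf8'))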