def _convert_wos_record(record, ns):
    """
    Takes an XML tree of a single WoS record and returns a dictionary
    representing the record's information.

    Args:
        record: An XML tree, where the root is the 'REC' tag
        ns: A dictionary that contains the key 'ns' whose value is the null
            XML namespace.

    Returns:
        A dictionary representing the record's information with the
        following keys and value types:
            wosid: unicode
            title: unicode
            journal: unicode
            issue: unicode (None if pub_info has no 'issue' attribute)
            volume: unicode (None if pub_info has no 'vol' attribute)
            pubdate: int built by concatenating the year, month and day
                groups of the 'sortdate' attribute; the key is only present
                when 'sortdate' is set and matches _date_re
            institutions: {int: (unicode, [unicode])}
            authors: [(unicode, [int])]; the second element is None when
                the name tag has no 'addr_no' attribute
            citcount: int; the key is only present when the record has a
                WOS silo_tc tag with a local_count

    Raises:
        IndexError: if the record has no pub_info or names element.
        KeyError: if the names element has no 'count' attribute.
    """
    r = dict()
    r['wosid'] = xpath_str(record, "ns:UID/text()", ns)
    r['title'] = xpath_str(
        record,
        "ns:static_data/ns:summary/ns:titles/ns:title[@type='item']/text()",
        ns)
    r['journal'] = xpath_str(
        record,
        "ns:static_data/ns:summary/ns:titles/ns:title[@type='source']/text()",
        ns)

    pubinfo = record.xpath("ns:static_data/ns:summary/ns:pub_info",
                           namespaces=ns)[0]
    r['issue'] = pubinfo.attrib.get('issue')
    r['volume'] = pubinfo.attrib.get('vol')
    pubdate = pubinfo.attrib.get('sortdate')
    if pubdate:
        m = _date_re.match(pubdate)
        # A sort date that does not match the pattern is treated like a
        # missing one instead of failing on m.group().
        if m:
            r['pubdate'] = int(
                m.group('yr') + m.group('mon') + m.group('day'))

    r['institutions'] = {}
    for institution_tag in record.xpath(
            "ns:static_data/ns:fullrecord_metadata/ns:addresses"
            "/ns:address_name/ns:address_spec",
            namespaces=ns):
        index = int(institution_tag.attrib['addr_no'])
        address = xpath_str(institution_tag, "ns:full_address/text()", ns)
        organizations = xpath_strs(
            institution_tag, "ns:organizations/ns:organization/text()", ns)
        r['institutions'][index] = (address, organizations)

    r['authors'] = []
    num_authors = int(
        record.xpath("ns:static_data/ns:summary/ns:names",
                     namespaces=ns)[0].attrib['count'])
    # Look names up by seq_no so that authors are stored in sequence order.
    for i in range(1, num_authors + 1):
        author_tags = record.xpath(
            "ns:static_data/ns:summary/ns:names/ns:name[@seq_no='%d']" % i,
            namespaces=ns)
        if not author_tags:
            # 'count' may exceed the highest seq_no present; skip the gap.
            continue
        author_tag = author_tags[0]
        author_name = xpath_str(author_tag, "ns:wos_standard/text()", ns)
        if author_name is None:
            continue
        addr_no = author_tag.attrib.get('addr_no')
        if addr_no is None:
            affiliation_indices = None
        else:
            # Build a real list: map() would be a one-shot iterator on
            # Python 3. split() also tolerates repeated spaces.
            affiliation_indices = [int(n) for n in addr_no.split()]
        r['authors'].append((author_name, affiliation_indices))

    cittag = record.xpath(
        "ns:dynamic_data/ns:citation_related/ns:tc_list"
        "/ns:silo_tc[@coll_id='WOS']/@local_count",
        namespaces=ns)
    if cittag:
        r['citcount'] = int(cittag[0])
    return r
def _convert_wos_record(record, ns):
    """
    Takes an XML tree of a single WoS record and returns a dictionary
    representing the record's information.

    Args:
        record: An XML tree, where the root is the 'REC' tag
        ns: A dictionary that contains the key 'ns' whose value is the null
            XML namespace.

    Returns:
        A dictionary representing the record's information with the
        following keys and value types:
            wosid: unicode
            title: unicode
            journal: unicode
            issue: unicode (None if pub_info has no 'issue' attribute)
            volume: unicode (None if pub_info has no 'vol' attribute)
            pubdate: int built by concatenating the year, month and day
                groups of the 'sortdate' attribute; the key is only present
                when 'sortdate' is set and matches _date_re
            institutions: {int: (unicode, [unicode])}
            authors: [(unicode, [int])]; the second element is None when
                the name tag has no 'addr_no' attribute
            citcount: int; the key is only present when the record has a
                WOS silo_tc tag with a local_count

    Raises:
        IndexError: if the record has no pub_info or names element.
        KeyError: if the names element has no 'count' attribute.
    """
    r = dict()
    r['wosid'] = xpath_str(record, "ns:UID/text()", ns)
    r['title'] = xpath_str(
        record,
        "ns:static_data/ns:summary/ns:titles/ns:title[@type='item']/text()",
        ns)
    r['journal'] = xpath_str(
        record,
        "ns:static_data/ns:summary/ns:titles/ns:title[@type='source']/text()",
        ns)

    pubinfo = record.xpath("ns:static_data/ns:summary/ns:pub_info",
                           namespaces=ns)[0]
    r['issue'] = pubinfo.attrib.get('issue')
    r['volume'] = pubinfo.attrib.get('vol')
    pubdate = pubinfo.attrib.get('sortdate')
    # Only set 'pubdate' when the sort date exists and matches the pattern;
    # an unmatched value must not fail on m.group().
    m = _date_re.match(pubdate) if pubdate else None
    if m:
        r['pubdate'] = int(m.group('yr') + m.group('mon') + m.group('day'))

    institutions = {}
    for institution_tag in record.xpath(
            "ns:static_data/ns:fullrecord_metadata/ns:addresses"
            "/ns:address_name/ns:address_spec",
            namespaces=ns):
        index = int(institution_tag.attrib['addr_no'])
        address = xpath_str(institution_tag, "ns:full_address/text()", ns)
        organizations = xpath_strs(
            institution_tag, "ns:organizations/ns:organization/text()", ns)
        institutions[index] = (address, organizations)
    r['institutions'] = institutions

    authors = []
    names_tag = record.xpath("ns:static_data/ns:summary/ns:names",
                             namespaces=ns)[0]
    num_authors = int(names_tag.attrib['count'])
    # Names are fetched one seq_no at a time to keep sequence order.
    for i in range(1, num_authors + 1):
        author_tags = record.xpath(
            "ns:static_data/ns:summary/ns:names/ns:name[@seq_no='%d']" % i,
            namespaces=ns)
        if not author_tags:
            # No name carries this seq_no; skip rather than IndexError.
            continue
        author_tag = author_tags[0]
        author_name = xpath_str(author_tag, "ns:wos_standard/text()", ns)
        if author_name is None:
            continue
        affiliation_indices = None
        if 'addr_no' in author_tag.attrib:
            # A list comprehension yields a list on Python 2 and 3 alike
            # (map() is lazy on 3); split() tolerates repeated spaces.
            affiliation_indices = [
                int(n) for n in author_tag.attrib['addr_no'].split()
            ]
        authors.append((author_name, affiliation_indices))
    r['authors'] = authors

    cittag = record.xpath(
        "ns:dynamic_data/ns:citation_related/ns:tc_list"
        "/ns:silo_tc[@coll_id='WOS']/@local_count",
        namespaces=ns)
    if cittag:
        r['citcount'] = int(cittag[0])
    return r
def _article_to_pubmed_ref(article):
    '''Convert PubMed XML data about an article into a ref (dictionary
    containing the article data).

    The returned dictionary will contain this:
    {
        "pmid": a string containing the article's PMID
        "authors": a list of tuples (string, integer), where the first
            element is the author's name ("LastName Initials") and the
            second is the key of the author's affiliation in
            "institutions" if known (otherwise it's None); authors
            lacking a last name or initials are skipped
        "institutions": a dictionary, where the key is an integer and the
            value is a tuple (affiliation text, None)
        "title": the article's title
        "pubdate": an integer of the form 19850726 (i.e., 1985/07/26);
            a missing month gives 0000 for month and day, a missing day
            gives 00; None when no year is available
        "year": the publication year exactly as returned by xpath_str
            (it is not converted to an integer), or None when the
            article has no "pubmed" PubMedPubDate element
        "journal": a string
        "grantagencies": a list of strings
        "pubtypes": a list of strings specifying the publication types as
            per PubMed
        "meshterms": a nested list of strings specifying the MeSH terms as
            per PubMed
    }'''
    r = {}
    r['pmid'] = xpath_str(
        article,
        'PubmedData/ArticleIdList/ArticleId[@IdType=\'pubmed\']/text()')

    institutions = {}
    authors = []
    for author in article.xpath('MedlineCitation/Article/AuthorList/Author'):
        lastname = xpath_str(author, 'LastName/text()')
        initials = xpath_str(author, 'Initials/text()')
        if not (lastname and initials):
            continue
        name = lastname + u' ' + initials
        institution_address = xpath_str(author, 'Affiliation/text()')
        institution_index = None
        if institution_address:
            # Every affiliated author gets a fresh 1-based index.
            institution_index = len(institutions) + 1
            institutions[institution_index] = (institution_address, None)
        authors.append((name, institution_index))
    r['authors'] = authors
    r['institutions'] = institutions

    r['title'] = xpath_str(article,
                           'MedlineCitation/Article/ArticleTitle/text()')

    pubdate_str = u''
    pubdate_yr = None
    pubdate_elems = article.xpath(
        'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]')
    if pubdate_elems:
        # Without this guard an article lacking the "pubmed" history entry
        # raised IndexError; it now falls through to pubdate = None.
        pubdate_elem = pubdate_elems[0]
        pubdate_yr = xpath_str(pubdate_elem, 'Year/text()')
        if pubdate_yr:
            pubdate_str += pubdate_yr
            pubdate_mon = xpath_str(pubdate_elem, 'Month/text()')
            if pubdate_mon:
                pubdate_str += '%02d' % int(pubdate_mon)
                pubdate_day = xpath_str(pubdate_elem, 'Day/text()')
                if pubdate_day:
                    pubdate_str += '%02d' % int(pubdate_day)
                else:
                    pubdate_str += '00'
            else:
                # The day is not looked up when the month is missing.
                pubdate_str += '0000'
    r['pubdate'] = int(pubdate_str) if pubdate_str else None
    r['year'] = pubdate_yr

    r['journal'] = xpath_str(
        article, 'MedlineCitation/MedlineJournalInfo/MedlineTA/text()')
    r['grantagencies'] = xpath_strs(
        article,
        'MedlineCitation/Article/GrantList[last()]/Grant/Agency/text()')
    r['pubtypes'] = xpath_strs(
        article,
        'MedlineCitation/Article/PublicationTypeList/PublicationType/text()')

    # One inner list per MeshHeading: its descriptor and qualifier names.
    r['meshterms'] = [
        xpath_strs(meshheading,
                   'DescriptorName/text() | QualifierName/text()')
        for meshheading in article.xpath(
            'MedlineCitation/MeshHeadingList/MeshHeading')
    ]
    return r
def _article_to_pubmed_ref(article):
    '''Convert PubMed XML data about an article into a ref (dictionary
    containing the article data).

    The returned dictionary will contain this:
    {
        "pmid": a string containing the article's PMID
        "authors": a list of tuples (string, integer), where the first
            element is the author's name ("LastName Initials") and the
            second is the key of the author's affiliation in
            "institutions" if known (otherwise it's None); authors
            lacking a last name or initials are skipped
        "institutions": a dictionary, where the key is an integer and the
            value is a tuple (affiliation text, None)
        "title": the article's title
        "pubdate": an integer of the form 19850726 (i.e., 1985/07/26);
            a missing month gives 0000 for month and day, a missing day
            gives 00; None when no year is available
        "year": the publication year exactly as returned by xpath_str
            (it is not converted to an integer), or None when the
            article has no "pubmed" PubMedPubDate element
        "journal": a string
        "grantagencies": a list of strings
        "pubtypes": a list of strings specifying the publication types as
            per PubMed
        "meshterms": a nested list of strings specifying the MeSH terms as
            per PubMed
    }'''
    r = {}
    r['pmid'] = xpath_str(
        article,
        'PubmedData/ArticleIdList/ArticleId[@IdType=\'pubmed\']/text()')

    institutions = {}
    authors = []
    for author in article.xpath('MedlineCitation/Article/AuthorList/Author'):
        lastname = xpath_str(author, 'LastName/text()')
        initials = xpath_str(author, 'Initials/text()')
        if lastname and initials:
            name = lastname + u' ' + initials
        else:
            continue
        institution_address = xpath_str(author, 'Affiliation/text()')
        if institution_address:
            # Indices are 1-based and allocated per affiliated author.
            institution_index = len(institutions) + 1
            institutions[institution_index] = (institution_address, None)
        else:
            institution_index = None
        authors.append((name, institution_index))
    r['authors'] = authors
    r['institutions'] = institutions

    r['title'] = xpath_str(article,
                           'MedlineCitation/Article/ArticleTitle/text()')

    pubdate_elems = article.xpath(
        'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]')
    # An article without the "pubmed" history entry used to raise
    # IndexError here; treat it like an entry without a year instead.
    pubdate_elem = pubdate_elems[0] if pubdate_elems else None
    pubdate_yr = None
    if pubdate_elem is not None:
        pubdate_yr = xpath_str(pubdate_elem, 'Year/text()')

    pubdate_str = u''
    if pubdate_yr:
        pubdate_str += pubdate_yr
        pubdate_mon = xpath_str(pubdate_elem, 'Month/text()')
        if pubdate_mon:
            pubdate_str += '%02d' % int(pubdate_mon)
            pubdate_day = xpath_str(pubdate_elem, 'Day/text()')
            if pubdate_day:
                pubdate_str += '%02d' % int(pubdate_day)
            else:
                pubdate_str += '00'
        else:
            # With no month the day is not consulted at all.
            pubdate_str += '0000'
    r['pubdate'] = int(pubdate_str) if pubdate_str else None
    r['year'] = pubdate_yr

    r['journal'] = xpath_str(
        article, 'MedlineCitation/MedlineJournalInfo/MedlineTA/text()')
    r['grantagencies'] = xpath_strs(
        article,
        'MedlineCitation/Article/GrantList[last()]/Grant/Agency/text()')
    r['pubtypes'] = xpath_strs(
        article,
        'MedlineCitation/Article/PublicationTypeList/PublicationType/text()')

    # Collect the descriptor and qualifier names of each MeshHeading.
    allterms = []
    for meshheading in article.xpath(
            'MedlineCitation/MeshHeadingList/MeshHeading'):
        allterms.append(
            xpath_strs(meshheading,
                       'DescriptorName/text() | QualifierName/text()'))
    r['meshterms'] = allterms
    return r