Пример #1
0
def _convert_wos_record(record, ns):
  """
  Takes an XML tree of a single WoS record and returns a dictionary
  representing the record's information.

  Args:
    record: An XML tree, where the root is the 'REC' tag
    ns: A dictionary that contains the key 'ns' whose value is the null XML namespace.

  Returns:
    A dictionary representing the record's information
    with the following keys and value types:
      wosid:        unicode
      title:        unicode
      journal:      unicode
      issue:        unicode (None when pub_info has no 'issue' attribute)
      volume:       unicode (None when pub_info has no 'vol' attribute)
      pubdate:      int, built by concatenating the 'yr', 'mon' and 'day'
                    groups of _date_re; the key is absent when pub_info has
                    no 'sortdate' attribute or it does not match _date_re
      institutions: {int: (unicode, [unicode])}, keyed by the 'addr_no'
                    attribute of each address_spec tag
      authors:      [(unicode, [int] or None)]; the second element is None
                    when the name tag carries no 'addr_no' attribute
      citcount:     int; the key is absent when the record has no
                    silo_tc tag with coll_id 'WOS'

  Raises:
    IndexError: if the record has no pub_info or names tag, or has no name
      tag for one of the sequence numbers 1..count.
    KeyError: if the names tag has no 'count' attribute or an address_spec
      tag has no 'addr_no' attribute.
  """

  r = dict()
  r['wosid'] = xpath_str(record, "ns:UID/text()", ns)
  r['title'] = xpath_str(record, "ns:static_data/ns:summary/ns:titles/ns:title[@type='item']/text()", ns)
  r['journal'] = xpath_str(record, "ns:static_data/ns:summary/ns:titles/ns:title[@type='source']/text()", ns)

  pubinfo = record.xpath("ns:static_data/ns:summary/ns:pub_info", namespaces=ns)[0]
  r['issue'] = pubinfo.attrib.get('issue')
  r['volume'] = pubinfo.attrib.get('vol')
  pubdate = pubinfo.attrib.get('sortdate')
  # A sortdate that does not match _date_re is treated like a missing one
  # instead of failing on a None match object.
  m = _date_re.match(pubdate) if pubdate else None
  if m:
    r['pubdate'] = int(m.group('yr') + m.group('mon') + m.group('day'))

  r['institutions'] = {}
  for institution_tag in record.xpath("ns:static_data/ns:fullrecord_metadata/ns:addresses/ns:address_name/ns:address_spec", namespaces=ns):
    index = int(institution_tag.attrib['addr_no'])
    address = xpath_str(institution_tag, "ns:full_address/text()", ns)
    organizations = xpath_strs(institution_tag, "ns:organizations/ns:organization/text()", ns)

    r['institutions'][index] = (address, organizations)

  r['authors'] = []
  num_authors = int(record.xpath("ns:static_data/ns:summary/ns:names", namespaces=ns)[0].attrib['count'])
  # Authors are looked up by sequence number so the list keeps the record's order.
  for i in range(1, num_authors + 1):
    author_tag = record.xpath("ns:static_data/ns:summary/ns:names/ns:name[@seq_no='%d']" % i, namespaces=ns)[0]
    author_name = xpath_str(author_tag, "ns:wos_standard/text()", ns)
    if author_name is None:
      continue

    addr_no = author_tag.attrib.get('addr_no')
    if addr_no is None:
      affiliation_indices = None
    else:
      # Build a real list (map() is a one-shot iterator on Python 3);
      # split() without an argument tolerates repeated or stray whitespace.
      affiliation_indices = [int(n) for n in addr_no.split()]

    r['authors'].append((author_name, affiliation_indices))

  cittag = record.xpath("ns:dynamic_data/ns:citation_related/ns:tc_list/ns:silo_tc[@coll_id='WOS']/@local_count", namespaces=ns)
  if cittag:
    r['citcount'] = int(cittag[0])

  return r
Пример #2
0
def _convert_wos_record(record, ns):
    """
    Takes an XML tree of a single WoS record and returns a dictionary
    representing the record's information.

    Args:
      record: An XML tree, where the root is the 'REC' tag
      ns: A dictionary that contains the key 'ns' whose value is the null XML namespace.

    Returns:
      A dictionary representing the record's information
      with the following keys and value types:
        wosid:        unicode
        title:        unicode
        journal:      unicode
        issue:        unicode (None when pub_info has no 'issue' attribute)
        volume:       unicode (None when pub_info has no 'vol' attribute)
        pubdate:      int, built by concatenating the 'yr', 'mon' and 'day'
                      groups of _date_re; the key is absent when pub_info
                      has no 'sortdate' attribute or it does not match
                      _date_re
        institutions: {int: (unicode, [unicode])}, keyed by the 'addr_no'
                      attribute of each address_spec tag
        authors:      [(unicode, [int] or None)]; the second element is
                      None when the name tag carries no 'addr_no' attribute
        citcount:     int; the key is absent when the record has no
                      silo_tc tag with coll_id 'WOS'

    Raises:
      IndexError: if the record has no pub_info or names tag, or has no
        name tag for one of the sequence numbers 1..count.
      KeyError: if the names tag has no 'count' attribute or an
        address_spec tag has no 'addr_no' attribute.
    """

    r = dict()
    r['wosid'] = xpath_str(record, "ns:UID/text()", ns)
    r['title'] = xpath_str(
        record,
        "ns:static_data/ns:summary/ns:titles/ns:title[@type='item']/text()",
        ns)
    r['journal'] = xpath_str(
        record,
        "ns:static_data/ns:summary/ns:titles/ns:title[@type='source']/text()",
        ns)

    pubinfo = record.xpath("ns:static_data/ns:summary/ns:pub_info",
                           namespaces=ns)[0]
    r['issue'] = pubinfo.attrib.get('issue')
    r['volume'] = pubinfo.attrib.get('vol')
    pubdate = pubinfo.attrib.get('sortdate')
    # A sortdate that does not match _date_re is treated like a missing one
    # instead of failing on a None match object.
    m = _date_re.match(pubdate) if pubdate else None
    if m:
        r['pubdate'] = int(m.group('yr') + m.group('mon') + m.group('day'))

    r['institutions'] = {}
    for institution_tag in record.xpath(
            "ns:static_data/ns:fullrecord_metadata/ns:addresses/ns:address_name/ns:address_spec",
            namespaces=ns):
        index = int(institution_tag.attrib['addr_no'])
        address = xpath_str(institution_tag, "ns:full_address/text()", ns)
        organizations = xpath_strs(institution_tag,
                                   "ns:organizations/ns:organization/text()",
                                   ns)

        r['institutions'][index] = (address, organizations)

    r['authors'] = []
    num_authors = int(
        record.xpath("ns:static_data/ns:summary/ns:names",
                     namespaces=ns)[0].attrib['count'])
    # Authors are looked up by sequence number so the list keeps the
    # record's order.
    for i in range(1, num_authors + 1):
        author_tag = record.xpath(
            "ns:static_data/ns:summary/ns:names/ns:name[@seq_no='%d']" % i,
            namespaces=ns)[0]
        author_name = xpath_str(author_tag, "ns:wos_standard/text()", ns)
        if author_name is None:
            continue

        addr_no = author_tag.attrib.get('addr_no')
        if addr_no is None:
            affiliation_indices = None
        else:
            # Build a real list (map() is a one-shot iterator on Python 3);
            # split() without an argument tolerates repeated or stray
            # whitespace.
            affiliation_indices = [int(n) for n in addr_no.split()]

        r['authors'].append((author_name, affiliation_indices))

    cittag = record.xpath(
        "ns:dynamic_data/ns:citation_related/ns:tc_list/ns:silo_tc[@coll_id='WOS']/@local_count",
        namespaces=ns)
    if cittag:
        r['citcount'] = int(cittag[0])

    return r
Пример #3
0
def _article_to_pubmed_ref(article):
    '''Convert PubMed XML data about an article into a ref (dictionary containing the article data).

    The returned dictionary will contain this:
    {
      "pmid": a string containing the article's PMID
      "authors": a list of tuples (name, institution index), where the name is "LastName Initials" and the index is a key of "institutions" (None when the author has no Affiliation); authors lacking a LastName or Initials are skipped
      "institutions": a dictionary, where the key is a 1-based integer and the value is the tuple (affiliation address, None); one entry is added per author that has an Affiliation
      "title": the article's title
      "pubdate": an integer of the form 19850726 (i.e., 1985/07/26), taken from the PubMedPubDate with PubStatus "pubmed"; a missing day is written as 00, a missing month as 0000, and the value is None when no year is available
      "year": the publication year exactly as returned by xpath_str (text, not converted to int), or None when there is no such PubMedPubDate
      "journal": a string
      "grantagencies": a list of strings
      "pubtypes": a list of strings specifying the publication types as per PubMed
      "meshterms": a nested list of strings specifying the MeSH terms as per PubMed (one inner list per MeshHeading)
    }'''
    r = {}
    r['pmid'] = xpath_str(
        article,
        'PubmedData/ArticleIdList/ArticleId[@IdType=\'pubmed\']/text()')

    institutions = {}
    authors = []
    for author in article.xpath('MedlineCitation/Article/AuthorList/Author'):
        lastname = xpath_str(author, 'LastName/text()')
        initials = xpath_str(author, 'Initials/text()')
        if not (lastname and initials):
            continue
        name = lastname + u' ' + initials

        institution_address = xpath_str(author, 'Affiliation/text()')
        institution_index = None
        if institution_address:
            # Indices are handed out in order of appearance, starting at 1.
            institution_index = len(institutions) + 1
            institutions[institution_index] = (institution_address, None)
        authors.append((name, institution_index))
    r['authors'] = authors
    r['institutions'] = institutions

    r['title'] = xpath_str(article,
                           'MedlineCitation/Article/ArticleTitle/text()')

    pubdate_str = u''
    pubdate_yr = None
    pubdate_elems = article.xpath(
        'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]')
    # An article without a "pubmed" history date yields pubdate/year of None
    # rather than an IndexError.
    if pubdate_elems:
        pubdate_elem = pubdate_elems[0]
        pubdate_yr = xpath_str(pubdate_elem, 'Year/text()')
        if pubdate_yr:
            pubdate_str += pubdate_yr
            pubdate_mon = xpath_str(pubdate_elem, 'Month/text()')
            if pubdate_mon:
                pubdate_str += '%02d' % int(pubdate_mon)
                pubdate_day = xpath_str(pubdate_elem, 'Day/text()')
                if pubdate_day:
                    pubdate_str += '%02d' % int(pubdate_day)
                else:
                    pubdate_str += '00'
            else:
                # Without a month the day is not consulted at all.
                pubdate_str += '0000'

    r['pubdate'] = int(pubdate_str) if pubdate_str else None
    r['year'] = pubdate_yr
    r['journal'] = xpath_str(
        article, 'MedlineCitation/MedlineJournalInfo/MedlineTA/text()')
    r['grantagencies'] = xpath_strs(
        article,
        'MedlineCitation/Article/GrantList[last()]/Grant/Agency/text()')
    r['pubtypes'] = xpath_strs(
        article,
        'MedlineCitation/Article/PublicationTypeList/PublicationType/text()')

    r['meshterms'] = [
        xpath_strs(meshheading,
                   'DescriptorName/text() | QualifierName/text()')
        for meshheading in article.xpath(
            'MedlineCitation/MeshHeadingList/MeshHeading')
    ]
    return r
Пример #4
0
def _article_to_pubmed_ref(article):
  '''Convert PubMed XML data about an article into a ref (dictionary containing the article data).

  The returned dictionary will contain this:
  {
    "pmid": a string containing the article's PMID
    "authors": a list of tuples (name, institution index), where the name is "LastName Initials" and the index is a key of "institutions" (None when the author has no Affiliation); authors lacking a LastName or Initials are skipped
    "institutions": a dictionary, where the key is a 1-based integer and the value is the tuple (affiliation address, None); one entry is added per author that has an Affiliation
    "title": the article's title
    "pubdate": an integer of the form 19850726 (i.e., 1985/07/26), taken from the PubMedPubDate with PubStatus "pubmed"; a missing day is written as 00, a missing month as 0000, and the value is None when no year is available
    "year": the publication year exactly as returned by xpath_str (text, not converted to int), or None when there is no such PubMedPubDate
    "journal": a string
    "grantagencies": a list of strings
    "pubtypes": a list of strings specifying the publication types as per PubMed
    "meshterms": a nested list of strings specifying the MeSH terms as per PubMed (one inner list per MeshHeading)
  }'''
  r = {}
  r['pmid'] = xpath_str(article, 'PubmedData/ArticleIdList/ArticleId[@IdType=\'pubmed\']/text()')

  institutions = {}
  authors = []
  for author in article.xpath('MedlineCitation/Article/AuthorList/Author'):
    lastname = xpath_str(author, 'LastName/text()')
    initials = xpath_str(author, 'Initials/text()')
    if not (lastname and initials):
      continue
    name = lastname + u' ' + initials

    institution_address = xpath_str(author, 'Affiliation/text()')
    institution_index = None
    if institution_address:
      # Indices are handed out in order of appearance, starting at 1.
      institution_index = len(institutions) + 1
      institutions[institution_index] = (institution_address, None)
    authors.append((name, institution_index))
  r['authors'] = authors
  r['institutions'] = institutions

  r['title'] = xpath_str(article, 'MedlineCitation/Article/ArticleTitle/text()')

  pubdate_str = u''
  pubdate_yr = None
  pubdate_elems = article.xpath('PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]')
  # An article without a "pubmed" history date yields pubdate/year of None
  # rather than an IndexError.
  if pubdate_elems:
    pubdate_elem = pubdate_elems[0]
    pubdate_yr = xpath_str(pubdate_elem, 'Year/text()')
    if pubdate_yr:
      pubdate_str += pubdate_yr
      pubdate_mon = xpath_str(pubdate_elem, 'Month/text()')
      if pubdate_mon:
        pubdate_str += '%02d' % int(pubdate_mon)
        pubdate_day = xpath_str(pubdate_elem, 'Day/text()')
        if pubdate_day:
          pubdate_str += '%02d' % int(pubdate_day)
        else:
          pubdate_str += '00'
      else:
        # Without a month the day is not consulted at all.
        pubdate_str += '0000'

  r['pubdate'] = int(pubdate_str) if pubdate_str else None
  r['year'] = pubdate_yr
  r['journal'] = xpath_str(article, 'MedlineCitation/MedlineJournalInfo/MedlineTA/text()')
  r['grantagencies'] = xpath_strs(article, 'MedlineCitation/Article/GrantList[last()]/Grant/Agency/text()')
  r['pubtypes'] = xpath_strs(article, 'MedlineCitation/Article/PublicationTypeList/PublicationType/text()')

  r['meshterms'] = [
    xpath_strs(meshheading, 'DescriptorName/text() | QualifierName/text()')
    for meshheading in article.xpath('MedlineCitation/MeshHeadingList/MeshHeading')
  ]
  return r