def author_papers(papers, node_id='ayjid', paper_attribs=None, **kwargs):
    """
    Generate an author_papers network NetworkX directed graph.

    ==============    =====================================================
    Element           Description
    ==============    =====================================================
    Node              Two kinds of nodes with distinguishing "type"
                      attributes:
                      * type = paper - a paper in papers
                      * type = person - a person in papers
                      Papers node attributes defined by paper_attribs.
    Edge              Directed, Author -> his/her Paper.
    ==============    =====================================================

    Parameters
    ----------
    papers : list
        A list of wos_objects.
    node_id : string
        A key from :class:`.Paper` used to identify the nodes.
    paper_attribs : list
        Keys from :class:`.Paper` to attach as attributes of paper nodes.
        Defaults to no extra attributes.

    Returns
    -------
    author_papers_graph : networkx.DiGraph
        A DiGraph 'author_papers_graph'.

    Raises
    ------
    KeyError : Raised when node_id is not present in Papers.
    """
    # Avoid the shared-mutable-default-argument pitfall.
    if paper_attribs is None:
        paper_attribs = []

    author_papers_graph = nx.DiGraph(type='author_papers')

    # Validate node_id: any Paper key except 'citations' may identify nodes.
    meta_dict = ds.Paper()
    meta_keys = meta_dict.keys()
    meta_keys.remove('citations')
    if node_id not in meta_keys:
        raise KeyError('node_id ' + node_id + ' cannot be used to identify'
                       ' papers.')

    for entry in papers:
        # Define paper_attribute dictionary.
        paper_attrib_dict = util.subdict(entry, paper_attribs)
        paper_attrib_dict['type'] = 'paper'

        # Add paper node with attributes.
        author_papers_graph.add_node(entry[node_id], paper_attrib_dict)

        authors = util.concat_list(entry['aulast'], entry['auinit'], ' ')
        for author in authors:
            # Add person node.
            author_papers_graph.add_node(author, type="person")

            # Draw an edge from the author to his/her paper.
            author_papers_graph.add_edge(author, entry[node_id],
                                         date=entry['date'])
    return author_papers_graph
def _handle_paper(article):
    """
    Build a :class:`.Paper` from an article ET node.

    Parameters
    ----------
    article : Element
        ElementTree Element 'article'.

    Returns
    -------
    paper : :class:`.Paper`
    """
    paper = dt.Paper()
    pdata = dict_from_node(article)

    # Copy over fields that map directly from DfR keys to Paper keys.
    for source_key, target_key in _dfr2paper_map().iteritems():
        try:
            paper[target_key] = str(pdata[source_key]).upper()
        except KeyError:
            pass    # Article may not have all keys of interest.

    # Author names, publication date, and page range need special handling.
    paper['aulast'], paper['auinit'] = _handle_authors(pdata['author'])
    paper['date'] = _handle_pubdate(pdata['pubdate'])
    paper['spage'], paper['epage'] = _handle_pagerange(pdata['pagerange'])

    # Generate ayjid from first author, date, and journal title.
    try:
        paper['ayjid'] = _create_ayjid(paper['aulast'][0],
                                       paper['auinit'][0],
                                       paper['date'], paper['jtitle'])
    except IndexError:
        pass    # Article may not have authors.

    return paper
def merge(P1, P2, fields=None):
    """
    Combines two lists (P1 and P2) of :class:`.Paper` instances into a single
    list, and attempts to merge papers with matching fields. Where there are
    conflicts, values from :class:`.Paper` in P1 will be preferred.

    Parameters
    ----------
    P1 : list
        A list of :class:`.Paper` instances.
    P2 : list
        A list of :class:`.Paper` instances.
    fields : list
        Fields used to identify matching :class:`.Paper`. Defaults to
        ['ayjid'].

    Returns
    -------
    combined : list
        A list of :class:`.Paper` instances.

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> P1 = rd.wos.read("/Path/to/data1.txt")
       >>> P2 = rd.dfr.read("/Path/to/DfR")
       >>> papers = rd.merge(P1, P2, ['ayjid'])
    """
    # Avoid a mutable default argument; None means the default field list.
    if fields is None:
        fields = ['ayjid']

    combined = []
    merged_P1 = set()   # Indices of P1 papers that were merged.
    merged_P2 = set()   # Indices of P2 papers that were merged.

    for x in xrange(len(P1)):
        p_1 = P1[x]
        for y in xrange(len(P2)):
            p_2 = P2[y]
            if not all(p_1[field] == p_2[field] for field in fields):
                continue

            # Add values first from the P2 paper, then from the P1 paper,
            # so that P1 values overwrite P2 values on conflict.
            new_p = dt.Paper()
            for key, value in p_2.iteritems():
                if value != '' and value is not None:
                    new_p[key] = value
            for key, value in p_1.iteritems():
                if value != '' and value is not None:
                    new_p[key] = value

            merged_P1.add(x)    # Flag for exclusion below.
            merged_P2.add(y)
            combined.append(new_p)

    # Carry over any papers that were not merged.
    combined += [P1[x] for x in xrange(len(P1)) if x not in merged_P1]
    combined += [P2[y] for y in xrange(len(P2)) if y not in merged_P2]

    return combined
def direct_citation(papers, node_id='ayjid', node_attribs=None, **kwargs):
    """
    Create a traditional directed citation network.

    Direct-citation graphs are `directed acyclic graphs`__ in which vertices
    are papers, and each (directed) edge represents a citation of the target
    paper by the source paper. This method generates both a global citation
    graph, which includes all cited and citing papers, and an internal
    citation graph that describes only citations among papers in the
    original dataset.

    .. _dag: http://en.wikipedia.org/wiki/Directed_acyclic_graph

    __ dag_

    Note the size difference between the global and internal citation graphs.

    .. code-block:: python

       >>> gDC, iDC = nt.papers.direct_citation(papers)
       >>> len(gDC)
       5998
       >>> len(iDC)
       163

    ==============    =====================================================
    Element           Description
    ==============    =====================================================
    Node              Papers, represented by node_id.
    Edge              From a paper to a cited reference.
    Edge Attribute    Publication date of the citing paper.
    ==============    =====================================================

    Parameters
    ----------
    papers : list
        A list of :class:`.Paper` instances.
    node_id : int
        A key from :class:`.Paper` to identify the nodes. Default is 'ayjid'.
    node_attribs : list
        Keys from :class:`.Paper` to attach as node attributes. Defaults to
        ['date'].

    Returns
    -------
    citation_network : networkx.DiGraph
        Global citation network (all citations).
    citation_network_internal : networkx.DiGraph
        Internal citation network where only the papers in the list are
        nodes in the network.

    Raises
    ------
    KeyError : If node_id is not present in the meta_list.
    """
    # Avoid the shared-mutable-default-argument pitfall.
    if node_attribs is None:
        node_attribs = ['date']

    citation_network = nx.DiGraph(type='citations')
    citation_network_internal = nx.DiGraph(type='citations')

    # Check node_id validity.
    meta_dict = ds.Paper()
    meta_keys = meta_dict.keys()
    if node_id not in meta_keys:
        raise KeyError('node_id: ' + node_id + ' is not in the set of '
                       'meta_keys')

    # Identifiers of all papers in the dataset, used for the internal-network
    # membership test. Precomputed once instead of scanning `papers` for
    # every citation (was O(N) per citation via util.contains).
    dataset_ids = set([entry[node_id] for entry in papers
                       if entry[node_id] is not None])

    for entry in papers:
        # Check the head.
        head_has_id = entry[node_id] is not None

        if head_has_id:
            # Then create the node in both global and internal networks.
            node_attrib_dict = util.subdict(entry, node_attribs)
            citation_network.add_node(entry[node_id], node_attrib_dict)
            citation_network_internal.add_node(entry[node_id],
                                               node_attrib_dict)

        if entry['citations'] is not None:
            for citation in entry['citations']:
                # Check the tail.
                tail_has_id = citation[node_id] is not None

                if tail_has_id:
                    # Then create the node in the global but not the
                    # internal network.
                    node_attrib_dict = util.subdict(citation, node_attribs)
                    citation_network.add_node(citation[node_id],
                                              node_attrib_dict)

                if head_has_id and tail_has_id:
                    # Then draw an edge in the network.
                    citation_network.add_edge(entry[node_id],
                                              citation[node_id],
                                              date=entry['date'])

                    # And add it to the internal network too, if the cited
                    # paper is itself part of the dataset.
                    if citation[node_id] in dataset_ids:
                        citation_network_internal.add_edge(
                            entry[node_id],
                            citation[node_id],
                            date=entry['date'])

    # Both graphs must be Directed Acyclic Graphs.
    if not nx.is_directed_acyclic_graph(citation_network):
        raise nx.NetworkXError("Citation graph is not a DAG.")
    elif not nx.is_directed_acyclic_graph(citation_network_internal):
        raise nx.NetworkXError("Internal citation graph is not a DAG.")
    else:
        return citation_network, citation_network_internal
def bibliographic_coupling(papers, citation_id='ayjid', threshold=1,
                           node_id='ayjid', node_attribs=None,
                           weighted=False, **kwargs):
    """
    Generate a bibliographic coupling network.

    Two papers are **bibliographically coupled** when they both cite the
    same, third, paper.

    .. code-block:: python

       >>> BC = nt.papers.bibliographic_coupling(papers)
       >>> BC
       <networkx.classes.graph.Graph object at 0x102eec710>

    Especially when working with large datasets, or disciplinarily narrow
    literatures, it is usually helpful to set a minimum number of shared
    citations required for two papers to be coupled, via **threshold**.

    .. code-block:: python

       >>> BC = nt.papers.bibliographic_coupling(papers, threshold=2)
       >>> len(BC.edges())
       542

    ===============    ====================================================
    Element            Description
    ===============    ====================================================
    Node               Papers represented by node_id.
    Node Attributes    node_attribs in :class:`.Paper`
    Edge               (a, b) in E(G) if a and b share x citations where
                       x >= threshold.
    Edge Attributes    overlap: the number of citations shared
    ===============    ====================================================

    Parameters
    ----------
    papers : list
        A list of wos_objects.
    citation_id: string
        A key from :class:`.Paper` to identify the citation overlaps.
        Default is 'ayjid'.
    threshold : int
        Minimum number of shared citations to consider two papers "coupled".
    node_id : string
        Field in :class:`.Paper` used to identify the nodes. Default is
        'ayjid'.
    node_attribs : list
        Fields in :class:`.Paper` to include as node attributes in the
        graph. Defaults to ['date'].
    weighted : bool
        If True, edge attribute `overlap` is a float in {0-1} calculated as
        :math:`\cfrac{N_{ij}}{\sqrt{N_{i}N_{j}}}` where :math:`N_{i}` and
        :math:`N_{j}` are the number of references in :class:`.Paper` *i*
        and *j*, respectively, and :math:`N_{ij}` is the number of
        references shared by papers *i* and *j*.

    Returns
    -------
    bcoupling : networkx.Graph
        A bibliographic coupling network.

    Raises
    ------
    KeyError : Raised when citation_id is not present in the meta_list.

    Notes
    -----
    Lists cannot be attributes? causing errors for both gexf and graphml;
    also nodes cannot be None.
    """
    # Avoid the shared-mutable-default-argument pitfall.
    if node_attribs is None:
        node_attribs = ['date']

    bcoupling = nx.Graph(type='biblio_coupling')

    # Validate identifiers.
    meta_dict = ds.Paper()
    meta_keys = meta_dict.keys()
    if node_id not in meta_keys:
        raise KeyError('node_id ' + node_id + ' is not a meta_dict key.')

    # 'citations' is the only invalid meta_key for citation_id.
    meta_keys.remove('citations')
    if citation_id not in meta_keys:
        raise KeyError('citation_id ' + citation_id + ' is not a meta_dict'
                       ' key or otherwise cannot be used to detect citation'
                       ' overlap.')

    # Precompute each paper's citation-id list and node attributes once,
    # instead of rebuilding them inside the O(N^2) pair loop below.
    citation_lists = []
    attrib_dicts = []
    for paper in papers:
        id_list = []
        if paper['citations'] is not None:
            for citation in paper['citations']:
                id_list.append(citation[citation_id])
        citation_lists.append(id_list)
        attrib_dicts.append(util.subdict(paper, node_attribs))

    for i in xrange(len(papers)):
        i_list = citation_lists[i]
        node_i_attribs = attrib_dicts[i]
        for j in xrange(i + 1, len(papers)):
            j_list = citation_lists[j]
            node_j_attribs = attrib_dicts[j]

            # Add nodes and edge if the citation overlap is sufficiently
            # high.
            overlap = util.overlap(i_list, j_list)

            if weighted:
                if len(overlap) > 0:
                    # Normalized overlap: N_ij / sqrt(N_i * N_j).
                    w = (float(len(i_list)) * float(len(j_list))) ** 0.5
                    similarity = float(len(overlap)) / w
                else:
                    similarity = 0
            else:
                similarity = len(overlap)

            if similarity >= threshold:
                bcoupling.add_node(papers[i][node_id], node_i_attribs)
                bcoupling.add_node(papers[j][node_id], node_j_attribs)
                bcoupling.add_edge(papers[i][node_id],
                                   papers[j][node_id],
                                   similarity=similarity)
    return bcoupling
def read(filepath):
    """
    Given a file with PubMed XML, return a list of :class:`.Paper` instances.

    See the following hyperlinks regarding possible structures of XML:
    * http://www.ncbi.nlm.nih.gov/pmc/pmcdoc/tagging-guidelines/citations/v2/citationtags.html#2Articlewithmorethan10authors%28listthefirst10andaddetal%29
    * http://dtd.nlm.nih.gov/publishing/

    Each :class:`.Paper` is tagged with an accession id for this
    read/conversion.

    **Usage**

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.pubmed.read("/Path/to/PubMedData.xml")

    Parameters
    ----------
    filepath : string
        Path to PubMed XML file.

    Returns
    -------
    meta_list : list
        A list of :class:`.Paper` instances.

    Raises
    ------
    IOError
        When the file does not exist, or cannot be read.
    """
    # BUG FIX: previously `ET.fromstring(text, parser)(filepath)` referenced
    # undefined names and could never run; parse the file directly.
    try:
        tree = ET.parse(filepath)
        root = tree.getroot()
    except IOError:
        raise IOError("File does not exist, or cannot be read.")

    accession = str(uuid.uuid4())

    # Location of simple article meta data relative to the xml tree rooted
    # at 'article'.
    meta_loc = {'atitle': './front/article-meta/title-group/article-title',
                'jtitle': ('./front/journal-meta/journal-title-group/' +
                           'journal-title'),
                'volume': './front/article-meta/volume',
                'issue': './front/article-meta/issue',
                'spage': './front/article-meta/fpage',
                'epage': './front/article-meta/lpage'}

    # Locations relative to the element-citation element.
    cit_meta_loc = {'atitle': './article-title',
                    'jtitle': './source',
                    'date': './year',
                    'volume': './volume',
                    'spage': './fpage',
                    'epage': './epage'}

    meta_list = []
    for article in root.iter('article'):
        paper = ds.Paper()

        # Collect information from the 'front' section of the article.
        # Simple single-element fields first.
        for key in meta_loc.iterkeys():
            key_data = article.find(meta_loc[key])
            if key_data is not None:
                paper[key] = key_data.text
            else:
                paper[key] = None

        # Collect doi and pmid.
        id_list = article.findall('./front/article-meta/article-id')
        for identifier in id_list:
            id_type = identifier.get('pub-id-type')
            if id_type == 'doi':
                paper['doi'] = identifier.text
            elif id_type == 'pmid':
                paper['pmid'] = identifier.text
            else:
                # If never found, remain at None from initialization.
                pass

        # Collect aulast and auinit.
        aulast = []
        auinit = []
        contribs = article.findall(
            './front/article-meta/contrib-group/contrib')
        # If contrib is not found then this loop is skipped.
        for contrib in contribs:
            if contrib.get('contrib-type') == 'author':
                surname = contrib.find('./name/surname')
                if surname is not None:    # Then it was found.
                    aulast.append(surname.text)
                else:
                    aulast.append(None)

                # Multiple given names? This takes the first one.
                given_name = contrib.find('./name/given-names')
                if given_name is not None:    # Then it was found.
                    auinit.append(given_name.text[0])
                else:
                    auinit.append(None)
        paper['aulast'] = aulast
        paper['auinit'] = auinit

        # Collect date.
        pub_dates = article.findall('./front/article-meta/pub-date')
        # If pub-date is not found then this loop is skipped. BUG FIX:
        # removed a stray meta_list.append(paper) and a debug print that
        # lived inside this loop and duplicated every dated article.
        for pub_date in pub_dates:
            if pub_date.get('pub-type') == 'collection':
                year = pub_date.find('./year')
                if year is not None:    # Then it was found.
                    paper['date'] = year.text
                else:
                    paper['date'] = None

        # Construct ayjid.
        # NOTE(review): this passes every Paper key as a keyword argument;
        # confirm that create_ayjid accepts this (original comment said
        # "THIS IS BROKEN").
        paper['ayjid'] = create_ayjid(**paper)

        # Citations. element-citation handling differs from mixed-citation
        # handling.
        citations_list = []
        citations = article.findall('./back/ref-list/ref/element-citation')
        for cite in citations:
            cite_dict = ds.Paper()

            # Simple meta data. BUG FIX: values were previously written to
            # `paper` instead of `cite_dict`, clobbering article metadata.
            for key in cit_meta_loc.iterkeys():
                key_data = cite.find(cit_meta_loc[key])
                if key_data is not None:
                    cite_dict[key] = key_data.text
                else:
                    cite_dict[key] = None

            # doi and pmid.
            pub_id = cite.find('./pub-id')
            if pub_id is not None:
                pub_id_type = pub_id.get('pub-id-type')
                if pub_id_type == 'doi':
                    cite_dict['doi'] = pub_id.text
                elif pub_id_type == 'pmid':
                    cite_dict['pmid'] = pub_id.text

            # aulast and auinit.
            cite_aulast = []
            cite_auinit = []

            # Determine whether the person group lists authors.
            person_group = cite.find('./person-group')
            if person_group is not None:
                group_type = person_group.get('person-group-type')
            else:
                group_type = None

            # Then add the authors to the cite_dict.
            if group_type == 'author':
                for name in person_group.findall('./name'):
                    # Add surname.
                    surname = name.find('./surname')
                    if surname is not None:    # Then it was found.
                        cite_aulast.append(surname.text)
                    else:
                        cite_aulast.append(None)

                    # Add the first initial of the given names.
                    given_names = name.find('./given-names')
                    if given_names is not None:    # Then it was found.
                        cite_auinit.append(given_names.text[0])
                    else:
                        cite_auinit.append(None)

            if not cite_aulast:    # Then empty.
                cite_aulast = None
            if not cite_auinit:    # Then empty.
                cite_auinit = None
            cite_dict['aulast'] = cite_aulast
            cite_dict['auinit'] = cite_auinit

            citations_list.append(cite_dict)
        # End cite loop.

        paper['citations'] = citations_list
        paper['accession'] = accession
        meta_list.append(paper)
    # End article loop.

    return meta_list
def convert(wos_data):
    """
    Convert parsed field-tagged data to :class:`.Paper` instances.

    Convert a dictionary or list of dictionaries with keys from the
    Web of Science field tags into a :class:`.Paper` instance or list of
    :class:`.Paper` instances, the standard for Tethne.

    Each :class:`.Paper` is tagged with an accession id for this conversion.

    Parameters
    ----------
    wos_data : list
        A list of dictionaries with keys from the WoS field tags.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` instances.

    Examples
    --------
    .. code-block:: python

       >>> import tethne.readers as rd
       >>> wos_list = rd.wos.parse("/Path/to/data.txt")
       >>> papers = rd.wos.convert(wos_list)

    Notes
    -----
    Need to handle author name anomolies (case, blank spaces, etc.) that may
    make the same author appear to be two different authors in Networkx;
    this is important for any graph with authors as nodes.
    """
    accession = str(uuid.uuid4())

    # Create a Paper for each wos_dict and append to this list.
    papers = []

    # Handle dict inputs by converting to a 1-item list.
    if type(wos_data) is dict:
        wos_data = [wos_data]

    # Validate before building the papers list. [62809724]
    status = _validate(wos_data)
    if not status:
        # TODO(review): validation failures are silently ignored here;
        # consider raising.
        pass

    # Define the direct relationships between WoS fieldtags and Paper keys.
    translator = _wos2paper_map()

    # Perform the key conversions.
    for wos_dict in wos_data:
        paper = ds.Paper()

        # Direct translations.
        for key in translator.iterkeys():
            paper[translator[key]] = wos_dict[key]

        # Group authors ('CA') are treated as personal authors.
        # BUG FIX: 'AU'/'AF' may be absent (KeyError) or None (TypeError);
        # previously only one of the two was caught for each tag.
        if 'CA' in wos_dict.keys():
            try:
                wos_dict['AU'] += wos_dict['CA']
            except (KeyError, TypeError):
                wos_dict['AU'] = wos_dict['CA']
            try:
                wos_dict['AF'] += wos_dict['CA']
            except (KeyError, TypeError):
                wos_dict['AF'] = wos_dict['CA']

        # More complicated translations.
        # FIXME: not robust to all names, organization authors, etc.
        if wos_dict['AU'] is not None:
            paper['aulast'], paper['auinit'] = _handle_authors(wos_dict)

        # Construct ayjid.
        paper['ayjid'] = _create_ayjid(paper['aulast'], paper['auinit'],
                                       paper['date'], paper['jtitle'])

        # Parse author-institution affiliations. #60216226, #57746858.
        if wos_dict['C1'] is not None:
            paper['institutions'] = _handle_author_institutions(wos_dict)

        # Convert CR references into paper format.
        if wos_dict['CR'] is not None:
            meta_cr_list = []
            for ref in wos_dict['CR']:
                meta_cr_list.append(_parse_cr(ref))
            paper['citations'] = meta_cr_list

        paper['accession'] = accession
        papers.append(paper)
    # End wos_dict for loop.

    return papers
def _parse_institutions(ref):
    """
    Supports the Web of Science reader by converting the strings found at
    the C1 fieldtag of a record into a minimum :class:`.Paper` instance.

    Parameters
    ----------
    ref : str
        'C1' field tag data from a plain text Web of Science file, which
        contains author first and last names, the affiliated institution,
        and the location/city of that institution.

    Returns
    -------
    addr_dict : :class:`.Paper`
        A :class:`.Paper` instance.

    Raises
    ------
    IndexError
        When input 'ref' has fewer tokens than the necessary ones.
    ValueError
        When input has a mismatched type, e.g. no numbers for a date field.

    Notes
    -----
    Needs many test cases to check various input types.
    """
    addr_dict = ds.Paper()

    tokens = ref.split(',')
    try:
        # The first token holds the author name, "Last First".
        name_parts = tokens[0].split(' ')
        addr_dict['aulast'] = name_parts[0]
        addr_dict['auinit'] = name_parts[1]

        # Strip initial characters based on the field (spaces, 'V', 'DOI').
        addr_dict['addr2'] = tokens[1][1:]
        addr_dict['addr3'] = tokens[2][1:]
        addr_dict['country'] = tokens[3][2:]
    except IndexError:
        # ref did not have the full set of tokens.
        pass
    except ValueError:
        # Occurs when the program expects a date but gets a string with no
        # numbers; leave the field incomplete because chances are the CR
        # string is too sparse to use anyway.
        pass

    addr_dict['auinsid'] = _create_ayjid(addr_dict['aulast'],
                                         addr_dict['auinit'],
                                         addr_dict['date'],
                                         addr_dict['jtitle'])
    return addr_dict
def _parse_cr(ref): """ Supports the Web of Science reader by converting the strings found at the CR field tag of a record into a minimum :class:`.Paper` instance. Parameters ---------- ref : str CR field tag data from a plain text Web of Science file. Returns ------- paper : :class:`.Paper` A :class:`.Paper` instance. Raises ------ IndexError When input 'ref' has less number of tokens than necessary ones. ValueError Gets input with mismacthed inputtype. Ex: getting no numbers for a date field. Notes ----- Needs a sophisticated name parser, would like to use an open source resource for this. If WoS is missing a field in the middle of the list there are NOT commas indicating that; the following example does NOT occur: Doe J, ,, Some Journal instead Doe J, Some Journal This threatens the integrity of WoS data; should we address it? Another threat: if WoS is unsure of the DOI number there will be multiple DOI numbers in a list of form [doi1, doi2, ...], address this? """ paper = ds.Paper() #tokens of form: aulast auinit, date, jtitle, volume, spage, doi tokens = ref.split(',') try: #FIXME: needs better name parser # Checking for few parsers, in the meantime trying out few things. name = tokens[0] # Temp Solution for #62809724 pattern = re.compile(r'\[(.*?)\]') match = pattern.search(name) if match: # remove the [] and make it a proper one. 
name = name[match.start() + 1:match.end() - 1] if DEBUG: print 'stripped name: ', name name_tokens = name.split(' ') if len(name_tokens) < 2: # name_tokens.append('None') name_tokens.append(' ') paper['aulast'] = [name_tokens[0]] paper['auinit'] = [''.join(name_tokens[1:]).replace('.', '')] if DEBUG: print "Final Meta Dicts", paper['aulast'], paper['auinit'] # Temp Solution for #62809724 if paper['auinit'] == 'None' or paper['aulast'] == 'None': raise ("The Cited References field is not in the expeceted format") #strip initial characters based on the field (spaces, 'V', 'DOI') paper['date'] = int(tokens[1][1:]) paper['jtitle'] = tokens[2][1:] paper['volume'] = tokens[3][2:] paper['spage'] = tokens[4][2:] paper['doi'] = tokens[5][5:] except IndexError as E: # ref did not have the full set of tokens pass except ValueError as E: # This occurs when the program expects a date pass # but gets a string with no numbers. We leave # the field incomplete because chances are the # CR string is too sparse to use anyway. ayjid = _create_ayjid(paper['aulast'], paper['auinit'], paper['date'], paper['jtitle']) paper['ayjid'] = ayjid return paper