def get_indicators(identifiers, data=None, usagedata=None):
    """Compute citation and usage indicators.

    Returns a tuple (ind, ind_ref) of dicts with the keys
    'h', 'g', 'i10', 'i100', 'm' and 'read10'; the first covers all
    publications, the second only the refereed ones.

    :param identifiers: record identifiers, only used when data or
        usagedata has to be fetched here
    :param data: citation data, ordered from most to least cited
    :param usagedata: usage (reads) data
    """
    ind = {}
    ind_ref = {}
    # Get the necessary data if we did not get any
    if not data:
        data = get_indicator_data(identifiers)
    if not usagedata:
        usagedata = get_usage_data(identifiers)
    # Organize the citations with a running index (the citation
    # data is already ordered from most to least cited)
    citations = [(i + 1, p.citation_num) for i, p in enumerate(data)]
    # First the Hirsch index
    ind['h'] = max([x[0] for x in citations if x[1] >= x[0]] or [0])
    # Next the g index
    ind['g'] = max([i for (c, i) in zip(list(np.cumsum(
        [x[1] for x in citations], axis=0)),
        [x[0] for x in citations]) if i**2 <= c] or [0])
    # The number of papers with 10 or more citations (i10)
    ind['i10'] = len([x for x in citations if x[1] >= 10])
    # The number of papers with 100 or more citations (i100)
    ind['i100'] = len([x for x in citations if x[1] >= 100])
    # The m index is the h index divided by the range of publication years
    yrange = datetime.now().year - \
        min([int(p.bibcode[:4]) for p in usagedata]) + 1
    ind['m'] = float(ind['h']) / float(yrange)
    # The read10 index is calculated from current reads (the last entry in
    # the reads list) for papers published in the last 10 years, normalized
    # by number of authors
    year = datetime.now().year
    Nentries = year - 1996 + 1
    # Fix: use reads[-1] (current reads), consistent with the refereed
    # version below; reads[-2] was the previous year's entry
    ind['read10'] = sum([float(p.reads[-1]) / float(p.author_num)
                         for p in usagedata
                         if int(p.bibcode[:4]) > year - 10 and p.reads and
                         len(p.reads) == Nentries])
    # Now all the values for the refereed publications
    citations = [(i + 1, n) for i, n in enumerate(
        [p.citation_num for p in data if p.refereed])]
    # First the Hirsch index
    ind_ref['h'] = max([x[0] for x in citations if x[1] >= x[0]] or [0])
    # Next the g index
    ind_ref['g'] = max([i for (c, i) in zip(list(np.cumsum(
        [x[1] for x in citations], axis=0)),
        [x[0] for x in citations]) if i**2 <= c] or [0])
    # The number of papers with 10 or more citations (i10)
    ind_ref['i10'] = len([x for x in citations if x[1] >= 10])
    # The number of papers with 100 or more citations (i100)
    ind_ref['i100'] = len([x for x in citations if x[1] >= 100])
    # The m index is the h index divided by the range of publication years
    # NOTE(review): the year range is taken over all of usagedata, not just
    # the refereed subset — confirm this is intended
    yrange_ref = datetime.now().year - \
        min([int(p.bibcode[:4]) for p in usagedata]) + 1
    ind_ref['m'] = float(ind_ref['h']) / float(yrange_ref)
    # The read10 index for refereed publications only
    year = datetime.now().year
    Nentries = year - 1996 + 1
    ind_ref['read10'] = sum([float(p.reads[-1]) / float(p.author_num)
                             for p in usagedata
                             if p.refereed and int(p.bibcode[:4]) > year - 10
                             and p.reads and len(p.reads) == Nentries])
    # Send results back
    return ind, ind_ref
def test_get_usage_data(self):
    '''Test getting usage data'''
    from models import get_usage_data
    data = get_usage_data(testset)
    # The most important thing here is to test that it is a list
    # of MetricsModel instances.
    # assertIsInstance gives a clearer failure message than
    # assertEqual(isinstance(...), True)
    self.assertIsInstance(data, list)
    # all(...) short-circuits and reads better than "False not in [...]"
    self.assertTrue(
        all(x.__class__.__name__ == 'MetricsModel' for x in data))
def test_get_usage_data(self):
    '''Test getting usage data'''
    from models import get_usage_data
    data = get_usage_data(testset)
    # The most important thing here is to test that it is a list
    # of MetricsModel instances.
    # assertIsInstance gives a clearer failure message than
    # assertEqual(isinstance(...), True)
    self.assertIsInstance(data, list)
    # all(...) short-circuits and reads better than "False not in [...]"
    self.assertTrue(
        all(x.__class__.__name__ == 'MetricsModel' for x in data))
def get_time_series(identifiers, bibcodes, data=None, usagedata=None,
                    tori_data=None, include_tori=True, self_cits=None):
    """Compute yearly time series of the metrics indicators.

    Returns a dict mapping 'i10', 'i100', 'h', 'g', 'read10' (and 'tori'
    when include_tori is True) to {year: value} dicts, from the earliest
    publication year through the current year.

    :param identifiers: record identifiers, only used when data has to be
        fetched here
    :param bibcodes: bibcodes of the publications (year = first 4 chars)
    :param data: citation data
    :param usagedata: usage (reads) data
    :param tori_data: pre-fetched tori citation records
    :param include_tori: whether to compute the tori series
    :param self_cits: self-citation data, used to filter tori records
    """
    series = {}
    i10 = {}
    i100 = {}
    h = {}
    g = {}
    r10 = {}
    tori = {}
    # Get data if nothing was supplied
    if not data:
        data = get_citations(identifiers)
    if not usagedata:
        usagedata = get_usage_data(identifiers)
    if not self_cits and include_tori:
        self_cits = get_selfcitations(identifiers, bibcodes)[1]
        self_citations = set((itertools.chain(*[x[0] for x in self_cits])))
    if not tori_data and include_tori:
        tdata = get_tori_data(identifiers)
        # NOTE(review): records are filtered on having 'pubyear', but
        # 'cityear' is read unchecked below — confirm every record has it
        tori_data = [p for p in list(itertools.chain(
            *[p.rn_citation_data for p in tdata if p.rn_citation_data]))
            if p['bibcode'] not in self_citations and 'pubyear' in p]
    # Determine the year range
    Nentries = datetime.now().year - 1996 + 1
    years = [int(b[:4]) for b in bibcodes]
    yrange = range(min(years), datetime.now().year + 1)
    # Correction factor extrapolating the current (partial) year's reads
    # to a full year
    d0 = date(datetime.now().year, 1, 1)
    d1 = date(datetime.now().year, datetime.now().month, datetime.now().day)
    d2 = date(datetime.now().year, 12, 31)
    delta = (d1 - d0).days + 1
    ndays = (d2 - d0).days + 1
    try:
        r10_corr = float(ndays) / float(delta)
    # Narrowed from a bare "except:" — only a zero denominator is a
    # plausible failure here (delta should always be >= 1)
    except ZeroDivisionError:
        r10_corr = 1.0
    for year in yrange:
        # Publications that already existed in this year
        biblist = [b for b in bibcodes if int(b[:4]) <= year]
        # Citation counts as they stood at the end of this year,
        # most cited first
        citations = sorted([len([int(c[:4]) for c in p.citations
                                 if int(c[:4]) <= year])
                            for p in data if p.bibcode in biblist],
                           reverse=True)
        if year < 1996:
            # No reads data exists before 1996
            r10[year] = 0.0
        else:
            idx = year - 1996
            r10[year] = sum([float(p.reads[idx]) / float(p.author_num)
                             for p in usagedata
                             if p.bibcode in biblist and
                             int(p.bibcode[:4]) > year - 10 and p.reads and
                             len(p.reads) == Nentries])
        # NOTE(review): h/g use the 0-based enumerate index here, while
        # get_indicators uses a 1-based running index — confirm intended
        try:
            h[year] = max([i for i, n in enumerate(citations) if i <= n])
            g[year] = max([i for i, n in enumerate(
                np.cumsum(citations, axis=0)) if i**2 <= n])
        # Narrowed from a bare "except:" — max() raises ValueError on an
        # empty sequence (no qualifying papers this year)
        except ValueError:
            h[year] = 0
            g[year] = 0
        i10[year] = len([c for c in citations if c >= 10])
        i100[year] = len([c for c in citations if c >= 100])
        if include_tori:
            tori[year] = np.sum(np.array(
                [r['auth_norm'] * r['ref_norm'] for r in tori_data
                 if r['pubyear'] <= year and r['cityear'] <= year]))
    # Extrapolate the current year's read10 to a full year
    r10[datetime.now().year] = r10[datetime.now().year] * r10_corr
    series['i10'] = i10
    series['i100'] = i100
    series['h'] = h
    series['g'] = g
    series['read10'] = r10
    if include_tori:
        series['tori'] = tori
    return series
def get_indicators(identifiers, data=None, usagedata=None):
    """Compute citation and usage indicators.

    Returns a tuple (ind, ind_ref) of dicts with the keys
    'h', 'g', 'i10', 'i100', 'm' and 'read10'; the first covers all
    publications, the second only the refereed ones.

    :param identifiers: record identifiers, only used when data or
        usagedata has to be fetched here
    :param data: citation data, ordered from most to least cited
    :param usagedata: usage (reads) data
    """
    ind = {}
    ind_ref = {}
    # Get the necessary data if we did not get any
    if not data:
        data = get_indicator_data(identifiers)
    if not usagedata:
        usagedata = get_usage_data(identifiers)
    # Organize the citations with a running index (the citation
    # data is already ordered from most to least cited)
    citations = [(i + 1, p.citation_num) for i, p in enumerate(data)]
    # First the Hirsch index
    ind['h'] = max([x[0] for x in citations if x[1] >= x[0]] or [0])
    # Next the g index
    ind['g'] = max([i for (c, i) in zip(list(np.cumsum(
        [x[1] for x in citations], axis=0)),
        [x[0] for x in citations]) if i**2 <= c] or [0])
    # The number of papers with 10 or more citations (i10)
    ind['i10'] = len([x for x in citations if x[1] >= 10])
    # The number of papers with 100 or more citations (i100)
    ind['i100'] = len([x for x in citations if x[1] >= 100])
    # The m index is the h index divided by the range of publication years
    yrange = datetime.now().year - \
        min([int(p.bibcode[:4]) for p in usagedata]) + 1
    ind['m'] = float(ind['h']) / float(yrange)
    # The read10 index is calculated from current reads (the last entry in
    # the reads list) for papers published in the last 10 years, normalized
    # by number of authors
    year = datetime.now().year
    Nentries = year - 1996 + 1
    # Fix: use reads[-1] (current reads), consistent with the refereed
    # version below; reads[-2] was the previous year's entry
    ind['read10'] = sum([float(p.reads[-1]) / float(p.author_num)
                         for p in usagedata
                         if int(p.bibcode[:4]) > year - 10 and p.reads and
                         len(p.reads) == Nentries])
    # Now all the values for the refereed publications
    citations = [(i + 1, n) for i, n in enumerate(
        [p.citation_num for p in data if p.refereed])]
    # First the Hirsch index
    ind_ref['h'] = max([x[0] for x in citations if x[1] >= x[0]] or [0])
    # Next the g index
    ind_ref['g'] = max([i for (c, i) in zip(list(np.cumsum(
        [x[1] for x in citations], axis=0)),
        [x[0] for x in citations]) if i**2 <= c] or [0])
    # The number of papers with 10 or more citations (i10)
    ind_ref['i10'] = len([x for x in citations if x[1] >= 10])
    # The number of papers with 100 or more citations (i100)
    ind_ref['i100'] = len([x for x in citations if x[1] >= 100])
    # The m index is the h index divided by the range of publication years
    # NOTE(review): the year range is taken over all of usagedata, not just
    # the refereed subset — confirm this is intended
    yrange_ref = datetime.now().year - \
        min([int(p.bibcode[:4]) for p in usagedata]) + 1
    ind_ref['m'] = float(ind_ref['h']) / float(yrange_ref)
    # The read10 index for refereed publications only
    year = datetime.now().year
    Nentries = year - 1996 + 1
    ind_ref['read10'] = sum([float(p.reads[-1]) / float(p.author_num)
                             for p in usagedata
                             if p.refereed and int(p.bibcode[:4]) > year - 10
                             and p.reads and len(p.reads) == Nentries])
    # Send results back
    return ind, ind_ref
def get_usage_histograms(identifiers, usage_type='reads', data=None):
    """Construct yearly usage histograms (reads or downloads).

    Returns a dict of four histograms keyed by year (starting at 1996):
    'all <type>', 'all <type> normalized', 'refereed <type>' and
    'refereed <type> normalized'. Any usage_type other than 'reads' is
    treated as 'downloads' (preserving the original behavior).

    :param identifiers: record identifiers, only used when data has to be
        fetched here
    :param usage_type: 'reads' or 'downloads'
    :param data: usage data records
    """
    uh = {}
    # Get necessary data if nothing was provided
    if not data:
        data = get_usage_data(identifiers)
    # Determine the current year (so that we know how many entries to expect
    # in usage lists)
    year = datetime.now().year
    Nentries = year - 1996 + 1
    zeros = [[0] * Nentries]

    def _totals(usage_lists):
        # Element-wise sum over the per-publication usage lists; falls back
        # to a single all-zero list when no publication qualifies.
        # The builtin zip replaces itertools.izip (removed in Python 3);
        # the summed result is identical on Python 2 and 3.
        return [sum(entries) for entries in zip(*(usage_lists or zeros))]

    if usage_type == 'reads':
        attr = 'reads'
    else:
        # Anything other than 'reads' falls through to downloads
        usage_type = 'downloads'
        attr = 'downloads'
    # Only publications with a complete usage list contribute
    valid = [p for p in data
             if getattr(p, attr) and len(getattr(p, attr)) == Nentries]
    refereed = [p for p in valid if p.refereed]
    # Sum up the individual usage lists, plain and normalized by the
    # number of authors
    usage = _totals([getattr(p, attr) for p in valid])
    usage_norm = _totals([np.array(getattr(p, attr)) / float(p.author_num)
                          for p in valid])
    usage_ref = _totals([getattr(p, attr) for p in refereed])
    usage_ref_norm = _totals(
        [np.array(getattr(p, attr)) / float(p.author_num)
         for p in refereed])
    # Construct the histograms (index 0 corresponds with year 1996)
    uh['all %s' % usage_type] = dict(
        [(1996 + i, v) for i, v in enumerate(usage)])
    uh['all %s normalized' % usage_type] = dict(
        [(1996 + i, v) for i, v in enumerate(usage_norm)])
    uh['refereed %s' % usage_type] = dict(
        [(1996 + i, v) for i, v in enumerate(usage_ref)])
    uh['refereed %s normalized' % usage_type] = dict(
        [(1996 + i, v) for i, v in enumerate(usage_ref_norm)])
    return uh
def get_time_series(identifiers, bibcodes, data=None, usagedata=None,
                    tori_data=None, include_tori=True, self_cits=None):
    """Compute yearly time series of the metrics indicators.

    Returns a dict mapping 'i10', 'i100', 'h', 'g', 'read10' (and 'tori'
    when include_tori is True) to {year: value} dicts, from the earliest
    publication year through the current year.

    :param identifiers: record identifiers, only used when data has to be
        fetched here
    :param bibcodes: bibcodes of the publications (year = first 4 chars)
    :param data: citation data
    :param usagedata: usage (reads) data
    :param tori_data: pre-fetched tori citation records
    :param include_tori: whether to compute the tori series
    :param self_cits: self-citation data, used to filter tori records
    """
    series = {}
    i10 = {}
    i100 = {}
    h = {}
    g = {}
    r10 = {}
    tori = {}
    # Get data if nothing was supplied
    if not data:
        data = get_citations(identifiers)
    if not usagedata:
        usagedata = get_usage_data(identifiers)
    if not self_cits and include_tori:
        self_cits = get_selfcitations(identifiers, bibcodes)[1]
        self_citations = set((itertools.chain(*[x[0] for x in self_cits])))
    if not tori_data and include_tori:
        tdata = get_tori_data(identifiers)
        # NOTE(review): records are filtered on having 'pubyear', but
        # 'cityear' is read unchecked below — confirm every record has it
        tori_data = [p for p in list(itertools.chain(
            *[p.rn_citation_data for p in tdata if p.rn_citation_data]))
            if p['bibcode'] not in self_citations and 'pubyear' in p]
    # Determine the year range
    Nentries = datetime.now().year - 1996 + 1
    years = [int(b[:4]) for b in bibcodes]
    yrange = range(min(years), datetime.now().year + 1)
    # Correction factor extrapolating the current (partial) year's reads
    # to a full year
    d0 = date(datetime.now().year, 1, 1)
    d1 = date(datetime.now().year, datetime.now().month, datetime.now().day)
    d2 = date(datetime.now().year, 12, 31)
    delta = (d1 - d0).days + 1
    ndays = (d2 - d0).days + 1
    try:
        r10_corr = float(ndays) / float(delta)
    # Narrowed from a bare "except:" — only a zero denominator is a
    # plausible failure here (delta should always be >= 1)
    except ZeroDivisionError:
        r10_corr = 1.0
    for year in yrange:
        # Publications that already existed in this year
        biblist = [b for b in bibcodes if int(b[:4]) <= year]
        # Citation counts as they stood at the end of this year,
        # most cited first
        citations = sorted([len([int(c[:4]) for c in p.citations
                                 if int(c[:4]) <= year])
                            for p in data if p.bibcode in biblist],
                           reverse=True)
        if year < 1996:
            # No reads data exists before 1996
            r10[year] = 0.0
        else:
            idx = year - 1996
            r10[year] = sum([float(p.reads[idx]) / float(p.author_num)
                             for p in usagedata
                             if p.bibcode in biblist and
                             int(p.bibcode[:4]) > year - 10 and p.reads and
                             len(p.reads) == Nentries])
        # NOTE(review): h/g use the 0-based enumerate index here, while
        # get_indicators uses a 1-based running index — confirm intended
        try:
            h[year] = max([i for i, n in enumerate(citations) if i <= n])
            g[year] = max([i for i, n in enumerate(
                np.cumsum(citations, axis=0)) if i**2 <= n])
        # Narrowed from a bare "except:" — max() raises ValueError on an
        # empty sequence (no qualifying papers this year)
        except ValueError:
            h[year] = 0
            g[year] = 0
        i10[year] = len([c for c in citations if c >= 10])
        i100[year] = len([c for c in citations if c >= 100])
        if include_tori:
            tori[year] = np.sum(np.array(
                [r['auth_norm'] * r['ref_norm'] for r in tori_data
                 if r['pubyear'] <= year and r['cityear'] <= year]))
    # Extrapolate the current year's read10 to a full year
    r10[datetime.now().year] = r10[datetime.now().year] * r10_corr
    series['i10'] = i10
    series['i100'] = i100
    series['h'] = h
    series['g'] = g
    series['read10'] = r10
    if include_tori:
        series['tori'] = tori
    return series
def get_usage_histograms(identifiers, usage_type='reads', data=None):
    """Construct yearly usage histograms (reads or downloads).

    Returns a dict of four histograms keyed by year (starting at 1996):
    'all <type>', 'all <type> normalized', 'refereed <type>' and
    'refereed <type> normalized'. Any usage_type other than 'reads' is
    treated as 'downloads' (preserving the original behavior).

    :param identifiers: record identifiers, only used when data has to be
        fetched here
    :param usage_type: 'reads' or 'downloads'
    :param data: usage data records
    """
    uh = {}
    # Get necessary data if nothing was provided
    if not data:
        data = get_usage_data(identifiers)
    # Determine the current year (so that we know how many entries to expect
    # in usage lists)
    year = datetime.now().year
    Nentries = year - 1996 + 1
    zeros = [[0] * Nentries]

    def _totals(usage_lists):
        # Element-wise sum over the per-publication usage lists; falls back
        # to a single all-zero list when no publication qualifies.
        # The builtin zip replaces itertools.izip (removed in Python 3);
        # the summed result is identical on Python 2 and 3.
        return [sum(entries) for entries in zip(*(usage_lists or zeros))]

    if usage_type == 'reads':
        attr = 'reads'
    else:
        # Anything other than 'reads' falls through to downloads
        usage_type = 'downloads'
        attr = 'downloads'
    # Only publications with a complete usage list contribute
    valid = [p for p in data
             if getattr(p, attr) and len(getattr(p, attr)) == Nentries]
    refereed = [p for p in valid if p.refereed]
    # Sum up the individual usage lists, plain and normalized by the
    # number of authors
    usage = _totals([getattr(p, attr) for p in valid])
    usage_norm = _totals([np.array(getattr(p, attr)) / float(p.author_num)
                          for p in valid])
    usage_ref = _totals([getattr(p, attr) for p in refereed])
    usage_ref_norm = _totals(
        [np.array(getattr(p, attr)) / float(p.author_num)
         for p in refereed])
    # Construct the histograms (index 0 corresponds with year 1996)
    uh['all %s' % usage_type] = dict(
        [(1996 + i, v) for i, v in enumerate(usage)])
    uh['all %s normalized' % usage_type] = dict(
        [(1996 + i, v) for i, v in enumerate(usage_norm)])
    uh['refereed %s' % usage_type] = dict(
        [(1996 + i, v) for i, v in enumerate(usage_ref)])
    uh['refereed %s normalized' % usage_type] = dict(
        [(1996 + i, v) for i, v in enumerate(usage_ref_norm)])
    return uh