Example #1
0
def get_indicators(identifiers, data=None, usagedata=None):
    """Compute citation/usage indicators for a set of publications.

    Returns a tuple ``(ind, ind_ref)`` of dicts with the keys 'h', 'g',
    'i10', 'i100', 'm' and 'read10'; the first covers all publications,
    the second only the refereed ones.  ``data`` must be ordered from
    most to least cited; when ``data`` or ``usagedata`` is not supplied
    it is fetched via the project helpers.
    """
    ind = {}
    ind_ref = {}
    # Get the necessary data if we did not get any
    if not data:
        data = get_indicator_data(identifiers)
    if not usagedata:
        usagedata = get_usage_data(identifiers)
    # Organize the citations with a running index (the citation
    # data is already ordered from most to least cited)
    citations = [(i + 1, p.citation_num) for i, p in enumerate(data)]
    # First the Hirsch index: the largest rank i such that the i-th most
    # cited paper has at least i citations
    ind['h'] = max([x[0] for x in citations if x[1] >= x[0]] or [0])
    # Next the g index: the largest rank g whose top-g papers together
    # have at least g^2 citations
    ind['g'] = max([i for (c, i) in zip(list(np.cumsum([x[1] for
                    x in citations], axis=0)), [x[0] for x in citations]) if
                    i**2 <= c] or [0])
    # The number of papers with 10 or more citations (i10)
    ind['i10'] = len([x for x in citations if x[1] >= 10])
    # The number of papers with 100 or more citations (i100)
    ind['i100'] = len([x for x in citations if x[1] >= 100])
    # The m index is the h index divided by the range of publication years
    # (the original comment said "g index", but the code divides 'h')
    yrange = datetime.now().year - \
        min([int(p.bibcode[:4]) for p in usagedata]) + 1
    ind['m'] = float(ind['h']) / float(yrange)
    # The read10 index is calculated from current reads for papers published
    # in the last 10 years, normalized by number of authors
    year = datetime.now().year
    Nentries = year - 1996 + 1
    # BUGFIX: use reads[-1] (the current-year entry) rather than reads[-2];
    # the refereed computation below already used reads[-1] and the comment
    # above says "current reads".
    ind['read10'] = sum([float(p.reads[-1]) / float(p.author_num)
                         for p in usagedata if
                         int(p.bibcode[:4]) > year - 10 and p.reads and
                         len(p.reads) == Nentries])
    # Now all the values for the refereed publications
    citations = [(i + 1, n) for i, n in enumerate([p.citation_num for p in
                                                   data if p.refereed])]
    # First the Hirsch index
    ind_ref['h'] = max([x[0] for x in citations if x[1] >= x[0]] or [0])
    # Next the g index
    ind_ref['g'] = max([i for (c, i) in zip(list(np.cumsum(
        [x[1] for x in citations], axis=0)), [x[0] for x in citations]) if
        i**2 <= c] or [0])
    # The number of papers with 10 or more citations (i10)
    ind_ref['i10'] = len([x for x in citations if x[1] >= 10])
    # The number of papers with 100 or more citations (i100)
    ind_ref['i100'] = len([x for x in citations if x[1] >= 100])
    # The m index is the h index divided by the range of publication years
    yrange_ref = datetime.now().year - \
        min([int(p.bibcode[:4]) for p in usagedata]) + 1
    ind_ref['m'] = float(ind_ref['h']) / float(yrange_ref)
    # The read10 index is calculated from current reads for papers published
    # in the last 10 years, normalized by number of authors
    year = datetime.now().year
    Nentries = year - 1996 + 1
    ind_ref['read10'] = sum([float(p.reads[-1]) / float(p.author_num)
                             for p in usagedata if p.refereed and
                             int(p.bibcode[:4]) > year - 10 and
                             p.reads and len(p.reads) == Nentries])
    # Send results back
    return ind, ind_ref
 def test_get_usage_data(self):
     """Usage data should come back as a list of MetricsModel rows."""
     from models import get_usage_data
     records = get_usage_data(testset)
     # A plain list is required by downstream consumers
     self.assertTrue(isinstance(records, list))
     # ...and every entry must be a MetricsModel instance
     self.assertTrue(all(rec.__class__.__name__ == 'MetricsModel'
                         for rec in records))
 def test_get_usage_data(self):
     """Check the shape and element type of the usage-data result."""
     from models import get_usage_data
     rows = get_usage_data(testset)
     # Result container must be a list
     self.assertTrue(isinstance(rows, list))
     # Every element should report the MetricsModel class name
     names = [row.__class__.__name__ for row in rows]
     self.assertEqual(names, ['MetricsModel'] * len(names))
Example #4
0
def get_time_series(identifiers,
                    bibcodes,
                    data=None,
                    usagedata=None,
                    tori_data=None,
                    include_tori=True,
                    self_cits=None):
    """Compute yearly time series of metrics (h, g, i10, i100, read10, tori).

    Returns a dict mapping metric name to a {year: value} dict, covering
    the years from the earliest bibcode year through the current year.
    The 'tori' series is only present when ``include_tori`` is True.
    """
    series = {}
    i10 = {}
    i100 = {}
    h = {}
    g = {}
    r10 = {}
    tori = {}
    # Get data if nothing was supplied
    if not data:
        data = get_citations(identifiers)
    if not usagedata:
        usagedata = get_usage_data(identifiers)
    if not self_cits and include_tori:
        self_cits = get_selfcitations(identifiers, bibcodes)[1]
    # BUGFIX: when include_tori is False, self_cits may still be None and
    # flattening it would raise TypeError; fall back to an empty set.
    if self_cits:
        self_citations = set(itertools.chain(*[x[0] for x in self_cits]))
    else:
        self_citations = set()
    if not tori_data and include_tori:
        tdata = get_tori_data(identifiers)
        tori_data = [
            p for p in list(
                itertools.chain(
                    *[p.rn_citation_data for p in tdata
                      if p.rn_citation_data]))
            if p['bibcode'] not in self_citations and 'pubyear' in p
        ]
    # Determine the year range (usage lists start in 1996)
    Nentries = datetime.now().year - 1996 + 1
    years = [int(b[:4]) for b in bibcodes]
    yrange = range(min(years), datetime.now().year + 1)
    # Fraction of the current year that has elapsed, used to extrapolate
    # the current year's read10 value to a full year
    d0 = date(datetime.now().year, 1, 1)
    d1 = date(datetime.now().year, datetime.now().month, datetime.now().day)
    d2 = date(datetime.now().year, 12, 31)
    delta = (d1 - d0).days + 1
    ndays = (d2 - d0).days + 1
    try:
        r10_corr = float(ndays) / float(delta)
    except ZeroDivisionError:
        # Defensive: delta is always >= 1, but keep a safe fallback
        r10_corr = 1.0
    for year in yrange:
        biblist = [b for b in bibcodes if int(b[:4]) <= year]
        # Citation counts per paper, restricted to citations up to 'year',
        # sorted from most to least cited
        citations = sorted([
            len([int(c[:4]) for c in p.citations if int(c[:4]) <= year])
            for p in data if p.bibcode in biblist
        ],
                           reverse=True)
        if year < 1996:
            r10[year] = 0.0
        else:
            idx = year - 1996
            r10[year] = sum([
                float(p.reads[idx]) / float(p.author_num) for p in usagedata
                if p.bibcode in biblist and int(p.bibcode[:4]) > year -
                10 and p.reads and len(p.reads) == Nentries
            ])
        try:
            h[year] = max([i for i, n in enumerate(citations) if i <= n])
            g[year] = max([
                i for i, n in enumerate(np.cumsum(citations, axis=0))
                if i**2 <= n
            ])
        except ValueError:
            # max() of an empty sequence: no qualifying papers yet
            h[year] = 0
            g[year] = 0
        i10[year] = len([c for c in citations if c >= 10])
        i100[year] = len([c for c in citations if c >= 100])
        if include_tori:
            tori[year] = np.sum(
                np.array([
                    r['auth_norm'] * r['ref_norm'] for r in tori_data
                    if r['pubyear'] <= year and r['cityear'] <= year
                ]))

    # Extrapolate the current (partial) year's read10 to a full year
    r10[datetime.now().year] = r10[datetime.now().year] * r10_corr
    series['i10'] = i10
    series['i100'] = i100
    series['h'] = h
    series['g'] = g
    series['read10'] = r10
    if include_tori:
        series['tori'] = tori

    return series
Example #5
0
def get_indicators(identifiers, data=None, usagedata=None):
    """Compute citation/usage indicators for a set of publications.

    Returns a tuple ``(ind, ind_ref)`` of dicts keyed 'h', 'g', 'i10',
    'i100', 'm' and 'read10'; the first covers all publications, the
    second only refereed ones.  ``data`` must be ordered from most to
    least cited; missing inputs are fetched via the project helpers.
    """
    ind = {}
    ind_ref = {}
    # Get the necessary data if we did not get any
    if not data:
        data = get_indicator_data(identifiers)
    if not usagedata:
        usagedata = get_usage_data(identifiers)
    # Organize the citations with a running index (the citation
    # data is already ordered from most to least cited)
    citations = [(i + 1, p.citation_num) for i, p in enumerate(data)]
    # First the Hirsch index: largest rank i with at least i citations
    ind['h'] = max([x[0] for x in citations if x[1] >= x[0]] or [0])
    # Next the g index: largest rank g whose top-g papers accumulate at
    # least g^2 citations
    ind['g'] = max([
        i for (c, i) in zip(list(np.cumsum([x[1] for x in citations], axis=0)),
                            [x[0] for x in citations]) if i**2 <= c
    ] or [0])
    # The number of papers with 10 or more citations (i10)
    ind['i10'] = len([x for x in citations if x[1] >= 10])
    # The number of papers with 100 or more citations (i100)
    ind['i100'] = len([x for x in citations if x[1] >= 100])
    # The m index is the h index divided by the range of publication years
    # (the original comment said "g index", but the code divides 'h')
    yrange = datetime.now().year - \
        min([int(p.bibcode[:4]) for p in usagedata]) + 1
    ind['m'] = float(ind['h']) / float(yrange)
    # The read10 index is calculated from current reads for papers published
    # in the last 10 years, normalized by number of authors
    year = datetime.now().year
    Nentries = year - 1996 + 1
    # BUGFIX: use reads[-1] (current-year entry) rather than reads[-2];
    # the refereed computation below already used reads[-1].
    ind['read10'] = sum([
        float(p.reads[-1]) / float(p.author_num) for p in usagedata
        if int(p.bibcode[:4]) > year -
        10 and p.reads and len(p.reads) == Nentries
    ])
    # Now all the values for the refereed publications
    citations = [
        (i + 1, n)
        for i, n in enumerate([p.citation_num for p in data if p.refereed])
    ]
    # First the Hirsch index
    ind_ref['h'] = max([x[0] for x in citations if x[1] >= x[0]] or [0])
    # Next the g index
    ind_ref['g'] = max([
        i for (c, i) in zip(list(np.cumsum([x[1] for x in citations], axis=0)),
                            [x[0] for x in citations]) if i**2 <= c
    ] or [0])
    # The number of papers with 10 or more citations (i10)
    ind_ref['i10'] = len([x for x in citations if x[1] >= 10])
    # The number of papers with 100 or more citations (i100)
    ind_ref['i100'] = len([x for x in citations if x[1] >= 100])
    # The m index is the h index divided by the range of publication years
    yrange_ref = datetime.now().year - \
        min([int(p.bibcode[:4]) for p in usagedata]) + 1
    ind_ref['m'] = float(ind_ref['h']) / float(yrange_ref)
    # The read10 index is calculated from current reads for papers published
    # in the last 10 years, normalized by number of authors
    year = datetime.now().year
    Nentries = year - 1996 + 1
    ind_ref['read10'] = sum([
        float(p.reads[-1]) / float(p.author_num) for p in usagedata
        if p.refereed and int(p.bibcode[:4]) > year -
        10 and p.reads and len(p.reads) == Nentries
    ])
    # Send results back
    return ind, ind_ref
Example #6
0
def get_usage_histograms(identifiers, usage_type='reads', data=None):
    """Construct per-year usage histograms ('reads' or 'downloads').

    Returns a dict with four histograms — 'all <type>', 'all <type>
    normalized', 'refereed <type>' and 'refereed <type> normalized' —
    each mapping a year (1996..current) to the summed usage for that
    year; normalized variants divide each paper's usage by its number
    of authors.  Any ``usage_type`` other than 'reads' is treated as
    'downloads'.
    """
    uh = {}
    # Get necessary data if nothing was provided
    if not data:
        data = get_usage_data(identifiers)
    # Determine the current year (so that we know how many entries to expect
    # in usage lists)
    year = datetime.now().year
    Nentries = year - 1996 + 1
    zeros = [[0] * Nentries]

    def _year_sums(usage_lists):
        # Element-wise sum across the per-paper usage lists; fall back to
        # a single all-zero list when no paper qualified.
        # NOTE: the original used itertools.izip, which is Python-2-only;
        # the builtin zip is equivalent here and works on 2 and 3.
        return [sum(col) for col in zip(*(usage_lists or zeros))]

    if usage_type == 'reads':
        # All reads, summed per year
        usage = _year_sums(
            [p.reads for p in data if p.reads and len(p.reads) == Nentries])
        # ...and the author-normalized version
        usage_norm = _year_sums(
            [np.array(p.reads) / float(p.author_num) for p in data
             if p.reads and len(p.reads) == Nentries])
        # The same for just the refereed publications
        usage_ref = _year_sums(
            [p.reads for p in data
             if p.refereed and p.reads and len(p.reads) == Nentries])
        usage_ref_norm = _year_sums(
            [np.array(p.reads) / float(p.author_num) for p in data
             if p.refereed and p.reads and len(p.reads) == Nentries])
    else:
        usage_type = 'downloads'
        # All downloads, summed per year
        usage = _year_sums(
            [p.downloads for p in data
             if p.downloads and len(p.downloads) == Nentries])
        # ...and the author-normalized version
        usage_norm = _year_sums(
            [np.array(p.downloads) / float(p.author_num) for p in data
             if p.downloads and len(p.downloads) == Nentries])
        # The same for just the refereed publications
        usage_ref = _year_sums(
            [p.downloads for p in data
             if p.refereed and p.downloads and len(p.downloads) == Nentries])
        usage_ref_norm = _year_sums(
            [np.array(p.downloads) / float(p.author_num) for p in data
             if p.refereed and p.downloads and len(p.downloads) == Nentries])
    # Construct the histograms (index 0 corresponds with year 1996)
    uh['all %s' % usage_type] = dict(
        (1996 + i, v) for i, v in enumerate(usage))
    uh['all %s normalized' % usage_type] = dict(
        (1996 + i, v) for i, v in enumerate(usage_norm))
    uh['refereed %s' % usage_type] = dict(
        (1996 + i, v) for i, v in enumerate(usage_ref))
    uh['refereed %s normalized' % usage_type] = dict(
        (1996 + i, v) for i, v in enumerate(usage_ref_norm))
    return uh
Example #7
0
def get_time_series(identifiers, bibcodes, data=None, usagedata=None,
                    tori_data=None, include_tori=True, self_cits=None):
    """Compute yearly time series of metrics (h, g, i10, i100, read10, tori).

    Returns a dict mapping metric name to a {year: value} dict covering
    the years from the earliest bibcode year through the current year.
    The 'tori' series is only included when ``include_tori`` is True.
    """
    series = {}
    i10 = {}
    i100 = {}
    h = {}
    g = {}
    r10 = {}
    tori = {}
    # Get data if nothing was supplied
    if not data:
        data = get_citations(identifiers)
    if not usagedata:
        usagedata = get_usage_data(identifiers)
    if not self_cits and include_tori:
        self_cits = get_selfcitations(identifiers, bibcodes)[1]
    # BUGFIX: when include_tori is False, self_cits may still be None and
    # flattening it would raise TypeError; fall back to an empty set.
    if self_cits:
        self_citations = set(itertools.chain(*[x[0] for x in self_cits]))
    else:
        self_citations = set()
    if not tori_data and include_tori:
        tdata = get_tori_data(identifiers)
        tori_data = [p for p in list(itertools.chain(
            *[p.rn_citation_data for p in tdata if p.rn_citation_data])) if
            p['bibcode'] not in self_citations and 'pubyear' in p]
    # Determine the year range (usage lists start in 1996)
    Nentries = datetime.now().year - 1996 + 1
    years = [int(b[:4]) for b in bibcodes]
    yrange = range(min(years), datetime.now().year + 1)
    # Fraction of the current year elapsed, used to extrapolate the
    # current year's read10 value to a full year
    d0 = date(datetime.now().year, 1, 1)
    d1 = date(datetime.now().year, datetime.now().month, datetime.now().day)
    d2 = date(datetime.now().year, 12, 31)
    delta = (d1 - d0).days + 1
    ndays = (d2 - d0).days + 1
    try:
        r10_corr = float(ndays) / float(delta)
    except ZeroDivisionError:
        # Defensive: delta is always >= 1, but keep a safe fallback
        r10_corr = 1.0
    for year in yrange:
        biblist = [b for b in bibcodes if int(b[:4]) <= year]
        # Per-paper citation counts restricted to citations up to 'year',
        # sorted from most to least cited
        citations = sorted([len([int(c[:4]) for c in p.citations if int(
            c[:4]) <= year]) for p in data if
            p.bibcode in biblist], reverse=True)
        if year < 1996:
            r10[year] = 0.0
        else:
            idx = year - 1996
            r10[year] = sum([float(p.reads[idx]) / float(p.author_num) for
                             p in usagedata if p.bibcode in biblist and int(
                p.bibcode[:4]) > year - 10 and p.reads and
                len(p.reads) == Nentries])
        try:
            h[year] = max([i for i, n in enumerate(citations) if i <= n])
            g[year] = max(
                [i for i, n in enumerate(np.cumsum(citations, axis=0)) if
                 i**2 <= n])
        except ValueError:
            # max() of an empty sequence: no qualifying papers yet
            h[year] = 0
            g[year] = 0
        i10[year] = len([c for c in citations if c >= 10])
        i100[year] = len([c for c in citations if c >= 100])
        if include_tori:
            tori[year] = np.sum(np.array([r['auth_norm'] * r['ref_norm'] for
                                          r in tori_data if
                                          r['pubyear'] <= year and
                                          r['cityear'] <= year]))

    # Extrapolate the current (partial) year's read10 to a full year
    r10[datetime.now().year] = r10[datetime.now().year] * r10_corr
    series['i10'] = i10
    series['i100'] = i100
    series['h'] = h
    series['g'] = g
    series['read10'] = r10
    if include_tori:
        series['tori'] = tori

    return series
Example #8
0
def get_usage_histograms(identifiers, usage_type='reads', data=None):
    """Construct per-year usage histograms ('reads' or 'downloads').

    Returns a dict with four histograms — 'all <type>', 'all <type>
    normalized', 'refereed <type>' and 'refereed <type> normalized' —
    each mapping a year (1996..current) to the summed usage for that
    year.  Any ``usage_type`` other than 'reads' is treated as
    'downloads'.
    """
    uh = {}
    # Get necessary data if nothing was provided
    if not data:
        data = get_usage_data(identifiers)
    # Determine the current year (so that we know how many entries to expect
    # in usage lists)
    year = datetime.now().year
    Nentries = year - 1996 + 1
    zeros = [[0] * Nentries]
    # NOTE: the original used itertools.izip, which is Python-2-only; the
    # builtin zip is equivalent here and works on both Python 2 and 3.
    if usage_type == 'reads':
        # Get all reads data and sum up the individual lists
        usage_data = [
            p.reads for p in data if p.reads and len(p.reads) == Nentries]
        usage = [sum(sublist)
                 for sublist in zip(*usage_data or zeros)]
        # and also get the normalized reads
        usage_data = [np.array(p.reads) / float(p.author_num)
                      for p in data if p.reads and len(p.reads) == Nentries]
        usage_norm = [sum(sublist)
                      for sublist in zip(*usage_data or zeros)]
        # Do the same for just the refereed publications
        usage_data = [p.reads for p in data if p.refereed and p.reads and
                      len(p.reads) == Nentries]
        usage_ref = [sum(sublist)
                     for sublist in zip(*usage_data or zeros)]
        # and also get the normalized version
        usage_data = [np.array(p.reads) / float(p.author_num)
                      for p in data if p.refereed and p.reads and
                      len(p.reads) == Nentries]
        usage_ref_norm = [sum(sublist)
                          for sublist in zip(*usage_data or zeros)]
    else:
        usage_type = 'downloads'
        # Get all downloads data and sum up the individual lists
        usage_data = [
            p.downloads for p in data if p.downloads and
            len(p.downloads) == Nentries]
        usage = [sum(sublist)
                 for sublist in zip(*usage_data or zeros)]
        # and also get the normalized version
        usage_data = [np.array(p.downloads) / float(p.author_num)
                      for p in data if p.downloads and
                      len(p.downloads) == Nentries]
        usage_norm = [sum(sublist)
                      for sublist in zip(*usage_data or zeros)]
        # Do the same for just the refereed publications
        usage_data = [p.downloads for p in data if p.refereed and
                      p.downloads and len(p.downloads) == Nentries]
        usage_ref = [sum(sublist)
                     for sublist in zip(*usage_data or zeros)]
        # and also get the normalized version
        usage_data = [np.array(p.downloads) / float(p.author_num)
                      for p in data if p.refereed and p.downloads and
                      len(p.downloads) == Nentries]
        usage_ref_norm = [sum(sublist)
                          for sublist in zip(*usage_data or zeros)]
    # Construct the histograms (index 0 corresponds with year 1996)
    uh['all %s' % usage_type] = dict(
        [(1996 + i, v) for i, v in enumerate(usage)])
    uh['all %s normalized' % usage_type] = dict(
        [(1996 + i, v) for i, v in enumerate(usage_norm)])
    uh['refereed %s' % usage_type] = dict(
        [(1996 + i, v) for i, v in enumerate(usage_ref)])
    uh['refereed %s normalized' % usage_type] = dict(
        [(1996 + i, v) for i, v in enumerate(usage_ref_norm)])
    return uh