示例#1
0
def get_attributes(args):
    """
    Gather all data necessary for metrics calculations.

    The bibcodes to work on are taken from the first of these keys found
    in ``args``:

    * 'query'    -- resolved through get_publications_from_query
    * 'bibcodes' -- an iterable of bibcode strings; each is stripped of
                    surrounding whitespace
    * 'libid'    -- resolved through get_bibcodes_from_private_library

    Bibcodes without an entry in the data returned by get_mongo_data are
    logged and excluded from all further processing.

    Returns a tuple (attr_list, Nciting, Nciting_ref):

    * attr_list   -- the vectors produced by make_vectors, ordered by
                     sort_list_of_lists on element 2 (the citation count,
                     according to the original comments)
    * Nciting     -- number of distinct entries over all 'citations' lists
    * Nciting_ref -- number of distinct entries over all
                     'refereed_citations' lists

    Raises ValueError when ``args`` contains none of the supported keys.
    """
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        # Build a real list: the bibcodes are iterated more than once below,
        # which a one-shot map() iterator (Python 3) would not survive
        bibcodes = [bibcode.strip() for bibcode in args['bibcodes']]
    elif 'libid' in args:
        # In theory we allow for retrieving bibcodes from private libraries
        # Clearly this will currently not be used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    else:
        # Fail with an explicit message instead of an UnboundLocalError
        raise ValueError("No 'query', 'bibcodes' or 'libid' found in arguments")
    # Now gather all usage data numbers from the MongoDB 'adsdata' collection,
    # keyed on bibcode
    ads_data = get_mongo_data(bibcodes=bibcodes)
    # Membership is tested on the mapping itself rather than on keys()
    missing_bibcodes = [b for b in bibcodes if b not in ads_data]
    if missing_bibcodes:
        # Only report an error when there actually is something to report
        app.logger.error("Bibcodes found with missing metadata: %s" % ",".join(missing_bibcodes))
    # Keep only the bibcodes for which metadata was found (original order)
    bibcodes = [b for b in bibcodes if b in ads_data]
    # Get precomputed and citation data
    metrics_data = get_metrics_data(bibcodes=bibcodes)
    # Get the number of (distinct) citing papers
    Nciting = len(set(itertools.chain.from_iterable(
        entry['citations'] for entry in metrics_data.values())))
    Nciting_ref = len(set(itertools.chain.from_iterable(
        entry['refereed_citations'] for entry in metrics_data.values())))
    # The attribute vectors will be used to calculate the metrics
    attr_list = make_vectors(bibcodes, ads_data, metrics_data)
    # We sort the entries in the attribute list on citation count, which
    # will make e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list, 2)

    return attr_list, Nciting, Nciting_ref
示例#2
0
def get_attributes(args):
    """
    Gather all data necessary for metrics calculations.

    The bibcodes to work on are taken from the first of these keys found
    in ``args``:

    * 'query'    -- resolved through get_publications_from_query
    * 'bibcodes' -- an iterable of bibcode strings; each is stripped of
                    surrounding whitespace
    * 'libid'    -- resolved through get_bibcodes_from_private_library

    Bibcodes without an entry in the data returned by get_publication_data
    are logged and excluded from all further processing.

    Returns a tuple (attr_list, Nciting, Nciting_ref):

    * attr_list   -- the vectors produced by make_vectors, ordered by
                     sort_list_of_lists on element 2 (the citation count,
                     according to the original comments)
    * Nciting     -- number of distinct first elements over all entries of
                     the citation dictionary
    * Nciting_ref -- the same count for the refereed citation dictionary

    Raises ValueError when ``args`` contains none of the supported keys.
    """
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        # Build a real list: the bibcodes are iterated more than once below,
        # which a one-shot map() iterator (Python 3) would not survive
        bibcodes = [bibcode.strip() for bibcode in args['bibcodes']]
    elif 'libid' in args:
        # In theory we allow for retrieving bibcodes from private libraries
        # Clearly this will currently not be used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    else:
        # Fail with an explicit message instead of an UnboundLocalError
        raise ValueError("No 'query', 'bibcodes' or 'libid' found in arguments")
    # Split the list of bibcodes up in chunks, for parallel processing
    biblists = list(chunks(bibcodes, config.METRICS_CHUNK_SIZE))
    # Gather all publication information into one publication dictionary,
    # keyed on bibcode
    publication_data = get_publication_data(biblists=biblists)
    # Membership is tested on the mapping itself rather than on keys()
    missing_bibcodes = [b for b in bibcodes if b not in publication_data]
    if missing_bibcodes:
        # Only report an error when there actually is something to report
        app.logger.error("Bibcodes found with missing metadata: %s" % ",".join(missing_bibcodes))
    # Keep only the bibcodes for which metadata was found (original order)
    bibcodes = [b for b in bibcodes if b in publication_data]
    # Get citation dictionaries (all, refereed and non-refereed citations in
    # separate dictionaries, so that we don't have to figure this out later)
    (cit_dict, ref_cit_dict, non_ref_cit_dict) = get_citations(bibcodes=bibcodes, pubdata=publication_data, type='metrics')
    # Count the distinct citing papers. Every citation entry is a sequence
    # whose first element identifies the citing paper (presumably its
    # bibcode -- verify against get_citations)
    Nciting = len(set(x[0] for v in cit_dict.values() for x in v))
    Nciting_ref = len(set(x[0] for v in ref_cit_dict.values() for x in v))
    # Now gather all usage data numbers from the MongoDB 'adsdata' collection
    # This info will get stored in the dictionary 'ads_data', also keyed on bibcode
    ads_data = get_mongo_data(bibcodes=bibcodes)
    # Generate the list of document attribute vectors.
    # The attribute vectors will be used to calculate the metrics
    attr_list = make_vectors(bibcodes, publication_data, ads_data, cit_dict, ref_cit_dict, non_ref_cit_dict)
    # We sort the entries in the attribute list on citation count, which
    # will make e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list, 2)

    return attr_list, Nciting, Nciting_ref