def get_attributes(args):
    """
    Gather all data necessary for metrics calculations.

    Parameters
    ----------
    args : dict
        Must contain exactly one of:
          'query'    -- query string; associated bibcodes are retrieved
          'bibcodes' -- iterable of bibcode strings
          'libid'    -- private-library identifier (currently unused path)

    Returns
    -------
    (attr_list, Nciting, Nciting_ref) : tuple
        attr_list   -- per-document attribute vectors, sorted descending
                       on citation count (column index 2)
        Nciting     -- number of distinct citing papers (all)
        Nciting_ref -- number of distinct refereed citing papers

    Raises
    ------
    ValueError
        If none of the supported keys is present in `args` (previously
        this fell through and raised UnboundLocalError).
    """
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        # Materialize as a list: map() is a one-shot iterator on Python 3
        # and 'bibcodes' is traversed several times below
        bibcodes = [b.strip() for b in args['bibcodes']]
    elif 'libid' in args:
        # In theory we allow for retrieving bibcodes from private libraries
        # Clearly this will currently not be used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    else:
        raise ValueError("no 'query', 'bibcodes' or 'libid' found in arguments")
    # Now gather all usage data numbers from the MongoDB 'adsdata'
    # collection, keyed on bibcode
    ads_data = get_mongo_data(bibcodes=bibcodes)
    # Drop (and report) bibcodes for which no metadata was found; only
    # log when something is actually missing
    missing_bibcodes = [b for b in bibcodes if b not in ads_data]
    if missing_bibcodes:
        app.logger.error("Bibcodes found with missing metadata: %s" % ",".join(missing_bibcodes))
        bibcodes = [b for b in bibcodes if b not in missing_bibcodes]
    # Get precomputed and citation data
    metrics_data = get_metrics_data(bibcodes=bibcodes)
    # Number of distinct citing papers, over all records (all citations
    # and refereed-only citations respectively)
    Nciting = len(set(itertools.chain.from_iterable(
        d['citations'] for d in metrics_data.values())))
    Nciting_ref = len(set(itertools.chain.from_iterable(
        d['refereed_citations'] for d in metrics_data.values())))
    # The attribute vectors will be used to calculate the metrics
    attr_list = make_vectors(bibcodes, ads_data, metrics_data)
    # We sort the entries in the attribute list on citation count, which
    # will make e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list, 2)
    return attr_list, Nciting, Nciting_ref
def get_attributes(args):
    """
    Gather all data necessary for metrics calculations.

    Parameters
    ----------
    args : dict
        Must contain exactly one of:
          'query'    -- query string; associated bibcodes are retrieved
          'bibcodes' -- iterable of bibcode strings
          'libid'    -- private-library identifier (currently unused path)

    Returns
    -------
    (attr_list, Nciting, Nciting_ref) : tuple
        attr_list   -- per-document attribute vectors, sorted descending
                       on citation count (column index 2)
        Nciting     -- number of distinct citing papers (all)
        Nciting_ref -- number of distinct refereed citing papers

    Raises
    ------
    ValueError
        If none of the supported keys is present in `args` (previously
        this fell through and raised UnboundLocalError).
    """
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        # Materialize as a list: map() is a one-shot iterator on Python 3
        # and 'bibcodes' is traversed several times below
        bibcodes = [b.strip() for b in args['bibcodes']]
    elif 'libid' in args:
        # In theory we allow for retrieving bibcodes from private libraries
        # Clearly this will currently not be used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    else:
        raise ValueError("no 'query', 'bibcodes' or 'libid' found in arguments")
    # Split the list of bibcodes up in chunks, for parallel processing
    biblists = list(chunks(bibcodes, config.METRICS_CHUNK_SIZE))
    # Gather all publication information into one publication dictionary,
    # keyed on bibcode
    publication_data = get_publication_data(biblists=biblists)
    # Drop (and report) bibcodes for which no metadata was found; only
    # log when something is actually missing
    missing_bibcodes = [b for b in bibcodes if b not in publication_data]
    if missing_bibcodes:
        app.logger.error("Bibcodes found with missing metadata: %s" % ",".join(missing_bibcodes))
        bibcodes = [b for b in bibcodes if b not in missing_bibcodes]
    # Get citation dictionaries (all, refereed and non-refereed citations in
    # separate dictionaries, so that we don't have to figure this out later)
    (cit_dict, ref_cit_dict, non_ref_cit_dict) = get_citations(
        bibcodes=bibcodes, pubdata=publication_data, type='metrics')
    # Each dictionary value is a list of tuples whose first element is the
    # citing bibcode; count the distinct citing papers across all records
    Nciting = len({x[0] for v in cit_dict.values() for x in v})
    Nciting_ref = len({x[0] for v in ref_cit_dict.values() for x in v})
    # Now gather all usage data numbers from the MongoDB 'adsdata' collection
    # This info will get stored in the dictionary 'adsdata', also keyed on bibcode
    ads_data = get_mongo_data(bibcodes=bibcodes)
    # Generate the list of document attribute vectors: these will be used
    # to calculate the metrics
    attr_list = make_vectors(bibcodes, publication_data, ads_data,
                             cit_dict, ref_cit_dict, non_ref_cit_dict)
    # We sort the entries in the attribute list on citation count, which
    # will make e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list, 2)
    return attr_list, Nciting, Nciting_ref