def check_test_eids(project_id):
    """
    Calculates the recall by comparing the list of EIDs retrieved from the
    query against the list of test EIDs obtained from the survey.

    :param project_id: the ID of the current project
    :return: a JSON formatted relevance measure object.
    """
    test_eids = eids_service.load_eid_list(project_id, 'test_')
    app.logger.info('project {}: loaded test eids'.format(project_id))

    # load collected eids
    eids = eids_service.load_eid_list(project_id)
    relevance_measure = relevance_measure_service.load_relevance_measure(
        project_id)
    if relevance_measure is None:
        relevance_measure = RelevanceMeasure()
    relevance_measure.number_of_search_results = len(eids)
    relevance_measure.number_test_entries = len(test_eids)
    relevance_measure.number_test_entries_found = 0
    # set gives O(1) membership tests instead of O(n) list scans
    collected = set(eids)
    for test_eid in test_eids:
        if test_eid in collected:
            relevance_measure.number_test_entries_found += 1
    # guard the actual divisor against zero: recall divides by the number of
    # test entries, not by the number of search results (the original guard
    # could still raise ZeroDivisionError for an empty test list)
    if relevance_measure.number_test_entries > 0:
        relevance_measure.recall = (
            relevance_measure.number_test_entries_found
            / relevance_measure.number_test_entries)
    else:
        relevance_measure.recall = 0
    relevance_measure_service.save_relevance_measures(project_id,
                                                      relevance_measure)
    app.logger.info(
        'project {}: calculated relevance measure recall'.format(project_id))
    return jsonify(relevance_measure)
def get_eids_list_length(project_id):
    """
    Returns the number of EIDs stored for the project under the given prefix.

    :param project_id: the ID of the current project
    :return: 200 with the list length as body, 404 if no EID file exists
    """
    prefix = request.args.get('prefix')
    try:
        eids = eids_service.load_eid_list(project_id, prefix)
    except FileNotFoundError:
        return Response("File not found", status=404)
    # len() is the idiomatic way to obtain the size (not eids.__len__())
    return Response(str(len(eids)), status=200)
def getPrecision(project_id):
    """
    Calculates the precision as the share of publications in the judgement
    sample that were judged relevant, and persists the updated measure.

    :param project_id: the ID of the current project
    :return: a JSON formatted relevance measure object.
    """
    # collected eids determine the total number of search results
    eids = eids_service.load_eid_list(project_id)
    relevance_measure = relevance_measure_service.load_relevance_measure(
        project_id)
    if relevance_measure is None:
        relevance_measure = RelevanceMeasure()
    relevance_measure.number_of_search_results = len(eids)

    judgement_list = eids_service.load_judgement_file(project_id)
    app.logger.info('project {}: loaded judgements'.format(project_id))
    relevance_measure.number_sample_entries = len(judgement_list)
    # count the relevant judgements in a single pass
    relevance_measure.number_positive_sample_entries = sum(
        1 for judgement in judgement_list if judgement.isRelevant)
    if relevance_measure.number_sample_entries > 0:
        relevance_measure.precision = (
            relevance_measure.number_positive_sample_entries
            / relevance_measure.number_sample_entries)
    else:
        relevance_measure.precision = 0
    relevance_measure_service.save_relevance_measures(project_id,
                                                      relevance_measure)
    app.logger.info(
        'project {}: calculated relevance measure precision'.format(
            project_id))
    return jsonify(relevance_measure)
def get_eids_list_length(project_id):
    """
    Returns the number of EIDs stored for the project under the given prefix.

    :param project_id: the ID of the current project
    :return: 200 with the list length as body, 404 if no EID file exists
    """
    prefix = request.args.get('prefix')
    try:
        length = len(eids_service.load_eid_list(project_id, prefix))
    except FileNotFoundError:
        app.logger.error('project {}: could not send {} eid file'.format(
            project_id, prefix))
        return Response("File not found", status=404)
    return Response(str(length), status=200)
def check_test_eids(project_id):
    """
    Calculates the recall by comparing the collected EIDs against the list of
    test EIDs and persists the updated relevance measure.

    :param project_id: the ID of the current project
    :return: a JSON formatted relevance measure object.
    """
    # load test eids
    test_eids = eids_service.load_eid_list(project_id, 'test_')

    # load collected eids
    eids = eids_service.load_eid_list(project_id)
    relevance_measure = relevance_measure_service.load_relevance_measure(
        project_id)
    if relevance_measure is None:
        relevance_measure = RelevanceMeasure()
    relevance_measure.number_of_search_results = len(eids)
    relevance_measure.number_test_entries = len(test_eids)
    # reset the counter before counting: the original incremented a value it
    # never (re-)initialized, double counting on a reloaded measure
    relevance_measure.number_test_entries_found = 0
    collected = set(eids)  # O(1) membership tests
    for test_eid in test_eids:
        if test_eid in collected:
            relevance_measure.number_test_entries_found += 1
    # guard the actual divisor (number of test entries) against zero; the
    # original checked number_of_search_results and could divide by zero
    if relevance_measure.number_test_entries > 0:
        relevance_measure.recall = (
            relevance_measure.number_test_entries_found
            / relevance_measure.number_test_entries)
    else:
        relevance_measure.recall = 0
    relevance_measure_service.save_relevance_measures(project_id,
                                                      relevance_measure)
    return jsonify(relevance_measure.__dict__)
def generate_sample_publication_list(project_id, sample_size, session_id):
    """
    Draws a random sample of at most sample_size EIDs from the project's
    collected EID list (preserving the original order) and saves it under
    the given session id.

    :param project_id: the ID of the current project
    :param sample_size: the maximum number of EIDs to sample
    :param session_id: prefix under which the sample is stored
    :return: the list of sampled EIDs
    """
    eids = eids_service.load_eid_list(project_id)
    if len(eids) > sample_size:
        # sample over ALL indices; the original range(1, len(eids)) could
        # never select the first EID (off-by-one)
        sample_indices = set(random.sample(range(len(eids)), sample_size))
        random_sample_eids = [eid for index, eid in enumerate(eids)
                              if index in sample_indices]
    else:
        # fewer EIDs than requested: return them all
        random_sample_eids = eids
    eids_service.save_eid_list(project_id, random_sample_eids, session_id)
    return random_sample_eids
def get_eids_scopus_search_string(project_id):
    """
    Builds a Scopus advanced search string of the form
    ``EID(eid1 OR eid2 OR ...)`` from the stored EID list.

    :param project_id: the ID of the current project
    :return: 200 with the search string, 404 if no EID file exists
    """
    prefix = request.args.get('prefix')
    if prefix == 'sample_':
        # type=int returns None for a missing/invalid parameter instead of
        # raising a TypeError as int(None) did; build_sample_list falls back
        # to its default sample size for None
        sample_size = request.args.get('sample_size', type=int)
        build_sample_list(project_id, sample_size)
    try:
        eids = eids_service.load_eid_list(project_id, prefix)
    except FileNotFoundError:
        return Response("File not found", status=404)
    # join builds the string in one pass instead of quadratic concatenation
    search_string = 'EID(' + ' OR '.join(eids) + ')'
    return Response(search_string, status=200)
def check_sample_eids(project_id):
    """
    Counts the relevant entries in the judgement sample and persists the
    updated relevance measure.

    :param project_id: the ID of the current project
    :return: a JSON formatted relevance measure object.
    """
    # load collected eids
    eids = eids_service.load_eid_list(project_id)
    relevance_measure = relevance_measure_service.load_relevance_measure(
        project_id)
    if relevance_measure is None:
        relevance_measure = RelevanceMeasure()
    relevance_measure.number_of_search_results = len(eids)
    judgement_list = eids_service.load_judgement_file(project_id)
    # use attribute access consistently: the original mixed attribute access
    # and dict subscripting on the same RelevanceMeasure object (TypeError)
    # and never initialized the positive counter before incrementing it;
    # judgements expose .isRelevant as in getPrecision
    relevance_measure.number_sample_entries = len(judgement_list)
    relevance_measure.number_positive_sample_entries = 0
    for judgement in judgement_list:
        if judgement.isRelevant:
            relevance_measure.number_positive_sample_entries += 1
    relevance_measure_service.save_relevance_measures(project_id,
                                                      relevance_measure)
    return jsonify(relevance_measure.__dict__)
def build_sample_list(project_id, sample_size=100):
    """
    Draws a random sample of at most sample_size EIDs from the project's
    collected EID list (preserving the original order) and saves it under
    the 'sample_' prefix.

    :param project_id: the ID of the current project
    :param sample_size: the maximum number of EIDs to sample; None means 100
    """
    if sample_size is None:
        sample_size = 100
    eids = eids_service.load_eid_list(project_id)
    if len(eids) > sample_size:
        # sample over ALL indices; the original range(1, len(eids)) could
        # never select the first EID (off-by-one)
        sample_indices = set(random.sample(range(len(eids)), sample_size))
        sample = [eid for index, eid in enumerate(eids)
                  if index in sample_indices]
    else:
        # fewer EIDs than requested: the whole list is the sample
        sample = eids
    eids_service.save_eid_list(project_id=project_id,
                               eids=sample,
                               prefix='sample_')
def retrieve_publications_sample(project_id, query_id):
    """
    Retrieves the full Scopus records for a random sample of the project's
    publications, generating the sample first if none exists for the session.

    :param project_id: the ID of the current project
    :param query_id: the ID of the query (kept for the route signature)
    :return: 200 with the sample publications as JSON
    """
    session_id = request.args.get('session')
    if session_id is None:
        session_id = 'default_session_'
    # parse the sample size up front with type=int: the original called
    # int() BEFORE the None check, so a missing parameter raised a
    # TypeError and the check was dead code
    sample_size = request.args.get('sample_size', type=int)
    if sample_size is None:
        sample_size = 100
    try:
        random_sample_eids = eids_service.load_eid_list(project_id, session_id)
    except FileNotFoundError:
        # narrowed from a bare except: a missing file means the sample has
        # not been generated for this session yet
        random_sample_eids = generate_sample_publication_list(
            project_id, sample_size, session_id)
    search_string = utils.generate_scopus_search_from_eid_list(
        random_sample_eids)
    search = scopus.ScopusSearch(search_string, refresh=True,
                                 project_id=project_id)
    sample_publications_json = json.dumps(search.results, cls=PropertyEncoder)
    return Response(sample_publications_json, status=200,
                    mimetype='application/json')
def add_query_ids(project_id):
    """
    Annotates every indexed record with the IDs of the queries that
    retrieved it, appending to an existing 'query_id' field where present.

    :param project_id: the ID of the current project
    :return: 204 when all records have been processed
    """
    query_ids = query_service.load_scopus_queries(project_id).search_ids
    for query_id in query_ids:
        eids = eids_service.load_eid_list(project_id, prefix=query_id + '_')
        for eid in eids:
            try:
                record = elasticsearch_service.get_record(project_id, eid)
            except Exception:
                # narrowed from a bare except; the original message also had
                # no '{}' placeholder, so the eid was never interpolated
                app.logger.warning('eid not in index: {}'.format(eid))
                continue
            try:
                if query_id in record['query_id']:
                    # already tagged with this query
                    continue
                if record['query_id'] == '':
                    record['query_id'] = query_id
                else:
                    record['query_id'] = record['query_id'] + '; ' + query_id
            except KeyError:
                # record had no query_id field yet
                record['query_id'] = query_id
            elasticsearch_service.append_to_index(record, eid, project_id)
            app.logger.info('set query id {} to entry {}'.format(
                query_id, eid))
    return Response({"status": "FINISHED"}, status=204)
def data_collection_execution(project_id):
    """
    Runs the data collection for a project: loads the EID list selected by
    the 'mode' request parameter, then collects Scopus (plus Unpaywall,
    Altmetric) data per EID and sends it to the Elasticsearch index —
    in parallel with one thread per configured API key, or via a single
    collect_data call otherwise.

    :parameter project_id the id of the current project
    """
    # 'mode' selects which EID list to load (e.g. 'missed' re-collects
    # previously failed EIDs); defaults to the plain list
    mode = ''
    if request.args.get('mode') is not None:
        mode = request.args.get('mode')
    app.logger.info('project {}: collecting data with mode {}'.format(
        project_id, mode))

    # load project, set status bools, and load and eid list. initialize missed eid list
    project = project_service.load_project(project_id)
    project.isDataCollecting = True
    project.isDataCollected = False
    eids = eids_service.load_eid_list(project_id, mode)
    missed_eids = []
    with app.app_context():
        keys = app.config.get("LIBINTEL_SCOPUS_KEYS")

        # initialize status, set to collecting and save status
        status = Status("DATA_COLLECTING")
        status.total = len(eids)
        status_service.save_status(project_id, status)
        if status.total > 0:
            # a fresh run wipes the old index; a 'missed' run only clears
            # the list of previously missed EIDs
            if mode != 'missed':
                elasticsearch_service.delete_index(project.project_id)
            else:
                eids_service.deleteMissedEids()
            if type(keys) is tuple:
                # the number of threads is given by the number of available API keys
                number_of_threads = len(keys)
                app.logger.info('project {}: collecting data in {} threads'.format(
                    project_id, number_of_threads))

                # gather the individual chunks provided to each process
                length_of_chunks = math.ceil(status.total / number_of_threads)
                list_chunks = list(chunks(eids, length_of_chunks))

                # make asynchronous calls and delegate the individual collection to the individual threads
                for key_index, key in enumerate(keys):
                    if len(list_chunks) > key_index:
                        thread = Thread(target=collect_data,
                                        args=(list_chunks[key_index],
                                              project.project_id,
                                              project.name,
                                              key_index,
                                              key,
                                              app._get_current_object()))
                        thread.start()
                # the threads keep running after this response is returned
                return Response('finished', status=204)
            # single API key: delegate the whole list to one collect_data call
            collect_data(eids=eids,
                         project_id=project.project_id,
                         project_name=project.name,
                         i=0,
                         key=keys,
                         app=app._get_current_object())
            # if only one API-Key is given, collect data sequentially
            # NOTE(review): this loop appears to duplicate the collect_data
            # call above — confirm whether it is legacy code to be removed
            for idx, eid in enumerate(eids):
                # set scopus api-key to the provided key
                # NOTE(review): 'APIKEy' (odd casing) presumably relies on
                # ConfigParser's case-insensitive option names — confirm
                scopus.config['Authentication']['APIKEy'] = keys
                # update the progress status and save the status to disk
                status.progress = idx + 1
                status_service.save_status(project_id, status)

                # print progress
                # NOTE(review): the '{}' placeholder is never filled (string
                # concatenation, no .format) — literal braces reach the log
                app.logger.info('project {}: processing entry ' + str(idx) +
                                'of ' + str(status.total) + ' entries: ' +
                                str(idx / status.total * 100) + '%')
                # retrieve data from scopus
                try:
                    scopus_abstract = scopus.AbstractRetrieval(identifier=eid,
                                                               id_type='eid',
                                                               view="FULL",
                                                               refresh=True)
                    app.logger.info(
                        'project {}: collected scopus data for EID {}'.format(
                            project_id, eid))
                except Exception as inst:
                    # remember failed EIDs so a later 'missed' run can retry
                    app.logger.error(
                        'project {}: could not collect scopus data for EID {}, exception: {}'
                        .format(project_id, eid, type(inst)))
                    missed_eids.append(eid)
                    continue

                # create new AllResponses object to hold the individual information
                response = AllResponses(eid, project.name, project.project_id)

                # add scopus abstract to AllResponses object
                response.scopus_abstract_retrieval = scopus_abstract

                # get doi and collect unpaywall data and Altmetric data
                doi = scopus_abstract.doi
                if doi is not None:
                    if doi != "":
                        response.unpaywall_response = Unpaywall(doi)
                        response.altmetric_response = Altmetric(doi)
                response.scival_data = Scival([])

                # send response to elastic search index
                elasticsearch_service.send_to_index(response, project.project_id)
                app.logger.info('project {}: saved EID {} to elasticsearch'.format(
                    project_id, eid))
        # persist EIDs that could not be collected for a later 'missed' run
        eids_service.save_eid_list(project_id=project.project_id,
                                   eids=missed_eids,
                                   prefix='missed_')
        app.logger.info('project {}: all EID data collected'.format(project_id))

        # mark the collection as finished in status and project
        status.status = "DATA_COLLECTED"
        status_service.save_status(project_id, status)
        project.isDataCollecting = False
        project.isDataCollected = True
        project_service.save_project(project)
        return Response({"status": "FINISHED"}, status=204)
def references_collection_execution(project_id):
    """
    Collects the references for a given collection of publications and saves
    the most frequently cited ones to disk.

    :param project_id: the ID of the current project
    :return: 204 if successful
    """
    # initialize lists, read sample size from request and load eid list;
    # type=int returns None for a missing parameter instead of the original
    # int(None) TypeError — Counter.most_common(None) then returns all entries
    sample_size = request.args.get('sample_size', type=int)
    missed_eids = []
    references_eids = []
    eids = eids_service.load_eid_list(project_id)

    # load project and set booleans
    project = project_service.load_project(project_id)
    project.isReferencesCollecting = True
    project.isReferencesCollected = False
    project_service.save_project(project)

    # prepare status
    status = Status("REFERENCES_COLLECTING")
    status.total = len(eids)
    status_service.save_status(project_id, status)

    # if eids are given, cycle through all of them
    if status.total > 0:
        for idx, eid in enumerate(eids):
            # update the progress status and save the status to disk
            status.progress = idx + 1
            status_service.save_status(project_id, status)

            # print progress (the original concatenated strings around an
            # unfilled '{}' placeholder, so the project id was never logged)
            app.logger.info(
                'project {}: processing entry {} of {} entries: {}%'.format(
                    project_id, idx, status.total, idx / status.total * 100))

            # retrieve references from scopus
            try:
                scopus_abstract = scopus.AbstractRetrieval(eid, view="FULL")
                app.logger.info(
                    'project {}: collected scopus data for EID {}'.format(
                        project_id, eid))
                if scopus_abstract.references is not None:
                    references_eids = references_eids + scopus_abstract.references
                else:
                    # logger.warn is a deprecated alias of logger.warning
                    app.logger.warning(
                        'project {}: no references given in scopus export for EID {}.'
                        .format(project_id, eid))
            except IOError:
                app.logger.error(
                    'project {}: could not collect scopus data for EID {}'.
                    format(project_id, eid))
                missed_eids.append(eid)
                continue

    # transform references eids into tuple and calculate the occurences
    references_eids_tuple = tuple(references_eids)
    occurences = Counter(references_eids_tuple)
    most_occurences = occurences.most_common(sample_size)

    # save the counter with the most occurences to disk
    counter_service.save_counter(project_id, most_occurences, 'references_')
    eids_service.save_eid_list(project_id, missed_eids, prefix='missed_')

    # set the status and save it to disk
    status.status = "DATA_COLLECTED"
    status_service.save_status(project_id, status)

    # set the project booleans and save it to disk
    project.isReferencesCollecting = False
    project.isReferencesCollected = True
    project_service.save_project(project)
    return Response({"status": "FINISHED"}, status=204)