def find_and_process_updates(process_initials):
    """
    Works through the virtual author process queue and passes each queued
    virtual author to add_virtualauthor after dropping its 'updated' tag.

    If the queue is empty on entry it is initialized first. With
    process_initials switched off, a virtual author is only processed when
    the third element returned by split_name_parts for its original name
    string is non-empty; otherwise it is taken off the queue and skipped.

    @param process_initials: If names with initials only shall be
        processed or not
    @type process_initials: boolean
    """
    if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        init_va_process_queue()

    # The queue attribute is re-read on every pass on purpose: the
    # initializer above may replace the queue object.
    while not dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()
        name_records = (
            bibauthorid_virtualauthor_utils.get_virtualauthor_records(
                va_id, tag="orig_name_string"))
        va_name = name_records[0]["value"]

        if not process_initials:
            if not bibauthorid_utils.split_name_parts(va_name)[2]:
                # Skipped: the 'updated' tag is left untouched.
                continue

        bibauthorid_virtualauthor_utils.delete_virtualauthor_record(
            va_id, "updated")
        bconfig.LOGGER.log(
            25, "|> Inserting VA: %s Orig. name: %s" % (va_id, va_name))
        add_virtualauthor(va_id)

    bconfig.LOGGER.debug("Empty Queue. Job finished. Nothing to do.")
def find_and_process_orphans(iterations=1):
    """
    Runs the given number of processing rounds over orphaned virtual
    authors.

    In every round the process queue is initialized in "orphaned" mode if
    it is empty, and each queued virtual author is passed on to
    add_virtualauthor. Attaching to multiple real authors is switched on
    only in the last round, and only if bconfig.ATTACH_VA_TO_MULTIPLE_RAS
    is set.

    @param iterations: Number of rounds to do this processing
    @type iterations: int
    """
    multi_attach = False

    for iteration in xrange(iterations):
        is_last_round = iteration == iterations - 1

        if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
            init_va_process_queue(mode="orphaned")

        # The queue attribute is re-read on every pass on purpose: the
        # initializer above may replace the queue object.
        while not dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
            va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()
            name_records = (
                bibauthorid_virtualauthor_utils.get_virtualauthor_records(
                    va_id, tag="orig_name_string"))
            va_name = name_records[0]["value"]
            bconfig.LOGGER.log(
                25,
                "|> Inserting orphaned VA: %s Name: %s" % (va_id, va_name))

            if bconfig.ATTACH_VA_TO_MULTIPLE_RAS and is_last_round:
                multi_attach = True

            add_virtualauthor(va_id, multi_attach)

        bconfig.LOGGER.debug("Empty Queue. Job finished." " Nothing to do.")
def compare_va_to_ra(va_id, ra_id):
    '''
    Compares the origin name of a virtual author against the name list
    of a real author.

    The result is the arithmetic mean of compare_names() over all names of
    the real author; a mean below 0.1 is reported as 0.0.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param ra_id: ID of the real author
    @type ra_id: int

    @return: The probability resulting from the name comparison. 0.0 if
        the virtual author has no name or the real author has no names.
    @rtype: float
    '''
    bconfig.LOGGER.info("|-> Start of name comparison (va %s : ra %s)"
                        % (va_id, ra_id))
    ra_names = get_realauthor_names_from_set(ra_id)
    va_nameid_recs = get_virtualauthor_records(va_id,
                                               tag='orig_authorname_id')

    # -1 is passed on when the virtual author carries no name id record.
    authorname_id = -1

    if va_nameid_recs:
        authorname_id = va_nameid_recs[0]['value']

    va_name = get_name_and_db_name_strings(authorname_id)["name"]

    if not va_name:
        return 0.0

    comparisons = []

    for ra_name in ra_names:
        comparison = compare_names(va_name, ra_name)
        bconfig.LOGGER.info("|-> %s & %s -> %s"
                            % (va_name, ra_name, comparison))
        comparisons.append(comparison)

    bconfig.LOGGER.debug("|--> Name comparisons: %s" % (comparisons))
    bconfig.LOGGER.info("|-> End of name comparison")

    if comparisons:
        ret = float(sum(comparisons)) / len(comparisons)

        if ret < .1:
            ret = 0.0
    else:
        # A real author without any name used to trigger a
        # ZeroDivisionError here; nothing to compare means no match.
        ret = 0.0

    bconfig.LOGGER.info("|--> Resulting name probability: %s" % (ret))

    return ret
def compare_va_to_ra(va_id, ra_id):
    """
    Compares the origin name of a virtual author against the name list
    of a real author.

    The result is the arithmetic mean of compare_names() over all names of
    the real author; a mean below 0.1 is reported as 0.0.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param ra_id: ID of the real author
    @type ra_id: int

    @return: The probability resulting from the name comparison. 0.0 if
        the virtual author has no name or the real author has no names.
    @rtype: float
    """
    bconfig.LOGGER.info("|-> Start of name comparison (va %s : ra %s)"
                        % (va_id, ra_id))
    ra_names = get_realauthor_names_from_set(ra_id)
    va_nameid_recs = get_virtualauthor_records(va_id,
                                               tag="orig_authorname_id")

    # -1 is passed on when the virtual author carries no name id record.
    authorname_id = -1

    if va_nameid_recs:
        authorname_id = va_nameid_recs[0]["value"]

    va_name = get_name_and_db_name_strings(authorname_id)["name"]

    if not va_name:
        return 0.0

    comparisons = []

    for ra_name in ra_names:
        comparison = compare_names(va_name, ra_name)
        bconfig.LOGGER.info("|-> %s & %s -> %s"
                            % (va_name, ra_name, comparison))
        comparisons.append(comparison)

    bconfig.LOGGER.debug("|--> Name comparisons: %s" % (comparisons))
    bconfig.LOGGER.info("|-> End of name comparison")

    if comparisons:
        ret = float(sum(comparisons)) / len(comparisons)

        if ret < 0.1:
            ret = 0.0
    else:
        # A real author without any name used to trigger a
        # ZeroDivisionError here; nothing to compare means no match.
        ret = 0.0

    bconfig.LOGGER.info("|--> Resulting name probability: %s" % (ret))

    return ret
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Reads subfield 'i' of the 100/700 fields whose subfield 'a' equals the
    db name of the virtual author, on the record the virtual author belongs
    to. With a real author ID given, the first value found is stored on
    that real author under the "inspireid" tag; otherwise it is returned.

    Nothing is read or written when the runtime configuration option
    "populate_aid_from_personid" is set; True is returned right away.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set (or the runtime option is on) OR the
        first value found (the unchanged, empty lookup result if nothing
        was found)
    @rtype: True if ra_id > -1, else the looked-up value
    '''
    if dat.RUNTIME_CONFIG["populate_aid_from_personid"]:
        return True

    bibrec_id = ""
    authorname_id = ""

    for record in get_virtualauthor_records(va_id):
        tag = record['tag']

        if tag == "bibrec_id":
            bibrec_id = record['value']
        elif tag == "orig_authorname_id":
            authorname_id = record['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    bconfig.LOGGER.info("| Reading info for va %s: %s recid %s"
                        % (va_id, authorname_strings["name"], bibrec_id))
    inspireid = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'i', 'a',
        authorname_strings["db_name"], "==")

    if inspireid:
        # Only the first value of the result is used.
        inspireid = list(inspireid)[0]

    if ra_id > -1:
        if inspireid:
            set_realauthor_data(ra_id, "inspireid", "%s" % inspireid)

        return True

    return inspireid
def create_realauthors_from_orphans():
    '''
    Fetches all orphaned virtual authors and passes each of them to
    add_virtualauthor, logging its ID and original name string first.
    '''
    orphans = bibauthorid_virtualauthor_utils.get_orphan_virtualauthors()

    for orphan in orphans:
        va_id = orphan['virtualauthorid']
        name_records = (
            bibauthorid_virtualauthor_utils.get_virtualauthor_records(
                va_id, tag='orig_name_string'))
        bconfig.LOGGER.log(
            25,
            "INSERTING VA %s Name: %s" % (va_id, name_records[0]['value']))
        add_virtualauthor(va_id)

    bconfig.LOGGER.debug("va_list lengtht: %s" % (len(orphans)))
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Reads the 'cites' values of the record a virtual author belongs to.
    With a real author ID given, every value is stored on that real author
    under the "outgoing_citation" tag; otherwise the values are returned.

    Nothing is read or written when the runtime configuration option
    "populate_aid_from_personid" is set; True is returned right away.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set (or the runtime option is on) OR the
        citation values as delivered by the lookup
    @rtype: True if ra_id > -1, else the lookup result
    '''
    if dat.RUNTIME_CONFIG["populate_aid_from_personid"]:
        return True

    authorname_id = -1
    bibrec_id = ""

    for record in get_virtualauthor_records(va_id):
        tag = record['tag']

        if tag == "bibrec_id":
            bibrec_id = record['value']
        elif tag == "orig_authorname_id":
            authorname_id = record['value']

    bconfig.LOGGER.info("| Reading citation info for va %s: %s recid %s"
                        % (va_id, authorname_id, bibrec_id))
    cites = get_field_values_on_condition(bibrec_id, 'cites')

    if ra_id > -1:
        if cites:
            for cite in cites:
                set_realauthor_data(ra_id, "outgoing_citation",
                                    "%s" % (cite))

        return True

    return cites
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves information about the data of a virtual author from the data
    set. In dependency of the real author ID, the information will be
    written to the real author holding this ID. If the real author ID
    should be the default '-1', the data read from the record is returned.

    As written, the values stored for a real author are fixed placeholders
    ("module_tag" / "module_value something").

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set OR the data read from the record
    @rtype: True if ra_id > -1, else the lookup result
    '''
    va_data = get_virtualauthor_records(va_id)
    bibrec_id = ""
    # Default for virtual authors without an 'orig_authorname_id' record.
    # It used to be left unset, so the name lookup below failed with an
    # unbound local name in that case.
    authorname_id = -1

    for va_data_item in va_data:
        if va_data_item['tag'] == "bibrec_id":
            bibrec_id = va_data_item['value']
        elif va_data_item['tag'] == "orig_authorname_id":
            authorname_id = va_data_item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    bconfig.LOGGER.info("| Reading info for va %s: %s recid %s"
                        % (va_id, authorname_strings["name"], bibrec_id))
    # Subfield 'a' of 100/700 for every entry whose subfield 'a' differs
    # from the db name of this virtual author.
    data = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'a', 'a',
        authorname_strings["db_name"], "!=")

    if ra_id > -1:
        formatted = "something"
        set_realauthor_data(ra_id, "module_tag",
                            "module_value %s" % (formatted))

        return True
    else:
        return data
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves information about the data of a virtual author from the data
    set. In dependency of the real author ID, the information will be
    written to the real author holding this ID. If the real author ID
    should be the default '-1', the data read from the record is returned.

    As written, the values stored for a real author are fixed placeholders
    ("module_tag" / "module_value something").

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set OR the data read from the record
    @rtype: True if ra_id > -1, else the lookup result
    '''
    va_data = get_virtualauthor_records(va_id)
    bibrec_id = ""
    # Default for virtual authors without an 'orig_authorname_id' record.
    # It used to be left unset, so the name lookup below failed with an
    # unbound local name in that case.
    authorname_id = -1

    for va_data_item in va_data:
        if va_data_item['tag'] == "bibrec_id":
            bibrec_id = va_data_item['value']
        elif va_data_item['tag'] == "orig_authorname_id":
            authorname_id = va_data_item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    bconfig.LOGGER.info("| Reading info for va %s: %s recid %s"
                        % (va_id, authorname_strings["name"], bibrec_id))
    # Subfield 'a' of 100/700 for every entry whose subfield 'a' differs
    # from the db name of this virtual author.
    data = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'a', 'a',
        authorname_strings["db_name"], "!=")

    if ra_id > -1:
        formatted = "something"
        set_realauthor_data(ra_id, "module_tag",
                            "module_value %s" % (formatted))

        return True
    else:
        return data
def create_realauthors_from_orphans():
    """
    Fetches all orphaned virtual authors and passes each of them to
    add_virtualauthor, logging its ID and original name string first.
    """
    orphans = bibauthorid_virtualauthor_utils.get_orphan_virtualauthors()

    for orphan in orphans:
        va_id = orphan["virtualauthorid"]
        name_records = (
            bibauthorid_virtualauthor_utils.get_virtualauthor_records(
                va_id, tag="orig_name_string"))
        bconfig.LOGGER.log(
            25,
            "INSERTING VA %s Name: %s" % (va_id, name_records[0]["value"]))
        add_virtualauthor(va_id)

    bconfig.LOGGER.debug("va_list lengtht: %s" % (len(orphans)))
def compare_va_to_ra(va_id, ra_id):
    '''
    Checks whether any record ID of the virtual author also appears among
    the record IDs already stored for the real author.

    A shared record yields 1.0, no shared record yields 0.0. The warning
    logged on a match states that a shared paper makes author equality
    impossible; how the 1.0 is weighted is decided outside this function.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param ra_id: ID of the real author
    @type ra_id: int

    @return: 1.0 if at least one record is shared, 0.0 otherwise.
    @rtype: float
    '''
    va_records_raw = get_virtualauthor_records(va_id, "bibrec_id")
    ra_records_raw = get_realauthor_data(ra_id, "bibrec_id")

    va_records = [entry['value'] for entry in va_records_raw]
    ra_records = [entry['value'] for entry in ra_records_raw]
    shared_records = [rec for rec in va_records if rec in ra_records]

    if not shared_records:
        return 0.0

    bconfig.LOGGER.warn("|-> Paper parity detected"
                        + " -> Impossibility of author equality")
    return 1.0
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Reads the 'cites' values of the record a virtual author belongs to.
    With a real author ID given, every value is stored on that real author
    under the "outgoing_citation" tag; otherwise the values are returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set OR the citation values as delivered by
        the lookup
    @rtype: True if ra_id > -1, else the lookup result
    '''
    authorname_id = -1
    bibrec_id = ""

    for record in get_virtualauthor_records(va_id):
        tag = record['tag']

        if tag == "bibrec_id":
            bibrec_id = record['value']
        elif tag == "orig_authorname_id":
            authorname_id = record['value']

    bconfig.LOGGER.info("| Reading citation info for va %s: %s recid %s"
                        % (va_id, authorname_id, bibrec_id))
    cites = get_field_values_on_condition(bibrec_id, 'cites')

    if ra_id > -1:
        if cites:
            for cite in cites:
                set_realauthor_data(ra_id, "outgoing_citation",
                                    "%s" % (cite))

        return True

    return cites
def find_and_process_updates(process_initials):
    '''
    Works through the virtual author process queue and passes each queued
    virtual author to add_virtualauthor after dropping its 'updated' tag.

    If the queue is empty on entry it is initialized first. With
    process_initials switched off, a virtual author is only processed when
    the third element returned by split_name_parts for its original name
    string is non-empty; otherwise it is taken off the queue and skipped.

    @param process_initials: If names with initials only shall be
        processed or not
    @type process_initials: boolean
    '''
    if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        init_va_process_queue()

    # The queue attribute is re-read on every pass on purpose: the
    # initializer above may replace the queue object.
    while not dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()
        name_records = (
            bibauthorid_virtualauthor_utils.get_virtualauthor_records(
                va_id, tag='orig_name_string'))
        va_name = name_records[0]['value']

        if not process_initials:
            if not bibauthorid_utils.split_name_parts(va_name)[2]:
                # Skipped: the 'updated' tag is left untouched.
                continue

        bibauthorid_virtualauthor_utils.delete_virtualauthor_record(
            va_id, 'updated')
        bconfig.LOGGER.log(
            25, "|> Inserting VA: %s Orig. name: %s" % (va_id, va_name))
        add_virtualauthor(va_id)

    bconfig.LOGGER.debug("Empty Queue. Job finished. Nothing to do.")
def find_and_process_orphans(iterations=1):
    '''
    Runs the given number of processing rounds over orphaned virtual
    authors.

    In every round the process queue is initialized in "orphaned" mode if
    it is empty, and each queued virtual author is passed on to
    add_virtualauthor. Attaching to multiple real authors is switched on
    only in the last round, and only if bconfig.ATTACH_VA_TO_MULTIPLE_RAS
    is set.

    @param iterations: Number of rounds to do this processing
    @type iterations: int
    '''
    multi_attach = False

    for iteration in xrange(iterations):
        is_last_round = iteration == iterations - 1

        if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
            init_va_process_queue(mode="orphaned")

        # The queue attribute is re-read on every pass on purpose: the
        # initializer above may replace the queue object.
        while not dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
            va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()
            name_records = (
                bibauthorid_virtualauthor_utils.get_virtualauthor_records(
                    va_id, tag='orig_name_string'))
            va_name = name_records[0]['value']
            bconfig.LOGGER.log(
                25,
                "|> Inserting orphaned VA: %s Name: %s" % (va_id, va_name))

            if bconfig.ATTACH_VA_TO_MULTIPLE_RAS and is_last_round:
                multi_attach = True

            add_virtualauthor(va_id, multi_attach)

        bconfig.LOGGER.debug("Empty Queue. Job finished." " Nothing to do.")
def add_virtualauthor(va_id, multi_va_to_ra=False,
                      get_raid_from_personid_table=False):
    '''
    Adds a new virtual author to the real authors system: the idea is to
    search for possibly compatible real authors, then compare the
    compatibility of this virtual author with the selected real authors
    and add the new virtual author to the most compatible real author.

    If the best compatibility is below the adding threshold, or no
    candidate real author exists, a new real author is created. If several
    real authors tie for the best compatibility, the virtual author is
    either attached to all of them (multi_va_to_ra) or marked as not
    connected and skipped.

    Nothing is attached if the virtual author already belongs to a real
    author.

    @param va_id: Virtualauthor ID
    @type va_id: int
    @param multi_va_to_ra: Attach the virtual author to every real author
        sharing the best compatibility instead of skipping it
    @type multi_va_to_ra: boolean
    @param get_raid_from_personid_table: Take the real author from
        pidu.get_personid_from_paper (using the 'bibrefrecpair' record of
        the virtual author) instead of running the comparison
    @type get_raid_from_personid_table: boolean
    '''
    addstart = time.time()
    adding_threshold = bconfig.REALAUTHOR_VA_ADD_THERSHOLD

    # ["-1"] is treated as "not configured"; fall back to 0.7.
    if adding_threshold == ["-1"]:
        adding_threshold = 0.7

    already_existing = get_realauthors_by_virtuala_id(va_id)
    ralist = []

    if len(already_existing) <= 0:
        start = time.time()
        va_cluster = (bibauthorid_virtualauthor_utils.
                      get_cluster_va_ids_from_va_id(va_id))
        va_hash = hash(str(va_cluster))

        if get_raid_from_personid_table:
            bibrefrec = get_virtualauthor_records(
                va_id, tag="bibrefrecpair")[0]['value']
            person_id = pidu.get_personid_from_paper(bibrefrec)

            # A negative ID is treated as "nothing to attach to".
            if person_id < 0:
                update_ralist_cache(va_cluster, va_hash)
                return

            add_realauthor_va(person_id, va_id, 1)
            update_ralist_cache(va_cluster, va_hash)
            bconfig.LOGGER.log(25, "|-> Adding to real author #%s"
                               " with a compatability." % (person_id))
            (bibauthorid_virtualauthor_utils.
             update_virtualauthor_record(va_id, 'connected', 'True'))
            (bibauthorid_virtualauthor_utils.
             delete_virtualauthor_record(va_id, 'updated'))
            return

        if va_hash in dat.RA_VA_CACHE:
            ralist_raw = dat.RA_VA_CACHE[va_hash]
            bconfig.LOGGER.debug("|-> Cache Hit for va cluster")
        else:
            bconfig.LOGGER.debug("|-> Cache Fail--Generating new hash")
            ralist_raw = update_ralist_cache(va_cluster, va_hash)

        # Candidate real authors: those linked to the other virtual
        # authors of the cluster, without duplicates.
        ralist = list(set(ids['ra_id'] for ids in ralist_raw
                          if ids['va_id'] != va_id))

        if len(ralist) > 0:
            compatibilities = [cmp_virtual_to_real_author(va_id, ra_id)
                               for ra_id in ralist]
            best = max(compatibilities)

            if best < adding_threshold:
                bconfig.LOGGER.log(25, "|-> Creating NEW real author for this"
                                   + " virtual author (compatibility below"
                                   + " adding threshold"
                                   + " of other RAs).")
                create_new_realauthor(va_id)
                update_ralist_cache(va_cluster, va_hash)
            else:
                # Positions of all real authors sharing the best value.
                # The previous list.index(value, start) scan raised
                # ValueError as soon as start passed the last match.
                best_indexes = [index for index, value
                                in enumerate(compatibilities)
                                if value == best]

                if len(best_indexes) == 1:
                    index = best_indexes[0]
                    add_realauthor_va(ralist[index], va_id, best)
                    bconfig.LOGGER.log(25, "|-> Adding to real author #%s"
                                       " with a compatability of %.2f"
                                       % (ralist[index], best))
                elif len(best_indexes) > 1:
                    if multi_va_to_ra:
                        bconfig.LOGGER.log(25, "|-> virtual author"
                                           " comaptible with more than one"
                                           " realauthor.")
                        bconfig.LOGGER.log(25, "|-> virtual author"
                                           " will be attached to %s real"
                                           " authors" % (len(best_indexes)))

                        for index in best_indexes:
                            add_realauthor_va(ralist[index], va_id, best)
                            bconfig.LOGGER.log(25, "|--> Adding to real"
                                               " author #%s with a"
                                               " compatability of %.2f"
                                               % (ralist[index], best))
                    else:
                        bconfig.LOGGER.log(25, "|-> virtual author"
                                           " comaptible with more than one"
                                           " realauthor..."
                                           "skipped for now.")
                        bconfig.LOGGER.log(25, "|> The (skipped) comparison "
                                           "with %s real authors took %.2fs"
                                           % (len(ralist),
                                              time.time() - start))
                        (bibauthorid_virtualauthor_utils.
                         update_virtualauthor_record(va_id, 'connected',
                                                     'False'))
                        (bibauthorid_virtualauthor_utils.
                         delete_virtualauthor_record(va_id, 'updated'))
                        return
        else:
            bconfig.LOGGER.log(25, "|-> Creating NEW real author for this"
                               " Virtual Author (currently, no real author"
                               " exists)")
            create_new_realauthor(va_id)
            update_ralist_cache(va_cluster, va_hash)

        # NOTE(review): the nesting of the next two calls was reconstructed
        # from a source whose indentation was lost -- confirm that they
        # belong inside the "no real author yet" branch.
        (bibauthorid_virtualauthor_utils.
         update_virtualauthor_record(va_id, 'connected', 'True'))
        (bibauthorid_virtualauthor_utils.
         delete_virtualauthor_record(va_id, 'updated'))

    bconfig.LOGGER.log(25, "|> The comparison with %s real authors took %.2fs"
                       % (len(ralist), time.time() - addstart))
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
        - Update tables with results of the computation
        - Start personid update procedure

    Returns early (after writing a "Nothing to do" message) if there is
    no previous 'update_aid' log entry or no paper was modified since.
    '''

    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        selected_docs = [row for row in dat.DOC_LIST
                         if row['bibrecid'] in bibrec_ids]

        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                               "all loaded docs (%s)"
                               % (len(selected_docs)))

        for docs in selected_docs:
            for author_id in docs['authornameids']:
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                refrec = -1

                if len(refrecs) > 1:
                    # Call form prints the same text on Python 2 and 3.
                    print("SCREEEEEEWWWWWWED!!! "
                          "Several bibrefs on one paper?!")
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                # NOTE(review): refrec keeps its -1 default when no bibref
                # was found, and -1 is truthy, so the branch without refrec
                # is only taken for a falsy refrec value -- confirm this is
                # intended.
                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        # select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
            date=last_log[0][2])
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                        comment='bibauthorid_daemon, update_authorid_universe',
                        timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records."
                              % (len(recently_modified)), stream=sys.stdout,
                              verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout, verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])
    else:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s"
                                     % (rec))
                continue

            author_in_list = [row for row in authors
                              if row['db_name'] == rec_author]

            if author_in_list:
                # Known author: remember the additional record.
                for upd in author_in_list:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name})

    # One pass per last name cluster.
    for status, author_last_name in enumerate(author_last_names):
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)
        bibtask.task_update_progress('Processing %s of %s cluster: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing %s of %s cluster: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames, author_last_name,
                                 total_authors), stream=sys.stdout, verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory"
                               " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                # Reports the records of the author at hand; the message
                # used to format a loop variable left over from an earlier
                # loop and thus named an unrelated record.
                bconfig.LOGGER.error("The author '%s' rec '%s' is not in "
                                     "authornames "
                                     "and will be skipped. You might want "
                                     "to run authornames update before?"
                                     % (current_author['db_name'],
                                        current_author['records']))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(
                            va_id, "orig_authorname_id")

                        # NOTE(review): va_anames_id is whatever
                        # get_virtualauthor_records returns (other callers
                        # index it like a list of records), so remove()
                        # probably never matches a name id and the
                        # ValueError branch is always taken -- confirm
                        # whether the record's 'value' was meant here.
                        for an_list in [row['authornameids']
                                        for row in dat.DOC_LIST
                                        if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This names id is not in the list...don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    # The personid update is handed a list of 1-tuples.
    personid_ra_format = [(ra_id,) for ra_id in updated_ras]

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                           "changes visible also on the front end and to "
                           "create person IDs for %s newly created and changed "
                           "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                           "with bibauthorid!")
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves information about the coauthors/collaboration attachment
    of a virtual author from the data set.

    In dependency of the real author ID, the information will be written to
    the real author holding this ID. If the real author ID should be the
    default '-1', the coauthors (or the collaboration, which takes
    precedence) will be returned.

    When writing, a collaboration is stored as a single "coauthor" entry.
    Without a collaboration, every coauthor is stored individually as long
    as there are at most MAX_COAUTHORS of them; beyond that only a hash of
    the coauthor set is stored.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set OR the coauthors OR the collaboration
    @rtype: True if ra_id > -1, else the lookup result
    '''
    va_data = get_virtualauthor_records(va_id)
    bibrec_id = ""
    authorname_id = -1

    for va_data_item in va_data:
        if va_data_item['tag'] == "bibrec_id":
            bibrec_id = va_data_item['value']
        elif va_data_item['tag'] == "orig_authorname_id":
            authorname_id = va_data_item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    bconfig.LOGGER.info("| Reading coauthors for va %s: %s recid %s"
                        % (va_id, authorname_strings["name"], bibrec_id))

    # All 100/700 $a values on the record except the author's own name.
    coauthors = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'a', 'a',
        authorname_strings["db_name"], "!=")
    collaboration = get_field_values_on_condition(bibrec_id, "710", "g")

    write_mode = ra_id > -1

    if (not coauthors) and (not collaboration):
        bconfig.LOGGER.info("|-> No coauthors and no collaboration found "
                            "for this author on this record")
    elif not write_mode:
        # Log the findings when only reading. The former test `not ra_id`
        # was true for ra_id == 0 only and never for the default of -1.
        if collaboration:
            bconfig.LOGGER.info("|-> Collaboration found: %s"
                                % (list(collaboration)[0]))
        else:
            bconfig.LOGGER.info("|-> Coauthors found: %s" % (len(coauthors)))

    max_coauthors = MAX_COAUTHORS

    if write_mode:
        if collaboration:
            cname = list(collaboration)[0]
            coauthor_formatted = create_unified_name(cname.lower())
            set_realauthor_data(ra_id, "coauthor", "%s;;%s"
                                % (authorname_strings["name"],
                                   coauthor_formatted))
        else:
            if len(coauthors) <= max_coauthors:
                for coauthor in coauthors:
                    coauthor_formatted = create_unified_name(coauthor.lower())
                    set_realauthor_data(ra_id, "coauthor", "%s;;%s"
                                        % (authorname_strings["name"],
                                           coauthor_formatted))
            else:
                hashvalue = hash_coauthor_set(coauthors)
                bconfig.LOGGER.info("|--> Coauthor # > %s. To preserve"
                                    " information, a hash will be stored: %s"
                                    % (max_coauthors, hashvalue))
                set_realauthor_data(ra_id, "coauthor", "%s;;%s"
                                    % (authorname_strings["name"],
                                       hashvalue))

        return True
    else:
        if collaboration:
            return collaboration
        else:
            return coauthors
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves information about the coauthors/collaboration attachment
    of a virtual author from the data set.

    In dependency of the real author ID, the information will be written to
    the real author holding this ID. If the real author ID should be the
    default '-1', the coauthors (or the collaboration, which takes
    precedence) will be returned.

    When writing, a collaboration is stored as a single "coauthor" entry.
    Without a collaboration, every coauthor is stored individually as long
    as there are at most MAX_COAUTHORS of them; beyond that only a hash of
    the coauthor set is stored.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set OR the coauthors OR the collaboration
    @rtype: True if ra_id > -1, else the lookup result
    '''
    va_data = get_virtualauthor_records(va_id)
    bibrec_id = ""
    authorname_id = -1

    for va_data_item in va_data:
        if va_data_item['tag'] == "bibrec_id":
            bibrec_id = va_data_item['value']
        elif va_data_item['tag'] == "orig_authorname_id":
            authorname_id = va_data_item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    bconfig.LOGGER.info("| Reading coauthors for va %s: %s recid %s"
                        % (va_id, authorname_strings["name"], bibrec_id))

    # All 100/700 $a values on the record except the author's own name.
    coauthors = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'a', 'a',
        authorname_strings["db_name"], "!=")
    collaboration = get_field_values_on_condition(bibrec_id, "710", "g")

    write_mode = ra_id > -1

    if (not coauthors) and (not collaboration):
        bconfig.LOGGER.info("|-> No coauthors and no collaboration found "
                            "for this author on this record")
    elif not write_mode:
        # Log the findings when only reading. The former test `not ra_id`
        # was true for ra_id == 0 only and never for the default of -1.
        if collaboration:
            bconfig.LOGGER.info("|-> Collaboration found: %s"
                                % (list(collaboration)[0]))
        else:
            bconfig.LOGGER.info("|-> Coauthors found: %s" % (len(coauthors)))

    max_coauthors = MAX_COAUTHORS

    if write_mode:
        if collaboration:
            cname = list(collaboration)[0]
            coauthor_formatted = create_unified_name(cname.lower())
            set_realauthor_data(
                ra_id, "coauthor",
                "%s;;%s" % (authorname_strings["name"], coauthor_formatted))
        else:
            if len(coauthors) <= max_coauthors:
                for coauthor in coauthors:
                    coauthor_formatted = create_unified_name(coauthor.lower())
                    set_realauthor_data(
                        ra_id, "coauthor",
                        "%s;;%s" % (authorname_strings["name"],
                                    coauthor_formatted))
            else:
                hashvalue = hash_coauthor_set(coauthors)
                bconfig.LOGGER.info("|--> Coauthor # > %s. To preserve"
                                    " information, a hash will be stored: %s"
                                    % (max_coauthors, hashvalue))
                set_realauthor_data(
                    ra_id, "coauthor",
                    "%s;;%s" % (authorname_strings["name"], hashvalue))

        return True
    else:
        if collaboration:
            return collaboration
        else:
            return coauthors
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Collects the affiliation entries of a virtual author from the data set.

    Each entry has the form "<YYYY-MM>;;<author name>;;<affiliation>".
    If a real author ID is given (ra_id > -1), every entry is stored as
    "affiliation" data of that real author; with the default of '-1' the
    list of entries is returned instead.

    @param va_id: Virtual author ID to get the info from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: A list of affiliation entries or simply True, if ra_id is set.
    @rtype: list of strings or True if ra_id > -1
    '''
    authorname_id = -1
    bibrec_id = ""

    for record in get_virtualauthor_records(va_id):
        tag = record['tag']

        if tag == "bibrec_id":
            bibrec_id = record['value']
        elif tag == "orig_authorname_id":
            authorname_id = record['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    author_name = authorname_strings["name"]
    bconfig.LOGGER.info("| Reading affiliations for va %s: %s recid %s"
                        % (va_id, author_name, bibrec_id))

    affiliations = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'u', 'a', authorname_strings["db_name"])
    record_date = get_field_values_on_condition(bibrec_id, '269', 'c')

    # Reduce the record date to year and month. Without a date the
    # placeholder 0000-00 is used; a date that does not split into two or
    # three parts gets '10' appended as its month.
    if len(record_date) > 0:
        date_parts = list(record_date)[0].split("-")
    else:
        date_parts = ['0000', '00']

    part_count = len(date_parts)

    if part_count == 3:
        date_parts.pop()
    elif part_count != 2:
        date_parts += ['10']

    affiliation_date = "%s-%s" % (date_parts[0], date_parts[1])

    has_affiliation = bool(affiliations)

    if has_affiliation:
        bconfig.LOGGER.info("|-> Affiliation found: %s" % (affiliations))
    else:
        bconfig.LOGGER.info("|-> No Affiliation for this record. Set to None")
        affiliations = ["None"]

    has_date = affiliation_date != "0000-00"

    if has_date:
        bconfig.LOGGER.info("|-> Affiliation date: %s" % (affiliation_date))
    else:
        bconfig.LOGGER.info("|-> No Affiliation Date set to 0000-00")

    aff_collection = []

    # Entries are only produced when at least one of the two pieces of
    # information (affiliation, date) is actually known.
    if has_affiliation or has_date:
        for affiliation in affiliations:
            bconfig.LOGGER.info(
                "|--> Found Affiliation: %s;;%s;;%s"
                % (affiliation_date, author_name, affiliation))
            aff_collection.append(
                "%s;;%s;;%s" % (affiliation_date, author_name, affiliation))

    if ra_id > -1:
        for entry in aff_collection:
            set_realauthor_data(ra_id, "affiliation", entry)

        return True

    return aff_collection
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
            - Update tables with results of the computation
        - Start personid update procedure

    Returns early (None) when there is no previous 'update_aid' log entry
    or when no papers were modified since that entry.
    '''

    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        # Select the relevant documents once; a set keeps the membership
        # test cheap for long record lists.
        wanted_recids = set(bibrec_ids)
        selected_docs = [row for row in dat.DOC_LIST
                         if row['bibrecid'] in wanted_recids]

        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                               "all loaded docs (%s)"
                               % (len(selected_docs)))

        for docs in selected_docs:
            for author_id in docs['authornameids']:
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                refrec = -1

                if len(refrecs) > 1:
                    # Single-argument call form: prints the same text on
                    # Python 2 and stays valid syntax on Python 3.
                    print("SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!")
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                # NOTE(review): the -1 default is truthy, so the first
                # branch is also taken when no bibref was found at all;
                # the second branch only runs for a falsy refrecs[0].
                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        # select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
            date=last_log[0][2])
        # The new log entry is written before the emptiness check, so the
        # update time is remembered even when there is nothing to do.
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                        comment='bibauthorid_daemon, update_authorid_universe',
                        timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records."
                              % (len(recently_modified)), stream=sys.stdout,
                              verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout, verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])
    else:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    authors = []
    # Index into `authors` by db_name; each db_name is added only once.
    authors_by_db_name = {}
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s"
                                     % (rec))
                continue

            known_author = authors_by_db_name.get(rec_author)

            if known_author is not None:
                known_author['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                known_author = {'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name}
                authors.append(known_author)
                authors_by_db_name[rec_author] = known_author

    total_lnames = len(author_last_names)

    for status, author_last_name in enumerate(author_last_names):
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_authors = len(current_authors)
        bibtask.task_update_progress('Processing %s of %s cluster: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing %s of %s cluster: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames, author_last_name,
                                 total_authors),
                              stream=sys.stdout, verbose=0)
        # Each last-name cluster is processed on a fresh memory cache.
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory"
                               " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                # Report this author's own records. The previous code
                # formatted a stale `rec` left over from an earlier loop.
                bconfig.LOGGER.error(
                    "The author '%s' rec '%s' is not in authornames "
                    "and will be skipped. You might want "
                    "to run authornames update before?"
                    % (current_author['db_name'],
                       ", ".join([str(recid) for recid
                                  in current_author['records']])))
                continue

            try:
                authornamesid = int(authornamesid[0])
            except (IndexError, TypeError, ValueError):
                bconfig.LOGGER.error("Invalid authornames ID!")
                continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(
                    rec, current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(
                            va_id, "orig_authorname_id")

                        # NOTE(review): get_virtualauthor_records appears
                        # to return a list of records, not a bare ID, so
                        # this remove() presumably never matches and the
                        # ValueError path is always taken. Left unchanged
                        # on purpose -- confirm the intent before fixing,
                        # as it affects which VAs get recreated below.
                        for an_list in [row['authornameids']
                                        for row in dat.DOC_LIST
                                        if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except ValueError:
                                # This names id is not in the list...
                                # don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    # The personid update is handed a list of one-element tuples.
    personid_ra_format = [(ra_id,) for ra_id in updated_ras]

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                       "changes visible also on the front end and to "
                       "create person IDs for %s newly created and changed "
                       "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                       "with bibauthorid!")
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Gathers the affiliation entries of a virtual author from the data set.

    The field values are requested with source "API" when running
    standalone or when the runtime option "populate_aid_from_personid" is
    set, and with source "MEM" otherwise. Each resulting entry has the form
    "<YYYY-MM>;;<author name>;;<affiliation>".

    If a real author ID is given (ra_id > -1), the entries are stored as
    "affiliation" data of that real author. With the default of '-1', the
    list of entries is returned.

    @param va_id: Virtual author ID to get the info from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: A list of affiliation entries or simply True, if ra_id is set.
    @rtype: list of strings or True if ra_id > -1
    '''
    use_api = (bconfig.STANDALONE
               or dat.RUNTIME_CONFIG["populate_aid_from_personid"])
    src = "API" if use_api else "MEM"

    authorname_id = -1
    bibrec_id = ""

    for item in get_virtualauthor_records(va_id):
        if item['tag'] == "bibrec_id":
            bibrec_id = item['value']
        elif item['tag'] == "orig_authorname_id":
            authorname_id = item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    va_name = authorname_strings["name"]
    bconfig.LOGGER.info("| Reading affiliations for va %s: %s recid %s"
                        % (va_id, va_name, bibrec_id))

    affiliations = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'u', 'a',
        authorname_strings["db_name"], source=src)
    record_date = get_field_values_on_condition(bibrec_id, '269', 'c',
                                                source=src)

    # Build the "YYYY-MM" stamp: no date -> 0000-00, a three-part date
    # loses its last part, anything that is not two or three parts gets
    # '10' appended as the month.
    if len(record_date) > 0:
        datearray = list(record_date)[0].split("-")
    else:
        datearray = ['0000', '00']

    pieces = len(datearray)

    if pieces == 3:
        datearray.pop()
    elif pieces != 2:
        datearray += ['10']

    affiliation_date = "%s-%s" % (datearray[0], datearray[1])

    is_aff = bool(affiliations)

    if is_aff:
        bconfig.LOGGER.info("|-> Affiliation found: %s" % (affiliations))
    else:
        bconfig.LOGGER.info("|-> No Affiliation for this record. Set to None")
        affiliations = ["None"]

    is_aff_date = affiliation_date != "0000-00"

    if is_aff_date:
        bconfig.LOGGER.info("|-> Affiliation date: %s" % (affiliation_date))
    else:
        bconfig.LOGGER.info("|-> No Affiliation Date set to 0000-00")

    aff_collection = []

    # Nothing is collected when neither an affiliation nor a date is known.
    if is_aff or is_aff_date:
        for affiliation in affiliations:
            bconfig.LOGGER.info("|--> Found Affiliation: %s;;%s;;%s"
                                % (affiliation_date, va_name, affiliation))
            aff_collection.append("%s;;%s;;%s"
                                  % (affiliation_date, va_name, affiliation))

    if ra_id <= -1:
        return aff_collection

    for collected in aff_collection:
        set_realauthor_data(ra_id, "affiliation", collected)

    return True