def nameset_comp(c, lname, mp_variation_set):
    '''
    Computes the set of last name variations for a given last name and
    pushes the result onto the shared variation queue.

    @param c: ID of the calling process (used for debug output)
    @type c: int
    @param lname: last name to compute the variations for
    @type lname: string
    @param mp_variation_set: queue receiving [lname, variations] pairs
    @type mp_variation_set: queue
    '''
    try:
        debugmsg(c, "List_creator: %s working on %s"
                 % (c, lname.encode('UTF-8')))
    except UnicodeError:
        debugmsg(c, "List_creator: %s working on %s"
                 % (c, 'Error encoding name'))

#    old_variations_set = mp_variation_set.get()
#    mp_variation_set.put(old_variations_set)
#
#    if lname in old_variations_set:
#        if bconfig.TABLES_UTILS_DEBUG:
#            print (time.strftime('%H:%M:%S') + ' '
#                   + "List_creator: %s computation DROPPED." % c)
#        return

    dat.reset_mem_cache(True)
    init_authornames(lname)
    nameset = set([x['name'].split(",")[0] for x in dat.AUTHOR_NAMES])
    debugmsg(c, "List_creator: %s computation finished, pushing varset" % c)
    mp_variation_set.put([lname, list(nameset)])
    debugmsg(c, "List_creator: %s current variations pushed" % c)
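
# Illustrative sketch, not part of the original module: one plausible way to
# drive nameset_comp() from short-lived worker processes and collect the
# variation sets it pushes. The helper name and the one-process-per-name
# start/drain/join pattern are assumptions for illustration only.
def _example_collect_name_variations(last_names):
    import multiprocessing

    mp_variation_set = multiprocessing.Queue()
    collected = {}

    for idx, lname in enumerate(last_names):
        worker = multiprocessing.Process(target=nameset_comp,
                                         args=(idx, lname, mp_variation_set))
        worker.start()
        # Drain the result before joining so the worker never blocks on a
        # full queue pipe.
        pushed_lname, variations = mp_variation_set.get()
        collected[pushed_lname] = variations
        worker.join()

    return collected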
def list_creation_process(mp_queue, job_last_names, mp_termination_queue):
    '''
    Sub process to build the pre-clustered last name blocks

    @param mp_queue: queue holding the last name blocks
    @type mp_queue: queue
    @param job_last_names: list of all last names in the db
    @type job_last_names: list of strings
    @param mp_termination_queue: queue holding the exit token for the
        processes to terminate upon finishing all queue elements
    @type mp_termination_queue: queue
    '''
#    job_last_names = sorted(job_last_names, key=lambda k: len(k))
    variations_set = set()
    jl = []

    for lname in job_last_names:
        if lname in variations_set:
            continue

        if bconfig.TABLES_UTILS_DEBUG:
            print (time.strftime('%H:%M:%S') + ' '
                   + "List_creator: working on " + str(lname.encode('UTF-8')))

        dat.reset_mem_cache(True)
        init_authornames(lname)
        nameset = set([x['name'].split(",")[0] for x in dat.AUTHOR_NAMES])

        if bconfig.TABLES_UTILS_DEBUG:
            print (time.strftime('%H:%M:%S') + ' '
                   + "List_creator: computation finished, getting queue")

        jl[:] = mp_queue.get()

        if bconfig.TABLES_UTILS_DEBUG:
            print (time.strftime('%H:%M:%S') + ' '
                   + "List_creator: appending " + str(nameset)
                   + ' with still ' + str(len(jl)) + ' elements in queue')

        jl.append(list(nameset))
        mp_queue.put(jl)

        for n in nameset:
            variations_set.add(n)

    if bconfig.TABLES_UTILS_DEBUG:
        print (time.strftime('%H:%M:%S') + ' '
               + "List_creator: putting exit token")

    mp_termination_queue.put(True)
    return
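
# Illustrative sketch, not part of the original module: wiring
# list_creation_process() into a producer process. The queue is primed with
# an empty job list, which the process repeatedly gets, extends by one name
# block and puts back; the exit token signals completion. Waiting for the
# token before draining avoids stealing the list mid-cycle.
def _example_build_name_blocks(job_last_names):
    import multiprocessing

    mp_queue = multiprocessing.Queue()
    mp_termination_queue = multiprocessing.Queue()
    mp_queue.put([])

    builder = multiprocessing.Process(target=list_creation_process,
                                      args=(mp_queue, job_last_names,
                                            mp_termination_queue))
    builder.start()
    mp_termination_queue.get()    # block until the builder is done
    blocks = mp_queue.get()       # drain before joining
    builder.join()

    return blocks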
def _write_to_files(work_dir, job_lnames):
    '''
    Wrapper around the internal write process. Triggers the write-back of
    the mem cache to the files in the given working directory.

    @param work_dir: where shall the files be stored?
    @type work_dir: string
    @param job_lnames: list of names
    @type job_lnames: list
    '''
    bibtask.task_update_progress('Writing to files in %s' % (work_dir))
    bibtask.write_message("Writing cluster with %s entries to files in %s"
                          % (len(dat.RELEVANT_RECORDS), work_dir,),
                          stream=sys.stdout, verbose=0)

    if not os.path.exists(work_dir):
        os.mkdir(work_dir)

    write_mem_cache_to_files(work_dir, job_lnames)
    dat.reset_mem_cache(True)
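
# Illustrative sketch, hypothetical helper: how a grid-preparation step might
# flush one computed cluster to its own job directory via _write_to_files().
# The base_dir/"job_NNN" layout is an assumption for illustration only.
def _example_flush_cluster(base_dir, job_number, job_lnames):
    work_dir = os.path.join(base_dir, "job_%03d" % job_number)
    _write_to_files(work_dir, job_lnames)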
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember the time in the log
        - Get all authors on all papers
        - Extract the collection of last names
        - For each last name:
            - Populate the mem cache with cluster data
            - Delete updated records and their virtual authors from the
              mem cache
            - Create virtual authors for new and updated records
            - Start the matching algorithm
        - Update the tables with the results of the computation
        - Start the personid update procedure
    '''

    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        num_docs = len([row for row in dat.DOC_LIST
                        if row['bibrecid'] in bibrec_ids])
        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                               "all loaded docs (%s)" % (num_docs))

        for docs in [row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids]:
            for author_id in docs['authornameids']:
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                refrec = -1

                if len(refrecs) > 1:
                    bconfig.LOGGER.warning("Unexpected: several bibrefs on "
                                           "one paper for the same author "
                                           "name. Using the first one.")
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                # The original tested 'if refrec', which is also true for
                # the -1 sentinel; compare against the sentinel explicitly.
                if refrec != -1 and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        #select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
            date=last_log[0][2])
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                        comment='bibauthorid_daemon, update_authorid_universe',
                        timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records."
                              % (len(recently_modified)),
                              stream=sys.stdout, verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout, verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])
    else:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s" % (rec))
                continue

            author_in_list = [row for row in authors
                              if row['db_name'] == rec_author]

            if author_in_list:
                for upd in [row for row in authors
                            if row['db_name'] == rec_author]:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name})

    for status, author_last_name in enumerate(author_last_names):
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)
        bibtask.task_update_progress('Processing %s of %s cluster: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing %s of %s cluster: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames,
                                 author_last_name, total_authors),
                              stream=sys.stdout, verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into "
                               "memory to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                bconfig.LOGGER.error("The author '%s' is not in authornames "
                                     "and will be skipped. You might want "
                                     "to run the authornames update first."
                                     % (current_author['db_name']))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to "
                                     "any document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(
                    rec, current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(
                            va_id, "orig_authorname_id")

                        for an_list in [row['authornameids']
                                        for row in dat.DOC_LIST
                                        if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except ValueError:
                                # This names id is not in the list: don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

    bconfig.LOGGER.log(25, "Done updating authorid universe.")

    personid_ra_format = []

    for ra_id in updated_ras:
        personid_ra_format.append((ra_id,))

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                           "changes visible also on the front end and to "
                           "create person IDs for %s newly created and "
                           "changed authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                           "with bibauthorid!")
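
# Illustrative sketch, not the original task plumbing: the call shape a
# bibsched task callback might use to trigger the universe update. The real
# daemon registers its own callbacks through bibtask; this only shows how
# the function above is meant to be invoked.
def _example_task_run_core():
    bibtask.task_update_progress('Updating authorid universe')
    _update_authorid_universe()
    return True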
def computation_process_starter(i, mp_termination_queue, job_mp_queue,
                                db_write_lock,
                                populate_doclist=True,
                                process_doclist=True,
                                process_orphans=False,
                                print_stats=True,
                                write_to_db=False):
    '''
    Sub process that starts the disambiguation process on a specified
    set of authors.

    @param i: ID of the process (int between 0 and MAX_PROCESSES in bconfig)
    @type i: int
    @param mp_termination_queue: queue holding the exit token for the
        processes to terminate upon finishing all queue elements
    @type mp_termination_queue: queue
    @param job_mp_queue: queue holding the last name blocks
    @type job_mp_queue: queue
    @param db_write_lock: shields the database from too many concurrent
        accesses
    @type db_write_lock: multiprocessing.Lock
    @param populate_doclist: shall we populate the document list w/ the
        authors?
    @type populate_doclist: boolean
    @param process_doclist: process the document list?
    @type process_doclist: boolean
    @param process_orphans: process the orphans left after the first process?
    @type process_orphans: boolean
    @param print_stats: print statistics after each computation?
    @type print_stats: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean
    '''
    while True:
        debugmsg(i, "getting name from queue")

        if job_mp_queue.qsize() > 0:
            job_last_names = job_mp_queue.get()
            debugmsg(i, "got queue item! %s items left in queue"
                     % job_mp_queue.qsize())
        else:
            debugmsg(i, "Queue is currently empty...")

            if not mp_termination_queue.empty():
                debugmsg(i, "Exit token there, Process %s salutes to quit!"
                         % i)
                return
            else:
                debugmsg(i, "Exit token not present, continuing in 15s!")
                time.sleep(15)
                continue

        last_name_queue = Queue.Queue()
        last_name_queue.put(sorted(job_last_names))
        gc.collect()

        while True:
            dat.reset_mem_cache(True)
            gc.collect()

            if last_name_queue.empty():
                bconfig.LOGGER.log(25, "Done with all names.")
                break

            debugmsg(i, "starting with queue: " + str(last_name_queue.queue))

            lname_list = last_name_queue.get()
            lname = None

            if lname_list:
                lname = lname_list[0]
                del lname_list[0]
            else:
                bconfig.LOGGER.warning("Got an empty Queue element. "
                                       "Queue seems corrupted.")
                continue

#            bconfig.LOGGER.log(25, "Processing: %s (%d/%d)."
#                               % (lname, status, total))

            if populate_doclist:
                populate_doclist_for_author_surname(lname, job_last_names)

            start_computation(process_orphans=process_orphans,
                              process_doclist=process_doclist,
                              print_stats=print_stats)
            post_remove_names = set()

            # The following snippet finds additionally processed last names
            # and removes them from the processing queue.
            # E.g. 't hooft and t'hooft
            for name in [row['name'] for row in dat.AUTHOR_NAMES
                         if not row['processed']]:
                potential_removal = "%s" % (name.split(',')[0])

                if not potential_removal == "%s" % (lname):
                    post_remove_names.add(potential_removal)

            if len(post_remove_names) > 0:
                removed = 0
                removed_names = []

                for post_remove_name in post_remove_names:
                    if post_remove_name in lname_list:
                        lname_list.remove(post_remove_name)
                        removed_names.append(post_remove_name)
                        removed += 1

                bconfig.LOGGER.log(25, "-> Removed %s entries from the "
                                       "computation list: %s"
                                   % (removed, removed_names))

            if lname_list:
                last_name_queue.put(lname_list)

            if write_to_db:
                if MP_ENABLED:
                    db_write_lock.acquire()

                if dat.ID_TRACKER:
                    try:
                        write_mem_cache_to_tables()
                    except Exception, emsg:
                        bconfig.LOGGER.error("An error occurred while "
                                             "writing to the db: %s" % emsg)
                else:
                    bconfig.LOGGER.info("The ID tracker appears to be empty. "
                                        "Nothing will be written to the "
                                        "database from this job. That's ok "
                                        "when excluding collections. Last "
                                        "processed last name: %s" % lname)

                if MP_ENABLED:
                    db_write_lock.release()

            dat.reset_mem_cache(True)
            gc.collect()
def populate_structs_from_files(work_dir, results=False):
    '''
    Reads the content of the files in 'work_dir' and tries to load the
    contained data into the respective memory caches. These files are
    created by the daemon's -G or --prepare-grid function.

    The files to be read are:
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param work_dir: the directory to read the files from
    @type work_dir: string
    @param results: if True, skip the files that hold only intermediate
        results (records, ids and the RA/VA cache)
    @type results: boolean

    @raise IOError: if the job directory does not exist
    @raise Exception: if one of the data files cannot be read
    '''
    if work_dir.endswith("/"):
        work_dir = work_dir[:-1]

    bconfig.LOGGER.log(25, "Reading files from %s to mem cache" % (work_dir,))

    if not os.path.exists(work_dir):
        bconfig.LOGGER.critical("Job directory does not exist. Aborting.")
        raise IOError

    dat.reset_mem_cache(True)

    # Map each data file to the mem cache structure it populates.
    file_to_struct = [("authornames.dat", "AUTHOR_NAMES"),
                      ("virtual_authors.dat", "VIRTUALAUTHORS"),
                      ("virtual_author_data.dat", "VIRTUALAUTHOR_DATA"),
                      ("virtual_author_clusters.dat",
                       "VIRTUALAUTHOR_CLUSTERS"),
                      ("virtual_author_cluster_cache.dat",
                       "VIRTUALAUTHOR_CLUSTER_CACHE"),
                      ("realauthors.dat", "REALAUTHORS"),
                      ("realauthor_data.dat", "REALAUTHOR_DATA"),
                      ("doclist.dat", "DOC_LIST")]

    if not results:
        file_to_struct += [("records.dat", "RELEVANT_RECORDS"),
                           ("ids.dat", "ID_TRACKER"),
                           ("ra_va_cache.dat", "RA_VA_CACHE")]

    try:
        for file_name, struct_name in file_to_struct:
            dfile = open("%s/%s" % (work_dir, file_name), "r")
            setattr(dat, struct_name, loads(decompress(dfile.read())))
            dfile.close()
    except IOError, message:
        bconfig.LOGGER.exception("IOError while trying to read from file %s."
                                 % (message,))
        raise Exception()
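
# Illustrative sketch, not part of the original module: restoring a grid
# job's mem cache from disk, e.g. before running the disambiguation locally
# on data prepared with --prepare-grid. The log message is an assumption
# added for illustration.
def _example_load_grid_job(work_dir):
    populate_structs_from_files(work_dir)
    bconfig.LOGGER.log(25, "Loaded %s author name entries from %s"
                       % (len(dat.AUTHOR_NAMES), work_dir))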