def __init__(self, CUI):
    """Wrap a single concept identified by its CUI.

    Lazily initializes the class-wide concept storage on first use, so
    a Concept can be created without an explicit init_storage() call.
    """
    # The concept's identifier. NOTE(review): __cui / __storage are
    # name-mangled; this presumably lives inside class Concept - confirm.
    self.__cui = CUI
    if Concept.__storage is None:
        # First Concept ever created: fall back to the default storage.
        logging.info("Initializing concept storage from default location."
                     " If this isn't what you want, call "
                     "Concept.init_storage() before allocating a Concept")
        Concept.init_storage()
def __init__(self, tree, rule_data=None, skip_unknown_concepts=True,
             accepted_types=None):
    """Create a converter bound to a term tree.

    tree: the term tree the converter will work against.
    rule_data: conversion rules; when None they are unpickled from
        _DEFAULT_CONVERTER_DATA.
    skip_unknown_concepts: whether unknown concepts are skipped.
    accepted_types: set of accepted term type codes; None (the default)
        means {'a', 'i'}. None is used instead of a set literal so the
        mutable default is not shared across every Converter instance.
    """
    logging.debug("Creating Converter with tree %r", tree)
    self._tree = tree
    if rule_data is None:
        # Close the data file promptly instead of leaking the handle.
        rule_file = open(_DEFAULT_CONVERTER_DATA, "rb")
        try:
            rule_data = pickle.load(rule_file)
        finally:
            rule_file.close()
        logging.info("Using converter data from %r",
                     _DEFAULT_CONVERTER_DATA)
    self._data = rule_data
    self._extra_checktags = set()
    self._skip_unknown = skip_unknown_concepts
    # Build a fresh set per instance; preserves the original default.
    self._accepted_types = (set(['a', 'i']) if accepted_types is None
                            else accepted_types)
def __init__(self, tree, rule_data=None, skip_unknown_concepts=True,
             accepted_types=None):
    """Create a converter bound to a term tree.

    tree: the term tree the converter will work against.
    rule_data: conversion rules; when None they are unpickled from
        _DEFAULT_CONVERTER_DATA.
    skip_unknown_concepts: whether unknown concepts are skipped.
    accepted_types: set of accepted term type codes; None (the default)
        stands in for {'a', 'i'} so that a single mutable set is not
        shared by every instance of the class.
    """
    logging.debug("Creating Converter with tree %r", tree)
    self._tree = tree
    if rule_data is None:
        # Load the default rules and close the file instead of leaking it.
        rule_file = open(_DEFAULT_CONVERTER_DATA, "rb")
        try:
            rule_data = pickle.load(rule_file)
        finally:
            rule_file.close()
        logging.info("Using converter data from %r",
                     _DEFAULT_CONVERTER_DATA)
    self._data = rule_data
    self._extra_checktags = set()
    self._skip_unknown = skip_unknown_concepts
    # Fresh per-instance set; same value the old default produced.
    self._accepted_types = (set(['a', 'i']) if accepted_types is None
                            else accepted_types)
def __init__(self, filename="*&$#$%#", file_mode="r", cachesize=1048576):
    """Open the tree's persistent term database.

    filename: path to the tree data. The junk-string default is a
        sentinel for "not specified" (see comment below on why None
        cannot be used).
    file_mode: mode the persistent store is opened with.
    cachesize: cache size handed straight to StringDBDict.
    """
    # If the filename isn't specified, use the default one (None has a
    # special meaning, so we can't use it - it means create a temp file)
    if filename == "*&$#$%#":
        filename = _DEFAULT_TREE_DATA
    logging.info("Initializing tree with data from %r", filename)
    self._tree = StringDBDict(persistent_file=filename,
                              file_mode=file_mode, cachesize=cachesize)
    self._invlookup = None  # Init the inverse name lookup database lazily
    self._origname = filename
    # Sorted list of all terms (Python 2: keys() returns a list).
    self.terms = self._tree.keys()
    self.terms.sort()
    # This one is for speedy retrieval and indexing
    self._term_list_as_dict = None
    self._search_dict = None
    self.num_terms = len(self.terms)
    return
def graph_and_rank(self, article):
    """Build the article's graph, map it to a link matrix, and rank it.

    Returns the ranker's node list. Raises CouldNotRank when the article
    produces an empty matrix or the ranker fails with a ValueError.
    """
    link_matrix = self.graph_article(article).as_mapped_link_matrix()
    if len(link_matrix) == 0:
        logging.info("Skipping article %r. It has an empty matrix.",
                     article)
        raise CouldNotRank("Article %r is not rankable." % article)
    try:
        return self._ranker.evaluate(link_matrix)
    except ValueError:
        logging.info("%r returned an exception while ranking %r. "
                     "Skipping.", self._ranker, article)
        raise CouldNotRank("There was an exception while ranking %r."
                           % article)
def graph_and_rank(self, item):
    """Graph the item, turn the graph into a link matrix, and rank it.

    Returns the ranker's node list. Raises CouldNotRank when the item
    produces an empty matrix or the ranker fails with a ValueError.
    """
    the_graph = self.graph_item(item)
    logging.log(ULTRADEBUG, "The item graph is %r.", the_graph)
    link_matrix = the_graph.as_mapped_link_matrix()
    if len(link_matrix) == 0:
        logging.info("Skipping item %r. It has an empty matrix.", item)
        raise CouldNotRank("Item %r is not rankable." % item)
    try:
        return self._ranker.evaluate(link_matrix)
    except ValueError:
        logging.info("%r returned an exception while ranking %r. "
                     "Skipping.", self._ranker, item)
        raise CouldNotRank("There was an exception while ranking %r."
                           % item)
def __init__(self, filename="*&$#$%#", file_mode="r", cachesize=1048576):
    """Open the tree's persistent term database.

    The junk-string default for filename is a sentinel meaning "use the
    default data file"; None cannot serve that purpose because
    StringDBDict treats None as a request for a temporary file.
    """
    if filename == "*&$#$%#":
        filename = _DEFAULT_TREE_DATA
    logging.info("Initializing tree with data from %r", filename)
    self._tree = StringDBDict(persistent_file=filename,
                              file_mode=file_mode,
                              cachesize=cachesize)
    # The inverse name lookup database is built lazily.
    self._invlookup = None
    self._origname = filename
    # Sorted term list (keys() is a plain list under Python 2).
    self.terms = sorted(self._tree.keys())
    # Lazily-built caches for speedy retrieval and indexing.
    self._term_list_as_dict = None
    self._search_dict = None
    self.num_terms = len(self.terms)
def graph_and_rank(self, article):
    """Turn the article into a graph and a link matrix, then rank it.

    Returns the ranked node list. Raises CouldNotRank for an empty link
    matrix or a ValueError from the ranker.
    """
    matrix = self.graph_article(article).as_mapped_link_matrix()
    if len(matrix) == 0:
        logging.info("Skipping article %r. It has an empty matrix.",
                     article)
        raise CouldNotRank("Article %r is not rankable." % article)
    try:
        return self._ranker.evaluate(matrix)
    except ValueError:
        logging.info("%r returned an exception while ranking %r. "
                     "Skipping.", self._ranker, article)
        raise CouldNotRank("There was an exception while ranking %r."
                           % article)
def graph_and_rank(self, item):
    """Graph the item, map the graph to a link matrix, and rank it.

    Returns the ranked node list. Raises CouldNotRank for an empty
    matrix or a ValueError raised by the ranker.
    """
    built_graph = self.graph_item(item)
    logging.log(ULTRADEBUG, "The item graph is %r.", built_graph)
    matrix = built_graph.as_mapped_link_matrix()
    if len(matrix) == 0:
        logging.info("Skipping item %r. It has an empty matrix.", item)
        raise CouldNotRank("Item %r is not rankable." % item)
    try:
        return self._ranker.evaluate(matrix)
    except ValueError:
        logging.info("%r returned an exception while ranking %r. "
                     "Skipping.", self._ranker, item)
        raise CouldNotRank("There was an exception while ranking %r."
                           % item)
def run(self):
    """Perform the evaluation.

    Iterates over every article from self._reader: ranks it, converts
    the ranked nodes to terms, cuts them at the ranking cutoff, and
    evaluates the result against the article's MEDLINE MeSH headings
    (once against all headings, once against major headings only, the
    latter prefixed "mh_"). Per-article results are keyed by set_id and
    written out at the end.
    """
    logging.info("Starting workflow %r run", self)
    all_results = {}
    evaluator = self.create_evaluator()
    count = 0
    for each_article in self._reader:
        count += 1
        logging.info("Working on article %d: %r", count, each_article)
        if not self.include_article(each_article):
            logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                        " criteria.", each_article)
            continue
        try:
            ranked_article = self.graph_and_rank(each_article)
        except CouldNotRank:
            # Unrankable articles are skipped without a result entry.
            continue
        converted_terms = self.convert(ranked_article)
        # Keep only terms scoring at or above the cutoff.
        cut_terms = converted_terms.terms_higher_than_or_equal_to(
            self._ranking_cutoff)
        # NOTE(review): indexing cut_terms[-1]/[0] below assumes at least
        # one term survives the cutoff - confirm that invariant.
        logging.debug("Lowest-ranking term is term #%d out of %d"
                      " (score=%1.5f, highest score=%1.5f)",
                      len(cut_terms), len(converted_terms),
                      [x[1] for x in cut_terms][-1],
                      [x[1] for x in cut_terms][0])
        # Gold standard: MeSH headings from the article's MEDLINE record.
        medline_record_mesh_terms = ExpressionList().from_medline(
            each_article.set_id.article_record().mesh_headings)
        flat_medline = medline_record_mesh_terms.flatten()
        flattened_terms = self.flatten_generated_terms(flat_medline,
                                                       cut_terms)
        flattened_terms = self.limit_length(flat_medline, flattened_terms)
        if len(flat_medline) == 0:
            logging.warn("No gold standard available for article %r. "
                         "Omitting it from the result set.", each_article)
            continue
        eval_result = self.perform_evaluation(each_article, evaluator,
                                              flat_medline,
                                              flattened_terms)
        flattened_major_headings = \
            medline_record_mesh_terms.major_headings()
        logging.debug("Original headings: %r Major headings: %r",
                      medline_record_mesh_terms,
                      flattened_major_headings)
        # Second evaluation against major headings only.
        mh_result_temp = self.perform_evaluation(each_article, evaluator,
                                                 flattened_major_headings,
                                                 flattened_terms)
        mh_result = NamedResultSet("mh_", mh_result_temp)
        # Compute the total recall, too
        total_recall = self.compute_total_recall(flat_medline,
                                                 converted_terms)
        eval_result.add(total_recall)
        # Unify the result sets
        all_results[each_article.set_id] = eval_result | mh_result
    logging.info("Writing out results.")
    self.output(all_results)
    self.output_metadata()
    return
def run(self):
    """Perform the evaluation.

    For each article from self._reader: rank it, convert the ranking to
    terms, apply the ranking cutoff, then score the result against the
    article's MEDLINE MeSH headings twice (all headings, and major
    headings only - the latter results are prefixed "mh_"). The unified
    per-article results are written out when the loop finishes.
    """
    logging.info("Starting workflow %r run", self)
    all_results = {}
    evaluator = self.create_evaluator()
    count = 0
    for each_article in self._reader:
        count += 1
        logging.info("Working on article %d: %r", count, each_article)
        if not self.include_article(each_article):
            logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                        " criteria.", each_article)
            continue
        try:
            ranked_article = self.graph_and_rank(each_article)
        except CouldNotRank:
            # Skip unrankable articles entirely.
            continue
        converted_terms = self.convert(ranked_article)
        cut_terms = converted_terms.terms_higher_than_or_equal_to(
            self._ranking_cutoff)
        # NOTE(review): cut_terms[-1]/[0] below assume a non-empty cut
        # list - confirm the cutoff always leaves at least one term.
        logging.debug("Lowest-ranking term is term #%d out of %d"
                      " (score=%1.5f, highest score=%1.5f)",
                      len(cut_terms), len(converted_terms),
                      [x[1] for x in cut_terms][-1],
                      [x[1] for x in cut_terms][0])
        # The gold standard comes from the MEDLINE record's headings.
        medline_record_mesh_terms = ExpressionList().from_medline(
            each_article.set_id.article_record().mesh_headings)
        flat_medline = medline_record_mesh_terms.flatten()
        flattened_terms = self.flatten_generated_terms(flat_medline,
                                                       cut_terms)
        flattened_terms = self.limit_length(flat_medline, flattened_terms)
        if len(flat_medline) == 0:
            logging.warn("No gold standard available for article %r. "
                         "Omitting it from the result set.", each_article)
            continue
        eval_result = self.perform_evaluation(each_article, evaluator,
                                              flat_medline,
                                              flattened_terms)
        flattened_major_headings = \
            medline_record_mesh_terms.major_headings()
        logging.debug("Original headings: %r Major headings: %r",
                      medline_record_mesh_terms,
                      flattened_major_headings)
        mh_result_temp = self.perform_evaluation(each_article, evaluator,
                                                 flattened_major_headings,
                                                 flattened_terms)
        mh_result = NamedResultSet("mh_", mh_result_temp)
        # Compute the total recall, too
        total_recall = self.compute_total_recall(flat_medline,
                                                 converted_terms)
        eval_result.add(total_recall)
        # Unify the result sets
        all_results[each_article.set_id] = eval_result | mh_result
    logging.info("Writing out results.")
    self.output(all_results)
    self.output_metadata()
    return
def processor(workflow_class, graph_builder_constructor, graph_builder_params,
              ranker_constructor, ranker_params, eval_parameters,
              ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
              distance_function, umls_converter_data_filename,
              extra_data_name, extra_data_contents, my_input_queue,
              my_output_queue, my_own_name=None):
    """Worker loop: build one workflow instance and service requests.

    Reads articles from my_input_queue until the 'STOP' sentinel
    arrives. Each article is processed by the workflow; the workflow's
    accumulated results are pushed onto my_output_queue and then
    cleared. CouldNotRank requests are skipped; any other exception is
    logged with its traceback and re-raised, ending the worker.
    """
    logging.info("Setting up worker.")
    if my_own_name is not None:
        # Rename the worker so it is identifiable in process listings.
        proctitle.setproctitle(my_own_name)
    my_workflow = workflow_class(graph_builder_constructor,
                                 graph_builder_params,
                                 ranker_constructor,
                                 ranker_params,
                                 eval_parameters,
                                 ranking_cutoff,
                                 mesh_tree_filename,
                                 distance_matrix_filename,
                                 distance_function,
                                 umls_converter_data_filename
                                 )
    if extra_data_name is not None:
        my_workflow.__setattr__(extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request = my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request == 'STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                my_workflow.process_article(request)
                # Recover the article, push it on the output queue
                my_output_queue.put(my_workflow.all_results)
                # Clear the output queue
                my_workflow.all_results = {}
            except CouldNotRank:
                #my_input_queue.put(request) # On error, push the task
                #                            # back into the queue
                logging.info("Skipping unrankable article.")
            except:
                # Deliberately broad: log the traceback, then re-raise
                # so the worker dies visibly instead of hanging.
                logging.warn("EXCEPTION RAISED: \n%s",
                             traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
    logging.log(ULTRADEBUG, "Ending processor execution.")
    return
def processor(workflow_class, graph_builder_constructor, graph_builder_params,
              ranker_constructor, ranker_params, eval_parameters,
              ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
              distance_function, umls_converter_data_filename,
              extra_data_name, extra_data_contents, my_input_queue,
              my_output_queue, my_own_name=None):
    """Worker loop: construct a workflow and process queued articles.

    Blocks on my_input_queue until the 'STOP' sentinel arrives. For each
    article, runs workflow.process_article, pushes the workflow's
    accumulated results to my_output_queue, and clears them.
    CouldNotRank articles are skipped; any other exception is logged
    with a traceback and re-raised, terminating the worker.
    """
    logging.info("Setting up worker.")
    if my_own_name is not None:
        # Give the worker a recognizable process title.
        proctitle.setproctitle(my_own_name)
    my_workflow = workflow_class(graph_builder_constructor,
                                 graph_builder_params,
                                 ranker_constructor,
                                 ranker_params,
                                 eval_parameters,
                                 ranking_cutoff,
                                 mesh_tree_filename,
                                 distance_matrix_filename,
                                 distance_function,
                                 umls_converter_data_filename)
    if extra_data_name is not None:
        my_workflow.__setattr__(extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request = my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request == 'STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                my_workflow.process_article(request)
                # Recover the article, push it on the output queue
                my_output_queue.put(my_workflow.all_results)
                # Clear the output queue
                my_workflow.all_results = {}
            except CouldNotRank:
                #my_input_queue.put(request) # On error, push the task
                #                            # back into the queue
                logging.info("Skipping unrankable article.")
            except:
                # Broad on purpose: record the traceback, then re-raise
                # so the failure is not silently swallowed.
                logging.warn("EXCEPTION RAISED: \n%s",
                             traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
    logging.log(ULTRADEBUG, "Ending processor execution.")
    return
def build_idf_from_file(self, file_reader, default_score=None):
    """Build an IDF-style term dictionary from a corpus of articles.

    file_reader: iterable of articles; each article has .set_id.pmid and
        .lines, whose entries may or may not carry a .CUI attribute.
    default_score: currently unused - the scoring branch that consumed
        it is commented out below.

    A previously-built dictionary is loaded from a cache file when
    possible; otherwise the corpus is scanned, each article contributing
    at most 1 to a CUI's document count, and self[cui] is set to
    log(count/df) + 1.0 before a fresh cache is dumped.
    """
    tempdict = {}
    logging.info("Building the term frequency dictionary")
    # NOTE(review): count starts at 1 and is incremented once per kept
    # article, so it ends at #articles + 1 - presumably so the log()
    # argument stays > 1 for terms present in every article; confirm.
    count = 1
    logging.debug("Checking for a cache file, and loading from it.")
    try:
        self.populate_from_cache(
            self.cache_file_name(file_reader.original_file.name))
        logging.info("Loaded from cache. It's not necessary to build.")
        return
    except:
        # Deliberately broad: any cache failure just triggers a rebuild.
        logging.debug("Nope. Proceeding with building the dictionary.")
    for article in file_reader:
        logging.debug("Processing article %r (number %d) for the term"
                      " frequency dictionary", article, count)
        if article.set_id.pmid < 0:
            logging.warn("Article with unknown PubMed ID - skipping")
            continue
        count += 1
        tempcounts = {}
        for line in article.lines:
            try:
                this_cui = line.CUI
            except AttributeError:
                # Lines without a CUI don't contribute.
                continue
            # Use the confidence as the score if no default is specified
            #if default_score is None:
            #    try:
            #        this_score=line.confidence
            #    except AttributeError:
            #        continue
            #else:
            #    this_score=default_score
            #tempdict[this_cui]=tempdict.get(this_cui, 0.0)+this_score
            tempcounts[this_cui] = 1
        # Now have all the CUIs that appeared in the article. Update
        # the total counts.
        for k in tempcounts:
            tempdict[k] = tempdict.get(k, 0) + 1
    logging.debug("Built a dictionary with %d items. Computing IDFs.",
                  len(tempdict))
    # max_value=max(tempdict.itervalues())
    #logging.debug("Saving it to permanent storage.")
    for k, v in tempdict.iteritems():
        self[k] = math.log(count / float(v)) + 1.0
    logging.info("Done building the dictionary. Dumping it to a cache "
                 "file.")
    self.dump_to_cache(self.cache_file_name(
        file_reader.original_file.name))
    return
def build_idf_from_file(self, file_reader, default_score=None):
    """Build an IDF-style term dictionary from a corpus of articles.

    Tries the cache first; on any failure rebuilds from the corpus.
    Each article contributes at most 1 to a CUI's document count, and
    self[cui] ends up as log(count/df) + 1.0. default_score is unused -
    the code that consumed it is commented out below.
    """
    tempdict = {}
    logging.info("Building the term frequency dictionary")
    # NOTE(review): count starts at 1, ending at #articles + 1 - likely
    # intentional smoothing so log() stays positive; confirm.
    count = 1
    logging.debug("Checking for a cache file, and loading from it.")
    try:
        self.populate_from_cache(
            self.cache_file_name(file_reader.original_file.name))
        logging.info("Loaded from cache. It's not necessary to build.")
        return
    except:
        # Broad by design: a bad/missing cache simply means "rebuild".
        logging.debug("Nope. Proceeding with building the dictionary.")
    for article in file_reader:
        logging.debug("Processing article %r (number %d) for the term"
                      " frequency dictionary", article, count)
        if article.set_id.pmid < 0:
            logging.warn("Article with unknown PubMed ID - skipping")
            continue
        count += 1
        tempcounts = {}
        for line in article.lines:
            try:
                this_cui = line.CUI
            except AttributeError:
                continue
            # Use the confidence as the score if no default is specified
            #if default_score is None:
            #    try:
            #        this_score=line.confidence
            #    except AttributeError:
            #        continue
            #else:
            #    this_score=default_score
            #tempdict[this_cui]=tempdict.get(this_cui, 0.0)+this_score
            tempcounts[this_cui] = 1
        # Now have all the CUIs that appeared in the article. Update
        # the total counts.
        for k in tempcounts:
            tempdict[k] = tempdict.get(k, 0) + 1
    logging.debug("Built a dictionary with %d items. Computing IDFs.",
                  len(tempdict))
    # max_value=max(tempdict.itervalues())
    #logging.debug("Saving it to permanent storage.")
    for k, v in tempdict.iteritems():
        self[k] = math.log(count / float(v)) + 1.0
    logging.info("Done building the dictionary. Dumping it to a cache "
                 "file.")
    self.dump_to_cache(
        self.cache_file_name(file_reader.original_file.name))
    return
def _generate_normalization_factors(self):
    """Fill self.normfactors with the sum of each matrix row.

    Rows that sum to exactly zero are assigned a factor of 1.0 so that
    later normalization divisions remain safe.
    """
    import operator
    logging.info("Generating array of normalization factors. This is a "
                 "slow operation. Please wait.")
    for row_index in xrange(self._height):
        logging.debug("Generating normalization factor for row %d",
                      row_index)
        current_row = self._get_row(row_index)
        logging.log(ULTRADEBUG, "Row %d contains: %s", row_index,
                    current_row)
        # Sum every element of the row; reduce keeps the original
        # semantics exactly.
        row_total = reduce(operator.add, current_row)
        if row_total == 0.0:
            logging.info("Row %d in the matrix adds up to 0. This may "
                         "be a problem, depending on your evaluation "
                         "function. Since this is a normalization "
                         "calculation, it will be replaced by 1.",
                         row_index)
            row_total = 1.0
        self.normfactors[row_index] = row_total
        logging.log(ULTRADEBUG, "Normalization factor for row %d=%1.5f",
                    row_index, row_total)
    logging.info("Normalization factor generation done.")
def _generate_normalization_factors(self):
    """Computes the array of normalization factors for the current
    matrix.

    Each row's factor is the sum of its elements; all-zero rows get a
    factor of 1.0 so later normalization divisions stay safe.
    """
    import operator  # local import: only needed on this slow path
    logging.info("Generating array of normalization factors. This is a "
                 "slow operation. Please wait.")
    for i in xrange(self._height):
        logging.debug("Generating normalization factor for row %d", i)
        # Add all of the elements of the row together
        matrix_row = self._get_row(i)
        logging.log(ULTRADEBUG, "Row %d contains: %s", i, matrix_row)
        this_row = reduce(operator.add, matrix_row)
        if this_row == 0.0:
            logging.info("Row %d in the matrix adds up to 0. This may "
                         "be a problem, depending on your evaluation function. Since "
                         "this is a normalization calculation, it will be replaced by "
                         "1.", i)
            this_row = 1.0
        self.normfactors[i] = this_row
        logging.log(ULTRADEBUG, "Normalization factor for row %d=%1.5f",
                    i, this_row)
    logging.info("Normalization factor generation done.")
def multi_processor(reader, workflow_class, graph_builder_constructor,
                    graph_builder_params, ranker_constructor,
                    ranker_params, eval_parameters, ranking_cutoff,
                    mesh_tree_filename, distance_matrix_filename,
                    distance_function, umls_converter_data_filename,
                    umls_concept_data_filename, extra_data_name,
                    extra_data_contents, output_file, num_processes=None,
                    queue_size=None, output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation.
    Multiprocessing notes: It's the responsibility of the caller to make
    sure that extra_data_contents, if any, are multiprocessing-safe. For
    example, by using a SyncManager and Namespace and passing the proxy.
    See umls/concept for an example.

    Articles from reader are dispatched round-robin to num_processes
    worker processes running processor(); a dedicated output process
    drains their shared output queue.
    """
    if num_processes is None:
        num_processes = cpu_count()
    # NOTE(review): queue_size also defaults to None, yet it is used in
    # arithmetic (2*queue_size) below, which would raise TypeError -
    # confirm callers always pass an integer queue_size.
    if performance_tuning:
        # Since reading the file involves an awful lot of object creation
        # and destruction we'll tweak the gc adjustments to sweep less
        # frequently. IOW - we have a LOT of short-lived objects. No sense
        # garbage-collecting the latter generations very often.
        # (this is about 10x, 5x, and 5x the usual)
        original_threshold = gc.get_threshold()
        gc.set_threshold(10 * original_threshold[0],
                         5 * original_threshold[1],
                         5 * original_threshold[1])
        original_check_interval = sys.getcheckinterval()
        # Similarly, we'll try to minimize overhead from thread switches
        # 5x usual value
        sys.setcheckinterval(5 * original_check_interval)
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()
    proctitle.setproctitle("MEDRank-main")
    processes = []
    logging.info("Creating %d worker processes.", num_processes)
    #task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    # One task queue per worker; work is dispatched round-robin below.
    task_queues = [Queue(queue_size) for x in xrange(num_processes)]
    this_output_queue = Queue(2 * queue_size)
    # Create an output processor
    output_processor = Process(target=output_callback,
                               args=(output_file,
                                     this_output_queue,
                                     output_headers_callback,
                                     output_item_callback))
    output_processor.start()
    for i in xrange(num_processes):
        this_process = Process(target=processor,
                               args=(workflow_class,
                                     graph_builder_constructor,
                                     graph_builder_params,
                                     ranker_constructor,
                                     ranker_params,
                                     eval_parameters,
                                     ranking_cutoff,
                                     mesh_tree_filename,
                                     distance_matrix_filename,
                                     distance_function,
                                     umls_converter_data_filename,
                                     extra_data_name,
                                     extra_data_contents,
                                     task_queues[i],
                                     this_output_queue,
                                     "MEDRank-Worker-%d" % i),
                               name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created process: %r", this_process)
        this_process.start()
        processes.append((this_process, this_output_queue, task_queues[i]))
    all_results = {}
    count = 0
    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to
    # waiting on semlocks
    for each_article in reader:
        count += 1
        #queues_and_sizes=[(task_queues[x].qsize(), x)
        #                  for x in xrange(num_processes)]
        #queues_and_sizes.sort()
        #target_process=queues_and_sizes[0][1]
        #logging.info("Dispatching article %d: %r", count, each_article)
        target_process = (count-1) % num_processes #Lowest-loaded process first.
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id,
                     processes[target_process][0].name)
        task_queues[target_process].put(each_article)
        #task_queue[target_process].put(each_article)
        #task_queue.put(each_article)
        #logging.info("The task queue is approximately %d items long.",
        #             task_queue.qsize())
    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results = {}
    alive_processes = [x for x in processes if x[0].is_alive()]
    remaining_processes = len(alive_processes)
    logging.info("There are %d processes (out of %d) still alive.",
                 remaining_processes, num_processes)
    for i in xrange(remaining_processes):
        alive_processes[i][2].put('STOP')
        alive_processes[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")
    logging.info("All information sent to the processors.")
    # Back to normal
    if performance_tuning:
        gc.set_threshold(original_threshold[0],
                         original_threshold[1],
                         original_threshold[2])
        sys.setcheckinterval(original_check_interval)
    # Note end of output
    while len(processes) > 0:
        a_process = processes.pop()
        # We join the process to wait for the end of the reading
        a_process[0].join()
        # logging.log(ULTRADEBUG, "Fetching results from finished process.")
        # all_results.update(a_process[1].get()) # Add results to result pool
        # logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multiprocessing.")
    return
def multi_processor(reader, workflow_class, graph_builder_constructor,
                    graph_builder_params, ranker_constructor,
                    ranker_params, eval_parameters, ranking_cutoff,
                    mesh_tree_filename, distance_matrix_filename,
                    distance_function, umls_converter_data_filename,
                    umls_concept_data_filename, extra_data_name,
                    extra_data_contents, output_file, num_threads=None,
                    queue_size=None, output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation.
    Multithreading notes: It's the responsibility of the caller to
    make sure that extra_data_contents, if any, are thread-safe.

    Articles from reader are dispatched round-robin to num_threads
    worker threads running processor(); one output thread drains their
    shared output queue. performance_tuning is accepted for interface
    compatibility but not used in this variant.
    """
    if num_threads is None:
        num_threads = 1
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    # Since there's no direct way of setting the concept cache's title,
    # we set it here, wait for it to be inherited, and then get the
    # 'real' process title for this one.
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()
    threads = []
    logging.info("Creating %d worker threads.", num_threads)
    # task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    task_queues = [Queue(queue_size) for x in xrange(num_threads)]
    # NOTE(review): queue_size defaults to None and 2*None raises
    # TypeError - confirm callers always pass an integer queue_size.
    this_output_queue = Queue(2 * queue_size)
    # Create an output processor
    output_processor = Thread(target=output_callback,
                              args=(output_file,
                                    this_output_queue,
                                    output_headers_callback,
                                    output_item_callback))
    output_processor.start()
    for i in xrange(num_threads):
        this_thread = Thread(target=processor,
                             args=(workflow_class,
                                   graph_builder_constructor,
                                   graph_builder_params,
                                   ranker_constructor,
                                   ranker_params,
                                   eval_parameters,
                                   ranking_cutoff,
                                   mesh_tree_filename,
                                   distance_matrix_filename,
                                   distance_function,
                                   umls_converter_data_filename,
                                   extra_data_name,
                                   extra_data_contents,
                                   task_queues[i],
                                   this_output_queue),
                             name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created thread: %r", this_thread)
        this_thread.start()
        threads.append((this_thread, this_output_queue, task_queues[i]))
    all_results = {}
    count = 0
    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to
    # waiting on semlocks
    for each_article in reader:
        count += 1
        # logging.info("Dispatching article %d: %r", count, each_article)
        # Round-robin dispatch across the per-worker queues.
        target_thread = (count - 1) % num_threads
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id, threads[target_thread][0].name)
        task_queues[target_thread].put(each_article)
        # task_queue[target_process].put(each_article)
        # task_queue.put(each_article)
        # logging.info("The task queue is approximately %d items long.",
        #              task_queue.qsize())
    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results = {}
    alive_threads = [x for x in threads if x[0].is_alive()]
    remaining_threads = len(alive_threads)
    logging.info("There are %d threads (out of %d) still alive.",
                 remaining_threads, num_threads)
    for i in xrange(remaining_threads):
        alive_threads[i][2].put("STOP")
        # alive_threads[i][2].close()  (presumably a Queue.Queue, which
        # has no close() - hence commented out)
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")
    logging.info("All information sent to the threads.")
    # Note end of output
    while len(threads) > 0:
        a_thread = threads.pop()
        # We join the process to wait for the end of the reading
        a_thread[0].join()
        # logging.log(ULTRADEBUG, "Fetching results from finished process.")
        # all_results.update(a_process[1].get()) # Add results to result pool
        # logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multithreading.")
    Pmid.close_storage()
    return
def multi_processor(reader, workflow_class, graph_builder_constructor,
                    graph_builder_params, ranker_constructor,
                    ranker_params, eval_parameters, ranking_cutoff,
                    mesh_tree_filename, distance_matrix_filename,
                    distance_function, umls_converter_data_filename,
                    umls_concept_data_filename, extra_data_name,
                    extra_data_contents, output_file, num_processes=None,
                    queue_size=None, output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation.
    Multiprocessing notes: It's the responsibility of the caller to make
    sure that extra_data_contents, if any, are multiprocessing-safe. For
    example, by using a SyncManager and Namespace and passing the proxy.
    See umls/concept for an example.

    Spawns num_processes workers running processor(), dispatches the
    reader's articles to them round-robin, then shuts everything down.
    """
    if num_processes is None:
        num_processes = cpu_count()
    # NOTE(review): queue_size also defaults to None, but 2*queue_size
    # below would raise TypeError for None - confirm callers always
    # pass an integer queue_size.
    if performance_tuning:
        # Since reading the file involves an awful lot of object creation
        # and destruction we'll tweak the gc adjustments to sweep less
        # frequently. IOW - we have a LOT of short-lived objects. No sense
        # garbage-collecting the latter generations very often.
        # (this is about 10x, 5x, and 5x the usual)
        original_threshold = gc.get_threshold()
        gc.set_threshold(10 * original_threshold[0],
                         5 * original_threshold[1],
                         5 * original_threshold[1])
        original_check_interval = sys.getcheckinterval()
        # Similarly, we'll try to minimize overhead from thread switches
        # 5x usual value
        sys.setcheckinterval(5 * original_check_interval)
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()
    proctitle.setproctitle("MEDRank-main")
    processes = []
    logging.info("Creating %d worker processes.", num_processes)
    #task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    task_queues = [Queue(queue_size) for x in xrange(num_processes)]
    this_output_queue = Queue(2 * queue_size)
    # Create an output processor
    output_processor = Process(target=output_callback,
                               args=(output_file,
                                     this_output_queue,
                                     output_headers_callback,
                                     output_item_callback))
    output_processor.start()
    for i in xrange(num_processes):
        this_process = Process(
            target=processor,
            args=(workflow_class,
                  graph_builder_constructor, graph_builder_params,
                  ranker_constructor, ranker_params,
                  eval_parameters, ranking_cutoff, mesh_tree_filename,
                  distance_matrix_filename, distance_function,
                  umls_converter_data_filename,
                  extra_data_name, extra_data_contents,
                  task_queues[i], this_output_queue,
                  "MEDRank-Worker-%d" % i),
            name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created process: %r", this_process)
        this_process.start()
        processes.append((this_process, this_output_queue, task_queues[i]))
    all_results = {}
    count = 0
    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to
    # waiting on semlocks
    for each_article in reader:
        count += 1
        #queues_and_sizes=[(task_queues[x].qsize(), x)
        #                  for x in xrange(num_processes)]
        #queues_and_sizes.sort()
        #target_process=queues_and_sizes[0][1]
        # logging.info("Dispatching article %d: %r", count, each_article)
        target_process = (count - 1) % num_processes #Lowest-loaded process first.
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id,
                     processes[target_process][0].name)
        task_queues[target_process].put(each_article)
        #task_queue[target_process].put(each_article)
        #task_queue.put(each_article)
        #logging.info("The task queue is approximately %d items long.",
        #             task_queue.qsize())
    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results = {}
    alive_processes = [x for x in processes if x[0].is_alive()]
    remaining_processes = len(alive_processes)
    logging.info("There are %d processes (out of %d) still alive.",
                 remaining_processes, num_processes)
    for i in xrange(remaining_processes):
        alive_processes[i][2].put('STOP')
        alive_processes[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")
    logging.info("All information sent to the processors.")
    # Back to normal
    if performance_tuning:
        gc.set_threshold(original_threshold[0],
                         original_threshold[1],
                         original_threshold[2])
        sys.setcheckinterval(original_check_interval)
    # Note end of output
    while len(processes) > 0:
        a_process = processes.pop()
        # We join the process to wait for the end of the reading
        a_process[0].join()
        # logging.log(ULTRADEBUG, "Fetching results from finished process.")
        # all_results.update(a_process[1].get()) # Add results to result pool
        # logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multiprocessing.")
    return
def multi_processor(reader, workflow_class, graph_builder_constructor,
                    graph_builder_params, ranker_constructor,
                    ranker_params, eval_parameters, ranking_cutoff,
                    mesh_tree_filename, distance_matrix_filename,
                    distance_function, umls_converter_data_filename,
                    umls_concept_data_filename, extra_data_name,
                    extra_data_contents, output_file, num_threads=None,
                    queue_size=None, output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation.
    Multithreading notes: It's the responsibility of the caller to
    make sure that extra_data_contents, if any, are thread-safe.

    Spawns num_threads worker threads running processor(), dispatches
    the reader's articles round-robin, then shuts everything down.
    performance_tuning is accepted for interface compatibility with the
    multiprocessing variant but is not used here.
    """
    if num_threads is None:
        num_threads = 1
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    # Since there's no direct way of setting the concept cache's title,
    # we set it here, wait for it to be inherited, and then get the
    # 'real' process title for this one.
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()
    threads = []
    logging.info("Creating %d worker threads.", num_threads)
    #task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    task_queues = [Queue(queue_size) for x in xrange(num_threads)]
    # NOTE(review): queue_size defaults to None and 2*None raises
    # TypeError - confirm callers always pass an integer queue_size.
    this_output_queue = Queue(2 * queue_size)
    # Create an output processor
    output_processor = Thread(target=output_callback,
                              args=(output_file,
                                    this_output_queue,
                                    output_headers_callback,
                                    output_item_callback))
    output_processor.start()
    for i in xrange(num_threads):
        this_thread = Thread(target=processor,
                             args=(workflow_class,
                                   graph_builder_constructor,
                                   graph_builder_params,
                                   ranker_constructor,
                                   ranker_params,
                                   eval_parameters,
                                   ranking_cutoff,
                                   mesh_tree_filename,
                                   distance_matrix_filename,
                                   distance_function,
                                   umls_converter_data_filename,
                                   extra_data_name,
                                   extra_data_contents,
                                   task_queues[i],
                                   this_output_queue),
                             name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created thread: %r", this_thread)
        this_thread.start()
        threads.append((this_thread, this_output_queue, task_queues[i]))
    all_results = {}
    count = 0
    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to
    # waiting on semlocks
    for each_article in reader:
        count += 1
        # logging.info("Dispatching article %d: %r", count, each_article)
        # Round-robin dispatch across the per-worker queues.
        target_thread = (count-1) % num_threads
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id, threads[target_thread][0].name)
        task_queues[target_thread].put(each_article)
        #task_queue[target_process].put(each_article)
        #task_queue.put(each_article)
        #logging.info("The task queue is approximately %d items long.",
        #             task_queue.qsize())
    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results = {}
    alive_threads = [x for x in threads if x[0].is_alive()]
    remaining_threads = len(alive_threads)
    logging.info("There are %d threads (out of %d) still alive.",
                 remaining_threads, num_threads)
    for i in xrange(remaining_threads):
        alive_threads[i][2].put('STOP')
        #alive_threads[i][2].close()  (presumably a Queue.Queue, which
        # has no close() - hence commented out)
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")
    logging.info("All information sent to the threads.")
    # Note end of output
    while len(threads) > 0:
        a_thread = threads.pop()
        # We join the process to wait for the end of the reading
        a_thread[0].join()
        # logging.log(ULTRADEBUG, "Fetching results from finished process.")
        # all_results.update(a_process[1].get()) # Add results to result pool
        # logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multithreading.")
    Pmid.close_storage()
    return