def buildConcepts(self):
    """Install two fixture concepts in Concept storage and return handles.

    Returns a tuple of Concept objects for the single-term concept
    ('c12345') and the multi-term concept ('c98765')."""
    single_term = ConcreteConcept('hareitosis', 'i', ['hareitosis'],
                                  ['B02.34'], [''])
    multi_term = ConcreteConcept('inflammed hair cancer', 'a',
                                 ['hairitis', 'hairiatic cancer'],
                                 ['A12.23.45', 'B01.23'], [''])
    # Keep a reference so tests can add more concepts to the same storage.
    self.storage = {'c12345': single_term, 'c98765': multi_term}
    Concept.init_storage(self.storage)
    return (Concept('c12345'), Concept('c98765'))
def buildConcepts(self):
    """Register two known test concepts and hand back a Concept for each."""
    # Build the backing store inline; tests mutate self.storage later on.
    self.storage = {
        'c12345': ConcreteConcept('hareitosis', 'i', ['hareitosis'],
                                  ['B02.34'], ['']),
        'c98765': ConcreteConcept('inflammed hair cancer', 'a',
                                  ['hairitis', 'hairiatic cancer'],
                                  ['A12.23.45', 'B01.23'], ['']),
    }
    Concept.init_storage(self.storage)
    return Concept('c12345'), Concept('c98765')
def testWrongMappingType(self):
    """A concept with an unconvertible mapping type ('g/p') must yield an
    empty utterance."""
    self.buildConcepts()
    # Add it in a roundabout way to the original storage dict.
    self.storage['c2468'] = ConcreteConcept('hairosis', 'g/p',
                                            ['hairotic hairisis'],
                                            ['C1.2.3'], [''])
    unconvertible = Concept('c2468')
    self.assertEqual([],
                     self.ruleless_converter.convert(unconvertible).utterance)
def __init__(self, reader, graph_builder, ranker, eval_parameters,
             ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
             distance_function, umls_converter_data_filename,
             umls_concept_data_filename, output_file):
    """Set up a Workflow instance.

    reader: iterable source of articles to evaluate.
    graph_builder: builds graphs from the reader's output.
    ranker: ranking strategy; wrapped in a MappedRanker here.
    eval_parameters: evaluation-parameter bundle; its mesh_tree and
        savcc_matrix fields are filled in by this constructor.
    ranking_cutoff: cutoff applied to ranked results.
    mesh_tree_filename: file used to build the MeSH Tree.
    distance_matrix_filename: binary SAVCC distance-matrix file.
    distance_function: distance function for the SAVCC matrix.
    umls_converter_data_filename: optional pickle with converter data
        (None means no extra data).
    umls_concept_data_filename: optional concept DB filename (None means
        default Concept storage).
    output_file: destination for results.
    """
    logging.debug("Setting up a Workflow instance.")
    logging.debug("My reader is: %r", reader)
    self._reader = reader
    logging.debug("My graph builder is: %r", graph_builder)
    self._graph_builder = graph_builder
    self._ranker = MappedRanker(ranker)
    logging.debug("My ranker is: %r", self._ranker)
    self._ranking_cutoff = ranking_cutoff
    logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)
    logging.debug("Creating a Tree instance from %s", mesh_tree_filename)
    self._mesh_tree = Tree(mesh_tree_filename)
    logging.debug("Creating SAVCC distance matrix with %r and distance "
                  "function %r", distance_matrix_filename, distance_function)
    # NOTE(review): the matrix object appears to take ownership of this
    # handle, so it is deliberately left open -- confirm.
    self._matrix = SavccNormalizedMatrix(
        open(distance_matrix_filename, "rb"), distance_function)
    logging.debug("Filling in the rest of the evaluation parameters.")
    self._eval_parameters = eval_parameters
    self._eval_parameters.mesh_tree = self._mesh_tree
    self._eval_parameters.savcc_matrix = self._matrix
    logging.debug("My evaluation parameters are: %r",
                  self._eval_parameters)
    if umls_converter_data_filename is None:
        converter_data = None
    else:
        # Fix: close the pickle file after loading (the original leaked
        # the handle returned by open()).
        with open(umls_converter_data_filename, "rb") as converter_file:
            converter_data = pickle.load(converter_file)
    self._umls_converter = RankedConverter(Converter(self._mesh_tree,
                                                     converter_data))
    logging.debug("My converter is: %r", self._umls_converter)
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    self._output_file = output_file
    logging.debug("My output file is: %r", self._output_file)
def __init__(self, reader, graph_builder, ranker, eval_parameters,
             ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
             distance_function, umls_converter_data_filename,
             umls_concept_data_filename, output_file):
    """Set up a Workflow instance.

    Builds the MeSH tree, the SAVCC distance matrix, the (ranked) UMLS
    converter, and initializes Concept storage. The eval_parameters
    object is mutated: its mesh_tree and savcc_matrix fields are filled
    in here. Passing None for umls_converter_data_filename skips the
    converter-data pickle; None for umls_concept_data_filename selects
    the default Concept storage.
    """
    logging.debug("Setting up a Workflow instance.")
    logging.debug("My reader is: %r", reader)
    self._reader = reader
    logging.debug("My graph builder is: %r", graph_builder)
    self._graph_builder = graph_builder
    self._ranker = MappedRanker(ranker)
    logging.debug("My ranker is: %r", self._ranker)
    self._ranking_cutoff = ranking_cutoff
    logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)
    logging.debug("Creating a Tree instance from %s", mesh_tree_filename)
    self._mesh_tree = Tree(mesh_tree_filename)
    logging.debug(
        "Creating SAVCC distance matrix with %r and distance "
        "function %r", distance_matrix_filename, distance_function)
    # NOTE(review): handle presumably owned by the matrix object, so it
    # stays open on purpose -- confirm.
    self._matrix = SavccNormalizedMatrix(
        open(distance_matrix_filename, "rb"), distance_function)
    logging.debug("Filling in the rest of the evaluation parameters.")
    self._eval_parameters = eval_parameters
    self._eval_parameters.mesh_tree = self._mesh_tree
    self._eval_parameters.savcc_matrix = self._matrix
    logging.debug("My evaluation parameters are: %r",
                  self._eval_parameters)
    if umls_converter_data_filename is None:
        converter_data = None
    else:
        # Fix: the original never closed the file opened for pickle.load;
        # the context manager guarantees the handle is released.
        with open(umls_converter_data_filename, "rb") as converter_file:
            converter_data = pickle.load(converter_file)
    self._umls_converter = RankedConverter(
        Converter(self._mesh_tree, converter_data))
    logging.debug("My converter is: %r", self._umls_converter)
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    self._output_file = output_file
    logging.debug("My output file is: %r", self._output_file)
def convert(self, a_ranked_result_set):
    """Convert a ranked result set into a RankedConversionResult.

    In other words, convert a ranked term list to its MeSH equivalents.
    Terms whose conversion produces an empty utterance are dropped; any
    extra terms emitted by end_conversion() (e.g. checktags) are scored
    at the last seen score plus self._checktag_boost."""
    result = RankedConversionResult()
    self._my_converter.start_conversion()
    # Fix: bind incoming_score before the loop. The original raised
    # NameError on an empty result set when end_conversion() still
    # produced terms, because incoming_score was only set inside the loop.
    incoming_score = 0
    for incoming_term, incoming_score in a_ranked_result_set:
        converted = self._my_converter.convert(
            Concept(incoming_term.node_id))
        if converted.utterance != []:
            result.add_term_score(converted, incoming_score)
    # Flush the converter; it may return terms accumulated during the run.
    converted = self._my_converter.end_conversion()
    if converted.utterance != []:
        result.add_term_score(converted,
                              incoming_score + self._checktag_boost)
    logging.log(ULTRADEBUG, "RankedConverter results: %r", result)
    return result
def testTermNotInTreeRaisesException(self):
    """With _skip_unknown disabled, converting a concept whose chosen
    descriptor is missing from the tree must raise TermNotInTree."""
    self.buildConcepts()
    orphan = ConcreteConcept(
        'hairosis', 'i',
        ['hairotic hairisis',
         'hair-raising hair',  # <--This'll be a descriptor
         'hurry'],
        ['Q12345678', 'D123456', 'Q987564'], [''])
    # Add it in a roundabout way to the original storage.
    self.storage['c2468'] = orphan
    target = Concept('c2468')
    self.ruleless_converter._skip_unknown = False
    self.assertRaises(TermNotInTree, self.ruleless_converter.convert,
                      target)
def testDeepestOnePreferred(self):
    """When every synonym is a descriptor, the converter must pick the
    one that sits deepest in the tree."""
    self.buildConcepts()
    # All three synonyms carry descriptor (D...) identifiers.
    self.storage['c2468'] = ConcreteConcept(
        'hair-raising hair', 'i',
        ['hairitis', 'hairiatic cancer', 'hareitosis'],
        ['D12345678', 'D123456', 'D987564'], [''])
    candidate = Concept('c2468')
    # Per self.my_tree, 'hairiatic cancer' is the deepest of the three.
    self.assertEqual([Term('hairiatic cancer')],
                     self.ruleless_converter.convert(candidate).utterance)
def testDescriptorNamedLikeConceptPreferred(self):
    """A descriptor whose name matches the concept's own name should win
    over the other candidate descriptors."""
    self.buildConcepts()
    self.storage['c2468'] = ConcreteConcept(
        'hair-raising hair', 'i',
        ['hairotic hairisis',   # <-- a descriptor
         'hair-raising hair',   # <-- a descriptor, named like the concept
         'hurry'],              # <-- not a descriptor
        ['D12345678', 'D123456', 'Q987564'], [''])
    candidate = Concept('c2468')
    # Make sure the tree knows about this concept.
    self.my_tree._tree['hair-raising hair'] = TreeNode(
        'hair-raising hair', 'MH', 'Q', set(['B12.34']))
    self.assertEqual([Term('hair-raising hair')],
                     self.ruleless_converter.convert(candidate).utterance)
def multi_processor(reader, workflow_class, graph_builder_constructor,
                    graph_builder_params, ranker_constructor, ranker_params,
                    eval_parameters, ranking_cutoff, mesh_tree_filename,
                    distance_matrix_filename, distance_function,
                    umls_converter_data_filename, umls_concept_data_filename,
                    extra_data_name, extra_data_contents, output_file,
                    num_processes=None, queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation.

    Fans the articles produced by `reader` out to `num_processes` worker
    processes (default: one per CPU core) in round-robin order, each with
    its own task queue; a separate output process drains a shared output
    queue via the output_* callbacks.

    Multiprocessing notes: It's the responsibility of the caller to make
    sure that extra_data_contents, if any, are multiprocessing-safe. For
    example, by using a SyncManager and Namespace and passing the proxy.
    See umls/concept for an example.
    """
    if num_processes is None:
        num_processes=cpu_count()
    if performance_tuning:
        # Since reading the file involves an awful lot of object creation
        # and destruction we'll tweak the gc adjustments to sweep less
        # frequently. IOW - we have a LOT of short-lived objects. No sense
        # garbage-collecting the latter generations very often.
        # (this is about 10x, 5x, and 5x the usual)
        original_threshold=gc.get_threshold()
        gc.set_threshold(10 * original_threshold[0],
                         5 * original_threshold[1],
                         5 * original_threshold[1])
        original_check_interval=sys.getcheckinterval()
        # Similarly, we'll try to minimize overhead from thread switches
        # 5x usual value
        sys.setcheckinterval(5*original_check_interval)
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    # Concept cache: on-disk DB when a filename is given, default otherwise.
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()
    proctitle.setproctitle("MEDRank-main")
    processes=[]
    logging.info("Creating %d worker processes.", num_processes)
    #task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    # One bounded task queue per worker; articles are dealt out below.
    # NOTE(review): Queue(2*queue_size) will fail if queue_size is None --
    # callers appear to always pass a value; confirm.
    task_queues=[Queue(queue_size) for x in xrange(num_processes)]
    this_output_queue=Queue(2*queue_size)
    # Create an output processor
    output_processor=Process(target=output_callback,
                             args=(output_file,
                                   this_output_queue,
                                   output_headers_callback,
                                   output_item_callback))
    output_processor.start()
    for i in xrange(num_processes):
        this_process=Process(target=processor,
                             args=(workflow_class,
                                   graph_builder_constructor,
                                   graph_builder_params,
                                   ranker_constructor,
                                   ranker_params,
                                   eval_parameters,
                                   ranking_cutoff,
                                   mesh_tree_filename,
                                   distance_matrix_filename,
                                   distance_function,
                                   umls_converter_data_filename,
                                   extra_data_name,
                                   extra_data_contents,
                                   task_queues[i],
                                   this_output_queue,
                                   "MEDRank-Worker-%d" % i),
                             name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created process: %r", this_process)
        this_process.start()
        # Each entry: (process, shared output queue, its private task queue)
        processes.append((this_process, this_output_queue, task_queues[i]))
    all_results={}
    count=0
    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to waiting
    # on semlocks
    for each_article in reader:
        count+=1
        #queues_and_sizes=[(task_queues[x].qsize(), x)
        #                  for x in xrange(num_processes)]
        #queues_and_sizes.sort()
        #target_process=queues_and_sizes[0][1]
        # logging.info("Dispatching article %d: %r", count, each_article)
        # Round-robin dispatch across the worker queues.
        target_process=(count-1) % num_processes
        #Lowest-loaded process first.
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id,
                     processes[target_process][0].name)
        task_queues[target_process].put(each_article)
        #task_queue[target_process].put(each_article)
        #task_queue.put(each_article)
        #logging.info("The task queue is approximately %d items long.",
        #             task_queue.qsize())
    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results={}
    alive_processes=[x for x in processes if x[0].is_alive()]
    remaining_processes=len(alive_processes)
    logging.info("There are %d processes (out of %d) still alive.",
                 remaining_processes, num_processes)
    # Ask each surviving worker to shut down with a 'STOP' sentinel, then
    # close its queue (no further puts from this side).
    for i in xrange(remaining_processes):
        alive_processes[i][2].put('STOP')
        alive_processes[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")
    logging.info("All information sent to the processors.")
    # Back to normal: restore the gc thresholds and check interval that
    # were saved before tuning.
    if performance_tuning:
        gc.set_threshold(original_threshold[0],
                         original_threshold[1],
                         original_threshold[2])
        sys.setcheckinterval(original_check_interval)
    # Note end of output
    while len(processes)>0:
        a_process=processes.pop()
        # We join the process to wait for the end of the reading
        a_process[0].join()
        # logging.log(ULTRADEBUG, "Fetching results from finished process.")
        # all_results.update(a_process[1].get()) # Add results to result pool
        # logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    # 'STOP' on the output queue ends the output process's loop.
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multiprocessing.")
    return
def multi_processor(reader, workflow_class, graph_builder_constructor,
                    graph_builder_params, ranker_constructor, ranker_params,
                    eval_parameters, ranking_cutoff, mesh_tree_filename,
                    distance_matrix_filename, distance_function,
                    umls_converter_data_filename, umls_concept_data_filename,
                    extra_data_name, extra_data_contents, output_file,
                    num_threads=None, queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation.

    Thread-based variant: articles from `reader` are dispatched round-robin
    to `num_threads` worker threads (default 1), each with a private task
    queue; one output thread drains the shared output queue via the
    output_* callbacks.

    Multithreading notes: It's the responsibility of the caller to make
    sure that extra_data_contents, if any, are thread-safe.
    """
    if num_threads is None:
        num_threads=1
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    # Since there's no direct way of setting the concept cache's title,
    # we set it here, wait for it to be inherited, and then get the 'real'
    # process title for this one.
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()
    threads=[]
    logging.info("Creating %d worker threads.", num_threads)
    #task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    # One bounded task queue per worker thread.
    # NOTE(review): Queue(2*queue_size) will fail if queue_size is None --
    # callers appear to always pass a value; confirm.
    task_queues=[Queue(queue_size) for x in xrange(num_threads)]
    this_output_queue=Queue(2*queue_size)
    # Create an output processor
    output_processor=Thread(target=output_callback,
                            args=(output_file,
                                  this_output_queue,
                                  output_headers_callback,
                                  output_item_callback))
    output_processor.start()
    for i in xrange(num_threads):
        this_thread=Thread(target=processor,
                           args=(workflow_class,
                                 graph_builder_constructor,
                                 graph_builder_params,
                                 ranker_constructor,
                                 ranker_params,
                                 eval_parameters,
                                 ranking_cutoff,
                                 mesh_tree_filename,
                                 distance_matrix_filename,
                                 distance_function,
                                 umls_converter_data_filename,
                                 extra_data_name,
                                 extra_data_contents,
                                 task_queues[i],
                                 this_output_queue),
                           name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created thread: %r",
                    this_thread)
        this_thread.start()
        # Each entry: (thread, shared output queue, its private task queue)
        threads.append((this_thread, this_output_queue, task_queues[i]))
    all_results={}
    count=0
    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to waiting
    # on semlocks
    for each_article in reader:
        count+=1
        # logging.info("Dispatching article %d: %r", count, each_article)
        # Round-robin dispatch across the worker threads.
        target_thread=(count-1) % num_threads
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id,
                     threads[target_thread][0].name)
        task_queues[target_thread].put(each_article)
        #task_queue[target_process].put(each_article)
        #task_queue.put(each_article)
        #logging.info("The task queue is approximately %d items long.",
        #             task_queue.qsize())
    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results={}
    alive_threads=[x for x in threads if x[0].is_alive()]
    remaining_threads=len(alive_threads)
    logging.info("There are %d threads (out of %d) still alive.",
                 remaining_threads, num_threads)
    # Ask each surviving worker to shut down with a 'STOP' sentinel.
    # (Queue.close() does not exist for threading queues, hence commented.)
    for i in xrange(remaining_threads):
        alive_threads[i][2].put('STOP')
        #alive_threads[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")
    logging.info("All information sent to the threads.")
    # Note end of output
    while len(threads)>0:
        a_thread=threads.pop()
        # We join the process to wait for the end of the reading
        a_thread[0].join()
        # logging.log(ULTRADEBUG, "Fetching results from finished process.")
        # all_results.update(a_process[1].get()) # Add results to result pool
        # logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    # "STOP" on the output queue ends the output thread's loop.
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multithreading.")
    Pmid.close_storage()
    return
def __init__(self, original_line, converter):
    """Parse the line via RelationLine, then convert both CUIs to MeSH."""
    RelationLine.__init__(self, original_line)
    self._mesh1 = converter.convert(Concept(self.CUI1))
    self._mesh2 = converter.convert(Concept(self.CUI2))
    # Flush the converter's state; any extra terms it emits are discarded.
    converter.end_conversion()
def __init__(self, original_line, converter):
    """Parse the line via EntityLine, then convert its CUI to MeSH."""
    EntityLine.__init__(self, original_line)
    self._mesh = converter.convert(Concept(self.CUI))
    # Flush the converter's state; any extra terms it emits are discarded.
    converter.end_conversion()
def multi_processor(
    reader,
    workflow_class,
    graph_builder_constructor, graph_builder_params,
    ranker_constructor, ranker_params,
    eval_parameters,
    ranking_cutoff,
    mesh_tree_filename,
    distance_matrix_filename,
    distance_function,
    umls_converter_data_filename,
    umls_concept_data_filename,
    extra_data_name,
    extra_data_contents,
    output_file,
    num_threads=None,
    queue_size=None,
    output_callback=output,
    output_headers_callback=output_headers,
    output_item_callback=output_one_item,
    performance_tuning=True,
):
    """
    Perform the evaluation.

    Thread-based variant: articles from `reader` are dispatched round-robin
    to `num_threads` worker threads (default 1), each with a private task
    queue; one output thread drains the shared output queue via the
    output_* callbacks.

    Multithreading notes: It's the responsibility of the caller to make
    sure that extra_data_contents, if any, are thread-safe.
    """
    if num_threads is None:
        num_threads = 1
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    # Since there's no direct way of setting the concept cache's title,
    # we set it here, wait for it to be inherited, and then get the 'real'
    # process title for this one.
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()
    threads = []
    logging.info("Creating %d worker threads.", num_threads)
    # task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    # One bounded task queue per worker thread.
    # NOTE(review): Queue(2 * queue_size) will fail if queue_size is None --
    # callers appear to always pass a value; confirm.
    task_queues = [Queue(queue_size) for x in xrange(num_threads)]
    this_output_queue = Queue(2 * queue_size)
    # Create an output processor
    output_processor = Thread(
        target=output_callback,
        args=(output_file,
              this_output_queue,
              output_headers_callback,
              output_item_callback)
    )
    output_processor.start()
    for i in xrange(num_threads):
        this_thread = Thread(
            target=processor,
            args=(
                workflow_class,
                graph_builder_constructor,
                graph_builder_params,
                ranker_constructor,
                ranker_params,
                eval_parameters,
                ranking_cutoff,
                mesh_tree_filename,
                distance_matrix_filename,
                distance_function,
                umls_converter_data_filename,
                extra_data_name,
                extra_data_contents,
                task_queues[i],
                this_output_queue,
            ),
            name="MEDRank-Worker-%d" % i,
        )
        logging.log(ULTRADEBUG, "Created thread: %r",
                    this_thread)
        this_thread.start()
        # Each entry: (thread, shared output queue, its private task queue)
        threads.append((this_thread, this_output_queue, task_queues[i]))
    all_results = {}
    count = 0
    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to waiting
    # on semlocks
    for each_article in reader:
        count += 1
        # logging.info("Dispatching article %d: %r", count, each_article)
        # Round-robin dispatch across the worker threads.
        target_thread = (count - 1) % num_threads
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id,
                     threads[target_thread][0].name)
        task_queues[target_thread].put(each_article)
        # task_queue[target_process].put(each_article)
        # task_queue.put(each_article)
        # logging.info("The task queue is approximately %d items long.",
        #              task_queue.qsize())
    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results = {}
    alive_threads = [x for x in threads if x[0].is_alive()]
    remaining_threads = len(alive_threads)
    logging.info("There are %d threads (out of %d) still alive.",
                 remaining_threads, num_threads)
    # Ask each surviving worker to shut down with a "STOP" sentinel.
    # (close() does not exist on threading queues, hence commented out.)
    for i in xrange(remaining_threads):
        alive_threads[i][2].put("STOP")
        # alive_threads[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")
    logging.info("All information sent to the threads.")
    # Note end of output
    while len(threads) > 0:
        a_thread = threads.pop()
        # We join the process to wait for the end of the reading
        a_thread[0].join()
        # logging.log(ULTRADEBUG, "Fetching results from finished process.")
        # all_results.update(a_process[1].get()) # Add results to result pool
        # logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    # "STOP" on the output queue ends the output thread's loop.
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multithreading.")
    Pmid.close_storage()
    return
def multi_processor(reader, workflow_class, graph_builder_constructor,
                    graph_builder_params, ranker_constructor, ranker_params,
                    eval_parameters, ranking_cutoff, mesh_tree_filename,
                    distance_matrix_filename, distance_function,
                    umls_converter_data_filename, umls_concept_data_filename,
                    extra_data_name, extra_data_contents, output_file,
                    num_processes=None, queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation.

    Fans the articles produced by `reader` out to `num_processes` worker
    processes (default: one per CPU core) in round-robin order, each with
    its own task queue; a separate output process drains a shared output
    queue via the output_* callbacks.

    Multiprocessing notes: It's the responsibility of the caller to make
    sure that extra_data_contents, if any, are multiprocessing-safe. For
    example, by using a SyncManager and Namespace and passing the proxy.
    See umls/concept for an example.
    """
    if num_processes is None:
        num_processes = cpu_count()
    if performance_tuning:
        # Since reading the file involves an awful lot of object creation
        # and destruction we'll tweak the gc adjustments to sweep less
        # frequently. IOW - we have a LOT of short-lived objects. No sense
        # garbage-collecting the latter generations very often.
        # (this is about 10x, 5x, and 5x the usual)
        original_threshold = gc.get_threshold()
        gc.set_threshold(10 * original_threshold[0],
                         5 * original_threshold[1],
                         5 * original_threshold[1])
        original_check_interval = sys.getcheckinterval()
        # Similarly, we'll try to minimize overhead from thread switches
        # 5x usual value
        sys.setcheckinterval(5 * original_check_interval)
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    # Concept cache: on-disk DB when a filename is given, default otherwise.
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()
    proctitle.setproctitle("MEDRank-main")
    processes = []
    logging.info("Creating %d worker processes.", num_processes)
    #task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    # One bounded task queue per worker process.
    # NOTE(review): Queue(2 * queue_size) will fail if queue_size is None --
    # callers appear to always pass a value; confirm.
    task_queues = [Queue(queue_size) for x in xrange(num_processes)]
    this_output_queue = Queue(2 * queue_size)
    # Create an output processor
    output_processor = Process(target=output_callback,
                               args=(output_file,
                                     this_output_queue,
                                     output_headers_callback,
                                     output_item_callback))
    output_processor.start()
    for i in xrange(num_processes):
        this_process = Process(
            target=processor,
            args=(workflow_class,
                  graph_builder_constructor,
                  graph_builder_params,
                  ranker_constructor,
                  ranker_params,
                  eval_parameters,
                  ranking_cutoff,
                  mesh_tree_filename,
                  distance_matrix_filename,
                  distance_function,
                  umls_converter_data_filename,
                  extra_data_name,
                  extra_data_contents,
                  task_queues[i],
                  this_output_queue,
                  "MEDRank-Worker-%d" % i),
            name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created process: %r", this_process)
        this_process.start()
        # Each entry: (process, shared output queue, its private task queue)
        processes.append((this_process, this_output_queue, task_queues[i]))
    all_results = {}
    count = 0
    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to waiting
    # on semlocks
    for each_article in reader:
        count += 1
        #queues_and_sizes=[(task_queues[x].qsize(), x)
        #                  for x in xrange(num_processes)]
        #queues_and_sizes.sort()
        #target_process=queues_and_sizes[0][1]
        # logging.info("Dispatching article %d: %r", count, each_article)
        # Round-robin dispatch across the worker queues.
        target_process = (count - 1) % num_processes
        #Lowest-loaded process first.
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id,
                     processes[target_process][0].name)
        task_queues[target_process].put(each_article)
        #task_queue[target_process].put(each_article)
        #task_queue.put(each_article)
        #logging.info("The task queue is approximately %d items long.",
        #             task_queue.qsize())
    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results = {}
    alive_processes = [x for x in processes if x[0].is_alive()]
    remaining_processes = len(alive_processes)
    logging.info("There are %d processes (out of %d) still alive.",
                 remaining_processes, num_processes)
    # Ask each surviving worker to shut down with a 'STOP' sentinel, then
    # close its queue (no further puts from this side).
    for i in xrange(remaining_processes):
        alive_processes[i][2].put('STOP')
        alive_processes[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")
    logging.info("All information sent to the processors.")
    # Back to normal: restore the gc thresholds and check interval saved
    # before tuning.
    if performance_tuning:
        gc.set_threshold(original_threshold[0],
                         original_threshold[1],
                         original_threshold[2])
        sys.setcheckinterval(original_check_interval)
    # Note end of output
    while len(processes) > 0:
        a_process = processes.pop()
        # We join the process to wait for the end of the reading
        a_process[0].join()
        # logging.log(ULTRADEBUG, "Fetching results from finished process.")
        # all_results.update(a_process[1].get()) # Add results to result pool
        # logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    # "STOP" on the output queue ends the output process's loop.
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multiprocessing.")
    return