Exemplo n.º 1
0
 def _evaluate(self, gold_standard, seen_terms):
     """Compute SAVCC between two sets of terms.

     Both term sets become vocabulary vectors; the seen-terms vector is
     blended with its distance-matrix expansion (weighted by self._alpha)
     and compared against the gold-standard vector with a cosine-style
     ratio. Returns 0 when either vector has zero length.
     """
     logging.debug('Gold standard=%s Seen terms=%s alpha=%1.5f',
                   gold_standard, seen_terms, self._alpha)
     gold_vec = self._my_tree.term_vector(gold_standard)
     seen_vec = self._my_tree.term_vector(seen_terms)
     # Blend the raw seen vector with its matrix-propagated counterpart:
     # [(alpha*I2)+(1-alpha x M)I2]
     blended = seen_vec.scale(self._alpha) + \
         self._my_matrix.mult_by_vector(seen_vec).scale(1 - self._alpha)
     logging.log(ULTRADEBUG, "Modified term=%r", blended)
     # I1 * modified_term
     num = gold_vec.dot(blended)
     # Denominator of the whole thing
     den = gold_vec.length() * blended.length()
     try:
         savcc = num / den
     except ZeroDivisionError:
         logging.warn(
             "ZeroDivisionError when computing SAVCC for %r and %r:",
             gold_standard, seen_terms)
         savcc = 0
     logging.log(ULTRADEBUG,
                 "Numerator=%1.7f Denominator=%1.7f Result=%1.7f",
                 num, den, savcc)
     return savcc
Exemplo n.º 2
0
 def __iter__(self):
     """Iterate over the underlying NLMOutput lines, grouping them into
     line sets keyed by the PMID the chunkmap assigns to each line.

     Yields one self._lines_type(set_id, lines) object per contiguous
     run of lines sharing the same chunkmap PMID. Lines with no
     chunkmap entry are emitted under a negative, strictly decreasing
     synthetic id so successive orphan runs stay distinct.
     """
     current_set=[]
     current_id=None
     # Synthetic id for lines the chunkmap does not know about.
     bad_id=-1
     for line in NLMOutput.__iter__(self):
         try:
             this_lines_set_id=self._chunkmap.pmid_from_block(line.line_id)
         except KeyError:
             logging.warn("Line without chunkmap equivalent. Emitting"
                             " as id %d", bad_id)
             this_lines_set_id=Pmid(bad_id)
         if this_lines_set_id!=current_id:
             # Is this the first invocation? If not, we have to emit the
             # linelist that just ended, but if it is we'll just pretend
             # that we did.
             if current_id is not None:
                 # Emit the linelist that just ended
                 logging.log(ULTRADEBUG, "Completed set of lines %s "
                               "according to the chunkmap. Emitting them.",
                                current_id)
                 # NOTE(review): assumes a Pmid compares against int with
                 # '<' — confirm Pmid implements that comparison.
                 if current_id<0:
                     # Decrement bad line counter
                     bad_id-=1
                 yield self._lines_type(current_id, current_set)

             # Start a new, empty linelist
             current_id=this_lines_set_id
             current_set=[]
         current_set.append(line)
     # Is there something left to emit after the iteration's over?
     if len(current_set)>0:
         logging.log(ULTRADEBUG, "Completed iteration. Emitting the last "
                                 "lines left with set id %s", current_id)
         yield self._lines_type(current_id, current_set)
     return
Exemplo n.º 3
0
 def _evaluate(self, gold_standard, seen_terms):
     """Compute SAVCC between two sets of terms.

     The seen-terms vector is blended with its distance-matrix expansion
     (weighted by self._alpha) and compared to the gold-standard vector
     with a cosine-style ratio; returns 0 on a zero-length denominator.
     """
     logging.debug('Gold standard=%s Seen terms=%s alpha=%1.5f',
                   gold_standard,
                   seen_terms,
                   self._alpha)
     gold_standard_vector=self._my_tree.term_vector(gold_standard)
     seen_vector=self._my_tree.term_vector(seen_terms)
     # This computes [(alpha*I2)+(1-alpha x M)I2]
     modified_term=seen_vector.scale(self._alpha)+\
         self._my_matrix.mult_by_vector(seen_vector).scale(1-self._alpha)
     logging.log(ULTRADEBUG, "Modified term=%r", modified_term)
     # I1 * modified_term
     numerator=gold_standard_vector.dot(modified_term)
     # Denominator of the whole thing
     denominator=gold_standard_vector.length()*modified_term.length()
     try:
         result=numerator/denominator
     except ZeroDivisionError:
         # Either vector was empty; report a 0 similarity instead of dying.
         logging.warn("ZeroDivisionError when computing SAVCC for %r and %r:",
                  gold_standard, seen_terms)
         result=0
     logging.log(ULTRADEBUG, "Numerator=%1.7f Denominator=%1.7f Result=%1.7f",
                   numerator,
                   denominator,
                   result)
     return result
Exemplo n.º 4
0
 def __init__(self, fileobject, transform_function):
     """Initialize the matrix and attach per-row normalization factors.

     Tries to load precomputed normalization factors from a cache file
     (named by self._expected_norm_array_name()); on failure they are
     regenerated and — when the matrix came from a real file rather than
     a StringIO — saved back to disk. A module-level lock serializes the
     whole load/generate/save sequence across threads.

     Fix over the original: the cache files were opened and never
     closed; both opens now use `with` so the handles are released
     deterministically.
     """
     SavccMatrix.__init__(self, fileobject, transform_function)
     # Add normalization factors
     logging.log(ULTRADEBUG, "Initializing normalization array")
     # Default behavior: no normalization
     self.normfactors=[1.0]*self._height
     # Tentative normalization array name
     array_filename=self._expected_norm_array_name()
     logging.debug("Trying to load a normalization array from disk. The "
                   "file should be named %s.", array_filename)
     # Make sure that only one process or thread at a time can attempt to get
     # the normalization factors
     _normfactor_lock.acquire()
     try:
         try:
             with open(array_filename, 'rb') as norm_file:
                 self._load_normalization_factors(norm_file)
             logging.debug('Normalization factors loaded from disk.')
         except IOError:
             logging.debug("Unable to load normalization factors from disk.")
             self._generate_normalization_factors()
             # Only save normalization factors if they are not a StringIO
             # object
             if not isinstance(fileobject, StringIO.StringIO):
                 logging.debug("Saving normalization factors to %s",
                               array_filename)
                 try:
                     with open(array_filename, 'wb') as out_file:
                         self._save_normalization_factors(out_file)
                 except IOError:
                     logging.warn("Unable to save the normalization array. "
                                  "It will have to be regenerated each "
                                  "time.")
     finally:
         _normfactor_lock.release()
Exemplo n.º 5
0
 def start_conversion(self):
     """Reset the converter's internal state before a new conversion.

     Clears self._extra_checktags; warns when a previous run left
     content behind that was never retrieved.
     """
     if self._extra_checktags:
         logging.warn("Cleaning up _extra_checktags, but there was content"
                      " there. Someone didn't retrieve it.")
     self._extra_checktags = set()
Exemplo n.º 6
0
 def start_conversion(self):
     """start_conversion:
     Begin the conversion process by cleaning up the internal state of the
     converter."""
     # Warn when leftovers from a previous conversion were discarded
     # without ever being retrieved.
     if len(self._extra_checktags)>0:
         logging.warn("Cleaning up _extra_checktags, but there was content"
                      " there. Someone didn't retrieve it.")
     self._extra_checktags=set()
Exemplo n.º 7
0
 def run(self):
     """Perform the evaluation over every article in self._reader.

     For each article: graph-and-rank, convert the ranked terms, cut at
     self._ranking_cutoff, then evaluate against both the full MEDLINE
     MeSH headings and the major headings only. Per-article results are
     unified into all_results keyed by set_id and written out at the
     end, along with run metadata.
     """
     logging.info("Starting workflow %r run", self)
     all_results = {}
     evaluator = self.create_evaluator()
     count = 0
     for each_article in self._reader:
         count += 1
         logging.info("Working on article %d: %r", count, each_article)
         if not self.include_article(each_article):
             logging.log(
                 ULTRADEBUG, "Skipping article %r due to exclusion "
                 " criteria.", each_article)
             continue
         try:
             ranked_article = self.graph_and_rank(each_article)
         except CouldNotRank:
             # Unrankable articles are dropped from the run.
             continue
         converted_terms = self.convert(ranked_article)
         cut_terms = converted_terms.terms_higher_than_or_equal_to(
             self._ranking_cutoff)
         # NOTE(review): the [-1]/[0] indexing below raises IndexError
         # when cut_terms is empty — confirm the cutoff guarantees at
         # least one surviving term.
         logging.debug(
             "Lowest-ranking term is term #%d out of %d"
             " (score=%1.5f, highest score=%1.5f)", len(cut_terms),
             len(converted_terms), [x[1] for x in cut_terms][-1],
             [x[1] for x in cut_terms][0])
         medline_record_mesh_terms = ExpressionList().from_medline(
             each_article.set_id.article_record().mesh_headings)
         flat_medline = medline_record_mesh_terms.flatten()
         flattened_terms = self.flatten_generated_terms(
             flat_medline, cut_terms)
         flattened_terms = self.limit_length(flat_medline, flattened_terms)
         if len(flat_medline) == 0:
             logging.warn(
                 "No gold standard available for article %r. "
                 "Omitting it from the result set.", each_article)
             continue
         eval_result = self.perform_evaluation(each_article, evaluator,
                                               flat_medline,
                                               flattened_terms)
         flattened_major_headings=\
             medline_record_mesh_terms.major_headings()
         logging.debug("Original headings: %r Major headings: %r",
                       medline_record_mesh_terms, flattened_major_headings)
         # Second evaluation pass against major headings only.
         mh_result_temp = self.perform_evaluation(each_article, evaluator,
                                                  flattened_major_headings,
                                                  flattened_terms)
         mh_result = NamedResultSet("mh_", mh_result_temp)
         # Compute the total recall, too
         total_recall = self.compute_total_recall(flat_medline,
                                                  converted_terms)
         eval_result.add(total_recall)
         # Unify the result sets
         all_results[each_article.set_id] = eval_result | mh_result
     logging.info("Writing out results.")
     self.output(all_results)
     self.output_metadata()
     return
Exemplo n.º 8
0
 def run(self):
     """Perform the evaluation over every article in self._reader.

     For each article: graph-and-rank, convert the ranked terms, cut at
     self._ranking_cutoff, then evaluate against both the full MEDLINE
     MeSH headings and the major headings only. Per-article results are
     unified into all_results keyed by set_id and written out at the end.
     """
     logging.info("Starting workflow %r run", self)
     all_results={}
     evaluator=self.create_evaluator()
     count=0
     for each_article in self._reader:
         count+=1
         logging.info("Working on article %d: %r", count, each_article)
         if not self.include_article(each_article):
             logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                           " criteria.", each_article)
             continue
         try:
             ranked_article=self.graph_and_rank(each_article)
         except CouldNotRank:
             # Unrankable articles are dropped from the run.
             continue
         converted_terms=self.convert(ranked_article)
         cut_terms=converted_terms.terms_higher_than_or_equal_to(
                             self._ranking_cutoff)
         # NOTE(review): the [-1]/[0] indexing below raises IndexError
         # when cut_terms is empty — confirm the cutoff guarantees at
         # least one surviving term.
         logging.debug("Lowest-ranking term is term #%d out of %d"
                       " (score=%1.5f, highest score=%1.5f)",
                       len(cut_terms), len(converted_terms),
                       [x[1] for x in cut_terms][-1],
                       [x[1] for x in cut_terms][0])
         medline_record_mesh_terms=ExpressionList().from_medline(
                 each_article.set_id.article_record().mesh_headings)
         flat_medline=medline_record_mesh_terms.flatten()
         flattened_terms=self.flatten_generated_terms(flat_medline,
                         cut_terms)
         flattened_terms=self.limit_length(flat_medline, flattened_terms)
         if len(flat_medline)==0:
             logging.warn("No gold standard available for article %r. "
                          "Omitting it from the result set.", each_article)
             continue
         eval_result=self.perform_evaluation(each_article,
                                             evaluator,
                                             flat_medline,
                                             flattened_terms)
         flattened_major_headings=\
             medline_record_mesh_terms.major_headings()
         logging.debug("Original headings: %r Major headings: %r", 
                         medline_record_mesh_terms,
                         flattened_major_headings)
         # Second evaluation pass against major headings only.
         mh_result_temp=self.perform_evaluation(each_article, evaluator,
                                                flattened_major_headings,
                                                flattened_terms)
         mh_result=NamedResultSet("mh_", mh_result_temp)
         # Compute the total recall, too
         total_recall=self.compute_total_recall(flat_medline, 
                                                converted_terms)
         eval_result.add(total_recall)
         # Unify the result sets
         all_results[each_article.set_id]=eval_result | mh_result
     logging.info("Writing out results.")
     self.output(all_results)
     self.output_metadata()
     return
Exemplo n.º 9
0
def processor(workflow_class,
              graph_builder_constructor, graph_builder_params,
              ranker_constructor, ranker_params,
              eval_parameters, 
              ranking_cutoff,
              mesh_tree_filename, distance_matrix_filename,
              distance_function,
              umls_converter_data_filename,
              extra_data_name,
              extra_data_contents,
              my_input_queue, my_output_queue,
              my_own_name=None):
    """Worker-process loop: build one workflow and service a task queue.

    Instantiates workflow_class with the supplied construction
    parameters, optionally attaches extra_data_contents on the workflow
    under extra_data_name, then pulls requests from my_input_queue until
    the sentinel string 'STOP' arrives. Each processed article's results
    are pushed onto my_output_queue and the workflow's result store is
    cleared. Articles raising CouldNotRank are skipped; any other
    exception is logged with a traceback and re-raised, ending the
    worker.
    """
    logging.info("Setting up worker.")
    if my_own_name is not None:
        # Rename the OS process so it is identifiable in ps/top output.
        proctitle.setproctitle(my_own_name)

    my_workflow=workflow_class(graph_builder_constructor,
                               graph_builder_params,
                               ranker_constructor,
                               ranker_params,
                               eval_parameters,
                               ranking_cutoff,
                               mesh_tree_filename,
                               distance_matrix_filename,
                               distance_function,
                               umls_converter_data_filename
                               )
    if extra_data_name is not None:
        my_workflow.__setattr__(extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request=my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request=='STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                my_workflow.process_article(request)
                # Recover the article, push it on the output queue
                my_output_queue.put(my_workflow.all_results)
                # Clear the output queue
                my_workflow.all_results={}
            except CouldNotRank:
                #my_input_queue.put(request) # On error, push the task
                                            # back into the queue
                logging.info("Skipping unrankable article.")
            except:
                # Log with traceback, then re-raise: the bare except is
                # tolerable here only because it always re-raises.
                logging.warn("EXCEPTION RAISED: \n%s", 
                             traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
        logging.log(ULTRADEBUG, "Ending processor execution.")
    return
 def _create_graph(self, list_of_lines):
     """Build a graph by generating a relationship for every pair of
     co-occurring nodes backed by a semantic predication.

     We take advantage of the fact that (for our purposes) all lines
     with the same line_id in METAMAP output come from the same
     sentence. Returns the (possibly empty) graph; if predications
     cannot be retrieved, the empty graph is returned after logging.

     Fix over the original: the bare `except:` around the predication
     fetch has been narrowed to `except Exception:` so that
     KeyboardInterrupt/SystemExit propagate instead of being swallowed.
     """
     new_graph = self._type_of_graph_to_build()
     logging.debug("Retrieving semantic predications for %r",
                   self._line_set_id)
     try:
         predications = get_predications(self._line_set_id)
     except Exception:
         # Best-effort: log the failure and return the empty graph.
         logging.warn(
             "No predications for %r: an exception was raised.\n%s",
             self._line_set_id, traceback.format_exc())
         return new_graph
     logging.log(ULTRADEBUG,
                 "Building a METAMAP co-occurrence graph from %r",
                 list_of_lines)
     for sentence in self.sentence_iterator(list_of_lines):
         # Each "sentence" contains a set of potential nodes that need
         # screening.
         nodes = []
         for concept in sentence:
             new_node = self._node_factory(concept.CUI, concept.description,
                                           concept.confidence, concept.line)
             if self.include_node(new_node):
                 nodes.append(new_node)
             else:
                 logging.log(ULTRADEBUG, "%r excluded from the graph.",
                             new_node)
         # Once we have all the nodes in a sentence, we generate all
         # possible combinations (O(n^2)). Relationships are not
         # directional, so only the i<j half of the pair matrix is
         # needed (i<-->j implies j<-->i).
         for i in xrange(len(nodes)):
             for j in xrange(i + 1, len(nodes)):
                 node1, node2 = nodes[i], nodes[j]
                 # Only pairs backed by an actual predication get a link.
                 try:
                     this_link = predications[(node1, node2)]
                 except KeyError:
                     continue
                 new_link = self._adirectional_link_factory(
                     node1, node2, this_link.weight)
                 if self.include_link(new_link):
                     new_graph.add_relationship(new_link)
                 else:
                     logging.log(ULTRADEBUG,
                                 "Excluding link %r from the graph",
                                 new_link)
     return new_graph
 def _create_graph(self, list_of_lines):
     """Build a graph by generating a relationship for every pair of
     co-occurring nodes. We take advantage of the fact that (for our
     purposes) all lines with the same line_id in METAMAP output come from
     the same sentence."""
     new_graph=self._type_of_graph_to_build()
     logging.debug("Retrieving semantic predications for %r", 
                   self._line_set_id)
     try:
         predications=get_predications(self._line_set_id)
     except:
         # NOTE(review): bare except also swallows KeyboardInterrupt and
         # SystemExit — consider narrowing to `except Exception:`.
         logging.warn("No predications for %r: an exception was raised.\n%s",
                      self._line_set_id, traceback.format_exc())
         return new_graph
     logging.log(ULTRADEBUG, 
                 "Building a METAMAP co-occurrence graph from %r",
                 list_of_lines)
     for sentence in self.sentence_iterator(list_of_lines):
         # Each "sentence" contains a set of potential nodes that need
         # screening.
         nodes=[]
         for concept in sentence:
             new_node=self._node_factory(concept.CUI, 
                                         concept.description, 
                                         concept.confidence,
                                         concept.line)
             if self.include_node(new_node):
                 #nodes.append((concept.CUI, concept.confidence))
                 nodes.append(new_node)
             else:
                 logging.log(ULTRADEBUG, "%r excluded from the graph.", new_node)
         # Once we have all the nodes in a sentence, we generate all
         # possible combinations (O(n^2)), yes it's ugly.
         for i in xrange(len(nodes)):
             # Since relationships are not directional we can skip half
             # of the generation (i.e. if we have i<-->j we don't need
             # j<-->i
             for j in xrange(i+1, len(nodes)):
                 node1, node2=nodes[i],nodes[j] 
                 #new_link=AdirectionalLink(node1[0], node2[0], 
                 #                          (node1[1]+node2[1])/2.0)
                 # Is there a predication? Pairs without one get no link.
                 try:
                     this_link=predications[(node1, node2)]
                 except KeyError:
                     continue
                 new_link=self._adirectional_link_factory(node1, node2,
                                        this_link.weight)
                 if self.include_link(new_link):
                     new_graph.add_relationship(new_link)
                 else:
                     logging.log(ULTRADEBUG, "Excluding link %r from the graph",
                                   new_link)
     return new_graph
Exemplo n.º 12
0
 def term_vector(self, list_of_terms):
     """Build a VocabularyVector with a 1 at the index of every term in
     list_of_terms that this tree knows about; unknown terms are logged
     and skipped."""
     vector = VocabularyVector(self.num_terms)
     for each_term in list_of_terms:
         try:
             slot = self.index(each_term)
         except TermNotInTree:
             logging.warn(
                 'Weird: term %r could not be found in %r. It '
                 'should be there.', each_term, self)
             continue
         vector[slot] = 1
     return vector
Exemplo n.º 13
0
def processor(workflow_class,
              graph_builder_constructor,
              graph_builder_params,
              ranker_constructor,
              ranker_params,
              eval_parameters,
              ranking_cutoff,
              mesh_tree_filename,
              distance_matrix_filename,
              distance_function,
              umls_converter_data_filename,
              extra_data_name,
              extra_data_contents,
              my_input_queue,
              my_output_queue,
              my_own_name=None):
    """Worker-process loop: build one workflow and service a task queue.

    Instantiates workflow_class with the supplied construction
    parameters, optionally attaches extra_data_contents on the workflow
    under extra_data_name, then pulls requests from my_input_queue until
    the sentinel string 'STOP' arrives. Each processed article's results
    are pushed onto my_output_queue and the workflow's result store is
    cleared. Articles raising CouldNotRank are skipped; any other
    exception is logged with a traceback and re-raised, ending the
    worker.
    """
    logging.info("Setting up worker.")
    if my_own_name is not None:
        # Rename the OS process so it is identifiable in ps/top output.
        proctitle.setproctitle(my_own_name)

    my_workflow = workflow_class(graph_builder_constructor,
                                 graph_builder_params, ranker_constructor,
                                 ranker_params, eval_parameters,
                                 ranking_cutoff, mesh_tree_filename,
                                 distance_matrix_filename, distance_function,
                                 umls_converter_data_filename)
    if extra_data_name is not None:
        my_workflow.__setattr__(extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request = my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request == 'STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                my_workflow.process_article(request)
                # Recover the article, push it on the output queue
                my_output_queue.put(my_workflow.all_results)
                # Clear the output queue
                my_workflow.all_results = {}
            except CouldNotRank:
                #my_input_queue.put(request) # On error, push the task
                # back into the queue
                logging.info("Skipping unrankable article.")
            except:
                # Log with traceback, then re-raise: the bare except is
                # tolerable here only because it always re-raises.
                logging.warn("EXCEPTION RAISED: \n%s", traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
        logging.log(ULTRADEBUG, "Ending processor execution.")
    return
Exemplo n.º 14
0
Arquivo: tree.py Projeto: YZWD/MEDRank
 def term_vector(self, list_of_terms):
     """Returns a VocabularyVector representing the list of terms as seen 
     by this tree. Each known term sets a 1 at its tree index; unknown
     terms are logged and skipped."""
     new_vector=VocabularyVector(self.num_terms)
     for term in list_of_terms:
         try:
             new_vector[self.index(term)]=1
         except TermNotInTree:
             # Should not happen for well-formed input; keep going.
             logging.warn('Weird: term %r could not be found in %r. It '
                          'should be there.',
                          term, self)
     return new_vector
Exemplo n.º 15
0
 def process_article(self, each_article):
     """Rank a single article, convert its terms, and evaluate them
     against the article's MEDLINE record.

     Stores the unified evaluation result set in self.all_results keyed
     by the article's set_id. Articles that fail the inclusion criteria,
     cannot be ranked, have no retrievable MEDLINE record, or have an
     empty gold standard are skipped (with a log message).

     Fix over the original: the bare `except:` around the MEDLINE record
     retrieval has been narrowed to `except Exception:` so that
     KeyboardInterrupt/SystemExit propagate instead of being swallowed.
     """
     if not self.include_article(each_article):
         logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                       " criteria.", each_article)
         return
     try:
         ranked_article=self.graph_and_rank(each_article)
     except CouldNotRank:
         return
     logging.debug("Ranked article: %r", ranked_article)
     converted_terms=self.convert(ranked_article)
     logging.debug("Converted terms: %r", converted_terms)
     cut_terms=converted_terms.terms_higher_than_or_equal_to(
                         self._ranking_cutoff)
     logging.debug("Cut terms: %r", cut_terms)
     try:
         medline_record_mesh_terms=ExpressionList().from_medline(
                 each_article.set_id.article_record()['MH'])
     except Exception:
         # Best-effort: no record means the article cannot be scored.
         logging.warn("Could not obtain an article record for %r. "
                      "Skipping.", each_article)
         return
     flat_medline=medline_record_mesh_terms.flatten()
     flattened_terms=self.flatten_generated_terms(flat_medline,
                     cut_terms)
     flattened_terms=self.limit_length(flat_medline, flattened_terms)
     if len(flat_medline)==0:
         logging.warn("No gold standard available for article %r. "
                      "Omitting it from the result set.", each_article)
         return
     eval_result=self.perform_evaluation(each_article,
                                         self.evaluator,
                                         flat_medline,
                                         flattened_terms)
     flattened_major_headings=\
         medline_record_mesh_terms.major_headings()
     logging.debug("Flattened MeSH terms: %r", flat_medline)
     logging.debug("Flattened generated terms: %r", flattened_terms)
     # Second evaluation pass against major headings only.
     mh_result_temp=self.perform_evaluation(each_article, self.evaluator,
                                            flattened_major_headings,
                                            flattened_terms)
     mh_result=NamedResultSet("major_", mh_result_temp)
     # Compute the total recall, too
     total_recall=self.compute_total_recall(flat_medline, 
                                            converted_terms)
     eval_result.add(total_recall)
     # Unify the result sets
     self.all_results[each_article.set_id]=eval_result | mh_result
     return
Exemplo n.º 16
0
 def build_idf_from_file(self, file_reader, default_score=None):
     """Populate this dictionary with per-CUI IDF scores.

     Tries to load from a cache file first; otherwise iterates every
     article in file_reader, counts the number of articles each CUI
     appears in (presence, not frequency), stores
     log(count/document_frequency)+1.0 for each CUI, and dumps the
     result to the cache. Articles with an unknown (negative) PubMed ID
     are skipped. default_score is currently unused; it is kept for
     interface compatibility.

     Fix over the original: the bare `except:` around the cache load has
     been narrowed to `except Exception:` so KeyboardInterrupt and
     SystemExit propagate instead of silently triggering a rebuild.
     """
     tempdict = {}
     logging.info("Building the term frequency dictionary")
     count = 1
     logging.debug("Checking for a cache file, and loading from it.")
     try:
         self.populate_from_cache(
             self.cache_file_name(file_reader.original_file.name))
         logging.info("Loaded from cache. It's not necessary to build.")
         return
     except Exception:
         # Missing or unreadable cache: fall through to a full rebuild.
         logging.debug("Nope. Proceeding with building the dictionary.")
     for article in file_reader:
         logging.debug(
             "Processing article %r (number %d) for the term"
             " frequency dictionary", article, count)
         if article.set_id.pmid < 0:
             logging.warn("Article with unknown PubMed ID - skipping")
             continue
         count += 1
         tempcounts = {}
         for line in article.lines:
             try:
                 this_cui = line.CUI
             except AttributeError:
                 # Not every output line carries a concept; skip those.
                 continue
             # Presence, not frequency: each CUI counts once per article.
             tempcounts[this_cui] = 1
         # Now have all the CUIs that appeared in the article. Update
         # the total document counts.
         for k in tempcounts:
             tempdict[k] = tempdict.get(k, 0) + 1
     logging.debug("Built a dictionary with %d items. Computing IDFs.",
                   len(tempdict))
     # NOTE(review): count starts at 1 and is incremented once per kept
     # article, so it equals (number of articles + 1) — presumably
     # deliberate smoothing; confirm before changing.
     for k, v in tempdict.iteritems():
         self[k] = math.log(count / float(v)) + 1.0
     logging.info("Done building the dictionary. Dumping it to a cache "
                  "file.")
     self.dump_to_cache(self.cache_file_name(
         file_reader.original_file.name))
     return
Exemplo n.º 17
0
    def from_graphml_file(self, file_object, default_link=Link):
        """Load nodes and edges from a GraphML file into this graph.

        GraphML <key> declarations are scanned to find the ids of the
        'description', 'MR_id', and 'weight' attributes; nodes and edges
        are then built from the corresponding <data> elements. A missing
        node name becomes "NoName", a missing/unreadable edge weight
        defaults to 1.0 (with a logged traceback), and a missing edge
        description becomes "". Calls consolidate_graph() when done.

        Fix over the original: the two bare `except:` clauses on edge
        attributes have been narrowed to `except Exception:` so that
        KeyboardInterrupt/SystemExit propagate.
        """
        from xml.etree.ElementTree import iterparse

        def get_subelement_data(elem, key):
            # Text of the first <data> child whose 'key' attribute
            # matches, or None when there is no such child.
            result = [
                x.text for x in elem.getiterator()
                if x.tag == "{http://graphml.graphdrawing.org/xmlns}data"
                and x.get('key') == key
            ]
            if len(result) == 0:
                return None
            return result[0]

        nodes = {}
        # Discover the names of the attributes we're looking for by investigating the keys
        # Then actually read the file
        keystore = {}
        for event, element in iterparse(file_object):
            if element.tag == "{http://graphml.graphdrawing.org/xmlns}key":
                if element.get('attr.name') is None:
                    continue
                keystore[element.get('for') + '.' +
                         element.get('attr.name')] = element.get('id')
            if element.tag == "{http://graphml.graphdrawing.org/xmlns}node":
                # The next line supports yEd's NodeLabel and Profuse's label
                nodename = get_subelement_data(element,
                                               keystore['node.description'])
                if nodename is None:
                    nodename = "NoName"
                nodekey = get_subelement_data(element, keystore['node.MR_id'])
                nodes[element.get('id')] = Node(nodekey, nodename, 1.0)
            if element.tag == "{http://graphml.graphdrawing.org/xmlns}edge":
                n1 = nodes[element.get('source')]
                n2 = nodes[element.get('target')]
                try:
                    weight = float(
                        get_subelement_data(element, keystore['edge.weight']))
                except Exception:
                    # Missing or malformed weight data: fall back to 1.0.
                    logging.warn('Failed at reading weight because of:\n%s',
                                 traceback.format_exc())
                    weight = 1.0
                try:
                    relname = get_subelement_data(element,
                                                  keystore['edge.description'])
                except Exception:
                    # No description key declared: use an empty name.
                    relname = ""
                self.add_relationship(default_link(n1, n2, weight, relname))
        self.consolidate_graph()
        return
Exemplo n.º 18
0
 def iter_concepts(self):
     """Iterates through the concepts of the best-scoring mapping, in
     order, yielding one ConceptLine per EV entry.

     Parses self.line with the `mappings` grammar; a parse failure is
     logged and re-raised. Among the parsed mappings, the one with the
     strictly highest 'Score' wins (ties keep the earliest mapping).
     Yields nothing when there are no mappings.

     Fix over the original: a large commented-out legacy implementation
     (positional slot bookkeeping) was removed as dead code.
     """
     try:
         all_mappings=mappings.parseString(self.line)[0]
     except:
         # Log the offending line, then re-raise so the caller sees the
         # original parse error (the bare except is tolerable only
         # because it always re-raises).
         logging.warn("FAIL parsing %s", self.line)
         raise
     if len(all_mappings)==0:
         return
     # Get the mapping with the best score. If all have the same score,
     # uses the first one.
     best_mapping=all_mappings[0]['Expression'][0]
     best_mapping_score=all_mappings[0]['Score']
     for m in all_mappings[1:]:
         if m['Score']>best_mapping_score:
             best_mapping_score=m['Score']
             best_mapping=m['Expression'][0]
     # The EVs are in order
     for e in best_mapping:
         new_concept=ConceptLine(e['ConceptID'],
                                 e['Name'],
                                 int(e['Score']))
         logging.debug("Emitting %r", new_concept)
         yield new_concept
     return
Exemplo n.º 19
0
 def build_idf_from_file(self, file_reader, default_score=None):
     """Build the IDF dictionary from the articles in file_reader.

     Tries to load a previously-built dictionary from a cache file
     first; when that fails it counts, for each CUI, the number of
     articles it appears in, stores log(count/df)+1.0 under self[cui],
     and dumps the result back to the cache.

     Args:
         file_reader: iterable of articles; must expose
             'original_file.name' for cache-file naming.
         default_score: unused; kept for interface compatibility.
     """
     tempdict={}
     logging.info("Building the term frequency dictionary")
     # 'count' starts at 1, so after N processed articles it is N+1;
     # this acts as add-one smoothing in the IDF computation below.
     count=1
     logging.debug("Checking for a cache file, and loading from it.")
     try:
         self.populate_from_cache(
             self.cache_file_name(file_reader.original_file.name))
         logging.info("Loaded from cache. It's not necessary to build.")
         return
     except Exception:
         # Best-effort cache load: any failure just means we rebuild.
         # Narrowed from a bare 'except:' so KeyboardInterrupt and
         # SystemExit are no longer swallowed here.
         logging.debug("Nope. Proceeding with building the dictionary.")
     for article in file_reader:
         logging.debug("Processing article %r (number %d) for the term"
                      " frequency dictionary", article, count)
         if article.set_id.pmid < 0:
             logging.warn("Article with unknown PubMed ID - skipping")
             continue
         count+=1
         # Collect each CUI at most once per article: we want document
         # frequency, not raw term frequency.
         tempcounts={}
         for line in article.lines:
             try:
                 this_cui=line.CUI
             except AttributeError:
                 # Not every line carries a CUI; skip those that don't.
                 continue
             tempcounts[this_cui]=1
         # Now have all the CUIs that appeared in the article. Update
         # the total counts.
         for k in tempcounts:
             tempdict[k]=tempdict.get(k, 0)+1
     logging.debug("Built a dictionary with %d items. Computing IDFs.",
                   len(tempdict))
     for k, v in tempdict.iteritems():
         self[k]=math.log(count/float(v))+1.0
     logging.info("Done building the dictionary. Dumping it to a cache "
                  "file.")
     self.dump_to_cache(
             self.cache_file_name(file_reader.original_file.name))
     return
Exemplo n.º 20
0
 def process_article(self, each_article):
     """Rank one article, evaluate the ranking against the article's
     MEDLINE MeSH terms, and store the combined evaluation in
     self.all_results keyed by the article's set_id.

     Articles are silently skipped (no all_results entry) when they
     fail the inclusion criteria, cannot be ranked, have no retrievable
     MEDLINE record, or have an empty gold standard.
     """
     if not self.include_article(each_article):
         logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                     " criteria.", each_article)
         return
     try:
         ranked_article = self.graph_and_rank(each_article)
     except CouldNotRank:
         # Ranking failures are expected for some articles; skip quietly.
         return
     logging.debug("Ranked article: %r", ranked_article)
     # Map the ranked output into the evaluation term space.
     converted_terms = self.convert(ranked_article)
     logging.debug("Converted terms: %r", converted_terms)
     # Keep only terms scoring at or above the configured cutoff.
     cut_terms = converted_terms.terms_higher_than_or_equal_to(
         self._ranking_cutoff)
     logging.debug("Cut terms: %r", cut_terms)
     try:
         medline_record_mesh_terms = ExpressionList().from_medline(
             each_article.set_id.article_record()['MH'])
     except:
         # Best-effort record fetch: any failure (missing record, no
         # 'MH' field, fetch error) just drops the article.
         logging.warn(
             "Could not obtain an article record for %r. "
             "Skipping.", each_article)
         return
     flat_medline = medline_record_mesh_terms.flatten()
     flattened_terms = self.flatten_generated_terms(flat_medline, cut_terms)
     flattened_terms = self.limit_length(flat_medline, flattened_terms)
     if len(flat_medline) == 0:
         logging.warn(
             "No gold standard available for article %r. "
             "Omitting it from the result set.", each_article)
         return
     # Evaluate against the full gold standard...
     eval_result = self.perform_evaluation(each_article, self.evaluator,
                                           flat_medline, flattened_terms)
     # ...and separately against the major headings only.
     flattened_major_headings=\
         medline_record_mesh_terms.major_headings()
     #logging.debug("Original headings: %r Major headings: %r",
     #                medline_record_mesh_terms,
     #                flattened_major_headings)
     logging.debug("Flattened MeSH terms: %r", flat_medline)
     logging.debug("Flattened generated terms: %r", flattened_terms)
     mh_result_temp = self.perform_evaluation(each_article, self.evaluator,
                                              flattened_major_headings,
                                              flattened_terms)
     mh_result = NamedResultSet("major_", mh_result_temp)
     # Compute the total recall, too
     total_recall = self.compute_total_recall(flat_medline, converted_terms)
     eval_result.add(total_recall)
     # Unify the result sets
     self.all_results[each_article.set_id] = eval_result | mh_result
     return
Exemplo n.º 21
0
 def iter_concepts(self):
     """Yield ConceptLine objects, in order, from the highest-scoring
     mapping parsed out of self.line.

     Parse failures are logged and re-raised; a line with no mappings
     yields nothing. Ties on 'Score' are broken in favor of the first
     mapping encountered.
     """
     try:
         parsed = mappings.parseString(self.line)[0]
     except:
         logging.warn("FAIL parsing %s", self.line)
         raise
     if len(parsed) == 0:
         return
     # Select the best-scoring mapping; the first one wins on ties.
     winning_expression = parsed[0]['Expression'][0]
     top_score = parsed[0]['Score']
     for candidate in parsed[1:]:
         if candidate['Score'] > top_score:
             top_score = candidate['Score']
             winning_expression = candidate['Expression'][0]
     # The EVs inside the winning expression are already ordered.
     for ev in winning_expression:
         concept = ConceptLine(ev['ConceptID'], ev['Name'],
                               int(ev['Score']))
         logging.debug("Emitting %r", concept)
         yield concept
     return
Exemplo n.º 22
0
 def freeze(self):
     """Persist the DBDict configuration under its special keys.

     Writes sync_every, write_counter and write_every into the backing
     store and syncs it, so the DBDict state can be replicated later
     (i.e. for persistence). Read-only instances do nothing. Storage
     errors are logged, never raised."""
     if self.my_mode == "r":
         return
     #self.my_lock.acquire()
     store = self.my_store
     try:
         for key, value in ((SYNC_KEY, self.sync_every),
                            (COUNTER_KEY, self.write_counter),
                            (WRITE_EVERY_KEY, self.write_every)):
             store[key] = str(value)
         store.sync()
     except:
         logging.warn("ERROR while storing state: %s",
                      traceback.format_exc())
Exemplo n.º 23
0
 def freeze(self):
     """Write the DBDict configuration into its reserved keys so the
     state can be rebuilt later (i.e. for persistence).

     Does nothing when the store was opened read-only. Any storage
     failure is logged and swallowed so persistence problems never
     crash the caller."""
     if self.my_mode=="r":
         return
     #self.my_lock.acquire()
     try:
         # The three writes are independent; order is irrelevant.
         self.my_store[SYNC_KEY]=str(self.sync_every)
         self.my_store[WRITE_EVERY_KEY]=str(self.write_every)
         self.my_store[COUNTER_KEY]=str(self.write_counter)
         self.my_store.sync()
     except:
         logging.warn("ERROR while storing state: %s",
                      traceback.format_exc())
Exemplo n.º 24
0
 def from_graphml_file(self, file_object, default_link=Link):
     """Populate the graph from a GraphML file.

     Reads the <key> declarations to discover the attribute ids for
     node descriptions/MR_ids and edge weights/descriptions, then
     creates one Node per <node> element and one relationship
     (via default_link) per <edge> element. Edges with an unreadable
     weight default to 1.0; edges with no description get "".

     Args:
         file_object: an open file (or file-like) with GraphML content.
         default_link: factory called as default_link(n1, n2, weight,
             relname) for every edge.
     """
     from xml.etree.ElementTree import iterparse
     def get_subelement_data(elem, key):
         # Return the text of elem's <data key=...> child, or None.
         result=[x.text for x in elem.getiterator()
                 if x.tag=="{http://graphml.graphdrawing.org/xmlns}data"
                 and x.get('key')==key]
         if len(result)==0:
             return None
         return result[0]
     nodes={}
     # Discover the names of the attributes we're looking for by
     # investigating the keys, then actually read the file.
     keystore={}
     for event, element in iterparse(file_object):
         if element.tag=="{http://graphml.graphdrawing.org/xmlns}key":
             if element.get('attr.name') is None:
                 continue
             keystore[element.get('for')+'.'+element.get('attr.name')]=element.get('id')
         if element.tag=="{http://graphml.graphdrawing.org/xmlns}node":
             # The next line supports yEd's NodeLabel and Profuse's label
             nodename=get_subelement_data(element, keystore['node.description'])
             if nodename is None:
                 nodename="NoName"
             nodekey=get_subelement_data(element, keystore['node.MR_id'])
             nodes[element.get('id')]=Node(nodekey, nodename, 1.0)
         if element.tag=="{http://graphml.graphdrawing.org/xmlns}edge":
             n1=nodes[element.get('source')]
             n2=nodes[element.get('target')]
             try:
                 weight=float(get_subelement_data(element, keystore['edge.weight']))
             except (KeyError, TypeError, ValueError):
                 # Narrowed from a bare except: a missing 'edge.weight'
                 # key (KeyError), a missing <data> child (float(None)
                 # -> TypeError) or a non-numeric value (ValueError)
                 # all fall back to a weight of 1.0.
                 logging.warn('Failed at reading weight because of:\n%s',
                              traceback.format_exc())
                 weight=1.0
             try:
                 relname=get_subelement_data(element, keystore['edge.description'])
             except KeyError:
                 # No 'edge.description' key declared in this file.
                 relname=""
             self.add_relationship(default_link(n1, n2, weight, relname))
     self.consolidate_graph()
     return
Exemplo n.º 25
0
    def article_record(self):
        """Return the MEDLINE record for this PMID, preferring the cache.

        On a cache miss the record is fetched (and written back to the
        cache) when fetching is enabled; otherwise a KeyError is raised.
        """
        if Pmid.__article_cache is None:
            Pmid.init_storage()
        cached = self.__article_cache.get_record(self.__pmid)
        if cached is not None:
            return cached
        logging.warn('Could not read %s from the cache: \n', self.__pmid)
        if not self.__fetch_new:
            raise KeyError("No record for article %r could be found." %
                           self.__pmid)
        fetched = self.__article_cache.fetch_record(self.__pmid)
        if not self.__article_cache.put_record(self.__pmid, fetched):
            logging.warn("Unable to update the database: \n%r", fetched)
        return fetched
Exemplo n.º 26
0
Arquivo: pmid.py Projeto: YZWD/MEDRank
    def article_record(self):
        """Return the MEDLINE record for this PMID.

        Checks the article cache first; on a miss, fetches the record
        (storing it back in the cache) when fetching new records is
        allowed, otherwise raises KeyError.
        """
        if Pmid.__article_cache is None:
            Pmid.init_storage()
        record=self.__article_cache.get_record(self.__pmid)
        if record is not None:
            return record
        logging.warn('Could not read %s from the cache: \n',
                     self.__pmid)
        if self.__fetch_new:
            record=self.__article_cache.fetch_record(self.__pmid)
            if not self.__article_cache.put_record(self.__pmid, record):
                logging.warn("Unable to update the database: \n%r",
                             record)
            return record
        raise KeyError("No record for article %r could be found." % self.__pmid)
Exemplo n.º 27
0
 def ignore_exception(self, which_exception, on_which_line):
     """Decide whether a parsing exception corresponds to a known,
     ignorable problem with SEMREP's output.

     Args:
         which_exception: the exception instance raised while parsing.
         on_which_line: the offending line (used only for logging).
     Returns:
         True when the line should be skipped, False otherwise.
     """
     if type(which_exception) is CUINotFoundError:
         # Fixed: pass the line as a lazy logging argument instead of
         # eagerly %-formatting it into the message; this matches the
         # other branches and skips formatting when ULTRADEBUG is off.
         logging.log(ULTRADEBUG, "Skipping line '%s' because no CUI could be found "
                       "on it", on_which_line)
         return True
     if type(which_exception) is NoLineTypeError:
         logging.log(ULTRADEBUG, "Skipping line '%s' because its type could not be "
                       "determined.", on_which_line)
         return True
     if type(which_exception) is NoConfidenceError:
         logging.log(ULTRADEBUG, "Skipping line '%s' because it has no confidence.",
                       on_which_line)
         return True
     if type(which_exception) is UnknownLineTypeError:
         logging.warn("Skipping line '%s' because it has an unknown type",
                       on_which_line)
         return True
     return False
Exemplo n.º 28
0
    def __iter__(self):
        """Iterate over the parent's lines, grouping consecutive lines
        into line-set objects (self._lines_type) keyed by the PMID the
        chunkmap assigns to each line's block.

        Lines with no chunkmap entry are grouped under a negative
        sentinel id, decremented for each such emitted group so they
        stay distinct. A group is emitted each time the id changes,
        plus one final group at the end of iteration.
        """
        current_set = []
        current_id = None
        bad_id = -1
        for line in NLMOutput.__iter__(self):
            try:
                this_lines_set_id = self._chunkmap.pmid_from_block(
                    line.line_id)
            except KeyError:
                # Unknown block: file the line under the current
                # negative sentinel id.
                logging.warn(
                    "Line without chunkmap equivalent. Emitting"
                    " as id %d", bad_id)
                this_lines_set_id = Pmid(bad_id)
            if this_lines_set_id != current_id:
                # Is this the first invocation? If not, we have to emit the
                # linelist that just ended, but if it is we'll just pretend
                # that we did.
                if current_id is not None:
                    # Emit the linelist that just ended
                    logging.log(
                        ULTRADEBUG, "Completed set of lines %s "
                        "according to the chunkmap. Emitting them.",
                        current_id)
                    if current_id < 0:
                        # Decrement bad line counter
                        bad_id -= 1
                    yield self._lines_type(current_id, current_set)

                # Start a new, empty linelist
                current_id = this_lines_set_id
                current_set = []
            current_set.append(line)
        # Is there something left to emit after the iteration's over?
        if len(current_set) > 0:
            logging.log(
                ULTRADEBUG, "Completed iteration. Emitting the last "
                "lines left with set id %s", current_id)
            yield self._lines_type(current_id, current_set)
        return
Exemplo n.º 29
0
 def ignore_exception(self, which_exception, on_which_line):
     """Report whether a parsing exception is a known SEMREP output
     problem whose line should simply be skipped.

     Known-ignorable cases are logged (at ULTRADEBUG, or as a warning
     for unknown line types) and answered with True; anything else
     gets False so the caller can handle it."""
     exception_type = type(which_exception)
     # Known-ignorable problems reported at ULTRADEBUG level.
     quiet_skips = (
         (CUINotFoundError,
          "Skipping line '%s' because no CUI could be found on it"),
         (NoLineTypeError,
          "Skipping line '%s' because its type could not be determined."),
         (NoConfidenceError,
          "Skipping line '%s' because it has no confidence."),
     )
     for known_type, message in quiet_skips:
         if exception_type is known_type:
             logging.log(ULTRADEBUG, message, on_which_line)
             return True
     if exception_type is UnknownLineTypeError:
         logging.warn("Skipping line '%s' because it has an unknown type",
                      on_which_line)
         return True
     return False
Exemplo n.º 30
0
 def __init__(self, fileobject, transform_function):
     """Build the matrix and attach per-row normalization factors.

     The factors are loaded from a companion file on disk when one
     exists; otherwise they are generated and (unless the source is an
     in-memory StringIO) saved for future runs. A module-level lock
     serializes the whole procedure so only one process or thread at a
     time computes/loads the factors.

     Args:
         fileobject: source the matrix is read from.
         transform_function: passed through to SavccMatrix.__init__.
     """
     SavccMatrix.__init__(self, fileobject, transform_function)
     # Add normalization factors
     logging.log(ULTRADEBUG, "Initializing normalization array")
     # Default behavior: no normalization
     self.normfactors = [1.0] * self._height
     # Tentative normalization array name
     array_filename = self._expected_norm_array_name()
     logging.debug(
         "Trying to load a normalization array from disk. The "
         "file should be named %s.", array_filename)
     # Make sure that only one process or thread at a time can attempt
     # to get the normalization factors
     _normfactor_lock.acquire()
     try:
         try:
             # 'with' guarantees the handle is closed; the original
             # code opened the file and leaked it.
             with open(array_filename, 'rb') as norm_file:
                 self._load_normalization_factors(norm_file)
             logging.debug('Normalization factors loaded from disk.')
         except IOError:
             logging.debug(
                 "Unable to load normalization factors from disk.")
             self._generate_normalization_factors()
             # Only save normalization factors if they are not a StringIO
             # object
             if not isinstance(fileobject, StringIO.StringIO):
                 logging.debug("Saving normalization factors to %s",
                               array_filename)
                 try:
                     with open(array_filename, 'wb') as norm_file:
                         self._save_normalization_factors(norm_file)
                 except IOError:
                     logging.warn("Unable to save the normalization array. "
                                  "It will have to be regenerated each "
                                  "time.")
     finally:
         _normfactor_lock.release()
Exemplo n.º 31
0
import logging
import os.path
import sys

from MEDRank.computation.base_matrix import Matrix
from ctypes import cdll, CDLL, byref

# Disable warnings about spaces before operators (they drive me crazy)
# pylint: disable-msg=C0322

# Locate and load the compiled distance-matrix helper shipped in
# MEDRank's site-packages install; DISTLIB stays None when it is
# unavailable. 'import logging' added: this prologue used the module
# without importing it.
try:
    LIBRARY_LOCATION = os.path.join(sys.exec_prefix, 'lib',
                                    'python' + sys.version[:3],
                                    'site-packages', 'MEDRank', 'computation',
                                    '_distmat.so')
    cdll.LoadLibrary(LIBRARY_LOCATION)
    DISTLIB = CDLL(LIBRARY_LOCATION)
except OSError:
    # ctypes raises OSError when a shared library cannot be loaded;
    # narrowed from a bare except so unrelated bugs still surface.
    DISTLIB = None
    logging.warn("_distmat.so is not available; attempts to compute graph "
                 "metrics will result in an exception.")


class DistanceMatrix(object):
    """Represents a distance matrix, in which each C[i, j] encodes the 
    distance from i to j in a graph.
    Pass the value you plan on using as an unreachable distance to the 
    constructor. If you omit it, it will default to the link matrix's size
    (reasonable in most cases).
    
    The distance matrix is meant to compute stats on, so it's immutable by
    design.
    """
    def __init__(self, a_link_matrix, unreachable_distance=None):
        self._matrix = Matrix(len(a_link_matrix))
        if unreachable_distance is None:
Exemplo n.º 32
0
def output(output_file, result_queue, headers_callback=output_headers, 
           item_callback=output_one_item, initial_result_set_size=100):
    """Actually dumps the result set to output. Override for easy output
    customization.

    Consumes {pmid: result} dicts from result_queue until a 'STOP'
    sentinel arrives, writing each article's result via item_callback.
    The first initial_result_set_size results are buffered up front so
    headers_callback can derive the full column set before any row is
    written. Queue errors are logged and ignored; KeyboardInterrupt
    returns immediately.

    Args:
        output_file: writable file-like object receiving the results.
        result_queue: queue of {pmid: result} dicts (or the string
            'STOP' to finish).
        headers_callback: called once as (output_file, column_names);
            pass None to skip header output (column_names then stays an
            empty set — presumably item_callback copes; verify against
            callers).
        item_callback: called as (output_file, pmid, result,
            column_names) for every article.
        initial_result_set_size: number of results buffered before the
            column set is computed.
    """
    result_set={}
    proctitle.setproctitle("MEDRank-output-processor")
    stop_requested=False
    # Gather a few values
    logging.log(ULTRADEBUG, "Gathering values for initial analysis.")
    for i in xrange(initial_result_set_size):
        logging.log(ULTRADEBUG, "Getting results %d.", i)
        try:
            request=result_queue.get()
            if request=='STOP':
                stop_requested=True
                break
            result_set.update(request)
        except KeyboardInterrupt:
            return
        except:
            # Best-effort: a malformed result must not kill the output
            # process.
            logging.warn("EXCEPTION RAISED: \n%s", traceback.format_exc())

    logging.log(ULTRADEBUG, "Values gathered. Computing columns.")

    column_names=set([])
    # Add the colnames to the csv
    if headers_callback is not None:
        for result in result_set.itervalues():
            column_names|=result.columns()
        # Create a writer
        column_names=['pmid'] + [x for x in column_names]
        headers_callback(output_file, column_names)
    logging.log(ULTRADEBUG, "Looping to get more results and output them.")
    while True:
        if not stop_requested:
            try:
                request=result_queue.get()
                if request=='STOP':
                    stop_requested=True
                else:
                    result_set.update(request)
            except KeyboardInterrupt:
                return
            except:
                logging.warn("EXCEPTION RAISED: \n%s", traceback.format_exc())
        if stop_requested and len(result_set)==0:
            break
        if len(result_set)==0:
            continue # It can happen! We might get no results, or an empty set.
        # Emit one buffered result per pass (arbitrary dict order).
        pmid=result_set.keys()[0]
        logging.log(ULTRADEBUG, "Output: article %r.", pmid)
        result=result_set[pmid]
        item_callback(output_file, pmid, result, column_names)
        del result_set[pmid]
    # Push everything to disk; not all file-likes support flush/fsync.
    try:
        output_file.flush()
    except:
        logging.warn("The output file object does not support flushing.")
    try:
        os.fsync(output_file.fileno())
    except:
        logging.warn("Could not fsync the output file. Traceback follows.\n%s",
                     traceback.format_exc())
    return
Exemplo n.º 33
0
import logging
import os.path
import sys

from MEDRank.computation.base_matrix import Matrix
from ctypes import cdll, CDLL, byref

# Disable warnings about spaces before operators (they drive me crazy)
# pylint: disable-msg=C0322

# Load the native distance-matrix helper from MEDRank's site-packages
# directory; DISTLIB stays None when the library cannot be loaded.
# 'import logging' added: the prologue used the module without
# importing it.
try:
    LIBRARY_LOCATION=os.path.join(sys.exec_prefix, 'lib',
                                  'python'+sys.version[:3],
                                  'site-packages', 'MEDRank', 'computation',
                                  '_distmat.so')
    cdll.LoadLibrary(LIBRARY_LOCATION)
    DISTLIB=CDLL(LIBRARY_LOCATION)
except OSError:
    # Narrowed from a bare except: library-load failures raise OSError.
    DISTLIB=None
    logging.warn("_distmat.so is not available; attempts to compute graph "
                 "metrics will result in an exception.")
                 
class DistanceMatrix(object):
    """Represents a distance matrix, in which each C[i, j] encodes the 
    distance from i to j in a graph.
    Pass the value you plan on using as an unreachable distance to the 
    constructor. If you omit it, it will default to the link matrix's size
    (reasonable in most cases).
    
    The distance matrix is meant to compute stats on, so it's immutable by
    design.
    """
    def __init__(self, a_link_matrix, unreachable_distance=None):
        self._matrix=Matrix(len(a_link_matrix))
        if unreachable_distance is None:
            unreachable_distance=len(a_link_matrix)