Пример #1
0
 def _evaluate(self, gold_standard, seen_terms):
     """Compute the SAVCC similarity between a gold-standard term set
     and a set of seen terms, returning 0 on a zero-length vector."""
     logging.debug('Gold standard=%s Seen terms=%s alpha=%1.5f',
                   gold_standard, seen_terms, self._alpha)
     gold_vector = self._my_tree.term_vector(gold_standard)
     observed = self._my_tree.term_vector(seen_terms)
     # Build [(alpha*I2)+(1-alpha x M)I2]
     weighted = observed.scale(self._alpha)
     propagated = self._my_matrix.mult_by_vector(observed)
     modified_term = weighted + propagated.scale(1 - self._alpha)
     logging.log(ULTRADEBUG, "Modified term=%r", modified_term)
     # I1 * modified_term
     numerator = gold_vector.dot(modified_term)
     # Denominator of the whole thing
     denominator = gold_vector.length() * modified_term.length()
     try:
         result = numerator / denominator
     except ZeroDivisionError:
         logging.warn("ZeroDivisionError when computing SAVCC for %r and %r:",
                      gold_standard, seen_terms)
         result = 0
     logging.log(ULTRADEBUG, "Numerator=%1.7f Denominator=%1.7f Result=%1.7f",
                 numerator, denominator, result)
     return result
Пример #2
0
    def sentence_iterator(self, list_of_lines):
        """Yield consecutive runs of lines that share the same line_id.

        After the iteration finishes, the number of sentences emitted is
        recorded in the graph builder's measurements."""
        pending = []
        active_id = None
        emitted = 0
        for line in list_of_lines:
            if line.line_id != active_id:
                # A new id starts a new sentence; flush the previous run
                # unless this is the very first line we see.
                if active_id is not None:
                    logging.log(ULTRADEBUG,
                                "Emitting sentence %s with %d terms",
                                active_id, len(pending))
                    emitted += 1
                    yield pending
                active_id = line.line_id
                pending = []
            pending.append(line)
        # Flush whatever remains after the loop.
        if len(pending) > 0:
            logging.log(ULTRADEBUG, "Emitting last sentence %s with %d terms",
                        active_id, len(pending))
            emitted += 1
            yield pending

        # Iteration over - store the measurement.
        self._measurements.add(ArticleSentenceCount(emitted))
        return
Пример #3
0
 def __init__(
         self,
         original_line,
         cui_position=6,  # Support for different SEMREP output formats
         description_position=7,
         semantic_type_position=8):
     """Parse one SEMREP entity line.

     original_line: raw line handed to SemrepLine.__init__.
     cui_position/description_position/semantic_type_position: indices
     into the split line for each field; overridable to support
     different SEMREP output formats.

     Raises CUINotFoundError when the CUI field is missing or empty,
     ParsingError when the description or semantic type is missing, and
     NoConfidenceError when the confidence field cannot be parsed.
     """
     SemrepLine.__init__(self, original_line)
     #line_breakup=self._line.split(self.split_char)
     try:
         self._cui = self.split_line[cui_position]
     except IndexError:
         raise CUINotFoundError("There was no CUI in the line '%s'" %
                                self._line)
     # An empty CUI field is treated like a missing one.
     if self._cui == '':
         raise CUINotFoundError("There was no CUI in the line '%s'" %
                                self._line)
     try:
         self._description = self.split_line[description_position]
         self._semantic_type = self.split_line[semantic_type_position]
     except IndexError:
         raise ParsingError("Data missing from line '%s'" % self._line)
     # Some entities have no stated confidence. We use 0 in such cases,
     # so they can be eliminated from the workflow later.
     # NOTE(review): the code below actually raises NoConfidenceError on
     # an unparseable confidence instead of storing 0 - the comment
     # above may be stale; confirm intended behavior.
     try:
         self.confidence = float(self.split_line[-3]) / 1000.0
     except ValueError:
         raise NoConfidenceError("Could not parse a confidence value in "
                                 "line '%s'" % self._line)
     logging.log(ULTRADEBUG, "Created an entity_line @ %d: %s (%s) %1.3f",
                 self.line_id, self._cui, self._description,
                 self.confidence)
Пример #4
0
 def __init__(self, original_line):
     """Parse one SEMREP relation line.

     Extracts CUI1 (field 11), CUI2 (field 33), the relation type
     (field 24), and the confidence (third field from the end, stored
     in thousandths and scaled to [0, 1]).

     Raises CUINotFoundError when either CUI is missing or empty,
     ParsingError when the relation type is missing, and
     NoConfidenceError when the confidence cannot be parsed.
     """
     SemrepLine.__init__(self, original_line)
     #line_breakup=self._line.split(self.split_char)
     try:
         self._cui1=self.split_line[11]
         if self._cui1=='':
             raise IndexError() # Trigger the CUINotFoundError
     except IndexError:
         raise CUINotFoundError("There was no CUI1 in the line '%s'" % 
                                self._line)
     try:
         self._cui2=self.split_line[33]
         if self._cui2=='':
             raise IndexError() # Trigger the CUINotFoundError
     except IndexError:
         raise CUINotFoundError("There was no CUI2 in the line '%s'" % 
                                self._line)
     try:
         self._relation_type=self.split_line[24]
     except IndexError:
         raise ParsingError("Data missing from line '%s'" % self._line)
     
     try:
         self.confidence=float(self.split_line[-3])/1000.0
     except ValueError:
         raise NoConfidenceError("Could not parse a confidence value in "
                                 "line '%s'" % self._line)
     logging.log(ULTRADEBUG, "Created a relation_line @ %d: %s--%s-->%s (%1.3f)", 
                   self.line_id, self._cui1, self._relation_type,
                   self._cui2, self.confidence)
Пример #5
0
 def __init__(self, original_line,
              cui_position=6,  # Support for different SEMREP output formats
              description_position=7,
              semantic_type_position=8):
     """Parse a SEMREP entity line, pulling the CUI, description,
     semantic type, and confidence out of the split line."""
     SemrepLine.__init__(self, original_line)
     try:
         self._cui = self.split_line[cui_position]
         if self._cui == '':
             raise IndexError()  # An empty CUI counts as a missing one
     except IndexError:
         raise CUINotFoundError("There was no CUI in the line '%s'" %
                                self._line)
     try:
         self._description = self.split_line[description_position]
         self._semantic_type = self.split_line[semantic_type_position]
     except IndexError:
         raise ParsingError("Data missing from line '%s'" % self._line)
     # Entities whose confidence field is unparseable are rejected here,
     # so they can be eliminated from the workflow later.
     try:
         self.confidence = float(self.split_line[-3]) / 1000.0
     except ValueError:
         raise NoConfidenceError("Could not parse a confidence value in "
                                 "line '%s'" % self._line)
     logging.log(ULTRADEBUG, "Created an entity_line @ %d: %s (%s) %1.3f",
                 self.line_id, self._cui,
                 self._description, self.confidence)
Пример #6
0
 def __init__(self, fileobject, transform_function):
     """Build the matrix and attach per-row normalization factors.

     The factors default to 1.0 (no normalization). They are loaded
     from a companion file on disk when one exists; otherwise they are
     generated and - unless the matrix came from a StringIO buffer -
     saved back to disk for reuse. A module-level lock serializes the
     whole load/generate/save sequence across threads.
     """
     SavccMatrix.__init__(self, fileobject, transform_function)
     # Add normalization factors
     logging.log(ULTRADEBUG, "Initializing normalization array")
     # Default behavior: no normalization
     self.normfactors=[1.0]*self._height
     # Tentative normalization array name
     array_filename=self._expected_norm_array_name()
     logging.debug("Trying to load a normalization array from disk. The "
                   "file should be named %s.", array_filename)
     # Make sure that only one process or thread at a time can attempt to get 
     # the normalization factors
     _normfactor_lock.acquire()
     try:
         try:
             self._load_normalization_factors(open(array_filename, 'rb'))
             logging.debug('Normalization factors loaded from disk.')
         except IOError:
             # No usable file on disk: compute the factors from scratch.
             logging.debug("Unable to load normalization factors from disk.")
             self._generate_normalization_factors()
             # Only save normalization factors if they are not a StringIO
             # object
             if not isinstance(fileobject, StringIO.StringIO):
                 logging.debug("Saving normalization factors to %s",
                               array_filename)
                 try:
                     self._save_normalization_factors(open(array_filename,
                                                           'wb'))
                 except IOError:
                     # Best-effort save: regenerating later is acceptable.
                     logging.warn("Unable to save the normalization array. "
                                  "It will have to be regenerated each "
                                  "time.")
     finally:
         # Always release the lock, even if generation or saving failed.
         _normfactor_lock.release()
Пример #7
0
    def __init__(self, original_line):
        """Parse one SEMREP relation line.

        Extracts CUI1 (field 11), CUI2 (field 33), the relation type
        (field 24), and the confidence (third field from the end,
        stored in thousandths and scaled to [0, 1]).

        Raises CUINotFoundError when either CUI is missing or empty,
        ParsingError when the relation type is missing, and
        NoConfidenceError when the confidence cannot be parsed.
        """
        SemrepLine.__init__(self, original_line)
        #line_breakup=self._line.split(self.split_char)
        try:
            self._cui1 = self.split_line[11]
            if self._cui1 == '':
                raise IndexError()  # Trigger the CUINotFoundError
        except IndexError:
            raise CUINotFoundError("There was no CUI1 in the line '%s'" %
                                   self._line)
        try:
            self._cui2 = self.split_line[33]
            if self._cui2 == '':
                raise IndexError()  # Trigger the CUINotFoundError
        except IndexError:
            raise CUINotFoundError("There was no CUI2 in the line '%s'" %
                                   self._line)
        try:
            self._relation_type = self.split_line[24]
        except IndexError:
            raise ParsingError("Data missing from line '%s'" % self._line)

        try:
            self.confidence = float(self.split_line[-3]) / 1000.0
        except ValueError:
            raise NoConfidenceError("Could not parse a confidence value in "
                                    "line '%s'" % self._line)
        logging.log(ULTRADEBUG,
                    "Created a relation_line @ %d: %s--%s-->%s (%1.3f)",
                    self.line_id, self._cui1, self._relation_type, self._cui2,
                    self.confidence)
Пример #8
0
 def __init__(self, damping_factor=0.85, max_iterations=10000,
              epsilon=0.0001):
     """Set up a ranker and store its iteration parameters."""
     logging.log(ULTRADEBUG, "Creating a ranker object.")
     self._d = damping_factor
     self._e = epsilon
     self._max_iter = max_iterations
     self._latest_stats = None
Пример #9
0
 def _evaluate(self, gold_standard, seen_terms):
     """Compute SAVCC between two sets of terms.

     Both term sets become tree vectors; the seen vector is blended
     with its matrix-propagated version using self._alpha, and the
     result is the dot product of the gold vector with that blend,
     normalized by the product of their lengths (cosine form).
     Returns 0 when either vector has zero length.
     """
     logging.debug('Gold standard=%s Seen terms=%s alpha=%1.5f',
                   gold_standard, seen_terms, self._alpha)
     gold_standard_vector = self._my_tree.term_vector(gold_standard)
     seen_vector = self._my_tree.term_vector(seen_terms)
     # This computes [(alpha*I2)+(1-alpha x M)I2]
     modified_term=seen_vector.scale(self._alpha)+\
         self._my_matrix.mult_by_vector(seen_vector).scale(1-self._alpha)
     logging.log(ULTRADEBUG, "Modified term=%r", modified_term)
     # I1 * modified_term
     numerator = gold_standard_vector.dot(modified_term)
     # Denominator of the whole thing
     denominator = gold_standard_vector.length() * modified_term.length()
     try:
         result = numerator / denominator
     except ZeroDivisionError:
         # Either vector was empty; define the similarity as 0.
         logging.warn(
             "ZeroDivisionError when computing SAVCC for %r and %r:",
             gold_standard, seen_terms)
         result = 0
     logging.log(ULTRADEBUG,
                 "Numerator=%1.7f Denominator=%1.7f Result=%1.7f",
                 numerator, denominator, result)
     return result
Пример #10
0
 def unfreeze(self):
     """Restores dictionary state from the database. Assumes a very 
     conservative state if it can't read state from disk."""
     # Is this the first-ever load??
     logging.log(ULTRADEBUG, "Restoring state from disk.")
     # An empty store means nothing was ever persisted: keep defaults.
     if len(self.my_store.keys()) == 0:
         return
     #self.my_lock.acquire()
     # Each missing key falls back to a conservative default below.
     try:
         self.sync_every = int(self.my_store[SYNC_KEY])
     except KeyError:
         self.sync_every = 1
     try:
         self.write_counter = int(self.my_store[COUNTER_KEY])
     except KeyError:
         self.write_counter = 0
     try:
         # If there's no WRITE_EVERY, keep the default (for compatibility
         # With previous versions of the DBDict)
         result = int(self.my_store[WRITE_EVERY_KEY])
     except KeyError:
         result = self.write_every
     self.write_every = result
     #self.my_lock.release()
     logging.log(ULTRADEBUG, "State restored.")
Пример #11
0
 def check_extra_checktag_rules(self, an_expression):
     """Checks to see if a mesh term is in a known checktag-emitting
     tree, accumulating the terms of every matching rule into
     self._extra_checktags."""
     # One tree-position entry per term in the expression's utterance.
     positions = [
         self._tree[a_term.term].position
         for a_term in an_expression.utterance
     ]
     for position in positions:
         for rule in self.data.extra_checktag_rules:
             """Check each position in each tree for membership in the
             tree-based checktag rules"""
             # NOTE(review): the string above is a no-op expression
             # statement used as a comment; kept verbatim.
             # 'in' fires when any rule pattern is a substring of any
             # of this term's tree positions.
             if 'in' in rule:
                 in_rule = any(
                     [any([x in y for x in rule['in']]) for y in position])
             else:
                 in_rule = False
             if 'not in' in rule:
                 not_in_rule = any([
                     any([x in y for x in rule['not in']]) for y in position
                 ])
             else:
                 not_in_rule = False
             # A rule matches when an 'in' pattern hit and no 'not in'
             # pattern did.
             if in_rule and not not_in_rule:
                 logging.log(ULTRADEBUG,
                             "Expression %r matches checktag rule %r",
                             an_expression, rule)
                 self._extra_checktags |= set(
                     [Term(x) for x in rule['terms']])
     return
Пример #12
0
 def check_extra_checktag_rules(self, an_expression):
     """Checks to see if a mesh term is in a known checktag-emitting
     tree, accumulating the terms of every matching rule into
     self._extra_checktags."""
     # One tree-position entry per term in the expression's utterance.
     positions=[self._tree[a_term.term].position for a_term in            
                an_expression.utterance]
     for position in positions:
         for rule in self.data.extra_checktag_rules:
             """Check each position in each tree for membership in the
             tree-based checktag rules"""
             # NOTE(review): the string above is a no-op expression
             # statement used as a comment; kept verbatim.
             # 'in' fires when any rule pattern is a substring of any
             # of this term's tree positions.
             if 'in' in rule:
                 in_rule=any([any([x in y for x in rule['in']]) 
                              for y in position])
             else:
                 in_rule=False
             if 'not in' in rule:
                 not_in_rule=any([any([x in y for x in rule['not in']])
                                  for y in position])
             else:
                 not_in_rule=False
             # A rule matches when an 'in' pattern hit and no 'not in'
             # pattern did.
             if in_rule and not not_in_rule:
                 logging.log(ULTRADEBUG, "Expression %r matches checktag rule %r",
                               an_expression, rule)
                 self._extra_checktags|=set([Term(x) for x in 
                                             rule['terms']])
     return
Пример #13
0
 def __init__(self, fileobject=None, transform_function=None):
     """Open a SAVCC matrix stored in fileobject.

     fileobject: binary file-like object starting with a big-endian
     '>HH' height/width header; defaults to the matrix installed with
     the package. Presumably each cell is stored as one byte that
     transform_function maps to a float - TODO confirm against the
     row-reading code.
     transform_function: maps a byte value (0-254) to a float; byte
     255 is always mapped to 0.0.
     """
     # The default matrix is installed together with the package
     if fileobject is None:
         fileobject=open(_DEFAULT_MATRIX_NAME, "rb")
     # The matrix file has a header that describes the size of the 
     # matrix in the first bytes
     self.header_size=struct.calcsize(">HH")
     # Read the file header and get the height and width of the matrix 
     self._height, self._width=struct.unpack('>HH',
                                   fileobject.read(self.header_size))
     logging.debug("We're reading a %dx%d matrix", self._height,
                                                   self._width)
     # Keep a link to the file. We'll need it.
     self._matrix_file=fileobject
     #self._matrix_file_handle=self._matrix_file.fileno()
     # This will store the mappings from bytes in the matrix to actual
     # results
     self.transform=[0.0]*256
     logging.log(ULTRADEBUG, "Building the transformation array.")
     for i in xrange(255):
         self.transform[i]=transform_function(i)
     # We leave the last value blank; it's always 0.
     logging.debug("The transformation array is %s.", str(self.transform))
     # Save the size of a byte for later
     self.byte_size=struct.calcsize('<B')
     # Single-row cache: -1 means "nothing cached yet".
     self._cached_row=-1
     self._row_cache=None
Пример #14
0
 def unfreeze(self):
     """Reload persisted dictionary state, falling back to conservative
     defaults for anything missing from the store."""
     logging.log(ULTRADEBUG, "Restoring state from disk.")
     if len(self.my_store.keys())==0:
         # First-ever load: nothing has been persisted yet.
         return
     #self.my_lock.acquire()
     try:
         self.sync_every = int(self.my_store[SYNC_KEY])
     except KeyError:
         self.sync_every = 1
     try:
         self.write_counter = int(self.my_store[COUNTER_KEY])
     except KeyError:
         self.write_counter = 0
     # WRITE_EVERY may be absent in databases written by older versions
     # of the DBDict; keep the current value in that case.
     try:
         new_write_every = int(self.my_store[WRITE_EVERY_KEY])
     except KeyError:
         new_write_every = self.write_every
     self.write_every = new_write_every
     #self.my_lock.release()
     logging.log(ULTRADEBUG, "State restored.")
Пример #15
0
 def init_storage(cache_name=pmid_cache.DEFAULT_CACHE_HOST,
                  fetch_new_articles=True):
     """Set up the class-wide article cache connection (idempotent).

     cache_name: host handed to pmid_cache.Client.
     fetch_new_articles: stored on the class, but only on the first
     call, together with the cache client; later calls are no-ops.
     """
     if Pmid.__article_cache is None:
         Pmid.__article_cache = pmid_cache.Client(cache_name)
         Pmid.__fetch_new = fetch_new_articles
     logging.log(ULTRADEBUG,
                 "Started up a connection to the pubmed cache database.")
Пример #16
0
 def __init__(self, fileobject=None, transform_function=None):
     """Open a SAVCC matrix stored in fileobject.

     fileobject: binary file-like object starting with a big-endian
     '>HH' height/width header; defaults to the matrix installed with
     the package. Presumably each cell is stored as one byte that
     transform_function maps to a float - TODO confirm against the
     row-reading code.
     transform_function: maps a byte value (0-254) to a float; byte
     255 is always mapped to 0.0.
     """
     # The default matrix is installed together with the package
     if fileobject is None:
         fileobject = open(_DEFAULT_MATRIX_NAME, "rb")
     # The matrix file has a header that describes the size of the
     # matrix in the first bytes
     self.header_size = struct.calcsize(">HH")
     # Read the file header and get the height and width of the matrix
     self._height, self._width = struct.unpack(
         '>HH', fileobject.read(self.header_size))
     logging.debug("We're reading a %dx%d matrix", self._height,
                   self._width)
     # Keep a link to the file. We'll need it.
     self._matrix_file = fileobject
     #self._matrix_file_handle=self._matrix_file.fileno()
     # This will store the mappings from bytes in the matrix to actual
     # results
     self.transform = [0.0] * 256
     logging.log(ULTRADEBUG, "Building the transformation array.")
     for i in xrange(255):
         self.transform[i] = transform_function(i)
     # We leave the last value blank; it's always 0.
     logging.debug("The transformation array is %s.", str(self.transform))
     # Save the size of a byte for later
     self.byte_size = struct.calcsize('<B')
     # Single-row cache: -1 means "nothing cached yet".
     self._cached_row = -1
     self._row_cache = None
Пример #17
0
 def __init__(self, original_line):
     """Parse one MTI output line.

     Reads the description (field 1), CUI (field 2), confidence
     (field 3, in thousandths), type (field 4, uppercased), and source
     (field 7) from the split line.

     Raises CUINotFoundError when the CUI is missing or empty,
     ParsingError when other fields are missing, and NoConfidenceError
     when the confidence cannot be parsed.
     """
     Line.__init__(self, original_line, id_position=0)
     try:
         self._cui=self.split_line[2]
     except IndexError:
         raise CUINotFoundError("There was no CUI in the line '%s'" % 
                                self._line)
     # An empty CUI field is treated like a missing one.
     if self._cui=='':
         raise CUINotFoundError("There was no CUI in the line '%s'" % 
                                self._line)
     try:
         self._description=self.split_line[1]
         self._source=self.split_line[7]
         self._type=self.split_line[4].upper()
     except IndexError:
         raise ParsingError("Data missing from line '%s'" % self._line)
     # Some entities have no stated confidence. We use 0 in such cases,
     # so they can be eliminated from the workflow later.
     # NOTE(review): the code below actually raises NoConfidenceError
     # instead of storing 0 - the comment above may be stale.
     try:
         self.confidence=float(self.split_line[3])/1000.0
     except ValueError:
         raise NoConfidenceError("Could not parse a confidence value in "
                                 "line '%s'" % self._line)
     logging.log(ULTRADEBUG, "Created a MtiLine @ %d: %s (%s) %1.3f", 
                   self.line_id, self._cui,
                   self._description, self.confidence)
Пример #18
0
 def init_storage(cache_name=pmid_cache.DEFAULT_CACHE_HOST, 
                  fetch_new_articles=True):
     """Set up the class-wide article cache connection (idempotent).

     cache_name: host handed to pmid_cache.Client.
     fetch_new_articles: stored on the class, but only on the first
     call, together with the cache client; later calls are no-ops.
     """
     if Pmid.__article_cache is None:
         Pmid.__article_cache=pmid_cache.Client(cache_name)
         Pmid.__fetch_new=fetch_new_articles
     logging.log(ULTRADEBUG, 
                 "Started up a connection to the pubmed cache database.")  
Пример #19
0
 def check_for_subheadings(self, an_expression):
     """Checks to see if this expression needs a subheading added.

     Returns the Terms of the first matching subheading rule, or []
     when no rule matches."""
     # One tree-position entry per term in the expression's utterance.
     positions = [
         self._tree[a_term.term].position
         for a_term in an_expression.utterance
     ]
     for position in positions:
         for rule in self.data.subheading_rules:
             """Check each position in each tree for membership in the
             tree-based checktag rules"""
             # NOTE(review): the string above is a no-op expression
             # statement used as a comment; kept verbatim.
             # 'in' fires when any rule pattern is a substring of any
             # of this term's tree positions.
             if 'in' in rule:
                 in_rule = any(
                     [any([x in y for x in rule['in']]) for y in position])
             else:
                 in_rule = False
             if 'not in' in rule:
                 not_in_rule = any([
                     any([x in y for x in rule['not in']]) for y in position
                 ])
             else:
                 not_in_rule = False
             # A rule matches when an 'in' pattern hit and no 'not in'
             # pattern did; the first match wins.
             if in_rule and not not_in_rule:
                 logging.log(ULTRADEBUG,
                             "Expression %r matches subheading rule %r",
                             an_expression, rule)
                 return [Term(x) for x in rule['terms']]
     return []
 def sentence_iterator(self, list_of_lines):
     """Iterates through the list of lines, returning a group with the
     same line_id each time.
     At the end of the iteration, the procedure updates the measurements
     of the graph builder to record the number of sentences it emitted."""
     current_group=[]
     current_id=None
     sentence_count=0
     for each_line in list_of_lines:
         # A change in line_id marks the boundary of a sentence.
         if each_line.line_id!=current_id:
             # Is it the first time? If not, emit the current group
             if current_id is not None:
                 logging.log(ULTRADEBUG, "Emitting sentence %s with %d terms",
                               current_id, len(current_group))
                 sentence_count+=1
                 yield current_group
             current_id=each_line.line_id
             current_group=[]
         current_group.append(each_line)
     # Are there lines left? Emit them
     if len(current_group)>0:
         logging.log(ULTRADEBUG, "Emitting last sentence %s with %d terms",
                       current_id, len(current_group))
         sentence_count+=1
         yield current_group
     
     # Iteration ended - record the measurement
     self._measurements.add(ArticleSentenceCount(sentence_count))
     return
Пример #21
0
 def __iter__(self):
     """Iterate over lines from NLMOutput, grouping consecutive lines
     by the set id the chunkmap assigns them, and yield one
     self._lines_type(set_id, lines) object per group.

     Lines without a chunkmap entry are grouped under a synthetic
     negative id; the counter is decremented after each such group is
     emitted so those ids stay unique.
     """
     current_set=[]
     current_id=None
     bad_id=-1
     for line in NLMOutput.__iter__(self):
         try:
             this_lines_set_id=self._chunkmap.pmid_from_block(line.line_id)
         except KeyError:
             logging.warn("Line without chunkmap equivalent. Emitting"
                             " as id %d", bad_id)
             this_lines_set_id=Pmid(bad_id)
         if this_lines_set_id!=current_id:
             # Is this the first invocation? If not, we have to emit the
             # linelist that just ended, but if it is we'll just pretend
             # that we did.
             if current_id is not None:
                 # Emit the linelist that just ended
                 logging.log(ULTRADEBUG, "Completed set of lines %s "
                               "according to the chunkmap. Emitting them.",
                                current_id)
                 if current_id<0:
                     # Decrement bad line counter
                     bad_id-=1
                 yield self._lines_type(current_id, current_set)
                     
             # Start a new, empty linelist
             current_id=this_lines_set_id
             current_set=[]
         current_set.append(line)
     # Is there something left to emit after the iteration's over?
     if len(current_set)>0:
         logging.log(ULTRADEBUG, "Completed iteration. Emitting the last "
                                 "lines left with set id %s", current_id)
         yield self._lines_type(current_id, current_set)
     return
Пример #22
0
 def run(self):
     """Perform the evaluation.

     Iterates over every article from the reader, ranks it, converts
     and trims the ranked terms at self._ranking_cutoff, and evaluates
     them against the article's MEDLINE MeSH terms (both the full set
     and the major headings only). Results are keyed by article set_id
     and written out at the end together with workflow metadata.
     """
     logging.info("Starting workflow %r run", self)
     all_results={}
     evaluator=self.create_evaluator()
     count=0
     for each_article in self._reader:
         count+=1
         logging.info("Working on article %d: %r", count, each_article)
         if not self.include_article(each_article):
             logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                           " criteria.", each_article)
             continue
         # Articles that fail to rank are skipped entirely.
         try:
             ranked_article=self.graph_and_rank(each_article)
         except CouldNotRank:
             continue
         converted_terms=self.convert(ranked_article)
         cut_terms=converted_terms.terms_higher_than_or_equal_to(
                             self._ranking_cutoff)
         # NOTE(review): the log below indexes cut_terms[-1]/[0] and
         # will raise IndexError if the cutoff leaves no terms -
         # confirm that cannot happen here.
         logging.debug("Lowest-ranking term is term #%d out of %d"
                       " (score=%1.5f, highest score=%1.5f)",
                       len(cut_terms), len(converted_terms),
                       [x[1] for x in cut_terms][-1],
                       [x[1] for x in cut_terms][0])
         medline_record_mesh_terms=ExpressionList().from_medline(
                 each_article.set_id.article_record().mesh_headings)
         flat_medline=medline_record_mesh_terms.flatten()
         flattened_terms=self.flatten_generated_terms(flat_medline,
                         cut_terms)
         flattened_terms=self.limit_length(flat_medline, flattened_terms)
         # Without a gold standard there is nothing to evaluate against.
         if len(flat_medline)==0:
             logging.warn("No gold standard available for article %r. "
                          "Omitting it from the result set.", each_article)
             continue
         eval_result=self.perform_evaluation(each_article,
                                             evaluator,
                                             flat_medline,
                                             flattened_terms)
         # Second evaluation pass against the major headings only.
         flattened_major_headings=\
             medline_record_mesh_terms.major_headings()
         logging.debug("Original headings: %r Major headings: %r", 
                         medline_record_mesh_terms,
                         flattened_major_headings)
         mh_result_temp=self.perform_evaluation(each_article, evaluator,
                                                flattened_major_headings,
                                                flattened_terms)
         mh_result=NamedResultSet("mh_", mh_result_temp)
         # Compute the total recall, too
         total_recall=self.compute_total_recall(flat_medline, 
                                                converted_terms)
         eval_result.add(total_recall)
         # Unify the result sets
         all_results[each_article.set_id]=eval_result | mh_result
     logging.info("Writing out results.")
     self.output(all_results)
     self.output_metadata()
     return
Пример #23
0
 def run(self):
     """Perform the evaluation.

     Iterates over every article from the reader, ranks it, converts
     and trims the ranked terms at self._ranking_cutoff, and evaluates
     them against the article's MEDLINE MeSH terms (both the full set
     and the major headings only). Results are keyed by article set_id
     and written out at the end together with workflow metadata.
     """
     logging.info("Starting workflow %r run", self)
     all_results = {}
     evaluator = self.create_evaluator()
     count = 0
     for each_article in self._reader:
         count += 1
         logging.info("Working on article %d: %r", count, each_article)
         if not self.include_article(each_article):
             logging.log(
                 ULTRADEBUG, "Skipping article %r due to exclusion "
                 " criteria.", each_article)
             continue
         # Articles that fail to rank are skipped entirely.
         try:
             ranked_article = self.graph_and_rank(each_article)
         except CouldNotRank:
             continue
         converted_terms = self.convert(ranked_article)
         cut_terms = converted_terms.terms_higher_than_or_equal_to(
             self._ranking_cutoff)
         # NOTE(review): the log below indexes cut_terms[-1]/[0] and
         # will raise IndexError if the cutoff leaves no terms -
         # confirm that cannot happen here.
         logging.debug(
             "Lowest-ranking term is term #%d out of %d"
             " (score=%1.5f, highest score=%1.5f)", len(cut_terms),
             len(converted_terms), [x[1] for x in cut_terms][-1],
             [x[1] for x in cut_terms][0])
         medline_record_mesh_terms = ExpressionList().from_medline(
             each_article.set_id.article_record().mesh_headings)
         flat_medline = medline_record_mesh_terms.flatten()
         flattened_terms = self.flatten_generated_terms(
             flat_medline, cut_terms)
         flattened_terms = self.limit_length(flat_medline, flattened_terms)
         # Without a gold standard there is nothing to evaluate against.
         if len(flat_medline) == 0:
             logging.warn(
                 "No gold standard available for article %r. "
                 "Omitting it from the result set.", each_article)
             continue
         eval_result = self.perform_evaluation(each_article, evaluator,
                                               flat_medline,
                                               flattened_terms)
         # Second evaluation pass against the major headings only.
         flattened_major_headings=\
             medline_record_mesh_terms.major_headings()
         logging.debug("Original headings: %r Major headings: %r",
                       medline_record_mesh_terms, flattened_major_headings)
         mh_result_temp = self.perform_evaluation(each_article, evaluator,
                                                  flattened_major_headings,
                                                  flattened_terms)
         mh_result = NamedResultSet("mh_", mh_result_temp)
         # Compute the total recall, too
         total_recall = self.compute_total_recall(flat_medline,
                                                  converted_terms)
         eval_result.add(total_recall)
         # Unify the result sets
         all_results[each_article.set_id] = eval_result | mh_result
     logging.info("Writing out results.")
     self.output(all_results)
     self.output_metadata()
     return
Пример #24
0
 def __init__(self,
              damping_factor=0.85,
              max_iterations=10000,
              epsilon=0.0001):
     """Create a ranker.

     damping_factor: stored as self._d; presumably a PageRank-style
     damping factor - confirm against the ranking code.
     max_iterations: iteration cap, stored as self._max_iter.
     epsilon: convergence threshold, stored as self._e.
     """
     logging.log(ULTRADEBUG, "Creating a ranker object.")
     self._max_iter = max_iterations
     self._e = epsilon
     self._d = damping_factor
     self._latest_stats = None
Пример #25
0
 def ignore_exception(self, which_exception, on_which_line):
     """Checks whether an exception generated by the parser is actionable
     or if it should be ignored. We ignore CUINotFoundError, because it is
     impossible to do anything in MEDRank without a CUI.

     Returns True when the exception should be ignored, False otherwise.
     """
     # NOTE(review): the exact type() check deliberately excludes
     # subclasses of CUINotFoundError; confirm before widening to
     # isinstance().
     if type(which_exception) is CUINotFoundError:
         # Pass the line as a lazy logging argument instead of eagerly
         # %-formatting it: the message is only built when ULTRADEBUG
         # is enabled, and this matches the argument style of the other
         # ignore_exception implementations in this codebase.
         logging.log(ULTRADEBUG,
                     "Skipping line '%s' because no CUI could be found "
                     "on it", on_which_line)
         return True
     return False
Пример #26
0
 def ignore_exception(self, which_exception, on_which_line):
     """Decide whether a parsing exception corresponds to a known,
     ignorable problem with SEMREP's output."""
     # Only lines of undeterminable type are skippable.
     if type(which_exception) is not WrongTypeOfLineError:
         return False
     logging.log(ULTRADEBUG, "Skipping line '%s' because its type could not be "
                 "determined.", on_which_line)
     return True
Пример #27
0
 def is_ignorable(self, which_line):
     """Report whether which_line contains any of the known useless
     strings and can therefore be skipped."""
     for skippable in self.__lines_to_ignore:
         # Substring test; the first hit decides.
         if skippable in which_line:
             logging.log(ULTRADEBUG, "Line '%s' contains this "
                         "skippable string: '%s'", which_line, skippable)
             return True
     return False
Пример #28
0
 def evaluate(self, term_list_1, term_list_2):
     """Run every evaluator in this group on the two term lists and
     gather their outputs into a single ResultSet."""
     combined = ResultSet()
     for evaluator in self:
         logging.log(ULTRADEBUG,
                     "Applying %s as part of an EvaluationGroup",
                     evaluator.__class__.__name__)
         combined.add(evaluator.evaluate(term_list_1, term_list_2))
     return combined
Пример #29
0
 def _init_inverse_lookup(self):
     """Build the dictionary that backs reverse lookups (built lazily
     on the first reverse-lookup request)."""
     logging.debug("First request of a reverse lookup. Building the " \
                   "inverse lookup dictionary.")
     # Later positions overwrite earlier ones for duplicate keys, just
     # as sequential assignment did.
     self._invlookup = dict((item, k)
                            for k, items in self._tree.iteritems()
                            for item in items.position)
     logging.log(ULTRADEBUG, "Done building inverse lookup dictionary.")
     return
Пример #30
0
 def is_ignorable(self, which_line):
     """True when the line holds one of the strings marked as useless,
     meaning the caller can safely discard it."""
     for candidate in self.__lines_to_ignore:
         if candidate not in which_line:
             continue
         logging.log(
             ULTRADEBUG, "Line '%s' contains this "
             "skippable string: '%s'", which_line, candidate)
         return True
     return False
Пример #31
0
 def _init_inverse_lookup(self):
     """Lazily construct the reverse-lookup dictionary from the tree."""
     logging.debug("First request of a reverse lookup. Building the " \
                   "inverse lookup dictionary.")
     inverse = {}
     for key, entry in self._tree.iteritems():
         for position in entry.position:
             inverse[position] = key
     self._invlookup = inverse
     logging.log(ULTRADEBUG, "Done building inverse lookup dictionary.")
     return
Пример #32
0
 def ignore_exception(self, which_exception, on_which_line):
     """Decide whether a parsing exception reflects a known quirk of
     SEMREP's output; if so the offending line is simply ignored."""
     if type(which_exception) is not WrongTypeOfLineError:
         return False
     logging.log(
         ULTRADEBUG, "Skipping line '%s' because its type could not be "
         "determined.", on_which_line)
     return True
Пример #33
0
 def __init__(self, original_line):
     """Parse an utterance line and record its numeric identifier.

     Raises WrongTypeOfLineError when the line is not an utterance."""
     MachineOutputLine.__init__(self, original_line)
     if self.line_type != 'utterance':
         raise WrongTypeOfLineError("%r is not an utterance." % self.line)
     fields = UtteranceLine.parser.match(self.line).groupdict()
     self._my_line_id = int(UtteranceLine.numbers.findall(fields['id'])[0])
     logging.log(ULTRADEBUG, "Created an UtteranceLine with set id %d",
                 self._my_line_id)
     # Keep the lexer-wide line counter in sync with this utterance.
     MachineOutputLine._line_id = self._my_line_id
Пример #34
0
 def __del__(self):
     """Finalizer: flush and close the backing store under the lock.

     Persistent stores are committed before closing; non-persistent
     ones have their temporary file deleted after closing.
     NOTE(review): at interpreter shutdown __del__ may run after module
     globals (os, logging) are torn down -- confirm this is acceptable."""
     self._lock.acquire()
     try:
         if self.my_persistence:
             # Keep the data: commit pending writes before closing.
             self.__t.commit()
         self.__t.close()
         if not self.my_persistence:
             # Throwaway store: remove its backing temporary file.
             logging.log(ULTRADEBUG, "Deleting temporary file %r", self.my_filename)
             os.unlink(self.my_filename)
     finally:
         self._lock.release()
Пример #35
0
 def __init__(self, original_line):
     """Initialize from a raw lexer line, rejecting anything that is
     not an utterance, and remember the utterance's numeric id."""
     MachineOutputLine.__init__(self, original_line)
     if self.line_type != 'utterance':
         raise WrongTypeOfLineError("%r is not an utterance." % self.line)
     groups = UtteranceLine.parser.match(self.line).groupdict()
     self._my_line_id = int(UtteranceLine.numbers.findall(groups['id'])[0])
     logging.log(ULTRADEBUG, "Created an UtteranceLine with set id %d",
                 self._my_line_id)
     # Propagate the id so subsequent lines from this lexer share it.
     MachineOutputLine._line_id = self._my_line_id
Пример #36
0
 def process_article(self, each_article):
     """Graph, rank, convert, and evaluate one article against its
     MEDLINE gold standard, storing the unified ResultSet in
     self.all_results under the article's set_id.

     Articles that are excluded, unrankable, or lack a gold standard
     are skipped (with a log message) and leave no result behind."""
     if not self.include_article(each_article):
         logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                       " criteria.", each_article)
         return
     try:
         ranked_article=self.graph_and_rank(each_article)
     except CouldNotRank:
         return
     logging.debug("Ranked article: %r", ranked_article)
     converted_terms=self.convert(ranked_article)
     logging.debug("Converted terms: %r", converted_terms)
     cut_terms=converted_terms.terms_higher_than_or_equal_to(
                         self._ranking_cutoff)
     logging.debug("Cut terms: %r", cut_terms)
     try:
         medline_record_mesh_terms=ExpressionList().from_medline(
                 each_article.set_id.article_record()['MH'])
     except Exception:
         # Narrowed from a bare "except:" so KeyboardInterrupt and
         # SystemExit are no longer swallowed here; any real failure to
         # fetch the record still just skips the article.
         logging.warn("Could not obtain an article record for %r. "
                      "Skipping.", each_article)
         return
     flat_medline=medline_record_mesh_terms.flatten()
     flattened_terms=self.flatten_generated_terms(flat_medline,
                     cut_terms)
     flattened_terms=self.limit_length(flat_medline, flattened_terms)
     if len(flat_medline)==0:
         logging.warn("No gold standard available for article %r. "
                      "Omitting it from the result set.", each_article)
         return
     eval_result=self.perform_evaluation(each_article,
                                         self.evaluator,
                                         flat_medline,
                                         flattened_terms)
     flattened_major_headings=\
         medline_record_mesh_terms.major_headings()
     logging.debug("Flattened MeSH terms: %r", flat_medline)
     logging.debug("Flattened generated terms: %r", flattened_terms)
     # Evaluate again against major headings only; those results get a
     # "major_" prefix so both sets can coexist.
     mh_result_temp=self.perform_evaluation(each_article, self.evaluator,
                                            flattened_major_headings,
                                            flattened_terms)
     mh_result=NamedResultSet("major_", mh_result_temp)
     # Compute the total recall, too
     total_recall=self.compute_total_recall(flat_medline, 
                                            converted_terms)
     eval_result.add(total_recall)
     # Unify the result sets
     self.all_results[each_article.set_id]=eval_result | mh_result
     return
Пример #37
0
 def check_checktag_rules(self, CUI):
     """Emit the checktags associated with every MTI list (other than
     the exclusion pseudo-list) of which this CUI is a member."""
     for (listname, checktags) in self.data.checktag_rules.iteritems():
         # The exclusion pseudo-list never produces checktags.
         if listname=='_exclusions':
             continue
         if CUI not in self.data.lists[listname]:
             continue
         logging.log(ULTRADEBUG, 'CUI %r matches list %r. Checktags %r added.',
                     CUI, listname, checktags)
         self._extra_checktags|=set(Term(x) for x in checktags)
Пример #38
0
 def process_item(self, one_item):
     """Graph and rank one item, keeping only the (node, score) pairs
     whose score reaches the ranking cutoff."""
     if not self.include_item(one_item):
         logging.log(ULTRADEBUG, "Skipping item %r due to exclusion "
                     " criteria.", one_item)
         return
     try:
         ranked = self.graph_and_rank(one_item)
     except CouldNotRank:
         return
     surviving = [pair for pair in ranked if pair[1] >= self._ranking_cutoff]
     # Unify the result sets
     self.all_results[one_item.set_id] = surviving
     return
Пример #39
0
    def _create_graph(self, list_of_lines):
        """Build a METAMAP proximity co-occurrence graph.

        Every pair of accepted concept nodes within a sentence is
        linked; the link's strength is a function of the positional
        distance (j - i) between the two concepts. When a direction
        inferrer is configured, each pair's link direction is inferred
        individually."""
        new_graph = self._type_of_graph_to_build()
        logging.log(
            ULTRADEBUG, "Building a METAMAP proximity co-occurrence graph "
            "from %r", list_of_lines)
        # Iterate through each sentence, emitting links for each pair of
        # adjacent concepts (concept evaluators permitting)
        for sentence in self.sentence_iterator(list_of_lines):
            nodes = []
            for concept in sentence:
                if not isinstance(concept, MetamapLine):
                    logging.log(
                        ULTRADEBUG, "Skipping line %r, as it isn't a "
                        "MetamapLine", sentence)
                    continue
                new_node = self._node_factory(concept.CUI, concept.description,
                                              concept.confidence, concept.line)
                if self.include_node(new_node):
                    nodes.append(new_node)
                    logging.log(ULTRADEBUG, "%r included in the graph",
                                new_node)
                else:
                    logging.log(ULTRADEBUG, "%r excluded from the graph",
                                new_node)
            # All pairs (i, j) with i < j, not just adjacent ones.
            for i in xrange(len(nodes) - 1):
                for j in xrange(i + 1, len(nodes)):
                    # Adjacent nodes are related more in this model.
                    # The weight of the relationship is given by the distance
                    node1, node2 = nodes[i], nodes[j]
                    if self._direction_inferrer is None:
                        new_link = self._adirectional_link_factory(
                            node1, node2, self._link_strength(j - i))
                    else:
                        new_dir=\
                          self._direction_inferrer.infer_relation_direction(
                           node1.node_id, node2.node_id)
                        # Direction 0 means no direction could be
                        # inferred: fall back to an undirected link.
                        if new_dir == 0:
                            new_link = self._adirectional_link_factory(
                                node1, node2, self._link_strength(j - i))
                        else:
                            new_link = self._link_factory(
                                node1, node2,
                                new_dir * self._link_strength(j - i))

                    if self.include_link(new_link):
                        new_graph.add_relationship(new_link)
                    else:
                        logging.log(ULTRADEBUG,
                                    "Excluding link %r from the graph",
                                    new_link)
        return new_graph
Пример #40
0
 def compute_measures(self):
     """Computes graph metrics for the current object.

     Returns a ResultSet with link/node counts, average node and link
     weights, link degree, average relative in/out centralities,
     stratum, and compactness."""
     self._consolidate_if_necessary()
     logging.log(ULTRADEBUG, "Computing graph metrics for %r", self)
     graph_measures=ResultSet()
     graph_measures.add(GraphNumberLinks(len(self._relationships)))
     # Collect the distinct nodes touched by any relationship.
     unique_nodes=set()
     for a_relation in self._relationships:
         unique_nodes.add(a_relation.node1)
         unique_nodes.add(a_relation.node2)
     graph_measures.add(GraphNumberNodes(len(unique_nodes)))
     # sum() replaces reduce(operator.add, ...): the float accumulation
     # order (left-to-right from 0) is identical, and it is the idiom.
     graph_measures.add(GraphAverageNodeWeight(
         sum(x.weight for x in unique_nodes)/float(len(unique_nodes))))
     graph_measures.add(GraphAverageLinkWeight(
         sum(x.weight for x in self._relationships)/
         float(len(self._relationships))))
     graph_measures.add(GraphLinkDegree(float(len(self._relationships))/
                                        float(len(unique_nodes))))
     logging.log(ULTRADEBUG, "Starting computation of the distance matrix.")
     distmat=DistanceMatrix(self.as_mapped_link_matrix())
     logging.log(ULTRADEBUG, "Distance matrix obtained. Computing stats.")
     matrix_size=len(distmat)  # hoisted: len() was evaluated repeatedly
     rocs=[distmat.relative_out_centrality(x) for x in
           xrange(matrix_size)]
     rics=[distmat.relative_in_centrality(x) for x in
           xrange(matrix_size)]
     avrocs=sum(rocs)/float(matrix_size)
     avrics=sum(rics)/float(matrix_size)
     graph_measures.add(GraphRelativeOutCentrality(avrocs))
     graph_measures.add(GraphRelativeInCentrality(avrics))
     graph_measures.add(GraphStratum(distmat.stratum()))
     graph_measures.add(GraphCompactness(distmat.compactness()))
     logging.log(ULTRADEBUG, "Finished computing graph metrics.")
     return graph_measures
 def _create_graph(self, list_of_lines):
     """Build a co-occurrence graph linking adjacent accepted
     EntityMeSHLine concepts within each SEMREP sentence; link weight
     is the mean of the two node weights."""
     new_graph = self._type_of_graph_to_build()
     logging.log(ULTRADEBUG, "Building a SEMREP co-occurrence graph from %r", 
                 list_of_lines)
     for sentence in self.sentence_iterator(list_of_lines):
         accepted = []
         for concept in sentence:
             if not isinstance(concept, EntityMeSHLine):
                 logging.log(ULTRADEBUG, "Skipping line %r, as it isn't an "
                             "EntityMeSHLine", concept)
                 continue
             candidate = self._node_factory(concept.CUI, concept.description, 
                                            concept.confidence, concept.mesh)
             if self.include_node(candidate):
                 accepted.append(candidate)
             else:
                 logging.log(ULTRADEBUG, "%r excluded from the graph", candidate)
         # Pair each accepted node with its immediate successor.
         for node1, node2 in zip(accepted, accepted[1:]):
             candidate_link = self._adirectional_link_factory(
                 node1, node2, (node1.weight+node2.weight)/2.0)
             if self.include_link(candidate_link):
                 new_graph.add_relationship(candidate_link)
             else:
                 logging.log(ULTRADEBUG, "Excluding link %r from the graph", 
                             candidate_link)
     return new_graph
Пример #42
0
 def _create_graph(self, list_of_lines):
     """Build a co-occurrence graph where consecutive accepted
     EntityLine concepts in a SEMREP sentence are linked, weighted by
     the mean of the two node weights."""
     new_graph = self._type_of_graph_to_build()
     logging.log(ULTRADEBUG, "Building a SEMREP co-occurrence graph from %r", 
                 list_of_lines)
     for sentence in self.sentence_iterator(list_of_lines):
         kept_nodes = []
         for concept in sentence:
             if not isinstance(concept, EntityLine):
                 logging.log(ULTRADEBUG, "Skipping line %r, as it isn't an "
                             "EntityLine", concept)
                 continue
             fresh_node = self._node_factory(concept.CUI, concept.description, 
                                             concept.confidence)
             if self.include_node(fresh_node):
                 kept_nodes.append(fresh_node)
             else:
                 logging.log(ULTRADEBUG, "%r excluded from the graph", fresh_node)
         # Walk the kept nodes, linking each one to its predecessor.
         previous = None
         for current in kept_nodes:
             if previous is not None:
                 fresh_link = self._adirectional_link_factory(
                     previous, current, (previous.weight+current.weight)/2.0)
                 if self.include_link(fresh_link):
                     new_graph.add_relationship(fresh_link)
                 else:
                     logging.log(ULTRADEBUG, "Excluding link %r from the graph", 
                                 fresh_link)
             previous = current
     return new_graph
Пример #43
0
 def check_checktag_rules(self, CUI):
     """Queue the checktags for every MTI list that contains this CUI,
     skipping the special '_exclusions' entry."""
     for (listname, checktags) in self.data.checktag_rules.iteritems():
         # Short-circuit keeps the exclusion list from being probed.
         relevant = (listname != '_exclusions'
                     and CUI in self.data.lists[listname])
         if not relevant:
             continue
         logging.log(ULTRADEBUG,
                     'CUI %r matches list %r. Checktags %r added.', CUI,
                     listname, checktags)
         self._extra_checktags |= set([Term(x) for x in checktags])
Пример #44
0
 def process_item(self, one_item):
     """Graph and rank a single item, recording only the results whose
     score reaches the ranking cutoff."""
     if not self.include_item(one_item):
         logging.log(ULTRADEBUG, "Skipping item %r due to exclusion "
                       " criteria.", one_item)
         return
     try:
         ranked=self.graph_and_rank(one_item)
     except CouldNotRank:
         return
     kept=[]
     for scored_pair in ranked:
         if scored_pair[1] >= self._ranking_cutoff:
             kept.append(scored_pair)
     # Unify the result sets
     self.all_results[one_item.set_id]=kept
     return
     
Пример #45
0
 def process_article(self, each_article):
     """Graph, rank, convert, and evaluate one article against its
     MEDLINE gold standard, storing the unified ResultSet in
     self.all_results keyed by the article's set_id.

     Excluded, unrankable, or gold-standard-less articles are skipped
     with a log message and contribute no results."""
     if not self.include_article(each_article):
         logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                     " criteria.", each_article)
         return
     try:
         ranked_article = self.graph_and_rank(each_article)
     except CouldNotRank:
         return
     logging.debug("Ranked article: %r", ranked_article)
     converted_terms = self.convert(ranked_article)
     logging.debug("Converted terms: %r", converted_terms)
     cut_terms = converted_terms.terms_higher_than_or_equal_to(
         self._ranking_cutoff)
     logging.debug("Cut terms: %r", cut_terms)
     try:
         medline_record_mesh_terms = ExpressionList().from_medline(
             each_article.set_id.article_record()['MH'])
     except Exception:
         # Narrowed from a bare "except:" so KeyboardInterrupt and
         # SystemExit propagate; genuine retrieval failures still just
         # skip the article.
         logging.warn(
             "Could not obtain an article record for %r. "
             "Skipping.", each_article)
         return
     flat_medline = medline_record_mesh_terms.flatten()
     flattened_terms = self.flatten_generated_terms(flat_medline, cut_terms)
     flattened_terms = self.limit_length(flat_medline, flattened_terms)
     if len(flat_medline) == 0:
         logging.warn(
             "No gold standard available for article %r. "
             "Omitting it from the result set.", each_article)
         return
     eval_result = self.perform_evaluation(each_article, self.evaluator,
                                           flat_medline, flattened_terms)
     flattened_major_headings=\
         medline_record_mesh_terms.major_headings()
     logging.debug("Flattened MeSH terms: %r", flat_medline)
     logging.debug("Flattened generated terms: %r", flattened_terms)
     # Evaluate against major headings only; prefix those results with
     # "major_" so both evaluations can coexist in one set.
     mh_result_temp = self.perform_evaluation(each_article, self.evaluator,
                                              flattened_major_headings,
                                              flattened_terms)
     mh_result = NamedResultSet("major_", mh_result_temp)
     # Compute the total recall, too
     total_recall = self.compute_total_recall(flat_medline, converted_terms)
     eval_result.add(total_recall)
     # Unify the result sets
     self.all_results[each_article.set_id] = eval_result | mh_result
     return
Пример #46
0
 def _create_table_if_necessary(self):
     """Ensure the key/value table 's' exists in the backing database.

     Probes with a SELECT under the lock; an OperationalError is taken
     to mean the table is missing (a new database) and triggers its
     creation."""
     self._lock.acquire()
     try:
         try:
             # Cheap probe: raises OperationalError when 's' is absent.
             dummy=self.__t.execute('select * from s limit 1')
         except sqlite3.OperationalError:
             # Table doesn't exist
             logging.log(ULTRADEBUG, "Table doesn't exist - must be a new database.")
             self.__t.execute("""create table s 
                                 (pkey TEXT PRIMARY KEY NOT NULL,
                                  data BLOB NOT NULL)""")
             logging.debug("Table created.")
     finally:
         self._lock.release()
     return
Пример #47
0
 def convert(self, a_ranked_result_set):
     """Convert a ranked result set into a RankedConversionResult.
     In other words, convert a ranked term list to its MeSH equivalents."""
     result = RankedConversionResult()
     self._my_converter.start_conversion()
     for incoming_term, incoming_score in a_ranked_result_set:
         converted = self._my_converter.convert(
             Concept(incoming_term.node_id))
         if converted.utterance != []:
             result.add_term_score(converted, incoming_score)
         # NOTE(review): end_conversion() runs on EVERY loop iteration
         # here; the indentation suggests it may have been intended to
         # run once, after the loop -- confirm before changing.
         converted = self._my_converter.end_conversion()
         if converted.utterance != []:
             # Terms emitted at end-of-conversion get a checktag boost.
             result.add_term_score(converted,
                                   incoming_score + self._checktag_boost)
     logging.log(ULTRADEBUG, "RankedConverter results: %r", result)
     return result
Пример #48
0
 def convert(self, a_ranked_result_set):
     """Convert a ranked result set into a RankedConversionResult.
     In other words, convert a ranked term list to its MeSH equivalents."""
     result=RankedConversionResult()
     self._my_converter.start_conversion()
     for incoming_term, incoming_score in a_ranked_result_set:
         converted=self._my_converter.convert(
                     Concept(incoming_term.node_id))
         if converted.utterance!=[]:
             result.add_term_score(converted, incoming_score)
         # NOTE(review): end_conversion() is invoked inside the loop,
         # once per term; it looks like it was meant to follow the loop
         # (same pattern appears in the duplicate above) -- confirm.
         converted=self._my_converter.end_conversion()
         if converted.utterance!=[]:
             # End-of-conversion terms receive the checktag boost.
             result.add_term_score(converted, incoming_score+
                                              self._checktag_boost)
     logging.log(ULTRADEBUG, "RankedConverter results: %r", result)
     return result
 def _create_graph(self, list_of_lines):
     """Build a METAMAP proximity co-occurrence graph.

     Links every pair of accepted concept nodes within a sentence; the
     link's strength is a function of the positional distance (j - i)
     between the two concepts. With a direction inferrer configured,
     each pair's direction is inferred individually."""
     new_graph=self._type_of_graph_to_build()
     logging.log(ULTRADEBUG,
                 "Building a METAMAP proximity co-occurrence graph "
                 "from %r", list_of_lines)
     # Iterate through each sentence, emitting links for each pair of
     # adjacent concepts (concept evaluators permitting)
     for sentence in self.sentence_iterator(list_of_lines):
         nodes=[]
         for concept in sentence:
             if not isinstance(concept, MetamapLine):
                 logging.log(ULTRADEBUG, "Skipping line %r, as it isn't a "
                               "MetamapLine", sentence)
                 continue
             new_node=self._node_factory(concept.CUI, concept.description, 
                                         concept.confidence, concept.line)
             if self.include_node(new_node):
                 nodes.append(new_node)
                 logging.log(ULTRADEBUG, "%r included in the graph", new_node)
             else:
                 logging.log(ULTRADEBUG, "%r excluded from the graph", new_node)
         # All pairs (i, j) with i < j, not just adjacent ones.
         for i in xrange(len(nodes)-1):
             for j in xrange(i+1, len(nodes)):
                 # Adjacent nodes are related more in this model. 
                 # The weight of the relationship is given by the distance
                 node1, node2=nodes[i], nodes[j]
                 if self._direction_inferrer is None:
                     new_link=self._adirectional_link_factory(node1, node2, 
                                                 self._link_strength(j-i))
                 else:
                     new_dir=\
                       self._direction_inferrer.infer_relation_direction(
                        node1.node_id, node2.node_id)
                     # Direction 0 means no direction could be inferred:
                     # fall back to an undirected link.
                     if new_dir==0:
                         new_link=self._adirectional_link_factory(node1,
                                                                  node2, 
                                             self._link_strength(j-i))
                     else:
                         new_link=self._link_factory(node1, node2,
                                     new_dir*self._link_strength(j-i))
                         
                 if self.include_link(new_link):
                     new_graph.add_relationship(new_link)
                 else:
                     logging.log(ULTRADEBUG, "Excluding link %r from the graph", 
                                   new_link)
     return new_graph
Пример #50
0
 def mult_by_vector(self, vector):
     """Multiply the matrix by a vocabulary_vector and return the
     product as a new vocabulary_vector.

     Raises ValueError when the vector's length differs from the
     matrix width."""
     incoming_size = len(vector)
     logging.log(ULTRADEBUG, "Multiplying by %s", vector)
     if incoming_size != self._width:
         raise ValueError("The vector and matrix shapes do not match.")
     product = VocabularyVector(self._height)
     # Only columns where the vector is nonzero can contribute.
     active_columns = vector.nonzero()
     for row in xrange(self._height):
         product[row] = sum((self[row, col] * vector[col]
                             for col in active_columns), 0.0)
     return product
Пример #51
0
 def mult_by_vector(self, vector):
     """Return the product of this matrix with a vocabulary_vector as a
     fresh vocabulary_vector; mismatched sizes raise ValueError."""
     source_length=len(vector)
     logging.log(ULTRADEBUG, "Multiplying by %s", vector)
     # Size checks
     if source_length != self._width:
         raise ValueError("The vector and matrix shapes do not match.")
     output=VocabularyVector(self._height)
     # Restrict the dot products to the vector's nonzero entries.
     active=vector.nonzero()
     for row in xrange(self._height):
         accumulator=0.0
         for col in active:
             accumulator=accumulator+self[row, col]*vector[col]
         output[row]=accumulator
     return output
Пример #52
0
def processor(workflow_class,
              graph_builder_constructor, graph_builder_params,
              ranker_constructor, ranker_params,
              eval_parameters, 
              ranking_cutoff,
              mesh_tree_filename, distance_matrix_filename,
              distance_function,
              umls_converter_data_filename,
              extra_data_name,
              extra_data_contents,
              my_input_queue, my_output_queue,
              my_own_name=None):
    """Worker-process entry point: build a workflow from the supplied
    constructors/parameters, then service article requests from
    my_input_queue until a 'STOP' sentinel arrives.

    Per-article results are pushed onto my_output_queue. Unrankable
    articles are skipped; any other exception is logged with a
    traceback and re-raised, ending the worker."""
    logging.info("Setting up worker.")
    if my_own_name is not None:
        # Rename the OS process so it is identifiable in ps/top output.
        proctitle.setproctitle(my_own_name)

    my_workflow=workflow_class(graph_builder_constructor,
                               graph_builder_params,
                               ranker_constructor,
                               ranker_params,
                               eval_parameters,
                               ranking_cutoff,
                               mesh_tree_filename,
                               distance_matrix_filename,
                               distance_function,
                               umls_converter_data_filename
                               )
    if extra_data_name is not None:
        # Attach optional extra payload under a caller-chosen attribute.
        my_workflow.__setattr__(extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request=my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request=='STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                my_workflow.process_article(request)
                # Recover the article, push it on the output queue
                my_output_queue.put(my_workflow.all_results)
                # Clear the output queue
                my_workflow.all_results={}
            except CouldNotRank:
                #my_input_queue.put(request) # On error, push the task
                                            # back into the queue
                logging.info("Skipping unrankable article.")
            except:
                # Log the traceback, then let the exception terminate
                # the worker.
                logging.warn("EXCEPTION RAISED: \n%s", 
                             traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
        logging.log(ULTRADEBUG, "Ending processor execution.")
    return
Пример #53
0
 def graph_and_rank(self, item):
     """Turn the item into a graph, then a link matrix, and rank it,
     returning the ranked node list.

     Raises CouldNotRank when the matrix is empty or the ranker fails
     with a ValueError."""
     item_graph = self.graph_item(item)
     logging.log(ULTRADEBUG, "The item graph is %r.", item_graph)
     item_matrix = item_graph.as_mapped_link_matrix()
     if not len(item_matrix):
         logging.info("Skipping item %r. It has an empty matrix.", item)
         raise CouldNotRank("Item %r is not rankable." % item)
     try:
         return self._ranker.evaluate(item_matrix)
     except ValueError:
         logging.info(
             "%r returned an exception while ranking %r. "
             "Skipping.", self._ranker, item)
         raise CouldNotRank("There was an exception while ranking %r." %
                            item)
Пример #54
0
def processor(workflow_class,
              graph_builder_constructor,
              graph_builder_params,
              ranker_constructor,
              ranker_params,
              eval_parameters,
              ranking_cutoff,
              mesh_tree_filename,
              distance_matrix_filename,
              distance_function,
              umls_converter_data_filename,
              extra_data_name,
              extra_data_contents,
              my_input_queue,
              my_output_queue,
              my_own_name=None):
    """Worker-process entry point: construct a workflow and process
    article requests from my_input_queue until a 'STOP' sentinel.

    Results for each article are pushed to my_output_queue. Unrankable
    articles are skipped; other exceptions are logged with a traceback
    and re-raised, terminating the worker."""
    logging.info("Setting up worker.")
    if my_own_name is not None:
        # Rename the OS process for easier identification in ps/top.
        proctitle.setproctitle(my_own_name)

    my_workflow = workflow_class(graph_builder_constructor,
                                 graph_builder_params, ranker_constructor,
                                 ranker_params, eval_parameters,
                                 ranking_cutoff, mesh_tree_filename,
                                 distance_matrix_filename, distance_function,
                                 umls_converter_data_filename)
    if extra_data_name is not None:
        # Attach optional extra payload under a caller-chosen attribute.
        my_workflow.__setattr__(extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request = my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request == 'STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                my_workflow.process_article(request)
                # Recover the article, push it on the output queue
                my_output_queue.put(my_workflow.all_results)
                # Clear the output queue
                my_workflow.all_results = {}
            except CouldNotRank:
                #my_input_queue.put(request) # On error, push the task
                # back into the queue
                logging.info("Skipping unrankable article.")
            except:
                # Log the traceback, then let the exception end the worker.
                logging.warn("EXCEPTION RAISED: \n%s", traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
        logging.log(ULTRADEBUG, "Ending processor execution.")
    return
Пример #55
0
 def _post_process_graph(self, built_graph):
     """Consolidate the finished graph; when orphan-node support is on,
     re-add cached nodes that ended up with no relationships as
     self-links, then consolidate again."""
     built_graph.consolidate_graph()
     # Never let TF-IDF scores leak into the next graph by accident.
     self._tf_idf_scores = None
     if self._add_orphan_nodes:
         rels = built_graph.relationships
         linked = set(x.node1 for x in rels)
         linked.update(x.node2 for x in rels)
         orphan_count = 0
         for node in self._node_cache:
             if node in linked:
                 continue
             orphan_count += 1
             built_graph.add_relationship(
                 AdirectionalLink(node, node, node.weight))
         built_graph.consolidate_graph()
         logging.log(ULTRADEBUG, "Added %d orphan nodes", orphan_count)
     self._node_cache = set([])
     return built_graph