import pytest


def test_watchful_stemmer():
    """Test the watchful stemmer"""
    stemmer = WatchfulStemmer()
    words = (['verb'] * 3 + ['verbs'] * 5 + ['verbed'] * 2 + ['verbing']
             + ['verbings'] * 8 + ['verbification'] * 5
             + ['verbifications'] * 3 + ['verbize'] * 7
             + ['verbization'] * 2 + ['verbizations'] * 4)
    for word in words:
        stemmer.stem(word)
    assert stemmer.most_frequent('verb') == 'verbings'
    assert stemmer.most_frequent('verbif') == 'verbification'
    assert stemmer.most_frequent('verbiz') == 'verbize'
    assert stemmer.most_frequent('verbize') == 'verbizations'
    assert stemmer.counts['verb']['verbs'] == 5
    assert stemmer.counts['verb']['verbing'] == 1
    assert stemmer.counts['verbize']['verbization'] == 2
    assert stemmer.counts['verbif']['verbifications'] == 3
    # most_frequent raises a ValueError if the stem has not been observed
    with pytest.raises(ValueError):
        stemmer.most_frequent('ver')
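
# The test above assumes a WatchfulStemmer with a ``stem`` method, a
# ``counts`` mapping from stems to per-word counts, and a ``most_frequent``
# lookup that raises a ValueError for unseen stems. Below is a minimal
# sketch of such a class, assuming it wraps EnglishStemmer from
# nltk.stem.snowball as the AdeftMiner docstrings state; the real
# implementation may differ in its details.
from collections import defaultdict, Counter

from nltk.stem.snowball import EnglishStemmer


class WatchfulStemmer(object):
    """Stem words while counting which surface forms map to each stem."""
    def __init__(self):
        self.stemmer = EnglishStemmer()
        self.counts = defaultdict(Counter)

    def stem(self, word):
        # stem the word and remember which surface form produced the stem
        stemmed = self.stemmer.stem(word.lower())
        self.counts[stemmed][word] += 1
        return stemmed

    def most_frequent(self, stem):
        # return the surface form most often mapped to the given stem
        if stem not in self.counts:
            raise ValueError('stem %s has not been observed' % stem)
        return self.counts[stem].most_common(1)[0][0]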

class AdeftMiner(object):
    """Finds possible longforms corresponding to an abbreviation in a text
    corpus.

    Makes use of the `Acromine <http://www.chokkan.org/research/acromine/>`_
    algorithm developed by Okazaki and Ananiadou, combined with an alignment
    based longform scoring method we've developed. Acromine based likelihood
    scores are scaled to the range [0, 1] using the formula
    (likelihood_score - 1)/(M + smoothing_param - 1), where smoothing_param
    is a positive number and M is the maximum likelihood score between a
    candidate node and all of its ancestors and descendants (technically
    this score can be less than 0 for some poor candidate longforms).

    Scaled likelihood scores are combined with alignment based scores
    through a weighted average in which the weight associated to the
    alignment based score decays exponentially with the value of M defined
    above. This gives more weight to the alignment based score for rarer
    longform expansions, which the Acromine algorithm has difficulty
    handling.

    [Okazaki06] Naoaki Okazaki and Sophia Ananiadou. "Building an
    abbreviation dictionary using a term recognition approach".
    Bioinformatics. 2006. Oxford University Press.

    Parameters
    ----------
    shortform : str
        Shortform to disambiguate
    window : Optional[int]
        Specifies range of characters before a defining pattern (DP)
        to consider when finding longforms. If set to 30, candidate
        longforms would be taken from the string
        "ters before a defining pattern". Default: 100

    Attributes
    ----------
    _internal_trie : :py:class:`adeft.discover._TrieNode`
        Stores the trie data structure used to implement the algorithm
    _stemmer : :py:class:`adeft.nlp.stem.WatchfulStemmer`
        English stemmer that keeps track of the number of times a given
        word has been mapped to a given stem. Wraps the class
        EnglishStemmer from nltk.stem.snowball
    _alignment_scores_computed : bool
        True if alignment scores have been computed for the current state
        of the candidate trie. It is reset to False any time the
        process_texts method is run
    _scores_propagated : bool
        True if best ancestor and best descendant likelihood scores have
        been propagated to each node for the current state of the candidate
        trie. It is reset to False any time the process_texts method is
        run.
    """
    def __init__(self, shortform, window=100):
        self.shortform = shortform
        self._internal_trie = _TrieNode(shortform=shortform)
        self._stemmer = WatchfulStemmer()
        self.window = window
        self._alignment_scores_computed = False
        self._scores_propagated = False

    def process_texts(self, texts):
        """Update longform candidate scores from a corpus of texts

        Gathers co-occurrence statistics from a corpus of texts to compute
        likelihood scores for candidate longforms associated to the
        shortform. This is an online method: it can be run multiple times
        to process multiple batches of text, which allows previously
        trained AdeftMiners to be updated when new content becomes
        available.

        Parameters
        ----------
        texts : list of str
            A list of texts
        """
        for text in texts:
            # longform candidates are taken from a window of text before
            # each defining pattern
            fragments = get_candidate_fragments(text, self.shortform,
                                                self.window)
            for fragment in fragments:
                if fragment:
                    candidate, _ = get_candidate(fragment)
                    self._add(candidate)
        self._alignment_scores_computed = False
        self._scores_propagated = False
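
    # Worked example of the scoring formulas above (illustrative numbers
    # only, not drawn from real data). Suppose a candidate node has
    # likelihood_score = 5, maximum likelihood score among its ancestors
    # and descendants M = 6, smoothing_param = 4, weight_decay_param =
    # 0.001, and alignment_score = 0.8. Then
    #     scaled_score = (5 - 1) / (6 + 4 - 1) = 4 / 9 ~= 0.444
    #     weight = e^(-0.001 * max(6, 0)) ~= 0.994
    #     combined = 0.994 * 0.8 + (1 - 0.994) * 0.444 ~= 0.798
    # For rare candidates (small M) the alignment score dominates; as M
    # grows the weight decays and the scaled likelihood score takes over.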

    def top(self, limit=None, smoothing_param=4, max_length='auto',
            use_alignment_based_scoring=True, weight_decay_param=0.001):
        """Return top scoring candidates.

        Parameters
        ----------
        limit : Optional[int]
            Limit for the number of candidates to return. Default: None
        smoothing_param : Optional[float]
            Smoothing parameter for the scaled likelihood score. Likelihood
            scores are scaled using the formula
            (likelihood_score - 1)/(M + smoothing_param - 1) where M is the
            maximum likelihood score between a candidate node and all of
            its ancestors and descendants (technically this score can be
            less than 0 for some poor candidate longforms). Larger values
            of smoothing_param lead to more penalization of candidates with
            small counts. Default: 4
        use_alignment_based_scoring : Optional[bool]
            If True, use combined Acromine/alignment scoring. Alignment
            scores will be computed with default parameters if they have
            not been computed previously using the
            compute_alignment_scores method. The combined score is a
            weighted average of the Acromine score and the alignment based
            score, with the weight for the alignment based score decaying
            exponentially with M, the maximum likelihood score between a
            candidate node and all of its ancestors and descendants.
        weight_decay_param : Optional[float]
            Adjusts the rate of decay for the alignment score weight with
            respect to the value of M (the maximum likelihood score between
            a candidate node and all of its ancestors and descendants).
            score = weight*alignment_score + (1-weight)*likelihood_score
            where weight = e^{-weight_decay_param*max(M, 0)}
        max_length : Optional[str|int|None]
            Maximum number of tokens in an accepted longform. If None,
            accepted longforms can be arbitrarily long. If 'auto',
            max_length is set to 2*len(self.shortform)+1

        Returns
        -------
        candidates : list of tuple
            List of tuples, each containing a candidate string, its
            associated score, and its count. Sorted first in descending
            order by likelihood score, then by count, then by length from
            shortest to longest candidate measured in number of tokens, and
            finally by lexicographic order.
        """
        if max_length == 'auto':
            max_length = 2 * len(self.shortform) + 1
        score_func = self._get_score_function(smoothing_param,
                                              use_alignment_based_scoring,
                                              weight_decay_param)
        root = self._internal_trie
        stack = [(root, 0)]
        result = []
        while stack:
            current, depth = stack.pop()
            if max_length is not None and depth + 1 > max_length:
                continue
            for child in current.children.values():
                score, count = score_func(child)
                result.append([
                    child.longform,
                    self._make_readable(child.longform),
                    count,
                    score
                ])
                stack.append((child, depth + 1))
        result.sort(key=lambda x: (-x[3], -x[2], len(x[0]), x[1]))
        return [(longform, score, count)
                for _, longform, count, score in result[:limit]]
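
    # Illustrative call (hypothetical output; real scores and counts
    # depend on the corpus):
    #     >>> miner = AdeftMiner('ER')
    #     >>> miner.process_texts(texts)
    #     >>> miner.top(limit=2)
    #     [('estrogen receptor', 0.97, 1200),
    #      ('endoplasmic reticulum', 0.95, 800)]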

    def get_longforms(self, cutoff=0.1, smoothing_param=4,
                      max_length='auto', use_alignment_based_scoring=True,
                      weight_decay_param=0.001):
        """Return a list of extracted longforms with their scores

        Traverses the candidate trie to search for nodes with score greater
        than or equal to the scores of all children and strictly greater
        than the scores of all ancestors.

        Parameters
        ----------
        cutoff : Optional[float]
            Return only longforms with a score greater than the cutoff.
            Default: 0.1
        smoothing_param : Optional[float]
            Smoothing parameter for the scaled likelihood score. Likelihood
            scores are scaled using the formula
            (likelihood_score - 1)/(M + smoothing_param - 1) where M is the
            maximum likelihood score between a candidate node and all of
            its ancestors and descendants (technically this score can be
            less than 0 for some poor candidate longforms). Larger values
            of smoothing_param lead to more penalization of candidates with
            small counts. Default: 4
        use_alignment_based_scoring : Optional[bool]
            If True, use combined Acromine/alignment scoring. Alignment
            scores will be computed with default parameters if they have
            not been computed previously using the
            compute_alignment_scores method. The combined score is a
            weighted average of the Acromine score and the alignment based
            score, with the weight for the alignment based score decaying
            exponentially with M, the maximum likelihood score between a
            candidate node and all of its ancestors and descendants.
        weight_decay_param : Optional[float]
            Adjusts the rate of decay for the alignment score weight with
            respect to the value of M (the maximum likelihood score between
            a candidate node and all of its ancestors and descendants).
            score = weight*alignment_score + (1-weight)*likelihood_score
            where weight = e^{-weight_decay_param*max(M, 0)}
        max_length : Optional[str|int|None]
            Maximum number of tokens in an accepted longform. If None,
            accepted longforms can be arbitrarily long. If 'auto',
            max_length is set to 2*len(self.shortform)+1

        Returns
        -------
        longforms : list of tuple
            List of triples of the form (longform, count, score), sorted in
            descending order by count and then score. Ties are resolved
            through lexicographic order.
        """
        if max_length == 'auto':
            max_length = 2 * len(self.shortform) + 1

        def _get_longform_helper(node, score_func, depth):
            if not node.children or (max_length is not None
                                     and depth == max_length):
                score, count = score_func(node)
                return [(node.longform, score, count)]
            result = []
            for child in node.children.values():
                child_longforms = _get_longform_helper(child, score_func,
                                                       depth + 1)
                result.extend([(longform, score, count)
                               for longform, score, count in child_longforms
                               if node.is_root()
                               or score > score_func(node)[0]])
            if not result:
                score, count = score_func(node)
                result = [(node.longform, score, count)]
            return result

        score_func = self._get_score_function(smoothing_param,
                                              use_alignment_based_scoring,
                                              weight_decay_param)
        root = self._internal_trie
        longforms = _get_longform_helper(root, score_func, 0)
        longforms = [(longform, score, count)
                     for longform, score, count in longforms
                     if score > cutoff]
        # Map stems back to the most frequent word that had been mapped to
        # them, converting longforms stored as reversed tuples of stemmed
        # tokens into readable strings. The tuple of stemmed tokens can be
        # recovered by tokenizing, stemming, and reversing.
        longforms = [(self._make_readable(longform), score, count)
                     for longform, score, count in longforms]
        # Sort in preferred order
        longforms = sorted(longforms, key=lambda x: (-x[2], -x[1], x[0]))
        return [(longform, count, score)
                for longform, score, count in longforms]
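
    # Sketch of the internal representation (hypothetical shortform "ER"):
    # longforms are stored as tuples of stemmed tokens in *reverse* order,
    # so "estrogen receptor" and "nuclear estrogen receptor" share a path:
    #     root
    #     +-- 'receptor'            ('receptor',)
    #     |   +-- 'estrogen'        ('receptor', 'estrogen')
    #     |       +-- 'nuclear'     ('receptor', 'estrogen', 'nuclear')
    #     +-- 'reticulum'
    #         +-- 'endoplasm'       (assuming the stemmer maps
    #                                'endoplasmic' to 'endoplasm')
    # _make_readable reverses the tuple and maps each stem back to its most
    # frequent surface form.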

    def _propagate_scores(self):
        """Add best descendant and best ancestor likelihood scores to nodes
        """
        root = self._internal_trie
        stack = [root]
        while stack:
            current = stack.pop()
            for _, child in current.children.items():
                if child.score > current.best_ancestor_score:
                    child.best_ancestor_score = child.score
                else:
                    child.best_ancestor_score = current.best_ancestor_score
                stack.append(child)
            if not current.children:
                current.best_descendent_score = current.score
                while (current.parent is not None
                       and (current.best_descendent_score
                            > current.parent.score
                            or not current.parent.best_descendent_score)):
                    parent = current.parent
                    if parent.score > current.best_descendent_score:
                        parent.best_descendent_score = parent.score
                    else:
                        parent.best_descendent_score = \
                            current.best_descendent_score
                    current = parent

    def compute_alignment_scores(self, **params):
        """Compute and add alignment scores to candidate nodes in trie

        Parameters
        ----------
        **params
            Parameters for :py:class:`AlignmentBasedScorer`
        """
        abs_ = AlignmentBasedScorer(self.shortform, **params)
        root = self._internal_trie
        stack = [root]
        # Perform a depth first search, calculating scores for each
        # candidate in the trie. The alignment score of the best ancestor
        # is used to decide how the current node is processed: no
        # computation is performed if the score cannot be improved, nor for
        # permutations with an inversion count that makes improving on the
        # best score impossible.
        while stack:
            current = stack.pop()
            for token, child in current.children.items():
                data = [
                    current.alignment_score,
                    current.encoded_tokens,
                    current.word_prizes,
                    current.best_ancestor_align_score,
                    current.best_char_scores,
                    current.sum_parent_word_scores,
                    current.stop_count
                ]
                new_data = abs_._next_score(token, *data)
                child.alignment_score = new_data[0]
                child.encoded_tokens = new_data[1]
                child.word_prizes = new_data[2]
                child.best_ancestor_align_score = new_data[3]
                child.best_char_scores = new_data[4]
                child.sum_parent_word_scores = new_data[5]
                child.stop_count = new_data[6]
                stack.append(child)
        # record that alignment scores are up to date for the current trie
        self._abs_fit = True
        self._alignment_scores_computed = True

    def prune(self, max_depth):
        """Prune away all nodes with depth greater than max_depth

        Parameters
        ----------
        max_depth : int
            Positive integer. Maximum depth of nodes to keep in the
            candidate trie. Corresponds to the maximum number of tokens in
            longforms.
        """
        root = self._internal_trie
        stack = [(root, 0)]
        while stack:
            current, depth = stack.pop()
            if depth + 1 > max_depth:
                # drop all deeper nodes by disconnecting the children
                current.children = {}
                continue
            for child in current.children.values():
                stack.append((child, depth + 1))
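
    # Illustrative maintenance calls (hypothetical): pruning bounds
    # candidate length before scoring, and alignment scores can be
    # precomputed once with custom parameters so that later calls to top()
    # and get_longforms() reuse them.
    #     miner.prune(2 * len(miner.shortform) + 1)
    #     miner.compute_alignment_scores()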
""" # start at top of trie current = self._internal_trie # apply snowball stemmer to each token and put them in reverse order tokens = tuple(self._stemmer.stem(token) for token in tokens)[::-1] for token in tokens: if token not in current.children: # candidate longform is observed for the first time # add a new entry for it in the trie longform = current.longform + (token, ) new = _TrieNode(longform, parent=current) # update likelihood of current node to account for the new # child unless current node is the root if not current.is_root(): current.update_likelihood(1) # set newly observed longform to be the child of current node current.children[token] = new # update current node to the newly formed node current = new else: # candidate longform has been observed before # update count for candidate longform and associated LH value current.children[token].increment_count() # Update entry for candidate longform in the candidates # dictionary if not current.is_root(): # we are not at the top of the trie. observed candidate # has a parent # update likelihood of candidate's parent count = current.children[token].count current.update_likelihood(count) current = current.children[token] def _get_score_function(self, smoothing_param, use_alignment_based_scoring, weight_decay_param): """Returns scoring function for determining longforms Also computes alignment scores and propagates acromine score information for ancestors and descendents in the tree of candidates if necessary. """ if not self._scores_propagated: self._propagate_scores() self._scores_propagated = True def scaled_score(node): numerator = node.score - 1 denominator = max(node.best_ancestor_score, node.best_descendent_score) denominator += smoothing_param - 1 score = 0 if denominator <= 0 else numerator / denominator return score if not use_alignment_based_scoring: def score_func(node): return scaled_score(node), node.count else: if not self._alignment_scores_computed: self.compute_alignment_scores() self._alignment_scores_computed = True def score_func(node): acro_score = scaled_score(node) phi = np.exp(-weight_decay_param * max(0, node.best_ancestor_score - 1, node.best_descendent_score - 1)) score = phi * node.alignment_score + (1 - phi) * acro_score return score, node.count return score_func def _make_readable(self, tokens): """Convert longform from internal representation to a human readable one """ return ' '.join( self._stemmer.most_frequent(token) for token in tokens[::-1]) def to_dict(self): """Returns dictionary serialization of AdeftMiner """ out = {} out['shortform'] = self.shortform out['internal_trie'] = self._internal_trie.to_dict() out['stemmer'] = self._stemmer.dump() out['window'] = self.window return out def dump(self, f): """Serialize AdeftMiner to json into file f""" json.dump(self.to_dict(), f) def update(self, adeft_miner): """Compose two adeft miners trained on separate texts""" self._stemmer.counts.update(adeft_miner._stemmer.counts) stack = [(self._internal_trie, deepcopy(adeft_miner._internal_trie))] while stack: left, right = stack.pop() for token, child in right.children.items(): if token not in left.children: left.children[token] = child if not left.is_root(): left.update_likelihood(child.count, child.count) else: current = left.children[token] current.increment_count(child.count) if not left.is_root(): count1, count2 = current.count, child.count left.update_likelihood(count1, count2) stack.append((current, child))

class AdeftMiner(object):
    """Finds possible longforms corresponding to an abbreviation in a text
    corpus.

    Makes use of the `Acromine <http://www.chokkan.org/research/acromine/>`_
    algorithm developed by Okazaki and Ananiadou.

    [Okazaki06] Naoaki Okazaki and Sophia Ananiadou. "Building an
    abbreviation dictionary using a term recognition approach".
    Bioinformatics. 2006. Oxford University Press.

    Parameters
    ----------
    shortform : str
        Shortform to disambiguate
    window : Optional[int]
        Specifies range of characters before a defining pattern (DP)
        to consider when finding longforms. If set to 30, candidate
        longforms would be taken from the string
        "ters before a defining pattern". Default: 100
    exclude : Optional[set of str]
        Terms that are to be excluded from candidate longforms.
        Default: None

    Attributes
    ----------
    _internal_trie : :py:class:`adeft.discover._TrieNode`
        Stores the trie data structure used to implement the algorithm
    _longforms : dict
        Dictionary mapping candidate longforms to their likelihoods as
        produced by the Acromine algorithm
    _stemmer : :py:class:`adeft.nlp.stem.WatchfulStemmer`
        English stemmer that keeps track of the number of times a given
        word has been mapped to a given stem. Wraps the class
        EnglishStemmer from nltk.stem.snowball
    """
    def __init__(self, shortform, window=100, exclude=None):
        self.shortform = shortform
        self._internal_trie = _TrieNode()
        self._longforms = {}
        self._stemmer = WatchfulStemmer()
        self.window = window
        if exclude is None:
            self.exclude = set()
        else:
            self.exclude = exclude

    def process_texts(self, texts):
        """Update longform candidate scores from a corpus of texts

        Gathers co-occurrence statistics from a corpus of texts to compute
        scores for candidate longforms associated to the shortform. This is
        an online method: additional texts can be processed after training
        has taken place.

        Parameters
        ----------
        texts : list of str
            A list of texts
        """
        for text in texts:
            # longform candidates are taken from a window of text before
            # each defining pattern
            fragments = get_candidate_fragments(text, self.shortform,
                                                self.window)
            for fragment in fragments:
                if fragment:
                    candidate = get_candidate(fragment, self.exclude)
                    self._add(candidate)

    def top(self, limit=None):
        """Return top scoring candidates.

        Parameters
        ----------
        limit : Optional[int]
            Limit for the number of candidates to return. Default: None

        Returns
        -------
        candidates : list of tuple
            List of tuples, each containing a candidate string and its
            likelihood score. Sorted first in descending order by
            likelihood score, then by length from shortest to longest, and
            finally by lexicographic order.
        """
        if not self._longforms:
            return []
        candidates = sorted(self._longforms.items(),
                            key=lambda x: (-x[1], len(x[0]), x[0]))
        if limit is not None and limit < len(candidates):
            candidates = candidates[0:limit]
        # Map stems back to the most frequent word that had been mapped to
        # them and convert longforms in tuple format into readable strings.
        candidates = [(' '.join(self._stemmer.most_frequent(token)
                                for token in longform), score)
                      for longform, score in candidates]
        return candidates

    def get_longforms(self, cutoff=1):
        """Return a list of extracted longforms with their scores

        Runs a breadth first search to find nodes with score greater than
        or equal to the scores of all children and strictly greater than
        the scores of all ancestors.

        Parameters
        ----------
        cutoff : Optional[int]
            Return only longforms with a score greater than the cutoff.
            Default: 1

        Returns
        -------
        longforms : list of tuple
            List of longforms along with their scores. Sorted first in
            descending order by score, then by the length of the longform
            from shortest to longest, and finally by lexicographic order.
        """
        # Forward pass
        longforms = set()
        # The root contains no longform. Initialize the queue with all of
        # its children
        queue = deque(self._internal_trie.children.values())
        while queue:
            node = queue.popleft()
            # if a node has a better score than its parent's best ancestor,
            # it becomes its own best ancestor
            if node.score > node.parent.best_ancestor_score:
                node.best_ancestor_score = node.score
                node.best_ancestor = node
            # otherwise set its best ancestor to its parent's best ancestor
            else:
                node.best_ancestor_score = node.parent.best_ancestor_score
                node.best_ancestor = node.parent.best_ancestor
            # a node's score cannot exceed the count of its associated
            # longform, so if the count for a child is less than or equal
            # to the best ancestor score, the child is not added to the
            # queue. track how many children are added
            worthy = 0
            for child in node.children.values():
                if child.count > node.best_ancestor_score:
                    queue.append(child)
                    worthy += 1
            # if no children are added, the node becomes a leaf. the
            # optimal longforms are given by the best ancestors of the
            # leaves.
            if worthy == 0:
                longforms.add((node.best_ancestor.longform,
                               node.best_ancestor.score))

        longforms = [(longform, score) for longform, score in longforms
                     if score > cutoff]
        # Map stems back to the most frequent word that had been mapped to
        # them, converting longforms stored as reversed tuples of stemmed
        # tokens into readable strings. The tuple of stemmed tokens can be
        # recovered by tokenizing, stemming, and reversing.
        longforms = [(self._make_readable(longform), score)
                     for longform, score in longforms]
        # Sort in preferred order
        longforms = sorted(longforms,
                           key=lambda x: (-x[1], len(x[0]), x[0]))
        # Reset best ancestor and best_ancestor_score values for all
        # children of the root. This is required for the algorithm to run
        # successfully in subsequent calls to this method
        return longforms
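
    # Worked example of the best-ancestor search (hypothetical counts):
    # suppose the trie holds the reversed path below, with Acromine scores
    # in parentheses.
    #     'receptor' (2) -> 'estrogen' (40) -> 'human' (3)
    # 'estrogen' beats its parent's best ancestor score, so it becomes its
    # own best ancestor; 'human' (count 3) cannot beat the best ancestor
    # score of 40, so it is not enqueued, the node for 'estrogen' becomes a
    # leaf, ('receptor', 'estrogen') is recorded with score 40, and it is
    # read back as "estrogen receptor".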
""" # start at top of trie current = self._internal_trie # apply snowball stemmer to each token and put them in reverse order tokens = tuple(self._stemmer.stem(token) for token in tokens)[::-1] for token in tokens: if token not in current.children: # candidate longform is observed for the first time # add a new entry for it in the trie longform = current.longform + (token, ) new = _TrieNode(longform, parent=current) # update likelihood of current node to account for the new # child unless current node is the root if not current.is_root(): current.update_likelihood(1) self._longforms[current.longform[::-1]] = current.score # Add newly observed longform to the dictionary of candidates self._longforms[new.longform[::-1]] = new.score # set newly observed longform to be the child of current node current.children[token] = new # update current node to the newly formed node current = new else: # candidate longform has been observed before # update count for candidate longform and associated LH value current.children[token].increment_count() # Update entry for candidate longform in the candidates # dictionary self._longforms[current.children[token].longform[::-1]] = \ current.children[token].score if not current.is_root(): # we are not at the top of the trie. observed candidate # has a parent # update likelihood of candidate's parent count = current.children[token].count current.update_likelihood(count) # Update candidates dictionary self._longforms[current.longform[::-1]] = current.score current = current.children[token] def _make_readable(self, tokens): """Convert longform from internal representation to a human readable one """ return ' '.join( self._stemmer.most_frequent(token) for token in tokens[::-1])