def GetScoreSimilarity(self, src_treep, trg_treep):
    """Score a source/target pair by the rank of the target URI.

    The non-variable source leaves are handed to ``self.GetURIs`` (with
    ``self.filterq`` and ``k=self.kgen``).  The URI looked up among the
    returned candidates is the last target leaf that is neither a variable
    nor ``'[]'``, with leading ``'!'`` characters stripped.

    Returns:
        ``1.0 / (position + 1)``, where ``position`` is the 0-based index of
        the URI in the candidate list, or ``None`` when the URI is not among
        the candidates.
    """
    source_tokens = []
    for leaf in GetLeaves(src_treep):
        if not IsVariable(leaf):
            source_tokens.append(leaf)

    target_tokens = []
    for leaf in GetLeaves(trg_treep):
        if IsVariable(leaf) or leaf == '[]':
            continue
        target_tokens.append(leaf)

    target_uri = target_tokens[-1].lstrip('!')
    candidates = self.GetURIs(
        source_tokens, filterq=self.filterq, k=self.kgen)
    try:
        position = candidates.index(target_uri)
    except ValueError:
        # The target URI was not retrieved for these source tokens.
        return None
    return 1.0 / (position + 1)
def IsEligible(self, src_treep, trg_treep):
    """Tell whether a source/target pair is within the phrase-length limits.

    A pair is rejected when the source pattern has variables, when the
    source has more non-variable leaves than ``self.max_src_phrase_length``,
    or when a target is given and has more non-variable leaves than
    ``self.max_trg_phrase_length``.  ``trg_treep`` may be ``None``, in which
    case only the source side is checked.
    """
    if src_treep.HasVariables():
        return False

    source_tokens = [
        leaf for leaf in GetLeaves(src_treep) if not IsVariable(leaf)
    ]
    if len(source_tokens) > self.max_src_phrase_length:
        return False

    if trg_treep is None:
        return True

    target_tokens = [
        leaf for leaf in GetLeaves(trg_treep) if not IsVariable(leaf)
    ]
    return len(target_tokens) <= self.max_trg_phrase_length
def GetSimilarity(self, tree_pattern1, tree_pattern2):
    """Return a single 'tree_size' similarity for the two patterns.

    The size of a pattern is its number of inner nodes plus its number of
    leaves; the cost is the sum of the squared sizes of both patterns.
    """
    inner1 = tree_pattern1.GetInnerNodes()
    inner2 = tree_pattern2.GetInnerNodes()
    leaves1 = GetLeaves(tree_pattern1)
    leaves2 = GetLeaves(tree_pattern2)

    # Sizes are sums of len() results, hence never negative.
    size1 = len(inner1) + len(leaves1)
    size2 = len(inner2) + len(leaves2)
    cost = size1 ** 2 + size2 ** 2
    return [Similarity(cost, 'tree_size', tree_pattern1, tree_pattern2)]
def GetSimilarity(self, tree_pattern1, tree_pattern2):
    """Look up the pair of phrases in ``self.dictionary``.

    Only applies when both patterns are strings.  Each phrase is the
    pattern's leaves joined with ``'_'`` and lowercased.

    Returns:
        A one-element list with a 'dictionary' similarity whose cost is the
        dictionary entry for ``(phrase1, phrase2)``, or an empty list when
        either pattern is not a string or the pair has no entry.
    """
    both_strings = tree_pattern1.IsString() and tree_pattern2.IsString()
    if not both_strings:
        return []

    leaves1 = GetLeaves(tree_pattern1)
    leaves2 = GetLeaves(tree_pattern2)
    key = ('_'.join(leaves1).lower(), '_'.join(leaves2).lower())
    if key not in self.dictionary:
        return []
    return [
        Similarity(self.dictionary[key], 'dictionary',
                   tree_pattern1, tree_pattern2)
    ]
def GetSimilarity(self, tree_pattern1, tree_pattern2):
    """Return one unlabelled similarity (relation ``None``) for the pair.

    When both patterns are strings the cost is ``self.kSubstitutionCost``.
    Otherwise the cost charges ``self.kDeletionCost`` per distinct leaf of
    the first pattern plus ``self.kInsertionCost`` per distinct leaf of the
    second pattern.
    """
    if not (tree_pattern1.IsString() and tree_pattern2.IsString()):
        # Leaves are counted after de-duplication.
        distinct1 = set(GetLeaves(tree_pattern1))
        distinct2 = set(GetLeaves(tree_pattern2))
        cost = (len(distinct1) * self.kDeletionCost
                + len(distinct2) * self.kInsertionCost)
    else:
        cost = self.kSubstitutionCost
    return [Similarity(cost, None, tree_pattern1, tree_pattern2)]
def GetSimilarity(self, tree_pattern1, tree_pattern2):
    """Return one similarity labelled ``self.relation`` for the pair.

    When both patterns are strings the cost is ``self.kSubstitutionCost``.
    Otherwise leaves are paired off: ``min(n1, n2)`` leaves count as
    substitutions, surplus leaves of the first pattern as deletions and
    surplus leaves of the second pattern as insertions, each weighted by
    the corresponding ``self.k*Cost`` attribute.
    """
    if tree_pattern1.IsString() and tree_pattern2.IsString():
        cost = self.kSubstitutionCost
    else:
        count1 = len(GetLeaves(tree_pattern1))
        count2 = len(GetLeaves(tree_pattern2))
        paired = min(count1, count2)
        # paired never exceeds either count, so the surpluses are >= 0.
        cost = (paired * self.kSubstitutionCost
                + (count1 - paired) * self.kDeletionCost
                + (count2 - paired) * self.kInsertionCost)
    return [Similarity(cost, self.relation, tree_pattern1, tree_pattern2)]
def GetCostSimilarity(self, src_treep, trg_treep):
    """Compute a cost from the rank of the target URI among candidates.

    Candidates come from ``self.GetURIs`` on the non-variable source leaves
    (with ``self.filterq`` and ``k=self.krecog``).  The URI is the last
    target leaf that is neither a variable nor ``'[]'``, with leading
    ``'!'`` characters stripped.

    Returns:
        ``(1 - 1 / (position + 1))`` multiplied by the total number of kept
        source and target leaves, plus ``self.extra_cost`` when more than
        one target leaf was kept; ``None`` when the URI is not among the
        candidates.
    """
    source_tokens = []
    for leaf in GetLeaves(src_treep):
        if not IsVariable(leaf):
            source_tokens.append(leaf)

    target_tokens = []
    for leaf in GetLeaves(trg_treep):
        if IsVariable(leaf) or leaf == '[]':
            continue
        target_tokens.append(leaf)

    target_uri = target_tokens[-1].lstrip('!')
    candidates = self.GetURIs(
        source_tokens, filterq=self.filterq, k=self.krecog)
    try:
        position = candidates.index(target_uri)
    except ValueError:
        # The target URI was not retrieved for these source tokens.
        return None

    cost = 1.0 - 1.0 / (position + 1)
    cost *= len(source_tokens) + len(target_tokens)
    if len(target_tokens) > 1:
        cost += self.extra_cost
    return cost
def get_trg_words_from_treep(self, trg_treep):
    """Tokenize the target pattern's non-variable leaves.

    The leaves are joined with single spaces and the resulting string is
    passed to ``self.get_target_words``, whose result is returned.
    """
    kept_leaves = []
    for leaf in GetLeaves(trg_treep):
        if not IsVariable(leaf):
            kept_leaves.append(leaf)
    return self.get_target_words(' '.join(kept_leaves))
def get_src_words_from_treep(self, src_treep):
    """Tokenize the source pattern's non-variable leaves.

    The leaves are joined with single spaces and the resulting string is
    passed to ``self.get_index_words`` (with ``None`` as second argument),
    whose result is returned.
    """
    kept_leaves = []
    for leaf in GetLeaves(src_treep):
        if not IsVariable(leaf):
            kept_leaves.append(leaf)
    return self.get_index_words(' '.join(kept_leaves), None)
def BuildTrgTreePatterns(self, src_treep):
    """Build one target pattern per URI candidate of the source leaves.

    Candidates come from ``self.GetURIs`` with ``k=self.kgen``.  Each URI
    is wrapped as ``(ID [] <uri>)`` before being turned into a
    ``TreePattern`` rooted at the empty path with no subpaths.
    """
    candidates = self.GetURIs(GetLeaves(src_treep), k=self.kgen)
    # A single path tuple and subpaths list are shared by every pattern.
    root_path, no_subpaths = (), []
    patterns = []
    for uri in candidates:
        tree = tree_or_string(u'(ID [] {0})'.format(uri))
        patterns.append(TreePattern(tree, root_path, no_subpaths))
    return patterns
def BuildTrgTreePatterns(self, src_treep):
    """Build one target pattern per URI candidate of the source leaves.

    Candidates come from ``self.GetURIs`` with ``k=self.kgen``.  Each URI
    is passed as-is through ``tree_or_string`` and turned into a
    ``TreePattern`` rooted at the empty path with no subpaths.
    """
    candidates = self.GetURIs(GetLeaves(src_treep), k=self.kgen)
    # A single path tuple and subpaths list are shared by every pattern.
    root_path, no_subpaths = (), []
    patterns = []
    for uri in candidates:
        patterns.append(
            TreePattern(tree_or_string(uri), root_path, no_subpaths))
    return patterns
def IsEligible(self, src_treep, trg_treep):
    """Tell whether a source/target pair is eligible.

    The source must pass ``self.IsEligibleSrc``.  When a target is given
    (``trg_treep`` is not ``None``), it must have exactly
    ``self.trg_phrase_length`` non-variable leaves and at most one
    variable.
    """
    if not self.IsEligibleSrc(src_treep):
        return False
    if trg_treep is None:
        return True

    target_tokens = [
        leaf for leaf in GetLeaves(trg_treep) if not IsVariable(leaf)
    ]
    variable_count = trg_treep.GetNumVariables()
    return (len(target_tokens) == self.trg_phrase_length
            and variable_count <= 1)
def IsEligibleSrc(self, src_treep):
    """Tell whether a source pattern is eligible on its own.

    The pattern must have no variables, and its leaves, after
    ``filter_tokens``, must be non-empty and number at most
    ``self.max_src_phrase_length``.
    """
    if src_treep.HasVariables():
        return False
    tokens = filter_tokens(GetLeaves(src_treep))
    return bool(tokens) and len(tokens) <= self.max_src_phrase_length
def IsEligible_(self, src_treep, trg_treep):
    """Tell whether a source/target pair is eligible.

    The source pattern must contain no variables and have at most
    ``self.max_src_phrase_length`` leaves.  When a target is given
    (``trg_treep`` is not ``None``), it must have exactly
    ``self.trg_phrase_length`` non-variable leaves.
    """
    if src_treep.HasVariables():
        return False
    if len(GetLeaves(src_treep)) > self.max_src_phrase_length:
        return False
    if trg_treep is None:
        return True

    target_tokens = []
    for leaf in GetLeaves(trg_treep):
        if not IsVariable(leaf):
            target_tokens.append(leaf)
    return len(target_tokens) == self.trg_phrase_length
def GetSimilarity(self, tree_pattern1, tree_pattern2):
    """Return one similarity per linguistic relationship between phrases.

    Only applies when both patterns are strings; otherwise an empty list
    is returned.  Each phrase is built from the pattern's distinct leaves
    joined with ``'_'`` and passed to ``LinguisticRelationship``.  Every
    relation it yields becomes a ``Similarity`` with cost
    ``self.kLinguisticVariation``.

    Duplicate leaves are dropped (as before), but the remaining leaves now
    keep their original order.  Joining a ``set`` directly produced the
    tokens in arbitrary order, which for strings can differ between
    interpreter runs, so the phrase being looked up was not stable.
    """
    if not (tree_pattern1.IsString() and tree_pattern2.IsString()):
        # NOTE: a disabled variant here used to return a 'copy' similarity
        # when both patterns were equal; it remains switched off.
        return []

    def unique_in_order(leaves):
        # De-duplicate while preserving first-occurrence order.
        seen = set()
        ordered = []
        for leaf in leaves:
            if leaf not in seen:
                seen.add(leaf)
                ordered.append(leaf)
        return ordered

    tree1_leaves = unique_in_order(GetLeaves(tree_pattern1))
    tree2_leaves = unique_in_order(GetLeaves(tree_pattern2))
    phrase1 = '_'.join(tree1_leaves)
    phrase2 = '_'.join(tree2_leaves)

    return [
        Similarity(self.kLinguisticVariation, relation,
                   tree_pattern1, tree_pattern2)
        for relation in LinguisticRelationship(phrase1, phrase2)
    ]
def FilterOutRulesWithCVT(rules):
    """Return the rules whose right-hand side mentions no CVT.

    A rule is dropped when any leaf of ``rule.rhs`` that is neither a
    variable nor an operator, once leading ``'!'`` characters are stripped,
    is a member of the module-level ``cvts`` collection.  The relative
    order of the kept rules is preserved.
    """
    def mentions_cvt(rule):
        for leaf in GetLeaves(rule.rhs):
            if IsVariable(leaf) or IsOperator(leaf):
                continue
            if leaf.lstrip('!') in cvts:
                return True
        return False

    return [rule for rule in rules if not mentions_cvt(rule)]
def GetSimilarity(self, src_treep, trg_treep):
    """Score how well the source phrase matches the target URI.

    The URI is the last target leaf.  It is split with
    ``self.trg_token_separators`` and, like the source leaves, passed
    through ``filter_tokens`` before computing ``get_ngram_ratio`` of the
    source tokens against the URI tokens.  Pairs whose ratio is below
    ``self.ngram_min_ratio`` are not eligible.

    The cost is ``(2.1 - ngram_ratio) * (number of source leaves + 1)``,
    so a perfect ratio of 1.0 costs 1.1 per unit and lower ratios cost
    more.  With a single target leaf the relation is whatever
    ``self.GetURIRole`` returns for the URI (no similarity if it returns
    nothing); with several target leaves ``self.extra_cost`` is added and
    the relation is ``self.bridge_relation``.  Entity and bridge relations
    additionally require ``IsPlausibleEntityPhrase(src_treep)``.

    Returns:
        A list with one ``Similarity``, or an empty list when the pair is
        rejected at any step.
    """
    if not self.IsEligible(src_treep, trg_treep):
        return []

    raw_src_leaves = GetLeaves(src_treep)
    raw_trg_leaves = GetLeaves(trg_treep)
    uri = raw_trg_leaves[-1]
    # Leaf counts are taken before any splitting or filtering.
    src_count = len(raw_src_leaves)
    trg_count = len(raw_trg_leaves)

    split_uri = SplitLeavesBy([uri], self.trg_token_separators)
    src_tokens = filter_tokens(raw_src_leaves)
    uri_tokens = filter_tokens(split_uri)
    ratio = get_ngram_ratio(src_tokens, uri_tokens)
    if not ratio >= self.ngram_min_ratio:
        return []

    cost = (2.1 - ratio) * (src_count + 1)
    if trg_count == 1:
        relation = self.GetURIRole(uri)
        if not relation:
            return []
    else:
        cost += self.extra_cost
        relation = self.bridge_relation

    entity_like = relation in [self.entity_relation, self.bridge_relation]
    if entity_like and not IsPlausibleEntityPhrase(src_treep):
        return []
    return [Similarity(cost, relation, src_treep, trg_treep)]
def GetSimilarity(self, tree_pattern1, tree_pattern2):
    """Score a partial dictionary match between source and target.

    Source leaves and target leaves (the latter with leading ``'!'``
    stripped) are split into words with the respective token separators
    and optionally lowercased.  Each word is mapped to indices of
    bilingual entries (``self.get_src_index`` for the source side,
    ``self.trg_index`` for the target side); the entries common to both
    sides are handed to ``self.GetSimilarityCost`` to obtain the cost.

    Per the original design note, a partial match is meant to cost as much
    as the worst entry it partially matches, so that it is never preferred
    over the exact match implemented in DictionaryCost().

    Returns:
        A list with one ``Similarity`` labelled ``self.relation``, or an
        empty list when the pair is not eligible or no entry is shared.
    """
    if not self.IsEligible(tree_pattern1, tree_pattern2):
        return []

    source_leaves = GetLeaves(tree_pattern1)
    target_leaves = [leaf.lstrip('!') for leaf in GetLeaves(tree_pattern2)]

    source_words = SplitLeavesBy(source_leaves, self.src_token_separators)
    target_words = SplitLeavesBy(target_leaves, self.trg_token_separators)
    if self.lowercase:
        source_words = [word.lower() for word in source_words]
        target_words = [word.lower() for word in target_words]

    # Entries in which at least one source word appears.
    source_hits = [self.get_src_index(word) for word in source_words]
    source_entries = set(itertools.chain.from_iterable(source_hits))
    # Entries in which at least one target word appears.
    target_hits = [self.trg_index.get(word, []) for word in target_words]
    target_entries = set(itertools.chain.from_iterable(target_hits))

    shared_entries = source_entries.intersection(target_entries)
    if not shared_entries:
        return []

    cost = self.GetSimilarityCost(source_words, target_words, shared_entries)
    return [Similarity(cost, self.relation, tree_pattern1, tree_pattern2)]
def BuildTrgTreePatterns(self, src_treep):
    """Build target patterns for each URI candidate and its '!' variant.

    Candidates come from ``self.GetURIs`` (with ``self.filterq`` and
    ``k=self.kgen``).  Every candidate is followed by the same URI prefixed
    with ``'!'``.  When the source pattern has variables, each URI is
    wrapped as ``(ID <uri> ?x0|)``; otherwise the URI is used as-is.
    """
    direct_uris = self.GetURIs(
        GetLeaves(src_treep), filterq=self.filterq, k=self.kgen)
    candidates = [
        variant for uri in direct_uris for variant in (uri, '!' + uri)
    ]

    # A single path tuple and subpaths list are shared by every pattern.
    root_path, no_subpaths = (), []
    if src_treep.HasVariables():
        def make_tree(uri):
            return tree_or_string(u'(ID {0} ?x0|)'.format(uri))
    else:
        make_tree = tree_or_string

    return [
        TreePattern(make_tree(uri), root_path, no_subpaths)
        for uri in candidates
    ]
def GetSimilar(self, src_tree_pattern):
    """Return the best-scoring similarities for a source pattern.

    Returns an empty list when the pattern fails ``self.IsEligibleSrc``.
    Otherwise the source leaves are split into words (optionally
    lowercased), each word is mapped to bilingual-entry indices through
    ``self.get_src_index``, and ``self.MakeSimilar`` builds the candidate
    similarities.  The candidates are sorted by descending ``score`` and
    truncated to ``self.n_best``.
    """
    if not self.IsEligibleSrc(src_tree_pattern):
        return []

    words = SplitLeavesBy(
        GetLeaves(src_tree_pattern), self.src_token_separators)
    if self.lowercase:
        words = [word.lower() for word in words]

    # Entries in which at least one source word appears.
    hits = [self.get_src_index(word) for word in words]
    entries = set(itertools.chain.from_iterable(hits))

    ranked = list(self.MakeSimilar(src_tree_pattern, words, entries))
    ranked.sort(key=lambda candidate: candidate.score, reverse=True)
    return ranked[:self.n_best]
def GetSimilar(self, tree_pattern1):
    """Return lexicon-based similarities for a source pattern.

    When the pattern has no leaves, or more leaves than
    ``self.max_phrase_length``, a single fallback similarity labelled
    ``'q0'`` is returned whose cost is the number of leaves.  Otherwise
    ``self.GetLexicon`` is queried for 'entity', 'unary' and 'binary'
    entries; each ``(lex, lex_type)`` pair becomes a zero-cost similarity
    whose target is a ``TreePattern`` built from ``lex``.

    The fallback branch previously referenced an undefined name
    (``tree_pattern2``) and raised ``NameError`` every time it ran.  No
    target pattern exists in that branch, so ``None`` is passed explicitly.
    NOTE(review): confirm that consumers of 'q0' similarities accept a
    ``None`` target.
    """
    tree1_leaves = GetLeaves(tree_pattern1)
    num_tree1_leaves = len(tree1_leaves)

    if num_tree1_leaves > self.max_phrase_length or not tree1_leaves:
        # No lexicon lookup is attempted, hence there is no target pattern.
        return [Similarity(num_tree1_leaves, 'q0', tree_pattern1, None)]

    entities = self.GetLexicon(tree1_leaves, 'entity')
    unary_predicates = self.GetLexicon(tree1_leaves, 'unary')
    binary_predicates = self.GetLexicon(tree1_leaves, 'binary')
    lexicon = entities + unary_predicates + binary_predicates

    cost = 0.0
    similarities = []
    for lex, lex_type in lexicon:
        # Each target pattern gets its own fresh path and subpaths.
        tree_pattern2 = TreePattern(lex, (), [])
        similarities.append(
            Similarity(cost, lex_type, tree_pattern1, tree_pattern2))
    return similarities
def GetOutputVocabulary(rules):
    """Collect the set of right-hand-side leaves across all rules.

    Leaves whose text starts with ``'?x'`` are left out.
    """
    vocabulary = set()
    for rule in rules:
        vocabulary.update(
            leaf for leaf in GetLeaves(rule.rhs)
            if not leaf.startswith('?x'))
    return vocabulary
def GetSimilarity(self, tree_pattern1, tree_pattern2):
    """Return one unlabelled similarity (relation ``None``) for the pair.

    The cost charges ``self.kDeletionCost`` per distinct leaf of the first
    pattern plus ``self.kInsertionCost`` per distinct leaf of the second
    pattern.
    """
    # Leaves are counted after de-duplication.
    distinct1 = set(GetLeaves(tree_pattern1))
    distinct2 = set(GetLeaves(tree_pattern2))
    deletion_part = len(distinct1) * self.kDeletionCost
    insertion_part = len(distinct2) * self.kInsertionCost
    cost = deletion_part + insertion_part
    return [Similarity(cost, None, tree_pattern1, tree_pattern2)]