# NOTE: the names used below (AlignerConfig, load_resources,
# find_all_common_contiguous_sublists, cobalt_stopwords, punctuations,
# NamedEntityGroup, find_named_entity_groups, is_acronym_stanford, is_sublist,
# word_relatedness_alignment_stanford, find_textual_neighborhood_stanford)
# are project-local helpers. Their exact import paths are not shown in this
# file, so the imports below are only a commented-out sketch:
# from .aligner_config import AlignerConfig
# from . import load_resources
# from .util import (find_all_common_contiguous_sublists, cobalt_stopwords,
#                    punctuations, NamedEntityGroup, find_named_entity_groups,
#                    is_acronym_stanford, is_sublist,
#                    word_relatedness_alignment_stanford,
#                    find_textual_neighborhood_stanford)


class AlignerStanford(object):

    def __init__(self, language):
        self.config = AlignerConfig(language)
        # keep the mutable state on the instance so that two aligners never
        # share the same alignment sets
        self.alignments = set()
        self.source_indices_aligned = set()
        self.target_indices_aligned = set()
        self.similarity_types = dict()
        if "paraphrases" in self.config.selected_lexical_resources:
            load_resources.load_ppdb(self.config.path_to_ppdb)
        if "distributional" in self.config.selected_lexical_resources:
            load_resources.load_word_vectors(self.config.path_to_vectors)

    def _add_to_alignments(self, source_index, target_index, similarity_type='Exact'):
        if (source_index, target_index) not in self.alignments:
            self.alignments.add((source_index, target_index))
            self.source_indices_aligned.add(source_index)
            self.target_indices_aligned.add(target_index)
            self.similarity_types[(source_index, target_index)] = similarity_type

    def _align_ending_punctuation(self, source, target):
        # word indices are 1-based, so len(source) addresses the last word
        if (source[len(source) - 1].is_sentence_ending_punctuation()
                and target[len(target) - 1].is_sentence_ending_punctuation()) \
                or source[len(source) - 1].form == target[len(target) - 1].form:
            self._add_to_alignments(len(source), len(target))
        elif source[len(source) - 2].is_sentence_ending_punctuation() \
                and target[len(target) - 1].is_sentence_ending_punctuation():
            self._add_to_alignments(len(source) - 1, len(target))
        elif source[len(source) - 1].is_sentence_ending_punctuation() \
                and target[len(target) - 2].is_sentence_ending_punctuation():
            self._add_to_alignments(len(source), len(target) - 1)
        elif source[len(source) - 2].is_sentence_ending_punctuation() \
                and target[len(target) - 2].is_sentence_ending_punctuation():
            self._add_to_alignments(len(source) - 1, len(target) - 1)

    def _align_contiguous_sublists(self, source, target):
        # align common contiguous sublists of length >= 3 that contain at
        # least one content word
        sublists = find_all_common_contiguous_sublists(source, target, True)
        for item in sublists:
            only_stopwords = True
            for jtem in item[0]:
                if jtem not in cobalt_stopwords and jtem not in punctuations:
                    only_stopwords = False
                    break
            if len(item[0]) >= 3 and not only_stopwords:
                for j in range(len(item[0])):
                    if item[0][j] + 1 not in self.source_indices_aligned \
                            and item[1][j] + 1 not in self.target_indices_aligned \
                            and (item[0][j] + 1, item[1][j] + 1) not in self.alignments:
                        self._add_to_alignments(item[0][j] + 1, item[1][j] + 1)

    def _align_hyphenated_word_groups(self, source, target):
        for word in source:
            if word.index in self.source_indices_aligned:
                continue
            if '-' in word.form and word.form != '-':
                tokens = word.form.split('-')
                sublists = find_all_common_contiguous_sublists(tokens, target)
                for item in sublists:
                    if len(item[1]) > 1:
                        for jtem in item[1]:
                            if (word.index, jtem + 1) not in self.alignments:
                                self._add_to_alignments(word.index, jtem + 1)
        for word in target:
            if word.index in self.target_indices_aligned:
                continue
            if '-' in word.form and word.form != '-':
                tokens = word.form.split('-')
                sublists = find_all_common_contiguous_sublists(source, tokens)
                for item in sublists:
                    if len(item[0]) > 1:
                        for jtem in item[0]:
                            if (jtem + 1, word.index) not in self.alignments:
                                self._add_to_alignments(jtem + 1, word.index)

    def _extend_named_entities_group_using_opposite_sentence(self, sentence, same_ne_groups, opposite_ne_groups):
        # learn from the other sentence that a certain word/phrase is a named entity
        for word in sentence:
            included = False
            for source_ne_group in same_ne_groups:
                if word.index in source_ne_group.indicies:
                    included = True
                    break
            if included or not word.form.isupper():
                continue
            for opposite_ne_group in opposite_ne_groups:
                if word.form in opposite_ne_group.forms:
                    new_ne_group = NamedEntityGroup([word.index], [word], opposite_ne_group.ner)
                    # check if the current item is part of a named entity, part of
                    # which has already been added (by checking contiguousness)
                    is_part = False
                    for group in same_ne_groups:
                        if group.indicies[-1] == new_ne_group.indicies[0] - 1:
                            group.indicies.append(word.index)
                            group.words.append(word)
                            group.forms.append(word.form)
                            is_part = True
                    if not is_part:
                        same_ne_groups.append(new_ne_group)
                elif is_acronym_stanford(word, opposite_ne_group) \
                        and next((x for x in same_ne_groups if word.form in x.forms), None) is None:
                    same_ne_groups.append(NamedEntityGroup([word.index], [word], opposite_ne_group.ner))

    def _count_form_occurences(self, ne_group, groups):
        count = 0
        for group in groups:
            if group.forms == ne_group.forms:
                count += 1
        return count

    def _align_named_entity_subsets(self, smaller_group, bigger_group, source_is_smaller, full_smaller_words):
        unaligned_indicies_bigger = []
        unaligned_indicies_bigger.extend(bigger_group.indicies)
        for i, smaller_form in enumerate(smaller_group.forms):
            for j, bigger_form in enumerate(bigger_group.forms):
                if smaller_form == bigger_form:
                    if source_is_smaller:
                        self._add_to_alignments(smaller_group.indicies[i], bigger_group.indicies[j])
                    else:
                        self._add_to_alignments(bigger_group.indicies[j], smaller_group.indicies[i])
                    if bigger_group.indicies[j] in unaligned_indicies_bigger:
                        unaligned_indicies_bigger.remove(bigger_group.indicies[j])
        for i, smaller_word in enumerate(smaller_group.words):
            for j, bigger_word in enumerate(bigger_group.words):
                # if the current term in the longer name has already been
                # aligned, do not align it again
                dont_insert = smaller_word.is_punctuation() or bigger_word.is_punctuation()
                for a in self.alignments:
                    if (source_is_smaller and a[1] == bigger_group.indicies[j]) \
                            or (not source_is_smaller and a[0] == bigger_group.indicies[j]):
                        dont_insert = True
                        break
                for word in full_smaller_words:
                    if bigger_word.form == word.form:
                        dont_insert = True
                        break
                if bigger_group.indicies[j] not in unaligned_indicies_bigger or dont_insert:
                    continue
                if source_is_smaller:
                    self._add_to_alignments(smaller_group.indicies[i], bigger_group.indicies[j])
                else:
                    self._add_to_alignments(bigger_group.indicies[j], smaller_group.indicies[i])

    def _align_named_entities(self, source, target):
        source_ne_groups = sorted(find_named_entity_groups(source), key=NamedEntityGroup.sort_key)
        target_ne_groups = sorted(find_named_entity_groups(target), key=NamedEntityGroup.sort_key)
        self._extend_named_entities_group_using_opposite_sentence(source, source_ne_groups, target_ne_groups)
        self._extend_named_entities_group_using_opposite_sentence(target, target_ne_groups, source_ne_groups)
        if len(source_ne_groups) == 0 or len(target_ne_groups) == 0:
            return
        source_ne_groups_aligned = []
        target_ne_groups_aligned = []
        # align all full matches
        for source_group in source_ne_groups:
            if self._count_form_occurences(source_group, source_ne_groups) > 1:
                continue
            for target_group in target_ne_groups:
                if self._count_form_occurences(target_group, target_ne_groups) > 1:
                    continue
                # get rid of dots and hyphens
                canonical_source_forms = [i.replace('.', '').replace('-', '') for i in source_group.forms]
                canonical_target_forms = [j.replace('.', '').replace('-', '') for j in target_group.forms]
                if canonical_source_forms == canonical_target_forms:
                    for k in range(len(source_group.indicies)):
                        if (source_group.indicies[k], target_group.indicies[k]) not in self.alignments:
                            self._add_to_alignments(source_group.indicies[k], target_group.indicies[k])
                    source_ne_groups_aligned.append(source_group)
                    target_ne_groups_aligned.append(target_group)
        # align acronyms with their elaborations
        for source_group in source_ne_groups:
            for target_group in target_ne_groups:
                if len(source_group.words) == 1 and len(source_group.forms[0].replace('.', '')) > 1 \
                        and is_acronym_stanford(source_group.words[0], target_group):
                    for i in range(len(target_group.indicies)):
                        if (source_group.indicies[0], target_group.indicies[i]) not in self.alignments:
                            self._add_to_alignments(source_group.indicies[0], target_group.indicies[i])
                elif len(target_group.words) == 1 and len(target_group.forms[0].replace('.', '')) > 1 \
                        and is_acronym_stanford(target_group.words[0], source_group):
                    for i in range(len(source_group.indicies)):
                        if (source_group.indicies[i], target_group.indicies[0]) not in self.alignments:
                            self._add_to_alignments(source_group.indicies[i], target_group.indicies[0])
        # align subset matches
        for source_group in source_ne_groups:
            if source_group in source_ne_groups_aligned \
                    or self._count_form_occurences(source_group, source_ne_groups) > 1:
                continue
            for target_group in target_ne_groups:
                if target_group in target_ne_groups_aligned or source_group.ner != target_group.ner \
                        or self._count_form_occurences(target_group, target_ne_groups) > 1:
                    continue
                # find if the first is a part of the second
                if is_sublist(source_group.forms, target_group.forms):
                    self._align_named_entity_subsets(source_group, target_group, True, source)
                # else find if the second is a part of the first
                if is_sublist(target_group.forms, source_group.forms):
                    self._align_named_entity_subsets(target_group, source_group, False, target)

    def _is_similar(self, dep1, dep2, pos1, pos2, is_opposite, relation):
        result = False
        group = self.config.get_similar_group(pos1, pos2, is_opposite, relation)
        if is_opposite:
            for subgroup in group:
                if dep1 in subgroup[0] and dep2 in subgroup[1]:
                    result = True
        else:
            for subgroup in group:
                if dep1 in subgroup and dep2 in subgroup:
                    result = True
        return result

    def _compare_nodes(self, source, target, pos, opposite, relation_direction):
        # search for nodes in common or with an equivalent function: a node pair
        # counts when it is already aligned (or similar enough) and fills the
        # same or an equivalent grammatical role
        result = {}
        for word1 in source:
            for word2 in target:
                similarity, similarity_type = word_relatedness_alignment_stanford(word1, word2, self.config)
                aligned_or_similar = (word1.index, word2.index) in self.alignments \
                    or similarity >= self.config.alignment_similarity_threshold
                equivalent_function = word1.dep == word2.dep \
                    or (pos != ''
                        and (self._is_similar(word1.dep, word2.dep, pos, 'noun', opposite, relation_direction)
                             or self._is_similar(word1.dep, word2.dep, pos, 'verb', opposite, relation_direction)
                             or self._is_similar(word1.dep, word2.dep, pos, 'adjective', opposite, relation_direction)
                             or self._is_similar(word1.dep, word2.dep, pos, 'adverb', opposite, relation_direction)))
                if aligned_or_similar and equivalent_function:
                    result[(word1.index, word2.index)] = (similarity, similarity_type)
        return result

    def _calculate_absolute_score(self, word_similarities):
        # keep, for every source node and every target node, its strongest
        # similarity, then greedily sum scores over mutually exclusive pairs
        max_left = {}
        max_right = {}
        for similarity in word_similarities.keys():
            if similarity[0] not in max_left \
                    or word_similarities[max_left[similarity[0]]][0] < word_similarities[similarity][0]:
                max_left[similarity[0]] = similarity
            if similarity[1] not in max_right \
                    or word_similarities[max_right[similarity[1]]][0] < word_similarities[similarity][0]:
                max_right[similarity[1]] = similarity
        left_right = dict()
        left_right.update(max_left)
        left_right.update(max_right)
        max_relations = set(left_right.values())
        score = 0
        source_nodes_considered = []
        target_nodes_considered = []
        for rel in max_relations:
            if rel[0] not in source_nodes_considered and rel[1] not in target_nodes_considered:
                score += word_similarities[rel][0]
                source_nodes_considered.append(rel[0])
                target_nodes_considered.append(rel[1])
        return score

    def _find_dependency_similarity(self, pos, source_word, target_word):
        # compare equal-direction relations (parents vs. parents, children vs.
        # children) and opposite-direction relations (parents vs. children,
        # children vs. parents)
        comparison = dict()
        comparison.update(self._compare_nodes(source_word.parents, target_word.parents, pos, False, 'parent'))
        comparison.update(self._compare_nodes(source_word.children, target_word.children, pos, False, 'child'))
        comparison.update(self._compare_nodes(source_word.parents, target_word.children, pos, True, 'parent_child'))
        comparison.update(self._compare_nodes(source_word.children, target_word.parents, pos, True, 'child_parent'))
        alignments = []
        word_similarities = {}
        for alignment in comparison.keys():
            alignments.append([alignment[0], alignment[1]])
            word_similarities[alignment] = comparison[alignment]
        return [self._calculate_absolute_score(word_similarities), alignments]

    def _align_on_dependency_match(self, pos, pos_code, source, target):
        pos_count_in_source = 0
        evidence_counts_matrix = {}
        relative_alignments_matrix = {}
        word_similarities = {}
        # construct the two matrices in the following loop
        for item in source:
            i = item.index
            if i in self.source_indices_aligned or not item.matches_pos_code(pos_code):
                continue
            pos_count_in_source += 1
            for jtem in target:
                j = jtem.index
                if j in self.target_indices_aligned or not jtem.matches_pos_code(pos_code):
                    continue
                similarity, similarity_type = word_relatedness_alignment_stanford(item, jtem, self.config)
                if similarity < self.config.alignment_similarity_threshold:
                    continue
                word_similarities[(i, j)] = similarity, similarity_type
                dependency_similarity = self._find_dependency_similarity(pos, item, jtem)
                if word_similarities[(i, j)][0] == self.config.alignment_similarity_threshold:
                    if word_similarities[(i, j)][0] + dependency_similarity[0] <= 1.0:
                        continue
                if dependency_similarity[0] >= self.config.alignment_similarity_threshold:
                    evidence_counts_matrix[(i, j)] = dependency_similarity[0]
                    relative_alignments_matrix[(i, j)] = dependency_similarity[1]
                else:
                    evidence_counts_matrix[(i, j)] = 0
        # now use the collected stats to align
        for _ in range(pos_count_in_source):
            max_overall_value_for_pass = 0
            index_pair_with_strongest_tie_for_pass = [-1, -1]
            for item in source:
                i = item.index
                if i in self.source_indices_aligned or not item.matches_pos_code(pos_code):
                    continue
                for jtem in target:
                    j = jtem.index
                    if j in self.target_indices_aligned or not jtem.matches_pos_code(pos_code):
                        continue
                    if (i, j) not in evidence_counts_matrix:
                        continue
                    weighted = self.config.theta * word_similarities[(i, j)][0] \
                        + (1 - self.config.theta) * evidence_counts_matrix[(i, j)]
                    if weighted > max_overall_value_for_pass:
                        max_overall_value_for_pass = weighted
                        index_pair_with_strongest_tie_for_pass = [i, j]
            if max_overall_value_for_pass > 0:
                strongest_i = index_pair_with_strongest_tie_for_pass[0]
                strongest_j = index_pair_with_strongest_tie_for_pass[1]
                self._add_to_alignments(strongest_i, strongest_j,
                                        similarity_type=word_similarities[(strongest_i, strongest_j)][1])
            else:
                break

    def _get_unaligned_words(self, source, target, word_function):
        content_source = []
        content_target = []
        for word in source:
            if word.index in self.source_indices_aligned or not word_function(word):
                continue
            content_source.append(word)
        for word in target:
            if word.index in self.target_indices_aligned or not word_function(word):
                continue
            content_target.append(word)
        return content_source, content_target

    def _collect_evidence_from_textual_neighborhood(self, unaligned_source, unaligned_target, full_source, full_target):
        word_similarities = {}
        textual_neighborhood_similarities = {}
        for source_word in unaligned_source:
            for target_word in unaligned_target:
                word_similarities[(source_word.index, target_word.index)] = \
                    word_relatedness_alignment_stanford(source_word, target_word, self.config)
                # textual neighborhood similarities
                source_neighborhood = find_textual_neighborhood_stanford(full_source, source_word.index, 3, 3)
                target_neighborhood = find_textual_neighborhood_stanford(full_target, target_word.index, 3, 3)
                evidence = 0
                for source_neighbor in source_neighborhood:
                    for target_neighbor in target_neighborhood:
                        similarity, similarity_type = \
                            word_relatedness_alignment_stanford(source_neighbor, target_neighbor, self.config)
                        if (source_neighbor.index, target_neighbor.index) in self.alignments \
                                or similarity >= self.config.alignment_similarity_threshold:
                            evidence += similarity
                textual_neighborhood_similarities[(source_word.index, target_word.index)] = evidence
        return word_similarities, textual_neighborhood_similarities

    def _collect_evidence_from_dependency_neighborhood(self, unaligned_source, unaligned_target):
        word_similarities = {}
        dependency_neighborhood_similarities = {}
        for source_word in unaligned_source:
            for target_word in unaligned_target:
                i = source_word.index
                j = target_word.index
                similarity, similarity_type = word_relatedness_alignment_stanford(source_word, target_word, self.config)
                if (source_word.lemma != target_word.lemma) and (similarity < self.config.alignment_similarity_threshold):
                    word_similarities[(i, j)] = (0, similarity_type)
                    dependency_neighborhood_similarities[(i, j)] = 0
                    continue
                word_similarities[(i, j)] = (similarity, similarity_type)
                evidence = 0
                for source_parent in source_word.parents:
                    for target_parent in target_word.parents:
                        if (source_parent.index, target_parent.index) in self.alignments:
                            evidence += 1
                for source_child in source_word.children:
                    for target_child in target_word.children:
                        if (source_child.index, target_child.index) in self.alignments:
                            evidence += 1
                dependency_neighborhood_similarities[(i, j)] = evidence
        return word_similarities, dependency_neighborhood_similarities

    def _collect_evidence_from_textual_neighborhood_for_stopwords(self, unaligned_source, unaligned_target, full_source, full_target):
        word_similarities = {}
        textual_neighborhood_similarities = {}
        for source_word in unaligned_source:
            for target_word in unaligned_target:
                i = source_word.index
                j = target_word.index
                similarity, similarity_type = word_relatedness_alignment_stanford(source_word, target_word, self.config)
                if similarity < self.config.alignment_similarity_threshold:
                    word_similarities[(i, j)] = (0, similarity_type)
                    textual_neighborhood_similarities[(i, j)] = 0
                    continue
                word_similarities[(i, j)] = (similarity, similarity_type)
                # textual neighborhood evidence: increase the evidence if content
                # words around this stop word are aligned
                evidence = 0
                k = i
                l = j
                while k > 0:
                    if full_source[k - 1].is_stopword() or \
                            full_source[k - 1].is_punctuation():
                        k -= 1
                    else:
                        break
                while l > 0:
                    if full_target[l - 1].is_stopword() or full_target[l - 1].is_punctuation():
                        l -= 1
                    else:
                        break
                m = i
                n = j
                while m < len(full_source) - 1:
                    if full_source[m - 1].is_stopword() or full_source[m - 1].is_punctuation():
                        m += 1
                    else:
                        break
                while n < len(full_target) - 1:
                    if full_target[n - 1].is_stopword() or full_target[n - 1].is_punctuation():
                        n += 1
                    else:
                        break
                if (k, l) in self.alignments:
                    evidence += 1
                if (m, n) in self.alignments:
                    evidence += 1
                textual_neighborhood_similarities[(i, j)] = evidence
        return word_similarities, textual_neighborhood_similarities

    def _align_words(self, unaligned_source, unaligned_target, word_similarities, neighborhood_similarities, limit_by_neighborhood):
        # now align: find the best alignment in each iteration of the following
        # loop and include it in the alignments if it is good enough
        for _ in range(len(unaligned_source)):
            highest_weighted_similarity = 0
            best_word_similarity = 0
            best_neighborhood_similarity = 0
            best_source = None
            best_target = None
            best_word_similarity_type = None
            for source_word in unaligned_source:
                for target_word in unaligned_target:
                    i = source_word.index
                    j = target_word.index
                    if (i, j) not in word_similarities:
                        continue
                    if word_similarities[(i, j)][0] == self.config.alignment_similarity_threshold:
                        if word_similarities[(i, j)][0] + neighborhood_similarities[(i, j)] <= 1.0:
                            continue
                    weighted = self.config.theta * word_similarities[(i, j)][0] \
                        + (1 - self.config.theta) * neighborhood_similarities[(i, j)]
                    if weighted > highest_weighted_similarity:
                        highest_weighted_similarity = weighted
                        best_source = source_word
                        best_target = target_word
                        best_word_similarity = word_similarities[(i, j)][0]
                        best_word_similarity_type = word_similarities[(i, j)][1]
                        best_neighborhood_similarity = neighborhood_similarities[(i, j)]
            # guard against iterations that found no candidate pair
            if best_source is not None and best_target is not None \
                    and best_word_similarity >= self.config.alignment_similarity_threshold \
                    and (not limit_by_neighborhood or best_neighborhood_similarity > 0) \
                    and best_source.index not in self.source_indices_aligned \
                    and best_target.index not in self.target_indices_aligned:
                self._add_to_alignments(best_source.index, best_target.index,
                                        similarity_type=best_word_similarity_type)
            if best_source is not None:
                unaligned_source.remove(best_source)
            if best_target is not None:
                unaligned_target.remove(best_target)
        return unaligned_source, unaligned_target

    def _align_remaining_if_hyphenated(self, remaining_source, remaining_target, full_source, full_target):
        # look if any remaining word is a part of a hyphenated word
        for source_word in remaining_source:
            if '-' in source_word.form and source_word.form != '-':
                tokens = source_word.form.split('-')
                for item in find_all_common_contiguous_sublists(tokens, full_target):
                    if len(item[0]) == 1 and full_target[item[1][0]].is_stopword():
                        for jtem in item[1]:
                            if (source_word.index, jtem + 1) not in self.alignments \
                                    and jtem + 1 not in self.target_indices_aligned:
                                self._add_to_alignments(source_word.index, jtem + 1)
        for target_word in remaining_target:
            if '-' in target_word.form and target_word.form != '-':
                tokens = target_word.form.split('-')
                for item in find_all_common_contiguous_sublists(full_source, tokens):
                    if len(item[0]) == 1 and not full_source[item[0][0]].is_stopword():
                        for jtem in item[0]:
                            if (jtem + 1, target_word.index) not in self.alignments \
                                    and target_word.index not in self.target_indices_aligned:
                                self._add_to_alignments(jtem + 1, target_word.index)

    def _align_content_words(self, source, target):
        def is_content_word(word):
            return word.is_content_word()

        content_source, content_target = self._get_unaligned_words(source, target, is_content_word)
        word_similarities, neighborhood_similarities = \
            self._collect_evidence_from_textual_neighborhood(content_source, content_target, source, target)
        remaining_source, remaining_target = \
            self._align_words(content_source, content_target, word_similarities, neighborhood_similarities, False)
        self._align_remaining_if_hyphenated(remaining_source, remaining_target, source, target)

    def _align_stop_words_by_dependency_neighborhood(self, source, target):
        def is_stopword(word):
            return word.is_stopword()

        stop_source, stop_target = self._get_unaligned_words(source, target, is_stopword)
        word_similarities, neighborhood_similarities = \
            self._collect_evidence_from_dependency_neighborhood(stop_source, stop_target)
        self._align_words(stop_source, stop_target, word_similarities, neighborhood_similarities, True)

    def _align_stop_words_and_punctuations_by_textual_neighborhood(self, source, target):
        def is_stopword_or_punctuation(word):
            return word.is_stopword() or word.is_punctuation()

        stop_source, stop_target = self._get_unaligned_words(source, target, is_stopword_or_punctuation)
        word_similarities, neighborhood_similarities = \
            self._collect_evidence_from_textual_neighborhood_for_stopwords(stop_source, stop_target, source, target)
        self._align_words(stop_source, stop_target, word_similarities, neighborhood_similarities, True)

    def align(self, source, target):
        # reset the per-sentence-pair state so the aligner can be reused
        self.alignments = set()
        self.source_indices_aligned = set()
        self.target_indices_aligned = set()
        self.similarity_types = dict()
        self._align_ending_punctuation(source, target)
        self._align_contiguous_sublists(source, target)
        self._align_hyphenated_word_groups(source, target)
        self._align_named_entities(source, target)
        self._align_on_dependency_match('verb', 'v', source, target)
        self._align_on_dependency_match('noun', 'n', source, target)
        self._align_on_dependency_match('adjective', 'j', source, target)
        self._align_on_dependency_match('adverb', 'r', source, target)
        self._align_content_words(source, target)
        self._align_stop_words_by_dependency_neighborhood(source, target)
        self._align_stop_words_and_punctuations_by_textual_neighborhood(source, target)
        return self.alignments, self.similarity_types
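

# Hedged usage sketch (not part of the original module). It illustrates how
# _calculate_absolute_score greedily sums the strongest mutually exclusive
# similarities, keeping at most one pair per source node and per target node.
# The toy dict below is hypothetical, and the call deliberately bypasses
# __init__ (the method never touches instance state) purely for illustration.
if __name__ == "__main__":
    toy_similarities = {
        (1, 1): (0.9, 'Exact'),          # source 1 / target 1 match strongly
        (1, 2): (0.4, 'Distributional'),
        (2, 2): (0.7, 'Synonym'),
    }
    score = AlignerStanford._calculate_absolute_score(None, toy_similarities)
    print(score)  # ~1.6: (1, 1) and (2, 2) win their row/column, (1, 2) is dominated

    # A full alignment run would look roughly like the sketch below, assuming
    # `source` and `target` are sequences of parsed word objects (1-based
    # .index, plus .form, .lemma, .dep, .parents, .children) produced by the
    # project's Stanford preprocessing pipeline, which is not shown here:
    #
    # aligner = AlignerStanford('english')
    # alignments, similarity_types = aligner.align(source, target)
    # for (i, j) in sorted(alignments):
    #     print(source[i - 1].form, '<->', target[j - 1].form, similarity_types[(i, j)])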