def create_fsa_from_file(fpath):
    """Build a word-lattice FSA from the sentences in a UTF-8 text file.

    Each non-duplicate line is stripped, normalized, and tokenized with the
    module-level spaCy pipeline (``nlp``); the kept sentences are pairwise
    aligned, clustered into mutually consistent groups, and merged into a
    single word lattice.

    Args:
        fpath: path to a UTF-8 text file, one sentence per line.

    Returns:
        Tuple ``(fsa, sent_num, sents, tk_num)``: the lattice graph, the
        number of unique sentences kept, the list of normalized sentences
        (in file order), and the total token count over all kept sentences.
    """
    sents = []      # normalized sentences, in file order
    tk_sents = {}   # sentence index -> list of token strings
    seen = set()    # O(1) duplicate detection (was an O(n) list scan per line)
    with codecs.open(fpath, encoding='utf-8') as ifile:
        for line in ifile:
            norm_line = normalize_sent(line.strip())
            if norm_line in seen:
                print("Found a duplicated sentence.")
                continue
            seen.add(norm_line)
            doc = nlp(norm_line)
            # len(sents) is the next free index — replaces the manual counter.
            tk_sents[len(sents)] = [tk.text for tk in doc]
            sents.append(norm_line)

    # Create pairwise word alignments between all kept sentences.
    print("\t Generate pairwise alignment between sentences...")
    align_matrix = make_alignment_matrix(sents)

    # Split sentences into clusters whose alignments are mutually consistent.
    sents_cluster = create_valid_groups(align_matrix, tk_sents)

    # Merge each cluster into the word lattice.
    fsa = create_fsa(tk_sents)
    for cluster in sents_cluster:
        fsa = process_sents(fsa, tk_sents, align_matrix, cluster)

    sent_num = len(sents)
    tk_num = sum(len(tk_list) for tk_list in tk_sents.values())
    return fsa, sent_num, sents, tk_num
def main():
    # Smoke test for the preprocessing helpers, using three word-order
    # variants of the same sentence as input.
    test_sents = ['This is simply a test.',
                  'is this simply a test?',
                  'simply a test this is.']
    aligns = make_alignment_matrix(test_sents)

    # Criss-cross detection.
    print(split_crisscross(aligns))

    # Many-to-one detection (uncomment to inject a non-injective mapping):
    #aligns[2][0][2][1] = 3
    print(split_none_injective(aligns))

    # Non-transitivity detection (uncomment to inject a violation):
    #aligns[2][0][1][1] = 4
    print(split_none_transitive(aligns))

    # All validity conditions together.
    # NOTE(review): other call sites pass a second tk_sents argument to
    # create_valid_groups — confirm this one-argument call is still valid.
    print(create_valid_groups(aligns))
def build_graph(sents):
    """Build a word-lattice FSA for *sents* plus a node-label mapping.

    Args:
        sents: list of sentence strings.

    Returns:
        Tuple ``(fsa, names)``: the lattice graph and a dict mapping
        node id -> display label ('START', 'END', or a '/'-joined set of
        lowercased surface words stored at that node).
    """
    # Tokenize every sentence with the module-level spaCy pipeline.
    tk_sents = {i: [tk.text for tk in nlp(sent)] for i, sent in enumerate(sents)}

    # Choose the alignment backend; the clustering step is identical either
    # way, so it is hoisted out of the branch (was duplicated in both arms).
    if sultan_aligner:
        align_matrix = make_alignment_matrix(sents)
        #merge_chunks(align_matrix, tk_sents, sents)
    else:
        align_matrix = make_alignment_matrix_with_rules(sents)
    sents_cluster = create_valid_groups(align_matrix, tk_sents)
    #sents_cluster = [range(len(align_matrix))]

    # Merge each consistent cluster into the lattice.
    fsa = create_fsa(tk_sents)
    for cluster in sents_cluster:
        fsa = process_sents(fsa, tk_sents, align_matrix, cluster)

    # Build human-readable node labels from the module-level idx_to_node map.
    names = {}
    for node_str, values_str in idx_to_node.items():
        node = int(node_str)
        if node == start_state:
            names[node] = 'START'
        elif node == end_state:
            names[node] = 'END'
        else:
            # HACK: values_str is a stringified dict produced elsewhere in
            # this module; eval() is tolerable only because the input is
            # internal, never user-supplied. Prefer ast.literal_eval if the
            # serialized format allows it.
            values = eval(values_str) if values_str != "" else {}
            all_words = list(set([values[x][1].lower() for x in values]))
            names[node] = '/'.join(all_words)
    return fsa, names