Example #1
import codecs

# nlp (a spaCy pipeline), normalize_sent, make_alignment_matrix,
# create_valid_groups, create_fsa, and process_sents are assumed to be
# defined elsewhere in this module.
def create_fsa_from_file(fpath):
    """
    Build a word-lattice FSA over the sentences in a data file.

    Returns the lattice, the sentence count, the normalized sentences,
    and the total token count.
    """
    # Load sentences
    sents = []
    tk_sents = {}
    with codecs.open(fpath, encoding='utf-8') as ifile:
        idx = 0
        for line in ifile:
            line = line.strip()
            norm_line = normalize_sent(line)
            if norm_line in sents:
                print("Skipping duplicate sentence: {}".format(line))
                continue

            doc = nlp(norm_line)
            sent_tks = [tk.text for tk in doc]

            tk_sents[idx] = sent_tks
            #print("Sent {}: {}".format(idx, line))
            #print("Normalized sent: {}".format(idx, norm_line))
            sents.append(norm_line)
            idx += 1

    # Create word alignment
    print("\t Generate pairwise alignment between sentences...")
    align_matrix = make_alignment_matrix(sents)

    # Validity checking
    sents_cluster = create_valid_groups(align_matrix, tk_sents)
    #print("Sent clusters: {}".format(sents_cluster))

    # Create the word lattice
    fsa = create_fsa(tk_sents)
    for cluster in sents_cluster:
        fsa = process_sents(fsa, tk_sents, align_matrix, cluster)

    sent_num = len(sents)
    tk_num = sum(len(tk_list) for tk_list in tk_sents.values())

    # Display the word lattice
    # print("idx_to_node:{}".format(idx_to_node))
    # nx.draw_circular(fsa, with_labels=True)
    # plt.show()

    return fsa, sent_num, sents, tk_num
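
A minimal usage sketch for Example #1. The file path is hypothetical, and the node/edge counts assume the lattice is a networkx graph, as the commented-out nx.draw_circular call above suggests:

if __name__ == '__main__':
    # Build a lattice from a paraphrase file and report basic statistics.
    fsa, sent_num, sents, tk_num = create_fsa_from_file('data/paraphrases.txt')
    print("Built a lattice over {} sentences ({} tokens).".format(sent_num, tk_num))
    print("Lattice size: {} nodes, {} edges.".format(
        fsa.number_of_nodes(), fsa.number_of_edges()))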
Example #2
def main():
    # Small test for the preprocessing functions. The alignment matrix is
    # assumed to be a list of lists storing pairwise alignments.
    sents = ['This is simply a test.',
             'is this simply a test?',
             'simply a test this is.']
    alignments = make_alignment_matrix(sents)
    # Check that criss-cross alignments are detected.
    print(split_crisscross(alignments))
    # Check that many-to-one alignments are detected;
    # uncomment the next line to inject a non-injective mapping.
    #alignments[2][0][2][1] = 3
    print(split_none_injective(alignments))
    # Check that non-transitive alignments are detected;
    # uncomment the next line to inject a non-transitive mapping.
    #alignments[2][0][1][1] = 4
    print(split_none_transitive(alignments))
    # Check all conditions together. Note: the other call sites in this file
    # pass tokenized sentences as a second argument; this one-argument call
    # may reflect an older signature of create_valid_groups.
    print(create_valid_groups(alignments))
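
make_alignment_matrix itself is not shown in these examples. The sketch below is a hypothetical stand-in (the name make_alignment_matrix_naive, the exact-match rule, and the (src_idx, tgt_idx) pair layout are all assumptions, not the real aligner) that fills the list-of-lists shape the test above assumes, so the split_* checks can be exercised in isolation:

def make_alignment_matrix_naive(sents):
    """Pairwise alignments: matrix[i][j] holds (src_idx, tgt_idx) token pairs
    for sentences i and j, aligned by exact lowercase token match."""
    tokenized = [s.lower().split() for s in sents]
    n = len(tokenized)
    matrix = [[[] for _ in range(n)] for _ in range(n)]
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            # Index target tokens by surface form.
            positions = {}
            for t_idx, tok in enumerate(tokenized[j]):
                positions.setdefault(tok, []).append(t_idx)
            used = set()
            for s_idx, tok in enumerate(tokenized[i]):
                # Greedily pair each source token with the first unused
                # target token of the same form.
                for t_idx in positions.get(tok, []):
                    if t_idx not in used:
                        matrix[i][j].append((s_idx, t_idx))
                        used.add(t_idx)
                        break
    return matrix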
Example #3
import ast

# sultan_aligner, idx_to_node, start_state, and end_state are assumed to be
# module-level globals; nlp is a spaCy pipeline loaded elsewhere.
def build_graph(sents):
    origin_sents = sents
    tk_sents = {}
    for i, sent in enumerate(sents):
        doc = nlp(sent)
        tk_st = [tk.text for tk in doc]
        tk_sents[i] = tk_st

    if sultan_aligner:
        align_matrix = make_alignment_matrix(origin_sents)
    else:
        align_matrix = make_alignment_matrix_with_rules(origin_sents)
    sents_cluster = create_valid_groups(align_matrix, tk_sents)

    fsa = create_fsa(tk_sents)
    for cluster in sents_cluster:
        fsa = process_sents(fsa, tk_sents, align_matrix, cluster)

    # Map node ids to human-readable labels; idx_to_node maps stringified
    # node ids to stringified dicts of (position, word) values.
    names = {}
    for node_str, values_str in idx_to_node.items():
        node = int(node_str)
        if node == start_state:
            names[node] = 'START'
        elif node == end_state:
            names[node] = 'END'
        else:
            # ast.literal_eval is a safer drop-in for eval on repr'd dicts.
            values = ast.literal_eval(values_str) if values_str else {}
            # Sort for deterministic node labels.
            all_words = sorted({v[1].lower() for v in values.values()})
            names[node] = '/'.join(all_words)
    return fsa, names
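
A hedged visualization sketch: assuming build_graph returns a networkx graph (as the commented-out nx.draw_circular call in Example #1 suggests), the returned name map can label the lattice nodes directly:

import matplotlib.pyplot as plt
import networkx as nx

# Build a small lattice and draw it with the human-readable node labels.
fsa, names = build_graph(['This is simply a test.',
                          'is this simply a test?'])
nx.draw_circular(fsa, labels=names, with_labels=True)
plt.show()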