示例#1
0
def process_files(file_list,
                  feature_function,
                  filter_func=None,
                  neg_pos_ratio=1,
                  no_of_sentence=10000,
                  start_from=0):
    """Load sentences with their companion annotations and extract features.

    Three files are read:

    * ``file_list[0]`` -- the sentences, one per line.
    * ``file_list[1]`` with every ``'.txt'`` replaced by ``'_lpos.txt'`` --
      one annotation line per sentence.
    * ``file_list[1]`` with every ``'.txt'`` replaced by ``'_trees.txt'`` --
      raw parser output, converted with
      ``stanford_parse.get_trees_from_raw``.

    Args:
        file_list: Indexable holding at least two paths (see above).
        feature_function: Callable invoked as
            ``feature_function(sentences, sentences_lpos, sentences_trees)``;
            its return value is returned unchanged.
        filter_func: Optional callable invoked as
            ``filter_func(sentences, sentences_lpos, sentences_trees,
            neg_pos_ratio, no_of_sentence, start_from)``. It must return the
            three (filtered) sequences in the same order. Skipped when falsy.
        neg_pos_ratio: Forwarded to ``filter_func`` only.
        no_of_sentence: Forwarded to ``filter_func`` only.
        start_from: Forwarded to ``filter_func`` only.

    Returns:
        Whatever ``feature_function`` returns.
    """
    print("processing file {}...".format(file_list[0]))

    # Context managers guarantee each handle is closed as soon as it has
    # been read, instead of relying on garbage collection to release it.
    with open(file_list[0]) as sentence_file:
        sentences = sentence_file.read().splitlines()
    with open(file_list[1].replace('.txt', '_lpos.txt')) as lpos_file:
        sentences_lpos = lpos_file.read().splitlines()
    with open(file_list[1].replace('.txt', '_trees.txt')) as trees_file:
        raw_trees = trees_file.read()
    sentences_trees = stanford_parse.get_trees_from_raw(raw_trees)

    if filter_func:
        sentences, sentences_lpos, sentences_trees = filter_func(
            sentences, sentences_lpos, sentences_trees, neg_pos_ratio,
            no_of_sentence, start_from)
    return feature_function(sentences, sentences_lpos, sentences_trees)
示例#2
0
def process_training_files(file_list,
                           feature_function,
                           filter_func=None,
                           neg_pos_ratio=1,
                           no_of_sentence=10000,
                           start_from=0,
                           align=False):
    """Build a training set of positive instances and their aligned negatives.

    Sentences, ``_lpos`` annotations and ``_trees`` parses are loaded the
    same way as in ``process_files``, optionally filtered, and turned into
    instances by ``feature_function``. Each instance is indexed as
    ``instance[0]`` (feature list), ``instance[1]`` (label string) and
    ``(instance[2], instance[3])`` (lookup key; the two values are also used
    as row and column into the alignment file -- presumably sentence index
    and token index, confirm against ``feature_function``).

    For every instance labelled ``'1'`` the counterpart in the paired
    sentence is looked up (sentences are paired as even/odd neighbours; the
    column comes from the ``_align.txt`` file). When the counterpart exists
    and carries a different label, the non-trigram features shared by both
    are removed from both instances, keeping only the features that differ
    plus each side's own ``WORDTRI:``/``POSTRI:`` features.

    Args:
        file_list: Indexable holding at least two paths. ``file_list[0]`` is
            the sentence file; ``file_list[1]`` is the base name from which
            the ``_lpos.txt``, ``_trees.txt`` and ``_align.txt`` paths are
            derived by replacing ``'.txt'``.
        feature_function: Callable invoked as
            ``feature_function(sentences, sentences_lpos, sentences_trees)``
            returning an iterable of mutable instances (item assignment on
            index 0 is required).
        filter_func: Optional callable with the same signature as in
            ``process_files``. Note it is always called with a ratio of 1
            and a fixed budget of 18000 sentences, not with
            ``neg_pos_ratio``/``no_of_sentence``.
        neg_pos_ratio: Desired number of negatives per positive in the
            result.
        no_of_sentence: Total number of instances wanted; split into
            positive and negative quotas using ``neg_pos_ratio``.
        start_from: Forwarded to ``filter_func`` only.
        align: When true, the ``_align.txt`` file is loaded. It is required
            as soon as at least one positive instance is present.

    Returns:
        A list with the selected positive instances followed by the selected
        negative instances.

    Raises:
        ValueError: If a positive instance is found but ``align`` is false,
            so no alignment data is available to locate its counterpart
            (previously this surfaced as a ``NameError``).
    """
    print("processing file {}...".format(file_list[0]))
    no_of_sentence_for_preprocess = 18000

    # Context managers close every handle right after reading instead of
    # leaving it to the garbage collector.
    with open(file_list[0]) as sentence_file:
        sentences = sentence_file.read().splitlines()
    with open(file_list[1].replace('.txt', '_lpos.txt')) as lpos_file:
        sentences_lpos = lpos_file.read().splitlines()
    with open(file_list[1].replace('.txt', '_trees.txt')) as trees_file:
        raw_trees = trees_file.read()
    sentences_trees = stanford_parse.get_trees_from_raw(raw_trees)

    no_of_pos = no_of_sentence / (neg_pos_ratio + 1)
    no_of_neg = no_of_pos * neg_pos_ratio

    if filter_func:
        sentences, sentences_lpos, sentences_trees = filter_func(
            sentences, sentences_lpos, sentences_trees, 1,
            no_of_sentence_for_preprocess, start_from)
    all_instances = feature_function(sentences, sentences_lpos,
                                     sentences_trees)

    aligned_file = None
    if align:
        with open(file_list[1].replace('.txt', '_align.txt')) as align_file:
            aligned_file = [
                line.split(' ') for line in align_file.read().splitlines()
            ]

    # Index instances by their (instance[2], instance[3]) key; on duplicate
    # keys the last instance wins.
    all_instances = {
        (instance[2], instance[3]): instance for instance in all_instances
    }

    filtered_all_instances = []
    neg_instances = []
    for key, current_instance in all_instances.items():
        if current_instance[1] != '1':
            continue

        sent_index, token_index = key
        # Sentences come in even/odd pairs; the original author marked the
        # odd-numbered sentence of each pair as the "incorrect" one.
        if sent_index % 2:
            aligned_sent = sent_index - 1
        else:
            aligned_sent = sent_index + 1

        if aligned_file is None:
            raise ValueError(
                "alignment data is required to pair positive instances; "
                "call process_training_files with align=True")
        aligned_ind = int(aligned_file[sent_index][token_index])

        aligned_key = (aligned_sent, aligned_ind)
        if aligned_key not in all_instances:
            continue
        aligned_instance = all_instances[aligned_key]
        if current_instance[1] == aligned_instance[1]:
            continue

        current_local, current_parse = _split_local_and_parse_features(
            current_instance[0])
        aligned_local, aligned_parse = _split_local_and_parse_features(
            aligned_instance[0])

        # Both differences are computed from the untouched feature lists
        # before either instance is rewritten. The instances are mutated in
        # place even when a quota below prevents them from being selected.
        current_only = list(set(current_parse) - set(aligned_parse))
        aligned_only = list(set(aligned_parse) - set(current_parse))
        aligned_instance[0] = aligned_only + aligned_local
        current_instance[0] = current_only + current_local

        if len(filtered_all_instances) < no_of_pos:
            filtered_all_instances.append(current_instance)
        if len(neg_instances) < no_of_neg:
            neg_instances.append(aligned_instance)
        else:
            # Negative quota reached: stop scanning altogether.
            break

    print("Training data (filtered):")
    print("no of pos: {}".format(len(filtered_all_instances)))
    print("no of neg: {}".format(len(neg_instances)))
    filtered_all_instances += neg_instances
    return filtered_all_instances


def _split_local_and_parse_features(features):
    """Split features into (WORDTRI:/POSTRI: features, all other features)."""
    local_features = []
    parse_features = []
    for feature in features:
        if "WORDTRI:" in feature or "POSTRI:" in feature:
            local_features.append(feature)
        else:
            parse_features.append(feature)
    return local_features, parse_features