def process_files(file_list, feature_function, filter_func=None,
                  neg_pos_ratio=1, no_of_sentence=10000, start_from=0):
    """Load a raw-sentence file plus its _lpos/_trees companions and extract features.

    Args:
        file_list: two paths; file_list[0] is the raw sentence file, and the
            companion files are derived from file_list[1] by replacing
            '.txt' with '_lpos.txt' / '_trees.txt'.
        feature_function: callable(sentences, lpos, trees) whose return
            value is passed through unchanged.
        filter_func: optional callable that subsamples the three parallel
            lists before feature extraction.
        neg_pos_ratio, no_of_sentence, start_from: forwarded to filter_func.

    Returns:
        Whatever feature_function returns for the (possibly filtered) data.
    """
    print("processing file {}...".format(file_list[0]))
    # Context managers close the handles deterministically; the original
    # left three files open for the GC to collect.
    with open(file_list[0]) as f:
        sentences = f.read().splitlines()
    with open(file_list[1].replace('.txt', '_lpos.txt')) as f:
        sentences_lpos = f.read().splitlines()
    with open(file_list[1].replace('.txt', '_trees.txt')) as f:
        sentences_trees = stanford_parse.get_trees_from_raw(f.read())
    if filter_func:
        sentences, sentences_lpos, sentences_trees = filter_func(
            sentences, sentences_lpos, sentences_trees,
            neg_pos_ratio, no_of_sentence, start_from)
    return feature_function(sentences, sentences_lpos, sentences_trees)
def process_training_files(file_list, feature_function, filter_func=None,
                           neg_pos_ratio=1, no_of_sentence=10000,
                           start_from=0, align=False):
    """Build a balanced training set of positive/negative feature instances.

    Reads the raw sentences plus their _lpos/_trees companions, extracts
    feature instances, and — when ``align`` is True — pairs each positive
    instance (label '1') with its aligned counterpart in the partner
    sentence via the _align file, keeps only the parse features that differ
    between the pair, and caps the result at ``no_of_sentence`` instances
    split ``neg_pos_ratio`` negatives per positive.

    Args:
        file_list: two paths; companions derived from file_list[1] by
            replacing '.txt' with '_lpos.txt'/'_trees.txt'/'_align.txt'.
        feature_function: callable(sentences, lpos, trees) returning
            instances shaped [features, label, sent_idx, token_idx].
        filter_func: optional subsampler applied before feature extraction.
        align: enable the alignment-based filtering described above.

    Returns:
        The filtered positive instances followed by the negative ones.
    """
    print("processing file {}...".format(file_list[0]))
    # Over-read so alignment filtering still leaves enough instances.
    no_of_sentence_for_preprocess = 18000
    # Context managers close the handles deterministically (the original
    # leaked all four file objects).
    with open(file_list[0]) as f:
        sentences = f.read().splitlines()
    with open(file_list[1].replace('.txt', '_lpos.txt')) as f:
        sentences_lpos = f.read().splitlines()
    with open(file_list[1].replace('.txt', '_trees.txt')) as f:
        sentences_trees = stanford_parse.get_trees_from_raw(f.read())
    # Integer counts: the original used '/' which is float division on
    # Python 3; '//' restores the intended whole-number quotas.
    no_of_pos = no_of_sentence // (neg_pos_ratio + 1)
    no_of_neg = no_of_pos * neg_pos_ratio
    if filter_func:
        sentences, sentences_lpos, sentences_trees = filter_func(
            sentences, sentences_lpos, sentences_trees,
            1, no_of_sentence_for_preprocess, start_from)
    all_instances = feature_function(sentences, sentences_lpos,
                                     sentences_trees)
    # Initialized up front: the original defined these only inside the
    # align branch, so align=False raised NameError at the prints below.
    # NOTE(review): with align=False this now returns [] — confirm whether
    # callers ever pass align=False and expect the raw instances instead.
    filtered_all_instances = []
    neg_instances = []
    if align:
        with open(file_list[1].replace('.txt', '_align.txt')) as f:
            aligned_file = [line.split(' ') for line in
                            f.read().splitlines()]
        # Re-key each instance by its (sentence index, token index).
        all_instances = {(inst[2], inst[3]): inst for inst in all_instances}
        for key in all_instances:
            # Only positive instances seed a pair.
            if all_instances[key][1] != '1':
                continue
            sent, ind = key
            # Sentences come in pairs: an odd index pairs with the previous
            # sentence, an even index with the next one.
            aligned_sent = sent - 1 if sent % 2 else sent + 1
            aligned_ind = int(aligned_file[sent][ind])
            if (aligned_sent, aligned_ind) not in all_instances:
                continue
            current_instance = all_instances[key]
            aligned_instance = all_instances[(aligned_sent, aligned_ind)]
            # Skip pairs that carry the same label — nothing contrastive.
            if current_instance[1] == aligned_instance[1]:
                continue
            is_local = lambda feat: "WORDTRI:" in feat or "POSTRI:" in feat
            current_local_features = [feat for feat in current_instance[0]
                                      if is_local(feat)]
            current_parse_features = [feat for feat in current_instance[0]
                                      if not is_local(feat)]
            aligned_local_features = [feat for feat in aligned_instance[0]
                                      if is_local(feat)]
            aligned_parse_features = [feat for feat in aligned_instance[0]
                                      if not is_local(feat)]
            # Keep only the parse features that differ between the two
            # sides; local n-gram (WORDTRI/POSTRI) features always stay.
            current_only = list(set(current_parse_features) -
                                set(aligned_parse_features))
            aligned_instance[0] = list(set(aligned_parse_features) -
                                       set(current_parse_features)) \
                + aligned_local_features
            current_instance[0] = current_only + current_local_features
            if len(filtered_all_instances) < no_of_pos:
                filtered_all_instances.append(current_instance)
            if len(neg_instances) < no_of_neg:
                neg_instances.append(aligned_instance)
            else:
                # Negative quota full: stop scanning entirely (matches the
                # original's early break).
                break
    print("Training data (filtered):")
    print("no of pos: {}".format(len(filtered_all_instances)))
    print("no of neg: {}".format(len(neg_instances)))
    filtered_all_instances += neg_instances
    return filtered_all_instances