Example #1
def proc_right_segments(relation, segments):
    # Split the right-context field into 'word:pos' segments.
    seg_list = proc_line(segments, ';')
    word_list = []
    pos_list = []
    for seg in seg_list:
        if seg == '':
            continue
        sp_list = proc_line(seg, ':')
        word = sp_list[0]
        pos = sp_list[1]
        word_list.append(word)
        pos_list.append(pos)
        # Collect the set of right-context words and POS tags seen for this relation.
        total_featureset_dict[relation]['right_words'].add(word)
        total_featureset_dict[relation]['p2_right_pos'].add(pos)
    # Also keep the per-instance word and POS sequences.
    featureset_dict[relation]['right_words'].append(word_list)
    featureset_dict[relation]['p2_right_pos'].append(pos_list)
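For reference, the snippet above relies on a proc_line helper and two module-level feature dictionaries that are not shown here. A minimal sketch of what they might look like, assuming proc_line is a thin wrapper around str.split and both dictionaries are keyed by relation name (the implementations below are guesses, not taken from the source):

from collections import defaultdict

def proc_line(line, delimiter):
    # Split a line on the given delimiter and strip surrounding whitespace.
    return [part.strip() for part in line.strip().split(delimiter)]

# Unique feature values seen across the whole training set, per relation.
total_featureset_dict = defaultdict(lambda: defaultdict(set))
# Per-instance feature sequences (one list per training line), per relation.
featureset_dict = defaultdict(lambda: defaultdict(list))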
Example #2
def gen_total_featureset(relation):
    # Read the training file for this relation and accumulate its feature values.
    filename = trainset_prefix + relation
    file_in = os.path.join(data_dir, filename)
    with open(file_in, 'r', encoding='utf-8') as fi:
        for line in fi:
            # Each line holds three '||'-separated fields:
            # left context || middle context || right context.
            seg_list = proc_line(line, '||')
            proc_left_segments(relation, seg_list[0])
            proc_middle_segments(relation, seg_list[1])
            proc_right_segments(relation, seg_list[2])
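A hedged usage sketch: relation_list below is an assumed module-level list of relation names (the actual values are not given in the source), and gen_total_featureset is called once per relation to populate total_featureset_dict:

relation_list = ['couple', 'parent', 'colleague']  # illustrative values only
for relation in relation_list:
    gen_total_featureset(relation)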
Example #3
def gen_training_corpus(text):
    # Parse one tab-separated annotation line into a training instance.
    seg_list = proc_line(text, "\t")
    relation = seg_list[0].strip()
    person1 = seg_list[1].strip()
    person2 = seg_list[2].strip()
    title = seg_list[3].strip()
    has_rel = seg_list[4].strip()  # whether the relation actually holds
    if has_rel == "0":
        relation = "null"  # negative example: no relation between the two people
    label = fetch_label(relation)
    return corpus(title, person1, person2, label)
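To illustrate the expected field order (relation, person1, person2, title, has_rel flag), a hypothetical call; the field values below are placeholders, not data from the source:

sample = "couple\tPersonA\tPersonB\tA headline mentioning both PersonA and PersonB\t1"
instance = gen_training_corpus(sample)
# A line whose has_rel flag is "0" would instead be mapped to the "null" relation.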
Example #4
def read_extracted_featureset():
    # Load the extracted feature values for every feature type from disk.
    for feature_name in feature_name_list:
        filename = extracted_featset_prefix + feature_name
        file_in = os.path.join(featureset_dir, filename)
        with open(file_in, 'r') as fi:
            # Each file stores a single ';'-separated line of feature values.
            line = fi.readline()
        # Drop the empty segment produced by a trailing ';'.
        featset = [seg for seg in proc_line(line, ';') if seg != '']
        extracted_featureset_dict[feature_name] = featset
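If the feature files indeed hold one ';'-separated line each, a matching writer might look like the sketch below; write_extracted_featureset is a hypothetical name and not part of the source:

def write_extracted_featureset(feature_name, featset):
    # Persist one feature list as a single ';'-terminated line.
    filename = extracted_featset_prefix + feature_name
    file_out = os.path.join(featureset_dir, filename)
    with open(file_out, 'w') as fo:
        fo.write(';'.join(featset) + ';')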