예제 #1
0
def feature(title, person1, person2):
    """Build the feature list for a title that mentions two persons.

    The title is split by ``divide_line`` using the two person names, every
    resulting segment is tokenized with ``Seg``, and the words / POS tags
    gathered so far are matched against the extracted feature sets through
    ``sub_feature``. Matches are merged into the result with
    ``add_features``.

    NOTE(review): the word and POS lists are never reset between segments,
    so the lookups for the 2nd and 3rd segment also see the tokens of the
    earlier segments -- confirm that this accumulation is intended.

    Returns the list that ``add_features`` has been filling.
    """
    if not len(extracted_featureset_dict):
        # Feature sets are not loaded yet: read them from file.
        read_extracted_featureset()

    # 1-based segment position -> ordered (match on POS tags?, feature-set key).
    lookup_plan = {
        1: ((False, 'left_words'), (True, 'p1_left_pos')),
        2: ((False, 'middle_words'), (True, 'p1_right_pos'), (True, 'p2_left_pos')),
        3: ((False, 'right_words'), (True, 'p2_right_pos')),
    }

    words = []
    tags = []
    collected = []
    for position, segment in enumerate(divide_line(title, person1, person2), 1):
        # Substitute a blank so that Seg() does not fail on an empty segment.
        text = ' ' if segment == '' else segment
        for token in Seg(text):
            words.append(token[0])
            tags.append(token[1])
        # Positions beyond the third are still segmented but yield no features.
        for use_tags, key in lookup_plan.get(position, ()):
            source = tags if use_tags else words
            matched = sub_feature(source, extracted_featureset_dict[key])
            add_features(matched, collected)
    return collected
예제 #2
0
# Group every training instance by its relation type.
if __name__ == '__main__':
    # One preprocessed output file is created per relation.
    file_out_list = []
    fo_relation_list = []
    # The input is opened first so that a missing training set fails before
    # any output file gets created or truncated.
    with open(raw_trainingset, 'r') as fi_train:
        try:
            for i in range(len(relations)):
                filename = os.path.join(data_dir, trainset_prefix + str(i))
                file_out_list.append(filename)
                # NOTE(review): str.decode() only exists on Python 2; kept
                # unchanged so the script behaves as before on that runtime.
                fo_relation_list.append(open(filename.decode('utf-8'), 'w'))
            # Read the training set line by line.
            for line in fi_train:
                corpus = gen_training_corpus(line)
                # Split the news title into 3 parts, using the person names
                # as separators.
                sp_list = divide_line(corpus.title, corpus.person1, corpus.person2)
                parts = []
                for sp in sp_list:
                    if sp != '':
                        # Segment with the Chinese Academy of Sciences
                        # word segmenter; emit "word:pos;" per token.
                        parts.append(''.join(
                            ['%s:%s;' % (t[0], t[1]) for t in Seg(sp)]))
                    else:
                        # Empty segments still occupy a slot between '||'.
                        parts.append('')
                fo_relation_list[corpus.label].write('||'.join(parts) + '\n')
        finally:
            # Close every output handle even when processing fails midway.
            for fo in fo_relation_list:
                fo.close()