def feature(title, person1, person2):
    """Build the feature vector for one (title, person1, person2) instance.

    The title is split into three segments around the two person names;
    each segment is word-segmented and POS-tagged, and word/POS features
    are looked up in the globally cached extracted feature sets.
    """
    # Lazily load the extracted feature sets from file on first use.
    if not extracted_featureset_dict:
        read_extracted_featureset()

    segments = divide_line(title, person1, person2)

    words = []      # tokens seen so far (accumulates across segments)
    pos_tags = []   # POS tags, parallel to `words`
    features = []
    seg_no = 0

    for segment in segments:
        if segment == '':
            segment = ' '  # guard: Seg() errors on an empty string
        for token in Seg(segment):
            words.append(token[0])
            pos_tags.append(token[1])
        seg_no += 1

        # NOTE(review): `words`/`pos_tags` are never reset between segments,
        # so later segments are matched against the accumulated token lists —
        # confirm with sub_feature() whether that is intended.
        if seg_no == 1:    # segment left of person1
            add_features(sub_feature(words, extracted_featureset_dict['left_words']), features)
            add_features(sub_feature(pos_tags, extracted_featureset_dict['p1_left_pos']), features)
        elif seg_no == 2:  # segment between the two persons
            add_features(sub_feature(words, extracted_featureset_dict['middle_words']), features)
            add_features(sub_feature(pos_tags, extracted_featureset_dict['p1_right_pos']), features)
            add_features(sub_feature(pos_tags, extracted_featureset_dict['p2_left_pos']), features)
        elif seg_no == 3:  # segment right of person2
            add_features(sub_feature(words, extracted_featureset_dict['right_words']), features)
            add_features(sub_feature(pos_tags, extracted_featureset_dict['p2_right_pos']), features)

    return features
# Group the training instances by relation type: read the raw training set
# and write one pre-processed file per relation label.
if __name__ == '__main__':
    fi_train = open(raw_trainingset, 'r')

    # Create one pre-processing output file per relation.
    file_out_list = []
    fo_relation_list = []
    for i in range(len(relations)):
        filename = os.path.join(data_dir, trainset_prefix + str(i))
        file_out_list.append(filename)
        # NOTE(review): str.decode() is Python 2-only; this line will raise
        # AttributeError under Python 3 — drop the decode() when porting.
        fo_relation_list.append(open(filename.decode('utf-8'), 'w'))

    try:
        for line in fi_train:
            corpus = gen_training_corpus(line)
            # Split the news title into 3 parts, delimited by the person names.
            sp_list = divide_line(corpus.title, corpus.person1, corpus.person2)
            n_parts = len(sp_list)
            pieces = []  # collect fragments, join once (avoids O(n^2) += concat)
            for idx, sp in enumerate(sp_list, start=1):
                if sp != '':
                    for t in Seg(sp):  # ICTCLAS word segmentation + POS tagging
                        pieces.append('%s:%s;' % (t[0], t[1]))
                if idx < n_parts:
                    pieces.append('||')  # separator between the 3 parts
            pieces.append('\n')
            fo_relation_list[corpus.label].write(''.join(pieces))
    finally:
        # Close all files even if a line fails to parse mid-stream.
        fi_train.close()
        for fo in fo_relation_list:
            fo.close()