예제 #1
0
 def load_data(self):
     """Read the raw training set and cache it on the instance as arrays.

     Each line of ``raw_trainingset`` is parsed with ``gen_training_corpus``
     and ``feature`` is applied to the parsed title and the two person
     fields.  The results are stored as:

     * ``self.features`` -- array built from one feature vector per line.
     * ``self.targets`` -- array of the matching ``corpus.label`` values.
     """
     features = []
     targets = []
     # ``with`` closes the file even when parsing or feature extraction
     # raises part-way through the training set.
     with open(raw_trainingset, 'r') as f_train:
         for line in f_train:
             corpus = gen_training_corpus(line)
             feat = feature(corpus.title, corpus.person1, corpus.person2)
             features.append(np.array(feat))
             targets.append(corpus.label)
     self.features = np.array(features)
     self.targets = np.array(targets)
예제 #2
0
def predict_main():
    """Print the most probable relation label for each line of the test file.

    Every line of ``test_file`` is parsed with ``gen_training_corpus`` and its
    title is expanded into candidates by ``gen_test_corpora``.  Lines for
    which that returns ``None`` are skipped.  For the rest, every candidate is
    scored with ``classfier.predict_proba`` and the label index with the
    highest probability across all candidates is printed between two
    separator lines (``-1`` and ``0.0`` when no probability exceeds zero).
    """
    # ``with`` guarantees the file is closed even if a helper raises.
    with open(test_file, 'r') as fi_test:
        for line in fi_test:
            corpus = gen_training_corpus(line)
            test_corpora = gen_test_corpora(corpus.title)
            # Identity test: ``== None`` can be overridden by a custom __eq__.
            if test_corpora is None:
                continue
            # Single-argument parenthesised prints produce the same output
            # under Python 2 and also parse under Python 3.
            print('--------------')
            max_proba = 0.0
            pred_label = -1
            # A distinct loop name avoids shadowing the parsed line above.
            for candidate in test_corpora:
                feats = feature(candidate.title, candidate.person1,
                                candidate.person2)
                probs = classfier.predict_proba(feats)[0]
                for i in range(len(relations)):
                    # Strict ``>`` keeps the earliest label on ties.
                    if probs[i] > max_proba:
                        max_proba = probs[i]
                        pred_label = i
            print('%s %s' % (pred_label, max_proba))
            print('--------------')
예제 #3
0
# Filename prefix for the per-relation output files; the relation index is
# appended to it when the files are created below.
trainset_prefix = "project2_TrainingSet7000_"

# Group the training instances by relation type.
if __name__ == '__main__':
    fi_train = open(raw_trainingset, 'r')
    # 为每一种关系建立一个预处理文件
    file_out_list = []
    fo_relation_list = []
    for i in range(len(relations)):
        filename = trainset_prefix + str(i)
        filename = os.path.join(data_dir, filename)
        file_out_list.append(filename)
        fo_relation_list.append(open(filename.decode('utf-8'), 'w'))
    # 读取训练集
    for line in fi_train:
        corpus = gen_training_corpus(line)
        sp_list = divide_line(corpus.title, corpus.person1, corpus.person2)# 将新闻标题以人名为分隔符划分成3部分
        string = ""
        idx = 0
        for sp in sp_list:
            if sp != '':
                for t in Seg(sp): # 调用中科院的分词
                    s = '%s:%s;' % (t[0],t[1])
                    string += s
            idx += 1
            if idx < len(sp_list):
                string += '||'
        string += '\n'
        fo_relation_list[corpus.label].write(string)
    fi_train.close()
    for fo in fo_relation_list: