示例#1
0
def test(clfs=None):
    """Evaluate per-category classifiers on the test set.

    Builds the test feature matrix with a feature loader constructed from
    the training data, collects one probability column per classifier,
    thresholds the probabilities with ``paras['threshold']``, writes the
    predicted category names to an XML file through ``rst.generate_xml``
    and prints the f-score computed by ``get_fscore``.

    Args:
        clfs: Iterable of fitted classifiers exposing ``predict_proba``;
            the classifier at position ``c`` fills column ``c`` of the
            probability matrix. When ``None`` the function returns
            immediately without doing anything.

    Returns:
        None. Results are written to ``output_path`` and printed.
    """
    # Identity check: ``== None`` would be evaluated element-wise if a
    # NumPy array of classifiers were passed in.
    if clfs is None:
        return
    print('begin test')

    # Raw test sentences and their raw labels, e.g.
    # <Opinion target="food" category="FOOD#QUALITY" polarity="negative" from="4" to="8"/>
    load_test = rst.Load(testdata_path)
    test_docs = load_test.datas
    test_docs_labels = load_test.labels

    # The feature loader is built from the training data and then applied
    # to the test documents.
    load_train = rst.Load(traindata_path)
    feature_loader = fp.LoadFeature(load_train.datas, load_train.labels)
    testX = feature_loader.get_all_feature(test_docs)
    # Convert the raw test labels to matrix form.
    testY = raw_label_process(test_docs_labels)

    # One row per test document, one column per category.
    predict_prob = np.zeros((len(test_docs), category_num))
    for col, clf in enumerate(clfs):
        # Column 1 of predict_proba (presumably the positive class).
        predict_prob[:, col] = clf.predict_proba(testX)[:, 1]

    # A single vectorised comparison serves both the label strings and the
    # 0/1 matrix used for scoring.
    over_threshold = predict_prob >= paras['threshold']

    # Category names predicted for each sentence; None when no category
    # reaches the threshold.
    labels = []
    for row in over_threshold:
        hit_ids = np.where(row)[0]
        if len(hit_ids) == 0:
            labels.append(None)
        else:
            labels.append([categorys[hit] for hit in hit_ids])

    # Write the predictions as XML so the external evaluation tool (A.jar)
    # can score them.
    rst.generate_xml(labels, input_path=testdata_path, output_path=output_path)

    # Score with the in-house metric as well; per the original note the two
    # evaluations give similar results.
    fscore = get_fscore(testY, over_threshold.astype(int))
    print('fscore:'+str(fscore))
示例#2
0
def train():
    """Fit one MLP classifier per item yielded by ``getdata`` and return them.

    Loads the training documents and labels from ``traindata_path``,
    builds the feature matrix and the label matrix, then trains an
    ``MLPClassifier`` configured from ``paras`` for every
    ``(category, X, y)`` triple produced by ``getdata``.

    Returns:
        list: The fitted classifiers, in the order ``getdata`` yielded them.
    """
    print('load train datas and labels')
    corpus = rst.Load(traindata_path)
    train_docs, train_docs_labels = corpus.datas, corpus.labels
    extractor = fp.LoadFeature(train_docs, train_docs_labels)
    print('begin train')

    features = extractor.get_all_feature(train_docs)
    targets = raw_label_process(train_docs_labels)

    # Hyper-parameters copied from the global ``paras`` mapping.
    mlp_keys = ('solver', 'hidden_layer_sizes', 'alpha',
                'learning_rate_init', 'random_state')

    models = []
    for category, category_X, category_y in getdata(features, targets):
        print('train the classfier:'+category)
        model = MLPClassifier(verbose=False,
                              **{key: paras[key] for key in mlp_keys})
        model.fit(category_X, category_y)
        models.append(model)
    return models
示例#3
0
def model_selection2():
    """Grid-search MLP hyper-parameters and the decision threshold.

    Runs five hold-out rounds (80% train / 20% evaluation). In each round
    every combination produced by ``paras_generate()`` is used to train one
    classifier per item yielded by ``getdata``; the evaluation
    probabilities are thresholded with ``i5`` and scored by ``get_fscore``.
    The mean f-score of every combination over the rounds is printed in
    ascending order, so the best combination is printed last.

    Returns:
        None. The ``(parameter string, mean f-score)`` pairs are printed.
    """
    load = rst.Load(traindata_path)
    docs = load.datas
    docs_labels = load.labels
    # Maps "i1,i2,i3,i4,i5" -> mean f-score over the rounds finished so far.
    paras_fscore_map = {}
    for i in range(5):
        # Seed the split with the round index: a constant seed would
        # reproduce the very same split (and, with the fixed MLP seed, the
        # very same scores) in all five rounds.
        train_docs, eval_docs, train_docs_labels, eval_docs_labels = train_test_split(
            docs, docs_labels, test_size=0.2, random_state=i)
        # NOTE(review): other call sites pass the labels to LoadFeature as a
        # second argument; confirm the single-argument form is intended.
        feature_loader = fp.LoadFeature(train_docs)
        trainX = feature_loader.get_all_feature(train_docs)
        trainY = raw_label_process(train_docs_labels)
        evalX = feature_loader.get_all_feature(eval_docs)
        evalY = raw_label_process(eval_docs_labels)

        for i1, i2, i3, i4, i5 in paras_generate():
            paras_str = ','.join(str(p) for p in (i1, i2, i3, i4, i5))
            print('begin'+paras_str)
            clfs = []
            # Use a distinct loop name so ``trainX`` is not rebound: every
            # parameter combination must start from the full feature matrix.
            for _category, category_X, y in getdata(trainX, trainY):
                clf = MLPClassifier(solver=i3,
                                    hidden_layer_sizes=i2,
                                    alpha=i4,
                                    learning_rate_init=i1,
                                    random_state=paras['random_state'])
                clf.fit(category_X, y)
                clfs.append(clf)

            predict_prob = np.zeros((len(eval_docs), category_num))
            for col, clf in enumerate(clfs):
                # Column 1 of predict_proba (presumably the positive class).
                predict_prob[:, col] = clf.predict_proba(evalX)[:, 1]

            # 1 where the probability reaches the threshold, 0 elsewhere.
            # astype returns a new array, so its result has to be kept.
            predict_label = (predict_prob >= i5).astype(int)
            fscore = get_fscore(evalY, predict_label)

            if i == 0:
                paras_fscore_map[paras_str] = fscore
            else:
                # Incremental mean: the stored value averages rounds
                # 0..i-1, i.e. ``i`` scores, and this is score number i+1.
                paras_fscore_map[paras_str] = (
                    paras_fscore_map[paras_str] * i + fscore) / (i + 1)

    # sorted() takes ``key`` as a keyword and returns a new list.
    for item in sorted(paras_fscore_map.items(), key=lambda kv: kv[1]):
        print(item)
示例#4
0

def qselect(A, k):
    """Return ``k`` items of ``A`` having the largest score ``item[1]``.

    Quickselect with the last element as pivot. The order of the returned
    items is a by-product of the partitioning and is not sorted.

    Args:
        A: List of indexable items whose element ``[1]`` is the score.
        k: Number of items to select.

    Returns:
        ``[]`` when ``k <= 0``; ``A`` itself when it holds fewer than ``k``
        items; otherwise a new list with exactly ``k`` items. When several
        items tie on the boundary score, the ones encountered first by the
        partition are kept.
    """
    if k <= 0:
        # Nothing requested; also keeps the partitioning below from running
        # on an empty list or recursing without progress.
        return []
    if len(A) < k:
        return A
    pivot = A[-1]
    rest = A[:-1]
    # Pivot plus everything scoring at least as high as the pivot.
    right = [pivot] + [x for x in rest if x[1] >= pivot[1]]
    rlen = len(right)
    if rlen == k:
        return right
    if rlen < k:
        # All of ``right`` is selected; fill up from the lower-scoring items.
        left = [x for x in rest if x[1] < pivot[1]]
        return qselect(left, k - rlen) + right
    # rlen > k: recursing on ``right`` itself never shrinks when its items
    # tie with the pivot, so continue with the strictly greater items only.
    greater = [x for x in right if x[1] > pivot[1]]
    if len(greater) >= k:
        return qselect(greater, k)
    # The boundary falls inside the group tied with the pivot: take as many
    # tied items as are still needed.
    ties = [x for x in right if x[1] == pivot[1]]
    return ties[:k - len(greater)] + greater


# Script entry point: load the restaurant training XML and pass its
# documents and labels to get_namelist().
if __name__ == '__main__':
    # Illustrative shape of the inputs, kept for reference only (not executed):
    # docs = [
    #     'Judging from previous posts this used to be a good place, but not any longer.',
    #     'The food was lousy - too sweet or too salty and the portions tiny.',
    #
    # ]
    # Raw label elements as they appear in the XML:
    # #<Opinion target="place" category="RESTAURANT#GENERAL" polarity="negative" from="51" to="56"/>
    # #<Opinion target="food" category="FOOD#QUALITY" polarity="negative" from="4" to="8"/>
    # raw_sent_labels = [
    #     [{'Opinion target':'good place'}],
    #     [{'Opinion target':'good food'}]
    # ]
    # NOTE: this binds traindata_path at module level; cross_validating()
    # below reads a module-level name traindata_path.
    traindata_path = './restaurant2015/ABSA-15_Restaurants_Train_Final.xml'
    load = rst.Load(traindata_path)
    get_namelist(load.datas, load.labels)
def cross_validating():
    """Grid-search the two feature-loader parameters by repeated hold-out.

    Runs five rounds with an 80% train / 20% evaluation split. In each
    round, every ``(i1, i2)`` pair from ``gridsearch_threshold()`` is passed
    to ``fp.LoadFeature``; one MLP classifier (configured from ``paras``) is
    trained per item yielded by ``tn.getdata``; the evaluation probabilities
    are thresholded with ``paras['threshold']`` and scored with
    ``tn.get_fscore``. The mean f-score of every pair over the rounds is
    printed in ascending order, so the best pair is printed last.

    Returns:
        None. The ``(parameter string, mean f-score)`` pairs are printed.
    """
    load = rst.Load(traindata_path)
    docs = load.datas
    docs_labels = load.labels
    # Maps "i1,i2" -> mean f-score over the rounds finished so far.
    paras_fscore_map = {}
    for i in range(5):
        # Seed the split with the round index: a constant seed would
        # reproduce the very same split in all five rounds.
        train_docs, eval_docs, train_docs_labels, eval_docs_labels = train_test_split(
            docs, docs_labels, test_size=0.2, random_state=i)

        trainY = tn.raw_label_process(train_docs_labels)
        evalY = tn.raw_label_process(eval_docs_labels)

        for i1, i2 in gridsearch_threshold():
            feature_loader = fp.LoadFeature(train_docs, train_docs_labels, i1,
                                            i2)
            trainX = feature_loader.get_all_feature(train_docs)
            print(trainX.shape)
            evalX = feature_loader.get_all_feature(eval_docs)
            paras_str = str(i1) + ',' + str(i2)
            print('begin' + paras_str)

            clfs = []
            # Distinct loop names keep the full feature matrix ``trainX``
            # from being rebound while iterating.
            for _category, category_X, y in tn.getdata(trainX, trainY):
                clf = MLPClassifier(
                    solver=paras['solver'],
                    hidden_layer_sizes=paras['hidden_layer_sizes'],
                    alpha=paras['alpha'],
                    learning_rate_init=paras['learning_rate_init'],
                    random_state=paras['random_state'],
                    verbose=False)
                clf.fit(category_X, y)
                clfs.append(clf)

            predict_prob = np.zeros((len(eval_docs), tn.category_num))
            for col, clf in enumerate(clfs):
                # Column 1 of predict_proba (presumably the positive class).
                predict_prob[:, col] = clf.predict_proba(evalX)[:, 1]

            # 1 where the probability reaches the threshold, 0 elsewhere.
            # astype returns a new array, so its result has to be kept.
            predict_label = (predict_prob >= paras['threshold']).astype(int)
            fscore = tn.get_fscore(evalY, predict_label)

            if i == 0:
                paras_fscore_map[paras_str] = fscore
            else:
                # Incremental mean: the stored value averages rounds
                # 0..i-1, i.e. ``i`` scores, and this is score number i+1.
                paras_fscore_map[paras_str] = (
                    paras_fscore_map[paras_str] * i + fscore) / (i + 1)

    # sorted() returns a list of (key, value) pairs; iterate it directly
    # instead of calling .items() on the list.
    for item in sorted(paras_fscore_map.items(), key=lambda x: x[1]):
        print(item)