def doc2vecRes():
    '''
    Run the preprocessing helpers and return the two Doc2Vec models.

    Calls ``dataPropre()`` to obtain the data set, hands it to
    ``splitWords`` and then calls ``doc2vec()``, whose result is unpacked
    into exactly two values.

    NOTE(review): ``splitWords`` is presumably used for its side effects
    (its return value is ignored) and ``doc2vec()`` presumably yields the
    DBOW and DM models in that order -- confirm against their definitions.

    Returns:
        A ``(dbow, dm)`` tuple, in the order produced by ``doc2vec()``.
    '''
    corpus_frame = dataPropre()
    splitWords(corpus_frame)

    dbow_model, dm_model = doc2vec()

    # Attaching the document vectors to the frame is currently disabled:
    # corpus_frame['queryVec_dbow'] = dbow_model.docvecs
    # corpus_frame['queryVec_dm'] = dm_model.docvecs
    return dbow_model, dm_model
def myAcc(y_true, y_pred):
    '''
    Compute the accuracy of a set of predictions.

    Args:
        y_true: True class labels, compared element-wise with the
            predicted labels.
        y_pred: Per-sample scores; the predicted label of each row is the
            index of its largest entry along axis 1.

    Returns:
        The mean of the element-wise equality between ``y_true`` and the
        arg-max labels, i.e. the fraction of matching positions.
    '''
    predicted_labels = np.argmax(y_pred, axis=1)
    is_correct = y_true == predicted_labels
    return np.mean(is_correct)


# Load the feature data and the labels: the DBOW and DM document vectors are
# concatenated to form the features, and the age bracket ('agePd') is the label.
dbow = Doc2Vec.load(r'D:\DPIDataAnalysis\model\dbow_d2v.model')
dm = Doc2Vec.load(r'D:\DPIDataAnalysis\model\dm_d2v.model')
alldata = dataPropre()

# One row per document: the DBOW vector followed by the DM vector.
X_sp = np.array(
    [
        np.append(dbow.docvecs[doc_idx], dm.docvecs[doc_idx])
        for doc_idx in range(len(dbow.docvecs))
    ]
)
ys = {'agePd': np.array(alldata['agePd'])}

# The first 12000 samples are the training samples; 5-fold cross-validation.
trainNum = 12000
crossVali = 5

X = X_sp[:trainNum]  # training set + validation set
X_te = X_sp[trainNum:]  # test set
y = ys['agePd'][:trainNum]
@author: ASY 使用朴素贝叶斯分类器完成第一层stack """ import tensorflow as tf import numpy as np from math import sqrt import pandas as pd from queryDataPro import dataPropre from sklearn.cross_validation import KFold ''' 对所有汉字字符编码,使用与char-rnn中文文本生成相同的方式 ''' query_all = dataPropre()['queries'].values label_all = dataPropre()['agePd'].values ''' 求P(Y),key为对应类别label ''' value_counts = pd.value_counts(label_all) p_y = {} for i in value_counts.index: p_y[i] = value_counts[i] / len(label_all) ''' 求似然函数P(X|Y),根据独立分布假设 ''' p_xy = {} label_prob_all = np.zeros((len(pd.value_counts(label_all)), len(label_all))) for i in value_counts.index: '''