Пример #1
0
def doc2vecRes():
    """Prepare the data set and return the pair produced by ``doc2vec()``.

    The frame returned by ``dataPropre()`` is handed to ``splitWords()``;
    the return value of ``splitWords()`` is discarded, so whatever it
    contributes must happen as a side effect. ``doc2vec()`` is then called
    with no arguments.

    Returns:
        tuple: ``(dbow, dm)``, the two values returned by ``doc2vec()``
        (presumably the DBOW and DM models -- confirm against ``doc2vec``).
    """
    splitWords(dataPropre())
    dbow_model, dm_model = doc2vec()
    return dbow_model, dm_model
Пример #2
0

def myAcc(y_true, y_pred):
    """Compute the accuracy of predicted class scores.

    Args:
        y_true: True class indices, one per sample. May be 1-D or a column
            vector of shape (n, 1); it is flattened before comparison.
        y_pred: 2-D array-like of per-class scores; the predicted class of
            each row is the index of its largest value (argmax over axis 1).

    Returns:
        The fraction of samples whose predicted class equals the true
        class, as returned by ``np.mean``.
    """
    predicted = np.argmax(y_pred, axis=1)
    # Flatten the labels: an (n, 1) column compared with the (n,) predictions
    # would otherwise broadcast to an (n, n) matrix and silently yield a
    # wrong accuracy.
    actual = np.asarray(y_true).ravel()
    return np.mean(actual == predicted)


# Load the feature data and the labels: the DBOW and DM document vectors are
# concatenated to form the features, and the age bracket ('agePd') is the
# label.
trainNum = 12000  # the first 12000 samples are the training samples
crossVali = 5     # 5-fold cross-validation

dbow = Doc2Vec.load(r'D:\DPIDataAnalysis\model\dbow_d2v.model')
dm = Doc2Vec.load(r'D:\DPIDataAnalysis\model\dm_d2v.model')
alldata = dataPropre()

# One row per document: the dbow vector followed by the dm vector.
# The row count is taken from dbow.docvecs; dm.docvecs is indexed with the
# same positions.
X_sp = np.array(
    [np.append(dbow.docvecs[row], dm.docvecs[row])
     for row in range(len(dbow.docvecs))]
)
ys = {'agePd': np.array(alldata['agePd'])}

X = X_sp[:trainNum]         # training set + validation set
X_te = X_sp[trainNum:]      # test set
y = ys['agePd'][:trainNum]  # labels for the training + validation rows
Пример #3
0
@author: ASY

使用朴素贝叶斯分类器完成第一层stack
"""

import tensorflow as tf
import numpy as np
from math import sqrt
import pandas as pd
from queryDataPro import dataPropre
from sklearn.cross_validation import KFold
# Original note: all Chinese characters are encoded the same way as in the
# char-rnn Chinese text-generation code.

# Load the data set once and take both columns from the same frame. Calling
# dataPropre() separately for each column repeats the loading work and only
# keeps queries and labels row-aligned if every call returns identical data.
_all_data = dataPropre()
query_all = _all_data['queries'].values
label_all = _all_data['agePd'].values

# Prior P(Y): maps each class label to its relative frequency in label_all.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
value_counts = pd.Series(label_all).value_counts()
p_y = {
    label: value_counts[label] / len(label_all)
    for label in value_counts.index
}

# Likelihood P(X|Y), computed under the independence assumption.
p_xy = {}
# One row per distinct label and one column per sample; the class count is
# read from value_counts instead of counting the labels a second time.
label_prob_all = np.zeros((len(value_counts), len(label_all)))
for i in value_counts.index:
    '''