def train_lr_model(train_file, model_coef, model_file, feature_num_file):
    """Train a cross-validated logistic regression model and persist it.

    The model is fitted twice with identical hyper-parameters: once with the
    estimator's default scoring (reported as "Accuracy") and once with
    ``scoring='roc_auc'`` (reported as "AUC").  Only the second, AUC-scored
    estimator is written to disk.

    :param train_file: comma-separated training file; the last column is read
        as the label and the leading columns as the features
    :param model_coef: output path for the comma-separated coefficients
    :param model_file: output path for the estimator dumped with ``joblib``
    :param feature_num_file: file handed to ``GF.get_feature_num`` to obtain
        the number of leading feature columns
    """
    total_feature_num = GF.get_feature_num(feature_num_file)
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    feature_list = range(total_feature_num)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)

    # First fit: default scoring, used only for the accuracy report.
    lr_cf = LRCV(Cs=[1], penalty='l2', tol=0.0001, max_iter=500, cv=5).fit(train_feature, train_label)
    # scores_ is a dict; take the score grid stored under its first key.
    scores = list(lr_cf.scores_.values())[0]
    print('diff:%s' % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print('Accuracy:%s (+-%0.2f)' % (scores.mean(), scores.std() * 2))
    # Original note from a past run: the mean 0.842616805029923 +/- 0.01
    # covers 90% of the values, which suggests that 0.842616805029923 is a
    # reliable estimate.

    # Second fit: same settings but scored by ROC AUC; this one is saved.
    lr_cf = LRCV(Cs=[1], penalty='l2', tol=0.0001, max_iter=500, cv=5,
                 scoring='roc_auc').fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print('diff:%s' % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print('AUC:%s (+-%0.2f)' % (scores.mean(), scores.std() * 2))

    coef = lr_cf.coef_[0]
    # Context manager guarantees the handle is closed even if write() raises.
    with open(model_coef, 'w+') as fw:
        fw.write(','.join(str(ele) for ele in coef))
    joblib.dump(lr_cf, model_file)
def get_test_data(test_file, feature_num_file):
    """Load the test features and labels from a comma-separated file.

    :param test_file: comma-separated file; its last column is read as the
        label and its leading columns as the features
    :param feature_num_file: file handed to ``GF.get_feature_num`` to obtain
        the number of leading feature columns
    :return: ``(test_feature, test_label)``, both parsed as ``np.float32``
    """
    n_features = GF.get_feature_num(feature_num_file)
    # Both reads share the same parsing options; only the columns differ.
    read_opts = {"dtype": np.float32, "delimiter": ","}
    test_label = np.genfromtxt(test_file, usecols=-1, **read_opts)
    test_feature = np.genfromtxt(test_file, usecols=range(n_features), **read_opts)
    return test_feature, test_label
def get_train_data(train_file, feature_num_file):
    """Load the training features and labels from a comma-separated file.

    :param train_file: comma-separated file; its last column is read as the
        label and its leading columns as the features
    :param feature_num_file: file handed to ``get_feature_num`` to obtain the
        number of leading feature columns
    :return: ``(train_feature, train_label)``, both parsed as ``np.int32``
    """
    def _load(columns):
        # Single place for the parsing options shared by both reads.
        return np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=columns)

    feature_count = get_feature_num(feature_num_file)
    labels = _load(-1)
    features = _load(list(range(feature_count)))
    return features, labels
def get_test_data(test_file, feature_num_file):
    """Read the evaluation set as a pair of NumPy arrays.

    :param test_file: file to check performance on (comma-separated; the last
        column is the label)
    :param feature_num_file: the file that records the total number of
        features, read through ``GF.get_feature_num``
    :return: two np.array: test_feature, test_label (parsed as ``np.float32``)
    """
    column_total = GF.get_feature_num(feature_num_file)
    # Label column first, then the leading feature columns.
    selections = (-1, range(column_total))
    test_label, test_feature = [
        np.genfromtxt(test_file, dtype=np.float32, delimiter=",", usecols=cols)
        for cols in selections
    ]
    return test_feature, test_label
def get_train_data(train_file, feature_num_file):
    """Prepare the training data.

    :param train_file: comma-separated file; its last column is read as the
        label and its leading columns as the features
    :param feature_num_file: file handed to ``GF.get_feature_num`` to obtain
        the number of leading feature columns
    :return: ``(train_feature, train_label)``, both parsed as ``np.int32``
    """
    feature_total = GF.get_feature_num(feature_num_file)
    csv_options = dict(dtype=np.int32, delimiter=',')
    train_label = np.genfromtxt(train_file, usecols=-1, **csv_options)
    train_feature = np.genfromtxt(
        train_file, usecols=range(feature_total), **csv_options
    )
    return train_feature, train_label
def get_test_data(test_file, feature_num_file):
    """Load the test features and labels from a comma-separated file.

    :param test_file: comma-separated file; its last column is read as the
        label and its leading columns as the features
    :param feature_num_file: file handed to ``GF.get_feature_num`` to obtain
        the number of leading feature columns
    :return: ``(test_feature, test_label)``, both parsed as ``np.int32``
    """
    width = GF.get_feature_num(feature_num_file)

    # The label lives in the last column of every row.
    labels = np.genfromtxt(
        test_file,
        dtype=np.int32,
        delimiter=',',
        usecols=-1,
    )

    # The features occupy columns 0 .. width - 1.
    leading_columns = range(width)
    features = np.genfromtxt(
        test_file,
        dtype=np.int32,
        delimiter=',',
        usecols=leading_columns,
    )

    result = (features, labels)
    return result
def get_train_data(train_file, feature_num_file):
    """Get the training features and labels.

    The last column of ``train_file`` is read as the label; the first
    ``GF.get_feature_num(feature_num_file)`` columns are read as the features.
    Both arrays are parsed as ``np.int32`` and returned as
    ``(train_feature, train_label)``.
    """
    feature_count = GF.get_feature_num(feature_num_file)
    # Unpack in read order: label column first, feature columns second.
    label_array, feature_array = (
        np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=columns)
        for columns in (-1, range(feature_count))
    )
    return feature_array, label_array
def get_test_data(test_file, feature_num_file):
    """Load the evaluation features and labels.

    :param test_file: file to check performance on (comma-separated; the last
        column is the label)
    :param feature_num_file: file handed to ``get_feature_num`` to obtain the
        number of leading feature columns
    :return: two np arrays: test_feature, test_label (parsed as ``np.int32``)
    """
    def _read_columns(selection):
        # Same file and parsing options for both reads; only columns differ.
        return np.genfromtxt(test_file, dtype=np.int32, delimiter=",", usecols=selection)

    n_columns = get_feature_num(feature_num_file)
    label_values = _read_columns(-1)
    feature_values = _read_columns(list(range(n_columns)))
    return feature_values, label_values
def get_test_data(test_file, feature_num_file):
    """Parse the test data using the recorded number of features.

    :param test_file: comma-separated file; its last column is read as the
        label and its leading columns as the features
    :param feature_num_file: file handed to ``GF.get_feature_num`` to obtain
        the number of leading feature columns
    :return: 2 np.array: test_feature, test_label (parsed as ``np.float32``)
    """
    num_features = GF.get_feature_num(feature_num_file)
    parse_kwargs = {'delimiter': ',', 'dtype': np.float32}

    test_label = np.genfromtxt(test_file, usecols=-1, **parse_kwargs)
    feature_columns = range(num_features)
    test_feature = np.genfromtxt(test_file, usecols=feature_columns, **parse_kwargs)

    return test_feature, test_label
def get_train_data(train_file, feature_num_file):
    """Get the training data and labels.

    :param train_file: comma-separated file; its last column is read as the
        label and its leading columns as the features
    :param feature_num_file: file handed to ``GF.get_feature_num``; the result
        is converted with ``int`` and used as the number of feature columns
    :return: ``(train_feature, train_label)``, both parsed as ``np.int32``
    """
    raw_count = GF.get_feature_num(feature_num_file)
    # Original note: 103 feature dimensions in total.
    n_dims = int(raw_count)

    label_part = np.genfromtxt(
        train_file, dtype=np.int32, delimiter=",", usecols=-1
    )
    feature_part = np.genfromtxt(
        train_file, dtype=np.int32, delimiter=",", usecols=range(n_dims)
    )
    return feature_part, label_part
def train_lr_model(train_file, model_coef, model_file, feature_num_file):
    """Train a cross-validated logistic regression model and persist it.

    The model is fitted twice with identical hyper-parameters: once with the
    estimator's default scoring (reported as "Accuracy") and once with
    ``scoring="roc_auc"`` (reported as "AUC").  Only the second, AUC-scored
    estimator is written to disk.

    Args:
        train_file: processed, comma-separated file for LR training; the last
            column is read as the label, the leading columns as features
        model_coef: output path for the coefficients (w1,w2,...)
        model_file: output path for the model pkl dumped with joblib
        feature_num_file: file that records the number of features, read
            through ``gf.get_feature_num``
    """
    total_feature_num = gf.get_feature_num(feature_num_file)
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    feature_list = range(total_feature_num)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)

    # First fit: default scoring, used only for the accuracy report.
    lr_cf = lrcv(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5).fit(train_feature, train_label)
    # dict.values() is a non-indexable view on Python 3, so materialise it
    # before taking the first entry; this is also valid on Python 2.
    scores = list(lr_cf.scores_.values())[0]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("diff:%s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy:%s (+-%0.2f)" % (scores.mean(), scores.std() * 2))

    # Second fit: same settings but scored by ROC AUC; this one is saved.
    lr_cf = lrcv(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5,
                 scoring="roc_auc").fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print("diff:%s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC:%s (+-%0.2f)" % (scores.mean(), scores.std() * 2))

    coef = lr_cf.coef_[0]
    # Context manager guarantees the handle is closed even if write() raises.
    with open(model_coef, "w+") as fw:
        fw.write(",".join(str(ele) for ele in coef))
    joblib.dump(lr_cf, model_file)
# -*- coding: UTF-8 -*- import sys import numpy as np from sklearn.linear_model import LogisticRegressionCV as LRCV from sklearn.externals import joblib sys.path.append("../") import util.get_feature_num as GF FEATURE_NUM = GF.get_feature_num("../data/feature_num") def train_lr_model(train_file, model_coef, model_file): """ :param train_file: process file for lr training :param model_coef: w1, w2, ... :param model_file: model pkl """ # 98+20=118. 所有离散特征的总维度为98,所有连续特征的总维度为20 # 118 表示所有特征的总维度。label的维度为1,因此train_file.txt、test_file.txt的列数为119 total_feature_num = FEATURE_NUM # usecols=-1 表示使用最后一列, 也就是label train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1) feature_list = list(range(total_feature_num)) train_feature = np.genfromtxt(train_file, dtype=np.int32,