예제 #1
0
def train_lr_model(train_file,model_coef,model_file,feature_num_file):
    """Fit a cross-validated logistic regression model and persist it.

    Two ``LRCV`` models are fitted with the same hyper-parameters: the first
    without an explicit ``scoring`` (its fold scores are printed under the
    label "Accuracy"), the second with ``scoring='roc_auc'`` (printed under
    the label "AUC").  Only the second model is written to disk.

    :param train_file: comma-separated training data; the first
        ``total_feature_num`` columns are read as features and the last
        column as the label, both as int32.
    :param model_coef: output path for the comma-joined coefficients of the
        fitted model (``coef_[0]``).
    :param model_file: output path for the ``joblib`` dump of the fitted model.
    :param feature_num_file: file passed to ``GF.get_feature_num`` to obtain
        the number of feature columns.
    """
    total_feature_num = GF.get_feature_num(feature_num_file)

    # Label is the last column; features are the leading columns.
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    feature_list = list(range(total_feature_num))
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)

    lr_cf = LRCV(Cs=[1], penalty='l2', tol=0.0001, max_iter=500, cv=5).fit(train_feature, train_label)
    # scores_ is keyed by class label; take the first entry's fold-score array.
    scores = list(lr_cf.scores_.values())[0]
    print('diff:%s' % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print('Accuracy:%s (+-%0.2f)' % (scores.mean(), scores.std() * 2))
    # Original author's note: on their data the mean was ~0.8426 and
    # mean +- 0.01 covered about 90% of the fold scores, so the estimate
    # was considered reliable.

    # Refit with AUC scoring; this second model is the one that gets saved.
    lr_cf = LRCV(Cs=[1], penalty='l2', tol=0.0001, max_iter=500, cv=5, scoring='roc_auc').fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print('diff:%s' % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print('AUC:%s (+-%0.2f)' % (scores.mean(), scores.std() * 2))

    coef = lr_cf.coef_[0]
    # Context manager guarantees the file is closed even if the write fails.
    with open(model_coef, 'w+') as fw:
        fw.write(','.join(str(ele) for ele in coef))
    joblib.dump(lr_cf, model_file)
예제 #2
0
def get_test_data(test_file,feature_num_file):
    """Load the test feature matrix and labels from a comma-separated file.

    :param test_file: delimited test data file read with ``np.genfromtxt``
    :param feature_num_file: file handed to ``GF.get_feature_num`` to get
        the number of feature columns
    :return: ``(test_feature, test_label)``, both parsed as float32
    """
    n_features = GF.get_feature_num(feature_num_file)
    read_opts = dict(dtype=np.float32, delimiter=",")
    # The label is the last column; features are the first n_features columns.
    labels = np.genfromtxt(test_file, usecols=-1, **read_opts)
    features = np.genfromtxt(test_file, usecols=range(n_features), **read_opts)
    return features, labels
예제 #3
0
파일: train.py 프로젝트: atm1992/Tree
def get_train_data(train_file, feature_num_file):
    """Read the training features and labels from ``train_file``.

    The feature column count comes from ``get_feature_num(feature_num_file)``.
    Both arrays are parsed as int32 from comma-separated text.

    :return: ``(train_feature, train_label)``
    """
    def _load(columns):
        # Single place for the parsing options shared by both reads.
        return np.genfromtxt(train_file,
                             dtype=np.int32,
                             delimiter=",",
                             usecols=columns)

    feature_count = get_feature_num(feature_num_file)
    labels = _load(-1)  # last column holds the label
    features = _load(list(range(feature_count)))
    return features, labels
예제 #4
0
def get_test_data(test_file,feature_num_file):
    """Load test labels and features for performance checking.

    :param test_file: file to check performance
    :param feature_num_file: file recording the total number of features
    :return: two np.array objects: test_feature, test_label (float32)
    """
    column_count = GF.get_feature_num(feature_num_file)
    # Read the label column (last) first, then the leading feature columns.
    column_specs = (-1, range(column_count))
    test_label, test_feature = [
        np.genfromtxt(test_file, dtype=np.float32, delimiter=",", usecols=spec)
        for spec in column_specs
    ]
    return test_feature, test_label
예제 #5
0
def get_train_data(train_file,feature_num_file):
    '''
    Prepare the training data.

    :param train_file: comma-separated training file
    :param feature_num_file: file passed to ``GF.get_feature_num`` to get
        the number of feature columns
    :return: ``(train_feature, train_label)``, both parsed as int32
    '''
    width = GF.get_feature_num(feature_num_file)
    parse_args = {'dtype': np.int32, 'delimiter': ','}
    # Last column is the label; the first ``width`` columns are the features.
    y = np.genfromtxt(train_file, usecols=-1, **parse_args)
    x = np.genfromtxt(train_file, usecols=range(width), **parse_args)
    return x, y
예제 #6
0
def get_test_data(test_file, feature_num_file):
    """Return ``(test_feature, test_label)`` parsed as int32 from ``test_file``.

    The number of leading feature columns is obtained from
    ``GF.get_feature_num(feature_num_file)``; the label is the last column.
    """
    feature_columns = range(GF.get_feature_num(feature_num_file))
    # Read the label first so the read order matches label-then-features.
    test_label = np.genfromtxt(test_file, usecols=-1, delimiter=',', dtype=np.int32)
    return (
        np.genfromtxt(test_file, usecols=feature_columns, delimiter=',', dtype=np.int32),
        test_label,
    )
예제 #7
0
def get_train_data(train_file, feature_num_file):
    """
    Get the training data and labels.

    Reads ``train_file`` as comma-separated int32 values: the last column is
    the label and the first ``GF.get_feature_num(feature_num_file)`` columns
    are the features.  Returns ``(train_feature, train_label)``.
    """
    total = GF.get_feature_num(feature_num_file)
    loaded = {}
    # Label is read before the features.
    for key, cols in (("label", -1), ("feature", range(total))):
        loaded[key] = np.genfromtxt(train_file,
                                    dtype=np.int32,
                                    delimiter=",",
                                    usecols=cols)
    return loaded["feature"], loaded["label"]
예제 #8
0
파일: check.py 프로젝트: atm1992/Tree
def get_test_data(test_file, feature_num_file):
    """
    Load the evaluation data.

    :param test_file: file to check performance
    :param feature_num_file: file passed to ``get_feature_num`` to get the
        number of feature columns
    :return: two np arrays: test_feature, test_label (both int32)
    """
    n_cols = get_feature_num(feature_num_file)
    options = {"dtype": np.int32, "delimiter": ","}
    # Label = last column; features = columns 0 .. n_cols - 1.
    labels = np.genfromtxt(test_file, **dict(options, usecols=-1))
    features = np.genfromtxt(test_file,
                             **dict(options, usecols=list(range(n_cols))))
    return features, labels
예제 #9
0
def get_test_data(test_file, feature_num_file):
    '''
    Parse the test data using the recorded feature count.

    :param test_file: comma-separated test data file
    :param feature_num_file: file passed to ``GF.get_feature_num`` to get
        the number of feature columns
    :return: 2 np.array:  test_feature, test_label (float32)
    '''
    def _read(cols):
        # Both reads share the same dtype and delimiter.
        return np.genfromtxt(test_file,
                             dtype=np.float32,
                             delimiter=',',
                             usecols=cols)

    num_features = GF.get_feature_num(feature_num_file)
    label_part = _read(-1)
    feature_part = _read(range(num_features))
    return feature_part, label_part
def get_train_data(train_file, feature_num_file):
    """
    Get the training data and labels for training.

    :param train_file: comma-separated training data file
    :param feature_num_file: file passed to ``GF.get_feature_num``; its
        result is converted with ``int`` to get the feature column count
    :return: ``(train_feature, train_label)``, both parsed as int32
    """
    # Original author's note: 103 feature dimensions in total.
    feature_count = int(GF.get_feature_num(feature_num_file))
    int_csv = dict(dtype=np.int32, delimiter=",")
    label_column = -1
    feature_columns = range(feature_count)
    labels = np.genfromtxt(train_file, usecols=label_column, **int_csv)
    features = np.genfromtxt(train_file, usecols=feature_columns, **int_csv)
    return features, labels
예제 #11
0
def train_lr_model(train_file, model_coef, model_file, feature_num_file):
    """Fit a cross-validated logistic regression model and persist it.

    Two ``lrcv`` models are fitted with the same hyper-parameters: the first
    without an explicit ``scoring`` (fold scores printed under "Accuracy"),
    the second with ``scoring="roc_auc"`` (printed under "AUC").  Only the
    second model is written out.

    Args:
        train_file: processed comma-separated file for lr training; the last
            column is read as the label, the leading columns as features.
        model_coef: output path for the comma-joined weights (w1, w2, ...).
        model_file: output path for the model pkl written by ``joblib``.
        feature_num_file: file recording the number of features, read via
            ``gf.get_feature_num``.
    """
    total_feature_num = gf.get_feature_num(feature_num_file)
    train_label = np.genfromtxt(train_file,
                                dtype=np.int32,
                                delimiter=",",
                                usecols=-1)
    feature_list = list(range(total_feature_num))
    train_feature = np.genfromtxt(train_file,
                                  dtype=np.int32,
                                  delimiter=",",
                                  usecols=feature_list)
    lr_cf = lrcv(Cs=[1], penalty="l2", tol=0.0001, max_iter=500,
                 cv=5).fit(train_feature, train_label)
    # list(...) is needed because dict views are not indexable on Python 3.
    scores = list(lr_cf.scores_.values())[0]
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("diff:%s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy:%s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    # Refit with AUC scoring; this second model is the one that gets saved.
    lr_cf = lrcv(Cs=[1],
                 penalty="l2",
                 tol=0.0001,
                 max_iter=500,
                 cv=5,
                 scoring="roc_auc").fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print("diff:%s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC:%s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    coef = lr_cf.coef_[0]
    # Context manager guarantees the file is closed even if the write fails.
    with open(model_coef, "w+") as fw:
        fw.write(",".join(str(ele) for ele in coef))
    joblib.dump(lr_cf, model_file)
예제 #12
0
파일: train.py 프로젝트: atm1992/LR
# -*- coding: UTF-8 -*-
import sys

import numpy as np
from sklearn.linear_model import LogisticRegressionCV as LRCV
from sklearn.externals import joblib

# Add "../" (resolved against the current working directory) to the module
# search path; presumably this is what lets the ``util`` import below resolve.
sys.path.append("../")
import util.get_feature_num as GF

# Feature count, read once at import time from ../data/feature_num.
FEATURE_NUM = GF.get_feature_num("../data/feature_num")


def train_lr_model(train_file, model_coef, model_file):
    """

    :param train_file: process file for lr training
    :param model_coef: w1, w2, ...
    :param model_file: model pkl
    """
    # 98+20=118. 所有离散特征的总维度为98,所有连续特征的总维度为20
    # 118 表示所有特征的总维度。label的维度为1,因此train_file.txt、test_file.txt的列数为119
    total_feature_num = FEATURE_NUM
    # usecols=-1 表示使用最后一列, 也就是label
    train_label = np.genfromtxt(train_file,
                                dtype=np.int32,
                                delimiter=",",
                                usecols=-1)
    feature_list = list(range(total_feature_num))
    train_feature = np.genfromtxt(train_file,
                                  dtype=np.int32,