Example #1
# Assumed imports, reconstructed from usage: standard library, pandas, and
# PySpark, plus the project-local helper modules too, mlp, and dmp.
import sys
import json
import warnings
from time import time
from collections import Counter

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import RFormula, VectorAssembler
from pyspark.ml.classification import GBTClassificationModel

import too   # project-local tools module (assumed importable)
import mlp   # project-local ML preprocessing module (assumed importable)
import dmp   # project-local distributed-model module (assumed importable)


def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*', action='ignore',
                            category=DeprecationWarning)
    model_name = 'Distr_GBTClassifier'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'supervision')
    name_dict,options,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,Y_names,dir_of_inputdata,\
    dir_of_outputdata,open_pca,train_size,test_size,normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)
    dir_of_storeModel = train_result_dir + '/%s_model' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)

    # Configure the Spark session
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("GBTClassifier_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc = sess.sparkContext
    sc.setLogLevel("ERROR")

    if options == 'train':
        time_start = time()
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # For quick testing:
        #dataset = dataset[0:1000]
        # Cap the number of majority-class rows:
        #dataset = too.CalcMostLabel(dataset,Y_names)
        Y_datavec = dataset[Y_names].values
        # Print the count of each label
        print('Counter: original y', Counter(Y_datavec))
        print('----------------------------------------------')
        # Extract the string and numeric fields separately, then merge them
        X_datavec, X_columns, vocabset, datavec_show_list = too.Merge_form(
            dataset, names_str, names_num, names_show, 'vocabset', 'open')
        # Normalize the data
        X_datavec = too.Data_process(X_datavec, normalized_type)
        # Handle class imbalance (alternative resamplers kept for reference):
        #X,Y = mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        #X,Y = mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X,Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num, ret = mlp.GS_PCA(X)
            print('PCA Information:', pca_num, ret)
            print('----------------------------------------------')
            ret_num = ret['99%']
            X = mlp.Model_PCA(X,ret_num)
        # Persist the vocabset list and ret_num for the predict phase
        too.StorePara(dir_of_storePara,vocabset,ret_num)

        print('--------------Train data shape----------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('Y.shape:', Y.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X)
        targets = pd.DataFrame(Y, columns=['Y'])
        # Concatenate the feature and target matrices
        merged = pd.concat([features, targets], axis=1)
        # Build a Spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # Extract features and label
        formula = RFormula(formula='Y ~ .', featuresCol="features", labelCol="label")
        raw_df = formula.fit(raw_df).transform(raw_df)
        # Split into training and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size], seed=666)
        # Train the model
        clf_model = dmp.Distr_GBTClassifier(xy_train, xy_test)
        # Save the fitted model
        clf_model.write().overwrite().save(dir_of_storeModel)
        print('----------------------------------------------')
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time()-time_start)
        print('Total run time: %s' % duration)

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara,'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # Extract the string and numeric fields separately, then merge them
        X_datavec, datavec_show_list = too.Merge_form(
            dataset, names_str, names_num, names_show, vocabset, 'close')
        # Normalize the data
        X = too.Data_process(X_datavec, normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X,ret_num)

        print('-------------Predict data shape---------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X)
        # Build a Spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns,
                                outputCol='features').transform(raw_features)
        clf_model = GBTClassificationModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time()-time_start)
        print('Total run time: %s' % duration)
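
The helper dmp.Distr_GBTClassifier above is project-local and its body is not shown. Below is a minimal sketch of one plausible shape for it, assuming the 'features'/'label' columns produced by RFormula and a binary target; the hyperparameters and the AUC metric are illustrative assumptions, not the project's actual choices.

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def Distr_GBTClassifier_sketch(xy_train, xy_test):
    # Hypothetical stand-in: Spark's GBTClassifier supports binary
    # classification; depth and iteration counts here are guesses.
    gbt = GBTClassifier(featuresCol='features', labelCol='label',
                        maxDepth=5, maxIter=50, seed=666)
    model = gbt.fit(xy_train)
    # Score the held-out split with area under the ROC curve
    evaluator = BinaryClassificationEvaluator(labelCol='label',
                                              rawPredictionCol='rawPrediction',
                                              metricName='areaUnderROC')
    print('Test AUC: %s' % evaluator.evaluate(model.transform(xy_test)))
    return model

Returning the fitted model keeps the caller's clf_model.write().overwrite().save(...) line working, since Spark ML models are MLWritable.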
Example #2
# Assumed imports, reconstructed from usage: standard library, pandas,
# scikit-learn, and Keras, plus the project-local helper modules too,
# mlp, and dlp.
import sys
import json
import warnings
from time import time
from collections import Counter

import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import load_model

import too   # project-local tools module (assumed importable)
import mlp   # project-local ML preprocessing module (assumed importable)
import dlp   # project-local deep-learning module (assumed importable)


def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*',
                            action='ignore',
                            category=DeprecationWarning)
    model_name = 'MFNNCla'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'supervision')
    name_dict,options,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,Y_names,dir_of_inputdata,\
    dir_of_outputdata,open_pca,train_size,test_size,normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)
    dir_of_storeModel = train_result_dir + '/%s_model.h5' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)

    if options == 'train':
        time_start = time()
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # For quick testing:
        #dataset = dataset[0:1000]
        # Cap the number of majority-class rows:
        #dataset = too.CalcMostLabel(dataset,Y_names)

        Y_datavec = dataset[Y_names].values
        # Print the count of each label
        print('Counter: original y', Counter(Y_datavec))
        print('----------------------------------------------')
        # Extract the string and numeric fields separately, then merge them
        X_datavec, X_columns, vocabset, datavec_show_list = too.Merge_form(
            dataset, names_str, names_num, names_show, 'vocabset', 'open')
        # Normalize the data
        X_datavec = too.Data_process(X_datavec, normalized_type)
        # Handle class imbalance (alternative resamplers kept for reference):
        #X,Y = mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        #X,Y = mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X, Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num, ret = mlp.GS_PCA(X)
            print('PCA Information:', pca_num, ret)
            print('----------------------------------------------')
            ret_num = ret['99%']
            X = mlp.Model_PCA(X, ret_num)
        # Persist the vocabset list and ret_num for the predict phase
        too.StorePara(dir_of_storePara, vocabset, ret_num)

        print('--------------Train data shape----------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('Y.shape:', Y.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            Y,
            train_size=train_size,
            test_size=test_size,
            stratify=Y,
            random_state=0)
        clf_model = dlp.GS_cla_MFNN(X_train, X_test, y_train, y_test)
        # Save the trained model (HDF5)
        clf_model.save(dir_of_storeModel)
        print('----------------------------------------------')
        too.Predict_test_data(X_test, y_test, datavec_show_list, names_show,
                              clf_model, dir_of_outputdata, 'MFNN')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara, 'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # Extract the string and numeric fields separately, then merge them
        X_datavec, datavec_show_list = too.Merge_form(dataset, names_str,
                                                      names_num, names_show,
                                                      vocabset, 'close')
        # Normalize the data
        X = too.Data_process(X_datavec, normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X, ret_num)

        print('-------------Predict data shape---------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        clf_model = load_model(dir_of_storeModel)
        too.Predict_data(X, datavec_show_list, names_show, clf_model,
                         dir_of_outputdata)
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)
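
The helper dlp.GS_cla_MFNN above is project-local and its body is not shown. Below is a minimal sketch of a multilayer feed-forward network classifier in its place, assuming binary 0/1 labels and omitting whatever hyperparameter search the GS_ prefix suggests; layer sizes, epochs, and batch size are illustrative assumptions.

from keras.models import Sequential
from keras.layers import Dense

def GS_cla_MFNN_sketch(X_train, X_test, y_train, y_test):
    # Hypothetical stand-in: a small fully-connected network
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),   # assumes binary labels in {0, 1}
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=20, batch_size=128,
              validation_data=(X_test, y_test), verbose=0)
    print('Test accuracy: %s' % model.evaluate(X_test, y_test, verbose=0)[1])
    return model

Returning the Keras model keeps the clf_model.save(dir_of_storeModel) call above working, since Keras models serialize directly to the .h5 path.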
Example #3
# Assumed imports, reconstructed from usage: standard library and pandas,
# plus the project-local helper modules too, mlp, and dlp.
import sys
import warnings
from time import time

import pandas as pd

import too   # project-local tools module (assumed importable)
import mlp   # project-local ML preprocessing module (assumed importable)
import dlp   # project-local deep-learning module (assumed importable)


def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*',
                            action='ignore',
                            category=DeprecationWarning)
    model_name = 'DeepAutoEncoder'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'non-supervision')
    name_dict,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,\
    dir_of_inputdata,dir_of_outputdata,open_pca,normalized_type = bag
    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)

    column_names = names_str + names_num
    column_names_show = names_str + names_num + names_show

    time_start = time()
    # Load the data
    dataset = pd.read_csv(dir_of_inputdata)
    # For quick testing:
    #dataset = dataset[0:1000]

    dataset_show = dataset[column_names_show]

    # Extract the string and numeric fields separately
    dataset_str = dataset[names_str]
    dataset_num = dataset[names_num]
    dataset_str_list = dataset_str.values.tolist()
    datavec_num_list = dataset_num.values.tolist()

    vocabset = too.CreateVocabList(dataset_str_list)
    datavec_str_list = too.BagofWords2Vec(vocabset, dataset_str_list)
    #vocabset_index = {y:i for i,y in enumerate(vocabset)}

    # Convert the lists to DataFrames and merge the two tables
    datavec_str = pd.DataFrame(datavec_str_list, columns=vocabset)
    datavec_num = pd.DataFrame(datavec_num_list, columns=names_num)
    # Left join on the index; the right table may be empty
    data_tem = pd.merge(datavec_num,
                        datavec_str,
                        how="left",
                        right_index=True,
                        left_index=True)
    X_datavec = data_tem.values

    # Normalize the data
    X = too.Data_process(X_datavec, normalized_type)

    ret_num = 'no_num'
    # PCA dimensionality reduction
    if open_pca == 'open_pca':
        pca_num, ret = mlp.GS_PCA(X)
        print('PCA Information:', pca_num, ret)
        print('----------------------------------------------')
        ret_num = ret['99%']
        X = mlp.Model_PCA(X, ret_num)
    # Persist the vocabset list and ret_num
    too.StorePara(dir_of_storePara, vocabset, ret_num)

    print('----------------data shape--------------------')
    print('X.shape:', X.shape)
    print('----------------------------------------------')
    loss, X_diff_loss = dlp.Model_deep_auto_encoder(X)
    clst_labels = dlp.GS_deep_auto_encoder_parameter(loss, X_diff_loss)
    # Find the cause of each anomaly and report the anomalous and baseline values
    exception_data, base_value = too.Find_exception_reason(
        X, dataset_show, clst_labels, column_names)
    # Save the anomaly results
    exception_data.to_csv(dir_of_outputdata, index=False)

    duration = too.Duration(time() - time_start)
    print('Total run time: %s' % duration)
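
The helper dlp.Model_deep_auto_encoder above is project-local and its body is not shown. Below is a minimal sketch of one plausible shape for it, assuming it trains a symmetric deep autoencoder on X and returns the final training loss plus each row's reconstruction error, which GS_deep_auto_encoder_parameter can then threshold into anomaly labels; the architecture, epochs, and batch size are illustrative assumptions.

import numpy as np
from keras.models import Sequential
from keras.layers import Dense

def Model_deep_auto_encoder_sketch(X):
    # Hypothetical stand-in: symmetric encoder/decoder around a bottleneck
    n_features = X.shape[1]
    model = Sequential([
        Dense(64, activation='relu', input_shape=(n_features,)),
        Dense(16, activation='relu'),    # bottleneck
        Dense(64, activation='relu'),
        Dense(n_features, activation='linear'),
    ])
    model.compile(optimizer='adam', loss='mse')
    history = model.fit(X, X, epochs=20, batch_size=128, verbose=0)
    loss = history.history['loss'][-1]
    # Per-row reconstruction error; larger values suggest anomalies
    X_diff_loss = np.mean((X - model.predict(X, verbose=0)) ** 2, axis=1)
    return loss, X_diff_loss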