import sys
import json
import warnings
from time import time
from collections import Counter

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import RFormula, VectorAssembler
from pyspark.ml.classification import GBTClassificationModel

# Project-local helpers, referenced under these aliases at the call sites
# below (adjust the import paths to the project layout):
# too - I/O, preprocessing and reporting utilities
# mlp - PCA and resampling utilities
# dmp - distributed (Spark) model utilities


def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
    model_name = 'Distr_GBTClassifier'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'supervision')
    name_dict, options, task_id, job_id, train_result_dir, \
    names_str, names_num, names_show, Y_names, dir_of_inputdata, \
    dir_of_outputdata, open_pca, train_size, test_size, normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (str(task_id) + '_' + str(job_id) + '_' + model_name)
    dir_of_storeModel = train_result_dir + '/%s_model' % (str(task_id) + '_' + str(job_id) + '_' + model_name)

    # Configure the Spark session
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("GBTClassifier_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc = sess.sparkContext
    sc.setLogLevel("ERROR")

    if options == 'train':
        time_start = time()
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # For quick testing
        #dataset = dataset[0:1000]
        # Cap the amount of majority-class data
        #dataset = too.CalcMostLabel(dataset,Y_names)
        Y_datavec = dataset[Y_names].values
        # Print the count of each label
        print 'Counter:original y', Counter(Y_datavec)
        print '----------------------------------------------'
        # Extract the string and numeric fields separately, then merge them
        X_datavec, X_columns, vocabset, datavec_show_list = too.Merge_form(dataset, names_str, names_num, names_show, 'vocabset', 'open')
        # Normalize the data
        X_datavec = too.Data_process(X_datavec, normalized_type)
        # Handle class imbalance
        #X,Y = mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        #X,Y = mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X, Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num, ret = mlp.GS_PCA(X)
            print 'PCA Information:', pca_num, ret
            print '----------------------------------------------'
            ret_num = ret['99%']
            X = mlp.Model_PCA(X, ret_num)
        # Persist the vocabset list and ret_num
        too.StorePara(dir_of_storePara, vocabset, ret_num)

        print '--------------Train data shape----------------'
        print 'X.shape:', X.shape
        print '----------------------------------------------'
        print 'Y.shape:', Y.shape
        print '----------------------------------------------'
        print '--------------Start %s model------------------' % model_name

        features = pd.DataFrame(X)
        targets = pd.DataFrame(Y, columns=['Y'])
        # Merge the feature and target matrices
        merged = pd.concat([features, targets], axis=1)
        # Create a Spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # Extract features and target
        formula = RFormula(formula='Y ~ .', featuresCol="features", labelCol="label")
        raw_df = formula.fit(raw_df).transform(raw_df)
        # Split into training and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size], seed=666)
        # Fit the model
        clf_model = dmp.Distr_GBTClassifier(xy_train, xy_test)
        # Save the fitted model
        clf_model.write().overwrite().save(dir_of_storeModel)
        print '----------------------------------------------'
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time() - time_start)
        print 'Total run time: %s' % duration

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara, 'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # Extract the string and numeric fields separately, then merge them
        X_datavec, datavec_show_list = too.Merge_form(dataset, names_str, names_num, names_show, vocabset, 'close')
        # Normalize the data
        X = too.Data_process(X_datavec, normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X, ret_num)

        print '-------------Predict data shape---------------'
        print 'X.shape:', X.shape
        print '----------------------------------------------'
        print '--------------Start %s model------------------' % model_name

        features = pd.DataFrame(X)
        # Create a Spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns, outputCol='features').transform(raw_features)
        clf_model = GBTClassificationModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time() - time_start)
        print 'Total run time: %s' % duration
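
# dmp.Distr_GBTClassifier is a project-local helper that is not shown here.
# Below is a minimal, hypothetical sketch of what such a trainer could look
# like, inferred only from the call site above (train/test Spark DataFrames
# in, fitted model out); the project's actual implementation may differ.
def Distr_GBTClassifier_sketch(xy_train, xy_test):
    from pyspark.ml.classification import GBTClassifier
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    # Fit gradient-boosted trees on the 'features'/'label' columns that the
    # RFormula stage produced.
    gbt = GBTClassifier(featuresCol='features', labelCol='label', maxIter=20)
    model = gbt.fit(xy_train)
    # Report held-out accuracy so the caller can judge the fit.
    predictions = model.transform(xy_test)
    evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
    print 'Test accuracy:', evaluator.evaluate(predictions)
    return model
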
import sys
import json
import warnings
from time import time
from collections import Counter

import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import load_model

# Project-local helpers (adjust the import paths to the project layout):
# too - I/O, preprocessing and reporting utilities
# mlp - PCA and resampling utilities
# dlp - deep-learning model utilities


def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
    model_name = 'MFNNCla'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'supervision')
    name_dict, options, task_id, job_id, train_result_dir, \
    names_str, names_num, names_show, Y_names, dir_of_inputdata, \
    dir_of_outputdata, open_pca, train_size, test_size, normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (str(task_id) + '_' + str(job_id) + '_' + model_name)
    dir_of_storeModel = train_result_dir + '/%s_model.h5' % (str(task_id) + '_' + str(job_id) + '_' + model_name)

    if options == 'train':
        time_start = time()
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # For quick testing
        #dataset = dataset[0:1000]
        # Cap the amount of majority-class data
        #dataset = too.CalcMostLabel(dataset,Y_names)
        Y_datavec = dataset[Y_names].values
        # Print the count of each label
        print 'Counter:original y', Counter(Y_datavec)
        print '----------------------------------------------'
        # Extract the string and numeric fields separately, then merge them
        X_datavec, X_columns, vocabset, datavec_show_list = too.Merge_form(dataset, names_str, names_num, names_show, 'vocabset', 'open')
        # Normalize the data
        X_datavec = too.Data_process(X_datavec, normalized_type)
        # Handle class imbalance
        #X,Y = mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        #X,Y = mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X, Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num, ret = mlp.GS_PCA(X)
            print 'PCA Information:', pca_num, ret
            print '----------------------------------------------'
            ret_num = ret['99%']
            X = mlp.Model_PCA(X, ret_num)
        # Persist the vocabset list and ret_num
        too.StorePara(dir_of_storePara, vocabset, ret_num)

        print '--------------Train data shape----------------'
        print 'X.shape:', X.shape
        print '----------------------------------------------'
        print 'Y.shape:', Y.shape
        print '----------------------------------------------'
        print '--------------Start %s model------------------' % model_name

        # Split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=train_size, test_size=test_size, stratify=Y, random_state=0)
        clf_model = dlp.GS_cla_MFNN(X_train, X_test, y_train, y_test)
        # Save the fitted model
        clf_model.save(dir_of_storeModel)
        print '----------------------------------------------'
        too.Predict_test_data(X_test, y_test, datavec_show_list, names_show, clf_model, dir_of_outputdata, 'MFNN')
        duration = too.Duration(time() - time_start)
        print 'Total run time: %s' % duration

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara, 'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # Extract the string and numeric fields separately, then merge them
        X_datavec, datavec_show_list = too.Merge_form(dataset, names_str, names_num, names_show, vocabset, 'close')
        # Normalize the data
        X = too.Data_process(X_datavec, normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X, ret_num)

        print '-------------Predict data shape---------------'
        print 'X.shape:', X.shape
        print '----------------------------------------------'
        print '--------------Start %s model------------------' % model_name

        clf_model = load_model(dir_of_storeModel)
        too.Predict_data(X, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time() - time_start)
        print 'Total run time: %s' % duration
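
# dlp.GS_cla_MFNN is a project-local helper that is not shown here. Judging
# by its name and the .h5 save path, it fits (and likely grid-searches) a
# Keras multilayer feed-forward network. A minimal, hypothetical sketch under
# that assumption; layer sizes are illustrative and labels are assumed to be
# integer-encoded from 0.
def GS_cla_MFNN_sketch(X_train, X_test, y_train, y_test):
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras.utils import to_categorical
    n_classes = len(set(y_train))
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=X_train.shape[1]))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, to_categorical(y_train), epochs=50, batch_size=128,
              validation_data=(X_test, to_categorical(y_test)))
    return model
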
import sys
import warnings
from time import time

import pandas as pd

# Project-local helpers (adjust the import paths to the project layout):
# too - I/O, preprocessing and reporting utilities
# mlp - PCA utilities
# dlp - deep-learning model utilities


def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
    model_name = 'DeepAutoEncoder'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'non-supervision')
    name_dict, task_id, job_id, train_result_dir, \
    names_str, names_num, names_show, \
    dir_of_inputdata, dir_of_outputdata, open_pca, normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (str(task_id) + '_' + str(job_id) + '_' + model_name)

    column_names = names_str + names_num
    column_names_show = names_str + names_num + names_show

    time_start = time()
    # Load the data
    dataset = pd.read_csv(dir_of_inputdata)
    # For quick testing
    #dataset = dataset[0:1000]
    dataset_show = dataset[column_names_show]
    # Get the string and numeric fields separately
    dataset_str = dataset[names_str]
    dataset_num = dataset[names_num]
    dataset_str_list = dataset_str.values.tolist()
    datavec_num_list = dataset_num.values.tolist()
    vocabset = too.CreateVocabList(dataset_str_list)
    datavec_str_list = too.BagofWords2Vec(vocabset, dataset_str_list)
    #vocabset_index = {y:i for i,y in enumerate(vocabset)}
    # Convert the lists to DataFrames and join the two tables
    datavec_str = pd.DataFrame(datavec_str_list, columns=vocabset)
    datavec_num = pd.DataFrame(datavec_num_list, columns=names_num)
    # Left join; the right table may be empty
    data_tem = pd.merge(datavec_num, datavec_str, how="left", right_index=True, left_index=True)
    X_datavec = data_tem.values
    # Normalize the data
    X = too.Data_process(X_datavec, normalized_type)
    ret_num = 'no_num'
    # PCA dimensionality reduction
    if open_pca == 'open_pca':
        pca_num, ret = mlp.GS_PCA(X)
        print 'PCA Information:', pca_num, ret
        print '----------------------------------------------'
        ret_num = ret['99%']
        X = mlp.Model_PCA(X, ret_num)
    # Persist the vocabset list and ret_num
    too.StorePara(dir_of_storePara, vocabset, ret_num)

    print '----------------data shape--------------------'
    print 'X.shape:', X.shape
    print '----------------------------------------------'

    loss, X_diff_loss = dlp.Model_deep_auto_encoder(X)
    clst_labels = dlp.GS_deep_auto_encoder_parameter(loss, X_diff_loss)
    # Find the cause of each anomaly and report the anomalous and baseline values
    exception_data, base_value = too.Find_exception_reason(X, dataset_show, clst_labels, column_names)
    # Save the anomaly results
    exception_data.to_csv(dir_of_outputdata, index=False)
    duration = too.Duration(time() - time_start)
    print 'Total run time: %s' % duration
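
# dlp.Model_deep_auto_encoder and dlp.GS_deep_auto_encoder_parameter are
# project-local helpers that are not shown here. The call site suggests the
# first trains a deep autoencoder and returns the final training loss plus a
# per-sample reconstruction error, which the second then thresholds into
# normal/anomaly cluster labels. A minimal, hypothetical sketch of the first
# helper under that assumption; layer sizes are illustrative.
def Model_deep_auto_encoder_sketch(X):
    import numpy as np
    from keras.models import Sequential
    from keras.layers import Dense
    n_features = X.shape[1]
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=n_features))
    model.add(Dense(8, activation='relu'))             # bottleneck
    model.add(Dense(32, activation='relu'))
    model.add(Dense(n_features, activation='linear'))  # reconstruction
    model.compile(optimizer='adam', loss='mse')
    history = model.fit(X, X, epochs=50, batch_size=128, verbose=0)
    loss = history.history['loss'][-1]
    # Per-sample reconstruction error: larger values flag likely anomalies.
    X_diff_loss = np.mean((model.predict(X) - X) ** 2, axis=1)
    return loss, X_diff_loss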