# Imports assumed by this section (normally declared once at the top of the
# module): os, time, numpy as np, multiprocessing's Pool and Manager, and the
# project's own helper modules referenced below as init, xgbf, and mlc.


def predictMulticlassBaggingModel_parallel(MatX, nrow, ncol, varnames, num_class, params, multiclassmethod,
                                           n_gpus, n_parallels, runtimes=300, bool_save=True, savedirbase=""):
    if not bool_save:
        print("Bagging Method has to save models!")
        return
    # Read the per-runtime bagging weights and selected variable names from disk
    evalweightsFileName = "Runtime_Model_Evaluation_Weights.csv"
    selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv"
    selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
    evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
    baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
    selectruntimesvarnames = []
    for runtime in range(runtimes):
        selectruntimesvarnames.append(
            init.getListFrompdDataSet(selrunvarspdData, "SelectVarName_run" + str(runtime)))
    del selrunvarspdData
    bool_mask = init.getMask(MatX)
    # Assign runtimes to workers round-robin
    RuntimeLists = [[] for i in range(n_parallels)]
    for runtime in range(runtimes):
        worker_id = runtime % n_parallels
        RuntimeLists[worker_id].append(runtime)
    # Judge bool_gpu (default to '' so a missing tree_method does not raise)
    bool_gpu = 'gpu' in params.get('tree_method', '')
    # Open the multiprocessing pool; CPIDs lets workers coordinate process/GPU ids
    P = Pool(n_parallels)
    results_parallel = []
    manager = Manager()
    CPIDs = manager.list()
    for i in range(n_parallels):
        results_parallel.append(P.apply_async(_predictMulticlassBaggingModel,
                                              (CPIDs, RuntimeLists[i], MatX, nrow, ncol, varnames, num_class, params,
                                               selectruntimesvarnames, baggingweights, multiclassmethod,
                                               bool_gpu, n_gpus, n_parallels, bool_save, savedirbase)))
    P.close()
    P.join()
    del CPIDs
    # Sum the weighted per-worker contributions into one ensemble matrix
    pred_pY_ense = np.zeros([nrow * ncol, num_class], dtype=np.float32)
    for i in range(n_parallels):
        pred_pY_ense_para = results_parallel[i].get()
        pred_pY_ense = pred_pY_ense + pred_pY_ense_para
    # Reshape the flat (nrow*ncol, num_class) ensemble back onto the raster grid
    [pred_Y, pred_pY] = init.reshapeMulticlassMatrix(pred_pY_ense, nrow, ncol, num_class,
                                                     bool_onearray=False, mask=bool_mask.flatten())
    return [pred_Y, pred_pY]
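# --- Illustrative sketch (an addition, not part of the original API) ---
# The *_parallel functions in this section share one pattern: deal the bagging
# runtimes round-robin across n_parallels workers, let each worker return a
# partial weighted ensemble, and sum the partials in the parent. The stand-in
# worker below replaces _predictMulticlassBaggingModel; all names prefixed
# _sketch_ are hypothetical.
def _sketch_partial_ensemble(runtime_list):
    import numpy as np
    # Stand-in: each runtime contributes one weighted probability vector
    return sum(np.full(4, 0.1) * (r + 1) for r in runtime_list)

def _sketch_parallel_pattern(runtimes=10, n_parallels=3):
    import numpy as np
    from multiprocessing import Pool
    # Round-robin task assignment, exactly as in the functions of this section
    RuntimeLists = [[] for _ in range(n_parallels)]
    for runtime in range(runtimes):
        RuntimeLists[runtime % n_parallels].append(runtime)
    P = Pool(n_parallels)
    results = [P.apply_async(_sketch_partial_ensemble, (RuntimeLists[i],))
               for i in range(n_parallels)]
    P.close()
    P.join()
    # Sum the per-worker partial ensembles
    pred_pY_ense = np.zeros(4)
    for r in results:
        pred_pY_ense += r.get()
    return pred_pY_ense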
def rmvRepRecord(DataSet1, DataSet2, varnames):
    # Flag the DataSet1 rows whose values repeat DataSet2 in EVERY compared
    # variable, then return DataSet1 with those repeated records removed.
    # (dtype=bool: the np.bool alias was removed in NumPy 1.24.)
    bool_reprec = np.ones(len(DataSet1), dtype=bool)
    for varname in varnames:
        cmpDS2 = init.getListFrompdDataSet(DataSet2, varname)
        repIds = np.array(
            init.findPDRowIndex(DataSet1, varname, cmpDS2, bool_list=True))
        bool_reprec = bool_reprec & repIds
    # Indices of the non-repeated rows
    nrepRecIds = np.argwhere(~bool_reprec)[:, 0]
    rmvDataSet = DataSet1.loc[nrepRecIds, :]
    return rmvDataSet
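# --- Illustrative sketch (an addition; the init.* sources are not shown) ---
# rmvRepRecord's membership test can be reproduced with plain pandas isin():
# a row is dropped only if it repeats DataSet2 in EVERY compared variable.
def _sketch_rmvRepRecord_demo():
    import numpy as np
    import pandas as pd
    ds1 = pd.DataFrame({"lat": [1, 2, 3], "lon": [10, 20, 30]})
    ds2 = pd.DataFrame({"lat": [2, 9], "lon": [20, 90]})
    bool_reprec = np.ones(len(ds1), dtype=bool)
    for varname in ["lat", "lon"]:
        # True where ds1's value also occurs in ds2 for this variable
        bool_reprec &= ds1[varname].isin(ds2[varname]).to_numpy()
    # Keep only the rows that are not repeated in every variable
    return ds1.loc[~bool_reprec]  # drops the (2, 20) row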
def testSingleclassBaggingModel_parallel(Models, TestDataSet, vtname, params, n_gpus, n_parallels,
                                         single_thres=0.5, runtimes=300, bool_strclass=False, labelHeaderName="",
                                         bool_save=False, savedirbase=""):
    ModelList = []
    if bool_save:
        # Saved-model mode: read per-runtime weights and selected variables from disk
        evalweightsFileName = vtname + "_Runtime_Evaluation_Weight.csv"
        selectvarnamesfiledir = savedirbase + os.sep + vtname + "_Runtime_Model_Select_Variables.csv"
        evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
        baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
        selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
        selectruntimesvarnames = []
        for runtime in range(runtimes):
            selectruntimesvarnames.append(
                init.getListFrompdDataSet(selrunvarspdData, "SelectVarName_run" + str(runtime)))
        del selrunvarspdData
    else:
        # In-memory mode: Models carries the fitted models, variable lists, and weights
        [ModelList, selectruntimesvarnames, baggingweights] = Models
    # Assign runtimes to workers round-robin
    RuntimeLists = [[] for i in range(n_parallels)]
    for runtime in range(runtimes):
        worker_id = runtime % n_parallels
        RuntimeLists[worker_id].append(runtime)
    # Judge bool_gpu (default to '' so a missing tree_method does not raise)
    bool_gpu = 'gpu' in params.get('tree_method', '')
    # Open the multiprocessing pool
    P = Pool(n_parallels)
    results_parallel = []
    manager = Manager()
    CPIDs = manager.list()
    for i in range(n_parallels):
        results_parallel.append(P.apply_async(_testSingleclassBaggingModel,
                                              (CPIDs, RuntimeLists[i], TestDataSet, vtname, runtimes, params,
                                               ModelList, bool_gpu, n_gpus, n_parallels, selectruntimesvarnames,
                                               baggingweights, single_thres, bool_strclass, labelHeaderName,
                                               bool_save, savedirbase)))
    P.close()
    P.join()
    del CPIDs
    # Sum the weighted probability contributions returned by each worker
    pred_pY_ense = np.zeros(len(TestDataSet))
    for i in range(n_parallels):
        [pred_Y, pred_pY_ense_para, test_Y] = results_parallel[i].get()
        pred_pY_ense = pred_pY_ense + pred_pY_ense_para
    # Threshold the ensemble probabilities into binary labels
    pred_Y = (pred_pY_ense >= single_thres) * 1
    return [pred_Y, pred_pY_ense, test_Y]
def testMulticlassBaggingModel_parallel(TestDataSet, VegeTypes, params, multiclassmethod, n_gpus, n_parallels,
                                        runtimes=300, bool_strclass=False, labelHeaderName="",
                                        bool_save=True, savedirbase=""):
    if not bool_save:
        print("Bagging Method has to save models!")
        return
    # Read the per-runtime bagging weights and selected variable names from disk
    evalweightsFileName = "Runtime_Model_Evaluation_Weights.csv"
    selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv"
    evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
    baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
    selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
    selectruntimesvarnames = []
    for runtime in range(runtimes):
        selectruntimesvarnames.append(
            init.getListFrompdDataSet(selrunvarspdData, "SelectVarName_run" + str(runtime)))
    del selrunvarspdData
    # Assign runtimes to workers round-robin
    RuntimeLists = [[] for i in range(n_parallels)]
    for runtime in range(runtimes):
        worker_id = runtime % n_parallels
        RuntimeLists[worker_id].append(runtime)
    # Judge bool_gpu (default to '' so a missing tree_method does not raise)
    bool_gpu = 'gpu' in params.get('tree_method', '')
    P = Pool(n_parallels)
    results_parallel = []
    manager = Manager()
    CPIDs = manager.list()
    for i in range(n_parallels):
        results_parallel.append(P.apply_async(_testMulticlassBaggingModel,
                                              (CPIDs, RuntimeLists[i], TestDataSet, VegeTypes, params,
                                               multiclassmethod, bool_gpu, n_gpus, n_parallels,
                                               selectruntimesvarnames, baggingweights, bool_strclass,
                                               labelHeaderName, bool_save, savedirbase)))
    P.close()
    P.join()
    del CPIDs
    # Sum the weighted per-worker class votes and take the argmax per sample
    pred_pY_ense = np.zeros([len(TestDataSet), len(VegeTypes)])
    for i in range(n_parallels):
        [pred_Y, pred_pY_ense_para, test_Y] = results_parallel[i].get()
        pred_pY_ense = pred_pY_ense + pred_pY_ense_para
    pred_Y = np.argmax(pred_pY_ense, axis=1)
    return [pred_Y, pred_pY_ense, test_Y]
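# --- Illustrative note (an addition) ---
# The recurring "Judge bool_gpu" step is a plain substring test: XGBoost
# tree_method values such as "gpu_hist" request GPU execution, so any 'gpu'
# in the string switches the workers into GPU mode.
def _sketch_judge_bool_gpu():
    params = {"objective": "multi:softprob", "tree_method": "gpu_hist"}
    bool_gpu = 'gpu' in params.get('tree_method', '')
    return bool_gpu  # True for "gpu_hist"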
def testSingleclassBaggingModel(Models, TestDataSet, vtname, params, single_thres=0.5, runtimes=300,
                                bool_strclass=False, labelHeaderName="", bool_save=False, savedirbase=""):
    ModelList = []
    if bool_save:
        # Saved-model mode: read per-runtime weights and selected variables from disk
        evalweightsFileName = vtname + "_Runtime_Evaluation_Weight.csv"
        selectvarnamesfiledir = savedirbase + os.sep + vtname + "_Runtime_Model_Select_Variables.csv"
        evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
        ense_weights = init.getListFromPandas(evalweightsFiledirto, 'weight')
        selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
        selectruntimesvarnames = []
        for runtime in range(runtimes):
            selectruntimesvarnames.append(
                init.getListFrompdDataSet(selrunvarspdData, "SelectVarName_run" + str(runtime)))
        del selrunvarspdData
    else:
        # In-memory mode: Models carries the fitted models, variable lists, and weights
        [ModelList, selectruntimesvarnames, ense_weights] = Models
    pred_pY_ense = np.zeros(len(TestDataSet))
    for runtime in range(runtimes):
        print("Predicting runtime = %d" % runtime)
        if bool_save:
            savedir = savedirbase + os.sep + "runtime_" + str(runtime)
            modelName = vtname + '_xgboost_singleclass_run' + str(runtime) + ".model"
            modeldir = savedir + os.sep + modelName
            model = xgbf.loadModel(modeldir, params)
        else:
            model = ModelList[runtime]
        # Rebuild the test matrix with only this runtime's selected variables
        varnames = selectruntimesvarnames[runtime]
        [test_Y, test_X] = xgbf.trainingDataSet(TestDataSet, [vtname], varnames,
                                                bool_strclass=bool_strclass,
                                                labelHeaderName=labelHeaderName, bool_binary=True)
        [pred_Y, pred_pY] = xgbf.Predict(model, test_X, bool_binary=1, threshold=single_thres)
        # Accumulate this runtime's weighted probability vote
        pred_pY_ense = pred_pY_ense + pred_pY * ense_weights[runtime]
    # Threshold the weighted ensemble probabilities into binary labels
    pred_Y = (pred_pY_ense >= single_thres) * 1
    pred_pY = pred_pY_ense
    if len(test_Y.shape) > 1:
        test_Y = test_Y[:, 0]
    return [pred_Y, pred_pY, test_Y]
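# --- Illustrative sketch (an addition; the numbers shown are made up) ---
# The single-class ensemble above is a weighted average of per-runtime
# probabilities followed by a threshold. With weights that sum to 1 the
# accumulated score stays a probability:
def _sketch_weighted_threshold_demo():
    import numpy as np
    pred_pYs = [np.array([0.9, 0.2, 0.6]),   # runtime 0
                np.array([0.8, 0.4, 0.4])]   # runtime 1
    ense_weights = [0.6, 0.4]                # e.g. normalized evaluation scores
    pred_pY_ense = np.zeros(3)
    for runtime, pred_pY in enumerate(pred_pYs):
        pred_pY_ense += ense_weights[runtime] * pred_pY
    pred_Y = (pred_pY_ense >= 0.5) * 1
    return pred_pY_ense, pred_Y  # [0.86 0.28 0.52], [1 0 1]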
def testMulticlassBaggingModel(TestDataSet, VegeTypes, params, multiclassmethod, runtimes=300,
                               bool_strclass=False, labelHeaderName="", bool_save=True, savedirbase=""):
    if not bool_save:
        print("Bagging Method has to save models!")
        return
    num_class = len(VegeTypes)
    # Read the per-runtime bagging weights and selected variable names from disk
    evalweightsFileName = "Runtime_Model_Evaluation_Weights.csv"
    selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv"
    evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
    baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
    selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
    selectruntimesvarnames = []
    for runtime in range(runtimes):
        selectruntimesvarnames.append(
            init.getListFrompdDataSet(selrunvarspdData, "SelectVarName_run" + str(runtime)))
    del selrunvarspdData
    pred_pY_ense = np.zeros([len(TestDataSet), num_class])
    for runtime in range(runtimes):
        # A zero weight marks a runtime whose model was never established
        if baggingweights[runtime] == 0:
            print("Model not established!")
            continue
        print("Predicting runtime = %d" % runtime)
        savedir = savedirbase + os.sep + "runtime_" + str(runtime)
        if multiclassmethod == 'softmax':
            [pred_Y, pred_pY, test_Y] = mlc.testMulticlassSoftmaxModel(
                [], TestDataSet, VegeTypes, selectruntimesvarnames[runtime], params,
                runtime=runtime, bool_pandas=True, bool_strclass=bool_strclass,
                labelHeaderName=labelHeaderName, bool_save=bool_save, savedir=savedir)
        elif multiclassmethod == 'category':
            [pred_Y, pred_pY, test_Y] = mlc.testMulticlassCategoryModel(
                [], TestDataSet, VegeTypes, selectruntimesvarnames[runtime], params,
                runtime=runtime, bool_pandas=True, bool_strclass=bool_strclass,
                labelHeaderName=labelHeaderName, bool_save=bool_save, savedir=savedir)
        else:
            print("Invalid Multiclass Method Input!")
            continue
        # Hard voting: expand the predicted labels to one-hot rows and add the
        # weighted votes (soft probability voting is intentionally not used here)
        pred_Y_epd = init.expandCategories(pred_Y, num_class=num_class)
        pred_pY_ense = pred_pY_ense + baggingweights[runtime] * pred_Y_epd.astype(np.float32)
    pred_Y = np.argmax(pred_pY_ense, axis=1)
    return [pred_Y, pred_pY_ense, test_Y]
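# --- Illustrative sketch (an addition; init.expandCategories' source is not
# shown, so a one-hot encoder stands in for it) ---
# The multiclass ensemble uses hard voting: each runtime's label vector is
# one-hot expanded, scaled by its bagging weight, and the class with the
# largest accumulated vote wins.
def _sketch_hard_voting_demo():
    import numpy as np

    def expand_categories(pred_Y, num_class):
        # One-hot encode a vector of integer class labels
        onehot = np.zeros((len(pred_Y), num_class), dtype=np.float32)
        onehot[np.arange(len(pred_Y)), pred_Y] = 1.0
        return onehot

    num_class = 3
    votes = np.zeros((4, num_class), dtype=np.float32)
    runs = [(0.5, np.array([0, 1, 2, 1])),   # (bagging weight, labels)
            (0.3, np.array([0, 2, 2, 1]))]
    for weight, pred_Y in runs:
        votes += weight * expand_categories(pred_Y, num_class)
    return np.argmax(votes, axis=1)  # [0 1 2 1]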
def assemFinalFeatureSet(dirto, Feature_Metrics_Values, top_percent=0.95):
    # Find the local maxima of the feature-metric curve
    localmax_ids = []
    for i in range(1, len(Feature_Metrics_Values) - 1):
        if Feature_Metrics_Values[i - 1] < Feature_Metrics_Values[i] and \
                Feature_Metrics_Values[i + 1] < Feature_Metrics_Values[i]:
            localmax_ids.append(i)
    localmax_ids = np.array(localmax_ids)
    # Locate the global maximum of the metric curve
    fm_sort_indexes = np.argsort(-Feature_Metrics_Values)
    overmax_id = fm_sort_indexes[0]
    overmax_value = np.max(Feature_Metrics_Values)
    # Aggregate the variable sets of the qualifying local maxima: above
    # top_percent of the global maximum and no later than it
    Select_Variables = []
    for i in range(len(localmax_ids)):
        n_iter = localmax_ids[i]
        if Feature_Metrics_Values[n_iter] > top_percent * overmax_value and n_iter <= overmax_id:
            EvalueFolder = dirto + os.sep + "SelectVariables_run" + str(n_iter)
            featureScoreFiledirto = EvalueFolder + os.sep + "Feature_Scores.csv"
            variables_runtime = init.getListFrompdDataSet(
                init.readCSVasPandas(featureScoreFiledirto), "VariableName")
            for varname in variables_runtime:
                Select_Variables.append(varname)
    # Deduplicate and order the union of the selected variables
    Final_Feature_Set = sorted(list(set(Select_Variables)))
    return Final_Feature_Set
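# --- Illustrative sketch (an addition; the file reads are replaced by an
# in-memory dict of made-up runs) ---
# assemFinalFeatureSet keeps the iterations that are local maxima of the
# metric curve, exceed top_percent of the global maximum, and occur no later
# than the global maximum, then unions their variable sets:
def _sketch_assemFinalFeatureSet_demo(top_percent=0.95):
    import numpy as np
    metric = np.array([0.60, 0.72, 0.70, 0.78, 0.74, 0.76, 0.75])
    features_per_run = {1: ["bio1", "bio12"], 3: ["bio1", "slope"], 5: ["aspect"]}
    localmax_ids = [i for i in range(1, len(metric) - 1)
                    if metric[i - 1] < metric[i] > metric[i + 1]]
    overmax_id, overmax_value = np.argmax(metric), np.max(metric)
    selected = []
    for n_iter in localmax_ids:
        if metric[n_iter] > top_percent * overmax_value and n_iter <= overmax_id:
            selected.extend(features_per_run.get(n_iter, []))
    return sorted(set(selected))  # ['bio1', 'slope']: only run 3 qualifies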
def predictSingleclassBaggingModelMatrix_parallel(Models, MatX, vtname, varnames, params, n_gpus, n_parallels,
                                                  single_thres=0.5, runtimes=300, filter_percent=0,
                                                  bool_save=True, savedirbase=""):
    if not bool_save:
        print("Single Bagging Ensemble Only for bool_save=True!")
        return []
    # Read the per-runtime weights and selected-variable files
    evalweightsFileName = vtname + "_Runtime_Evaluation_Weight.csv"
    selectvarnamesfiledir = savedirbase + os.sep + vtname + "_Runtime_Model_Select_Variables.csv"
    evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
    baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
    selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
    selectruntimesvarnames = []
    for runtime in range(runtimes):
        selectruntimesvarnames.append(
            init.getListFrompdDataSet(selrunvarspdData, "SelectVarName_run" + str(runtime)))
    del selrunvarspdData
    # Flatten the (nrow, ncol, nvar) raster stack into a (nrow*ncol, nvar) matrix
    matshape = MatX.shape
    bool_mask = init.getMask(MatX)
    pred_X = np.zeros([matshape[0] * matshape[1], matshape[2]], dtype=np.float32)
    for i in range(matshape[2]):
        pred_X[:, i] = MatX[:, :, i].flatten()
    # Assign runtimes to workers round-robin
    RuntimeLists = [[] for i in range(n_parallels)]
    for runtime in range(runtimes):
        worker_id = runtime % n_parallels
        RuntimeLists[worker_id].append(runtime)
    # Judge bool_gpu (default to '' so a missing tree_method does not raise)
    bool_gpu = 'gpu' in params.get('tree_method', '')
    # Open the multiprocessing pool
    P = Pool(n_parallels)
    results_parallel = []
    manager = Manager()
    CPIDs = manager.list()
    for i in range(n_parallels):
        results_parallel.append(P.apply_async(_predictSingleclassBaggingModelMatrix,
                                              (CPIDs, RuntimeLists[i], vtname, pred_X, varnames,
                                               selectruntimesvarnames, params, matshape, baggingweights,
                                               single_thres, bool_gpu, n_gpus, n_parallels,
                                               bool_save, savedirbase)))
    P.close()
    P.join()
    del CPIDs
    # Collect the multiprocessing results into one weighted probability surface
    pred_pY_ense = np.zeros(matshape[0] * matshape[1], dtype=np.float32)
    for i in range(n_parallels):
        pred_pY_ense_para = results_parallel[i].get()
        pred_pY_ense = pred_pY_ense + pred_pY_ense_para
    pred_Y_ense = (pred_pY_ense >= single_thres) * 1
    pred_pY_ense = pred_pY_ense.reshape(matshape[0], matshape[1])
    pred_Y_ense = pred_Y_ense.reshape(matshape[0], matshape[1])
    # Optionally zero out probabilities below filter_percent of the masked maximum
    if filter_percent > 0:
        p_max = np.max(pred_pY_ense[bool_mask])
        pred_pY_ense[pred_pY_ense < p_max * filter_percent] = 0
    return [pred_Y_ense, pred_pY_ense]
def predictSingleclassBaggingModelMatrix(Models, MatX, vtname, varnames, params, single_thres=0.5,
                                         runtimes=300, filter_percent=0, bool_save=False, savedirbase=""):
    count = 0.0
    if bool_save:
        # Saved-model mode: read per-runtime weights and selected variables from disk
        evalweightsFileName = vtname + "_Runtime_Evaluation_Weight.csv"
        selectvarnamesfiledir = savedirbase + os.sep + vtname + "_Runtime_Model_Select_Variables.csv"
        evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
        ense_weights = init.getListFromPandas(evalweightsFiledirto, 'weight')
        selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
        selectruntimesvarnames = []
        for runtime in range(runtimes):
            selectruntimesvarnames.append(
                init.getListFrompdDataSet(selrunvarspdData, "SelectVarName_run" + str(runtime)))
        del selrunvarspdData
    else:
        # In-memory mode: Models carries the fitted models, variable lists, and weights
        [ModelList, selectruntimesvarnames, ense_weights] = Models
    # Flatten the (nrow, ncol, nvar) raster stack into a (nrow*ncol, nvar) matrix
    matshape = MatX.shape
    bool_mask = init.getMask(MatX)
    pred_X = np.zeros([matshape[0] * matshape[1], matshape[2]], dtype=np.float32)
    for i in range(matshape[2]):
        pred_X[:, i] = MatX[:, :, i].flatten()
    pred_pY_ense = np.zeros(matshape[0] * matshape[1], dtype=np.float32)
    time_start = time.time()
    for runtime in range(runtimes):
        print("Predicting runtime = %d..." % (runtime))
        if bool_save:
            savedir = savedirbase + os.sep + "runtime_" + str(runtime)
            modelName = vtname + '_xgboost_singleclass_run' + str(runtime) + ".model"
            modeldir = savedir + os.sep + modelName
            model = xgbf.loadModel(modeldir, params)
        else:
            model = ModelList[runtime]
        # Select only the columns this runtime's model was trained on
        selruntimevarstr = selectruntimesvarnames[runtime]
        selruntimevaridx = _findListSubsetIndexes(selruntimevarstr, varnames)
        pred_X_runtime = pred_X[:, selruntimevaridx]
        [pred_Y, pred_pY] = xgbf.Predict(model, pred_X_runtime, bool_binary=1, threshold=single_thres)
        # Accumulate this runtime's weighted probability vote
        pred_pY_ense = pred_pY_ense + ense_weights[runtime] * pred_pY
        # Report progress and the estimated remaining time
        time_stop = time.time()
        count = count + 1
        done = count / runtimes
        remain = (runtimes - count) / runtimes
        num_day, num_hour, num_min = _calDueTime(time_start, time_stop, done, 0.0)
        print("Model: %d Calculating Finished! Done: %.2f%%, Remaining: %.2f%%"
              % (runtime, 100 * done, 100 * remain))
        print("Calculating will finish in %d Days %d Hours %d Minutes\n"
              % (num_day, num_hour, num_min))
    pred_Y_ense = (pred_pY_ense >= single_thres) * 1
    pred_pY_ense = pred_pY_ense.reshape(matshape[0], matshape[1])
    pred_Y_ense = pred_Y_ense.reshape(matshape[0], matshape[1])
    # Optionally zero out probabilities below filter_percent of the masked maximum
    if filter_percent > 0:
        p_max = np.max(pred_pY_ense[bool_mask])
        pred_pY_ense[pred_pY_ense < p_max * filter_percent] = 0
    return [pred_Y_ense, pred_pY_ense]
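# --- Illustrative sketch (an addition; _findListSubsetIndexes' source is not
# shown, so an index lookup stands in for it) ---
# The Matrix predictors flatten the (nrow, ncol, nvar) raster stack into a
# (nrow*ncol, nvar) matrix once, then each runtime slices out only the
# columns of its own selected variables:
def _sketch_matrix_flatten_demo():
    import numpy as np

    def find_list_subset_indexes(subset, varnames):
        # Column index of every selected variable in the full variable list
        return [varnames.index(v) for v in subset]

    nrow, ncol = 2, 3
    varnames = ["bio1", "bio12", "slope"]
    MatX = np.random.rand(nrow, ncol, len(varnames)).astype(np.float32)
    pred_X = np.zeros([nrow * ncol, len(varnames)], dtype=np.float32)
    for i in range(len(varnames)):
        pred_X[:, i] = MatX[:, :, i].flatten()
    sel = find_list_subset_indexes(["bio1", "slope"], varnames)
    return pred_X[:, sel]  # shape (6, 2): this runtime's prediction matrix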
def predictMulticlassBaggingModel(MatX, nrow, ncol, varnames, num_class, params, multiclassmethod,
                                  runtimes=300, bool_save=True, savedirbase=""):
    count = 0.0
    if not bool_save:
        print("Bagging Method has to save models!")
        return
    # Read the per-runtime bagging weights and selected variable names from disk
    evalweightsFileName = "Runtime_Model_Evaluation_Weights.csv"
    selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv"
    evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
    selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
    baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
    selectruntimesvarnames = []
    for runtime in range(runtimes):
        selectruntimesvarnames.append(
            init.getListFrompdDataSet(selrunvarspdData, "SelectVarName_run" + str(runtime)))
    del selrunvarspdData
    bool_mask = init.getMask(MatX)
    time_start = time.time()
    if multiclassmethod == 'softmax':
        pred_pY_ense = np.zeros([nrow * ncol, num_class], dtype=np.float32)
        pred_X = init.fomatMulticlassSoftmaxMatrix(MatX)
        for runtime in range(runtimes):
            # A zero weight marks a runtime whose model was never established
            if baggingweights[runtime] == 0:
                print("Model not established!")
                continue
            # Select only the columns this runtime's model was trained on
            selruntimevarstr = selectruntimesvarnames[runtime]
            selruntimevaridx = _findListSubsetIndexes(selruntimevarstr, varnames)
            pred_X_runtime = pred_X[:, selruntimevaridx]
            print("Predicting Bagging Model... runtime = %d" % runtime)
            savedir = savedirbase + os.sep + "runtime_" + str(runtime)
            pred_pY = mlc.predictMulticlassSoftmaxModelCvted([], pred_X_runtime, params,
                                                             runtime=runtime, bool_save=bool_save,
                                                             savedir=savedir)
            # Hard voting: one-hot encode the argmax labels and add the weighted votes
            pred_Y = np.argmax(pred_pY, axis=1)
            pred_Y_epd = init.expandCategories(pred_Y, num_class=num_class)
            pred_pY_ense = pred_pY_ense + baggingweights[runtime] * pred_Y_epd.astype(np.float32)
            # Report progress and the estimated remaining time
            time_stop = time.time()
            count = count + 1
            done = count / runtimes
            remain = (runtimes - count) / runtimes
            num_day, num_hour, num_min = _calDueTime(time_start, time_stop, done, 0.0)
            print("Model: %d Calculating Finished! Done: %.2f%%, Remaining: %.2f%%"
                  % (runtime, 100 * done, 100 * remain))
            print("Calculating will finish in %d Days %d Hours %d Minutes\n"
                  % (num_day, num_hour, num_min))
        [pred_Y, pred_pY] = init.reshapeMulticlassMatrix(pred_pY_ense, nrow, ncol, num_class,
                                                         bool_onearray=False, mask=bool_mask.flatten())
    elif multiclassmethod == 'category':
        pred_pY_ense = np.zeros([nrow * ncol, num_class], dtype=np.float32)
        pred_X = init.formatMulticlassCategoryMatrix(MatX, num_class)
        for runtime in range(runtimes):
            # A zero weight marks a runtime whose model was never established
            if baggingweights[runtime] == 0:
                print("Model not established!")
                continue
            # Select only the columns this runtime's model was trained on
            selruntimevarstr = selectruntimesvarnames[runtime]
            selruntimevaridx = _findListSubsetIndexes(selruntimevarstr, varnames)
            pred_X_runtime = pred_X[:, selruntimevaridx]
            print("Predicting Bagging Model... runtime = %d" % runtime)
            savedir = savedirbase + os.sep + "runtime_" + str(runtime)
            pred_Y = mlc.predictMulticlassCategoryModelCvted([], pred_X_runtime, params, runtime=runtime,
                                                             bool_retlabel=True, num_instance=nrow * ncol,
                                                             num_class=num_class, bool_save=bool_save,
                                                             savedir=savedir)
            # Hard voting: one-hot encode the predicted labels and add the weighted votes
            pred_Y_epd = init.expandCategories(pred_Y, num_class=num_class)
            pred_pY_ense = pred_pY_ense + baggingweights[runtime] * pred_Y_epd.astype(np.float32)
            # Report progress and the estimated remaining time
            time_stop = time.time()
            count = count + 1
            done = count / runtimes
            remain = (runtimes - count) / runtimes
            num_day, num_hour, num_min = _calDueTime(time_start, time_stop, done, 0.0)
            print("Model: %d Calculating Finished! Done: %.2f%%, Remaining: %.2f%%"
                  % (runtime, 100 * done, 100 * remain))
            print("Calculating will finish in %d Days %d Hours %d Minutes\n"
                  % (num_day, num_hour, num_min))
        [pred_Y, pred_pY] = init.reshapeMulticlassMatrix(pred_pY_ense, nrow, ncol, num_class,
                                                         bool_onearray=False, mask=bool_mask.flatten())
    else:
        print("Invalid Multiclass Method Input!")
        return
    return [pred_Y, pred_pY]
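# --- Illustrative sketch (an addition; _calDueTime's source is not shown,
# so this reproduces the usual elapsed-time / fraction-done ETA arithmetic) ---
# The progress reports above extrapolate the remaining time from the elapsed
# wall time and the fraction of runtimes completed:
def _sketch_due_time(time_start, time_stop, done):
    elapsed = time_stop - time_start
    remaining_s = elapsed * (1.0 - done) / done if done > 0 else 0.0
    num_day = int(remaining_s // 86400)
    num_hour = int(remaining_s % 86400 // 3600)
    num_min = int(remaining_s % 3600 // 60)
    return num_day, num_hour, num_min

# Example: 10 of 300 runtimes finished after 2 minutes of wall time
# _sketch_due_time(0.0, 120.0, done=10 / 300) -> (0, 0, 58)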