def run_MLA(XX,XXpredict,yy,yypredict,unique_IDS_tr,unique_IDS_pr,uniquetarget_tr,uniquetarget_pr,n_feat,ind_run_name,n_run):
    """Fit a classifier, predict on a second catalogue and report metrics.

    Two backends are selected by ``settings.pyspark_on``: a pyspark
    RandomForest pipeline (reads the data back from csv files), or any
    sklearn-style estimator named in ``settings.MLA``.

    Parameters
    ----------
    XX, XXpredict : 2-D numpy arrays. Columns ``0:n_feat`` are features;
        the class answers are tacked on as the last column.
    yy, yypredict : training / prediction answer vectors.
    unique_IDS_tr, unique_IDS_pr, uniquetarget_tr, uniquetarget_pr :
        class-ID / class-name bookkeeping forwarded to
        ``run_opts.diagnostics``.
    n_feat : int, number of feature columns to use.
    ind_run_name : str, run label; names containing 'OvsA' skip the tree
        plots and treeinterpreter contribution outputs.
    n_run : run index (not used in this function; kept for interface
        compatibility with callers).

    Returns
    -------
    tuple : (result, feat_importance, probs, bias, contributions,
             accuracy, recall, precision, score, clf, train_contributions)

    NOTE(review): the pyspark branch never defines ``clf``,
    ``feat_importance``, ``bias``, ``contributions`` or
    ``train_contributions``, so the save block and the return statement
    can only succeed on the sklearn path — pre-existing behaviour, left
    unchanged.  Relies on module globals: ``logger``, ``settings``,
    ``numpy``, ``time``, ``os``, ``tree``, ``ti``, ``metrics``,
    ``run_opts``, ``get_function``, ``feat_names``, ``temp_train``,
    ``temp_pred``, ``n``, ``traindatanum``, ``predictdatanum``.
    """
    logger.info('Starting MLA run')
    logger.info('------------')
    if settings.pyspark_on == 1:                          # Use pyspark or not? Pyspark makes cross node (HPC) calculation possible.
        from pyspark import SparkContext                  # It's slower, manages resources between nodes using HTTP.
        from pyspark.sql import SQLContext                # So far, it does not include feature importance outputs.
        from pyspark.ml import Pipeline                   # I would have to program feature importances myself. May be time consuming.
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml.feature import StringIndexer, VectorIndexer
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        # pyspark go
        if settings.pyspark_remake_csv == 1:              # Making the csv files for the pyspark MLA to read in is time consuming, turn off the file generation?
            logger.info('Remaking csvs for pysparks...')
            numpy.savetxt(temp_train, XX, delimiter=",")
            logger.info('Training csv saved')
            numpy.savetxt(temp_pred, XXpredict, delimiter=",")
            logger.info('Predict csv saved')
        sc = SparkContext(appName="ML_RF")                # Initiate spark
        sclogger = sc._jvm.org.apache.log4j               # Initiate spark logging; silence the noisy INFO output
        sclogger.LogManager.getLogger("org").setLevel(sclogger.Level.ERROR)
        sclogger.LogManager.getLogger("akka").setLevel(sclogger.Level.ERROR)
        sqlContext = SQLContext(sc)
        # Read in data
        data_tr = sqlContext.read.format("com.databricks.spark.csv").options(header='false', inferSchema='true').load(temp_train)
        data_pr = sqlContext.read.format("com.databricks.spark.csv").options(header='false', inferSchema='true').load(temp_pred)
        data_tr = data_tr.withColumnRenamed(data_tr.columns[-1], "label")   # rename last column (answers), to label
        data_pr = data_pr.withColumnRenamed(data_pr.columns[-1], "label")
        assembler = VectorAssembler(inputCols=data_tr.columns[:-1], outputCol="features")
        reduced = assembler.transform(data_tr.select('*'))                  # Assemble feature vectors for spark MLA
        assembler_pr = VectorAssembler(inputCols=data_pr.columns[:-1], outputCol="features")
        reduced_pr = assembler_pr.transform(data_pr.select('*'))
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(reduced)       # Index vectors
        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(reduced)
        # Initiate MLA alg
        rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=100, maxDepth=5, maxBins=200)
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])      # Set up fitting pipeline
        start, end = [], []                               # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        model = pipeline.fit(reduced)                     # Fit
        end = time.time()
        logger.info('Fit ended in %s seconds' % (end - start))
        logger.info('------------')
        start, end = [], []
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        predictions = model.transform(reduced_pr)         # Predict
        evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
        accuracy = evaluator.evaluate(predictions)
        logger.info("Test Error = %g" % (1.0 - accuracy))
        logger.info('------------')
        logger.info('Pulling results ...')
        # Pulls all results into numpy arrays to continue program
        yypredict = numpy.array(predictions.select("indexedLabel").collect())
        yypredict = yypredict[:, 0]
        result = numpy.array(predictions.select("prediction").collect())
        result = result[:, 0]
        XXpredict = numpy.array(predictions.select("indexedFeatures").collect())
        XXpredict = XXpredict[:, 0]
        probs = numpy.array(predictions.select("probability").collect())
        probs = probs[:, 0]
        XXpredict = numpy.column_stack((XXpredict, yypredict))
        end = time.time()
        logger.info('Predict ended in %s seconds' % (end - start))
        logger.info('------------')
    else:
        # Run sklearn MLA switch
        MLA = get_function(settings.MLA)                  # Pulls in machine learning algorithm from settings
        clf = MLA().set_params(**settings.MLAset)
        logger.info('MLA settings')
        logger.info(clf)
        logger.info('------------')
        start, end = [], []                               # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        clf = clf.fit(XX[:, 0:n_feat], yy)                # XX is train array, yy is training answers
        end = time.time()
        logger.info('Fit ended in %s seconds' % (end - start))
        logger.info('------------')
        score = clf.score
        if 'OvsA' not in ind_run_name:
            # Render decision trees with graphviz (requires `dot` on PATH and
            # the module-level `feat_names` — TODO confirm it is always set).
            if settings.output_all_trees == 1:
                i_tree = 0
                for tree_in_forest in clf.estimators_:
                    with open('plots/tree_' + str(i_tree) + '.dot', 'w') as my_file:
                        my_file = tree.export_graphviz(tree_in_forest, out_file=my_file, feature_names=feat_names, class_names=uniquetarget_tr[0], filled=True)
                    os.system('dot -Tpng plots/tree_%s.dot -o plots/tree_%s.png' % (i_tree, i_tree))
                    os.remove('plots/tree_%s.dot' % i_tree)
                    i_tree = i_tree + 1
            else:
                with open('plots/tree_example.dot', 'w') as my_file:
                    my_file = tree.export_graphviz(clf.estimators_[0], out_file=my_file, feature_names=feat_names, class_names=uniquetarget_tr[0], filled=True)
                os.system('dot -Tpng plots/tree_example.dot -o plots/tree_example.png')
                os.remove('plots/tree_example.dot')
        start, end = [], []
        # Split cats for RAM management
        numcats = numpy.int64((2 * (XXpredict.size / 1024 / 1024) * clf.n_jobs))
        if settings.get_contributions == 1:
            numcats = 100
        if numcats < 1:
            numcats = 1
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        result, probs, bias, contributions, train_contributions = [], [], [], [], []
        XXpredict_cats = numpy.array_split(XXpredict, numcats)
        logger.info('Splitting predict array into %s' % numcats)
        logger.info('------------')
        for i in range(len(XXpredict_cats)):
            logger.info('Predicting cat %s/%s' % (i, len(XXpredict_cats)))
            result.extend(clf.predict(XXpredict_cats[i][:, 0:n_feat]))          # XX is predict array.
            probs.extend(clf.predict_proba(XXpredict_cats[i][:, 0:n_feat]))     # Only take from 0:n_feat because answers are tacked on end
            if 'OvsA' not in ind_run_name:
                if (settings.get_contributions == 1) | (settings.get_perfect_contributions == 1):
                    logger.info('Getting contributions from predict catalogue %s' % i)
                    tiresult = ti.predict(clf, XXpredict_cats[i][:, 0:n_feat])
                    contributions.extend(tiresult[2])
                    bias = tiresult[1][0]
        feat_importance = clf.feature_importances_
        result = numpy.float32(result)
        probs = numpy.float32(probs)
        if 'OvsA' not in ind_run_name:
            if settings.get_contributions == 1:
                numpy.save('contributions', contributions)
            if settings.get_perfect_contributions == 1:
                numpy.save('perfect_contributions', contributions)
            if settings.compute_contribution_mic == 1:
                logger.info('Getting contributions from train catalogue (for plot_mic_cont)')
                tiresult_train = ti.predict(clf, XX[:, 0:n_feat])
                train_contributions = tiresult_train[2]
                bias_train = tiresult_train[1][0]
        # FIX: sklearn metric signatures are (y_true, y_pred); the original
        # passed (result, yypredict), i.e. predictions in the y_true slot,
        # which swaps per-class precision and recall in the reported output.
        accuracy = metrics.accuracy_score(yypredict, result)
        recall = metrics.recall_score(yypredict, result, average=None)
        precision = metrics.precision_score(yypredict, result, average=None)
        score = metrics.f1_score(yypredict, result, average=None)
        end = time.time()
        logger.info('Predict ended in %s seconds' % (end - start))
        logger.info('------------')
        logger.info('Recall Score: %s' % recall)
        logger.info('Precision Score: %s' % precision)
        logger.info('Accuracy Score: %s' % accuracy)
        logger.info('F1 Score: %s' % score)
    # NOTE(review): `n`, `traindatanum` and `predictdatanum` are module
    # globals here — confirm they are set before this function runs.
    percentage = (n / predictdatanum) * 100
    run_opts.diagnostics([result, yypredict, unique_IDS_tr, unique_IDS_pr, uniquetarget_tr, uniquetarget_pr], 'result')
    # SAVE
    if settings.saveresults == 1:
        logger.info('Saving results')
        logger.info('------------')
        numpy.savetxt(settings.result_outfile + ('_%s' % ind_run_name) + '.txt', numpy.column_stack((yypredict, result)), header="True_target Predicted_target")
        numpy.savetxt(settings.prob_outfile + ('_%s' % ind_run_name) + '.txt', probs)
        numpy.savetxt(settings.feat_outfile + ('_%s' % ind_run_name) + '.txt', feat_importance)
        numpy.savetxt(settings.stats_outfile + ('_%s' % ind_run_name) + '.txt', numpy.column_stack((clf.n_estimators, traindatanum, predictdatanum, percentage, clf.max_depth)), header="n_est traindatanum predictdatanum percentage max_depth", fmt="%s")
    return result, feat_importance, probs, bias, contributions, accuracy, recall, precision, score, clf, train_contributions
OBJID_pr = OBJID_pr[0:predictdatanum] SPECOBJID_pr = SPECOBJID_pr[0:predictdatanum] RA_tr,DEC_tr = RA_tr[0:traindatanum],DEC_tr[0:traindatanum] RA_pr,DEC_pr = RA_pr[0:predictdatanum],DEC_pr[0:predictdatanum] specz_tr,specz_pr = specz_tr[0:traindatanum],specz_pr[0:predictdatanum] # Cuts for doublesubrun subclass_tr = subclass_tr[0:traindatanum] subclass_names_tr = subclass_names_tr[0:traindatanum] subclass_pr = subclass_pr[0:predictdatanum] subclass_names_pr = subclass_names_pr[0:predictdatanum] del traindata,preddata,filt_train,filt_predict,filt_train_all,filt_predict_all # Clean up unique_IDS_tr, unique_IDS_pr,uniquetarget_tr,uniquetarget_pr = \ run_opts.diagnostics([XX[:,-1],XXpredict[:,-1],classnames_tr,classnames_pr],'inputdata') # Total breakdown of types going in yy = XX[:,-1] # Training answers yypredict = XXpredict[:,-1] # Prediction answers if settings.compute_mifs==1: # define MI_FS feature selection method feat_selector = mifs.MutualInformationFeatureSelector(n_features=n_feat,method='MRMR') feat_selector.fit(XX[:,:-1], numpy.int64(yy)) if settings.one_vs_all == 1: # target is unique_IDs_tr[i] in loop XX_one_vs_all,XXpredict_one_vs_all,yy_one_vs_all,yypredict_one_vs_all = {},{},{},{} for i in range(len(unique_IDS_tr)): yy_orig = yy yypredict_orig = yypredict yy_out = [numpy.float32(99) if x!=unique_IDS_tr[i] else x for x in yy_orig]