Example #1
def run_MLA(XX,XXpredict,yy,yypredict,unique_IDS_tr,unique_IDS_pr,uniquetarget_tr,uniquetarget_pr,n_feat,ind_run_name,n_run):
    logger.info('Starting MLA run')
    logger.info('------------')
    if settings.pyspark_on == 1:                # Use pyspark or not? Pyspark makes cross-node (HPC) calculation possible.
        from pyspark import SparkContext        # It is slower, as it manages resources between nodes over HTTP.
        from pyspark.sql import SQLContext      # So far it does not output feature importances; programming them
        from pyspark.ml import Pipeline         # by hand could be time consuming.
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml.feature import StringIndexer, VectorIndexer
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
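        # Note: this targets the Spark 1.x API (SQLContext, the databricks csv
        # reader below); from Spark 2.0 the fitted RandomForestClassificationModel
        # exposes feature importances directly via its featureImportances attribute.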
        # Start of the pyspark run
        
        if settings.pyspark_remake_csv == 1: # Generating the csv files the pyspark MLA reads in is time consuming, so this can be switched off when they already exist.
            logger.info('Remaking csvs for pyspark...')
            numpy.savetxt(temp_train, XX, delimiter=",")
            logger.info('Training csv saved')
            numpy.savetxt(temp_pred, XXpredict, delimiter=",")
            logger.info('Predict csv saved')
        sc = SparkContext(appName="ML_RF") # Initialize spark
        
        sclogger=sc._jvm.org.apache.log4j # Initialize spark logging
        sclogger.LogManager.getLogger("org").setLevel(sclogger.Level.ERROR)
        sclogger.LogManager.getLogger("akka").setLevel(sclogger.Level.ERROR)
        sqlContext=SQLContext(sc)
        # Read in data
        data_tr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_train)
        data_pr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_pred)
        data_tr=data_tr.withColumnRenamed(data_tr.columns[-1],"label") # Rename the last column (the answers) to "label"
        data_pr=data_pr.withColumnRenamed(data_pr.columns[-1],"label")
        
        assembler=VectorAssembler(inputCols=data_tr.columns[:-1],outputCol="features")
        reduced=assembler.transform(data_tr.select('*')) # Assemble feature vectors for the spark MLA
        
        assembler_pr=VectorAssembler(inputCols=data_pr.columns[:-1],outputCol="features")
        reduced_pr=assembler_pr.transform(data_pr.select('*'))
        
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(reduced) # Index labels and feature vectors
        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(reduced)
        # Initialize the MLA algorithm
        rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",numTrees=100,maxDepth=5,maxBins=200)
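        # maxBins must be at least the number of categories in any categorical
        # feature picked up by the VectorIndexer, or the fit will error out.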
        
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf]) # Set up fitting pipeline
        start, end=[],[] # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        model=pipeline.fit(reduced) # Fit
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        start, end=[],[]
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        predictions = model.transform(reduced_pr) # Predict
        evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",predictionCol="prediction",metricName="precision")
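        # metricName="precision" is the Spark 1.x name; Spark 2.0+ renamed this
        # metric to "accuracy".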
        accuracy = evaluator.evaluate(predictions)
        logger.info("Test Error = %g" %(1.0-accuracy))
        logger.info('------------')
        logger.info('Pulling results ...')
        yypredict=numpy.array(predictions.select("indexedLabel").collect()) # Pulls all results into numpy arrays to continue program
        yypredict=yypredict[:,0]
        result=numpy.array(predictions.select("prediction").collect())
        result=result[:,0]
        XXpredict=numpy.array(predictions.select("indexedFeatures").collect())
        XXpredict=XXpredict[:,0]
        probs=numpy.array(predictions.select("probability").collect())
        probs=probs[:,0]
        XXpredict=numpy.column_stack((XXpredict,yypredict))
        end=time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')
        # Compute the same sklearn metrics as the non-spark branch, and stub
        # out the outputs this spark pipeline does not produce, so the shared
        # reporting and return code below works for both paths.
        recall = metrics.recall_score(yypredict,result,average=None)
        precision = metrics.precision_score(yypredict,result,average=None)
        score = metrics.f1_score(yypredict,result,average=None)
        feat_importance,bias,contributions,train_contributions,clf = [],[],[],[],None
    
    else:
        # Sklearn MLA branch
        MLA = get_function(settings.MLA) # Pull in the machine learning algorithm named in settings
        clf = MLA().set_params(**settings.MLAset)
        logger.info('MLA settings') 
        logger.info(clf)
        logger.info('------------')    
        start, end=[],[] # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        clf = clf.fit(XX[:,0:n_feat],yy) # XX is train array, yy is training answers
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        if 'OvsA' not in ind_run_name:
            if settings.output_all_trees == 1:
                # Export every tree in the forest to a png via graphviz
                for i_tree, tree_in_forest in enumerate(clf.estimators_):
                    with open('plots/tree_%s.dot' %i_tree, 'w') as my_file:
                        tree.export_graphviz(tree_in_forest, out_file=my_file, feature_names=feat_names, class_names=uniquetarget_tr[0], filled=True)
                    os.system('dot -Tpng plots/tree_%s.dot -o plots/tree_%s.png' %(i_tree,i_tree))
                    os.remove('plots/tree_%s.dot' %i_tree)
            else:
                # Export only the first tree as an example
                with open('plots/tree_example.dot', 'w') as my_file:
                    tree.export_graphviz(clf.estimators_[0], out_file=my_file, feature_names=feat_names, class_names=uniquetarget_tr[0], filled=True)
                os.system('dot -Tpng plots/tree_example.dot -o plots/tree_example.png')
                os.remove('plots/tree_example.dot')
        start, end=[],[]
        # Split the predict array into chunks ("cats") for RAM management
        numcats = numpy.int64(2*(XXpredict.size/1024/1024)*clf.n_jobs)
        if settings.get_contributions == 1:
            numcats = 100
        if numcats < 1:
            numcats = 1
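        # (Heuristic: the chunk count scales with the predict-array size in
        # millions of elements and with n_jobs, to bound peak memory during
        # predict/predict_proba; treeinterpreter contribution arrays are much
        # larger, hence the fixed 100 chunks when contributions are on.)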
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        result,probs,bias,contributions,train_contributions=[],[],[],[],[]
        XXpredict_cats=numpy.array_split(XXpredict,numcats)
        logger.info('Splitting predict array into %s' %numcats)
        logger.info('------------')
        for i in range(len(XXpredict_cats)):
            logger.info('Predicting cat %s/%s' %(i+1,len(XXpredict_cats)))
            result.extend(clf.predict(XXpredict_cats[i][:,0:n_feat])) # XXpredict is the predict array.
            probs.extend(clf.predict_proba(XXpredict_cats[i][:,0:n_feat])) # Only take 0:n_feat because the answers are tacked on the end
            if 'OvsA' not in ind_run_name:            
                if (settings.get_contributions == 1) or (settings.get_perfect_contributions == 1):
                    logger.info('Getting contributions from predict catalogue %s' %i)
                    tiresult = ti.predict(clf,XXpredict_cats[i][:,0:n_feat])
                    contributions.extend(tiresult[2])
                    bias = tiresult[1][0]
        feat_importance = clf.feature_importances_
        result=numpy.float32(result)
        probs=numpy.float32(probs)
        if 'OvsA' not in ind_run_name:            
            if settings.get_contributions == 1: 
                numpy.save('contributions',contributions)
            if settings.get_perfect_contributions == 1: 
                numpy.save('perfect_contributions',contributions)
            if settings.compute_contribution_mic == 1:
                logger.info('Getting contributions from train catalogue (for plot_mic_cont)')
                tiresult_train = ti.predict(clf,XX[:,0:n_feat])
                train_contributions=tiresult_train[2]
                bias_train = tiresult_train[1][0]
        
        # sklearn metrics take (y_true, y_pred)
        accuracy = metrics.accuracy_score(yypredict,result)
        recall = metrics.recall_score(yypredict,result,average=None)
        precision = metrics.precision_score(yypredict,result,average=None)
        score = metrics.f1_score(yypredict,result,average=None)
        
        end = time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')

    logger.info('Recall Score: %s' %recall)
    logger.info('Precision Score: %s' %precision)
    logger.info('Accuracy Score: %s' %accuracy)
    logger.info('F1 Score: %s' %score)
    percentage=(n/predictdatanum)*100 # n and predictdatanum are module-level globals in the full script
    
    run_opts.diagnostics([result,yypredict,unique_IDS_tr, unique_IDS_pr,uniquetarget_tr,uniquetarget_pr],'result')
#    stats=numpy.array([])
#    stats=numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage))
    # SAVE
    if settings.saveresults == 1:
        logger.info('Saving results')
        logger.info('------------')

        numpy.savetxt(settings.result_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((yypredict,result)),header="True_target Predicted_target")
        numpy.savetxt(settings.prob_outfile+('_%s' %ind_run_name)+'.txt',probs)
        numpy.savetxt(settings.feat_outfile+('_%s' %ind_run_name)+'.txt',feat_importance)
        numpy.savetxt(settings.stats_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage,clf.max_depth)),header="n_est traindatanum predictdatanum percentage max_depth",fmt="%s")
    
    return result,feat_importance,probs,bias,contributions,accuracy,recall,precision,score,clf,train_contributions
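
For reference, the ti.predict calls above match the treeinterpreter package's API, which decomposes each forest prediction into a bias term plus per-feature contributions (hence the tiresult[1]/tiresult[2] indexing). A minimal standalone sketch on toy iris data, assuming treeinterpreter is installed; none of this is part of the original pipeline:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from treeinterpreter import treeinterpreter as ti
import numpy

iris = load_iris()  # toy data, illustrative only
rf = RandomForestClassifier(n_estimators=10).fit(iris.data, iris.target)
prediction, bias, contributions = ti.predict(rf, iris.data[:1])
# Each prediction decomposes exactly into bias + per-feature contributions,
# summed over the feature axis (shape: n_samples x n_features x n_classes).
assert numpy.allclose(prediction, bias + contributions.sum(axis=1))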
Example #2
OBJID_pr = OBJID_pr[0:predictdatanum]
SPECOBJID_pr = SPECOBJID_pr[0:predictdatanum]
RA_tr,DEC_tr = RA_tr[0:traindatanum],DEC_tr[0:traindatanum]
RA_pr,DEC_pr = RA_pr[0:predictdatanum],DEC_pr[0:predictdatanum]
specz_tr,specz_pr = specz_tr[0:traindatanum],specz_pr[0:predictdatanum]

# Cuts for doublesubrun
subclass_tr = subclass_tr[0:traindatanum]
subclass_names_tr = subclass_names_tr[0:traindatanum]
subclass_pr = subclass_pr[0:predictdatanum]
subclass_names_pr = subclass_names_pr[0:predictdatanum]

del traindata,preddata,filt_train,filt_predict,filt_train_all,filt_predict_all # Clean up

unique_IDS_tr, unique_IDS_pr,uniquetarget_tr,uniquetarget_pr = \
run_opts.diagnostics([XX[:,-1],XXpredict[:,-1],classnames_tr,classnames_pr],'inputdata') # Total breakdown of types going in

yy = XX[:,-1] # Training answers
yypredict = XXpredict[:,-1] # Prediction answers

if settings.compute_mifs==1:
    # define MI_FS feature selection method
    feat_selector = mifs.MutualInformationFeatureSelector(n_features=n_feat,method='MRMR')
    feat_selector.fit(XX[:,:-1], numpy.int64(yy))
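    # The fitted selector exposes the chosen features through the mifs API
    # (a boolean mask in feat_selector.support_ and a ranking in
    # feat_selector.ranking_); the training array could then be filtered
    # with e.g. XX_filtered = feat_selector.transform(XX[:,:-1]).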
    
if settings.one_vs_all == 1: # The target is unique_IDS_tr[i] in the loop below
    XX_one_vs_all,XXpredict_one_vs_all,yy_one_vs_all,yypredict_one_vs_all = {},{},{},{}
    for i in range(len(unique_IDS_tr)):
        yy_orig = yy
        yypredict_orig = yypredict
        yy_out = [numpy.float32(99) if x!=unique_IDS_tr[i] else x for x in yy_orig]
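
The list comprehension above implements the one-vs-all relabelling: the current target class keeps its label while every other class collapses to a sentinel (99). A vectorised numpy equivalent on toy labels, illustrative only:

import numpy

yy_toy = numpy.float32([0, 1, 2, 1, 0])   # toy class labels, not from the pipeline
target = numpy.float32(1)                 # current one-vs-all target class
yy_ova = numpy.where(yy_toy == target, yy_toy, numpy.float32(99))
# yy_ova -> [99., 1., 99., 1., 99.]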