def train_test(self, df):
    df = self.dropNonTCPUDP(df)
    catCols = []
    numCols = ['avg_ipt', 'bytes_in', 'bytes_out', 'entropy', 'total_entropy',
               'num_pkts_out', 'num_pkts_in', 'duration']
    labelCol = 'label'
    data = self.get_dummy(df, catCols, numCols, labelCol)
    data.show()

    labelIndexer = StringIndexer(inputCol='label',
                                 outputCol='indexedLabel').fit(data)
    labelIndexer.transform(data)
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures").fit(data)
    featureIndexer.transform(data)

    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    trainingData.cache()
    # trainingData.repartition(200)
    testData.cache()
    # testData.repartition(200)
    trainingData.show(5, False)
    testData.show(5, False)

    rf = RandomForestClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
    gbt = GBTClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
    logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel')

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                                   labels=labelIndexer.labels)

    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])
    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("features", "label", "predictedLabel", "prediction")

    # Select (prediction, true label) and compute test error
    print(self.getTestError(predictions))
    self.printMetrics(predictions)
    # print(self.ExtractFeatureImp(model.stages[-2].featureImportances, testData, "features"))
    return model
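The helpers get_dummy, getTestError, and printMetrics used above are defined elsewhere in this class. Below is a hedged sketch of what an accuracy-based test-error helper of that kind might look like; it is an assumption for illustration, not the original implementation.

# Hypothetical sketch only: test error as 1 - accuracy over the indexed label,
# reusing the column names produced by the pipeline above.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def get_test_error(predictions, label_col="indexedLabel", prediction_col="prediction"):
    evaluator = MulticlassClassificationEvaluator(labelCol=label_col,
                                                  predictionCol=prediction_col,
                                                  metricName="accuracy")
    return 1.0 - evaluator.evaluate(predictions)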
def preprocessed_df(df, label="flg_cmd_lowcostIndex"):
    max_values_to_define_str_cols = 10
    id_col = 'ID_CLIENT'
    dty = dict(df.dtypes)
    str_cols = [k for k, v in dty.items() if v == 'string']
    str_cols.remove(id_col)

    for c in str_cols:
        stringIndexer = StringIndexer(inputCol=c, outputCol=c + "Index")
        model_str = stringIndexer.fit(df)
        df = model_str.transform(df).drop(c)

    input_cols = df.columns
    input_cols.remove(id_col)
    input_cols.remove(label)

    assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
    df = assembler.transform(df)

    featureIndexer = VectorIndexer(
        inputCol="features",
        outputCol="indexedFeatures",
        maxCategories=max_values_to_define_str_cols).fit(df)
    return featureIndexer.transform(df), df
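A minimal usage sketch for preprocessed_df. The SparkSession and the toy DataFrame below are assumptions for illustration; only ID_CLIENT and the label column name come from the function itself.

# Illustrative only: a tiny DataFrame with the expected ID_CLIENT id column
# and a numeric label, passed through preprocessed_df.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("preprocess_demo").getOrCreate()
toy = spark.createDataFrame(
    [("c1", "A", 1.0, 0.0), ("c2", "B", 2.0, 1.0), ("c3", "A", 3.0, 0.0)],
    ["ID_CLIENT", "segment", "amount", "flg_cmd_lowcostIndex"])

indexed_df, assembled_df = preprocessed_df(toy, label="flg_cmd_lowcostIndex")
indexed_df.select("features", "indexedFeatures").show(truncate=False)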
def chiSquareTest(self, categoricalFeatures, maxCategories):
    dataset = self.dataset
    labelColm = self.labelColm
    features = self.features
    length = features.__len__()

    featureassembler = VectorAssembler(
        inputCols=self.features,
        outputCol="featuresChiSquare", handleInvalid="skip")
    dataset = featureassembler.transform(dataset)

    vec_indexer = VectorIndexer(inputCol="featuresChiSquare",
                                outputCol='vecIndexedFeaturesChiSqaure',
                                maxCategories=maxCategories,
                                handleInvalid="skip").fit(dataset)
    categorical_features = vec_indexer.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categorical_features),
           ", ".join(str(k) for k in categorical_features.keys())))
    dataset = vec_indexer.transform(dataset)
    # finalized_data = dataset.select(labelColm, 'vecIndexedFeaturesChiSqaure')
    # finalized_data.show()

    # feature selection with the chi-squared selector
    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="vecIndexedFeaturesChiSqaure",
                             outputCol="selectedFeatures",
                             labelCol=labelColm)
    result = selector.fit(dataset).transform(dataset)
    print("chi2 output with top %d features selected" % selector.getNumTopFeatures())
    result.show()

    # running the chi-square value test
    r = ChiSquareTest.test(result, "selectedFeatures", labelColm).head()
    p_values = list(r.pValues)
    PValues = []
    for val in p_values:
        PValues.append(round(val, 4))
    print(PValues)
    dof = list(r.degreesOfFreedom)
    stats = list(r.statistics)
    statistics = []
    for val in stats:
        statistics.append(round(val, 4))
    print(statistics)

    chiSquareDict = {}
    for pval, doF, stat, colm in zip(PValues, dof, statistics, categoricalFeatures):
        print(pval, doF, stat)
        chiSquareDict[colm] = pval, doF, stat
    chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
    print(chiSquareDict)

    result = {'pvalues': chiSquareDict}
    return result
def linearReg(self, dataset_add, feature_colm, label_colm, relation_list, relation,userId): try: dataset = spark.read.csv(dataset_add, header=True, inferSchema=True) dataset.show() label = '' for val in label_colm: label = val Schema = dataset.schema stringFeatures = [] numericalFeatures = [] for x in Schema: if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType' or str( x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType' or str(x.dataType) == 'BinaryType'): for y in feature_colm: if x.name == y: dataset = dataset.withColumn(y, dataset[y].cast(StringType())) stringFeatures.append(x.name) else: for y in feature_colm: if x.name == y: numericalFeatures.append(x.name) if relation == 'linear': print('linear relationship') if relation == 'non_linear': dataset = Relationship(dataset, relation_list) dataset.show() for x in Schema: if (str(x.dataType) == "StringType" and x.name == label): for labelkey in label_colm: label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label, handleInvalid="skip").fit(dataset) dataset = label_indexer.transform(dataset) label = 'indexed_' + label else: label = label indexed_features = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) featureAssembler = VectorAssembler(inputCols=indexed_features + numericalFeatures, outputCol='features', handleInvalid="skip") dataset = featureAssembler.transform(dataset) vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures', maxCategories=4, handleInvalid="skip").fit( dataset) dataset = vectorIndexer.transform(dataset) trainDataRatioTransformed = self.trainDataRatio testDataRatio = 1 - trainDataRatioTransformed trainingData, testData = dataset.randomSplit([trainDataRatioTransformed, testDataRatio], seed=40) # applying the model lr = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label) regressor = lr.fit(trainingData) locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/' modelPersist = 'linearRegressorModel.parquet' modelStorageLocation = locationAddress + userId + modelPersist regressor.write().overwrite().save(modelStorageLocation) # print regressor.featureImportances # print(dataset.orderBy(feature_colm, ascending=True)) # pred = regressor.transform(testData) # coefficeint & intercept # saving the model and test dataset as csv file print("coefficient : " + str(regressor.coefficients)) coefficient_t = str(regressor.coefficients) print("intercept : " + str(regressor.intercept)) intercept_t = str(regressor.intercept) prediction = regressor.evaluate(testData) # VI_IMP = 2 prediction_val = prediction.predictions prediction_val.show() prediction_val_pand = prediction_val.select(label, "prediction").toPandas() prediction_val_pand = prediction_val_pand.assign( residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"]) prediction_val_pand_residual = prediction_val_pand["residual_vall"] prediction_val_pand_label = prediction_val_pand[label] # print prediction_val_pand_residual prediction_val_pand_predict = prediction_val_pand["prediction"] # print prediction_val_pand_predict # test_summary = prediction.summary # for test data lr_prediction = regressor.transform(testData) lr_prediction.groupBy(label, "prediction").count().show() lr_prediction_quantile = lr_prediction.select(label, "prediction") lr_prediction_onlypred = lr_prediction.select('prediction') 
# lr_prediction_quantile.show() training_summary = regressor.summary print("numof_Iterations...%d\n" % training_summary.totalIterations) print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory)) print("RMSE...%f\n" % training_summary.rootMeanSquaredError) RMSE = training_summary.rootMeanSquaredError print("MSE....%f\n" % training_summary.meanSquaredError) MSE = training_summary.meanSquaredError print("r**2(r-square)....::%f\n" % training_summary.r2) r_square = training_summary.r2 print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj) adjsted_r_square = training_summary.r2adj print("deviance residuals %s" % str(training_summary.devianceResiduals)) training_summary.residuals.show() # residual_graph = training_summary.residuals # test = (residual_graph, lr_prediction_onlypred) # residual_graph.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append' ) # print(test) # test.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode= 'append') # residual_graph_pandas = residual_graph.toPandas() print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors)) coefficientStdError = str(training_summary.coefficientStandardErrors) print(" Tvalues :\n" + str(training_summary.tValues)) T_values = str(training_summary.tValues) tValuesList = training_summary.tValues print(" p values :\n" + str(training_summary.pValues)) P_values = str(training_summary.pValues) # regression equation intercept_t = float(intercept_t) coefficientList = list(regressor.coefficients) equation = label, '=', intercept_t, '+' for feature, coeff in zip(feature_colm, coefficientList): coeffFeature = coeff, '*', feature, '+' equation += coeffFeature equation = equation[:-1] print(equation) st = list(equation) # significance value PValuesList = training_summary.pValues significanceObject = {} for pValue in PValuesList: if (0 <= pValue < 0.001): significanceObject[pValue] = '***' if (0.001 <= pValue < 0.01): significanceObject[pValue] = '**' if (0.01 <= pValue < 0.05): significanceObject[pValue] = '*' if (0.05 <= pValue < 0.1): significanceObject[pValue] = '.' 
if (0.1 <= pValue < 1): significanceObject[pValue] = '-' print(significanceObject) ####################################################################################################### # residual vs predicted value prediction_data = regressor.summary.predictions prediction_data.show() prediction_data.select(['prediction']).show() predicted = prediction_data.select(['prediction']) regressor.summary.residuals.show() residuals = regressor.summary.residuals pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id()) res_d = residuals.withColumn('row_index', f.monotonically_increasing_id()) pred_residuals = pred_d.join(res_d, on=['row_index']).sort('row_index').drop('row_index') pred_residuals.show() # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet', # mode='overwrite') ''' userId = 'sahil123' graphName = 'QQPlot.parquet' locationAddress = '/home/fidel/mltest/' finalLocation = locationAddress + userId + graphName print(finalLocation) pred_residuals.write.parquet(finalLocation,mode='overwrite') ''' #################################################################################3 # scale location plot from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev df_label = prediction_data.select(label, 'prediction', sqrt(ab(prediction_data[label])).alias("sqrt_label")) df_label.show() df_sqrt_label_index = df_label.withColumn('row_index', f.monotonically_increasing_id()) df_sqrt_label_index.show() res_d.show() sqrt_label_residual_join = df_sqrt_label_index.join(res_d, on=['row_index']).sort('row_index').drop( 'row_index') sqrt_label_residual_join.show() std_resid = sqrt_label_residual_join.select('sqrt_label', 'prediction', ( sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias( 'std_res')) std_resid.show() sqrt_std_res = std_resid.select("std_res", 'prediction', sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid")) sqrt_std_res.show() sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid') # sqrt_std_res_fitted.write.parquet( # 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet', # mode='overwrite') ###################################################################################### # QUANTILE from scipy.stats import norm import statistics import math res_d.show() sorted_res = res_d.sort('residuals') sorted_res.show() # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'), # meann(col('residuals')).alias('mean')) # stdev_ress.show() # mean_residual = stdev_ress.select(['mean']).toPandas() # l = mean_residual.values.tolist() # print(l) # stddev_residual = stdev_ress.select(['std_dev']).toPandas() # length of the sorted std residuals count = sorted_res.groupBy().count().toPandas() countList = count.values.tolist() tuple1 = () for k in countList: tuple1 = k for tu in tuple1: lengthResiduals = tu print(lengthResiduals) quantileList = [] for x in range(0, lengthResiduals): quantileList.append((x - 0.5) / (lengthResiduals)) print(quantileList) # Z-score on theoritical quantile zTheoriticalTrain = [] for x in quantileList: zTheoriticalTrain.append(norm.ppf(abs(x))) print(zTheoriticalTrain) sortedResidualPDF = sorted_res.select('residuals').toPandas() sortedResidualPDF = sortedResidualPDF['residuals'] stdevResidualTrain = statistics.stdev(sortedResidualPDF) meanResidualTrain = statistics.mean(sortedResidualPDF) zPracticalTrain = [] for x in sortedResidualPDF: zPracticalTrain.append((x - meanResidualTrain) / 
stdevResidualTrain) # schema = StructType([StructField('zTheoriticalTrain', FloatType(), True), # StructField('zPracticalTrain', FloatType(), True) # ]) # spark.createDataFrame(zPracticalTrain, FloatType()).show() #################################################################################### # appending predicted value to the dataset target = dataset.select(label) pred = prediction_data.select(['prediction']) pred_d = pred.withColumn('row_index', f.monotonically_increasing_id()) target_d = target.withColumn('row_index', f.monotonically_increasing_id()) pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index') pred_target.show() dataset.show() pred_target_data_update = dataset.join(pred_target, on=[label]) pred_target_data_update.show(100) ''' prediction = regressor.evaluate(dataset) predictionTestData= prediction.predictions predictionTestData.show() #appending the predicted column into the dataset which is test dataset predictionLabelList = [label,'prediction'] updatedFeatureColmList = feature_colm for val in predictionLabelList: updatedFeatureColmList.append(val) print(updatedFeatureColmList) predictionTestDatasetcolumn = predictionTestData.select(updatedFeatureColmList) predictionTestDatasetcolumn.show() ''' ########################################################################################## # scale location plot # for scale location plotequationAsList # from pyspark.sql.functions import udf # # def std_res(x): # res_list = [] # res_list.append(x) # # std_residuals = udf(lambda y: std_res(y), FloatType()) # # residuals_std = residuals.withColumn('residuals', std_residuals(col('residuals').cast(FloatType()))) # # import statistics # import numpy as np # residuals_panda = residuals.toPandas() # # residuals_panda.residuals = range(residuals_panda.shape[1]) # residuals_panda = residuals_panda.values # print(residuals_panda) # stdev_training = statistics.stdev(residuals_panda) # print(stdev_training) ############################################################################################################ # creating the dictionary for storing the result # json_response = coefficient_t # print(json_response) # json_response = {"adjusted r**2 value" : training_summary.r2adj} # DATA VISUALIZATION PART # finding the quantile in the dataset(Q_Q plot) import matplotlib.pyplot as plt # y = 0.1 # x = [] # # for i in range(0, 90): # x.append(y) # y = round(y + 0.01, 2) # # for z in x: # print ("~~~~~ ",z) # # quantile_label = lr_prediction_quantile.approxQuantile(label, x, 0.01) # print quantile_label # quantile_prediction = lr_prediction_quantile.approxQuantile("prediction", x, 0.01) # print quantile_prediction # # Q_label_pred='' # print(len(quantile_label)) # length = len(quantile_label) # # for i in range(0,len(quantile_label)): # Q_label_pred += str(quantile_label[i]) + '|' + str(quantile_prediction[i]) + '\n' # writing it to the hdfs in parquet file # # quantile_label_tospark = spark.createDataFrame(quantile_label, FloatType()) # quantile_label_tospark = quantile_label_tospark.withColumnRenamed("value", "Q_label") # # quantile_prediction_tospark = spark.createDataFrame(quantile_prediction, FloatType()) # quantile_prediction_tospark = quantile_prediction_tospark.withColumnRenamed("value", "Q_prediction") # # quant_label = quantile_label_tospark.withColumn('row_index', f.monotonically_increasing_id()) # quant_predtiction = quantile_prediction_tospark.withColumn('row_index', f.monotonically_increasing_id()) # # final_quantile = 
quant_label.join(quant_predtiction,on=['row_index']).sort('row_index').drop('row_index') # # final_quantile.show() # # final_quantile.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',mode='overwrite') # # # print(str(Q_label_pred[i])) # with open('Q_Q_plot.csv', 'w') as Q_Q: # writer_Q_Q = csv.writer(Q_Q) # writer_Q_Q.writerows((quantile_label, quantile_prediction)) # # plt.scatter(quantile_label, quantile_prediction) # plt.show() ## finding the residual vs fitted graph data # # # prediction_val_pand_predict_tospark = spark.createDataFrame(prediction_val_pand_predict, FloatType()) # prediction_val_pand_predict_tospark = prediction_val_pand_predict_tospark.withColumnRenamed("value", "prediction") # # prediction_val_pand_residual_tospark = spark.createDataFrame(prediction_val_pand_residual, FloatType()) # prediction_val_pand_residual_tospark = prediction_val_pand_residual_tospark.withColumnRenamed("value", "residual") # # pred_spark = prediction_val_pand_predict_tospark.withColumn('row_index', f.monotonically_increasing_id()) # res_spark = prediction_val_pand_residual_tospark.withColumn('row_index', f.monotonically_increasing_id()) # # final_res_fitted = pred_spark.join(res_spark, on=['row_index'])\ # .sort('row_index').drop('row_index') # # final_res_fitted.show() # # final_res_fitted.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/RESIDUAL_FITTED_PLOT.parquet', # mode='overwrite') # # plt.scatter(prediction_val_pand_predict, prediction_val_pand_residual) # plt.axhline(y=0.0, color="red") # plt.xlabel("prediction") # plt.ylabel("residual") # plt.title("residual vs fitted ") # plt.show() # creating the csv file and writitng into it import math fitted_residual = '' print(len(prediction_val_pand_residual)) length = len(prediction_val_pand_residual) for i in range(0, len(prediction_val_pand_residual)): fitted_residual += str(prediction_val_pand_predict[i]) + '|' + str( prediction_val_pand_residual[i]) + '\n' with open('residual_vs_fitted.csv', 'w') as r_f: writer_r_f = csv.writer(r_f) writer_r_f.writerows((prediction_val_pand_predict, prediction_val_pand_residual)) # parquet file writing ## residual vs leverage graph data prediction_val_pand_residual # extreme value in the predictor colm prediction_col_extremeval = lr_prediction_quantile.agg({"prediction": "max"}) # prediction_col_extremeval.show() # plt.plot(prediction_col_extremeval, prediction_val_pand_residual) # plt.show() ## scale location graph data prediction_val_pand_residual prediction_val_pand_predict prediction_val_pand_residual_abs = prediction_val_pand_residual.abs() import math sqrt_residual = [] for x in prediction_val_pand_residual_abs: sqrt_residual.append(math.sqrt(x)) # print ("____________________ ",x) sqrt_residual # plt.scatter(sqrt_residual, prediction_val_pand_predict) ####################################################################################3 # calculating std deviation import statistics print(statistics.stdev(prediction_val_pand_residual)) stdev_pred = statistics.stdev(prediction_val_pand_residual) # mean = statistics.mean(prediction_val_pand_residual) # calcuate stnd residuals std_res = [] for x in prediction_val_pand_residual: std_res.append(x / stdev_pred) print(std_res) # calculating the square root of std_res import math sqr_std_res = [] for x in std_res: sqr_std_res.append(math.sqrt(abs(x))) print(sqr_std_res) #######################################################################################3 # QUANTILE ## sort the list sorted_std_res = 
sorted(std_res) print(sorted_std_res) # mean = statistics.mean(sorted_std_res) stdev = statistics.stdev(sorted_std_res) print(mean) quantile = [] n = len(sorted_std_res) print(n) for x in range(0, n): quantile.append((x - 0.5) / (n)) print(quantile) # # z_score theoritical from scipy.stats import norm z_theory = [] for x in quantile: z_theory.append((norm.ppf(abs(x)))) print(z_theory) # z score for real val z_pract = [] for x in sorted_std_res: z_pract.append((x - mean) / stdev) # y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_std_res = spark.createDataFrame(std_res, FloatType()) quantile_std_res.show() quantile_std_res_t = quantile_std_res.approxQuantile('value', x, 0.01) print(quantile_std_res_t) print(x) Q_label_pred = '' # print(len(quantile_label)) # length = len(quantile_label) for quant, val in zip(z_theory, z_pract): Q_label_pred += str(val) + 't' + str(quant) + 'n' plt.scatter(z_theory, z_pract) plt.savefig('q_q') #################################################### # creating the std residuals # square root of label sqrt_label = [] for x in prediction_val_pand_label: sqrt_label.append(math.sqrt(abs(x))) sqrt_label prediction_val_pand_residual std_residual = [] for sqr, resid in zip(sqrt_label, prediction_val_pand_residual): std_residual.append(resid / sqr) # print(std_sqrt_residual) # creating the std sqr root sqrt_std_residuals = [] for x in std_residual: # print(math.sqrt(abs(x))) sqrt_std_residuals.append(math.sqrt(abs(x))) print(sqrt_std_residuals) # print(std_sqrt_residual) scale_predict_residual = '' for pre, res in zip(prediction_val_pand_predict, sqrt_std_residuals): scale_predict_residual += str(pre) + 't' + str(res) + 'n' print(scale_predict_residual) ########################################################################## # import math # sqrt_stdres = [] # for x in std_sqrt_residual: # sqrt_stdres.append(math.sqrt(x)) # # scale_predict_residual = '' # for pre, res in zip(prediction_val_pand_predict, sqrt_stdres): # scale_predict_residual += str(pre) + 't' + str(res) + 'n' # print(scale_predict_residual) ###################################3 # plt.show() # scale_predict_residual='' # # print(len(sqrt_residual)) # length = len(sqrt_residual) # # for i in range(0, len(std_sqrt_residual)): # scale_predict_residual += str(prediction_val_pand_predict[i]) + '|' + str(std_sqrt_residual[i]) + '\n' # with open('scale_location_plot.csv', 'w') as s_l: # writer_s_l = csv.writer(s_l) # writer_s_l.writerows((prediction_val_pand_predict, sqrt_residual)) # writing to the parquet # prediction_val_pand_predict_tospark = spark.createDataFrame(prediction_val_pand_predict, FloatType()) # prediction_val_pand_predict_tospark = prediction_val_pand_predict_tospark.withColumnRenamed("value", # "prediction") # # sqrt_residual_tospark= spark.createDataFrame(sqrt_residual, FloatType()) # sqrt_residual_tospark = sqrt_residual_tospark.withColumnRenamed("value", # "sqrt_residual") # # pred_spark = prediction_val_pand_predict_tospark.withColumn('row_index', f.monotonically_increasing_id()) # res_spark = sqrt_residual_tospark.withColumn('row_index', f.monotonically_increasing_id()) # # final_scale_fitted = pred_spark.join(res_spark,on=['row_index']) \ # .sort('row_index').drop('row_index') # # final_scale_fitted.show() # # final_scale_fitted.write.parquet( # 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/SCALE_LOCATION_PLOT.parquet', # mode='overwrite') # # dumping the dictionary into json object # json_response = {'run_status': 'success', 'PredictiveResponse': 
resultdf} tableContent = \ { 'coefficientValuesKey': coefficientList, 'tValuesKey': tValuesList, 'pValuesKey': PValuesList, 'significanceValuesKey': significanceObject, 'interceptValuesKey': intercept_t, "RMSE": RMSE, "RSquare": r_square, "AdjRSquare": adjsted_r_square, "CoefficientStdError": coefficientStdError, } print(tableContent) json_response = { "Intercept": intercept_t, "Coefficients": coefficient_t, "RMSE": RMSE, "MSE": MSE, "R_square": r_square, "Adj_R_square": adjsted_r_square, "Coefficient_error": coefficientStdError, "T_value": T_values, "P_value": P_values, 'Q_Q_plot': Q_label_pred, 'residual_fitted': fitted_residual, 'scale_location': scale_predict_residual } return json_response except Exception as e: print('exception is =' + str(e))
def dataTranform(self):
    dataset = self.dataset
    schemaData = dataset.schema
    categoricalFeatures = []
    numericalFeatures = []
    for schemaVal in schemaData:
        if (str(schemaVal.dataType) == "StringType"
                or str(schemaVal.dataType) == "TimestampType"
                or str(schemaVal.dataType) == "DateType"
                or str(schemaVal.dataType) == "BooleanType"
                or str(schemaVal.dataType) == "BinaryType"):
            for y in self.featuresColm:
                if schemaVal.name == y:
                    dataset = dataset.withColumn(
                        y, dataset[y].cast(StringType()))
                    categoricalFeatures.append(schemaVal.name)
        else:
            for y in self.featuresColm:
                if schemaVal.name == y:
                    numericalFeatures.append(schemaVal.name)

    label = ''
    for val in self.labelColm:
        label = val
    for schemaVal in schemaData:
        if (str(schemaVal.dataType) == "StringType"
                and schemaVal.name == label):
            for labelkey in self.labelColm:
                label_indexer = StringIndexer(
                    inputCol=label,
                    outputCol='indexed_' + label,
                    handleInvalid="skip").fit(dataset)
                dataset = label_indexer.transform(dataset)
                label = 'indexed_' + label
        else:
            label = label

    indexedFeatures = []
    for colm in categoricalFeatures:
        indexer = StringIndexer(inputCol=colm,
                                outputCol='indexed_' + colm,
                                handleInvalid="skip").fit(dataset)
        indexedFeatures.append('indexed_' + colm)
        dataset = indexer.transform(dataset)
    combinedFeatures = numericalFeatures + indexedFeatures

    categoryColmListDict = {}
    countOfCategoricalColmList = []
    for value in categoricalFeatures:
        # categoryColm = value
        # listValue = value
        listValue = []
        categoryColm = dataset.groupby(value).count()
        countOfCategoricalColmList.append(categoryColm.count())
        categoryColmJson = categoryColm.toJSON()
        for row in categoryColmJson.collect():
            categoryColmSummary = json.loads(row)
            listValue.append(categoryColmSummary)
        categoryColmListDict[value] = listValue

    self.numericalFeatures = numericalFeatures
    self.categoricalFeatures = categoricalFeatures
    if not categoricalFeatures:
        maxCategories = 5
    else:
        maxCategories = max(countOfCategoricalColmList)

    featureassembler = VectorAssembler(inputCols=combinedFeatures,
                                       outputCol="features",
                                       handleInvalid="skip")
    dataset = featureassembler.transform(dataset)
    vec_indexer = VectorIndexer(inputCol='features',
                                outputCol='vec_indexed_features',
                                maxCategories=maxCategories,
                                handleInvalid="skip").fit(dataset)
    categorical_features = vec_indexer.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categorical_features),
           ", ".join(str(k) for k in categorical_features.keys())))
    dataset = vec_indexer.transform(dataset)
    return dataset, categoricalFeatures, numericalFeatures
def callCenter(self): dataset = spark.read.csv( "/home/fidel/Downloads/CallCenterFinalTillAprilData", sep=',', header=True, inferSchema=True) dataset.show() feature_colm = ["col_2_SKILLNAME_2", "col_2_SKILLNAME_3"] label_colm = ["CALLDATE"] label = "" for val in label_colm: label = val Schema = dataset.schema stringFeatures = [] numericalFeatures = [] for x in Schema: if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType' or str(x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType' or str(x.dataType) == 'BinaryType'): for y in feature_colm: if x.name == y: dataset = dataset.withColumn( y, dataset[y].cast(StringType())) stringFeatures.append(x.name) categoryColmList = [] categoryColmListFinal = [] categoryColmListDict = {} countOfCategoricalColmList = [] for value in stringFeatures: categoryColm = value listValue = value listValue = [] categoryColm = dataset.groupby(value).count() print(categoryColm) countOfCategoricalColmList.append(categoryColm.count()) categoryColmJson = categoryColm.toJSON() for row in categoryColmJson.collect(): categoryColmSummary = json.loads(row) listValue.append(categoryColmSummary) categoryColmListDict[value] = listValue if not stringFeatures: maxCategories = 5 else: maxCategories = max(countOfCategoricalColmList) maxCategories = 13 for x in Schema: if (str(x.dataType) == "StringType" and x.name == label): for labelkey in label_colm: label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label).fit(dataset) dataset = label_indexer.transform(dataset) label = 'indexed_' + label else: label = label dataset.show() indexed_features = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm).fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) final_features = numericalFeatures + indexed_features featureassembler = VectorAssembler(inputCols=final_features, outputCol="features") dataset = featureassembler.transform(dataset) vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures', maxCategories=maxCategories).fit(dataset) dataset = vectorIndexer.transform(dataset) import csv dataset = dataset.select("CALLDATE", "col_2_SKILLNAME_2", "col_2_SKILLNAME_3", "indexed_CALLDATE", "indexed_col_2_SKILLNAME_2", "indexed_col_2_SKILLNAME_3") # dataset.to_csv("/home/fidel/Downloads/Callcenterdata/callFinalFormated.csv") # dataset.write.csv("/home/fidel/Downloads/Callcenterdata/callFinalF.csv") # dataset.write.csv("/home/fidel/Downloads/Callcenterdata/callFinal.csv") # dataset.show() dataset.toPandas().to_csv( "/home/fidel/Downloads/Callcenterdata/callcsv.csv") trainDataRatioTransformed = 0.80 testDataRatio = 1 - trainDataRatioTransformed trainingData, testData = dataset.randomSplit( [trainDataRatioTransformed, testDataRatio], seed=0) #applying the model randomForestModel = RandomForestClassifier( labelCol=label, featuresCol='vectorIndexedFeatures', numTrees=10, maxBins=maxCategories) randomForestModelFit = randomForestModel.fit(trainingData) predictions = randomForestModelFit.transform(testData) # Select example rows to display. predictions.select("predictedLabel", "label", "features").show(5) evaluator = MulticlassClassificationEvaluator( labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy") accuracy = evaluator.evaluate(predictions) print("Test Error = %g" % (1.0 - accuracy))
def linearRegPersist(self, dataset_add, feature_colm, label_colm, relation_list,
                     relation, userId):
    try:
        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
        dataset.show()

        label = ''
        for val in label_colm:
            label = val

        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        if relation == 'linear':
            print('linear relationship')
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)
        dataset.show()

        for x in Schema:
            if (str(x.dataType) == "StringType" and x.name == label):
                for labelkey in label_colm:
                    label_indexer = StringIndexer(inputCol=label,
                                                  outputCol='indexed_' + label).fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label

        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)

        final_features = numericalFeatures + indexed_features
        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")
        dataset = featureassembler.transform(dataset)

        vectorIndexer = VectorIndexer(inputCol='features',
                                      outputCol='vectorIndexedFeatures',
                                      maxCategories=4).fit(dataset)
        dataset = vectorIndexer.transform(dataset)

        # Loading the persisted model
        locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'
        modelPersist = 'linearRegressorModel.parquet'
        persistedModelLocation = locationAddress + userId + modelPersist
        regressorTest = LinearRegressionModel.load(persistedModelLocation)
        predictedData = regressorTest.transform(dataset)
        predictedData.show()
    except Exception as e:
        print('exception is :', e)
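A condensed sketch of the save/load round trip that the training and persistence methods above split between them. The local path, the label column name, and the prepared train_data/test_data DataFrames are assumptions for illustration; the original code writes to HDFS.

# Illustrative only: persist a fitted LinearRegression model, then reload it
# for scoring, mirroring the flow above with a local path.
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

lr = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol="indexed_label")
regressor = lr.fit(train_data)   # train_data assumed prepared as in the methods above
regressor.write().overwrite().save("/tmp/linearRegressorModel.parquet")

reloaded = LinearRegressionModel.load("/tmp/linearRegressorModel.parquet")
reloaded.transform(test_data).select("prediction").show(5)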
    data["pickup_latitude"] >= 40.63).filter(
        data["dropoff_latitude"] <= 40.85).filter(
        data["dropoff_latitude"] >= 40.63)
# data.printSchema()

assembler = VectorAssembler().setInputCols([
    "vendor_id", "pickup_longitude", "pickup_latitude", "pickup_hour",
    "pickup_month", "dropoff_longitude", "dropoff_latitude",
    "trip_distance", "passenger_count"
]).setOutputCol("features")
df = assembler.setHandleInvalid("skip").transform(data).select(
    "trip_duration", "features")

featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=30).fit(df)
d = featureIndexer.transform(df)

trainTest = d.randomSplit([0.8, 0.2])
traindf = trainTest[0]
testdf = trainTest[1]

# Model
dtr = DecisionTreeRegressor(featuresCol="indexedFeatures",
                            labelCol="trip_duration",
                            impurity="variance")

# choices of tuning parameters
dtrparamGrid = (ParamGridBuilder().addGrid(dtr.maxDepth, [10]).build())

pipeline = Pipeline(stages=[featureIndexer, dtr])

crossval = CrossValidator(estimator=pipeline,
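The CrossValidator call above is cut off in the source. The following is an illustrative completion sketch only; the evaluator, metric, and fold count are assumptions, not the original settings.

# Illustrative completion sketch, not the original code.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator

rmse_evaluator = RegressionEvaluator(labelCol="trip_duration",
                                     predictionCol="prediction",
                                     metricName="rmse")
crossval_example = CrossValidator(estimator=pipeline,
                                  estimatorParamMaps=dtrparamGrid,
                                  evaluator=rmse_evaluator,
                                  numFolds=3)
cvModel = crossval_example.fit(traindf)
print("Cross-validated RMSE:", cvModel.avgMetrics[0])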
def randomClassifier(dataset_add, feature_colm, label_colm, relation_list, relation): try: # dataset = spark.read.parquet(dataset_add) dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=';') dataset.show() label = '' for y in label_colm: label = y print(label) # # summaryList = ['mean', 'stddev', 'min', 'max'] # summaryDict = {} # for colm in feature_colm: # summaryListTemp = [] # for value in summaryList: # summ = list(dataset.select(colm).summary(value).toPandas()[colm]) # summaryListTemp.append(summ) # varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm]) # summaryListTemp.append(varianceListTemp) # summaryDict[colm] = summaryListTemp # summaryList.append('variance') # summaryDict['summaryName'] = summaryList # # print(summaryDict) # print(summaryDict) # varianceDict = {} # for colm in feature_colm: # varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm]) # varianceDict[colm] = varianceListTemp # print(varianceDict) # summaryAll = {'summaryDict': summaryDict, 'varianceDict': varianceDict} # print(summaryAll) # extracting the schema schemaDataset = dataset.schema stringFeatures = [] numericalFeatures = [] for x in schemaDataset: if (str(x.dataType) == "StringType"): for y in feature_colm: if x.name == y: stringFeatures.append(x.name) else: for y in feature_colm: if x.name == y: numericalFeatures.append(x.name) print(stringFeatures) print(numericalFeatures) summaryList = ['mean', 'stddev', 'min', 'max'] summaryDict = {} for colm in numericalFeatures: summaryListTemp = [] for value in summaryList: summ = list( dataset.select(colm).summary(value).toPandas()[colm]) summaryListTemp.append(summ) varianceListTemp = list( dataset.select(variance( col(colm)).alias(colm)).toPandas()[colm]) summaryListTemp.append(varianceListTemp) summaryDict[colm] = summaryListTemp summaryList.append('variance') summaryDict['summaryName'] = summaryList summaryDict['categoricalColumn'] = stringFeatures print(summaryDict) # print(val) if relation == 'linear': dataset = dataset if relation == 'non_linear': dataset = Relationship(dataset, relation_list) # calling pearson test fuction response_pearson_test = Correlation_test_imp( dataset=dataset, features=numericalFeatures, label_col=label) # dataset = dataset.withColumnRenamed(label , 'indexed_'+ label) # dataset_pearson = dataset # # label_indexer = StringIndexer(inputCol=label, outputCol='indexed_'+label).fit(dataset) # dataset = label_indexer.transform(dataset) ########################################################################### indexed_features = [] encoded_features = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm).fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) # dataset.show() # encoder = OneHotEncoderEstimator(inputCols=['indexed_'+colm], outputCols=['encoded_'+colm]).fit(dataset) # encoded_features.append('encoded_'+colm) # dataset = encoder.transform(dataset) # dataset.show() print(indexed_features) print(encoded_features) # combining both the features colm together final_features = numericalFeatures + indexed_features print(final_features) # now using the vector assembler featureassembler = VectorAssembler(inputCols=final_features, outputCol="features") dataset = featureassembler.transform(dataset) dataset.show() # output.show() # output.select("features").show() # output_features = dataset.select("features") #using the vector indexer vec_indexer = 
VectorIndexer(inputCol='features', outputCol='vec_indexed_features', maxCategories=4).fit(dataset) categorical_features = vec_indexer.categoryMaps print("Chose %d categorical features: %s" % (len(categorical_features), ", ".join( str(k) for k in categorical_features.keys()))) vec_indexed = vec_indexer.transform(dataset) vec_indexed.show() # preparing the finalized data finalized_data = vec_indexed.select(label, 'vec_indexed_features') finalized_data.show() # renaming the colm # print (label) # dataset.withColumnRenamed(label,"label") # print (label) # dataset.show() # f = "" # f = label + " ~ " # # for x in features: # f = f + x + "+" # f = f[:-1] # f = (f) # # formula = RFormula(formula=f, # featuresCol="features", # labelCol="label") # # output = formula.fit(dataset).transform(dataset) # # output_2 = output.select("features", "label") # # output_2.show() # # # # splitting the dataset into taining and testing train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40) rf = RandomForestRegressor(labelCol=label, featuresCol='vec_indexed_features', numTrees=10) # Convert indexed labels back to original labels. # Train model. This also runs the indexers. model = rf.fit(train_data) # Make predictions. predictions = model.transform(test_data) # Select example rows to display. # predictions.select("prediction", "label", "features").show(10) print(model.featureImportances) feature_importance = model.featureImportances.toArray().tolist() print(feature_importance) features_column_for_user = numericalFeatures + stringFeatures feature_imp = { 'feature_importance': feature_importance, "feature_column": features_column_for_user } response_dict = { 'feature_importance': feature_imp, 'pearson_test_data': response_pearson_test, 'summaryDict': summaryDict } return response_dict print(response_dict) # Select (prediction, true label) and compute test error # evaluator = MulticlassClassificationEvaluator( # labelCol="label", predictionCol="prediction", metricName="accuracy") # accuracy = evaluator.evaluate(predictions) # print("Test Error = %g" % (1.0 - accuracy)) # rfModel = model.stages[2] # print(rfModel) # summary only except Exception as e: print("exception is = " + str(e))
def GradientBoostingClassification(self, dataset_add, feature_colm, label_colm, relation_list, relation): try: dataset = spark.read.csv(dataset_add, sep=';', header=True, inferSchema=True) dataset.show() stepSize = self.learningRate label = '' for val in label_colm: label = val #ETL part Schema = dataset.schema stringFeatures = [] numericalFeatures = [] for x in Schema: if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType' or str(x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType' or str(x.dataType) == 'BinaryType'): for y in feature_colm: if x.name == y: dataset = dataset.withColumn( y, dataset[y].cast(StringType())) stringFeatures.append(x.name) else: for y in feature_colm: if x.name == y: numericalFeatures.append(x.name) if relation == 'linear': dataset = dataset if relation == 'non_linear': dataset = Relationship(dataset, relation_list) categoryColmList = [] categoryColmListFinal = [] categoryColmListDict = {} countOfCategoricalColmList = [] for value in stringFeatures: categoryColm = value listValue = value listValue = [] categoryColm = dataset.groupby(value).count() countOfCategoricalColmList.append(categoryColm.count()) categoryColmJson = categoryColm.toJSON() for row in categoryColmJson.collect(): categoryColmSummary = json.loads(row) listValue.append(categoryColmSummary) categoryColmListDict[value] = listValue if not stringFeatures: maxCategories = 5 else: maxCategories = max(countOfCategoricalColmList) for x in Schema: if (str(x.dataType) == "StringType" and x.name == label): for labelkey in label_colm: label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label).fit(dataset) dataset = label_indexer.transform(dataset) label = 'indexed_' + label else: label = label indexed_features = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm).fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) final_features = numericalFeatures + indexed_features featureassembler = VectorAssembler(inputCols=final_features, outputCol="features") dataset = featureassembler.transform(dataset) vectorIndexer = VectorIndexer( inputCol='features', outputCol='vectorIndexedFeatures', maxCategories=maxCategories).fit(dataset) dataset = vectorIndexer.transform(dataset) trainDataRatioTransformed = self.trainDataRatio testDataRatio = 1 - trainDataRatioTransformed trainingData, testData = dataset.randomSplit( [trainDataRatioTransformed, testDataRatio], seed=0) gradientBoostingmodel = GBTClassifier( labelCol=label, featuresCol='vectorIndexedFeatures', maxIter=10, stepSize=stepSize) gradientBoostFittingTrainingData = gradientBoostingmodel.fit( trainingData) gBPredictionTrainData = gradientBoostFittingTrainingData.transform( trainingData) gBPredictionTestData = gradientBoostFittingTrainingData.transform( testData) gBPredictionTestData.select('prediction', label).show() # gbtModel = gradientBoostFittingTrainingData.stages featureImportance = gradientBoostFittingTrainingData.featureImportances.toArray( ).tolist() print(featureImportance) # prediction graph data from pyspark.sql.functions import col TrainPredictedTargetData = gBPredictionTrainData.select( label, 'prediction', 'probability', 'rawPrediction') residualsTrainData = TrainPredictedTargetData.withColumn( 'residuals', col(label) - col('prediction')) residualsTrainData.show() TestPredictedTargetData = gBPredictionTestData.select( label, 'prediction', 'probability', 'rawPrediction') residualsTestData = TestPredictedTargetData.withColumn( 
'residuals', col(label) - col('prediction')) residualsTestData.show() # train Test data Metrics gBPredictionDataDict = { 'gBPredictionTestData': gBPredictionTestData, 'gBPredictionTrainData': gBPredictionTrainData } metricsList = [ 'f1', 'weightedPrecision', 'weightedRecall', 'accuracy' ] for key, value in gBPredictionDataDict.items(): if key == 'gBPredictionTestData': testDataMetrics = {} for metric in metricsList: evaluator = MulticlassClassificationEvaluator( labelCol=label, predictionCol="prediction", metricName=metric) metricValue = evaluator.evaluate(gBPredictionTestData) testDataMetrics[metric] = metricValue print('testDataMetrics :', testDataMetrics) if key == 'gBPredictionTrainData': trainDataMetrics = {} for metric in metricsList: evaluator = MulticlassClassificationEvaluator( labelCol=label, predictionCol="prediction", metricName=metric) metricValue = evaluator.evaluate(gBPredictionTrainData) trainDataMetrics[metric] = metricValue print('trainDataMetrics :', trainDataMetrics) # while fitting the training data totalNumberTrees = gradientBoostFittingTrainingData.getNumTrees print('Total number of trees used is :', totalNumberTrees) totalNumberNodes = gradientBoostFittingTrainingData.totalNumNodes print('Total number of node is :', totalNumberNodes) treeWeight = gradientBoostFittingTrainingData.treeWeights print('Weights on each tree is :', treeWeight) treeInfo = gradientBoostFittingTrainingData.trees for eachTree in treeInfo: print('info of each tree is :', eachTree) except Exception as e: print('exception is --', e)
def chi_square_test(dataset, features, label_col, stringFeatures):
    spark = SparkSession.builder.appName("predictive_analysis").master(
        "local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    length = features.__len__()
    datasetChi = dataset

    featureassembler = VectorAssembler(inputCols=features,
                                       outputCol="features",
                                       handleInvalid="skip")
    datasetChi = featureassembler.transform(datasetChi)
    datasetChi.show()

    vec_indexer = VectorIndexer(inputCol='features',
                                outputCol='vec_indexed_features',
                                maxCategories=4,
                                handleInvalid="skip").fit(datasetChi)
    categorical_features = vec_indexer.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categorical_features),
           ", ".join(str(k) for k in categorical_features.keys())))
    vec_indexed = vec_indexer.transform(datasetChi)
    vec_indexed.show()

    finalized_data = vec_indexed.select(label_col, 'vec_indexed_features')
    finalized_data.show()

    # feature selection with the chi-squared selector
    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="vec_indexed_features",
                             outputCol="selected_features",
                             labelCol=label_col)
    result = selector.fit(finalized_data).transform(finalized_data)
    print("chi2 output with top %d features selected" % selector.getNumTopFeatures())
    result.show()

    # running the chi-square value test
    r = ChiSquareTest.test(result, "selected_features", label_col).head()
    p_values = list(r.pValues)
    PValues = []
    for val in p_values:
        PValues.append(round(val, 4))
    print(PValues)
    dof = list(r.degreesOfFreedom)
    stats = list(r.statistics)
    statistics = []
    for val in stats:
        statistics.append(round(val, 4))
    print(statistics)

    chiSquareDict = {}
    for pval, doF, stat, colm in zip(PValues, dof, statistics, stringFeatures):
        print(pval, doF, stat)
        chiSquareDict[colm] = pval, doF, stat
    chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
    print(chiSquareDict)

    return_data = {'pvalues': chiSquareDict}
    return return_data
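For reference, a minimal self-contained sketch of pyspark.ml.stat.ChiSquareTest on a toy DataFrame; the data and column names here are illustrative only.

# Illustrative only: ChiSquareTest on a small hand-made DataFrame.
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("chi2_demo").getOrCreate()
toy = spark.createDataFrame(
    [(0.0, Vectors.dense(0.5, 10.0)),
     (0.0, Vectors.dense(1.5, 20.0)),
     (1.0, Vectors.dense(1.5, 30.0)),
     (0.0, Vectors.dense(3.5, 30.0)),
     (1.0, Vectors.dense(3.5, 40.0))],
    ["label", "features"])

r = ChiSquareTest.test(toy, "features", "label").head()
print("pValues:", [round(v, 4) for v in r.pValues])
print("degreesOfFreedom:", list(r.degreesOfFreedom))
print("statistics:", [round(v, 4) for v in r.statistics])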
# In[280]:

# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(transformed)
labelIndexer.transform(transformed).show(5, False)

# In[265]:

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(transformed)
featureIndexer.transform(transformed).show(5, True)

# In[281]:

data.show(2, False)

# In[282]:

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])
trainingData.show(5, False)
testData.show(5, False)

# In[283]:
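A short sketch of how the two indexers above are usually chained with an estimator in a Pipeline; the GBTClassifier choice and the IndexToString stage are illustrative assumptions, not taken from the notebook.

# Illustrative only: combine labelIndexer and featureIndexer with a classifier.
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import IndexToString

gbt = GBTClassifier(featuresCol="indexedFeatures", labelCol="indexedLabel")
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])
model = pipeline.fit(trainingData)
model.transform(testData).select("label", "predictedLabel").show(5, False)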
def linearReg(self, dataset_add, feature_colm, label_colm, relation_list, relation, userId, locationAddress): try: dataset = spark.read.parquet(dataset_add) dataset.show() label = '' for val in label_colm: label = val #ETL part Schema = dataset.schema stringFeatures = [] numericalFeatures = [] for x in Schema: if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType' or str(x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType' or str(x.dataType) == 'BinaryType'): for y in feature_colm: if x.name == y: dataset = dataset.withColumn( y, dataset[y].cast(StringType())) stringFeatures.append(x.name) else: for y in feature_colm: if x.name == y: numericalFeatures.append(x.name) if relation == 'linear': dataset = dataset if relation == 'non_linear': dataset = Relationship(dataset, relation_list) categoryColmList = [] categoryColmListFinal = [] categoryColmListDict = {} countOfCategoricalColmList = [] for value in stringFeatures: categoryColm = value listValue = value listValue = [] categoryColm = dataset.groupby(value).count() countOfCategoricalColmList.append(categoryColm.count()) categoryColmJson = categoryColm.toJSON() for row in categoryColmJson.collect(): categoryColmSummary = json.loads(row) listValue.append(categoryColmSummary) categoryColmListDict[value] = listValue if not stringFeatures: maxCategories = 5 else: maxCategories = max(countOfCategoricalColmList) for x in Schema: if (str(x.dataType) == "StringType" and x.name == label): for labelkey in label_colm: label_indexer = StringIndexer( inputCol=label, outputCol='indexed_' + label, handleInvalid="skip").fit(dataset) dataset = label_indexer.transform(dataset) label = 'indexed_' + label else: label = label indexed_features = [] # encodedFeatures = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) '''from pyspark.ml.feature import OneHotEncoderEstimator oneHotEncodedFeaturesList = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) oneHotEncodedFeaturesList.append('OneHotEncoded_' + colm) oneHotEncoder=OneHotEncoderEstimator(inputCols=indexed_features, outputCols=oneHotEncodedFeaturesList) oneHotEncoderFit=oneHotEncoder.fit(dataset) oneHotEncoderFeaturesDataset=oneHotEncoderFit.transform(dataset)''' featureAssembler = VectorAssembler(inputCols=indexed_features + numericalFeatures, outputCol='features', handleInvalid="skip") dataset = featureAssembler.transform(dataset) vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures', maxCategories=maxCategories, handleInvalid="skip").fit(dataset) dataset = vectorIndexer.transform(dataset) trainDataRatioTransformed = self.trainDataRatio testDataRatio = 1 - trainDataRatioTransformed train_data, test_data = dataset.randomSplit( [trainDataRatioTransformed, testDataRatio], seed=40) lr = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label) regressor = lr.fit(train_data) # locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/' print("coefficient : " + str(regressor.coefficients)) coefficient_t = str(regressor.coefficients) print("intercept : " + str(regressor.intercept)) intercept_t = str(regressor.intercept) featurePredictedLabel = feature_colm featurePredictedLabel.append('prediction') 
featurePredictedLabel.append(label) # testDataEvaluation = regressor.evaluate(test_data) # testDataPrediction = testDataEvaluation.predictions # testDataPrediction.select(featurePredictedLabel).show() prediction = regressor.evaluate(test_data) prediction_val = prediction.predictions testDataPrediction = prediction_val.select(featurePredictedLabel) # storing test predicted value to the dataset prediction_val_pand = prediction_val.select( label, "prediction").toPandas() prediction_val_pand = prediction_val_pand.assign( residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"]) prediction_val_pand_residual = prediction_val_pand["residual_vall"] prediction_val_pand_label = prediction_val_pand[label] prediction_val_pand_predict = prediction_val_pand["prediction"] lr_prediction = regressor.transform(test_data) lr_prediction.groupBy(label, "prediction").count().show() lr_prediction_quantile = lr_prediction.select(label, "prediction") training_summary = regressor.summary print("numof_Iterations...%d\n" % training_summary.totalIterations) print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory)) print("RMSE...%f\n" % training_summary.rootMeanSquaredError) RMSE = training_summary.rootMeanSquaredError print("MSE....%f\n" % training_summary.meanSquaredError) MSE = training_summary.meanSquaredError print("r**2(r-square)....::%f\n" % training_summary.r2) r_square = training_summary.r2 print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj) adjsted_r_square = training_summary.r2adj print("deviance residuals %s" % str(training_summary.devianceResiduals)) training_summary.residuals.show() residual_graph = training_summary.residuals residual_graph_pandas = residual_graph.toPandas() print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors)) coefficientStdError = str( training_summary.coefficientStandardErrors) print(" Tvalues :\n" + str(training_summary.tValues)) T_values = str(training_summary.tValues) tValuesList = training_summary.tValues print(" p values :\n" + str(training_summary.pValues)) P_values = str(training_summary.pValues) coefficientList = list(regressor.coefficients) #summaryData import pyspark.sql.functions as F import builtins round = getattr(builtins, 'round') print(coefficientList) coefficientListRounded = [] for value in coefficientList: coefficientListRounded.append(round(value, 4)) # print(coefficientListRounded) # print(intercept_t) interceptRounded = round(float(intercept_t), 4) # print(interceptRounded) # print(RMSE) RMSERounded = round(RMSE, 4) # print(RMSERounded) MSERounded = round(MSE, 4) rSquareRounded = round(r_square, 4) adjustedrSquareRounded = round(adjsted_r_square, 4) coefficientStdError = training_summary.coefficientStandardErrors coefficientStdErrorRounded = [] for value in coefficientStdError: coefficientStdErrorRounded.append(round(float(value), 4)) print(coefficientStdErrorRounded) tValuesListRounded = [] for value in tValuesList: tValuesListRounded.append(round(value, 4)) print(tValuesListRounded) pValuesListRounded = [] PValuesList = training_summary.pValues for value in PValuesList: pValuesListRounded.append(round(value, 4)) print(pValuesListRounded) # regression equation intercept_t = float(intercept_t) coefficientList = list(regressor.coefficients) equation = label, '=', interceptRounded, '+' for feature, coeff in zip(feature_colm, coefficientListRounded): coeffFeature = coeff, '*', feature, '+' equation += coeffFeature equation = equation[:-1] print(equation) equationAsList = 
list(equation) '''# statTable function def summaryTable(self,featuresName,featuresStat): statTable={} for name, stat in zip(featuresName.values(), featuresStat.values()): print(name, ": ", stat) statTable[name]=stat return statTable ''' # significance value PValuesList = training_summary.pValues significanceObject = {} for pValue in pValuesListRounded: if (0 <= pValue < 0.001): significanceObject[pValue] = '***' if (0.001 <= pValue < 0.01): significanceObject[pValue] = '**' if (0.01 <= pValue < 0.05): significanceObject[pValue] = '*' if (0.05 <= pValue < 0.1): significanceObject[pValue] = '.' if (0.1 <= pValue < 1): significanceObject[pValue] = '-' print(significanceObject) # storing test predicted value to the dataset predictionData = 'prediction.parquet' predictionDataStoring = locationAddress + userId + predictionData testDataPrediction.write.parquet(predictionDataStoring, mode='overwrite') # residual vs predicted value prediction_data = regressor.summary.predictions prediction_data.show() prediction_data.select(['prediction']).show() predicted = prediction_data.select(['prediction']) regressor.summary.residuals.show() residuals = regressor.summary.residuals pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id()) res_d = residuals.withColumn('row_index', f.monotonically_increasing_id()) pred_residuals = pred_d.join( res_d, on=['row_index']).sort('row_index').drop('row_index') pred_residuals.show() QQPlot = 'QQPlot.parquet' # locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/' # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67' QQPlotAddress = locationAddress + userId + QQPlot pred_residuals.write.parquet(QQPlotAddress, mode='overwrite') # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet', # mode='overwrite') #################################################################################3 # scale location plot from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev df_label = prediction_data.select( label, 'prediction', sqrt(ab(prediction_data[label])).alias("sqrt_label")) df_label.show() df_sqrt_label_index = df_label.withColumn( 'row_index', f.monotonically_increasing_id()) df_sqrt_label_index.show() res_d.show() sqrt_label_residual_join = df_sqrt_label_index.join( res_d, on=['row_index']).sort('row_index').drop('row_index') sqrt_label_residual_join.show() std_resid = sqrt_label_residual_join.select( 'sqrt_label', 'prediction', (sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias('std_res')) std_resid.show() sqrt_std_res = std_resid.select( "std_res", 'prediction', sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid")) sqrt_std_res.show() sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid') scaleLocationPlot = 'scaleLocation.parquet' scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress, mode='overwrite') # sqrt_std_res_fitted.write.parquet( # 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet', # mode='overwrite') ########### #QQplot # QUANTILE from scipy.stats import norm import statistics import math res_d.show() sorted_res = res_d.sort('residuals') sorted_res.show() # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'), # meann(col('residuals')).alias('mean')) # stdev_ress.show() # mean_residual = stdev_ress.select(['mean']).toPandas() # l = mean_residual.values.tolist() # print(l) 
# stddev_residual = stdev_ress.select(['std_dev']).toPandas() # length of the sorted std residuals count = sorted_res.groupBy().count().toPandas() countList = count.values.tolist() tuple1 = () for k in countList: tuple1 = k for tu in tuple1: lengthResiduals = tu print(lengthResiduals) quantileList = [] for x in range(0, lengthResiduals): quantileList.append((x - 0.5) / (lengthResiduals)) print(quantileList) # Z-score on theoritical quantile zTheoriticalTrain = [] for x in quantileList: zTheoriticalTrain.append(norm.ppf(abs(x))) print(zTheoriticalTrain) sortedResidualPDF = sorted_res.select('residuals').toPandas() sortedResidualPDF = sortedResidualPDF['residuals'] stdevResidualTrain = statistics.stdev(sortedResidualPDF) meanResidualTrain = statistics.mean(sortedResidualPDF) zPracticalTrain = [] for x in sortedResidualPDF: zPracticalTrain.append( (x - meanResidualTrain) / stdevResidualTrain) ########## target = dataset.select(label) pred = prediction_data.select(['prediction']) pred_d = pred.withColumn('row_index', f.monotonically_increasing_id()) target_d = target.withColumn('row_index', f.monotonically_increasing_id()) pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index') pred_target.show() dataset.show() pred_target_data_update = dataset.join(pred_target, on=[label]) pred_target_data_update.show(100) ##########3 # table_response = { # # "Intercept": intercept_t, # "Coefficients": coefficient_t, # "RMSE": RMSE, # "MSE": MSE, # "R_square": r_square, # "Adj_R_square": adjsted_r_square, # "coefficientStdError": coefficientStdError, # "T_value": T_values, # "P_value": P_values # # } y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_label = lr_prediction_quantile.approxQuantile( label, x, 0.01) quantile_prediction = lr_prediction_quantile.approxQuantile( "prediction", x, 0.01) Q_label_pred = '' print(len(quantile_label)) length = len(quantile_label) for i in range(0, len(quantile_label)): Q_label_pred += str(quantile_label[i]) + 't' + str( quantile_prediction[i]) + 'n' import math fitted_residual = '' print(len(prediction_val_pand_residual)) length = len(prediction_val_pand_residual) for i in range(0, len(prediction_val_pand_residual)): fitted_residual += str( prediction_val_pand_predict[i]) + 't' + str( prediction_val_pand_residual[i]) + 'n' ## scale location graph data prediction_val_pand_residual prediction_val_pand_predict prediction_val_pand_residual_abs = prediction_val_pand_residual.abs( ) import math sqrt_residual = [] for x in prediction_val_pand_residual_abs: sqrt_residual.append(math.sqrt(x)) # print ("____________________ ",x) sqrt_residual # calculating std deviation import statistics print(statistics.stdev(prediction_val_pand_residual)) stdev_ = statistics.stdev(prediction_val_pand_residual) # calcuate stnd residuals std_res = [] for x in prediction_val_pand_residual: std_res.append(x / stdev_) print(std_res) # calculating the square root of std_res import math sqr_std_res = [] for x in std_res: sqr_std_res.append(math.sqrt(abs(x))) print(sqr_std_res) scale_predict_residual = '' for pre, res in zip(prediction_val_pand_predict, sqr_std_res): scale_predict_residual += str(pre) + 't' + str(res) + 'n' print(scale_predict_residual) # QUANTILE y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_std_res = spark.createDataFrame(std_res, FloatType()) quantile_std_res.show() quantile_std_res_t = quantile_std_res.approxQuantile( 'value', x, 0.01) print(quantile_std_res_t) print(x) # calculating the z_score from 
scipy.stats import norm ## sort the list sorted_std_res = sorted(std_res) mean = statistics.mean(sorted_std_res) stdev = statistics.stdev(sorted_std_res) # print(mean) quantile = [] n = len(std_res) print(n) for x in range(0, n): quantile.append((x - 0.5) / (n)) print(quantile) # z_score theoratical z_theory = [] for x in quantile: z_theory.append(norm.ppf(abs(x))) # z score for real val z_pract = [] for x in sorted_std_res: z_pract.append((x - mean) / stdev) Q_label_pred = '' for quant, val in zip(z_theory, z_pract): Q_label_pred += str(quant) + 't' + str(val) + 'n' graph_response = { "Q_Q_plot": Q_label_pred, "residual_fitted": fitted_residual, "scale_location": scale_predict_residual } tableContent = \ { 'coefficientValuesKey': coefficientListRounded, 'tValuesKey': tValuesListRounded, 'pValuesKey': pValuesListRounded, 'significanceValuesKey': significanceObject, 'interceptValuesKey': interceptRounded, "RMSE": RMSERounded, "RSquare": rSquareRounded, "AdjRSquare": adjustedrSquareRounded, "CoefficientStdError": coefficientStdErrorRounded, 'equationKey': equation } json_response = { 'table_data': tableContent, 'graph_data': graph_response } print(json_response) return (json_response) except Exception as e: print('exception is =' + str(e))
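The significance-code loop above keys significanceObject by the rounded p-value itself, so coefficients that share a p-value overwrite one another; a minimal sketch of the same star mapping keyed by coefficient position (function and variable names are illustrative, not the author's):

# Minimal sketch: significance codes per coefficient, assuming a list of
# rounded p-values like pValuesListRounded above (names are illustrative).
def significance_codes(p_values):
    codes = {}
    for idx, p in enumerate(p_values):
        if p < 0.001:
            codes[idx] = '***'
        elif p < 0.01:
            codes[idx] = '**'
        elif p < 0.05:
            codes[idx] = '*'
        elif p < 0.1:
            codes[idx] = '.'
        else:
            codes[idx] = '-'
    return codes

# e.g. significance_codes([0.0004, 0.03, 0.42]) -> {0: '***', 1: '*', 2: '-'}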
def Logistic_regression(dataset_add, feature_colm, label_colm): dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=";") dataset.show() dataset.groupBy("y").count().show() label = '' for y in label_colm: label = y print(label) # using the rformula for indexing, encoding and vectorising # f = "" # f = label + " ~ " # # for x in features: # f = f + x + "+" # f = f[:-1] # f = (f) # extracting the schema val = dataset.schema string_features = [] integer_features = [] for x in val: if (str(x.dataType) == "StringType"): for y in feature_colm: if x.name == y: string_features.append(x.name) else: for y in feature_colm: if x.name == y: integer_features.append(x.name) print(string_features) print(integer_features) print(val) # print(label) # label = 'y' for z in val: if (z.name == label and str(z.dataType) == "StringType"): label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label).fit(dataset) dataset = label_indexer.transform(dataset) if (z.name == label and str(z.dataType) == ("IntegerType" or "FloatType" or "DoubleType")): dataset = dataset.withColumnRenamed(label, 'indexed_' + label) ########################################################################### indexed_features = [] encoded_features = [] for col in string_features: indexer = StringIndexer(inputCol=col, outputCol='indexed_' + col).fit(dataset) indexed_features.append('indexed_' + col) dataset = indexer.transform(dataset) # dataset.show() # encoder = OneHotEncoderEstimator(inputCols=['indexed_'+col], outputCols=['encoded_'+col]).fit(dataset) # encoded_features.append('encoded_'+col) # dataset = encoder.transform(dataset) # dataset.show() print(indexed_features) print(encoded_features) # combining both the features colm together final_features = integer_features + indexed_features print(final_features) # now using the vector assembler featureassembler = VectorAssembler(inputCols=final_features, outputCol="features") dataset = featureassembler.transform(dataset) dataset.show() # combining both the features colm together # output.show() # output.select("features").show() # output_features = dataset.select("features") # using the vector indexer (for categorical data kind of one hot encoding) vec_indexer = VectorIndexer(inputCol='features', outputCol='vec_indexed_features', maxCategories=15).fit(dataset) categorical_features = vec_indexer.categoryMaps print("Chose %d categorical features: %s" % (len(categorical_features), ", ".join( str(k) for k in categorical_features.keys()))) vec_indexed = vec_indexer.transform(dataset) vec_indexed.show() # preparing the finalized data finalized_data = vec_indexed.select('indexed_' + label, 'vec_indexed_features') finalized_data.show() # formula = RFormula(formula=f, # featuresCol="features", # labelCol="label") # # output = formula.fit(dataset).transform(dataset) # # output_2 = output.select("features", "label") # # output_2.show() # splitting the dataset into train and test train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40) # implementing the logistic regression # lr1 =LogisticRegression() Accuracy_list = [] # Accuracy_list.append(accuracy) FPR_list = [] # FPR_list.append(falsePositiveRate) TPR_list = [] precision_list = [] recall_list = [] y = 0.1 # x=[] for i in range(0, 3): y = round(y + 0.1, 2) lr = LogisticRegression(featuresCol='vec_indexed_features', labelCol='indexed_' + label, maxIter=5, regParam=0.1, elasticNetParam=1.0, threshold=0.3) # fit the model lrModel = lr.fit(train_data) lrModel # print the coefficients and the intercept 
for the logistic regression print("coefficients:" + str(lrModel.coefficientMatrix)) # mat = (lrModel.coefficientMatrix) # print mat print("intercept: " + str(lrModel.interceptVector)) # getting the summary of the model # f-measure calculation from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary training_summary = lrModel.summary BinaryLogisticRegressionTrainingSummary.accuracy print(" area under roc : ", training_summary.areaUnderROC) print(" roc : ", training_summary.roc) roc = training_summary.roc roc.show() print(" pr value : ", training_summary.pr) pr = training_summary.pr pr.show() print(" precision by threshold : ", training_summary.precisionByThreshold) prec_by_threshold = training_summary.precisionByThreshold prec_by_threshold.show() print(" accuracy : ", training_summary.accuracy) accuracy_d = training_summary.accuracy print(accuracy_d) fMeasure = training_summary.fMeasureByThreshold fMeasure.show() maxFMeasure = fMeasure.groupBy().max('F-Measure').select( 'max(F-Measure)').head() bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \ .select('threshold').head()['threshold'] lr.setThreshold(bestThreshold) # obtain the objective per iteration objectiveHistory = training_summary.objectiveHistory print("objectiveHistory") for objective in objectiveHistory: print(objective) # for a multiclass we can inspect a matrix on a per label basis print("false positive rate by label:") for i, rate in enumerate( training_summary.falsePositiveRateByLabel): print("label %d: %s" % (i, rate)) print("True positive rate") for i, rate in enumerate(training_summary.truePositiveRateByLabel): print("label %d : %s" % (i, rate)) # # print("True Negative rate") # for i, rate in enumerate(training_summary) print("Precision by label:") for i, prec in enumerate(training_summary.precisionByLabel): print("label %d: %s" % (i, prec)) print("Recall by label:") for i, rec in enumerate(training_summary.recallByLabel): print("label %d: %s" % (i, rec)) print("F-measure by label:") for i, f in enumerate(training_summary.fMeasureByLabel()): print("label %d: %s" % (i, f)) accuracy = training_summary.accuracy falsePositiveRate = training_summary.weightedFalsePositiveRate truePositiveRate = training_summary.weightedTruePositiveRate fMeasure = training_summary.weightedFMeasure() precision = training_summary.weightedPrecision recall = training_summary.weightedRecall print( "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s" % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall)) # Accuracy_list = [] Accuracy_list.append(accuracy) # FPR_list = [] FPR_list.append(falsePositiveRate) # TPR_list=[] TPR_list.append(truePositiveRate) precision_list.append(precision) recall_list.append(recall) print(Accuracy_list) print(FPR_list) print(TPR_list) print(precision_list) print(recall_list) import matplotlib.pyplot as plt # # plt.plot(recall_list, FPR_list) # plt.show() # # fpr = [0.0,0.0,0.0,0.0,0.003067484662576687, 0.003067484662576687, 0.006134969325153374, 0.11042944785276074, 0.1165644171779141, 0.1165644171779141, 0.23006134969325154, 0.9723926380368099, 0.9846625766871165 ] # tpr = [0.0, 0.09767441860465116, 0.10232558139534884, 0.13488372093023257 ,0.17674418604651163 ,0.3674418604651163 , 0.37209302325581395 , 0.7534883720930232, 0.8651162790697674 , 0.8697674418604651 , 0.9069767441860465, 0.9953488372093023, 1.0] # data visualization # ROC graph fpr = roc.select("FPR").toPandas() tpr = roc.select("TPR").toPandas() 
    plt.plot(fpr, tpr)
    plt.show()

    # PR graph
    pr_recall = pr.select("recall").toPandas()
    pr_precision = pr.select("precision").toPandas()
    plt.plot(pr_precision, pr_recall)
    plt.show()

    # now applying the fitted model to the test data
    prediction_val = lrModel.transform(test_data)
    prediction_val.groupBy('indexed_' + label, "prediction").count().show()
    prediction_val.show()
    prediction_val.groupBy("prediction").count().show()
    prediction_val.groupBy("prediction", "probability").count().show()
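The threshold tuning above picks the threshold whose F-measure is maximal; a minimal sketch of that selection as a helper, assuming a fitted binary LogisticRegressionModel lrModel:

# Minimal sketch: pick the classification threshold that maximises F-measure,
# assuming a fitted binary LogisticRegressionModel `lrModel` (illustrative names).
import pyspark.sql.functions as F

def best_f_measure_threshold(lrModel):
    fMeasure = lrModel.summary.fMeasureByThreshold            # DataFrame: threshold, F-Measure
    maxF = fMeasure.agg(F.max('F-Measure').alias('maxF')).head()['maxF']
    return fMeasure.where(F.col('F-Measure') == maxF).head()['threshold']

# Apply the tuned threshold to the fitted model before scoring test data.
# lrModel.setThreshold(best_f_measure_threshold(lrModel))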
# In[14]:
from pyspark.ml.feature import StringIndexer

# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(data)
labelIndexer.transform(data)

from pyspark.ml.feature import VectorIndexer

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(data)
featureIndexer.transform(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# In[15]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.util import MLUtils

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
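A hedged sketch of how the indexers and the RandomForestClassifier above are typically chained into a pipeline and evaluated; the IndexToString stage and the evaluator settings are assumptions mirroring the other pipelines in this document:

# Minimal sketch: chain the indexers and the forest, then score and evaluate.
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

model = pipeline.fit(trainingData)
predictions = model.transform(testData)

evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy = %g" % evaluator.evaluate(predictions))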
def randomClassifier(dataset_add, feature_colm, label_colm, relation_list, relation): try: dataset = spark.read.parquet(dataset_add) label = '' for y in label_colm: label = y Schema = dataset.schema stringFeatures = [] numericalFeatures = [] for x in Schema: if (str(x.dataType) == "StringType"): for y in feature_colm: if x.name == y: stringFeatures.append(x.name) else: for y in feature_colm: if x.name == y: numericalFeatures.append(x.name) summaryList = ['mean', 'stddev', 'min', 'max'] summaryDict = {} import pyspark.sql.functions as F import builtins round = getattr(builtins, 'round') for colm in numericalFeatures: summaryListTemp = [] for value in summaryList: summ = list( dataset.select(colm).summary(value).toPandas()[colm]) summaryListSubTemp = [] for val in summ: summaryListSubTemp.append(round(float(val), 4)) # print(summaryListSubTemp) summaryListTemp.append(summaryListSubTemp) # varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm]) # summaryListTemp.append(varianceListTemp) summaryDict[colm] = summaryListTemp # summaryList.append('variance') summaryDict['summaryName'] = summaryList summaryDict['categoricalColumn'] = stringFeatures skewnessList = [] kurtosisList = [] varianceList = [] skewKurtVarDict = {} for colm in numericalFeatures: skewness = (dataset.select(F.skewness(dataset[colm])).toPandas()) for i, row in skewness.iterrows(): for j, column in row.iteritems(): skewnessList.append(round(column, 4)) kurtosis = (dataset.select(F.kurtosis(dataset[colm])).toPandas()) for i, row in kurtosis.iterrows(): for j, column in row.iteritems(): kurtosisList.append(round(column, 4)) variance = (dataset.select(F.variance(dataset[colm])).toPandas()) for i, row in variance.iterrows(): for j, column in row.iteritems(): varianceList.append(round(column, 4)) for skew, kurt, var, colm in zip(skewnessList, kurtosisList, varianceList, numericalFeatures): print(skew, kurt, var) skewKurtVarList = [] skewKurtVarList.append(skew) skewKurtVarList.append(kurt) skewKurtVarList.append(var) skewKurtVarDict[colm] = skewKurtVarList for (keyOne, valueOne), (keyTwo, valueTwo) in zip(summaryDict.items(), skewKurtVarDict.items()): print(keyOne, valueOne, keyTwo, valueTwo) if keyOne == keyTwo: valueOne.extend(valueTwo) summaryDict[keyOne] = valueOne print(summaryDict) print(summaryList.extend(['skewness', 'kurtosis', 'variance'])) print(summaryDict) # for colm in numericalFeatures: # skewness = (dataset.select(F.skewness(dataset[colm])).alias('skewness_' + colm)) # kurtosis = (dataset.select(F.kurtosis(dataset[colm])).alias('kurtosis_' + colm)) # variance = (dataset.select(F.variance(dataset[colm]).alias('kurtosis_' + colm))) if relation == 'linear': dataset = dataset if relation == 'non_linear': dataset = Relationship(dataset, relation_list) dataset.show() for x in Schema: if (str(x.dataType) == "StringType" and x.name == label): for labelkey in label_colm: label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label).fit(dataset) dataset = label_indexer.transform(dataset) label = 'indexed_' + label else: label = label indexed_features = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm).fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) final_features = numericalFeatures + indexed_features response_chi_test = chi_square_test(dataset=dataset, features=indexed_features, label_col=label, stringFeatures=stringFeatures) featureassembler = VectorAssembler(inputCols=final_features, 
outputCol="features") dataset = featureassembler.transform(dataset) dataset.show() vec_indexer = VectorIndexer(inputCol='features', outputCol='vec_indexed_features', maxCategories=4).fit(dataset) categorical_features = vec_indexer.categoryMaps print("Choose %d categorical features: %s" % (len(categorical_features), ", ".join( str(k) for k in categorical_features.keys()))) vec_indexed = vec_indexer.transform(dataset) vec_indexed.show() finalized_data = vec_indexed.select(label, 'vec_indexed_features') train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40) rf = RandomForestClassifier(labelCol=label, featuresCol='vec_indexed_features', numTrees=10) model = rf.fit(train_data) predictions = model.transform(test_data) print(model.featureImportances) feature_importance = model.featureImportances.toArray().tolist() print(feature_importance) import pyspark.sql.functions as F import builtins round = getattr(builtins, 'round') feature_importance = model.featureImportances.toArray().tolist() print(feature_importance) # feature_importance = [round(x,4) for x in feature_importance] featureImportance = [] for x in feature_importance: featureImportance.append(round(x, 4)) print(featureImportance) features_column_for_user = numericalFeatures + stringFeatures feature_imp = { 'feature_importance': featureImportance, "feature_column": features_column_for_user } response_dict = { 'feature_importance': feature_imp, 'ChiSquareTestData': response_chi_test, 'summaryDict': summaryDict } return response_dict except Exception as e: print("exception is = " + str(e))
def lassoRegression(self, dataset_add, feature_colm, label_colm, relation_list, relation, userId): try: dataset = spark.read.parquet(dataset_add) dataset.show() Rsqr_list = [] Rsqr_regPara = {} print(self.xt) # print(data_add) label = '' for val in label_colm: label = val #ETL part Schema = dataset.schema stringFeatures = [] numericalFeatures = [] for x in Schema: if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType' or str(x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType' or str(x.dataType) == 'BinaryType'): for y in feature_colm: if x.name == y: dataset = dataset.withColumn( y, dataset[y].cast(StringType())) stringFeatures.append(x.name) else: for y in feature_colm: if x.name == y: numericalFeatures.append(x.name) if relation == 'linear': dataset = dataset if relation == 'non_linear': dataset = Relationship(dataset, relation_list) categoryColmList = [] categoryColmListFinal = [] categoryColmListDict = {} countOfCategoricalColmList = [] for value in stringFeatures: categoryColm = value listValue = value listValue = [] categoryColm = dataset.groupby(value).count() countOfCategoricalColmList.append(categoryColm.count()) categoryColmJson = categoryColm.toJSON() for row in categoryColmJson.collect(): categoryColmSummary = json.loads(row) listValue.append(categoryColmSummary) categoryColmListDict[value] = listValue if not stringFeatures: maxCategories = 5 else: maxCategories = max(countOfCategoricalColmList) for x in Schema: if (str(x.dataType) == "StringType" and x.name == label): for labelkey in label_colm: label_indexer = StringIndexer( inputCol=label, outputCol='indexed_' + label, handleInvalid="skip").fit(dataset) dataset = label_indexer.transform(dataset) label = 'indexed_' + label else: label = label indexed_features = [] encodedFeatures = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) featureAssembler = VectorAssembler(inputCols=indexed_features + numericalFeatures, outputCol='features', handleInvalid="skip") dataset = featureAssembler.transform(dataset) vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures', maxCategories=maxCategories, handleInvalid="skip").fit(dataset) dataset = vectorIndexer.transform(dataset) trainDataRatioTransformed = self.trainDataRatio testDataRatio = 1 - trainDataRatioTransformed train_data, test_data = dataset.randomSplit( [trainDataRatioTransformed, testDataRatio], seed=40) ######################################################################33 # lasso final for t in self.xt: lr1 = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label, elasticNetParam=1, regParam=t) regressor1 = lr1.fit(train_data) print(t) print("coefficient : " + str(regressor1.coefficients)) reg_sum = regressor1.summary r2 = reg_sum.r2 Rsqr_list.append(r2) Rsqr_regPara[r2] = t print(r2) print(Rsqr_list) print(max(Rsqr_list)) maximum_rsqr = max(Rsqr_list) print(Rsqr_regPara) final_regPara = [] for key, val in Rsqr_regPara.items(): if (key == maximum_rsqr): print(val) final_regPara.append(val) for reg in final_regPara: lr_lasso = LinearRegression( featuresCol="vectorIndexedFeatures", labelCol=label, elasticNetParam=1, regParam=reg) regressor = lr_lasso.fit(train_data) training_summary = regressor.summary r2 = training_summary.r2 print(r2) print("coefficient : " + str(regressor.coefficients)) coefficient_t = str(regressor.coefficients) 
print("intercept : " + str(regressor.intercept)) intercept_t = str(regressor.intercept) prediction = regressor.evaluate(test_data) prediction_val = prediction.predictions prediction_val.show() prediction_val_pand = prediction_val.select( label, "prediction").toPandas() prediction_val_pand = prediction_val_pand.assign( residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"]) prediction_val_pand_residual = prediction_val_pand["residual_vall"] prediction_val_pand_label = prediction_val_pand[label] prediction_val_pand_predict = prediction_val_pand["prediction"] lr_prediction = regressor.transform(test_data) lr_prediction.groupBy(label, "prediction").count().show() lr_prediction_quantile = lr_prediction.select(label, "prediction") lr_prediction_onlypred = lr_prediction.select('prediction') # lr_prediction_quantile.show() # training_summary = regressor.summary print("numof_Iterations...%d\n" % training_summary.totalIterations) print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory)) print("RMSE...%f\n" % training_summary.rootMeanSquaredError) RMSE = training_summary.rootMeanSquaredError print("MSE....%f\n" % training_summary.meanSquaredError) MSE = training_summary.meanSquaredError print("r**2(r-square)....::%f\n" % training_summary.r2) r_square = training_summary.r2 print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj) adjsted_r_square = training_summary.r2adj print("deviance residuals %s" % str(training_summary.devianceResiduals)) training_summary.residuals.show() # residual_graph = training_summary.residuals # test = (residual_graph, lr_prediction_onlypred) # residual_graph.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append' ) # print(test) # test.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode= 'append') # residual_graph_pandas = residual_graph.toPandas() # print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors)) # coefficient_error = str(training_summary.coefficientStandardErrors) # print(" Tvalues :\n" + str(training_summary.tValues)) # T_values = str(training_summary.tValues) # print(" p values :\n" + str(training_summary.pValues)) # P_values = str(training_summary.pValues) ####################################################################################################### table_response = { "Intercept": intercept_t, "Coefficients": coefficient_t, "RMSE": RMSE, "MSE": MSE, "R_square": r_square, "Adj_R_square": adjsted_r_square } ####################################################################################################### # residual vs predicted value prediction_data = regressor.summary.predictions prediction_data.show() prediction_data.select(['prediction']).show() predicted = prediction_data.select(['prediction']) regressor.summary.residuals.show() residuals = regressor.summary.residuals pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id()) res_d = residuals.withColumn('row_index', f.monotonically_increasing_id()) pred_residuals = pred_d.join( res_d, on=['row_index']).sort('row_index').drop('row_index') pred_residuals.show() QQPlot = 'QQPlot.parquet' locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/' # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67' QQPlotAddress = locationAddress + userId + QQPlot pred_residuals.write.parquet(QQPlotAddress, mode='overwrite') # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet', # 
mode='overwrite') #################################################################################3 # scale location plot from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev df_label = prediction_data.select( label, 'prediction', sqrt(ab(prediction_data[label])).alias("sqrt_label")) df_label.show() df_sqrt_label_index = df_label.withColumn( 'row_index', f.monotonically_increasing_id()) df_sqrt_label_index.show() res_d.show() sqrt_label_residual_join = df_sqrt_label_index.join( res_d, on=['row_index']).sort('row_index').drop('row_index') sqrt_label_residual_join.show() std_resid = sqrt_label_residual_join.select( 'sqrt_label', 'prediction', (sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias('std_res')) std_resid.show() sqrt_std_res = std_resid.select( "std_res", 'prediction', sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid")) sqrt_std_res.show() sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid') scaleLocationPlot = 'scaleLocation.parquet' scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress, mode='overwrite') # sqrt_std_res_fitted.write.parquet( # 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet', # mode='overwrite') ########### #QQplot # QUANTILE from scipy.stats import norm import statistics import math res_d.show() sorted_res = res_d.sort('residuals') sorted_res.show() # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'), # meann(col('residuals')).alias('mean')) # stdev_ress.show() # mean_residual = stdev_ress.select(['mean']).toPandas() # l = mean_residual.values.tolist() # print(l) # stddev_residual = stdev_ress.select(['std_dev']).toPandas() # length of the sorted std residuals count = sorted_res.groupBy().count().toPandas() countList = count.values.tolist() tuple1 = () for k in countList: tuple1 = k for tu in tuple1: lengthResiduals = tu print(lengthResiduals) quantileList = [] for x in range(0, lengthResiduals): quantileList.append((x - 0.5) / (lengthResiduals)) print(quantileList) # Z-score on theoritical quantile zTheoriticalTrain = [] for x in quantileList: zTheoriticalTrain.append(norm.ppf(abs(x))) print(zTheoriticalTrain) sortedResidualPDF = sorted_res.select('residuals').toPandas() sortedResidualPDF = sortedResidualPDF['residuals'] stdevResidualTrain = statistics.stdev(sortedResidualPDF) meanResidualTrain = statistics.mean(sortedResidualPDF) zPracticalTrain = [] for x in sortedResidualPDF: zPracticalTrain.append( (x - meanResidualTrain) / stdevResidualTrain) ########## target = dataset.select(label) pred = prediction_data.select(['prediction']) pred_d = pred.withColumn('row_index', f.monotonically_increasing_id()) target_d = target.withColumn('row_index', f.monotonically_increasing_id()) pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index') pred_target.show() dataset.show() pred_target_data_update = dataset.join(pred_target, on=[label]) pred_target_data_update.show(100) ########################################################################################## # scale location plot # for scale location plot # from pyspark.sql.functions import udf # # def std_res(x): # res_list = [] # res_list.append(x) # # std_residuals = udf(lambda y: std_res(y), FloatType()) # # residuals_std = residuals.withColumn('residuals', std_residuals(col('residuals').cast(FloatType()))) # # import statistics # import numpy as np # residuals_panda = 
residuals.toPandas() # # residuals_panda.residuals = range(residuals_panda.shape[1]) # residuals_panda = residuals_panda.values # print(residuals_panda) # stdev_training = statistics.stdev(residuals_panda) # print(stdev_training) ############################################################################################################ # creating the dictionary for storing the result # json_response = coefficient_t # print(json_response) # json_response = {"adjusted r**2 value" : training_summary.r2adj} # DATA VISUALIZATION PART # finding the quantile in the dataset(Q_Q plot) import matplotlib.pyplot as plt y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_label = lr_prediction_quantile.approxQuantile( label, x, 0.01) quantile_prediction = lr_prediction_quantile.approxQuantile( "prediction", x, 0.01) Q_label_pred = '' print(len(quantile_label)) length = len(quantile_label) for i in range(0, len(quantile_label)): Q_label_pred += str(quantile_label[i]) + 't' + str( quantile_prediction[i]) + 'n' import math fitted_residual = '' print(len(prediction_val_pand_residual)) length = len(prediction_val_pand_residual) for i in range(0, len(prediction_val_pand_residual)): fitted_residual += str( prediction_val_pand_predict[i]) + 't' + str( prediction_val_pand_residual[i]) + 'n' ## scale location graph data prediction_val_pand_residual prediction_val_pand_predict prediction_val_pand_residual_abs = prediction_val_pand_residual.abs( ) import math sqrt_residual = [] for x in prediction_val_pand_residual_abs: sqrt_residual.append(math.sqrt(x)) # print ("____________________ ",x) sqrt_residual # calculating std deviation import statistics print(statistics.stdev(prediction_val_pand_residual)) stdev_ = statistics.stdev(prediction_val_pand_residual) # calcuate stnd residuals std_res = [] for x in prediction_val_pand_residual: std_res.append(x / stdev_) print(std_res) # calculating the square root of std_res import math sqr_std_res = [] for x in std_res: sqr_std_res.append(math.sqrt(abs(x))) print(sqr_std_res) scale_predict_residual = '' for pre, res in zip(prediction_val_pand_predict, sqr_std_res): scale_predict_residual += str(pre) + 't' + str(res) + 'n' print(scale_predict_residual) # QUANTILE y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_std_res = spark.createDataFrame(std_res, FloatType()) quantile_std_res.show() quantile_std_res_t = quantile_std_res.approxQuantile( 'value', x, 0.01) print(quantile_std_res_t) print(x) # calculating the z_score from scipy.stats import norm ## sort the list sorted_std_res = sorted(std_res) mean = statistics.mean(sorted_std_res) stdev = statistics.stdev(sorted_std_res) # print(mean) quantile = [] n = len(std_res) print(n) for x in range(0, n): quantile.append((x - 0.5) / (n)) print(quantile) # z_score theoratical z_theory = [] for x in quantile: z_theory.append(norm.ppf(abs(x))) # z score for real val z_pract = [] for x in sorted_std_res: z_pract.append((x - mean) / stdev) Q_label_pred = '' for quant, val in zip(z_theory, z_pract): Q_label_pred += str(quant) + 't' + str(val) + 'n' graph_response = { "Q_Q_plot": Q_label_pred, "residual_fitted": fitted_residual, "scale_location": scale_predict_residual } json_response = { 'table_data': table_response, 'graph_data': graph_response } return json_response except Exception as e: print('exception is =' + str(e))
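The regParam sweep in lassoRegression refits a LinearRegression per value and keeps the R-squared-maximising one; a hedged sketch of the same search expressed with ParamGridBuilder and CrossValidator (column names, the grid values, and the fold count are assumptions):

# Minimal sketch: tune the lasso regularisation strength with cross-validation
# instead of the manual loop above (column names assumed from the ETL step).
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

lasso = LinearRegression(featuresCol='vectorIndexedFeatures', labelCol=label,
                         elasticNetParam=1.0)   # elasticNetParam=1 -> pure L1 / lasso
paramGrid = ParamGridBuilder().addGrid(lasso.regParam, [0.001, 0.01, 0.1, 1.0]).build()
evaluator = RegressionEvaluator(labelCol=label, predictionCol='prediction', metricName='r2')

cv = CrossValidator(estimator=lasso, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=3)
cvModel = cv.fit(train_data)

# Average cross-validated r2 per candidate regParam, plus the winning model.
for params, metric in zip(paramGrid, cvModel.avgMetrics):
    print({p.name: v for p, v in params.items()}, "avg r2:", metric)
bestLasso = cvModel.bestModel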
# deal with categorical label
from pyspark.ml.feature import StringIndexer

# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(data)
labelIndexer.transform(data).show(5, True)

from pyspark.ml.feature import VectorIndexer

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(data)
featureIndexer.transform(data).show(5, True)

# Convert indexed labels back to original labels.
from pyspark.ml.feature import IndexToString
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# split data into training and test data sets
(trainingData, testData) = data.randomSplit([0.6, 0.4])

# visualization
import numpy as np
import itertools
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
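A hedged sketch of wiring pipeline predictions into a confusion-matrix plot such as the plot_confusion_matrix helper above; the MulticlassMetrics extraction and the predictions DataFrame are assumptions:

# Minimal sketch: build a confusion matrix from pipeline predictions and plot it,
# assuming a `predictions` DataFrame with 'prediction' and 'indexedLabel' columns.
from pyspark.mllib.evaluation import MulticlassMetrics

predictions_and_labels = predictions.select('prediction', 'indexedLabel') \
    .rdd.map(lambda row: (float(row[0]), float(row[1])))
metrics = MulticlassMetrics(predictions_and_labels)
cm = metrics.confusionMatrix().toArray()

class_names = labelIndexer.labels          # original label strings, in index order
plot_confusion_matrix(cm, classes=class_names, normalize=False)
plt.show()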
        spark.stop()
        data["trainSI"] = trainPath
        data["testSI"] = testPath
        data["currentTrain"] = trainPath
        data["currentTest"] = testPath
    elif config["transformerType"] == "vi":
        train, test = spark.read.parquet(data["currentTrain"]), spark.read.parquet(data["currentTest"])
        train.cache()
        test.cache()
        df = train.unionByName(test)
        featureIndexer = VectorIndexer(inputCol=config["inputCol"],
                                       outputCol=config["outputCol"],
                                       maxCategories=config["maxCategories"]).fit(df)
        train = featureIndexer.transform(train)
        test = featureIndexer.transform(test)
        trainPath = data['scheme'] + "://" + data['save'] + "/trainVI/"
        testPath = data['scheme'] + "://" + data['save'] + "/testVI/"
        if "partitionCol" in data and data['partitionCol'] in train.schema.names:
            train.write.partitionBy(data['partitionCol']).format("parquet").save(trainPath)
            test.write.partitionBy(data['partitionCol']).format("parquet").save(testPath)
        else:
            train.write.format("parquet").mode("overwrite").save(trainPath)
            test.write.format("parquet").mode("overwrite").save(testPath)
        spark.stop()
        data["trainVI"] = trainPath
        data["testVI"] = testPath
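For reference, a sketch of the kind of config and data dictionaries this "vi" branch expects; every key and value below is an illustrative assumption, not taken from an actual job configuration:

# Illustrative (assumed) config/data dictionaries for the "vi" transformer branch above.
config = {
    "transformerType": "vi",
    "inputCol": "features",
    "outputCol": "indexedFeatures",
    "maxCategories": 4,
}
data = {
    "scheme": "hdfs",
    "save": "namenode:9000/tmp/pipeline",                      # assumed base path
    "currentTrain": "hdfs://namenode:9000/tmp/pipeline/trainSI/",
    "currentTest": "hdfs://namenode:9000/tmp/pipeline/testSI/",
    # "partitionCol": "dt",                                    # optional partition column
}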
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

# transforms data into vectors
def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1])]).toDF(['features'])

transformed = transData(df)
transformed.show(5, False)

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(transformed)
data = featureIndexer.transform(transformed)

# create a k-means stage
kmeans = KMeans() \
    .setK(3) \
    .setFeaturesCol("indexedFeatures") \
    .setPredictionCol("cluster")

# Chain indexer and kmeans in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, kmeans])

# fit pipeline
model = pipeline.fit(transformed)

# transform data
cluster = model.transform(transformed)
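Once the pipeline is fitted, the k-means stage and its assignments can be inspected; a minimal sketch assuming model and cluster from the code above (the silhouette evaluation via ClusteringEvaluator is an added assumption):

# Minimal sketch: inspect the fitted k-means stage and score the clustering,
# assuming `model` and `cluster` from the pipeline above.
from pyspark.ml.evaluation import ClusteringEvaluator

kmeansModel = model.stages[-1]                     # the KMeans stage, fitted last
for center in kmeansModel.clusterCenters():
    print(center)

evaluator = ClusteringEvaluator(featuresCol="indexedFeatures",
                                predictionCol="cluster")
print("silhouette:", evaluator.evaluate(cluster))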
class adresDefteri(kisi): def __init__(self): self.directory = 'sonuclar' self.createFolder() self.sc = SparkContext('local') spark = SparkSession(self.sc) spark = SparkSession \ .builder \ .appName("Python Spark Logistic Regression example") \ .config('spark.executor.heartbeatInterval', '3600s') \ .config("spark.some.config.option", "some-value") \ .getOrCreate() locale = self.sc._jvm.java.util.Locale locale.setDefault(locale.forLanguageTag("en-US")) self.catcols = ['targtype1_txt'] self.num_cols = ['country', 'region','attacktype1','weaptype1'] self.labelCol = 'gname' Root=Tk() Root.geometry("800x600") Root.title("Yaşanan Terör Olaylarını İçeren Büyük Verinin Makine Öğrenmesi Teknikleri İle Analizi") menu = Menu(Root) filemenu = Menu(menu) menu.add_cascade(label="File", menu=filemenu) filemenu.add_command(label="CSV View", command=self.secVeGoster) filemenu.add_separator() filemenu.add_command(label="Çıkış", command=Root.quit) filemenu.add_separator() filemenu.add_command(label="Yeniden Başlat", command=self.restart_program) helpmenu = Menu(menu) menu.add_cascade(label="Yardım", menu=helpmenu) helpmenu.add_command(label="Hakkında...", command=self.Hakkinda) Root.configure(background='yellow',menu=menu) global HakkindaPencere,combo self.nameText = StringVar() self.selected1 = IntVar() self.selected1.set(1) self.selected2 = IntVar() self.selected2.set(3) self.egitimLbl=Label(text=" Eğitim Verisi",width=30,height=3,fg="red",bg="yellow") self.egitimLbl.grid(row=0,column=0) self.egitimTxt=Entry(textvariable = self.nameText, fg="red",bg="yellow") self.egitimTxt.grid(row=0,column=1) self.egitimSec=Button(text=" ... ",command=self.secim,width=10,height=1,fg="red",bg="yellow") self.egitimSec.grid(row=0,column=2) self.dataSayisiLbl=Label(text=" Data Sayısı Girin (Maks:181600)",width=30,height=3,fg="red",bg="yellow") self.dataSayisiLbl.grid(row=0,column=3) self.dataSayisiTxt=Entry(fg="red",bg="yellow") self.dataSayisiTxt.grid(row=0,column=4) self.testLbl=Label(text=" Test Verisi Oranı %",width=30,height=3,fg="red",bg="yellow") self.testLbl.grid(row=1,column=0) self.testTxt=Entry(fg="red",bg="yellow") self.testTxt.grid(row=1,column=1) self.algoritmaLbl=Label(text=" Algoritma Seçiniz:",width=30,height=3,fg="red",bg="yellow") self.algoritmaLbl.grid(row=2,column=0) self.rad1 = Radiobutton(text='Hepsini karşılaştır',variable=self.selected1, value=1,command=self.secilenRadio1) self.rad1.grid(column=1, row=2) self.rad2 = Radiobutton(text='Bir Algoritma Seçiniz:',variable=self.selected1, value=2,command=self.secilenRadio1) self.rad2.grid(column=2, row=2) self.combo = ttk.Combobox (Root, state='readonly') self.combo['values']= ("Logistic Regression", "Naive Bayes", "Random Forest Classifier", "Decision Tree Classifier","Support Vector Machine","KNN" ) #self.combo.current(-1) #set the selected item #self.combo.grid(column=3, row=2) self.ulkeLbl=Label(text=" Ülke Seçiniz:",width=30,height=3,fg="red",bg="yellow") self.ulkeLbl.grid(row=3,column=0) self.rad3 = Radiobutton(text='Tüm Ülkeler İçin', variable=self.selected2, value=3,command=self.secilenRadio2) self.rad3.grid(column=1, row=3) self.rad4 = Radiobutton(text='Ülke Seçin:', variable=self.selected2, value=4,command=self.secilenRadio2) self.rad4.grid(column=2, row=3) self.comboulke = ttk.Combobox (Root, state='readonly') self.comboulke['values']= ("Türkiye", "ABD", "İran", "Pakistan", "Irak","Afganistan","Suriye") #self.comboulke.grid(column=3, row=3) #209 Turkey #217 ABD #94 İran #153 Pakistan #95 Irak #4 Afganistan #200 Suriye #self.comboulke.current(1) 
#set the selected item #image=photo3, ekler , compound=LEFT resmi sola ceker self.YukleBtn=Button(text="Veriyi Yükle", command=self.secilenDosya,width=20,height=3,fg="red",bg="yellow") self.YukleBtn.grid(row=4,column=2) self.DonusumBtn=Button(text=" Dönüşümü Başlat ", command=self.DonusumuBaslat,width=20,height=3,fg="red",bg="yellow") self.DonusumBtn.grid(row=5,column=2) self.ModelBtn=Button(text=" Modeli Eğit ", command=self.modeliEgit,width=20,height=3,fg="red",bg="yellow") self.ModelBtn.grid(row=6,column=2) self.SonucBtn=Button(text=" Sonucu Göster ", command=self.csvView,width=20,height=3,fg="red",bg="yellow") self.ExportCsvBtn=Button(text=" Export CSV ", command=self.exportCSV,width=20,height=3,fg="red",bg="yellow") #self.listele=Button(text="Listele",command=self.listele,width=30,height=3,fg="red",bg="yellow") #self.listele.grid(row=7,column=0) mainloop() def restart_program(self): #os.execv(sys.executable, ['python'] + sys.argv) import _winapi x = _winapi.GetCurrentProcess() _winapi.ExitProcess(x) #self.egitimTxt.delete(0, END) #self.dataSayisiTxt.delete(0, END) #self.comboulke.config(state=DISABLED) #self.combo.config(state=DISABLED) #self.YukleBtn.grid(row=4,column=2) #self.DonusumBtn.grid(row=5,column=2) #self.ModelBtn.grid(row=6,column=2) #self.SonucBtn.grid_remove() #self.ExportCsvBtn.grid_remove() def returnUlkeInt(self): self.comboUlkeDeger =self.comboulke.current() if self.comboUlkeDeger==0: return 209 elif self.comboUlkeDeger==1: return 217 elif self.comboUlkeDeger==2: return 94 elif self.comboUlkeDeger==3: return 153 elif self.comboUlkeDeger==4: return 95 elif self.comboUlkeDeger==5: return 4 elif self.comboUlkeDeger==6: return 200 else: return -1 #209 Turkey #217 ABD #94 İran #153 Pakistan #95 Irak #4 Afganistan #200 Suriye def exportCSV(self): path = 'sonuclar' output_file = os.path.join(path,'Combined Book.csv') export_file_path = filedialog.asksaveasfilename(defaultextension='.csv') self.predictions.toPandas().to_csv(export_file_path, sep=",", float_format='%.2f',index=False, line_terminator='\n',encoding='utf-8') def skorEkle(self): self.algoritma=self.combo.get() self.trainDataCount=self.trainingData.count() self.testDataCount=self.testData.count() self.dogrulukOrani=self.accuracy self.hataOrani=self.testError self.hesaplamaSuresi=self.tt self.egitilmeZamani=self.tt2 self.f1Score=self.f1 self.precisionSkor=self.wp self.recallScore=self.wr self.train_dogrulukOrani=self.train_accuracy self.train_hataOrani=self.train_Error self.train_hesaplamaSuresi=self.te self.train_egitilmeZamani=self.te2 self.train_f1Score=self.train_f1 self.train_precisionSkor=self.train_wp self.train_recallScore=self.train_wr self.tarihbug = str(datetime.now().strftime("%d.%m.%y_%H_%M")) temp1 = open("sonuclar.txt", "a") temp1.write("Algoritma:" +self.algoritma +" " +"Eğitim Data Sayısı: " +str(self.trainDataCount) +" " +"Test Data Sayısı: " +str(self.testDataCount) +" " +"Dogruluk Orani: " +str(self.dogrulukOrani) +" " +"Hata Orani: " +str(self.hataOrani) +" " +"Hesaplama Süresi: " +str(self.hesaplamaSuresi) +" sn " +" " +"Egitilme zamani: " +str(self.egitilmeZamani) +" sn " +" " +"F1 Skoru: " +str(self.f1Score) +" " +"Precision Skor: " +str(self.precisionSkor) +" " +"Recall Score: " +str(self.recallScore)) temp1.write("\n") messagebox.showinfo("Bilgi","%s algoritmasi listeye eklendi"%self.algoritma) path = "sonuclar" self.pathSave = path +'/' +self.algoritma+'_'+self.tarihbug +'.csv' with open(self.pathSave, mode='w') as csv_file: fieldnames = ['Algoritma', 'Data Sayısı', 'Dogruluk Orani', 'Hata 
Orani', 'Hesaplama Süresi', 'Egitilme zamani', 'F1 Skoru','Precision Skor', 'Recall Score'] writer = csv.DictWriter(csv_file, fieldnames=fieldnames) writer.writeheader() writer.writerow({'Algoritma': ''+self.algoritma+' (Egitim) ', 'Data Sayısı': ''+str(self.trainDataCount), 'Dogruluk Orani': ''+str(self.train_dogrulukOrani), 'Hata Orani': ''+str(self.train_hataOrani), 'Hesaplama Süresi': ''+str(self.train_hesaplamaSuresi), 'Egitilme zamani': ''+str(self.train_egitilmeZamani), 'F1 Skoru': ''+str(self.train_f1Score), 'Precision Skor': ''+str(self.train_precisionSkor), 'Recall Score': ''+str(self.train_recallScore)}) writer.writerow({'Algoritma': ''+self.algoritma+'(Test) ', 'Data Sayısı': ''+str(self.testDataCount), 'Dogruluk Orani': ''+str(self.dogrulukOrani), 'Hata Orani': ''+str(self.hataOrani), 'Hesaplama Süresi': ''+str(self.hesaplamaSuresi), 'Egitilme zamani': ''+str(self.egitilmeZamani), 'F1 Skoru': ''+str(self.f1Score), 'Precision Skor': ''+str(self.precisionSkor), 'Recall Score': ''+str(self.recallScore)}) #writer.write("\n") messagebox.showinfo("Bilgi","%s algoritmasi CSV olarak eklendi"%self.algoritma) def secVeGoster(self,event=None): self.filename = filedialog.askopenfilename(initialdir = "/",title = "Select file",filetypes = (("csv files","*.csv"),("all files","*.*"))) print (self.filename) self.pathSave = self.filename self.csvView() def csvView(self): import tkinter import csv root = Tk() root.title("Sonuç Görüntüleme") path = "sonuclar" # open file with open(self.pathSave, mode='r') as file: reader = csv.reader(file) # r and c tell us where to grid the labels r = 0 for col in reader: c = 0 for row in col: # i've added some styling label = Label(root, width = 20, height = 3, \ text = row, relief = tkinter.RIDGE) label.grid(row = r, column = c) c += 1 r += 1 root.mainloop() def listele(self): ListelePencere=Tk() ListelePencere.geometry("600x400") ListelePencere.title("Kişi Listeleme") ListelePencere.configure(background="red") liste=Text(ListelePencere,width="200",height="400",fg="white",bg="red",font="helvetica 12") liste.grid(row=0,column=0) satir_sayisi=0 temp1 = open("sonuclar.txt", "r") readfile = temp1.read() liste.insert(END,readfile) def Hakkinda(self): HakkindaPencere=Tk() HakkindaPencere.geometry("700x50") HakkindaPencere.title("Barış KARABAY Fırat Üniversitesi Yazılım Mühendisliği Tez Projesi V2") HakkindaPencere.configure(background="red") self.baris=Label(HakkindaPencere,text="Bu Program Barış Karabay Tarafından Yapılmıştır \n Hiçbir Şekilde Paylaşılamaz ve Değiştirilemez. 
",fg="black",bg="white") self.baris.grid(row=0,column=0) def secim(self,event=None): self.filename = filedialog.askopenfilename(initialdir = "/",title = "Select file",filetypes = (("csv files","*.csv"),("all files","*.*"))) #ment = self.filename print (self.filename) #self.egitimTxt.set(self.filename) #self.['text']=self.filename self.nameText.set(self.filename) def secilenRadio1(self): print(self.selected1.get()) if self.selected1.get()==1: #showinfo("Uyarı","birinci") self.combo.grid_remove() else: #self.combo.grid() self.combo.grid(column=3, row=2) def secilenRadio2(self): print(self.selected2.get()) if self.selected2.get()==3: #showinfo("Uyarı","birinci") self.comboulke.grid_remove() else: #self.comboulke.grid() self.comboulke.grid(column=3, row=3) def get_dummy(self): from pyspark.ml import Pipeline from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler from pyspark.sql.functions import col df = self.spark_df categoricalCols = self.catcols continuousCols = self.num_cols labelCol = self.labelCol indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) for c in categoricalCols ] # default setting: dropLast=True encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers ] assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders] + continuousCols, outputCol="features") pipeline = Pipeline(stages=indexers + encoders + [assembler]) model=pipeline.fit(df) data = model.transform(df) data = data.withColumn('label',col(labelCol)) data.show(5,False) return data.select('features','label') def secilenDosya(self): print(self.egitimTxt.get()) self.dosya=self.egitimTxt.get() self.dataSayisicntr = self.dataSayisiTxt.get() if self.dosya==" " or self.dosya=='' or self.dataSayisicntr=='': messagebox.showinfo("Uyarı","Boş Olamaz") else: print(self.comboulke.current(), self.comboulke.get()) #self.progress.start() messagebox.showinfo("Uyarı","Yükleme Başlatıldı") #self.progress.config(mode='indeterminate') self.dosya = str(self.dosya) self.dataSayisi = int(self.dataSayisiTxt.get()) print(self.dosya) mySchema = StructType([ StructField("country", IntegerType(), True)\ ,StructField("region", IntegerType(), True)\ ,StructField("attacktype1", IntegerType(), True)\ ,StructField("targtype1_txt", StringType(), True)\ ,StructField("gname", StringType(), True)\ ,StructField("weaptype1", IntegerType(), True)]) #egitim=pd.read_csv("D:/globalterrorismdb2.csv", usecols=[7, 9, 26, 27, 28, 35, 36, 40, 58, 68, 81], encoding='ISO-8859-1',low_memory=False) self.egitim=pd.read_csv(self.dosya, usecols=[7, 9, 28, 35, 58, 81], encoding='ISO-8859-1',low_memory=False,nrows=self.dataSayisi) #209 Turkey #217 ABD #94 İran #153 Pakistan #95 Irak #4 Afganistan #200 Suriye if self.comboulke.get() != '' or self.comboulke.get() != "": self.egitim = self.egitim[(self.egitim.country == self.returnUlkeInt())] messagebox.showinfo("Bilgi","%s ülkesi için eğitim ve test veri seti oluşturulacak"%self.comboulke.get()) print("Girilen Sayi dogru") print("Toplam Sayisi") print (self.egitim.count()) self.sqlContext = SQLContext(self.sc) self.spark_df = self.sqlContext.createDataFrame(self.egitim, schema=mySchema) self.YukleBtn.grid_remove() #self.progress.stop() messagebox.showinfo("Başarılıı","Yükleme Tamamlandı") def DonusumuBaslat(self): sp_df = self.spark_df messagebox.showinfo("Uyarı","Dönüşüm Başladı") self.data_f = self.get_dummy() self.data_f.show(25,False) self.labelIndexer = 
StringIndexer(inputCol='label',outputCol='indexedLabel').fit(self.data_f) self.labelIndexer.transform(self.data_f).show(25,False) self.featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures",maxCategories=4).fit(self.data_f) self.featureIndexer.transform(self.data_f).show(25,False) self.labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=self.labelIndexer.labels) if self.testTxt.get()=='': messagebox.showinfo("Hata","Lütfen Test oranını girin") else: deger = self.testTxt.get() testPoint=float(deger)/100 (self.trainingData, self.testData) = self.data_f.randomSplit([1.0-testPoint, testPoint], seed = 100) messagebox.showinfo("Başarılı","Oran Hesaplandı") self.DonusumBtn.grid_remove() def createFolder(self): try: if not os.path.exists(self.directory): os.makedirs(self.directory) except OSError: print ('Error: Creating directory. ' + self.directory) def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues): if normalize: cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] print("Normalized confusion matrix") else: print('Confusion matrix, without normalization') print(cm) plt.imshow(cm, interpolation='nearest', cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) fmt = '.2f' if normalize else 'd' thresh = cm.max() / 2. for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') def modeliEgit(self): print(self.combo.current(), self.combo.get()) messagebox.showinfo("Bilgi","%s algoritması için model oluşturulacak"%self.combo.get()) if self.combo.current()==0: self.LogicticRegressionClassifier() elif self.combo.current()==1: self.NaiveBayesClassifier() elif self.combo.current()==2: self.RandomForestClassifier() elif self.combo.current()==3: self.DecisionTreeClassifier() elif self.combo.current()==4: self.SVMclassifier() elif self.combo.current()==5: self.KNNclassifier() def printMetrics(predictions_and_labels): metrics = MulticlassMetrics(predictions_and_labels) print('Precision of True ', metrics.precision(1)) print('Precision of False', metrics.precision(0)) print('Recall of True ', metrics.recall(1)) print('Recall of False ', metrics.recall(0)) print('F-1 Score ', metrics.fMeasure()) print('Confusion Matrix\n', metrics.confusionMatrix().toArray()) def getPredictionsLabels(model, test_data): predictions = model.predict(test_data.map(lambda r: r.features)) return predictions.zip(test_data.map(lambda r: r.label)) def LogicticRegressionClassifier(self): self.t0 = time() print("********************************************************************************************************************************************") print("Logistic Regression") logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel',maxIter=20, regParam=0.3, elasticNetParam=0) pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, logr, self.labelConverter]) model = pipeline.fit(self.trainingData) self.tm = time() - self.t0 print ("Modeli egitme zamani {} saniye ".format(self.tm)) self.t0 = time() self.predictions = model.transform(self.testData) self.tt = time() - self.t0 print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt)) self.t0 = time() predictions_train = 
model.transform(self.trainingData) self.te = time() - self.t0 print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te)) self.predictions.select("features", "label", "predictedLabel", "probability").show(5) evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy") self.t0 = time() self.accuracy = evaluator.evaluate(self.predictions) self.tt2 = time() -self.t0 print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy)) self.t0 = time() self.train_accuracy = evaluator.evaluate(predictions_train) self.te2 = time() -self.t0 print ("Tahmini yapilis zamani {} saniye . Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy)) print("Test Dogruluk = %g" % (self.accuracy)) self.testError = (1.0 - self.accuracy) print("Test Test Error = %g" % (1.0 - self.accuracy)) print("Egitim Dogruluk = %g" % (self.train_accuracy)) self.train_Error = (1.0 - self.train_accuracy) print("Egitim Error = %g" % (1.0 - self.train_accuracy)) rfModel = model.stages[2] evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1") self.f1 = evaluatorf1.evaluate(self.predictions) self.train_f1 = evaluatorf1.evaluate(predictions_train) print("test f1 = %g" % self.f1) print("egitim f1 = %g" % self.train_f1) evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision") self.wp = evaluatorwp.evaluate(self.predictions) self.train_wp = evaluatorwp.evaluate(predictions_train) print("test weightedPrecision = %g" % self.wp) print("egitim weightedPrecision = %g" % self.train_wp) evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall") self.wr = evaluatorwr.evaluate(self.predictions) self.train_wr = evaluatorwr.evaluate(predictions_train) print("test weightedRecall = %g" % self.wr) print("egitim weightedRecall = %g" % self.train_wr) rfModel = model.stages[2] #print (rfModel._call_java('toDebugString')) rfModel = model.stages[2] #model.save("model2345678909") messagebox.showinfo("Başarılı","Model Eğitildi") self.skorEkle() self.ModelBtn.grid_remove() self.SonucBtn.grid(row=7,column=2) self.ExportCsvBtn.grid(row=8,column=2) #self.predictions.printSchema() #paramGrid = (ParamGridBuilder() # .addGrid(logr.regParam, [0.01, 0.1, 0.5]) \ # .addGrid(logr.maxIter, [10, 20, 50]) \ # .addGrid(logr.elasticNetParam, [0.0, 0.8]) \ # .build()) #crossval = CrossValidator(estimator=pipeline, # estimatorParamMaps=paramGrid, # evaluator=evaluator, # numFolds=3) # #model = crossval.fit(self.trainingData) #predictions = model.transform(self.testData) #accuracy = evaluator.evaluate(predictions) #print("Dogruluk = %g" % (accuracy)) def DecisionTreeClassifier(self): self.t0 = time() print("********************************************************************************************************************************************") print("Decision Tree Classifier") dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",impurity="gini",maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, cacheNodeIds=False, checkpointInterval=10) pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, dt, self.labelConverter]) model = pipeline.fit(self.trainingData) self.tm = time() - self.t0 print ("Modeli egitme zamani {} saniye ".format(self.tm)) self.t0 = time() self.predictions = 
model.transform(self.testData) self.tt = time() - self.t0 print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt)) self.t0 = time() predictions_train = model.transform(self.trainingData) self.te = time() - self.t0 print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te)) self.predictions.select("features", "label", "predictedLabel", "probability").show(5) evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy") self.t0 = time() self.accuracy = evaluator.evaluate(self.predictions) self.tt2 = time() -self.t0 print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy)) self.t0 = time() self.train_accuracy = evaluator.evaluate(predictions_train) self.te2 = time() -self.t0 print ("Tahmini yapilis zamani {} saniye . Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy)) print("Test Dogruluk = %g" % (self.accuracy)) self.testError = (1.0 - self.accuracy) print("Test Test Error = %g" % (1.0 - self.accuracy)) print("Egitim Dogruluk = %g" % (self.train_accuracy)) self.train_Error = (1.0 - self.train_accuracy) print("Egitim Error = %g" % (1.0 - self.train_accuracy)) rfModel = model.stages[2] evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1") self.f1 = evaluatorf1.evaluate(self.predictions) self.train_f1 = evaluatorf1.evaluate(predictions_train) print("test f1 = %g" % self.f1) print("egitim f1 = %g" % self.train_f1) evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision") self.wp = evaluatorwp.evaluate(self.predictions) self.train_wp = evaluatorwp.evaluate(predictions_train) print("test weightedPrecision = %g" % self.wp) print("egitim weightedPrecision = %g" % self.train_wp) evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall") self.wr = evaluatorwr.evaluate(self.predictions) self.train_wr = evaluatorwr.evaluate(predictions_train) print("test weightedRecall = %g" % self.wr) print("egitim weightedRecall = %g" % self.train_wr) rfModel = model.stages[2] #print (rfModel._call_java('toDebugString')) messagebox.showinfo("Başarılı","Model Eğitildi") self.skorEkle() self.ModelBtn.grid_remove() self.SonucBtn.grid(row=7,column=2) self.ExportCsvBtn.grid(row=8,column=2) def NaiveBayesClassifier(self): print("********************************************************************************************************************************************") self.t0 = time() print("Bayes") nb = NaiveBayes(featuresCol='indexedFeatures', labelCol='indexedLabel', smoothing=1.0, modelType="multinomial") pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, nb, self.labelConverter]) model = pipeline.fit(self.trainingData) self.tm = time() - self.t0 print ("Modeli egitme zamani {} saniye ".format(self.tm)) self.t0 = time() self.predictions = model.transform(self.testData) self.tt = time() - self.t0 print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt)) self.t0 = time() predictions_train = model.transform(self.trainingData) self.te = time() - self.t0 print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te)) self.predictions.select("features", "label", "predictedLabel", "probability").show(5) evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", 
    def RandomForestClassifier(self):
        print("*" * 152)
        print("Random Forest")
        self.t0 = time()
        rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                                    numTrees=100, maxDepth=4, maxBins=32, impurity="entropy")
        pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, rf, self.labelConverter])
        model = pipeline.fit(self.trainingData)
        self.tm = time() - self.t0
        print("Model training time: {} seconds".format(self.tm))
        self.t0 = time()
        self.predictions = model.transform(self.testData)
        self.tt = time() - self.t0
        print("Time to classify the test data: {} seconds".format(self.tt))
        self.t0 = time()
        predictions_train = model.transform(self.trainingData)
        self.te = time() - self.t0
        print("Time to classify the training data: {} seconds".format(self.te))
        self.predictions.select("features", "label", "predictedLabel", "probability").show(5)
        self._report_multiclass_metrics(model, predictions_train)

        # Linear SVM via one-vs-rest with a small cross-validated grid search.
        # Note that this uses the raw "features"/"label" columns rather than the
        # indexed ones used by the pipeline above.
        svm = LinearSVC(maxIter=5, regParam=0.01)  # kept from the original; not used below
        LSVC = LinearSVC()
        ovr = OneVsRest(classifier=LSVC)
        paramGrid = (ParamGridBuilder()
                     .addGrid(LSVC.maxIter, [10, 100])
                     .addGrid(LSVC.regParam, [0.001, 0.01, 1.0, 10.0])
                     .build())
        crossval = CrossValidator(estimator=ovr,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=MulticlassClassificationEvaluator(metricName="f1"),
                                  numFolds=2)
        Train_sparkframe = self.trainingData.select("features", "label")
        cvModel = crossval.fit(Train_sparkframe)
        bestModel = cvModel.bestModel
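        # --- Not part of the original code: a hedged sketch of how the cross-validation
        # results above could be inspected. It assumes cvModel from the lines just above
        # is still in scope; the printed labels are illustrative.
        # avgMetrics[i] is the mean f1 over the folds for estimatorParamMaps[i].
        for params, score in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics):
            print({p.name: v for p, v in params.items()}, "-> mean f1 = %g" % score)
        best_score, best_params = max(zip(cvModel.avgMetrics, cvModel.getEstimatorParamMaps()),
                                      key=lambda pair: pair[0])
        print("best grid point:", {p.name: v for p, v in best_params.items()},
              "mean f1 = %g" % best_score)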
    def SVMclassifier(self):
        print("*" * 152)
        # scikit-learn / pandas based SVM on the self.egitim DataFrame.
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.svm import SVC
        from sklearn.metrics import accuracy_score
        self.t0 = time()
        print("SVM")
        df = self.egitim
        # Encode the categorical text columns as integer ids.
        df['gname_id'] = df['gname'].factorize()[0]
        df['weaptype1_id'] = df['weaptype1'].factorize()[0]
        df['targtype1_txt_id'] = df['targtype1_txt'].factorize()[0]
        df['targsubtype1_id'] = df['targsubtype1'].factorize()[0]
        X = df.iloc[:, [0, 1, 2, 8, 9, 10]].values
        y = df.iloc[:, 7].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
        # Scale to [-1, 1]; the scaler is fitted on the training split only.
        scaling = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)
        X_train = scaling.transform(X_train)
        X_test = scaling.transform(X_test)
        classifier = SVC(kernel='linear', cache_size=7000, random_state=0)
        classifier.fit(X_train, y_train)
        self.tt = time() - self.t0
        print("Model training time: {} seconds".format(self.tt))
        self.t0 = time()
        y_pred = classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        self.tt2 = time() - self.t0
        print(accuracy)
        print("Prediction time: {} seconds. Test accuracy: {}".format(self.tt2, accuracy))
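The SVC above fixes kernel='linear' and the default C; a small grid search is the usual way
to pick these. A minimal sketch, assuming scaled X_train/y_train/X_test/y_test arrays like
the ones built in SVMclassifier (the grid values are illustrative, not from the original code):

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {
    "C": [0.1, 1.0, 10.0],
    "kernel": ["linear", "rbf"],
}
search = GridSearchCV(SVC(cache_size=7000, random_state=0),
                      param_grid, cv=3, scoring="accuracy", n_jobs=-1)
search.fit(X_train, y_train)

print("best params:", search.best_params_)
print("best CV accuracy: %g" % search.best_score_)
print("test accuracy: %g" % search.score(X_test, y_test))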
    def KNNclassifier(self):
        print("*" * 152)
        from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                                     recall_score, classification_report, confusion_matrix)
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.svm import SVC
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.linear_model import LogisticRegression
        from sklearn.naive_bayes import GaussianNB
        print("KNN")
        df = self.egitim
        # Encode the categorical text columns as integer ids.
        df['gname_id'] = df['gname'].factorize()[0]
        df['weaptype1_id'] = df['weaptype1'].factorize()[0]
        df['targtype1_txt_id'] = df['targtype1_txt'].factorize()[0]
        print("Total count")
        # print(df.count())
        X = df.iloc[:, [0, 1, 2, 7, 8]].values
        y = df.iloc[:, 6].values
        # print(df.iloc[:, 6]); print(df.columns); print(X); print(y); print(df['gname_id'])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
        # Scale to [-1, 1]; the scaler is fitted on the training split only.
        scaling = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)
        X_train = scaling.transform(X_train)
        X_test = scaling.transform(X_test)
        classifier = KNeighborsClassifier(n_neighbors=9, metric='minkowski', p=2)
        self.t0 = time()
        classifier.fit(X_train, y_train)
        self.tt = time() - self.t0
        print("Model training time: {} seconds".format(self.tt))
        self.t0 = time()
        y_pred = classifier.predict(X_test)
        self.tt = time() - self.t0
        print("Time to classify the test data: {} seconds".format(self.tt))
        self.t0 = time()
        x_pred = classifier.predict(X_train)
        self.tt = time() - self.t0
        print("Time to classify the training data: {} seconds".format(self.tt))
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_egitim = accuracy_score(y_train, x_pred)
        self.tt2 = time() - self.t0
        print('Test Accuracy:', accuracy)
        print('Training Accuracy:', accuracy_egitim)
        # print("Prediction time: {} seconds. Test accuracy: {}".format(self.tt2, accuracy))
        print('Test F1 score:', f1_score(y_test, y_pred, average='weighted'))
        print('Test Recall:', recall_score(y_test, y_pred, average='weighted'))
        print('Test Precision:', precision_score(y_test, y_pred, average='weighted'))
        print('Training F1 score:', f1_score(y_train, x_pred, average='weighted'))
        print('Training Recall:', recall_score(y_train, x_pred, average='weighted'))
        print('Training Precision:', precision_score(y_train, x_pred, average='weighted'))
        # print('\n classification report:\n', classification_report(y_test, y_pred))
        # print('\n confusion matrix:\n', confusion_matrix(y_test, y_pred))
        print("*" * 152)
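The classification report and confusion matrix are commented out above, and n_neighbors=9
is fixed by hand. A minimal sketch of both ideas, assuming the scaled arrays and y_pred
built in KNNclassifier (variable names and the candidate k values are illustrative):

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Per-class precision/recall/f1 plus the raw confusion matrix for the fitted model.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Pick k by 5-fold cross-validation on the training split instead of hard-coding it.
for k in (3, 5, 7, 9, 11):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2),
                             X_train, y_train, cv=5, scoring='accuracy')
    print("k=%d mean CV accuracy = %g" % (k, scores.mean()))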