Example #1
    def train_test(self, df):
        
        df = self.dropNonTCPUDP(df)

        catCols = []
        numCols = ['avg_ipt', 'bytes_in', 'bytes_out', 'entropy', 'total_entropy', 'num_pkts_out', 'num_pkts_in', 'duration']
        labelCol = 'label'

        data = self.get_dummy(df, catCols, numCols, labelCol)
        data.show()

        labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)

        labelIndexer.transform(data)

        featureIndexer = VectorIndexer(inputCol="features", \
                                        outputCol="indexedFeatures").fit(data)
        featureIndexer.transform(data)

        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        trainingData.cache()
     #   trainingData.repartition(200)
        testData.cache()
       # testData.repartition(200)
        trainingData.show(5,False)
        testData.show(5,False)

        rf = RandomForestClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
        gbt = GBTClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
        logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel')

        # Convert indexed labels back to original labels.
        labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
        
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])
        model = pipeline.fit(trainingData)
        predictions = model.transform(testData)
        # Select example rows to display.
        predictions.select("features", "label", "predictedLabel", "prediction").show(5)

        # Select (prediction, true label) and compute test error
 
        print(self.getTestError(predictions))
        self.printMetrics(predictions)
      #  print(self.ExtractFeatureImp(model.stages[-2].featureImportances, testData, "features"))

        return model
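    # The helper methods called above (getTestError, printMetrics) are not part of this
    # snippet. A minimal sketch of what they might look like, assuming the pipeline's
    # indexedLabel/prediction columns and that MulticlassClassificationEvaluator is
    # imported from pyspark.ml.evaluation (hypothetical helpers, not the original code):
    def getTestError(self, predictions):
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
        return 1.0 - evaluator.evaluate(predictions)

    def printMetrics(self, predictions):
        for metric in ["f1", "weightedPrecision", "weightedRecall"]:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="indexedLabel", predictionCol="prediction", metricName=metric)
            print(metric, "=", evaluator.evaluate(predictions))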
Example #2
def preprocessed_df(df, label="flg_cmd_lowcostIndex"):
    max_values_to_define_str_cols = 10
    id_col = 'ID_CLIENT'

    dty = dict(df.dtypes)
    str_cols = [k for k, v in dty.items() if v == 'string']
    str_cols.remove(id_col)

    for c in str_cols:
        stringIndexer = StringIndexer(inputCol=c, outputCol=c + "Index")
        model_str = stringIndexer.fit(df)
        df = model_str.transform(df).drop(c)

    input_cols = df.columns
    input_cols.remove(id_col)
    input_cols.remove(label)

    assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
    df = assembler.transform(df)

    featureIndexer = VectorIndexer(
        inputCol="features",
        outputCol="indexedFeatures",
        maxCategories=max_values_to_define_str_cols).fit(df)
    return featureIndexer.transform(df), df
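
# A minimal usage sketch for preprocessed_df (not part of the original snippet). The CSV
# path and SparkSession setup are assumptions for illustration; the data is expected to
# contain an ID_CLIENT column and a string column flg_cmd_lowcost, which the indexing
# loop above turns into the default label column flg_cmd_lowcostIndex.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("preprocessed_df_demo").getOrCreate()
raw = spark.read.csv("clients.csv", header=True, inferSchema=True)  # hypothetical path
indexed_df, assembled_df = preprocessed_df(raw)
indexed_df.select("features", "indexedFeatures").show(5, truncate=False)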
    def chiSquareTest(self, categoricalFeatures, maxCategories):
        dataset = self.dataset
        labelColm = self.labelColm
        features = self.features
        length = len(features)

        featureassembler = VectorAssembler(
            inputCols=self.features,
            outputCol="featuresChiSquare", handleInvalid="skip")
        dataset = featureassembler.transform(dataset)

        vec_indexer = VectorIndexer(inputCol="featuresChiSquare", outputCol='vecIndexedFeaturesChiSquare', maxCategories=maxCategories,
                                    handleInvalid="skip").fit(dataset)

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))

        dataset = vec_indexer.transform(dataset)

        # finalized_data = dataset.select(labelColm, 'vecIndexedFeaturesChiSqaure')
        # finalized_data.show()

        # using chi selector
        selector = ChiSqSelector(numTopFeatures=length, featuresCol="vecIndexedFeaturesChiSquare",
                                 outputCol="selectedFeatures",
                                 labelCol=labelColm)

        result = selector.fit(dataset).transform(dataset)

        print("chi2 output with top %d features selected " % selector.getNumTopFeatures())
        result.show()

        # running the chi-square test

        r = ChiSquareTest.test(result, "selectedFeatures", labelColm).head()
        PValues = [round(val, 4) for val in r.pValues]
        print(PValues)
        dof = list(r.degreesOfFreedom)
        statistics = [round(val, 4) for val in r.statistics]
        print(statistics)
        chiSquareDict = {}
        for pval, doF, stat, colm in zip(PValues, dof, statistics, categoricalFeatures):
            print(pval, doF, stat)
            chiSquareDict[colm] = pval, doF, stat
        chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
        print(chiSquareDict)

        result = {'pvalues': chiSquareDict}

        return result
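    # The snippet above selects the top `length` features but never reports which vector
    # slots were kept. A minimal sketch (a hypothetical helper, not in the original class)
    # that maps the fitted ChiSqSelectorModel's selectedFeatures indices back to names:
    def selectedFeatureNames(self, fittedSelector, featureNames):
        # ChiSqSelectorModel.selectedFeatures holds the indices of the retained slots.
        return [featureNames[i] for i in fittedSelector.selectedFeatures]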
    def linearReg(self, dataset_add, feature_colm, label_colm, relation_list, relation,userId):
        try:
            dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
            dataset.show()
            label = ''
            for val in label_colm:
                label = val
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType' or str(
                        x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType' or str(x.dataType) == 'BinaryType'):
                    for y in feature_colm:
                        if x.name == y:
                            dataset = dataset.withColumn(y, dataset[y].cast(StringType()))
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)
            if relation == 'linear':
                print('linear relationship')
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)
            dataset.show()
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label, handleInvalid="skip").fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            featureAssembler = VectorAssembler(inputCols=indexed_features + numericalFeatures, outputCol='features', handleInvalid="skip")
            dataset = featureAssembler.transform(dataset)
            vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures', maxCategories=4, handleInvalid="skip").fit(
                dataset)
            dataset = vectorIndexer.transform(dataset)

            trainDataRatioTransformed = self.trainDataRatio
            testDataRatio = 1 - trainDataRatioTransformed
            trainingData, testData = dataset.randomSplit([trainDataRatioTransformed, testDataRatio], seed=40)

            # applying the model

            lr = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label)
            regressor = lr.fit(trainingData)

            locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'
            modelPersist = 'linearRegressorModel.parquet'
            modelStorageLocation = locationAddress + userId + modelPersist
            regressor.write().overwrite().save(modelStorageLocation)

            # print regressor.featureImportances

            # print(dataset.orderBy(feature_colm, ascending=True))

            # pred = regressor.transform(testData)

            # coefficient & intercept


            # saving the model and test dataset as csv file


            print("coefficient : " + str(regressor.coefficients))
            coefficient_t = str(regressor.coefficients)

            print("intercept : " + str(regressor.intercept))
            intercept_t = str(regressor.intercept)

            prediction = regressor.evaluate(testData)

            # VI_IMP = 2

            prediction_val = prediction.predictions
            prediction_val.show()

            prediction_val_pand = prediction_val.select(label, "prediction").toPandas()

            prediction_val_pand = prediction_val_pand.assign(
                residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"])

            prediction_val_pand_residual = prediction_val_pand["residual_vall"]

            prediction_val_pand_label = prediction_val_pand[label]

            # print prediction_val_pand_residual
            prediction_val_pand_predict = prediction_val_pand["prediction"]
            # print prediction_val_pand_predict

            # test_summary = prediction.summary

            # for test data

            lr_prediction = regressor.transform(testData)

            lr_prediction.groupBy(label, "prediction").count().show()

            lr_prediction_quantile = lr_prediction.select(label, "prediction")
            lr_prediction_onlypred = lr_prediction.select('prediction')
            # lr_prediction_quantile.show()

            training_summary = regressor.summary

            print("numof_Iterations...%d\n" % training_summary.totalIterations)
            print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory))
            print("RMSE...%f\n" % training_summary.rootMeanSquaredError)
            RMSE = training_summary.rootMeanSquaredError
            print("MSE....%f\n" % training_summary.meanSquaredError)
            MSE = training_summary.meanSquaredError
            print("r**2(r-square)....::%f\n" % training_summary.r2)
            r_square = training_summary.r2
            print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj)
            adjusted_r_square = training_summary.r2adj
            print("deviance residuals %s" % str(training_summary.devianceResiduals))
            training_summary.residuals.show()
            # residual_graph = training_summary.residuals
            # test = (residual_graph, lr_prediction_onlypred)
            # residual_graph.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append' )
            # print(test)
            # test.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode= 'append')
            # residual_graph_pandas = residual_graph.toPandas()
            print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors))
            coefficientStdError = str(training_summary.coefficientStandardErrors)
            print(" Tvalues :\n" + str(training_summary.tValues))
            T_values = str(training_summary.tValues)
            tValuesList = training_summary.tValues
            print(" p values :\n" + str(training_summary.pValues))
            P_values = str(training_summary.pValues)

            # regression equation
            intercept_t = float(intercept_t)
            coefficientList = list(regressor.coefficients)
            equation = label, '=', intercept_t, '+'
            for feature, coeff in zip(feature_colm, coefficientList):
                coeffFeature = coeff, '*', feature, '+'
                equation += coeffFeature
            equation = equation[:-1]
            print(equation)
            st = list(equation)

            # significance value

            PValuesList = training_summary.pValues
            significanceObject = {}

            for pValue in PValuesList:
                if (0 <= pValue < 0.001):
                    significanceObject[pValue] = '***'
                if (0.001 <= pValue < 0.01):
                    significanceObject[pValue] = '**'
                if (0.01 <= pValue < 0.05):
                    significanceObject[pValue] = '*'
                if (0.05 <= pValue < 0.1):
                    significanceObject[pValue] = '.'
                if (0.1 <= pValue < 1):
                    significanceObject[pValue] = '-'
            print(significanceObject)



            #######################################################################################################
            # residual  vs predicted value

            prediction_data = regressor.summary.predictions
            prediction_data.show()
            prediction_data.select(['prediction']).show()
            predicted = prediction_data.select(['prediction'])
            regressor.summary.residuals.show()
            residuals = regressor.summary.residuals
            pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id())
            res_d = residuals.withColumn('row_index', f.monotonically_increasing_id())

            pred_residuals = pred_d.join(res_d, on=['row_index']).sort('row_index').drop('row_index')
            pred_residuals.show()

            # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',
            #                              mode='overwrite')

            '''
                        
            userId = 'sahil123'
            graphName = 'QQPlot.parquet'
            locationAddress = '/home/fidel/mltest/'
            
            finalLocation = locationAddress + userId + graphName
            print(finalLocation)
            pred_residuals.write.parquet(finalLocation,mode='overwrite')
    
            '''


            #################################################################################3
            # scale location plot
            from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev

            df_label = prediction_data.select(label, 'prediction',
                                              sqrt(ab(prediction_data[label])).alias("sqrt_label"))

            df_label.show()
            df_sqrt_label_index = df_label.withColumn('row_index', f.monotonically_increasing_id())
            df_sqrt_label_index.show()

            res_d.show()
            sqrt_label_residual_join = df_sqrt_label_index.join(res_d, on=['row_index']).sort('row_index').drop(
                'row_index')

            sqrt_label_residual_join.show()

            std_resid = sqrt_label_residual_join.select('sqrt_label', 'prediction', (
                    sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias(
                'std_res'))

            std_resid.show()

            sqrt_std_res = std_resid.select("std_res", 'prediction',
                                            sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid"))

            sqrt_std_res.show()
            sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid')

            # sqrt_std_res_fitted.write.parquet(
            #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet',
            #     mode='overwrite')

            ######################################################################################
            # QUANTILE

            from scipy.stats import norm
            import statistics
            import math


            res_d.show()
            sorted_res = res_d.sort('residuals')
            sorted_res.show()
            # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'),
            #                                meann(col('residuals')).alias('mean'))
            # stdev_ress.show()
            # mean_residual = stdev_ress.select(['mean']).toPandas()
            # l = mean_residual.values.tolist()
            # print(l)
            # stddev_residual = stdev_ress.select(['std_dev']).toPandas()
            # number of sorted residuals
            lengthResiduals = sorted_res.count()
            print(lengthResiduals)
            quantileList = []
            for x in range(0, lengthResiduals):
                quantileList.append((x + 0.5) / lengthResiduals)

            print(quantileList)

            # Z-scores of the theoretical quantiles

            zTheoreticalTrain = []
            for x in quantileList:
                zTheoreticalTrain.append(norm.ppf(x))
            print(zTheoreticalTrain)

            sortedResidualPDF = sorted_res.select('residuals').toPandas()
            sortedResidualPDF = sortedResidualPDF['residuals']
            stdevResidualTrain = statistics.stdev(sortedResidualPDF)
            meanResidualTrain = statistics.mean(sortedResidualPDF)

            zPracticalTrain = []
            for x in sortedResidualPDF:
                zPracticalTrain.append((x - meanResidualTrain) / stdevResidualTrain)

            # schema = StructType([StructField('zTheoriticalTrain', FloatType(), True),
            #                      StructField('zPracticalTrain', FloatType(), True)
            #                      ])
            # spark.createDataFrame(zPracticalTrain, FloatType()).show()

            ####################################################################################
            # appending predicted value to the dataset
            target = dataset.select(label)
            pred = prediction_data.select(['prediction'])
            pred_d = pred.withColumn('row_index', f.monotonically_increasing_id())
            target_d = target.withColumn('row_index', f.monotonically_increasing_id())

            pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index')
            pred_target.show()

            dataset.show()

            pred_target_data_update = dataset.join(pred_target, on=[label])

            pred_target_data_update.show(100)
            '''
            prediction = regressor.evaluate(dataset)
            predictionTestData= prediction.predictions
            predictionTestData.show()
            #appending the predicted column into the dataset which is test dataset
            predictionLabelList = [label,'prediction']
            updatedFeatureColmList = feature_colm
            for val in predictionLabelList:
                updatedFeatureColmList.append(val)
            print(updatedFeatureColmList)
            predictionTestDatasetcolumn = predictionTestData.select(updatedFeatureColmList)
            predictionTestDatasetcolumn.show()

            '''

            ##########################################################################################

            # scale location plot

            # for scale location plot
            # from pyspark.sql.functions import udf
            #
            # def std_res(x):
            #     res_list = []
            #     res_list.append(x)
            #
            # std_residuals = udf(lambda y: std_res(y), FloatType())
            #
            # residuals_std = residuals.withColumn('residuals', std_residuals(col('residuals').cast(FloatType())))
            #
            # import statistics
            # import numpy as np
            # residuals_panda = residuals.toPandas()
            # # residuals_panda.residuals = range(residuals_panda.shape[1])
            # residuals_panda = residuals_panda.values
            # print(residuals_panda)
            # stdev_training = statistics.stdev(residuals_panda)
            # print(stdev_training)

            ############################################################################################################

            # creating the dictionary for storing the result

            # json_response = coefficient_t

            # print(json_response)

            # json_response = {"adjusted r**2 value" : training_summary.r2adj}

            # DATA VISUALIZATION PART

            # finding the quantile in the dataset(Q_Q plot)
            import matplotlib.pyplot as plt

            # y = 0.1
            # x = []
            #
            # for i in range(0, 90):
            #     x.append(y)
            #     y = round(y + 0.01, 2)
            #
            # for z in x:
            #     print ("~~~~~   ",z)
            #

            # quantile_label = lr_prediction_quantile.approxQuantile(label, x, 0.01)
            # print quantile_label
            # quantile_prediction = lr_prediction_quantile.approxQuantile("prediction", x, 0.01)
            # print quantile_prediction
            #
            # Q_label_pred=''
            # print(len(quantile_label))
            # length = len(quantile_label)
            #
            # for i in range(0,len(quantile_label)):
            #     Q_label_pred += str(quantile_label[i]) + '|'  +  str(quantile_prediction[i]) + '\n'

            # writing it to the hdfs in parquet file
            #
            # quantile_label_tospark = spark.createDataFrame(quantile_label, FloatType())
            # quantile_label_tospark = quantile_label_tospark.withColumnRenamed("value", "Q_label")
            #
            # quantile_prediction_tospark = spark.createDataFrame(quantile_prediction, FloatType())
            # quantile_prediction_tospark = quantile_prediction_tospark.withColumnRenamed("value", "Q_prediction")
            #
            # quant_label = quantile_label_tospark.withColumn('row_index', f.monotonically_increasing_id())
            # quant_predtiction = quantile_prediction_tospark.withColumn('row_index', f.monotonically_increasing_id())
            #
            # final_quantile = quant_label.join(quant_predtiction,on=['row_index']).sort('row_index').drop('row_index')
            #
            # final_quantile.show()
            #
            # final_quantile.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',mode='overwrite')
            #
            #

            # print(str(Q_label_pred[i]))

            # with open('Q_Q_plot.csv', 'w') as Q_Q:
            #     writer_Q_Q = csv.writer(Q_Q)
            #     writer_Q_Q.writerows((quantile_label, quantile_prediction))
            #
            # plt.scatter(quantile_label, quantile_prediction)
            # plt.show()

            ## finding the residual vs fitted graph data

            #
            #
            # prediction_val_pand_predict_tospark = spark.createDataFrame(prediction_val_pand_predict, FloatType())
            # prediction_val_pand_predict_tospark = prediction_val_pand_predict_tospark.withColumnRenamed("value", "prediction")
            #
            # prediction_val_pand_residual_tospark = spark.createDataFrame(prediction_val_pand_residual, FloatType())
            # prediction_val_pand_residual_tospark = prediction_val_pand_residual_tospark.withColumnRenamed("value", "residual")
            #
            # pred_spark = prediction_val_pand_predict_tospark.withColumn('row_index', f.monotonically_increasing_id())
            # res_spark = prediction_val_pand_residual_tospark.withColumn('row_index', f.monotonically_increasing_id())
            #
            # final_res_fitted = pred_spark.join(res_spark, on=['row_index'])\
            #     .sort('row_index').drop('row_index')
            #
            # final_res_fitted.show()
            #
            # final_res_fitted.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/RESIDUAL_FITTED_PLOT.parquet',
            #                              mode='overwrite')
            #

            # plt.scatter(prediction_val_pand_predict, prediction_val_pand_residual)
            # plt.axhline(y=0.0, color="red")
            # plt.xlabel("prediction")
            # plt.ylabel("residual")
            # plt.title("residual vs fitted ")
            # plt.show()

            # creating the csv file and writing into it
            import csv
            fitted_residual = ''
            print(len(prediction_val_pand_residual))
            length = len(prediction_val_pand_residual)

            for i in range(0, len(prediction_val_pand_residual)):
                fitted_residual += str(prediction_val_pand_predict[i]) + '|' + str(
                    prediction_val_pand_residual[i]) + '\n'

            with open('residual_vs_fitted.csv', 'w') as r_f:
                writer_r_f = csv.writer(r_f)
                writer_r_f.writerows((prediction_val_pand_predict, prediction_val_pand_residual))

            # parquet file writing

            ## residual vs leverage graph data

            prediction_val_pand_residual
            # extreme value in the predictor colm
            prediction_col_extremeval = lr_prediction_quantile.agg({"prediction": "max"})
            # prediction_col_extremeval.show()

            # plt.plot(prediction_col_extremeval, prediction_val_pand_residual)
            # plt.show()

            ## scale location graph data

            prediction_val_pand_residual
            prediction_val_pand_predict
            prediction_val_pand_residual_abs = prediction_val_pand_residual.abs()
            import math
            sqrt_residual = []
            for x in prediction_val_pand_residual_abs:
                sqrt_residual.append(math.sqrt(x))
                # print ("____________________  ",x)

            sqrt_residual

            # plt.scatter(sqrt_residual, prediction_val_pand_predict)
            ####################################################################################3

            # calculating std deviation
            import statistics

            print(statistics.stdev(prediction_val_pand_residual))
            stdev_pred = statistics.stdev(prediction_val_pand_residual)
            # mean = statistics.mean(prediction_val_pand_residual)

            # calculate standardized residuals
            std_res = []
            for x in prediction_val_pand_residual:
                std_res.append(x / stdev_pred)
            print(std_res)

            # calculating the square root of std_res
            import math
            sqr_std_res = []
            for x in std_res:
                sqr_std_res.append(math.sqrt(abs(x)))
            print(sqr_std_res)
            #######################################################################################3
            # QUANTILE

            ## sort the list
            sorted_std_res = sorted(std_res)
            print(sorted_std_res)
            #
            mean = statistics.mean(sorted_std_res)
            stdev = statistics.stdev(sorted_std_res)
            print(mean)
            quantile = []
            n = len(sorted_std_res)
            print(n)
            for x in range(0, n):
                quantile.append((x + 0.5) / n)

            print(quantile)
            #
            # theoretical z-scores
            from scipy.stats import norm

            z_theory = []
            for x in quantile:
                z_theory.append(norm.ppf(x))
            print(z_theory)
            # z score for real val
            z_pract = []
            for x in sorted_std_res:
                z_pract.append((x - mean) / stdev)

            #

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)

            quantile_std_res = spark.createDataFrame(std_res, FloatType())
            quantile_std_res.show()
            quantile_std_res_t = quantile_std_res.approxQuantile('value', x, 0.01)
            print(quantile_std_res_t)
            print(x)

            Q_label_pred = ''
            # print(len(quantile_label))
            # length = len(quantile_label)
            for quant, val in zip(z_theory, z_pract):
                Q_label_pred += str(val) + '\t' + str(quant) + '\n'

            plt.scatter(z_theory, z_pract)
            plt.savefig('q_q')

            ####################################################

            # creating the std residuals

            # square root of label
            sqrt_label = []
            for x in prediction_val_pand_label:
                sqrt_label.append(math.sqrt(abs(x)))

            sqrt_label
            prediction_val_pand_residual
            std_residual = []
            for sqr, resid in zip(sqrt_label, prediction_val_pand_residual):
                std_residual.append(resid / sqr)
                # print(std_sqrt_residual)

            # creating the std sqr root

            sqrt_std_residuals = []
            for x in std_residual:
                # print(math.sqrt(abs(x)))
                sqrt_std_residuals.append(math.sqrt(abs(x)))
            print(sqrt_std_residuals)

            # print(std_sqrt_residual)

            scale_predict_residual = ''
            for pre, res in zip(prediction_val_pand_predict, sqrt_std_residuals):
                scale_predict_residual += str(pre) + '\t' + str(res) + '\n'
            print(scale_predict_residual)

            ##########################################################################
            # import math
            # sqrt_stdres = []
            # for x in std_sqrt_residual:
            #     sqrt_stdres.append(math.sqrt(x))
            #
            # scale_predict_residual = ''
            # for pre, res in zip(prediction_val_pand_predict, sqrt_stdres):
            #     scale_predict_residual += str(pre) + 't' + str(res) + 'n'
            # print(scale_predict_residual)

            ###################################3

            # plt.show()

            # scale_predict_residual=''
            #
            # print(len(sqrt_residual))
            # length = len(sqrt_residual)
            #
            # for i in range(0, len(std_sqrt_residual)):
            #     scale_predict_residual += str(prediction_val_pand_predict[i]) + '|' + str(std_sqrt_residual[i]) + '\n'

            # with open('scale_location_plot.csv', 'w') as s_l:
            #     writer_s_l = csv.writer(s_l)
            #     writer_s_l.writerows((prediction_val_pand_predict, sqrt_residual))



            # writing to the parquet

            # prediction_val_pand_predict_tospark = spark.createDataFrame(prediction_val_pand_predict, FloatType())
            # prediction_val_pand_predict_tospark = prediction_val_pand_predict_tospark.withColumnRenamed("value",
            #                                                                                             "prediction")
            #
            # sqrt_residual_tospark= spark.createDataFrame(sqrt_residual, FloatType())
            # sqrt_residual_tospark = sqrt_residual_tospark.withColumnRenamed("value",
            #                                                                                               "sqrt_residual")
            #
            # pred_spark = prediction_val_pand_predict_tospark.withColumn('row_index', f.monotonically_increasing_id())
            # res_spark = sqrt_residual_tospark.withColumn('row_index', f.monotonically_increasing_id())
            #
            # final_scale_fitted = pred_spark.join(res_spark,on=['row_index']) \
            #     .sort('row_index').drop('row_index')
            #
            # final_scale_fitted.show()
            #
            # final_scale_fitted.write.parquet(
            #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/SCALE_LOCATION_PLOT.parquet',
            #     mode='overwrite')
            #

            # dumping the dictionary into json object

            # json_response = {'run_status': 'success', 'PredictiveResponse': resultdf}

            tableContent = \
                {
                    'coefficientValuesKey': coefficientList,
                    'tValuesKey': tValuesList,
                    'pValuesKey': PValuesList,
                    'significanceValuesKey': significanceObject,
                    'interceptValuesKey': intercept_t,
                    "RMSE": RMSE,
                    "RSquare": r_square,
                    "AdjRSquare": adjsted_r_square,
                    "CoefficientStdError": coefficientStdError,

                }
            print(tableContent)

            json_response = {

                "Intercept": intercept_t,
                "Coefficients": coefficient_t,
                "RMSE": RMSE,
                "MSE": MSE,
                "R_square": r_square,
                "Adj_R_square": adjsted_r_square,
                "Coefficient_error": coefficientStdError,
                "T_value": T_values,
                "P_value": P_values,
                'Q_Q_plot': Q_label_pred,
                'residual_fitted': fitted_residual,
                'scale_location': scale_predict_residual

            }

            return json_response


        except Exception as e:
            print('exception is =' + str(e))
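    # A compact sketch of the Q-Q computation done inline above (a hypothetical helper,
    # not in the original class): theoretical normal quantiles at (i + 0.5)/n versus the
    # standardized, sorted residuals. Assumes scipy is available.
    def qqPlotPoints(self, residualList):
        import statistics
        from scipy.stats import norm
        n = len(residualList)
        meanRes = statistics.mean(residualList)
        stdevRes = statistics.stdev(residualList)
        theoretical = [norm.ppf((i + 0.5) / n) for i in range(n)]
        sample = [(x - meanRes) / stdevRes for x in sorted(residualList)]
        return list(zip(theoretical, sample))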
Example #5
    def dataTranform(self):
        dataset = self.dataset
        schemaData = dataset.schema
        categoricalFeatures = []
        numericalFeatures = []
        for schemaVal in schemaData:
            if (str(schemaVal.dataType) == "StringType"
                    or str(schemaVal.dataType) == "TimestampType"
                    or str(schemaVal.dataType) == "DateType"
                    or str(schemaVal.dataType) == "BooleanType"
                    or str(schemaVal.dataType) == "BinaryType"):
                for y in self.featuresColm:
                    if schemaVal.name == y:
                        dataset = dataset.withColumn(
                            y, dataset[y].cast(StringType()))
                        categoricalFeatures.append(schemaVal.name)
            else:
                for y in self.featuresColm:
                    if schemaVal.name == y:
                        numericalFeatures.append(schemaVal.name)

        label = ''
        for val in self.labelColm:
            label = val
        for schemaVal in schemaData:
            if (str(schemaVal.dataType) == "StringType"
                    and schemaVal.name == label):
                for labelkey in self.labelColm:
                    label_indexer = StringIndexer(
                        inputCol=label,
                        outputCol='indexed_' + label,
                        handleInvalid="skip").fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label
        indexedFeatures = []
        for colm in categoricalFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm,
                                    handleInvalid="skip").fit(dataset)
            indexedFeatures.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        combinedFeatures = numericalFeatures + indexedFeatures
        categoryColmListDict = {}
        countOfCategoricalColmList = []
        for value in categoricalFeatures:
            # categoryColm = value
            # listValue = value
            listValue = []
            categoryColm = dataset.groupby(value).count()
            countOfCategoricalColmList.append(categoryColm.count())
            categoryColmJson = categoryColm.toJSON()
            for row in categoryColmJson.collect():
                categoryColmSummary = json.loads(row)
                listValue.append(categoryColmSummary)
            categoryColmListDict[value] = listValue
        self.numericalFeatures = numericalFeatures
        self.categoricalFeatures = categoricalFeatures
        if not categoricalFeatures:
            maxCategories = 5
        else:
            maxCategories = max(countOfCategoricalColmList)

        featureassembler = VectorAssembler(inputCols=combinedFeatures,
                                           outputCol="features",
                                           handleInvalid="skip")
        dataset = featureassembler.transform(dataset)
        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=maxCategories,
                                    handleInvalid="skip").fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Choose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))
        dataset = vec_indexer.transform(dataset)
        return dataset, categoricalFeatures, numericalFeatures
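    # A minimal follow-on sketch (a hypothetical helper, not in the original class) showing
    # how the transformed dataset returned above could feed a model, assuming the label
    # column name (possibly its 'indexed_' version) is passed in:
    def fitOnTransformedData(self, dataset, labelCol):
        from pyspark.ml.regression import LinearRegression
        trainingData, testData = dataset.randomSplit([0.8, 0.2], seed=40)
        lr = LinearRegression(featuresCol="vec_indexed_features", labelCol=labelCol)
        model = lr.fit(trainingData)
        testSummary = model.evaluate(testData)
        print("test RMSE = %f" % testSummary.rootMeanSquaredError)
        return model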
    def callCenter(self):
        dataset = spark.read.csv(
            "/home/fidel/Downloads/CallCenterFinalTillAprilData",
            sep=',',
            header=True,
            inferSchema=True)
        dataset.show()
        feature_colm = ["col_2_SKILLNAME_2", "col_2_SKILLNAME_3"]
        label_colm = ["CALLDATE"]
        label = ""
        for val in label_colm:
            label = val
        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if (str(x.dataType) == "StringType"
                    or str(x.dataType) == 'TimestampType'
                    or str(x.dataType) == 'DateType'
                    or str(x.dataType) == 'BooleanType'
                    or str(x.dataType) == 'BinaryType'):
                for y in feature_colm:
                    if x.name == y:
                        dataset = dataset.withColumn(
                            y, dataset[y].cast(StringType()))
                        stringFeatures.append(x.name)

        categoryColmList = []
        categoryColmListFinal = []
        categoryColmListDict = {}
        countOfCategoricalColmList = []
        for value in stringFeatures:
            listValue = []
            categoryColm = dataset.groupby(value).count()
            print(categoryColm)
            countOfCategoricalColmList.append(categoryColm.count())
            categoryColmJson = categoryColm.toJSON()
            for row in categoryColmJson.collect():
                categoryColmSummary = json.loads(row)
                listValue.append(categoryColmSummary)
            categoryColmListDict[value] = listValue

        if not stringFeatures:
            maxCategories = 5
        else:
            maxCategories = max(countOfCategoricalColmList)
        maxCategories = 13

        for x in Schema:
            if (str(x.dataType) == "StringType" and x.name == label):
                for labelkey in label_colm:
                    label_indexer = StringIndexer(inputCol=label,
                                                  outputCol='indexed_' +
                                                  label).fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label
        dataset.show()
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        final_features = numericalFeatures + indexed_features
        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")
        dataset = featureassembler.transform(dataset)
        vectorIndexer = VectorIndexer(inputCol='features',
                                      outputCol='vectorIndexedFeatures',
                                      maxCategories=maxCategories).fit(dataset)
        dataset = vectorIndexer.transform(dataset)
        # export a readable subset to CSV without overwriting the assembled dataset,
        # so the vectorIndexedFeatures column stays available for the model below
        exportColumns = dataset.select("CALLDATE", "col_2_SKILLNAME_2",
                                       "col_2_SKILLNAME_3", "indexed_CALLDATE",
                                       "indexed_col_2_SKILLNAME_2",
                                       "indexed_col_2_SKILLNAME_3")
        # dataset.to_csv("/home/fidel/Downloads/Callcenterdata/callFinalFormated.csv")
        # dataset.write.csv("/home/fidel/Downloads/Callcenterdata/callFinalF.csv")
        # dataset.write.csv("/home/fidel/Downloads/Callcenterdata/callFinal.csv")
        # dataset.show()
        exportColumns.toPandas().to_csv(
            "/home/fidel/Downloads/Callcenterdata/callcsv.csv")
        trainDataRatioTransformed = 0.80
        testDataRatio = 1 - trainDataRatioTransformed
        trainingData, testData = dataset.randomSplit(
            [trainDataRatioTransformed, testDataRatio], seed=0)
        #applying the model
        randomForestModel = RandomForestClassifier(
            labelCol=label,
            featuresCol='vectorIndexedFeatures',
            numTrees=10,
            maxBins=maxCategories)
        randomForestModelFit = randomForestModel.fit(trainingData)
        predictions = randomForestModelFit.transform(testData)

        # Select example rows to display.
        predictions.select("prediction", label, "features").show(5)
        evaluator = MulticlassClassificationEvaluator(
            labelCol=label,
            predictionCol="prediction",
            metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g" % (1.0 - accuracy))
    def linearRegPersist(self, dataset_add, feature_colm, label_colm,
                         relation_list, relation, userId):
        try:
            dataset = spark.read.csv(dataset_add,
                                     header=True,
                                     inferSchema=True)
            dataset.show()
            label = ''
            for val in label_colm:
                label = val
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType"):
                    for y in feature_colm:
                        if x.name == y:
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)
            if relation == 'linear':
                print('linear relationship')
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)
            dataset.show()
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(inputCol=label,
                                                      outputCol='indexed_' +
                                                      label).fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm,
                                        outputCol='indexed_' +
                                        colm).fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            final_features = numericalFeatures + indexed_features
            featureassembler = VectorAssembler(inputCols=final_features,
                                               outputCol="features")
            dataset = featureassembler.transform(dataset)
            vectorIndexer = VectorIndexer(inputCol='features',
                                          outputCol='vectorIndexedFeatures',
                                          maxCategories=4).fit(dataset)
            dataset = vectorIndexer.transform(dataset)
            # Loading the persisted model
            locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'
            modelPersist = 'linearRegressorModel.parquet'
            persistedModelLocation = locationAddress + userId + modelPersist
            regressorTest = LinearRegressionModel.load(persistedModelLocation)
            predictedData = regressorTest.transform(dataset)

            predictedData.show()

        except Exception as e:
            print('exception is :', e)
Example #8
                                data["pickup_latitude"] >= 40.63).filter(
                                    data["dropoff_latitude"] <= 40.85).filter(
                                        data["dropoff_latitude"] >= 40.63)
    #data.printSchema()
    assembler = VectorAssembler().setInputCols([
        "vendor_id", "pickup_longitude", "pickup_latitude", "pickup_hour",
        "pickup_month", "dropoff_longitude", "dropoff_latitude",
        "trip_distance", "passenger_count"
    ]).setOutputCol("features")
    df = assembler.setHandleInvalid("skip").transform(data).select(
        "trip_duration", "features")

    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=30).fit(df)
    d = featureIndexer.transform(df)
    trainTest = d.randomSplit([0.8, 0.2])
    traindf = trainTest[0]
    testdf = trainTest[1]

    # Model
    dtr = DecisionTreeRegressor(featuresCol="indexedFeatures",
                                labelCol="trip_duration",
                                impurity="variance")

    # choices of tuning parameters
    dtrparamGrid = (ParamGridBuilder().addGrid(dtr.maxDepth, [10]).build())

    pipeline = Pipeline(stages=[featureIndexer, dtr])

    crossval = CrossValidator(estimator=pipeline,
def randomClassifier(dataset_add, feature_colm, label_colm, relation_list,
                     relation):
    try:
        # dataset = spark.read.parquet(dataset_add)
        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=';')

        dataset.show()

        label = ''
        for y in label_colm:
            label = y

        print(label)
        #
        # summaryList = ['mean', 'stddev', 'min', 'max']
        # summaryDict = {}
        # for colm in feature_colm:
        #     summaryListTemp = []
        #     for value in summaryList:
        #         summ = list(dataset.select(colm).summary(value).toPandas()[colm])
        #         summaryListTemp.append(summ)
        #     varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm])
        #     summaryListTemp.append(varianceListTemp)
        #     summaryDict[colm] = summaryListTemp
        # summaryList.append('variance')
        # summaryDict['summaryName'] = summaryList
        #
        # print(summaryDict)

        # print(summaryDict)
        # varianceDict = {}
        # for colm in feature_colm:
        #     varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm])
        #     varianceDict[colm] = varianceListTemp
        # print(varianceDict)

        # summaryAll = {'summaryDict': summaryDict, 'varianceDict': varianceDict}
        # print(summaryAll)

        # extracting the schema

        schemaDataset = dataset.schema

        stringFeatures = []
        numericalFeatures = []

        for x in schemaDataset:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        print(stringFeatures)
        print(numericalFeatures)

        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}
        for colm in numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(
                    dataset.select(colm).summary(value).toPandas()[colm])
                summaryListTemp.append(summ)
            varianceListTemp = list(
                dataset.select(variance(
                    col(colm)).alias(colm)).toPandas()[colm])
            summaryListTemp.append(varianceListTemp)
            summaryDict[colm] = summaryListTemp
        summaryList.append('variance')
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = stringFeatures
        print(summaryDict)

        # print(val)

        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)

        # calling pearson test fuction

        response_pearson_test = Correlation_test_imp(
            dataset=dataset, features=numericalFeatures, label_col=label)

        # dataset = dataset.withColumnRenamed(label , 'indexed_'+ label)

        # dataset_pearson = dataset

        #
        # label_indexer = StringIndexer(inputCol=label, outputCol='indexed_'+label).fit(dataset)
        # dataset = label_indexer.transform(dataset)

        ###########################################################################
        indexed_features = []
        encoded_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
            # dataset.show()
            # encoder = OneHotEncoderEstimator(inputCols=['indexed_'+colm], outputCols=['encoded_'+colm]).fit(dataset)
            # encoded_features.append('encoded_'+colm)
            # dataset = encoder.transform(dataset)
            # dataset.show()

        print(indexed_features)
        print(encoded_features)

        # combining both the features colm together

        final_features = numericalFeatures + indexed_features

        print(final_features)

        # now using the vector assembler

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")

        dataset = featureassembler.transform(dataset)
        dataset.show()

        # output.show()
        # output.select("features").show()

        # output_features = dataset.select("features")

        #using the vector indexer

        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=4).fit(dataset)

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))

        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()

        # preparing the finalized data

        finalized_data = vec_indexed.select(label, 'vec_indexed_features')
        finalized_data.show()

        # renaming the colm
        # print (label)
        # dataset.withColumnRenamed(label,"label")
        # print (label)
        # dataset.show()

        # f = ""
        # f = label + " ~ "
        #
        # for x in features:
        #     f = f + x + "+"
        # f = f[:-1]
        # f = (f)
        #
        # formula = RFormula(formula=f,
        #                    featuresCol="features",
        #                    labelCol="label")
        #
        # output = formula.fit(dataset).transform(dataset)
        #
        # output_2 = output.select("features", "label")
        #
        # output_2.show()
        #
        #
        #
        # splitting the dataset into taining and testing

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        rf = RandomForestRegressor(labelCol=label,
                                   featuresCol='vec_indexed_features',
                                   numTrees=10)

        # Convert indexed labels back to original labels.

        # Train model.  This also runs the indexers.
        model = rf.fit(train_data)

        # Make predictions.
        predictions = model.transform(test_data)

        # Select example rows to display.
        # predictions.select("prediction", "label", "features").show(10)

        print(model.featureImportances)
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)

        features_column_for_user = numericalFeatures + stringFeatures

        feature_imp = {
            'feature_importance': feature_importance,
            "feature_column": features_column_for_user
        }

        response_dict = {
            'feature_importance': feature_imp,
            'pearson_test_data': response_pearson_test,
            'summaryDict': summaryDict
        }

        print(response_dict)

        return response_dict

        # Select (prediction, true label) and compute test error
        # evaluator = MulticlassClassificationEvaluator(
        #     labelCol="label", predictionCol="prediction", metricName="accuracy")
        # accuracy = evaluator.evaluate(predictions)
        # print("Test Error = %g" % (1.0 - accuracy))

        # rfModel = model.stages[2]
        # print(rfModel)  # summary only

    except Exception as e:
        print("exception is  = " + str(e))
Example #10
    def GradientBoostingClassification(self, dataset_add, feature_colm,
                                       label_colm, relation_list, relation):
        try:
            dataset = spark.read.csv(dataset_add,
                                     sep=';',
                                     header=True,
                                     inferSchema=True)
            dataset.show()
            stepSize = self.learningRate
            label = ''
            for val in label_colm:
                label = val
            #ETL part
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType"
                        or str(x.dataType) == 'TimestampType'
                        or str(x.dataType) == 'DateType'
                        or str(x.dataType) == 'BooleanType'
                        or str(x.dataType) == 'BinaryType'):
                    for y in feature_colm:
                        if x.name == y:
                            dataset = dataset.withColumn(
                                y, dataset[y].cast(StringType()))
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)

            if relation == 'linear':
                dataset = dataset
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)

            categoryColmList = []
            categoryColmListFinal = []
            categoryColmListDict = {}
            countOfCategoricalColmList = []
            for value in stringFeatures:
                listValue = []
                categoryColm = dataset.groupby(value).count()
                countOfCategoricalColmList.append(categoryColm.count())
                categoryColmJson = categoryColm.toJSON()
                for row in categoryColmJson.collect():
                    categoryColmSummary = json.loads(row)
                    listValue.append(categoryColmSummary)
                categoryColmListDict[value] = listValue

            if not stringFeatures:
                maxCategories = 5
            else:
                maxCategories = max(countOfCategoricalColmList)
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(inputCol=label,
                                                      outputCol='indexed_' +
                                                      label).fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm,
                                        outputCol='indexed_' +
                                        colm).fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            final_features = numericalFeatures + indexed_features
            featureassembler = VectorAssembler(inputCols=final_features,
                                               outputCol="features")
            dataset = featureassembler.transform(dataset)
            vectorIndexer = VectorIndexer(
                inputCol='features',
                outputCol='vectorIndexedFeatures',
                maxCategories=maxCategories).fit(dataset)
            dataset = vectorIndexer.transform(dataset)
            trainDataRatioTransformed = self.trainDataRatio
            testDataRatio = 1 - trainDataRatioTransformed
            trainingData, testData = dataset.randomSplit(
                [trainDataRatioTransformed, testDataRatio], seed=0)

            gradientBoostingmodel = GBTClassifier(
                labelCol=label,
                featuresCol='vectorIndexedFeatures',
                maxIter=10,
                stepSize=stepSize)
            gradientBoostFittingTrainingData = gradientBoostingmodel.fit(
                trainingData)
            gBPredictionTrainData = gradientBoostFittingTrainingData.transform(
                trainingData)
            gBPredictionTestData = gradientBoostFittingTrainingData.transform(
                testData)
            gBPredictionTestData.select('prediction', label).show()
            # gbtModel = gradientBoostFittingTrainingData.stages
            featureImportance = gradientBoostFittingTrainingData.featureImportances.toArray(
            ).tolist()
            print(featureImportance)

            # prediction graph data
            from pyspark.sql.functions import col
            TrainPredictedTargetData = gBPredictionTrainData.select(
                label, 'prediction', 'probability', 'rawPrediction')
            residualsTrainData = TrainPredictedTargetData.withColumn(
                'residuals',
                col(label) - col('prediction'))
            residualsTrainData.show()

            TestPredictedTargetData = gBPredictionTestData.select(
                label, 'prediction', 'probability', 'rawPrediction')
            residualsTestData = TestPredictedTargetData.withColumn(
                'residuals',
                col(label) - col('prediction'))
            residualsTestData.show()

            # train Test data Metrics
            gBPredictionDataDict = {
                'gBPredictionTestData': gBPredictionTestData,
                'gBPredictionTrainData': gBPredictionTrainData
            }
            metricsList = [
                'f1', 'weightedPrecision', 'weightedRecall', 'accuracy'
            ]
            for key, value in gBPredictionDataDict.items():
                if key == 'gBPredictionTestData':
                    testDataMetrics = {}
                    for metric in metricsList:
                        evaluator = MulticlassClassificationEvaluator(
                            labelCol=label,
                            predictionCol="prediction",
                            metricName=metric)
                        metricValue = evaluator.evaluate(gBPredictionTestData)
                        testDataMetrics[metric] = metricValue
                    print('testDataMetrics :', testDataMetrics)

                if key == 'gBPredictionTrainData':
                    trainDataMetrics = {}
                    for metric in metricsList:
                        evaluator = MulticlassClassificationEvaluator(
                            labelCol=label,
                            predictionCol="prediction",
                            metricName=metric)
                        metricValue = evaluator.evaluate(gBPredictionTrainData)
                        trainDataMetrics[metric] = metricValue
                    print('trainDataMetrics :', trainDataMetrics)

            # while fitting the training data
            totalNumberTrees = gradientBoostFittingTrainingData.getNumTrees
            print('Total number of trees used is :', totalNumberTrees)
            totalNumberNodes = gradientBoostFittingTrainingData.totalNumNodes
            print('Total number of node is :', totalNumberNodes)
            treeWeight = gradientBoostFittingTrainingData.treeWeights
            print('Weights on each tree is :', treeWeight)
            treeInfo = gradientBoostFittingTrainingData.trees
            for eachTree in treeInfo:
                print('info of each tree is :', eachTree)

        except Exception as e:
            print('exception is --', e)
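# A minimal hedged sketch (not part of the original class) showing how the raw
# featureImportances vector printed above could be mapped back to readable column
# names; the helper name and argument names are illustrative assumptions.
def gbt_feature_importance_table(fitted_gbt_model, assembled_input_cols):
    # featureImportances is aligned with the VectorAssembler input column order
    importances = fitted_gbt_model.featureImportances.toArray().tolist()
    return {col: round(imp, 4) for col, imp in zip(assembled_input_cols, importances)}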
def chi_square_test(dataset, features, label_col, stringFeatures):
    spark = SparkSession.builder.appName("predictive_analysis").master(
        "local[*]").getOrCreate()

    spark.sparkContext.setLogLevel("ERROR")
    length = features.__len__()
    datasetChi = dataset

    featureassembler = VectorAssembler(inputCols=features,
                                       outputCol="features",
                                       handleInvalid="skip")

    datasetChi = featureassembler.transform(datasetChi)
    datasetChi.show()

    vec_indexer = VectorIndexer(inputCol='features',
                                outputCol='vec_indexed_features',
                                maxCategories=4,
                                handleInvalid="skip").fit(datasetChi)

    categorical_features = vec_indexer.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categorical_features), ", ".join(
              str(k) for k in categorical_features.keys())))

    vec_indexed = vec_indexer.transform(datasetChi)
    vec_indexed.show()

    finalized_data = vec_indexed.select(label_col, 'vec_indexed_features')
    finalized_data.show()

    # using chi selector
    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="vec_indexed_features",
                             outputCol="selected_features",
                             labelCol=label_col)

    result = selector.fit(finalized_data).transform(finalized_data)

    print("chi2 output with top %d features selected " %
          selector.getNumTopFeatures())
    result.show()

    # running the chi-square value test

    r = ChiSquareTest.test(result, "selected_features", label_col).head()
    p_values = list(r.pValues)
    PValues = []
    for val in p_values:
        PValues.append(round(val, 4))
    print(PValues)
    dof = list(r.degreesOfFreedom)
    stats = list(r.statistics)
    statistics = []
    for val in stats:
        statistics.append(round(val, 4))
    print(statistics)
    chiSquareDict = {}
    for pval, doF, stat, colm in zip(PValues, dof, statistics, stringFeatures):
        print(pval, doF, stat)
        chiSquareDict[colm] = pval, doF, stat
    chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
    print(chiSquareDict)

    return_data = {'pvalues': chiSquareDict}

    return return_data
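# Hedged usage sketch for chi_square_test: the parquet path, feature names and label
# below are placeholders, not values from the original project, and an existing
# SparkSession named `spark` is assumed for reading the data.
example_df = spark.read.parquet('example_dataset.parquet')  # hypothetical path
chi_out = chi_square_test(dataset=example_df,
                          features=['indexed_colA', 'colB'],
                          label_col='indexed_label',
                          stringFeatures=['colA'])
print(chi_out['pvalues'])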
# In[280]:

# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(transformed)
labelIndexer.transform(transformed).show(5, False)

# In[265]:

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(transformed)
featureIndexer.transform(transformed).show(5, True)
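# A small self-contained illustration (toy data, assumed SparkSession `spark`) of the
# maxCategories rule used above: the first vector slot has 3 distinct values and is
# indexed as categorical, while the second has 5 distinct values and stays continuous.
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

toy = spark.createDataFrame(
    [(Vectors.dense([0.0, 12.5]),),
     (Vectors.dense([1.0, 7.2]),),
     (Vectors.dense([2.0, 3.3]),),
     (Vectors.dense([1.0, 9.9]),),
     (Vectors.dense([0.0, 4.4]),)],
    ["features"])
toyIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                           maxCategories=4).fit(toy)
print(toyIndexer.categoryMaps)  # only column 0 is reported as categorical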

# In[281]:

data.show(2, False)

# In[282]:

# Split the data into training and test sets (40% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

trainingData.show(5, False)
testData.show(5, False)

# In[283]:
    def linearReg(self, dataset_add, feature_colm, label_colm, relation_list,
                  relation, userId, locationAddress):
        try:
            dataset = spark.read.parquet(dataset_add)
            dataset.show()

            label = ''
            for val in label_colm:
                label = val
            #ETL part
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType"
                        or str(x.dataType) == 'TimestampType'
                        or str(x.dataType) == 'DateType'
                        or str(x.dataType) == 'BooleanType'
                        or str(x.dataType) == 'BinaryType'):
                    for y in feature_colm:
                        if x.name == y:
                            dataset = dataset.withColumn(
                                y, dataset[y].cast(StringType()))
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)

            if relation == 'linear':
                dataset = dataset
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)

            categoryColmList = []
            categoryColmListFinal = []
            categoryColmListDict = {}
            countOfCategoricalColmList = []
            for value in stringFeatures:
                listValue = []
                categoryColm = dataset.groupby(value).count()
                countOfCategoricalColmList.append(categoryColm.count())
                categoryColmJson = categoryColm.toJSON()
                for row in categoryColmJson.collect():
                    categoryColmSummary = json.loads(row)
                    listValue.append(categoryColmSummary)
                categoryColmListDict[value] = listValue

            if not stringFeatures:
                maxCategories = 5
            else:
                maxCategories = max(countOfCategoricalColmList)
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(
                            inputCol=label,
                            outputCol='indexed_' + label,
                            handleInvalid="skip").fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            # encodedFeatures = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm,
                                        outputCol='indexed_' + colm,
                                        handleInvalid="skip").fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            '''from pyspark.ml.feature import OneHotEncoderEstimator
                oneHotEncodedFeaturesList = []
                for colm in stringFeatures:
                        indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset)
                        indexed_features.append('indexed_' + colm)
                        dataset = indexer.transform(dataset)
                        oneHotEncodedFeaturesList.append('OneHotEncoded_' + colm)
                oneHotEncoder=OneHotEncoderEstimator(inputCols=indexed_features,
                                                     outputCols=oneHotEncodedFeaturesList)
                oneHotEncoderFit=oneHotEncoder.fit(dataset)
                oneHotEncoderFeaturesDataset=oneHotEncoderFit.transform(dataset)'''
            featureAssembler = VectorAssembler(inputCols=indexed_features +
                                               numericalFeatures,
                                               outputCol='features',
                                               handleInvalid="skip")
            dataset = featureAssembler.transform(dataset)
            vectorIndexer = VectorIndexer(inputCol='features',
                                          outputCol='vectorIndexedFeatures',
                                          maxCategories=maxCategories,
                                          handleInvalid="skip").fit(dataset)
            dataset = vectorIndexer.transform(dataset)
            trainDataRatioTransformed = self.trainDataRatio
            testDataRatio = 1 - trainDataRatioTransformed
            train_data, test_data = dataset.randomSplit(
                [trainDataRatioTransformed, testDataRatio], seed=40)

            lr = LinearRegression(featuresCol="vectorIndexedFeatures",
                                  labelCol=label)
            regressor = lr.fit(train_data)
            # locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'

            print("coefficient : " + str(regressor.coefficients))
            coefficient_t = str(regressor.coefficients)
            print("intercept : " + str(regressor.intercept))
            intercept_t = str(regressor.intercept)
            featurePredictedLabel = feature_colm
            featurePredictedLabel.append('prediction')
            featurePredictedLabel.append(label)
            # testDataEvaluation = regressor.evaluate(test_data)
            # testDataPrediction = testDataEvaluation.predictions
            # testDataPrediction.select(featurePredictedLabel).show()

            prediction = regressor.evaluate(test_data)
            prediction_val = prediction.predictions
            testDataPrediction = prediction_val.select(featurePredictedLabel)

            # storing test predicted value to the dataset

            prediction_val_pand = prediction_val.select(
                label, "prediction").toPandas()
            prediction_val_pand = prediction_val_pand.assign(
                residual_vall=prediction_val_pand[label] -
                prediction_val_pand["prediction"])

            prediction_val_pand_residual = prediction_val_pand["residual_vall"]
            prediction_val_pand_label = prediction_val_pand[label]
            prediction_val_pand_predict = prediction_val_pand["prediction"]
            lr_prediction = regressor.transform(test_data)
            lr_prediction.groupBy(label, "prediction").count().show()
            lr_prediction_quantile = lr_prediction.select(label, "prediction")
            training_summary = regressor.summary

            print("numof_Iterations...%d\n" % training_summary.totalIterations)
            print("ObjectiveHistory...%s\n" %
                  str(training_summary.objectiveHistory))
            print("RMSE...%f\n" % training_summary.rootMeanSquaredError)
            RMSE = training_summary.rootMeanSquaredError
            print("MSE....%f\n" % training_summary.meanSquaredError)
            MSE = training_summary.meanSquaredError
            print("r**2(r-square)....::%f\n" % training_summary.r2)
            r_square = training_summary.r2
            print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj)
            adjsted_r_square = training_summary.r2adj
            print("deviance residuals %s" %
                  str(training_summary.devianceResiduals))
            training_summary.residuals.show()
            residual_graph = training_summary.residuals
            residual_graph_pandas = residual_graph.toPandas()
            print("coefficient standard errors: \n" +
                  str(training_summary.coefficientStandardErrors))
            coefficientStdError = str(
                training_summary.coefficientStandardErrors)
            print(" Tvalues :\n" + str(training_summary.tValues))
            T_values = str(training_summary.tValues)
            tValuesList = training_summary.tValues
            print(" p values :\n" + str(training_summary.pValues))
            P_values = str(training_summary.pValues)
            coefficientList = list(regressor.coefficients)

            #summaryData
            import pyspark.sql.functions as F
            import builtins
            round = getattr(builtins, 'round')
            print(coefficientList)
            coefficientListRounded = []
            for value in coefficientList:
                coefficientListRounded.append(round(value, 4))
            # print(coefficientListRounded)
            # print(intercept_t)
            interceptRounded = round(float(intercept_t), 4)
            # print(interceptRounded)
            # print(RMSE)
            RMSERounded = round(RMSE, 4)
            # print(RMSERounded)
            MSERounded = round(MSE, 4)
            rSquareRounded = round(r_square, 4)
            adjustedrSquareRounded = round(adjsted_r_square, 4)
            coefficientStdError = training_summary.coefficientStandardErrors
            coefficientStdErrorRounded = []
            for value in coefficientStdError:
                coefficientStdErrorRounded.append(round(float(value), 4))
            print(coefficientStdErrorRounded)
            tValuesListRounded = []
            for value in tValuesList:
                tValuesListRounded.append(round(value, 4))
            print(tValuesListRounded)
            pValuesListRounded = []
            PValuesList = training_summary.pValues

            for value in PValuesList:
                pValuesListRounded.append(round(value, 4))
            print(pValuesListRounded)

            # regression equation
            intercept_t = float(intercept_t)
            coefficientList = list(regressor.coefficients)
            equation = label, '=', interceptRounded, '+'
            for feature, coeff in zip(feature_colm, coefficientListRounded):
                coeffFeature = coeff, '*', feature, '+'
                equation += coeffFeature
            equation = equation[:-1]
            print(equation)
            equationAsList = list(equation)
            '''# statTable function
            def summaryTable(self,featuresName,featuresStat):
                statTable={}
                for name, stat in zip(featuresName.values(),
                                      featuresStat.values()):
                    print(name, ": ", stat)
                    statTable[name]=stat
                return statTable
            '''

            # significance value

            PValuesList = training_summary.pValues
            significanceObject = {}

            for pValue in pValuesListRounded:
                if (0 <= pValue < 0.001):
                    significanceObject[pValue] = '***'
                if (0.001 <= pValue < 0.01):
                    significanceObject[pValue] = '**'
                if (0.01 <= pValue < 0.05):
                    significanceObject[pValue] = '*'
                if (0.05 <= pValue < 0.1):
                    significanceObject[pValue] = '.'
                if (0.1 <= pValue < 1):
                    significanceObject[pValue] = '-'
            print(significanceObject)
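            # Note: significanceObject is keyed by the rounded p-value itself, so two
            # coefficients that round to the same p-value collapse into a single entry;
            # keying by feature name (e.g. zip(feature_colm, pValuesListRounded)) would
            # keep one significance code per coefficient.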

            # storing test predicted value to the dataset

            predictionData = 'prediction.parquet'

            predictionDataStoring = locationAddress + userId + predictionData
            testDataPrediction.write.parquet(predictionDataStoring,
                                             mode='overwrite')

            # residual  vs predicted value

            prediction_data = regressor.summary.predictions
            prediction_data.show()
            prediction_data.select(['prediction']).show()
            predicted = prediction_data.select(['prediction'])
            regressor.summary.residuals.show()
            residuals = regressor.summary.residuals
            pred_d = predicted.withColumn('row_index',
                                          f.monotonically_increasing_id())
            res_d = residuals.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_residuals = pred_d.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            pred_residuals.show()

            QQPlot = 'QQPlot.parquet'
            # locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'

            # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67'

            QQPlotAddress = locationAddress + userId + QQPlot
            pred_residuals.write.parquet(QQPlotAddress, mode='overwrite')

            # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',
            #                              mode='overwrite')

            #################################################################################3
            # scale location plot
            from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev

            df_label = prediction_data.select(
                label, 'prediction',
                sqrt(ab(prediction_data[label])).alias("sqrt_label"))

            df_label.show()
            df_sqrt_label_index = df_label.withColumn(
                'row_index', f.monotonically_increasing_id())
            df_sqrt_label_index.show()
            res_d.show()
            sqrt_label_residual_join = df_sqrt_label_index.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            sqrt_label_residual_join.show()
            std_resid = sqrt_label_residual_join.select(
                'sqrt_label', 'prediction',
                (sqrt_label_residual_join['residuals'] /
                 sqrt_label_residual_join['sqrt_label']).alias('std_res'))
            std_resid.show()
            sqrt_std_res = std_resid.select(
                "std_res", 'prediction',
                sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid"))
            sqrt_std_res.show()
            sqrt_std_res_fitted = sqrt_std_res.select('prediction',
                                                      'sqrt_std_resid')

            scaleLocationPlot = 'scaleLocation.parquet'

            scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot
            sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress,
                                              mode='overwrite')

            # sqrt_std_res_fitted.write.parquet(
            #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet',
            #     mode='overwrite')
            ###########
            #QQplot
            # QUANTILE

            from scipy.stats import norm
            import statistics
            import math

            res_d.show()
            sorted_res = res_d.sort('residuals')
            sorted_res.show()
            # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'),
            #                                meann(col('residuals')).alias('mean'))
            # stdev_ress.show()
            # mean_residual = stdev_ress.select(['mean']).toPandas()
            # l = mean_residual.values.tolist()
            # print(l)
            # stddev_residual = stdev_ress.select(['std_dev']).toPandas()
            # length of the sorted std residuals
            count = sorted_res.groupBy().count().toPandas()
            countList = count.values.tolist()
            tuple1 = ()
            for k in countList:
                tuple1 = k
            for tu in tuple1:
                lengthResiduals = tu
            print(lengthResiduals)
            quantileList = []
            for x in range(0, lengthResiduals):
                quantileList.append((x - 0.5) / (lengthResiduals))

            print(quantileList)

            # Z-score on theoretical quantiles

            zTheoriticalTrain = []
            for x in quantileList:
                zTheoriticalTrain.append(norm.ppf(abs(x)))
            print(zTheoriticalTrain)

            sortedResidualPDF = sorted_res.select('residuals').toPandas()
            sortedResidualPDF = sortedResidualPDF['residuals']
            stdevResidualTrain = statistics.stdev(sortedResidualPDF)
            meanResidualTrain = statistics.mean(sortedResidualPDF)

            zPracticalTrain = []
            for x in sortedResidualPDF:
                zPracticalTrain.append(
                    (x - meanResidualTrain) / stdevResidualTrain)
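            # zTheoriticalTrain (theoretical normal quantiles) and zPracticalTrain
            # (standardised sorted residuals) form the x/y pairs of the Q-Q plot.
            # quantileList above uses an (i - 0.5) / n plotting position with i starting
            # at 0, so the first probability is negative and abs() is applied inside
            # norm.ppf; the conventional formula indexes i from 1.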

            ##########
            target = dataset.select(label)
            pred = prediction_data.select(['prediction'])
            pred_d = pred.withColumn('row_index',
                                     f.monotonically_increasing_id())
            target_d = target.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_target = pred_d.join(target_d,
                                      on=['row_index']).drop('row_index')
            pred_target.show()

            dataset.show()

            pred_target_data_update = dataset.join(pred_target, on=[label])

            pred_target_data_update.show(100)

            ##########3
            # table_response = {
            #
            #     "Intercept": intercept_t,
            #     "Coefficients": coefficient_t,
            #     "RMSE": RMSE,
            #     "MSE": MSE,
            #     "R_square": r_square,
            #     "Adj_R_square": adjsted_r_square,
            #     "coefficientStdError": coefficientStdError,
            #     "T_value": T_values,
            #     "P_value": P_values
            #
            # }
            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)
            quantile_label = lr_prediction_quantile.approxQuantile(
                label, x, 0.01)
            quantile_prediction = lr_prediction_quantile.approxQuantile(
                "prediction", x, 0.01)
            Q_label_pred = ''
            print(len(quantile_label))
            length = len(quantile_label)

            for i in range(0, len(quantile_label)):
                Q_label_pred += str(quantile_label[i]) + '\t' + str(
                    quantile_prediction[i]) + '\n'
            import math

            fitted_residual = ''
            print(len(prediction_val_pand_residual))
            length = len(prediction_val_pand_residual)

            for i in range(0, len(prediction_val_pand_residual)):
                fitted_residual += str(
                    prediction_val_pand_predict[i]) + '\t' + str(
                        prediction_val_pand_residual[i]) + '\n'
            ## scale location graph data

            prediction_val_pand_residual_abs = prediction_val_pand_residual.abs()
            import math
            sqrt_residual = []
            for x in prediction_val_pand_residual_abs:
                sqrt_residual.append(math.sqrt(x))
                # print ("____________________  ",x)

            # calculating std deviation
            import statistics

            print(statistics.stdev(prediction_val_pand_residual))
            stdev_ = statistics.stdev(prediction_val_pand_residual)

            # calculate standardized residuals
            std_res = []
            for x in prediction_val_pand_residual:
                std_res.append(x / stdev_)
            print(std_res)

            # calculating the square root of std_res
            import math
            sqr_std_res = []
            for x in std_res:
                sqr_std_res.append(math.sqrt(abs(x)))
            print(sqr_std_res)

            scale_predict_residual = ''
            for pre, res in zip(prediction_val_pand_predict, sqr_std_res):
                scale_predict_residual += str(pre) + '\t' + str(res) + '\n'
            print(scale_predict_residual)
            # QUANTILE

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)

            quantile_std_res = spark.createDataFrame(std_res, FloatType())
            quantile_std_res.show()
            quantile_std_res_t = quantile_std_res.approxQuantile(
                'value', x, 0.01)
            print(quantile_std_res_t)
            print(x)
            # calculating the z_score
            from scipy.stats import norm

            ## sort the list
            sorted_std_res = sorted(std_res)

            mean = statistics.mean(sorted_std_res)
            stdev = statistics.stdev(sorted_std_res)
            # print(mean)
            quantile = []
            n = len(std_res)
            print(n)
            for x in range(0, n):
                quantile.append((x - 0.5) / (n))

            print(quantile)
            # theoretical z-scores
            z_theory = []
            for x in quantile:
                z_theory.append(norm.ppf(abs(x)))
            # z score for real val
            z_pract = []
            for x in sorted_std_res:
                z_pract.append((x - mean) / stdev)
            Q_label_pred = ''
            for quant, val in zip(z_theory, z_pract):
                Q_label_pred += str(quant) + '\t' + str(val) + '\n'
            graph_response = {
                "Q_Q_plot": Q_label_pred,
                "residual_fitted": fitted_residual,
                "scale_location": scale_predict_residual
            }

            tableContent = \
                {
                    'coefficientValuesKey': coefficientListRounded,
                    'tValuesKey': tValuesListRounded,
                    'pValuesKey': pValuesListRounded,
                    'significanceValuesKey': significanceObject,
                    'interceptValuesKey': interceptRounded,
                    "RMSE": RMSERounded,
                    "RSquare": rSquareRounded,
                    "AdjRSquare": adjustedrSquareRounded,
                    "CoefficientStdError": coefficientStdErrorRounded,
                    'equationKey': equation
                }

            json_response = {
                'table_data': tableContent,
                'graph_data': graph_response
            }
            print(json_response)
            return (json_response)
        except Exception as e:
            print('exception is =' + str(e))
Example #14
0
    def Logistic_regression(dataset_add, feature_colm, label_colm):

        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()
        label = ''
        for y in label_colm:
            label = y

        print(label)

        # the RFormula approach to indexing, encoding and vectorising (kept commented out below)

        # f = ""
        # f = label + " ~ "
        #
        # for x in features:
        #     f = f + x + "+"
        # f = f[:-1]
        # f = (f)

        # extracting the schema

        val = dataset.schema

        string_features = []
        integer_features = []

        for x in val:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        string_features.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        integer_features.append(x.name)

        print(string_features)
        print(integer_features)
        print(val)
        # print(label)
        # label = 'y'

        for z in val:
            if (z.name == label and str(z.dataType) == "StringType"):
                label_indexer = StringIndexer(inputCol=label,
                                              outputCol='indexed_' +
                                              label).fit(dataset)
                dataset = label_indexer.transform(dataset)
            if (z.name == label and str(z.dataType)
                    in ("IntegerType", "FloatType", "DoubleType")):
                dataset = dataset.withColumnRenamed(label, 'indexed_' + label)

        ###########################################################################
        indexed_features = []
        encoded_features = []
        for col in string_features:
            indexer = StringIndexer(inputCol=col,
                                    outputCol='indexed_' + col).fit(dataset)
            indexed_features.append('indexed_' + col)
            dataset = indexer.transform(dataset)
            # dataset.show()
            # encoder = OneHotEncoderEstimator(inputCols=['indexed_'+col], outputCols=['encoded_'+col]).fit(dataset)
            # encoded_features.append('encoded_'+col)
            # dataset = encoder.transform(dataset)
            # dataset.show()

        print(indexed_features)
        print(encoded_features)

        # combining both feature column lists together

        final_features = integer_features + indexed_features

        print(final_features)

        # now using the vector assembler

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")

        dataset = featureassembler.transform(dataset)
        dataset.show()


        # output.show()
        # output.select("features").show()

        # output_features = dataset.select("features")

        # using the vector indexer (indexes categorical features within the assembled vector)

        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=15).fit(dataset)

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))

        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()

        # preparing the finalized data

        finalized_data = vec_indexed.select('indexed_' + label,
                                            'vec_indexed_features')
        finalized_data.show()

        # formula = RFormula(formula=f,
        #                    featuresCol="features",
        #                    labelCol="label")
        #
        # output = formula.fit(dataset).transform(dataset)
        #
        # output_2 = output.select("features", "label")
        #
        # output_2.show()

        # splitting the dataset into train and test

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        # implementing the logistic regression
        # lr1 =LogisticRegression()

        Accuracy_list = []
        # Accuracy_list.append(accuracy)
        FPR_list = []
        # FPR_list.append(falsePositiveRate)
        TPR_list = []
        precision_list = []
        recall_list = []

        y = 0.1
        # x=[]
        for i in range(0, 3):
            y = round(y + 0.1, 2)

            lr = LogisticRegression(featuresCol='vec_indexed_features',
                                    labelCol='indexed_' + label,
                                    maxIter=5,
                                    regParam=0.1,
                                    elasticNetParam=1.0,
                                    threshold=0.3)

            # fit the model

            lrModel = lr.fit(train_data)

            # print the coefficients and the intercept for the logistic regression

            print("coefficients:" + str(lrModel.coefficientMatrix))
            # mat = (lrModel.coefficientMatrix)
            # print mat
            print("intercept: " + str(lrModel.interceptVector))

            # getting the summary of the model

            training_summary = lrModel.summary

            print(" area under roc : ", training_summary.areaUnderROC)
            print("  roc : ", training_summary.roc)
            roc = training_summary.roc
            roc.show()
            print(" pr value : ", training_summary.pr)
            pr = training_summary.pr
            pr.show()
            print(" precision by threshold : ",
                  training_summary.precisionByThreshold)
            prec_by_threshold = training_summary.precisionByThreshold
            prec_by_threshold.show()

            print(" accuracy : ", training_summary.accuracy)
            accuracy_d = training_summary.accuracy
            print(accuracy_d)

            fMeasure = training_summary.fMeasureByThreshold

            fMeasure.show()

            maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
                'max(F-Measure)').head()
            bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
                .select('threshold').head()['threshold']
            lr.setThreshold(bestThreshold)
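            # bestThreshold is the probability cut-off that maximises F-measure on the
            # training summary; setThreshold updates the estimator `lr`, so it only
            # affects models fitted on later iterations, not the already-fitted lrModel.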

            # obtain the objective per iteration

            objectiveHistory = training_summary.objectiveHistory
            print("objectiveHistory")
            for objective in objectiveHistory:
                print(objective)

            # for a multiclass we can inspect  a matrix on a per label basis

            print("false positive rate by label:")
            for i, rate in enumerate(
                    training_summary.falsePositiveRateByLabel):
                print("label %d: %s" % (i, rate))

            print("True positive rate")
            for i, rate in enumerate(training_summary.truePositiveRateByLabel):
                print("label %d : %s" % (i, rate))
            #
            # print("True Negative rate")
            # for i, rate in enumerate(training_summary)

            print("Precision by label:")
            for i, prec in enumerate(training_summary.precisionByLabel):
                print("label %d: %s" % (i, prec))

            print("Recall by label:")
            for i, rec in enumerate(training_summary.recallByLabel):
                print("label %d: %s" % (i, rec))

            print("F-measure by label:")
            for i, f in enumerate(training_summary.fMeasureByLabel()):
                print("label %d: %s" % (i, f))

            accuracy = training_summary.accuracy
            falsePositiveRate = training_summary.weightedFalsePositiveRate
            truePositiveRate = training_summary.weightedTruePositiveRate
            fMeasure = training_summary.weightedFMeasure()
            precision = training_summary.weightedPrecision
            recall = training_summary.weightedRecall
            print(
                "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
                % (accuracy, falsePositiveRate, truePositiveRate, fMeasure,
                   precision, recall))
            # Accuracy_list = []
            Accuracy_list.append(accuracy)
            # FPR_list = []
            FPR_list.append(falsePositiveRate)
            # TPR_list=[]
            TPR_list.append(truePositiveRate)
            precision_list.append(precision)
            recall_list.append(recall)

        print(Accuracy_list)
        print(FPR_list)
        print(TPR_list)
        print(precision_list)
        print(recall_list)

        import matplotlib.pyplot as plt
        #
        # plt.plot(recall_list, FPR_list)
        # plt.show()

        #
        # fpr = [0.0,0.0,0.0,0.0,0.003067484662576687, 0.003067484662576687, 0.006134969325153374, 0.11042944785276074, 0.1165644171779141, 0.1165644171779141, 0.23006134969325154, 0.9723926380368099, 0.9846625766871165 ]
        # tpr = [0.0, 0.09767441860465116, 0.10232558139534884, 0.13488372093023257 ,0.17674418604651163 ,0.3674418604651163 , 0.37209302325581395  , 0.7534883720930232, 0.8651162790697674 , 0.8697674418604651 , 0.9069767441860465, 0.9953488372093023, 1.0]
        # data visualization

        # ROC graph
        fpr = roc.select("FPR").toPandas()

        tpr = roc.select("TPR").toPandas()

        plt.plot(fpr, tpr)
        plt.show()

        # PR graph

        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()

        plt.plot(pr_recall, pr_precision)
        plt.show()

        # now applying the fit on the test data

        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy('indexed_' + label, "prediction").count().show()
        prediction_val.show()

        prediction_val.groupBy("prediction").count().show()

        prediction_val.groupBy("prediction", "probability").count().show()
Example #15
0
# In[14]:

from pyspark.ml.feature import StringIndexer
# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)
labelIndexer.transform(data)

from pyspark.ml.feature import VectorIndexer
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(data)
featureIndexer.transform(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# In[15]:

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.util import MLUtils

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures")
Example #16
0
def randomClassifier(dataset_add, feature_colm, label_colm, relation_list,
                     relation):
    try:
        dataset = spark.read.parquet(dataset_add)
        label = ''
        for y in label_colm:
            label = y

        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}

        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        for colm in numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(
                    dataset.select(colm).summary(value).toPandas()[colm])
                summaryListSubTemp = []
                for val in summ:
                    summaryListSubTemp.append(round(float(val), 4))
                # print(summaryListSubTemp)
                summaryListTemp.append(summaryListSubTemp)
            # varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm])
            # summaryListTemp.append(varianceListTemp)
            summaryDict[colm] = summaryListTemp
        # summaryList.append('variance')
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = stringFeatures
        skewnessList = []
        kurtosisList = []
        varianceList = []
        skewKurtVarDict = {}
        for colm in numericalFeatures:
            skewness = (dataset.select(F.skewness(dataset[colm])).toPandas())
            for i, row in skewness.iterrows():
                for j, column in row.iteritems():
                    skewnessList.append(round(column, 4))
            kurtosis = (dataset.select(F.kurtosis(dataset[colm])).toPandas())
            for i, row in kurtosis.iterrows():
                for j, column in row.iteritems():
                    kurtosisList.append(round(column, 4))
            variance = (dataset.select(F.variance(dataset[colm])).toPandas())
            for i, row in variance.iterrows():
                for j, column in row.iteritems():
                    varianceList.append(round(column, 4))

        for skew, kurt, var, colm in zip(skewnessList, kurtosisList,
                                         varianceList, numericalFeatures):
            print(skew, kurt, var)
            skewKurtVarList = []
            skewKurtVarList.append(skew)
            skewKurtVarList.append(kurt)
            skewKurtVarList.append(var)
            skewKurtVarDict[colm] = skewKurtVarList

        for (keyOne, valueOne), (keyTwo,
                                 valueTwo) in zip(summaryDict.items(),
                                                  skewKurtVarDict.items()):
            print(keyOne, valueOne, keyTwo, valueTwo)
            if keyOne == keyTwo:
                valueOne.extend(valueTwo)
                summaryDict[keyOne] = valueOne
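        # After this merge, each numerical column's entry in summaryDict holds
        # [mean, stddev, min, max, skewness, kurtosis, variance], and summaryList is
        # extended below so 'summaryName' labels those positions in order.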
        summaryList.extend(['skewness', 'kurtosis', 'variance'])
        print(summaryDict)
        # for colm in numericalFeatures:
        #     skewness = (dataset.select(F.skewness(dataset[colm])).alias('skewness_' + colm))
        #     kurtosis = (dataset.select(F.kurtosis(dataset[colm])).alias('kurtosis_' + colm))
        #     variance = (dataset.select(F.variance(dataset[colm]).alias('kurtosis_' + colm)))
        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)

        dataset.show()
        for x in Schema:
            if (str(x.dataType) == "StringType" and x.name == label):
                for labelkey in label_colm:
                    label_indexer = StringIndexer(inputCol=label,
                                                  outputCol='indexed_' +
                                                  label).fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        final_features = numericalFeatures + indexed_features
        response_chi_test = chi_square_test(dataset=dataset,
                                            features=indexed_features,
                                            label_col=label,
                                            stringFeatures=stringFeatures)

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")
        dataset = featureassembler.transform(dataset)
        dataset.show()
        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=4).fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Choose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))
        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()
        finalized_data = vec_indexed.select(label, 'vec_indexed_features')
        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)
        rf = RandomForestClassifier(labelCol=label,
                                    featuresCol='vec_indexed_features',
                                    numTrees=10)
        model = rf.fit(train_data)
        predictions = model.transform(test_data)
        print(model.featureImportances)
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        # feature_importance = [round(x,4) for x in feature_importance]
        featureImportance = []
        for x in feature_importance:
            featureImportance.append(round(x, 4))
        print(featureImportance)

        features_column_for_user = numericalFeatures + stringFeatures
        feature_imp = {
            'feature_importance': featureImportance,
            "feature_column": features_column_for_user
        }
        response_dict = {
            'feature_importance': feature_imp,
            'ChiSquareTestData': response_chi_test,
            'summaryDict': summaryDict
        }
        return response_dict
    except Exception as e:
        print("exception is  = " + str(e))
    def lassoRegression(self, dataset_add, feature_colm, label_colm,
                        relation_list, relation, userId):
        try:
            dataset = spark.read.parquet(dataset_add)
            dataset.show()
            Rsqr_list = []
            Rsqr_regPara = {}
            print(self.xt)
            # print(data_add)

            label = ''
            for val in label_colm:
                label = val
            #ETL part
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType"
                        or str(x.dataType) == 'TimestampType'
                        or str(x.dataType) == 'DateType'
                        or str(x.dataType) == 'BooleanType'
                        or str(x.dataType) == 'BinaryType'):
                    for y in feature_colm:
                        if x.name == y:
                            dataset = dataset.withColumn(
                                y, dataset[y].cast(StringType()))
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)

            if relation == 'linear':
                dataset = dataset
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)

            categoryColmList = []
            categoryColmListFinal = []
            categoryColmListDict = {}
            countOfCategoricalColmList = []
            for value in stringFeatures:
                listValue = []
                categoryColm = dataset.groupby(value).count()
                countOfCategoricalColmList.append(categoryColm.count())
                categoryColmJson = categoryColm.toJSON()
                for row in categoryColmJson.collect():
                    categoryColmSummary = json.loads(row)
                    listValue.append(categoryColmSummary)
                categoryColmListDict[value] = listValue

            if not stringFeatures:
                maxCategories = 5
            else:
                maxCategories = max(countOfCategoricalColmList)
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(
                            inputCol=label,
                            outputCol='indexed_' + label,
                            handleInvalid="skip").fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            encodedFeatures = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm,
                                        outputCol='indexed_' + colm,
                                        handleInvalid="skip").fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            featureAssembler = VectorAssembler(inputCols=indexed_features +
                                               numericalFeatures,
                                               outputCol='features',
                                               handleInvalid="skip")
            dataset = featureAssembler.transform(dataset)
            vectorIndexer = VectorIndexer(inputCol='features',
                                          outputCol='vectorIndexedFeatures',
                                          maxCategories=maxCategories,
                                          handleInvalid="skip").fit(dataset)
            dataset = vectorIndexer.transform(dataset)
            trainDataRatioTransformed = self.trainDataRatio
            testDataRatio = 1 - trainDataRatioTransformed
            train_data, test_data = dataset.randomSplit(
                [trainDataRatioTransformed, testDataRatio], seed=40)

            ######################################################################33
            # lasso final
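            # self.xt is assumed to hold the candidate regParam values: each pass fits a
            # lasso model (elasticNetParam=1 selects the pure L1 penalty), records its
            # r-squared from the training summary, and the regParam with the highest
            # r-squared is refit below as the final model.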
            for t in self.xt:
                lr1 = LinearRegression(featuresCol="vectorIndexedFeatures",
                                       labelCol=label,
                                       elasticNetParam=1,
                                       regParam=t)
                regressor1 = lr1.fit(train_data)
                print(t)
                print("coefficient : " + str(regressor1.coefficients))
                reg_sum = regressor1.summary
                r2 = reg_sum.r2
                Rsqr_list.append(r2)
                Rsqr_regPara[r2] = t
                print(r2)

            print(Rsqr_list)
            print(max(Rsqr_list))
            maximum_rsqr = max(Rsqr_list)
            print(Rsqr_regPara)
            final_regPara = []

            for key, val in Rsqr_regPara.items():
                if (key == maximum_rsqr):
                    print(val)
                    final_regPara.append(val)

            for reg in final_regPara:
                lr_lasso = LinearRegression(
                    featuresCol="vectorIndexedFeatures",
                    labelCol=label,
                    elasticNetParam=1,
                    regParam=reg)
                regressor = lr_lasso.fit(train_data)
                training_summary = regressor.summary
                r2 = training_summary.r2
                print(r2)

            print("coefficient : " + str(regressor.coefficients))
            coefficient_t = str(regressor.coefficients)
            print("intercept : " + str(regressor.intercept))
            intercept_t = str(regressor.intercept)
            prediction = regressor.evaluate(test_data)
            prediction_val = prediction.predictions
            prediction_val.show()
            prediction_val_pand = prediction_val.select(
                label, "prediction").toPandas()
            prediction_val_pand = prediction_val_pand.assign(
                residual_vall=prediction_val_pand[label] -
                prediction_val_pand["prediction"])

            prediction_val_pand_residual = prediction_val_pand["residual_vall"]
            prediction_val_pand_label = prediction_val_pand[label]
            prediction_val_pand_predict = prediction_val_pand["prediction"]
            lr_prediction = regressor.transform(test_data)
            lr_prediction.groupBy(label, "prediction").count().show()
            lr_prediction_quantile = lr_prediction.select(label, "prediction")
            lr_prediction_onlypred = lr_prediction.select('prediction')
            # lr_prediction_quantile.show()

            # training_summary = regressor.summary

            print("Number of iterations: %d\n" % training_summary.totalIterations)
            print("Objective history: %s\n" %
                  str(training_summary.objectiveHistory))
            print("RMSE: %f\n" % training_summary.rootMeanSquaredError)
            RMSE = training_summary.rootMeanSquaredError
            print("MSE: %f\n" % training_summary.meanSquaredError)
            MSE = training_summary.meanSquaredError
            print("r-squared: %f\n" % training_summary.r2)
            r_square = training_summary.r2
            print("adjusted r-squared: %f\n" % training_summary.r2adj)
            adjusted_r_square = training_summary.r2adj
            print("deviance residuals %s" %
                  str(training_summary.devianceResiduals))
            training_summary.residuals.show()
            # residual_graph = training_summary.residuals
            # test = (residual_graph, lr_prediction_onlypred)
            # residual_graph.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append' )
            # print(test)
            # test.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode= 'append')
            # residual_graph_pandas = residual_graph.toPandas()
            # print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors))
            # coefficient_error = str(training_summary.coefficientStandardErrors)
            # print(" Tvalues :\n" + str(training_summary.tValues))
            # T_values = str(training_summary.tValues)
            # print(" p values :\n" + str(training_summary.pValues))
            # P_values = str(training_summary.pValues)

            #######################################################################################################
            table_response = {
                "Intercept": intercept_t,
                "Coefficients": coefficient_t,
                "RMSE": RMSE,
                "MSE": MSE,
                "R_square": r_square,
                "Adj_R_square": adjusted_r_square
            }
            #######################################################################################################
            # residual  vs predicted value

            prediction_data = regressor.summary.predictions
            prediction_data.show()
            prediction_data.select(['prediction']).show()
            predicted = prediction_data.select(['prediction'])
            regressor.summary.residuals.show()
            residuals = regressor.summary.residuals
            pred_d = predicted.withColumn('row_index',
                                          f.monotonically_increasing_id())
            res_d = residuals.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_residuals = pred_d.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            pred_residuals.show()
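            # Note: f.monotonically_increasing_id() produces partition-dependent,
            # non-consecutive ids, so joining two DataFrames on it only pairs rows
            # correctly when both sides share the same partitioning and row order.
            # A window-ordered row_number() is a safer pairing key, e.g.
            # f.row_number().over(Window.orderBy(f.monotonically_increasing_id()))
            # (requires `from pyspark.sql.window import Window`).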

            QQPlot = 'QQPlot.parquet'
            locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'

            # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67'

            QQPlotAddress = locationAddress + userId + QQPlot
            pred_residuals.write.parquet(QQPlotAddress, mode='overwrite')

            # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',
            #                              mode='overwrite')

            #################################################################################
            # scale location plot
            from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev

            df_label = prediction_data.select(
                label, 'prediction',
                sqrt(ab(prediction_data[label])).alias("sqrt_label"))

            df_label.show()
            df_sqrt_label_index = df_label.withColumn(
                'row_index', f.monotonically_increasing_id())
            df_sqrt_label_index.show()
            res_d.show()
            sqrt_label_residual_join = df_sqrt_label_index.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            sqrt_label_residual_join.show()
            std_resid = sqrt_label_residual_join.select(
                'sqrt_label', 'prediction',
                (sqrt_label_residual_join['residuals'] /
                 sqrt_label_residual_join['sqrt_label']).alias('std_res'))
            std_resid.show()
            sqrt_std_res = std_resid.select(
                "std_res", 'prediction',
                sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid"))
            sqrt_std_res.show()
            sqrt_std_res_fitted = sqrt_std_res.select('prediction',
                                                      'sqrt_std_resid')

            scaleLocationPlot = 'scaleLocation.parquet'

            scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot
            sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress,
                                              mode='overwrite')

            # sqrt_std_res_fitted.write.parquet(
            #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet',
            #     mode='overwrite')
            ###########
            #QQplot
            # QUANTILE

            from scipy.stats import norm
            import statistics
            import math

            res_d.show()
            sorted_res = res_d.sort('residuals')
            sorted_res.show()
            # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'),
            #                                meann(col('residuals')).alias('mean'))
            # stdev_ress.show()
            # mean_residual = stdev_ress.select(['mean']).toPandas()
            # l = mean_residual.values.tolist()
            # print(l)
            # stddev_residual = stdev_ress.select(['std_dev']).toPandas()
            # length of the sorted std residuals
            count = sorted_res.groupBy().count().toPandas()
            countList = count.values.tolist()
            tuple1 = ()
            for k in countList:
                tuple1 = k
            for tu in tuple1:
                lengthResiduals = tu
            print(lengthResiduals)
            quantileList = []
            for x in range(0, lengthResiduals):
                quantileList.append((x - 0.5) / (lengthResiduals))

            print(quantileList)

            # Z-scores of the theoretical quantiles

            zTheoreticalTrain = []
            for x in quantileList:
                zTheoreticalTrain.append(norm.ppf(abs(x)))
            print(zTheoreticalTrain)
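
            # A small aside (sketch with hypothetical names, not used below): the
            # usual plotting positions are (i + 0.5) / n for i = 0..n-1, which stay
            # strictly inside (0, 1), so norm.ppf can be applied without abs().
            plottingPositions = [(i + 0.5) / lengthResiduals
                                 for i in range(lengthResiduals)]
            zTheoreticalAlt = [norm.ppf(p) for p in plottingPositions]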

            sortedResidualPDF = sorted_res.select('residuals').toPandas()
            sortedResidualPDF = sortedResidualPDF['residuals']
            stdevResidualTrain = statistics.stdev(sortedResidualPDF)
            meanResidualTrain = statistics.mean(sortedResidualPDF)

            zPracticalTrain = []
            for x in sortedResidualPDF:
                zPracticalTrain.append(
                    (x - meanResidualTrain) / stdevResidualTrain)

            ##########
            target = dataset.select(label)
            pred = prediction_data.select(['prediction'])
            pred_d = pred.withColumn('row_index',
                                     f.monotonically_increasing_id())
            target_d = target.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_target = pred_d.join(target_d,
                                      on=['row_index']).drop('row_index')
            pred_target.show()

            dataset.show()

            pred_target_data_update = dataset.join(pred_target, on=[label])

            pred_target_data_update.show(100)

            ##########################################################################################

            # scale location plot

            # for scale location plot
            # from pyspark.sql.functions import udf
            #
            # def std_res(x):
            #     res_list = []
            #     res_list.append(x)
            #
            # std_residuals = udf(lambda y: std_res(y), FloatType())
            #
            # residuals_std = residuals.withColumn('residuals', std_residuals(col('residuals').cast(FloatType())))
            #
            # import statistics
            # import numpy as np
            # residuals_panda = residuals.toPandas()
            # # residuals_panda.residuals = range(residuals_panda.shape[1])
            # residuals_panda = residuals_panda.values
            # print(residuals_panda)
            # stdev_training = statistics.stdev(residuals_panda)
            # print(stdev_training)

            ############################################################################################################

            # creating the dictionary for storing the result

            # json_response = coefficient_t

            # print(json_response)

            # json_response = {"adjusted r**2 value" : training_summary.r2adj}

            # DATA VISUALIZATION PART

            # finding the quantile in the dataset(Q_Q plot)
            import matplotlib.pyplot as plt

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)
            quantile_label = lr_prediction_quantile.approxQuantile(
                label, x, 0.01)
            quantile_prediction = lr_prediction_quantile.approxQuantile(
                "prediction", x, 0.01)
            Q_label_pred = ''
            print(len(quantile_label))
            length = len(quantile_label)

            for i in range(0, len(quantile_label)):
                Q_label_pred += str(quantile_label[i]) + '\t' + str(
                    quantile_prediction[i]) + '\n'
            import math

            fitted_residual = ''
            print(len(prediction_val_pand_residual))
            length = len(prediction_val_pand_residual)

            for i in range(0, len(prediction_val_pand_residual)):
                fitted_residual += str(
                    prediction_val_pand_predict[i]) + '\t' + str(
                        prediction_val_pand_residual[i]) + '\n'
            ## scale location graph data

            prediction_val_pand_residual
            prediction_val_pand_predict
            prediction_val_pand_residual_abs = prediction_val_pand_residual.abs()
            import math
            sqrt_residual = []
            for x in prediction_val_pand_residual_abs:
                sqrt_residual.append(math.sqrt(x))
                # print ("____________________  ",x)

            sqrt_residual
            # calculating std deviation
            import statistics

            print(statistics.stdev(prediction_val_pand_residual))
            stdev_ = statistics.stdev(prediction_val_pand_residual)

            # calculate standardized residuals
            std_res = []
            for x in prediction_val_pand_residual:
                std_res.append(x / stdev_)
            print(std_res)

            # calculating the square root of std_res
            import math
            sqr_std_res = []
            for x in std_res:
                sqr_std_res.append(math.sqrt(abs(x)))
            print(sqr_std_res)

            scale_predict_residual = ''
            for pre, res in zip(prediction_val_pand_predict, sqr_std_res):
                scale_predict_residual += str(pre) + '\t' + str(res) + '\n'
            print(scale_predict_residual)
            # QUANTILE

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)

            quantile_std_res = spark.createDataFrame(std_res, FloatType())
            quantile_std_res.show()
            quantile_std_res_t = quantile_std_res.approxQuantile(
                'value', x, 0.01)
            print(quantile_std_res_t)
            print(x)
            # calculating the z_score
            from scipy.stats import norm

            ## sort the list
            sorted_std_res = sorted(std_res)

            mean = statistics.mean(sorted_std_res)
            stdev = statistics.stdev(sorted_std_res)
            # print(mean)
            quantile = []
            n = len(std_res)
            print(n)
            for x in range(0, n):
                quantile.append((x - 0.5) / (n))

            print(quantile)
            # theoretical z-scores
            z_theory = []
            for x in quantile:
                z_theory.append(norm.ppf(abs(x)))
            # z score for real val
            z_pract = []
            for x in sorted_std_res:
                z_pract.append((x - mean) / stdev)
            Q_label_pred = ''
            for quant, val in zip(z_theory, z_pract):
                Q_label_pred += str(quant) + '\t' + str(val) + '\n'
            graph_response = {
                "Q_Q_plot": Q_label_pred,
                "residual_fitted": fitted_residual,
                "scale_location": scale_predict_residual
            }

            json_response = {
                'table_data': table_response,
                'graph_data': graph_response
            }

            return json_response

        except Exception as e:
            print('exception is =' + str(e))
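
# An alternative sketch for the regParam search in the listing above (assumed
# names: `train_data`, `label` and the 'vectorIndexedFeatures' column built
# earlier; the candidate penalties are illustrative): let CrossValidator choose
# the lasso penalty instead of looping over self.xt manually.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lasso = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label,
                         elasticNetParam=1.0)
paramGrid = (ParamGridBuilder()
             .addGrid(lasso.regParam, [0.001, 0.01, 0.05, 0.1, 0.5])
             .build())
evaluator = RegressionEvaluator(labelCol=label, predictionCol="prediction",
                                metricName="r2")
cv = CrossValidator(estimator=lasso, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=3)
cvModel = cv.fit(train_data)

# cross-validated r2 for each candidate penalty, and the best one
for params, metric in zip(paramGrid, cvModel.avgMetrics):
    print(params[lasso.regParam], metric)
bestIdx = max(range(len(paramGrid)), key=lambda i: cvModel.avgMetrics[i])
print("best regParam:", paramGrid[bestIdx][lasso.regParam])
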
예제 #18
0
# deal with categorical label
from pyspark.ml.feature import StringIndexer

# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)
labelIndexer.transform(data).show(5, True)

from pyspark.ml.feature import VectorIndexer

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", \
                               outputCol="indexedFeatures", \
                               maxCategories=4).fit(data)
featureIndexer.transform(data).show(5, True)
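
# Optional check (a short sketch): inspect which feature indices VectorIndexer
# treated as categorical, and the value-to-index map it learned for each.
print(featureIndexer.categoryMaps)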

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# split data into training and test data sets
(trainingData, testData) = data.randomSplit([0.6, 0.4])
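
# One possible way to continue this listing (a hedged sketch; the classifier used
# in the original is not shown here): chain the indexers with a
# DecisionTreeClassifier and measure accuracy on the held-out split.
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt, labelConverter])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy = %g" % evaluator.evaluate(predictions))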

# visualization
import numpy as np
import itertools
import matplotlib.pyplot as plt


def plot_confusion_matrix(cm, classes,
예제 #19
0
			spark.stop()
			data["trainSI"] = trainPath
			data["testSI"] = testPath

			data["currentTrain"] = trainPath
			data["currentTest"] = testPath

		elif config["transformerType"] == "vi":

			train, test = spark.read.parquet(data["currentTrain"]), spark.read.parquet(data["currentTest"])
			train.cache()
			test.cache()

			df = train.unionByName(test)
			featureIndexer = VectorIndexer(inputCol=config["inputCol"], outputCol=config["outputCol"], maxCategories=config["maxCategories"]).fit(df)
			train = featureIndexer.transform(train)
			test = featureIndexer.transform(test)

			trainPath = data['scheme'] + "://" + data['save'] + "/trainVI/"
			testPath = data['scheme'] + "://" + data['save'] + "/testVI/"
			if "partitionCol" in data and data['partitionCol'] in train.schema.names:
				train.write.partitionBy(data['partitionCol']).format("parquet").save(trainPath)
				test.write.partitionBy(data['partitionCol']).format("parquet").save(testPath)
			else:
				train.write.format("parquet").mode("overwrite").save(trainPath)
				test.write.format("parquet").mode("overwrite").save(testPath)
			spark.stop()

			data["trainVI"] = trainPath
			data["testVI"] = testPath
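
# A variant sketch (same config/train/test assumptions as the branch above): fit
# the VectorIndexer on the training split only and use handleInvalid="keep", so
# category values that appear only in the test split are bucketed instead of the
# test data leaking into the fitted indexer, e.g.
#   featureIndexer = VectorIndexer(inputCol=config["inputCol"],
#                                  outputCol=config["outputCol"],
#                                  maxCategories=config["maxCategories"],
#                                  handleInvalid="keep").fit(train)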
예제 #20
0
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors


# transform each row (dropping its last column) into a dense feature vector
def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1])]).toDF(['features'])


transformed = transData(df)
transformed.show(5, False)

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", \
                               outputCol="indexedFeatures",\
                               maxCategories=4).fit(transformed)

data = featureIndexer.transform(transformed)

#create a kmeans stage
kmeans = KMeans() \
          .setK(3) \
          .setFeaturesCol("indexedFeatures")\
          .setPredictionCol("cluster")

# Chain indexer and kmeans in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, kmeans])

#fit pipeline
model = pipeline.fit(transformed)

#transform data
cluster = model.transform(transformed)
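
# A short follow-up sketch (assumes the fitted `model` and the `cluster` DataFrame
# above): silhouette score for the clustering and the learned cluster centres.
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(featuresCol="indexedFeatures", predictionCol="cluster")
print("Silhouette = %g" % evaluator.evaluate(cluster))
for center in model.stages[-1].clusterCenters():
    print(center)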
예제 #21
0
class adresDefteri(kisi):
 def __init__(self):
  self.directory = 'sonuclar'
  self.createFolder()
  self.sc = SparkContext('local')
  spark = SparkSession(self.sc)
  spark = SparkSession \
                  .builder \
                  .appName("Python Spark Logistic Regression example") \
                  .config('spark.executor.heartbeatInterval', '3600s') \
                  .config("spark.some.config.option", "some-value") \
                  .getOrCreate()
  locale = self.sc._jvm.java.util.Locale
  locale.setDefault(locale.forLanguageTag("en-US"))
  
  self.catcols = ['targtype1_txt']
  self.num_cols = ['country', 'region','attacktype1','weaptype1']
  self.labelCol = 'gname'

  Root=Tk()
  Root.geometry("800x600")
  Root.title("Yaşanan Terör Olaylarını İçeren Büyük Verinin Makine Öğrenmesi Teknikleri İle Analizi")

  menu = Menu(Root)
  filemenu = Menu(menu)
  menu.add_cascade(label="File", menu=filemenu)
  filemenu.add_command(label="CSV View", command=self.secVeGoster)
  filemenu.add_separator()
  filemenu.add_command(label="Çıkış", command=Root.quit)
  
  filemenu.add_separator()
  filemenu.add_command(label="Yeniden Başlat", command=self.restart_program)

  helpmenu = Menu(menu)
  menu.add_cascade(label="Yardım", menu=helpmenu)
  helpmenu.add_command(label="Hakkında...", command=self.Hakkinda)

  
  Root.configure(background='yellow',menu=menu)
  global HakkindaPencere,combo
  self.nameText = StringVar()
  self.selected1 = IntVar()
  self.selected1.set(1)
  self.selected2 = IntVar()
  self.selected2.set(3)

  
  self.egitimLbl=Label(text="  Eğitim Verisi",width=30,height=3,fg="red",bg="yellow")
  self.egitimLbl.grid(row=0,column=0)

  self.egitimTxt=Entry(textvariable = self.nameText, fg="red",bg="yellow")
  self.egitimTxt.grid(row=0,column=1)

  self.egitimSec=Button(text="  ...  ",command=self.secim,width=10,height=1,fg="red",bg="yellow")
  self.egitimSec.grid(row=0,column=2)

  self.dataSayisiLbl=Label(text="  Data Sayısı Girin (Maks:181600)",width=30,height=3,fg="red",bg="yellow")
  self.dataSayisiLbl.grid(row=0,column=3)

  self.dataSayisiTxt=Entry(fg="red",bg="yellow")
  self.dataSayisiTxt.grid(row=0,column=4)

  self.testLbl=Label(text="  Test Verisi Oranı %",width=30,height=3,fg="red",bg="yellow")
  self.testLbl.grid(row=1,column=0)

  self.testTxt=Entry(fg="red",bg="yellow")
  self.testTxt.grid(row=1,column=1)  

  self.algoritmaLbl=Label(text="  Algoritma Seçiniz:",width=30,height=3,fg="red",bg="yellow")
  self.algoritmaLbl.grid(row=2,column=0)


  self.rad1 = Radiobutton(text='Hepsini karşılaştır',variable=self.selected1, value=1,command=self.secilenRadio1)
  self.rad1.grid(column=1, row=2)

  self.rad2 = Radiobutton(text='Bir Algoritma Seçiniz:',variable=self.selected1, value=2,command=self.secilenRadio1)
  self.rad2.grid(column=2, row=2)
  
  self.combo = ttk.Combobox (Root, state='readonly')
  self.combo['values']= ("Logistic Regression", "Naive Bayes", "Random Forest Classifier", "Decision Tree Classifier","Support Vector Machine","KNN" )
  #self.combo.current(-1) #set the selected item
  #self.combo.grid(column=3, row=2)
  

  self.ulkeLbl=Label(text="  Ülke Seçiniz:",width=30,height=3,fg="red",bg="yellow")
  self.ulkeLbl.grid(row=3,column=0)

  self.rad3 = Radiobutton(text='Tüm Ülkeler İçin', variable=self.selected2, value=3,command=self.secilenRadio2)
  self.rad3.grid(column=1, row=3)
  
  self.rad4 = Radiobutton(text='Ülke Seçin:', variable=self.selected2, value=4,command=self.secilenRadio2)
  self.rad4.grid(column=2, row=3)
  
  self.comboulke = ttk.Combobox (Root, state='readonly')
  self.comboulke['values']= ("Türkiye", "ABD", "İran", "Pakistan", "Irak","Afganistan","Suriye")
  #self.comboulke.grid(column=3, row=3)

  #209 Turkey
  #217 ABD
  #94 İran
  #153 Pakistan
  #95 Irak
  #4 Afganistan
  #200 Suriye

  #self.comboulke.current(1) #set the selected item
  #image=photo3 adds the image, compound=LEFT aligns it to the left

  self.YukleBtn=Button(text="Veriyi Yükle", command=self.secilenDosya,width=20,height=3,fg="red",bg="yellow")
  self.YukleBtn.grid(row=4,column=2)

  self.DonusumBtn=Button(text="  Dönüşümü Başlat  ", command=self.DonusumuBaslat,width=20,height=3,fg="red",bg="yellow")
  self.DonusumBtn.grid(row=5,column=2)

  self.ModelBtn=Button(text="  Modeli Eğit  ", command=self.modeliEgit,width=20,height=3,fg="red",bg="yellow")
  self.ModelBtn.grid(row=6,column=2)

  self.SonucBtn=Button(text="  Sonucu Göster  ", command=self.csvView,width=20,height=3,fg="red",bg="yellow")
  self.ExportCsvBtn=Button(text="  Export CSV  ", command=self.exportCSV,width=20,height=3,fg="red",bg="yellow")


  #self.listele=Button(text="Listele",command=self.listele,width=30,height=3,fg="red",bg="yellow")
  #self.listele.grid(row=7,column=0)



  
  mainloop()	
     

 def restart_program(self):
	 #os.execv(sys.executable, ['python'] + sys.argv)
     import _winapi
     x = _winapi.GetCurrentProcess()
     _winapi.ExitProcess(x)
	 
     #self.egitimTxt.delete(0, END)
     #self.dataSayisiTxt.delete(0, END)
     #self.comboulke.config(state=DISABLED)
	 #self.combo.config(state=DISABLED)
	 #self.YukleBtn.grid(row=4,column=2)
	 #self.DonusumBtn.grid(row=5,column=2)
	 #self.ModelBtn.grid(row=6,column=2)
	 #self.SonucBtn.grid_remove()
	 #self.ExportCsvBtn.grid_remove()
	

 def returnUlkeInt(self):
    self.comboUlkeDeger =self.comboulke.current()
    if self.comboUlkeDeger==0:
       return 209 
    elif self.comboUlkeDeger==1:
       return 217
    elif self.comboUlkeDeger==2:
       return 94
    elif self.comboUlkeDeger==3:
       return 153
    elif self.comboUlkeDeger==4:
       return 95
    elif self.comboUlkeDeger==5:
       return 4
    elif self.comboUlkeDeger==6:
       return 200
    else:
       return -1
    
    #209 Turkey
    #217 ABD
    #94 İran
    #153 Pakistan
    #95 Irak
    #4 Afganistan
    #200 Suriye


 def exportCSV(self):
    path = 'sonuclar'
    output_file = os.path.join(path,'Combined Book.csv')
    export_file_path = filedialog.asksaveasfilename(defaultextension='.csv')
    self.predictions.toPandas().to_csv(export_file_path, sep=",", float_format='%.2f',index=False, line_terminator='\n',encoding='utf-8')
 
 def skorEkle(self):

         self.algoritma=self.combo.get()
         self.trainDataCount=self.trainingData.count()
         self.testDataCount=self.testData.count()
         self.dogrulukOrani=self.accuracy
         self.hataOrani=self.testError
         self.hesaplamaSuresi=self.tt
         self.egitilmeZamani=self.tt2
         self.f1Score=self.f1
         self.precisionSkor=self.wp
         self.recallScore=self.wr

         self.train_dogrulukOrani=self.train_accuracy
         self.train_hataOrani=self.train_Error
         self.train_hesaplamaSuresi=self.te
         self.train_egitilmeZamani=self.te2
         self.train_f1Score=self.train_f1
         self.train_precisionSkor=self.train_wp
         self.train_recallScore=self.train_wr

         self.tarihbug = str(datetime.now().strftime("%d.%m.%y_%H_%M"))
         temp1 = open("sonuclar.txt", "a")
         temp1.write("Algoritma:" +self.algoritma +" " +"Eğitim Data Sayısı: "  +str(self.trainDataCount) +" " +"Test Data Sayısı: " +str(self.testDataCount) +" " +"Dogruluk Orani: " +str(self.dogrulukOrani)
         +" " +"Hata Orani: " +str(self.hataOrani)  +" " +"Hesaplama Süresi: " +str(self.hesaplamaSuresi) +" sn "  +" " +"Egitilme zamani: " +str(self.egitilmeZamani) +" sn "   +" " +"F1 Skoru: " +str(self.f1Score)
         +" " +"Precision Skor: " +str(self.precisionSkor) +" " +"Recall Score: " +str(self.recallScore))
         temp1.write("\n")              
         messagebox.showinfo("Bilgi","%s algoritmasi listeye eklendi"%self.algoritma)
         path = "sonuclar"
         self.pathSave = path +'/' +self.algoritma+'_'+self.tarihbug +'.csv'
         
         with open(self.pathSave, mode='w') as csv_file:
             fieldnames = ['Algoritma', 'Data Sayısı', 'Dogruluk Orani', 'Hata Orani', 'Hesaplama Süresi', 'Egitilme zamani', 'F1 Skoru','Precision Skor', 'Recall Score']
             writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
             writer.writeheader()

             writer.writerow({'Algoritma': ''+self.algoritma+' (Egitim) ', 'Data Sayısı': ''+str(self.trainDataCount), 'Dogruluk Orani': ''+str(self.train_dogrulukOrani),
                              'Hata Orani': ''+str(self.train_hataOrani), 'Hesaplama Süresi': ''+str(self.train_hesaplamaSuresi), 'Egitilme zamani': ''+str(self.train_egitilmeZamani), 'F1 Skoru': ''+str(self.train_f1Score),
                              'Precision Skor': ''+str(self.train_precisionSkor), 'Recall Score': ''+str(self.train_recallScore)})

             
             writer.writerow({'Algoritma': ''+self.algoritma+'(Test) ', 'Data Sayısı': ''+str(self.testDataCount), 'Dogruluk Orani': ''+str(self.dogrulukOrani),
                              'Hata Orani': ''+str(self.hataOrani), 'Hesaplama Süresi': ''+str(self.hesaplamaSuresi), 'Egitilme zamani': ''+str(self.egitilmeZamani), 'F1 Skoru': ''+str(self.f1Score),
                              'Precision Skor': ''+str(self.precisionSkor), 'Recall Score': ''+str(self.recallScore)})
             #writer.write("\n")


             messagebox.showinfo("Bilgi","%s algoritmasi CSV olarak eklendi"%self.algoritma)    




 def secVeGoster(self,event=None):
      self.filename = filedialog.askopenfilename(initialdir = "/",title = "Select file",filetypes = (("csv files","*.csv"),("all files","*.*")))
      print (self.filename)
      self.pathSave = self.filename
      self.csvView()

 def csvView(self):

   import tkinter
   import csv
   root = Tk()
   root.title("Sonuç Görüntüleme")
   path = "sonuclar"
   # open file
   with open(self.pathSave, mode='r') as file:
      reader = csv.reader(file)

      # r and c tell us where to grid the labels
      r = 0
      for col in reader:
         c = 0
         for row in col:
            # i've added some styling
            label = Label(root, width = 20, height = 3, \
                                  text = row, relief = tkinter.RIDGE)
            label.grid(row = r, column = c)
            c += 1
         r += 1

   root.mainloop()


 def listele(self):
        ListelePencere=Tk()
        ListelePencere.geometry("600x400")
        ListelePencere.title("Kişi Listeleme")
        ListelePencere.configure(background="red")
    
        liste=Text(ListelePencere,width="200",height="400",fg="white",bg="red",font="helvetica 12")
        liste.grid(row=0,column=0)
    
     
        satir_sayisi=0
        temp1 = open("sonuclar.txt", "r")
        readfile = temp1.read()
        liste.insert(END,readfile)

        
 def Hakkinda(self):
        HakkindaPencere=Tk()
        HakkindaPencere.geometry("700x50")
        HakkindaPencere.title("Barış KARABAY Fırat Üniversitesi Yazılım Mühendisliği Tez Projesi V2")
        HakkindaPencere.configure(background="red")

        self.baris=Label(HakkindaPencere,text="Bu Program Barış Karabay Tarafından Yapılmıştır \n Hiçbir Şekilde Paylaşılamaz ve Değiştirilemez. ",fg="black",bg="white")
        self.baris.grid(row=0,column=0)

 def secim(self,event=None):
      self.filename = filedialog.askopenfilename(initialdir = "/",title = "Select file",filetypes = (("csv files","*.csv"),("all files","*.*")))
      #ment = self.filename
      print (self.filename)
      #self.egitimTxt.set(self.filename)
      #self.['text']=self.filename
      self.nameText.set(self.filename)
      
 def secilenRadio1(self):
      print(self.selected1.get())
      if self.selected1.get()==1:
         #showinfo("Uyarı","birinci")
         self.combo.grid_remove()
      else:
         #self.combo.grid()
         self.combo.grid(column=3, row=2)

 def secilenRadio2(self):
      print(self.selected2.get())
      if self.selected2.get()==3:
         #showinfo("Uyarı","birinci")
         self.comboulke.grid_remove()
      else:
         #self.comboulke.grid()
         self.comboulke.grid(column=3, row=3)
                 
 def get_dummy(self):
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.sql.functions import col
    df = self.spark_df
    categoricalCols = self.catcols
    continuousCols = self.num_cols
    labelCol = self.labelCol
    indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
                 for c in categoricalCols ]
    # default setting: dropLast=True
    encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(),
                 outputCol="{0}_encoded".format(indexer.getOutputCol()))
                 for indexer in indexers ]
    assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                                + continuousCols, outputCol="features")
    pipeline = Pipeline(stages=indexers + encoders + [assembler])
    model=pipeline.fit(df)
    data = model.transform(df)
    data = data.withColumn('label',col(labelCol))
    data.show(5,False)
    return data.select('features','label')


 def secilenDosya(self):
      print(self.egitimTxt.get())
      self.dosya=self.egitimTxt.get()
      self.dataSayisicntr = self.dataSayisiTxt.get()
      if self.dosya==" " or self.dosya=='' or self.dataSayisicntr=='':
         messagebox.showinfo("Uyarı","Boş Olamaz")
      else:
         print(self.comboulke.current(), self.comboulke.get())
         #self.progress.start()
         messagebox.showinfo("Uyarı","Yükleme Başlatıldı")
         #self.progress.config(mode='indeterminate')
         self.dosya = str(self.dosya)
         self.dataSayisi = int(self.dataSayisiTxt.get())
         print(self.dosya)
         mySchema = StructType([ StructField("country", IntegerType(), True)\
                       ,StructField("region", IntegerType(), True)\
                       ,StructField("attacktype1", IntegerType(), True)\
                       ,StructField("targtype1_txt", StringType(), True)\
                       ,StructField("gname", StringType(), True)\
                       ,StructField("weaptype1", IntegerType(), True)])
         
         
         #egitim=pd.read_csv("D:/globalterrorismdb2.csv", usecols=[7, 9, 26, 27, 28, 35, 36, 40, 58, 68, 81], encoding='ISO-8859-1',low_memory=False)
         self.egitim=pd.read_csv(self.dosya, usecols=[7, 9, 28, 35, 58, 81], encoding='ISO-8859-1',low_memory=False,nrows=self.dataSayisi)
         #209 Turkey
         #217 ABD
         #94 İran
         #153 Pakistan
         #95 Irak
         #4 Afganistan
         #200 Suriye
         if self.comboulke.get() != '' or self.comboulke.get() != "":
            self.egitim = self.egitim[(self.egitim.country == self.returnUlkeInt())]
            messagebox.showinfo("Bilgi","%s ülkesi için eğitim ve test veri seti oluşturulacak"%self.comboulke.get()) 
         
         print("Girilen  Sayi dogru")
         print("Toplam  Sayisi")
         print (self.egitim.count())
         self.sqlContext = SQLContext(self.sc)
         self.spark_df = self.sqlContext.createDataFrame(self.egitim, schema=mySchema)
         self.YukleBtn.grid_remove()
         #self.progress.stop()
         messagebox.showinfo("Başarılıı","Yükleme Tamamlandı")

 def DonusumuBaslat(self):
         sp_df = self.spark_df
         messagebox.showinfo("Uyarı","Dönüşüm Başladı")
         self.data_f = self.get_dummy()
         self.data_f.show(25,False)
         self.labelIndexer = StringIndexer(inputCol='label',outputCol='indexedLabel').fit(self.data_f)
         self.labelIndexer.transform(self.data_f).show(25,False)
         self.featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures",maxCategories=4).fit(self.data_f)
         self.featureIndexer.transform(self.data_f).show(25,False)
         self.labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=self.labelIndexer.labels)
         if self.testTxt.get()=='':
             messagebox.showinfo("Hata","Lütfen Test oranını girin")
         else:
            deger = self.testTxt.get()
            testPoint=float(deger)/100
            (self.trainingData, self.testData) = self.data_f.randomSplit([1.0-testPoint, testPoint], seed = 100)
            messagebox.showinfo("Başarılı","Oran Hesaplandı")
            self.DonusumBtn.grid_remove()

 def createFolder(self):
    try:
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)
    except OSError:
        print ('Error: Creating directory. ' +  self.directory)

 def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
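
 # A hypothetical helper sketch (assumes self.predictions has been set by one of
 # the classifier methods below, and that numpy/itertools/matplotlib are imported
 # at module level as plot_confusion_matrix above requires): build a confusion
 # matrix from the prediction DataFrame with pyspark.mllib's MulticlassMetrics and
 # hand it to plot_confusion_matrix. plot_confusion_matrix is declared without
 # `self`, so it is looked up on the class rather than bound to the instance.
 def showConfusionMatrix(self):
    from pyspark.mllib.evaluation import MulticlassMetrics
    import matplotlib.pyplot as plt

    preds_and_labels = (self.predictions
                        .select("prediction", "indexedLabel")
                        .rdd.map(lambda row: (float(row[0]), float(row[1]))))
    metrics = MulticlassMetrics(preds_and_labels)
    cm = metrics.confusionMatrix().toArray()
    adresDefteri.plot_confusion_matrix(cm, classes=self.labelIndexer.labels)
    plt.show()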

 def modeliEgit(self):
         print(self.combo.current(), self.combo.get())
         messagebox.showinfo("Bilgi","%s algoritması için model oluşturulacak"%self.combo.get()) 
         if self.combo.current()==0:
            self.LogisticRegressionClassifier()
         elif self.combo.current()==1: 
            self.NaiveBayesClassifier()
         elif self.combo.current()==2:
            self.RandomForestClassifier()
         elif self.combo.current()==3:
            self.DecisionTreeClassifier()
         elif self.combo.current()==4:
            self.SVMclassifier()
         elif self.combo.current()==5:
            self.KNNclassifier()
            
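 # The two helpers below target the RDD-based pyspark.mllib evaluation API; both
 # are written without `self`, so they act as plain functions attached to the
 # class rather than as instance methods.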
 def printMetrics(predictions_and_labels):
   metrics = MulticlassMetrics(predictions_and_labels)
   print('Precision of True ', metrics.precision(1))
   print('Precision of False', metrics.precision(0))
   print('Recall of True    ', metrics.recall(1))
   print('Recall of False   ', metrics.recall(0))
   print('F-1 Score         ', metrics.fMeasure())
   print('Confusion Matrix\n', metrics.confusionMatrix().toArray())
    
 def getPredictionsLabels(model, test_data):
   predictions = model.predict(test_data.map(lambda r: r.features))
   return predictions.zip(test_data.map(lambda r: r.label))
         
 def LogisticRegressionClassifier(self):
   self.t0 = time()
   print("********************************************************************************************************************************************")
   print("Logistic Regression")
   logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel',maxIter=20, regParam=0.3, elasticNetParam=0)
   pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, logr, self.labelConverter])
   model = pipeline.fit(self.trainingData)
   self.tm = time() - self.t0
   print ("Modeli egitme zamani {} saniye ".format(self.tm))
   self.t0 = time()
   self.predictions = model.transform(self.testData)
   self.tt = time() - self.t0
   print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt))

   self.t0 = time()
   predictions_train = model.transform(self.trainingData)
   self.te = time() - self.t0
   print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te))
   
   self.predictions.select("features", "label", "predictedLabel", "probability").show(5)
   evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
   
   self.t0 = time()
   self.accuracy = evaluator.evaluate(self.predictions)
   self.tt2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy))
   
   self.t0 = time()
   self.train_accuracy = evaluator.evaluate(predictions_train)
   self.te2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy))
   
   print("Test Dogruluk = %g" % (self.accuracy))
   self.testError = (1.0 - self.accuracy)
   print("Test Test Error = %g" % (1.0 - self.accuracy))

   print("Egitim Dogruluk = %g" % (self.train_accuracy))
   self.train_Error = (1.0 - self.train_accuracy)
   print("Egitim Error = %g" % (1.0 - self.train_accuracy))

   rfModel = model.stages[2]
   evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
   self.f1 = evaluatorf1.evaluate(self.predictions)
   self.train_f1 = evaluatorf1.evaluate(predictions_train)
   print("test f1 = %g" % self.f1)
   print("egitim f1 = %g" % self.train_f1)
 
   evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision")
   self.wp = evaluatorwp.evaluate(self.predictions)
   self.train_wp = evaluatorwp.evaluate(predictions_train)
   print("test weightedPrecision = %g" % self.wp)
   print("egitim weightedPrecision = %g" % self.train_wp)
 
   evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall")
   self.wr = evaluatorwr.evaluate(self.predictions)
   self.train_wr = evaluatorwr.evaluate(predictions_train)
   print("test weightedRecall = %g" % self.wr)
   print("egitim weightedRecall = %g" % self.train_wr)

   rfModel = model.stages[2]
   #print (rfModel._call_java('toDebugString'))
   rfModel = model.stages[2]
   #model.save("model2345678909")
   messagebox.showinfo("Başarılı","Model Eğitildi")
   self.skorEkle()
   self.ModelBtn.grid_remove()
   self.SonucBtn.grid(row=7,column=2)
   self.ExportCsvBtn.grid(row=8,column=2)
   
   #self.predictions.printSchema()
   #paramGrid = (ParamGridBuilder()
   # .addGrid(logr.regParam, [0.01, 0.1, 0.5]) \
   #  .addGrid(logr.maxIter, [10, 20, 50]) \
   #  .addGrid(logr.elasticNetParam, [0.0, 0.8]) \
   # .build())
   
   #crossval = CrossValidator(estimator=pipeline,
   #                       estimatorParamMaps=paramGrid,
   #                       evaluator=evaluator,
   #                       numFolds=3)
   #
   #model = crossval.fit(self.trainingData)
   #predictions = model.transform(self.testData)
   #accuracy = evaluator.evaluate(predictions)
   #print("Dogruluk = %g" % (accuracy))

 def DecisionTreeClassifier(self):
   self.t0 = time()
   print("********************************************************************************************************************************************")
   print("Decision Tree Classifier")
   dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",impurity="gini",maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                         cacheNodeIds=False, checkpointInterval=10)
   pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, dt, self.labelConverter])
   model = pipeline.fit(self.trainingData)
   self.tm = time() - self.t0
   print ("Modeli egitme zamani {} saniye ".format(self.tm))

   self.t0 = time()
   self.predictions = model.transform(self.testData)
   self.tt = time() - self.t0
   print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt))

   self.t0 = time()
   predictions_train = model.transform(self.trainingData)
   self.te = time() - self.t0
   print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te))
   
   self.predictions.select("features", "label", "predictedLabel", "probability").show(5)
   evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
   
   self.t0 = time()
   self.accuracy = evaluator.evaluate(self.predictions)
   self.tt2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy))
   
   self.t0 = time()
   self.train_accuracy = evaluator.evaluate(predictions_train)
   self.te2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy))
   
   print("Test Dogruluk = %g" % (self.accuracy))
   self.testError = (1.0 - self.accuracy)
   print("Test Test Error = %g" % (1.0 - self.accuracy))

   print("Egitim Dogruluk = %g" % (self.train_accuracy))
   self.train_Error = (1.0 - self.train_accuracy)
   print("Egitim Error = %g" % (1.0 - self.train_accuracy))

   rfModel = model.stages[2]
   evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
   self.f1 = evaluatorf1.evaluate(self.predictions)
   self.train_f1 = evaluatorf1.evaluate(predictions_train)
   print("test f1 = %g" % self.f1)
   print("egitim f1 = %g" % self.train_f1)
 
   evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision")
   self.wp = evaluatorwp.evaluate(self.predictions)
   self.train_wp = evaluatorwp.evaluate(predictions_train)
   print("test weightedPrecision = %g" % self.wp)
   print("egitim weightedPrecision = %g" % self.train_wp)
 
   evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall")
   self.wr = evaluatorwr.evaluate(self.predictions)
   self.train_wr = evaluatorwr.evaluate(predictions_train)
   print("test weightedRecall = %g" % self.wr)
   print("egitim weightedRecall = %g" % self.train_wr)

   rfModel = model.stages[2]
   #print (rfModel._call_java('toDebugString'))
   messagebox.showinfo("Başarılı","Model Eğitildi")
   self.skorEkle()
   self.ModelBtn.grid_remove()
   self.SonucBtn.grid(row=7,column=2)
   self.ExportCsvBtn.grid(row=8,column=2)
   
 def NaiveBayesClassifier(self):
   print("********************************************************************************************************************************************")
   self.t0 = time()
   print("Bayes")
   nb = NaiveBayes(featuresCol='indexedFeatures', labelCol='indexedLabel', smoothing=1.0, modelType="multinomial")
   pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, nb, self.labelConverter])
   model = pipeline.fit(self.trainingData)
   self.tm = time() - self.t0
   print ("Modeli egitme zamani {} saniye ".format(self.tm))

   self.t0 = time()
   self.predictions = model.transform(self.testData)
   self.tt = time() - self.t0
   print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt))

   self.t0 = time()
   predictions_train = model.transform(self.trainingData)
   self.te = time() - self.t0
   print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te))
   
   self.predictions.select("features", "label", "predictedLabel", "probability").show(5)
   evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
   
   self.t0 = time()
   self.accuracy = evaluator.evaluate(self.predictions)
   self.tt2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy))
   
   self.t0 = time()
   self.train_accuracy = evaluator.evaluate(predictions_train)
   self.te2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy))
   
   print("Test Dogruluk = %g" % (self.accuracy))
   self.testError = (1.0 - self.accuracy)
   print("Test Test Error = %g" % (1.0 - self.accuracy))

   print("Egitim Dogruluk = %g" % (self.train_accuracy))
   self.train_Error = (1.0 - self.train_accuracy)
   print("Egitim Error = %g" % (1.0 - self.train_accuracy))

   rfModel = model.stages[2]
   evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
   self.f1 = evaluatorf1.evaluate(self.predictions)
   self.train_f1 = evaluatorf1.evaluate(predictions_train)
   print("test f1 = %g" % self.f1)
   print("egitim f1 = %g" % self.train_f1)
 
   evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision")
   self.wp = evaluatorwp.evaluate(self.predictions)
   self.train_wp = evaluatorwp.evaluate(predictions_train)
   print("test weightedPrecision = %g" % self.wp)
   print("egitim weightedPrecision = %g" % self.train_wp)
 
   evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall")
   self.wr = evaluatorwr.evaluate(self.predictions)
   self.train_wr = evaluatorwr.evaluate(predictions_train)
   print("test weightedRecall = %g" % self.wr)
   print("egitim weightedRecall = %g" % self.train_wr)

   #print (rfModel._call_java('toDebugString'))
   messagebox.showinfo("Başarılı","Model Eğitildi")
   self.skorEkle()
   self.ModelBtn.grid_remove()
   self.SonucBtn.grid(row=7,column=2)
   self.ExportCsvBtn.grid(row=8,column=2)
 def RandomForestClassifier(self):
   print("********************************************************************************************************************************************")
   print("Random Forest")
   self.t0 = time()
   rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees = 100, maxDepth = 4, maxBins = 32,impurity="entropy")
   pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, rf, self.labelConverter])
   model = pipeline.fit(self.trainingData)
   self.tm = time() - self.t0
   print ("Modeli egitme zamani {} saniye ".format(self.tm))

   self.t0 = time()
   self.predictions = model.transform(self.testData)
   self.tt = time() - self.t0
   print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt))

   self.t0 = time()
   predictions_train = model.transform(self.trainingData)
   self.te = time() - self.t0
   print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te))
   
   self.predictions.select("features", "label", "predictedLabel", "probability").show(5)
   evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
   
   self.t0 = time()
   self.accuracy = evaluator.evaluate(self.predictions)
   self.tt2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy))
   
   self.t0 = time()
   self.train_accuracy = evaluator.evaluate(predictions_train)
   self.te2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy))
   
   print("Test Dogruluk = %g" % (self.accuracy))
   self.testError = (1.0 - self.accuracy)
   print("Test Test Error = %g" % (1.0 - self.accuracy))

   print("Egitim Dogruluk = %g" % (self.train_accuracy))
   self.train_Error = (1.0 - self.train_accuracy)
   print("Egitim Error = %g" % (1.0 - self.train_accuracy))

   rfModel = model.stages[2]
   evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
   self.f1 = evaluatorf1.evaluate(self.predictions)
   self.train_f1 = evaluatorf1.evaluate(predictions_train)
   print("test f1 = %g" % self.f1)
   print("egitim f1 = %g" % self.train_f1)
 
   evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision")
   self.wp = evaluatorwp.evaluate(self.predictions)
   self.train_wp = evaluatorwp.evaluate(predictions_train)
   print("test weightedPrecision = %g" % self.wp)
   print("egitim weightedPrecision = %g" % self.train_wp)
 
   evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall")
   self.wr = evaluatorwr.evaluate(self.predictions)
   self.train_wr = evaluatorwr.evaluate(predictions_train)
   print("test weightedRecall = %g" % self.wr)
   print("egitim weightedRecall = %g" % self.train_wr)

   rfModel = model.stages[2]
   #print (rfModel._call_java('toDebugString'))
   messagebox.showinfo("Başarılı","Model Eğitildi")
   self.skorEkle()
   self.ModelBtn.grid_remove()
   self.SonucBtn.grid(row=7,column=2)
   self.ExportCsvBtn.grid(row=8,column=2)
   
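   # Cross-validated OneVsRest + LinearSVC grid search over maxIter/regParam;
   # note that it runs inside this random-forest method and its bestModel is not
   # used further here.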
   svm = LinearSVC(maxIter=5, regParam=0.01)
   LSVC = LinearSVC()
   ovr = OneVsRest(classifier=LSVC)
   paramGrid = ParamGridBuilder().addGrid(LSVC.maxIter, [10, 100]).addGrid(LSVC.regParam,[0.001, 0.01, 1.0,10.0]).build()
   crossval = CrossValidator(estimator=ovr,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=MulticlassClassificationEvaluator(metricName="f1"),
                                  numFolds=2)
   Train_sparkframe = self.trainingData.select("features", "label")
   cvModel = crossval.fit(Train_sparkframe)
   bestModel = cvModel.bestModel

   

 def SVMclassifier(self):
   print("********************************************************************************************************************************************")
   self.t0 = time()
   print("SVM")
   from sklearn.model_selection import train_test_split
   from sklearn.preprocessing import MinMaxScaler
   from sklearn.svm import SVC
   from sklearn.metrics import accuracy_score

   df = self.egitim
   df['gname_id'] = df['gname'].factorize()[0]
   df['weaptype1_id'] = df['weaptype1'].factorize()[0]
   df['targtype1_txt_id'] = df['targtype1_txt'].factorize()[0]
   # 'targsubtype1' is not among the columns loaded in secilenDosya, so the
   # feature/label selection mirrors KNNclassifier below
   X = df.iloc[:, [0, 1, 2, 7, 8]].values
   y = df.iloc[:, 6].values
   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
   scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
   X_train = scaling.transform(X_train)
   X_test = scaling.transform(X_test)
   classifier = SVC(kernel='linear',cache_size=7000, random_state = 0)
   classifier.fit(X_train, y_train)
   self.tt = time() - self.t0
   print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.tt))
   self.t0 = time()
   y_pred = classifier.predict(X_test)
   accuracy = accuracy_score(y_test, y_pred)
   self.tt2 = time() -self.t0
   print(accuracy)
   print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, accuracy))

 def KNNclassifier(self):
   print("********************************************************************************************************************************************")
   from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
   from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
   from sklearn.model_selection import train_test_split
   from sklearn.ensemble import RandomForestClassifier
   from sklearn.tree import DecisionTreeClassifier
   from sklearn.svm import SVC
   from sklearn.neighbors import KNeighborsClassifier
   from sklearn.linear_model import LogisticRegression
   from sklearn.naive_bayes import GaussianNB

   
   print("KNN")
   df = self.egitim

   df['gname_id'] = df['gname'].factorize()[0]
   df['weaptype1_id'] = df['weaptype1'].factorize()[0]
   df['targtype1_txt_id'] = df['targtype1_txt'].factorize()[0]
   print("Toplam  Sayisi")
   #print (df.count())

   X = df.iloc[:, [0,1,2,7,8]].values
   y = df.iloc[:, 6].values
   #print(df.iloc[:, 6])
   #print(df.columns)
   #print(X)
   #print(y)
   #print(df['gname_id'])
   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
   scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
   X_train = scaling.transform(X_train)
   X_test = scaling.transform(X_test)
   classifier = KNeighborsClassifier(n_neighbors=9, metric='minkowski', p = 2)
   self.t0 = time()
   classifier.fit(X_train, y_train)
   self.tt = time() - self.t0
   print ("Veri kumesini egitim zamani {} saniye ".format(self.tt))
   self.t0 = time()
   y_pred = classifier.predict(X_test)
   self.tt = time() - self.t0
   print ("test verisini siniflandirma zamani {} saniye ".format(self.tt))
   self.t0 = time()
   x_pred = classifier.predict(X_train)
   self.tt = time() - self.t0
   print ("egitim verisini siniflandirma zamani {} saniye ".format(self.tt))

   accuracy = accuracy_score(y_test, y_pred)
   accuracy_egitim = accuracy_score(y_train, x_pred)
   self.tt2 = time() -self.t0

   print ('Test Accuracy:', accuracy)
   print ('Egitim Accuracy:', accuracy_egitim)
   #print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, accuracy))
   #print 'Accuracy:', accuracy_score(y_test, y_pred)
   print ('Test F1 score:', f1_score(y_test, y_pred,average='weighted'))
   print ('Test Recall:', recall_score(y_test, y_pred,
                                 average='weighted'))
   print ('Test Precision:', precision_score(y_test, y_pred,
                                       average='weighted'))

   print ('egitim F1 score:', f1_score(y_train, x_pred,average='weighted'))
   print ('egitim Recall:', recall_score(y_train, x_pred,
                                 average='weighted'))
   print ('egitim Precision:', precision_score(y_train, x_pred,
                                       average='weighted'))

   #print '\n clasification report:\n', classification_report(y_test, y_pred)
   #print '\n confussion matrix:\n',confusion_matrix(y_test, y_pred)
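
   # Optional extra diagnostics (a small sketch using the sklearn helpers
   # imported above): per-class breakdown and confusion matrix on the test split.
   print(classification_report(y_test, y_pred))
   print(confusion_matrix(y_test, y_pred))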

   print("********************************************************************************************************************************************")