Example #1
    def train_test(self, df):
        
        df = self.dropNonTCPUDP(df)

        catCols = []
        numCols = ['avg_ipt', 'bytes_in', 'bytes_out', 'entropy', 'total_entropy', 'num_pkts_out', 'num_pkts_in', 'duration']
        labelCol = 'label'

        data = self.get_dummy(df, catCols, numCols, labelCol)
        data.show()

        # Index the string label column; the fitted indexer is reused as a pipeline stage below.
        labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)

        # Identify and index categorical features; also reused as a pipeline stage.
        featureIndexer = VectorIndexer(inputCol="features",
                                        outputCol="indexedFeatures").fit(data)

        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        trainingData.cache()
     #   trainingData.repartition(200)
        testData.cache()
       # testData.repartition(200)
        trainingData.show(5,False)
        testData.show(5,False)

        rf = RandomForestClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
        gbt = GBTClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
        logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel')

        # Convert indexed labels back to original labels.
        labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
        
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])
        model = pipeline.fit(trainingData)
        predictions = model.transform(testData)
        # Select example rows to display.
        predictions.select("features", "label", "predictedLabel", "prediction").show(5)

        # Select (prediction, true label) and compute test error
 
        print(self.getTestError(predictions))
        self.printMetrics(predictions)
      #  print(self.ExtractFeatureImp(model.stages[-2].featureImportances, testData, "features"))

        return model
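
The getTestError and printMetrics helpers called above are not shown in this example. A minimal sketch of what they might look like, assuming MulticlassClassificationEvaluator from pyspark.ml.evaluation and the indexedLabel/prediction columns produced by the pipeline (the method bodies are an assumption, not the original code):

    def getTestError(self, predictions):
        # Assumed implementation: accuracy-based test error on the indexed label.
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
        return 1.0 - evaluator.evaluate(predictions)

    def printMetrics(self, predictions):
        # Assumed implementation: print a few standard multiclass metrics.
        for metric in ["f1", "weightedPrecision", "weightedRecall", "accuracy"]:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="indexedLabel", predictionCol="prediction", metricName=metric)
            print(metric, evaluator.evaluate(predictions))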
Example #2
def preprocessed_df(df, label="flg_cmd_lowcostIndex"):
    max_values_to_define_str_cols = 10
    id_col = 'ID_CLIENT'

    dty = dict(df.dtypes)
    str_cols = [k for k, v in dty.items() if v == 'string']
    str_cols.remove(id_col)

    for c in str_cols:
        stringIndexer = StringIndexer(inputCol=c, outputCol=c + "Index")
        model_str = stringIndexer.fit(df)
        df = model_str.transform(df).drop(c)

    input_cols = df.columns
    input_cols.remove(id_col)
    input_cols.remove(label)

    assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
    df = assembler.transform(df)

    featureIndexer = VectorIndexer(
        inputCol="features",
        outputCol="indexedFeatures",
        maxCategories=max_values_to_define_str_cols).fit(df)
    return featureIndexer.transform(df), df
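
A possible call site for preprocessed_df, assuming a SparkSession named spark and an input DataFrame with an ID_CLIENT column plus a string column flg_cmd_lowcost that the indexing loop turns into the default flg_cmd_lowcostIndex label (the file path is hypothetical):

indexed_df, assembled_df = preprocessed_df(
    spark.read.csv("clients.csv", header=True, inferSchema=True))
indexed_df.select("ID_CLIENT", "indexedFeatures", "flg_cmd_lowcostIndex").show(5)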
    def chiSquareTest(self,categoricalFeatures,maxCategories):
        dataset=self.dataset
        labelColm=self.labelColm
        features=self.features
        length = len(features)

        featureassembler = VectorAssembler(
            inputCols=self.features,
            outputCol="featuresChiSquare", handleInvalid="skip")
        dataset= featureassembler.transform(dataset)

        vec_indexer = VectorIndexer(inputCol="featuresChiSquare", outputCol='vecIndexedFeaturesChiSquare', maxCategories=maxCategories,
                                    handleInvalid="skip").fit(dataset)

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))

        dataset = vec_indexer.transform(dataset)

        # finalized_data = dataset.select(labelColm, 'vecIndexedFeaturesChiSquare')
        # finalized_data.show()

        # using the chi-square selector
        selector = ChiSqSelector(numTopFeatures=length, featuresCol="vecIndexedFeaturesChiSquare",
                                 outputCol="selectedFeatures",
                                 labelCol=labelColm)

        result = selector.fit(dataset).transform(dataset)

        print("chi2 output with top %d features selected " % selector.getNumTopFeatures())
        result.show()

        # running the chi-square test

        r = ChiSquareTest.test(result, "selectedFeatures", labelColm).head()
        p_values = list(r.pValues)
        PValues = []
        for val in p_values:
            PValues.append(round(val, 4))
        print(PValues)
        dof = list(r.degreesOfFreedom)
        stats = list(r.statistics)
        statistics = []
        for val in stats:
            statistics.append(round(val, 4))
        print(statistics)
        chiSquareDict = {}
        for pval, doF, stat, colm in zip(PValues, dof, statistics, categoricalFeatures):
            print(pval, doF, stat)
            chiSquareDict[colm] = pval, doF, stat
        chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
        print(chiSquareDict)

        result = {'pvalues': chiSquareDict}

        return result
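
For reference, a minimal standalone sketch of pyspark.ml.stat.ChiSquareTest on a tiny made-up DataFrame (toy data, assuming an active SparkSession named spark), showing the pValues / degreesOfFreedom / statistics fields read above:

from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest

toy = spark.createDataFrame(
    [(0.0, Vectors.dense(0.0, 1.0)),
     (1.0, Vectors.dense(1.0, 0.0)),
     (1.0, Vectors.dense(1.0, 1.0))],
    ["label", "features"])
row = ChiSquareTest.test(toy, "features", "label").head()
print(row.pValues, row.degreesOfFreedom, row.statistics)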
    def linearReg(self, dataset_add, feature_colm, label_colm, relation_list, relation,userId):
        try:
            dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
            dataset.show()
            label = ''
            for val in label_colm:
                label = val
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType' or str(
                        x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType' or str(x.dataType) == 'BinaryType'):
                    for y in feature_colm:
                        if x.name == y:
                            dataset = dataset.withColumn(y, dataset[y].cast(StringType()))
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)
            if relation == 'linear':
                print('linear relationship')
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)
            dataset.show()
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label, handleInvalid="skip").fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            featureAssembler = VectorAssembler(inputCols=indexed_features + numericalFeatures, outputCol='features', handleInvalid="skip")
            dataset = featureAssembler.transform(dataset)
            vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures', maxCategories=4, handleInvalid="skip").fit(
                dataset)
            dataset = vectorIndexer.transform(dataset)

            trainDataRatioTransformed = self.trainDataRatio
            testDataRatio = 1 - trainDataRatioTransformed
            trainingData, testData = dataset.randomSplit([trainDataRatioTransformed, testDataRatio], seed=40)

            # applying the model

            lr = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label)
            regressor = lr.fit(trainingData)

            locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'
            modelPersist = 'linearRegressorModel.parquet'
            modelStorageLocation = locationAddress + userId + modelPersist
            regressor.write().overwrite().save(modelStorageLocation)

            # print regressor.featureImportances

            # print(dataset.orderBy(feature_colm, ascending=True))

            # pred = regressor.transform(testData)

            # coefficeint & intercept


            # saving the model and test dataset as csv file


            print("coefficient : " + str(regressor.coefficients))
            coefficient_t = str(regressor.coefficients)

            print("intercept : " + str(regressor.intercept))
            intercept_t = str(regressor.intercept)

            prediction = regressor.evaluate(testData)

            # VI_IMP = 2

            prediction_val = prediction.predictions
            prediction_val.show()

            prediction_val_pand = prediction_val.select(label, "prediction").toPandas()

            prediction_val_pand = prediction_val_pand.assign(
                residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"])

            prediction_val_pand_residual = prediction_val_pand["residual_vall"]

            prediction_val_pand_label = prediction_val_pand[label]

            # print prediction_val_pand_residual
            prediction_val_pand_predict = prediction_val_pand["prediction"]
            # print prediction_val_pand_predict

            # test_summary = prediction.summary

            # for test data

            lr_prediction = regressor.transform(testData)

            lr_prediction.groupBy(label, "prediction").count().show()

            lr_prediction_quantile = lr_prediction.select(label, "prediction")
            lr_prediction_onlypred = lr_prediction.select('prediction')
            # lr_prediction_quantile.show()

            training_summary = regressor.summary

            print("numof_Iterations...%d\n" % training_summary.totalIterations)
            print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory))
            print("RMSE...%f\n" % training_summary.rootMeanSquaredError)
            RMSE = training_summary.rootMeanSquaredError
            print("MSE....%f\n" % training_summary.meanSquaredError)
            MSE = training_summary.meanSquaredError
            print("r**2(r-square)....::%f\n" % training_summary.r2)
            r_square = training_summary.r2
            print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj)
            adjsted_r_square = training_summary.r2adj
            print("deviance residuals %s" % str(training_summary.devianceResiduals))
            training_summary.residuals.show()
            # residual_graph = training_summary.residuals
            # test = (residual_graph, lr_prediction_onlypred)
            # residual_graph.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append' )
            # print(test)
            # test.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode= 'append')
            # residual_graph_pandas = residual_graph.toPandas()
            print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors))
            coefficientStdError = str(training_summary.coefficientStandardErrors)
            print(" Tvalues :\n" + str(training_summary.tValues))
            T_values = str(training_summary.tValues)
            tValuesList = training_summary.tValues
            print(" p values :\n" + str(training_summary.pValues))
            P_values = str(training_summary.pValues)

            # regression equation
            intercept_t = float(intercept_t)
            coefficientList = list(regressor.coefficients)
            equation = label, '=', intercept_t, '+'
            for feature, coeff in zip(feature_colm, coefficientList):
                coeffFeature = coeff, '*', feature, '+'
                equation += coeffFeature
            equation = equation[:-1]
            print(equation)
            st = list(equation)

            # R-style significance codes keyed by p-value

            PValuesList = training_summary.pValues
            significanceObject = {}

            for pValue in PValuesList:
                if (0 <= pValue < 0.001):
                    significanceObject[pValue] = '***'
                if (0.001 <= pValue < 0.01):
                    significanceObject[pValue] = '**'
                if (0.01 <= pValue < 0.05):
                    significanceObject[pValue] = '*'
                if (0.05 <= pValue < 0.1):
                    significanceObject[pValue] = '.'
                if (0.1 <= pValue < 1):
                    significanceObject[pValue] = '-'
            print(significanceObject)



            #######################################################################################################
            # residual  vs predicted value

            prediction_data = regressor.summary.predictions
            prediction_data.show()
            prediction_data.select(['prediction']).show()
            predicted = prediction_data.select(['prediction'])
            regressor.summary.residuals.show()
            residuals = regressor.summary.residuals
            pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id())
            res_d = residuals.withColumn('row_index', f.monotonically_increasing_id())

            pred_residuals = pred_d.join(res_d, on=['row_index']).sort('row_index').drop('row_index')
            pred_residuals.show()

            # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',
            #                              mode='overwrite')

            '''
                        
            userId = 'sahil123'
            graphName = 'QQPlot.parquet'
            locationAddress = '/home/fidel/mltest/'
            
            finalLocation = locationAddress + userId + graphName
            print(finalLocation)
            pred_residuals.write.parquet(finalLocation,mode='overwrite')
    
            '''


            #################################################################################3
            # scale location plot
            from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev

            df_label = prediction_data.select(label, 'prediction',
                                              sqrt(ab(prediction_data[label])).alias("sqrt_label"))

            df_label.show()
            df_sqrt_label_index = df_label.withColumn('row_index', f.monotonically_increasing_id())
            df_sqrt_label_index.show()

            res_d.show()
            sqrt_label_residual_join = df_sqrt_label_index.join(res_d, on=['row_index']).sort('row_index').drop(
                'row_index')

            sqrt_label_residual_join.show()

            std_resid = sqrt_label_residual_join.select('sqrt_label', 'prediction', (
                    sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias(
                'std_res'))

            std_resid.show()

            sqrt_std_res = std_resid.select("std_res", 'prediction',
                                            sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid"))

            sqrt_std_res.show()
            sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid')

            # sqrt_std_res_fitted.write.parquet(
            #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet',
            #     mode='overwrite')

            ######################################################################################
            # QUANTILE

            from scipy.stats import norm
            import statistics
            import math


            res_d.show()
            sorted_res = res_d.sort('residuals')
            sorted_res.show()
            # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'),
            #                                meann(col('residuals')).alias('mean'))
            # stdev_ress.show()
            # mean_residual = stdev_ress.select(['mean']).toPandas()
            # l = mean_residual.values.tolist()
            # print(l)
            # stddev_residual = stdev_ress.select(['std_dev']).toPandas()
            # number of sorted residuals
            lengthResiduals = sorted_res.count()
            print(lengthResiduals)
            quantileList = []
            for x in range(1, lengthResiduals + 1):
                quantileList.append((x - 0.5) / lengthResiduals)

            print(quantileList)

            # Z-score of each theoretical quantile

            zTheoreticalTrain = []
            for x in quantileList:
                zTheoreticalTrain.append(norm.ppf(x))
            print(zTheoreticalTrain)

            sortedResidualPDF = sorted_res.select('residuals').toPandas()
            sortedResidualPDF = sortedResidualPDF['residuals']
            stdevResidualTrain = statistics.stdev(sortedResidualPDF)
            meanResidualTrain = statistics.mean(sortedResidualPDF)

            zPracticalTrain = []
            for x in sortedResidualPDF:
                zPracticalTrain.append((x - meanResidualTrain) / stdevResidualTrain)

            # schema = StructType([StructField('zTheoriticalTrain', FloatType(), True),
            #                      StructField('zPracticalTrain', FloatType(), True)
            #                      ])
            # spark.createDataFrame(zPracticalTrain, FloatType()).show()

            ####################################################################################
            # appending predicted value to the dataset
            target = dataset.select(label)
            pred = prediction_data.select(['prediction'])
            pred_d = pred.withColumn('row_index', f.monotonically_increasing_id())
            target_d = target.withColumn('row_index', f.monotonically_increasing_id())

            pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index')
            pred_target.show()

            dataset.show()

            pred_target_data_update = dataset.join(pred_target, on=[label])

            pred_target_data_update.show(100)
            '''
            prediction = regressor.evaluate(dataset)
            predictionTestData= prediction.predictions
            predictionTestData.show()
            #appending the predicted column into the dataset which is test dataset
            predictionLabelList = [label,'prediction']
            updatedFeatureColmList = feature_colm
            for val in predictionLabelList:
                updatedFeatureColmList.append(val)
            print(updatedFeatureColmList)
            predictionTestDatasetcolumn = predictionTestData.select(updatedFeatureColmList)
            predictionTestDatasetcolumn.show()

            '''

            ##########################################################################################

            # scale location plot

            # for scale location plotequationAsList
            # from pyspark.sql.functions import udf
            #
            # def std_res(x):
            #     res_list = []
            #     res_list.append(x)
            #
            # std_residuals = udf(lambda y: std_res(y), FloatType())
            #
            # residuals_std = residuals.withColumn('residuals', std_residuals(col('residuals').cast(FloatType())))
            #
            # import statistics
            # import numpy as np
            # residuals_panda = residuals.toPandas()
            # # residuals_panda.residuals = range(residuals_panda.shape[1])
            # residuals_panda = residuals_panda.values
            # print(residuals_panda)
            # stdev_training = statistics.stdev(residuals_panda)
            # print(stdev_training)

            ############################################################################################################

            # creating the dictionary for storing the result

            # json_response = coefficient_t

            # print(json_response)

            # json_response = {"adjusted r**2 value" : training_summary.r2adj}

            # DATA VISUALIZATION PART

            # finding the quantile in the dataset(Q_Q plot)
            import matplotlib.pyplot as plt

            # y = 0.1
            # x = []
            #
            # for i in range(0, 90):
            #     x.append(y)
            #     y = round(y + 0.01, 2)
            #
            # for z in x:
            #     print ("~~~~~   ",z)
            #

            # quantile_label = lr_prediction_quantile.approxQuantile(label, x, 0.01)
            # print quantile_label
            # quantile_prediction = lr_prediction_quantile.approxQuantile("prediction", x, 0.01)
            # print quantile_prediction
            #
            # Q_label_pred=''
            # print(len(quantile_label))
            # length = len(quantile_label)
            #
            # for i in range(0,len(quantile_label)):
            #     Q_label_pred += str(quantile_label[i]) + '|'  +  str(quantile_prediction[i]) + '\n'

            # writing it to the hdfs in parquet file
            #
            # quantile_label_tospark = spark.createDataFrame(quantile_label, FloatType())
            # quantile_label_tospark = quantile_label_tospark.withColumnRenamed("value", "Q_label")
            #
            # quantile_prediction_tospark = spark.createDataFrame(quantile_prediction, FloatType())
            # quantile_prediction_tospark = quantile_prediction_tospark.withColumnRenamed("value", "Q_prediction")
            #
            # quant_label = quantile_label_tospark.withColumn('row_index', f.monotonically_increasing_id())
            # quant_predtiction = quantile_prediction_tospark.withColumn('row_index', f.monotonically_increasing_id())
            #
            # final_quantile = quant_label.join(quant_predtiction,on=['row_index']).sort('row_index').drop('row_index')
            #
            # final_quantile.show()
            #
            # final_quantile.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',mode='overwrite')
            #
            #

            # print(str(Q_label_pred[i]))

            # with open('Q_Q_plot.csv', 'w') as Q_Q:
            #     writer_Q_Q = csv.writer(Q_Q)
            #     writer_Q_Q.writerows((quantile_label, quantile_prediction))
            #
            # plt.scatter(quantile_label, quantile_prediction)
            # plt.show()

            ## finding the residual vs fitted graph data

            #
            #
            # prediction_val_pand_predict_tospark = spark.createDataFrame(prediction_val_pand_predict, FloatType())
            # prediction_val_pand_predict_tospark = prediction_val_pand_predict_tospark.withColumnRenamed("value", "prediction")
            #
            # prediction_val_pand_residual_tospark = spark.createDataFrame(prediction_val_pand_residual, FloatType())
            # prediction_val_pand_residual_tospark = prediction_val_pand_residual_tospark.withColumnRenamed("value", "residual")
            #
            # pred_spark = prediction_val_pand_predict_tospark.withColumn('row_index', f.monotonically_increasing_id())
            # res_spark = prediction_val_pand_residual_tospark.withColumn('row_index', f.monotonically_increasing_id())
            #
            # final_res_fitted = pred_spark.join(res_spark, on=['row_index'])\
            #     .sort('row_index').drop('row_index')
            #
            # final_res_fitted.show()
            #
            # final_res_fitted.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/RESIDUAL_FITTED_PLOT.parquet',
            #                              mode='overwrite')
            #

            # plt.scatter(prediction_val_pand_predict, prediction_val_pand_residual)
            # plt.axhline(y=0.0, color="red")
            # plt.xlabel("prediction")
            # plt.ylabel("residual")
            # plt.title("residual vs fitted ")
            # plt.show()

            # creating the csv file and writing into it
            import csv
            import math
            fitted_residual = ''
            print(len(prediction_val_pand_residual))
            length = len(prediction_val_pand_residual)

            for i in range(0, len(prediction_val_pand_residual)):
                fitted_residual += str(prediction_val_pand_predict[i]) + '|' + str(
                    prediction_val_pand_residual[i]) + '\n'

            with open('residual_vs_fitted.csv', 'w') as r_f:
                writer_r_f = csv.writer(r_f)
                writer_r_f.writerows((prediction_val_pand_predict, prediction_val_pand_residual))

            # parquet file writing

            ## residual vs leverage graph data

            prediction_val_pand_residual
            # extreme value in the predictor colm
            prediction_col_extremeval = lr_prediction_quantile.agg({"prediction": "max"})
            # prediction_col_extremeval.show()

            # plt.plot(prediction_col_extremeval, prediction_val_pand_residual)
            # plt.show()

            ## scale location graph data

            prediction_val_pand_residual
            prediction_val_pand_predict
            prediction_val_pand_residual_abs = prediction_val_pand_residual.abs()
            import math
            sqrt_residual = []
            for x in prediction_val_pand_residual_abs:
                sqrt_residual.append(math.sqrt(x))
                # print ("____________________  ",x)

            sqrt_residual

            # plt.scatter(sqrt_residual, prediction_val_pand_predict)
            ####################################################################################3

            # calculating std deviation
            import statistics

            print(statistics.stdev(prediction_val_pand_residual))
            stdev_pred = statistics.stdev(prediction_val_pand_residual)
            # mean = statistics.mean(prediction_val_pand_residual)

            # calculate standardized residuals
            std_res = []
            for x in prediction_val_pand_residual:
                std_res.append(x / stdev_pred)
            print(std_res)

            # calculating the square root of std_res
            import math
            sqr_std_res = []
            for x in std_res:
                sqr_std_res.append(math.sqrt(abs(x)))
            print(sqr_std_res)
            #######################################################################################3
            # QUANTILE

            ## sort the list
            sorted_std_res = sorted(std_res)
            print(sorted_std_res)
            #
            mean = statistics.mean(sorted_std_res)
            stdev = statistics.stdev(sorted_std_res)
            print(mean)
            quantile = []
            n = len(sorted_std_res)
            print(n)
            for x in range(1, n + 1):
                quantile.append((x - 0.5) / n)

            print(quantile)
            #
            # theoretical z-scores
            from scipy.stats import norm

            z_theory = []
            for x in quantile:
                z_theory.append(norm.ppf(x))
            print(z_theory)
            # z score for real val
            z_pract = []
            for x in sorted_std_res:
                z_pract.append((x - mean) / stdev)

            #

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)

            quantile_std_res = spark.createDataFrame(std_res, FloatType())
            quantile_std_res.show()
            quantile_std_res_t = quantile_std_res.approxQuantile('value', x, 0.01)
            print(quantile_std_res_t)
            print(x)

            Q_label_pred = ''
            # print(len(quantile_label))
            # length = len(quantile_label)
            for quant, val in zip(z_theory, z_pract):
                Q_label_pred += str(val) + '\t' + str(quant) + '\n'

            plt.scatter(z_theory, z_pract)
            plt.savefig('q_q')

            ####################################################

            # creating the std residuals

            # square root of label
            sqrt_label = []
            for x in prediction_val_pand_label:
                sqrt_label.append(math.sqrt(abs(x)))

            sqrt_label
            prediction_val_pand_residual
            std_residual = []
            for sqr, resid in zip(sqrt_label, prediction_val_pand_residual):
                std_residual.append(resid / sqr)
                # print(std_sqrt_residual)

            # creating the std sqr root

            sqrt_std_residuals = []
            for x in std_residual:
                # print(math.sqrt(abs(x)))
                sqrt_std_residuals.append(math.sqrt(abs(x)))
            print(sqrt_std_residuals)

            # print(std_sqrt_residual)

            scale_predict_residual = ''
            for pre, res in zip(prediction_val_pand_predict, sqrt_std_residuals):
                scale_predict_residual += str(pre) + '\t' + str(res) + '\n'
            print(scale_predict_residual)

            ##########################################################################
            # import math
            # sqrt_stdres = []
            # for x in std_sqrt_residual:
            #     sqrt_stdres.append(math.sqrt(x))
            #
            # scale_predict_residual = ''
            # for pre, res in zip(prediction_val_pand_predict, sqrt_stdres):
            #     scale_predict_residual += str(pre) + 't' + str(res) + 'n'
            # print(scale_predict_residual)

            ###################################3

            # plt.show()

            # scale_predict_residual=''
            #
            # print(len(sqrt_residual))
            # length = len(sqrt_residual)
            #
            # for i in range(0, len(std_sqrt_residual)):
            #     scale_predict_residual += str(prediction_val_pand_predict[i]) + '|' + str(std_sqrt_residual[i]) + '\n'

            # with open('scale_location_plot.csv', 'w') as s_l:
            #     writer_s_l = csv.writer(s_l)
            #     writer_s_l.writerows((prediction_val_pand_predict, sqrt_residual))



            # writing to the parquet

            # prediction_val_pand_predict_tospark = spark.createDataFrame(prediction_val_pand_predict, FloatType())
            # prediction_val_pand_predict_tospark = prediction_val_pand_predict_tospark.withColumnRenamed("value",
            #                                                                                             "prediction")
            #
            # sqrt_residual_tospark= spark.createDataFrame(sqrt_residual, FloatType())
            # sqrt_residual_tospark = sqrt_residual_tospark.withColumnRenamed("value",
            #                                                                                               "sqrt_residual")
            #
            # pred_spark = prediction_val_pand_predict_tospark.withColumn('row_index', f.monotonically_increasing_id())
            # res_spark = sqrt_residual_tospark.withColumn('row_index', f.monotonically_increasing_id())
            #
            # final_scale_fitted = pred_spark.join(res_spark,on=['row_index']) \
            #     .sort('row_index').drop('row_index')
            #
            # final_scale_fitted.show()
            #
            # final_scale_fitted.write.parquet(
            #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/SCALE_LOCATION_PLOT.parquet',
            #     mode='overwrite')
            #

            # dumping the dictionary into json object

            # json_response = {'run_status': 'success', 'PredictiveResponse': resultdf}

            tableContent = \
                {
                    'coefficientValuesKey': coefficientList,
                    'tValuesKey': tValuesList,
                    'pValuesKey': PValuesList,
                    'significanceValuesKey': significanceObject,
                    'interceptValuesKey': intercept_t,
                    "RMSE": RMSE,
                    "RSquare": r_square,
                    "AdjRSquare": adjsted_r_square,
                    "CoefficientStdError": coefficientStdError,

                }
            print(tableContent)

            json_response = {

                "Intercept": intercept_t,
                "Coefficients": coefficient_t,
                "RMSE": RMSE,
                "MSE": MSE,
                "R_square": r_square,
                "Adj_R_square": adjsted_r_square,
                "Coefficient_error": coefficientStdError,
                "T_value": T_values,
                "P_value": P_values,
                'Q_Q_plot': Q_label_pred,
                'residual_fitted': fitted_residual,
                'scale_location': scale_predict_residual

            }

            return json_response


        except Exception as e:
            print('exception is =' + str(e))
Example #5
    def dataTranform(self):
        dataset = self.dataset
        schemaData = dataset.schema
        categoricalFeatures = []
        numericalFeatures = []
        for schemaVal in schemaData:
            if (str(schemaVal.dataType) == "StringType"
                    or str(schemaVal.dataType) == "TimestampType"
                    or str(schemaVal.dataType) == "DateType"
                    or str(schemaVal.dataType) == "BooleanType"
                    or str(schemaVal.dataType) == "BinaryType"):
                for y in self.featuresColm:
                    if schemaVal.name == y:
                        dataset = dataset.withColumn(
                            y, dataset[y].cast(StringType()))
                        categoricalFeatures.append(schemaVal.name)
            else:
                for y in self.featuresColm:
                    if schemaVal.name == y:
                        numericalFeatures.append(schemaVal.name)

        # derive the label column name before it is referenced below
        label = ''
        for labelVal in self.labelColm:
            label = labelVal
        for schemaVal in schemaData:
            if (str(schemaVal.dataType) == "StringType"
                    and schemaVal.name == label):
                for labelkey in self.labelColm:
                    label_indexer = StringIndexer(
                        inputCol=label,
                        outputCol='indexed_' + label,
                        handleInvalid="skip").fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label
        indexedFeatures = []
        for colm in categoricalFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm,
                                    handleInvalid="skip").fit(dataset)
            indexedFeatures.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        combinedFeatures = numericalFeatures + indexedFeatures
        categoryColmListDict = {}
        countOfCategoricalColmList = []
        for value in categoricalFeatures:
            # categoryColm = value
            # listValue = value
            listValue = []
            categoryColm = dataset.groupby(value).count()
            countOfCategoricalColmList.append(categoryColm.count())
            categoryColmJson = categoryColm.toJSON()
            for row in categoryColmJson.collect():
                categoryColmSummary = json.loads(row)
                listValue.append(categoryColmSummary)
            categoryColmListDict[value] = listValue
        self.numericalFeatures = numericalFeatures
        self.categoricalFeatures = categoricalFeatures
        if not categoricalFeatures:
            maxCategories = 5
        else:
            maxCategories = max(countOfCategoricalColmList)

        featureassembler = VectorAssembler(inputCols=combinedFeatures,
                                           outputCol="features",
                                           handleInvalid="skip")
        dataset = featureassembler.transform(dataset)
        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=maxCategories,
                                    handleInvalid="skip").fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Choose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))
        dataset = vec_indexer.transform(dataset)
        return dataset, categoricalFeatures, numericalFeatures
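
Here maxCategories is set to the largest observed category count so that every indexed string column is treated as categorical by VectorIndexer. A small illustration of that behavior on made-up data (assuming an active SparkSession named spark):

from pyspark.ml.feature import VectorAssembler, VectorIndexer

toy = spark.createDataFrame(
    [(0.0, 1.0), (1.0, 3.0), (0.0, 7.0), (1.0, 9.0)],
    ["cat_like", "numeric_like"])
assembled = VectorAssembler(inputCols=["cat_like", "numeric_like"],
                            outputCol="features").transform(toy)
# With maxCategories=2, only the first vector slot (2 distinct values) is indexed as categorical.
indexer = VectorIndexer(inputCol="features", outputCol="indexed",
                        maxCategories=2).fit(assembled)
print(indexer.categoryMaps)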
    def callCenter(self):
        dataset = spark.read.csv(
            "/home/fidel/Downloads/CallCenterFinalTillAprilData",
            sep=',',
            header=True,
            inferSchema=True)
        dataset.show()
        feature_colm = ["col_2_SKILLNAME_2", "col_2_SKILLNAME_3"]
        label_colm = ["CALLDATE"]
        label = ""
        for val in label_colm:
            label = val
        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if (str(x.dataType) == "StringType"
                    or str(x.dataType) == 'TimestampType'
                    or str(x.dataType) == 'DateType'
                    or str(x.dataType) == 'BooleanType'
                    or str(x.dataType) == 'BinaryType'):
                for y in feature_colm:
                    if x.name == y:
                        dataset = dataset.withColumn(
                            y, dataset[y].cast(StringType()))
                        stringFeatures.append(x.name)

        categoryColmList = []
        categoryColmListFinal = []
        categoryColmListDict = {}
        countOfCategoricalColmList = []
        for value in stringFeatures:
            listValue = []
            categoryColm = dataset.groupby(value).count()
            print(categoryColm)
            countOfCategoricalColmList.append(categoryColm.count())
            categoryColmJson = categoryColm.toJSON()
            for row in categoryColmJson.collect():
                categoryColmSummary = json.loads(row)
                listValue.append(categoryColmSummary)
            categoryColmListDict[value] = listValue

        if not stringFeatures:
            maxCategories = 5
        else:
            maxCategories = max(countOfCategoricalColmList)
        maxCategories = 13  # hard-coded override of the value computed above

        for x in Schema:
            if (str(x.dataType) == "StringType" and x.name == label):
                for labelkey in label_colm:
                    label_indexer = StringIndexer(inputCol=label,
                                                  outputCol='indexed_' +
                                                  label).fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label
        dataset.show()
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        final_features = numericalFeatures + indexed_features
        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")
        dataset = featureassembler.transform(dataset)
        vectorIndexer = VectorIndexer(inputCol='features',
                                      outputCol='vectorIndexedFeatures',
                                      maxCategories=maxCategories).fit(dataset)
        dataset = vectorIndexer.transform(dataset)
        import csv
        dataset = dataset.select("CALLDATE", "col_2_SKILLNAME_2",
                                 "col_2_SKILLNAME_3", "indexed_CALLDATE",
                                 "indexed_col_2_SKILLNAME_2",
                                 "indexed_col_2_SKILLNAME_3")
        # dataset.to_csv("/home/fidel/Downloads/Callcenterdata/callFinalFormated.csv")
        # dataset.write.csv("/home/fidel/Downloads/Callcenterdata/callFinalF.csv")
        # dataset.write.csv("/home/fidel/Downloads/Callcenterdata/callFinal.csv")
        # dataset.show()
        dataset.toPandas().to_csv(
            "/home/fidel/Downloads/Callcenterdata/callcsv.csv")
        trainDataRatioTransformed = 0.80
        testDataRatio = 1 - trainDataRatioTransformed
        trainingData, testData = dataset.randomSplit(
            [trainDataRatioTransformed, testDataRatio], seed=0)
        #applying the model
        randomForestModel = RandomForestClassifier(
            labelCol=label,
            featuresCol='vectorIndexedFeatures',
            numTrees=10,
            maxBins=maxCategories)
        randomForestModelFit = randomForestModel.fit(trainingData)
        predictions = randomForestModelFit.transform(testData)

        # Select example rows to display (the label column here is the indexed CALLDATE).
        predictions.select("prediction", label, "features").show(5)
        evaluator = MulticlassClassificationEvaluator(
            labelCol=label,
            predictionCol="prediction",
            metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g" % (1.0 - accuracy))
    def linearRegPersist(self, dataset_add, feature_colm, label_colm,
                         relation_list, relation, userId):
        try:
            dataset = spark.read.csv(dataset_add,
                                     header=True,
                                     inferSchema=True)
            dataset.show()
            label = ''
            for val in label_colm:
                label = val
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType"):
                    for y in feature_colm:
                        if x.name == y:
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)
            if relation == 'linear':
                print('linear relationship')
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)
            dataset.show()
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(inputCol=label,
                                                      outputCol='indexed_' +
                                                      label).fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm,
                                        outputCol='indexed_' +
                                        colm).fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            final_features = numericalFeatures + indexed_features
            featureassembler = VectorAssembler(inputCols=final_features,
                                               outputCol="features")
            dataset = featureassembler.transform(dataset)
            vectorIndexer = VectorIndexer(inputCol='features',
                                          outputCol='vectorIndexedFeatures',
                                          maxCategories=4).fit(dataset)
            dataset = vectorIndexer.transform(dataset)
            # Loading the persisted model
            locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'
            modelPersist = 'linearRegressorModel.parquet'
            persistedModelLocation = locationAddress + userId + modelPersist
            regressorTest = LinearRegressionModel.load(persistedModelLocation)
            predictedData = regressorTest.transform(dataset)

            predictedData.show()

        except Exception as e:
            print('exception is :', e)
Example #8
                                data["pickup_latitude"] >= 40.63).filter(
                                    data["dropoff_latitude"] <= 40.85).filter(
                                        data["dropoff_latitude"] >= 40.63)
    #data.printSchema()
    assembler = VectorAssembler().setInputCols([
        "vendor_id", "pickup_longitude", "pickup_latitude", "pickup_hour",
        "pickup_month", "dropoff_longitude", "dropoff_latitude",
        "trip_distance", "passenger_count"
    ]).setOutputCol("features")
    df = assembler.setHandleInvalid("skip").transform(data).select(
        "trip_duration", "features")

    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=30).fit(df)
    d = featureIndexer.transform(df)
    trainTest = d.randomSplit([0.8, 0.2])
    traindf = trainTest[0]
    testdf = trainTest[1]

    # Model
    dtr = DecisionTreeRegressor(featuresCol="indexedFeatures",
                                labelCol="trip_duration",
                                impurity="variance")

    # choices of tuning parameters
    dtrparamGrid = (ParamGridBuilder().addGrid(dtr.maxDepth, [10]).build())

    pipeline = Pipeline(stages=[featureIndexer, dtr])

    crossval = CrossValidator(estimator=pipeline,
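The snippet above is cut off in the middle of the CrossValidator call. One plausible completion, offered purely as an assumption (the RegressionEvaluator choice, its import, and the fold count are not from the original code):

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=dtrparamGrid,
                              evaluator=RegressionEvaluator(labelCol="trip_duration",
                                                            predictionCol="prediction",
                                                            metricName="rmse"),
                              numFolds=3)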
def randomClassifier(dataset_add, feature_colm, label_colm, relation_list,
                     relation):
    try:
        # dataset = spark.read.parquet(dataset_add)
        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=';')

        dataset.show()

        label = ''
        for y in label_colm:
            label = y

        print(label)
        #
        # summaryList = ['mean', 'stddev', 'min', 'max']
        # summaryDict = {}
        # for colm in feature_colm:
        #     summaryListTemp = []
        #     for value in summaryList:
        #         summ = list(dataset.select(colm).summary(value).toPandas()[colm])
        #         summaryListTemp.append(summ)
        #     varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm])
        #     summaryListTemp.append(varianceListTemp)
        #     summaryDict[colm] = summaryListTemp
        # summaryList.append('variance')
        # summaryDict['summaryName'] = summaryList
        #
        # print(summaryDict)

        # print(summaryDict)
        # varianceDict = {}
        # for colm in feature_colm:
        #     varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm])
        #     varianceDict[colm] = varianceListTemp
        # print(varianceDict)

        # summaryAll = {'summaryDict': summaryDict, 'varianceDict': varianceDict}
        # print(summaryAll)

        # extracting the schema

        schemaDataset = dataset.schema

        stringFeatures = []
        numericalFeatures = []

        for x in schemaDataset:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        print(stringFeatures)
        print(numericalFeatures)

        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}
        for colm in numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(
                    dataset.select(colm).summary(value).toPandas()[colm])
                summaryListTemp.append(summ)
            varianceListTemp = list(
                dataset.select(variance(
                    col(colm)).alias(colm)).toPandas()[colm])
            summaryListTemp.append(varianceListTemp)
            summaryDict[colm] = summaryListTemp
        summaryList.append('variance')
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = stringFeatures
        print(summaryDict)

        # print(val)

        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)

        # calling pearson test fuction

        response_pearson_test = Correlation_test_imp(
            dataset=dataset, features=numericalFeatures, label_col=label)

        # dataset = dataset.withColumnRenamed(label , 'indexed_'+ label)

        # dataset_pearson = dataset

        #
        # label_indexer = StringIndexer(inputCol=label, outputCol='indexed_'+label).fit(dataset)
        # dataset = label_indexer.transform(dataset)

        ###########################################################################
        indexed_features = []
        encoded_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
            # dataset.show()
            # encoder = OneHotEncoderEstimator(inputCols=['indexed_'+colm], outputCols=['encoded_'+colm]).fit(dataset)
            # encoded_features.append('encoded_'+colm)
            # dataset = encoder.transform(dataset)
            # dataset.show()

        print(indexed_features)
        print(encoded_features)

        # combining both the features colm together

        final_features = numericalFeatures + indexed_features

        print(final_features)

        # now using the vector assembler

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")

        dataset = featureassembler.transform(dataset)
        dataset.show()

        # output.show()
        # output.select("features").show()

        # output_features = dataset.select("features")

        #using the vector indexer

        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=4).fit(dataset)

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))

        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()

        # preparing the finalized data

        finalized_data = vec_indexed.select(label, 'vec_indexed_features')
        finalized_data.show()

        # renaming the colm
        # print (label)
        # dataset.withColumnRenamed(label,"label")
        # print (label)
        # dataset.show()

        # f = ""
        # f = label + " ~ "
        #
        # for x in features:
        #     f = f + x + "+"
        # f = f[:-1]
        # f = (f)
        #
        # formula = RFormula(formula=f,
        #                    featuresCol="features",
        #                    labelCol="label")
        #
        # output = formula.fit(dataset).transform(dataset)
        #
        # output_2 = output.select("features", "label")
        #
        # output_2.show()
        #
        #
        #
        # splitting the dataset into taining and testing

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        rf = RandomForestRegressor(labelCol=label,
                                   featuresCol='vec_indexed_features',
                                   numTrees=10)

        # Convert indexed labels back to original labels.

        # Train model.  This also runs the indexers.
        model = rf.fit(train_data)

        # Make predictions.
        predictions = model.transform(test_data)

        # Select example rows to display.
        # predictions.select("prediction", "label", "features").show(10)

        print(model.featureImportances)
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)

        features_column_for_user = numericalFeatures + stringFeatures

        feature_imp = {
            'feature_importance': feature_importance,
            "feature_column": features_column_for_user
        }

        response_dict = {
            'feature_importance': feature_imp,
            'pearson_test_data': response_pearson_test,
            'summaryDict': summaryDict
        }

        print(response_dict)

        return response_dict

        # Select (prediction, true label) and compute test error
        # evaluator = MulticlassClassificationEvaluator(
        #     labelCol="label", predictionCol="prediction", metricName="accuracy")
        # accuracy = evaluator.evaluate(predictions)
        # print("Test Error = %g" % (1.0 - accuracy))

        # rfModel = model.stages[2]
        # print(rfModel)  # summary only

    except Exception as e:
        print("exception is  = " + str(e))
Example #10
    def GradientBoostingClassification(self, dataset_add, feature_colm,
                                       label_colm, relation_list, relation):
        try:
            dataset = spark.read.csv(dataset_add,
                                     sep=';',
                                     header=True,
                                     inferSchema=True)
            dataset.show()
            stepSize = self.learningRate
            label = ''
            for val in label_colm:
                label = val
            #ETL part
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType"
                        or str(x.dataType) == 'TimestampType'
                        or str(x.dataType) == 'DateType'
                        or str(x.dataType) == 'BooleanType'
                        or str(x.dataType) == 'BinaryType'):
                    for y in feature_colm:
                        if x.name == y:
                            dataset = dataset.withColumn(
                                y, dataset[y].cast(StringType()))
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)

            if relation == 'linear':
                dataset = dataset
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)

            categoryColmList = []
            categoryColmListFinal = []
            categoryColmListDict = {}
            countOfCategoricalColmList = []
            for value in stringFeatures:
                listValue = []
                categoryColm = dataset.groupby(value).count()
                countOfCategoricalColmList.append(categoryColm.count())
                categoryColmJson = categoryColm.toJSON()
                for row in categoryColmJson.collect():
                    categoryColmSummary = json.loads(row)
                    listValue.append(categoryColmSummary)
                categoryColmListDict[value] = listValue

            if not stringFeatures:
                maxCategories = 5
            else:
                maxCategories = max(countOfCategoricalColmList)
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(inputCol=label,
                                                      outputCol='indexed_' +
                                                      label).fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm,
                                        outputCol='indexed_' +
                                        colm).fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            final_features = numericalFeatures + indexed_features
            featureassembler = VectorAssembler(inputCols=final_features,
                                               outputCol="features")
            dataset = featureassembler.transform(dataset)
            vectorIndexer = VectorIndexer(
                inputCol='features',
                outputCol='vectorIndexedFeatures',
                maxCategories=maxCategories).fit(dataset)
            dataset = vectorIndexer.transform(dataset)
            trainDataRatioTransformed = self.trainDataRatio
            testDataRatio = 1 - trainDataRatioTransformed
            trainingData, testData = dataset.randomSplit(
                [trainDataRatioTransformed, testDataRatio], seed=0)

            gradientBoostingmodel = GBTClassifier(
                labelCol=label,
                featuresCol='vectorIndexedFeatures',
                maxIter=10,
                stepSize=stepSize)
            gradientBoostFittingTrainingData = gradientBoostingmodel.fit(
                trainingData)
            gBPredictionTrainData = gradientBoostFittingTrainingData.transform(
                trainingData)
            gBPredictionTestData = gradientBoostFittingTrainingData.transform(
                testData)
            gBPredictionTestData.select('prediction', label).show()
            # gbtModel = gradientBoostFittingTrainingData.stages
            featureImportance = gradientBoostFittingTrainingData.featureImportances.toArray(
            ).tolist()
            print(featureImportance)

            # prediction graph data
            from pyspark.sql.functions import col
            TrainPredictedTargetData = gBPredictionTrainData.select(
                label, 'prediction', 'probability', 'rawPrediction')
            residualsTrainData = TrainPredictedTargetData.withColumn(
                'residuals',
                col(label) - col('prediction'))
            residualsTrainData.show()

            TestPredictedTargetData = gBPredictionTestData.select(
                label, 'prediction', 'probability', 'rawPrediction')
            residualsTestData = TestPredictedTargetData.withColumn(
                'residuals',
                col(label) - col('prediction'))
            residualsTestData.show()

            # train Test data Metrics
            gBPredictionDataDict = {
                'gBPredictionTestData': gBPredictionTestData,
                'gBPredictionTrainData': gBPredictionTrainData
            }
            metricsList = [
                'f1', 'weightedPrecision', 'weightedRecall', 'accuracy'
            ]
            for key, value in gBPredictionDataDict.items():
                if key == 'gBPredictionTestData':
                    testDataMetrics = {}
                    for metric in metricsList:
                        evaluator = MulticlassClassificationEvaluator(
                            labelCol=label,
                            predictionCol="prediction",
                            metricName=metric)
                        metricValue = evaluator.evaluate(gBPredictionTestData)
                        testDataMetrics[metric] = metricValue
                    print('testDataMetrics :', testDataMetrics)

                if key == 'gBPredictionTrainData':
                    trainDataMetrics = {}
                    for metric in metricsList:
                        evaluator = MulticlassClassificationEvaluator(
                            labelCol=label,
                            predictionCol="prediction",
                            metricName=metric)
                        metricValue = evaluator.evaluate(gBPredictionTrainData)
                        trainDataMetrics[metric] = metricValue
                    print('trainDataMetrics :', trainDataMetrics)

            # statistics of the model fitted on the training data
            totalNumberTrees = gradientBoostFittingTrainingData.getNumTrees
            print('Total number of trees used is :', totalNumberTrees)
            totalNumberNodes = gradientBoostFittingTrainingData.totalNumNodes
            print('Total number of node is :', totalNumberNodes)
            treeWeight = gradientBoostFittingTrainingData.treeWeights
            print('Weights on each tree is :', treeWeight)
            treeInfo = gradientBoostFittingTrainingData.trees
            for eachTree in treeInfo:
                print('info of each tree is :', eachTree)

        except Exception as e:
            print('exception is --', e)
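# A minimal sketch, assuming the same kind of preprocessing as above: the string
# indexing, vector assembling, vector indexing and GBT steps can be chained in a single
# pyspark.ml.Pipeline. The column names 'category_col', 'num_col' and the numeric binary
# label 'target' are hypothetical placeholders.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.classification import GBTClassifier


def gbt_pipeline_sketch(df):
    # index the categorical column, assemble all inputs, flag categoricals, then boost
    indexer = StringIndexer(inputCol='category_col', outputCol='indexed_category_col',
                            handleInvalid='skip')
    assembler = VectorAssembler(inputCols=['indexed_category_col', 'num_col'],
                                outputCol='features', handleInvalid='skip')
    vecIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures',
                               maxCategories=20, handleInvalid='skip')
    gbt = GBTClassifier(labelCol='target', featuresCol='vectorIndexedFeatures',
                        maxIter=10, stepSize=0.1)
    pipeline = Pipeline(stages=[indexer, assembler, vecIndexer, gbt])
    trainingData, testData = df.randomSplit([0.7, 0.3], seed=0)
    model = pipeline.fit(trainingData)
    return model.transform(testData)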
def chi_square_test(dataset, features, label_col, stringFeatures):
    spark = SparkSession.builder.appName("predictive_analysis").master(
        "local[*]").getOrCreate()

    spark.sparkContext.setLogLevel("ERROR")
    length = len(features)
    datasetChi = dataset

    featureassembler = VectorAssembler(inputCols=features,
                                       outputCol="features",
                                       handleInvalid="skip")

    datasetChi = featureassembler.transform(datasetChi)
    datasetChi.show()

    vec_indexer = VectorIndexer(inputCol='features',
                                outputCol='vec_indexed_features',
                                maxCategories=4,
                                handleInvalid="skip").fit(datasetChi)

    categorical_features = vec_indexer.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categorical_features), ", ".join(
              str(k) for k in categorical_features.keys())))

    vec_indexed = vec_indexer.transform(datasetChi)
    vec_indexed.show()

    finalized_data = vec_indexed.select(label_col, 'vec_indexed_features')
    finalized_data.show()

    # using chi selector
    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="vec_indexed_features",
                             outputCol="selected_features",
                             labelCol=label_col)

    result = selector.fit(finalized_data).transform(finalized_data)

    print("chi2 output with top %d features selected " %
          selector.getNumTopFeatures())
    result.show()

    # running the chi-square value test

    r = ChiSquareTest.test(result, "selected_features", label_col).head()
    p_values = list(r.pValues)
    PValues = []
    for val in p_values:
        PValues.append(round(val, 4))
    print(PValues)
    dof = list(r.degreesOfFreedom)
    stats = list(r.statistics)
    statistics = []
    for val in stats:
        statistics.append(round(val, 4))
    print(statistics)
    chiSquareDict = {}
    for pval, doF, stat, colm in zip(PValues, dof, statistics, stringFeatures):
        print(pval, doF, stat)
        chiSquareDict[colm] = pval, doF, stat
    chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
    print(chiSquareDict)

    return_data = {'pvalues': chiSquareDict}

    return return_data
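# A minimal usage sketch for chi_square_test, assuming the feature columns passed in are
# already string-indexed and the label is numeric; the toy DataFrame and the column
# names below are hypothetical.
from pyspark.sql import SparkSession

sparkSketch = SparkSession.builder.appName("chi_square_sketch").master(
    "local[*]").getOrCreate()
toyDf = sparkSketch.createDataFrame(
    [(0.0, 1.0, 0.0), (1.0, 0.0, 1.0), (0.0, 1.0, 1.0), (1.0, 1.0, 0.0)],
    ['indexed_colA', 'indexed_colB', 'indexed_label'])
chiResult = chi_square_test(dataset=toyDf,
                            features=['indexed_colA', 'indexed_colB'],
                            label_col='indexed_label',
                            stringFeatures=['colA', 'colB'])
print(chiResult['pvalues'])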
# In[280]:

# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(transformed)
labelIndexer.transform(transformed).show(5, False)

# In[265]:

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(transformed)
featureIndexer.transform(transformed).show(5, True)

# In[281]:

data.show(2, False)

# In[282]:

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

trainingData.show(5, False)
testData.show(5, False)

# In[283]:
    def linearReg(self, dataset_add, feature_colm, label_colm, relation_list,
                  relation, userId, locationAddress):
        try:
            dataset = spark.read.parquet(dataset_add)
            dataset.show()

            label = ''
            for val in label_colm:
                label = val
            #ETL part
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType"
                        or str(x.dataType) == 'TimestampType'
                        or str(x.dataType) == 'DateType'
                        or str(x.dataType) == 'BooleanType'
                        or str(x.dataType) == 'BinaryType'):
                    for y in feature_colm:
                        if x.name == y:
                            dataset = dataset.withColumn(
                                y, dataset[y].cast(StringType()))
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)

            if relation == 'linear':
                dataset = dataset
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)

            categoryColmList = []
            categoryColmListFinal = []
            categoryColmListDict = {}
            countOfCategoricalColmList = []
            for value in stringFeatures:
                listValue = []
                categoryColm = dataset.groupby(value).count()
                countOfCategoricalColmList.append(categoryColm.count())
                categoryColmJson = categoryColm.toJSON()
                for row in categoryColmJson.collect():
                    categoryColmSummary = json.loads(row)
                    listValue.append(categoryColmSummary)
                categoryColmListDict[value] = listValue

            if not stringFeatures:
                maxCategories = 5
            else:
                maxCategories = max(countOfCategoricalColmList)
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(
                            inputCol=label,
                            outputCol='indexed_' + label,
                            handleInvalid="skip").fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            # encodedFeatures = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm,
                                        outputCol='indexed_' + colm,
                                        handleInvalid="skip").fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            '''from pyspark.ml.feature import OneHotEncoderEstimator
                oneHotEncodedFeaturesList = []
                for colm in stringFeatures:
                        indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset)
                        indexed_features.append('indexed_' + colm)
                        dataset = indexer.transform(dataset)
                        oneHotEncodedFeaturesList.append('OneHotEncoded_' + colm)
                oneHotEncoder=OneHotEncoderEstimator(inputCols=indexed_features,
                                                     outputCols=oneHotEncodedFeaturesList)
                oneHotEncoderFit=oneHotEncoder.fit(dataset)
                oneHotEncoderFeaturesDataset=oneHotEncoderFit.transform(dataset)'''
            featureAssembler = VectorAssembler(inputCols=indexed_features +
                                               numericalFeatures,
                                               outputCol='features',
                                               handleInvalid="skip")
            dataset = featureAssembler.transform(dataset)
            vectorIndexer = VectorIndexer(inputCol='features',
                                          outputCol='vectorIndexedFeatures',
                                          maxCategories=maxCategories,
                                          handleInvalid="skip").fit(dataset)
            dataset = vectorIndexer.transform(dataset)
            trainDataRatioTransformed = self.trainDataRatio
            testDataRatio = 1 - trainDataRatioTransformed
            train_data, test_data = dataset.randomSplit(
                [trainDataRatioTransformed, testDataRatio], seed=40)

            lr = LinearRegression(featuresCol="vectorIndexedFeatures",
                                  labelCol=label)
            regressor = lr.fit(train_data)
            # locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'

            print("coefficient : " + str(regressor.coefficients))
            coefficient_t = str(regressor.coefficients)
            print("intercept : " + str(regressor.intercept))
            intercept_t = str(regressor.intercept)
            featurePredictedLabel = feature_colm
            featurePredictedLabel.append('prediction')
            featurePredictedLabel.append(label)
            # testDataEvaluation = regressor.evaluate(test_data)
            # testDataPrediction = testDataEvaluation.predictions
            # testDataPrediction.select(featurePredictedLabel).show()

            prediction = regressor.evaluate(test_data)
            prediction_val = prediction.predictions
            testDataPrediction = prediction_val.select(featurePredictedLabel)

            # computing residuals on the test predictions

            prediction_val_pand = prediction_val.select(
                label, "prediction").toPandas()
            prediction_val_pand = prediction_val_pand.assign(
                residual_vall=prediction_val_pand[label] -
                prediction_val_pand["prediction"])

            prediction_val_pand_residual = prediction_val_pand["residual_vall"]
            prediction_val_pand_label = prediction_val_pand[label]
            prediction_val_pand_predict = prediction_val_pand["prediction"]
            lr_prediction = regressor.transform(test_data)
            lr_prediction.groupBy(label, "prediction").count().show()
            lr_prediction_quantile = lr_prediction.select(label, "prediction")
            training_summary = regressor.summary

            print("numof_Iterations...%d\n" % training_summary.totalIterations)
            print("ObjectiveHistory...%s\n" %
                  str(training_summary.objectiveHistory))
            print("RMSE...%f\n" % training_summary.rootMeanSquaredError)
            RMSE = training_summary.rootMeanSquaredError
            print("MSE....%f\n" % training_summary.meanSquaredError)
            MSE = training_summary.meanSquaredError
            print("r**2(r-square)....::%f\n" % training_summary.r2)
            r_square = training_summary.r2
            print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj)
            adjsted_r_square = training_summary.r2adj
            print("deviance residuals %s" %
                  str(training_summary.devianceResiduals))
            training_summary.residuals.show()
            residual_graph = training_summary.residuals
            residual_graph_pandas = residual_graph.toPandas()
            print("coefficient standard errors: \n" +
                  str(training_summary.coefficientStandardErrors))
            coefficientStdError = str(
                training_summary.coefficientStandardErrors)
            print(" Tvalues :\n" + str(training_summary.tValues))
            T_values = str(training_summary.tValues)
            tValuesList = training_summary.tValues
            print(" p values :\n" + str(training_summary.pValues))
            P_values = str(training_summary.pValues)
            coefficientList = list(regressor.coefficients)

            #summaryData
            import pyspark.sql.functions as F
            import builtins
            round = getattr(builtins, 'round')
            print(coefficientList)
            coefficientListRounded = []
            for value in coefficientList:
                coefficientListRounded.append(round(value, 4))
            # print(coefficientListRounded)
            # print(intercept_t)
            interceptRounded = round(float(intercept_t), 4)
            # print(interceptRounded)
            # print(RMSE)
            RMSERounded = round(RMSE, 4)
            # print(RMSERounded)
            MSERounded = round(MSE, 4)
            rSquareRounded = round(r_square, 4)
            adjustedrSquareRounded = round(adjsted_r_square, 4)
            coefficientStdError = training_summary.coefficientStandardErrors
            coefficientStdErrorRounded = []
            for value in coefficientStdError:
                coefficientStdErrorRounded.append(round(float(value), 4))
            print(coefficientStdErrorRounded)
            tValuesListRounded = []
            for value in tValuesList:
                tValuesListRounded.append(round(value, 4))
            print(tValuesListRounded)
            pValuesListRounded = []
            PValuesList = training_summary.pValues

            for value in PValuesList:
                pValuesListRounded.append(round(value, 4))
            print(pValuesListRounded)

            # regression equation
            intercept_t = float(intercept_t)
            coefficientList = list(regressor.coefficients)
            equation = label, '=', interceptRounded, '+'
            for feature, coeff in zip(feature_colm, coefficientListRounded):
                coeffFeature = coeff, '*', feature, '+'
                equation += coeffFeature
            equation = equation[:-1]
            print(equation)
            equationAsList = list(equation)
            '''# statTable function
            def summaryTable(self,featuresName,featuresStat):
                statTable={}
                for name, stat in zip(featuresName.values(),
                                      featuresStat.values()):
                    print(name, ": ", stat)
                    statTable[name]=stat
                return statTable
            '''

            # significance value
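            # map each rounded p-value to an R-style significance code:
            # '***' if p < 0.001, '**' if p < 0.01, '*' if p < 0.05, '.' if p < 0.1,
            # '-' otherwise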

            PValuesList = training_summary.pValues
            significanceObject = {}

            for pValue in pValuesListRounded:
                if (0 <= pValue < 0.001):
                    significanceObject[pValue] = '***'
                if (0.001 <= pValue < 0.01):
                    significanceObject[pValue] = '**'
                if (0.01 <= pValue < 0.05):
                    significanceObject[pValue] = '*'
                if (0.05 <= pValue < 0.1):
                    significanceObject[pValue] = '.'
                if (0.1 <= pValue < 1):
                    significanceObject[pValue] = '-'
            print(significanceObject)

            # storing test predicted value to the dataset

            predictionData = 'prediction.parquet'

            predictionDataStoring = locationAddress + userId + predictionData
            testDataPrediction.write.parquet(predictionDataStoring,
                                             mode='overwrite')

            # residual  vs predicted value

            prediction_data = regressor.summary.predictions
            prediction_data.show()
            prediction_data.select(['prediction']).show()
            predicted = prediction_data.select(['prediction'])
            regressor.summary.residuals.show()
            residuals = regressor.summary.residuals
            pred_d = predicted.withColumn('row_index',
                                          f.monotonically_increasing_id())
            res_d = residuals.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_residuals = pred_d.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            pred_residuals.show()

            QQPlot = 'QQPlot.parquet'
            # locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'

            # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67'

            QQPlotAddress = locationAddress + userId + QQPlot
            pred_residuals.write.parquet(QQPlotAddress, mode='overwrite')

            # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',
            #                              mode='overwrite')

            #################################################################################3
            # scale location plot
            from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev

            df_label = prediction_data.select(
                label, 'prediction',
                sqrt(ab(prediction_data[label])).alias("sqrt_label"))

            df_label.show()
            df_sqrt_label_index = df_label.withColumn(
                'row_index', f.monotonically_increasing_id())
            df_sqrt_label_index.show()
            res_d.show()
            sqrt_label_residual_join = df_sqrt_label_index.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            sqrt_label_residual_join.show()
            std_resid = sqrt_label_residual_join.select(
                'sqrt_label', 'prediction',
                (sqrt_label_residual_join['residuals'] /
                 sqrt_label_residual_join['sqrt_label']).alias('std_res'))
            std_resid.show()
            sqrt_std_res = std_resid.select(
                "std_res", 'prediction',
                sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid"))
            sqrt_std_res.show()
            sqrt_std_res_fitted = sqrt_std_res.select('prediction',
                                                      'sqrt_std_resid')

            scaleLocationPlot = 'scaleLocation.parquet'

            scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot
            sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress,
                                              mode='overwrite')

            # sqrt_std_res_fitted.write.parquet(
            #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet',
            #     mode='overwrite')
            ###########
            #QQplot
            # QUANTILE

            from scipy.stats import norm
            import statistics
            import math

            res_d.show()
            sorted_res = res_d.sort('residuals')
            sorted_res.show()
            # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'),
            #                                meann(col('residuals')).alias('mean'))
            # stdev_ress.show()
            # mean_residual = stdev_ress.select(['mean']).toPandas()
            # l = mean_residual.values.tolist()
            # print(l)
            # stddev_residual = stdev_ress.select(['std_dev']).toPandas()
            # length of the sorted std residuals
            count = sorted_res.groupBy().count().toPandas()
            countList = count.values.tolist()
            tuple1 = ()
            for k in countList:
                tuple1 = k
            for tu in tuple1:
                lengthResiduals = tu
            print(lengthResiduals)
            quantileList = []
            for x in range(0, lengthResiduals):
                quantileList.append((x - 0.5) / (lengthResiduals))

            print(quantileList)
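            # quantileList holds approximate (i - 0.5) / n plotting positions; norm.ppf
            # below converts them into theoretical normal quantiles for the Q-Q plot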

            # Z-score on theoritical quantile

            zTheoriticalTrain = []
            for x in quantileList:
                zTheoriticalTrain.append(norm.ppf(abs(x)))
            print(zTheoriticalTrain)

            sortedResidualPDF = sorted_res.select('residuals').toPandas()
            sortedResidualPDF = sortedResidualPDF['residuals']
            stdevResidualTrain = statistics.stdev(sortedResidualPDF)
            meanResidualTrain = statistics.mean(sortedResidualPDF)

            zPracticalTrain = []
            for x in sortedResidualPDF:
                zPracticalTrain.append(
                    (x - meanResidualTrain) / stdevResidualTrain)

            ##########
            target = dataset.select(label)
            pred = prediction_data.select(['prediction'])
            pred_d = pred.withColumn('row_index',
                                     f.monotonically_increasing_id())
            target_d = target.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_target = pred_d.join(target_d,
                                      on=['row_index']).drop('row_index')
            pred_target.show()

            dataset.show()

            pred_target_data_update = dataset.join(pred_target, on=[label])

            pred_target_data_update.show(100)

            ##########3
            # table_response = {
            #
            #     "Intercept": intercept_t,
            #     "Coefficients": coefficient_t,
            #     "RMSE": RMSE,
            #     "MSE": MSE,
            #     "R_square": r_square,
            #     "Adj_R_square": adjsted_r_square,
            #     "coefficientStdError": coefficientStdError,
            #     "T_value": T_values,
            #     "P_value": P_values
            #
            # }
            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)
            quantile_label = lr_prediction_quantile.approxQuantile(
                label, x, 0.01)
            quantile_prediction = lr_prediction_quantile.approxQuantile(
                "prediction", x, 0.01)
            Q_label_pred = ''
            print(len(quantile_label))
            length = len(quantile_label)

            for i in range(0, len(quantile_label)):
                Q_label_pred += str(quantile_label[i]) + 't' + str(
                    quantile_prediction[i]) + 'n'
            import math

            fitted_residual = ''
            print(len(prediction_val_pand_residual))
            length = len(prediction_val_pand_residual)

            for i in range(0, len(prediction_val_pand_residual)):
                fitted_residual += str(
                    prediction_val_pand_predict[i]) + 't' + str(
                        prediction_val_pand_residual[i]) + 'n'
            ## scale location graph data

            prediction_val_pand_residual_abs = prediction_val_pand_residual.abs(
            )
            import math
            sqrt_residual = []
            for x in prediction_val_pand_residual_abs:
                sqrt_residual.append(math.sqrt(x))
                # print ("____________________  ",x)

            # calculating std deviation
            import statistics

            print(statistics.stdev(prediction_val_pand_residual))
            stdev_ = statistics.stdev(prediction_val_pand_residual)

            # calcuate stnd residuals
            std_res = []
            for x in prediction_val_pand_residual:
                std_res.append(x / stdev_)
            print(std_res)

            # calculating the square root of std_res
            import math
            sqr_std_res = []
            for x in std_res:
                sqr_std_res.append(math.sqrt(abs(x)))
            print(sqr_std_res)

            scale_predict_residual = ''
            for pre, res in zip(prediction_val_pand_predict, sqr_std_res):
                scale_predict_residual += str(pre) + 't' + str(res) + 'n'
            print(scale_predict_residual)
            # QUANTILE

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)

            quantile_std_res = spark.createDataFrame(std_res, FloatType())
            quantile_std_res.show()
            quantile_std_res_t = quantile_std_res.approxQuantile(
                'value', x, 0.01)
            print(quantile_std_res_t)
            print(x)
            # calculating the z_score
            from scipy.stats import norm

            ## sort the list
            sorted_std_res = sorted(std_res)

            mean = statistics.mean(sorted_std_res)
            stdev = statistics.stdev(sorted_std_res)
            # print(mean)
            quantile = []
            n = len(std_res)
            print(n)
            for x in range(0, n):
                quantile.append((x - 0.5) / (n))

            print(quantile)
            # z_score theoratical
            z_theory = []
            for x in quantile:
                z_theory.append(norm.ppf(abs(x)))
            # z score for real val
            z_pract = []
            for x in sorted_std_res:
                z_pract.append((x - mean) / stdev)
            Q_label_pred = ''
            for quant, val in zip(z_theory, z_pract):
                Q_label_pred += str(quant) + 't' + str(val) + 'n'
            graph_response = {
                "Q_Q_plot": Q_label_pred,
                "residual_fitted": fitted_residual,
                "scale_location": scale_predict_residual
            }

            tableContent = \
                {
                    'coefficientValuesKey': coefficientListRounded,
                    'tValuesKey': tValuesListRounded,
                    'pValuesKey': pValuesListRounded,
                    'significanceValuesKey': significanceObject,
                    'interceptValuesKey': interceptRounded,
                    "RMSE": RMSERounded,
                    "RSquare": rSquareRounded,
                    "AdjRSquare": adjustedrSquareRounded,
                    "CoefficientStdError": coefficientStdErrorRounded,
                    'equationKey': equation
                }

            json_response = {
                'table_data': tableContent,
                'graph_data': graph_response
            }
            print(json_response)
            return (json_response)
        except Exception as e:
            print('exception is =' + str(e))
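# A minimal invocation sketch, assuming an object that exposes the trainDataRatio used
# inside linearReg; every name and argument below is a hypothetical placeholder.
# result = predictiveModel.linearReg(dataset_add='path/to/data.parquet',
#                                    feature_colm=['colA', 'colB'],
#                                    label_colm=['target'],
#                                    relation_list={},
#                                    relation='linear',
#                                    userId='some-user-id',
#                                    locationAddress='hdfs://host:9000/some/dir/')
# print(result['table_data'])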
Example #14
0
    def Logistic_regression(dataset_add, feature_colm, label_colm):

        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()
        label = ''
        for y in label_colm:
            label = y

        print(label)

        # using the rformula for indexing, encoding and vectorising

        # f = ""
        # f = label + " ~ "
        #
        # for x in features:
        #     f = f + x + "+"
        # f = f[:-1]
        # f = (f)

        # extracting the schema

        val = dataset.schema

        string_features = []
        integer_features = []

        for x in val:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        string_features.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        integer_features.append(x.name)

        print(string_features)
        print(integer_features)
        print(val)
        # print(label)
        # label = 'y'

        for z in val:
            if (z.name == label and str(z.dataType) == "StringType"):
                label_indexer = StringIndexer(inputCol=label,
                                              outputCol='indexed_' +
                                              label).fit(dataset)
                dataset = label_indexer.transform(dataset)
            if (z.name == label and str(z.dataType)
                    in ("IntegerType", "FloatType", "DoubleType")):
                dataset = dataset.withColumnRenamed(label, 'indexed_' + label)

        ###########################################################################
        indexed_features = []
        encoded_features = []
        for col in string_features:
            indexer = StringIndexer(inputCol=col,
                                    outputCol='indexed_' + col).fit(dataset)
            indexed_features.append('indexed_' + col)
            dataset = indexer.transform(dataset)
            # dataset.show()
            # encoder = OneHotEncoderEstimator(inputCols=['indexed_'+col], outputCols=['encoded_'+col]).fit(dataset)
            # encoded_features.append('encoded_'+col)
            # dataset = encoder.transform(dataset)
            # dataset.show()

        print(indexed_features)
        print(encoded_features)

        # combining both feature column lists together

        final_features = integer_features + indexed_features

        print(final_features)

        # now using the vector assembler

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")

        dataset = featureassembler.transform(dataset)
        dataset.show()

        # combining both the features colm together

        # output.show()
        # output.select("features").show()

        # output_features = dataset.select("features")

        # using the vector indexer to flag categorical features inside the assembled vector

        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=15).fit(dataset)

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))

        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()

        # preparing the finalized data

        finalized_data = vec_indexed.select('indexed_' + label,
                                            'vec_indexed_features')
        finalized_data.show()

        # formula = RFormula(formula=f,
        #                    featuresCol="features",
        #                    labelCol="label")
        #
        # output = formula.fit(dataset).transform(dataset)
        #
        # output_2 = output.select("features", "label")
        #
        # output_2.show()

        # splitting the dataset into train and test

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        # implementing the logistic regression
        # lr1 =LogisticRegression()

        Accuracy_list = []
        # Accuracy_list.append(accuracy)
        FPR_list = []
        # FPR_list.append(falsePositiveRate)
        TPR_list = []
        precision_list = []
        recall_list = []

        y = 0.1
        # x=[]
        for i in range(0, 3):
            y = round(y + 0.1, 2)
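            # NOTE: y is updated here but never passed into the estimator below, so all
            # three iterations fit the same LogisticRegression configuration.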

            lr = LogisticRegression(featuresCol='vec_indexed_features',
                                    labelCol='indexed_' + label,
                                    maxIter=5,
                                    regParam=0.1,
                                    elasticNetParam=1.0,
                                    threshold=0.3)

            # fit the model

            lrModel = lr.fit(train_data)

            # print the coefficients and the intercept for the logistic regression

            print("coefficients:" + str(lrModel.coefficientMatrix))
            # mat = (lrModel.coefficientMatrix)
            # print mat
            print("intercept: " + str(lrModel.interceptVector))

            # getting the summary of the model

            # f-measure calculation
            from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary

            training_summary = lrModel.summary


            print(" area under roc : ", training_summary.areaUnderROC)
            print("  roc : ", training_summary.roc)
            roc = training_summary.roc
            roc.show()
            print(" pr value : ", training_summary.pr)
            pr = training_summary.pr
            pr.show()
            print(" precision by threshold : ",
                  training_summary.precisionByThreshold)
            prec_by_threshold = training_summary.precisionByThreshold
            prec_by_threshold.show()

            print(" accuracy : ", training_summary.accuracy)
            accuracy_d = training_summary.accuracy
            print(accuracy_d)

            fMeasure = training_summary.fMeasureByThreshold

            fMeasure.show()

            maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
                'max(F-Measure)').head()
            bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
                .select('threshold').head()['threshold']
            lr.setThreshold(bestThreshold)

            # obtain the objective per iteration

            objectiveHistory = training_summary.objectiveHistory
            print("objectiveHistory")
            for objective in objectiveHistory:
                print(objective)

            # for a multiclass model we can inspect metrics on a per-label basis

            print("false positive rate by label:")
            for i, rate in enumerate(
                    training_summary.falsePositiveRateByLabel):
                print("label %d: %s" % (i, rate))

            print("True positive rate")
            for i, rate in enumerate(training_summary.truePositiveRateByLabel):
                print("label %d : %s" % (i, rate))
            #
            # print("True Negative rate")
            # for i, rate in enumerate(training_summary)

            print("Precision by label:")
            for i, prec in enumerate(training_summary.precisionByLabel):
                print("label %d: %s" % (i, prec))

            print("Recall by label:")
            for i, rec in enumerate(training_summary.recallByLabel):
                print("label %d: %s" % (i, rec))

            print("F-measure by label:")
            for i, f in enumerate(training_summary.fMeasureByLabel()):
                print("label %d: %s" % (i, f))

            accuracy = training_summary.accuracy
            falsePositiveRate = training_summary.weightedFalsePositiveRate
            truePositiveRate = training_summary.weightedTruePositiveRate
            fMeasure = training_summary.weightedFMeasure()
            precision = training_summary.weightedPrecision
            recall = training_summary.weightedRecall
            print(
                "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
                % (accuracy, falsePositiveRate, truePositiveRate, fMeasure,
                   precision, recall))
            # Accuracy_list = []
            Accuracy_list.append(accuracy)
            # FPR_list = []
            FPR_list.append(falsePositiveRate)
            # TPR_list=[]
            TPR_list.append(truePositiveRate)
            precision_list.append(precision)
            recall_list.append(recall)

        print(Accuracy_list)
        print(FPR_list)
        print(TPR_list)
        print(precision_list)
        print(recall_list)

        import matplotlib.pyplot as plt
        #
        # plt.plot(recall_list, FPR_list)
        # plt.show()

        #
        # fpr = [0.0,0.0,0.0,0.0,0.003067484662576687, 0.003067484662576687, 0.006134969325153374, 0.11042944785276074, 0.1165644171779141, 0.1165644171779141, 0.23006134969325154, 0.9723926380368099, 0.9846625766871165 ]
        # tpr = [0.0, 0.09767441860465116, 0.10232558139534884, 0.13488372093023257 ,0.17674418604651163 ,0.3674418604651163 , 0.37209302325581395  , 0.7534883720930232, 0.8651162790697674 , 0.8697674418604651 , 0.9069767441860465, 0.9953488372093023, 1.0]
        # data visualization

        # ROC graph
        fpr = roc.select("FPR").toPandas()

        tpr = roc.select("TPR").toPandas()

        plt.plot(fpr, tpr)
        plt.show()

        # PR graph

        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()

        plt.plot(pr_precision, pr_recall)
        plt.show()

        # now applying the fit on the test data

        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy('indexed_' + label, "prediction").count().show()
        prediction_val.show()

        prediction_val.groupBy("prediction").count().show()

        prediction_val.groupBy("prediction", "probability").count().show()
Example #15
0
# In[14]:

from pyspark.ml.feature import StringIndexer
# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)
labelIndexer.transform(data)

from pyspark.ml.feature import VectorIndexer
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(data)
featureIndexer.transform(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# In[15]:

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.util import MLUtils

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures")
Example #16
0
def randomClassifier(dataset_add, feature_colm, label_colm, relation_list,
                     relation):
    try:
        dataset = spark.read.parquet(dataset_add)
        label = ''
        for y in label_colm:
            label = y

        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}

        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        for colm in numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(
                    dataset.select(colm).summary(value).toPandas()[colm])
                summaryListSubTemp = []
                for val in summ:
                    summaryListSubTemp.append(round(float(val), 4))
                # print(summaryListSubTemp)
                summaryListTemp.append(summaryListSubTemp)
            # varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm])
            # summaryListTemp.append(varianceListTemp)
            summaryDict[colm] = summaryListTemp
        # summaryList.append('variance')
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = stringFeatures
        skewnessList = []
        kurtosisList = []
        varianceList = []
        skewKurtVarDict = {}
        for colm in numericalFeatures:
            skewness = (dataset.select(F.skewness(dataset[colm])).toPandas())
            for i, row in skewness.iterrows():
                for j, column in row.iteritems():
                    skewnessList.append(round(column, 4))
            kurtosis = (dataset.select(F.kurtosis(dataset[colm])).toPandas())
            for i, row in kurtosis.iterrows():
                for j, column in row.iteritems():
                    kurtosisList.append(round(column, 4))
            variance = (dataset.select(F.variance(dataset[colm])).toPandas())
            for i, row in variance.iterrows():
                for j, column in row.iteritems():
                    varianceList.append(round(column, 4))

        for skew, kurt, var, colm in zip(skewnessList, kurtosisList,
                                         varianceList, numericalFeatures):
            print(skew, kurt, var)
            skewKurtVarList = []
            skewKurtVarList.append(skew)
            skewKurtVarList.append(kurt)
            skewKurtVarList.append(var)
            skewKurtVarDict[colm] = skewKurtVarList

        for (keyOne, valueOne), (keyTwo,
                                 valueTwo) in zip(summaryDict.items(),
                                                  skewKurtVarDict.items()):
            print(keyOne, valueOne, keyTwo, valueTwo)
            if keyOne == keyTwo:
                valueOne.extend(valueTwo)
                summaryDict[keyOne] = valueOne
        print(summaryDict)
        summaryList.extend(['skewness', 'kurtosis', 'variance'])
        print(summaryList)
        print(summaryDict)
        # for colm in numericalFeatures:
        #     skewness = (dataset.select(F.skewness(dataset[colm])).alias('skewness_' + colm))
        #     kurtosis = (dataset.select(F.kurtosis(dataset[colm])).alias('kurtosis_' + colm))
        #     variance = (dataset.select(F.variance(dataset[colm]).alias('kurtosis_' + colm)))
        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)

        dataset.show()
        for x in Schema:
            if (str(x.dataType) == "StringType" and x.name == label):
                for labelkey in label_colm:
                    label_indexer = StringIndexer(inputCol=label,
                                                  outputCol='indexed_' +
                                                  label).fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        final_features = numericalFeatures + indexed_features
        response_chi_test = chi_square_test(dataset=dataset,
                                            features=indexed_features,
                                            label_col=label,
                                            stringFeatures=stringFeatures)

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")
        dataset = featureassembler.transform(dataset)
        dataset.show()
        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=4).fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Choose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))
        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()
        finalized_data = vec_indexed.select(label, 'vec_indexed_features')
        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)
        rf = RandomForestClassifier(labelCol=label,
                                    featuresCol='vec_indexed_features',
                                    numTrees=10)
        model = rf.fit(train_data)
        predictions = model.transform(test_data)
        print(model.featureImportances)
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        # feature_importance = [round(x,4) for x in feature_importance]
        featureImportance = []
        for x in feature_importance:
            featureImportance.append(round(x, 4))
        print(featureImportance)
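        # featureImportance entries follow the VectorAssembler input order
        # (numericalFeatures first, then the indexed string columns), which matches
        # features_column_for_user below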

        features_column_for_user = numericalFeatures + stringFeatures
        feature_imp = {
            'feature_importance': featureImportance,
            "feature_column": features_column_for_user
        }
        response_dict = {
            'feature_importance': feature_imp,
            'ChiSquareTestData': response_chi_test,
            'summaryDict': summaryDict
        }
        return response_dict
    except Exception as e:
        print("exception is  = " + str(e))
    def lassoRegression(self, dataset_add, feature_colm, label_colm,
                        relation_list, relation, userId):
        try:
            dataset = spark.read.parquet(dataset_add)
            dataset.show()
            Rsqr_list = []
            Rsqr_regPara = {}
            print(self.xt)
            # print(data_add)

            label = ''
            for val in label_colm:
                label = val
            #ETL part
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType"
                        or str(x.dataType) == 'TimestampType'
                        or str(x.dataType) == 'DateType'
                        or str(x.dataType) == 'BooleanType'
                        or str(x.dataType) == 'BinaryType'):
                    for y in feature_colm:
                        if x.name == y:
                            dataset = dataset.withColumn(
                                y, dataset[y].cast(StringType()))
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)

            if relation == 'linear':
                dataset = dataset
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)

            categoryColmList = []
            categoryColmListFinal = []
            categoryColmListDict = {}
            countOfCategoricalColmList = []
            for value in stringFeatures:
                listValue = []
                categoryColm = dataset.groupby(value).count()
                countOfCategoricalColmList.append(categoryColm.count())
                categoryColmJson = categoryColm.toJSON()
                for row in categoryColmJson.collect():
                    categoryColmSummary = json.loads(row)
                    listValue.append(categoryColmSummary)
                categoryColmListDict[value] = listValue

            if not stringFeatures:
                maxCategories = 5
            else:
                maxCategories = max(countOfCategoricalColmList)
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(
                            inputCol=label,
                            outputCol='indexed_' + label,
                            handleInvalid="skip").fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            encodedFeatures = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm,
                                        outputCol='indexed_' + colm,
                                        handleInvalid="skip").fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            featureAssembler = VectorAssembler(inputCols=indexed_features +
                                               numericalFeatures,
                                               outputCol='features',
                                               handleInvalid="skip")
            dataset = featureAssembler.transform(dataset)
            vectorIndexer = VectorIndexer(inputCol='features',
                                          outputCol='vectorIndexedFeatures',
                                          maxCategories=maxCategories,
                                          handleInvalid="skip").fit(dataset)
            dataset = vectorIndexer.transform(dataset)
            trainDataRatioTransformed = self.trainDataRatio
            testDataRatio = 1 - trainDataRatioTransformed
            train_data, test_data = dataset.randomSplit(
                [trainDataRatioTransformed, testDataRatio], seed=40)

            ######################################################################33
            # lasso final
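            # elasticNetParam=1 makes LinearRegression a pure L1 (lasso) model; the loop
            # below tries each regParam value in self.xt, and the best training R^2
            # decides which regParam is used for the final fit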
            for t in self.xt:
                lr1 = LinearRegression(featuresCol="vectorIndexedFeatures",
                                       labelCol=label,
                                       elasticNetParam=1,
                                       regParam=t)
                regressor1 = lr1.fit(train_data)
                print(t)
                print("coefficient : " + str(regressor1.coefficients))
                reg_sum = regressor1.summary
                r2 = reg_sum.r2
                Rsqr_list.append(r2)
                Rsqr_regPara[r2] = t
                print(r2)

            print(Rsqr_list)
            print(max(Rsqr_list))
            maximum_rsqr = max(Rsqr_list)
            print(Rsqr_regPara)
            final_regPara = []

            for key, val in Rsqr_regPara.items():
                if (key == maximum_rsqr):
                    print(val)
                    final_regPara.append(val)

            for reg in final_regPara:
                lr_lasso = LinearRegression(
                    featuresCol="vectorIndexedFeatures",
                    labelCol=label,
                    elasticNetParam=1,
                    regParam=reg)
                regressor = lr_lasso.fit(train_data)
                training_summary = regressor.summary
                r2 = training_summary.r2
                print(r2)

            print("coefficient : " + str(regressor.coefficients))
            coefficient_t = str(regressor.coefficients)
            print("intercept : " + str(regressor.intercept))
            intercept_t = str(regressor.intercept)
            prediction = regressor.evaluate(test_data)
            prediction_val = prediction.predictions
            prediction_val.show()
            prediction_val_pand = prediction_val.select(
                label, "prediction").toPandas()
            prediction_val_pand = prediction_val_pand.assign(
                residual_vall=prediction_val_pand[label] -
                prediction_val_pand["prediction"])

            prediction_val_pand_residual = prediction_val_pand["residual_vall"]
            prediction_val_pand_label = prediction_val_pand[label]
            prediction_val_pand_predict = prediction_val_pand["prediction"]
            lr_prediction = regressor.transform(test_data)
            lr_prediction.groupBy(label, "prediction").count().show()
            lr_prediction_quantile = lr_prediction.select(label, "prediction")
            lr_prediction_onlypred = lr_prediction.select('prediction')
            # lr_prediction_quantile.show()

            # training_summary = regressor.summary

            print("numof_Iterations...%d\n" % training_summary.totalIterations)
            print("ObjectiveHistory...%s\n" %
                  str(training_summary.objectiveHistory))
            print("RMSE...%f\n" % training_summary.rootMeanSquaredError)
            RMSE = training_summary.rootMeanSquaredError
            print("MSE....%f\n" % training_summary.meanSquaredError)
            MSE = training_summary.meanSquaredError
            print("r**2(r-square)....::%f\n" % training_summary.r2)
            r_square = training_summary.r2
            print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj)
            adjusted_r_square = training_summary.r2adj
            print("deviance residuals %s" %
                  str(training_summary.devianceResiduals))
            training_summary.residuals.show()
            # residual_graph = training_summary.residuals
            # test = (residual_graph, lr_prediction_onlypred)
            # residual_graph.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append' )
            # print(test)
            # test.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode= 'append')
            # residual_graph_pandas = residual_graph.toPandas()
            # print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors))
            # coefficient_error = str(training_summary.coefficientStandardErrors)
            # print(" Tvalues :\n" + str(training_summary.tValues))
            # T_values = str(training_summary.tValues)
            # print(" p values :\n" + str(training_summary.pValues))
            # P_values = str(training_summary.pValues)

            #######################################################################################################
            table_response = {
                "Intercept": intercept_t,
                "Coefficients": coefficient_t,
                "RMSE": RMSE,
                "MSE": MSE,
                "R_square": r_square,
                "Adj_R_square": adjsted_r_square
            }
            #######################################################################################################
            # residual  vs predicted value

            prediction_data = regressor.summary.predictions
            prediction_data.show()
            prediction_data.select(['prediction']).show()
            predicted = prediction_data.select(['prediction'])
            regressor.summary.residuals.show()
            residuals = regressor.summary.residuals
            pred_d = predicted.withColumn('row_index',
                                          f.monotonically_increasing_id())
            res_d = residuals.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_residuals = pred_d.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            pred_residuals.show()
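            # Note (added comment): monotonically_increasing_id() does not generate
            # consecutive ids, so this join only pairs each prediction with its residual
            # under the assumption that both DataFrames (taken from the same training
            # summary) share the same partitioning and row order.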

            QQPlot = 'QQPlot.parquet'
            locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'

            # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67'

            QQPlotAddress = locationAddress + userId + QQPlot
            pred_residuals.write.parquet(QQPlotAddress, mode='overwrite')

            # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',
            #                              mode='overwrite')

            #################################################################################
            # scale location plot
            from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev

            df_label = prediction_data.select(
                label, 'prediction',
                sqrt(ab(prediction_data[label])).alias("sqrt_label"))

            df_label.show()
            df_sqrt_label_index = df_label.withColumn(
                'row_index', f.monotonically_increasing_id())
            df_sqrt_label_index.show()
            res_d.show()
            sqrt_label_residual_join = df_sqrt_label_index.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            sqrt_label_residual_join.show()
            std_resid = sqrt_label_residual_join.select(
                'sqrt_label', 'prediction',
                (sqrt_label_residual_join['residuals'] /
                 sqrt_label_residual_join['sqrt_label']).alias('std_res'))
            std_resid.show()
            sqrt_std_res = std_resid.select(
                "std_res", 'prediction',
                sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid"))
            sqrt_std_res.show()
            sqrt_std_res_fitted = sqrt_std_res.select('prediction',
                                                      'sqrt_std_resid')

            scaleLocationPlot = 'scaleLocation.parquet'

            scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot
            sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress,
                                              mode='overwrite')

            # sqrt_std_res_fitted.write.parquet(
            #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet',
            #     mode='overwrite')
            ###########
            #QQplot
            # QUANTILE

            from scipy.stats import norm
            import statistics
            import math

            res_d.show()
            sorted_res = res_d.sort('residuals')
            sorted_res.show()
            # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'),
            #                                meann(col('residuals')).alias('mean'))
            # stdev_ress.show()
            # mean_residual = stdev_ress.select(['mean']).toPandas()
            # l = mean_residual.values.tolist()
            # print(l)
            # stddev_residual = stdev_ress.select(['std_dev']).toPandas()
            # length of the sorted std residuals
            lengthResiduals = sorted_res.count()
            print(lengthResiduals)
            quantileList = []
            for x in range(1, lengthResiduals + 1):
                quantileList.append((x - 0.5) / lengthResiduals)

            print(quantileList)

            # Z-score on theoretical quantiles

            zTheoreticalTrain = []
            for x in quantileList:
                zTheoreticalTrain.append(norm.ppf(x))
            print(zTheoreticalTrain)

            sortedResidualPDF = sorted_res.select('residuals').toPandas()
            sortedResidualPDF = sortedResidualPDF['residuals']
            stdevResidualTrain = statistics.stdev(sortedResidualPDF)
            meanResidualTrain = statistics.mean(sortedResidualPDF)

            zPracticalTrain = []
            for x in sortedResidualPDF:
                zPracticalTrain.append(
                    (x - meanResidualTrain) / stdevResidualTrain)
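            # (Added note: plotting zTheoreticalTrain against zPracticalTrain gives the
            # training-side Q-Q data, analogous to the z_theory / z_pract pairing built
            # for the test-side residuals further below.)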

            ##########
            target = dataset.select(label)
            pred = prediction_data.select(['prediction'])
            pred_d = pred.withColumn('row_index',
                                     f.monotonically_increasing_id())
            target_d = target.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_target = pred_d.join(target_d,
                                      on=['row_index']).drop('row_index')
            pred_target.show()

            dataset.show()

            pred_target_data_update = dataset.join(pred_target, on=[label])

            pred_target_data_update.show(100)

            ##########################################################################################

            # scale location plot

            # for scale location plot
            # from pyspark.sql.functions import udf
            #
            # def std_res(x):
            #     res_list = []
            #     res_list.append(x)
            #
            # std_residuals = udf(lambda y: std_res(y), FloatType())
            #
            # residuals_std = residuals.withColumn('residuals', std_residuals(col('residuals').cast(FloatType())))
            #
            # import statistics
            # import numpy as np
            # residuals_panda = residuals.toPandas()
            # # residuals_panda.residuals = range(residuals_panda.shape[1])
            # residuals_panda = residuals_panda.values
            # print(residuals_panda)
            # stdev_training = statistics.stdev(residuals_panda)
            # print(stdev_training)

            ############################################################################################################

            # creating the dictionary for storing the result

            # json_response = coefficient_t

            # print(json_response)

            # json_response = {"adjusted r**2 value" : training_summary.r2adj}

            # DATA VISUALIZATION PART

            # finding the quantile in the dataset(Q_Q plot)
            import matplotlib.pyplot as plt

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)
            quantile_label = lr_prediction_quantile.approxQuantile(
                label, x, 0.01)
            quantile_prediction = lr_prediction_quantile.approxQuantile(
                "prediction", x, 0.01)
            Q_label_pred = ''
            print(len(quantile_label))
            length = len(quantile_label)

            for i in range(0, len(quantile_label)):
                Q_label_pred += str(quantile_label[i]) + '\t' + str(
                    quantile_prediction[i]) + '\n'
            import math

            fitted_residual = ''
            print(len(prediction_val_pand_residual))
            length = len(prediction_val_pand_residual)

            for i in range(0, len(prediction_val_pand_residual)):
                fitted_residual += str(
                    prediction_val_pand_predict[i]) + '\t' + str(
                        prediction_val_pand_residual[i]) + '\n'
            ## scale location graph data

            prediction_val_pand_residual_abs = prediction_val_pand_residual.abs()
            import math
            sqrt_residual = []
            for x in prediction_val_pand_residual_abs:
                sqrt_residual.append(math.sqrt(x))
                # print ("____________________  ",x)

            # calculating std deviation
            import statistics

            print(statistics.stdev(prediction_val_pand_residual))
            stdev_ = statistics.stdev(prediction_val_pand_residual)

            # calculate standardized residuals
            std_res = []
            for x in prediction_val_pand_residual:
                std_res.append(x / stdev_)
            print(std_res)

            # calculating the square root of std_res
            import math
            sqr_std_res = []
            for x in std_res:
                sqr_std_res.append(math.sqrt(abs(x)))
            print(sqr_std_res)

            scale_predict_residual = ''
            for pre, res in zip(prediction_val_pand_predict, sqr_std_res):
                scale_predict_residual += str(pre) + '\t' + str(res) + '\n'
            print(scale_predict_residual)
            # QUANTILE

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)

            quantile_std_res = spark.createDataFrame(std_res, FloatType())
            quantile_std_res.show()
            quantile_std_res_t = quantile_std_res.approxQuantile(
                'value', x, 0.01)
            print(quantile_std_res_t)
            print(x)
            # calculating the z_score
            from scipy.stats import norm

            ## sort the list
            sorted_std_res = sorted(std_res)

            mean = statistics.mean(sorted_std_res)
            stdev = statistics.stdev(sorted_std_res)
            # print(mean)
            quantile = []
            n = len(std_res)
            print(n)
            for x in range(1, n + 1):
                quantile.append((x - 0.5) / n)

            print(quantile)
            # z-score theoretical
            z_theory = []
            for x in quantile:
                z_theory.append(norm.ppf(x))
            # z score for real val
            z_pract = []
            for x in sorted_std_res:
                z_pract.append((x - mean) / stdev)
            Q_label_pred = ''
            for quant, val in zip(z_theory, z_pract):
                Q_label_pred += str(quant) + '\t' + str(val) + '\n'
            graph_response = {
                "Q_Q_plot": Q_label_pred,
                "residual_fitted": fitted_residual,
                "scale_location": scale_predict_residual
            }

            json_response = {
                'table_data': table_response,
                'graph_data': graph_response
            }

            return json_response

        except Exception as e:
            print('exception is =' + str(e))
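
# A minimal optional sketch (added; not part of the example above): the manual regParam
# sweep could instead be expressed with Spark's ParamGridBuilder and CrossValidator.
# The names below (train_df, label_col, reg_param_grid) are illustrative placeholders,
# not identifiers from the original code.
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


def fit_lasso_with_cv(train_df, label_col, reg_param_grid):
    # elasticNetParam=1.0 keeps the L1-only (lasso) penalty used in the example above
    lasso = LinearRegression(featuresCol="vectorIndexedFeatures",
                             labelCol=label_col, elasticNetParam=1.0)
    grid = ParamGridBuilder().addGrid(lasso.regParam, list(reg_param_grid)).build()
    cv = CrossValidator(estimator=lasso, estimatorParamMaps=grid,
                        evaluator=RegressionEvaluator(labelCol=label_col, metricName="r2"),
                        numFolds=3)
    # bestModel is the LinearRegressionModel refit with the best-scoring regParam
    return cv.fit(train_df).bestModel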
Пример #18
0
# deal with categorical label
from pyspark.ml.feature import StringIndexer

# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)
labelIndexer.transform(data).show(5, True)

from pyspark.ml.feature import VectorIndexer

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", \
                               outputCol="indexedFeatures", \
                               maxCategories=4).fit(data)
featureIndexer.transform(data).show(5, True)
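
# Optional added check (a sketch, not in the original snippet): the fitted VectorIndexerModel
# exposes categoryMaps, which lists the feature indices it chose to treat as categorical.
print("Features treated as categorical:", sorted(featureIndexer.categoryMaps.keys()))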

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# split data into training and test data sets
(trainingData, testData) = data.randomSplit([0.6, 0.4])
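
# A hedged sketch (added, not part of the original snippet): the indexers and the label
# converter defined above are normally chained with a classifier in a Pipeline; the
# DecisionTreeClassifier here is only an illustrative choice.
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt, labelConverter])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
predictions.select("predictedLabel", "label", "features").show(5)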

# visualization
import numpy as np
import itertools
import matplotlib.pyplot as plt


def plot_confusion_matrix(cm, classes,
			spark.stop()
			data["trainSI"] = trainPath
			data["testSI"] = testPath

			data["currentTrain"] = trainPath
			data["currentTest"] = testPath

		elif config["transformerType"] == "vi":

			train, test = spark.read.parquet(data["currentTrain"]), spark.read.parquet(data["currentTest"])
			train.cache()
			test.cache()

			df = train.unionByName(test)
			featureIndexer = VectorIndexer(inputCol=config["inputCol"], outputCol=config["outputCol"], maxCategories=config["maxCategories"]).fit(df)
			train = featureIndexer.transform(train)
			test = featureIndexer.transform(test)

			trainPath = data['scheme'] + "://" + data['save'] + "/trainVI/"
			testPath = data['scheme'] + "://" + data['save'] + "/testVI/"
			if "partitionCol" in data and data['partitionCol'] in train.schema.names:
				train.write.partitionBy(data['partitionCol']).format("parquet").save(trainPath)
				test.write.partitionBy(data['partitionCol']).format("parquet").save(testPath)
			else:
				train.write.format("parquet").mode("overwrite").save(trainPath)
				test.write.format("parquet").mode("overwrite").save(testPath)
			spark.stop()

			data["trainVI"] = trainPath
			data["testVI"] = testPath
Пример #20
0
#transforms data into vectors
def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1])]).toDF(['features'])


transformed = transData(df)
transformed.show(5, False)

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", \
                               outputCol="indexedFeatures",\
                               maxCategories=4).fit(transformed)

data = featureIndexer.transform(transformed)

#create a kmeans stage
kmeans = KMeans() \
          .setK(3) \
          .setFeaturesCol("indexedFeatures")\
          .setPredictionCol("cluster")

# Chain indexer and kmeans in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, kmeans])

#fit pipeline
model = pipeline.fit(transformed)

#transform data
cluster = model.transform(transformed)
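
# Optional follow-up (a sketch added here, not part of the original example): the clusters
# can be scored with the silhouette metric; 'cluster' matches the prediction column set
# on the KMeans stage above.
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(featuresCol="indexedFeatures", predictionCol="cluster")
print("Silhouette score:", evaluator.evaluate(cluster))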
Пример #21
0
class adresDefteri(kisi):
 def __init__(self):
  self.directory = 'sonuclar'
  self.createFolder()
  self.sc = SparkContext('local')
  spark = SparkSession(self.sc)
  spark = SparkSession \
                  .builder \
                  .appName("Python Spark Logistic Regression example") \
                  .config('spark.executor.heartbeatInterval', '3600s') \
                  .config("spark.some.config.option", "some-value") \
                  .getOrCreate()
  locale = self.sc._jvm.java.util.Locale
  locale.setDefault(locale.forLanguageTag("en-US"))
  
  self.catcols = ['targtype1_txt']
  self.num_cols = ['country', 'region','attacktype1','weaptype1']
  self.labelCol = 'gname'

  Root=Tk()
  Root.geometry("800x600")
  Root.title("Yaşanan Terör Olaylarını İçeren Büyük Verinin Makine Öğrenmesi Teknikleri İle Analizi")

  menu = Menu(Root)
  filemenu = Menu(menu)
  menu.add_cascade(label="File", menu=filemenu)
  filemenu.add_command(label="CSV View", command=self.secVeGoster)
  filemenu.add_separator()
  filemenu.add_command(label="Çıkış", command=Root.quit)
  
  filemenu.add_separator()
  filemenu.add_command(label="Yeniden Başlat", command=self.restart_program)

  helpmenu = Menu(menu)
  menu.add_cascade(label="Yardım", menu=helpmenu)
  helpmenu.add_command(label="Hakkında...", command=self.Hakkinda)

  
  Root.configure(background='yellow',menu=menu)
  global HakkindaPencere,combo
  self.nameText = StringVar()
  self.selected1 = IntVar()
  self.selected1.set(1)
  self.selected2 = IntVar()
  self.selected2.set(3)

  
  self.egitimLbl=Label(text="  Eğitim Verisi",width=30,height=3,fg="red",bg="yellow")
  self.egitimLbl.grid(row=0,column=0)

  self.egitimTxt=Entry(textvariable = self.nameText, fg="red",bg="yellow")
  self.egitimTxt.grid(row=0,column=1)

  self.egitimSec=Button(text="  ...  ",command=self.secim,width=10,height=1,fg="red",bg="yellow")
  self.egitimSec.grid(row=0,column=2)

  self.dataSayisiLbl=Label(text="  Data Sayısı Girin (Maks:181600)",width=30,height=3,fg="red",bg="yellow")
  self.dataSayisiLbl.grid(row=0,column=3)

  self.dataSayisiTxt=Entry(fg="red",bg="yellow")
  self.dataSayisiTxt.grid(row=0,column=4)

  self.testLbl=Label(text="  Test Verisi Oranı %",width=30,height=3,fg="red",bg="yellow")
  self.testLbl.grid(row=1,column=0)

  self.testTxt=Entry(fg="red",bg="yellow")
  self.testTxt.grid(row=1,column=1)  

  self.algoritmaLbl=Label(text="  Algoritma Seçiniz:",width=30,height=3,fg="red",bg="yellow")
  self.algoritmaLbl.grid(row=2,column=0)


  self.rad1 = Radiobutton(text='Hepsini karşılaştır',variable=self.selected1, value=1,command=self.secilenRadio1)
  self.rad1.grid(column=1, row=2)

  self.rad2 = Radiobutton(text='Bir Algoritma Seçiniz:',variable=self.selected1, value=2,command=self.secilenRadio1)
  self.rad2.grid(column=2, row=2)
  
  self.combo = ttk.Combobox (Root, state='readonly')
  self.combo['values']= ("Logistic Regression", "Naive Bayes", "Random Forest Classifier", "Decision Tree Classifier","Support Vector Machine","KNN" )
  #self.combo.current(-1) #set the selected item
  #self.combo.grid(column=3, row=2)
  

  self.ulkeLbl=Label(text="  Ülke Seçiniz:",width=30,height=3,fg="red",bg="yellow")
  self.ulkeLbl.grid(row=3,column=0)

  self.rad3 = Radiobutton(text='Tüm Ülkeler İçin', variable=self.selected2, value=3,command=self.secilenRadio2)
  self.rad3.grid(column=1, row=3)
  
  self.rad4 = Radiobutton(text='Ülke Seçin:', variable=self.selected2, value=4,command=self.secilenRadio2)
  self.rad4.grid(column=2, row=3)
  
  self.comboulke = ttk.Combobox (Root, state='readonly')
  self.comboulke['values']= ("Türkiye", "ABD", "İran", "Pakistan", "Irak","Afganistan","Suriye")
  #self.comboulke.grid(column=3, row=3)

  #209 Turkey
  #217 ABD
  #94 İran
  #153 Pakistan
  #95 Irak
  #4 Afganistan
  #200 Suriye

  #self.comboulke.current(1) #set the selected item
  #image=photo3 adds the image, compound=LEFT pulls the image to the left

  self.YukleBtn=Button(text="Veriyi Yükle", command=self.secilenDosya,width=20,height=3,fg="red",bg="yellow")
  self.YukleBtn.grid(row=4,column=2)

  self.DonusumBtn=Button(text="  Dönüşümü Başlat  ", command=self.DonusumuBaslat,width=20,height=3,fg="red",bg="yellow")
  self.DonusumBtn.grid(row=5,column=2)

  self.ModelBtn=Button(text="  Modeli Eğit  ", command=self.modeliEgit,width=20,height=3,fg="red",bg="yellow")
  self.ModelBtn.grid(row=6,column=2)

  self.SonucBtn=Button(text="  Sonucu Göster  ", command=self.csvView,width=20,height=3,fg="red",bg="yellow")
  self.ExportCsvBtn=Button(text="  Export CSV  ", command=self.exportCSV,width=20,height=3,fg="red",bg="yellow")


  #self.listele=Button(text="Listele",command=self.listele,width=30,height=3,fg="red",bg="yellow")
  #self.listele.grid(row=7,column=0)



  
  mainloop()	
     

 def restart_program(self):
     #os.execv(sys.executable, ['python'] + sys.argv)
     import _winapi
     x = _winapi.GetCurrentProcess()
     _winapi.ExitProcess(x)

     #self.egitimTxt.delete(0, END)
     #self.dataSayisiTxt.delete(0, END)
     #self.comboulke.config(state=DISABLED)
     #self.combo.config(state=DISABLED)
     #self.YukleBtn.grid(row=4,column=2)
     #self.DonusumBtn.grid(row=5,column=2)
     #self.ModelBtn.grid(row=6,column=2)
     #self.SonucBtn.grid_remove()
     #self.ExportCsvBtn.grid_remove()

 def returnUlkeInt(self):
    self.comboUlkeDeger =self.comboulke.current()
    if self.comboUlkeDeger==0:
       return 209 
    elif self.comboUlkeDeger==1:
       return 217
    elif self.comboUlkeDeger==2:
       return 94
    elif self.comboUlkeDeger==3:
       return 153
    elif self.comboUlkeDeger==4:
       return 95
    elif self.comboUlkeDeger==5:
       return 4
    elif self.comboUlkeDeger==6:
       return 200
    else:
       return -1
    
    #209 Turkey
    #217 ABD
    #94 İran
    #153 Pakistan
    #95 Irak
    #4 Afganistan
    #200 Suriye


 def exportCSV(self):
    path = 'sonuclar'
    output_file = os.path.join(path,'Combined Book.csv')
    export_file_path = filedialog.asksaveasfilename(defaultextension='.csv')
    self.predictions.toPandas().to_csv(export_file_path, sep=",", float_format='%.2f',index=False, line_terminator='\n',encoding='utf-8')
 
 def skorEkle(self):

         self.algoritma=self.combo.get()
         self.trainDataCount=self.trainingData.count()
         self.testDataCount=self.testData.count()
         self.dogrulukOrani=self.accuracy
         self.hataOrani=self.testError
         self.hesaplamaSuresi=self.tt
         self.egitilmeZamani=self.tt2
         self.f1Score=self.f1
         self.precisionSkor=self.wp
         self.recallScore=self.wr

         self.train_dogrulukOrani=self.train_accuracy
         self.train_hataOrani=self.train_Error
         self.train_hesaplamaSuresi=self.te
         self.train_egitilmeZamani=self.te2
         self.train_f1Score=self.train_f1
         self.train_precisionSkor=self.train_wp
         self.train_recallScore=self.train_wr

         self.tarihbug = str(datetime.now().strftime("%d.%m.%y_%H_%M"))
         temp1 = open("sonuclar.txt", "a")
         temp1.write("Algoritma:" +self.algoritma +" " +"Eğitim Data Sayısı: "  +str(self.trainDataCount) +" " +"Test Data Sayısı: " +str(self.testDataCount) +" " +"Dogruluk Orani: " +str(self.dogrulukOrani)
         +" " +"Hata Orani: " +str(self.hataOrani)  +" " +"Hesaplama Süresi: " +str(self.hesaplamaSuresi) +" sn "  +" " +"Egitilme zamani: " +str(self.egitilmeZamani) +" sn "   +" " +"F1 Skoru: " +str(self.f1Score)
         +" " +"Precision Skor: " +str(self.precisionSkor) +" " +"Recall Score: " +str(self.recallScore))
         temp1.write("\n")              
         messagebox.showinfo("Bilgi","%s algoritmasi listeye eklendi"%self.algoritma)
         path = "sonuclar"
         self.pathSave = path +'/' +self.algoritma+'_'+self.tarihbug +'.csv'
         
         with open(self.pathSave, mode='w') as csv_file:
             fieldnames = ['Algoritma', 'Data Sayısı', 'Dogruluk Orani', 'Hata Orani', 'Hesaplama Süresi', 'Egitilme zamani', 'F1 Skoru','Precision Skor', 'Recall Score']
             writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
             writer.writeheader()

             writer.writerow({'Algoritma': ''+self.algoritma+' (Egitim) ', 'Data Sayısı': ''+str(self.trainDataCount), 'Dogruluk Orani': ''+str(self.train_dogrulukOrani),
                              'Hata Orani': ''+str(self.train_hataOrani), 'Hesaplama Süresi': ''+str(self.train_hesaplamaSuresi), 'Egitilme zamani': ''+str(self.train_egitilmeZamani), 'F1 Skoru': ''+str(self.train_f1Score),
                              'Precision Skor': ''+str(self.train_precisionSkor), 'Recall Score': ''+str(self.train_recallScore)})

             
             writer.writerow({'Algoritma': ''+self.algoritma+'(Test) ', 'Data Sayısı': ''+str(self.testDataCount), 'Dogruluk Orani': ''+str(self.dogrulukOrani),
                              'Hata Orani': ''+str(self.hataOrani), 'Hesaplama Süresi': ''+str(self.hesaplamaSuresi), 'Egitilme zamani': ''+str(self.egitilmeZamani), 'F1 Skoru': ''+str(self.f1Score),
                              'Precision Skor': ''+str(self.precisionSkor), 'Recall Score': ''+str(self.recallScore)})
             #writer.write("\n")


             messagebox.showinfo("Bilgi","%s algoritmasi CSV olarak eklendi"%self.algoritma)    




 def secVeGoster(self,event=None):
      self.filename = filedialog.askopenfilename(initialdir = "/",title = "Select file",filetypes = (("csv files","*.csv"),("all files","*.*")))
      print (self.filename)
      self.pathSave = self.filename
      self.csvView()

 def csvView(self):

   import tkinter
   import csv
   root = Tk()
   root.title("Sonuç Görüntüleme")
   path = "sonuclar"
   # open file
   with open(self.pathSave, mode='r') as file:
      reader = csv.reader(file)

      # r and c tell us where to grid the labels
      r = 0
      for col in reader:
         c = 0
         for row in col:
            # i've added some styling
            label = Label(root, width = 20, height = 3, \
                                  text = row, relief = tkinter.RIDGE)
            label.grid(row = r, column = c)
            c += 1
         r += 1

   root.mainloop()


 def listele(self):
        ListelePencere=Tk()
        ListelePencere.geometry("600x400")
        ListelePencere.title("Kişi Listeleme")
        ListelePencere.configure(background="red")
    
        liste=Text(ListelePencere,width="200",height="400",fg="white",bg="red",font="helvetica 12")
        liste.grid(row=0,column=0)
    
     
        satir_sayisi=0
        temp1 = open("sonuclar.txt", "r")
        readfile = temp1.read()
        liste.insert(END,readfile)

        
 def Hakkinda(self):
        HakkindaPencere=Tk()
        HakkindaPencere.geometry("700x50")
        HakkindaPencere.title("Barış KARABAY Fırat Üniversitesi Yazılım Mühendisliği Tez Projesi V2")
        HakkindaPencere.configure(background="red")

        self.baris=Label(HakkindaPencere,text="Bu Program Barış Karabay Tarafından Yapılmıştır \n Hiçbir Şekilde Paylaşılamaz ve Değiştirilemez. ",fg="black",bg="white")
        self.baris.grid(row=0,column=0)

 def secim(self,event=None):
      self.filename = filedialog.askopenfilename(initialdir = "/",title = "Select file",filetypes = (("csv files","*.csv"),("all files","*.*")))
      #ment = self.filename
      print (self.filename)
      #self.egitimTxt.set(self.filename)
      #self.['text']=self.filename
      self.nameText.set(self.filename)
      
 def secilenRadio1(self):
      print(self.selected1.get())
      if self.selected1.get()==1:
         #showinfo("Uyarı","birinci")
         self.combo.grid_remove()
      else:
         #self.combo.grid()
         self.combo.grid(column=3, row=2)

 def secilenRadio2(self):
      print(self.selected2.get())
      if self.selected2.get()==3:
         #showinfo("Uyarı","birinci")
         self.comboulke.grid_remove()
      else:
         #self.comboulke.grid()
         self.comboulke.grid(column=3, row=3)
                 
 def get_dummy(self):
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.sql.functions import col
    df = self.spark_df
    categoricalCols = self.catcols
    continuousCols = self.num_cols
    labelCol = self.labelCol
    indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
                 for c in categoricalCols ]
    # default setting: dropLast=True
    encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(),
                 outputCol="{0}_encoded".format(indexer.getOutputCol()))
                 for indexer in indexers ]
    assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                                + continuousCols, outputCol="features")
    pipeline = Pipeline(stages=indexers + encoders + [assembler])
    model=pipeline.fit(df)
    data = model.transform(df)
    data = data.withColumn('label',col(labelCol))
    data.show(5,False)
    return data.select('features','label')


 def secilenDosya(self):
      print(self.egitimTxt.get())
      self.dosya=self.egitimTxt.get()
      self.dataSayisicntr = self.dataSayisiTxt.get()
      if self.dosya==" " or self.dosya=='' or self.dataSayisicntr=='':
         messagebox.showinfo("Uyarı","Boş Olamaz")
      else:
         print(self.comboulke.current(), self.comboulke.get())
         #self.progress.start()
         messagebox.showinfo("Uyarı","Yükleme Başlatıldı")
         #self.progress.config(mode='indeterminate')
         self.dosya = str(self.dosya)
         self.dataSayisi = int(self.dataSayisiTxt.get())
         print(self.dosya)
         mySchema = StructType([ StructField("country", IntegerType(), True)\
                       ,StructField("region", IntegerType(), True)\
                       ,StructField("attacktype1", IntegerType(), True)\
                       ,StructField("targtype1_txt", StringType(), True)\
                       ,StructField("gname", StringType(), True)\
                       ,StructField("weaptype1", IntegerType(), True)])
         
         
         #egitim=pd.read_csv("D:/globalterrorismdb2.csv", usecols=[7, 9, 26, 27, 28, 35, 36, 40, 58, 68, 81], encoding='ISO-8859-1',low_memory=False)
         self.egitim=pd.read_csv(self.dosya, usecols=[7, 9, 28, 35, 58, 81], encoding='ISO-8859-1',low_memory=False,nrows=self.dataSayisi)
         #209 Turkey
         #217 ABD
         #94 İran
         #153 Pakistan
         #95 Irak
         #4 Afganistan
         #200 Suriye
         if self.comboulke.get() != '' or self.comboulke.get() != "":
            self.egitim = self.egitim[(self.egitim.country == self.returnUlkeInt())]
            messagebox.showinfo("Bilgi","%s ülkesi için eğitim ve test veri seti oluşturulacak"%self.comboulke.get()) 
         
         print("Girilen  Sayi dogru")
         print("Toplam  Sayisi")
         print (self.egitim.count())
         self.sqlContext = SQLContext(self.sc)
         self.spark_df = self.sqlContext.createDataFrame(self.egitim, schema=mySchema)
         self.YukleBtn.grid_remove()
         #self.progress.stop()
         messagebox.showinfo("Başarılıı","Yükleme Tamamlandı")

 def DonusumuBaslat(self):
         sp_df = self.spark_df
         messagebox.showinfo("Uyarı","Dönüşüm Başladı")
         self.data_f = self.get_dummy()
         self.data_f.show(25,False)
         self.labelIndexer = StringIndexer(inputCol='label',outputCol='indexedLabel').fit(self.data_f)
         self.labelIndexer.transform(self.data_f).show(25,False)
         self.featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures",maxCategories=4).fit(self.data_f)
         self.featureIndexer.transform(self.data_f).show(25,False)
         self.labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=self.labelIndexer.labels)
         if self.testTxt.get()=='':
             messagebox.showinfo("Hata","Lütfen Test oranını girin")
         else:
            deger = self.testTxt.get()
            testPoint=float(deger)/100
            (self.trainingData, self.testData) = self.data_f.randomSplit([1.0-testPoint, testPoint], seed = 100)
            messagebox.showinfo("Başarılı","Oran Hesaplandı")
            self.DonusumBtn.grid_remove()

 def createFolder(self):
    try:
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)
    except OSError:
        print ('Error: Creating directory. ' +  self.directory)

 def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

 def modeliEgit(self):
         print(self.combo.current(), self.combo.get())
         messagebox.showinfo("Bilgi","%s algoritması için model oluşturulacak"%self.combo.get()) 
         if self.combo.current()==0:
            self.LogicticRegressionClassifier()
         elif self.combo.current()==1: 
            self.NaiveBayesClassifier()
         elif self.combo.current()==2:
            self.RandomForestClassifier()
         elif self.combo.current()==3:
            self.DecisionTreeClassifier()
         elif self.combo.current()==4:
            self.SVMclassifier()
         elif self.combo.current()==5:
            self.KNNclassifier()
            
 def printMetrics(predictions_and_labels):
   metrics = MulticlassMetrics(predictions_and_labels)
   print('Precision of True ', metrics.precision(1))
   print('Precision of False', metrics.precision(0))
   print('Recall of True    ', metrics.recall(1))
   print('Recall of False   ', metrics.recall(0))
   print('F-1 Score         ', metrics.fMeasure())
   print('Confusion Matrix\n', metrics.confusionMatrix().toArray())
    
 def getPredictionsLabels(model, test_data):
   predictions = model.predict(test_data.map(lambda r: r.features))
   return predictions.zip(test_data.map(lambda r: r.label))
         
 def LogicticRegressionClassifier(self):
   self.t0 = time()
   print("********************************************************************************************************************************************")
   print("Logistic Regression")
   logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel',maxIter=20, regParam=0.3, elasticNetParam=0)
   pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, logr, self.labelConverter])
   model = pipeline.fit(self.trainingData)
   self.tm = time() - self.t0
   print ("Modeli egitme zamani {} saniye ".format(self.tm))
   self.t0 = time()
   self.predictions = model.transform(self.testData)
   self.tt = time() - self.t0
   print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt))

   self.t0 = time()
   predictions_train = model.transform(self.trainingData)
   self.te = time() - self.t0
   print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te))
   
   self.predictions.select("features", "label", "predictedLabel", "probability").show(5)
   evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
   
   self.t0 = time()
   self.accuracy = evaluator.evaluate(self.predictions)
   self.tt2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy))
   
   self.t0 = time()
   self.train_accuracy = evaluator.evaluate(predictions_train)
   self.te2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy))
   
   print("Test Dogruluk = %g" % (self.accuracy))
   self.testError = (1.0 - self.accuracy)
   print("Test Test Error = %g" % (1.0 - self.accuracy))

   print("Egitim Dogruluk = %g" % (self.train_accuracy))
   self.train_Error = (1.0 - self.train_accuracy)
   print("Egitim Error = %g" % (1.0 - self.train_accuracy))

   rfModel = model.stages[2]
   evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
   self.f1 = evaluatorf1.evaluate(self.predictions)
   self.train_f1 = evaluatorf1.evaluate(predictions_train)
   print("test f1 = %g" % self.f1)
   print("egitim f1 = %g" % self.train_f1)
 
   evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision")
   self.wp = evaluatorwp.evaluate(self.predictions)
   self.train_wp = evaluatorwp.evaluate(predictions_train)
   print("test weightedPrecision = %g" % self.wp)
   print("egitim weightedPrecision = %g" % self.train_wp)
 
   evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall")
   self.wr = evaluatorwr.evaluate(self.predictions)
   self.train_wr = evaluatorwr.evaluate(predictions_train)
   print("test weightedRecall = %g" % self.wr)
   print("egitim weightedRecall = %g" % self.train_wr)

   rfModel = model.stages[2]
   #print (rfModel._call_java('toDebugString'))
   rfModel = model.stages[2]
   #model.save("model2345678909")
   messagebox.showinfo("Başarılı","Model Eğitildi")
   self.skorEkle()
   self.ModelBtn.grid_remove()
   self.SonucBtn.grid(row=7,column=2)
   self.ExportCsvBtn.grid(row=8,column=2)
   
   #self.predictions.printSchema()
   #paramGrid = (ParamGridBuilder()
   # .addGrid(logr.regParam, [0.01, 0.1, 0.5]) \
   #  .addGrid(logr.maxIter, [10, 20, 50]) \
   #  .addGrid(logr.elasticNetParam, [0.0, 0.8]) \
   # .build())
   
   #crossval = CrossValidator(estimator=pipeline,
   #                       estimatorParamMaps=paramGrid,
   #                       evaluator=evaluator,
   #                       numFolds=3)
   #
   #model = crossval.fit(self.trainingData)
   #predictions = model.transform(self.testData)
   #accuracy = evaluator.evaluate(predictions)
   #print("Dogruluk = %g" % (accuracy))

 def DecisionTreeClassifier(self):
   self.t0 = time()
   print("********************************************************************************************************************************************")
   print("Decision Tree Classifier")
   dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",impurity="gini",maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                         cacheNodeIds=False, checkpointInterval=10)
   pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, dt, self.labelConverter])
   model = pipeline.fit(self.trainingData)
   self.tm = time() - self.t0
   print ("Modeli egitme zamani {} saniye ".format(self.tm))

   self.t0 = time()
   self.predictions = model.transform(self.testData)
   self.tt = time() - self.t0
   print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt))

   self.t0 = time()
   predictions_train = model.transform(self.trainingData)
   self.te = time() - self.t0
   print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te))
   
   self.predictions.select("features", "label", "predictedLabel", "probability").show(5)
   evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
   
   self.t0 = time()
   self.accuracy = evaluator.evaluate(self.predictions)
   self.tt2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy))
   
   self.t0 = time()
   self.train_accuracy = evaluator.evaluate(predictions_train)
   self.te2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy))
   
   print("Test Dogruluk = %g" % (self.accuracy))
   self.testError = (1.0 - self.accuracy)
   print("Test Test Error = %g" % (1.0 - self.accuracy))

   print("Egitim Dogruluk = %g" % (self.train_accuracy))
   self.train_Error = (1.0 - self.train_accuracy)
   print("Egitim Error = %g" % (1.0 - self.train_accuracy))

   rfModel = model.stages[2]
   evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
   self.f1 = evaluatorf1.evaluate(self.predictions)
   self.train_f1 = evaluatorf1.evaluate(predictions_train)
   print("test f1 = %g" % self.f1)
   print("egitim f1 = %g" % self.train_f1)
 
   evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision")
   self.wp = evaluatorwp.evaluate(self.predictions)
   self.train_wp = evaluatorwp.evaluate(predictions_train)
   print("test weightedPrecision = %g" % self.wp)
   print("egitim weightedPrecision = %g" % self.train_wp)
 
   evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall")
   self.wr = evaluatorwr.evaluate(self.predictions)
   self.train_wr = evaluatorwr.evaluate(predictions_train)
   print("test weightedRecall = %g" % self.wr)
   print("egitim weightedRecall = %g" % self.train_wr)

   rfModel = model.stages[2]
   #print (rfModel._call_java('toDebugString'))
   messagebox.showinfo("Başarılı","Model Eğitildi")
   self.skorEkle()
   self.ModelBtn.grid_remove()
   self.SonucBtn.grid(row=7,column=2)
   self.ExportCsvBtn.grid(row=8,column=2)
   
 def NaiveBayesClassifier(self):
   print("********************************************************************************************************************************************")
   self.t0 = time()
   print("Bayes")
   nb = NaiveBayes(featuresCol='indexedFeatures', labelCol='indexedLabel', smoothing=1.0, modelType="multinomial")
   pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, nb, self.labelConverter])
   model = pipeline.fit(self.trainingData)
   self.tm = time() - self.t0
   print ("Modeli egitme zamani {} saniye ".format(self.tm))

   self.t0 = time()
   self.predictions = model.transform(self.testData)
   self.tt = time() - self.t0
   print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt))

   self.t0 = time()
   predictions_train = model.transform(self.trainingData)
   self.te = time() - self.t0
   print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te))
   
   self.predictions.select("features", "label", "predictedLabel", "probability").show(5)
   evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
   
   self.t0 = time()
   self.accuracy = evaluator.evaluate(self.predictions)
   self.tt2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy))
   
   self.t0 = time()
   self.train_accuracy = evaluator.evaluate(predictions_train)
   self.te2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy))
   
   print("Test Dogruluk = %g" % (self.accuracy))
   self.testError = (1.0 - self.accuracy)
   print("Test Test Error = %g" % (1.0 - self.accuracy))

   print("Egitim Dogruluk = %g" % (self.train_accuracy))
   self.train_Error = (1.0 - self.train_accuracy)
   print("Egitim Error = %g" % (1.0 - self.train_accuracy))

   rfModel = model.stages[2]
   evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
   self.f1 = evaluatorf1.evaluate(self.predictions)
   self.train_f1 = evaluatorf1.evaluate(predictions_train)
   print("test f1 = %g" % self.f1)
   print("egitim f1 = %g" % self.train_f1)
 
   evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision")
   self.wp = evaluatorwp.evaluate(self.predictions)
   self.train_wp = evaluatorwp.evaluate(predictions_train)
   print("test weightedPrecision = %g" % self.wp)
   print("egitim weightedPrecision = %g" % self.train_wp)
 
   evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall")
   self.wr = evaluatorwr.evaluate(self.predictions)
   self.train_wr = evaluatorwr.evaluate(predictions_train)
   print("test weightedRecall = %g" % self.wr)
   print("egitim weightedRecall = %g" % self.train_wr)

   #print (rfModel._call_java('toDebugString'))
   messagebox.showinfo("Başarılı","Model Eğitildi")
   self.skorEkle()
   self.ModelBtn.grid_remove()
   self.SonucBtn.grid(row=7,column=2)
   self.ExportCsvBtn.grid(row=8,column=2)
 def RandomForestClassifier(self):
   print("********************************************************************************************************************************************")
   print("Random Forest")
   self.t0 = time()
   rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees = 100, maxDepth = 4, maxBins = 32,impurity="entropy")
   pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, rf, self.labelConverter])
   model = pipeline.fit(self.trainingData)
   self.tm = time() - self.t0
   print ("Modeli egitme zamani {} saniye ".format(self.tm))

   self.t0 = time()
   self.predictions = model.transform(self.testData)
   self.tt = time() - self.t0
   print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt))

   self.t0 = time()
   predictions_train = model.transform(self.trainingData)
   self.te = time() - self.t0
   print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te))
   
   self.predictions.select("features", "label", "predictedLabel", "probability").show(5)
   evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
   
   self.t0 = time()
   self.accuracy = evaluator.evaluate(self.predictions)
   self.tt2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy))
   
   self.t0 = time()
   self.train_accuracy = evaluator.evaluate(predictions_train)
   self.te2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy))
   
   print("Test Dogruluk = %g" % (self.accuracy))
   self.testError = (1.0 - self.accuracy)
   print("Test Test Error = %g" % (1.0 - self.accuracy))

   print("Egitim Dogruluk = %g" % (self.train_accuracy))
   self.train_Error = (1.0 - self.train_accuracy)
   print("Egitim Error = %g" % (1.0 - self.train_accuracy))

   rfModel = model.stages[2]
   evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
   self.f1 = evaluatorf1.evaluate(self.predictions)
   self.train_f1 = evaluatorf1.evaluate(predictions_train)
   print("test f1 = %g" % self.f1)
   print("egitim f1 = %g" % self.train_f1)
 
   evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision")
   self.wp = evaluatorwp.evaluate(self.predictions)
   self.train_wp = evaluatorwp.evaluate(predictions_train)
   print("test weightedPrecision = %g" % self.wp)
   print("egitim weightedPrecision = %g" % self.train_wp)
 
   evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall")
   self.wr = evaluatorwr.evaluate(self.predictions)
   self.train_wr = evaluatorwr.evaluate(predictions_train)
   print("test weightedRecall = %g" % self.wr)
   print("egitim weightedRecall = %g" % self.train_wr)

   rfModel = model.stages[2]
   #print (rfModel._call_java('toDebugString'))
   messagebox.showinfo("Başarılı","Model Eğitildi")
   self.skorEkle()
   self.ModelBtn.grid_remove()
   self.SonucBtn.grid(row=7,column=2)
   self.ExportCsvBtn.grid(row=8,column=2)
   
   svm = LinearSVC(maxIter=5, regParam=0.01)
   LSVC = LinearSVC()
   ovr = OneVsRest(classifier=LSVC)
   paramGrid = ParamGridBuilder().addGrid(LSVC.maxIter, [10, 100]).addGrid(LSVC.regParam,[0.001, 0.01, 1.0,10.0]).build()
   crossval = CrossValidator(estimator=ovr,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=MulticlassClassificationEvaluator(metricName="f1"),
                                  numFolds=2)
   Train_sparkframe = self.trainingData.select("features", "label")
   cvModel = crossval.fit(Train_sparkframe)
   bestModel = cvModel.bestModel

   

 def SVMclassifier(self):
   print("********************************************************************************************************************************************")
   self.t0 = time()
   print("SVM")
   df = self.egitim
   df['gname_id'] = df['gname'].factorize()[0]
   df['weaptype1_id'] = df['weaptype1'].factorize()[0]
   df['targtype1_txt_id'] = df['targtype1_txt'].factorize()[0]
   # targsubtype1 is not among the columns read in secilenDosya, so it is not factorized here
   X = df.iloc[:, [0, 1, 2, 7, 8]].values
   y = df.iloc[:, 6].values
   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
   scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
   X_train = scaling.transform(X_train)
   X_test = scaling.transform(X_test)
   classifier = SVC(kernel='linear',cache_size=7000, random_state = 0)
   classifier.fit(X_train, y_train)
   self.tt = time() - self.t0
   print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.tt))
   self.t0 = time()
   y_pred = classifier.predict(X_test)
   accuracy = accuracy_score(y_test, y_pred)
   self.tt2 = time() -self.t0
   print(accuracy)
   print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, accuracy))

 def KNNclassifier(self):
   print("********************************************************************************************************************************************")
   from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
   from sklearn.preprocessing import StandardScaler
   from sklearn.model_selection import train_test_split
   from sklearn.metrics import accuracy_score
   from sklearn.preprocessing import MinMaxScaler
   from sklearn.ensemble import RandomForestClassifier
   from sklearn.tree import DecisionTreeClassifier
   from sklearn.svm import SVC
   from sklearn.neighbors import KNeighborsClassifier
   from sklearn.linear_model import LogisticRegression
   from sklearn.naive_bayes import GaussianNB
   from sklearn.metrics import classification_report
   from sklearn.metrics import confusion_matrix
   from sklearn.preprocessing import LabelEncoder

   
   print("KNN")
   df = self.egitim

   df['gname_id'] = df['gname'].factorize()[0]
   df['weaptype1_id'] = df['weaptype1'].factorize()[0]
   df['targtype1_txt_id'] = df['targtype1_txt'].factorize()[0]
   print("Toplam  Sayisi")
   #print (df.count())

   X = df.iloc[:, [0,1,2,7,8]].values
   y = df.iloc[:, 6].values
   #print(df.iloc[:, 6])
   #print(df.columns)
   #print(X)
   #print(y)
   #print(df['gname_id'])
   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
   scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
   X_train = scaling.transform(X_train)
   X_test = scaling.transform(X_test)
   classifier = KNeighborsClassifier(n_neighbors=9, metric='minkowski', p = 2)
   self.t0 = time()
   classifier.fit(X_train, y_train)
   self.tt = time() - self.t0
   print ("Veri kumesini egitim zamani {} saniye ".format(self.tt))
   self.t0 = time()
   y_pred = classifier.predict(X_test)
   self.tt = time() - self.t0
   print ("test verisini siniflandirma zamani {} saniye ".format(self.tt))
   self.t0 = time()
   x_pred = classifier.predict(X_train)
   self.tt = time() - self.t0
   print ("egitim verisini siniflandirma zamani {} saniye ".format(self.tt))

   accuracy = accuracy_score(y_test, y_pred)
   accuracy_egitim = accuracy_score(y_train, x_pred)
   self.tt2 = time() -self.t0

   print ('Test Accuracy:', accuracy)
   print ('Egitim Accuracy:', accuracy_egitim)
   #print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, accuracy))
   #print 'Accuracy:', accuracy_score(y_test, y_pred)
   print ('Test F1 score:', f1_score(y_test, y_pred,average='weighted'))
   print ('Test Recall:', recall_score(y_test, y_pred,
                                 average='weighted'))
   print ('Test Precision:', precision_score(y_test, y_pred,
                                       average='weighted'))

   print ('egitim F1 score:', f1_score(y_train, x_pred,average='weighted'))
   print ('egitim Recall:', recall_score(y_train, x_pred,
                                 average='weighted'))
   print ('egitim Precision:', precision_score(y_train, x_pred,
                                       average='weighted'))

   #print '\n clasification report:\n', classification_report(y_test, y_pred)
   #print '\n confussion matrix:\n',confusion_matrix(y_test, y_pred)

   print("********************************************************************************************************************************************")