Example #1
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys",
        "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl",
        "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance",
        "ShotPower", "Jumping", "Stamina", "Strength", "LongShots",
        "Aggression", "Interceptions", "Positioning", "Vision", "Penalties",
        "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving",
        "GKHandling", "GKKicking", "GKPositioning", "GKReflexes"
    ],
                                outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=4)

    df = indexer.fit(df).transform(df)

    # Select the features that contribute the most to the model
    selector = ChiSqSelector(numTopFeatures=5,
                             featuresCol="indexedFeatures",
                             labelCol="Position",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
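
A minimal driver sketch for the function above, assuming a FIFA-style players CSV; the path, the SparkSession setup, and the imports are assumptions, not part of the original example:

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, VectorIndexer, ChiSqSelector

spark = SparkSession.builder.appName("PlayerFeatureSelection").getOrCreate()
# Hypothetical input file; all attribute columns must already be numeric,
# and "Position" must be a numeric label for ChiSqSelector.
df = spark.read.csv("players.csv", header=True, inferSchema=True)
feature_selection(df)
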
Example #2
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "Edad", "Genero", "Zona", "Fumador_Activo",
        "ultimo_estado_de_Glicemia", "Enfermedad_Coronaria",
        "Tension_sistolica", "Tension_diastolica", "Colesterol_Total",
        "Trigliceridos", "Clasificacion_RCV_Global", "Glicemia_de_ayuno",
        "Perimetro_Abdominal", "Peso", "IMC", "CLAIFICACION_IMC", "Creatinina",
        "Factor_correccion", "Proteinuria", "Farmacos_Antihipertensivos",
        "Estatina", "Antidiabeticos", "Adherencia_tratamiento"
    ],
                                outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=15)

    df = indexer.fit(df).transform(df)

    # Select the features that contribute the most to the model
    selector = ChiSqSelector(numTopFeatures=15,
                             featuresCol="indexedFeatures",
                             labelCol="Diabetes",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show(100)
Example #3
    def test_chi_sq_selector(self):
        data = self.spark.createDataFrame(
            [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
             (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
             (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
            ["features", "label"])
        selector = ChiSqSelector(numTopFeatures=1,
                                 outputCol="selectedFeatures")
        model = selector.fit(data)

        # the input name should match StringIndexer.inputCol
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml ChiSqSelector',
            [('features', FloatTensorType([None, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().selectedFeatures.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlChiSqSelector")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['selectedFeatures'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #4
def feature_selector_process(spark, ml_df, spark_artefacts_dir, run_mode, i,
                             feature_cols):

    # APPLY CHI-SQUARE SELECTOR
    name = f"ChiSquareSelectorModel_{i}"
    selector_model_path = Path(spark_artefacts_dir).joinpath(name)

    if run_mode == 'first':

        # ChiSq test to obtain chi-square values (higher -> stronger dependence between feature and label -> better)
        r = ChiSquareTest.test(ml_df, "features", "label")
        pValues = r.select("pvalues").collect()[0][0].tolist()
        stats = r.select("statistics").collect()[0][0].tolist()
        dof = r.select("degreesOfFreedom").collect()[0][0]

        # ChiSq Selector
        selector = ChiSqSelector(numTopFeatures=10,
                                 featuresCol="features",
                                 outputCol="selected_features",
                                 labelCol="label")
        selector_model = selector.fit(ml_df)
        selector_model.write().overwrite().save(
            str(selector_model_path.absolute()))

        top_10_features_importance = []
        top_10_features = []
        for j in selector_model.selectedFeatures:
            top_10_features.append(feature_cols[j])
            # Interleave each selected feature's name with its chi-square statistic
            top_10_features_importance.append(feature_cols[j])
            top_10_features_importance.append(stats[j])

        model_info = [
            name,
            ml_df.count(), None, None, None, None, None, None, None
        ] + top_10_features_importance
        model_info_df = spark.createDataFrame(data=[model_info],
                                              schema=MODEL_INFO_SCHEMA)
        model_info_df.write.jdbc(CONNECTION_STR,
                                 'model_info',
                                 mode='append',
                                 properties=CONNECTION_PROPERTIES)

    elif run_mode == 'incremental':
        selector_model = ChiSqSelectorModel.load(
            str(selector_model_path.absolute()))
        top_10_features = []
        for j in selector_model.selectedFeatures:
            top_10_features.append(feature_cols[j])

    ml_df_10 = selector_model.transform(ml_df)
    ml_df_10 = ml_df_10.drop("features")

    # Work around an incompatibility between ChiSqSelector output and tree-based
    # algorithms: rebuild rows with plain DenseVectors so the selector's column metadata is dropped
    ml_rdd_10 = ml_df_10.rdd.map(
        lambda row: Row(label=row[0], features=DenseVector(row[1].toArray())))
    ml_df_10 = spark.createDataFrame(ml_rdd_10)

    return ml_df_10, top_10_features
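
A hedged invocation sketch for feature_selector_process; the artefacts path and the assembled ml_df are assumptions, and MODEL_INFO_SCHEMA, CONNECTION_STR, and CONNECTION_PROPERTIES must already be defined at module level as in the original project:

# ml_df: DataFrame with a vector column "features" and a numeric "label" column.
ml_df_10, top_10_features = feature_selector_process(
    spark, ml_df, "/tmp/spark_artefacts", "first", 0, feature_cols)
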
Example #5
def Chi_sqr(dataset_add, feature_colm, label_colm):
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)

    dataset.show()

    # use RFormula for indexing, encoding and vectorizing

    # label_colm is expected to hold a single label column name
    label = label_colm[-1]

    print(label)

    # Build an R-style formula string: "label ~ f1+f2+...+fn"
    f = label + " ~ " + "+".join(feature_colm)

    formula = RFormula(formula=f, featuresCol="features", labelCol="label")

    length = len(feature_colm)

    output = formula.fit(dataset).transform(dataset)

    output.select("features", "label").show()

    # chi selector
    from pyspark.ml.feature import ChiSqSelector

    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="features",
                             outputCol="selected_features",
                             labelCol="label")

    result = selector.fit(output).transform(output)

    print("chi2 output with top %d features selected " %
          selector.getNumTopFeatures())
    result.show()

    # run the chi-square test on the selected features

    r = ChiSquareTest.test(result, "selected_features", "label").head()
    print("pValues: " + str(r.pValues))
    p_values = str(r.pValues)
    print("degreesOfFreedom: " + str(r.degreesOfFreedom))

    print("statistics: " + str(r.statistics))

    json_response = {'pvalues': p_values}

    return json_response


# Chi_sqr(dataset_add, features_colm, label_colm)
Example #6
    def chiSquareTest(self, categoricalFeatures, maxCategories):
        dataset=self.dataset
        labelColm=self.labelColm
        features=self.features
        length = len(features)

        featureassembler = VectorAssembler(
            inputCols=self.features,
            outputCol="featuresChiSquare", handleInvalid="skip")
        dataset= featureassembler.transform(dataset)

        vec_indexer = VectorIndexer(inputCol="featuresChiSquare",
                                    outputCol="vecIndexedFeaturesChiSquare",
                                    maxCategories=maxCategories,
                                    handleInvalid="skip").fit(dataset)

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))

        dataset = vec_indexer.transform(dataset)

        # finalized_data = dataset.select(labelColm, 'vecIndexedFeaturesChiSqaure')
        # finalized_data.show()

        # using chi selector
        selector = ChiSqSelector(numTopFeatures=length,
                                 featuresCol="vecIndexedFeaturesChiSquare",
                                 outputCol="selectedFeatures",
                                 labelCol=labelColm)

        result = selector.fit(dataset).transform(dataset)

        print("chi2 output with top %d features selected " % selector.getNumTopFeatures())
        result.show()

        # run the chi-square test on the selected features

        r = ChiSquareTest.test(result, "selectedFeatures", labelColm).head()
        p_values = list(r.pValues)
        PValues = [round(val, 4) for val in p_values]
        print(PValues)
        dof = list(r.degreesOfFreedom)
        stats = list(r.statistics)
        statistics = [round(val, 4) for val in stats]
        print(statistics)
        print(statistics)
        chiSquareDict = {}
        for pval, doF, stat, colm in zip(PValues, dof, statistics, categoricalFeatures):
            print(pval, doF, stat)
            chiSquareDict[colm] = pval, doF, stat
        chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
        print(chiSquareDict)

        result = {'pvalues': chiSquareDict}

        return result
Example #7
def feature_selection(t_data):
    # Feature selection
    css = ChiSqSelector(featuresCol='scaled_features',
                        outputCol='Aspect',
                        labelCol='output',
                        numTopFeatures=10)
    t_data = css.fit(t_data).transform(t_data)

    return t_data
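
Example #7 assumes its input already carries a 'scaled_features' vector and a numeric 'output' label; a hedged preparation sketch (the raw column names are assumptions):

from pyspark.ml.feature import VectorAssembler, StandardScaler

# raw_df: hypothetical DataFrame with numeric columns f1..f3 and a numeric label "output"
assembler = VectorAssembler(inputCols=["f1", "f2", "f3"], outputCol="raw_features")
scaler = StandardScaler(inputCol="raw_features", outputCol="scaled_features")
t_data = assembler.transform(raw_df)
t_data = scaler.fit(t_data).transform(t_data)
t_data = feature_selection(t_data)
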
Example #8
def clasificar_chi2():
    # Read the data and cast each column value to float
    conf = SparkConf().setAppName("NN_1").setMaster("local")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    rdd = sqlContext.read.csv(
        "/home/ulima-azure/data/Enfermedad_Oncologica_T3.csv", header=True).rdd
    rdd = rdd.map(lambda x: tuple(float(v) for v in x[:10]))

    df = rdd.toDF([
        "Cellenght", "Cellsize", "Cellshape", "mgadhesion", "sepics",
        "bnuclei", "bchromatin", "nucleos", "mitoses", "P_Benigno"
    ])
    # Build our vector assembler (features)
    assembler = VectorAssembler(inputCols=[
        "Cellenght", "Cellsize", "Cellshape", "nucleos", "bchromatin",
        "mitoses"
    ],
                                outputCol="featuresChi2")
    df_chi2 = assembler.transform(df)
    df_chi2 = df_chi2.select("featuresChi2", "P_Benigno")

    selector = ChiSqSelector(numTopFeatures=3,
                             featuresCol="featuresChi2",
                             labelCol="P_Benigno",
                             outputCol="featuresSelected")
    df_result = selector.fit(df_chi2).transform(df_chi2)

    # Split the data into training and test
    (df_training, df_test) = df_result.randomSplit([0.7, 0.3])

    # Define our network architecture (hyperparameter)
    capas = [3, 4, 6, 2]

    # Build the trainer
    # Hyperparameter: maxIter
    entrenador = MultilayerPerceptronClassifier(featuresCol="featuresSelected",
                                                labelCol="P_Benigno",
                                                maxIter=1000,
                                                layers=capas)
    # Train our model
    modelo = entrenador.fit(df_training)

    # Validate our model
    df_predictions = modelo.transform(df_test)
    evaluador = MulticlassClassificationEvaluator(labelCol="P_Benigno",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluador.evaluate(df_predictions)
    print(f"Accuracy: {accuracy}")

    df_predictions.select("prediction", "rawPrediction", "probability").show()

    # Show the counts of 0s and 1s among the predictions
    df_predictions.groupby('prediction').count().show()
Example #9
    def run_feature_selection_on(data):
        LOGGER.warning("Running feature selection.")
        selector = ChiSqSelector(numTopFeatures=10,
                                 featuresCol="features",
                                 outputCol="selectedFeatures",
                                 labelCol="label")

        data = selector.fit(data).transform(data).drop(
            'features').withColumnRenamed('selectedFeatures', 'features')
        LOGGER.warning("Ran feature selection.")
        return data
Example #10
def pre_processing(df):
    ''' feature selection '''
    selector = ChiSqSelector(numTopFeatures=1,
                             featuresCol="features",
                             outputCol="selectedFeatures",
                             labelCol="clicked")

    result = selector.fit(df).transform(df)

    print("ChiSqSelector output with top %d features selected" %
          selector.getNumTopFeatures())
    result.show()
Example #11
def getAllMeasure(rf, selectorData, featureCols):
    # numpy (np) and the evaluateLr helper are assumed to be imported/defined elsewhere
    measure = np.array(['  ', '  ', '  '])
    for i in range(1, len(featureCols) + 1):
        selector = ChiSqSelector(numTopFeatures=i, featuresCol="features",
                                 outputCol="selectedFeatures", labelCol="label")

        selectedData = selector.fit(selectorData).transform(selectorData)
        trainSelected, testSelected = selectedData.randomSplit([0.7, 0.3])
        rfModel = rf.fit(trainSelected)

        prediction = rfModel.transform(testSelected)
        evaluator = BinaryClassificationEvaluator()
        measure = np.vstack([evaluateLr(prediction, evaluator, i), measure])
    return measure
Example #12
def preprocess(inputCol=["text", "label"], n=4):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    remover = [StopWordsRemover(inputCol="words", outputCol="filtered")]
    ngrams = [
        NGram(n=i, inputCol="filtered", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=2**14,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=2) for i in range(1, n + 1)
    ]

    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="rawFeatures")
    ]
    label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")]
    selector = [
        ChiSqSelector(numTopFeatures=2**14,
                      featuresCol='rawFeatures',
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=1000)]
    return Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                    assembler + label_stringIdx + selector + lr)
示例#13
0
def appendselector(stages, percent=0.5):
    # A chi-square feature selector uses the chi-squared test of independence to decide
    # which features are the most "useful". In this case, 50% of the original number of
    # features is kept. With these Transformers, the stages for training hybrid classifiers
    # are set (a different Transformer for TF-IDF and for word-embedding text-based features).
    if (percent < 1.0):
        print("Appending Chi-Square to stages with percentage " + str(percent))
        selectorType = 'percentile'
        numTopFeatures = 50
        percentile = percent
    else:
        print("Appending Chi-Square to stage with numTopFeatures " +
              str(percent))
        selectorType = 'numTopFeatures'
        numTopFeatures = percent
        percentile = 0.1

    stages[-1].setOutputCol('prefeatures')
    selector = ChiSqSelector(numTopFeatures=numTopFeatures,
                             featuresCol='prefeatures',
                             outputCol='features',
                             selectorType=selectorType,
                             percentile=percentile)
    selectorstages = stages + [selector]
    return selectorstages
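
A hedged usage sketch for appendselector, assuming a preceding TF-IDF stage list whose last stage currently writes the 'features' column (the example stages are assumptions):

from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF

tokenizer = Tokenizer(inputCol="text", outputCol="words")
tf = HashingTF(inputCol="words", outputCol="features")  # renamed to "prefeatures" by appendselector
stages = appendselector([tokenizer, tf], percent=0.5)
pipeline = Pipeline(stages=stages)
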
Example #14
def MachineLearning(df):
    file_dataSVM = "G:/Projects/Spark-Machine-Learning/Spark Machine Learning/Spark Machine Learning/svm/"
    data = df.select(['Summary','Sentiment']).withColumnRenamed('Sentiment','label')
    data = data.withColumn('length',length(data['Summary']))
    # Basic sentence tokenizer
    tokenizer = Tokenizer(inputCol="Summary", outputCol="words")
   
    # Remove stop words
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_features")

    # Transform the dataset into term-frequency vectors
    cv = HashingTF(inputCol="filtered_features", outputCol="features1", numFeatures=1000)

    # Calculate IDF over the whole dataset
    idf = IDF(inputCol='features1', outputCol='tf_idf')
    
    normalizer = StandardScaler(inputCol="tf_idf", outputCol="normFeatures", withStd=True, withMean=False)
    selector = ChiSqSelector(numTopFeatures=150, featuresCol="normFeatures",
                         outputCol="selectedFeatures", labelCol="label")
    # Prepare the data for the Spark ML library
    cleanUp = VectorAssembler(inputCols=['selectedFeatures'], outputCol='features')
    # Chain all preprocessing stages into a single pipeline
    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf,normalizer,selector,cleanUp])
    pipelineModel = pipeline.fit(data)
    data = pipelineModel.transform(data)
    data.printSchema()
    train_data, test_data = data.randomSplit([0.7,0.3],seed=2018)

    lr = LogisticRegression(featuresCol="features", labelCol='label')
    lrModel = lr.fit(train_data)
    beta = np.sort(lrModel.coefficients)
    plt.plot(beta)
    plt.ylabel('Beta Coefficients')
    plt.show()

    trainingSummary = lrModel.summary
    roc = trainingSummary.roc.toPandas()
    plt.plot(roc['FPR'],roc['TPR'])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title('ROC Curve')
    plt.show()
    print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))



    pr = trainingSummary.pr.toPandas()
    plt.plot(pr['recall'],pr['precision'])
    plt.ylabel('Precision')
    plt.xlabel('Recall')
    plt.show()
    predictions = lrModel.transform(test_data)
    evaluator = BinaryClassificationEvaluator()
    print('Test Area Under ROC', evaluator.evaluate(predictions))
Example #15
def important_feature_selector(predicted):
    """Uses the Chi-Squared Test to select important features for classification, and prints them out.
    
    Params:
    - predicted (pyspark.sql.DataFrame): The dataset, with predictions
    """
    selector = ChiSqSelector(numTopFeatures=50,
                             featuresCol='presence_feature_set',
                             labelCol='label',
                             outputCol='selected_features',
                             selectorType='numTopFeatures')
    model = selector.fit(predicted)
    important_features = model.selectedFeatures
    with open('bag_of_words_labels.json', 'r') as bow_file:
        bow_labels = json.loads(
            bow_file.readlines()[0])  # There is only one line
    important_feature_labels = [
        bow_labels[index] for index in important_features
    ]
    print("=====Important Feature Labels=====")
    print(important_feature_labels)
Example #16
def pruebaChi(dataframe,
              categoricalCols,
              numericalCols,
              labelCol="TIPO PACIENTE"):
    """Función que hace todo el preprocesamiento de los datos
    categóricos de un conjunto de datos de entrenamiento (o no).
    :param train spark df: conjunto de datos de entrenamiento.
    :param categoricalCols list,array: conjunto de nombres de columnas categoricas del
        conjunto de datos train.
    :param numericalCols list,array: conjunto de nombres de columnas numéricas del 
        conjunto de datos train.
    :param labelCol str: variable objetivo o etiqueta

    :Returns spark dataframe con las columnas 'label' y 'features'
    """

    # encode all the categorical variables
    stages = []
    for categoricalCol in categoricalCols:
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + "Index")
        encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                                outputCol=categoricalCol + "ohe")
        stages += [stringIndexer, encoder]

    # target variable (label)
    label_strIdx = StringIndexer(inputCol=labelCol, outputCol="label")
    stages += [label_strIdx]

    # put all the covariates into a single vector
    assemblerInputs = [c + "ohe" for c in categoricalCols] + numericalCols
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="feat")
    stages += [assembler]

    # select the useful variables with ChiSqSelector
    selector = ChiSqSelector(featuresCol="feat",
                             outputCol="feature",
                             labelCol="label",
                             fpr=0.05,
                             selectorType='fpr')
    stages += [selector]

    # scale to 0-1
    scala = MinMaxScaler(inputCol="feature", outputCol="features")
    stages += [scala]

    # pipeline that runs the whole process
    pipe = Pipeline(stages=stages)
    pipeModel = pipe.fit(dataframe)
    df = pipeModel.transform(dataframe)

    # return our df with what we need
    return df
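
A hedged invocation sketch for pruebaChi; the patient DataFrame and its column lists are assumptions:

# patients_df: hypothetical DataFrame with the listed columns plus "TIPO PACIENTE"
cat_cols = ["SEXO", "ENTIDAD"]
num_cols = ["EDAD"]
prepared = pruebaChi(patients_df, cat_cols, num_cols, labelCol="TIPO PACIENTE")
prepared.select("label", "features").show(5)
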
Example #17
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
        "exang", "oldpeak", "slope", "ca", "thal"
    ],
                                outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=4)

    df = indexer.fit(df).transform(df)

    # Select the features that contribute the most to the model
    selector = ChiSqSelector(numTopFeatures=4,
                             featuresCol="indexedFeatures",
                             labelCol="target",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
Example #18
def feature_selection(df):
    # Create the VectorAssembler
    assembler = VectorAssembler(inputCols=[
        "EDAD", "GENERO", "ETNIA", "ZONA", "ESCOLARIDAD", "FUMADOR", "HAS",
        "HTADM", "GLICEMIA", "ENF_CORONARIA", "T_SISTOLICA", "T_DIASTOLICA",
        "COLESTEROL_TOTAL", "TRIGLICERIDOS", "RCV_GLOBAL", "GLICEMIA_AYUNO",
        "PERIMETRO_ABDOMINAL", "PESO", "TALLA", "IMC", "CREATININA",
        "MICROALBUMINURIA", "ESTADO_IRC", "FARMACOS_ANTIHIPERTENSIVOS"
    ],
                                outputCol="features")
    df = assembler.transform(df)

    # VectorIndexer
    indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    df = indexer.fit(df).transform(df)

    # Chi-square selector
    selector = ChiSqSelector(numTopFeatures=8,
                             featuresCol="indexedFeatures",
                             labelCol="DIABETES",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
Example #19
def initializePipeline(num_cols, cat_cols):

    cat_cols_index = []
    cat_cols_hoted = []
    for i in cat_cols:
        cat_cols_index.append(i + "_index")
        cat_cols_hoted.append(i + "_hoted")

    featureCols = []
    for i in num_cols:
        featureCols.append(i + "scaled")

    for i in cat_cols:
        featureCols.append(i + "_hoted")

    labelindexers = [StringIndexer(inputCol="Churn", outputCol="label")]
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index")
        for column in cat_cols
    ]
    oneHotEncoder = [
        OneHotEncoderEstimator(inputCols=cat_cols_index,
                               outputCols=cat_cols_hoted,
                               dropLast=False)
    ]
    # Assemble each numeric column on its own so it can be scaled individually
    assembler = [
        VectorAssembler(inputCols=[i], outputCol=i + "_indexe")
        for i in num_cols
    ]
    normalizers = [
        MinMaxScaler(inputCol=column + "_indexe", outputCol=column + "scaled")
        for column in num_cols
    ]
    featureAssembler = [
        VectorAssembler(inputCols=featureCols, outputCol="resultedfeatures")
    ]
    selector = [
        ChiSqSelector(numTopFeatures=13,
                      featuresCol="resultedfeatures",
                      outputCol="features",
                      labelCol="label")
    ]
    pipeline = Pipeline(stages=indexers + oneHotEncoder + assembler +
                        normalizers + featureAssembler + labelindexers +
                        selector)
    return pipeline
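
A hedged usage sketch for initializePipeline on a churn dataset; the column names are assumptions (note that OneHotEncoderEstimator ties this example to Spark 2.x):

num_cols = ["tenure", "MonthlyCharges"]
cat_cols = ["gender", "Contract"]
pipeline = initializePipeline(num_cols, cat_cols)
model = pipeline.fit(churn_df)  # churn_df: hypothetical DataFrame with a "Churn" column
prepared = model.transform(churn_df).select("label", "features")
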
Example #20
    def build_trigrams(input_cols=("text", "target"), n=3):
        logging.warning("Building trigram model.")
        tokenizer = [Tokenizer(inputCol=input_cols[0], outputCol="words")]
        ngrams = [
            NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
            for i in range(1, n + 1)
        ]

        cv = [
            CountVectorizer(vocabSize=2**14,
                            inputCol="{0}_grams".format(i),
                            outputCol="{0}_tf".format(i))
            for i in range(1, n + 1)
        ]
        idf = [
            IDF(inputCol="{0}_tf".format(i),
                outputCol="{0}_tfidf".format(i),
                minDocFreq=5) for i in range(1, n + 1)
        ]

        assembler = [
            VectorAssembler(
                inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
                outputCol="rawFeatures")
        ]
        label_string_idx = [
            StringIndexer(inputCol=input_cols[1], outputCol="label")
        ]
        selector = [
            ChiSqSelector(numTopFeatures=2**14,
                          featuresCol='rawFeatures',
                          outputCol="features")
        ]
        lr = [LogisticRegression(maxIter=100)]
        return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler +
                        label_string_idx + selector + lr)
Example #21
    for feature in feature_cols:
        indexed = feature + "_" + "indexed"
        indexed_cols.append(indexed)
        indexer = StringIndexer(inputCol=feature,
                                outputCol=indexed,
                                handleInvalid="keep",
                                stringOrderType="frequencyDesc")
        stages.append(indexer)

    stages.append(
        VectorAssembler(inputCols=indexed_cols,
                        outputCol="features",
                        handleInvalid="keep"))
    stages.append(
        ChiSqSelector(numTopFeatures=20,
                      labelCol="HasDetections",
                      featuresCol="features",
                      outputCol="selectedFeatures"))

    print("Performing model fitting")
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(df)
    df_features = model.transform(df)
    df_features.select("features", "selectedFeatures").show()

    print("Saving Pipeline Model")
    model.write().overwrite().save(pipeline_model_path)

    with open(feature_path, "wb") as f:
        pickle.dump(feature_cols, f)

features = model.stages[-1].selectedFeatures
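
The trailing line above pulls the selected indices from the fitted ChiSqSelectorModel; a hedged follow-up that maps them back to the original column names, reusing feature_cols from the fragment:

# Each selected index refers to a slot in the assembled vector, which follows
# the order of indexed_cols and therefore of feature_cols.
selected_names = [feature_cols[i] for i in features]
print(selected_names)
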
Example #22

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ChiSqSelectorExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,),
        (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,),
        (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,)], ["id", "features", "clicked"])

    selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                             outputCol="selectedFeatures", labelCol="clicked")

    result = selector.fit(df).transform(df)
    result.show()
    # $example off$

    spark.stop()
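
Per the upstream Spark documentation for this example, the selector keeps only the third feature, so selectedFeatures comes out as [18.0], [12.0], and [15.0] for the three rows.
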
Example #23
pe = PolynomialExpansion().setInputCol("features").setDegree(2).setOutputCol(
    "polyFeatures")
pe.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
  .transform(sales.select("Description", "CustomerId"))\
  .where("CustomerId IS NOT NULL")
# fittedCV is a CountVectorizerModel fit earlier in the source; it writes the "countVec" column
prechi = fittedCV.transform(tokenized)\
  .where("CustomerId IS NOT NULL")
chisq = ChiSqSelector()\
  .setFeaturesCol("countVec")\
  .setLabelCol("CustomerId")\
  .setNumTopFeatures(2)
chisq.fit(prechi).transform(prechi)\
  .drop("customerId", "Description", "DescOut").show()

# COMMAND ----------

fittedPCA = pca.fit(scaleDF)
fittedPCA.write().overwrite().save("/tmp/fittedPCA")

# COMMAND ----------

from pyspark.ml.feature import PCAModel

loadedPCA = PCAModel.load("/tmp/fittedPCA")
loadedPCA.transform(scaleDF).show()
Example #24
std_scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
scaled_df = std_scaler.fit(features_df).transform(features_df)
scaled_df.select("scaled_features").display()

# COMMAND ----------

# MAGIC %md ###Part 4: Feature Selection
# MAGIC Chi Square Selector

# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

chisq_selector = ChiSqSelector(numTopFeatures=1,
                               featuresCol="scaled_features",
                               outputCol="selected_features",
                               labelCol="cust_age")

result_df = chisq_selector.fit(scaled_df).transform(scaled_df)

result_df.select("selected_features").display()

# COMMAND ----------

# MAGIC %md Feature Selection using VectorSlicer

# COMMAND ----------

from pyspark.ml.feature import VectorSlicer

vec_slicer = VectorSlicer(inputCol="scaled_features",
                          outputCol="sliced_features",  # outputCol and indices are assumed;
                          indices=[0])                  # the original snippet is truncated here
Example #25
    class1_num = class1.count()
    class2_num = class2.count()
    fraction = 1.0 * class1_num / class2_num
    class2 = class2.sample(fraction)
    training_dataset_balanced = class1.union(class2)
    training_dataset_balanced.groupBy("_c41").count().show()

    ####### 14.1 ###
    converted_cols = ["s" + col for col in string_cols]
    assembler = VectorAssembler(inputCols=converted_cols + numerical_cols,
                                outputCol="features")
    labelIndexer = StringIndexer(inputCol="_c41", outputCol="label")
    #classifier = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10, maxBins=64, maxDepth= 5, subsamplingRate= 1.0  )  ## 14.2
    #classifier = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxBins=64)
    selector = ChiSqSelector(numTopFeatures=35,
                             featuresCol="features",
                             outputCol="selectedFeatures")
    classifier = NaiveBayes(
        smoothing=1.0
    )  # modelType="multinomial" is the default; with binary labels this parameter makes no difference
    pipeline = Pipeline(stages=indexers +
                        [assembler, labelIndexer, selector, classifier])
    model = pipeline.fit(training_dataset_balanced)
    #predictions = model.transform(dataset_testing) ##14.1
    predictions = model.transform(dataset_testing)  ## 14.2
    predictions.show(10, False)

    ###### 14.2 ####
    evaluator = BinaryClassificationEvaluator(labelCol="label",
                                              rawPredictionCol="prediction")
    accuracy = evaluator.evaluate(predictions)
Example #26
from pyspark.ml.feature import PolynomialExpansion
pe = PolynomialExpansion().setInputCol("features").setDegree(2)
pe.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
  .transform(sales.select("Description", "CustomerId"))\
  .where("CustomerId IS NOT NULL")
prechi = fittedCV.transform(tokenized)\
  .where("CustomerId IS NOT NULL")
chisq = ChiSqSelector()\
  .setFeaturesCol("countVec")\
  .setLabelCol("CustomerId")\
  .setNumTopFeatures(2)
chisq.fit(prechi).transform(prechi)\
  .drop("customerId", "Description", "DescOut").show()


# COMMAND ----------

fittedPCA = pca.fit(scaleDF)
fittedPCA.write().overwrite().save("/tmp/fittedPCA")


# COMMAND ----------

from pyspark.ml.feature import PCAModel
loadedPCA = PCAModel.load("/tmp/fittedPCA")
Example #27
    # assemble all features into feature vector
    features_assembler = VectorAssembler(inputCols=num_bool_features,
                                         outputCol="features")

    # Index labels, adding metadata to the label column.
    label_indexer = StringIndexer(inputCol="has_over_50k",
                                  outputCol="label").fit(processed_train_set)

    # Convert indexed labels back to original labels.
    label_converter = IndexToString(inputCol="prediction",
                                    outputCol="predicted_label",
                                    labels=label_indexer.labels)

    # - ChiSQ feature Selection
    selector = ChiSqSelector(numTopFeatures=20,
                             featuresCol="features",
                             outputCol="featuresSel",
                             labelCol="label")

    # - RandomForest model with parameter tuning using cross validation
    rf = RandomForestClassifier(labelCol="label",
                                featuresCol="featuresSel",
                                numTrees=20)

    # - Create ParamGrid for Cross Validation
    rf_param_grid = (ParamGridBuilder().addGrid(
        rf.maxDepth,
        [2, 3, 4, 5, 10, 20]).addGrid(rf.maxBins,
                                      [10, 20, 40, 80, 100]).build())

    # - Model Evaluation
    rf_eval = BinaryClassificationEvaluator(labelCol="label")
Example #28
    def add_vectorized_features(self, transform_type, min_df, max_df, isCHISQR,
                                chi_feature_num, num_features):
        '''
        Creates the pySpark feature pipeline and stores the vectorized data under the features column.
        Input: transform_type: {'tfidf','tfidf_bigram'}; minimum document frequency (min_df);
               whether to apply chi-squared feature reduction (isCHISQR); number of features kept
               by the chi-squared reduction (chi_feature_num); number of hashed features (num_features)
        Output: Returns the transformed dataframe with the label and features columns
        '''
        stages = []
        # This code transforms text into vectorized features

        # Tokenize review sentences into vectors of words
        regexTokenizer = RegexTokenizer(inputCol="reviewText",
                                        outputCol="words",
                                        pattern="\\W")

        stages += [regexTokenizer]

        #Remove stopwords from tokenized words
        #nltk.download('stopwords')
        from nltk.corpus import stopwords
        sw = stopwords.words('english')
        stopwordsRemover = StopWordsRemover(
            inputCol="words", outputCol="filtered").setStopWords(sw)

        #lemmatizer = WordNetLemmatizer()
        #doc = [lemmatizer.lemmatize(token) for token in doc]
        stages += [stopwordsRemover]

        # Using TFIDF for review transformation of unigrams.
        if transform_type == 'tfidf':
            # Create the IDF from the filtered words
            hashingTF = HashingTF(inputCol="filtered",
                                  outputCol="rawFeatures",
                                  numFeatures=num_features)
            idf = IDF(inputCol="rawFeatures",
                      outputCol="review_vector",
                      minDocFreq=min_df)
            # Add to stages
            stages += [hashingTF, idf]

        # Using TFIDF for review transformation of bigrams
        if transform_type == 'tfidf_bigram':
            #Add unigram and bigram word vectors, then vectorize using TFIDF
            unigram = NGram(n=1, inputCol='filtered', outputCol='unigrams')
            stages += [unigram]

            bigram = NGram(n=2, inputCol='filtered', outputCol='bigrams')
            stages += [bigram]
            # Creating IDF from unigram  words
            hashingTF_unigram = HashingTF(inputCol="unigrams",
                                          outputCol="rawFeatures_unigrams",
                                          numFeatures=num_features)
            idf_unigram = IDF(inputCol="rawFeatures_unigrams",
                              outputCol="unigrams_vector",
                              minDocFreq=min_df)
            # Add to stages
            stages += [hashingTF_unigram, idf_unigram]
            # Creating IDF from the bigram words
            hashingTF_bigram = HashingTF(inputCol="bigrams",
                                         outputCol="rawFeatures_bigrams",
                                         numFeatures=num_features)
            idf_bigram = IDF(inputCol="rawFeatures_bigrams",
                             outputCol="bigrams_vector",
                             minDocFreq=min_df)
            # Add to stages
            stages += [hashingTF_bigram, idf_bigram]

            ngrams = VectorAssembler(
                inputCols=['unigrams_vector', 'bigrams_vector'],
                outputCol='review_vector')
            stages += [ngrams]

        assemblerInputs = ['review_vector']
        assembler = VectorAssembler(inputCols=assemblerInputs,
                                    outputCol="unstandard_features")

        stages += [assembler]

        if isCHISQR:
            chi_selector = ChiSqSelector(numTopFeatures=chi_feature_num,
                                         featuresCol="unstandard_features",
                                         outputCol="chisq_features",
                                         labelCol="label")

            stages += [chi_selector]

            scaler = StandardScaler(inputCol="chisq_features",
                                    outputCol="features",
                                    withStd=True,
                                    withMean=False)

            stages += [scaler]
        else:
            scaler = StandardScaler(inputCol="unstandard_features",
                                    outputCol="features",
                                    withStd=True,
                                    withMean=False)

            stages += [scaler]

        pipeline = Pipeline(stages=stages)
        pipelineFit = pipeline.fit(self.df)
        self.df = pipelineFit.transform(self.df)
        return self.df
Example #29
spark = SparkSession.builder.appName("ChiSqSelector").getOrCreate()

df = spark.createDataFrame([(
    7,
    Vectors.dense([0.0, 0.0, 18.0, 1.0]),
    1.0,
), (
    8,
    Vectors.dense([0.0, 1.0, 12.0, 0.0]),
    0.0,
), (
    9,
    Vectors.dense([1.0, 0.0, 15.0, 0.1]),
    0.0,
)], ["id", "features", "clicked"])

selector = ChiSqSelector(numTopFeatures=1,
                         featuresCol="features",
                         outputCol="selectedFeatures",
                         labelCol="clicked")

model = selector.fit(df)
result = model.transform(df)

print("ChiSqSelector output with top %d features selected" %
      selector.getNumTopFeatures())
result.show()

spark.stop()
Example #30
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ChiSqSelectorExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,),
        (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,),
        (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,)], ["id", "features", "clicked"])

    selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                             outputCol="selectedFeatures", labelCol="clicked")

    result = selector.fit(df).transform(df)

    print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
    result.show()
    # $example off$

    spark.stop()
Example #31
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

working_cols = df.columns
working_cols.remove("ID")
working_cols.remove("Target")

# This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
vectorAssembler = VectorAssembler(inputCols=working_cols, outputCol="rawFeatures")

#Execute Vector Assembler
assembled_df = vectorAssembler.transform(df)

#Select Features
selector = ChiSqSelector(numTopFeatures=5, featuresCol="rawFeatures",
                         outputCol="selectedFeatures", labelCol="Target")

#Execute Selector
selected_df = selector.fit(assembled_df).transform(assembled_df)

#Display Results
print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
display(selected_df.select("rawFeatures", "selectedFeatures"))

# COMMAND ----------

display(assembled_df)
Example #32
df = spark.createDataFrame([(
    7,
    Vectors.dense([0.0, 0.0, 18.0, 1.0]),
    1.0,
), (
    8,
    Vectors.dense([0.0, 1.0, 12.0, 0.0]),
    0.0,
), (
    9,
    Vectors.dense([1.0, 0.0, 15.0, 0.1]),
    0.0,
)], ["id", "features", "clicked"])

selector = ChiSqSelector(numTopFeatures=1,
                         featuresCol="features",
                         outputCol="selectedFeatures",
                         labelCol="clicked")

result = selector.fit(df).transform(df)

print("ChiSqSelector output with top %d features selected" %
      selector.getNumTopFeatures())
result.show()

# COMMAND ----------

### Locality-sensitive hashing (LSH) is used to cluster data
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
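
The LSH imports above are never exercised in the snippet; a hedged minimal sketch of BucketedRandomProjectionLSH, reusing the df built above (the bucketLength, table count, and distance threshold are assumptions):

brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes",
                                  bucketLength=2.0, numHashTables=3)
brp_model = brp.fit(df)
# Approximate self-join: pairs of rows within Euclidean distance 10.0 of each other
brp_model.approxSimilarityJoin(df, df, 10.0, distCol="EuclideanDistance") \
    .select(col("datasetA.id"), col("datasetB.id"), "EuclideanDistance").show()
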
Example #33
def issue_impact_process(ml_df, columns, project, organization):

    # ChiSquare
    r = ChiSquareTest.test(ml_df, "features", "label")
    pValues = r.select("pvalues").collect()[0][0].tolist()
    stats = r.select("statistics").collect()[0][0].tolist()
    dof = r.select("degreesOfFreedom").collect()[0][0]

    # ChiSq Selector
    selector = ChiSqSelector(numTopFeatures=10,
                             featuresCol="features",
                             outputCol="selected_features",
                             labelCol="label")
    selector_model = selector.fit(ml_df)

    top_10_features_importance = []
    for j in selector_model.selectedFeatures:
        top_10_features_importance.append(columns[j])
        top_10_features_importance.append(stats[j])

    top_issue_lines = []
    data_count = ml_df.count()
    # Skip the database write when the first selected feature's importance is 0
    if top_10_features_importance[1] != 0:
        top_issue_lines.append(
            [organization, project, "ChiSquareSelectorModel", data_count] +
            top_10_features_importance)
    else:
        print("\tFirst ChiSquare selected issue's importance is 0")

    # Tree-based algorithm's Feature Importances
    dt = DecisionTreeClassifier(featuresCol='features',
                                labelCol='label',
                                maxDepth=3)
    rf = RandomForestClassifier(featuresCol='features',
                                labelCol='label',
                                numTrees=10)

    for algo, model_name in [(dt, "DecisionTreeModel"),
                             (rf, "RandomForestModel")]:

        model = algo.fit(ml_df)

        f_importances = model.featureImportances
        indices = f_importances.indices.tolist()
        values = f_importances.values.tolist()

        if len(values) < 2:
            print(
                f"\tOne or fewer significant issues for model {model_name}. Skipping writing to database."
            )
            continue

        value_index_lst = list(zip(values, indices))
        value_index_lst.sort(key=lambda x: x[0], reverse=True)

        importance_sorted_features = []
        for value, index in value_index_lst:
            importance_sorted_features.append(columns[index])
            importance_sorted_features.append(value)

        length = len(importance_sorted_features)

        if length > 20:
            importance_sorted_features = importance_sorted_features[:20]
        elif length < 20:
            importance_sorted_features = importance_sorted_features + (
                20 - length) * [None]

        top_issue_lines.append(
            [organization, project, model_name, data_count] +
            importance_sorted_features)

    if len(top_issue_lines) > 0:
        top_issue_df = spark.createDataFrame(data=top_issue_lines,
                                             schema=TOP_ISSUE_SCHEMA)
        top_issue_df.write.jdbc(CONNECTION_STR,
                                'top_issues',
                                mode='append',
                                properties=CONNECTION_PROPERTIES)