Example #1
def preprocess(inputCol=["text", "label"], n=4):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    remover = [StopWordsRemover(inputCol="words", outputCol="filtered")]
    ngrams = [
        NGram(n=i, inputCol="filtered", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=2**14,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=2) for i in range(1, n + 1)
    ]

    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="rawFeatures")
    ]
    label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")]
    selector = [
        ChiSqSelector(numTopFeatures=2**14,
                      featuresCol='rawFeatures',
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=1000)]
    return Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                    assembler + label_stringIdx + selector + lr)
Example #2
def appendselector(stages, percent=0.5):
    # A Chi-Square feature selector uses the Chi-Squared test of independence to decide which features
    # are the most "useful". In this case, 50% of the original number of features is kept.
    # With these Transformers, the stages for training hybrid classifiers are set (different Transformers
    # for TF-IDF and word-embedding text-based features). A usage sketch follows this function.
    if (percent < 1.0):
        print("Appending Chi-Square to stages with percentage " + str(percent))
        selectorType = 'percentile'
        numTopFeatures = 50
        percentile = percent
    else:
        print("Appending Chi-Square to stage with numTopFeatures " +
              str(percent))
        selectorType = 'numTopFeatures'
        numTopFeatures = percent
        percentile = 0.1

    stages[-1].setOutputCol('prefeatures')
    selector = ChiSqSelector(numTopFeatures=numTopFeatures,
                             featuresCol='prefeatures',
                             outputCol='features',
                             selectorType=selectorType,
                             percentile=percentile)
    selectorstages = stages + [selector]
    return selectorstages
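
A minimal usage sketch for appendselector (an assumption, not part of the original source): it presumes the incoming stage list ends with a VectorAssembler whose output column is rewired to 'prefeatures' before the ChiSqSelector is appended.

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, ChiSqSelector

# Hypothetical stage list ending in a VectorAssembler that produces the feature vector.
stages = [VectorAssembler(inputCols=["f1", "f2", "f3"], outputCol="features")]

# percent < 1.0 -> percentile mode: keep 50% of the features.
stages_with_selector = appendselector(stages, percent=0.5)

# percent >= 1.0 -> numTopFeatures mode: keep the top 2 features instead.
# stages_with_selector = appendselector(stages, percent=2)

pipeline = Pipeline(stages=stages_with_selector)  # then pipeline.fit(train_df)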
    def test_chi_sq_selector(self):
        data = self.spark.createDataFrame(
            [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
             (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
             (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
            ["features", "label"])
        selector = ChiSqSelector(numTopFeatures=1,
                                 outputCol="selectedFeatures")
        model = selector.fit(data)

        # the ONNX input name should match the model's featuresCol ("features")
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml ChiSqSelector',
            [('features', FloatTensorType([None, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().selectedFeatures.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlChiSqSelector")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['selectedFeatures'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "Edad", "Genero", "Zona", "Fumador_Activo",
        "ultimo_estado_de_Glicemia", "Enfermedad_Coronaria",
        "Tension_sistolica", "Tension_diastolica", "Colesterol_Total",
        "Trigliceridos", "Clasificacion_RCV_Global", "Glicemia_de_ayuno",
        "Perimetro_Abdominal", "Peso", "IMC", "CLAIFICACION_IMC", "Creatinina",
        "Factor_correccion", "Proteinuria", "Farmacos_Antihipertensivos",
        "Estatina", "Antidiabeticos", "Adherencia_tratamiento"
    ],
                                outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=15)

    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=15,
                             featuresCol="indexedFeatures",
                             labelCol="Diabetes",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show(100)
Example #5
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys",
        "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl",
        "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance",
        "ShotPower", "Jumping", "Stamina", "Strength", "LongShots",
        "Aggression", "Interceptions", "Positioning", "Vision", "Penalties",
        "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving",
        "GKHandling", "GKKicking", "GKPositioning", "GKReflexes"
    ],
                                outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=4)

    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=5,
                             featuresCol="indexedFeatures",
                             labelCol="Position",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
def Chi_sqr(dataset_add, feature_colm, label_colm):
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)

    dataset.show()

    # using the rformula for indexing, encoding and vectorising

    # take the (last) label column name
    label = label_colm[-1]

    print(label)

    # build the R formula string, e.g. "label ~ col1+col2"
    f = label + " ~ " + "+".join(feature_colm)

    formula = RFormula(formula=f, featuresCol="features", labelCol="label")

    length = len(feature_colm)

    output = formula.fit(dataset).transform(dataset)

    output.select("features", "label").show()

    # chi selector
    from pyspark.ml.feature import ChiSqSelector

    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="features",
                             outputCol="selected_features",
                             labelCol="label")

    result = selector.fit(output).transform(output)

    print("chi2 output with top %d features selected " %
          selector.getNumTopFeatures())
    result.show()

    # running the chi-square value test

    r = ChiSquareTest.test(result, "selected_features", "label").head()
    print("pValues: " + str(r.pValues))
    p_values = str(r.pValues)
    print("degreesOfFreedom: " + str(r.degreesOfFreedom))

    print("statistics: " + str(r.statistics))

    json_response = {'pvalues': p_values}

    return json_response


# Chi_sqr(dataset_add, features_colm, label_colm)
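
For illustration (hypothetical file and column names, not from the original source), the commented call above would build the formula string like this:

# Chi_sqr("patients.csv", ["age", "bmi", "glucose"], ["diabetes"]) builds
# f == "diabetes ~ age+bmi+glucose", which RFormula then expands into a
# "features" vector column and a numeric "label" column.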
Example #7
def feature_selector_process(spark, ml_df, spark_artefacts_dir, run_mode, i,
                             feature_cols):

    # APPLY CHI-SQUARE SELECTOR
    name = f"ChiSquareSelectorModel_{i}"
    selector_model_path = Path(spark_artefacts_dir).joinpath(name)

    if run_mode == 'first':

        # ChiSq test to obtain chi-square statistics (higher -> more dependence between feature and label -> better)
        r = ChiSquareTest.test(ml_df, "features", "label")
        pValues = r.select("pvalues").collect()[0][0].tolist()
        stats = r.select("statistics").collect()[0][0].tolist()
        dof = r.select("degreesOfFreedom").collect()[0][0]

        # ChiSq Selector
        selector = ChiSqSelector(numTopFeatures=10,
                                 featuresCol="features",
                                 outputCol="selected_features",
                                 labelCol="label")
        selector_model = selector.fit(ml_df)
        selector_model.write().overwrite().save(
            str(selector_model_path.absolute()))

        top_10_features_importance = []
        top_10_features = []
        for j in selector_model.selectedFeatures:
            top_10_features_importance.append(feature_cols[j])
            top_10_features.append(feature_cols[j])
            top_10_features_importance.append(stats[j])

        model_info = [
            name,
            ml_df.count(), None, None, None, None, None, None, None
        ] + top_10_features_importance
        model_info_df = spark.createDataFrame(data=[model_info],
                                              schema=MODEL_INFO_SCHEMA)
        model_info_df.write.jdbc(CONNECTION_STR,
                                 'model_info',
                                 mode='append',
                                 properties=CONNECTION_PROPERTIES)

    elif run_mode == 'incremental':
        selector_model = ChiSqSelectorModel.load(
            str(selector_model_path.absolute()))
        top_10_features = []
        for j in selector_model.selectedFeatures:
            top_10_features.append(feature_cols[j])

    ml_df_10 = selector_model.transform(ml_df)
    ml_df_10 = ml_df_10.drop("features")

    # Work around a problem between ChiSqSelector and tree-based algorithms:
    # rebuilding the rows with plain DenseVectors drops the vector metadata that otherwise trips up tree-based training
    ml_rdd_10 = ml_df_10.rdd.map(
        lambda row: Row(label=row[0], features=DenseVector(row[1].toArray())))
    ml_df_10 = spark.createDataFrame(ml_rdd_10)

    return ml_df_10, top_10_features
def feature_selection(t_data):
    #Feature selection
    css = ChiSqSelector(featuresCol='scaled_features',
                        outputCol='Aspect',
                        labelCol='output',
                        numTopFeatures=10)
    t_data = css.fit(t_data).transform(t_data)

    return t_data
    def chiSquareTest(self,categoricalFeatures,maxCategories):
        dataset=self.dataset
        labelColm=self.labelColm
        features=self.features
        length = len(features)

        featureassembler = VectorAssembler(
            inputCols=self.features,
            outputCol="featuresChiSquare", handleInvalid="skip")
        dataset= featureassembler.transform(dataset)

        vec_indexer = VectorIndexer(inputCol="featuresChiSquare", outputCol='vecIndexedFeaturesChiSqaure', maxCategories=maxCategories,
                                    handleInvalid="skip").fit(dataset)

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))

        dataset = vec_indexer.transform(dataset)

        # finalized_data = dataset.select(labelColm, 'vecIndexedFeaturesChiSqaure')
        # finalized_data.show()

        # using chi selector
        selector = ChiSqSelector(numTopFeatures=length, featuresCol="vecIndexedFeaturesChiSqaure",
                                 outputCol="selectedFeatures",
                                 labelCol=labelColm)

        result = selector.fit(dataset).transform(dataset)

        print("chi2 output with top %d features selected " % selector.getNumTopFeatures())
        result.show()

        # running the chi-square value test

        r = ChiSquareTest.test(result, "selectedFeatures", labelColm).head()
        p_values = list(r.pValues)
        PValues = []
        for val in p_values:
            PValues.append(round(val, 4))
        print(PValues)
        dof = list(r.degreesOfFreedom)
        stats = list(r.statistics)
        statistics = []
        for val in stats:
            statistics.append(round(val, 4))
        print(statistics)
        chiSquareDict = {}
        for pval, doF, stat, colm in zip(PValues, dof, statistics, categoricalFeatures):
            print(pval, doF, stat)
            chiSquareDict[colm] = pval, doF, stat
        chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
        print(chiSquareDict)

        result = {'pvalues': chiSquareDict}

        return result
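
For orientation, a hedged sketch of the shape of the dictionary this method returns (column names are hypothetical, values symbolic):

# obj.chiSquareTest(categoricalFeatures=['colA', 'colB'], maxCategories=4) returns
# {'pvalues': {'colA': (pval_A, dof_A, stat_A),
#              'colB': (pval_B, dof_B, stat_B),
#              'summaryName': ['pValue', 'DoF', 'statistics']}}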
Example #10
def clasificar_chi2():
    # Read the data and convert each column's values to float
    conf = SparkConf().setAppName("NN_1").setMaster("local")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    rdd = sqlContext.read.csv(
        "/home/ulima-azure/data/Enfermedad_Oncologica_T3.csv", header=True).rdd
    rdd = rdd.map(lambda x: (float(x[0]), float(x[1]), float(x[2]), float(x[
        3]), float(x[4]), float(x[5]), float(x[6]), float(x[7]), float(x[8]),
                             float(x[9])))

    df = rdd.toDF([
        "Cellenght", "Cellsize", "Cellshape", "mgadhesion", "sepics",
        "bnuclei", "bchromatin", "nucleos", "mitoses", "P_Benigno"
    ])
    # Build our vector assembler (features)
    assembler = VectorAssembler(inputCols=[
        "Cellenght", "Cellsize", "Cellshape", "nucleos", "bchromatin",
        "mitoses"
    ],
                                outputCol="featuresChi2")
    df_chi2 = assembler.transform(df)
    df_chi2 = df_chi2.select("featuresChi2", "P_Benigno")

    selector = ChiSqSelector(numTopFeatures=3,
                             featuresCol="featuresChi2",
                             labelCol="P_Benigno",
                             outputCol="featuresSelected")
    df_result = selector.fit(df_chi2).transform(df_chi2)

    # Split the data into training and test sets
    (df_training, df_test) = df_result.randomSplit([0.7, 0.3])

    # Define our network architecture (hyperparameter)
    capas = [3, 4, 6, 2]

    # Build the trainer
    # Hyperparameter: maxIter
    entrenador = MultilayerPerceptronClassifier(featuresCol="featuresSelected",
                                                labelCol="P_Benigno",
                                                maxIter=1000,
                                                layers=capas)
    # Train our model
    modelo = entrenador.fit(df_training)

    # Validate our model
    df_predictions = modelo.transform(df_test)
    evaluador = MulticlassClassificationEvaluator(labelCol="P_Benigno",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluador.evaluate(df_predictions)
    print(f"Accuracy: {accuracy}")

    df_predictions.select("prediction", "rawPrediction", "probability").show()

    # Show the count of 0s and 1s among the predictions
    df_predictions.groupby('prediction').count().show()
Example #11
def MachineLearning(df):
    file_dataSVM = "G:/Projects/Spark-Machine-Learning/Spark Machine Learning/Spark Machine Learning/svm/"
    data = df.select(['Summary','Sentiment']).withColumnRenamed('Sentiment','label')
    data = data.withColumn('length',length(data['Summary']))
    # Basic sentence tokenizer
    tokenizer = Tokenizer(inputCol="Summary", outputCol="words")
   
    #remove stop words
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_features")
   
    # transform the dataset into term-frequency vectors
    cv = HashingTF(inputCol="filtered_features", outputCol="features1", numFeatures=1000)
    
    # calculate IDF over the whole dataset
    idf = IDF(inputCol= 'features1', outputCol = 'tf_idf')
    
    normalizer = StandardScaler(inputCol="tf_idf", outputCol="normFeatures", withStd=True, withMean=False)
    selector = ChiSqSelector(numTopFeatures=150, featuresCol="normFeatures",
                         outputCol="selectedFeatures", labelCol="label")
    #prepare data for ML spark library
    cleanUp = VectorAssembler(inputCols =['selectedFeatures'],outputCol='features')
    # assemble the full preprocessing pipeline
    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf,normalizer,selector,cleanUp])
    pipelineModel = pipeline.fit(data)
    data = pipelineModel.transform(data)
    data.printSchema()
    train_data, test_data = data.randomSplit([0.7,0.3],seed=2018)

    lr = LogisticRegression(featuresCol="features", labelCol='label')
    lrModel = lr.fit(train_data)
    beta = np.sort(lrModel.coefficients)
    plt.plot(beta)
    plt.ylabel('Beta Coefficients')
    plt.show()

    trainingSummary = lrModel.summary
    roc = trainingSummary.roc.toPandas()
    plt.plot(roc['FPR'],roc['TPR'])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title('ROC Curve')
    plt.show()
    print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))



    pr = trainingSummary.pr.toPandas()
    plt.plot(pr['recall'],pr['precision'])
    plt.ylabel('Precision')
    plt.xlabel('Recall')
    plt.show()
    predictions = lrModel.transform(test_data)
    evaluator = BinaryClassificationEvaluator()
    print('Test Area Under ROC', evaluator.evaluate(predictions))
Example #12
    def run_feature_selection_on(data):
        LOGGER.warning("Running feature selection.")
        selector = ChiSqSelector(numTopFeatures=10,
                                 featuresCol="features",
                                 outputCol="selectedFeatures",
                                 labelCol="label")

        data = selector.fit(data).transform(data).drop(
            'features').withColumnRenamed('selectedFeatures', 'features')
        LOGGER.warning("Ran feature selection.")
        return data
Example #13
def pruebaChi(dataframe,
              categoricalCols,
              numericalCols,
              labelCol="TIPO PACIENTE"):
    """Función que hace todo el preprocesamiento de los datos
    categóricos de un conjunto de datos de entrenamiento (o no).
    :param train spark df: conjunto de datos de entrenamiento.
    :param categoricalCols list,array: conjunto de nombres de columnas categoricas del
        conjunto de datos train.
    :param numericalCols list,array: conjunto de nombres de columnas numéricas del 
        conjunto de datos train.
    :param labelCol str: variable objetivo o etiqueta

    :Returns spark dataframe con las columnas 'label' y 'features'
    """

    # encode all the categorical variables
    stages = []
    for categoricalCol in categoricalCols:
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + "Index")
        encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                                outputCol=categoricalCol + "ohe")
        stages += [stringIndexer, encoder]

    # target variable (label)
    label_strIdx = StringIndexer(inputCol=labelCol, outputCol="label")
    stages += [label_strIdx]

    # assemble all covariates into a single vector
    assemblerInputs = [c + "ohe" for c in categoricalCols] + numericalCols
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="feat")
    stages += [assembler]

    # select the useful variables with ChiSqSelector
    selector = ChiSqSelector(featuresCol="feat",
                             outputCol="feature",
                             labelCol="label",
                             fpr=0.05,
                             selectorType='fpr')
    stages += [selector]

    # scale to the 0-1 range
    scala = MinMaxScaler(inputCol="feature", outputCol="features")
    stages += [scala]

    # pipeline that runs the whole process
    pipe = Pipeline(stages=stages)
    pipeModel = pipe.fit(dataframe)
    df = pipeModel.transform(dataframe)

    # return the df with what we need
    return df
Example #14
def pre_processing(df):
    ''' feature selection '''
    selector = ChiSqSelector(numTopFeatures=1,
                             featuresCol="features",
                             outputCol="selectedFeatures",
                             labelCol="clicked")

    result = selector.fit(df).transform(df)

    print("ChiSqSelector output with top %d features selected" %
          selector.getNumTopFeatures())
    result.show()
Example #15
def getAllMeasure(rf, selectorData, featureCols):
    measure = np.array(['  ', '  ', '  '])
    for i in range(1, len(featureCols) + 1):
        selector = ChiSqSelector(numTopFeatures=i, featuresCol="features",
                                 outputCol="selectedFeatures", labelCol="label")

        selectedData = selector.fit(selectorData).transform(selectorData)
        trainSelected, testSelected = selectedData.randomSplit([0.7, 0.3])
        rfModel = rf.fit(trainSelected)

        prediction = rfModel.transform(testSelected)
        evaluator = BinaryClassificationEvaluator()
        measure = np.vstack([evaluateLr(prediction, evaluator, i), measure])
    return measure
Example #16
def initializePipeline(num_cols, cat_cols):

    cat_cols_index = []
    cat_cols_hoted = []
    for i in cat_cols:
        cat_cols_index.append(i + "_index")
        cat_cols_hoted.append(i + "_hoted")

    featureCols = []
    for i in num_cols:
        featureCols.append(i + "scaled")

    for i in cat_cols:
        featureCols.append(i + "_hoted")

    labelindexers = [StringIndexer(inputCol="Churn", outputCol="label")]
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index")
        for column in cat_cols
    ]
    oneHotEncoder = [
        OneHotEncoderEstimator(inputCols=cat_cols_index,
                               outputCols=cat_cols_hoted,
                               dropLast=False)
    ]
    # one single-column assembler per numeric column, so each column can be scaled individually
    assembler = [
        VectorAssembler(inputCols=[i], outputCol=i + "_indexe")
        for i in num_cols
    ]
    normalizers = [
        MinMaxScaler(inputCol=column + "_indexe", outputCol=column + "scaled")
        for column in num_cols
    ]
    featureAssembler = [
        VectorAssembler(inputCols=featureCols, outputCol="resultedfeatures")
    ]
    selector = [
        ChiSqSelector(numTopFeatures=13,
                      featuresCol="resultedfeatures",
                      outputCol="features",
                      labelCol="label")
    ]
    pipeline = Pipeline(stages=indexers + oneHotEncoder + assembler +
                        normalizers + featureAssembler + labelindexers +
                        selector)
    return pipeline
Example #17
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
        "exang", "oldpeak", "slope", "ca", "thal"
    ],
                                outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=4)

    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=4,
                             featuresCol="indexedFeatures",
                             labelCol="target",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
Example #18
def important_feature_selector(predicted):
    """Uses the Chi-Squared Test to select important features for classification, and prints them out.
    
    Params:
    - predicted (pyspark.sql.DataFrame): The dataset, with predictions
    """
    selector = ChiSqSelector(numTopFeatures=50,
                             featuresCol='presence_feature_set',
                             labelCol='label',
                             outputCol='selected_features',
                             selectorType='numTopFeatures')
    model = selector.fit(predicted)
    important_features = model.selectedFeatures
    with open('bag_of_words_labels.json', 'r') as bow_file:
        bow_labels = json.loads(
            bow_file.readlines()[0])  # There is only one line
    important_feature_labels = [
        bow_labels[index] for index in important_features
    ]
    print("=====Important Feature Labels=====")
    print(important_feature_labels)
    def build_trigrams(input_cols=("text", "target"), n=3):
        logging.warning("Building trigram model.")
        tokenizer = [Tokenizer(inputCol=input_cols[0], outputCol="words")]
        ngrams = [
            NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
            for i in range(1, n + 1)
        ]

        cv = [
            CountVectorizer(vocabSize=2**14,
                            inputCol="{0}_grams".format(i),
                            outputCol="{0}_tf".format(i))
            for i in range(1, n + 1)
        ]
        idf = [
            IDF(inputCol="{0}_tf".format(i),
                outputCol="{0}_tfidf".format(i),
                minDocFreq=5) for i in range(1, n + 1)
        ]

        assembler = [
            VectorAssembler(
                inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
                outputCol="rawFeatures")
        ]
        label_string_idx = [
            StringIndexer(inputCol=input_cols[1], outputCol="label")
        ]
        selector = [
            ChiSqSelector(numTopFeatures=2**14,
                          featuresCol='rawFeatures',
                          outputCol="features")
        ]
        lr = [LogisticRegression(maxIter=100)]
        return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler +
                        label_string_idx + selector + lr)
Example #20
def feature_selection(df):
    # Create the VectorAssembler
    assembler = VectorAssembler(inputCols=[
        "EDAD", "GENERO", "ETNIA", "ZONA", "ESCOLARIDAD", "FUMADOR", "HAS",
        "HTADM", "GLICEMIA", "ENF_CORONARIA", "T_SISTOLICA", "T_DIASTOLICA",
        "COLESTEROL_TOTAL", "TRIGLICERIDOS", "RCV_GLOBAL", "GLICEMIA_AYUNO",
        "PERIMETRO_ABDOMINAL", "PESO", "TALLA", "IMC", "CREATININA",
        "MICROALBUMINURIA", "ESTADO_IRC", "FARMACOS_ANTIHIPERTENSIVOS"
    ],
                                outputCol="features")
    df = assembler.transform(df)

    # Vectorindexer
    indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    df = indexer.fit(df).transform(df)

    # Chi-square test
    selector = ChiSqSelector(numTopFeatures=8,
                             featuresCol="indexedFeatures",
                             labelCol="DIABETES",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
    for feature in feature_cols:
        indexed = feature + "_" + "indexed"
        indexed_cols.append(indexed)
        indexer = StringIndexer(inputCol=feature,
                                outputCol=indexed,
                                handleInvalid="keep",
                                stringOrderType="frequencyDesc")
        stages.append(indexer)

    stages.append(
        VectorAssembler(inputCols=indexed_cols,
                        outputCol="features",
                        handleInvalid="keep"))
    stages.append(
        ChiSqSelector(numTopFeatures=20,
                      labelCol="HasDetections",
                      featuresCol="features",
                      outputCol="selectedFeatures"))

    print("Performing model fitting")
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(df)
    df_features = model.transform(df)
    df_features.select("features", "selectedFeatures").show()

    print("Saving Pipeline Model")
    model.write().overwrite().save(pipeline_model_path)

    with open(feature_path, "wb") as f:
        pickle.dump(feature_cols, f)

features = model.stages[-1].selectedFeatures
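
A hedged follow-up sketch (not in the original): since the ChiSqSelector above was fit on the vector assembled from indexed_cols, each selected index can be mapped back to its source column name.

# map the selected vector indices back to the indexed column names
selected_feature_names = [indexed_cols[i] for i in features]
print("Selected features:", selected_feature_names)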
def issue_impact_process(ml_df, columns, project, organization):

    # ChiSquare
    r = ChiSquareTest.test(ml_df, "features", "label")
    pValues = r.select("pvalues").collect()[0][0].tolist()
    stats = r.select("statistics").collect()[0][0].tolist()
    dof = r.select("degreesOfFreedom").collect()[0][0]

    # ChiSq Selector
    selector = ChiSqSelector(numTopFeatures=10,
                             featuresCol="features",
                             outputCol="selected_features",
                             labelCol="label")
    selector_model = selector.fit(ml_df)

    top_10_features_importance = []
    for j in selector_model.selectedFeatures:
        top_10_features_importance.append(columns[j])
        top_10_features_importance.append(stats[j])

    top_issue_lines = []
    data_count = ml_df.count()
    # If the first importance value is 0, skip writing this model's row
    if top_10_features_importance[1] != 0:
        top_issue_lines.append(
            [organization, project, "ChiSquareSelectorModel", data_count] +
            top_10_features_importance)
    else:
        print("\tFirst ChiSquare selected issue's importance is 0")

    # Tree-based algorithm's Feature Importances
    dt = DecisionTreeClassifier(featuresCol='features',
                                labelCol='label',
                                maxDepth=3)
    rf = RandomForestClassifier(featuresCol='features',
                                labelCol='label',
                                numTrees=10)

    for algo, model_name in [(dt, "DecisionTreeModel"),
                             (rf, "RandomForestModel")]:

        model = algo.fit(ml_df)

        f_importances = model.featureImportances
        indices = f_importances.indices.tolist()
        values = f_importances.values.tolist()

        if len(values) < 2:
            print(
                f"\tOne or fewer significant issues for model {model_name}. Skipping database write."
            )
            continue

        value_index_lst = list(zip(values, indices))
        value_index_lst.sort(key=lambda x: x[0], reverse=True)

        importance_sorted_features = []
        for value, index in value_index_lst:
            importance_sorted_features.append(columns[index])
            importance_sorted_features.append(value)

        length = len(importance_sorted_features)

        if length > 20:
            importance_sorted_features = importance_sorted_features[:20]
        elif length < 20:
            importance_sorted_features = importance_sorted_features + (
                20 - length) * [None]

        top_issue_lines.append(
            [organization, project, model_name, data_count] +
            importance_sorted_features)

    if len(top_issue_lines) > 0:
        top_issue_df = spark.createDataFrame(data=top_issue_lines,
                                             schema=TOP_ISSUE_SCHEMA)
        top_issue_df.write.jdbc(CONNECTION_STR,
                                'top_issues',
                                mode='append',
                                properties=CONNECTION_PROPERTIES)
Example #23
pe = PolynomialExpansion().setInputCol("features").setDegree(2).setOutputCol(
    "polyFeatures")
pe.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
  .transform(sales.select("Description", "CustomerId"))\
  .where("CustomerId IS NOT NULL")
prechi = fittedCV.transform(tokenized)\
  .where("CustomerId IS NOT NULL")
chisq = ChiSqSelector()\
  .setFeaturesCol("countVec")\
  .setLabelCol("CustomerId")\
  .setNumTopFeatures(2)
chisq.fit(prechi).transform(prechi)\
  .drop("customerId", "Description", "DescOut").show()

# COMMAND ----------

fittedPCA = pca.fit(scaleDF)
fittedPCA.write().overwrite().save("/tmp/fittedPCA")

# COMMAND ----------

from pyspark.ml.feature import PCAModel

loadedPCA = PCAModel.load("/tmp/fittedPCA")
loadedPCA.transform(scaleDF).show()
Example #24
std_scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
scaled_df = std_scaler.fit(features_df).transform(features_df)
scaled_df.select("scaled_features").display()

# COMMAND ----------

# MAGIC %md ###Part 4: Feature Selection
# MAGIC Chi Square Selector

# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

chisq_selector = ChiSqSelector(numTopFeatures=1,
                               featuresCol="scaled_features",
                               outputCol="selected_features",
                               labelCol="cust_age")

result_df = chisq_selector.fit(scaled_df).transform(scaled_df)

result_df.select("selected_features").display()

# COMMAND ----------

# MAGIC %md Feature Selection using VectorSlicer

# COMMAND ----------

from pyspark.ml.feature import VectorSlicer

vec_slicer = VectorSlicer(inputCol="scaled_features",
    class1_num = class1.count()
    class2_num = class2.count()
    fraction = 1.0 * class1_num / class2_num
    class2 = class2.sample(fraction)
    training_dataset_balanced = class1.union(class2)
    training_dataset_balanced.groupBy("_c41").count().show()

    ####### 14.1 ###
    converted_cols = ["s" + col for col in string_cols]
    assembler = VectorAssembler(inputCols=converted_cols + numerical_cols,
                                outputCol="features")
    labelIndexer = StringIndexer(inputCol="_c41", outputCol="label")
    #classifier = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10, maxBins=64, maxDepth= 5, subsamplingRate= 1.0  )  ## 14.2
    #classifier = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxBins=64)
    selector = ChiSqSelector(numTopFeatures=35,
                             featuresCol="features",
                             outputCol="selectedFeatures")
    classifier = NaiveBayes(
        smoothing=1.0
    )  # modelType="multinomial": the data here is binomial, so applying this parameter makes no difference
    pipeline = Pipeline(stages=indexers +
                        [assembler, labelIndexer, selector, classifier])
    model = pipeline.fit(training_dataset_balanced)
    #predictions = model.transform(dataset_testing) ##14.1
    predictions = model.transform(dataset_testing)  ## 14.2
    predictions.show(10, False)

    ###### 14.2 ####
    evaluator = BinaryClassificationEvaluator(labelCol="label",
                                              rawPredictionCol="prediction")
    accuracy = evaluator.evaluate(predictions)
    def add_vectorized_features(self, transform_type, min_df, max_df, isCHISQR,
                                chi_feature_num, num_features):
        '''
        Creates the pySpark feature pipeline and stores the vectorized data under the feature column 
        Input: transform_type: {'tfidf','tfidf_bigram'}, min document frequency (min_df), chi squared feature reduction (isCHISQR)
               number of reduced features with chi square feature reduction (chi_feature_num), number of features (num_features)                  
        Output: Returns the transformed dataframe with the label and features columns
        '''
        stages = []
        # This code transforms the review text into vectorized features

        # Tokenize review sentences into vectors of words
        regexTokenizer = RegexTokenizer(inputCol="reviewText",
                                        outputCol="words",
                                        pattern="\\W")

        stages += [regexTokenizer]

        #Remove stopwords from tokenized words
        #nltk.download('stopwords')
        from nltk.corpus import stopwords
        sw = stopwords.words('english')
        stopwordsRemover = StopWordsRemover(
            inputCol="words", outputCol="filtered").setStopWords(sw)

        #lemmatizer = WordNetLemmatizer()
        #doc = [lemmatizer.lemmatize(token) for token in doc]
        stages += [stopwordsRemover]

        # Using TFIDF for review transformation of unigrams.
        if transform_type == 'tfidf':
            # Create TF-IDF features from the filtered words
            hashingTF = HashingTF(inputCol="filtered",
                                  outputCol="rawFeatures",
                                  numFeatures=num_features)
            idf = IDF(inputCol="rawFeatures",
                      outputCol="review_vector",
                      minDocFreq=min_df)
            # Add to stages
            stages += [hashingTF, idf]

        # Using TFIDF for review transformation of bigrams
        if transform_type == 'tfidf_bigram':
            #Add unigram and bigram word vectors, then vectorize using TFIDF
            unigram = NGram(n=1, inputCol='filtered', outputCol='unigrams')
            stages += [unigram]

            bigram = NGram(n=2, inputCol='filtered', outputCol='bigrams')
            stages += [bigram]
            # Creating IDF from unigram  words
            hashingTF_unigram = HashingTF(inputCol="unigrams",
                                          outputCol="rawFeatures_unigrams",
                                          numFeatures=num_features)
            idf_unigram = IDF(inputCol="rawFeatures_unigrams",
                              outputCol="unigrams_vector",
                              minDocFreq=min_df)
            # Add to stages
            stages += [hashingTF_unigram, idf_unigram]
            # Creating IDF from the bigram words
            hashingTF_bigram = HashingTF(inputCol="bigrams",
                                         outputCol="rawFeatures_bigrams",
                                         numFeatures=num_features)
            idf_bigram = IDF(inputCol="rawFeatures_bigrams",
                             outputCol="bigrams_vector",
                             minDocFreq=min_df)
            # Add to stages
            stages += [hashingTF_bigram, idf_bigram]

            ngrams = VectorAssembler(
                inputCols=['unigrams_vector', 'bigrams_vector'],
                outputCol='review_vector')
            stages += [ngrams]

        assemblerInputs = ['review_vector']
        assembler = VectorAssembler(inputCols=assemblerInputs,
                                    outputCol="unstandard_features")

        stages += [assembler]

        if isCHISQR:
            chi_selector = ChiSqSelector(numTopFeatures=chi_feature_num,
                                         featuresCol="unstandard_features",
                                         outputCol="chisq_features",
                                         labelCol="label")

            stages += [chi_selector]

            scaler = StandardScaler(inputCol="chisq_features",
                                    outputCol="features",
                                    withStd=True,
                                    withMean=False)

            stages += [scaler]
        else:
            scaler = StandardScaler(inputCol="unstandard_features",
                                    outputCol="features",
                                    withStd=True,
                                    withMean=False)

            stages += [scaler]

        pipeline = Pipeline(stages=stages)
        pipelineFit = pipeline.fit(self.df)
        self.df = pipelineFit.transform(self.df)
        return self.df
Example #27
    # assemble all features into feature vector
    features_assembler = VectorAssembler(inputCols=num_bool_features,
                                         outputCol="features")

    # Index labels, adding metadata to the label column.
    label_indexer = StringIndexer(inputCol="has_over_50k",
                                  outputCol="label").fit(processed_train_set)

    # Convert indexed labels back to original labels.
    label_converter = IndexToString(inputCol="prediction",
                                    outputCol="predicted_label",
                                    labels=label_indexer.labels)

    # - ChiSQ feature Selection
    selector = ChiSqSelector(numTopFeatures=20,
                             featuresCol="features",
                             outputCol="featuresSel",
                             labelCol="label")

    # - RandomForest model with parameter tuning using cross validation
    rf = RandomForestClassifier(labelCol="label",
                                featuresCol="featuresSel",
                                numTrees=20)

    # - Create ParamGrid for Cross Validation
    rf_param_grid = (ParamGridBuilder().addGrid(
        rf.maxDepth,
        [2, 3, 4, 5, 10, 20]).addGrid(rf.maxBins,
                                      [10, 20, 40, 80, 100]).build())

    # - Model Evaluation
    rf_eval = BinaryClassificationEvaluator(labelCol="label")
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

working_cols = df.columns
working_cols.remove("ID")
working_cols.remove("Target")

# This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
vectorAssembler = VectorAssembler(inputCols=working_cols, outputCol="rawFeatures")

#Execute Vector Assembler
assembled_df = vectorAssembler.transform(df)

#Select Features
selector = ChiSqSelector(numTopFeatures=5, featuresCol="rawFeatures",
                         outputCol="selectedFeatures", labelCol="Target")

#Execute Selector
selected_df = selector.fit(assembled_df).transform(assembled_df)

#Display Results
print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
display(selected_df.select("rawFeatures", "selectedFeatures"))

# COMMAND ----------



# COMMAND ----------

display(assembled)
df = spark.createDataFrame([(
    7,
    Vectors.dense([0.0, 0.0, 18.0, 1.0]),
    1.0,
), (
    8,
    Vectors.dense([0.0, 1.0, 12.0, 0.0]),
    0.0,
), (
    9,
    Vectors.dense([1.0, 0.0, 15.0, 0.1]),
    0.0,
)], ["id", "features", "clicked"])

selector = ChiSqSelector(numTopFeatures=1,
                         featuresCol="features",
                         outputCol="selectedFeatures",
                         labelCol="clicked")

result = selector.fit(df).transform(df)

print("ChiSqSelector output with top %d features selected" %
      selector.getNumTopFeatures())
result.show()

# COMMAND ----------

### Locality-sensitive hashing (LSH) is used for clustering data
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
def main():

    #initialize spark session
    spark = SparkSession\
            .builder\
            .appName("Malware Random Forests")\
            .getOrCreate()
    sc = spark.sparkContext

    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId",
                                      "AKIAI5AGW2NXUVOJ7L2A")
    sc._jsc.hadoopConfiguration().set(
        "fs.s3n.awsSecretAccessKey",
        "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")

    #read file names
    fileNames = (sys.argv)
    X_train = sc.textFile(fileNames[1], 40)
    y_train = sc.textFile(fileNames[2], 40)

    #paths to test data
    X_test = sc.textFile(fileNames[3], 40) if len(fileNames) >= 4 else None
    y_test = sc.textFile(fileNames[4], 40) if len(fileNames) == 5 else None
    print y_test
    print "printing ytest!!!!!"

    #get train bytes features
    preprocessor_b = preprocessor_bytes(most_common_bigrams=True)
    hashes_and_labels = preprocessor_b.hashes_and_labels(X_train, y_train)
    s3_path = preprocessor_b.get_s3_path(X_train)
    bytes = sc.wholeTextFiles(','.join(s3_path), 40)
    train_bytes = preprocessor_b.transform(bytes, hashes_and_labels)
    print "train_bytes done!!!!!!!"

    #get asm features
    preprocessor_a = preprocessor_asm()
    s3_path = preprocessor_a.get_s3_path(X_train)
    metadata = sc.wholeTextFiles(','.join(s3_path), 40)
    train_asm = preprocessor_a.transform(metadata,
                                         hashes_and_labels,
                                         train=True)
    print train_asm.show()
    print "train_asm done!!!!!!!"

    #get test bytes data
    hashes_and_labels = preprocessor_b.hashes_and_labels(X_test, y_test)
    s3_path = preprocessor_b.get_s3_path(X_test)
    bytes = sc.wholeTextFiles(','.join(s3_path), 40)
    test_bytes = preprocessor_b.transform(
        bytes, hashes_and_labels
    ) if y_test is not None else preprocessor_b.transform(bytes)
    print "test_bytes done!!!!!!!"

    #get test asm data
    s3_path = preprocessor_a.get_s3_path(X_test)
    metadata = sc.wholeTextFiles(','.join(s3_path), 40)
    test_asm = preprocessor_a.transform(
        metadata, hashes_and_labels,
        train=False) if y_test is not None else preprocessor_a.transform(
            metadata, hashes_and_labels=None, train=False)
    print "test_asm done!!!!!!!"

    #chisqr feature selector
    selector1 = ChiSqSelector(numTopFeatures=150, outputCol="selectedFeatures")
    selectormodel1 = selector1.fit(train_bytes)
    train_bytes = selectormodel1.transform(train_bytes)
    test_bytes = selectormodel1.transform(test_bytes)
    print "ChiSqSelector bytes done!!!!!!!"

    selector2 = ChiSqSelector(numTopFeatures=150, outputCol="selectedFeatures")
    selectormodel2 = selector2.fit(train_asm)
    train_asm = selectormodel2.transform(train_asm)
    test_asm = selectormodel2.transform(test_asm)
    print "ChiSqSelector asm done!!!!!!!"

    #to rdd
    train_bytes = train_bytes.select(
        'hash', 'selectedFeatures',
        'label').rdd.map(lambda (hash, feats, label): (hash, (feats, label)))
    train_asm = train_asm.select(
        'hash', 'selectedFeatures',
        'label').rdd.map(lambda (hash, feats, label): (hash, (feats, label)))

    #merge train bytes and train asm data
    train = train_bytes.join(train_asm).map(lambda (hash, (
        (bytes, label), (asm, label2))): (hash, bytes, asm, label))
    schema = StructType([
        StructField('hash', StringType(), True),
        StructField('bytes', VectorUDT(), True),
        StructField('asm', VectorUDT(), True),
        StructField('label', StringType(), True)
    ])
    train = train.toDF(schema)
    train = train.withColumn('label', train.label.cast(DoubleType()))
    print train.show()
    print "Final TrainDF done!!!!!!!"
    '''
    we want to use the same pipeline when the testing labels are present or absent, so, the below code builds 
    the test data accordingly.
    The test RDD looks like:
        - When test labels are present: <hash,selectedFeatures,label>
        - When test labels are absent : <hash,selectedFeatures>
    
    When labels are absent, there is no 3rd column in dataframe, and so, the consecutive dataframes also
    don't have that column.
    '''
    if y_test is not None:
        test_bytes = test_bytes.select(
            'hash', 'selectedFeatures', 'label').rdd.map(
                lambda (hash, feats, label): (hash, (feats, label)))
        test_asm = test_asm.select(
            'hash', 'selectedFeatures', 'label').rdd.map(
                lambda (hash, feats, label): (hash, (feats, label)))

        #merge test bytes and test asm data
        test = test_bytes.join(test_asm).map(lambda (hash, (
            (bytes, label), (asm, label2))): (hash, bytes, asm, label))
        schema = StructType([
            StructField('hash', StringType(), True),
            StructField('bytes', VectorUDT(), True),
            StructField('asm', VectorUDT(), True),
            StructField('label', StringType(), True)
        ])
        test = test.toDF(schema)
        test = test.withColumn('label', test.label.cast(DoubleType()))

    else:
        test_bytes = test_bytes.select(
            'hash',
            'selectedFeatures').rdd.map(lambda (hash, feats): (hash, feats))
        test_asm = test_asm.select(
            'hash',
            'selectedFeatures').rdd.map(lambda (hash, feats): (hash, feats))

        #merge test bytes and test asm data
        test = test_bytes.join(test_asm).map(lambda (hash, (bytes, asm)):
                                             (hash, bytes, asm))
        schema = StructType([
            StructField('hash', StringType(), True),
            StructField('bytes', VectorUDT(), True),
            StructField('asm', VectorUDT(), True)
        ])
        test = test.toDF(schema)

    print test.show()
    print "Final TestDF done!!!!!!!"

    #merge bytes and asm features
    assembler = VectorAssembler(inputCols=["bytes", "asm"],
                                outputCol="features")
    train = assembler.transform(train)
    test = assembler.transform(test)
    print "VectorAssembler done!!!!!!!"

    #rf classifier
    rf = RandomForestClassifier(numTrees=100,
                                maxDepth=12,
                                maxBins=32,
                                maxMemoryInMB=512,
                                seed=1)
    model = rf.fit(train)
    result = model.transform(test)

    #save to csv
    #result.select("prediction").toPandas().astype(int).to_csv('prediction.csv',header=False,index=False)
    hash_predictions = result.select("hash", "prediction")
    result.select("hash", "prediction").toPandas().to_csv('./prediction.csv',
                                                          header=False,
                                                          index=False)
    print "Results .csv written"

    #Get all the predictions in the same order as hashes in X_test and save them to a file
    test_hashes = X_test.zipWithIndex().map(lambda x:
                                            (x[1], x[0].encode('utf-8')))
    predictions = hash_predictions.collect()
    predictionsBroadCast = sc.broadcast(dict(predictions))
    predicted_labels = test_hashes.map(lambda (indx, docHash): (
        docHash, get_predicted_label(docHash, predictionsBroadCast.value)
    )).map(lambda (docHash, label): int(label)).collect()
    save_predicted_labels(predicted_labels, './predictions.txt')
    print "Results .txt written"

    spark.stop()