def preprocess(inputCol=["text", "label"], n=4):
    # Build a TF-IDF n-gram pipeline: tokenize, remove stop words, extract
    # 1..n-grams, vectorize each n-gram column, weight by IDF, assemble the
    # vectors, index the label, select features with chi-squared, then fit LR.
    tokenizer = [Tokenizer(inputCol=inputCol[0], outputCol="words")]
    remover = [StopWordsRemover(inputCol="words", outputCol="filtered")]
    ngrams = [
        NGram(n=i, inputCol="filtered", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    cv = [
        CountVectorizer(vocabSize=2**14,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=2)
        for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="rawFeatures")
    ]
    # Output "label" so LogisticRegression's default labelCol picks it up
    label_stringIdx = [StringIndexer(inputCol=inputCol[1], outputCol="label")]
    selector = [
        ChiSqSelector(numTopFeatures=2**14,
                      featuresCol="rawFeatures",
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=1000)]
    return Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                    assembler + label_stringIdx + selector + lr)
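# Usage sketch (not from the original source): assuming DataFrames `train_df`
# and `test_df` with "text" and "label" columns, the pipeline returned by
# preprocess() is fit once and then applied to held-out data.
pipeline = preprocess(n=4)
pipeline_model = pipeline.fit(train_df)          # train_df is hypothetical
predictions = pipeline_model.transform(test_df)  # test_df is hypothetical
predictions.select("label", "prediction").show(5)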
def appendselector(stages, percent=0.5):
    # A chi-squared feature selector uses the chi-squared test of independence
    # to decide which features are the most "useful". In this case, 50% of the
    # original number of features is kept by default. With these transformers,
    # the stages for training hybrid classifiers are set (different transformers
    # for TF-IDF and word-embedding text-based features).
    if percent < 1.0:
        print("Appending Chi-Square to stages with percentage " + str(percent))
        selectorType = 'percentile'
        numTopFeatures = 50
        percentile = percent
    else:
        print("Appending Chi-Square to stages with numTopFeatures " + str(percent))
        selectorType = 'numTopFeatures'
        numTopFeatures = percent
        percentile = 0.1
    stages[-1].setOutputCol('prefeatures')
    selector = ChiSqSelector(numTopFeatures=numTopFeatures,
                             featuresCol='prefeatures',
                             outputCol='features',
                             selectorType=selectorType,
                             percentile=percentile)
    selectorstages = stages + [selector]
    return selectorstages
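# Usage sketch (hypothetical, not from the original source): `stages` is
# assumed to be a list whose last transformer produces the feature vector.
# percent < 1.0 keeps that fraction of features (percentile mode); a value
# >= 1.0 is treated as a fixed feature count (numTopFeatures mode).
from pyspark.ml import Pipeline
stages_with_selector = appendselector(stages, percent=0.5)
model = Pipeline(stages=stages_with_selector).fit(train_df)  # train_df assumed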
def test_chi_sq_selector(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
         (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
         (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
        ["features", "label"])
    selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
    model = selector.fit(data)

    # the ONNX input name should match the selector's features column
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml ChiSqSelector',
        [('features', FloatTensorType([None, feature_count]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().selectedFeatures.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlChiSqSelector")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['selectedFeatures'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "Edad", "Genero", "Zona", "Fumador_Activo",
        "ultimo_estado_de_Glicemia", "Enfermedad_Coronaria",
        "Tension_sistolica", "Tension_diastolica", "Colesterol_Total",
        "Trigliceridos", "Clasificacion_RCV_Global", "Glicemia_de_ayuno",
        "Perimetro_Abdominal", "Peso", "IMC", "CLAIFICACION_IMC", "Creatinina",
        "Factor_correccion", "Proteinuria", "Farmacos_Antihipertensivos",
        "Estatina", "Antidiabeticos", "Adherencia_tratamiento"
    ], outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=15)
    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=15,
                             featuresCol="indexedFeatures",
                             labelCol="Diabetes",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show(100)
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys",
        "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl",
        "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance",
        "ShotPower", "Jumping", "Stamina", "Strength", "LongShots",
        "Aggression", "Interceptions", "Positioning", "Vision", "Penalties",
        "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving",
        "GKHandling", "GKKicking", "GKPositioning", "GKReflexes"
    ], outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=4)
    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=5,
                             featuresCol="indexedFeatures",
                             labelCol="Position",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
def Chi_sqr(dataset_add, feature_colm, label_colm):
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
    dataset.show()

    # use RFormula for indexing, encoding and vectorizing
    label = ''
    for y in label_colm:
        label = y
    print(label)
    f = label + " ~ "
    for x in feature_colm:
        f = f + x + "+"
    f = f[:-1]
    formula = RFormula(formula=f, featuresCol="features", labelCol="label")
    length = len(feature_colm)
    output = formula.fit(dataset).transform(dataset)
    output.select("features", "label").show()

    # chi-squared selector
    from pyspark.ml.feature import ChiSqSelector
    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="features",
                             outputCol="selected_features",
                             labelCol="label")
    result = selector.fit(output).transform(output)
    print("chi2 output with top %d features selected" %
          selector.getNumTopFeatures())
    result.show()

    # run the chi-squared value test
    r = ChiSquareTest.test(result, "selected_features", "label").head()
    print("pValues: " + str(r.pValues))
    p_values = str(r.pValues)
    print("degreesOfFreedom: " + str(r.degreesOfFreedom))
    print("statistics: " + str(r.statistics))
    json_response = {'pvalues': p_values}
    return json_response

# Chi_sqr(dataset_add, features_colm, label_colm)
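# Hypothetical invocation (the path and column names below are illustrative
# only); a SparkSession named `spark` is assumed to exist:
response = Chi_sqr("data/patients.csv",
                   feature_colm=["age", "chol", "thalach"],
                   label_colm=["target"])
print(response['pvalues'])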
def feature_selector_process(spark, ml_df, spark_artefacts_dir, run_mode, i,
                             feature_cols):
    # APPLY CHI-SQUARE SELECTOR
    name = f"ChiSquareSelectorModel_{i}"
    selector_model_path = Path(spark_artefacts_dir).joinpath(name)

    if run_mode == 'first':
        # ChiSq test to obtain chi-squared values (higher -> more dependence
        # between feature and label -> better)
        r = ChiSquareTest.test(ml_df, "features", "label")
        pValues = r.select("pvalues").collect()[0][0].tolist()
        stats = r.select("statistics").collect()[0][0].tolist()
        dof = r.select("degreesOfFreedom").collect()[0][0]

        # ChiSq selector
        selector = ChiSqSelector(numTopFeatures=10,
                                 featuresCol="features",
                                 outputCol="selected_features",
                                 labelCol="label")
        selector_model = selector.fit(ml_df)
        selector_model.write().overwrite().save(
            str(selector_model_path.absolute()))

        top_10_features_importance = []
        top_10_features = []
        for j in selector_model.selectedFeatures:
            top_10_features_importance.append(feature_cols[j])
            top_10_features.append(feature_cols[j])
            top_10_features_importance.append(stats[j])

        model_info = [
            name, ml_df.count(), None, None, None, None, None, None, None
        ] + top_10_features_importance
        model_info_df = spark.createDataFrame(data=[model_info],
                                              schema=MODEL_INFO_SCHEMA)
        model_info_df.write.jdbc(CONNECTION_STR, 'model_info', mode='append',
                                 properties=CONNECTION_PROPERTIES)

    elif run_mode == 'incremental':
        selector_model = ChiSqSelectorModel.load(
            str(selector_model_path.absolute()))
        top_10_features = []
        for j in selector_model.selectedFeatures:
            top_10_features.append(feature_cols[j])

    ml_df_10 = selector_model.transform(ml_df)
    ml_df_10 = ml_df_10.drop("features")

    # Work around a problem with ChiSqSelector and tree-based algorithms:
    # rebuild the selected features as a DenseVector.
    ml_rdd_10 = ml_df_10.rdd.map(
        lambda row: Row(label=row[0], features=DenseVector(row[1].toArray())))
    ml_df_10 = spark.createDataFrame(ml_rdd_10)

    return ml_df_10, top_10_features
def feature_selection(t_data):
    # Feature selection: keep the 10 features most associated with the label
    css = ChiSqSelector(featuresCol='scaled_features',
                        outputCol='Aspect',
                        labelCol='output',
                        numTopFeatures=10)
    t_data = css.fit(t_data).transform(t_data)
    return t_data
def chiSquareTest(self, categoricalFeatures, maxCategories):
    dataset = self.dataset
    labelColm = self.labelColm
    features = self.features
    length = len(features)

    featureassembler = VectorAssembler(inputCols=self.features,
                                       outputCol="featuresChiSquare",
                                       handleInvalid="skip")
    dataset = featureassembler.transform(dataset)

    vec_indexer = VectorIndexer(inputCol="featuresChiSquare",
                                outputCol='vecIndexedFeaturesChiSqaure',
                                maxCategories=maxCategories,
                                handleInvalid="skip").fit(dataset)
    categorical_features = vec_indexer.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categorical_features),
           ", ".join(str(k) for k in categorical_features.keys())))
    dataset = vec_indexer.transform(dataset)
    # finalized_data = dataset.select(labelColm, 'vecIndexedFeaturesChiSqaure')
    # finalized_data.show()

    # chi-squared selector
    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="vecIndexedFeaturesChiSqaure",
                             outputCol="selectedFeatures",
                             labelCol=labelColm)
    result = selector.fit(dataset).transform(dataset)
    print("chi2 output with top %d features selected" %
          selector.getNumTopFeatures())
    result.show()

    # run the chi-squared value test
    r = ChiSquareTest.test(result, "selectedFeatures", labelColm).head()
    p_values = list(r.pValues)
    PValues = [round(val, 4) for val in p_values]
    print(PValues)
    dof = list(r.degreesOfFreedom)
    stats = list(r.statistics)
    statistics = [round(val, 4) for val in stats]
    print(statistics)

    chiSquareDict = {}
    for pval, doF, stat, colm in zip(PValues, dof, statistics,
                                     categoricalFeatures):
        print(pval, doF, stat)
        chiSquareDict[colm] = pval, doF, stat
    chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
    print(chiSquareDict)
    result = {'pvalues': chiSquareDict}
    return result
def clasificar_chi2():
    # Read the data and cast each column's values to float
    conf = SparkConf().setAppName("NN_1").setMaster("local")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    rdd = sqlContext.read.csv(
        "/home/ulima-azure/data/Enfermedad_Oncologica_T3.csv",
        header=True).rdd
    rdd = rdd.map(lambda x: (float(x[0]), float(x[1]), float(x[2]),
                             float(x[3]), float(x[4]), float(x[5]),
                             float(x[6]), float(x[7]), float(x[8]),
                             float(x[9])))
    df = rdd.toDF([
        "Cellenght", "Cellsize", "Cellshape", "mgadhesion", "sepics",
        "bnuclei", "bchromatin", "nucleos", "mitoses", "P_Benigno"
    ])

    # Build the feature vector (VectorAssembler)
    assembler = VectorAssembler(inputCols=[
        "Cellenght", "Cellsize", "Cellshape", "nucleos", "bchromatin",
        "mitoses"
    ], outputCol="featuresChi2")
    df_chi2 = assembler.transform(df)
    df_chi2 = df_chi2.select("featuresChi2", "P_Benigno")

    selector = ChiSqSelector(numTopFeatures=3,
                             featuresCol="featuresChi2",
                             labelCol="P_Benigno",
                             outputCol="featuresSelected")
    df_result = selector.fit(df_chi2).transform(df_chi2)

    # Split the data into training and test sets
    (df_training, df_test) = df_result.randomSplit([0.7, 0.3])

    # Define the network architecture (hyperparameter)
    capas = [3, 4, 6, 2]

    # Build the trainer; maxIter is a hyperparameter
    entrenador = MultilayerPerceptronClassifier(
        featuresCol="featuresSelected",
        labelCol="P_Benigno",
        maxIter=1000,
        layers=capas)

    # Train the model
    modelo = entrenador.fit(df_training)

    # Validate the model
    df_predictions = modelo.transform(df_test)
    evaluador = MulticlassClassificationEvaluator(labelCol="P_Benigno",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluador.evaluate(df_predictions)
    print(f"Accuracy: {accuracy}")
    df_predictions.select("prediction", "rawPrediction", "probability").show()

    # Show the counts of 0s and 1s among the predictions
    df_predictions.groupby('prediction').count().show()
def MachineLearning(df):
    file_dataSVM = "G:/Projects/Spark-Machine-Learning/Spark Machine Learning/Spark Machine Learning/svm/"
    data = df.select(['Summary', 'Sentiment']).withColumnRenamed('Sentiment', 'label')
    data = data.withColumn('length', length(data['Summary']))

    # Basic sentence tokenizer
    tokenizer = Tokenizer(inputCol="Summary", outputCol="words")
    # Remove stop words
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_features")
    # Transform the dataset to term-frequency vectors
    cv = HashingTF(inputCol="filtered_features", outputCol="features1",
                   numFeatures=1000)
    # Calculate IDF over the whole dataset
    idf = IDF(inputCol='features1', outputCol='tf_idf')
    normalizer = StandardScaler(inputCol="tf_idf", outputCol="normFeatures",
                                withStd=True, withMean=False)
    selector = ChiSqSelector(numTopFeatures=150, featuresCol="normFeatures",
                             outputCol="selectedFeatures", labelCol="label")
    # Prepare data for the Spark ML library
    cleanUp = VectorAssembler(inputCols=['selectedFeatures'],
                              outputCol='features')

    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, normalizer,
                                selector, cleanUp])
    pipelineModel = pipeline.fit(data)
    data = pipelineModel.transform(data)
    data.printSchema()

    train_data, test_data = data.randomSplit([0.7, 0.3], seed=2018)
    lr = LogisticRegression(featuresCol="features", labelCol='label')
    lrModel = lr.fit(train_data)

    beta = np.sort(lrModel.coefficients)
    plt.plot(beta)
    plt.ylabel('Beta Coefficients')
    plt.show()

    trainingSummary = lrModel.summary
    roc = trainingSummary.roc.toPandas()
    # ROC plots the false positive rate on x and the true positive rate on y
    plt.plot(roc['FPR'], roc['TPR'])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()
    print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))

    pr = trainingSummary.pr.toPandas()
    plt.plot(pr['recall'], pr['precision'])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.show()

    predictions = lrModel.transform(test_data)
    evaluator = BinaryClassificationEvaluator()
    print('Test Area Under ROC', evaluator.evaluate(predictions))
def run_feature_selection_on(data):
    LOGGER.warning("Running feature selection.")
    selector = ChiSqSelector(numTopFeatures=10,
                             featuresCol="features",
                             outputCol="selectedFeatures",
                             labelCol="label")
    data = selector.fit(data).transform(data).drop('features') \
                   .withColumnRenamed('selectedFeatures', 'features')
    LOGGER.warning("Ran feature selection.")
    return data
def pruebaChi(dataframe, categoricalCols, numericalCols, labelCol="TIPO PACIENTE"):
    """Runs the full preprocessing of the categorical data in a training
    (or other) dataset.

    :param dataframe: Spark DataFrame with the training data.
    :param categoricalCols list,array: names of the categorical columns in the dataset.
    :param numericalCols list,array: names of the numerical columns in the dataset.
    :param labelCol str: target variable (label).
    :returns: Spark DataFrame with the 'label' and 'features' columns.
    """
    # Encode all categorical variables
    stages = []
    for categoricalCol in categoricalCols:
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + "Index")
        encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                                outputCol=categoricalCol + "ohe")
        stages += [stringIndexer, encoder]

    # Target variable (label)
    label_strIdx = StringIndexer(inputCol=labelCol, outputCol="label")
    stages += [label_strIdx]

    # Put all covariates into one vector
    assemblerInputs = [c + "ohe" for c in categoricalCols] + numericalCols
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="feat")
    stages += [assembler]

    # Select the useful variables with ChiSqSelector (false-positive-rate mode)
    selector = ChiSqSelector(featuresCol="feat", outputCol="feature",
                             labelCol="label", fpr=0.05, selectorType='fpr')
    stages += [selector]

    # Scale to the 0-1 range
    scala = MinMaxScaler(inputCol="feature", outputCol="features")
    stages += [scala]

    # Pipeline that runs the whole process
    pipe = Pipeline(stages=stages)
    pipeModel = pipe.fit(dataframe)
    df = pipeModel.transform(dataframe)

    # Return the DataFrame with what we need
    return df
def pre_processing(df):
    '''feature selection'''
    selector = ChiSqSelector(numTopFeatures=1,
                             featuresCol="features",
                             outputCol="selectedFeatures",
                             labelCol="clicked")
    result = selector.fit(df).transform(df)
    print("ChiSqSelector output with top %d features selected" %
          selector.getNumTopFeatures())
    result.show()
    # return the transformed DataFrame so callers can use it
    return result
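# Besides the default selectorType="numTopFeatures" used above, ChiSqSelector
# also supports threshold-based modes; a hedged sketch of two variants on the
# same columns:
from pyspark.ml.feature import ChiSqSelector
percentile_selector = ChiSqSelector(selectorType="percentile", percentile=0.2,
                                    featuresCol="features",
                                    outputCol="selectedFeatures",
                                    labelCol="clicked")  # keep the top 20% of features
fpr_selector = ChiSqSelector(selectorType="fpr", fpr=0.05,
                             featuresCol="features",
                             outputCol="selectedFeatures",
                             labelCol="clicked")  # keep features with p-value below 0.05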
def getAllMeasure(rf, selectorData, featureCols):
    # Sweep numTopFeatures from 1 to the full feature count, training and
    # evaluating a model at each size.
    measure = np.array([' ', ' ', ' '])
    for i in range(1, len(featureCols) + 1):
        selector = ChiSqSelector(numTopFeatures=i,
                                 featuresCol="features",
                                 outputCol="selectedFeatures",
                                 labelCol="label")
        selectedData = selector.fit(selectorData).transform(selectorData)
        trainSelected, testSelected = selectedData.randomSplit([0.7, 0.3])
        rfModel = rf.fit(trainSelected)
        prediction = rfModel.transform(testSelected)
        evaluator = BinaryClassificationEvaluator()
        measure = np.vstack([evaluateLr(prediction, evaluator, i), measure])
    return measure
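# `evaluateLr` is not shown in this snippet; a plausible stand-in (an
# assumption, not the original helper) that records the feature count and
# both BinaryClassificationEvaluator metrics as one row:
import numpy as np

def evaluateLr(prediction, evaluator, i):
    auc_roc = evaluator.evaluate(prediction)  # default metric: areaUnderROC
    auc_pr = evaluator.evaluate(prediction,
                                {evaluator.metricName: "areaUnderPR"})
    return np.array([str(i), str(auc_roc), str(auc_pr)])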
def initializePipeline(num_cols, cat_cols):
    cat_cols_index = [i + "_index" for i in cat_cols]
    cat_cols_hoted = [i + "_hoted" for i in cat_cols]

    featureCols = [i + "scaled" for i in num_cols] + \
                  [i + "_hoted" for i in cat_cols]

    labelindexers = [StringIndexer(inputCol="Churn", outputCol="label")]
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index")
        for column in cat_cols
    ]
    oneHotEncoder = [
        OneHotEncoderEstimator(inputCols=cat_cols_index,
                               outputCols=cat_cols_hoted,
                               dropLast=False)
    ]
    # Wrap each numeric column in its own single-column vector so it can be
    # scaled individually (the original passed all of num_cols to every
    # assembler, which would duplicate the numeric features)
    assembler = [
        VectorAssembler(inputCols=[i], outputCol=i + "_indexe")
        for i in num_cols
    ]
    normalizers = [
        MinMaxScaler(inputCol=column + "_indexe", outputCol=column + "scaled")
        for column in num_cols
    ]
    featureAssembler = [
        VectorAssembler(inputCols=featureCols, outputCol="resultedfeatures")
    ]
    selector = [
        ChiSqSelector(numTopFeatures=13,
                      featuresCol="resultedfeatures",
                      outputCol="features",
                      labelCol="label")
    ]
    pipeline = Pipeline(stages=indexers + oneHotEncoder + assembler +
                        normalizers + featureAssembler + labelindexers +
                        selector)
    return pipeline
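# Usage sketch (hypothetical data): fit the returned pipeline, then read the
# chosen feature indices back off the fitted ChiSqSelectorModel (last stage).
pipeline = initializePipeline(num_cols, cat_cols)  # column lists assumed defined
model = pipeline.fit(train_df)                     # train_df assumed to exist
selected = model.stages[-1].selectedFeatures
print("Selected feature indices:", selected)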
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
        "exang", "oldpeak", "slope", "ca", "thal"
    ], outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=4)
    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=4,
                             featuresCol="indexedFeatures",
                             labelCol="target",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
def important_feature_selector(predicted):
    """Uses the Chi-Squared Test to select important features for
    classification, and prints them out.

    Params:
    - predicted (pyspark.sql.DataFrame): The dataset, with predictions
    """
    selector = ChiSqSelector(numTopFeatures=50,
                             featuresCol='presence_feature_set',
                             labelCol='label',
                             outputCol='selected_features',
                             selectorType='numTopFeatures')
    model = selector.fit(predicted)
    important_features = model.selectedFeatures

    with open('bag_of_words_labels.json', 'r') as bow_file:
        bow_labels = json.loads(
            bow_file.readlines()[0])  # There is only one line

    important_feature_labels = [
        bow_labels[index] for index in important_features
    ]
    print("=====Important Feature Labels=====")
    print(important_feature_labels)
def build_trigrams(input_cols=("text", "target"), n=3):
    logging.warning("Building trigram model.")
    tokenizer = [Tokenizer(inputCol=input_cols[0], outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    cv = [
        CountVectorizer(vocabSize=2**14,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=5)
        for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="rawFeatures")
    ]
    label_string_idx = [
        StringIndexer(inputCol=input_cols[1], outputCol="label")
    ]
    selector = [
        ChiSqSelector(numTopFeatures=2**14,
                      featuresCol='rawFeatures',
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler +
                    label_string_idx + selector + lr)
def feature_selection(df):
    # Create the VectorAssembler
    assembler = VectorAssembler(inputCols=[
        "EDAD", "GENERO", "ETNIA", "ZONA", "ESCOLARIDAD", "FUMADOR", "HAS",
        "HTADM", "GLICEMIA", "ENF_CORONARIA", "T_SISTOLICA", "T_DIASTOLICA",
        "COLESTEROL_TOTAL", "TRIGLICERIDOS", "RCV_GLOBAL", "GLICEMIA_AYUNO",
        "PERIMETRO_ABDOMINAL", "PESO", "TALLA", "IMC", "CREATININA",
        "MICROALBUMINURIA", "ESTADO_IRC", "FARMACOS_ANTIHIPERTENSIVOS"
    ], outputCol="features")
    df = assembler.transform(df)

    # VectorIndexer
    indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")
    df = indexer.fit(df).transform(df)

    # Chi-squared test
    selector = ChiSqSelector(numTopFeatures=8,
                             featuresCol="indexedFeatures",
                             labelCol="DIABETES",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
for feature in feature_cols:
    indexed = feature + "_" + "indexed"
    indexed_cols.append(indexed)
    indexer = StringIndexer(inputCol=feature,
                            outputCol=indexed,
                            handleInvalid="keep",
                            stringOrderType="frequencyDesc")
    stages.append(indexer)

stages.append(
    VectorAssembler(inputCols=indexed_cols,
                    outputCol="features",
                    handleInvalid="keep"))
stages.append(
    ChiSqSelector(numTopFeatures=20,
                  labelCol="HasDetections",
                  featuresCol="features",
                  outputCol="selectedFeatures"))

print("Performing model fitting")
pipeline = Pipeline(stages=stages)
model = pipeline.fit(df)
df_features = model.transform(df)
df_features.select("features", "selectedFeatures").show()

print("Saving Pipeline Model")
model.write().overwrite().save(pipeline_model_path)
with open(feature_path, "wb") as f:
    pickle.dump(feature_cols, f)

features = model.stages[-1].selectedFeatures
def issue_impact_process(ml_df, columns, project, organization):
    # Chi-squared test
    r = ChiSquareTest.test(ml_df, "features", "label")
    pValues = r.select("pvalues").collect()[0][0].tolist()
    stats = r.select("statistics").collect()[0][0].tolist()
    dof = r.select("degreesOfFreedom").collect()[0][0]

    # ChiSq selector
    selector = ChiSqSelector(numTopFeatures=10,
                             featuresCol="features",
                             outputCol="selected_features",
                             labelCol="label")
    selector_model = selector.fit(ml_df)

    top_10_features_importance = []
    for j in selector_model.selectedFeatures:
        top_10_features_importance.append(columns[j])
        top_10_features_importance.append(stats[j])

    top_issue_lines = []
    data_count = ml_df.count()

    # First importance value being 0 => skip
    if top_10_features_importance[1] != 0:
        top_issue_lines.append(
            [organization, project, "ChiSquareSelectorModel", data_count] +
            top_10_features_importance)
    else:
        print("\tFirst ChiSquare selected issue's importance is 0")

    # Tree-based algorithms' feature importances
    dt = DecisionTreeClassifier(featuresCol='features', labelCol='label',
                                maxDepth=3)
    rf = RandomForestClassifier(featuresCol='features', labelCol='label',
                                numTrees=10)
    for algo, model_name in [(dt, "DecisionTreeModel"),
                             (rf, "RandomForestModel")]:
        model = algo.fit(ml_df)
        f_importances = model.featureImportances
        indices = f_importances.indices.tolist()
        values = f_importances.values.tolist()
        if len(values) < 2:
            print(f"\tOnly less or equal to 1 significant issue for model "
                  f"{model_name}. Skipping writing to database.")
            continue

        value_index_lst = list(zip(values, indices))
        value_index_lst.sort(key=lambda x: x[0], reverse=True)

        importance_sorted_features = []
        for value, index in value_index_lst:
            importance_sorted_features.append(columns[index])
            importance_sorted_features.append(value)

        # Pad or truncate to exactly 20 entries (10 feature/importance pairs)
        length = len(importance_sorted_features)
        if length > 20:
            importance_sorted_features = importance_sorted_features[:20]
        elif length < 20:
            importance_sorted_features = importance_sorted_features + \
                (20 - length) * [None]

        top_issue_lines.append(
            [organization, project, model_name, data_count] +
            importance_sorted_features)

    if len(top_issue_lines) > 0:
        top_issue_df = spark.createDataFrame(data=top_issue_lines,
                                             schema=TOP_ISSUE_SCHEMA)
        top_issue_df.write.jdbc(CONNECTION_STR, 'top_issues', mode='append',
                                properties=CONNECTION_PROPERTIES)
pe = PolynomialExpansion().setInputCol("features").setDegree(2).setOutputCol(
    "polyFeatures")
pe.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
  .transform(sales.select("Description", "CustomerId"))\
  .where("CustomerId IS NOT NULL")
prechi = fittedCV.transform(tokenized)\
  .where("CustomerId IS NOT NULL")
chisq = ChiSqSelector()\
  .setFeaturesCol("countVec")\
  .setLabelCol("CustomerId")\
  .setNumTopFeatures(2)
chisq.fit(prechi).transform(prechi)\
  .drop("customerId", "Description", "DescOut").show()

# COMMAND ----------

fittedPCA = pca.fit(scaleDF)
fittedPCA.write().overwrite().save("/tmp/fittedPCA")

# COMMAND ----------

from pyspark.ml.feature import PCAModel

loadedPCA = PCAModel.load("/tmp/fittedPCA")
loadedPCA.transform(scaleDF).show()
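# COMMAND ----------

# Note: `fittedCV` above is assumed to be a CountVectorizerModel fit earlier,
# producing the "countVec" column the selector consumes; a minimal sketch
# (parameters illustrative):
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("countVec")
fittedCV = cv.fit(tokenized)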
std_scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
scaled_df = std_scaler.fit(features_df).transform(features_df)
scaled_df.select("scaled_features").display()

# COMMAND ----------

# MAGIC %md ###Part 4: Feature Selection
# MAGIC Chi Square Selector

# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

chisq_selector = ChiSqSelector(numTopFeatures=1,
                               featuresCol="scaled_features",
                               outputCol="selected_features",
                               labelCol="cust_age")

result_df = chisq_selector.fit(scaled_df).transform(scaled_df)
result_df.select("selected_features").display()

# COMMAND ----------

# MAGIC %md Feature Selection using VectorSlicer

# COMMAND ----------

from pyspark.ml.feature import VectorSlicer

vec_slicer = VectorSlicer(inputCol="scaled_features",
class1_num = class1.count()
class2_num = class2.count()
fraction = 1.0 * class1_num / class2_num
class2 = class2.sample(fraction)
training_dataset_balanced = class1.union(class2)
training_dataset_balanced.groupBy("_c41").count().show()

####### 14.1 ###
converted_cols = ["s" + col for col in string_cols]
assembler = VectorAssembler(inputCols=converted_cols + numerical_cols,
                            outputCol="features")
labelIndexer = StringIndexer(inputCol="_c41", outputCol="label")
# classifier = RandomForestClassifier(labelCol="label", featuresCol="features",
#                                     numTrees=10, maxBins=64, maxDepth=5,
#                                     subsamplingRate=1.0)

## 14.2
# classifier = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxBins=64)
selector = ChiSqSelector(numTopFeatures=35,
                         featuresCol="features",
                         outputCol="selectedFeatures")
# Train on the selected features (NaiveBayes would otherwise default to the
# raw "features" column and the selector would have no effect).
# modelType="multinomial" would make no difference here, since the problem
# is binomial.
classifier = NaiveBayes(smoothing=1.0, featuresCol="selectedFeatures")

pipeline = Pipeline(stages=indexers +
                    [assembler, labelIndexer, selector, classifier])
model = pipeline.fit(training_dataset_balanced)

# predictions = model.transform(dataset_testing)  ## 14.1
predictions = model.transform(dataset_testing)  ## 14.2
predictions.show(10, False)

###### 14.2 ####
evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
def add_vectorized_features(self, transform_type, min_df, max_df, isCHISQR,
                            chi_feature_num, num_features):
    '''
    Creates the pySpark feature pipeline and stores the vectorized data under
    the features column.

    Input: transform_type: {'tfidf','tfidf_bigram'},
           minimum document frequency (min_df),
           chi-squared feature reduction flag (isCHISQR),
           number of features kept by chi-squared reduction (chi_feature_num),
           number of features (num_features)
    Output: Returns the transformed dataframe with the label and features columns
    '''
    stages = []

    # Tokenize review sentences into vectors of words
    regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words",
                                    pattern="\\W")
    stages += [regexTokenizer]

    # Remove stopwords from the tokenized words
    # nltk.download('stopwords')
    from nltk.corpus import stopwords
    sw = stopwords.words('english')
    stopwordsRemover = StopWordsRemover(
        inputCol="words", outputCol="filtered").setStopWords(sw)
    # lemmatizer = WordNetLemmatizer()
    # doc = [lemmatizer.lemmatize(token) for token in doc]
    stages += [stopwordsRemover]

    # Use TF-IDF to transform reviews into unigram vectors
    if transform_type == 'tfidf':
        # Create IDF from the filtered words
        hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures",
                              numFeatures=num_features)
        idf = IDF(inputCol="rawFeatures", outputCol="review_vector",
                  minDocFreq=min_df)
        stages += [hashingTF, idf]

    # Use TF-IDF to transform reviews into unigram and bigram vectors
    if transform_type == 'tfidf_bigram':
        # Add unigram and bigram word vectors, then vectorize using TF-IDF
        unigram = NGram(n=1, inputCol='filtered', outputCol='unigrams')
        stages += [unigram]
        bigram = NGram(n=2, inputCol='filtered', outputCol='bigrams')
        stages += [bigram]

        # Create IDF from the unigram words
        hashingTF_unigram = HashingTF(inputCol="unigrams",
                                      outputCol="rawFeatures_unigrams",
                                      numFeatures=num_features)
        idf_unigram = IDF(inputCol="rawFeatures_unigrams",
                          outputCol="unigrams_vector", minDocFreq=min_df)
        stages += [hashingTF_unigram, idf_unigram]

        # Create IDF from the bigram words
        hashingTF_bigram = HashingTF(inputCol="bigrams",
                                     outputCol="rawFeatures_bigrams",
                                     numFeatures=num_features)
        idf_bigram = IDF(inputCol="rawFeatures_bigrams",
                         outputCol="bigrams_vector", minDocFreq=min_df)
        stages += [hashingTF_bigram, idf_bigram]

        ngrams = VectorAssembler(
            inputCols=['unigrams_vector', 'bigrams_vector'],
            outputCol='review_vector')
        stages += [ngrams]

    assemblerInputs = ['review_vector']
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol="unstandard_features")
    stages += [assembler]

    if isCHISQR:
        chi_selector = ChiSqSelector(numTopFeatures=chi_feature_num,
                                     featuresCol="unstandard_features",
                                     outputCol="chisq_features",
                                     labelCol="label")
        stages += [chi_selector]
        scaler = StandardScaler(inputCol="chisq_features",
                                outputCol="features",
                                withStd=True, withMean=False)
        stages += [scaler]
    else:
        scaler = StandardScaler(inputCol="unstandard_features",
                                outputCol="features",
                                withStd=True, withMean=False)
        stages += [scaler]

    pipeline = Pipeline(stages=stages)
    pipelineFit = pipeline.fit(self.df)
    self.df = pipelineFit.transform(self.df)
    return self.df
# assemble all features into a feature vector
features_assembler = VectorAssembler(inputCols=num_bool_features,
                                     outputCol="features")

# Index labels, adding metadata to the label column.
label_indexer = StringIndexer(inputCol="has_over_50k",
                              outputCol="label").fit(processed_train_set)

# Convert indexed labels back to original labels.
label_converter = IndexToString(inputCol="prediction",
                                outputCol="predicted_label",
                                labels=label_indexer.labels)

# - ChiSq feature selection
selector = ChiSqSelector(numTopFeatures=20,
                         featuresCol="features",
                         outputCol="featuresSel",
                         labelCol="label")

# - RandomForest model with parameter tuning using cross-validation
rf = RandomForestClassifier(labelCol="label",
                            featuresCol="featuresSel",
                            numTrees=20)

# - Create ParamGrid for cross-validation
rf_param_grid = (ParamGridBuilder()
                 .addGrid(rf.maxDepth, [2, 3, 4, 5, 10, 20])
                 .addGrid(rf.maxBins, [10, 20, 40, 80, 100])
                 .build())

# - Model evaluation
rf_eval = BinaryClassificationEvaluator(labelCol="label")
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

working_cols = df.columns
working_cols.remove("ID")
working_cols.remove("Target")

# This concatenates all feature columns into a single feature vector
# in a new column "rawFeatures".
vectorAssembler = VectorAssembler(inputCols=working_cols,
                                  outputCol="rawFeatures")

# Execute the VectorAssembler
assembled_df = vectorAssembler.transform(df)

# Select features
selector = ChiSqSelector(numTopFeatures=5,
                         featuresCol="rawFeatures",
                         outputCol="selectedFeatures",
                         labelCol="Target")

# Execute the selector
selected_df = selector.fit(assembled_df).transform(assembled_df)

# Display the results
print("ChiSqSelector output with top %d features selected" %
      selector.getNumTopFeatures())
display(selected_df.select("rawFeatures", "selectedFeatures"))

# COMMAND ----------

display(assembled_df)
df = spark.createDataFrame([
    (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
    (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
    (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0),
], ["id", "features", "clicked"])

selector = ChiSqSelector(numTopFeatures=1,
                         featuresCol="features",
                         outputCol="selectedFeatures",
                         labelCol="clicked")

result = selector.fit(df).transform(df)
print("ChiSqSelector output with top %d features selected" %
      selector.getNumTopFeatures())
result.show()

# COMMAND ----------

### Locality-sensitive hashing (LSH) is used in clustering data
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
def main():
    # initialize the Spark session (this file is Python 2 code: print
    # statements and tuple-unpacking lambdas)
    spark = SparkSession\
        .builder\
        .appName("Malware Random Forests")\
        .getOrCreate()
    sc = spark.sparkContext
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId",
                                      "AKIAI5AGW2NXUVOJ7L2A")
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey",
                                      "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")

    # read file names
    fileNames = sys.argv
    X_train = sc.textFile(fileNames[1], 40)
    y_train = sc.textFile(fileNames[2], 40)

    # paths to test data
    X_test = sc.textFile(fileNames[3], 40) if len(fileNames) >= 4 else None
    y_test = sc.textFile(fileNames[4], 40) if len(fileNames) == 5 else None
    print y_test
    print "printing ytest!!!!!"

    # get train bytes features
    preprocessor_b = preprocessor_bytes(most_common_bigrams=True)
    hashes_and_labels = preprocessor_b.hashes_and_labels(X_train, y_train)
    s3_path = preprocessor_b.get_s3_path(X_train)
    bytes = sc.wholeTextFiles(','.join(s3_path), 40)
    train_bytes = preprocessor_b.transform(bytes, hashes_and_labels)
    print "train_bytes done!!!!!!!"

    # get asm features
    preprocessor_a = preprocessor_asm()
    s3_path = preprocessor_a.get_s3_path(X_train)
    metadata = sc.wholeTextFiles(','.join(s3_path), 40)
    train_asm = preprocessor_a.transform(metadata, hashes_and_labels,
                                         train=True)
    print train_asm.show()
    print "train_asm done!!!!!!!"

    # get test bytes data
    hashes_and_labels = preprocessor_b.hashes_and_labels(X_test, y_test)
    s3_path = preprocessor_b.get_s3_path(X_test)
    bytes = sc.wholeTextFiles(','.join(s3_path), 40)
    test_bytes = preprocessor_b.transform(
        bytes, hashes_and_labels
    ) if y_test is not None else preprocessor_b.transform(bytes)
    print "test_bytes done!!!!!!!"

    # get test asm data
    s3_path = preprocessor_a.get_s3_path(X_test)
    metadata = sc.wholeTextFiles(','.join(s3_path), 40)
    test_asm = preprocessor_a.transform(
        metadata, hashes_and_labels,
        train=False) if y_test is not None else preprocessor_a.transform(
            metadata, hashes_and_labels=None, train=False)
    print "test_asm done!!!!!!!"

    # chi-squared feature selectors (labelCol defaults to "label")
    selector1 = ChiSqSelector(numTopFeatures=150, outputCol="selectedFeatures")
    selectormodel1 = selector1.fit(train_bytes)
    train_bytes = selectormodel1.transform(train_bytes)
    test_bytes = selectormodel1.transform(test_bytes)
    print "ChiSqSelector bytes done!!!!!!!"

    selector2 = ChiSqSelector(numTopFeatures=150, outputCol="selectedFeatures")
    selectormodel2 = selector2.fit(train_asm)
    train_asm = selectormodel2.transform(train_asm)
    test_asm = selectormodel2.transform(test_asm)
    print "ChiSqSelector asm done!!!!!!!"

    # to RDDs
    train_bytes = train_bytes.select(
        'hash', 'selectedFeatures',
        'label').rdd.map(lambda (hash, feats, label): (hash, (feats, label)))
    train_asm = train_asm.select(
        'hash', 'selectedFeatures',
        'label').rdd.map(lambda (hash, feats, label): (hash, (feats, label)))

    # merge train bytes and train asm data
    train = train_bytes.join(train_asm).map(lambda (hash, (
        (bytes, label), (asm, label2))): (hash, bytes, asm, label))
    schema = StructType([
        StructField('hash', StringType(), True),
        StructField('bytes', VectorUDT(), True),
        StructField('asm', VectorUDT(), True),
        StructField('label', StringType(), True)
    ])
    train = train.toDF(schema)
    train = train.withColumn('label', train.label.cast(DoubleType()))
    print train.show()
    print "Final TrainDF done!!!!!!!"

    '''
    we want to use the same pipeline whether the testing labels are present
    or absent, so the code below builds the test data accordingly.
    The test RDD looks like:
    - When test labels are present: <hash, selectedFeatures, label>
    - When test labels are absent:  <hash, selectedFeatures>
    When labels are absent, there is no 3rd column in the dataframe, and so
    the subsequent dataframes also don't have that column.
    '''
    if y_test is not None:
        test_bytes = test_bytes.select(
            'hash', 'selectedFeatures', 'label').rdd.map(
                lambda (hash, feats, label): (hash, (feats, label)))
        test_asm = test_asm.select(
            'hash', 'selectedFeatures', 'label').rdd.map(
                lambda (hash, feats, label): (hash, (feats, label)))

        # merge test bytes and test asm data
        test = test_bytes.join(test_asm).map(lambda (hash, (
            (bytes, label), (asm, label2))): (hash, bytes, asm, label))
        schema = StructType([
            StructField('hash', StringType(), True),
            StructField('bytes', VectorUDT(), True),
            StructField('asm', VectorUDT(), True),
            StructField('label', StringType(), True)
        ])
        test = test.toDF(schema)
        test = test.withColumn('label', test.label.cast(DoubleType()))
    else:
        test_bytes = test_bytes.select(
            'hash', 'selectedFeatures').rdd.map(lambda (hash, feats):
                                                (hash, feats))
        test_asm = test_asm.select(
            'hash', 'selectedFeatures').rdd.map(lambda (hash, feats):
                                                (hash, feats))

        # merge test bytes and test asm data
        test = test_bytes.join(test_asm).map(lambda (hash, (bytes, asm)):
                                             (hash, bytes, asm))
        schema = StructType([
            StructField('hash', StringType(), True),
            StructField('bytes', VectorUDT(), True),
            StructField('asm', VectorUDT(), True)
        ])
        test = test.toDF(schema)

    print test.show()
    print "Final TestDF done!!!!!!!"

    # merge bytes and asm features
    assembler = VectorAssembler(inputCols=["bytes", "asm"],
                                outputCol="features")
    train = assembler.transform(train)
    test = assembler.transform(test)
    print "VectorAssembler done!!!!!!!"

    # random forest classifier
    rf = RandomForestClassifier(numTrees=100, maxDepth=12, maxBins=32,
                                maxMemoryInMB=512, seed=1)
    model = rf.fit(train)
    result = model.transform(test)

    # save to csv
    # result.select("prediction").toPandas().astype(int).to_csv('prediction.csv', header=False, index=False)
    hash_predictions = result.select("hash", "prediction")
    result.select("hash", "prediction").toPandas().to_csv('./prediction.csv',
                                                          header=False,
                                                          index=False)
    print "Results .csv written"

    # Get all the predictions in the same order as the hashes in X_test and
    # save them to a file
    test_hashes = X_test.zipWithIndex().map(
        lambda x: (x[1], x[0].encode('utf-8')))
    predictions = hash_predictions.collect()
    predictionsBroadCast = sc.broadcast(dict(predictions))
    predicted_labels = test_hashes.map(lambda (indx, docHash): (
        docHash, get_predicted_label(docHash, predictionsBroadCast.value)
    )).map(lambda (docHash, label): int(label)).collect()
    save_predicted_labels(predicted_labels, './predictions.txt')
    print "Results .txt written"

    spark.stop()