def vectorize_data(training_data, test_data):
    # Assemble the feature vectors
    input_columns = training_data.columns
    input_columns.remove(TARGET)
    print("Using these features: {}".format(input_columns))
    vector_assembler = VectorAssembler(inputCols=input_columns, outputCol='features')
    train_df = vector_assembler.transform(training_data)

    # Normalize the data using a StandardScaler fitted on the training set
    scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures',
                            withStd=True, withMean=True).fit(train_df)
    train_df = scaler.transform(train_df)

    # Keep only the columns needed downstream
    train_df = train_df.select(['scaledFeatures', TARGET])

    # Apply the same assembler and scaler to each company's test set
    new_test_data = dict()
    for company in test_data:
        company_data = test_data[company]
        test_df = vector_assembler.transform(company_data)
        test_df = scaler.transform(test_df)
        test_df = test_df.select(['scaledFeatures', TARGET])
        new_test_data[company] = test_df
    return train_df, new_test_data
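# A minimal usage sketch for vectorize_data. The TARGET constant, the
# SparkSession, and the tiny example frames below are assumptions for
# illustration, not part of the original snippet.
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler

spark = SparkSession.builder.getOrCreate()
TARGET = 'label'  # hypothetical target column name

train = spark.createDataFrame([(1.0, 2.0, 0.0), (3.0, 4.0, 1.0)], ['f1', 'f2', TARGET])
tests = {'acme': spark.createDataFrame([(5.0, 6.0, 1.0)], ['f1', 'f2', TARGET])}

train_df, test_dfs = vectorize_data(train, tests)
train_df.show()
test_dfs['acme'].show()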
def preprocess(df, should_undersample, scaler=None):
    """Scale the data and balance the classes using Random Undersampling (RUS)."""
    # Assemble the features so they can be used by MLlib:
    assembler = VectorAssembler(inputCols=[
        "PSSM_r1_1_K", "PSSM_r2_-1_R", "PSSM_central_2_D",
        "PSSM_central_0_A", "PSSM_r1_1_W", "PSSM_central_-1_V"
    ], outputCol="features")
    out = assembler.transform(df).select("features", "class")

    # Random Undersampling (RUS)
    # Before: POS = 550,140, NEG = 1,100,591
    # After:  POS = 550,140, NEG = 549,668
    if should_undersample:
        positive = out.filter(out["class"] == 1.0)
        negative = out.filter(out["class"] == 0.0)
        fraction = float(positive.count()) / float(negative.count())
        negative = negative.sample(withReplacement=False, fraction=fraction, seed=89)
        out = negative.union(positive)

    # Scale, fitting a new scaler only when one was not supplied:
    if scaler is None:
        scaler = StandardScaler(withMean=True, withStd=True,
                                inputCol="features", outputCol="scaled_features")
        scaler = scaler.fit(out)
    out = scaler.transform(out)
    return out, scaler
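# The second return value lets the scaler fitted on the training split be
# reused on held-out data. A minimal calling sketch, assuming train_df and
# test_df are DataFrames with the six PSSM columns and a 'class' column:
train_out, fitted_scaler = preprocess(train_df, should_undersample=True)
test_out, _ = preprocess(test_df, should_undersample=False, scaler=fitted_scaler)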
def standard_scale(dataFrame, inputColNames, usr_withStd=True, usr_withMean=False):
    assembledDF = getAssembledDataFrame(dataFrame, inputColNames)
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaled features",
                            withStd=usr_withStd,
                            withMean=usr_withMean).fit(assembledDF)
    scaledDF = scaler.transform(assembledDF).drop("features")
    return scaledDF
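# getAssembledDataFrame is referenced above but not shown. A minimal sketch of
# the assumed helper: it packs the named columns into a single 'features'
# vector, which is the input column StandardScaler expects here.
from pyspark.ml.feature import VectorAssembler

def getAssembledDataFrame(dataFrame, inputColNames):
    assembler = VectorAssembler(inputCols=inputColNames, outputCol="features")
    return assembler.transform(dataFrame)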
def scaling(dataFrame, inputColName, usr_withStd, usr_withMean):
    outputColName = "scaled " + inputColName
    assembler = VectorAssembler(inputCols=[inputColName],
                                outputCol="features")
    assembledDF = assembler.transform(dataFrame)
    scaler = StandardScaler(inputCol="features",
                            outputCol=outputColName,
                            withStd=usr_withStd,
                            withMean=usr_withMean).fit(assembledDF)
    scaledDF = scaler.transform(assembledDF).drop("features")
    # StandardScaler emits a one-element vector; unwrap it back to a float column
    castVectorToFloat = udf(lambda v: float(v[0]), FloatType())
    scaledDF = scaledDF.withColumn(outputColName, castVectorToFloat(outputColName))
    print("Successfully scaled the column '{0:s}' into the new column '{1:s}'.".format(inputColName, outputColName))
    return scaledDF
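# A brief usage sketch for scaling(); the DataFrame df and its 'age' column
# are hypothetical, and the imports the function relies on are shown here.
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from pyspark.ml.feature import VectorAssembler, StandardScaler

scaled = scaling(df, "age", usr_withStd=True, usr_withMean=False)
scaled.select("age", "scaled age").show(5)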
def preprocess(df):
    df = under_sampling(df)
    indexer = StringIndexer(inputCol="PredSS_central_1",
                            outputCol="PredSS_central_1_indexed")
    assembler = VectorAssembler(inputCols=[
        "PSSM_r1_1_N", "PredSS_central_1_indexed", "AA_freq_central_A",
        "AA_freq_global_H", "PSSM_r1_1_S", "PSSM_r2_-3_Y"
    ], outputCol='features')
    pipeline = Pipeline(stages=[indexer, assembler])
    df_1 = pipeline.fit(df).transform(df).select('features', 'class')
    scale = StandardScaler(withMean=True, withStd=True,
                           inputCol='features', outputCol='scaled_features')
    scale = scale.fit(df_1)
    df_1 = scale.transform(df_1)
    return df_1
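# under_sampling is referenced above but not defined here. A plausible sketch,
# assuming the same RUS approach as the earlier preprocess example (positives
# are class == 1.0, negatives class == 0.0):
def under_sampling(df, seed=89):
    positive = df.filter(df["class"] == 1.0)
    negative = df.filter(df["class"] == 0.0)
    fraction = float(positive.count()) / float(negative.count())
    return positive.union(negative.sample(withReplacement=False,
                                          fraction=fraction, seed=seed))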
featuresForScale = [x for x in mlSourceDFCat.columns if 'Lag' in x]
print(len(featuresForScale))

assembler = VectorAssembler(inputCols=featuresForScale, outputCol="features")
assembled = assembler.transform(mlSourceDFCat).select(col('key'), col('features'))

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False).fit(assembled)
scaler.write().overwrite().save(featureScaleModelFile)
scaledData = scaler.transform(assembled).select('key', 'scaledFeatures')

def extract(row):
    # Flatten the scaled vector into one float per feature
    return (row.key, ) + tuple(float(x) for x in row.scaledFeatures.values)

rdd = scaledData.rdd.map(lambda x: Row(key=x[0], scaledFeatures=DenseVector(x[1].toArray())))
scaledDf = rdd.map(extract).toDF(["key"])

# rename columns to 'scaledKey', 'scaled<feature>', ...
oldColumns = scaledDf.columns
scaledColumns = ['scaledKey']
scaledColumns.extend(['scaled' + str(i) for i in featuresForScale])
scaledOutcome = scaledDf.select([col(oldColumns[index]).alias(scaledColumns[index])
                                 for index in range(0, len(oldColumns))])

# join the scaled columns back onto the unscaled ones
noScaledMLSourceDF = mlSourceDFCat.select([column for column in mlSourceDFCat.columns
                                           if column not in featuresForScale])
newDF = noScaledMLSourceDF.join(scaledOutcome,
                                noScaledMLSourceDF.key == scaledOutcome.scaledKey, 'outer')
newDF.cache()
mlSourceDFCat = newDF
mlSourceDFCat = mlSourceDFCat.fillna(0, subset=[x for x in mlSourceDFCat.columns if 'Lag' in x])
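# Because the fitted scaler is persisted to featureScaleModelFile, a later
# scoring job can reload it instead of refitting; a short sketch of that
# pattern (variable names mirror the snippet above):
from pyspark.ml.feature import StandardScalerModel

scaler = StandardScalerModel.load(featureScaleModelFile)
scoredData = scaler.transform(assembled).select('key', 'scaledFeatures')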
# create vector test_df
assembled_test = assembler.transform(test_df).drop(
    "CRS_DEP_TIME", "DISTANCE", 'vis_distance', 'tmp', 'dew', 'elevation',
    'dest_wnd_speed', 'pagerank', 'pagerank_dest', 'wnd_speed', 'cig_height',
    'dest_vis_distance', 'dest_tmp', 'dest_dew', 'dest_elevation', 'dest_cig_height')

# COMMAND ----------

# DBTITLE 1,Scale Continuous Features
# scale train
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True).fit(assembled_train)
assembled_train = scaler.transform(assembled_train).drop('features')
assembled_train = _convert_vector(assembled_train, 'float32')

# scale val
assembled_val = scaler.transform(assembled_val).drop('features')
assembled_val = _convert_vector(assembled_val, 'float32')

# scale test
assembled_test = scaler.transform(assembled_test).drop('features')
assembled_test = _convert_vector(assembled_test, 'float32')

# COMMAND ----------

# check partition size
assembled_val.rdd.getNumPartitions()
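# _convert_vector is a notebook helper that is not shown here. A hypothetical
# sketch of what it might do, assuming Spark 3.0+ and that the goal is to cast
# the scaled vector to a typed array column for downstream consumers:
from pyspark.ml.functions import vector_to_array

def _convert_vector(df, dtype):
    return df.withColumn('scaledFeatures',
                         vector_to_array('scaledFeatures', dtype))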
### NEED SCALING NOW
#### Standardizing the continuous variables
# The features must be brought onto a comparable, small scale for the neural
# network to train well. Note that StandardScaler standardizes to zero mean /
# unit variance rather than rescaling into the 0-1 range.
vectorAssembler = VectorAssembler(inputCols=selected_covariates_names_updated,
                                  outputCol='features')
vtraining_df = vectorAssembler.transform(training_spark_df)

from pyspark.ml.feature import StandardScaler
standardscaler = StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
standardscaler = standardscaler.fit(vtraining_df)
training_df = standardscaler.transform(vtraining_df)
# The test set must already carry the same assembled 'features' column:
testing_df = standardscaler.transform(testing_df)
#raw_data.select("features","Scaled_features").show(5)

###########################################
### PART IV: Run model and perform assessment ###########################
training_spark_df = sqlContext.createDataFrame(X_y_training_df)
#https://www.guru99.com/pyspark-tutorial.html
#https://towardsdatascience.com/building-a-linear-regression-with-pyspark-and-mllib-d065c3ba246a
vectorAssembler = VectorAssembler(inputCols=selected_covariates_names_updated,
                                  outputCol='features')
vtraining_df = vectorAssembler.transform(training_spark_df)