from pyspark.ml import Pipeline
from pyspark.ml.feature import (Imputer, MinMaxScaler, OneHotEncoderEstimator,
                                StringIndexer, VectorAssembler)


def get_ml1_pipeline():
    stages = []

    # Impute missing numerical values in place.
    imputer = Imputer(inputCols=ML1_NUMERICAL_COLUMNS,
                      outputCols=ML1_NUMERICAL_COLUMNS)
    stages.append(imputer)

    # Index each categorical column, then one-hot encode all indices in a
    # single OneHotEncoderEstimator stage.
    ohe_input_cols = []
    ohe_output_cols = []
    for categorical_column in ML1_CATEGORICAL_COLUMNS:
        str_indexer = StringIndexer(inputCol=categorical_column,
                                    outputCol=categorical_column + "_index",
                                    handleInvalid='keep')
        ohe_input_cols.append(str_indexer.getOutputCol())
        ohe_output_cols.append(categorical_column + "_class_vec")
        stages.append(str_indexer)
    encoder = OneHotEncoderEstimator(inputCols=ohe_input_cols,
                                     outputCols=ohe_output_cols,
                                     handleInvalid="error",
                                     dropLast=False)
    stages.append(encoder)

    # Assemble the numerical columns into one vector and scale it to [0, 1].
    numerical_vector_assembler = VectorAssembler(inputCols=ML1_NUMERICAL_COLUMNS,
                                                 outputCol="numerical_cols_vec",
                                                 handleInvalid="keep")
    scaler = MinMaxScaler(inputCol="numerical_cols_vec",
                          outputCol="scaled_numerical_cols")
    stages.append(numerical_vector_assembler)
    stages.append(scaler)

    # Index the target column into a numeric label.
    label_str_indexer = StringIndexer(inputCol="result", outputCol="label",
                                      handleInvalid="keep")
    stages.append(label_str_indexer)

    # Combine the encoded categorical vectors with the scaled numerical vector.
    assembler_input = encoder.getOutputCols() + [scaler.getOutputCol()]
    assembler = VectorAssembler(inputCols=assembler_input,
                                outputCol="features", handleInvalid="skip")
    stages.append(assembler)

    return Pipeline(stages=stages)
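# Usage sketch for get_ml1_pipeline() (assumed, not from the source): `df` is
# a hypothetical DataFrame containing every column in ML1_NUMERICAL_COLUMNS
# and ML1_CATEGORICAL_COLUMNS plus the "result" target column.
pipeline = get_ml1_pipeline()
model = pipeline.fit(df)           # fits imputer, indexers, encoder, scaler
transformed = model.transform(df)  # adds the "features" and "label" columns
transformed.select("features", "label").show(5)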
def one_hot(datafile):
    spark = init_spark()
    df = spark.read.format("csv").option("header", "true").load(datafile)
    df1 = df.select(
        # "date_account_created",
        # "timestamp_first_active",
        "age",
        "gender",
        "signup_method",
        "signup_flow",
        "language",
        "affiliate_channel",
        "affiliate_provider",
        "first_affiliate_tracked",
        "signup_app",
        "first_device_type",
        "first_browser",
        "country_destination")
    # print(df1)

    # Fill missing ages with the column mean (note: count() includes rows
    # whose age is null), then drop rows that still contain nulls.
    age_average = (df1.agg({"age": "sum"}).collect()[0][0]) / (df1.select("age").count())
    df2 = df1.fillna({'age': age_average})
    # df3 = df2.withColumn("age", when(df["age"] <= 17, age_average).otherwise(df["age"]))
    # indexers = [StringIndexer(inputCol="gender", outputCol="gender_numeric").fit(df2)]
    df3 = df2.dropna()
    data_df = df3.withColumn("age", df3["age"].cast(IntegerType()))

    # Index each categorical column and one-hot encode the resulting indices;
    # `categorical_features` and `numberical_feature` are column-name lists
    # defined elsewhere.
    indexers = [StringIndexer(inputCol=column, outputCol=column + "_index")
                for column in categorical_features]
    encoder = OneHotEncoderEstimator(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=["{0}_encoded".format(indexer.getOutputCol())
                    for indexer in indexers])
    assembler = VectorAssembler(inputCols=encoder.getOutputCols(),
                                outputCol="cat_features")
    # Combine all the numerical features together.
    assembler2 = VectorAssembler(inputCols=numberical_feature,
                                 outputCol="num_features")

    pipeline = Pipeline(stages=indexers + [encoder, assembler, assembler2])
    df_r = pipeline.fit(data_df).transform(data_df)
    # df_r.show()
    return df_r
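# Hypothetical call to one_hot() above (assumed, not from the source):
# "train_users.csv" is a placeholder path, and `categorical_features` /
# `numberical_feature` must be defined as column-name lists beforehand.
encoded = one_hot("train_users.csv")
encoded.select("cat_features", "num_features").show(5, truncate=False)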
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoderEstimator

# Index every categorical column (keeping unseen labels), one-hot encode the
# indices, and assemble the encoded vectors into a single "features" column.
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)).setHandleInvalid("keep")
    for c in categorical
]
encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=["{0}_encoded".format(indexer.getOutputCol())
                for indexer in indexers])
assembler = VectorAssembler(inputCols=encoder.getOutputCols(),
                            outputCol="features")

stages = indexers + [encoder, assembler]
pipeline = Pipeline(stages=stages)

one_hot_encoder = pipeline.fit(df_train)
df_train_encoded = one_hot_encoder.transform(df_train)
df_train_encoded.show()
df_train_encoded = df_train_encoded.select(["label", "features"])
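# Hedged follow-on (assumed, not from the source): reuse the fitted
# `one_hot_encoder` on a held-out frame so test rows get the same
# index/encoding mapping as training rows; `df_test` is hypothetical.
df_test_encoded = one_hot_encoder.transform(df_test).select(["label", "features"])
df_test_encoded.show(5)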
# (list head truncated in the source: `indexers` is a list of StringIndexer
# stages over the categorical columns, closed by the bracket below)
]
label = StringIndexer(inputCol=target[0], outputCol="label")

# One-hot encode categorical features to reduce dimensionality for model
# training. Note: one-hot encoding maps a categorical feature, represented as
# a label index, to a binary vector with at most a single one-value indicating
# the presence of a specific feature value from the set of all feature values.
encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=["{0}_encoded".format(indexer.getOutputCol())
                for indexer in indexers],
    dropLast=False)

# Combine all the feature columns into a single vector column.
assembler = VectorAssembler(inputCols=encoder.getOutputCols() + integerCols,
                            outputCol="features")

# Create an instance of a logistic regression model.
lr = LogisticRegression(maxIter=10)

# Lay down the pipeline for model fitting.
pipeline = Pipeline(stages=indexers + [encoder, assembler, label, lr])

# Fit the train split with the LogisticRegression model using the pipeline.
lr_model = pipeline.fit(trainingData)

# Make predictions on the test data using the transform method.
predictions = lr_model.transform(testData)

# Extract true label, prediction, and probability to compute log loss.
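# Hedged sketch of the log-loss computation described above (assumed
# implementation, not from the source): take the predicted probability of the
# true class from the `probability` vector and average -log(p) over all rows.
import math
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

eps = 1e-15  # clip probabilities to avoid log(0)
neg_log_prob = udf(
    lambda prob, label: -math.log(max(min(prob[int(label)], 1.0 - eps), eps)),
    DoubleType())
log_loss = (predictions
            .withColumn("log_loss", neg_log_prob("probability", "label"))
            .agg({"log_loss": "mean"})
            .collect()[0][0])
print("log loss: {0:.4f}".format(log_loss))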
def main(spark):
    getCsv()
    schema = StructType([
        StructField("age", IntegerType(), True),
        StructField("workclass", StringType(), True),
        StructField("fnlwgt", IntegerType(), True),
        StructField("education", StringType(), True),
        StructField("education-num", IntegerType(), True),
        StructField("marital-status", StringType(), True),
        StructField("occupation", StringType(), True),
        StructField("relationship", StringType(), True),
        StructField("race", StringType(), True),
        StructField("sex", StringType(), True),
        StructField("capital-gain", IntegerType(), True),
        StructField("capital-loss", IntegerType(), True),
        StructField("hours-per-week", IntegerType(), True),
        StructField("native-country", StringType(), True),
        StructField("salary", StringType(), True)
    ])
    train_df = spark.read.csv('train.csv', header=False, schema=schema)
    test_df = spark.read.csv('test.csv', header=False, schema=schema)
    print(train_df.limit(5).toPandas())

    categorical_variables = [
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'native-country'
    ]
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "-index")
        for column in categorical_variables
    ]
    encoder = OneHotEncoderEstimator(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=["{0}-encoded".format(indexer.getOutputCol())
                    for indexer in indexers])
    assembler = VectorAssembler(inputCols=encoder.getOutputCols(),
                                outputCol="categorical-features")
    pipeline = Pipeline(stages=indexers + [encoder, assembler])

    # Fit the pipeline on the training data only, then apply the same fitted
    # model to both splits so train and test share one encoding.
    pipeline_model = pipeline.fit(train_df)
    train_df = pipeline_model.transform(train_df)
    test_df = pipeline_model.transform(test_df)

    train_df.printSchema()
    df = train_df.limit(5).toPandas()
    print(df['categorical-features'][1])

    continuous_variables = [
        'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
        'hours-per-week'
    ]
    assembler = VectorAssembler(
        inputCols=['categorical-features', *continuous_variables],
        outputCol='features')
    train_df = assembler.transform(train_df)
    test_df = assembler.transform(test_df)
    print(train_df.limit(5).toPandas()['features'][0])

    # Likewise, fit the label indexer once on the training split only.
    indexer = StringIndexer(inputCol='salary', outputCol='label',
                            handleInvalid="skip")
    indexer_model = indexer.fit(train_df)
    train_df = indexer_model.transform(train_df)
    test_df = indexer_model.transform(test_df)

    lr = LogisticRegression(featuresCol='features', labelCol='label')
    model = lr.fit(train_df)
    pred = model.transform(test_df)
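# Hedged follow-on (assumed, not from the source): score the test predictions
# from main() with Spark's binary evaluator (default metric: areaUnderROC).
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="rawPrediction")
print("AUC: {0:.4f}".format(evaluator.evaluate(pred)))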
# (head truncated in the source: the OneHotEncoderEstimator call assigned to
# `ohe` opens above, with the first eight *_indexer.getOutputCol() entries)
        tgt_agent_indexer.getOutputCol(),
        tgt_other_agent_indexer.getOutputCol()
    ],
    outputCols=[
        "country_code_ohe", "geoname_ohe", "source_ohe", "src_actor_ohe",
        "src_agent_ohe", "src_other_agent_ohe", "target_ohe", "tgt_actor_ohe",
        "tgt_agent_ohe", "tgt_other_agent_ohe"
    ],
    handleInvalid='keep',
    dropLast=True)

# Combine all features into a single column.
feature_assembler = VectorAssembler(
    inputCols=ohe.getOutputCols() + [goldstein_scaler.getOutputCol()],
    outputCol="features")

# Index root_code labels.
label_indexer = StringIndexer(
    inputCol="root_code", outputCol="indexedLabel").setHandleInvalid('skip')

# Select a subset of important features.
feature_selector = ChiSqSelector(
    percentile=0.5,
    featuresCol=feature_assembler.getOutputCol(),
    labelCol=label_indexer.getOutputCol(),
    outputCol="selected_features")
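# Hedged sketch (assumed, not from the source): wire the stages above into a
# Pipeline with a classifier. `indexer_stages` stands in for the truncated
# per-column StringIndexer list, `train_df` is hypothetical, and
# RandomForestClassifier is a placeholder model choice.
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol=feature_selector.getOutputCol(),
                            labelCol=label_indexer.getOutputCol(),
                            numTrees=50)
pipeline = Pipeline(stages=indexer_stages + [
    ohe, goldstein_scaler, feature_assembler, label_indexer,
    feature_selector, rf])
model = pipeline.fit(train_df)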
# Numeric feature columns are everything that is not string-typed.
numColumns = [
    item[0] for item in df.dtypes if not item[1].startswith('string')
]
catColVectors = [c + '_vector' for c in catColumns]

# Change categorical values into numeric indices, then one-hot encode them.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in catColumns
]
encoder = OneHotEncoderEstimator(
    inputCols=[c + "_index" for c in catColumns],
    outputCols=catColVectors)
assembler = VectorAssembler(inputCols=encoder.getOutputCols() + numColumns,
                            outputCol="features")
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")

pipeline = Pipeline(stages=indexers + [label_stringIdx, encoder, assembler])
encoded_df = pipeline.fit(df).transform(df)

# `cols` is the original column list, defined elsewhere.
selectedCols = ['label', 'features'] + cols
dataset = encoded_df.select(selectedCols)

# Randomly split data into training and test sets; set the seed for
# reproducibility.
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())
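# Hedged continuation (assumed, not from the source): fit a logistic
# regression on the 70/30 split produced above and inspect a few predictions.
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
lrModel = lr.fit(trainingData)
lrModel.transform(testData).select("label", "prediction", "probability").show(5)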
# BUILD FEATURES
# Pred windows 1, 5, 10, 20, 60
stockcreator = StockFeatureCreator(lags=10,
                                   pred_window=1,
                                   ma_windows=[3, 5, 10, 20, 50, 80, 100],
                                   tickers=["aapl", "sp500"])
stockcreator.build()

# ONE-HOT ENCODE CATEGORICAL DATE FEATURES
inputs = [s for s in time_cols if s not in ["Date"]]
encoder = OneHotEncoderEstimator(inputCols=inputs,
                                 outputCols=[s + "_Vec" for s in inputs])

# VECTOR ASSEMBLER
# NOTE: stockcreator.getOutputCols() was observed returning an empty list.
features = stockcreator.getOutputCols() + encoder.getOutputCols()
# Keep only model inputs: drop the label and the raw (un-encoded) time columns.
features = [
    col for col in features if col != "label" and col not in time_cols
]
featureassembler = VectorAssembler(inputCols=features, outputCol="features")

# SPLIT: time-ordered 90/10 split via percent rank over the Date ordering.
finalized_data = df.withColumn(
    "rank", percent_rank().over(Window.partitionBy().orderBy("Date")))
train_data = finalized_data.where("rank <= .9").drop("rank")
test_data = finalized_data.where("rank > .9").drop("rank")

# FEATURE SELECTION (call truncated in the source)
selector = ChiSqSelector(numTopFeatures=300,
                         featuresCol="features",
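# The ChiSqSelector call above is cut off in the source; a hedged completion
# and usage sketch follows (labelCol/outputCol are assumed arguments, and the
# "label" column is hypothetical).
selector = ChiSqSelector(numTopFeatures=300,
                         featuresCol="features",
                         labelCol="label",
                         outputCol="selected_features")
assembled_train = featureassembler.transform(train_data)
assembled_test = featureassembler.transform(test_data)
selector_model = selector.fit(assembled_train)
train_selected = selector_model.transform(assembled_train)
test_selected = selector_model.transform(assembled_test)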