def tourTimePredict(self, data):
    '''
    Predict the tour time given data of boarding coordinate, getting off
    coordinate and boarding time.

    :param data: [upCoord, downCoord, upTime]. Each coordinate is indexed as
        (longitude, latitude), the same order tourTimePredict_train uses when
        it builds manhLon / manhLat.
    :return: prediction of the tour time produced by self.model.
    '''
    upCoord, downCoord, upTime = data
    # Grid-cell id computed with the same formula as the training features.
    upCoord_idx = int((round(upCoord[0], 1) - 120.5) * 10 * 12 +
                      (round(upCoord[1], 1) - 30.4) * 10)
    manhLon = abs(upCoord[0] - downCoord[0])
    manhLat = abs(upCoord[1] - downCoord[1])

    # A single row (one tuple) with four named columns.
    test = self.spark.createDataFrame(
        [(upTime, upCoord_idx, manhLon, manhLat)],
        ['upTime', 'upCoord', 'manhLon', 'manhLat'])

    # NOTE(review): the encoders are applied to a one-row frame; confirm that
    # the resulting one-hot vector sizes match those self.model was fitted on.
    encoder_time = OneHotEncoder(inputCol='upTime',
                                 outputCol='upTime_onehot',
                                 dropLast=False)
    encoder_coord = OneHotEncoder(inputCol='upCoord',
                                  outputCol='upCoord_onehot',
                                  dropLast=False)
    assembler = VectorAssembler(
        inputCols=['upTime_onehot', 'upCoord_onehot', 'manhLon', 'manhLat'],
        outputCol='features')

    testDF = assembler.transform(
        encoder_coord.transform(encoder_time.transform(test)))
    return self.model.transform(testDF).head().prediction
def get_kdd_data(csv, col_names_path='./col_names.txt'):
    """
    Load a KDD-format csv into a Spark DataFrame and one-hot encode its
    string columns.

    :param csv: filepath of the csv data; it is read without a header row and
        the columns are expected in the standard order.
    :param col_names_path: text file holding the column names, loaded with
        np.loadtxt. Defaults to './col_names.txt'.
    :return: Spark DataFrame in which 'protocol_type', 'service' and 'flag'
        are replaced by '<name>_vec' one-hot columns.
    """
    col_names = np.loadtxt(col_names_path, dtype='str')
    data = pd.read_csv(csv, header=None, names=col_names)
    # TODO: let spark read the csv directly, without the pandas step
    df = spark.createDataFrame(data)

    # one-hot encode the string columns
    for label in ('protocol_type', 'service', 'flag'):
        index_col = label + "_index"
        indexed = StringIndexer(inputCol=label,
                                outputCol=index_col).fit(df).transform(df)
        encoder = OneHotEncoder(inputCol=index_col, outputCol=label + "_vec")
        # Encode once, then discard the raw and intermediate columns.
        df = encoder.transform(indexed).drop(index_col).drop(label)
    return df
def encoding(df):
    """
    Add one-hot vector columns for 'salutation', 'paymenttype' and 'model'.

    Each source column gets a sibling column named '<source>_Vec'; the
    encoders are applied in that order and the resulting frame is returned.
    """
    for source in ('salutation', 'paymenttype', 'model'):
        one_hot = OneHotEncoder(inputCol=source, outputCol=source + '_Vec')
        df = one_hot.transform(df)
    return df
def one_hot_encode(dfFull):
    """
    Index and one-hot encode the user, item, category, offer and country
    columns, then drop the raw and index columns.

    :param dfFull: input DataFrame; fillna('0') is applied before encoding.
    :return: DataFrame holding the *Vec one-hot columns in place of the
        encoded source columns.
    """
    # (source column, index column, one-hot column) in processing order
    plan = (
        ("userid", "UserIdx", "UserVec"),
        ("product", "ItemIdx", "ItemVec"),
        ("category", "CategoryCleanedIdx", "CategoryCleanedVec"),
        ("offerid", "OfferIdx", "OfferVec"),
        ("countrycode", "CcIdx", "CcVec"),
    )

    current = dfFull.fillna('0')
    for source_col, index_col, vector_col in plan:
        indexer = StringIndexer(inputCol=source_col,
                                outputCol=index_col,
                                handleInvalid='keep')
        with_index = indexer.fit(current).transform(current)
        one_hot = OneHotEncoder(inputCol=index_col, outputCol=vector_col)
        current = one_hot.transform(with_index)

    # Only the vector columns survive; sources and indices are removed.
    for obsolete in ('userid', 'UserIdx',
                     'product', 'ItemIdx',
                     'offerid', 'OfferIdx',
                     'countrycode', 'CcIdx',
                     'category', 'CategoryCleanedIdx'):
        current = current.drop(obsolete)
    return current
def onehot_encoder(df, features_categorical_indexed):
    """
    One-hot encode every listed index column (dropLast=True).

    For each name in features_categorical_indexed a '<name>_Vec' column is
    appended. An empty list returns df unchanged.
    """
    result = df
    for column in features_categorical_indexed:
        vector_column = column + "_Vec"
        result = OneHotEncoder(dropLast=True,
                               inputCol=column,
                               outputCol=vector_column).transform(result)
    return result
def __clean_data(self, df, is_fraud="isfraud"):
    """
    Turn the raw frame into a two-column ('label', 'features') frame.

    :param df: input DataFrame containing 'paysim_id', 'nameorig',
        'namedest', 'type' and the column named by is_fraud.
    :param is_fraud: name of the column that is string-indexed into 'label'.
    :return: DataFrame with only 'label' and the assembled 'features' vector.
    """
    excluded = (is_fraud, 'label')

    # Remove the columns that are not used
    frame = df.drop('paysim_id', 'nameorig', 'namedest')

    # String indexing of the transaction type
    type_indexer = StringIndexer(inputCol="type",
                                 outputCol="type_numeric").fit(frame)
    frame = type_indexer.transform(frame)
    frame = frame.drop(frame["type"])

    # One-hot encoding of the indexed type
    frame = OneHotEncoder(inputCol="type_numeric",
                          outputCol="type_vector").transform(frame)
    frame = frame.drop("type_numeric")

    # Label encoding
    label_indexer = StringIndexer(inputCol=is_fraud,
                                  outputCol='label').fit(frame)
    frame = label_indexer.transform(frame).drop(is_fraud)

    # Every remaining column except the label goes into the feature vector
    feature_inputs = [name for name in frame.columns if name not in excluded]
    frame = VectorAssembler(inputCols=feature_inputs,
                            outputCol='features').transform(frame)

    # dataframe in the format expected downstream
    return frame.select('label', 'features')
def oneHotEncodeColumns(df, cols):
    """
    Replace each column in cols by its one-hot vector (dropLast=False).

    The encoded column keeps the original column name: the encoder writes to
    '<name>-onehot', the source column is dropped, and the new column is
    renamed back. An empty cols returns df unchanged.
    """
    result = df
    for name in cols:
        encoded_name = name + "-onehot"
        encoder = OneHotEncoder(inputCol=name,
                                outputCol=encoded_name,
                                dropLast=False)
        result = encoder.transform(result).drop(name)
        result = result.withColumnRenamed(encoded_name, name)
    return result
def add_categorical(df, train=False):
    """
    Add 'origin_index' and 'origin_onehot' columns derived from 'ORIGIN'.

    :param df: input DataFrame with an 'ORIGIN' column.
    :param train: when True, fit a new StringIndexer on df and remember the
        fitted model; when False, reuse the model fitted by the most recent
        train=True call so both frames share one index mapping.
    :return: df with the index and one-hot columns appended.
    :raises RuntimeError: if called with train=False before any train=True
        call (previously this path failed on an unbound local).
    """
    # Fail early, before any Spark work, when there is nothing to reuse.
    if not train and getattr(add_categorical, '_index_model', None) is None:
        raise RuntimeError(
            'add_categorical must be called with train=True before it can '
            'be used with train=False')

    from pyspark.ml.feature import OneHotEncoder, StringIndexer

    if train:
        indexer = StringIndexer(inputCol='ORIGIN', outputCol='origin_index')
        # Cached on the function object so later non-training calls reuse it.
        add_categorical._index_model = indexer.fit(df)

    indexed = add_categorical._index_model.transform(df)
    encoder = OneHotEncoder(inputCol='origin_index',
                            outputCol='origin_onehot')
    return encoder.transform(indexed)
def oneHotEncodeColumns(df, cols):
    """
    Replace each column in cols by its one-hot vector (dropLast=False).

    The encoder writes to '<name>-onehot', the source column is dropped and
    the encoded column is renamed back to the original name.
    """
    from pyspark.ml.feature import OneHotEncoder

    result = df
    for name in cols:
        encoded_name = name + "-onehot"
        encoder = OneHotEncoder(inputCol=name,
                                outputCol=encoded_name,
                                dropLast=False)
        result = encoder.transform(result).drop(name)
        result = result.withColumnRenamed(encoded_name, name)
    return result
def tourTimePredict_train(self):
    '''
    Train a ridge regression model which could predict the tour time given
    boarding coordinate, getting off coordinate and boarding time.

    Reads self.upRecord, stores the fitted model in self.model and prints
    the training, validation and test metrics.
    '''
    def inside_area(rec):
        # keep records whose boarding coordinate lies in the bounding box
        lon, lat = rec[1][0][0], rec[1][0][1]
        return 120.5 <= lon <= 122.1 and 30.4 <= lat <= 31.5

    def to_row(rec):
        up_coord, up_time = rec[1][0], rec[1][1]
        elapsed, down_coord = rec[1][2], rec[1][3]
        # upCoord is a grid-cell id built from the coordinate rounded to one
        # decimal: longitude offset from 120.5 weighted by 12, plus latitude
        # offset from 30.4.
        cell = int((round(up_coord[0], 1) - 120.5) * 10 * 12 +
                   (round(up_coord[1], 1) - 30.4) * 10)
        return Row(upCoord=cell,
                   upTime=up_time.hour,
                   # duration expressed in minutes
                   duration=elapsed.days * 60 * 24 + elapsed.seconds/60,
                   manhLon=abs(down_coord[0] - up_coord[0]),
                   manhLat=abs(down_coord[1] - up_coord[1]))

    records = self.upRecord.filter(inside_area).map(to_row)
    onRecordDF = self.spark.createDataFrame(records)

    # generate feature vector
    time_encoder = OneHotEncoder(inputCol='upTime',
                                 outputCol='upTime_onehot',
                                 dropLast=False)
    coord_encoder = OneHotEncoder(inputCol='upCoord',
                                  outputCol='upCoord_onehot',
                                  dropLast=False)
    assembler = VectorAssembler(
        inputCols=['upTime_onehot', 'upCoord_onehot', 'manhLon', 'manhLat'],
        outputCol='features')
    with_time = time_encoder.transform(onRecordDF)
    with_coord = coord_encoder.transform(with_time)
    onRecordDF = assembler.transform(with_coord)

    trainSet, validSet, testSet = onRecordDF.randomSplit([7., 1., 2.])

    # train model
    regression = LinearRegression(labelCol='duration',
                                  regParam=0.01,
                                  maxIter=100)
    self.model = regression.fit(trainSet)
    summary = self.model.summary
    print('RMSE of training:', summary.rootMeanSquaredError, 'min')
    print('Adjusted R2 of training:', summary.r2adj)

    # evaluation
    evaluator = RegressionEvaluator(labelCol='duration')
    valid_predictions = self.model.transform(validSet)
    print('RMSE on validation set:',
          evaluator.evaluate(valid_predictions,
                             {evaluator.metricName: 'rmse'}),
          'min')
    test_predictions = self.model.transform(testSet)
    print('RMSE on test set:',
          evaluator.evaluate(test_predictions,
                             {evaluator.metricName: 'rmse'}),
          'min')
def onehotencode(df, s1, s2, temp):
    """
    String-index column s1 into temp, one-hot encode temp into s2, show the
    s2 column and return the encoded DataFrame.
    """
    from pyspark.ml.feature import OneHotEncoder, StringIndexer

    indexer_model = StringIndexer(inputCol=s1, outputCol=temp).fit(df)
    with_index = indexer_model.transform(df)
    result = OneHotEncoder(inputCol=temp, outputCol=s2).transform(with_index)
    result.select(s2).show()
    return result
def events(df, column_name):
    """
    Append '<column_name>I' (string index) and '<column_name>V' (one-hot
    vector) columns to df and return the result.
    """
    index_col = column_name + "I"
    vector_col = column_name + "V"

    fitted = StringIndexer(inputCol=column_name, outputCol=index_col).fit(df)
    with_index = fitted.transform(df)
    one_hot = OneHotEncoder(inputCol=index_col, outputCol=vector_col)
    return one_hot.transform(with_index)
def create_catgeory_vars(dataset, field_name):
    """
    Append '<field_name>Index' and '<field_name>Vec' columns to dataset.

    The index column comes from a StringIndexer fitted on dataset; the vector
    column is its one-hot encoding with dropLast=True.
    """
    index_name = field_name + "Index"
    vector_name = field_name + "Vec"

    indexer = StringIndexer(inputCol=field_name, outputCol=index_name)
    with_index = indexer.fit(dataset).transform(dataset)
    encoder = OneHotEncoder(dropLast=True,
                            inputCol=index_name,
                            outputCol=vector_name)
    return encoder.transform(with_index)
def onehot_encoder_usecase():
    """
    Demo: build a small two-column frame of category indices, one-hot encode
    'categoryIndex2' into 'categoryVec2' and show the result.
    """
    session = getSparkSession()
    rows = [
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (0.0, 1.0),
        (2.0, 0.0),
    ]
    frame = session.createDataFrame(rows,
                                    ["categoryIndex1", "categoryIndex2"])
    one_hot = OneHotEncoder(inputCol="categoryIndex2",
                            outputCol="categoryVec2")
    one_hot.transform(frame).show()
def _transform(self, df):
    """
    One-hot encode the 'index_*' columns that have more than two distinct
    values.

    Each selected column is replaced by an 'ohe_<column>' vector column.
    The distinct count is computed per candidate column.
    """
    selected = [
        name for name in df.columns
        if name.startswith('index_')
        and df.select(name).distinct().count() > 2
    ]
    for name in selected:
        encoder = OneHotEncoder(inputCol=name, outputCol='ohe_' + name)
        df = encoder.transform(df).drop(name)
    return df
def update_columns(column_list, main_df):
    """
    For every column in column_list append '<column>_Index' (string index)
    and '<column>_Vec' (one-hot vector) to main_df.

    An empty column_list returns main_df unchanged.
    """
    frame = main_df
    for column_name in column_list:
        index_name = f'{column_name}_Index'
        vector_name = f'{column_name}_Vec'
        indexer = StringIndexer(inputCol=column_name, outputCol=index_name)
        with_index = indexer.fit(frame).transform(frame)
        frame = OneHotEncoder(inputCol=index_name,
                              outputCol=vector_name).transform(with_index)
    return frame
def hot_encoding_var(df, feature_names):
    """
    One-hot encode each column in feature_names into '<name>_cd', printing
    progress for every feature, and return the extended DataFrame.
    """
    from pyspark.ml.feature import OneHotEncoder

    for position, feature in enumerate(feature_names, start=1):
        print("working on feature " + str(position) + " of " + str(len(feature_names)))
        print("one hot encoding " + str(feature) + " feature...")
        one_hot = OneHotEncoder(inputCol=feature,
                                outputCol=str(feature) + "_cd")
        df = one_hot.transform(df)
    return df
def ColToDummiesOHE(df, column):
    """
    Append '<column>Index' (string index) and '<column>Vec' (one-hot vector)
    columns to df and return the result.
    """
    index_name = column + "Index"
    fitted_indexer = StringIndexer(inputCol=column,
                                   outputCol=index_name).fit(df)
    with_index = fitted_indexer.transform(df)
    one_hot = OneHotEncoder(inputCol=index_name, outputCol=column + "Vec")
    return one_hot.transform(with_index)
def oneHotEncoding(self, df, input_col):
    """
    One-hot encode input_col and return a cached frame with df.id and a
    'num_<input_col>' column.

    input_col is string-indexed into 'indexed', one-hot encoded into
    'features' (dropLast=False), and each vector is then passed through a
    udf that converts it to a dense list. The udf is created without an
    explicit return type.
    """
    indexer_model = StringIndexer(inputCol=input_col,
                                  outputCol="indexed").fit(df)
    with_index = indexer_model.transform(df)

    one_hot = OneHotEncoder(inputCol="indexed",
                            outputCol="features",
                            dropLast=False)
    vectors = one_hot.transform(with_index).select(df.id, 'features').cache()

    to_dense_list = udf(lambda line: Vectors.dense(line).tolist())
    converted = vectors.select(
        df.id,
        to_dense_list(vectors.features).alias("num_" + input_col))
    return converted.cache()
def string_index_fun(col_name):
    """
    Index and one-hot encode col_name on the module-level titanic_df.

    Returns titanic_df with '<col_name>_Index' and '<col_name>_Vector'
    columns appended.
    """
    index_name = col_name + '_Index'
    vector_name = col_name + '_Vector'

    fitted = StringIndexer(inputCol=col_name,
                           outputCol=index_name).fit(titanic_df)
    with_index = fitted.transform(titanic_df)

    # convert the index column into a vector column with the one-hot encoder
    one_hot = OneHotEncoder(inputCol=index_name, outputCol=vector_name)
    return one_hot.transform(with_index)
def one_hot_encoding(self, df, target):
    """
    Run label_encoding, then one-hot encode the 'index_*' columns of df that
    have more than two distinct values.

    Each selected column is replaced by an 'ohe_<column>' vector column.
    Prints a completion message and returns the resulting DataFrame.
    """
    # NOTE(review): the return value of label_encoding is discarded, so the
    # columns below are read from the df passed in — confirm this is intended.
    label_encoding(self, df, target)

    selected = [
        name for name in df.columns
        if name.startswith('index_')
        and df.select(name).distinct().count() > 2
    ]
    for name in selected:
        encoder = OneHotEncoder(inputCol=name, outputCol='ohe_' + name)
        df = encoder.transform(df).drop(name)

    print('One-Hot Encoding completed.')
    return df
def onehot_encoding_column(self, column_name):
    """
    Label-encode then one-hot encode column_name on self._data_frame.

    The one-hot column '<column_name>_ed_one_hot_encoded' is cast to string
    and the intermediate '<column_name>_ed_label_encoded' column is dropped.
    Updates and returns self._data_frame.
    """
    self._data_frame = self.label_encoding_column(column_name)

    label_col = column_name + "_ed_label_encoded"
    onehot_col = column_name + "_ed_one_hot_encoded"

    one_hot = OneHotEncoder(dropLast=False,
                            inputCol=label_col,
                            outputCol=onehot_col)
    self._data_frame = one_hot.transform(self._data_frame)

    as_text = self._data_frame[onehot_col].cast('string')
    self._data_frame = self._data_frame.withColumn(onehot_col, as_text)

    self._data_frame = self._data_frame.drop(label_col)
    return self._data_frame
def findSimillar():
    """
    Index the project category columns, cluster the projects with k-means for
    k in [2, 10) and pick the k with the best silhouette score.

    Reads the module-level `projects` DataFrame. Rows with a null in any of
    the four category columns are removed before indexing.
    """
    # Dealing with the server request
    # project_ID = request.args.get('project_ID', None)
    project_ID = 'afd99a01739ad5557b51b1ba0174e832'
    projects.createOrReplaceTempView('projects')

    cols = ["Project_Subject_Category_Tree",
            "Project_Subject_Subcategory_Tree",
            "Project_Grade_Level_Category",
            "Project_Resource_Category"]

    df = projects
    for name in cols:
        df = df.where(df[name].isNotNull())

    # String-index every category column into '<name>a'.
    colsa = []
    for name in cols:
        index_col = name + "a"
        df = StringIndexer(inputCol=name,
                           outputCol=index_col).fit(df).transform(df)
        colsa.append(index_col)

    # Accumulate the one-hot columns; transforming `df` afresh on every pass
    # kept only the last '<name>v' column.
    encoded = df
    for name in cols:
        encoder = OneHotEncoder(inputCol=name + "a", outputCol=name + "v")
        encoded = encoder.transform(encoded)

    # NOTE(review): the feature vector is assembled from the index columns,
    # not from the one-hot '<name>v' columns — confirm that this is intended.
    assembler = VectorAssembler(inputCols=colsa, outputCol="features")
    output = assembler.transform(encoded)

    kmax = 10  # optimal K happens at k=4
    silhouette = []
    evaluator = ClusteringEvaluator()
    for k in range(2, kmax):
        # Trains a k-means model.
        model = KMeans().setK(k).setSeed(1).fit(output)
        # Evaluate clustering by computing Silhouette score
        predictions = model.transform(output)
        silhouette.append([k, evaluator.evaluate(predictions)])

    # Best-scoring k as a plain int; on tied scores the smallest k wins
    # instead of failing on a multi-element index.
    k_optimal = max(silhouette, key=lambda pair: pair[1])[0]
    # NOTE(review): the estimator is configured but neither fitted nor
    # returned in the code visible here.
    kmeans = KMeans().setK(k_optimal).setSeed(1)
def cat2Num(self, df, indices):
    """
    One-hot encode the categorical entries of 'rawFeatures' and combine them
    with the remaining entries.

    :param df: DataFrame with 'id' and 'rawFeatures' columns.
    :param indices: positions inside rawFeatures holding categorical values.
    :return: DataFrame with 'id', 'rawFeatures' and 'features', where
        'features' is the one-hot vectors (in the order of indices) followed
        by the entries of rawFeatures that are not listed in indices.
    """
    def pick(raw_feature, position):
        # single entry of the raw feature list
        return raw_feature[position]

    def without(raw_feature, positions):
        # dense vector of the entries whose position is not in positions
        kept = [
            value for place, value in enumerate(raw_feature)
            if place not in positions
        ]
        return Vectors.dense(kept)

    # Extract, index and one-hot encode each categorical position in turn
    current = df
    for index in indices:
        cat_col = "cat_" + str(index)
        cat_index_col = "cat_index_" + str(index)

        pick_udf = udf(lambda x: pick(x, index), StringType())
        extracted = current.withColumn(cat_col, pick_udf("rawFeatures"))

        # string index
        indexer_model = StringIndexer(inputCol=cat_col,
                                      outputCol=cat_index_col).fit(extracted)
        with_index = indexer_model.transform(extracted)

        # one-hot encode
        one_hot = OneHotEncoder(inputCol=cat_index_col,
                                outputCol="cat_vector_" + str(index),
                                dropLast=False)
        current = one_hot.transform(with_index)

    # Continuous part: rawFeatures with the categorical positions removed
    without_udf = udf(lambda x: without(x, indices), VectorUDT())
    with_cont = current.withColumn("cont", without_udf("rawFeatures"))

    # Combine the one-hot vectors and the continuous part
    inputs = ["cat_vector_" + str(index) for index in indices]
    inputs.append("cont")
    assembler = VectorAssembler(inputCols=inputs, outputCol="features")
    return assembler.transform(with_cont) \
        .select("id", "rawFeatures", "features")
def K_means():
    """
    Cluster the projects on their one-hot encoded category columns.

    Reads the module-level `projects` DataFrame, removes rows with a null in
    any category column (as findSimillar does before indexing), string-indexes
    and one-hot encodes each column, assembles the vectors into 'features'
    and fits a k-means model with k = 2.

    :return: the fitted k-means model.
    """
    from pyspark.ml.feature import VectorAssembler

    knr = 2
    cols = ["Project_Subject_Category_Tree",
            "Project_Subject_Subcategory_Tree",
            "Project_Grade_Level_Category",
            "Project_Resource_Category"]

    df = projects.select(cols)
    for name in cols:
        df = df.where(df[name].isNotNull())

    # Accumulate every '<name>a' index column; previously only the last one
    # survived, so the encoders below referenced missing columns.
    for name in cols:
        indexer = StringIndexer(inputCol=name, outputCol=name + "a")
        df = indexer.fit(df).transform(df)

    vector_cols = []
    for name in cols:
        vector_col = name + "v"
        encoder = OneHotEncoder(inputCol=name + "a", outputCol=vector_col)
        df = encoder.transform(df)
        vector_cols.append(vector_col)

    # Assemble the one-hot vectors into the 'features' column that is fed
    # to k-means.
    assembler = VectorAssembler(inputCols=vector_cols, outputCol="features")
    encoded = assembler.transform(df)

    # Trains a k-means model.
    kmeans = KMeans().setK(knr).setSeed(1)
    model = kmeans.fit(encoded)
    return model
def indexAndEncode(processedData, features):
    """
    For each name in features append '<name>Index' (string index) and
    '<name>Vec' (one-hot vector, dropLast=False) to processedData.

    An empty features list returns processedData unchanged.
    """
    frame = processedData
    for feature in features:
        index_name = feature + "Index"
        # the indexer is fitted on the frame accumulated so far
        fitted = StringIndexer(inputCol=feature,
                               outputCol=index_name).fit(frame)
        with_index = fitted.transform(frame)
        one_hot = OneHotEncoder(dropLast=False,
                                inputCol=index_name,
                                outputCol=feature + "Vec")
        frame = one_hot.transform(with_index)
    return frame
def get_sdummies(sdf, dummy_columns, keep_top, replace_with='other'):
    """
    Group infrequent values of string columns, then index and one-hot encode
    the columns.

    For each column the values are ranked by count; values whose cumulative
    share of the rows stays within keep_top[i] are kept and every other value
    is replaced by replace_with. The column is then string-indexed into
    'IDX_<col>' and one-hot encoded into 'ONEHOT_<col>' (dropLast=True).

    :param sdf: A pyspark.sql.dataframe.DataFrame
    :param dummy_columns: String columns that need to be indexed
    :param keep_top: One cumulative-share threshold per column,
        e.g. [1, 0.8, 0.8]
    :param replace_with: String used as replacement for the values that are
        grouped together.
    :return: sdf with the 'ONEHOT_*' columns added and the intermediate
        'IDX_*' columns dropped.
    """
    row_total = sdf.count()

    for position, string_col in enumerate(dummy_columns):
        # Value counts, most frequent first
        counts = sdf.groupBy(string_col).count().orderBy('count',
                                                         ascending=False)
        # NOTE(review): the window itself carries no ordering; the running
        # sum appears to rely on the orderBy above — confirm.
        running = Window.partitionBy().orderBy().rowsBetween(-sys.maxsize, 0)
        counts = counts.withColumn("cumsum", F.sum("count").over(running))

        # Values inside the requested cumulative share
        shares = counts.withColumn("cumperc", counts['cumsum'] / row_total)
        top_values = shares.filter(col('cumperc') <= keep_top[position])
        keep_list = top_values.select(string_col).rdd.flatMap(
            lambda x: x).collect()

        kept_or_other = when(col(string_col).isin(keep_list),
                             col(string_col)).otherwise(replace_with)
        sdf = sdf.withColumn(string_col, kept_or_other)

        # Apply string indexer
        index_name = "IDX_" + string_col
        indexing = Pipeline(stages=[
            StringIndexer(inputCol=string_col, outputCol=index_name)
        ])
        sdf = indexing.fit(sdf).transform(sdf)

        one_hot = OneHotEncoder(inputCol=index_name,
                                outputCol="ONEHOT_" + string_col)
        # drop the last category so the dummy columns stay independent
        one_hot.setDropLast(True)
        sdf = one_hot.transform(sdf)

    # Drop intermediate columns
    intermediate = ["IDX_" + x for x in dummy_columns]
    return sdf.drop(*intermediate)
def one_hot_encode(column, dataframe):
    '''
    Returns the dataframe with an additional '<column>_one_hot' column
    holding the one-hot encoding of the given column.
    '''
    from pyspark.ml.feature import OneHotEncoder, StringIndexer

    # The column is string-indexed into a temporary 'categoryIndex' column
    indexer_model = StringIndexer(inputCol=column,
                                  outputCol='categoryIndex').fit(dataframe)
    with_index = indexer_model.transform(dataframe)

    # Encode the index, then discard the temporary column
    one_hot = OneHotEncoder(inputCol='categoryIndex',
                            outputCol=column+'_one_hot')
    return one_hot.transform(with_index).drop('categoryIndex')
def category_to_vactor(idx, df):
    """
    Index and one-hot encode the columns at the given schema positions.

    :param idx: iterable of column positions in df.schema.names.
    :param df: input DataFrame.
    :return: tuple (encoded DataFrame, list of fitted StringIndexer models in
        the order of idx). Each processed column <c> gains 'Index_<c>' and
        'Vector_<c>' (dropLast=False). With an empty idx the input frame is
        returned unchanged alongside an empty list.
    """
    cat_dist = []
    for i in idx:
        cat = df.schema.names[i]
        index_col = "Index_" + cat

        # StringToIndex
        categoryTransformer = StringIndexer(inputCol=cat,
                                            outputCol=index_col).fit(df)
        indexed = categoryTransformer.transform(df)

        # OneHotEncoder
        encoder = OneHotEncoder(dropLast=False,
                                inputCol=index_col,
                                outputCol="Vector_" + cat)
        df = encoder.transform(indexed)
        cat_dist.append(categoryTransformer)

    # `df` is always bound, so an empty idx no longer hits an undefined name.
    return (df, cat_dist)
def one_hot_encoding(self, df):
    '''
    Purpose: Encode data using one hot encoding
    Inputs : Data(spark dataframe)
    Output : Encoded data

    Only 'index_*' columns with more than two distinct values are encoded;
    each one is replaced by an 'ohe_<column>' vector column.
    '''
    selected = [
        name for name in df.columns
        if name.startswith('index_')
        and df.select(name).distinct().count() > 2
    ]
    for name in selected:
        encoder = OneHotEncoder(inputCol=name, outputCol='ohe_' + name)
        df = encoder.transform(df).drop(name)

    print('One-Hot Encoding completed.')
    return df
def oneHot():
    """
    Demo: string-index and one-hot encode the 'category' column of a small
    in-memory frame and show the result.
    """
    session = SparkSession.builder \
        .appName("OneHotEncoderExample") \
        .getOrCreate()

    rows = [
        (0, "a"),
        (1, "b"),
        (2, "c"),
        (3, "a"),
        (4, "a"),
        (5, "b"),
        (6, "c"),
    ]
    frame = session.createDataFrame(rows, ["id", "category"])

    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    with_index = indexer.fit(frame).transform(frame)

    one_hot = OneHotEncoder(inputCol="categoryIndex",
                            outputCol="categoryVec")
    one_hot.transform(with_index).show()
def main(train_data_folder, model_path):
    """
    Train a GBT regressor on the parquet data in train_data_folder, save the
    fitted pipeline to model_path and print the R2 score on the training data.

    :param train_data_folder: location of the parquet training data.
    :param model_path: where the fitted pipeline model is written
        (an existing model is overwritten).
    """
    # Starting session
    session = SparkSession.builder.appName('BigDataML').getOrCreate()
    session.sparkContext.setLogLevel("ERROR")

    # Loading data; rows with any missing value are discarded
    data = session.read.parquet(train_data_folder).dropna(how='any')

    station_encoder = OneHotEncoder(inputCol="station_index",
                                    outputCol="station_vector")
    data = station_encoder.transform(data)

    feature_inputs = ['hour', 'dayofyear', 'month', 'air_temp',
                      'wind_speed', 'visibility', 'weather_index',
                      "station_vector"]
    assembler = VectorAssembler(inputCols=feature_inputs,
                                outputCol="features")
    train_data = assembler.transform(data)

    # The pipeline holds a single stage: the GBT regressor.
    gbt = GBTRegressor(featuresCol="features", maxIter=10)
    model = Pipeline(stages=[gbt]).fit(train_data)
    model.write().overwrite().save(model_path)

    # Make predictions on the training data and display example rows.
    predictions = model.transform(train_data)
    predictions.select("prediction", "label", "features").show(5)

    # The evaluator is configured for the r2 metric.
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="r2")
    r2_score = evaluator.evaluate(predictions)
    print("R2 on train data = %g" % r2_score)

    fitted_gbt = model.stages[0]
    print(fitted_gbt)  # summary only
def create_category_vars(dataset, field_name):
    """
    Append '<field_name>Index' and '<field_name>Vec' columns to dataset.

    :param dataset: DataFrame to index and encode. The indexer is fitted on
        and applied to this argument, so successive calls can be chained on
        the returned frame.
    :param field_name: name of the categorical column.
    :return: dataset with the string index and its one-hot vector
        (dropLast=True) added.
    """
    # Column names carrying the suffixes "Index" and "Vec"
    Index_col = field_name + "Index"
    Column_vec = field_name + "Vec"

    # Map each value of the variable to its index
    col_stringIndexer = StringIndexer(inputCol=field_name,
                                      outputCol=Index_col)
    model = col_stringIndexer.fit(dataset)
    idx_data = model.transform(dataset)

    # Build the vector of values from the indexes returned by StringIndexer
    encoder = OneHotEncoder(dropLast=True,
                            inputCol=Index_col,
                            outputCol=Column_vec)
    return encoder.transform(idx_data)
# MAGIC Here, we will use a combination of [StringIndexer](http://spark.apache.org/docs/latest/ml-features.html#stringindexer) and [OneHotEncoder](http://spark.apache.org/docs/latest/ml-features.html#onehotencoder) to convert the categorical variables. The OneHotEncoder will return a [SparseVector](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseVector).

# COMMAND ----------

###One-Hot Encoding
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

categoricalColumns = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
    model = stringIndexer.fit(dataset)
    indexed = model.transform(dataset)
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
    encoded = encoder.transform(indexed)
    # Carry the new columns over to the next iteration
    dataset = encoded

# Call form of print: valid on Python 3 and prints the same on Python 2.
print(dataset.take(1))

# COMMAND ----------

# MAGIC %md
# MAGIC The above code basically indexes each categorical column using the StringIndexer, and then converts the indexed categories into one-hot encoded variables. The resulting output has the binary vectors appended to the end of each row.

# COMMAND ----------

# MAGIC %md
# MAGIC We use the StringIndexer() again here to encode our labels to label indices

# COMMAND ----------
def initialize(self, do_scaling=True, do_onehot=True):
    """Reads the dataset, initializes class members.

    features_df: Original DataFrame as read from the features_file.
    train_df: A DataFrame with columns Lat, Lon, Pickup_Count and
        vector columns Features & ScaledFeatures.
        Contains only data before 2015.
    test_df: As train_df, but only containing data from 2015-01-01
        00:00:00 onwards.
    districts_with_counts: A DataFrame with all districts and their counts.

    :param do_scaling: when True, SCALE_COLUMNS are assembled and
        standardised into 'ScaledFeatures' and excluded as raw features.
    :param do_onehot: when True, ONE_HOT_COLUMNS present in the data are
        one-hot encoded into '<column>_Vector' and excluded as raw features;
        when False, categorical_features_info is filled instead.
    """
    # Read feature dataframe
    self.features_df = self.sql_context.read.parquet(self.features_file).cache()

    # Work on a copy: the in-place `+=` below must not grow the shared
    # EXCLUDE_COLUMNS list on every call.
    exclude_columns = list(self.EXCLUDE_COLUMNS)

    # Scale features
    if do_scaling:
        assembler = VectorAssembler(inputCols=self.SCALE_COLUMNS,
                                    outputCol='FeaturesToScale')
        self.features_df = assembler.transform(self.features_df)
        scaler = StandardScaler(inputCol='FeaturesToScale',
                                outputCol='ScaledFeatures',
                                withStd=True, withMean=False)
        self.features_df = scaler.fit(self.features_df).transform(self.features_df)
        exclude_columns += list(self.SCALE_COLUMNS) + ['FeaturesToScale']

    # Adapt categorical features that do not have a value range of
    # [0, numCategories)
    for column in ['Day', 'Month', 'Day_Of_Year']:
        if column in self.features_df.columns:
            self.features_df = self.features_df.withColumn(
                column, self.features_df[column] - 1)

    # Encode categorical features using one-hot encoding
    if do_onehot:
        for column in self.ONE_HOT_COLUMNS:
            if column in self.features_df.columns:
                self.features_df = self.features_df.withColumn(
                    column, self.features_df[column].cast(DoubleType()))
                encoder = OneHotEncoder(inputCol=column,
                                        outputCol='%s_Vector' % column,
                                        dropLast=False)
                self.features_df = encoder.transform(self.features_df)
        exclude_columns += list(self.ONE_HOT_COLUMNS)

    # Vectorize features
    feature_columns = [column for column in self.features_df.columns
                       if column not in exclude_columns]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='Features')
    self.features_df = assembler.transform(self.features_df)

    # Set number of distinct values for categorical features
    # (identified by index)
    self.categorical_features_info = {}
    if not do_onehot:
        self.categorical_features_info = {
            position: self.CATEGORY_VALUES_COUNT[name]
            for position, name in enumerate(feature_columns)
            if name in self.CATEGORY_VALUES_COUNT
        }

    # Split into train and test data. `>=` keeps rows stamped exactly at the
    # split instant in the test set instead of dropping them from both sets.
    split_date = datetime(2015, 1, 1)
    self.train_df = self.features_df.filter(self.features_df.Time < split_date).cache()
    self.test_df = self.features_df.filter(self.features_df.Time >= split_date).cache()

    # Compute Districts with counts
    self.districts_with_counts = self.features_df \
        .groupBy([self.features_df.Lat, self.features_df.Lon]) \
        .count()
(Vectors.dense(1, 8, 9),3) ]).toDF("features", "label") indxr = VectorIndexer()\ .setInputCol("features")\ .setOutputCol("idxed")\ .setMaxCategories(2) indxr.fit(idxIn).transform(idxIn).show() # COMMAND ---------- from pyspark.ml.feature import OneHotEncoder, StringIndexer lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd") colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color")) ohe = OneHotEncoder().setInputCol("colorInd") ohe.transform(colorLab).show() # COMMAND ---------- from pyspark.ml.feature import Tokenizer tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut") tokenized = tkn.transform(sales.select("Description")) tokenized.show(20, False) # COMMAND ---------- from pyspark.ml.feature import RegexTokenizer rt = RegexTokenizer()\ .setInputCol("Description")\
WHEN (pickup_hour >= 16 AND pickup_hour <= 19) THEN "PMRush" END as TrafficTimeBins FROM taxi_test """ taxi_df_test_with_newFeatures = sqlContext.sql(sqlStatement) ## CACHE DATA-FRAME IN MEMORY & MATERIALIZE DF IN MEMORY taxi_df_test_with_newFeatures.cache() taxi_df_test_with_newFeatures.count() ## INDEX AND ONE-HOT ENCODING stringIndexer = StringIndexer(inputCol="vendor_id", outputCol="vendorIndex") model = stringIndexer.fit(taxi_df_test_with_newFeatures) # Input data-frame is the cleaned one from above indexed = model.transform(taxi_df_test_with_newFeatures) encoder = OneHotEncoder(dropLast=False, inputCol="vendorIndex", outputCol="vendorVec") encoded1 = encoder.transform(indexed) stringIndexer = StringIndexer(inputCol="rate_code", outputCol="rateIndex") model = stringIndexer.fit(encoded1) indexed = model.transform(encoded1) encoder = OneHotEncoder(dropLast=False, inputCol="rateIndex", outputCol="rateVec") encoded2 = encoder.transform(indexed) stringIndexer = StringIndexer(inputCol="payment_type", outputCol="paymentIndex") model = stringIndexer.fit(encoded2) indexed = model.transform(encoded2) encoder = OneHotEncoder(dropLast=False, inputCol="paymentIndex", outputCol="paymentVec") encoded3 = encoder.transform(indexed) stringIndexer = StringIndexer(inputCol="TrafficTimeBins", outputCol="TrafficTimeBinsIndex") model = stringIndexer.fit(encoded3)
df_model=df_ORG # stringIndexer1 = StringIndexer(inputCol="Origin", outputCol="originIndex") # model_stringIndexer = stringIndexer1.fit(df_model) # indexedOrigin = model_stringIndexer.transform(df_model) # encoder1 = OneHotEncoder(dropLast=False, inputCol="originIndex", outputCol="originVec") # df_model = encoder1.transform(indexedOrigin) # In[ ]: stringIndexer2 = StringIndexer(inputCol="Dest", outputCol="destIndex") model_stringIndexer = stringIndexer2.fit(df_model) indexedDest = model_stringIndexer.transform(df_model) encoder2 = OneHotEncoder(dropLast=False, inputCol="destIndex", outputCol="destVec") df_model = encoder2.transform(indexedDest) # We use __labeled point__ to make local vectors associated with a label/response. In MLlib, labeled points are used in supervised learning algorithms and they are stored as doubles. For binary classification, a label should be either 0 (negative) or 1 (positive). # In[105]: assembler = VectorAssembler( inputCols = ['Year','Month','DayofMonth','DayOfWeek','Hour','Distance','destVec'], outputCol = "features") output = assembler.transform(df_model) airlineRDD=output.map(lambda row: LabeledPoint([0,1][row['DepDelayed']],row['features'])) # ### Preprocessing: Spliting dataset into train and test dtasets
False if r.attributes['Ambience'] is None else r.attributes['Ambience']['romantic'], False if r.attributes['Ambience'] is None else r.attributes['Ambience']['upscale'], False if r.attributes['Ambience'] is None else r.attributes['Ambience']['casual'], False if (r.attributes['Alcohol'] is None or r.attributes['Alcohol'] == 'none') else True, False if r.attributes['Take-out'] is None else r.attributes['Take-out']] ).toDF(clustering_columns) # drop row with null values lv_clustering_data = lv_clustering_data.dropna() #Neighborhood feature engineering stringIndexer = StringIndexer(inputCol="neighborhood", outputCol="neigh_index") lv_model = stringIndexer.fit(lv_clustering_data) lv_indexed = lv_model.transform(lv_clustering_data) encoder = OneHotEncoder(dropLast=False, inputCol="neigh_index", outputCol="neigh_vec") lv_encoded = encoder.transform(lv_indexed) #initial feature set # assembler = VectorAssembler( # inputCols=["stars", "price_range", "neigh_vec"], # outputCol="features_vec") #expanded feature set feature_columns = clustering_columns[2:] feature_columns.append("neigh_vec") assembler = VectorAssembler( inputCols=feature_columns, outputCol="features_vec") lv_assembled = assembler.transform(lv_encoded)