def replace_column_names(data, fro=".", to="_"):
    """
    Rename the columns of a data frame by substituting a character sequence.

    Every occurrence of ``fro`` in each column name is replaced with ``to``;
    the columns are then renamed one after another, in their original order,
    through ``withColumnRenamed``.

    :param data: the data frame whose columns should be renamed
    :param fro: sequence to look for inside each column name
    :param to: sequence that replaces every occurrence of ``fro``
    :return: a tuple made of the data frame carrying the new column names and
        the list of those new column names
    """
    current_names = data.columns
    new_cols = [name.replace(fro, to) for name in current_names]

    # Each rename yields a new frame, so thread the result through the loop.
    renamed = data
    for current, replacement in zip(current_names, new_cols):
        renamed = renamed.withColumnRenamed(current, replacement)

    return renamed, new_cols
# Load the training dataset stored locally in the container: a
# semicolon-delimited CSV with a header row, letting Spark infer column types.
data_train = spark.read.option("delimiter", ";").csv(
    "TrainingDataset.csv", header=True, inferSchema=True
)

# Strip double quotes from the CSV header names, if any are present.
old_column_name = data_train.schema.names
print(data_train.schema)
clean_column_name = [name.replace('"', "") for name in old_column_name]

# Apply the cleaned names to the training frame, one column at a time.
for original, cleaned in zip(old_column_name, clean_column_name):
    data_train = data_train.withColumnRenamed(original, cleaned)

# The same old -> clean mapping (taken from the training header) is applied
# to the test frame.
# NOTE(review): data_test is not assigned anywhere in the visible part of
# this script -- confirm it is loaded earlier, otherwise this raises NameError.
for original, cleaned in zip(old_column_name, clean_column_name):
    data_test = data_test.withColumnRenamed(original, cleaned)

print(data_train.schema)

# Drop rows whose quality equals 3; per the original author's note that
# class contains very little data.
data_train_new = data_train.filter(data_train["quality"] != "3")

# Every column except the "quality" label is used as a feature.
feature_cols = [col for col in data_train_new.columns if col != "quality"]

# Assemble the feature columns into a single vector column named "feature".
vect_assembler = VectorAssembler(inputCols=feature_cols, outputCol="feature")