示例#1
0
def replace_column_names(data, fro=".", to="_"):
    """
    Replace a character sequence in every column name of a data frame.

    Each column is renamed through ``data.withColumnRenamed``, one call per
    column and in column order, so ``data`` only needs to expose ``columns``
    and ``withColumnRenamed`` (presumably a Spark DataFrame).

    :param data: the data frame of which you want to rename the columns
    :param fro: sequence which you want to replace in the column names
    :param to: sequence that replaces every occurrence of ``fro``
    :return: a tuple ``(data, new_cols)`` holding the data frame with the new
      column names and the list of new column names, in column order
    """

    old_cols = data.columns
    new_cols = [name.replace(fro, to) for name in old_cols]

    # Pair old and new names directly instead of folding over indices with
    # ``reduce``: ``reduce`` is not a builtin on Python 3, so the plain loop
    # does not rely on a ``functools`` import elsewhere in the file.
    for old_name, new_name in zip(old_cols, new_cols):
        data = data.withColumnRenamed(old_name, new_name)

    return data, new_cols
示例#2
0
# Load the training dataset stored locally in the container: a
# semicolon-delimited CSV read with a header row and schema inference enabled.
training_reader = spark.read.option("delimiter", ";")
data_train = training_reader.csv('TrainingDataset.csv',
                                 header=True,
                                 inferSchema=True)

# Strip double quotes from the CSV header names, if any are present.
# The schema is printed before and after the renaming for comparison.
old_column_name = data_train.schema.names
print(data_train.schema)
clean_column_name = [header.replace('"', '') for header in old_column_name]

# Rename the training columns one at a time, pairing each old name with its
# cleaned counterpart (no index bookkeeping, no dependency on ``reduce``).
for original_name, cleaned_name in zip(old_column_name, clean_column_name):
    data_train = data_train.withColumnRenamed(original_name, cleaned_name)

# The test frame is renamed using the header names taken from the training
# frame.
# NOTE(review): data_test is not defined in the visible part of this script;
# confirm it is loaded before this point.
for original_name, cleaned_name in zip(old_column_name, clean_column_name):
    data_test = data_test.withColumnRenamed(original_name, cleaned_name)
print(data_train.schema)

# Exclude rows whose quality equals 3, since that value has very little data
keep_row = data_train['quality'] != "3"
data_train_new = data_train.filter(keep_row)

# Every column of the filtered training data except quality is a feature
feature_cols = [col_name for col_name in data_train_new.columns
                if col_name != "quality"]

# Assembler configured to read the feature columns and write its output to
# the "feature" column
vect_assembler = VectorAssembler(outputCol="feature", inputCols=feature_cols)