Python reduce示例

编程语言: Python

命名空间/包名称: pyspark.rdd

方法/功能: reduce

hotexamples.com的示例: 2

Python reduce - 共找到 2 个示例。以下是从开源项目中提取的、评价最高的 pyspark.rdd.reduce 真实 Python 示例。您可以为示例评分，帮助我们提高示例质量。

示例#1

显示文件

def replace_column_names(data, fro=".", to="_"):
    """
    Replace a character sequence in every column name of a data frame.

    Each column name has all occurrences of ``fro`` replaced by ``to``; the
    columns are then renamed one at a time via ``data.withColumnRenamed``.

    :param data: the data frame of which you want to rename the columns; it
      must expose ``columns`` and ``withColumnRenamed(old, new)``
    :param fro: sequence which you want to replace in the column names
    :param to: sequence that replaces every occurrence of ``fro``
    :return: returns a tuple consisting of the data frame with new column
      names as well as the list of new column names
    """
    # Imported locally so the function is self-contained on Python 3, where
    # ``reduce`` is no longer a builtin.
    from functools import reduce

    old_cols = data.columns
    new_cols = [name.replace(fro, to) for name in old_cols]

    # Fold over (old, new) name pairs: every step returns a new frame with
    # one more column renamed, which becomes the input of the next step.
    data = reduce(
        lambda frame, names: frame.withColumnRenamed(*names),
        zip(old_cols, new_cols), data)

    return data, new_cols

示例#2

显示文件

# Read the training dataset (a semicolon-delimited CSV stored locally in the
# container); the header row supplies the column names and the column types
# are inferred.
data_train = spark.read.option("delimiter", ";").csv('TrainingDataset.csv',
                                                     header=True,
                                                     inferSchema=True)

# Strip double quotes from the CSV header names, if any are present.
old_column_name = data_train.schema.names
print(data_train.schema)
clean_column_name = [name.replace('"', '') for name in old_column_name]

# Apply the same (old, new) renames to both frames. Each fold step returns a
# new frame with one more column renamed.
data_train = reduce(
    lambda frame, names: frame.withColumnRenamed(*names),
    zip(old_column_name, clean_column_name), data_train)
# NOTE(review): data_test is not assigned anywhere in this snippet; it is
# presumably loaded earlier in the script -- confirm before running.
data_test = reduce(
    lambda frame, names: frame.withColumnRenamed(*names),
    zip(old_column_name, clean_column_name), data_test)
print(data_train.schema)

# Drop the rows whose quality equals 3 because that class contains very
# little data.
data_train_new = data_train.filter(data_train['quality'] != "3")

# Every column except quality is used as a feature column.
feature_cols = [x for x in data_train_new.columns if x != "quality"]

# Vector assembler that combines the feature columns into the single
# "feature" output column.
vect_assembler = VectorAssembler(inputCols=feature_cols, outputCol="feature")