예제 #1
0
def group_mean(data: pyspark.sql.DataFrame, groups, response, features):
    """Compute one row of feature means per group.

    For every value in ``groups`` the rows of ``data`` whose ``response``
    column matches that value are selected, their ``features`` columns are
    turned into an RDD of numpy arrays, and ``column_means`` is applied to
    that RDD.

    :param data: DataFrame containing the response and feature columns.
    :param groups: sized iterable of response values; row ``i`` of the result
        corresponds to the ``i``-th value.
    :param response: name of the column used in the filter expression.
    :param features: sized collection of column names passed to ``select``.
    :return: ``numpy.ndarray`` of shape ``(len(groups), len(features))``.
    """
    # numpy.zeros is used instead of the deprecated scipy.zeros alias.
    means = numpy.zeros((len(groups), len(features)))
    for i, target in enumerate(groups):
        # NOTE(review): the target is interpolated without quotes, so a
        # string-valued group would be parsed as an identifier rather than a
        # literal -- confirm callers only pass numeric group values.
        df_t = data.filter("{} == {}".format(response, target))
        X_t = df_t.select(features).rdd.map(numpy.array)
        # column_means is defined elsewhere; presumably it returns one mean
        # per feature column, which is what the row assignment requires.
        means[i, :] = column_means(X_t)
    return means
예제 #2
0
def within_group_scatter(data: pyspark.sql.DataFrame, features, response,
                         targets):
    """Sum the per-group scatter contributions over all ``targets``.

    For each target the rows of ``data`` whose ``response`` column equals the
    target (compared as a quoted literal) are selected, wrapped in a
    ``RowMatrix``, and the group's covariance matrix multiplied by
    ``row count - 1`` is added to the running total.

    :param data: DataFrame containing the response and feature columns.
    :param features: sized collection of column names passed to ``select``.
    :param response: name of the column used in the filter expression.
    :param targets: iterable of response values, one per group.
    :return: ``numpy.ndarray`` of shape ``(len(features), len(features))``.
    """
    n_features = len(features)
    scatter = numpy.zeros((n_features, n_features))
    for label in targets:
        condition = "{} == '{}'".format(response, label)
        group_df = data.filter(condition)
        feature_rows = group_df.select(features).rdd.map(numpy.array)
        covariance = RowMatrix(feature_rows).computeCovariance().toArray()
        # Scaling by (count - 1) presumably undoes the sample-covariance
        # normalisation so that raw scatter is accumulated -- TODO confirm.
        scatter += covariance * (group_df.count() - 1)
    return scatter