def group_mean(data: pyspark.sql.DataFrame, groups, response, features):
    """Compute the per-group mean of each feature column.

    Parameters
    ----------
    data : pyspark.sql.DataFrame
        Frame containing the ``response`` column and all ``features`` columns.
    groups : sequence
        Group labels; one output row is produced per label, in order.
    response : str
        Name of the column holding the group label.
    features : sequence of str
        Names of the numeric feature columns.

    Returns
    -------
    numpy.ndarray
        Matrix of shape ``(len(groups), len(features))`` where row ``i``
        holds the feature means of the rows whose ``response`` equals
        ``groups[i]``.
    """
    # numpy.zeros, not scipy.zeros: the top-level NumPy aliases were
    # removed from SciPy in 1.0, so scipy.zeros raises AttributeError.
    means = numpy.zeros((len(groups), len(features)))
    for i, target in enumerate(groups):
        # Quote the target value, matching within_group_scatter, so that
        # string-typed response columns produce a valid SQL predicate.
        df_t = data.filter("{} == '{}'".format(response, target))
        X_t = df_t.select(features).rdd.map(numpy.array)
        means[i, :] = column_means(X_t)
    return means
def within_group_scatter(data: pyspark.sql.DataFrame, features, response, targets):
    """Compute the pooled within-group scatter matrix S_W.

    For each target class, the sample covariance of its feature rows is
    rescaled to a scatter matrix by multiplying with (n_class - 1), and
    the per-class scatter matrices are summed.

    Parameters
    ----------
    data : pyspark.sql.DataFrame
        Frame containing the ``response`` column and all ``features`` columns.
    features : sequence of str
        Names of the numeric feature columns.
    response : str
        Name of the column holding the class label.
    targets : sequence
        Class labels to pool over.

    Returns
    -------
    numpy.ndarray
        Symmetric matrix of shape ``(len(features), len(features))``.
    """
    n_features = len(features)
    scatter = numpy.zeros((n_features, n_features))
    for label in targets:
        subset = data.filter("{} == '{}'".format(response, label))
        rows = subset.select(features).rdd.map(numpy.array)
        # Covariance is normalized by (n - 1), so undo that factor to
        # recover the raw scatter contribution of this class.
        covariance = RowMatrix(rows).computeCovariance().toArray()
        scatter += covariance * (subset.count() - 1)
    return scatter