import logging
import math
from typing import Tuple

import pandas as pd
import pyspark.sql
import pyspark.ml.classification as mlc
import pyspark.ml.feature as mlf
import pyspark.sql.functions as F

# NOTE: the module-level constants MINIMUM_POS_COUNT, NAIVE_THRESHOLD_COUNT
# and SAMPLES_PER_FEATURE, and the helpers _persist_if_unpersisted,
# _get_pred_cols and reduce_dimensionality, are assumed to be defined
# elsewhere in the source module.


def _calc_var(df: pyspark.sql.DataFrame, label_col: str) -> pd.DataFrame:
    r"""calculate variance for each column that isnt the label_col

     Parameters
    ----------
    df : pyspark.sql.DataFrame
        df where rows are observations, all columns except `label_col` are
        predictors.
    label_col : str

    Returns
    -------
    bias_df : pd.DataFrame
        pandas dataframe where predictors are index and only column is
        variance

    Raises
    ------
    UncaughtExceptions

    Notes
    -----


    """
    pred_cols = [x for x in df.columns if x != label_col]
    s_var_df = (df.groupby(label_col)
                .agg({x: 'variance' for x in pred_cols})
                .toPandas()
                .transpose())
    s_var_df = s_var_df.reset_index()
    # strip the 'variance(...)' wrapper Spark puts around aggregated columns
    s_var_df['index'] = s_var_df['index'].str.replace(
        r'variance\((.*)\)', r'\1', regex=True)
    s_var_df = s_var_df.set_index('index')
    s_var_df.columns = ["var_{0}".format(x) for x in s_var_df.columns]
    # drop the group-by key row so only predictor rows remain
    s_var_df = s_var_df.loc[s_var_df.index != label_col, :]
    return s_var_df
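

# A minimal usage sketch of `_calc_var` on hypothetical toy data (assumes a
# working Spark installation). Each predictor becomes an index row and each
# label group contributes one variance column:
if __name__ == '__main__':
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    toy_df = spark.createDataFrame(
        [(0, 1.0, 10.0), (0, 3.0, 14.0), (1, 5.0, 20.0), (1, 9.0, 28.0)],
        ['label', 'x1', 'x2'])
    # index: ['x1', 'x2']; columns: ['var_0', 'var_1']
    print(_calc_var(toy_df, label_col='label'))
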
Example #2
def impact(df: pyspark.sql.DataFrame, response_col: str,
           prob_mod: mlc.Model) -> Tuple[float, float, float]:
    r"""observe impact of treatment on response variable

    currently response must be binary
    if the df is small enough return naive difference in groupby label
    response mean. otherwise do additional regression on response col
    with label as predictor and use its coefficient as a measure of its
    impact. binning and dimensionality reduction will occur if necessary
    to do an effective regression

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    response_col : str
    prob_mod : mlc.Model
        propensity model, mostly used to keep track of features_col,
        label_col, pred_cols

    Returns
    -------
    treatment_rate : float
        treatment response rate
    control_rate : float
        control response rate
    adjusted_response : float
        impact of treatment on response, which may be
        `control_rate` - `treatment_rate` or may have further bias adjustment

    Raises
    ------
    ValueError
        when number of rows is less than `MINIMUM_POS_COUNT`*2
    UncaughtExceptions

    See Also
    --------
    bin_features
    reduce_dimensionality

    """

    _persist_if_unpersisted(df)

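    # pull the column roles off the propensity model; per the docstring, it
    # is mostly used to keep track of these names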
    label_col = prob_mod.getOrDefault('labelCol')
    features_col = prob_mod.getOrDefault('featuresCol')
    pred_cols = _get_pred_cols(df, features_col)

    all_count = df.count()

    # safety check
    if all_count < MINIMUM_POS_COUNT * 2:
        logging.getLogger(__name__).critical(
            "somehow have fewer than MINIMUM_POS_COUNT*2 rows")
        raise ValueError(
            "Have fewer than MINIMUM_POS_COUNT*2 rows, this shouldn't be happening"
        )

    # dict because 1, 0 for label col are not guaranteed to be ordered
    naive_response_dict = dict()
    response_list = df.groupby(label_col).mean(response_col).collect()
    naive_response_dict[response_list[0][label_col]] = response_list[0][
        "avg({col})".format(col=response_col)]
    naive_response_dict[response_list[1][label_col]] = response_list[1][
        "avg({col})".format(col=response_col)]
    treatment_rate, control_rate = naive_response_dict[1], naive_response_dict[
        0]
    logging.getLogger(__name__).info(
        "treatment_rate:{tr:.2f}   control_rate:{cr:.2f}".format(
            tr=treatment_rate, cr=control_rate))

    # return early if additional bias reduction is not applicable
    if all_count < NAIVE_THRESHOLD_COUNT:
        logging.getLogger(__name__).info(
            "additional bias adjustment inapplicable, returning naive difference"
        )
        return treatment_rate, control_rate, (control_rate - treatment_rate)

    logging.getLogger(__name__).info("additional bias adjustment possible")
    # choose fewer features if appropriate to prevent overfitting; round down
    num_preds = int(
        df.where(F.col(label_col) == 1).count() // SAMPLES_PER_FEATURE) - 1
    logging.getLogger(__name__).info(
        "can support at most {n:,} predictors".format(n=num_preds))
    if num_preds < len(pred_cols):
        logging.getLogger(__name__).info(
            "desired predictor count {np:,} is less than the existing {ep:,}, reducing dimensionality"
            .format(np=num_preds, ep=len(pred_cols)))
        kwargs = {
            'df': df,
            'label_col': label_col,
            'binned_features_col': features_col,
            'ncols': num_preds
        }
        df, pred_cols = reduce_dimensionality(args=kwargs, method='chi')

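    # re-fit: regress the response on the predictors plus the treatment
    # label, so the label's coefficient measures treatment impact net of
    # the other predictors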
    pred_cols_r = pred_cols + [label_col]
    assembler_r = mlf.VectorAssembler(inputCols=pred_cols_r,
                                      outputCol='features_r')
    df = assembler_r.transform(df)
    _persist_if_unpersisted(df)
    lre_r = mlc.LogisticRegression(
        featuresCol='features_r',
        labelCol=response_col,
        predictionCol='prediction_{0}'.format(response_col),
        rawPredictionCol='rawPrediction_{0}'.format(response_col),
        probabilityCol='probability_{0}'.format(response_col))
    lrm_r = lre_r.fit(df)

    coeff_dict = dict(zip(pred_cols_r, lrm_r.coefficients))

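    # the label's coefficient is the log-odds ratio of response attributable
    # to treatment; for a rare response, exp(coef) approximates the relative
    # risk, so control_rate * (1 - exp(coef)) estimates the adjusted impact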
    adjusted_response = control_rate * (1 - math.exp(coeff_dict[label_col]))
    logging.getLogger(__name__).info(
        "bias adjusted response is {ar:.2f}".format(ar=adjusted_response))
    return treatment_rate, control_rate, adjusted_response
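

# A minimal end-to-end sketch of calling `impact` (hypothetical names and
# synthetic data; assumes the module-level constants and helpers noted at
# the top of the file are defined, and that 2,000 rows clears
# MINIMUM_POS_COUNT * 2):
if __name__ == '__main__':
    import random
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    rows = [(random.randint(0, 1), random.random(), random.random(),
             random.randint(0, 1)) for _ in range(2000)]
    sdf = spark.createDataFrame(rows, ['label', 'x1', 'x2', 'converted'])
    sdf = mlf.VectorAssembler(inputCols=['x1', 'x2'],
                              outputCol='features').transform(sdf)
    prop_mod = mlc.LogisticRegression(featuresCol='features',
                                      labelCol='label').fit(sdf)
    t_rate, c_rate, adj = impact(sdf, response_col='converted',
                                 prob_mod=prop_mod)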