def etl_pandas(filename, columns_names, columns_types, etl_keys):
    etl_times = {key: 0.0 for key in etl_keys}

    t0 = timer()
    train_pd = load_data_pandas(
        filename=filename,
        columns_names=columns_names,
        columns_types=columns_types,
        header=0,
        nrows=None,
        use_gzip=filename.endswith(".gz"),
        pd=run_benchmark.__globals__["pd"],
    )
    etl_times["t_readcsv"] = round((timer() - t0) * 1000)

    t_etl_begin = timer()

    for i in range(200):
        col = "var_%d" % i
        var_count = train_pd.groupby(col).agg({col: "count"})

        var_count.columns = ["%s_count" % col]
        var_count = var_count.reset_index()

        train_pd = train_pd.merge(var_count, on=col, how="left")

    for i in range(200):
        col = "var_%d" % i

        mask = train_pd["%s_count" % col] > 1
        train_pd.loc[mask, "%s_gt1" % col] = train_pd.loc[mask, col]

    train_pd = train_pd.drop(["ID_code"], axis=1)
    etl_times["t_etl"] = round((timer() - t_etl_begin) * 1000)

    return train_pd, etl_times
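
# A minimal standalone sketch of the per-column count-feature pattern used in the
# loop above, on a toy frame with plain pandas (data and names here are
# illustrative, not from the benchmark):
import pandas as pd

toy = pd.DataFrame({"var_0": [1, 1, 2], "ID_code": ["a", "b", "c"]})
var_count = toy.groupby("var_0").agg({"var_0": "count"})
var_count.columns = ["var_0_count"]
toy = toy.merge(var_count.reset_index(), on="var_0", how="left")
print(toy["var_0_count"].tolist())  # [2, 2, 1]: how often each row's var_0 value occurs
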
Example #2
def etl_pandas(
    filename,
    files_limit,
    columns_names,
    columns_types,
):
    queries = {
        "Query1": q1_pandas,
        "Query2": q2_pandas,
        "Query3": q3_pandas,
        "Query4": q4_pandas,
    }
    etl_times = {x: 0.0 for x in queries.keys()}

    t0 = time.time()
    df_from_each_file = [
        load_data_pandas(
            filename=f,
            columns_names=columns_names,
            header=0,
            nrows=None,
            use_gzip=f.endswith(".gz"),
            parse_dates=[
                "pickup_datetime",
                "dropoff_datetime",
            ],
            pd=run_benchmark.__globals__["pd"],
        ) for f in filename
    ]
    concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
    etl_times["t_readcsv"] = time.time() - t0

    queries_parameters = {"df": concatenated_df}
    return run_queries(queries=queries,
                       parameters=queries_parameters,
                       etl_times=etl_times)
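
# A runnable sketch of the read-then-concat timing pattern above, assuming plain
# pandas and in-memory CSVs in place of the benchmark files (everything here is
# illustrative):
import io
import time

import pandas as pd

t0 = time.time()
parts = [pd.read_csv(io.StringIO("a,b\n1,2\n3,4\n")) for _ in range(3)]
combined = pd.concat(parts, ignore_index=True)
print("t_readcsv:", time.time() - t0, "seconds;", len(combined), "rows")
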
Example #3
def etl_pandas(filename, columns_names, columns_types, etl_keys, pandas_mode):
    etl_times = {key: 0.0 for key in etl_keys}

    t0 = timer()
    if pandas_mode == "Modin_on_omnisci":
        df = load_data_modin_on_omnisci(
            filename=filename,
            columns_names=columns_names,
            columns_types=columns_types,
            skiprows=1,
            pd=run_benchmark.__globals__["pd"],
        )
    else:
        df = load_data_pandas(
            filename=filename,
            columns_names=columns_names,
            columns_types=columns_types,
            header=0,
            nrows=None,
            use_gzip=filename.endswith(".gz"),
            pd=run_benchmark.__globals__["pd"],
        )
    etl_times["t_readcsv"] = timer() - t0

    t_etl_start = timer()

    keep_cols = [
        "YEAR0",
        "DATANUM",
        "SERIAL",
        "CBSERIAL",
        "HHWT",
        "CPI99",
        "GQ",
        "PERNUM",
        "SEX",
        "AGE",
        "INCTOT",
        "EDUC",
        "EDUCD",
        "EDUC_HEAD",
        "EDUC_POP",
        "EDUC_MOM",
        "EDUCD_MOM2",
        "EDUCD_POP2",
        "INCTOT_MOM",
        "INCTOT_POP",
        "INCTOT_MOM2",
        "INCTOT_POP2",
        "INCTOT_HEAD",
        "SEX_HEAD",
    ]
    df = df[keep_cols]

    df = df[df["INCTOT"] != 9999999]
    df = df[df["EDUC"] != -1]
    df = df[df["EDUCD"] != -1]

    df["INCTOT"] = df["INCTOT"] * df["CPI99"]

    for column in keep_cols:
        # replace missing values with a sentinel and coerce every column to float64
        df[column] = df[column].fillna(-1).astype("float64")

    y = df["EDUC"]
    X = df.drop(columns=["EDUC", "CPI99"])

    # trigger computation: accessing .shape forces execution under lazy backends such as Modin
    df.shape
    y.shape
    X.shape

    etl_times["t_etl"] = timer() - t_etl_start
    print("DataFrame shape:", X.shape)

    return df, X, y, etl_times
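
# Standalone sketch of the census-style cleanup above on a toy frame; the
# sentinel values (9999999, -1) mirror the code, the data is illustrative:
import pandas as pd

toy = pd.DataFrame(
    {"INCTOT": [100.0, 9999999.0, None], "CPI99": [1.5, 1.5, 1.5], "EDUC": [10, 5, 7]}
)
toy = toy[toy["INCTOT"] != 9999999]           # drop the missing-income sentinel
toy = toy[toy["EDUC"] != -1]                  # drop unknown education codes
toy["INCTOT"] = toy["INCTOT"] * toy["CPI99"]  # deflate income to 1999 dollars
for column in toy.columns:
    toy[column] = toy[column].fillna(-1).astype("float64")
print(toy)  # the row with missing INCTOT survives the filters and is filled with -1
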
Example #4
def etl_pandas(filename, columns_names, columns_types, etl_keys):
    etl_times = {key: 0.0 for key in etl_keys}

    t0 = timer()
    df = load_data_pandas(
        filename=filename,
        columns_names=columns_names,
        columns_types=columns_types,
        header=0,
        nrows=None,
        use_gzip=filename.endswith(".gz"),
        pd=run_benchmark.__globals__["pd"],
    )
    etl_times["t_readcsv"] = round((timer() - t0) * 1000)

    t_etl_start = timer()

    keep_cols = [
        "YEAR0",
        "DATANUM",
        "SERIAL",
        "CBSERIAL",
        "HHWT",
        "CPI99",
        "GQ",
        "PERNUM",
        "SEX",
        "AGE",
        "INCTOT",
        "EDUC",
        "EDUCD",
        "EDUC_HEAD",
        "EDUC_POP",
        "EDUC_MOM",
        "EDUCD_MOM2",
        "EDUCD_POP2",
        "INCTOT_MOM",
        "INCTOT_POP",
        "INCTOT_MOM2",
        "INCTOT_POP2",
        "INCTOT_HEAD",
        "SEX_HEAD",
    ]
    df = df[keep_cols]

    df = df.query("INCTOT != 9999999")
    df = df.query("EDUC != -1")
    df = df.query("EDUCD != -1")

    df["INCTOT"] = df["INCTOT"] * df["CPI99"]

    for column in keep_cols:
        # replace missing values with a sentinel and coerce every column to float64
        df[column] = df[column].fillna(-1).astype("float64")

    y = df["EDUC"]
    X = df.drop(columns=["EDUC", "CPI99"])

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    print("DataFrame shape:", X.shape)

    return df, X, y, etl_times
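
# The .query() filters above are equivalent to the boolean-mask filtering in
# Example #3; a quick illustrative check:
import pandas as pd

toy = pd.DataFrame({"INCTOT": [1, 9999999], "EDUC": [5, -1]})
assert toy.query("INCTOT != 9999999").equals(toy[toy["INCTOT"] != 9999999])
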
Example #5
def etl_pandas(
    filename, files_limit, columns_names, columns_types, output_for_validation, pandas_mode
):
    if pandas_mode == "Modin_on_omnisci" and any(f.endswith(".gz") for f in filename):
        raise NotImplementedError(
            "Modin_on_omnisci mode doesn't support import of compressed files yet"
        )

    queries = {"Query1": q1_pandas, "Query2": q2_pandas, "Query3": q3_pandas, "Query4": q4_pandas}
    etl_results = {x: 0.0 for x in queries.keys()}

    t0 = timer()
    if pandas_mode == "Modin_on_omnisci":
        df_from_each_file = [
            load_data_modin_on_omnisci(
                filename=f,
                columns_names=columns_names,
                columns_types=columns_types,
                parse_dates=["timestamp"],
                pd=run_benchmark.__globals__["pd"],
            )
            for f in filename
        ]
    else:
        df_from_each_file = [
            load_data_pandas(
                filename=f,
                columns_names=columns_names,
                header=None,
                nrows=None,
                use_gzip=f.endswith(".gz"),
                parse_dates=["pickup_datetime", "dropoff_datetime"],
                pd=run_benchmark.__globals__["pd"],
                pandas_mode=pandas_mode,
            )
            for f in filename
        ]

    concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
    # this is to trigger data import in `Modin_on_omnisci` mode
    if pandas_mode == "Modin_on_omnisci":
        from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer

        concatenated_df.shape
        partition = concatenated_df._query_compiler._modin_frame._partitions[0][0]
        partition.frame_id = OmnisciServer().put_arrow_to_omnisci(partition.get())
    etl_results["t_readcsv"] = timer() - t0

    queries_parameters = {
        query_name: {
            "df": concatenated_df.copy() if pandas_mode == "Modin_on_omnisci" else concatenated_df,
            "pandas_mode": pandas_mode,
        }
        for query_name in list(queries.keys())
    }

    return run_queries(
        queries=queries,
        parameters=queries_parameters,
        etl_results=etl_results,
        output_for_validation=output_for_validation,
    )
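
# Sketch of the per-query parameters pattern above: each query name maps to its
# own dict, and copying the frame (as done above in Modin_on_omnisci mode)
# presumably keeps one query's mutations from leaking into the next. Plain-pandas
# stand-in with illustrative names:
import pandas as pd

df = pd.DataFrame({"x": [1, 2]})
parameters = {name: {"df": df.copy()} for name in ["Query1", "Query2"]}
parameters["Query1"]["df"]["x"] = 0              # mutate one query's copy...
print(parameters["Query2"]["df"]["x"].tolist())  # ...the other still sees [1, 2]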