예제 #1
0
def remove_missing_orders_primary_keys(dataframes):
    """Drop child-table rows whose ``order_id`` is absent from the orders table.

    The order payments, order reviews and order items frames held in
    ``dataframes`` are filtered in place. Each filtered frame is then handed
    to ``__to_csv`` together with the path that
    ``const.MACRO_GET_DATASET_DIR`` returns for its table.

    Args:
        dataframes: Object with a ``.get`` method that maps table names to
            DataFrames (presumably the dict built by ``load_datasets``).
    """
    orders = dataframes.get(const.OLIST_TABLE_ORDERS)
    child_tables = (
        const.OLIST_TABLE_ORDER_PAYMENTS,
        const.OLIST_TABLE_ORDER_REVIEWS,
        const.OLIST_TABLE_ORDER_ITEMS,
    )
    children = [dataframes.get(table) for table in child_tables]

    # Build every membership mask before touching any frame, so a failure
    # here leaves all tables unmodified.
    has_known_order = [
        child["order_id"].isin(orders["order_id"]) for child in children
    ]

    for child, known in zip(children, has_known_order):
        child.drop(child[~known].index, inplace=True)

    for table, child in zip(child_tables, children):
        output_path = const.MACRO_GET_DATASET_DIR(table)
        __to_csv(child, output_path)
예제 #2
0
def remove_missing_geolocation_primary_keys(dataframes):
    """Drop customers and sellers whose ``primary_key`` has no geolocation row.

    ``__set_geolocation_primary_key_to_df`` is called on the geolocation,
    customers and sellers frames; the frames it returns are expected to carry
    a ``primary_key`` column. Customers and sellers whose key does not occur
    in the geolocation frame are dropped in place. A copy of each frame
    without the ``primary_key`` column is then passed to ``__to_csv``.

    Args:
        dataframes: Object with a ``.get`` method that maps table names to
            DataFrames.
    """
    geolocation = __set_geolocation_primary_key_to_df(
        dataframes.get(const.OLIST_TABLE_GEOLOCATION), "geolocation")
    customers = __set_geolocation_primary_key_to_df(
        dataframes.get(const.OLIST_TABLE_CUSTOMERS), "customer")
    sellers = __set_geolocation_primary_key_to_df(
        dataframes.get(const.OLIST_TABLE_SELLERS), "seller")

    customer_is_located = customers["primary_key"].isin(
        geolocation["primary_key"])
    seller_is_located = sellers["primary_key"].isin(
        geolocation["primary_key"])

    customers.drop(customers[~customer_is_located].index, inplace=True)
    sellers.drop(sellers[~seller_is_located].index, inplace=True)

    # NOTE(review): the helper column is removed from new copies only. Whether
    # the frames stored in `dataframes` keep a "primary_key" column depends on
    # whether the helper returns its argument or a copy -- confirm.
    customers_to_write = customers.drop(columns="primary_key")
    sellers_to_write = sellers.drop(columns="primary_key")

    customers_path = const.MACRO_GET_DATASET_DIR(const.OLIST_TABLE_CUSTOMERS)
    __to_csv(customers_to_write, customers_path)

    sellers_path = const.MACRO_GET_DATASET_DIR(const.OLIST_TABLE_SELLERS)
    __to_csv(sellers_to_write, sellers_path)
예제 #3
0
def remove_missing_customers_primary_keys(dataframes):
    """Drop orders whose ``customer_id`` is absent from the customers table.

    The orders frame held in ``dataframes`` is filtered in place and then
    passed to ``__to_csv`` with the path that
    ``const.MACRO_GET_DATASET_DIR`` returns for the orders table.

    Args:
        dataframes: Object with a ``.get`` method that maps table names to
            DataFrames.
    """
    customers = dataframes.get(const.OLIST_TABLE_CUSTOMERS)
    orders = dataframes.get(const.OLIST_TABLE_ORDERS)

    has_known_customer = orders["customer_id"].isin(customers["customer_id"])
    orphan_rows = orders[~has_known_customer].index
    orders.drop(orphan_rows, inplace=True)

    __to_csv(orders, const.MACRO_GET_DATASET_DIR(const.OLIST_TABLE_ORDERS))
예제 #4
0
def remove_duplicates_from_geolocation(dataframes):
    """Keep only the last geolocation row for each ``primary_key`` value.

    The frame returned by ``__set_geolocation_primary_key_to_df`` is
    de-duplicated on its ``primary_key`` column (last occurrence wins). That
    column is then removed and the result is passed to ``__to_csv``.

    Args:
        dataframes: Object with a ``.get`` method that maps table names to
            DataFrames.
    """
    geolocation = __set_geolocation_primary_key_to_df(
        dataframes.get(const.OLIST_TABLE_GEOLOCATION), "geolocation")

    geolocation.drop_duplicates(
        subset=["primary_key"], keep="last", inplace=True)
    # The key column exists only to detect duplicates; remove it again
    # before the frame is written.
    geolocation.drop(columns="primary_key", inplace=True)

    output_path = const.MACRO_GET_DATASET_DIR(const.OLIST_TABLE_GEOLOCATION)
    __to_csv(geolocation, output_path)
예제 #5
0
def __drop_rows_missing_required(df, nullable_columns):
    """Blank out whitespace-only cells, then drop rows lacking required data.

    Cells matching ``^\\s*$`` become NaN. Rows with NaN in any column returned
    by ``const.MACRO_GET_REQUIRED_COLUMNS(df, nullable_columns)`` are then
    removed. ``df`` is modified in place.
    """
    df.replace(r"^\s*$", np.nan, regex=True, inplace=True)
    required = const.MACRO_GET_REQUIRED_COLUMNS(df, nullable_columns)
    df.dropna(axis=0, subset=required, inplace=True)


def load_datasets(drop_na=False):
    """Read every dataset table from CSV, clean it and write it back.

    Each table listed in ``const.OLIST_DATASET_TABLES`` and the table named
    by ``const.WECR_DATASET_TABLE`` is read from the path returned by
    ``const.MACRO_GET_DATASET_DIR``. It is optionally stripped of incomplete
    rows and passed through ``__drop_duplicate_primary_keys``. The WECR table
    is also passed through ``__rename_columns``. Each table is finally handed
    to ``__to_csv`` with the same path it was read from.

    Args:
        drop_na: When truthy, whitespace-only cells are turned into NaN and
            rows with NaN in a required column are dropped. Any truthy value
            is honoured (for example ``numpy.bool_(True)``), not only the
            ``True`` singleton.

    Returns:
        dict mapping each table name to its cleaned DataFrame.
    """
    dataframes = {}

    # Brazilian E-Commerce Public Dataset by Olist
    for table in const.OLIST_DATASET_TABLES:
        csv_path = const.MACRO_GET_DATASET_DIR(table)
        dtype = const.OLIST_DATASET_TABLES_TYPES_MAP.get(table)
        df = pd.read_csv(csv_path, dtype=dtype)

        if drop_na:
            __drop_rows_missing_required(
                df, const.OLIST_DATASET_TABLES_NULLABLE_COLUMNS[table])

        # Return value is ignored; presumably de-duplicates df in place.
        __drop_duplicate_primary_keys(df)
        dataframes[table] = df

        __to_csv(df, csv_path)

    # Women's E-Commerce Clothing Reviews
    wecr_path = const.MACRO_GET_DATASET_DIR(const.WECR_DATASET_TABLE)
    wecr_df = pd.read_csv(wecr_path,
                          dtype=const.WECR_DATASET_COLUMNS_TYPE_MAP)

    if drop_na:
        # Runs before the rename, so the nullable-column names refer to the
        # column names as they appear in the CSV.
        __drop_rows_missing_required(wecr_df,
                                     const.WECR_DATASET_NULLABLE_COLUMNS)

    __rename_columns(wecr_df, const.WECR_COLUMN_NAME_MAP)
    __drop_duplicate_primary_keys(wecr_df)

    dataframes[const.WECR_DATASET_TABLE] = wecr_df

    __to_csv(wecr_df, wecr_path)

    return dataframes
예제 #6
0
def replace_products_product_category_name(dataframes):
    """Rename two product categories in the products table.

    In the ``product_category_name`` column,
    ``portateis_cozinha_e_preparadores_de_alimentos`` becomes
    ``eletroportateis`` and ``pc_gamer`` becomes ``pcs``. Every other value is
    left untouched. The updated frame is then passed to ``__to_csv``.

    Args:
        dataframes: Object with a ``.get`` method that maps table names to
            DataFrames.
    """
    products = dataframes.get(const.OLIST_TABLE_PRODUCTS)
    column = "product_category_name"

    # (old name, new name) pairs, applied one after the other.
    renames = (
        ("portateis_cozinha_e_preparadores_de_alimentos", "eletroportateis"),
        ("pc_gamer", "pcs"),
    )

    for old_name, new_name in renames:
        # Defaults bind the current pair to this pass's function.
        def rename(name, old_name=old_name, new_name=new_name):
            if name == old_name:
                return new_name
            return name

        products[column] = products[column].apply(rename)

    output_path = const.MACRO_GET_DATASET_DIR(const.OLIST_TABLE_PRODUCTS)
    __to_csv(products, output_path)
예제 #7
0
def remove_missing_products_primary_keys(dataframes):
    """Drop order items whose ``product_id`` is absent from the products table.

    The order items frame held in ``dataframes`` is filtered in place and
    then passed to ``__to_csv`` with the path that
    ``const.MACRO_GET_DATASET_DIR`` returns for the order items table.

    Args:
        dataframes: Object with a ``.get`` method that maps table names to
            DataFrames.
    """
    products = dataframes.get(const.OLIST_TABLE_PRODUCTS)
    order_items = dataframes.get(const.OLIST_TABLE_ORDER_ITEMS)

    has_known_product = order_items["product_id"].isin(
        products["product_id"])
    unknown_product_rows = order_items[~has_known_product].index
    order_items.drop(unknown_product_rows, inplace=True)

    output_path = const.MACRO_GET_DATASET_DIR(const.OLIST_TABLE_ORDER_ITEMS)
    __to_csv(order_items, output_path)