Example #1
0
    def preprocess(self):
        """Preprocess the raw file.

        Preprocess the file downloaded via the url,
        convert it to a dataframe consisting of the user-item interactions
        and save it in the processed directory.
        """
        # Compute each path once (the original built the zip path and the
        # interaction path twice with identical arguments).
        zip_file_name = os.path.join(self.raw_path, "dunnhumby.zip")
        if not os.path.exists(zip_file_name):
            print("Raw file doesn't exist, try to download it.")
            self.download()
        unzip_file_name = os.path.join(self.raw_path, "dunnhumby_The-Complete-Journey")
        if not os.path.exists(unzip_file_name):
            un_zip(zip_file_name)

        interaction_file = os.path.join(
            self.processed_path, f"{self.dataset_name}_interaction.npz"
        )
        if not os.path.exists(interaction_file):
            # Parse the extracted CSV directory into an interaction dataframe
            # and cache it as a compressed .npz file.
            data = self.parse_raw_data(
                os.path.join(unzip_file_name, "dunnhumby - The Complete Journey CSV")
            )
            self.save_dataframe_as_npz(data, interaction_file)
Example #2
0
    def load_random_split(self,
                          test_rate=0.1,
                          random=False,
                          n_negative=100,
                          by_user=False,
                          n_test=10):
        """Load split data generated by random_split.

        If the requested split does not exist locally, it is either downloaded
        from OneDrive (only for the default parameters, test_rate = 0.1 and
        by_user = False) or generated via make_random_split, and then loaded
        from local storage.

        Args:
            test_rate: Percentage of the test data. Note that the percentage
                of the validation data will be the same as the test data.
            random: bool. Whether randomly leave one basket as testing.
            n_negative: Number of negative samples for testing and validation
                data.
            by_user: bool. Default False.
                - True: user-based split,
                - False: global split,
            n_test: int. Default 10. The number of testing and validation
                copies. If n_test==0, will load the original (no negative
                items) valid and test datasets.

        Returns:
            train_data (DataFrame): Interaction for training.
            valid_data list(DataFrame): List of interactions for validation.
            test_data list(DataFrame): List of interactions for testing.
        """
        split_root = os.path.join(self.processed_path, "random")
        if not os.path.exists(split_root):
            os.mkdir(split_root)

        param_dir = generate_parameterized_path(test_rate=test_rate,
                                                random=random,
                                                n_negative=n_negative,
                                                by_user=by_user)
        split_dir = os.path.join(split_root, param_dir)
        if not os.path.exists(split_dir):
            is_default = (test_rate == 0.1 and random is False
                          and n_negative == 100 and by_user is False)
            if is_default:
                # The default split is published on OneDrive; fetch the zip
                # into the parent folder and extract it in place.
                OneDrive(url=self.processed_random_split_url,
                         path=split_root).download()
                un_zip(split_dir + ".zip", split_root)
            else:
                # Non-default parameters: generate the split locally.
                self.make_random_split(
                    test_rate=test_rate,
                    random=random,
                    n_negative=n_negative,
                    by_user=by_user,
                    n_test=n_test,
                )

        # Load the (now guaranteed to exist) split from local storage.
        return load_split_data(split_dir, n_test=n_test)
Example #3
0
    def load_leave_one_out(self,
                           random=False,
                           n_negative=100,
                           n_test=10,
                           download=False):
        """Load split data generated by leave_one_out.

        If the requested split is missing locally, it is downloaded from
        OneDrive (only when download is True and the parameters are the
        defaults) or generated via make_leave_one_out, and then loaded
        from local storage.

        Args:
            random: bool. Whether randomly leave one item as testing.
            n_negative: Number of negative samples for testing and validation
                data.
            n_test: int. Default 10. The number of testing and validation
                copies. If n_test==0, will load the original (no negative
                items) valid and test datasets.
            download: bool. Default False. Whether the pre-built default
                split may be fetched from OneDrive instead of being
                generated locally.

        Returns:
            train_data (DataFrame): Interaction for training.
            valid_data list(DataFrame): List of interactions for validation.
            test_data list(DataFrame): List of interactions for testing.
        """
        split_root = os.path.join(self.processed_path, "leave_one_out")
        if not os.path.exists(split_root):
            os.mkdir(split_root)

        # test_rate is meaningless for leave-one-out, so it is fixed at 0.
        param_dir = generate_parameterized_path(test_rate=0,
                                                random=random,
                                                n_negative=n_negative,
                                                by_user=False)
        split_dir = os.path.join(split_root, param_dir)

        if not os.path.exists(split_dir):
            if download and random is False and n_negative == 100:
                # Default parameters: fetch the pre-built split from OneDrive
                # into the parent folder and extract it in place.
                OneDrive(url=self.processed_leave_one_out_url,
                         path=split_root).download()
                un_zip(split_dir + ".zip", split_root)
            else:
                # Generate the split locally.
                self.make_leave_one_out(random=random,
                                        n_negative=n_negative,
                                        n_test=n_test)

        # Load the (now guaranteed to exist) split from local storage.
        return load_split_data(split_dir, n_test=n_test)
Example #4
0
    def preprocess(self):
        """Preprocess the raw file.

        Preprocess the file downloaded via the url,
        convert it to a dataframe consisting of the user-item interactions
        and save it in the processed directory.

        Steps:
        1. Download the instacart dataset if it does not exist yet.
        2. Read "order_products__prior.csv", "order_products__train.csv"
           and "orders.csv".
        3. Merge the product rows with their orders.
        4. Add rating and timestamp columns and rename to the default schema.
        5. Sample 25% of the users and save the data model.
        """
        # Step 1: make sure the raw CSV files are present locally.
        print("Start loading data from raw data")
        raw_dir = os.path.join(self.raw_path, self.dataset_name)
        prior_file = os.path.join(raw_dir, "order_products__prior.csv")
        train_file = os.path.join(raw_dir, "order_products__train.csv")
        if not (os.path.exists(prior_file) and os.path.exists(train_file)):
            print("Raw file doesn't exist, try to download it.")
            self.download()
            un_zip(os.path.join(self.raw_path, self.dataset_name + ".gz"))

        #  order_products__*.csv: order_id,product_id,add_to_cart_order,reordered
        wanted_cols = ["order_id", "product_id", "add_to_cart_order"]
        order_products = pd.concat([
            pd.read_csv(prior_file, usecols=wanted_cols),
            pd.read_csv(train_file, usecols=wanted_cols),
        ])

        #  orders.csv:  order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
        orders = pd.read_csv(
            os.path.join(raw_dir, "orders.csv"),
            usecols=["user_id", "order_id", "order_number", "eval_set"])

        merged = order_products.merge(orders, how="left", on="order_id")

        # Count each (order, cart slot, user, product, eval_set) combination;
        # the group size becomes the implicit rating.
        group_cols = [
            "order_id", "add_to_cart_order", "user_id", "product_id",
            "eval_set"
        ]
        interactions = (merged.groupby(group_cols)
                        .size()
                        .rename("ratings")
                        .reset_index())
        interactions.rename(
            columns={
                "order_id": DEFAULT_ORDER_COL,
                "user_id": DEFAULT_USER_COL,
                "product_id": DEFAULT_ITEM_COL,
                "ratings": DEFAULT_RATING_COL,
                "eval_set": DEFAULT_FLAG_COL,
            },
            inplace=True,
        )
        # Use the row index as a surrogate timestamp.
        interactions = interactions.assign(
            **{DEFAULT_TIMESTAMP_COL: interactions.index})

        print("Start sampling 25% users from the raw data")
        users = list(interactions[DEFAULT_USER_COL].unique())
        sampled_users = random.sample(users, int(len(users) * 0.25))
        interactions = interactions[
            interactions[DEFAULT_USER_COL].isin(sampled_users)]

        print("Loading raw data completed")
        # save processed data into the disk.
        self.save_dataframe_as_npz(
            interactions,
            os.path.join(self.processed_path,
                         f"{self.dataset_name}_interaction.npz"),
        )