# Example 1
    def __init__(self, config, *args, **kwargs):
        """
        Constructor of DataSet.

        Loads interaction data (plus visual side information) according to
        ``config.data_config.strategy``:

        * ``fixed``:     pre-split train/(validation)/test TSV files
        * ``hierarchy``: splits already materialized under a root folder
        * ``dataset``:   a single TSV file, pre-filtered and split here

        :param config: experiment configuration namespace
        :param args: extra positional arguments, stored untouched
        :param kwargs: extra keyword arguments, stored untouched
        :raises Exception: if the strategy option is not recognized
        """
        self.logger = logging.get_logger(self.__class__.__name__)
        self.args = args
        self.kwargs = kwargs
        self.config = config
        self.column_names = ['userId', 'itemId', 'rating', 'timestamp']
        if config.config_test:
            # Configuration-only test run: skip all data loading.
            return

        def _load_visual_side_info():
            """Read the visual side-information config and compute the set of
            item ids covered by it (features, images, or their intersection).
            Shared by the "fixed" and "dataset" strategies."""
            side = config.data_config.side_information
            visual_feature_path = getattr(side, "visual_features", None)
            item_mapping_path = getattr(side, "item_mapping", None)
            size_tuple = getattr(side, "output_image_size", None)
            images_src_folder = getattr(side, "images_src_folder", None)

            if visual_feature_path and item_mapping_path:
                feature_set = set(
                    pd.read_csv(item_mapping_path, sep="\t",
                                header=None)[0].unique().tolist())
            else:
                # BUG FIX: original used ``{}``, which is an empty dict,
                # not an empty set.
                feature_set = set()

            if images_src_folder:
                # Image files are named "<itemId>.<ext>".
                image_set = {
                    int(name.split(".")[0])
                    for name in os.listdir(images_src_folder)
                }
            else:
                image_set = set()

            if feature_set and image_set:
                # BUG FIX: original computed ``feature_set and image_set``,
                # which simply evaluates to ``image_set``; the intended value
                # is the intersection of items having BOTH features and images.
                visual_set = feature_set & image_set
            else:
                # Whichever is non-empty, or an empty set if neither is.
                visual_set = feature_set or image_set or set()

            return (visual_feature_path, item_mapping_path,
                    images_src_folder, size_tuple, visual_set)

        if config.data_config.strategy == "fixed":
            path_train_data = config.data_config.train_path
            path_val_data = getattr(config.data_config, "validation_path",
                                    None)
            path_test_data = config.data_config.test_path

            (visual_feature_path, item_mapping_path, images_src_folder,
             size_tuple, visual_set) = _load_visual_side_info()

            self.side_information_data = SimpleNamespace()

            self.train_dataframe, self.side_information_data.aligned_items = self.load_dataset_dataframe(
                path_train_data, "\t", visual_set)
            self.side_information_data.visual_feature_path = visual_feature_path
            self.side_information_data.item_mapping_path = item_mapping_path
            self.side_information_data.images_src_folder = images_src_folder
            self.side_information_data.size_tuple = size_tuple

            self.train_dataframe = self.check_timestamp(self.train_dataframe)

            self.logger.info('{0} - Loaded'.format(path_train_data))

            self.test_dataframe = pd.read_csv(path_test_data,
                                              sep="\t",
                                              header=None,
                                              names=self.column_names)
            self.test_dataframe = self.check_timestamp(self.test_dataframe)

            if path_val_data:
                self.validation_dataframe = pd.read_csv(
                    path_val_data,
                    sep="\t",
                    header=None,
                    names=self.column_names)
                self.validation_dataframe = self.check_timestamp(
                    self.validation_dataframe)

                self.tuple_list = [([
                    (self.train_dataframe, self.validation_dataframe)
                ], self.test_dataframe)]
            else:
                self.tuple_list = [(self.train_dataframe, self.test_dataframe)]

        elif config.data_config.strategy == "hierarchy":
            self.tuple_list = self.read_splitting(
                config.data_config.root_folder)

        elif config.data_config.strategy == "dataset":
            self.logger.info("There will be the splitting")
            path_dataset = config.data_config.dataset_path

            (visual_feature_path, item_mapping_path, images_src_folder,
             size_tuple, visual_set) = _load_visual_side_info()

            self.side_information_data = SimpleNamespace()

            self.dataframe, self.side_information_data.aligned_items = self.load_dataset_dataframe(
                path_dataset, "\t", visual_set)
            self.side_information_data.visual_feature_path = visual_feature_path
            self.side_information_data.item_mapping_path = item_mapping_path
            self.side_information_data.images_src_folder = images_src_folder
            self.side_information_data.size_tuple = size_tuple

            self.dataframe = self.check_timestamp(self.dataframe)

            self.logger.info('{0} - Loaded'.format(path_dataset))

            self.dataframe = PreFilter.filter(self.dataframe, self.config)

            splitter = Splitter(self.dataframe, self.config.splitting)
            self.tuple_list = splitter.process_splitting()

        else:
            raise Exception("Strategy option not recognized")
# Example 2
    def __init__(self, config, *args, **kwargs):
        """
        Constructor of DataSet.

        Depending on ``config.data_config.strategy`` this either loads
        pre-split train/(validation)/test TSV files ("fixed"), reads an
        already-materialized splitting folder ("hierarchy"), or loads one
        dataset file, pre-filters it and splits it here ("dataset").

        :param config: experiment configuration namespace
        :param args: extra positional arguments, stored untouched
        :param kwargs: extra keyword arguments, stored untouched
        :raises Exception: if the strategy option is not recognized
        """
        self.logger = logging.get_logger(self.__class__.__name__)
        self.args = args
        self.kwargs = kwargs
        self.config = config
        self.column_names = ['userId', 'itemId', 'rating', 'timestamp']
        if config.config_test:
            return

        def read_tsv(path):
            # All rating files share the same headerless TSV layout.
            return pd.read_csv(path,
                               sep="\t",
                               header=None,
                               names=self.column_names)

        strategy = config.data_config.strategy

        if strategy == "fixed":
            train_path = config.data_config.train_path
            val_path = getattr(config.data_config, "validation_path", None)
            test_path = config.data_config.test_path

            self.train_dataframe = self.check_timestamp(read_tsv(train_path))
            self.logger.info(f"{train_path} - Loaded")

            self.test_dataframe = self.check_timestamp(read_tsv(test_path))

            if val_path:
                self.validation_dataframe = self.check_timestamp(
                    read_tsv(val_path))
                self.tuple_list = [([
                    (self.train_dataframe, self.validation_dataframe)
                ], self.test_dataframe)]
            else:
                self.tuple_list = [(self.train_dataframe, self.test_dataframe)]

        elif strategy == "hierarchy":
            self.tuple_list = self.read_splitting(
                config.data_config.root_folder)

        elif strategy == "dataset":
            self.logger.info("There will be the splitting")
            dataset_path = config.data_config.dataset_path

            self.dataframe = self.check_timestamp(read_tsv(dataset_path))
            self.logger.info(f"{dataset_path} - Loaded")

            self.dataframe = PreFilter.filter(self.dataframe, self.config)

            splitter = Splitter(self.dataframe, self.config.splitting)
            self.tuple_list = splitter.process_splitting()

        else:
            raise Exception("Strategy option not recognized")
# Example 3
    def __init__(self, config, *args, **kwargs):
        """
        Constructor of DataSet.

        Loads interaction data plus knowledge-graph side information
        (feature map and predicate mapping) according to
        ``config.data_config.strategy``:

        * ``fixed``:     pre-split train/(validation)/test files
        * ``hierarchy``: splits already materialized under a root folder
        * ``dataset``:   a single file, pre-filtered and split here

        :param config: experiment configuration namespace
        :param args: extra positional arguments, stored untouched
        :param kwargs: extra keyword arguments, stored untouched
        :raises Exception: if the strategy option is not recognized
        """
        self.logger = logging.get_logger(self.__class__.__name__)
        self.args = args
        self.kwargs = kwargs
        self.config = config
        self.column_names = ['userId', 'itemId', 'rating', 'timestamp']
        if config.config_test:
            # Configuration-only test run: skip all data loading.
            return

        self.side_information_data = SimpleNamespace()

        if config.data_config.strategy == "fixed":
            path_train_data = config.data_config.train_path
            path_val_data = getattr(config.data_config, "validation_path",
                                    None)
            path_test_data = config.data_config.test_path

            # NOTE(review): the original also read ``work_directory`` and
            # ``map`` from the side information but never used them; the
            # unused locals are dropped here.
            features_path = config.data_config.side_information.features
            predicates_path = config.data_config.side_information.predicates

            self.train_dataframe, self.side_information_data.feature_map, self.side_information_data.predicate_mapping = self.load_dataset_dataframe(
                path_train_data, predicates_path, features_path)

            self.train_dataframe = self.check_timestamp(self.train_dataframe)

            self.logger.info(f"{path_train_data} - Loaded")

            self.test_dataframe = pd.read_csv(path_test_data,
                                              sep="\t",
                                              header=None,
                                              names=self.column_names)

            self.test_dataframe = self.check_timestamp(self.test_dataframe)

            # Idiom fix: was ``config.binarize == True``.
            if config.binarize:
                self.test_dataframe["rating"] = 1
                self.train_dataframe["rating"] = 1

            if path_val_data:
                self.validation_dataframe = pd.read_csv(
                    path_val_data,
                    sep="\t",
                    header=None,
                    names=self.column_names)
                self.validation_dataframe = self.check_timestamp(
                    self.validation_dataframe)

                # BUG FIX: the validation split was loaded after binarization
                # and therefore never binarized, unlike train/test.
                if config.binarize:
                    self.validation_dataframe["rating"] = 1

                self.tuple_list = [([
                    (self.train_dataframe, self.validation_dataframe)
                ], self.test_dataframe)]
            else:
                self.tuple_list = [(self.train_dataframe, self.test_dataframe)]

        elif config.data_config.strategy == "hierarchy":
            item_mapping_path = getattr(config.data_config.side_information,
                                        "item_mapping", None)
            self.side_information_data.feature_map = self.load_attribute_file(
                item_mapping_path)

            self.tuple_list = self.read_splitting(
                config.data_config.root_folder)

            self.logger.info('{0} - Loaded'.format(
                config.data_config.root_folder))

        elif config.data_config.strategy == "dataset":
            self.logger.info("There will be the splitting")
            path_dataset = config.data_config.dataset_path

            features_path = config.data_config.side_information.features
            predicates_path = config.data_config.side_information.predicates

            self.dataframe, self.side_information_data.feature_map, self.side_information_data.predicate_mapping = self.load_dataset_dataframe(
                path_dataset, predicates_path, features_path)
            self.dataframe = self.check_timestamp(self.dataframe)

            self.logger.info('{0} - Loaded'.format(path_dataset))

            self.dataframe = PreFilter.filter(self.dataframe, self.config)

            # Idiom fix: was ``config.binarize == True``.
            if config.binarize:
                self.dataframe["rating"] = 1

            splitter = Splitter(self.dataframe, self.config.splitting)
            self.tuple_list = splitter.process_splitting()

        else:
            raise Exception("Strategy option not recognized")