def __init__(self,
                 split,
                 target_type='identity',
                 attr_index=None,
                 data_dir=dataset_dir,
                 transform=None):
        super(CelebADataset, self).__init__()

        # self.images, identities, attributes = load_from_pickle(os.path.join(data_dir, f'{split}.pkl'))
        # files = os.listdir(os.path.join(data_dir, f'{split}_part*.pkl'))
        files = glob.glob(os.path.join(data_dir, f'{split}_part*.pkl'))
        files = [
            os.path.join(data_dir, f'{split}_part{i}.pkl')
            for i in range(len(files) // 2)
        ]  # NOTE: TAKING HALF OF DATASET BECAUSE IT'S TOO BIG
        data = [load_from_pickle(f) for f in files]
        self.images, identities, attributes = (np.concatenate(d, axis=0)
                                               for d in zip(*data))
        self.images = self.images.transpose((0, 2, 3, 1))

        # assert target_type in ['identity', 'attr'], 'Expected argument `target_type` to be "identity" or "attr".'
        # self.targets = identities - 1 if target_type == 'identity' else attributes
        if target_type == 'identity':
            self.targets = identities - 1
        elif target_type == 'attr':
            assert attr_index is not None, 'Need to provide `attr_index` argument when `target_type`=="attr"'
            self.targets = attributes[:, attr_index]
        else:
            raise ValueError(
                'Expected argument `target_type` to be "identity" or "attr".')
        self.transform = transform
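All of these examples rely on small pickle helpers (load_from_pickle, save_to_pickle / dump_to_pickle, write_to_pickle) whose definitions are not shown. A minimal sketch, assuming they are thin wrappers around the standard pickle module (the two-argument utils.load_from_pickle / utils.write_to_pickle used further down presumably join a directory and a filename first):

import pickle


def load_from_pickle(path):
    # Assumed helper: read and return a pickled object from disk.
    with open(path, 'rb') as f:
        return pickle.load(f)


def save_to_pickle(obj, path):
    # Assumed helper: pickle an object to disk (dump_to_pickle is presumably the same idea).
    with open(path, 'wb') as f:
        pickle.dump(obj, f)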
def make_dataset(imgs_dir, annos_dir, pkl_path, imgs_set_path, labels="voc", show_dataset=False):
    """

    :param imgs_dir: str, directory containing the images
    :param annos_dir: str, directory containing the XML annotations
    :param pkl_path: str, path of the pickle cache holding (img_path_list, target_list)
    :param imgs_set_path: str, path of the image-set file listing the samples to use
    :param labels: str["coco", "voc"] or list[str] or Dict[str: int]; the mapping may be many-to-one
    :param show_dataset: bool, whether to visualize the dataset after building it
    :return: (img_path_list, target_list)
    """
    pkl_dir = os.path.dirname(pkl_path)
    os.makedirs(pkl_dir, exist_ok=True)
    xml_processor = XMLProcessor(imgs_dir, annos_dir, labels, imgs_set_path, True)
    if os.path.exists(pkl_path):
        img_path_list, target_list = load_from_pickle(pkl_path)
        xml_processor.img_path_list, xml_processor.target_list = img_path_list, target_list
        xml_processor.test_dataset()
    else:
        xml_processor.create_labels_cache()
        xml_processor.test_dataset()
        save_to_pickle((xml_processor.img_path_list, xml_processor.target_list), pkl_path)
        img_path_list, target_list = xml_processor.img_path_list, xml_processor.target_list
    if show_dataset:
        xml_processor.show_dataset()
    return img_path_list, target_list
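A hypothetical call, assuming a VOC-style directory layout (all paths below are placeholders):

img_paths, targets = make_dataset(
    imgs_dir='VOC2012/JPEGImages',                        # placeholder path
    annos_dir='VOC2012/Annotations',                      # placeholder path
    pkl_path='cache/voc_trainval.pkl',                    # cache written on first run
    imgs_set_path='VOC2012/ImageSets/Main/trainval.txt',  # placeholder path
    labels='voc',
    show_dataset=False)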
Example #3
def to_numpy_format(birds_200_2011_root_dir):
    birds_200_2011_output_dir = os.path.join(birds_200_2011_root_dir,
                                             'CUB_200_2011', 'parsed')
    in_pickle_path = os.path.join(birds_200_2011_output_dir, 'data.pkl')
    out_pickle_path = os.path.join(birds_200_2011_output_dir,
                                   'data_filtered.pkl')
    out_pickle_path2 = os.path.join(birds_200_2011_output_dir,
                                    'data_filtered_numpy.pkl')
    out_pickle_path3 = os.path.join(birds_200_2011_output_dir,
                                    'data_filtered_numpy_32_32_unnormed.pkl')
    out_pickle_path4_train = os.path.join(
        birds_200_2011_output_dir,
        'data_filtered_numpy_64_64_unnormed_train.pkl')
    out_pickle_path4_val = os.path.join(
        birds_200_2011_output_dir,
        'data_filtered_numpy_64_64_unnormed_val.pkl')

    images, targets = load_from_pickle(in_pickle_path)

    images_np = [(np.array(image), image, target)
                 for image, target in zip(images, targets)
                 if image.mode == 'RGB']

    images_stack = np.stack([i[0] for i in images_np])

    images_stack_normed = images_stack / 255
    rgb_mean = np.mean(images_stack_normed, axis=(0, 1, 2))
    rgb_std = np.std(images_stack_normed, axis=(0, 1, 2))
    print('RGB mean:', rgb_mean)
    print('RGB std:', rgb_std)

    dump_to_pickle([a[1:] for a in images_np], out_pickle_path)

    targets_ = [a[2] for a in images_np]
    dump_to_pickle((images_stack_normed, targets_), out_pickle_path2)

    images_stack = np.stack(
        [np.array(a[1].resize((32, 32))) for a in images_np])

    # images_stack_normed = images_stack / 255
    # rgb_mean = np.mean(images_stack, axis=(0, 1, 2))
    # rgb_std = np.std(images_stack, axis=(0, 1, 2))
    # print('RGB mean:', rgb_mean)
    # print('RGB std:', rgb_std)

    dump_to_pickle((images_stack, targets_), out_pickle_path3)

    images_stack = np.stack(
        [np.array(a[1].resize((64, 64))) for a in images_np])

    targets__ = np.stack(targets_)
    is_training_image = targets__[:, 3].astype(bool)  # np.bool is removed in recent NumPy; use the builtin bool
    train_set = images_stack[is_training_image]
    val_set = images_stack[~is_training_image]
    train_targets = targets__[is_training_image]
    val_targets = targets__[~is_training_image]

    dump_to_pickle((train_set, train_targets), out_pickle_path4_train)
    dump_to_pickle((val_set, val_targets), out_pickle_path4_val)
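A quick way to consume the written train split, reusing the same pickle helper (the shapes in the comments are what the code above implies, not verified output):

train_images, train_targets = load_from_pickle(out_pickle_path4_train)
# train_images: uint8 array of shape (N, 64, 64, 3), unnormalized
# train_targets: per-image metadata rows; column 3 is the train/val flag used for the split
print(train_images.shape, train_targets.shape)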
Example #4
def run_collaborative(properties, csvs, logger):
    """
    It processes the data to obtain the input vectors for the collaborative method and then uses them to create
    the collaborative model.

    Args
        properties (dict): dictionary containing all the properties loaded from the yaml file - here the models
        parameter is used
        csvs (dict): the datasets loaded from the csv files
        logger (Logger): the logger used to print info/debug messages
    """
    dp = CollaborativePreprocessing()
    logger.info("Creating input vectors for collaborative method")
    dp.preprocess(properties=properties, datasets=csvs, logger=logger)
    input_data = dp.users_ratings
    user_ids = dp.user_ids
    movie_ids = dp.movie_ids
    for model_name in properties["models"]["collaborative"]:
        logger.debug("Running model: {}".format(model_name))
        folder_name = CollaborativeModels.collaborative.value
        clustering = CollaborativeMethod()
        if not exists(
                join(
                    utils.app_dir,
                    properties["output_folder"], "results_{}_{}".format(
                        folder_name, properties["dataset"]))):
            mkdir(
                join(
                    utils.app_dir, properties["output_folder"],
                    "results_{}_{}".format(folder_name,
                                           properties["dataset"])))
        if exists(
                join(
                    utils.app_dir, properties["output_folder"],
                    "collaborative_user_predictions_{}.pickle".format(
                        properties["dataset"]))):
            users = utils.load_from_pickle(
                properties["output_folder"],
                "collaborative_user_predictions_{}.pickle".format(
                    properties["dataset"]))
        else:
            users = clustering.exec_collaborative_method(
                properties=properties,
                user_ratings=input_data,
                user_ids=user_ids,
                movie_ids=movie_ids,
                logger=logger)
        class_method = Classification.binary.value if properties["classification"] == "binary" else \
            Classification.multi.value
        clustering.calc_results(properties=properties,
                                users=users,
                                logger=logger,
                                classification=class_method)
Example #5
def count_instances_per_class(properties):
    classification = properties["classification"]
    output_folder = properties["output_folder"]
    dataset = properties["dataset"]
    ratings = utils.load_from_pickle(output_folder, "ratings.pickle_{}_{}".format(dataset, classification))
    if classification == Classification.binary.value:
        print("Get instances per class for binary classification")
        ratings_like = ratings[ratings == 0]
        ratings_dislike = ratings[ratings == 1]
        print("Like ratings: {}".format(ratings_like.shape))
        print("Dislike ratings: {}".format(ratings_dislike.shape))
    elif classification == Classification.multi.value:
        for i in range(1, 6):
            class_ratings = ratings[ratings == i]
            print("Ratings for class {} are {}".format(i, class_ratings.shape))
    def __init__(self,
                 is_train,
                 targets_type,
                 train_path=train_pickle_path,
                 val_path=val_pickle_path,
                 transform=None):
        super(Birds200_2011, self).__init__()
        self.pickle_file_path = train_path if is_train else val_path
        self.targets_type = targets_type
        self.transform = transform

        data, targets = load_from_pickle(self.pickle_file_path)
        if self.targets_type == 'attributes':
            self.attributes_targets = self._load_attributes(targets)
        else:
            assert self.targets_type == 'class', f"Unrecognized targets type {self.targets_type}"

        self.data = data
        self.targets = targets
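The rest of this Dataset class is not shown here; a plausible __len__/__getitem__ pair consistent with the attributes set in __init__ (a sketch, not the original implementation):

    def __len__(self):
        # One sample per image loaded from the pickle file.
        return len(self.data)

    def __getitem__(self, index):
        # Sketch: return the image and the target matching the configured targets_type.
        image = self.data[index]
        if self.targets_type == 'attributes':
            target = self.attributes_targets[index]
        else:
            target = self.targets[index]
        if self.transform is not None:
            image = self.transform(image)
        return image, target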
Example #7
    def preprocess(self, properties, datasets, logger, kind=PreprocessKind.train.value):
        """
            Checks if the input and the ratings files exist and, if so, loads them from the output folder. Otherwise,
            it takes the ratings, movies and tags datasets, converts them to dataframes and also loads the glove file.
            It iterates over the ratings dataframe, keeping from every row the movie id, user id and rating. It uses
            the functions preprocess_rating, preprocess_text and text_to_glove to create a vector corresponding to a
            movie's features and user id, with the user's id placed at the first position of that vector. Every vector
            is added to a list called input_data. Finally, the rating of every user for a particular movie is added to
            a list called ratings, and both this list and the input_data list are saved to the output folder.

            Args:
                properties (dict): properties loaded from the yaml file; used to get the output folder
                datasets (dict): contains the dataframes of all the movielens csvs
                logger (Logger): the logger to print messages
                kind (str): if set to train, the ratings.csv is used for the input vectors; otherwise the generated
                test_recommendation.csv is used
        """
        output_folder = properties["output_folder"]
        input_data_pickle_filename = self.input_data_pickle + "_{}_{}".format(properties["dataset"],
                                                                              properties["classification"])
        ratings_pickle_filename = self.ratings_pickle + "_{}_{}".format(properties["dataset"],
                                                                        properties["classification"])
        test_dataset_pickle_filename = self.test_dataset_pickle + "_{}_{}".format(properties["dataset"],
                                                                                  properties["classification"])

        if self.check_pickle_files_exist(properties=properties):
            logger.info("Content-based input data already exist and will be loaded from pickle file")
            input_filename = input_data_pickle_filename if kind == PreprocessKind.train.value else \
                test_dataset_pickle_filename
            self.input_data = utils.load_from_pickle(output_folder, input_filename)
            self.ratings = utils.load_from_pickle(output_folder, ratings_pickle_filename)
            logger.info("Loaded inputs of shape {}".format(self.input_data.shape))
            logger.info("Loaded ratings of shape {}".format(self.ratings.shape))
        else:
            ratings_df = datasets["ratings"] if kind == PreprocessKind.train.value else datasets["test_recommendation"]
            movies_df = datasets["movies"]
            tags_df = datasets["tags"]
            glove_df = utils.load_glove_file(properties=properties, logger=logger)
            users_dict_dummy = self.__create_dummy_variables(ratings=ratings_df)
            logger.info("Generating input vectors")
            self.input_data = []
            self.ratings = []
            for index, row in ratings_df.iterrows():
                user_id, movie_id, rating, _ = row
                movie_id = int(movie_id)
                user_id = int(user_id)
                logger.debug("Preprocessing userid {} and movieid {} with rating {}".format(user_id, movie_id, rating))
                # preprocess
                rating = self._preprocess_rating(properties, rating)
                logger.debug("Preprocessed rating: {}".format(rating))
                movie_text = self._preprocess_text(movies_df, tags_df, movie_id, user_id, logger)
                logger.debug("Preprocessed text: {}".format(" ".join(movie_text)))
                movie_vector = self._text_to_glove(properties, glove_df, movie_text)
                if movie_vector.size == 0:
                    continue
                movie_vector = np.concatenate((users_dict_dummy[user_id], movie_vector), axis=1)
                self.input_data.append(movie_vector)
                self.ratings.append(rating)
                utils.print_progress(self.ratings, logger=logger)

            self.ratings = np.asarray(self.ratings)
            self.input_data = np.concatenate(self.input_data)
            logger.info("Produced a feature matrix of shape {}".format(self.input_data.shape))
            # standardization
            logger.info("Standardize input vectors")
            self.input_data = preprocessing.scale(self.input_data)
            logger.info("Save input vectors to file")
            input_filename = input_data_pickle_filename if kind == PreprocessKind.train.value else \
                test_dataset_pickle_filename
            utils.write_to_pickle(obj=self.input_data, directory=output_folder, filename=input_filename)
            utils.write_to_pickle(obj=self.ratings, directory=output_folder, filename=ratings_pickle_filename)
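The helper _text_to_glove is not shown in these examples. A rough, hypothetical stand-in that averages the GloVe vectors of the words and returns a 1×d row (so that the axis=1 concatenation above works), assuming glove_df has the word in its first column and the embedding in the remaining columns:

def text_to_glove_sketch(glove_df, movie_text):
    # Hypothetical stand-in for ContentBasedPreprocessing._text_to_glove.
    vectors = []
    for word in movie_text:
        match = glove_df[glove_df.iloc[:, 0] == word]
        if not match.empty:
            vectors.append(match.iloc[0, 1:].to_numpy(dtype=np.float64))
    if not vectors:
        # The caller skips movies whose vector is empty (movie_vector.size == 0).
        return np.array([])
    return np.mean(vectors, axis=0).reshape(1, -1)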
    def preprocess(self,
                   properties,
                   datasets,
                   logger,
                   kind=PreprocessKind.train.value):
        """
        Initially, checks if the ratings list exists in the output folder and, if so, loads it. Otherwise, it takes
        from the ratings dataset the ratings of the users, gets the movie names from the movies dataset and creates a
        list with the movie ids. Then, within a for loop, it iterates over the ratings dataframe and for each user
        keeps track of the ratings they gave to every movie. If the user didn't rate a movie, the algorithm puts a
        zero in the corresponding position of the vector. After finishing this process for every user, it returns the
        users' vectors as a list called user_ratings and writes it to the output folder as a pickle file.

        Args
            properties (dict): dictionary with the loaded properties from the yaml file
            datasets (dict): the datasets' dictionary which was created from the read_csv function
            logger (Logger): the logger to print messages
            kind (str): if set to train, the ratings.csv is used; otherwise the generated test_recommendation.csv is used

        """
        output_folder = properties["output_folder"]
        users_ratings_pickle_filename = self.users_ratings_pickle + "_{}".format(
            properties["dataset"])
        users_ids_pickle_filename = self.users_ids_pickle + "_{}".format(
            properties["dataset"])
        movie_ids_pickle_filename = self.movie_ids_pickle + "_{}".format(
            properties["dataset"])
        test_dataset_pickle_filename = self.test_dataset_pickle + "_{}".format(
            properties["dataset"])

        if utils.check_file_exists(output_folder,
                                   users_ratings_pickle_filename):
            logger.info(
                "Collaborative input vectors already exist and will be loaded from pickle file"
            )
            input_filename = users_ratings_pickle_filename if kind == PreprocessKind.train.value else \
                test_dataset_pickle_filename
            self.users_ratings = utils.load_from_pickle(
                output_folder, input_filename)
            self.user_ids = utils.load_from_pickle(output_folder,
                                                   users_ids_pickle_filename)
            self.movie_ids = utils.load_from_pickle(output_folder,
                                                    movie_ids_pickle_filename)
            logger.info("Loaded user ratings of shape {}".format(
                self.users_ratings.shape))
        else:
            os.makedirs(output_folder, exist_ok=True)
            ratings_df = datasets[
                "ratings"] if kind == PreprocessKind.train.value else datasets[
                    "test_recommendation"]
            movies_df = datasets["movies"]
            self.users_ratings = []
            self.user_ids = []
            self.movie_ids = movies_df["movieId"].values.tolist()
            logger.info("Generating input vectors")
            for _, row in ratings_df.iterrows():
                user_id = row["userId"]
                if user_id not in self.user_ids:
                    self.user_ids.append(user_id)
                    user_ratings = ratings_df[ratings_df["userId"] == user_id]
                    user_vector = []
                    for movie_id in self.movie_ids:
                        rating_row = user_ratings[user_ratings["movieId"] ==
                                                  movie_id]
                        if not rating_row.empty:
                            rating_row = rating_row["rating"].values.tolist()
                            user_vector.append(rating_row[0])
                        else:
                            user_vector.append(0.0)
                    user_vector = np.array(user_vector)
                    self.users_ratings.append(user_vector)
                utils.print_progress(self.users_ratings, logger=logger)
            logger.info("Writing input vectors into pickle file")
            self.users_ratings = np.array(self.users_ratings)
            self.user_ids = np.asarray(self.user_ids)
            self.movie_ids = np.asarray(self.movie_ids)
            input_filename = users_ratings_pickle_filename if kind == PreprocessKind.train.value else \
                test_dataset_pickle_filename
            utils.write_to_pickle(self.users_ratings, output_folder,
                                  input_filename)
            utils.write_to_pickle(self.user_ids, output_folder,
                                  users_ids_pickle_filename)
            utils.write_to_pickle(self.movie_ids, output_folder,
                                  movie_ids_pickle_filename)
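The nested loop above scales with users × movies, since every new user triggers a scan over all movie ids. A more compact way to build the same user-by-movie rating matrix is a pandas pivot; a sketch of that alternative (not what the project does), with missing ratings filled with 0.0 as above:

def build_user_movie_matrix(ratings_df, movies_df):
    # Rows: users, columns: every movieId from movies.csv, unrated entries -> 0.0.
    # Note: duplicate (user, movie) pairs are averaged here and user ids come out sorted,
    # whereas the loop above keeps the first rating and preserves encounter order.
    matrix = ratings_df.pivot_table(index="userId", columns="movieId", values="rating")
    matrix = matrix.reindex(columns=movies_df["movieId"].values).fillna(0.0)
    return matrix.index.to_numpy(), matrix.columns.to_numpy(), matrix.to_numpy()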
Example #9
def run_test(properties, csvs, logger):
    """
    Method to run the recommendation system using the best models produced for the content-based method.
    Uses the test_recommendation.csv file where no rating is available.

    Args
        properties (dict): the loaded configuration file
        csvs (dict): the DataFrames from the input csv files
        logger (Logger): a Logger object to print info/error messages
    """
    # preprocess with test recommendation csv
    logger.info("Testing the recommendation system")
    content_based_results = join(utils.app_dir, properties["output_folder"],
                                 "test_results", "content-based")
    collaborative_results = join(utils.app_dir, properties["output_folder"],
                                 "test_results", "collaborative")
    if not exists(content_based_results):
        mkdir(content_based_results)
    if not exists(collaborative_results):
        mkdir(collaborative_results)
        pearson_dir = join(utils.app_dir, properties["output_folder"],
                           "results_pearson_{}".format(properties["dataset"]))
        for file in listdir(pearson_dir):
            if file.startswith("Predictions"):
                copyfile(join(pearson_dir, file),
                         join(collaborative_results, file))
    content_based_files = listdir(content_based_results)
    if not content_based_files or len(content_based_files) != 3:
        dp = ContentBasedPreprocessing()
        logger.info("Creating input vectors for content-based method")
        test_recommendation_df = csvs["test_recommendation"]
        test_recommendation_df.loc[:, "rating"] = 0.0
        csvs["test_recommendation"] = test_recommendation_df
        dp.preprocess(properties=properties,
                      datasets=csvs,
                      logger=logger,
                      kind=PreprocessKind.recommend.value)
        input_data = dp.input_data
        ratings = dp.ratings
        for model in properties["models"]["content-based"]:
            logger.info("Testing model: {}".format(model))
            classifier = init_content_based_model(model)
            directory = join("output", "best_models")
            filename = "best_model_{}_{}.pickle".format(
                model, properties["dataset"])
            classifier.best_model = utils.load_from_pickle(directory=directory,
                                                           file=filename)
            true_labels, predictions = classifier.test(
                input_data, ratings, kind=MetricKind.test.value)
            predicted_labels, probabilities = classifier.get_predicted_labels_and_probabilities(
                properties=properties, predictions=predictions)
            dataset_folder = Datasets.ml_latest_small.value if properties["dataset"] == Datasets.small.value \
                else Datasets.ml_latest.value
            test_csv_path = join(utils.app_dir, properties["datasets_folder"],
                                 dataset_folder, "test_recommendation.csv")
            df = pd.read_csv(test_csv_path)
            df["rating"] = predicted_labels
            df.insert(loc=4, column='probability', value=probabilities)
            logger.info("Writing results to file")
            new_csv = join(content_based_results,
                           "test_recommendation_{}.csv".format(model))
            df.to_csv(new_csv, sep=",")
    qualitative_collaborative(properties=properties,
                              logger=logger,
                              directory=collaborative_results)
    qualitative_content_based(properties=properties,
                              logger=logger,
                              directory=content_based_results)
Example #10
def qualitative_content_based(properties, logger, directory):
    """
    It creates a list with the top-n movies recommended to a user by the content-based classifiers and checks which of
    them share common genres with the user's actual top-rated movies. A recommended movie is accepted based on a
    threshold on how many times a specific genre appears in the actually rated movies. Finally, it calculates the
    recommendation accuracy of the classifiers.

    Args
        properties (dict): dataset, datasets_folder, models, output_folder, qualitative
        logger (Logger): handles the logs
        directory (str): the path where the csv files with the recommendation results are stored

    """
    dataset = Datasets.ml_latest_small.value if properties["dataset"] == Datasets.small.value else \
        Datasets.ml_latest.value
    dataset_path = join(utils.app_dir, properties["datasets_folder"], dataset)
    movie_df = pd.read_csv(join(dataset_path, "movies.csv"))
    ratings_df = pd.read_csv(join(dataset_path, "ratings.csv"))
    for model in properties["models"]["content-based"]:
        filename = join(directory, "test_recommendation_{}.csv".format(model))
        df = pd.read_csv(filename)
        del df['Unnamed: 0']
        user_ids = utils.load_from_pickle(
            properties["output_folder"],
            "user_ids.pickle_{}".format(properties["dataset"]))
        users_accuracies = {}
        user_accepted_movies = {}
        for userid in user_ids:
            user_predictions_df = df[(df['userId'] == userid)
                                     & (df['rating'] == 0)]
            user_true_df = ratings_df[(ratings_df['userId'] == userid)
                                      & (ratings_df['rating'] > 3)]
            true_movies = list(user_true_df['movieId'])
            true_movies_genres = {}
            for true_movie_id in true_movies:
                movie_line = movie_df[movie_df['movieId'] == true_movie_id]
                genres = movie_line.iloc[0]["genres"].split("|")
                for genre in genres:
                    if genre not in true_movies_genres.keys():
                        true_movies_genres[genre] = 0
                    true_movies_genres[genre] += 1
            # sort_values returns a new DataFrame; keep it, highest probability first (assumed ordering for top-n)
            user_predictions_df = user_predictions_df.sort_values(
                'probability', ascending=False)
            recommend_movies = list(user_predictions_df['movieId'])
            if len(recommend_movies) > properties["qualitative"]["top_num"]:
                recommend_movies = recommend_movies[:properties["qualitative"]["top_num"]]
            accept = []
            if recommend_movies:
                for recom_movieid in recommend_movies:
                    movie_line = movie_df[movie_df['movieId'] == recom_movieid]
                    genres = movie_line.iloc[0]["genres"].split("|")
                    for genre in genres:
                        if genre in true_movies_genres.keys():
                            if true_movies_genres[genre] >= properties["qualitative"]["threshold"] and \
                                    recom_movieid not in accept:
                                accept.append(recom_movieid)

                user_accepted_movies[userid] = accept
                users_accuracies[userid] = len(accept) / len(recommend_movies)
                accepted_movies = " ".join(str(x)
                                           for x in accept) if accept else ""
                logger.debug("User with id {} has accepted movies: {}".format(
                    userid, accepted_movies))
                logger.debug("Accuracy for user with id {} is {}".format(
                    userid, users_accuracies[userid]))
        model_sum = 0
        model_count = 0
        for k, v in users_accuracies.items():
            # v is the accuracy computed for user k
            model_sum += v
            model_count += 1
        model_avg_accuracy = model_sum / model_count
        logger.info("Model's {} accuracy: {}".format(model,
                                                     model_avg_accuracy))