def __init__(self, split, target_type='identity', attr_index=None, data_dir=dataset_dir, transform=None):
    super(CelebADataset, self).__init__()
    # self.images, identities, attributes = load_from_pickle(os.path.join(data_dir, f'{split}.pkl'))
    # files = os.listdir(os.path.join(data_dir, f'{split}_part*.pkl'))
    files = glob.glob(os.path.join(data_dir, f'{split}_part*.pkl'))
    # NOTE: taking half of the dataset because it's too big
    files = [os.path.join(data_dir, f'{split}_part{i}.pkl') for i in range(len(files) // 2)]
    data = [load_from_pickle(f) for f in files]
    self.images, identities, attributes = (np.concatenate(d, axis=0) for d in zip(*data))
    self.images = self.images.transpose((0, 2, 3, 1))
    # assert target_type in ['identity', 'attr'], 'Expected argument `target_type` to be "identity" or "attr".'
    # self.targets = identities - 1 if target_type == 'identity' else attributes
    if target_type == 'identity':
        self.targets = identities - 1
    elif target_type == 'attr':
        assert attr_index is not None, 'Need to provide `attr_index` argument when `target_type` == "attr"'
        self.targets = attributes[:, attr_index]
    else:
        raise ValueError('Expected argument `target_type` to be "identity" or "attr".')
    self.transform = transform
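
# Illustrative usage sketch (not from the source): assumes the __init__ above belongs to the
# CelebADataset class (a torch.utils.data.Dataset whose __len__/__getitem__ live elsewhere) and
# that `dataset_dir` holds the pickled parts named '{split}_part{i}.pkl'. The attribute index and
# transform choices are assumptions for demonstration only.
def _example_celeba_usage():
    from torch.utils.data import DataLoader
    import torchvision.transforms as T

    # Identity targets: identity labels are shifted to start at 0 (identities - 1 above).
    id_set = CelebADataset(split='train', target_type='identity', transform=T.ToTensor())
    # Single-attribute targets: attr_index selects one column of the attribute matrix
    # (e.g. 20, the 'Male' attribute in the standard CelebA ordering, assumed here).
    attr_set = CelebADataset(split='train', target_type='attr', attr_index=20, transform=T.ToTensor())
    loader = DataLoader(attr_set, batch_size=64, shuffle=True)
    return id_set, loader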
def make_dataset(imgs_dir, annos_dir, pkl_path, imgs_set_path, labels="voc", show_dataset=False):
    """
    :param imgs_dir: str
    :param annos_dir: str
    :param pkl_path: str
    :param imgs_set_path: str
    :param labels: str ("coco" or "voc"), list[str], or Dict[str, int]; a many-to-one mapping of
        label names to indices is allowed
    :param show_dataset: bool
    :return: (img_path_list, target_list)
    """
    pkl_dir = os.path.dirname(pkl_path)
    os.makedirs(pkl_dir, exist_ok=True)
    xml_processor = XMLProcessor(imgs_dir, annos_dir, labels, imgs_set_path, True)
    if os.path.exists(pkl_path):
        # reuse the cached labels
        img_path_list, target_list = load_from_pickle(pkl_path)
        xml_processor.img_path_list, xml_processor.target_list = img_path_list, target_list
        xml_processor.test_dataset()
    else:
        # parse the XML annotations and cache the result
        xml_processor.create_labels_cache()
        xml_processor.test_dataset()
        save_to_pickle((xml_processor.img_path_list, xml_processor.target_list), pkl_path)
        img_path_list, target_list = xml_processor.img_path_list, xml_processor.target_list
    if show_dataset:
        xml_processor.show_dataset()
    return img_path_list, target_list
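
# Illustrative usage sketch (not from the source): a VOC-style directory layout is assumed and the
# paths below are placeholders; XMLProcessor, load_from_pickle and save_to_pickle come from this
# repo's own modules.
def _example_make_dataset():
    img_paths, targets = make_dataset(
        imgs_dir="VOCdevkit/VOC2012/JPEGImages",
        annos_dir="VOCdevkit/VOC2012/Annotations",
        pkl_path="cache/voc_trainval.pkl",  # cache file: created on the first run, reused afterwards
        imgs_set_path="VOCdevkit/VOC2012/ImageSets/Main/trainval.txt",
        labels="voc",
        show_dataset=False)
    return img_paths, targets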
def to_numpy_format(birds_200_2011_root_dir):
    birds_200_2011_output_dir = os.path.join(birds_200_2011_root_dir, r'CUB_200_2011\parsed')
    in_pickle_path = os.path.join(birds_200_2011_output_dir, 'data.pkl')
    out_pickle_path = os.path.join(birds_200_2011_output_dir, 'data_filtered.pkl')
    out_pickle_path2 = os.path.join(birds_200_2011_output_dir, 'data_filtered_numpy.pkl')
    out_pickle_path3 = os.path.join(birds_200_2011_output_dir, 'data_filtered_numpy_32_32_unnormed.pkl')
    out_pickle_path4_train = os.path.join(birds_200_2011_output_dir, 'data_filtered_numpy_64_64_unnormed_train.pkl')
    out_pickle_path4_val = os.path.join(birds_200_2011_output_dir, 'data_filtered_numpy_64_64_unnormed_val.pkl')

    images, targets = load_from_pickle(in_pickle_path)
    # Keep only RGB images; grayscale samples are dropped.
    images_np = [(np.array(image), image, target)
                 for image, target in zip(images, targets) if image.mode == 'RGB']
    images_stack = np.stack([i[0] for i in images_np])
    images_stack_normed = images_stack / 255
    rgb_mean = np.mean(images_stack_normed, axis=(0, 1, 2))
    rgb_std = np.std(images_stack_normed, axis=(0, 1, 2))
    print('RGB mean:', rgb_mean)
    print('RGB std:', rgb_std)

    # Filtered (PIL image, target) pairs and the normalized full-size stack.
    dump_to_pickle([a[1:] for a in images_np], out_pickle_path)
    targets_ = [a[2] for a in images_np]
    dump_to_pickle((images_stack_normed, targets_), out_pickle_path2)

    # Unnormalized 32x32 version.
    images_stack = np.stack([np.array(a[1].resize((32, 32))) for a in images_np])
    # images_stack_normed = images_stack / 255
    # rgb_mean = np.mean(images_stack, axis=(0, 1, 2))
    # rgb_std = np.std(images_stack, axis=(0, 1, 2))
    # print('RGB mean:', rgb_mean)
    # print('RGB std:', rgb_std)
    dump_to_pickle((images_stack, targets_), out_pickle_path3)

    # Unnormalized 64x64 version, split into train/val by the dataset's train/test flag.
    images_stack = np.stack([np.array(a[1].resize((64, 64))) for a in images_np])
    targets__ = np.stack(targets_)
    is_training_image = targets__[:, 3].astype(bool)  # np.bool is removed in recent NumPy versions
    train_set = images_stack[is_training_image]
    val_set = images_stack[~is_training_image]
    train_targets = targets__[is_training_image]
    val_targets = targets__[~is_training_image]
    dump_to_pickle((train_set, train_targets), out_pickle_path4_train)
    dump_to_pickle((val_set, val_targets), out_pickle_path4_val)
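
# Illustrative usage sketch (not from the source): the root directory is a placeholder and should
# contain CUB_200_2011\parsed\data.pkl produced by the parsing step this repo assumes to have run first.
def _example_to_numpy_format():
    to_numpy_format(r'D:\datasets')  # writes the filtered/resized pickles next to data.pkl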
def run_collaborative(properties, csvs, logger):
    """
    Processes the data to obtain the input vectors for the collaborative method and then uses the
    input data to create the model for the collaborative method.

    Args
        properties (dict): dictionary containing all the properties loaded from the yaml file -
            here the models parameter is used
        csvs (dict): the datasets loaded from the csv files
        logger (Logger): the logger used to print info/debug messages
    """
    dp = CollaborativePreprocessing()
    logger.info("Creating input vectors for collaborative method")
    dp.preprocess(properties=properties, datasets=csvs, logger=logger)
    input_data = dp.users_ratings
    user_ids = dp.user_ids
    movie_ids = dp.movie_ids
    for model_name in properties["models"]["collaborative"]:
        logger.debug("Running model: {}".format(model_name))
        folder_name = CollaborativeModels.collaborative.value
        clustering = CollaborativeMethod()
        results_dir = join(utils.app_dir, properties["output_folder"],
                           "results_{}_{}".format(folder_name, properties["dataset"]))
        if not exists(results_dir):
            mkdir(results_dir)
        predictions_pickle = "collaborative_user_predictions_{}.pickle".format(properties["dataset"])
        if exists(join(utils.app_dir, properties["output_folder"], predictions_pickle)):
            users = utils.load_from_pickle(properties["output_folder"], predictions_pickle)
        else:
            users = clustering.exec_collaborative_method(properties=properties, user_ratings=input_data,
                                                         user_ids=user_ids, movie_ids=movie_ids, logger=logger)
        class_method = Classification.binary.value if properties["classification"] == "binary" else \
            Classification.multi.value
        clustering.calc_results(properties=properties, users=users, logger=logger, classification=class_method)
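
# Illustrative usage sketch (not from the source): the properties dict, model name and csv paths
# below are assumptions; in the project these normally come from the yaml configuration and the
# MovieLens dataset folder.
def _example_run_collaborative():
    import logging
    import pandas as pd

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("recommender")
    properties = {"output_folder": "output", "dataset": "small", "classification": "binary",
                  "models": {"collaborative": ["kmeans"]}}  # model name is illustrative
    csvs = {"ratings": pd.read_csv("datasets/ml-latest-small/ratings.csv"),
            "movies": pd.read_csv("datasets/ml-latest-small/movies.csv"),
            "tags": pd.read_csv("datasets/ml-latest-small/tags.csv")}
    run_collaborative(properties=properties, csvs=csvs, logger=logger)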
def count_instances_per_class(properties):
    classification = properties["classification"]
    output_folder = properties["output_folder"]
    dataset = properties["dataset"]
    ratings = utils.load_from_pickle(output_folder, "ratings.pickle_{}_{}".format(dataset, classification))
    if classification == Classification.binary.value:
        print("Get instances per class for binary classification")
        ratings_like = ratings[ratings == 0]
        ratings_dislike = ratings[ratings == 1]
        print("Like ratings: {}".format(ratings_like.shape))
        print("Dislike ratings: {}".format(ratings_dislike.shape))
    elif classification == Classification.multi.value:
        for i in range(1, 6):
            class_ratings = ratings[ratings == i]
            print("Ratings for class {} are {}".format(i, class_ratings.shape))
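
# Illustrative usage sketch (not from the source): assumes the ratings pickle
# (e.g. 'ratings.pickle_small_binary') was already written by the content-based preprocessing step;
# the property values are placeholders.
def _example_count_instances():
    count_instances_per_class({"classification": "binary", "output_folder": "output", "dataset": "small"})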
def __init__(self, is_train, targets_type, train_path=train_pickle_path, val_path=val_pickle_path, transform=None):
    super(Birds200_2011, self).__init__()
    self.pickle_file_path = train_path if is_train else val_path
    self.targets_type = targets_type
    self.transform = transform
    data, targets = load_from_pickle(self.pickle_file_path)
    if self.targets_type == 'attributes':
        self.attributes_targets = self._load_attributes(targets)
    else:
        assert self.targets_type == 'class', f"Unrecognized targets type {self.targets_type}"
    self.data = data
    self.targets = targets
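
# Illustrative usage sketch (not from the source): assumes Birds200_2011 is a torch.utils.data.Dataset
# and that train_pickle_path/val_pickle_path point at the 64x64 train/val pickles written by
# to_numpy_format above; batch sizes and transforms are placeholders.
def _example_birds_usage():
    from torch.utils.data import DataLoader
    import torchvision.transforms as T

    train_set = Birds200_2011(is_train=True, targets_type='class', transform=T.ToTensor())
    val_set = Birds200_2011(is_train=False, targets_type='class', transform=T.ToTensor())
    return DataLoader(train_set, batch_size=32, shuffle=True), DataLoader(val_set, batch_size=32)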
def preprocess(self, properties, datasets, logger, kind=PreprocessKind.train.value):
    """
    Checks if the input and rating files exist and, if so, loads them from the output folder.
    Otherwise, it takes the ratings, movies and tags datasets, converts them to dataframes and loads
    the glove file. It iterates over the ratings dataframe, keeping from every row the movie id, the
    user id and the rating. It uses the functions preprocess_rating, preprocess_text and text_to_glove
    to create a vector corresponding to a movie's features, with the user's id added at the first
    position of that vector. Every vector is added to a list called input_data. Finally, the rating of
    every user for a particular movie is added to a list called ratings, and both this list and the
    input_data list are saved to the output folder.

    Args:
        properties (dict): properties loaded from the yaml file. Used to get the output folder
        datasets (dict): contains the dataframes of all the movielens csvs
        logger (Logger): the logger to print messages
        kind (str): if set to train, ratings.csv is used to build the input vectors; otherwise the
            generated test_recommendation.csv is used
    """
    output_folder = properties["output_folder"]
    input_data_pickle_filename = self.input_data_pickle + "_{}_{}".format(properties["dataset"],
                                                                          properties["classification"])
    ratings_pickle_filename = self.ratings_pickle + "_{}_{}".format(properties["dataset"],
                                                                    properties["classification"])
    test_dataset_pickle_filename = self.test_dataset_pickle + "_{}_{}".format(properties["dataset"],
                                                                              properties["classification"])
    if self.check_pickle_files_exist(properties=properties):
        logger.info("Content-based input data already exist and will be loaded from pickle file")
        input_filename = input_data_pickle_filename if kind == PreprocessKind.train.value else \
            test_dataset_pickle_filename
        self.input_data = utils.load_from_pickle(output_folder, input_filename)
        self.ratings = utils.load_from_pickle(output_folder, ratings_pickle_filename)
        logger.info("Loaded inputs of shape {}".format(self.input_data.shape))
        logger.info("Loaded ratings of shape {}".format(self.ratings.shape))
    else:
        ratings_df = datasets["ratings"] if kind == PreprocessKind.train.value else datasets["test_recommendation"]
        movies_df = datasets["movies"]
        tags_df = datasets["tags"]
        glove_df = utils.load_glove_file(properties=properties, logger=logger)
        users_dict_dummy = self.__create_dummy_variables(ratings=ratings_df)
        logger.info("Generating input vectors")
        self.input_data = []
        self.ratings = []
        for index, row in ratings_df.iterrows():
            user_id, movie_id, rating, _ = row
            movie_id = int(movie_id)
            user_id = int(user_id)
            logger.debug("Preprocessing userid {} and movieid {} with rating {}".format(user_id, movie_id, rating))
            # preprocess
            rating = self._preprocess_rating(properties, rating)
            logger.debug("Preprocessed rating: {}".format(rating))
            movie_text = self._preprocess_text(movies_df, tags_df, movie_id, user_id, logger)
            logger.debug("Preprocessed text: {}".format(" ".join(movie_text)))
            movie_vector = self._text_to_glove(properties, glove_df, movie_text)
            if movie_vector.size == 0:
                continue
            movie_vector = np.concatenate((users_dict_dummy[user_id], movie_vector), axis=1)
            self.input_data.append(movie_vector)
            self.ratings.append(rating)
            utils.print_progress(self.ratings, logger=logger)
        self.ratings = np.asarray(self.ratings)
        self.input_data = np.concatenate(self.input_data)
        logger.info("Produced a feature matrix of shape {}".format(self.input_data.shape))
        # standardization
        logger.info("Standardize input vectors")
        self.input_data = preprocessing.scale(self.input_data)
        logger.info("Save input vectors to file")
        input_filename = input_data_pickle_filename if kind == PreprocessKind.train.value else \
            test_dataset_pickle_filename
        utils.write_to_pickle(obj=self.input_data, directory=output_folder, filename=input_filename)
        utils.write_to_pickle(obj=self.ratings, directory=output_folder, filename=ratings_pickle_filename)
def preprocess(self, properties, datasets, logger, kind=PreprocessKind.train.value):
    """
    Initially, checks if the ratings list exists in the output folder and, if so, loads it. Otherwise,
    it takes the users' ratings from the ratings dataset and the movie names from the movies dataset,
    and creates a list with the movie ids. It then iterates over the ratings dataframe and, for each
    user, keeps track of the rating given to every movie; if the user did not rate a movie, a zero is
    put at the corresponding position of the vector. After finishing this process for every user, it
    stores the users' vectors in a list called users_ratings and writes it to the output folder as a
    pickle file.

    Args
        properties (dict): dictionary with the loaded properties from the yaml file
        datasets (dict): the datasets' dictionary which was created from the read_csv function
        logger (Logger): the logger to print messages
        kind (str): if set to train the ratings.csv is used, otherwise the generated test_recommendation.csv
    """
    output_folder = properties["output_folder"]
    users_ratings_pickle_filename = self.users_ratings_pickle + "_{}".format(properties["dataset"])
    users_ids_pickle_filename = self.users_ids_pickle + "_{}".format(properties["dataset"])
    movie_ids_pickle_filename = self.movie_ids_pickle + "_{}".format(properties["dataset"])
    test_dataset_pickle_filename = self.test_dataset_pickle + "_{}".format(properties["dataset"])
    if utils.check_file_exists(output_folder, users_ratings_pickle_filename):
        logger.info("Collaborative input vectors already exist and will be loaded from pickle file")
        input_filename = users_ratings_pickle_filename if kind == PreprocessKind.train.value else \
            test_dataset_pickle_filename
        self.users_ratings = utils.load_from_pickle(output_folder, input_filename)
        self.user_ids = utils.load_from_pickle(output_folder, users_ids_pickle_filename)
        self.movie_ids = utils.load_from_pickle(output_folder, movie_ids_pickle_filename)
        logger.info("Loaded user ratings of shape {}".format(self.users_ratings.shape))
    else:
        os.makedirs(output_folder, exist_ok=True)
        ratings_df = datasets["ratings"] if kind == PreprocessKind.train.value else datasets["test_recommendation"]
        movies_df = datasets["movies"]
        self.users_ratings = []
        self.user_ids = []
        self.movie_ids = movies_df["movieId"].values.tolist()
        logger.info("Generating input vectors")
        for _, row in ratings_df.iterrows():
            user_id = row["userId"]
            if user_id not in self.user_ids:
                self.user_ids.append(user_id)
                user_ratings = ratings_df[ratings_df["userId"] == user_id]
                user_vector = []
                for movie_id in self.movie_ids:
                    rating_row = user_ratings[user_ratings["movieId"] == movie_id]
                    if not rating_row.empty:
                        rating_row = rating_row["rating"].values.tolist()
                        user_vector.append(rating_row[0])
                    else:
                        user_vector.append(0.0)
                user_vector = np.array(user_vector)
                self.users_ratings.append(user_vector)
                utils.print_progress(self.users_ratings, logger=logger)
        logger.info("Writing input vectors into pickle file")
        self.users_ratings = np.array(self.users_ratings)
        self.user_ids = np.asarray(self.user_ids)
        self.movie_ids = np.asarray(self.movie_ids)
        input_filename = users_ratings_pickle_filename if kind == PreprocessKind.train.value else \
            test_dataset_pickle_filename
        utils.write_to_pickle(self.users_ratings, output_folder, input_filename)
        utils.write_to_pickle(self.user_ids, output_folder, users_ids_pickle_filename)
        utils.write_to_pickle(self.movie_ids, output_folder, movie_ids_pickle_filename)
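
# Illustrative usage sketch (not from the source): CollaborativePreprocessing is the class this
# preprocess method belongs to (see run_collaborative above). The resulting users_ratings matrix has
# one row per user and one column per movie id, with 0.0 for unrated movies.
def _example_collaborative_preprocess(properties, csvs, logger):
    dp = CollaborativePreprocessing()
    dp.preprocess(properties=properties, datasets=csvs, logger=logger)
    logger.info("User-movie matrix: {} users x {} movies".format(len(dp.user_ids), len(dp.movie_ids)))
    return dp.users_ratings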
def run_test(properties, csvs, logger):
    """
    Method to run the recommendation system using the best produced models for the content-based
    method. Uses the test_recommendation.csv file where no rating is available.

    Args
        properties (dict): the loaded configuration file
        csvs (dict): the DataFrames from the input csv files
        logger (Logger): a Logger object to print info/error messages
    """
    # preprocess with test recommendation csv
    logger.info("Testing the recommendation system")
    content_based_results = join(utils.app_dir, properties["output_folder"], "test_results", "content-based")
    collaborative_results = join(utils.app_dir, properties["output_folder"], "test_results", "collaborative")
    if not exists(content_based_results):
        mkdir(content_based_results)
    if not exists(collaborative_results):
        mkdir(collaborative_results)
    pearson_dir = join(utils.app_dir, properties["output_folder"],
                       "results_pearson_{}".format(properties["dataset"]))
    for file in listdir(pearson_dir):
        if file.startswith("Predictions"):
            copyfile(join(pearson_dir, file), join(collaborative_results, file))
    content_based_files = listdir(content_based_results)
    if not content_based_files or len(content_based_files) != 3:
        dp = ContentBasedPreprocessing()
        logger.info("Creating input vectors for content-based method")
        test_recommendation_df = csvs["test_recommendation"]
        test_recommendation_df.loc[:, "rating"] = 0.0
        csvs["test_recommendation"] = test_recommendation_df
        dp.preprocess(properties=properties, datasets=csvs, logger=logger, kind=PreprocessKind.recommend.value)
        input_data = dp.input_data
        ratings = dp.ratings
        for model in properties["models"]["content-based"]:
            logger.info("Testing model: {}".format(model))
            classifier = init_content_based_model(model)
            directory = join("output", "best_models")
            filename = "best_model_{}_{}.pickle".format(model, properties["dataset"])
            classifier.best_model = utils.load_from_pickle(directory=directory, file=filename)
            true_labels, predictions = classifier.test(input_data, ratings, kind=MetricKind.test.value)
            predicted_labels, probabilities = classifier.get_predicted_labels_and_probabilities(
                properties=properties, predictions=predictions)
            dataset_folder = Datasets.ml_latest_small.value if properties["dataset"] == Datasets.small.value \
                else Datasets.ml_latest.value
            test_csv_path = join(utils.app_dir, properties["datasets_folder"], dataset_folder,
                                 "test_recommendation.csv")
            df = pd.read_csv(test_csv_path)
            df["rating"] = predicted_labels
            df.insert(loc=4, column='probability', value=probabilities)
            logger.info("Writing results to file")
            new_csv = join(content_based_results, "test_recommendation_{}.csv".format(model))
            df.to_csv(new_csv, sep=",")
    qualitative_collaborative(properties=properties, logger=logger, directory=collaborative_results)
    qualitative_content_based(properties=properties, logger=logger, directory=content_based_results)
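
# Illustrative usage sketch (not from the source): the wiring below is an assumption; in the project
# these objects normally come from a main script that parses the yaml properties and reads the
# MovieLens csv files.
def _example_run_test(properties, csvs, logger):
    # Expects best_model_{model}_{dataset}.pickle files under output/best_models and the
    # Pearson prediction files under results_pearson_{dataset} to already exist.
    run_test(properties=properties, csvs=csvs, logger=logger)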
def qualitative_content_based(properties, logger, directory):
    """
    Creates a list with the top-n movies recommended to a user by the content-based classifiers and
    checks which of them share common genres with the user's actual top-rated movies. A recommended
    movie is accepted when one of its genres appears among the actually rated movies at least as many
    times as a configured threshold. Finally, it calculates the recommendation accuracy of the
    classifiers.

    Args
        properties (dict): dataset, datasets_folder, models, output_folder, qualitative
        logger (Logger): handles the logs
        directory (str): the path where the csv files with the recommendation results are
    """
    dataset = Datasets.ml_latest_small.value if properties["dataset"] == Datasets.small.value else \
        Datasets.ml_latest.value
    dataset_path = join(utils.app_dir, properties["datasets_folder"], dataset)
    movie_df = pd.read_csv(join(dataset_path, "movies.csv"))
    ratings_df = pd.read_csv(join(dataset_path, "ratings.csv"))
    for model in properties["models"]["content-based"]:
        filename = join(directory, "test_recommendation_{}.csv".format(model))
        df = pd.read_csv(filename)
        del df['Unnamed: 0']
        user_ids = utils.load_from_pickle(properties["output_folder"],
                                          "user_ids.pickle_{}".format(properties["dataset"]))
        users_accuracies = {}
        user_accepted_movies = {}
        for userid in user_ids:
            user_predictions_df = df[(df['userId'] == userid) & (df['rating'] == 0)]
            user_true_df = ratings_df[(ratings_df['userId'] == userid) & (ratings_df['rating'] > 3)]
            true_movies = list(user_true_df['movieId'])
            # Count how often each genre appears among the user's highly-rated movies.
            true_movies_genres = {}
            for true_movie_id in true_movies:
                movie_line = movie_df[movie_df['movieId'] == true_movie_id]
                genres = movie_line.iloc[0]["genres"].split("|")
                for genre in genres:
                    if genre not in true_movies_genres.keys():
                        true_movies_genres[genre] = 0
                    true_movies_genres[genre] += 1
            # Keep the highest-probability predictions first (sort_values returns a new frame,
            # so the result must be assigned back).
            user_predictions_df = user_predictions_df.sort_values('probability', ascending=False)
            recommend_movies = list(user_predictions_df['movieId'])
            if len(recommend_movies) > properties["qualitative"]["top_num"]:
                recommend_movies = recommend_movies[:properties["qualitative"]["top_num"]]
            accept = []
            if recommend_movies:
                for recom_movieid in recommend_movies:
                    movie_line = movie_df[movie_df['movieId'] == recom_movieid]
                    genres = movie_line.iloc[0]["genres"].split("|")
                    for genre in genres:
                        if genre in true_movies_genres.keys():
                            if true_movies_genres[genre] >= properties["qualitative"]["threshold"] and \
                                    recom_movieid not in accept:
                                accept.append(recom_movieid)
            user_accepted_movies[userid] = accept
            users_accuracies[userid] = len(accept) / len(recommend_movies) if recommend_movies else 0.0
            accepted_movies = " ".join(str(x) for x in accept) if accept else ""
            logger.debug("User with id {} has accepted movies: {}".format(userid, accepted_movies))
            logger.debug("Accuracy for user with id {} is {}".format(userid, users_accuracies[userid]))
        model_sum = 0
        model_count = 0
        for k, v in users_accuracies.items():
            # v is the accuracy computed for user k
            model_sum += v
            model_count += 1
        model_avg_accuracy = model_sum / model_count
        logger.info("Model's {} accuracy: {}".format(model, model_avg_accuracy))
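
# Illustrative usage sketch (not from the source): the "qualitative" values are placeholders;
# top_num limits how many of the highest-probability recommendations are inspected and threshold is
# the minimum number of times a genre must appear among a user's highly-rated movies for a
# recommended movie to count as accepted.
def _example_qualitative_content_based(properties, logger):
    properties = dict(properties, qualitative={"top_num": 10, "threshold": 3})
    results_dir = join(utils.app_dir, properties["output_folder"], "test_results", "content-based")
    qualitative_content_based(properties=properties, logger=logger, directory=results_dir)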