Example #1
    def _load_from_original_file(self):
        # Load data from the original file

        print("Movielens1MReader: Loading original data")

        zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")

        except (FileNotFoundError, zipfile.BadZipFile):

            print("Movielens1MReader: Unable to find data zip file. Downloading...")
            downloadFromURL(self.DATASET_URL, zipFile_path, "ml-1m.zip")
            dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")


        URM_path = dataFile.extract("ml-1m/ratings.dat", path=zipFile_path + "decompressed/")

        URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(URM_path, separator="::")

        print("Movielens1MReader: cleaning temporary files")

        import shutil

        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        print("Movielens1MReader: loading complete")

        return Dataset(self.get_dataset_name(), URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
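
All of the readers in this section share the open-or-download fallback seen above: try the local archive first, and download it only when opening fails. A minimal standalone sketch of that pattern using only the standard library (the function name and arguments are hypothetical; the real readers delegate the download to the framework's downloadFromURL):

import os
import zipfile
import urllib.request


def open_or_download_zip(folder, file_name, url):
    # Open the local copy if it exists and is valid; otherwise (re)download it.
    zip_path = os.path.join(folder, file_name)
    try:
        return zipfile.ZipFile(zip_path)
    except (FileNotFoundError, zipfile.BadZipFile):
        os.makedirs(folder, exist_ok=True)
        urllib.request.urlretrieve(url, zip_path)
        return zipfile.ZipFile(zip_path)

Catching zipfile.BadZipFile as well as FileNotFoundError means a corrupt or partially downloaded archive is transparently re-fetched rather than failing on the second open.
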
Example #2
    def _load_from_original_file(self):
        # Load data from the original file

        print("LastFMHetrec2011Reader: Loading original data")

        folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            dataFile = zipfile.ZipFile(folder_path +
                                       "hetrec2011-lastfm-2k.zip")

        except (FileNotFoundError, zipfile.BadZipFile):

            print(
                "LastFMHetrec2011Reader: Unable to find or extract data zip file. Downloading..."
            )
            downloadFromURL(self.DATASET_URL, folder_path,
                            "hetrec2011-lastfm-2k.zip")
            dataFile = zipfile.ZipFile(folder_path +
                                       "hetrec2011-lastfm-2k.zip")

        URM_path = dataFile.extract("user_artists.dat",
                                    path=folder_path + "decompressed")
        tags_path = dataFile.extract("user_taggedartists-timestamps.dat",
                                     path=folder_path + "decompressed")

        print("LastFMHetrec2011Reader: loading URM")
        URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(
            URM_path, separator="\t", header=True)

        print("LastFMHetrec2011Reader: loading tags")
        ICM_tags, feature_mapper, _ = self._loadICM_tags(tags_path,
                                                         item_mapper,
                                                         header=True,
                                                         separator='\t',
                                                         if_new_item="ignore")

        print("LastFMHetrec2011Reader: cleaning temporary files")

        import shutil

        shutil.rmtree(folder_path + "decompressed", ignore_errors=True)

        print("LastFMHetrec2011Reader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       },
                       ICM_dict={"ICM_all": ICM_tags},
                       ICM_mappers_dict={
                           "ICM_all":
                           (item_mapper.copy(), feature_mapper.copy())
                       })
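
load_CSV_into_SparseBuilder itself is framework code and is not shown in these examples. Purely for illustration, here is a simplified stand-in with the same call signature and return order, assuming each row holds a (user, item, rating) triple. Unlike the real builder, it only supports single-character separators such as the tab used above, not the "::" of Example #1:

import csv
import scipy.sparse as sps


def load_csv_into_csr(path, separator="\t", header=True):
    # Map original user/item IDs to contiguous indices while collecting ratings.
    user_mapper, item_mapper = {}, {}
    rows, cols, data = [], [], []
    with open(path, newline="") as csv_file:
        reader = csv.reader(csv_file, delimiter=separator)
        if header:
            next(reader)
        for user_id, item_id, rating, *_ in reader:
            rows.append(user_mapper.setdefault(user_id, len(user_mapper)))
            cols.append(item_mapper.setdefault(item_id, len(item_mapper)))
            data.append(float(rating))
    URM_all = sps.csr_matrix((data, (rows, cols)),
                             shape=(len(user_mapper), len(item_mapper)))
    # Same return order as the call sites above: URM, item mapper, user mapper.
    return URM_all, item_mapper, user_mapper

Note that csr_matrix sums duplicate (user, item) entries; the framework builder may resolve duplicates differently.
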
Example #3
    def _load_from_original_file(self):
        # Load data from the original file

        print("EpinionsReader: Loading original data")

        folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        compressed_file_path = folder_path + "ratings_data.txt.bz2"
        decompressed_file_path = folder_path + "ratings_data.txt"

        try:

            # Probe for an existing decompressed file; close the handle immediately
            open(decompressed_file_path, "r").close()

        except FileNotFoundError:

            print(
                "EpinionsReader: Unable to find decompressed data file. Decompressing..."
            )

            try:

                compressed_file = bz2.open(compressed_file_path, "rb")

            except Exception:

                print(
                    "EpinionsReader: Unable to find or open compressed data file. Downloading..."
                )
                downloadFromURL(self.DATASET_URL, folder_path,
                                "ratings_data.txt.bz2")
                compressed_file = bz2.open(compressed_file_path, "rb")

            decompressed_file = open(decompressed_file_path, "w")
            self._save_BZ2_in_text_file(compressed_file, decompressed_file)
            decompressed_file.close()
            compressed_file.close()

        print("EpinionsReader: loading URM")

        URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(
            decompressed_file_path, separator=" ", header=True)

        print("EpinionsReader: cleaning temporary files")

        import os

        os.remove(decompressed_file_path)

        print("EpinionsReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       })
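
_save_BZ2_in_text_file is another framework helper whose body is not shown; it evidently materializes the decompressed stream into the text file. Assuming that is all it does, the same effect can be had with the standard library by streaming the bytes directly, which avoids holding the whole file in memory (a sketch, not the original helper):

import bz2
import shutil


def save_bz2_to_file(compressed_path, decompressed_path):
    # Stream-decompress chunk by chunk instead of reading everything at once.
    with bz2.open(compressed_path, "rb") as src, \
            open(decompressed_path, "wb") as dst:
        shutil.copyfileobj(src, dst)
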
Example #4
    def _load_from_original_file_all_amazon_datasets(self, URM_path, metadata_path=None, reviews_path=None):

        print("AmazonReviewDataReader: Loading original data")

        print("AmazonReviewDataReader: loading URM")
        URM_all, item_original_ID_to_index, user_original_ID_to_index = load_CSV_into_SparseBuilder(URM_path, separator=",", header=False)
        urm = {"URM_all": URM_all}
        urm_mappers = {"URM_all": (user_original_ID_to_index, item_original_ID_to_index)}

        icm = {}
        icm_mappers = {}
        if metadata_path is not None:
            print("AmazonReviewDataReader: loading metadata")
            ICM_metadata, feature_mapper, item_mapper = self._loadMetadata(metadata_path, item_original_ID_to_index, if_new_item="ignore")
            # removeFeatures presumably prunes features occurring in fewer than 5 items or in more than 30% of them
            ICM_metadata, _, feature_mapper = removeFeatures(ICM_metadata, minOccurrence=5, maxPercOccurrence=0.30,
                                                             reconcile_mapper=feature_mapper)
            icm["ICM_metadata"] = ICM_metadata
            icm_mappers["ICM_metadata"] = (item_mapper.copy(), feature_mapper.copy())

        if reviews_path is not None:
            print("AmazonReviewDataReader: loading reviews")
            ICM_reviews, feature_mapper, item_mapper = self._loadReviews(reviews_path, item_original_ID_to_index, if_new_item="ignore")
            ICM_reviews, _, feature_mapper = removeFeatures(ICM_reviews, minOccurrence=5, maxPercOccurrence=0.30,
                                                            reconcile_mapper=feature_mapper)
            icm["ICM_reviews"] = ICM_reviews
            icm_mappers["ICM_reviews"] = (item_mapper.copy(), feature_mapper.copy())

        # Merge all loaded ICMs into a single "ICM_all"
        if len(icm) > 0:
            ICM_names = list(icm.keys())
            ICM_all, ICM_all_mapper = icm[ICM_names[0]], icm_mappers[ICM_names[0]]
            for key in ICM_names[1:]:
                ICM_all, ICM_all_mapper = self._merge_ICM(ICM_all, icm[key], ICM_all_mapper, icm_mappers[key])
            icm["ICM_all"] = ICM_all
            icm_mappers["ICM_all"] = ICM_all_mapper

        # Clean temp files
        print("AmazonReviewDataReader: cleaning temporary files")

        if metadata_path is not None:
            os.remove(metadata_path)

        if reviews_path is not None:
            os.remove(reviews_path)

        print("AmazonReviewDataReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict=urm, URM_mappers_dict=urm_mappers,
                       ICM_dict=icm, ICM_mappers_dict=icm_mappers)
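
_merge_ICM is also framework code. Since both ICMs are indexed over the same items (they share item_original_ID_to_index), merging plausibly amounts to stacking the feature columns side by side and shifting the second feature mapper past the first. A hypothetical sketch; feature-name collisions between the two mappers are not resolved here:

import scipy.sparse as sps


def merge_icm(ICM_left, ICM_right, mapper_left, mapper_right):
    # Both mappers are (item_mapper, feature_mapper) tuples, as in the code above.
    item_mapper, feature_mapper_left = mapper_left
    _, feature_mapper_right = mapper_right
    # Items are rows in both matrices, so features can be stacked horizontally.
    ICM_all = sps.hstack([ICM_left, ICM_right], format="csr")
    # Shift the right-hand feature indices past the left-hand columns.
    offset = ICM_left.shape[1]
    feature_mapper = dict(feature_mapper_left)
    for feature, index in feature_mapper_right.items():
        feature_mapper[feature] = index + offset
    return ICM_all, (item_mapper, feature_mapper)
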