示例#1
0
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            normalize=True,
            feature_weighting="none",
            **similarity_args):
        """Compute the top-K similarity matrix from ``self.URM_train``.

        The result is stored in ``self.W_sparse`` as a CSR matrix.

        Args:
            topK: forwarded to ``Compute_Similarity``; also stored on ``self``.
            shrink: forwarded to ``Compute_Similarity``; also stored on ``self``.
            similarity: similarity name forwarded to ``Compute_Similarity``.
            normalize: forwarded to ``Compute_Similarity``.
            feature_weighting: one of ``self.FEATURE_WEIGHTING_VALUES``.
                "BM25" and "TF-IDF" re-weight ``self.URM_train`` in place
                before the similarity is computed; any other accepted value
                leaves the matrix untouched.
            **similarity_args: extra keyword arguments passed through to
                ``Compute_Similarity`` unchanged.

        Raises:
            ValueError: if ``feature_weighting`` is not an accepted value.
        """
        self.topK, self.shrink = topK, shrink

        accepted = self.FEATURE_WEIGHTING_VALUES
        if feature_weighting not in accepted:
            template = "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            raise ValueError(template.format(accepted, feature_weighting))

        if feature_weighting in ("BM25", "TF-IDF"):
            # Both schemes follow the same recipe: cast to float32, weight the
            # transposed matrix, transpose back and normalise the format.
            reweigh = okapi_BM_25 if feature_weighting == "BM25" else TF_IDF
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = check_matrix(reweigh(self.URM_train.T).T, 'csr')

        # NOTE: self.URM_train is overwritten above, so the weighting persists
        # on the instance after fit() returns.
        similarity_builder = Compute_Similarity(self.URM_train.T,
                                                shrink=shrink,
                                                topK=topK,
                                                normalize=normalize,
                                                similarity=similarity,
                                                **similarity_args)

        self.W_sparse = check_matrix(similarity_builder.compute_similarity(),
                                     format='csr')
示例#2
0
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            normalize=True,
            feature_weighting="none",
            **similarity_args):
        """Compute the top-K similarity matrix from ``self.ICM`` and return it.

        When ``self.sparse_weights`` is true the result is stored in
        ``self.W_sparse``; otherwise it is densified with ``toarray()`` and
        stored in ``self.W``.

        Args:
            topK: forwarded to ``Compute_Similarity``; also stored on ``self``.
            shrink: forwarded to ``Compute_Similarity``; also stored on ``self``.
            similarity: similarity name forwarded to ``Compute_Similarity``.
            normalize: forwarded to ``Compute_Similarity``.
            feature_weighting: one of ``self.FEATURE_WEIGHTING_VALUES``.
                "BM25" and "TF-IDF" re-weight ``self.ICM`` in place before the
                similarity is computed.
            **similarity_args: extra keyword arguments passed through to
                ``Compute_Similarity`` unchanged.

        Returns:
            The matrix that was just computed: ``self.W_sparse`` in sparse
            mode, ``self.W`` in dense mode.

        Raises:
            ValueError: if ``feature_weighting`` is not an accepted value.
        """
        self.topK = topK
        self.shrink = shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        # NOTE: self.ICM is overwritten, so the weighting persists on the
        # instance after fit() returns.
        if feature_weighting == "BM25":
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = okapi_BM_25(self.ICM)

        elif feature_weighting == "TF-IDF":
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = TF_IDF(self.ICM)

        # Separate local name so the 'similarity' argument is not shadowed.
        similarity_builder = Compute_Similarity(self.ICM.T,
                                                shrink=shrink,
                                                topK=topK,
                                                normalize=normalize,
                                                similarity=similarity,
                                                **similarity_args)

        if self.sparse_weights:
            self.W_sparse = similarity_builder.compute_similarity()
            return self.W_sparse

        # Dense mode only ever populates self.W; returning self.W_sparse here
        # (as the code previously did) would raise AttributeError or hand back
        # a stale matrix from an earlier sparse fit.
        self.W = similarity_builder.compute_similarity().toarray()
        return self.W
示例#3
0
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            normalize=True,
            force_compute_sim=True,
            feature_weighting="none",
            feature_weighting_index=0,
            **similarity_args):
        """Compute (or reload from an on-disk cache) the similarity matrix
        derived from ``self.ICM``.

        Args:
            topK: forwarded to ``Compute_Similarity``; also stored on ``self``.
            shrink: forwarded to ``Compute_Similarity``; also stored on ``self``.
            similarity: similarity name forwarded to ``Compute_Similarity``.
            normalize: forwarded to ``Compute_Similarity``.
            force_compute_sim: when false, first try to load a previously
                pickled matrix and return early if its stored ``topK`` and
                ``shrink`` match the requested ones.
            feature_weighting: accepted for interface compatibility only; its
                value is replaced by
                ``self.FEATURE_WEIGHTING_VALUES[feature_weighting_index]``.
            feature_weighting_index: index into
                ``self.FEATURE_WEIGHTING_VALUES`` selecting the weighting
                scheme ("BM25" and "TF-IDF" re-weight ``self.ICM`` in place).
            **similarity_args: extra keyword arguments passed through to
                ``Compute_Similarity`` unchanged.

        Raises:
            IndexError: if ``feature_weighting_index`` is out of range for a
                sequence-typed ``FEATURE_WEIGHTING_VALUES``.
        """
        self.feature_weighting_index = feature_weighting_index
        # The scheme always comes from the index lookup, so it is by
        # construction an accepted value and needs no membership check.
        feature_weighting = self.FEATURE_WEIGHTING_VALUES[
            feature_weighting_index]
        self.topK = topK
        self.shrink = shrink

        def cache_path():
            # Single source of truth for the cache location, used by both the
            # load and the save step. Evaluated lazily so self.URM_train is
            # only touched when the cache is actually used.
            file_name = "tot={}_topK={}_shrink={}_featureweight={}.pkl".format(
                len(self.URM_train.data), self.topK, self.shrink,
                self.feature_weighting_index)
            return os.path.join("IntermediateComputations", "ICB", file_name)

        if not force_compute_sim:
            load_path = cache_path()
            try:
                # NOTE(security): pickle.load can execute arbitrary code; the
                # cache directory must only contain files written by this
                # program.
                with open(load_path, 'rb') as handle:
                    (topK_new, shrink_new, W_sparse_new) = pickle.load(handle)
            except FileNotFoundError:
                # Report the path that was actually tried.
                print("File {} not found".format(load_path))
            else:
                if self.topK == topK_new and self.shrink == shrink_new:
                    self.W_sparse = W_sparse_new
                    print("Saved CBF Similarity Matrix Used!")
                    return

        # NOTE: self.ICM is overwritten, so the weighting persists on the
        # instance after fit() returns.
        if feature_weighting == "BM25":
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = okapi_BM_25(self.ICM)

        elif feature_weighting == "TF-IDF":
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = TF_IDF(self.ICM)

        # Separate local name so the 'similarity' argument is not shadowed.
        similarity_builder = Compute_Similarity(self.ICM.T,
                                                shrink=shrink,
                                                topK=topK,
                                                normalize=normalize,
                                                similarity=similarity,
                                                **similarity_args)

        if self.sparse_weights:
            self.W_sparse = similarity_builder.compute_similarity()

            save_path = cache_path()
            # Create the cache directory on first use so that a missing folder
            # does not raise after the matrix has already been computed.
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            with open(save_path, 'wb') as handle:
                pickle.dump((self.topK, self.shrink, self.W_sparse),
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
                print("CBF similarity matrix saved")
        else:
            # Dense mode is never cached; only self.W is populated.
            self.W = similarity_builder.compute_similarity().toarray()
示例#4
0
    def fit(self,
            validation_every_n=5,
            show_max_performance=False,
            logFile=None,
            precompute_common_features=True,
            learning_rate=0.01,
            positive_only_weights=True,
            init_type="zero",
            normalize_similarity=False,
            use_dropout=True,
            dropout_perc=0.3,
            l1_reg=0.0,
            l2_reg=0.0,
            epochs=50,
            topK=300,
            add_zeros_quota=0.0,
            sgd_mode='adagrad',
            gamma=0.9,
            beta_1=0.9,
            beta_2=0.999,
            stop_on_validation=False,
            lower_validatons_allowed=5,
            validation_metric="MAP",
            evaluator_object=None):
        """Train feature weights with the compiled SGD implementation.

        The method validates ``init_type``, stores the hyper-parameters on
        ``self``, generates the training data, builds the initial weight
        vector, instantiates ``CFW_D_Similarity_Cython_SGD``, runs
        ``_train_with_early_stopping`` and finally calls
        ``compute_W_sparse``.

        Args:
            validation_every_n: stored on ``self`` (``np.inf`` when ``None``);
                the raw argument is what gets forwarded to
                ``_train_with_early_stopping``.
            init_type: one of ``self.INIT_TYPE_VALUES``. "random", "one" and
                "zero" pick the initial weight vector; "BM25" and "TF-IDF"
                start from ones and additionally re-weight ``self.ICM`` in
                place.
            epochs, stop_on_validation, validation_metric,
            lower_validatons_allowed, evaluator_object: forwarded to
                ``_train_with_early_stopping``.
            Remaining arguments are stored on ``self`` and/or forwarded to the
            ``CFW_D_Similarity_Cython_SGD`` constructor.

        Raises:
            ValueError: if ``init_type`` is not an accepted value.
        """
        accepted_init_types = self.INIT_TYPE_VALUES
        if init_type not in accepted_init_types:
            template = "Value for 'init_type' not recognized. Acceptable values are {}, provided was '{}'"
            raise ValueError(template.format(accepted_init_types, init_type))

        # Import compiled module
        from FW_Similarity.Cython.CFW_D_Similarity_Cython_SGD import CFW_D_Similarity_Cython_SGD

        self.logFile = logFile
        self.validation_every_n = (np.inf if validation_every_n is None
                                   else validation_every_n)
        self.evaluator_object = evaluator_object

        self.show_max_performance = show_max_performance
        self.positive_only_weights = positive_only_weights
        self.normalize_similarity = normalize_similarity
        self.learning_rate = learning_rate
        self.add_zeros_quota = add_zeros_quota
        self.l1_reg, self.l2_reg = l1_reg, l2_reg
        self.epochs = epochs
        self.topK = topK

        self.generateTrainData_low_ram()

        # Initial weight vector, one entry per feature.
        if init_type == "random":
            initial_weights = np.random.normal(
                0.001, 0.1, self.n_features).astype(np.float64)
        elif init_type == "zero":
            initial_weights = np.zeros(self.n_features, dtype=np.float64)
        elif init_type in ("one", "BM25", "TF-IDF"):
            initial_weights = np.ones(self.n_features, dtype=np.float64)
            if init_type != "one":
                # The weighting schemes start from unit weights and move the
                # weighting into the ICM itself.
                reweigh = okapi_BM_25 if init_type == "BM25" else TF_IDF
                self.ICM = self.ICM.astype(np.float32)
                self.ICM = reweigh(self.ICM)
        else:
            raise ValueError(
                "CFW_D_Similarity_Cython: 'init_type' not recognized")

        # Instantiate fast Cython implementation
        sgd_options = dict(
            precompute_common_features=precompute_common_features,
            non_negative_weights=self.positive_only_weights,
            weights_initialization=initial_weights,
            use_dropout=use_dropout,
            dropout_perc=dropout_perc,
            learning_rate=learning_rate,
            l1_reg=l1_reg,
            l2_reg=l2_reg,
            sgd_mode=sgd_mode,
            gamma=gamma,
            beta_1=beta_1,
            beta_2=beta_2,
        )
        self.FW_D_Similarity = CFW_D_Similarity_Cython_SGD(
            self.row_list, self.col_list, self.data_list, self.n_features,
            self.ICM, **sgd_options)

        print(self.RECOMMENDER_NAME + ": Initialization completed")

        # The caller's raw validation_every_n (possibly None) is forwarded
        # here, not the np.inf-normalised self.validation_every_n.
        self._train_with_early_stopping(epochs,
                                        validation_every_n,
                                        stop_on_validation,
                                        validation_metric,
                                        lower_validatons_allowed,
                                        evaluator_object,
                                        algorithm_name=self.RECOMMENDER_NAME)

        self.compute_W_sparse()

        sys.stdout.flush()
    def fit(self,
            show_max_performance=False,
            precompute_common_features=False,
            learning_rate=0.1,
            positive_only_D=True,
            initialization_mode_D="random",
            normalize_similarity=False,
            use_dropout=True,
            dropout_perc=0.3,
            l1_reg=0.0,
            l2_reg=0.0,
            epochs=50,
            topK=300,
            add_zeros_quota=0.0,
            log_file=None,
            verbose=False,
            sgd_mode='adagrad',
            gamma=0.9,
            beta_1=0.9,
            beta_2=0.999,
            **earlystopping_kwargs):
        """Train feature weights with the compiled SGD implementation.

        The method validates ``initialization_mode_D``, stores the
        hyper-parameters on ``self``, generates the training data, builds the
        initial weight vector, instantiates ``CFW_D_Similarity_Cython_SGD``,
        snapshots its weights into ``self.D_incremental`` / ``self.D_best``,
        runs ``_train_with_early_stopping`` and finally calls
        ``compute_W_sparse(model_to_use="best")``.

        Args:
            initialization_mode_D: one of ``self.INIT_TYPE_VALUES``. "random",
                "one" and "zero" pick the initial weight vector; "BM25" and
                "TF-IDF" start from ones and additionally re-weight
                ``self.ICM`` in place.
            epochs: stored on ``self`` and forwarded to
                ``_train_with_early_stopping``.
            verbose: stored on ``self``; gates the initialization message and
                is forwarded to the Cython constructor.
            **earlystopping_kwargs: passed through to
                ``_train_with_early_stopping`` unchanged.
            Remaining arguments are stored on ``self`` and/or forwarded to the
            ``CFW_D_Similarity_Cython_SGD`` constructor.

        Raises:
            ValueError: if ``initialization_mode_D`` is not an accepted value.
        """
        accepted_modes = self.INIT_TYPE_VALUES
        if initialization_mode_D not in accepted_modes:
            template = "Value for 'initialization_mode_D' not recognized. Acceptable values are {}, provided was '{}'"
            raise ValueError(
                template.format(accepted_modes, initialization_mode_D))

        # Import compiled module
        from FeatureWeighting.Cython.CFW_D_Similarity_Cython_SGD import CFW_D_Similarity_Cython_SGD

        self.show_max_performance = show_max_performance
        self.normalize_similarity = normalize_similarity
        self.learning_rate = learning_rate
        self.add_zeros_quota = add_zeros_quota
        self.l1_reg, self.l2_reg = l1_reg, l2_reg
        self.epochs = epochs
        self.topK = topK
        self.log_file = log_file
        self.verbose = verbose

        self._generate_train_data()

        # Initial weight vector, one entry per feature.
        if initialization_mode_D == "random":
            initial_D = np.random.normal(
                0.001, 0.1, self.n_features).astype(np.float64)
        elif initialization_mode_D == "zero":
            initial_D = np.zeros(self.n_features, dtype=np.float64)
        elif initialization_mode_D in ("one", "BM25", "TF-IDF"):
            initial_D = np.ones(self.n_features, dtype=np.float64)
            if initialization_mode_D != "one":
                # The weighting schemes start from unit weights and move the
                # weighting into the ICM itself.
                reweigh = (okapi_BM_25 if initialization_mode_D == "BM25"
                           else TF_IDF)
                self.ICM = self.ICM.astype(np.float32)
                self.ICM = reweigh(self.ICM)
        else:
            raise ValueError(
                "CFW_D_Similarity_Cython: 'init_type' not recognized")

        # Instantiate fast Cython implementation
        sgd_options = dict(
            precompute_common_features=precompute_common_features,
            positive_only_D=positive_only_D,
            weights_initialization_D=initial_D,
            use_dropout=use_dropout,
            dropout_perc=dropout_perc,
            learning_rate=learning_rate,
            l1_reg=l1_reg,
            l2_reg=l2_reg,
            sgd_mode=sgd_mode,
            verbose=self.verbose,
            gamma=gamma,
            beta_1=beta_1,
            beta_2=beta_2,
        )
        self.FW_D_Similarity = CFW_D_Similarity_Cython_SGD(
            self.row_list, self.col_list, self.data_list, self.n_features,
            self.ICM, **sgd_options)

        if self.verbose:
            print(self.RECOMMENDER_NAME + ": Initialization completed")

        # Seed the "best" snapshot with an independent copy of the initial
        # weights.
        current_weights = self.FW_D_Similarity.get_weights()
        self.D_incremental = current_weights
        self.D_best = current_weights.copy()

        self._train_with_early_stopping(epochs,
                                        algorithm_name=self.RECOMMENDER_NAME,
                                        **earlystopping_kwargs)

        self.compute_W_sparse(model_to_use="best")

        sys.stdout.flush()