def _check_duplicated_interactions(self):
    # drop repeated (user, item) interactions, keeping the first occurrence of each pair
    dups = self.df_obs.duplicated([self.uid_col, self.iid_col])
    if dups.sum():
        logger.warning(
            'ObservationsDF: Dropping %s duplicate interactions.' %
            str(dups.sum()))
        self.df_obs = self.df_obs[~dups]
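A minimal standalone illustration of the same dedup step; the column names below are hypothetical stand-ins for `self.uid_col` / `self.iid_col`:

import pandas as pd

# hypothetical observations frame; 'userid' / 'itemid' stand in for self.uid_col / self.iid_col
df_obs = pd.DataFrame({'userid': [1, 1, 2],
                       'itemid': ['a', 'a', 'b'],
                       'rating': [5, 3, 4]})

dups = df_obs.duplicated(['userid', 'itemid'])  # marks the second (1, 'a') row as a duplicate
print(dups.sum())                               # 1
df_obs = df_obs[~dups]                          # keeps the first occurrence of each (user, item) pair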
Example #2
            def inner(*args, **kwargs):
                nonlocal self, path_func
                read_from_cache = kwargs.pop('read_from_cache', False)
                save_to_cache = kwargs.pop('save_to_cache', True)
                cache_valid_days = kwargs.pop('cache_valid_days', None)

                if not read_from_cache and not save_to_cache:
                    # short circuit everything if cache not requested
                    return func(*args, **kwargs)

                path_func = path_func or self.cache_filepath
                cache_path = path_func(*args, **kwargs)
                cache_valid = self.is_cache_valid(cache_path, valid_days=cache_valid_days)

                read_cache_attempt = read_from_cache and cache_valid

                # using pickle here because it preserves the dataframe more reliably
                # (dtypes and other metadata may be changed or lost in a CSV write/read round trip)

                if read_cache_attempt:
                    # df = pd.read_csv(cache_path, keep_default_na=False, na_values=NA_VALUES)
                    df = pd.read_pickle(cache_path)
                    logger.info(f'Read cache file from {cache_path}')
                else:
                    if read_from_cache:
                        logger.warning(f'Cache file not found/valid, attempting to create ({cache_path})')
                    df = func(*args, **kwargs)

                if save_to_cache and cache_path and not read_cache_attempt:
                    # df.to_csv(cache_path, index=None)
                    df.to_pickle(cache_path)

                return df
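The `inner` function above reads like the body of a caching decorator (it closes over `func` and `path_func` via `nonlocal`). Below is a simplified, self-contained sketch of the same pattern; the decorator name, the `path_func` signature, and the example function are assumptions, not the source's API:

import os
import functools
import pandas as pd

def cached_dataframe(path_func):
    # hypothetical decorator factory: wraps a dataframe-producing function with
    # pickle-based disk caching controlled by read_from_cache / save_to_cache kwargs
    def decorator(func):
        @functools.wraps(func)
        def inner(*args, **kwargs):
            read_from_cache = kwargs.pop('read_from_cache', False)
            save_to_cache = kwargs.pop('save_to_cache', True)
            if not read_from_cache and not save_to_cache:
                return func(*args, **kwargs)
            cache_path = path_func(*args, **kwargs)
            if read_from_cache and os.path.exists(cache_path):
                return pd.read_pickle(cache_path)
            df = func(*args, **kwargs)
            if save_to_cache:
                df.to_pickle(cache_path)
            return df
        return inner
    return decorator

@cached_dataframe(path_func=lambda name: '/tmp/%s.pkl' % name)
def load_data(name):
    return pd.DataFrame({'user': [1, 2], 'item': ['a', 'b']})

df = load_data('demo', read_from_cache=True)  # recomputes and caches on the first call, reads the pickle afterwards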
Example #3
def cache_filepath(self, *args, **kwargs):
    # build a stable cache key from the call arguments plus a salt
    cache_obj = tuple(hash_str(a) for a in args) + \
                tuple((hash_str(k), hash_str(v)) for k, v in kwargs.items()) + \
                (self.salt,)
    if self.disk_cache_dir is None:
        logger.warning('Attempting to use cache but cache dir was not defined. Not using cache.')
    else:
        if not os.path.exists(self.disk_cache_dir):
            logger.warning("Cache dir doesn't exist, trying to create.")
            os.makedirs(self.disk_cache_dir, exist_ok=True)
        return os.path.join(self.disk_cache_dir, self.cache_file_pattern % hash_str(cache_obj))
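`hash_str` is not shown in this excerpt; a plausible sketch, assuming it is a deterministic string hash used to build stable cache file names (the implementation below is an assumption, not the source's code):

import hashlib

def hash_str(obj):
    # assumed helper: deterministic hex digest of an object's repr(), so the same
    # call arguments always map to the same cache file name
    return hashlib.md5(repr(obj).encode('utf-8')).hexdigest()

`self.cache_file_pattern` would then be a pattern with a single `%s` placeholder, e.g. something like 'cache_%s.pkl'.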
Example #4
    def _eval_on_test_by_ranking_LFM(train_ranks_func,
                                     test_ranks_func,
                                     test_dfs,
                                     test_names=None,
                                     prefix='',
                                     include_train=True,
                                     k=10):
        """
        this is just to avoid the same flow twice (or more)
        :param train_ranks_func: function that return the ranks and sparse mat of training set
        :param test_ranks_func: function that return the ranks and sparse mat of a test set
        :param test_dfs: test dataframes
        :param test_names: test dataframes names
        :param prefix: prefix for this report
        :param include_train: whether to evaluate training or not
        :return: a report dataframe
        """

        # allow a single dataframe to be passed instead of a list
        if isinstance(test_dfs, pd.DataFrame):
            test_dfs = [test_dfs]

        if test_names is None or len(test_names) != len(test_dfs):
            logger.warning(
                'No test names provided or number of names '
                'not matching number of test DFs, using empty strings')
            test_names = [''] * len(test_dfs)

        res = []
        report_dfs = []
        full_reports = {}
        with ThreadPool(len(test_dfs) + int(include_train)) as pool:
            if include_train:
                res.append(
                    (prefix + 'train', pool.apply_async(train_ranks_func)))

            for test_df, test_name in zip(test_dfs, test_names):
                res.append((prefix + test_name + 'test',
                            pool.apply_async(test_ranks_func,
                                             args=(test_df, ))))

            for name, r in res:
                ranks_mat, sp_mat = r.get()
                means_report, full_report = mean_scores_report_on_ranks(
                    ranks_list=[ranks_mat],
                    datasets=[sp_mat],
                    dataset_names=[name],
                    k=k)
                report_dfs.append(means_report)
                full_reports.update(full_report)

        report_df = pd.concat(report_dfs, sort=False)
        return report_df, full_reports
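A minimal runnable sketch (all names hypothetical) of the dispatch/collect pattern used above: each evaluation is submitted with `apply_async`, and `.get()` blocks while the per-dataset reports are concatenated into one frame:

from multiprocessing.pool import ThreadPool
import pandas as pd

def fake_report(name):
    # stand-in for a ranks/evaluation function
    return pd.DataFrame({'dataset': [name], 'metric@10': [0.5]})

results = []
with ThreadPool(2) as pool:
    for name in ['train', 'test']:
        results.append((name, pool.apply_async(fake_report, args=(name,))))
    report_dfs = [r.get() for _, r in results]  # block for each result inside the pool context

report_df = pd.concat(report_dfs, sort=False)
print(report_df)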
Example #5
    def _check_interrupt(self):
        if self.interrupt_message_file is not None \
                and os.path.exists(self.interrupt_message_file):

            with open(self.interrupt_message_file) as f:
                message = f.readline()

            if 'stop' in message:
                raise InterruptedError('interrupted by "stop" message in %s'
                                       % self.interrupt_message_file)
            elif 'pause' in message:
                simple_logger.warning('Paused by "pause" message in %s'
                                      % self.interrupt_message_file)
                while 'pause' in message:
                    time.sleep(1)
                    with open(self.interrupt_message_file) as f:
                        message = f.readline()
                self._check_interrupt()

            elif 'update' in message:
                simple_logger.warning('Updating HP space due to "update" message in %s'
                                      % self.interrupt_message_file)
                raise NotImplementedError('not yet implemented')
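The interrupt file is a plain text control channel; a minimal sketch of driving it from another process (the file name below is hypothetical, i.e. whatever `self.interrupt_message_file` points to):

# assuming the running object was configured with interrupt_message_file='interrupt.txt'
with open('interrupt.txt', 'w') as f:
    f.write('pause')   # _check_interrupt() loops, sleeping 1s, until this word is replaced

with open('interrupt.txt', 'w') as f:
    f.write('resume')  # any message without 'pause' ends the loop and triggers one re-check

with open('interrupt.txt', 'w') as f:
    f.write('stop')    # the next _check_interrupt() call raises InterruptedError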