    def score(self,
              learner: WideNDeepModel,
              test_transactions: TransactionDataset,
              user_features: FeatureDataset = None,
              item_features: FeatureDataset = None,
              **kwargs):
        module_logger.info(
            "Recommendation task: Recommend items from all item.")
        super().score(learner, test_transactions, user_features, item_features,
                      **kwargs)
        max_recommended_item_count = kwargs["max_recommended_item_count"]
        return_ratings = kwargs["return_ratings"]
        all_items = learner.item_feature_builder.id_vocab
        test_transactions_df = test_transactions.df
        users = test_transactions_df.iloc[:, TRANSACTIONS_USER_COL].unique()
        module_logger.info(
            f"Get {len(users)} unique users, and {len(all_items)} unique items."
        )

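        # Score every candidate (user, item) pair: cross the test users with the full
        # item vocabulary, then let the model rank the top-K items per user.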
        with TimeProfile("Building complete user item transactions dataset"):
            transactions_df = self.build_user_item_cartesian_pairs(
                users=users, items=all_items)
        transactions = TransactionDataset(transactions_df)
        recommendations = self._recommend(learner,
                                          transactions=transactions,
                                          K=max_recommended_item_count,
                                          user_features=user_features,
                                          item_features=item_features)
        return self._format_recommendations(
            recommendations,
            return_ratings,
            K=max_recommended_item_count,
            score_column_names_build_method=build_ranking_column_names)
    def _init_mpi_support(self):
        global _HVD_LIB
        _HVD_LIB = importlib.import_module("horovod.tensorflow")

        _HVD_LIB.init()
        self.hvd_rank = _HVD_LIB.rank()
        self.hvd_size = _HVD_LIB.size()
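        # Pin this process to the GPU matching its Horovod local rank so that each
        # worker uses a distinct device.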
        os.environ["CUDA_VISIBLE_DEVICES"] = str(_HVD_LIB.local_rank())
        module_logger.info(
            f"Set GPU {_HVD_LIB.local_rank()} GPU as visible devices.")

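        # Only the rank-0 worker keeps a save directory; other ranks skip checkpointing.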
        if self.hvd_rank != 0:
            self.save_dir = None
    def score(self,
              learner: WideNDeepModel,
              test_transactions: TransactionDataset,
              user_features: FeatureDataset = None,
              item_features: FeatureDataset = None,
              **kwargs):
        module_logger.info(
            "Recommendation task: Recommend items from unrated item.")
        super().score(learner, test_transactions, user_features, item_features,
                      **kwargs)
        max_recommended_item_count = kwargs["max_recommended_item_count"]
        return_ratings = kwargs["return_ratings"]
        training_transactions = kwargs["training_transactions"]

        all_items = learner.item_feature_builder.id_vocab
        training_transactions_df = training_transactions.df
        training_transactions_df = training_transactions_df.rename(
            columns={
                training_transactions_df.columns[TRANSACTIONS_USER_COL]:
                USER_COLUMN,
                training_transactions_df.columns[TRANSACTIONS_ITEM_COL]:
                ITEM_COLUMN
            })
        users = test_transactions.df.iloc[:, TRANSACTIONS_USER_COL].unique()
        module_logger.info(
            f"Get {len(users)} unique users, and {len(all_items)} unique items."
        )

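        # Build all candidate pairs, then drop the ones already present in the training
        # transactions via a left merge with indicator (an anti-join), so that only
        # unrated items remain as candidates.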
        with TimeProfile("Building complete user item transactions dataset"):
            transactions_df = self.build_user_item_cartesian_pairs(
                users=users, items=all_items)
            transactions_df = pd.merge(transactions_df,
                                       training_transactions_df,
                                       how='left',
                                       on=[USER_COLUMN, ITEM_COLUMN],
                                       indicator=True)
            transactions_df = transactions_df[transactions_df['_merge'] ==
                                              'left_only']
            transactions_df = transactions_df.drop(columns=['_merge'])
            transactions = TransactionDataset(transactions_df)

        recommendations = self._recommend(learner,
                                          transactions=transactions,
                                          K=max_recommended_item_count,
                                          user_features=user_features,
                                          item_features=item_features)
        return self._format_recommendations(
            recommendations,
            return_ratings,
            K=max_recommended_item_count,
            score_column_names_build_method=build_ranking_column_names)
    def predict(self, transactions: TransactionDataset):
        if transactions.row_size == 0:
            return pd.Series(dtype=float)

        instances_count = transactions.row_size
        log_every_n_instances = instances_count // 5 if instances_count >= 5 else instances_count
        module_logger.info(f"Get {instances_count} test instances")
        module_logger.info(f"Rebuild model:\n {self.hyper_params}")
        self.build_model(load_checkpoints=True)
        input_fn = self.get_input_fn(transactions=transactions,
                                     batch_size=self.hyper_params.batch_size)
        predictions = []
        start_time = time()

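        # Stream predictions from the estimator, logging progress and an estimated
        # remaining time roughly every 20% of the instances.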
        with TimeProfile("Making predictions for user-item pairs"):
            for p in self.estimator.predict(input_fn=input_fn):
                if len(predictions) % log_every_n_instances == 0 and len(
                        predictions) > 0:
                    cost_seconds = time() - start_time
                    remaining_seconds = cost_seconds / len(predictions) * (
                        instances_count - len(predictions))
                    module_logger.info(
                        f"Finished {len(predictions)} instance predictions, "
                        f"cost time: {datetime.timedelta(seconds=cost_seconds)}."
                        f"Remaining time: {datetime.timedelta(seconds=remaining_seconds)}"
                    )
                predictions.append(p["predictions"][0])
            module_logger.info(
                f"Finished {len(predictions)} instance predictions. "
                f"Cost time: {datetime.timedelta(seconds=(time() - start_time))}"
            )

        predictions = pd.Series(predictions)

        return predictions
    def _check_feature_columns(self):
        # Ensure every basic feature referenced by the wide/deep columns exists in the
        # user or item feature datasets.
        basic_features = parse_basic_features(
            feature_columns=[*self.wide_columns, *self.deep_columns])
        module_logger.info(
            f"Model is expected to be fed with features: {[f.key for f in basic_features]}"
        )
        feature_keys = {
            *self.user_feature_builder.feature_metas.keys(),
            *self.item_feature_builder.feature_metas.keys(),
            self.user_feature_builder.id_key, self.item_feature_builder.id_key
        }
        for feature in basic_features:
            if feature.key not in feature_keys:
                raise RuntimeError(
                    f"Feature {feature.key} not found in feature datasets.")
    def score(self,
              learner: WideNDeepModel,
              test_transactions: TransactionDataset,
              user_features: FeatureDataset = None,
              item_features: FeatureDataset = None,
              **kwargs):
        module_logger.info(
            "Recommendation task: Recommend items from rated item.")
        super().score(learner, test_transactions, user_features, item_features,
                      **kwargs)
        max_recommended_item_count = kwargs["max_recommended_item_count"]
        min_recommendation_pool_size = kwargs["min_recommendation_pool_size"]
        return_ratings = kwargs["return_ratings"]

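        # Restrict candidates to items each user has already rated, keeping only users
        # whose rated pool holds at least min_recommendation_pool_size items.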
        with TimeProfile(
                f"Filter users with less than {min_recommendation_pool_size} transactions"
        ):
            transactions_df = test_transactions.df.iloc[
                :, :TRANSACTIONS_RATING_COL]
            transactions_df = transactions_df.iloc[
                (~transactions_df.duplicated()).values, :]
            transactions_df = transactions_df.rename(columns=dict(
                zip(transactions_df.columns, [USER_COLUMN, ITEM_COLUMN])))
            user_group_size = transactions_df.groupby(USER_COLUMN,
                                                      as_index=False).count()
            valid_users_df = user_group_size[[
                USER_COLUMN
            ]][user_group_size[ITEM_COLUMN] >= min_recommendation_pool_size]
            transactions_df = pd.merge(left=transactions_df,
                                       right=valid_users_df,
                                       how='inner')
            transactions = TransactionDataset(transactions_df)

        recommendations = self._recommend(learner,
                                          transactions=transactions,
                                          K=max_recommended_item_count,
                                          user_features=user_features,
                                          item_features=item_features)
        return self._format_recommendations(
            recommendations,
            return_ratings,
            K=max_recommended_item_count,
            score_column_names_build_method=build_rated_ranking_column_names)
    def train(self, transactions: TransactionDataset):
        instances_count = transactions.row_size
        # Steps per epoch; cast to int because RunConfig expects integer step counts.
        batches_count = int(np.ceil(instances_count / self.hyper_params.batch_size))
        module_logger.info(
            f"Got {instances_count} training instances and {batches_count} batches per epoch."
        )
        run_config = tf.estimator.RunConfig(
            tf_random_seed=self.random_seed,
            log_step_count_steps=batches_count,  # log loss after each epoch
            save_checkpoints_steps=batches_count * self.hyper_params.epochs,
            keep_checkpoint_max=1)
        module_logger.info(f"Build model:\n{self.hyper_params}")
        self.build_model(run_config=run_config)
        input_fn = self.get_input_fn(transactions=transactions,
                                     batch_size=self.hyper_params.batch_size,
                                     epochs=self.get_epochs(),
                                     shuffle=True)
        hooks = []
        if self.mpi_support:
            # Sync initial variable values from rank 0 to all Horovod workers.
            hooks.append(_HVD_LIB.BroadcastGlobalVariablesHook(0))
        try:
            with TimeProfile("Training Wide & Deep recommendation model"):
                module_logger.info(
                    f"Start training model, rank {self.hvd_rank}")
                self.estimator.train(input_fn=input_fn, hooks=hooks)
        except tf.estimator.NanLossDuringTrainingError as e:
            raise NanLossDuringTrainingError from e
    def score(self,
              learner: WideNDeepModel,
              test_transactions: TransactionDataset,
              user_features: FeatureDataset = None,
              item_features: FeatureDataset = None,
              **kwargs):
        module_logger.info(
            "Recommendation task: Predict rating for user-item pairs.")
        super().score(learner, test_transactions, user_features, item_features,
                      **kwargs)
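        # Keep only the user and item columns and drop duplicate pairs before scoring.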
        test_transactions_df = test_transactions.df.iloc[
            :, :TRANSACTIONS_RATING_COL].copy()
        test_transactions_df = test_transactions_df.iloc[
            (~test_transactions_df.duplicated()).values, :]
        test_transactions = TransactionDataset(test_transactions_df,
                                               name=test_transactions.name)
        res_df = self._predict(learner,
                               test_transactions,
                               user_features=user_features,
                               item_features=item_features)
        res_df.columns = build_regression_column_names()

        return res_df
    parser.add_argument(
        '--boolean-parameter', type=str,
        help='A boolean parameter.',
    )
    parser.add_argument(
        '--enum-parameter', type=str,
        help='An enum parameter.',
    )
    parser.add_argument(
        '--output-path',
        help='The output directory.',
    )

    args, _ = parser.parse_known_args()

    logger.info(f"Hello world MPI from {PACKAGE_NAME} {VERSION}")

    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()

    str_param = args.string_parameter
    int_param = args.int_parameter
    bool_param = args.boolean_parameter
    enum_param = args.enum_parameter

    logger.debug(f"Received parameters:")
    logger.debug(f"    {str_param}")
    logger.debug(f"    {int_param}")
    logger.debug(f"    {bool_param}")
    logger.debug(f"    {enum_param}")
Example #10
    parser.add_argument(
        '--ranking-metric',
        type=str,
        help='The ranking metric used for item recommendation.')
    parser.add_argument('--top-k',
                        type=int,
                        help='The number of top items to recommend.')
    parser.add_argument('--sort-top-k', type=str, help='Sort top k results.')
    parser.add_argument(
        '--remove-seen-items',
        type=str,
        help='Remove items seen in training from recommendation')
    parser.add_argument('--score-result', help='Ratings or items to output')

    args, _ = parser.parse_known_args()

    logger.info(f"Arguments: {args}")
    sort_top_k = strtobool(args.sort_top_k) if args.sort_top_k else None
    remove_seen_items = strtobool(
        args.remove_seen_items) if args.remove_seen_items else None
    normalize = strtobool(args.normalize) if args.normalize else None

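    # Load the trained SAR model and the DataFrame to score from their input directories.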
    sar_model = load_model_from_directory(args.trained_model,
                                          model_loader=joblib_loader).data
    dataset_to_score = load_data_frame_from_directory(
        args.dataset_to_score).data
    logger.debug(f"Shape of loaded DataFrame: {dataset_to_score.shape}")

    score_sar_module = ScoreSARModule(model=sar_model,
                                      input_data=dataset_to_score)

    score_type = ScoreType(args.score_type)
Example #11
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--input-path', help='Input Dataframe path')

    parser.add_argument('--detect-mode',
                        choices=['AnomalyOnly', 'AnomalyAndMargin'],
                        help='Specify the detect mode.')

    parser.add_argument('--timestamp-column',
                        help='Choose the column that contains timestamps.')

    parser.add_argument('--value-column',
                        help='Choose the column that contains values.')

    parser.add_argument(
        '--batch-size',
        type=int,
        help=
        'This parameter specifies the size of each batch on which detection is performed.'
    )

    parser.add_argument(
        '--threshold',
        type=float,
        help=
        'This parameter specifies the threshold anomaly score above which a point is judged to be an anomaly.'
    )

    parser.add_argument(
        '--sensitivity',
        type=float,
        help=
        'This parameter is used in AnomalyAndMargin mode to control the width of margin.'
    )

    parser.add_argument(
        '--append-mode',
        type=str2bool,
        default=False,
        help=
        'Whether to append the detection results as new columns to the input data.'
    )

    parser.add_argument(
        '--compute-stats-in-visualization',
        type=str2bool,
        default=False,
        help='Enable this parameter to get stats visualization.')

    parser.add_argument('--output-path', help='Output Dataframe path')

    args, _ = parser.parse_known_args()

    logger.info(f"Hello world from {PACKAGE_NAME} {VERSION}")

    logger.debug("Received parameters:")
    logger.debug(f"input: {args.input_path}")
    logger.debug(f"detect mode: {args.detect_mode}")
    logger.debug(f"timestamp column: {args.timestamp_column}")
    logger.debug(f"value column: {args.value_column}")
    logger.debug(f"batch size: {args.batch_size}")
    logger.debug(f"threshold: {args.threshold}")
    logger.debug(f"sensitivity: {args.sensitivity}")
    logger.debug(f"appendMode: {args.append_mode}")
    logger.debug(f"appendMode: {args.compute_stats_in_visualization}")
    logger.debug(f"output path: {args.output_path}")

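    # Run anomaly detection with the parsed arguments.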
    invoke(args.input_path, args.detect_mode, args.timestamp_column,
           args.value_column, args.batch_size, args.threshold,
           args.sensitivity, args.append_mode,
           args.compute_stats_in_visualization, args.output_path)