Example #1
def load_split_dataset(config):
    """Load split dataset.

    Args:
        config (dict): Configuration dictionary.

    Returns:
        train_data (DataFrame): Interactions for training.
        valid_data (list of DataFrame): List of interactions for validation.
        test_data (list of DataFrame): List of interactions for testing.
    """
    print_dict_as_table(config["dataset"], tag="Dataset config")
    dataset_mapping = {
        "ml_100k": Movielens_100k,
        "ml_1m": Movielens_1m,
        "ml_25m": Movielens_25m,
        "last_fm": LastFM,
        "tafeng": Tafeng,
        "epinions": Epinions,
        "dunnhumby": Dunnhumby,
        "instacart": Instacart,
        "instacart_25": Instacart_25,
    }
    dataset = dataset_mapping[config["dataset"]["dataset"]](
        root_dir=config["system"]["root_dir"])
    return dataset.load_split(config["dataset"])
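
A minimal call sketch for the example above, assuming a config dict of the shape the function reads; the import path and any extra split options are assumptions, not the project's confirmed API:

# from data_util import load_split_dataset  # hypothetical import path

config = {
    "system": {"root_dir": "./"},  # project root used to locate the raw data
    "dataset": {
        "dataset": "ml_100k",  # key looked up in dataset_mapping above
        # any further split options are passed straight to dataset.load_split(...)
    },
}

# train_data, valid_data, test_data = load_split_dataset(config)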
Example #2
def test_eval_worker(testEngine, eval_data_df, prediction):
    """Start a worker for the evaluation during training.

    Prediction and evaluation on the testing set.
    """
    result_para = {
        "run_time": [testEngine.config["run_time"]],
    }
    testEngine.n_worker += 1
    for cfg in ["model", "dataset"]:
        for col in testEngine.config[cfg]["result_col"]:
            result_para[col] = [testEngine.config[cfg][col]]

    test_result_dic = evaluate(eval_data_df, prediction, testEngine.metrics,
                               testEngine.k)
    print_dict_as_table(
        test_result_dic,
        tag="performance on test",
        columns=["metrics", "values"],
    )
    test_result_dic.update(result_para)
    lock_test_eval.acquire()  # needs to be tested
    result_df = pd.DataFrame(test_result_dic)
    save_to_csv(result_df, testEngine.config["system"]["result_file"])
    lock_test_eval.release()
    testEngine.n_worker -= 1
    return test_result_dic
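
The save step above builds a one-row DataFrame from the result dict and writes it to a shared CSV under a lock, so concurrent test workers do not interleave writes. A self-contained sketch of that pattern (the helper name and file path are illustrative, not the project's `save_to_csv`):

import os
import threading

import pandas as pd

lock_test_eval = threading.Lock()


def append_result(result_dic, result_file):
    """Append a one-row result dict to a CSV file, writing the header only once."""
    result_df = pd.DataFrame(result_dic)
    with lock_test_eval:  # equivalent to the acquire()/release() pair in the example
        header = not os.path.exists(result_file)
        result_df.to_csv(result_file, mode="a", header=header, index=False)


append_result({"run_time": [12.3], "ndcg@10": [0.21]}, "result.csv")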
Example #3
    def __init__(self, config, gcn_config=None):
        self.model = MLP(config)
        print_dict_as_table(config, tag="MLP config")
        self.gcn_config = gcn_config
        super(MLPEngine, self).__init__(config)
        self.model.to(self.device)
        if gcn_config is not None:
            self.load_pretrain_weights()
Example #4
    def __init__(self, config, gcn_config=None, mlp_config=None):
        self.model = NeuMF(config)
        print_dict_as_table(config, tag="Neumf config")
        self.gcn_config = gcn_config
        self.mlp_config = mlp_config
        super(NeuMFEngine, self).__init__(config)
        print(self.model)
        if gcn_config is not None and mlp_config is not None:
            self.load_pretrain_weights()
Example #5
File: gcn.py  Project: anonymcodes/t-vbr
    def __init__(self, config):
        self.config = config
        print_dict_as_table(config, tag="GCN config")
        self.model = GCN_S(config)
        self.regs = config["gcn_config"]["regs"]  # reg is the regularisation
        self.decay = self.regs[0]
        self.batch_size = config["batch_size"]
        self.num_batch = config["num_batch"]
        self.user_fea_norm_adj = config["user_fea_norm_adj"]
        self.item_fea_norm_adj = config["item_fea_norm_adj"]
        super(GCN_SEngine, self).__init__(config)
Example #6
def train_eval_worker(testEngine,
                      valid_df,
                      test_df,
                      valid_pred,
                      test_pred,
                      epoch,
                      top_k=10):
    """ Thread worker for the evaluation during training

    Args:
        testEngine:
        valid_df:
        test_df:
        valid_pred:
        test_pred:
        epoch:
        top_k:

    Returns:

    """
    testEngine.n_worker += 1
    valid_result = evaluate(valid_df, valid_pred, testEngine.metrics, top_k)
    test_result = evaluate(test_df, test_pred, testEngine.metrics, top_k)
    lock_train_eval.acquire()  # needs to be tested
    testEngine.record_performance(valid_result, test_result, epoch)
    testEngine.expose_performance(valid_result, test_result)
    if (valid_result[testEngine.config["validate_metric"]] >
            testEngine.best_valid_performance):
        testEngine.n_no_update = 0
        print(
            f"Current testEngine.best_valid_performance {testEngine.best_valid_performance}"
        )
        testEngine.best_valid_performance = valid_result[
            testEngine.config["validate_metric"]]
        print_dict_as_table(
            valid_result,
            tag=f"performance on validation at epoch {epoch}",
            columns=["metrics", "values"],
        )
        print_dict_as_table(
            test_result,
            tag=f"performance on testing at epoch {epoch}",
            columns=["metrics", "values"],
        )
    else:
        testEngine.n_no_update += 1
        print(f"number of epochs that have no update {testEngine.n_no_update}")

    testEngine.n_worker -= 1
    lock_train_eval.release()
    # lock record and get best performance
    return valid_result, test_result
Example #7
    def __init__(self, config):
        self.config = config
        print_dict_as_table(config["model"], tag="MF model config")
        self.model = MF(config["model"])
        # The regularization coefficient; defaults to 0.0 when not configured.
        self.reg = config["model"]["reg"] if "reg" in config["model"] else 0.0
        self.batch_size = config["model"]["batch_size"]
        super(MFEngine, self).__init__(config)
        self.model.to(self.device)
        self.loss = (self.config["model"]["loss"]
                     if "loss" in self.config["model"] else "bpr")
        print(f"using {self.loss} loss...")
Example #8
def train_eval_worker(testEngine, valid_df, test_df, valid_pred, test_pred,
                      epoch):
    """Start a worker for the evaluation during training.

    Args:
        testEngine: evaluation engine holding the configs, metrics and counters.
        valid_df (DataFrame): validation interactions.
        test_df (DataFrame): testing interactions.
        valid_pred: predictions on the validation set.
        test_pred: predictions on the testing set.
        epoch (int): current training epoch.

    Returns:
        (dict, dict): performances on the validation and testing sets.
    """
    testEngine.n_worker += 1
    valid_result = evaluate(valid_df, valid_pred, testEngine.metrics,
                            testEngine.valid_k)
    test_result = evaluate(test_df, test_pred, testEngine.metrics,
                           testEngine.valid_k)
    lock_train_eval.acquire()
    testEngine.record_performance(valid_result, test_result, epoch)
    testEngine.expose_performance(valid_result, test_result)
    if (valid_result[f"{testEngine.valid_metric}@{testEngine.valid_k}"] >
            testEngine.best_valid_performance):
        testEngine.n_no_update = 0
        print(
            f"Current testEngine.best_valid_performance {testEngine.best_valid_performance}"
        )
        testEngine.best_valid_performance = valid_result[
            f"{testEngine.valid_metric}@{testEngine.valid_k}"]
        print_dict_as_table(
            valid_result,
            tag=f"performance on validation at epoch {epoch}",
            columns=["metrics", "values"],
        )
        print_dict_as_table(
            test_result,
            tag=f"performance on testing at epoch {epoch}",
            columns=["metrics", "values"],
        )
    else:
        testEngine.n_no_update += 1
        print(f"number of epochs that have no update {testEngine.n_no_update}")

    testEngine.n_worker -= 1
    lock_train_eval.release()
    # lock record and get best performance
    return valid_result, test_result
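
Both `train_eval_worker` variants are meant to run off the training thread, which is why they bump `n_worker` and serialise the bookkeeping behind `lock_train_eval`. A minimal, self-contained sketch of how such a worker is typically launched; the stand-in body below only mimics the locking, and all names are illustrative:

import threading

lock_train_eval = threading.Lock()


def train_eval_worker(engine, valid_df, test_df, valid_pred, test_pred, epoch):
    # Stand-in body: evaluate, then record results while holding the lock.
    with lock_train_eval:
        print(f"epoch {epoch}: got {len(valid_pred)} validation predictions")


# Kick off the evaluation in the background so training can continue.
worker = threading.Thread(
    target=train_eval_worker,
    args=(None, None, None, [0.1, 0.9], [0.2, 0.8], 5),
)
worker.start()
worker.join()  # in practice the workers are joined at the end of training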
Example #9
def test_eval_worker(testEngine, eval_data_df, prediction, k_li=[5, 10, 20]):
    """
    Prediction and evaluation on the testing set
    """
    result_para = {
        "model": [testEngine.config["model"]],
        "dataset": [testEngine.config["dataset"]],
        "data_split": [testEngine.config["data_split"]],
        "emb_dim": [int(testEngine.config["emb_dim"])],
        "lr": [testEngine.config["lr"]],
        "batch_size": [int(testEngine.config["batch_size"])],
        "optimizer": [testEngine.config["optimizer"]],
        "max_epoch": [testEngine.config["max_epoch"]],
        "model_run_id": [testEngine.config["model_run_id"]],
        "run_time": [testEngine.config["run_time"]],
    }
    if "late_dim" in testEngine.config:
        result_para["late_dim"] = [int(testEngine.config["late_dim"])]
    if "remark" in testEngine.config:
        result_para["remark"] = [testEngine.config["remark"]]
    if "alpha" in testEngine.config:
        result_para["alpha"] = [testEngine.config["alpha"]]
    if "activator" in testEngine.config:
        result_para["activator"] = [testEngine.config["activator"]]
    if "item_fea_type" in testEngine.config:
        result_para["item_fea_type"] = [testEngine.config["item_fea_type"]]
    if "n_sample" in testEngine.config:
        result_para["n_sample"] = [testEngine.config["n_sample"]]
    if "time_step" in testEngine.config:
        result_para["time_step"] = [testEngine.config["time_step"]]

    test_result_dic = evaluate(eval_data_df, prediction, testEngine.metrics,
                               k_li)
    print_dict_as_table(
        test_result_dic,
        tag=f"performance on test",
        columns=["metrics", "values"],
    )
    test_result_dic.update(result_para)
    lock_test_eval.acquire()  # needs to be tested
    result_df = pd.DataFrame(test_result_dic)
    save_to_csv(result_df, testEngine.config["result_file"])
    lock_test_eval.release()
    return test_result_dic
Example #10
    def prepare_env(self):
        """Prepare running environment.

        * Load parameters from json files.
        * Initialize system folders, model name and the paths to be saved.
        * Initialize resource monitor.
        * Initialize random seed.
        * Initialize logging.
        """
        # Load config file from json
        with open(self.args.config_file) as config_params:
            print(f"loading config file {self.args.config_file}")
            config = json.load(config_params)

        # Update configs with the args received from the command line.
        update_args(config, self.args)

        # Obtain the absolute path of the project root.
        config["system"]["root_dir"] = os.path.abspath(
            config["system"]["root_dir"])

        # Construct a unique model run id, which consists of the model name, config id and a timestamp.
        timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        random_str = "".join(
            [random.choice(string.ascii_lowercase) for n in range(6)])
        config["system"]["model_run_id"] = (config["model"]["model"] + "_" +
                                            config["model"]["config_id"] +
                                            "_" + timestamp_str + "_" +
                                            random_str)

        # Initialize random seeds
        set_seed(config["system"]["seed"] if "seed" in
                 config["system"] else 2020)

        # Initialize working folders
        self.initialize_folders(config)

        config["system"]["process_dir"] = os.path.join(
            config["system"]["root_dir"], config["system"]["process_dir"])

        # Initialize log file
        config["system"]["log_file"] = os.path.join(
            config["system"]["root_dir"],
            config["system"]["log_dir"],
            config["system"]["model_run_id"],
        )
        logger.init_std_logger(config["system"]["log_file"])

        print("Python version:", sys.version)
        print("pytorch version:", torch.__version__)

        #  File paths to be saved
        config["model"]["run_dir"] = os.path.join(
            config["system"]["root_dir"],
            config["system"]["run_dir"],
            config["system"]["model_run_id"],
        )
        config["system"]["run_dir"] = config["model"]["run_dir"]
        print(
            "The intermediate running statuses will be reported in folder:",
            config["system"]["run_dir"],
        )

        config["system"]["tune_dir"] = os.path.join(
            config["system"]["root_dir"], config["system"]["tune_dir"])

        def get_user_temp_dir():
            tempdir = os.path.join(config["system"]["root_dir"], "tmp")
            print(f"ray temp dir {tempdir}")
            return tempdir

        ray.utils.get_user_temp_dir = get_user_temp_dir

        #  Model checkpoints paths to be saved
        config["system"]["model_save_dir"] = os.path.join(
            config["system"]["root_dir"],
            config["system"]["checkpoint_dir"],
            config["system"]["model_run_id"],
        )
        ensureDir(config["system"]["model_save_dir"])
        print("Model checkpoint will save in file:",
              config["system"]["model_save_dir"])

        config["system"]["result_file"] = os.path.join(
            config["system"]["root_dir"],
            config["system"]["result_dir"],
            config["system"]["result_file"],
        )
        print("Performance result will save in file:",
              config["system"]["result_file"])

        print_dict_as_table(config["system"], "System configs")
        return config
Example #11
def prepare_env(config):
    """Prepare running environment
        - Load parameters from json files.
        - Initialize system folders, model name and the paths to be saved.
        - Initialize resource monitor.
        - Initialize random seed.
        - Initialize logging.

    Args:
        config (dict): Global configs.

    """
    # Obtain the absolute path of the project root.
    # You need to specify it if it is running in a container.
    if "root_dir" not in config:
        file_dir = os.path.dirname(os.path.abspath(__file__))
        config["root_dir"] = os.path.abspath(os.path.join(file_dir, ".."))

    # load config file from json
    with open(config["config_file"]) as config_params:
        print("loading config file", config["config_file"])
        json_config = json.load(config_params)

    # Update global parameters with the parameters received from the command line.
    json_config.update(config)
    config = json_config

    # Construct a unique model run id, which consists of the model name, config id and a timestamp.
    timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    random_str = "".join([random.choice(string.ascii_lowercase) for n in range(6)])
    config["model_run_id"] = (
        config["model"]
        + "_"
        + config["config_id"]
        + "_"
        + timestamp_str
        + "_"
        + random_str
    )
    set_seed(config["seed"] if "seed" in config else 2020)
    initialize_folders(config["root_dir"])

    # Initialize log file
    config["log_file"] = os.path.join(
        config["root_dir"], config["log_dir"], config["model_run_id"]
    )
    logger.init_std_logger(config["log_file"])

    print("python version:", sys.version)
    print("pytorch version:", torch.__version__)

    #  File paths to be saved
    config["run_dir"] = os.path.join(
        config["root_dir"], config["run_dir"], config["model_run_id"]
    )
    print(
        "The intermediate running statuses will be reported in folder:",
        config["run_dir"],
    )

    #  Model checkpoints paths to be saved
    config["model_save_dir"] = os.path.join(
        config["root_dir"], config["checkpoint_dir"], config["model_run_id"]
    )
    ensureDir(config["model_save_dir"])
    print("Model checkpoint will save in file:", config["model_save_dir"])

    config["result_file"] = os.path.join(
        config["root_dir"], config["result_dir"], config["result_file"]
    )
    print("Performance result will save in file:", config["result_file"])

    print_dict_as_table(config, "Model configs")
    return config
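
The unique `model_run_id` built in both `prepare_env` variants above is just the model name, config id, a timestamp and a short random suffix; the construction uses only the standard library and can be reproduced in isolation (the example values are illustrative):

import random
import string
from datetime import datetime

model_name, config_id = "mf", "default"  # illustrative values
timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
random_str = "".join(random.choice(string.ascii_lowercase) for _ in range(6))
model_run_id = f"{model_name}_{config_id}_{timestamp_str}_{random_str}"
print(model_run_id)  # e.g. mf_default_20240101_120000_abcdef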