def setUp(self):
     """Load the two phrase-context test configs and build their datasets.

     Both JSON configs are read from the ``test_configs`` directory next to
     this file and stored on the instance. Each config is then handed to
     ``training_utils.get_datasets``; only the third element of the returned
     triple is kept (presumably the test split -- confirm against
     ``get_datasets``).
     """
     here = pathlib.Path(__file__).parent.absolute()
     static_path = str(
         here.joinpath("test_configs/phrase_context_static_config.json"))
     contextualized_path = str(
         here.joinpath("test_configs/phrase_context_contextualized_config.json"))

     # Read both configs before building any dataset, so a missing file
     # fails early.
     with open(static_path, 'r') as static_file:
         self.config_static = json.load(static_file)
     with open(contextualized_path, 'r') as contextualized_file:
         self.config_contextualized = json.load(contextualized_file)

     _, _, self.static_set = training_utils.get_datasets(
         self.config_static)
     _, _, self.bert_set = training_utils.get_datasets(
         self.config_contextualized)
    def setUp(self):
        """Prepare one CPU batch from the simple-phrase config and fix the layer sizes.

        The config is read from ``test_configs/simple_phrase_config.json``
        next to this file. The third dataset returned by
        ``training_utils.get_datasets`` is wrapped in a ``DataLoader`` with
        batch size 2, and its first batch is stored with a ``"device"`` entry
        set to ``"cpu"``.
        """
        test_dir = pathlib.Path(__file__).parent.absolute()
        config_path = str(test_dir.joinpath("test_configs/simple_phrase_config.json"))
        with open(config_path, 'r') as config_file:
            self.config_static = json.load(config_file)

        _, _, self._static_dataset = training_utils.get_datasets(self.config_static)
        self._data = DataLoader(self._static_dataset, batch_size=2)

        # Only the first batch is needed; it is tagged with the device key.
        self._batch = next(iter(self._data))
        self._batch["device"] = "cpu"

        # Dimensions shared by the tests of this case.
        self.input_dim, self.hidden_dim, self.labels = 600, 6, 6
示例#3
0
    def setUp(self):
        """Create the datasets, loaders and matrix models used by this test case.

        Resolves the config, pretraining data and embedding paths relative to
        this file, builds a loader for the third dataset returned by
        ``training_utils.get_datasets`` and one for a ``StaticRankingDataset``,
        then instantiates the classifier, pretrain, transfer-classifier and
        transfer-ranker models.
        """
        base_dir = pathlib.Path(__file__).parent.absolute()
        config_path = str(
            base_dir.joinpath("test_configs/simple_phrase_config.json"))
        pretrain_data_path = str(
            base_dir.joinpath("data_pretraining/train.txt"))
        embedding_path = str(
            base_dir.joinpath(
                "embeddings/german-structgram-mincount-30-ctx-10-dims-300.fifu"))

        with open(config_path, 'r') as config_file:
            self.config_static = json.load(config_file)

        # Classification data: keep only the last of the three datasets.
        _, _, self.simple_phrase_test = training_utils.get_datasets(
            self.config_static)
        self.data_loader = DataLoader(dataset=self.simple_phrase_test,
                                      batch_size=4)

        # Ranking data used for pretraining.
        self.pretrain_dataset = StaticRankingDataset(
            data_path=pretrain_data_path,
            embedding_path=embedding_path,
            separator=" ",
            head="head",
            mod="modifier",
            phrase="phrase")
        self.pretrain_loader = DataLoader(dataset=self.pretrain_dataset,
                                          batch_size=4)

        # Keyword arguments shared by several of the models below.
        regularisation = {"dropout_rate": 0.1, "normalize_embeddings": True}
        classifier_shape = {"input_dim": 300, "hidden_dim": 100, "label_nr": 3}

        self.model_multiclass = MatrixTwoWordClassifier(**classifier_shape,
                                                        **regularisation)
        self.model_pretrain = MatrixPretrain(input_dim=300, **regularisation)

        # NOTE(review): the training helpers run before the transfer models
        # are built -- presumably they write the checkpoints named in
        # ``pretrained_model``; keep this order.
        self.train_matrix_classifier()
        self.model_transfer = MatrixTransferClassifier(
            **classifier_shape,
            **regularisation,
            pretrained_model="models/matrix_classifier")
        self.train_matrix_pretrain()
        self.model_transfer_rank = MatrixTransferRanker(
            **regularisation,
            pretrained_model="models/matrix_pretrain")
示例#4
0
    with open(argp.path_to_config, 'r') as f:
        config = json.load(f)
    prediction_path_dev = str(
        Path(config["model_path"]).joinpath(config["save_name"] +
                                            "_dev_predictions.npy"))
    prediction_path_test = str(
        Path(config["model_path"]).joinpath(config["save_name"] +
                                            "_test_predictions.npy"))
    eval_path_dev = str(
        Path(config["model_path"]).joinpath(config["save_name"] +
                                            "_evaluation_dev.txt"))
    eval_path_test = str(
        Path(config["model_path"]).joinpath(config["save_name"] +
                                            "_evaluation_test.txt"))
    dataset_train, dataset_valid, dataset_test = get_datasets(config)

    # load the whole validation set as a single batch (batch_size == len(dataset_valid))
    valid_loader = DataLoader(dataset_valid,
                              batch_size=len(dataset_valid),
                              shuffle=False)

    # load the whole test set as a single batch (batch_size == len(dataset_test))
    test_loader = DataLoader(dataset_test,
                             batch_size=len(dataset_test),
                             shuffle=False)
    if argp.ranking:
        rank_path_dev = config["model_path"] + "_dev_ranks.txt"
        rank_path_test = config["model_path"] + "_test_ranks.txt"
        labels = extract_all_labels(
            training_data=config["train_data_path"],
        model_path, "_reconstructed_rep", "dev")
    prediction_path_test_reconstructed, rank_path_test_reconstructed = get_save_path(
        model_path, "_reconstructed_rep", "test")

    logging.config.dictConfig(create_config(log_file))
    logger = logging.getLogger("train")
    logger.info(
        "Training a joint model with the following parameter for the first dataset: %s"
        % str(config_1))
    logger.info("The following parameter for the second dataset: %s" %
                str(config_2))

    # set random seed
    np.random.seed(config_1["seed"])
    # create two PretrainCompModel datasets
    dataset_train_1, dataset_valid_1, dataset_test_1 = get_datasets(config_1)
    dataset_train_2, dataset_valid_2, dataset_test_2 = get_datasets(config_2)

    assert type(dataset_train_1) == StaticRankingDataset and type(
        dataset_train_2
    ) == StaticRankingDataset, "the dataset type is invalid for this kind of training"

    labels_dataset_1 = extract_all_words(
        training_data=config_1["train_data_path"],
        validation_data=config_1["validation_data_path"],
        test_data=config_1["test_data_path"],
        separator=config_1["data_loader"]["separator"],
        modifier=config_1["data_loader"]["modifier"],
        head=config_1["data_loader"]["head"],
        phrase=config_1["data_loader"]["phrase"])
    labels_dataset_2 = extract_all_labels(