Пример #1
0
def run(args):
    """Rebuild a trained multi-task model from disk and prepare it for inference.

    One dataset reader is created per selected task, the ELMo-embedded
    ``MultiTaskCRFTaggerAndClassifier`` is re-instantiated, and the vocabulary
    and ``best.th`` weights stored under ``args.model_dir`` are loaded into
    it. The model is then switched to evaluation / inference mode.

    Only the attributes needed for loading are read from ``args``; training
    hyper-parameters and dataset path files are not required here.

    Args:
        args: Parsed argument namespace. Attributes read directly are
            ``task`` (iterable of task names that are keys of
            ``TASK_CONFIGS``), ``proj_dim``, ``dropout``, ``batch_size``,
            ``model_dir`` and ``cuda``. The whole namespace is also forwarded
            to ``get_task_encoder_dict``.

    Returns:
        tuple: ``(TASKS, vocab, model, readers, test_iterator)`` where

        * ``TASKS`` is the list of task configs for the selected tasks,
        * ``vocab`` is the vocabulary loaded from ``<model_dir>/vocabulary``,
        * ``model`` is the loaded model in eval mode,
        * ``readers`` maps each task's ``tag_namespace`` to its
          ``JSONDatasetReader``,
        * ``test_iterator`` is a batch iterator indexed with ``vocab`` that
          uses twice ``args.batch_size``.

    Raises:
        KeyError: If a name in ``args.task`` is not a key of ``TASK_CONFIGS``.
    """
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    DROPOUT = args.dropout
    BATCH_SIZE = args.batch_size
    SERIALIZATION_DIR = args.model_dir
    CUDA_DEVICE = cuda_device(args.cuda)

    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]

    # Maps a tag namespace to its position in TASK_CONFIGS. Because this is a
    # bound ``dict.get``, an unknown namespace yields None instead of raising.
    tag_namespace_hashing_fn = {
        tag_namespace: i
        for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    # Every reader shares the same ELMo character indexer.
    token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    readers = {
        task.tag_namespace: JSONDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }

    # ``options_file`` and ``weight_file`` are resolved from module scope.
    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()

    # The task "suffix" is whatever follows the last underscore of the task
    # name; a name without an underscore is used whole.
    task_suffixes = {
        task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES
    }
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)

    vocab = Vocabulary.from_files(os.path.join(SERIALIZATION_DIR,
                                               "vocabulary"))

    model = MultiTaskCRFTaggerAndClassifier(word_embeddings, encoders, vocab,
                                            TASKS)

    # Without CUDA the stored tensors are remapped to the CPU while loading;
    # with CUDA the default (None) keeps the locations stored in the file.
    map_location = None if args.cuda else "cpu"
    model.load_state_dict(
        torch.load(os.path.join(SERIALIZATION_DIR, "best.th"),
                   map_location=map_location))
    if args.cuda:
        model = model.cuda(device=CUDA_DEVICE)

        # Empty cache to ensure larger batch can be loaded for testing
        torch.cuda.empty_cache()
    logger.info("Evaluating on test data")

    test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                   batch_size=BATCH_SIZE * 2)
    test_iterator.index_with(vocab)
    model = model.eval()
    model.set_inference_mode(True)
    return TASKS, vocab, model, readers, test_iterator
Пример #2
0
def run(args):
    """Train (unless ``args.test_mode``) and evaluate a multi-task CRF tagger.

    In training mode the train/dev splits of every selected task are read, a
    vocabulary is built from them, the run arguments and vocabulary are
    written to ``args.model_dir``, the model is trained with Adam and the
    training statistics are saved as ``training_stats.json``. In test mode
    the vocabulary and ``best.th`` weights are loaded from ``args.model_dir``
    instead. In both modes the model is finally evaluated on each task's test
    file and the results are written to ``test_stats.json``.

    The model is only moved to the GPU when ``args.cuda`` is truthy, and
    stored weights are remapped to the CPU otherwise, so the function can
    also run on a machine without CUDA.

    Args:
        args: Parsed argument namespace. Attributes read directly are
            ``dataset_paths_file``, ``dataset_path_prefix``, ``task``,
            ``proj_dim``, ``dropout``, ``lr``, ``weight_decay``,
            ``batch_size``, ``epochs``, ``patience``, ``model_dir``,
            ``clean_model_dir``, ``cuda`` and ``test_mode``. The whole
            namespace is also forwarded to ``get_task_encoder_dict`` and
            dumped to ``arguments.json`` when training.

    Returns:
        None. Results are persisted as JSON files under ``args.model_dir``.

    Raises:
        KeyError: If a selected task name is missing from ``TASK_CONFIGS`` or
            from the loaded dataset paths.
    """
    ALL_DATASET_PATHS = get_all_dataset_paths(args.dataset_paths_file,
                                              args.dataset_path_prefix)
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    DROPOUT = args.dropout
    LR = args.lr
    WEIGHT_DECAY = args.weight_decay
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.epochs
    PATIENCE = args.patience
    SERIALIZATION_DIR = args.model_dir
    CLEAN_MODEL_DIR = args.clean_model_dir
    # NOTE(review): presumably cuda_device() yields a CPU sentinel when
    # args.cuda is falsy, since the value is passed on to Trainer and
    # evaluate_multiple_data unconditionally - TODO confirm.
    CUDA_DEVICE = cuda_device(args.cuda)
    USE_CUDA = bool(args.cuda)
    TEST_MODE = args.test_mode

    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]
    dataset_paths = {
        task_name: ALL_DATASET_PATHS[task_name]
        for task_name in SELECTED_TASK_NAMES
    }

    # Maps a tag namespace to its position in TASK_CONFIGS. Because this is a
    # bound ``dict.get``, an unknown namespace yields None instead of raising.
    tag_namespace_hashing_fn = {
        tag_namespace: i
        for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    # Every reader shares the same ELMo character indexer.
    token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    readers = {
        task.tag_namespace: ConLLDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }

    # ``options_file`` and ``weight_file`` are resolved from module scope.
    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()

    # The task "suffix" is whatever follows the last underscore of the task
    # name; a name without an underscore is used whole.
    task_suffixes = {
        task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES
    }
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)

    if not TEST_MODE:
        train_dataset = read_datasets(dataset_paths,
                                      readers,
                                      data_split="train")
        validation_dataset = read_datasets(dataset_paths,
                                           readers,
                                           data_split="dev")

        vocab = create_vocab([train_dataset, validation_dataset])

        # Special case for CCG / POS: register extra tags in the vocabulary.
        # NOTE(review): presumably these tags can be absent from the
        # train/dev splits yet occur at test time - TODO confirm.
        if "ccg" in task_suffixes or "pos" in task_suffixes:
            for task in TASKS:
                if task.task_type == "ccg":
                    for tag in ["B-NOUN.SHAPE", "I-NOUN.PROCESS"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
                if task.tag_namespace == "ud_pos":
                    for tag in ["CONJ"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)

    else:
        vocab = Vocabulary.from_files(
            os.path.join(SERIALIZATION_DIR, "vocabulary"))

    model = MultiTaskCRFTagger(word_embeddings, encoders, vocab, TASKS)
    # Move to the GPU before the optimizer is created so that it tracks the
    # device-resident parameters; skipped entirely for CPU runs.
    if USE_CUDA:
        model = model.cuda(device=CUDA_DEVICE)

    if not TEST_MODE:
        iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                  batch_size=BATCH_SIZE,
                                                  cache_instances=True)
        iterator.index_with(vocab)

        if CLEAN_MODEL_DIR and os.path.exists(SERIALIZATION_DIR):
            logger.info(f"Deleting {SERIALIZATION_DIR}")
            shutil.rmtree(SERIALIZATION_DIR)
        # Create the directory whenever it is missing, not only after a
        # clean-up, so that the writes below cannot fail on a fresh path.
        if not os.path.isdir(SERIALIZATION_DIR):
            logger.info(f"Creating {SERIALIZATION_DIR}")
            os.makedirs(SERIALIZATION_DIR, exist_ok=True)

        logger.info(
            f"Writing arguments to arguments.json in {SERIALIZATION_DIR}")
        with open(os.path.join(SERIALIZATION_DIR, "arguments.json"),
                  "w+") as fp:
            json.dump(vars(args), fp, indent=2)

        logger.info(f"Writing vocabulary in {SERIALIZATION_DIR}")
        vocab.save_to_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))
        # Use list to ensure each epoch is a full pass through the data
        combined_training_dataset = list(
            roundrobin_iterator(*train_dataset.values()))
        combined_validation_dataset = list(
            roundrobin_iterator(*validation_dataset.values()))

        optimizer = optim.Adam(model.parameters(),
                               lr=LR,
                               weight_decay=WEIGHT_DECAY)

        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=combined_training_dataset,
            validation_dataset=combined_validation_dataset,
            patience=PATIENCE,
            num_epochs=NUM_EPOCHS,
            cuda_device=CUDA_DEVICE,
            serialization_dir=SERIALIZATION_DIR,
        )
        # Kept as a one-element list so the JSON layout of
        # training_stats.json stays the same.
        training_stats = [trainer.train()]

        with open(os.path.join(SERIALIZATION_DIR, "training_stats.json"),
                  "w+") as fp:
            json.dump(training_stats, fp, indent=2)
    else:
        # Without CUDA the stored tensors are remapped to the CPU while
        # loading. load_state_dict copies values into the existing
        # parameters, so the model stays on the device chosen above.
        map_location = None if USE_CUDA else "cpu"
        model.load_state_dict(
            torch.load(os.path.join(SERIALIZATION_DIR, "best.th"),
                       map_location=map_location))

    if USE_CUDA:
        # Empty cache to ensure larger batch can be loaded for testing
        torch.cuda.empty_cache()

    # NOTE(review): dataset_paths is keyed by task name but indexed here by
    # tag_namespace; this assumes the two coincide for every task - TODO
    # confirm against TASK_CONFIGS.
    test_filepaths = {
        task.tag_namespace: dataset_paths[task.tag_namespace]["test"]
        for task in TASKS
    }

    logger.info("Evaluating on test data")

    test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                   batch_size=BATCH_SIZE * 2)
    test_iterator.index_with(vocab)
    model = model.eval()
    test_stats = evaluate_multiple_data(model,
                                        readers,
                                        test_iterator,
                                        test_filepaths,
                                        cuda_device=CUDA_DEVICE)
    with open(os.path.join(SERIALIZATION_DIR, "test_stats.json"), "w+") as fp:
        json.dump(test_stats, fp, indent=2)