Example #1
    def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding='utf-8')
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

        vocab_dir = self.TEST_DIR / 'vocab_save'
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
        assert indexed_tokens == indexed_tokens2
Example #2
    def test_saving_and_loading(self):
        # pylint: disable=protected-access
        vocab_dir = self.TEST_DIR / 'vocab_save'

        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")

        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)

        assert vocab2._non_padded_namespaces == {"a", "c"}

        # Check namespace a.
        assert vocab2.get_vocab_size(namespace='a') == 3
        assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
        assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
        assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
        assert vocab2.get_token_index('a0', namespace='a') == 0
        assert vocab2.get_token_index('a1', namespace='a') == 1
        assert vocab2.get_token_index('a2', namespace='a') == 2

        # Check namespace b.
        assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
        assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
        assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
        assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
        assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
        assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
        assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
        assert vocab2.get_token_index('b2', namespace='b') == 2
        assert vocab2.get_token_index('b3', namespace='b') == 3

        # Check the dictionaries containing the reverse mapping are identical.
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
Example #3
        test_shortest_responses_labels = test_data[6].tolist()
        print("Shortest responses count:", sum(test_shortest_responses_labels))
        print("bucket indices len:", len(test_bucket_indices))

        model_file = os.path.join(
            "saved_softmax_models",
            "decomposable_attention{}{}_model_{}.th".format(
                LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))

        vocabulary_filepath = os.path.join(
            "saved_softmax_models",
            "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                       NEGATIVE_PERCENTAGE))
        print("LOADING VOCABULARY")
        # Load vocabulary
        vocab = Vocabulary.from_files(vocabulary_filepath)

        EMBEDDING_DIM = 300
        PROJECT_DIM = 200
        DROPOUT = 0.2
        NUM_LAYERS = 2
        if EMBEDDING_TYPE == "":
            token_embedding = Embedding(
                num_embeddings=vocab.get_vocab_size('tokens'),
                embedding_dim=EMBEDDING_DIM,
                projection_dim=PROJECT_DIM)
        elif EMBEDDING_TYPE == "_glove":
            token_embedding = Embedding.from_params(vocab=vocab,
                                                    params=Params({
                                                        'pretrained_file':
                                                        glove_embeddings_file,
Example #4
parser.add_argument('--qrel',
                    action='store',
                    dest='qrel',
                    help='qrel, to only check judged queries',
                    required=False)

args = parser.parse_args()

#
# load data & create vocab
# -------------------------------
#

loader = IrTupleDatasetReader(lazy=True, lowercase=True)
vocab = Vocabulary.from_files(args.vocab)
if args.qrel:
    qrels = load_reference(args.qrel)

not_judged = 0
oov_queries = 0
non_oov_queries = 0
oov_count_list = []
instances = loader.read(args.query)

with open(args.out_file_oov, "w", encoding="utf8") as out_file_oov:
    with open(args.out_file_no_oov, "w", encoding="utf8") as out_file_non_oov:

        for i in Tqdm.tqdm(instances):
            id_str = i["source_tokens"].tokens[0].text
            if args.qrel and int(id_str) not in qrels:
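The loop above is cut off mid-branch, but its counters (oov_queries, non_oov_queries, oov_count_list) indicate it classifies each query by whether it contains out-of-vocabulary tokens. A minimal helper sketching that check against the loaded vocabulary (the helper name and the "tokens" namespace are assumptions, not part of the original script):

# Hypothetical sketch of the OOV test the truncated loop appears to
# perform: a token is OOV when it maps to the vocabulary's OOV index.
def count_oov_tokens(vocab, tokens, namespace="tokens"):
    oov_index = vocab.get_token_index(vocab._oov_token, namespace)  # pylint: disable=protected-access
    return sum(1 for t in tokens
               if vocab.get_token_index(t, namespace) == oov_index)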
Example #5
    def from_params(params: Params,
                    serialization_dir: str,
                    recover: bool = False) -> 'TrainerPieces':
        all_datasets = training_util.datasets_from_params(params)
        datasets_for_vocab_creation = set(
            params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(
                    f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info(
            "From dataset instances, %s will be considered for vocabulary creation.",
            ", ".join(datasets_for_vocab_creation))

        if recover and os.path.exists(
                os.path.join(serialization_dir, "vocabulary")):
            vocab = Vocabulary.from_files(
                os.path.join(serialization_dir, "vocabulary"))
        else:
            vocab = Vocabulary.from_params(params.pop(
                "vocabulary", {}), (instance
                                    for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))

        model = Model.from_params(vocab=vocab, params=params.pop('model'))

        # Initializing the model can have the side effect of expanding the vocabulary
        vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(model.vocab)
        validation_iterator_params = params.pop("validation_iterator", None)
        if validation_iterator_params:
            validation_iterator = DataIterator.from_params(
                validation_iterator_params)
            validation_iterator.index_with(model.vocab)
        else:
            validation_iterator = None

        train_data = all_datasets['train']
        validation_data = all_datasets.get('validation')
        test_data = all_datasets.get('test')

        trainer_params = params.pop("trainer")
        no_grad_regexes = trainer_params.pop("no_grad", ())
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad_regexes):
                parameter.requires_grad_(False)

        frozen_parameter_names, tunable_parameter_names = \
                    get_frozen_and_tunable_parameter_names(model)
        logger.info("Following parameters are Frozen  (without gradient):")
        for name in frozen_parameter_names:
            logger.info(name)
        logger.info("Following parameters are Tunable (with gradient):")
        for name in tunable_parameter_names:
            logger.info(name)

        return TrainerPieces(model, iterator, train_data, validation_data,
                             test_data, validation_iterator, trainer_params)
Example #6
def load_decomposable_attention_elmo_softmax_model():
    NEGATIVE_PERCENTAGE = 100
    # EMBEDDING_TYPE = ""
    # LOSS_TYPE = ""				# NLL
    # LOSS_TYPE = "_nll"				# NLL
    LOSS_TYPE = "_mse"  # MSE
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    token_indexers = None
    if EMBEDDING_TYPE == "_elmo" or EMBEDDING_TYPE == "_elmo_retrained" or EMBEDDING_TYPE == "_elmo_retrained_2":
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150 # for bert and elmo
    reader = QuestionResponseSoftmaxReader(token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    model_file = os.path.join(
        "saved_softmax_models",
        "decomposable_attention{}{}_model_{}.th".format(
            LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))

    vocabulary_filepath = os.path.join(
        "saved_softmax_models",
        "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                   NEGATIVE_PERCENTAGE))
    print("LOADING VOCABULARY")
    # Load vocabulary
    vocab = Vocabulary.from_files(vocabulary_filepath)

    EMBEDDING_DIM = 300
    PROJECT_DIM = 200
    DROPOUT = 0.2
    NUM_LAYERS = 2
    if EMBEDDING_TYPE == "":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=EMBEDDING_DIM,
            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_glove":
        token_embedding = Embedding.from_params(vocab=vocab,
                                                params=Params({
                                                    'pretrained_file':
                                                    glove_embeddings_file,
                                                    'embedding_dim':
                                                    EMBEDDING_DIM,
                                                    'projection_dim':
                                                    PROJECT_DIM,
                                                    'trainable':
                                                    False
                                                }))
    elif EMBEDDING_TYPE == "_elmo":
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
        options_file = os.path.join(
            "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json")
        weights_file = os.path.join(
            "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
        # NOTE: using Small size as medium size gave CUDA out of memory error
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
        # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "options.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained_2":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "options_2.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "weights_2.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_bert":
        print("Loading bert model")
        model = BertModel.from_pretrained('bert-base-uncased')
        token_embedding = BertEmbedder(model)
        PROJECT_DIM = 768
    else:
        print("Error: Some weird Embedding type", EMBEDDING_TYPE)
        exit()
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    HIDDEN_DIM = 200
    params = Params({
        'input_dim': PROJECT_DIM,
        'hidden_dims': HIDDEN_DIM,
        'activations': 'relu',
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT
    })
    attend_feedforward = FeedForward.from_params(params)
    similarity_function = DotProductSimilarity()
    params = Params({
        'input_dim': 2 * PROJECT_DIM,
        'hidden_dims': HIDDEN_DIM,
        'activations': 'relu',
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT
    })
    compare_feedforward = FeedForward.from_params(params)
    params = Params({
        'input_dim': 2 * HIDDEN_DIM,
        'hidden_dims': 1,
        'activations': 'linear',
        'num_layers': 1
    })
    aggregate_feedforward = FeedForward.from_params(params)
    model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                         attend_feedforward,
                                         similarity_function,
                                         compare_feedforward,
                                         aggregate_feedforward)
    print("MODEL CREATED")
    # Load model state
    with open(model_file, 'rb') as f:
        model.load_state_dict(torch.load(f, map_location='cuda:0'))
    print("MODEL LOADED!")
    if torch.cuda.is_available():
        # cuda_device = 3
        # model = model.cuda(cuda_device)
        cuda_device = -1  # GPU use is disabled here; inference runs on CPU
    else:
        cuda_device = -1

    predictor = DecomposableAttentionSoftmaxPredictor(model,
                                                      dataset_reader=reader)
    return model, predictor
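One caveat in the loader above: torch.load(f, map_location='cuda:0') raises an error on a CPU-only machine. A more portable sketch picks the device at run time (model_file and model stand in for the objects built in the example):

# Sketch: load a saved state dict onto whatever device is available
# instead of hard-coding 'cuda:0'. model_file and model are
# placeholders for the names used in the example above.
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
with open(model_file, "rb") as f:
    model.load_state_dict(torch.load(f, map_location=device))
model.to(device)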
Example #7
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

tag_logits = predictor.predict(prediction_sentence)['tag_logits']

tag_ids = np.argmax(tag_logits, axis=-1)

print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

# Here's how to save the model.
with open("/tmp/model.th", 'wb') as f:
    torch.save(model.state_dict(), f)

vocab.save_to_files("/tmp/vocabulary")

# And here's how to reload the model.
vocab2 = Vocabulary.from_files("/tmp/vocabulary")

model2 = LstmTagger(word_embeddings, lstm, vocab2)

with open("/tmp/model.th", 'rb') as f:
    model2.load_state_dict(torch.load(f))

if cuda_device > -1:
    model2.cuda(cuda_device)

predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
tag_logits2 = predictor2.predict("The dog ate the apple")['tag_logits']
np.testing.assert_array_almost_equal(tag_logits2, tag_logits)

Example #8
def train():
    reader = PWKPReader()
    train_dataset = reader.read(train_path)
    valid_dataset = reader.read(dev_path)
    if os.path.exists(vocab_dir):
        vocab = Vocabulary.from_files(vocab_dir)
    else:
        vocab = Vocabulary.from_instances(instances=train_dataset,
                                          max_vocab_size=opt.vocab_size)
        vocab.save_to_files(vocab_dir)
    iterator = BucketIterator(batch_size=opt.batch_size,
                              sorting_keys=[("src", "num_tokens"),
                                            ("tgt", "num_tokens")])
    iterator.index_with(vocab)

    model = Seq2Seq(emb_size=opt.emb_size,
                    hidden_size=opt.hidden_size,
                    enc_layers=opt.enc_layers,
                    dec_layers=opt.dec_layers,
                    dropout=opt.dropout,
                    bidirectional=opt.bidirectional,
                    beam_size=opt.beam_size,
                    label_smoothing=opt.label_smoothing,
                    vocab=vocab)

    optimizer = optim.Adam(model.parameters(), lr=opt.lr)
    #learning_rate_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1, gamma=opt.lr_decay)

    val_iterator = BasicIterator(batch_size=opt.batch_size)
    val_iterator.index_with(vocab)

    predictor = Predictor(iterator=val_iterator,
                          max_decoding_step=opt.max_step,
                          vocab=vocab,
                          reader=reader,
                          data_path=test_path,
                          log_dir=save_dir,
                          map_path=ner_path,
                          cuda_device=opt.gpu)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        #learning_rate_scheduler=learning_rate_scheduler,
        learning_rate_decay=opt.lr_decay,
        ema_decay=opt.ema_decay,
        predictor=predictor,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        validation_metric='+bleu',
        cuda_device=opt.gpu,
        num_epochs=opt.epoch,
        serialization_dir=save_dir,
        num_serialized_models_to_keep=5,
        #model_save_interval=60,
        #summary_interval=500,
        should_log_parameter_statistics=False,
        grad_norm=10)

    trainer.train()
Example #9
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    patience=10,
    num_epochs=1000)

trainer.train()
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
tag_ids = np.argmax(tag_logits, axis=1)

print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

with open('models/tagger.th', 'wb') as f:
    torch.save(model.state_dict(), f)

vocab.save_to_files('models/vocabulary')

vocab2 = Vocabulary.from_files('models/vocabulary')
model2 = LstmTagger(word_embeddings, lstm, vocab2)

with open('models/tagger.th', 'rb') as f:
    model2.load_state_dict(torch.load(f))

predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
tag_logits2 = predictor2.predict("The dog ate the apple")["tag_logits"]
np.testing.assert_array_almost_equal(tag_logits2, tag_logits)
Example #10
def main(args):
    ALL_DATASET_PATHS = get_all_dataset_paths(
        args.dataset_paths_file, 
        args.dataset_path_prefix
    )
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    HIDDEN_DIM = args.hidden_dim
    # BIDIRECTIONAL=True
    # INTERMEDIATE_INPUT=2*HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM
    DROPOUT = args.dropout
    LR = args.lr
    WEIGHT_DECAY = args.weight_decay
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.epochs
    PATIENCE = args.patience
    SERIALIZATION_DIR = args.model_dir
    CLEAN_MODEL_DIR = args.clean_model_dir
    CUDA_DEVICE = cuda_device(args.cuda)
    TEST_MODE = args.test_mode
    # device = torch.device(f"cuda:{CUDA_DEVICE}" if torch.cuda.is_available() and args.cuda else "cpu")

    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]
    dataset_paths = {
        task_name: ALL_DATASET_PATHS[task_name] for task_name in SELECTED_TASK_NAMES
    }

    tag_namespace_hashing_fn = {
        tag_namespace: i for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    token_indexers = {"tokens": elmo_token_indexer}
    readers = {
        task.tag_namespace: ConLLDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }

    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=3)

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()

    # POS -> CHUNK -> NER
    task_suffixes = set(
        [task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES]
    )
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)

    vocab = Vocabulary.from_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))

    # encoder = PassThroughEncoder(ELMO_EMBEDDING_DIM)
    model = MultiTaskCRFTagger(word_embeddings, encoders, vocab, TASKS)
    model.load_state_dict(torch.load(os.path.join(SERIALIZATION_DIR, "best.th")))
    model = model.cuda(device=CUDA_DEVICE)

    # Empty cache to ensure larger batch can be loaded for testing
    torch.cuda.empty_cache()
    logger.info("Evaluating on test data")

    test_iterator = CustomHomogeneousBatchIterator(
        partition_key="dataset", batch_size=BATCH_SIZE * 2
    )
    test_iterator.index_with(vocab)
    model = model.eval()
    model.set_inference_mode(True)
    return TASKS, vocab, model, readers, test_iterator
Example #11
from allennlp.modules.similarity_functions import BilinearSimilarity, CosineSimilarity, DotProductSimilarity, LinearSimilarity, MultiHeadedSimilarity
from allennlp.modules.feedforward import FeedForward
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits

from allennlp.training.metrics import CategoricalAccuracy

from allennlp.data.iterators import BucketIterator

from allennlp.training.trainer import Trainer

from allennlp.predictors import SentenceTaggerPredictor

from allennlp.nn import Activation

torch.manual_seed(1)

if __name__ == '__main__':
    vocab2 = Vocabulary.from_files("./wikiqavucabulary")

    model2 = LstmTagger(word_embeddings, esim, vocab2)

    with open("./wikiqamodel.th", 'rb') as f:
        model2.load_state_dict(torch.load(f))

    if cuda_device > -1:
        model2.cuda(cuda_device)

    predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
    tag_logits2 = predictor2.predict("The dog ate the apple")['tag_logits']
    np.testing.assert_array_almost_equal(tag_logits2, tag_logits)
Example #12
def run(args):
    ALL_DATASET_PATHS = get_all_dataset_paths(args.dataset_paths_file,
                                              args.dataset_path_prefix)
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    HIDDEN_DIM = args.hidden_dim
    # BIDIRECTIONAL=True
    # INTERMEDIATE_INPUT=2*HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM
    DROPOUT = args.dropout
    LR = args.lr
    WEIGHT_DECAY = args.weight_decay
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.epochs
    PATIENCE = args.patience
    SERIALIZATION_DIR = args.model_dir
    CLEAN_MODEL_DIR = args.clean_model_dir
    CUDA_DEVICE = cuda_device(args.cuda)
    TEST_MODE = args.test_mode
    # device = torch.device(f"cuda:{CUDA_DEVICE}" if torch.cuda.is_available() and args.cuda else "cpu")

    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]
    dataset_paths = {
        task_name: ALL_DATASET_PATHS[task_name]
        for task_name in SELECTED_TASK_NAMES
    }

    tag_namespace_hashing_fn = {
        tag_namespace: i
        for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    token_indexers = {"tokens": elmo_token_indexer}
    readers = {
        task.tag_namespace: ConLLDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }

    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=3)

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()

    # POS -> CHUNK -> NER
    task_suffixes = set(
        [task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES])
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)

    if not TEST_MODE:
        train_dataset = read_datasets(dataset_paths,
                                      readers,
                                      data_split="train")
        validation_dataset = read_datasets(dataset_paths,
                                           readers,
                                           data_split="dev")

        vocab = create_vocab([train_dataset, validation_dataset])

        # Special case for CCG
        if "ccg" in task_suffixes or "pos" in task_suffixes:
            for task in TASKS:
                if task.task_type == "ccg":
                    for tag in ["B-NOUN.SHAPE", "I-NOUN.PROCESS"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
                if task.tag_namespace == "ud_pos":
                    for tag in ["CONJ"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)

    else:
        vocab = Vocabulary.from_files(
            os.path.join(SERIALIZATION_DIR, "vocabulary"))

    # encoder = PassThroughEncoder(ELMO_EMBEDDING_DIM)
    model = MultiTaskCRFTagger(word_embeddings, encoders, vocab, TASKS)
    model = model.cuda(device=CUDA_DEVICE)

    if not TEST_MODE:
        iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                  batch_size=BATCH_SIZE,
                                                  cache_instances=True)
        iterator.index_with(vocab)

        if CLEAN_MODEL_DIR:
            if os.path.exists(SERIALIZATION_DIR):
                logger.info(f"Deleting {SERIALIZATION_DIR}")
                shutil.rmtree(SERIALIZATION_DIR)
            logger.info(f"Creating {SERIALIZATION_DIR}")
            os.makedirs(SERIALIZATION_DIR)

        logger.info(
            f"Writing arguments to arguments.json in {SERIALIZATION_DIR}")
        with open(os.path.join(SERIALIZATION_DIR, "arguments.json"),
                  "w+") as fp:
            json.dump(vars(args), fp, indent=2)

        logger.info(f"Writing vocabulary in {SERIALIZATION_DIR}")
        vocab.save_to_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))
        # Use list to ensure each epoch is a full pass through the data
        combined_training_dataset = list(
            roundrobin_iterator(*train_dataset.values()))
        combined_validation_dataset = list(
            roundrobin_iterator(*validation_dataset.values()))

        # optimizer = optim.ASGD(model.parameters(), lr=0.01, t0=100, weight_decay=0.1)
        optimizer = optim.Adam(model.parameters(),
                               lr=LR,
                               weight_decay=WEIGHT_DECAY)

        training_stats = []
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=combined_training_dataset,
            validation_dataset=combined_validation_dataset,
            patience=PATIENCE,
            num_epochs=NUM_EPOCHS,
            cuda_device=CUDA_DEVICE,
            serialization_dir=SERIALIZATION_DIR,
            # model_save_interval=600
        )
        stats = trainer.train()
        training_stats.append(stats)

        with open(os.path.join(SERIALIZATION_DIR, "training_stats.json"),
                  "w+") as fp:
            json.dump(training_stats, fp, indent=2)
    else:
        model.load_state_dict(
            torch.load(os.path.join(SERIALIZATION_DIR, "best.th")))
        model = model.cuda(device=CUDA_DEVICE)

    # Empty cache to ensure larger batch can be loaded for testing
    torch.cuda.empty_cache()

    test_filepaths = {
        task.tag_namespace: dataset_paths[task.tag_namespace]["test"]
        for task in TASKS
    }

    logger.info("Evaluating on test data")

    test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                   batch_size=BATCH_SIZE * 2)
    test_iterator.index_with(vocab)
    model = model.eval()
    test_stats = evaluate_multiple_data(model,
                                        readers,
                                        test_iterator,
                                        test_filepaths,
                                        cuda_device=CUDA_DEVICE)
    with open(os.path.join(SERIALIZATION_DIR, "test_stats.json"), "w+") as fp:
        json.dump(test_stats, fp, indent=2)
Example #13
def main():
    # language model, fixed pop
    swap = True
    vocab = Vocabulary.from_files("saved_models/vocabulary-linzen")
    model = StackRNNLanguageModel(vocab,
                                  rnn_dim=100,
                                  stack_dim=16,
                                  num_embeddings=10030,
                                  swap_push_pop=swap)
    with open("saved_models/stack-linzen-swap.th", "rb") as fh:
        model.load_state_dict(torch.load(fh))

    # classification model, fixed pop
    """swap = True
    vocab = Vocabulary.from_files("saved_models/vocabulary-linzen")
    print(vocab.get_vocab_size())
    model = StackRNNAgreementPredictor(vocab, rnn_dim=100, stack_dim=16, rnn_cell_type=torch.nn.GRUCell, num_embeddings=10000, swap_push_pop=swap)
    with open("saved_models/stack-linzen-class.th", "rb") as fh:
        model.load_state_dict(torch.load(fh))"""

    # language model fixed push
    """swap = False
    vocab = Vocabulary.from_files("saved_models/vocabulary-linzen")
    model = StackRNNLanguageModel(vocab, rnn_dim=100, stack_dim=16, num_embeddings=10030, swap_push_pop=swap)
    with open("saved_models/stack-linzen.th", "rb") as fh:
        model.load_state_dict(torch.load(fh))"""
    """swap=False
    vocab = Vocabulary.from_files("saved_models/vocabulary-linzen")
    model = StackRNNAgreementPredictor(vocab, rnn_dim=100, stack_dim=16, rnn_cell_type=torch.nn.GRUCell, num_embeddings=9968, push_ones=False, swap_push_pop=swap)
    with open("saved_models/stack-linzen-class-nopushpop.th", "rb") as fh:
        model.load_state_dict(torch.load(fh))"""
    """swap=True
    vocab = Vocabulary.from_files("saved_models/vocabulary-linzen")
    model = StackRNNAgreementPredictor(vocab, rnn_dim=100, stack_dim=16, rnn_cell_type=torch.nn.GRUCell, num_embeddings=9968, swap_push_pop=swap)
    with open("saved_models/stack-linzen-class-pop.th", "rb") as fh:
        model.load_state_dict(torch.load(fh))"""

    dataset_reader = BrownDatasetReader(labels=False)  # true?
    predictor = TreePredictor(model, dataset_reader)

    sentence = "the man in the hospitals eats an apple"
    prediction = predictor.predict(sentence)
    fig = plt.figure()
    #one_hist_wonder(prediction, sentence.split(" "), fig)
    profile_sentence(prediction, sentence.split(" "), fig, swap)
    plt.show()

    sentence = "the cat that dogs chase eats apples"
    prediction = predictor.predict(sentence)
    fig = plt.figure()
    #one_hist_wonder(prediction, sentence.split(" "), fig)
    profile_sentence(prediction, sentence.split(" "), fig, swap)
    plt.show()

    sentence = "the man who likes eating apples is full"
    prediction = predictor.predict(sentence)
    fig = plt.figure()
    #one_hist_wonder(prediction, sentence.split(" "), fig)
    profile_sentence(prediction, sentence.split(" "), fig, swap)
    plt.show()
    """sentence = "dogs chase the cat"
Example #14
def train(args):
    _train_data_path = os.path.join(args.data_dir, args.train_file_name)
    _validation_data_path = os.path.join(args.data_dir,
                                         args.validation_file_name)
    print(_train_data_path)
    reader = PosDatasetReader()
    train_dataset = reader.read(_train_data_path)
    validation_dataset = reader.read(_validation_data_path)
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=args.embedding_dim)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(args.embedding_dim, args.hidden_dim, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=args.epochs,
                      cuda_device=cuda_device)
    metrics = trainer.train()
    for m in metrics:
        if m.startswith("validation"):
            print("{}={}".format(m, metrics[m]))

    predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
    tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
    tag_ids = np.argmax(tag_logits, axis=-1)
    print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

    # Here's how to save the model.
    model_path = os.path.join(args.model_dir, "model.th")
    vocab_path = os.path.join(args.model_dir, "vocabulary")
    with open(model_path, 'wb') as f:
        torch.save(model.state_dict(), f)
    vocab.save_to_files(vocab_path)

    # And here's how to reload the model.
    vocab2 = Vocabulary.from_files(vocab_path)
    model2 = LstmTagger(word_embeddings, lstm, vocab2)
    with open(model_path, 'rb') as f:
        model2.load_state_dict(torch.load(f))
    if cuda_device > -1:
        model2.cuda(cuda_device)

    predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
    tag_logits2 = predictor2.predict("The dog ate the apple")['tag_logits']
    np.testing.assert_array_almost_equal(tag_logits2, tag_logits)
Example #15
    def __init__(
            self,
            vocab_path=None,
            model_paths=None,
            weigths=None,
            max_len=50,
            min_len=3,
            lowercase_tokens=False,
            log=False,
            iterations=3,
            min_probability=0.0,
            model_name='roberta',
            special_tokens_fix=1,
            is_ensemble=True,
            # is_ensemble=False,
            min_error_probability=0.0,
            confidence=0,
            resolve_cycles=False,
            prune_amount=0.,
            num_layers_to_keep=12):
        # print('here')
        self.model_weights = list(map(
            float, weigths)) if weigths else [1] * len(model_paths)
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.max_len = max_len
        self.min_len = min_len
        self.lowercase_tokens = lowercase_tokens
        self.min_probability = min_probability
        self.min_error_probability = min_error_probability
        self.vocab = Vocabulary.from_files(vocab_path)
        self.log = log
        self.iterations = iterations
        self.confidence = confidence
        self.resolve_cycles = resolve_cycles
        # set training parameters and operations

        self.indexers = []
        self.models = []

        for model_path in model_paths:
            # print('model_path:', model_path); exit(0)
            if is_ensemble:
                model_name, special_tokens_fix = self._get_model_data(
                    model_path)
            weights_name = get_weights_name(model_name, lowercase_tokens)
            self.indexers.append(
                self._get_indexer(weights_name, special_tokens_fix))
            # token_embs = get_token_embedders(model_name, tune_bert=1, special_tokens_fix=special_tokens_fix)

            model = Seq2Labels(
                vocab=self.vocab,
                text_field_embedder=self._get_embbeder(weights_name,
                                                       special_tokens_fix),
                # text_field_embedder= token_embs,
                confidence=self.confidence).to(self.device)
            # count number of params
            pytorch_total_params = sum(p.numel() for p in model.parameters())
            print('total params:', pytorch_total_params)

            # print('model:', model)
            print('type:', type(model))
            #exit(0)

            if torch.cuda.is_available():
                model.load_state_dict(torch.load(model_path))
            else:
                model.load_state_dict(
                    torch.load(model_path, map_location=torch.device('cpu')))
            # print('chk1'); exit(0)
            # get model size
            def print_size_of_model(model):
                torch.save(model.state_dict(), "temp.p")
                print('Size (MB):', os.path.getsize("temp.p") / 1e6)
                os.remove('temp.p')

            # print(model)
            print_size_of_model(model)
            #exit(0)
            print('type:', type(model))
            #exit(0)

            def deleteEncodingLayers(
                    model,
                    num_layers_to_keep):  # must pass in the full bert model
                oldModuleList = model.text_field_embedder.token_embedder_bert.bert_model.encoder.layer
                # print('oldModuleList:', oldModuleList)
                # print('oldModuleList:', len(oldModuleList)); exit(0)

                newModuleList = nn.ModuleList()

                # Now iterate over all layers, keeping only the relevant ones.
                for i in range(0, num_layers_to_keep):
                    # for i in range(0, len(num_layers_to_keep)):
                    newModuleList.append(oldModuleList[i])

                # create a copy of the model, modify it with the new list, and return
                copyOfModel = copy.deepcopy(model)
                copyOfModel.text_field_embedder.token_embedder_bert.bert_model.encoder.layer = newModuleList

                return copyOfModel

            print('before model 12:', model)
            # model = deleteEncodingLayers(model, 12)
            # print('after 12:', model)
            # print_size_of_model(model)
            #
            # print('before model:', model)
            # model = deleteEncodingLayers(model, 11)
            # print ('after 11:', model)
            # print_size_of_model(model)

            model = deleteEncodingLayers(model, num_layers_to_keep)
            print('after', num_layers_to_keep, ' :', model)
            print_size_of_model(model)

            # exit(0)

            # # save model
            # torch.save(model, 'pytorch-saved.pth')
            #
            # print('model:', model)
            #
            # for name, module in model.named_modules():
            #     print('name:', name)
            #     print('module:', module)

            # exit(0)

            #  onnx_batch_size = 64
            #  dummy_input = {'tokens': {
            # 'bert': torch.zeros(onnx_batch_size, 64, dtype=torch.long, device=torch.device('cuda:0')),
            # 'bert-offsets':torch.zeros(onnx_batch_size, 64, dtype=torch.long, device=torch.device('cuda:0')),
            # 'mask': torch.zeros(onnx_batch_size, 64, dtype=torch.long, device=torch.device('cuda:0'))
            #  }}
            #  # # print('dummy_input:', dummy_input.shape)
            #  # # pred = model(dummy_input['tokens'])
            #  # # print('pred:', pred)
            #  # d_inp = (dummy_input['tokens']['bert'], dummy_input['tokens']['bert-offsets'],
            #  #          dummy_input['tokens']['mask'])
            #  d_inp = dummy_input['tokens']
            #  input_names = ['bert', 'bert-offsets', 'mask']
            #  output_names = ['output']
            #
            #  # convert model to onnx
            #  torch.onnx.export(model, d_inp, 'bert_64.onnx',
            #                    input_names=input_names, output_names=output_names, verbose = False)
            #  # torch.onnx.export(model, dummy_input['tokens'], 'try.onnx', verbose=False)
            #  # d_inp = {'bert': np.zeros(shape=(1, 64), 'bert-offsets': np.zeros(1, 64), 'mask': torch.zeros(1, 64)}
            #  exit(0)

            # model = torch.quantization.quantize_dynamic(
            #     model,
            #     # {torch.nn.Linear},
            #     dtype=torch.qint8
            # )
            # print_size_of_model(model)

            # ##########################
            # # # quantized_model = torch.quantization.quantize_dynamic(
            # # #     model, {torch.nn.Linear}, dtype=torch.qint8
            # # # )
            # quantized_model = torch.quantization.quantize_dynamic(
            #     model.cpu(),
            #     # model,
            #     # {torch.nn.Linear},
            #     dtype=torch.qint8
            # )
            # # # print_size_of_model(model)
            # print_size_of_model(quantized_model)
            # # # quantized_model.cuda()
            # # # exit(0)
            #
            # quantized_model.eval()
            # self.models.append(quantized_model)
            # #######################################

            # prune model

            #################################################
            # random unstructured
            # model = prune.random_unstructured(model, 'weight', amount=0.2)
            # # l1_unstructured
            # # m = prune.l1_unstructured(model, 'weight', amount=0.2)
            # # m = prune.l1_unstructured(model, 'bias', amount=3)
            print_size_of_model(model)
            for name, module in model.named_modules():
                # print('name:', name)
                # print('module:', module)#; exit(0)
                # prune.random_unstructured(module, name='weight', amount=0.2)

                # # prune 20% of connections in all 2D-conv layers
                # if isinstance(module, torch.nn.Conv2d):
                #     prune.l1_unstructured(module, name='weight', amount=0.2)
                # prune 40% of connections in all linear layers
                if isinstance(module, torch.nn.Linear):
                    # print('prune_amount:', prune_amount)
                    # print('.....pruning.....')
                    # print('before pruning:', torch.sum(module.weight)); #exit(0)
                    # print(list(module.named_parameters()))
                    prune.l1_unstructured(module,
                                          name='weight',
                                          amount=prune_amount)
                    # print('shape:', module.weight.shape); #exit(0)
                    # prune.ln_structured(module, name='weight', amount=prune_amount, n=1, dim=module.weight.shape[1])
                    # print(list(module.named_parameters())); exit(0)
                    # print('after pruning:', torch.sum(module.weight));
                    prune.remove(module, name='weight')
                    # module.weight = torch.nn.Parameter(module.weight.data.to_sparse())
                    # print('after removing:', torch.sum(module.weight));
                    # print('shape:', module.weight.shape); exit(0)

                    # exit(0)

                    # prune.random_unstructured(module, name='weight', amount=0.25)
                    # exit(0)
            # exit(0)
            print('About to return')
            print_size_of_model(model)
            #exit(0)
            ##############################################################

            model.eval()
            self.models.append(model)
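The constructor above buries its active pruning step under exploratory comments; the pattern it applies is standard torch.nn.utils.prune usage: L1-unstructured pruning of each Linear layer's weights followed by prune.remove, which bakes the zeroed weights into the tensor and drops the pruning re-parametrization. A standalone sketch (the model and the 40% amount are illustrative):

# Sketch of the pruning pattern used above: prune each Linear layer's
# weights by L1 magnitude, then remove() the re-parametrization so the
# sparsified weights become the module's plain weight tensor.
import torch.nn as nn
import torch.nn.utils.prune as prune

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
for module in model.modules():
    if isinstance(module, nn.Linear):
        prune.l1_unstructured(module, name="weight", amount=0.4)
        prune.remove(module, "weight")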
Example #16
def save_top_results(process_no, start_index, end_index):
    print("Starting process {} with start at {} and end at {}".format(
        process_no, start_index, end_index))
    DATA_FOLDER = "train_data"
    # EMBEDDING_TYPE = ""
    LOSS_TYPE = ""  # NLL
    LOSS_TYPE = "_mse"  # MSE
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    token_indexers = None
    if EMBEDDING_TYPE == "_elmo" or EMBEDDING_TYPE == "_elmo_retrained" or EMBEDDING_TYPE == "_elmo_retrained_2":
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150 # for bert and elmo
    # q_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_saved_questions_lexparser_sh.txt")
    # r_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answers_lexparser_sh.txt")
    # rules_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answer_rules_lexparser_sh.txt")

    #NOTE: Squad dev test set
    q_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_saved_questions.txt")
    r_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answers.txt")
    rules_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answer_rules.txt")
    reader = QuestionResponseSoftmaxReader(q_file,
                                           r_file,
                                           token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    glove_embeddings_file = os.path.join("data", "glove",
                                         "glove.840B.300d.txt")
    # RESULTS_DIR = "squad_seq2seq_train2"
    #NOTE: All other experiments
    # RESULTS_DIR = "squad_seq2seq_train_moses_tokenized"
    # make_dir_if_not_exists(RESULTS_DIR)
    # all_results_save_file = os.path.join(RESULTS_DIR, "squad_seq2seq_train_predictions_start_{}_end_{}.txt".format(start_index, end_index))

    #NOTE: Squad dev test set
    RESULTS_DIR = "squad_seq2seq_dev_moses_tokenized"
    make_dir_if_not_exists(RESULTS_DIR)
    all_results_save_file = os.path.join(
        RESULTS_DIR,
        "squad_seq2seq_dev_test_predictions_start_{}_end_{}.txt".format(
            start_index, end_index))

    with open(all_results_save_file, "w") as all_writer:
        print("Testing out model with", EMBEDDING_TYPE, "embeddings")
        print("Testing out model with", LOSS_TYPE, "loss")
        # for NEGATIVE_PERCENTAGE in [100,50,20,10,5,1]:
        for NEGATIVE_PERCENTAGE in [100]:
            model_file = os.path.join(
                "saved_softmax_models",
                "decomposable_attention{}{}_model_{}.th".format(
                    LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))

            vocabulary_filepath = os.path.join(
                "saved_softmax_models",
                "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                           NEGATIVE_PERCENTAGE))
            print("LOADING VOCABULARY")
            # Load vocabulary
            vocab = Vocabulary.from_files(vocabulary_filepath)

            EMBEDDING_DIM = 300
            PROJECT_DIM = 200
            DROPOUT = 0.2
            NUM_LAYERS = 2
            if EMBEDDING_TYPE == "":
                token_embedding = Embedding(
                    num_embeddings=vocab.get_vocab_size('tokens'),
                    embedding_dim=EMBEDDING_DIM,
                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_glove":
                token_embedding = Embedding.from_params(
                    vocab=vocab,
                    params=Params({
                        'pretrained_file': glove_embeddings_file,
                        'embedding_dim': EMBEDDING_DIM,
                        'projection_dim': PROJECT_DIM,
                        'trainable': False
                    }))
            elif EMBEDDING_TYPE == "_elmo":
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
                options_file = os.path.join(
                    "data", "elmo",
                    "elmo_2x2048_256_2048cnn_1xhighway_options.json")
                weights_file = os.path.join(
                    "data", "elmo",
                    "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
                # NOTE: using Small size as medium size gave CUDA out of memory error
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
                # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
                # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained":
                options_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "options.json")
                weights_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained_2":
                options_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "options_2.json")
                weights_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "weights_2.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_bert":
                print("Loading bert model")
                model = BertModel.from_pretrained('bert-base-uncased')
                token_embedding = BertEmbedder(model)
                PROJECT_DIM = 768
            else:
                print("Error: Some weird Embedding type", EMBEDDING_TYPE)
                exit()
            word_embeddings = BasicTextFieldEmbedder(
                {"tokens": token_embedding})
            HIDDEN_DIM = 200
            params = Params({
                'input_dim': PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            attend_feedforward = FeedForward.from_params(params)
            similarity_function = DotProductSimilarity()
            params = Params({
                'input_dim': 2 * PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            compare_feedforward = FeedForward.from_params(params)
            params = Params({
                'input_dim': 2 * HIDDEN_DIM,
                'hidden_dims': 1,
                'activations': 'linear',
                'num_layers': 1
            })
            aggregate_feedforward = FeedForward.from_params(params)
            model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                                 attend_feedforward,
                                                 similarity_function,
                                                 compare_feedforward,
                                                 aggregate_feedforward)
            print("MODEL CREATED")
            # Load model state
            with open(model_file, 'rb') as f:
                device = torch.device('cpu')
                model.load_state_dict(torch.load(f, map_location=device))
            print("MODEL LOADED!")
            if torch.cuda.is_available():
                # cuda_device = 3
                # model = model.cuda(cuda_device)
                cuda_device = -1  # GPU use is disabled here; inference runs on CPU
            else:
                cuda_device = -1

            predictor = DecomposableAttentionSoftmaxPredictor(
                model, dataset_reader=reader)
            # Read test file and get predictions
            gold = list()
            predicted_labels = list()
            probs = list()
            total_time = avg_time = 0.0
            print("Started Testing:", NEGATIVE_PERCENTAGE)
            # before working on anything just save all the questions and responses in a list
            all_data = list()
            examples_count = processed_examples_count = 0
            with open(q_file,
                      'r') as q_reader, open(r_file, "r") as r_reader, open(
                          rules_file, "r") as rule_reader:
                logger.info("Reading questions from : %s", q_file)
                logger.info("Reading responses from : %s", r_file)
                q = next(q_reader).lower().strip()
                q = mt.tokenize(q, return_str=True, escape=False)
                current_qa = (q, "")
                current_rules_and_responses = list()
                for i, (response,
                        rule) in enumerate(zip(r_reader, rule_reader)):
                    response = response.strip()
                    rule = rule.strip()
                    if response and rule:
                        # get current_answer from response
                        a = get_answer_from_response(response)
                        if not current_qa[1]:
                            current_qa = (q, a)
                        else:
                            # verify if the a is same as the one in current_qa
                            if a != current_qa[1]:
                                # print("answer phrase mismatch!!", current_qa, ":::", a, ":::", response)
                                current_qa = (current_qa[0], a)
                                # print(current_rules_and_responses)
                                # exit()
                        # Add it to the current responses
                        current_rules_and_responses.append((response, rule))
                    elif len(current_rules_and_responses) > 0:
                        # Create an instance
                        # print(current_qa)
                        # print(current_rules_and_responses)
                        # exit()
                        if rule or response:
                            print("Rule Response mismatch")
                            print(current_qa)
                            print(response)
                            print(rule)
                            print(examples_count)
                            print(i)
                            exit()

                        if examples_count < start_index:
                            examples_count += 1
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                            current_qa = (q, "")
                            current_rules_and_responses = list()
                            continue
                        elif examples_count > end_index:
                            break

                        all_data.append(
                            (current_qa, current_rules_and_responses))
                        try:
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                        except StopIteration:
                            # previous one was the last question
                            q = ""
                        current_qa = (q, "")
                        current_rules_and_responses = list()
                        examples_count += 1
                        # if examples_count % 100 == 0:
                        #     print(examples_count)
                    else:
                        # Should be unreachable: response and rule are out of sync
                        print("Serious BUG!!")
                        print(current_qa)
                        print(response)
                        print(rule)
                        print(examples_count)
                        print(i)
                        exit()
            print("{}:\tFINISHED IO".format(process_no))
            examples_count = start_index
            processed_examples_count = 0
            for current_qa, responses_and_rules in all_data:
                start_time = time.time()
                # Tokenize and preprocess the responses
                preprocessed_responses = [
                    mt.tokenize(remove_answer_brackets(response),
                                return_str=True,
                                escape=False)
                    for response, rule in responses_and_rules
                ]
                # predictions = predictor.predict(current_qa[0], [remove_answer_brackets(response) for response, rule in responses_and_rules])
                predictions = predictor.predict(current_qa[0],
                                                preprocessed_responses)
                label_probs = predictions["label_probs"]
                tuples = zip(responses_and_rules, label_probs)
                sorted_by_score = sorted(tuples,
                                         key=lambda tup: tup[1],
                                         reverse=True)
                count = 0
                all_writer.write("{}\n".format(current_qa[0]))
                all_writer.write("{}\n".format(current_qa[1]))
                for index, ((response, rule),
                            label_prob) in enumerate(sorted_by_score):
                    if index == 3:
                        break
                    all_writer.write("{}\t{}\t{}\t{}\n".format(
                        response,
                        mt.tokenize(remove_answer_brackets(response),
                                    return_str=True,
                                    escape=False), rule, label_prob))
                all_writer.write("\n")
                all_writer.flush()
                end_time = time.time()
                processed_examples_count += 1
                examples_count += 1
                total_time += end_time - start_time
                avg_time = total_time / float(processed_examples_count)
                print(
                    "{}:\ttime to write {} with {} responses is {} secs. {} avg time"
                    .format(process_no, examples_count,
                            len(responses_and_rules), end_time - start_time,
                            avg_time))
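
The checkpoint-loading pattern used above (open the file, torch.load with map_location, then load_state_dict) is the standard way to restore GPU-trained weights on a CPU-only machine. A minimal stand-alone sketch, assuming an already-constructed model and a hypothetical model.th checkpoint:

import torch

device = torch.device('cpu')
with open('model.th', 'rb') as f:
    # map_location remaps any CUDA tensors in the checkpoint onto the CPU
    model.load_state_dict(torch.load(f, map_location=device))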
Exemplo n.º 17
0
parser.add_argument('--learning_rate', type=float, default=2.0)
parser.add_argument('--max_steps', type=int, default=30)
parser.add_argument('--num_updates', type=int, default=1)
parser.add_argument('--beam_size', type=int, default=1)
parser.add_argument('--maskers',
                    type=str,
                    default=IDENTITY_TOKEN,
                    help='string with comma-separated values')
parser.add_argument('--early_stopping', action='store_true')
parser.add_argument('--sample', type=int, default=None)

if __name__ == '__main__':
    args = parser.parse_args()

    class_reader = ClassificationReader(skip_start_end=True)
    class_vocab = Vocabulary.from_files(Path(args.classifier_path) / 'vocab')
    class_model_args = get_args_from_path(
        Path(args.classifier_path) / 'args.json')
    class_model = get_model_by_name(**class_model_args, vocab=class_vocab)
    load_weights(class_model, Path(args.classifier_path) / 'best.th')

    reader = CopyNetReader(masker=None)
    copynet_vocab = Vocabulary.from_files(Path(args.copynet_path) / 'vocab')
    copynet_model_args = get_args_from_path(
        Path(args.copynet_path) / 'args.json')
    copynet_model = get_model_by_name(**copynet_model_args,
                                      vocab=copynet_vocab,
                                      beam_size=args.beam_size)
    load_weights(copynet_model, Path(args.copynet_path) / 'best.th')

    class_model_copynet_args = get_args_from_path(
Exemplo n.º 18
0
def build_tasks(
    args: config.Params, cuda_device: Any
) -> (List[Task], List[Task], Vocabulary, Union[np.ndarray, float]):
    """Main logic for preparing tasks:

    1. create or load the tasks
    2. configure classifiers for tasks
    3. set up indexers
    4. build and save vocab to disk
    5. load vocab from disk
    6. if specified, load word embeddings
    7. set up ModelPreprocessingInterface (MPI) to handle model-specific preprocessing
    8. index tasks using vocab and task-specific MPI, save to disk.
    9. return: task data lazy-loaders in phase-specific lists w/ vocab, and word embeddings

    Parameters
    ----------
    args : Params
        config map

    Returns
    -------
    List[Task]
        list of pretrain Tasks.
    List[Task]
        list of target Tasks.
    allennlp.data.Vocabulary
        vocabulary from task data.
    Union[np.ndarray, float]
        Word embeddings.

    """
    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(args, cuda_device)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name, "use_classifier")
        setattr(task, "_classifier_name", task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    assert not len(set(tokenizer_names.values())) > 1, (
        f"Error: mixing tasks with different tokenizers! Tokenizations: {tokenizer_names}"
    )

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)

    # Use a tokenizer-specific vocab path so models with different vocabs
    # (e.g. RoBERTa and ALBERT) can live in one experiment folder.
    vocab_path = os.path.join(args.exp_dir, "vocab", input_module_tokenizer_name(args.input_module))
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            word_embs = pkl.load(open(emb_file, "rb"))
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Set up model_preprocessing_interface
    model_preprocessing_interface = ModelPreprocessingInterface(args)

    # 5) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        'Flag reload_indexing was set, but no tasks are set to reindex (use -o "args.reindex_tasks'
        ' = "task1,task2,..."")',
    )

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            # To store preprocessed data for models that use different indexers in the same exp directory
            indexer = input_module_tokenizer_name(args.input_module)
            relative_path = _get_serialized_record_path(task.name, split, "preproc", indexer)
            # TODO: change the global read-only dir to point to arwen, and the
            # local one to live in one exp folder with different runs.
            cache_found = _find_cached_file(
                args.exp_dir, args.global_ro_exp_dir, relative_path, log_prefix=log_prefix
            )
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(task.name, split, preproc_dir, indexer)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)

                _index_split(
                    task, split, indexers, vocab, record_file, model_preprocessing_interface
                )

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text?

    log.info("\tFinished indexing tasks")

    # 6) Initialize tasks with data iterators.
    pretrain_tasks = []
    target_tasks = []
    for task in tasks:
        indexer = input_module_tokenizer_name(args.input_module)
        # Replace lists of instances with lazy generators from disk.
        task.val_data = _get_instance_generator(task.name, "val", preproc_dir, indexer=indexer)
        task.test_data = _get_instance_generator(task.name, "test", preproc_dir, indexer=indexer)
        # When using pretrain_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if task.name in pretrain_task_names:
            log.info("\tCreating trimmed pretraining-only version of " + task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name, "train", preproc_dir,
                fraction=args.pretrain_data_fraction, indexer=indexer)
            pretrain_tasks.append(task)
        # When using target_train_data_fraction, we need modified iterators
        # only for training datasets at do_target_task_training time.
        if task.name in target_task_names:
            log.info("\tCreating trimmed target-only version of " + task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name, "train", preproc_dir,
                fraction=args.target_train_data_fraction, indexer=indexer)
            target_tasks.append(task)

    log.info("\t  Training on %s", ", ".join(pretrain_task_names))
    log.info("\t  Evaluating on %s", ", ".join(target_task_names))
    return pretrain_tasks, target_tasks, vocab, word_embs
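
Steps 4-5 in the docstring above (build the vocab, save it, then always load the saved copy) guarantee that repeated runs see an identical token-index mapping. A minimal sketch of that pattern, with a hypothetical path and instance list:

import os
from allennlp.data import Vocabulary

vocab_path = "experiments/run1/vocab"
if not os.path.exists(vocab_path):
    vocab = Vocabulary.from_instances(instances)  # build once from data (instances assumed)
    vocab.save_to_files(vocab_path)
vocab = Vocabulary.from_files(vocab_path)  # always load the serialized copy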
Exemplo n.º 19
0
# change paths to your data directory
config = {
    "vocab_directory": "../data/allen_vocab_lower_10",
    "pre_trained_embedding": "../data/glove.42B.300d.txt",
    "model": "knrm",
    "train_data": "../data/triples.train.tsv",
    "validation_data": "../data/tuples.validation.tsv",
    "test_data": "../data/tuples.test.tsv",
}

#
# data loading
#

vocab = Vocabulary.from_files(config["vocab_directory"])
tokens_embedder = Embedding.from_params(
    vocab,
    Params({
        "pretrained_file": config["pre_trained_embedding"],
        "embedding_dim": 300,
        "trainable": True,
        "padding_index": 0
    }))

word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder})

# recommended default params for the models (but you may change them if you want)
if config["model"] == "knrm":
    model = KNRM(word_embedder, n_kernels=11)
elif config["model"] == "conv_knrm":
Exemplo n.º 20
0
    def from_params(
            cls,
            params: Params,
            serialization_dir: str,
            recover: bool = False,
            cache_directory: str = None,
            cache_prefix: str = None,
    ) -> "TrainerPieces":
        all_datasets = training_util.meta_dataset_from_params(params, cache_directory, cache_prefix)
        datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info(
            "From dataset instances, %s will be considered for vocabulary creation.",
            ", ".join(datasets_for_vocab_creation),
        )

        if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
            vocab_params = params.pop("vocabulary", {})
            vocab = Vocabulary.from_files(
                os.path.join(serialization_dir, "vocabulary"),
                vocab_params.get("padding_token", None),
                vocab_params.get("oov_token", None),
            )
        else:
            instance_train = (
                instance
                for key, dataset in all_datasets.items()
                if key == 'train'
                for subdata in dataset
                for instance in subdata
            )
            instance_valid_test = (
                instance
                for key, dataset in all_datasets.items()
                if key != 'train'
                for instance in dataset
            )
            instances = chain(instance_train, instance_valid_test)
            vocab = Vocabulary.from_params(
                params.pop("vocabulary", {}),
                # Using a generator comprehension here is important
                # because, being lazy, it allows us to not iterate over the
                # dataset when directory_path is specified.

                # (
                #     instance
                #     for key, dataset in all_datasets.items()
                #     if (key in datasets_for_vocab_creation)
                #     for instance in dataset
                # ),
                instances
            )

        model = Model.from_params(vocab=vocab, params=params.pop("model"))

        # If vocab extension is ON for training, embedding extension should also be
        # done. If vocab and embeddings are already in sync, it would be a no-op.
        model.extend_embedder_vocab()

        # Initializing the model can have side effect of expanding the vocabulary
        # Save the vocab only in the master
        if not is_distributed() or is_master():
            vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

        # print('[info] iterator in meta_pieces is:{}'.format(params.pop("iterator")))
        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(model.vocab)
        validation_iterator_params = params.pop("validation_iterator", None)
        if validation_iterator_params:
            validation_iterator = DataIterator.from_params(validation_iterator_params)
            validation_iterator.index_with(model.vocab)
        else:
            validation_iterator = None

        train_data = all_datasets["train"]
        validation_data = all_datasets.get("validation")
        test_data = all_datasets.get("test")

        trainer_params = params.pop("trainer")
        no_grad_regexes = trainer_params.pop("no_grad", ())
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad_regexes):
                parameter.requires_grad_(False)

        frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names(
            model
        )
        logger.info("Following parameters are Frozen  (without gradient):")
        for name in frozen_parameter_names:
            logger.info(name)
        logger.info("Following parameters are Tunable (with gradient):")
        for name in tunable_parameter_names:
            logger.info(name)

        return cls(
            model,
            iterator,
            train_data,
            validation_data,
            test_data,
            validation_iterator,
            trainer_params,
        )
Exemplo n.º 21
0
    def test_from_files_with_model_archive(self):
        vocab = Vocabulary.from_files(str(self.model_archive))
        assert vocab.get_namespaces() == {"tokens", "labels"}
        assert vocab.get_token_from_index(3, namespace="tokens") == "u.n."
Exemplo n.º 22
0
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1000,
                  cuda_device=cuda_device)
trainer.train()

predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)

print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

with open('/tmp/model.th', 'wb') as f:
    torch.save(model.state_dict(), f)

vocab.save_to_files('/tmp/vocabulary')
vocab2 = Vocabulary.from_files('/tmp/vocabulary')

model2 = LstmTagger(word_embeddings, lstm, vocab2)

with open('/tmp/model.th', 'rb') as f:
    model2.load_state_dict(torch.load(f))

if cuda_device > -1:
    model2.cuda(cuda_device)

predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
tag_logits2 = predictor2.predict('The dog ate the apple')['tag_logits']
np.testing.assert_array_almost_equal(tag_logits, tag_logits2)
Exemplo n.º 23
0
    overrides = json.dumps(
        {"trainer": {
            "cuda_device": cuda_device
        }})
    params = Params.from_file(model_config, overrides)
    model_file = 'checkpoint/%s%s/' % (model_name, attn)
    iterator = DataIterator.from_params(params.pop("iterator"))

    torch.manual_seed(0)
    numpy.random.seed(0)

    if write_file:
        wf = Write_outfile(Wfile_name)

    print("Loading vocabulary")
    vocab = Vocabulary.from_files(model_file + 'vocabulary')

    print('Initializing model')
    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    print("Loading Model file from %s" % (model_file + 'best.th'))
    with open(model_file + 'best.th', 'rb') as f:
        model.load_state_dict(torch.load(f, encoding='utf-8'))

    iterator.index_with(vocab)
    dataset_reader_params = params.pop('dataset_reader')
    datareader = DatasetReader.from_params(dataset_reader_params)
    model.eval()

    # Read the XML data files
    for file in files:
        dom = xml.dom.minidom.parse(file)
Exemplo n.º 24
0
    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        return self._dataset_reader.text_to_instance(
            [Token(t) for t in tokens])


EMBEDDING_DIM = 6
HIDDEN_DIM = 6

lstm = PytorchSeq2VecWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

vocab = Vocabulary.from_files("{}/vocabulary".format(out_dir))
model = LstmClassifier(word_embeddings, lstm, vocab)
with open("{}/model.th".format(out_dir), 'rb') as f:
    model.load_state_dict(torch.load(f))
if cuda_device > -1:
    model.cuda(cuda_device)
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)

# EVALUATION
true_pos = 0
false_pos = 0
false_neg = 0

fo = open('{}/test.txt'.format(out_dir), 'r')
lines = fo.readlines()
fo.close()
Exemplo n.º 25
0
parser.add_argument('--vocab-file', action='store', dest='vocab_file',
                    help='vocab directory path', required=True)

args = parser.parse_args()


#
# load data & create vocab
# -------------------------------
#  
# alternative token indexers, kept for reference:
# _token_indexers = {"tokens": FastTextNGramIndexer(20)}
# _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}

loader = IrTripleDatasetReader(lazy=True,
                               tokenizer=BlingFireTokenizer())
# other options: token_indexers=_token_indexers,
#                tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
#                max_doc_length=200, max_query_length=20,
#                min_doc_length=200, min_query_length=20

instances = loader.read(args.dataset_file)
_iterator = BucketIterator(batch_size=64,
                           sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])

#vocab_map,vocab_data = FastTextVocab.load_ids(args.vocab_file,20)

#vocab = FastTextVocab(vocab_map, vocab_data,20)

_iterator.index_with(Vocabulary.from_files(args.vocab_file))

with Timer("iterate over all"):
    for i in _iterator(instances, num_epochs=1):
        exit()
Exemplo n.º 26
0
def main(args):
    # fix_seed()  # intentionally disabled: results are better without a fixed seed, and shuffling would otherwise be pointless
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model,
                                    args.lowercase_tokens)
    # read datasets
    reader = get_data_reader(weights_name,
                             args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)
    #   list(train_data)
    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}
    # Build the vocabulary. Per the original (Chinese) comments, the intent is:
    # regardless of whether a pre-saved vocabulary is passed in, gather tokens
    # from the dataset and merge them in, so the final vocabulary covers both.
    # Vocabulary.from_instances and extend_from_instances come straight from allennlp.
    from allennlp.common.params import Params
    if args.vocab_path:
        vocab = Vocabulary.from_files(args.vocab_path)
        # Merge: extend the loaded vocabulary with tokens from the training data.
        params = Params({"non_padded_namespaces": set(namespaces)})
        vocab.extend_from_instances(params, train_data)
    else:
        # No pre-saved vocabulary: build one from the training data alone.
        vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={
                                              'tokens': 30000,
                                              'labels':
                                              args.target_vocab_size,
                                              'd_tags': 2
                                          },
                                          tokens_to_add=tokens_to_add)

    vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))
    print("Data is loaded")
    model = get_model(weights_name,
                      vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:  # we normally don't load weights from here
        model.load_state_dict(
            torch.load(
                os.path.join(args.pretrain_folder, args.pretrain + '.th')))

    model = model.to(device)

    print("Model is set", '模型加载完毕')

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10)
    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(
        batch_size=args.batch_size,
        sorting_keys=[("tokens", "num_tokens")],
        biggest_batch_first=True,
        max_instances_in_memory=args.batch_size * 20000,
        instances_per_epoch=instances_per_epoch,
    )
    iterator.index_with(vocab)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        iterator=iterator,
        train_dataset=train_data,
        validation_dataset=dev_data,
        serialization_dir=args.model_dir,
        patience=args.patience,
        num_epochs=args.n_epoch,
        cuda_device=cuda_device,
        shuffle=True,  # changed to True here
        accumulated_batch_count=args.accumulation_size,
        cold_step_count=args.cold_steps_count,
        cold_lr=args.cold_lr,
        cuda_verbose_step=int(args.cuda_verbose_steps)
        if args.cuda_verbose_steps else None)
    print("Start training")
    trainer.train(args.oldmodel)

    # Here's how to save the model. Save the best model once more: afterwards only
    # model.th needs to be kept in this directory; the per-epoch checkpoints can be ignored.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped", "训练全部结束,model存在了", args.model_dir + ' / model.th')
Exemplo n.º 27
0
    def from_params(cls,
                    params: Params,
                    instances: Iterable['adi.Instance'] = None):
        """
        There are two possible ways to build a vocabulary: from a
        collection of instances, using :func:`Vocabulary.from_instances`, or
        from a pre-saved vocabulary, using :func:`Vocabulary.from_files`.
        You can also extend a pre-saved vocabulary with a collection of
        instances using this method. This method wraps these options, allowing
        their specification from a ``Params`` object, generated from a JSON
        configuration file.

        Parameters
        ----------
        params: Params, required.
        instances: Iterable['adi.Instance'], optional
            If ``params`` doesn't contain a ``directory_path`` key,
            the ``Vocabulary`` can be built directly from a collection of
            instances (i.e. a dataset). If the ``extend`` key is False,
            dataset instances are ignored and the final vocabulary is the one
            loaded from ``directory_path``. If it is True, dataset instances
            are used to extend the vocabulary loaded from ``directory_path``,
            and the result is the final vocabulary.

        Returns
        -------
        A ``Vocabulary``.
        """
        # pylint: disable=arguments-differ
        # Vocabulary is ``Registrable`` so that you can configure a custom subclass,
        # but (unlike most of our registrables) almost everyone will want to use the
        # base implementation. So instead of having an abstract ``VocabularyBase`` or
        # such, we just add the logic for instantiating a registered subclass here,
        # so that most users can continue doing what they were doing.
        vocab_type = params.pop("type", None)
        if vocab_type is not None:
            return cls.by_name(vocab_type).from_params(params=params,
                                                       instances=instances)

        extend = params.pop("extend", False)
        vocabulary_directory = params.pop("directory_path", None)
        if not vocabulary_directory and not instances:
            raise ConfigurationError(
                "You must provide either a Params object containing a "
                "'directory_path' key or a dataset to build a vocabulary from.")
        if extend and not instances:
            raise ConfigurationError(
                "'extend' is true, but no instances were passed to extend with.")
        if extend and not vocabulary_directory:
            raise ConfigurationError(
                "'extend' is true, but there is no 'directory_path' to extend from.")

        if vocabulary_directory and instances:
            if extend:
                logger.info(
                    "Loading Vocab from files and extending it with dataset.")
            else:
                logger.info("Loading Vocab from files instead of dataset.")

        if vocabulary_directory:
            vocab = Vocabulary.from_files(vocabulary_directory)
            if not extend:
                params.assert_empty("Vocabulary - from files")
                return vocab
        if extend:
            vocab.extend_from_instances(params, instances=instances)
            return vocab
        min_count = params.pop("min_count", None)
        max_vocab_size = pop_max_vocab_size(params)
        non_padded_namespaces = params.pop("non_padded_namespaces",
                                           EXTENDED_NON_PADDED_NAMESPACES)
        pretrained_files = params.pop("pretrained_files", {})
        min_pretrained_embeddings = params.pop("min_pretrained_embeddings",
                                               None)
        only_include_pretrained_words = params.pop_bool(
            "only_include_pretrained_words", False)
        tokens_to_add = params.pop("tokens_to_add", None)
        params.assert_empty("Vocabulary - from dataset")
        return ExtendedVocabulary.from_instances(
            instances=instances,
            min_count=min_count,
            max_vocab_size=max_vocab_size,
            non_padded_namespaces=non_padded_namespaces,
            pretrained_files=pretrained_files,
            only_include_pretrained_words=only_include_pretrained_words,
            tokens_to_add=tokens_to_add,
            min_pretrained_embeddings=min_pretrained_embeddings)
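
The three modes this method supports map directly onto configuration. A minimal sketch of driving each mode through a ``Params`` object, assuming this ``from_params`` belongs to the ``ExtendedVocabulary`` class used in the return statement (the directory path and ``my_instances`` are hypothetical):

from allennlp.common.params import Params

# 1) load a pre-saved vocabulary
vocab = ExtendedVocabulary.from_params(Params({"directory_path": "out/vocabulary"}))

# 2) load it and extend it with dataset instances
vocab = ExtendedVocabulary.from_params(
    Params({"directory_path": "out/vocabulary", "extend": True}),
    instances=my_instances)

# 3) build it from instances alone
vocab = ExtendedVocabulary.from_params(Params({}), instances=my_instances)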
Exemplo n.º 28
0
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(
        lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(
        granularity="2-class",
        token_indexers={"tokens": single_id_indexer},
        use_subtrees=True)
    train_data = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(
        granularity="2-class", token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300

    # Load pretrained word vectors (note: this URL actually serves fastText crawl-300d-2M vectors)
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=300,
            weight=weight,
            trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embedding_dim,
                      hidden_size=512,
                      num_layers=2,
                      batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (its been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        iterator = BucketIterator(batch_size=32,
                                  sorting_keys=[("tokens", "num_tokens")])
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_data,
                          validation_dataset=dev_data,
                          num_epochs=5,
                          patience=1,
                          cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # RNNs cannot run backward() in eval mode, so stay in train mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(
        model)  # also save the word embedding matrix

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # RNNs cannot run backward() in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(targeted_dev_data,
                                         num_epochs=5,
                                         shuffle=True),
                                group_size=1):
        # get accuracy with current triggers
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train()  # RNNs cannot run backward() in eval mode

        # get gradient w.r.t. trigger embeddings for current batch
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

        # pass the gradients to a particular attack to generate token candidates for each token.
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40,
                                                        increase_loss=True)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)

        # Tries all of the candidates and returns the trigger sequence with highest loss.
        trigger_token_ids = utils.get_best_candidates(model, batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
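
    # A hedged follow-up sketch: convert the final trigger ids back to text
    # with the same vocabulary (assuming the default 'tokens' namespace).
    trigger_tokens = [vocab.get_token_from_index(idx, namespace='tokens')
                      for idx in trigger_token_ids]
    print("final trigger:", " ".join(trigger_tokens))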
Exemplo n.º 29
0
    ]

    vocab = Vocabulary()
    for ns in ["tokens", "token_in", "token_out"]:
        for chord in itertools.product(note_list, accidental_list,
                                       chord_type_list):
            vocab.add_token_to_namespace("".join(chord), namespace=ns)

        vocab.add_token_to_namespace(START_SYMBOL, namespace=ns)
        vocab.add_token_to_namespace(END_SYMBOL, namespace=ns)

    key_list = [
        "".join(x) for x in itertools.product(note_list, accidental_list)
    ]
    form_list = ["m", "+", "o", "M", "%", "It", "Ger", "Fr"]
    figbass_list = ["7", "6"]
    for char in (key_list + form_list + figbass_list):
        vocab.add_token_to_namespace(char, namespace="token_characters")

    note_number_list = [str(x) for x in range(12)]
    for note_number in note_number_list:
        vocab.add_token_to_namespace(note_number, namespace="notes")

    vocab.save_to_files("data/vocabulary")


if __name__ == "__main__":
    generate_vocab()
    vocab = Vocabulary.from_files("data/vocabulary")
    print(vocab.get_token_to_index_vocabulary())
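
A quick sanity check after regeneration, as a sketch: print the size of each namespace that generate_vocab populated.

for namespace in ["tokens", "token_in", "token_out", "token_characters", "notes"]:
    print(namespace, vocab.get_vocab_size(namespace))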
Exemplo n.º 30
0
def multiprocess_single_sequence_loader(process_number: int, _config,
                                        _queue: mp.Queue,
                                        _wait_for_exit: mp.Event, _local_file,
                                        _fasttext_vocab_cached_mapping,
                                        _fasttext_vocab_cached_data):

    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    if _config["token_embedder_type"] == "bert_cls":
        _tokenizer = BlingFireTokenizer()
        _ind = PretrainedBertIndexer(
            pretrained_model=_config["bert_pretrained_model"],
            do_lowercase=True)
        _token_indexers = {"tokens": _ind}

        _tuple_loader = IrSingleSequenceDatasetReader(
            lazy=True,
            tokenizer=_tokenizer,
            token_indexers=_token_indexers,
            max_seq_length=_config["max_doc_length"],
            min_seq_length=_config["min_doc_length"],
        )

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])

        _iterator.index_with(Vocabulary.from_files(_config["vocab_directory"]))

    else:
        _tokenizer = BlingFireTokenizer()

        if _config["token_embedder_type"] == "embedding":
            _token_indexers = {
                "tokens": SingleIdTokenIndexer(lowercase_tokens=True)
            }
            _vocab = Vocabulary.from_files(_config["vocab_directory"])

        elif _config["token_embedder_type"] == "fasttext":
            _token_indexers = {
                "tokens":
                FastTextNGramIndexer(_config["fasttext_max_subwords"])
            }
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping,
                                   _fasttext_vocab_cached_data,
                                   _config["fasttext_max_subwords"])

        elif _config["token_embedder_type"] == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _tuple_loader = IrSingleSequenceDatasetReader(
            lazy=True,
            tokenizer=_tokenizer,
            token_indexers=_token_indexers,
            max_seq_length=_config["max_doc_length"],
            min_seq_length=_config["min_doc_length"],
        )

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])

        _iterator.index_with(_vocab)

    for training_batch in _iterator(_tuple_loader.read(_local_file),
                                    num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.put(None)  # signal end of queue

    _queue.close()  # indicate this local worker is done
    # keep this process alive until the shared memory has been consumed
    _wait_for_exit.wait()
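
The worker above streams batches through an mp.Queue and marks the end of its stream with a None sentinel. A minimal sketch of the matching consumer side, under the assumption that several such workers feed one queue (the real consumer lives elsewhere in this codebase):

import torch.multiprocessing as mp

def consume_batches(queue: mp.Queue, num_workers: int):
    finished_workers = 0
    while finished_workers < num_workers:
        batch = queue.get()
        if batch is None:  # one worker finished its stream
            finished_workers += 1
            continue
        yield batch  # tensors arrive already in shared memory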
Exemplo n.º 31
0
def main(args):
    fix_seed()
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model,
                                    args.lowercase_tokens)
    # read datasets
    reader = get_data_reader(weights_name,
                             args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)

    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}
    # build vocab
    if args.vocab_path:
        vocab = Vocabulary.from_files(args.vocab_path)
    else:
        vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={
                                              'tokens': 30000,
                                              'labels': args.target_vocab_size,
                                              'd_tags': 2
                                          },
                                          tokens_to_add=tokens_to_add)
    vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))

    print("Data is loaded")
    model = get_model(weights_name,
                      vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:
        model.load_state_dict(
            torch.load(os.path.join(args.pretrain_folder,
                                    args.pretrain + '.th'),
                       map_location=torch.device('cpu')))

    model = model.to(device)

    print("Model is set")

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10)
    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(
        batch_size=args.batch_size,
        sorting_keys=[("tokens", "num_tokens")],
        biggest_batch_first=True,
        max_instances_in_memory=args.batch_size * 20000,
        instances_per_epoch=instances_per_epoch,
    )
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=dev_data,
                      serialization_dir=args.model_dir,
                      patience=args.patience,
                      num_epochs=args.n_epoch,
                      cuda_device=cuda_device,
                      shuffle=False,
                      accumulated_batch_count=args.accumulation_size,
                      cold_step_count=args.cold_steps_count,
                      cold_lr=args.cold_lr,
                      cuda_verbose_step=int(args.cuda_verbose_steps)
                      if args.cuda_verbose_steps else None)
    print("Start training")
    trainer.train()

    # Here's how to save the model.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped")
Exemplo n.º 32
0
def main(args):
    fix_seed()
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model,
                                    args.lowercase_tokens)
    # read datasets
    reader = get_data_reader(weights_name,
                             args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)

    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}
    # build vocab
    if args.vocab_path:
        vocab = Vocabulary.from_files(args.vocab_path)
    else:
        vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={
                                              'tokens': 30000,
                                              'labels': args.target_vocab_size,
                                              'd_tags': 2
                                          },
                                          tokens_to_add=tokens_to_add)
    vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))

    print("Data is loaded")
    model = get_model(weights_name,
                      vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    # model = GecBERTModel(vocab_path=args.vocab_path,
    #                      model_paths=args.model_path,
    #                      max_len=args.max_len, min_len=args.min_len,
    #                      iterations=args.iteration_count,
    #                      min_error_probability=args.min_error_probability,
    #                      min_probability=args.min_error_probability,
    #                      lowercase_tokens=args.lowercase_tokens,
    #                      model_name=args.transformer_model,
    #                      special_tokens_fix=args.special_tokens_fix,
    #                      log=False,
    #                      confidence=args.additional_confidence,
    #                      is_ensemble=args.is_ensemble,
    #                      weigths=args.weights)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:
        model.load_state_dict(
            torch.load(
                os.path.join(args.pretrain_folder, args.pretrain + '.th')))

    print('cuda_device:', cuda_device)
    #exit(0)
    model = model.to(device)

    print("Model is set")

    # print('model:', model)
    def print_size_of_model(model):
        torch.save(model.state_dict(), "temp.p")
        print('Size (MB):', os.path.getsize("temp.p") / 1e6)
        os.remove('temp.p')

    if args.keep != 12:

        prev_model = GecBERTModel(
            vocab_path=args.vocab_path,
            model_paths=args.model_path,
            max_len=args.max_len,
            min_len=args.min_len,
            iterations=args.iteration_count,
            min_error_probability=args.min_error_probability,
            min_probability=args.min_error_probability,
            lowercase_tokens=args.lowercase_tokens,
            model_name=args.transformer_model,
            special_tokens_fix=args.special_tokens_fix,
            log=False,
            confidence=args.additional_confidence,
            is_ensemble=args.is_ensemble,
            weigths=args.weights,
            num_layers_to_keep=args.keep)

        # print('prev_model:', prev_model.models)

        # print(model)
        print_size_of_model(model)
        print_size_of_model(prev_model.models[0])

        model.text_field_embedder.token_embedder_bert.bert_model.encoder.layer = \
            prev_model.models[0].text_field_embedder.token_embedder_bert.bert_model.encoder.layer

    print_size_of_model(model)

    # exit(0)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10)
    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(
        batch_size=args.batch_size,
        sorting_keys=[("tokens", "num_tokens")],
        biggest_batch_first=True,
        max_instances_in_memory=args.batch_size * 20000,
        instances_per_epoch=instances_per_epoch,
    )
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=dev_data,
                      serialization_dir=args.model_dir,
                      patience=args.patience,
                      num_epochs=args.n_epoch,
                      cuda_device=cuda_device,
                      shuffle=False,
                      accumulated_batch_count=args.accumulation_size,
                      cold_step_count=args.cold_steps_count,
                      cold_lr=args.cold_lr,
                      cuda_verbose_step=int(args.cuda_verbose_steps)
                      if args.cuda_verbose_steps else None)

    GPUtil.showUtilization()
    print("Start training")
    trainer.train()

    # Here's how to save the model.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped")