# In[1163]:

loss_func = torch.nn.CrossEntropyLoss()
loss = loss_func(logits, label)


# In[1164]:

optimizer = optim.SGD(model.parameters(), lr=0.01)
cuda_device = -1

# In[1165]:

iterator = BucketIterator(batch_size=20,
                          sorting_keys=[("token_node_resolveds", "num_tokens")])
iterator.index_with(vocab)

# In[1166]:

from mrp_library.models.generalizer import Generalizer, ActionGeneralizer


# In[1167]:

model = ActionGeneralizer(vocab=vocab,
                          field_type2embedder=field_type2embedder,
                          seq2vec_encoder=seq2vec_encoder,
                          seq2seq_encoder=seq2seq_encoder,
Example #2
def train_lstm(train_dataset, validation_dataset, batch_size, num_layers, double_input, dense_vector=False,
               col_name=None, use_elmo=False, epochs=30, patience=5, bidirectional=True, learning_rate=3e-4, hidden_size=64,
               num_classes=2, use_gpu=False):
    """
    Trains an LSTM and its variants (vanilla, bidirectional, stacked BiLSTM) on train_dataset; optionally performs early stopping based on validation loss. Initialises word embeddings with pre-trained GloVe OR uses a pre-trained ELMo model to dynamically compute embeddings.

    Functionality to run it for (1) Single Input: reply (OR) question, (2) Double Input: reply + context comment,
    (3) Dense Vector + reply/question, and (4) Dense Vector + reply + context comment.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    validation_dataset: List[Instance]
        Instances for validation set
    batch_size: int
        number of Instances to process in a batch
    num_layers: int
        number of BiLSTM layers: 2 or higher for Stacked BiLSTMs
    double_input: bool
        True to run DoubleInput classifier | False for SingleInput classifier
    dense_vector: bool
        True to concatenate dense feature vector before feeding to the FeedForward layer
    col_name: str
        'reply_text' or 'question' (for calculating dense feature vector) | Only applicable when dense_vector is True
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=30)
    patience: int or None
        early stopping - number of epochs to wait for validation loss to improve (default=5). 'None' to disable early stopping.
    bidirectional: bool
        True for a bidirectional LSTM
    learning_rate: float
        learning rate for Adam Optimizer
    hidden_size: int
        size of the hidden layer in the encoder
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    if double_input: # need context_tokens as well
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens"),
                                                ("context_tokens", "num_tokens")])

    else: # only reply_tokens
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens")])

    iterator.index_with(vocab) # numericalize the data

    if double_input: # DoubleInput Classifier: two BiLSTM encoders
        lstm_reply: Seq2VecEncoder = PytorchSeq2VecWrapper(nn.LSTM(word_embeddings.get_output_dim(),
                                                                   hidden_size,
                                                                   num_layers=num_layers,
                                                                   bidirectional=bidirectional,
                                                                   batch_first=True))
        lstm_context: Seq2VecEncoder = PytorchSeq2VecWrapper(nn.LSTM(word_embeddings.get_output_dim(),
                                                                    hidden_size,
                                                                    num_layers=num_layers,
                                                                    bidirectional=bidirectional,
                                                                    batch_first=True))

        if dense_vector: # add length of dense vector to input dimension of Feedforward
            ff_input_dim = 2 * (lstm_reply.get_output_dim() + DENSE_VECTOR_LEN)
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)

            model = models.DenseDoubleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=lstm_reply,
                                                 context_encoder=lstm_context,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)

        else:
            # Feedforward:
            classifier_feedforward: FeedForward = nn.Linear(2 * lstm_reply.get_output_dim(), num_classes)

            model = models.DoubleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=lstm_reply,
                                                 context_encoder=lstm_context,
                                                 classifier_feedforward=classifier_feedforward)


    else: # SingleInput Classifier: one BiLSTM encoder
        encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(nn.LSTM(word_embeddings.get_output_dim(),
                                                                hidden_size,
                                                                num_layers=num_layers,
                                                                bidirectional=bidirectional,
                                                                batch_first=True))
        if dense_vector: # add length of dense vector to input dimension of Feedforward
            ff_input_dim = encoder.get_output_dim() + DENSE_VECTOR_LEN
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseSingleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)

        else:
            classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(), num_classes)
            model = models.SingleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    if patience is None:  # no early stopping: train on both the train and validation data
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset + validation_dataset,
            cuda_device=0 if use_gpu else -1,
            num_epochs=epochs)

    else:
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset,
            validation_dataset=validation_dataset,
            cuda_device=0 if use_gpu else -1,
            patience=patience, # stop if loss does not improve for 'patience' epochs
            num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
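A minimal usage sketch (not from the original source): the dataset reader, file paths, and hyperparameter values below are assumptions, and the Instances it yields are expected to carry the 'reply_tokens' (and, for double_input=True, 'context_tokens') fields that the sorting keys above refer to.

# Hypothetical invocation of train_lstm; 'reader' and the paths are placeholders.
train_instances = reader.read('train.jsonl')
valid_instances = reader.read('validation.jsonl')

model, vocab, n_epochs = train_lstm(train_instances, valid_instances,
                                    batch_size=32,
                                    num_layers=2,          # stacked BiLSTM
                                    double_input=True,     # reply + context comment
                                    use_elmo=False,        # GloVe embeddings
                                    epochs=30,
                                    patience=5,
                                    use_gpu=torch.cuda.is_available())
print('stopped after', n_epochs, 'epochs')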
Example #3
def _build_trainer(config, model, vocab, train_data, valid_data):
    optimizer = optim.AdamW(model.parameters(), lr=config.trainer.lr)
    scheduler = None

    if config.embedder.name.endswith('bert') or config.embedder.name == 'both':
        non_bert_params = (param for name, param in model.named_parameters()
                           if not name.startswith('text_field_embedder'))

        optimizer = optim.AdamW([{
            'params': model.text_field_embedder.parameters(),
            'lr': config.trainer.bert_lr
        }, {
            'params': non_bert_params,
            'lr': config.trainer.lr
        }, {
            'params': []
        }])

        scheduler = SlantedTriangular(
            optimizer=optimizer,
            num_epochs=config.trainer.num_epochs,
            num_steps_per_epoch=len(train_data) / config.trainer.batch_size,
            cut_frac=config.trainer.cut_frac,
            gradual_unfreezing=config.trainer.gradual_unfreezing,
            discriminative_fine_tuning=config.trainer.discriminative_fine_tuning)

    logger.info('Trainable params:')
    for name, param in model.named_parameters():
        if param.requires_grad:
            logger.info('\t' + name)

    iterator = BucketIterator(batch_size=config.trainer.batch_size)
    iterator.index_with(vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
        logger.info('Using cuda')
    else:
        cuda_device = -1
        logger.info('Using cpu')

    logger.info('Example batch:')
    _log_batch(next(iterator(train_data)))

    if config.embedder.name.endswith('bert') or config.embedder.name == 'both':
        train_data = _filter_data(train_data, vocab)
        valid_data = _filter_data(valid_data, vocab)

    return Trainer(model=model,
                   optimizer=optimizer,
                   iterator=iterator,
                   train_dataset=train_data,
                   validation_dataset=valid_data,
                   validation_metric='+MeanAcc',
                   patience=config.trainer.patience,
                   num_epochs=config.trainer.num_epochs,
                   cuda_device=cuda_device,
                   grad_clipping=5.,
                   learning_rate_scheduler=scheduler,
                   serialization_dir=os.path.join(config.data.models_dir,
                                                  config.model_name),
                   should_log_parameter_statistics=False,
                   should_log_learning_rate=False,
                   num_gradient_accumulation_steps=config.trainer.num_gradient_accumulation_steps)
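A sketch of a possible call site for _build_trainer (not in the original source); config, model, vocab and the datasets are assumed to be constructed elsewhere, and the metrics key follows from validation_metric='+MeanAcc' above.

trainer = _build_trainer(config, model, vocab, train_data, valid_data)
metrics = trainer.train()
logger.info('Best validation MeanAcc: %s', metrics.get('best_validation_MeanAcc'))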
Example #4
model = SimpleSeq2Seq(vocab,
                      source_embedder,
                      encoder,
                      max_decoding_steps,
                      target_embedding_dim=ZH_EMBEDDING_DIM,
                      target_namespace='target_tokens',
                      attention=attention,
                      beam_size=12,
                      use_bleu=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # without this there is no error, but the model runs on the CPU instead of the GPU

optimizer = optim.Adam(model.parameters())
iterator = BucketIterator(batch_size=128,
                          sorting_keys=[("source_tokens", "num_tokens")])

iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  patience=2,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  num_epochs=1,
                  cuda_device=CUDA_DEVICE)

for i in range(20):
    print('Epoch: {}'.format(i))
    trainer.train()
Example #5
def train_nli(train_dataset, validation_dataset, batch_size, num_filters, filter_sizes, use_elmo=False, epochs=30, patience=5,
              learning_rate=3e-4, num_classes=2, use_gpu=False):
    """
    Trains a Natural Language Inference (InferSent)-inspired architecture.
    Reply and context are encoded separately with CNNs over GloVe embeddings (or, optionally, ELMo embeddings computed dynamically).

    The CNN has one convolution layer for each ngram filter size.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    validation_dataset: List[Instance]
        Instances for validation set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of 'filters' learned by that layer
    filter_sizes: Tuple[int]
        specifies the number of convolutional layers and their sizes
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=30)
    patience: int or None
        early stopping - number of epochs to wait for validation loss to improve (default=5). If 'None': disables early stopping, and uses train+validation set for training
    learning_rate: float
        learning rate for Adam Optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings(large=True)
    else:
        vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("reply_tokens", "num_tokens"),
                                            ("context_tokens", "num_tokens")])

    iterator.index_with(vocab) # numericalize the data

    # CNN encoders:
    cnn_reply: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                           num_filters=num_filters,
                                           ngram_filter_sizes=filter_sizes)

    cnn_context: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                             num_filters=num_filters,
                                             ngram_filter_sizes=filter_sizes)

    # Feedforward: input is 4 * encoder output dim because we concatenate
    # [reply, context, |reply - context|, reply * context]
    classifier_feedforward: FeedForward = nn.Linear(4 * cnn_reply.get_output_dim(), num_classes)

    model = models.InferModel(vocab=vocab,
                              word_embeddings=word_embeddings,
                              reply_encoder=cnn_reply,
                              context_encoder=cnn_context,
                              classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    if patience is None:  # no early stopping: train on both the train and validation data
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset + validation_dataset,
            cuda_device=0 if use_gpu else -1,
            num_epochs=epochs)

    else:
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset,
            validation_dataset=validation_dataset,
            cuda_device=0 if use_gpu else -1,
            patience=patience, # stop if loss does not improve for 'patience' epochs
            num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
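The 4x input dimension above comes from the InferSent-style feature combination performed inside models.InferModel, which is not shown here; a sketch of that combination, with reply_vec and context_vec standing in for the two CnnEncoder outputs:

# Sketch only: reply_vec and context_vec are the encoded reply and context.
combined = torch.cat([reply_vec,                           # reply representation
                      context_vec,                         # context representation
                      torch.abs(reply_vec - context_vec),  # element-wise |difference|
                      reply_vec * context_vec],            # element-wise product
                     dim=-1)                               # -> 4 * encoder output dim
logits = classifier_feedforward(combined)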
Example #6
 def test_create_batches_groups_correctly(self):
     iterator = BucketIterator(batch_size=2, padding_noise=0, sorting_keys=[('text', 'num_tokens')])
     grouped_instances = iterator._create_batches(self.dataset, shuffle=False)
     assert grouped_instances == [[self.instances[4], self.instances[2]],
                                  [self.instances[0], self.instances[1]],
                                  [self.instances[3]]]
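A small self-contained sketch (AllenNLP 0.x API) of the length-bucketing behaviour the test above asserts: instances of similar length land in the same batch, so each batch is padded only to its own maximum length.

from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

indexers = {'tokens': SingleIdTokenIndexer()}

def make_instance(length):
    return Instance({'text': TextField([Token('w%d' % i) for i in range(length)], indexers)})

instances = [make_instance(n) for n in (5, 2, 7, 3)]
vocab = Vocabulary.from_instances(instances)
iterator = BucketIterator(batch_size=2, padding_noise=0,
                          sorting_keys=[('text', 'num_tokens')])
iterator.index_with(vocab)
for batch in iterator(instances, num_epochs=1):
    print(batch['text']['tokens'].shape)  # batches group instances of similar length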
Example #7
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
lstm = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model = LstmTagger(
    text_field_embedder=word_embeddings,
    encoder=GatedCnnEncoder(input_dim=EMBEDDING_DIM,
                            layers=[[[2, 512, 2**i] for i in range(2)] +
                                    [[2, EMBEDDING_DIM, 1]]]),
    vocab=vocab,
    label_encoding="BIOUL",
)
cuda_device = -1

optimizer = optim.SGD(model.parameters(), lr=0.1)
iterator = BucketIterator(batch_size=2,
                          sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    patience=10,
    num_epochs=1000,
    cuda_device=cuda_device,
)
trainer.train()
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
tag_logits = predictor.predict("The dog ate the apple")["tag_logits"]
tag_ids = np.argmax(tag_logits, axis=-1)
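The argmax above only yields label indices; a short sketch (assuming the tagger indexes its tags in the default 'labels' namespace) of mapping them back to tag strings:

words = "The dog ate the apple".split()
tags = [model.vocab.get_token_from_index(int(i), namespace='labels') for i in tag_ids]
print(list(zip(words, tags)))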
Example #8
def main():
    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"
    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)
    # TokenIndexer determines how string tokens get represented as arrays of indices in a model
    # SingleIdTokenIndexer = Tokens are single integers
    # TokenCharactersIndexer = Tokens as a list of integers
    # Read a tsvfile with paired instances (source, target)
    reader = CopyNetDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),  # Defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_namespace='tokens'  # Defaults to source_token_indexers
    )

    # Each dataset is a list of Instances with (source_tokens, target_tokens) fields
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)
    """
    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(set(trainExtraVocab + validExtraVocab + testExtraVocab))
    print("length:", len(finalExtraVocab))
    # input()
    """
    # vocab = Vocabulary.from_instances(train_dataset + validation_dataset, min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset +
                                      test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099

    print("Vocab SIze :", vocab.get_vocab_size('tokens'))

    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)

    # Embedder for the 'tokens' namespace, matching the indexer name used when the dataset was created
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(ENC_EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      dropout=0.2))

    Attention = DotProductAttention()
    print(Attention)

    max_decoding_steps = 4  # TODO: make this variable

    model = CopyNetSeq2Seq(
        vocab,
        source_embedder,
        encoder,
        max_decoding_steps=max_decoding_steps,
        target_embedding_dim=TGT_EMBEDDING_DIM,
        # target_namespace = 'target_tokens',
        beam_size=beamSize,
        attention=Attention)
    # Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # Data iterator that specifies how to batch our dataset.
    # Takes the data, shuffles it, and creates fixed-size batches.
    # iterator = BasicIterator(batch_size=2)
    # iterator.index_with(vocab)
    # BucketIterator pads each batch to its own max input length and sorts the dataset by the
    # field names and padding keys provided, for more efficient computation.
    iterator = BucketIterator(batch_size=50,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        # patience = 3,
        num_epochs=numEpochs,
        cuda_device=CUDA_DEVICE)

    trainer.train()
    """
parser.add_argument('--vocab-file', action='store', dest='vocab_file',
                    help='vocab directory path', required=True)

args = parser.parse_args()


#
# load data & create vocab
# -------------------------------
#  
#_token_indexers = {"tokens": FastTextNGramIndexer(20)}
#_token_indexers = {"tokens": ELMoTokenCharactersIndexer()}

loader = IrTripleDatasetReader(lazy=True,  # token_indexers=_token_indexers,
                               tokenizer=BlingFireTokenizer())  # or WordTokenizer(word_splitter=JustSpacesWordSplitter())
# ,max_doc_length=200,max_query_length=20,min_doc_length=200,min_query_length=20

instances = loader.read(args.dataset_file)
_iterator = BucketIterator(batch_size=64,
                           sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])

#vocab_map,vocab_data = FastTextVocab.load_ids(args.vocab_file,20)

#vocab = FastTextVocab(vocab_map, vocab_data,20)

_iterator.index_with(Vocabulary.from_files(args.vocab_file))

with Timer("iterate over all"):
    for i in _iterator(instances, num_epochs=1):
        exit()
Example #10
def main():
    parser = utils.opt_parser.get_trainer_opt_parser()
    parser.add_argument('models',
                        nargs='*',
                        help='pretrained models for the same setting')
    parser.add_argument('--test', action="store_true", help='use testing mode')
    parser.add_argument('--no-act',
                        action="store_true",
                        help='Do not use ACT for layer computation')
    parser.add_argument('--num-layer',
                        type=int,
                        help='maximum number of stacked layers')
    parser.add_argument('--warm-up',
                        type=int,
                        default=10,
                        help='number of warmup-steps for Noam Scheduler')

    args = parser.parse_args()

    reader = data_adapter.GeoQueryDatasetReader()
    training_set = reader.read(config.DATASETS[args.dataset].train_path)
    try:
        validation_set = reader.read(config.DATASETS[args.dataset].dev_path)
    except Exception:
        validation_set = None

    vocab = allennlp.data.Vocabulary.from_instances(training_set)
    st_ds_conf = config.UTRANSFORMER_CONF[args.dataset]
    if args.no_act:
        st_ds_conf['act'] = False
    if args.num_layer:
        st_ds_conf['max_num_layers'] = args.num_layer
    if args.epoch:
        config.TRAINING_LIMIT = args.epoch
    if args.batch:
        st_ds_conf['batch_sz'] = args.batch

    encoder = UTEncoder(
        input_dim=st_ds_conf['emb_sz'],
        max_num_layers=st_ds_conf['max_num_layers'],
        num_heads=st_ds_conf['num_heads'],
        feedforward_hidden_dim=st_ds_conf['emb_sz'],
        use_act=st_ds_conf['act'],
        attention_dropout=st_ds_conf['attention_dropout'],
        residual_dropout=st_ds_conf['residual_dropout'],
        feedforward_dropout=st_ds_conf['feedforward_dropout'],
        use_vanilla_wiring=st_ds_conf['vanilla_wiring'],
    )
    decoder = UTDecoder(
        input_dim=st_ds_conf['emb_sz'],
        max_num_layers=st_ds_conf['max_num_layers'],
        num_heads=st_ds_conf['num_heads'],
        feedforward_hidden_dim=st_ds_conf['emb_sz'],
        use_act=st_ds_conf['act'],
        attention_dropout=st_ds_conf['attention_dropout'],
        residual_dropout=st_ds_conf['residual_dropout'],
        feedforward_dropout=st_ds_conf['feedforward_dropout'],
        use_vanilla_wiring=st_ds_conf['vanilla_wiring'],
    )
    source_embedding = allennlp.modules.Embedding(
        num_embeddings=vocab.get_vocab_size('nltokens'),
        embedding_dim=st_ds_conf['emb_sz'])
    target_embedding = allennlp.modules.Embedding(
        num_embeddings=vocab.get_vocab_size('lftokens'),
        embedding_dim=st_ds_conf['emb_sz'])
    model = ParallelSeq2Seq(
        vocab=vocab,
        encoder=encoder,
        decoder=decoder,
        source_embedding=source_embedding,
        target_embedding=target_embedding,
        target_namespace='lftokens',
        start_symbol=START_SYMBOL,
        eos_symbol=END_SYMBOL,
        max_decoding_step=st_ds_conf['max_decoding_len'],
    )

    if args.models:
        logging.getLogger().setLevel(logging.INFO)
        logging.info(f"loads pretrained model from {args.models[0]}")
        model.load_state_dict(torch.load(args.models[0]))

    if not args.test or not args.models:
        iterator = BucketIterator(sorting_keys=[("source_tokens", "num_tokens")],
                                  batch_size=st_ds_conf['batch_sz'])
        iterator.index_with(vocab)

        optim = torch.optim.Adam(model.parameters(),
                                 betas=(.9, .98),
                                 eps=1.e-9)

        savepath = os.path.join(
            config.SNAPSHOT_PATH, args.dataset, 'universal_transformer',
            datetime.datetime.now().strftime('%Y%m%d-%H%M%S') + "--" +
            args.memo)
        if not os.path.exists(savepath):
            os.makedirs(savepath, mode=0o755)

        trainer = allennlp.training.Trainer(
            model=model,
            optimizer=optim,
            iterator=iterator,
            train_dataset=training_set,
            validation_dataset=validation_set,
            serialization_dir=savepath,
            cuda_device=args.device,
            num_epochs=config.TRAINING_LIMIT,
        )

        trainer.train()

    else:
        testing_set = reader.read(config.DATASETS[args.dataset].test_path)
        model.eval()

        predictor = allennlp.predictors.SimpleSeq2SeqPredictor(model, reader)

        for instance in tqdm.tqdm(testing_set, total=len(testing_set)):
            print('SRC: ', instance.fields['source_tokens'].tokens)
            print(
                'GOLD:', ' '.join(
                    str(x)
                    for x in instance.fields['target_tokens'].tokens[1:-1]))
            del instance.fields['target_tokens']
            output = predictor.predict_instance(instance)
            print('PRED:', ' '.join(output['predicted_tokens']))
Example #11
if os.path.exists(vocab_dir):
    vocab = Vocabulary.from_files(vocab_dir)
else:
    vocab = Vocabulary.from_instances(train_dataset,
                                      min_count={'source_tokens': min_count, 'target_tokens': min_count},
                                      max_vocab_size=max_vocab_size)
    vocab.save_to_files(vocab_dir)

en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('source_tokens'),
                         embedding_dim=embedding_dim)
source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True))
train_iterator = BucketIterator(batch_size=batch_size,
                                sorting_keys=[("source_tokens", "num_tokens")],
                                instances_per_epoch=INSTANCES_PER_EPOCH)

validation_iterator = BucketIterator(batch_size=batch_size,
                                     sorting_keys=[("source_tokens", "num_tokens")])

train_iterator.index_with(vocab)
validation_iterator.index_with(vocab)

model = SimpleSeq2Seq(vocab, source_embedder, encoder,
                      max_decoding_steps=max_decoding_steps,
                      target_embedding_dim=embedding_dim,
                      target_namespace='target_tokens',
                      attention=BilinearAttention(hidden_dim * 2, hidden_dim * 2),
                      beam_size=beam_size)
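A possible continuation (not part of the original snippet): AllenNLP's Trainer accepts a separate validation_iterator, so the sampled train_iterator above does not limit validation. The validation_dataset and num_epochs values are assumptions.

trainer = Trainer(model=model,
                  optimizer=optim.Adam(model.parameters()),
                  iterator=train_iterator,
                  validation_iterator=validation_iterator,  # full pass over validation data
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,    # assumed to be read like train_dataset
                  num_epochs=10,                            # illustrative value
                  cuda_device=0 if torch.cuda.is_available() else -1)
trainer.train()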
Example #12
def main():
    parser = argparse.ArgumentParser(description='Evidence Inference experiments')
    parser.add_argument('--cuda_device', type=int, default=0,
                        help='GPU number (default: 0)')
    parser.add_argument('--epochs', type=int, default=2,
                        help='upper epoch limit (default: 2)')
    parser.add_argument('--patience', type=int, default=1,
                        help='trainer patience  (default: 1)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size (default: 32)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout for the model (default: 0.2)')
    parser.add_argument('--model_name', type=str, default='baseline',
                        help='model name (default: baseline)')
    parser.add_argument('--tunable', action='store_true',
                        help='tune the underlying embedding model (default: False)')
    args = parser.parse_args()

    annotations = pd.read_csv('data/data/annotations_merged.csv')
    prompts = pd.read_csv('data/data/prompts_merged.csv')

    feature_dictionary = {}
    prompts_dictionary = {}

    for index, row in prompts.iterrows():
        prompts_dictionary[row['PromptID']] = [row['Outcome'], row['Intervention'], row['Comparator']]

    for index, row in annotations.iterrows():
        if row['PMCID'] not in feature_dictionary:
            feature_dictionary[row['PMCID']] = []
        feature_dictionary[row['PMCID']].append([row['Annotations'], row['Label']]
                                                + prompts_dictionary[row['PromptID']])

    train = []
    valid = []
    test = []

    with open('data/splits/train_article_ids.txt') as train_file:
        for line in train_file:
            train.append(int(line.strip()))

    with open('data/splits/validation_article_ids.txt') as valid_file:
        for line in valid_file:
            valid.append(int(line.strip()))

    with open('data/splits/test_article_ids.txt') as test_file:
        for line in test_file:
            test.append(int(line.strip()))

    bert_token_indexer = {'bert': PretrainedBertIndexer('scibert/vocab.txt', max_pieces=512)}

    reader = EIDatasetReader(bert_token_indexer, feature_dictionary)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    vocab = Vocabulary.from_instances(train_data + valid_data + test_data)

    bert_token_embedding = PretrainedBertEmbedder(
        'scibert/weights.tar.gz', requires_grad=args.tunable
    )

    word_embeddings = BasicTextFieldEmbedder(
        {"bert": bert_token_embedding},
        {"bert": ['bert']},
        allow_unmatched_keys=True
    )

    model = Baseline(word_embeddings, vocab)

    cuda_device = args.cuda_device

    if torch.cuda.is_available():
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('intervention', 'num_tokens')],
                              padding_noise=0.1)
    iterator.index_with(vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=test_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))

    test_metrics = evaluate(trainer.model, test_data, iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
Example #13
def train_lstm(train_dataset,
               batch_size,
               num_layers,
               use_elmo=False,
               epochs=15,
               bidirectional=True,
               learning_rate=3e-4,
               hidden_size=64,
               num_classes=2,
               use_gpu=False):
    """
    Trains an LSTM and its variants (vanilla, bidirectional, stacked BiLSTM) on train_dataset. Initialises word embeddings with pre-trained GloVe OR uses a pre-trained ELMo model to dynamically compute embeddings.


    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    batch_size: int
        number of Instances to process in a batch
    num_layers: int
        number of BiLSTM layers: 2 or higher for Stacked BiLSTMs
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=15)
    bidirectional: bool
        True for a bidirectional LSTM
    learning_rate: float
        learning rate for Adam Optimizer
    hidden_size: int
        size of the hidden layer in the encoder
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(tokens=['fic', 'non'],
                                      namespace="labels")
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)  # numericalize the data

    assert vocab.get_token_from_index(index=0, namespace='labels') == 'fic'
    assert vocab.get_token_from_index(index=1, namespace='labels') == 'non'
    print("\n\nThe ordering of labels is ['fic', 'non']\n\n")

    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
        nn.LSTM(word_embeddings.get_output_dim(),
                hidden_size,
                num_layers=num_layers,
                bidirectional=bidirectional,
                batch_first=True))

    classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(),
                                                    num_classes)
    model = models.Classifier(vocab=vocab,
                              word_embeddings=word_embeddings,
                              encoder=encoder,
                              classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      cuda_device=0 if use_gpu else -1,
                      num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
Example #14
def train_cnn(train_dataset,
              batch_size,
              num_filters,
              filter_sizes,
              use_elmo=False,
              epochs=15,
              learning_rate=3e-4,
              num_classes=2,
              use_gpu=False):
    """
    Trains a CNN on train_dataset. Initialises word embeddings with pre-trained GloVe OR uses a pre-trained ELMo model to dynamically compute embeddings.
    The CNN has one convolution layer for each ngram filter size.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of 'filters' learned by that layer
    filter_sizes: Tuple[int]
        specifies the number of convolutional layers and their sizes
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=15)
    learning_rate: float
        learning rate for Adam Optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(tokens=['fic', 'non'],
                                      namespace="labels")
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)  # numericalize the data

    assert vocab.get_token_from_index(index=0, namespace='labels') == 'fic'
    assert vocab.get_token_from_index(index=1, namespace='labels') == 'non'
    print("\n\nThe ordering of labels is ['fic', 'non']\n\n")

    encoder: Seq2VecEncoder = CnnEncoder(
        embedding_dim=word_embeddings.get_output_dim(),
        num_filters=num_filters,
        ngram_filter_sizes=filter_sizes)

    classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(),
                                                    num_classes)
    model = models.Classifier(vocab=vocab,
                              word_embeddings=word_embeddings,
                              encoder=encoder,
                              classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      cuda_device=0 if use_gpu else -1,
                      num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
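A hypothetical call, mirroring the CnnEncoder arguments above; the filter count and filter sizes are illustrative values, not taken from the source.

model, vocab, n_epochs = train_cnn(train_dataset,
                                   batch_size=32,
                                   num_filters=100,            # filters per ngram size
                                   filter_sizes=(2, 3, 4, 5),  # one conv layer per size
                                   use_elmo=True,              # dynamic ELMo embeddings
                                   epochs=15,
                                   use_gpu=torch.cuda.is_available())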
Example #15
def train_and_fit(args):
    fix_seed()
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model,
                                    args.lowercase_tokens)
    # read datasets
    reader = get_data_reader(weights_name,
                             args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)

    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}
    # build vocab
    if args.vocab_path:
        vocab = Vocabulary.from_files(args.vocab_path)
    else:
        vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={
                                              'tokens': 30000,
                                              'labels': args.target_vocab_size,
                                              'd_tags': 2
                                          },
                                          tokens_to_add=tokens_to_add)
    vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))

    print("Data is loaded")
    model = get_model(weights_name,
                      vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:
        model.load_state_dict(
            torch.load(
                os.path.join(args.pretrain_folder, args.pretrain + '.th')))

    model = model.to(device)

    print("Model is set")

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10)
    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(
        batch_size=args.batch_size,
        sorting_keys=[("tokens", "num_tokens")],
        biggest_batch_first=True,
        max_instances_in_memory=args.batch_size * 20000,
        instances_per_epoch=instances_per_epoch,
    )
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=dev_data,
                      serialization_dir=args.model_dir,
                      patience=args.patience,
                      num_epochs=args.n_epoch,
                      cuda_device=cuda_device,
                      shuffle=False,
                      accumulated_batch_count=args.accumulation_size,
                      cold_step_count=args.cold_steps_count,
                      cold_lr=args.cold_lr,
                      cuda_verbose_step=int(args.cuda_verbose_steps)
                      if args.cuda_verbose_steps else None)
    print("Start training")
    trainer.train()

    # Here's how to save the model.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped")
Example #16
def main():
    parser = utils.opt_parser.get_trainer_opt_parser()
    parser.add_argument('models',
                        nargs='*',
                        help='pretrained models for the same setting')
    parser.add_argument('--test', action="store_true", help='use testing mode')
    parser.add_argument('--num-layer',
                        type=int,
                        help='maximum number of stacked layers')
    parser.add_argument(
        '--use-ut',
        action="store_true",
        help='Use universal transformer instead of transformer')

    args = parser.parse_args()

    reader = data_adapter.GeoQueryDatasetReader()
    training_set = reader.read(config.DATASETS[args.dataset].train_path)
    try:
        validation_set = reader.read(config.DATASETS[args.dataset].dev_path)
    except Exception:
        validation_set = None

    vocab = allennlp.data.Vocabulary.from_instances(training_set)
    st_ds_conf = config.TRANS2SEQ_CONF[args.dataset]
    if args.num_layer:
        st_ds_conf['max_num_layers'] = args.num_layer
    if args.epoch:
        config.TRAINING_LIMIT = args.epoch
    if args.batch:
        st_ds_conf['batch_sz'] = args.batch
    bsz = st_ds_conf['batch_sz']
    emb_sz = st_ds_conf['emb_sz']

    src_embedder = BasicTextFieldEmbedder(
        token_embedders={
            "tokens": Embedding(vocab.get_vocab_size('nltokens'), emb_sz)
        })

    if args.use_ut:
        transformer_encoder = UTEncoder(
            input_dim=emb_sz,
            max_num_layers=st_ds_conf['max_num_layers'],
            num_heads=st_ds_conf['num_heads'],
            feedforward_hidden_dim=emb_sz,
            feedforward_dropout=st_ds_conf['feedforward_dropout'],
            attention_dropout=st_ds_conf['attention_dropout'],
            residual_dropout=st_ds_conf['residual_dropout'],
            use_act=st_ds_conf['act'],
            use_vanilla_wiring=st_ds_conf['vanilla_wiring'])
    else:
        transformer_encoder = TransformerEncoder(
            input_dim=emb_sz,
            num_layers=st_ds_conf['max_num_layers'],
            num_heads=st_ds_conf['num_heads'],
            feedforward_hidden_dim=emb_sz,
            feedforward_dropout=st_ds_conf['feedforward_dropout'],
            attention_dropout=st_ds_conf['attention_dropout'],
            residual_dropout=st_ds_conf['residual_dropout'],
        )

    model = allennlp.models.SimpleSeq2Seq(
        vocab,
        source_embedder=src_embedder,
        encoder=transformer_encoder,
        max_decoding_steps=50,
        attention=allennlp.modules.attention.DotProductAttention(),
        beam_size=6,
        target_namespace="lftokens",
        use_bleu=True)

    if args.models:
        model.load_state_dict(torch.load(args.models[0]))

    if not args.test or not args.models:
        iterator = BucketIterator(sorting_keys=[("source_tokens", "num_tokens")],
                                  batch_size=bsz)
        iterator.index_with(vocab)

        optim = torch.optim.Adam(model.parameters())

        savepath = os.path.join(
            config.SNAPSHOT_PATH, args.dataset, 'transformer2seq',
            datetime.datetime.now().strftime('%Y%m%d-%H%M%S') + "--" +
            args.memo)
        if not os.path.exists(savepath):
            os.makedirs(savepath, mode=0o755)

        trainer = allennlp.training.Trainer(
            model=model,
            optimizer=optim,
            iterator=iterator,
            train_dataset=training_set,
            validation_dataset=validation_set,
            serialization_dir=savepath,
            cuda_device=args.device,
            num_epochs=config.TRAINING_LIMIT,
        )

        trainer.train()

    else:
        testing_set = reader.read(config.DATASETS[args.dataset].test_path)
        model.eval()

        predictor = allennlp.predictors.SimpleSeq2SeqPredictor(model, reader)

        for instance in tqdm.tqdm(testing_set, total=len(testing_set)):
            print('SRC: ', instance.fields['source_tokens'].tokens)
            print(
                'GOLD:', ' '.join(
                    str(x)
                    for x in instance.fields['target_tokens'].tokens[1:-1]))
            del instance.fields['target_tokens']
            output = predictor.predict_instance(instance)
            print('PRED:', ' '.join(output['predicted_tokens']))
Example #17
    """)

    # MODEL
    reset_seed(args.SEED)
    model = create_model(vocab,
                         args.EMBEDDING_DIM,
                         args.HIDDEN_DIM,
                         TaskModel=BaseSequenceTagger,
                         encoder_type=args.ENCODER_RNN,
                         bidirectional=True,
                         wemb=args.W_EMB,
                         dropout=args.DROPOUT,
                         num_layers=args.N_LAYERS)

    # Training
    iterator = BucketIterator(batch_size=args.BATCH_SIZE,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)

    optimizer = OPTIM_config[args.OPTIM.lower()](model.parameters(),
                                                 lr=args.LR)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=d1_train_dataset,
        validation_dataset=d1_valid_dataset,
        patience=args.PATIENCE,
        num_epochs=args.EPOCHS,
        cuda_device=args.GPU,
        serialization_dir=None,
Example #18
    word_embeddings = BasicTextFieldEmbedder({"sentence": token_embedding})

    # init model
    model = SentimentClassifier(output_size=8,
                                hidden_size=256,
                                embedding_length=100,
                                bidirection=True,
                                attention_dim=64,
                                attention_hop=16,
                                connect_size=1024,
                                dropout=0.5,
                                word_embeddings=word_embeddings,
                                vocab=vocab)

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=10, sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      num_epochs=5,
                      patience=3,  # stop training before loss raise
                      cuda_device=-1
                      )

    # start train
    trainer.train()

    # save params
Example #19
sexism_test = pd.read_csv('TemporalCorpora\\test.csv')
sexism_train = pd.read_csv('TemporalCorpora\\train.csv')
sexism_full = pd.concat([sexism_test, sexism_train], ignore_index=True)
sexism_full.to_csv('sexist.csv', encoding='utf-8')
full_ds = reader.read('sexist.csv')
labels_enc = LabelEncoder()
y_full = labels_enc.fit_transform(sexism_full['Label'])
train_ds, test_ds, y_train, y_test = train_test_split(full_ds,
                                                      y_full,
                                                      shuffle=True,
                                                      stratify=y_full,
                                                      train_size=0.9,
                                                      test_size=0.1)
vocab = Vocabulary.from_instances(train_ds + test_ds)
iterator = BucketIterator(batch_size=32,
                          biggest_batch_first=True,
                          sorting_keys=[("tokens", "num_tokens")],
                          padding_noise=.15)
iterator.index_with(vocab)
batch = next(iter(iterator(train_ds)))
EMBEDDING_DIM = 256
HIDDEN_DIM = 64
# These files were trained by us; for a standard pretrained ELMo, use the published options and weights files instead
options_file = 'forELMO\\options.json'
weight_file = 'forELMO\\corp_trained.hdf5'
elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
lstm = PytorchSeq2VecWrapper(
    torch.nn.LSTM(word_embeddings.get_output_dim(),
                  HIDDEN_DIM,
                  batch_first=True,
                  bidirectional=True))
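The snippet stops after the encoder; following the pattern of the other examples on this page, a classification head could be a single linear layer sized to the encoder output (a sketch, with num_classes as an assumption):

num_classes = 2  # assumption: binary label set
classifier_feedforward = torch.nn.Linear(lstm.get_output_dim(), num_classes)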
Example #20
    HIDDEN_DIM = 200

    model_params = Params({
        'type': 'lstm',
        'input_size': EMBEDDING_DIM,
        'hidden_size': HIDDEN_DIM
    })

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embedding = BasicTextFieldEmbedder({'tokens': token_embedding})
    lstm = Seq2SeqEncoder.from_params(model_params)

    model = POSTagger(word_embedding, lstm, vocab)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[('sentence', 'num_tokens')])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=100)

    trainer.train()
Example #21
if args.optimizer.lower() == 'adam':
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           betas=(args.beta1, args.beta2))
elif args.optimizer.lower() == 'sgd':
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum)
elif args.optimizer.lower() == 'radam':
    optimizer = RAdam(model.parameters(),
                      lr=args.lr,
                      betas=(args.beta1, args.beta2))
else:
    raise ValueError('unsupported optimizer: ' + args.optimizer)
iterator = BucketIterator(batch_size=args.batch,
                          sorting_keys=[("source", "num_tokens")])
iterator.index_with(vocab)

#scheduler = _PyTorchLearningRateSchedulerWrapper(ReduceLROnPlateau(optimizer, patience=4))

if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
    print('using gpu')
else:
    cuda_device = -1
    print('using cpu')

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
Example #22
def _build_trainer(config, model, vocab, train_data, valid_data):
    optimizer = optim.AdamW(model.parameters(), lr=config.trainer.lr)
    scheduler = None

    is_bert_based = any(
        model.name.endswith('bert') for model in config.embedder.models)
    is_trainable_elmo_based = any(
        model.name == 'elmo' and model.params['requires_grad']
        for model in config.embedder.models)

    if is_bert_based or is_trainable_elmo_based:

        def _is_pretrained_param(name):
            return 'transformer_model' in name or '_elmo_lstm' in name

        pretrained_params, non_pretrained_params = [], []
        for name, param in model.named_parameters():
            if _is_pretrained_param(name):
                logger.info('Pretrained param: %s', name)
                pretrained_params.append(param)
            else:
                logger.info('Non-pretrained param: %s', name)
                non_pretrained_params.append(param)

        optimizer = optim.AdamW([{
            'params': pretrained_params,
            'lr': config.trainer.bert_lr
        }, {
            'params': non_pretrained_params,
            'lr': config.trainer.lr
        }, {
            'params': []
        }])

        scheduler = SlantedTriangular(
            optimizer=optimizer,
            num_epochs=config.trainer.num_epochs,
            num_steps_per_epoch=len(train_data) / config.trainer.batch_size,
            cut_frac=config.trainer.cut_frac,
            gradual_unfreezing=config.trainer.gradual_unfreezing,
            discriminative_fine_tuning=config.trainer.discriminative_fine_tuning)

    logger.info('Trainable params:')
    for name, param in model.named_parameters():
        if param.requires_grad:
            logger.info('\t' + name)

    iterator = BucketIterator(batch_size=config.trainer.batch_size)
    iterator.index_with(vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
        logger.info('Using cuda')
    else:
        cuda_device = -1
        logger.info('Using cpu')

    logger.info('Example batch:')
    _log_batch(next(iterator(train_data)))

    if is_bert_based:
        train_data = _filter_data(train_data, vocab)
        valid_data = _filter_data(valid_data, vocab)

    return Trainer(model=model,
                   optimizer=optimizer,
                   iterator=iterator,
                   train_dataset=train_data,
                   validation_dataset=valid_data,
                   validation_metric='+MeanAcc',
                   patience=config.trainer.patience,
                   num_epochs=config.trainer.num_epochs,
                   cuda_device=cuda_device,
                   grad_clipping=5.,
                   learning_rate_scheduler=scheduler,
                   serialization_dir=os.path.join(config.data.models_dir,
                                                  config.model_name),
                   should_log_parameter_statistics=False,
                   should_log_learning_rate=False,
                   num_gradient_accumulation_steps=config.trainer.num_gradient_accumulation_steps)
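# A minimal, self-contained sketch of the two-learning-rate idea used in
# _build_trainer above: parameters whose names look pretrained get a smaller
# learning rate than the freshly initialised head. The toy module, the name
# 'transformer_model', and the LR values are illustrative assumptions, not
# taken from the source project.
from torch import nn, optim

toy = nn.ModuleDict({
    'transformer_model': nn.Linear(8, 8),   # stands in for a pretrained encoder
    'classifier': nn.Linear(8, 2),          # stands in for the task head
})

pretrained, fresh = [], []
for name, param in toy.named_parameters():
    (pretrained if 'transformer_model' in name else fresh).append(param)

optimizer = optim.AdamW([
    {'params': pretrained, 'lr': 1e-5},     # conservative LR for pretrained weights
    {'params': fresh, 'lr': 1e-3},          # larger LR for the new head
])
print([g['lr'] for g in optimizer.param_groups])   # [1e-05, 0.001]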
Example #23
def train_cnn(train_dataset, validation_dataset, batch_size, num_filters, filter_sizes, double_input=False,
              dense_vector=False, col_name=None, use_elmo=False, epochs=30, patience=5, learning_rate=3e-4, num_classes=2,
              use_gpu=False):
    """
    Trains CNN on train_dataset; optionally, perform early stopping based on validation loss. Initialises word embeddings with pre-trained GloVe OR uses pre-trained ELMo model to dynamically compute embeddings.
    The CNN has one convolution layer for each ngram filter size.

    Functionality to run it for (1) Single Input: reply/question, (2) Double Input: reply + context comment,
    (3) Dense Vector + reply/question, and (4) Dense Vector + reply + context comment.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    validation_dataset: List[Instance]
        Instances for validation set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of 'filters' learned by that layer
    filter_sizes: Tuple[int]
        specifies the number of convolutional layers and their sizes
    double_input: bool
        True to run DoubleInput classifier | False (default) for SingleInput classifier
    dense_vector: bool
        True to concatenate dense feature vector before feeding to the FeedForward layer
    col_name: str
        'reply_text' or 'question' (for calculating dense feature vector) | Only applicable when dense_vector is True
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=30)
    patience: int or None
        early stopping - number of epochs to wait for validation loss to improve (default=5). If 'None': disables early stopping, and uses train+validation set for training
    learning_rate: float
        learning rate for Adam Optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    if double_input: # need context_tokens as well
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens"),
                                                ("context_tokens", "num_tokens")])

    else: # only reply_tokens
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens")])

    iterator.index_with(vocab) # numericalize the data

    if double_input: # DoubleInput Classifier: two CNN encoders
        cnn_reply: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                               num_filters=num_filters,
                                               ngram_filter_sizes=filter_sizes)

        cnn_context: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                                 num_filters=num_filters,
                                                 ngram_filter_sizes=filter_sizes)
        if dense_vector: # add length of dense vector to input dimension of Feedforward
            ff_input_dim = 2 * (cnn_reply.get_output_dim() + DENSE_VECTOR_LEN)
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseDoubleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=cnn_reply,
                                                 context_encoder=cnn_context,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)

        else:
            classifier_feedforward: FeedForward = nn.Linear(2 * cnn_reply.get_output_dim(), num_classes)
            model = models.DoubleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=cnn_reply,
                                                 context_encoder=cnn_context,
                                                 classifier_feedforward=classifier_feedforward)


    else: # SingleInput Classifier: one CNN encoder
        encoder: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                             num_filters=num_filters,
                                             ngram_filter_sizes=filter_sizes)

        if dense_vector: # add length of dense vector to input dimension of Feedforward
            ff_input_dim = encoder.get_output_dim() + DENSE_VECTOR_LEN
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseSingleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)

        else:
            classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(), num_classes)
            model = models.SingleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward)


    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    if patience is None: # Train on both train+validation dataset if patience is None
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset + validation_dataset,
            cuda_device=0 if use_gpu else -1,
            num_epochs=epochs)

    else:
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset,
            validation_dataset=validation_dataset,
            cuda_device=0 if use_gpu else -1,
            patience=patience, # stop if loss does not improve for 'patience' epochs
            num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
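# Quick shape sanity check for the CNN encoder configuration used in train_cnn
# (a hedged sketch, assuming AllenNLP's CnnEncoder is importable as in the code
# above): with no explicit output projection, get_output_dim() is
# num_filters * len(ngram_filter_sizes).
import torch
from allennlp.modules.seq2vec_encoders import CnnEncoder

enc = CnnEncoder(embedding_dim=50, num_filters=100, ngram_filter_sizes=(2, 3, 4))
tokens = torch.randn(8, 20, 50)                    # (batch, seq_len, embedding_dim)
mask = torch.ones(8, 20, dtype=torch.bool)         # toy batch with no padding
print(enc.get_output_dim())                        # 300
print(enc(tokens, mask).shape)                     # torch.Size([8, 300])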
Example #24
def run_model(args):
    st_ds_conf = get_updated_settings(args)
    reader = data_adapter.GeoQueryDatasetReader()
    training_set = reader.read(config.DATASETS[args.dataset].train_path)
    try:
        validation_set = reader.read(config.DATASETS[args.dataset].dev_path)
    except Exception:  # fall back when no dev split can be read for this dataset
        validation_set = None

    vocab = allennlp.data.Vocabulary.from_instances(training_set)
    model = get_model(vocab, st_ds_conf)
    device_tag = "cpu" if config.DEVICE < 0 else f"cuda:{config.DEVICE}"
    if args.models:
        model.load_state_dict(
            torch.load(args.models[0], map_location=device_tag))

    if not args.test or not args.models:
        iterator = BucketIterator(sorting_keys=[("source_tokens", "num_tokens")
                                                ],
                                  batch_size=st_ds_conf['batch_sz'])
        iterator.index_with(vocab)

        optim = torch.optim.Adam(model.parameters(),
                                 lr=config.ADAM_LR,
                                 betas=config.ADAM_BETAS,
                                 eps=config.ADAM_EPS)
        if args.fine_tune:
            optim = torch.optim.SGD(model.parameters(), lr=config.SGD_LR)

        savepath = os.path.join(
            config.SNAPSHOT_PATH, args.dataset, 'unc_s2s',
            datetime.datetime.now().strftime('%Y%m%d-%H%M%S') + "--" +
            args.memo)
        if not os.path.exists(savepath):
            os.makedirs(savepath, mode=0o755)

        trainer = allennlp.training.Trainer(
            model=model,
            optimizer=optim,
            iterator=iterator,
            train_dataset=training_set,
            validation_dataset=validation_set,
            serialization_dir=savepath,
            cuda_device=config.DEVICE,
            num_epochs=config.TRAINING_LIMIT,
            grad_clipping=config.GRAD_CLIPPING,
            num_serialized_models_to_keep=-1,
        )

        trainer.train()

    else:
        if args.test_on_val:
            testing_set = reader.read(config.DATASETS[args.dataset].dev_path)
        else:
            testing_set = reader.read(config.DATASETS[args.dataset].test_path)

        model.eval()
        model.skip_loss = True  # skip loss computation on testing set for faster evaluation

        if config.DEVICE > -1:
            model = model.cuda(config.DEVICE)

        # batch testing
        iterator = BucketIterator(sorting_keys=[("source_tokens", "num_tokens")
                                                ],
                                  batch_size=st_ds_conf['batch_sz'])
        iterator.index_with(vocab)
        eval_generator = iterator(testing_set, num_epochs=1, shuffle=False)
        for batch in tqdm.tqdm(eval_generator,
                               total=iterator.get_num_batches(testing_set)):
            batch = move_to_device(batch, config.DEVICE)
            output = model(**batch)
        metrics = model.get_metrics()
        print(metrics)

        if args.dump_test:

            predictor = allennlp.predictors.SimpleSeq2SeqPredictor(
                model, reader)

            for instance in tqdm.tqdm(testing_set, total=len(testing_set)):
                print('SRC: ', instance.fields['source_tokens'].tokens)
                print(
                    'GOLD:', ' '.join(
                        str(x) for x in
                        instance.fields['target_tokens'].tokens[1:-1]))
                del instance.fields['target_tokens']
                output = predictor.predict_instance(instance)
                print('PRED:', ' '.join(output['predicted_tokens']))
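# run_model's batched test loop above runs forward passes with autograd enabled;
# evaluation is usually wrapped in torch.no_grad() so no graph is retained.
# A tiny self-contained sketch of that pattern (toy model and data, not the
# project's classes):
import torch
from torch import nn

toy_model = nn.Linear(4, 2)
toy_model.eval()
batches = [torch.randn(3, 4) for _ in range(5)]

with torch.no_grad():                       # disable gradient tracking for eval
    for batch in batches:
        output = toy_model(batch)
        assert not output.requires_grad     # nothing is kept for backprop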
Example #25
def train_bert(train_dataset, validation_dataset, batch_size, pretrained_model, double_input=False, dense_vector=False,
               col_name=None, epochs=100, patience=None, learning_rate=3e-4, num_classes=2, use_gpu=False):
    """
    Trains BERT on train_dataset; with optional early stopping on validation_dataset.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    validation_dataset: List[Instance]
        Instances for validation set
    batch_size: int
        number of Instances to process in a batch
    pretrained_model: str
        pretrained BERT model to use
    double_input: bool
        True to run DoubleInput classifier | False (default) for SingleInput classifier
    dense_vector: bool
        True to concatenate dense feature vector before feeding to the FeedForward layer
    col_name: str
        'reply_text' or 'question' (for calculating dense feature vector) | Only applicable when dense_vector is True
    epochs: int
        total number of epochs to train on (default=100)
    patience: int or None
        early stopping - number of epochs to wait for validation loss to improve (default=None). If 'None': disables early stopping, and uses train+validation set for training
    learning_rate: float
        learning rate for Adam Optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    vocab = Vocabulary()

    if double_input: # need context_tokens as well
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens"),
                                                ("context_tokens", "num_tokens")])

    else: # only reply_tokens
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens")])

    iterator.index_with(vocab) # numericalize the data

    word_embeddings: TextFieldEmbedder = load_bert_embeddings(pretrained_model)
    encoder: Seq2VecEncoder = BertPooler(pretrained_model=pretrained_model,
                                         requires_grad=True)

    if double_input: # consider preceding 'comment_text'; the same BertPooler instance is shared (tied weights) between reply and context
        if dense_vector: # add length of dense vector to input dimension of Feedforward
            ff_input_dim = 2 * (encoder.get_output_dim() + DENSE_VECTOR_LEN)
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseDoubleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=encoder,
                                                 context_encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)

        else:
            classifier_feedforward: FeedForward = nn.Linear(2*encoder.get_output_dim(), num_classes)
            model = models.DoubleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=encoder,
                                                 context_encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward)
    else: # only 'reply_text' or 'question'
        if dense_vector: # add length of dense vector to input dimension of Feedforward
            ff_input_dim = encoder.get_output_dim() + DENSE_VECTOR_LEN
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseSingleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)

        else:
            # Feedforward:
            classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(), num_classes)

            model = models.SingleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    if patience is None: # No early stopping: train on both train+validation dataset
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset + validation_dataset,
            cuda_device=0 if use_gpu else -1,
            num_epochs=epochs)

    else:
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset,
            validation_dataset=validation_dataset,
            cuda_device=0 if use_gpu else -1,
            patience=patience, # stop if loss does not improve for 'patience' epochs
            num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
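# Sketch of the feed-forward sizing logic used in train_bert for the
# double-input + dense-vector case: the classifier input is
# 2 * (encoder_dim + DENSE_VECTOR_LEN). The concrete numbers below are
# illustrative assumptions, not values from the source.
import torch
from torch import nn

enc_dim, dense_len, num_classes = 768, 20, 2
reply_vec = torch.randn(4, enc_dim + dense_len)    # encoded reply + dense features
context_vec = torch.randn(4, enc_dim + dense_len)  # encoded context + dense features
features = torch.cat([reply_vec, context_vec], dim=-1)

classifier = nn.Linear(2 * (enc_dim + dense_len), num_classes)
print(classifier(features).shape)                  # torch.Size([4, 2])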
Example #26
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    train_dataset = reader.read('train.txt')
    dev_dataset = reader.read('dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    # optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=20,
                      num_epochs=1000)
    trainer.train()
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    day = 12
    while day <= 30:
        # 0,1,2,3,text,source,6,7,8,9,10,favourites_count,12,13,14,15,followers_count,friends_count,18,19,20,lang
        total = 0
        res = {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}
        with open(f'2020-03-{day} Coronavirus Tweets.CSV', 'r') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                lang = row[-1]
                if lang != 'en':
                    continue
                source = row[5]
                if source == 'Twitter for Advertisers':
                    continue
                followers_count = row[16]
                friends_count = row[17]
                try:
                    followers_count = int(followers_count)
                    friends_count = int(friends_count)
                    if friends_count > followers_count * 80:
                        continue
                except Exception:
                    print("Cannot get friends and follower")
                content = clean_tweets(row[4])
                if not content:
                    continue
                try:
                    if content.count('#') >= 5:
                        continue
                except Exception:
                    print("Cannot get hash tag")
                total += 1
                try:
                    fav = int(row[11])
                except Exception:
                    fav = 0  # keep fav defined so the favourite weighting below still works
                    print("Cannot get favourite count")
                try:
                    logits = predictor.predict(content)['logits']
                    label_id = np.argmax(logits)
                    lab = model.vocab.get_token_from_index(label_id, 'labels')
                    res[lab] += 1
                    total += fav
                    res[lab] += fav
                except Exception:
                    print(f"Error in {row[4]}")
        print(f"Day {day}: Total: {total} tweets")
        print(f"Day {day}: Strongly negative: {100 * res['0'] / total:.2f}%")
        print(f"Day {day}: Weakly   negative: {100 * res['1'] / total:.2f}%")
        print(f"Day {day}: Neutral          : {100 * res['2'] / total:.2f}%")
        print(f"Day {day}: Weakly   positive: {100 * res['3'] / total:.2f}%")
        print(f"Day {day}: Strongly positive: {100 * res['4'] / total:.2f}%")
        # open in append mode so earlier days are kept, and end each entry with a newline
        with open('tweets.log', 'a') as log:
            log.write(f"Day {day}: Total: {total} tweets\n")
            log.write(f"Day {day}: Strongly negative: {100 * res['0'] / total:.2f}%\n")
            log.write(f"Day {day}: Weakly   negative: {100 * res['1'] / total:.2f}%\n")
            log.write(f"Day {day}: Neutral          : {100 * res['2'] / total:.2f}%\n")
            log.write(f"Day {day}: Weakly   positive: {100 * res['3'] / total:.2f}%\n")
            log.write(f"Day {day}: Strongly positive: {100 * res['4'] / total:.2f}%\n")
        day += 1
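# Tiny self-contained sketch of the per-day aggregation above: per-label counts
# are turned into percentages of the running total (toy counts; the '0'..'4'
# keys mirror the res dict used in the loop).
res = {'0': 12, '1': 30, '2': 80, '3': 45, '4': 33}
total = sum(res.values())
for label in sorted(res):
    print(f"label {label}: {100 * res[label] / total:.2f}%")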
Example #27
    def __init__(self, training=False):
        self.training = training
        config = conf['seq2seq_allen']
        prefix = config['processed_data_prefix']
        train_file = config['train_data']
        valid_file = config['valid_data']
        src_embedding_dim = config['src_embedding_dim']
        hidden_dim = config['hidden_dim']
        batch_size = config['batch_size']
        epoch = config['epoch']
        self.model_path = config['model']

        if torch.cuda.is_available():
            cuda_device = 0
        else:
            cuda_device = -1

        # Define the dataset reader. WordTokenizer splits on whitespace; giving the target its own namespace keeps the output-layer vocabulary separate from the source vocabulary.
        self.reader = MySeqDatasetReader(
            source_tokenizer=WordTokenizer(),
            target_tokenizer=WordTokenizer(),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={
                'tokens': SingleIdTokenIndexer(namespace='target_tokens')
            })

        if training and self.model_path is not None:
            # Read the data from files
            self.train_dataset = self.reader.read(
                os.path.join(prefix, train_file))
            self.valid_dataset = self.reader.read(
                os.path.join(prefix, valid_file))

            # Build the vocabulary
            self.vocab = Vocabulary.from_instances(self.train_dataset +
                                                   self.valid_dataset,
                                                   min_count={
                                                       'tokens': 3,
                                                       'target_tokens': 3
                                                   })
        elif not training:
            try:
                self.vocab = Vocabulary.from_files(self.model_path)
            except Exception as e:
                logger.exception('vocab file does not exist!')

                # Read the data from files
                self.train_dataset = self.reader.read(
                    os.path.join(prefix, train_file))
                self.valid_dataset = self.reader.read(
                    os.path.join(prefix, valid_file))

                # Build the vocabulary
                self.vocab = Vocabulary.from_instances(self.train_dataset +
                                                       self.valid_dataset,
                                                       min_count={
                                                           'tokens': 3,
                                                           'target_tokens': 3
                                                       })

        # Define the embedding layer
        src_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size('tokens'),
            embedding_dim=src_embedding_dim)

        # Define the encoder; a bidirectional GRU (BiGRU) is used here
        encoder = PytorchSeq2SeqWrapper(
            torch.nn.GRU(src_embedding_dim,
                         hidden_dim // 2,
                         batch_first=True,
                         bidirectional=True))

        # Define the decoder; a unidirectional GRU is used, since the decoder's input size must match the encoder's output size
        decoder = PytorchSeq2SeqWrapper(
            torch.nn.GRU(hidden_dim, hidden_dim, batch_first=True))
        # Map token indices to embeddings; the 'tokens' key matches the TokenIndexer used in the dataset reader
        source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

        # Linear attention layer
        attention = LinearAttention(hidden_dim,
                                    hidden_dim,
                                    activation=Activation.by_name('tanh')())

        # Define the model
        self.model = Seq2SeqKnu(vocab=self.vocab,
                                source_embedder=source_embedder,
                                encoder=encoder,
                                target_namespace='target_tokens',
                                decoder=decoder,
                                attention=attention,
                                max_decoding_steps=20,
                                cuda_device=cuda_device)

        # Decide whether we are training or loading a saved model
        if training and self.model_path is not None:
            optimizer = optim.Adam(self.model.parameters())
            # sorting_keys controls how instances are sorted when forming batches
            iterator = BucketIterator(batch_size=batch_size,
                                      sorting_keys=[("source_tokens",
                                                     "num_tokens")])
            # The iterator needs the vocabulary so it can index the data during training
            iterator.index_with(self.vocab)

            if cuda_device >= 0:
                self.model.cuda(cuda_device)

            # Define the trainer
            self.trainer = Trainer(model=self.model,
                                   optimizer=optimizer,
                                   iterator=iterator,
                                   patience=10,
                                   validation_metric="+accuracy",
                                   train_dataset=self.train_dataset,
                                   validation_dataset=self.valid_dataset,
                                   serialization_dir=self.model_path,
                                   num_epochs=epoch,
                                   cuda_device=cuda_device)
        elif not training:
            with open(os.path.join(self.model_path, 'best.th'), 'rb') as f:
                self.model.load_state_dict(torch.load(f))
            if cuda_device >= 0:
                self.model.cuda(cuda_device)
            self.predictor = MySeqPredictor(self.model,
                                            dataset_reader=self.reader)
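# Why the encoder above uses hidden_dim // 2: a bidirectional GRU concatenates
# its forward and backward hidden states, so each direction gets half of the
# target dimensionality and the output matches the decoder's hidden_dim.
# Self-contained sketch with toy sizes (illustrative numbers only):
import torch
from torch import nn

hidden_dim, emb_dim = 64, 32
gru = nn.GRU(emb_dim, hidden_dim // 2, batch_first=True, bidirectional=True)
out, _ = gru(torch.randn(4, 10, emb_dim))
print(out.shape)    # torch.Size([4, 10, 64]) -- full hidden_dim after concatenation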
Example #28
def main():
    parser = utils.opt_parser.get_trainer_opt_parser()
    parser.add_argument('models',
                        nargs='*',
                        help='pretrained models for the same setting')
    parser.add_argument('--test', action="store_true", help='use testing mode')
    parser.add_argument('--num-layer',
                        type=int,
                        help="stacked layer of transformer model")

    args = parser.parse_args()

    reader = data_adapter.GeoQueryDatasetReader()
    training_set = reader.read(config.DATASETS[args.dataset].train_path)
    try:
        validation_set = reader.read(config.DATASETS[args.dataset].dev_path)
    except Exception:  # fall back when no dev split can be read for this dataset
        validation_set = None

    vocab = allennlp.data.Vocabulary.from_instances(training_set)
    st_ds_conf = config.TRANSFORMER_CONF[args.dataset]
    if args.num_layer:
        st_ds_conf['num_layers'] = args.num_layer

    encoder = TransformerEncoder(
        input_dim=st_ds_conf['emb_sz'],
        num_layers=st_ds_conf['num_layers'],
        num_heads=st_ds_conf['num_heads'],
        feedforward_hidden_dim=st_ds_conf['emb_sz'],
    )
    decoder = TransformerDecoder(
        input_dim=st_ds_conf['emb_sz'],
        num_layers=st_ds_conf['num_layers'],
        num_heads=st_ds_conf['num_heads'],
        feedforward_hidden_dim=st_ds_conf['emb_sz'],
        feedforward_dropout=0.1,
    )
    source_embedding = allennlp.modules.Embedding(
        num_embeddings=vocab.get_vocab_size('nltokens'),
        embedding_dim=st_ds_conf['emb_sz'])
    target_embedding = allennlp.modules.Embedding(
        num_embeddings=vocab.get_vocab_size('lftokens'),
        embedding_dim=st_ds_conf['emb_sz'])
    model = ParallelSeq2Seq(
        vocab=vocab,
        encoder=encoder,
        decoder=decoder,
        source_embedding=source_embedding,
        target_embedding=target_embedding,
        target_namespace='lftokens',
        start_symbol=START_SYMBOL,
        eos_symbol=END_SYMBOL,
        max_decoding_step=st_ds_conf['max_decoding_len'],
    )

    if args.models:
        model.load_state_dict(torch.load(args.models[0]))

    if not args.test or not args.models:
        iterator = BucketIterator(sorting_keys=[("source_tokens", "num_tokens")
                                                ],
                                  batch_size=st_ds_conf['batch_sz'])
        iterator.index_with(vocab)

        optim = torch.optim.Adam(model.parameters())

        savepath = os.path.join(
            config.SNAPSHOT_PATH, args.dataset, 'transformer',
            datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if not os.path.exists(savepath):
            os.makedirs(savepath, mode=0o755)

        trainer = allennlp.training.Trainer(
            model=model,
            optimizer=optim,
            iterator=iterator,
            train_dataset=training_set,
            validation_dataset=validation_set,
            serialization_dir=savepath,
            cuda_device=args.device,
            num_epochs=config.TRAINING_LIMIT,
        )

        trainer.train()

    else:
        testing_set = reader.read(config.DATASETS[args.dataset].test_path)
        model.eval()

        predictor = allennlp.predictors.SimpleSeq2SeqPredictor(model, reader)

        for instance in testing_set:
            print('SRC: ', instance.fields['source_tokens'].tokens)
            print(
                'GOLD:', ' '.join(
                    str(x)
                    for x in instance.fields['target_tokens'].tokens[1:-1]))
            del instance.fields['target_tokens']
            output = predictor.predict_instance(instance)
            print('PRED:', ' '.join(output['predicted_tokens']))
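# For reference: START_SYMBOL and END_SYMBOL passed to ParallelSeq2Seq above come
# from AllenNLP's common utilities; the GOLD print strips them via the [1:-1]
# slice. A quick check (assuming allennlp is installed, as the snippet already requires):
from allennlp.common.util import START_SYMBOL, END_SYMBOL
print(START_SYMBOL, END_SYMBOL)   # @start@ @end@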
Example #29
vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                  max_vocab_size=config.max_vocab_size)
token_embedder = Embedding.from_params(vocab=vocab,
                                       params=Params({
                                           'pretrained_file':
                                           'glove.twitter.27B.50d.txt',
                                           'embedding_dim': 50
                                       }))
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedder})

# Iterator: batches the data and prepares it for input
from allennlp.data.iterators import BucketIterator

iterator = BucketIterator(batch_size=config.batch_size,
                          sorting_keys=[("tokens", "num_tokens")],
                          max_instances_in_memory=512)
iterator.index_with(vocab)

# Note: nn.LSTM applies dropout only between stacked layers, so with the
# default num_layers=1 the dropout=.25 argument has no effect (PyTorch warns).
lstm = PytorchSeq2VecWrapper(
    nn.LSTM(word_embeddings.get_output_dim(),
            config.hidden_sz,
            bidirectional=True,
            batch_first=True,
            dropout=.25))

model = LSTM_Model(word_embeddings, lstm, 2)
optimizer = optim.SGD(model.parameters(), lr=config.lr)

trainer = Trainer(
    model=model,
Example #30
def _build_trainer(config, model, vocab, train_data, valid_data):
    optimizer = optim.AdamW(model.parameters(), lr=config.trainer.lr)
    scheduler = None

    is_bert_based = any(
        emb_model.name.endswith('bert') for emb_model in config.embedder.models)
    is_trainable_elmo_based = any(
        emb_model.name == 'elmo' and emb_model.params['requires_grad']
        for emb_model in config.embedder.models)

    if is_bert_based or is_trainable_elmo_based:
        params_list = []
        non_pretrained_params = []
        if is_bert_based:
            bert_groups = (['transformer_model.embeddings.'] +
                           [f'transformer_model.encoder.layer.{i}.'
                            for i in range(12)] +
                           ['transformer_model.pooler.'])
            bert_group2params = {bg: [] for bg in bert_groups}
            for name, param in model.named_parameters():
                is_bert_layer = False
                for bg in bert_groups:
                    if bg in name:
                        is_bert_layer = True
                        bert_group2params[bg].append(param)
                        logger.info('Param: %s assigned to %s group', name, bg)
                        break
                if not is_bert_layer:
                    non_pretrained_params.append(param)
                    logger.info('Param: %s assigned to non_pretrained group',
                                name)
            for bg in bert_groups:
                params_list.append({
                    'params': bert_group2params[bg],
                    'lr': config.trainer.bert_lr
                })
            params_list.append({
                'params': non_pretrained_params,
                'lr': config.trainer.lr
            })
            params_list.append({'params': []})
        elif is_trainable_elmo_based:
            pretrained_params = []
            for name, param in model.named_parameters():
                if '_elmo_lstm' in name:
                    logger.info('Pretrained param: %s', name)
                    pretrained_params.append(param)
                else:
                    logger.info('Non-pretrained param: %s', name)
                    non_pretrained_params.append(param)
            params_list = [{
                'params': pretrained_params,
                'lr': config.trainer.bert_lr
            }, {
                'params': non_pretrained_params,
                'lr': config.trainer.lr
            }, {
                'params': []
            }]
        optimizer = optim.AdamW(params_list)
        scheduler = SlantedTriangular(
            optimizer=optimizer,
            num_epochs=config.trainer.num_epochs,
            num_steps_per_epoch=len(train_data) / config.trainer.batch_size,
            cut_frac=config.trainer.cut_frac,
            gradual_unfreezing=config.trainer.gradual_unfreezing,
            discriminative_fine_tuning=config.trainer.discriminative_fine_tuning)

    logger.info('Trainable params:')
    for name, param in model.named_parameters():
        if param.requires_grad:
            logger.info('\t' + name)

    iterator = BucketIterator(batch_size=config.trainer.batch_size)
    iterator.index_with(vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
        logger.info('Using cuda')
    else:
        cuda_device = -1
        logger.info('Using cpu')

    logger.info('Example batch:')
    _log_batch(next(iterator(train_data)))

    if is_bert_based:
        train_data = _filter_data(train_data, vocab)
        valid_data = _filter_data(valid_data, vocab)

    return Trainer(model=model,
                   optimizer=optimizer,
                   iterator=iterator,
                   train_dataset=train_data,
                   validation_dataset=valid_data,
                   validation_metric='+MeanAcc',
                   patience=config.trainer.patience,
                   num_epochs=config.trainer.num_epochs,
                   cuda_device=cuda_device,
                   grad_clipping=5.,
                   learning_rate_scheduler=scheduler,
                   serialization_dir=os.path.join(config.data.models_dir,
                                                  config.model_name),
                   should_log_parameter_statistics=False,
                   should_log_learning_rate=False,
                   num_gradient_accumulation_steps=config.trainer.num_gradient_accumulation_steps)
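# The per-layer BERT groups above feed SlantedTriangular's discriminative
# fine-tuning, which (ULMFiT-style) is meant to give earlier groups smaller
# learning rates. A toy, self-contained sketch of that layer-wise decay idea
# (decay factor, base LR and the stand-in layers are illustrative assumptions):
from torch import nn, optim

layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(4)])   # stand-ins for encoder layers
base_lr, decay = 1e-3, 0.5
groups = [{'params': layer.parameters(),
           'lr': base_lr * decay ** (len(layers) - 1 - i)}    # deeper layers get larger LRs
          for i, layer in enumerate(layers)]
optimizer = optim.AdamW(groups)
print([round(g['lr'], 6) for g in optimizer.param_groups])    # [0.000125, 0.00025, 0.0005, 0.001]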