Example #1
def train(args,
          model,
          dataset_reader,
          train_loader,
          device=0,
          val_loader=None,
          test_data=None,
          num_epochs=10,
          patience=None,
          serialization_dir=None):
    optimizer = AdamOptimizer(model.named_parameters(),
                              lr=args.lr,
                              weight_decay=args.l2)
    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_loader,
        validation_data_loader=val_loader,
        cuda_device=device,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        patience=patience,
        grad_clipping=args.clip,
    )
    trainer.train()

    if test_data is not None:
        predictor = Seq2SeqPredictor(model, dataset_reader)
        for instance in itertools.islice(test_data, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:',
                  predictor.predict_instance(instance)['predicted_tokens'])
            print('-' * 50)
Example #2
def main():
    reader = SkipGramReader()
    text8 = reader.read('data/text8/text8')

    vocab = Vocabulary.from_instances(text8,
                                      min_count={
                                          'token_in': 5,
                                          'token_out': 5
                                      })

    reader = SkipGramReader(vocab=vocab)
    text8 = reader.read('data/text8/text8')
    text8.index_with(vocab)

    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)

    data_loader = DataLoader(text8, batch_size=BATCH_SIZE)

    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)

    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = GradientDescentTrainer(model=model,
                                     optimizer=optimizer,
                                     data_loader=data_loader,
                                     num_epochs=5,
                                     cuda_device=CUDA_DEVICE)
    trainer.train()

    # write_embeddings(embedding_in, 'data/text8/embeddings.txt', vocab)
    print(get_synonyms('one', embedding_in, vocab))
    print(get_synonyms('december', embedding_in, vocab))
    print(get_synonyms('flower', embedding_in, vocab))
    print(get_synonyms('design', embedding_in, vocab))
    print(get_synonyms('snow', embedding_in, vocab))

    rho = evaluate_embeddings(embedding_in, vocab)
    print('simlex999 spearman correlation: {}'.format(rho))
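
The `get_synonyms` and `evaluate_embeddings` helpers are not part of this snippet. Below is a minimal sketch of what `get_synonyms` might look like, assuming it ranks the `token_in` vocabulary by cosine similarity to the query word; the real helper's signature may differ.

import torch
from allennlp.data import Vocabulary
from allennlp.modules.token_embedders import Embedding


def get_synonyms(token: str, embedding: Embedding, vocab: Vocabulary, num_synonyms: int = 10):
    """Return the num_synonyms tokens closest to `token` by cosine similarity (sketch)."""
    token_id = vocab.get_token_index(token, 'token_in')
    token_vec = embedding.weight[token_id]
    cosine = torch.nn.CosineSimilarity(dim=1)
    sims = cosine(token_vec.unsqueeze(0), embedding.weight)
    # skip the top-ranked index, which is the query token itself
    best = sims.argsort(descending=True)[1:num_synonyms + 1]
    return [(vocab.get_token_from_index(i.item(), 'token_in'), sims[i].item()) for i in best]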
Example #3
    def __call__(
        self,
        trainer: GradientDescentTrainer,
        metrics: Dict[str, Any],
        epoch: int,
        is_master: bool,
    ):
        """Callback call implementation."""
        batch = next(iter(trainer._validation_data_loader))
        outputs = trainer.model.make_output_human_readable(
                trainer.batch_outputs(batch, for_training=False),
        )['predicted_sentences']
        idx = random.randrange(0, len(outputs))

        vocab = trainer.model.vocab
        removal_tokens = {START_SYMBOL, END_SYMBOL, vocab._padding_token}

        pred_sentence = outputs[idx]
        source_sentence = ' '.join(
            [
                vocab.get_token_from_index(tidx.item())
                for tidx in batch['source_tokens']['tokens']['tokens'][idx]
                if vocab.get_token_from_index(tidx.item()) not in removal_tokens
            ],
        )

        logger.info('{0} -> {1}'.format(source_sentence, pred_sentence))
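
This `__call__` signature matches the epoch-callback hook of AllenNLP 1.x, which is handed to the trainer at construction time. A minimal registration sketch, assuming the class above is named `LogRandomValidationSample` (the actual class name is not shown in the snippet):

from allennlp.data import DataLoader
from allennlp.models import Model
from allennlp.training import GradientDescentTrainer
from allennlp.training.optimizers import AdamOptimizer


def build_trainer_with_callback(model: Model,
                                train_loader: DataLoader,
                                val_loader: DataLoader,
                                serialization_dir: str) -> GradientDescentTrainer:
    parameters = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    return GradientDescentTrainer(
        model=model,
        optimizer=AdamOptimizer(parameters),
        data_loader=train_loader,
        validation_data_loader=val_loader,
        serialization_dir=serialization_dir,
        num_epochs=10,
        # AllenNLP 1.x invokes each epoch callback with (trainer, metrics, epoch, is_master)
        # at the end of every epoch; the class name below is assumed.
        epoch_callbacks=[LogRandomValidationSample()],
    )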
Example #4
def build_trainer(
    model: Model,
    serialization_dir: str,
    train_loader: DataLoader,
    dev_loader: DataLoader = None,
    num_epochs: int = 1,
    cuda_device: int = -1,
    patience: int = None
    ) -> Trainer:
    parameters = [
        [n, p]
        for n, p in model.named_parameters() if p.requires_grad
    ]
    optimizer = AdamOptimizer(parameters)
    trainer = GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=num_epochs,
        optimizer=optimizer,
        cuda_device=cuda_device,
        patience=patience
    )
    return trainer
Example #5
def build_trainer(
    model: Model,
    serialization_dir: str,
    train_loader: DataLoader,
    dev_loader: DataLoader
) -> Trainer:
    parameters = [
        [n, p]
        for n, p in model.named_parameters() if p.requires_grad
    ]

    checkpointer = Checkpointer(serialization_dir, num_serialized_models_to_keep=0)
    optimizer = AdamOptimizer(parameters)
    trainer = GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        checkpointer=checkpointer,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=50,
        optimizer=optimizer,
        cuda_device=0,
        validation_metric="-loss",
        patience=5,
    )
    return trainer
Example #6
def build_trainer(
    config,
    lr: float,
    serialization_dir: str,
    num_epochs: int,
    model: Model,
    train_loader: DataLoader,
    dev_loader: DataLoader) -> Trainer:

    parameters = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=lr)
    if torch.cuda.is_available():
        model.cuda()

    # remove serialization dir
    if os.path.exists(serialization_dir) and config.shutil_pre_finished_experiment:
        shutil.rmtree(serialization_dir)

    if not os.path.exists(serialization_dir):
        os.makedirs(serialization_dir)

    trainer = GradientDescentTrainer(
        model=model,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=num_epochs,
        optimizer=optimizer,
        serialization_dir=serialization_dir,
        cuda_device=0 if torch.cuda.is_available() else -1
    )

    return trainer
Example #7
def build_trainer(model: Model, serialization_dir: str,
                  train_loader: DataLoader, dev_loader: DataLoader) -> Trainer:

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("cpu/gpu? ", device)

    model = model.to(device)

    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=0.001)
    scheduler = ReduceOnPlateauLearningRateScheduler(optimizer=optimizer,
                                                     patience=5,
                                                     verbose=True)

    trainer = GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        cuda_device=device,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        learning_rate_scheduler=scheduler,
        patience=20,
        num_epochs=200,
        optimizer=optimizer,
        validation_metric="+f1",
    )
    return trainer
Example #8
def build_trainer(model: Model, serialization_dir: str,
                  train_loader: PyTorchDataLoader,
                  dev_loader: PyTorchDataLoader) -> Trainer:
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=lr)
    trainer = GradientDescentTrainer(model=model,
                                     serialization_dir=serialization_dir,
                                     data_loader=train_loader,
                                     validation_data_loader=dev_loader,
                                     num_epochs=num_epoch,
                                     optimizer=optimizer,
                                     num_gradient_accumulation_steps=grad_accum)
    return trainer
Example #9
def train_model(model, lr, wd, train_loader, validation_loader, patience,
                epochs, cuda_device, serialization_dir):
    """Train an initialized model"""

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    trainer = GradientDescentTrainer(model=model,
                                     data_loader=train_loader,
                                     validation_data_loader=validation_loader,
                                     optimizer=optimizer,
                                     patience=patience,
                                     num_epochs=epochs,
                                     cuda_device=cuda_device,
                                     serialization_dir=serialization_dir)

    fold_metrics = trainer.train()
    # Save embedding weights for visualization
    # n = word_embeddings.token_embedder_tokens.weight.item()
    # pd.DataFrame(n).to_csv(os.path.join(TENSORBOARD_DIR, run_name, 'model_weights.tsv'),
    #                       header=None, index=None, sep='\t')

    return fold_metrics, model
Example #10
def build_trainer(model: Model, serialization_dir: str,
                  train_loader: DataLoader, dev_loader: DataLoader) -> Trainer:
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad]
    optimizer = AdamOptimizer(parameters)
    trainer = GradientDescentTrainer(
        model=model,
        #serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=5,
        optimizer=optimizer,
    )
    return trainer
Example #11
def build_trainer(model: Model, serialization_dir: str,
                  train_loader: DataLoader, dev_loader: DataLoader) -> Trainer:
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad]
    # optimizer = AdamOptimizer(parameters)
    optimizer = HuggingfaceAdamWOptimizer(parameters, lr=1e-5)
    trainer = GradientDescentTrainer(model=model,
                                     serialization_dir=serialization_dir,
                                     data_loader=train_loader,
                                     validation_data_loader=dev_loader,
                                     num_epochs=50,
                                     optimizer=optimizer,
                                     cuda_device=0,
                                     validation_metric="+auc",
                                     patience=5)
    return trainer
Example #12
def build_trainer(model: Model, serialization_dir: str, train_loader: DataLoader,
                  dev_loader: DataLoader) -> Trainer:
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=0.001)
    scheduler = ReduceOnPlateauLearningRateScheduler(optimizer=optimizer,
                                                     patience=5,
                                                     verbose=True)

    trainer = GradientDescentTrainer(model=model,
                                     serialization_dir=serialization_dir,
                                     data_loader=train_loader,
                                     validation_data_loader=dev_loader,
                                     learning_rate_scheduler=scheduler,
                                     patience=20,
                                     num_epochs=200,
                                     optimizer=optimizer,
                                     validation_metric="+accuracy")
    return trainer
Example #13
def build_trainer(model: Model, ser_dir: str, train_loader: DataLoader, valid_loader: DataLoader,
                  hugging_optim: bool, cuda_device: int) -> Trainer:
    params = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    logging.info(f"{len(params)} parameters requiring grad updates")
    if hugging_optim:
        optim = HuggingfaceAdamWOptimizer(params, lr=1.0e-5)
    else:
        optim = AdamOptimizer(params)
    return GradientDescentTrainer(
        model=model,
        serialization_dir=ser_dir,
        data_loader=train_loader,
        validation_data_loader=valid_loader,
        num_epochs=5,
        patience=None,  # early stopping is disabled
        optimizer=optim,
        cuda_device=cuda_device
    )
Example #14
def build_trainer(model: Model,
                  serialization_dir: str,
                  train_loader: PyTorchDataLoader,
                  dev_loader: PyTorchDataLoader) -> Trainer:
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=lr, weight_decay=weight_decay,
                              betas=(0.9, 0.98), eps=1e-09)
    lr_scheduler = NoamLR(optimizer, model_size=embedding_dim, warmup_steps=warmup)
    # lr_scheduler = InverseSquareRootLR(optimizer, warmup_steps=warmup, end_lr=lr)
    # lr_scheduler = ReduceOnPlateauLearningRateScheduler(optimizer, factor=0.8, patience=3, min_lr=0.000001, eps=1e-08)
    trainer = GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=num_epoch,
        optimizer=optimizer,
        num_gradient_accumulation_steps=grad_accum,
        grad_norm=grad_norm,
        patience=patience,
        learning_rate_scheduler=lr_scheduler)
    return trainer
Example #15
def build_trainer(model: Model,
                  serialization_dir: str,
                  train_loader: DataLoader,
                  dev_loader: DataLoader,
                  num_epochs: int,
                  learning_rate: float = 0.001,
                  cuda_device=None) -> Trainer:
    """
    Builds instance of Trainer class with specified training hyperparameters
    Adapted from https://guide.allennlp.org/training-and-prediction

    Parameters
        model : Model
            The model to train
        serialization_dir : str
            Directory to save checkpoints and results
        train_loader : DataLoader
            Previously built dataset loader for training data
        dev_loader : DataLoader
            Previously built loader for dev data
        num_epochs : int
            Number of epochs to train for
        learning_rate : float (default: 0.001)
        cuda_device : int (default: None)
            >=0 if using GPU

    Returns
        trainer : Trainer
    """
    parameters = [(n, p) for n, p in model.named_parameters()
                  if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=learning_rate)  # type: ignore
    trainer = GradientDescentTrainer(model=model,
                                     checkpointer=Checkpointer(
                                         serialization_dir,
                                         num_serialized_models_to_keep=-1),
                                     serialization_dir=serialization_dir,
                                     data_loader=train_loader,
                                     validation_data_loader=dev_loader,
                                     num_epochs=num_epochs,
                                     optimizer=optimizer,
                                     cuda_device=cuda_device)
    print("Will train for", num_epochs, "epochs")
    return trainer
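
A minimal usage sketch for the builder above; the model and the two data loaders are assumed to have been built earlier, and the serialization directory here is hypothetical.

trainer = build_trainer(model,
                        serialization_dir='checkpoints/run1',  # hypothetical output directory
                        train_loader=train_loader,
                        dev_loader=dev_loader,
                        num_epochs=10,
                        learning_rate=5e-4,
                        cuda_device=0)  # pass -1 to train on CPU
metrics = trainer.train()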
Example #16
def build_trainer(
    config,
    model: Model,
    train_loader: DataLoader,
    dev_loader: DataLoader,
) -> Trainer:
    parameters = [(n, p) for n, p in model.named_parameters()
                  if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=config.lr)  # type: ignore
    model.cuda()
    trainer = GradientDescentTrainer(
        model=model,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=config.num_epochs,
        optimizer=optimizer,
        cuda_device=0,
        serialization_dir=config.serialization_dir)
    return trainer
Example #17
def build_trainer(model: Model, serialization_dir: str,
                  train_loader: DataLoader, dev_loader: DataLoader) -> Trainer:
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad]
    optimizer = AdamOptimizer(parameters)
    # There are a *lot* of other things you could configure with the trainer.  See
    # http://docs.allennlp.org/master/api/training/trainer/#gradientdescenttrainer-objects for more
    # information.

    trainer = GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=5,
        optimizer=optimizer,
        validation_metric="+accuracy",
    )
    return trainer
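
As the comment in this example notes, GradientDescentTrainer accepts many more arguments than shown. Below is a sketch of a more heavily configured builder, reusing the imports assumed by the example above and only options that also appear elsewhere on this page; the concrete values are illustrative.

from allennlp.training.learning_rate_schedulers import ReduceOnPlateauLearningRateScheduler


def build_trainer_extended(model: Model, serialization_dir: str,
                           train_loader: DataLoader, dev_loader: DataLoader) -> Trainer:
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters)
    scheduler = ReduceOnPlateauLearningRateScheduler(optimizer=optimizer, patience=3)
    return GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=50,
        optimizer=optimizer,
        validation_metric="+accuracy",
        patience=5,                          # stop early if accuracy stalls for 5 epochs
        grad_norm=5.0,                       # rescale gradients whose total norm exceeds 5.0
        num_gradient_accumulation_steps=2,   # accumulate gradients over 2 batches
        learning_rate_scheduler=scheduler,
    )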
Example #18
    def __call__(
        self,
        trainer: GradientDescentTrainer,
        batch_inputs: List[List[TensorDict]],
        batch_outputs: List[Dict[str, Any]],
        epoch: int,
        batch_number: int,
        is_training: bool,
        is_master: bool,
    ) -> None:

        if is_training:

            attacker = Attacker(classifier=trainer.model,
                                reader=self.reader,
                                device=-1)
            for batch in batch_inputs:

                instances = []
                for element in batch:
                    data = TransactionsData.from_tensors(
                        inputs=element, vocab=trainer.model.vocab)
                    adv_data = attacker.attack(data)

                    instance = self.reader.text_to_instance(**adv_data)
                    instances.append(instance)

                new_batch = Batch(instances)
                new_batch.index_instances(vocab=trainer.model.vocab)

                new_batch = new_batch.as_tensor_dict()

                batch_outputs = trainer.batch_outputs(new_batch,
                                                      for_training=True)
                loss = batch_outputs.get("loss")
                _ = batch_outputs.get("reg_loss")
                loss.backward()
                trainer.optimizer.step()
                trainer.optimizer.zero_grad()
Example #19
def build_grad_desc_with_adam_trainer(model: Model,
                                      serialization_dir: str,
                                      train_loader: DataLoader,
                                      dev_loader: DataLoader,
                                      lr: float,
                                      num_epochs: int,
                                      wbrun: Any = None) -> Trainer:
    """
    Build the model trainer.
    Includes instantiating the optimizer as well.
    This builder uses the GradientDescentTrainer &
    HuggingfaceAdamWOptimizer combo.
    Also allows setting callbacks (atm for WandB mainly).

    :param model: The model object to be trained.
    :param serialization_dir: The serialization directory to output
            results to.
    :param train_loader: The training data loader.
    :param dev_loader: The dev data loader.
    :param lr: Learning rate.
    :param num_epochs: Number of epochs to train for.
    :param wbrun: WandB object to use for callbacks.
    :return trainer: The Trainer object.
    """
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad]
    optimizer = HuggingfaceAdamWOptimizer(parameters, lr=lr)
    trainer = GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=num_epochs,
        optimizer=optimizer,
        callbacks=(build_callbacks(serialization_dir, wbrun)
                   if wbrun else None))

    return trainer
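
The `build_callbacks` helper referenced above is not part of the snippet. A possible sketch is shown below, assuming AllenNLP 2.x (where the `callbacks` argument and a bundled `WandBCallback` exist) and that `wandb` is installed; the real helper may construct its callbacks differently.

from typing import Any, List

from allennlp.training.callbacks import TrainerCallback, WandBCallback


def build_callbacks(serialization_dir: str, wbrun: Any) -> List[TrainerCallback]:
    # wbrun is assumed to be an initialised Weights & Biases run object; here it only
    # gates whether logging callbacks are created at all.
    return [WandBCallback(serialization_dir=serialization_dir)]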
Example #20
def build_classifier_trainer(model: Model,
                             serialization_dir: str,
                             train_loader: DataLoader,
                             dev_loader: DataLoader,
                             num_epochs: int = 1,
                             cuda_device: int = -1,
                             learning_rate: float = 0.000025,
                             world_size: int = 1,
                             distributed: bool = False) -> Trainer:
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=learning_rate)
    trainer = GradientDescentTrainer(model=model,
                                     serialization_dir=serialization_dir,
                                     data_loader=train_loader,
                                     validation_data_loader=dev_loader,
                                     num_epochs=num_epochs,
                                     optimizer=optimizer,
                                     cuda_device=cuda_device,
                                     world_size=world_size,
                                     distributed=distributed,
                                     validation_metric='+accuracy')
    return trainer
Example #21
train_loader = PyTorchDataLoader(train_dataset, batch_size=8, shuffle=True)
vocab = Vocabulary.from_instances(train_dataset)
train_dataset.index_with(vocab)  # index the instances with the vocabulary before training
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
TARGET_EMBEDDING_DIM = 512

token_embedding = Embedding(embedding_dim=EMBEDDING_DIM, num_embeddings=vocab.get_vocab_size(namespace="tokens"))
word_embedding = BasicTextFieldEmbedder({"token": token_embedding})

bi_rnn_encoder = RnnSeq2SeqEncoder(EMBEDDING_DIM, HIDDEN_DIM, 2, bidirectional=True)
dot_attn = DotProductAttention()
model = CopyNetSeq2Seq(vocab, word_embedding, bi_rnn_encoder, dot_attn,
                       target_namespace="trg", target_embedding_dim=TARGET_EMBEDDING_DIM)

with tempfile.TemporaryDirectory() as serialization_dir:
    parameters = [
        [n, p]
        for n, p in model.named_parameters() if p.requires_grad
    ]
    optimizer = AdamOptimizer(parameters)
    trainer = GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=None,
        num_epochs=5,
        optimizer=optimizer,
    )
    trainer.train()

Example #22
    encoder = LstmSeq2VecEncoder(10, 32, bidirectional=True)
    # encoder = BagOfEmbeddingsEncoder(embedding_dim=10)

    model = IntentEstimator(vocab, embedder, encoder)
    model.cuda()

    # Train the model

    with tempfile.TemporaryDirectory() as serialization_dir:
        parameters = [
            [n, p] for n, p in model.named_parameters() if p.requires_grad]
        optimizer = AdamOptimizer(parameters)
        trainer = GradientDescentTrainer(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=train_loader,
            validation_data_loader=valid_loader,
            num_epochs=20,
            optimizer=optimizer,
            cuda_device=0)

        trainer.train()

    # Run the model

    predictor = IntentEstimatorPredictor(model, dataset_reader)

    text = dataset_reader.tokenizer('東京駅から富山駅まで行きたいです')
    output = predictor.predict(text)
    print(text)
    print([(vocab.get_token_from_index(label_id, 'labels'), prob)
           for label_id, prob in enumerate(output['probs'])])
Example #23
def main():

    opts = options()

    # select a bert specific indexer
    if opts.with_bert:
        from allennlp.data.token_indexers.pretrained_transformer_mismatched_indexer import PretrainedTransformerMismatchedIndexer
        indexer = PretrainedTransformerMismatchedIndexer(
            model_name=opts.bert_name, max_length=opts.bert_max_len)
    # separate by spaces
    else:
        from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
        indexer = SingleIdTokenIndexer()

    reader = TaggerDatasetReader(token_indexers={"tokens": indexer})
    train_dataset = reader.read(opts.train_file)
    valid_dataset = reader.read(opts.valid_file)
    params = Tagger.opts2params(opts)

    with open(opts.model_dir + "/params.pkl", mode='wb') as f:
        pickle.dump(params, f)

    vocab = Vocabulary.from_instances(train_dataset + valid_dataset,
                                      min_count={'tokens': opts.min_freq})
    train_dataset.index_with(vocab)
    valid_dataset.index_with(vocab)
    train_data_loader = PyTorchDataLoader(train_dataset,
                                          batch_sampler=BucketBatchSampler(
                                              train_dataset,
                                              batch_size=opts.batch_size,
                                              sorting_keys=["tokens"]))
    valid_data_loader = PyTorchDataLoader(valid_dataset,
                                          batch_sampler=BucketBatchSampler(
                                              valid_dataset,
                                              batch_size=opts.batch_size,
                                              sorting_keys=["tokens"]))

    model = Tagger.build(params, vocab)
    if torch.cuda.is_available():
        cuda_device = opts.gpuid
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    # select an optimizer for fine-tuning
    if opts.with_bert:
        from allennlp.training.optimizers import HuggingfaceAdamWOptimizer
        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
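        # parameter_groups follows AllenNLP's ([regex, ...], {overrides}) convention:
        # parameters whose names match ".*transformer.*" are fine-tuned with lr=1e-05,
        # while the remaining (randomly initialised) parameters use the default lr=0.0003.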
        optimizer = HuggingfaceAdamWOptimizer(model_parameters=parameters,
                                              lr=0.0003,
                                              parameter_groups=[
                                                  ([".*transformer.*"], {
                                                      "lr": 1e-05
                                                  })
                                              ])
    # optimizer for random initialization
    else:
        import torch.optim as optim
        optimizer = optim.Adam(model.parameters(), lr=0.001)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=valid_data_loader,
        num_epochs=1,
        use_amp=opts.use_amp,
        num_gradient_accumulation_steps=opts.num_gradient_accumulation_steps,
        cuda_device=cuda_device)

    vocab.save_to_files(opts.model_dir + "/vocab")

    best_f1 = 0.0
    for i in range(opts.epochs):
        epoch = i + 1
        print('Epoch: {}'.format(epoch))
        info = trainer.train()
        print(info)
        if info["validation_accuracy"] > best_f1:
            best_f1 = info["validation_accuracy"]
            with open(opts.model_dir + "/save_" + str(epoch) + ".save",
                      'wb') as f_model:
                torch.save(model.state_dict(), f_model)
Example #24
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    s3_prefix = 'https://s3.amazonaws.com/realworldnlpbook/data'
    train_dataset = reader.read(f'{s3_prefix}/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read(f'{s3_prefix}/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    train_dataset.index_with(vocab)
    dev_dataset.index_with(vocab)

    train_data_loader = DataLoader(train_dataset,
                                   batch_sampler=BucketBatchSampler(
                                       train_dataset,
                                       batch_size=32,
                                       sorting_keys=["tokens"]))
    dev_data_loader = DataLoader(dev_dataset,
                                 batch_sampler=BucketBatchSampler(
                                     dev_dataset,
                                     batch_size=32,
                                     sorting_keys=["tokens"]))

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=dev_data_loader,
        patience=10,
        num_epochs=20)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
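
As the Seq2VecEncoder comment above mentions, the LSTM can be swapped for a CNN or simple averaging. A minimal sketch of the two drop-in alternatives that ship with AllenNLP, reusing EMBEDDING_DIM from this example:

from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder, CnnEncoder

cnn_encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=64,
                         ngram_filter_sizes=(2, 3, 4))           # output dim = 64 * 3
boe_encoder = BagOfEmbeddingsEncoder(embedding_dim=EMBEDDING_DIM, averaged=True)
# model = LstmClassifier(word_embeddings, cnn_encoder, vocab)    # same classifier, new encoder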
Example #25
if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1


# don't put a slash after this?
serialization_dir = f"tmp-{expname}/"

if os.path.exists(serialization_dir):
    print("serialization directory exists, removing...")
    shutil.rmtree(serialization_dir)

batch_size = 32
validation_batch_size = 64

data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=allennlp_collate)
validation_data_loader = DataLoader(validation_dataset, batch_size=validation_batch_size, collate_fn=allennlp_collate)

trainer = GradientDescentTrainer(model=model,
                  optimizer=optimizer,
                  data_loader=data_loader,
                  validation_data_loader=validation_data_loader,
                  patience=10,
                  num_epochs=75,
                  validation_metric="+f1-measure-overall",
                  cuda_device=cuda_device,
                  serialization_dir=serialization_dir)

metrics = trainer.train()
Example #26
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    s3_prefix = 'https://s3.amazonaws.com/realworldnlpbook/data'
    # train_dataset = reader.read(f'{s3_prefix}/stanfordSentimentTreebank/trees/train.txt')
    # dev_dataset = reader.read(f'{s3_prefix}/stanfordSentimentTreebank/trees/dev.txt')
    train_dataset = reader.read('Treebank_train.txt')
    print(type(train_dataset))
    print(train_dataset)

    dev_dataset = reader.read('Treebank_dev.txt')



    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).

    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    train_dataset.index_with(vocab)
    dev_dataset.index_with(vocab)

    train_data_loader = DataLoader(train_dataset,
                                   batch_sampler=BucketBatchSampler(
                                       train_dataset,
                                       batch_size=32,
                                       sorting_keys=["tokens"]))
    dev_data_loader = DataLoader(dev_dataset,
                                 batch_sampler=BucketBatchSampler(
                                     dev_dataset,
                                     batch_size=32,
                                     sorting_keys=["tokens"]))

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=dev_data_loader,
        patience=10,
        num_epochs=20)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    # logits = predictor.predict('This is the best movie ever!')['logits']
    logits = predictor.predict('''On August 28, Mustafa varank, Turkey's minister of industry and technology, said Turkey plans to become a production center for automotive batteries by investing in cells, battery modules and battery packs. The country also hopes to become Europe's largest and the world's top five electric and autopilot auto makers by 2030. In order to achieve this goal, varank said Turkey would support the investment of electronic and electrical companies in the automotive industry. Varank points out that modern Turkish plants will cover half of the world's I20 capacity, 90% of which is expected to be exported abroad. "It took 27 months to build this line, with a total investment of $194 million. The productivity of I20 in Turkey will exceed 60%, which will increase gradually. In the past year, Turkey has developed EMUs, SUVs, tractors and excavators equipped with electric engines, and now plans to develop electric vehicle technology. Varank said Turkey would build an ecosystem to produce key components for electric vehicles, such as electric engines, inverters, charging equipment and compressors. He stressed that the automobile industry is the "locomotive" of Turkey's industrial sector, which also provides advantages for other industries. In May and June this year, Turkey's industrial production increased by double-digit compared with the same period last year. In the first half of 2020, Turkey issued 1200 investment award certificates worth US $108 billion (about US $16.7 billion) and created 163000 new jobs. On August 28, Turkey released its economic confidence index for August, and varank said: "the positive trend continues, and our citizens have more positive expectations for the post epidemic period." Choi Hong GHI, South Korea's ambassador to Ankara, said that Hyundai Motor, one of the world's top five auto manufacturers, established its first overseas factory in Turkey 23 years ago. "Hyundai's zmit factory is a symbol of economic cooperation between the two countries, which directly promotes employment and exports in Turkey." Eckkyun Oh, chief executive of Hyundai assan, said the company has produced more than two million cars in Turkey, most of which are exported to countries in Europe, the Middle East and North Africa. "We will produce 100000 new I20 cars here," he said.''')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
Example #27
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)
    train_data.index_with(vocab)
    dev_data.index_with(vocab)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300

    # Load word2vec vectors
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (its been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        train_sampler = BucketBatchSampler(train_data, batch_size=32, sorting_keys=["tokens"])
        dev_sampler = BucketBatchSampler(dev_data, batch_size=32, sorting_keys=["tokens"])
        train_loader = DataLoader(train_data, batch_sampler=train_sampler)
        dev_loader = DataLoader(dev_data, batch_sampler=dev_sampler)
        optimizer = optim.Adam(model.parameters())
        trainer = GradientDescentTrainer(model=model,
                                         optimizer=optimizer,
                                         data_loader=train_loader,
                                         validation_data_loader=dev_loader,
                                         num_epochs=5,
                                         patience=1,
                                         cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # rnn cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # also save the word embedding matrix

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)
    targeted_dev_data = AllennlpDataset(targeted_dev_data, vocab)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    targeted_sampler = BasicBatchSampler(sampler=SequentialSampler(targeted_dev_data),
                                         batch_size=universal_perturb_batch_size,
                                         drop_last=False)  # TODO don't drop last
    targeted_loader = DataLoader(targeted_dev_data, batch_sampler=targeted_sampler)
    # sample batches, update the triggers, and repeat
    for epoch in range(5):
        for batch in targeted_loader:
            # get accuracy with current triggers
            utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
            model.train()  # rnn cannot do backwards in eval mode

            # get gradient w.r.t. trigger embeddings for current batch
            averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

            # pass the gradients to a particular attack to generate token candidates for each token.
            cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                            embedding_weight,
                                                            trigger_token_ids,
                                                            num_candidates=40,
                                                            increase_loss=True)
            # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
            #                                                trigger_token_ids,
            #                                                num_candidates=40)
            # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
            #                                                        embedding_weight,
            #                                                        trigger_token_ids,
            #                                                        tree,
            #                                                        100,
            #                                                        num_candidates=40,
            #                                                        increase_loss=True)

            # Tries all of the candidates and returns the trigger sequence with highest loss.
            trigger_token_ids = utils.get_best_candidates(model,
                                                          batch,
                                                          trigger_token_ids,
                                                          cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)