Example #1
def train(config, device):
    dataset = News20Dataset(config.cache_data_dir,
                            config.vocab_path,
                            is_train=True)
    dataloader = MyDataLoader(dataset, config.batch_size)

    model = HierarchialAttentionNetwork(
        num_classes=dataset.num_classes,
        vocab_size=dataset.vocab_size,
        embed_dim=config.embed_dim,
        word_gru_hidden_dim=config.word_gru_hidden_dim,
        sent_gru_hidden_dim=config.sent_gru_hidden_dim,
        word_gru_num_layers=config.word_gru_num_layers,
        sent_gru_num_layers=config.sent_gru_num_layers,
        word_att_dim=config.word_att_dim,
        sent_att_dim=config.sent_att_dim).to(device)

    optimizer = optim.Adam(params=filter(lambda p: p.requires_grad,
                                         model.parameters()),
                           lr=config.lr)

    criterion = nn.NLLLoss(reduction='sum').to(device)

    trainer = Trainer(config, model, optimizer, criterion, dataloader)
    trainer.train()
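For orientation, here is one hypothetical way this train(config, device) entry point could be driven. The attribute names are exactly the ones read inside the function; the paths and hyper-parameter values are placeholders, not part of the original example.

from types import SimpleNamespace
import torch

# Hypothetical configuration; paths and values are illustrative only.
config = SimpleNamespace(cache_data_dir="data/cache",
                         vocab_path="data/vocab.txt",
                         batch_size=64,
                         embed_dim=200,
                         word_gru_hidden_dim=50,
                         sent_gru_hidden_dim=50,
                         word_gru_num_layers=1,
                         sent_gru_num_layers=1,
                         word_att_dim=100,
                         sent_att_dim=100,
                         lr=1e-3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(config, device)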
def main():
    """
    Training and validation.
    """
    global checkpoint, start_epoch, word_map

    # Initialize model or load checkpoint
    if checkpoint is not None:
        checkpoint = torch.load(checkpoint)
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']
        word_map = checkpoint['word_map']
        start_epoch = checkpoint['epoch'] + 1
        print(
            '\nLoaded checkpoint from epoch %d.\n' % (start_epoch - 1))
    else:
        embeddings, emb_size = load_word2vec_embeddings(word2vec_file, word_map)  # load pre-trained word2vec embeddings

        model = HierarchialAttentionNetwork(n_classes=n_classes,
                                            vocab_size=len(word_map),
                                            emb_size=emb_size,
                                            word_rnn_size=word_rnn_size,
                                            sentence_rnn_size=sentence_rnn_size,
                                            word_rnn_layers=word_rnn_layers,
                                            sentence_rnn_layers=sentence_rnn_layers,
                                            word_att_size=word_att_size,
                                            sentence_att_size=sentence_att_size,
                                            dropout=dropout)
        model.sentence_attention.word_attention.init_embeddings(
            embeddings)  # initialize embedding layer with pre-trained embeddings
        model.sentence_attention.word_attention.fine_tune_embeddings(fine_tune_word_embeddings)  # fine-tune
        optimizer = optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

    # Loss functions
    criterion = nn.CrossEntropyLoss()

    # Move to device
    model = model.to(device)
    criterion = criterion.to(device)
    if device == 'cuda':
        model = torch.nn.DataParallel(model)
        cudnn.benchmark = True
    # DataLoaders
    train_loader = torch.utils.data.DataLoader(HANDataset(data_folder, 'train'), batch_size=batch_size, shuffle=True,
                                               num_workers=workers, pin_memory=True)

    # Epochs
    for epoch in range(start_epoch, epochs):
        # One epoch's training
        train(train_loader=train_loader,
              model=model,
              criterion=criterion,
              optimizer=optimizer,
              epoch=epoch)

        # Decay learning rate every epoch
        adjust_learning_rate(optimizer, 0.1)

        # Save checkpoint
        save_checkpoint(epoch, model, optimizer, word_map)
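The helpers adjust_learning_rate(optimizer, 0.1) and save_checkpoint(...) are not part of this listing. A minimal sketch of the learning-rate helper, assuming it simply scales every parameter group's rate by the given factor, could look like this:

def adjust_learning_rate(optimizer, shrink_factor):
    # Assumed implementation: multiply the learning rate of every parameter
    # group by shrink_factor (matches the call adjust_learning_rate(optimizer, 0.1) above).
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("New learning rate: %f" % optimizer.param_groups[0]['lr'])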
class Classifier():
    def __init__(self,
                 word2vec_config_path,
                 word2vec_model_path,
                 HAN_model_path,
                 HAN_config_path,
                 tokenizer_name="word_tokenizer",
                 device=torch.device("cpu")):
        class Struct:
            def __init__(self, **entries):
                self.__dict__.update(entries)

        self.device = device

        ##Load word2vec config
        with open(word2vec_config_path, 'r') as f:
            word2vec_config = json.load(f)
        word2vec_config = Struct(**word2vec_config)

        self.word2vec_model = MyGensimModel(word2vec_model_path)

        ##Load tokenizer
        self.tokenizer = MyTokenizer(tokenizer_name)

        ##Load HAN config
        with open(HAN_config_path, 'r') as f:
            HAN_config = json.load(f)
        HAN_config = Struct(**HAN_config)

        ##Load HAN model
        self.model = HierarchialAttentionNetwork(
            dictionary_size=self.word2vec_model.dict_size,
            embedding_size=word2vec_config.size,
            hidden_size=HAN_config.hidden_size,
            attention_size=HAN_config.atten_size,
            num_class=HAN_config.num_class,
            n_layers=HAN_config.n_layers,
            device=device)
        self.model.set_embedding(self.word2vec_model.embedding)
        check_point = torch.load(HAN_model_path)
        self.model.load_state_dict(check_point["model"])
        self.model.to(device)

    def analysis(self, doc):
        # |doc| = (doc)

        tokens = [[
            word for word in self.tokenizer.tokenize(sentences, lemma=False)
        ] for sentences in sent_tokenize(doc)]
        temp_index = [[
            self.word2vec_model.word2index.get(word, 0)
            for word in self.tokenizer.tokenize(sentences)
        ] for sentences in sent_tokenize(doc)]
        for sentence in temp_index:
            ## If preprocessing left a sentence with no words, insert a placeholder index (e.g. "[UNK]" / 0) so the model still receives valid input
            if len(sentence) == 0:
                sentence.extend([0])

        temp_sent_len = len(temp_index)
        temp_word_len = [len(sent) for sent in temp_index]

        max_sent_len = temp_sent_len
        max_word_len = max(temp_word_len)

        for sent in temp_index:
            if len(sent) < max_word_len:
                extended_words = [0 for _ in range(max_word_len - len(sent))]
                sent.extend(extended_words)

        if len(temp_index) < max_sent_len:
            extended_sentences = [[0 for _ in range(max_word_len)]
                                  for _ in range(max_sent_len -
                                                 len(temp_index))]
            temp_index.extend(extended_sentences)

        temp_index = [sentences[:max_word_len]
                      for sentences in temp_index][:max_sent_len]

        if len(temp_word_len) < max_sent_len:
            extended_word_len = [
                0 for _ in range(max_sent_len - len(temp_word_len))
            ]
            temp_word_len.extend(extended_word_len)
        temp_word_len = temp_word_len[:max_sent_len]

        temp_index = torch.tensor(temp_index)
        temp_sent_len = torch.tensor(temp_sent_len)
        temp_word_len = torch.tensor(temp_word_len)

        temp_index = temp_index.unsqueeze(0).to(self.device)
        temp_sent_len = temp_sent_len.unsqueeze(0).to(self.device)
        temp_word_len = temp_word_len.unsqueeze(0).to(self.device)
        y_hat, sent_weights, word_weights = self.model(temp_index,
                                                       temp_sent_len,
                                                       temp_word_len)
        ps = torch.exp(y_hat)
        top_p, top_class = ps.topk(1, dim=1)

        sent_weights = sent_weights.squeeze()
        word_weights = word_weights.squeeze()

        return top_class, tokens, sent_weights, word_weights

    def view(self, doc):
        top_class, tokens, sent_weights, word_weights = self.analysis(doc)
        sent_weights = sent_weights.tolist()
        word_weights = word_weights.tolist()
        total_len = len(sent_weights)

        for sent, word_weight, sent_weight in zip(tokens, word_weights,
                                                  sent_weights):
            temp_str = self.mk_weight_string(sent, word_weight, sent_weight,
                                             total_len)
            self.printmd(temp_str)

    def mk_weight_string(self, str_list, w_list, s_weight, total_len):
        temp_str = []
        for string, weight in zip(str_list, w_list):
            temp_str += [
                '<span style="background-color:rgba(255,0,0,' + str(weight) +
                ');  font-size: ' + str(int(total_len) * 10 * s_weight) +
                'pt;">' + string + '</span>'
            ]
        return " ".join(temp_str)

    # Markdown Printer
    def printmd(self, string):
        display(Markdown(string))
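A hypothetical usage of the Classifier class above, e.g. in a Jupyter notebook. The paths and the sample document are placeholders; analysis() returns the predicted class plus sentence- and word-level attention weights, and view() renders them as colored Markdown via the display(Markdown(...)) helper.

import torch

# Placeholder paths; the real config and model files are not part of this listing.
clf = Classifier(word2vec_config_path="configs/word2vec.json",
                 word2vec_model_path="models/word2vec.bin",
                 HAN_model_path="models/han_checkpoint.pth",
                 HAN_config_path="configs/han.json",
                 device=torch.device("cpu"))

doc = "This is the first sentence. This is the second sentence."
top_class, tokens, sent_weights, word_weights = clf.analysis(doc)
print(top_class)  # predicted class index
clf.view(doc)     # highlight words and sentences by attention weight (notebook only)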
def main():
    """
    Training and validation.
    """
    global best_acc, epochs_since_improvement, checkpoint, start_epoch, word_map

    # Initialize model or load checkpoint
    if checkpoint is not None:
        checkpoint = torch.load(checkpoint)
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']
        word_map = checkpoint['word_map']
        start_epoch = checkpoint['epoch'] + 1
        best_acc = checkpoint['best_acc']
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        print(
            '\nLoaded checkpoint from epoch %d, with a previous best accuracy of %.3f.\n'
            % (start_epoch - 1, best_acc))
    else:
        embeddings, emb_size = load_word2vec_embeddings(
            word2vec_file, word_map)  # load pre-trained word2vec embeddings

        model = HierarchialAttentionNetwork(
            n_classes=n_classes,
            vocab_size=len(word_map),
            emb_size=emb_size,
            word_rnn_size=word_rnn_size,
            sentence_rnn_size=sentence_rnn_size,
            word_rnn_layers=word_rnn_layers,
            sentence_rnn_layers=sentence_rnn_layers,
            word_att_size=word_att_size,
            sentence_att_size=sentence_att_size,
            dropout=dropout)
        model.sentence_attention.word_attention.init_embeddings(
            embeddings
        )  # initialize embedding layer with pre-trained embeddings
        model.sentence_attention.word_attention.fine_tune_embeddings(
            fine_tune_word_embeddings)  # fine-tune
        optimizer = optim.Adam(params=filter(lambda p: p.requires_grad,
                                             model.parameters()),
                               lr=lr)

    # Loss functions
    criterion = nn.CrossEntropyLoss()

    # Move to device
    model = model.to(device)
    criterion = criterion.to(device)

    # DataLoaders
    train_loader = torch.utils.data.DataLoader(HANDataset(
        data_folder, 'train'),
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(HANDataset(data_folder, 'test'),
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=workers,
                                             pin_memory=True)

    # Epochs
    for epoch in range(start_epoch, epochs):
        # One epoch's training
        train(train_loader=train_loader,
              model=model,
              criterion=criterion,
              optimizer=optimizer,
              epoch=epoch)

        # One epoch's validation
        acc = validate(val_loader=val_loader, model=model, criterion=criterion)

        # Did validation accuracy improve?
        is_best = acc > best_acc
        best_acc = max(acc, best_acc)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Decay learning rate every epoch
        # adjust_learning_rate(optimizer, 0.5)

        # Save checkpoint
        save_checkpoint(epoch, model, optimizer, best_acc, word_map,
                        epochs_since_improvement, is_best)
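save_checkpoint is not shown in this listing either; a minimal sketch consistent with the fields the checkpoint-loading branch above reads back (the file names are assumptions) might be:

def save_checkpoint(epoch, model, optimizer, best_acc, word_map,
                    epochs_since_improvement, is_best):
    # Bundle everything main() reads back from a checkpoint into one dict.
    state = {'epoch': epoch,
             'model': model,
             'optimizer': optimizer,
             'best_acc': best_acc,
             'word_map': word_map,
             'epochs_since_improvement': epochs_since_improvement}
    torch.save(state, 'checkpoint_han.pth.tar')  # file name is an assumption
    if is_best:
        # keep a separate copy of the best-performing model so far
        torch.save(state, 'BEST_checkpoint_han.pth.tar')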
Example #6
def run(config):
    def _print_config(config):
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))

    _print_config(config)

    # logging.getLogger() always returns the root logger; clear any handlers
    # left over from a previous run.
    for handler in logging.getLogger().handlers[:]:  # iterate over a copy of the list
        logging.getLogger().removeHandler(handler)

    if not config.save_path and config.dict_path:
        # Save into the next numbered sub-directory of dict_path
        # (entries whose names are not integers are skipped).
        all_subdir = [int(s) for s in os.listdir(config.dict_path)
                      if s.isdigit() and os.path.isdir(os.path.join(config.dict_path, s))]
        max_dir_num = max(all_subdir) + 1 if all_subdir else 1
        config.save_path = os.path.join(config.dict_path, str(max_dir_num))
        os.mkdir(config.save_path)

    logging.basicConfig(filename=os.path.join(config.save_path, 'train_log'),
                        level=tools.LOGFILE_LEVEL,
                        filemode='w')

    console = logging.StreamHandler()
    console.setLevel(tools.CONSOLE_LEVEL)
    logging.getLogger().addHandler(console)

    logging.info("##################### Start Training")
    logging.debug(vars(config))

    ##load data loader
    logging.info("##################### Load DataLoader")
    loader = MyDataLoader(train_path=config.train_path,
                          valid_path=config.valid_path,
                          dict_path=config.dict_path,
                          batch_size=config.batch_size,
                          tokenizer_name=config.tokenizer_name,
                          max_sent_len=config.max_sent_len,
                          max_word_len=config.max_word_len)

    train, valid, num_class = loader.get_train_valid()
    logging.info("##################### Train Dataset size : [" + str(len(train)) + "]")
    logging.info("##################### Valid Dataset size : [" + str(len(valid)) + "]")
    logging.info("##################### class size : [" + str(num_class) + "]")

    dict_size = loader.get_dict_size()
    word_vec_dim = loader.get_dict_vec_dim()
    embedding = loader.get_embedding()
    config.num_class = num_class

    logging.info("##################### Load 'HAN' Model")
    model = HierarchialAttentionNetwork(dictionary_size=dict_size,
                                        embedding_size=word_vec_dim,
                                        hidden_size=config.hidden_size,
                                        attention_size=config.atten_size,
                                        num_class=num_class,
                                        n_layers=config.n_layers,
                                        device=config.device
                                        )
    model.set_embedding(embedding)
    model.to(config.device)

    crit = nn.NLLLoss()
    trainer = Trainer(model=model,
                      crit=crit,
                      config=config,
                      device=config.device)
    history = trainer.train(train, valid)
    return history
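run(config) expects a config object carrying the attributes referenced above. The project's actual command-line interface is not shown here, but an illustrative argparse front-end (flag names mirror the attributes accessed inside run(); all defaults are assumptions) could be:

import argparse
import torch

def build_config():
    # Assumed CLI wrapper for run(); defaults are illustrative only.
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_path', required=True)
    parser.add_argument('--valid_path', required=True)
    parser.add_argument('--dict_path', required=True)
    parser.add_argument('--save_path', default=None)
    parser.add_argument('--tokenizer_name', default='word_tokenizer')
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--max_sent_len', type=int, default=30)
    parser.add_argument('--max_word_len', type=int, default=50)
    parser.add_argument('--hidden_size', type=int, default=64)
    parser.add_argument('--atten_size', type=int, default=64)
    parser.add_argument('--n_layers', type=int, default=1)
    parser.add_argument('--device',
                        default='cuda' if torch.cuda.is_available() else 'cpu')
    return parser.parse_args()

# history = run(build_config())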
Example #7
def main():
    """
    Training and validation.
    """
    global checkpoint, start_epoch, word_map

    num_rounds = 5  # avoid shadowing the built-in iter()
    res = {"best_eval_acc": [], "best_eval_f1": [], "best_eval_step": []}
    for i in range(1, 1 + num_rounds):
        print("=" * 10 + "ROUND " + str(i) + "=" * 10)

        # DataLoaders
        train_loader = torch.utils.data.DataLoader(HANDataset(
            data_folder, 'train'),
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   num_workers=workers,
                                                   pin_memory=True)
        # Load test data
        test_loader = torch.utils.data.DataLoader(HANDataset(
            data_folder, 'test'),
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  num_workers=workers,
                                                  pin_memory=True)

        # Initialize model or load checkpoint
        if checkpoint is not None:
            checkpoint = torch.load(checkpoint)
            model = checkpoint['model']
            optimizer = checkpoint['optimizer']
            word_map = checkpoint['word_map']
            start_epoch = checkpoint['epoch'] + 1
            print('\nLoaded checkpoint from epoch %d.\n' % (start_epoch - 1))
        else:
            # embeddings, emb_size = load_word2vec_embeddings(word2vec_file, word_map)  # load pre-trained word2vec embeddings

            # embeddings, emb_size = load_glove_w2v(word_map)  # load pre-trained word2vec embeddings

            emb_size = 200
            # embeddings = torch.FloatTensor(len(word_map), emb_size)
            # init_embedding(embeddings)

            model = HierarchialAttentionNetwork(
                n_classes=n_classes,
                vocab_size=len(word_map),
                emb_size=emb_size,
                word_rnn_size=word_rnn_size,
                sentence_rnn_size=sentence_rnn_size,
                word_rnn_layers=word_rnn_layers,
                sentence_rnn_layers=sentence_rnn_layers,
                word_att_size=word_att_size,
                sentence_att_size=sentence_att_size,
                dropout=dropout)
            # model.sentence_attention.word_attention.init_embeddings(
            #     embeddings)  # initialize embedding layer with pre-trained embeddings
            model.sentence_attention.word_attention.fine_tune_embeddings(
                fine_tune_word_embeddings)  # fine-tune
            optimizer = optim.Adam(params=filter(lambda p: p.requires_grad,
                                                 model.parameters()),
                                   lr=lr)

        # Loss functions
        criterion = nn.CrossEntropyLoss()

        # Move to device
        model = model.to(device)
        criterion = criterion.to(device)

        best_acc = 0.0
        best_f1 = 0.0
        best_step = 0

        # Epochs
        for epoch in range(start_epoch, epochs):
            # One epoch's training
            eval_acc, eval_f1, eval_step = train(train_loader=train_loader,
                                                 test_loader=test_loader,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 epoch=epoch)

            if eval_acc > best_acc:
                best_acc = eval_acc
                best_f1 = eval_f1
                best_step = eval_step

            # Decay learning rate every epoch
            # adjust_learning_rate(optimizer, 0.1)

            # Save checkpoint
            # save_checkpoint(epoch, model, optimizer, word_map)

        res["best_eval_acc"].append(best_acc)
        res["best_eval_f1"].append(best_f1)
        res["best_eval_step"].append(best_step)

    print("=" * 20 + "TRAINING FINISHED" + "=" * 20)
    print("avg acc: %f" %
          (float(np.sum(res["best_eval_acc"])) / len(res["best_eval_acc"])))
    print("avg f1: %f" %
          (float(np.sum(res["best_eval_f1"])) / len(res["best_eval_f1"])))
    print(" ".join(["{}: {}".format(key, str(res[key])) for key in res]))
    writer.close()