예제 #1
0
 def __init__(self,
              model_args,
              model_dir,
              vocab_file=None,
              label_file=None,
              use_cuda=False,
              wombat_path=None):
     """Load a trained labeler model and its saved hyper-parameters for inference.

     Args:
         model_args: file name of the pickled hyper-parameter object saved at
             training time (see ``SaveloadHP.save``).
         model_dir: directory holding the saved model artifacts.
         vocab_file: optional vocabulary file overriding the persisted one.
         label_file: optional label file overriding the persisted one.
         use_cuda: run the model on GPU when True.
         wombat_path: optional path to a Wombat embedding database; when
             given, ``self.wombat_object`` is initialized from it.
     """
     self.wombat_object = None
     if wombat_path is not None:
         self.wombat_object = Wombat(wombat_path)
     # The argument file is written with os.path.join(model_dir, model_args)
     # at training time, so join here too; bare `model_dir + model_args`
     # silently produced a wrong path when model_dir had no trailing
     # separator.
     margs = SaveloadHP.load(os.path.join(model_dir, model_args))
     margs.use_cuda = use_cuda
     # Let callers relocate the vocab/label/model files without re-saving
     # the training-time arguments.
     if vocab_file and vocab_file != margs.vocab_file:
         margs.vocab_file = vocab_file
     if label_file and label_file != margs.label_file:
         margs.label_file = label_file
     if model_dir and model_dir != margs.model_dir:
         margs.model_dir = model_dir
     self.tagger = Labeler_model(margs)
     # epoch=-1 loads the best checkpoint saved in the main model directory.
     self.tagger.load_parameters(epoch=-1)
예제 #2
0
 def save(tokenizer, tokenize_file):
     """Persist *tokenizer* to *tokenize_file* via the SaveloadHP serializer."""
     SaveloadHP.save(tokenizer, tokenize_file)
예제 #3
0
 def load(tokenize_file):
     """Load and return a tokenizer previously saved with ``save``."""
     return SaveloadHP.load(tokenize_file)
예제 #4
0
    def build_data(args):
        """Prepare training artifacts: model directory, vocabulary, embeddings.

        When ``args.tl`` is False: create the (optionally timestamped) model
        directory, build the vocabulary (word-level ``Tokenizer`` or a
        pre-trained BPE model), load optional pre-trained word embeddings for
        the source/target sides, and pickle the final arguments to disk.
        When ``args.tl`` is True: inherit model directory, file names and
        pre-trained embeddings from the previous run's argument file
        (``args.tlargs``).

        Args:
            args: argparse-style namespace; mutated in place.

        Returns:
            The updated ``args`` namespace.
        """
        if not args.tl:
            if not os.path.exists(args.model_dir):
                os.mkdir(args.model_dir)
            if args.timestamped_subdir:
                # e.g. "2024-01-31-12:30" -> "2024_01_31_12_30" (filesystem-safe)
                sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
            else:
                sub_folder = ''
            if not os.path.exists(os.path.join(args.model_dir, sub_folder)):
                os.mkdir(os.path.join(args.model_dir, sub_folder))
            args.model_dir = os.path.join(args.model_dir, sub_folder)
            args.log_file = os.path.join(args.model_dir, args.log_file)
            if args.tokenize_type != "bpe":
                s_paras = [args.wl_th, args.wcutoff]
                t_paras = [args.wl_th, args.wcutoff]
                print("INFO: - Build vocabulary...")

                tokenizer = Tokenizer(s_paras, t_paras)
                files = [args.train_file]
                if args.train_file != args.dev_file:
                    files.append(args.dev_file)
                # Load datasets to build vocabulary
                data = Tokenizer.load_file(files, task=2)
                tokenizer.build(datasets=data)
                sw2i = tokenizer.sw2i
                tw2i = tokenizer.tw2i
                print("INFO: - Save vocabulary...")
                Tokenizer.save(tokenizer, os.path.join(args.model_dir, "tokenizer.vocab"))
            else:
                print("INFO: - Load vocabulary...")
                tokenizer = BPE.load(args.vocab_file)
                tokenizer.add_tokens(sys_tokens)
                sw2i = tokenizer.get_vocab()
                tw2i = tokenizer.get_vocab()

            # Source-side pre-trained embeddings (optional).
            args.swd_pretrained = None
            args.twd_pretrained = None
            if len(args.swd_embfile) != 0:
                scale = np.sqrt(3.0 / args.swd_dim)
                emb_reader = Embeddings(args.swd_embfile)
                args.swd_pretrained = emb_reader.get_W(args.swd_dim, sw2i, scale)
                if args.twd_embfile == args.swd_embfile:
                    # Source and target share one embedding file; reuse the reader.
                    scale = np.sqrt(3.0 / args.twd_dim)
                    args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)

            # Target-side pre-trained embeddings (optional).
            if len(args.twd_embfile) != 0:
                scale = np.sqrt(3.0 / args.twd_dim)
                if args.twd_pretrained is None:
                    # BUG FIX: the target reader must read the *target*
                    # embedding file; it previously re-opened args.swd_embfile.
                    emb_reader = Embeddings(args.twd_embfile)
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)

            # directly integrate transfer learning if no updating new words
            SaveloadHP.save(args, os.path.join(args.model_dir, args.model_args))
            return args
        else:
            print("INFO: - Use transfer learning technique")
            # The assert message must be a plain string: `print(...)` returns
            # None, so the old form produced "AssertionError: None".
            assert os.path.exists(args.tlargs), "\t - There is no pre-trained argument file"
            # load pre-trained argument file from a previous training folder
            margs = SaveloadHP.load(args.tlargs)
            # TODO: update new vocab/schema and other new arguments used for
            # the new training run.
            # Reuse the model file directory of the previous run.
            args.model_dir = margs.model_dir
            args.seq2seq_file = margs.seq2seq_file
            # Keep the remaining current arguments; at load time any model
            # that fails to load is simply skipped.
            args.swd_pretrained = margs.swd_pretrained
            args.twd_pretrained = margs.twd_pretrained
            return args
예제 #5
0
    def train(self):
        """Run the seq2seq training loop with early stopping.

        Trains for up to ``self.args.max_epochs`` epochs, evaluating on the
        dev set each epoch.  The best model (by BLEU when
        ``self.args.metric == "bleu"``, otherwise by dev loss) is saved at
        checkpoint index -1.  On early stopping (``self.args.patience`` epochs
        without improvement) or on completion, the best model is reloaded and
        evaluated on the test set.

        Returns:
            The test BLEU score (``test_bleu[0]``) of the best model.
        """
        # training result is returned after training to inform calling code of the outcome of training
        # Values: Matching threshold reached (success): 0, Otherwise: 1
        # training_result = 1
        train_data, train_numlines = Tokenizer.prepare_iter(self.args.train_file, firstline=self.args.firstline, task=2)
        dev_data, dev_numlines = Tokenizer.prepare_iter(self.args.dev_file, firstline=self.args.firstline, task=2)
        test_data, test_numlines = Tokenizer.prepare_iter(self.args.test_file, firstline=self.args.firstline, task=2)

        saved_epoch = 0
        nepoch_no_imprv = 0
        epoch_start = time.time()
        max_epochs = self.args.max_epochs
        # Higher is better for BLEU; lower is better for loss.
        best_dev = -np.inf if self.args.metric == "bleu" else np.inf

        if self.args.tl:
            # Transfer learning: warm-start from a previous run, then fork a
            # fresh model directory for this run.
            # 1. Load pre-trained model from previous model_dir
            print("INFO: - Load transfer learning models")
            self.load_transferlearning(epoch=-1)
            # 2. Update model_dir to the new one
            if self.args.timestamped_subdir:
                self.args.model_dir = os.path.abspath(os.path.join(self.args.model_dir, ".."))
                sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
            else:
                sub_folder = ''
            if not os.path.exists(os.path.join(self.args.model_dir, sub_folder)):
                os.mkdir(os.path.join(self.args.model_dir, sub_folder))
            self.args.model_dir = os.path.join(self.args.model_dir, sub_folder)
            # 3. Update logfile dir
            self.args.log_file = os.path.join(self.args.model_dir, self.args.log_file)
            with open(self.args.log_file, "w") as f:
                f.write("START TRAINING\n")
            # 4. save updated arguments and log file to the new folder
            print("INFO: - Save new argument file")
            SaveloadHP.save(self.args, os.path.join(self.args.model_dir, self.args.model_args))

            # Baseline dev score of the transferred model, so the loop only
            # keeps checkpoints that actually beat it.
            dev_loss, dev_bleu, dev_string_match, dev_speed = self.evaluate_batch(dev_data, dev_numlines, self.args.pred_dev_file)
            best_dev = dev_bleu[0] if self.args.metric == "bleu" else dev_loss
            print("INFO: - Transfer learning performance")
            print("         - Current Dev loss: %.4f; Current Dev bleu: %.4f; Current Dev string match: %.4f; Dev speed: %.2f(tokens/s)" %
                  (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
            self.appendfile("\t- Transfer learning performance")
            self.appendfile("\t\t- Current Dev loss: %.4f; Current Dev bleu: %.4f; Current Dev string match: %.4f; Dev speed: %.2f(tokens/s)\n" %
                            (dev_loss, dev_bleu[0], dev_string_match, dev_speed))

            # print("INFO: - Save transfer learning models")
            # self.save_parameters(epoch=0)
            # suppose the transfered model is the best one and save in the main dir
            self.save_parameters(epoch=-1)
        else:
            with open(self.args.log_file, "w") as f:
                f.write("START TRAINING\n")

        # Main training loop with early stopping on the dev metric.
        print('Dev metric:', self.args.metric)
        for epoch in range(1, max_epochs + 1):
            print("Epoch: %s/%s" % (epoch, max_epochs))
            stime = time.time()
            train_loss = self.train_batch(train_data, train_numlines)
            print("BONUS: Training time of %.4f" % (time.time() - stime))
            # Save the  model
            # print("INFO: - Frequently save models to checkpoint folders")
            # self.save_parameters(epoch=epoch)
            # set the first model as the best one and save to the main dir
            # evaluate on developing data

            dev_loss, dev_bleu, dev_string_match, dev_speed = self.evaluate_batch(dev_data, dev_numlines,
                                                                                  self.args.pred_dev_file)

            dev_metric = dev_bleu[0] if self.args.metric == "bleu" else dev_loss
            cond = dev_metric > best_dev if self.args.metric == "bleu" else dev_loss < best_dev
            if cond:
                nepoch_no_imprv = 0
                saved_epoch = epoch
                best_dev = dev_metric
                print("UPDATES: - New improvement")
                print("         - Train loss: %.4f" % train_loss)
                print("         - Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; Dev speed: %.2f(tokens/s)" %
                      (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
                self.appendfile("\t- New improvement at epoch %d:\n" % saved_epoch)
                self.appendfile("\t\t- Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; Dev speed: %.2f(tokens/s)\n" %
                                (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
                print("INFO: - Save best models")
                self.save_parameters(epoch=-1)

                # if dev_string_match >= self.args.matching_threshold:
                #     # TODO: automatically load models to gcp
                #     training_result = 0
                #     break

            else:
                print("UPDATES: - No improvement")
                print("         - Train loss: %.4f" % train_loss)
                print("         - Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; Dev speed: %.2f(tokens/s)" %
                      (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
                nepoch_no_imprv += 1
                # Decay learning_rate if no improvement
                if self.args.decay_rate > 0:
                    self.lr_decay(epoch)

                if nepoch_no_imprv >= self.args.patience:
                    # Load the current best models
                    print("INFO: - Load best models")
                    self.load_parameters(epoch=-1)

                    test_loss, test_bleu, test_string_match, test_speed = self.evaluate_batch(test_data, test_numlines,
                                                                                              self.args.pred_test_file)
                    print("SUMMARY: - Early stopping after %d epochs without improvements" % nepoch_no_imprv)
                    print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
                    print("         - Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; Test speed: %.2f(tokens/s)" %
                          (test_loss, test_bleu[0], test_string_match, test_speed))

                    self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
                    self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
                    self.appendfile("\t\t- Test loss: %.4f; Test bleu: %.4f; Test speed: %.2f(tokens/s)\n" %
                                    (test_loss, test_bleu[0], test_speed))
                    return test_bleu[0]

            epoch_finish, epoch_remain = Timer.timeEst2(epoch_start, epoch / max_epochs)
            print("INFO: - Trained time for %d epochs: %s" % (epoch, epoch_finish))
            print("\t- Remained time for %d epochs (est): %s\n" % (max_epochs - epoch, epoch_remain))

        # print("INFO: - Save best models")
        # self.save_parameters(epoch=-1)
        # Final evaluation after completing all epochs without early stop.
        print("INFO: - Load best models")
        self.load_parameters(epoch=-1)

        test_loss, test_bleu, test_string_match, test_speed = self.evaluate_batch(test_data, test_numlines,
                                                                                  self.args.pred_test_file)
        print("SUMMARY: - Completed %d epoches" % max_epochs)
        print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
        print("         - Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; Test speed: %.2f(tokens/s)" %
              (test_loss, test_bleu[0], test_string_match, test_speed))
        # NOTE(review): 'epoch' is the loop variable from above; this line
        # raises NameError if max_epochs < 1 — confirm callers always pass >= 1.
        self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
        self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
        self.appendfile("\t\t- Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; Test speed: %.2f(tokens/s)\n" %
                        (test_loss, test_bleu[0], test_string_match, test_speed))
        return test_bleu[0]
예제 #6
0
    def train(self):
        """Run the labeling-model training loop with early stopping.

        Trains for up to ``self.args.max_epochs`` epochs, evaluating on the
        dev set each epoch.  The comparison metric is dev loss when
        ``self.args.metric == "loss"``, otherwise ``dev_metrics[2]`` (F1, per
        the log labels below).  The best model is saved at checkpoint index
        -1; on early stopping or completion the best model is reloaded and
        evaluated on the test set.

        Returns:
            ``test_metrics`` of the best model — indices 0..4 are printed as
            P, R, F1, sep_acc, full_acc respectively.
        """
        train_data, train_numlines = Tokenizer.prepare_iter(
            self.args.train_file, firstline=self.args.firstline, task=2)
        dev_data, dev_numlines = Tokenizer.prepare_iter(
            self.args.dev_file, firstline=self.args.firstline, task=2)
        test_data, test_numlines = Tokenizer.prepare_iter(
            self.args.test_file, firstline=self.args.firstline, task=2)

        saved_epoch = 0
        nepoch_no_imprv = 0
        epoch_start = time.time()
        max_epochs = self.args.max_epochs
        # best_dev = -np.inf if self.args.metric == "f1" else np.inf
        # Lower is better for loss; higher is better for any other metric.
        best_dev = np.inf if self.args.metric == "loss" else -np.inf

        with open(self.args.log_file, "w") as f:
            f.write("START TRAINING\n")
        if self.args.tl:
            # Transfer learning: warm-start from a previous run, then fork a
            # fresh model directory for this run.
            # 1. Load pre-trained model from previous model_dir
            print("INFO: - Load transfer learning models")
            self.load_transferlearning(epoch=-1)
            # 2. Update model_dir to the new one
            if self.args.timestamped_subdir:
                self.args.model_dir = os.path.abspath(
                    os.path.join(self.args.model_dir, ".."))
                sub_folder = datetime.now().isoformat(
                    sep='-',
                    timespec='minutes').replace(":", "-").replace("-", "_")
            else:
                sub_folder = ''
            if not os.path.exists(os.path.join(self.args.model_dir,
                                               sub_folder)):
                os.mkdir(os.path.join(self.args.model_dir, sub_folder))
            self.args.model_dir = os.path.join(self.args.model_dir, sub_folder)
            # 3. Update logfile dir
            self.args.log_file = os.path.join(self.args.model_dir,
                                              self.args.log_file)

            # 4. save updated arguments and log file to the new folder
            print("INFO: - Save new argument file")
            SaveloadHP.save(
                self.args,
                os.path.join(self.args.model_dir, self.args.model_args))

            # Baseline dev score of the transferred model, so the loop only
            # keeps checkpoints that actually beat it.
            dev_loss, dev_metrics, dev_speed = self.evaluate_batch(
                dev_data, dev_numlines)
            # best_dev = dev_metrics[2] if self.args.metric == "f1" else dev_loss
            best_dev = dev_loss if self.args.metric == "loss" else dev_metrics[
                2]
            print("INFO: - Transfer learning performance")
            print(
                "         - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                 dev_speed))
            print(
                "         - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f"
                % (dev_metrics[3], dev_metrics[4]))
            self.appendfile("\t- Transfer learning performance")
            self.appendfile(
                "\t\t- Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                 dev_speed))
            self.appendfile(
                "\t\t- Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f" %
                (dev_metrics[3], dev_metrics[4]))

            # print("INFO: - Save transfer learning models")
            # self.save_parameters(epoch=0)
            # suppose the transfered model is the best one and save in the main dir
            self.save_parameters(epoch=-1)

        # Main training loop with early stopping on the dev metric.
        for epoch in range(1, max_epochs + 1):
            print("Epoch: %s/%s" % (epoch, max_epochs))
            stime = time.time()
            train_loss = self.train_batch(train_data, train_numlines)
            print("BONUS: Training time of %.4f" % (time.time() - stime))
            # Save the  model
            # print("INFO: - Frequently save models to checkpoint folders")
            # self.save_parameters(epoch=epoch)
            # evaluate on developing data
            dev_loss, dev_metrics, dev_speed = self.evaluate_batch(
                dev_data, dev_numlines)
            # dev_metric = dev_metrics[2] if self.args.metric == "f1" else dev_loss
            dev_metric = dev_loss if self.args.metric == "loss" else dev_metrics[
                2]
            # cond = dev_metric > best_dev if self.args.metric == "f1" else dev_loss < best_dev
            cond = dev_loss < best_dev if self.args.metric == "loss" else dev_metric > best_dev
            if cond:
                nepoch_no_imprv = 0
                saved_epoch = epoch
                best_dev = dev_metric
                print("UPDATES: - New improvement")
                print("         - Train loss: %.4f" % train_loss)
                print(
                    "         - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                    "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                    (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                     dev_speed))
                print(
                    "         - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f"
                    % (dev_metrics[3], dev_metrics[4]))
                self.appendfile("\t- New improvement at epoch %d:\n" %
                                saved_epoch)
                self.appendfile(
                    "\t\t- Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                    "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)\n" %
                    (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                     dev_speed))
                self.appendfile(
                    "\t\t- Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f"
                    % (dev_metrics[3], dev_metrics[4]))
                print("INFO: - Save best models")
                self.save_parameters(epoch=-1)

            else:
                print("UPDATES: - No improvement")
                print("         - Train loss: %.4f" % train_loss)
                print(
                    "         - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                    "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                    (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                     dev_speed))
                print(
                    "         - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f"
                    % (dev_metrics[3], dev_metrics[4]))
                nepoch_no_imprv += 1
                # Decay learning_rate if no improvement
                if self.args.decay_rate > 0:
                    self.lr_decay(epoch)

                if nepoch_no_imprv >= self.args.patience:
                    # Load the current best models
                    print("INFO: - Load best models")
                    self.load_parameters(epoch=-1)

                    test_loss, test_metrics, test_speed = self.evaluate_batch(
                        test_data, test_numlines)
                    print(
                        "SUMMARY: - Early stopping after %d epochs without improvements"
                        % nepoch_no_imprv)
                    print("         - Dev metric (%s): %.4f" %
                          (self.args.metric, best_dev))
                    print(
                        "         - Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                        "Test F1: %.4f; Test speed: %.2f(tokens/s)" %
                        (test_loss, test_metrics[0], test_metrics[1],
                         test_metrics[2], test_speed))
                    print(
                        "         - Current Test sep_acc: %.4f; Current Test full_acc: %.4f"
                        % (test_metrics[3], test_metrics[4]))

                    self.appendfile("STOP TRAINING at epoch %s/%s\n" %
                                    (epoch, max_epochs))
                    self.appendfile(
                        "\t- Testing the best model at epoch %d:\n" %
                        saved_epoch)
                    self.appendfile(
                        "\t\t- Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                        "Test F1: %.4f; Test speed: %.2f(tokens/s)\n" %
                        (test_loss, test_metrics[0], test_metrics[1],
                         test_metrics[2], test_speed))
                    self.appendfile(
                        "\t\t- Current Test sep_acc: %.4f; Current Test full_acc: %.4f"
                        % (test_metrics[3], test_metrics[4]))
                    return test_metrics

            epoch_finish, epoch_remain = Timer.timeEst2(
                epoch_start, epoch / max_epochs)
            print("INFO: - Trained time for %d epochs: %s" %
                  (epoch, epoch_finish))
            print("\t- Remained time for %d epochs (est): %s\n" %
                  (max_epochs - epoch, epoch_remain))

        # Final evaluation after completing all epochs without early stop.
        print("INFO: - Load best models")
        self.load_parameters(epoch=-1)

        test_loss, test_metrics, test_speed = self.evaluate_batch(
            test_data, test_numlines)
        print("SUMMARY: - Completed %d epoches" % max_epochs)
        print("         - Dev metric (%s): %.4f" %
              (self.args.metric, best_dev))
        print("         - Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
              "Test F1: %.4f; Test speed: %.2f(tokens/s)" %
              (test_loss, test_metrics[0], test_metrics[1], test_metrics[2],
               test_speed))
        print(
            "         - Current Test sep_acc: %.4f; Current Test full_acc: %.4f"
            % (test_metrics[3], test_metrics[4]))
        # NOTE(review): 'epoch' is the loop variable from above; this line
        # raises NameError if max_epochs < 1 — confirm callers always pass >= 1.
        self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
        self.appendfile("\t- Testing the best model at epoch %d:\n" %
                        saved_epoch)
        self.appendfile("\t\t- Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                        "Test F1: %.4f; Test speed: %.2f(tokens/s)\n" %
                        (test_loss, test_metrics[0], test_metrics[1],
                         test_metrics[2], test_speed))
        self.appendfile(
            "\t\t- Current Test sep_acc: %.4f; Current Test full_acc: %.4f" %
            (test_metrics[3], test_metrics[4]))
        return test_metrics