Example #1
def train(args):
    config = load_config(args.model_dir)

    train_dataset = LMDataset(config["train_file"],
                              vocab_file=config["vocab_file"])

    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    with open(vocab_dump_path, 'wb') as fp:
        pickle.dump(train_dataset.vocab, fp)

    valid_dataset = LMDataset(config["valid_file"], vocab_dump=vocab_dump_path)

    config["vocab_size"] = len(train_dataset.vocab)
    model = LM(config, args.model_dir)

    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(
            args.epoch))
        model.load_model(args.model_dir, args.epoch)

    model.train(epochs=config["train_epochs"],
                batch_size=config["batch_size"],
                data_engine=train_dataset,
                valid_data_engine=valid_dataset,
                train_decoder_epochs=config.get("train_decoder_epochs", 0),
                max_iter_per_epoch=config.get("max_iter_per_epoch", 100000))
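Every example on this page logs progress through a print_time_info helper whose implementation is not shown. The sketch below is only an assumption of what such a helper might look like (a timestamp-prefixed print), so the snippets can be read or run standalone; the real helper in the source project may format its output differently.

import datetime

def print_time_info(message):
    # Hypothetical stand-in: prefix the message with the current time.
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print("[{}] {}".format(now, message))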
Example #2
def parse_dialogues(raw_dialogues, is_spacy):
    dialogues = []
    if is_spacy:
        spacy_parser = spacy.load('en')
    else:
        nltk_lemmatizer = WordNetLemmatizer()
    for idx, dialog in enumerate(raw_dialogues):
        if idx % 1000 == 0:
            print_time_info("Processed {}/{} dialogues".format(
                idx, len(raw_dialogues)))
        spacy_parsed_dialog = []
        nltk_parsed_dialog = []
        for line in dialog:
            spacy_line, nltk_line = [], []
            if is_spacy:
                parsed_line = spacy_parser(line)
                spacy_line = [
                    d for d in [(word.text, word.pos_) for word in parsed_line]
                    if d[0] != ' '
                ]
                spacy_parsed_dialog.append(spacy_line)
            else:
                nltk_line = pos_tag(word_tokenize(line), tagset='universal')
                nltk_line = [(d[0], d[1]) if d[1] != '.' else (d[0], 'PUNCT')
                             for d in nltk_line]
                nltk_parsed_dialog.append(nltk_line)

        if spacy_parsed_dialog != []:
            dialogues.append(spacy_parsed_dialog)
        else:
            dialogues.append(nltk_parsed_dialog)
    del raw_dialogues
    return dialogues
Example #3
def test(args):
    config = load_config(args.model_dir)
    dataset_cls = DATASETS[config.get("dataset_cls", "text")]

    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    label_vocab_dump_path = os.path.join(args.model_dir, "label_vocab.pkl")

    test_file = config["test_file"] if len(args.test_file) == 0 else args.test_file
    test_dataset = dataset_cls(
        test_file,
        vocab_dump=vocab_dump_path,
        label_vocab_dump=label_vocab_dump_path,
        n_prev_turns=config.get("n_prev_turns", 0),
        **(config.get("dataset_args", {})))

    config["model"]["vocab_size"] = len(test_dataset.vocab)
    config["model"]["label_vocab_size"] = len(test_dataset.label_vocab.vocab)
    model = SLU(config, args.model_dir)

    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(args.epoch))
        epoch = model.load_model(args.model_dir, args.epoch)
    else:
        print_time_info("Loading last checkpoint from model_dir")
        epoch = model.load_model(args.model_dir)

    loss, acc, y_true, y_pred = model.test(
        batch_size=config["batch_size"],
        data_engine=test_dataset,
        report=True,
        verbose=args.verbose
    )
Example #4
def test(args):
    config = load_config(args.model_dir)
    dataset_cls = DATASETS[config.get("dataset_cls", "text")]

    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")

    test_file = config["test_file"] if len(
        args.test_file) == 0 else args.test_file
    test_dataset = dataset_cls(test_file,
                               vocab_dump=vocab_dump_path,
                               **(config.get("dataset_args", {})))

    config["vocab_size"] = len(test_dataset.vocab)
    model = LM(config, args.model_dir)

    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(
            args.epoch))
        epoch = model.load_model(args.model_dir, args.epoch)
    else:
        print_time_info("Loading last checkpoint from model_dir")
        epoch = model.load_model(args.model_dir)

    loss = model.test(batch_size=config["batch_size"],
                      data_engine=test_dataset)
Example #5
 def shrink_vocab(self, vocab_size):
     special_token = " + _UNK, _BOS, _EOS, _PAD"
     print_time_info("Shrink vocab size to {}{}".format(
         vocab_size, special_token))
     # 4 for special token
     shrink_rev_vocab = self.rev_vocab[vocab_size + 4:]
     for word in shrink_rev_vocab:
         self.vocab.pop(word)
     self.rev_vocab = self.rev_vocab[:vocab_size + 4]
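The `vocab_size + 4` offset assumes the four special tokens occupy the first four slots of rev_vocab, as in build_vocab (Example #30). A minimal, hypothetical illustration of the same shrinking logic outside the class:

rev_vocab = ['_UNK', '_PAD', '_BOS', '_EOS', 'the', 'a', 'to', 'of', 'and']
vocab = {word: idx for idx, word in enumerate(rev_vocab)}

vocab_size = 3                            # keep only the 3 most frequent words
for word in rev_vocab[vocab_size + 4:]:   # + 4 accounts for the special tokens
    vocab.pop(word)
rev_vocab = rev_vocab[:vocab_size + 4]

assert rev_vocab == ['_UNK', '_PAD', '_BOS', '_EOS', 'the', 'a', 'to']
assert set(vocab) == set(rev_vocab)       # both structures stay in sync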
Example #6
 def save_model(self, model_dir):
     encoder_path = os.path.join(model_dir, "encoder.ckpt")
     decoder_paths = [
         os.path.join(model_dir, "decoder_{}.ckpt".format(idx))
         for idx in range(self.n_decoders)
     ]
     torch.save(self.encoder, encoder_path)
     for idx, path in enumerate(decoder_paths):
         torch.save(self.decoders[idx], path)
     print_time_info("Save model successfully")
Example #7
    def train(self,
              epochs,
              batch_size,
              data_engine,
              valid_data_engine=None,
              test_data_engine=None,
              checkpoint=True):
        collate_fn = getattr(data_engine,
                             self.config.get("collate_fn", "collate_fn_asr"))
        self.prepare_training(batch_size, data_engine, collate_fn)

        run_batch_fn = getattr(self,
                               self.config.get("run_batch_fn", "run_batch"))

        for idx in range(1, epochs + 1):
            epoch_loss = 0
            epoch_acc = 0.0
            batch_amount = 0

            pbar = tqdm(self.train_data_loader,
                        desc="Iteration",
                        ascii=True,
                        dynamic_ncols=True)

            for b_idx, batch in enumerate(pbar):
                loss, logits = run_batch_fn(batch, testing=False)
                epoch_loss += loss.item()
                batch_amount += 1
                y_true = batch[data_engine.label_idx]
                y_pred = logits.detach().cpu().max(dim=1)[1].numpy()
                epoch_acc += (y_true == y_pred).sum() / len(y_true)
                pbar.set_postfix(Loss="{:.5f}".format(epoch_loss /
                                                      batch_amount),
                                 Acc="{:.4f}".format(epoch_acc / batch_amount))

            epoch_loss /= batch_amount
            epoch_acc /= batch_amount
            print_time_info(
                "Epoch {} finished, training loss {}, acc {}".format(
                    idx, epoch_loss, epoch_acc))

            valid_loss, valid_acc, _, _ = self.test(batch_size,
                                                    valid_data_engine)
            test_loss, test_acc = -1.0, -1.0
            if test_data_engine is not None:
                test_loss, test_acc, _, _ = self.test(batch_size,
                                                      test_data_engine)
            with open(self.log_file, 'a') as fw:
                fw.write(f"{idx},{epoch_loss},{epoch_acc},"
                         f"{valid_loss},{valid_acc},{test_loss},{test_acc}\n")

            if checkpoint:
                print_time_info("Epoch {}: save model...".format(idx))
                self.save_model(self.model_dir, idx)
Example #8
def build_dataset(dialogues, is_lemma, use_punct, min_length):
    input_data = []
    output_labels = [[] for _ in range(4)]
    spacy_parser = spacy.load('en')
    """
        For now, the data has four different layers:
            1. NOUN + PROPN + PRON
            2. NOUN + PROPN + PRON + VERB
            3. NOUN + PROPN + PRON + VERB + ADJ + ADV
            4. ALL
    """
    for idx, dialog in enumerate(dialogues):
        if idx % 1000 == 0:
            print_time_info("Parsed {}/{} dialogues".format(
                idx, len(dialogues)))
        for idx in range(len(dialog) - 1):
            input_data.append([
                word[0].lower() for word in dialog[idx]
                if (word[1] != 'PUNCT' or use_punct == 1)
            ])
            output_label = [[] for _ in range(4)]
            for w in dialog[idx + 1]:
                if w[1] in ['NOUN', 'PROPN', 'PRON']:
                    output_label[0].append(w[0].lower())
                    output_label[1].append(w[0].lower())
                    output_label[2].append(w[0].lower())
                    output_label[3].append(w[0].lower())
                elif w[1] == 'VERB':
                    word = w[0].lower()
                    if is_lemma:
                        word = spacy_parser(word)[0].lemma_
                    output_label[1].append(word)
                    output_label[2].append(word)
                    output_label[3].append(word)
                elif w[1] in ['ADJ', 'ADV']:
                    output_label[2].append(w[0].lower())
                    output_label[3].append(w[0].lower())
                else:
                    if w[1] == "PUNCT" and not use_punct:
                        continue
                    output_label[3].append(w[0].lower())

            for idx in range(4):
                output_labels[idx].append(output_label[idx])

    if min_length == -1:
        print_time_info("No minimal length, data count: {}".format(
            len(dialogues)))
    else:
        print_time_info("Minimal length is {}".format(min_length))
        idxs = []
        for idx, sent in enumerate(input_data):
            if len(output_labels[3][idx]) > min_length:
                idxs.append(idx)
        input_data = [input_data[i] for i in idxs]
        output_labels = [[output_label[i] for i in idxs]
                         for output_label in output_labels]
        print_time_info("Data count: {}".format(len(idxs)))
    return input_data, output_labels
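The four layers described in the docstring can be traced on a toy POS-tagged reply. The sentence and tags below are made up for illustration; the grouping follows the loop above (words are lowercased, and with is_lemma=True the VERB 'booked' would become 'book' in layers 2-4).

reply = [('she', 'PRON'), ('quickly', 'ADV'), ('booked', 'VERB'),
         ('a', 'DET'), ('nice', 'ADJ'), ('hotel', 'NOUN'), ('.', 'PUNCT')]

# Layer 1: NOUN + PROPN + PRON                 -> ['she', 'hotel']
# Layer 2: layer 1 + VERB                      -> ['she', 'booked', 'hotel']
# Layer 3: layer 2 + ADJ + ADV                 -> ['she', 'quickly', 'booked', 'nice', 'hotel']
# Layer 4: everything ('.' only if use_punct)  -> ['she', 'quickly', 'booked', 'a', 'nice', 'hotel']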
Example #9
    def test(self, batch_size, data_engine, report=False, verbose=False):
        collate_fn = getattr(
            data_engine, self.config.get("collate_fn_test", "collate_fn_asr"))
        self.prepare_testing(batch_size, data_engine, collate_fn)

        run_batch_fn = getattr(self,
                               self.config.get("run_batch_fn", "run_batch"))

        test_probs = []
        all_y_true, all_y_pred = [], []
        test_acc = 0.0
        with torch.no_grad():
            test_loss = 0
            batch_amount = 0
            for b_idx, batch in enumerate(tqdm(self.test_data_loader)):
                loss, logits = run_batch_fn(batch, testing=True)
                test_loss += loss.item()
                batch_amount += 1
                y_true = batch[data_engine.label_idx]
                y_pred = logits.detach().cpu().max(dim=1)[1].numpy()
                test_acc += (y_true == y_pred).sum() / len(y_true)
                all_y_true += list(y_true)
                all_y_pred += list(y_pred)

            test_loss /= batch_amount
            test_acc /= batch_amount
            print_time_info("testing finished, testing loss {}, acc {}".format(
                test_loss, test_acc))

        if report:
            metrics = classification_report(
                np.array(all_y_true),
                np.array(all_y_pred),
                labels=list(range(len(data_engine.label_vocab.vocab))),
                target_names=data_engine.label_vocab.vocab,
                digits=3)
            print(metrics)

        if verbose:
            for i, (y_true, y_pred) in enumerate(zip(all_y_true, all_y_pred)):
                if y_true == y_pred:
                    continue
                label = data_engine.label_vocab.i2l(y_true)
                pred = data_engine.label_vocab.i2l(y_pred)
                print("{} [{}] [{}]".format(data_engine[i]["text"], label,
                                            pred))

        return test_loss, test_acc, all_y_true, all_y_pred
Example #10
def train(args):
    config = load_config(args.model_dir)
    dataset_cls = DATASETS[config.get("dataset_cls", "text")]
    train_dataset = dataset_cls(
        config["train_file"],
        vocab_file=config["vocab_file"],
        label_vocab_dump=config.get("label_vocab_dump", None),
        n_prev_turns=config.get("n_prev_turns", 0),
        **(config.get("dataset_args", {})))

    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    with open(vocab_dump_path, 'wb') as fp:
        pickle.dump(train_dataset.vocab, fp)
    label_vocab_dump_path = os.path.join(args.model_dir, "label_vocab.pkl")
    with open(label_vocab_dump_path, 'wb') as fp:
        pickle.dump(train_dataset.label_vocab, fp)

    valid_dataset = dataset_cls(
        config["valid_file"],
        vocab_dump=vocab_dump_path,
        label_vocab_dump=label_vocab_dump_path,
        n_prev_turns=config.get("n_prev_turns", 0),
        **(config.get("dataset_args", {})))

    test_dataset = None
    if len(args.test_file) > 0:
        test_dataset = dataset_cls(
            args.test_file,
            vocab_dump=vocab_dump_path,
            label_vocab_dump=label_vocab_dump_path,
            n_prev_turns=config.get("n_prev_turns", 0),
            **(config.get("dataset_args", {})))

    config["model"]["vocab_size"] = len(train_dataset.vocab)
    config["model"]["label_vocab_size"] = len(train_dataset.label_vocab.vocab)
    model = SLU(config, args.model_dir)

    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(args.epoch))
        model.load_model(args.model_dir, args.epoch)

    model.train(
        epochs=config["train_epochs"],
        batch_size=config["batch_size"],
        data_engine=train_dataset,
        valid_data_engine=valid_dataset,
        test_data_engine=test_dataset
    )
Example #11
 def __init__(self, vocab_path, split_vocab, regen, train):
     self.vocab_path = vocab_path
     self.split_vocab = split_vocab
     if (not regen or not train):
         if os.path.exists(vocab_path):
             print_time_info("Read vocab data from {}".format(
                 self.vocab_path))
             if self.split_vocab:
                 self.vocab, self.rev_vocab, \
                     self.token_vocab, self.rev_token_vocab = \
                     pickle.load(open(self.vocab_path, 'rb'))
             else:
                 self.vocab, self.rev_vocab = \
                     pickle.load(open(self.vocab_path, 'rb'))
         else:
             print_time_info("Vocab file doesn't exist...")
Example #12
 def __init__(self,
              data_dir,
              dataset,
              save_path='data.pkl',
              vocab_path='vocab.pkl',
              is_spacy=True,
              is_lemma=True,
              fold_attr=True,
              use_punct=False,
              vocab_size=20000,
              n_layers=4,
              min_length=5,
              en_max_length=None,
              de_max_length=None,
              regen=False,
              train=True
              # partition_ratio=0.95
              ):
     if is_spacy:
         self.spacy_parser = spacy.load('en')
         print_time_info("Use Spacy as the parser")
     else:
         self.nltk_lemmatizer = WordNetLemmatizer()
         print_time_info("Use NLTK as the parser")
     self.is_spacy = is_spacy
     self.is_lemma = is_lemma
     self.fold_attr = fold_attr
     self.use_punct = use_punct
     self.data_dir = data_dir
     self.save_path = save_path
     self.vocab_path = vocab_path
     self.vocab_size = vocab_size
     self.n_layers = n_layers
     self.dataset = dataset
     self.min_length = min_length
     self.en_max_length = en_max_length if en_max_length else -1
     self.de_max_length = de_max_length if de_max_length else -1
     self.regen = regen
     if self.dataset in ["CMDC", "OPENSUBS", "REPEATSEQ"]:
         self.split_vocab = False
     else:
         self.split_vocab = True
     self.tokenizer = Tokenizer(vocab_path, self.split_vocab, regen, train)
     self.counter = 0
     self.train = train
     # self.partition_ratio = partition_ratio
     self.prepare_data()
Example #13
def REPEATSEQ(data_dir):
    input_data = []
    output_labels = [[], [], [], []]
    with open(os.path.join(data_dir, "data.txt"), 'r') as file:
        data_size = int(
            subprocess.getoutput("wc -l {}".format(
                os.path.join(data_dir, "data.txt"))).split(' ')[0])
        for l_idx, line in enumerate(file):
            if l_idx % 1000 == 0:
                print_time_info("Processed {}/{} lines".format(
                    l_idx, data_size))
            _input, _output = line.strip().split(' | ')
            input_data.append(_input.split(' '))
            _output = _output.split(' ')
            for idx in range(4):
                output_labels[idx].append(_output)

    return input_data, output_labels
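REPEATSEQ expects data.txt to hold one "input | output" pair per line, the same format written by the generator script in Example #29. A minimal sketch with hypothetical lines, parsed the same way as above:

sample_lines = [
    "3 1 4 1 5 | 3 1 4 1 5",   # identity copy (reverse=0 in Example #29)
    "2 7 1 8 | 8 1 7 2",       # reversed copy (reverse=1)
]
for line in sample_lines:
    _input, _output = line.strip().split(' | ')
    print(_input.split(' '), _output.split(' '))
# The same output tokens are copied into all four label layers.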
Example #14
    def predict(self, F, data_info, time_info):
        '''
        This function should provide predictions of labels on (test) data.
        Make sure that the predicted values are in the correct format for the scoring
        metric. For example, binary classification problems often expect predictions
        in the form of a discriminant value (if the area under the ROC curve is the metric)
        rather than predictions of the class labels themselves.
        The predict function eventually returns probabilities or continuous values.
        '''

        info_dict = extract(data_info, time_info)
        print_time_info(info_dict)

        if params['algo'] == Algo.OLD_CODE:
            return self.mdl.predict(F, data_info, time_info)
        elif params['algo'] == Algo.ORIGINAL:
            return self._original_predict(F, info_dict)
        elif params['algo'] == Algo.FACEBOOK_LR:
            return self._facebook_lr_predict(F, info_dict)
        elif params['algo'] == Algo.BASIC:
            return self._basic_predict(F, info_dict)
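predict() dispatches on params['algo'] against an Algo enum defined elsewhere in the project. A plausible minimal sketch, assuming nothing beyond the member names used in this snippet, might be:

from enum import Enum

class Algo(Enum):
    OLD_CODE = 0
    ORIGINAL = 1
    FACEBOOK_LR = 2
    BASIC = 3

# Hypothetical module-level configuration; 'algo' selects which branch
# predict() and fit() take.
params = {'algo': Algo.ORIGINAL}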
Example #15
def parse_dialogues(raw_dialogues):
    dialogues = []
    spacy_parser = spacy.load('en')
    for idx, dialog in enumerate(raw_dialogues):
        if idx % 1000 == 0:
            print_time_info("Processed {}/{} dialogues".format(
                idx, len(raw_dialogues)))
        parsed_dialog = []
        # encoder input
        parsed_dialog.append(dialog[0])
        # output label
        line = dialog[1]
        parsed_line = spacy_parser(line)
        spacy_line = [
            d for d in [[word.text, word.pos_] for word in parsed_line]
            if d[0] != ' '
        ]
        parsed_dialog.append(spacy_line)
        dialogues.append(parsed_dialog)
    del raw_dialogues
    return dialogues
Example #16
def parse_dialogues(raw_dialogues, is_spacy):
    dialogues = []
    '''
    if is_spacy:
        spacy_parser = spacy.load('en')
    '''
    for idx, dialog in enumerate(raw_dialogues):
        if idx % 1000 == 0:
            print_time_info("Processed {}/{} dialogues".format(
                idx, len(raw_dialogues)))
        spacy_parsed_dialog = []
        nltk_parsed_dialog = []
        # encoder input
        spacy_parsed_dialog.append(dialog[0])
        # output label
        line = dialog[1]
        spacy_line, nltk_line = [], []
        if is_spacy:
            '''
            parsed_line = spacy_parser(line)
            spacy_line = [
                    d for d in [
                        [word.text, word.pos_]
                        for word in parsed_line] if d[0] != ' ']
            spacy_parsed_dialog.append(spacy_line)
            '''
            line = [[word] for word in line.split()]
            spacy_parsed_dialog.append(line)
        else:
            nltk_line = pos_tag(word_tokenize(line), tagset='universal')
            nltk_line = [[d[0], d[1]] if d[1] != '.' else [d[0], 'PUNCT']
                         for d in nltk_line]
            nltk_parsed_dialog.append(nltk_line)

        if spacy_parsed_dialog != []:
            dialogues.append(spacy_parsed_dialog)
        else:
            dialogues.append(nltk_parsed_dialog)

    return dialogues
Example #17
    def test(self, batch_size, data_engine):
        collate_fn = getattr(data_engine,
                             self.config.get("collate_fn", "collate_fn"))
        self.prepare_testing(batch_size, data_engine, collate_fn)

        run_batch_fn = getattr(self,
                               self.config.get("run_batch_fn", "run_batch"))

        with torch.no_grad():
            test_loss_for = test_loss_rev = test_loss_ca_pos = test_loss_ca_neg = 0
            batch_amount = 0
            for b_idx, batch in enumerate(tqdm(self.test_data_loader)):
                loss_for, loss_rev, loss_ca_pos, loss_ca_neg = run_batch_fn(
                    batch, testing=True)
                test_loss_for += loss_for.item()
                test_loss_rev += loss_rev.item()
                test_loss_ca_pos += loss_ca_pos.item()
                test_loss_ca_neg += loss_ca_neg.item()
                batch_amount += 1

            test_loss_lm = (test_loss_for + test_loss_rev) / 2
            test_loss_ca = (test_loss_ca_pos + test_loss_ca_neg)
            test_loss = \
                (self.lm_scale * test_loss_lm + \
                self.ca_scale * test_loss_ca) / batch_amount
            print_time_info(
                "testing finished, testing loss {}".format(test_loss))
            print_time_info(f"forward lm: {test_loss_for/batch_amount}, "
                            f"backward lm: {test_loss_rev/batch_amount}")
            print_time_info(f"ca pos: {test_loss_ca_pos/batch_amount}, "
                            f"ca neg: {test_loss_ca_neg/batch_amount}")

        return test_loss
Example #18
    def __init__(self, text_path, vocab_file=None, vocab_dump=None):
        self.data = []

        print_time_info("Reading text from {}".format(text_path))

        with open(text_path) as csvfile:
            reader = csv.DictReader(csvfile)
            for i, row in enumerate(reader):
                words = row["text"].split()
                if "id" in row:
                    self.data.append((row["id"], words))
                else:
                    self.data.append((i, words))
        # for line in tqdm(open(text_path)):
        #     uid, *words = line.strip().split()
        #     self.data.append((uid, words))

        if vocab_dump is None:
            self.vocab = Vocab(vocab_file)
        else:
            with open(vocab_dump, 'rb') as fp:
                self.vocab = pickle.load(fp)
Example #19
    def fit(self, F, y, data_info, time_info):
        '''
        This function trains the model parameters.
        Args:
            F: Training data matrix of dim num_train_samples * num_feat.
            y: Training label matrix of dim num_train_samples * num_labels.
        Both inputs are numpy arrays.
        If fit is called multiple times on incremental data (train, test1, test2, etc.)
        you should warm-start your training from the pre-trained model. Past data will
        NOT be available for re-training.
        '''

        info_dict = extract(data_info, time_info)
        print_time_info(info_dict)

        if params['algo'] == Algo.OLD_CODE:
            return self.mdl.partial_fit(F, y, data_info, time_info)
        elif params['algo'] == Algo.ORIGINAL:
            return self._original_fit(F, y, info_dict)
        elif params['algo'] == Algo.FACEBOOK_LR:
            return self._facebook_lr_fit(F, y, info_dict)
        elif params['algo'] == Algo.BASIC:
            return self._basic_fit(F, y, info_dict)
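The docstring's warm-start requirement (fit may be called repeatedly on new data batches, and past data is not kept) maps naturally onto scikit-learn's partial_fit API. The sketch below is not taken from the project; it only illustrates the incremental-training pattern the docstring describes, using made-up data.

import numpy as np
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier()              # supports incremental updates
classes = np.array([0, 1])         # must be declared up front for partial_fit

for _ in range(3):                 # three successive data batches
    X_batch = np.random.randn(100, 20)
    y_batch = np.random.randint(0, 2, size=100)
    # Each call continues from the current weights; previous batches are gone.
    clf.partial_fit(X_batch, y_batch, classes=classes)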
Example #20
def build_dataset(dialogues, is_lemma, use_punct, min_length):
    input_data = []
    input_attr_seqs = []
    output_labels = []
    spacy_parser = spacy.load('en')
    for idx, dialog in enumerate(dialogues):
        if idx % 1000 == 0:
            print_time_info("Parsed {}/{} dialogues".format(
                idx, len(dialogues)))
        attrs = []
        attrs_seq = []
        for attr_pair in dialog[0]:
            attrs_seq.append(attr_pair[0])
            attrs_seq.append(attr_pair[1])
            attrs.append('{}:{}'.format(attr_pair[0], attr_pair[1]))
        input_data.append(attrs)
        input_attr_seqs.append(attrs_seq)
        output_label = []
        for w in dialog[1]:
            output_label.append(w[0])

        output_labels.append(deepcopy(output_label))

    if min_length == -1:
        print_time_info("No minimal length, data count: {}".format(
            len(dialogues)))
    else:
        print_time_info("Minimal length is {}".format(min_length))
        idxs = []
        for idx, sent in enumerate(input_data):
            if len(output_labels[idx]) > min_length:
                idxs.append(idx)
        input_data = [input_data[i] for i in idxs]
        input_attr_seqs = [input_attr_seqs[i] for i in idxs]
        output_labels = [output_labels[i] for i in idxs]
        print_time_info("Data count: {}".format(len(idxs)))
    return input_data, input_attr_seqs, output_labels
Example #21
 def load_model(self, model_dir, epoch=None, name='lm.ckpt'):
     if epoch is None:
         paths = glob.glob(os.path.join(model_dir, "{}.*".format(name)))
         epoch = max(
             sorted(
                 map(int, [path.strip().split('.')[-1] for path in paths])))
         print_time_info("Epoch is not specified, loading the "
                         "last epoch ({}).".format(epoch))
     path = os.path.join(model_dir, "{}.{}".format(name, epoch))
     if not os.path.exists(path):
         print_time_info("Loading failed, start training from scratch...")
     else:
         self.lm.load_state_dict(
             torch.load(path, map_location=self.device).state_dict())
         print_time_info(
             "Load model from {} successfully".format(model_dir))
     return epoch
Example #22
def test(args):
    config = load_config(args.model_dir)
    dataset_cls = DATASETS[config.get("dataset_cls", "text")]

    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    label_vocab_dump_path = os.path.join(args.model_dir, "label_vocab.pkl")

    test_file = config["test_file"] if len(
        args.test_file) == 0 else args.test_file
    dataset_args = config.get("dataset_args", {})
    if args.text_input:
        dataset_args["text_input"] = True

    test_dataset = dataset_cls(test_file,
                               vocab_dump=vocab_dump_path,
                               label_vocab_dump=label_vocab_dump_path,
                               **dataset_args)

    config["model"]["vocab_size"] = len(test_dataset.vocab)
    config["model"]["label_vocab_size"] = len(test_dataset.label_vocab.vocab)
    model = SLU(config, args.model_dir)

    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(
            args.epoch))
        epoch = model.load_model(args.model_dir, args.epoch)
    elif args.best_valid:
        with open(f"{args.model_dir}/log.csv") as csv_file:
            reader = csv.DictReader(csv_file)
            log = list(reader)
            epoch = int(
                sorted(log, key=lambda x: float(x['valid_f1']),
                       reverse=True)[0]['epoch'])
        print_time_info(
            "Loading best validation checkpoint {} from model_dir".format(
                epoch))
        epoch = model.load_model(args.model_dir, epoch)
    else:
        print_time_info("Loading last checkpoint from model_dir")
        epoch = model.load_model(args.model_dir)

    loss, acc, y_true, y_pred = model.test(batch_size=config["batch_size"],
                                           data_engine=test_dataset,
                                           report=True,
                                           verbose=args.verbose)
Example #23
    def load_model(self, model_dir):
        # Get the latest modified model (files or directory)
        files_in_dir = glob.glob(os.path.join(model_dir, "*"))
        latest_file = sorted(files_in_dir, key=os.path.getctime)[-2]
        print(latest_file)
        if os.path.isdir(latest_file):
            encoder_path = os.path.join(latest_file, "encoder.ckpt")
            decoder_paths = [
                os.path.join(latest_file, "decoder_{}.ckpt".format(idx))
                for idx in range(self.n_decoders)
            ]
        else:
            encoder_path = os.path.join(model_dir, "encoder.ckpt")
            decoder_paths = [
                os.path.join(model_dir, "decoder_{}.ckpt".format(idx))
                for idx in range(self.n_decoders)
            ]

        loader = True
        if not os.path.exists(encoder_path):
            loader = False
        else:
            encoder = torch.load(encoder_path)
        decoders = []
        for path in decoder_paths:
            if not os.path.exists(path):
                loader = False
            else:
                decoders.append(torch.load(path))

        if not loader:
            print_time_info("Loading failed, start training from scratch...")
        else:
            self.encoder = encoder
            self.decoders = decoders
            if os.path.isdir(latest_file):
                print_time_info(
                    "Load model from {} successfully".format(latest_file))
            else:
                print_time_info(
                    "Load model from {} successfully".format(model_dir))
Example #24
    def prepare_data(self):
        if not os.path.exists(self.save_path) or self.regen:
            if self.regen:
                print_time_info("Regenerate the data...")
            else:
                print_time_info("There isn't any usable save...")
            if not os.path.isdir(self.data_dir):
                print_time_info("Error: The dataset doesn't exist")
                exit()
            print_time_info("Start reading dataset {} from {}".format(
                self.dataset, self.data_dir))
            if self.dataset == "CMDC":
                self.input_data, self.output_labels = CMDC(
                    self.data_dir, self.is_spacy, self.is_lemma,
                    self.use_punct, self.min_length)
            elif self.dataset == "E2ENLG":
                self.input_data, self.output_labels = E2ENLG(
                    self.data_dir, self.is_spacy, self.is_lemma,
                    self.fold_attr, self.use_punct, self.min_length,
                    self.train)
            elif self.dataset == "REPEATSEQ":
                self.input_data, self.output_labels = REPEATSEQ(self.data_dir)
            elif self.dataset == "DSTC6":
                self.DSTC6()
            elif self.dataset == "DBDC3":
                self.DBDC3()
            elif self.dataset == 'OPENSUBS':
                self.OPENSUBS()
        else:
            self.input_data, self.output_labels = \
                    pickle.load(open(self.save_path, 'rb'))
            print_time_info("Load the data from {}".format(self.save_path))

        if not os.path.exists(self.vocab_path) or (self.regen and self.train):
            self.build_vocab()
        if not os.path.exists(self.save_path) or self.regen:
            self.tokenize_sents()
            self.crop()
            pickle.dump([self.input_data, self.output_labels],
                        open(self.save_path, 'wb'))
            print_time_info("Create the save file {}".format(self.save_path))

        # shrink the vocab to vocab size
        self.tokenizer.shrink_vocab(self.vocab_size)
        self.add_unk()

        # pick the labels for different n_layers
        if self.n_layers == 1:
            self.output_labels = [self.output_labels[3]]
        elif self.n_layers == 2:
            self.output_labels = [self.output_labels[1], self.output_labels[3]]

        # partition training and testing data
        """
Example #25
 def __init__(self, vocab_path):
     print_time_info("Reading vocabulary from {}".format(vocab_path))
     self.read_vocab(vocab_path)
Example #26
 def save_model(self, model_dir, epoch, name='lm.ckpt'):
     path = os.path.join(model_dir, "{}.{}".format(name, epoch))
     torch.save(self.lm, path)
     print_time_info("Save model successfully")
Example #27
    def train(self,
              epochs,
              batch_size,
              data_engine,
              valid_data_engine=None,
              train_decoder_epochs=0,
              max_iter_per_epoch=100000):
        collate_fn = getattr(data_engine,
                             self.config.get("collate_fn", "collate_fn"))
        self.prepare_training(batch_size, data_engine, collate_fn)

        run_batch_fn = getattr(self,
                               self.config.get("run_batch_fn", "run_batch"))

        for param in self.lm.elmo.parameters():
            param.requires_grad_(False)

        for idx in range(1, epochs + 1):
            if idx == train_decoder_epochs + 1 or (idx == 1 and
                                                   idx > train_decoder_epochs):
                for param in self.lm.elmo.parameters():
                    param.requires_grad_(True)

            epoch_loss_for = epoch_loss_rev = epoch_loss_ca_pos = epoch_loss_ca_neg = 0
            batch_amount = 0

            pbar = tqdm(self.train_data_loader,
                        desc="Iteration",
                        ascii=True,
                        dynamic_ncols=True)

            for b_idx, batch in enumerate(pbar):
                loss_for, loss_rev, loss_ca_pos, loss_ca_neg = run_batch_fn(
                    batch, testing=False)
                epoch_loss_for += loss_for.item()
                epoch_loss_rev += loss_rev.item()
                epoch_loss_ca_pos += loss_ca_pos.item()
                epoch_loss_ca_neg += loss_ca_neg.item()
                batch_amount += 1
                pbar.set_postfix(
                    FLoss="{:.5f}".format(epoch_loss_for / batch_amount),
                    BLoss="{:.5f}".format(epoch_loss_rev / batch_amount),
                    PosLoss="{:.5f}".format(epoch_loss_ca_pos / batch_amount),
                    NegLoss="{:.5f}".format(epoch_loss_ca_neg / batch_amount))
                if b_idx == max_iter_per_epoch:
                    break

            epoch_loss_lm = (epoch_loss_for + epoch_loss_rev) / 2
            epoch_loss_ca = (epoch_loss_ca_pos + epoch_loss_ca_neg)
            epoch_loss = \
                (self.lm_scale * epoch_loss_lm + \
                self.ca_scale * epoch_loss_ca) / batch_amount
            print_time_info("Epoch {} finished, training loss {}".format(
                idx, epoch_loss))

            valid_loss = self.test(batch_size, valid_data_engine)
            with open(self.log_file, 'a') as fw:
                fw.write(f"{idx},{epoch_loss},{valid_loss}\n")

            print_time_info("Epoch {}: save model...".format(idx))
            self.save_model(self.model_dir, idx)
Example #28
def build_dataset(dialogues, is_lemma, use_punct, min_length):
    input_data = []
    output_labels = [[] for _ in range(4)]
    spacy_parser = spacy.load('en')
    """
        For now, the data has four different layers:
            1. NOUN + PROPN + PRON
            2. NOUN + PROPN + PRON + VERB
            3. NOUN + PROPN + PRON + VERB + ADJ + ADV
            4. ALL
    """
    for idx, dialog in enumerate(dialogues):
        if idx % 1000 == 0:
            print_time_info(
                    "Parsed {}/{} dialogues".format(idx, len(dialogues)))
        attrs = []
        for attr_pair in dialog[0]:
            attrs.append(attr_pair[0])
            attrs.append(attr_pair[1])
        input_data.append(attrs)
        output_label = [[] for _ in range(4)]
        for w in dialog[1]:
            # ['NOUN', 'PROPN', 'PRON'] -> ['VERB'] -> ['ADJ', 'ADV'] -> OTHERS
            if w[0] in ["NAMETOKEN", "NEARTOKEN"]:
                w[1] = "NOUN"
            if w[1] in ['NOUN', 'PROPN', 'PRON']:
                output_label[0].append(w[0])
                output_label[1].append(w[0])
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            elif w[1] == 'VERB':
                word = w[0]
                if is_lemma:
                    word = spacy_parser(word)[0].lemma_
                output_label[1].append(word)
                output_label[2].append(word)
                output_label[3].append(word)
            elif w[1] in ['ADJ', 'ADV']:
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            else:
                if w[1] == "PUNCT" and not use_punct:
                    pass
                else:
                    output_label[3].append(w[0])

            '''
            # ['NOUN', 'PROPN', 'PRON'] -> ['ADJ', 'ADV'] -> ['VERB'] -> OTHERS
            if w[0] in ["NAMETOKEN", "NEARTOKEN"]:
                w[1] = "NOUN"
            if w[1] in ['NOUN', 'PROPN', 'PRON']:
                output_label[0].append(w[0])
                output_label[1].append(w[0])
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            elif w[1] == 'VERB':
                word = w[0]
                if is_lemma:
                    word = spacy_parser(word)[0].lemma_
                output_label[2].append(word)
                output_label[3].append(word)
            elif w[1] in ['ADJ', 'ADV']:
                output_label[1].append(w[0])
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            else:
                if w[1] == "PUNCT" and not use_punct:
                    pass
                else:
                    output_label[3].append(w[0])
            '''

            '''
            # ['VERB'] -> ['NOUN', 'PROPN', 'PRON'] -> ['ADJ', 'ADV'] -> OTHERS
            if w[0] in ["NAMETOKEN", "NEARTOKEN"]:
                w[1] = "NOUN"
            if w[1] in ['NOUN', 'PROPN', 'PRON']:
                output_label[1].append(w[0])
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            elif w[1] == 'VERB':
                word = w[0]
                if is_lemma:
                    word = spacy_parser(word)[0].lemma_
                output_label[0].append(word)
                output_label[1].append(word)
                output_label[2].append(word)
                output_label[3].append(word)
            elif w[1] in ['ADJ', 'ADV']:
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            else:
                if w[1] == "PUNCT" and not use_punct:
                    pass
                else:
                    output_label[3].append(w[0])
            '''

            '''
            # ['VERB'] -> ['ADJ', 'ADV'] -> ['NOUN', 'PROPN', 'PRON'] -> OTHERS
            if w[0] in ["NAMETOKEN", "NEARTOKEN"]:
                w[1] = "NOUN"
            if w[1] in ['NOUN', 'PROPN', 'PRON']:
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            elif w[1] == 'VERB':
                word = w[0]
                if is_lemma:
                    word = spacy_parser(word)[0].lemma_
                output_label[0].append(word)
                output_label[1].append(word)
                output_label[2].append(word)
                output_label[3].append(word)
            elif w[1] in ['ADJ', 'ADV']:
                output_label[1].append(w[0])
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            else:
                if w[1] == "PUNCT" and not use_punct:
                    pass
                else:
                    output_label[3].append(w[0])
            '''

            '''
            # ['NOUN', 'PROPN', 'PRON'] -> OTHERS -> ['VERB'] -> ['ADJ', 'ADV']
            if w[0] in ["NAMETOKEN", "NEARTOKEN"]:
                w[1] = "NOUN"
            if w[1] in ['NOUN', 'PROPN', 'PRON']:
                output_label[0].append(w[0])
                output_label[1].append(w[0])
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            elif w[1] == 'VERB':
                word = w[0]
                if is_lemma:
                    word = spacy_parser(word)[0].lemma_
                output_label[2].append(word)
                output_label[3].append(word)
            elif w[1] in ['ADJ', 'ADV']:
                output_label[3].append(w[0])
            else:
                if w[1] == "PUNCT" and not use_punct:
                    pass
                else:
                    output_label[1].append(w[0])
                    output_label[2].append(w[0])
                    output_label[3].append(w[0])
            '''

            '''
            # ['NOUN', 'PROPN', 'PRON'] -> OTHERS -> ['ADJ', 'ADV'] -> ['VERB']
            if w[0] in ["NAMETOKEN", "NEARTOKEN"]:
                w[1] = "NOUN"
            if w[1] in ['NOUN', 'PROPN', 'PRON']:
                output_label[0].append(w[0])
                output_label[1].append(w[0])
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            elif w[1] == 'VERB':
                word = w[0]
                if is_lemma:
                    word = spacy_parser(word)[0].lemma_
                output_label[3].append(word)
            elif w[1] in ['ADJ', 'ADV']:
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            else:
                if w[1] == "PUNCT" and not use_punct:
                    pass
                else:
                    output_label[1].append(w[0])
                    output_label[2].append(w[0])
                    output_label[3].append(w[0])
            '''


        for idx in range(4):
            output_labels[idx].append(deepcopy(output_label[idx]))

    if min_length == -1:
        print_time_info(
                "No minimal length, data count: {}".format(len(dialogues)))
    else:
        print_time_info("Minimal length is {}".format(min_length))
        idxs = []
        for idx, sent in enumerate(input_data):
            if len(output_labels[3][idx]) > min_length:
                idxs.append(idx)
        input_data = [input_data[i] for i in idxs]
        output_labels = [
                [output_label[i] for i in idxs]
                for output_label in output_labels]
        print_time_info("Data count: {}".format(len(idxs)))
    return input_data, output_labels
Example #29
                    help='the max length of generated sequence [20]')
parser.add_argument('--data_size',
                    type=int,
                    default=25000,
                    help='the generated data size [25000]')
parser.add_argument('--vocab_size',
                    type=int,
                    default=10,
                    help='the vocab size of sequences [10]')
parser.add_argument('--reverse',
                    type=int,
                    default=0,
                    help='reverse the output sequence or not [0]')
args = parser.parse_args()

print_time_info("Data size: {}".format(args.data_size))
print_time_info("Min length: {}".format(args.min_length))
print_time_info("Max length: {}".format(args.max_length))
print_time_info("Vocab size: {}".format(args.vocab_size))
print_time_info("Start generate data...")

lengths = random.randint(args.min_length, args.max_length + 1, args.data_size)
data = [random.randint(0, args.vocab_size, length) for length in lengths]
labels = [d[::-1] if args.reverse else d for d in data]
with open(os.path.join(args.data_dir, "data.txt"), 'w') as file:
    for idx in range(args.data_size):
        d_string = ' '.join(map(str, data[idx]))
        l_string = ' '.join(map(str, labels[idx]))
        file.write("{} | {}\n".format(d_string, l_string))

print_time_info("Done")
Example #30
    def build_vocab(self, corpus, tokens=None):
        # You should pass a list with all words in the dataset as corpus
        self.vocab, self.rev_vocab = {}, []
        self.vocab['_UNK'] = len(self.rev_vocab)
        self.rev_vocab.append('_UNK')
        self.vocab['_PAD'] = len(self.rev_vocab)
        self.rev_vocab.append('_PAD')
        self.vocab['_BOS'] = len(self.rev_vocab)
        self.rev_vocab.append('_BOS')
        self.vocab['_EOS'] = len(self.rev_vocab)
        self.rev_vocab.append('_EOS')
        print_time_info("Build vocab: {} words".format(len(corpus)))
        raw_vocab = {}
        for word in corpus:
            if word not in raw_vocab:
                raw_vocab[word] = 0
            raw_vocab[word] += 1

        sorted_vocab = sorted(raw_vocab.items(),
                              key=operator.itemgetter(1))[::-1]
        word_cnt = 0
        for idx, word in enumerate(sorted_vocab):
            word_cnt += word[1]
            if ((word_cnt / len(corpus)) >= 0.9
                    and (word_cnt - word[1]) / len(corpus) < 0.9):
                print_time_info("90% coverage: vocab size {}".format(idx))
            if ((word_cnt / len(corpus)) >= 0.95
                    and ((word_cnt - word[1]) / len(corpus)) < 0.95):
                print_time_info("95% coverage: vocab size {}".format(idx))
            if ((word_cnt / len(corpus)) >= 0.99
                    and ((word_cnt - word[1]) / len(corpus)) < 0.99):
                print_time_info("99% coverage: vocab size {}".format(idx))
        print_time_info("100% coverage: vocab size {}".format(
            len(sorted_vocab)))

        for word, _ in sorted_vocab:
            self.vocab[word] = len(self.rev_vocab)
            self.rev_vocab.append(word)

        if self.split_vocab:
            self.token_vocab, self.rev_token_vocab = {}, []
            self.token_vocab['_UNK'] = len(self.rev_token_vocab)
            self.rev_token_vocab.append('_UNK')
            self.token_vocab['_PAD'] = len(self.rev_token_vocab)
            self.rev_token_vocab.append('_PAD')
            raw_vocab = {}
            for token in tokens:
                if token not in raw_vocab:
                    raw_vocab[token] = 0
                raw_vocab[token] += 1

            sorted_vocab = sorted(raw_vocab.items(),
                                  key=operator.itemgetter(1))[::-1]

            for token, _ in sorted_vocab:
                self.token_vocab[token] = len(self.rev_token_vocab)
                self.rev_token_vocab.append(token)

        print_time_info("Save vocab data to {}".format(self.vocab_path))
        if not tokens:
            pickle.dump([self.vocab, self.rev_vocab],
                        open(self.vocab_path, 'wb'))
        else:
            pickle.dump([
                self.vocab, self.rev_vocab, self.token_vocab,
                self.rev_token_vocab
            ], open(self.vocab_path, 'wb'))
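build_vocab expects a flat list of word tokens (one entry per occurrence, not a set), so the coverage statistics can be computed from frequencies. A hypothetical call, assuming the Tokenizer class shown in Examples #11 and #30, might look like:

# Flatten tokenized sentences into one word list first.
sentences = [["hello", "world"], ["hello", "there"]]
corpus = [word for sent in sentences for word in sent]

tokenizer = Tokenizer("vocab.pkl", split_vocab=False, regen=True, train=True)
tokenizer.build_vocab(corpus)      # also pickles [vocab, rev_vocab] to vocab.pkl
print(tokenizer.vocab["hello"])    # -> 4: indices 0-3 hold _UNK/_PAD/_BOS/_EOS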