Example #1
0
    def load_training_and_validation_data(self,
                                          training_data_ids_path,
                                          training_data_labels_path,
                                          validation_data_ids_path,
                                          validation_data_labels_path,
                                          classifier_data_dir,
                                          batch_size=10):
        """Load tokenized documents and labels from .npy files and build
        fastai-style train/validation DataLoaders on ``self.model_data``.

        Args:
            training_data_ids_path: .npy file of per-document token-id
                sequences for the training split.
            training_data_labels_path: .npy file of training labels.
            validation_data_ids_path: .npy file of per-document token-id
                sequences for the validation split.
            validation_data_labels_path: .npy file of validation labels.
            classifier_data_dir: working directory handed to ModelData.
            batch_size: validation batch size; training uses half of it.

        Side effects:
            Sets ``self.model_data``.
        """
        # The id arrays hold variable-length token sequences (the sampler
        # keys below call len() on individual rows), so numpy stores them
        # as object arrays; allow_pickle=True is required to load those on
        # numpy >= 1.16.3.  Only load files from a trusted source.
        training_token_ids = np.load(training_data_ids_path,
                                     allow_pickle=True)
        validation_token_ids = np.load(validation_data_ids_path,
                                       allow_pickle=True)
        training_labels = np.load(training_data_labels_path)
        validation_labels = np.load(validation_data_labels_path)

        # Re-base labels so the smallest class id becomes 0.
        # NOTE(review): each split is shifted by its *own* minimum; if the
        # validation split lacks the lowest class, its labels end up
        # misaligned with training.  Confirm both splits always contain
        # every class, or shift both by the training minimum.
        training_labels = training_labels.flatten()
        validation_labels = validation_labels.flatten()
        training_labels -= training_labels.min()
        validation_labels -= validation_labels.min()

        training_dataset = TextDataset(training_token_ids, training_labels)
        validation_dataset = TextDataset(validation_token_ids,
                                         validation_labels)
        # Length-aware samplers group similar-length documents into the
        # same batch to minimize padding.
        training_data_sampler = SortishSampler(
            data_source=training_token_ids,
            key=lambda x: len(training_token_ids[x]),
            bs=batch_size // 2)
        validation_data_sampler = SortSampler(
            data_source=validation_token_ids,
            key=lambda x: len(validation_token_ids[x]))
        training_dataloader = DataLoader(dataset=training_dataset,
                                         batch_size=batch_size // 2,
                                         transpose=True,
                                         num_workers=1,
                                         pad_idx=1,
                                         sampler=training_data_sampler)
        validation_dataloader = DataLoader(dataset=validation_dataset,
                                           batch_size=batch_size,
                                           transpose=True,
                                           num_workers=1,
                                           pad_idx=1,
                                           sampler=validation_data_sampler)
        self.model_data = ModelData(path=classifier_data_dir,
                                    trn_dl=training_dataloader,
                                    val_dl=validation_dataloader)
Example #2
0
File: rnn.py  Project: dotannn/nlp-final
    def _train_classifier(self,
                          train_ids,
                          train_labels,
                          batch_size=4,
                          val_ids=None,
                          val_labels=None):
        """Fine-tune the RNN classifier head on the pretrained encoder
        using gradual unfreezing (fastai-style).

        Args:
            train_ids: sequence of token-id lists, one per document.
            train_labels: per-document iterable(s) of class indices;
                converted to multi-hot target vectors below.
            batch_size: mini-batch size for both loaders.
            val_ids: validation token-id lists (required).
            val_labels: validation class-index lists (required).

        Raises:
            ValueError: if validation data is not supplied.

        Side effects:
            Sets ``self._classifier_model``, loads 'enc_weights' and
            saves 'classifier_weights' through the learner; training
            logs go to ./tmp/log.
        """
        # Fail fast with a clear message: the None defaults previously
        # flowed straight into the one-hot encoding loop and crashed with
        # an opaque TypeError.
        if val_ids is None or val_labels is None:
            raise ValueError(
                "_train_classifier requires val_ids and val_labels; "
                "ModelData needs a validation DataLoader.")

        # Convert per-document class-index lists into multi-hot vectors
        # so the problem can be trained as multi-label with BCE.
        def one_hot_idxs(idxs, n_classes):
            # Vector of length n_classes with 1.0 at every index in idxs.
            res = np.zeros(n_classes)
            res[idxs] = 1.
            return res

        onehot_train_labels = np.array(
            [one_hot_idxs(l, self._n_classes) for l in train_labels])
        onehot_val_labels = np.array(
            [one_hot_idxs(l, self._n_classes) for l in val_labels])

        train_ds = TextDataset(train_ids, onehot_train_labels)
        val_ds = TextDataset(val_ids, onehot_val_labels)

        # Length-aware samplers batch similar-length texts to cut padding.
        train_sampler = SortishSampler(train_ids,
                                       key=lambda x: len(train_ids[x]),
                                       bs=batch_size)
        val_sampler = SortSampler(val_ids, key=lambda x: len(val_ids[x]))

        train_dl = DataLoader(train_ds,
                              batch_size,
                              num_workers=1,
                              transpose=True,
                              pad_idx=1,
                              sampler=train_sampler)
        val_dl = DataLoader(val_ds,
                            batch_size,
                            num_workers=1,
                            transpose=True,
                            pad_idx=1,
                            sampler=val_sampler)

        md = ModelData("tmp", train_dl, val_dl)

        m = get_rnn_classifier(
            self._bptt,
            20 * 70,  # max sequence length fed to the classifier head
            self._n_classes,
            self._vocab.size,
            emb_sz=self._embedding_size,
            n_hid=self._n_hidden_activations,
            n_layers=self._n_layers,
            pad_token=1,
            layers=[self._embedding_size * 3, 128, self._n_classes],
            drops=[self._dropouts_classifier[4], 0.1],
            dropouti=self._dropouts_classifier[0],
            wdrop=self._dropouts_classifier[1],
            dropoute=self._dropouts_classifier[2],
            dropouth=self._dropouts_classifier[3])

        self._classifier_model = RNN_Learner(md,
                                             TextModel(to_gpu(m)),
                                             opt_fn=self.OPT_FN)
        self._classifier_model.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
        self._classifier_model.clip = 25.  # or 0.3 ?!

        # Sigmoid + BCE so each class is scored independently
        # (multi-label).  Tensor.sigmoid() replaces the deprecated
        # F.sigmoid with identical results.
        def binary_ce_wrapper(predicted, gt):
            out = predicted.sigmoid()
            return binary_cross_entropy(out, gt)

        self._classifier_model.crit = binary_ce_wrapper
        # Threshold-at-0.5 metrics; __name__ is set so fastai can print a
        # readable column header for each partial.
        jaccard_0_5 = partial(self.func_metric, func=jaccard_index)
        jaccard_0_5.__name__ = "jaccard_0_5"
        precision_0_5 = partial(self.func_metric, func=precision)
        precision_0_5.__name__ = "precision_0_5"
        recall_0_5 = partial(self.func_metric, func=recall)
        recall_0_5.__name__ = "recall_0_5"
        f1_0_5 = partial(self.func_metric, func=f1)
        f1_0_5.__name__ = "f1_0_5"

        self._classifier_model.metrics = [
            jaccard_0_5, precision_0_5, recall_0_5, f1_0_5
        ]

        # Discriminative learning rates: each earlier layer group trains
        # at lr / 2.6^k (ULMFiT-style).
        lr = 3e-3
        lrm = 2.6
        lrs = np.array(
            [lr / (lrm**4), lr / (lrm**3), lr / (lrm**2), lr / lrm, lr])

        self._classifier_model.load_encoder('enc_weights')

        # Gradual unfreezing: head only, then last two groups, then all.
        self._classifier_model.freeze_to(-1)
        self._classifier_model.fit(
            lrs,
            1,
            cycle_len=1,
            use_clr=(8, 3),
            callbacks=[LoggingCallback(save_path="./tmp/log")])
        self._classifier_model.freeze_to(-2)
        self._classifier_model.fit(
            lrs,
            1,
            cycle_len=1,
            use_clr=(8, 3),
            callbacks=[LoggingCallback(save_path="./tmp/log")])
        self._classifier_model.unfreeze()
        self._classifier_model.fit(
            lrs,
            1,
            cycle_len=24,
            use_clr=(32, 10),
            callbacks=[LoggingCallback(save_path="./tmp/log")])

        self._classifier_model.save('classifier_weights')

# Notebook cell: show the class distribution of the training labels
# (presumably to eyeball class balance before training — no value is
# stored, only displayed).
df_train.label.value_counts()


# In[19]:


# Validation batch size; the training loader runs at bs//2.
bs = 64
trn_ds = TextDataset(tokens_train, df_train.label.values)
val_ds = TextDataset(tokens_val, df_val.label.values)
# Length-keyed samplers batch similar-length texts to minimize padding.
trn_samp = SortishSampler(tokens_train, key=lambda x: len(tokens_train[x]), bs=bs//2)
val_samp = SortSampler(tokens_val, key=lambda x: len(tokens_val[x]))
# pad_idx=2 marks the padding token id; transpose=False keeps batches in
# their default layout — assumes the downstream model expects batch-first
# input, TODO confirm (the RNN examples above use transpose=True).
trn_dl = DataLoader(trn_ds, bs//2, transpose=False, num_workers=1, pad_idx=2, sampler=trn_samp)
val_dl = DataLoader(val_ds, bs, transpose=False, num_workers=1, pad_idx=2, sampler=val_samp)
model_data = ModelData(path, trn_dl, val_dl)


# In[20]:


model= get_transformer_classifier(
    n_tok=n_toks, 
    emb_sz=EMB_DIM, 
    n_head=12, 
    n_layer=3, 
    n_ctx=200,
    max_seq_len=100,
    clf_layers=[EMB_DIM, 50, 3],
    pad_token=2,
    embd_pdrop=0.1,