Example #1
    def train(self, train_loader, test_loader):
        train_text, y_train = self._gather_text(train_loader)
        test_text, y_test = self._gather_text(test_loader)

        vectorizer = TfidfVectorizer()
        log.info("Fitting a vectorizer")
        vectorizer.fit(train_text)
        log.info("... complete")
        log.info("Transforming text")
        X_train = vectorizer.transform(train_text)
        X_test = vectorizer.transform(test_text)
        log.info("... complete")

        param_grid = {
            "n_estimators": [10, 50, 150, 250, 500],
            "criterion": ["gini", "entropy"],
            "max_depth": [None, 10, 2],
            "min_samples_split": [2, 3]
        }
        clf = GridSearchCV(RandomForestClassifier(), param_grid=param_grid)

        clf.fit(X_train, y_train)

        monitor = {
            "test_F1": Multilabel.f1_scores(y_test, clf.predict(X_test)),
            "train_F1": Multilabel.f1_scores(y_train, clf.predict(X_train))
        }

        return monitor
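
A note on scoring: every example in this listing delegates to a project-local helper, Multilabel.f1_scores(y_true, y_pred), whose body never appears here. Below is a minimal sketch of one plausible implementation, assuming it wraps scikit-learn's f1_score with micro averaging over binary indicator arrays; the averaging choice is a guess, not confirmed by the source.

    # Plausible reconstruction of the Multilabel.f1_scores helper used in
    # every example. Micro averaging is an assumption; the real project may
    # average per label or per sample instead.
    import numpy as np
    from sklearn.metrics import f1_score

    class Multilabel:
        @staticmethod
        def f1_scores(y_true, y_pred):
            # Both arguments are (n_samples, n_labels) binary indicator arrays.
            return f1_score(np.asarray(y_true), np.asarray(y_pred),
                            average="micro")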
Example #2
    def train(self, train_loader, test_loader, epochs):
        optimizer = optim.Adam(self.model.parameters())
        criterion = nn.BCEWithLogitsLoss()
        y_true, y_pred = self._gather_outputs(test_loader)
        log.info("Test F1: {}".format(Multilabel.f1_scores(y_true, y_pred)))

        for epoch in range(epochs):
            log.info("Epoch: {}".format(epoch))
            self.model.train(True)
            for idx, (_id, labels, text, _, _, _) in enumerate(train_loader, 1):
                labels = torch.FloatTensor(labels)
                seq = torch.LongTensor(text)
                if self.use_cuda:
                    seq, labels = seq.cuda(), labels.cuda()
                self.model.zero_grad()
                self.model.hidden = self.model.init_hidden()
                output = self.model(seq)
                loss = criterion(output, labels)
                loss.backward()
                optimizer.step()

                if idx % 1000 == 0:
                    log.info("Train Loop: {} done".format(idx))

            y_true, y_pred = self._gather_outputs(test_loader)
            log.info("Test F1: {}".format(Multilabel.f1_scores(y_true,
                                                               y_pred)))
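
Examples #2, #6, and #9 reset model.hidden through an init_hidden() call before each sequence, which suggests an LSTM whose recurrent state is kept as a model attribute. A minimal sketch of that helper under this assumption follows; num_layers, hidden_dim, and use_cuda are placeholder attribute names, not from the source.

    # Hypothetical init_hidden() matching the call sites in Examples #2, #6,
    # and #9: a fresh (h_0, c_0) pair for an nn.LSTM. Attribute names are
    # assumptions.
    def init_hidden(self, batch_size=1):
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim)
        if self.use_cuda:
            h0, c0 = h0.cuda(), c0.cuda()
        return h0, c0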
Example #3
    def eval(self, train_loader, test_loader):
        y_true, y_pred = self.gather_outputs(test_loader)
        test_score = Multilabel.f1_scores(y_true, y_pred)
        logger.info("Test F1: {}".format(test_score))
        y_true, y_pred = self.gather_outputs(train_loader)
        train_score = Multilabel.f1_scores(y_true, y_pred)
        logger.info("Train F1: {}".format(train_score))
        # Despite the attribute names, these lists hold F1 scores, not accuracies.
        self.testAccuracies.append(test_score)
        self.trainAccuracies.append(train_score)
Example #4
    def test(self, test_loader, word_model_path, sent_model_path):
        if self.cuda:
            self.word_attention = self.word_attention.cuda()
            self.sent_attention = self.sent_attention.cuda()
        self.word_attention.load_state_dict(torch.load(word_model_path))
        self.sent_attention.load_state_dict(torch.load(sent_model_path))
        y_true_single, y_true_multi, y_pred_single, y_pred_multi = self.gather_outputs(
            test_loader)
        test_f_score_single = Multilabel.f1_scores(y_true_single, y_pred_single)
        test_f_score_multi = Multilabel.f1_scores(y_true_multi, y_pred_multi)
        test_f_score_all = Multilabel.f1_scores(
            np.concatenate([y_true_multi, y_true_single]),
            np.concatenate([y_pred_multi, y_pred_single]))
        log.info("Test F1 single: {}".format(test_f_score_single))
        log.info("Test F1 multi: {}".format(test_f_score_multi))
        log.info("Test F1 all: {}".format(test_f_score_all))
Example #5
    def fit(self, train_loader, test_loader, epochs):
        if self.cuda:
            self.model = self.model.cuda()

        optimizer = optim.Adam(self.model.parameters())
        criterion = nn.BCEWithLogitsLoss()

        y_true, y_pred = self.gather_outputs(test_loader)
        log.info("Test F1: {}".format(Multilabel.f1_scores(y_true, y_pred)))

        monitor = {"train_F1": [], "test_F1": []}
        for epoch in range(epochs):
            log.info("Epoch: {}".format(epoch))
            self.model.train(True)
            for text_batch, labels_batch in self._batch(train_loader, self.batch_size):
                if self.cuda:
                    text_batch, labels_batch = text_batch.cuda(), labels_batch.cuda()
                self.model.zero_grad()
                output = self.model(text_batch)
                loss = criterion(output, labels_batch)
                loss.backward()
                optimizer.step()

            y_true, y_pred = self.gather_outputs(test_loader)
            test_f1 = Multilabel.f1_scores(y_true, y_pred)
            log.info("Test F1: {}".format(test_f1))
            y_true, y_pred = self.gather_outputs(train_loader)
            train_f1 = Multilabel.f1_scores(y_true, y_pred)
            log.info("Train F1: {}".format(train_f1))
            monitor["train_F1"].append(train_f1)
            monitor["test_F1"].append(test_f1)

        return monitor
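
Examples #2, #3, and #5 all lean on a gather_outputs helper that turns a loader into (y_true, y_pred) arrays for Multilabel.f1_scores. Here is a sketch of what it likely does for the trainer in Example #5, assuming logits are thresholded at 0.5 after a sigmoid; the cutoff and batch layout are assumptions.

    # Hypothetical gather_outputs: run the model in eval mode over a loader
    # and stack ground truth alongside thresholded predictions. The 0.5
    # cutoff is an assumption.
    def gather_outputs(self, loader):
        self.model.train(False)
        y_true, y_pred = [], []
        with torch.no_grad():
            for text_batch, labels_batch in self._batch(loader, self.batch_size):
                if self.cuda:
                    text_batch = text_batch.cuda()
                logits = self.model(text_batch)
                y_pred.append((torch.sigmoid(logits) > 0.5).long().cpu().numpy())
                y_true.append(labels_batch.cpu().numpy())
        return np.vstack(y_true), np.vstack(y_pred)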
Example #6
                    model.hidden = model.init_hidden()

                    if len(ners[0]) != 0:
                        ner_word_seq = torch.LongTensor(ners[0])
                        ner_label_seq = torch.LongTensor(ners[1])

                        output = model(ner_word_seq, ner_label_seq)

                        loss = criterion(output, labels)
                        loss.backward()
                        optimizer.step()

                y_true, y_pred = eval_utilsPN.gather_outputs(
                    train_set, model, cuda, args.model)
                log.info("Train F1: {}".format(
                    Multilabel.f1_scores(y_true, y_pred)))

                y_true, y_pred = eval_utilsPN.gather_outputs(
                    test_set, model, cuda, args.model)
                log.info("Test F1: {}".format(
                    Multilabel.f1_scores(y_true, y_pred)))

        elif args.model == 'ner-comb-model':

            model = NERCombinedModel(len(train_set.label_dict),
                                     len(vocabulary.vocab),
                                     len(vocabulary.vocab_ner),
                                     len(vocabulary.entity_types_id))

            optimizer = optim.Adam(model.parameters())
            criterion = nn.BCEWithLogitsLoss()
Example #7
            groundtruth = []
            predictions = []
            for index, test_datapoint in enumerate(test_loader):
                prediction = randomForest.predict(
                    [lda.predict(test_datapoint)[0][0]])
                predictions.extend(prediction.tolist())
                groundtruth.append(list(test_datapoint[1][0].numpy()))
                if (index + 1) % 100 == 0:
                    print("Predicting Random Forest {}/{}".format(
                        index + 1, len(test_loader)))

            groundtruth, predictions = np.array(groundtruth), np.array(predictions)

            print("Test F1: {}".format(
                Multilabel.f1_scores(groundtruth, predictions)))

        elif args.model == "simple-deep":
            assert args.epochs > 0, "Provide number of epochs"
            cuda = torch.cuda.is_available()
            model = SimpleDeepModel(len(train_set.label_dict),
                                    len(vocabulary),
                                    2,
                                    use_cuda=cuda)
            log.info("Use CUDA: {}".format(cuda))
            if cuda:
                model = model.cuda()
            optimizer = optim.Adam(model.parameters())
            criterion = nn.BCEWithLogitsLoss()
            epochs = args.epochs
            y_true, y_pred = eval_utils.gather_outputs(test_set, model, cuda)
Example #8
    def fit(self, train_loader, test_loader, epochs):
        if self.cuda:
            self.word_attention = self.word_attention.cuda()
            self.sent_attention = self.sent_attention.cuda()

        word_optimizer = optim.Adam(self.word_attention.parameters(), lr=self.learning_rate,
                                    weight_decay=self.weight_decay)
        sent_optimizer = optim.Adam(self.sent_attention.parameters(), lr=self.learning_rate,
                                    weight_decay=self.weight_decay)
        scheduler_word = torch.optim.lr_scheduler.StepLR(word_optimizer, step_size=40, gamma=0.2)
        scheduler_sent = torch.optim.lr_scheduler.StepLR(sent_optimizer, step_size=40, gamma=0.2)
        best_fscore = 0

        for epoch in range(epochs):
            log.info("Epoch: {}".format(epoch))
            self.word_attention.train(True)
            self.sent_attention.train(True)

            count = 0
            all_loss = []
            for text_batch, labels_batch in self._batch(train_loader, self.batch_size):
                if self.cuda:
                    text_batch, labels_batch = text_batch.cuda(), labels_batch.cuda()

                if text_batch.size(0) != self.batch_size:
                    continue
                self.sent_attention.zero_grad()
                self.word_attention.zero_grad()
                predictions = self.forward(text_batch)

                loss = self.bce(predictions.view(-1), labels_batch.view(-1), self.neg_weight, 1 - self.neg_weight)

                loss.backward()
                sent_optimizer.step()
                word_optimizer.step()

                all_loss.append(loss.item())

                count += 1
                if count % 500 == 0:
                    log.info("Loss: {}".format(loss.item()))
            train_loss = np.mean(all_loss)
            log.info("Train avg Loss: {}".format(train_loss))
            self.log_score_to_file(os.path.join(self.log_file_path, "{}_loss_train.txt".format(self.exp_num)),
                                   train_loss)
            y_true, y_pred, y_prob = self.gather_outputs(test_loader)
            test_f_score = Multilabel.f1_scores(y_true, y_pred)
            log.info("Test F1: {}".format(test_f_score))
            self.log_score_to_file(os.path.join(self.log_file_path, "{}_f_score_test.txt".format(self.exp_num)),
                                   test_f_score)
            if self.learning_rate_type == "step":
                scheduler_sent.step()
                scheduler_word.step()
            y_true, y_pred, y_prob = self.gather_outputs(train_loader)
            train_f_score = Multilabel.f1_scores(y_true, y_pred)
            log.info("Train F1: {}".format(train_f_score))
            self.log_score_to_file(os.path.join(self.log_file_path, "{}_f_score_train.txt".format(self.exp_num)),
                                   train_f_score)
            if test_f_score >= best_fscore:
                best_fscore = test_f_score
                torch.save(self.sent_attention.state_dict(),
                           os.path.join(self.log_file_path, '{}_best_model_sent.pt'.format(self.exp_num)))
                torch.save(self.word_attention.state_dict(),
                           os.path.join(self.log_file_path, '{}_best_model_word.pt'.format(self.exp_num)))
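
Example #8 swaps nn.BCEWithLogitsLoss for a custom self.bce(preds, targets, neg_weight, pos_weight), called with pos_weight = 1 - self.neg_weight. One plausible per-element weighting is sketched below, assuming self.forward returns probabilities in [0, 1]; if it returned raw logits, binary_cross_entropy_with_logits would be the numerically safer base. The weighting scheme itself is a guess.

    # Hypothetical weighted binary cross-entropy matching the call signature
    # in the training loop above. Assumes preds are probabilities; the real
    # weighting is not shown in the source.
    @staticmethod
    def bce(preds, targets, neg_weight, pos_weight, eps=1e-7):
        preds = preds.clamp(eps, 1 - eps)
        loss = -(pos_weight * targets * torch.log(preds) +
                 neg_weight * (1 - targets) * torch.log(1 - preds))
        return loss.mean()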
Example #9
            for epoch in range(args.epochs):
                for _id, labels, text, _, _, _ in train_loader:
                    labels = torch.FloatTensor(labels)
                    model.zero_grad()
                    model.hidden = model.init_hidden()
                    seq = torch.LongTensor(text)
                    if args.cuda:
                        labels, seq = labels.cuda(), seq.cuda()
                    output = model(seq)
                    loss = criterion(output, labels)
                    loss.backward()
                    optimizer.step()

                y_true, y_pred = eval_utils.gather_outputs(
                    test_set, model, args.cuda)
                test_f1 = Multilabel.f1_scores(y_true, y_pred)
                log.info("Test F1: {}".format(test_f1))
                monitor["test_f1"].append(test_f1)

            file_utils.save_obj(monitor, "./results_{}".format(args.model_id))

        elif args.model == "embedding-glove":
            assert args.composition_method is not None, "Please provide composition method"
            glove_model_path = "./common_persist/glove.pkl"
            if os.path.exists(glove_model_path):
                log.info("Loading existing glove model")
                glove = file_utils.load_obj(glove_model_path)
            else:
                log.info("Reading and saving glove model")
                glove = GloVeEmbeddings(
                    "./common_persist/embeddings/glove.6B.300d.txt", vocabulary)