Пример #1
0
def _mean_fold_scores(runner, folds, features, target, stats):
    """Cross-validate ``runner`` and return its mean precision, recall and accuracy.

    Args:
        runner: object exposing ``fit(X, y)`` and an ``algorithm`` attribute
            that provides ``predict(X)``.
        folds: iterable of ``(train_indices, test_indices)`` pairs used for
            positional (``iloc``) row selection. It is consumed once, so a
            generator is fine.
        features: DataFrame holding the predictor columns.
        target: Series of labels aligned row-for-row with ``features``.
        stats: object exposing ``precision``, ``recall`` and ``accuracy``,
            each called with ``labels=`` and ``predictions=`` keywords.

    Returns:
        A ``(precision, recall, accuracy)`` tuple averaged over the folds
        that were actually iterated.
    """
    precision_sum = 0
    recall_sum = 0
    accuracy_sum = 0
    fold_count = 0

    for train_idx, test_idx in folds:
        runner.fit(features.iloc[train_idx], target.iloc[train_idx])
        predictions = runner.algorithm.predict(features.iloc[test_idx])
        labels = np.array(target.iloc[test_idx])

        precision_sum += stats.precision(labels=labels,
                                         predictions=predictions)
        recall_sum += stats.recall(labels=labels, predictions=predictions)
        accuracy_sum += stats.accuracy(labels=labels, predictions=predictions)
        fold_count += 1

    # Average over the real number of folds instead of assuming a fixed count.
    # With no folds at all the sums are still 0, so dividing by 1 reports 0.
    divisor = max(fold_count, 1)
    return (precision_sum / divisor, recall_sum / divisor,
            accuracy_sum / divisor)


def question_1(data, data_features):
    """Print cross-validated precision, recall and accuracy for KNN and Rocchio.

    Args:
        data: object whose ``split_to_k_folds()`` returns an iterable of
            ``(train_indices, test_indices)`` pairs.
        data_features: DataFrame containing the predictor columns plus the
            ``'imdb_score'`` column, which is used as the label.

    Returns:
        None. Results are written to stdout, one line per classifier.
    """
    print("Question 1:")
    knn_runner = AlgorithmRunner("KNN")
    rocchio_runner = AlgorithmRunner("Rocchio")
    stats = Statistics()

    # Select predictors and labels once; every fold only slices rows from them.
    features = data_features.loc[:, data_features.columns != 'imdb_score']
    target = data_features['imdb_score']

    # Each classifier gets its own call to split_to_k_folds().
    knn_precision, knn_recall, knn_accuracy = _mean_fold_scores(
        knn_runner, data.split_to_k_folds(), features, target, stats)
    print("KNN classifier: ", knn_precision, ",", knn_recall, ",",
          knn_accuracy)

    rocchio_precision, rocchio_recall, rocchio_accuracy = _mean_fold_scores(
        rocchio_runner, data.split_to_k_folds(), features, target, stats)
    print("Rocchio classifier: ", rocchio_precision, ",", rocchio_recall, ",",
          rocchio_accuracy)
    print(" ")
Пример #2
0
  def run_train_batched(self, train_data, valid_data, vocabs):
    """Train the model epoch by epoch, validating and checkpointing after each.

    For every epoch from ``self.start_epoch`` through ``self.opt.epochs``
    (inclusive) this runs one optimisation step per training batch, prints
    the train perplexity/accuracy, evaluates on the validation batches,
    calls ``self.update_learning_rate`` with the validation statistics and
    saves a checkpoint.

    Args:
      train_data: dataset exposing ``compute_batches(...)`` and ``batches``.
      valid_data: dataset with the same interface, batched without
        randomisation.
      vocabs: vocabularies forwarded unchanged to ``compute_batches``.
    """
    # Imported locally so this method does not rely on a module-level import.
    import torch

    # Not called: this prints the repr of the bound ``parameters`` method.
    print(self.model.parameters)

    total_train = train_data.compute_batches(self.opt.batch_size, vocabs, self.opt.max_chars, 0, 1, self.opt.decoder_type,  trunc=self.opt.trunc)
    total_valid = valid_data.compute_batches(self.opt.batch_size, vocabs, self.opt.max_chars, 0, 1, self.opt.decoder_type, randomize=False, trunc=self.opt.trunc)

    print('Computed Batches. Total train={}, Total valid={}'.format(total_train, total_valid))

    # Rolling statistics for the periodic progress report; reset after each report.
    report_stats = Statistics()
    self.last_ppl = None

    for epoch in range(self.start_epoch, self.opt.epochs + 1):
      self.model.train()

      total_stats = Statistics()
      num_batches = len(train_data.batches)

      for idx, batch in enumerate(train_data.batches):
        batch['gpu'] = self.opt.gpuid[0]
        loss, batch_stats = self.model.forward(batch)
        batch_size = batch['code'].size(0)
        # Back-propagate the loss divided by the number of rows in the batch.
        loss.div(batch_size).backward()
        report_stats.update(batch_stats)
        total_stats.update(batch_stats)

        clip_grad_norm_(self.model.parameters(), self.opt.max_grad_norm)
        self.optimizer.step()
        self.optimizer.zero_grad()

        # Report after every ``report_every`` completed batches.
        batches_done = idx + 1
        if batches_done % self.opt.report_every == 0:
          report_stats.output(epoch, batches_done, num_batches, total_stats.start_time)
          report_stats = Statistics()

      print('Train perplexity: %g' % total_stats.ppl())
      print('Train accuracy: %g' % total_stats.accuracy())

      self.model.eval()
      valid_stats = Statistics()
      # eval() only switches layer behaviour; no_grad() is what stops autograd
      # from recording a graph for forward passes that are never back-propagated.
      with torch.no_grad():
        for batch in valid_data.batches:
          batch['gpu'] = self.opt.gpuid[0]
          loss, batch_stats = self.model.forward(batch)
          valid_stats.update(batch_stats)

      print('Validation perplexity: %g' % valid_stats.ppl())
      print('Validation accuracy: %g' % valid_stats.accuracy())

      self.update_learning_rate(valid_stats)
      print('Saving model')
      self.save_checkpoint(epoch, valid_stats)
      print('Model saved')
Пример #3
0
    def run_train_batched(self, train_data, valid_data, vocabs):
        """Train the model epoch by epoch, recording metrics and checkpointing.

        For every epoch from ``self.start_epoch`` through ``self.opt.epochs``
        (inclusive) this runs one optimisation step per training batch,
        evaluates on the validation batches, appends the epoch's accuracy and
        perplexity to ``self.train_scores`` / ``self.train_ppl`` /
        ``self.valid_scores`` / ``self.valid_ppl`` and saves a checkpoint.

        Args:
            train_data: dataset exposing ``compute_batches(...)`` and
                ``batches``.
            valid_data: dataset with the same interface, batched without
                randomisation.
            vocabs: vocabularies forwarded unchanged to ``compute_batches``.
        """
        # Imported locally so this method does not rely on a module-level import.
        import torch

        # Not called: this prints the repr of the bound ``parameters`` method.
        print(self.model.parameters)

        total_train = train_data.compute_batches(self.opt.batch_size, vocabs,
                                                 self.opt.max_camel, 0, 1)

        # Validation uses a fixed batch size of 10, not ``self.opt.batch_size``.
        total_valid = valid_data.compute_batches(10,
                                                 vocabs,
                                                 self.opt.max_camel,
                                                 0,
                                                 1,
                                                 randomize=False)

        print('Computed Batches. Total train={}, Total valid={}'.format(
            total_train, total_valid))

        for epoch in range(self.start_epoch, self.opt.epochs + 1):
            self.model.train()

            total_stats = Statistics()
            for batch in train_data.batches:
                loss, batch_stats = self.model.forward(batch)
                batch_size = batch['code'].size(0)
                # Back-propagate the loss divided by the number of rows in the batch.
                loss.div(batch_size).backward()
                total_stats.update(batch_stats)

                # NOTE: no gradient clipping is applied in this loop.
                self.optimizer.step()
                self.optimizer.zero_grad()

            train_ppl = total_stats.ppl()
            train_accuracy = total_stats.accuracy()
            print('Train perplexity: %g' % train_ppl)
            print('Train accuracy: %g' % train_accuracy)

            self.train_scores.append(train_accuracy)
            self.train_ppl.append(train_ppl)

            self.model.eval()
            valid_stats = Statistics()
            # eval() only switches layer behaviour; no_grad() is what stops
            # autograd from recording a graph for these forward passes.
            with torch.no_grad():
                for batch in valid_data.batches:
                    loss, batch_stats = self.model.forward(batch)
                    valid_stats.update(batch_stats)

            valid_ppl = valid_stats.ppl()
            valid_accuracy = valid_stats.accuracy()
            print('Validation perplexity: %g' % valid_ppl)
            print('Validation accuracy: %g' % valid_accuracy)

            self.valid_scores.append(valid_accuracy)
            self.valid_ppl.append(valid_ppl)

            print('Saving model')
            self.save_checkpoint(epoch, valid_stats)
            print('Model saved')