# Requires: import numpy as np, plus the project's AlgorithmRunner and Statistics helpers.
def question_1(data, data_features):
    """Compare KNN and Rocchio classifiers using k-fold cross-validation."""
    print("Question 1:")
    stats = Statistics()
    features = data_features.loc[:, data_features.columns != 'imdb_score']
    labels = data_features['imdb_score']

    for name in ("KNN", "Rocchio"):
        runner = AlgorithmRunner(name)
        folds = data.split_to_k_folds()
        sum_precision = sum_recall = sum_accuracy = 0

        for train_idx, test_idx in folds:
            # Fit on the training fold, then predict on the held-out fold.
            runner.fit(features.iloc[train_idx], labels.iloc[train_idx])
            pred = runner.algorithm.predict(features.iloc[test_idx])
            fold_labels = np.array(labels.iloc[test_idx])

            sum_precision += stats.precision(labels=fold_labels, predictions=pred)
            sum_recall += stats.recall(labels=fold_labels, predictions=pred)
            sum_accuracy += stats.accuracy(labels=fold_labels, predictions=pred)

        # Average each metric over the 5 folds.
        print(name + " classifier: ", sum_precision / 5, ",",
              sum_recall / 5, ",", sum_accuracy / 5)
    print(" ")
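The helpers used above (AlgorithmRunner, Statistics, and the k-fold splitter on data) are defined elsewhere in the project. As a rough sketch only, assuming scikit-learn backends and a binary imdb_score target with 1 as the positive class, the two helpers could look roughly like the following; the choice of k for KNN and the metric conventions are assumptions, not the original code:

# Hypothetical sketch of the helpers assumed by question_1 (not the original code).
import numpy as np
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid


class AlgorithmRunner:
    """Thin wrapper that selects a classifier by name and exposes fit()."""

    def __init__(self, name):
        # "KNN" -> k-nearest neighbours; "Rocchio" -> nearest-centroid classifier.
        # n_neighbors=10 is an assumed value.
        self.algorithm = (KNeighborsClassifier(n_neighbors=10)
                          if name == "KNN" else NearestCentroid())

    def fit(self, features, labels):
        self.algorithm.fit(features, labels)


class Statistics:
    """Precision, recall, and accuracy for binary labels with positive class 1."""

    def precision(self, labels, predictions):
        tp = np.sum((predictions == 1) & (labels == 1))
        return tp / max(np.sum(predictions == 1), 1)

    def recall(self, labels, predictions):
        tp = np.sum((predictions == 1) & (labels == 1))
        return tp / max(np.sum(labels == 1), 1)

    def accuracy(self, labels, predictions):
        return np.mean(predictions == labels)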
def run_train_batched(self, train_data, valid_data, vocabs):
    print(self.model.parameters)
    total_train = train_data.compute_batches(
        self.opt.batch_size, vocabs, self.opt.max_chars, 0, 1,
        self.opt.decoder_type, trunc=self.opt.trunc)
    total_valid = valid_data.compute_batches(
        self.opt.batch_size, vocabs, self.opt.max_chars, 0, 1,
        self.opt.decoder_type, randomize=False, trunc=self.opt.trunc)
    print('Computed Batches. Total train={}, Total valid={}'.format(
        total_train, total_valid))

    report_stats = Statistics()
    self.last_ppl = None

    for epoch in range(self.start_epoch, self.opt.epochs + 1):
        # Training pass
        self.model.train()
        total_stats = Statistics()
        batch_number = -1
        for idx, batch in enumerate(train_data.batches):
            batch['gpu'] = self.opt.gpuid[0]
            loss, batch_stats = self.model.forward(batch)
            batch_size = batch['code'].size(0)
            loss.div(batch_size).backward()

            report_stats.update(batch_stats)
            total_stats.update(batch_stats)
            batch_number += 1

            # clip_grad_norm_ comes from torch.nn.utils
            clip_grad_norm_(self.model.parameters(), self.opt.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()

            # Print a progress report every opt.report_every batches.
            if (batch_number + 1) % self.opt.report_every == 0:
                report_stats.output(epoch, batch_number + 1,
                                    len(train_data.batches),
                                    total_stats.start_time)
                report_stats = Statistics()

        print('Train perplexity: %g' % total_stats.ppl())
        print('Train accuracy: %g' % total_stats.accuracy())

        # Validation pass
        self.model.eval()
        valid_stats = Statistics()
        for idx, batch in enumerate(valid_data.batches):
            batch['gpu'] = self.opt.gpuid[0]
            loss, batch_stats = self.model.forward(batch)
            valid_stats.update(batch_stats)

        print('Validation perplexity: %g' % valid_stats.ppl())
        print('Validation accuracy: %g' % valid_stats.accuracy())

        self.update_learning_rate(valid_stats)

        print('Saving model')
        self.save_checkpoint(epoch, valid_stats)
        print('Model saved')
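Both training loops lean on a Statistics accumulator with update(), ppl(), accuracy(), output(), and a start_time field. The original class is not shown here; the following is a minimal sketch in the style of OpenNMT-like statistics objects, assuming loss is summed cross-entropy and n_words counts target tokens (the field names, the output format, and the perplexity cap are assumptions):

# Hypothetical sketch of the Statistics accumulator assumed by the training loops.
import math
import time


class Statistics:
    def __init__(self, loss=0.0, n_words=0, n_correct=0):
        self.loss = loss
        self.n_words = n_words
        self.n_correct = n_correct
        self.start_time = time.time()

    def update(self, other):
        # Accumulate loss and token counts from a per-batch Statistics object.
        self.loss += other.loss
        self.n_words += other.n_words
        self.n_correct += other.n_correct

    def accuracy(self):
        return 100.0 * self.n_correct / max(self.n_words, 1)

    def ppl(self):
        # Perplexity: exponentiated average per-token loss, capped to avoid overflow.
        return math.exp(min(self.loss / max(self.n_words, 1), 100))

    def output(self, epoch, batch, n_batches, start):
        print('Epoch %d, %d/%d; acc: %.2f; ppl: %.2f; %.0f s elapsed'
              % (epoch, batch, n_batches, self.accuracy(), self.ppl(),
                 time.time() - start))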
def run_train_batched(self, train_data, valid_data, vocabs):
    print(self.model.parameters)
    total_train = train_data.compute_batches(
        self.opt.batch_size, vocabs, self.opt.max_camel, 0, 1)
    # Note: the validation batch size is hard-coded to 10 here.
    total_valid = valid_data.compute_batches(
        10, vocabs, self.opt.max_camel, 0, 1, randomize=False)
    print('Computed Batches. Total train={}, Total valid={}'.format(
        total_train, total_valid))

    for epoch in range(self.start_epoch, self.opt.epochs + 1):
        # Training pass
        self.model.train()
        total_stats = Statistics()
        for idx, batch in enumerate(train_data.batches):
            loss, batch_stats = self.model.forward(batch)
            batch_size = batch['code'].size(0)
            loss.div(batch_size).backward()

            report_stats = Statistics()
            report_stats.update(batch_stats)
            total_stats.update(batch_stats)

            # clip_grad_norm(self.model.parameters(), self.opt.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            # report_stats.output(epoch, idx + 1, len(train_data.batches),
            #                     total_stats.start_time)

        print('Train perplexity: %g' % total_stats.ppl())
        print('Train accuracy: %g' % total_stats.accuracy())
        self.train_scores.append(total_stats.accuracy())
        self.train_ppl.append(total_stats.ppl())

        # Validation pass: switch to evaluation mode (disables dropout and
        # batch-norm updates; it does not by itself disable gradient tracking).
        self.model.eval()
        valid_stats = Statistics()
        for idx, batch in enumerate(valid_data.batches):
            loss, batch_stats = self.model.forward(batch)
            valid_stats.update(batch_stats)

        print('Validation perplexity: %g' % valid_stats.ppl())
        print('Validation accuracy: %g' % valid_stats.accuracy())
        self.valid_scores.append(valid_stats.accuracy())
        self.valid_ppl.append(valid_stats.ppl())

        # plt.figure(1)
        # plt.plot(self.train_scores, label='train accuracy', color='red')
        # plt.plot(self.valid_scores, label='valid accuracy', color='blue')
        # plt.savefig('accuracy.png')
        #
        # plt.figure(2)
        # plt.plot(self.train_ppl, label='train perplexity', color='red')
        # plt.plot(self.valid_ppl, label='valid perplexity', color='blue')
        # plt.savefig('perplexity.png')

        print('Saving model')
        self.save_checkpoint(epoch, valid_stats)
        print('Model saved')
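The commented-out matplotlib block above rewrites the same figures every epoch and never adds a legend. A hedged alternative is to plot the per-epoch lists once after training finishes; the helper below is a sketch, with the output file names carried over from the commented-out code rather than taken from a known working version:

# Hypothetical post-training plotting helper for the curves collected above.
import matplotlib.pyplot as plt


def plot_curves(train_scores, valid_scores, train_ppl, valid_ppl):
    # Accuracy per epoch
    plt.figure()
    plt.plot(train_scores, label='train accuracy', color='red')
    plt.plot(valid_scores, label='valid accuracy', color='blue')
    plt.xlabel('epoch')
    plt.ylabel('accuracy (%)')
    plt.legend()
    plt.savefig('accuracy.png')

    # Perplexity per epoch
    plt.figure()
    plt.plot(train_ppl, label='train perplexity', color='red')
    plt.plot(valid_ppl, label='valid perplexity', color='blue')
    plt.xlabel('epoch')
    plt.ylabel('perplexity')
    plt.legend()
    plt.savefig('perplexity.png')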