def train_TG(classes, train_fname, model_fname, iterations=2, L1=0, heldout_fname=None, crossValidation=None, verbose=False):
    """
    Train a TruncatedGradient learner and write the resulting model to disk.

    The training vectors are read from train_fname with
    SEQUENTIAL_FILE_READER, a TruncatedGradient(classes) learner is
    configured from the keyword arguments, trained, and finally asked to
    write its model to model_fname.

    Arguments:
    classes         -- passed to the TruncatedGradient constructor; also
                       multiplies the number of feature IDs to give the
                       feature count handed to train() and writeModel().
    train_fname     -- file holding the training vectors.
    model_fname     -- destination passed to Learner.writeModel().
    iterations      -- stored as Learner.total_iterations.
    L1              -- stored as Learner.c (presumably the L1
                       regularisation coefficient -- confirm against
                       TruncatedGradient).
    heldout_fname   -- optional file of held-out vectors; when given its
                       "vects" entry is stored as Learner.heldoutVects.
    crossValidation -- optional number of folds; when truthy it is stored
                       as Learner.folds.
    verbose         -- stored as Learner.verbose.

    NOTE(review): earlier documentation stated that held-out accuracy is
    reported after each iteration, that per-fold and average accuracy are
    reported under cross-validation, and that requesting both at once is
    reported as an error. None of that happens in this function; it is
    presumably implemented by the learner -- confirm.
    """
    train_reader = SEQUENTIAL_FILE_READER(train_fname)
    try:
        trainVects = train_reader.read()
    finally:
        # Release the reader even when parsing the file fails.
        train_reader.close()

    # Initialised up front so the name is always bound (mirrors train_SGD).
    heldoutVects = None
    if heldout_fname:
        heldout_reader = SEQUENTIAL_FILE_READER(heldout_fname)
        try:
            heldoutVects = heldout_reader.read()
        finally:
            heldout_reader.close()

    Learner = TruncatedGradient(classes)
    Learner.total_iterations = iterations
    Learner.c = L1
    Learner.verbose = verbose
    if crossValidation:
        Learner.folds = crossValidation
    if heldout_fname:
        Learner.heldoutVects = heldoutVects["vects"]

    # One weight per (class, feature) pair.
    no_features = classes * len(trainVects["featIDs"])
    Learner.train(trainVects["vects"], no_features)
    Learner.writeModel(no_features, model_fname)
def load_model(self, model_fname, train_fname):
    """
    Restore a previously saved model into this object.

    Three things are loaded:
      * self.Cinv -- the matrix stored in "<model_fname>.matrix",
        read with genfromtxt.
      * (self.D, self.t) -- rebuilt by get_train_data() from the vectors
        read out of train_fname.
      * self.beta and self.kernel -- parsed from the parameter file
        model_fname.

    Parameter file layout, as consumed below:
      line 1: whitespace separated, the second field is beta;
      line 2: TAB separated, the second field is the kernel type;
      next 4 lines: "<name> <value>" pairs for theta_0 .. theta_3.

    NOTE(review): only "GAUSSIAN_QUADRATIC_KERNEL" is recognised; for any
    other kernel type self.kernel is left untouched and no theta lines
    are read.
    """
    self.Cinv = genfromtxt("%s.matrix" % model_fname)

    train_file = SEQUENTIAL_FILE_READER(train_fname)
    try:
        train_vects = train_file.read()
    finally:
        # Release the reader even when parsing the file fails.
        train_file.close()
    (self.D, self.t) = get_train_data(train_vects)

    # "with" guarantees the parameter file is closed if a line is malformed.
    with open(model_fname) as para_file:
        self.beta = float(para_file.readline().split()[1])
        kernel_type = para_file.readline().strip().split("\t")[1]
        if kernel_type == "GAUSSIAN_QUADRATIC_KERNEL":
            self.kernel = GAUSSIAN_QUADRATIC_KERNEL()
            theta_names = ("theta_0", "theta_1", "theta_2", "theta_3")
            for _ in range(len(theta_names)):
                (para, val) = para_file.readline().strip().split()
                # Unknown parameter names are skipped, as before.
                if para in theta_names:
                    setattr(self.kernel, para, float(val))
def predict_GPR(test_fname, train_fname, model_fname, output_fname=None, accuracy=False):
    """
    Predict the outputs for the test instances with a saved GPR model.

    A GPR learner is restored with load_model(model_fname, train_fname)
    and learner.predict() is called on every vector read from test_fname.
    One line "<mean>\\t<variance>" is written per test vector.

    Arguments:
    test_fname    -- file holding the test vectors.
    train_fname   -- training file, needed by GPR.load_model().
    model_fname   -- model file, needed by GPR.load_model().
    output_fname  -- where to write the predictions; when not given the
                     writer is created in "STDOUT" mode.
    accuracy      -- when true, each vector's label is compared with the
                     predicted mean and a final "RMSE = ..." line is
                     written.

    The RMSE is the square root of the mean squared error,
    sqrt(sum((label - mean)^2) / n). It is only computed when accuracy is
    requested and at least one vector was scored, so an empty test set or
    accuracy=False can no longer cause a division by zero.
    """
    test_file = SEQUENTIAL_FILE_READER(test_fname)
    try:
        test_vects = test_file.read()
    finally:
        # Release the reader even when parsing the file fails.
        test_file.close()

    learner = GPR()
    learner.load_model(model_fname, train_fname)

    if output_fname:
        output_file = SEQUENTIAL_FILE_WRITER(output_fname)
    else:
        output_file = SEQUENTIAL_FILE_WRITER(None, "STDOUT")

    scored = 0
    squared_error = 0.0
    try:
        for v in test_vects["vects"]:
            (mean, variance) = learner.predict(v)
            output_file.writeLine("%f\t%f\n" % (mean, variance))
            if accuracy:
                squared_error += (v.label - mean) ** 2
                scored += 1
        if accuracy and scored > 0:
            # Divide before taking the root: RMSE = sqrt(SSE / n).
            rmse = sqrt(squared_error / float(scored))
            output_file.writeLine("RMSE = %f\n" % rmse)
    finally:
        # Close the writer even if a prediction fails half-way through.
        output_file.close()
def train_GPR(train_fname, model_fname, verbose=True, beta=1, theta_0=None, theta_1=None, theta_2=None, theta_3=None):
    """
    Train a Gaussian Process regression model and save it.

    The training vectors are read from train_fname, a GPR learner is
    configured with a GAUSSIAN_QUADRATIC_KERNEL, trained on those vectors
    and saved through learner.save_model(model_fname).

    Arguments:
    train_fname  -- file holding the training vectors.
    model_fname  -- destination passed to learner.save_model().
    verbose      -- stored as learner.verbose.
    beta         -- stored as learner.beta.
    theta_0 .. theta_3 -- optional kernel hyper-parameters. A value of
                    None keeps whatever GAUSSIAN_QUADRATIC_KERNEL() set
                    for that attribute; any other value, including 0,
                    overrides it.
    """
    train_file = SEQUENTIAL_FILE_READER(train_fname)
    try:
        train_vects = train_file.read()
    finally:
        # Release the reader even when parsing the file fails.
        train_file.close()

    learner = GPR()
    learner.verbose = verbose
    learner.beta = beta

    kernel = GAUSSIAN_QUADRATIC_KERNEL()
    overrides = (
        ("theta_0", theta_0),
        ("theta_1", theta_1),
        ("theta_2", theta_2),
        ("theta_3", theta_3),
    )
    for name, value in overrides:
        # Compare against None, not truthiness, so an explicit 0 is honoured.
        if value is not None:
            setattr(kernel, name, value)

    learner.set_kernel(kernel)
    learner.train(train_vects)
    learner.save_model(model_fname)
def load_model(self, model_fname, train_fname):
    """
    Restore a previously saved model into this object.

    Three things are loaded:
      * self.Cinv -- the matrix stored in "<model_fname>.matrix",
        read with genfromtxt.
      * (self.D, self.t) -- rebuilt by get_train_data() from the vectors
        read out of train_fname.
      * self.beta and self.kernel -- parsed from the parameter file
        model_fname.

    Parameter file layout, as consumed below:
      line 1: whitespace separated, the second field is beta;
      line 2: TAB separated, the second field is the kernel type;
      next 4 lines: "<name> <value>" pairs for theta_0 .. theta_3.

    NOTE(review): only "GAUSSIAN_QUADRATIC_KERNEL" is recognised; for any
    other kernel type self.kernel is left untouched and no theta lines
    are read.
    """
    self.Cinv = genfromtxt("%s.matrix" % model_fname)

    train_file = SEQUENTIAL_FILE_READER(train_fname)
    try:
        train_vects = train_file.read()
    finally:
        # Release the reader even when parsing the file fails.
        train_file.close()
    (self.D, self.t) = get_train_data(train_vects)

    # "with" guarantees the parameter file is closed if a line is malformed.
    with open(model_fname) as para_file:
        self.beta = float(para_file.readline().split()[1])
        kernel_type = para_file.readline().strip().split("\t")[1]
        if kernel_type == "GAUSSIAN_QUADRATIC_KERNEL":
            self.kernel = GAUSSIAN_QUADRATIC_KERNEL()
            theta_names = ("theta_0", "theta_1", "theta_2", "theta_3")
            for _ in range(len(theta_names)):
                (para, val) = para_file.readline().strip().split()
                # Unknown parameter names are skipped, as before.
                if para in theta_names:
                    setattr(self.kernel, para, float(val))
def predict_GPR(test_fname, train_fname, model_fname, output_fname=None, accuracy=False):
    """
    Predict the outputs for the test instances with a saved GPR model.

    A GPR learner is restored with load_model(model_fname, train_fname)
    and learner.predict() is called on every vector read from test_fname.
    One line "<mean>\\t<variance>" is written per test vector.

    Arguments:
    test_fname    -- file holding the test vectors.
    train_fname   -- training file, needed by GPR.load_model().
    model_fname   -- model file, needed by GPR.load_model().
    output_fname  -- where to write the predictions; when not given the
                     writer is created in "STDOUT" mode.
    accuracy      -- when true, each vector's label is compared with the
                     predicted mean and a final "RMSE = ..." line is
                     written.

    The RMSE is the square root of the mean squared error,
    sqrt(sum((label - mean)^2) / n). It is only computed when accuracy is
    requested and at least one vector was scored, so an empty test set or
    accuracy=False can no longer cause a division by zero.
    """
    test_file = SEQUENTIAL_FILE_READER(test_fname)
    try:
        test_vects = test_file.read()
    finally:
        # Release the reader even when parsing the file fails.
        test_file.close()

    learner = GPR()
    learner.load_model(model_fname, train_fname)

    if output_fname:
        output_file = SEQUENTIAL_FILE_WRITER(output_fname)
    else:
        output_file = SEQUENTIAL_FILE_WRITER(None, "STDOUT")

    scored = 0
    squared_error = 0.0
    try:
        for v in test_vects["vects"]:
            (mean, variance) = learner.predict(v)
            output_file.writeLine("%f\t%f\n" % (mean, variance))
            if accuracy:
                squared_error += (v.label - mean) ** 2
                scored += 1
        if accuracy and scored > 0:
            # Divide before taking the root: RMSE = sqrt(SSE / n).
            rmse = sqrt(squared_error / float(scored))
            output_file.writeLine("RMSE = %f\n" % rmse)
    finally:
        # Close the writer even if a prediction fails half-way through.
        output_file.close()
def train_SGD(classes, train_fname, model_fname, iterations=2, L2=0, heldout_fname=None, crossValidation=None, verbose=False):
    """
    Train an SGD learner and write the resulting model to disk.

    The training vectors are read from train_fname with
    SEQUENTIAL_FILE_READER, an SGD(classes) learner is configured from the
    keyword arguments, trained, and finally asked to write its model to
    model_fname. A "Writing the model..." message is printed just before
    the model is written.

    Arguments:
    classes         -- passed to the SGD constructor; also multiplies the
                       number of feature IDs to give the feature count
                       handed to train() and writeModel().
    train_fname     -- file holding the training vectors.
    model_fname     -- destination passed to Learner.writeModel().
    iterations      -- stored as Learner.total_iterations.
    L2              -- stored as Learner.c (presumably the L2
                       regularisation coefficient -- confirm against SGD).
    heldout_fname   -- optional file of held-out vectors; when given its
                       "vects" entry is stored as Learner.heldoutVects.
    crossValidation -- optional number of folds; when truthy it is stored
                       as Learner.folds.
    verbose         -- stored as Learner.verbose.

    NOTE(review): earlier documentation described this as logistic
    regression trained by stochastic gradient descent, with held-out
    accuracy reported after each iteration, per-fold and average accuracy
    reported under cross-validation, and an error reported when both are
    requested. None of that is visible in this function; it is presumably
    implemented by the SGD learner -- confirm.
    """
    train_reader = SEQUENTIAL_FILE_READER(train_fname)
    try:
        trainVects = train_reader.read()
    finally:
        # Release the reader even when parsing the file fails.
        train_reader.close()

    heldoutVects = None
    if heldout_fname:
        heldout_reader = SEQUENTIAL_FILE_READER(heldout_fname)
        try:
            heldoutVects = heldout_reader.read()
        finally:
            heldout_reader.close()

    Learner = SGD(classes)
    Learner.total_iterations = iterations
    Learner.c = L2
    Learner.verbose = verbose
    if crossValidation:
        Learner.folds = crossValidation
    if heldout_fname:
        Learner.heldoutVects = heldoutVects["vects"]

    # One weight per (class, feature) pair.
    no_features = classes * len(trainVects["featIDs"])
    Learner.train(trainVects["vects"], no_features)

    # Single-argument parenthesised form prints the same text on Python 2 and 3.
    print("Writing the model... %s" % model_fname)
    Learner.writeModel(no_features, model_fname)