def __init__(self, normalize=False, balance=False, tweet_threshold=0, score=False, dump_model=True):
    """
    Import or train the regression model.

    A fresh ``RegressionModel`` is created and asked to ``load()`` itself.
    When ``load()`` returns a falsy value the datasets are loaded and a model
    is trained: a classifier when ``ViralityPrediction.CLASSIFICATION`` is
    truthy, a regression otherwise.

    :param normalize: forwarded as ``normalize`` to ``trainClassifier`` /
        ``trainRegression``.
    :param balance: forwarded as ``balance`` to
        ``RegressionModel.load_datasets``.
    :param tweet_threshold: forwarded as ``viral_threshold`` to
        ``RegressionModel.load_datasets``.
    :param score: when truthy, the freshly trained model is scored against
        the testing set.
    :param dump_model: when truthy, ``self.model.dump()`` is called after
        training.
    """
    self.model = RegressionModel()

    # presumably load() reports whether a previously dumped model was
    # restored -- confirm against RegressionModel.
    if self.model.load():
        return

    training_set, testing_set = RegressionModel.load_datasets(
        balance=balance, viral_threshold=tweet_threshold)

    # Truthiness test instead of "== True" (PEP 8).
    if ViralityPrediction.CLASSIFICATION:
        # Only the classification path runs both sets through normaliseFeats.
        training_set = self.model.normaliseFeats(training_set)
        testing_set = self.model.normaliseFeats(testing_set)
        self.model.trainClassifier(training_set, normalize=normalize)
        if score:
            self.model.scoreClassifier(testing_set)
    else:
        self.model.trainRegression(training_set, normalize=normalize)
        if score:
            self.model.scoreRegression(testing_set)

    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; dumping is assumed to happen only after a fresh training run.
    if dump_model:
        self.model.dump()
def loadData(self, viral_threshold=50000, train_limit=30000):
    '''
    Initialises the data: loads the balanced datasets and splits them into
    feature and target arrays.

    The last column of each dataset is used as the target (``Y_*``) and all
    preceding columns as the features (``X_*``). The original note states
    that the data is loaded from hdfs; that happens inside
    ``RegressionModel.load_datasets`` and is not visible here.

    :param viral_threshold: passed to ``RegressionModel.load_datasets`` and
        stored as ``self.viral_threshold``, ``self.train_median`` and
        ``self.test_median``. Defaults to the previously hard-coded 50000.
    :param train_limit: number of leading training rows kept for
        ``X_train`` / ``Y_train``. Defaults to the previously hard-coded
        30000.
    '''
    self.training_data, self.test_data = RegressionModel.load_datasets(
        balance=True, viral_threshold=viral_threshold)
    self.viral_threshold = viral_threshold

    # Picking a subset of the training data, as training with the full data
    # takes a lot of time.
    self.X_train = self.training_data[:train_limit, :-1]
    self.Y_train = self.training_data[:train_limit, -1]
    self.X_test = self.test_data[:, :-1]
    self.Y_test = self.test_data[:, -1]

    # Despite their names, both "median" attributes hold the viral threshold.
    # The np.median() results computed here previously were overwritten
    # immediately, so that dead computation has been dropped.
    self.train_median = self.viral_threshold
    self.test_median = self.viral_threshold

    # Single-argument parenthesised print gives identical output on Python 2
    # and also parses on Python 3.
    print("Test Median")
    print(self.test_median)
    print("Training Median")
    print(self.train_median)
def loadData(self, viral_threshold=50000, train_limit=30000):
    '''
    Initialises the data: loads the balanced datasets and splits them into
    feature and target arrays.

    The last column of each dataset is used as the target (``Y_*``) and all
    preceding columns as the features (``X_*``). The original note states
    that the data is loaded from hdfs; that happens inside
    ``RegressionModel.load_datasets`` and is not visible here.

    :param viral_threshold: passed to ``RegressionModel.load_datasets`` and
        stored as ``self.viral_threshold``, ``self.train_median`` and
        ``self.test_median``. Defaults to the previously hard-coded 50000.
    :param train_limit: number of leading training rows kept for
        ``X_train`` / ``Y_train``. Defaults to the previously hard-coded
        30000.
    '''
    self.training_data, self.test_data = RegressionModel.load_datasets(
        balance=True, viral_threshold=viral_threshold)
    self.viral_threshold = viral_threshold

    # Picking a subset of the training data, as training with the full data
    # takes a lot of time.
    self.X_train = self.training_data[:train_limit, :-1]
    self.Y_train = self.training_data[:train_limit, -1]
    self.X_test = self.test_data[:, :-1]
    self.Y_test = self.test_data[:, -1]

    # Despite their names, both "median" attributes hold the viral threshold.
    # The np.median() results computed here previously were overwritten
    # immediately, so that dead computation has been dropped.
    self.train_median = self.viral_threshold
    self.test_median = self.viral_threshold

    # Single-argument parenthesised print gives identical output on Python 2
    # and also parses on Python 3.
    print("Test Median")
    print(self.test_median)
    print("Training Median")
    print(self.train_median)