    def __init__(self,
                 normalize=False,
                 balance=False,
                 tweet_threshold=0,
                 score=False,
                 dump_model=True):
        """
        Load a previously trained model from disk if one is available;
        otherwise train a classifier or regression model, optionally
        score it on the test set, and dump it for later reuse.
        """
        self.model = RegressionModel()
        if not self.model.load():
            # No saved model: build the train/test splits from the raw data.
            training_set, testing_set = RegressionModel.load_datasets(
                balance=balance, viral_threshold=tweet_threshold)

            if ViralityPrediction.CLASSIFICATION:
                # Classification mode: normalise features, then fit and
                # (optionally) score the classifier.
                training_set = self.model.normaliseFeats(training_set)
                testing_set = self.model.normaliseFeats(testing_set)
                self.model.trainClassifier(training_set, normalize=normalize)
                if score:
                    self.model.scoreClassifier(testing_set)

            else:
                # Regression mode: fit and (optionally) score the regressor.
                self.model.trainRegression(training_set, normalize=normalize)
                if score:
                    self.model.scoreRegression(testing_set)

            if dump_model:
                # Persist the freshly trained model to disk.
                self.model.dump()
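
A minimal usage sketch for the constructor above, assuming it belongs to a ViralityPrediction class and that RegressionModel behaves as shown; the argument values here are illustrative only.

# Hypothetical example: load a cached model, or train, score and dump a new
# one on a balanced dataset with a 1000-retweet virality threshold.
predictor = ViralityPrediction(normalize=True,
                               balance=True,
                               tweet_threshold=1000,
                               score=True,
                               dump_model=True)
# predictor.model now holds a trained (or freshly loaded) RegressionModel.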
Example #2
    def loadData(self):
        '''
        Initialise the data: load the train/test splits from HDFS
        and prepare the feature matrices and target vectors.
        '''
        self.training_data, self.test_data = RegressionModel.load_datasets(
            balance=True, viral_threshold=50000)

        self.viral_threshold = 50000

        # Train on a subset of the training data, since training on the
        # full set takes a long time.
        self.X_train = self.training_data[:30000, :-1]
        self.Y_train = self.training_data[:30000, -1]
        self.X_test = self.test_data[:, :-1]
        self.Y_test = self.test_data[:, -1]

        # Compute the label medians, then override both with the viral
        # threshold so that it is used as the decision boundary.
        self.test_median = np.median(self.Y_test)
        self.train_median = np.median(self.Y_train)
        self.train_median = self.viral_threshold
        self.test_median = self.viral_threshold

        print("Test Median")
        print(self.test_median)
        print("Training Median")
        print(self.train_median)
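
A small follow-on sketch, assuming `loader` is an instance on which loadData() above has already run and that the threshold stored in train_median / test_median is meant to binarise the targets; this labelling step is illustrative and not part of the original snippet.

import numpy as np

# Illustrative only: derive binary "viral" labels from the prepared targets
# using the threshold that loadData() stored in train_median / test_median.
y_train_labels = (loader.Y_train >= loader.train_median).astype(int)
y_test_labels = (loader.Y_test >= loader.test_median).astype(int)
print("Viral fraction in train:", y_train_labels.mean())
print("Viral fraction in test:", y_test_labels.mean())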