class ViralityPrediction: SCORE_PLOT_FILENAME = "hashtags_score.png" # If CLASSIFICATION is set to True, classification is used, otherwise regression CLASSIFICATION = True # K defines the number of results in the Top-K virality predictions K = 10 def __init__(self, normalize=False, balance=False, tweet_threshold=0, score=False, dump_model=True): """ Import or train the regression model """ self.model = RegressionModel() if not self.model.load(): training_set, testing_set = RegressionModel.load_datasets( balance=balance, viral_threshold=tweet_threshold) if ViralityPrediction.CLASSIFICATION == True: training_set = self.model.normaliseFeats(training_set) testing_set = self.model.normaliseFeats(testing_set) self.model.trainClassifier(training_set, normalize=normalize) if score: self.model.scoreClassifier(testing_set) else: self.model.trainRegression(training_set, normalize=normalize) if score: self.model.scoreRegression(testing_set) if dump_model: self.model.dump() def predict(self, hashtags, hashtag_threshold=None): """ Return a dictionary containing for each hashtag its virality prediction - hashtags: dictionary hashtag -> array of tweets features - hashtag_threshold: if defined, the dictionary value will be a boolean set at true if the hashtag goes viral, false otherwise. If not defined, the value will be the number of retweets Features are [followers_count, friends_count, listed_count, statuses_count, hashtags_count, media_count, user_mention_count, url_count, verified_account, is_a_retweet, tweet_length] """ values = {} for key, value in hashtags.iteritems(): if ViralityPrediction.CLASSIFICATION: tweets_values = self.model.predictClassifier(value) else: tweets_values = self.model.predictRegression(value) hashtag_value = sum(tweets_values) if hashtag_threshold is not None: if hashtag_value >= hashtag_threshold: values[key] = 1 else: values[key] = 0 else: # Round to the nearest 10 below the current value values[key] = max(0, int(math.floor(hashtag_value / 10.0)) * 10) return values def score(self, expected, predicted, labels=None, showPlot=True, savePlot=False): if showPlot or savePlot: x = np.arange(len(expected)) width = 0.8 ticks = x + x * width fig = plt.figure() ax = fig.add_subplot(111) bar1 = ax.bar(ticks, expected, color='green') bar2 = ax.bar(ticks + width, predicted, color='blue') ax.set_xlim(-width, (ticks + width)[-1] + 2 * width) ax.set_ylim(0, max(max(expected), max(predicted)) * 1.05) ax.set_xticks(ticks + width) if labels is not None: xtickNames = ax.set_xticklabels(labels) plt.setp(xtickNames, rotation=45, fontsize=10) ax.set_title('Expected and predicted retweet count per hashtag') ax.legend((bar1[0], bar2[0]), ('Expected', 'Predicted')) if savePlot: plt.savefig(DataAnalyser.PLOT_DIR + ViralityPrediction.SCORE_PLOT_FILENAME, format='png') if showPlot: plt.show() return np.mean(predicted - expected)**2
class ViralityPrediction: SCORE_PLOT_FILENAME = "hashtags_score.png" # If CLASSIFICATION is set to True, classification is used, otherwise regression CLASSIFICATION = True # K defines the number of results in the Top-K virality predictions K = 10 def __init__(self, normalize=False, balance=False, tweet_threshold=0, score=False, dump_model=True): """ Import or train the regression model """ self.model = RegressionModel() if not self.model.load(): training_set, testing_set = RegressionModel.load_datasets( balance=balance, viral_threshold=tweet_threshold) if ViralityPrediction.CLASSIFICATION == True: training_set = self.model.normaliseFeats(training_set) testing_set = self.model.normaliseFeats(testing_set) self.model.trainClassifier(training_set, normalize=normalize) if score: self.model.scoreClassifier(testing_set) else: self.model.trainRegression(training_set, normalize=normalize) if score: self.model.scoreRegression(testing_set) if dump_model: self.model.dump() def predict(self, hashtags, hashtag_threshold=None): """ Return a dictionary containing for each hashtag its virality prediction - hashtags: dictionary hashtag -> array of tweets features - hashtag_threshold: if defined, the dictionary value will be a boolean set at true if the hashtag goes viral, false otherwise. If not defined, the value will be the number of retweets Features are [followers_count, friends_count, listed_count, statuses_count, hashtags_count, media_count, user_mention_count, url_count, verified_account, is_a_retweet, tweet_length] """ values = {} for key, value in hashtags.iteritems(): if ViralityPrediction.CLASSIFICATION: tweets_values = self.model.predictClassifier(value) else: tweets_values = self.model.predictRegression(value) hashtag_value = sum(tweets_values) if hashtag_threshold is not None: if hashtag_value >= hashtag_threshold: values[key] = 1 else: values[key] = 0 else: # Round to the nearest 10 below the current value values[key] = max(0, int(math.floor(hashtag_value / 10.0)) * 10) return values def score(self, expected, predicted, labels=None, showPlot=True, savePlot=False): if showPlot or savePlot: x = np.arange(len(expected)) width = 0.8 ticks = x + x * width fig = plt.figure() ax = fig.add_subplot(111) bar1 = ax.bar(ticks, expected, color='green') bar2 = ax.bar(ticks + width, predicted, color='blue') ax.set_xlim(-width, (ticks + width)[-1] + 2 * width) ax.set_ylim(0, max(max(expected), max(predicted)) * 1.05) ax.set_xticks(ticks + width) if labels is not None: xtickNames = ax.set_xticklabels(labels) plt.setp(xtickNames, rotation=45, fontsize=10) ax.set_title('Expected and predicted retweet count per hashtag') ax.legend((bar1[0], bar2[0]), ('Expected', 'Predicted')) if savePlot: plt.savefig(DataAnalyser.PLOT_DIR + ViralityPrediction.SCORE_PLOT_FILENAME, format='png') if showPlot: plt.show() return np.mean(predicted - expected) ** 2