def train_over_dumps(self, model, filenames):
    """Train *model* on every tweet found in the given dump files.

    Duplicate file names are discarded and the remaining names are
    processed in sorted order; the deduplicated list is stored on the
    instance for later use.  Returns the trained model.
    """
    # remove duplicate file names; sorted() accepts any iterable,
    # so no intermediate list() is needed
    filenames = sorted(set(filenames))
    self.__filenames = filenames
    reader = DataReader(filenames,
                        on_file_start_cb=self.on_file_start,
                        on_file_stop_cb=self.on_file_stop)
    sys.stdout.write("Processing %d files...\n" % len(filenames))
    # each item yielded by the reader is a (sentiment, tweet) pair
    for sentiment, tweet in reader.get_tweets():
        model.fit(tweet, sentiment)
    return model
def run(self):
    """Load a stored sentiment model and print a prediction per tweet.

    The first command-line argument names the model file; any further
    arguments are tweet dump files.  Returns True when the model file
    is missing (error), False after predictions were printed.
    """
    args = self.get_arguments()
    model_file, filenames = args[0], args[1:]

    # bail out early if there is nothing to load
    if not os.path.exists(model_file):
        print("ERROR: model file '%s' doesn't exist" % model_file)
        return True

    model = SentimentModel.load(model_file)
    print("Prediction:")
    # the stored sentiment of each pair is irrelevant here — we only
    # care about the model's own prediction for the tweet
    for _sentiment, tweet in DataReader(filenames).get_tweets():
        label = TEXT_LABELS[model.predict(tweet)]
        print("  %10s: %s" % (label, tweet.text))
    return False
def get_gold_comparison(self):
    """Return two parallel lists: gold-standard labels and predictions.

    The former contains the gold-standard labels and the latter the
    labels predicted by the model for the given datasets.  Both lists
    have the same size, and the item in the i-th position of the
    second list is the predicted label for the i-th gold item.
    """
    if not self.gold and not self.test:
        # cache gold/test generation as it may be called from more
        # than one place and doesn't change unless the dataset
        # changes (and thus the class is initialized again)
        reader = DataReader(self.datasets)
        self.gold, self.test = [], []
        for sentiment, tweet in reader.get_tweets():
            self.gold.append(sentiment)
            self.test.append(self.model.predict(tweet))
    return (self.gold, self.test)