Пример #1
0
    def train_over_dumps(self, model, filenames):
        """
        Fit ``model`` on every tweet read from the given files.

        Duplicate file names are dropped and the remaining ones are sorted
        before reading. The resulting list is stored on the instance, and
        ``self.on_file_start`` / ``self.on_file_stop`` are handed to the
        reader as its file start/stop callbacks.

        Returns the same ``model`` object that was passed in.
        """
        # a set drops repeated names; sorting turns it back into a list
        unique_files = sorted(set(filenames))
        self.__filenames = unique_files

        reader = DataReader(
            unique_files,
            on_file_start_cb=self.on_file_start,
            on_file_stop_cb=self.on_file_stop,
        )

        sys.stdout.write("Processing %d files...\n" % len(unique_files))
        # the reader yields (sentiment, tweet) pairs; fit() takes them
        # in the opposite order
        for sentiment, tweet in reader.get_tweets():
            model.fit(tweet, sentiment)

        return model
Пример #2
0
    def run(self):
        """
        Load a saved sentiment model and print its prediction for each tweet.

        The first value returned by ``self.get_arguments()`` is taken as the
        model file path; all remaining values are passed to ``DataReader``
        as the files to read tweets from.

        Returns True if the model file does not exist (after printing an
        error message), and False once all predictions have been printed.
        """
        arguments = self.get_arguments()
        model_path, tweet_files = arguments[0], arguments[1:]

        model_missing = not os.path.exists(model_path)
        if model_missing:
            print("ERROR: model file '%s' doesn't exist" % model_path)
            return True

        classifier = SentimentModel.load(model_path)
        tweet_source = DataReader(tweet_files)

        print("Prediction:")
        # the sentiment stored with each tweet is not used here, only
        # the model's own prediction is reported
        for _sentiment, tweet in tweet_source.get_tweets():
            predicted = classifier.predict(tweet)
            print(" %10s: %s" % (TEXT_LABELS[predicted], tweet.text))

        return False
Пример #3
0
    def run(self):
        """
        Load a saved sentiment model and print its prediction for each tweet.

        The first value returned by ``self.get_arguments()`` is taken as the
        model file path; all remaining values are passed to ``DataReader``
        as the files to read tweets from.

        Returns True if the model file does not exist (after printing an
        error message), and False once all predictions have been printed.
        """
        arguments = self.get_arguments()
        model_path, tweet_files = arguments[0], arguments[1:]

        model_missing = not os.path.exists(model_path)
        if model_missing:
            print("ERROR: model file '%s' doesn't exist" % model_path)
            return True

        classifier = SentimentModel.load(model_path)
        tweet_source = DataReader(tweet_files)

        print("Prediction:")
        # the sentiment stored with each tweet is not used here, only
        # the model's own prediction is reported
        for _sentiment, tweet in tweet_source.get_tweets():
            predicted = classifier.predict(tweet)
            print(" %10s: %s" % (TEXT_LABELS[predicted], tweet.text))

        return False
Пример #4
0
    def get_gold_comparison(self):
        """
        Return the gold-standard labels and the model's predicted labels.

        Two parallel lists are built from the tweets read out of
        ``self.datasets``: the first holds the gold sentiment of each item
        and the second holds ``self.model.predict(tweet)`` for the same
        item, so position i in both lists refers to the same tweet.

        The result is cached in ``self.gold`` / ``self.test`` and returned
        as-is on later calls for as long as either list is non-empty.

        Returns:
            A ``(gold, test)`` tuple of two lists of equal length.
        """
        if not self.gold and not self.test:
            # cache gold/test generation as it may be called from more
            # than one place and doesn't change unless the dataset
            # changes (and thus the class is initialized again)
            #
            # Build into locals and publish only after the whole dataset
            # has been processed: if reading or predicting raises midway,
            # the cache stays untouched instead of keeping partial lists
            # of different lengths that a later call would return as if
            # they were complete.
            gold, test = [], []
            for sentiment, tweet in DataReader(self.datasets).get_tweets():
                prediction = self.model.predict(tweet)
                gold.append(sentiment)
                test.append(prediction)
            self.gold, self.test = gold, test

        return (self.gold, self.test)
Пример #5
0
    def get_gold_comparison(self):
        """
        Return the gold-standard labels and the model's predicted labels.

        Two parallel lists are built from the tweets read out of
        ``self.datasets``: the first holds the gold sentiment of each item
        and the second holds ``self.model.predict(tweet)`` for the same
        item, so position i in both lists refers to the same tweet.

        The result is cached in ``self.gold`` / ``self.test`` and returned
        as-is on later calls for as long as either list is non-empty.

        Returns:
            A ``(gold, test)`` tuple of two lists of equal length.
        """
        if not self.gold and not self.test:
            # cache gold/test generation as it may be called from more
            # than one place and doesn't change unless the dataset
            # changes (and thus the class is initialized again)
            #
            # Build into locals and publish only after the whole dataset
            # has been processed: if reading or predicting raises midway,
            # the cache stays untouched instead of keeping partial lists
            # of different lengths that a later call would return as if
            # they were complete.
            gold, test = [], []
            for sentiment, tweet in DataReader(self.datasets).get_tweets():
                prediction = self.model.predict(tweet)
                gold.append(sentiment)
                test.append(prediction)
            self.gold, self.test = gold, test

        return (self.gold, self.test)