Example #1
File: plot.py Project: bwbaugh/infertweet
def confusion_worker(queue, animation=False):
    """Matplotlib worker."""

    config = get_config()
    titles = json.loads(config.get('sentiment', 'titles'))

    def init():
        clear_annotations(annotations)
        for ax, image, title in zip(axes, images, titles):
            # Rows alias one list; harmless here since the data is only read.
            empty_confusion = [[0] * 3] * 3
            image.set_data(empty_confusion)
            # annotate_confusion_matrix(ax, empty_confusion, annotations)
        return images

    fig, axes, images, annotations = make_confusion()
    # Important to keep a reference, even if unused: otherwise the
    # FuncAnimation can be garbage-collected and the animation never runs.
    anim = FuncAnimation(fig=fig,
                         func=update_confusion,
                         frames=lambda: get_data(queue),
                         fargs=(images, axes, annotations),
                         interval=200,
                         repeat=False,
                         init_func=init,
                         blit=False)
    if animation:
        anim.save('confusion.mp4', fps=10, extra_args=['-vcodec', 'libx264'])
    else:
        plt.show()
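
For context, get_data is not among these examples; since example #14 terminates the queues with a None sentinel, it is presumably a generator that drains the queue until it sees that sentinel. A minimal sketch under that assumption (not the project's actual code):

def get_data(queue):
    """Hypothetical frame source for FuncAnimation: yield payloads from
    the queue until the producer sends the None sentinel (example #14)."""
    while True:
        data = queue.get()
        if data is None:
            return
        yield data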
Example #2
def parse_artfestival():
    config = get_config()
    with open(config.get('art_festival', 'corpus')) as f:
        for line in f:
            if line.startswith('#'):
                continue
            data = line.rstrip().split('\t')
            author, tweet, sentiment = [x.strip() for x in data[:3]]
            yield Experiment.DataInstance(tweet, sentiment)
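
Experiment.DataInstance appears throughout these examples as a simple (text, label) record. A minimal stand-in consistent with that usage (the real class in infertweet may differ) is:

from collections import namedtuple

# Hypothetical equivalent of the DataInstance yielded by the *_data()
# generators in these examples: a lightweight (text, label) pair.
DataInstance = namedtuple('DataInstance', ['text', 'label'])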
Example #3
File: plot.py Project: bwbaugh/infertweet
    def update_axes(count, axes):
        """Rescale axes to fit the new data."""
        config = get_config()
        chunk_size = config.getint('sentiment', 'chunk_size')

        def round_nearest(x, base):
            return int(base * round(float(x) / base))

        for ax in axes:
            ax.set_xbound(0, round_nearest(count + chunk_size, chunk_size))
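
round_nearest snaps a value to the closest multiple of base, so the x-bound grows in chunk-sized steps instead of on every update. For example, with a chunk size of 500:

>>> round_nearest(1234, 500)  # 1234 / 500 = 2.468, round() gives 2, so 1000
1000
>>> round_nearest(1300, 500)  # 2.6 rounds to 3, so 1500
1500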
Example #4
File: plot.py Project: bwbaugh/infertweet
def update_images(data, images, axes, annotations):
    clear_annotations(annotations)
    annotations = []
    config = get_config()
    titles = json.loads(config.get('sentiment', 'titles'))
    for ax, image, title in zip(axes, images, titles):
        confusion_matrix = data[title]['confusion']._confusion
        norm_conf = normalized_confusion_matrix(confusion_matrix)
        image.set_data(norm_conf)
        image.norm.vmin, image.norm.vmax = 0.0, 1.0
        annotations.append(annotate_confusion_matrix(ax, confusion_matrix,
                                                     annotations))
    return annotations
Example #5
    def _train_data(self):
        for x in super(TrainSemEvalSelfLearning, self)._train_data():
            yield x

        config = get_config()
        for tweet in tweet_generator(config.get('twitter_corpus', 'emoticons')):
            features = self.extractor.extract(tweet.text)
            label, probability = self._predict(features)

            if label == 'neutral':
                continue

            if probability > 0.8:
                yield self.DataInstance(tweet.text, label)
Example #6
File: plot.py Project: bwbaugh/infertweet
def setup_axes(axes, empty_confusion):
    """Set up the look and feel of the axes."""
    config = get_config()
    titles = json.loads(config.get('sentiment', 'titles'))
    width = len(empty_confusion)
    height = len(empty_confusion[0])
    for ax, title in zip(axes, titles):
        ax.set_title(title)
        ax.set_aspect(1)
        alphabet = ['Neg', 'Neu', 'Pos']
        ax.set_xticks(range(width))
        ax.set_xticklabels(alphabet[:width])
        ax.set_yticks(range(height))
        ax.set_yticklabels(alphabet[:height])
Example #7
def _test_data(self):
    config = get_config()
    with codecs.open(config.get('sanders', 'corpus'),
                     encoding='utf-8') as f:
        for line in f:
            line = line.split(',', 5)
            if len(line) != 5:
                continue
            sentiment = line[1][1:-1].encode('utf-8')
            if sentiment not in ['positive', 'negative', 'neutral']:
                continue
            text = line[4].encode('utf-8')
            # Strip surrounding '"'
            text = text[1:-1]
            yield self.DataInstance(text, sentiment)
Example #8
def _train_data(self):
    config = get_config()
    with open(config.get('stanford', 'corpus')) as f:
        for line in f:
            line = line.split(',', 5)
            if len(line) != 6:
                continue
            sentiment = int(line[0][1:-1])
            if sentiment == 0:
                sentiment = 'negative'
            elif sentiment == 2:
                sentiment = 'neutral'
            elif sentiment == 4:
                sentiment = 'positive'
            text = line[5]
            # Strip surrounding '"'
            text = text[1:-1]
            yield self.DataInstance(text, sentiment)
Example #9
    def _train_data(self):
        for count, x in enumerate(super(TrainSemEvalBoosting, self)
                                  ._train_data(), start=1):
            yield x

        config = get_config()

        # Boosting pass: shrink the chunk size, then repeatedly re-feed the
        # misclassified training tweets until one full pass over the corpus
        # adds fewer than 10 new instances.
        self.chunk_size /= 5
        while True:
            with open(config.get('semeval', 'training')) as f:
                old_count = count
                for tweet in task_b_generator(f):
                    document = tweet.text
                    features = self.extractor.extract(document)
                    if tweet.label != self._predict(features)[0]:
                        yield self.DataInstance(tweet.text, tweet.label)
                        count += 1
                if count < (old_count + 10):
                    break
Example #10
File: plot.py Project: bwbaugh/infertweet
def make_lines(axes):
    """Create the lines for each axes."""
    config = get_config()
    labels = json.loads(config.get('sentiment', 'labels'))
    lines = []
    marker = itertools.cycle('o^vds')
    for ax in axes:
        ax_lines = []
        for label in labels:
            x, y = [0], [0]
            line, = ax.plot(x, y, label=label)  # comma for unpacking.
            line.set_marker(next(marker))
            line.set_alpha(0.75)
            if label == 'Accuracy':
                line.set_color('black')
                line.set_zorder(0)  # Drawn first, so is underneath.
            ax_lines.append((line, x, y))
        lines.append(ax_lines)
    return lines
Example #11
File: main.py Project: bwbaugh/infertweet
def main():
    """Starts the web server as a user interface to the system."""
    config = get_config()

    setup_logging(config)

    logger = logging.getLogger('ui.web')

    git_version, git_commit = get_git_version()
    if git_version:
        logger.info('Version: {0} ({1})'.format(git_version, git_commit))
    else:
        logger.warning('Could not detect current Git commit.')

    twitter = Twitter(config=config)

    logger.info('Starting web server on port {}'.format(config.getint('web',
                                                                      'port')))
    start_server(config=config, twitter=twitter,
                 git_version=(git_version, git_commit))
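
get_git_version is referenced here but not shown in these examples; a sketch of what such a helper usually looks like (an assumption, the project's implementation may differ):

import subprocess

def get_git_version():
    """Hypothetical helper: return (description, commit hash), or
    (None, None) when not running inside a Git checkout."""
    try:
        version = subprocess.check_output(
            ['git', 'describe', '--always']).strip()
        commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()
        return version, commit
    except (OSError, subprocess.CalledProcessError):
        return None, None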
Example #12
File: plot.py Project: bwbaugh/infertweet
    def setup_axes(axes):
        """Set up the look and feel of the axes."""
        config = get_config()
        titles = json.loads(config.get('sentiment', 'titles'))
        labels = json.loads(config.get('sentiment', 'labels'))
        for ax, title in zip(axes, titles):
            ax.set_title(title)
            ax.set_xlabel('training instances')
            ax.set_ylabel('performance')

            ax.set_xbound(0, 100)  # bound will change as needed.
            ax.set_ylim(0, 1)  # limit won't change automatically.

            # ax.xaxis.set_major_locator(MaxNLocator(10))
            # ax.xaxis.set_minor_locator(AutoMinorLocator())
            ax.yaxis.set_major_locator(MaxNLocator(10))
            ax.yaxis.set_minor_locator(AutoMinorLocator(2))

            ax.grid(True)

            # Use current line labels to build legend.
            ax.legend(loc='upper center', ncol=len(labels))
Example #13
def write_semeval_predictions(experiment, final=False):
    config = get_config()
    twitter_test = config.get('semeval', 'twitter_test')
    twitter_predict = config.get('semeval', 'twitter_predict')
    sms_test = config.get('semeval', 'sms_test')
    sms_predict = config.get('semeval', 'sms_predict')

    # task2-B-twitter
    with open(twitter_test) as f, \
            open(twitter_predict + ('.final' if final else ''), mode='w') as w:
        for instance in task_b_generator(f):
            sid, uid, label, text = instance
            features = experiment.extractor.extract(instance.text)
            # Replace the gold label with the prediction before writing.
            label, probability = experiment._predict(features)
            w.write('\t'.join([sid, uid, label, text]) + '\n')

    # task2-B-SMS
    with open(sms_test) as f, \
            open(sms_predict + ('.final' if final else ''), mode='w') as w:
        for instance in task_b_generator(f):
            sid, uid, label, text = instance
            features = experiment.extractor.extract(instance.text)
            label, probability = experiment._predict(features)
            w.write('\t'.join([sid, uid, label, text]) + '\n')
Example #14
File: train.py Project: bwbaugh/infertweet
def main():
    plot_queue = multiprocessing.Queue()
    confusion_queue = multiprocessing.Queue()
    start_plot(plot_queue, confusion_queue)

    first = (SingleClassifier, semeval.TrainSemEvalSelfLearning, semeval.TestSemEval)
    second = (HierarchicalClassifier, semeval.TrainSemEvalSelfLearning, semeval.TestSemEval)

    extractor = FeatureExtractor(tokenizer=tokenizer)
    extractor.min_n, extractor.max_n = 1, 2

    config = get_config()
    chunk_size = config.getint("sentiment", "chunk_size")
    first_chunk = config.getint("sentiment", "first_chunk")
    titles = json.loads(config.get("sentiment", "titles"))

    experiment = run_experiment(first, second, extractor, chunk_size, first_chunk)

    try:
        for data in experiment:
            data[titles[0]] = parse_performance(data[titles[0]])
            data[titles[1]] = parse_performance(data[titles[1]])
            plot_queue.put(data)
            confusion_queue.put(data)
            print data[titles[0]]["count"], data[titles[0]]["SemEval"], \
                data[titles[1]]["SemEval"], data[titles[0]]["vocab"], \
                data[titles[1]]["vocab"]
    except KeyboardInterrupt:
        pass
    finally:
        plot_queue.put(None)
        plot_queue.close()
        confusion_queue.put(None)
        confusion_queue.close()

    print "Done processing."
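
start_plot is referenced but not shown. Since confusion_worker (example #1) consumes a queue, a plausible sketch, assuming a companion plot_worker with the same queue interface, is:

import multiprocessing

def start_plot(plot_queue, confusion_queue):
    """Hypothetical launcher: run each matplotlib worker in its own process
    so the plot event loops don't block the training loop."""
    for worker, queue in ((plot_worker, plot_queue),
                          (confusion_worker, confusion_queue)):
        process = multiprocessing.Process(target=worker, args=(queue,))
        process.daemon = True  # don't outlive the main process
        process.start()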
Example #15
def main():
    config = get_config()
    path = config.get('sentiment', 'path')
    sentiment_fname = config.get('sentiment', 'classifier')
    sentiment_classifier_location = os.path.join(path, sentiment_fname)

    print 'Loading classifier (may take a few minutes) ... ',
    # Presumably keeps the imports from looking unused; unpickling needs them.
    assert namedtuple and tokenizer
    with open(sentiment_classifier_location, mode='rb') as f:
        sentiment_classifier = pickle.load(f)
    print 'DONE'

    extractor = sentiment_classifier[0]
    subjective = sentiment_classifier[1]
    polarity = sentiment_classifier[2]

    def train(*documents):
        for document, label in documents:
            if label != 'neutral':
                assert label in set(['positive', 'negative'])
                polarity.train((document, label))
                label = 'subjective'
            assert label in set(['neutral', 'subjective'])
            subjective.train((document, label))

    print 'Training (pickled) classifier using misclassified instances...',
    with open(config.get('web', 'misclassified_file')) as f:
        for line in f:
            date, user, flag, mislabel, text = line.rstrip().split('\t')
            features = extractor.extract(text)
            train((features, flag))
    print 'DONE'

    print 'Training (pickled) classifier using active learning instances...',
    with open(config.get('web', 'active_file')) as f:
        for line in f:
            date, user, flag, original, text = line.rstrip().split('\t')
            features = extractor.extract(text)
            train((features, flag))
    print 'DONE'

    class SentimentService(rpyc.Service):
        def exposed_extract(self, document):
            return extractor.extract(document)

        def exposed_subjective_classify(self, features):
            return subjective.classify(features)

        def exposed_subjective_conditional(self, feature, label):
            return subjective.conditional(feature, label)

        def exposed_polarity_classify(self, features):
            return polarity.classify(features)

        def exposed_polarity_conditional(self, feature, label):
            return polarity.conditional(feature, label)

        def exposed_train(self, *documents):
            train(*documents)

    rpc_port = int(config.get('sentiment', 'rpc_port'))
    t = ThreadedServer(SentimentService, port=rpc_port)
    t.start()
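
On the client side, the exposed_ methods above become attributes of the connection's root object, per the usual rpyc convention. A minimal usage sketch (the port value is illustrative; in practice it comes from the 'sentiment' config section):

import rpyc

connection = rpyc.connect('localhost', 18861)
features = connection.root.extract('I love this phone!')
print connection.root.subjective_classify(features)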
Example #16
def _test_data(self):
    config = get_config()
    with open(config.get('semeval', 'development')) as f:
        for instance in task_b_generator(f):
            yield instance
Example #17
def _train_data(self):
    config = get_config()
    with open(config.get('semeval', 'training')) as f:
        for instance in task_b_generator(f):
            yield instance
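
task_b_generator is used by several of these examples but never shown. Example #13 unpacks its instances as (sid, uid, label, text), so a plausible sketch, assuming tab-separated SemEval task B lines, is:

from collections import namedtuple

TaskBInstance = namedtuple('TaskBInstance', ['sid', 'uid', 'label', 'text'])

def task_b_generator(f):
    """Hypothetical reader for tab-separated SemEval task B files,
    one 'sid <tab> uid <tab> label <tab> text' record per line."""
    for line in f:
        sid, uid, label, text = line.rstrip('\n').split('\t', 3)
        yield TaskBInstance(sid, uid, label, text)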
Example #18
def test_get_config_create():
    get_config(fname=TEST_FNAME, create=True, exit=False)
    assert get_config(fname=TEST_FNAME, create=False, exit=False)
    os.remove(TEST_FNAME)
Example #19
def test_get_config_none():
    assert_raises(OSError, os.remove, TEST_FNAME)
    result = get_config(fname=TEST_FNAME, create=False, exit=False)
    assert result is None
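
Together, examples #18 and #19 pin down get_config's contract: create=True writes a default config file, while create=False returns the parsed config if the file exists and None otherwise (rather than exiting, when exit=False). An illustrative use of that contract (the filename is hypothetical):

config = get_config(fname='infertweet.ini', create=False, exit=False)
if config is None:
    # No config file yet: create one with defaults and load it.
    config = get_config(fname='infertweet.ini', create=True, exit=False)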
Example #20
def run_experiment(first, second, extractor, chunk_size, first_chunk=0):
    config = get_config()
    titles = json.loads(config.get('sentiment', 'titles'))
    test_scale = config.getint('sentiment', 'test_scale')

    Approach = type('_'.join(x.__name__ for x in first), first, {})
    singular_classifier = Approach(extractor, chunk_size, first_chunk,
                                   test_scale=test_scale,
                                   evaluator=evaluate)
    print repr(singular_classifier)

    Approach = type('_'.join(x.__name__ for x in second), second, {})
    hierarchical_classifier = Approach(extractor, chunk_size, first_chunk,
                                       test_scale=test_scale,
                                       evaluator=evaluate)
    print repr(hierarchical_classifier)
    # hierarchical_classifier = OldClassifier(extractor)

    best_performance = None, None, None
    p1, p2 = [], []  # Declare in case the try block raises an exception.
    try:
        for single, hierarchy in itertools.izip(singular_classifier,
                                                hierarchical_classifier):
            new_best = None
            c1, p1 = single
            c2, p2 = hierarchy
            data = dict()
            data[titles[0]] = p1
            data[titles[0]]['count'] = c1
            # data[titles[0]]['vocab'] = singular_classifier.nb._vocab_size#, len(singular_classifier.nb._most_common['positive'].store)
            data[titles[0]]['vocab'] = singular_classifier.nb._vocab_size  # , len(singular_classifier.polarity._most_common['positive'].store)
            data[titles[1]] = p2
            data[titles[1]]['count'] = c2
            data[titles[1]]['vocab'] = hierarchical_classifier.polarity._vocab_size  # , len(hierarchical_classifier.polarity._most_common['positive'].store)

            if data[titles[0]]['semeval f_measure'] > best_performance[0]:
                new_best = data[titles[0]]['semeval f_measure'], singular_classifier, data[titles[0]]
                best_performance = new_best
            if data[titles[1]]['semeval f_measure'] > best_performance[0]:
                new_best = data[titles[1]]['semeval f_measure'], hierarchical_classifier, data[titles[1]]
                best_performance = new_best
            # if new_best:
            #     print 'New Best! (see below):', new_best[1].__class__.__name__
            #     pprint(new_best[2])
            #     with open(r"D:\semeval-best.pickle", mode='wb') as f:
            #         f.write(new_best[1].pickle_dumps())
            #     write_semeval_predictions(new_best[1])

            yield data
    except KeyboardInterrupt:
        raise
    finally:
        print 'Final performance:'
        try:
            for label, performance in zip(titles, (p1, p2)):
                confusion_matrix = performance['confusionmatrix'].pp()
                # del performance['confusion']
                print label
                pprint(performance)
                print confusion_matrix
        except Exception:
            print 'ERROR: Unavailable.'