def confusion_worker(queue, animation=False):
    """Matplotlib worker."""
    config = get_config()
    titles = json.loads(config.get('sentiment', 'titles'))

    def init():
        # Reset every confusion image to an all-zero 3x3 grid before the
        # animation starts.
        clear_annotations(annotations)
        blank = [[0] * 3] * 3
        for ax, image, title in zip(axes, images, titles):
            image.set_data(blank)
        return images

    fig, axes, images, annotations = make_confusion()
    # Important to assign it to a variable, even if we don't use it.
    anim = FuncAnimation(fig=fig,
                         func=update_confusion,
                         frames=lambda: get_data(queue),
                         fargs=(images, axes, annotations),
                         interval=200,
                         repeat=False,
                         init_func=init,
                         blit=False)
    if animation:
        anim.save('confusion.mp4', fps=10, extra_args=['-vcodec', 'libx264'])
    else:
        plt.show()
def parse_artfestival():
    """Yield labelled DataInstances from the art-festival corpus.

    Lines beginning with ``#`` are treated as comments and skipped; the
    remaining lines are tab-separated with (author, tweet, sentiment) in
    the first three columns.
    """
    config = get_config()
    with open(config.get('art_festival', 'corpus')) as f:
        for row in f:
            if row.startswith('#'):
                continue
            fields = row.rstrip().split('\t')
            author, tweet, sentiment = [field.strip() for field in fields[:3]]
            yield Experiment.DataInstance(tweet, sentiment)
def update_axes(count, axes):
    """Rescale axes to fit the new data."""
    config = get_config()
    chunk_size = config.getint('sentiment', 'chunk_size')

    def round_nearest(value, base):
        # Snap ``value`` to the closest multiple of ``base``.
        return int(base * round(float(value) / base))

    # The bound is identical for every axis, so compute it once.
    upper = round_nearest(count + chunk_size, chunk_size)
    for ax in axes:
        ax.set_xbound(0, upper)
def update_images(data, images, axes, annotations):
    """Refresh each confusion-matrix image from ``data`` and re-annotate.

    Returns the new annotation list; the caller must keep it so the old
    annotations can be cleared on the next update.
    """
    # Remove the annotations that were drawn on the previous frame.
    clear_annotations(annotations)
    # Rebind to a fresh list; the cleared list itself is left alone and the
    # new frame's annotations are collected here.
    annotations = []
    config = get_config()
    titles = json.loads(config.get('sentiment', 'titles'))
    for ax, image, title in zip(axes, images, titles):
        confusion_matrix = data[title]['confusion']._confusion
        norm_conf = normalized_confusion_matrix(confusion_matrix)
        image.set_data(norm_conf)
        # Pin the colour scale to [0, 1] so successive frames are comparable.
        image.norm.vmin, image.norm.vmax = 0.0, 1.0
        # NOTE(review): the list being built is also passed INTO
        # annotate_confusion_matrix while its return value is appended —
        # presumably the helper registers per-cell artists on it; confirm
        # this is not double bookkeeping.
        annotations.append(annotate_confusion_matrix(ax, confusion_matrix,
                                                     annotations))
    return annotations
def _train_data(self):
    """Replay the parent's training stream, then self-label emoticon tweets.

    After the supervised data is exhausted, tweets from the emoticon corpus
    are classified by the current model and yielded back as new training
    instances when the model is confident (> 0.8) and non-neutral.
    """
    for instance in super(TrainSemEvalSelfLearning, self)._train_data():
        yield instance

    config = get_config()
    corpus = config.get('twitter_corpus', 'emoticons')
    for tweet in tweet_generator(corpus):
        features = self.extractor.extract(tweet.text)
        label, probability = self._predict(features)
        # Only confident, polar predictions become new training data.
        if label != 'neutral' and probability > 0.8:
            yield self.DataInstance(tweet.text, label)
def setup_axes(axes, empty_confusion):
    """Setup look and feel of the axes."""
    config = get_config()
    titles = json.loads(config.get('sentiment', 'titles'))
    # Tick counts follow the confusion matrix dimensions.
    width = len(empty_confusion)
    height = len(empty_confusion[0])
    tick_names = ['Neg', 'Neu', 'Pos']
    for ax, title in zip(axes, titles):
        ax.set_title(title)
        ax.set_aspect(1)
        ax.set_xticks(range(width))
        ax.set_xticklabels(tick_names[:width])
        ax.set_yticks(range(height))
        ax.set_yticklabels(tick_names[:height])
def _test_data(self):
    """Yield labelled DataInstances parsed from the Sanders CSV corpus."""
    config = get_config()
    with codecs.open(config.get('sanders', 'corpus'), encoding='utf-8') as f:
        for line in f:
            line = line.split(',', 5)
            # NOTE(review): maxsplit=5 can produce up to 6 parts, yet only
            # rows with exactly 5 survive this check — rows whose text field
            # contains a comma may be silently dropped.  Confirm against the
            # corpus layout.
            if len(line) != 5:
                continue
            # Field 1 is the quoted sentiment label; strip the quotes.
            sentiment = line[1][1:-1].encode('utf-8')
            if sentiment not in ['positive', 'negative', 'neutral']:
                continue
            text = line[4].encode('utf-8')
            # Strip surrounding '"'
            text = text[1:-1]
            yield self.DataInstance(text, sentiment)
def _train_data(self):
    """Yield training DataInstances parsed from the Stanford corpus.

    The corpus encodes polarity as 0/2/4; unknown codes are passed through
    unchanged, exactly as in the if/elif chain this mapping replaces.
    """
    label_names = {0: 'negative', 2: 'neutral', 4: 'positive'}
    config = get_config()
    with open(config.get('stanford', 'corpus')) as f:
        for raw in f:
            fields = raw.split(',', 5)
            if len(fields) != 6:
                continue
            # Field 0 is the quoted numeric polarity; strip the quotes.
            polarity = int(fields[0][1:-1])
            sentiment = label_names.get(polarity, polarity)
            text = fields[5]
            # Strip surrounding '"'
            text = text[1:-1]
            yield self.DataInstance(text, sentiment)
def _train_data(self):
    """Yield the parent's stream, then boost on misclassified instances.

    Once the normal SemEval training data is exhausted, the training corpus
    is re-read repeatedly and only the tweets the current model gets wrong
    are re-yielded, until a full pass adds fewer than 10 new instances.
    """
    # ``count`` numbers every instance yielded so far (1-based).
    for count, x in enumerate(super(TrainSemEvalBoosting, self)
                              ._train_data(), start=1):
        yield x
    config = get_config()
    # Shrink the chunk size for the boosting passes.  NOTE(review): this is
    # Python 2 integer division when chunk_size is an int — confirm intended.
    self.chunk_size /= 5
    while 1:
        with open(config.get('semeval', 'training')) as f:
            old_count = count
            for tweet in task_b_generator(f):
                document = tweet.text
                features = self.extractor.extract(document)
                # Re-yield only the instances the model misclassifies.
                if tweet.label != self._predict(features)[0]:
                    yield self.DataInstance(tweet.text, tweet.label)
                    count += 1
        # Stop once a full pass produced fewer than 10 new instances.
        if count < (old_count + 10):
            break
def make_lines(axes):
    """Create the lines for each axes."""
    config = get_config()
    labels = json.loads(config.get('sentiment', 'labels'))
    markers = itertools.cycle('o^vds')
    lines = []
    for ax in axes:
        per_axis = []
        for label in labels:
            xdata, ydata = [0], [0]
            line, = ax.plot(xdata, ydata, label=label)  # comma for unpacking.
            line.set_marker(next(markers))
            line.set_alpha(0.75)
            if label == 'Accuracy':
                line.set_color('black')
                line.set_zorder(0)  # Drawn first, so is underneath.
            per_axis.append((line, xdata, ydata))
        lines.append(per_axis)
    return lines
def main(): """Starts the web server as a user interface to the system.""" config = get_config() setup_logging(config) logger = logging.getLogger('ui.web') git_version, git_commit = get_git_version() if git_version: logger.info('Version: {0} ({1})'.format(git_version, git_commit)) else: logger.warning('Could not detect current Git commit.') twitter = Twitter(config=config) logger.info('Starting web server on port {}'.format(config.getint('web', 'port'))) start_server(config=config, twitter=twitter, git_version=(git_version, git_commit))
def setup_axes(axes): """Setup look and feel of the axes.""" config = get_config() titles = json.loads(config.get('sentiment', 'titles')) labels = json.loads(config.get('sentiment', 'labels')) for ax, title in zip(axes, titles): ax.set_title(title) ax.set_xlabel('training instances') ax.set_ylabel('performance') ax.set_xbound(0, 100) # bound will change as needed. ax.set_ylim(0, 1) # limit won't change automatically. # ax.xaxis.set_major_locator(MaxNLocator(10)) # ax.xaxis.set_minor_locator(AutoMinorLocator()) ax.yaxis.set_major_locator(MaxNLocator(10)) ax.yaxis.set_minor_locator(AutoMinorLocator(2)) ax.grid(True) # Use current line labels to build legend. ax.legend(loc='upper center', ncol=len(labels))
def write_semeval_predictions(experiment, final=False):
    """Write the SemEval task2-B prediction files (twitter and SMS).

    For each test set, runs ``experiment``'s extractor and classifier over
    every instance and writes one tab-separated line per instance with the
    predicted label.  When ``final`` is true the output file names get a
    ``.final`` suffix.
    """
    config = get_config()
    suffix = '.final' if final else ''
    # task2-B-twitter
    _write_predictions(experiment,
                       config.get('semeval', 'twitter_test'),
                       config.get('semeval', 'twitter_predict') + suffix)
    # task2-B-SMS
    _write_predictions(experiment,
                       config.get('semeval', 'sms_test'),
                       config.get('semeval', 'sms_predict') + suffix)


def _write_predictions(experiment, test_fname, predict_fname):
    """Classify every instance in ``test_fname`` and write predictions.

    The gold label from the test file is replaced by the predicted one;
    the confidence returned by ``_predict`` is discarded.
    """
    with open(test_fname) as f, open(predict_fname, mode='w') as w:
        for instance in task_b_generator(f):
            sid, uid, label, text = instance
            features = experiment.extractor.extract(instance.text)
            label, probability = experiment._predict(features)
            w.write('\t'.join([sid, uid, label, text]) + '\n')
def main():
    """Run both experiment pipelines and stream results to the plot workers.

    Spawns the plotting processes, iterates the experiment generator, and
    pushes each chunk's parsed performance data onto both queues.  A
    ``None`` sentinel is always sent on shutdown so the workers exit.
    """
    plot_queue = multiprocessing.Queue()
    confusion_queue = multiprocessing.Queue()
    start_plot(plot_queue, confusion_queue)

    # Each approach is a (classifier, training mixin, testing mixin) tuple;
    # the two runs differ only in the classifier architecture.
    first = (SingleClassifier, semeval.TrainSemEvalSelfLearning,
             semeval.TestSemEval)
    second = (HierarchicalClassifier, semeval.TrainSemEvalSelfLearning,
              semeval.TestSemEval)

    extractor = FeatureExtractor(tokenizer=tokenizer)
    # Use unigrams and bigrams.
    extractor.min_n, extractor.max_n = 1, 2

    config = get_config()
    chunk_size = config.getint("sentiment", "chunk_size")
    first_chunk = config.getint("sentiment", "first_chunk")
    titles = json.loads(config.get("sentiment", "titles"))

    experiment = run_experiment(first, second, extractor, chunk_size,
                                first_chunk)
    try:
        for data in experiment:
            data[titles[0]] = parse_performance(data[titles[0]])
            data[titles[1]] = parse_performance(data[titles[1]])
            plot_queue.put(data)
            confusion_queue.put(data)
            print data[titles[0]]["count"], data[titles[0]]["SemEval"], data[titles[1]]["SemEval"], data[titles[0]]["vocab"], data[titles[1]]["vocab"]
    except KeyboardInterrupt:
        # Ctrl-C stops the experiment but still falls through to cleanup.
        pass
    finally:
        # ``None`` tells each plot worker to finish; close the queues after.
        plot_queue.put(None)
        plot_queue.close()
        confusion_queue.put(None)
        confusion_queue.close()
    print "Done processing."
def main():
    """Serve the pickled sentiment classifier over an RPyC service.

    Loads the (extractor, subjectivity, polarity) triple from disk,
    re-trains it on the feedback files collected by the web UI, and then
    starts a blocking RPC server.
    """
    config = get_config()
    path = config.get('sentiment', 'path')
    sentiment_fname = config.get('sentiment', 'classifier')
    sentiment_classifier_location = os.path.join(path, sentiment_fname)

    print 'Loading classifier (may take a few minutes) ... ',
    # NOTE(review): presumably keeps "unused" imports alive because the
    # pickle needs them at load time — confirm.
    assert namedtuple and tokenizer
    with open(sentiment_classifier_location, mode='rb') as f:
        sentiment_classifier = pickle.load(f)
    print 'DONE'

    # The pickle holds (feature extractor, subjectivity stage, polarity stage).
    extractor = sentiment_classifier[0]
    subjective = sentiment_classifier[1]
    polarity = sentiment_classifier[2]

    def train(*documents):
        # Polar documents train the polarity stage and are then relabelled
        # 'subjective'; every document trains the subjectivity stage.
        for document, label in documents:
            if label != 'neutral':
                assert label in set(['positive', 'negative'])
                polarity.train((document, label))
                label = 'subjective'
            assert label in set(['neutral', 'subjective'])
            subjective.train((document, label))

    print 'Training (pickled) classifier using misclassified instances...',
    with open(config.get('web', 'misclassified_file')) as f:
        for line in f:
            date, user, flag, mislabel, text = line.rstrip().split('\t')
            features = extractor.extract(text)
            train((features, flag))
    print 'DONE'

    print 'Training (pickled) classifier using active learning instances...',
    with open(config.get('web', 'active_file')) as f:
        for line in f:
            date, user, flag, original, text = line.rstrip().split('\t')
            features = extractor.extract(text)
            train((features, flag))
    print 'DONE'

    class SentimentService(rpyc.Service):
        # Thin RPC wrappers around the loaded classifier objects above.
        def exposed_extract(self, document):
            return extractor.extract(document)

        def exposed_subjective_classify(self, features):
            return subjective.classify(features)

        def exposed_subjective_conditional(self, feature, label):
            return subjective.conditional(feature, label)

        def exposed_polarity_classify(self, features):
            return polarity.classify(features)

        def exposed_polarity_conditional(self, feature, label):
            return polarity.conditional(feature, label)

        def exposed_train(self, *documents):
            train(*documents)

    rpc_port = int(config.get('sentiment', 'rpc_port'))
    # Blocks forever serving RPC requests.
    t = ThreadedServer(SentimentService, port=rpc_port)
    t.start()
def _test_data(self):
    """Yield SemEval task-B instances from the development set."""
    config = get_config()
    fname = config.get('semeval', 'development')
    with open(fname) as f:
        for instance in task_b_generator(f):
            yield instance
def _train_data(self):
    """Yield SemEval task-B instances from the training set."""
    config = get_config()
    fname = config.get('semeval', 'training')
    with open(fname) as f:
        for instance in task_b_generator(f):
            yield instance
def test_get_config_create():
    """get_config(create=True) writes a file a later call can read back."""
    get_config(fname=TEST_FNAME, create=True, exit=False)
    reread = get_config(fname=TEST_FNAME, create=False, exit=False)
    assert reread
    # Clean up the fixture file.
    os.remove(TEST_FNAME)
def test_get_config_none():
    """Without a config file on disk, get_config(create=False) returns None."""
    # Guard: the fixture file must not already exist.
    assert_raises(OSError, os.remove, TEST_FNAME)
    assert get_config(fname=TEST_FNAME, create=False, exit=False) is None
def run_experiment(first, second, extractor, chunk_size, first_chunk=0):
    """Drive two classifier approaches in lockstep and yield their metrics.

    ``first`` and ``second`` are tuples of mixin classes composed into one
    approach class each.  Yields one ``data`` dict per chunk, keyed by the
    two configured titles, and tracks the best SemEval f-measure seen.
    On exit (including Ctrl-C) the last performances are printed.
    """
    config = get_config()
    titles = json.loads(config.get('sentiment', 'titles'))
    test_scale = config.getint('sentiment', 'test_scale')

    # Build each approach class dynamically from its mixin tuple.
    Approach = type('_'.join(x.__name__ for x in first), first, {})
    singular_classifier = Approach(extractor, chunk_size, first_chunk,
                                   test_scale=test_scale, evaluator=evaluate)
    print repr(singular_classifier)

    Approach = type('_'.join(x.__name__ for x in second), second, {})
    hierarchical_classifier = Approach(extractor, chunk_size, first_chunk,
                                       test_scale=test_scale, evaluator=evaluate)
    print repr(hierarchical_classifier)
    # hierarchical_classifier = OldClassifier(extractor)

    # (best f-measure, classifier, its performance dict).
    best_performance = None, None, None
    p1, p2 = [], []  # Declare in case the try block raises an exception.
    try:
        # Advance both classifiers one chunk at a time, in lockstep.
        for single, hierarchy in itertools.izip(singular_classifier,
                                                hierarchical_classifier):
            new_best = None
            c1, p1 = single
            c2, p2 = hierarchy
            data = dict()
            data[titles[0]] = p1
            data[titles[0]]['count'] = c1
            # data[titles[0]]['vocab'] = singular_classifier.nb._vocab_size#, len(singular_classifier.nb._most_common['positive'].store)
            data[titles[0]]['vocab'] = singular_classifier.nb._vocab_size  # , len(singular_classifier.polarity._most_common['positive'].store)
            data[titles[1]] = p2
            data[titles[1]]['count'] = c2
            data[titles[1]]['vocab'] = hierarchical_classifier.polarity._vocab_size  # , len(hierarchical_classifier.polarity._most_common['positive'].store)
            # Track the best f-measure over both approaches.  NOTE(review):
            # the first comparison is against None, which Python 2 orders
            # below every number — works, but only on Python 2.
            if data[titles[0]]['semeval f_measure'] > best_performance[0]:
                new_best = data[titles[0]]['semeval f_measure'], singular_classifier, data[titles[0]]
                best_performance = new_best
            if data[titles[1]]['semeval f_measure'] > best_performance[0]:
                new_best = data[titles[1]]['semeval f_measure'], hierarchical_classifier, data[titles[1]]
                best_performance = new_best
            # if new_best:
            #     print 'New Best!', new_best[1].__class__.__name__
            #     pprint(new_best[2])
            #     with open(r"D:\semeval-best.pickle", mode='wb') as f:
            #         f.write(new_best[1].pickle_dumps())
            #     write_semeval_predictions(new_best[1])
            yield data
    except KeyboardInterrupt:
        raise
    finally:
        # Always report the final chunk's performance, even on interrupt.
        print 'Final performance:'
        try:
            for label, performance in zip(titles, (p1, p2)):
                confusion_matrix = performance['confusionmatrix'].pp()
                # del performance['confusion']
                print label
                pprint(performance)
                print confusion_matrix
        except:
            # NOTE(review): bare except hides real errors; it guards the
            # case where no chunk completed and p1/p2 are still empty.
            print 'ERROR: Unavailable.'