def test_get_data_constructor(self):
    """get_data should call the constructor callable it is handed.

    A MagicMock stands in for the constructor so its invocations can be
    counted. The expected count of 3 presumably equals the number of
    phrases loaded by the test fixture -- TODO confirm against setUp.
    """
    source = data_sources.RedisDataSource(
        self.db, 'stanford-corpus', ['positive', 'negative'])
    constructor = mock.MagicMock()
    # Only the calls recorded on the mock matter; the return value is unused.
    source.get_data(constructor)
    self.assertEqual(constructor.call_count, 3)
def main(collection, destination, nfeats, nbigrams, classifier_type):
    """Train a classifier on a Redis-backed corpus and serialize it to disk.

    Args:
        collection: Corpus name passed to ``RedisDataSource``; also stored in
            the classifier metadata under ``train_corpus``.
        destination: Directory the classifier is serialized into. When falsy,
            a path is derived via ``generate_path_for_classifier``.
        nfeats: Feature count forwarded to ``train_classifier``.
        nbigrams: Bigram count forwarded to ``train_classifier``.
        classifier_type: Classifier kind; stored in the metadata under
            ``classifier_type`` and used when deriving the default path.

    Raises:
        OSError: If the destination directory cannot be created for a reason
            other than it already existing.
    """
    # Local import keeps this block self-contained; errno is only needed for
    # the directory creation at the end.
    import errno

    LOGGER.info("Started classifier")
    if not destination:
        destination = generate_path_for_classifier(
            collection, nfeats, nbigrams, classifier_type)
    # Pass values as logger arguments so formatting is deferred to the
    # logging framework instead of being done eagerly with ``%``.
    LOGGER.info("Classifier will be saved in: %s", destination)
    LOGGER.info("Training a %s classifier with %s feats and %s bigrams",
                classifier_type, nfeats, nbigrams)

    # Get training data using data source
    LOGGER.info("Building datasource")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    ds = data_sources.RedisDataSource(
        redis.Redis(), collection, ['positive', 'negative'])
    phrases = ds.get_data(make_phrase)

    # Initialize the Text Processor, get bigrams and informative features
    LOGGER.info("Building text processor")
    processor = phrase.TextProcessor(phrases, FORMATTER)

    # Train the classifier using the Text Processor
    meta = {'train_corpus': collection, 'classifier_type': classifier_type}
    LOGGER.info("Training Classifier")
    classifier = processor.train_classifier(FORMATTER, nbigrams, nfeats, meta)

    # Serialize the classifier
    LOGGER.info("Serializing classifier")
    # Create the directory unconditionally and tolerate "already exists":
    # checking os.path.exists() first leaves a window in which another
    # process can create the path and make makedirs() fail.
    try:
        os.makedirs(destination)
    except OSError as err:
        if err.errno != errno.EEXIST:
            raise
    classifier.serialize(destination)
def test_get_data(self):
    """get_data with no constructor yields the stored strings per class.

    Checks that the result has exactly two keys and that the first entry
    under each class matches the expected fixture text.
    """
    source = data_sources.RedisDataSource(
        self.db, 'stanford-corpus', ['positive', 'negative'])
    by_class = source.get_data()

    self.assertEqual(len(by_class.keys()), 2)

    # (class label, expected first phrase), checked in this order.
    expected_first = (
        ('positive', 'I am positive'),
        ('negative', 'I am negative'),
    )
    for label, text in expected_first:
        self.assertEqual(by_class[label][0], text)
def main(path, against, nodb):
    """Measure a saved classifier's accuracy on a corpus and record it.

    Args:
        path: Location handed to ``phrase.TrainedClassifier.load``.
        against: Name of the test corpus read through ``RedisDataSource``;
            stored in the statistics record under ``test_corpus``.
        nodb: When truthy, the MongoDB statistics update is skipped.
    """
    LOGGER.info("Started testing")

    LOGGER.info("Loading classifier")
    classifier = phrase.TrainedClassifier.load(path, FORMATTER)

    LOGGER.info("Loading testing data")
    phrase_factory = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    source = data_sources.RedisDataSource(
        redis.Redis(), against, ['positive', 'negative'])
    data = source.get_data(phrase_factory)

    LOGGER.info("Making testing data")
    # Flatten {sentiment: [phrase, ...]} into (phrase, sentiment) pairs.
    # NOTE: ``iteritems`` exists only on Python 2 dicts.
    test_data = [
        (item, label)
        for label, items in data.iteritems()
        for item in items
    ]

    # Best-effort report: an AttributeError (e.g. from a classifier that
    # lacks this method) is deliberately ignored.
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        pass

    accuracy = nltk.classify.util.accuracy(classifier, test_data)
    LOGGER.info("Accuracy is: %s" % accuracy)

    if nodb:
        return

    statistics = pymongo.Connection()['worldmood']['statistics']
    # No copy is taken here, so the keys added below are written onto
    # whatever object ``classifier.meta`` returns.
    record = classifier.meta
    record['accuracy'] = accuracy
    record['test_corpus'] = against
    # Upsert keyed on the classifier uid.
    statistics.update({'uid': classifier.get_uid()}, record, upsert=True)
    LOGGER.info("Updated collection database: %s" % record)
def test_it_initializes(self):
    """A newly built data source reports two classes via get_classes()."""
    classes = ['positive', 'negative']
    source = data_sources.RedisDataSource(self.db, 'stanford-corpus', classes)
    self.assertEqual(len(source.get_classes()), 2)