def _build_prob_dist(self, fd, cfd): LOGGER.info("Building frequency distribution") for word, sentiment in self.phrases_it.iterate_formatted_words( self.formatter): fd.inc(word) cfd[sentiment].inc(word) return fd, cfd
def train_classifier(self, formatter, n_bigrams, n_feats, meta=None):
    """Train and return a TrainedClassifier built from this processor's phrases.

    Builds the frequency distributions, selects the ``n_feats`` most
    informative features and the top ``n_bigrams`` bigrams, and records
    both counts in ``meta``.

    ``meta`` used to default to a shared mutable ``{}``; because this
    method writes into it, every call mutated the same dict.  A ``None``
    sentinel keeps the interface backward-compatible while giving each
    call its own fresh dict.
    """
    if meta is None:
        meta = {}
    freq_dist, cond_freq_dist = self._build_prob_dist(
        FreqDist(), ConditionalFreqDist())
    feats = self._get_most_informative_features(n_feats, freq_dist,
                                                cond_freq_dist)
    bigrams = self.get_bigram_analyzer(n_bigrams, freq_dist.iterkeys())
    meta['n_bigrams'] = n_bigrams
    meta['n_feats'] = n_feats
    LOGGER.info("Building TrainedClassifier")
    return TrainedClassifier(formatter, bigrams, feats, meta,
                             phrases_iterator=self.phrases_it)
def train_classifier(self, formatter, n_bigrams, n_feats, meta=None):
    """Train and return a TrainedClassifier from this processor's phrases.

    Selects the ``n_feats`` most informative features and the top
    ``n_bigrams`` bigrams from the corpus distributions and stores both
    counts in ``meta``.

    Fix: the previous ``meta={}`` default was a shared mutable dict that
    this method mutates, leaking state across calls; use a ``None``
    sentinel instead (backward-compatible for all callers).
    """
    if meta is None:
        meta = {}
    freq_dist, cond_freq_dist = self._build_prob_dist(
        FreqDist(), ConditionalFreqDist())
    feats = self._get_most_informative_features(n_feats, freq_dist,
                                                cond_freq_dist)
    bigrams = self.get_bigram_analyzer(n_bigrams, freq_dist.iterkeys())
    meta['n_bigrams'] = n_bigrams
    meta['n_feats'] = n_feats
    LOGGER.info("Building TrainedClassifier")
    return TrainedClassifier(formatter, bigrams, feats, meta,
                             phrases_iterator=self.phrases_it)
def _get_most_informative_features(self, nfeats, freq_dist, cond_freq_dist): LOGGER.info("Getting most informative fearures") LOGGER.info("Building Heap") heap = [] smallest_score = -1 res = [] for word, total_freq in freq_dist.iteritems(): score = 0 for sentiment in self._get_class_sentiments(): score += BigramAssocMeasures.chi_sq( cond_freq_dist[sentiment][word], (total_freq, cond_freq_dist[sentiment].N()), freq_dist.N()) if len(heap) < nfeats: heapq.heappush(heap, (score, word)) if score < smallest_score: smallest_score = score elif score > smallest_score: smallest_score = score heapq.heapreplace(heap, (score, word)) LOGGER.info("Smallest score has increased to: %s" % smallest_score) sorted_res = [] while heap: score, word = heapq.heappop(heap) sorted_res.insert(0, word) return sorted_res
def _get_most_informative_features(self, nfeats, freq_dist, cond_freq_dist): LOGGER.info("Getting most informative fearures") LOGGER.info("Building Heap") heap = [] smallest_score = -1 res = [] for word, total_freq in freq_dist.iteritems(): score = 0 for sentiment in self._get_class_sentiments(): score += BigramAssocMeasures.chi_sq( cond_freq_dist[sentiment][word], (total_freq, cond_freq_dist[sentiment].N()), freq_dist.N() ) if len(heap) < nfeats: heapq.heappush(heap, (score, word)) if score < smallest_score: smallest_score = score elif score > smallest_score: smallest_score = score heapq.heapreplace(heap, (score, word)) LOGGER.info("Smallest score has increased to: %s" % smallest_score) sorted_res = [] while heap: score, word = heapq.heappop(heap) sorted_res.insert(0, word) return sorted_res
def main(path):
    """Run the worker loop: receive JSON texts over ZMQ, reply with predictions.

    Loads the serialized classifier from ``path`` and then serves forever;
    each request is answered with the enriched JSON payload, or with an
    empty string when classification yields no result or a ZMQ error occurs.
    """
    LOGGER.info("Started worker")
    LOGGER.info("Loading classifier")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    classifier = phrase.TrainedClassifier.load(path, FORMATTER)
    LOGGER.info("Ready and waiting for work")
    socket = get_zmq_socket(DEFAULT_ADDRESS)
    while True:
        try:
            message = socket.recv()
            data = json.loads(message)
            p = make_phrase(data['text'])
            result = prob_dist_to_dict(classifier.prob_classify(p))
            if not result:
                socket.send('')
                continue
            LOGGER.info("[%s] %s" % (result['result'], data['text']))
            data['prediction'] = result
            socket.send(json.dumps(data))
        except zmq.error.ZMQError:
            LOGGER.error("Trying to recover from ZMQError crash, sending NIL")
            socket.send('')
        except Exception:
            # exception() preserves the traceback; the old error(e) lost it.
            # NOTE(review): no reply is sent on this path — if the socket is
            # REQ/REP this leaves the peer blocked; confirm socket type.
            LOGGER.exception("Unhandled error while processing message")
def main(path):
    """Worker loop: receive JSON-encoded texts over ZMQ, reply with predictions.

    Loads the classifier from ``path`` and serves forever.  Each request is
    answered with the enriched JSON payload, or an empty string when
    classification produces nothing or a ZMQ error occurs.
    """
    LOGGER.info("Started worker")
    LOGGER.info("Loading classifier")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    classifier = phrase.TrainedClassifier.load(path, FORMATTER)
    LOGGER.info("Ready and waiting for work")
    socket = get_zmq_socket(DEFAULT_ADDRESS)
    while True:
        try:
            message = socket.recv()
            data = json.loads(message)
            p = make_phrase(data["text"])
            result = prob_dist_to_dict(classifier.prob_classify(p))
            if not result:
                socket.send("")
                continue
            LOGGER.info("[%s] %s" % (result["result"], data["text"]))
            data["prediction"] = result
            socket.send(json.dumps(data))
        except zmq.error.ZMQError:
            LOGGER.error("Trying to recover from ZMQError crash, sending NIL")
            socket.send("")
        except Exception:
            # exception() keeps the traceback; the old error(e) dropped it.
            # NOTE(review): no reply is sent here — if the socket is REQ/REP
            # the peer will block waiting; confirm socket type.
            LOGGER.exception("Unhandled error while processing message")
def main(path, against, nodb):
    """Evaluate a serialized classifier against a labelled Redis corpus.

    Logs the accuracy and, unless ``nodb`` is truthy, upserts the run's
    statistics (classifier meta + accuracy + corpus name) into the
    ``worldmood.statistics`` Mongo collection keyed by classifier uid.
    """
    LOGGER.info("Started testing")
    LOGGER.info("Loading classifier")
    classifier = phrase.TrainedClassifier.load(path, FORMATTER)
    LOGGER.info("Loading testing data")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    source = data_sources.RedisDataSource(redis.Redis(), against,
                                          ['positive', 'negative'])
    labelled = source.get_data(make_phrase)
    LOGGER.info("Making testing data")
    test_data = [(p, sentiment)
                 for sentiment, phrases in labelled.iteritems()
                 for p in phrases]
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        # Not every classifier type exposes this; skipping is deliberate.
        pass
    accuracy = nltk.classify.util.accuracy(classifier, test_data)
    LOGGER.info("Accuracy is: %s" % accuracy)
    if nodb:
        return
    conn = pymongo.Connection()
    stats = classifier.meta
    stats['accuracy'] = accuracy
    stats['test_corpus'] = against
    coll = conn['worldmood']['statistics']
    coll.update({'uid': classifier.get_uid()}, stats, upsert=True)
    LOGGER.info("Updated collection database: %s" % stats)
def get_bigram_analyzer(self, n, words):
    """Build a BigramAnalyzer from collocations scoring above ``n``.

    Candidate bigrams are drawn from ``words`` and ranked with the
    likelihood-ratio association measure.
    """
    LOGGER.info("Building Bigram Analyzer")
    measures = collocations.BigramAssocMeasures()
    collocation_finder = collocations.BigramCollocationFinder.from_words(words)
    scored = collocation_finder.above_score(measures.likelihood_ratio, n)
    return BigramAnalyzer(scored)
def main(collection, destination, nfeats, nbigrams, classifier_type):
    """Train a classifier from a Redis corpus and serialize it to disk.

    When ``destination`` is falsy a path is derived from the training
    parameters.  The destination directory is created if missing.
    """
    LOGGER.info("Started classifier")
    if not destination:
        destination = generate_path_for_classifier(
            collection, nfeats, nbigrams, classifier_type)
    LOGGER.info("Classifier will be saved in: %s" % destination)
    LOGGER.info("Training a %s classifier with %s feats and %s bigrams" % (
        classifier_type, nfeats, nbigrams))
    # Pull the labelled phrases out of Redis.
    LOGGER.info("Building datasource")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    source = data_sources.RedisDataSource(
        redis.Redis(), collection, ['positive', 'negative'])
    corpus = source.get_data(make_phrase)
    # The processor derives bigrams and informative features during training.
    LOGGER.info("Building text processor")
    processor = phrase.TextProcessor(corpus, FORMATTER)
    meta = {
        'train_corpus': collection,
        'classifier_type': classifier_type,
    }
    LOGGER.info("Training Classifier")
    classifier = processor.train_classifier(FORMATTER, nbigrams, nfeats, meta)
    LOGGER.info("Serializing classifier")
    if not os.path.exists(destination):
        os.makedirs(destination)
    classifier.serialize(destination)
def main(collection, destination, nfeats, nbigrams, classifier_type):
    """Train a classifier from the Redis corpus ``collection`` and save it.

    Derives a destination path from the training parameters when none is
    given, and creates the directory if it does not exist yet.
    """
    LOGGER.info("Started classifier")
    destination = destination or generate_path_for_classifier(
        collection, nfeats, nbigrams, classifier_type)
    LOGGER.info("Classifier will be saved in: %s" % destination)
    LOGGER.info("Training a %s classifier with %s feats and %s bigrams" % (
        classifier_type, nfeats, nbigrams))
    # Fetch labelled training phrases from Redis.
    LOGGER.info("Building datasource")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    datasource = data_sources.RedisDataSource(
        redis.Redis(), collection, ['positive', 'negative'])
    training_phrases = datasource.get_data(make_phrase)
    # The processor computes bigrams and informative features while training.
    LOGGER.info("Building text processor")
    processor = phrase.TextProcessor(training_phrases, FORMATTER)
    meta = {'train_corpus': collection, 'classifier_type': classifier_type}
    LOGGER.info("Training Classifier")
    classifier = processor.train_classifier(FORMATTER, nbigrams, nfeats, meta)
    LOGGER.info("Serializing classifier")
    if not os.path.exists(destination):
        os.makedirs(destination)
    classifier.serialize(destination)
def _build_prob_dist(self, fd, cfd): LOGGER.info("Building frequency distribution") for word, sentiment in self.phrases_it.iterate_formatted_words(self.formatter): fd.inc(word) cfd[sentiment].inc(word) return fd, cfd
def get_bigram_analyzer(self, n, words):
    """Return a BigramAnalyzer over bigrams from ``words`` scoring above ``n``.

    Ranking uses the likelihood-ratio collocation measure.
    """
    LOGGER.info("Building Bigram Analyzer")
    assoc_measures = collocations.BigramAssocMeasures()
    finder = collocations.BigramCollocationFinder.from_words(words)
    selected = finder.above_score(assoc_measures.likelihood_ratio, n)
    return BigramAnalyzer(selected)