def test_get_classifier_returns_naive_bayes_classifier_with_corpora_when_corpora_exist(
        self):
    # get_classifier() must shuffle the stored corpus, pass it to
    # NaiveBayesClassifier and return the classifier it built.

    # mock the shuffle list method
    classify.shuffle = mock.MagicMock()
    # regenerate the command class being tested as we mocked an extra module above
    self.command = classify.Command()

    # create a dummy news item and corpus
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.save()
    corpus = Corpus()
    corpus.news_item = news_item
    corpus.positive = True
    corpus.save()

    # call the method being tested
    classifier = self.command.get_classifier()

    # shuffled the corpora
    classify.shuffle.assert_called_once_with([('foo', 'pos')])
    # a classifier with corpora was created
    classify.NaiveBayesClassifier.assert_called_once_with([('foo', 'pos')])
    # the method returned a non-empty result (identity check instead of `!= None`)
    self.assertIsNotNone(classifier)
class RssFeedTestCase(TestCase):
    # Tests for RssFeed: the item listing and the per-item accessors.

    rss_feed = None
    news_item = None

    def setUp(self):
        # a fresh, unsaved news item and feed for every test
        self.news_item = NewsItem()
        self.rss_feed = RssFeed()

    def test_items_returns_empty_list_when_newsitem_does_not_exist(self):
        items = self.rss_feed.items()
        self.assertEqual([], list(items))

    def test_items_returns_newsitems_when_positive_newsitem_exists(self):
        self.news_item.score = 1
        self.news_item.published = True
        self.news_item.save()
        items = self.rss_feed.items()
        self.assertEqual(1, len(list(items)))

    def test_items_returns_empty_list_when_no_positive_newsitem_exists(self):
        self.news_item.score = 0
        self.news_item.published = False
        self.news_item.save()
        items = self.rss_feed.items()
        self.assertEqual([], list(items))

    def test_item_title_returns_newsitem_title_when_set(self):
        self.news_item.title = 'foo_title'
        self.assertEqual('foo_title',
                         self.rss_feed.item_title(self.news_item))

    def test_item_title_returns_empty_string_when_newsitem_title_not_set(self):
        self.assertEqual('', self.rss_feed.item_title(self.news_item))

    def test_item_description_returns_newsitem_description_when_set(self):
        self.news_item.description = 'foo_description'
        self.assertEqual('foo_description',
                         self.rss_feed.item_description(self.news_item))

    def test_item_description_returns_empty_string_when_newsitem_description_not_set(
            self):
        self.assertEqual('', self.rss_feed.item_description(self.news_item))

    def test_item_link_returns_newsitem_url_when_set(self):
        self.news_item.url = 'https://www.google.com'
        self.assertEqual('https://www.google.com',
                         self.rss_feed.item_link(self.news_item))

    def test_item_link_returns_empty_string_when_newsitem_url_not_set(self):
        self.assertEqual('', self.rss_feed.item_link(self.news_item))

    def test_item_pubdate_returns_none_when_newsitem_added_at_not_set(self):
        # NOTE(review): the test name says "returns none" but the assertion
        # expects a non-None value; presumably added_at has a default --
        # confirm against the NewsItem model and rename if so.
        self.assertIsNotNone(self.rss_feed.item_pubdate(self.news_item))

    def test_item_pubdate_returns_newsitem_added_at_datetime_when_set(self):
        # set the attribute this test is about; the original assigned the
        # timestamp to `url`, so added_at was never actually set
        self.news_item.added_at = timezone.now()
        # NOTE(review): could be tightened to an equality check if
        # item_pubdate returns added_at unchanged -- confirm.
        self.assertIsNotNone(self.rss_feed.item_pubdate(self.news_item))
def test_get_classifier_uses_unique_corpora(self):
    # Two corpora with the same title and classification must be collapsed
    # into one training sample before shuffling.

    # mock the shuffle list method
    classify.shuffle = mock.MagicMock()
    # regenerate the command class being tested as we mocked an extra module above
    self.command = classify.Command()

    # create dummy news items and corpora with same title and classification
    for _ in range(2):
        news_item = NewsItem()
        news_item.title = 'foo'
        news_item.save()
        corpus = Corpus()
        corpus.news_item = news_item
        corpus.positive = True
        corpus.save()

    # test that two corpora exist
    corpora = Corpus.objects.all()
    self.assertEqual(2, len(corpora))

    # call the method being tested (return value is not needed here)
    self.command.get_classifier()

    # shuffled the corpora with only one of the identical corpora
    classify.shuffle.assert_called_once_with([('foo', 'pos')])
def test_changelist_view_returns_metrics_when_corpora_exist_and_query_is_on_the_added_at_date(
        self):
    # A corpus added in December 2018 must show up in the metrics when the
    # changelist is filtered on that month and year.
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.save()

    corpus = Corpus()
    corpus.news_item = news_item
    corpus.positive = True
    corpus.added_at = '2018-12-02 21:00:00+00:00'
    corpus.save()

    superuser = self.create_superuser('superuser')
    request = self.mocked_authenticated_request(
        '/admin/rss/corpusmetric/?added_at__month=12&added_at__year=2018',
        superuser)
    response = self.admin.changelist_view(request)

    expected_metrics = [{'positive': True, 'total': 1}]
    expected_metrics_total = {'total': 1}
    self.assertEqual(expected_metrics,
                     response.context_data['corpus_metrics'])
    self.assertEqual(expected_metrics_total,
                     response.context_data['corpus_metrics_total'])
def test_get_newsitem_title_returns_title_when_news_item_is_not_none(self):
    # CorpusAdmin.get_news_item_title() must return the title of the news
    # item attached to the corpus.
    news_item = NewsItem()
    news_item.title = 'foo'

    corpus = Corpus()
    corpus.news_item = news_item

    admin = CorpusAdmin(Corpus, AdminSite())
    self.assertEqual('foo', admin.get_news_item_title(corpus))
def test_news_item_unpublish_unpublishes_newsitems_in_query_set(self):
    # The unpublish admin action must store every item of the query set
    # with published set to False.

    # nothing is stored before the action runs
    news_items = NewsItem.objects.all()
    self.assertEqual([], list(news_items))

    # an unsaved, published item handed to the action
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.published = True
    query_set = [news_item]

    self.newsitem.news_item_unpublish(None, None, query_set)

    # the item is now stored and unpublished
    news_items = NewsItem.objects.all()
    self.assertEqual(1, len(news_items))
    self.assertEqual(False, news_items[0].published)
def get_classifier(self):
    """Train a classifier on the stored corpora and dump it to disk.

    Training samples are ``(title, label)`` tuples built from every active
    ``Corpus`` (label from ``corpus.get_classification()``) and from every
    item returned by ``NewsItem.find_neutral()`` (label ``'neu'``).
    Stopwords are stripped from the titles, duplicate samples are dropped
    and the remaining samples are shuffled before training.

    Returns:
        The trained ``NaiveBayesClassifier``, which is also pickled to
        ``settings.CLASSIFIER_DUMP_FILEPATH``.
    """
    self.logger.info('Training classifier...')

    # Escape each stopword so it is matched literally, and skip empty ones:
    # an empty alternative would match at every word boundary and make the
    # trailing `\s*` swallow the whitespace between words.
    stopwords_blacklisted = [
        re.escape(word) for word in self.get_stopwords() if word
    ]
    stopwords_pattern = None
    if stopwords_blacklisted:
        stopwords_pattern = re.compile(
            r'\b(' + r'|'.join(stopwords_blacklisted) + r')\b\s*')

    def strip_stopwords(title):
        # leave the title untouched when there is nothing to strip
        if stopwords_pattern is None:
            return title
        return stopwords_pattern.sub('', title)

    corpora_classified = list()
    for corpus in Corpus.objects.filter(active=True):
        title = strip_stopwords(corpus.news_item.title)
        corpora_classified.append((title, corpus.get_classification()))
    for news_item in NewsItem.find_neutral():
        title = strip_stopwords(news_item.title)
        corpora_classified.append((title, 'neu'))

    # drop duplicate samples, then randomise their order
    corpora_classified = list(set(corpora_classified))
    shuffle(corpora_classified)
    classifier = NaiveBayesClassifier(corpora_classified)

    self.logger.info('Dumping classifier.')
    # context manager guarantees the dump file is flushed and closed
    with open(settings.CLASSIFIER_DUMP_FILEPATH, 'wb') as dump_file:
        pickle.dump(classifier, dump_file)
    self.logger.info('Classifier dumped!')
    return classifier
def handle(self, *args, **options):
    """Re-queue negative news items for classification.

    Fetches the items returned by ``NewsItem.find_negative`` for
    ``settings.SENTIMENT_POLARITY_THRESHOLD`` and publishes each one,
    serialized as JSON, to the ``settings.QUEUE_NAME_CLASSIFY`` queue with
    the ``x-is-self-train`` header set. Returns early, without opening a
    queue connection, when there is nothing to publish.
    """
    news_items = NewsItem.find_negative(
        settings.SENTIMENT_POLARITY_THRESHOLD)
    if not news_items:
        self.logger.info('No news items found.')
        return

    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=settings.QUEUE_HOSTNAME))
    try:
        channel = connection.channel()
        channel.queue_declare(queue=settings.QUEUE_NAME_CLASSIFY,
                              durable=True)
        self.logger.info(
            'Found %s negative news items that are going to be re-classified.',
            len(news_items))

        for news_item in news_items:
            body = serializers.serialize('json', [news_item])
            channel.basic_publish(
                exchange='',
                routing_key=settings.QUEUE_NAME_CLASSIFY,
                body=body,
                # delivery_mode=2 marks the message as persistent
                properties=pika.BasicProperties(
                    delivery_mode=2,
                    headers={'x-is-self-train': True}))
            self.logger.info('Successfully re-queued #%s "%s"!',
                             news_item.id, news_item.title)
    finally:
        # always release the broker connection, even if publishing fails
        connection.close()
def test_changelist_view_returns_metrics_when_positive_newsitem_and_negative_corpus_exist(
        self):
    # A news item scored positive but corrected by a negative corpus must be
    # counted as initially positive, supervised negative, with 0 accuracy.
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.score = 1
    news_item.added_at = '2018-12-03 21:00:00+00:00'
    news_item.save()

    corpus = Corpus()
    corpus.positive = False
    corpus.news_item = news_item
    corpus.save()

    superuser = self.create_superuser('superuser')
    request = self.mocked_authenticated_request(
        '/admin/rss/newsitemmetric/', superuser)
    response = self.admin.changelist_view(request)
    context = response.context_data

    self.assertEqual(1, context['news_items_count'])
    self.assertEqual(0, context['news_items_unclassified'])
    self.assertEqual(1, context['classification_initial']['positive'])
    self.assertEqual(0, context['classification_initial']['negative'])
    self.assertEqual(0, context['classification_supervised']['positive'])
    self.assertEqual(1, context['classification_supervised']['negative'])
    self.assertEqual(0, context['corpus_count']['positive'])
    self.assertEqual(1, context['corpus_count']['negative'])
    self.assertEqual([{'accuracy': 0, 'added_at': '2018-12-03'}],
                     context['accuracy'])
def news(request):
    """Render ``web/news.html`` with the items from ``NewsItem.find_positive``.

    The items are looked up with ``settings.SENTIMENT_POLARITY_THRESHOLD``
    and ``settings.WEB_NEWS_ITEMS_COUNT`` and exposed to the template as
    ``news_items``.
    """
    page = loader.get_template('web/news.html')
    positive_items = NewsItem.find_positive(
        settings.SENTIMENT_POLARITY_THRESHOLD,
        settings.WEB_NEWS_ITEMS_COUNT)
    return HttpResponse(page.render({'news_items': positive_items}))
def test_get_classifier_uses_corpora_clean_of_stopwords(self):
    # Stopwords in a news item title must be stripped before the title is
    # used as a training sample.

    # mock the shuffle list method
    classify.shuffle = mock.MagicMock()
    # regenerate the command class being tested as we mocked an extra module above
    self.command = classify.Command()

    # create dummy news item (having title that contains stopwords) and corpora
    news_item = NewsItem()
    news_item.title = 'when foo then bar'
    news_item.save()
    corpus = Corpus()
    corpus.news_item = news_item
    corpus.positive = True
    corpus.save()

    # call the method being tested (return value is not needed here)
    self.command.get_classifier()

    # shuffled the corpora with the stopwords removed from the title
    classify.shuffle.assert_called_once_with([('foo bar', 'pos')])
def test_get_accuracy_total_returns_100_percent_when_no_corpora(self):
    # With a classified news item and no correcting corpus the total
    # accuracy must be 100.
    news_item = NewsItem()
    news_item.score = 1
    news_item.added_at = '2018-11-24 01:00:00+00:00'
    news_item.save()

    accuracy = self.newsitem_metric.get_accuracy_total(self.date_range)
    self.assertEqual(100, accuracy)
def test_changelist_view_returns_positive_and_negative_metrics_when_positive_and_negative_corpora_exist(
        self):
    # One positive and one negative corpus must yield one metrics row per
    # classification and a grand total of two.

    # positive corpus
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.save()
    corpus = Corpus()
    corpus.news_item = news_item
    corpus.positive = True
    corpus.save()

    # negative corpus
    news_item = NewsItem()
    news_item.title = 'bar'
    news_item.save()
    corpus = Corpus()
    corpus.news_item = news_item
    corpus.positive = False
    corpus.save()

    superuser = self.create_superuser('superuser')
    request = self.mocked_authenticated_request('/admin/rss/corpusmetric/',
                                                superuser)
    response = self.admin.changelist_view(request)

    expected_metrics = [
        {'positive': True, 'total': 1},
        {'positive': False, 'total': 1},
    ]
    expected_metrics_total = {'total': 2}
    self.assertEqual(expected_metrics,
                     response.context_data['corpus_metrics'])
    self.assertEqual(expected_metrics_total,
                     response.context_data['corpus_metrics_total'])
def test_get_accuracy_returns_100_percent_when_no_corpora(self):
    # With a classified news item and no correcting corpus the per-day
    # accuracy for that day must be 100.
    news_item = NewsItem()
    news_item.score = 1
    news_item.added_at = '2018-11-24 01:00:00+00:00'
    news_item.save()

    metrics = self.newsitem_metric.get_accuracy(self.date_range)
    self.assertEqual(1, len(metrics))
    self.assertEqual(100, metrics[0]['accuracy'])
    self.assertEqual('2018-11-24', metrics[0]['added_at'])
def test_corpus_create_positive_creates_positive_corpora_and_enqueues_job_to_retrain_classifier_when_newsitems_in_query_set(
        self):
    # The admin action must create one positive corpus for the news item in
    # the query set and enqueue the corpus-creation job once.
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.published = True
    news_item.save()
    query_set = [news_item]

    self.newsitem.corpus_create_positive(None, None, query_set)

    self.newsitem.enqueue_corpus_creation.assert_called_once()
    corpora = Corpus.objects.all()
    self.assertEqual(1, len(corpora))
    self.assertEqual(True, corpora[0].positive)
def test_get_accuracy_total_returns_0_percent_when_only_not_accurate_newsitems_exist(
        self):
    # A positively scored news item corrected by a negative corpus must
    # bring the total accuracy down to 0.
    news_item = NewsItem()
    news_item.score = 1
    news_item.added_at = '2018-11-24 01:00:00+00:00'
    news_item.save()

    corpus = Corpus()
    corpus.news_item = news_item
    corpus.positive = False
    corpus.save()

    accuracy = self.newsitem_metric.get_accuracy_total(self.date_range)
    self.assertEqual(0.0, accuracy)
def setUp(self):
    # Replace every external collaborator of the classify command with a
    # mock, prepare serialized fixtures and build the command under test.

    # keep a handle on the real serializers module used by classify
    classify.serializers_real = classify.serializers

    # Thread() hands out the classify worker first, then the train worker
    self.thread_classify = mock.MagicMock()
    self.thread_train = mock.MagicMock()
    thread_mocks = [self.thread_classify, self.thread_train]
    classify.threading.Thread = mock.MagicMock(side_effect=thread_mocks)

    # queue connection: BlockingConnection() -> connection -> channel
    self.connection = mock.MagicMock()
    classify.pika.BlockingConnection = mock.MagicMock(
        return_value=self.connection)
    self.channel = mock.MagicMock()
    self.connection.channel = mock.MagicMock(return_value=self.channel)
    classify.pika.ConnectionParameters = mock.MagicMock()

    # database connection used by the command
    self.db_connection = mock.MagicMock()
    classify.connection = self.db_connection

    # classifier and settings
    classify.NaiveBayesClassifier = mock.MagicMock()
    classify.settings.AUTO_PUBLISH = False

    def to_json(item):
        # serialize a single news item the way it is sent as classification input
        return serializers.serialize('json', [item])

    # a well-formed news item to be used as classification input
    valid_item = NewsItem()
    valid_item.title = 'foo'
    self.serialized_news_item = to_json(valid_item)

    # a news item carrying a non-boolean `published` value
    broken_item = NewsItem()
    broken_item.title = 'foo'
    broken_item.published = 'foo'
    self.serialized_news_item_crap = to_json(broken_item)

    # logger handed out by logging.getLogger inside classify
    self.logger = mock.MagicMock()
    classify.logging.getLogger = mock.MagicMock(return_value=self.logger)

    # build the command only after all mocks are installed
    self.command = classify.Command()
def test_news_item_publish_and_corpus_create_negative_publishes_newsitems_and_creates_negative_corpora_when_newsitems_in_query_set(
        self):
    # NOTE(review): despite its name, this test exercises
    # news_item_unpublish_and_corpus_create_negative: the item must end up
    # unpublished with a negative corpus attached.
    news_item = NewsItem()
    news_item.title = 'foo'
    # start from a published item so the unpublish assertion is meaningful
    # (the original set a misspelled `publshed` attribute instead)
    news_item.published = True
    news_item.score = 1.00
    news_item.save()
    query_set = [news_item]

    self.newsitem.news_item_unpublish_and_corpus_create_negative(
        None, None, query_set)

    self.newsitem.enqueue_corpus_creation.assert_called_once()
    news_items = NewsItem.objects.all()
    self.assertEqual(1, len(news_items))
    self.assertFalse(news_items[0].published)

    # filter() never returns None, so check that a corpus actually exists
    corpus = Corpus.objects.filter(news_item=news_items[0])
    self.assertTrue(corpus.exists())
    self.assertFalse(corpus[0].positive)
def test_get_accuracy_returns_0_percent_when_only_not_accurate_newsitems_exist(
        self):
    # A positively scored news item corrected by a negative corpus must
    # give that day an accuracy of 0.
    news_item = NewsItem()
    news_item.score = 1
    news_item.added_at = '2018-11-24 01:00:00+00:00'
    news_item.save()

    corpus = Corpus()
    corpus.news_item = news_item
    corpus.positive = False
    corpus.save()

    metrics = self.newsitem_metric.get_accuracy(self.date_range)
    self.assertEqual(1, len(metrics))
    self.assertEqual(0, metrics[0]['accuracy'])
    self.assertEqual('2018-11-24', metrics[0]['added_at'])
def test_news_returns_http_response_with_template_and_positive_newsitems_when_positive_newsitems_exist(
        self):
    # The news view must answer 200 and render the stored positive,
    # published news item.

    # create a positive news item
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.score = 1
    news_item.published = True
    news_item.save()

    # make request to news view
    response = self.get_news()

    # returns an instance of exactly HttpResponse (not a subclass)
    self.assertIs(type(response), HttpResponse)
    # request didn't fail
    self.assertEqual(200, response.status_code)
    # response contains news items
    content = response.getvalue()
    self.assertIn('news-item', str(content))
def test_changelist_view_returns_metrics_when_newsitems_exist_but_no_corpora_and_date_query_includes_newsitem(
        self):
    # A positive news item without any corpus, inside the queried month,
    # must be counted as positive both initially and supervised, with
    # 100 accuracy.
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.score = 1
    news_item.added_at = '2018-12-03 21:00:00+00:00'
    news_item.save()

    superuser = self.create_superuser('superuser')
    request = self.mocked_authenticated_request(
        '/admin/rss/newsitemmetric/?added_at__month=12&added_at__year=2018',
        superuser)
    response = self.admin.changelist_view(request)
    context = response.context_data

    self.assertEqual(1, context['news_items_count'])
    self.assertEqual(0, context['news_items_unclassified'])
    self.assertEqual(1, context['classification_initial']['positive'])
    self.assertEqual(0, context['classification_initial']['negative'])
    self.assertEqual(1, context['classification_supervised']['positive'])
    self.assertEqual(0, context['classification_supervised']['negative'])
    self.assertEqual(0, context['corpus_count']['positive'])
    self.assertEqual(0, context['corpus_count']['negative'])
    self.assertEqual([{'accuracy': 100.0, 'added_at': '2018-12-03'}],
                     context['accuracy'])
def test_news_item_publish_and_corpus_create_positive_publishes_newsitems_and_creates_positive_corpora_when_newsitems_in_query_set(
        self):
    # The admin action must publish the news item, create one positive
    # corpus for it and enqueue the corpus-creation job once.
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.published = False
    news_item.save()
    query_set = [news_item]

    self.newsitem.news_item_publish_and_corpus_create_positive(
        None, None, query_set)

    self.newsitem.enqueue_corpus_creation.assert_called_once()

    corpora = Corpus.objects.all()
    self.assertEqual(1, len(corpora))
    self.assertEqual(True, corpora[0].positive)

    news_items = NewsItem.objects.all()
    self.assertEqual(1, len(news_items))
    self.assertEqual(True, news_items[0].published)
def test_about_returns_http_response_with_stats(self):
    # The about view must answer 200 and render the statistics for one
    # news item, one active source and no corpora.

    # create a news item
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.score = 1
    news_item.published = False
    news_item.save()

    # create an active source
    source = Source()
    source.name = 'foo'
    source.url = 'http://www.foo.com'
    source.homepage = 'http://www.foo.com'
    source.save()

    # create an inactive source
    source = Source()
    source.name = 'bar'
    source.url = 'http://www.bar.com'
    source.homepage = 'http://www.bar.com'
    source.active = False
    source.save()

    # make request to about view
    response = self.get_about()

    # returns an instance of exactly HttpResponse (not a subclass)
    self.assertIs(type(response), HttpResponse)
    # request didn't fail
    self.assertEqual(200, response.status_code)

    # response contains the expected statistics
    content = str(response.getvalue())
    self.assertIn('Sources crawled</strong>: 1', content)
    self.assertIn('News classified</strong>: 1', content)
    self.assertIn('Corpora created</strong>: 0', content)
    self.assertIn('Classification accuracy</strong>: 100%', content)
def test_handle_publishes_when_newsitems_exist(self):
    # With one stored news item, handle() must open the queue connection,
    # declare the classify queue, publish once and log both messages.
    stored_item = NewsItem(id=1, title='foo')
    stored_item.save()

    # the method being tested
    self.command.handle()

    # connection and channel were set up exactly once
    pika_mock = classify_requeue.pika
    pika_mock.BlockingConnection.assert_called_once()
    pika_mock.ConnectionParameters.assert_called_once()
    self.connection.channel.assert_called_once()

    # the durable classify queue was declared and one message was published
    self.channel.queue_declare.assert_called_once_with(
        durable=True, queue=settings.QUEUE_NAME_CLASSIFY)
    self.channel.basic_publish.assert_called_once()

    # both progress messages were logged
    expected_log_calls = (
        ('Found %s news items that need to be classified.', 1),
        ('Successfully re-queued #%s "%s"!', 1, 'foo'),
    )
    for log_args in expected_log_calls:
        self.logger.info.assert_any_call(*log_args)
def test_to_dict_returns_dictionaries_when_cursor_more_than_one_results(
        self):
    # to_dict() must turn every row of a multi-row cursor into a dict.
    news_item = NewsItem()
    news_item.score = 1
    news_item.added_at = '2018-11-23 01:00:00+00:00'
    news_item.save()

    news_item = NewsItem()
    news_item.score = 1
    news_item.added_at = '2018-11-23 02:00:00+00:00'
    news_item.save()

    with connection.cursor() as cursor:
        cursor.execute('SELECT * FROM news_item')
        results = self.newsitem_metric.to_dict(cursor)

    self.assertEqual(2, len(results))
    self.assertEqual(dict, type(results[0]))
    self.assertEqual(dict, type(results[1]))
def setUp(self):
    # Every test starts with an unsaved corpus that points at an unsaved
    # news item.
    self.corpus = Corpus(news_item=NewsItem())
def test_to_dict_returns_one_dictionary_when_cursor_has_one_result(self):
    # to_dict() must map the single news_item row to a dict keyed by
    # column name.
    source = Source()
    source.id = 1
    source.name = 'foo_source'
    source.save()

    news_item = NewsItem()
    news_item.id = 1
    news_item.title = 'foo'
    news_item.description = 'bar'
    news_item.url = 'https://www.google.com'
    news_item.added_at = '2018-11-23 01:00:00+00:00'
    news_item.source = source
    news_item.published = False
    news_item.score = 1
    news_item.save()

    with connection.cursor() as cursor:
        cursor.execute('SELECT * FROM news_item')
        result = self.newsitem_metric.to_dict(cursor)

    self.assertEqual(1, len(result))
    self.assertEqual([{
        'id': 1,
        'title': 'foo',
        'description': 'bar',
        'url': 'https://www.google.com',
        'added_at': datetime(2018, 11, 23, 1, 0),
        'source_id': 1,
        'published': False,
        'score': 1
    }], result)
def test_get_accuracy_does_not_include_unclassified_newsitems(self):
    # News items without a score must not produce any accuracy metrics,
    # whether they are published or not.
    news_item = NewsItem()
    news_item.score = None
    news_item.added_at = '2018-11-24 01:00:00+00:00'
    news_item.save()

    news_item = NewsItem()
    news_item.score = None
    news_item.published = True
    news_item.added_at = '2018-11-24 02:00:00+00:00'
    news_item.save()

    metrics = self.newsitem_metric.get_accuracy(self.date_range)
    self.assertEqual([], metrics)
def test_get_accuracy_total_uses_two_day_statistics_when_exist(self):
    # One inaccurate day and one accurate day must give a total accuracy
    # of 50.

    # day one: positive score corrected by a negative corpus
    news_item = NewsItem()
    news_item.score = 1
    news_item.added_at = '2018-11-24 01:00:00+00:00'
    news_item.save()
    corpus = Corpus()
    corpus.news_item = news_item
    corpus.positive = False
    corpus.save()

    # day two: positive score with no correcting corpus
    news_item = NewsItem()
    news_item.score = 1
    news_item.added_at = '2018-11-25 01:00:00+00:00'
    news_item.save()

    accuracy = self.newsitem_metric.get_accuracy_total(self.date_range)
    self.assertEqual(50.0, accuracy)
def setUp(self):
    # Give every test its own unsaved news item and a fresh feed instance.
    self.news_item, self.rss_feed = NewsItem(), RssFeed()