class N_Grams_Feature_Generator_Unittests(unittest.TestCase):
    """Integration tests for N_Grams_Feature_Generator against a scratch DB.

    setUp creates a fresh database; each test seeds authors/posts in the
    'test' domain, runs the generator, and checks the persisted n-gram
    author features.
    """

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._domain = 'test'
        self._posts = []
        self._authors = []

    def tearDown(self):
        # Close the session once, then drop the scratch DB.  The original
        # called session.close() a second time after deleteDB(), which was
        # redundant.
        self._db.session.close()
        self._db.deleteDB()

    def execute_module(self):
        """Run the n-gram generator over every post in the 'test' domain."""
        authors = self._authors
        posts = self._db.get_posts_by_domain('test')
        parameters = {"authors": authors, "posts": posts, "graphs": []}
        n_gram_module = N_Grams_Feature_Generator(self._db, **parameters)
        # Disable stemming so expected n-gram keys match the verbatim words.
        n_gram_module._stemming = False
        n_gram_module.execute(window_start=None)

    def test_simple_case(self):
        # Author '1' uses the bigram "insufficient inline" twice; author '2'
        # uses it once.  The feature value for author '1' must therefore be 2.
        self._add_author('1')
        self._add_post('do that', 'This article includes a list of references, but its sources remain unclear because it has insufficient inline citations. Please help to improve this article by introducing more precise citations.', '1')
        self._add_post('to do ', 'article citations insufficient inline because the damn thing will not do that', '1')
        self._add_author('2')
        self._add_post('this was a triumph', 'im making a note here insufficient inline', '2')
        self.execute_module()
        db_val = self._db.get_author_feature('1', "2_gram_insufficient_inline").attribute_value
        self.assertEqual(db_val, str(2))

    def _add_author(self, author_guid):
        """Persist a minimal author in the 'test' domain."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = 'test author'
        author.name = 'test'
        author.domain = 'test'
        self._db.add_author(author)
        self._db.session.commit()
        self._authors.append(author)

    def _add_post(self, title, content, author_guid):
        """Persist a post for *author_guid*; post_id is the running index."""
        post = Post()
        post.author = author_guid
        post.author_guid = author_guid
        post.content = content
        post.title = title
        post.domain = 'test'
        post.post_id = len(self._posts)
        post.guid = post.post_id
        # NOTE(review): `date` appears to be a project helper that parses the
        # timestamp string — confirm it is not datetime.date.
        post.date = date('2020-01-01 23:59:59')
        self._db.addPost(post)
        self._db.session.commit()
        self._posts.append(post)
class TestBehaviorFeatureGenerator(TestCase):
    """Integration tests for BehaviorFeatureGenerator and ArgumentParser.

    Each test seeds a scratch DB with one author plus posts (and, for the
    claim-based tests, claim->tweet connection rows), then either calls a
    feature method directly or runs ``execute()`` and reads the persisted
    author feature.

    Fixes vs. the original: ``assertAlmostEqual`` was called with a float
    third positional argument where an integer ``places`` is expected; an
    explicit ``delta`` keyword is used instead.  The repeated targeted-fields
    dictionaries and claim-connection wiring are factored into helpers.
    """

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._author = None

    def tearDown(self):
        self._db.session.close()

    # ---- shared fixture helpers --------------------------------------------

    def _make_generator(self):
        """Persist the seeded author, commit, and build the generator."""
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        return self._behavior_feature_generator

    def _posts_by_author_targeted_fields(self):
        """Config reading post content grouped by author_guid (no join)."""
        return [{
            'source': {
                "table_name": "posts",
                "id": "author_guid",
                "target_field": "content",
                "where_clauses": [{"field_name": 1, "value": 1}],
            },
            "connection": {},
            "destination": {},
        }]

    def _authors_targeted_fields(self):
        """Config computing features per author row (no join)."""
        return [{
            'source': {
                'table_name': 'authors',
                'id': 'author_guid',
                "target_field": "author_guid",
                "where_clauses": [{"field_name": "1", "value": "1"}],
            },
            'connection': {},
            'destination': {},
        }]

    def _claim_to_microblog_targeted_fields(self, connection_where_clauses=None):
        """Config mapping Claim posts to their connected Microblog posts.

        :param connection_where_clauses: optional filters applied to the
            claim_tweet_connection join (e.g. date comparisons).
        """
        connection = {
            'table_name': 'claim_tweet_connection',
            'source_id': 'claim_id',
            'target_id': 'post_id',
        }
        if connection_where_clauses is not None:
            connection["where_clauses"] = connection_where_clauses
        return [{
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}],
            },
            'connection': connection,
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Microblog"}],
            },
        }]

    def _connect_claim_to_posts(self, claim_id, post_ids):
        """Wire every post id in *post_ids* to *claim_id*."""
        for post_id in post_ids:
            self._add_claim_tweet_connection(claim_id, post_id)

    ######################## Average minute between posts tests ######################

    def test_average_minutes_between_posts_no_post_expected_0(self):
        self._add_author(u"author_guid")
        generator = self._make_generator()
        generator._features = ['average_minutes_between_posts']
        generator._targeted_fields = self._posts_by_author_targeted_fields()
        result = generator.average_minutes_between_posts(**{'posts': self._posts})
        self.assertEqual(0, result)

    def test_average_minutes_between_posts_one_post_expected_0(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        generator = self._make_generator()
        generator._targeted_fields = self._posts_by_author_targeted_fields()
        generator.execute()
        result_feature = self._db.get_author_feature(
            u"author_guid",
            u"BehaviorFeatureGenerator_average_minutes_between_posts")
        feature_value = getattr(result_feature, u'attribute_value')
        self.assertEqual('0', feature_value)

    def test_average_minutes_between_posts_3_post_expected_105(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-12 06:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-12 08:30:00")
        generator = self._make_generator()
        # Gaps are 60 and 150 minutes -> mean of 105.
        result = generator.average_minutes_between_posts(**{'posts': self._posts})
        self.assertEqual(105, result)

    ######################## Average posts per day tests ######################

    def test_average_posts_per_day_active_days_no_posts_expect_0(self):
        self._add_author(u"author_guid")
        generator = self._make_generator()
        result = generator.average_posts_per_day_active_days(**{'posts': self._posts})
        self.assertEqual(0, result)

    def test_average_posts_per_day_1_active_days_1_post_each_expect_1(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        generator = self._make_generator()
        result = generator.average_posts_per_day_active_days(**{'posts': self._posts})
        # The original passed 0.0000001 as the positional `places` argument,
        # which must be an int; use an explicit tolerance instead.
        self.assertAlmostEqual(1.0, result, delta=0.0000001)

    def test_average_posts_per_day_3_active_days_1_post_each_expect_1(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-16 05:00:00")
        generator = self._make_generator()
        result = generator.average_posts_per_day_active_days(**{'posts': self._posts})
        # Same assertAlmostEqual fix as above.
        self.assertAlmostEqual(1.0, result, delta=0.0000001)

    def test_average_posts_per_day_3_active_days_1_first_2_secound_3_third_expect_2(
            self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        generator = self._make_generator()
        # 6 posts over 3 active days -> 2 posts/day.
        result = generator.average_posts_per_day_active_days(**{'posts': self._posts})
        self.assertEqual(2.0, result)

    def test_average_posts_per_day_3_active_days_1_first_2_secound_3_third_expect_2_represent_by_post(
            self):
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-10 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7", "2017-06-16 06:00:00",
                       u"Not Microblog")
        self._connect_claim_to_posts(u"post0", [
            u"post1", u"post2", u"post3", u"post4", u"post5", u"post6", u"post7"])
        generator = self._make_generator()
        generator._targeted_fields = self._claim_to_microblog_targeted_fields(
            [{"val1": "source.date", "val2": "dest.date", "op": "<="}])
        generator.execute()
        author_feature = self._db.get_author_feature(
            u'post0',
            u"BehaviorFeatureGenerator_average_posts_per_day_active_days")
        self.assertEqual(u'2.0', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_average_posts_per_day_total")
        self.assertGreater(float(author_feature.attribute_value), 0)

    def test_retweet_count_0_posts(self):
        self._add_author(u"author_guid")
        generator = self._make_generator()
        generator._targeted_fields = self._authors_targeted_fields()
        generator.execute()
        author_feature = self._db.get_author_feature(
            u'author_guid', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'0', author_feature.attribute_value)

    def test_retweet_count_1_retweet(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"RT @content 1", "2017-06-12 05:00:00")
        generator = self._make_generator()
        generator._targeted_fields = self._authors_targeted_fields()
        generator.execute()
        author_feature = self._db.get_author_feature(
            u'author_guid', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'1', author_feature.attribute_value)

    def test_retweet_count_3_retweet(self):
        # Three of the six connected Microblog posts contain "RT @"; post7 is
        # excluded by the Microblog domain filter.
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-10 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"RT @content 3 RT @hi", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5 bla RT @bla", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"RT @content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7 RT @wow", "2017-06-16 06:00:00",
                       u"Not Microblog")
        self._connect_claim_to_posts(u"post0", [
            u"post1", u"post2", u"post3", u"post4", u"post5", u"post6", u"post7"])
        generator = self._make_generator()
        generator._targeted_fields = self._claim_to_microblog_targeted_fields()
        generator.execute()
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'3', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_average_retweets")
        self.assertEqual(u'0.5', author_feature.attribute_value)

    def test_received_retweets_count_0_retweets(self):
        self._add_author(u"author_guid")
        generator = self._make_generator()
        generator._targeted_fields = self._authors_targeted_fields()
        generator.execute()
        author_feature = self._db.get_author_feature(
            u'author_guid', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'0', author_feature.attribute_value)

    def test_received_retweets_count_1_retweets(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"RT @author_guid content 1",
                       "2017-06-12 05:00:00")
        generator = self._make_generator()
        generator._targeted_fields = self._authors_targeted_fields()
        generator.execute()
        author_feature = self._db.get_author_feature(
            u'author_guid', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'1', author_feature.attribute_value)

    def test_received_retweets_count_3_retweets_only_from_microblog_tweets(
            self):
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-10 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"RT @author_guid content 3 RT @hi",
                       "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5 bla RT @author_guid",
                       "2017-06-16 04:00:00")
        self._add_post(u"post6", u"RT @content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7 RT @wow", "2017-06-16 06:00:00",
                       u"Not Microblog")
        self._connect_claim_to_posts(u"post0", [
            u"post1", u"post2", u"post3", u"post4", u"post5", u"post6", u"post7"])
        generator = self._make_generator()
        generator._targeted_fields = self._claim_to_microblog_targeted_fields()
        generator.execute()
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'3', author_feature.attribute_value)

    ######################## argument_parser tests ######################

    def test_argument_parser_connection_conditions(self):
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-14 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7", "2017-06-16 06:00:00",
                       u"Not Microblog")
        self._connect_claim_to_posts(u"post0", [
            u"post1", u"post2", u"post3", u"post4", u"post5", u"post6", u"post7"])
        self._db.add_author(self._author)
        self._db.session.commit()
        arg_parser = ArgumentParser(self._db)
        # Only Microblog posts dated on/after the claim should survive.
        args = self._claim_to_microblog_targeted_fields(
            [{"val1": "source.date", "val2": "dest.date", "op": "<="}])[0]
        source_id_target_elements_dict = arg_parser._get_source_id_target_elements(args)
        actual = set([
            element.post_id
            for element in source_id_target_elements_dict["post0"]
        ])
        expected = {'post4', 'post5', 'post6'}
        self.assertSetEqual(actual, expected)

    def test_argument_parser_connection_conditions_with_timedelta(self):
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-14 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 06:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7", "2017-06-16 06:00:00",
                       u"Not Microblog")
        self._connect_claim_to_posts(u"post0", [
            u"post1", u"post2", u"post3", u"post4", u"post5", u"post6", u"post7"])
        self._db.add_author(self._author)
        self._db.session.commit()
        arg_parser = ArgumentParser(self._db)
        # Posts within a 1-day window around the claim date.
        args = self._claim_to_microblog_targeted_fields(
            [{"val1": "source.date", "val2": "dest.date",
              "op": "timeinterval", "delta": 1}])[0]
        source_id_target_elements_dict = arg_parser._get_source_id_target_elements(args)
        actual = set([
            element.post_id
            for element in source_id_target_elements_dict["post0"]
        ])
        expected = {'post2', 'post3'}
        self.assertSetEqual(actual, expected)

    def test_argument_parser_connection_conditions_with_before_timedelta(self):
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-14 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 06:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7", "2017-06-16 06:00:00",
                       u"Not Microblog")
        self._connect_claim_to_posts(u"post0", [
            u"post1", u"post2", u"post3", u"post4", u"post5", u"post6", u"post7"])
        self._db.add_author(self._author)
        self._db.session.commit()
        arg_parser = ArgumentParser(self._db)
        # Posts at most 1 day before the claim date.
        args = self._claim_to_microblog_targeted_fields(
            [{"val1": "source.date", "val2": "dest.date",
              "op": "before", "delta": 1}])[0]
        source_id_target_elements_dict = arg_parser._get_source_id_target_elements(args)
        actual = set([
            element.post_id
            for element in source_id_target_elements_dict["post0"]
        ])
        expected = {'post2', u'post3'}
        self.assertSetEqual(actual, expected)

    def test_argument_parser_connection_conditions_with_after_timedelta(self):
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-14 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 06:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-15 05:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7", "2017-06-16 06:00:00",
                       u"Not Microblog")
        self._connect_claim_to_posts(u"post0", [
            u"post1", u"post2", u"post3", u"post4", u"post5", u"post6", u"post7"])
        self._db.add_author(self._author)
        self._db.session.commit()
        arg_parser = ArgumentParser(self._db)
        # Posts at most 1 day after the claim date.
        args = self._claim_to_microblog_targeted_fields(
            [{"val1": "source.date", "val2": "dest.date",
              "op": "after", "delta": 1}])[0]
        source_id_target_elements_dict = arg_parser._get_source_id_target_elements(args)
        actual = set([
            element.post_id
            for element in source_id_target_elements_dict["post0"]
        ])
        expected = {'post3'}
        self.assertSetEqual(actual, expected)

    # ---- row factories ------------------------------------------------------

    def _add_author(self, author_guid):
        """Create (but do not yet persist) the author used by the test."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = u'test author'
        author.author_screen_name = author_guid
        author.name = u'test'
        author.domain = u'tests'
        author.statuses_count = 0
        author.created_at = u"2017-06-14 05:00:00"
        self._author = author

    def _add_post(self, title, content, date_str, domain=u'Microblog'):
        """Persist a post by the current author; the title doubles as post_id."""
        post = Post()
        post.author = self._author.author_guid
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = post.post_id
        post.date = convert_str_to_unicode_datetime(date_str)
        post.created_at = post.date
        self._db.addPost(post)
        self._posts.append(post)
        self._author.statuses_count += 1

    def _get_params(self):
        """Build the kwargs expected by the feature-generator constructors."""
        posts = {self._author.author_guid: self._posts}
        params = {'authors': [self._author], 'posts': posts}
        return params

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Persist a single claim->tweet connection row."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])
class TestAccountPropertiesFeatureGenerator(TestCase):
    """Verifies the per-author features stored by AccountPropertiesFeatureGenerator.

    setUp seeds one fully-populated author plus a single post, runs the
    generator once, and each test reads back one persisted feature.
    """

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self.author_guid = u"author_guid"

        test_author = Author()
        test_author.author_guid = self.author_guid
        test_author.author_full_name = u'author'
        test_author.name = u'author_name'
        test_author.author_screen_name = u'author_screen_name'
        test_author.domain = u'Microblog'
        test_author.statuses_count = 10
        test_author.friends_count = 5
        test_author.followers_count = 6
        test_author.favourites_count = 8
        test_author.author_sub_type = u"bot"
        test_author.author_type = u"bad"
        test_author.created_at = u"2017-06-17 05:00:00"
        test_author.default_profile = True
        test_author.default_profile_image = True
        test_author.verified = True
        self._db.add_author(test_author)

        seed_post = Post()
        seed_post.author = self.author_guid
        seed_post.author_guid = self.author_guid
        seed_post.content = u"content"
        seed_post.title = u"title"
        seed_post.domain = u"domain"
        seed_post.post_id = u"post_id"
        seed_post.guid = seed_post.post_id
        seed_post.date = convert_str_to_unicode_datetime("2017-06-14 05:00:00")
        seed_post.created_at = seed_post.date
        self._db.addPost(seed_post)
        self._db.session.commit()

        self.feature_prefix = u"AccountPropertiesFeatureGenerator_"
        generator_kwargs = {
            'authors': [test_author],
            'posts': {self.author_guid: [seed_post]},
        }
        self.account_properties_feature_generator = AccountPropertiesFeatureGenerator(
            self._db, **generator_kwargs)
        self.account_properties_feature_generator.execute()

    def tearDown(self):
        self._db.session.close()

    def _feature(self, suffix):
        """Fetch the stored author feature whose name ends with *suffix*."""
        return self._db.get_author_feature(self.author_guid,
                                           self.feature_prefix + suffix)

    def test_account_age(self):
        feature = self._feature(u"account_age")
        creation_date = parser.parse(u"2017-06-17 05:00:00").date()
        expected_days = (datetime.date.today() - creation_date).days
        self.assertEqual(expected_days, int(feature.attribute_value))

    def test_number_followers(self):
        feature = self._feature(u"number_followers")
        self.assertEqual(6, int(feature.attribute_value))

    def test_number_friends(self):
        feature = self._feature(u"number_friends")
        self.assertEqual(5, int(feature.attribute_value))

    def test_friends_followers_ratio(self):
        feature = self._feature(u"friends_followers_ratio")
        self.assertAlmostEqual(5.0 / 6, float(feature.attribute_value),
                               places=5)

    def test_number_of_crawled_posts(self):
        feature = self._feature(u"number_of_crawled_posts")
        self.assertEqual("1", feature.attribute_value)

    def test_number_of_posts(self):
        feature = self._feature(u"number_of_posts")
        self.assertEqual(10, int(feature.attribute_value))

    def test_default_profile(self):
        feature = self._feature(u"default_profile")
        self.assertEqual(u"1", feature.attribute_value)

    def test_default_profile_image(self):
        feature = self._feature(u"default_profile_image")
        self.assertEqual(u"1", feature.attribute_value)

    def test_verified(self):
        feature = self._feature(u"verified")
        self.assertEqual(u'1', feature.attribute_value)

    def test_screen_name_length(self):
        feature = self._feature(u"screen_name_length")
        self.assertEqual(18, int(feature.attribute_value))

    def test_author_screen_name(self):
        feature = self._feature(u"author_screen_name")
        self.assertEqual(u"author_screen_name", feature.attribute_value)

    def test_author_type(self):
        feature = self._feature(u"author_type")
        self.assertEqual(u"bad", feature.attribute_value)

    def test_author_sub_type(self):
        feature = self._feature(u"author_sub_type")
        self.assertEqual(u"bot", feature.attribute_value)
class TestClaimToTopicConverter(TestCase):
    """Tests for ClaimToTopicConverter.

    The converter maps each claim to a "topic" (the claim's cleaned terms)
    and then maps posts/authors to those topics via claim-tweet connections.
    Each test builds its own fixture rows through the private _add_* helpers
    and asserts on the topics / post_topic_mapping / author_topic_mapping
    tables written by the converter.
    """

    def setUp(self):
        # Fresh DB per test; one default author used by most fixtures.
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._claim_dictionary = {}  # claim_id -> Claim, for assertTopicInserted
        self._authors = []
        self._add_author(u'test author')
        self._preprocess_visualization = ClaimToTopicConverter(self._db)

    def tearDown(self):
        self._db.session.close_all()
        self._db.deleteDB()
        self._db.session.close()

    def test_generate_topics_no_topics(self):
        # No claims inserted -> no topics should be generated.
        claim_id_posts_dict = self._db.get_claim_id_posts_dict()
        self._preprocess_visualization.generate_topics_tables()
        topics = self._db.get_topics()
        self.assertEqual(topics, [])

    def test_generate_topics_from_1_claim(self):
        self._add_claim(u'claim1', u'claim1 content')
        self._db.session.commit()
        claim_id_posts_dict = self._db.get_claim_id_posts_dict()
        self._preprocess_visualization.generate_topics_tables()
        self.assertTopicInserted(u'claim1', [u'claim1', u'content'])

    def test_generate_topics_from_5_claims(self):
        # Each claim becomes its own topic whose terms are the claim's words.
        self._add_claim(u'claim1', u'claim1 content')
        self._add_claim(u'claim2', u'claim2 content')
        self._add_claim(u'claim3', u'claim3 content move')
        self._add_claim(u'claim4', u'claim4 dif data')
        self._add_claim(u'claim5', u'claim5 some boring text')
        self._db.session.commit()
        self._preprocess_visualization.generate_topics_tables()
        self.assertTopicInserted(u'claim1', [u'claim1', u'content'])
        self.assertTopicInserted(u'claim2', [u'claim2', u'content'])
        self.assertTopicInserted(u'claim3', [u'claim3', u'content', u'move'])
        self.assertTopicInserted(u'claim4', [u'claim4', u'dif', u'data'])
        self.assertTopicInserted(u'claim5',
                                 [u'claim5', u'some', u'boring', u'text'])

    def test_generate_post_topic_mapping_no_claim(self):
        self._preprocess_visualization.generate_post_topic_mapping()
        mappings = self._db.get_post_topic_mapping()
        self.assertEqual(0, len(mappings))

    def test_generate_post_topic_mapping_1_claim(self):
        # 3 posts connected to one claim -> 3 mappings, all with dist 1.0.
        self._add_claim(u'claim1', u'claim1 content')
        self._add_post(u"test author", u'post1', u'post1 content of data',
                       u'Microblog')
        self._add_post(u"test author", u'post2', u'post2 bla bla',
                       u'Microblog')
        self._add_post(u"test author", u'post3', u'post3 noting new',
                       u'Microblog')
        self._add_claim_tweet_connection(u'claim1', u'post1')
        self._add_claim_tweet_connection(u'claim1', u'post2')
        self._add_claim_tweet_connection(u'claim1', u'post3')
        self._db.session.commit()
        self._preprocess_visualization.generate_topics_tables()
        self._preprocess_visualization.generate_post_topic_mapping()
        mappings = self._db.get_post_topic_mapping()
        mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                    for tm in mappings]
        topic_id = self._preprocess_visualization.get_claim_id_topic_dictionary(
        )[u'claim1']
        self.assertEqual(3, len(mappings))
        self.assertSetEqual(
            {('post1', topic_id, 1.0), ('post2', topic_id, 1.0),
             ('post3', topic_id, 1.0)}, set(mappings))

    def test_generate_post_topic_mapping_2_claim(self):
        # Posts split 3/2 between two claims; each post maps to its claim's
        # topic with probability 1.0.
        self._add_claim(u'claim1', u'claim1 content')
        self._add_claim(u'claim2', u'claim1 content')
        self._add_post(u"test author", u'post1', u'post1 content of data',
                       u'Microblog')
        self._add_post(u"test author", u'post2', u'post2 bla bla',
                       u'Microblog')
        self._add_post(u"test author", u'post3', u'post3 noting new',
                       u'Microblog')
        self._add_post(u"test author", u'post4', u'post4 bla bla',
                       u'Microblog')
        self._add_post(u"test author", u'post5', u'post5 noting new',
                       u'Microblog')
        self._add_claim_tweet_connection(u'claim1', u'post1')
        self._add_claim_tweet_connection(u'claim1', u'post2')
        self._add_claim_tweet_connection(u'claim1', u'post3')
        self._add_claim_tweet_connection(u'claim2', u'post4')
        self._add_claim_tweet_connection(u'claim2', u'post5')
        self._db.session.commit()
        self._preprocess_visualization.generate_topics_tables()
        self._preprocess_visualization.generate_post_topic_mapping()
        mappings = self._db.get_post_topic_mapping()
        mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                    for tm in mappings]
        topic_id1 = self._preprocess_visualization.get_claim_id_topic_dictionary(
        )[u'claim1']
        topic_id2 = self._preprocess_visualization.get_claim_id_topic_dictionary(
        )[u'claim2']
        self.assertEqual(5, len(mappings))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0),
             ('post3', topic_id1, 1.0), ('post4', topic_id2, 1.0),
             ('post5', topic_id2, 1.0)}, set(mappings))

    def test__generate_author_topic_mapping_2_claim(self):
        # 'test author' wrote 3 posts for claim1 and 2 for claim2 -> (0.6, 0.4);
        # 'test author2' wrote nothing -> (0, 0).
        self._add_author(u'test author2')
        self._add_claim(u'claim1', u'claim1 content')
        self._add_claim(u'claim2', u'claim1 content')
        self._add_post(u"test author", u'post1', u'post1 content of data',
                       u'Microblog')
        self._add_post(u"test author", u'post2', u'post2 bla bla',
                       u'Microblog')
        self._add_post(u"test author", u'post3', u'post3 noting new',
                       u'Microblog')
        self._add_post(u"test author", u'post4', u'post4 bla bla',
                       u'Microblog')
        self._add_post(u"test author", u'post5', u'post5 noting new',
                       u'Microblog')
        self._add_claim_tweet_connection(u'claim1', u'post1')
        self._add_claim_tweet_connection(u'claim1', u'post2')
        self._add_claim_tweet_connection(u'claim1', u'post3')
        self._add_claim_tweet_connection(u'claim2', u'post4')
        self._add_claim_tweet_connection(u'claim2', u'post5')
        self._db.session.commit()
        self._preprocess_visualization._domain = u"Microblog"
        self._preprocess_visualization.generate_topics_tables()
        self._preprocess_visualization.generate_post_topic_mapping()
        self._preprocess_visualization.generate_author_topic_mapping()
        mapping = self._db.get_author_topic_mapping()
        self.assertEqual(2, len(mapping))
        self.assertSetEqual(
            {(u'test author', 0.6, 0.4), (u'test author2', 0, 0)},
            set(mapping))

    def test_visualization(self):
        # End-to-end execute(): checks both author and post topic mappings.
        self._add_author(u'test author2', u"bad_actor")
        self._add_claim(u'claim1', u'claim1 content')
        self._add_claim(u'claim2', u'claim2 content')
        self._add_post(u"test author", u'post1', u'post1 content of data',
                       u'Microblog')
        self._add_post(u"test author", u'post2', u'post2 bla bla',
                       u'Microblog')
        self._add_post(u"test author", u'post3', u'post3 noting new',
                       u'Microblog')
        self._add_post(u"test author2", u'post4', u'post4 bla bla',
                       u'Microblog')
        self._add_post(u"test author2", u'post5', u'post5 noting new',
                       u'Microblog')
        self._add_claim_tweet_connection(u'claim1', u'post1')
        self._add_claim_tweet_connection(u'claim1', u'post2')
        self._add_claim_tweet_connection(u'claim1', u'post4')
        self._add_claim_tweet_connection(u'claim2', u'post3')
        self._add_claim_tweet_connection(u'claim2', u'post5')
        self._db.session.commit()
        self._preprocess_visualization._domain = u"Microblog"
        self._preprocess_visualization.execute()
        author_topic_mapping = self._db.get_author_topic_mapping()
        post_topic_mappings = self._db.get_post_topic_mapping()
        post_topic_mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                               for tm in post_topic_mappings]
        topic_id1 = self._preprocess_visualization.get_claim_id_topic_dictionary(
        )[u'claim1']
        topic_id2 = self._preprocess_visualization.get_claim_id_topic_dictionary(
        )[u'claim2']
        self.assertEqual(2, len(author_topic_mapping))
        # 'test author': 2/3 posts on claim1, 1/3 on claim2.
        self.assertSetEqual(
            {(u'test author', 0.666666666667, 0.333333333333),
             (u'test author2', 0.5, 0.5)}, set(author_topic_mapping))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0),
             ('post3', topic_id2, 1.0), ('post4', topic_id1, 1.0),
             ('post5', topic_id2, 1.0)}, set(post_topic_mappings))

    # NOTE(review): a disabled "test_double_execution_visualization" test
    # (same fixture as test_visualization but calling execute() twice) used to
    # live here as commented-out code; re-enable it as a real test if
    # idempotent execution must be guaranteed.

    def assertTopicInserted(self, claim_id, expected_terms):
        """Assert that the topic generated for `claim_id` consists exactly of
        `expected_terms` (and of the claim description's cleaned words)."""
        topics = self._db.get_topics()
        terms = self._db.get_terms()
        topic_dict = defaultdict(set)
        term_dict = {term.term_id: term.description for term in terms}
        # Collect the set of term descriptions attached to each topic.
        for topic_id, term_id, prob in topics:
            topic_dict[topic_id].add(term_dict[term_id])
        topic_id = self._preprocess_visualization.get_claim_id_topic_dictionary(
        )[claim_id]
        claim = self._claim_dictionary[claim_id]
        expected = set(clean_tweet(claim.description).split(' '))
        self.assertIn(topic_id, topic_dict)
        self.assertSetEqual(expected, topic_dict[topic_id])
        self.assertSetEqual(set(expected_terms), topic_dict[topic_id])

    def _add_author(self, author_guid, type=u"good_actor"):
        """Insert an Author row whose name fields all equal `author_guid`."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = author_guid
        author.author_screen_name = author_guid
        author.name = author_guid
        author.domain = u'Microblog'
        author.author_type = type
        self._db.add_author(author)
        self._authors.append(author)

    def _add_post(self, author_guid, title, content, domain=u'Microblog'):
        """Insert a Post row; the title doubles as post_id/guid."""
        post = Post()
        post.author = author_guid
        post.author_guid = author_guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = post.post_id
        post.is_detailed = True
        post.is_LB = False
        self._db.addPost(post)
        self._posts.append(post)

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a post to a claim via Claim_Tweet_Connection."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])
        pass

    def _add_claim(self,
                   claim_id,
                   content,
                   date_str=u"2017-06-14 05:00:00",
                   keywords=u"",
                   post_type=None):
        """Insert a Claim row and remember it for assertTopicInserted."""
        claim = Claim()
        claim.claim_id = claim_id
        claim.verdict = post_type
        claim.title = claim_id
        claim.description = content
        claim.verdict_date = convert_str_to_unicode_datetime(date_str)
        claim.keywords = keywords
        claim.url = u"claim url"
        self._db.addPost(claim)
        self._claim_dictionary[claim.claim_id] = claim
class Word_Embeddings_Differential_Feature_Generator_Unittests(
        unittest.TestCase):
    """Tests for Word_Embedding_Differential_Feature_Generator.

    The generator computes, per author, differential features between the
    word embeddings of target-article titles and post contents (per-dimension
    sums/means and distance-function features). Tests train a GloVe-based
    model over the inserted fixtures and compare stored feature values against
    values recomputed here from the word-embedding dictionary.
    """

    def setUp(self):
        self._config_parser = getConfig()
        self._db = DB()
        self._db.setUp()
        self._model = Word_Embedding_Differential_Feature_Generator(self._db)
        self._posts = []
        self._author = None
        self._set_author(u'test_user')

    def tearDown(self):
        self._db.session.close()
        self._db.deleteDB()
        self._db.session.close()

    def test_simple_case(self):
        # Post content contributes only 'is'; article title contributes only
        # 'was' (stop-word-like fillers are excluded from the dictionary), so
        # the differential at dimension 0 is was[0] - is[0].
        self._add_post(u'of to a for', u'of is')
        self._add_target_article(u'0', u'of was ', u'am that was')
        self._setup_test()
        is_vec1 = self._get_word_dimension(u'is', 0)
        was_vec_d1 = self._get_word_dimension(u'was', 0)
        expected_val = was_vec_d1 - is_vec1
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d0"
        ).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)
        # With a single word on each side, mean == sum.
        expected_val = was_vec_d1 - is_vec1
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_np.mean_target_articles_title_to_posts_content_d0"
        ).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)
        # Distance-function feature: euclidean distance between the two
        # aggregated (here single-word) vectors.
        is_vec = self._words[u'is']
        was_vec = self._words['was']
        expected_val = commons.euclidean_distance(is_vec, was_vec)
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_distance_function_euclidean_distance_target_articles_title_np.mean_TO_posts_content_np.mean"
        ).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)

    def test_few_words(self):
        # Two contributing words per side; check two different dimensions of
        # the per-dimension sum differential.
        self._add_post(u'of to a for', u'of is on')
        self._add_target_article(u'0', u'of was that', u'am that was')
        self._setup_test()
        dimension = 0
        is_vec1 = self._get_word_dimension(u'is', dimension)
        on_vec1 = self._get_word_dimension(u'on', dimension)
        tot1 = is_vec1 + on_vec1
        was_vec_d1 = self._get_word_dimension(u'was', dimension)
        that_vec_d1 = self._get_word_dimension(u'that', dimension)
        tot2 = was_vec_d1 + that_vec_d1
        expected_val = tot2 - tot1
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d"
            + str(dimension)).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)
        dimension = 140
        is_vec1 = self._get_word_dimension(u'is', dimension)
        on_vec1 = self._get_word_dimension(u'on', dimension)
        tot1 = is_vec1 + on_vec1
        was_vec_d1 = self._get_word_dimension(u'was', dimension)
        that_vec_d1 = self._get_word_dimension(u'that', dimension)
        tot2 = was_vec_d1 + that_vec_d1
        expected_val = tot2 - tot1
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d"
            + str(dimension)).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)

    def test_opposite(self):
        # Same words as test_few_words but with post/article roles swapped,
        # so the expected differential has the opposite sign.
        self._add_post(u'am that was', u'of was that')
        self._add_target_article(u'0', u'of is on', u'of to a for')
        self._setup_test()
        dimension = 0
        is_vec1 = self._get_word_dimension(u'is', dimension)
        on_vec1 = self._get_word_dimension(u'on', dimension)
        tot1 = is_vec1 + on_vec1
        was_vec_d1 = self._get_word_dimension(u'was', dimension)
        that_vec_d1 = self._get_word_dimension(u'that', dimension)
        tot2 = was_vec_d1 + that_vec_d1
        expected_val = tot1 - tot2
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d"
            + str(dimension)).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)

    def test_empty_word(self):
        # Smoke test: an empty post content must not crash the generator.
        self._add_post(u'of to a for', u'')
        self._add_target_article(u'0', u'of was that', u'am that was')
        self._setup_test()
        self.assertTrue(True)

    def _add_post(self, title, content):
        """Insert a Post authored by the fixture author; post_id is the
        insertion index."""
        post = Post()
        post.author = self._author.author_full_name
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = u'test'
        post.post_id = len(self._posts)
        post.guid = post.post_id
        self._db.addPost(post)
        self._posts.append(post)

    def _set_author(self, author_guid):
        """Create and register the single author used by all tests."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = u'name' + author_guid
        author.name = u'name' + author_guid
        author.domain = u'test'
        self._db.add_author(author)
        self._author = author

    def _setup_test(self):
        """Commit fixtures, train the GloVe model, run the generator under
        test, and cache the resulting word-embedding dictionary."""
        self._db.session.commit()
        self._word_embedding_model_creator = GloveWordEmbeddingModelCreator(
            self._db)
        self._word_embedding_model_creator.execute(None)
        params = {'authors': [self._author], 'posts': self._posts}
        self._model = Word_Embedding_Differential_Feature_Generator(
            self._db, **params)
        self._model.execute()
        self._words = self._db.get_word_embedding_dictionary()

    def _get_word_dimension(self, word, dimension):
        # Single component of a word's embedding vector.
        word_vec = self._words[word]
        return word_vec[dimension]

    def _add_target_article(self, post_id, title, description):
        """Insert a Target_Article owned by the fixture author."""
        target_article = Target_Article()
        target_article.author_guid = u'test_user'
        target_article.post_id = post_id
        target_article.title = title
        target_article.description = description
        target_article.keywords = 'temp, lala, fafa'
        self._db.add_target_articles([target_article])

    def _add_target_article_item(self, post_id, type, content, author_guid):
        """Insert a Target_Article_Item (unused by current tests, kept for
        parity with sibling test classes)."""
        article_item = Target_Article_Item()
        article_item.post_id = post_id
        article_item.type = type
        article_item.item_number = 3
        article_item.content = content
        article_item.author_guid = author_guid
        self._db.addPosts([article_item])
class TestGensimWordEmbeddingsModelTrainer(TestCase):
    """Tests for GensimWordEmbeddingsModelTrainer.

    The trainer learns word embeddings from post contents, aggregates them
    per source id (min / max / np.mean / sum), and writes the aggregated
    vectors to a CSV named after the target table. Tests recompute the
    expected aggregations here from the trained model's word vectors and
    compare them against the CSV output.
    """

    def setUp(self):
        self._config_parser = getConfig()
        self._db = DB()
        self._db.setUp()
        self._is_load_wikipedia_300d_glove_model = True
        self._wikipedia_model_file_path = "data/input/glove/test_glove.6B.300d_small.txt"
        self._table_name = "wikipedia_model_300d"
        self._word_vector_dict_full_path = "data/output/word_embedding/"
        self._word_vector_dict = {}
        self._author = None
        self._set_author(u'test_user')
        self._counter = 0
        self._posts = []

    def tearDown(self):
        self._db.session.close()

    def test_add_additional_fields_to_existing_table(self):
        self._add_post(u'was', u'is')
        self._add_post(u'is', u'was')
        self._db.session.commit()
        self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer(
            self._db)
        self._word_embedding_model_creator.execute(None)
        # Second run with only 'sum' must add to (not clobber) the first
        # run's default aggregations.
        self._word_embedding_model_creator._aggregation_functions_names = [
            'sum'
        ]
        self._word_embedding_model_creator.execute(None)
        data = self._load_output_csv()
        word_embedding_results = data.loc[(data['author_id'] == 'test_user')
                                          & (data['table_name'] == u'posts')
                                          & (data['targeted_field_name'] ==
                                             u'content')]
        sum_value_df = word_embedding_results.loc[
            word_embedding_results[u'word_embedding_type'] == u'sum']
        mean_value_df = word_embedding_results.loc[
            word_embedding_results[u'word_embedding_type'] == u'np.mean']
        # Assert directly instead of the old try/bare-except, which also
        # swallowed unrelated errors and converted them into opaque failures.
        self.assertGreater(len(sum_value_df), 0)
        self.assertGreater(len(mean_value_df), 0)

    def test_case_post_represent_by_posts(self):
        # post1 (a claim) is "represented" by post2 and post3 via
        # claim_tweet_connection; its embedding aggregates their contents.
        self._add_post(u'post1', u'the claim', u'Claim')
        self._add_post(u'post2', u'dog cat pig man')  # 2
        self._add_post(u'post3', u'TV is the best guys')  # 1
        self._add_claim_tweet_connection(u'post1', u'post2')
        self._add_claim_tweet_connection(u'post1', u'post3')
        self._db.session.commit()
        self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer(
            self._db)
        self._word_embedding_model_creator._targeted_fields_for_embedding = [{
            'source': {
                'table_name': 'posts',
                'id': 'post_id'
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id'
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": []
            }
        }]
        self._word_embedding_model_creator.execute(None)
        model_name_path = self._word_embedding_model_creator._prepare_model_name_path(
        )
        model = Word2Vec.load(model_name_path)
        word_vector_dict = self._word_embedding_model_creator._get_word_embedding_dict(
            model)
        self._words = word_vector_dict
        self._words_vectors = self._get_posts_val()
        expected_val = self._calc_results()
        self._generic_test(expected_val, u'post1')

    def _load_output_csv(self):
        """Load the trainer's output CSV as a DataFrame.

        pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in
        1.0; pd.read_csv with index_col=0 is the documented replacement.
        """
        file_output_path = (
            self._word_embedding_model_creator._saved_models_path +
            self._word_embedding_model_creator._table_name + ".csv")
        return pd.read_csv(file_output_path, index_col=0)

    def _setup_test(self):
        """Commit fixtures, train embeddings, and cache the word vectors."""
        self._db.session.commit()
        self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer(
            self._db)
        self._word_embedding_model_creator.execute(None)
        self._words = self._db.get_word_embedding_dictionary()
        self._words_vectors = self._get_posts_val()

    def _generic_test(self, expected_value, source_id=u""):
        """Assert min/max/np.mean rows in the output CSV for `source_id`
        (defaults to the fixture author) match `expected_value`."""
        if source_id == u"":
            source_id = self._author.author_guid
        data = self._load_output_csv()
        word_embedding_results = data.loc[(data['author_id'] == source_id)
                                          & (data['table_name'] == u'posts')
                                          & (data['targeted_field_name'] ==
                                             u'content')]
        self.assert_word_embedding(word_embedding_results, expected_value,
                                   u'min')
        self.assert_word_embedding(word_embedding_results, expected_value,
                                   u'max')
        self.assert_word_embedding(word_embedding_results, expected_value,
                                   u'np.mean')

    def assert_word_embedding(self, db_results, expected_value, type):
        # Columns '0'.. hold the embedding dimensions for one aggregation row.
        result_value = db_results.loc[db_results[u'word_embedding_type'] ==
                                      type, '0':].values.tolist()[0]
        # assertEquals is a deprecated alias of assertEqual.
        self.assertEqual(list(expected_value[type]), result_value)

    def _generic_non_equal_test(self, expected_value):
        """Assert stored aggregations differ from `expected_value`."""
        db_results = self._db.get_author_word_embedding(
            self._author.author_guid, u'posts', u'content')
        self.assertNotEqual(expected_value[u'min'], db_results[u'min'])
        self.assertNotEqual(expected_value[u'max'], db_results[u'max'])
        self.assertNotEqual(expected_value[u'np.mean'],
                            db_results[u'np.mean'])

    def _set_author(self, author_guid):
        """Create and register the fixture author."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = u'name' + author_guid
        author.name = u'name' + author_guid
        author.domain = u'test'
        self._db.add_author(author)
        self._author = author

    def _add_post(self, title, content, _domain=u'Microblog'):
        """Insert a Post by the fixture author; the title doubles as id."""
        post = Post()
        post.author = self._author.author_full_name
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = _domain
        post.post_id = title
        post.guid = title
        self._db.addPost(post)
        self._posts.append(post)

    def _get_posts_val(self):
        """Return the embedding vectors for all distinct words that appear in
        the added posts' contents and have a trained vector."""
        vals = {}
        for post in self._posts:
            for word in post.content.split():
                if word in self._words.keys():
                    vals[word] = self._words[word]
        return vals.values()

    def _aggregate(self, vectors, func):
        """Apply `func` per embedding dimension across `vectors`.

        Previously each _calc_* ignored its `vectors` argument and recomputed
        _get_posts_val() via eval('...'); now the parameter is honored and the
        aggregation function is passed directly.
        """
        vectors = list(vectors)
        if len(vectors) == 0:
            return (0, ) * 300  # model dimensionality used by the fixtures
        return tuple(func(dims) for dims in zip(*vectors))

    def _calc_mean(self, vectors):
        return self._aggregate(vectors, np.mean)

    def _calc_min(self, vectors):
        return self._aggregate(vectors, min)

    def _calc_max(self, vectors):
        return self._aggregate(vectors, max)

    def _calc_results(self):
        """Compute the expected min/max/np.mean aggregations."""
        vectors = self._words_vectors
        results = {}
        results[u'min'] = self._calc_min(vectors)
        results[u'max'] = self._calc_max(vectors)
        results[u'np.mean'] = self._calc_mean(vectors)
        return results

    def _add_target_article(self, post_id, title, description, author_guid):
        """Insert a Target_Article (kept for parity with sibling classes)."""
        target_article = Target_Article()
        target_article.author_guid = author_guid
        target_article.post_id = post_id
        target_article.title = title
        target_article.description = description
        target_article.keywords = 'temp, lala, fafa'
        self._db.add_target_articles([target_article])

    def _add_target_article_item(self, post_id, type, content, author_guid):
        """Insert a Target_Article_Item (kept for parity)."""
        article_item = Target_Article_Item()
        article_item.post_id = post_id
        article_item.type = type
        article_item.item_number = 3
        article_item.content = content
        article_item.author_guid = author_guid
        self._db.addPosts([article_item])

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a post to a claim via Claim_Tweet_Connection."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])
class Data_Handler_Unittests(TestCase):
    """Tests for Data_Handler's labeled-feature-dataframe construction:
    prefix-based feature selection/removal, NaN filling/column dropping,
    and k-fold fragment splitting.

    Fixture authors are built via _create_author_with_features; feature
    names default to their insertion index ('0', '1', ...) unless set
    explicitly with _create_author_feature_with_name.
    """

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        # 'author_type' is the label column the handler extracts.
        self._data_handler = Data_Handler(self._db, 'author_type')
        self._authors_to_author_features_dict = {}  # guid -> [AuthorFeatures]
        self._fill_empty = True
        self._remove_features = []
        self._select_features = []
        self._label_text_to_value = {'good': 0, 'bad': 1}

    def tearDown(self):
        self._db.session.close()
        self._db.deleteDB()
        self._db.session.close()

    def test_basic_case(self):
        # Smoke test: the pipeline runs end-to-end on a clean fixture.
        self._create_author_with_features('1', 'good',
                                          (10, 11, 12, 13, 14, 15, 16))
        self._create_author_with_features('2', 'bad',
                                          (20, 21, 22, 23, 24, 25, 26))
        self._create_author_with_features('3', 'good',
                                          (30, 31, 32, 33, 34, 35, 36))
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features,
            self._label_text_to_value)
        self.assertEqual(1, 1)

    def test_remove_by_prefix(self):
        # Removing prefixes 'feature_test' and 'bla' leaves only 'dada'.
        self._create_author('123', 'bad')
        self._create_author_feature_with_name('123', 3, 'feature_test')
        self._create_author_feature_with_name('123', 5, 'feature_test2')
        self._create_author_feature_with_name('123', 5, 'bla_bla')
        self._create_author_feature_with_name('123', 5, 'dada')
        self._create_author_feature_with_name('123', 6, 'bla_bli')
        self._data_handler._remove_features_by_prefix = [
            'feature_test', 'bla'
        ]
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features,
            self._label_text_to_value)
        feature_num = len(authors_features_dataframe.columns)
        self.assertEqual(feature_num, 1)

    def test_remove_by_prefix_2(self):
        # Removing only 'feature_test' leaves the three non-matching columns.
        self._create_author('123', 'bad')
        self._create_author_feature_with_name('123', 3, 'feature_test')
        self._create_author_feature_with_name('123', 5, 'feature_test2')
        self._create_author_feature_with_name('123', 5, 'bla_bla')
        self._create_author_feature_with_name('123', 5, 'dada')
        self._create_author_feature_with_name('123', 6, 'bla_bli')
        self._data_handler._remove_features_by_prefix = ['feature_test']
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features,
            self._label_text_to_value)
        feature_num = len(authors_features_dataframe.columns)
        self.assertEqual(3, feature_num)

    def test_select_by_prefix(self):
        # Selecting 'feature_test' keeps exactly the two matching columns.
        self._create_author('123', 'bad')
        self._create_author_feature_with_name('123', 3, 'feature_test')
        self._create_author_feature_with_name('123', 5, 'feature_test2')
        self._create_author_feature_with_name('123', 5, 'bla_bla')
        self._create_author_feature_with_name('123', 5, 'dada')
        self._create_author_feature_with_name('123', 6, 'bla_bli')
        self._data_handler._select_features_by_prefix = ['feature_test']
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features,
            self._label_text_to_value)
        feature_num = len(authors_features_dataframe.columns)
        self.assertEqual(2, feature_num)

    def test_select_by_prefix2(self):
        # Prefix match is exact: 'bla*' matches bla_bla/blada/bla_bli but
        # not 'bloom_bla'.
        self._create_author('123', 'bad')
        self._create_author_feature_with_name('123', 3, 'feature_test')
        self._create_author_feature_with_name('123', 5, 'feature_test2')
        self._create_author_feature_with_name('123', 5, 'bla_bla')
        self._create_author_feature_with_name('123', 5, 'bloom_bla')
        self._create_author_feature_with_name('123', 5, 'blada')
        self._create_author_feature_with_name('123', 6, 'bla_bli')
        self._data_handler._select_features_by_prefix = ['bla']
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features,
            self._label_text_to_value)
        feature_num = len(authors_features_dataframe.columns)
        self.assertEqual(3, feature_num)

    def test_fill_and_drop_nan(self):
        # Column '3' is all-None -> dropped; column '1' has one None.
        self._create_author_with_features('1', 'good', (10, None, 12, None))
        self._create_author_with_features('2', 'bad', (20, 24, 22, None))
        self._create_author_with_features('3', 'bad', (30, 34, 32, None))
        # 'zero' mode fills the missing cell with 0.
        self._data_handler._fill_empty = 'zero'
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features,
            self._label_text_to_value)
        null_val = authors_features_dataframe.iloc[0][u'1']
        self.assertEqual(null_val, 0)
        did_remove_empty_column = u'3' not in authors_features_dataframe.columns
        self.assertTrue(did_remove_empty_column)
        # 'mean' mode fills with the column mean of the present values.
        self._data_handler._fill_empty = 'mean'
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features,
            self._label_text_to_value)
        null_val = authors_features_dataframe.iloc[0][u'1']
        self.assertEqual(null_val, (24 + 34) / 2)

    def test_get_split(self):
        # 4 authors, 2 folds: fragment 0 holds authors 1-2, fragment 1
        # holds authors 3-4 (feature '0' of author k is k*10+1).
        self._auto_create_authors(4, 7)
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features,
            self._label_text_to_value)
        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(
            authors_features_dataframe, authors_labels, 0, 2)
        self.assertEqual(test_set.iloc[0][u'0'], 11)
        self.assertEqual(test_set.iloc[1][u'0'], 21)
        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(
            authors_features_dataframe, authors_labels, 1, 2)
        self.assertEqual(test_set.iloc[0][u'0'], 31)
        self.assertEqual(test_set.iloc[1][u'0'], 41)

    def test_train_and_test_differ(self):
        # With 7 authors in 7 folds, no author's feature value may appear in
        # both the test and the train fragment (checked for folds 0 and 6).
        author_number = 7
        self._auto_create_authors(author_number, 9)
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features,
            self._label_text_to_value)
        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(
            authors_features_dataframe, authors_labels, 0, 7)
        for num in range(author_number):
            author_guid = (num + 1) * 10 + 1
            is_in_both = self._is_val_in_datatframe(
                test_set, author_guid) == self._is_val_in_datatframe(
                    train_set, author_guid)
            if is_in_both:
                logging.info("in both " + str(author_guid))
            self.assertFalse(is_in_both)
        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(
            authors_features_dataframe, authors_labels, 6, 7)
        for num in range(author_number):
            author_guid = (num + 1) * 10 + 1
            is_in_both = self._is_val_in_datatframe(
                test_set, author_guid) == self._is_val_in_datatframe(
                    train_set, author_guid)
            if is_in_both:
                logging.info("in both " + str(author_guid))
            self.assertFalse(is_in_both)

    def _auto_create_authors(self, author_num, num_of_features):
        """Create `author_num` authors; author k gets features valued
        k*10+1 .. k*10+num_of_features and label str(k*1000+k)."""
        for num in range(author_num):
            author_name = num + 1
            feature = []
            for feature_name in range(num_of_features):
                feature.append(str(author_name * 10 + feature_name + 1))
            author_type = str(author_name * 1000 + author_name)
            self._create_author_with_features(str(author_name), author_type,
                                              feature)

    def _compare_authors_features_to_author(self):
        pass

    def _create_author_with_features(self, author_guid, author_type,
                                     feature_values):
        """Create an author plus one feature row per value (auto-named by
        insertion index)."""
        self._create_author(author_guid, author_type)
        for feature_value in feature_values:
            self._create_author_feature(author_guid, feature_value)
        self._db.session.commit()

    def _create_author(self, guid, author_type):
        """Insert an Author row with the given label (author_type)."""
        author = Author()
        author.name = unicode(guid)
        author.domain = u'Microblog'
        author.author_guid = unicode(guid)
        author.author_screen_name = u'TestUser1'
        author.author_type = author_type
        # NOTE(review): overwrites the 'Microblog' domain set two lines up —
        # presumably intentional for these fixtures, but worth confirming.
        author.domain = u'Restaurant'
        author.author_osn_id = 1
        self._authors_to_author_features_dict[author.author_guid] = []
        self._db.add_author(author)

    def _create_author_feature(self, author_guid, value):
        # Feature name = current feature count, so names are '0', '1', ...
        feature_name = str(
            len(self._authors_to_author_features_dict[author_guid]))
        self._create_author_feature_with_name(author_guid, value,
                                              feature_name)

    def _create_author_feature_with_name(self, author_guid, value,
                                         feature_name):
        """Insert one AuthorFeatures row for `author_guid`."""
        author_feature = AuthorFeatures()
        author_feature.author_guid = author_guid
        author_feature.window_start = date('2010-01-01 00:00:00')
        author_feature.window_end = date('2020-01-01 23:59:59')
        author_feature.attribute_name = feature_name
        author_feature.attribute_value = value
        self._authors_to_author_features_dict[author_guid].append(
            author_feature)
        self._db.update_author_features((author_feature))
        self._db.session.commit()

    def _is_val_in_datatframe(self, df, value):
        """Linear scan: True iff `value` equals any cell of `df`."""
        for row in range(df.shape[0]):  # df is the DataFrame
            for col in range(df.shape[1]):
                if df.iloc[row][col] == value:
                    return True
        return False

    def _get_random_guid(self):
        return unicode(uuid.uuid4())
class TestTimelineOverlapVisualizationGenerator(TestCase):
    """Tests for TimelineOverlapVisualizationGenerator.

    Seeds two authors with identical 10-post timelines — one pre-labeled
    bad_actor/acquired, one unlabeled — and checks that generating the
    overlap CSV leaves the acquired author's labels intact.
    """

    def setUp(self):
        self.config = getConfig()
        self._db = DB()
        self._db.setUp()
        self.timeline_overlap = TimelineOverlapVisualizationGenerator()
        # Pre-labeled bad actor and an unlabeled control author, each with
        # the same 10-post timeline so their timelines fully overlap.
        self._add_author('acquired_user', 1, author_type='bad_actor',
                         author_sub_type='acquired')
        self._add_posts('acquired_user', 'bad_post')
        self._add_author('TestUser1', 2)
        self._add_posts('TestUser1', 'TestPost')
        self._db.commit()

    def _add_author(self, guid, osn_id, author_type=None, author_sub_type=None):
        """Insert an author whose name/guid/screen-name/full-name all equal `guid`.

        `author_type`/`author_sub_type` are only assigned when provided, so the
        unlabeled author keeps those attributes unset, as before.
        """
        author = Author()
        author.name = guid
        author.domain = 'Microblog'
        author.author_guid = guid
        author.author_screen_name = guid
        author.author_full_name = guid
        author.author_osn_id = osn_id
        author.created_at = datetime.datetime.now()
        author.missing_data_complementor_insertion_date = datetime.datetime.now()
        author.xml_importer_insertion_date = datetime.datetime.now()
        if author_type is not None:
            author.author_type = author_type
        if author_sub_type is not None:
            author.author_sub_type = author_sub_type
        self._db.add_author(author)

    def _add_posts(self, author_guid, guid_prefix):
        """Insert posts <prefix>1..<prefix>10 with identical numbered content."""
        for i in range(1, 11):
            post = Post()
            post.post_id = guid_prefix + str(i)
            post.author = author_guid
            post.guid = guid_prefix + str(i)
            post.date = datetime.datetime.now()
            post.domain = 'Microblog'
            post.author_guid = author_guid
            post.content = 'InternetTV love it' + str(i)
            post.xml_importer_insertion_date = datetime.datetime.now()
            self._db.addPost(post)

    def test_generate_timeline_overlap_csv(self):
        self.timeline_overlap.setUp()
        self.timeline_overlap.generate_timeline_overlap_csv()
        # The pre-labeled author must keep its bad-actor labels after generation.
        author = self._db.get_author_by_author_guid('acquired_user')
        self.assertEqual(author.author_type, 'bad_actor')
        self.assertEqual(author.author_sub_type, 'acquired')

    def tearDown(self):
        # NOTE(review): Session.close_all() is deprecated in modern SQLAlchemy;
        # kept as-is to match the project's DB wrapper expectations.
        self._db.session.close_all()
        self._db.session.close()
        self._db.deleteDB()
        self._db.session.close()
class TF_IDF_Feature_Generator_Unittests(unittest.TestCase):
    """Unit tests for TF_IDF_Feature_Generator's tf and tf-idf computations."""

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._domain = u'test'
        self._posts = []
        self._authors = []
        self._texts = []

    def tearDown(self):
        self._db.session.close()
        self._db.deleteDB()
        self._db.session.close()

    def execute_module(self):
        """Run the generator (stemming disabled) over every post in the 'test' domain."""
        parameters = {
            "authors": self._authors,
            "posts": self._db.get_posts_by_domain('test'),
            "graphs": [],
        }
        self._module = TF_IDF_Feature_Generator(self._db, **parameters)
        self._module._stemming = False
        self._module.execute(window_start=None)

    def test_tf_idf(self):
        sample_text = 'this is a a sample'
        example_text = 'this is another another example example example'
        self._add_author(u'1')
        self._add_post('ta da', sample_text, '1')
        self._add_author(u'2')
        self._add_post('ta dddda', example_text, '2')
        self.execute_module()

        corpus = [sample_text, example_text]
        # 'example' appears only in the second document.
        self._module.clear_memo_dicts()
        self.assertAlmostEqual(
            self._module.tfidf('example', example_text, corpus, {}), 0.129, places=3)
        self._module.clear_memo_dicts()
        self.assertAlmostEqual(
            self._module.tfidf('example', sample_text, corpus, {}), 0.0, places=2)
        # Raw term frequencies: 1/5 and 1/7 occurrences of 'this'.
        self._module.clear_memo_dicts()
        self.assertAlmostEqual(self._module.tf('this', sample_text), 0.2)
        self._module.clear_memo_dicts()
        self.assertAlmostEqual(self._module.tf('this', example_text), 0.1428, places=3)

    def test_tf_idf_complicated(self):
        text1 = 'this is a a sample'
        text2 = 'this is another another example example example'
        text3 = 'hello world'
        text4 = 'hello big world'
        text5 = 'hello Israel'
        text6 = 'i live in israel'
        self._add_author(u'1')
        for body in (text1, text3, text4):
            self._add_post(body, body, u'1')
        self._add_author(u'2')
        for body in (text2, text5, text6):
            self._add_post(body, body, u'2')
        self.execute_module()

        # 'example' is 3 of 7 tokens in text2 and occurs in 1 of 6 documents.
        expected = (3.0 / 7) * abs(math.log((1.0 / 6), 10))
        self._module.clear_memo_dicts()
        self.assertAlmostEqual(
            self._module.tfidf('example', text2, self._texts, {}), expected, places=4)
        self._module.clear_memo_dicts()
        self.assertAlmostEqual(
            self._module.tfidf('example', text1, self._texts, {}), 0.0, places=2)
        self._module.clear_memo_dicts()

    def _add_author(self, author_guid):
        """Insert a minimal author in the 'test' domain and track it locally."""
        new_author = Author()
        new_author.author_guid = author_guid
        new_author.author_full_name = u'test author'
        new_author.name = u'test'
        new_author.domain = u'test'
        self._db.add_author(new_author)
        self._db.session.commit()
        self._authors.append(new_author)

    def _add_post(self, title, content, author_guid):
        """Insert a post for `author_guid`; post_id is the running post count."""
        new_post = Post()
        new_post.author = author_guid
        new_post.author_guid = author_guid
        new_post.content = content
        new_post.title = title
        new_post.domain = u'test'
        new_post.post_id = len(self._posts)
        new_post.guid = new_post.post_id
        new_post.date = date('2020-01-01 23:59:59')
        self._db.addPost(new_post)
        self._db.session.commit()
        self._posts.append(new_post)
        self._texts.append(content)
class TestOldTweetsCrawler(TestCase):  # I checked the test at 21/08/2018 there is a chance that the return tweet count will change (I hope not)
    """Tests for OldTweetsCrawler's date-bounded tweet retrieval.

    NOTE(review): these appear to be integration tests against a live tweet
    source (counts and dates are real-world dependent) — confirm before
    running in CI. The crawler's private _limit_*/_max_num_tweets/_month_interval
    attributes are poked directly to configure each scenario.
    """

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self.tweets_crawler = OldTweetsCrawler(self._db)
        self.tweets_crawler._domain = u'Claim'
        self._add_author(u"author_guid")
        self._claims = {}

    def tearDown(self):
        self._db.session.close()

    def test_retrieve_tweets_by_content_between_dates_after(self):
        self._add_claim(u"post0", u"The Rock Running for President",
                        u"2017-02-03 00:00:00")
        self._db.commit()
        date_interval_dict = defaultdict(set)  # NOTE(review): unused in this test
        claim_date = self._claims[u"post0"].verdict_date
        until_date = str_to_date(u"2017-08-03 00:00:00")
        self.tweets_crawler._limit_start_date = True
        self.tweets_crawler._limit_end_date = True
        # Window: claim date (inclusive) .. until_date (exclusive).
        tweets = self.tweets_crawler._retrieve_tweets_between_dates(
            self._claims[u"post0"], u"The Rock Running for President",
            date_to_str(claim_date, "%Y-%m-%d"), date_to_str(until_date, "%Y-%m-%d"))
        # `map` is a one-shot iterator on Py3; consumed exactly once below.
        tweets_date = map(lambda tweet: tweet.date, tweets)
        self.assertTrue(
            all([claim_date <= date < until_date for date in tweets_date]))
        self.assertGreaterEqual(100, len(tweets))

    def test_retrieve_tweets_by_content_between_dates_before(self):
        self._add_claim(u"post0", u"The Rock Running for President",
                        u"2017-02-03 00:00:00")
        self._db.commit()
        date_interval_dict = defaultdict(set)  # NOTE(review): unused in this test
        claim_date = self._claims[u"post0"].verdict_date
        since_date = str_to_date(u"2016-08-03 00:00:00")
        self.tweets_crawler._limit_start_date = True
        self.tweets_crawler._limit_end_date = True
        # Window: since_date (inclusive) .. claim date (exclusive).
        tweets = self.tweets_crawler._retrieve_tweets_between_dates(
            self._claims[u"post0"], u"The Rock Running for President",
            date_to_str(since_date, "%Y-%m-%d"), date_to_str(claim_date, "%Y-%m-%d"))
        tweets_date = map(lambda tweet: tweet.date, tweets)
        self.assertTrue(
            all([since_date <= date < claim_date for date in tweets_date]))
        self.assertGreaterEqual(100, len(tweets))

    def test_retrieve_tweets_by_content_between_dates_1_month_interval(self):
        self._add_claim(u"post0", u"The Rock Running for President",
                        u"2017-02-03 00:00:00")
        self._db.commit()
        # One month before/after the claim's verdict date.
        since_date = str_to_date(u"2017-01-03 00:00:00")
        until_date = str_to_date(u"2017-03-03 00:00:00")
        self.tweets_crawler._limit_start_date = True
        self.tweets_crawler._limit_end_date = True
        self.tweets_crawler._max_num_tweets = 133
        self.tweets_crawler._month_interval = 1
        tweets = self.tweets_crawler._retrieve_old_tweets(
            self._claims[u"post0"], u"The Rock Running for President")
        tweets_date = map(lambda tweet: tweet.date, tweets)
        self.assertTrue(
            all([since_date <= date < until_date for date in tweets_date]))
        self.assertGreaterEqual(133, len(tweets))

    def test_retrieve_tweets_by_content_between_dates_no_limit_after(self):
        self._add_claim(u"post0", u"The Rock Running for President",
                        u"2017-02-03 00:00:00")
        self._db.commit()
        since_date = str_to_date(u"2017-01-03 00:00:00")
        until_date = str_to_date(u"2017-03-03 00:00:00")
        # Only the start of the window is enforced here.
        self.tweets_crawler._limit_start_date = True
        self.tweets_crawler._limit_end_date = False
        self.tweets_crawler._max_num_tweets = 250
        self.tweets_crawler._month_interval = 1
        tweets = self.tweets_crawler._retrieve_old_tweets(
            self._claims[u"post0"], u"The Rock Running for President")
        tweets_date = map(lambda tweet: tweet.date, tweets)
        self.assertTrue(all([since_date <= date for date in tweets_date]))
        self.assertGreaterEqual(250, len(tweets))

    def test_retrieve_tweets_by_content_between_dates_no_limit_before(self):
        self._add_claim(u"post0", u"The Rock Running for President",
                        u"2017-02-03 00:00:00")
        self._db.commit()
        since_date = str_to_date(u"2017-01-03 00:00:00")
        until_date = str_to_date(u"2017-03-03 00:00:00")
        # Only the end of the window is enforced here.
        self.tweets_crawler._limit_start_date = False
        self.tweets_crawler._limit_end_date = True
        self.tweets_crawler._max_num_tweets = 250
        self.tweets_crawler._month_interval = 1
        tweets = self.tweets_crawler._retrieve_old_tweets(
            self._claims[u"post0"], u"The Rock Running for President")
        tweets_date = map(lambda tweet: tweet.date, tweets)
        self.assertTrue(all([date < until_date for date in tweets_date]))
        self.assertGreaterEqual(250, len(tweets))

    def test_execute_retrieve_tweets_by_full_content_1_month_interval(self):
        self._add_claim(
            u"post0", u"The Rock Running for President", u"2017-02-03 00:00:00",
            u"The Rock Running for President, Dwayne Running for President")
        self._db.commit()
        self.tweets_crawler._limit_start_date = True
        self.tweets_crawler._limit_end_date = True
        self.tweets_crawler._max_num_tweets = 133
        self.tweets_crawler._month_interval = 1
        self.tweets_crawler._actions = ['get_old_tweets_by_claims_content']
        self.tweets_crawler.execute()
        tweets_before = self.tweets_crawler._claim_id_tweets_id_before_dict[u"post0"]
        tweets_after = self.tweets_crawler._claim_id_tweets_id_after_dict[u"post0"]
        # Before/after buckets must be disjoint.
        self.assertEqual(0, len(tweets_before & tweets_after))
        tweets_retrieved = len(tweets_before) + len(tweets_after)
        self.assertGreaterEqual(133, tweets_retrieved)
        # Every retrieved tweet is persisted once, with one claim connection.
        self.assertEqual(tweets_retrieved, len(self._db.get_posts()))
        self.assertEqual(tweets_retrieved,
                         len(self._db.get_claim_tweet_connections()))
        self.assertLess(0, tweets_retrieved)

    def test_execute_retrieve_tweets_by_key_words_1_month_interval(self):
        self._add_claim(
            u"post0", u"The Rock Running for President", u"2017-02-03 00:00:00",
            u"The Rock Running for President,Dwayne Running for President")
        self._db.commit()
        self.tweets_crawler._limit_start_date = True
        self.tweets_crawler._limit_end_date = True
        self.tweets_crawler._max_num_tweets = 141
        self.tweets_crawler._month_interval = 1
        self.tweets_crawler._actions = ['get_old_tweets_by_claims_keywords']
        self.tweets_crawler.execute()
        tweets_before = self.tweets_crawler._claim_id_tweets_id_before_dict[u"post0"]
        tweets_after = self.tweets_crawler._claim_id_tweets_id_after_dict[u"post0"]
        self.assertEqual(0, len(tweets_before & tweets_after))
        tweets_retrieved = len(tweets_before) + len(tweets_after)
        # Presumably up to _max_num_tweets per keyword phrase — TODO confirm
        # why the bound is 141 * 3 for a two-phrase keyword string.
        self.assertGreaterEqual(141 * 3, tweets_retrieved)
        self.assertEqual(tweets_retrieved, len(self._db.get_posts()))
        self.assertEqual(tweets_retrieved,
                         len(self._db.get_claim_tweet_connections()))
        self.assertLess(0, tweets_retrieved)

    def _add_author(self, author_guid):
        """Insert a minimal author and remember it as self._author."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = u'test author'
        author.author_screen_name = author_guid
        author.name = u'test'
        author.domain = u'tests'
        author.statuses_count = 0
        author.created_at = u"2017-06-14 05:00:00"
        self._db.add_author(author)
        self._author = author

    def _add_post(self, post_id, content, tags, date_str, domain=u'Microblog'):
        """Insert a post for the current author and bump their status count."""
        post = Post()
        post.author = self._author.author_guid
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = post_id
        post.domain = domain
        post.post_id = post_id
        post.guid = post.post_id
        post.date = convert_str_to_unicode_datetime(date_str)
        post.created_at = post.date
        post.tags = tags
        self._db.addPost(post)
        self._author.statuses_count += 1

    def _add_claim(self, claim_id, content, date_str, keywords=u"", post_type=None):
        """Insert a Claim row and register it in self._claims."""
        claim = Claim()
        claim.claim_id = claim_id
        claim.verdict = post_type
        claim.title = claim_id
        claim.description = content
        claim.verdict_date = convert_str_to_unicode_datetime(date_str)
        claim.keywords = keywords
        claim.url = u"claim url"
        self._db.addPost(claim)
        self._claims[claim.claim_id] = claim
class TestFakeNewsFeatureGenerator(TestCase):
    """Tests for FakeNewsFeatureGenerator's dictionary-word counts/fractions
    per claim, and for mapping claim verdict strings to True/False features.
    """

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._author = None

    def tearDown(self):
        self._db.session.close()

    def _run_generator(self, domain=None):
        """Instantiate, configure and execute the generator under test.

        Extracted from the identical three-line boilerplate previously
        repeated in every test method.
        """
        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        if domain is not None:
            self.fake_news_feature_generator._domain = domain
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()

    def _assert_feature_value(self, author_guid, attribute_name, expected):
        """Assert the persisted author feature holds `expected` (a string)."""
        author_feature = self._db.get_author_feature(author_guid, attribute_name)
        self.assertEqual(expected, author_feature.attribute_value)

    def test_get_word_count_1_claim_1_comments_no_words(self):
        self._add_author('author_guid')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._db.session.commit()
        self._run_generator()
        self.assert_word_dictionary_count('post0', {})
        self.assert_word_dictionary_fraction('post0', {})
        self._assert_feature_value('post0', 'FakeNewsFeatureGenerator_words_count_sum', '0')
        self._assert_feature_value('post0', 'FakeNewsFeatureGenerator_words_fraction_sum', '0.0')

    def test_get_word_count_1_claim_1_comments_1_words(self):
        self._add_author('author_guid')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "1 bad word liar", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._db.session.commit()
        self._run_generator()
        self.assert_word_dictionary_count('post0', {'liar': '1'})
        self.assert_word_dictionary_fraction('post0', {'liar': '1.0'})
        self._assert_feature_value('post0', 'FakeNewsFeatureGenerator_words_count_sum', '1')
        self._assert_feature_value('post0', 'FakeNewsFeatureGenerator_words_fraction_sum', '1.0')

    def test_get_word_count_1_claim_4_comments_1_words(self):
        self._add_author('author_guid')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "1 bad word liar", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post3", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post4", "no bad words at all", "2017-06-12 05:00:00")
        for post_id in ("post1", "post2", "post3", "post4"):
            self._add_claim_tweet_connection("post0", post_id)
        self._db.session.commit()
        self._run_generator()
        # One dictionary word across 4 comments -> fraction 1/4.
        self.assert_word_dictionary_count('post0', {'liar': '1'})
        self.assert_word_dictionary_fraction('post0', {'liar': '0.25'})
        self._assert_feature_value('post0', 'FakeNewsFeatureGenerator_words_count_sum', '1')
        self._assert_feature_value('post0', 'FakeNewsFeatureGenerator_words_fraction_sum', '0.25')

    def test_get_word_count_1_claim_4_comments_8_words(self):
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "1 liar bad word liar", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words liar at all liar", "2017-06-12 05:00:00")
        self._add_post("post3", "no liar bad words at all liar", "2017-06-12 05:00:00")
        self._add_post("post4", " liar no liar bad words at all", "2017-06-12 05:00:00")
        for post_id in ("post1", "post2", "post3", "post4"):
            self._add_claim_tweet_connection("post0", post_id)
        self._db.session.commit()
        self._run_generator()
        # 8 occurrences of 'liar' across 4 comments -> fraction 8/4 = 2.0.
        self.assert_word_dictionary_count('post0', {'liar': '8'})
        self.assert_word_dictionary_fraction('post0', {'liar': '2.0'})
        self._assert_feature_value('post0', 'FakeNewsFeatureGenerator_words_count_sum', '8')
        self._assert_feature_value('post0', 'FakeNewsFeatureGenerator_words_fraction_sum', '2.0')

    def test_get_word_count_1_claim_4_comments_8_different_words(self):
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar", "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh", "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all", "2017-06-12 05:00:00")
        for post_id in ("post1", "post2", "post3", "post4"):
            self._add_claim_tweet_connection("post0", post_id)
        self._db.session.commit()
        self._run_generator()
        # NOTE(review): assert_dictionary_words hyphenates multi-word entries
        # before the lookup, so the 'didnt actually' key below may never match
        # its feature — verify against the generator's attribute naming.
        self.assert_word_dictionary_count(
            'post0', {'liar': '3', 'joke': '2', 'didnt actually': '1',
                      'untrue': '1', 'laugh': '1'})
        self.assert_word_dictionary_fraction(
            'post0', {'liar': '0.75', 'joke': '0.5', 'didnt actually': '0.25',
                      'untrue': '0.25', 'laugh': '0.25'})
        self._assert_feature_value('post0', 'FakeNewsFeatureGenerator_words_count_sum', '3')
        self._assert_feature_value('post0', 'FakeNewsFeatureGenerator_words_fraction_sum', '0.75')

    def test_get_claim_type_4_claim(self):
        self._add_author('author_guid')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_claim('post1', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_claim('post2', 'the claim', "2017-06-10 05:00:00", 'pants-fire')
        self._add_claim('post3', 'the claim', "2017-06-10 05:00:00", 'mostly-false')
        self._add_claim('post4', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'mostly-true')
        self._add_claim('post6', 'the claim', "2017-06-10 05:00:00", 'half_true')
        self._add_claim('post7', 'the claim', "2017-06-10 05:00:00", 'unproven')
        self._db.session.commit()
        self._run_generator(domain='Claim')
        # No verdict, half_true and unproven produce no feature at all.
        for claim_id in ('post0', 'post6', 'post7'):
            self.assertIsNone(self._db.get_author_feature(
                claim_id, 'FakeNewsFeatureGenerator_claim_verdict'))
        # False-family verdicts map to 'False'; true-family to 'True'.
        for claim_id in ('post1', 'post2', 'post3'):
            self._assert_feature_value(claim_id, 'FakeNewsFeatureGenerator_claim_verdict', 'False')
        for claim_id in ('post4', 'post5'):
            self._assert_feature_value(claim_id, 'FakeNewsFeatureGenerator_claim_verdict', 'True')

    def assert_word_dictionary_count(self, author_guid, values):
        self.assert_dictionary_words(author_guid, 'FakeNewsFeatureGenerator_{0}_count', '0', values)

    def assert_word_dictionary_fraction(self, author_guid, values):
        self.assert_dictionary_words(author_guid, 'FakeNewsFeatureGenerator_{0}_fraction', '0.0', values)

    def assert_dictionary_words(self, author_guid, count_template, default_value, values):
        """Assert each dictionary word's feature equals its entry in `values`,
        or `default_value` for words not present in `values`.
        """
        fake_news_dictionary_words = self.fake_news_feature_generator._fake_news_dictionary
        for word in fake_news_dictionary_words:
            # Multi-word dictionary entries are hyphenated in feature names.
            word = word.strip().replace(' ', '-')
            author_feature = self._db.get_author_feature(
                author_guid, count_template.format(word))
            expected = values.get(word, default_value)
            self.assertEqual(expected, author_feature.attribute_value)

    def _add_author(self, author_guid):
        """Insert a minimal author and remember it as self._author."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = 'test author'
        author.author_screen_name = author_guid
        author.name = 'test'
        author.domain = 'tests'
        author.statuses_count = 0
        author.created_at = "2017-06-14 05:00:00"
        self._db.add_author(author)
        self._author = author

    def _add_post(self, title, content, date_str, domain='Microblog', post_type=None):
        """Insert a post for the current author; title doubles as post_id/guid."""
        post = Post()
        post.author = self._author.author_guid
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = post.post_id
        post.date = convert_str_to_unicode_datetime(date_str)
        post.created_at = post.date
        post.post_type = post_type
        self._db.addPost(post)
        self._posts.append(post)

    def _get_params(self):
        # Fixed: was `params = params = {...}` (duplicated assignment).
        posts = {self._author.author_guid: self._posts}
        return {'authors': [self._author], 'posts': posts}

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a claim to a tweet/post via a Claim_Tweet_Connection row."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])

    def _add_claim(self, claim_id, content, date_str, post_type=None):
        """Insert a Claim row; `post_type` becomes the claim's verdict."""
        claim = Claim()
        claim.claim_id = claim_id
        claim.verdict = post_type
        claim.title = claim_id
        claim.description = content
        claim.verdict_date = convert_str_to_unicode_datetime(date_str)
        claim.url = "claim url"
        self._db.addPost(claim)
class TestSyntaxFeatureGenerator(TestCase):
    """Behavior tests for SyntaxFeatureGenerator's per-author syntax averages
    (hashtags, links, user mentions, post length).
    """

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._author = None
        self.syntax_feature_generator = SyntaxFeatureGenerator(self._db, **{})

    def tearDown(self):
        self._db.session.close()

    def _computed_feature(self, feature_name):
        """Run the generator and return the named feature value as a float."""
        self.syntax_feature_generator.execute()
        feature = self._db.get_author_feature(u"author_guid", feature_name)
        return float(feature.attribute_value)

    def test_average_hashtags(self):
        self._add_author(u"author_guid")
        # 1 + 0 + 2 + 5 = 8 hashtags over 4 posts.
        fixtures = ((u"post1", u"#content 1"),
                    (u"post2", u"content 1"),
                    (u"post3", u"content 1 #tag #yes"),
                    (u"post4", u"content #1 #test #dont #fail #please"))
        for title, body in fixtures:
            self._add_post(title, body, "2017-06-12 05:00:00")
        value = self._computed_feature(u"SyntaxFeatureGenerator_average_hashtags")
        self.assertAlmostEqual(value, 8.0 / 4, places=4)

    def test_average_links(self):
        self._add_author(u"author_guid")
        # 0 + 1 + 1 + 3 = 5 URLs over 4 posts.
        fixtures = ((u"post1", u"#content 1"),
                    (u"post2", u"https://www.google.co.il 1"),
                    (u"post3", u"content 1 #tag http://www.google.co.il"),
                    (u"post4", u"http://www.bank.co.il #1 #test #dont http://www.ynet.co.il https://www.msn.co.il"))
        for title, body in fixtures:
            self._add_post(title, body, "2017-06-12 05:00:00")
        value = self._computed_feature(u"SyntaxFeatureGenerator_average_links")
        self.assertAlmostEqual(value, 5.0 / 4, places=4)

    def test_average_user_mentions(self):
        self._add_author(u"author_guid")
        # 1 + 0 + 2 + 3 = 6 @-mentions over 4 posts.
        fixtures = ((u"post1", u"@content 1"),
                    (u"post2", u"content 1"),
                    (u"post3", u"content 1 @tag #@es"),
                    (u"post4", u"content #1 @test @dont @fail #please"))
        for title, body in fixtures:
            self._add_post(title, body, "2017-06-12 05:00:00")
        value = self._computed_feature(u"SyntaxFeatureGenerator_average_user_mentions")
        self.assertAlmostEqual(value, 6.0 / 4, places=4)

    def test_average_post_lenth(self):
        self._add_author(u"author_guid")
        # 2 + 2 + 4 + 6 = 14 tokens over 4 posts.
        fixtures = ((u"post1", u"content 1"),
                    (u"post2", u"content 1"),
                    (u"post3", u"content 1 @tag #@es"),
                    (u"post4", u"content #1 @test @dont @fail #please"))
        for title, body in fixtures:
            self._add_post(title, body, "2017-06-12 05:00:00")
        # Feature name spelling ('lenth') matches the generator's attribute.
        value = self._computed_feature(u"SyntaxFeatureGenerator_average_post_lenth")
        self.assertAlmostEqual(value, 14.0 / 4, places=4)

    def _add_author(self, author_guid):
        """Insert a minimal author and remember it as self._author."""
        new_author = Author()
        new_author.author_guid = author_guid
        new_author.author_full_name = u'test author'
        new_author.name = u'test'
        new_author.domain = u'tests'
        new_author.statuses_count = 0
        self._db.add_author(new_author)
        self._author = new_author

    def _add_post(self, title, content, date_str, domain=u'Microblog'):
        """Insert a post for the current author and bump their status count."""
        new_post = Post()
        new_post.author = self._author.author_guid
        new_post.author_guid = self._author.author_guid
        new_post.content = content
        new_post.title = title
        new_post.domain = domain
        new_post.post_id = title
        new_post.guid = new_post.post_id
        new_post.date = convert_str_to_unicode_datetime(date_str)
        new_post.created_at = new_post.date
        self._db.addPost(new_post)
        self._posts.append(new_post)
        self._author.statuses_count += 1
class TestFakeNewsClassifier(TestCase): def setUp(self): self._db = DB() self._db.setUp() self._posts = [] self._author = None def tearDown(self): self._db.session.close() def test_classify_by_dictionary_1_FN_1_FP(self): self._add_author('author') self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'TRUE') self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00") self._add_post("post2", "no bad words untrue at all liar", "2017-06-12 05:00:00") self._add_post("post3", "no joke bad words at all laugh", "2017-06-12 05:00:00") self._add_post("post4", " liar no didnt actually bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post0", "post1") self._add_claim_tweet_connection("post0", "post2") self._add_claim_tweet_connection("post0", "post3") self._add_claim_tweet_connection("post0", "post4") self._add_author('author_guid') self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'FALSE') self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00") self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00") self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00") self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post5", "post6") self._add_claim_tweet_connection("post5", "post7") self._add_claim_tweet_connection("post5", "post8") self._add_claim_tweet_connection("post5", "post9") self._db.session.commit() self.fake_news_feature_classifier = FakeNewsClassifier(self._db) self.fake_news_feature_classifier.setUp() self.fake_news_feature_classifier.execute() output_file_path = self.fake_news_feature_classifier._output_path + '/fake_news_classifier_results.csv' output_file = open(output_file_path, 'r') reader = csv.DictReader(output_file) output_data = next(reader) self.assertAlmostEqual(float(output_data['FN (think good but bad)']), 1) self.assertAlmostEqual(float(output_data['FP (think bad but good)']), 1) 
self.assertAlmostEqual(float(output_data['accuracy']), 0.0) self.assertAlmostEqual(float(output_data['AUC']), 0.0) def test_classify_by_dictionary_1_FN_1_FP_and_ignore_1(self): self._add_author('author') self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'TRUE') self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00") self._add_post("post2", "no bad words untrue at all liar", "2017-06-12 05:00:00") self._add_post("post3", "no joke bad words at all laugh", "2017-06-12 05:00:00") self._add_post("post4", " liar no didnt actually bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post0", "post1") self._add_claim_tweet_connection("post0", "post2") self._add_claim_tweet_connection("post0", "post3") self._add_claim_tweet_connection("post0", "post4") self._add_author('author_guid') self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'FALSE') self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00") self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00") self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00") self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post5", "post6") self._add_claim_tweet_connection("post5", "post7") self._add_claim_tweet_connection("post5", "post8") self._add_claim_tweet_connection("post5", "post9") self._add_claim('post10', 'the claim', "2017-06-10 05:00:00", 'unknown') self._add_post("post11", "1 liar bad word joke", "2017-06-12 05:00:00") self._add_post("post12", "no bad words untrue at all liar", "2017-06-12 05:00:00") self._add_post("post13", "no joke bad words at all laugh", "2017-06-12 05:00:00") self._add_post("post14", " liar no didnt actually bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post10", "post11") self._add_claim_tweet_connection("post10", "post12") self._add_claim_tweet_connection("post10", "post13") self._add_claim_tweet_connection("post10", "post14") 
self._db.session.commit() self.fake_news_feature_classifier = FakeNewsClassifier(self._db) self.fake_news_feature_classifier.setUp() self.fake_news_feature_classifier.execute() output_file_path = self.fake_news_feature_classifier._output_path + '/fake_news_classifier_results.csv' output_file = open(output_file_path, 'r') reader = csv.DictReader(output_file) output_data = next(reader) self.assertAlmostEqual(float(output_data['FN (think good but bad)']), 1) self.assertAlmostEqual(float(output_data['FP (think bad but good)']), 1) self.assertAlmostEqual(float(output_data['accuracy']), 0.0) self.assertAlmostEqual(float(output_data['AUC']), 0.0) def test_classify_by_dictionary_0_FN_0_FP(self): self._add_author('author') self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'FALSE') self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00") self._add_post("post2", "no bad words untrue at all liar", "2017-06-12 05:00:00") self._add_post("post3", "no joke bad words at all laugh", "2017-06-12 05:00:00") self._add_post("post4", " liar no didnt actually bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post0", "post1") self._add_claim_tweet_connection("post0", "post2") self._add_claim_tweet_connection("post0", "post3") self._add_claim_tweet_connection("post0", "post4") self._add_author('author_guid') self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'TRUE') self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00") self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00") self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00") self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post5", "post6") self._add_claim_tweet_connection("post5", "post7") self._add_claim_tweet_connection("post5", "post8") self._add_claim_tweet_connection("post5", "post9") self._db.session.commit() self.fake_news_feature_classifier = FakeNewsClassifier(self._db) 
self.fake_news_feature_classifier.setUp() self.fake_news_feature_classifier.execute() output_file_path = self.fake_news_feature_classifier._output_path + '/fake_news_classifier_results.csv' output_file = open(output_file_path, 'r') reader = csv.DictReader(output_file) output_data = next(reader) self.assertAlmostEqual(float(output_data['FN (think good but bad)']), 0) self.assertAlmostEqual(float(output_data['FP (think bad but good)']), 0) self.assertAlmostEqual(float(output_data['accuracy']), 1.0) self.assertAlmostEqual(float(output_data['AUC']), 1.0) def test_classify_by_dictionary_1_FN_0_FP_3_claims(self): self._add_author('author') self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'FALSE') self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00") self._add_post("post2", "no bad words untrue at all liar", "2017-06-12 05:00:00") self._add_post("post3", "no joke bad words at all laugh", "2017-06-12 05:00:00") self._add_post("post4", " liar no didnt actually bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post0", "post1") self._add_claim_tweet_connection("post0", "post2") self._add_claim_tweet_connection("post0", "post3") self._add_claim_tweet_connection("post0", "post4") self._add_author('author_guid') self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'TRUE') self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00") self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00") self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00") self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post5", "post6") self._add_claim_tweet_connection("post5", "post7") self._add_claim_tweet_connection("post5", "post8") self._add_claim_tweet_connection("post5", "post9") self._add_author('author_guid') self._add_claim('post10', 'the claim', "2017-06-10 05:00:00", 'FALSE') self._add_post("post11", "1 bad word at all", "2017-06-12 05:00:00") 
self._add_post("post12", "no bad words at all", "2017-06-12 05:00:00") self._add_post("post13", "no bad words at all", "2017-06-12 05:00:00") self._add_post("post14", "no bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post10", "post11") self._add_claim_tweet_connection("post10", "post12") self._add_claim_tweet_connection("post10", "post13") self._add_claim_tweet_connection("post10", "post14") self._db.session.commit() self.fake_news_feature_classifier = FakeNewsClassifier(self._db) self.fake_news_feature_classifier.setUp() self.fake_news_feature_classifier.execute() output_file_path = self.fake_news_feature_classifier._output_path + '/fake_news_classifier_results.csv' output_file = open(output_file_path, 'r') reader = csv.DictReader(output_file) output_data = next(reader) self.assertAlmostEqual(float(output_data['FN (think good but bad)']), 1) self.assertAlmostEqual(float(output_data['FP (think bad but good)']), 0) self.assertAlmostEqual(float(output_data['accuracy']), 0.666666, places=4) self.assertAlmostEqual(float(output_data['AUC']), 0.75) def test_classify_by_dictionary_0_FN_1_FP_3_claims(self): self._add_author('author') self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'FALSE') self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00") self._add_post("post2", "no bad words untrue at all liar", "2017-06-12 05:00:00") self._add_post("post3", "no joke bad words at all laugh", "2017-06-12 05:00:00") self._add_post("post4", " liar no didnt actually bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post0", "post1") self._add_claim_tweet_connection("post0", "post2") self._add_claim_tweet_connection("post0", "post3") self._add_claim_tweet_connection("post0", "post4") self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'TRUE') self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00") self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00") 
self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00") self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post5", "post6") self._add_claim_tweet_connection("post5", "post7") self._add_claim_tweet_connection("post5", "post8") self._add_claim_tweet_connection("post5", "post9") self._add_claim('post10', 'the claim', "2017-06-10 05:00:00", 'TRUE') self._add_post("post11", "1 liar bad word joke", "2017-06-12 05:00:00") self._add_post("post12", "no bad words untrue at all liar", "2017-06-12 05:00:00") self._add_post("post13", "no joke bad words at all laugh", "2017-06-12 05:00:00") self._add_post("post14", " liar no didnt actually bad words at all", "2017-06-12 05:00:00") self._add_claim_tweet_connection("post10", "post11") self._add_claim_tweet_connection("post10", "post12") self._add_claim_tweet_connection("post10", "post13") self._add_claim_tweet_connection("post10", "post14") self._db.session.commit() self.fake_news_feature_classifier = FakeNewsClassifier(self._db) self.fake_news_feature_classifier.setUp() self.fake_news_feature_classifier.execute() output_file_path = self.fake_news_feature_classifier._output_path + '/fake_news_classifier_results.csv' output_file = open(output_file_path, 'r') reader = csv.DictReader(output_file) output_data = next(reader) self.assertAlmostEqual(float(output_data['FN (think good but bad)']), 0) self.assertAlmostEqual(float(output_data['FP (think bad but good)']), 1) self.assertAlmostEqual(float(output_data['accuracy']), 0.666666, places=4) self.assertAlmostEqual(float(output_data['AUC']), 0.75) def _add_author(self, author_guid): author = Author() author.author_guid = author_guid author.author_full_name = 'test author' author.author_screen_name = author_guid author.name = 'test' author.domain = 'tests' author.statuses_count = 0 author.created_at = "2017-06-14 05:00:00" self._db.add_author(author) self._author = author def _add_post(self, title, content, date_str, 
domain='Microblog', post_type=None): post = Post() post.author = self._author.author_guid post.author_guid = self._author.author_guid post.content = content post.title = title post.domain = domain post.post_id = title post.guid = post.post_id post.date = convert_str_to_unicode_datetime(date_str) post.created_at = post.date post.post_type = post_type self._db.addPost(post) self._posts.append(post) def _add_claim_tweet_connection(self, claim_id, post_id): connection = Claim_Tweet_Connection() connection.claim_id = claim_id connection.post_id = post_id self._db.add_claim_connections([connection]) pass def _add_claim(self, claim_id, content, date_str, post_type=None): claim = Claim() claim.claim_id = claim_id claim.verdict = post_type claim.title = claim_id claim.description = content claim.verdict_date = convert_str_to_unicode_datetime(date_str) claim.url = "claim url" self._db.addPost(claim)
class TestEntityToTopicConverter(TestCase):
    """Tests for EntityToTopicConverter: converting claim entities into topics
    and mapping posts and authors onto those topics."""

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._post_dictionary = {}
        self._authors = []
        self._add_author('test author')
        self._preprocess_visualization = EntityToTopicConverter(self._db)

    def tearDown(self):
        self._db.session.close_all()
        self._db.deleteDB()
        self._db.session.close()

    def _claims_arg(self, with_connection=False):
        """Build the targeted-fields argument shared by all tests: claims as
        the source, optionally connected to Microblog posts as destination."""
        arg = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}]
            },
            'connection': {},
            'destination': {}
        }
        if with_connection:
            arg['connection'] = {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id',
            }
            arg['destination'] = {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Microblog"}]
            }
        return arg

    def test_generate_topics_no_topics(self):
        """With no claims in the DB, no topics are generated."""
        arg = self._claims_arg()
        elements = self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(elements, arg)
        topics = self._db.get_topics()
        self.assertEqual(topics, [])

    def test_generate_topics_from_1_claim(self):
        """A single claim yields a topic whose terms match its content."""
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._db.session.commit()
        arg = self._claims_arg()
        elements = self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(elements, arg)
        self._preprocess_visualization.save_topic_entities()
        self.assertTopicInserted('claim1')

    def test_generate_topics_from_1_claim_and_remove_stop_words(self):
        """With stop-word removal enabled, the topic terms are the claim
        content minus NLTK stop words."""
        self._add_post("test author", 'claim1', 'claim1 go to the house', 'Claim')
        arg = self._claims_arg()
        self._db.session.commit()
        self._preprocess_visualization._remove_stop_words = True
        elements = self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(elements, arg)
        self._preprocess_visualization.save_topic_entities()
        self.assertTopicInserted('claim1')

    def test_generate_topics_from_5_claims(self):
        """Five claims each produce their own topic."""
        claims = [('claim1', 'claim1 content'),
                  ('claim2', 'claim2 content'),
                  ('claim3', 'claim3 content move'),
                  ('claim4', 'claim4 dif data'),
                  ('claim5', 'claim5 some boring text')]
        for claim_id, content in claims:
            self._add_post("test author", claim_id, content, 'Claim')
        self._db.session.commit()
        arg = self._claims_arg()
        elements = self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(elements, arg)
        self._preprocess_visualization.save_topic_entities()
        for claim_id, _ in claims:
            self.assertTopicInserted(claim_id)

    def test_generate_post_topic_mapping_no_claim(self):
        """No claims -> no post-topic mappings."""
        arg = self._claims_arg()
        elements = self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_post_topic_mapping(elements, arg)
        mappings = self._db.get_post_topic_mapping()
        self.assertEqual(0, len(mappings))

    def test_generate_post_topic_mapping_1_claim(self):
        """Three posts connected to one claim all map to its topic with
        probability 1.0."""
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2 bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting new', 'Microblog')
        self._add_claim_tweet_connection('claim1', 'post1')
        self._add_claim_tweet_connection('claim1', 'post2')
        self._add_claim_tweet_connection('claim1', 'post3')
        self._db.session.commit()
        arg = self._claims_arg(with_connection=True)
        elements = self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(elements, arg)
        self._preprocess_visualization.generate_post_topic_mapping(elements, arg)
        self._preprocess_visualization.save_topic_entities()
        rows = [(m.post_id, m.max_topic_id, m.max_topic_dist)
                for m in self._db.get_post_topic_mapping()]
        topic_id = self._preprocess_visualization.get_source_id_topic_dictionary()['claim1']
        self.assertEqual(3, len(rows))
        self.assertSetEqual(
            {('post1', topic_id, 1.0), ('post2', topic_id, 1.0),
             ('post3', topic_id, 1.0)}, set(rows))

    def test_generate_post_topic_mapping_2_claim(self):
        """Posts split between two claims map to the respective topics."""
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author", 'claim2', 'claim1 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2 bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting new', 'Microblog')
        self._add_post("test author", 'post4', 'post4 bla bla', 'Microblog')
        self._add_post("test author", 'post5', 'post5 noting new', 'Microblog')
        self._add_claim_tweet_connection('claim1', 'post1')
        self._add_claim_tweet_connection('claim1', 'post2')
        self._add_claim_tweet_connection('claim1', 'post3')
        self._add_claim_tweet_connection('claim2', 'post4')
        self._add_claim_tweet_connection('claim2', 'post5')
        self._db.session.commit()
        arg = self._claims_arg(with_connection=True)
        elements = self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(elements, arg)
        self._preprocess_visualization.generate_post_topic_mapping(elements, arg)
        self._preprocess_visualization.save_topic_entities()
        rows = [(m.post_id, m.max_topic_id, m.max_topic_dist)
                for m in self._db.get_post_topic_mapping()]
        topic_id1 = self._preprocess_visualization.get_source_id_topic_dictionary()['claim1']
        topic_id2 = self._preprocess_visualization.get_source_id_topic_dictionary()['claim2']
        self.assertEqual(5, len(rows))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0),
             ('post3', topic_id1, 1.0), ('post4', topic_id2, 1.0),
             ('post5', topic_id2, 1.0)}, set(rows))

    def test__generate_author_topic_mapping_2_claim(self):
        """Author-topic distribution reflects how an author's posts divide
        between the two claim topics; an author with no posts gets zeros."""
        self._add_author('test author2')
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author2", 'claim2', 'claim1 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2 bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting new', 'Microblog')
        self._add_post("test author", 'post4', 'post4 bla bla', 'Microblog')
        self._add_post("test author", 'post5', 'post5 noting new', 'Microblog')
        self._add_claim_tweet_connection('claim1', 'post1')
        self._add_claim_tweet_connection('claim1', 'post2')
        self._add_claim_tweet_connection('claim1', 'post3')
        self._add_claim_tweet_connection('claim2', 'post4')
        self._add_claim_tweet_connection('claim2', 'post5')
        self._db.session.commit()
        arg = self._claims_arg(with_connection=True)
        self._preprocess_visualization._domain = "Microblog"
        elements = self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(elements, arg)
        self._preprocess_visualization.generate_post_topic_mapping(elements, arg)
        self._preprocess_visualization.generate_author_topic_mapping()
        self._preprocess_visualization.save_topic_entities()
        mapping = self._db.get_author_topic_mapping()
        self.assertEqual(2, len(mapping))
        self.assertSetEqual({('test author', 0.6, 0.4), ('test author2', 0, 0)},
                            set(mapping))

    def _build_visualization_fixture(self):
        """Two authors, two claims and five connected Microblog posts used by
        the end-to-end visualization tests."""
        self._add_author('test author2', "bad_actor")
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author2", 'claim2', 'claim2 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2 bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting new', 'Microblog')
        self._add_post("test author2", 'post4', 'post4 bla bla', 'Microblog')
        self._add_post("test author2", 'post5', 'post5 noting new', 'Microblog')
        self._add_claim_tweet_connection('claim1', 'post1')
        self._add_claim_tweet_connection('claim1', 'post2')
        self._add_claim_tweet_connection('claim1', 'post4')
        self._add_claim_tweet_connection('claim2', 'post3')
        self._add_claim_tweet_connection('claim2', 'post5')
        self._db.session.commit()
        self._preprocess_visualization._domain = "Microblog"

    def _assert_visualization_results(self):
        """Shared assertions for the end-to-end visualization tests."""
        author_topic_mapping = self._db.get_author_topic_mapping()
        rows = [(m.post_id, m.max_topic_id, m.max_topic_dist)
                for m in self._db.get_post_topic_mapping()]
        topic_id1 = self._preprocess_visualization.get_source_id_topic_dictionary()['claim1']
        topic_id2 = self._preprocess_visualization.get_source_id_topic_dictionary()['claim2']
        self.assertEqual(2, len(author_topic_mapping))
        self.assertSetEqual(
            {('test author', 0.666666666667, 0.333333333333),
             ('test author2', 0.5, 0.5)}, set(author_topic_mapping))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0),
             ('post3', topic_id2, 1.0), ('post4', topic_id1, 1.0),
             ('post5', topic_id2, 1.0)}, set(rows))

    def test_visualization(self):
        """Full execute() produces the expected author and post mappings."""
        self._build_visualization_fixture()
        self._preprocess_visualization.execute()
        self._assert_visualization_results()

    def test_double_execution_visualization(self):
        """Running execute() twice is idempotent — results are unchanged."""
        self._build_visualization_fixture()
        self._preprocess_visualization.execute()
        self._preprocess_visualization.execute()
        self._assert_visualization_results()

    def assertTopicInserted(self, claim_id):
        """Assert a topic exists for claim_id and its term descriptions equal
        the cleaned claim content (stop-word-cleaned when the converter's
        _remove_stop_words flag is set)."""
        term_descriptions = {term.term_id: term.description
                             for term in self._db.get_terms()}
        topic_terms = defaultdict(set)
        for topic_id, term_id, prob in self._db.get_topics():
            topic_terms[topic_id].add(term_descriptions[term_id])
        expected_topic_id = self._preprocess_visualization.get_source_id_topic_dictionary()[claim_id]
        self.assertIn(expected_topic_id, topic_terms)
        content = self._post_dictionary[claim_id].content
        expected = set(clean_tweet(content).split(' '))
        if self._preprocess_visualization._remove_stop_words:
            expected = set(clean_content_by_nltk_stopwords(content).split(' '))
        self.assertSetEqual(expected, topic_terms[expected_topic_id])

    def _add_author(self, author_guid, type="good_actor"):
        """Insert a test Author of the given actor type into the DB."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = author_guid
        author.author_screen_name = author_guid
        author.name = author_guid
        author.domain = 'Microblog'
        author.author_type = type
        self._db.add_author(author)
        self._authors.append(author)

    def _add_post(self, author_guid, title, content, domain='Microblog'):
        """Insert a Post (or, with domain='Claim', a claim-like post); the
        title doubles as its post_id/guid."""
        post = Post()
        post.author = author_guid
        post.author_guid = author_guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = post.post_id
        post.is_detailed = True
        post.is_LB = False
        self._db.addPost(post)
        self._posts.append(post)
        self._post_dictionary[post.post_id] = post

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a claim to a post via a Claim_Tweet_Connection row."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])