Пример #1
0
    def test_dump_to_csv_dumps_with_top_level(self):
        """Dump the valid CSV fixture to ``data/output.csv`` and check it is non-empty.

        The collection is built from ``config['csv']['valid']`` (resolved
        relative to this file) and dumped with ``top_level=True``. The output
        file is removed before the dump and again afterwards, even when the
        dump or the assertion fails.
        """
        base_dir = os.path.dirname(os.path.realpath(__file__))
        # One path is used for writing, checking and cleanup so they can
        # never disagree about which file is meant.
        output_path = os.path.join(base_dir, 'data', 'output.csv')

        def remove_output():
            # Delete the output file if a previous or current run left it behind.
            if os.path.exists(output_path):
                os.remove(output_path)

        field_list = [
            'id_str', 'coordinates.coordinates.0', 'coordinates.coordinates.1',
            'user.id_str', 'user.lang', 'lang', 'text', 'user.screen_name',
            'user.location', 'user.description', 'created_at',
            'user.friends_count', 'user.followers_count', 'retweet_count',
            'entities.urls.0.expanded_url', 'entities.urls.1.expanded_url',
            'entities.urls.2.expanded_url', 'entities.urls.3.expanded_url',
            'entities.urls.4.expanded_url', 'entities.hashtags.0.text',
            'entities.hashtags.1.text'
        ]

        remove_output()
        try:
            collection = CsvCollection(base_dir + '/' + config['csv']['valid'])
            collection.dump_to_csv(output_path, field_list, top_level=True)
            self.assertTrue(os.path.getsize(output_path) > 0)
        finally:
            # Runs on failure too, so a broken run does not leave stale output.
            remove_output()
Пример #2
0
    def test_dump_to_sqlite_db_dumps_the_right_stuff_with_top_level(self):
        """Dump the valid CSV fixture to SQLite and check the first row's values.

        The collection is built from ``config['csv']['valid']`` (resolved
        relative to this file) and dumped to ``data/output.db`` with
        ``top_level=True``. The first row of the ``data`` table is then read
        back and its values are compared, as a set, with the expected values.
        The database file is removed before the dump and again afterwards,
        even when the dump, the query or an assertion fails.
        """
        base_dir = os.path.dirname(os.path.realpath(__file__))
        # One path is used for writing, reading and cleanup so they can
        # never disagree about which file is meant.
        output_path = os.path.join(base_dir, 'data', 'output.db')

        def remove_output():
            # Delete the database file if a previous or current run left it behind.
            if os.path.exists(output_path):
                os.remove(output_path)

        field_list = [
            'id_str', 'entities.hashtags.0', 'entities.hashtags.1', 'source',
            'user.id', 'timestamp.$date', 'text', 'coordinates.coordinates.0',
            'coordinates.coordinates.1', 'user.id_str', 'user.lang', 'lang',
            'user.screen_name', 'user.location', 'user.description',
            'created_at', 'user.friends_count', 'user.followers_count',
            'retweet_count', 'entities.urls.0.expanded_url',
            'entities.urls.1.expanded_url', 'entities.urls.2.expanded_url',
            'entities.urls.3.expanded_url', 'entities.urls.4.expanded_url',
            'entities.hashtags.0.text', 'entities.hashtags.1.text'
        ]

        # The comparison is set-based, so column order and multiplicity are
        # not checked; a single 'NULL' stands for every NULL-valued column.
        expected_values = {
            '661275583813431296',
            '{"indices": [74, 83], "text": "jadehelm"}',
            '{"indices": [84, 98], "text": "newworldorder"}',
            '<a href="https://twitter.com/Col_Connaughton" rel="nofollow">Colin\'s Autotweeterpro5.3</a>',
            '379851447', '1446495359000',
            'Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat',
            'NULL',
        }

        remove_output()
        try:
            collection = CsvCollection(base_dir + '/' + config['csv']['valid'])
            collection.dump_to_sqlite_db(
                output_path, field_list, top_level=True)

            con = sqlite3.connect(output_path)
            try:
                cur = con.cursor()
                # Flatten the (at most one) returned record into a list.
                first_row = [
                    elem
                    for record in cur.execute("SELECT * FROM data LIMIT 1;")
                    for elem in record
                ]
            finally:
                # Close the connection even if the query raises, so the
                # database file is released before cleanup removes it.
                con.close()

            self.assertTrue(len(first_row) > 0)
            self.assertEqual(set(first_row), expected_values)
        finally:
            # Runs on failure too, so a broken run does not leave stale output.
            remove_output()
Пример #3
0
 def test_strip_tweets_keeps_fields(self):
     """Check that stripped tweets still expose the requested fields.

     The valid CSV fixture is stripped down to ``source``, ``text`` and
     ``id_str``; every tweet from the resulting iterator must have each of
     those key paths among the keys produced by ``TweetParser.flatten_dict``.
     """
     parser = TweetParser()
     fixture_path = os.path.dirname(os.path.realpath(__file__)) + '/' + config['csv']['valid']
     source_collection = CsvCollection(fixture_path)
     self.maxDiff = None
     stripped_tweets = source_collection.strip_tweets(['source', 'text', 'id_str']).get_iterator()

     def every_tweet_has(tweets, required_paths):
         # False as soon as one tweet lacks one of the required key paths.
         for tweet in tweets:
             present = [path for path, _ in parser.flatten_dict(tweet)]
             if any(path not in present for path in required_paths):
                 return False
         return True

     required = [['source'], ['text'], ['id_str']]
     self.assertTrue(every_tweet_has(stripped_tweets, required))
Пример #4
0
    def test_json_collection_custom_filter_filters(self):
        """Check that complementary custom filters partition the collection.

        One collection is counted unfiltered and then with a "is a retweet"
        filter; a second collection over the same fixture is counted with
        the opposite filter. The two filtered counts must add up to the
        unfiltered count.
        """
        fixture_path = (os.path.dirname(os.path.realpath(__file__)) + '/' +
                        config['csv']['valid'])

        def is_tweet_a_retweet(tweet):
            # True only when the tweet carries a truthy 'retweeted' entry.
            return bool('retweeted' in tweet and tweet['retweeted'])

        def is_not_a_retweet(tweet):
            # Exact complement of is_tweet_a_retweet.
            return not ('retweeted' in tweet and tweet['retweeted'])

        collectionone = CsvCollection(fixture_path)
        every_tweet = list(collectionone.get_iterator())
        full_collection_len = len(every_tweet)

        retweets_only = collectionone.set_custom_filter(is_tweet_a_retweet)
        num_retweets = len(list(retweets_only.get_iterator()))

        collectiontwo = CsvCollection(fixture_path)
        non_retweets_only = collectiontwo.set_custom_filter(is_not_a_retweet)
        num_non_retweets = len(list(non_retweets_only.get_iterator()))

        # The numbers of retweets and non-retweets should add up to the
        # whole collection.
        self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
Пример #5
0
    def test_strip_tweets_keeps_fields(self):
        """Verify ``strip_tweets`` retains the fields it was asked to keep.

        After stripping the valid CSV fixture to ``source``, ``text`` and
        ``id_str``, each tweet is flattened with ``TweetParser.flatten_dict``
        and must contain all three key paths.
        """
        flattener = TweetParser()
        here = os.path.dirname(os.path.realpath(__file__))
        collection = CsvCollection(here + '/' + config['csv']['valid'])
        self.maxDiff = None
        kept_fields = ['source', 'text', 'id_str']
        tweet_iter = collection.strip_tweets(kept_fields).get_iterator()

        def has_all_fields(tweets, wanted):
            # Fails on the first tweet with any wanted key path missing.
            for tweet in tweets:
                seen = [key for key, _value in flattener.flatten_dict(tweet)]
                missing = [field for field in wanted if field not in seen]
                if missing:
                    return False
            return True

        wanted_paths = [['source'], ['text'], ['id_str']]
        self.assertTrue(has_all_fields(tweet_iter, wanted_paths))
Пример #6
0
    def test_json_collection_custom_filter_filters(self):
        """Retweet and non-retweet filters together must cover every tweet.

        Counts the unfiltered fixture, then the tweets passing a retweet
        filter, then (on a fresh collection) the tweets passing the inverse
        filter, and asserts that the two filtered counts sum to the total.
        """
        def count(iterator):
            # Number of items the iterator yields.
            return sum(1 for _ in iterator)

        def is_tweet_a_retweet(tweet):
            if 'retweeted' not in tweet:
                return False
            return True if tweet['retweeted'] else False

        def is_not_a_retweet(tweet):
            if 'retweeted' not in tweet:
                return True
            return False if tweet['retweeted'] else True

        data_file = '/'.join(
            [os.path.dirname(os.path.realpath(__file__)), config['csv']['valid']])

        collectionone = CsvCollection(data_file)
        full_collection_len = count(collectionone.get_iterator())
        num_retweets = count(
            collectionone.set_custom_filter(is_tweet_a_retweet).get_iterator())

        collectiontwo = CsvCollection(data_file)
        num_non_retweets = count(
            collectiontwo.set_custom_filter(is_not_a_retweet).get_iterator())

        # Retweets plus non-retweets should account for the whole collection.
        self.assertEqual(full_collection_len, num_retweets + num_non_retweets)
Пример #7
0
 def test_iterator_returns_tweets(self):
     """Check that iterating the valid CSV fixture yields at least one item."""
     fixture_path = os.path.dirname(os.path.realpath(__file__)) + '/' + config['csv']['valid']
     tweets = list(CsvCollection(fixture_path).get_iterator())
     self.assertTrue(len(tweets) > 0)
Пример #8
0
    def test_dump_to_csv_dumps_right_stuff_with_top_level(self):
        """Dump the valid CSV fixture to CSV and check selected cell values.

        The collection is built from ``config['csv']['valid']`` (resolved
        relative to this file) and dumped to ``data/output.csv`` with
        ``top_level=True``. The first row of the output is skipped
        (presumably the header row); in every later row, columns 0, 3, 4, 5
        and 6 must equal the expected values. The output file is removed
        before the dump and again afterwards, even when the dump or an
        assertion fails.
        """
        base_dir = os.path.dirname(os.path.realpath(__file__))
        # One path is used for writing, reading and cleanup so they can
        # never disagree about which file is meant.
        output_path = os.path.join(base_dir, 'data', 'output.csv')

        def remove_output():
            # Delete the output file if a previous or current run left it behind.
            if os.path.exists(output_path):
                os.remove(output_path)

        field_list = [
            'id_str', 'entities.hashtags.0', 'entities.hashtags.1', 'source',
            'user.id', 'timestamp.$date', 'text', 'coordinates.coordinates.0',
            'coordinates.coordinates.1', 'user.id_str', 'user.lang', 'lang',
            'user.screen_name', 'user.location', 'user.description',
            'created_at', 'user.friends_count', 'user.followers_count',
            'retweet_count', 'entities.urls.0.expanded_url',
            'entities.urls.1.expanded_url', 'entities.urls.2.expanded_url',
            'entities.urls.3.expanded_url', 'entities.urls.4.expanded_url',
            'entities.hashtags.0.text', 'entities.hashtags.1.text'
        ]

        # Expected cell value keyed by zero-based column index.
        # NOTE(review): columns 1 and 2 (the hashtag fields) used to be
        # compared with 'jadehelm' / 'newworldorder', but the result of that
        # comparison was never asserted. They stay unasserted here because
        # the exact serialisation of those columns cannot be confirmed from
        # this test alone - verify before adding them to the table.
        expected_by_column = {
            0: '661275583813431296',
            3: '<a href="https://twitter.com/Col_Connaughton" rel="nofollow">Colin\'s Autotweeterpro5.3</a>',
            4: '379851447',
            5: '1446495359000',
            6: 'Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat',
        }

        remove_output()
        try:
            collection = CsvCollection(base_dir + '/' + config['csv']['valid'])
            collection.dump_to_csv(output_path, field_list, top_level=True)

            with open(output_path, 'r') as filehandle:
                reader = csv.reader(filehandle)
                # Skip the first row; only the rows after it are checked.
                next(reader, None)
                for line in reader:
                    for column, csv_row_value in enumerate(line):
                        if column in expected_by_column:
                            self.assertEqual(csv_row_value,
                                             expected_by_column[column])
        finally:
            # Runs on failure too, so a broken run does not leave stale output.
            remove_output()
Пример #9
0
 def test_iterator_returns_tweets(self):
     """Ensure the valid CSV fixture produces a non-empty iterator."""
     here = os.path.dirname(os.path.realpath(__file__))
     collection = CsvCollection('/'.join([here, config['csv']['valid']]))
     tweet_count = len(list(collection.get_iterator()))
     self.assertTrue(tweet_count > 0)