Пример #1
0
def make_sqlite_db_json(output, input_file, fields):
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)

    column_str = ','.join([column for column in fields]).replace('.','__')
    question_marks = ','.join(['?' for column in fields])
    con = sqlite3.connect(output)
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

    json_col = JsonCollection(input_file)
    insert_list = []
    tp = TweetParser()

    for count,tweet in enumerate(json_col.get_iterator()):
        ret = tp.parse_columns_from_tweet(tweet, fields)
        row = [replace_none(col_val[1]) for col_val in ret]
        insert_list.append(tuple(row))

        if (count % 10000) == 0:
            cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
            con.commit()
            insert_list = []

    if count < 10000:
        cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
        con.commit()

    con.close()
    logger.info('Finished processing input: {}, output is: {}'.format(input_file, output))
Пример #2
0
def write_files(collection):
    hashtags = ["#ivoted", "#myvote2016", "#myvote"]
    statements = ["i voted", "i will vote", "my vote", "vote for"]

    with open("/scratch/olympus/projects/hashtag_filtering/hashtags_{}.json".format(collection[0].split('/')[4]), 'w') as hashtag_file:
        with open("/scratch/olympus/projects/hashtag_filtering/statements_{}.json".format(collection[0].split('/')[4]), 'w') as statement_file:
            # with open --> this is how to read in / initialize files in python
            # 'w' : write, 'r' : read
            for each_file in collection:
                hashtags_counter = 0
                statements_counter = 0
                collection = JsonCollection(each_file, throw_error=False, verbose=1)
                for tweet in collection.get_iterator():
                    if tweet and tweet["text"]:
                        if any(hashtag in tweet["text"] for hashtag in hashtags):
                            hashtag_file.write("{}\n".format(json.dumps(tweet, default=date_handler)))
                            hashtags_counter += 1
                        if any(statement in tweet["text"] for statement in statements):
                            statement_file.write("{}\n".format(json.dumps(tweet, default=date_handler)))
                            statements_counter += 1
                    else:
                        logging.info("Something was wrong with a tweet")
                logging.info("Extracted {} tweets to the statement file".format(statements_counter))
                logging.info("Extracted {} tweets to the hashtags file".format(hashtags_counter))
                # same as tweet_counter = tweet_counter + 1
        statement_file.close()
    hashtag_file.close()
Пример #3
0
	def test_json_collection_custom_filter_filters(self):
		collectionone = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
		full_collection_len = len(list(collectionone.get_iterator()))
		def is_tweet_a_retweet(tweet):
			if 'retweeted' in tweet and tweet['retweeted']:
				return True
			else:
				return False
		num_retweets = len(list(collectionone.set_custom_filter(is_tweet_a_retweet).get_iterator()))

		collectiontwo = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
		def is_not_a_retweet(tweet):
			if 'retweeted' in tweet and tweet['retweeted']:
				return False
			else:
				return True
		num_non_retweets = len(list(collectiontwo.set_custom_filter(is_not_a_retweet).get_iterator()))

		#the numbes of retweets and non retweets should add up to the whole collection
		self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
Пример #4
0
	def test_json_collection_custom_filter_filters(self):
		collectionone = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=0)
		full_collection_len = len(list(collectionone.get_iterator()))
		def is_tweet_a_retweet(tweet):
			if 'retweeted' in tweet and tweet['retweeted']:
				return True
			else:
				return False
		num_retweets = len(list(collectionone.set_custom_filter(is_tweet_a_retweet).get_iterator()))

		collectiontwo = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=0)
		def is_not_a_retweet(tweet):
			if 'retweeted' in tweet and tweet['retweeted']:
				return False
			else:
				return True
		num_non_retweets = len(list(collectiontwo.set_custom_filter(is_not_a_retweet).get_iterator()))

		#the numbes of retweets and non retweets should add up to the whole collection
		self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
Пример #5
0
	def test_iterator_returns_tweets(self):
		collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
		self.assertTrue(len(list(collection.get_iterator())) > 0)
Пример #6
0
	def test_iterator_returns_tweets(self):
		collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=0)
		self.assertTrue(len(list(collection.get_iterator())) > 0)