Example #1
import logging
import sqlite3

# JsonCollection, TweetParser, and replace_none are helpers from the
# surrounding tweet-tooling package (see Example #2 for a replace_none).
def make_sqlite_db_json(output, input_file, fields):
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)

    # dotted field paths become column names: 'user.id' -> 'user__id'
    column_str = ','.join(fields).replace('.', '__')
    question_marks = ','.join('?' for _ in fields)
    con = sqlite3.connect(output)
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

    json_col = JsonCollection(input_file)
    insert_list = []
    tp = TweetParser()

    for count, tweet in enumerate(json_col.get_iterator()):
        ret = tp.parse_columns_from_tweet(tweet, fields)
        row = [replace_none(col_val[1]) for col_val in ret]
        insert_list.append(tuple(row))

        # flush a full batch every 10,000 tweets
        if (count + 1) % 10000 == 0:
            cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
            con.commit()
            insert_list = []

    # flush whatever remains in the last partial batch
    if insert_list:
        cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
        con.commit()

    con.close()
    logger.info('Finished processing input: %s, output is: %s', input_file, output)
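
A minimal usage sketch; the file names and field paths below are illustrative, not from the original, and assume a newline-delimited JSON file of tweets:

# hypothetical files; dotted paths follow Twitter's JSON layout
make_sqlite_db_json('tweets.db', 'tweets.json', ['id_str', 'user.screen_name', 'text'])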
Example #2
    def dump_to_sqlite_db(self, output_db, input_fields, top_level=False):
        def replace_none(s):
            # store missing values as the string 'NULL'
            if s is None:
                return 'NULL'
            return s

        tweet_parser = TweetParser()
        # dotted field paths become column names: 'user.id' -> 'user__id'
        column_str = ','.join(input_fields).replace('.', '__')
        question_marks = ','.join('?' for _ in input_fields)

        con = sqlite3.connect(output_db)
        cur = con.cursor()
        cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

        insert_list = []
        # batch insert every 10k tweets
        for count, tweet in enumerate(self.get_iterator()):
            if top_level:
                ret = list(zip(input_fields, [tweet.get(field) for field in input_fields]))
            else:
                ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields)
            row = [replace_none(col_val[1]) for col_val in ret]
            insert_list.append(tuple(row))
            if (count + 1) % 10000 == 0:
                cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
                con.commit()
                insert_list = []
        # flush the final partial batch
        if insert_list:
            cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
            con.commit()
        con.close()
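
A usage sketch for this method, assuming a JsonCollection like the one in Example #1 (the file names are illustrative):

collection = JsonCollection('tweets.json')  # hypothetical input file
collection.dump_to_sqlite_db('tweets.db', ['id_str', 'user.screen_name'])
# top_level=True reads each field directly off the tweet dict instead of walking dotted paths
collection.dump_to_sqlite_db('flat.db', ['id_str', 'text'], top_level=True)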
Example #3
    def get_iterator(self):
        tweet_parser = TweetParser()
        if self.compression == 'bz2':
            self.mode = binary_mode(self.mode)
            csv_handle = bz2.open(self.filepath,
                                  self.mode,
                                  encoding=self.encoding)
        elif self.compression == 'gzip':
            self.mode = binary_mode(self.mode)
            csv_handle = gzip.open(self.filepath,
                                   self.mode,
                                   encoding=self.encoding)
        else:
            csv_handle = open(self.filepath, self.mode, encoding=self.encoding)
        for count, tweet in enumerate(csv.DictReader(csv_handle)):
            # stop once the configured limit is reached (0 means no limit)
            if self.limit < count + 1 and self.limit != 0:
                csv_handle.close()
                return
            elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
            and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield dict(tweet)
        csv_handle.close()
Example #4
    def dump_to_csv(self, output_csv, input_fields):
        count = 0
        tweet_parser = TweetParser()
        filehandle = open(output_csv, 'wb')
        writer = unicodecsv.writer(filehandle)

        expanded_fields = []
        expanded_fields_list_keys = []

        # split each dotted path; a trailing digit means "index into a list",
        # e.g. 'entities.user_mentions.0'
        for field_path in input_fields:
            fields = field_path.split('.')
            if fields[-1].isdigit():
                expanded_fields_list_keys.append((fields[:-1], fields[-1]))
                if fields[:-1] not in expanded_fields:
                    expanded_fields.append(fields[:-1])
            else:
                expanded_fields.append(fields)

        for tweet in self.get_iterator():
            # use json.loads and not json_util
            # to get a regular dict
            tweet = json.loads(json_util.dumps(tweet))
            row_to_write = []
            flat_tweet_list = []

            # flatten each tweet, and put the resulting
            # (key path, value) tuples in a list
            for flat_entry in tweet_parser.flatten_dict(tweet):
                flat_tweet_list.append(flat_entry)

            # write a header if it's the first tweet
            if count == 0:
                writer.writerow(input_fields)
                count += 1

            # if a flattened key path is one the user asked for,
            # add its value to the row
            for expanded_field in expanded_fields:
                for tweet_tuple in flat_tweet_list:
                    if tweet_tuple[0] == expanded_field:
                        if isinstance(tweet_tuple[1], list):
                            # for each requested array index
                            for list_key in expanded_fields_list_keys:
                                if list_key[0] == tweet_tuple[0] and int(list_key[1]) < len(tweet_tuple[1]):
                                    row_to_write.append(json_util.dumps(tweet_tuple[1][int(list_key[1])]))
                                else:
                                    row_to_write.append('None')
                        else:
                            if isinstance(tweet_tuple[1], str):
                                row_to_write.append(tweet_tuple[1].encode('utf-8').decode('utf-8'))
                            else:
                                row_to_write.append(tweet_tuple[1])

            # unicodecsv handles encoding when the row is written
            writer.writerow(row_to_write)
        filehandle.close()
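
A usage sketch: a trailing numeric path segment indexes into a list-valued field, so 'entities.user_mentions.0' selects the first mention, and indices past the end of the list are written as 'None' (the names here are illustrative):

collection.dump_to_csv('mentions.csv', ['id_str', 'entities.user_mentions.0'])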
Example #5
    def get_iterator(self):
        tweet_parser = TweetParser()
        mongo_cursor = self.mongo_collection.find(
            filter=self.filter,
            no_cursor_timeout=False,
            limit=self.limit
        )
        for tweet in mongo_cursor:
            if tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet
        mongo_cursor.close()
Example #6
    def get_iterator(self):
        tweet_parser = TweetParser()
        bson_handle = open(self.filepath, 'rb')
        for count, tweet in enumerate(bson.decode_file_iter(bson_handle)):
            if self.limit < count + 1 and self.limit != 0:
                bson_handle.close()
                return
            elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
            and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet
        bson_handle.close()
Example #7
    def get_iterator(self):
        tweet_parser = TweetParser()
        json_handle = open(self.filepath, 'r')
        for count, tweet in enumerate(json_handle):
            tweet = json_util.loads(tweet)
            if self.limit != 0 and self.limit <= count:
                json_handle.close()
                return
            elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
            and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet
        json_handle.close()
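
Each of these get_iterator implementations is a generator, so callers can stream tweets without loading the whole file; a hypothetical consumption loop:

for tweet in collection.get_iterator():
    print(tweet['id_str'])  # assumes this field survives any stripping or filtering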
Example #8
    def get_iterator(self):
        tweet_parser = TweetParser()
        csv_handle = open(self.filepath, "rb")
        for count, tweet in enumerate(unicodecsv.DictReader(csv_handle)):
            if self.limit < count + 1 and self.limit != 0:
                csv_handle.close()
                return
            elif tweet_parser.tweet_passes_filter(self.filter, tweet) and tweet_parser.tweet_passes_custom_filter_list(
                self.custom_filters, tweet
            ):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet
        csv_handle.close()
Example #9
    def test_strip_tweets_keeps_fields(self):
        tweet_parser = TweetParser()
        collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid'])
        self.maxDiff = None
        it = collection.strip_tweets(['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()

        def tweets_have_right_keys(iterator, fields):
            for tweet in iterator:
                keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
                for elem in fields:
                    if elem not in keys:
                        return False
            return True

        self.assertTrue(tweets_have_right_keys(it, [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]))
Example #10
    def dump_to_csv(self, output_csv, input_fields, write_header=True, top_level=False, mode='a', encoding='utf-8', compression=None):
        if compression == 'bz2':
            mode = binary_mode(mode)
            filehandle = bz2.open(output_csv, mode)
        elif compression == 'gzip':
            mode = binary_mode(mode)
            filehandle = gzip.open(output_csv, mode)
        else:
            filehandle = open(output_csv, mode)

        writer = csv.writer(filehandle)
        if write_header:
            writer.writerow(input_fields)
        tweet_parser = TweetParser()

        for tweet in self.get_iterator():
            if top_level:
                ret = list(zip(input_fields, [tweet.get(field) for field in input_fields]))
            else:
                ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields)
            ret_values = [col_val[1] for col_val in ret]
            writer.writerow(ret_values)
        filehandle.close()
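
A usage sketch for the compressed variant; the default mode is 'a' (append), so pass write_header=False when adding rows to an existing file (the output names are illustrative):

collection.dump_to_csv('tweets.csv.gz', ['id_str', 'text'], compression='gzip')
collection.dump_to_csv('tweets.csv.gz', ['id_str', 'text'],
                       write_header=False, compression='gzip')  # append more rows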
Example #11
    def get_iterator(self):
        tweet_parser = TweetParser()
        if self.compression == 'bz2':
            self.mode = binary_mode(self.mode)
            json_handle = bz2.open(self.filepath,
                                   self.mode,
                                   encoding=self.encoding)
        elif self.compression == 'gzip':
            self.mode = binary_mode(self.mode)
            json_handle = gzip.open(self.filepath,
                                    self.mode,
                                    encoding=self.encoding)
        else:
            json_handle = open(self.filepath,
                               self.mode,
                               encoding=self.encoding)
        bad_lines = 0
        count = -1  # stays -1 if the file is empty
        for count, tweet in enumerate(json_handle):
            if not self.throw_error:
                try:
                    tweet = json_util.loads(tweet)
                except ValueError:
                    # skip corrupt lines rather than passing raw strings downstream
                    bad_lines += 1
                    continue
            else:
                tweet = json_util.loads(tweet)
            if self.limit != 0 and self.limit <= count:
                json_handle.close()
                return
            elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
            and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet
        if self.verbose:
            print("{} rows are ok.".format(count + 1 - bad_lines))
            print("{} rows are corrupt.".format(bad_lines))
        json_handle.close()
Example #12
    def test_strip_tweets_keeps_fields(self):
        tweet_parser = TweetParser()
        collection = CsvCollection(
            os.path.dirname(os.path.realpath(__file__)) + '/' +
            config['csv']['valid'])
        self.maxDiff = None
        it = collection.strip_tweets(['source', 'text',
                                      'id_str']).get_iterator()

        def tweets_have_right_keys(iterator, fields):
            for tweet in iterator:
                keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
                for elem in fields:
                    if elem not in keys:
                        return False
            return True

        self.assertTrue(
            tweets_have_right_keys(it, [['source'], ['text'], ['id_str']]))
Example #13
    def test_strip_tweets_keeps_fields(self):
        tweet_parser = TweetParser()
        collection = MongoCollection(config['mongo']['host'],
                                     config['mongo']['port'],
                                     config['mongo']['user'],
                                     config['mongo']['password'],
                                     config['mongo']['database'],
                                     config['mongo']['collection'])
        self.maxDiff = None
        it = collection.set_limit(10).strip_tweets(
            ['id', 'entities.user_mentions',
             'user.profile_image_url_https']).get_iterator()

        def tweets_have_right_keys(iterator, fields):
            for tweet in iterator:
                keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
                for elem in fields:
                    if elem not in keys:
                        return False
            return True

        self.assertTrue(
            tweets_have_right_keys(it, [['id'], ['entities', 'user_mentions'],
                                        ['user', 'profile_image_url_https']]))