def dump_to_sqlite_db(self, output_db, input_fields, top_level=False):
    """Dump selected fields from every tweet in this collection into a sqlite db.

    Creates (if needed) a table named ``data`` in ``output_db`` whose columns
    are ``input_fields`` with '.' replaced by '__', then inserts one row per
    tweet in batches of 10000.

    Args:
        output_db: path of the sqlite database file to create/append to.
        input_fields: list of field names to extract from each tweet.
        top_level: if True, fields are read directly via ``tweet.get(field)``;
            otherwise nested fields are resolved with
            ``TweetParser.parse_columns_from_tweet``.
    """
    def replace_none(s):
        # None values are stored as the literal string 'NULL' rather than
        # SQL NULL — kept for backward compatibility with existing dumps.
        return 'NULL' if s is None else s

    # Only needed for the nested-field path; skip construction otherwise.
    tweet_parser = None if top_level else TweetParser()
    column_str = ','.join(input_fields).replace('.', '__')
    question_marks = ','.join('?' for _ in input_fields)
    insert_sql = "INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks)

    con = sqlite3.connect(output_db)
    try:
        cur = con.cursor()
        cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

        insert_list = []
        for tweet in self.get_iterator():
            if top_level:
                ret = list(zip(input_fields,
                               [tweet.get(field) for field in input_fields]))
            else:
                ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields)
            insert_list.append(tuple(replace_none(col_val[1]) for col_val in ret))
            # Batch insert every 10000 rows to bound memory use.
            if len(insert_list) >= 10000:
                cur.executemany(insert_sql, insert_list)
                con.commit()
                insert_list = []
        # Flush whatever is left over; the previous version dropped the tail
        # batch whenever the total row count was >= 10000, and raised
        # NameError on an empty iterator.
        if insert_list:
            cur.executemany(insert_sql, insert_list)
            con.commit()
    finally:
        con.close()
def make_sqlite_db_json(output, input_file, fields):
    """Flatten tweets from a JSON file into a sqlite database.

    Creates (if needed) a table named ``data`` in ``output`` whose columns are
    ``fields`` with '.' replaced by '__', then inserts one row per tweet from
    ``input_file`` in batches of 10000.

    Args:
        output: path of the sqlite database file to create/append to.
        input_file: path of the JSON tweet file read via JsonCollection.
        fields: list of (possibly dotted) field names to extract per tweet.
    """
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)

    def replace_none(s):
        # Defined locally: the previous version called a replace_none that
        # only existed inside dump_to_sqlite_db, raising NameError here.
        # None is stored as the literal string 'NULL' (matching
        # dump_to_sqlite_db), not SQL NULL.
        return 'NULL' if s is None else s

    column_str = ','.join(fields).replace('.', '__')
    question_marks = ','.join('?' for _ in fields)
    insert_sql = "INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks)

    con = sqlite3.connect(output)
    try:
        cur = con.cursor()
        cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

        tp = TweetParser()
        insert_list = []
        for tweet in JsonCollection(input_file).get_iterator():
            ret = tp.parse_columns_from_tweet(tweet, fields)
            insert_list.append(tuple(replace_none(col_val[1]) for col_val in ret))
            # Batch insert every 10000 rows to bound memory use.
            if len(insert_list) >= 10000:
                cur.executemany(insert_sql, insert_list)
                con.commit()
                insert_list = []
        # Flush the remaining rows; the previous version dropped the tail
        # batch whenever the total row count was >= 10000, and raised
        # NameError on an empty input.
        if insert_list:
            cur.executemany(insert_sql, insert_list)
            con.commit()
    finally:
        con.close()
    logger.info('Finished processing input: %s, output is: %s', input_file, output)
def dump_to_csv(self, output_csv, input_fields, write_header=True, top_level=False, mode='a', encoding='utf-8', compression=None):
    """Write selected fields from every tweet in this collection to a CSV file.

    Args:
        output_csv: path of the csv file to write (optionally compressed).
        input_fields: list of field names; one csv column per field.
        write_header: write ``input_fields`` as the first row if True.
        top_level: if True, fields are read directly via ``tweet.get(field)``;
            otherwise nested fields are resolved with
            ``TweetParser.parse_columns_from_tweet``.
        mode: file open mode; converted to binary for compressed output.
        encoding: text encoding for the uncompressed case. (Previously this
            parameter was accepted but silently ignored.)
        compression: None, 'bz2', or 'gzip'.
    """
    if compression == 'bz2':
        # NOTE(review): binary_mode() forces a binary handle here, but
        # csv.writer emits str — verify the compressed paths are exercised.
        mode = binary_mode(mode)
        filehandle = bz2.open(output_csv, mode)
    elif compression == 'gzip':
        mode = binary_mode(mode)
        filehandle = gzip.open(output_csv, mode)
    else:
        # newline='' is required by the csv module so the writer controls
        # line endings (avoids doubled '\r' on Windows); encoding is now
        # actually honored.
        filehandle = open(output_csv, mode, encoding=encoding, newline='')
    try:
        writer = csv.writer(filehandle)
        if write_header:
            writer.writerow(input_fields)
        # Only needed for the nested-field path; skip construction otherwise.
        tweet_parser = None if top_level else TweetParser()
        for tweet in self.get_iterator():
            if top_level:
                ret = list(zip(input_fields,
                               [tweet.get(field) for field in input_fields]))
            else:
                ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields)
            writer.writerow([col_val[1] for col_val in ret])
    finally:
        # Close even if a row fails to serialize; the previous version
        # leaked the handle on any exception mid-write.
        filehandle.close()