def fetch_and_process_data(kind, start_dt_arg, end_dt_arg, fetch_interval,
                           config):
    """Fetch entities of `kind`, archive them to disk, and record progress.

    Downloads all entities of the given kind in the [start_dt_arg,
    end_dt_arg) window, then saves them twice: a gzipped pickle of the raw
    protobuf list, and a gzipped tab-separated "key<TAB>json" file.
    Progress milestones (STARTED/FETCHED/SAVED/LOADED) are recorded in
    MongoDB via kdc.record_progress.

    Arguments:
        kind: datastore kind name; must be a key of config["kinds"].
        start_dt_arg: start of the fetch window (datetime).
        end_dt_arg: end of the fetch window (datetime).
        fetch_interval: chunking interval passed to download_entities.
        config: download-config dict (coordinator_cfg, kinds, max_logs,
            max_tries, ...).
    """
    mongo = open_db_conn(config)
    kdc.record_progress(mongo, config["coordinator_cfg"], kind, start_dt_arg,
                        end_dt_arg, kdc.DownloadStatus.STARTED)

    # fetch
    g_logger.info("Downloading data for %s from %s to %s starts" %
                  (kind, start_dt_arg, end_dt_arg))
    is_ndb = bool(config["kinds"][kind][3])
    entity_list = fetch_entities.download_entities(
        kind, is_ndb, start_dt_arg, end_dt_arg, fetch_interval,
        config["max_logs"], config["max_tries"],
        "backup_timestamp",  # TODO(jace): make configurable
        verbose=False,
    )
    g_logger.info(
        "Data downloaded for %s from %s to %s.# rows: %d finishes" %
        (kind, start_dt_arg, end_dt_arg, len(entity_list)))
    kdc.record_progress(mongo, config["coordinator_cfg"], kind, start_dt_arg,
                        end_dt_arg, kdc.DownloadStatus.FETCHED)

    # save to a file
    # TODO(yunfang): revisit if we should save the pickled pb
    archived_file = get_archive_file_name(config, kind, start_dt_arg,
                                          end_dt_arg, "pickle")
    with open(archived_file, "wb") as f:
        pickle.dump(entity_list, f)
    ret = subprocess.call(["gzip", "-f", archived_file])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" %
                      (len(entity_list), archived_file))
    else:
        g_logger.error("Cannot gzip %s" % (archived_file))

    # jsonize the entities
    json_filename = get_archive_file_name(config, kind, start_dt_arg,
                                          end_dt_arg, "json")
    json_key = config["kinds"][kind][4]
    # with-block so the file is closed even if pb_to_dict/json.dumps raises
    # mid-loop (the original bare open()/close() leaked the handle on error).
    with open(json_filename, "wb") as f:
        for pb in entity_list:
            doc = load_pbufs_to_hive.pb_to_dict(pb)
            json_str = json.dumps(doc)
            f.write("%s\t%s\n" % (doc[json_key], json_str))
    ret = subprocess.call(["gzip", "-f", json_filename])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" %
                      (len(entity_list), json_filename))
    else:
        g_logger.error("Cannot gzip %s" % (json_filename))

    kdc.record_progress(mongo, config["coordinator_cfg"], kind, start_dt_arg,
                        end_dt_arg, kdc.DownloadStatus.SAVED)
    # we used to load into mongoDB; we don't anymore, but still set the flag.
    kdc.record_progress(mongo, config["coordinator_cfg"], kind, start_dt_arg,
                        end_dt_arg, kdc.DownloadStatus.LOADED)
def jsonify_downloaded_file(kind, config):
    """Convert a downloaded sqlite .dat archive for `kind` into gzipped json.

    Reads protobuf rows from the `result` table of <base>.dat, converts each
    to a dict, writes one "key<TAB>json" line per entity to <base>.json, and
    finally gzips the json file in place.

    Arguments:
        kind: datastore kind name used to derive the archive base filename.
        config: download-config dict passed through to filename().
    """
    file_beginning = filename(kind, config)
    dat_filename = "%s.dat" % file_beginning
    json_filename = "%s.json" % file_beginning
    # isolation_level=None: autocommit mode; we only read, so no txn needed.
    sqlite_conn = sqlite3.connect(dat_filename, isolation_level=None)
    try:
        cursor = sqlite_conn.cursor()
        cursor.execute('SELECT id, value FROM result')
        # with-block guarantees the output file is closed even if a row
        # conversion raises (the original bare open()/close() leaked it).
        with open(json_filename, 'wb') as f:
            for unused_entity_id, pb in cursor:
                doc = load_pbufs_to_hive.pb_to_dict(pb)
                json_str = json.dumps(doc)
                f.write("%s\t%s\n" % (doc['key'], json_str))
    finally:
        # close the connection on both success and error paths
        sqlite_conn.close()
    echo_system("gzip -f %s" % json_filename)
def jsonify_downloaded_file(kind, config):
    """Convert a downloaded sqlite .dat archive for `kind` into gzipped json.

    Reads protobuf rows from the `result` table of <base>.dat, converts each
    to a dict (including parent-key information via parent=True), writes one
    "key<TAB>json" line per entity to <base>.json, and finally gzips the
    json file in place.

    Arguments:
        kind: datastore kind name used to derive the archive base filename.
        config: download-config dict passed through to filename().
    """
    file_beginning = filename(kind, config)
    dat_filename = "%s.dat" % file_beginning
    json_filename = "%s.json" % file_beginning
    # isolation_level=None: autocommit mode; we only read, so no txn needed.
    sqlite_conn = sqlite3.connect(dat_filename, isolation_level=None)
    try:
        cursor = sqlite_conn.cursor()
        cursor.execute('SELECT id, value FROM result')
        # with-block guarantees the output file is closed even if a row
        # conversion raises (the original bare open()/close() leaked it).
        with open(json_filename, 'wb') as f:
            for unused_entity_id, pb in cursor:
                doc = load_pbufs_to_hive.pb_to_dict(pb, parent=True)
                json_str = json.dumps(doc)
                f.write("%s\t%s\n" % (doc['key'], json_str))
    finally:
        # close the connection on both success and error paths
        sqlite_conn.close()
    echo_system("gzip -f %s" % json_filename)
def fetch_and_process_data(kind, start_dt_arg, end_dt_arg, fetch_interval,
                           config):
    """Fetch entities of `kind`, archive them to disk, and record progress.

    Downloads all entities of the given kind in the [start_dt_arg,
    end_dt_arg) window, then saves them twice: a gzipped pickle of the raw
    protobuf list, and a gzipped tab-separated "key<TAB>json" file.  Each
    entity's backup_timestamp is checked against the requested window via
    log_timestamp_outside_window.  Progress milestones are recorded in
    MongoDB only when config['dbhost'] is set.

    Arguments:
        kind: datastore kind name; must be a key of config['kinds'].
        start_dt_arg: start of the fetch window (datetime).
        end_dt_arg: end of the fetch window (datetime).
        fetch_interval: chunking interval passed to download_entities.
        config: download-config dict (dbhost, coordinator_cfg, kinds,
            max_logs, max_tries, ...).
    """
    if config['dbhost']:
        mongo = open_db_conn(config)
        kdc.record_progress(mongo, config['coordinator_cfg'], kind,
                            start_dt_arg, end_dt_arg,
                            kdc.DownloadStatus.STARTED)

    # fetch
    g_logger.info("Downloading data for %s from %s to %s starts" % (
        kind, start_dt_arg, end_dt_arg))
    is_ndb = bool(config['kinds'][kind][3])
    entity_list = fetch_entities.download_entities(
        kind, is_ndb, start_dt_arg, end_dt_arg, fetch_interval,
        config['max_logs'], config['max_tries'],
        "backup_timestamp",  # TODO(jace): make configurable
        verbose=False)
    g_logger.info(
        "Data downloaded for %s from %s to %s.# rows: %d finishes" % (
            kind, start_dt_arg, end_dt_arg, len(entity_list)))
    if config['dbhost']:
        kdc.record_progress(mongo, config['coordinator_cfg'], kind,
                            start_dt_arg, end_dt_arg,
                            kdc.DownloadStatus.FETCHED)

    # save to a file
    # TODO(yunfang): revisit if we should save the pickled pb
    archived_file = get_archive_file_name(config, kind, start_dt_arg,
                                          end_dt_arg, 'pickle')
    with open(archived_file, 'wb') as f:
        pickle.dump(entity_list, f)
    ret = subprocess.call(["gzip", "-f", archived_file])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
                                                  archived_file))
    else:
        g_logger.error("Cannot gzip %s" % (archived_file))

    # jsonize the entities
    json_filename = get_archive_file_name(config, kind, start_dt_arg,
                                          end_dt_arg, 'json')
    json_key = config['kinds'][kind][4]
    # with-block so the json file is closed even if a conversion raises
    # mid-loop (the original bare open()/close() leaked the handle on error).
    with open(json_filename, 'wb') as f:
        for pb in entity_list:
            doc = load_pbufs_to_hive.pb_to_dict(pb)
            # TODO(mattfaus): Make configurable, like for
            # download_entities() above
            log_timestamp_outside_window(
                kind, doc.get('backup_timestamp'), start_dt_arg, end_dt_arg)
            json_str = json.dumps(doc)
            f.write("%s\t%s\n" % (doc[json_key], json_str))
    ret = subprocess.call(["gzip", "-f", json_filename])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
                                                  json_filename))
    else:
        g_logger.error("Cannot gzip %s" % (json_filename))

    if config['dbhost']:
        # `mongo` only exists when dbhost is set, so both of these must stay
        # inside the guard.
        kdc.record_progress(mongo, config['coordinator_cfg'], kind,
                            start_dt_arg, end_dt_arg,
                            kdc.DownloadStatus.SAVED)
        # Well, we didn't actually load the data with this script, but mark
        # it as such anyway.
        kdc.record_progress(mongo, config['coordinator_cfg'], kind,
                            start_dt_arg, end_dt_arg,
                            kdc.DownloadStatus.LOADED)