def main():
    options = get_cmd_line_args()
    config = util.load_unstripped_json(options.config)
    g_logger.info("Fetching failed jobs from progress db")
    start_dt = date_util.from_date_iso(options.start_date)
    mongo = gae_download.open_db_conn(config)
    coordinator_cfg = config["coordinator_cfg"]
    # Don't touch tasks that were recently started.
    # TODO(yunfang): parameterize this
    two_hours_ago = datetime.datetime.now() - datetime.timedelta(hours=2)
    results = ka_download_coordinator.get_failed_jobs(mongo, coordinator_cfg)
    if not results:
        g_logger.info("Empty result set. Nothing to reprocess.")
        exit(0)
    for rec in results:
        if rec["history"]["1"] < start_dt:
            continue
        if rec["history"]["1"] >= two_hours_ago:
            # Started less than 2 hours ago
            continue
        # Reprocess
        fetch_interval = config['kinds'][rec['kind']][1]
        gae_download.fetch_and_process_data(rec["kind"], rec["start_dt"],
                                            rec["end_dt"], fetch_interval,
                                            config)
    g_logger.info("Done reprocessing!!")

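# Illustrative only: a minimal record of the shape the reprocessing loop
# above consumes. The field names come from the loop itself; the values
# are hypothetical.
example_failed_job = {
    "kind": "ProblemLog",  # hypothetical GAE kind
    "start_dt": datetime.datetime(2012, 5, 1, 0, 0),
    "end_dt": datetime.datetime(2012, 5, 1, 1, 0),
    # history["1"] appears to record when the first attempt started
    "history": {"1": datetime.datetime(2012, 5, 1, 0, 5)},
}
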
def main():
    options = get_cmd_line_args()
    config = util.load_unstripped_json(options.config)
    # Hard-code some args
    config['max_threads'] = 2
    config['coordinator_cfg']['control_db'] = "ka_backpopulate_cntrl"
    config["sub_process_time_out"] = 86400 * 3  # 3 days
    with open(options.file_list) as f:
        file_list = f.readlines()
    processes = []
    for gzfile in file_list:
        # Block until a worker slot frees up, checking on the running
        # subprocesses while we wait.
        while True:
            if len(active_children()) < config['max_threads']:
                g_logger.info("Starting loading %s ...", gzfile)
                p = Process(target=gz_pickle_to_mongo,
                            args=(config, gzfile.strip()))
                processes.append((p, gzfile.strip(), time.time()))
                p.start()
                time.sleep(5)
                break
            else:
                monitor(config, processes)
                time.sleep(10)
    # Wait for the remaining subprocesses to drain.
    while len(active_children()) > 0:
        monitor(config, processes)
        time.sleep(10)

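# monitor() is referenced above but not shown here. A minimal sketch of
# what it plausibly does, given that `processes` holds
# (Process, filename, start_time) tuples and the config carries
# "sub_process_time_out" -- an assumption, not the repo's actual code:
def monitor_sketch(config, processes):
    """Terminate loader subprocesses that have exceeded the timeout."""
    now = time.time()
    for proc, gzfile, started in processes:
        if proc.is_alive() and now - started > config["sub_process_time_out"]:
            g_logger.error("Loading %s timed out; terminating", gzfile)
            proc.terminate()
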
def main():
    options = get_cmd_line_args()
    # NOTE: the Mongo connection specified on the command line will
    # override what may be specified in the config file for gae_download.
    mongo_conn = pymongo.Connection(options.server, options.port)
    config = util.load_unstripped_json(options.config)
    insert_dumpfile_into_mongo(options.file, config, mongo_conn,
                               options.offset, options.limit)

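# get_cmd_line_args() is defined elsewhere in the script. For context, a
# minimal optparse sketch of the options this main() reads; the flag
# names and defaults are assumptions:
import optparse

def get_cmd_line_args_sketch():
    parser = optparse.OptionParser()
    parser.add_option("--server", default="localhost",
                      help="mongo host to load the dump into")
    parser.add_option("--port", type="int", default=27017)
    parser.add_option("--config", help="path to the analytics config json")
    parser.add_option("--file", help="path to the GAE dump file")
    parser.add_option("--offset", type="int", default=0)
    parser.add_option("--limit", type="int", default=0)
    options, _ = parser.parse_args()
    return options
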
def get_db(db_name, config_location):
    """Return a pymongo Database reference as configured in 'config'."""
    config = util.load_unstripped_json(config_location)
    db_config = config['databases']['mongo'][db_name]
    server_name = db_config['server']
    mongo_db_name = db_config['database']
    return get_connection(server_name, config_location)[mongo_db_name]

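# Example usage; the logical db name "report" and the config path are
# hypothetical:
#
#     report_db = get_db("report", "analytics_config.json")
#     doc = report_db.some_collection.find_one()
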
def get_connection(mongo_server_name, config_location):
    """Return a pymongo.Connection to the named server.

    NOTE: mongo_server_name is not the hostname of the machine or the
    name of the EC2 instance; it is the name given to the mongo server
    in the main analytics config file, the location of which is the
    second argument.
    """
    config = util.load_unstripped_json(config_location)
    server_config = config['servers']['mongo'][mongo_server_name]
    host = server_config['host']
    port = server_config['port']
    return pymongo.Connection(host, port)

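# A sketch of the config layout that get_db()/get_connection() index
# into. The keys are taken straight from the code above; the server and
# database names are made up:
example_config = {
    "servers": {
        "mongo": {
            "main": {"host": "10.0.0.1", "port": 27017},
        },
    },
    "databases": {
        "mongo": {
            "report": {"server": "main", "database": "report"},
        },
    },
}
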
def main():
    options = get_cmd_line_args()
    config = load_unstripped_json(options.config)
    for key in DEFAULT_DOWNLOAD_SETTINGS.keys():
        if key not in config:
            config[key] = DEFAULT_DOWNLOAD_SETTINGS[key]
    if options.start_date and options.end_date:
        start_dt = date_util.from_date_iso(options.start_date)
        end_dt = date_util.from_date_iso(options.end_date)
    else:
        # No explicit window given: snap to the most recent complete
        # processing interval.
        ts = time.time()
        end_ts = ts - (ts % int(options.proc_interval))
        start_ts = end_ts - int(options.proc_interval)
        start_dt = dt.datetime.fromtimestamp(start_ts)
        end_dt = dt.datetime.fromtimestamp(end_ts)
    if options.archive_dir:
        # Override the archive directory, if specified.
        config['archive_dir'] = options.archive_dir
    start_data_process(config, start_dt, end_dt)

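# Worked example of the window arithmetic above (values hypothetical):
# with proc_interval = 3600 and ts = 1339180230,
#   end_ts   = 1339180230 - (1339180230 % 3600) = 1339178400
#   start_ts = 1339178400 - 3600                = 1339174800
# i.e. [start_dt, end_dt) is the most recent complete hour.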