Example no. 1
def main():
    options = get_cmd_line_args()
    config = util.load_unstripped_json(options.config)
    g_logger.info("Fetching failed jobs from progress db")
    start_dt = date_util.from_date_iso(options.start_date)
    mongo = gae_download.open_db_conn(config)
    coordinator_cfg = config["coordinator_cfg"]
    # Don't touch tasks that were recently started
    # TODO(yunfang): parameterize this
    two_hours_ago = datetime.datetime.now() - datetime.timedelta(hours=2)
    results = ka_download_coordinator.get_failed_jobs(mongo, coordinator_cfg)
    if not results:
        g_logger.info("Empty result set. Nothing to reprocess.")
        exit(0)
    for rec in results:
        if rec["history"]["1"] < start_dt:
            continue
        if rec["history"]["1"] >= two_hours_ago:
            # Started less than 2 hours ago
            continue
        # Reprocess
        fetch_interval = config['kinds'][rec['kind']][1]
        gae_download.fetch_and_process_data(rec["kind"], rec["start_dt"],
                                            rec["end_dt"], fetch_interval,
                                            config)
    g_logger.info("Done reprocessing!!")
Example no. 2
def main():
    options = get_cmd_line_args()
    config = util.load_unstripped_json(options.config)
    # Hard-code some args
    config['max_threads'] = 2
    config['coordinator_cfg']['control_db'] = "ka_backpopulate_cntrl"
    config["sub_process_time_out"] = 86400 * 3  # 3 days, in seconds
    with open(options.file_list) as f:
        file_list = f.readlines()
    processes = []
    for gzfile in file_list:
        while True:
            if len(active_children()) < config['max_threads']:
                g_logger.info("Starting loading %s ...", gzfile)
                p = Process(target=gz_pickle_to_mongo,
                            args=(config, gzfile.strip()))
                processes.append((p, gzfile.strip(), time.time()))
                p.start()
                time.sleep(5)
                break
            else:
                monitor(config, processes)
                time.sleep(10)
    while len(active_children()) > 0:
        monitor(config, processes)
        time.sleep(10)
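
The monitor helper is not shown on this page. Given the sub_process_time_out key set above, a minimal sketch of what it plausibly does (an assumption, not the project's actual implementation) is to terminate workers that have run past the timeout:

import time

def monitor(config, processes):
    # Hypothetical sketch: kill workers that have exceeded
    # config["sub_process_time_out"]. g_logger is the module logger
    # used in the example above.
    timeout = config["sub_process_time_out"]
    for proc, filename, started_at in processes:
        if proc.is_alive() and time.time() - started_at > timeout:
            g_logger.error("Loading %s timed out; terminating", filename)
            proc.terminate()
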
Example no. 3
def main():
    options = get_cmd_line_args()
    # NOTE: the Mongo connection specified on the command line overrides
    # whatever may be specified in the config file for gae_download
    mongo_conn = pymongo.Connection(options.server, options.port)
    config = util.load_unstripped_json(options.config)
    insert_dumpfile_into_mongo(options.file, config, mongo_conn,
                               options.offset, options.limit)
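
Note that pymongo.Connection is the legacy constructor and was removed in pymongo 3.0 in favor of MongoClient; on a modern driver the equivalent line would be:

import pymongo

# options.server / options.port as parsed above.
mongo_conn = pymongo.MongoClient(options.server, options.port)
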
Example no. 4
def get_db(db_name, config_location):
    """Return a pymongo Database reference as configured in 'config'."""
    config = util.load_unstripped_json(config_location)
    db_config = config['databases']['mongo'][db_name]

    server_name = db_config['server']
    mongo_db_name = db_config['database']

    return get_connection(server_name, config_location)[mongo_db_name]
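
A hypothetical call, with illustrative database and config-file names:

# Both names are illustrative; the real ones live in the analytics config.
report_db = get_db('report', 'analytics_config.json')
print(report_db.name)
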
Example no. 5
def get_connection(mongo_server_name, config_location):
    """Return a pymongo.Connection to the named server.
    
    NOTE: the mongo_server_name is not the hostname of the machine or 
    the name of EC2 instance, it is the name given to the mongo server in 
    the main analytics config file, the location of which is 
    the second argument.
    """
    config = util.load_unstripped_json(config_location)
    server_config = config['servers']['mongo'][mongo_server_name]

    host = server_config['host']
    port = server_config['port']

    return pymongo.Connection(host, port)
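
Taken together, get_db and get_connection imply a config layout roughly like the following, shown here as a Python dict with purely illustrative names:

config = {
    'servers': {
        'mongo': {
            'analytics-primary': {'host': 'localhost', 'port': 27017},
        },
    },
    'databases': {
        'mongo': {
            'report': {'server': 'analytics-primary', 'database': 'report'},
        },
    },
}
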
Example no. 6
def main():
    options = get_cmd_line_args()
    config = load_unstripped_json(options.config)
    for key, value in DEFAULT_DOWNLOAD_SETTINGS.items():
        config.setdefault(key, value)
    if options.start_date and options.end_date:
        start_dt = date_util.from_date_iso(options.start_date)
        end_dt = date_util.from_date_iso(options.end_date)
    else:
        # No explicit dates given: align to the most recent proc_interval
        # boundary and process the preceding interval.
        ts = time.time()
        end_ts = ts - (ts % int(options.proc_interval))
        start_ts = end_ts - int(options.proc_interval)
        start_dt = dt.datetime.fromtimestamp(start_ts)
        end_dt = dt.datetime.fromtimestamp(end_ts)
    if options.archive_dir:
        # Override the archive directory, if specified.
        config['archive_dir'] = options.archive_dir
    start_data_process(config, start_dt, end_dt)
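
To make the interval alignment concrete, here is a small worked example of the bucketing arithmetic above, using round numbers:

proc_interval = 3600                 # one-hour buckets (illustrative)
ts = 10000.0                         # example "current time", in seconds
end_ts = ts - (ts % proc_interval)   # 7200.0: floored to the boundary
start_ts = end_ts - proc_interval    # 3600.0: the previous boundary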