def test_auto_sample():
    """auto_sampler should pick 14980 rows per batch for the criteo fixture."""
    fixture_path = 'tests/fixtures/criteo_top30_1m.csv.gz'
    with UI(None, logging.DEBUG, stdout=False) as ui:
        detected_encoding = investigate_encoding_and_dialect(
            fixture_path, None, ui)
        sample_size = auto_sampler(fixture_path, detected_encoding, ui)
        assert sample_size == 14980
        ui.close()
def test_auto_small_dataset():
    """auto_sampler should pick 500 rows per batch for the regression_jp fixture."""
    fixture_path = 'tests/fixtures/regression_jp.csv.gz'
    with UI(None, logging.DEBUG, stdout=False) as ui:
        detected_encoding = investigate_encoding_and_dialect(
            fixture_path, None, ui)
        sample_size = auto_sampler(fixture_path, detected_encoding, ui)
        assert sample_size == 500
def run_batch_predictions(base_url, base_headers, user, pwd,
                          api_token, create_api_token,
                          pid, lid, n_retry, concurrent,
                          resume, n_samples,
                          out_file, keep_cols, delimiter,
                          dataset, pred_name,
                          timeout, ui, fast_mode, auto_sample,
                          dry_run, encoding, skip_dialect,
                          skip_row_id=False,
                          output_delimiter=None,
                          max_batch_size=None):
    """Score ``dataset`` against a prediction endpoint in batches.

    The function detects the dataset encoding/dialect, optionally derives
    the batch size automatically, performs one synchronous request to
    validate credentials (skipped for dry runs), then streams batches
    through a ``Shovel`` into a ``WorkUnitGenerator`` whose work units are
    sent via ``Network.perform_requests``. Progress, checkpoint counts and
    a final summary of warnings/errors are reported through ``ui``.

    Args:
        base_url: URL prefix; ``'<pid>/<lid>/predict'`` is appended to it
            to form the endpoint.
        base_headers: dict of HTTP headers. It is mutated in place: the
            ``User-Agent`` and ``content-type`` entries are set here.
        user: user name forwarded to token acquisition, ``authorize`` and
            the work unit generator.
        pwd: password; when both ``api_token`` and ``pwd`` are falsy it is
            requested via ``ui.getpass()``.
        api_token: API token; when falsy one is obtained with
            ``acquire_api_token``.
        create_api_token: forwarded to ``acquire_api_token``.
        pid: first path component of the endpoint.
        lid: second path component of the endpoint.
        n_retry: forwarded to ``authorize`` and ``RunContext.create``.
        concurrent: concurrency level; passed to ``Network`` and used to
            size the queues (twice this value).
        resume: forwarded to ``RunContext.create``.
        n_samples: rows per batch; replaced by the ``auto_sampler`` result
            when ``auto_sample`` is true.
        out_file: forwarded to ``RunContext.create``.
        keep_cols: forwarded to ``RunContext.create``.
        delimiter: input delimiter used for dialect detection, row peeking
            and the run context.
        dataset: dataset to score (path-like; passed to the helpers as-is).
        pred_name: forwarded to ``RunContext.create`` and the work unit
            generator.
        timeout: forwarded to ``Network``.
        ui: user-interface object used for logging and password prompts.
        fast_mode: selects ``fast_to_csv_chunk`` over ``slow_to_csv_chunk``
            and is forwarded to the helpers.
        auto_sample: when true, ``n_samples`` is overridden.
        dry_run: when true, work units are generated and discarded without
            contacting the server, and authorization is skipped.
        encoding: encoding hint passed to dialect detection; the detected
            value replaces it.
        skip_dialect: forwarded to ``investigate_encoding_and_dialect``.
        skip_row_id: forwarded to ``RunContext.create``.
        output_delimiter: forwarded to dialect detection and
            ``RunContext.create``.
        max_batch_size: forwarded to ``WorkUnitGenerator``; defaults to
            ``MAX_BATCH_SIZE`` when ``None``.

    Returns:
        None.
    """
    if max_batch_size is None:
        max_batch_size = MAX_BATCH_SIZE

    multiprocessing.freeze_support()
    t1 = time()
    # Both inter-process queues are bounded at twice the concurrency level.
    queue_size = concurrent * 2
    # provide version info and system info in user-agent
    base_headers['User-Agent'] = 'datarobot_batch_scoring/{}|' \
                                 'Python/{}|{}|system/{}|concurrency/{}' \
                                 ''.format(__version__,
                                           sys.version.split(' ')[0],
                                           requests.utils.default_user_agent(),
                                           platform.system(),
                                           concurrent)

    with ExitStack() as stack:
        # Compare by value: identity comparison against a string literal
        # relies on interning and is flagged by the interpreter.
        if os.name == 'nt':
            # Windows requires an additional manager process. The locks
            # and queues it creates are proxies for objects that exist within
            # the manager itself. It does not perform as well so we only
            # use it when necessary.
            conc_manager = stack.enter_context(multiprocessing.Manager())
        else:
            # You're on a nix of some sort and don't need a manager process.
            conc_manager = multiprocessing
        queue = conc_manager.Queue(queue_size)
        deque = conc_manager.Queue(queue_size)
        lock = conc_manager.Lock()
        rlock = conc_manager.RLock()

        if not api_token:
            if not pwd:
                pwd = ui.getpass()
            try:
                api_token = acquire_api_token(base_url, base_headers, user,
                                              pwd, create_api_token, ui)
            except Exception as e:
                # NOTE(review): ui.fatal presumably terminates the run --
                # confirm; otherwise api_token stays unset below.
                ui.fatal(str(e))

        base_headers['content-type'] = 'text/csv; charset=utf8'
        endpoint = base_url + '/'.join((pid, lid, 'predict'))
        encoding = investigate_encoding_and_dialect(
            dataset=dataset,
            sep=delimiter, ui=ui,
            fast=fast_mode,
            encoding=encoding,
            skip_dialect=skip_dialect,
            output_delimiter=output_delimiter)
        if auto_sample:
            # override n_sample
            n_samples = auto_sampler(dataset, encoding, ui)
            ui.info('auto_sample: will use batches of {} rows'
                    ''.format(n_samples))

        # Make a sync request to check authentication and fail early
        first_row = peek_row(dataset, delimiter, ui, fast_mode, encoding)
        ui.debug('First row for auth request: {}'.format(first_row))
        if fast_mode:
            chunk_formatter = fast_to_csv_chunk
        else:
            chunk_formatter = slow_to_csv_chunk
        first_row_data = chunk_formatter(first_row.data, first_row.fieldnames)
        first_row = first_row._replace(data=first_row_data)
        if not dry_run:
            authorize(user, api_token, n_retry, endpoint, base_headers,
                      first_row, ui)

        ctx = stack.enter_context(
            RunContext.create(resume, n_samples, out_file, pid,
                              lid, keep_cols, n_retry, delimiter,
                              dataset, pred_name, ui, fast_mode,
                              encoding, skip_row_id, output_delimiter, lock))
        network = stack.enter_context(Network(concurrent, timeout, ui))
        # Remember how many checkpoints already existed so that only the
        # batches checkpointed during this run are counted later.
        n_batches_checkpointed_init = len(ctx.db['checkpoints'])
        ui.debug('number of batches checkpointed initially: {}'
                 .format(n_batches_checkpointed_init))

        MGBQ = MultiprocessingGeneratorBackedQueue(ui, queue, deque, rlock)
        batch_generator_args = ctx.batch_generator_args()
        shovel = Shovel(queue, batch_generator_args, ui)
        ui.info('Shovel go...')
        t2 = time()
        shovel.go()
        ui.info('shoveling complete | total time elapsed {}s'
                .format(time() - t2))

        work_unit_gen = WorkUnitGenerator(MGBQ,
                                          endpoint,
                                          headers=base_headers,
                                          user=user,
                                          api_token=api_token,
                                          ctx=ctx,
                                          pred_name=pred_name,
                                          fast_mode=fast_mode,
                                          ui=ui,
                                          max_batch_size=max_batch_size)
        t0 = time()

        if dry_run:
            # Exhaust the generator without sending anything.
            for _ in work_unit_gen:
                pass
            ui.info('dry-run complete | time elapsed {}s'.format(time() - t0))
            ui.info('dry-run complete | total time elapsed {}s'.format(
                time() - t1))
        else:
            responses = network.perform_requests(work_unit_gen)
            for i, _ in enumerate(responses, 1):
                ui.info('{} responses sent | time elapsed {}s'
                        .format(i, time() - t0))

            ui.debug('list of checkpointed batches: {}'
                     .format(sorted(ctx.db['checkpoints'])))
            n_batches_checkpointed = (len(ctx.db['checkpoints']) -
                                      n_batches_checkpointed_init)
            ui.debug('number of batches checkpointed: {}'
                     .format(n_batches_checkpointed))
            # Batches that were consumed but never checkpointed were lost.
            n_batches_not_checkpointed = (work_unit_gen.queue.n_consumed -
                                          n_batches_checkpointed)
            batches_missing = n_batches_not_checkpointed > 0
            if batches_missing:
                ui.fatal(('scoring incomplete, {} batches were dropped | '
                          'time elapsed {}s')
                         .format(n_batches_not_checkpointed, time() - t0))
            else:
                ui.info('scoring complete | time elapsed {}s'
                        .format(time() - t0))
                ui.info('scoring complete | total time elapsed {}s'
                        .format(time() - t1))

            # Each checkpoint is a pair whose second item is the batch length.
            total_done = sum(batch_len
                             for _, batch_len in ctx.db["checkpoints"])

            total_lost = 0
            for bucket in ("warnings", "errors"):
                ui.info('==== Scoring {} ===='.format(bucket))
                msg_data = ctx.db[bucket]
                if not msg_data:
                    continue
                for batch_id in sorted(msg_data):
                    # The first message carries the batch id; the rest are
                    # printed as continuation lines.
                    for n, msg in enumerate(msg_data[batch_id]):
                        if n == 0:
                            ui.info("{}: {}".format(batch_id, msg))
                        else:
                            ui.info(" {}".format(msg))

                    if bucket == "errors":
                        # batch_id[1] is added as the lost count for the batch.
                        total_lost += batch_id[1]

            ui.info('==== Total stats ====')
            ui.info("done: {} lost: {}".format(total_done, total_lost))