def _fetch_products_with_thread_pool_executor(
        self,
        links: Iterable[str],
        *,
        max_workers: Optional[int] = None,
) -> Iterator[Product]:
    """Fetch products concurrently.

    Returns an iterator over the results of the ``_get_product`` method.

    Note: consuming the iterator may raise an error that occurred during
    thread pool execution.

    :param links: links that refer to the product web pages
    :type links: Iterable[str]
    :param max_workers: max workers of the pool
    :type max_workers: Optional[int]
    :return: iterator of products
    :rtype: Iterator[Product]
    """
    thread_pool_params = {
        'max_workers': max_workers,
        'thread_name_prefix': f'{self.__class__.__name__}.dump_products',
    }
    with thread.ThreadPoolExecutor(**thread_pool_params) as executor:
        # _max_workers is a private attribute; read here only for logging.
        logger.info(f'Quantity of used workers in pool: {executor._max_workers}.')
        products_iterator: Iterator[Product] = executor.map(
            self._get_product, links
        )
        return products_iterator
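# Usage sketch (the `scraper` instance and `links` list are assumptions for
# illustration, not part of the original source). Executor.map re-raises a
# worker's exception at the point the failing element is consumed, and the
# iterator cannot be resumed afterwards, so the whole loop is wrapped.
products = []
try:
    for product in scraper._fetch_products_with_thread_pool_executor(
            links, max_workers=8):
        products.append(product)
except Exception:
    logger.exception('Product fetch failed; keeping partial results.')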
def run(self, arguments_set):
    total_executors = min(self.max_thread_workers, len(arguments_set))
    executor = thread.ThreadPoolExecutor(max_workers=total_executors)
    future_items = [
        executor.submit(self.function, argument)
        for argument in arguments_set
    ]
    wait(future_items)
    # Re-raise the first exception captured by any worker.
    for result in future_items:
        result.result()
    executor.shutdown()
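# A minimal alternative sketch, not the original author's method: the same
# semantics with a `with` block, which guarantees the pool is torn down even
# if `future.result()` raises. Names mirror the method above.
from concurrent.futures import ThreadPoolExecutor, wait


def run_with_context(function, arguments_set, max_thread_workers=8):
    total_executors = min(max_thread_workers, max(len(arguments_set), 1))
    with ThreadPoolExecutor(max_workers=total_executors) as executor:
        future_items = [executor.submit(function, argument)
                        for argument in arguments_set]
        wait(future_items)
        for future in future_items:
            future.result()  # re-raise the first worker exception, if any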
def __init__(
    self,
    *args,
    max_workers: Optional[int] = None,
    enable_default_help: bool = True,
    ignore_event_decorator_call: bool = False,
    ignore_overwrite_on_message: bool = False,
    shut_up: bool = False,
    **kwargs,
):
    self._thread_pool = thread.ThreadPoolExecutor(
        max_workers=max_workers,
        thread_name_prefix=f"libneko.clients.{type(self).__name__} worker",
    )

    # Update: we do not need this; Python automatically shuts the pool down
    # at interpreter exit anyway.
    # # Ensure thread pool shuts down.
    # @atexit.register
    # def kill_pool():
    #     try:
    #         self._thread_pool.shutdown(wait=False)
    #     except Exception:
    #         pass

    self._has_logged_out_triggered = False
    self._ignore_event_decorator_call = ignore_event_decorator_call or shut_up
    self._ignore_overwrite_on_message = ignore_overwrite_on_message or shut_up
    super().__init__(*args, **kwargs)

    if not enable_default_help:
        self.remove_command("help")
def init():
    global _downloaders_thread_pool
    if _downloaders_thread_pool:
        _downloaders_thread_pool.shutdown()
    _downloaders_thread_pool = thread.ThreadPoolExecutor(
        int(const.config.max_downloads))
def start(self):
    """Start the service; one pool thread per session."""
    try:
        print(
            f"\033[33m{'-'*10} {f'Listening on {self.ip}:{self.port}':^32} {'-'*10}\033[0m\r\n"
        )
        t = thread.ThreadPoolExecutor(self.thread)
        while True:
            try:
                # Accept a socket connection.
                conn, addr = self.s.accept()
                # One session instance per connection.
                s = session()
                # Copy the settings into the session.
                s.server = self.server
                s.dirlist = self.dirlist
                s.image = self.image
                s.header = self.header
                s.footer = self.footer
                s._index = self._index
                s.__method__ = self.__method__
                t.submit(s.http, conn, addr)
            except BlockingIOError:
                # No pending connection; keep polling.
                pass
        t.shutdown(wait=True)  # unreachable: the accept loop never breaks
    except KeyboardInterrupt as e:
        logging.exception(e)
        print("\033[31mStopped manually\033[0m")
        self.s.shutdown(2)
        self.s.close()
        exit()
def __enter__(self):
    # os.cpu_count() can return None; fall back to 4 workers in that case.
    cpus = os.cpu_count()
    threads = 4 * cpus - 1 if cpus else 4
    processes = len(os.sched_getaffinity(0)) or 4
    self.logger.info(
        "Acquiring up to %s thread workers and up to %s process workers "
        "for asyncio executors", threads, processes)
    self.thread_pool = thread.ThreadPoolExecutor(max_workers=threads)
    self.process_pool = process.ProcessPoolExecutor(max_workers=processes)
    return self
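# A hedged sketch of the matching __exit__, assuming this object is used as a
# context manager (the original counterpart is not shown); both pool types
# expose shutdown().
def __exit__(self, exc_type, exc_val, exc_tb):
    # Assumed counterpart: release both pools when the context exits.
    self.thread_pool.shutdown(wait=True)
    self.process_pool.shutdown(wait=True)
    return False  # do not swallow exceptions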
def initCharacterPosition():
    characterPositionPool = thread.ThreadPoolExecutor(
        max_workers=GameConfig.threading_pool_max)
    characterList = CacheContorl.npcTemData
    for i in range(0, len(characterList)):
        characterIdS = str(i + 1)
        characterData = characterList[i]
        characterPositionPool.submit(initCharacterPositionNow,
                                     characterData, characterIdS)
    characterPositionPool.shutdown()
def initCharacterList():
    initCharacterThreadPool = thread.ThreadPoolExecutor(
        max_workers=GameConfig.threading_pool_max)
    initCharacterTem()
    characterList = CacheContorl.npcTemData
    i = 1
    for character in characterList:
        initCharacterThreadPool.submit(initCharacter, i, character)
        i += 1
    initCharacterThreadPool.shutdown()
    initCharacterPosition()
def maybe_start_xprof(seconds):
    if jax.host_id() == 0 and FLAGS.xprof:
        xprof = xprof_session.XprofSession()
        xprof.start_session('REDACTED', True, 2)

        def sleep_and_end_xprof():
            time.sleep(seconds)
            logging.info(
                'Xprof URL: %s',
                xprof.end_session_and_get_url(
                    tag='flax resnet, {} devices, batch {} per device'.format(
                        jax.device_count(), device_batch_size)))

        thread.ThreadPoolExecutor(1, 'xprof').submit(sleep_and_end_xprof)
def main():
    pool = thread.ThreadPoolExecutor(20)
    threading_list = []
    for i in range(1, 267):
        url = ('https://piao.qunar.com/ticket/list.htm?keyword=%E5%8C%97%E4%BA%AC'
               '&region=&from=mps_search_suggest&page={}').format(i)
        t = pool.submit(task, url)
        threading_list.append(t)
    for future in as_completed(threading_list):
        ret = future.result()
        print(ret)
    with open('qunaer.json', mode='w', encoding='utf-8') as fp:
        json.dump(data, fp, ensure_ascii=False)
def maybe_start_xprof(seconds):
    if jax.host_id() == 0 and FLAGS.xprof:
        xprof = xprof_session.XprofSession()
        xprof.start_session('REDACTED', True, 2)

        def sleep_and_end_xprof():
            time.sleep(seconds)
            logging.info(
                'Xprof URL: %s',
                xprof.end_session_and_get_url(
                    tag='flax transformer, {} devices, {}-way, batch {} per replica'
                    .format(jax.device_count(), num_partitions,
                            device_train_input_shape[0])))

        thread.ThreadPoolExecutor(1, 'xprof').submit(sleep_and_end_xprof)
def all_dates():
    # Stop polluting the working directory by creating a download folder.
    if not os.path.exists(GDELT_OUTPUT_DIRECTORY):
        # A majority of this data has already been downloaded and compressed,
        # so attempt to restore most of what is needed from the bucket
        # (this saves about an hour).
        CACHED_ARCHIVE = 'gdelt.tar.gz'
        BUCKET_URL = f'https://storage.googleapis.com/data301-bucket-9n5z0ph0/{CACHED_ARCHIVE}'
        # Place to store all the data; needs about 30 GB of disk space.
        os.mkdir(GDELT_OUTPUT_DIRECTORY)
        try:
            # Download the file, then just call tar to do the extraction
            # (sorry Windows).
            print('Downloading cached archive from', BUCKET_URL)
            request.urlretrieve(BUCKET_URL, CACHED_ARCHIVE)
            print('Extracting compressed archive...')
            subprocess.run(['tar', 'zxf', CACHED_ARCHIVE, '-C',
                            GDELT_OUTPUT_DIRECTORY])
            print('Extraction complete!')
            os.remove(CACHED_ARCHIVE)
        except Exception:
            # The bucket won't exist forever :(
            print('Failed downloading URL', BUCKET_URL)

    # Initialise gdelt so its API can be queried.
    gd = gdelt.gdelt(version=2)
    downloaded_dates = []

    # Parallelise the download, as lots of event data is needed.
    with thread.ThreadPoolExecutor(max_workers=GDELT_DOWNLOAD_WORKERS) as executor:
        # Pull every date out of the slices...
        slices = [slice for slice in analysis_dates()]
        dates = (date for dates in slices for date in dates)
        # ...log how many are to be downloaded...
        download_cnt = (ANALYSIS_SLICE_PERIOD_DAYS * ANALYSIS_SLICE_MULTIPLIER *
                        ANALYSIS_BLOCK_COUNT)
        print('Downloading', download_cnt, 'event files...')
        # ...and then download the corresponding GDELT event data for each day.
        for date in dates:
            executor.submit(download_date, date, gd)
            downloaded_dates.append(date_formatted(date))
    print('Download complete')

    # All dates flattened, along with every (formatted) date in each slice.
    return downloaded_dates, [[date_formatted(date) for date in dates]
                              for dates in slices]
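# One caveat worth noting about the loop above: submitting without keeping
# the futures silently discards worker exceptions. A hedged sketch of the
# same fan-out that surfaces failures, assuming the same download_date
# signature (not the original author's code):
from concurrent.futures import ThreadPoolExecutor, as_completed


def download_all(download_date, dates, gd, max_workers=8):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Keep the futures so a failed download is reported instead of lost.
        futures = {executor.submit(download_date, date, gd): date
                   for date in dates}
        for future in as_completed(futures):
            try:
                future.result()  # re-raises any exception from download_date
            except Exception as exc:
                print('Failed to download', futures[future], ':', exc)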
def start(self):
    """Start the service; one pool thread per connection."""
    try:
        print(
            f"\033[33m{'-'*10} {f'Listening on {self.ip}:{self.port}':^32} {'-'*10}\033[0m\r\n"
        )
        t = thread.ThreadPoolExecutor(self.thread)
        while True:
            try:
                # Accept a socket connection.
                conn, addr = self.s.accept()
                t.submit(self.http, conn, addr)
            except BlockingIOError:
                # No pending connection; keep polling.
                pass
        t.shutdown(wait=True)  # unreachable: the accept loop never breaks
    except KeyboardInterrupt as e:
        logging.exception(e)
        print("\033[31mStopped manually\033[0m")
        self.s.shutdown(2)
        self.s.close()
def __init__(self,
             runner: 'mtap.processing.ProcessingComponent',
             host: str,
             port: int = 0,
             *,
             register: bool = False,
             workers: Optional[int] = None,
             write_address: bool = False,
             config: 'Optional[mtap.Config]' = None):
    self.host = host
    self._port = port
    self.processor_id = runner.processor_id
    self.write_address = write_address

    if config is None:
        config = _config.Config()
    self._health_servicer = health.HealthServicer()
    self._health_servicer.set('', 'SERVING')
    self._servicer = _ProcessorServicer(
        config=config,
        address=host,
        runner=runner,
        health_servicer=self._health_servicer,
        register=register)
    workers = workers or 10
    thread_pool = thread.ThreadPoolExecutor(max_workers=workers)
    self._server = grpc.server(
        thread_pool,
        options=[('grpc.max_send_message_length',
                  config.get('grpc.max_send_message_length')),
                 ('grpc.max_receive_message_length',
                  config.get('grpc.max_receive_message_length'))])
    health_pb2_grpc.add_HealthServicer_to_server(self._health_servicer,
                                                 self._server)
    processing_pb2_grpc.add_ProcessorServicer_to_server(
        self._servicer, self._server)
    self._port = self._server.add_insecure_port("{}:{}".format(
        self.host, self.port))
    self._stopped_event = threading.Event()
    self._address_file = None
def process(self):
    # t1 = Thread(target=self.consistency)
    # t2 = Thread(target=self.form)
    # t3 = Thread(target=self.recent_form)
    # t4 = Thread(target=self.total_consistency)
    # t5 = Thread(target=self.opposition)
    # t6 = Thread(target=self.venue)
    #
    # t1.start()
    # t2.start()
    # t3.start()
    # t4.start()
    # t5.start()
    # t6.start()
    #
    # t1.join()
    # t2.join()
    # t3.join()
    # t4.join()
    # t5.join()
    # t6.join()

    with thread.ThreadPoolExecutor() as executor:
        f1 = executor.submit(self.consistency)
        f2 = executor.submit(self.form)
        f3 = executor.submit(self.recent_form)
        f4 = executor.submit(self.total_consistency)
        f5 = executor.submit(self.opposition)
        f6 = executor.submit(self.venue)

        con = f1.result()
        form = f2.result()
        rf = f3.result()
        tc = f4.result()
        opp = f5.result()
        ven = f6.result()
    return con, form, rf, tc, opp, ven
def run_trigger(input_file, output_file, plugin, expect_timeout=False):
    input_message = json.load(open(input_file))
    expected_output = json.load(open(output_file))
    trigger_name = input_message["body"]["trigger"]

    capture = CaptureDispatcher()
    plugin.triggers[trigger_name].dispatcher = capture

    executor = thread.ThreadPoolExecutor()
    executor.submit(plugin.handle_step, input_message)
    future = executor.submit(capture.wait_for_caught_message)
    out = futures.wait([future], timeout=10)
    done = out.done

    # Non-graceful shutdown: clear the executor's worker bookkeeping so the
    # process is not kept alive by a trigger thread that never returns.
    executor._threads.clear()
    futures.thread._threads_queues.clear()

    if len(done) <= 0:
        if expect_timeout:
            return
        raise Exception("Timeout")

    output = capture.caught_message
    if "body" in output and "log" in output["body"]:
        output["body"]["log"] = ""
    if "body" in expected_output and "log" in expected_output["body"]:
        expected_output["body"]["log"] = ""
    if output != expected_output:
        raise Exception(
            "Actual output differs from expected output. {} != {}".format(
                output, expected_output))
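# Hedged alternative to the private-attribute teardown above (Python 3.9+):
# the public API can cancel work that is still queued, though it cannot stop
# a worker already blocked inside a call. A minimal self-contained demo:
import time
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=1)
executor.submit(time.sleep, 5)           # occupies the single worker
queued = executor.submit(time.sleep, 5)  # still queued, so it can be cancelled
executor.shutdown(wait=False, cancel_futures=True)
print(queued.cancelled())  # True: the queued task never ran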
def restore_checkpoint(ckpt_dir, target, step=None, prefix='checkpoint_',
                       parallel=True):
    """Restore last/best checkpoint from checkpoints in path.

    Sorts the checkpoint files naturally, returning the highest-valued
    file, e.g.:
      ckpt_1, ckpt_2, ckpt_3 --> ckpt_3
      ckpt_0.01, ckpt_0.1, ckpt_0.001 --> ckpt_0.1
      ckpt_-1.0, ckpt_1.0, ckpt_1e5 --> ckpt_1e5

    Args:
      ckpt_dir: str: checkpoint file or directory of checkpoints to restore
        from.
      target: matching object to rebuild via deserialized state-dict. If None,
        the deserialized state-dict is returned as-is.
      step: int: step number to load or None to load latest. If specified,
        ckpt_dir must be a directory.
      prefix: str: name prefix of checkpoint files.
      parallel: bool: whether to load seekable checkpoints in parallel, for
        speed.

    Returns:
      Restored `target` updated from checkpoint file, or if no step specified
      and no checkpoint files present, returns the passed-in `target`
      unchanged. If a file path is specified and is not found, the passed-in
      `target` will be returned. This is to match the behavior of the case
      where a directory path is specified but the directory has not yet been
      created.
    """
    if step:
        ckpt_path = _checkpoint_path(ckpt_dir, step, prefix)
        if not gfile.exists(ckpt_path):
            raise ValueError(f'Matching checkpoint not found: {ckpt_path}')
    else:
        if gfile.isdir(ckpt_dir):
            ckpt_path = latest_checkpoint(ckpt_dir, prefix)
            if not ckpt_path:
                logging.info(f'Found no checkpoint files in {ckpt_dir}')
                return target
        else:
            ckpt_path = ckpt_dir
            if not gfile.exists(ckpt_path):
                logging.info(f'Found no checkpoint file at {ckpt_path}')
                return target

    logging.info('Restoring checkpoint from %s', ckpt_path)
    with gfile.GFile(ckpt_path, 'rb') as fp:
        if parallel and fp.seekable():
            buf_size = 128 << 20  # 128M buffer.
            num_bufs = fp.size() / buf_size
            logging.debug('num_bufs: %d', num_bufs)
            checkpoint_contents = bytearray(fp.size())

            def read_chunk(i):
                # NOTE: We have to re-open the file to read each chunk,
                # otherwise the parallelism has no effect. But we could reuse
                # the file pointers within each thread.
                with gfile.GFile(ckpt_path, 'rb') as f:
                    f.seek(i * buf_size)
                    buf = f.read(buf_size)
                    if buf:
                        checkpoint_contents[i * buf_size:i * buf_size +
                                            len(buf)] = buf
                    return len(buf) / buf_size

            pool_size = 32
            pool = thread.ThreadPoolExecutor(pool_size)
            results = pool.map(read_chunk, range(int(num_bufs) + 1))
            results = list(results)
            pool.shutdown(wait=False)
            logging.debug('results: %s', results)
        else:
            checkpoint_contents = fp.read()

    if target is None:
        return serialization.msgpack_restore(checkpoint_contents)
    else:
        return serialization.from_bytes(target, checkpoint_contents)
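# The chunked-read pattern above generalises beyond gfile. A self-contained
# sketch with plain open(), assuming a local seekable file; each worker does
# one seek+read into a non-overlapping slice of a shared bytearray.
import os
from concurrent.futures import ThreadPoolExecutor


def parallel_read(path, buf_size=128 << 20, pool_size=32):
    """Read `path` into bytes using one seek+read per chunk."""
    size = os.path.getsize(path)
    contents = bytearray(size)

    def read_chunk(i):
        # Each worker opens its own handle so seeks do not interfere.
        with open(path, 'rb') as f:
            f.seek(i * buf_size)
            buf = f.read(buf_size)
            if buf:
                contents[i * buf_size:i * buf_size + len(buf)] = buf

    with ThreadPoolExecutor(pool_size) as pool:
        list(pool.map(read_chunk, range(size // buf_size + 1)))
    return bytes(contents)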
def run_pretrain(optimizer):
    """Run bert pretraining.

    Args:
      optimizer: BERT model with pretraining layer

    Returns:
      optimizer: trained model
    """
    result_stats = {}

    def get_input_context():

        class InputContext():

            def __init__(self):
                self.input_pipeline_id = jax.host_id()
                self.num_input_pipelines = jax.host_count()

        return InputContext()

    summary_thread = thread.ThreadPoolExecutor(1, 'summary')
    host_id = jax.host_id()

    # Get input dataset
    input_files = []
    for input_pattern in FLAGS.input_files.split(','):
        input_files.extend(tf.io.gfile.glob(input_pattern))
    logging.info('*** Input Files ***')
    for input_file in input_files:
        logging.info('  %s', input_file)

    eval_input_files = []
    for input_pattern in FLAGS.eval_input_files.split(','):
        eval_input_files.extend(tf.io.gfile.glob(input_pattern))
    logging.info('*** Eval Input Files ***')
    for input_file in eval_input_files:
        logging.info('  %s', input_file)

    train_input_fn = input_pipeline.input_fn_builder(
        input_files=input_files,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=True,
        num_cpu_threads=8)

    host_train_batch_size = FLAGS.train_batch_size // jax.host_count()
    host_eval_batch_size = FLAGS.eval_batch_size // jax.host_count()
    params = {'batch_size': host_train_batch_size}
    input_context = get_input_context()
    train_dataset = train_input_fn(params, input_context)
    train_iterator = iter(train_dataset)

    eval_input_fn = input_pipeline.input_fn_builder(
        input_files=eval_input_files,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=False,
        num_cpu_threads=8,
        global_input_size=FLAGS.eval_sample_size)
    eval_params = {'batch_size': host_eval_batch_size}
    eval_dataset = eval_input_fn(eval_params, input_context)
    eval_iterator = iter(eval_dataset)

    # train step
    total_training_steps = FLAGS.total_training_steps
    learning_rate_fn = create_learning_rate_scheduler(
        base_learning_rate=FLAGS.learning_rate,
        warmup_steps=FLAGS.warmup_steps,
        total_training_steps=FLAGS.total_training_steps,
        poly_power=FLAGS.poly_power,
        start_warmup_step=FLAGS.start_warmup_step)

    # Device training loop cond.
    def device_train_loop_cond(args):
        _, _, _, _, _, _, step, epoch, num_steps_per_epoch = args
        return step // num_steps_per_epoch == epoch

    # Device training loop body.
    def device_train_loop_body(args):
        """Device training loop body."""
        (optimizer, total_loss, lm_loss, sentence_loss, new_dropout_rng, token,
         step, epoch, num_steps_per_epoch) = args
        device_batch_size = FLAGS.train_batch_size // jax.device_count()
        input_shape = [device_batch_size, FLAGS.max_seq_length]
        input_shape_pred = [device_batch_size, FLAGS.max_predictions_per_seq]
        (input_ids, input_mask, segment_ids, masked_lm_positions,
         masked_lm_ids, masked_lm_weights, next_sentence_labels), token = lax.infeed(
             token,
             shape=(jax.ShapedArray(input_shape, jnp.int32),
                    jax.ShapedArray(input_shape, jnp.int32),
                    jax.ShapedArray(input_shape, jnp.int32),
                    jax.ShapedArray(input_shape_pred, jnp.int32),
                    jax.ShapedArray(input_shape_pred, jnp.int32),
                    jax.ShapedArray(input_shape_pred, jnp.float32),
                    jax.ShapedArray([device_batch_size, 1], jnp.int32)))
        inputs = [input_ids, input_mask, segment_ids, masked_lm_positions]
        labels = [masked_lm_ids, masked_lm_weights, next_sentence_labels]
        optimizer, total_loss, lm_loss, sentence_loss, new_dropout_rng = train_step(
            optimizer, inputs, labels, learning_rate_fn,
            dropout_rng=new_dropout_rng)
        step += 1
        return (optimizer, total_loss, lm_loss, sentence_loss, new_dropout_rng,
                token, step, epoch, num_steps_per_epoch)

    # Device training loop.
    def device_train_loop(optimizer, dropout_rng, total_loss, lm_loss,
                          sentence_loss, step, epoch, num_steps_per_epoch):
        """Device training loop."""
        token = lax.create_token(step)
        (optimizer, total_loss, lm_loss, sentence_loss, dropout_rng, _, step,
         epoch, num_steps_per_epoch) = lax.while_loop(
             device_train_loop_cond, device_train_loop_body,
             (optimizer, total_loss, lm_loss, sentence_loss, dropout_rng,
              token, step, epoch, num_steps_per_epoch))
        return optimizer, total_loss, lm_loss, sentence_loss, dropout_rng, step

    if FLAGS.infeed:
        pmap_fn = jax.pmap
        if FLAGS.enable_buffer_donation:
            pmap_fn = functools.partial(pmap_fn, donate_argnums=(0, 1))
        if FLAGS.enable_wus:
            pmap_fn = functools.partial(
                pmap_fn, in_axes=(None, 0, None, None, None, None, None, None))
        p_train_epoch = pmap_fn(device_train_loop, axis_name='batch')
    else:
        # without infeed.
        p_train_step = jax.pmap(
            functools.partial(train_step, learning_rate_fn=learning_rate_fn),
            axis_name='batch')

    if FLAGS.infeed:
        # Infeed is currently synchronous, so do it in a background thread too
        infeed_pool = thread.ThreadPoolExecutor(jax.local_device_count(),
                                                'infeed')

    pmap_fn = jax.pmap
    # Weight update sharding is not implemented yet for host train loop.
    # Enable wus on eval only if device loop is used.
    if FLAGS.enable_wus and FLAGS.infeed:
        pmap_fn = functools.partial(pmap_fn, in_axes=(None, 0, 0))
    p_eval_step = pmap_fn(eval_step, axis_name='batch')

    rng = random.PRNGKey(0)
    device_count = jax.local_device_count()
    dropout_rngs = random.split(rng, device_count)

    num_steps_per_epoch = np.int32(FLAGS.num_steps_per_epoch)

    if FLAGS.precompile:
        if FLAGS.infeed:
            if FLAGS.enable_wus:
                total_loss = np.float32(0.0)
                lm_loss = np.float32(0.0)
                sentence_loss = np.float32(0.0)
                host_step = 0
                host_epoch = 1
                optimizer = unbroadcast(optimizer)
                # the device training loop condition will immediately be false
                optimizer, total_loss, lm_loss, sentence_loss, _, _ = p_train_epoch(
                    optimizer, dropout_rngs, total_loss, lm_loss,
                    sentence_loss, host_step, host_epoch, num_steps_per_epoch)
            else:
                total_loss = jax_utils.replicate(np.float32(0.0))
                lm_loss = jax_utils.replicate(np.float32(0.0))
                sentence_loss = jax_utils.replicate(np.float32(0.0))
                device_step = jax_utils.replicate(0)
                device_epoch = jax_utils.replicate(1)
                # the device training loop condition will immediately be false
                optimizer, total_loss, lm_loss, sentence_loss, _, _ = p_train_epoch(
                    optimizer, dropout_rngs, total_loss, lm_loss,
                    sentence_loss, device_step, device_epoch,
                    jax_utils.replicate(num_steps_per_epoch))
        else:
            train_input_shape = (host_train_batch_size, FLAGS.max_seq_length)
            train_input_shape_pred = (host_train_batch_size,
                                      FLAGS.max_predictions_per_seq)
            word_id_data = jax.random.randint(rng, train_input_shape, 0, 10)
            mask_data = jax.random.randint(rng, train_input_shape, 0, 1)
            type_id_data = jax.random.randint(rng, train_input_shape, 0, 3)
            lm_mask = jax.random.randint(rng, train_input_shape_pred, 0, 5)
            masked_lm_ids = jax.random.randint(rng, train_input_shape_pred,
                                               0, 2)
            masked_lm_weights = jax.random.randint(
                rng, train_input_shape_pred, 1, 1).astype(np.float32)
            next_sentence_labels = jax.random.randint(
                rng, (host_train_batch_size, 1), 0, 1)
            labels = [masked_lm_ids, masked_lm_weights, next_sentence_labels]
            train_inputs = [word_id_data, mask_data, type_id_data, lm_mask]
            train_inputs = common_utils.shard(train_inputs)
            labels = common_utils.shard(labels)
            p_train_step(optimizer, train_inputs, labels,
                         dropout_rng=dropout_rngs)

        eval_input_shape = (host_eval_batch_size, FLAGS.max_seq_length)
        eval_input_shape_pred = (host_eval_batch_size,
                                 FLAGS.max_predictions_per_seq)
        word_id_data = jax.random.randint(rng, eval_input_shape, 0, 10)
        mask_data = jax.random.randint(rng, eval_input_shape, 0, 1)
        type_id_data = jax.random.randint(rng, eval_input_shape, 0, 3)
        lm_mask = jax.random.randint(rng, eval_input_shape_pred, 0, 5)
        masked_lm_ids = jax.random.randint(rng, eval_input_shape_pred, 0, 2)
        masked_lm_weights = jax.random.randint(
            rng, eval_input_shape_pred, 1, 1).astype(np.float32)
        next_sentence_labels = jax.random.randint(
            rng, (host_eval_batch_size, 1), 0, 1)
        eval_inputs = {
            'input_ids': word_id_data,
            'input_mask': mask_data,
            'segment_ids': type_id_data,
            'masked_lm_positions': lm_mask,
            'masked_lm_ids': masked_lm_ids,
            'masked_lm_weights': masked_lm_weights,
            'next_sentence_labels': next_sentence_labels
        }
        eval_inputs = common_utils.shard(eval_inputs)
        metrics = empty_metrics()
        optimizer_target = optimizer.target
        # Weight update sharding is not implemented yet for host train loop.
        # Enable wus on eval only if device loop is used.
        if FLAGS.enable_wus and FLAGS.infeed:
            optimizer_target = unbroadcast(optimizer_target)
        metrics = p_eval_step(optimizer_target, eval_inputs, metrics)
        metrics = allreduce_metrics(metrics)
        metrics = empty_metrics()

    time.sleep(FLAGS.init_sleep)
    allreduce_metrics(metrics)['masked_lm_weighted_correct'].block_until_ready()
    mlp_log.mlperf_print('init_stop', None)
    mlp_log.mlperf_print('run_start', None)
    # To make the logging consistent with other mlperf models,
    # in all the mlp_log, epochs are steps, and examples are sequences.
    mlp_log.mlperf_print('train_samples',
                         FLAGS.total_training_steps * FLAGS.train_batch_size)
    mlp_log.mlperf_print('eval_samples', FLAGS.eval_sample_size)

    xprof = None
    run_start = time.time()
    global RUN_STOP
    global TOTAL_STEPS
    RUN_STOP = False
    TOTAL_STEPS = False
    if host_id == 0:
        if FLAGS.end_to_end_profile:
            xprof = xprof_session.XprofSession()
            xprof.start_session(device_name='REDACTED',
                                enable_python_tracer=True,
                                host_trace_level=2)
        elif FLAGS.profile:
            profile_with_xprof_on_background(
                start_after_sec=FLAGS.profile_latency,
                profile_time_sec=FLAGS.profile_duration)

    if FLAGS.infeed:
        h_total_loss = np.float32(0.0)
        h_lm_loss = np.float32(0.0)
        h_sentence_loss = np.float32(0.0)
        d_total_loss = jax_utils.replicate(np.float32(0.0))
        d_lm_loss = jax_utils.replicate(np.float32(0.0))
        d_sentence_loss = jax_utils.replicate(np.float32(0.0))

    host_step, device_step = 0, jax_utils.replicate(0)
    device_epoch = jax_utils.replicate(0)
    num_train_epochs = FLAGS.total_training_steps // FLAGS.num_steps_per_epoch
    steps_per_epoch = num_steps_per_epoch
    if num_train_epochs >= 6:
        # Merge the first 6 epochs, as we do not have to do eval.
        steps_per_epoch = np.int32(num_steps_per_epoch * 6)
    for host_epoch in range(num_train_epochs):
        block_step = host_step
        # While BERT pretraining does not have epochs,
        # to make the logging consistent with other mlperf models,
        # in all the mlp_log, epochs are steps, and examples are sequences.
        mlp_log.mlperf_print(
            'block_start',
            None,
            metadata={
                'first_epoch_num': block_step,
                'epoch_count': FLAGS.num_steps_per_epoch
            })
        if not (num_train_epochs >= 6 and
                host_epoch in (1, 2, 3, 4, 5)) and FLAGS.infeed:
            if FLAGS.enable_wus:
                optimizer = unbroadcast(optimizer)
                (optimizer, total_loss, lm_loss, sentence_loss, dropout_rngs,
                 device_step) = p_train_epoch(optimizer, dropout_rngs,
                                              h_total_loss, h_lm_loss,
                                              h_sentence_loss, host_step,
                                              host_epoch, steps_per_epoch)
            else:
                device_epoch = jax_utils.replicate(host_epoch)
                device_steps_per_epoch = jax_utils.replicate(steps_per_epoch)
                (optimizer, total_loss, lm_loss, sentence_loss, dropout_rngs,
                 device_step) = p_train_epoch(optimizer, dropout_rngs,
                                              d_total_loss, d_lm_loss,
                                              d_sentence_loss, device_step,
                                              device_epoch,
                                              device_steps_per_epoch)
        # After first epoch, reduce the steps per epoch back to normal number.
        steps_per_epoch = num_steps_per_epoch
        # Training for one epoch.
        while int(host_step // FLAGS.num_steps_per_epoch) == host_epoch:
            input_data = next(train_iterator)
            input_data = jax.tree_map(lambda x: x.numpy(), input_data)
            input_data = jax.tree_map(common_utils.shard, input_data)
            input_ids = input_data['input_ids']
            input_mask = input_data['input_mask']
            segment_ids = input_data['segment_ids']
            masked_lm_positions = input_data['masked_lm_positions']
            masked_lm_ids = input_data['masked_lm_ids']
            masked_lm_weights = input_data['masked_lm_weights']
            next_sentence_labels = input_data['next_sentence_labels']
            # Infeed data to infeed queue.
            if FLAGS.infeed:
                for i, device in enumerate(jax.local_devices()):
                    infeed_pool.submit(
                        partial(device.transfer_to_infeed,
                                (input_ids[i], input_mask[i], segment_ids[i],
                                 masked_lm_positions[i], masked_lm_ids[i],
                                 masked_lm_weights[i],
                                 next_sentence_labels[i])))
            else:
                inputs = [input_ids, input_mask, segment_ids,
                          masked_lm_positions]
                labels = [masked_lm_ids, masked_lm_weights,
                          next_sentence_labels]
                (optimizer, total_loss, lm_loss, sentence_loss,
                 dropout_rngs) = p_train_step(optimizer, inputs, labels,
                                              dropout_rng=dropout_rngs)
            host_step += 1

        mlp_log.mlperf_print('block_stop',
                             None,
                             metadata={
                                 'first_epoch_num': block_step,
                                 'epoch_count': FLAGS.num_steps_per_epoch
                             })
        # No need to do eval in the first 5 epochs as it has to traverse min
        # 3M samples.
        if host_epoch < 5:
            continue
        if host_step % FLAGS.num_steps_per_epoch == 0:
            mlp_log.mlperf_print('eval_start', None,
                                 metadata={'epoch_num': host_step})
            optimizer_target = optimizer.target
            if FLAGS.enable_wus and FLAGS.infeed:
                optimizer_target = unbroadcast(optimizer_target)
            metrics = empty_metrics()
            for _ in range(FLAGS.max_eval_steps):
                inputs = jax.tree_map(lambda x: x.numpy(),
                                      next(eval_iterator))
                inputs = jax.tree_map(common_utils.shard, inputs)
                # Weight update sharding is not implemented yet for host train
                # loop. Enable wus on eval only if device loop is used.
                metrics = p_eval_step(optimizer_target, inputs, metrics)
            metrics = allreduce_metrics(metrics)
            train_metrics = {
                'total_loss': total_loss,
                'lm_loss': lm_loss,
                'sentence_loss': sentence_loss
            }
            # masked_lm_accuracy = get_masked_lm_accuracy(metrics)
            summary_thread.submit(
                partial(_write_metrics, metrics, train_metrics, host_step,
                        total_training_steps, host_id))
        if host_step % FLAGS.num_steps_per_epoch == 0 and FLAGS.save_checkpoint:
            if host_id == 0:
                checkpoints.save_checkpoint(FLAGS.model_dir, optimizer,
                                            host_step, prefix='checkpoint',
                                            keep=1)

    allreduce_metrics(metrics)['masked_lm_weighted_correct'].block_until_ready()
    summary_thread.shutdown()
    if not RUN_STOP:
        mlp_log.mlperf_print('run_stop', None, metadata={'status': 'abort'})
    mlp_log.mlperf_print('run_final', None)

    if host_id == 0:
        if FLAGS.end_to_end_profile:
            xprof_url = xprof.end_session_and_get_url(tag='')
            logging.info('Xprof profile is at %s', xprof_url)

    if RUN_STOP:
        result_stats['total_time'] = RUN_STOP - run_start
        result_stats['total_steps'] = TOTAL_STEPS
    return optimizer, result_stats
def test_thread_local(self):
    with thread.ThreadPoolExecutor(max_workers=2) as t:
        for _ in range(100):
            t.submit(increase)
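# A hedged sketch of what `increase` presumably looks like for a thread-local
# test (the real helper is not shown in this excerpt); each worker thread
# sees and mutates its own counter.
import threading

local_data = threading.local()  # assumed module-level thread-local storage


def increase():
    # Each thread initialises its own copy of `count` on first use.
    if not hasattr(local_data, 'count'):
        local_data.count = 0
    local_data.count += 1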
def test_mutil_thread_connect(self):
    with thread.ThreadPoolExecutor(max_workers=100) as t:
        for _ in range(1000):
            t.submit(request)
def main(argv):
    global BLEU_THRESHOLD_REACHED
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    init_mllogger()
    mllogger.event('cache_clear')
    mllogger.start('init_start')
    mllogger.event('submission_org', 'Google')
    mllogger.event('submission_platform',
                   'TPUv3-{}'.format(jax.device_count()))
    mllogger.event('submission_division', 'closed')
    mllogger.event('submission_status', 'research')
    mllogger.event('submission_benchmark', 'transformer')
    mllogger.event('train_samples', input_pipeline.N_TRAIN)
    mllogger.event('eval_samples', input_pipeline.N_EVAL)

    tf.enable_v2_behavior()

    # Use hardware RNG for bernoulli randoms in dropout mask creation.
    if FLAGS.hardware_rng:
        models.set_hardware_bernoulli()

    num_partitions = FLAGS.num_partitions
    batch_size = FLAGS.batch_size
    if batch_size is None:
        batch_size = min(16 * jax.device_count() // num_partitions, 2048)
    mllogger.event('global_batch_size', batch_size)

    num_eval_steps = FLAGS.num_eval_steps
    max_target_length = FLAGS.max_target_length
    max_eval_target_length = FLAGS.max_eval_target_length
    max_length = max(max_target_length, max_eval_target_length)
    mllogger.event('max_sequence_length', max_length,
                   metadata={'method': 'discard'})

    if FLAGS.random_seed is not None:
        seed = FLAGS.random_seed
    else:
        seed = np.uint32(time.time() if jax.host_id() == 0 else 0)
        seed = per_host_sum_pmap(seed)
    mllogger.event('seed', int(seed))

    steps_per_epoch = int(math.ceil(input_pipeline.N_TRAIN / batch_size))
    logging.info('steps per epoch: %d', steps_per_epoch)

    num_replicas = jax.local_device_count() // num_partitions
    device_train_input_shape = (batch_size //
                                (num_replicas * jax.host_count()),
                                max_target_length)
    # This is per-host; in principle 64/replica or more should fit
    eval_batch_size = min(
        32 * num_replicas,
        int(math.ceil(input_pipeline.N_EVAL /
                      (num_replicas * jax.host_count()))) * num_replicas)
    logging.info('eval batch size: %d', eval_batch_size)
    pred_batches = int(
        math.ceil(input_pipeline.N_EVAL /
                  (jax.host_count() * eval_batch_size)))
    logging.info('pred batches: %d', pred_batches)
    broadcast = functools.partial(_broadcast,
                                  num_replicas=num_replicas,
                                  num_partitions=num_partitions)

    if jax.host_id() == 0:
        train_summary_writer = tensorboard.SummaryWriter(
            os.path.join(FLAGS.model_dir, 'train'))
        eval_summary_writer = tensorboard.SummaryWriter(
            os.path.join(FLAGS.model_dir, 'eval'))
    else:
        train_summary_writer = None
        eval_summary_writer = None
    # Write summaries in background thread to avoid blocking on device sync
    summary_thread = thread.ThreadPoolExecutor(1, 'summary')
    if FLAGS.infeed:
        # Infeed is currently synchronous, so do it in a background thread too
        infeed_pool = thread.ThreadPoolExecutor(jax.local_device_count(),
                                                'infeed')

    # MLPerf 2020 WMT en-de dataset uses a custom T2T dataset:
    #   Shared 32K subword tokenization
    #   256-length packed training examples from WMT17
    #   97-length unpacked evaluation examples from WMT14
    train_keys = [
        'inputs', 'targets', 'inputs_position', 'targets_position',
        'inputs_segmentation', 'targets_segmentation'
    ]
    encoder = mlperf_encoder.SubwordTextEncoder(filename=FLAGS.vocab_path)
    input_encoder = encoder
    target_encoder = encoder
    vocab_size = input_encoder.vocab_size
    output_vocab_size = target_encoder.vocab_size

    input_shape = (batch_size, max_target_length)
    target_shape = (batch_size, max_target_length)

    transformer_kwargs = flax.core.FrozenDict({
        'vocab_size': vocab_size,
        'output_vocab_size': output_vocab_size,
        'emb_dim': 1024,
        'num_heads': 16,
        'num_layers': 6,
        'qkv_dim': 1024,
        'mlp_dim': 4096,
        'max_len': max_length,
        'share_embeddings': FLAGS.share_embeddings,
        'logits_via_embedding': FLAGS.logits_via_embedding,
        'num_partitions': num_partitions,
    })

    rng = random.PRNGKey(seed)
    rng, init_rng = random.split(rng)
    model, cache_def = create_model(init_rng, tuple(input_shape),
                                    tuple(target_shape), transformer_kwargs)
    mllogger.event('opt_name', 'adam')
    if batch_size < 1024:
        learning_rate = 4.0  # 0.0625
        warmup_steps = 1000
        beta1 = 0.9
        beta2 = 0.98
    elif batch_size < 2048:
        learning_rate = 2.0
        warmup_steps = 500  # ??
        beta1 = 0.9  # ??
        beta2 = 0.98  # ??
    else:
        learning_rate = 3.3092157691415953
        warmup_steps = 664
        beta1 = 0.9086575725261137
        beta2 = 0.9198719118104947
    epsilon = 1e-9
    if FLAGS.learning_rate is not None:
        learning_rate = FLAGS.learning_rate
    mllogger.event('opt_adam_beta_1', beta1)
    mllogger.event('opt_adam_beta_2', beta2)
    mllogger.event('opt_adam_epsilon', epsilon)
    optimizer_def = optim.Adam(learning_rate,
                               beta1=beta1,
                               beta2=beta2,
                               eps=epsilon,
                               weight_decay=FLAGS.weight_decay)
    optimizer = optimizer_def.create(model)
    del model  # don't keep a copy of the initial model

    # Build parameter partition annotations for preserving partitions from
    # train to eval.
    partition_rules = [
        (('encoder', 'posembed_input'), partitions.empty_dict),
        (('decoder', 'posembed_targets'), partitions.empty_dict),
        (('embedding',), partitions.spec(num_partitions, 1)),
        ((r'LayerNorm_\d+', '(bias|scale)'), None),
        ((r'encoder(decoder)?_norm', '(bias|scale)'), None),
        ((r'MultiHeadDotProductAttention_\d+', '(query|key|value)', 'kernel'),
         partitions.spec(1, num_partitions, 1)),
        ((r'MultiHeadDotProductAttention_\d+', 'out', 'kernel'),
         partitions.spec(num_partitions, 1, 1)),
        ((r'MlpBlock_\d+', r'Dense_\d+', 'bias'), None),
        ((r'MlpBlock_\d+', 'Dense_0', 'kernel'),
         partitions.spec(1, num_partitions)),
        ((r'MlpBlock_\d+', 'Dense_1', 'kernel'),
         partitions.spec(num_partitions, 1)),
        (('state', 'step'), None),
    ]
    optimizer_partitions = optimizer.restore_state(
        partitions.set_partitions(partition_rules, optimizer.state_dict()))

    optimizer = broadcast(optimizer)
    empty_metrics = broadcast({'loss': 0.0, 'accuracy': 0, 'denominator': 0})

    learning_rate_fn = create_learning_rate_scheduler(
        base_learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        hidden_size=transformer_kwargs['qkv_dim'])

    p_train_step = jax.pmap(
        functools.partial(train_step, learning_rate_fn=learning_rate_fn),
        axis_name='batch',
        in_axes=(None, 0, 0, 0))
    if num_partitions > 1:
        sharded_predict_step = sharded_jit(
            predict_step,
            in_parts=(None, optimizer_partitions.target, None),
            out_parts=None)
    else:
        sharded_predict_step = predict_step
    if FLAGS.extra_eval_metrics:
        p_eval_step = jax.pmap(eval_step, axis_name='batch',
                               in_axes=(None, 0))
    p_pred_step = jax.pmap(sharded_predict_step,
                           axis_name='batch',
                           in_axes=(0, None, None))
    p_allreduce_metrics = jax.pmap(
        functools.partial(lax.psum, axis_name='batch'), axis_name='batch')

    def device_train_loop_cond(args):
        _, _, _, _, step, epoch = args
        return step // steps_per_epoch == epoch

    def device_train_loop_body(args):
        optimizer, dropout_rngs, metrics, token, step, epoch = args
        input_data, token = lax.infeed(
            token,
            shape=tuple([
                jax.ShapedArray(device_train_input_shape, jnp.int32)
                for _ in train_keys
            ]))
        batch = {k: v for k, v in zip(train_keys, input_data)}
        optimizer, metrics, dropout_rngs = train_step(
            optimizer, batch, metrics, learning_rate_fn,
            dropout_rng=dropout_rngs)
        step += 1
        return optimizer, dropout_rngs, metrics, token, step, epoch

    def device_train_loop(optimizer, dropout_rngs, metrics, step, epoch):
        token = lax.create_token(step)
        (optimizer,
         dropout_rngs, metrics, _, step, _) = lax.while_loop(
             device_train_loop_cond, device_train_loop_body,
             (optimizer, dropout_rngs, metrics, token, step, epoch))
        return optimizer, dropout_rngs, metrics, step

    if num_partitions > 1:
        device_train_loop = sharded_jit(
            device_train_loop,
            in_parts=(optimizer_partitions, None, None, None, None),
            out_parts=(optimizer_partitions, None, None, None))
    p_train_epoch = jax.pmap(device_train_loop,
                             axis_name='batch',
                             in_axes=(None, 0, 0, None, None))

    p_allreduce_metrics_train = functools.partial(lax.psum, axis_name='batch')
    if num_partitions > 1:
        p_allreduce_metrics_train = sharded_jit(
            p_allreduce_metrics_train,
            in_parts=None,
            out_parts=None,
            num_partitions=num_partitions)
    p_allreduce_metrics_train = jax.pmap(p_allreduce_metrics_train,
                                         axis_name='batch')

    # Precompile all needed computations with fake data so as not to include
    # compilation time in MLPerf metrics.
    if FLAGS.precompile:
        logging.info('precompiling step/epoch functions')
        if FLAGS.infeed:
            # the device training loop condition will immediately be false,
            # but the optimizer tree will be resharded here
            optimizer, *_ = p_train_epoch(unbroadcast(optimizer),
                                          random.split(rng, num_replicas),
                                          empty_metrics,
                                          jnp.array(0, dtype=jnp.int32), 1)
        else:
            metrics = empty_metrics
            train_input_shape = (num_replicas, batch_size // num_replicas,
                                 input_pipeline.MAX_TRAIN_LEN)
            fake_batch = {
                k: jnp.ones(train_input_shape, jnp.int32)
                for k in train_keys
            }
            p_train_step(unbroadcast(optimizer),
                         fake_batch,
                         metrics,
                         dropout_rng=random.split(rng, num_replicas))
        eval_input_shape = (num_replicas, eval_batch_size // num_replicas,
                            input_pipeline.MAX_EVAL_LEN)
        fake_eval_batch = {
            'inputs': jnp.ones(eval_input_shape, jnp.int32),
            'targets': jnp.ones(eval_input_shape, jnp.int32),
        }
        if FLAGS.extra_eval_metrics:
            p_eval_step(unbroadcast(optimizer.target), fake_eval_batch)
        fake_cache = cache_def.initialize_cache(
            (eval_input_shape[1], FLAGS.max_predict_length))
        p_pred_step(fake_eval_batch['inputs'], unbroadcast(optimizer.target),
                    fake_cache)
        time.sleep(20)
        sync_devices()
        fake_bleu_1 = np.zeros((4,), dtype=np.int32)
        fake_bleu_2 = np.zeros((), dtype=np.int32)
        per_host_sum_pmap((fake_bleu_1, fake_bleu_1, fake_bleu_2,
                           fake_bleu_2))
        sync_devices()
        p_allreduce_metrics_train(empty_metrics)
        sync_devices()
        logging.info('finished precompiling step/epoch functions')

    # We init the first set of dropout PRNG keys, but update it afterwards
    # inside the main pmap'd training update for performance.
    dropout_rngs = random.split(rng, num_replicas)

    # Record time-0 metrics for proper tensorboard plot x-axis scaling.
    if jax.host_id() == 0:
        if FLAGS.compute_train_metrics:
            train_summary_writer.scalar('loss', 9.999, 0)
            train_summary_writer.scalar('accuracy', 0.0, 0)
            train_summary_writer.flush()
        eval_summary_writer.scalar('bleu', 0.0, 0)
        eval_summary_writer.flush()

    train_ds = input_pipeline.get_wmt_dataset(
        batch_size=batch_size // jax.host_count(), train=True)
    eval_ds = input_pipeline.get_wmt_dataset(batch_size=eval_batch_size,
                                             train=False)
    train_iter = iter(train_ds)
    eval_iter = iter(eval_ds)
    local_devices = jax.local_devices()
    host_step, device_step = 0, broadcast(0)
    gc.disable()
    mllogger.end('init_stop')
    if jax.host_id() == 0:
        mllogger.start('run_start')
    for epoch in range(FLAGS.num_epochs):
        if jax.host_id() == 0 and not BLEU_THRESHOLD_REACHED:
            mllogger.start('block_start',
                           metadata={
                               'first_epoch_num': epoch + 1,
                               'epoch_count': 1
                           })
        metrics = empty_metrics
        if FLAGS.infeed:
            optimizer, dropout_rngs, metrics, device_step = p_train_epoch(
                unbroadcast(optimizer), dropout_rngs, metrics,
                unbroadcast(device_step), epoch)
        while int(host_step // steps_per_epoch) == epoch:
            # pylint: disable=protected-access
            batch = jax.tree_map(lambda x: x._numpy(), next(train_iter))
            # Shard data to devices and do a training step.
            batch = jax.tree_map(
                lambda x: x.reshape((num_replicas, -1) + x.shape[1:]), batch)
            if FLAGS.infeed:
                for i, device in enumerate(local_devices):
                    replica_id = i // num_partitions
                    input_tuple = tuple(
                        [batch[k][replica_id] for k in train_keys])
                    assert input_tuple[0].shape == device_train_input_shape, (
                        'infeed shape error %s != %s' %
                        (input_tuple[0].shape, device_train_input_shape))
                    assert input_tuple[0].dtype == jnp.int32, (
                        'infeed dtype error %s != %s' %
                        (input_tuple[0].dtype, jnp.int32))
                    infeed_pool.submit(
                        functools.partial(device.transfer_to_infeed,
                                          input_tuple))
            else:
                optimizer, metrics, dropout_rngs = p_train_step(
                    unbroadcast(optimizer),
                    batch,
                    metrics,
                    dropout_rng=dropout_rngs)
            host_step += 1

        if FLAGS.compute_train_metrics:
            metrics = p_allreduce_metrics_train(metrics)
            # Schedule training metric handling.
            summary_thread.submit(
                functools.partial(write_train_summary, metrics,
                                  train_summary_writer, host_step))

        # Optional, extra evaluation metrics.
        if FLAGS.extra_eval_metrics:
            eval_metrics = []
            eval_iter = iter(eval_ds)
            for _, eval_batch in zip(range(num_eval_steps), eval_iter):
                eval_batch = common_utils.shard(eval_batch)
                metrics = p_eval_step(unbroadcast(optimizer.target),
                                      eval_batch)
                eval_metrics.append(metrics)
            eval_metrics = p_allreduce_metrics(eval_metrics)
            # Schedule metric summarization/logging.
            summary_thread.submit(
                functools.partial(write_eval_summary, eval_metrics,
                                  eval_summary_writer, host_step))

        # Translation and BLEU Score.
        all_predicted, all_targets, all_bs = [], [], []
        for i in range(pred_batches):
            # pylint: disable=protected-access
            pred_batch = jax.tree_map(lambda x: x._numpy(), next(eval_iter))
            # Handle final odd-sized batch by padding instead of dropping it.
            cur_pred_batch_size = pred_batch['inputs'].shape[0]
            if cur_pred_batch_size != eval_batch_size:
                pred_batch = jax.tree_map(
                    lambda x: pad_examples(x, eval_batch_size), pred_batch)
            pred_batch = jax.tree_map(
                lambda x: x.reshape((num_replicas, -1) + x.shape[1:]),
                pred_batch)
            per_device_batchsize = pred_batch['inputs'].shape[1]
            cache = cache_def.initialize_cache(
                (per_device_batchsize, FLAGS.max_predict_length))
            all_predicted.append(
                p_pred_step(pred_batch['inputs'],
                            unbroadcast(optimizer.target), cache))
            all_targets.append(pred_batch['targets'])
            all_bs.append(cur_pred_batch_size)
        # Schedule BLEU calculation and summarization/logging.
        # We use the ICI as part of BLEU score computation, so we call this
        # from the main thread so the BLEU pmap runs before the next train
        # epoch pmap.
        write_predict_summary(all_predicted, all_targets, all_bs,
                              target_encoder, eval_summary_writer, epoch,
                              host_step, summary_thread)

    # Wait until computations are done before exiting
    sync_devices()
    if jax.host_id() == 0:
        summary_thread.shutdown()
        if not BLEU_THRESHOLD_REACHED:
            mllogger.end('run_stop', metadata={'status': 'aborted'})
def profile_with_xprof_on_background(start_after_sec=30, profile_time_sec=1,
                                     device='REDACTED'):
    profiler_thread = thread.ThreadPoolExecutor(jax.local_device_count(),
                                                'xprof')
    profiler_thread.submit(partial(xprof_profile, start_after_sec,
                                   profile_time_sec, device))
if __name__ == "__main__":
    # run_experiment(0, 0, "placeholder")
    hotncold = """bp.registerBThread("HotBt", function() {
        bp.sync({request:bp.Event("hotEvent")});
        bp.sync({request:bp.Event("hotEvent")});
        bp.sync({request:bp.Event("hotEvent")});
    });

    bp.registerBThread("ColdBt", function() {
        bp.sync({request:bp.Event("coldEvent")});
        bp.sync({request:bp.Event("coldEvent")});
        bp.sync({request:bp.Event("coldEvent")});
    });

    bp.registerBThread("AlternatorBt", function() {
        for(var i = 0; i < 3; i++) {
            bp.sync({waitFor:bp.Event("coldEvent"), block:bp.Event("hotEvent")});
            bp.sync({waitFor:bp.Event("hotEvent"), block:bp.Event("coldEvent")});
        }
        bp.sync({request:bp.Event("allDone")});
    });"""

    print("start")
    with thread.ThreadPoolExecutor(max_workers=4) as e:
        e.submit(thread_check, hotncold)
        e.submit(thread_check, hotncold)
        e.submit(thread_check, hotncold)
        e.submit(thread_check, hotncold)
    print("finish")
def main(argv):
    del argv
    # BEGIN GOOGLE-INTERNAL
    xm.setup_work_unit()
    # END GOOGLE-INTERNAL

    tf.enable_v2_behavior()
    init_mllogger()
    mllogger.event('cache_clear')
    mllogger.start('init_start')
    mllogger.event('submission_org', 'Google')
    mllogger.event('submission_platform',
                   'TPUv3-{}'.format(jax.device_count()))
    mllogger.event('submission_division', 'closed')
    mllogger.event('submission_status', 'research')
    mllogger.event('submission_benchmark', 'resnet')
    mllogger.event('train_samples', input_pipeline.TRAIN_IMAGES)
    mllogger.event('eval_samples', input_pipeline.EVAL_IMAGES)

    if jax.host_id() == 0:
        summary_writer = tensorboard.SummaryWriter(FLAGS.output_dir)
        # Write summaries in background thread to avoid blocking on device
        # sync
        summary_thread = thread.ThreadPoolExecutor(1, 'summary')
    # Infeed is currently synchronous, so do it in a background thread too
    infeed_pool = thread.ThreadPoolExecutor(jax.local_device_count(),
                                            'infeed')

    if FLAGS.seed is not None:
        seed = FLAGS.seed
    else:
        seed = np.uint32(time.time() if jax.host_id() == 0 else 0)
        seed = per_host_sum_pmap(seed)
    mllogger.event('seed', int(seed))
    key = random.PRNGKey(seed)

    batch_size = FLAGS.batch_size
    if batch_size == -1:
        if jax.device_count() > 4096:
            batch_size = 65536
        else:
            batch_size = min(128 * jax.device_count(), 32768)
    mllogger.event('global_batch_size', batch_size)
    eval_batch_size = min(input_pipeline.EVAL_IMAGES,
                          256 * jax.device_count())
    device_batch_size = batch_size // jax.device_count()
    device_eval_batch_size = int(
        math.ceil(eval_batch_size / jax.device_count()))

    model_dtype = jnp.bfloat16 if FLAGS.bfloat16 else jnp.float32
    input_dtype = tf.bfloat16 if FLAGS.bfloat16 else tf.float32

    num_epochs = FLAGS.num_epochs
    if num_epochs is None:
        if batch_size < 32768:
            num_epochs = 56
        elif batch_size < 65536:
            num_epochs = 64
        else:
            num_epochs = 92

    steps_per_epoch = input_pipeline.TRAIN_IMAGES / batch_size
    # match TF submission behavior (round steps per loop up)
    steps_per_loop = int(math.ceil(steps_per_epoch * FLAGS.epochs_per_loop))
    # also apply rounding loop up to next step to "epochs" in LR schedule
    steps_per_epoch *= steps_per_loop / (steps_per_epoch *
                                         FLAGS.epochs_per_loop)
    steps_per_eval = int(
        math.ceil(input_pipeline.EVAL_IMAGES / eval_batch_size))

    base_learning_rate = FLAGS.learning_rate * batch_size / 256.
    beta = FLAGS.momentum
    if beta is None:
        if batch_size < 32768:
            beta = 0.9
        elif batch_size < 65536:
            beta = 0.929
        else:
            beta = 0.9537213777059405
    weight_decay = FLAGS.weight_decay
    if weight_decay is None:
        weight_decay = 2e-4 if batch_size < 32768 else 1e-4

    space_to_depth = FLAGS.space_to_depth
    if space_to_depth is None:
        space_to_depth = device_batch_size <= 8

    image_format = FLAGS.image_format
    if image_format is None:
        if space_to_depth and device_batch_size <= 8:
            image_format = 'HWNC'
        else:
            image_format = 'HWCN'

    image_size = input_pipeline.IMAGE_SIZE
    if space_to_depth:
        train_input_shape = (device_batch_size, image_size // 2,
                             image_size // 2, 12)
        eval_input_shape = (device_eval_batch_size, image_size // 2,
                            image_size // 2, 12)
    else:
        train_input_shape = (device_batch_size, image_size, image_size, 3)
        eval_input_shape = (device_eval_batch_size, image_size, image_size, 3)
    if image_format == 'HWCN':
        train_input_shape = tuple(train_input_shape[i] for i in [1, 2, 3, 0])
        eval_input_shape = tuple(eval_input_shape[i] for i in [1, 2, 3, 0])
    elif image_format == 'HWNC':
        train_input_shape = tuple(train_input_shape[i] for i in [1, 2, 0, 3])
        eval_input_shape = tuple(eval_input_shape[i] for i in [1, 2, 0, 3])

    model, state = create_model(key, device_batch_size, image_size,
                                model_dtype, space_to_depth)

    if FLAGS.lars:
        mllogger.event('opt_name', 'lars')
        mllogger.event('lars_opt_weight_decay', weight_decay)
        mllogger.event('lars_opt_momentum', beta)
        mllogger.event('lars_epsilon', 0)
        weight_opt_def = optim.LARS(base_learning_rate,
                                    beta,
                                    weight_decay=weight_decay)
        other_opt_def = optim.Momentum(base_learning_rate,
                                       beta,
                                       weight_decay=0,
                                       nesterov=False)
        learning_rate_fn = polynomial_learning_rate_fn(batch_size,
                                                       steps_per_epoch,
                                                       num_epochs)
    else:
        mllogger.event('opt_name', 'sgd')
        mllogger.event('sgd_opt_momentum', beta)
        weight_opt_def = optim.Momentum(base_learning_rate,
                                        beta,
                                        weight_decay=weight_decay,
                                        nesterov=True)
        other_opt_def = optim.Momentum(base_learning_rate,
                                       beta,
                                       weight_decay=0,
                                       nesterov=True)
        learning_rate_fn = piecewise_learning_rate_fn(base_learning_rate,
                                                      steps_per_epoch,
                                                      num_epochs)

    def filter_weights(key, _):
        return 'bias' not in key and 'scale' not in key

    def filter_other(key, _):
        return 'bias' in key or 'scale' in key

    weight_traversal = optim.ModelParamTraversal(filter_weights)
    other_traversal = optim.ModelParamTraversal(filter_other)
    optimizer_def = optim.MultiOptimizer((weight_traversal, weight_opt_def),
                                         (other_traversal, other_opt_def))
    optimizer = optimizer_def.create(model)
    del model  # do not keep a copy of the initial model

    optimizer = broadcast(optimizer)
    state = broadcast(state)
    empty_metrics = broadcast({'samples': 0, 'loss': 0., 'accuracy': 0})

    p_allreduce_metrics = jax.pmap(allreduce_metrics, axis_name='batch')

    p_sync_batchnorm_stats = jax.pmap(sync_batchnorm_stats,
                                      axis_name='batch')

    def host_loop_train_step(optimizer, state, metrics):
        token = lax.create_token(optimizer.state[0].step)
        batch, token = lax.infeed(
            token,
            shape=(jax.ShapedArray(train_input_shape, model_dtype),
                   jax.ShapedArray((device_batch_size,), jnp.int32)))
        optimizer, state, metrics = train_step(optimizer, state, batch,
                                               metrics, learning_rate_fn,
                                               image_format, space_to_depth)
        return optimizer, state, metrics

    p_host_loop_train_step = jax.pmap(host_loop_train_step,
                                      axis_name='batch',
                                      in_axes=(None, 0, 0))

    def host_loop_eval_step(model, state, metrics):
        token = lax.create_token(metrics['samples'])
        batch, token = lax.infeed(
            token,
            shape=(jax.ShapedArray(eval_input_shape, model_dtype),
                   jax.ShapedArray((device_eval_batch_size,),
                                   jnp.int32)))
        metrics = eval_step(model, state, batch, metrics, image_format,
                            space_to_depth)
        return metrics

    p_host_loop_eval_step = jax.pmap(host_loop_eval_step,
                                     axis_name='batch',
                                     in_axes=(None, None, 0))

    def device_train_loop_cond(args):
        _, _, _, _, step, loop = args
        return step // steps_per_loop == loop

    def device_train_loop_body(args):
        optimizer, state, metrics, token, step, loop = args
        batch, token = lax.infeed(
            token,
            shape=(jax.ShapedArray(train_input_shape, model_dtype),
                   jax.ShapedArray((device_batch_size,), jnp.int32)))
        optimizer, state, metrics = train_step(optimizer, state, batch,
                                               metrics, learning_rate_fn,
                                               image_format, space_to_depth)
        step += 1
        return optimizer, state, metrics, token, step, loop

    def device_train_loop(optimizer, state, metrics, step, loop):
        token = lax.create_token(step)
        optimizer, state, metrics, _, step, _ = lax.while_loop(
            device_train_loop_cond, device_train_loop_body,
            (optimizer, state, metrics, token, step, loop))
        state = sync_batchnorm_stats(state)
        metrics = allreduce_metrics(metrics)
        return optimizer, state, metrics, step

    p_train_loop = jax.pmap(device_train_loop,
                            axis_name='batch',
                            in_axes=(None, None, 0, None, None))

    # BEGIN GOOGLE-INTERNAL
    def maybe_start_xprof(seconds):
        if jax.host_id() == 0 and FLAGS.xprof:
            xprof = xprof_session.XprofSession()
            xprof.start_session('REDACTED', True, 2)

            def sleep_and_end_xprof():
                time.sleep(seconds)
                logging.info(
                    'Xprof URL: %s',
                    xprof.end_session_and_get_url(
                        tag='flax resnet, {} devices, batch {} per device'.
                        format(jax.device_count(), device_batch_size)))

            thread.ThreadPoolExecutor(1, 'xprof').submit(sleep_and_end_xprof)
    # END GOOGLE-INTERNAL

    if FLAGS.precompile:
        logging.info('precompiling step/loop functions')
        if FLAGS.device_loop:
            # the device training loop condition will immediately be false
            p_train_loop(unbroadcast(optimizer), unbroadcast(state),
                         empty_metrics, jnp.array(0, dtype=jnp.int32), 1)
        else:
            for device in jax.local_devices():
                images = np.zeros(train_input_shape, model_dtype)
                labels = np.zeros((device_batch_size,), np.int32)
                infeed_pool.submit(
                    partial(device.transfer_to_infeed, (images, labels)))
            p_host_loop_train_step(unbroadcast(optimizer), state,
                                   empty_metrics)
            p_sync_batchnorm_stats(state)
            for device in jax.local_devices():
                images = np.zeros(eval_input_shape, model_dtype)
                labels = np.zeros((device_eval_batch_size,), np.int32)
                infeed_pool.submit(
                    partial(device.transfer_to_infeed, (images, labels)))
            p_host_loop_eval_step(unbroadcast(optimizer.target),
                                  unbroadcast(state), empty_metrics)
        p_allreduce_metrics(empty_metrics)['accuracy'].block_until_ready()
        logging.info('finished precompiling')

    # BEGIN GOOGLE-INTERNAL
    maybe_start_xprof(20)
    # END GOOGLE-INTERNAL

    if not FLAGS.fake_data:
        logging.info('constructing datasets')
        # pylint: disable=g-complex-comprehension
        train_ds, eval_ds = [
            input_pipeline.load_split(
                device_batch_size if train else device_eval_batch_size,
                dtype=input_dtype,
                train=train,
                image_format=image_format,
                space_to_depth=space_to_depth,
                cache_uncompressed=jax.device_count() > 64)
            for train in (True, False)
        ]
        logging.info('constructing dataset iterators')
        train_iter = iter(train_ds)
        eval_iter = iter(eval_ds)

    local_devices = jax.local_devices()
    host_step, device_step = 0, broadcast(0)
    mllogger.end('init_stop')
    mllogger.start('run_start')
    mllogger.start('block_start',
                   metadata={
                       'first_epoch_num': 1,
                       'epoch_count': FLAGS.epochs_per_loop
                   })

    for loop in range(int(math.ceil(num_epochs / FLAGS.epochs_per_loop)) + 2):
        # BEGIN GOOGLE-INTERNAL
        if loop == 10:
            maybe_start_xprof(1)
        # END GOOGLE-INTERNAL
        metrics = empty_metrics
        if FLAGS.device_loop:
            optimizer, state, metrics, device_step = p_train_loop(
                unbroadcast(optimizer), unbroadcast(state), metrics,
                unbroadcast(device_step), loop)
        while int(host_step // steps_per_loop) == loop:
            if not FLAGS.device_loop:
                optimizer, state, metrics = p_host_loop_train_step(
                    unbroadcast(optimizer), state, metrics)
            # pylint: disable=protected-access
            while infeed_pool._work_queue.qsize() > 100:
                time.sleep(0.01)
            for device in local_devices:
                if FLAGS.fake_data:
                    images = np.zeros(train_input_shape, model_dtype)
                    labels = np.zeros((device_batch_size,), np.int32)
                else:
                    # pylint: disable=protected-access
                    images, labels = jax.tree_map(lambda x: x._numpy(),
                                                  next(train_iter))
                assert (images.shape == train_input_shape and
                        labels.dtype == jnp.int32)
                infeed_pool.submit(
                    partial(device.transfer_to_infeed, (images, labels)))
            host_step += 1

        epoch = (loop + 1) * FLAGS.epochs_per_loop
        if FLAGS.train_metrics:
            if not FLAGS.device_loop:
                metrics = p_allreduce_metrics(metrics)
            if jax.host_id() == 0:
                summary_thread.submit(
                    partial(write_summary, summary_writer, metrics, 'train',
                            epoch))

        if not FLAGS.device_loop:
            state = p_sync_batchnorm_stats(state)
        metrics = empty_metrics
        for _ in range(steps_per_eval):
            metrics = p_host_loop_eval_step(unbroadcast(optimizer.target),
                                            unbroadcast(state), metrics)
            for device in local_devices:
                if FLAGS.fake_data:
                    images = np.zeros(eval_input_shape, model_dtype)
                    labels = np.zeros((device_eval_batch_size,), np.int32)
                else:
                    # pylint: disable=protected-access
                    images, labels = jax.tree_map(lambda x: x._numpy(),
                                                  next(eval_iter))
                assert (images.shape == eval_input_shape and
                        labels.dtype == jnp.int32), \
                    'images.shape={}'.format(images.shape)
                infeed_pool.submit(
                    partial(device.transfer_to_infeed, (images, labels)))
        metrics = p_allreduce_metrics(metrics)
        if jax.host_id() == 0:
            summary_thread.submit(
                partial(write_summary, summary_writer, metrics, 'eval',
                        epoch))

    # Wait until computations are done before exiting
    p_allreduce_metrics(metrics)['accuracy'].block_until_ready()
    if jax.host_id() == 0:
        summary_thread.shutdown()
        if not DONE:
            mllogger.end('run_stop', metadata={'status': 'aborted'})
def executor(self) -> thread.ThreadPoolExecutor:
    if self._executor is None:
        self._executor = thread.ThreadPoolExecutor()
    return self._executor
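# Lazy creation like the property above is not thread-safe: two threads can
# race past the None check and create two pools. A hedged sketch of a
# lock-guarded variant, assuming `self._executor_lock = threading.Lock()` is
# initialised in __init__ (that attribute is an assumption, not part of the
# original class).
def executor(self) -> thread.ThreadPoolExecutor:
    if self._executor is None:
        with self._executor_lock:
            if self._executor is None:  # re-check under the lock
                self._executor = thread.ThreadPoolExecutor()
    return self._executor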
def main(argv):
    global CFG
    CFG = FLAGS.config

    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # Guarantee that the JAX bfloat16 extension is used rather than TF
    # bfloat16.
    _ = np.array(jnp.array([1.0], dtype=jnp.bfloat16))

    # Use hardware RNG for bernoulli randoms in dropout mask creation.
    if CFG.hardware_rng:
        models.set_hardware_bernoulli()

    if 'module_import' in CFG and CFG.module_import:
        for module in CFG.module_import:
            importlib.import_module(module)

    if 'additional_task_cache_dirs' in CFG and CFG.additional_task_cache_dirs:
        t5.data.add_global_cache_dirs(CFG.additional_task_cache_dirs)

    num_partitions = CFG.num_partitions
    topology = train_lib.compute_multihost_topology(num_partitions)
    batch_size = CFG.batch_size
    eval_batch_size = CFG.eval_batch_size
    per_replica_set_eval_batch_size = (eval_batch_size //
                                       topology.num_replica_sets)
    if batch_size % topology.num_replicas:
        raise ValueError(
            'Batch size must be divisible by the number of replicas.')

    steps_per_epoch = CFG.steps_per_epoch
    logging.info('steps per epoch: %d', steps_per_epoch)

    broadcast = functools.partial(
        train_lib.broadcast,
        num_replicas=topology.per_replica_set_num_replicas,
        num_partitions=topology.per_host_num_partitions,
        devices=topology.this_host_device_assignment)

    if jax.host_id() == 0:
        tf.io.gfile.makedirs(FLAGS.model_dir)
        tf.io.gfile.copy(FLAGS['config'].config_filename,
                         os.path.join(FLAGS.model_dir, 'config.py'),
                         overwrite=True)
        train_summary_writer = tensorboard.SummaryWriter(
            os.path.join(FLAGS.model_dir, 'train'))
        eval_summary_writer = tensorboard.SummaryWriter(
            os.path.join(FLAGS.model_dir, 'eval'))
    else:
        train_summary_writer = None
        eval_summary_writer = None

    # Write summaries in background thread to avoid blocking on device sync
    if CFG.infeed:
        # Infeed is currently synchronous, so do it in a background thread too
        infeed_pool = thread.ThreadPoolExecutor(jax.local_device_count(),
                                                'infeed')

    (train_ds, eval_ds), eval_cache = input_pipeline.get_datasets_and_cache(
        CFG, topology.num_replica_sets, topology.replica_set_id,
        topology.per_replica_set_host_id)

    vocab = input_pipeline.get_vocabulary(CFG.mixture_or_task_name)
    encoder = vocab.tf_tokenizer
    eos_id = vocab.tokenizer.eos_id()

    def decode_tokens(toks, eos_id=eos_id, max_id=32000):
        """Decode tokens back to unicode."""
        del eos_id
        # TODO(levskaya): T5 doesn't seem to emit EOS tokens?  double check
        # this is the best decoding function or just switch to using
        # tf_decode.
        # valid_toks = toks[:np.argmax(toks == eos_id) + 1].astype(np.int32)
        valid_toks = toks.astype(np.int32)
        valid_toks[valid_toks >= max_id] = 3
        return encoder.detokenize(valid_toks).numpy().decode('utf-8')

    logging.info('Initializing model, optimizer, and step functions.')

    train_config, eval_config, predict_config = get_configs(CFG)

    rng = random.PRNGKey(CFG.random_seed)
    rng, init_rng = random.split(rng)

    # This is used for infeed conversion from feature dict <--> tuple
    train_keys = [
        'inputs', 'targets', 'inputs_position', 'targets_position',
        'inputs_segmentation', 'targets_segmentation'
    ]
    device_train_input_shape = tuple([
        (batch_size // topology.num_replicas,
         CFG.max_input_length if 'inputs' in k else CFG.max_target_length)
        for k in train_keys
    ])

    learning_rate_fn = train_lib.create_learning_rate_scheduler(
        factors=CFG.schedule,
        base_learning_rate=CFG.learning_rate,
        warmup_steps=CFG.warmup_steps)

    # First, we only abstractly initialize the optimizer and model parameters,
    # since the parameters may not even fit in device memory!
# TODO(jekbradbury): make optimizer_defs compare by value so it can be created # in get_initial_params without causing pytree incompatibility optimizer_def = optim.Adafactor(CFG.learning_rate, decay_rate=0.8, step_offset=CFG.step_offset) initialize_params_fn = functools.partial(get_initial_params, config=CFG, transformer_config=eval_config, optimizer_def=optimizer_def) optimizer = jax.eval_shape(initialize_params_fn, init_rng) # tuple-like pytree leaves for global_arg_shapes optimizer_shapes = jax.tree_map(lambda x: partitions.Spec(*x.shape), optimizer) # Build parameter partition annotations for preserving partitions from train # to eval. if num_partitions > 1: optimizer_partitions = optimizer.restore_state( partitions.set_partitions(num_partitions, optimizer.state_dict())) per_host_optimizer_partitions = optimizer.restore_state( partitions.set_partitions(topology.per_host_num_partitions, optimizer.state_dict())) # Restore unreplicated optimizer + model state from last checkpoint. # TODO(jekbradbury,levskaya): implement sharded native checkpoint/restore existing_checkpoint_found = False if CFG.restore_checkpoints: existing_checkpoint_found = train_lib.checkpoint_exists( FLAGS.model_dir) optimizer = checkpoints.restore_checkpoint(FLAGS.model_dir, optimizer) # Import a pretrained-T5 checkpoint only if we didn't import a local # "native" checkpoint (e.g. due to resuming a pre-empted finetuning run.) # TODO(jekbradbury,levskaya): implement sharded T5 checkpoint/restore if CFG.restore_t5_checkpoint and not existing_checkpoint_found: optimizer = checkpoint_importer.restore_from_t5_checkpoint( optimizer, CFG.restore_t5_checkpoint) if CFG.restore_t5_checkpoint or existing_checkpoint_found: if num_partitions > 1: # Until checkpoint/restore is sharded, the restored checkpoint is global # and we need to slice each sharded parameter into the chunk containing # only the partitions that are present on this host. def per_host_chunk(x, spec): if spec is None or spec is x: # unsharded or not a parameter return x if spec[0] == 1: dim_size = x.shape[1] elif spec[1] == 1: dim_size = x.shape[0] else: raise NotImplementedError() chunk_size = (dim_size * topology.per_host_num_partitions // num_partitions) lower = topology.per_replica_set_host_id * chunk_size upper = (topology.per_replica_set_host_id + 1) * chunk_size if spec[0] == 1: return x[:, lower:upper] else: return x[lower:upper] optimizer = jax.tree_multimap(per_host_chunk, optimizer, optimizer_partitions) else: # If pretraining and no checkpoint imported, we jit the (sharded-) init # function to minimize fragmentation. We use the same pmap(sharded_jit) # setup as the training step/loop to initialize everything "in-place" and # avoid communication or OOM. if num_partitions > 1: initialize_params_fn = sharded_jit( initialize_params_fn, in_parts=None, local_in_parts=None, out_parts=optimizer_partitions, local_out_parts=per_host_optimizer_partitions, # devices=one_replica_device_assignment, ) initialize_params_fn = jax.pmap(initialize_params_fn, 'batch', in_axes=0, axis_size=topology.num_replicas, devices=topology.device_assignment) init_rng = broadcast(init_rng) optimizer = initialize_params_fn(init_rng) # We maintain the optimizer in unbroadcasted form (i.e. with no leading # replica axis). This is equivalent to the as-yet-nonexistent pmap kwarg # out_axes=None. 
optimizer = train_lib.unbroadcast(optimizer) else: optimizer = jax.jit(initialize_params_fn)(init_rng) # --------------------------------------------------------------------------- # Compile multidevice versions of train/eval/predict step and cache init fn. # --------------------------------------------------------------------------- # We can use either a single train-step for a host training loop: # train_step(optimizer, batch, prev_metrics, dropout_rng, **kwargs) # --> new_optimizer, metrics, new_dropout_rng def p_train_step(optimizer, batch, prev_metrics, dropout_rng): return train_lib.train_step(optimizer, batch, prev_metrics, dropout_rng, config=train_config, learning_rate_fn=learning_rate_fn, num_microbatches=CFG.microbatches, label_smoothing=CFG.label_smoothing, z_loss=CFG.z_loss, use_bfloat16=CFG.use_bfloat16) if num_partitions > 1: p_train_step = sharded_jit( p_train_step, in_parts=(optimizer_partitions, None, None, None), local_in_parts=(per_host_optimizer_partitions, None, None, None), out_parts=(optimizer_partitions, None, None), local_out_parts=(per_host_optimizer_partitions, None, None)) # TODO(levskaya): the in_axes spec below might be wrong, double-check. p_train_step = jax.pmap(p_train_step, axis_name='batch', in_axes=(None, 0, 0, 0), donate_argnums=(0, ), global_arg_shapes=(optimizer_shapes, None, None, None), axis_size=topology.num_replicas, devices=topology.device_assignment) # pytype: disable=wrong-arg-types # OR, we use an on-device loop that feeds the training step via infeed queue. def device_train_loop_cond(args): """Stopping criterion for on-device loop.""" _, _, _, _, step, epoch = args return step // steps_per_epoch == epoch def device_train_loop_body(args): """On-device loop body.""" optimizer, dropout_rngs, metrics, token, step, epoch = args # Ordering input data from infeed requires threading a symbolic token # through the computation. input_data, token = lax.infeed(token, shape=tuple([ jax.ShapedArray(s, jnp.int32) for s in device_train_input_shape ])) # Rebuild input dict from infeed data tuple. batch = {k: v for k, v in zip(train_keys, input_data)} # Run the train_step function and return the loop state. optimizer, metrics, dropout_rngs = train_lib.train_step( optimizer, batch, metrics, dropout_rngs, train_config, learning_rate_fn, num_microbatches=CFG.microbatches, label_smoothing=CFG.label_smoothing, z_loss=CFG.z_loss) step += 1 return optimizer, dropout_rngs, metrics, token, step, epoch def device_train_loop(optimizer, dropout_rngs, metrics, step, epoch): # Create symbolic token for threading infeed data. token = lax.create_token(step) # Run on-device loop. optimizer, dropout_rngs, metrics, _, step, _ = lax.while_loop( device_train_loop_cond, device_train_loop_body, (optimizer, dropout_rngs, metrics, token, step, epoch)) return optimizer, dropout_rngs, metrics, step if num_partitions > 1: device_train_loop = sharded_jit( device_train_loop, in_parts=(optimizer_partitions, None, None, None, None), local_in_parts=(per_host_optimizer_partitions, None, None, None, None), out_parts=(optimizer_partitions, None, None, None), local_out_parts=(per_host_optimizer_partitions, None, None, None)) p_train_epoch = jax.pmap(device_train_loop, axis_name='batch', in_axes=(None, 0, 0, None, None), donate_argnums=(0, ), global_arg_shapes=(optimizer_shapes, None, None, None, None), axis_size=topology.num_replicas, devices=topology.device_assignment) # pytype: disable=wrong-arg-types # Reduction psum for metric data. 
def p_allreduce_metrics(x): return lax.psum(x, axis_name='batch') if num_partitions > 1: p_allreduce_metrics = sharded_jit( p_allreduce_metrics, in_parts=None, local_in_parts=None, out_parts=None, local_out_parts=None, num_partitions=num_partitions, local_num_partitions=topology.per_host_num_partitions) p_allreduce_metrics = jax.pmap(p_allreduce_metrics, axis_name='batch', global_arg_shapes=None, axis_size=topology.num_replicas, devices=topology.device_assignment) # Training evaluation computation. # eval_step(params, batch, config, label_smoothing=0.0) --> metrics def p_eval_step(params, batch): return train_lib.eval_step(params, batch, config=eval_config, label_smoothing=CFG.label_smoothing) if num_partitions > 1: p_eval_step = sharded_jit( p_eval_step, in_parts=(optimizer_partitions.target, None), local_in_parts=(per_host_optimizer_partitions.target, None), out_parts=None, local_out_parts=None) p_eval_step = jax.pmap(p_eval_step, axis_name='batch', in_axes=(None, 0), global_arg_shapes=(optimizer_shapes.target, None), axis_size=topology.num_replicas, devices=topology.device_assignment) # pytype: disable=wrong-arg-types # Fast autoregressive decoding loop. # For inference and model evaluation. # predict_step(inputs, params, # eos_id, max_decode_len, config, beam_size=4) --> beam_seqs def p_pred_step(inputs, params): return train_lib.predict_step(inputs, params, eos_id, CFG.max_eval_target_length, predict_config, CFG.beam_size) if num_partitions > 1: p_pred_step = sharded_jit( p_pred_step, in_parts=(None, optimizer_partitions.target), local_in_parts=(None, per_host_optimizer_partitions.target), out_parts=None, local_out_parts=None) p_pred_step = jax.pmap(p_pred_step, axis_name='batch', in_axes=(0, None), global_arg_shapes=(None, optimizer_shapes.target), axis_size=topology.num_replicas, devices=topology.device_assignment) # pytype: disable=wrong-arg-types # --------------------------------------------------------------------------- # Main Train Loop # --------------------------------------------------------------------------- # We init the first set of dropout PRNG keys, but update it afterwards inside # the main pmap'd training update for performance. # There should be a unique dropout key for each replica represented on this # host, but the key should be the same for the same replica on other hosts. # Again, this is what the replica set abstraction is for. dropout_rngs = random.split(random.fold_in(rng, topology.replica_set_id), topology.per_replica_set_num_replicas) # restore step from last checkpoint host_step = int(optimizer.state.step) empty_metrics = broadcast({ 'loss': 0.0, 'accuracy': 0.0, 'learning_rate': 0.0, 'denominator': 0.0 }) if CFG.infeed: # TODO(jekbradbury): support something like this for the Python-loop case logging.info( 'Precompiling training loop and moving optimizer to device.') optimizer, _, metrics, _ = p_train_epoch(optimizer, dropout_rngs, empty_metrics, jnp.array(0, dtype=jnp.int32), 1) optimizer = train_lib.unbroadcast(optimizer) metrics['loss'].block_until_ready() logging.info('Starting training loop.') local_devices = jax.local_devices() device_step = broadcast(host_step) first_epoch = host_step // steps_per_epoch # Main Loop over "epochs". train_iter = train_ds.as_numpy_iterator() for epoch in range(first_epoch, first_epoch + CFG.num_epochs): metrics = empty_metrics # NOTE: 'optimizer' is unbroadcast by construction at initialization or # when loading a checkpoint. 
It is maintained in 'unbroadcast' state to # enable the XLA cross-replica sharding optimization. The broadcasting is # handled automatically by the pmap'd functions that use it. # Gather all task evaluation metrics. logging.info('Evaluating tasks.') if epoch == first_epoch + 1: train_lib.sync_devices() for task in eval_cache.tasks: logging.info('Evaluating task %s', task.name) all_predicted, all_bs = [], [] for pred_batch in eval_cache.preprocessed_examples[task.name]: # Handle final odd-sized batch by padding instead of dropping it. input_batch, unpadded_batch_size = train_lib.pad_batch_to_size( pred_batch['inputs'], per_replica_set_eval_batch_size) all_bs.append(unpadded_batch_size) # Split batch dimensions for pmap. input_batch = jax.tree_map( lambda x: x.reshape((topology.per_replica_set_num_replicas, -1) + x.shape[1:]), input_batch) # Run fast inference on batch. all_predicted.append(p_pred_step(input_batch, optimizer.target)) # Pad out the number of batches so each host has the same number. max_host_batch_number = np.max( eval_cache.preprocessed_batch_sizes[task.name]) batch_shortfall = max_host_batch_number - len(all_predicted) if batch_shortfall > 0: # TODO(levskaya): Fix for case of entirely empty all_predicted. # To make sure the cross-host barriers work, we run the program the same # number of times on all hosts. The results of this call is ignored, and # the predictions are populated with zeros instead. p_pred_step(input_batch, optimizer.target) # Dummy call. all_predicted.extend([jnp.zeros_like(all_predicted[0])] * batch_shortfall) all_bs.extend([0] * batch_shortfall) all_predicted = jnp.concatenate(all_predicted) all_bs = jnp.array(all_bs) # Collect all batches from across hosts and reverse sharding. all_predicted = train_lib.host_allgather( all_predicted, topology.num_replica_sets, topology.replica_set_id, topology.per_replica_set_host_id == 0) seqlength = all_predicted.shape[-1] total_examples = np.sum( train_lib.host_allgather( all_bs, topology.num_replica_sets, topology.replica_set_id, topology.per_replica_set_host_id == 0)) del all_bs assert total_examples == len(eval_cache.examples[task.name]), ( 'Total number of batches incorrect for task %s.' % task.name) # De-shard the collected predicted tokens and remove padding. all_predicted = np.transpose(all_predicted, (1, 2, 0, 3)).reshape( -1, seqlength)[:total_examples] # We now run the post-processing and metric-fns on a single host. if jax.host_id() == 0: assert eval_summary_writer raw_predictions = [] for tokens in all_predicted: raw_predictions.append(decode_tokens(tokens)) # post-process predictions for metric fns predictions = [ task.postprocess_fn(p, example=ex) for p, ex in zip( raw_predictions, eval_cache.examples[task.name]) ] for metric_fn in task.metric_fns: scores = metric_fn(eval_cache.targets[task.name], predictions) for metric_name, metric_value in scores.items(): tag = f'eval/{task.name}/{metric_name}' eval_summary_writer.scalar(tag, metric_value, host_step) logging.info('EVAL %s at step %d: %.3f', tag, host_step, metric_value) eval_summary_writer.flush() # Save text samples for tensorboard. 
exemplars = '' for n in np.random.choice(np.arange(len(predictions)), 8): tgt_txt = tf.compat.as_text( eval_cache.examples[task.name][n]['targets_plaintext']) pred_txt = raw_predictions[n] exemplars += (f'{eval_cache.inputs[task.name][n]}\n\n' f'target: {tgt_txt}\n\n' f'prediction: {pred_txt}\n\n') eval_summary_writer.text(f'{task.name} samples', exemplars, host_step) eval_summary_writer.flush() # Take an Xprof trace after the first loop has compiled everything. if epoch == first_epoch + 1: train_lib.sync_devices() # For on-device loop, we launch the computation before feeding data. logging.info('BEGIN Train loop.') if CFG.infeed: optimizer, dropout_rngs, metrics, device_step = p_train_epoch( optimizer, dropout_rngs, metrics, train_lib.unbroadcast(device_step), epoch) optimizer = train_lib.unbroadcast(optimizer) # Epoch loop. while int(host_step // steps_per_epoch) == epoch: batch = next(train_iter) batch = jax.tree_map( lambda x: x.reshape( (topology.per_replica_set_num_replicas, -1) + x.shape[1:]), batch) # Feed the on-device training loop. if CFG.infeed: for i, device in enumerate(local_devices): # When using infeed to provide data to the computation, we're on our # own for feeding the right values to the right devices. Each device # should get the minibatch corresponding to its replica, a slice of # the larger batch corresponding to the host's replica set. if device.platform == 'tpu': device_coords = (*device.coords, device.id % 2) else: device_coords = (device.host_id, i) per_replica_set_device_coords = tuple( dc % prsm for dc, prsm in zip( device_coords, topology.per_replica_set_mesh)) per_replica_set_replica_coords = tuple( prsdc // prm for prsdc, prm in zip(per_replica_set_device_coords, topology.per_replica_mesh)) per_replica_set_replica_id = 0 for prsm, prm, prsrc in zip( topology.per_replica_set_mesh, topology.per_replica_mesh, per_replica_set_replica_coords): per_replica_set_replica_id = ( per_replica_set_replica_id * prsm // prm + prsrc) input_tuple = tuple([ batch[k][per_replica_set_replica_id] for k in train_keys ]) # Safety check: infeed does not check shape or types but requires # them to agree with on-device spec, otherwise the queue and program # stalls. tuple_shapes = jax.tree_map(jnp.shape, input_tuple) tuple_dtypes = jax.tree_map(lambda x: x.dtype, input_tuple) assert tuple_shapes == device_train_input_shape, ( 'infeed shape error %s != %s' % (tuple_shapes, device_train_input_shape)) assert tuple(set(tuple_dtypes)) == (jnp.int32,), \ ('infeed dtype error %s not all of type %s' % ( tuple_dtypes, jnp.int32)) infeed_pool.submit( functools.partial(device.transfer_to_infeed, input_tuple)) # Host training loop. else: optimizer, metrics, dropout_rngs = p_train_step( optimizer, batch, metrics, dropout_rngs) optimizer = train_lib.unbroadcast(optimizer) host_step += 1 logging.info('END Train loop.') # Maybe save a checkpoint on one host. if (CFG.save_checkpoints and epoch % CFG.checkpoint_freq == CFG.checkpoint_freq - 1 and jax.host_id() == 0): checkpoints.save_checkpoint(FLAGS.model_dir, optimizer, host_step) # Gather training metrics. 
metrics = p_allreduce_metrics(metrics) metrics = jax.tree_map(lambda x: jax.device_get(x[0]), metrics) denominator = metrics.pop('denominator') summary = jax.tree_map(lambda x: x / denominator, metrics) # pylint: disable=cell-var-from-loop logging.info('train in step: %s, %s', host_step, summary) if jax.host_id() == 0: assert train_summary_writer for key, val in summary.items(): train_summary_writer.scalar(key, val, host_step) train_summary_writer.flush() # Gather training evaluation metrics. logging.info('Gathering training evaluation metrics.') eval_metrics = [] eval_iter = eval_ds.as_numpy_iterator() for _, eval_batch in zip(range(CFG.num_eval_steps), eval_iter): eval_batch = jax.tree_map( lambda x: x.reshape( (topology.per_replica_set_num_replicas, -1) + x.shape[1:]), eval_batch) metrics = p_eval_step(optimizer.target, eval_batch) eval_metrics.append(metrics) # average metrics across devices eval_metrics = p_allreduce_metrics(eval_metrics) eval_metrics = common_utils.get_metrics(eval_metrics) # average metrics across steps eval_metrics = jax.tree_map(np.sum, eval_metrics) eval_denominator = eval_metrics.pop('denominator') eval_summary = jax.tree_map( lambda x: x / eval_denominator, # pylint: disable=cell-var-from-loop eval_metrics) logging.info('eval in step: %s, %s', host_step, eval_summary) if jax.host_id() == 0: assert eval_summary_writer for key, val in eval_summary.items(): eval_summary_writer.scalar(key, val, host_step) eval_summary_writer.flush() # Wait until computations are done before exiting logging.info('Finished.') train_lib.sync_devices() # Shut down the infeed threadpool. if CFG.infeed: infeed_pool.shutdown()
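# A minimal, self-contained sketch of the background-infeed pattern used in the
# training loop above: one pool worker per device lets host-to-device transfers
# overlap with computation instead of blocking the epoch loop. `transfer` is a
# hypothetical stand-in for `device.transfer_to_infeed`, and the sleep merely
# simulates a synchronous copy; the real devices are not modeled here.
import time
from concurrent.futures import thread


def transfer(device_id, batch):
    time.sleep(0.01)  # simulate a blocking host-to-device copy
    return device_id, len(batch)


num_devices = 4
infeed_pool = thread.ThreadPoolExecutor(num_devices, 'infeed')
pending = []
for step in range(8):
    batch = [step] * 128
    for device_id in range(num_devices):
        # As in the loop above, each device gets its own slice of the batch;
        # submit() returns immediately so the host can prepare the next step.
        pending.append(infeed_pool.submit(transfer, device_id, batch))
for f in pending:
    f.result()  # surface any transfer errors before shutting down
infeed_pool.shutdown()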
def main(argv):
  del argv
  # BEGIN GOOGLE-INTERNAL
  xm.setup_work_unit()
  # END GOOGLE-INTERNAL

  tf.enable_v2_behavior()

  if jax.host_id() == 0:
    summary_writer = tensorboard.SummaryWriter(FLAGS.output_dir)
    # Write summaries in background thread to avoid blocking on device sync
    summary_thread = thread.ThreadPoolExecutor(1, 'summary')
  if FLAGS.infeed:
    # Infeed is currently synchronous, so do it in a background thread too
    infeed_pool = thread.ThreadPoolExecutor(jax.local_device_count(), 'infeed')

  rng = random.PRNGKey(0)

  image_size = 224

  batch_size = FLAGS.batch_size
  if batch_size is None:
    batch_size = min(128 * jax.device_count(), 32768)
  eval_batch_size = 128 * jax.device_count()
  local_batch_size = batch_size // jax.host_count()
  local_eval_batch_size = eval_batch_size // jax.host_count()
  device_batch_size = batch_size // jax.device_count()
  device_eval_batch_size = eval_batch_size // jax.device_count()
  device_last_eval_batch_size = (input_pipeline.EVAL_IMAGES %
                                 eval_batch_size) // jax.device_count()

  model_dtype = jnp.bfloat16 if FLAGS.bfloat16 else jnp.float32
  input_dtype = tf.bfloat16 if FLAGS.bfloat16 else tf.float32
  if FLAGS.transpose_images:
    train_input_shape = (224, 224, 3, device_batch_size)
    eval_input_shapes = [
        (224, 224, 3, bs)
        for bs in (device_eval_batch_size, device_last_eval_batch_size)
    ]
  else:
    train_input_shape = (device_batch_size, 224, 224, 3)
    eval_input_shapes = [
        (bs, 224, 224, 3)
        for bs in (device_eval_batch_size, device_last_eval_batch_size)
    ]

  num_epochs = FLAGS.num_epochs
  steps_per_epoch = input_pipeline.TRAIN_IMAGES / batch_size
  logging.info('steps_per_epoch: %f', steps_per_epoch)
  steps_per_eval = int(np.ceil(input_pipeline.EVAL_IMAGES / eval_batch_size))
  logging.info('steps_per_eval: %d', steps_per_eval)

  base_learning_rate = FLAGS.learning_rate * batch_size / 256.
  beta = FLAGS.momentum
  weight_decay = FLAGS.weight_decay

  logging.info('creating and initializing model and optimizer')
  model, state = create_model(rng, device_batch_size, image_size, model_dtype)
  state = jax_utils.replicate(state)
  if FLAGS.lars:
    weight_opt_def = optim.LARS(
        base_learning_rate, beta, weight_decay=weight_decay)
    other_opt_def = optim.Momentum(
        base_learning_rate, beta, weight_decay=0, nesterov=False)
    learning_rate_fn = polynomial_learning_rate_fn(batch_size, steps_per_epoch,
                                                   num_epochs)
  else:
    weight_opt_def = optim.Momentum(
        base_learning_rate, beta, weight_decay=weight_decay, nesterov=True)
    other_opt_def = optim.Momentum(
        base_learning_rate, beta, weight_decay=0, nesterov=True)
    learning_rate_fn = piecewise_learning_rate_fn(base_learning_rate,
                                                  steps_per_epoch, num_epochs)

  def filter_weights(key, _):
    return 'bias' not in key and 'scale' not in key

  def filter_other(key, _):
    return 'bias' in key or 'scale' in key

  weight_traversal = optim.ModelParamTraversal(filter_weights)
  other_traversal = optim.ModelParamTraversal(filter_other)
  optimizer_def = optim.MultiOptimizer((weight_traversal, weight_opt_def),
                                       (other_traversal, other_opt_def))
  optimizer = optimizer_def.create(model)
  optimizer = optimizer.replicate()
  del model  # do not keep a copy of the initial model

  p_train_step = jax.pmap(
      partial(train_step, learning_rate_fn=learning_rate_fn),
      axis_name='batch')
  p_eval_step = jax.pmap(eval_step, axis_name='batch')

  def device_train_loop_cond(args):
    _, _, _, _, step, epoch = args
    return step // steps_per_epoch == epoch

  def device_train_loop_body(args):
    optimizer, state, metrics, token, step, epoch = args
    (images, labels), token = lax.infeed(
        token,
        shape=(jax.ShapedArray(train_input_shape, model_dtype),
               jax.ShapedArray((device_batch_size,), jnp.int32)))
    batch = {'image': images, 'label': labels}
    optimizer, state, metrics = train_step(optimizer, state, batch, metrics,
                                           learning_rate_fn)
    step += 1
    return optimizer, state, metrics, token, step, epoch

  def device_train_loop(optimizer, state, metrics, step, epoch):
    token = lax.create_token(step)
    optimizer, state, metrics, _, step, _ = lax.while_loop(
        device_train_loop_cond, device_train_loop_body,
        (optimizer, state, metrics, token, step, epoch))
    return optimizer, state, metrics, step

  p_train_epoch = jax.pmap(device_train_loop, axis_name='batch')

  if FLAGS.precompile:
    logging.info('precompiling step/epoch functions')
    if FLAGS.infeed:
      # the device training loop condition will immediately be false
      p_train_epoch(optimizer, state, empty_metrics(), jax_utils.replicate(0),
                    jax_utils.replicate(1))
    else:
      batch = {
          'image':
              jnp.zeros((jax.local_device_count(),) + train_input_shape,
                        model_dtype),
          'label':
              jnp.zeros((jax.local_device_count(),) + (device_batch_size,),
                        jnp.int32)
      }
      p_train_step(optimizer, state, batch, empty_metrics())
    for dbs, eis in zip([device_eval_batch_size, device_last_eval_batch_size],
                        eval_input_shapes):
      batch = {
          'image': jnp.zeros((jax.local_device_count(),) + eis, model_dtype),
          'label': jnp.zeros((jax.local_device_count(),) + (dbs,), jnp.int32)
      }
      p_eval_step(optimizer.target, state, batch, empty_metrics())
    allreduce_metrics(empty_metrics())
    pmean = functools.partial(jax.lax.pmean, axis_name='batch')
    jax.pmap(pmean, axis_name='batch')(state)

  logging.info('constructing datasets')
  # pylint: disable=g-complex-comprehension
  train_ds, eval_ds = [
      input_pipeline.load_split(
          local_batch_size if train else local_eval_batch_size,
          image_size=image_size,
          dtype=input_dtype,
          train=train,
          transpose_images=FLAGS.transpose_images)
      for train in (True, False)
  ]
  # pylint: enable=g-complex-comprehension
  logging.info('constructing dataset iterators')
  train_iter = iter(train_ds)
  eval_iter = iter(eval_ds)

  logging.info('beginning training')
  host_step, device_step = 0, jax_utils.replicate(0)
  for epoch in range(num_epochs):
    device_epoch = jax_utils.replicate(epoch)
    metrics = empty_metrics()
    if FLAGS.infeed:
      optimizer, state, metrics, device_step = p_train_epoch(
          optimizer, state, metrics, device_step, device_epoch)
    while int(host_step // steps_per_epoch) == epoch:
      batch = jax.tree_map(lambda x: x._numpy(), next(train_iter))  # pylint: disable=protected-access
      if FLAGS.infeed:
        for i, device in enumerate(jax.local_devices()):
          images, labels = batch['image'][i], batch['label'][i]
          assert images.shape == train_input_shape and labels.dtype == jnp.int32
          infeed_pool.submit(
              partial(device.transfer_to_infeed, (images, labels)))
      else:
        optimizer, state, metrics = p_train_step(optimizer, state, batch,
                                                 metrics)
      host_step += 1
    if FLAGS.train_metrics:
      metrics = allreduce_metrics(metrics)
      if jax.host_id() == 0:
        summary_thread.submit(
            partial(write_summary, summary_writer, metrics, 'train',
                    epoch + 1))
    if not FLAGS.distributed_batchnorm:  # otherwise it's already synced
      pmean = functools.partial(jax.lax.pmean, axis_name='batch')
      state = jax.pmap(pmean, axis_name='batch')(state)
    metrics = empty_metrics()
    for _ in range(steps_per_eval):
      batch = jax.tree_map(lambda x: x._numpy(), next(eval_iter))  # pylint: disable=protected-access
      metrics = p_eval_step(optimizer.target, state, batch, metrics)
    metrics = allreduce_metrics(metrics)
    if jax.host_id() == 0:
      summary_thread.submit(
          partial(write_summary, summary_writer, metrics, 'eval', epoch + 1))
    # TODO(deveci): do something like this from the summary thread:
    # if summary['accuracy'] > TARGET_ACCURACY:
    #   break

  if jax.host_id() == 0:
    summary_thread.shutdown()
  # Wait until computations are done before exiting
  jax.random.normal(jax.random.PRNGKey(0), ()).block_until_ready()
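# A minimal sketch of the single-worker summary pattern above: a
# ThreadPoolExecutor with one worker serializes writes in submission order
# while the main loop keeps the devices busy. `write_summary` here is an
# assumed stand-in for the real summary-writing helper, which is not shown
# in the snippet above.
from concurrent.futures import thread


def write_summary(tag, value, step):
    print(f'{tag} step={step}: {value}')


summary_thread = thread.ThreadPoolExecutor(1, 'summary')
for step in range(3):
    # submit() returns immediately; the single worker guarantees that writes
    # execute one at a time, in the order they were submitted.
    summary_thread.submit(write_summary, 'train/loss', 1.0 / (step + 1), step)
summary_thread.shutdown(wait=True)  # flush pending writes before exiting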
# -*- coding: utf-8 -*-
import functools

from concurrent.futures import thread

executor_pool = thread.ThreadPoolExecutor()


def run_on_executor(fn):
    """Decorator to run a synchronous method asynchronously."""

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        future = executor_pool.submit(fn, *args, **kwargs)
        return future

    return wrapper
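# Usage sketch for the decorator above, assuming the module above is in scope;
# `slow_add` is a hypothetical function used only for illustration. The wrapped
# call returns a concurrent.futures.Future immediately, and .result() blocks
# until the worker thread finishes.
import time


@run_on_executor
def slow_add(a, b):
    time.sleep(0.1)  # simulate blocking work
    return a + b


future = slow_add(2, 3)  # returns right away, work runs on the pool
print(future.result())   # blocks until done; prints 5
executor_pool.shutdown(wait=True)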
def predict(argv):
    import redisclient
    from load_config import load_config
    args = ctparser.parse_args(argv)
    conf = load_config('../conf/servers.yaml' if len(args.config) == 0 else
                       os.path.join(cur_dir, args.config))
    rc = redisclient.predictClient(conf)

    def predAndDraw(target, args):
        try:
            result = rc.sendRequest(args.model,
                                    target[0],
                                    priority=args.priority,
                                    thresh=args.printthresh)
            assert result.status == 0, 'Invalid response!'
            if target[1] is not None:
                print_boxes(
                    target[0],
                    os.path.join(target[1], os.path.basename(target[0])),
                    result.dets, args.printthresh)
        except Exception:
            return (None, target)
        return (result, target)

    csvfile = (open(os.path.join(cur_dir, args.csv), 'w')
               if len(args.csv) > 0 else None)
    xsfile = (open(os.path.join(cur_dir, args.xsize), 'w')
              if len(args.xsize) > 0 else None)
    outpath = (os.path.join(cur_dir, args.output)
               if len(args.output) > 0 else None)
    if outpath is not None and not os.path.exists(outpath):
        os.makedirs(outpath)

    imagelist = []
    videolist = []
    args.input = os.path.join(cur_dir, args.input)
    if args.action == 'image':
        imagelist.append((args.input, outpath))
    elif args.action == 'images':
        for name in os.listdir(args.input):
            rl = mimetypes.guess_type(name)
            if rl[0] is not None and rl[0].startswith('image'):
                imagelist.append((os.path.join(args.input, name), outpath))
    elif args.action == 'video':
        videolist.append(
            videoprocess(
                os.path.join(cur_dir, args.input),
                os.path.join(cur_dir, args.output,
                             os.path.basename(args.input)), args.fps))
    else:
        raise NotImplementedError

    for video in videolist:
        imagelist.extend(video.getFrames())
    imagelist.sort()

    with thread.ThreadPoolExecutor(8) as pool:
        for i in range(args.retry + 1):
            if len(imagelist) == 0:
                print('all files have been processed')
                break
            wklist = [
                pool.submit(predAndDraw, target, args) for target in imagelist
            ]
            # Collect results; failed targets are queued for the next retry.
            imagelist = []
            for wk in wklist:
                result = wk.result()
                if result[0] is None:
                    imagelist.append(result[1])
                else:
                    print('{} processed'.format(result[1][0]))
                    if csvfile:
                        csvfile.write(
                            message2csv(
                                result[0].dets,
                                os.path.splitext(
                                    os.path.basename(result[1][0]))[0]))
                    if xsfile:
                        xsfile.write('{}:{},{},{}\n'.format(
                            os.path.splitext(
                                os.path.basename(result[1][0]))[0],
                            result[0].xsize, result[0].width,
                            result[0].height))
    if csvfile:
        csvfile.close()
    if xsfile:
        xsfile.close()
    for target in imagelist:
        print('failed to process {}'.format(target[0]))
    for video in videolist:
        video.getVideo()
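# A distilled sketch of the submit-and-retry pattern used in predict() above:
# submit every work item to the pool, keep only the failures, and resubmit up
# to `retries` times. `flaky` is a hypothetical stand-in for predAndDraw and,
# like it, signals failure by returning None.
import random
from concurrent.futures import thread


def flaky(item):
    # Fails ~30% of the time to exercise the retry loop.
    return item if random.random() < 0.7 else None


def process_with_retries(items, retries=2):
    failed = list(items)
    with thread.ThreadPoolExecutor(8) as pool:
        for _ in range(retries + 1):
            if not failed:
                break
            futures = [(item, pool.submit(flaky, item)) for item in failed]
            # Keep only the items whose attempt failed this round.
            failed = [item for item, f in futures if f.result() is None]
    return failed  # items that never succeeded


print(process_with_retries(range(10)))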
def run_thread_by_pool(foo, argslist):
    # Submit every argument; the original sliced argslist[0:1], which ran only
    # the first task, and it never shut the executor down.
    with thread.ThreadPoolExecutor(max_workers=100) as executor:
        futures = [executor.submit(foo, arg) for arg in argslist]
        for future in futures:
            future.result()  # propagate any exception raised in a worker
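# Usage sketch for the corrected helper above; `square` is a hypothetical task.
def square(x):
    print(x * x)


run_thread_by_pool(square, [1, 2, 3, 4])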