def test_exact_posterior_recovery_no_transition_noise(self):
  """Runs one whole-dataset pass and checks the returned end state.

  The true parameters are fed in place of the model variables. The posterior
  covariance is then compared against zeros, the posterior mean against the
  prior state mean multiplied by the transition matrix raised to the power
  `data[TIMES].shape[1]`, and the posterior times against the batch end time.
  """
  times_key = feature_keys.TrainEvalFeatures.TIMES
  with self.test_session() as session:
    stub_model, data, true_params = self._get_single_model()
    reader = input_pipeline.NumpyReader(data)
    whole_dataset_fn = input_pipeline.WholeDatasetInputFn(reader)
    features, _ = whole_dataset_fn()

    # One copy of the start state per batch element.
    initial_state = stub_model.get_start_state()
    num_batches = array_ops.shape(features[times_key])[0]
    replicated_state = math_utils.replicate_state(
        start_state=initial_state, batch_size=num_batches)
    model_outputs = stub_model.get_batch_loss(
        features=features, mode=None, state=replicated_state)

    variables.global_variables_initializer().run()
    coord = coordinator_lib.Coordinator()
    queue_runner_impl.start_queue_runners(session, coord=coord)
    # Feeding the true parameters keeps this test independent of how close
    # the variable initializations are to the generated parameters; fitting
    # the noise values with training steps would also work but be slow.
    end_state = session.run(model_outputs.end_state, feed_dict=true_params)
    posterior_mean, posterior_var, posterior_times = end_state
    coord.request_stop()
    coord.join()

    self.assertAllClose(numpy.zeros([1, 4, 4]), posterior_var, atol=1e-2)

    transition_power = numpy.linalg.matrix_power(
        stub_model.transition, data[times_key].shape[1])
    expected_mean = numpy.dot(
        transition_power, true_params[stub_model.prior_state_mean])
    self.assertAllClose(expected_mean, posterior_mean[0], rtol=1e-1)

    expected_times = math_utils.batch_end_time(features[times_key]).eval()
    self.assertAllClose(expected_times, posterior_times)
def testDistributedFunctionPendingNodesServerReplaced(self):
  """Calls a two-device function while the server def keeps alternating.

  One thread repeatedly invokes the function; a second thread repeatedly
  calls `context.update_server_def`, switching between two server defs.
  Every recorded result must equal a 2x2 matrix of twos.
  """
  with ops.device(self.device_local):
    ones_2x2 = array_ops.ones([2, 2])

  @def_function.function
  def worker_fn(i):
    with ops.device(self.device_t1):
      product = math_ops.matmul(i, i)
    with ops.device(self.device_t2):
      shifted = product + i
    return shifted - i

  worker_fn.get_concrete_function(ones_2x2)

  num_calls = 10
  self._coord = coordinator.Coordinator()

  def thread_fn(device, results):
    with self._coord.stop_on_exception():
      for call_index in range(num_calls):
        with ops.device(device):
          output = worker_fn(ones_2x2)
        results[call_index] = output.numpy()

  def update_server_def_fn():
    with self._coord.stop_on_exception():
      for call_index in range(num_calls):
        # Even iterations use the three-server def, odd ones the two-server.
        if call_index % 2 == 0:
          next_server_def = self.server_def_s1_s2_s3
        else:
          next_server_def = self.server_def_s1_s2
        context.update_server_def(server_def=next_server_def)

  results = [None] * num_calls
  threads = [
      threading.Thread(target=thread_fn, args=(self.device_t1, results)),
      threading.Thread(target=update_server_def_fn),
  ]
  for thread in threads:
    thread.start()
  self._coord.join(threads)

  for value in results:
    np.testing.assert_array_equal([[2, 2], [2, 2]], value)
def testMultiThreadPendingNodesLockFree(self):
  """Update cluster when other remote function calls are being launched."""
  with ops.device(self.device_t1):
    ones_2x2 = array_ops.ones([2, 2])

  num_calls = 10
  self._coord = coordinator.Coordinator()

  @def_function.function
  def worker_fn(i):
    return math_ops.matmul(i, i)

  # Trace and register the function before any thread calls it.
  worker_fn.get_concrete_function(ones_2x2)

  def thread_fn(device, results):
    for call_index in range(num_calls):
      with self._coord.stop_on_exception(), ops.device(device):
        results[call_index] = worker_fn(ones_2x2).numpy()

  def update_server_def_fn():
    for _ in range(30):
      with self._coord.stop_on_exception():
        context.update_server_def(self.server_def_s1_s2)

  t1_results = [None] * num_calls
  t2_results = [None] * num_calls
  threads = [
      threading.Thread(target=thread_fn, args=(self.device_t1, t1_results)),
      threading.Thread(target=thread_fn, args=(self.device_t2, t2_results)),
      threading.Thread(target=update_server_def_fn),
  ]
  for thread in threads:
    thread.start()
  self._coord.join(threads)

  for value in t1_results + t2_results:
    np.testing.assert_array_equal([[2, 2], [2, 2]], value)
def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
                              cluster_spec, session_config, rpc_layer):
  """Runs a standalone client for between-graph replication.

  Starts one thread per chief/worker task in `cluster_spec`, plus one
  evaluator thread when the cluster has an evaluator job, and joins all of
  them through a shared coordinator.

  Args:
    worker_fn: function passed to `_run_single_worker` for chief and worker
      tasks.
    strategy: strategy object passed along with `worker_fn`.
    eval_fn: function passed to `_run_single_worker` for the evaluator task.
    eval_strategy: strategy object passed along with `eval_fn`.
    cluster_spec: cluster description; its `jobs` and `as_dict()` are read.
    session_config: forwarded unchanged to every `_run_single_worker` call.
    rpc_layer: forwarded as the `rpc_layer` keyword to every worker.

  Returns:
    None.
  """
  coord = coordinator.Coordinator()

  evaluator = None
  if _TaskType.EVALUATOR in cluster_spec.jobs:
    evaluator = threading.Thread(
        target=_run_single_worker,
        args=(eval_fn, eval_strategy, cluster_spec, _TaskType.EVALUATOR, 0,
              session_config),
        kwargs=dict(rpc_layer=rpc_layer, coord=coord))
    evaluator.start()

  to_join = []
  worker_barrier = _Barrier(_get_num_workers(cluster_spec))
  for task_type in (_TaskType.CHIEF, _TaskType.WORKER):
    num_tasks = len(cluster_spec.as_dict().get(task_type, []))
    for task_id in range(num_tasks):
      worker = threading.Thread(
          target=_run_single_worker,
          args=(worker_fn, strategy, cluster_spec, task_type, task_id,
                session_config),
          kwargs=dict(
              rpc_layer=rpc_layer,
              worker_barrier=worker_barrier,
              coord=coord))
      worker.start()
      to_join.append(worker)

  if evaluator is not None:
    # TODO(yuefengz): is it necessary to join eval thread?
    to_join.append(evaluator)
  coord.join(to_join)

  # TODO(yuefengz): we probably want to return results from all workers?
  return None
def testPandasFeedingMultiThread(self):
  """Dequeues shuffled DataFrame rows fed by several enqueue threads.

  Each dequeued batch carries the row indices in position 0 followed by one
  entry per DataFrame column; every column is compared against the rows
  selected from the source frame with those indices.
  """
  if not HAS_PANDAS:
    # Report the test as skipped rather than letting it pass without
    # exercising anything.
    self.skipTest("pandas is not available.")
  with ops.Graph().as_default():
    array1 = np.arange(128, 256)
    array2 = 2 * array1
    df = pd.DataFrame({"a": array1, "b": array2}, index=np.arange(128))
    q = ff._enqueue_data(df, capacity=128, num_threads=8, shuffle=True)
    batch_size = 5
    dq_op = q.dequeue_many(batch_size)
    with session.Session() as sess:
      coord = coordinator.Coordinator()
      threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
      for _ in range(100):
        dq = sess.run(dq_op)
        indices = dq[0]
        expected_rows = df.iloc[indices]
        for col_num, col in enumerate(df.columns):
          # dq[0] holds the indices, so column values start at offset 1.
          np.testing.assert_array_equal(expected_rows[col].values,
                                        dq[col_num + 1])
      coord.request_stop()
      coord.join(threads)
def testNoShuffle(self):
  """Reads 50 unshuffled records and checks index and value both equal `i`."""
  source = rs.ReaderSource(
      reader_cls=io_ops.IdentityReader,
      work_units=self.work_units,
      batch_size=1,
      shuffle=False,
      num_threads=1)
  index_column, value_column = source()
  index_tensor = index_column.build()
  value_tensor = value_column.build()
  for built_tensor in (index_tensor, value_tensor):
    self.assertEqual([1], built_tensor.get_shape().as_list())

  with self.test_session() as sess:
    variables.global_variables_initializer().run()
    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
    for expected in range(50):
      index, value = sess.run([index_tensor, value_tensor])
      self.assertEqual(expected, int(index[0]))
      self.assertEqual(expected, int(value[0]))
    coord.request_stop()
    coord.join(threads)
def testNotifyBeforeWait(self):
  """Checks that `wait()` returns when the closure finished beforehand."""
  closure_queue = coordinator_lib._CoordinatedClosureQueue()

  def func():
    logging.info('func running')

  coord = coordinator.Coordinator(clean_stop_exception_types=[])

  def process_queue():
    with coord.stop_on_exception():
      closure_queue.get()
      closure_queue.mark_finished()

  closure = coordinator_lib.Closure(func, closure_queue._cancellation_mgr)
  closure_queue.put(closure)

  consumer = threading.Thread(target=process_queue)
  consumer.start()
  coord.join([consumer])

  # The closure has already been taken and marked finished by the time we
  # get here; the assertion is simply that this wait does not time out.
  closure_queue.wait()
def testNumpyInputFnWithBatchSizeNotDividedByDataSize(self):
  """Five rows with batch size two: two full batches, one partial, then end."""
  batch_size = 2
  x = {'a': np.arange(5) * 1.0, 'b': np.arange(32, 37)}
  y = np.arange(-32, -27)
  # Expected (features['a'], features['b'], target) for successive dequeues.
  expected_batches = (
      ([0, 1], [32, 33], [-32, -31]),
      ([2, 3], [34, 35], [-30, -29]),
      ([4], [36], [-28]),
  )

  with self.test_session() as session:
    input_fn = numpy_io.numpy_input_fn(
        x, y, batch_size=batch_size, shuffle=False, num_epochs=1)
    features, target = input_fn()

    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(session, coord=coord)

    for want_a, want_b, want_target in expected_batches:
      got_features, got_target = session.run([features, target])
      self.assertAllEqual(got_features['a'], want_a)
      self.assertAllEqual(got_features['b'], want_b)
      self.assertAllEqual(got_target, want_target)

    with self.assertRaises(errors.OutOfRangeError):
      session.run([features, target])

    coord.request_stop()
    coord.join(threads)
def testMultiThreadedEstimateDataDistribution(self):
  """Fetches the class-probability estimate from 25 enqueue threads."""
  num_classes = 10
  num_enqueue_ops = 25

  # Build the graph: a random label feeding the distribution estimate.
  random_seed.set_random_seed(1234)
  scaled_uniform = random_ops.random_uniform([1]) * num_classes
  label = math_ops.cast(math_ops.round(scaled_uniform), dtypes_lib.int32)

  prob_estimate = sampling_ops._estimate_data_distribution(  # pylint: disable=protected-access
      label, num_classes)
  # Check that prob_estimate is well-behaved in a multithreaded context.
  _, _, [prob_estimate] = sampling_ops._verify_input(  # pylint: disable=protected-access
      [], label, [prob_estimate])

  # A queue runner with many identical enqueue ops gives us many threads,
  # each of which fetches `prob_estimate`.
  queue = data_flow_ops.FIFOQueue(
      capacity=num_enqueue_ops,
      dtypes=[prob_estimate.dtype],
      shapes=[prob_estimate.get_shape()])
  enqueue_op = queue.enqueue([prob_estimate])
  runner = queue_runner_impl.QueueRunner(
      queue, [enqueue_op] * num_enqueue_ops)
  queue_runner_impl.add_queue_runner(runner)
  out_tensor = queue.dequeue()

  # Run the multi-threaded session.
  with self.cached_session() as sess:
    # The running per-class totals live in variables and need initializing.
    variables.global_variables_initializer().run()

    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(coord=coord)

    for _ in range(num_enqueue_ops):
      sess.run([out_tensor])

    coord.request_stop()
    coord.join(threads)
def test_keyed_read_text_lines(self):
  """Reads a three-line text file and checks each `filename:line` key."""
  gfile.Glob = self._orig_glob
  filename = self._create_temp_file("ABC\nDEF\nGHK\n")

  batch_size = 1
  queue_capacity = 5
  name = "my_batch"
  # (key suffix, line contents) in the order they should be read.
  expected_records = (
      (b":1", b"ABC"),
      (b":2", b"DEF"),
      (b":3", b"GHK"),
  )

  with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
    keys, inputs = graph_io.read_keyed_batch_examples(
        filename,
        batch_size,
        reader=io_ops.TextLineReader,
        randomize_input=False,
        num_epochs=1,
        queue_capacity=queue_capacity,
        name=name)
    for batched in (keys, inputs):
      self.assertAllEqual((None,), batched.get_shape().as_list())
    session.run(variables.local_variables_initializer())

    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(session, coord=coord)

    for key_suffix, line in expected_records:
      self.assertAllEqual(
          session.run([keys, inputs]),
          [[filename.encode("utf-8") + key_suffix], [line]])

    with self.assertRaises(errors.OutOfRangeError):
      session.run(inputs)

    coord.request_stop()
    coord.join(threads)
def testPandasInputFn_ProducesOutputsWhenDataSizeNotDividedByBatchSize(self):
  """Five rows with batch size two: two full batches, one partial, then end."""
  if not HAS_PANDAS:
    # Report the test as skipped rather than letting it pass without
    # exercising anything.
    self.skipTest("pandas is not available.")
  with self.cached_session() as session:
    index = np.arange(100, 105)
    a = np.arange(5)
    b = np.arange(32, 37)
    x = pd.DataFrame({'a': a, 'b': b}, index=index)
    y = pd.Series(np.arange(-32, -27), index=index)

    input_fn = pandas_io.pandas_input_fn(
        x, y, batch_size=2, shuffle=False, num_epochs=1)

    results = input_fn()

    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(session, coord=coord)

    features, target = session.run(results)
    self.assertAllEqual(features['a'], [0, 1])
    self.assertAllEqual(features['b'], [32, 33])
    self.assertAllEqual(target, [-32, -31])

    features, target = session.run(results)
    self.assertAllEqual(features['a'], [2, 3])
    self.assertAllEqual(features['b'], [34, 35])
    self.assertAllEqual(target, [-30, -29])

    # The last batch holds the single leftover row.
    features, target = session.run(results)
    self.assertAllEqual(features['a'], [4])
    self.assertAllEqual(features['b'], [36])
    self.assertAllEqual(target, [-28])

    with self.assertRaises(errors.OutOfRangeError):
      session.run(results)

    coord.request_stop()
    coord.join(threads)
def test_chained_exact_posterior_recovery_no_transition_noise(self):
  """Checks the chained end state after evaluating the loss window by window.

  The loss is evaluated `data[TIMES].shape[1] // chunk_size` times through a
  `ChainingStateManager`, after which the end state is fetched with the true
  parameters fed and compared against the same targets as the unchained
  test: zero covariance, the propagated prior mean, and the last time value.
  """
  times_key = feature_keys.TrainEvalFeatures.TIMES
  with self.test_session() as session:
    stub_model, data, true_params = self._get_single_model()
    chunk_size = 10
    reader = input_pipeline.NumpyReader(data)
    input_fn = test_utils.AllWindowInputFn(reader, window_size=chunk_size)
    features, _ = input_fn()

    state_manager = state_management.ChainingStateManager(
        state_saving_interval=1)
    state_manager.initialize_graph(stub_model)
    model_outputs = state_manager.define_loss(
        model=stub_model,
        features=features,
        mode=estimator_lib.ModeKeys.TRAIN)

    variables.global_variables_initializer().run()
    coord = coordinator_lib.Coordinator()
    queue_runner_impl.start_queue_runners(session, coord=coord)

    series_length = data[times_key].shape[1]
    for _ in range(series_length // chunk_size):
      model_outputs.loss.eval()
    end_state = session.run(model_outputs.end_state, feed_dict=true_params)
    posterior_mean, posterior_var, posterior_times = end_state
    coord.request_stop()
    coord.join()

    self.assertAllClose(numpy.zeros([1, 4, 4]), posterior_var, atol=1e-2)

    transition_power = numpy.linalg.matrix_power(
        stub_model.transition, data[times_key].shape[1])
    expected_mean = numpy.dot(
        transition_power, true_params[stub_model.prior_state_mean])
    self.assertAllClose(expected_mean, posterior_mean[0], rtol=1e-1)

    self.assertAllClose(data[times_key][:, -1], posterior_times)
def testGeneratorInputFnLabelDict(self):
  """Checks that a list `target_key` yields a dict of targets."""

  def generator():
    for index in range(2):
      yield {
          'a': np.ones(1) * index,
          'b': np.ones(1) * index + 32,
          'label': np.ones(1) * index - 32,
          'label2': np.ones(1) * index - 64,
      }

  def as_column(values):
    # Each expected batch is a column vector with one row per example.
    return np.asarray(values).reshape(-1, 1)

  with self.test_session() as session:
    input_fn = generator_io.generator_input_fn(
        generator,
        target_key=['label', 'label2'],
        batch_size=2,
        shuffle=False,
        num_epochs=1)
    features, target = input_fn()

    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(session, coord=coord)

    got_features, got_target = session.run([features, target])
    self.assertAllEqual(got_features['a'], as_column([0, 1]))
    self.assertAllEqual(got_features['b'], as_column([32, 33]))
    self.assertAllEqual(got_target['label'], as_column([-32, -31]))
    self.assertAllEqual(got_target['label2'], as_column([-64, -63]))

    session.run([features])
    with self.assertRaises(errors.OutOfRangeError):
      session.run([features, target])

    coord.request_stop()
    coord.join(threads)
def _test(self):
  """Compares a DataFrame column transform against the reference `op`.

  `np_dtype`, `name` and `op` are not defined in this function; they are
  presumably captured from an enclosing factory that generates one test per
  transform -- confirm against the caller.

  The transform called `name` is looked up on the "value" series, applied,
  and its output is compared with `op` applied directly to the built "value"
  tensor.
  """
  if np_dtype == bool:
    arr = np.array([True] * int(NUMPY_ARRAY_SIZE / 2) +
                   [False] * int(NUMPY_ARRAY_SIZE / 2))
    np.random.shuffle(arr)
  else:
    arr = np.arange(NUMPY_ARRAY_SIZE, dtype=np_dtype)
  frame = df.TensorFlowDataFrame.from_numpy(
      arr, batch_size=NUMPY_ARRAY_SIZE, shuffle=False)

  self.assertTrue(hasattr(frame["value"], name))

  frame["actual"] = getattr(frame["value"], name)()
  frame_built = frame.build()
  expected_tensor = op(frame_built["value"])
  actual_tensor = frame_built["actual"]

  # Use the session as a context manager so it is closed even when a run
  # fails; previously it was created and never released.
  with session_lib.Session() as session:
    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(sess=session, coord=coord)
    actual, expected = session.run([actual_tensor, expected_tensor])
    coord.request_stop()
    coord.join(threads)
  np.testing.assert_almost_equal(expected, actual)
def _test_missing_values(self, cut_start, cut_end, offset):
  """Evaluates the chained loss on data with a section cut out.

  Args:
    cut_start: forwarded to `_make_test_data` as `cut_start`.
    cut_end: forwarded to `_make_test_data` as `cut_end`.
    offset: forwarded to `_make_test_data` as `offset`.

  Returns:
    The loss value from the eleventh evaluation of the loss tensor.
  """
  stub_model = StubTimeSeriesModel()
  data = self._make_test_data(
      length=100, cut_start=cut_start, cut_end=cut_end, offset=offset)
  reader = input_pipeline.NumpyReader(data)
  input_fn = test_utils.AllWindowInputFn(reader, window_size=10)
  chainer = state_management.ChainingStateManager(state_saving_interval=1)
  features, _ = input_fn()
  stub_model.initialize_graph()
  chainer.initialize_graph(model=stub_model)
  model_outputs = chainer.define_loss(
      model=stub_model,
      features=features,
      mode=estimator_lib.ModeKeys.TRAIN)
  loss = model_outputs.loss

  with self.test_session() as session:
    variables.global_variables_initializer().run()
    coord = coordinator_lib.Coordinator()
    queue_runner_impl.start_queue_runners(session, coord=coord)
    # Ten evaluations whose values are discarded, then the one we report.
    for _ in range(10):
      loss.eval()
    returned_loss = loss.eval()
    coord.request_stop()
    coord.join()
    return returned_loss
def testGeneratorSingleInputFn(self):
  """Checks the features-only output when `target_key` is None."""

  def generator():
    for index in range(2):
      yield {'a': np.ones(1) * index}

  with self.test_session() as session:
    input_fn = generator_io.generator_input_fn(
        generator,
        target_key=None,
        batch_size=2,
        shuffle=False,
        num_epochs=1)
    features = input_fn()

    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(session, coord=coord)

    first_batch = session.run([features])
    expected_a = np.asarray([0, 1]).reshape(-1, 1)
    self.assertAllEqual(first_batch[0]['a'], expected_a)

    session.run([features])
    with self.assertRaises(errors.OutOfRangeError):
      session.run([features])

    coord.request_stop()
    coord.join(threads)
def testGeneratorInputFnWithDifferentDimensionsOfFeatures(self):
  """Batches 2-D features of three different sizes and checks the first batch."""

  def generator():
    for index in range(100):
      yield {'a': np.ones((10, 10)) * index,
             'b': np.ones((5, 5)) * index + 32,
             'label': np.ones((3, 3)) * index - 32}

  def zeros_then_ones(side):
    # A batch of two square matrices: all zeros (index 0), all ones (index 1).
    stacked = np.vstack((np.zeros((side, side)), np.ones((side, side))))
    return stacked.reshape(2, side, side)

  with self.test_session() as session:
    input_fn = generator_io.generator_input_fn(
        generator,
        target_key="label",
        batch_size=2,
        shuffle=False,
        num_epochs=1)
    features, target = input_fn()

    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(session, coord=coord)

    got_features, got_target = session.run([features, target])
    self.assertAllEqual(got_features['a'], zeros_then_ones(10))
    self.assertAllEqual(got_features['b'], zeros_then_ones(5) + 32)
    self.assertAllEqual(got_target, zeros_then_ones(3) - 32)

    coord.request_stop()
    coord.join(threads)
def testNumpyInputFnWithZeroEpochs(self):
  """With `num_epochs=0` the very first dequeue raises OutOfRangeError."""
  x = {'a': np.arange(4) * 1.0, 'b': np.arange(32, 36)}
  y = np.arange(-32, -28)

  with self.test_session() as session:
    input_fn = numpy_io.numpy_input_fn(
        x, y, batch_size=2, shuffle=False, num_epochs=0)
    features, target = input_fn()

    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(session, coord=coord)

    with self.assertRaises(errors.OutOfRangeError):
      session.run([features, target])

    coord.request_stop()
    coord.join(threads)
def testRespectCoordShouldStop(self):
  """Queue runner threads do no work when the coordinator already stopped."""
  with self.cached_session() as sess:
    # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
    zero64 = constant_op.constant(0, dtype=dtypes.int64)
    var = variables.VariableV1(zero64)
    count_up_to = var.count_up_to(3)
    queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
    variables.global_variables_initializer().run()
    qr = queue_runner_impl.QueueRunner(queue, [count_up_to])

    # Ask the coordinator to stop before any thread exists, so the queue
    # runner should finish immediately.
    coord = coordinator.Coordinator()
    coord.request_stop()
    threads = qr.create_threads(sess, coord)

    thread_names = sorted([thread.name for thread in threads])
    self.assertEqual(thread_names,
                     ["QueueRunnerThread-fifo_queue-CountUpTo:0",
                      "QueueRunnerThread-fifo_queue-close_on_stop"])

    for thread in threads:
      thread.start()
    coord.join()

    self.assertEqual(0, len(qr.exceptions_raised))
    # The counter was never incremented.
    self.assertEqual(0, var.eval())
def _test_pass_to_next(self, read_offset, step, correct_offset):
  """Evaluates loss on a window shifted by `read_offset` after priming state.

  Two whole-dataset input functions are built over the same data: one that
  drops the last `read_offset` entries and one that drops the first
  `read_offset` entries. The loss for the first is evaluated once, then the
  loss for the second is evaluated and returned.

  Args:
    read_offset: number of entries by which the second window is shifted
      relative to the first. Zero gives two identical, full-length windows.
    step: forwarded to `_make_test_data` as `step`.
    correct_offset: forwarded to `StubTimeSeriesModel`.

  Returns:
    The loss value evaluated on the shifted window.
  """
  stub_model = StubTimeSeriesModel(correct_offset=correct_offset)
  data = self._make_test_data(
      length=100 + read_offset, cut_start=None, cut_end=None, offset=100.,
      step=step)
  # Use an explicit end index: `v[:-read_offset]` would produce an empty
  # slice (`v[:0]`) when read_offset is 0.
  init_input_fn = input_pipeline.WholeDatasetInputFn(
      input_pipeline.NumpyReader(
          {k: v[:len(v) - read_offset] for k, v in data.items()}))
  result_input_fn = input_pipeline.WholeDatasetInputFn(
      input_pipeline.NumpyReader(
          {k: v[read_offset:] for k, v in data.items()}))

  chainer = state_management.ChainingStateManager(
      state_saving_interval=1)
  stub_model.initialize_graph()
  chainer.initialize_graph(model=stub_model)
  init_model_outputs = chainer.define_loss(
      model=stub_model, features=init_input_fn()[0],
      mode=estimator_lib.ModeKeys.TRAIN)
  result_model_outputs = chainer.define_loss(
      model=stub_model, features=result_input_fn()[0],
      mode=estimator_lib.ModeKeys.TRAIN)
  with self.cached_session() as session:
    variables.global_variables_initializer().run()
    coordinator = coordinator_lib.Coordinator()
    queue_runner_impl.start_queue_runners(session, coord=coordinator)
    # Evaluate the first window before the shifted one.
    init_model_outputs.loss.eval()
    returned_loss = result_model_outputs.loss.eval()
    coordinator.request_stop()
    coordinator.join()
    return returned_loss
def _random_window_input_fn_test_template(self, time_series_reader,
                                          window_size, batch_size,
                                          num_features,
                                          discard_out_of_order=False):
  """Draws one batch from `RandomWindowInputFn` and validates its contents.

  Args:
    time_series_reader: reader handed to `RandomWindowInputFn`.
    window_size: expected length of each window.
    batch_size: expected number of windows in the batch.
    num_features: expected size of the last axis of the values.
    discard_out_of_order: accepted but not read by this method.

  Returns:
    The evaluated feature dictionary.
  """
  input_fn = input_pipeline.RandomWindowInputFn(
      time_series_reader=time_series_reader,
      window_size=window_size,
      batch_size=batch_size)
  result, _ = input_fn()
  init_op = variables.local_variables_initializer()
  with self.cached_session() as session:
    coord = coordinator_lib.Coordinator()
    queue_runner_impl.start_queue_runners(session, coord=coord)
    session.run(init_op)
    features = session.run(result)
    coord.request_stop()
    coord.join()

  times = features[TrainEvalFeatures.TIMES]
  self.assertAllEqual([batch_size, window_size], times.shape)
  # Within every window, each time must be exactly one more than the last.
  for window_position in range(window_size - 1):
    for batch_position in range(batch_size):
      following = times[batch_position, window_position + 1]
      current = times[batch_position, window_position]
      self.assertEqual(following, current + 1)

  self.assertAllEqual([batch_size, window_size, num_features],
                      features[TrainEvalFeatures.VALUES].shape)
  self.assertEqual("int64", features[TrainEvalFeatures.TIMES].dtype)
  for feature_number in range(num_features):
    expected_values = features[TrainEvalFeatures.TIMES] * 2. + feature_number
    self.assertAllEqual(
        expected_values,
        features[TrainEvalFeatures.VALUES][:, :, feature_number])
  return features
def setUp(self, num_workers, num_ps):
  """Starts a multi-process cluster and builds the strategy on top of it.

  Args:
    num_workers: number of workers to create; also stored on the instance.
    num_ps: number of parameter servers to create; also stored on the
      instance.
  """
  super(BaseFaultToleranceTest, self).setUp()

  self._cluster = multi_worker_test_base.create_multi_process_cluster(
      num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
  resolved_spec = self._cluster.cluster_resolver.cluster_spec()
  self._cluster_def = resolved_spec.as_dict()
  # Add a chief entry on a freshly picked port.
  chief_port = multi_worker_test_base.pick_unused_port()
  self._cluster_def["chief"] = ["localhost:%d" % chief_port]
  cluster_resolver = SimpleClusterResolver(
      server_lib.ClusterSpec(self._cluster_def), rpc_layer="grpc")

  # The strategy's constructor would connect to the cluster.
  self.strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      cluster_resolver)
  self.cluster_coord = cluster_coordinator.ClusterCoordinator(self.strategy)

  self.thread_coord = thread_coordinator.Coordinator(
      clean_stop_exception_types=[])
  self.num_workers = num_workers
  self.num_ps = num_ps
def testConditionallyEnqueueAndBatch(self):
  """Only tensors whose `keep_input` is true may appear in the batch."""
  random_seed.set_random_seed(1234)
  coin_flip = math_ops.greater(.5, random_ops.random_uniform([]))
  tensor = control_flow_ops.cond(
      coin_flip,
      lambda: constant_op.constant(1.0),
      lambda: constant_op.constant(2.0))
  keep_input = math_ops.equal(tensor, 2.0)
  batch_size = 4

  # Set up the test graph.
  batched = sampling_ops._conditional_batch(  # pylint: disable=protected-access
      [tensor], keep_input, batch_size)
  [batch] = batched

  # Check conditional operation.
  with self.test_session():
    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(coord=coord)

    batch_np = batch.eval()

    coord.request_stop()
    coord.join(threads)

  # Every element must be 2.0, the value that is kept; 1.0 is the value
  # that is never kept.
  self.assertListEqual(list(batch_np), [2.0] * batch_size)
def setUp(self):
  """Resets the graph and builds the placeholders and input queue."""
  ops.reset_default_graph()

  # Feeds for the three queue components.
  self.scalar_int_feed = array_ops.placeholder(dtypes_lib.int32, ())
  self.unk_int64_feed = array_ops.placeholder(dtypes_lib.int64, (None,))
  self.vec3_str_feed = array_ops.placeholder(dtypes_lib.string, (3,))
  self.sparse_c = sparse_tensor.SparseTensor(
      indices=[[0]], values=[1.0], dense_shape=[1])

  self._coord = coordinator.Coordinator()
  # The capacity is deliberately huge so the main thread can enqueue every
  # input without ever blocking.
  component_dtypes = [
      dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.string
  ]
  component_shapes = [(), (None,), (3,)]
  padded_queue = data_flow_ops.PaddingFIFOQueue(
      5000, dtypes=component_dtypes, shapes=component_shapes)

  feeds = (self.scalar_int_feed, self.unk_int64_feed, self.vec3_str_feed)
  self._input_enqueue_op = padded_queue.enqueue(feeds)
  dequeued = padded_queue.dequeue()
  self.scalar_int, self.unk_int64, self.vec3_str = dequeued
  self._threads = None
  self._close_op = padded_queue.close()
  self._sess = None
def testNumpyInputFnWithVeryLargeBatchSizeAndMultipleEpochs(self):
  """A batch size above the data size returns both epochs in one batch."""
  x = {'a': np.arange(2) * 1.0, 'b': np.arange(32, 34)}
  y = np.arange(-32, -30)

  with self.test_session() as session:
    input_fn = numpy_io.numpy_input_fn(
        x, y, batch_size=128, shuffle=False, num_epochs=2)
    features, target = input_fn()

    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(session, coord=coord)

    got_features, got_target = session.run([features, target])
    self.assertAllEqual(got_features['a'], [0, 1, 0, 1])
    self.assertAllEqual(got_features['b'], [32, 33, 32, 33])
    self.assertAllEqual(got_target, [-32, -31, -32, -31])

    with self.assertRaises(errors.OutOfRangeError):
      session.run([features, target])

    coord.request_stop()
    coord.join(threads)
def testRejectionBatchingBehavior(self):
  """Runs `stratified_sample` once with `enqueue_many=True` inputs."""
  batch_size = 20
  input_batch_size = 11
  val_input_batch = [array_ops.zeros([input_batch_size, 2, 3, 4])]
  coin_flip = math_ops.greater(.5, random_ops.random_uniform([]))
  # The whole label batch is either all 1s or all 3s.
  lbl_input_batch = control_flow_ops.cond(
      coin_flip,
      lambda: array_ops.ones([input_batch_size], dtype=dtypes.int32) * 1,
      lambda: array_ops.ones([input_batch_size], dtype=dtypes.int32) * 3)
  probs = np.array([0, .2, 0, .8, 0])
  data_batch, labels = sampling_ops.stratified_sample(
      val_input_batch,
      lbl_input_batch,
      probs,
      batch_size,
      init_probs=[0, .3, 0, .7, 0],
      enqueue_many=True)

  with self.cached_session() as sess:
    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(coord=coord)

    sess.run([data_batch, labels])

    coord.request_stop()
    coord.join(threads)
def testNormalBehavior(self):
  """Rejection sampling must only ever return the value accepted with prob 1."""
  coin_flip = math_ops.greater(.5, random_ops.random_uniform([]))
  tensor_list = [
      control_flow_ops.cond(
          coin_flip,
          lambda: constant_op.constant(1.0),
          lambda: constant_op.constant(2.0))
  ]

  def accept_prob_fn(x):
    # 1.0 maps to acceptance probability 0 and 2.0 maps to 1.
    return x[0] - 1.0

  batch_size = 10

  # Set up graph.
  sample = sampling_ops.rejection_sample(tensor_list, accept_prob_fn,
                                         batch_size)

  with self.cached_session() as sess:
    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(coord=coord)

    for _ in range(5):
      sample_np = sess.run(sample)[0]
      self.assertListEqual([2.0] * batch_size, list(sample_np))

    coord.request_stop()
    coord.join(threads)
def __init__(self, graph=None, ready_op=USE_DEFAULT, is_chief=True, init_op=USE_DEFAULT, init_feed_dict=None, local_init_op=USE_DEFAULT, logdir=None, summary_op=USE_DEFAULT, saver=USE_DEFAULT, global_step=USE_DEFAULT, save_summaries_secs=120, save_model_secs=600, recovery_wait_secs=30, stop_grace_secs=120, checkpoint_basename="model.ckpt", session_manager=None, summary_writer=USE_DEFAULT, init_fn=None): """Create a `Supervisor`. Args: graph: A `Graph`. The graph that the model will use. Defaults to the default `Graph`. The supervisor may add operations to the graph before creating a session, but the graph should not be modified by the caller after passing it to the supervisor. ready_op: `Operation` to check if the model is initialized. This operation is run by supervisors in `prepare_or_wait_for_session()` to check if the model is ready to use. The model is considered ready if that operation succeeds. Defaults to the operation returned from `tf.assert_variables_initialized()` If `None`, the model is not checked for readiness. is_chief: If True, create a chief supervisor in charge of initializing and restoring the model. If False, create a supervisor that relies on a chief supervisor for inits and restore. init_op: `Operation`. Used by chief supervisors to initialize the model when it can not be recovered. Defaults to an `Operation` that initializes all variables. If `None`, no initialization is done automatically unless you pass a value for `init_fn`, see below. init_feed_dict: A dictionary that maps `Tensor` objects to feed values. This feed dictionary will be used when `init_op` is evaluated. local_init_op: `Operation`. Used by all supervisors to run initializations that should run for every new supervisor instance. By default these are table initializers and initializers for local variables. If `None`, no further per supervisor-instance initialization is done automatically. logdir: A string. 
Optional path to a directory where to checkpoint the model and log events for the visualizer. Used by chief supervisors. The directory will be created if it does not exist. summary_op: An `Operation` that returns a Summary for the event logs. Used by chief supervisors if a `logdir` was specified. Defaults to the operation returned from merge_all_summaries(). If `None`, summaries are not computed automatically. saver: A Saver object. Used by chief supervisors if a `logdir` was specified. Defaults to the saved returned by Saver(). If `None`, the model is not saved automatically. global_step: An integer Tensor of size 1 that counts steps. The value from 'global_step' is used in summaries and checkpoint filenames. Default to the op named 'global_step' in the graph if it exists, is of rank 1, size 1, and of type tf.int32 ot tf.int64. If `None` the global step is not recorded in summaries and checkpoint files. Used by chief supervisors if a `logdir` was specified. save_summaries_secs: Number of seconds between the computation of summaries for the event log. Defaults to 120 seconds. Pass 0 to disable summaries. save_model_secs: Number of seconds between the creation of model checkpoints. Defaults to 600 seconds. Pass 0 to disable checkpoints. recovery_wait_secs: Number of seconds between checks that the model is ready. Used by supervisors when waiting for a chief supervisor to initialize or restore the model. Defaults to 30 seconds. stop_grace_secs: Grace period, in seconds, given to running threads to stop when `stop()` is called. Defaults to 120 seconds. checkpoint_basename: The basename for checkpoint saving. session_manager: `SessionManager`, which manages Session creation and recovery. If it is `None`, a default `SessionManager` will be created with the set of arguments passed in for backwards compatibility. summary_writer: `SummaryWriter` to use or `USE_DEFAULT`. Can be `None` to indicate that no summaries should be written. 
init_fn: Optional callable used to initialize the model. Called after the optional `init_op` is called. The callable must accept one argument, the session being initialized. Returns: A `Supervisor`. """ # Set default values of arguments. if graph is None: graph = ops.get_default_graph() with graph.as_default(): self._init_ready_op(ready_op=ready_op) self._init_init_op(init_op=init_op, init_feed_dict=init_feed_dict) self._init_local_init_op(local_init_op=local_init_op) self._init_saver(saver=saver) self._init_summary_op(summary_op=summary_op) self._init_global_step(global_step=global_step) self._graph = graph self._is_chief = is_chief self._coord = coordinator.Coordinator() self._started_threads = [] self._recovery_wait_secs = recovery_wait_secs self._stop_grace_secs = stop_grace_secs self._init_fn = init_fn # Set all attributes related to checkpointing and writing events to None. # Afterwards, set them appropriately for chief supervisors, as these are # the only supervisors that can write checkpoints and events. self._logdir = None self._save_summaries_secs = None self._save_model_secs = None self._save_path = None self._summary_writer = None if self._is_chief: self._logdir = logdir self._save_summaries_secs = save_summaries_secs self._save_model_secs = save_model_secs if self._logdir: self._save_path = os.path.join(self._logdir, checkpoint_basename) if summary_writer is Supervisor.USE_DEFAULT: if self._logdir: self._summary_writer = summary_io.SummaryWriter( self._logdir) else: self._summary_writer = summary_writer self._init_session_manager(session_manager=session_manager) self._verify_setup() # The graph is not allowed to change anymore. graph.finalize()
def _input_statistics_test_template(self, stat_object, num_features, dtype,
                                    give_full_data, warmup_iterations=0,
                                    rtol=1e-6, data_length=500,
                                    chunk_size=4):
  """Builds synthetic series data and checks the statistics computed on it.

  Args:
    stat_object: object providing `set_data` and `initialize_graph`.
    num_features: number of feature columns in the generated values.
    dtype: dtype whose `as_numpy_dtype` is used for the generated values.
    give_full_data: when true, the full (times, values) pair is handed to
      `stat_object.set_data` before the graph is built.
    warmup_iterations: number of times the observation count is evaluated
      before any assertion is made.
    rtol: relative tolerance for every comparison.
    data_length: number of time steps to generate.
    chunk_size: window size given to `RandomWindowInputFn`.
  """
  graph = ops.Graph()
  with graph.as_default():
    numpy_dtype = dtype.as_numpy_dtype
    # values[0, t, f] == t + f; times[0, t] == 2 * t - 3.
    step_column = numpy.arange(data_length, dtype=numpy_dtype)[..., None]
    feature_row = numpy.arange(num_features, dtype=numpy_dtype)[None, ...]
    values = ((step_column + feature_row)[None])
    times = 2 * (numpy.arange(data_length)[None]) - 3
    if give_full_data:
      stat_object.set_data((times, values))
    features = {
        TrainEvalFeatures.TIMES: times,
        TrainEvalFeatures.VALUES: values
    }
    reader = input_pipeline.NumpyReader(features)
    input_fn = input_pipeline.RandomWindowInputFn(
        batch_size=16, window_size=chunk_size, time_series_reader=reader)
    statistics = stat_object.initialize_graph(features=input_fn()[0])

    with self.session(graph=graph) as session:
      variables.global_variables_initializer().run()
      coord = coordinator_lib.Coordinator()
      queue_runner_impl.start_queue_runners(session, coord=coord)
      for _ in range(warmup_iterations):
        # A control dependency should ensure that, for queue-based
        # statistics, a use of any statistic is preceded by an update of all
        # adaptive statistics.
        statistics.total_observation_count.eval()

      start_moments = statistics.series_start_moments
      expected_start_mean = (
          range(num_features) + numpy.mean(numpy.arange(chunk_size))[None])
      self.assertAllClose(
          expected_start_mean, start_moments.mean.eval(), rtol=rtol)
      expected_start_variance = numpy.tile(
          numpy.var(numpy.arange(chunk_size))[None], [num_features])
      self.assertAllClose(
          expected_start_variance, start_moments.variance.eval(), rtol=rtol)

      overall_moments = statistics.overall_feature_moments
      self.assertAllClose(
          numpy.mean(values[0], axis=0),
          overall_moments.mean.eval(),
          rtol=rtol)
      self.assertAllClose(
          numpy.var(values[0], axis=0),
          overall_moments.variance.eval(),
          rtol=rtol)

      self.assertAllClose(-3, statistics.start_time.eval(), rtol=rtol)
      self.assertAllClose(
          data_length, statistics.total_observation_count.eval(), rtol=rtol)
      coord.request_stop()
      coord.join()
def _call_for_each_replica(distribution, fn, args, kwargs): """Run `fn` in separate threads, once per replica/worker device. Args: distribution: the DistributionStrategy object. fn: function to run (will be run once per device, each in its own thread). args: positional arguments for `fn` kwargs: keyword arguments for `fn`. Returns: Merged return value of `fn` across all replicas. Raises: RuntimeError: If fn() calls get_replica_context().merge_call() a different number of times from the available devices. """ # TODO(josh11b): Add this option once we add synchronization to variable # creation. Until then, this is pretty unsafe to use. run_concurrently = False if not context.executing_eagerly(): # Needed for per-thread device, etc. contexts in graph mode. ops.get_default_graph().switch_to_thread_local() coord = coordinator.Coordinator( clean_stop_exception_types=(_RequestedStop, )) shared_variable_store = {} # TODO(isaprykin): Create these threads once instead of during every run() # call. threads = [] for index, d in enumerate(distribution.worker_devices): variable_creator_fn = shared_variable_creator.make_fn( shared_variable_store, index) t = MirroredStrategy._MirroredReplicaThread( # pylint: disable=protected-access distribution, coord, d, variable_creator_fn, fn, *values.select_device(d, args), **values.select_device(d, kwargs)) threads.append(t) for t in threads: t.start() # When `fn` starts `should_run` event is set on _MirroredReplicaThread # (`MRT`) threads. The execution waits until # `MRT.has_paused` is set, which indicates that either `fn` is # complete or a `get_replica_context().merge_call()` is called. If `fn` is # complete, then `MRT.done` is set to True. Otherwise, arguments # of `get_replica_context().merge_call` from all paused threads are grouped # and the `merge_fn` is performed. Results of the # `get_replica_context().merge_call` are then set to `MRT.merge_result`. 
# Each such `get_replica_context().merge_call` call returns the # `MRT.merge_result` for that thread when `MRT.should_run` event # is reset again. Execution of `fn` resumes. try: with coord.stop_on_exception(): all_done = False while not all_done and not coord.should_stop(): done = [] if run_concurrently: for t in threads: t.should_run.set() for t in threads: t.has_paused.wait() t.has_paused.clear() if coord.should_stop(): return None done.append(t.done) else: for t in threads: t.should_run.set() t.has_paused.wait() t.has_paused.clear() if coord.should_stop(): return None done.append(t.done) if coord.should_stop(): return None all_done = all(done) if not all_done: if any(done): raise RuntimeError( "Some replicas made a different number of " "replica_context().merge_call() calls.") # get_replica_context().merge_call() case merge_args = values.regroup( {t.device: t.merge_args for t in threads}) merge_kwargs = values.regroup( {t.device: t.merge_kwargs for t in threads}) # We capture the name_scope of the MRT when we call merge_fn # to ensure that if we have opened a name scope in the MRT, # it will be respected when executing the merge function. We only # capture the name_scope from the first MRT and assume it is # the same for all other MRTs. mtt_captured_name_scope = threads[0].captured_name_scope with ops.name_scope(mtt_captured_name_scope): merge_result = threads[0].merge_fn( distribution, *merge_args, **merge_kwargs) for t in threads: t.merge_result = values.select_device( t.device, merge_result) finally: for t in threads: t.should_run.set() coord.join(threads) return values.regroup({t.device: t.main_result for t in threads})