def testCachingReusables(self):
  # Test that we can define reusable variables before the driver is connected.
  def foo_initializer():
    return 1
  def bar_initializer():
    return []
  def bar_reinitializer(bar):
    return []
  ray.reusables.foo = ray.Reusable(foo_initializer)
  ray.reusables.bar = ray.Reusable(bar_initializer, bar_reinitializer)

  @ray.remote
  def use_foo():
    return ray.reusables.foo

  @ray.remote
  def use_bar():
    ray.reusables.bar.append(1)
    return ray.reusables.bar

  ray.init(start_ray_local=True, num_workers=2)

  self.assertEqual(ray.get(use_foo.remote()), 1)
  self.assertEqual(ray.get(use_foo.remote()), 1)
  self.assertEqual(ray.get(use_bar.remote()), [1])
  self.assertEqual(ray.get(use_bar.remote()), [1])

  ray.worker.cleanup()
def testFailImportingReusableVariable(self):
  ray.init(start_ray_local=True, num_workers=2, driver_mode=ray.SILENT_MODE)

  # This will throw an exception when the reusable variable is imported on the
  # workers.
  def initializer():
    if ray.worker.global_worker.mode == ray.WORKER_MODE:
      raise Exception("The initializer failed.")
    return 0
  ray.reusables.foo = ray.Reusable(initializer)

  for _ in range(100):  # Retry if we need to wait longer.
    if len(ray.task_info()["failed_reusable_variable_imports"]) >= 1:
      break
    time.sleep(0.1)

  # Check that the error message is in the task info.
  self.assertTrue("The initializer failed." in
                  ray.task_info()["failed_reusable_variable_imports"][0]["error_message"])

  ray.worker.cleanup()
def testFailImportingReusableVariable(self):
  ray.init(start_ray_local=True, num_workers=2, driver_mode=ray.SILENT_MODE)

  # This will throw an exception when the reusable variable is imported on the
  # workers.
  def initializer():
    if ray.worker.global_worker.mode == ray.WORKER_MODE:
      raise Exception("The initializer failed.")
    return 0
  ray.reusables.foo = ray.Reusable(initializer)

  wait_for_errors("ReusableVariableImportError", 1)

  # Check that the error message is in the error info.
  self.assertTrue("The initializer failed." in
                  ray.error_info()["ReusableVariableImportError"][0]["message"])

  ray.worker.cleanup()
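# For reference, a minimal sketch of the wait_for_errors helper used above.
# The actual helper is defined elsewhere in the test file; this sketch assumes
# it simply polls ray.error_info() until the expected number of errors of the
# given type appears, mirroring the retry loops in the older tests.
def wait_for_errors(error_type, num_errors, timeout=10):
  start_time = time.time()
  while time.time() - start_time < timeout:
    # ray.error_info() maps error type names to lists of error records.
    if len(ray.error_info().get(error_type, [])) >= num_errors:
      return
    time.sleep(0.1)
  raise Exception("Timed out waiting for {} errors of type {}."
                  .format(num_errors, error_type))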
def testFailReinitializingVariable(self):
  ray.init(start_ray_local=True, num_workers=2, driver_mode=ray.SILENT_MODE)

  def initializer():
    return 0
  def reinitializer(foo):
    raise Exception("The reinitializer failed.")
  ray.reusables.foo = ray.Reusable(initializer, reinitializer)

  @ray.remote
  def use_foo():
    ray.reusables.foo

  use_foo.remote()
  for _ in range(100):  # Retry if we need to wait longer.
    if len(ray.task_info()["failed_reinitialize_reusable_variables"]) >= 1:
      break
    time.sleep(0.1)

  # Check that the error message is in the task info.
  self.assertTrue("The reinitializer failed." in
                  ray.task_info()["failed_reinitialize_reusable_variables"][0]["error_message"])

  ray.worker.cleanup()
def rnn_ray(argv):
  # Default values; these can be overridden by the command line flags below.
  # Note that num_of_workers must have a default, otherwise omitting -w would
  # raise a NameError when ray.init is called.
  num_of_workers = 1
  scale = 10
  num_steps = 10
  try:
    opts, args = getopt.getopt(argv, "hw:s:n:",
                               ["workers=", "scale=", "num_steps="])
  except getopt.GetoptError:
    print 'rnn_ray_loop -w <num_workers> -s <scale> -n <num_steps>'
    sys.exit(2)
  for opt, arg in opts:
    if opt == '-h':
      print 'rnn_ray_loop -w <num_workers> -s <scale> -n <num_steps>'
      sys.exit()
    elif opt in ("-w", "--workers"):
      num_of_workers = int(arg)
    elif opt in ("-s", "--scale"):
      scale = int(arg)
    elif opt in ("-n", "--num_steps"):
      print "num steps is {}".format(arg)
      num_steps = int(arg)

  ray.init(start_ray_local=True, num_workers=num_of_workers)

  start_time = time.time()
  scale = scale * 5
  batch_size = scale - 1
  xdim = scale * 10
  h1dim = (scale + 1) * 10
  h2dim = (scale + 2) * 10
  h3dim = (scale + 3) * 10
  h4dim = (scale + 4) * 10
  h5dim = (scale + 5) * 10
  ydim = (2 * scale + 6) * 10
  ray.reusables.net_vars = ray.Reusable(
      lambda: rnn.net_initialization(scale, num_steps, batch_size, xdim,
                                     h1dim, h2dim, h3dim, h4dim, h5dim, ydim),
      rnn.net_reinitialization)
  res = ray_rnn_int.remote(num_of_workers, scale, num_steps, batch_size, xdim,
                           h1dim, h2dim, h3dim, h4dim, h5dim, ydim)
  ray.get(res)
def testReusableVariablesInPythonMode(self):
  reload(test_functions)
  ray.init(start_ray_local=True, driver_mode=ray.PYTHON_MODE)

  def l_init():
    return []
  def l_reinit(l):
    return []
  ray.reusables.l = ray.Reusable(l_init, l_reinit)

  @ray.remote
  def use_l():
    l = ray.reusables.l
    l.append(1)
    return l

  # Get the local copy of the reusable variable. This should be stateful.
  l = ray.reusables.l
  assert_equal(l, [])

  # Make sure the remote function does what we expect.
  assert_equal(ray.get(use_l.remote()), [1])
  assert_equal(ray.get(use_l.remote()), [1])

  # Make sure the local copy of the reusable variable has not been mutated.
  assert_equal(l, [])
  l = ray.reusables.l
  assert_equal(l, [])

  # Make sure that running a remote function does not reset the state of the
  # local copy of the reusable variable.
  l.append(2)
  assert_equal(ray.get(use_l.remote()), [1])
  assert_equal(l, [2])

  ray.worker.cleanup()
def testUsingReusablesOnDriver(self):
  ray.init(start_ray_local=True, num_workers=1)

  # Test that we can add a variable to the key-value store.
  def foo_initializer():
    return []
  def foo_reinitializer(foo):
    return []
  ray.reusables.foo = ray.Reusable(foo_initializer, foo_reinitializer)

  @ray.remote
  def use_foo():
    foo = ray.reusables.foo
    foo.append(1)
    return foo

  # Check that running a remote function does not reset the reusable variable
  # on the driver.
  foo = ray.reusables.foo
  self.assertEqual(foo, [])
  foo.append(2)
  self.assertEqual(foo, [2])
  foo.append(3)
  self.assertEqual(foo, [2, 3])

  self.assertEqual(ray.get(use_foo.remote()), [1])
  self.assertEqual(ray.get(use_foo.remote()), [1])
  self.assertEqual(ray.get(use_foo.remote()), [1])

  # Check that the copy of foo on the driver has not changed.
  self.assertEqual(foo, [2, 3])
  foo = ray.reusables.foo
  self.assertEqual(foo, [2, 3])

  ray.worker.cleanup()
def testFailReinitializingVariable(self):
  ray.init(start_ray_local=True, num_workers=2, driver_mode=ray.SILENT_MODE)

  def initializer():
    return 0
  def reinitializer(foo):
    raise Exception("The reinitializer failed.")
  ray.reusables.foo = ray.Reusable(initializer, reinitializer)

  @ray.remote
  def use_foo():
    ray.reusables.foo

  use_foo.remote()
  wait_for_errors("ReusableVariableReinitializeError", 1)

  # Check that the error message is in the error info.
  self.assertTrue("The reinitializer failed." in
                  ray.error_info()["ReusableVariableReinitializeError"][0]["message"])

  ray.worker.cleanup()
def testReusables(self):
  ray.init(start_ray_local=True, num_workers=1)

  # Test that we can add a variable to the key-value store.
  def foo_initializer():
    return 1
  def foo_reinitializer(foo):
    return foo
  ray.reusables.foo = ray.Reusable(foo_initializer, foo_reinitializer)
  self.assertEqual(ray.reusables.foo, 1)

  @ray.remote
  def use_foo():
    return ray.reusables.foo

  self.assertEqual(ray.get(use_foo.remote()), 1)
  self.assertEqual(ray.get(use_foo.remote()), 1)
  self.assertEqual(ray.get(use_foo.remote()), 1)

  # Test that we can add a variable to the key-value store, mutate it, and
  # reset it.
  def bar_initializer():
    return [1, 2, 3]
  ray.reusables.bar = ray.Reusable(bar_initializer)

  @ray.remote
  def use_bar():
    ray.reusables.bar.append(4)
    return ray.reusables.bar

  self.assertEqual(ray.get(use_bar.remote()), [1, 2, 3, 4])
  self.assertEqual(ray.get(use_bar.remote()), [1, 2, 3, 4])
  self.assertEqual(ray.get(use_bar.remote()), [1, 2, 3, 4])

  # Test that we can use the reinitializer.
  def baz_initializer():
    return np.zeros([4])
  def baz_reinitializer(baz):
    for i in range(len(baz)):
      baz[i] = 0
    return baz
  ray.reusables.baz = ray.Reusable(baz_initializer, baz_reinitializer)

  @ray.remote
  def use_baz(i):
    baz = ray.reusables.baz
    baz[i] = 1
    return baz

  assert_equal(ray.get(use_baz.remote(0)), np.array([1, 0, 0, 0]))
  assert_equal(ray.get(use_baz.remote(1)), np.array([0, 1, 0, 0]))
  assert_equal(ray.get(use_baz.remote(2)), np.array([0, 0, 1, 0]))
  assert_equal(ray.get(use_baz.remote(3)), np.array([0, 0, 0, 1]))

  # Make sure the reinitializer is actually getting called. Note that this is
  # not the correct usage of a reinitializer because it does not reset qux to
  # its original state. This is just for testing.
  def qux_initializer():
    return 0
  def qux_reinitializer(x):
    return x + 1
  ray.reusables.qux = ray.Reusable(qux_initializer, qux_reinitializer)

  @ray.remote
  def use_qux():
    return ray.reusables.qux

  self.assertEqual(ray.get(use_qux.remote()), 0)
  self.assertEqual(ray.get(use_qux.remote()), 1)
  self.assertEqual(ray.get(use_qux.remote()), 2)

  ray.worker.cleanup()
# Arguments to specify where the imagenet data is stored.
parser = argparse.ArgumentParser(description="Run the AlexNet example.")
parser.add_argument("--s3-bucket", required=True, type=str,
                    help="Name of the bucket that contains the image data.")
parser.add_argument("--key-prefix", default="ILSVRC2012_img_train/n015",
                    type=str, help="Prefix for files to fetch.")
parser.add_argument("--label-file", default="train.txt", type=str,
                    help="File containing labels.")

if __name__ == "__main__":
  args = parser.parse_args()

  ray.init(start_ray_local=True, num_workers=10)

  # Note that we do not do sess.run(tf.initialize_all_variables()) because
  # that would result in a different initialization on each worker. Instead,
  # we initialize the weights on the driver and load the weights on the
  # workers every time we compute a gradient.
  ray.reusables.net_vars = ray.Reusable(alexnet.net_initialization,
                                        alexnet.net_reinitialization)

  # Prepare keys for downloading the data.
  s3_resource = boto3.resource("s3")
  imagenet_bucket = s3_resource.Bucket(args.s3_bucket)
  objects = imagenet_bucket.objects.filter(Prefix=args.key_prefix)
  image_tar_files = [str(obj.key) for obj in objects.all()]
  print "Images will be downloaded from {} files.".format(len(image_tar_files))

  # Download the label file and create a dictionary mapping the filenames of
  # the images to their labels.
  s3_client = boto3.client("s3")
  label_file = s3_client.get_object(Bucket=args.s3_bucket, Key=args.label_file)
  filename_label_str = label_file["Body"].read().strip().split("\n")
  filename_label_pairs = [line.split(" ") for line in filename_label_str]
  filename_label_dict = dict([(os.path.basename(name), label)
                              for name, label in filename_label_pairs])
# Function for initializing the gym environment.
def env_initializer():
  return gym.make("Pong-v0")

# Function for reinitializing the gym environment in order to guarantee that
# the state of the game is reset after each remote task.
def env_reinitializer(env):
  env.reset()
  return env

# Create a reusable variable for the gym environment.
ray.reusables.env = ray.Reusable(env_initializer, env_reinitializer)

def sigmoid(x):
  # Sigmoid "squashing" function, maps values to the interval [0, 1].
  return 1.0 / (1.0 + np.exp(-x))

def preprocess(I):
  """Preprocess a 210x160x3 uint8 frame into a 6400 (80x80) 1D float vector."""
  I = I[35:195]  # Crop.
  I = I[::2, ::2, 0]  # Downsample by a factor of 2.
  I[I == 144] = 0  # Erase background (background type 1).
  I[I == 109] = 0  # Erase background (background type 2).
  I[I != 0] = 1  # Set everything else (paddles, ball) to 1.
  return I.astype(np.float).ravel()
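# A minimal usage sketch (not part of the original example): a hypothetical
# remote task that plays one episode with random actions. Because
# env_reinitializer resets the game after every task, each invocation of this
# task starts from a fresh state.
@ray.remote
def play_random_episode():
  env = ray.reusables.env
  observation = env.reset()
  total_reward = 0.0
  done = False
  while not done:
    # Take a random action; a real agent would compute an action from the
    # preprocessed observation instead.
    observation, reward, done, info = env.step(env.action_space.sample())
    total_reward += reward
  return total_reward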
  return sess, cross_entropy, cross_entropy_grads, x, y_, get_weights, set_weights

# By default, when a reusable variable is used by a remote function, the
# initialization code will be rerun at the end of the remote task to ensure
# that the state of the variable is not changed by the remote task. However,
# the initialization code may be expensive. This case is one example, because
# a TensorFlow network is constructed. In this case, we pass in a special
# reinitialization function which gets run instead of the original
# initialization code. As users, if we pass in custom reinitialization code,
# we must ensure that no state is leaked between tasks.
def net_reinitialization(net_vars):
  return net_vars

# Create a reusable variable for the network.
ray.reusables.net_vars = ray.Reusable(net_initialization, net_reinitialization)

# Load the weights into the network.
def load_weights(theta):
  sess, _, _, _, _, get_weights, set_weights = ray.reusables.net_vars
  set_weights([theta[:w_size].reshape(w_shape),
               theta[w_size:].reshape(b_shape)])

# Compute the loss on a batch of data.
@ray.remote
def loss(theta, xs, ys):
  sess, cross_entropy, _, x, y_, _, _ = ray.reusables.net_vars
  load_weights(theta)
  return float(sess.run(cross_entropy, feed_dict={x: xs, y_: ys}))

# Compute the gradient of the loss on a batch of data.
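# The body of the gradient function is not shown in the original snippet. The
# following is a minimal sketch that follows the same pattern as loss above;
# it assumes cross_entropy_grads holds the gradients of cross_entropy with
# respect to the weight and bias variables, and that theta is the same flat
# parameter vector that load_weights expects.
@ray.remote
def grad(theta, xs, ys):
  sess, _, cross_entropy_grads, x, y_, _, _ = ray.reusables.net_vars
  load_weights(theta)
  gradients = sess.run(cross_entropy_grads, feed_dict={x: xs, y_: ys})
  # Flatten the per-variable gradients into a single vector matching theta.
  return np.concatenate([g.flatten() for g in gradients])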
def rnn_ray(argv):
  # Default values; these can be overridden by the command line flags below.
  # Note that num_of_workers must have a default, otherwise omitting -w would
  # raise a NameError when ray.init is called.
  num_of_workers = 1
  scale = 10
  num_steps = 10
  try:
    opts, args = getopt.getopt(argv, "hw:s:n:",
                               ["workers=", "scale=", "num_steps="])
  except getopt.GetoptError:
    print 'rnn_ray_loop -w <num_workers> -s <scale> -n <num_steps>'
    sys.exit(2)
  for opt, arg in opts:
    if opt == '-h':
      print 'rnn_ray_loop -w <num_workers> -s <scale> -n <num_steps>'
      sys.exit()
    elif opt in ("-w", "--workers"):
      num_of_workers = int(arg)
    elif opt in ("-s", "--scale"):
      scale = int(arg)
    elif opt in ("-n", "--num_steps"):
      print "num steps is {}".format(arg)
      num_steps = int(arg)

  ray.init(start_ray_local=True, num_workers=num_of_workers)

  for k in range(1):
    scale = scale * 5
    batch_size = scale - 1
    xdim = scale * 10
    h1dim = (scale + 1) * 10
    h2dim = (scale + 2) * 10
    h3dim = (scale + 3) * 10
    h4dim = (scale + 4) * 10
    h5dim = (scale + 5) * 10
    ydim = (2 * scale + 6) * 10
    ray.reusables.net_vars = ray.Reusable(
        lambda: rnn.net_initialization(scale, num_steps, batch_size, xdim,
                                       h1dim, h2dim, h3dim, h4dim, h5dim,
                                       ydim),
        rnn.net_reinitialization)

    h1 = ra.zeros.remote([batch_size, h1dim])
    h2 = ra.zeros.remote([batch_size, h2dim])
    h3 = ra.zeros.remote([batch_size, h3dim])
    h4 = ra.zeros.remote([batch_size, h4dim])
    h5 = ra.zeros.remote([batch_size, h5dim])
    inputs = [ra.random.normal.remote([batch_size, xdim])
              for _ in range(num_steps)]

    # Run the distributed RNN, timing ten runs each of one through six layers.
    elapsed_time_1_layers = []
    elapsed_time_2_layers = []
    elapsed_time_3_layers = []
    elapsed_time_4_layers = []
    elapsed_time_5_layers = []
    elapsed_time_6_layers = []
    for _ in range(10):
      start_time = time.time()
      for t in range(num_steps):
        h1 = rnn.first_layer.remote(inputs[t], h1)
      # Fetch the final result to make sure all of the tasks have finished.
      ray.get(h1)
      end_time = time.time()
      elapsed_time_1_layers.append(end_time - start_time)

      start_time = time.time()
      for t in range(num_steps):
        h1 = rnn.first_layer.remote(inputs[t], h1)
        h2 = rnn.second_layer.remote(h1, h2)
      ray.get(h2)
      end_time = time.time()
      elapsed_time_2_layers.append(end_time - start_time)

      start_time = time.time()
      for t in range(num_steps):
        h1 = rnn.first_layer.remote(inputs[t], h1)
        h2 = rnn.second_layer.remote(h1, h2)
        h3 = rnn.third_layer.remote(h2, h3)
      ray.get(h3)
      end_time = time.time()
      elapsed_time_3_layers.append(end_time - start_time)

      start_time = time.time()
      for t in range(num_steps):
        h1 = rnn.first_layer.remote(inputs[t], h1)
        h2 = rnn.second_layer.remote(h1, h2)
        h3 = rnn.third_layer.remote(h2, h3)
        h4 = rnn.fourth_layer.remote(h3, h4)
      ray.get(h4)
      end_time = time.time()
      elapsed_time_4_layers.append(end_time - start_time)

      start_time = time.time()
      for t in range(num_steps):
        h1 = rnn.first_layer.remote(inputs[t], h1)
        h2 = rnn.second_layer.remote(h1, h2)
        h3 = rnn.third_layer.remote(h2, h3)
        h4 = rnn.fourth_layer.remote(h3, h4)
        h5 = rnn.fifth_layer.remote(h4, h5)
      ray.get(h5)
      end_time = time.time()
      elapsed_time_5_layers.append(end_time - start_time)

      start_time = time.time()
      outputs = []
      for t in range(num_steps):
        h1 = rnn.first_layer.remote(inputs[t], h1)
        h2 = rnn.second_layer.remote(h1, h2)
        h3 = rnn.third_layer.remote(h2, h3)
        h4 = rnn.fourth_layer.remote(h3, h4)
        h5 = rnn.fifth_layer.remote(h4, h5)
        outputs.append(rnn.sixth_layer.remote(h5))
      for t in range(num_steps):
        ray.get(outputs[t])
      end_time = time.time()
      elapsed_time_6_layers.append(end_time - start_time)

    elapsed_time_1_layers = np.sort(elapsed_time_1_layers)
    elapsed_time_2_layers = np.sort(elapsed_time_2_layers)
    elapsed_time_3_layers = np.sort(elapsed_time_3_layers)
    elapsed_time_4_layers = np.sort(elapsed_time_4_layers)
    elapsed_time_5_layers = np.sort(elapsed_time_5_layers)
    elapsed_time_6_layers = np.sort(elapsed_time_6_layers)
    elapsed_time_1_layers_average = sum(elapsed_time_1_layers) / 10
    elapsed_time_2_layers_average = sum(elapsed_time_2_layers) / 10
    elapsed_time_3_layers_average = sum(elapsed_time_3_layers) / 10
    elapsed_time_4_layers_average = sum(elapsed_time_4_layers) / 10
    elapsed_time_5_layers_average = sum(elapsed_time_5_layers) / 10
    elapsed_time_6_layers_average = sum(elapsed_time_6_layers) / 10

    print ""
    print "Number of workers = {}.".format(num_of_workers)
    print "Scale = {}.".format(scale)
    print "Load measure (scale/num_workers) = {}.".format(scale / num_of_workers)
    print "Time required for 1 layer RNN:"
    print "  Average: {}".format(elapsed_time_1_layers_average)
    print "  90th percentile: {}".format(elapsed_time_1_layers[8])
    print "  Worst: {}".format(elapsed_time_1_layers[9])
    print "Time required for 2 layer RNN:"
    print "  Average: {}".format(elapsed_time_2_layers_average)
    print "  90th percentile: {}".format(elapsed_time_2_layers[8])
    print "  Worst: {}".format(elapsed_time_2_layers[9])
    print "Time required for 3 layer RNN:"
    print "  Average: {}".format(elapsed_time_3_layers_average)
    print "  90th percentile: {}".format(elapsed_time_3_layers[8])
    print "  Worst: {}".format(elapsed_time_3_layers[9])
    print "Time required for 4 layer RNN:"
    print "  Average: {}".format(elapsed_time_4_layers_average)
    print "  90th percentile: {}".format(elapsed_time_4_layers[8])
    print "  Worst: {}".format(elapsed_time_4_layers[9])
    print "Time required for 5 layer RNN:"
    print "  Average: {}".format(elapsed_time_5_layers_average)
    print "  90th percentile: {}".format(elapsed_time_5_layers[8])
    print "  Worst: {}".format(elapsed_time_5_layers[9])
    print "Time required for 6 layer RNN:"
    print "  Average: {}".format(elapsed_time_6_layers_average)
    print "  90th percentile: {}".format(elapsed_time_6_layers[8])
    print "  Worst: {}".format(elapsed_time_6_layers[9])
    print "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}".format(
        num_of_workers, scale, num_steps,
        elapsed_time_1_layers_average, elapsed_time_1_layers[8], elapsed_time_1_layers[9],
        elapsed_time_2_layers_average, elapsed_time_2_layers[8], elapsed_time_2_layers[9],
        elapsed_time_3_layers_average, elapsed_time_3_layers[8], elapsed_time_3_layers[9],
        elapsed_time_4_layers_average, elapsed_time_4_layers[8], elapsed_time_4_layers[9],
        elapsed_time_5_layers_average, elapsed_time_5_layers[8], elapsed_time_5_layers[9],
        elapsed_time_6_layers_average, elapsed_time_6_layers[8], elapsed_time_6_layers[9])
scale = 50
num_steps = 10
batch_size = scale - 1
xdim = scale * 10
h1dim = (scale + 1) * 10
h2dim = (scale + 2) * 10
h3dim = (scale + 3) * 10
h4dim = (scale + 4) * 10
h5dim = (scale + 5) * 10
ydim = (2 * scale + 6) * 10

ray.init(start_ray_local=True, num_workers=10)

ray.reusables.net_vars = ray.Reusable(
    lambda: rnn.net_initialization(scale, num_steps, batch_size, xdim, h1dim,
                                   h2dim, h3dim, h4dim, h5dim, ydim),
    rnn.net_reinitialization)

h1 = ra.zeros.remote([batch_size, h1dim])
h2 = ra.zeros.remote([batch_size, h2dim])
h3 = ra.zeros.remote([batch_size, h3dim])
h4 = ra.zeros.remote([batch_size, h4dim])
h5 = ra.zeros.remote([batch_size, h5dim])
inputs = [ra.random.normal.remote([batch_size, xdim])
          for _ in range(num_steps)]

# Run the distributed RNN.
start_time = time.time()
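# The remainder of this snippet is not shown in the original. The following is
# a minimal sketch of the timing loop, following the same layer-chaining
# pattern as the benchmark function above:
for t in range(num_steps):
  h1 = rnn.first_layer.remote(inputs[t], h1)
  h2 = rnn.second_layer.remote(h1, h2)
  h3 = rnn.third_layer.remote(h2, h3)
  h4 = rnn.fourth_layer.remote(h3, h4)
  h5 = rnn.fifth_layer.remote(h4, h5)
# Fetch the final hidden state to make sure all of the tasks have finished.
ray.get(h5)
end_time = time.time()
print "Distributed RNN, elapsed_time = {} seconds.".format(end_time - start_time)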