def get_null_reward(self, strategy, index_id_dict, trace="", record_name=None, direct=False):
    """Evaluate a placement strategy on the null graph and return its penalized time.

    Args:
        strategy: per-index decision sequence (default), or an already-built
            {node_name: decision} dict when ``direct`` is True.
        index_id_dict: maps positional index -> node name (ignored when direct).
        trace: optional trace-file path forwarded to the TGE evaluator.
        record_name: when set, the compiled graph def is also written to
            ``self.folder/record_name``.
        direct: skip the index -> name translation when True.

    Returns:
        np.float32 simulated step time (raw simulator time / 10**3), multiplied
        by 10 when any device's memory estimate exceeds its capacity.
    """
    name_list = [nodedef.name for nodedef in self.null_gdef.node]
    if not direct:
        strategy = {
            index_id_dict[index]: strategy_int
            for index, strategy_int in enumerate(strategy)
        }
    # Nodes missing from the strategy fall back to an arbitrary existing
    # decision; hoist the fallback out of the comprehension so it is
    # computed once instead of once per node.
    fallback = list(strategy.values())[0]
    strategy = {name: strategy.get(name, fallback) for name in name_list}
    bandwidth = config_dict.get("bandwidth", None)
    if bandwidth is None:  # `is None`, not `== None` (PEP 8)
        intra = "5000"
        inter = "1250"
    else:
        intra = bandwidth[0]
        inter = bandwidth[1]
    time_mem_tuple = tge.TGE(
        copy.deepcopy(self.null_gdef), self.devices,
        sink).fill_batchsize(self.global_batch_size).set_nccl_model(
            self.nccl_model).use_collective().custom(
                strategy).set_bandwidth(intra, inter).evaluate(
                    self.name_cost_dict, trace)
    time = time_mem_tuple[0]
    mem_list = time_mem_tuple[1]
    print(mem_list)
    time = float(time) / (10**3)
    if any(np.array(mem_list) > np.array(device_mems)):
        # Out-of-memory placements are penalized rather than rejected.
        time = time * 10
        print("oom")
    self.strategy_reward_dict[str(strategy)] = time
    if record_name:
        record_graph_def = tge.TGE(
            copy.deepcopy(self.null_gdef), self.devices,
            sink).custom(strategy).replace_placeholder(
                self.global_batch_size).use_collective().compile(
                ).get_result()
        with open(self.folder + "/" + record_name, "w") as f:
            f.write(pbtf.MessageToString(record_graph_def))
    return np.float32(time)
def get_reward(self, strategy, index_id_dict):
    """Return the simulated step time for a per-index strategy, with caching.

    Args:
        strategy: decision sequence; entry ``i`` applies to node
            ``index_id_dict[i]``.
        index_id_dict: maps positional index -> node name.

    Returns:
        np.float32 simulated time (raw simulator time / 10**3), multiplied
        by 10 when any device's memory estimate exceeds its capacity.
    """
    key = str(strategy)
    # Membership test instead of truthiness: with `if dict.get(key):` a
    # legitimately cached time of 0.0 was treated as a cache miss and
    # re-evaluated every call.
    if key in self.strategy_reward_dict:
        time = self.strategy_reward_dict[key]
    else:
        bandwidth = config_dict.get("bandwidth", None)
        if bandwidth is None:  # `is None`, not `== None` (PEP 8)
            intra = "5000"
            inter = "1250"
        else:
            intra = bandwidth[0]
            inter = bandwidth[1]
        time_mem_tuple = tge.TGE(copy.deepcopy(
            self.gdef), self.devices, sink).custom({
                index_id_dict[index]: strategy_int
                for index, strategy_int in enumerate(strategy)
            }).set_bandwidth(intra, inter).evaluate(
                self.name_cost_dict,
                self.folder + "/modified_strategy.json")
        time = time_mem_tuple[0]
        mem_list = time_mem_tuple[1]
        time = float(time) / (10**3)
        if any(np.array(mem_list) > np.array(device_mems)):
            # Penalize out-of-memory placements instead of rejecting them.
            time = time * 10
        #reward = np.sum(strategy*strategy)
        self.strategy_reward_dict[key] = time
    return np.float32(time)
def __init__(self, gdef_path, null_gdef_path, devices, folder):
    """Load the main and null GraphDefs, cost metadata, and the NCCL model.

    Args:
        gdef_path: text-format GraphDef for the profiled graph.
        null_gdef_path: text-format GraphDef for the null (shape-only) graph;
            its file name also selects the global batch size.
        devices: device name tuple handed to TGE.
        folder: working directory for strategy/trace output files.
    """
    self.gdef = graph_pb2.GraphDef()
    with open(gdef_path, "r") as f:
        pbtf.Parse(f.read(), self.gdef)
    self.null_gdef = graph_pb2.GraphDef()
    with open(null_gdef_path, "r") as f:
        pbtf.Parse(f.read(), self.null_gdef)
    self.folder = folder
    self.strategy_reward_dict = {}
    self.name_cost_dict = self.get_name_cost_dict()
    self.devices = devices
    self._tge = tge.TGE(self.gdef, devices)
    # Batch size is keyed off the graph file name; anything that is not
    # graph7/graph8 gets the default entry.
    if "graph7" in null_gdef_path:
        self.global_batch_size = batch_sizes[1]
    elif "graph8" in null_gdef_path:
        self.global_batch_size = batch_sizes[2]
    else:
        self.global_batch_size = batch_sizes[0]
    with open("nccl_model.pkl", "rb") as f:
        self.nccl_model = pkl.load(f)
def __init__(self, gdef_path, devices, folder):
    """Parse the text-format GraphDef at ``gdef_path`` and set up evaluation state.

    Args:
        gdef_path: path to a text-format GraphDef file.
        devices: device name tuple handed to TGE.
        folder: working directory for strategy/trace output files.
    """
    graph_def = graph_pb2.GraphDef()
    with open(gdef_path, "r") as f:
        pbtf.Parse(f.read(), graph_def)
    self.gdef = graph_def
    self.folder = folder
    self.strategy_reward_dict = {}
    self.name_cost_dict = self.get_name_cost_dict()
    self.devices = devices
    self._tge = tge.TGE(self.gdef, devices)
def change_model(self, index, config):
    """Build a per-op placement strategy from (scope-range, decision) pairs
    in ``config``, dump it, and write the TGE-transformed graph.

    Args:
        index: run index; used only to name the output sub-directory.
        config: flat list alternating [scope_index_range, decision, ...];
            each range (inclusive) of ``self.scopes`` gets that decision.
    """
    # Snapshot the untransformed default graph for this run.
    with open(self.model_name + "/" + str(index) + "/init_graph.pbtxt", "w") as f:
        f.write(str(tf.get_default_graph().as_graph_def(add_shapes=True)))
    strategy = {}
    assignment = {}
    for i in range(int(len(config) // 2)):
        indexs = config[i * 2]
        _strategy = config[i * 2 + 1]
        # Inclusive scope-index range -> same decision for every scope in it.
        for j in range(indexs[0], indexs[1] + 1, 1):
            assignment[self.scopes[j]] = _strategy
    '''
    for i in range(len(self.scopes)):
        if i <8:
            assignment[self.scopes[i]] = [0,0]
        elif i<14:
            assignment[self.scopes[i]] = [1,1]
        else:
            assignment[self.scopes[i]] = [2,3]
    '''
    op_scope_dict = self.compute_operation_scope_dict()
    for op in op_scope_dict:
        # One-hot device mask; decision lists the device indices to use.
        place = [0] * len(self.devices)
        decision = assignment[op_scope_dict[op]]
        #for i in range(decision[0],decision[1]+1,1):
        for i in decision:
            place[i] = 1
        strategy[op] = [1] + place
    for op in self.graph.get_operations():
        if op.name not in strategy.keys():
            print(op.name)
            # NOTE(review): reuses `place` from the LAST iteration of the
            # loop above — ops missing a scope inherit an arbitrary mask.
            # Confirm this is intended.
            strategy[op.name] = [1] + place
    import pickle as pkl
    with open(self.model_name + "/" + str(index) + "/strategy.pkl", "wb") as f:
        pkl.dump(strategy, f)
    import tge
    # options = [[0, 1], [1, 0], [0, 2], [2, 0], [1, 1]]
    # strategy = { node.name: [np.random.randint(0, 2)] + options[np.random.randint(0, len(options))] for node in gdef.node }
    g = (
        tge.TGE(self.gdef, self.devices).custom(strategy)
        #.replace_placeholder(self.batch_size)
        .use_collective().compile().get_result())
    with open(self.model_name + "/" + str(index) + "/modified.pbtxt", "w") as fo:
        fo.write(pbtf.MessageToString(g))
def directly_get_reward(self, strategy_dict):
    """Evaluate an explicit {node_name: decision} strategy dict (no caching).

    Unlike the other reward paths, the result is scaled by 10**6 and the
    out-of-memory penalty is x10000 — presumably a different reward scale
    for this caller; TODO confirm the units are intentional.

    Args:
        strategy_dict: complete per-node placement decisions.

    Returns:
        np.float32 simulated time.
    """
    bandwidth = config_dict.get("bandwidth", None)
    if bandwidth is None:  # `is None`, not `== None` (PEP 8)
        intra = "5000"
        inter = "1250"
    else:
        intra = bandwidth[0]
        inter = bandwidth[1]
    # NOTE: "best_stratey.json" typo is preserved — it is a runtime output
    # path that downstream tooling may already expect.
    time_mem_tuple = tge.TGE(copy.deepcopy(self.gdef), self.devices,
                             sink).custom(strategy_dict).set_bandwidth(
                                 intra, inter).evaluate(
                                     self.name_cost_dict,
                                     self.folder + "/best_stratey.json")
    time = time_mem_tuple[0]
    mem_list = time_mem_tuple[1]
    time = float(time) / (10**6)
    if any(np.array(mem_list) > np.array(device_mems)):
        time = time * 10000
    return np.float32(time)
def get_reward(self, strategy, index_id_dict, trace=""):
    """Evaluate a per-index strategy on the main graph using the NCCL model.

    Args:
        strategy: decision sequence; entry ``i`` applies to node
            ``index_id_dict[i]``.
        index_id_dict: maps positional index -> node name.
        trace: optional trace-file path forwarded to the TGE evaluator.

    Returns:
        np.float32 simulated step time (raw simulator time / 10**3),
        multiplied by 10 when any device memory estimate exceeds capacity.
        The result is also recorded in ``self.strategy_reward_dict``.
    """
    bandwidth = config_dict.get("bandwidth", None)
    if bandwidth is None:  # `is None`, not `== None` (PEP 8)
        intra = "5000"
        inter = "1250"
    else:
        intra = bandwidth[0]
        inter = bandwidth[1]
    time_mem_tuple = tge.TGE(
        copy.deepcopy(self.gdef), self.devices,
        sink).set_nccl_model(self.nccl_model).use_collective().custom({
            index_id_dict[index]: strategy_int
            for index, strategy_int in enumerate(strategy)
        }).set_bandwidth(intra, inter).evaluate(self.name_cost_dict, trace)
    time = time_mem_tuple[0]
    mem_list = time_mem_tuple[1]
    time = float(time) / (10**3)
    if any(np.array(mem_list) > np.array(device_mems)):
        # Penalize out-of-memory placements instead of rejecting them.
        time = time * 10
        print("oom")
    self.strategy_reward_dict[str(strategy)] = time
    return np.float32(time)
# Single-GPU smoke test: transform the model with TGE on one device,
# re-import the transformed graph, and prepare a feed dict.
BATCHSIZE = 110
opt = model_fn(None)
init = tf.global_variables_initializer()
gdef = tf.get_default_graph().as_graph_def(add_shapes=True)
devices = ("/GPU:0", )
import tge
# options = [[0, 1], [1, 0], [0, 2], [2, 0], [1, 1]]
# strategy = { node.name: [np.random.randint(0, 2)] + options[np.random.randint(0, len(options))] for node in gdef.node }
strategy = {node.name: [0, 2] for node in gdef.node}
g = (tge.TGE(gdef, devices).custom(strategy).compile().get_result())
with open("modified.pb", "w") as fo:
    fo.write(pbtf.MessageToString(g))
tf.reset_default_graph()
tf.import_graph_def(g)
graph = tf.get_default_graph()
# TGE renames graph nodes with a /replica_N suffix after transformation.
x = graph.get_tensor_by_name("import/Placeholder/replica_0:0")
y = graph.get_tensor_by_name("import/Placeholder_1/replica_0:0")
opt = graph.get_operation_by_name("import/GradientDescent/replica_0")
init = graph.get_operation_by_name("import/init/replica_0")
# NOTE(review): chunk is truncated here — the feed dict continues past this view.
data = {
    x: np.random.uniform(size=(BATCHSIZE, 224, 224, 3)),
# NOTE(review): the first two statements are the tail of a model-builder
# function whose `def` lies outside this chunk; their indentation is
# reconstructed — confirm against the original file.
    optimizer = tf.train.GradientDescentOptimizer(0.2).minimize(tf.reduce_sum(loss))
    return optimizer
import numpy as np
import tensorflow.compat.v1 as tf
import google.protobuf.text_format as pbtf
tf.disable_eager_execution()
opt = model_fn()
init = tf.global_variables_initializer()
gdef = tf.get_default_graph().as_graph_def(add_shapes=True)
# One GPU:0/GPU:1 pair on each of two tasks.
devices = (
    "/job:tge/replica:0/task:0/device:GPU:0",
    "/job:tge/replica:0/task:0/device:GPU:1",
    "/job:tge/replica:0/task:1/device:GPU:0",
    "/job:tge/replica:0/task:1/device:GPU:1"
)
import tge
# Op types given zero compute cost in the synthetic profile below.
noop = ('Placeholder', 'Const', 'Identity', 'NoOp', 'ReadVariableOp',
        'VarHandleOp', 'Shape')
g = (tge.TGE(gdef, devices)
     .custom({node.name: (0, 1, 1, 1, 1) for node in gdef.node})
     .set_bandwidth(10000, 100)
     .evaluate({
         node.name: [0 if node.op in noop else 1000] * len(devices)
         for node in gdef.node
     }))
print(g)
# NOTE(review): chunk starts mid-call — the keyword arguments below belong
# to a server/strategy constructor above this view; indentation reconstructed.
                     protocol="grpc", config=config)
# devices = ("GPU:0", "GPU:1")
opt = model_fn()
init = tf.global_variables_initializer()
gdef = tf.get_default_graph().as_graph_def(add_shapes=True)
import tge
strategy = {node.name: [1, 2, 2] for node in gdef.node}
g = (
    tge.TGE(gdef, devices).custom(strategy)
    # .use_nccl()
    .replace_placeholder(48).use_collective()
    # .verbose()
    .compile().get_result())
tf.reset_default_graph()
tf.import_graph_def(g)
graph = tf.get_default_graph()
# x = graph.get_tensor_by_name("import/Placeholder/replica_0:0")
# y = graph.get_tensor_by_name("import/Placeholder_1/replica_0:0")
opt = graph.get_operation_by_name("import/GradientDescent/replica_0")
init = graph.get_operation_by_name("import/init/replica_0")
# data = { x: np.random.uniform(size=(24, 224, 224, 3)), y: np.random.uniform(size=(24, 10)) }
sess = tf.Session(server.target, config=config)
opt = model_fn()
init = tf.global_variables_initializer()
gdef = tf.get_default_graph().as_graph_def(add_shapes=True)
# Persist the unmodified graph for later inspection.
with open("model.pb", "w") as fo:
    fo.write(pbtf.MessageToString(gdef))
import tge
# Identical decision vector for every node.
strategy = {
    node.name: [1, 1, 1, 1, 1]
    for node in gdef.node
}
g = (tge.TGE(gdef, devices)
     .custom(strategy)
     # .replace_placeholder(BATCHSIZE)
     .use_collective()
     # .verbose()
     .compile()
     .get_result()
     )
with open("modified.pb", "w") as fo:
    fo.write(pbtf.MessageToString(g))
tf.reset_default_graph()
# Multi-worker collective setup via TF_CONFIG; device filters are cleared
# so the session can see all workers.
resolver = TFConfigClusterResolver()
cluster = resolver.cluster_spec()
dist = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.NCCL)
config = dist.update_config_proto(tf.ConfigProto())
config.ClearField("device_filters")
# "/job:tge/replica:0/task:1/device:GPU:0" # ) # server = tf.distribute.Server(tf.train.ClusterSpec({ # "tge": ["127.0.0.1:3901", "127.0.0.1:3902"] # }), job_name='tge', task_index=0, protocol="grpc") devices = ("GPU:0", "GPU:1") import tge options = [[1, 1], [2, 2]] strategy = { node.name: [1] + options[np.random.randint(0, len(options))] for node in gdef.node } g = (tge.TGE(gdef, devices) .custom(strategy) # .replace_placeholder(64) .use_nccl() # .verbose() .compile() .get_result() ) tf.reset_default_graph() tf.import_graph_def(g) graph = tf.get_default_graph() x = graph.get_tensor_by_name("import/Placeholder/replica_0:0") y = graph.get_tensor_by_name("import/Placeholder_1/replica_0:0") opt = graph.get_operation_by_name("import/GradientDescent/replica_0") init = graph.get_operation_by_name("import/init/replica_0") data = { x: np.random.uniform(size=(24, 224, 224, 3)), y: np.random.uniform(size=(24, 10)) }
from utils import write_tensorboard, setup_workers
opt = model_fn()
init = tf.global_variables_initializer()
gdef = tf.get_default_graph().as_graph_def(
)  # add_shapes=True? then we must keep tracking shapes ourselves
# NOTE(review): `bytes` shadows the builtin of the same name.
bytes = gdef.SerializeToString()
devices = ("/job:tge/replica:0/task:0/device:GPU:0",
           "/job:tge/replica:0/task:0/device:GPU:1",
           "/job:tge/replica:0/task:1/device:GPU:0",
           "/job:tge/replica:0/task:1/device:GPU:1")
import tge
# Data-parallel transform with parameter-server placement ('ps0').
g = (tge.TGE().set_graph_def(gdef).set_devices(devices).data_parallel(
    'ps0').compile().get_graph_def())
tf.reset_default_graph()
tf.import_graph_def(g)
graph = tf.get_default_graph()
x = graph.get_tensor_by_name("import/Placeholder:0")
y = graph.get_tensor_by_name("import/Placeholder_1:0")
opt = graph.get_operation_by_name("import/GradientDescent")
init = graph.get_operation_by_name("import/init")
# currently a hack. Later we will add an API for user to get tensor references back
acc = 10 * (graph.get_tensor_by_name("import/Mean/replica_0:0") +
            graph.get_tensor_by_name("import/Mean/replica_1:0")) / 2
write_tensorboard(opt.graph)
import os
# Hide all GPUs so the script runs CPU-only.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import numpy as np
import tensorflow as tf
opt = model_fn()
init = tf.global_variables_initializer()
gdef = tf.get_default_graph().as_graph_def(add_shapes=True)
# Prune the graph to the subgraph the training op depends on.
gdef = tf.graph_util.extract_sub_graph(gdef, [opt.node_def.name])
devices = ("/job:tge/replica:0/task:0/device:GPU:0",
           "/job:tge/replica:0/task:0/device:GPU:1",
           "/job:tge/replica:0/task:1/device:GPU:0",
           "/job:tge/replica:0/task:1/device:GPU:1")
import tge
# with open("../../xiaodong/tge/GAT/data/graph/docs.txt", "r") as f:
#     records = (x.strip().split(" ") for x in f.readlines())
#     prof = {items[0]: [int(float(x)) for x in items[1:]] for items in records}
# Evaluate with a flat synthetic cost of 200 per node per device and dump
# a Chrome trace.
g = (tge.TGE(gdef, devices).custom({
    node.name: [0, 2, 1, 0, 1]
    for node in gdef.node
}).evaluate({node.name: [200] * len(devices)
             for node in gdef.node}, "trace.json"))
print(g)
workers = ["10.28.1.26:3901", "10.28.1.25:3901"]
server = setup_workers(workers, "grpc+verbs")
import tge
from profiler import profiler_factory
tic1 = time.perf_counter()
# Random strategy + random per-device cost profile; measures evaluator time.
g = (
    tge.TGE(gdef, devices)
    # .data_parallel('ring')
    .custom({
        node.name: [np.random.randint(0, 2)] +
        [np.random.randint(0, 2) for _ in devices]
        for node in gdef.node
    })
    # .destructify_names()
    # .compile()
    # .get_result()
    .set_bandwidth(100000, 1000).evaluate({
        node.name: [np.random.randint(0, 1000)] * len(devices)
        for node in gdef.node
    }))
print(g)
toc1 = time.perf_counter()
# Everything below this point is intentionally dead code for now.
raise SystemExit
tf.reset_default_graph()
tf.import_graph_def(g)
graph = tf.get_default_graph()
write_tensorboard(graph)
init = tf.global_variables_initializer()
gdef = tf.get_default_graph().as_graph_def(add_shapes=True)
# Persist the unmodified graph for later inspection.
with open("model.pb", "w") as fo:
    fo.write(pbtf.MessageToString(gdef))
import tge
# options = [[0, 1], [1, 0], [0, 2], [2, 0], [1, 1]]
# strategy = { node.name: [np.random.randint(0, 2)] + options[np.random.randint(0, len(options))] for node in gdef.node }
strategy = {node.name: [0, 1, 1, 1, 1, 1, 1] for node in gdef.node}
g = (
    tge.TGE(gdef, devices).custom(strategy).replace_placeholder(
        BATCHSIZE).use_collective()
    # .verbose()
    .compile().get_result())
with open("modified.pb", "w") as fo:
    fo.write(pbtf.MessageToString(g))
tf.reset_default_graph()
# Multi-worker collective setup via TF_CONFIG; device filters are cleared
# so the session can see all workers.
resolver = TFConfigClusterResolver()
cluster = resolver.cluster_spec()
dist = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.NCCL)
config = dist.update_config_proto(tf.ConfigProto())
config.ClearField("device_filters")
tf.import_graph_def(g)
graph = tf.get_default_graph()
def build_model(self):
    """Build the training graph, transform it with TGE, re-import it with
    the slim imagenet input pipeline, and run the training loop.

    NOTE(review): the original indentation of this chunk was lost; the body
    below is reconstructed entirely inside the method — confirm against the
    original file.
    """
    tf.reset_default_graph()
    self.losses = []
    self.vars = []
    self.avg_gradient = []
    self.apply_grad = []
    self.instances = []
    self.gradients = []

    class setter():
        # Device chooser: returns the device mapped from the first
        # assignment key found in the current variable scope, falling back
        # to the most recently chosen device.
        def __init__(self, assignment, devices):
            self.assignment = assignment
            self.last_device = devices[0]

        def choose(self, op):
            scope = tf.get_variable_scope().name
            for key in self.assignment:
                if key in scope:
                    self.last_device = self.assignment[key]
                    return self.assignment[key]
            #print(self.assignment)
            print(scope, op.name, self.last_device)
            return self.last_device

    def device_setter(assignment, devices):
        # Wrap a setter instance as a callable usable with tf.device(...).
        _setter = setter(assignment, devices)
        return _setter.choose

    losses = []
    outputs = []
    tf.get_variable_scope()._reuse = tf.AUTO_REUSE
    for i in range(1):
        loss, output, scopes = self.model_fn(None, self.model_name)
        losses.append(loss)
        outputs.append(output[-1])
        self.scopes = scopes
    new_loss = tf.add_n(losses)
    new_loss = tf.reduce_mean(new_loss, name="final_loss")
    #self.train_op = tf.train.AdamOptimizer(learning_rate=0.2, beta1=0.9, beta2=0.98, epsilon=1e-9).minimize(new_loss)
    self.train_op = tf.train.GradientDescentOptimizer(
        learning_rate=0.01).minimize(new_loss,
                                     colocate_gradients_with_ops=True)
    init = tf.global_variables_initializer()
    g = tf.get_default_graph().as_graph_def(add_shapes=True)
    import tge
    # Identical decision vector for every node.
    strategy = {node.name: [1, 1, 1, 1, 1] for node in g.node}
    g = (
        tge.TGE(g, devices).custom(strategy)
        # .replace_placeholder(BATCHSIZE)
        .use_collective()
        # .verbose()
        .compile().get_result())
    with open("vgg_tge_modified.pbtxt", "w") as fo:
        fo.write(pbtf.MessageToString(g))
    tf.reset_default_graph()
    # Round-trip the transformed graph through its text dump.
    gdef = graph_pb2.GraphDef()
    with open("vgg_tge_modified.pbtxt", "r") as f:
        txt = f.read()
        pbtf.Parse(txt, gdef)
    tf.import_graph_def(gdef)
    graph = tf.get_default_graph()
    dataset = dataset_factory.get_dataset("imagenet", "train",
                                          "/data/slim_imagenet")
    preprocessing_name = "vgg_19"
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name, is_training=True)
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        num_readers=4,
        common_queue_capacity=20 * batch_size,
        common_queue_min=10 * batch_size)
    [image, label] = provider.get(['image', 'label'])
    train_image_size = 224
    image = image_preprocessing_fn(image, train_image_size, train_image_size)
    print("image shape:", image.shape)
    print("label shape:", label.shape)
    images, labels = tf.train.batch([image, label],
                                    batch_size=batch_size,
                                    num_threads=4,
                                    capacity=5 * batch_size)
    labels = slim.one_hot_encoding(labels, dataset.num_classes)
    batch_queue = slim.prefetch_queue.prefetch_queue(
        [images, labels], capacity=2 * micro_batch_num)
    x_tensor = graph.get_tensor_by_name("import/Placeholder/replica_0:0")
    y_tensor = graph.get_tensor_by_name("import/Placeholder_1/replica_0:0")
    x, y = batch_queue.dequeue()
    # Splice the input-pipeline tensors into the imported graph in place of
    # its placeholders.
    replace_input(graph, x, x_tensor.name)
    replace_input(graph, y, y_tensor.name)
    opt = graph.get_operation_by_name("import/GradientDescent/replica_0")
    loss = tf.reduce_mean(tf.add_n(get_tensors(graph, "final_loss")))
    init = graph.get_operation_by_name("import/init/replica_0")
    config = tf.ConfigProto()
    config.allow_soft_placement = True
    sess = tf.Session(config=config)
    sess.run(init)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for i in range(10000000):
        _, cal_loss = sess.run([opt, loss])
        if i % 10 == 0:
            print("Step:{},Loss:{}".format(i, cal_loss))
# NOTE(review): chunk starts mid-function — `j`, `p`, `dec`, `options`,
# `gdef`, `devices`, `prof`, `best` and `ngiven` are defined above this view.
assert j == 1
while True:
    # test current
    # Decode the per-node decision: p[i][0] selects the decoding mode,
    # p[i][1] indexes into the shared decision vector `dec`.
    d = {}
    for i, node in enumerate(gdef.node):
        if p[i][0] == 0:
            d[node.name] = p[i][1]
        elif p[i][0] == 1:
            d[node.name] = [dec[p[i][1]], 1, 1]
        else:
            d[node.name] = [0, *options[dec[p[i][1]]]]
    t = (tge.TGE(deepcopy(gdef), devices).custom(d).set_bandwidth(
        2000, 10000).evaluate(prof))[0]
    if t < best:
        # Persist the new best decision vector.
        with open("best_{}.txt".format(sys.argv[1]), "w") as f:
            print(t, file=f)
            for i, x in enumerate(dec):
                if i < 2:
                    print(x, file=f)
                else:
                    print(options[x], file=f)
        best = t
        print("new best: {}".format(t))
    # next decision
    # NOTE(review): chunk is truncated here — this loop's body continues
    # past this view.
    for i in range(ngiven, len(dec)):