Пример #1
0
    def get_null_reward(self,
                        strategy,
                        index_id_dict,
                        trace="",
                        record_name=None,
                        direct=False):
        """Simulate the null graph under *strategy* and return its cost.

        Args:
            strategy: list of per-node decisions (indexed via
                ``index_id_dict``) or, when ``direct`` is True, an
                already name-keyed dict of decisions.
            index_id_dict: maps integer indices to node names.
            trace: optional path for the simulator's trace output.
            record_name: when set, the compiled graph def is written to
                ``self.folder/record_name``.
            direct: skip the index->name conversion when True.

        Returns:
            np.float32 simulated time (raw time / 1e3), multiplied by 10
            when any device would exceed its memory budget.
        """
        name_list = [nodedef.name for nodedef in self.null_gdef.node]
        if not direct:
            # Translate index-keyed decisions into name-keyed ones; nodes
            # without an explicit decision fall back to the first value.
            strategy = {
                index_id_dict[index]: strategy_int
                for index, strategy_int in enumerate(strategy)
            }
            strategy = {
                name: strategy.get(name,
                                   list(strategy.values())[0])
                for name in name_list
            }
        bandwidth = config_dict.get("bandwidth", None)
        if bandwidth is None:  # PEP 8: identity test, not "== None"
            # Simulator defaults when no bandwidth is configured
            # (units follow TGE's set_bandwidth convention).
            intra = "5000"
            inter = "1250"
        else:
            intra = bandwidth[0]
            inter = bandwidth[1]
        time_mem_tuple = tge.TGE(
            copy.deepcopy(self.null_gdef), self.devices,
            sink).fill_batchsize(self.global_batch_size).set_nccl_model(
                self.nccl_model).use_collective().custom(
                    strategy).set_bandwidth(intra, inter).evaluate(
                        self.name_cost_dict, trace)
        # evaluate() returns (time, per-device memory list).
        time = time_mem_tuple[0]
        mem_list = time_mem_tuple[1]
        print(mem_list)
        time = float(time) / (10**3)

        # Penalize out-of-memory placements instead of rejecting them.
        if any(np.array(mem_list) > np.array(device_mems)):
            time = time * 10
            print("oom")
        self.strategy_reward_dict[str(strategy)] = time

        if record_name:
            record_graph_def = tge.TGE(
                copy.deepcopy(self.null_gdef), self.devices,
                sink).custom(strategy).replace_placeholder(
                    self.global_batch_size).use_collective().compile(
                    ).get_result()
            with open(self.folder + "/" + record_name, "w") as f:
                f.write(pbtf.MessageToString(record_graph_def))
        return np.float32(time)
Пример #2
0
 def get_reward(self, strategy, index_id_dict):
     """Return the simulated step time for *strategy*, with memoization.

     Args:
         strategy: list of per-node decisions indexed via ``index_id_dict``.
         index_id_dict: maps integer indices to node names.

     Returns:
         np.float32 simulated time, x10 when any device would run out of
         memory. Results are cached in ``self.strategy_reward_dict``.
     """
     key = str(strategy)
     # Membership test instead of a truthiness check on .get(): the old
     # form re-simulated whenever a cached time happened to be 0, and
     # looked the key up twice.
     if key in self.strategy_reward_dict:
         time = self.strategy_reward_dict[key]
     else:
         bandwidth = config_dict.get("bandwidth", None)
         if bandwidth is None:  # PEP 8: identity test, not "== None"
             intra = "5000"
             inter = "1250"
         else:
             intra = bandwidth[0]
             inter = bandwidth[1]
         time_mem_tuple = tge.TGE(copy.deepcopy(
             self.gdef), self.devices, sink).custom({
                 index_id_dict[index]: strategy_int
                 for index, strategy_int in enumerate(strategy)
             }).set_bandwidth(intra, inter).evaluate(
                 self.name_cost_dict,
                 self.folder + "/modified_strategy.json")
         # evaluate() returns (time, per-device memory list).
         time = time_mem_tuple[0]
         mem_list = time_mem_tuple[1]
         time = float(time) / (10**3)
         if any(np.array(mem_list) > np.array(device_mems)):
             time = time * 10
         self.strategy_reward_dict[key] = time
     return np.float32(time)
Пример #3
0
    def __init__(self, gdef_path, null_gdef_path, devices, folder):
        """Load the two graph defs, the NCCL model, and set up bookkeeping.

        Args:
            gdef_path: path to the pbtxt GraphDef of the full graph.
            null_gdef_path: path to the pbtxt GraphDef of the null graph;
                its file name selects the global batch size.
            devices: iterable of device name strings.
            folder: working directory for strategy/record outputs.
        """
        self.gdef = graph_pb2.GraphDef()
        with open(gdef_path, "r") as fh:
            pbtf.Parse(fh.read(), self.gdef)

        self.null_gdef = graph_pb2.GraphDef()
        with open(null_gdef_path, "r") as fh:
            pbtf.Parse(fh.read(), self.null_gdef)

        self.folder = folder
        self.strategy_reward_dict = dict()  # str(strategy) -> simulated time
        self.name_cost_dict = self.get_name_cost_dict()
        self.devices = devices
        self._tge = tge.TGE(self.gdef, devices)
        # Per-model batch size chosen from the graph file name
        # (presumably graph7/graph8 are specific benchmark models — confirm).
        if "graph7" in null_gdef_path:
            self.global_batch_size = batch_sizes[1]
        elif "graph8" in null_gdef_path:
            self.global_batch_size = batch_sizes[2]
        else:
            self.global_batch_size = batch_sizes[0]
        with open("nccl_model.pkl", "rb") as fh:
            self.nccl_model = pkl.load(fh)
Пример #4
0
    def __init__(self, gdef_path, devices, folder):
        """Load the GraphDef from *gdef_path* and set up bookkeeping.

        Args:
            gdef_path: path to a pbtxt-serialized GraphDef.
            devices: iterable of device name strings.
            folder: working directory for strategy outputs.
        """
        self.gdef = graph_pb2.GraphDef()
        with open(gdef_path, "r") as fh:
            pbtf.Parse(fh.read(), self.gdef)
        self.folder = folder
        self.strategy_reward_dict = dict()  # str(strategy) -> simulated time
        self.name_cost_dict = self.get_name_cost_dict()
        self.devices = devices
        self._tge = tge.TGE(self.gdef, devices)
Пример #5
0
    def change_model(self, index, config):
        """Build a per-op placement strategy from *config*, persist it, and
        write the TGE-modified graph for run *index*.

        Args:
            index: run identifier, used as the output sub-directory name.
            config: flat list alternating (range, decision) pairs:
                ``config[2*i]`` is an inclusive (start, end) index range
                into ``self.scopes``; ``config[2*i + 1]`` is the device
                decision applied to every scope in that range.
        """
        # Snapshot the unmodified default graph for reference.
        with open(self.model_name + "/" + str(index) + "/init_graph.pbtxt",
                  "w") as f:
            f.write(str(tf.get_default_graph().as_graph_def(add_shapes=True)))

        strategy = {}
        assignment = {}
        for i in range(len(config) // 2):
            indexs = config[i * 2]
            _strategy = config[i * 2 + 1]
            for j in range(indexs[0], indexs[1] + 1, 1):
                assignment[self.scopes[j]] = _strategy
        op_scope_dict = self.compute_operation_scope_dict()
        for op in op_scope_dict:
            # One-hot placement vector over devices; the leading 1 is the
            # TGE strategy tag (meaning per tge's encoding — confirm there).
            place = [0] * len(self.devices)
            decision = assignment[op_scope_dict[op]]
            for i in decision:
                place[i] = 1
            strategy[op] = [1] + place
        for op in self.graph.get_operations():
            if op.name not in strategy:
                # NOTE(review): ops with no scope mapping reuse the *last*
                # computed `place`; this raises NameError if op_scope_dict
                # was empty — confirm the fallback is intended.
                print(op.name)
                strategy[op.name] = [1] + place
        import pickle as pkl
        with open(self.model_name + "/" + str(index) + "/strategy.pkl",
                  "wb") as f:
            pkl.dump(strategy, f)
        import tge

        g = (
            tge.TGE(self.gdef, self.devices).custom(strategy)
            # .replace_placeholder(self.batch_size)
            .use_collective().compile().get_result())
        with open(self.model_name + "/" + str(index) + "/modified.pbtxt",
                  "w") as fo:
            fo.write(pbtf.MessageToString(g))
Пример #6
0
 def directly_get_reward(self, strategy_dict):
     """Simulate the graph under a name-keyed *strategy_dict*.

     Args:
         strategy_dict: node-name -> decision mapping, passed straight to
             TGE's custom() without index translation.

     Returns:
         np.float32 simulated time (raw time / 1e6), x10000 when any
         device would exceed its memory budget.
     """
     bandwidth = config_dict.get("bandwidth", None)
     if bandwidth is None:  # PEP 8: identity test, not "== None"
         intra = "5000"
         inter = "1250"
     else:
         intra = bandwidth[0]
         inter = bandwidth[1]
     # NOTE(review): "best_stratey.json" looks like a typo for
     # "best_strategy.json", but it is a runtime path other tools may
     # read — left unchanged.
     time_mem_tuple = tge.TGE(copy.deepcopy(self.gdef), self.devices,
                              sink).custom(strategy_dict).set_bandwidth(
                                  intra, inter).evaluate(
                                      self.name_cost_dict,
                                      self.folder + "/best_stratey.json")
     # evaluate() returns (time, per-device memory list).
     time = time_mem_tuple[0]
     mem_list = time_mem_tuple[1]
     time = float(time) / (10**6)
     if any(np.array(mem_list) > np.array(device_mems)):
         time = time * 10000
     return np.float32(time)
Пример #7
0
 def get_reward(self, strategy, index_id_dict, trace=""):
     """Simulate the graph under *strategy* and return the step time.

     Args:
         strategy: list of per-node decisions indexed via ``index_id_dict``.
         index_id_dict: maps integer indices to node names.
         trace: optional path for the simulator's trace output.

     Returns:
         np.float32 simulated time (raw time / 1e3), x10 on OOM. Also
         records the result in ``self.strategy_reward_dict``.
     """
     bandwidth = config_dict.get("bandwidth", None)
     if bandwidth is None:  # PEP 8: identity test, not "== None"
         intra = "5000"
         inter = "1250"
     else:
         intra = bandwidth[0]
         inter = bandwidth[1]
     time_mem_tuple = tge.TGE(
         copy.deepcopy(self.gdef), self.devices,
         sink).set_nccl_model(self.nccl_model).use_collective().custom({
             index_id_dict[index]: strategy_int
             for index, strategy_int in enumerate(strategy)
         }).set_bandwidth(intra, inter).evaluate(self.name_cost_dict, trace)
     # evaluate() returns (time, per-device memory list).
     time = time_mem_tuple[0]
     mem_list = time_mem_tuple[1]
     time = float(time) / (10**3)
     # Penalize out-of-memory placements instead of rejecting them.
     if any(np.array(mem_list) > np.array(device_mems)):
         time = time * 10
         print("oom")
     self.strategy_reward_dict[str(strategy)] = time
     return np.float32(time)
# Batch size fed into the input placeholder below.
BATCHSIZE = 110

opt = model_fn(None)
init = tf.global_variables_initializer()
# Export the default graph, keeping tensor shapes for TGE.
gdef = tf.get_default_graph().as_graph_def(add_shapes=True)

devices = ("/GPU:0", )

import tge

# options = [[0, 1], [1, 0], [0, 2], [2, 0], [1, 1]]
# strategy = { node.name: [np.random.randint(0, 2)] + options[np.random.randint(0, len(options))] for node in gdef.node }

# Same decision [0, 2] for every node — meaning depends on TGE's strategy
# encoding; confirm against the tge library.
strategy = {node.name: [0, 2] for node in gdef.node}

g = (tge.TGE(gdef, devices).custom(strategy).compile().get_result())

# Persist the transformed graph as protobuf text.
with open("modified.pb", "w") as fo:
    fo.write(pbtf.MessageToString(g))

tf.reset_default_graph()
tf.import_graph_def(g)
graph = tf.get_default_graph()

# TGE renames nodes with a per-replica suffix; fetch the replica-0 handles.
x = graph.get_tensor_by_name("import/Placeholder/replica_0:0")
y = graph.get_tensor_by_name("import/Placeholder_1/replica_0:0")
opt = graph.get_operation_by_name("import/GradientDescent/replica_0")
init = graph.get_operation_by_name("import/init/replica_0")

# NOTE(review): this feed dict is cut off in this chunk — presumably a
# `y:` entry and a session run follow.
data = {
    x: np.random.uniform(size=(BATCHSIZE, 224, 224, 3)),
Пример #9
0
    # Tail of a model_fn defined above this chunk: SGD on the summed loss.
    optimizer = tf.train.GradientDescentOptimizer(0.2).minimize(tf.reduce_sum(loss))
    return optimizer

import numpy as np
import tensorflow.compat.v1 as tf
import google.protobuf.text_format as pbtf

tf.disable_eager_execution()

opt = model_fn()
init = tf.global_variables_initializer()
# Export the default graph, keeping tensor shapes for TGE.
gdef = tf.get_default_graph().as_graph_def(add_shapes=True)

# Four logical GPUs across two tasks of the "tge" job.
devices = (
    "/job:tge/replica:0/task:0/device:GPU:0",
    "/job:tge/replica:0/task:0/device:GPU:1",
    "/job:tge/replica:0/task:1/device:GPU:0",
    "/job:tge/replica:0/task:1/device:GPU:1"
)

import tge

# Op types treated as free (zero cost) in the synthetic profile below.
noop = ('Placeholder', 'Const', 'Identity', 'NoOp', 'ReadVariableOp', 'VarHandleOp', 'Shape')

# Evaluate a replicate-everywhere strategy with a uniform per-device cost
# of 1000 for every non-trivial op.
g = (tge.TGE(gdef, devices)
    .custom({ node.name: (0, 1, 1, 1, 1) for node in gdef.node })
    .set_bandwidth(10000, 100)
    .evaluate({ node.name: [0 if node.op in noop else 1000] * len(devices) for node in gdef.node })
)
print(g)
Пример #10
0
                              protocol="grpc",
                              config=config)
# devices = ("GPU:0", "GPU:1")

opt = model_fn()
init = tf.global_variables_initializer()
# Export the default graph, keeping tensor shapes for TGE.
gdef = tf.get_default_graph().as_graph_def(add_shapes=True)

import tge

# Same decision [1, 2, 2] for every node — meaning depends on TGE's
# strategy encoding; confirm against the tge library.
strategy = {node.name: [1, 2, 2] for node in gdef.node}

# Replace the placeholder with batch size 48 and compile with
# collective communication.
g = (
    tge.TGE(gdef, devices).custom(strategy)
    # .use_nccl()
    .replace_placeholder(48).use_collective()
    # .verbose()
    .compile().get_result())

tf.reset_default_graph()
tf.import_graph_def(g)
graph = tf.get_default_graph()

# x = graph.get_tensor_by_name("import/Placeholder/replica_0:0")
# y = graph.get_tensor_by_name("import/Placeholder_1/replica_0:0")
# TGE renames nodes with a per-replica suffix.
opt = graph.get_operation_by_name("import/GradientDescent/replica_0")
init = graph.get_operation_by_name("import/init/replica_0")

# data = { x: np.random.uniform(size=(24, 224, 224, 3)), y: np.random.uniform(size=(24, 10)) }

# Connect to the in-process server created above this chunk.
sess = tf.Session(server.target, config=config)
Пример #11
0
opt = model_fn()
init = tf.global_variables_initializer()
# Export the default graph, keeping tensor shapes for TGE.
gdef = tf.get_default_graph().as_graph_def(add_shapes=True)

# Persist the original graph as protobuf text before modification.
with open("model.pb", "w") as fo:
    fo.write(pbtf.MessageToString(gdef))

import tge

# Same decision [1, 1, 1, 1, 1] for every node — meaning depends on TGE's
# strategy encoding; confirm against the tge library.
strategy = { node.name: [1, 1, 1, 1, 1] for node in gdef.node }

g = (tge.TGE(gdef, devices)
    .custom(strategy)
    # .replace_placeholder(BATCHSIZE)
    .use_collective()
    # .verbose()
    .compile()
    .get_result()
)

with open("modified.pb", "w") as fo:
    fo.write(pbtf.MessageToString(g))

tf.reset_default_graph()
# Configure multi-worker NCCL collectives from TF_CONFIG.
resolver = TFConfigClusterResolver()
cluster = resolver.cluster_spec()
dist = tf.distribute.experimental.MultiWorkerMirroredStrategy(
        tf.distribute.experimental.CollectiveCommunication.NCCL)
config = dist.update_config_proto(tf.ConfigProto())
# Allow workers to see each other's devices.
config.ClearField("device_filters")
Пример #12
0
#     "/job:tge/replica:0/task:1/device:GPU:0"
# )
# server = tf.distribute.Server(tf.train.ClusterSpec({
#     "tge": ["127.0.0.1:3901", "127.0.0.1:3902"]
# }), job_name='tge', task_index=0, protocol="grpc")
devices = ("GPU:0", "GPU:1")

import tge

# Each node gets [1] plus a randomly chosen option — meaning depends on
# TGE's strategy encoding; confirm against the tge library.
options = [[1, 1], [2, 2]]
strategy = { node.name: [1] + options[np.random.randint(0, len(options))] for node in gdef.node }

g = (tge.TGE(gdef, devices)
    .custom(strategy)
    # .replace_placeholder(64)
    .use_nccl()
    # .verbose()
    .compile()
    .get_result()
)

tf.reset_default_graph()
tf.import_graph_def(g)
graph = tf.get_default_graph()

# TGE renames nodes with a per-replica suffix; fetch replica-0 handles.
x = graph.get_tensor_by_name("import/Placeholder/replica_0:0")
y = graph.get_tensor_by_name("import/Placeholder_1/replica_0:0")
opt = graph.get_operation_by_name("import/GradientDescent/replica_0")
init = graph.get_operation_by_name("import/init/replica_0")

data = { x: np.random.uniform(size=(24, 224, 224, 3)), y: np.random.uniform(size=(24, 10)) }
Пример #13
0
from utils import write_tensorboard, setup_workers

opt = model_fn()
init = tf.global_variables_initializer()
gdef = tf.get_default_graph().as_graph_def(
)  # add_shapes=True? then we must keep tracking shapes ourselves
bytes = gdef.SerializeToString()

# Four logical GPUs across two tasks of the "tge" job.
devices = ("/job:tge/replica:0/task:0/device:GPU:0",
           "/job:tge/replica:0/task:0/device:GPU:1",
           "/job:tge/replica:0/task:1/device:GPU:0",
           "/job:tge/replica:0/task:1/device:GPU:1")

import tge
# Older TGE builder API: data-parallel over devices with 'ps0' as the
# parameter server — confirm against the tge library.
g = (tge.TGE().set_graph_def(gdef).set_devices(devices).data_parallel(
    'ps0').compile().get_graph_def())

tf.reset_default_graph()
tf.import_graph_def(g)
graph = tf.get_default_graph()

x = graph.get_tensor_by_name("import/Placeholder:0")
y = graph.get_tensor_by_name("import/Placeholder_1:0")
opt = graph.get_operation_by_name("import/GradientDescent")
init = graph.get_operation_by_name("import/init")
# currently a hack. Later we will add an API for user to get tensor references back
# Average the per-replica means (the 10x scaling is inherited as-is —
# presumably undoing a scaling in model_fn; confirm).
acc = 10 * (graph.get_tensor_by_name("import/Mean/replica_0:0") +
            graph.get_tensor_by_name("import/Mean/replica_1:0")) / 2

write_tensorboard(opt.graph)
Пример #14
0

import os
# Force CPU-only execution; this script only simulates, it never runs ops.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import numpy as np
import tensorflow as tf

opt = model_fn()
init = tf.global_variables_initializer()
gdef = tf.get_default_graph().as_graph_def(add_shapes=True)
# Prune the graph to the subgraph reachable from the optimizer op.
gdef = tf.graph_util.extract_sub_graph(gdef, [opt.node_def.name])

# Four logical GPUs across two tasks of the "tge" job.
devices = ("/job:tge/replica:0/task:0/device:GPU:0",
           "/job:tge/replica:0/task:0/device:GPU:1",
           "/job:tge/replica:0/task:1/device:GPU:0",
           "/job:tge/replica:0/task:1/device:GPU:1")

import tge

# with open("../../xiaodong/tge/GAT/data/graph/docs.txt", "r") as f:
#     records = (x.strip().split(" ") for x in f.readlines())
#     prof = {items[0]: [int(float(x)) for x in items[1:]] for items in records}

# Evaluate a fixed strategy with a flat 200-unit cost per node per device,
# dumping a Chrome trace to trace.json.
g = (tge.TGE(gdef, devices).custom({
    node.name: [0, 2, 1, 0, 1]
    for node in gdef.node
}).evaluate({node.name: [200] * len(devices)
             for node in gdef.node}, "trace.json"))
print(g)
Пример #15
0
workers = ["10.28.1.26:3901", "10.28.1.25:3901"]
server = setup_workers(workers, "grpc+verbs")

import tge
from profiler import profiler_factory

# Time a single TGE evaluation of a random strategy with random costs.
tic1 = time.perf_counter()
g = (
    tge.TGE(gdef, devices)
    # .data_parallel('ring')
    .custom({
        node.name:
        [np.random.randint(0, 2)] + [np.random.randint(0, 2) for _ in devices]
        for node in gdef.node
    })
    # .destructify_names()
    # .compile()
    # .get_result()
    .set_bandwidth(100000, 1000).evaluate({
        node.name: [np.random.randint(0, 1000)] * len(devices)
        for node in gdef.node
    }))
print(g)
toc1 = time.perf_counter()

# Deliberate early exit: everything below is dead code kept for reference.
raise SystemExit

tf.reset_default_graph()
tf.import_graph_def(g)
graph = tf.get_default_graph()
write_tensorboard(graph)
Пример #16
0
init = tf.global_variables_initializer()
# Export the default graph, keeping tensor shapes for TGE.
gdef = tf.get_default_graph().as_graph_def(add_shapes=True)

# Persist the original graph as protobuf text before modification.
with open("model.pb", "w") as fo:
    fo.write(pbtf.MessageToString(gdef))

import tge

# options = [[0, 1], [1, 0], [0, 2], [2, 0], [1, 1]]
# strategy = { node.name: [np.random.randint(0, 2)] + options[np.random.randint(0, len(options))] for node in gdef.node }

# Same decision [0, 1, 1, 1, 1, 1, 1] for every node — meaning depends on
# TGE's strategy encoding; confirm against the tge library.
strategy = {node.name: [0, 1, 1, 1, 1, 1, 1] for node in gdef.node}

g = (
    tge.TGE(gdef, devices).custom(strategy).replace_placeholder(
        BATCHSIZE).use_collective()
    # .verbose()
    .compile().get_result())

with open("modified.pb", "w") as fo:
    fo.write(pbtf.MessageToString(g))

tf.reset_default_graph()
# Configure multi-worker NCCL collectives from TF_CONFIG.
resolver = TFConfigClusterResolver()
cluster = resolver.cluster_spec()
dist = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.NCCL)
config = dist.update_config_proto(tf.ConfigProto())
# Allow workers to see each other's devices.
config.ClearField("device_filters")
tf.import_graph_def(g)
graph = tf.get_default_graph()
Пример #17
0
    def build_model(self):
        """Build the model, rewrite it with TGE, reload it, wire the slim
        imagenet input pipeline into the imported graph, and train forever.

        Side effects: writes "vgg_tge_modified.pbtxt", resets the default
        graph twice, starts a TF session and queue runners, and loops over
        training steps without returning.
        """
        tf.reset_default_graph()
        self.losses = []
        self.vars = []
        self.avg_gradient = []
        self.apply_grad = []
        self.instances = []
        self.gradients = []

        # Device chooser that remembers the last match so ops outside any
        # assigned scope inherit their predecessor's device.
        class setter():
            def __init__(self, assignment, devices):
                self.assignment = assignment
                self.last_device = devices[0]

            def choose(self, op):
                scope = tf.get_variable_scope().name
                for key in self.assignment:
                    if key in scope:
                        self.last_device = self.assignment[key]
                        return self.assignment[key]
                # Fall back to the most recently used device.
                print(scope, op.name, self.last_device)
                return self.last_device

        def device_setter(assignment, devices):
            # Adapter: expose setter.choose as a tf device function.
            _setter = setter(assignment, devices)
            return _setter.choose

        losses = []
        outputs = []

        # Force variable reuse so repeated model_fn calls share weights.
        tf.get_variable_scope()._reuse = tf.AUTO_REUSE
        for i in range(1):
            loss, output, scopes = self.model_fn(None, self.model_name)
            losses.append(loss)
            outputs.append(output[-1])
        self.scopes = scopes
        new_loss = tf.add_n(losses)
        new_loss = tf.reduce_mean(new_loss, name="final_loss")
        #self.train_op = tf.train.AdamOptimizer(learning_rate=0.2, beta1=0.9, beta2=0.98, epsilon=1e-9).minimize(new_loss)
        self.train_op = tf.train.GradientDescentOptimizer(
            learning_rate=0.01).minimize(new_loss,
                                         colocate_gradients_with_ops=True)
        init = tf.global_variables_initializer()

        g = tf.get_default_graph().as_graph_def(add_shapes=True)
        import tge
        # Same decision [1, 1, 1, 1, 1] per node — meaning depends on
        # TGE's strategy encoding; confirm against the tge library.
        strategy = {node.name: [1, 1, 1, 1, 1] for node in g.node}

        g = (
            tge.TGE(g, devices).custom(strategy)
            # .replace_placeholder(BATCHSIZE)
            .use_collective()
            # .verbose()
            .compile().get_result())

        with open("vgg_tge_modified.pbtxt", "w") as fo:
            fo.write(pbtf.MessageToString(g))

        # Round-trip through the pbtxt file, then import the modified graph.
        tf.reset_default_graph()
        gdef = graph_pb2.GraphDef()
        with open("vgg_tge_modified.pbtxt", "r") as f:
            txt = f.read()
        pbtf.Parse(txt, gdef)

        tf.import_graph_def(gdef)
        graph = tf.get_default_graph()

        # slim-based imagenet input pipeline feeding the imported graph.
        dataset = dataset_factory.get_dataset("imagenet", "train",
                                              "/data/slim_imagenet")

        preprocessing_name = "vgg_19"
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=4,
            common_queue_capacity=20 * batch_size,
            common_queue_min=10 * batch_size)
        [image, label] = provider.get(['image', 'label'])

        train_image_size = 224

        image = image_preprocessing_fn(image, train_image_size,
                                       train_image_size)
        print("image shape:", image.shape)
        print("label shape:", label.shape)
        images, labels = tf.train.batch([image, label],
                                        batch_size=batch_size,
                                        num_threads=4,
                                        capacity=5 * batch_size)
        labels = slim.one_hot_encoding(labels, dataset.num_classes)
        batch_queue = slim.prefetch_queue.prefetch_queue([images, labels],
                                                         capacity=2 *
                                                         micro_batch_num)

        # Splice the live input tensors over the imported placeholders.
        x_tensor = graph.get_tensor_by_name("import/Placeholder/replica_0:0")
        y_tensor = graph.get_tensor_by_name("import/Placeholder_1/replica_0:0")
        x, y = batch_queue.dequeue()
        replace_input(graph, x, x_tensor.name)
        replace_input(graph, y, y_tensor.name)

        opt = graph.get_operation_by_name("import/GradientDescent/replica_0")
        loss = tf.reduce_mean(tf.add_n(get_tensors(graph, "final_loss")))
        init = graph.get_operation_by_name("import/init/replica_0")

        config = tf.ConfigProto()
        config.allow_soft_placement = True
        sess = tf.Session(config=config)
        sess.run(init)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        # Effectively an infinite training loop; logs loss every 10 steps.
        for i in range(10000000):
            _, cal_loss = sess.run([opt, loss])
            if i % 10 == 0:
                print("Step:{},Loss:{}".format(i, cal_loss))
Пример #18
0
# Iterative search over the decision vector `dec` (p, dec, options, gdef,
# devices, prof, best, ngiven come from earlier in this file — not visible
# in this chunk).
assert j == 1

while True:
    # test current
    # Decode the decision vector into a per-node TGE strategy dict; the
    # meaning of p[i][0] in {0, 1, 2} is set up above this chunk.
    d = {}
    for i, node in enumerate(gdef.node):
        if p[i][0] == 0:
            d[node.name] = p[i][1]
        elif p[i][0] == 1:
            d[node.name] = [dec[p[i][1]], 1, 1]
        else:
            d[node.name] = [0, *options[dec[p[i][1]]]]

    # Simulate; evaluate() returns (time, mem_list) — keep only the time.
    t = (tge.TGE(deepcopy(gdef),
                 devices).custom(d).set_bandwidth(2000,
                                                  10000).evaluate(prof))[0]

    if t < best:
        # Persist the best decision vector found so far.
        with open("best_{}.txt".format(sys.argv[1]), "w") as f:
            print(t, file=f)
            for i, x in enumerate(dec):
                if i < 2:
                    print(x, file=f)
                else:
                    print(options[x], file=f)
        best = t
        print("new best: {}".format(t))

    # next decision
    for i in range(ngiven, len(dec)):