예제 #1
0
def get_accu_and_loss(ps, args):
    """Track test accuracy/loss of the parameter server's master weights.

    About once per second the current master weights are fetched from
    ``ps`` and queued together with the elapsed time at which they were
    fetched. Once more than 5 seconds have elapsed, every pass also takes
    the oldest queued snapshot, loads it into a local ``SimpleCNN``,
    scores it on a 2000-sample MNIST test batch, prints the result and
    re-saves the whole ``(time, accuracy, loss)`` history with ``np.save``.

    Args:
        ps: Handle whose ``get_master_weight`` method is invoked through
            ``.remote()`` and resolved with ``ray.get``.
        args: Settings object; reads ``stop_time``, ``save_dir``,
            ``num_workers``, ``k``, ``round``, ``net_lrn`` and ``lrns``.

    Returns:
        None. Results are printed and written under ``args.save_dir``.
    """
    net = model.SimpleCNN(args)
    mnist = model.download_mnist_retry(seed=1111)
    started_at = time.time()
    history = []
    backlog = []

    elapsed = time.time() - started_at
    while elapsed < args.stop_time:
        # Queue the newest snapshot, tagged with the time it was taken.
        backlog.append((elapsed, ray.get(ps.get_master_weight.remote())))

        if elapsed > 5:
            # Evaluate the oldest snapshot still waiting in the queue.
            stamp, flat_weights = backlog.pop(0)
            net.set_flat(flat_weights)
            batch_x, batch_y = mnist.test.next_batch(2000)
            accu, loss = net.compute_accuracy_and_loss(batch_x, batch_y)

            print()
            print('master_time', stamp, 'accu:', accu, 'testing loss:', loss)
            print()

            history.append((stamp, accu, loss))
            # The full history is rewritten to the same file on every pass.
            out_name = args.save_dir + (
                'federated_num_worker%d, k_%d, round_%d, net_lrn_%.6f, FL_lrn_%6f'
                % (args.num_workers, args.k, args.round, args.net_lrn,
                   args.lrns[0]))
            np.save(out_name, np.array(history))

        time.sleep(1)
        elapsed = time.time() - started_at
def get_accu_and_loss(ps, args):
    """Track test accuracy/loss of the centralized weights held by ``ps``.

    Blocks until ``ps.get_begin`` reports a truthy value, then loops
    forever: fetch the current weights, queue them with the elapsed time,
    print the queue length and sleep one second. When the elapsed time
    measured at the top of the pass exceeds 5 seconds, the oldest queued
    snapshot is scored on a 10000-sample MNIST test batch, printed, and
    the accumulated ``(time, accuracy, loss)`` history is re-saved.

    Args:
        ps: Handle whose ``get_begin`` and ``get_weights`` methods are
            invoked through ``.remote()`` and resolved with ``ray.get``.
        args: Settings object; reads ``save_dir``, ``num_workers`` and
            ``round``.

    Note:
        The loop has no exit condition inside this function.
    """
    net = model.SimpleCNN(args)
    mnist = model.download_mnist_retry(seed=1111)

    history = []
    backlog = []

    # Poll until the parameter server signals that training has begun.
    while not ray.get(ps.get_begin.remote()):
        time.sleep(0.001)

    started_at = time.time()
    while True:
        elapsed = time.time() - started_at
        backlog.append((elapsed, ray.get(ps.get_weights.remote())))
        print('number of items in the cents', len(backlog))
        time.sleep(1)

        if elapsed <= 5:
            continue

        # Evaluate the oldest snapshot still waiting in the queue.
        snap_time, flat_weights = backlog.pop(0)
        net.set_flat(flat_weights)
        batch_x, batch_y = mnist.test.next_batch(10000)
        accu, loss = net.compute_accuracy_and_loss(batch_x, batch_y)

        print()
        print('centralized_time', snap_time, 'accu:', accu, 'loss:', loss)
        print()

        history.append((snap_time, accu, loss))
        out_name = args.save_dir + (
            'centralized_num_worker%d, round %d' %
            (args.num_workers, args.round))
        np.save(out_name, np.array(history))
예제 #3
0
def worker_task(ps, worker_index, batch_size=50):
    """Endlessly pull weights, compute an update on one batch, and push it.

    Args:
        ps: Handle whose ``pull`` and ``push`` methods are invoked through
            ``.remote()``.
        worker_index: Used as the seed passed to
            ``model.download_mnist_retry``.
        batch_size: Number of training samples requested per step.

    Note:
        The loop has no exit condition inside this function.
    """
    # Each worker loads MNIST with its own seed.
    dataset = model.download_mnist_retry(seed=worker_index)

    # Local model; the first element of get_weights() is used as the key
    # set for every pull/push.
    net = model.SimpleCNN()
    keys = net.get_weights()[0]

    while True:
        # Sync the local model with the parameter server.
        net.set_weights(keys, ray.get(ps.pull.remote(keys)))

        # Compute an update on one batch and send it back. The push result
        # is not waited on.
        batch_x, batch_y = dataset.train.next_batch(batch_size)
        update = net.compute_update(batch_x, batch_y)
        ps.push.remote(keys, update)
def get_accu_and_loss(ps, args):
    """Track test accuracy/loss of the mean of all workers' weights.

    Waits until the loss list reported by ``ps.get_loss`` contains no
    ``None``, then, about once per second until ``args.stop_time`` has
    elapsed, averages the weights of workers ``0..num_workers-1`` and
    queues that average with the elapsed time. Once more than 5 seconds
    have elapsed, every pass also scores the oldest queued average on a
    10000-sample MNIST test batch, prints the result and re-saves the
    ``(time, accuracy, loss)`` history with ``np.save``.

    Args:
        ps: Handle whose ``get_loss`` and ``get_weights_ids`` methods are
            invoked through ``.remote()`` and resolved with ``ray.get``.
        args: Settings object; reads ``stop_time``, ``num_workers``,
            ``save_dir``, ``k``, ``round``, ``net_lrn``, ``lrns`` and ``a``.

    Returns:
        None. Results are printed and written under ``args.save_dir``.
    """
    net = model.SimpleCNN(args)
    mnist = model.download_mnist_retry(seed=1111)

    # Hold off until every loss slot has been filled in.
    while None in ray.get(ps.get_loss.remote()):
        time.sleep(0.0001)
    print("begin")
    started_at = time.time()

    history = []
    backlog = []
    elapsed = time.time() - started_at
    while elapsed < args.stop_time:
        # The server hands out references; each one is resolved separately.
        weight_refs = ray.get(ps.get_weights_ids.remote())
        stacked = np.array(
            [ray.get(weight_refs[idx]) for idx in range(args.num_workers)])
        backlog.append((elapsed, np.mean(stacked, axis=0)))

        if elapsed > 5:
            # Evaluate the oldest averaged snapshot still in the queue.
            snap_time, center = backlog.pop(0)
            net.set_flat(center)
            batch_x, batch_y = mnist.test.next_batch(10000)
            accu, loss = net.compute_accuracy_and_loss(batch_x, batch_y)

            print()
            print('cent_time', snap_time, 'accu:', accu, 'testing loss:', loss)
            print()

            history.append((snap_time, accu, loss))
            out_name = args.save_dir + (
                'flocking_num_worker%d, k_%d, round_%d, net_lrn_%.6f, node0_lrn_%6f, attraction_%.4f_center_v1'
                % (args.num_workers, args.k, args.round, args.net_lrn,
                   args.lrns[0], args.a))
            np.save(out_name, np.array(history))

        time.sleep(1)
        elapsed = time.time() - started_at
예제 #5
0
def main(args):
    """Start a parameter server and workers, then report accuracy forever.

    Args:
        args: Settings object; only ``num_workers`` is read.

    Note:
        The reporting loop has no exit condition inside this function.
    """
    # Seed the parameter server with a freshly constructed model's weights.
    net = model.SimpleCNN()
    all_keys, all_values = net.get_weights()
    ps = ParameterServer.remote(all_keys, all_values)

    # Launch one training task per worker; the returned handles are kept
    # but never waited on.
    worker_tasks = []
    for worker_id in range(args.num_workers):
        worker_tasks.append(worker_task.remote(ps, worker_id))

    mnist = model.download_mnist_retry()

    iteration = 0
    while True:
        # Load the server's current weights and score them on a test batch.
        net.set_weights(all_keys, ray.get(ps.pull.remote(all_keys)))
        test_xs, test_ys = mnist.test.next_batch(1000)
        accuracy = net.compute_accuracy(test_xs, test_ys)
        print("Iteration {}: accuracy is {}".format(iteration, accuracy))
        iteration += 1
        time.sleep(1)
예제 #6
0
def worker_task(ps, current_worker_index, args):
    """Train against the master weights and push weight deltas to ``ps``.

    Each step waits until at least ``args.time_per_batch[worker]`` seconds
    have passed since the previous step began, pulls the master weights,
    runs one ``net.minimize`` call on a training batch and sends the
    difference between the resulting weights and the pulled weights to
    ``ps.set_master_weight``. The loop ends after ``args.steps`` steps or
    once ``args.stop_time`` seconds have elapsed, whichever comes first.

    Args:
        ps: Handle whose ``get_master_weight`` and ``set_master_weight``
            methods are invoked through ``.remote()``.
        current_worker_index: Index into the per-worker lists on ``args``
            (``lrns``, ``batch_sizes``, ``time_per_batch``); also offsets
            the MNIST seed.
        args: Settings object. ``args.lrn`` is overwritten with this
            worker's learning rate before the model is built.
    """
    idx = current_worker_index
    mnist = model.download_mnist_retry(seed=idx + 1)

    # The model is constructed from args, so the per-worker learning rate
    # is written onto args first.
    args.lrn = args.lrns[idx]
    net = model.SimpleCNN(args)

    if idx == 1:
        # Worker 1 draws and scores one extra batch up front; the result
        # is not read again in this function.
        warm_xs, warm_ys = mnist.train.next_batch(args.batch_sizes[idx])
        acc, warm_loss = net.compute_accuracy_and_loss(warm_xs, warm_ys)
        stored_losses = [warm_loss]

    step = 0
    start_time = time.time()
    pre_time = time.time()
    while step < args.steps and time.time() - start_time < args.stop_time:
        # Pace the loop: sleep off whatever remains of this worker's
        # per-batch time budget.
        budget = args.time_per_batch[idx]
        time.sleep(max(0, budget - (time.time() - pre_time)))
        pre_time = time.time()

        # Start the step from the server's current master weights.
        master = ray.get(ps.get_master_weight.remote())
        net.set_flat(master)

        # One optimisation step, then send only the change in weights.
        xs, ys = mnist.train.next_batch(args.batch_sizes[idx])
        loss_value, updated = net.minimize(xs, ys)
        delta = updated - master

        if idx != 0:
            time.sleep(0.1)
        if step % 50 == 0:
            print("step", step, "current_worker_index", idx,
                  "elapsed time is",
                  time.time() - start_time, "loss is", loss_value)

        ps.set_master_weight.remote(delta)
        step += 1
 def __init__(self, worker_index, args):
     """Set up one worker's dataset and model.

     Args:
         worker_index: Stored on the instance and used as the seed passed
             to ``model.download_mnist_retry``.
         args: Settings object; ``args.batch_size`` is stored on the
             instance and ``args`` itself is passed to ``model.SimpleCNN``.
     """
     self.worker_index = worker_index
     self.batch_size = args.batch_size
     # Each worker loads MNIST with its own seed.
     self.mnist = model.download_mnist_retry(seed=worker_index)
     self.net = model.SimpleCNN(args)
    ray.init()

    args.save_dir = './centralized_log_%.1f/' % args.sleep_mean
    os.makedirs(args.save_dir) if not os.path.exists(args.save_dir) else None
    # Create a parameter server.
    net = model.SimpleCNN(args)
    ps = ParameterServer.remote(args)

    # Create workers.
    workers = [
        Worker.remote(worker_index, args)
        for worker_index in range(args.num_workers)
    ]

    # Download MNIST.
    mnist = model.download_mnist_retry()

    i = 0
    current_weights = ps.get_weights.remote()
    get_accu_and_loss.remote(ps, args)
    start_t = time.time()
    while time.time() - start_t < args.stop_time:
        ray.wait([current_weights])
        if i == 1:
            start_t = time.time()
            ps.set_begin.remote()
        sleep_time = np.amax(
            np.random.exponential(args.sleep_mean, args.num_workers))
        time.sleep(sleep_time)
        gradients = [
            worker.compute_gradients.remote(current_weights)
def worker_task(ps, current_worker_index, args):
    """Run one flocking worker: train locally, then adjust by neighbours.

    After a warm-up ``net.minimize`` call the worker loads the weights
    registered for it on the parameter server, reports its loss and waits
    until no loss slot on the server is ``None``. Each training step then:

    1. sleeps off the rest of ``args.time_per_batch[worker]``,
    2. runs ``net.minimize`` on one batch and reports the loss,
    3. computes the flocking potential ``args.a * sum(w_self - w_peer)``
       over the peers in this worker's flocking group,
    4. subtracts ``args.lrn`` times that potential from the model's flat
       weights (for positive ``lrn`` and ``a`` this moves the weights
       toward the peers') and publishes the result via ``ray.put``.

    Whenever the next checkpoint time has passed, ``[elapsed_seconds,
    weights]`` is saved under ``args.save_dir + "saved_weight/"`` and the
    next checkpoint is scheduled using ``get_sleep_time``. The loop ends
    after ``args.steps`` steps or ``args.stop_time`` seconds.

    Args:
        ps: Handle whose ``get_weights_ids``, ``set_weights_ids``,
            ``get_loss``, ``set_loss`` and ``get_graph`` methods are
            invoked through ``.remote()``.
        current_worker_index: Index into the per-worker lists on ``args``
            and into the server's weight list and graph; also offsets the
            MNIST seed.
        args: Settings object. ``args.lrn`` is overwritten with this
            worker's learning rate before the model is built.

    Note:
        The checkpoint directory is created once before the training loop
        rather than on every step.
    """
    mnist = model.download_mnist_retry(seed=current_worker_index + 1)

    # The model is constructed from args, so the per-worker learning rate
    # is written onto args first.
    args.lrn = args.lrns[current_worker_index]
    net = model.SimpleCNN(args)
    xs, ys = mnist.train.next_batch(args.batch_size[current_worker_index])
    loss_value, _ = net.minimize(xs, ys)

    # Replace the warm-up weights with the ones the server holds for this
    # worker, then publish the warm-up loss to mark this worker as ready.
    all_weights_ids = ray.get(ps.get_weights_ids.remote())
    new_weights = ray.get(all_weights_ids[current_worker_index])
    net.set_flat(new_weights)
    ps.set_loss.remote(current_worker_index, loss_value)

    # Before training starts, wait until every loss value is set, which
    # means all workers are ready.
    while None in ray.get(ps.get_loss.remote()):
        time.sleep(0.0001)
    print("begin")

    flocking_group = ray.get(ps.get_graph.remote())[current_worker_index]

    def get_flocking_potential(weights):
        """Return ``args.a`` times the summed difference to each peer."""
        all_weights_ids = ray.get(ps.get_weights_ids.remote())
        flocking_dis = [
            weights - ray.get(all_weights_ids[fw]) for fw in flocking_group
        ]
        return np.sum(np.array(flocking_dis), axis=0) * args.a

    # Create the checkpoint directory once instead of on every step.
    weight_dir = args.save_dir + "saved_weight/"
    os.makedirs(weight_dir, exist_ok=True)

    step = 0
    start_time = time.time()
    pre_time = time.time()
    next_weight_save_time = start_time
    while step < args.steps and time.time() - start_time < args.stop_time:
        # Pace the loop: sleep off whatever remains of this worker's
        # per-batch time budget.
        time.sleep(
            max(
                0, args.time_per_batch[current_worker_index] -
                (time.time() - pre_time)))
        pre_time = time.time()
        xs, ys = mnist.train.next_batch(args.batch_size[current_worker_index])

        loss_value, new_weights = net.minimize(xs, ys)
        ps.set_loss.remote(current_worker_index, loss_value)

        # Flocking correction, applied on top of the weights the model
        # currently holds.
        f_p = get_flocking_potential(new_weights)
        new_weights = net.get_flat()
        new_weights -= args.lrn * f_p
        net.set_flat(new_weights)

        # The reference is wrapped in a list when it is handed to the
        # server -- presumably so Ray passes the reference itself rather
        # than resolving it to the array; confirm against the server code.
        weights_id = ray.put(new_weights)
        ps.set_weights_ids.remote(current_worker_index, [weights_id])
        step += 1

        if step % 100 == 1:
            print('step', step, 'current_worker_index', current_worker_index,
                  'elapsed_time',
                  time.time() - start_time, 'training loss is', loss_value)

        if time.time() > next_weight_save_time:
            # Use one elapsed reading so the stored timestamp and the one
            # in the filename agree.
            elapsed = time.time() - start_time
            # [scalar, array] is ragged, so dtype=object must be explicit;
            # NumPy >= 1.24 raises ValueError otherwise.
            saved_weight = np.array([elapsed, new_weights], dtype=object)
            np.save(
                weight_dir +
                'flocking_num_worker%d, k_%d, round_%d, net_lrn_%.6f, node0_lrn_%6f, attraction_%.4f_worker_%d_time_%.2f'
                % (args.num_workers, args.k, args.round,
                   args.net_lrn, args.lrns[0], args.a, current_worker_index,
                   elapsed), saved_weight)
            next_weight_save_time = time.time() + get_sleep_time(
                time.time() - start_time)
예제 #10
0
 def __init__(self, worker_index, batch_size=50, curritr=0):
     """Set up one worker's dataset, model and iteration counter.

     Args:
         worker_index: Stored on the instance and used as the seed passed
             to ``model.download_mnist_retry``.
         batch_size: Stored on the instance; defaults to 50.
         curritr: Stored on the instance as ``self.curritr``; defaults
             to 0. Presumably the current iteration number -- confirm
             against the methods that read it.
     """
     self.worker_index = worker_index
     self.batch_size = batch_size
     # Each worker loads MNIST with its own seed.
     self.mnist = model.download_mnist_retry(seed=worker_index)
     self.net = model.SimpleCNN()
     self.curritr = curritr
예제 #11
0
if __name__ == "__main__":
    args = parser.parse_args()

    ray.init(redis_address=args.redis_address)

    # Create a parameter server.
    net = model.SimpleCNN()
    ps = ParameterServer.remote(1e-4 * args.num_workers)

    # Create workers.
    workers = [Worker.remote(worker_index)
               for worker_index in range(args.num_workers)]

    # Download MNIST.
    mnist = model.download_mnist_retry()

    iteration = 0
    backups = args.backups #no. of stragglers, we will ignore results from them
    print(args.num_workers, backups)
    current_weights = ps.get_weights.remote()

    
    k = args.num_workers-backups
    while iteration<=100:
        # Compute and apply gradients.
        # compute_tasks = [worker.compute_gradients.remote(current_weights) for worker in workers]
        fobj_to_workerID_dict = {} #mapping between remotefns to worker_ids
        compute_tasks = []
        
        for i in range(k):
        ray_ctx = OrcaContext.get_ray_context()
    else:
        print(
            "init_orca_context failed. cluster_mode should be one of 'local', 'yarn' and 'spark-submit' but got "
            + cluster_mode)

    # Create a parameter server with some random weights.
    net = SimpleCNN()
    all_keys, all_values = net.get_weights()
    ps = ParameterServer.remote(all_keys, all_values)

    # Start some training tasks.
    worker_tasks = [worker_task.remote(ps, i) for i in range(args.num_workers)]

    # Download MNIST.
    mnist = download_mnist_retry()
    print("Begin iteration")
    i = 0
    while i < args.iterations:
        # Get and evaluate the current model.
        print("-----Iteration" + str(i) + "------")
        current_weights = ray.get(ps.pull.remote(all_keys))
        net.set_weights(all_keys, current_weights)
        test_xs, test_ys = mnist.test.next_batch(1000)
        accuracy = net.compute_accuracy(test_xs, test_ys)
        print("Iteration {}: accuracy is {}".format(i, accuracy))
        i += 1
        time.sleep(1)
    ray_ctx.stop()
    stop_orca_context()
예제 #13
0
 def __init__(self, worker_index, batch_size=50):
     """Set up one worker's dataset and model.

     Args:
         worker_index: Stored on the instance and used as the seed passed
             to ``download_mnist_retry``.
         batch_size: Stored on the instance; defaults to 50.
     """
     self.worker_index = worker_index
     self.batch_size = batch_size
     # Each worker loads MNIST with its own seed.
     self.mnist = download_mnist_retry(seed=worker_index)
     self.net = SimpleCNN()