Example #1
        else:
            printstr = "train_loss: %.4f, train_time: %.4f" % (tra_loss, ep_en - ep_st)
        log.write(printstr)  # per-epoch loss / timing line
    log.write('all time: %f' % (time.time() - start))  # total wall-clock time


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--val", action="store_true", help="whether to perform validation")
    parser.add_argument("--all", action="store_true", help="whether to use all data, default to use 1024000 training data")
    parser.add_argument("--comm", default=None, help="whether to use distributed setting, can be None, AllReduce, PS, Hybrid")
    parser.add_argument("--bsp", action="store_true", help="whether to use bsp instead of asp")
    parser.add_argument("--cache", default=None, help="cache policy")
    parser.add_argument("--bound", default=100, help="cache bound")
    parser.add_argument("--config", type=str, default="./settings/local_s1_w4.yml", help="configuration for ps")
    args = parser.parse_args()

    if args.comm is None:
        worker(args)
    elif args.comm == 'Hybrid':
        # export the 'shared' section of the YAML config as environment variables
        with open(args.config) as f:
            settings = yaml.load(f, Loader=yaml.FullLoader)
        value = settings['shared']
        os.environ['DMLC_ROLE'] = 'worker'
        for k, v in value.items():
            os.environ[k] = str(v)
        worker(args)
    elif args.comm == 'PS':
        launch(worker, args)
    else:
        raise NotImplementedError  # 'AllReduce' is accepted by --comm but not handled here
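In the Hybrid path, only the `shared` section of the YAML file is consumed: each entry is exported verbatim as an environment variable next to DMLC_ROLE. A hypothetical parse result for a config like ./settings/local_s1_w4.yml is sketched below; the DMLC_* key names are an assumption based on standard ps-lite variables, and reading "s1"/"w4" as one server and four workers is likewise a guess from the file name.

# Hypothetical contents of a config such as ./settings/local_s1_w4.yml after
# yaml.load; only the 'shared' section is read by the Hybrid branch above.
# The DMLC_* keys are assumptions (standard ps-lite variables).
settings = {
    'shared': {
        'DMLC_PS_ROOT_URI': '127.0.0.1',  # scheduler address
        'DMLC_PS_ROOT_PORT': 13100,       # scheduler port
        'DMLC_NUM_SERVER': 1,             # "s1" in the file name
        'DMLC_NUM_WORKER': 4,             # "w4" in the file name
    }
}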
Example #2
        train_acc = np.sum((y_predicted == g_sample.y) * mask)  # correct predictions on train nodes
        stat.update(acc, mask_eval.sum(),
                    np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())  # eval accuracy / mean loss
        stat.update_train(train_acc, mask.sum(),
                          np.sum(loss_val.asnumpy() * mask) / mask.sum())      # train accuracy / mean loss

        # distributed.ps_get_worker_communicator().BarrierWorker()
        nnodes += mask.sum() + mask_eval.sum()  # labeled nodes processed so far
        if nnodes > meta["partition"]["nodes"][rank]:
            # one full pass over this worker's partition counts as an epoch
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        # rotate in the prefetched next batch
        g_sample, mp_val, mask, mask_eval = g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("config")
    parser.add_argument("--path", "-p", required=True)
    parser.add_argument("--num_epoch", default=300, type=int)
    parser.add_argument("--hidden_size", default=128, type=int)
    parser.add_argument("--learning_rate", default=1, type=float)
    parser.add_argument("--batch_size", default=128, type=int)
    parser.add_argument("--cache", default="LFUOpt", type=str)
    args = parser.parse_args()
    stat = SharedTrainingStat()
    launch(train_main, args)
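The metric updates near the top of the snippet are plain masked reductions. A minimal self-contained sketch with made-up numbers (names mirror the snippet; the values are hypothetical):

import numpy as np

# Hypothetical toy values; mask marks this worker's labeled training nodes.
y_predicted = np.array([0, 1, 1, 0])
y_true      = np.array([0, 1, 0, 0])         # stands in for g_sample.y
mask        = np.array([1, 1, 0, 0])         # first two nodes are train nodes
loss_val    = np.array([0.2, 0.5, 0.9, 0.1])

train_acc = np.sum((y_predicted == y_true) * mask)  # 2 correct train predictions
mean_loss = np.sum(loss_val * mask) / mask.sum()    # 0.35, averaged over train nodes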
Example #3
import ctypes
import argparse
import numpy as np
from tqdm import tqdm
# `ad` (the framework's ops module), CacheSparseTable, and launch are provided
# by the surrounding framework; their imports are elided in this listing.


def test(args):
    comm = ad.get_worker_communicate()
    node_id = 0
    limit = 10000    # cache capacity (cf. the "cache bound" option in Example #1)
    length = 10000   # rows in the embedding table
    width = 128      # embedding dimension
    # initialize the embedding tensor through the worker-server communicator
    comm.InitTensor(
        ctypes.c_int(node_id), ctypes.c_int(2), ctypes.c_int(length),
        ctypes.c_int(width), ctypes.c_int(2), ctypes.c_double(0),
        ctypes.c_double(0.1), ctypes.c_ulonglong(123),
        ctypes.c_int(0), (ctypes.c_float * 1)(0.1), ctypes.c_int(1))
    cache = CacheSparseTable(limit, length, width, node_id, "LFUOpt")
    for _ in tqdm(range(10000)):
        # pull 1000 random rows through the cache ...
        key = np.random.randint(10000, size=1000).astype(np.uint64)
        value = np.empty((key.size, width), np.float32)
        ts = cache.embedding_lookup(key, value)
        ts.wait()
        # ... then push back random gradients for the same rows
        grad = np.random.rand(key.size, width).astype(np.float32)
        ts = cache.embedding_update(key, grad)
        ts.wait()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("config")
    args = parser.parse_args()
    launch(test, args)
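As the loop suggests, lookups fill a caller-allocated buffer and both operations return a handle that must be waited on before the result is used. A minimal sketch of that contract, reusing `cache` and `width` from test() above (the row ids are hypothetical):

# Buffer/handle contract suggested by the benchmark loop above.
key = np.array([0, 1, 2], dtype=np.uint64)     # rows of the 10000 x 128 table
out = np.empty((key.size, width), np.float32)  # allocated by the caller
ts = cache.embedding_lookup(key, out)          # returns an async handle
ts.wait()                                      # out is only valid after wait()
print(out.shape)                               # (3, 128)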