        else:
            printstr = "train_loss: %.4f, train_time: %.4f" % (tra_loss, ep_en - ep_st)
        log.write(printstr)
    log.write('all time: %f' % (time.time() - start))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--val", action="store_true",
                        help="whether to perform validation")
    parser.add_argument("--all", action="store_true",
                        help="whether to use all data, default to use 1024000 training data")
    parser.add_argument("--comm", default=None,
                        help="whether to use distributed setting, can be None, AllReduce, PS, Hybrid")
    parser.add_argument("--bsp", action="store_true",
                        help="whether to use bsp instead of asp")
    parser.add_argument("--cache", default=None, help="cache policy")
    parser.add_argument("--bound", default=100, type=int, help="cache bound")
    parser.add_argument("--config", type=str, default="./settings/local_s1_w4.yml",
                        help="configuration for ps")
    args = parser.parse_args()
    if args.comm is None:
        # local run without any distributed communication
        worker(args)
    elif args.comm == 'Hybrid':
        # export the shared PS settings from the config file as environment
        # variables, then run this process as a worker
        settings = yaml.load(open(args.config).read(), Loader=yaml.FullLoader)
        value = settings['shared']
        os.environ['DMLC_ROLE'] = 'worker'
        for k, v in value.items():
            os.environ[k] = str(v)
        worker(args)
    elif args.comm == 'PS':
        launch(worker, args)
    else:
        raise NotImplementedError
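# --- Illustrative only ---
# A minimal sketch of what the PS config passed via --config might contain,
# written here as ./settings/local_s1_w4.yml (1 server, 4 workers). The key
# names are an assumption based on the standard DMLC environment variables;
# all that the 'Hybrid' branch above requires is that everything under
# 'shared' can be copied into os.environ.
import yaml

settings = {
    'shared': {
        'DMLC_PS_ROOT_URI': '127.0.0.1',  # assumed: scheduler address
        'DMLC_PS_ROOT_PORT': 13100,       # assumed: scheduler port
        'DMLC_NUM_WORKER': 4,
        'DMLC_NUM_SERVER': 1,
    },
}
with open('./settings/local_s1_w4.yml', 'w') as f:
    yaml.dump(settings, f)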
        # training accuracy over the masked nodes of the current subgraph
        train_acc = np.sum((y_predicted == g_sample.y) * mask)
        stat.update(acc, mask_eval.sum(),
                    np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())
        stat.update_train(train_acc, mask.sum(),
                          np.sum(loss_val.asnumpy() * mask) / mask.sum())
        # distributed.ps_get_worker_communicator().BarrierWorker()
        nnodes += mask.sum() + mask_eval.sum()
        if nnodes > meta["partition"]["nodes"][rank]:
            # one pass over this worker's partition counts as an epoch
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        # swap in the subgraph prefetched for the next iteration
        g_sample, mp_val, mask, mask_eval = g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("config")
    parser.add_argument("--path", "-p", required=True)
    parser.add_argument("--num_epoch", default=300, type=int)
    parser.add_argument("--hidden_size", default=128, type=int)
    parser.add_argument("--learning_rate", default=1, type=float)
    parser.add_argument("--batch_size", default=128, type=int)
    parser.add_argument("--cache", default="LFUOpt", type=str)
    args = parser.parse_args()
    stat = SharedTrainingStat()
    launch(train_main, args)
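# --- Illustrative only ---
# Hypothetical single-process stand-in for SharedTrainingStat, sketching the
# interface the loop above exercises (update / update_train / print). The real
# class shares these counters across worker processes; this version does not.
# It assumes `acc` is a correct-prediction count and `loss` a mean loss over
# `count` nodes, matching how the loop above computes its arguments.
class LocalTrainingStat:
    def __init__(self):
        self.acc = self.cnt = self.loss = 0.0     # evaluation counters
        self.tacc = self.tcnt = self.tloss = 0.0  # training counters

    def update(self, acc, count, loss):
        self.acc += acc
        self.cnt += count
        self.loss += loss * count

    def update_train(self, acc, count, loss):
        self.tacc += acc
        self.tcnt += count
        self.tloss += loss * count

    def print(self, epoch):
        print("epoch %d: train acc %.4f loss %.4f | eval acc %.4f loss %.4f" % (
            epoch,
            self.tacc / max(self.tcnt, 1), self.tloss / max(self.tcnt, 1),
            self.acc / max(self.cnt, 1), self.loss / max(self.cnt, 1)))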
import ctypes
import argparse

import numpy as np
from tqdm import tqdm

# `ad`, `launch`, and `CacheSparseTable` are provided by the framework;
# their imports are omitted in this snippet.


def test(args):
    comm = ad.get_worker_communicate()
    node_id = 0
    limit = 10000   # cache capacity (rows kept locally)
    length = 10000  # number of rows in the parameter table
    width = 128     # embedding dimension
    comm.InitTensor(ctypes.c_int(node_id), ctypes.c_int(2), ctypes.c_int(length),
                    ctypes.c_int(width), ctypes.c_int(2), ctypes.c_double(0),
                    ctypes.c_double(0.1), ctypes.c_ulonglong(123),
                    ctypes.c_int(0), (ctypes.c_float * 1)(0.1), ctypes.c_int(1))
    cache = CacheSparseTable(limit, length, width, node_id, "LFUOpt")
    for i in tqdm(range(10000)):
        # asynchronous lookup: `value` is filled once the returned handle completes
        key = np.random.randint(10000, size=1000).astype(np.uint64)
        value = np.empty((key.size, width), np.float32)
        ts = cache.embedding_lookup(key, value)
        ts.wait()
        # push a random gradient back for the same keys
        grad = np.random.rand(key.size, width).astype(np.float32)
        ts = cache.embedding_update(key, grad)
        ts.wait()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("config")
    args = parser.parse_args()
    launch(test, args)
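# --- Illustrative only ---
# A rough throughput harness built from the same calls used in test() above
# (embedding_lookup / embedding_update / wait); nothing here beyond those is
# part of the real API. It would be called with the `cache` constructed in
# test(), e.g. benchmark(cache, width).
import time


def benchmark(cache, width, iters=1000, batch=1000, nrows=10000):
    start = time.time()
    for _ in range(iters):
        key = np.random.randint(nrows, size=batch).astype(np.uint64)
        value = np.empty((key.size, width), np.float32)
        cache.embedding_lookup(key, value).wait()
        grad = np.random.rand(key.size, width).astype(np.float32)
        cache.embedding_update(key, grad).wait()
    elapsed = time.time() - start
    print("%.1f lookup+update rounds/s (%.3f ms each)"
          % (iters / elapsed, 1000 * elapsed / iters))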