def main():
    print('start job ...')
    start_time = time.time()

    # 1. Create infrastructure.
    supported_regions = [
        'cn-huhehaote', 'cn-shanghai', 'cn-zhangjiakou', 'cn-hangzhou',
        'cn-beijing'
    ]
    assert ncluster.get_region() in supported_regions, (
        f"required AMI {IMAGE_NAME} has only been made available in regions "
        f"{supported_regions}, but your current region is "
        f"{ncluster.get_region()} (set $ALYUN_DEFAULT_REGION)")
    ncluster_globals.set_should_disable_nas(True)

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}-{args.machines}",
                            num_tasks=args.machines,
                            instance_type=INSTANCE_TYPE,
                            disable_nas=True,
                            spot=True,
                            install_script='')
    init_ncluster = time.time()
    print('init ncluster:', init_ncluster - start_time)

    # 2. Upload the GTC code and download the demo data.
    job.run('yum install -y unzip')
    job.upload('GTC')
    job.run(
        'cd GTC && wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/dataset.zip '
        '&& wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/test.JPG '
        '&& wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/resnet50-19c8e357.pth '
        '&& conda activate torch_1.3_cu10.0_py36')
    upload_data = time.time()
    print('upload_data time:', upload_data - init_ncluster)

    # 3. Prepare the dataset.
    job.run('unzip -o dataset.zip')
    unzip_time = time.time()
    print('unzip data:', unzip_time - upload_data)

    # 4. Run the training job (blocking, so the timing below covers the full run).
    job.tasks[0].run('conda activate torch_1.3_cu10.0_py36')
    job.tasks[0].run('./run-perseus.sh 2>&1 | tee logs.log', non_blocking=False)
    train_time = time.time()
    print('training time:', train_time - unzip_time)

    # 5. Run the inference job.
    job.tasks[0].run('python inference.py 2>&1 | tee logs.inference.log',
                     non_blocking=False)
    print('inference time:', time.time() - train_time)

    elapsed_time = time.time() - start_time
    print(f'training and inference deploy time is: {elapsed_time} s.')

    # 6. Stop the instance (optional).
    job.stop()
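# The per-step timing bookkeeping above (time.time() pairs plus print calls) could be
# factored into a small helper. A minimal sketch, not part of the original demo; the
# `timed` name and the usage shown are illustrative only:
import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    """Print how long the enclosed deployment step took."""
    begin = time.time()
    yield
    print(f'{label}: {time.time() - begin:.1f} s')

# Hypothetical usage inside main(), after the job has been created:
# with timed('upload_data time'):
#     job.run('yum install -y unzip')
#     job.upload('GTC')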
def main():
    start_time = time.time()

    # 1. Create infrastructure.
    supported_regions = [
        'cn-huhehaote', 'cn-zhangjiakou', 'cn-shanghai', 'cn-hangzhou',
        'cn-beijing'
    ]
    assert ncluster.get_region() in supported_regions, (
        f"required AMI {IMAGE_NAME} has only been made available in regions "
        f"{supported_regions}, but your current region is "
        f"{ncluster.get_region()} (set $ALYUN_DEFAULT_REGION)")
    ncluster_globals.set_should_disable_nas(True)

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}-{args.machines}",
                            num_tasks=args.machines,
                            disable_nas=True,
                            spot=True,
                            instance_type=INSTANCE_TYPE)

    # 2. Upload the Perseus BERT code.
    job.run('yum install -y unzip')
    job.upload('perseus-bert')
    job.run('conda activate tensorflow_1.14_cu10.0_py36')

    # 3. Download the pretrained model and dataset.
    BERT_CHINESE_BASE_DIR = '/root/chinese_L-12_H-768_A-12'
    DATA_DIR = '/root/toutiao_data'
    job.run(
        'wget -c -t 10 https://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/chinese_L-12_H-768_A-12.zip '
        '&& unzip chinese_L-12_H-768_A-12.zip')
    job.run(
        'wget -c -t 10 https://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/toutiao_data.tgz '
        '&& tar xvf toutiao_data.tgz')

    # 4. Run the training job.
    job.run('cd perseus-bert')
    hosts = [task.ip + f':{NUM_GPUS}' for task in job.tasks]
    host_str = ','.join(hosts)

    mpi_cmd = [
        'mpirun --allow-run-as-root',
        f'-np {args.machines * NUM_GPUS}',
        f'--npernode {NUM_GPUS}',
        f'--host {host_str}',
        '--bind-to none',
        '-x NCCL_DEBUG=INFO',
        '-x PATH',
        '-x PYTHONPATH',
        '-x LD_LIBRARY_PATH',
        '-x XLA_FLAGS',
    ]

    bert_classifier_cmd = [
        'python run_classifier.py',
        '--task_name=news',
        '--do_train=true',
        '--do_eval=true',
        f'--data_dir={DATA_DIR}',
        f'--vocab_file={BERT_CHINESE_BASE_DIR}/vocab.txt',
        f'--bert_config_file={BERT_CHINESE_BASE_DIR}/bert_config.json',
        f'--init_checkpoint={BERT_CHINESE_BASE_DIR}/bert_model.ckpt',
        '--max_seq_length=128',
        '--train_batch_size=48',
        '--learning_rate=8e-5',
        '--num_train_epochs=3.0',
        '--warmup_proportion=0.8',
        '--output_dir=/root/output_dir',
        '--use_amp=true',
        '--use_perseus=true',
        '--use_xla=true',
    ]

    cmd = " ".join(mpi_cmd + bert_classifier_cmd)
    job.tasks[0].run(f'echo {cmd} > {job.logdir}/task-cmd')
    job.tasks[0].run(cmd, non_blocking=True)
    print(f"Logging to {job.logdir}")

    elapsed_time = time.time() - start_time
    print(f'training deploy time is: {elapsed_time} s.')

    job.stop()
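# For reference, a sketch of the command string the two lists above join into, for a
# hypothetical single-machine run; the 192.168.0.1 address is a placeholder (real
# values come from job.tasks) and the trailing classifier flags are elided:
example_host_str = '192.168.0.1:8'
example_cmd = ' '.join([
    'mpirun --allow-run-as-root',
    '-np 8',
    '--npernode 8',
    f'--host {example_host_str}',
    '--bind-to none',
    '-x NCCL_DEBUG=INFO -x PATH -x PYTHONPATH -x LD_LIBRARY_PATH -x XLA_FLAGS',
    'python run_classifier.py --task_name=news --do_train=true ...',
])
print(example_cmd)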
def main():
    start_time = time.time()

    # 1. Create infrastructure.
    supported_regions = [
        'cn-huhehaote', 'cn-zhangjiakou', 'cn-shanghai', 'cn-hangzhou',
        'cn-beijing'
    ]
    assert ncluster.get_region() in supported_regions, (
        f"required AMI {IMAGE_NAME} has only been made available in regions "
        f"{supported_regions}, but your current region is "
        f"{ncluster.get_region()} (set $ALYUN_DEFAULT_REGION)")
    ncluster_globals.set_should_disable_nas(True)

    job = ncluster.make_job(
        name=args.name,
        run_name=f"{args.name}-{args.machines}",
        # image_name='aiacc-dlimg-centos7:1.3.0.post3',
        num_tasks=args.machines,
        instance_type=INSTANCE_TYPE,
        spot=True,
        disable_nas=True,
    )

    # 2. Upload the Perseus Faster R-CNN code.
    job.upload('gluon-cv')
    job.run('conda activate mxnet_1.5.1.post0_cu10.0_py36')

    # 3. Download the pretrained model and dataset.
    job.run('mkdir /root/mscoco')
    job.run(
        'cd /root/mscoco && wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/coco2017/annotations/annotations_trainval2017.zip'
    )
    job.run(
        'wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/coco2017/zips/train2017.zip'
    )
    job.run(
        'wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/coco2017/zips/test2017.zip'
    )
    job.run(
        'wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/coco2017/zips/val2017.zip'
    )
    job.run('mkdir -p /root/.mxnet/models')
    job.run(
        'cd /root/.mxnet/models && wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/pretrain_model/resnet50_v1b-0ecdba34.params'
    )

    # 4. Install requirements and prepare the COCO data.
    job.run('cd /root/gluon-cv/')
    job.run('pip install -r requirements.txt')
    job.run('python mscoco.py')

    # 5. Run the training job.
    hosts = [task.ip + f':{NUM_GPUS}' for task in job.tasks]
    host_str = ','.join(hosts)

    mpi_cmd = [
        'mpirun --allow-run-as-root',
        f'-np {args.machines * NUM_GPUS}',
        f'--npernode {NUM_GPUS}',
        f'--host {host_str}',
        '--bind-to none',
        '-x NCCL_DEBUG=INFO',
        '-x PATH',
        '-x LD_LIBRARY_PATH',
    ]
    train_cmd = './train-perseus.sh'

    cmd = " ".join(mpi_cmd) + " " + train_cmd
    job.tasks[0].run(f'echo {cmd} > {job.logdir}/task-cmd')
    job.tasks[0].run(cmd, non_blocking=True)
    print(f"Logging to {job.logdir}")

    elapsed_time = time.time() - start_time
    print(f'training deploy time is: {elapsed_time} s.')
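# This script and the BERT script above assemble nearly the same mpirun prefix. One way
# to factor that out is sketched below; build_mpi_cmd is not part of the original
# scripts and is shown only as an illustration:
def build_mpi_cmd(num_machines, num_gpus, host_str, extra_env=()):
    """Return the mpirun prefix that launches one training process per GPU."""
    cmd = [
        'mpirun --allow-run-as-root',
        f'-np {num_machines * num_gpus}',
        f'--npernode {num_gpus}',
        f'--host {host_str}',
        '--bind-to none',
        '-x NCCL_DEBUG=INFO',
        '-x PATH',
        '-x LD_LIBRARY_PATH',
    ]
    # Forward any additional environment variables (e.g. PYTHONPATH, XLA_FLAGS).
    cmd += [f'-x {name}' for name in extra_env]
    return cmd

# Hypothetical usage for the Faster R-CNN job:
# cmd = ' '.join(build_mpi_cmd(args.machines, NUM_GPUS, host_str)) + ' ./train-perseus.sh'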
#!/usr/bin/env python
import argparse
import ncluster
import os
import time
from ncluster import ncluster_globals

ncluster_globals.set_should_disable_nas(True)

INSTANCE_TYPE = 'ecs.gn6v-c10g1.20xlarge'  # V100
#INSTANCE_TYPE = 'ecs.gn6v-c8g1.16xlarge'
#INSTANCE_TYPE = 'ecs.gn5-c8g1.14xlarge'
NUM_GPUS = 8

ncluster.set_backend('aliyun')

parser = argparse.ArgumentParser()
parser.add_argument(
    '--name',
    type=str,
    default='perseus-faster-rcnn',
    help="name of the current run, used for machine naming and tensorboard visualization"
)
parser.add_argument('--machines',
                    type=int,
                    default=1,
                    help="how many machines to use")
args = parser.parse_args()
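# Entry point; the original scripts presumably invoke main() this way. The file name in
# the usage example below is illustrative, not taken from the source:
if __name__ == '__main__':
    main()

# Example invocation (hypothetical file name):
#   python train_fasterrcnn.py --name perseus-faster-rcnn --machines 2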