def run(args, device_id, error_queue): """ run process """ setattr(args, 'gpu_ranks', [int(i) for i in args.gpu_ranks]) try: gpu_rank = distributed.multi_init(device_id, args.world_size, args.gpu_ranks, args.init_method) print('gpu_rank %d' % gpu_rank) if gpu_rank != args.gpu_ranks[device_id]: raise AssertionError("An error occurred in \ Distributed initialization") recover_all = False pt = None if args.recover_from != '': pt = args.recover_from recover_all = True elif args.extract_model != '': pt = args.extract_model if args.mode == 'abs': abs_train(args, device_id, pt, recover_all) else: train(args, device_id) except KeyboardInterrupt: pass # killed by parent, do nothing except Exception: # propagate exception to parent process, keeping original traceback import traceback error_queue.put((args.gpu_ranks[device_id], traceback.format_exc()))
def run(args, device_id, error_queue): """ run process """ setattr(args, 'gpu_ranks', [int(i) for i in args.gpu_ranks]) try: gpu_rank = distributed.multi_init(device_id, args.world_size, args.gpu_ranks) print('gpu_rank %d' % gpu_rank) if gpu_rank != args.gpu_ranks[device_id]: raise AssertionError("An error occurred in \ Distributed initialization") train_single_ext(args, device_id) except KeyboardInterrupt: pass # killed by parent, do nothing except Exception: # propagate exception to parent process, keeping original traceback import traceback error_queue.put((args.gpu_ranks[device_id], traceback.format_exc()))