Example #1
# Imports inferred from usage; config, init_process and train_model come from the surrounding project.
from multiprocessing import Process
import logging
import time


def run_distributed():
    size = config.NB_PROCESSES
    processes = []
    for rank in range(size):
        p = Process(target=init_process, args=(rank, size, train_model))
        p.start()
        processes.append(p)

    while all(p.is_alive() for p in processes):
        time.sleep(5)

    for p in processes:
        p.kill()
        p.join()
    logging.info("Main process exit")
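Example #1 polls is_alive() until the first worker exits and then stops the remaining ones. Below is a minimal, self-contained sketch of the same watchdog pattern; the worker function is a placeholder, not part of the original project.

import time
from multiprocessing import Process


def worker(rank):
    time.sleep(rank + 1)  # stand-in for real work


if __name__ == "__main__":
    procs = [Process(target=worker, args=(r,)) for r in range(4)]
    for p in procs:
        p.start()

    # Poll until at least one worker has finished.
    while all(p.is_alive() for p in procs):
        time.sleep(0.5)

    # Stop the remaining workers and reap them.
    for p in procs:
        p.terminate()
        p.join()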
Example #2
    def _launch_procs(self, num_procs):
        mp.set_start_method('forkserver', force=True)
        skip_msg = mp.Queue()  # Allows forked processes to share a pytest.skip reason
        processes = []
        for local_rank in range(num_procs):
            p = Process(target=self._dist_init,
                        args=(local_rank, num_procs, skip_msg))
            p.start()
            processes.append(p)

        # Now loop and wait for a test to complete. The spin-wait here isn't a big
        # deal because the number of processes will be O(#GPUs) << O(#CPUs).
        any_done = False
        while not any_done:
            for p in processes:
                if not p.is_alive():
                    any_done = True
                    break

        # Wait for all other processes to complete
        for p in processes:
            p.join(DEEPSPEED_UNIT_WORKER_TIMEOUT)

        failed = [(rank, p) for rank, p in enumerate(processes)
                  if p.exitcode != 0]
        for rank, p in failed:
            # If it still hasn't terminated, kill it because it hung.
            if p.exitcode is None:
                p.terminate()
                pytest.fail(f'Worker {rank} hung.', pytrace=False)
            if p.exitcode < 0:
                pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}',
                            pytrace=False)
            if p.exitcode > 0:
                pytest.fail(f'Worker {rank} exited with code {p.exitcode}',
                            pytrace=False)

        if not skip_msg.empty():
            # This assumes all skip messages are the same; it may be useful to
            # add a check here asserting that all skip messages are equal.
            pytest.skip(skip_msg.get())
Example #3
        def dist_launcher(num_procs, *func_args, **func_kwargs):
            """Launch processes and gracefully handle failures."""

            # Spawn all workers on subprocesses.
            processes = []
            for local_rank in range(num_procs):
                p = Process(
                    target=dist_init,
                    args=(local_rank, num_procs, *func_args),
                    kwargs=func_kwargs,
                )
                p.start()
                processes.append(p)

            # Now loop and wait for a test to complete. The spin-wait here isn't a big
            # deal because the number of processes will be O(#GPUs) << O(#CPUs).
            any_done = False
            while not any_done:
                for p in processes:
                    if not p.is_alive():
                        any_done = True
                        break

            # Wait for all other processes to complete
            for p in processes:
                p.join(DEEPSPEED_UNIT_WORKER_TIMEOUT)

            failed = [(rank, p) for rank, p in enumerate(processes)
                      if p.exitcode != 0]
            for rank, p in failed:
                # If it still hasn't terminated, kill it because it hung.
                if p.exitcode is None:
                    p.terminate()
                    pytest.fail(f"Worker {rank} hung.", pytrace=False)
                if p.exitcode < 0:
                    pytest.fail(
                        f"Worker {rank} killed by signal {-p.exitcode}",
                        pytrace=False)
                if p.exitcode > 0:
                    pytest.fail(f"Worker {rank} exited with code {p.exitcode}",
                                pytrace=False)
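Examples #2 and #3 share the same launcher pattern: spawn the workers, spin until one of them finishes, join the rest with a timeout, then classify each worker by its exitcode. A self-contained sketch of that pattern follows; the worker body and TIMEOUT are placeholders (the originals use the distributed test body and DEEPSPEED_UNIT_WORKER_TIMEOUT), and plain exceptions stand in for pytest.fail.

import time
from multiprocessing import Process

TIMEOUT = 30  # seconds; placeholder for DEEPSPEED_UNIT_WORKER_TIMEOUT


def worker(rank, world_size):
    time.sleep(0.1 * rank)  # stand-in for the distributed test body


def launch(num_procs):
    procs = [Process(target=worker, args=(r, num_procs)) for r in range(num_procs)]
    for p in procs:
        p.start()

    # Spin until at least one worker has exited.
    while all(p.is_alive() for p in procs):
        time.sleep(0.1)

    # Give the remaining workers a bounded amount of time to finish.
    for p in procs:
        p.join(TIMEOUT)

    # exitcode is None if the worker is still running (hung), < 0 if it was
    # killed by a signal, and > 0 if it exited with an error.
    for rank, p in enumerate(procs):
        if p.exitcode is None:
            p.terminate()
            raise RuntimeError("worker %d hung" % rank)
        if p.exitcode != 0:
            raise RuntimeError("worker %d failed with exitcode %d" % (rank, p.exitcode))


if __name__ == "__main__":
    launch(4)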
Example #4
# Imports inferred from usage; fill_batch is defined elsewhere in the original project.
from collections import deque
from multiprocessing import Process, Queue
from queue import Empty

import torch
from torch import LongTensor


class DataLoaderMultiFiles(object):
    """DataLoader to iterate over a set of DataSets."""
    def __init__(self, dataset, batch_s):
        self.dataset = dataset
        self.batch_size = batch_s
        self.index_queue = deque(torch.randperm(len(self.dataset)).tolist())
        self.batch_queue = Queue(maxsize=5)

    def __iter__(self):
        print('new iteration of dataloader')
        args = (self.batch_queue, self.index_queue, self.dataset,
                self.batch_size)
        self.batch_process = Process(target=fill_batch, args=args)
        self.batch_process.daemon = True
        self.batch_process.start()
        return self

    def is_alive(self):
        # return sum([e.is_alive() for e in self.buffr_processes])
        return self.batch_process.is_alive()

    def __next__(self):
        # print('batch_queue: {}'.format(self.batch_queue.qsize()))
        timeout = 600 if self.is_alive() else 1
        try:
            batch = self.batch_queue.get(timeout=timeout)
        except Empty:
            print('empty')
            self.kill()
            raise StopIteration
        # print('got batch')
        tmp = LongTensor(batch)
        # print('computing')
        return tmp

    def kill(self):
        print('Killing processes')
        # Guard against __del__ running before __iter__ has created the worker process.
        batch_process = getattr(self, 'batch_process', None)
        if batch_process is not None:
            batch_process.terminate()

    def __del__(self):
        self.kill()
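DataLoaderMultiFiles pairs a daemon producer process with a bounded Queue and uses is_alive() to pick the get() timeout: wait patiently while the producer is still running, give up quickly once it has exited. A minimal sketch of that idea, with a placeholder producer in place of the original fill_batch worker:

from multiprocessing import Process, Queue
from queue import Empty


def producer(q, items):
    for item in items:
        q.put(item)  # blocks when the bounded queue is full


def consume(items):
    q = Queue(maxsize=5)
    p = Process(target=producer, args=(q, items), daemon=True)
    p.start()
    while True:
        # Wait patiently while the producer is alive; give up quickly otherwise.
        timeout = 10 if p.is_alive() else 1
        try:
            yield q.get(timeout=timeout)
        except Empty:
            break
    p.join()


if __name__ == "__main__":
    print(list(consume(range(8))))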
Example #5
def train(pv_net,dev_train_nums=[0,],dev_bench_num=1):
    import torch.optim as optim
    import gc
    data_rounds=64 #64
    data_timeout=30 #96
    data_timerest=10 #20
    loss2_weight=0.03
    train_mcts_b=0
    train_mcts_k=2
    review_number=3
    age_in_epoch=3
    log("loss2_weight: %.2f, data_rounds: %dx%d, train_mcts_b: %d, train_mcts_k: %.1f, review_number: %d, age_in_epoch: %d"
        %(loss2_weight,len(dev_train_nums),data_rounds,train_mcts_b,train_mcts_k,review_number,age_in_epoch))

    device_main=torch.device("cuda:0")
    pv_net=pv_net.to(device_main)
    optimizer=optim.Adam(pv_net.parameters(),lr=0.0004,betas=(0.9,0.999),eps=1e-07,weight_decay=1e-4,amsgrad=False)
    log("optimizer: %s"%(optimizer.__dict__['defaults'],))

    train_datas=[]
    p_benchmark=None
    data_queue=Queue()
    for epoch in range(4000):
        if epoch%90==0:
            save_name='%s-%s-%s-%d.pkl'%(pv_net.__class__.__name__,pv_net.num_layers(),pv_net.num_paras(),epoch)
            #torch.save(pv_net,save_name)
            torch.save(pv_net.state_dict(),save_name)
            if p_benchmark is not None:
                if p_benchmark.is_alive():
                    log("waiting benchmark threading to join")
                p_benchmark.join()
            p_benchmark=Process(target=benchmark,args=(save_name,epoch,dev_bench_num))
            p_benchmark.start()

        if (epoch<=5) or (epoch<30 and epoch%5==0) or epoch%30==0:
            output_flag=True
            log("gc len at %d: %d"%(epoch,len(gc.get_objects())))
        else:
            output_flag=False

        #start prepare data processes
        for i in dev_train_nums:
            args=(copy.deepcopy(pv_net),i,data_rounds,train_mcts_b,train_mcts_k,data_queue)
            #p=Process(target=prepare_train_data_complete_info,args=args)
            p=Process(target=clean_worker,args=args)
            p.start()
        # the for-loop above has no break, so this always runs once all workers have been started
        time.sleep(data_timerest)

        #collect data
        if epoch>=review_number:
            train_datas=train_datas[len(train_datas)//review_number:]
        for i in range(len(dev_train_nums)*4):
            try:
                if i==0:
                    queue_get=data_queue.get(block=True,timeout=data_timeout*2+data_timerest)
                else:
                    queue_get=data_queue.get(block=True,timeout=data_timerest)
                train_datas+=queue_get
            except Exception:  # most likely queue.Empty raised by data_queue.get on timeout
                log("get data failed AGAIN at epoch %d! Has got %d datas."%(epoch,len(train_datas)),l=2)

        trainloader=torch.utils.data.DataLoader(train_datas,batch_size=128,drop_last=True,shuffle=True)
        for age in range(age_in_epoch):
            running_loss1=[];running_loss2=[]
            for batch in trainloader:
                p,v=pv_net(batch[0].to(device_main))
                log_p=F.log_softmax(p*batch[3].to(device_main),dim=1)
                loss1=F.kl_div(log_p,batch[1].to(device_main),reduction="batchmean")
                loss2=F.mse_loss(v.view(-1),batch[2].to(device_main),reduction='mean').sqrt()
                optimizer.zero_grad()
                loss=loss1+loss2*loss2_weight
                loss.backward()
                optimizer.step()
                running_loss1.append(loss1.item())
                running_loss2.append(loss2.item())
            batchnum=len(running_loss1)
            running_loss1=numpy.mean(running_loss1)
            running_loss2=numpy.mean(running_loss2)

            if output_flag and age==0:
                if epoch==0:
                    test_loss1=running_loss1
                    test_loss2=running_loss2
                elif epoch<review_number:
                    test_loss1=running_loss1*(epoch+1)-last_loss1*epoch
                    test_loss2=running_loss2*(epoch+1)-last_loss2*epoch
                else:
                    test_loss1=running_loss1*3-last_loss1*2
                    test_loss2=running_loss2*3-last_loss2*2
                log("%d: %.3f %.2f %d %d"%(epoch,test_loss1,test_loss2,len(train_datas),batchnum))

            if age==age_in_epoch-1:
                last_loss1=running_loss1
                last_loss2=running_loss2

            if output_flag:
                log("        epoch %d age %d: %.3f %.2f"%(epoch,age,running_loss1,running_loss2))

    log(p_benchmark)
    log("waiting benchmark threading to join: %s"%(p_benchmark.is_alive()))
    p_benchmark.join()
    log("benchmark threading should have joined: %s"%(p_benchmark.is_alive()))
Example #6
def mmseg_parallel_predict_main(para_file, trained_model):

    print(
        "MMSegmentation prediction using the trained model (run in parallel if using multiple GPUs)"
    )
    machine_name = os.uname()[1]
    start_time = datetime.now()

    if os.path.isfile(para_file) is False:
        raise IOError('File %s does not exist in the current folder: %s' %
                      (para_file, os.getcwd()))

    expr_name = parameters.get_string_parameters(para_file, 'expr_name')
    # network_ini = parameters.get_string_parameters(para_file, 'network_setting_ini')
    # mmseg_repo_dir = parameters.get_directory(network_ini, 'mmseg_repo_dir')
    # mmseg_code_dir = osp.join(mmseg_repo_dir,'mmseg')

    # if os.path.isdir(mmseg_code_dir) is False:
    #     raise ValueError('%s does not exist' % mmseg_code_dir)

    # # set PYTHONPATH to use my modified version of mmseg
    # if os.getenv('PYTHONPATH'):
    #     os.environ['PYTHONPATH'] = os.getenv('PYTHONPATH') + ':' + mmseg_code_dir
    # else:
    #     os.environ['PYTHONPATH'] = mmseg_code_dir
    # print('\nPYTHONPATH is: ',os.getenv('PYTHONPATH'))

    if trained_model is None:
        trained_model = os.path.join(expr_name, 'latest.pth')

    outdir = parameters.get_directory(para_file, 'inf_output_dir')
    # remove previous results (the user should remove this folder manually or in exe.sh)
    io_function.mkdir(outdir)

    # get name of inference areas
    multi_inf_regions = parameters.get_string_list_parameters(
        para_file, 'inference_regions')
    b_use_multiGPUs = parameters.get_bool_parameters(para_file,
                                                     'b_use_multiGPUs')

    # loop each inference regions
    sub_tasks = []
    for area_idx, area_ini in enumerate(multi_inf_regions):

        area_name = parameters.get_string_parameters(area_ini, 'area_name')
        area_remark = parameters.get_string_parameters(area_ini, 'area_remark')
        area_time = parameters.get_string_parameters(area_ini, 'area_time')

        inf_image_dir = parameters.get_directory(area_ini, 'inf_image_dir')

        # it is OK to treat a file name as a pattern and pass it to the following functions to get the file list
        inf_image_or_pattern = parameters.get_string_parameters(
            area_ini, 'inf_image_or_pattern')

        inf_img_list = io_function.get_file_list_by_pattern(
            inf_image_dir, inf_image_or_pattern)
        img_count = len(inf_img_list)
        if img_count < 1:
            raise ValueError(
                'No image for inference, please check inf_image_dir and inf_image_or_pattern in %s'
                % area_ini)

        area_save_dir = os.path.join(
            outdir, area_name + '_' + area_remark + '_' + area_time)
        io_function.mkdir(area_save_dir)

        # parallel inference images for this area
        CUDA_VISIBLE_DEVICES = []
        if 'CUDA_VISIBLE_DEVICES' in os.environ.keys():
            CUDA_VISIBLE_DEVICES = [
                int(item.strip())
                for item in os.environ['CUDA_VISIBLE_DEVICES'].split(',')
            ]
        idx = 0
        while idx < img_count:

            if b_use_multiGPUs:
                # get available GPUs  # https://github.com/anderskm/gputil
                # memory: orders the available GPU device ids by ascending memory usage
                deviceIDs = GPUtil.getAvailable(order='memory',
                                                limit=100,
                                                maxLoad=0.5,
                                                maxMemory=0.5,
                                                includeNan=False,
                                                excludeID=[],
                                                excludeUUID=[])
                # only use the ones in CUDA_VISIBLE_DEVICES
                if len(CUDA_VISIBLE_DEVICES) > 0:
                    deviceIDs = [
                        item for item in deviceIDs
                        if item in CUDA_VISIBLE_DEVICES
                    ]
                    basic.outputlogMessage('on ' + machine_name +
                                           ', available GPUs:' +
                                           str(deviceIDs) +
                                           ', among visible ones:' +
                                           str(CUDA_VISIBLE_DEVICES))
                else:
                    basic.outputlogMessage('on ' + machine_name +
                                           ', available GPUs:' +
                                           str(deviceIDs))

                if len(deviceIDs) < 1:
                    # wait 60 seconds (mmseg needs a longer time to load models), then check the available GPUs again
                    time.sleep(60)
                    continue
                # use only the first available visible GPU
                gpuid = deviceIDs[0]
                basic.outputlogMessage(
                    '%d: predict image %s on GPU %d of %s' %
                    (idx, inf_img_list[idx], gpuid, machine_name))
            else:
                gpuid = None
                basic.outputlogMessage('%d: predict image %s on %s' %
                                       (idx, inf_img_list[idx], machine_name))

            # run inference
            img_save_dir = os.path.join(area_save_dir, 'I%d' % idx)
            inf_list_file = os.path.join(area_save_dir, '%d.txt' % idx)

            done_indicator = '%s_done' % inf_list_file
            if os.path.isfile(done_indicator):
                basic.outputlogMessage('warning, %s exist, skip prediction' %
                                       done_indicator)
                idx += 1
                continue

            # if it already exists, then skip
            if os.path.isdir(img_save_dir) and is_file_exist_in_folder(
                    img_save_dir):
                basic.outputlogMessage(
                    'folder of %dth image (%s) already exist, '
                    'it has been predicted or is being predicted' %
                    (idx, inf_img_list[idx]))
                idx += 1
                continue

            with open(inf_list_file, 'w') as inf_obj:
                inf_obj.writelines(inf_img_list[idx] + '\n')

            sub_process = Process(target=predict_one_image_mmseg,
                                  args=(para_file, inf_img_list[idx],
                                        img_save_dir, inf_list_file, gpuid,
                                        trained_model))
            sub_process.start()
            sub_tasks.append(sub_process)

            if b_use_multiGPUs is False:
                # wait until previous one finished
                while sub_process.is_alive():
                    time.sleep(1)

            idx += 1

            # wait until predicted image patches exist or exceed 20 minutes
            time0 = time.time()
            elapsed_time = time.time() - time0
            while elapsed_time < 20 * 60:
                elapsed_time = time.time() - time0
                file_exist = os.path.isdir(
                    img_save_dir) and is_file_exist_in_folder(img_save_dir)
                if file_exist is True or sub_process.is_alive() is False:
                    break
                else:
                    time.sleep(1)

            if sub_process.exitcode is not None and sub_process.exitcode != 0:
                sys.exit(1)

            basic.close_remove_completed_process(sub_tasks)
            # if 'chpc' in machine_name:
            #     time.sleep(60)  # wait 60 second on ITSC services
            # else:
            #     time.sleep(10)

    # check that all the tasks have finished
    wait_all_finish = 0
    while basic.b_all_process_finish(sub_tasks) is False:
        if wait_all_finish % 100 == 0:
            basic.outputlogMessage('wait all tasks to finish')
        time.sleep(1)
        wait_all_finish += 1

    basic.close_remove_completed_process(sub_tasks)
    end_time = datetime.now()

    diff_time = end_time - start_time
    out_str = "%s: time cost of total parallel inference on %s: %d seconds" % (
        str(end_time), machine_name, diff_time.seconds)
    basic.outputlogMessage(out_str)
    with open("time_cost.txt", 'a') as t_obj:
        t_obj.writelines(out_str + '\n')
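The dispatch loop in Example #6 effectively caps concurrency at the number of free GPUs: it launches one prediction process per free slot and, when none is free, polls the running processes with is_alive() until one of them exits. A simplified sketch of that throttling pattern; predict() and MAX_PARALLEL are placeholders for predict_one_image_mmseg and the GPUtil-based slot selection:

import time
from multiprocessing import Process

MAX_PARALLEL = 2  # stand-in for the number of free GPUs


def predict(image):
    time.sleep(1)  # stand-in for predict_one_image_mmseg


if __name__ == "__main__":
    images = ["img_%d.tif" % i for i in range(6)]
    running = []
    for image in images:
        # Block until a slot frees up.
        while len(running) >= MAX_PARALLEL:
            running = [p for p in running if p.is_alive()]
            time.sleep(0.5)
        p = Process(target=predict, args=(image,))
        p.start()
        running.append(p)

    # Wait for the remaining jobs to finish.
    for p in running:
        p.join()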