Example #1
def init_process_group(strategy=None):
    nranks = ParallelEnv().nranks
    rank = ParallelEnv().local_rank
    is_master = rank == 0
    pg_group = dist.init_parallel_env()

    return pg_group.process_group
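Snippets like this assume the process was started by Paddle's distributed launcher, which sets the environment variables (PADDLE_TRAINER_ID, PADDLE_TRAINERS_NUM, ...) that ParallelEnv() reads. A minimal driver sketch, assuming a hypothetical script name train.py and two visible GPUs:

# launched as: python -m paddle.distributed.launch --gpus "0,1" train.py
import paddle.distributed as dist
from paddle.distributed import ParallelEnv

def main():
    dist.init_parallel_env()  # wires up the process group from the launcher's env
    env = ParallelEnv()
    print("rank {} / {} on device {}".format(env.local_rank, env.nranks, env.dev_id))

if __name__ == "__main__":
    main()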
Example #2
def get_path_from_url(url, md5sum=None, check_exist=True):
    """ Download from given url to root_dir.
    if file or directory specified by url is exists under
    root_dir, return the path directly, otherwise download
    from url and decompress it, return the path.

    Args:
        url (str): download url
        md5sum (str): md5 sum of download package

    Returns:
        str: a local path to save downloaded models & weights & datasets.
    """

    from paddle.fluid.dygraph.parallel import ParallelEnv

    assert is_url(url), "downloading from {} not a url".format(url)
    root_dir = PPGAN_HOME
    # parse path after download to decompress under root_dir
    fullpath = _map_path(url, root_dir)

    if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum):
        logger.info("Found {}".format(fullpath))
    else:
        if ParallelEnv().local_rank == 0:
            fullpath = _download(url, root_dir, md5sum)
        else:
            while not os.path.exists(fullpath):
                time.sleep(1)

    if ParallelEnv().local_rank == 0:
        if tarfile.is_tarfile(fullpath) or zipfile.is_zipfile(fullpath):
            fullpath = _decompress(fullpath)

    return fullpath
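The snippet above shows a pattern that recurs throughout these examples: only local rank 0 downloads (and later decompresses), while the other workers poll the filesystem until the file appears. A minimal sketch of that pattern in isolation, assuming a shared filesystem and a hypothetical build_fn that creates the artifact at path:

import os
import time

from paddle.fluid.dygraph.parallel import ParallelEnv

def rank_zero_prepare(path, build_fn):
    # one process per node creates the artifact; the rest wait for it to exist
    if ParallelEnv().local_rank == 0:
        build_fn(path)
    else:
        while not os.path.exists(path):
            time.sleep(1)
    return path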
Example #3
File: model.py  Project: huangjun12/hapi
    def __init__(self, model):
        super(StaticGraphAdapter, self).__init__()
        self.model = model
        # with `_build_once` gone, parameters are now created in `__init__`
        # so we need to keep track of the parameters already created
        self._startup_prog = fluid.default_startup_program()
        self._orig_prog = fluid.default_main_program()

        self._label_vars = {}  # label variables
        self._input_vars = {}  # input variables
        self._endpoints = {}
        self._loss_endpoint = None
        self._executor = None
        self._progs = {}
        self._compiled_progs = {}

        self._merge_count = {
            'eval_total': 0,
            'test_total': 0,
            'eval_batch': 0,
            'test_batch': 0
        }

        self._nranks = ParallelEnv().nranks
        self._local_rank = ParallelEnv().local_rank
Example #4
def get_path(url, root_dir, md5sum=None, check_exist=True):
    """ Download from given url to root_dir.
    if file or directory specified by url is exists under
    root_dir, return the path directly, otherwise download
    from url and decompress it, return the path.

    url (str): download url
    root_dir (str): root dir for downloading, it should be
                    WEIGHTS_HOME or DATASET_HOME
    md5sum (str): md5 sum of download package
    """
    assert is_url(url), "downloading from {} not a url".format(url)
    # parse path after download to decompress under root_dir
    fullpath = map_path(url, root_dir)

    exist_flag = False
    if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum):
        exist_flag = True
        if ParallelEnv().local_rank == 0:
            logger.info("Found {}".format(fullpath))
    else:
        if ParallelEnv().local_rank == 0:
            fullpath = _download(url, root_dir, md5sum)
        else:
            while not os.path.exists(fullpath):
                time.sleep(1)
    return fullpath, exist_flag
Example #5
def init_process_group(strategy=None):
    nranks = ParallelEnv().nranks
    rank = ParallelEnv().local_rank
    is_master = rank == 0
    store = paddle.fluid.core.TCPStore("127.0.0.1", 6173, is_master, nranks)
    pg_group = core.ProcessGroupHCCL(store, rank, nranks)

    return pg_group
Example #6
    def prepare_leveldb(self,
                        input_file,
                        leveldb_file,
                        label_list,
                        max_seq_length,
                        tokenizer,
                        line_processor=None,
                        delimiter="\t",
                        quotechar=None):
        def default_line_processor(line_id, line):
            assert len(line) == 2
            text_a = line[0]
            label = line[1]

            return BertInputExample(str(line_id),
                                    text_a=text_a,
                                    text_b=None,
                                    label=label)

        if line_processor is None:
            line_processor = default_line_processor

        if ParallelEnv().nranks > 1:
            leveldb_file = leveldb_file + "_" + str(ParallelEnv().local_rank)

        if not os.path.exists(leveldb_file):
            print("putting data %s into leveldb %s" %
                  (input_file, leveldb_file))
            _example_num = 0
            _db = leveldb.LevelDB(leveldb_file, create_if_missing=True)
            with io.open(input_file, "r", encoding="utf8") as f:
                reader = csv.reader(f,
                                    delimiter=delimiter,
                                    quotechar=quotechar)
                line_id = 0
                for (_line_id, line) in enumerate(reader):
                    if line_processor(str(_line_id), line) is None:
                        continue

                    line_str = delimiter.join(line)
                    _db.Put(
                        str(line_id).encode("utf8"), line_str.encode("utf8"))
                    line_id += 1
                    _example_num += 1
            _db.Put("_example_num_".encode("utf8"),
                    str(_example_num).encode("utf8"))
        else:
            _db = leveldb.LevelDB(leveldb_file, create_if_missing=False)

        self.label_list = label_list
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        self.delimiter = delimiter
        self._db = _db
        self._line_processor = line_processor
Example #7
def main():
    device = set_device(FLAGS.device)
    fluid.enable_dygraph(device) if FLAGS.dynamic else None

    model_list = [x for x in models.__dict__["__all__"]]
    assert FLAGS.arch in model_list, "Expected FLAGS.arch in {}, but received {}".format(
        model_list, FLAGS.arch)
    model = models.__dict__[FLAGS.arch](
        pretrained=FLAGS.eval_only and not FLAGS.resume)

    if FLAGS.resume is not None:
        model.load(FLAGS.resume)

    inputs = [Input([None, 3, 224, 224], 'float32', name='image')]
    labels = [Input([None, 1], 'int64', name='label')]

    train_dataset = ImageNetDataset(os.path.join(FLAGS.data, 'train'),
                                    mode='train',
                                    image_size=FLAGS.image_size,
                                    resize_short_size=FLAGS.resize_short_size)

    val_dataset = ImageNetDataset(os.path.join(FLAGS.data, 'val'),
                                  mode='val',
                                  image_size=FLAGS.image_size,
                                  resize_short_size=FLAGS.resize_short_size)

    optim = make_optimizer(np.ceil(
        len(train_dataset) * 1. / FLAGS.batch_size / ParallelEnv().nranks),
                           parameter_list=model.parameters())

    model.prepare(optim, CrossEntropy(), Accuracy(topk=(1, 5)), inputs, labels,
                  FLAGS.device)

    if FLAGS.eval_only:
        model.evaluate(val_dataset,
                       batch_size=FLAGS.batch_size,
                       num_workers=FLAGS.num_workers)
        return

    output_dir = os.path.join(
        FLAGS.output_dir, FLAGS.arch,
        time.strftime('%Y-%m-%d-%H-%M', time.localtime()))
    if ParallelEnv().local_rank == 0 and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    model.fit(train_dataset,
              val_dataset,
              batch_size=FLAGS.batch_size,
              epochs=FLAGS.epoch,
              save_dir=output_dir,
              num_workers=FLAGS.num_workers)
Example #8
def get_path_from_url(url,
                      root_dir,
                      md5sum=None,
                      check_exist=True,
                      decompress=True,
                      method='get'):
    """ Download from given url to root_dir.
    if file or directory specified by url is exists under
    root_dir, return the path directly, otherwise download
    from url and decompress it, return the path.

    Args:
        url (str): download url
        root_dir (str): root dir for downloading, it should be
                        WEIGHTS_HOME or DATASET_HOME
        md5sum (str): md5 sum of download package
        decompress (bool): decompress zip or tar file. Default is `True`
        method (str): which download method to use. Support `wget` and `get`. Default is `get`.

    Returns:
        str: a local path to save downloaded models & weights & datasets.
    """

    from paddle.fluid.dygraph.parallel import ParallelEnv

    assert is_url(url), "downloading from {} not a url".format(url)
    # parse path after download to decompress under root_dir
    fullpath = _map_path(url, root_dir)
    # Mainly used to solve the problem of downloading data from different
    # machines in the case of multiple machines. Different ips will download
    # data, and the same ip will only download data once.
    unique_endpoints = _get_unique_endpoints(
        ParallelEnv().trainer_endpoints[:])
    if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum):
        logger.info("Found {}".format(fullpath))
    else:
        if ParallelEnv().current_endpoint in unique_endpoints:
            fullpath = _download(url, root_dir, md5sum, method=method)
        else:
            while not os.path.exists(fullpath):
                time.sleep(1)

    if ParallelEnv().current_endpoint in unique_endpoints:
        if decompress and (tarfile.is_tarfile(fullpath)
                           or zipfile.is_zipfile(fullpath)):
            fullpath = _decompress(fullpath)

    return fullpath
Example #9
File: val.py  Project: pennypm/PaddleSeg
def main(args):
    env_info = get_environ_info()
    places = fluid.CUDAPlace(ParallelEnv().dev_id) \
        if env_info['place'] == 'cuda' and fluid.is_compiled_with_cuda() \
        else fluid.CPUPlace()

    if args.dataset not in DATASETS:
        raise Exception(
            '`--dataset` is invalid. it should be one of {}'.format(
                str(list(DATASETS.keys()))))
    dataset = DATASETS[args.dataset]

    with fluid.dygraph.guard(places):
        eval_transforms = T.Compose([T.Resize(args.input_size), T.Normalize()])
        eval_dataset = dataset(dataset_root=args.dataset_root,
                               transforms=eval_transforms,
                               mode='val')

        if args.model_name not in MODELS:
            raise Exception(
                '`--model_name` is invalid. it should be one of {}'.format(
                    str(list(MODELS.keys()))))
        model = MODELS[args.model_name](num_classes=eval_dataset.num_classes)

        evaluate(model,
                 eval_dataset,
                 model_dir=args.model_dir,
                 num_classes=eval_dataset.num_classes)
Example #10
    def on_epoch_begin(self, epoch=None, logs=None):
        self.steps = self.params['steps']
        self.epoch = epoch
        self.train_step = 0
        if self.verbose and self.epochs and ParallelEnv().local_rank == 0:
            print('Epoch %d/%d' % (epoch + 1, self.epochs))
        self.train_progbar = ProgressBar(num=self.steps, verbose=self.verbose)
Example #11
def main(args):
    env_info = get_environ_info()
    places = fluid.CUDAPlace(ParallelEnv().dev_id) \
        if env_info['Paddle compiled with cuda'] and env_info['GPUs used'] \
        else fluid.CPUPlace()

    if args.dataset not in DATASETS:
        raise Exception(
            '`--dataset` is invalid. it should be one of {}'.format(
                str(list(DATASETS.keys()))))
    dataset = DATASETS[args.dataset]

    with fluid.dygraph.guard(places):
        test_transforms = T.Compose([T.Resize(args.input_size), T.Normalize()])
        test_dataset = dataset(dataset_root=args.dataset_root,
                               transforms=test_transforms,
                               mode='test')

        model = manager.MODELS[args.model_name](
            num_classes=test_dataset.num_classes)

        infer(model,
              model_dir=args.model_dir,
              test_dataset=test_dataset,
              save_dir=args.save_dir)
Example #12
def _md5check(fullname, md5sum=None):
    if md5sum is None:
        return True
    if ParallelEnv().local_rank == 0:
        logger.info("File {} md5 checking...".format(fullname))
    md5 = hashlib.md5()
    with open(fullname, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b""):
            md5.update(chunk)
    calc_md5sum = md5.hexdigest()

    if calc_md5sum != md5sum:
        if ParallelEnv().local_rank == 0:
            logger.info("File {} md5 check failed, {}(calc) != "
                        "{}(base)".format(fullname, calc_md5sum, md5sum))
        return False
    return True
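The md5sum argument in these download helpers is just the expected hex digest of the file; a small sketch of computing one to pass in (the file name here is hypothetical):

import hashlib

with open("weights.pdparams", "rb") as f:
    expected_md5 = hashlib.md5(f.read()).hexdigest()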
Example #13
    def on_train_batch_end(self, step, logs=None):
        logs = logs or {}
        self.train_step += 1

        if (self.train_step % self.log_freq == 0 and self.verbose
                and ParallelEnv().local_rank == 0):
            if self.steps is None or self.train_step < self.steps:
                self._updates(logs, 'train')
Example #14
File: main.py  Project: heavengate/hapi
def main():
    device = paddle.set_device(FLAGS.device)
    paddle.disable_static(device) if FLAGS.dynamic else None

    train_transform = Compose([
        GroupScale(),
        GroupMultiScaleCrop(),
        GroupRandomCrop(),
        GroupRandomFlip(),
        NormalizeImage()
    ])
    train_dataset = KineticsDataset(
        file_list=os.path.join(FLAGS.data, 'train_10.list'),
        pickle_dir=os.path.join(FLAGS.data, 'train_10'),
        label_list=os.path.join(FLAGS.data, 'label_list'),
        transform=train_transform)
    val_transform = Compose(
        [GroupScale(), GroupCenterCrop(),
         NormalizeImage()])
    val_dataset = KineticsDataset(
        file_list=os.path.join(FLAGS.data, 'val_10.list'),
        pickle_dir=os.path.join(FLAGS.data, 'val_10'),
        label_list=os.path.join(FLAGS.data, 'label_list'),
        mode='val',
        transform=val_transform)

    pretrained = FLAGS.eval_only and FLAGS.weights is None
    model = tsm_resnet50(num_classes=train_dataset.num_classes,
                         pretrained=pretrained)

    step_per_epoch = int(len(train_dataset) / FLAGS.batch_size \
                         / ParallelEnv().nranks)
    optim = make_optimizer(step_per_epoch, model.parameters())

    model.prepare(optimizer=optim,
                  loss=paddle.nn.CrossEntropyLoss(),
                  metrics=paddle.metric.Accuracy(topk=(1, 5)))

    if FLAGS.eval_only:
        if FLAGS.weights is not None:
            model.load(FLAGS.weights, reset_optimizer=True)

        model.evaluate(val_dataset,
                       batch_size=FLAGS.batch_size,
                       num_workers=FLAGS.num_workers)
        return

    if FLAGS.resume is not None:
        model.load(FLAGS.resume)

    model.fit(train_data=train_dataset,
              eval_data=val_dataset,
              epochs=FLAGS.epoch,
              batch_size=FLAGS.batch_size,
              save_dir=FLAGS.save_dir or 'tsm_checkpoint',
              num_workers=FLAGS.num_workers,
              drop_last=True,
              shuffle=True)
Example #15
    def on_eval_begin(self, logs=None):
        self.eval_steps = logs.get('steps', None)
        self.eval_metrics = logs.get('metrics_name', [])
        self.eval_step = 0
        self.evaled_samples = 0
        self.eval_progbar = ProgressBar(num=self.eval_steps,
                                        verbose=self.verbose)
        if ParallelEnv().local_rank == 0:
            print('Eval begin...')
Example #16
File: model.py  Project: huangjun12/hapi
def set_device(device):
    assert isinstance(device, six.string_types) and device.lower() in ['cpu', 'gpu'], \
    "Expected device in ['cpu', 'gpu'], but got {}".format(device)

    place = fluid.CUDAPlace(ParallelEnv().dev_id) \
            if device.lower() == 'gpu' and fluid.is_compiled_with_cuda() \
                else fluid.CPUPlace()

    return place
Example #17
def log(level=2, message=""):
    if ParallelEnv().local_rank == 0:
        current_time = time.time()
        time_array = time.localtime(current_time)
        current_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
        if log_level >= level:
            print(
                "{} [{}]\t{}".format(current_time, levels[level],
                                     message).encode("utf-8").decode("latin1"))
            sys.stdout.flush()
Example #18
    def __init__(self,
                 dataset,
                 batch_size,
                 shuffle=False,
                 drop_last=True,
                 seed=None):
        self._dataset = dataset
        self._batch_size = batch_size
        self._shuffle = shuffle
        self._drop_last = drop_last
        self._random = np.random
        self._random.seed(seed)
        self._nranks = ParallelEnv().nranks
        self._local_rank = ParallelEnv().local_rank
        self._device_id = ParallelEnv().dev_id
        self._num_samples = int(
            math.ceil(len(self._dataset) * 1.0 / self._nranks))
        self._total_size = self._num_samples * self._nranks
        self._epoch = 0
Example #19
    def on_eval_batch_end(self, step, logs=None):
        logs = logs or {}
        self.eval_step += 1
        samples = logs.get('batch_size', 1)
        self.evaled_samples += samples

        if (self.eval_step % self.log_freq == 0 and self.verbose
                and ParallelEnv().local_rank == 0):
            if self.eval_steps is None or self.eval_step < self.eval_steps:
                self._updates(logs, 'eval')
Example #20
    def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
        self.dataset = dataset

        assert isinstance(batch_size, int) and batch_size > 0, \
                "batch_size should be a positive integer"
        self.batch_size = batch_size
        assert isinstance(shuffle, bool), \
                "shuffle should be a boolean value"
        self.shuffle = shuffle
        assert isinstance(drop_last, bool), \
                "drop_last should be a boolean number"

        self.drop_last = drop_last
        self.nranks = ParallelEnv().nranks
        self.local_rank = ParallelEnv().local_rank
        self.epoch = 0
        self.num_samples = int(math.ceil(
            len(self.dataset) * 1.0 / self.nranks))
        self.total_size = self.num_samples * self.nranks
Example #21
    def __init__(self,
                 dataset,
                 batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=False,
                 drop_last=False,
                 consumed_samples=0):
        self.dataset = dataset

        assert isinstance(batch_size, int) and batch_size > 0, \
                "batch_size should be a positive integer"
        self.batch_size = batch_size
        assert isinstance(shuffle, bool), \
                "shuffle should be a boolean value"
        self.shuffle = shuffle
        assert isinstance(drop_last, bool), \
                "drop_last should be a boolean number"

        from paddle.fluid.dygraph.parallel import ParallelEnv

        if num_replicas is not None:
            assert isinstance(num_replicas, int) and num_replicas > 0, \
                    "num_replicas should be a positive integer"
            self.nranks = num_replicas
        else:
            self.nranks = ParallelEnv().nranks

        if rank is not None:
            assert isinstance(rank, int) and rank >= 0, \
                    "rank should be a non-negative integer"
            self.local_rank = rank
        else:
            self.local_rank = ParallelEnv().local_rank

        self.drop_last = drop_last
        self.epoch = 0

        self.consumed_samples = consumed_samples
        self.num_samples = int(math.ceil(
            len(self.dataset) * 1.0 / self.nranks))
        self.total_size = self.num_samples * self.nranks
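Examples #18, #20 and #21 all size the per-rank partition the same way: round the dataset length up to a multiple of nranks so every rank sees the same number of samples. A quick worked sketch of that arithmetic (not library code):

import math

dataset_len, nranks = 10, 4
num_samples = int(math.ceil(dataset_len * 1.0 / nranks))  # 3 samples per rank
total_size = num_samples * nranks                         # 12, so 2 indices are repeated as padding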
Example #22
    def on_eval_batch_end(self, step, logs=None):
        logs = logs or {}
        self.eval_step = step
        samples = logs.get('batch_size', 1)
        self.evaled_samples += samples

        if (self.eval_step % self.log_freq == 0 and self.verbose
                and ParallelEnv().local_rank == 0):
            # if steps is not None, last step will update in on_epoch_end
            if self.eval_steps and self.eval_step < self.eval_steps:
                self._updates(logs, 'eval')
Example #23
    def on_train_batch_end(self, step, logs=None):
        logs = logs or {}
        self.train_step += 1

        if (self.train_step % self.log_freq == 0 and self.verbose
                and ParallelEnv().local_rank == 0):
            # if steps is not None, last step will update in on_epoch_end
            if self.steps and self.train_step < self.steps:
                self._updates(logs, 'train')
            else:
                self._updates(logs, 'train')
Example #24
def test_dygraph_gloo_init():
    """test gloo init and broadcast"""
    paddle.distributed.init_parallel_env()
    if ParallelEnv().local_rank == 0:
        np_data = np.array([4, 5])
    else:
        np_data = np.array([1, 2])
    data = paddle.to_tensor(np_data)
    paddle.distributed.broadcast(data, 1)
    res = data.numpy()
    assert np.array_equal(res, np.array([1, 2]))
Example #25
File: reader.py  Project: zhengya01/hapi
    def __init__(self,
                 dataset,
                 batch_size,
                 pool_size=10000,
                 sort_type=SortType.NONE,
                 min_length=0,
                 max_length=100,
                 shuffle=False,
                 shuffle_batch=False,
                 use_token_batch=False,
                 clip_last_batch=False,
                 distribute_mode=True,
                 seed=0):
        for arg, value in locals().items():
            if arg != "self":
                setattr(self, "_" + arg, value)
        self._random = np.random
        self._random.seed(seed)
        # for multi-devices
        self._distribute_mode = distribute_mode
        self._nranks = ParallelEnv().nranks
        self._local_rank = ParallelEnv().local_rank
        self._device_id = ParallelEnv().dev_id
Example #26
def _download(url, path, md5sum=None):
    """
    Download from url, save to path.

    url (str): download url
    path (str): download to given path
    """
    if not osp.exists(path):
        os.makedirs(path)

    fname = osp.split(url)[-1]
    fullname = osp.join(path, fname)
    retry_cnt = 0

    while not (osp.exists(fullname) and _md5check(fullname, md5sum)):
        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
            retry_cnt += 1
        else:
            raise RuntimeError("Download from {} failed. "
                               "Retry limit reached".format(url))
        if ParallelEnv().local_rank == 0:
            logger.info("Downloading {} from {}".format(fname, url))

        req = requests.get(url, stream=True)
        if req.status_code != 200:
            raise RuntimeError("Downloading from {} failed with code "
                               "{}!".format(url, req.status_code))

        # To protect against interrupted downloads, download to
        # tmp_fullname first, then move tmp_fullname to fullname
        # once the download has finished
        tmp_fullname = fullname + "_tmp"
        total_size = req.headers.get('content-length')
        with open(tmp_fullname, 'wb') as f:
            if total_size:
                for chunk in tqdm.tqdm(
                        req.iter_content(chunk_size=1024),
                        total=(int(total_size) + 1023) // 1024,
                        unit='KB'):
                    f.write(chunk)
            else:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        shutil.move(tmp_fullname, fullname)

    return fullname
Example #27
def set_device(device):
    """
    Paddle supports running calculations on various types of devices, including CPU and GPU.
    They are represented by string identifiers. This function can specify the global device
    on which the OPs will run.

    Parameters:
        device(str): This parameter determines the specific running device.
            It can be ``cpu`` or ``gpu:0``. When ``device`` is ``cpu``, the
            program is running on the cpu. When ``device`` is ``gpu``, the
            program is running on the gpu.
    Examples:

     .. code-block:: python
            
        import paddle
        paddle.disable_static()
        paddle.set_device("cpu")
        x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32')
        x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32')
        data = paddle.stack([x1,x2], axis=1)
    """
    lower_device = device.lower()
    if lower_device == 'cpu':
        place = core.CPUPlace()
    elif lower_device == 'gpu':
        if not core.is_compiled_with_cuda():
            raise ValueError(
                "The device should not be 'gpu', " \
                "since PaddlePaddle is not compiled with CUDA")
        place = core.CUDAPlace(ParallelEnv().dev_id)
    else:
        avaliable_device = re.match(r'gpu:\d+', lower_device)
        if not avaliable_device:
            raise ValueError(
                "The device must be a string which is like 'cpu', 'gpu' or 'gpu:0'"
            )
        if not core.is_compiled_with_cuda():
            raise ValueError(
                "The device should not be {}, since PaddlePaddle is " \
                "not compiled with CUDA".format(avaliable_device))
        device_info_list = device.split(':', 1)
        device_id = device_info_list[1]
        device_id = int(device_id)
        place = core.CUDAPlace(device_id)
    framework._set_expected_place(place)
    return place
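A short usage sketch for the function above, assuming a CUDA build of Paddle: under the distributed launcher, 'gpu' resolves to each worker's own card via ParallelEnv().dev_id, so every process lands on a different device.

import paddle

place = paddle.set_device("gpu")  # worker-local GPU when launched distributed
x = paddle.ones(shape=[2, 2], dtype="float32")
print(place, x.numpy())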
Example #28
def setup_logger(output=None, name="hapi", log_level=logging.INFO):
    """
    Initialize logger of hapi and set its verbosity level to "INFO".

    Args:
        output (str): a file name or a directory to save log. If None, will not save log file.
            If ends with ".txt" or ".log", assumed to be a file name.
            Otherwise, logs will be saved to `output/log.txt`.
        name (str): the root module name of this logger. Default: 'hapi'.
        log_level (enum): log level. eg.'INFO', 'DEBUG', 'ERROR'. Default: logging.INFO.
    Returns:
        logging.Logger: a logger
    """
    logger = logging.getLogger(name)
    logger.propagate = False
    logger.setLevel(log_level)

    format_str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    # stdout logging: only local rank==0
    local_rank = ParallelEnv().local_rank
    if local_rank == 0 and len(logger.handlers) == 0:
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(log_level)

        ch.setFormatter(logging.Formatter(format_str))
        logger.addHandler(ch)

    # file logging if output is not None: all workers
    if output is not None:
        if output.endswith(".txt") or output.endswith(".log"):
            filename = output
        else:
            filename = os.path.join(output, "log.txt")

        if local_rank > 0:
            filename = filename + ".rank{}".format(local_rank)

        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))

        fh = logging.FileHandler(filename)  # FileHandler so logs actually go to the file path
        fh.setLevel(log_level)
        fh.setFormatter(logging.Formatter(format_str))
        logger.addHandler(fh)

    return logger
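A usage sketch for the helper above: rank 0 echoes to stdout, and every rank writes its own file (non-zero ranks get a .rank{N} suffix appended to the file name).

logger = setup_logger(output="./output", name="hapi")
logger.info("worker %d ready", ParallelEnv().local_rank)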
Example #29
def get_world_size():
    """
    Returns the number of trainers (number of processes participating in current job).

    Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . 
    The default value is 1.

    Returns:
        (int) The number of trainers.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.distributed as dist

            # execute this command in terminal: export PADDLE_TRAINERS_NUM=4
            print("The world_size is %d" % dist.get_world_size())
            # The world_size is 4
    """
    return ParallelEnv().world_size
Example #30
def get_rank():
    """
    Returns the rank of current trainer.

    Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . 
    The default value is 0.

    Returns:
        (int) The rank of current trainer.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.distributed as dist

            # execute this command in terminal: export PADDLE_TRAINER_ID=0
            print("The rank is %d" % dist.get_rank())
            # The rank is 0
    """
    return ParallelEnv().rank