示例#1
0
def initialize_horovod():
    """Set up Horovod-based data parallelism when it is available.

    Falls back to a single-process configuration (index 0 of 1) when
    CUDA is unavailable or the horovod package is not installed.

    Returns:
        tuple: ``(part_index, part_num)`` — this process's rank and the
        total number of participating processes.
    """
    horovod_installed = importlib.util.find_spec("horovod") is not None
    # Single-process defaults; overridden below when Horovod is active.
    part_index, part_num = 0, 1
    if torch.cuda.is_available() and horovod_installed:
        import horovod.torch as hvd
        hvd.init()
        # Pin this process to the GPU matching its local rank.
        torch.cuda.set_device(hvd.local_rank())
        part_index, part_num = hvd.rank(), hvd.size()
    if is_root_node():
        print("Running on {} GPUs".format(part_num))
    return part_index, part_num
示例#2
0
def initialize_trains(arg_parser, project_name, tag):
    """Initialize TRAINS experiment tracking and the tensorboard log dir.

    Only the root node creates the TRAINS task and the log directory;
    other nodes — and runs with ``OPTS.tensorboard`` disabled — get
    ``None`` back.  The created task is also stored on
    ``OPTS.trains_task`` as a side effect.

    Args:
        arg_parser: argparse parser whose arguments are logged to TRAINS.
        project_name: TRAINS project name.
        tag: task name inside the project.

    Returns:
        Path of the tensorboard log directory, or ``None`` when logging
        is disabled or this is not the root node.
    """
    tb_logdir = None
    OPTS.trains_task = None
    if is_root_node():
        if OPTS.tensorboard:
            try:
                from trains import Task
                task = Task.init(project_name=project_name,
                                 task_name=tag,
                                 auto_connect_arg_parser=False,
                                 output_uri="{}/data/model_backups".format(
                                     os.getenv("HOME")))
                task.connect(arg_parser)
                task.set_random_seed(OPTS.seed)
                OPTS.trains_task = task
            except SystemError as e:
                # Tracking is best-effort: report the failure and continue.
                print(e)
            tb_logdir = os.path.join(OPTS.root, "tensorboard")
            # makedirs avoids the check-then-create race of the previous
            # exists()+mkdir pair and also creates missing parents of
            # OPTS.root, where os.mkdir would raise.
            os.makedirs(tb_logdir, exist_ok=True)
    return tb_logdir
示例#3
0
# Distributed setup: shard across Horovod workers when CUDA and the
# horovod package are both available, otherwise run single-process.
if torch.cuda.is_available() and horovod_installed:
    import horovod.torch as hvd
    hvd.init()
    # Pin this process to the GPU matching its local rank.
    torch.cuda.set_device(hvd.local_rank())
    part_index = hvd.rank()
    part_num = hvd.size()
    gpu_num = hvd.size()
else:
    # Single-process fallback.
    part_index = 0
    part_num = 1
    gpu_num = 1

# Tensorboard Logging
tb_logdir = None
OPTS.trains_task = None
if is_root_node():
    print("Running on {} GPUs".format(gpu_num))
    if OPTS.tensorboard:
        try:
            # TRAINS (now ClearML) experiment tracking; optional dependency.
            from trains import Task
            task = Task.init(project_name="lanmt2",
                             task_name=OPTS.result_tag,
                             auto_connect_arg_parser=False,
                             output_uri=OPTS.root)
            task.connect(ap)
            task.set_random_seed(OPTS.seed)
            task.set_output_model_id(OPTS.model_tag)
            OPTS.trains_task = task
        except:
            # NOTE(review): bare except silently swallows everything,
            # including KeyboardInterrupt/SystemExit — consider narrowing
            # (the sibling fragment below catches SystemError and prints it).
            pass
        if envswitch.who() != "shu":
            # NOTE(review): the body of this branch is truncated in this
            # view of the file; nothing can be documented about it here.
示例#4
0
# Multi-GPU setup: distribute with Horovod when CUDA and the horovod
# package are present; otherwise fall back to a single process.
if torch.cuda.is_available() and horovod_installed:
    import horovod.torch as hvd
    hvd.init()
    # Bind this process to the GPU matching its local rank.
    torch.cuda.set_device(hvd.local_rank())
    part_index, part_num = hvd.rank(), hvd.size()
    gpu_num = part_num
else:
    part_index, part_num, gpu_num = 0, 1, 1

# Tensorboard Logging
tb_logdir = None
OPTS.trains_task = None
if is_root_node():
    print("Running on {} GPUs".format(gpu_num))
    if OPTS.tensorboard:
        try:
            # TRAINS (now ClearML) experiment tracking; optional dependency.
            from trains import Task
            task = Task.init(project_name="EBM_LM",
                             task_name=OPTS.result_tag,
                             auto_connect_arg_parser=False,
                             output_uri="{}/data/model_backups".format(
                                 os.getenv("HOME")))
            task.connect(ap)
            task.set_random_seed(OPTS.seed)
            OPTS.trains_task = task
        except SystemError as err:
            # Tracking is best-effort: report the failure and continue.
            print(err)
示例#5
0
    def batch_translate(self, input_path: str, output_path: str, field: int = 0, remove_subword_tokens: bool = True, max_length: int = 100, resume: bool = False):
        """Translate a file line by line.

        Each input line is a tab-separated record whose column *field*
        holds the source sentence.  In multi-GPU mode the lines are
        round-robin sharded across Horovod workers; each worker writes
        "id<TAB>translation" pairs to a per-rank file under /tmp, and
        rank 0 merges them back into *output_path* in input order.

        Args:
            input_path: file with one record per line.
            output_path: destination file, one translation per line.
            field: tab-separated column holding the source sentence.
            remove_subword_tokens: undo sentencepiece ("▁") or BPE
                ("@@ ") segmentation in each result.
            max_length: inputs longer than this many whitespace tokens
                are not translated; "x" is emitted as a placeholder.
            resume: in multi-GPU mode, reload results already present in
                this rank's temp file instead of recomputing them.
        """
        # Check whether using multiple GPUs
        try:
            import horovod.torch as hvd
        except ImportError:
            # hvd is only referenced below when self._is_multigpu is set,
            # so a missing horovod is fine in single-GPU mode.
            pass
        # If using multigpu, then separate the input file
        if self._is_multigpu:
            sync_tensor = torch.tensor(0)
            # NOTE(review): sharding and file naming key on local_rank(),
            # which repeats across machines — presumably this runs on a
            # single node; multi-node runs would need rank(). TODO confirm.
            tmp_output_path = "/tmp/{}.{}".format(os.path.basename(output_path), hvd.local_rank())
        else:
            sync_tensor = None
            tmp_output_path = output_path
        result_map = {}
        # Resume support: reload "id<TAB>translation" pairs this rank
        # already produced in a previous (interrupted) run.
        if self._is_multigpu and resume and os.path.exists(tmp_output_path):
            for line in open(tmp_output_path):
                pair = line.strip("\n").split("\t")
                if len(pair) != 2:
                    print(line)
                id, line = pair
                result_map[int(id)] = line
            print("loaded {} computed results".format(len(result_map)))
        fout = open(tmp_output_path, "w")
        test_lines = list(open(input_path))
        err = 0
        for i, line in enumerate(test_lines):
            # Gather error counts in multigpu mode
            if self._is_multigpu:
                # Periodically sum per-rank error counts so the printing
                # rank can report a global figure.
                if i % (10 * hvd.size()) == 0:
                    sync_tensor.fill_(err)
                    hvd.allreduce_(sync_tensor, average=False)
                # Round-robin sharding: each rank handles every size()-th line.
                if i % hvd.size() != hvd.local_rank():
                    continue
            # Translate
            pair = line.strip().split("\t")
            src_sent = pair[field]
            if len(src_sent.split()) > max_length:
                # Over-length inputs are skipped; "x" marks the placeholder.
                result = "x"
            else:
                if i in result_map:
                    result = result_map[i]
                else:
                    result, _ = self.translate("<s> {} </s>".format(src_sent))

            if result is None:
                result = ""
            if remove_subword_tokens:
                # Sentencepiece marks word starts with "▁"; BPE marks
                # splits with a trailing "@@ ".
                if "▁" in result:
                    result = "".join(result.split()).replace("▁", " ").strip()
                else:
                    result = result.replace("@@ ", "")
            if not result:
                err += 1
            # Write the results and print progress
            if self._is_multigpu:
                # Prefix with the line id so rank 0 can restore order later.
                fout.write("{}\t{}\n".format(i, result))
            else:
                fout.write("{}\n".format(result))
            fout.flush()
            if self._is_multigpu and hvd.local_rank() == 0:
                sys.stdout.write("translating: {:.0f}%  err: {}    \r".format(float(i + 1) * 100 / len(test_lines),
                                                                              int(sync_tensor)))
            elif not self._is_multigpu:
                sys.stdout.write("translating: {:.0f}%  err: {}    \r".format(float(i + 1) * 100 / len(test_lines), err))
            sys.stdout.flush()
        if is_root_node():
            sys.stdout.write("\n")
        fout.close()
        if self._is_multigpu:
            # Wait for all process to end
            hvd.allreduce_(sync_tensor, average=False)
            # Concatenate all separated translation results
            if hvd.local_rank() == 0:
                results = []
                for i in range(hvd.size()):
                    for line in open("/tmp/{}.{}".format(os.path.basename(output_path), i)):
                        id, result = line.strip("\n").split("\t")
                        results.append((int(id), result))
                # Sort by line id to restore the original input order.
                results.sort()
                with open(output_path, "w") as fout:
                    for _, result in results:
                        fout.write(result + "\n")