import importlib
import os
import sys

import torch

# OPTS and is_root_node() are used throughout these snippets but are defined
# elsewhere in the project.


def initialize_horovod():
    """Initialize Horovod and return this worker's (part_index, part_num)."""
    horovod_installed = importlib.util.find_spec("horovod") is not None
    if torch.cuda.is_available() and horovod_installed:
        import horovod.torch as hvd
        hvd.init()
        # Pin each worker to its local GPU
        torch.cuda.set_device(hvd.local_rank())
        part_index = hvd.rank()
        part_num = hvd.size()
    else:
        # Single-process fallback
        part_index = 0
        part_num = 1
    if is_root_node():
        print("Running on {} GPUs".format(part_num))
    return part_index, part_num
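# A minimal usage sketch (not from the repo): shard a training corpus across
# workers with the returned (part_index, part_num) pair. The corpus path and
# the round-robin split below are assumptions for illustration.
part_index, part_num = initialize_horovod()
train_lines = list(open("data/train.txt"))      # hypothetical corpus file
my_shard = train_lines[part_index::part_num]    # every part_num-th line, offset by rank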
def initialize_trains(arg_parser, project_name, tag):
    """Set up a TRAINS experiment-tracking task and a TensorBoard log directory."""
    tb_logdir = None
    OPTS.trains_task = None
    if is_root_node():
        if OPTS.tensorboard:
            try:
                from trains import Task
                task = Task.init(
                    project_name=project_name, task_name=tag,
                    auto_connect_arg_parser=False,
                    output_uri="{}/data/model_backups".format(os.getenv("HOME")))
                task.connect(arg_parser)
                task.set_random_seed(OPTS.seed)
                OPTS.trains_task = task
            except SystemError as e:
                print(e)
            tb_logdir = os.path.join(OPTS.root, "tensorboard")
            if not os.path.exists(tb_logdir):
                os.mkdir(tb_logdir)
    return tb_logdir
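# A minimal usage sketch (assumed, not from the repo): pass the argparse
# parser in after OPTS has been populated, then open a TensorBoard writer on
# the returned directory. The tag "baseline" is a placeholder.
import argparse
from torch.utils.tensorboard import SummaryWriter

ap = argparse.ArgumentParser()
# ... add arguments and parse them into OPTS ...
tb_logdir = initialize_trains(ap, project_name="lanmt2", tag="baseline")
if tb_logdir is not None:
    writer = SummaryWriter(log_dir=tb_logdir)  # non-root workers get None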
# Horovod initialization used by one of the training scripts (lanmt2).
if torch.cuda.is_available() and horovod_installed:
    import horovod.torch as hvd
    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    part_index = hvd.rank()
    part_num = hvd.size()
    gpu_num = hvd.size()
else:
    part_index = 0
    part_num = 1
    gpu_num = 1
# Tensorboard Logging
tb_logdir = None
OPTS.trains_task = None
if is_root_node():
    print("Running on {} GPUs".format(gpu_num))
    if OPTS.tensorboard:
        try:
            from trains import Task
            task = Task.init(
                project_name="lanmt2", task_name=OPTS.result_tag,
                auto_connect_arg_parser=False, output_uri=OPTS.root)
            task.connect(ap)
            task.set_random_seed(OPTS.seed)
            task.set_output_model_id(OPTS.model_tag)
            OPTS.trains_task = task
        except Exception:
            # Swallow tracking-server failures so training can still run
            pass
if envswitch.who() != "shu":
# Horovod initialization used by another training script (EBM_LM).
if torch.cuda.is_available() and horovod_installed:
    import horovod.torch as hvd
    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    part_index = hvd.rank()
    part_num = hvd.size()
    gpu_num = hvd.size()
else:
    part_index = 0
    part_num = 1
    gpu_num = 1
# Tensorboard Logging
tb_logdir = None
OPTS.trains_task = None
if is_root_node():
    print("Running on {} GPUs".format(gpu_num))
    if OPTS.tensorboard:
        try:
            from trains import Task
            task = Task.init(
                project_name="EBM_LM", task_name=OPTS.result_tag,
                auto_connect_arg_parser=False,
                output_uri="{}/data/model_backups".format(os.getenv("HOME")))
            task.connect(ap)
            task.set_random_seed(OPTS.seed)
            OPTS.trains_task = task
        except SystemError as e:
            print(e)
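# Both fragments above rely on an is_root_node() helper that is not shown
# here. A plausible sketch under the same Horovod conventions (an assumption;
# the repo's actual definition may differ). It presumes hvd.init() has
# already been called when Horovod is in use:
def is_root_node():
    if importlib.util.find_spec("horovod") is not None and torch.cuda.is_available():
        import horovod.torch as hvd
        return hvd.rank() == 0
    return True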
def batch_translate(self, input_path, output_path, field=0,
                    remove_subword_tokens=True, max_length=100, resume=False):
    """Translate a file."""
    # Check whether we are running on multiple GPUs
    try:
        import horovod.torch as hvd
    except ImportError:
        pass
    # In multi-GPU mode, each worker writes its shard to a temporary file
    if self._is_multigpu:
        sync_tensor = torch.tensor(0)
        tmp_output_path = "/tmp/{}.{}".format(
            os.path.basename(output_path), hvd.local_rank())
    else:
        sync_tensor = None
        tmp_output_path = output_path
    # When resuming, reload the results this worker already computed
    result_map = {}
    if self._is_multigpu and resume and os.path.exists(tmp_output_path):
        for line in open(tmp_output_path):
            pair = line.strip("\n").split("\t")
            if len(pair) != 2:
                print(line)
                continue  # skip malformed lines instead of crashing on unpacking
            id, line = pair
            result_map[int(id)] = line
        print("loaded {} computed results".format(len(result_map)))
    fout = open(tmp_output_path, "w")
    test_lines = list(open(input_path))
    err = 0
    for i, line in enumerate(test_lines):
        if self._is_multigpu:
            # Periodically gather error counts from all workers
            if i % (10 * hvd.size()) == 0:
                sync_tensor.fill_(err)
                hvd.allreduce_(sync_tensor, average=False)
            # Round-robin sharding of the input lines across workers
            if i % hvd.size() != hvd.local_rank():
                continue
        # Translate one line
        pair = line.strip().split("\t")
        src_sent = pair[field]
        if len(src_sent.split()) > max_length:
            result = "x"  # placeholder for over-length inputs
        elif i in result_map:
            result = result_map[i]  # reuse a result loaded during resume
        else:
            result, _ = self.translate("<s> {} </s>".format(src_sent))
            if result is None:
                result = ""
            if remove_subword_tokens:
                if "▁" in result:
                    # SentencePiece-style subword markers
                    result = "".join(result.split()).replace("▁", " ").strip()
                else:
                    # BPE-style "@@ " continuation markers
                    result = result.replace("@@ ", "")
        if not result:
            err += 1
        # Write the result and report progress
        if self._is_multigpu:
            fout.write("{}\t{}\n".format(i, result))
        else:
            fout.write("{}\n".format(result))
        fout.flush()
        if self._is_multigpu and hvd.local_rank() == 0:
            sys.stdout.write("translating: {:.0f}% err: {} \r".format(
                float(i + 1) * 100 / len(test_lines), int(sync_tensor)))
        elif not self._is_multigpu:
            sys.stdout.write("translating: {:.0f}% err: {} \r".format(
                float(i + 1) * 100 / len(test_lines), err))
        sys.stdout.flush()
    if is_root_node():
        sys.stdout.write("\n")
    fout.close()
    if self._is_multigpu:
        # Wait for all processes to finish their shards
        hvd.allreduce_(sync_tensor, average=False)
        # Rank 0 merges the per-worker files back into the original order
        if hvd.local_rank() == 0:
            results = []
            for i in range(hvd.size()):
                for line in open("/tmp/{}.{}".format(os.path.basename(output_path), i)):
                    id, result = line.strip("\n").split("\t")
                    results.append((int(id), result))
            results.sort()
            with open(output_path, "w") as fout:
                for _, result in results:
                    fout.write(result + "\n")
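# A minimal usage sketch (assumed, not from the repo): "translator" stands in
# for whatever instance of this class the project builds, and the file paths
# are placeholders. In multi-GPU runs every worker must make this call; only
# rank 0 writes the merged output file.
translator.batch_translate(
    "data/test.src", "data/test.out",
    field=0, remove_subword_tokens=True, resume=True)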