Example #1
def multi_train(args):
    init_logger()

    gpu_number = args.world_size
    mp = torch.multiprocessing.get_context('spawn')

    # Create a thread to listen for errors in the child processes.
    error_queue = mp.SimpleQueue()
    error_handler = ErrorHandler(error_queue)

    # Train with multiprocessing
    procs = []
    for i in range(gpu_number):
        device_id = i
        procs.append(
            mp.Process(target=run,
                       args=(
                           args,
                           device_id,
                           error_queue,
                       ),
                       daemon=True))
        procs[i].start()
        logger.info("Starting process pid: {:d} ".format(procs[i].pid))
        error_handler.add_child(procs[i].pid)
    for p in procs:
        p.join()
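
The ErrorHandler used above is not part of this snippet. Example #14 below shows that a failing child pushes a (gpu_rank, traceback) tuple onto error_queue, so a handler in the common OpenNMT-py style would look roughly like the sketch below; only the constructor and add_child() are taken from the calls above, the rest is illustrative.

import os
import signal
import threading


class ErrorHandler(object):
    """Listen on the shared error queue and surface child tracebacks."""

    def __init__(self, error_queue):
        self.error_queue = error_queue
        self.children_pids = []
        # Daemon thread that blocks until a child reports an error.
        self.error_thread = threading.Thread(target=self.error_listener,
                                             daemon=True)
        self.error_thread.start()
        # The listener wakes the parent with SIGUSR1 once an error is queued.
        signal.signal(signal.SIGUSR1, self.signal_handler)

    def add_child(self, pid):
        self.children_pids.append(pid)

    def error_listener(self):
        rank, original_trace = self.error_queue.get()
        self.error_queue.put((rank, original_trace))
        os.kill(os.getpid(), signal.SIGUSR1)

    def signal_handler(self, signalnum, stackframe):
        # Stop the remaining children, then re-raise the original traceback.
        for pid in self.children_pids:
            os.kill(pid, signal.SIGINT)
        rank, original_trace = self.error_queue.get()
        raise Exception("Error in process with rank {:d}:\n{:s}".format(
            rank, original_trace))
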
Example #2
def build_trainer(args, device_id, model, optim):

    grad_accum_count = args.grad_accum_count
    n_gpu = args.world_size

    if device_id >= 0:
        gpu_rank = int(args.gpu_ranks[device_id])
    else:
        gpu_rank = 0
        n_gpu = 0
    print("gpu_rank %d" % gpu_rank)

    tensorboard_log_dir = args.tensorboard_log_dir
    writer = SummaryWriter(tensorboard_log_dir,
                           comment='bert_coherence_measurement')
    report_manager = ReportMgr(args.report_every,
                               start_time=-1,
                               tensorboard_writer=writer)

    trainer = Trainer(args,
                      model,
                      optim,
                      grad_accum_count,
                      n_gpu,
                      gpu_rank,
                      report_manager=report_manager)

    if model:
        n_params = _tally_parameters(model)
        logger.info("* number of parameters: {:d}".format(n_params))
    return trainer
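
_tally_parameters is not defined in this excerpt; a minimal sketch, assuming it simply sums the element counts of the model's parameters:

def _tally_parameters(model):
    # Total number of elements across all parameters of the model.
    return sum(p.nelement() for p in model.parameters())
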
Example #3
def write_mapping(params):
    init_logger(
        "/sdc/xli/Datasets/cnn_daily/data_nsp/shard/mapping/mapping.log")
    paths, save_file = params
    with open(save_file, 'w') as file:
        for path in paths:
            file.write(path + "\n")
        logger.info("{:d} files has write in mapping file".format(len(paths)))
Example #4
def run(device_id, gpu_ranks, world_size, args):
    gpu_rank = multi_init(device_id, world_size, gpu_ranks)
    logger.info("GPU Rank: gpu_rank {:d}".format(gpu_rank))

    if gpu_rank != gpu_ranks[device_id]:
        raise AssertionError("An Error occured in Distributed intializaiton")
    n_gpu = world_size
    train(args, 10000, n_gpu, device_id, gpu_rank)
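
multi_init is not shown here. A sketch of one common implementation (the OpenNMT-py pattern), assuming NCCL over a local TCP rendezvous; the backend and address are illustrative, only the signature and the returned rank come from the calls in this example and in Example #14:

import torch.distributed


def multi_init(device_id, world_size, gpu_ranks):
    # Join the default process group and report this process's rank.
    torch.distributed.init_process_group(
        backend='nccl',                       # assumption: NCCL for GPU training
        init_method='tcp://localhost:10000',  # illustrative rendezvous address
        world_size=world_size,
        rank=gpu_ranks[device_id])
    return torch.distributed.get_rank()
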
Example #5
def save_json(save_path, file_id, samples):
    init_logger()
    for i, sample in enumerate(samples):
        save_ = os.path.join(save_path,
                             "{:s}_{:d}.json".format(file_id, i))
        with open(save_, 'w') as file:
            json.dump(sample, file)
        logger.info("{:s} saved at {:s}".format(save_, save_path))
Example #6
def _lazy_load_dataset(pt_file, corpus_type):
    # with open(json_file, 'r') as file:
    #     dataset = json.load(file)
    dataset = torch.load(pt_file, map_location=torch.device('cpu'))
    logger.info(
        "Loading {:s} dataset from {:s}, number of examples: {:d}".format(
            corpus_type, pt_file, len(dataset)))
    return dataset
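
Example #18 below drives a loader like this one via load_dataset(args, 'train', shuffle=True). A sketch of how _lazy_load_dataset might be wrapped to iterate over shards lazily; args.data_path and the file pattern are assumptions, not taken from the original code:

import glob
import os
import random


def load_dataset(args, corpus_type, shuffle=True):
    # Yield one shard at a time so only a single shard is held in memory.
    assert corpus_type in ('train', 'valid', 'test')
    pt_files = sorted(
        glob.glob(os.path.join(args.data_path,
                               '*_{:s}_*.pt'.format(corpus_type))))
    if shuffle:
        random.shuffle(pt_files)
    for pt_file in pt_files:
        yield _lazy_load_dataset(pt_file, corpus_type)
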
Example #7
    def shard(self):
        init_logger()

        def check_file_exists(root_path):
            for f in glob.glob(os.path.join(root_path, "*.json")):
                file_path = pathlib.Path(f)
                if file_path.exists():
                    os.unlink(file_path)

        pairs_train_mapping, pairs_test_mapping = self.args.pairs_train_mapping, self.args.pairs_test_mapping
        train_files, test_files = map(
            self.read_mapping, (pairs_train_mapping, pairs_test_mapping))

        divided_corpus = {'train': train_files, 'test': test_files}

        # delete all files under save_path before writing new shards
        check_file_exists(self.args.save_path)

        pool = Pool(mp.cpu_count())
        for corpus_type in ['train', 'test']:
            files = divided_corpus.get(corpus_type)
            dataset = []
            file_no = 0
            for d in pool.imap_unordered(self.load_pairs, files):
                if d is not None:
                    dataset.append(d)

                    if len(dataset) > self.args.shard_size:
                        pt_file = os.path.join(
                            self.args.save_path,
                            "{:s}/cd_{:s}_{:d}.json".format(
                                corpus_type, corpus_type, file_no))
                        with open(pt_file, 'w') as save:
                            save.write(json.dumps(dataset))

                        logger.info(
                            "cd_{:s}_{:d}.json saved at {:s}/{:s}.".format(
                                corpus_type, file_no, self.args.save_path,
                                corpus_type))
                        file_no += 1
                        dataset = []

                else:
                    continue

            if len(dataset) > 0:
                pt_file = os.path.join(
                    self.args.save_path,
                    "{:s}/cd_{:s}_{:d}.json".format(corpus_type, corpus_type,
                                                    file_no))
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                file_no += 1
        pool.close()
        pool.join()

        print("Shard task is finished!")
Example #8
def delete_tgt():
    init_logger()
    root_path = "/sdc/xli/Datasets/cnn_daily/tgts"
    for root, dirs, file_list in os.walk(root_path):
        for file in file_list:
            file_path = os.path.join(root, file)
            os.unlink(file_path)
            logger.info("{:s} deleted from {:s}".format(file, root))
    os.removedirs(root_path)
    logger.info("{:s} dir deleted.".format(root_path))
Example #9
def check_and_delete(self, path):
    init_logger()
    # file_path = pathlib.Path(path)
    # if file_path.exists():
    #     os.unlink(file_path)
    #     logger.info("{:s} deleted".format(path))
    for f in glob.glob(os.path.join(path, "*.json")):
        file_path = pathlib.Path(f)
        if file_path.exists():
            os.unlink(file_path)
            logger.info("{:s} deleted from {:s}".format(f, path))
Example #10
def save_pair(pairs, coherence, mark, file_id, save_path):
    init_logger()
    if len(pairs) > 0:
        for i, pair in enumerate(pairs):
            pair_dict = {"pair": pair, "coherence": coherence}
            save_file = os.path.join(
                save_path,
                "{:s}_{:s}_{:d}.json".format(file_id, mark, i))
            with open(save_file, 'w') as file:
                json.dump(pair_dict, file)
            logger.info("{:s} saved".format(save_file))
Example #11
def inner(self, locator, value=""):
    try:
        return method(self, locator, value)
    except Exception as e:
        for popup in self._popup_list:
            ret = self.find_all(popup)
            if ret:
                logger.info(f'find popup {popup}')
                ret[0].click()
                return method(self, locator, value)
            continue
        raise e
Example #12
    def _save(self, step):
        real_model = self.model

        model_state_dict = real_model.state_dict()
        checkpoint = {'model': model_state_dict, 'optim': self.optim}

        checkpoint_path = os.path.join(self.args.model_path,
                                       'model_step_{:d}.pt'.format(step))
        logger.info('Saving checkpoint {:s}'.format(checkpoint_path))

        if not os.path.exists(checkpoint_path):
            torch.save(checkpoint, checkpoint_path)
            return checkpoint, checkpoint_path
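
A checkpoint written by _save can be restored on CPU with the same map_location trick used in Example #18; the path below is illustrative, and a compatible model instance is assumed to already exist:

import torch

checkpoint = torch.load('model_step_10000.pt',
                        map_location=lambda storage, loc: storage)
model.load_state_dict(checkpoint['model'])  # 'model' holds the state_dict
optim = checkpoint['optim']                 # the optimizer object was saved whole
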
Example #13
    def train(self, train_iter_method, train_steps):
        logger.info("Start training...")

        step = self.optim._step + 1
        true_batches = []
        accum, normalization = 0, 0
        train_iter = train_iter_method()

        total_stats = Statistics()  # initialize: loss=0, n_docs=0, start_time=time.time()
        report_stats = Statistics()
        self._start_report_manager(start_time=total_stats.start_time)

        while step <= train_steps:
            reduce_number = 0
            for i, batch in enumerate(train_iter):
                if self.n_gpu == 0 or (i % self.n_gpu == self.gpu_rank):
                    true_batches.append(batch)
                    # normalization += batch.batch_size
                    normalization = len(true_batches)
                    accum += 1

                    if accum == self.grad_accum_count:
                        reduce_number += 1
                        # if self.n_gpu > 1:
                        #     normalization = sum(distributed.all_gather_list(normalization))

                        # print("Normalization: {}".format(normalization))
                        self._gradient_accumulation(true_batches,
                                                    normalization, total_stats,
                                                    report_stats)

                        report_stats = self._maybe_report_training(
                            step, train_steps, self.optim.learning_rate,
                            report_stats)

                        true_batches = []
                        accum, normalization = 0, 0

                        if (step % self.save_checkpoint_step == 0
                                and self.gpu_rank == 0):
                            self._save(step)
                        step += 1
                        if step > train_steps:
                            break
            train_iter = train_iter_method()

        return total_stats
Example #14
def run(args, device_id, error_queue):
    setattr(args, 'gpu_ranks', [int(i) for i in args.gpu_ranks])

    try:
        gpu_rank = distributed.multi_init(device_id, args.world_size,
                                          args.gpu_ranks)

        logger.info("GPU Rank: gpu_rank {:d}".format(gpu_rank))
        if gpu_rank != args.gpu_ranks[device_id]:
            raise AssertionError(
                "An error occurred in distributed initialization")
        single_train(args, device_id)
    except KeyboardInterrupt:
        pass
    except Exception:
        import traceback
        error_queue.put((args.gpu_ranks[device_id], traceback.format_exc()))
Example #15
    def output(self, step, num_steps, learning_rate, start):
        """Write out statistics to stdout.

        Args:
           step (int): current step
           n_batch (int): total batches
           start (int): start time of step.
        """
        t = self.elapsed_time()
        step_fmt = "%2d" % step
        if num_steps > 0:
            step_fmt = "%s/%5d" % (step_fmt, num_steps)
        logger.info(
            ("Step %s; xent: %4.4f; " +
             "lr: %7.7f; %3.0f docs/s; %6.0f sec")
            % (step_fmt,
               self.xent(),
               learning_rate,
               self.n_docs / (t + 1e-5),
               time.time() - start))
        sys.stdout.flush()
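
xent(), n_docs, and elapsed_time() belong to the Statistics object created in Example #13 (whose comment notes loss=0, n_docs=0, start_time=time.time()). A minimal sketch of such an accumulator, with xent() assumed to be the average loss per document:

import time


class Statistics(object):
    def __init__(self, loss=0.0, n_docs=0):
        self.loss = loss
        self.n_docs = n_docs
        self.start_time = time.time()

    def update(self, loss, n_docs):
        # Accumulate the loss and document count of one reporting interval.
        self.loss += loss
        self.n_docs += n_docs

    def xent(self):
        # Average loss per document (assumed definition).
        return self.loss / (self.n_docs + 1e-5)

    def elapsed_time(self):
        return time.time() - self.start_time
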
Example #16
    def _format_to_bert_one_sample(self, params):
        init_logger(
            "/sdc/xli/Datasets/cnn_daily/data_nsp/logs/_format_to_bert_one_sample.log"
        )
        tokenizer, file, save_file, sample_type = params
        with open(file, 'r') as json_file:
            sample = json.load(json_file)
        pair, coherence = sample['pair'], sample['coherence']

        if isinstance(pair, list) and len(pair) > 1 \
                and isinstance(pair[0][0], type(pair[1][0])):

            os.environ["CUDA_VISIBLE_DEVICES"] = "1"
            encode = tokenizer(pair[0],
                               pair[1],
                               return_tensors='pt',
                               is_pretokenized=True)
            if encode['input_ids'].numel() <= self.args.bert_max_position:
                sample_dict = {
                    'input_ids': encode['input_ids'],
                    'token_type_ids': encode['token_type_ids'],
                    'attention_mask': encode['attention_mask']
                }
                sample_tuple = (sample_dict, coherence)
                torch.save(sample_tuple, save_file)
                logger.info("{:s} has converted and saved at {:s}".format(
                    file, save_file))

                file_name = file.split("/")[-1]
                dst_file = os.path.join(
                    "/sdc/xli/Datasets/cnn_daily/data_nsp/pts_and_back/processed",
                    "{:s}/{:s}".format(sample_type, file_name))
                shutil.move(file, dst_file)
                logger.info("{:s} has moved to {:s}".format(
                    file_name, dst_file))

            gc.collect()
Example #17
    def shard(self):
        init_logger("/sdc/xli/Datasets/cnn_daily/data_nsp/logs/shard.log")

        pairs_train_mapping, pairs_test_mapping, pairs_valid_mapping = \
            self.args.pairs_train_mapping, self.args.pairs_test_mapping, self.args.pairs_valid_mapping
        # train_files, test_files, valid_files = map(self.read_mapping, (pairs_train_mapping, pairs_test_mapping, pairs_valid_mapping))
        train_files = self.read_mapping(pairs_train_mapping)
        test_files = self.read_mapping(pairs_test_mapping)
        valid_files = self.read_mapping(pairs_valid_mapping)

        divided_corpus = {
            'train': train_files,
            'test': test_files,
            'valid': valid_files
        }

        pool = Pool(mp.cpu_count())
        for corpus_type in ['train', 'test', 'valid']:
            files = divided_corpus.get(corpus_type)
            dataset = []
            file_no = 0
            for d in pool.imap_unordered(self.load_pairs, files):
                if d is not None:
                    dataset.append(d)

                    if len(dataset) >= self.args.shard_size:
                        pt_file = os.path.join(
                            self.args.save_path,
                            "{:s}/cd_{:s}_{:d}.json".format(
                                corpus_type, corpus_type, file_no))
                        with open(pt_file, 'w') as save:
                            save.write(json.dumps(dataset))

                        logger.info("{:s} has saved at {:s}/{:s}".format(
                            pt_file.split("/")[-1], self.args.save_path,
                            corpus_type))
                        file_no += 1
                        dataset = []

                else:
                    continue

            if len(dataset) > 0:
                pt_file = os.path.join(
                    self.args.save_path,
                    "{:s}/cd_{:s}_{:d}.json".format(corpus_type, corpus_type,
                                                    file_no))
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                logger.info("{:s} has saved at {:s}/{:s}".format(
                    pt_file.split("/")[-1], self.args.save_path, corpus_type))
                file_no += 1
        pool.close()
        pool.join()

        logger.info("Shard task is finished!")
Example #18
def single_train(args, device_id):
    init_logger(args.log_file)

    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        # use the specified GPU
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)

    else:
        checkpoint = None

    def train_iter_method():
        return DataLoaderBert(load_dataset(args, 'train', shuffle=True),
                              args.batch_size,
                              shuffle=True,
                              is_test=False)

    model = NextSentencePrediction(args, device, checkpoint)
    optim = build_optim(args, model, checkpoint)

    logger.info(model)

    trainer = build_trainer(args, device_id, model, optim)
    trainer.train(train_iter_method, args.train_steps)
Example #19
    def _format_to_bert(self, params):
        init_logger(
            "/sdc/xli/Datasets/cnn_daily/data_nsp/logs/_format_to_bert_one_sample.log"
        )
        tokenizer, mapping_file, save_file = params

        logger.info("Processing {:s}".format(mapping_file))
        with open(mapping_file, 'r') as m_file:
            json_paths = (line.strip() for line in m_file.readlines())

        samples = []
        for json_file in json_paths:
            with open(json_file, 'r') as j_file:
                sample = json.load(j_file)
            pair = sample['pair']
            label = sample['coherence']

            try:
                encode = tokenizer(pair[0],
                                   pair[1],
                                   return_tensors='pt',
                                   is_pretokenized=True)

                if encode['input_ids'].numel() <= self.args.bert_max_position:
                    sample_dict = {
                        'input_ids': encode['input_ids'].to('cuda'),
                        'token_type_ids': encode['token_type_ids'].to('cuda'),
                        'attention_mask': encode['attention_mask'].to('cuda')
                    }
                    samples.append((sample_dict, label))
                else:
                    logger.info("Valid sample length: {}".format(
                        encode['input_ids'].numel()))
            except ValueError:
                logger.warning("Value Error! And your data is {}".format(pair))

        torch.save(samples, save_file)
        logger.info("{:s} has converted and saved at {:s}".format(
            mapping_file, save_file))

        del (samples)
        gc.collect()
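
The tuples saved here are (sample_dict, label) with tensor fields named like standard BERT inputs, so a shard can be fed straight into a BERT-style next-sentence-prediction model. A usage sketch with the Hugging Face class standing in for the project's own NextSentencePrediction model; the file name and model choice are illustrative:

import torch
from transformers import BertForNextSentencePrediction

# Load one shard produced by _format_to_bert (illustrative file name).
samples = torch.load('nsp_train_0.pt', map_location=torch.device('cpu'))

model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
model.eval()

with torch.no_grad():
    for sample_dict, label in samples:
        inputs = {k: v.to('cpu') for k, v in sample_dict.items()}
        logits = model(**inputs)[0]  # shape (1, 2): is-next vs. not-next scores
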
Example #20
def log(self, *args, **kwargs):
    logger.info(*args, **kwargs)
Example #21
def _lazy_load_dataset(json_file, corpus_type):
    with open(json_file, 'r') as file:
        dataset = json.load(file)
    logger.info(
        "Loading {:s} dataset from {:s}, number of examples: {:d}".format(
            corpus_type, json_file, len(dataset)))
    return dataset
Example #22
def save(self, data):
    init_logger()
    # Append the record to the configured save file.
    with open(self.args.save_file, 'a+') as file:
        json.dump(data, file)
    logger.info("Data saved to {:s}".format(self.args.save_file))