Example #1
import os
from hydra.utils import get_original_cwd

def get_cwd():
    # fall back to the plain working directory when Hydra is not running
    try:
        return get_original_cwd()
    except AttributeError:
        return os.getcwd()
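Note: the fallback above is needed because get_original_cwd() only works inside a running Hydra application. Hydra changes the process working directory to its run directory, and get_original_cwd() returns the directory the script was launched from. A minimal sketch of the typical pattern (my own illustration, not taken from any of the projects below; cfg.data_file is a hypothetical config key):

import os

import hydra
from hydra.utils import get_original_cwd
from omegaconf import DictConfig


@hydra.main(config_path="conf", config_name="config")
def my_app(cfg: DictConfig) -> None:
    # os.getcwd() now points at Hydra's run directory, so resolve
    # user-supplied relative paths against the launch directory instead
    data_file = os.path.join(get_original_cwd(), cfg.data_file)  # hypothetical config key
    print(f"run dir: {os.getcwd()}, data file: {data_file}")


if __name__ == "__main__":
    my_app()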
Example #2
    def __ddp_script_mode_setup(self):
        assert self.trainer.global_rank == 0
        self._check_can_spawn_children()
        self._has_spawned_children = True

        os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', '127.0.0.1')
        os.environ['MASTER_PORT'] = os.environ.get(
            'MASTER_PORT', str(find_free_network_port()))

        # allow the user to pass the node rank
        node_rank = '0'
        node_rank = os.environ.get('NODE_RANK', node_rank)
        node_rank = os.environ.get('GROUP_RANK', node_rank)
        os.environ['NODE_RANK'] = node_rank
        os.environ['LOCAL_RANK'] = '0'

        # when the user is using hydra, find the absolute path
        path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path

        # pull out the commands used to run the script and resolve the abs file path
        command = sys.argv
        try:
            full_path = path_lib(command[0])
        except Exception:
            full_path = abspath(command[0])

        command[0] = full_path
        # use the same python interpreter that is actually running
        command = [sys.executable] + command

        # the visible devices tell us how many GPUs we want to use.
        # when the trainer script was called the device has already been scoped by the time
        # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone
        # but forward the GPUs selected via environment variables
        gpu_ids = os.environ.get('CUDA_VISIBLE_DEVICES', '')
        if len(gpu_ids) == 1:
            gpu_ids = f'{gpu_ids},'

        num_gpus = max(1, len(gpu_ids.split(',')))

        # set the flag for ddp scripts
        os.environ['PL_TRAINER_GPUS'] = gpu_ids

        os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'

        self.trainer.interactive_ddp_procs = []
        for local_rank in range(1, self.trainer.num_processes):
            env_copy = os.environ.copy()
            env_copy['LOCAL_RANK'] = f'{local_rank}'

            # start process
            # if hydra is available and initialized, make sure to set the cwd correctly
            cwd: Optional[str] = None
            if HYDRA_AVAILABLE:
                if HydraConfig.initialized():
                    cwd = get_original_cwd()
            proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
            self.trainer.interactive_ddp_procs.append(proc)

            # starting all processes at once can cause issues with dataloaders,
            # so stagger them with a random delay of 1-5 seconds
            delay = np.random.uniform(1, 5, 1)[0]
            sleep(delay)

        self.task_idx = 0
Example #3
File: main.py Project: KamWithK/JDRL
import hydra

from utils import wrap_continuous_env, wrap_discrete_env, get_latest
from omegaconf import DictConfig, OmegaConf
from hydra.utils import instantiate, call, get_original_cwd

OmegaConf.register_new_resolver("parse_string", lambda input : input.lower().replace(" ", "_"))
OmegaConf.register_new_resolver("get_wrapper_func", lambda continuous : wrap_continuous_env if continuous else wrap_discrete_env)
OmegaConf.register_new_resolver("original_dir", lambda relative_path : get_original_cwd() + relative_path)
OmegaConf.register_new_resolver("get_latest", get_latest)
OmegaConf.register_new_resolver("wandb_mode", lambda save : "online" if save else "disabled")

@hydra.main(config_path="../config", config_name="config")
def main(config: DictConfig):
    env = call(config.environment)
    callbacks = list(instantiate(config.callbacks)["callbacks"])
    model = call(config.model, env)

    model.learn(total_timesteps=config.run["max_timesteps"], callback=callbacks)
    if config.run["save"]: model.save(config.model["save_path"])

if __name__ == "__main__":
    main()
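Note: Example #3 registers an original_dir resolver built on get_original_cwd(); in the project's config files it would be referenced through OmegaConf's ${resolver:arg} interpolation syntax. A minimal sketch of the same mechanism, using a Hydra-free resolver so it runs standalone (the resolver name and keys here are illustrative only):

from omegaconf import OmegaConf

# same interpolation mechanism as the resolvers in Example #3,
# but with a resolver that does not require Hydra to be initialized
OmegaConf.register_new_resolver("upper", lambda s: s.upper())
cfg = OmegaConf.create({"name": "cartpole", "loud_name": "${upper:${name}}"})
print(cfg.loud_name)  # -> CARTPOLE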
Example #4
    def _call_children_scripts(self):

        # bookkeeping of spawned processes
        assert self.global_rank == 0
        self._check_can_spawn_children()
        self._has_spawned_children = True

        # DDP Environment variables
        os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1")
        os.environ["MASTER_PORT"] = os.environ.get(
            "MASTER_PORT", str(find_free_network_port()))

        # allow the user to pass the node rank
        node_rank = "0"
        node_rank = os.environ.get("NODE_RANK", node_rank)
        node_rank = os.environ.get("GROUP_RANK", node_rank)
        os.environ["NODE_RANK"] = node_rank
        os.environ["LOCAL_RANK"] = "0"

        # when the user is using hydra, find the absolute path
        path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path

        # pull out the commands used to run the script and resolve the abs file path
        command = sys.argv
        try:
            full_path = path_lib(command[0])
        except Exception:
            full_path = os.path.abspath(command[0])

        command[0] = full_path
        # use the same python interpreter that is actually running
        command = [sys.executable] + command

        # the visible devices tell us how many GPUs we want to use.
        # when the trainer script was called the device has already been scoped by the time
        # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone
        # but forward the GPUs selected via environment variables
        if self.parallel_devices is None:
            raise MisconfigurationException(
                "you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)"
            )

        os.environ["PL_TRAINER_GPUS"] = ",".join(
            [str(device.index) for device in self.parallel_devices])
        os.environ["PL_IN_DDP_SUBPROCESS"] = "1"

        if self.lightning_module.logger is not None:
            os.environ["PL_EXP_VERSION"] = str(
                self.lightning_module.logger.version)

        num_gpus = len(self.parallel_devices)
        os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}"

        self.interactive_ddp_procs = []

        for local_rank in range(1, self.num_processes):
            env_copy = os.environ.copy()
            env_copy["LOCAL_RANK"] = f"{local_rank}"

            # remove env var if global seed not set
            if os.environ.get(
                    "PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy:
                del env_copy["PL_GLOBAL_SEED"]

            # start process
            # if hydra is available and initialized, make sure to set the cwd correctly
            cwd: Optional[str] = None
            if _HYDRA_AVAILABLE:
                if HydraConfig.initialized():
                    cwd = get_original_cwd()
                    os_cwd = f'"{os.getcwd()}"'
                    command += [
                        f'hydra.run.dir={os_cwd}',
                        f'hydra.job.name=train_ddp_process_{local_rank}'
                    ]
            proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
            self.interactive_ddp_procs.append(proc)

            # starting all processes at once can cause issues with dataloaders,
            # so stagger them with a random delay of 1-5 seconds
            delay = np.random.uniform(1, 5, 1)[0]
            sleep(delay)
Example #5
File: predict.py Project: zxlzr/deepke
def main(cfg):
    cwd = utils.get_original_cwd()
    # cwd = cwd[0:-5]
    cfg.cwd = cwd
    cfg.pos_size = 2 * cfg.pos_limit + 2
    print(cfg.pretty())

    # get predict instance
    instance = _get_predict_instance(cfg)
    data = [instance]

    # preprocess data
    data, rels = _preprocess_data(data, cfg)

    # model
    __Model__ = {
        'cnn': models.PCNN,
        'rnn': models.BiLSTM,
        'transformer': models.Transformer,
        'gcn': models.GCN,
        'capsule': models.Capsule,
        'lm': models.LM,
    }

    # prediction is best done on the CPU
    cfg.use_gpu = False
    if cfg.use_gpu and torch.cuda.is_available():
        device = torch.device('cuda', cfg.gpu_id)
    else:
        device = torch.device('cpu')
    logger.info(f'device: {device}')

    model = __Model__[cfg.model_name](cfg)
    logger.info(f'model name: {cfg.model_name}')
    logger.info(f'\n {model}')
    model.load(cfg.fp, device=device)
    model.to(device)
    model.eval()

    x = dict()
    x['word'], x['lens'] = torch.tensor([data[0]['token2idx']
                                         ]), torch.tensor([data[0]['seq_len']])

    if cfg.model_name != 'lm':
        x['entity_pos'], x['attribute_value_pos'] = torch.tensor([
            data[0]['entity_pos']
        ]), torch.tensor([data[0]['attribute_value_pos']])
        if cfg.model_name == 'cnn':
            if cfg.use_pcnn:
                x['pcnn_mask'] = torch.tensor([data[0]['entities_pos']])
        if cfg.model_name == 'gcn':
            # no suitable parsing-tree tool was found, so initialize the adjacency matrix randomly for now
            adj = torch.empty(1, data[0]['seq_len'],
                              data[0]['seq_len']).random_(2)
            x['adj'] = adj

    for key in x.keys():
        x[key] = x[key].to(device)

    with torch.no_grad():
        y_pred = model(x)
        y_pred = torch.softmax(y_pred, dim=-1)[0]
        prob = y_pred.max().item()
        prob_att = list(rels.keys())[y_pred.argmax().item()]
        logger.info(
            f"\"{data[0]['entity']}\" 和 \"{data[0]['attribute_value']}\" 在句中属性为:\"{prob_att}\",置信度为{prob:.2f}。"
        )

    if cfg.predict_plot:
        plt.rcParams["font.family"] = 'Arial Unicode MS'
        x = list(rels.keys())
        height = list(y_pred.cpu().numpy())
        plt.bar(x, height)
        for x, y in zip(x, height):
            plt.text(x, y, '%.2f' % y, ha="center", va="bottom")
        plt.xlabel('关系')
        plt.ylabel('置信度')
        plt.xticks(rotation=315)
        plt.show()
Example #6
def test_get_original_cwd_without_hydra(hydra_restore_singletons: Any) -> None:
    with pytest.raises(ValueError):
        utils.get_original_cwd()
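Note: because get_original_cwd() raises when Hydra is not initialized (as this test asserts), several of the surrounding examples (#2, #4 and #7) guard the call with HydraConfig.initialized(). A minimal standalone sketch of that guard (safe_original_cwd is a hypothetical helper name):

import os

from hydra.core.hydra_config import HydraConfig
from hydra.utils import get_original_cwd


def safe_original_cwd() -> str:
    # use the Hydra launch directory when running under Hydra,
    # otherwise fall back to the current working directory
    if HydraConfig.initialized():
        return get_original_cwd()
    return os.getcwd()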
Example #7
    def _call_children_scripts(self):
        assert self.trainer.global_rank == 0
        self._check_can_spawn_children()
        self._has_spawned_children = True

        os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', '127.0.0.1')
        os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port()))

        # allow the user to pass the node rank
        node_rank = '0'
        node_rank = os.environ.get('NODE_RANK', node_rank)
        node_rank = os.environ.get('GROUP_RANK', node_rank)
        os.environ['NODE_RANK'] = node_rank
        os.environ['LOCAL_RANK'] = '0'

        # when the user is using hydra, find the absolute path
        path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path

        # pull out the commands used to run the script and resolve the abs file path
        command = sys.argv
        try:
            full_path = path_lib(command[0])
        except Exception:
            full_path = abspath(command[0])

        command[0] = full_path
        # use the same python interpreter that is actually running
        command = [sys.executable] + command

        # the visible devices tell us how many GPUs we want to use.
        # when the trainer script was called the device has already been scoped by the time
        # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone
        # but forward the GPUs selected via environment variables
        if self.trainer.data_parallel_device_ids is None:
            raise MisconfigurationException('you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)')

        os.environ['PL_TRAINER_GPUS'] = ','.join([str(i) for i in self.trainer.data_parallel_device_ids])
        os.environ['PL_IN_DDP_SUBPROCESS'] = '1'

        if self.trainer.logger is not None:
            os.environ['PL_EXP_VERSION'] = str(self.trainer.logger.version)

        num_gpus = len(self.trainer.data_parallel_device_ids)
        os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'

        self.interactive_ddp_procs = []
        for local_rank in range(1, self.trainer.num_processes):
            env_copy = os.environ.copy()
            env_copy['LOCAL_RANK'] = f'{local_rank}'

            # remove env var if global seed not set
            if os.environ.get('PL_GLOBAL_SEED') is None and 'PL_GLOBAL_SEED' in env_copy:
                del env_copy['PL_GLOBAL_SEED']

            # start process
            # if hydra is available and initialized, make sure to set the cwd correctly
            cwd: Optional[str] = None
            if HYDRA_AVAILABLE:
                if HydraConfig.initialized():
                    cwd = get_original_cwd()
            proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
            self.interactive_ddp_procs.append(proc)

            # starting all processes at once can cause issues with dataloaders,
            # so stagger them with a random delay of 1-5 seconds
            delay = np.random.uniform(1, 5, 1)[0]
            sleep(delay)
Example #8
File: run.py Project: zxlzr/deepke
def main(cfg):
    cwd = utils.get_original_cwd()
    cfg.cwd = cwd
    print(cfg)

    data_path = DATA_PATH[cfg.dataset_name]
    for mode, path in data_path.items():
        data_path[mode] = os.path.join(cfg.cwd, path)
    dataset_class, data_process = DATASET_CLASS[
        cfg.dataset_name], DATA_PROCESS[cfg.dataset_name]
    mapping = MAPPING[cfg.dataset_name]

    set_seed(cfg.seed)  # set seed, default is 1
    if cfg.save_path is not None:  # make save_path dir
        cfg.save_path = os.path.join(
            cfg.save_path, cfg.dataset_name + "_" + str(cfg.batch_size) + "_" +
            str(cfg.learning_rate) + cfg.notes)
        if not os.path.exists(cfg.save_path):
            os.makedirs(cfg.save_path, exist_ok=True)

    process = data_process(data_path=data_path,
                           mapping=mapping,
                           bart_name=cfg.bart_name,
                           learn_weights=cfg.learn_weights)
    train_dataset = dataset_class(data_processor=process, mode='train')
    train_dataloader = DataLoader(train_dataset,
                                  collate_fn=train_dataset.collate_fn,
                                  batch_size=cfg.batch_size,
                                  num_workers=4)

    dev_dataset = dataset_class(data_processor=process, mode='dev')
    dev_dataloader = DataLoader(dev_dataset,
                                collate_fn=dev_dataset.collate_fn,
                                batch_size=cfg.batch_size,
                                num_workers=4)

    label_ids = list(process.mapping2id.values())

    prompt_model = PromptBartModel(tokenizer=process.tokenizer,
                                   label_ids=label_ids,
                                   args=cfg)
    model = PromptGeneratorModel(prompt_model=prompt_model,
                                 bos_token_id=0,
                                 eos_token_id=1,
                                 max_length=cfg.tgt_max_len,
                                 max_len_a=cfg.src_seq_ratio,
                                 num_beams=cfg.num_beams,
                                 do_sample=False,
                                 repetition_penalty=1,
                                 length_penalty=cfg.length_penalty,
                                 pad_token_id=1,
                                 restricter=None)
    metrics = Seq2SeqSpanMetric(eos_token_id=1,
                                num_labels=len(label_ids),
                                target_type='word')
    loss = get_loss

    trainer = Trainer(train_data=train_dataloader,
                      dev_data=dev_dataloader,
                      test_data=None,
                      model=model,
                      args=cfg,
                      logger=logger,
                      loss=loss,
                      metrics=metrics,
                      writer=writer)
    trainer.train()

    writer.close()
Example #9
File: run.py Project: zxlzr/deepke
def main(cfg):

    # Use gpu or not
    if cfg.use_gpu and torch.cuda.is_available():
        device = torch.device('cuda', cfg.gpu_id)
    else:
        device = torch.device('cpu')

    if cfg.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(cfg.gradient_accumulation_steps))

    cfg.train_batch_size = cfg.train_batch_size // cfg.gradient_accumulation_steps

    random.seed(cfg.seed)
    np.random.seed(cfg.seed)
    torch.manual_seed(cfg.seed)

    if not cfg.do_train and not cfg.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    # Checkpoints
    if os.path.exists(utils.get_original_cwd() + '/' + cfg.output_dir
                      ) and os.listdir(utils.get_original_cwd() + '/' +
                                       cfg.output_dir) and cfg.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                utils.get_original_cwd() + '/' + cfg.output_dir))
    if not os.path.exists(utils.get_original_cwd() + '/' + cfg.output_dir):
        os.makedirs(utils.get_original_cwd() + '/' + cfg.output_dir)

    # Preprocess the input dataset
    processor = NerProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    # Prepare the model
    tokenizer = BertTokenizer.from_pretrained(cfg.bert_model,
                                              do_lower_case=cfg.do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if cfg.do_train:
        train_examples = processor.get_train_examples(
            utils.get_original_cwd() + '/' + cfg.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / cfg.train_batch_size /
            cfg.gradient_accumulation_steps) * cfg.num_train_epochs

    config = BertConfig.from_pretrained(cfg.bert_model,
                                        num_labels=num_labels,
                                        finetuning_task=cfg.task_name)
    model = TrainNer.from_pretrained(cfg.bert_model,
                                     from_tf=False,
                                     config=config)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        cfg.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    warmup_steps = int(cfg.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=cfg.learning_rate,
                      eps=cfg.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if cfg.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      cfg.max_seq_length,
                                                      tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids,
                                   all_valid_ids, all_lmask_ids)
        train_sampler = RandomSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=cfg.train_batch_size)

        model.train()

        for _ in trange(int(cfg.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids,
                             valid_ids, l_mask)
                if cfg.gradient_accumulation_steps > 1:
                    loss = loss / cfg.gradient_accumulation_steps

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               cfg.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % cfg.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        model_to_save.save_pretrained(utils.get_original_cwd() + '/' +
                                      cfg.output_dir)
        tokenizer.save_pretrained(utils.get_original_cwd() + '/' +
                                  cfg.output_dir)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": cfg.bert_model,
            "do_lower": cfg.do_lower_case,
            "max_seq_length": cfg.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        json.dump(
            model_config,
            open(
                os.path.join(utils.get_original_cwd() + '/' + cfg.output_dir,
                             "model_config.json"), "w"))
        # Load a trained model and config that you have fine-tuned
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = Ner.from_pretrained(utils.get_original_cwd() + '/' +
                                    cfg.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            utils.get_original_cwd() + '/' + cfg.output_dir,
            do_lower_case=cfg.do_lower_case)

    model.to(device)

    if cfg.do_eval:
        if cfg.eval_on == "dev":
            eval_examples = processor.get_dev_examples(
                utils.get_original_cwd() + '/' + cfg.data_dir)
        elif cfg.eval_on == "test":
            eval_examples = processor.get_test_examples(
                utils.get_original_cwd() + '/' + cfg.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     cfg.max_seq_length,
                                                     tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in eval_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids,
                                  all_valid_ids, all_lmask_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=cfg.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                logits = model(input_ids,
                               segment_ids,
                               input_mask,
                               valid_ids=valid_ids,
                               attention_mask_label=l_mask)

            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()

            for i, label in enumerate(label_ids):
                temp_1 = []
                temp_2 = []
                for j, m in enumerate(label):
                    if j == 0:
                        continue
                    elif label_ids[i][j] == len(label_map):
                        y_true.append(temp_1)
                        y_pred.append(temp_2)
                        break
                    else:
                        temp_1.append(label_map[label_ids[i][j]])
                        temp_2.append(label_map[logits[i][j]])

        report = classification_report(y_true, y_pred, digits=4)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(
            utils.get_original_cwd() + '/' + cfg.output_dir,
            "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
Example #10
def main(cfg: DictConfig):

    cwd = Path(get_original_cwd())

    # overwrite config if continue training from checkpoint
    resume_cfg = None
    if "resume" in cfg:
        cfg_path = cwd / cfg.resume / ".hydra/config.yaml"
        print(f"Continue from: {cfg.resume}")
        # Overwrite everything except device
        # TODO config merger (perhaps continue training with the same optimizer but other lrs?)
        resume_cfg = OmegaConf.load(cfg_path)
        cfg.model = resume_cfg.model
        if cfg.train.num_epochs == 0:
            cfg.data.scale_factor = resume_cfg.data.scale_factor
        OmegaConf.save(cfg, ".hydra/config.yaml")

    print(OmegaConf.to_yaml(cfg))

    device = set_device_id(cfg.device)
    set_seed(cfg.seed, device=device)

    # Augmentations
    if cfg.data.aug == "auto":
        transforms = albu.load(cwd / "autoalbument/autoconfig.json")
    else:
        transforms = D.get_training_augmentations()

    if OmegaConf.is_missing(cfg.model, "convert_bottleneck"):
        cfg.model.convert_bottleneck = (0, 0, 0)

    # Model
    print(f"Setup model {cfg.model.arch} {cfg.model.encoder_name} "
          f"convert_bn={cfg.model.convert_bn} "
          f"convert_bottleneck={cfg.model.convert_bottleneck} ")
    model = get_segmentation_model(
        arch=cfg.model.arch,
        encoder_name=cfg.model.encoder_name,
        encoder_weights=cfg.model.encoder_weights,
        classes=1,
        convert_bn=cfg.model.convert_bn,
        convert_bottleneck=cfg.model.convert_bottleneck,
        # decoder_attention_type="scse",  # TODO to config
    )
    model = model.to(device)
    model.train()
    print(model)

    # Optimization
    # Reduce LR for pretrained encoder
    layerwise_params = {
        "encoder*":
        dict(lr=cfg.optim.lr_encoder, weight_decay=cfg.optim.wd_encoder)
    }
    model_params = cutils.process_model_params(
        model, layerwise_params=layerwise_params)

    # Select optimizer
    optimizer = get_optimizer(
        name=cfg.optim.name,
        model_params=model_params,
        lr=cfg.optim.lr,
        wd=cfg.optim.wd,
        lookahead=cfg.optim.lookahead,
    )

    criterion = {
        "dice": DiceLoss(),
        # "dice": SoftDiceLoss(mode="binary", smooth=1e-7),
        "iou": IoULoss(),
        "bce": nn.BCEWithLogitsLoss(),
        "lovasz": LovaszLossBinary(),
        "focal_tversky": FocalTverskyLoss(eps=1e-7, alpha=0.7, gamma=0.75),
    }

    # Load states if resuming training
    if "resume" in cfg:
        checkpoint_path = (cwd / cfg.resume / cfg.train.logdir /
                           "checkpoints/best_full.pth")
        if checkpoint_path.exists():
            print(f"\nLoading checkpoint {str(checkpoint_path)}")
            checkpoint = cutils.load_checkpoint(checkpoint_path)
            cutils.unpack_checkpoint(
                checkpoint=checkpoint,
                model=model,
                optimizer=optimizer
                if resume_cfg.optim.name == cfg.optim.name else None,
                criterion=criterion,
            )
        else:
            raise ValueError("Nothing to resume, checkpoint missing")

    # We may only want to validate a resumed model; in that case skip the training routine
    best_th = 0.5

    stats = None
    if cfg.data.stats:
        print(f"Use statistics from file: {cfg.data.stats}")
        stats = cwd / cfg.data.stats

    if cfg.train.num_epochs is not None:
        callbacks = [
            # Each criterion is calculated separately.
            CriterionCallback(input_key="mask",
                              prefix="loss_dice",
                              criterion_key="dice"),
            CriterionCallback(input_key="mask",
                              prefix="loss_iou",
                              criterion_key="iou"),
            CriterionCallback(input_key="mask",
                              prefix="loss_bce",
                              criterion_key="bce"),
            CriterionCallback(input_key="mask",
                              prefix="loss_lovasz",
                              criterion_key="lovasz"),
            CriterionCallback(
                input_key="mask",
                prefix="loss_focal_tversky",
                criterion_key="focal_tversky",
            ),
            # And only then we aggregate everything into one loss.
            MetricAggregationCallback(
                prefix="loss",
                mode="weighted_sum",  # can be "sum", "weighted_sum" or "mean"
                # because we want weighted sum, we need to add scale for each loss
                metrics={
                    "loss_dice": cfg.loss.dice,
                    "loss_iou": cfg.loss.iou,
                    "loss_bce": cfg.loss.bce,
                    "loss_lovasz": cfg.loss.lovasz,
                    "loss_focal_tversky": cfg.loss.focal_tversky,
                },
            ),
            # metrics
            DiceCallback(input_key="mask"),
            IouCallback(input_key="mask"),
            # gradient accumulation
            OptimizerCallback(accumulation_steps=cfg.optim.accumulate),
            # early stopping
            SchedulerCallback(reduced_metric="loss_dice",
                              mode=cfg.scheduler.mode),
            EarlyStoppingCallback(**cfg.scheduler.early_stopping,
                                  minimize=False),
            # TODO WandbLogger works poorly with multistage right now
            WandbLogger(project=cfg.project, config=dict(cfg)),
            # CheckpointCallback(save_n_best=cfg.checkpoint.save_n_best),
        ]

        # Training
        runner = SupervisedRunner(device=device,
                                  input_key="image",
                                  input_target_key="mask")

        # TODO Scheduler does not work now, every stage restarts from base lr
        scheduler_warm_restart = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[1, 2],
            gamma=10,
        )

        for i, (size, num_epochs) in enumerate(
                zip(cfg.data.sizes, cfg.train.num_epochs)):
            scale = size / 1024
            print(
                f"Training stage {i}, scale {scale}, size {size}, epochs {num_epochs}"
            )

            # Datasets
            (
                train_ds,
                valid_ds,
                train_images,
                val_images,
            ) = D.get_train_valid_datasets_from_path(
                # path=(cwd / cfg.data.path),
                path=(cwd / f"data/hubmap-{size}x{size}/"),
                train_ids=cfg.data.train_ids,
                valid_ids=cfg.data.valid_ids,
                seed=cfg.seed,
                valid_split=cfg.data.valid_split,
                mean=cfg.data.mean,
                std=cfg.data.std,
                transforms=transforms,
                stats=stats,
            )

            train_bs = int(cfg.loader.train_bs / (scale**2))
            valid_bs = int(cfg.loader.valid_bs / (scale**2))
            print(
                f"train: {len(train_ds)}; bs {train_bs}",
                f"valid: {len(valid_ds)}, bs {valid_bs}",
            )

            # Data loaders
            data_loaders = D.get_data_loaders(
                train_ds=train_ds,
                valid_ds=valid_ds,
                train_bs=train_bs,
                valid_bs=valid_bs,
                num_workers=cfg.loader.num_workers,
            )

            # Select scheduler
            scheduler = get_scheduler(
                name=cfg.scheduler.type,
                optimizer=optimizer,
                num_epochs=num_epochs * (len(data_loaders["train"]) if
                                         cfg.scheduler.mode == "batch" else 1),
                eta_min=scheduler_warm_restart.get_last_lr()[0] /
                cfg.scheduler.eta_min_factor,
                plateau=cfg.scheduler.plateau,
            )

            runner.train(
                model=model,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                callbacks=callbacks,
                logdir=cfg.train.logdir,
                loaders=data_loaders,
                num_epochs=num_epochs,
                verbose=True,
                main_metric=cfg.train.main_metric,
                load_best_on_end=True,
                minimize_metric=False,
                check=cfg.check,
                fp16=dict(amp=cfg.amp),
            )

            # Set new initial LR for optimizer after restart
            scheduler_warm_restart.step()
            print(
                f"New LR for warm restart {scheduler_warm_restart.get_last_lr()[0]}"
            )

            # Find optimal threshold for dice score
            model.eval()
            best_th, dices = find_dice_threshold(model, data_loaders["valid"])
            print("Best dice threshold", best_th, np.max(dices[1]))
            np.save(f"dices_{size}.npy", dices)
    else:
        print("Validation only")
        # Datasets
        size = cfg.data.sizes[-1]
        train_ds, valid_ds = D.get_train_valid_datasets_from_path(
            # path=(cwd / cfg.data.path),
            path=(cwd / f"data/hubmap-{size}x{size}/"),
            train_ids=cfg.data.train_ids,
            valid_ids=cfg.data.valid_ids,
            seed=cfg.seed,
            valid_split=cfg.data.valid_split,
            mean=cfg.data.mean,
            std=cfg.data.std,
            transforms=transforms,
            stats=stats,
        )

        train_bs = int(cfg.loader.train_bs / (cfg.data.scale_factor**2))
        valid_bs = int(cfg.loader.valid_bs / (cfg.data.scale_factor**2))
        print(
            f"train: {len(train_ds)}; bs {train_bs}",
            f"valid: {len(valid_ds)}, bs {valid_bs}",
        )

        # Data loaders
        data_loaders = D.get_data_loaders(
            train_ds=train_ds,
            valid_ds=valid_ds,
            train_bs=train_bs,
            valid_bs=valid_bs,
            num_workers=cfg.loader.num_workers,
        )

        # Find optimal threshold for dice score
        model.eval()
        best_th, dices = find_dice_threshold(model, data_loaders["valid"])
        print("Best dice threshold", best_th, np.max(dices[1]))
        np.save("dices_val.npy", dices)

    #
    # # Load best checkpoint
    # checkpoint_path = Path(cfg.train.logdir) / "checkpoints/best.pth"
    # if checkpoint_path.exists():
    #     print(f"\nLoading checkpoint {str(checkpoint_path)}")
    #     state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))[
    #         "model_state_dict"
    #     ]
    #     model.load_state_dict(state_dict)
    #     del state_dict
    # model = model.to(device)
    # Load config for updating with threshold and metric
    # (otherwise loading does not work)
    cfg = OmegaConf.load(".hydra/config.yaml")
    cfg.threshold = float(best_th)

    # Evaluate on full-size image if valid_ids is non-empty
    df_train = pd.read_csv(cwd / "data/train.csv")
    df_train = {
        r["id"]: r["encoding"]
        for r in df_train.to_dict(orient="records")
    }
    dices = []
    unique_ids = sorted(
        set(
            str(p).split("/")[-1].split("_")[0]
            for p in (cwd / cfg.data.path / "train").iterdir()))
    size = cfg.data.sizes[-1]
    scale = size / 1024
    for image_id in cfg.data.valid_ids:
        image_name = unique_ids[image_id]
        print(f"\nValidate for {image_name}")

        rle_pred, shape = inference_one(
            image_path=(cwd / f"data/train/{image_name}.tiff"),
            target_path=Path("."),
            cfg=cfg,
            model=model,
            scale_factor=scale,
            tile_size=cfg.data.tile_size,
            tile_step=cfg.data.tile_step,
            threshold=best_th,
            save_raw=False,
            tta_mode=None,
            weight="pyramid",
            device=device,
            filter_crops="tissue",
            stats=stats,
        )

        print("Predict", shape)
        pred = rle_decode(rle_pred["predicted"], shape)
        mask = rle_decode(df_train[image_name], shape)
        assert pred.shape == mask.shape, f"pred {pred.shape}, mask {mask.shape}"
        assert pred.shape == shape, f"pred {pred.shape}, expected {shape}"

        dices.append(
            dice(
                torch.from_numpy(pred).type(torch.uint8),
                torch.from_numpy(mask).type(torch.uint8),
                threshold=None,
                activation="none",
            ))
    print("Full image dice:", np.mean(dices))
    OmegaConf.save(cfg, ".hydra/config.yaml")
    return
Example #11
def test_get_original_cwd():
    orig = "/foo/bar"
    cfg = OmegaConf.create({"hydra": {"runtime": {"cwd": orig}}})
    HydraConfig().set_config(cfg)
    assert utils.get_original_cwd() == orig
Example #12
    def __init__(self, hydra_cfg, logger):
        self.logger = logger
        self.hydra_cfg = hydra_cfg
        self.seed = hydra_cfg['parameters']['seed']
        self.metric = hydra_cfg['parameters']['metric']

        self.device = torch.device(
            'cuda:{}'.format(hydra_cfg['parameters']['gpu_id']
                             ) if torch.cuda.is_available() else 'cpu')

        working_dir = utils.get_original_cwd() + '/'
        training_path = working_dir + hydra_cfg['dataset']['path'] + hydra_cfg[
            'dataset']['train_fname']
        is_replaced_OOV = hydra_cfg['parameters']['replace_OOV'] > 0

        # load embeddings
        pretrained_path = hydra_cfg['parameters']['pre_trained']
        pretrained_vocab = {}
        if pretrained_path:
            pretrained_path = working_dir + hydra_cfg['parameters'][
                'pre_trained']
            self.logger.info('Loading pre-trained word embeddings {}\n'.format(
                pretrained_path))
            pretrained_w2v = KeyedVectors.load_word2vec_format(
                fname=pretrained_path)
            pretrained_vocab = set(pretrained_w2v.vocab.keys())
            assert hydra_cfg['parameters']['ngram'] == 1

        self.dictionary = SupervisedDictionary(
            replace_OOV_word=is_replaced_OOV,
            min_count=hydra_cfg['parameters']['min_count'],
            replace_word='<OOV>',
            size_word_n_gram=hydra_cfg['parameters']['ngram'],
            word_n_gram_min_count=hydra_cfg['parameters']
            ['word_n_gram_min_count'],
            label_separator=hydra_cfg['parameters']['label_separator'],
            line_break_word='')

        self.logger.info('Use {}\n'.format(self.device))

        self.dictionary.fit(training_path)

        if pretrained_vocab:
            self.dictionary.update_vocab_from_word_set(pretrained_vocab)

        self.train_set, self.val_set = get_datasets(
            cfg=hydra_cfg,
            dictionary=self.dictionary,
            working_dir=working_dir,
            training_path=training_path,
            include_test=False)

        pretrained_word_vectors = None
        dim = self.hydra_cfg['parameters']['dim']

        self.pooling = self.hydra_cfg['parameters']['pooling']

        OOV_initialized_method = self.hydra_cfg['parameters']['initialize_oov']
        self.is_freeze = self.hydra_cfg['parameters']['freeze'] > 0

        if pretrained_word_vectors:
            pretrained_word_vectors = initialise_word_embeddigns_from_pretrained_embeddings(
                pretrained_w2v,
                self.dictionary,
                OOV_initialized_method,
                rnd=np.random.RandomState(self.seed))
            dim = pretrained_word_vectors.shape[1]
        self.pretrained_word_vectors = pretrained_word_vectors
        self.dim = dim

        self.logger.info('#training_data: {}, #val_data: {}\n'.format(
            len(self.train_set), len(self.val_set)))
        self.logger.info(
            'In training data, the size of word vocab: {} ngram vocab: {}, total: {} \n'
            .format(self.dictionary.size_word_vocab,
                    self.dictionary.size_ngram_vocab,
                    self.dictionary.size_total_vocab))
Example #13
def main(cfg):
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    stream_handler.terminator = ''
    logger.addHandler(stream_handler)

    check_hydra_conf(cfg)

    metric = cfg['parameters']['metric']
    if metric == 'loss':
        direction = 'minimize'
    else:
        direction = 'maximize'

    sampler = TPESampler(seed=cfg['parameters']['seed'], n_startup_trials=2)
    pruner = optuna.pruners.HyperbandPruner(
        max_resource=cfg['parameters']['epochs'])
    study = optuna.create_study(direction=direction,
                                sampler=sampler,
                                pruner=pruner)
    objective = Objective(cfg, logger)
    study.optimize(objective,
                   n_trials=cfg['optuna']['num_trials'],
                   n_jobs=cfg['optuna']['n_jobs'])

    # logging_file
    trial = study.best_trial

    logger.info('\nVal. loss: {:.4f}, Val acc.: {:.1f}%'.format(
        trial.user_attrs['val_loss'], trial.user_attrs['val_acc'] * 100))

    for key, value in trial.params.items():
        logger.info('    {}: {}'.format(key, value))

    # remove poor models
    target = Path(trial.user_attrs['model_path'])
    for path in target.parent.glob('*.pt'):
        if path != target:
            path.unlink()

    # evaluation
    # load test data loader
    working_dir = utils.get_original_cwd() + '/'
    test_set = SentenceDataset(
        *objective.dictionary.transform(working_dir + cfg['dataset']['path'] +
                                        cfg['dataset']['test_fname']),
        objective.dictionary.size_word_vocab,
        train=False)
    test_data_loader = torch.utils.data.dataloader.DataLoader(test_set,
                                                              batch_size=1,
                                                              shuffle=False,
                                                              num_workers=1)

    device = objective.device
    # init model
    model = SupervisedFastText(V=objective.dictionary.size_total_vocab,
                               num_classes=len(
                                   objective.dictionary.label_vocab),
                               embedding_dim=objective.dim,
                               pretrained_emb=None,
                               freeze=True,
                               pooling=objective.pooling).to(device)

    # load model
    model.load_state_dict(torch.load(target, map_location=device))
    model = model.to(device)

    loss, acc = evaluation(model,
                           device,
                           test_data_loader,
                           divide_by_num_data=True)
    results = trial.user_attrs
    results['test_loss'] = loss
    results['test_acc'] = acc

    output_path_fname = os.getcwd() + '/' + cfg['parameters']['logging_file']
    logger.info('Saving training history and evaluation scores in {}'.format(
        output_path_fname))
    with open(output_path_fname, 'w') as log_file:
        json.dump(results, log_file)
Example #14
def main(cfg):
    logger.info("=" * 20)
    # Current Working Directory
    cwd = utils.get_original_cwd()
    cfg.cwd = cwd
    # Model dictionary
    __MODEL__ = {
        'transformer': models.Transformer,
        'rnn': models.RNN,
        'cnn': models.CNN,
        'bert': models.Bert
    }

    # Device
    if cfg.use_GPU and torch.cuda.is_available():
        device = torch.device('cuda', cfg.gpu_id)
    else:
        device = torch.device('cpu')
    logger.info("Device: {}".format(device))

    # Preprocess raw data
    if cfg.preprocess:
        logger.info("Preprocessing...")
        preprocess(cfg)  # Once is quite enough

    # Make dataset
    logger.info("Making Dataset...")

    train_dataset = makeDataset(cfg, train_flag=True)  # len: 100000
    test_dataset = makeDataset(cfg, train_flag=False)  # len: 25000

    # Model
    logger.info("Model:{}".format(cfg.model_name))

    model = __MODEL__[cfg.model_name](cfg).to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=cfg.lr,
                           weight_decay=cfg.weight_decay)
    criterion = nn.CrossEntropyLoss()

    # Train
    if not cfg.test_only:
        logger.info("=" * 5 + " Start %d-fold training! " % cfg.k_fold +
                    "=" * 5)

        k_fold_trainer = KFoldTrainer(cfg, train_dataset, model, device,
                                      optimizer, criterion)

        best_epoch, best_acc_v = -1, -1

        for epoch in range(cfg.EPOCH):
            k_fold_trainer.set_epoch(epoch)
            loss_t, acc_t, loss_v, acc_v = k_fold_trainer.kFoldTrain()
            if (acc_v > best_acc_v):
                best_epoch = epoch
                best_acc_v = acc_v
                torch.save(
                    model,
                    os.path.join(
                        cwd, "{}model_{}.pt".format(cfg.model_dir,
                                                    cfg.model_name)))
                logger.info("Model Updated!")

            logger.info(
                "Current Epoch(%d):\nTrain loss: %.6f Train accuracy: %.2f%% Valid loss: %.6f Valid Accuracy: %.2f%%"
                % (epoch, loss_t, acc_t * 100, loss_v, acc_v * 100))
            logger.info("Best Epoch(%d):\n Accuracy: %.2f%%" %
                        (best_epoch, best_acc_v * 100))

        logger.info("=" * 5 + " Training finished. " + "=" * 5)

    # Test
    logger.info("=" * 5 + " Start testing! " + "=" * 5)
    tester = Tester(cfg, test_dataset, model, device)
    tester.test()
    logger.info("Saving results to {}. ".format(
        os.path.join(cwd, "{}".format(cfg.result_file))))
    logger.info("=" * 5 + " Test finished. " + "=" * 5)
Example #15
def test_get_original_cwd() -> None:
    orig = "/foo/bar"
    cfg = OmegaConf.create({"hydra": {"runtime": {"cwd": orig}}})
    assert isinstance(cfg, DictConfig)
    HydraConfig().set_config(cfg)
    assert utils.get_original_cwd() == orig
Example #16
File: train_util.py Project: r9y9/nnsvs
def setup(config, device, collate_fn=collate_fn_default):
    """Setup for training

    Args:
        config (dict): configuration for training
        device (torch.device): device to use for training
        collate_fn (callable, optional): collate function. Defaults to collate_fn_default.

    Returns:
        (tuple): tuple containing model, optimizer, learning rate scheduler,
            grad scaler, data loaders, tensorboard writer, logger, and scalers.
    """
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    logger.info(f"PyTorch version: {torch.__version__}")

    if torch.cuda.is_available():
        from torch.backends import cudnn

        cudnn.benchmark = config.train.cudnn.benchmark
        cudnn.deterministic = config.train.cudnn.deterministic
        logger.info(f"cudnn.deterministic: {cudnn.deterministic}")
        logger.info(f"cudnn.benchmark: {cudnn.benchmark}")
        if torch.backends.cudnn.version() is not None:
            logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")

    logger.info(f"Random seed: {config.seed}")
    init_seed(config.seed)

    if config.train.use_detect_anomaly:
        torch.autograd.set_detect_anomaly(True)
        logger.info("Set to use torch.autograd.detect_anomaly")

    if "use_amp" in config.train and config.train.use_amp:
        logger.info("Use mixed precision training")
        grad_scaler = GradScaler()
    else:
        grad_scaler = None

    # Model
    model = hydra.utils.instantiate(config.model.netG).to(device)
    logger.info("Number of trainable params: {:.3f} million".format(
        num_trainable_params(model) / 1000000.0))
    logger.info(model)

    # Optimizer
    optimizer_class = getattr(optim, config.train.optim.optimizer.name)
    optimizer = optimizer_class(model.parameters(),
                                **config.train.optim.optimizer.params)

    # Scheduler
    lr_scheduler_class = getattr(optim.lr_scheduler,
                                 config.train.optim.lr_scheduler.name)
    lr_scheduler = lr_scheduler_class(optimizer,
                                      **config.train.optim.lr_scheduler.params)

    # DataLoader
    data_loaders = get_data_loaders(config.data, collate_fn, logger)

    set_epochs_based_on_max_steps_(config.train,
                                   len(data_loaders["train_no_dev"]), logger)

    # Resume
    if (config.train.resume.checkpoint is not None
            and len(config.train.resume.checkpoint) > 0):
        logger.info("Load weights from %s", config.train.resume.checkpoint)
        checkpoint = torch.load(
            to_absolute_path(config.train.resume.checkpoint))
        model.load_state_dict(checkpoint["state_dict"])
        if config.train.resume.load_optimizer:
            logger.info("Load optimizer state")
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state"])

    if config.data_parallel:
        model = nn.DataParallel(model)

    # Mlflow
    if config.mlflow.enabled:
        mlflow.set_tracking_uri("file://" + get_original_cwd() + "/mlruns")
        mlflow.set_experiment(config.mlflow.experiment)
        # NOTE: disable tensorboard if mlflow is enabled
        writer = None
        logger.info("Using mlflow instead of tensorboard")
    else:
        # Tensorboard
        writer = SummaryWriter(to_absolute_path(config.train.log_dir))

    # Scalers
    if "in_scaler_path" in config.data and config.data.in_scaler_path is not None:
        in_scaler = joblib.load(to_absolute_path(config.data.in_scaler_path))
        in_scaler = MinMaxScaler(in_scaler.min_, in_scaler.scale_,
                                 in_scaler.data_min_, in_scaler.data_max_)
    else:
        in_scaler = None
    if "out_scaler_path" in config.data and config.data.out_scaler_path is not None:
        out_scaler = joblib.load(to_absolute_path(config.data.out_scaler_path))
        out_scaler = StandardScaler(out_scaler.mean_, out_scaler.var_,
                                    out_scaler.scale_)
    else:
        out_scaler = None

    return (
        model,
        optimizer,
        lr_scheduler,
        grad_scaler,
        data_loaders,
        writer,
        logger,
        in_scaler,
        out_scaler,
    )
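Note: Examples #16 and #18 mix get_original_cwd() with to_absolute_path(). The latter resolves a relative path against the original launch directory and passes absolute paths through unchanged, which avoids manual string concatenation. A minimal sketch of the difference (the checkpoint path is illustrative only):

import os

from hydra.utils import get_original_cwd, to_absolute_path

ckpt = "checkpoints/best.pth"  # illustrative relative path
# both lines resolve against the directory the app was launched from;
# to_absolute_path() additionally leaves an already-absolute path untouched
resolved_a = to_absolute_path(ckpt)
resolved_b = os.path.join(get_original_cwd(), ckpt)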
Example #17
def test_get_original_cwd(hydra_restore_singletons: Any) -> None:
    orig = "/foo/AClass"
    cfg = OmegaConf.create({"hydra": HydraConf(runtime=RuntimeConf(cwd=orig))})
    assert isinstance(cfg, DictConfig)
    HydraConfig.instance().set_config(cfg)
    assert utils.get_original_cwd() == orig
Example #18
File: train_util.py Project: r9y9/nnsvs
def setup_cyclegan(config, device, collate_fn=collate_fn_default):
    """Setup for training CycleGAN

    Args:
        config (dict): configuration for training
        device (torch.device): device to use for training
        collate_fn (callable, optional): collate function. Defaults to collate_fn_default.

    Returns:
        (tuple): tuple containing the generator models, optimizer and scheduler, the
            discriminator models, optimizer and scheduler, grad scaler, data loaders,
            tensorboard writer, logger, and scalers.
    """
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    logger.info(f"PyTorch version: {torch.__version__}")

    if torch.cuda.is_available():
        from torch.backends import cudnn

        cudnn.benchmark = config.train.cudnn.benchmark
        cudnn.deterministic = config.train.cudnn.deterministic
        logger.info(f"cudnn.deterministic: {cudnn.deterministic}")
        logger.info(f"cudnn.benchmark: {cudnn.benchmark}")
        if torch.backends.cudnn.version() is not None:
            logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")

    logger.info(f"Random seed: {config.seed}")
    init_seed(config.seed)

    if config.train.use_detect_anomaly:
        torch.autograd.set_detect_anomaly(True)
        logger.info("Set to use torch.autograd.detect_anomaly")

    if "use_amp" in config.train and config.train.use_amp:
        logger.info("Use mixed precision training")
        grad_scaler = GradScaler()
    else:
        grad_scaler = None

    # Model G
    netG_A2B = hydra.utils.instantiate(config.model.netG).to(device)
    netG_B2A = hydra.utils.instantiate(config.model.netG).to(device)
    logger.info(
        "[Generator] Number of trainable params: {:.3f} million".format(
            num_trainable_params(netG_A2B) / 1000000.0))
    logger.info(netG_A2B)
    # Optimizer and LR scheduler for G
    optG, schedulerG = _instantiate_optim_cyclegan(config.train.optim.netG,
                                                   netG_A2B, netG_B2A)

    # Model D
    netD_A = hydra.utils.instantiate(config.model.netD).to(device)
    netD_B = hydra.utils.instantiate(config.model.netD).to(device)
    logger.info(
        "[Discriminator] Number of trainable params: {:.3f} million".format(
            num_trainable_params(netD_A) / 1000000.0))
    logger.info(netD_A)
    # Optimizer and LR scheduler for D
    optD, schedulerD = _instantiate_optim_cyclegan(config.train.optim.netD,
                                                   netD_A, netD_B)

    # DataLoader
    data_loaders = get_data_loaders(config.data, collate_fn, logger)

    set_epochs_based_on_max_steps_(config.train,
                                   len(data_loaders["train_no_dev"]), logger)

    # Resume
    # TODO
    # _resume(logger, config.train.resume.netG, netG, optG, schedulerG)
    # _resume(logger, config.train.resume.netD, netD, optD, schedulerD)

    if config.data_parallel:
        netG_A2B = nn.DataParallel(netG_A2B)
        netG_B2A = nn.DataParallel(netG_B2A)
        netD_A = nn.DataParallel(netD_A)
        netD_B = nn.DataParallel(netD_B)

    # Mlflow
    if config.mlflow.enabled:
        mlflow.set_tracking_uri("file://" + get_original_cwd() + "/mlruns")
        mlflow.set_experiment(config.mlflow.experiment)
        # NOTE: disable tensorboard if mlflow is enabled
        writer = None
        logger.info("Using mlflow instead of tensorboard")
    else:
        # Tensorboard
        writer = SummaryWriter(to_absolute_path(config.train.log_dir))

    # Scalers
    if "in_scaler_path" in config.data and config.data.in_scaler_path is not None:
        in_scaler = joblib.load(to_absolute_path(config.data.in_scaler_path))
        if isinstance(in_scaler, SKMinMaxScaler):
            in_scaler = MinMaxScaler(
                in_scaler.min_,
                in_scaler.scale_,
                in_scaler.data_min_,
                in_scaler.data_max_,
            )
    else:
        in_scaler = None
    if "out_scaler_path" in config.data and config.data.out_scaler_path is not None:
        out_scaler = joblib.load(to_absolute_path(config.data.out_scaler_path))
        out_scaler = StandardScaler(out_scaler.mean_, out_scaler.var_,
                                    out_scaler.scale_)
    else:
        out_scaler = None

    return (
        (netG_A2B, netG_B2A, optG, schedulerG),
        (netD_A, netD_B, optD, schedulerD),
        grad_scaler,
        data_loaders,
        writer,
        logger,
        in_scaler,
        out_scaler,
    )
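The mlflow branch above anchors the tracking store at the project root rather than inside the per-run directory, so every run logs into the same mlruns folder. A hedged sketch of that switch in isolation (the experiment name is illustrative):

import mlflow
from hydra.utils import get_original_cwd

def setup_mlflow(experiment_name="cyclegan"):
    # Keep one shared file store under the launch directory instead of a new
    # one per Hydra run directory.
    mlflow.set_tracking_uri("file://" + get_original_cwd() + "/mlruns")
    mlflow.set_experiment(experiment_name)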
예제 #19
def main(cfg):
    cwd = utils.get_original_cwd()
    cfg.cwd = cwd
    cfg.pos_size = 2 * cfg.pos_limit + 2
    logger.info(f'\n{cfg.pretty()}')

    __Model__ = {
        'cnn': models.PCNN,
        'rnn': models.BiLSTM,
        'transformer': models.Transformer,
        'gcn': models.GCN,
        'capsule': models.Capsule,
        'lm': models.LM,
    }

    # device
    if cfg.use_gpu and torch.cuda.is_available():
        device = torch.device('cuda', cfg.gpu_id)
    else:
        device = torch.device('cpu')
    logger.info(f'device: {device}')

    # If the preprocessing procedure itself is not being modified, it is best to comment this out so the data is not re-preprocessed on every run
    if cfg.preprocess:
        preprocess(cfg)

    train_data_path = os.path.join(cfg.cwd, cfg.out_path, 'train.pkl')
    valid_data_path = os.path.join(cfg.cwd, cfg.out_path, 'valid.pkl')
    test_data_path = os.path.join(cfg.cwd, cfg.out_path, 'test.pkl')
    vocab_path = os.path.join(cfg.cwd, cfg.out_path, 'vocab.pkl')

    if cfg.model_name == 'lm':
        vocab_size = None
    else:
        vocab = load_pkl(vocab_path)
        vocab_size = vocab.count
    cfg.vocab_size = vocab_size

    train_dataset = CustomDataset(train_data_path)
    valid_dataset = CustomDataset(valid_data_path)
    test_dataset = CustomDataset(test_data_path)

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=cfg.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn(cfg))
    valid_dataloader = DataLoader(valid_dataset,
                                  batch_size=cfg.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn(cfg))
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=cfg.batch_size,
                                 shuffle=True,
                                 collate_fn=collate_fn(cfg))

    model = __Model__[cfg.model_name](cfg)
    model.to(device)
    logger.info(f'\n {model}')

    optimizer = optim.Adam(model.parameters(),
                           lr=cfg.learning_rate,
                           weight_decay=cfg.weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=cfg.lr_factor,
                                                     patience=cfg.lr_patience)
    criterion = nn.CrossEntropyLoss()

    best_f1, best_epoch = -1, 0
    es_loss, es_f1, es_epoch, es_patience, best_es_epoch, best_es_f1, es_path, best_es_path = 1e8, -1, 0, 0, 0, -1, '', ''
    train_losses, valid_losses = [], []

    if cfg.show_plot and cfg.plot_utils == 'tensorboard':
        writer = SummaryWriter('tensorboard')
    else:
        writer = None

    logger.info('=' * 10 + ' Start training ' + '=' * 10)

    for epoch in range(1, cfg.epoch + 1):
        manual_seed(cfg.seed + epoch)
        train_loss = train(epoch, model, train_dataloader, optimizer,
                           criterion, device, writer, cfg)
        valid_f1, valid_loss = validate(epoch, model, valid_dataloader,
                                        criterion, device, cfg)
        scheduler.step(valid_loss)
        model_path = model.save(epoch, cfg)
        # logger.info(model_path)

        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        if best_f1 < valid_f1:
            best_f1 = valid_f1
            best_epoch = epoch
        # use the validation loss as the early-stopping criterion
        if es_loss > valid_loss:
            es_loss = valid_loss
            es_f1 = valid_f1
            es_epoch = epoch
            es_patience = 0
            es_path = model_path
        else:
            es_patience += 1
            if es_patience >= cfg.early_stopping_patience:
                best_es_epoch = es_epoch
                best_es_f1 = es_f1
                best_es_path = es_path

    if cfg.show_plot:
        if cfg.plot_utils == 'matplot':
            plt.plot(train_losses, 'x-')
            plt.plot(valid_losses, '+-')
            plt.legend(['train', 'valid'])
            plt.title('train/valid comparison loss')
            plt.show()

        if cfg.plot_utils == 'tensorboard':
            for i in range(len(train_losses)):
                writer.add_scalars('train/valid_comparison_loss', {
                    'train': train_losses[i],
                    'valid': valid_losses[i]
                }, i)
            writer.close()

    logger.info(
        f'best(valid loss quota) early stopping epoch: {best_es_epoch}, '
        f'this epoch macro f1: {best_es_f1:0.4f}')
    logger.info(f'this model save path: {best_es_path}')
    logger.info(
        f'total {cfg.epoch} epochs, best(valid macro f1) epoch: {best_epoch}, '
        f'this epoch macro f1: {best_f1:.4f}')

    validate(-1, model, test_dataloader, criterion, device, cfg)
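Storing get_original_cwd() into cfg.cwd once and joining every data path against it is what keeps the pickled artifacts addressable even though Hydra has moved the process into its run directory. A minimal sketch of that helper, with illustrative names:

import os
from hydra import utils

def artifact_path(out_path, filename):
    # cfg.cwd in the example above is simply get_original_cwd() captured once.
    return os.path.join(utils.get_original_cwd(), out_path, filename)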
예제 #20
def train(cfg):
    logger = get_logger("./log/")
    # lossutil = LossUtil("./log/")
    logger.info(cfg)
    ROOT = Path(utils.get_original_cwd()).parent
    print(ROOT)
    data_path = ROOT.joinpath("datasets/hsimg/data.tiff")
    himg = imread(str(data_path))
    himg = himg / 2**16
    # Restrict the spectral image to the wavelength range of the light source
    himg = himg[:, :, :44]
    himg = np.where(himg < 0, 0, himg)
    nhimg = np.empty((512, 512, himg.shape[2]))
    for i in range(himg.shape[2]):
        logger.info("resize %d channel..." % i)
        ch = himg[:, :, i]
        nhimg[:, :, i] = cv2.resize(255 * ch, (512, 512),
                                    interpolation=cv2.INTER_LANCZOS4)
        logger.info("resize  %d channel... done!" % i)
    himg = nhimg / 255
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if cfg.image.type == "denoise":
        sigma = 0.1
        himg = get_noisy_img(himg, sigma)
        mask = None
    elif "inpaint" in cfg.image.type:
        mask = make_mask_himg(cfg, himg)
        logger.info(mask.dtype)
        mask = torch.from_numpy(mask)
        mask = mask[None, :].permute(0, 3, 1, 2).to(device)
        logger.info(mask.shape)
        logger.info(mask.dtype)

    else:
        raise NotImplementedError('[%s] is not Implemented' % cfg.image.type)

    transform = transforms.Compose([transforms.ToTensor()])
    himg = transform(himg)
    himg = himg[None, :].float().to(device)

    dist_path = ROOT.joinpath("datasets/csvs/D65.csv")
    cmf_path = ROOT.joinpath("datasets/csvs/CIE1964-10deg-XYZ.csv")
    tiff2rgb = Tiff2rgb(dist_path, cmf_path)

    # Save the target image
    if cfg.image.type == "denoise":
        img = tiff2rgb.tiff_to_rgb(himg[0].permute(1, 2, 0))
    else:
        tmp = himg * mask
        logger.info(tmp.shape)
        logger.info(tmp.dtype)
        img = tiff2rgb.tiff_to_rgb(tmp[0].permute(1, 2, 0))
    result_dir = Path("./result_imgs/")
    if not result_dir.exists():
        Path(result_dir).mkdir(parents=True)
    img.save(result_dir.joinpath("target.png"))

    logger.info(himg.shape)
    logger.info(himg.dtype)
    np.random.seed(cfg.base_options.seed)
    torch.manual_seed(cfg.base_options.seed)
    torch.cuda.manual_seed_all(cfg.base_options.seed)
    torch.backends.cudnn.benchmark = False

    model = DIP(cfg, himg, mask)
    logger.info(model.generator)

    if cfg.base_options.debugging:
        summary(model.generator, (1, 512, 512))
        # summary(model.generator, (3, 256, 256))
        print('Debugging mode: exiting early!!!')
        exit(0)

    input_noise = torch.randn(1,
                              1,
                              himg.shape[2],
                              himg.shape[3],
                              device=device)
    logger.info(input_noise.dtype)
    for epoch in range(cfg.base_options.epochs):
        if epoch % cfg.base_options.print_freq == 0:
            epoch_start_time = time.time()

        if not cfg.base_options.do_fix_noise:
            input_noise = torch.randn(1,
                                      1,
                                      himg.shape[2],
                                      himg.shape[3],
                                      dtype=torch.float32,
                                      device=device)
        model.forward(input_noise)
        # logger.info(model.gimg.shape)
        # logger.info(model.gimg.dtype)
        model.optimize_parameters()
        # model.update_learning_rate()
        losses = model.get_current_losses()

        if epoch % cfg.base_options.print_freq == 0:
            num = cfg.base_options.print_freq
            t1 = (time.time() - epoch_start_time)
            t2 = float(t1) / num
            print("%dエポックの所要時間%.3f 平均時間%.3f" % (num, t1, t2))
            print_losses(epoch, losses)

        if cfg.base_options.save_model_freq != -1 and epoch % cfg.base_options.save_model_freq == 0:
            model.save_networks(epoch)
            model.save_losses()
        if epoch % cfg.base_options.save_img_freq == 0:
            new_img = model.gimg.detach()[0]
            # CHW -> HWC
            new_img = new_img.permute(1, 2, 0)
            # print(new_img.shape, new_img.device, new_img.dtype)
            img = tiff2rgb.tiff_to_rgb(new_img)
            img.save(result_dir.joinpath("%05d.png" % epoch))

    if cfg.base_options.save_model_freq != -1:
        model.save_networks("finish")
    model.save_losses()
    # Make a video of the experiment results
    freq = cfg.base_options.save_img_freq
    epochs = cfg.base_options.epochs
    result_imgs = [
        "./result_imgs/%05d.png" % (epoch) for epoch in range(epochs)
        if epoch % freq == 0
    ]
    make_video("./", result_imgs, width=himg.shape[3], height=himg.shape[2])
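Here the dataset root is resolved one level above the launch directory, which assumes the datasets/ folder sits next to the project that starts the Hydra job. A short sketch of that resolution, with the directory layout treated as an assumption:

from pathlib import Path
from hydra import utils

def dataset_file(relative):
    # Assumes <parent>/<project>/ is the launch dir and <parent>/datasets/ holds the data.
    root = Path(utils.get_original_cwd()).parent
    return root.joinpath("datasets", relative)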
예제 #21
def main(cfg):
    cwd = utils.get_original_cwd()
    cfg.cwd = cwd
    cfg.pos_size = 2 * cfg.pos_limit + 2
    print(cfg.pretty())
   
    with open('res_sample0227_v1.csv','w') as csvf:
        writer = csv.writer(csvf)
        writer.writerow(["sentence","relation","head","head_offset","tail","tail_offset","prob","stock_name","stock_code","industry_code"])
    path = '/data/prj2020/EnterpriseSpider/news/enty0126_all.csv'
    # enty_df = pd.read_csv(path).iloc[33376:,:]
    enty_df = pd.read_csv(path)
    all_data = []
    all_rels = []
    all_pred = []
    all_prob = []
    oom_times = 0
    for index, row in enty_df.iterrows():
        # get predict instance
        print(row)
        instance = {
            'sentence': row['sentence'],
            'head': row['head'],
            'tail': row['tail'],
            'head_type': '企业',
            'tail_type': '企业',
        }
        data = [instance]
        # preprocess data
        data, rels = _preprocess_data(data, cfg)
         # model
        __Model__ = {
            'cnn': models.PCNN,
            'rnn': models.BiLSTM,
            'transformer': models.Transformer,
            'gcn': models.GCN,
            'capsule': models.Capsule,
            'lm': models.LM,
        }
        # prediction is best done on the CPU
        # cfg.use_gpu = False
        if cfg.use_gpu and torch.cuda.is_available():
            device = torch.device('cuda', cfg.gpu_id)
        else:
            device = torch.device('cpu')
        # logger.info(f'device: {device}')

        model = __Model__[cfg.model_name](cfg)
        # logger.info(f'model name: {cfg.model_name}')
        # logger.info(f'\n {model}')
        model.load(fp, device=device)
        model.to(device)
        model.eval()
        all_data.append(data)
        all_rels.append(rels)

        x = dict()
        x['word'], x['lens'] = torch.tensor([data[0]['token2idx']]), torch.tensor([data[0]['seq_len']])
        if cfg.model_name != 'lm':
            x['head_pos'], x['tail_pos'] = torch.tensor([data[0]['head_pos']]), torch.tensor([data[0]['tail_pos']])
            if cfg.model_name == 'cnn':
                if cfg.use_pcnn:
                    x['pcnn_mask'] = torch.tensor([data[0]['entities_pos']])

        for key in x.keys():
            x[key] = x[key].to(device)
        try:
            with torch.no_grad():
                y_pred = model(x)
                y_pred = torch.softmax(y_pred, dim=-1)[0]
                prob = y_pred.max().item()
                prob_rel = list(rels.keys())[y_pred.argmax().item()]
                all_pred.append(prob_rel)
                all_prob.append(prob)
                logger.info(f"\"{data[0]['head']}\" 和 \"{data[0]['tail']}\" 在句中关系为:\"{prob_rel}\",置信度为{prob:.2f}。")
                with open('res_sample0227_v1.csv','a+') as csvf:
                    writer = csv.writer(csvf)
                    writer.writerow([row['sentence'],prob_rel,row['head'],row['head_offset'],row['tail'],row['tail_offset'],prob,row['stock_name'],row['stock_code'],row['industry_code']])
        except RuntimeError as exception:
            if "out of memory" in str(exception):
                oom_times += 1
                logger.info("WARNING:ran out of memory,times:{}".format(oom_times))
                if hasattr(torch.cuda,'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                logger.info(str(exception))
                raise exception
    all_res = {
        'sentence':enty_df['sentence'],
        'relation':all_pred,
        'head':enty_df['head'],
        'head_pos':enty_df['head_offset'],
        'tail':enty_df['tail'],
        'tail_pos':enty_df['tail_offset'],
        'prob':all_prob,
        "stock_name":enty_df['stock_name'],
        "stock_code":enty_df['stock_code'],
        "industry_code":enty_df['industry_code']
    }   
    all_res_df = pd.DataFrame(all_res)
    all_res_df.to_csv('res_sample0227_v101.csv',index = False,header=1)
예제 #22
def train(dataset_cfg, model_cfg, training_cfg):
    image_root_folder = os.path.join(utils.get_original_cwd(),
                                     dataset_cfg.image_root_folder)

    img_transform = transforms.Compose([
        transforms.Resize((dataset_cfg.img_size, dataset_cfg.img_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    dataset = CustomImageFolder(image_root_folder,
                                transform=img_transform,
                                sample_size=dataset_cfg.sample_size)

    model = models.__dict__[model_cfg.name](
        num_classes=training_cfg.n_clusters, initialize=model_cfg.initialize)

    model, already_trained_epoch = checkpoint_utils.load_latest_checkpoint(
        model, training_cfg.checkpoint, use_gpu)
    if use_gpu:
        model = model.cuda()
    deep_kmeans = DeepKmeans(n_clusters=training_cfg.n_clusters)

    model.train()
    criterion = nn.CrossEntropyLoss()
    if training_cfg.optimizer.name == 'sgd':
        optimizer = torch.optim.SGD(model.module.parameters(),
                                    lr=training_cfg.optimizer.lr,
                                    momentum=training_cfg.optimizer.momentum,
                                    weight_decay=10**training_cfg.optimizer.wd)
    else:
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=training_cfg.optimizer.lr,
            weight_decay=10**training_cfg.optimizer.wd)

    losses = AverageMeter()
    os.makedirs(os.path.dirname(training_cfg.log_file), exist_ok=True)

    for epoch in range(already_trained_epoch, training_cfg.num_epochs):
        features = extract_features(model,
                                    dataset,
                                    batch_size=training_cfg.batch_size)
        features = apply_dimensionality_reduction(
            features, pca_components=training_cfg.pca.component_size)

        pseudo_labels, kmeans_loss, nmi_previous = deep_kmeans.cluster(
            features)
        if not training_cfg.use_original_labels:
            dataset.set_pseudo_labels(pseudo_labels)
        nmi_orj = normalized_mutual_info_score(dataset.ori_labels,
                                               dataset.targets)
        logger.info('NMI against original assignment: {0:.3f}'.format(nmi_orj))
        acc, informational_acc, category_mapping = calculate_accuracy(
            dataset.ori_labels, dataset.targets)
        logger.info('Classification Acc:%s\tInformational Acc:%s\n' %
                    (acc, informational_acc))
        if training_cfg.reinitialize:
            logger.debug('Reinitializing FC')
            model.reinitialize_fc()
        optimizer_tl = torch.optim.SGD(
            model.fc.parameters(),
            lr=training_cfg.optimizer.lr,
            momentum=training_cfg.optimizer.momentum,
            weight_decay=10**training_cfg.optimizer.wd,
        )
        if use_gpu:
            model.cuda()
        model.train()
        sampler = UnifLabelSampler(N=int(len(dataset) * training_cfg.reassign),
                                   images_lists=dataset.targets,
                                   cluster_size=training_cfg.n_clusters)
        dataloader = DataLoader(dataset,
                                batch_size=training_cfg.batch_size,
                                shuffle=False,
                                num_workers=4,
                                drop_last=False,
                                sampler=sampler)
        logger.info('Epoch [{}/{}] started'.format(epoch,
                                                   training_cfg.num_epochs))
        for data in tqdm(dataloader,
                         total=int(
                             len(dataset) * training_cfg.reassign /
                             training_cfg.batch_size)):
            img, y, _ = data
            if use_gpu:
                img = Variable(img).cuda(non_blocking=True)
                y = y.cuda(non_blocking=True)
            # ===================forward=====================
            y_hat = model(img)
            loss = criterion(y_hat, y)
            # record loss
            losses.add({'loss_%s' % epoch: loss.item() / img.size(0)})
            # ===================backward====================
            optimizer.zero_grad()
            optimizer_tl.zero_grad()
            loss.backward()
            optimizer.step()
            optimizer_tl.step()
        # ===================log========================
        correct = 0
        total = 0
        with torch.no_grad():
            dataloader = DataLoader(dataset,
                                    batch_size=training_cfg.batch_size,
                                    shuffle=False,
                                    num_workers=4,
                                    drop_last=True)
            for data in dataloader:
                img, y, _ = data
                if use_gpu:
                    img = Variable(img).cuda(non_blocking=True)
                    y = y.cuda(non_blocking=True)
                y_hat = model(img)
                _, predicted = torch.max(y_hat.data, 1)
                total += y.size(0)
                correct += (predicted == y).sum().item()

        log = 'Epoch [%s/%s],\tLoss:%s,\tKmeans loss:%s\t' \
              'Acc:%s\tInformational acc:%s\tNetwork Acc:%s\tNMI Orj:%s\tNMI Previous:%s\n' % (
                  epoch, training_cfg.num_epochs,
                  losses.get('loss_%s' % epoch),
                  kmeans_loss, acc, informational_acc,
                  (100 * correct / total), nmi_orj, nmi_previous)
        logger.info(log)
        with open(training_cfg.log_file, mode='a') as f:
            f.write(log)
        if epoch % 5 == 0:
            checkpoint_utils.save_checkpoint(model, training_cfg.checkpoint,
                                             epoch)
    if use_gpu:
        torch.cuda.empty_cache()
예제 #23
    def _call_children_scripts(self):
        # bookkeeping of spawned processes
        self._check_can_spawn_children()

        # DDP Environment variables
        os.environ["MASTER_ADDR"] = self.cluster_environment.main_address
        os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port)

        # allow the user to pass the node rank
        os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank())
        os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank())

        # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c`
        # See https://docs.python.org/3/reference/import.html#main-spec
        if __main__.__spec__ is None:  # pragma: no-cover
            # Script called as `python a/b/c.py`
            # when user is using hydra find the absolute path
            path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path

            # pull out the commands used to run the script and resolve the abs file path
            command = sys.argv
            try:
                full_path = path_lib(command[0])
            except Exception:
                full_path = os.path.abspath(command[0])

            command[0] = full_path
            # use the same python interpreter and actually running
            command = [sys.executable] + command
        else:  # Script called as `python -m a.b.c`
            command = [sys.executable, "-m", __main__.__spec__.name
                       ] + sys.argv[1:]

        # the visible devices tell us how many GPUs we want to use.
        # when the trainer script was called the device has already been scoped by the time
        # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone
        # but forward the GPUs selected via environment variables
        if self.parallel_devices is None:
            raise MisconfigurationException(
                "you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)"
            )

        os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}"

        self.interactive_ddp_procs = []

        for local_rank in range(1, self.num_processes):
            env_copy = os.environ.copy()
            env_copy["LOCAL_RANK"] = f"{local_rank}"

            # remove env var if global seed not set
            if os.environ.get(
                    "PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy:
                del env_copy["PL_GLOBAL_SEED"]

            # start process
            # if hydra is available and initialized, make sure to set the cwd correctly
            cwd: Optional[str] = None
            if _HYDRA_AVAILABLE:
                if HydraConfig.initialized():
                    cwd = get_original_cwd()
                    os_cwd = f'"{os.getcwd()}"'
                    command += [
                        f"hydra.run.dir={os_cwd}",
                        f"hydra.job.name=train_ddp_process_{local_rank}"
                    ]
            proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
            self.interactive_ddp_procs.append(proc)

            # starting all processes at once can cause issues
            # with dataloaders delay between 1-10 seconds
            delay = np.random.uniform(1, 5, 1)[0]
            sleep(delay)

        self._rank_0_has_called_call_children_scripts = True
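When Hydra is active, the parent pins each child process to its own run directory through command-line overrides, so all ranks write into one output folder instead of each child creating a fresh run dir. A hedged, stripped-down sketch of that relaunch step (names are illustrative, error handling omitted):

import os
import subprocess
import sys

def launch_child_rank(local_rank):
    env = os.environ.copy()
    env["LOCAL_RANK"] = str(local_rank)
    # Re-run the same script with overrides pinning the child to the parent's run dir.
    command = [sys.executable] + sys.argv + [
        f'hydra.run.dir="{os.getcwd()}"',
        f"hydra.job.name=train_ddp_process_{local_rank}",
    ]
    return subprocess.Popen(command, env=env)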
예제 #24
def main(cfg: DictConfig) -> None:
    print("Params: \n")
    print(OmegaConf.to_yaml(cfg))
    time.sleep(10)

    best_acc = 0
    start_epoch = 0
    working_dir = os.path.join(get_original_cwd(), cfg.output_dir,
                               cfg.train_id)
    os.makedirs(working_dir, exist_ok=True)
    writer = SummaryWriter(working_dir)

    # Setup data.
    # --------------------
    print('=> Preparing data..')
    trainloader, testloader = utils.get_dataloaders(
        dataset=cfg.dataset.name,
        batch_size=cfg.dataset.batch_size,
        data_root=cfg.dataset.data_root)

    net = setup_network(cfg.dataset.name, cfg.dataset.arch)
    net = tweak_network(net,
                        bit=cfg.quantizer.bit,
                        train_conf=cfg.train_conf,
                        quant_mode=cfg.quant_mode,
                        arch=cfg.dataset.arch,
                        cfg=cfg)
    net = net.to(device)

    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    print(net)
    print("Number of learnable parameters: ",
          sum(p.numel() for p in net.parameters() if p.requires_grad) / 1e6,
          "M")
    time.sleep(5)
    load_checkpoint(net, init_from=cfg.dataset.init_from)
    params = create_train_params(model=net,
                                 main_wd=cfg.quantizer.wd,
                                 delta_wd=0,
                                 skip_keys=['.delta', '.alpha'],
                                 verbose=cfg.verbose)
    criterion = nn.CrossEntropyLoss()

    # Setup optimizer
    # ----------------------------
    if cfg.quantizer.optimizer == 'sgd':
        print("=> Use SGD optimizer")
        optimizer = optim.SGD(params,
                              lr=cfg.quantizer.lr,
                              momentum=0.9,
                              weight_decay=cfg.quantizer.wd)

    elif cfg.quantizer.optimizer == 'adam':
        print("=> Use Adam optimizer")
        optimizer = optim.Adam(params,
                               lr=cfg.quantizer.lr,
                               weight_decay=cfg.quantizer.wd)

    lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=cfg.dataset.epochs)

    if cfg.evaluate:
        print("==> Start evaluating ...")
        test(net, testloader, criterion, -1)
        exit()

    # -----------------------------------------------
    # Reset to 'warmup_lr' if we are using warmup strategy.
    if cfg.quantizer.enable_warmup:
        assert cfg.quantizer.bit == 1
        for param_group in optimizer.param_groups:
            param_group['lr'] = cfg.quantizer.warmup_lr

    # Initialization
    # ------------------------------------------------
    if cfg.quantizer.bit != 32 and "quan" in cfg.train_conf:
        simple_initialization(net,
                              trainloader,
                              num_batches=cfg.dataset.num_calibration_batches,
                              train_conf=cfg.train_conf)

    # Training
    # -----------------------------------------------
    save_checkpoint_epochs = list(range(10))

    for epoch in range(start_epoch, cfg.dataset.epochs):
        train_loss, train_acc1 = train(net,
                                       optimizer,
                                       trainloader,
                                       criterion,
                                       epoch,
                                       cfg=cfg)
        test_loss, test_acc1, curr_acc = test(net, testloader, criterion,
                                              epoch)

        # Save checkpoint.
        if curr_acc > best_acc:
            best_acc = curr_acc
            utils.save_checkpoint(net,
                                  lr_scheduler,
                                  optimizer,
                                  curr_acc,
                                  epoch,
                                  filename=os.path.join(
                                      working_dir, 'ckpt_best.pth'))
            print('Saving..')
            print('Best accuracy: ', best_acc)

        if lr_scheduler is not None:
            lr_scheduler.step()

        write_metrics(writer, epoch, net, optimizer, train_loss, train_acc1,
                      test_loss, test_acc1, prefix="Standard_Training")

    print('Best accuracy: ', best_acc)
예제 #25
    def spawn_ddp_children(self, model):
        port = os.environ['MASTER_PORT']

        master_address = '127.0.0.1' if 'MASTER_ADDR' not in os.environ else os.environ['MASTER_ADDR']
        os.environ['MASTER_PORT'] = f'{port}'
        os.environ['MASTER_ADDR'] = f'{master_address}'

        # allow the user to pass the node rank
        node_rank = '0'
        if 'NODE_RANK' in os.environ:
            node_rank = os.environ['NODE_RANK']
        if 'GROUP_RANK' in os.environ:
            node_rank = os.environ['GROUP_RANK']

        os.environ['NODE_RANK'] = node_rank
        os.environ['LOCAL_RANK'] = '0'

        # when user is using hydra find the absolute path
        path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path

        # pull out the commands used to run the script and resolve the abs file path
        command = sys.argv
        try:
            full_path = path_lib(command[0])
        except Exception as e:
            full_path = abspath(command[0])

        command[0] = full_path
        # use the same python interpreter and actually running
        command = [sys.executable] + command

        # since this script sets the visible devices we replace the gpus flag with a number
        num_gpus = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))

        if '--gpus' in command:
            gpu_flag_idx = command.index('--gpus')
            command[gpu_flag_idx + 1] = f'{num_gpus}'

        os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'

        self.trainer.interactive_ddp_procs = []
        for local_rank in range(1, self.trainer.num_processes):
            env_copy = os.environ.copy()
            env_copy['LOCAL_RANK'] = f'{local_rank}'

            # start process
            # if hydra is available and initialized, make sure to set the cwd correctly
            cwd: Optional[str] = None
            if HYDRA_AVAILABLE:
                if HydraConfig.initialized():
                    cwd = get_original_cwd()
            proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
            self.trainer.interactive_ddp_procs.append(proc)

            # starting all processes at once can cause issues
            # with dataloaders delay between 1-10 seconds
            delay = np.random.uniform(1, 5, 1)[0]
            sleep(delay)

        local_rank = 0
        results = self.ddp_train(local_rank, mp_queue=None, model=model, is_master=True)
        del os.environ['WORLD_SIZE']

        return results
예제 #26
def my_app(_: DictConfig) -> None:
    run_dir = str(Path.cwd().relative_to(get_original_cwd()))
    time.sleep(2)
    run_dir_after_sleep = str(Path(HydraConfig.get().run.dir))
    assert run_dir == run_dir_after_sleep
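The assertion above only holds when hydra.run.dir is configured as a path relative to the launch directory; if the run dir resolves outside the launch directory, relative_to() raises ValueError. A defensive variant, kept as a sketch:

from pathlib import Path
from typing import Optional

from hydra.utils import get_original_cwd

def run_dir_relative_to_launch() -> Optional[str]:
    try:
        return str(Path.cwd().relative_to(get_original_cwd()))
    except ValueError:
        # hydra.run.dir points outside the launch directory
        return None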
예제 #27
def main(cfg):
    cwd = get_original_cwd()
    os.chdir(cwd)
    if not os.path.exists(f"data/{cfg.model_name_or_path}.pt"):
        get_label_word(cfg)
    if not os.path.exists(cfg.data_dir):
        generate_k_shot(cfg.data_dir)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    data = REDataset(cfg)
    data_config = data.get_data_config()

    config = AutoConfig.from_pretrained(cfg.model_name_or_path)
    config.num_labels = data_config["num_labels"]

    model = AutoModelForMaskedLM.from_pretrained(cfg.model_name_or_path,
                                                 config=config)

    # if torch.cuda.device_count() > 1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     model = torch.nn.DataParallel(model, device_ids = list(range(torch.cuda.device_count())))

    model.to(device)

    lit_model = BertLitModel(args=cfg, model=model, tokenizer=data.tokenizer)
    data.setup()

    if cfg.train_from_saved_model != '':
        model.load_state_dict(
            torch.load(cfg.train_from_saved_model)["checkpoint"])
        print("load saved model from {}.".format(cfg.train_from_saved_model))
        lit_model.best_f1 = torch.load(cfg.train_from_saved_model)["best_f1"]
    #data.tokenizer.save_pretrained('test')

    optimizer = lit_model.configure_optimizers()
    if cfg.train_from_saved_model != '':
        optimizer.load_state_dict(
            torch.load(cfg.train_from_saved_model)["optimizer"])
        print("load saved optimizer from {}.".format(
            cfg.train_from_saved_model))

    num_training_steps = len(data.train_dataloader(
    )) // cfg.gradient_accumulation_steps * cfg.num_train_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_training_steps * 0.1,
        num_training_steps=num_training_steps)
    log_step = 100

    logging(cfg.log_dir, '-' * 89, print_=False)
    logging(cfg.log_dir,
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) +
            ' INFO : START TO TRAIN ',
            print_=False)
    logging(cfg.log_dir, '-' * 89, print_=False)

    for epoch in range(cfg.num_train_epochs):
        model.train()
        num_batch = len(data.train_dataloader())
        total_loss = 0
        log_loss = 0
        for index, train_batch in enumerate(tqdm(data.train_dataloader())):
            loss = lit_model.training_step(train_batch, index)
            total_loss += loss.item()
            log_loss += loss.item()
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            if log_step > 0 and (index + 1) % log_step == 0:
                cur_loss = log_loss / log_step
                logging(
                    cfg.log_dir,
                    '| epoch {:2d} | step {:4d} | lr {} | train loss {:5.3f}'.
                    format(epoch, (index + 1), scheduler.get_last_lr(),
                           cur_loss * 1000),
                    print_=False)
                log_loss = 0
        avrg_loss = total_loss / num_batch
        logging(
            cfg.log_dir, '| epoch {:2d} | train loss {:5.3f}'.format(
                epoch, avrg_loss * 1000))

        model.eval()
        with torch.no_grad():
            val_loss = []
            for val_index, val_batch in enumerate(tqdm(data.val_dataloader())):
                loss = lit_model.validation_step(val_batch, val_index)
                val_loss.append(loss)
            f1, best, best_f1 = lit_model.validation_epoch_end(val_loss)
            logging(cfg.log_dir, '-' * 89)
            logging(cfg.log_dir,
                    '| epoch {:2d} | dev_result: {}'.format(epoch, f1))
            logging(cfg.log_dir, '-' * 89)
            logging(cfg.log_dir, '| best_f1: {}'.format(best_f1))
            logging(cfg.log_dir, '-' * 89)
            if cfg.save_path != "" and best != -1:
                save_path = cfg.save_path
                torch.save(
                    {
                        'epoch': epoch,
                        'checkpoint': model.state_dict(),
                        'best_f1': best_f1,
                        'optimizer': optimizer.state_dict()
                    },
                    save_path,
                    _use_new_zipfile_serialization=False)
                logging(cfg.log_dir,
                        '| successfully save model at: {}'.format(save_path))
                logging(cfg.log_dir, '-' * 89)
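This example restores the launch directory with os.chdir(get_original_cwd()) so bare relative paths such as data/... keep working; the trade-off is that anything later written with a relative path no longer lands in Hydra's per-run output directory. The pattern in isolation, as a sketch:

import os
from hydra.utils import get_original_cwd

def chdir_to_launch_dir():
    # After this call, relative data paths resolve as they did before Hydra
    # switched into the run directory.
    cwd = get_original_cwd()
    os.chdir(cwd)
    return cwd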
예제 #28
def train(args):
    os.chdir(get_original_cwd())
    run_name = 'each' if args.each else 'agg'
    run_name += '_submit' if args.submit else '_cv'
    logging.info('start ' + run_name)
    seed_everything(args.seed)
    if args.each:
        v_sales_dict = joblib.load(
            '../data/05_preprocess/each_item/v_sales_dict.joblib')
        data_count = joblib.load(
            '../data/05_preprocess/each_item/data_count.joblib')
        dims = joblib.load('../data/05_preprocess/each_item/dims.joblib')
        weight = joblib.load('../data/06_weight/weight_each.joblib')
        te = joblib.load('../data/07_te/each_te.joblib')
    else:
        v_sales_dict = joblib.load(
            '../data/05_preprocess/agg_item/v_sales_dict.joblib')
        data_count = joblib.load(
            '../data/05_preprocess/agg_item/data_count.joblib')
        dims = joblib.load('../data/05_preprocess/agg_item/dims.joblib')
        weight = joblib.load('../data/06_weight/weight_agg.joblib')
        te = joblib.load('../data/07_te/agg_te.joblib')
    v_sales = next(iter(v_sales_dict.values()))
    drop_columns = [
        'sort_key', 'id', 'cat_id', 'd', 'release_date', 'date', 'weekday',
        'year', 'week_of_month', 'holidy'
    ]
    if not args.use_prices:
        drop_columns += [
            'release_ago',
            'sell_price',
            'diff_price',
            'price_max',
            'price_min',
            'price_std',
            'price_mean',
            'price_trend',
            'price_norm',
            'diff_price_norm',
            'price_nunique',
            'dept_max',
            'dept_min',
            'dept_std',
            'dept_mean',
            'price_in_dept',
            'mean_in_dept',
            'cat_max',
            'cat_min',
            'cat_std',
            'cat_mean',
            'price_in_cat',
            'mean_in_cat',
            'price_in_month',
            'price_in_year',
        ]
    cat_columns = [
        'aggregation_level', 'item_id', 'dept_id', 'store_id', 'state_id',
        'month', 'event_name_1', 'event_type_1', 'event_name_2',
        'event_type_2', 'day_of_week'
    ]
    features = [
        col for col in v_sales.columns if col not in drop_columns + [TARGET]
    ]
    is_cats = [col in cat_columns for col in features]
    cat_dims = []
    emb_dims = []
    for col in features:
        if col in cat_columns:
            cat_dims.append(dims['cat_dims'][col])
            emb_dims.append(dims['emb_dims'][col])
    dims = pd.DataFrame({'cat_dims': cat_dims, 'emb_dims': emb_dims})
    logging.info('data loaded')

    if args.submit:
        logging.info('train for submit')
        # train model for submission
        index = 1 if args.useval else 2
        valid_term = 2
        train_index = index if args.patience == 0 else (index + valid_term)
        trainset = M5Dataset(v_sales_dict,
                             data_count,
                             features,
                             weight,
                             te,
                             remove_last4w=train_index,
                             min_data_4w=0,
                             over_sample=args.over_sample)
        validset = M5ValidationDataset(trainset.data_dict,
                                       weight,
                                       te,
                                       remove_last4w=index,
                                       term=valid_term)
        train_loader = torch.utils.data.DataLoader(
            trainset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.num_workers,
            worker_init_fn=get_worker_init_fn(args.seed))
        valid_loader = torch.utils.data.DataLoader(
            validset,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=args.num_workers)
        model = M5MLPLSTMModel(is_cats,
                               dims,
                               n_hidden=args.n_hidden,
                               dropout=args.dropout,
                               use_te=args.use_te)
        criterion = M5Distribution(dist=args.dist, df=args.df)
        module = M5LightningModule(model, criterion, train_loader,
                                   valid_loader, None, args)
        trainer = M5Trainer(args.experiment, run_name, args.max_epochs,
                            args.min_epochs, args.patience, args.val_check)
        trainer.fit(module)
        trainer.logger.experiment.log_artifact(
            trainer.logger.run_id, trainer.checkpoint_callback.kth_best_model)

        logging.info('predict')
        module.load_state_dict(
            torch.load(
                trainer.checkpoint_callback.kth_best_model)['state_dict'])
        # for reproducibility
        dmp_filename = '../data/cuda_rng_state_each.dmp' if args.each else '../data/cuda_rng_state_agg.dmp'
        torch.save(torch.cuda.get_rng_state(), dmp_filename)
        trainer.logger.experiment.log_artifact(trainer.logger.run_id,
                                               dmp_filename)
        val_acc, val_unc = predict(args,
                                   module,
                                   criterion,
                                   trainset.data_dict,
                                   weight,
                                   te,
                                   evaluation=False)
        eva_acc, eva_unc = predict(args,
                                   module,
                                   criterion,
                                   trainset.data_dict,
                                   weight,
                                   te,
                                   evaluation=True)
        submission_accuracy = pd.concat([val_acc, eva_acc])
        submission_uncertainty = pd.concat([val_unc, eva_unc])
        dump(submission_accuracy, submission_uncertainty, run_name)

    else:
        # local CV
        folds = list(range(3, -1, -1))  # [3, 2, 1, 0]
        for fold in folds:
            logging.info(f'train FOLD [{4-fold}/{len(folds)}]')
            valid_term = 2
            if args.patience == 0:
                train_index = (fold + 1) * valid_term + 1
                valid_index = (fold + 1) * valid_term + 1
                test_index = fold * valid_term + 1
            else:
                train_index = (fold + 2) * valid_term + 1
                valid_index = (fold + 1) * valid_term + 1
                test_index = fold * valid_term + 1
            trainset = M5Dataset(v_sales_dict,
                                 data_count,
                                 features,
                                 weight,
                                 te,
                                 remove_last4w=train_index,
                                 over_sample=args.over_sample)
            validset = M5ValidationDataset(trainset.data_dict,
                                           weight,
                                           te,
                                           remove_last4w=valid_index,
                                           term=valid_term)
            testset = M5TestDataset(trainset.data_dict,
                                    weight,
                                    te,
                                    remove_last4w=test_index,
                                    term=valid_term)
            train_loader = torch.utils.data.DataLoader(
                trainset,
                batch_size=args.batch_size,
                shuffle=True,
                num_workers=args.num_workers,
                worker_init_fn=get_worker_init_fn(args.seed))
            valid_loader = torch.utils.data.DataLoader(
                validset,
                batch_size=args.batch_size,
                shuffle=False,
                num_workers=args.num_workers)
            test_loader = torch.utils.data.DataLoader(
                testset,
                batch_size=args.batch_size,
                shuffle=False,
                num_workers=args.num_workers)
            model = M5MLPLSTMModel(is_cats,
                                   dims,
                                   n_hidden=args.n_hidden,
                                   dropout=args.dropout,
                                   use_te=args.use_te)
            criterion = M5Distribution(dist=args.dist, df=args.df)
            module = M5LightningModule(model, criterion, train_loader,
                                       valid_loader, test_loader, args)
            fold_name = f'_{4-fold}-{len(folds)}'
            trainer = M5Trainer(args.experiment, run_name + fold_name,
                                args.max_epochs, args.min_epochs,
                                args.patience, args.val_check)
            trainer.fit(module)
            trainer.logger.experiment.log_artifact(
                trainer.logger.run_id,
                trainer.checkpoint_callback.kth_best_model)

            logging.info(f'test FOLD [{4-fold}/{len(folds)}]')
            module.load_state_dict(
                torch.load(
                    trainer.checkpoint_callback.kth_best_model)['state_dict'])
            trainer.test()
            del trainset, validset, testset, train_loader, valid_loader, test_loader, model, criterion, module, trainer
            gc.collect()
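The training DataLoaders above pass worker_init_fn=get_worker_init_fn(args.seed) so that DataLoader worker processes are seeded deterministically; that helper is defined elsewhere in the project. A typical implementation of the idea, offered only as an assumption about what it does:

import random

import numpy as np
import torch

def make_worker_init_fn(base_seed):
    def _init(worker_id):
        # Give each DataLoader worker its own deterministic seed.
        seed = base_seed + worker_id
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
    return _init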