Example #1
    def get_data_loaders(self):
        data_dir = self.args.data_dir

        self.train_dataset = PickleDataset(
            os.path.join(data_dir, f'{self.args.train_set}.pkl'),
            os.path.join(data_dir, self.args.train_index_file),
            segment_size=self.config.segment_size)

        self.val_dataset = PickleDataset(
            os.path.join(data_dir, f'{self.args.val_set}.pkl'),
            os.path.join(data_dir, self.args.val_index_file),
            segment_size=self.config.segment_size)

        self.train_loader = get_data_loader(self.train_dataset,
                                            batch_size=self.config.batch_size,
                                            shuffle=self.config.shuffle,
                                            num_workers=4,
                                            drop_last=False)

        self.val_loader = get_data_loader(self.val_dataset,
                                          batch_size=self.config.batch_size,
                                          shuffle=self.config.shuffle,
                                          num_workers=4,
                                          drop_last=False)

        self.train_iter = infinite_iter(self.train_loader)
        return
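
The snippets in this listing call an infinite_iter helper that is never shown. A minimal sketch of the usual pattern is given below; it is an assumption about the helper's behaviour, not the actual implementation from these repositories.

def infinite_iter(loader):
    # Assumed behaviour: restart the DataLoader iterator whenever it is
    # exhausted, so callers can keep drawing batches indefinitely via next().
    it = iter(loader)
    while True:
        try:
            yield next(it)
        except StopIteration:
            it = iter(loader)
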
Example #2
    def get_data_loaders(self):
        data_dir = self.args.data_dir
        self.gpu_num = torch.cuda.device_count() if torch.cuda.is_available() else 1
        self.train_dataset = PickleDataset(
            os.path.join(data_dir, f'{self.args.train_set}.pkl'),
            os.path.join(data_dir, self.args.train_index_file),
            segment_size=self.config['data_loader']['segment_size'])
        self.train_loader = get_data_loader(
            self.train_dataset,
            frame_size=self.config['data_loader']['frame_size'],
            batch_size=self.config['data_loader']['batch_size'] * self.gpu_num,
            num_workers=0,
            shuffle=self.config['data_loader']['shuffle'],
            drop_last=False)
        self.train_iter = infinite_iter(self.train_loader)

        if self.args.use_eval_set:
            self.eval_dataset = PickleDataset(
                os.path.join(data_dir, f'{self.args.eval_set}.pkl'),
                os.path.join(data_dir, self.args.eval_index_file),
                segment_size=self.config['data_loader']['segment_size'])

            self.eval_loader = get_data_loader(
                self.eval_dataset,
                frame_size=self.config['data_loader']['frame_size'],
                batch_size=self.config['data_loader']['batch_size'] *
                self.gpu_num,
                shuffle=self.config['data_loader']['shuffle'],
                num_workers=0,
                drop_last=False)
            self.eval_iter = infinite_iter(self.eval_loader)

        if self.args.use_test_set:
            self.test_dataset = PickleDataset(
                os.path.join(data_dir, f'{self.args.test_set}.pkl'),
                os.path.join(data_dir, self.args.test_index_file),
                segment_size=self.config['data_loader']['segment_size'])

            self.test_loader = get_data_loader(
                self.test_dataset,
                frame_size=self.config['data_loader']['frame_size'],
                batch_size=self.config['data_loader']['batch_size'],
                shuffle=False,
                num_workers=0,
                drop_last=False)
            self.test_iter = infinite_iter(self.test_loader)

        return
Example #3
def _get_loader(network_loc, molecule_loc, exclude_ids_loc, split_by,
                batch_size, batch_size_test, num_iterations, num_workers, full,
                training_only, k, p, ms: MoleculeSpec):
    """Helper function for getting data loaders

    Args:
        network_loc (str): Location of the bipartite network
        molecule_loc (str): Location of molecule SMILES strings
        exclude_ids_loc (str): The location storing the ids to be excluded from the training set
        split_by (str): Whether to split by scaffold or molecule
        batch_size (int): The batch size for training
        batch_size_test (int): The batch size for testing
        num_iterations (int): The number of total iterations for model training
        num_workers (int): The number of workers for loading dataset
        full (bool): Whether to use the full dataset for training
        training_only (bool): Only record training loss
        k (int): The number of importance samples
        p (float): The degree of stochasticity of importance sampling: 0.0 for fully
            stochastic decoding, 1.0 for fully deterministic decoding
        ms (MoleculeSpec)

    Returns:
        t.Tuple[t.Iterable, t.Iterable]:
            DataLoaders for training and test data
    """
    if full:
        loader_train = get_data_loader_full(scaffold_network_loc=network_loc,
                                            molecule_smiles_loc=molecule_loc,
                                            batch_size=batch_size,
                                            num_iterations=num_iterations,
                                            num_workers=num_workers,
                                            k=k,
                                            p=p,
                                            ms=ms)
        loader_test = None
    else:
        loader_train, loader_test = get_data_loader(
            scaffold_network_loc=network_loc,
            molecule_smiles_loc=molecule_loc,
            exclude_ids_loc=exclude_ids_loc,
            split_type=split_by,
            batch_size=batch_size,
            batch_size_test=batch_size_test,
            num_iterations=num_iterations,
            num_workers=num_workers,
            k=k,
            p=p,
            ms=ms)
        if training_only:
            loader_test = None
    return loader_train, loader_test
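
A hypothetical call of the helper above; every path and hyper-parameter value is a placeholder, and ms is assumed to be an already constructed MoleculeSpec.

loader_train, loader_test = _get_loader(
    network_loc='data/scaffold_network.txt',   # placeholder path
    molecule_loc='data/molecules.smi',         # placeholder path
    exclude_ids_loc='data/exclude_ids.txt',    # placeholder path
    split_by='scaffold',                       # or 'molecule', per the docstring
    batch_size=128,
    batch_size_test=256,
    num_iterations=100000,
    num_workers=2,
    full=False,
    training_only=False,
    k=5,
    p=0.5,
    ms=ms)
if loader_test is None:
    print('training-only run: no test loader')
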
Example #4
    def get_data_loaders(self):
        data_dir = self.args.data_dir

        self.test_dataset = PickleDataset(
            os.path.join(data_dir, f'{self.args.test_set}.pkl'),
            os.path.join(data_dir, self.args.test_index_file),
            segment_size=self.config['data_loader']['segment_size'])

        self.test_loader = get_data_loader(
            self.test_dataset,
            frame_size=self.config['data_loader']['frame_size'],
            batch_size=self.config['data_loader']['batch_size'],
            shuffle=False,
            drop_last=False)
Example #5
    def get_data_loaders(self):
        data_dir = self.args.data_dir
        self.train_dataset = PickleDataset(
            os.path.join(data_dir, f'{self.args.train_set}.pkl'),
            os.path.join(data_dir, self.args.train_index_file),
            segment_size=self.config['data_loader']['segment_size'])
        self.train_loader = get_data_loader(
            self.train_dataset,
            frame_size=self.config['data_loader']['frame_size'],
            batch_size=self.config['data_loader']['batch_size'],
            shuffle=self.config['data_loader']['shuffle'],
            num_workers=4,
            drop_last=False)
        self.train_iter = infinite_iter(self.train_loader)
        return
Example #6
    def get_data_loaders(self):
        data_dir = self.args.data_dir
        self.train_dataset = PickleDataset(
            os.path.join(data_dir, f"{self.args.train_set}.pkl"),
            os.path.join(data_dir, self.args.train_index_file),
            segment_size=self.config["data_loader"]["segment_size"],
        )
        self.train_loader = get_data_loader(
            self.train_dataset,
            frame_size=self.config["data_loader"]["frame_size"],
            batch_size=self.config["data_loader"]["batch_size"],
            shuffle=self.config["data_loader"]["shuffle"],
            num_workers=0,
            drop_last=False,
        )
        self.train_iter = infinite_iter(self.train_loader)
        return
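
In the PickleDataset examples above, get_data_loader is presumably a thin wrapper around torch.utils.data.DataLoader. The sketch below reflects only that assumption; the real helper most likely also uses frame_size inside a custom collate_fn, which is omitted here.

from torch.utils.data import DataLoader

def get_data_loader(dataset, frame_size=None, batch_size=32, shuffle=True,
                    num_workers=0, drop_last=False):
    # Sketch only: frame_size is accepted but unused; the original helper
    # probably crops or pads each segment to a multiple of frame_size in a
    # collate_fn before batching.
    return DataLoader(dataset,
                      batch_size=batch_size,
                      shuffle=shuffle,
                      num_workers=num_workers,
                      drop_last=drop_last)
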
Example #7
def train(args):
    # Define Tokenizer
    tokenizer_module = getattr(import_module("transformers"),
                               f"{args.model_name}Tokenizer")
    tokenizer = tokenizer_module.from_pretrained(args.pretrained_name_or_path)

    slot_meta, train_examples, dev_examples, dev_labels = train_data_loading(
        args, isUserFirst=False, isDialogueLevel=False)
    # Define Preprocessor
    processor = TRADEPreprocessor(slot_meta,
                                  tokenizer,
                                  max_seq_length=args.max_seq_length,
                                  use_n_gate=args.use_n_gate)

    train_features = processor.convert_examples_to_features(train_examples)
    dev_features = processor.convert_examples_to_features(dev_examples)

    train_loader = get_data_loader(processor, train_features,
                                   args.train_batch_size)
    dev_loader = get_data_loader(processor, dev_features, args.eval_batch_size)

    args.vocab_size = len(tokenizer)
    args.n_gate = len(
        processor.gating2id
    )  # number of gate classes: (none, dontcare, ptr) or (none, yes, no, dontcare, ptr)

    # Tokenize the slot meta for the decoder's initial inputs
    tokenized_slot_meta = []
    for slot in slot_meta:
        tokenized_slot_meta.append(
            tokenizer.encode(slot.replace("-", " "), add_special_tokens=False))

    # Declare the model
    model = TRADE(args, tokenized_slot_meta)
    # model.set_subword_embedding(args)  # initialize subword embeddings
    print(f"Subword Embeddings is loaded from {args.pretrained_name_or_path}")
    model.to(device)
    print("Model is initialized")

    # Declare the optimizer and scheduler
    n_epochs = args.epochs
    t_total = len(train_loader) * n_epochs
    # Changed so that get_optimizer computes warmup_steps automatically (original code below)
    # warmup_steps = int(t_total * args.warmup_ratio)

    optimizer = get_optimizer(model,
                              args)  # get optimizer (Adam, sgd, AdamP, ..)

    scheduler = get_scheduler(
        optimizer, t_total, args)  # get scheduler (custom, linear, cosine, ..)

    loss_fnc_1 = masked_cross_entropy_for_value  # generation loss; number of classes = vocab_size
    loss_fnc_2 = nn.CrossEntropyLoss()
    # loss_fnc_2 = LabelSmoothingLoss(classes=model.decoder.n_gate,smoothing=args.smoothing_factor)

    json.dump(
        vars(args),
        open(f"{args.model_dir}/{args.model_fold}/exp_config.json", "w"),
        indent=2,
        ensure_ascii=False,
    )
    json.dump(
        slot_meta,
        open(f"{args.model_dir}/{args.model_fold}/slot_meta.json", "w"),
        indent=2,
        ensure_ascii=False,
    )

    best_score, best_checkpoint = 0, 0
    for epoch in range(n_epochs):
        model.train()
        for step, batch in enumerate(train_loader):
            input_ids, segment_ids, input_masks, gating_ids, target_ids, guids = [
                b.to(device) if not isinstance(b, list) else b for b in batch
            ]

            # teacher forcing
            if (args.teacher_forcing_ratio > 0.0
                    and random.random() < args.teacher_forcing_ratio):
                tf = target_ids
            else:
                tf = None

            all_point_outputs, all_gate_outputs = model(
                input_ids, segment_ids, input_masks, target_ids.size(-1), tf)

            # generation loss
            loss_1 = loss_fnc_1(
                all_point_outputs.contiguous(),
                target_ids.contiguous().view(-1),
                tokenizer.pad_token_id,
            )

            # gating loss
            loss_2 = loss_fnc_2(
                all_gate_outputs.contiguous().view(-1, args.n_gate),
                gating_ids.contiguous().view(-1),
            )
            loss = loss_1 + loss_2

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            for learning_rate in scheduler.get_lr():
                wandb.log({"learning_rate": learning_rate})

            optimizer.zero_grad()

            if step % 100 == 0:
                print(
                    f"[{epoch}/{n_epochs}] [{step}/{len(train_loader)}] loss: {loss.item()} gen: {loss_1.item()} gate: {loss_2.item()}"
                )
                wandb.log({
                    "epoch": epoch,
                    "Train epoch loss": loss.item(),
                    "Train epoch gen loss": loss_1.item(),
                    "Train epoch gate loss": loss_2.item(),
                })

        predictions = inference_TRADE(model, dev_loader, processor, device)
        eval_result = _evaluation(predictions, dev_labels, slot_meta)
        for k, v in eval_result.items():
            if k in ("joint_goal_accuracy", 'turn_slot_accuracy',
                     'turn_slot_f1'):
                print(f"{k}: {v}")

        if best_score < eval_result["joint_goal_accuracy"]:
            print("Update Best checkpoint!")
            best_score = eval_result["joint_goal_accuracy"]
            best_checkpoint = epoch

            wandb.log({
                "epoch": epoch,
                "Best joint goal accuracy": best_score,
                "Best turn slot accuracy": eval_result['turn_slot_accuracy'],
                "Best turn slot f1": eval_result['turn_slot_f1']
            })

        if args.logging_accuracy_per_domain_slot:
            wandb.log({
                k: v
                for k, v in eval_result.items()
                if k not in ("joint_goal_accuracy", 'turn_slot_accuracy',
                             'turn_slot_f1')
            })

        torch.save(model.state_dict(),
                   f"{args.model_dir}/{args.model_fold}/model-{epoch}.bin")

    print(f"Best checkpoint: {args.model_dir}/model-{best_checkpoint}.bin")
    wandb.log(
        {"Best checkpoint": f"{args.model_dir}/model-{best_checkpoint}.bin"})
Example #8
def main_inference(args, config):
    slot_meta = json.load(
        open(f"{args.model_dir}/{args.model_fold}/slot_meta.json", "r"))
    ontology = json.load(open(f"{CFG.TrainOntology}", "r"))

    if config.replace_word_data:
        # replace the '택시' (taxi) domain with '버스' (bus) in slot names and ontology keys
        slot_meta = [meta.replace('택시', '버스') for meta in slot_meta]
        ontology = {
            domain_slot_key.replace('택시', '버스'): domain_slot_value
            for domain_slot_key, domain_slot_value in ontology.items()
        }

    # Define Tokenizer
    tokenizer_module = getattr(import_module("transformers"),
                               f"{config.model_name}Tokenizer")
    tokenizer = tokenizer_module.from_pretrained(
        config.pretrained_name_or_path)

    # Extracting features
    if config.dst == 'TRADE':
        eval_examples = test_data_loading(args,
                                          isUserFirst=False,
                                          isDialogueLevel=False)
        processor = TRADEPreprocessor(slot_meta, tokenizer)

        tokenized_slot_meta = []
        for slot in slot_meta:
            tokenized_slot_meta.append(
                tokenizer.encode(slot.replace("-", " "),
                                 add_special_tokens=False))

        # Declare the model
        model = TRADE(config, tokenized_slot_meta)
        model.set_subword_embedding(config)  # initialize subword embeddings

    elif config.dst == 'SUMBT':
        eval_examples = test_data_loading(args,
                                          isUserFirst=True,
                                          isDialogueLevel=True)
        max_turn = max([len(e) * 2 for e in eval_examples])
        processor = SUMBTPreprocessor(
            slot_meta,
            tokenizer,
            ontology=ontology,  # predefined ontology
            max_seq_length=config.max_seq_length,  # max sequence length per turn
            max_turn_length=max_turn)  # max number of turns per dialogue

        slot_type_ids, slot_values_ids = tokenize_ontology(
            ontology, tokenizer, config.max_label_length)

        # Declare the model
        num_labels = [len(s) for s in slot_values_ids]  # number of candidate values per slot

        model = SUMBT(config, num_labels, device)
        model.initialize_slot_value_lookup(
            slot_values_ids,
            slot_type_ids)  # pre-encode the tokenized ontology using BERT_SV

    eval_features = processor.convert_examples_to_features(eval_examples)
    eval_loader = get_data_loader(processor, eval_features,
                                  config.eval_batch_size)
    print("# eval:", len(eval_loader))

    ckpt = torch.load(
        f'{args.model_dir}/{args.model_fold}/model-{args.chkpt_idx}.bin',
        map_location="cpu")
    model.load_state_dict(ckpt)
    model.to(device)
    print("Model is loaded")

    inference_module = getattr(import_module("inference"),
                               f"inference_{config.dst}")
    predictions = inference_module(model, eval_loader, processor, device)

    os.makedirs(args.output_dir, exist_ok=True)

    json.dump(
        predictions,
        open(f"{args.output_dir}/{args.model_fold}-predictions.csv", "w"),
        indent=2,
        ensure_ascii=False,
    )
Example #9
def train(args):
    # Define Tokenizer
    tokenizer_module = getattr(import_module("transformers"),
                               f"{args.model_name}Tokenizer")
    tokenizer = tokenizer_module.from_pretrained(args.pretrained_name_or_path)

    slot_meta, train_examples, dev_examples, dev_labels = train_data_loading(
        args, isUserFirst=True, isDialogueLevel=True)
    ontology = json.load(open("../input/data/train_dataset/ontology.json"))

    # Define Preprocessor
    max_turn = max([len(e) * 2 for e in train_examples])
    processor = SUMBTPreprocessor(
        slot_meta,
        tokenizer,
        ontology=ontology,  # predefined ontology
        max_seq_length=args.max_seq_length,  # max sequence length per turn
        max_turn_length=max_turn)  # max number of turns per dialogue

    train_features = processor.convert_examples_to_features(train_examples)
    dev_features = processor.convert_examples_to_features(dev_examples)

    train_loader = get_data_loader(processor, train_features,
                                   args.train_batch_size)
    dev_loader = get_data_loader(processor, dev_features, args.eval_batch_size)
    if args.replace_word_data:
        # replace the '택시' (taxi) domain with '버스' (bus) in the ontology keys
        ontology = {
            domain_slot_key.replace('택시', '버스'): domain_slot_value
            for domain_slot_key, domain_slot_value in ontology.items()
        }
    slot_type_ids, slot_values_ids = tokenize_ontology(ontology, tokenizer,
                                                       args.max_label_length)

    # Declare the model
    num_labels = [len(s) for s in slot_values_ids]  # number of candidate values per slot
    n_gpu = 1 if torch.cuda.device_count() < 2 else torch.cuda.device_count()

    model = SUMBT(args, num_labels, device)
    model.initialize_slot_value_lookup(
        slot_values_ids,
        slot_type_ids)  # pre-encode the tokenized ontology using BERT_SV
    model.to(device)
    print("Model is initialized")
    """## Optimizer & Scheduler 선언 """
    n_epochs = args.epochs
    t_total = len(train_loader) * n_epochs
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]

    optimizer = get_optimizer(optimizer_grouped_parameters,
                              args)  # get optimizer (Adam, sgd, AdamP, ..)

    scheduler = get_scheduler(
        optimizer, t_total, args)  # get scheduler (custom, linear, cosine, ..)

    json.dump(
        vars(args),
        open(f"{args.model_dir}/{args.model_fold}/exp_config.json", "w"),
        indent=2,
        ensure_ascii=False,
    )
    json.dump(
        slot_meta,
        open(f"{args.model_dir}/{args.model_fold}/slot_meta.json", "w"),
        indent=2,
        ensure_ascii=False,
    )

    best_score, best_checkpoint = 0, 0
    for epoch in range(n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_loader):
            input_ids, segment_ids, input_masks, target_ids, num_turns, guids = [
                b.to(device) if not isinstance(b, list) else b for b in batch
            ]

            # Forward
            if n_gpu == 1:
                loss, loss_slot, acc, acc_slot, _ = model(
                    input_ids, segment_ids, input_masks, target_ids, n_gpu)
            else:
                loss, _, acc, acc_slot, _ = model(input_ids, segment_ids,
                                                  input_masks, target_ids,
                                                  n_gpu)

            batch_loss.append(loss.item())

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            for learning_rate in scheduler.get_lr():
                wandb.log({"learning_rate": learning_rate})

            optimizer.zero_grad()

            if step % 100 == 0:
                print('[%d/%d] [%d/%d] %f' %
                      (epoch, n_epochs, step, len(train_loader), loss.item()))

                wandb.log({"epoch": epoch, "Train epoch loss": loss.item()})

        predictions = inference_SUMBT(model, dev_loader, processor, device)
        eval_result = _evaluation(predictions, dev_labels, slot_meta)

        for k, v in eval_result.items():
            if k in ("joint_goal_accuracy", 'turn_slot_accuracy',
                     'turn_slot_f1'):
                print(f"{k}: {v}")

        if best_score < eval_result["joint_goal_accuracy"]:
            print("Update Best checkpoint!")
            best_score = eval_result["joint_goal_accuracy"]
            best_checkpoint = epoch

            wandb.log({
                "epoch": epoch,
                "Best joint goal accuracy": best_score,
                "Best turn slot accuracy": eval_result['turn_slot_accuracy'],
                "Best turn slot f1": eval_result['turn_slot_f1']
            })

        if args.logging_accuracy_per_domain_slot:
            wandb.log({
                k: v
                for k, v in eval_result.items()
                if k not in ("joint_goal_accuracy", 'turn_slot_accuracy',
                             'turn_slot_f1')
            })

        torch.save(model.state_dict(),
                   f"{args.model_dir}/{args.model_fold}/model-{epoch}.bin")

    print(f"Best checkpoint: {args.model_dir}/model-{best_checkpoint}.bin")
    wandb.log(
        {"Best checkpoint": f"{args.model_dir}/model-{best_checkpoint}.bin"})
Example #10
def _get_loader(
        network_loc: str, molecule_loc: str, exclude_ids_loc: str,
        split_by: str, batch_size: int, batch_size_test: int,
        num_iterations: int, num_workers: int, full: bool, training_only: bool,
        k: int, p: float,
        ms: MoleculeSpec) -> t.Tuple[t.Iterable, t.Optional[t.Iterable]]:
    """Helper function for getting data loaders

    Args:
        network_loc (str):
            Location of the bipartite network
        molecule_loc (str):
            Location of molecule SMILES strings
        exclude_ids_loc (str):
            The location storing the ids to be excluded from the training set
        split_by (str):
            Whether to split by scaffold or molecule
        batch_size (int):
            The batch size for training
        batch_size_test (int):
            The batch size for testing
        num_iterations (int):
            The number of total iterations for model training
        num_workers (int):
            The number of workers for loading dataset
        full (bool):
            Whether to use the full dataset for training
        training_only (bool):
            Only record training loss
        k (int):
            The number of importance samples
        p (float):
            The degree of stochasticity of importance sampling 0.0 for fully
            stochastic decoding, 1.0 for fully deterministic decoding
        ms (MoleculeSpec)

    Returns:
        t.Tuple[t.Iterable, t.Iterable]:
            DataLoaders for training and test data
    """
    if full:
        training_only = True
        loader_train = \
            get_data_loader_full(network_loc,
                                 molecule_loc,
                                 batch_size,
                                 num_iterations,
                                 num_workers,
                                 k, p, ms)
        loader_test = None
    else:
        loader_train, loader_test = \
            get_data_loader(network_loc,
                            molecule_loc,
                            exclude_ids_loc,
                            split_by,
                            batch_size,
                            batch_size_test,
                            num_iterations,
                            num_workers,
                            k, p, ms)
        if training_only:
            loader_test = None
    return loader_train, loader_test