def preprocess(self, requests):
        """ Very basic preprocessing code - only tokenizes.
            Extend with your own preprocessing steps as needed.
        """
        input_batch = []
        for idx, data in enumerate(requests):
            input_text = data.get("data")
            if input_text is None:
                input_text = data.get("body")
            if isinstance(input_text, (bytes, bytearray)):
                input_text = input_text.decode('utf-8')

            # The request body is expected to be a JSON array of dialogues.
            dialogues = json.loads(input_text)
            input_batch.extend(dialogues)

        eval_examples = get_examples_from_dialogues(input_batch,
                                                    user_first=False,
                                                    dialogue_level=False)
        eval_features = self.processor.convert_examples_to_features(
            eval_examples)
        eval_data = WOSDataset(eval_features)
        eval_sampler = SequentialSampler(eval_data)
        eval_loader = DataLoader(
            eval_data,
            batch_size=1,
            sampler=eval_sampler,
            collate_fn=self.processor.collate_fn,
        )

        return eval_loader
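# --- Usage sketch (not part of the original handler) -------------------------
# A hedged example of exercising preprocess() outside TorchServe. The handler
# class name `TRADEHandler` and the dialogue schema below are assumptions.
#
# sample_dialogues = [{"dialogue_idx": "sample-0", "dialogue": []}]
# requests = [{"body": json.dumps(sample_dialogues).encode("utf-8")}]
# handler = TRADEHandler()       # hypothetical; TorchServe normally constructs
# handler.initialize(context)    # and initializes the handler itself
# loader = handler.preprocess(requests)
# for batch in loader:           # each batch comes from processor.collate_fn
#     ...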
Example #2
    parser.add_argument("--teacher_forcing_ratio", type=float, default=0.5)
    args = parser.parse_args()
    
    args.data_dir = os.environ['SM_CHANNEL_TRAIN']
    args.model_dir = os.environ['SM_MODEL_DIR']

    # Fix the random seed for reproducibility
    set_seed(args.random_seed)

    # Data Loading
    train_data_file = f"{args.data_dir}/train_dials.json"
    slot_meta = json.load(open(f"{args.data_dir}/slot_meta.json"))
    train_data, dev_data, dev_labels = load_dataset(train_data_file)

    train_examples = get_examples_from_dialogues(
        train_data, user_first=False, dialogue_level=False
    )
    dev_examples = get_examples_from_dialogues(
        dev_data, user_first=False, dialogue_level=False
    )

    # Define Preprocessor
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    processor = TRADEPreprocessor(slot_meta, tokenizer)
    args.vocab_size = len(tokenizer)
    args.n_gate = len(processor.gating2id)  # number of gate classes: none, dontcare, ptr

    # Extracting Features
    train_features = processor.convert_examples_to_features(train_examples)
    dev_features = processor.convert_examples_to_features(dev_examples)
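    # --- Assumed continuation (the listing is truncated here) ----------------
    # A plausible next step, sketched under the assumption that training wraps
    # the features in WOSDataset/RandomSampler/DataLoader as in the other
    # examples; `args.train_batch_size` is an assumed CLI argument, and
    # RandomSampler/DataLoader come from torch.utils.data.
    train_dataset = WOSDataset(train_features)
    train_sampler = RandomSampler(train_dataset)
    train_loader = DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        sampler=train_sampler,
        collate_fn=processor.collate_fn,
    )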
    
Example #3
    args = parser.parse_args()
    args.data_dir = os.environ['SM_CHANNEL_EVAL']
    args.model_dir = os.environ['SM_CHANNEL_MODEL']
    args.output_dir = os.environ['SM_OUTPUT_DATA_DIR']
    
    model_dir_path = os.path.dirname(args.model_dir)
    eval_data = json.load(open(f"{args.data_dir}/eval_dials.json", "r"))
    config = json.load(open(f"{model_dir_path}/exp_config.json", "r"))
    config = argparse.Namespace(**config)
    slot_meta = json.load(open(f"{model_dir_path}/slot_meta.json", "r"))

    tokenizer = BertTokenizer.from_pretrained(config.model_name_or_path)
    processor = TRADEPreprocessor(slot_meta, tokenizer)

    eval_examples = get_examples_from_dialogues(
        eval_data, user_first=False, dialogue_level=False
    )

    # Extracting Features
    eval_features = processor.convert_examples_to_features(eval_examples)
    eval_dataset = WOSDataset(eval_features)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_loader = DataLoader(
        eval_dataset,
        batch_size=args.eval_batch_size,
        sampler=eval_sampler,
        collate_fn=processor.collate_fn,
    )
    print("# eval:", len(eval_dataset))

    tokenized_slot_meta = []
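    # --- Assumed continuation (the listing cuts off here) --------------------
    # In the common TRADE inference pattern, each "domain-slot" name is
    # encoded without special tokens; replacing the hyphen with a space is a
    # convention assumed here, not confirmed by the original listing.
    for slot in slot_meta:
        tokenized_slot_meta.append(
            tokenizer.encode(slot.replace("-", " "), add_special_tokens=False)
        )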
Example #4
import json

from tqdm import tqdm

from transformers import BertTokenizer, BertConfig
from data_utils import get_examples_from_dialogues, convert_state_dict, load_dataset
from data_utils import OntologyDSTFeature, DSTPreprocessor, _truncate_seq_pair
# NOTE: on transformers >= 4.x this class moved to
# transformers.models.bert.modeling_bert.
from transformers.modeling_bert import BertOnlyMLMHead

"""## Data Loading """

train_data_file = "/opt/ml/input/data/train_dataset/train_dials.json"
slot_meta = json.load(open("/opt/ml/input/data/train_dataset/slot_meta.json"))
ontology = json.load(open("/opt/ml/input/data/train_dataset/ontology.json"))
train_data, dev_data, dev_labels = load_dataset(train_data_file)

train_examples = get_examples_from_dialogues(data=train_data,
                                             user_first=True,
                                             dialogue_level=True)

dev_examples = get_examples_from_dialogues(data=dev_data,
                                           user_first=True,
                                           dialogue_level=True)

print("# train dialogues:", len(train_data))

# Maximum number of turns in any training dialogue; SUMBT-style models pad
# dialogue-level inputs to this length.
max_turn = max(len(e['dialogue']) for e in train_data)

tokenizer = BertTokenizer.from_pretrained('dsksd/bert-ko-small-minimal')

"""## TODO-1: SUMBT Preprocessor 정의

Ontology-based DST model인 SUMBT의 InputFeature를 만들기 위한 Preprocessor를 정의해야 합니다. <br>