def build_dataset(fields, data_type, src, src_dir=None, tgt=None,
                  src_seq_len=50, tgt_seq_len=50,
                  sample_rate=0, window_size=0, window_stride=0,
                  window=None, normalize_audio=True, use_filter_pred=True,
                  image_channel_size=3):
    """
    src: path to corpus file or iterator over source data
    tgt: path to corpus file, iterator over target data, or None
    """
    dataset_classes = {
        'text': TextDataset, 'img': ImageDataset, 'audio': AudioDataset
    }
    assert data_type in dataset_classes
    assert src is not None

    if data_type == 'text':
        src_examples_iter = TextDataset.make_examples(src, "src")
    elif data_type == 'img':
        # there is a truncate argument as well, but it was never set to
        # anything besides None before
        src_examples_iter = ImageDataset.make_examples(
            src, src_dir, 'src', channel_size=image_channel_size)
    else:
        src_examples_iter = AudioDataset.make_examples(
            src, src_dir, "src", sample_rate, window_size, window_stride,
            window, normalize_audio, None)

    if tgt is None:
        tgt_examples_iter = None
    else:
        tgt_examples_iter = TextDataset.make_examples(tgt, "tgt")

    # the second conjunct means nothing will be filtered at translation time
    # if there is no target data
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(filter_example, use_src_len=data_type == 'text',
                              max_src_len=src_seq_len, max_tgt_len=tgt_seq_len)
    else:
        filter_pred = None

    dataset_cls = dataset_classes[data_type]
    dataset = dataset_cls(fields, src_examples_iter, tgt_examples_iter,
                          filter_pred=filter_pred)
    return dataset
def build_dataset(fields, data_type, src, knl, src_dir=None, tgt=None,
                  knl_seq_len=800, src_seq_len=150, tgt_seq_len=50,
                  knl_seq_length_trunc=200, src_seq_length_trunc=50,
                  tgt_seq_length_trunc=0, dynamic_dict=False,
                  sample_rate=0, window_size=0, window_stride=0,
                  window=None, normalize_audio=True, use_filter_pred=True,
                  image_channel_size=3, corpus_type='train',
                  model_mode='default'):
    """
    src: path to corpus file or iterator over source data
    tgt: path to corpus file, iterator over target data, or None
    """
    dataset_classes = {
        'text': TextDataset, 'img': ImageDataset, 'audio': AudioDataset
    }
    assert data_type in dataset_classes
    assert src is not None
    assert not dynamic_dict or data_type == 'text', \
        'it is not possible to use dynamic_dict with non-text input'

    if data_type == 'text':
        src_examples_iter = TextDataset.make_examples(
            src, src_seq_length_trunc, "src", corpus_type, model_mode
        )
        knl_examples_iter = TextDataset.make_examples(
            knl, knl_seq_length_trunc, "knl", corpus_type, model_mode
        )
    elif data_type == 'img':
        # there is a truncate argument as well, but it was never set to
        # anything besides None before
        src_examples_iter = ImageDataset.make_examples(
            src, src_dir, 'src', channel_size=image_channel_size
        )
    else:
        src_examples_iter = AudioDataset.make_examples(
            src, src_dir, "src", sample_rate, window_size, window_stride,
            window, normalize_audio, None)

    if tgt is None:
        tgt_examples_iter = None
    else:
        tgt_examples_iter = TextDataset.make_examples(
            tgt, tgt_seq_length_trunc, "tgt", corpus_type, model_mode)

    # the second conjunct means nothing will be filtered at translation time
    # if there is no target data
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(
            filter_example, use_src_len=data_type == 'text',
            use_knl_len=data_type == 'text',
            max_src_len=src_seq_len, max_tgt_len=tgt_seq_len,
            max_knl_len=knl_seq_len
        )
    else:
        filter_pred = None

    dataset_cls = dataset_classes[data_type]
    dataset = dataset_cls(
        fields, src_examples_iter, tgt_examples_iter, knl_examples_iter,
        dynamic_dict=dynamic_dict, filter_pred=filter_pred
    )
    print("[onmt.inputters.inputter.py] dataset_cls:{}".format(dataset_cls))
    print("[onmt.inputters.inputter.py] dataset:{}".format(dataset))
    return dataset
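A minimal usage sketch for the extended signature above, for orientation only. The `fields` object and the corpus paths are placeholders assumed to be produced elsewhere in the preprocessing pipeline; only the keyword names are taken from the function definition.

# Hypothetical usage sketch: `fields` and the file paths are placeholders
# assumed to come from the surrounding preprocessing code, not from the
# original source.
train_dataset = build_dataset(
    fields,                          # field definitions built elsewhere
    data_type='text',
    src='data/src-train.txt',        # placeholder corpus paths
    knl='data/knl-train.txt',
    tgt='data/tgt-train.txt',
    knl_seq_len=800, src_seq_len=150, tgt_seq_len=50,
    knl_seq_length_trunc=200, src_seq_length_trunc=50,
    dynamic_dict=False,
    use_filter_pred=True,
    corpus_type='train',
    model_mode='default',
)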