def _make_examples_nfeats_tpl(data_type, src_data_iter, src_path, src_dir, src_seq_length_trunc, sample_rate, window_size, window_stride, window, normalize_audio): """ Process the corpus into (example_dict iterator, num_feats) tuple on source side for different 'data_type'. """ if data_type == 'text': src_examples_iter, num_src_feats = \ TextDataset.make_text_examples_nfeats_tpl( src_data_iter, src_path, src_seq_length_trunc, "src") elif data_type == 'img': src_examples_iter, num_src_feats = \ ImageDataset.make_image_examples_nfeats_tpl( src_data_iter, src_path, src_dir) elif data_type == 'audio': if src_data_iter: raise ValueError("""Data iterator for AudioDataset isn't implemented""") if src_path is None: raise ValueError("AudioDataset requires a non None path") src_examples_iter, num_src_feats = \ AudioDataset.make_audio_examples_nfeats_tpl( src_path, src_dir, sample_rate, window_size, window_stride, window, normalize_audio) return src_examples_iter, num_src_feats
def build_dataset(fields, data_type, src, src_dir=None, tgt=None,
                  src_seq_len=50, tgt_seq_len=50,
                  sample_rate=0, window_size=0, window_stride=0,
                  window=None, normalize_audio=True,
                  use_filter_pred=True, image_channel_size=3):
    """Assemble a dataset of the requested modality from raw input.

    src: path to corpus file or iterator over source data
    tgt: path to corpus file, iterator over target data, or None
    """
    cls_by_type = {
        'text': TextDataset,
        'img': ImageDataset,
        'audio': AudioDataset
    }
    assert data_type in cls_by_type
    assert src is not None

    if data_type == 'text':
        src_examples_iter = TextDataset.make_examples(src, "src")
    elif data_type == 'img':
        # make_examples also takes a truncate argument, but historically
        # it was always left at None, so it is not exposed here.
        src_examples_iter = ImageDataset.make_examples(
            src, src_dir, 'src', channel_size=image_channel_size)
    else:
        src_examples_iter = AudioDataset.make_examples(
            src, src_dir, "src", sample_rate, window_size,
            window_stride, window, normalize_audio, None)

    tgt_examples_iter = (None if tgt is None
                         else TextDataset.make_examples(tgt, "tgt"))

    # The second condition means nothing is filtered at translation
    # time when there is no target data.
    filter_pred = None
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(filter_example,
                              use_src_len=data_type == 'text',
                              max_src_len=src_seq_len,
                              max_tgt_len=tgt_seq_len)

    return cls_by_type[data_type](fields, src_examples_iter,
                                  tgt_examples_iter,
                                  filter_pred=filter_pred)
def build_dataset(fields, data_type, src_data_iter=None, src_path=None,
                  src_dir=None, tgt_data_iter=None, tgt_path=None,
                  src_seq_len=0, tgt_seq_len=0, src_seq_length_trunc=0,
                  tgt_seq_length_trunc=0, dynamic_dict=True,
                  sample_rate=0, window_size=0, window_stride=0,
                  window=None, normalize_audio=True,
                  use_filter_pred=True, image_channel_size=3):
    """Build src/tgt example iterators from corpus files or raw
    iterators and wrap them in the dataset class for ``data_type``.
    """
    if data_type == 'text':
        src_examples_iter = TextDataset.make_text_examples(
            src_data_iter, src_path, src_seq_length_trunc, "src")
    elif data_type == 'img':
        src_examples_iter = ImageDataset.make_image_examples(
            src_data_iter, src_path, src_dir, image_channel_size)
    elif data_type == 'audio':
        if src_data_iter:
            raise ValueError("""Data iterator for AudioDataset isn't implemented""")
        if src_path is None:
            raise ValueError("AudioDataset requires a non None path")
        src_examples_iter = AudioDataset.make_audio_examples(
            src_path, src_dir, sample_rate, window_size,
            window_stride, window, normalize_audio)

    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter = TextDataset.make_text_examples(
        tgt_data_iter, tgt_path, tgt_seq_length_trunc, "tgt")

    # NOTE(review): the practical utility of the "is not None" check
    # below is unclear — preserved from the original logic.
    filter_pred = None
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(filter_example,
                              use_src_len=data_type == 'text',
                              max_src_len=src_seq_len,
                              max_tgt_len=tgt_seq_len)

    if data_type == 'text':
        return TextDataset(fields, src_examples_iter, tgt_examples_iter,
                           dynamic_dict=dynamic_dict,
                           filter_pred=filter_pred)
    dataset_cls = ImageDataset if data_type == 'img' else AudioDataset
    return dataset_cls(fields, src_examples_iter, tgt_examples_iter,
                       filter_pred=filter_pred)
def get_fields(data_type, n_src_features, n_tgt_features):
    """Build the Field dictionary for the given input modality.

    Args:
        data_type: type of the source input. Options are
            [text|img|audio].
        n_src_features: how many source features to create
            `torchtext.data.Field` objects for.
        n_tgt_features: how many target features to create
            `torchtext.data.Field` objects for.

    Returns:
        A dict mapping string keys to the corresponding Field objects.

    Raises:
        ValueError: if ``data_type`` is not one of the supported values.
    """
    if data_type == 'text':
        return TextDataset.get_fields(n_src_features, n_tgt_features)
    if data_type == 'img':
        return ImageDataset.get_fields(n_src_features, n_tgt_features)
    if data_type == 'audio':
        return AudioDataset.get_fields(n_src_features, n_tgt_features)
    raise ValueError("Data type not implemented")
def get_num_features(data_type, corpus_file, side):
    """Count the features available on one side of a corpus.

    Args:
        data_type (str): type of the source input. Options are
            [text|img|audio].
        corpus_file (str): file path to read the features from.
        side (str): either "src" or "tgt".

    Returns:
        Number of features on ``side``.

    Raises:
        ValueError: if ``data_type`` is not one of the supported values.
    """
    assert side in ["src", "tgt"]
    if data_type == 'text':
        return TextDataset.get_num_features(corpus_file, side)
    if data_type == 'img':
        return ImageDataset.get_num_features(corpus_file, side)
    if data_type == 'audio':
        return AudioDataset.get_num_features(corpus_file, side)
    raise ValueError("Data type not implemented")
def build_dataset(fields, data_type, src_data_iter=None, src_path=None,
                  src_dir=None, tgt_data_iter=None, tgt_path=None,
                  src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  dynamic_dict=True, sample_rate=0, window_size=0,
                  window_stride=0, window=None, normalize_audio=True,
                  use_filter_pred=True, image_channel_size=3):
    """Build src/tgt examples iterator from corpus files, also extract
    number of features.

    Args:
        fields: dict of Field objects used to construct the dataset.
        data_type (str): one of ``'text'``, ``'img'`` or ``'audio'``.
        src_data_iter / src_path / src_dir: source input locations.
        tgt_data_iter / tgt_path: target input locations (always text).
        Remaining keyword arguments configure truncation, filtering,
        and modality-specific preprocessing.

    Returns:
        A TextDataset, ImageDataset, or AudioDataset instance.

    Raises:
        ValueError: for an unknown ``data_type``, or for audio input
            given as an iterator / without a path.
    """

    def _make_examples_nfeats_tpl(data_type, src_data_iter, src_path,
                                  src_dir, src_seq_length_trunc,
                                  sample_rate, window_size,
                                  window_stride, window,
                                  normalize_audio, image_channel_size=3):
        """Process the corpus into (example_dict iterator, num_feats)
        tuple on source side for different 'data_type'."""
        if data_type == 'text':
            src_examples_iter, num_src_feats = \
                TextDataset.make_text_examples_nfeats_tpl(
                    src_data_iter, src_path, src_seq_length_trunc, "src")
        elif data_type == 'img':
            src_examples_iter, num_src_feats = \
                ImageDataset.make_image_examples_nfeats_tpl(
                    src_data_iter, src_path, src_dir, image_channel_size)
        elif data_type == 'audio':
            if src_data_iter:
                raise ValueError("""Data iterator for AudioDataset isn't implemented""")
            if src_path is None:
                raise ValueError("AudioDataset requires a non None path")
            src_examples_iter, num_src_feats = \
                AudioDataset.make_audio_examples_nfeats_tpl(
                    src_path, src_dir, sample_rate, window_size,
                    window_stride, window, normalize_audio)
        else:
            # An unrecognized data_type used to fall through and raise
            # UnboundLocalError on return; raise a clear error instead.
            raise ValueError("Data type not implemented: %s" % data_type)
        return src_examples_iter, num_src_feats

    src_examples_iter, num_src_feats = \
        _make_examples_nfeats_tpl(data_type, src_data_iter, src_path,
                                  src_dir, src_seq_length_trunc,
                                  sample_rate, window_size,
                                  window_stride, window, normalize_audio,
                                  image_channel_size=image_channel_size)

    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_data_iter, tgt_path, tgt_seq_length_trunc, "tgt")

    if data_type == 'text':
        dataset = TextDataset(fields, src_examples_iter, tgt_examples_iter,
                              num_src_feats, num_tgt_feats,
                              src_seq_length=src_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        dataset = ImageDataset(fields, src_examples_iter,
                               tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred,
                               image_channel_size=image_channel_size)
    elif data_type == 'audio':
        dataset = AudioDataset(fields, src_examples_iter,
                               tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)

    return dataset
def build_dataset(fields, data_type, src, knl, src_dir=None, tgt=None,
                  knl_seq_len=800, src_seq_len=150, tgt_seq_len=50,
                  knl_seq_length_trunc=200, src_seq_length_trunc=50,
                  tgt_seq_length_trunc=0, dynamic_dict=False,
                  sample_rate=0, window_size=0, window_stride=0,
                  window=None, normalize_audio=True, use_filter_pred=True,
                  image_channel_size=3, corpus_type='train',
                  model_mode='default'):
    """Build a knowledge-grounded dataset from src/knl/tgt corpora.

    src: path to corpus file or iterator over source data
    knl: path to knowledge corpus file (text only)
    tgt: path to corpus file, iterator over target data, or None
    """
    # Validate up front; membership is checked against the literal set
    # of supported types so bad input fails before any other work.
    assert data_type in ('text', 'img', 'audio')
    assert src is not None
    assert not dynamic_dict or data_type == 'text', \
        'it is not possible to use dynamic_dict with non-text input'

    dataset_classes = {
        'text': TextDataset,
        'img': ImageDataset,
        'audio': AudioDataset
    }

    # BUG FIX: knl_examples_iter was only assigned in the 'text' branch
    # but is passed to the dataset constructor for every data_type,
    # which raised UnboundLocalError for img/audio. Default it to None.
    knl_examples_iter = None
    if data_type == 'text':
        src_examples_iter = TextDataset.make_examples(
            src, src_seq_length_trunc, "src", corpus_type, model_mode)
        knl_examples_iter = TextDataset.make_examples(
            knl, knl_seq_length_trunc, "knl", corpus_type, model_mode)
    elif data_type == 'img':
        # there is a truncate argument as well, but it was never set to
        # anything besides None before
        src_examples_iter = ImageDataset.make_examples(
            src, src_dir, 'src', channel_size=image_channel_size)
    else:
        src_examples_iter = AudioDataset.make_examples(
            src, src_dir, "src", sample_rate, window_size,
            window_stride, window, normalize_audio, None)

    if tgt is None:
        tgt_examples_iter = None
    else:
        tgt_examples_iter = TextDataset.make_examples(
            tgt, tgt_seq_length_trunc, "tgt", corpus_type, model_mode)

    # the second conjunct means nothing will be filtered at translation
    # time if there is no target data
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(
            filter_example,
            use_src_len=data_type == 'text',
            use_knl_len=data_type == 'text',
            max_src_len=src_seq_len,
            max_tgt_len=tgt_seq_len,
            max_knl_len=knl_seq_len)
    else:
        filter_pred = None

    # Leftover debug print statements removed here.
    dataset_cls = dataset_classes[data_type]
    dataset = dataset_cls(
        fields, src_examples_iter, tgt_examples_iter, knl_examples_iter,
        dynamic_dict=dynamic_dict, filter_pred=filter_pred)
    return dataset