def _make_examples_nfeats_tpl(data_type, src_path, src_dir, src_seq_length_trunc, sample_rate, window_size, window_stride, window, normalize_audio): """ Process the corpus into (example_dict iterator, num_feats) tuple on source side for different 'data_type'. """ if data_type == 'text': src_examples_iter, num_src_feats = \ TextDataset.make_text_examples_nfeats_tpl( src_path, src_seq_length_trunc, "src") elif data_type == 'img': src_examples_iter, num_src_feats = \ ImageDataset.make_image_examples_nfeats_tpl( src_path, src_dir) elif data_type == 'audio': src_examples_iter, num_src_feats = \ AudioDataset.make_audio_examples_nfeats_tpl( src_path, src_dir, sample_rate, window_size, window_stride, window, normalize_audio) return src_examples_iter, num_src_feats
def build_dataset(fields, data_type, src_path, tgt_path, src_dir=None,
                  src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  lower=False, dynamic_dict=True,
                  sample_rate=0, window_size=0, window_stride=0,
                  window=None, normalize_audio=True, use_filter_pred=True,
                  side_src_path=None, side_tgt_path=None,
                  phrase_table=None, global_phrase_table=None):
    """
    Build a dataset from src/tgt corpora plus side corpora and a phrase
    table (all side corpora are read as text).

    Raises:
        ValueError: if `data_type` is not text/img/audio.

    NOTE(review): this file binds `build_dataset` several times; only the
    last definition is live at import time -- confirm which variant this
    module is meant to export.
    """
    # Build src/tgt example iterators from corpus files, also extracting
    # the number of features.
    # NOTE(review): `lower` is passed as an extra positional argument that
    # the `_make_examples_nfeats_tpl` defined in this file does not
    # accept -- confirm the intended helper version.
    src_examples_iter, num_src_feats = \
        _make_examples_nfeats_tpl(data_type, src_path, src_dir,
                                  src_seq_length_trunc, sample_rate,
                                  window_size, window_stride,
                                  window, normalize_audio, lower)

    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_path, tgt_seq_length_trunc, "tgt", lower)

    # Side corpora and the phrase table are also read as text; their
    # feature counts are discarded.
    side_tgt_examples_iter, _ = TextDataset.make_text_examples_nfeats_tpl(
        side_tgt_path, tgt_seq_length_trunc, "side_tgt", lower)
    side_src_examples_iter, _ = TextDataset.make_text_examples_nfeats_tpl(
        side_src_path, tgt_seq_length_trunc, "side_src", lower)
    phrase_table_iter, _ = TextDataset.make_text_examples_nfeats_tpl(
        phrase_table, tgt_seq_length_trunc, "phrase_table", lower)

    if data_type == 'text':
        dataset = TextDataset(fields, src_examples_iter, tgt_examples_iter,
                              side_src_examples_iter, side_tgt_examples_iter,
                              phrase_table_iter, global_phrase_table,
                              num_src_feats=num_src_feats,
                              num_tgt_feats=num_tgt_feats,
                              src_seq_length=src_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        dataset = ImageDataset(fields, src_examples_iter, tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred)
    elif data_type == 'audio':
        dataset = AudioDataset(fields, src_examples_iter, tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)
    else:
        # Previously an unknown data_type left `dataset` unbound and the
        # return below raised NameError.
        raise ValueError("Unsupported data_type: %s" % data_type)

    return dataset
def get_num_features(data_type, corpus_file, side):
    """
    Count the number of features on `side` for the given corpus.

    Args:
        data_type (str): type of the source input.
            Options are [text|img|audio].
        corpus_file (str): file path to get the features.
        side (str): for source or for target.

    Returns:
        number of features on `side`.

    Raises:
        AssertionError: if `side` is not "src" or "tgt".
        ValueError: if `data_type` is not text/img/audio.
    """
    assert side in ["src", "tgt"]

    if data_type == 'text':
        return TextDataset.get_num_features(corpus_file, side)
    elif data_type == 'img':
        return ImageDataset.get_num_features(corpus_file, side)
    elif data_type == 'audio':
        return AudioDataset.get_num_features(corpus_file, side)
    else:
        # Previously an unknown data_type silently returned None, which
        # would surface later as a hard-to-trace TypeError.
        raise ValueError("Unsupported data_type: %s" % data_type)
def get_fields(data_type, n_src_features, n_tgt_features):
    """
    Build the torchtext fields for the given data type.

    Args:
        data_type: type of the source input.
            Options are [text|img|audio].
        n_src_features: the number of source features to
            create `torchtext.data.Field` for.
        n_tgt_features: the number of target features to
            create `torchtext.data.Field` for.

    Returns:
        A dictionary whose keys are strings and whose values are the
        corresponding Field objects.

    Raises:
        ValueError: if `data_type` is not text/img/audio.
    """
    if data_type == 'text':
        return TextDataset.get_fields(n_src_features, n_tgt_features)
    elif data_type == 'img':
        return ImageDataset.get_fields(n_src_features, n_tgt_features)
    elif data_type == 'audio':
        return AudioDataset.get_fields(n_src_features, n_tgt_features)
    else:
        # Previously an unknown data_type silently returned None.
        raise ValueError("Unsupported data_type: %s" % data_type)
def build_my_dataset(fields, data_type, src_path, conversation_path, tgt_path,
                     src_dir=None, src_seq_length=0, conversation_seq_length=0,
                     tgt_seq_length=0, src_seq_length_trunc=0,
                     conversation_seq_length_trunc=0, tgt_seq_length_trunc=0,
                     dynamic_dict=True, sample_rate=0, window_size=0,
                     window_stride=0, window=None, normalize_audio=True,
                     use_filter_pred=True):
    """
    Build a dataset that includes an additional conversation-context
    corpus alongside the usual src/tgt corpora (all read as text).

    Raises:
        ValueError: if `data_type` is not text/img/audio.
    """
    # Build src/conversation/tgt example iterators from corpus files,
    # also extracting the number of features for each side.
    src_examples_iter, num_src_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            src_path, src_seq_length_trunc, "src")

    conversation_examples_iter, num_conversation_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            conversation_path, conversation_seq_length_trunc, "conversation")

    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_path, tgt_seq_length_trunc, "tgt")

    if data_type == 'text':
        dataset = TextDataset(fields, src_examples_iter,
                              conversation_examples_iter, tgt_examples_iter,
                              num_src_feats, num_conversation_feats,
                              num_tgt_feats,
                              src_seq_length=src_seq_length,
                              conversation_seq_length=conversation_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        # NOTE(review): the conversation corpus is ignored for img/audio
        # datasets -- confirm this is intentional.
        dataset = ImageDataset(fields, src_examples_iter, tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred)
    elif data_type == 'audio':
        dataset = AudioDataset(fields, src_examples_iter, tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)
    else:
        # Previously an unknown data_type left `dataset` unbound and the
        # return below raised NameError.
        raise ValueError("Unsupported data_type: %s" % data_type)

    return dataset
def build_dataset(fields, data_type, src_path, tgt_path, doc_path=None,
                  src_dir=None, src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  dynamic_dict=True, sample_rate=0, window_size=0,
                  window_stride=0, window=None, normalize_audio=True,
                  use_filter_pred=True):
    """
    Build a dataset from src/tgt corpora plus a file of per-example
    document indices (`doc_path`: one integer per line).

    Raises:
        ValueError: if `data_type` is not text/img/audio.

    NOTE(review): `doc_path` defaults to None but is opened
    unconditionally, so omitting it raises TypeError -- confirm whether it
    should be optional. This file also binds `build_dataset` several
    times; only the last definition is live at import time.
    """
    # Build src/tgt example iterators from corpus files, also extracting
    # the number of features.
    src_examples_iter, num_src_feats = \
        _make_examples_nfeats_tpl(data_type, src_path, src_dir,
                                  src_seq_length_trunc, sample_rate,
                                  window_size, window_stride,
                                  window, normalize_audio)

    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_path, tgt_seq_length_trunc, "tgt")

    # Read the document index file; `with` ensures the handle is closed
    # (the original left the file open).
    with open(doc_path) as doc_file:
        doc_index = [int(line.strip()) for line in doc_file]

    if data_type == 'text':
        dataset = TextDataset(fields, src_examples_iter, tgt_examples_iter,
                              doc_index, num_src_feats, num_tgt_feats,
                              src_seq_length=src_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        dataset = ImageDataset(fields, src_examples_iter, tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred)
    elif data_type == 'audio':
        dataset = AudioDataset(fields, src_examples_iter, tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)
    else:
        # Previously an unknown data_type left `dataset` unbound and the
        # return below raised NameError.
        raise ValueError("Unsupported data_type: %s" % data_type)

    return dataset
def build_dataset(fields, data_type, src_path, tgt_path, src_dir=None,
                  src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  dynamic_dict=True, sample_rate=0, window_size=0,
                  window_stride=0, window=None, normalize_audio=True,
                  use_filter_pred=True, symbol_representation="word2word",
                  revert_targets=False):
    """
    Build a dataset with configurable src/tgt symbol representation
    levels (e.g. "word2word") and optional target reversal.

    Raises:
        ValueError: if `data_type` is not text/img/audio.

    NOTE(review): this file binds `build_dataset` several times; only the
    last definition is live at import time.
    """
    src_representation, tgt_representation = \
        get_input_output_representation_level(symbol_representation)

    # Build src/tgt example iterators from corpus files, also extracting
    # the number of features.
    # NOTE(review): `src_representation` is an extra positional argument
    # that the `_make_examples_nfeats_tpl` defined in this file does not
    # accept -- confirm the intended helper version.
    src_examples_iter, num_src_feats = \
        _make_examples_nfeats_tpl(data_type, src_path, src_dir,
                                  src_seq_length_trunc, sample_rate,
                                  window_size, window_stride,
                                  window, normalize_audio,
                                  src_representation)

    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_path, tgt_seq_length_trunc, "tgt",
            tgt_representation, revert_targets)

    if data_type == 'text':
        dataset = TextDataset(fields, src_examples_iter, tgt_examples_iter,
                              num_src_feats, num_tgt_feats,
                              src_seq_length=src_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        dataset = ImageDataset(fields, src_examples_iter, tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred)
    elif data_type == 'audio':
        dataset = AudioDataset(fields, src_examples_iter, tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)
    else:
        # Previously an unknown data_type left `dataset` unbound and the
        # return below raised NameError.
        raise ValueError("Unsupported data_type: %s" % data_type)

    return dataset
def build_dataset(fields, data_type, src_path, tgt_path, src_path2,
                  tgt_path2, src_dir=None, src_seq_length=0,
                  tgt_seq_length=0, src_seq_length_trunc=0,
                  tgt_seq_length_trunc=0, dynamic_dict=True, sample_rate=0,
                  window_size=0, window_stride=0, window=None,
                  normalize_audio=True, use_filter_pred=True):
    """
    Build a dataset from two src/tgt corpus pairs.

    Raises:
        ValueError: if `data_type` is not text/img/audio.

    NOTE(review): this file binds `build_dataset` several times; only the
    last definition is live at import time. The "src1"/"src2" arguments
    passed to `_make_examples_nfeats_tpl` do not match the helper defined
    in this file -- confirm the intended helper version.
    """
    # Build both src/tgt example iterator pairs from the corpus files,
    # also extracting the number of features.
    src_examples_iter, num_src_feats = \
        _make_examples_nfeats_tpl(data_type, src_path, src_dir,
                                  src_seq_length_trunc, sample_rate,
                                  window_size, window_stride,
                                  window, normalize_audio, "src1")
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_path, tgt_seq_length_trunc, "tgt1")

    # NOTE(review): the feature counts of the second pair are never used
    # below -- confirm they are intentionally discarded.
    src_examples_iter2, num_src_feats2 = \
        _make_examples_nfeats_tpl(data_type, src_path2, src_dir,
                                  src_seq_length_trunc, sample_rate,
                                  window_size, window_stride,
                                  window, normalize_audio, "src2")
    tgt_examples_iter2, num_tgt_feats2 = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_path2, tgt_seq_length_trunc, "tgt2")

    if data_type == 'text':
        dataset = TextDataset(fields, src_examples_iter, tgt_examples_iter,
                              src_examples_iter2, tgt_examples_iter2,
                              num_src_feats, num_tgt_feats,
                              src_seq_length=src_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        dataset = ImageDataset(fields, src_examples_iter, tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred)
    elif data_type == 'audio':
        dataset = AudioDataset(fields, src_examples_iter, tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)
    else:
        # Previously an unknown data_type left `dataset` unbound and the
        # return below raised NameError.
        raise ValueError("Unsupported data_type: %s" % data_type)

    return dataset
def build_dataset(fields, data_type, src_path, tgt_path, src_dir=None,
                  second_data_type=None, second_src_path=None,
                  src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  dynamic_dict=True, sample_rate=0, window_size=0,
                  window_stride=0, window=None, normalize_audio=True,
                  use_filter_pred=True, file_to_tensor_fn=None):
    """
    Build a dataset, optionally multimodal: when `second_data_type` is
    set, a second (non-text) source corpus is combined with the primary
    text source into a MultiModalDataset.

    Raises:
        ValueError: if no branch matches `data_type`.

    NOTE(review): this file binds `build_dataset` several times; only the
    last definition is live at import time. The `side=` and
    `file_to_tensor_fn=` keywords below do not match the
    `_make_examples_nfeats_tpl` defined in this file -- confirm the
    intended helper version.
    """
    use_second_modality = second_data_type is not None
    if use_second_modality:
        # Only implemented for primary input type text. A secondary text
        # input could simply be appended to the primary input instead.
        assert data_type == 'text'
        assert second_data_type != 'text', 'second_data_type cannot be text.'
        assert second_src_path is not None and src_dir is not None, \
            'If second_data_type is set, second_src_path as well as src_dir needs to be present'

    # Build src/tgt example iterators from corpus files, also extracting
    # the number of features.
    src_examples_iter, num_src_feats = \
        _make_examples_nfeats_tpl(data_type, src_path, src_dir,
                                  src_seq_length_trunc, sample_rate,
                                  window_size, window_stride,
                                  window, normalize_audio,
                                  file_to_tensor_fn=file_to_tensor_fn)

    if use_second_modality:
        src2_examples_iter, num_src2_feats = \
            _make_examples_nfeats_tpl(second_data_type, second_src_path,
                                      src_dir, src_seq_length_trunc,
                                      sample_rate, window_size,
                                      window_stride, window,
                                      normalize_audio, side='src2',
                                      file_to_tensor_fn=file_to_tensor_fn)

    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_path, tgt_seq_length_trunc, "tgt")

    if use_second_modality:
        dataset = MultiModalDataset(fields, src_examples_iter,
                                    src2_examples_iter, second_data_type,
                                    tgt_examples_iter, num_src_feats,
                                    num_src2_feats, num_tgt_feats,
                                    src_seq_length=src_seq_length,
                                    tgt_seq_length=tgt_seq_length,
                                    use_filter_pred=use_filter_pred)
    elif data_type == 'text':
        dataset = TextDataset(fields, src_examples_iter, tgt_examples_iter,
                              num_src_feats, num_tgt_feats,
                              src_seq_length=src_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        dataset = ImageDataset(fields, src_examples_iter, tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred)
    elif data_type == 'audio':
        dataset = AudioDataset(fields, src_examples_iter, tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)
    else:
        # Previously an unmatched data_type (with no second modality) left
        # `dataset` unbound and the return below raised NameError.
        raise ValueError("Unsupported data_type: %s" % data_type)

    return dataset