def _make_examples_nfeats_tpl(data_type, src_data_iter, src_path, src_dir,
                              src_seq_length_trunc, sample_rate,
                              window_size, window_stride,
                              window, normalize_audio):
    """
    Process the corpus into an (example_dict iterator, num_feats) tuple
    on the source side for the given ``data_type``.

    Args:
        data_type: one of 'text', 'img' or 'audio'.
        src_data_iter: in-memory iterator over the source corpus
            (unsupported for 'audio').
        src_path: path to the source corpus file.
        src_dir: directory holding image/audio files.
        src_seq_length_trunc: truncate source sequences to this length.
        sample_rate, window_size, window_stride, window, normalize_audio:
            audio preprocessing parameters, used only for 'audio'.

    Returns:
        (src_examples_iter, num_src_feats) tuple.

    Raises:
        ValueError: for an unsupported data_type, for an audio data
            iterator, or for a missing audio path.
    """
    if data_type == 'text':
        src_examples_iter, num_src_feats = \
            TextDataset.make_text_examples_nfeats_tpl(
                src_data_iter, src_path, src_seq_length_trunc, "src")
    elif data_type == 'img':
        src_examples_iter, num_src_feats = \
            ImageDataset.make_image_examples_nfeats_tpl(
                src_data_iter, src_path, src_dir)
    elif data_type == 'audio':
        if src_data_iter:
            raise ValueError("""Data iterator for AudioDataset isn't implemented""")
        if src_path is None:
            raise ValueError("AudioDataset requires a non None path")
        src_examples_iter, num_src_feats = \
            AudioDataset.make_audio_examples_nfeats_tpl(
                src_path, src_dir, sample_rate, window_size,
                window_stride, window, normalize_audio)
    else:
        # Fix: an unknown data_type previously fell through every branch
        # and raised UnboundLocalError on the return below; fail fast
        # with an explicit, descriptive error instead.
        raise ValueError("Unsupported data_type: %s" % data_type)
    return src_examples_iter, num_src_feats
def build_dataset(fields, data_type=None, data_iter=None, data_path=None,
                  total_token_length=500, src_seq_length=100,
                  src_sent_length=100, seq_length_trunc=0,
                  use_filter_pred=True, tfidf=None):
    """
    Build a TextDataset from a corpus file or an in-memory iterator,
    also extracting the number of features.

    The example iterator and feature count come from
    TextDataset.make_text_examples_nfeats_tpl; length limits and the
    filter predicate are forwarded to the TextDataset constructor.
    """
    # NOTE(review): data_type is forwarded unchecked; the original kept
    # an "assert data_type is not None" disabled.
    ex_iter, n_feats = TextDataset.make_text_examples_nfeats_tpl(
        data_iter, data_path, seq_length_trunc, tfidf)

    return TextDataset(fields, data_type, ex_iter, n_feats,
                       total_token_length=total_token_length,
                       src_seq_length=src_seq_length,
                       src_sent_length=src_sent_length,
                       use_filter_pred=use_filter_pred)
def build_dataset(fields, data_type, src_data_iter=None, src_path=None,
                  src_dir=None, tgt_data_iter=None, tgt_path=None,
                  src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  dynamic_dict=True, sample_rate=0, window_size=0,
                  window_stride=0, window=None, normalize_audio=True,
                  use_filter_pred=True):
    """
    Build a TextDataset holding both source and target examples, also
    extracting the number of features on each side.

    Both sides are read as text here; the audio-related parameters are
    accepted for signature compatibility but unused in this variant.
    """
    # Source side: text examples plus the source feature count.
    src_iter, n_src_feats = TextDataset.make_text_examples_nfeats_tpl(
        src_data_iter, src_path, src_seq_length_trunc, "src")

    # Target side: always processed as text, regardless of data_type.
    tgt_iter, n_tgt_feats = TextDataset.make_text_examples_nfeats_tpl(
        tgt_data_iter, tgt_path, tgt_seq_length_trunc, "tgt")

    return TextDataset(fields, src_iter, tgt_iter,
                       n_src_feats, n_tgt_feats,
                       src_seq_length=src_seq_length,
                       tgt_seq_length=tgt_seq_length,
                       dynamic_dict=dynamic_dict,
                       use_filter_pred=use_filter_pred)
def build_dataset(fields, data_type, src_data_iter=None, src_path=None,
                  tgt_data_iter=None, tgt_path=None,
                  src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  dynamic_dict=False, sample_rate=0, window_size=0,
                  window_stride=0, window=None, normalize_audio=True,
                  use_filter_pred=True, topk_keywords=20):
    """
    Build a TextDataset with top-k keyword extraction on both sides,
    also extracting the number of features.

    data_type and the audio parameters (sample_rate, window_*,
    normalize_audio) are accepted for signature compatibility only;
    both corpora are processed as text.
    """
    # Source side.  The nested helper in the original version was a
    # pure pass-through around this call, so it is inlined here.
    src_iter, n_src_feats = TextDataset.make_text_examples_nfeats_tpl(
        src_data_iter, src_path, src_seq_length_trunc, "src",
        topk_keywords=topk_keywords)

    # Target side: the tgt corpus is text for all data types.
    tgt_iter, n_tgt_feats = TextDataset.make_text_examples_nfeats_tpl(
        tgt_data_iter, tgt_path, tgt_seq_length_trunc, "tgt",
        topk_keywords=topk_keywords)

    return TextDataset(fields, src_iter, tgt_iter,
                       n_src_feats, n_tgt_feats,
                       src_seq_length=src_seq_length,
                       tgt_seq_length=tgt_seq_length,
                       dynamic_dict=dynamic_dict,
                       use_filter_pred=use_filter_pred)
def _make_examples_nfeats_tpl(data_type, src_data_iter, src_path, src_dir,
                              src_seq_length_trunc, sample_rate,
                              window_size, window_stride,
                              window, normalize_audio, side):
    """
    Process the corpus into an (example_dict iterator, num_feats) tuple
    for the given ``side``.

    Only 'text' is supported by this variant; the image/audio-related
    parameters are accepted for signature compatibility but unused.

    Returns:
        (examples_iter, num_feats) tuple for the requested side.

    Raises:
        ValueError: if data_type is not 'text'.
    """
    if data_type == 'text':
        examples_iter, num_feats = \
            TextDataset.make_text_examples_nfeats_tpl(
                src_data_iter, src_path, src_seq_length_trunc, side)
        return examples_iter, num_feats
    # Fix: any non-'text' data_type previously fell through to the
    # return with both names unbound, raising UnboundLocalError; raise
    # a descriptive ValueError instead.
    raise ValueError("Unsupported data_type: %s" % data_type)
def build_dataset(fields, data_type=None, data_iter=None, data_path=None,
                  seq_length=0, seq_length_trunc=0, dynamic_dict=True,
                  use_filter_pred=True):
    """
    Build a TextDataset whose examples carry src, QA and tgt features.

    In this variant make_text_examples_nfeats_tpl yields a 4-tuple:
    the example iterator plus the per-side feature counts (src, qa,
    tgt), all of which are forwarded to the TextDataset constructor.
    """
    # NOTE(review): the original kept "assert data_type is not None"
    # commented out; data_type is forwarded unchecked.
    (examples_iter,
     n_src_feats,
     n_qa_feats,
     n_tgt_feats) = TextDataset.make_text_examples_nfeats_tpl(
        data_iter, data_path, seq_length_trunc)

    return TextDataset(fields, data_type, examples_iter,
                       n_src_feats, n_qa_feats, n_tgt_feats,
                       src_seq_length=seq_length,
                       dynamic_dict=dynamic_dict,
                       use_filter_pred=use_filter_pred)
def build_dataset(fields, data_type, src_data_iter=None, src_path=None,
                  src_dir=None, tgt_data_iter=None, tgt_path=None,
                  src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  dynamic_dict=True, sample_rate=0, window_size=0,
                  window_stride=0, window=None, normalize_audio=True,
                  use_filter_pred=True, image_channel_size=3):
    """
    Build the dataset matching data_type ('text', 'img' or 'audio'),
    also extracting the number of features on each side.

    The source side is read according to data_type; the target side is
    always text.  Returns a TextDataset, ImageDataset or AudioDataset
    accordingly.
    """

    def _source_examples():
        """Return the (examples iterator, num feats) pair for the src side."""
        if data_type == 'text':
            it, nfeats = TextDataset.make_text_examples_nfeats_tpl(
                src_data_iter, src_path, src_seq_length_trunc, "src")
        elif data_type == 'img':
            it, nfeats = ImageDataset.make_image_examples_nfeats_tpl(
                src_data_iter, src_path, src_dir, image_channel_size)
        elif data_type == 'audio':
            # Audio can only be read from disk.
            if src_data_iter:
                raise ValueError("""Data iterator for AudioDataset isn't implemented""")
            if src_path is None:
                raise ValueError("AudioDataset requires a non None path")
            it, nfeats = AudioDataset.make_audio_examples_nfeats_tpl(
                src_path, src_dir, sample_rate, window_size,
                window_stride, window, normalize_audio)
        return it, nfeats

    src_examples_iter, num_src_feats = _source_examples()

    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_data_iter, tgt_path, tgt_seq_length_trunc, "tgt")

    if data_type == 'text':
        return TextDataset(fields, src_examples_iter, tgt_examples_iter,
                           num_src_feats, num_tgt_feats,
                           src_seq_length=src_seq_length,
                           tgt_seq_length=tgt_seq_length,
                           dynamic_dict=dynamic_dict,
                           use_filter_pred=use_filter_pred)
    if data_type == 'img':
        return ImageDataset(fields, src_examples_iter, tgt_examples_iter,
                            num_src_feats, num_tgt_feats,
                            tgt_seq_length=tgt_seq_length,
                            use_filter_pred=use_filter_pred,
                            image_channel_size=image_channel_size)
    if data_type == 'audio':
        return AudioDataset(fields, src_examples_iter, tgt_examples_iter,
                            num_src_feats, num_tgt_feats,
                            tgt_seq_length=tgt_seq_length,
                            sample_rate=sample_rate,
                            window_size=window_size,
                            window_stride=window_stride,
                            window=window,
                            normalize_audio=normalize_audio,
                            use_filter_pred=use_filter_pred)