def build_dataset(fields, data_type, src, src_dir=None, tgt=None,
                  src_seq_len=50, tgt_seq_len=50, sample_rate=0,
                  window_size=0, window_stride=0, window=None,
                  normalize_audio=True, use_filter_pred=True,
                  image_channel_size=3):
    """
    Build a dataset of the class matching ``data_type``.

    src: path to corpus file or iterator over source data
    tgt: path to corpus file, iterator over target data, or None

    The audio parameters (``sample_rate``, ``window*``,
    ``normalize_audio``) are only used when ``data_type`` is 'audio';
    ``image_channel_size`` only when it is 'img'.
    """
    # Map each supported modality to its dataset class.
    dataset_classes = {
        'text': TextDataset, 'img': ImageDataset, 'audio': AudioDataset
    }
    assert data_type in dataset_classes
    assert src is not None
    if data_type == 'text':
        src_examples_iter = TextDataset.make_examples(src, "src")
    elif data_type == 'img':
        # there is a truncate argument as well, but it was never set to
        # anything besides None before
        src_examples_iter = ImageDataset.make_examples(
            src, src_dir, 'src', channel_size=image_channel_size)
    else:
        src_examples_iter = AudioDataset.make_examples(
            src, src_dir, "src", sample_rate, window_size,
            window_stride, window, normalize_audio, None)
    # The target side, when present, is always treated as text.
    if tgt is None:
        tgt_examples_iter = None
    else:
        tgt_examples_iter = TextDataset.make_examples(tgt, "tgt")
    # the second conjunct means nothing will be filtered at translation time
    # if there is no target data
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(filter_example,
                              use_src_len=data_type == 'text',
                              max_src_len=src_seq_len,
                              max_tgt_len=tgt_seq_len)
    else:
        filter_pred = None
    dataset_cls = dataset_classes[data_type]
    dataset = dataset_cls(fields, src_examples_iter, tgt_examples_iter,
                          filter_pred=filter_pred)
    return dataset
def build_dataset(fields, data_type=None, data_iter=None, data_path=None,
                  total_token_length=500, src_seq_length=100,
                  src_sent_length=100, seq_length_trunc=0,
                  use_filter_pred=True, tfidf=None):
    """
    Build src/tgt examples iterator from corpus files, also extract
    number of features, then construct the TextDataset.
    """
    # assert data_type is not None
    ex_iter, feat_count = TextDataset.make_text_examples_nfeats_tpl(
        data_iter, data_path, seq_length_trunc, tfidf)
    return TextDataset(fields, data_type, ex_iter, feat_count,
                       total_token_length=total_token_length,
                       src_seq_length=src_seq_length,
                       src_sent_length=src_sent_length,
                       use_filter_pred=use_filter_pred)
def build_dataset(fields, data_type, src_data_iter=None, src_path=None,
                  src_dir=None, tgt_data_iter=None, tgt_path=None,
                  src_seq_len=0, tgt_seq_len=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  dynamic_dict=True, sample_rate=0, window_size=0,
                  window_stride=0, window=None, normalize_audio=True,
                  use_filter_pred=True, image_channel_size=3):
    """
    Build src/tgt examples iterator from corpus files, also extract
    number of features.
    """
    # Source examples depend on the modality; each branch delegates to
    # the matching dataset class.
    if data_type == 'text':
        src_examples_iter = TextDataset.make_text_examples(
            src_data_iter, src_path, src_seq_length_trunc, "src"
        )
    elif data_type == 'img':
        src_examples_iter = ImageDataset.make_image_examples(
            src_data_iter, src_path, src_dir, image_channel_size)
    elif data_type == 'audio':
        if src_data_iter:
            raise ValueError("""Data iterator for AudioDataset isn't implemented""")
        if src_path is None:
            raise ValueError("AudioDataset requires a non None path")
        src_examples_iter = AudioDataset.make_audio_examples(
            src_path, src_dir, sample_rate, window_size,
            window_stride, window, normalize_audio
        )
    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter = TextDataset.make_text_examples(
        tgt_data_iter, tgt_path, tgt_seq_length_trunc, "tgt"
    )
    # I'm not certain about the practical utility of the second part
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(
            filter_example, use_src_len=data_type == 'text',
            max_src_len=src_seq_len, max_tgt_len=tgt_seq_len
        )
    else:
        filter_pred = None
    # Only TextDataset takes the dynamic_dict flag; the image/audio
    # classes share the simpler constructor signature.
    if data_type == 'text':
        dataset = TextDataset(
            fields, src_examples_iter, tgt_examples_iter,
            dynamic_dict=dynamic_dict, filter_pred=filter_pred)
    else:
        dataset_cls = ImageDataset if data_type == 'img' else AudioDataset
        dataset = dataset_cls(
            fields, src_examples_iter, tgt_examples_iter,
            filter_pred=filter_pred
        )
    return dataset
def build_dataset(fields, data_type, src_data_iter=None, src_path=None,
                  tgt_data_iter=None, tgt_path=None,
                  src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  dynamic_dict=False, sample_rate=0, window_size=0,
                  window_stride=0, window=None, normalize_audio=True,
                  use_filter_pred=True, topk_keywords=20):
    """
    Build src/tgt examples iterator from corpus files, also extract
    number of features.

    Both sides of the corpus are processed as text. The audio-related
    parameters (`sample_rate`, `window_size`, `window_stride`, `window`,
    `normalize_audio`) and `data_type` are accepted for interface
    compatibility but are not used: the previous implementation routed
    them through a nested helper that silently ignored them, which has
    been removed in favor of a direct call.
    """
    # Source side.
    src_examples_iter, num_src_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            src_data_iter, src_path, src_seq_length_trunc, "src",
            topk_keywords=topk_keywords)
    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_data_iter, tgt_path, tgt_seq_length_trunc, "tgt",
            topk_keywords=topk_keywords)
    return TextDataset(fields, src_examples_iter, tgt_examples_iter,
                       num_src_feats, num_tgt_feats,
                       src_seq_length=src_seq_length,
                       tgt_seq_length=tgt_seq_length,
                       dynamic_dict=dynamic_dict,
                       use_filter_pred=use_filter_pred)
def _make_examples_nfeats_tpl(data_type, src_data_iter, src_path, src_dir, src_seq_length_trunc, sample_rate, window_size, window_stride, window, normalize_audio): """ Process the corpus into (example_dict iterator, num_feats) tuple on source side for different 'data_type'. """ if data_type == 'text': src_examples_iter, num_src_feats = \ TextDataset.make_text_examples_nfeats_tpl( src_data_iter, src_path, src_seq_length_trunc, "src") elif data_type == 'img': src_examples_iter, num_src_feats = \ ImageDataset.make_image_examples_nfeats_tpl( src_data_iter, src_path, src_dir) elif data_type == 'audio': if src_data_iter: raise ValueError("""Data iterator for AudioDataset isn't implemented""") if src_path is None: raise ValueError("AudioDataset requires a non None path") src_examples_iter, num_src_feats = \ AudioDataset.make_audio_examples_nfeats_tpl( src_path, src_dir, sample_rate, window_size, window_stride, window, normalize_audio) return src_examples_iter, num_src_feats
def build_dataset(fields, data_type, src, ans, src_dir=None, tgt=None,
                  src_seq_len=50, tgt_seq_len=50, ans_seq_len=50,
                  sample_rate=0, window_size=0, window_stride=0,
                  window=None, use_filter_pred=True):
    """
    src: path to corpus file or iterator over source data
    tgt: path to corpus file, iterator over target data, or None

    Builds a text dataset carrying a source, an answer and (optionally)
    a target side.
    """
    dataset_classes = {'text': TextDataset}
    assert data_type in dataset_classes
    assert src is not None
    if data_type == 'text':
        src_examples_iter = TextDataset.make_examples(src, "src")
        ans_examples_iter = TextDataset.make_examples(ans, "ans")
    tgt_examples_iter = (None if tgt is None
                         else TextDataset.make_examples(tgt, "tgt"))
    # Nothing is filtered at translation time: with no target data the
    # second condition below is False.
    filter_pred = None
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(filter_example,
                              use_src_len=data_type == 'text',
                              max_src_len=src_seq_len,
                              max_tgt_len=tgt_seq_len,
                              max_ans_len=ans_seq_len)
    return dataset_classes[data_type](fields, src_examples_iter,
                                      tgt_examples_iter,
                                      ans_examples_iter,
                                      filter_pred=filter_pred)
def build_dataset(fields, data_type=None, data_iter=None, data_path=None,
                  seq_length=0, seq_length_trunc=0, dynamic_dict=True,
                  use_filter_pred=True):
    """
    Build src/tgt examples iterator from corpus files, also extract
    number of features, and construct the TextDataset.
    """
    # assert data_type is not None
    examples, n_src, n_qa, n_tgt = TextDataset.make_text_examples_nfeats_tpl(
        data_iter, data_path, seq_length_trunc)
    return TextDataset(fields, data_type, examples,
                       n_src, n_qa, n_tgt,
                       src_seq_length=seq_length,
                       dynamic_dict=dynamic_dict,
                       use_filter_pred=use_filter_pred)
def build_dataset(fields, data_type, src_data_iter=None, src_path=None,
                  src_dir=None, tgt_data_iter=None, tgt_path=None,
                  src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  dynamic_dict=True, sample_rate=0, window_size=0,
                  window_stride=0, window=None, normalize_audio=True,
                  use_filter_pred=True):
    """
    Build src/tgt examples iterator from corpus files, also extract
    number of features. Both sides are processed as text.
    """
    # (iterator, path, truncation) triples, processed in src-then-tgt
    # order via the dict's insertion ordering.
    side_inputs = {
        "src": (src_data_iter, src_path, src_seq_length_trunc),
        "tgt": (tgt_data_iter, tgt_path, tgt_seq_length_trunc),
    }
    iters, feat_counts = {}, {}
    for side, (data_iter, path, trunc) in side_inputs.items():
        iters[side], feat_counts[side] = \
            TextDataset.make_text_examples_nfeats_tpl(
                data_iter, path, trunc, side)
    return TextDataset(fields, iters["src"], iters["tgt"],
                       feat_counts["src"], feat_counts["tgt"],
                       src_seq_length=src_seq_length,
                       tgt_seq_length=tgt_seq_length,
                       dynamic_dict=dynamic_dict,
                       use_filter_pred=use_filter_pred)
def get_num_features(data_type, corpus_file, side): """ Args: data_type (str): type of the source input. Options are [text|img|audio]. corpus_file (str): file path to get the features. Returns: number of features on `side`. """ # if data_type == 'concat': return TextDataset.get_num_features(corpus_file, side)
def get_fields(data_type):
    """
    Args:
        data_type: corpus layout selector, one of concat / query / hier.

    Returns:
        A dictionary whose keys are strings and whose values are the
        corresponding Field objects.
    """
    return TextDataset.get_fields(data_type)
def get_fields(data_type, n_src_features, n_tgt_features):
    """
    Args:
        data_type: type of the source input.
            Options are [text|img|audio].
        n_src_features: the number of source features to
            create `torchtext.data.Field` for.
        n_tgt_features: the number of target features to
            create `torchtext.data.Field` for.

    Returns:
        A dictionary whose keys are strings and whose values are the
        corresponding Field objects.
    """
    # NOTE(review): `data_type` is accepted but not used — fields are
    # always built by TextDataset; confirm this is intended.
    return TextDataset.get_fields(n_src_features, n_tgt_features)
def get_num_features(data_type, corpus_file, side):
    """
    Args:
        data_type (str): type of the source input.
            Options are [text|img|audio].
        corpus_file (str): file path to get the features.
        side (str): for source or for target.

    Returns:
        number of features on `side`.
    """
    assert side in ["src", "tgt"]
    # NOTE(review): `data_type` is not consulted — counting always
    # delegates to TextDataset.
    return TextDataset.get_num_features(corpus_file, side)
def num_feats(self):
    """
    Count the features of the corpus by peeking at its first line,
    then seek back to the saved position so the caller's read cursor
    is undisturbed. Caches the result on ``self.n_feats``.
    """
    # Remember where the stream currently is before reading ahead.
    saved_pos = self.corpus.tell()
    line = self.corpus.readline().split()
    if self.line_truncate:
        line = line[:self.line_truncate]
    # Only the feature count is needed; the tokens are discarded.
    _, _, self.n_feats = TextDataset.extract_text_features(line)
    self.corpus.seek(saved_pos)
    return self.n_feats
def _make_examples_nfeats_tpl(data_type, src_data_iter, src_path, src_dir, src_seq_length_trunc, sample_rate, window_size, window_stride, window, normalize_audio, side): """ Process the corpus into (example_dict iterator, num_feats) tuple on source side for different 'data_type'. """ if data_type == 'text': src_examples_iter, num_src_feats = \ TextDataset.make_text_examples_nfeats_tpl( src_data_iter, src_path, src_seq_length_trunc, side) return src_examples_iter, num_src_feats
def _example_dict_iter(self, line, index):
    """
    Turn one raw corpus line into an example dict keyed by this
    reader's side, its features, and the running `index`.
    """
    # Tokenize and optionally truncate the raw line.
    line = line.split()
    if self.line_truncate:
        line = line[:self.line_truncate]
    words, feats, n_feats = TextDataset.extract_text_features(line)
    example_dict = {self.side: words, "indices": index}
    if feats:
        # All examples must have same number of features.
        aeq(self.n_feats, n_feats)
        # Feature j is stored under e.g. "src_feat_0", "src_feat_1", ...
        prefix = self.side + "_feat_"
        example_dict.update((prefix + str(j), f)
                            for j, f in enumerate(feats))
    return example_dict
def get_num_features(data_type, corpus_file, side):
    """
    Args:
        data_type (str): type of the source input.
            Options are [text|img|audio].
        corpus_file (str): file path to get the features.
        side (str): for source or for target.

    Returns:
        number of features on `side`.

    Raises:
        ValueError: if `data_type` is anything other than 'text'.
    """
    assert side in ["src", "tgt", "ans"]
    # Guard clause: only the text modality is supported here.
    if data_type != 'text':
        raise ValueError("Data type not implemented")
    return TextDataset.get_num_features(corpus_file, side)
def get_fields(data_type, n_src_features, n_tgt_features, n_ans_features):
    """
    Args:
        data_type: type of the source input.
            Options are [text|img|audio].
        n_src_features: the number of source features to
            create `torchtext.data.Field` for.
        n_tgt_features: the number of target features to
            create `torchtext.data.Field` for.
        n_ans_features: the number of answer-side features.

    Returns:
        A dictionary whose keys are strings and whose values are the
        corresponding Field objects.

    Raises:
        ValueError: if `data_type` is anything other than 'text'.
    """
    # Guard clause: only the text modality is supported here.
    if data_type != 'text':
        raise ValueError("Data type not implemented")
    return TextDataset.get_fields(n_src_features, n_tgt_features,
                                  n_ans_features)
def get_num_features(src_data_type, corpus_file, side):
    """
    Args:
        src_data_type (str): ['text'|'img'|'audio']
        corpus_file (str): file path to get the features.
        side (str): src or tgt

    Returns:
        number of features on `side`.
    """
    assert side in ["src", "tgt"]
    assert src_data_type in ['text', 'img', 'audio'], \
        "Data type not implemented"
    # Non-text source modalities carry no token-level features.
    if side == 'src' and src_data_type != 'text':
        return 0
    # Peek at the first line of the corpus to count its features.
    with codecs.open(corpus_file, "r", "utf-8") as f:
        first_tokens = f.readline().strip().split()
    _, _, n_feats = TextDataset.extract_text_features(first_tokens)
    return n_feats
def load_fields_from_vocab(vocab, data_type="text"):
    """
    Load Field objects from `vocab.pt` file.

    `vocab` is a sequence of (name, Vocab) pairs; each vocab is
    reattached to the matching field after the fields are rebuilt from
    the per-side feature counts.
    """
    vocab = dict(vocab)
    feat_counts = {side: len(collect_features(vocab, side))
                   for side in ('src', 'tgt', 'ans')}
    fields = TextDataset.get_fields(feat_counts['src'],
                                    feat_counts['tgt'],
                                    feat_counts['ans'])
    for name, vb in vocab.items():
        # Hack. Can't pickle defaultdict :( — rebuild the stoi mapping
        # with a 0 (unk) default before attaching it.
        vb.stoi = defaultdict(lambda: 0, vb.stoi)
        fields[name].vocab = vb
    return fields
def build_dataset(fields, data_type, src_data_iter=None, src_path=None,
                  src_dir=None, tgt_data_iter=None, tgt_path=None,
                  src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  dynamic_dict=True, sample_rate=0, window_size=0,
                  window_stride=0, window=None, normalize_audio=True,
                  use_filter_pred=True, image_channel_size=3):
    """
    Build src/tgt examples iterator from corpus files, also extract
    number of features.
    """

    def _make_examples_nfeats_tpl(data_type, src_data_iter, src_path,
                                  src_dir, src_seq_length_trunc,
                                  sample_rate, window_size, window_stride,
                                  window, normalize_audio,
                                  image_channel_size=3):
        """
        Process the corpus into (example_dict iterator, num_feats) tuple
        on source side for different 'data_type'.
        """
        if data_type == 'text':
            src_examples_iter, num_src_feats = \
                TextDataset.make_text_examples_nfeats_tpl(
                    src_data_iter, src_path, src_seq_length_trunc, "src")
        elif data_type == 'img':
            src_examples_iter, num_src_feats = \
                ImageDataset.make_image_examples_nfeats_tpl(
                    src_data_iter, src_path, src_dir, image_channel_size)
        elif data_type == 'audio':
            if src_data_iter:
                raise ValueError("""Data iterator for AudioDataset isn't implemented""")
            if src_path is None:
                raise ValueError("AudioDataset requires a non None path")
            src_examples_iter, num_src_feats = \
                AudioDataset.make_audio_examples_nfeats_tpl(
                    src_path, src_dir, sample_rate, window_size,
                    window_stride, window, normalize_audio)
        return src_examples_iter, num_src_feats

    src_examples_iter, num_src_feats = \
        _make_examples_nfeats_tpl(data_type, src_data_iter, src_path,
                                  src_dir, src_seq_length_trunc,
                                  sample_rate, window_size, window_stride,
                                  window, normalize_audio,
                                  image_channel_size=image_channel_size)
    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_data_iter, tgt_path, tgt_seq_length_trunc, "tgt")
    # Each modality has its own dataset class with a distinct
    # constructor signature.
    if data_type == 'text':
        dataset = TextDataset(fields, src_examples_iter, tgt_examples_iter,
                              num_src_feats, num_tgt_feats,
                              src_seq_length=src_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        dataset = ImageDataset(fields, src_examples_iter,
                               tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred,
                               image_channel_size=image_channel_size)
    elif data_type == 'audio':
        dataset = AudioDataset(fields, src_examples_iter,
                               tgt_examples_iter,
                               num_src_feats, num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)
    return dataset
def build_dataset(
        fields, data_type, src, src_dir=None, tgt=None,
        src_seq_len=50, tgt_seq_len=50,
        src_seq_length_trunc=0, tgt_seq_length_trunc=0,
        # dynamic_dict=False,
        flag_fft=False,
        sample_rate=0, window_size=0, window_stride=0, window=None,
        normalize_audio=False, use_filter_pred=True, corpus_type='train'
        # image_channel_size=3
):
    """
    src: path to corpus file or iterator over source data
    tgt: path to corpus file, iterator over target data, or None

    This variant only supports the 'nano' data type; the source side is
    built by NanoDataset and the target side (if any) is text.
    """
    # dataset_classes = {
    #     'text': TextDataset, 'img': ImageDataset, 'audio': AudioDataset
    # }
    dataset_classes = {'nano': NanoDataset}
    assert data_type in dataset_classes
    assert src is not None
    # assert not dynamic_dict or data_type == 'text', \
    #     'it is not possible to use dynamic_dict with non-text input'
    # if data_type == 'text':
    #     src_examples_iter = TextDataset.make_examples(
    #         src, src_seq_length_trunc, "src"
    #     )
    # elif data_type == 'img':
    #     src_examples_iter = ImageDataset.make_examples(
    #         src, src_dir, 'src', channel_size=image_channel_size
    #     )
    # else:
    #     src_examples_iter = AudioDataset.make_examples(
    #         src, src_dir, "src", sample_rate,
    #         window_size, window_stride, window,
    #         normalize_audio, None)
    src_examples_iter = NanoDataset.make_examples(
        src, src_dir, "src", flag_fft, sample_rate, window_size,
        window_stride, window, normalize_audio, None, corpus_type)
    if tgt is None:
        tgt_examples_iter = None
    else:
        tgt_examples_iter = TextDataset.make_examples(
            tgt, tgt_seq_length_trunc, "tgt")
    # the second conjunct means nothing will be filtered at translation time
    # if there is no target data
    if use_filter_pred and tgt_examples_iter is not None:
        # NOTE(review): use_src_len is always False here since only
        # 'nano' passes the assert above — confirm this is intended.
        filter_pred = partial(filter_example,
                              use_src_len=data_type == 'text',
                              max_src_len=src_seq_len,
                              max_tgt_len=tgt_seq_len)
    else:
        filter_pred = None
    # dataset_cls = dataset_classes[data_type]
    dataset = NanoDataset(fields, src_examples_iter, tgt_examples_iter,
                          dynamic_dict=False,
                          filter_pred=filter_pred)
    return dataset
def build_dataset(fields, data_type, src, knl, src_dir=None, tgt=None,
                  knl_seq_len=800, src_seq_len=150, tgt_seq_len=50,
                  knl_seq_length_trunc=200, src_seq_length_trunc=50,
                  tgt_seq_length_trunc=0, dynamic_dict=False,
                  sample_rate=0, window_size=0, window_stride=0,
                  window=None, normalize_audio=True, use_filter_pred=True,
                  image_channel_size=3, corpus_type='train',
                  model_mode='default'):
    """
    src: path to corpus file or iterator over source data
    tgt: path to corpus file, iterator over target data, or None

    Additionally carries a knowledge ('knl') side alongside src/tgt.
    """
    dataset_classes = {
        'text': TextDataset, 'img': ImageDataset, 'audio': AudioDataset
    }
    assert data_type in dataset_classes
    assert src is not None
    assert not dynamic_dict or data_type == 'text', \
        'it is not possible to use dynamic_dict with non-text input'
    if data_type == 'text':
        src_examples_iter = TextDataset.make_examples(
            src, src_seq_length_trunc, "src", corpus_type, model_mode
        )
        knl_examples_iter = TextDataset.make_examples(
            knl, knl_seq_length_trunc, "knl", corpus_type, model_mode
        )
    elif data_type == 'img':
        # there is a truncate argument as well, but it was never set to
        # anything besides None before
        src_examples_iter = ImageDataset.make_examples(
            src, src_dir, 'src', channel_size=image_channel_size
        )
    else:
        src_examples_iter = AudioDataset.make_examples(
            src, src_dir, "src", sample_rate,
            window_size, window_stride, window,
            normalize_audio, None)
    # NOTE(review): on the 'img'/'audio' paths knl_examples_iter is never
    # bound, so the dataset construction below would raise NameError —
    # confirm only data_type == 'text' is exercised in practice.
    if tgt is None:
        tgt_examples_iter = None
    else:
        tgt_examples_iter = TextDataset.make_examples(
            tgt, tgt_seq_length_trunc, "tgt", corpus_type, model_mode)
    # the second conjunct means nothing will be filtered at translation time
    # if there is no target data
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(
            filter_example, use_src_len=data_type == 'text',
            use_knl_len=data_type == 'text',
            max_src_len=src_seq_len, max_tgt_len=tgt_seq_len,
            max_knl_len=knl_seq_len
        )
    else:
        filter_pred = None
    dataset_cls = dataset_classes[data_type]
    dataset = dataset_cls(
        fields, src_examples_iter, tgt_examples_iter,
        knl_examples_iter, dynamic_dict=dynamic_dict,
        filter_pred=filter_pred
    )
    print("[onmt.inputters.inputter.py] dataset_cls:{}".format(dataset_cls))
    print("[onmt.inputters.inputter.py] dataset:{}".format(dataset))
    return dataset