def _example_dict_iter(self, line, index):
    """
    Build the example dictionary for one raw corpus line.

    Args:
        line (str): one line of the corpus; it is split on whitespace.
        index: value stored under the "indices" key of the result.

    Returns:
        dict: maps `self.side` to the words returned by
        `TextDataset.extract_text_features`, "indices" to `index`, and,
        when features are present, `<side>_feat_<j>` to the j-th feature.
    """
    tokens = line.split()
    if self.line_truncate:
        # A falsy `line_truncate` (0 / None) disables truncation.
        tokens = tokens[:self.line_truncate]

    words, feats, n_feats = TextDataset.extract_text_features(tokens)
    example = {self.side: words, "indices": index}

    if not feats:
        return example

    # All examples must have same number of features.
    aeq(self.n_feats, n_feats)

    key_prefix = self.side + "_feat_"
    for feat_idx, feat in enumerate(feats):
        example[key_prefix + str(feat_idx)] = feat
    return example
def num_feats(self):
    """
    Peek at the next line of `self.corpus` to count its features.

    One line is read from the current position of `self.corpus`, split on
    whitespace, optionally truncated to `self.line_truncate` tokens and
    passed to `TextDataset.extract_text_features`. The stream is then
    moved back to the position it had on entry, so the peek does not
    consume the line.

    Returns:
        the feature count reported by `extract_text_features`; it is also
        stored on `self.n_feats`.
    """
    start_pos = self.corpus.tell()

    tokens = self.corpus.readline().split()
    if self.line_truncate:
        tokens = tokens[:self.line_truncate]

    _words, _feats, feat_count = TextDataset.extract_text_features(tokens)
    self.n_feats = feat_count

    # Rewind so later readers still see the peeked line.
    self.corpus.seek(start_pos)
    return self.n_feats
def get_num_features(src_data_type, corpus_file, side):
    """
    Count the features on one side of a corpus by inspecting its first line.

    Args:
        src_data_type (str): ['text'|'img'|'audio']
        corpus_file (str): path of the file to read the first line from.
        side (str): 'src' or 'tgt'

    Returns:
        number of features on `side`. This is 0, without opening
        `corpus_file`, when `side` is 'src' and the source is not text.

    Raises:
        AssertionError: if `side` or `src_data_type` is not one of the
            accepted values.
    """
    assert side in ["src", "tgt"]
    assert src_data_type in ['text', 'img', 'audio'], \
        "Data type not implemented"

    # Non-text sources carry no token-level features.
    if side == 'src' and src_data_type != 'text':
        return 0

    # Only the first line is needed; the file is read as UTF-8.
    with codecs.open(corpus_file, "r", "utf-8") as corpus:
        first_tokens = corpus.readline().strip().split()

    _, _, n_feats = TextDataset.extract_text_features(first_tokens)
    return n_feats