def read(self, data, side, src_dir=None):
    """Read data into dicts.

    Args:
        data (str or Iterable[str]): Sequence of audio paths or
            path to file containing audio paths.
            In either case, the filenames may be relative to ``src_dir``
            (default behavior) or absolute.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.
        src_dir (str): Location of source audio files. See ``data``.

    Yields:
        A dictionary containing audio data for each line.
    """
    assert src_dir is not None and os.path.exists(src_dir), \
        "src_dir must be a valid directory if data_type is audio"

    if isinstance(data, str):
        data = DataReaderBase._read_file(data)

    for i, line in enumerate(tqdm(data)):
        line = line.decode("utf-8").strip()
        audio_path = os.path.join(src_dir, line)
        if not os.path.exists(audio_path):
            audio_path = line

        assert os.path.exists(audio_path), \
            'audio path %s not found' % line

        spect = self.extract_features(audio_path)
        yield {side: spect, side + '_path': line, 'indices': i}
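# Usage sketch (hypothetical paths; the reader's constructor arguments here
# are assumptions, not shown above). Each yielded dict carries the feature
# tensor from extract_features(), the original path line, and a running index.
reader = AudioDataReader(sample_rate=16000, window_size=0.02)  # assumed ctor
for ex in reader.read("audio_paths.txt", side="src", src_dir="data/speech"):
    spect = ex["src"]
    print(ex["indices"], ex["src_path"], tuple(spect.shape))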
def read_jsonl(self, sequences, _dir=None):
    """Read keyphrase data from disk. The only supported data format is
    JSON Lines.

    Args:
        sequences (str or Iterable[str]):
            path to text file or iterable of the actual text data.
        _dir (NoneType): Leave as ``None``. This parameter exists to
            conform with the :func:`DataReaderBase.read()` signature.

    Yields:
        dictionaries whose keys are the names of fields and whose
        values are more or less the result of tokenizing with those
        fields.
    """
    assert _dir is None or _dir == "", \
        "Cannot use _dir with KeyphraseDataReader."
    if isinstance(sequences, str):
        sequences = DataReaderBase._read_file(sequences)
    # `indices` must index the *yielded* examples, so keep a separate
    # counter rather than reusing the enumerate index (lines may be skipped)
    count = 0
    for i, line in enumerate(sequences):
        try:
            # default input is a jsonl line
            if isinstance(line, bytes):
                line = line.decode("utf-8")
            data = json.loads(line)
        except Exception:
            data = None
        # skip blank lines and anything that did not parse to a dict
        if not data or len(line.strip()) == 0 or not isinstance(data, dict):
            continue
        # insert `indices`
        count += 1
        data['indices'] = count
        yield data
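# Usage sketch with in-memory jsonl lines (hypothetical data; assumes a
# KeyphraseDataReader instance named `reader`). Malformed lines are skipped.
lines = [b'{"src": "a title . an abstract", "tgt": ["kw one", "kw two"]}',
         b'not valid json -- silently skipped']
for ex in reader.read_jsonl(lines):
    print(ex['indices'], sorted(ex.keys()))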
def read(self, lattices, side, _dir=""):
    """Read lattice data from disk.

    Args:
        lattices (str or Iterable[str]): Sequence of lattice paths or
            path to file containing lattice paths.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.
        _dir (str): Location of source lattice files. The paths in
            ``lattices`` may be relative to ``_dir`` or absolute.

    Yields:
        dictionaries whose keys are the names of fields and whose
        values are more or less the result of tokenizing with those
        fields.
    """
    if isinstance(lattices, str):
        lattices = DataReaderBase._read_file(lattices)

    for i, filename in enumerate(lattices):
        filename = filename.decode("utf-8").strip()
        lattice_path = os.path.join(_dir, filename)
        if not os.path.exists(lattice_path):
            lattice_path = filename

        assert os.path.exists(lattice_path), \
            'lattice path %s not found' % filename

        text, scores, lens = \
            LatticeDataReader.read_confnet_file(lattice_path)
        yield {side: text, 'score': scores, 'lens': lens,
               side + '_path': lattice_path, 'indices': i}
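# Usage sketch (hypothetical paths; LatticeDataReader construction and the
# confusion-network format parsed by read_confnet_file are assumed).
reader = LatticeDataReader()
for ex in reader.read("lattice_paths.txt", side="src", _dir="data/lattices"):
    print(ex["indices"], ex["src_path"], ex["lens"])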
def read(self, sequences, side, _dir=None):
    """Read edge data from disk.

    Args:
        sequences (str or Iterable[str]): path to edge file or iterable
            of the actual edge data.
        side (str): Prefix used in return dict. Usually
            ``"src"``, ``"tgt"``, or ``"grh"``.
        _dir (NoneType): Leave as ``None``. This parameter exists to
            conform with the :func:`DataReaderBase.read()` signature.

    Yields:
        dictionaries whose keys are the names of fields and whose
        values are more or less the result of tokenizing with those
        fields.
    """
    assert _dir is None or _dir == "", \
        "Cannot use _dir with GrhDataReader."
    if isinstance(sequences, str):
        sequences = DataReaderBase._read_file(sequences)
    for i, seq in enumerate(sequences):
        if isinstance(seq, six.binary_type):
            seq = seq.decode("utf-8")
        yield {side: seq, "indices": i}
def read(self, images, side, img_dir=None):
    """Read data into dicts.

    Args:
        images (str or Iterable[str]): Sequence of image paths or
            path to a file containing image paths.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.
        img_dir (str): Location of source image files. See ``images``.

    Yields:
        a dictionary containing image data, path and index for each line.
    """
    if isinstance(images, str):
        images = DataReaderBase._read_file(images)

    for i, filename in enumerate(images):
        filename = filename.decode("utf-8").strip()
        img_path = os.path.join(img_dir, filename)
        if not os.path.exists(img_path):
            img_path = filename

        assert os.path.exists(img_path), \
            'img path %s not found' % filename

        if self.channel_size == 1:
            img = transforms.ToTensor()(
                Image.fromarray(cv2.imread(img_path, 0)))
        else:
            img = transforms.ToTensor()(Image.open(img_path))
        # skip images larger than the truncate bounds (H, W)
        if self.truncate and self.truncate != (0, 0):
            if not (img.size(1) <= self.truncate[0]
                    and img.size(2) <= self.truncate[1]):
                continue
        yield {side: img, side + '_path': filename, 'indices': i}
def read(self, vecs, side, vec_dir=None):
    """Read data into dicts.

    Args:
        vecs (str or Iterable[str]): Sequence of feature vector paths or
            path to file containing feature vector paths.
            In either case, the filenames may be relative to ``vec_dir``
            (default behavior) or absolute.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.
        vec_dir (str): Location of source vectors. See ``vecs``.

    Yields:
        A dictionary containing feature vector data.
    """
    if isinstance(vecs, str):
        vecs = DataReaderBase._read_file(vecs)

    for i, filename in enumerate(vecs):
        filename = filename.decode("utf-8").strip()
        vec_path = os.path.join(vec_dir, filename)
        if not os.path.exists(vec_path):
            vec_path = filename

        assert os.path.exists(vec_path), \
            'vec path %s not found' % filename

        vec = np.load(vec_path)
        yield {side: torch.from_numpy(vec),
               side + "_path": filename,
               "indices": i}
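# Usage sketch: write a small .npy feature file and read it back
# (hypothetical file name; VecDataReader construction is assumed).
import numpy as np
np.save("feat0.npy", np.random.rand(5, 512).astype("float32"))
reader = VecDataReader()
for ex in reader.read([b"feat0.npy"], side="src", vec_dir="."):
    print(ex["indices"], ex["src"].shape)  # torch.Size([5, 512])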
def read(self, images, side, img_dir=None):
    """Read data into dicts.

    Args:
        images (str or Iterable[str]): Sequence of image paths, path to
            a file containing image paths, or an iterable of
            already-loaded PIL images. Filenames may be relative to
            ``img_dir`` (default behavior) or absolute.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.
        img_dir (str): Location of source image files. See ``images``.

    Yields:
        a dictionary containing image data, path and index for each line.
    """
    if isinstance(images, str):
        # materialize so the len()/indexing checks below are valid
        images = list(DataReaderBase._read_file(images))

    max_h = 80
    min_h = 32
    if len(images) > 0 and (isinstance(images[0], str)
                            or isinstance(images[0], bytes)):
        for i, filename in enumerate(images):
            filename = filename.decode("utf-8").strip()
            img_path = os.path.join(img_dir, filename)
            if not os.path.exists(img_path):
                img_path = filename

            assert os.path.exists(img_path), \
                'img path %s not found' % filename

            if self.channel_size == 1:
                img = transforms.ToTensor()(
                    Image.fromarray(cv2.imread(img_path, 0)))
            else:
                img = transforms.ToTensor()(Image.open(img_path))
            if self.truncate and self.truncate != (0, 0):
                if not (img.size(1) <= self.truncate[0]
                        and img.size(2) <= self.truncate[1]):
                    continue
            yield {side: img, side + '_path': filename, 'indices': i}
    else:
        # input is already a sequence of PIL images: rescale heights
        # into [min_h, max_h] before converting to tensors
        for i, img in enumerate(images):
            if img.size[1] > max_h:
                img = img.resize(
                    (int(img.size[0] * max_h / img.size[1]), max_h),
                    Image.ANTIALIAS)
            if img.size[1] < min_h:
                img = img.resize(
                    (int(img.size[0] * min_h / img.size[1]), min_h),
                    Image.ANTIALIAS)
            img = transforms.ToTensor()(img)
            if self.truncate and self.truncate != (0, 0):
                if not (img.size(1) <= self.truncate[0]
                        and img.size(2) <= self.truncate[1]):
                    continue
            yield {side: img, side + '_path': img_dir, 'indices': i}
def read(self, sequences, side, features={}):
    """Read text data from disk.

    Args:
        sequences (str or Iterable[str]):
            path to text file or iterable of the actual text data.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.
        features (Dict[str, str or Iterable[str]]):
            dictionary mapping feature names to the path to a feature
            file or an iterable of the actual feature data.

    Yields:
        dictionaries whose keys are the names of fields and whose
        values are more or less the result of tokenizing with those
        fields.
    """
    if isinstance(sequences, str):
        sequences = DataReaderBase._read_file(sequences)

    features_names = []
    features_values = []
    for feat_name, v in features.items():
        features_names.append(feat_name)
        if isinstance(v, str):
            features_values.append(DataReaderBase._read_file(v))
        else:
            features_values.append(v)
    for i, (seq, *feats) in enumerate(zip(sequences, *features_values)):
        ex_dict = {}
        if isinstance(seq, bytes):
            seq = seq.decode("utf-8")
        ex_dict[side] = seq
        for j, f in enumerate(feats):
            if isinstance(f, bytes):
                f = f.decode("utf-8")
            ex_dict[features_names[j]] = f
        yield {side: ex_dict, "indices": i}
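# Usage sketch with in-memory data (assumes a reader instance exposing the
# read() above; the feature name and values here are made up).
seqs = ["she went home", "he stayed"]
feats = {"src_pos": ["PRP VBD NN", "PRP VBD"]}
for ex in reader.read(seqs, side="src", features=feats):
    # ex looks like {"src": {"src": ..., "src_pos": ...}, "indices": i}
    print(ex["indices"], ex["src"])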
def read(self, sequences, side, _dir=None):
    """Read keyphrase data from disk. The only supported data format is
    JSON.

    Args:
        sequences (str or Iterable[str]):
            path to text file or iterable of the actual text data.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.
        _dir (NoneType): Leave as ``None``. This parameter exists to
            conform with the :func:`DataReaderBase.read()` signature.

    Yields:
        dictionaries whose keys are the names of fields and whose
        values are more or less the result of tokenizing with those
        fields.
        src: title + abstract
        tgt: a string of one keyword, or a string of concatenated
        keywords (delimited by <sep>)
    """
    assert _dir is None or _dir == "", \
        "Cannot use _dir with KeyphraseDataReader."
    if isinstance(sequences, str):
        sequences = DataReaderBase._read_file(sequences)
    for i, line in enumerate(sequences):
        try:
            # default input is a json line
            line = line.decode("utf-8")
            json_dict = json.loads(line)
            # note tgt could be a list of strings
            seq = json_dict[side]
            # torchtext field only takes numeric features
            id = json_dict['id']
        except Exception:
            # temporary measure for plain text input
            seq = line
            id = i
        try:
            # keep only the suffix after the last '_'
            if id.rfind('_') != -1:
                id = id[id.rfind('_') + 1:]
            id = int(id)
        except Exception:
            # if not convertible, use the enumerate index as id
            id = i
        yield {side: seq, "indices": i, 'id': id}
def read(self, sequences, side):
    """Read text data from disk.

    Args:
        sequences (str or Iterable[str]):
            path to text file or iterable of the actual text data.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.

    Yields:
        dictionaries whose keys are the names of fields and whose
        values are more or less the result of tokenizing with those
        fields.
    """
    if isinstance(sequences, str):
        sequences = DataReaderBase._read_file(sequences)
    for i, seq in enumerate(sequences):
        if isinstance(seq, bytes):
            seq = seq.decode("utf-8")
        yield {side: seq, "indices": i}
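# Usage sketch (assumes a TextDataReader-style instance named `reader`;
# the input may be a file path or, as here, an in-memory iterable).
for ex in reader.read(["a first sentence .", "a second one ."], side="src"):
    print(ex["indices"], ex["src"])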
def read(self, sequences, side, _dir=None):
    """Read text data from disk.

    Args:
        sequences (str or Iterable[str]):
            path to corpus file or iterable of the actual text data.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.
        _dir (NoneType): Leave as ``None``. This parameter exists to
            conform with the :func:`DataReaderBase.read()` signature.

    Yields:
        dictionaries whose keys are the names of fields and whose
        values are more or less the result of tokenizing with those
        fields.
    """
    assert _dir is None or _dir == "", \
        "Cannot use _dir with TextDataReader."
    if isinstance(sequences, str):
        sequences = DataReaderBase._read_file(sequences)
    for i, seq in enumerate(sequences):
        if isinstance(seq, six.binary_type):
            seq = seq.decode("utf-8")
        yield {side: seq, "indices": i}
def read(self, data, side):
    """Read data into dicts.

    Args:
        data (str or Iterable[str]): Sequence of audio paths or
            path to file containing audio paths.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.

    Yields:
        A dictionary containing the audio path for each line.
    """
    if isinstance(data, str):
        data = DataReaderBase._read_file(data)

    for i, line in enumerate(data):
        line = line.decode("utf-8").strip()
        yield {side: line, side + '_path': line, 'indices': i}
def read(self, sequences, side):
    """Read text data from disk.

    Args:
        sequences (str or Iterable[str]):
            path to text file or iterable of the actual text data.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.

    Yields:
        dictionaries whose keys are the names of fields and whose
        values are more or less the result of tokenizing with those
        fields.
    """
    if isinstance(sequences, str):
        sequences = DataReaderBase._read_file(sequences)
    for i, seq in enumerate(sequences):
        if isinstance(seq, six.binary_type):
            seq = seq.decode("utf-8")
        yield {side: seq, "indices": i}
def read(self, images, side, img_dir=None):
    """Read data into dicts.

    Args:
        images (str or Iterable[str]): Sequence of image paths or
            path to file containing image paths.
            In either case, the filenames may be relative to ``img_dir``
            (default behavior) or absolute.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.
        img_dir (str): Location of source image files. See ``images``.

    Yields:
        a dictionary containing image data, path and index for each line.
    """
    if isinstance(images, str):
        images = DataReaderBase._read_file(images)

    for i, filename in enumerate(images):
        filename = filename.decode("utf-8").strip()
        img_path = os.path.join(img_dir, filename)
        if not os.path.exists(img_path):
            img_path = filename

        assert os.path.exists(img_path), \
            'img path %s not found' % filename

        if self.channel_size == 1:
            img = transforms.ToTensor()(
                Image.fromarray(cv2.imread(img_path, 0)))
        else:
            img = transforms.ToTensor()(Image.open(img_path))
        if self.truncate and self.truncate != (0, 0):
            if not (img.size(1) <= self.truncate[0]
                    and img.size(2) <= self.truncate[1]):
                continue
        yield {side: img, side + '_path': filename, 'indices': i}
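# Usage sketch (hypothetical paths; the ImageDataReader constructor argument
# is an assumption). Each example carries a CxHxW float tensor in [0, 1].
reader = ImageDataReader(channel_size=3)
for ex in reader.read("image_paths.txt", side="src", img_dir="data/im2text"):
    img = ex["src"]
    print(ex["indices"], ex["src_path"], tuple(img.shape))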
def read(self, sequences, side, _dir=None):
    """Read text data from disk.

    Args:
        sequences (str or Iterable[str]):
            path to text file or iterable of the actual text data.
        side (str): Prefix used in return dict. Usually
            ``"src"`` or ``"tgt"``.
        _dir (NoneType): Leave as ``None``. This parameter exists to
            conform with the :func:`DataReaderBase.read()` signature.

    Yields:
        dictionaries whose keys are the names of fields and whose
        values are more or less the result of tokenizing with those
        fields.
    """
    assert _dir is None or _dir == "", \
        "Cannot use _dir with TextDataReader."
    if isinstance(sequences, str):
        sequences = DataReaderBase._read_file(sequences)
    for i, seq in enumerate(sequences):
        if isinstance(seq, six.binary_type):
            seq = seq.decode("utf-8")
        yield {side: seq, "indices": i}