示例#1
0
    def read(self, data, side, src_dir=None):
        """Read audio files and yield one example dict per path.

        Args:
            data (str or Iterable[str or bytes]): Either the path to a file
                listing audio paths, or an iterable of audio paths. Each
                path is tried relative to ``src_dir`` first and, if that
                does not exist, as given.
            side (str): Key under which the extracted features are stored.
                Usually ``"src"`` or ``"tgt"``.
            src_dir (str): Existing directory holding the audio files.

        Yields:
            dict: ``{side: features, side + "_path": path, "indices": i}``
            where ``features`` is whatever ``self.extract_features``
            returns and ``i`` is the position of the path in ``data``.

        Raises:
            AssertionError: If ``src_dir`` is ``None`` or does not exist, or
                if an audio path cannot be found.
        """

        assert src_dir is not None and os.path.exists(src_dir),\
            "src_dir must be a valid directory if data_type is audio"

        if isinstance(data, str):
            data = DataReaderBase._read_file(data)

        for i, line in enumerate(tqdm(data)):
            # Only bytes need decoding; text paths passed in directly have
            # no ``decode`` method on Python 3.
            if isinstance(line, bytes):
                line = line.decode("utf-8")
            line = line.strip()

            audio_path = os.path.join(src_dir, line)
            if not os.path.exists(audio_path):
                # Fall back to treating the entry as an absolute (or
                # cwd-relative) path.
                audio_path = line

            assert os.path.exists(audio_path), \
                'audio path %s not found' % line

            spect = self.extract_features(audio_path)
            yield {side: spect, side + '_path': line, 'indices': i}
    def read_jsonl(self, sequences, _dir=None):
        """Read JSON-lines data and yield one dict per valid record.

        Args:
            sequences (str or Iterable[str or bytes]): Path to a JSON-lines
                file, or an iterable of JSON-encoded lines.
            _dir (NoneType): Leave as ``None``. This parameter exists to
                conform with the :func:`DataReaderBase.read()` signature.

        Yields:
            dict: The decoded JSON object of each line, with an added
            ``"indices"`` key holding a running count of the records
            yielded so far (the first yielded record gets ``1``).

        Lines that are blank, are not valid UTF-8, are not valid JSON, or
        do not decode to a JSON object are skipped and do not consume an
        index.

        Raises:
            AssertionError: If ``_dir`` is neither ``None`` nor ``""``.
        """
        assert _dir is None or _dir == "", \
            "Cannot use _dir with KeyphraseDataReader."
        if isinstance(sequences, str):
            sequences = DataReaderBase._read_file(sequences)
        # ``indices`` is a counter over yielded records rather than the raw
        # line number, so skipped lines leave no gaps.
        count = 0
        for line in sequences:
            try:
                if isinstance(line, bytes):
                    line = line.decode("utf-8")
                data = json.loads(line)
            except ValueError:
                # UnicodeDecodeError and json.JSONDecodeError are both
                # ValueError subclasses; an empty line also lands here.
                continue

            # Only JSON objects can carry the extra ``indices`` key.
            if not isinstance(data, dict):
                continue

            count += 1
            data['indices'] = count
            yield data
示例#3
0
    def read(self, lattices, side, _dir=""):
        """Read lattice files and yield one example dict per path.

        Args:
            lattices (str or Iterable[str or bytes]): Either the path to a
                file listing lattice paths, or an iterable of lattice
                paths. Paths are used exactly as given.
            side (str): Prefix used in return dict. Usually
                ``"src"`` or ``"tgt"``.
            _dir (str): Unused by this reader; kept so the signature lines
                up with the other ``read`` methods.

        Yields:
            dict: ``{side: text, "score": scores, "lens": lens,
            side + "_path": path, "indices": i}`` where ``text``,
            ``scores`` and ``lens`` are the three values returned by
            ``LatticeDataReader.read_confnet_file(path)``.

        Raises:
            AssertionError: If a lattice path does not exist.
        """

        if isinstance(lattices, str):
            lattices = DataReaderBase._read_file(lattices)

        for i, filename in enumerate(lattices):
            # Only bytes need decoding; text paths passed in directly have
            # no ``decode`` method on Python 3.
            if isinstance(filename, bytes):
                filename = filename.decode("utf-8")
            lattice_path = filename.strip()

            assert os.path.exists(lattice_path), \
                'lattice path %s not found' % lattice_path

            text, scores, lens = LatticeDataReader.read_confnet_file(
                lattice_path)
            yield {
                side: text,
                'score': scores,
                'lens': lens,
                side + '_path': lattice_path,
                'indices': i,
            }
示例#4
0
    def read(self, sequences, side, _dir=None):
        """Yield one example dict per edge sequence.

        Args:
            sequences (str or Iterable[str]):
                path to edge file or iterable of the actual edge data.
            side (str): Key used in the returned dict. Usually
                ``"src"``, ``"tgt"`` or ``"grh"``.
            _dir (NoneType): Leave as ``None``. This parameter exists to
                conform with the :func:`DataReaderBase.read()` signature.

        Yields:
            dict: ``{side: sequence, "indices": i}`` with byte sequences
            decoded as UTF-8 and ``i`` the position in ``sequences``.
        """
        assert _dir is None or _dir == "", \
            "Cannot use _dir with GrhDataReader."
        if isinstance(sequences, str):
            sequences = DataReaderBase._read_file(sequences)
        for idx, item in enumerate(sequences):
            text = (item.decode("utf-8")
                    if isinstance(item, six.binary_type) else item)
            yield {side: text, "indices": idx}
示例#5
0
    def read(self, images, side, img_dir=None):
        """Read image files and yield one example dict per kept image.

        Args:
            images (str or Iterable[str or bytes]): Either the path to a
                file listing image paths, or an iterable of image paths.
                Each path is tried relative to ``img_dir`` first (when one
                is given) and, if that does not exist, as given.
            side (str): Key under which the image tensor is stored, e.g.
                ``'src'`` or ``'tgt'``.
            img_dir (str or None): Directory holding the images. With
                ``None`` the paths are used as given.

        Yields:
            dict: ``{side: img, side + "_path": filename, "indices": i}``.
            Images exceeding ``self.truncate`` in either of tensor
            dimensions 1 and 2 are skipped, so ``indices`` may have gaps.

        Raises:
            AssertionError: If an image path cannot be found.
        """
        if isinstance(images, str):
            images = DataReaderBase._read_file(images)

        for i, filename in enumerate(images):
            # Only bytes need decoding; text paths passed in directly have
            # no ``decode`` method on Python 3.
            if isinstance(filename, bytes):
                filename = filename.decode("utf-8")
            filename = filename.strip()

            # os.path.join rejects None, so the default ``img_dir`` must
            # bypass the join.
            if img_dir is None:
                img_path = filename
            else:
                img_path = os.path.join(img_dir, filename)
            if not os.path.exists(img_path):
                img_path = filename

            assert os.path.exists(img_path), \
                'img path %s not found' % filename

            if self.channel_size == 1:
                # Flag 0 asks cv2.imread for a single-channel image.
                img = transforms.ToTensor()(Image.fromarray(
                    cv2.imread(img_path, 0)))
            else:
                img = transforms.ToTensor()(Image.open(img_path))
            if self.truncate and self.truncate != (0, 0):
                if not (img.size(1) <= self.truncate[0]
                        and img.size(2) <= self.truncate[1]):
                    continue
            yield {side: img, side + '_path': filename, 'indices': i}
示例#6
0
    def read(self, vecs, side, vec_dir=None):
        """Read feature-vector files and yield one example dict per path.

        Args:
            vecs (str or Iterable[str or bytes]): Either the path to a file
                listing vector paths, or an iterable of vector paths.
                Each path is tried relative to ``vec_dir`` first (when one
                is given) and, if that does not exist, as given.
            side (str): Prefix used in return dict. Usually
                ``"src"`` or ``"tgt"``.
            vec_dir (str or None): Directory holding the vector files.
                With ``None`` the paths are used as given.

        Yields:
            dict: ``{side: tensor, side + "_path": filename, "indices": i}``
            where ``tensor`` is ``torch.from_numpy(np.load(path))``.

        Raises:
            AssertionError: If a vector path cannot be found.
        """

        if isinstance(vecs, str):
            vecs = DataReaderBase._read_file(vecs)

        for i, filename in enumerate(vecs):
            # Only bytes need decoding; text paths passed in directly have
            # no ``decode`` method on Python 3.
            if isinstance(filename, bytes):
                filename = filename.decode("utf-8")
            filename = filename.strip()

            # os.path.join rejects None, so the default ``vec_dir`` must
            # bypass the join.
            if vec_dir is None:
                vec_path = filename
            else:
                vec_path = os.path.join(vec_dir, filename)
            if not os.path.exists(vec_path):
                vec_path = filename

            assert os.path.exists(vec_path), \
                'vec path %s not found' % filename

            vec = np.load(vec_path)
            yield {
                side: torch.from_numpy(vec),
                side + "_path": filename,
                "indices": i
            }
示例#7
0
    def read(self, data, side, src_dir=None):
        """Extract audio features for every listed audio file.

        Args:
            data (str or Iterable[str or bytes]): Sequence of audio paths
                or path to a file containing audio paths. In either case,
                the filenames may be relative to ``src_dir`` (tried first)
                or usable as given.
            side (str): Prefix used in return dict. Usually
                ``"src"`` or ``"tgt"``.
            src_dir (str): Location of source audio files; must exist.

        Yields:
            dict: For each entry, the result of ``self.extract_features``
            under ``side``, the stripped path under ``side + "_path"`` and
            the entry's position under ``"indices"``.

        Raises:
            AssertionError: If ``src_dir`` is ``None`` or missing, or if an
                audio file cannot be located.
        """

        assert src_dir is not None and os.path.exists(src_dir),\
            "src_dir must be a valid directory if data_type is audio"

        if isinstance(data, str):
            data = DataReaderBase._read_file(data)

        for i, line in enumerate(tqdm(data)):
            # Decode bytes only: calling ``decode`` on text raises
            # AttributeError on Python 3.
            if isinstance(line, bytes):
                line = line.decode("utf-8")
            line = line.strip()

            # Prefer the path under ``src_dir``; otherwise use it as given.
            audio_path = os.path.join(src_dir, line)
            if not os.path.exists(audio_path):
                audio_path = line

            assert os.path.exists(audio_path), \
                'audio path %s not found' % line

            spect = self.extract_features(audio_path)
            yield {side: spect, side + '_path': line, 'indices': i}
示例#8
0
    def read(self, images, side, img_dir=None):
        """Read images given either as paths or as in-memory image objects.

        Args:
            images (str or Sequence): Path to a file listing image paths,
                a sequence of image paths (``str`` or ``bytes``), or a
                sequence of already-loaded image objects. The kind is
                decided by inspecting ``images[0]``.
                NOTE(review): ``len(images)`` and ``images[0]`` require a
                sized, indexable sequence - confirm that
                ``DataReaderBase._read_file`` returns one.
            side (str): Prefix used in return dict. Usually
                ``"src"`` or ``"tgt"``.
            img_dir (str or None): Location of source image files. Paths
                are tried relative to it first (when given) and then as
                given. For in-memory images it is only echoed back under
                ``side + "_path"``.

        Yields:
            dict: ``{side: tensor, side + "_path": ..., "indices": i}``.
            Images exceeding ``self.truncate`` in either of tensor
            dimensions 1 and 2 are skipped, so ``indices`` may have gaps.

        Raises:
            AssertionError: If an image path cannot be found.
        """
        if isinstance(images, str):
            images = DataReaderBase._read_file(images)

        # Height bounds (pixels) applied to in-memory images only.
        max_h = 80
        min_h = 32

        if len(images) > 0 and isinstance(images[0], (str, bytes)):
            for i, filename in enumerate(images):
                # The branch condition admits text entries, which have no
                # ``decode`` method on Python 3.
                if isinstance(filename, bytes):
                    filename = filename.decode("utf-8")
                filename = filename.strip()

                # os.path.join rejects None, so the default ``img_dir``
                # must bypass the join.
                if img_dir is None:
                    img_path = filename
                else:
                    img_path = os.path.join(img_dir, filename)
                if not os.path.exists(img_path):
                    img_path = filename

                assert os.path.exists(img_path), \
                    'img path %s not found' % filename

                if self.channel_size == 1:
                    # Flag 0 asks cv2.imread for a single-channel image.
                    img = transforms.ToTensor()(Image.fromarray(
                        cv2.imread(img_path, 0)))
                else:
                    img = transforms.ToTensor()(Image.open(img_path))
                if self.truncate and self.truncate != (0, 0):
                    if not (img.size(1) <= self.truncate[0]
                            and img.size(2) <= self.truncate[1]):
                        continue
                yield {side: img, side + '_path': filename, 'indices': i}
        else:
            for i, img in enumerate(images):
                # Rescale so the height ends up within [min_h, max_h],
                # scaling the width by the same factor.
                # NOTE(review): ``Image.ANTIALIAS`` is not available in
                # recent Pillow releases - confirm the pinned version.
                if (img.size[1] > max_h):
                    img = img.resize(
                        (int(img.size[0] * max_h / img.size[1]), max_h),
                        Image.ANTIALIAS)
                if (img.size[1] < min_h):
                    img = img.resize(
                        (int(img.size[0] * min_h / img.size[1]), min_h),
                        Image.ANTIALIAS)

                img = transforms.ToTensor()(img)
                if self.truncate and self.truncate != (0, 0):
                    if not (img.size(1) <= self.truncate[0]
                            and img.size(2) <= self.truncate[1]):
                        continue
                yield {side: img, side + '_path': img_dir, 'indices': i}
示例#9
0
    def read(self, sequences, side, features=None):
        """Read text data together with optional aligned feature streams.

        Args:
            sequences (str or Iterable[str or bytes]):
                path to text file or iterable of the actual text data.
            side (str): Prefix used in return dict. Usually
                ``"src"`` or ``"tgt"``.
            features (Dict[str, str or Iterable] or None):
                dictionary mapping feature names to the path of a feature
                file or to an iterable of the actual feature data.
                ``None`` (the default) means no features.

        Yields:
            dict: ``{side: example, "indices": i}`` where ``example`` maps
            ``side`` to the text line and each feature name to the matching
            feature line. Byte strings are decoded as UTF-8. Iteration
            stops at the shortest of the text and feature streams.
        """
        if isinstance(sequences, str):
            sequences = DataReaderBase._read_file(sequences)

        # Avoid a shared mutable default argument.
        features = features or {}

        feature_names = list(features)
        # Each feature is read from its own path; the original passed the
        # whole ``features`` dict to ``_read_file`` here.
        feature_streams = [
            DataReaderBase._read_file(value)
            if isinstance(value, str) else value
            for value in features.values()
        ]

        for i, (seq, *feats) in enumerate(zip(sequences, *feature_streams)):
            if isinstance(seq, bytes):
                seq = seq.decode("utf-8")
            ex_dict = {side: seq}
            for name, feat in zip(feature_names, feats):
                if isinstance(feat, bytes):
                    feat = feat.decode("utf-8")
                ex_dict[name] = feat
            yield {side: ex_dict, "indices": i}
    def read(self, sequences, side, _dir=None):
        """Read keyphrase examples, one JSON object (or plain line) per entry.

        Args:
            sequences (str or Iterable[str]):
                path to text file or iterable of the actual text data.
            side (str): JSON key to extract and key used in the returned
                dict. Usually ``"src"`` or ``"tgt"``.
            _dir (NoneType): Leave as ``None``. This parameter exists to
                conform with the :func:`DataReaderBase.read()` signature.

        Yields:
            dict: ``{side: value, "indices": i, "id": doc_id}``.

            ``value`` is ``json_dict[side]`` when the entry decodes as
            UTF-8, parses as JSON and contains both ``side`` and ``"id"``.
            Otherwise it is the entry itself (decoded if decoding
            succeeded).

            ``doc_id`` is the integer after the last ``_`` of the JSON
            ``"id"`` string (or the whole string when it has no ``_``).
            It falls back to ``i`` whenever that cannot be obtained, which
            includes a JSON ``"id"`` that is not a string.
        """
        assert _dir is None or _dir == "", \
            "Cannot use _dir with KeyphraseDataReader."
        if isinstance(sequences, str):
            sequences = DataReaderBase._read_file(sequences)

        for idx, raw in enumerate(sequences):
            # ``text`` is the decoded entry, or the raw entry if decoding
            # fails.
            text = raw
            try:
                # default input is a json line
                text = raw.decode("utf-8")
                record = json.loads(text)
                # Note tgt could be a list of strings
                payload, doc_id = record[side], record['id']
            except Exception:
                # temporary measure for plain text input
                payload, doc_id = text, idx

            # torchtext field only takes numeric features, so reduce the
            # id to an integer.
            try:
                cut = doc_id.rfind('_')
                if cut != -1:
                    doc_id = doc_id[cut + 1:]
                doc_id = int(doc_id)
            except Exception:
                # if not convertible, use indices as id
                doc_id = idx

            yield {side: payload, "indices": idx, 'id': doc_id}
示例#11
0
    def read(self, sequences, side):
        """Yield one example dict per text line.

        Args:
            sequences (str or Iterable[str]):
                path to text file or iterable of the actual text data.
            side (str): Key used in the returned dict. Usually
                ``"src"`` or ``"tgt"``.

        Yields:
            dict: ``{side: line, "indices": i}`` with byte lines decoded as
            UTF-8 and ``i`` the position of the line in ``sequences``.
        """
        if isinstance(sequences, str):
            sequences = DataReaderBase._read_file(sequences)

        for idx, item in enumerate(sequences):
            text = item.decode("utf-8") if isinstance(item, bytes) else item
            yield {side: text, "indices": idx}
示例#12
0
    def read(self, sequences, side, _dir=None):
        """Yield one example dict per text sequence.

        Args:
            sequences (str or Iterable): path to corpus file or iterable of
                the actual text data.
            side (str): "src" or "tgt"; key used in the returned dict.
            _dir (NoneType): Must be ``None`` or ``""``.

        Yields:
            dict: ``{side: sequence, "indices": i}`` with byte sequences
            decoded as UTF-8 and ``i`` the position in ``sequences``.
        """
        assert _dir is None or _dir == "", \
            "Cannot use _dir with TextDataReader."
        if isinstance(sequences, str):
            sequences = DataReaderBase._read_file(sequences)

        # Decode lazily so the input is still consumed one item at a time.
        decoded = (
            item.decode("utf-8") if isinstance(item, six.binary_type)
            else item
            for item in sequences
        )
        for idx, text in enumerate(decoded):
            yield {side: text, "indices": idx}
示例#13
0
    def read(self, data, side):
        """Yield the listed paths themselves, without loading any file.

        Args:
            data (str or Iterable[str or bytes]): Either the path to a file
                listing one entry per line, or an iterable of entries.
            side (str): Prefix used in return dict. Usually
                ``"src"`` or ``"tgt"``.

        Yields:
            dict: ``{side: path, side + "_path": path, "indices": i}``
            where ``path`` is the entry with surrounding whitespace
            stripped (bytes are decoded as UTF-8) and ``i`` is its position
            in ``data``.
        """

        if isinstance(data, str):
            data = DataReaderBase._read_file(data)

        for i, line in enumerate(data):
            # Only bytes need decoding; text entries passed in directly
            # have no ``decode`` method on Python 3.
            if isinstance(line, bytes):
                line = line.decode("utf-8")
            line = line.strip()
            yield {side: line, side + '_path': line, 'indices': i}
示例#14
0
    def read(self, sequences, side):
        """Yield one example dict per text sequence.

        Args:
            sequences (str or Iterable[str]):
                path to text file or iterable of the actual text data.
            side (str): Key used in the returned dict. Usually
                ``"src"`` or ``"tgt"``.

        Yields:
            dict: ``{side: sequence, "indices": i}`` with byte sequences
            decoded as UTF-8 and ``i`` the position in ``sequences``.
        """
        if isinstance(sequences, str):
            sequences = DataReaderBase._read_file(sequences)

        position = 0
        for item in sequences:
            if isinstance(item, six.binary_type):
                item = item.decode("utf-8")
            yield {side: item, "indices": position}
            position += 1
示例#15
0
    def read(self, images, side, img_dir=None):
        """Load image files into tensors, one example dict per kept image.

        Args:
            images (str or Iterable[str or bytes]): Sequence of image paths
                or path to a file containing image paths. In either case,
                the filenames may be relative to ``img_dir`` (tried first
                when it is given) or usable as given.
            side (str): Prefix used in return dict. Usually
                ``"src"`` or ``"tgt"``.
            img_dir (str or None): Location of source image files. With
                ``None`` the paths are used as given.

        Yields:
            dict: ``{side: img, side + "_path": filename, "indices": i}``.
            Images exceeding ``self.truncate`` in either of tensor
            dimensions 1 and 2 are skipped, so ``indices`` may have gaps.

        Raises:
            AssertionError: If an image path cannot be found.
        """
        if isinstance(images, str):
            images = DataReaderBase._read_file(images)

        for i, filename in enumerate(images):
            # Decode bytes only: calling ``decode`` on text raises
            # AttributeError on Python 3.
            if isinstance(filename, bytes):
                filename = filename.decode("utf-8")
            filename = filename.strip()

            # The default ``img_dir=None`` cannot be passed to
            # os.path.join, so skip the join in that case.
            if img_dir is None:
                img_path = filename
            else:
                img_path = os.path.join(img_dir, filename)
            if not os.path.exists(img_path):
                img_path = filename

            assert os.path.exists(img_path), \
                'img path %s not found' % filename

            if self.channel_size == 1:
                # Flag 0 asks cv2.imread for a single-channel image.
                img = transforms.ToTensor()(
                    Image.fromarray(cv2.imread(img_path, 0)))
            else:
                img = transforms.ToTensor()(Image.open(img_path))
            if self.truncate and self.truncate != (0, 0):
                if not (img.size(1) <= self.truncate[0]
                        and img.size(2) <= self.truncate[1]):
                    continue
            yield {side: img, side + '_path': filename, 'indices': i}
示例#16
0
    def read(self, sequences, side, _dir=None):
        """Yield one example dict per text sequence.

        Args:
            sequences (str or Iterable[str]):
                path to text file or iterable of the actual text data.
            side (str): Key used in the returned dict. Usually
                ``"src"`` or ``"tgt"``.
            _dir (NoneType): Leave as ``None``. This parameter exists to
                conform with the :func:`DataReaderBase.read()` signature.

        Yields:
            dict: ``{side: sequence, "indices": i}`` with byte sequences
            decoded as UTF-8 and ``i`` the position in ``sequences``.
        """
        assert _dir is None or _dir == "", \
            "Cannot use _dir with TextDataReader."
        if isinstance(sequences, str):
            sequences = DataReaderBase._read_file(sequences)

        for idx, item in enumerate(sequences):
            is_raw_bytes = isinstance(item, six.binary_type)
            text = item.decode("utf-8") if is_raw_bytes else item
            yield {side: text, "indices": idx}