예제 #1
0
    def _process_corpus(self,
                        fields,
                        src_examples_iter,
                        tgt_examples_iter,
                        num_src_feats=0,
                        num_tgt_feats=0,
                        tgt_seq_length=0,
                        use_filter_pred=True):
        """
        Prepare the lazily constructed examples, the field list, and the
        filter predicate for an image corpus.

        Args:
            fields: dictionary mapping field names to Field objects.
            src_examples_iter: iterator over preprocessed source
                example dicts.
            tgt_examples_iter: iterator over preprocessed target
                example dicts, or None when there is no target side.
            num_src_feats: number of source side features.
            num_tgt_feats: number of target side features.
            tgt_seq_length: maximum target sequence length accepted by
                the filter predicate.
            use_filter_pred: when false, the returned predicate accepts
                every example.

        Returns:
            A tuple ``(examples, fields, filter_pred)``: a generator of
            constructed examples, a list of ``(name, field)`` pairs
            (``field`` is None for names missing from ``fields``), and
            the predicate.
        """
        self.data_type = 'img'
        self.n_src_feats, self.n_tgt_feats = num_src_feats, num_tgt_feats

        has_tgt = tgt_examples_iter is not None

        # With a target side, merge each source dict with its target dict;
        # otherwise the source dicts are used unchanged.
        if has_tgt:
            merged_iter = (
                _join_dicts(src_dict, tgt_dict)
                for src_dict, tgt_dict
                in zip(src_examples_iter, tgt_examples_iter))
        else:
            merged_iter = src_examples_iter

        # The first example decides which field names are in play.
        first, merged_iter = _peek(merged_iter)
        names = first.keys()

        out_fields = [(name, fields[name] if name in fields else None)
                      for name in names]

        # Values are extracted in the same key order as ``out_fields``.
        out_examples = (
            _construct_example_fromlist(
                [record[name] for name in names], out_fields)
            for record in merged_iter)

        def within_tgt_limit(example):
            # Without a target side there is nothing to bound.
            if not has_tgt:
                return True
            return 0 < len(example.tgt) <= tgt_seq_length

        def accept_all(x):
            return True

        filter_pred = within_tgt_limit if use_filter_pred else accept_all

        return out_examples, out_fields, filter_pred
예제 #2
0
    def _process_corpus(self,
                        fields,
                        src_examples_iter,
                        tgt_examples_iter,
                        num_src_feats=0,
                        num_tgt_feats=0,
                        tgt_seq_length=0,
                        sample_rate=0,
                        window_size=0,
                        window_stride=0,
                        window=None,
                        normalize_audio=True,
                        use_filter_pred=True):
        """
        Prepare the lazily constructed examples, the field list, and the
        filter predicate for an audio corpus.

        The audio settings are only stored on the instance here; this
        method does not itself use them.

        Args:
            fields: dictionary mapping field names to Field objects.
            src_examples_iter: iterator over preprocessed source
                example dicts.
            tgt_examples_iter: iterator over preprocessed target
                example dicts, or None when there is no target side.
            num_src_feats: number of source side features.
            num_tgt_feats: number of target side features.
            tgt_seq_length: maximum target sequence length accepted by
                the filter predicate.
            sample_rate: sample rate.
            window_size: window size for spectrogram in seconds.
            window_stride: window stride for spectrogram in seconds.
            window: window type for spectrogram generation.
            normalize_audio: subtract spectrogram by mean and divide
                by std or not.
            use_filter_pred: when false, the returned predicate accepts
                every example.

        Returns:
            A tuple ``(examples, fields, filter_pred)``: a generator of
            constructed examples, a list of ``(name, field)`` pairs
            (``field`` is None for names missing from ``fields``), and
            the predicate.
        """
        self.data_type = 'audio'

        # Record the audio settings on the instance.
        self.sample_rate = sample_rate
        self.window_size, self.window_stride = window_size, window_stride
        self.window = window
        self.normalize_audio = normalize_audio

        self.n_src_feats, self.n_tgt_feats = num_src_feats, num_tgt_feats

        has_tgt = tgt_examples_iter is not None

        # With a target side, merge each source dict with its target dict;
        # otherwise the source dicts are used unchanged.
        if has_tgt:
            merged_iter = (
                _join_dicts(src_dict, tgt_dict)
                for src_dict, tgt_dict
                in zip(src_examples_iter, tgt_examples_iter))
        else:
            merged_iter = src_examples_iter

        # The first example decides which field names are in play.
        first, merged_iter = _peek(merged_iter)
        names = first.keys()

        out_fields = [(name, fields[name] if name in fields else None)
                      for name in names]

        # Values are extracted in the same key order as ``out_fields``.
        out_examples = (
            _construct_example_fromlist(
                [record[name] for name in names], out_fields)
            for record in merged_iter)

        def within_tgt_limit(example):
            # Without a target side there is nothing to bound.
            if not has_tgt:
                return True
            return 0 < len(example.tgt) <= tgt_seq_length

        def accept_all(x):
            return True

        filter_pred = within_tgt_limit if use_filter_pred else accept_all

        return out_examples, out_fields, filter_pred
예제 #3
0
    def _process_corpus(self, fields, src_examples_iter, tgt_examples_iter,
                        num_src_feats=0, num_tgt_feats=0,
                        src_seq_length=0, tgt_seq_length=0,
                        dynamic_dict=True, use_filter_pred=True):
        """
        Build Example objects, Field objects, and filter_pred function
        from text corpus.

        Args:
            fields: a dictionary of Field objects. Keys are like 'src',
                    'tgt', 'src_map', and 'alignment'.
            src_examples_iter: preprocessed source example_dict iterator.
            tgt_examples_iter: preprocessed target example_dict iterator,
                    or None when there is no target side.
            num_src_feats: number of source side features.
            num_tgt_feats: number of target side features.
            src_seq_length: maximum source sequence length.
            tgt_seq_length: maximum target sequence length. Only enforced
                    when ``tgt_examples_iter`` is not None.
            dynamic_dict: create dynamic dictionaries?
            use_filter_pred: use a custom filter predicate to filter
                    examples? When false, the returned predicate accepts
                    every example.

        Returns:
            constructed tuple of Examples objects, Field objects, filter_pred.
        """
        self.data_type = 'text'

        # self.src_vocabs: mutated in dynamic_dict, used in
        # collapse_copy_scores and in Translator.py
        self.src_vocabs = []

        self.n_src_feats = num_src_feats
        self.n_tgt_feats = num_tgt_feats

        has_tgt = tgt_examples_iter is not None

        # Each element of an example is a dictionary whose keys represents
        # at minimum the src tokens and their indices and potentially also
        # the src and tgt features and alignment information.
        if has_tgt:
            examples_iter = (_join_dicts(src, tgt) for src, tgt in
                             zip(src_examples_iter, tgt_examples_iter))
        else:
            examples_iter = src_examples_iter

        if dynamic_dict:
            examples_iter = self._dynamic_dict(examples_iter)

        # Peek at the first to see which fields are used. This happens
        # after the optional dynamic-dict pass so that any keys it adds
        # are part of the field list.
        ex, examples_iter = _peek(examples_iter)
        keys = ex.keys()

        out_fields = [(k, fields[k]) if k in fields else (k, None)
                      for k in keys]
        example_values = ([ex[k] for k in keys] for ex in examples_iter)
        out_examples = (_construct_example_fromlist(ex_values, out_fields)
                        for ex_values in example_values)

        def filter_pred(example):
            # The source length bound always applies.
            if not 0 < len(example.src) <= src_seq_length:
                return False
            # Only bound the target when a target side was supplied; this
            # mirrors the image/audio variants. Source-only examples are
            # presumably built without a `tgt` attribute, so reading it
            # unconditionally would fail.
            if not has_tgt:
                return True
            return 0 < len(example.tgt) <= tgt_seq_length

        filter_pred = filter_pred if use_filter_pred else lambda x: True

        return out_examples, out_fields, filter_pred