Exemplo n.º 1
0
    def _make_examples_nfeats_tpl(data_type, src_data_iter, src_path, src_dir,
                                  src_seq_length_trunc, sample_rate,
                                  window_size, window_stride, window,
                                  normalize_audio):
        """
        Process the corpus into (example_dict iterator, num_feats) tuple
        on source side for different 'data_type'.
        """

        if data_type == 'text':
            src_examples_iter, num_src_feats = \
                TextDataset.make_text_examples_nfeats_tpl(
                    src_data_iter, src_path, src_seq_length_trunc, "src")

        elif data_type == 'img':
            src_examples_iter, num_src_feats = \
                ImageDataset.make_image_examples_nfeats_tpl(
                    src_data_iter, src_path, src_dir)

        elif data_type == 'audio':
            if src_data_iter:
                raise ValueError("""Data iterator for AudioDataset isn't
                                    implemented""")

            if src_path is None:
                raise ValueError("AudioDataset requires a non None path")
            src_examples_iter, num_src_feats = \
                AudioDataset.make_audio_examples_nfeats_tpl(
                    src_path, src_dir, sample_rate,
                    window_size, window_stride, window,
                    normalize_audio)

        return src_examples_iter, num_src_feats
Exemplo n.º 2
0
def build_dataset(fields,
                  data_type=None,
                  data_iter=None,
                  data_path=None,
                  total_token_length=500,
                  src_seq_length=100,
                  src_sent_length=100,
                  seq_length_trunc=0,
                  use_filter_pred=True,
                  tfidf=None):
    """
    Build src/tgt examples iterator from corpus files, also extract
    number of features.
    """
    # assert data_type is not None
    examples_iter, num_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            data_iter, data_path, seq_length_trunc, tfidf)

    dataset = TextDataset(fields,
                          data_type,
                          examples_iter,
                          num_feats,
                          total_token_length=total_token_length,
                          src_seq_length=src_seq_length,
                          src_sent_length=src_sent_length,
                          use_filter_pred=use_filter_pred)

    return dataset
Exemplo n.º 3
0
def build_dataset(fields,
                  data_type,
                  src_data_iter=None,
                  src_path=None,
                  src_dir=None,
                  tgt_data_iter=None,
                  tgt_path=None,
                  src_seq_length=0,
                  tgt_seq_length=0,
                  src_seq_length_trunc=0,
                  tgt_seq_length_trunc=0,
                  dynamic_dict=True,
                  sample_rate=0,
                  window_size=0,
                  window_stride=0,
                  window=None,
                  normalize_audio=True,
                  use_filter_pred=True):
    """
    Build src/tgt examples iterator from corpus files, also extract
    number of features.
    """
    src_examples_iter, num_src_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
                    src_data_iter, src_path, src_seq_length_trunc, "src")

    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_data_iter, tgt_path, tgt_seq_length_trunc, "tgt")

    dataset = TextDataset(fields,
                          src_examples_iter,
                          tgt_examples_iter,
                          num_src_feats,
                          num_tgt_feats,
                          src_seq_length=src_seq_length,
                          tgt_seq_length=tgt_seq_length,
                          dynamic_dict=dynamic_dict,
                          use_filter_pred=use_filter_pred)
    return dataset
Exemplo n.º 4
0
def build_dataset(fields, data_type, src_data_iter=None, src_path=None, tgt_data_iter=None, tgt_path=None,
                  src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0,
                  dynamic_dict=False, sample_rate=0,
                  window_size=0, window_stride=0, window=None,
                  normalize_audio=True, use_filter_pred=True, topk_keywords=20):
    """
    Build src/tgt examples iterator from corpus files, also extract
    number of features.
    """

    def _make_examples_nfeats_tpl(data_type, src_data_iter, src_path,
                                  src_seq_length_trunc, sample_rate,
                                  window_size, window_stride,
                                  window, normalize_audio, topk_keywords=20):
        """
        Process the corpus into (example_dict iterator, num_feats) tuple
        on source side for different 'data_type'.
        """

        src_examples_iter, num_src_feats = \
            TextDataset.make_text_examples_nfeats_tpl(
                src_data_iter, src_path, src_seq_length_trunc, "src", topk_keywords=topk_keywords)

        

        return src_examples_iter, num_src_feats

    src_examples_iter, num_src_feats = \
        _make_examples_nfeats_tpl(data_type, src_data_iter, src_path,
                                  src_seq_length_trunc, sample_rate,
                                  window_size, window_stride,
                                  window, normalize_audio, topk_keywords=topk_keywords)

    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_data_iter, tgt_path, tgt_seq_length_trunc, "tgt", topk_keywords=topk_keywords)

    dataset = TextDataset(fields, src_examples_iter, tgt_examples_iter,
                          num_src_feats, num_tgt_feats,
                          src_seq_length=src_seq_length,
                          tgt_seq_length=tgt_seq_length,
                          dynamic_dict=dynamic_dict,
                          use_filter_pred=use_filter_pred)



    return dataset
Exemplo n.º 5
0
    def _make_examples_nfeats_tpl(data_type, src_data_iter, src_path, src_dir,
                                  src_seq_length_trunc, sample_rate,
                                  window_size, window_stride, window,
                                  normalize_audio, side):
        """
        Process the corpus into (example_dict iterator, num_feats) tuple
        on source side for different 'data_type'.
        """

        if data_type == 'text':
            src_examples_iter, num_src_feats = \
                TextDataset.make_text_examples_nfeats_tpl(
                    src_data_iter, src_path, src_seq_length_trunc, side)

        return src_examples_iter, num_src_feats
Exemplo n.º 6
0
def build_dataset(fields, data_type=None, data_iter=None, data_path=None,
                  seq_length=0,
                  seq_length_trunc=0,
                  dynamic_dict=True, use_filter_pred=True):
    """
    Build src/tgt examples iterator from corpus files, also extract
    number of features.
    """
    # assert data_type is not None
    examples_iter, num_src_feats, num_qa_feats, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            data_iter, data_path, seq_length_trunc)

    dataset = TextDataset(fields, data_type, examples_iter,
                          num_src_feats, num_qa_feats, num_tgt_feats,
                          src_seq_length=seq_length,
                          dynamic_dict=dynamic_dict,
                          use_filter_pred=use_filter_pred)

    return dataset
Exemplo n.º 7
0
def build_dataset(fields,
                  data_type,
                  src_data_iter=None,
                  src_path=None,
                  src_dir=None,
                  tgt_data_iter=None,
                  tgt_path=None,
                  src_seq_length=0,
                  tgt_seq_length=0,
                  src_seq_length_trunc=0,
                  tgt_seq_length_trunc=0,
                  dynamic_dict=True,
                  sample_rate=0,
                  window_size=0,
                  window_stride=0,
                  window=None,
                  normalize_audio=True,
                  use_filter_pred=True,
                  image_channel_size=3):
    """
    Build src/tgt examples iterator from corpus files, also extract
    number of features.
    """
    def _make_examples_nfeats_tpl(data_type,
                                  src_data_iter,
                                  src_path,
                                  src_dir,
                                  src_seq_length_trunc,
                                  sample_rate,
                                  window_size,
                                  window_stride,
                                  window,
                                  normalize_audio,
                                  image_channel_size=3):
        """
        Process the corpus into (example_dict iterator, num_feats) tuple
        on source side for different 'data_type'.
        """

        if data_type == 'text':
            src_examples_iter, num_src_feats = \
                TextDataset.make_text_examples_nfeats_tpl(
                    src_data_iter, src_path, src_seq_length_trunc, "src")

        elif data_type == 'img':
            src_examples_iter, num_src_feats = \
                ImageDataset.make_image_examples_nfeats_tpl(
                    src_data_iter, src_path, src_dir, image_channel_size)

        elif data_type == 'audio':
            if src_data_iter:
                raise ValueError("""Data iterator for AudioDataset isn't
                                    implemented""")

            if src_path is None:
                raise ValueError("AudioDataset requires a non None path")
            src_examples_iter, num_src_feats = \
                AudioDataset.make_audio_examples_nfeats_tpl(
                    src_path, src_dir, sample_rate,
                    window_size, window_stride, window,
                    normalize_audio)

        return src_examples_iter, num_src_feats

    src_examples_iter, num_src_feats = \
        _make_examples_nfeats_tpl(data_type, src_data_iter, src_path, src_dir,
                                  src_seq_length_trunc, sample_rate,
                                  window_size, window_stride,
                                  window, normalize_audio,
                                  image_channel_size=image_channel_size)

    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_data_iter, tgt_path, tgt_seq_length_trunc, "tgt")

    if data_type == 'text':
        dataset = TextDataset(fields,
                              src_examples_iter,
                              tgt_examples_iter,
                              num_src_feats,
                              num_tgt_feats,
                              src_seq_length=src_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)

    elif data_type == 'img':
        dataset = ImageDataset(fields,
                               src_examples_iter,
                               tgt_examples_iter,
                               num_src_feats,
                               num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred,
                               image_channel_size=image_channel_size)

    elif data_type == 'audio':
        dataset = AudioDataset(fields,
                               src_examples_iter,
                               tgt_examples_iter,
                               num_src_feats,
                               num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)

    return dataset