# Example #1
def _make_examples_nfeats_tpl(data_type, src_path, src_dir,
                              src_seq_length_trunc, sample_rate,
                              window_size, window_stride,
                              window, normalize_audio):
    """
    Process the corpus into (example_dict iterator, num_feats) tuple
    on source side for different 'data_type'.
    """

    if data_type == 'text':
        src_examples_iter, num_src_feats = \
            TextDataset.make_text_examples_nfeats_tpl(
                src_path, src_seq_length_trunc, "src")

    elif data_type == 'img':
        src_examples_iter, num_src_feats = \
            ImageDataset.make_image_examples_nfeats_tpl(
                src_path, src_dir)

    elif data_type == 'audio':
        src_examples_iter, num_src_feats = \
            AudioDataset.make_audio_examples_nfeats_tpl(
                src_path, src_dir, sample_rate,
                window_size, window_stride, window,
                normalize_audio)

    return src_examples_iter, num_src_feats
# Example #2
def build_dataset(fields, data_type, src_path, tgt_path, src_dir=None,
                  src_seq_length=0, tgt_seq_length=0,
                  src_seq_length_trunc=0, tgt_seq_length_trunc=0, lower=False,
                  dynamic_dict=True, sample_rate=0,
                  window_size=0, window_stride=0, window=None,
                  normalize_audio=True, use_filter_pred=True,
                  side_src_path=None, side_tgt_path=None,
                  phrase_table=None, global_phrase_table=None):
    """
    Build a dataset from src/tgt corpora, plus optional side corpora and
    a phrase table that are only consumed by the 'text' data type.
    """
    # Source examples and feature count, dispatched on the modality.
    src_iter, n_src_feats = _make_examples_nfeats_tpl(
        data_type, src_path, src_dir, src_seq_length_trunc, sample_rate,
        window_size, window_stride, window, normalize_audio, lower)

    # The target side is always plain text, whatever the source modality.
    tgt_iter, n_tgt_feats = TextDataset.make_text_examples_nfeats_tpl(
        tgt_path, tgt_seq_length_trunc, "tgt", lower)

    # Auxiliary text streams; their feature counts are not needed.
    side_tgt_iter, _ = TextDataset.make_text_examples_nfeats_tpl(
        side_tgt_path, tgt_seq_length_trunc, "side_tgt", lower)
    side_src_iter, _ = TextDataset.make_text_examples_nfeats_tpl(
        side_src_path, tgt_seq_length_trunc, "side_src", lower)
    pt_iter, _ = TextDataset.make_text_examples_nfeats_tpl(
        phrase_table, tgt_seq_length_trunc, "phrase_table", lower)

    if data_type == 'text':
        dataset = TextDataset(
            fields, src_iter, tgt_iter, side_src_iter, side_tgt_iter,
            pt_iter, global_phrase_table,
            num_src_feats=n_src_feats, num_tgt_feats=n_tgt_feats,
            src_seq_length=src_seq_length, tgt_seq_length=tgt_seq_length,
            dynamic_dict=dynamic_dict, use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        dataset = ImageDataset(
            fields, src_iter, tgt_iter, n_src_feats, n_tgt_feats,
            tgt_seq_length=tgt_seq_length, use_filter_pred=use_filter_pred)
    elif data_type == 'audio':
        dataset = AudioDataset(
            fields, src_iter, tgt_iter, n_src_feats, n_tgt_feats,
            tgt_seq_length=tgt_seq_length, sample_rate=sample_rate,
            window_size=window_size, window_stride=window_stride,
            window=window, normalize_audio=normalize_audio,
            use_filter_pred=use_filter_pred)

    return dataset
# Example #3
def get_num_features(data_type, corpus_file, side):
    """
    Args:
        data_type (str): type of the source input.
            Options are [text|img|audio].
        corpus_file (str): file path to get the features.
        side (str): for source or for target.

    Returns:
        number of features on `side`.

    Raises:
        ValueError: if `data_type` is not one of [text|img|audio].
    """
    assert side in ["src", "tgt"]

    if data_type == 'text':
        return TextDataset.get_num_features(corpus_file, side)
    elif data_type == 'img':
        return ImageDataset.get_num_features(corpus_file, side)
    elif data_type == 'audio':
        return AudioDataset.get_num_features(corpus_file, side)
    # Previously an unknown data_type fell through and silently returned
    # None, deferring the failure to the caller; fail fast instead.
    raise ValueError("Unsupported data_type: %s" % data_type)
# Example #4
def get_fields(data_type, n_src_features, n_tgt_features):
    """
    Args:
        data_type: type of the source input. Options are [text|img|audio].
        n_src_features: the number of source features to
            create `torchtext.data.Field` for.
        n_tgt_features: the number of target features to
            create `torchtext.data.Field` for.

    Returns:
        A dictionary whose keys are strings and whose values are the
        corresponding Field objects.

    Raises:
        ValueError: if `data_type` is not one of [text|img|audio].
    """
    if data_type == 'text':
        return TextDataset.get_fields(n_src_features, n_tgt_features)
    elif data_type == 'img':
        return ImageDataset.get_fields(n_src_features, n_tgt_features)
    elif data_type == 'audio':
        return AudioDataset.get_fields(n_src_features, n_tgt_features)
    # Previously an unknown data_type fell through and silently returned
    # None, deferring the failure to the caller; fail fast instead.
    raise ValueError("Unsupported data_type: %s" % data_type)
# Example #5
def build_my_dataset(fields,
                     data_type,
                     src_path,
                     conversation_path,
                     tgt_path,
                     src_dir=None,
                     src_seq_length=0,
                     conversation_seq_length=0,
                     tgt_seq_length=0,
                     src_seq_length_trunc=0,
                     conversation_seq_length_trunc=0,
                     tgt_seq_length_trunc=0,
                     dynamic_dict=True,
                     sample_rate=0,
                     window_size=0,
                     window_stride=0,
                     window=None,
                     normalize_audio=True,
                     use_filter_pred=True):
    """
    Build a dataset with an extra 'conversation' (context) text stream
    alongside the usual src/tgt pair; the conversation stream is only
    consumed by the 'text' data type.
    """
    # All three streams are read as plain text; build the
    # (example iterator, num_feats) tuple for each of them.
    src_iter, src_nfeats = TextDataset.make_text_examples_nfeats_tpl(
        src_path, src_seq_length_trunc, "src")
    conv_iter, conv_nfeats = TextDataset.make_text_examples_nfeats_tpl(
        conversation_path, conversation_seq_length_trunc, "conversation")
    tgt_iter, tgt_nfeats = TextDataset.make_text_examples_nfeats_tpl(
        tgt_path, tgt_seq_length_trunc, "tgt")

    if data_type == 'text':
        dataset = TextDataset(fields, src_iter, conv_iter, tgt_iter,
                              src_nfeats, conv_nfeats, tgt_nfeats,
                              src_seq_length=src_seq_length,
                              conversation_seq_length=conversation_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        dataset = ImageDataset(fields, src_iter, tgt_iter,
                               src_nfeats, tgt_nfeats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred)
    elif data_type == 'audio':
        dataset = AudioDataset(fields, src_iter, tgt_iter,
                               src_nfeats, tgt_nfeats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)

    return dataset
# Example #6
def build_dataset(fields,
                  data_type,
                  src_path,
                  tgt_path,
                  doc_path=None,
                  src_dir=None,
                  src_seq_length=0,
                  tgt_seq_length=0,
                  src_seq_length_trunc=0,
                  tgt_seq_length_trunc=0,
                  dynamic_dict=True,
                  sample_rate=0,
                  window_size=0,
                  window_stride=0,
                  window=None,
                  normalize_audio=True,
                  use_filter_pred=True):
    """
    Build a dataset from src/tgt corpora plus a document-index file.

    Args:
        fields: dict of torchtext Fields for the dataset.
        data_type (str): 'text', 'img' or 'audio'.
        src_path / tgt_path (str): corpus file paths.
        doc_path (str or None): file with one integer per line; None
            yields an empty document index.
        Remaining arguments are per-modality preprocessing options.

    Returns:
        The constructed TextDataset / ImageDataset / AudioDataset.
    """
    # Build src/tgt examples iterator from corpus files, also extract
    # number of features.
    src_examples_iter, num_src_feats = \
        _make_examples_nfeats_tpl(data_type, src_path, src_dir,
                                  src_seq_length_trunc, sample_rate,
                                  window_size, window_stride,
                                  window, normalize_audio)

    # For all data types, the tgt side corpus is in form of text.
    tgt_examples_iter, num_tgt_feats = \
        TextDataset.make_text_examples_nfeats_tpl(
            tgt_path, tgt_seq_length_trunc, "tgt")

    # Fix: the original opened doc_path without ever closing the handle,
    # and crashed with TypeError when doc_path was left at its default
    # None. Use a context manager and treat a missing path as empty.
    if doc_path is None:
        doc_index = []
    else:
        with open(doc_path) as doc_file:
            doc_index = [int(line.strip()) for line in doc_file]

    if data_type == 'text':
        dataset = TextDataset(fields,
                              src_examples_iter,
                              tgt_examples_iter,
                              doc_index,
                              num_src_feats,
                              num_tgt_feats,
                              src_seq_length=src_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)

    elif data_type == 'img':
        dataset = ImageDataset(fields,
                               src_examples_iter,
                               tgt_examples_iter,
                               num_src_feats,
                               num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred)

    elif data_type == 'audio':
        dataset = AudioDataset(fields,
                               src_examples_iter,
                               tgt_examples_iter,
                               num_src_feats,
                               num_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)

    return dataset
# Example #7
def build_dataset(fields,
                  data_type,
                  src_path,
                  tgt_path,
                  src_dir=None,
                  src_seq_length=0,
                  tgt_seq_length=0,
                  src_seq_length_trunc=0,
                  tgt_seq_length_trunc=0,
                  dynamic_dict=True,
                  sample_rate=0,
                  window_size=0,
                  window_stride=0,
                  window=None,
                  normalize_audio=True,
                  use_filter_pred=True,
                  symbol_representation="word2word",
                  revert_targets=False):
    """
    Build a dataset whose per-side symbol representation level is derived
    from `symbol_representation`; `revert_targets` is forwarded to the
    target-side text reader.
    """
    # Resolve the representation level for each side first.
    src_repr, tgt_repr = get_input_output_representation_level(
        symbol_representation)

    # Source examples and feature count, dispatched on the modality.
    src_iter, n_src_feats = _make_examples_nfeats_tpl(
        data_type, src_path, src_dir, src_seq_length_trunc, sample_rate,
        window_size, window_stride, window, normalize_audio, src_repr)

    # The target side is always plain text.
    tgt_iter, n_tgt_feats = TextDataset.make_text_examples_nfeats_tpl(
        tgt_path, tgt_seq_length_trunc, "tgt", tgt_repr, revert_targets)

    if data_type == 'text':
        dataset = TextDataset(fields, src_iter, tgt_iter,
                              n_src_feats, n_tgt_feats,
                              src_seq_length=src_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        dataset = ImageDataset(fields, src_iter, tgt_iter,
                               n_src_feats, n_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred)
    elif data_type == 'audio':
        dataset = AudioDataset(fields, src_iter, tgt_iter,
                               n_src_feats, n_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)

    return dataset
# Example #8
def build_dataset(fields,
                  data_type,
                  src_path,
                  tgt_path,
                  src_path2,
                  tgt_path2,
                  src_dir=None,
                  src_seq_length=0,
                  tgt_seq_length=0,
                  src_seq_length_trunc=0,
                  tgt_seq_length_trunc=0,
                  dynamic_dict=True,
                  sample_rate=0,
                  window_size=0,
                  window_stride=0,
                  window=None,
                  normalize_audio=True,
                  use_filter_pred=True):
    """
    Build a dataset from two parallel src/tgt corpus pairs; the 'text'
    data type consumes both pairs, while img/audio use only the first.
    """
    # First src/tgt pair (sides "src1"/"tgt1").
    src_iter1, n_src_feats1 = _make_examples_nfeats_tpl(
        data_type, src_path, src_dir, src_seq_length_trunc, sample_rate,
        window_size, window_stride, window, normalize_audio, "src1")
    tgt_iter1, n_tgt_feats1 = TextDataset.make_text_examples_nfeats_tpl(
        tgt_path, tgt_seq_length_trunc, "tgt1")

    # Second src/tgt pair (sides "src2"/"tgt2").
    # NOTE(review): the second pair's feature counts are computed but
    # never forwarded to the dataset — confirm that is intentional.
    src_iter2, n_src_feats2 = _make_examples_nfeats_tpl(
        data_type, src_path2, src_dir, src_seq_length_trunc, sample_rate,
        window_size, window_stride, window, normalize_audio, "src2")
    tgt_iter2, n_tgt_feats2 = TextDataset.make_text_examples_nfeats_tpl(
        tgt_path2, tgt_seq_length_trunc, "tgt2")

    if data_type == 'text':
        dataset = TextDataset(fields, src_iter1, tgt_iter1,
                              src_iter2, tgt_iter2,
                              n_src_feats1, n_tgt_feats1,
                              src_seq_length=src_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        dataset = ImageDataset(fields, src_iter1, tgt_iter1,
                               n_src_feats1, n_tgt_feats1,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred)
    elif data_type == 'audio':
        dataset = AudioDataset(fields, src_iter1, tgt_iter1,
                               n_src_feats1, n_tgt_feats1,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)

    return dataset
# Example #9
def build_dataset(fields,
                  data_type,
                  src_path,
                  tgt_path,
                  src_dir=None,
                  second_data_type=None,
                  second_src_path=None,
                  src_seq_length=0,
                  tgt_seq_length=0,
                  src_seq_length_trunc=0,
                  tgt_seq_length_trunc=0,
                  dynamic_dict=True,
                  sample_rate=0,
                  window_size=0,
                  window_stride=0,
                  window=None,
                  normalize_audio=True,
                  use_filter_pred=True,
                  file_to_tensor_fn=None):
    """
    Build a dataset, optionally combining a second (non-text) source
    modality with a primary text source into a MultiModalDataset.
    """
    two_sources = second_data_type is not None
    if two_sources:
        assert data_type == 'text'  # Only implemented for primary input type text
        # Second data type should not be text. One could simply append his secondary text
        # to the primary input.
        assert second_data_type != 'text', 'second_data_type cannot be text.'
        assert second_src_path is not None and src_dir is not None, \
            'If second_data_type is set, second_src_path as well as src_dir needs to be present'

    # Primary source examples and feature count, dispatched on data_type.
    src_iter, n_src_feats = _make_examples_nfeats_tpl(
        data_type, src_path, src_dir, src_seq_length_trunc, sample_rate,
        window_size, window_stride, window, normalize_audio,
        file_to_tensor_fn=file_to_tensor_fn)

    if two_sources:
        # Secondary modality stream, read from the same src_dir as 'src2'.
        src2_iter, n_src2_feats = _make_examples_nfeats_tpl(
            second_data_type, second_src_path, src_dir,
            src_seq_length_trunc, sample_rate, window_size, window_stride,
            window, normalize_audio, side='src2',
            file_to_tensor_fn=file_to_tensor_fn)

    # For all data types, the tgt side corpus is in form of text.
    tgt_iter, n_tgt_feats = TextDataset.make_text_examples_nfeats_tpl(
        tgt_path, tgt_seq_length_trunc, "tgt")

    if two_sources:
        dataset = MultiModalDataset(fields, src_iter, src2_iter,
                                    second_data_type, tgt_iter,
                                    n_src_feats, n_src2_feats, n_tgt_feats,
                                    src_seq_length=src_seq_length,
                                    tgt_seq_length=tgt_seq_length,
                                    use_filter_pred=use_filter_pred)
    elif data_type == 'text':
        dataset = TextDataset(fields, src_iter, tgt_iter,
                              n_src_feats, n_tgt_feats,
                              src_seq_length=src_seq_length,
                              tgt_seq_length=tgt_seq_length,
                              dynamic_dict=dynamic_dict,
                              use_filter_pred=use_filter_pred)
    elif data_type == 'img':
        dataset = ImageDataset(fields, src_iter, tgt_iter,
                               n_src_feats, n_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               use_filter_pred=use_filter_pred)
    elif data_type == 'audio':
        dataset = AudioDataset(fields, src_iter, tgt_iter,
                               n_src_feats, n_tgt_feats,
                               tgt_seq_length=tgt_seq_length,
                               sample_rate=sample_rate,
                               window_size=window_size,
                               window_stride=window_stride,
                               window=window,
                               normalize_audio=normalize_audio,
                               use_filter_pred=use_filter_pred)

    return dataset