Example No. 1
def create_textset(tokenizer, train_split, dev_split, name, path, bucketing,
                   batch_size):
    ''' Interface for creating all kinds of text datasets'''

    # Recognize corpus
    if name.lower() == "librispeech":
        from corpus.librispeech import LibriTextDataset as Dataset
    elif name.lower() == "dlhlp":
        from corpus.dlhlp import DlhlpTextDataset as Dataset
    else:
        raise NotImplementedError

    # Create dataset
    # With bucketing, each dataset item is already a bucket of `batch_size`
    # utterances, so the training DataLoader batch size drops to 1.
    bucket_size = batch_size if bucketing else 1
    tr_loader_bs = 1 if bucketing else batch_size
    # Do not use bucketing for dev set
    dv_set = Dataset(path, dev_split, tokenizer, 1)
    tr_set = Dataset(path, train_split, tokenizer, bucket_size)

    # Messages to show
    msg_list = _data_msg(name, path, str(train_split), len(tr_set),
                         str(dev_split), len(dv_set), batch_size,
                         bucketing)

    return tr_set, dv_set, tr_loader_bs, batch_size, msg_list
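
For reference, a minimal sketch of how create_textset is typically wired into DataLoaders; the tokenizer object, the split names, and the shuffle settings are assumptions about the surrounding code, not part of this snippet.

from torch.utils.data import DataLoader

# `tokenizer` is assumed to come from the repo's text-encoder factory.
tr_set, dv_set, tr_loader_bs, dv_bs, msgs = create_textset(
    tokenizer=tokenizer,
    train_split=['train-clean-100'],
    dev_split=['dev-clean'],
    name='librispeech',
    path='data/LibriSpeech',
    bucketing=True,
    batch_size=32)

# Bucketing moves batching into the dataset, hence batch_size=1 here.
tr_loader = DataLoader(tr_set, batch_size=tr_loader_bs, shuffle=True)
dv_loader = DataLoader(dv_set, batch_size=dv_bs, shuffle=False)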
Example No. 2
def create_dataset(tokenizer,
                   ascending,
                   name,
                   path,
                   bucketing,
                   batch_size,
                   train_split=None,
                   dev_split=None,
                   test_split=None):
    ''' Interface for creating all kinds of datasets.
        name: Dataset name, e.g. LibriSpeech
        path: Dataset root dir, e.g. data/LibriSpeech
        bucketing: whether to group utterances of similar length into batch-sized buckets
        batch_size: number of utterances per batch (also the bucket size)
        train_split: list of training splits, e.g. ['train-clean-100','train-clean-360','train-other-500']
        dev_split: list of validation splits, e.g. ['dev-clean']
        test_split: list of test splits.
    '''

    # Recognize corpus
    if name.lower() == "librispeech":
        from corpus.librispeech import LibriDataset as Dataset
    else:
        raise NotImplementedError

    # Create dataset
    if train_split is not None:
        # Training mode
        mode = 'train'
        tr_loader_bs = 1 if bucketing and (not ascending) else batch_size
        # Ascending-length sorting bypasses bucketing
        bucket_size = batch_size if bucketing and (not ascending) else 1
        # Do not use bucketing for dev set
        dv_set = Dataset(path, dev_split, tokenizer, 1)
        tr_set = Dataset(path,
                         train_split,
                         tokenizer,
                         bucket_size,
                         ascending=ascending)
        # Messages to show
        msg_list = _data_msg(name, path, str(train_split), len(tr_set),
                             str(dev_split), len(dv_set), batch_size,
                             bucketing)

        return tr_set, dv_set, tr_loader_bs, batch_size, mode, msg_list
    else:
        # Testing mode
        mode = 'test'
        # Do not use bucketing for dev set
        dv_set = Dataset(path, dev_split, tokenizer, 1)
        # Do not use bucketing for test set
        tt_set = Dataset(path, test_split, tokenizer, 1)
        # Messages to show
        msg_list = _data_msg(name, path, str(dev_split), len(dv_set),
                             str(test_split), len(tt_set), batch_size,
                             False)
        msg_list = [
            m.replace('Dev', 'Test').replace('Train', 'Dev') for m in msg_list
        ]
        return dv_set, tt_set, batch_size, batch_size, mode, msg_list
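
The dual return signature is easy to misread, so here is a hedged sketch of both call modes; the tokenizer and split names are placeholders.

# Training mode: train_split given, returns the train/dev pair.
tr_set, dv_set, tr_bs, dv_bs, mode, _ = create_dataset(
    tokenizer, ascending=False, name='librispeech',
    path='data/LibriSpeech', bucketing=True, batch_size=16,
    train_split=['train-clean-100'], dev_split=['dev-clean'])
assert mode == 'train' and tr_bs == 1  # batching lives in the dataset

# Testing mode: train_split omitted, returns the dev/test pair instead.
dv_set, tt_set, dv_bs, tt_bs, mode, _ = create_dataset(
    tokenizer, ascending=False, name='librispeech',
    path='data/LibriSpeech', bucketing=False, batch_size=1,
    dev_split=['dev-clean'], test_split=['test-clean'])
assert mode == 'test'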
Example No. 3
def create_dataset(tokenizer,
                   ascending,
                   name,
                   path,
                   bucketing,
                   batch_size,
                   test_path,
                   train_split=None,
                   dev_split=None,
                   test_split=None):
    ''' Interface for creating all kinds of datasets'''

    # Recognize corpus
    if name.lower() == "librispeech":
        from corpus.librispeech import LibriDataset as Dataset
    elif name.lower() == "dlhlp":
        from corpus.dlhlp import DlhlpDataset as Dataset
    else:
        raise NotImplementedError

    # Create dataset
    if train_split is not None:
        # Training mode
        mode = 'train'
        tr_loader_bs = 1 if bucketing and (not ascending) else batch_size
        # Ascending-length sorting bypasses bucketing
        bucket_size = batch_size if bucketing and (not ascending) else 1
        # Do not use bucketing for dev set
        dv_set = Dataset(path, dev_split, tokenizer, 1)
        tr_set = Dataset(path,
                         train_split,
                         tokenizer,
                         bucket_size,
                         ascending=ascending)
        # Messages to show
        msg_list = _data_msg(name, path, str(train_split), len(tr_set),
                             str(dev_split), len(dv_set), batch_size,
                             bucketing)

        return tr_set, dv_set, tr_loader_bs, batch_size, mode, msg_list
    else:
        # Testing mode
        mode = 'test'
        # In this variant the dev slot is also filled from the test data,
        # and bucketing is disabled for both sets
        dv_set = Dataset(test_path, test_split, tokenizer, 1)
        tt_set = Dataset(test_path, test_split, tokenizer, 1)
        # Messages to show
        msg_list = _data_msg(name, path, str(dev_split), len(dv_set),
                             str(test_split), len(tt_set), batch_size,
                             False)
        msg_list = [
            m.replace('Dev', 'Test').replace('Train', 'Dev') for m in msg_list
        ]
        return dv_set, tt_set, batch_size, batch_size, mode, msg_list
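
The only change from Example No. 2 is the extra test_path argument, which points both returned sets at a separate test corpus. A sketch of the intended test-mode call, with paths and split names as placeholders:

# Both returned sets come from test_path/test_split here, matching the
# "tr_set=dv_set, dv_set=tt_set" convention noted in Example No. 4.
dv_set, tt_set, _, _, mode, _ = create_dataset(
    tokenizer, ascending=False, name='dlhlp',
    path='data/dlhlp', bucketing=False, batch_size=1,
    test_path='data/dlhlp-test',
    dev_split=['dev'], test_split=['test'])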
Example No. 4
# Assumed imports, following the repo's usual src/*.py layout
from functools import partial

from torch.utils.data import DataLoader

from src.audio import create_transform
from src.data import collect_audio_batch
from src.text import load_text_encoder


def repro_load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio,
                       text):
    ''' Prepare dataloader for testing'''

    # Audio feature extractor
    audio_transform, feat_dim = create_transform(audio.copy())
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Dataset (in testing mode, tr_set=dv_set, dv_set=tt_set)
    from corpus.dlhlp import DlhlpDataset as Dataset
    dv_set = Dataset(corpus['path'], corpus['test_split'], tokenizer, 1)
    # Collect function
    collect_dv = partial(collect_audio_batch,
                         audio_transform=audio_transform,
                         mode='test')
    # Create data loader
    dv_set = DataLoader(dv_set,
                        batch_size=1,
                        shuffle=False,
                        drop_last=False,
                        collate_fn=collect_dv,
                        num_workers=n_jobs,
                        pin_memory=pin_memory)

    msg = 'I/O spec.  | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'.format(
        audio['feat_type'], feat_dim, tokenizer.token_type,
        tokenizer.vocab_size)
    return dv_set, feat_dim, tokenizer.vocab_size, tokenizer, msg
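
A sketch of driving repro_load_dataset; the config dicts mirror what an experiment YAML would provide, and every key and value below is a placeholder rather than the project's actual schema.

corpus_cfg = {'path': 'data/dlhlp', 'test_split': ['test']}
audio_cfg = {'feat_type': 'fbank', 'feat_dim': 40}           # placeholder keys
text_cfg = {'mode': 'character', 'vocab_file': 'vocab.txt'}  # placeholder keys

dv_loader, feat_dim, vocab_size, tokenizer, msg = repro_load_dataset(
    n_jobs=2, use_gpu=False, pin_memory=False, ascending=False,
    corpus=corpus_cfg, audio=audio_cfg, text=text_cfg)
print(msg)

for batch in dv_loader:
    # Batch layout depends on collect_audio_batch; commonly something like
    # (names, padded features, feature lengths, token ids).
    pass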
Example No. 5
    def _create_dataset(tokenizer,
                        ascending,
                        name,
                        path,
                        bucketing,
                        batch_size,
                        train_split=None,
                        dev_split=None,
                        test_split=None):
        ''' Interface for creating all kinds of datasets'''

        # Recognize corpus
        if name.lower() == "librispeech":
            from corpus.librispeech import LibriDataset as Dataset
        elif name.lower() == "dlhlp":
            from corpus.dlhlp import DlhlpDataset as Dataset
        elif name.lower() == 'external':
            from corpus.external import ExternalDataset as Dataset
        else:
            raise NotImplementedError

        # Testing mode
        mode = 'test'
        # Do not use bucketing for test set
        tt_set = Dataset(path, test_split, tokenizer, 1)
        # No messages to show in this variant
        return tt_set, batch_size, mode, []
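
Since this variant only builds a test set, a call collapses to a single tuple. A sketch with placeholder arguments; note _create_dataset is defined as an indented inner helper, so it is assumed to be in scope here.

tt_set, bs, mode, msgs = _create_dataset(
    tokenizer, ascending=False, name='external',
    path='data/external', bucketing=False, batch_size=1,
    test_split=['test'])
assert mode == 'test' and msgs == []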