예제 #1
0
def main():
    """Download and unpack Common Voice if needed, convert it, build manifests.

    Configuration is read from the ``args`` object in the enclosing scope:
    ``target_dir``, ``language_dir``, ``files_to_process``, ``manifest_dir``,
    ``min_duration``, ``max_duration`` and ``num_workers``.

    The download/unpack step is skipped entirely when
    ``<target_dir>/CV_unpacked`` already exists.
    """
    target_dir = args.target_dir
    language_dir = args.language_dir

    os.makedirs(target_dir, exist_ok=True)

    target_unpacked_dir = os.path.join(target_dir, "CV_unpacked")

    if os.path.exists(target_unpacked_dir):
        print('Find existing folder {}'.format(target_unpacked_dir))
    else:
        print("Could not find Common Voice, Downloading corpus...")

        filename = wget.download(COMMON_VOICE_URL, target_dir)
        target_file = os.path.join(target_dir, os.path.basename(filename))

        os.makedirs(target_unpacked_dir, exist_ok=True)
        print("Unpacking corpus to {} ...".format(target_unpacked_dir))
        # The context manager closes the archive even if extraction raises.
        with tarfile.open(target_file) as tar:
            tar.extractall(target_unpacked_dir)

    folder_path = os.path.join(target_unpacked_dir,
                               VERSION + '/{}/'.format(language_dir))

    # The CSV name without its extension is used both as the output
    # sub-directory and as part of the manifest file name.
    for csv_file in args.files_to_process:
        split_name = os.path.splitext(csv_file)[0]
        convert_to_wav(csv_file=os.path.join(folder_path, csv_file),
                       target_dir=os.path.join(target_dir, split_name),
                       num_workers=args.num_workers)

    print('Creating manifests...')
    for csv_file in args.files_to_process:
        split_name = os.path.splitext(csv_file)[0]
        create_manifest(data_path=os.path.join(target_dir, split_name),
                        output_name='commonvoice_' + split_name +
                        '_manifest.json',
                        manifest_path=args.manifest_dir,
                        min_duration=args.min_duration,
                        max_duration=args.max_duration,
                        num_workers=args.num_workers)
예제 #2
0
def main():
    """Fetch the AN4 archive, format the data and write three manifests.

    Configuration is read from the ``args`` object in the enclosing scope:
    ``target_dir``, ``manifest_dir``, ``min_duration`` and ``max_duration``.
    The archive is downloaded into the current working directory unless a
    file with the expected name is already there; both the archive and the
    extracted ``an4/`` tree are deleted once formatting is done.
    """
    root_path = 'an4/'
    raw_tar_path = 'an4_raw.bigendian.tar.gz'
    if not os.path.exists(raw_tar_path):
        wget.download('http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz')
    # Close the archive before it is deleted below: an open handle is a leak
    # and prevents removal of the file on some platforms.
    with tarfile.open(raw_tar_path) as tar:
        tar.extractall()
    os.makedirs(args.target_dir, exist_ok=True)
    _format_training_data(root_path=root_path)
    _format_test_data(root_path=root_path)
    shutil.rmtree(root_path)
    os.remove(raw_tar_path)
    train_path = args.target_dir + '/train/'
    val_path = args.target_dir + '/val/'
    test_path = args.target_dir + '/test/'
    print('Creating manifests...')
    create_manifest(data_path=train_path,
                    output_name='an4_train_manifest.csv',
                    manifest_path=args.manifest_dir,
                    min_duration=args.min_duration,
                    max_duration=args.max_duration)
    create_manifest(data_path=val_path,
                    output_name='an4_val_manifest.csv',
                    manifest_path=args.manifest_dir,
                    min_duration=args.min_duration,
                    max_duration=args.max_duration)
    # The test manifest is created without duration limits.
    create_manifest(data_path=test_path,
                    output_name='an4_test_manifest.csv',
                    manifest_path=args.manifest_dir)
def main():
    """Unpack a Common Voice archive, convert the listed CSVs, build manifests.

    Configuration is read from the ``args`` object in the enclosing scope:
    ``target_dir``, ``tar_path``, ``files_to_process`` (comma-separated CSV
    file names), ``manifest_dir``, ``min_duration`` and ``max_duration``.
    The archive at ``args.tar_path`` is used when it exists; otherwise the
    corpus is downloaded into ``target_dir``.
    """
    target_dir = args.target_dir
    os.makedirs(target_dir, exist_ok=True)

    target_unpacked_dir = os.path.join(target_dir, "CV_unpacked")
    os.makedirs(target_unpacked_dir, exist_ok=True)

    if args.tar_path and os.path.exists(args.tar_path):
        print('Find existing file {}'.format(args.tar_path))
        target_file = args.tar_path
    else:
        print(
            "Could not find downloaded Common Voice archive, Downloading corpus..."
        )
        filename = wget.download(COMMON_VOICE_URL, target_dir)
        target_file = os.path.join(target_dir, os.path.basename(filename))

    print("Unpacking corpus to {} ...".format(target_unpacked_dir))
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(target_file) as tar:
        tar.extractall(target_unpacked_dir)

    # Parse the comma-separated list once and reuse it for both passes.
    csv_files = args.files_to_process.split(',')

    for csv_file in csv_files:
        convert_to_wav(
            os.path.join(target_unpacked_dir, 'cv-corpus-5-2020-06-22/zh-CN/',
                         csv_file),
            os.path.join(target_dir,
                         os.path.splitext(csv_file)[0]))

    print('Creating manifests...')
    for csv_file in csv_files:
        split_name = os.path.splitext(csv_file)[0]
        create_manifest(data_path=os.path.join(target_dir, split_name),
                        output_name=split_name + '_manifest.csv',
                        manifest_path=args.manifest_dir,
                        min_duration=args.min_duration,
                        max_duration=args.max_duration)
예제 #4
0
def main():
    """Build a DataFrame or a manifest for ``args.source_dir``.

    With ``args.dataframe`` set, only ``create_dataframe`` is called.
    Otherwise a manifest is created and, when ``args.split`` is given, the
    manifest CSV is split into ``train_``/``val_``/``test_`` files written
    to ``args.manifest_dir``.

    Raises:
        NotADirectoryError: if ``args.source_dir`` does not exist.
    """
    source_dir = args.source_dir
    if not os.path.exists(source_dir):
        raise NotADirectoryError(f"Directory does not exist: {source_dir}")

    if args.dataframe:
        print("Creating DataFrame")
        create_dataframe(
            data_path=source_dir,
            output_name=f"{args.manifest_name}.csv",
            manifest_path=args.manifest_dir,
        )
        return

    print("Creating manifest")
    manifest = create_manifest(
        data_path=source_dir,
        output_name=f"{args.manifest_name}.csv",
        manifest_path=args.manifest_dir,
        min_duration=args.min_duration,
        max_duration=args.max_duration,
    )

    if args.split is None:
        return

    # Exactly three values are expected; the first (train) is parsed but the
    # train set is simply whatever remains after the other splits.
    fractions = [float(part) for part in args.split]
    _train, val, test = fractions

    records = pd.read_csv(manifest)
    out_dir = Path(args.manifest_dir)
    print(f"Total size of manifest is {len(records)} records.")

    test_part = None
    if test > 0:
        remainder, test_part = train_test_split(records, test_size=test)
        # NOTE(review): `val` is applied to the rows left after the test
        # split, not to the full manifest -- confirm this is intended.
        train_part, val_part = train_test_split(remainder, test_size=val)
        test_part.to_csv(out_dir / f"test_{args.manifest_name}.csv",
                         index=False)
    else:
        train_part, val_part = train_test_split(records, test_size=val)

    train_part.to_csv(out_dir / f"train_{args.manifest_name}.csv",
                      index=False)
    val_part.to_csv(out_dir / f"val_{args.manifest_name}.csv",
                    index=False)

    test_size = 0 if test_part is None else len(test_part)
    print(f"Train size is {len(train_part)}")
    print(f"Val size is {len(val_part)}")
    print(f"Test size is {test_size}")
예제 #5
0
def main():
    """Obtain and unpack TEDLIUM release 2, prepare it, build manifests.

    Configuration is read from the ``args`` object in the enclosing scope:
    ``target_dir``, ``tar_path``, ``manifest_dir``, ``min_duration``,
    ``max_duration`` and ``num_workers``.

    Nothing is downloaded or unpacked when
    ``<target_dir>/TEDLIUM_release2`` already exists. Otherwise the archive
    is taken from ``args.tar_path`` if it exists, else from a previously
    downloaded ``TEDLIUM_release2.tar.gz`` in ``target_dir``, else it is
    downloaded.
    """
    target_dl_dir = args.target_dir
    os.makedirs(target_dl_dir, exist_ok=True)

    target_unpacked_dir = os.path.join(target_dl_dir, "TEDLIUM_release2")
    if os.path.exists(target_unpacked_dir):
        # Check this first so an already-unpacked corpus never triggers a
        # download of an archive that would not be used.
        print("Found TEDLIUM directory, skipping unpacking of tar files")
    else:
        if args.tar_path and os.path.exists(args.tar_path):
            target_file = args.tar_path
        else:
            target_file = os.path.join(target_dl_dir,
                                       "TEDLIUM_release2.tar.gz")
            # Reuse an archive already present in the target directory.
            if not os.path.exists(target_file):
                print(
                    "Could not find downloaded TEDLIUM archive, Downloading corpus...")
                wget.download(TED_LIUM_V2_DL_URL, target_dl_dir)

        print("Unpacking corpus...")
        # The context manager closes the archive even if extraction raises.
        with tarfile.open(target_file) as tar:
            tar.extractall(target_dl_dir)

    train_ted_dir = os.path.join(target_unpacked_dir, "train")
    val_ted_dir = os.path.join(target_unpacked_dir, "dev")
    test_ted_dir = os.path.join(target_unpacked_dir, "test")

    prepare_dir(train_ted_dir)
    prepare_dir(val_ted_dir)
    prepare_dir(test_ted_dir)
    print('Creating manifests...')

    # Only the training manifest is given duration limits.
    create_manifest(data_path=train_ted_dir,
                    output_name='ted_train_manifest.json',
                    manifest_path=args.manifest_dir,
                    min_duration=args.min_duration,
                    max_duration=args.max_duration,
                    num_workers=args.num_workers)
    create_manifest(data_path=val_ted_dir,
                    output_name='ted_val_manifest.json',
                    manifest_path=args.manifest_dir,
                    num_workers=args.num_workers)
    create_manifest(data_path=test_ted_dir,
                    output_name='ted_test_manifest.json',
                    manifest_path=args.manifest_dir,
                    num_workers=args.num_workers)
예제 #6
0
def download_an4(target_dir: str, manifest_dir: str, min_duration: float,
                 max_duration: float, val_fraction: float, sample_rate: int,
                 num_workers: int):
    """Fetch the AN4 archive, format it into ``target_dir`` and write manifests.

    The archive is downloaded into the current working directory unless a
    file with the expected name is already there. After formatting, both the
    archive and the extracted ``an4/`` tree are deleted.

    Args:
        target_dir: Directory that receives the ``train``/``val``/``test``
            data; created if missing.
        manifest_dir: Passed to ``create_manifest`` as ``manifest_path``.
        min_duration: Lower duration bound for the train and val manifests.
        max_duration: Upper duration bound for the train and val manifests.
        val_fraction: Forwarded to ``_format_training_data``.
        sample_rate: Forwarded to both formatting helpers.
        num_workers: Forwarded to every ``create_manifest`` call.
    """
    root_path = 'an4/'
    raw_tar_path = 'an4_raw.bigendian.tar.gz'
    if not os.path.exists(raw_tar_path):
        wget.download(
            'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz'
        )
    # Close the archive before it is deleted below: an open handle is a leak
    # and prevents removal of the file on some platforms.
    with tarfile.open(raw_tar_path) as tar:
        tar.extractall()
    os.makedirs(target_dir, exist_ok=True)
    _format_training_data(root_path=root_path,
                          val_fraction=val_fraction,
                          sample_rate=sample_rate,
                          target_dir=target_dir)
    _format_test_data(root_path=root_path,
                      sample_rate=sample_rate,
                      target_dir=target_dir)
    shutil.rmtree(root_path)
    os.remove(raw_tar_path)
    train_path = target_dir + '/train/'
    val_path = target_dir + '/val/'
    test_path = target_dir + '/test/'

    print('Creating manifests...')
    create_manifest(data_path=train_path,
                    output_name='an4_train_manifest.json',
                    manifest_path=manifest_dir,
                    min_duration=min_duration,
                    max_duration=max_duration,
                    num_workers=num_workers)
    create_manifest(data_path=val_path,
                    output_name='an4_val_manifest.json',
                    manifest_path=manifest_dir,
                    min_duration=min_duration,
                    max_duration=max_duration,
                    num_workers=num_workers)
    # The test manifest is created without duration limits.
    create_manifest(data_path=test_path,
                    output_name='an4_test_manifest.json',
                    manifest_path=manifest_dir,
                    num_workers=num_workers)
예제 #7
0
def main():
    """Download the selected LibriSpeech archives, convert them, build manifests.

    Configuration is read from the ``args`` object in the enclosing scope:
    ``target_dir``, ``files_to_use`` (comma-separated substrings; a URL is
    processed when it contains any of them), ``manifest_dir``,
    ``min_duration``, ``max_duration`` and ``num_workers``.

    For every split in ``LIBRI_SPEECH_URLS`` a ``wav`` and a ``txt``
    directory are created under ``<target_dir>/<split>``, each selected
    archive is downloaded (unless already present), unpacked, processed file
    by file and then deleted, and finally one manifest per split is written.
    """
    target_dl_dir = args.target_dir
    os.makedirs(target_dl_dir, exist_ok=True)
    files_to_dl = args.files_to_use.strip().split(',')
    for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items():
        split_dir = os.path.join(target_dl_dir, split_type)
        split_wav_dir = os.path.join(split_dir, "wav")
        split_txt_dir = os.path.join(split_dir, "txt")
        # makedirs also creates split_dir as an intermediate directory.
        os.makedirs(split_wav_dir, exist_ok=True)
        os.makedirs(split_txt_dir, exist_ok=True)
        extracted_dir = os.path.join(split_dir, "LibriSpeech")
        # Drop leftovers of an earlier, interrupted run.
        if os.path.exists(extracted_dir):
            shutil.rmtree(extracted_dir)
        for url in lst_libri_urls:
            # check if we want to dl this file
            if not any(wanted in url for wanted in files_to_dl):
                print("Skipping url: {}".format(url))
                continue
            filename = url.split("/")[-1]
            target_filename = os.path.join(split_dir, filename)
            if not os.path.exists(target_filename):
                wget.download(url, split_dir)
            print("Unpacking {}...".format(filename))
            # The context manager closes the archive even if extraction
            # raises, and before the file is removed.
            with tarfile.open(target_filename) as tar:
                tar.extractall(split_dir)
            os.remove(target_filename)
            print("Converting flac files to wav and extracting transcripts...")
            assert os.path.exists(
                extracted_dir
            ), "Archive {} was not properly uncompressed.".format(filename)
            for root, subdirs, files in tqdm(os.walk(extracted_dir)):
                for name in files:
                    if ".flac" in name:
                        _process_file(wav_dir=split_wav_dir,
                                      txt_dir=split_txt_dir,
                                      base_filename=name,
                                      root_dir=root)

            print("Finished {}".format(url))
            shutil.rmtree(extracted_dir)

        manifest_kwargs = dict(data_path=split_dir,
                               output_name='libri_' + split_type +
                               '_manifest.json',
                               manifest_path=args.manifest_dir,
                               num_workers=args.num_workers)
        if split_type == 'train':  # Prune to min/max duration
            manifest_kwargs.update(min_duration=args.min_duration,
                                   max_duration=args.max_duration)
        create_manifest(**manifest_kwargs)
예제 #8
0
                with io.FileIO(target_txt_file, "w") as file:
                    file.write(utterance.encode('utf-8'))
                original_wav_file = os.path.join(recordings_dir, wav_file)
                subprocess.call(["sox {}  -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate),
                                                                      target_wav_file)], shell=True)

        shutil.rmtree(dirpath)


if __name__ == '__main__':
    # Script entry point: read the index page at VOXFORGE_URL_16kHz, hand
    # every linked .tgz archive to prepare_sample, then write one manifest
    # covering target_dir.
    target_dir = args.target_dir
    sample_rate = args.sample_rate

    os.makedirs(target_dir, exist_ok=True)
    request = urllib.request.Request(VOXFORGE_URL_16kHz)
    # Close the HTTP response as soon as the page body has been read.
    with urllib.request.urlopen(request) as response:
        content = response.read()
    # Raw string: same pattern as before, without invalid escape sequences
    # in the string literal.
    all_files = re.findall(r'href\="(.*\.tgz)"', content.decode("utf-8"))
    for f in tqdm(all_files, total=len(all_files)):
        prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, target_dir)
    print('Creating manifests...')
    create_manifest(
        data_path=target_dir,
        output_name='voxforge_train_manifest.json',
        manifest_path=args.manifest_dir,
        min_duration=args.min_duration,
        max_duration=args.max_duration,
        num_workers=args.num_workers
    )
예제 #9
0
                target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(recording_name, recording_id))
                target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id))
                with io.FileIO(target_txt_file, "w") as file:
                    file.write(utterance.encode('utf-8'))
                original_wav_file = os.path.join(recordings_dir, wav_file)
                subprocess.call(["sox {}  -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate),
                                                                      target_wav_file)], shell=True)

        shutil.rmtree(dirpath)


if __name__ == '__main__':
    # Script entry point: read the index page at VOXFORGE_URL_16kHz, hand
    # every linked .tgz archive to prepare_sample, then write one manifest
    # covering target_dir.
    target_dir = args.target_dir
    sample_rate = args.sample_rate

    os.makedirs(target_dir, exist_ok=True)
    request = urllib.request.Request(VOXFORGE_URL_16kHz)
    # Close the HTTP response as soon as the page body has been read.
    with urllib.request.urlopen(request) as response:
        content = response.read()
    # Raw string: same pattern as before, without invalid escape sequences
    # in the string literal.
    all_files = re.findall(r'href\="(.*\.tgz)"', content.decode("utf-8"))
    for f in tqdm(all_files, total=len(all_files)):
        prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, target_dir)
    print('Creating manifests...')
    create_manifest(data_path=target_dir,
                    output_name='voxforge_train_manifest.csv',
                    manifest_path=args.manifest_dir,
                    min_duration=args.min_duration,
                    max_duration=args.max_duration)