예제 #1
0
def main():
    """Fetch, unpack and convert the Common Voice corpus, then write manifests.

    Reads ``target_dir``, ``tar_path``, ``files_to_process`` (a comma-separated
    list of CSV file names), ``min_duration`` and ``max_duration`` from the
    ``args`` object defined outside this function.
    """
    target_dir = args.target_dir
    os.makedirs(target_dir, exist_ok=True)

    target_unpacked_dir = os.path.join(target_dir, "CV_unpacked")
    os.makedirs(target_unpacked_dir, exist_ok=True)

    if args.tar_path and os.path.exists(args.tar_path):
        print('Find existing file {}'.format(args.tar_path))
        target_file = args.tar_path
    else:
        print("Could not find downloaded Common Voice archive, Downloading corpus...")
        filename = wget.download(COMMON_VOICE_URL, target_dir)
        target_file = os.path.join(target_dir, os.path.basename(filename))

    print("Unpacking corpus to {} ...".format(target_unpacked_dir))
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(target_file) as tar:
        tar.extractall(target_unpacked_dir)

    # Split once; both passes below walk the same list of CSV names.
    csv_files = args.files_to_process.split(',')

    for csv_file in csv_files:
        convert_to_wav(os.path.join(target_unpacked_dir, 'cv_corpus_v1/', csv_file),
                       os.path.join(target_dir, os.path.splitext(csv_file)[0]))

    print('Creating manifests...')
    for csv_file in csv_files:
        # Each CSV gets its own output directory and manifest named after
        # the CSV file without its extension.
        csv_stem = os.path.splitext(csv_file)[0]
        create_manifest(os.path.join(target_dir, csv_stem),
                        csv_stem + '_manifest.csv',
                        args.min_duration,
                        args.max_duration)
예제 #2
0
def main():
    """Prepare the Common Voice corpus: obtain the archive, unpack, convert.

    Uses the externally defined ``args`` for the output directory, an optional
    local archive path, the comma-separated CSV list and the duration bounds
    passed to ``create_manifest``.
    """
    out_root = args.target_dir
    os.makedirs(out_root, exist_ok=True)

    unpack_root = os.path.join(out_root, "CV_unpacked")
    os.makedirs(unpack_root, exist_ok=True)

    have_local_archive = args.tar_path and os.path.exists(args.tar_path)
    if not have_local_archive:
        print("Could not find downloaded Common Voice archive, Downloading corpus...")
        downloaded = wget.download(COMMON_VOICE_URL, out_root)
        archive_path = os.path.join(out_root, os.path.basename(downloaded))
    else:
        print('Find existing file {}'.format(args.tar_path))
        archive_path = args.tar_path

    print("Unpacking corpus to {} ...".format(unpack_root))
    archive = tarfile.open(archive_path)
    archive.extractall(unpack_root)
    archive.close()

    # First pass: convert every listed CSV into its own output directory.
    for csv_name in args.files_to_process.split(','):
        stem = os.path.splitext(csv_name)[0]
        source_csv = os.path.join(unpack_root, 'cv_corpus_v1/', csv_name)
        convert_to_wav(source_csv, os.path.join(out_root, stem))

    print('Creating manifests...')
    # Second pass: one manifest per CSV, named after the CSV stem.
    for csv_name in args.files_to_process.split(','):
        stem = os.path.splitext(csv_name)[0]
        create_manifest(
            os.path.join(out_root, stem),
            stem + '_manifest.csv',
            args.min_duration,
            args.max_duration,
        )
예제 #3
0
def main():
    """Prepare the ``train_C`` split and create its manifest.

    Reads ``target_dir``, ``source_dir`` and ``data_type`` from the externally
    defined ``args``. The ``dev`` and ``test`` output directories are still
    created, but their preparation calls are currently disabled.
    """
    target_dl_dir = args.target_dir
    source_dl_dir = args.source_dir
    # speech_only/speech_and_noise
    data_type = args.data_type
    if not os.path.exists(target_dl_dir):
        os.makedirs(target_dl_dir)

    # prepare target dir
    target_train_dir = os.path.join(target_dl_dir, "train_C")
    if not os.path.exists(target_train_dir):
        os.makedirs(target_train_dir)
    target_val_dir = os.path.join(target_dl_dir, "dev")
    if not os.path.exists(target_val_dir):
        os.makedirs(target_val_dir)
    target_test_dir = os.path.join(target_dl_dir, "test")
    if not os.path.exists(target_test_dir):
        os.makedirs(target_test_dir)
    # source dir
    source_train = os.path.join(source_dl_dir, "train_C")
    source_val = os.path.join(source_dl_dir, "dev")
    source_test = os.path.join(source_dl_dir, "test")

    # Single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the former print statements were a syntax error on Python 3.
    print(" prepare data for train  ")
    prepare_dir(target_train_dir, source_train, data_type)
    print(" prepare data for dev  ")
    #   prepare_dir(target_val_dir, source_val, data_type)
    print(" prepare data for test  ")
    #  prepare_dir(target_test_dir, source_test, data_type)
    print('Creating manifests...')

    create_manifest(target_dl_dir, os.path.join(target_train_dir, "converted"),
                    'train_C', data_type)
예제 #4
0
def main():
    """Download, unpack and convert the selected LibriSpeech archives.

    For every split in ``LIBRI_SPEECH_URLS`` the archives whose URL contains
    one of the comma-separated ``args.files_to_use`` entries are downloaded
    (unless already present), extracted, converted through ``_process_file``
    and removed again. A manifest is then created per split; only the
    ``train`` split is given the min/max duration bounds.
    """
    target_dl_dir = args.target_dir
    if not os.path.exists(target_dl_dir):
        os.makedirs(target_dl_dir)
    files_to_dl = args.files_to_use.strip().split(',')
    for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items():
        split_dir = os.path.join(target_dl_dir, split_type)
        if not os.path.exists(split_dir):
            os.makedirs(split_dir)
        split_wav_dir = os.path.join(split_dir, "wav")
        if not os.path.exists(split_wav_dir):
            os.makedirs(split_wav_dir)
        split_txt_dir = os.path.join(split_dir, "txt")
        if not os.path.exists(split_txt_dir):
            os.makedirs(split_txt_dir)
        extracted_dir = os.path.join(split_dir, "LibriSpeech")
        # Start from a clean extraction directory left by an earlier run.
        if os.path.exists(extracted_dir):
            shutil.rmtree(extracted_dir)
        for url in lst_libri_urls:
            # check if we want to dl this file
            filename = url.split("/")[-1]
            target_filename = os.path.join(split_dir, filename)

            if not any(url.find(f) != -1 for f in files_to_dl):
                continue

            if not os.path.exists(target_filename):
                print("Downloading file {} from {} to {}".format(
                    filename, url, target_filename))
                # A leftover `raise Exception("Error")` used to sit here and
                # made the download unreachable.
                wget.download(url, split_dir)
            else:
                print("Skipping existing file from url: {}".format(url))
            print("Unpacking {}...".format(filename))
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(target_filename) as tar:
                tar.extractall(split_dir)
            os.remove(target_filename)
            print("Converting flac files to wav and extracting transcripts...")
            assert os.path.exists(
                extracted_dir
            ), "Archive {} was not properly uncompressed.".format(filename)
            for root, subdirs, files in tqdm(os.walk(extracted_dir)):
                for f in files:
                    if f.find(".flac") != -1:
                        _process_file(wav_dir=split_wav_dir,
                                      txt_dir=split_txt_dir,
                                      base_filename=f,
                                      root_dir=root)

            print("Finished {}".format(url))
            shutil.rmtree(extracted_dir)
        if split_type == 'train':  # Prune to min/max duration
            create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv',
                            args.min_duration, args.max_duration)
        else:
            create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv')
예제 #5
0
def main():
    """Run the NIKL corpus preparation scripts and create the manifests.

    Creates ``args.target_dir`` when missing, runs ``local/clean_corpus.sh``
    and ``local/data_prep.sh`` against the corpus directory, then creates a
    train manifest (with duration bounds) and a validation manifest from the
    ``train/`` and ``test/`` sub-directories of the target directory.
    """
    if not os.path.isdir(args.target_dir):
        os.makedirs(args.target_dir)
    train_path = args.target_dir + '/train/'
    test_path = args.target_dir + '/test/'

    # subprocess.call with an argument list does not go through a shell, so
    # "$HOME" is never expanded for us; do it explicitly, otherwise the
    # scripts receive the literal string "$HOME/copora/NIKL".
    corpus_dir = os.path.expandvars("$HOME/copora/NIKL")
    subprocess.call(["local/clean_corpus.sh", corpus_dir, args.target_dir])
    subprocess.call(["local/data_prep.sh", corpus_dir, args.target_dir])

    print ('\n', 'Creating manifests...')
    create_manifest(train_path, 'nikl_train_manifest.csv', args.min_duration, args.max_duration)
    create_manifest(test_path, 'nikl_val_manifest.csv')
def main():
    """Convert an IARPA Babel corpus and create a manifest for it.

    ``args.data_dir`` must exist; it may be an already unpacked directory or
    an archive, which is then extracted below ``<target_dir>/CV_unpacked``.
    Every file under a path containing ``/audio`` that has a same-named
    ``.txt`` file in the corresponding ``/transcription`` directory is passed
    to ``convert_to_wav``.

    Raises:
        RuntimeError: if ``args.data_dir`` is unset or does not exist.
    """
    target_dir = args.target_dir
    os.makedirs(target_dir, exist_ok=True)

    target_unpacked_dir = os.path.join(target_dir, "CV_unpacked")
    os.makedirs(target_unpacked_dir, exist_ok=True)

    if args.data_dir and os.path.exists(args.data_dir):
        print('Find existing file {}'.format(args.data_dir))
    else:
        raise RuntimeError(
            "Could not find downloaded IARPA babel corpus, please download the relevant corpus from LDC"
        )

    if os.path.isdir(args.data_dir):
        print("Identified unpacked IARPA dataset")
        unpacked_location = args.data_dir
    else:
        print("Unpacking corpus to {} ...".format(target_unpacked_dir))
        # data_dir exists but is not a directory, so it is the archive itself.
        # (The previous code opened an undefined name here.)
        with tarfile.open(args.data_dir) as tar:
            tar.extractall(target_unpacked_dir)
        unpacked_location = target_unpacked_dir

    # Turn the input path into a flat name usable as a directory and as a
    # manifest prefix.
    path_flattened = re.sub(r"[\/]", "_", os.path.splitext(args.data_dir)[0])
    os.makedirs(os.path.join(target_dir, path_flattened), exist_ok=True)
    roots = {}
    # collect all the filepaths
    for root, dirs, files in os.walk(unpacked_location):
        roots[root] = files

    audio_trans_pairs = []  # this is a list of tuples
    for root in roots:
        # find all the audio directories
        if re.search(r"/audio", root):
            transcription_root = re.sub(r"/audio", "/transcription", root)
            print(transcription_root)
            for fp in roots[root]:
                txt_fp = re.sub(r"\.wav", ".txt", fp)
                # Keep only audio files that have a matching transcript.
                if os.path.exists(os.path.join(transcription_root, txt_fp)):
                    pair_tuple = (os.path.join(transcription_root,
                                               txt_fp), os.path.join(root, fp))
                    audio_trans_pairs.append(pair_tuple)

    for txt_path, audio_path in audio_trans_pairs:
        convert_to_wav(txt_path, audio_path,
                       os.path.join(target_dir, path_flattened))

    # make a separate manifest for each
    print('Creating manifests...')
    create_manifest(os.path.join(target_dir, path_flattened),
                    path_flattened + '_manifest.csv', args.min_duration,
                    args.max_duration)
예제 #7
0
def main():
    """Fetch the requested LibriSpeech archives and build per-split manifests.

    An archive is processed only when its URL contains one of the
    comma-separated entries of ``args.files_to_use``. Each processed archive
    is extracted, its ``.flac`` files are handed to ``_process_file``, and
    both the archive and the extracted tree are deleted afterwards.
    """
    download_root = args.target_dir
    if not os.path.exists(download_root):
        os.makedirs(download_root)
    wanted_tokens = args.files_to_use.strip().split(',')

    for split_type, split_urls in LIBRI_SPEECH_URLS.items():
        split_dir = os.path.join(download_root, split_type)
        wav_out_dir = os.path.join(split_dir, "wav")
        txt_out_dir = os.path.join(split_dir, "txt")
        for needed_dir in (split_dir, wav_out_dir, txt_out_dir):
            if not os.path.exists(needed_dir):
                os.makedirs(needed_dir)

        # Remove any extraction tree left over from an earlier run.
        unpacked_dir = os.path.join(split_dir, "LibriSpeech")
        if os.path.exists(unpacked_dir):
            shutil.rmtree(unpacked_dir)

        for url in split_urls:
            # check if we want to dl this file
            if not any(url.find(token) != -1 for token in wanted_tokens):
                print("Skipping url: {}".format(url))
                continue

            archive_name = url.split("/")[-1]
            archive_path = os.path.join(split_dir, archive_name)
            if not os.path.exists(archive_path):
                wget.download(url, split_dir)

            print("Unpacking {}...".format(archive_name))
            archive = tarfile.open(archive_path)
            archive.extractall(split_dir)
            archive.close()
            os.remove(archive_path)

            print("Converting flac files to wav and extracting transcripts...")
            assert os.path.exists(unpacked_dir), \
                "Archive {} was not properly uncompressed.".format(archive_name)
            for walk_root, _subdirs, file_names in os.walk(unpacked_dir):
                for file_name in file_names:
                    if file_name.find(".flac") == -1:
                        continue
                    _process_file(wav_dir=wav_out_dir,
                                  txt_dir=txt_out_dir,
                                  base_filename=file_name,
                                  root_dir=walk_root)

            print("Finished {}".format(url))
            shutil.rmtree(unpacked_dir)

        create_manifest(split_dir, 'libri_' + split_type)
예제 #8
0
def main():
    """Download, unpack and convert the selected LibriSpeech archives.

    Only archives whose URL contains one of the comma-separated
    ``args.files_to_use`` entries are processed. After conversion a manifest
    is created per split; the ``train`` split is additionally given
    ``args.min_duration`` / ``args.max_duration``.
    """
    target_dl_dir = args.target_dir
    if not os.path.exists(target_dl_dir):
        os.makedirs(target_dl_dir)
    files_to_dl = args.files_to_use.strip().split(',')
    for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items():
        split_dir = os.path.join(target_dl_dir, split_type)
        if not os.path.exists(split_dir):
            os.makedirs(split_dir)
        split_wav_dir = os.path.join(split_dir, "wav")
        if not os.path.exists(split_wav_dir):
            os.makedirs(split_wav_dir)
        split_txt_dir = os.path.join(split_dir, "txt")
        if not os.path.exists(split_txt_dir):
            os.makedirs(split_txt_dir)
        extracted_dir = os.path.join(split_dir, "LibriSpeech")
        # Start from a clean extraction directory left by an earlier run.
        if os.path.exists(extracted_dir):
            shutil.rmtree(extracted_dir)
        for url in lst_libri_urls:
            # check if we want to dl this file
            if not any(url.find(f) != -1 for f in files_to_dl):
                print("Skipping url: {}".format(url))
                continue
            filename = url.split("/")[-1]
            target_filename = os.path.join(split_dir, filename)
            if not os.path.exists(target_filename):
                wget.download(url, split_dir)
            print("Unpacking {}...".format(filename))
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(target_filename) as tar:
                tar.extractall(split_dir)
            os.remove(target_filename)
            print("Converting flac files to wav and extracting transcripts...")
            assert os.path.exists(extracted_dir), "Archive {} was not properly uncompressed.".format(filename)
            for root, subdirs, files in tqdm(os.walk(extracted_dir)):
                for f in files:
                    if f.find(".flac") != -1:
                        _process_file(wav_dir=split_wav_dir, txt_dir=split_txt_dir,
                                      base_filename=f, root_dir=root)

            print("Finished {}".format(url))
            shutil.rmtree(extracted_dir)
        if split_type == 'train':  # Prune to min/max duration
            create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv', args.min_duration, args.max_duration)
        else:
            create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv')
예제 #9
0
    def test_run_media_linker_during_adapter(self):
        """Check that the adapter applies, or skips, the named media linker.

        Reading with ``media_linker_name="example"`` must leave a truthy
        'from_test_linker' entry in the first clip's media reference metadata;
        reading with ``media_linker_name=None`` must leave that entry absent.
        """
        active_manifest = otio.plugins.ActiveManifest()

        temp_manifest = utils.create_manifest()
        # this wires up the media linkers into the active manifest
        active_manifest.media_linkers.extend(temp_manifest.media_linkers)

        # linker requested while reading from a file
        timeline = self.adp.read_from_file("foo", media_linker_name="example")
        first_ref_metadata = timeline.tracks[0][0].media_reference.metadata
        self.assertTrue(first_ref_metadata.get('from_test_linker'))

        # linker requested while reading from a string
        timeline = self.adp.read_from_string("foo", media_linker_name="example")
        first_ref_metadata = timeline.tracks[0][0].media_reference.metadata
        self.assertTrue(first_ref_metadata.get('from_test_linker'))

        # explicitly turn the media_linker off
        timeline = self.adp.read_from_file("foo", media_linker_name=None)
        first_ref_metadata = timeline.tracks[0][0].media_reference.metadata
        self.assertIsNone(first_ref_metadata.get('from_test_linker'))

        # Delete the temporary manifest
        utils.remove_manifest(temp_manifest)
예제 #10
0
def main():
    """Download the AN4 corpus, format it and create train/val manifests.

    The archive is downloaded into and extracted in the current working
    directory; the extracted ``an4/`` tree and the archive are deleted once
    ``_format_data`` has run for the train and test sets.
    """
    root_path = 'an4/'
    name = 'an4'
    wget.download('http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz')
    # Close the archive before it is deleted below; it used to stay open.
    with tarfile.open('an4_raw.bigendian.tar.gz') as tar:
        tar.extractall()
    os.makedirs(args.target_dir)
    _format_data(root_path, 'train', name, 'an4_clstk')
    _format_data(root_path, 'test', name, 'an4test_clstk')
    shutil.rmtree(root_path)
    os.remove('an4_raw.bigendian.tar.gz')
    train_path = args.target_dir + '/train/'
    test_path = args.target_dir + '/test/'
    print ('\n', 'Creating manifests...')
    create_manifest(train_path, 'an4_train_manifest.csv', args.min_duration, args.max_duration)
    create_manifest(test_path, 'an4_val_manifest.csv')
예제 #11
0
def main():
    """Download the AN4 corpus, format it and create train/val manifests.

    The archive is downloaded into and extracted in the current working
    directory; the extracted ``an4/`` tree and the archive are deleted once
    ``_format_data`` has run for the train and test sets.
    """
    root_path = 'an4/'
    name = 'an4'
    wget.download(
        'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz')
    # Close the archive before it is deleted below; it used to stay open.
    with tarfile.open('an4_raw.bigendian.tar.gz') as tar:
        tar.extractall()
    os.makedirs(args.target_dir)
    _format_data(root_path, 'train', name, 'an4_clstk')
    _format_data(root_path, 'test', name, 'an4test_clstk')
    shutil.rmtree(root_path)
    os.remove('an4_raw.bigendian.tar.gz')
    train_path = args.target_dir + '/train/'
    test_path = args.target_dir + '/test/'
    print('\n', 'Creating manifests...')
    create_manifest(train_path, 'an4_train')
    create_manifest(test_path, 'an4_val')
예제 #12
0
def main():
    """Obtain and unpack TEDLIUM release 2, then create its three manifests.

    Uses ``args.tar_path`` when that file exists, otherwise downloads the
    archive into ``args.target_dir``. Unpacking is skipped when the
    ``TEDLIUM_release2`` directory is already present. Only the train
    manifest is given the min/max duration bounds.
    """
    target_dl_dir = args.target_dir
    if not os.path.exists(target_dl_dir):
        os.makedirs(target_dl_dir)

    target_unpacked_dir = os.path.join(target_dl_dir, "TEDLIUM_release2")
    if args.tar_path and os.path.exists(args.tar_path):
        target_file = args.tar_path
    else:
        print("Could not find downloaded TEDLIUM archive, Downloading corpus...")
        wget.download(TED_LIUM_V2_DL_URL, target_dl_dir)
        target_file = os.path.join(target_dl_dir, "TEDLIUM_release2.tar.gz")

    if not os.path.exists(target_unpacked_dir):
        print("Unpacking corpus...")
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(target_file) as tar:
            tar.extractall(target_dl_dir)
    else:
        print("Found TEDLIUM directory, skipping unpacking of tar files")

    train_ted_dir = os.path.join(target_unpacked_dir, "train")
    val_ted_dir = os.path.join(target_unpacked_dir, "dev")
    test_ted_dir = os.path.join(target_unpacked_dir, "test")

    prepare_dir(train_ted_dir)
    prepare_dir(val_ted_dir)
    prepare_dir(test_ted_dir)
    print('Creating manifests...')

    create_manifest(train_ted_dir, 'ted_train_manifest.csv', args.min_duration, args.max_duration)
    create_manifest(val_ted_dir, 'ted_val_manifest.csv')
    create_manifest(test_ted_dir, 'ted_test_manifest.csv')
예제 #13
0
def main():
    """Create the train/dev/test output directories and their manifests.

    Reads ``target_dir`` and ``source_dir`` from the externally defined
    ``args``. The ``prepare_dir`` calls are currently disabled, so only the
    directories and the manifests are produced.
    """
    target_dl_dir = args.target_dir
    source_dl_dir = args.source_dir
    # speech_only/speech_and_noise

    if not os.path.exists(target_dl_dir):
        os.makedirs(target_dl_dir)

    # prepare target dir
    target_train_dir = os.path.join(target_dl_dir, "train")
    if not os.path.exists(target_train_dir):
        os.makedirs(target_train_dir)
    target_val_dir = os.path.join(target_dl_dir, "dev")
    if not os.path.exists(target_val_dir):
        os.makedirs(target_val_dir)
    target_test_dir = os.path.join(target_dl_dir, "test")
    if not os.path.exists(target_test_dir):
        os.makedirs(target_test_dir)
    # source dir
    source_train = os.path.join(source_dl_dir, "train")
    source_val = os.path.join(source_dl_dir, "dev")
    source_test = os.path.join(source_dl_dir, "test")

    # Single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the former print statements were a syntax error on Python 3.
    print(" prepare data for train  ")
    #prepare_dir(target_train_dir, source_train,"train")
    print(" prepare data for dev  ")
    #prepare_dir(target_val_dir, source_val,"dev")
    print(" prepare data for test  ")
    #prepare_dir(target_test_dir, source_test,"test")
    print('Creating manifests...')
    print(" target_train_dir  ", target_train_dir)

    create_manifest(target_dl_dir, target_train_dir, 'train')
    create_manifest(target_dl_dir, target_val_dir, 'val')
    create_manifest(target_dl_dir, target_test_dir, 'test')
예제 #14
0
def main():
    """Obtain and unpack TEDLIUM release 2, then create its three manifests.

    A local archive given by ``args.tar_path`` is preferred over downloading.
    Extraction is skipped when the ``TEDLIUM_release2`` directory already
    exists under ``args.target_dir``.
    """
    download_root = args.target_dir
    if not os.path.exists(download_root):
        os.makedirs(download_root)

    corpus_root = os.path.join(download_root, "TEDLIUM_release2")

    use_local_archive = args.tar_path and os.path.exists(args.tar_path)
    if use_local_archive:
        archive_path = args.tar_path
    else:
        print(
            "Could not find downloaded TEDLIUM archive, Downloading corpus...")
        wget.download(TED_LIUM_V2_DL_URL, download_root)
        archive_path = os.path.join(download_root, "TEDLIUM_release2.tar.gz")

    if os.path.exists(corpus_root):
        print("Found TEDLIUM directory, skipping unpacking of tar files")
    else:
        print("Unpacking corpus...")
        archive = tarfile.open(archive_path)
        archive.extractall(download_root)
        archive.close()

    # (sub-directory, manifest prefix) for each split, in processing order.
    splits = (("train", 'ted_train'), ("dev", 'ted_val'), ("test", 'ted_test'))
    split_dirs = [os.path.join(corpus_root, sub_dir) for sub_dir, _ in splits]

    for split_dir in split_dirs:
        prepare_dir(split_dir)
    print('Creating manifests...')

    for split_dir, (_, manifest_prefix) in zip(split_dirs, splits):
        create_manifest(split_dir, manifest_prefix, download_root)
예제 #15
0
def main():
    """Download the AN4 corpus, format it and create train/val manifests.

    The archive is downloaded into and extracted in the current working
    directory; the extracted ``an4/`` tree and the archive are deleted once
    ``_format_data`` has run for the train and test sets.
    """
    root_path = "an4/"
    name = "an4"
    wget.download(
        "http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz")
    # Close the archive before it is deleted below; it used to stay open.
    with tarfile.open("an4_raw.bigendian.tar.gz") as tar:
        tar.extractall()
    os.makedirs(args.target_dir)
    _format_data(root_path, "train", name, "an4_clstk")
    _format_data(root_path, "test", name, "an4test_clstk")
    shutil.rmtree(root_path)
    os.remove("an4_raw.bigendian.tar.gz")
    train_path = args.target_dir + "/train/"
    test_path = args.target_dir + "/test/"
    print("\n", "Creating manifests...")
    create_manifest(
        train_path,
        "an4_train_manifest.csv",
        args.min_duration,
        args.max_duration,
    )
    create_manifest(test_path, "an4_val_manifest.csv")
    def test_run_media_linker_during_adapter(self):
        """Check that the adapter applies, or skips, the named media linker.

        Reading with ``media_linker_name="example"`` must leave a truthy
        'from_test_linker' entry in the first clip's media reference metadata;
        reading with ``media_linker_name=None`` must leave that entry absent.
        """
        mfest = otio.plugins.ActiveManifest()

        manifest = utils.create_manifest()
        try:
            # this wires up the media linkers into the active manifest
            mfest.media_linkers.extend(manifest.media_linkers)
            fake_tl = self.adp.read_from_file("foo", media_linker_name="example")

            self.assertTrue(fake_tl.tracks[0][0].media_reference.metadata.get(
                'from_test_linker'))

            fake_tl = self.adp.read_from_string("foo", media_linker_name="example")

            self.assertTrue(fake_tl.tracks[0][0].media_reference.metadata.get(
                'from_test_linker'))

            # explicitly turn the media_linker off
            fake_tl = self.adp.read_from_file("foo", media_linker_name=None)
            self.assertIsNone(fake_tl.tracks[0][0].media_reference.metadata.get(
                'from_test_linker'))
        finally:
            # Delete the temporary manifest, even when an assertion above
            # fails, so a failing run does not leave it behind.
            utils.remove_manifest(manifest)
예제 #17
0
def main():
    """Download the AN4 corpus, format it and create train/val/test manifests.

    The download is skipped when the archive already exists in the current
    working directory. The extracted ``an4/`` tree and the archive are
    deleted after formatting. The train and val manifests are given the
    min/max duration bounds; the test manifest is not.
    """
    root_path = 'an4/'
    raw_tar_path = 'an4_raw.bigendian.tar.gz'
    if not os.path.exists(raw_tar_path):
        wget.download(
            'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz'
        )
    # Use the one archive path throughout, and close the archive before it
    # is deleted below; it used to stay open.
    with tarfile.open(raw_tar_path) as tar:
        tar.extractall()
    os.makedirs(args.target_dir)
    _format_training_data(root_path=root_path)
    _format_test_data(root_path=root_path)
    shutil.rmtree(root_path)
    os.remove(raw_tar_path)
    train_path = args.target_dir + '/train/'
    val_path = args.target_dir + '/val/'
    test_path = args.target_dir + '/test/'
    print('Creating manifests...')
    create_manifest(train_path, 'an4_train_manifest.csv', args.min_duration,
                    args.max_duration)
    create_manifest(val_path, 'an4_val_manifest.csv', args.min_duration,
                    args.max_duration)
    create_manifest(test_path, 'an4_test_manifest.csv')
예제 #18
0
def main():
    """Prepare the train/dev/test splits and create a manifest for each.

    Reads ``target_dir`` and ``source_dir`` from the externally defined
    ``args``. Each manifest is created from the ``converted`` sub-directory
    of the corresponding target split directory.
    """
    target_dl_dir = args.target_dir
    source_dl_dir = args.source_dir
    if not os.path.exists(target_dl_dir):
        os.makedirs(target_dl_dir)

    # prepare target dir
    target_train_dir = os.path.join(target_dl_dir, "train")
    if not os.path.exists(target_train_dir):
        os.makedirs(target_train_dir)
    target_val_dir = os.path.join(target_dl_dir, "dev")
    if not os.path.exists(target_val_dir):
        os.makedirs(target_val_dir)
    target_test_dir = os.path.join(target_dl_dir, "test")
    if not os.path.exists(target_test_dir):
        os.makedirs(target_test_dir)
    # source dir
    source_train = os.path.join(source_dl_dir, "train")
    source_val = os.path.join(source_dl_dir, "dev")
    source_test = os.path.join(source_dl_dir, "test")

    # NOTE(review): data_type is not assigned in this function; presumably a
    # module-level name - confirm it is defined before main() runs.
    # The prints below use a single pre-formatted argument so the output is
    # the same on Python 2 and 3 (the old print statements were Python 2 only).
    print(" prepare data for train  ")
    prepare_dir(target_train_dir, source_train, data_type)
    print(" prepare data for dev  ")
    prepare_dir(target_val_dir, source_val, data_type)
    print(" prepare data for test  ")
    prepare_dir(target_test_dir, source_test, data_type)
    print('Creating manifests...')
    print(" target_train_dir  {}".format(target_train_dir))
    print(" target_dl_dir   {}".format(target_dl_dir))
    create_manifest(target_dl_dir, os.path.join(target_train_dir, "converted"), 'train')
    create_manifest(target_dl_dir, os.path.join(target_val_dir, "converted"), 'val')
    create_manifest(target_dl_dir, os.path.join(target_test_dir, "converted"), 'test')
    def local_product_func(self,parameters):
        """Ingest a local product described by ``parameters``.

        Reads the keys "scenario_id", "ncn_id", "data", "dir_path",
        "s2_preprocess", "metadata" and "scripts" from ``parameters``.
        The steps are: unpack the data, optionally run the S2 pre-processing
        scripts, create a manifest, then run the first configured script
        with "-add" arguments. Progress is reported through
        ``self._wfm.set_scenario_status``.

        Exceptions raised inside the ``try`` block are logged and set the
        scenario status to "INGEST ERROR"; a ``StopRequest`` is logged and
        sets the status back to "IDLE". The ingestion pid is reset to 0 in
        all cases.
        """
        if IE_DEBUG > 0:
            self._logger.info(
                "wfm: executing INGEST LOCAL PRODUCT, id=" +
                repr(parameters["scenario_id"]))

        percent = 1
        ncn_id = None
        n_errors = 0
        try:
            sc_id = parameters["scenario_id"]
            self._wfm.set_scenario_status(
                self._id, sc_id, 0, "LOCAL ING.: UNPACK", percent)
            self._wfm.set_ingestion_pid(sc_id, os.getpid())
            ncn_id = parameters["ncn_id"].encode('ascii','ignore')

            data = parameters["data"]
            orig_data = None
            data = ie_unpack_maybe(parameters["dir_path"], data)
            if not data:
                # NOTE(review): `data` is passed as a second argument to
                # IngestionError, not to os.path.join - confirm intent.
                raise IngestionError(
                    "Error unpacking or accessing " +
                    os.path.join(parameters["dir_path"]), data)

            if 'NO' != parameters["s2_preprocess"]:
                s2script_args = self.mk_s2pre_scriptandargs(
                    parameters["s2_preprocess"],
                    parameters["dir_path"],
                    parameters["metadata"])

                if s2script_args:

                    self._wfm.set_scenario_status(
                        self._id, sc_id, 0, "LOCAL ING.: S2-PRE", percent)

                    s2pre_errors = self.run_scripts(sc_id, ncn_id, s2script_args)
                    if s2pre_errors > 0:
                        n_errors += s2pre_errors
                    else:
                        # Pre-processing succeeded: keep the original data
                        # reference and continue with the script's output.
                        orig_data = data
                        data = extract_outfile(s2script_args[0][3])

            mf_name = create_manifest(
                self._logger,
                ncn_id,
                parameters["dir_path"],
                metadata=parameters["metadata"],
                data=data,
                orig_data=orig_data
                )

            self._wfm.set_scenario_status(
                self._id, sc_id, 0, "RUNNING SCRIPTS", percent)

            scripts_args = []

            scripts = parameters["scripts"]
            if len(scripts) > 0:
                # Only the first configured script is run, in "-add" mode.
                resp_fname = mkFname("addProdResp_")
                dl_dir = parameters["dir_path"]
                resp_full_fname = os.path.join(dl_dir,resp_fname)
                ap_script = [scripts[0]]
                ap_script.append("-add")
                ap_script.append("-dldir="+dl_dir)
                ap_script.append("-response="+resp_fname)
                metadata=parameters["metadata"]
                if metadata is not None:
                    ap_script.append("-meta="+get_base_fname(metadata))
                ap_script.append("-data="+get_base_fname(data))
                scripts_args.append(ap_script)

            n_errors += self.run_scripts(sc_id, ncn_id, scripts_args)

            if n_errors > 0:
                raise IngestionError("Number of errors " + repr(n_errors))
            self._wfm.set_scenario_status(self._id, sc_id, 1, "IDLE", 0)
            self._logger.info("Local ingestion completed, dir: " +
                              parameters["dir_path"])

        except StopRequest as e:
            self._logger.info(repr(ncn_id) +
                              ": Stop request from user: Local Ingestion Stopped")
            self._wfm.set_scenario_status(self._id, sc_id, 1, "IDLE", 0)

        except Exception as e:
            self._logger.error(repr(ncn_id) + " Error while ingesting local product: " + repr(e))
            self._wfm.set_scenario_status(self._id, sc_id, 1, "INGEST ERROR", 0)
            if IE_DEBUG > 0:
                traceback.print_exc(12,sys.stdout)

        finally:
            self._wfm.set_ingestion_pid(sc_id, 0)
예제 #20
0
                continue
            utterance = transcriptions[transcription_key]
            # print(utterance)
            target_wav_file = os.path.join(
                wav_dir, "{}_{}.wav".format(recording_name, recording_id))
            target_txt_file = os.path.join(
                txt_dir, "{}_{}.txt".format(recording_name, recording_id))
            with io.FileIO(target_txt_file, "w") as file:
                file.write(utterance.lower().encode('utf-8'))
            original_wav_file = os.path.join(recordings_dir, wav_file)
            subprocess.call([
                "sox {}  -r {} -b 16 -c 1 {}".format(
                    original_wav_file, str(args.sample_rate), target_wav_file)
            ],
                            shell=True)
    shutil.rmtree(dirpath)


if __name__ == '__main__':
    # Script entry point: walk the input directory, prepare every *.tgz
    # sample found, then create the manifest.
    target_dir = args.target_dir
    sample_rate = args.sample_rate
    input_dir = args.input_dir
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    for root, root_dir_names, filenames in os.walk(input_dir):
        # One pre-formatted argument gives the same space-separated output on
        # Python 2 and 3; the former print statement was Python 2 only.
        print("{} {} {}".format(root, root_dir_names, len(filenames)))
        for filename in fnmatch.filter(filenames, '*.tgz'):
            prepare_sample(os.path.join(root, filename), target_dir)
    print('Creating manifests...')
    create_manifest(target_dir, 'voxforge_train')
예제 #21
0
 def setUp(self):
     """Create a manifest through ``utils`` and store it on ``self.man``."""
     created_manifest = utils.create_manifest()
     self.man = created_manifest
예제 #22
0
 def setUp(self):
     """Create the manifest and load the linker baseline for each test.

     Stores the manifest on ``self.man``, the baseline JSON string on
     ``self.jsn`` and the object read from that JSON on ``self.mln``, whose
     ``_json_path`` is set to the baseline file's location.
     """
     self.man = utils.create_manifest()

     baseline_json = baseline_reader.json_baseline_as_string(LINKER_PATH)
     self.jsn = baseline_json
     self.mln = otio.adapters.otio_json.read_from_string(self.jsn)

     baseline_file = os.path.join(
         baseline_reader.MODPATH,
         "baselines",
         LINKER_PATH,
     )
     self.mln._json_path = baseline_file
예제 #23
0
                transcription_key = recording_name + "/mfc/" + recording_id
                if transcription_key not in transcriptions:
                    continue
                utterance = transcriptions[transcription_key]

                target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(recording_name, recording_id))
                target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id))
                with io.FileIO(target_txt_file, "w") as file:
                    file.write(utterance.encode('utf-8'))
                original_wav_file = os.path.join(recordings_dir, wav_file)
                subprocess.call(["sox {}  -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate),
                                                                      target_wav_file)], shell=True)

        shutil.rmtree(dirpath)


if __name__ == '__main__':
    # Script entry point: read the VoxForge index page, prepare every
    # linked .tgz sample, then create the training manifest.
    target_dir = args.target_dir
    sample_rate = args.sample_rate

    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    request = urllib.request.Request(VOXFORGE_URL_16kHz)
    # Close the HTTP response as soon as the index page has been read.
    with urllib.request.urlopen(request) as response:
        content = response.read()
    # Raw string: "\=" and "\." are regex escapes, not string escapes, and
    # are invalid escape sequences in a plain string literal.
    all_files = re.findall(r'href\="(.*\.tgz)"', content.decode("utf-8"))
    for f in tqdm(all_files, total=len(all_files)):
        prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + '/' + f, target_dir)
    print('Creating manifests...')
    create_manifest(target_dir, 'voxforge_train_manifest.csv', args.min_duration, args.max_duration)
예제 #24
0
                            str(args.sample_rate),
                            target_wav_file,
                        )
                    ],
                    shell=True,
                )

        shutil.rmtree(dirpath)


if __name__ == "__main__":
    # Script entry point: read the VoxForge index page, prepare every
    # linked .tgz sample, then create the training manifest.
    target_dir = args.target_dir
    sample_rate = args.sample_rate

    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    request = urllib.request.Request(VOXFORGE_URL_16kHz)
    # Close the HTTP response as soon as the index page has been read.
    with urllib.request.urlopen(request) as response:
        content = response.read()
    # Raw string: "\=" and "\." are regex escapes, not string escapes, and
    # are invalid escape sequences in a plain string literal.
    all_files = re.findall(r'href\="(.*\.tgz)"', content.decode("utf-8"))
    for f in tqdm(all_files, total=len(all_files)):
        prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f,
                       target_dir)
    print("Creating manifests...")
    create_manifest(
        target_dir,
        "voxforge_train_manifest.csv",
        args.min_duration,
        args.max_duration,
    )
예제 #25
0
                transcription_key = recording_name + "/mfc/" + recording_id
                if transcription_key not in transcriptions:
                    continue
                utterance = transcriptions[transcription_key]

                target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(recording_name, recording_id))
                target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id))
                with io.FileIO(target_txt_file, "w") as file:
                    file.write(utterance.encode('utf-8'))
                original_wav_file = os.path.join(recordings_dir, wav_file)
                subprocess.call(["sox {}  -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate),
                                                                      target_wav_file)], shell=True)

        shutil.rmtree(dirpath)


if __name__ == '__main__':
    # Script entry point: read the VoxForge index page, prepare every
    # linked .tgz sample, then create the training manifest.
    target_dir = args.target_dir
    sample_rate = args.sample_rate

    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    request = urllib.request.Request(VOXFORGE_URL_16kHz)
    # Close the HTTP response as soon as the index page has been read.
    with urllib.request.urlopen(request) as response:
        content = response.read()
    # Raw string: "\=" and "\." are regex escapes, not string escapes, and
    # are invalid escape sequences in a plain string literal.
    all_files = re.findall(r'href\="(.*\.tgz)"', content.decode("utf-8"))
    for f in tqdm(all_files, total=len(all_files)):
        prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, target_dir)
    print('Creating manifests...')
    create_manifest(target_dir, 'voxforge_train_manifest.csv', args.min_duration, args.max_duration)
 def setUp(self):
     """Create a manifest through ``utils`` and store it on ``self.man``."""
     created_manifest = utils.create_manifest()
     self.man = created_manifest