Example #1
import tqdm
import audiomate

import text_cleaning  # project-local module providing clean_sentence()


def clean_transcriptions(corpus):
    print("Cleaning transcriptions ...")
    for utterance in tqdm.tqdm(corpus.utterances.values()):
        ll = utterance.label_lists[audiomate.corpus.LL_WORD_TRANSCRIPT]

        for label in ll:
            # clean_sentence() returns a tuple in this variant; keep the text
            label.value = text_cleaning.clean_sentence(label.value)[0]
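For context, a minimal usage sketch assuming audiomate's Corpus.load/save_at interface; the paths and the reader name are placeholders, not from the original source:

import audiomate

corpus = audiomate.Corpus.load('/path/to/corpus', reader='common-voice')
clean_transcriptions(corpus)
corpus.save_at('/path/to/cleaned-corpus')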
Example #2
import audiomate

import text_cleaning  # project-local module providing clean_sentence()


def clean_transcriptions(corpus):
    for utterance in corpus.utterances.values():
        # Unlike Example #1, only the first label of each transcript is cleaned
        label = utterance.label_lists[audiomate.corpus.LL_WORD_TRANSCRIPT][0]
        label.value = text_cleaning.clean_sentence(label.value)
Example #3
import argparse
import csv

import text_cleaning  # project-local module providing clean_sentence()


def read_training_transcripts(csv_path):
    # The head of this function is missing from the source; this is a
    # sketch assuming a DeepSpeech-style csv with a "transcript" column.
    transcripts = []

    with open(csv_path, 'r') as csv_file:
        for row in csv.DictReader(csv_file):
            transcripts.append(row['transcript'])

    return transcripts


parser = argparse.ArgumentParser(description='Clean text corpus.')
parser.add_argument('source_path', type=str)
parser.add_argument('target_path', type=str)
parser.add_argument('--training_csv', type=str)

args = parser.parse_args()

index = 0  # keeps the summary print below valid if the source file is empty

with open(args.source_path, 'r') as source_file, \
        open(args.target_path, 'w') as target_file:
    for index, line in enumerate(source_file, 1):
        cleaned_sentence = text_cleaning.clean_sentence(line)
        target_file.write('{}\n'.format(cleaned_sentence))

        if index % 1000 == 0:
            print(index)

    print('Cleaned {} lines!'.format(index))

    if args.training_csv is not None:
        training_transcripts = read_training_transcripts(args.training_csv)
        target_file.write('\n'.join(training_transcripts))

        print('Added {} transcripts from training data!'.format(
            len(training_transcripts)))
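All of these snippets funnel through text_cleaning.clean_sentence. As a rough illustration (the input sentence is made up, and the signature varies between the projects above: Examples #1 and #4 index into a returned tuple, the others use the return value directly):

import text_cleaning  # project-local module

# Hypothetical call in the style of the tuple-returning variant.
cleaned = text_cleaning.clean_sentence("Er kaufte 3 Äpfel!",
                                       replace_umlauts=True)[0]
print(cleaned)  # a lowercased, normalized sentence; exact rules are project-specific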
Example #4
import argparse
import json
import os
import sys
import time

import numpy as np
import pandas as pd
from pandarallel import pandarallel

import text_cleaning  # project-local module providing clean_sentence()

# add_statistics_columns, print_statistics, clean and seconds_to_hours are
# helpers defined elsewhere in the project (see the sketch after this example)


def main():
    parser = argparse.ArgumentParser(description="Clean and shuffle datasets")
    parser.add_argument("input_csv_path", type=str)
    parser.add_argument("output_csv_path", type=str)
    parser.add_argument("--shuffle", action="store_true")
    parser.add_argument("--sort",
                        action="store_true",
                        help="Sort dataset by filesize")
    parser.add_argument("--replace", action="store_true")
    parser.add_argument("--clean", action="store_true")
    parser.add_argument("--exclude", action="store_true")
    parser.add_argument("--nostats", action="store_true")
    args = parser.parse_args()

    pandarallel.initialize()
    print("This may take a few minutes ... ")

    file_path = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(file_path, "../data/excluded_files.json")) as json_file:
        excluded = json.load(json_file)

    if not (args.shuffle or args.sort or args.replace or args.clean
            or args.exclude):
        print("No operation given")
        sys.exit()

    start_time = time.time()

    # Keep the German word "null" (zero) as a string instead of parsing it as NaN
    data = pd.read_csv(args.input_csv_path, keep_default_na=False)

    if not args.nostats:
        # Add statistics columns, save start size and duration and print data statistics
        data = add_statistics_columns(data)
        size_start = len(data)
        duration_start = data["duration"].sum()
        print_statistics(data)

    if args.exclude:
        length_old = len(data)
        data = data[~data["wav_filename"].isin(excluded)]
        msg = "Excluded {} files which were marked for exclusion"
        print(msg.format(length_old - len(data)))

    if args.shuffle:
        data = data.reindex(np.random.permutation(data.index))

    if args.sort:
        data = data.sort_values("wav_filesize")
        data = data.reset_index(drop=True)

    if args.replace:
        data["transcript"] = data["transcript"].str.lower()
        data["transcript"] = data["transcript"].parallel_apply(
            lambda x: text_cleaning.clean_sentence(x, replace_umlauts=True)[0])

    # clean() relies on the statistics columns, hence the --nostats guard
    if args.clean and not args.nostats:
        data = clean(data)

    if not args.nostats:
        # Print statistics again, save end size and duration and drop temporary columns
        size_end = len(data)
        duration_end = data["duration"].sum()
        size_diff = size_start - size_end
        duration_diff = duration_start - duration_end
        print_statistics(data)
        data = data.drop(
            columns=["duration", "text_length", "avg_time_per_char"])

        # Print summary
        msg = "Excluded in total {} of {} files, those are {:.1f}% of all files"
        print(msg.format(size_diff, size_start, size_diff / size_start * 100))

        msg = "This are {} of {} hours, those are  {:.1f}% of the full duration"
        msg = msg.format(
            seconds_to_hours(time_diff),
            seconds_to_hours(duration_start),
            time_diff / duration_start * 100,
        )
        print(msg)

        msg = "Your dataset now has {} files and a duration of {} hours\n"
        print(msg.format(size_end, seconds_to_hours(duration_end)))

    data.to_csv(args.output_csv_path, index=False, encoding="utf-8")
    end_time = time.time()
    msg = "Preparation took {} hours\n"
    print(msg.format(seconds_to_hours(end_time - start_time)))
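main() references several helpers that are not part of this excerpt. A rough sketch of two of them, inferred from how their outputs are used above; the real project versions may differ:

def seconds_to_hours(secs):
    # Sketch: the script prints durations in hours.
    return round(secs / 3600, 2)


def add_statistics_columns(data):
    # Sketch, inferred from the columns dropped later; the duration
    # estimate assumes 16 kHz, 16-bit mono wav files with 44-byte headers.
    data["duration"] = (data["wav_filesize"] - 44) / (16000 * 2)
    data["text_length"] = data["transcript"].str.len()
    data["avg_time_per_char"] = data["duration"] / data["text_length"]
    return data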
Example #5
import audiomate

import text_cleaning  # project-local module providing clean_sentence()


def clean_transcriptions(corpus):
    for utterance in corpus.utterances.values():
        ll = utterance.label_lists[audiomate.corpus.LL_WORD_TRANSCRIPT]

        for label in ll:
            label.value = text_cleaning.clean_sentence(label.value)
Example #6
import text_cleaning  # project-local module providing clean_sentence()


def clean_transcriptions(corpus):
    for utterance in corpus.utterances.values():
        # This variant uses a plain 'transcription' key and cleans only
        # the first label
        label = utterance.label_lists['transcription'][0]
        label.value = text_cleaning.clean_sentence(label.value)