def clean_transcriptions(corpus):
    """Normalize every word-transcript label of *corpus* in place.

    Iterates all utterances with a tqdm progress bar and replaces each
    label value with the first element returned by
    text_cleaning.clean_sentence.
    """
    print("Cleaning transcriptions ...")
    for utt in tqdm.tqdm(corpus.utterances.values()):
        word_labels = utt.label_lists[audiomate.corpus.LL_WORD_TRANSCRIPT]
        for lbl in word_labels:
            # clean_sentence returns a sequence; the cleaned text is item 0.
            lbl.value = text_cleaning.clean_sentence(lbl.value)[0]
def clean_transcriptions(corpus):
    """Clean the first word-transcript label of each utterance in place."""
    transcript_key = audiomate.corpus.LL_WORD_TRANSCRIPT
    for utt in corpus.utterances.values():
        # Bind the label object once instead of indexing twice.
        label = utt.label_lists[transcript_key][0]
        label.value = text_cleaning.clean_sentence(label.value)
# NOTE(review): this `return` is the tail of a function whose `def` line is
# outside this chunk — presumably the helper that reads the training csv
# used below; confirm against the full file.
    return transcripts


# Script entry: clean a text corpus line by line and write the result to a
# second file; optionally append transcripts taken from training data.
parser = argparse.ArgumentParser(description='Clean text corpus.')
parser.add_argument('source_path', type=str)
parser.add_argument('target_path', type=str)
parser.add_argument('--training_csv', type=str)
args = parser.parse_args()

index = 0
with open(args.source_path, 'r') as source_file, open(args.target_path, 'w') as target_file:
    for index, line in enumerate(source_file):
        cleaned_sentence = text_cleaning.clean_sentence(line)
        target_file.write('{}\n'.format(cleaned_sentence))
        # Progress heartbeat every 1000 input lines.
        if index % 1000 == 0:
            print(index)

    print('Cleaned {} lines!'.format(index))

    # Optionally append the transcripts from the training csv so they end
    # up in the cleaned corpus as well.
    if args.training_csv is not None:
        training_transcripts = read_training_transcripts(args.training_csv)
        target_file.write('\n'.join(training_transcripts))
        print('Added {} transcripts from training data!'.format(
            len(training_transcripts)))
def main():
    """Clean, shuffle, filter and/or sort a csv dataset.

    Reads ``input_csv_path``, applies the operations selected by the
    command-line flags and writes the result to ``output_csv_path``.
    Unless ``--nostats`` is given, dataset statistics are printed before
    and after, together with a summary of what was removed.
    """
    parser = argparse.ArgumentParser(description="Clean and shuffle datasets")
    parser.add_argument("input_csv_path", type=str)
    parser.add_argument("output_csv_path", type=str)
    parser.add_argument("--shuffle", action="store_true")
    parser.add_argument("--sort", action="store_true",
                        help="Sort dataset by filesize")
    parser.add_argument("--replace", action="store_true")
    parser.add_argument("--clean", action="store_true")
    parser.add_argument("--exclude", action="store_true")
    parser.add_argument("--nostats", action="store_true")
    args = parser.parse_args()

    # Parallel pandas backend; required for parallel_apply in the
    # --replace step below.
    pandarallel.initialize()
    print("This may take a few minutes ... ")

    # Files manually marked for exclusion, shipped next to this script.
    file_path = os.path.dirname(os.path.realpath(__file__)) + "/"
    with open(file_path + "../data/excluded_files.json") as json_file:
        excluded = json.load(json_file)

    # Bail out early when no operation flag was selected at all.
    if not (args.shuffle or args.replace or args.clean or args.exclude):
        print("No operation given")
        exit()

    start_time = time.time()

    # Keep the german 0 as "null" string
    data = pd.read_csv(args.input_csv_path, keep_default_na=False)

    if not args.nostats:
        # Add statistics columns, save start size and duration and print data statistics
        data = add_statistics_columns(data)
        size_start = len(data)
        duration_start = data["duration"].sum()
        print_statistics(data)

    if args.exclude:
        # Drop the files listed in excluded_files.json.
        length_old = len(data)
        data = data[~data["wav_filename"].isin(excluded)]
        msg = "Excluded {} files which were marked for exclusion"
        print(msg.format(length_old - len(data)))

    if args.shuffle:
        data = data.reindex(np.random.permutation(data.index))

    if args.sort:
        data = data.sort_values("wav_filesize")
        data = data.reset_index(drop=True)

    if args.replace:
        # Lowercase, then clean each transcript in parallel;
        # clean_sentence returns a sequence whose item 0 is the text.
        data["transcript"] = data["transcript"].str.lower()
        data["transcript"] = data["transcript"].parallel_apply(
            lambda x: text_cleaning.clean_sentence(x, replace_umlauts=True)[0])

    # clean() relies on the statistics columns, so it is skipped with --nostats.
    if args.clean and not args.nostats:
        data = clean(data)

    if not args.nostats:
        # Print statistics again, save end size and duration and drop temporary columns
        size_end = len(data)
        time_end = data["duration"].sum()
        size_diff = size_start - size_end
        time_diff = duration_start - time_end
        print_statistics(data)
        data = data.drop(
            columns=["duration", "text_length", "avg_time_per_char"])

        # Print summary
        msg = "Excluded in total {} of {} files, those are {:.1f}% of all files"
        print(msg.format(size_diff, size_start, size_diff / size_start * 100))
        msg = "This are {} of {} hours, those are {:.1f}% of the full duration"
        msg = msg.format(
            seconds_to_hours(time_diff),
            seconds_to_hours(duration_start),
            time_diff / duration_start * 100,
        )
        print(msg)
        msg = "Your dataset now has {} files and a duration of {} hours\n"
        print(msg.format(size_end, seconds_to_hours(time_end)))

    data.to_csv(args.output_csv_path, index=False, encoding="utf-8")

    end_time = time.time()
    msg = "Preparation took {} hours\n"
    print(msg.format(seconds_to_hours(end_time - start_time)))
def clean_transcriptions(corpus):
    """Run sentence cleaning over every word-transcript label in *corpus*.

    Mutates the label values in place; nothing is returned.
    """
    transcript_key = audiomate.corpus.LL_WORD_TRANSCRIPT
    for utt in corpus.utterances.values():
        for label in utt.label_lists[transcript_key]:
            label.value = text_cleaning.clean_sentence(label.value)
def clean_transcriptions(corpus):
    """Clean the first 'transcription' label of every utterance in place."""
    for utt in corpus.utterances.values():
        # Bind the label object once; read, clean and write back its value.
        label = utt.label_lists['transcription'][0]
        label.value = text_cleaning.clean_sentence(label.value)