def gen_csv():
    csv_train = []
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if ".csv" in file and file.startswith("metadata"):
                file_dir = os.path.join(subdir, file)
                # Too many csvs, we are short on memory
                #os.system("mv " + file_dir + " /data/home/GPUAdmin1/asr/M-AILABS/csvs/" + file)
                with open(file_dir) as csv_file:
                    csv_reader = csv.reader(csv_file, delimiter='|')
                    for row in csv_reader:
                        #print("filename: " + row[0])
                        #print("transcript: " + row[2])
                        filename = row[0]
                        transcript = row[2]
                        transcript = clean_sentence(transcript)
                        wav_file_dir = "/speech/M-AILABS/" + filename + ".wav"
                        if os.path.exists(wav_file_dir):
                            csv_train.append((wav_file_dir, transcript))
    df = pandas.DataFrame(data=csv_train)
    output_file = "/data/home/GPUAdmin1/asr/train_csvs/M-AILABS_train.csv"
    df.to_csv(output_file, header=False, index=False, sep=",")


# create dict from csvs
"""
def gen_swc_csv(root_dir=dir):
    csv_data = []
    with open("transcriptions.txt", 'r') as f:
        lines = f.readlines()
    i = 0
    for line in lines:
        i += 1
        file_name = line.split(" ", 1)[0]
        file_text = line.split(" ", 1)[1]
        sentence = file_text.split(" ")
        if len(sentence) <= 2:
            continue
        trans = clean_sentence(file_text)
        file_path = os.path.join(root_dir, file_name + ".wav")
        csv_data.append((file_path, trans))
        print("File " + str(i) + " / " + str(len(lines)), end='\r')
    print()
    print("Writing CSV File:")
    df = pandas.DataFrame(data=csv_data)
    output_file = "/home/GPUAdmin1/asr/train_csvs/swc_train.csv"
    df.to_csv(output_file, header=False, index=False, sep=",")
def rename_utterances_and_gen_csv(root_dir=dir):
    wav_files = os.path.join(root_dir, "wav_files")
    valid_wav = os.path.join(root_dir, "valid_wav")
    validated_tsv = os.path.join(root_dir, "validated.tsv")
    csv_data = []
    speakers_dict = get_dict_speakers()
    with open(validated_tsv) as f:
        lines = csv.reader(f, delimiter='\t')
        next(lines, None)
        i = 0
        for line in lines:
            client_id = line[0]
            speaker = speakers_dict.get(client_id)
            src = os.path.join(wav_files, line[1] + ".wav")
            dst = os.path.join(
                valid_wav,
                "spk{0:0=4d}".format(speaker) + "_utt{0:0=6d}.wav".format(i))
            shutil.copy(src, dst)
            trans = clean_sentence(line[2])
            csv_data.append((dst, trans))
            i += 1
            print("Renaming: " + str(i) + " / 277603 ", end="\r")
    sorted_csv = sorted(csv_data, key=lambda tup: tup[0])
    df = pandas.DataFrame(data=sorted_csv)
    output_file = "/speech/common_voice_de/common_voice_valid_wav.csv"
    df.to_csv(output_file, header=False, index=False, sep=",")
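# get_dict_speakers() is defined elsewhere in the original script. As a hypothetical
# sketch (an assumption, not the original implementation), it could map each unique
# Common Voice client_id in validated.tsv to a sequential speaker index:
def get_dict_speakers_sketch(root_dir=dir):
    speakers = {}
    with open(os.path.join(root_dir, "validated.tsv")) as f:
        rows = csv.reader(f, delimiter='\t')
        next(rows, None)  # skip header row
        for row in rows:
            # assign the next free index the first time a client_id is seen
            speakers.setdefault(row[0], len(speakers))
    return speakers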
def convert_to_wav(root_dir=dir):
    valid_wav = os.path.join(root_dir, "valid_wav")
    if not os.path.exists(valid_wav):
        os.makedirs(valid_wav)
    validated_tsv = os.path.join(root_dir, "validated.tsv")
    valid_data = []
    with open(validated_tsv) as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip header row
        # materialize the rows first: calling len(list(reader)) would exhaust
        # the iterator and leave nothing for the loop below
        lines = list(reader)
    total = len(lines)
    i = 0
    for line in lines:
        i += 1
        src = os.path.join(root_dir, "clips", line[1] + ".mp3")
        dst = os.path.join(valid_wav, line[1] + ".wav")
        trans = clean_sentence(line[2])
        valid_data.append((dst, trans))
        # convert mp3 to 16 kHz wav
        sound = AudioSegment.from_mp3(src)
        sound = sound.set_frame_rate(16000)
        sound.export(dst, format="wav")
        print("Converting files: " + str(i) + " / " + str(total), end="\r")
    df = pandas.DataFrame(data=valid_data)
    output_file = "/speech/common_voice_de/common_voice_valid_wav.csv"
    df.to_csv(output_file, header=False, index=False, sep=",")
def generate_csv():
    paths = ["test", "dev", "train"]
    for path in paths:
        csv_data = []
        files = [
            f for f in listdir(join(directory, path))
            if isfile(join(directory, path, f))
        ]
        dir_path = os.path.join(directory, path)
        processed_files = 0
        total_files = len(files)
        for file in files:
            file_path = os.path.join(dir_path, file)
            processed_files += 1
            print("Processing " + path + " " + str(processed_files) + "/" +
                  str(total_files), end="\r")
            if file.endswith(".xml"):
                tree = ET.parse(file_path)
                recording = tree.getroot()
                sent = recording.find("cleaned_sentence")
                sent = sent.text.lower()
                transcript = clean_sentence(sent)
                file_xml, _ = file.split(".", 1)
                found = 0
                for wav_file in files:
                    if wav_file.startswith(file_xml) and wav_file.endswith(".wav"):
                        wav_file_dir = os.path.join(dir_path, wav_file)
                        csv_data.append((wav_file_dir, transcript))
                        found += 1
                        # remove that check if you keep more than 2 microphones
                        #if found >= 2:
                        if found >= 5:
                            break
        print()
        output_file = os.path.join(directory, path + ".csv")
        with open(output_file, 'w') as f:
            for line in csv_data:
                f.write(line[0] + "," + line[1] + "\n")
        print("Successfully generated csv file {}.csv".format(path))
        print("=====================")
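# generate_csv() above only relies on the <cleaned_sentence> element of each recording's
# XML file. A minimal, hypothetical example of the expected layout (illustration only;
# the real corpus XML files carry additional metadata fields):
#
#   <?xml version="1.0" encoding="utf-8"?>
#   <recording>
#     <cleaned_sentence>dies ist ein beispielsatz</cleaned_sentence>
#   </recording>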
def preprocess_wem(tuplist):  # inputs were formerly: (tuplist, start, limit)
    '''This function cleans and tokenizes sentences, removing punctuation and numbers
    and making words into lower-case stems.
    Input: a list of four-element tuples, the last element of which holds the long
    string of text we care about. (The former integer arguments `start` and `limit`,
    bypassed when set to -1, indicated the DF row indices on which to start and stop
    the function, for testing purposes.)
    This function loops over five nested levels, which from high to low are:
    row, tuple, chunk, sentence, word.
    Note: This approach maintains accurate semantic distances by keeping stopwords.'''

    global mpdo  # Check if we're doing multiprocessing. If so, then mpdo=True
    global sents_combined  # Grants access to the variable holding a list of lists of words, where each list of words represents a sentence in its original order (only relevant here if we're not using multiprocessing)
    global pcount  # Grants access to the preprocessing counter

    known_pages = set()  # Initialize set of known pages for a school
    sents_combined = []  # Initialize list of all the school's sentences

    if type(tuplist) == float:
        return  # Can't iterate over floats, so exit

    #print('Parsing school #' + str(pcount))  # Print number of school being parsed

    for tup in tuplist:  # Iterate over tuples in tuplist (list of tuples)
        if tup[3] in known_pages or tup == '':
            # Could use hashing to speed up comparison: hashlib.sha224(tup[3].encode()).hexdigest()
            continue  # Skip this page if it is exactly the same as a previous page on this school's website
        for chunk in tup[3].split('\n'):
            for sent in sent_tokenize(chunk):  # Tokenize chunk by sentences (in case >1 sentence in chunk)
                #sent = clean_sentence(sent, fast=True)  # Clean and tokenize sentence
                sent = clean_sentence(sent)
                if (sent == []) or (len(sent) == 0):
                    continue  # If sentence is empty, continue to next sentence without appending

                # TO DO: Chunk this by school, not just sentence
                # TO DO: Now that sentences are parsed and cleaned by spaces,
                # recombine and then parse more accurately using spacy word tokenizer

                # Save preprocessed sentence to object (if not multiprocessing)
                #sents_combined.append(sent)  # add sent to object, if nested works
                sents_combined.extend(sent)  # if nested version doesn't work
        known_pages.add(tup[3])

    school_sentslist.append(sents_combined)  # add sent to object
    #pcount += 1  # Add to counter

    return sents_combined
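# Minimal usage sketch for preprocess_wem() (an illustration, not from the original
# script): each entry is assumed to be a four-element tuple with the raw page text
# in position 3, and clean_sentence() is assumed to return a list of word tokens.
#
#   pages = [("school_id", "url", "date", "Welcome to our school.\nWe value learning.")]
#   tokens = preprocess_wem(pages)   # flat list of cleaned tokens from all sentences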
def gen_csv(root_dir=dir):
    csv_list = []
    trans = root_dir + "transcript.txt"
    error = 0
    with open(trans, 'r') as f:
        lines = csv.reader(f, delimiter='|')
        i = 0
        for line in lines:
            i += 1
            path = join(root_dir, line[0])
            text = line[2]
            clean_text = clean_sentence(text)
            csv_list.append((path, clean_text))
            print("File " + str(i) + " / 7427", end='\r')
    print()
    print("Writing CSV File:")
    df = pandas.DataFrame(data=csv_list)
    output_file = "/home/GPUAdmin1/asr/train_csvs/single_speaker.csv"
    df.to_csv(output_file, header=False, index=False, sep=",")
def preprocess_wem2(ls):
    '''This function cleans and tokenizes sentences, removing punctuation and numbers
    and making words into lower-case stems.
    Input: a list of strings. The function loops over all elements of the input list,
    cleans the texts, and returns one flat list of cleaned tokens.'''

    global mpdo  # Check if we're doing multiprocessing. If so, then mpdo=True
    global sents_combined  # Grants access to the variable holding a list of lists of words, where each list of words represents a sentence in its original order (only relevant here if we're not using multiprocessing)
    global pcount  # Grants access to the preprocessing counter

    known_pages = set()  # Initialize set of known pages for a school
    sents_combined = []  # Initialize list of all the school's sentences

    #print('Parsing school #' + str(pcount))  # Print number of school being parsed

    for s in ls:  # Iterate over the strings in ls
        for chunk in s.split('\n'):
            for sent in sent_tokenize(chunk):  # Tokenize chunk by sentences (in case >1 sentence in chunk)
                #sent = clean_sentence(sent, fast=True)  # Clean and tokenize sentence
                sent = clean_sentence(sent)
                if (sent == []) or (len(sent) == 0):
                    continue  # If sentence is empty, continue to next sentence without appending

                # TO DO: Chunk this by school, not just sentence
                # TO DO: Now that sentences are parsed and cleaned by spaces,
                # recombine and then parse more accurately using spacy word tokenizer

                # Save preprocessed sentence to object (if not multiprocessing)
                #sents_combined.append(sent)  # add sent to object, if nested works
                sents_combined.extend(sent)  # if nested version doesn't work

    school_sentslist.append(sents_combined)  # add sent to object
    return sents_combined
punctstr = punctstr_make()
print("Stopword, unicode and punctuation lists creation complete!")

# word2vec computation
whole_text = []
s_count = 0  # initialize count of schools' texts appended
for school in df['text']:
    s_count += 1
    if s_count % 10000 == 0:
        print("Processed: ", s_count, " schools' texts.")
    for chunk in school.split("\n"):
        for sent in sent_tokenize(chunk):
            sent = clean_sentence(sent)
            sent = [word for word in sent if word != '']
            if len(sent) > 0:
                whole_text.append(sent)

print("Text appending/processing complete!")

# define directory locations to save the word embedding model/vocab
cwd = os.getcwd()
model_path = cwd + "/wem_model_300d.bin"
vocab_path = cwd + "/wem_vocab_300d.txt"
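# The training call itself is not shown in this excerpt. As a rough sketch
# (assumptions: gensim is installed and whole_text is the list of token lists
# built above), a 300-dimensional model could be trained and saved to the
# paths defined above like this:
from gensim.models import Word2Vec

wem_model = Word2Vec(
    sentences=whole_text,
    vector_size=300,   # `size=300` in gensim < 4.0
    window=5,
    min_count=5,
    sg=1,              # skip-gram
    workers=4,
)
wem_model.wv.save_word2vec_format(model_path, fvocab=vocab_path, binary=True)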
def split_text(
    in_file: str,
    out_file: str,
    vocabulary: List[str] = None,
    language='eng',
    remove_square_brackets=True,
    do_lower_case=True,
    min_length=20,
):
    """
    Breaks down the in_file into sentences. Each sentence will be on a separate line.
    Also replaces numbers with a simple spoken equivalent based on NUMBERS_TO_<lang> map
    and removes punctuation.

    Args:
        in_file: path to original transcript
        out_file: path to the output file
        vocabulary: ASR model vocabulary
        language: text language
        remove_square_brackets: Set to True if square brackets [] should be removed from text.
            Text in square brackets often contains inaudible fragments like notes or translations
        do_lower_case: flag that determines whether to apply lower case to the in_file text
        min_length: sentences shorter than this (in characters) are merged into the previous sentence
    """
    print(f'Splitting text in {in_file} into sentences.')
    with open(in_file, "r") as f:
        transcript = f.read()

    # remove some symbols for better split into sentences
    transcript = (transcript.replace("\n", " ").replace("\t", " ")
                  .replace("…", "...").replace("»", "").replace("«", "")
                  .replace("\\", "").replace("”", "").replace("„", ""))
    # remove extra space
    transcript = re.sub(r' +', ' ', transcript)
    if remove_square_brackets:
        transcript = re.sub(r'(\[.*?\])', ' ', transcript)

    # Read and split transcript by utterance (roughly, sentences)
    split_pattern = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s"
    if language == 'ru':
        lower_case_ru_letters_unicode = '\u0430-\u04FF'
        upper_case_ru_letters_unicode = '\u0410-\u042F'
        # remove space in the middle of a lower-case abbreviation to avoid splitting it into separate sentences
        matches = re.findall(r'[a-z\u0430-\u04FF]\.\s[a-z\u0430-\u04FF]\.', transcript)
        for match in matches:
            transcript = transcript.replace(match, match.replace('. ', '.'))

        split_pattern = (r"(?<!\w\.\w.)(?<![A-Z" + upper_case_ru_letters_unicode +
                         r"][a-z" + lower_case_ru_letters_unicode + r"]\.)(?<![" +
                         upper_case_ru_letters_unicode + r"]\.)(?<=\.|\?|\!)\s")
    elif language not in ['ru', 'eng']:
        print(f'Consider using {language} unicode letters for better sentence split.')

    sentences = re.split(split_pattern, transcript)
    sentences_comb = []

    # adds a short sentence to the previous one
    for i in range(len(sentences)):
        if len(sentences[i]) < min_length and len(sentences_comb) > 0:
            sentences_comb[-1] += ' ' + sentences[i].strip()
        else:
            sentences_comb.append(sentences[i].strip())

    sentences = "\n".join([s.strip() for s in sentences_comb if s])

    # save split text with original punctuation and case
    out_dir, out_file_name = os.path.split(out_file)
    with open(os.path.join(out_dir, out_file_name[:-4] + '_with_punct.txt'), "w") as f:
        f.write(sentences)

    # substitute common abbreviations before applying lower case
    if language == 'ru':
        for k, v in RU_ABBREVIATIONS.items():
            sentences = sentences.replace(k, v)
    if do_lower_case:
        sentences = sentences.lower()

    print(len(sentences.split('\n')))
    sentences = '\n'.join(
        [clean_sentence(sentence) for sentence in sentences.split('\n')])

    # if language == 'eng':
    #     # for k, v in NUMBERS_TO_ENG.items():
    #     #     sentences = sentences.replace(k, v)
    #     # remove non ascii characters
    #     sentences = ''.join(i for i in sentences if ord(i) < 128)
    # elif language == 'ru':
    #     if vocabulary and '-' not in vocabulary:
    #         sentences = sentences.replace('-', ' ')
    #     for k, v in NUMBERS_TO_RU.items():
    #         sentences = sentences.replace(k, v)
    #     # replace Latin characters with Russian
    #     for k, v in LATIN_TO_RU.items():
    #         sentences = sentences.replace(k, v)
    #
    # # make sure to leave punctuation present in vocabulary
    # all_punct_marks = string.punctuation + "–—’“”"
    # if vocabulary:
    #     for v in vocabulary:
    #         all_punct_marks = all_punct_marks.replace(v, '')
    # sentences = re.sub("[" + all_punct_marks + "]", "", sentences).strip()

    with open(out_file, "w") as f:
        f.write(sentences)
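# Minimal usage sketch for split_text() (hypothetical file names; assumes the
# module-level imports such as re, os and clean_sentence are available):
#
#   split_text("lecture_transcript.txt", "lecture_sentences.txt",
#              language='eng', min_length=20)
#   # -> writes lecture_sentences_with_punct.txt (original punctuation and case)
#   #    and lecture_sentences.txt (cleaned, one sentence per line)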
from clean_text import clean_sentence

with open("/lm_corpus/German_sentences_8mil_filtered_maryfied.txt", "r") as text:
    with open("/lm_corpus/mary.txt", "w") as out_file:
        for line in text:
            sent = clean_sentence(line)
            out_file.write(sent + "\n")
]
files_out = [
    rootdir + "test_csvs/cv_test.csv", rootdir + "dev_csvs/cv_dev.csv",
    rootdir + "test_csvs/tuda_test.csv", rootdir + "dev_csvs/tuda_dev.csv"
]

sentences = []
for file_dir in files:
    if ".csv" in file_dir:
        with open(file_dir) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                sentences.append(row[1] + "\n")
    else:
        with open(file_dir, "r") as text:
            for line in text:
                sent = clean_sentence(line.split(" ", 1)[1])
                sentences.append(sent + "\n")

sentences_out = []
for file_dir in files_out:
    with open(file_dir) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            sentences_out.append(row[1] + "\n")

sent_set = set(sentences)
corpus = open("/data/home/GPUAdmin1/asr/corpus.txt", "w")
for sent in sent_set:
    if sent in sentences_out:
        continue
    corpus.write(sent)
# --TODO remove things between [] and ♪
if "[" in transcript and "]" in transcript:
    transcript = clean(transcript, "[", "]")
if "(" in transcript and ")" in transcript:
    transcript = clean(transcript, "(", ")")
if "<" in transcript and ">" in transcript:
    transcript = clean(transcript, "<", ">")
if "*" in transcript and "*" in transcript.split("*", 1)[1]:
    transcript = clean(transcript, "*")
if "♪" in transcript and "♪" in transcript.split("♪", 1)[1]:
    transcript = clean(transcript, "♪")
transcript = transcript.replace("- ", "")
transcript = transcript.replace("-", "")
transcript = transcript.replace('"', "")
transcript = transcript.replace("'", "")
transcriptclean = clean_sentence(transcript)
# --TODO if the whole wav is non-talk, ignore it, i.e. continue
if (  # TODO continue after Parfum
        transcriptclean.strip() == ""
        or transcriptclean.strip() == "nina deggert thomas deggert"
        or transcriptclean.strip() == "dr friedrich kronberg"
        or transcriptclean.strip() == "operation juninacht"
        or transcriptclean.strip() == "monika schöllack"
        or transcriptclean.strip() == "spricht polnisch"
        or transcriptclean.strip() == "telefon"
        or transcriptclean.strip() == "hupen"
        or transcript.strip() == "Ostberlin 1980"
        or "hassans vater spricht arabisch" in transcriptclean
        or "singt schlaflied auf polnisch" in transcriptclean
        or "reifen quietschen" in transcriptclean
def nltk_tokenize(rootdir=dir, output_root="/lm_corpus/dewiki_nltk_segmented/"):
    paths = listdir(rootdir)
    exists = os.path.isdir(output_root)
    if not exists:
        os.mkdir(output_root)
    total_paths = len(paths)
    current_path = 0
    for path in paths:
        output_dir = join(output_root, path)
        exists = os.path.isdir(output_dir)
        if not exists:
            os.mkdir(output_dir)
        files = [
            f for f in listdir(join(rootdir, path))
            if isfile(join(rootdir, path, f))
        ]
        current_path += 1
        total_files = len(files)
        processed_files = 0
        for file in files:
            file_path = join(rootdir, path, file)
            new_file_name = join(output_dir, file + ".txt")
            processed_files += 1
            print("Processing path " + path + " " + str(current_path) + "/" +
                  str(total_paths) + " Files: " + str(processed_files) + "/" +
                  str(total_files), end="\r")
            with open(file_path, 'r+', encoding='utf-8') as f:
                with open(new_file_name, 'w', encoding='utf-8') as new_file:
                    doc = ""
                    skip_header = False
                    while True:
                        line = f.readline()
                        if not line:
                            doc = ""
                            break
                        if skip_header:
                            skip_header = False
                            continue
                        if "<doc id=" in line:
                            skip_header = True
                            continue
                        if not line.strip():
                            continue
                        if "</doc>" in line:
                            sentences = sent_tokenize(doc)
                            for j in range(len(sentences)):
                                clean_sent = clean_sentence(sentences[j])
                                clean_sent = ' '.join(clean_sent.split())
                                new_file.write(clean_sent + '\n')
                            doc = ""
                        else:
                            doc = doc + line
def spacy_tokenize(rootdir=dir, output_root="/lm_corpus/dewiki_spacy_segmented/"):
    nlp = spacy.load('de')
    paths = listdir(rootdir)
    exists = os.path.isdir(output_root)
    if not exists:
        os.mkdir(output_root)
    total_paths = len(paths)
    current_path = 0
    for path in paths:
        output_dir = join(output_root, path)
        exists = os.path.isdir(output_dir)
        if not exists:
            os.mkdir(output_dir)
        files = [
            f for f in listdir(join(rootdir, path))
            if isfile(join(rootdir, path, f))
        ]
        current_path += 1
        total_files = len(files)
        processed_files = 0
        for file in files:
            file_path = join(rootdir, path, file)
            new_file_name = join(output_dir, file + "_spacy.txt")
            processed_files += 1
            print("Processing path " + path + " " + str(current_path) + "/" +
                  str(total_paths) + " Files: " + str(processed_files) + "/" +
                  str(total_files), end="\r")
            with open(file_path, 'r+', encoding='utf-8') as f:
                with open(new_file_name, 'w', encoding='utf-8') as new_file:
                    content = f.readlines()
                    doc = ""
                    skip_header = False
                    for i in range(len(content)):
                        if skip_header:
                            skip_header = False
                            continue
                        if "<doc id=" in content[i]:
                            skip_header = True
                            continue
                        if not content[i].strip():
                            continue
                        if "</doc>" in content[i]:
                            doc = nlp(doc)
                            sentences = list(doc.sents)
                            for j in range(len(sentences)):
                                clean_sent = clean_sentence(
                                    sentences[j].string.strip())
                                #clean_sent = sentences[j].string.strip()
                                clean_sent = ' '.join(clean_sent.split())
                                new_file.write(clean_sent + '\n')
                            new_file.write('\n')
                            doc = ""
                        else:
                            doc = doc + content[i]