def main(label_csv: str, audio_dir: str, json_path: str) -> None:
    """
    Reads the label_csv and writes the labels, audio_path, and duration
    to the path in json_path.
    """
    with open(json_path, 'w') as fid:
        with open(label_csv, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for line in reader:
                assert len(line) == 2, f"row with {line[0]} does not have exactly 2 elements"
                src_ext = "m4a"
                src_filename = line[0] + os.path.extsep + src_ext
                src_audio_path = os.path.join(audio_dir, src_filename)
                dst_ext = "wav"
                dst_filename = line[0] + os.path.extsep + dst_ext
                dst_audio_path = os.path.join(audio_dir, dst_filename)
                to_wave(src_audio_path, dst_audio_path)
                labels = process_labels(line[1])
                duration = wav_duration(dst_audio_path)
                datum = {
                    'text': labels,
                    'duration': duration,
                    'audio': dst_audio_path
                }
                json.dump(datum, fid)
                fid.write("\n")
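# NOTE: `to_wave`, `process_labels`, and `wav_duration` used above are assumed to be
# imported from elsewhere in this repo. As an illustration, a minimal sketch of
# `wav_duration` using only the standard-library `wave` module (an assumption, not
# necessarily the repo's actual implementation) could be:
import wave

def wav_duration(wav_path: str) -> float:
    """Return the duration of a wav file in seconds."""
    with wave.open(wav_path, 'rb') as wav_file:
        return wav_file.getnframes() / float(wav_file.getframerate())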
def extract_samples(self, save_dir: str):
    """
    Extracts the wav files from the directories and copies them into the noise_dir.
    The audio files in the "SCAFE_48k" data subset are in 48 kHz and should be
    converted to 16 kHz; the if-statement in the for-loop does this conversion.
    Assumptions:
        - The directory structure of the zip files will not change
    """
    pattern = "*/*.wav"
    high_res_audio = {"SCAFE"}
    all_wav_paths = glob.glob(os.path.join(save_dir, pattern))
    print("Extracting and removing sample files...")
    for wav_path in tqdm.tqdm(all_wav_paths):
        filename = os.path.basename(wav_path)
        dirname = os.path.basename(os.path.dirname(wav_path))
        dst_filename = "{}_{}".format(dirname, filename)
        dst_wav_path = os.path.join(self.feed_model_dir, dst_filename)
        if os.path.exists(dst_wav_path):
            print(f"{dst_wav_path} exists. Skipping...")
            continue
        else:
            # if the wavs are high resolution, down-convert to 16 kHz
            if dirname in high_res_audio:
                to_wave(wav_path, dst_wav_path)
            # if not high-res, just copy
            else:
                copyfile(wav_path, dst_wav_path)
def convert_full_set(path, pattern, new_ext="wav", **kwargs):
    """Function from Awni's original codebase that is used to convert TIMIT."""
    pattern = os.path.join(path, pattern)
    audio_files = glob.glob(pattern)
    for af in tqdm.tqdm(audio_files):
        base, ext = os.path.splitext(af)
        wav = base + os.path.extsep + new_ext
        convert.to_wave(af, wav, **kwargs)
def convert_full_set(path, pattern, new_ext="wav", **kwargs):
    """Converts audio files from other formats to wav."""
    pattern = os.path.join(path, pattern)
    audio_files = glob.glob(pattern)
    for af in tqdm.tqdm(audio_files):
        # split off the extension, e.g. /ff/ff/ff.flac is split into /ff/ff/ff and .flac
        base, ext = os.path.splitext(af)
        # os.path.extsep is the extension separator, i.e. "."
        wav = base + os.path.extsep + new_ext
        convert.to_wave(af, wav, **kwargs)
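# NOTE: `convert.to_wave` is called throughout this file but not defined here. A minimal
# sketch, assuming it shells out to ffmpeg (consistent with the
# `subprocess.CalledProcessError` handling in `download_dataset` below) and defaults to
# 16 kHz mono 16-bit PCM, might be:
import subprocess

def to_wave(src_path: str, dst_path: str, sample_rate: int = 16000, channels: int = 1) -> None:
    """Convert `src_path` to a 16-bit PCM wav at `sample_rate` using ffmpeg."""
    cmd = [
        "ffmpeg", "-y", "-i", src_path,
        "-ar", str(sample_rate), "-ac", str(channels),
        "-acodec", "pcm_s16le",
        dst_path,
    ]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)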
def resample(audio_dir: str, target_samp_rate: int) -> None:
    """
    Resamples all of the audio files in audio_dir to the target sample rate.
    Arguments:
        audio_dir (str): the audio directory whose files will be resampled
        target_samp_rate (int): the sample rate the files will be resampled to
    """
    assert os.path.exists(audio_dir), "audio directory does not exist"
    out_dir = os.path.join(audio_dir, "resampled")
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    extensions = ["*.wav", "*.mp3", "*.aiff", "*.flac"]
    audio_files = list()
    for ext in extensions:
        pattern = os.path.join(audio_dir, ext)
        audio_files.extend(glob.glob(pattern))
    for audio_fn in audio_files:
        filename = os.path.splitext(os.path.basename(audio_fn))[0]
        wav_file = filename + os.path.extsep + "wav"
        out_path = os.path.join(out_dir, wav_file)
        convert.to_wave(audio_fn, out_path)
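# Example usage (hypothetical paths), resampling everything under /data/noise into
# /data/noise/resampled as wav files. Note that `target_samp_rate` is not forwarded to
# `convert.to_wave` above, so the output rate depends on `to_wave`'s own defaults.
if __name__ == "__main__":
    resample("/data/noise", target_samp_rate=16000)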
def download_dataset(self):
    """
    This method loops through the firestore document database using paginated queries
    based on the document id. It filters out documents where `target != guess` if
    `self.target_eq_guess` is True and saves the audio file and target text into
    separate files.
    """
    PROJECT_ID = 'speak-v2-2a1f1'
    QUERY_LIMIT = 2000        # max size of query
    SAMPLES_PER_QUERY = 200   # number of random samples downloaded per query
    AUDIO_EXT = '.m4a'        # extension of downloaded audio

    audio_dir = os.path.join(self.output_dir, "audio")
    os.makedirs(audio_dir, exist_ok=True)

    # verify and set the credentials
    CREDENTIAL_PATH = "/home/dzubke/awni_speech/speak-v2-2a1f1-d8fc553a3437.json"
    assert os.path.exists(CREDENTIAL_PATH), \
        "Credential file does not exist or is in the wrong location."
    # set the environment variable that `firebase_admin.credentials` will use
    os.putenv("GOOGLE_APPLICATION_CREDENTIALS", CREDENTIAL_PATH)

    # initialize the credentials and firebase db client
    cred = credentials.ApplicationDefault()
    firebase_admin.initialize_app(cred, {'projectId': PROJECT_ID})
    db = firestore.client()

    # create the data-label path and initialize the tsv headers
    date = datetime.date.today().isoformat()
    self.data_label_path = os.path.join(self.output_dir, "eval2-v4_data_" + date + ".tsv")
    self.metadata_path = os.path.join(self.output_dir, "eval2-v4_metadata_" + date + ".json")

    # re-calculate the constraints in the `config` as integer counts based on the `dataset_size`
    self.constraints = {
        name: int(self.constraints[name] * self.num_examples)
        for name in self.constraints.keys()
    }
    # constraint_names will help to ensure the dict keys created later are consistent
    constraint_names = list(self.constraints.keys())
    print("constraints: ", self.constraints)

    # id_counter keeps track of the counts for each speaker, lesson, and line id
    id_counter = {name: dict() for name in constraint_names}

    # create a mapping from record_id to lesson, line, and speaker ids
    disjoint_ids_map = get_record_ids_map(metadata_path, constraint_names)

    # create a dict of sets of all the ids in the disjoint datasets that will not
    # be included in the filtered dataset
    disjoint_id_sets = {name: set() for name in self.disjoint_id_names}
    for disj_dataset_path in self.disjoint_datasets:
        disj_dataset = read_data_json(disj_dataset_path)
        # extract the record_ids from the excluded datasets
        record_ids = [path_to_id(example['audio']) for example in disj_dataset]
        # loop through each record id
        for record_id in record_ids:
            # loop through each id_name and update the disjoint_id_sets
            for disjoint_id_name, disjoint_id_set in disjoint_id_sets.items():
                disjoint_id_set.add(disjoint_ids_map[record_id][disjoint_id_name])

    # create a date range from `self.days_from_today` in the correct format
    now = datetime.datetime.utcnow()
    day_delta = datetime.timedelta(days=self.days_from_today)
    day_range = now - day_delta
    day_range = day_range.isoformat("T") + "Z"

    with open(self.data_label_path, 'w', newline='\n') as tsv_file:
        tsv_writer = csv.writer(tsv_file, delimiter='\t')
        header = [
            "id", "target", "guess", "lessonId", "target_sentence",
            "lineId", "uid", "redWords_score", "date"
        ]
        tsv_writer.writerow(header)

        # create the first query based on the constant QUERY_LIMIT
        rec_ref = db.collection(u'recordings')
        # this is the final record_id that was downloaded from the speak training set
        speak_train_last_id = 'SR9TIlF8bSWApZa1tqEBIHOQs5z1-1583920255'
        next_query = rec_ref\
            .order_by(u'id')\
            .start_after({u'id': speak_train_last_id})\
            .limit(QUERY_LIMIT)

        # loop through the queries until the example_count is at least the num_examples
        example_count = 0
        # get the ids from the training and test sets to ensure the downloaded set is disjoint
        train_test_set = self.get_train_test_ids()

        while example_count < self.num_examples:
            print(f"another loop with {example_count} examples written")
            # convert the generator to a list to retrieve the last doc_id
            docs = list(map(lambda x: self._doc_trim_to_dict(x), next_query.stream()))
            try:
                # this id will be used to start the next query
                last_id = docs[-1]['id']
            # if the docs list is empty, there are no new documents
            # and an IndexError will be raised and break the while loop
            except IndexError:
                print("Exiting while loop")
                break

            # selects a random sample of `SAMPLES_PER_QUERY` from the total queries
            #docs = random.sample(docs, SAMPLES_PER_QUERY)

            for doc in docs:
                # if num_examples is reached, break
                if example_count >= self.num_examples:
                    break

                target = process_text(doc['info']['target'])

                # check that the speaker, target-sentence, and record_id are disjoint
                if doc['user']['uid'] not in disjoint_id_sets['speaker']\
                        and target not in disjoint_id_sets['target_sentence']\
                        and doc['id'] not in train_test_set:

                    # set `self.target_eq_guess` to True in `__init__` if you want
                    # to filter by `target` == `guess`
                    if self.target_eq_guess:
                        # process the target and guess and remove apostrophes for comparison
                        guess = process_text(doc['result']['guess'])
                        target_no_apostrophe = target.replace("'", "")
                        guess_no_apostrophe = guess.replace("'", "")
                        # if target != guess, skip the record
                        if target_no_apostrophe != guess_no_apostrophe:
                            continue

                    # if `True`, constraints on the records downloaded will be checked
                    if self.check_constraints:
                        # create a mapping to feed into `check_update_contraints`
                        record_ids_map = {
                            doc['id']: {
                                'lesson': doc['info']['lessonId'],
                                'target_sentence': target,   # using processed target as id
                                'speaker': doc['user']['uid']
                            }
                        }
                        pass_constraint = check_update_contraints(
                            doc['id'], record_ids_map, id_counter, self.constraints
                        )
                        # if the record doesn't pass the constraints, continue to the next record
                        if not pass_constraint:
                            continue

                    # save the audio file from the link in the document
                    audio_url = doc['result']['audioDownloadUrl']
                    audio_path = os.path.join(audio_dir, doc['id'] + AUDIO_EXT)

                    # convert the downloaded file to .wav format;
                    # usually this conversion is done in the preprocessing step, but some eval
                    # sets don't need PER labels, so converting here removes the need to
                    # preprocess the eval set
                    base, raw_ext = os.path.splitext(audio_path)
                    wav_path = base + os.path.extsep + "wav"
                    # if the wave file doesn't exist, convert to wav
                    if not os.path.exists(wav_path):
                        try:
                            to_wave(audio_path, wav_path)
                        except subprocess.CalledProcessError:
                            # if the file can't be converted, skip it by continuing
                            logging.info(f"Process Error converting file: {audio_path}")
                            continue

                    # save the target in a tsv row
                    # tsv header: "id", "target", "guess", "lessonId", "target_sentence",
                    # "lineId", "uid", "redWords_score", "date"
                    tsv_row = [
                        doc['id'],
                        doc['info']['target'],
                        doc['result']['guess'],
                        doc['info']['lessonId'],
                        target,                 # processed target fills the target_sentence column
                        doc['info']['lineId'],
                        doc['user']['uid'],
                        doc['result']['score'],
                        doc['info']['date']
                    ]
                    tsv_writer.writerow(tsv_row)

                    # save all the metadata in a separate file
                    #with open(self.metadata_path, 'a') as jsonfile:
                    #    json.dump(doc, jsonfile)
                    #    jsonfile.write("\n")

                    example_count += 1

            # create the next query starting after the last_id
            next_query = (
                rec_ref
                .order_by(u'id')
                .start_after({u'id': last_id})
                .limit(QUERY_LIMIT)
            )
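# NOTE: `get_record_ids_map`, `read_data_json`, `process_text`, `check_update_contraints`,
# and `path_to_id` used in `download_dataset` are helpers assumed to live elsewhere in
# this repo. As one illustration, a minimal sketch of `path_to_id` consistent with how it
# is used above (mapping an audio path to its record id) might be the following; this is
# an assumption, not the repo's actual implementation:
def path_to_id(audio_path: str) -> str:
    """Return the basename of `audio_path` without its extension,
    e.g. '/data/audio/rec-123.wav' -> 'rec-123'."""
    return os.path.splitext(os.path.basename(audio_path))[0]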