def main():
    # man 900, middle 750, default 500
    frame_length = 900

    # command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--speed', type=float, default=1.)
    parser.add_argument('-t', '--time', type=float, default=-10.)
    parser.add_argument('-o', '--output', type=str, default='output.wav')
    parser.add_argument('-i', '--input', type=str, default='input.wav')
    args = parser.parse_args()

    parameters = {}
    input_filename = args.input
    output_filename = args.output
    if not os.path.isfile(input_filename):
        raise RuntimeError('no input file')

    x, fs = core.load(input_filename)
    # f0, sp, ap = pw.wav2world(x, fs)
    frame_length = 1500  # overrides the 900 above; was 100000 // int(calculateF0(f0)) // 2 * 2
    y, sr = core.load(input_filename, sr=fs)

    # onset detection and a quick diagnostic plot
    onset_frames = onset.onset_detect(x, sr=sr, wait=1, pre_avg=1, post_avg=1,
                                      pre_max=1, post_max=1)
    onset_times = librosa.frames_to_time(onset_frames)
    plt.plot(y)
    for i in onset_times:
        # onset markers (the x-axis scale assumes sr == 22050)
        plt.plot([i * 22050, i * 22050], [-1, 1], color="red")
    S = librosa.stft(x)
    logS = librosa.amplitude_to_db(abs(S))
    plt.savefig('woman.png')

    # target duration: either derived from the speed factor or given explicitly
    if args.time < 0:
        parameters['origin_time'] = core.get_duration(y, sr)
        parameters['convert_time'] = parameters['origin_time'] / args.speed
    else:
        parameters['origin_time'] = core.get_duration(y, sr)
        parameters['convert_time'] = args.time
    parameters['sample_rate'] = sr
    parameters['frame_length'] = int(fs / 22050 * frame_length)
    # if parameters['convert_time'] / parameters['origin_time'] > 0.8:
    convert_upper_threshold(input_filename, output_filename, parameters)
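# Hedged usage sketch (assumption, not from the original source): if this main() lives in
# a script such as convert.py, with convert_upper_threshold defined elsewhere in the
# module, it would typically be run as
#
#   python convert.py -i input.wav -o output.wav -s 1.25   # stretch by a speed factor
#   python convert.py -i input.wav -o output.wav -t 30.0   # or to an explicit duration
#
# guarded by the usual entry point:
if __name__ == '__main__':
    main()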
def upload_tracks(request, project_id):
    message = ""
    project = Project.objects.get(id=project_id)
    if request.method == "POST":
        form = MultipleFileFieldForm(request.POST, request.FILES)
        current_files = project.audiotrack_set.values_list("name", flat=True)
        if form.is_valid():
            files = request.FILES.getlist("file_field")
            non_audio = 0
            duplicate = 0
            success = 0
            for f in files:
                # reject anything that is not audio or is already in the project
                if not f.name.endswith((".mp3", ".ogg", ".wav", ".flac")):
                    non_audio += 1
                    continue
                if f.name in current_files:
                    duplicate += 1
                    continue
                # save the upload to MEDIA_ROOT in chunks
                local_file = os.path.join(settings.MEDIA_ROOT, f.name)
                with open(local_file, 'wb+') as destination:
                    for chunk in f.chunks():
                        destination.write(chunk)
                # probe the duration with librosa; fall back to 0 if it cannot be read
                try:
                    duration = get_duration(filename=local_file)
                except Exception:
                    duration = 0
                AudioTrack.objects.create(
                    name=f.name,
                    file=os.path.join(settings.MEDIA_URL, f.name),
                    format=os.path.splitext(f.name)[1][1:],
                    project=project,
                    duration=duration)
                success += 1
            message = f"{success} files successfully uploaded"
            if non_audio:
                message += f", {non_audio} non-audio files rejected"
            if duplicate:
                message += f", {duplicate} duplicate files rejected"
    # a fresh, empty form for both GET requests and post-upload rendering
    form = MultipleFileFieldForm()
    return render(request, 'annotate/upload.html', {
        'form': form,
        'project': project,
        'message': message
    })
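# Hedged sketch (assumption, not part of the original snippet): a view like upload_tracks
# is normally exposed through the app's URLconf. The route and module layout below are
# hypothetical; only the view signature (request, project_id) comes from the code above.
#
# annotate/urls.py (hypothetical)
from django.urls import path
from . import views

urlpatterns = [
    path('projects/<int:project_id>/upload/', views.upload_tracks, name='upload_tracks'),
]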
def get_next_clip(all_files):
    # select only clips that are still available
    all_files = [a for a in all_files if a["elapsed"] is False]
    # if no more files are available, exit
    if len(all_files) == 0:
        return None

    # randomly pick a file
    selected_idx = np.random.randint(0, len(all_files))
    selected_file = all_files[selected_idx]
    file_path = selected_file["file_path"]

    # load only the part of the clip that has not been consumed yet
    clip, sr = librosa.load(file_path, sr=args.sample_rate,
                            offset=selected_file["time_elapsed"].seconds,
                            duration=args.max_clip_len)
    assert sr == args.sample_rate

    clip_len = timedelta(seconds=get_duration(clip, sr=sr))
    if clip_len.seconds == 0:
        # nothing left in this file: mark it as exhausted
        selected_file["elapsed"] = True
        return timedelta(seconds=0, hours=0, minutes=0)

    elapsed_time = selected_file["time_elapsed"].seconds
    # print("Clip ID: ", elapsed_time, "Clip Len:", clip_len,
    #       "Time Elapsed: ", selected_file["time_elapsed"])
    selected_file["time_elapsed"] += clip_len

    split_clip_id = "{}_{}_{}".format(selected_file["book_id"],
                                      selected_file["clip_id"],
                                      elapsed_time)
    dest_path = os.path.join(output_dir, "{}.wav".format(split_clip_id))
    log.debug("Saving clip to {}".format(dest_path))
    # note: librosa.output.write_wav was removed in librosa 0.8; soundfile.write is the
    # usual replacement on newer versions
    librosa.output.write_wav(dest_path, clip, args.sample_rate)
    return clip_len
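# Hedged usage sketch (assumption): get_next_clip returns None once every file is
# exhausted and a timedelta otherwise, so a driver loop can keep splitting clips until
# the whole corpus has been written out. build_file_index is a hypothetical helper that
# yields dicts with file_path, book_id, clip_id, time_elapsed and elapsed keys.
all_files = build_file_index()
total_written = timedelta(0)
while True:
    clip_len = get_next_clip(all_files)
    if clip_len is None:
        break
    total_written += clip_len
log.debug("Wrote {} of audio in total".format(total_written))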
def extract_feature(self, song_fname):
    song, _ = self.load_song(song_fname)
    duration = int(get_duration(song))
    n = song.shape[0]

    # over-allocate the feature matrix (+20 windows of slack); trim it after the loop
    num_windows = int(np.floor((duration - self.wsize) / self.stride)) + 20
    feats = np.zeros((num_windows, self.feat_dim), dtype=np.float32)

    start = 0
    counter = 0
    while True:
        # slide a window of wsize seconds with a hop of stride seconds
        end = start + int(self.wsize * self.sr)
        if end >= n:
            break
        window = song[start:end]
        feat_ = self._calc_all_feat(window)
        feats[counter] = feat_
        counter += 1
        start += int(self.stride * self.sr)
        if start >= n:
            break

    # drop the unused rows of the over-allocated buffer
    if counter < num_windows:
        feats = feats[:counter, :]
    return feats
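# Hedged usage sketch (assumption): the extractor class, its constructor arguments and the
# helpers load_song / _calc_all_feat are not shown above, so the names below are
# hypothetical; the sketch only illustrates the (num_windows, feat_dim) output contract.
extractor = FeatureExtractor(wsize=2.0, stride=0.5, sr=22050, feat_dim=128)  # hypothetical
feats = extractor.extract_feature("some_song.wav")
print(feats.shape)  # (number of windows that fit in the song, extractor.feat_dim)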
start = datetime.now()

# Get speaker ID
split, _, speaker, _ = fpath.split(os.sep)[-4:]
if speaker in all_speakers:
    speaker_id = all_speakers.index(speaker)
else:
    speaker_id = len(all_speakers)
    all_speakers.append(speaker)

# Get duration
if debug:
    print('\n==== DEBUG ====')
    print(fpath)
try:
    duration = str(timedelta(seconds=get_duration(filename=fpath)))
    if debug:
        print('File duration: {}'.format(duration))
except:
    print('UNABLE TO GET DURATION')
    raise

# Chunk into segments with speech audio
times, segs = VAD_chunk(2, fpath)
if segs == []:
    print('No voice activity detected')
    continue
if debug:
    print('{} - {:,} segments'.format(datetime.now() - start, len(segs)))
    transforms.ToTensor(),
    normalize
])

categories = utils.categories                  ### ImageNet Categories
places_categories = placesCNN_basic.classes    ### Places Categories

fps = 24
nb_frames = 1
nbsec = 1.49
n_obj = 3

beg_film = 0
end_film = np.floor(get_duration(filename=wavfile))

allpreds = []
onsets = []

model_imagenet = resnet18(pretrained=True)
model_imagenet.eval()

model_places = placesCNN_basic.model.eval()

### Define and register hook for extracting output feature map
places_fm = []
places_proba = []


def get_fm_places(m, i, o):
#!/usr/bin/env python
from sklearn.metrics.pairwise import pairwise_distances
import scipy.signal  # fftconvolve lives in scipy.signal; a bare `import scipy` is not enough
from librosa.core import get_duration
import numpy as np
import h5py
from scipy.fftpack import dct, idct

from utils import (WINDOW_SIZE, HOP_SIZE, SAMPLE_RATE, COEFS, TIMBRE_GROUP)

# seconds per analysis frame (effectively HOP_SIZE / SAMPLE_RATE with librosa's default centering)
ONE_FRAME = get_duration(sr=SAMPLE_RATE, n_fft=WINDOW_SIZE,
                         hop_length=HOP_SIZE, S=np.zeros((1, 2)))


def beat_similarity(vecs):
    # cosine self-similarity matrix of the beat-synchronous feature vectors
    return 1.0 - pairwise_distances(vecs, metric='cosine')


def normed_diags(B, max_lag):
    # sum the first max_lag diagonals and rescale them to the [0, 1] range
    B_diags = np.asarray([B.trace(i) for i in range(max_lag)])
    B_diags -= np.min(B_diags)
    return B_diags / np.max(B_diags)


def fft_autocorrelate(S):
    # 2-D autocorrelation via FFT convolution with the flipped matrix,
    # keeping only the non-negative-lag quadrant
    B_fft = scipy.signal.fftconvolve(S, S[::-1, ::-1], mode='full')
    B_fft = B_fft[S.shape[0]:, S.shape[1]:]
    return B_fft
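# Hedged usage sketch (assumption): given a matrix of beat-synchronous feature vectors,
# the three helpers above can be chained into a normalized lag profile; beat_vecs and the
# max_lag value are hypothetical placeholders, not values from the original module.
beat_vecs = np.random.rand(128, 20)        # hypothetical beat-synchronous features
S_sim = beat_similarity(beat_vecs)         # (128, 128) cosine self-similarity
B = fft_autocorrelate(S_sim)               # 2-D autocorrelation of the similarity matrix
lag_profile = normed_diags(B, max_lag=64)  # normalized diagonal sums up to lag 64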
with open(os.path.join(meta_path, cl) + ".json") as reader:
    meta_data = json.load(reader)
meta_data = {d["book_id"]: d for d in meta_data}

book_ids = defaultdict(
    lambda: timedelta(hours=0, minutes=0, seconds=0))
author_ids = defaultdict(
    lambda: timedelta(hours=0, minutes=0, seconds=0))

for idx, file_id in enumerate(all_files):
    file_path = os.path.join(cl_path, file_id)
    sound, sample_rate = librosa.load(file_path, sr=None)
    assert sample_rate == assigned_sample_rate

    # accumulate the clip duration globally, per book, and per reader
    clip_time = timedelta(seconds=get_duration(sound, sr=sample_rate))
    current_time += clip_time
    book_id = file_id.split("_")[0]
    book_ids[book_id] += clip_time
    author_ids[meta_data[book_id]["reader_url"]] += clip_time

    if idx % 100 == 0:
        print("\t{} of {}".format(idx, len(all_files)))

print("{} Books".format(cl))
for book_id in book_ids:
    print("\t{}: Time: {}".format(book_id, book_ids[book_id]))

book_times.append({
    "language": cl,
# collect all wav files
cwd = os.getcwd()
stations = filter(os.path.isdir, os.listdir(cwd))
stations = [f for f in stations if not f.startswith('.')]

audio_paths = []
for station in stations:
    data_path = f'{cwd}/{station}'
    files = os.listdir(data_path)
    wav_names = [f for f in files if f.endswith('.wav')]
    wav_paths = [f'{data_path}/{f}' for f in wav_names]
    audio_paths.extend(wav_paths)

print(f'we found {len(audio_paths)} audio files in total')
lens = [get_duration(filename=f) for f in audio_paths]
print(f'their total play time amounts to {sum(lens) / 60 / 60:.2f} hours')

# collect all aup files
labeled_data = []
for station in stations:
    data_path = f'{cwd}/{station}'
    files = os.listdir(data_path)
    audacity_projects = [f for f in files if f.endswith('.aup')]
    project_paths = [f'{data_path}/{aup}' for aup in audacity_projects]
    station_data = [extract_aup(i, data_path, station, verbose=0)
                    for i in project_paths]
    labeled_data.extend(station_data)

print(f'{len(labeled_data)} instances have labels provided')

# infer amount of detections