def resample_and_segment_audio(dir_path, seconds):
    """Resample every MP3 under ``dir_path`` to 16 kHz and save it in segments.

    For each ``*.mp3`` found recursively, the first channel is resampled to
    16 kHz and cut into windows of ``seconds`` seconds that overlap by one
    second; the final window is zero-padded to full length. Segments are
    written to a sibling directory whose path is the source directory's path
    with ``'test'`` replaced by ``'test-segmented-<seconds>'``, as
    ``<original name>_<index>.mp3``.

    Parameters
    ----------
    dir_path : str or Path
        Root directory searched recursively for MP3 files.
    seconds : int or float
        Segment length in seconds; must be longer than the 1 s overlap.

    Raises
    ------
    ValueError
        If the segment length does not exceed the overlap.
    """
    # Local import: only needed to hand NumPy segments to torchaudio.save.
    import torch

    target_sr = 16_000
    overlap = 16_000
    block_size = int(seconds * target_sr)
    if block_size <= overlap:
        raise ValueError(
            'seconds must be longer than the 1 second overlap between segments')

    for path in Path(dir_path).glob('**/*.mp3'):
        absolute_path = str(path)
        print('preprocessing ' + absolute_path + ' ...')

        # do resampling on root path
        speech_array, sampling_rate = torchaudio.load(absolute_path)
        # Keyword arguments: recent librosa releases reject positional rates.
        speech_resampled = librosa.resample(
            np.asarray(speech_array[0].numpy()),
            orig_sr=sampling_rate,
            target_sr=target_sr)

        segmented_path = str(path.parent).replace(
            'test', 'test-segmented-' + str(seconds))

        # soundfile.blocks() only reads from files, so the in-memory signal is
        # windowed here with the same blocksize/overlap/zero-fill behaviour.
        segments = _segment_signal(speech_resampled, block_size, overlap)
        for counter, block in enumerate(segments):
            Path(segmented_path).mkdir(parents=True, exist_ok=True)
            new_file_path = segmented_path + '/' + str(
                path.name) + '_' + str(counter) + '.mp3'
            # torchaudio.save expects a (channels, frames) tensor.
            waveform = torch.from_numpy(
                np.ascontiguousarray(block)).unsqueeze(0)
            torchaudio.save(new_file_path, waveform, target_sr, format='mp3')


def _segment_signal(signal, block_size, overlap):
    """Yield zero-padded windows of a 1-D ``signal`` that overlap by ``overlap``."""
    total = len(signal)
    if total == 0:
        return
    hop = block_size - overlap
    start = 0
    while True:
        window = signal[start:start + block_size]
        missing = block_size - len(window)
        if missing > 0:
            window = np.pad(window, (0, missing), mode='constant')
        yield window
        start += hop
        # Stop once the next window would contain no samples beyond the
        # overlap carried over from the current one.
        if start + overlap >= total:
            return
def deNoise(self, outputFile):
    """
    De-noising function that reads the audio signal in chunks and processes
    and writes to the output file efficiently.

    VISU Shrink is used to generate the noise threshold.

    Each block holds roughly 10% of the input file. It is decomposed with a
    level-2 ``db4`` wavelet transform, the detail coefficients are
    soft-thresholded, and the reconstructed signal is appended to the output.

    Parameters
    ----------
    outputFile : str
        de-noised file name
    """
    info = soundfile.info(self.__inputFile)  # getting info of the audio
    rate = info.samplerate

    # Never request an empty block for extremely short files.
    block_frames = max(1, int(rate * info.duration * 0.10))

    with soundfile.SoundFile(outputFile, "w", samplerate=rate,
                             channels=info.channels) as of:
        for block in tqdm(soundfile.blocks(self.__inputFile, block_frames)):
            coefficients = pywt.wavedec(block, 'db4', mode='per', level=2)

            # noise level estimated from the finest detail coefficients
            sigma = mad(coefficients[-1])

            # VISU Shrink thresholding by applying the universal threshold
            # proposed by Donoho and Johnstone
            thresh = sigma * np.sqrt(2 * np.log(len(block)))

            # thresholding using the noise threshold generated
            coefficients[1:] = [
                pywt.threshold(detail, value=thresh, mode='soft')
                for detail in coefficients[1:]
            ]

            # getting the clean signal as in original form
            clean = pywt.waverec(coefficients, 'db4', mode='per')

            # Periodization pads blocks whose length is not a multiple of
            # 2**level, so the reconstruction can be longer than the input;
            # trim it so the output keeps the original duration.
            of.write(clean[:len(block)])
def waveman(fn, config=None):
    """Render an audio file as an SVG drawing of ``steps`` bars.

    The file is read in ``steps`` equally sized blocks. Each block is
    down-mixed to mono, reduced to one value by ``transformer``, and the
    values are normalised and drawn by ``artist``.

    Parameters
    ----------
    fn : str
        Path of the audio file to draw.
    config : dict, optional
        Drawing options (``steps``, ``step_width``, ``height``, ``gap``,
        ``align``, ``rounded``, ``mode``, ``preserveAspectRatio``). When
        omitted, the options are loaded from ``config.json`` in the working
        directory. The mapping passed in is not modified.

    Returns
    -------
    svgwrite.Drawing
        The populated drawing.
    """
    if config is None:
        with open("config.json", "r") as config_file:
            CONFIG = json.load(config_file)
    else:
        # Work on a copy so the derived 'width' key does not leak back into
        # the caller's dictionary.
        CONFIG = dict(config)
    CONFIG['width'] = CONFIG['steps'] * CONFIG['step_width']

    log("Initializing new drawing context", file=fn)
    canvas = svgwrite.Drawing(
        profile='tiny',
        viewBox=f"0 0 {CONFIG['width']} {CONFIG['height']}",
        preserveAspectRatio=CONFIG["preserveAspectRatio"])

    chunks = []
    # One handle, closed deterministically, serves both the length query and
    # the block iteration.
    with soundfile.SoundFile(fn) as sound:
        total_samples = len(sound)
        block_length = int(total_samples // CONFIG['steps'])
        block_iterator = sound.blocks(blocksize=block_length, always_2d=True)
        for i, block in enumerate(block_iterator):
            # Average the first two channels; with always_2d a mono file has
            # a single column and is simply passed through.
            mono_block = list(block[:, :2].mean(axis=1))
            chunks.append(transformer(mono_block, CONFIG['mode']))
            log("Reduced frame to sample", i=i)

    chunks = normalize(chunks)
    log("Tranformed frames into chunks")

    for i, chunk in enumerate(chunks):
        canvas.add(artist(canvas, chunk, i, CONFIG['step_width'],
                          CONFIG['height'], CONFIG['gap'], CONFIG['align'],
                          CONFIG['rounded'], "#abcdef"))
    log("Created SVG rectangles for all data chunks")
    return canvas
def load_file(self, filepath, blocksize=1024, overlap=512):
    """Load an audio file as a stack of resampled, low-passed blocks.

    The file is read in overlapping blocks (the last one zero-filled).
    Blocks that contain only zeros are skipped. Every remaining block is
    resampled from the file's rate to ``self._sr`` and passed through
    ``lowpass``.

    Parameters
    ----------
    filepath : str
        Audio file to read.
    blocksize : int
        Block length expressed in samples at ``self._sr``; rescaled to the
        file's own rate when the rates differ.
    overlap : int
        Overlap between blocks, expressed like ``blocksize``.

    Returns
    -------
    numpy.ndarray
        The processed blocks stacked with ``np.vstack``.

    Raises
    ------
    ValueError
        From ``np.vstack`` when no non-silent block was found.
    """
    items = []
    if self._debug:
        print("File Processing", end="", flush=True)

    sr = sf.info(filepath).samplerate
    if sr != self._sr:
        # Express the requested sizes in the file's native rate.
        blocksize = int(sr / (self._sr / blocksize))
        if overlap > 0:
            overlap = int(sr / (self._sr / overlap))

    blockgen = sf.blocks(filepath, blocksize=blocksize, overlap=overlap,
                         always_2d=True, fill_value=0.0)
    for bl in blockgen:
        if not np.any(bl):
            continue
        if self._debug:
            print(".", end="", flush=True)
        y = bl.transpose()
        # Keyword arguments: librosa >= 0.10 no longer accepts the sample
        # rates positionally.
        y = librosa.resample(y, orig_sr=sr, target_sr=self._sr)
        # Lowpass
        y = lowpass(y)
        # NOTE(review): after the transpose the first axis looks like the
        # channel axis, so this slice may not trim samples - confirm intent.
        y = y[:int(blocksize)]
        y = y[np.newaxis, :]
        items.append(y)

    if self._debug:
        print("Done")
    return np.vstack(items)
def convert(self):
    """
    Run the FFT-based note extraction block by block and write the notes.

    The input file is read in blocks of ``self.block_size`` frames. A short
    final block is padded with zero frames so every block has the same
    length. Each block is converted by ``self._block_to_notes`` and the
    result is handed to the MIDI writer.
    """
    logging.info(str(self.info))
    logging.info("window: {} ms".format(self.time_window))
    logging.info("frequencies: min = {} Hz, max = {} Hz".format(
        self.min_freq, self.max_freq))

    writer_options = dict(
        outfile=self.outfile,
        channels=self.info.channels,
        time_window=self.time_window,
        bpm=self.bpm,
        condense=self.condense,
        condense_max=self.condense_max,
        max_note_length=self.max_note_length,
    )
    with midi_writer.MidiWriter(**writer_options) as writer:
        audio_blocks = soundfile.blocks(
            self.infile, blocksize=self.block_size, always_2d=True)
        for block in audio_blocks:
            if len(block) != self.block_size:
                # Pad the trailing block with silent frames.
                missing = self.block_size - len(block)
                silence = numpy.zeros((missing, self.info.channels))
                block = numpy.concatenate((block, silence), axis=0)
            writer.add_notes(self._block_to_notes(block))
            self._increment_progress()
def load_file_blockwise(filename, blocksize=1024, overlap=512, debug=False):
    """Read one audio file in overlapping blocks and resample each to ``SR``.

    Blocks that contain only zeros are skipped. The file is looked up
    relative to the module-level ``audio_dir`` prefix.

    Parameters
    ----------
    filename : str
        File name appended to ``audio_dir``.
    blocksize : int
        Number of frames per block.
    overlap : int
        Number of frames shared by consecutive blocks.
    debug : bool
        Print progress markers while reading.

    Returns
    -------
    list of numpy.ndarray
        One array per kept block, each with a leading axis of length 1.
    """
    items = []
    if debug:
        print("File Processing", end="", flush=True)

    audio_path = audio_dir + filename
    blockgen = sf.blocks(audio_path, blocksize=blocksize, overlap=overlap,
                         always_2d=True, fill_value=0.0)
    sr = sf.info(audio_path).samplerate
    for bl in blockgen:
        if not np.any(bl):
            continue
        if debug:
            print(".", end="", flush=True)
        y = bl.transpose()
        # Keyword arguments: librosa >= 0.10 no longer accepts the sample
        # rates positionally.
        y = librosa.resample(y, orig_sr=sr, target_sr=SR)
        y = y[:int(blocksize)]
        y = y[np.newaxis, :]
        items.append(y)

    if debug:
        print("Done")
    return items
def load_audio_blockwise(data, blocksize=1024, overlap=512, debug=False):
    """Load every file listed in ``data`` block by block, with its labels.

    Each row of ``data`` supplies a ``filename`` (relative to the
    module-level ``audio_dir`` prefix) plus ``target`` and ``h_target``
    labels. The file is read in overlapping blocks, all-zero blocks are
    skipped, and every kept block is resampled to ``SR``. The row's labels
    are repeated once per kept block.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with ``filename``, ``target`` and ``h_target`` columns.
    blocksize : int
        Number of frames per block.
    overlap : int
        Number of frames shared by consecutive blocks.
    debug : bool
        Print progress markers and the total processing time.

    Returns
    -------
    tuple of numpy.ndarray
        ``(blocks, h_target, target)`` where ``blocks`` is the
        ``np.vstack`` of all kept blocks.

    Raises
    ------
    ValueError
        From ``np.vstack`` when no block was kept.
    """
    start_time = time.time()
    items = []
    target = []
    h_target = []

    for i, sample in data.iterrows():
        if debug:
            print("File Processing", end="", flush=True)
        audio_path = audio_dir + sample['filename']
        blockgen = sf.blocks(audio_path, blocksize=blocksize, overlap=overlap,
                             always_2d=True, fill_value=0.0)
        sr = sf.info(audio_path).samplerate
        for bl in blockgen:
            if not np.any(bl):
                continue
            if debug:
                print(".", end="", flush=True)
            y = bl.transpose()
            # Keyword arguments: librosa >= 0.10 no longer accepts the
            # sample rates positionally.
            y = librosa.resample(y, orig_sr=sr, target_sr=SR)
            y = y[:int(blocksize)]
            y = y[np.newaxis, :]
            items.append(y)
            h_target.append(sample.h_target)
            target.append(sample.target)
        if debug:
            print("Done")

    if debug:
        print("\tProcessing Time: " + str(time.time() - start_time))
    return np.vstack(items), np.array(h_target), np.array(target)
def main():
    """Evolve a population on blocks of a sample recording and print it.

    One generation is run per progress-bar step: the next 512-frame audio
    block (zero-padded if short) is evaluated, then the population
    reproduces and speciates. An ``AssertionError`` raised during the run
    ends it early; the population summary is printed either way.
    """
    population = Population(150)
    try:
        audio_blocks = sf.blocks(
            'inputs/hello_world.wav', blocksize=512, overlap=32)
        progress = tqdm(range(1))
        for generation in progress:
            raw_block = next(audio_blocks)
            padded = np.pad(raw_block, (0, 512 - len(raw_block)),
                            mode='constant')
            fitness = population.evaluate(padded)
            population.reproduce(generation)
            population.speciate(generation)
            postfix = {
                'G': f'{fitness[0]:.4e}',
                'D': f'{fitness[1]:.4e}',
            }
            progress.set_postfix(postfix)
    except AssertionError:
        pass

    print(population)
    print(population.enc_best_gen, population.enc_best_fit)
    print(population.dec_best_gen, population.dec_best_fit)
    print(population.crit_best_gen, population.crit_best_fit)
    print()
def transcribe_file(speech_filepath, output_filepath, keyword_file, start_timestamp, length):
    """Transcribe the given audio file. Length is in seconds.

    The audio is read in overlapping blocks of ``PAYLOAD_LIMIT`` frames.
    Each block is down-mixed, written to ``buffer.wav`` as 16-bit PCM and
    sent to Google Cloud Speech. The concatenated transcript, preceded by a
    line holding the stop frame, is written to ``output_filepath`` as UTF-8.

    Parameters
    ----------
    speech_filepath : str
        Audio file to transcribe.
    output_filepath : str
        Text file that receives the transcript.
    keyword_file : str
        Optional file with one speech-context keyword per line; ignored
        when the path does not exist.
    start_timestamp : int
        First frame to read.
    length : int or float
        Seconds of audio to transcribe; values <= 0 leave ``stop`` at -1.

    Returns
    -------
    str
        The text that was written to ``output_filepath``.
    """
    from google.cloud import speech
    speech_client = speech.Client()

    # Read keyword_file, if any.
    keywords = []
    if os.path.exists(keyword_file):
        with open(keyword_file, 'r') as f:
            keywords = [line.lower().strip() for line in f]

    # MULTILINGUAL SUPPORT DISABLED FOR NOW
    language = 'en-US'

    # Slice audio into blocks, then send each to google cloud for analysis.
    sample_rate = 44100
    # NOTE(review): soundfile counts a negative stop from the end of the
    # file, so -1 ends one frame early - confirm this is intended.
    stop = -1
    if length > 0:
        stop = start_timestamp + sample_rate * length
    print("start and stop is ", start_timestamp, stop)

    pieces = []
    count = 0
    for audio in sf.blocks(speech_filepath, start=start_timestamp, stop=stop,
                           blocksize=PAYLOAD_LIMIT, overlap=OVERLAP):
        # Close the buffer explicitly so the data is flushed to disk before
        # it is read back.
        with sf.SoundFile('buffer.wav', 'w', sample_rate, 1, 'PCM_16') as buffer_out:
            buffer_out.write(audio.sum(axis=1) / float(2))
        with io.open('buffer.wav', 'rb') as buffer_in:
            content = buffer_in.read()

        audio_sample = speech_client.sample(content=content,
                                            source_uri=None,
                                            encoding='LINEAR16',
                                            sample_rate=sample_rate)
        print("evaluating block ", count)
        count += 1
        try:
            alternatives = audio_sample.sync_recognize(
                language_code=language, speech_context=keywords)
            for alternative in alternatives:
                pieces.append(alternative.transcript + ' ')
        except ValueError:
            # Best effort: a block the service rejects is skipped.
            continue

    # final save; add stop timestep on top
    text = str(stop) + "\n" + ''.join(pieces)
    # Binary mode: the payload is UTF-8 encoded bytes, which a text-mode
    # handle rejects on Python 3.
    with open(output_filepath, "wb") as save_file:
        save_file.write(text.encode('utf-8'))
    return text
def get_blocks(self, block_size):
    """Read the whole file in ``block_size`` blocks, reduced by ``one_channel``.

    All blocks are loaded eagerly so their number is known up front; the
    result wraps the list together with its length.
    """
    mono_blocks = [
        one_channel(block)
        for block in sf.blocks(self._filename, block_size)
    ]
    return IterableWithLength(mono_blocks, len(mono_blocks))
def getBlocksSF(audiosrc):
    """Compute ``tempft`` for every 22050-frame block of an audio file.

    Each block is averaged across channels before being passed to
    ``tempft`` together with the file's sample rate.

    Parameters
    ----------
    audiosrc : str
        Audio file to analyse.

    Returns
    -------
    tuple
        ``(sbl, tgram, rate)``: the block generator, the list of ``tempft``
        results (one per block) and the sample rate.
    """
    # always_2d keeps a channel axis for mono files too, so the per-frame
    # channel mean below works for any channel count.
    sbl = sf.blocks(audiosrc, blocksize=22050, always_2d=True)
    rate = sf.info(audiosrc).samplerate
    tgram = [tempft(np.mean(bl, axis=1), sr=rate) for bl in sbl]
    # NOTE(review): `sbl` has been fully consumed above, so callers receive
    # an exhausted generator; it is returned only to keep the interface.
    return (sbl, tgram, rate)
def test_blocks_inplace_modification(file_stereo_r):
    """Scaling a yielded block in place must not corrupt the next block."""
    out = np.empty((3, 2))
    snapshots = []
    for block in sf.blocks(file_stereo_r, out=out, overlap=1):
        # Keep an untouched copy, then modify the shared buffer.
        snapshots.append(block.copy())
        np.multiply(block, 2, out=block)
    assert_equal_list_of_arrays(
        snapshots, [data_stereo[0:3], data_stereo[2:5]])
def __init__(self, audio_path, blocksize, sr=16000, overlap=0, padding=None, dtype="float32"):
    """Prepare block-wise reading of ``audio_path``.

    Parameters
    ----------
    audio_path : str
        Audio file to read.
    blocksize : int
        Number of frames per block; must be positive.
    sr : int
        Sample rate stored as ``self._sr``.
    overlap : int
        Number of frames shared by consecutive blocks.
    padding : float, optional
        Passed to soundfile as ``fill_value``.
    dtype : str
        Sample data type requested from soundfile.
    """
    # The message rejects 0 as well as negative sizes, so the check must be
    # strictly positive.
    assert blocksize > 0, "blocksize cannot be 0 or negative"
    self._sr = sr
    self._orig_sr = sf.info(audio_path).samplerate
    self._sf_blocks = sf.blocks(audio_path,
                                blocksize=blocksize,
                                overlap=overlap,
                                fill_value=padding,
                                dtype=dtype)
def blocks(self):
    """Return the file's blocks, merging a short final block into its predecessor.

    The file at ``self.speechPath`` is read in blocks of ``self.blocksize``
    frames. When the last block has fewer than ``4 * self.samplerate``
    frames, it is concatenated onto the block before it.

    Returns
    -------
    list of numpy.ndarray
        The blocks in file order; empty when the file yields no blocks.
    """
    frames = list(sf.blocks(self.speechPath, self.blocksize))
    if not frames:
        # Nothing to merge; an empty list used to raise IndexError here.
        return frames

    if len(frames[-1]) < 4 * self.samplerate:
        # The slice also covers the single-block case, where there is no
        # predecessor to merge with.
        merged = np.concatenate(frames[-2:])
        del frames[-2:]
        frames.append(merged)
    return frames
def generator(self):
    """Yield each block of the audio file, passed through the resampling helper.

    The blocks are read with the configured block size, overlap, padding
    value and dtype. Each is handed to the private resample helper with the
    file's sample rate and ``self._sr``.
    """
    source_rate = sf.info(self._audio_path).samplerate
    read_options = {
        'blocksize': self._blocksize,
        'overlap': self._overlap,
        'fill_value': self._padding,
        'dtype': self._dtype,
    }
    for chunk in sf.blocks(self._audio_path, **read_options):
        yield self.__resample_file(chunk, source_rate, self._sr)
def next_file(flist, blocksize=1024, dur=3):
    """Pick a random file and return a block generator over a random segment.

    Parameters
    ----------
    flist : sequence of str
        Candidate audio files; one is chosen at random.
    blocksize : int
        Number of frames per block.
    dur : int or float
        Segment duration in seconds.

    Returns
    -------
    generator
        ``soundfile.blocks`` generator over the chosen segment.
    """
    fname = random.choice(flist)
    info = sf.info(fname)
    # Frame counts must be integers: random.randint rejects the float that a
    # fractional `dur` would otherwise produce.
    segsize = int(dur * info.samplerate)
    start = random.randint(0, max(0, info.frames - segsize))
    return sf.blocks(fname,
                     blocksize=blocksize,
                     start=start,
                     stop=start + segsize,
                     overlap=0)
def signal_enhance(speechFile, destFile, vadFile):
    """Dereverberate a speech file in 5-second blocks.

    The parent directories of ``destFile`` and ``vadFile`` are created if
    needed. Nothing is written to either path here.

    Parameters
    ----------
    speechFile : str
        Audio file to process.
    destFile : str
        Destination path; only its directory is created.
    vadFile : str
        VAD output path; only its directory is created.

    Returns
    -------
    list
        One ``wpe.wpe_dereverb`` result per 5-second block.
    """
    for target in (destFile, vadFile):
        parent = os.path.dirname(target)
        # A bare file name has no directory part; os.makedirs('') would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)

    samplerate = sf.info(speechFile).samplerate
    return [
        wpe.wpe_dereverb(block, samplerate)
        for block in sf.blocks(speechFile, 5 * samplerate)
    ]
def DelayedFileAudioSource(path, blocksize=512):
    """
    Simulates real-time decoding with a time.sleep before each yield. The
    pause is half the nominal duration of ``blocksize`` samples at 8kHz.

    :path: Path to the audio input (any format supported by soundfile package)
    :blocksize: Size of the blocks of audio which will be sent (in samples)
    :yields: little-endian 16-bit signed PCM bytestrings, one per block
    """
    for block in sf.blocks(path, blocksize):
        # Soundfile converts to 64-bit float ndarray. We convert back to
        # bytes, clipping first so a full-scale sample (1.0 -> 32768) cannot
        # overflow the int16 range.
        scaled = (block * 2**15).clip(-2**15, 2**15 - 1)
        payload = scaled.astype('<i2').tobytes()
        time.sleep(blocksize / 8000. / 2.)  # 8kHz
        yield payload
def generator(w, d, batch_size=32, N=N):
    """Endlessly yield labelled batches of blocks from a wet and a dry file.

    Every batch interleaves ``batch_size`` blocks of ``w`` (label 1) with
    ``batch_size`` blocks of ``d`` (label 0). Blocks hold ``N`` frames and
    overlap by half. The read offset advances by ``N`` per pair and wraps to
    0 once the next batch would pass 1,000,000,000 frames.

    Yields
    ------
    tuple
        ``(data, labels)``: the block array with singleton axes added at
        positions 1 and 3, and the one-hot labels from ``to_categorical``.
    """
    offset = 0
    while True:
        batch = []
        classes = []
        wet_blocks = sf.blocks(w, blocksize=N, overlap=N // 2, start=offset)
        dry_blocks = sf.blocks(d, blocksize=N, overlap=N // 2, start=offset)
        for _ in range(batch_size):
            batch.append(next(wet_blocks))
            classes.append(1)
            batch.append(next(dry_blocks))
            classes.append(0)
            offset += N

        stacked = np.array(batch)
        # data = data[:, :, 0] # for stereo data
        stacked = stacked[:, np.newaxis]
        stacked = stacked.reshape((stacked.shape[0], 1, stacked.shape[2]))
        stacked = np.expand_dims(stacked, axis=3)
        yield stacked, np.array(to_categorical(classes))

        if offset + batch_size * N > 1000000000:
            offset = 0
def load_audio_blocks(filename, frame_length, block_length, hop_length):
    """Return a generator of overlapping float32 blocks of an audio file.

    Each block spans ``block_length`` frames of ``frame_length`` samples
    taken every ``hop_length`` samples. Consecutive blocks share
    ``frame_length - hop_length`` samples.
    """
    samples_per_block = frame_length + (block_length - 1) * hop_length
    shared_samples = frame_length - hop_length
    return sf.blocks(
        filename,
        blocksize=samples_per_block,
        overlap=shared_samples,
        fill_value=None,
        start=0,
        frames=-1,
        dtype=np.float32,
        always_2d=False,
    )
def transform(self, path):
    """Load an audio file as a tensor of fixed-length, overlapping blocks.

    The file is read in blocks of 160000 frames that overlap by 16000
    frames; the last block is zero-filled to full length. The blocks are
    stacked along a new first axis in file order.

    Parameters
    ----------
    path : str
        Audio file to read.

    Returns
    -------
    torch.Tensor
        Tensor whose first axis indexes the blocks.

    Raises
    ------
    ValueError
        From ``np.stack`` when the file yields no blocks.
    """
    # An array of (index, block) pairs mixes ints with arrays and cannot be
    # turned into a tensor; stacking keeps the block order, which is the
    # only thing the index carried.
    blocks = list(
        sf.blocks(path, blocksize=160000, overlap=16000, fill_value=0))
    return torch.from_numpy(np.stack(blocks))
def play(self):
    """Publish the audio file frame by frame, paced by ``self.frametime``.

    Each block of ``self.framesize`` frames gets a start/finish timestamp
    pair. The method sleeps until the block's finish time, publishes the
    block on ``self.topic``, and uses the finish time as the start of the
    next block.
    """
    for frame in soundfile.blocks(self.file, blocksize=self.framesize):
        if not self.start_time:
            # First frame: anchor the timeline to the current ROS time.
            self.start_time = rospy.Time.now()
        frame_end = self.start_time + self.frametime

        stamps = RecordingTimeStamps()
        stamps.start = self.start_time
        stamps.finish = frame_end

        remaining = frame_end - rospy.Time.now()
        rospy.sleep(remaining)
        self.esiaf_handler.publish(self.topic, frame, msg_to_string(stamps))
        self.start_time = frame_end
def getAttacks(tempo, partitions, filename):
    """Detect attacks and releases from the block-wise RMS of an audio file.

    The window length is ``60 * samplerate / (tempo * partitions)`` samples.
    The file is read in windows of that length overlapping by half, the RMS
    of each window is filtered with ``mf.filter``, and the result is passed
    to ``attacks``.

    Parameters
    ----------
    tempo : number
        Tempo used to derive the window length.
    partitions : number
        Number of windows per beat.
    filename : str
        Audio file to analyse.

    Returns
    -------
    tuple
        ``(att, rls)`` as returned by ``attacks``.
    """
    # Only the sample rate is needed up front, so read the header instead of
    # loading the whole file.
    samplerate = sf.info(filename).samplerate
    windowsize = 60 * samplerate / (tempo * partitions)
    rms = [
        np.sqrt(np.mean(block ** 2))
        for block in sf.blocks(filename,
                               blocksize=int(windowsize),
                               overlap=int(windowsize / 2))
    ]
    rms = mf.filter(rms)
    att, rls = attacks(rms)
    return att, rls
def decode(modFile):
    """Demodulate a WAV file into a list of decoded letters.

    The file is read one bit frame (``sampleRate // BAUD_RATE`` samples) at
    a time. After a ``SPACE`` start bit, the next eight frames are collected
    as data bits and the following frame must be a ``MARK`` stop bit. The
    byte is then converted with ``convertBits``, printed and stored.

    Parameters
    ----------
    modFile : str
        Path of the modulated WAV file.

    Returns
    -------
    list
        The decoded letters in order.

    Raises
    ------
    AssertionError
        If the frame after eight data bits is not a ``MARK`` stop bit.
    """
    # save message for return val
    message = []

    # Only the sample rate is needed here; read the header rather than
    # loading the whole file.
    sampleRate = sf.info(modFile).samplerate
    frameSize = sampleRate // BAUD_RATE
    print(f"\nStats: sample rate {sampleRate}, frame size {frameSize}")

    # create decoder object
    decoder = Demodulator(sampleRate, frameSize)

    # init loop vars
    started = False
    bits = []

    # start of message
    print("\n\"", end=" ")

    # mimic real-time, read blocks from file like it's a buffer
    # NOTE(review): blocks use soundfile's default dtype - confirm the
    # demodulator does not expect WAV_DATATYPE samples.
    for frame in sf.blocks(modFile, blocksize=frameSize):
        if not started:
            # wait for start bit
            if decoder.decode(frame) == SPACE:
                started = True
        elif len(bits) < 8:
            # read the byte
            bits.append(decoder.decode(frame))
        else:
            # verify next bit is end bit
            assert decoder.decode(frame) == MARK, "Stop bit not detected"
            letter = convertBits(bits)
            print(letter, end=" ")
            message.append(letter)
            bits = []
            started = False

    # end of message
    print(" \"\n")
    return message
def run_long_demo(self, wav, feature_converter, cut_time):
    """
    Decode a long recording block by block and return the merged result.

    wav: wav file path
    feature_converter: callable that turns ``[part, part_length]`` into
        model input (the filterbank feature processor)
    cut_time: block length factor; each block holds ``cut_time * 15840``
        frames
    """
    print(
        'Start Decoding, decoding in blocks, block size is approximately 6 seconds'
    )
    self.long_wav = ''
    self.block_size = cut_time * 15840
    segments = sf.blocks(wav,
                         blocksize=self.block_size,
                         overlap=0,
                         dtype='float32')

    result_index_list = [[]]
    preprocess_time = 0
    time_info = dict.fromkeys(
        ('lstmrun', 'convert', 'read', 'preprocess_time'), 0)

    started_at = time.time()
    for segment in segments:
        segment = self.preprocess_part(segment)
        segment_length = torch.tensor([int(segment.shape[1])])

        convert_started = time.time()
        features = feature_converter([segment, segment_length])
        preprocess_time += time.time() - convert_started

        index, segment_times = self._run_short(features)
        result_index_list = self.concate_2list(result_index_list, index)
        for key in segment_times:
            time_info[key] += segment_times[key]
    time_info['preprocess_time'] = preprocess_time
    self.decoding_time = time.time() - started_at

    print()
    self.print_summary()
    print('dpu time:', time_info['lstmrun'])
    # Fix me , result_index is batch 1 ,need to support multi batch index
    return result_index_list
def iter_corpus(only_label=None, as_binary=None, except_label=None):
    """
    Iterates over data, yielding tuples of the form

        ([feature0, feature1, ...featureN], label)

    Every file in ``speakers_data`` is read in ``CHUNK_SIZE`` blocks. Each
    block's spectrogram is restricted by ``only_voice_range`` and flattened
    into the feature vector. Blocks whose feature length differs from the
    file's first block are skipped.

    only_label: if given, only files with this speaker label are used.
    as_binary: if given, the yielded label is ``speaker_label == as_binary``
        instead of the speaker label itself.
    except_label: if given, files with this speaker label are skipped.
    """
    for audio_fn, speaker_label in speakers_data.items():
        if only_label is not None and speaker_label != only_label:
            continue
        if except_label is not None and speaker_label == except_label:
            continue

        # Resolve the label once per file. Overwriting `speaker_label`
        # inside the block loop made every block after the first compare an
        # already-converted boolean against `as_binary`.
        if as_binary is not None:
            label = speaker_label == as_binary
        else:
            label = speaker_label

        # https://stackoverflow.com/a/44800492/247542
        first_len = None
        audio_path = os.path.join(DATA_DIR, audio_fn)
        for sample in sf.blocks(audio_path, blocksize=CHUNK_SIZE):
            f, t, Sxx = signal.spectrogram(sample, RATE)

            # Limit frequencies to the human voice range.
            f, Sxx = only_voice_range(f, Sxx)
            Sxx = Sxx.flatten()

            if first_len is None:
                first_len = len(Sxx)
            elif len(Sxx) != first_len:
                continue
            assert Sxx.shape == (8, )

            yield Sxx, label  # X, y
def iter_corpus(only_label=None, as_binary=None, except_label=None):
    """
    Iterates over data, yielding tuples of the form

        ([feature0, feature1, ...featureN], label)

    Every file in ``speakers_data`` is read in ``CHUNK_SIZE`` blocks. Each
    block's spectrogram is restricted by ``only_voice_range`` and flattened
    into the feature vector. Blocks whose feature length differs from the
    file's first block are skipped.

    only_label: if given, only files with this speaker label are used.
    as_binary: if given, the yielded label is ``speaker_label == as_binary``
        instead of the speaker label itself.
    except_label: if given, files with this speaker label are skipped.
    """
    for audio_fn, speaker_label in speakers_data.items():
        if only_label is not None and speaker_label != only_label:
            continue
        if except_label is not None and speaker_label == except_label:
            continue

        # Resolve the label once per file. Overwriting `speaker_label`
        # inside the block loop made every block after the first compare an
        # already-converted boolean against `as_binary`.
        if as_binary is not None:
            label = speaker_label == as_binary
        else:
            label = speaker_label

        # https://stackoverflow.com/a/44800492/247542
        first_len = None
        audio_path = os.path.join(DATA_DIR, audio_fn)
        for sample in sf.blocks(audio_path, blocksize=CHUNK_SIZE):
            f, t, Sxx = signal.spectrogram(sample, RATE)

            # Limit frequencies to the human voice range.
            f, Sxx = only_voice_range(f, Sxx)
            Sxx = Sxx.flatten()

            if first_len is None:
                first_len = len(Sxx)
            elif len(Sxx) != first_len:
                continue
            assert Sxx.shape == (8,)

            yield Sxx, label  # X, y
def process(s, alpha):
    """Find envelope peaks in one audio file and return them as a DataFrame.

    The file ``wd + '/' + s`` is read in blocks of ``fs * 10`` frames. Each
    block is passed through ``filt``, and the concatenated envelope is
    searched with ``peak_find``. ``alpha`` is accepted but not used.

    Returns
    -------
    pandas.DataFrame
        Columns ``Time``, ``Amp`` and ``File`` (the file name repeated for
        every peak).
    """
    print("Analyzing the file " + s)

    # block processing
    audio_path = wd + '/' + s
    envelopes = []
    for block in sf.blocks(audio_path, blocksize=fs * 10):
        envelopes.append(filt(block))
    env = np.concatenate(envelopes)

    peaks = peak_find(env)

    # add file name as a separate column
    # NOTE(review): stacking a string column onto the peaks presumably turns
    # Time/Amp into strings as well - confirm callers expect that.
    name_column = np.array([[s]] * len(peaks))
    result = np.hstack((peaks, name_column))

    # convert into dataframe
    columns = {
        'Time': result[:, 0],
        'Amp': result[:, 1],
        'File': result[:, 2],
    }
    return pd.DataFrame(columns)
def FileAudioSource(path, chunk_size=4096):
    """
    Simple audio file reader. Should be compatible with all files supported
    by 'soundfile' package.

    chunk_size is in samples, so the size in bytes of the sent packet is
    2*chunk_size, since we are sending 16-bit signed PCM samples. chunk_size*2
    should be smaller than the predefined maximum payload from the configured
    websocket connection.

    :path: Path to the audio input (any format supported by soundfile package)
    :chunk_size: Size of the blocks of audio which will be sent (in samples)
    :yields: bytestrings of size <chunk_size> * 2

    Terminates when the audio file provided has no more content
    """
    for block in sf.blocks(path, chunk_size):
        # Soundfile converts to 64-bit float ndarray. We convert back to
        # bytes, clipping first so a full-scale sample (1.0 -> 32768) cannot
        # overflow the int16 range.
        scaled = (block * 2**15).clip(-2**15, 2**15 - 1)
        bytestr = scaled.astype('<i2').tobytes()
        yield bytestr
def getBeatsBl(audiosrc):
    """Track beats over consecutive 10-second slices of an audio file.

    The file is loaded with ``librosa.load`` and cut at multiples of
    ``10*sr`` samples. For each complete slice the onset strength and the
    beat frames are computed. Beat frames are converted to times with
    ``librosa.frames_to_time`` and collected in the returned list.

    NOTE(review): the statements after the first ``return`` are unreachable
    and reference ``src``, which is not defined in this function.
    """
    y, sr = librosa.load(audiosrc)
    # Slice boundaries every 10 seconds of samples; pairing bl[:-1] with
    # bl[1:] leaves out any trailing partial slice.
    bl = range(0,len(y),10*sr)
    spl = list(zip(bl[:-1],bl[1:]))
    tempo = []
    beats=[]
    ts = []
    for start, end in spl:
        onset_env = librosa.onset.onset_strength(y=y[start:end], sr=sr, aggregate=np.median)
        temp, beat = librosa.beat.beat_track(y=y[start:end], sr=sr, onset_envelope=onset_env)
        # tst = librosa.frames_to_time(beat, sr=sr)
        beats.append(beat)
        tempo.append(temp)
        #import pdb;pdb.set_trace()
        # NOTE(review): this converts the whole accumulated `beats` list, not
        # just this slice's `beat` - confirm that is intended.
        tst = librosa.frames_to_time(beats, sr=sr)
        ts.append(tst)
        # import pdb;pdb.set_trace()
    return(ts)
    ablocks = sf.blocks(src, blocksize=1024)
    srate = sf.info(src).samplerate
    return(ablocks, srate)
def test_blocks_fill_last_block(file_stereo_r):
    """With ``fill_value`` the short final block is padded to full size."""
    padded_tail = np.vstack([data_stereo[3:4], np.zeros((2, 2))])
    expected = [data_stereo[0:3], padded_tail]
    actual = [
        block
        for block in sf.blocks(file_stereo_r, blocksize=3, fill_value=0)
    ]
    assert_equal_list_of_arrays(actual, expected)
def test_blocks_partial_last_block(file_stereo_r):
    """Without ``fill_value`` the final block only holds the remaining frames."""
    expected = [data_stereo[0:3], data_stereo[3:4]]
    actual = [block for block in sf.blocks(file_stereo_r, blocksize=3)]
    assert_equal_list_of_arrays(actual, expected)
def test_blocks_full_last_block(file_stereo_r):
    """A block size that divides the file evenly gives only full blocks."""
    expected = [data_stereo[0:2], data_stereo[2:4]]
    actual = [block for block in sf.blocks(file_stereo_r, blocksize=2)]
    assert_equal_list_of_arrays(actual, expected)
def test_blocks_without_blocksize():
    """Iterating blocks without ``blocksize`` (or ``out``) raises TypeError."""
    with pytest.raises(TypeError):
        for _ in sf.blocks(filename_stereo):
            pass
def stream(path, block_length, frame_length, hop_length,
           mono=True, offset=0.0, duration=None, fill_value=None,
           dtype=np.float32):
    '''Stream audio in fixed-length buffers.

    This is primarily useful for processing large files that won't
    fit entirely in memory at once.

    Instead of loading the entire audio signal into memory (as
    in `load()`), this function produces *blocks* of audio spanning
    a fixed number of frames at a specified frame length and hop
    length.

    While this function strives for similar behavior to `load`,
    there are a few caveats that users should be aware of:

        1. This function does not return audio buffers directly.
           It returns a generator, which you can iterate over
           to produce blocks of audio. A *block*, in this context,
           refers to a buffer of audio which spans a given number of
           (potentially overlapping) frames.
        2. Automatic sample-rate conversion is not supported.
           Audio will be streamed in its native sample rate,
           so no default values are provided for `frame_length`
           and `hop_length`. It is recommended that you first
           get the sampling rate for the file in question, using
           `get_samplerate()`, and set these parameters accordingly.
        3. Many analyses require access to the entire signal
           to behave correctly, such as `resample`, `cqt`, or
           `beat_track`, so these methods will not be appropriate
           for streamed data.
        4. The `block_length` parameter specifies how many frames
           of audio will be produced per block. Larger values will
           consume more memory, but will be more efficient to process
           down-stream. The best value will ultimately depend on your
           application and other system constraints.
        5. By default, most librosa analyses (e.g., short-time Fourier
           transform) assume centered frames, which requires padding the
           signal at the beginning and end. This will not work correctly
           when the signal is carved into blocks, because it would
           introduce padding in the middle of the signal. To disable
           this feature, use `center=False` in all frame-based analyses.

    See the examples below for proper usage of this function.

    Parameters
    ----------
    path : string, int, or file-like object
        path to the input file to stream.

        Any codec supported by `soundfile` is permitted here.

    block_length : int > 0
        The number of frames to include in each block.

        Note that at the end of the file, there may not be enough
        data to fill an entire block, resulting in a shorter block
        by default. To pad the signal out so that blocks are always
        full length, set `fill_value` (see below).

    frame_length : int > 0
        The number of samples per frame.

    hop_length : int > 0
        The number of samples to advance between frames.

        Note that when `hop_length < frame_length`, neighboring frames
        will overlap. Similarly, the last frame of one *block* will
        overlap with the first frame of the next *block*.

    mono : bool
        Convert the signal to mono during streaming

    offset : float
        Start reading after this time (in seconds)

    duration : float
        Only load up to this much audio (in seconds)

    fill_value : float [optional]
        If padding the signal to produce constant-length blocks,
        this value will be used at the end of the signal.

        In most cases, `fill_value=0` (silence) is expected, but
        you may specify any value here.

    dtype : numeric type
        data type of audio buffers to be produced

    Yields
    ------
    y : np.ndarray
        An audio buffer of (at most)
        `(block_length-1) * hop_length + frame_length` samples.

    Raises
    ------
    ParameterError
        If `block_length`, `frame_length` or `hop_length` is not a
        positive integer.

    See Also
    --------
    load
    get_samplerate
    soundfile.blocks

    Examples
    --------
    Apply a short-term Fourier transform to blocks of 256 frames
    at a time. Note that streaming operation requires left-aligned
    frames, so we must set `center=False` to avoid padding artifacts.

    >>> filename = librosa.util.example_audio_file()
    >>> sr = librosa.get_samplerate(filename)
    >>> stream = librosa.stream(filename,
    ...                         block_length=256,
    ...                         frame_length=4096,
    ...                         hop_length=1024)
    >>> for y_block in stream:
    ...     D_block = librosa.stft(y_block, center=False)

    Or compute a mel spectrogram over a stream, using a shorter frame
    and non-overlapping windows

    >>> filename = librosa.util.example_audio_file()
    >>> sr = librosa.get_samplerate(filename)
    >>> stream = librosa.stream(filename,
    ...                         block_length=256,
    ...                         frame_length=2048,
    ...                         hop_length=2048)
    >>> for y_block in stream:
    ...     m_block = librosa.feature.melspectrogram(y_block, sr=sr,
    ...                                              n_fft=2048,
    ...                                              hop_length=2048,
    ...                                              center=False)
    '''
    # Fill the placeholder so the message reports the offending value.
    if not (np.issubdtype(type(block_length), np.integer) and block_length > 0):
        raise ParameterError(
            'block_length={} must be a positive integer'.format(block_length))

    if not (np.issubdtype(type(frame_length), np.integer) and frame_length > 0):
        raise ParameterError(
            'frame_length={} must be a positive integer'.format(frame_length))

    if not (np.issubdtype(type(hop_length), np.integer) and hop_length > 0):
        raise ParameterError(
            'hop_length={} must be a positive integer'.format(hop_length))

    # Get the sample rate from the file info
    sr = sf.info(path).samplerate

    # Construct the stream
    if offset:
        start = int(offset * sr)
    else:
        start = 0

    if duration:
        frames = int(duration * sr)
    else:
        frames = -1

    blocks = sf.blocks(path,
                       blocksize=frame_length + (block_length - 1) * hop_length,
                       overlap=frame_length - hop_length,
                       fill_value=fill_value,
                       start=start,
                       frames=frames,
                       dtype=dtype,
                       always_2d=False)

    for block in blocks:
        if mono:
            yield to_mono(block.T)
        else:
            yield block.T
def test_blocks_with_overlap(file_stereo_r):
    """Consecutive blocks share ``overlap`` frames."""
    expected = [data_stereo[0:3], data_stereo[1:4]]
    actual = [
        block for block in sf.blocks(file_stereo_r, blocksize=3, overlap=2)
    ]
    assert_equal_list_of_arrays(actual, expected)
def test_blocks_with_start(file_stereo_r):
    """Reading begins at the frame given by ``start``."""
    expected = [data_stereo[2:4]]
    actual = [
        block for block in sf.blocks(file_stereo_r, blocksize=2, start=2)
    ]
    assert_equal_list_of_arrays(actual, expected)
def test_blocks_mono():
    """Mono int16 blocks are one-dimensional and the last one is zero-filled."""
    expected = [[0, 1, 2], [-2, -1, 0]]
    block_iter = sf.blocks(
        filename_mono, blocksize=3, dtype='int16', fill_value=0)
    actual = [block for block in block_iter]
    assert_equal_list_of_arrays(actual, expected)
def detect_silent(self, block_size=1.0, slide_size=0.1, threshold=0.2, col=0):
    '''Build a per-sample silence mask by sliding a window over the file.

    All arguments except ``col`` are given in seconds or amplitude units:

    block_size : window length in seconds (a stretch of at least this much
        silence is to be removed)
    slide_size : step between consecutive windows in seconds (gaps of this
        size are tolerated)
    threshold : samples whose absolute value does not exceed this are
        treated as silent
    col : channel index used for multi-channel files

    A window counts as sound (1) when any sample exceeds the threshold,
    otherwise as silent (0). Every window's verdict is expanded to
    ``slide_size`` worth of samples, except the last window, which is
    expanded to its own length.

    Sets ``self.silent_judge_final`` (array of 0/1 values) and
    ``self.silent_mask`` (boolean array, True where sound was detected).
    '''
    blocksize_frames = int(block_size * self.samplerate)
    slide_frames = int(slide_size * self.samplerate)
    overlap_frames = blocksize_frames - slide_frames
    print(blocksize_frames)
    print(slide_frames)
    print(overlap_frames)

    # One verdict per window: 0 = silent, 1 = sound.
    silent_judge = []
    length_last_block = 0
    for block in sf.blocks(self.fname, blocksize=blocksize_frames,
                           overlap=overlap_frames):
        samples = block if block.ndim == 1 else block[:, col]
        magnitude = abs(samples)

        # Always remember the length of the most recent window. A
        # full-length final window used to be recorded as 0, which dropped
        # its samples from the mask entirely.
        length_last_block = len(magnitude)
        if length_last_block != blocksize_frames:
            print("last")
            print(length_last_block)

        silent_judge.append(1 if np.any(magnitude > threshold) else 0)

    # Expand the window verdicts into a per-sample array.
    final_silent_judge = []
    last_index = len(silent_judge) - 1
    for i, verdict in enumerate(silent_judge):
        run_length = length_last_block if i == last_index else slide_frames
        final_silent_judge.extend([verdict] * run_length)

    self.silent_judge_final = np.array(final_silent_judge)
    self.silent_mask = np.array(final_silent_judge) == 1
def test_blocks_with_frames_and_fill_value(file_stereo_r):
    """``frames`` limits the data read; ``fill_value`` pads the last block."""
    padded_tail = np.vstack([data_stereo[2:3], np.zeros((1, 2))])
    expected = [data_stereo[0:2], padded_tail]
    block_iter = sf.blocks(
        file_stereo_r, blocksize=2, frames=3, fill_value=0)
    actual = [block for block in block_iter]
    assert_equal_list_of_arrays(actual, expected)
def test_blocks_with_stop_smaller_than_start(file_stereo_r):
    """A ``stop`` before ``start`` yields no blocks at all."""
    block_iter = sf.blocks(file_stereo_r, blocksize=2, start=2, stop=1)
    collected = [block for block in block_iter]
    assert collected == []
def test_blocks_with_frames(file_stereo_r):
    """``frames`` caps the total number of frames read across blocks."""
    expected = [data_stereo[0:2], data_stereo[2:3]]
    actual = [
        block for block in sf.blocks(file_stereo_r, blocksize=2, frames=3)
    ]
    assert_equal_list_of_arrays(actual, expected)
def test_blocks_with_negative_start_and_stop(file_stereo_r):
    """Negative ``start``/``stop`` are counted from the end of the file."""
    expected = [data_stereo[-2:-1]]
    block_iter = sf.blocks(file_stereo_r, blocksize=2, start=-2, stop=-1)
    actual = [block for block in block_iter]
    assert_equal_list_of_arrays(actual, expected)
def test_blocks_with_too_large_stop(file_stereo_r):
    """A ``stop`` beyond the end of the file is clamped to the file length."""
    expected = [data_stereo[0:3], data_stereo[3:4]]
    actual = [
        block for block in sf.blocks(file_stereo_r, blocksize=3, stop=666)
    ]
    assert_equal_list_of_arrays(actual, expected)
def test_blocks_with_too_large_start(file_stereo_r):
    """A ``start`` beyond the end of the file yields a single empty block."""
    expected = [[]]
    actual = [
        block for block in sf.blocks(file_stereo_r, blocksize=2, start=666)
    ]
    assert_equal_list_of_arrays(actual, expected)
def test_blocks_with_stop(file_stereo_r):
    """``stop`` ends reading early; combining it with ``frames`` is an error."""
    expected = [data_stereo[0:2]]
    actual = [
        block for block in sf.blocks(file_stereo_r, blocksize=2, stop=2)
    ]
    assert_equal_list_of_arrays(actual, expected)

    with pytest.raises(TypeError):
        for _ in sf.blocks(filename_stereo, blocksize=2, frames=2, stop=2):
            pass
def test_blocks_with_out(file_stereo_r):
    """Blocks are written into ``out``; ``out`` excludes ``blocksize``."""
    out = np.empty((3, 2))
    collected = list(sf.blocks(file_stereo_r, out=out))
    first = collected[0]
    second = collected[1]

    assert first is out
    # First frame was overwritten by second block:
    assert (first == data_stereo[[3, 1, 2]]).all()
    assert second.base is out
    assert (second == data_stereo[[3]]).all()

    with pytest.raises(TypeError):
        for _ in sf.blocks(filename_stereo, blocksize=3, out=out):
            pass