def get_embeddings(melspecs: dict[str, np.ndarray], architectures: dict, predictors: dict) -> Optional[dict]:
    """Compute embeddings for every (dataset, architecture, layer) combination.

    Feeds the precomputed mel-spectrogram for each architecture into every
    dataset-specific predictor and collects the configured output layers.

    Returns a dict keyed '{dataset}-{architecture}-{layer}' mapping to a 2-D
    embeddings array, or None when a predictor raises RuntimeError or an
    output layer comes back empty.
    """
    embeddings_by_key = {}
    for arch_name, arch_meta in architectures.items():
        model_input = Pool()
        model_input.set('model/Placeholder', melspecs[arch_meta['essentia-algorithm']])
        for dataset in arch_meta['datasets']:
            # TODO: chunk the input melspecs to avoid OOM error
            try:
                model_output = predictors[f'{dataset}-{arch_name}'](model_input)
            except RuntimeError:
                return None
            for layer, layer_meta in arch_meta['layers'].items():
                layer_embeddings = model_output[layer_meta['name']].squeeze()
                if len(layer_embeddings) == 0:
                    return None
                # A single patch squeezes down to 1-D; restore the batch axis
                # so every stored array is consistently 2-D.
                if layer_embeddings.ndim == 1:
                    layer_embeddings = np.expand_dims(layer_embeddings, axis=0)
                embeddings_by_key[f'{dataset}-{arch_name}-{layer}'] = layer_embeddings
    return embeddings_by_key
# Segment the input audio and compute the descriptor hierarchy per segment.
# NOTE(review): indentation reconstructed from a collapsed source line — the
# placement of the trailing `startTime = ...` (after the for loop, inside the
# `if`) is inferred from the preceding comment; confirm against the original.
if opt.segmentation:
    INFO('Process step 2: Low Level')
    # Low-level descriptors over the whole file feed the segmenter below.
    computeLowLevel(input_file, pool, startTime, endTime)
    segmentation.compute(input_file, pool, startTime, endTime)
    # Segment boundaries in seconds; consecutive pairs delimit each segment.
    segments = pool['segmentation.timestamps']
    for i in xrange(len(segments)-1):
        startTime = segments[i]
        endTime = segments[i+1]
        INFO('**************************************************************************')
        INFO('Segment ' + str(i) + ': processing audio from ' + str(startTime) + 's to ' + str(endTime) + 's')
        INFO('**************************************************************************')
        # set segment name:
        segment_name = 'segment_'+ str(i)
        pool.set('segments.'+segment_name+'.name', segment_name)
        # set segment scope:
        pool.set('segments.'+segment_name+'.scope', numpy.array([startTime, endTime]))
        # compute descriptors:
        # Each segment gets its own pool namespace so descriptors don't clash.
        namespace = 'segments.'+segment_name+'.descriptors'
        segments_namespace.append(namespace)
        INFO('\tProcess step 2: Low Level')
        computeLowLevel(input_file, pool, startTime, endTime, namespace)
        INFO('\tProcess step 3: Mid Level')
        computeMidLevel(input_file, pool, startTime, endTime, namespace)
        INFO('\tProcess step 4: High Level')
        highlevel.compute(pool, namespace)
    # compute the rest of the descriptors for the entire audio. LowLevel
    # descriptors were already computed during segmentation
    # Restore the caller-requested start time after the loop clobbered it.
    startTime = float(opt.startTime)
# Per-segment descriptor pass: for every pair of consecutive segmentation
# timestamps, tag the segment in the pool and run the low/mid/high-level
# descriptor chain inside a segment-specific namespace.
# NOTE(review): indentation reconstructed from a collapsed source line.
for i in xrange(len(segments) - 1):
    # startTime/endTime are reused loop-locally; a later step is expected to
    # restore the whole-file values (see trailing comment below).
    startTime = segments[i]
    endTime = segments[i + 1]
    INFO(
        '**************************************************************************'
    )
    INFO('Segment ' + str(i) + ': processing audio from ' + str(startTime) +
         's to ' + str(endTime) + 's')
    INFO(
        '**************************************************************************'
    )
    # set segment name:
    segment_name = 'segment_' + str(i)
    pool.set('segments.' + segment_name + '.name', segment_name)
    # set segment scope:
    pool.set('segments.' + segment_name + '.scope',
             numpy.array([startTime, endTime]))
    # compute descriptors:
    # Namespacing keeps per-segment descriptors separate in the shared pool.
    namespace = 'segments.' + segment_name + '.descriptors'
    segments_namespace.append(namespace)
    INFO('\tProcess step 2: Low Level')
    computeLowLevel(input_file, pool, startTime, endTime, namespace)
    INFO('\tProcess step 3: Mid Level')
    computeMidLevel(input_file, pool, startTime, endTime, namespace)
    INFO('\tProcess step 4: High Level')
    highlevel.compute(pool, namespace)
# compute the rest of the descriptors for the entire audio. LowLevel
# descriptors were already computed during segmentation
class ModelsWrapper:
    """Wrapper around an Essentia TensorFlow model (MusiCNN or VGGish).

    Extracts per-frame input features from audio, batches them into
    fixed-size patches, and runs them through a loaded TensorflowPredict
    graph.
    """

    def __init__(self, arch):
        """Configure feature extraction for *arch* ('musicnn' or 'vggish').

        NOTE(review): an unrecognized arch leaves feature_extractor and the
        frame/patch parameters unset, so later calls fail with
        AttributeError; consider raising ValueError here.
        """
        # Attribute keeps its original (misspelled) name for compatibility
        # with existing callers.
        self.architechture = arch
        self.in_layer = None
        self.out_layer = None
        if arch == 'musicnn':
            self.feature_extractor = es.TensorflowInputMusiCNN()
            self.frame_size = 512
            self.hop_size = 256
            self.patch_size = 187
            self.num_bands = 96
        elif arch == 'vggish':
            self.feature_extractor = es.TensorflowInputVGGish()
            self.frame_size = 400
            self.hop_size = 200
            self.patch_size = 96
            self.num_bands = 64
        self.feature_frames = []
        self.in_pool = Pool()
        self.out_pool = Pool()
        # setup model
        self.predict = None

    def load_model(self, model_path, in_layer, out_layer):
        """Load the TF graph at *model_path* once; later calls are no-ops
        while a model is already loaded."""
        if not self.predict:
            self.predict = es.TensorflowPredict(graphFilename=model_path,
                                                inputs=[in_layer],
                                                outputs=[out_layer],
                                                squeeze=True)
            self.in_layer = in_layer
            self.out_layer = out_layer

    def compute_features(self, audio):
        """Compute per-frame input features for *audio* and cache them on
        self.feature_frames (also returned)."""
        self.feature_frames = []  # ensure it's empty
        for frame in es.FrameGenerator(audio, frameSize=self.frame_size,
                                       hopSize=self.hop_size,
                                       startFromZero=True):
            self.feature_frames.append(self.feature_extractor(frame))
        return self.feature_frames

    def make_prediction(self):
        """Run the loaded model over the cached features and return the
        configured output layer's tensor."""
        self._featuresToTensorAsBatch()
        self.out_pool.clear()
        self.out_pool = self.predict(self.in_pool)
        return self.out_pool[self.out_layer]

    def _featuresToTensorAsBatch(self):
        """Reshape cached feature frames into a (batch, 1, patch, bands)
        tensor, zero-padding only the final incomplete patch."""
        feature_frames_as_np = np.array(self.feature_frames, dtype=np.single)
        # BUGFIX: pad with `-n % patch_size` frames. The previous
        # `patch_size - (n % patch_size)` appended a full all-zero patch
        # (an extra garbage batch element) whenever the frame count was
        # already an exact multiple of patch_size.
        zero_frame_size = -feature_frames_as_np.shape[0] % self.patch_size
        zero_frames = np.zeros((zero_frame_size, self.num_bands),
                               dtype=np.single)
        zero_padded_features = np.append(feature_frames_as_np, zero_frames,
                                         axis=0)
        batch = np.expand_dims(
            np.reshape(zero_padded_features,
                       [-1, self.patch_size, self.num_bands]), 1)
        self.in_pool.set(self.in_layer, batch)

    def dispose(self):
        """Drop the loaded model so its memory can be reclaimed; load_model
        must be called again before the next prediction."""
        # clear model from memory
        self.predict = None
        self.in_layer = None
# Segment the input audio and compute descriptors per segment, maintaining
# two parallel pools: eqPool (equal-loudness filtered chain, also used for
# segmentation) and neqPool (unfiltered chain).
# NOTE(review): indentation reconstructed from a collapsed source line —
# confirm loop body extent against the original file.
if opt.segmentation:
    INFO("Process step 2: Low Level")
    computeLowLevel(input_file, neqPool, eqPool, startTime, endTime)
    # Segmentation runs on the equal-loudness pool only.
    segmentation.compute(input_file, eqPool, startTime, endTime)
    segments = eqPool["segmentation.timestamps"]
    for i in xrange(len(segments) - 1):
        startTime = segments[i]
        endTime = segments[i + 1]
        INFO("**************************************************************************")
        INFO("Segment " + str(i) + ": processing audio from " + str(startTime) + "s to " + str(endTime) + "s")
        INFO("**************************************************************************")
        # set segment name:
        segment_name = "segment_" + str(i)
        neqPool.set("segments." + segment_name + ".name", segment_name)
        eqPool.set("segments." + segment_name + ".name", segment_name)
        # set segment scope:
        neqPool.set("segments." + segment_name + ".scope", numpy.array([startTime, endTime]))
        eqPool.set("segments." + segment_name + ".scope", numpy.array([startTime, endTime]))
        # compute descriptors:
        # Both pools share the same per-segment namespace.
        namespace = "segments." + segment_name + ".descriptors"
        segments_namespace.append(namespace)
        INFO("\tProcess step 2: Low Level")
        computeLowLevel(input_file, neqPool, eqPool, startTime, endTime, namespace)
        INFO("\tProcess step 3: Mid Level")
        computeMidLevel(input_file, neqPool, eqPool, startTime, endTime, namespace)
        INFO("\tProcess step 4: High Level")
        highlevel.compute(eqPool, namespace)
        highlevel.compute(neqPool, namespace)