import json
import os
from pathlib import Path

import numpy as np
from django.db.models import Case, When, F

# The application-level models and helpers used below (Database, AudioFile,
# Segment, DataMatrix, get_or_error, ndarray_to_bytes, feature_map,
# encode_syllables, data_set) are assumed to be imported from the surrounding
# project; their exact module paths are not shown in this excerpt.


def encode_into_datamatrix(variables, encoder, session, database_name, kernel_only):
    """Encode every segment of a database with the autoencoder and store the
    result as a DataMatrix."""
    with_duration = variables['with_duration']
    dm_name = variables['dm_name']
    ndims = encoder.latent_dims

    database = get_or_error(Database, dict(name__iexact=database_name))
    audio_files = AudioFile.objects.filter(database=database)
    segments = Segment.objects.filter(audio_file__in=audio_files)

    # encode_syllables returns a dict mapping segment id -> latent vector
    encoding_result = encode_syllables(variables, encoder, session, segments, kernel_only)
    features_value = np.array(list(encoding_result.values()))
    sids = np.array(list(encoding_result.keys()), dtype=np.int32)

    # Sort rows by segment id so the saved sids and feature rows stay aligned
    sid_sorted_inds = np.argsort(sids)
    sids = sids[sid_sorted_inds]
    features_value = features_value[sid_sorted_inds]

    # Standard Django idiom: order the queryset by the explicit list of ids
    preserved = Case(*[When(id=id, then=pos) for pos, id in enumerate(sids)])
    segments = segments.order_by(preserved)
    tids = segments.values_list('tid', flat=True)

    features = [feature_map['s2s_autoencoded']]
    col_inds = {'s2s_autoencoded': [0, ndims]}

    if with_duration:
        # Append each syllable's duration (ms) as one extra column
        features.append(feature_map['duration'])
        col_inds['duration'] = [ndims, ndims + 1]
        durations = list(
            segments.annotate(duration=F('end_time_ms') - F('start_time_ms'))
            .values_list('duration', flat=True))
        durations = np.array(durations)
        assert len(durations) == len(sids)
        features_value = np.concatenate((features_value, durations.reshape(-1, 1)), axis=1)

    features_value = features_value.astype(np.float32)

    dm = DataMatrix(database=database)
    dm.name = dm_name
    dm.ndims = ndims
    dm.features_hash = '-'.join([str(x.id) for x in features])
    dm.aggregations_hash = ''
    dm.save()

    full_sids_path = dm.get_sids_path()
    full_tids_path = dm.get_tids_path()
    full_bytes_path = dm.get_bytes_path()
    full_cols_path = dm.get_cols_path()

    # Persist the matrix, its row indices (sids/tids) and the column map
    ndarray_to_bytes(features_value, full_bytes_path)
    ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)
    ndarray_to_bytes(np.array(tids, dtype=np.int32), full_tids_path)

    with open(full_cols_path, 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)
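# A minimal, self-contained sketch (hypothetical values, not part of the
# original module) of the id-alignment step above: encode_syllables yields a
# dict keyed by segment id, and both the ids and the feature rows are sorted
# by id so the saved sids/bytes arrays line up row-for-row.
#
#     encoding_result = {12: [0.1, 0.2], 7: [0.3, 0.4], 9: [0.5, 0.6]}
#     sids = np.array(list(encoding_result.keys()), dtype=np.int32)
#     features_value = np.array(list(encoding_result.values()))
#     order = np.argsort(sids)
#     sids[order]            # -> array([ 7,  9, 12], dtype=int32)
#     features_value[order]  # rows reordered to match the sorted ids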
# handle() below belongs to a Django management command; its Command class and
# add_arguments declaration are not shown in this excerpt.
def handle(self, *args, **options):
    path = options['path']
    if not os.path.isfile(path):
        raise Exception('File {} not found'.format(path))

    database_name = options['database_name']
    dm_name = options['dm_name']
    database = get_or_error(Database, dict(name__iexact=database_name))

    dataset = data_set.load(Path(path))
    features = dataset.features
    filenames = dataset.filenames

    # Each file is named '<segment_id>' plus a 4-character suffix; strip the
    # suffix to recover the numeric segment id
    sids = [int(x[:-4]) for x in filenames]
    nobs, ndims = dataset.features.shape

    # Return the segments in the same order as the rows of the feature matrix
    preserved = Case(*[When(id=id, then=pos) for pos, id in enumerate(sids)])
    segments = Segment.objects.filter(id__in=sids).order_by(preserved)
    tids = segments.values_list('tid', flat=True)

    col_inds = {'s2s_autoencoded': [0, ndims]}

    dm = DataMatrix(database=database)
    dm.name = dm_name
    dm.ndims = ndims
    dm.features_hash = 's2s_autoencoded'
    dm.aggregations_hash = ''
    dm.save()

    full_sids_path = dm.get_sids_path()
    full_tids_path = dm.get_tids_path()
    full_bytes_path = dm.get_bytes_path()
    full_cols_path = dm.get_cols_path()

    # Persist the matrix, its row indices (sids/tids) and the column map
    ndarray_to_bytes(features, full_bytes_path)
    ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)
    ndarray_to_bytes(np.array(tids, dtype=np.int32), full_tids_path)

    with open(full_cols_path, 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)
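# Usage sketch (assumptions, not confirmed by this excerpt): the command's
# add_arguments is expected to declare 'path', 'database_name' and 'dm_name',
# and the dataset's filenames follow the '<segment_id>' + 4-character-suffix
# convention relied on above. Hypothetical values for illustration:
#
#     filenames = ['101.wav', '57.wav', '203.wav']
#     sids = [int(x[:-4]) for x in filenames]   # -> [101, 57, 203]
#
# The cols file written at the end maps each feature name to its [start, end)
# column range in the stored matrix, e.g. {"s2s_autoencoded": [0, ndims]}.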