def encode_into_datamatrix(variables, encoder, session, database_name, kernel_only):
    """Encode every segment of a database with the autoencoder and persist the result as a DataMatrix.

    :param variables: dict of run options; reads 'with_duration' (append segment duration
        as an extra feature column) and 'dm_name' (name for the new DataMatrix)
    :param encoder: trained autoencoder; its latent_dims gives the feature width
    :param session: TF session passed through to encode_syllables
    :param database_name: case-insensitive name of the Database to encode
    :param kernel_only: passed through to encode_syllables
    :raises ValueError: if the number of annotated durations does not match the
        number of encoded segments
    """
    with_duration = variables['with_duration']
    dm_name = variables['dm_name']
    ndims = encoder.latent_dims

    database = get_or_error(Database, dict(name__iexact=database_name))
    audio_files = AudioFile.objects.filter(database=database)
    segments = Segment.objects.filter(audio_file__in=audio_files)

    encoding_result = encode_syllables(variables, encoder, session, segments, kernel_only)
    features_value = np.array(list(encoding_result.values()))
    sids = np.array(list(encoding_result.keys()), dtype=np.int32)

    # Sort rows by segment id so features, sids and tids line up deterministically
    sid_sorted_inds = np.argsort(sids)
    sids = sids[sid_sorted_inds]
    features_value = features_value[sid_sorted_inds]

    # Force the queryset into the same order as sids so tids matches row-for-row
    preserved = Case(*[When(id=sid, then=pos) for pos, sid in enumerate(sids)])
    segments = segments.order_by(preserved)
    tids = segments.values_list('tid', flat=True)

    features = [feature_map['s2s_autoencoded']]
    col_inds = {'s2s_autoencoded': [0, ndims]}
    if with_duration:
        features.append(feature_map['duration'])
        col_inds['duration'] = [ndims, ndims + 1]
        durations = list(
            segments.annotate(duration=F('end_time_ms') - F('start_time_ms'))
            .values_list('duration', flat=True))
        durations = np.array(durations)
        # Was a bare assert, which is stripped under `python -O`; raise explicitly instead
        if len(durations) != len(sids):
            raise ValueError('Number of durations ({}) does not match number of segments ({})'
                             .format(len(durations), len(sids)))
        features_value = np.concatenate((features_value, durations.reshape(-1, 1)), axis=1)

    features_value = features_value.astype(np.float32)

    dm = DataMatrix(database=database)
    dm.name = dm_name
    # NOTE(review): ndims stays at latent_dims even when the duration column is
    # appended (matrix is then ndims+1 wide) — preserved as-is; confirm downstream readers
    dm.ndims = ndims
    dm.features_hash = '-'.join([str(x.id) for x in features])
    dm.aggregations_hash = ''
    dm.save()

    full_sids_path = dm.get_sids_path()
    full_tids_path = dm.get_tids_path()
    full_bytes_path = dm.get_bytes_path()
    full_cols_path = dm.get_cols_path()

    ndarray_to_bytes(features_value, full_bytes_path)
    ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)
    ndarray_to_bytes(np.array(tids, dtype=np.int32), full_tids_path)

    with open(full_cols_path, 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)
def handle(self, *args, **options):
    """Queue (or run synchronously) full feature extraction for a database.

    Reads options 'database_name', 'celery' (dispatch via celery worker) and
    'save_db' (persist the DataMatrix/Task rows vs. use an in-memory NonDbTask).
    """
    database_name = options['database_name']
    celery = options['celery']
    save_db = options['save_db']

    if not save_db and celery:
        warning('celery reverted to False because save_db is False')
        # Bug fix: the warning above claimed this reversion, but `celery` was never
        # actually reset, so a NonDbTask (no persisted id) would be sent to .delay()
        celery = False

    database = get_or_error(Database, dict(name__iexact=database_name))
    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

    # Only features the extraction code actually knows how to compute
    enabled_features = [f for f in features if f.name in feature_map]

    features_hash = '-'.join(str(f.id) for f in enabled_features)
    aggregations_hash = '-'.join(map(str, aggregations.values_list('id', flat=True)))

    user = User.objects.get(username='******')

    if save_db:
        dm = DataMatrix(database=database)
        dm.ndims = 0
        dm.name = uuid.uuid4().hex
        dm.features_hash = features_hash
        dm.aggregations_hash = aggregations_hash
        dm.save()
        task = Task(user=user, target='{}:{}'.format(DataMatrix.__name__, dm.id))
        task.save()
        dm.task = task
        dm.save()
    else:
        task = NonDbTask(user=user)
        segments = Segment.objects.filter(audio_file__database=database)
        sids = segments.values_list('id', flat=True)
        task.sids = sids
        task.features_hash = features_hash
        task.aggregations_hash = aggregations_hash

    if celery:
        extract_database_measurements.delay(task.id)
    else:
        extract_database_measurements(task, force=True)
def handle(self, *args, **options):
    """Import a pre-computed autoencoder feature set from disk into a DataMatrix.

    Reads options 'path' (dataset file produced elsewhere), 'database_name' and
    'dm_name'. Segment ids are recovered from the dataset's filenames.
    :raises Exception: if the dataset file does not exist
    """
    path = options['path']
    if not os.path.isfile(path):
        raise Exception('File {} not found'.format(path))
    database_name = options['database_name']
    dm_name = options['dm_name']
    database = get_or_error(Database, dict(name__iexact=database_name))

    dataset = data_set.load(Path(path))
    features = dataset.features
    filenames = dataset.filenames
    # Filename minus its last 4 chars is the segment id
    # (assumes a 3-letter extension like '.wav' — TODO confirm)
    sids = [int(x[:-4]) for x in filenames]
    # Row count is unused; only the feature width matters here
    _, ndims = features.shape

    # Force the queryset into the same order as sids so tids matches row-for-row
    preserved = Case(*[When(id=sid, then=pos) for pos, sid in enumerate(sids)])
    segments = Segment.objects.filter(id__in=sids).order_by(preserved)
    tids = segments.values_list('tid', flat=True)

    col_inds = {'s2s_autoencoded': [0, ndims]}

    dm = DataMatrix(database=database)
    dm.name = dm_name
    dm.ndims = ndims
    dm.features_hash = 's2s_autoencoded'
    dm.aggregations_hash = ''
    dm.save()

    full_sids_path = dm.get_sids_path()
    full_tids_path = dm.get_tids_path()
    full_bytes_path = dm.get_bytes_path()
    full_cols_path = dm.get_cols_path()

    ndarray_to_bytes(features, full_bytes_path)
    ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)
    ndarray_to_bytes(np.array(tids, dtype=np.int32), full_tids_path)

    with open(full_cols_path, 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)
def form_valid(self, form):
    """Create (or recreate) a DataMatrix from the feature-selection form and queue extraction.

    On name collision within the target database, re-renders the form with an error.
    On success, queues extraction (and, when recreating, the dependent ordination and
    similarity tasks) and returns the task-list partial as JSON.
    """
    post_data = self.request.POST
    user = self.request.user
    form_data = form.cleaned_data
    name = form_data.get('name', None)
    dmid = form_data.get('data_matrix', None)
    has_error = False
    is_recreating = False

    if dmid:
        # Recreating an existing matrix: reuse its row, skip the name-uniqueness check
        dm = get_or_error(DataMatrix, dict(id=dmid))
        is_recreating = True
    else:
        if 'database' in post_data:
            database_id = int(post_data['database'])
            database = get_or_error(Database, dict(id=int(database_id)))
            if DataMatrix.objects.filter(database=database, name=name).exists():
                form.add_error('name', 'This name is already taken')
                has_error = True
            dm = DataMatrix(database=database)
        else:
            database_id = get_or_error(post_data, 'tmpdb')
            database = get_or_error(TemporaryDatabase, dict(id=int(database_id)))
            if DataMatrix.objects.filter(tmpdb=database, name=name).exists():
                form.add_error('name', 'This name is already taken')
                has_error = True
            dm = DataMatrix(tmpdb=database)

    if has_error:
        context = self.get_context_data()
        context['form'] = form
        rendered = render_to_string(
            'partials/feature-selection-form.html', context=context)
        return HttpResponse(
            json.dumps(
                dict(message=dict(success=False, html=rendered))))

    features = form_data['features'].order_by('id')
    aggregations = form_data['aggregations'].order_by('id')

    dm.name = name
    dm.ndims = 0
    dm.features_hash = '-'.join(
        list(map(str, features.values_list('id', flat=True))))
    dm.aggregations_hash = '-'.join(
        list(map(str, aggregations.values_list('id', flat=True))))
    dm.save()

    task = Task(user=user, target='{}:{}'.format(DataMatrix.__name__, dm.id))
    task.save()
    dm.task = task
    dm.save()

    delay_in_production(extract_database_measurements, task.id)

    if is_recreating:
        # Bug fix: these loops previously reused the name `task`, so the
        # `context['task']` below ended up pointing at the last similarity task
        # instead of the feature-extraction task
        ord_tasks = recreate_associated_ordination_tasks(dmid, user)
        for ord_task in ord_tasks:
            delay_in_production(construct_ordination, ord_task.id)
        sim_tasks = recreate_associated_similarity_tasks(dmid, user)
        for sim_task in sim_tasks:
            delay_in_production(calculate_similarity, sim_task.id)

    context = self.get_context_data()
    context['task'] = task
    rendered = render_to_string('partials/feature-extraction-tasks.html', context=context)
    return HttpResponse(
        json.dumps(dict(message=dict(success=True, html=rendered))))