def _calculate_similarity(sim, runner):
    dm = sim.dm
    ord = sim.ord

    assert dm.task is None or dm.task.is_completed(), \
        'Cannot calculate similarity because previous error occurred when extracting features'
    assert ord is None or ord.task is None or ord.task.is_completed(), \
        'Cannot calculate similarity because previous error occurred when constructing ordination'

    if ord:
        sids_path = ord.get_sids_path()
        source_bytes_path = ord.get_bytes_path()
    else:
        sids_path = dm.get_sids_path()
        source_bytes_path = dm.get_bytes_path()

    sids = bytes_to_ndarray(sids_path, np.int32)
    coordinates = get_rawdata_from_binary(source_bytes_path, len(sids))

    # Replace non-finite values (NaN, +/-inf) so linkage doesn't fail
    coordinates[np.where(np.logical_not(np.isfinite(coordinates)))] = 0

    runner.start()
    tree = linkage(coordinates, method='average')
    order = natural_order(tree)
    sorted_order = np.argsort(order).astype(np.int32)

    runner.wrapping_up()
    sim_sids_path = sim.get_sids_path()
    sim_bytes_path = sim.get_bytes_path()

    ndarray_to_bytes(sorted_order, sim_bytes_path)
    ndarray_to_bytes(sids, sim_sids_path)
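# A minimal, self-contained sketch of the ordering step above. Koe's
# natural_order helper is defined elsewhere; here scipy's leaves_list stands
# in for it (an assumption: both return a permutation of leaf indices).
import numpy as np
from scipy.cluster.hierarchy import linkage, leaves_list

coordinates = np.random.rand(10, 4)               # 10 items, 4 features
tree = linkage(coordinates, method='average')
order = leaves_list(tree)                         # dendrogram leaf order
sorted_order = np.argsort(order).astype(np.int32)
# sorted_order[i] is item i's rank in the leaf order, so sorting rows by
# sorted_order places acoustically similar items next to each other.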
def visualise(self, dist_triu, cls_labels, syl_labels, clusters):
    if self.ord is not None:
        ord_bytes_path = os.path.join(settings.BASE_DIR, self.ord.get_bytes_path())
        self.ord_coordinates = get_rawdata_from_binary(ord_bytes_path, len(self.sids))

    pdf_name = 'symprof-annotated-{}-{}-{}-pca={}%.pdf'.format(
        self.feature_grouper, self.max_deviation, self.class_aggregation, self.pca_explained)
    pdf = PdfPages(pdf_name)

    tree = linkage(dist_triu, method='complete')
    plot_dendrogram(tree, 'blah', cls_labels, clusters, pdf=pdf)

    for cluster in clusters:
        highlighted_cls_names = cls_labels[np.array(cluster)]
        if len(highlighted_cls_names) == 1:
            continue
        if self.ord is not None:
            scatter_plot_with_highlighted_clusters(highlighted_cls_names, syl_labels, self.sids,
                                                   self.ord_coordinates, pdf=pdf)
        show_highlighed_cls_syllables(highlighted_cls_names, syl_labels, self.tids, pdf=pdf)

    pdf.close()
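# The function above accumulates several figures into one PDF. A minimal
# sketch of that matplotlib PdfPages pattern, with a toy plot standing in
# for the dendrogram and scatter plots:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

with PdfPages('example.pdf') as pdf:
    for i in range(3):
        fig = plt.figure()
        plt.plot([0, 1], [0, i])
        pdf.savefig(fig)                 # each savefig appends one page
        plt.close(fig)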
def post_init(self, options):
    super(Command, self).post_init(options)
    dmid = options['dmid']
    ordid = options['ordid']
    self.class_aggregation = options['class_aggregation']

    if (dmid is None) == (ordid is None):
        raise Exception('Exactly one of --dm-id and --ord-id must be given')

    if dmid:
        self.dm = get_or_error(DataMatrix, dict(id=dmid))
        self.ord = None
    else:
        self.ord = get_or_error(Ordination, dict(id=ordid))
        self.dm = self.ord.dm

    sids_path = self.dm.get_sids_path()
    source_bytes_path = self.dm.get_bytes_path()

    self.sids = bytes_to_ndarray(sids_path, np.int32)
    self.tids = get_tids(self.sids)
    coordinates = get_rawdata_from_binary(source_bytes_path, len(self.sids))
    coordinates = drop_useless_columns(coordinates)
    coordinates = zscore(coordinates)
    coordinates[np.where(np.isinf(coordinates))] = 0
    coordinates[np.where(np.isnan(coordinates))] = 0
    self.coordinates = coordinates
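# The zscore-then-zero-out-NaN/inf cleanup above recurs throughout this
# codebase: a constant feature column has zero variance, so z-scoring it
# yields NaN. A minimal equivalent sketch (np.nan_to_num is the idiomatic
# one-liner):
import numpy as np
from scipy.stats import zscore

x = np.array([[1.0, 2.0], [1.0, 4.0], [1.0, 6.0]])  # first column is constant
z = zscore(x)                                        # first column becomes NaN
z = np.nan_to_num(z, nan=0.0, posinf=0.0, neginf=0.0)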
def test_pca(self):
    django.setup()
    from koe.models import Feature, Aggregation, FullTensorData, Database
    from koe.ts_utils import bytes_to_ndarray, get_rawdata_from_binary

    database = Database.objects.get(name='Bellbird_TMI')
    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.all().order_by('id')
    features_hash = '-'.join(list(map(str, features.values_list('id', flat=True))))
    aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

    full_tensor = FullTensorData.objects.filter(database=database, features_hash=features_hash,
                                                aggregations_hash=aggregations_hash).first()
    if full_tensor is None:
        raise Exception('Tensor not found')

    full_sids_path = full_tensor.get_sids_path()
    full_bytes_path = full_tensor.get_bytes_path()

    sids = bytes_to_ndarray(full_sids_path, np.int32)
    full_data = get_rawdata_from_binary(full_bytes_path, len(sids))

    with tictoc('PCA'):
        dim_reduce_func = pca(n_components=50)
        dim_reduce_func.fit_transform(full_data)
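# A hedged sketch of what the pca(n_components=50) factory above presumably
# wraps (an assumption: it returns a scikit-learn PCA-like object exposing
# fit_transform):
import numpy as np
from sklearn.decomposition import PCA

full_data = np.random.rand(1000, 200)
reducer = PCA(n_components=50)
reduced = reducer.fit_transform(full_data)
print(reduced.shape)                              # (1000, 50)
print(reducer.explained_variance_ratio_.sum())    # variance retained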
def handle(self, *args, **options):
    tensor_to_dm = {}
    for tensor in FullTensorData.objects.all():
        sids_path = tensor.get_sids_path()
        bytes_path = tensor.get_bytes_path()
        cols_path = tensor.get_cols_path()

        sids = bytes_to_ndarray(sids_path, np.int32)
        data = get_rawdata_from_binary(bytes_path, len(sids))

        dm = DataMatrix.objects.filter(name=tensor.name).first()
        if dm is None:
            dm = DataMatrix.objects.create(database=tensor.database, name=tensor.name,
                                           features_hash=tensor.features_hash,
                                           aggregations_hash=tensor.aggregations_hash,
                                           ndims=data.shape[1])
        dm_sids_path = dm.get_sids_path()
        dm_bytes_path = dm.get_bytes_path()
        dm_cols_path = dm.get_cols_path()

        ensure_parent_folder_exists(dm_sids_path)
        shutil.copy(sids_path, dm_sids_path)
        shutil.copy(bytes_path, dm_bytes_path)
        shutil.copy(cols_path, dm_cols_path)

        tensor_to_dm[tensor] = dm

    for tensor in DerivedTensorData.objects.exclude(dimreduce='none'):
        dm = tensor_to_dm[tensor.full_tensor]
        sids_path = tensor.full_tensor.get_sids_path()
        bytes_path = tensor.get_bytes_path()

        # Fall back to the full tensor's bytes if the derived tensor has none
        if not os.path.exists(bytes_path):
            bytes_path = tensor.full_tensor.get_bytes_path()

        method = tensor.dimreduce
        ndims = tensor.ndims
        # Method names like 'tsne2' / 'tsne3' encode the dimensionality
        if method.startswith('tsne'):
            ndims = int(method[4:])
            method = 'tsne'

        ord = Ordination.objects.filter(dm=dm, method=method, ndims=ndims).first()
        if ord is None:
            ord = Ordination.objects.create(dm=dm, method=method, ndims=ndims)

        ord_sids_path = ord.get_sids_path()
        ord_bytes_path = ord.get_bytes_path()

        ensure_parent_folder_exists(ord_sids_path)
        shutil.copy(sids_path, ord_sids_path)
        shutil.copy(bytes_path, ord_bytes_path)
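# The 'tsneN' convention handled above encodes the target dimensionality in
# the method name itself; a minimal sketch of the parsing:
method = 'tsne3'
if method.startswith('tsne'):
    ndims = int(method[4:])    # -> 3
    method = 'tsne'            # stored as ('tsne', ndims=3)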
def _calculate_similarity(sids_path, source_bytes_path, return_tree=False):
    sids = bytes_to_ndarray(sids_path, np.int32)
    coordinates = get_rawdata_from_binary(source_bytes_path, len(sids))

    tree = linkage(coordinates, method='average')
    order = natural_order(tree)
    sorted_order = np.argsort(order).astype(np.int32)

    if return_tree:
        return sids, sorted_order, tree
    return sids, sorted_order
def prepare_data_for_analysis(self, pkl_filename, options):
    label_level = options['label_level']
    cdm = options['cdm']
    dmid = options['dmid']
    annotator_name = options['annotator_name']

    methods = dict(mean=np.mean, median=np.median)
    method = get_or_error(methods, cdm, 'Unknown value {} for --class-distance-method.'.format(cdm))

    dm = get_dm(dmid)
    sids_path = dm.get_sids_path()
    source_bytes_path = dm.get_bytes_path()

    sids = bytes_to_ndarray(sids_path, np.int32)
    coordinates = get_rawdata_from_binary(source_bytes_path, len(sids))
    coordinates = drop_useless_columns(coordinates)
    coordinates = zscore(coordinates)
    coordinates[np.where(np.isinf(coordinates))] = 0
    coordinates[np.where(np.isnan(coordinates))] = 0

    if annotator_name is not None:
        annotator = get_or_error(User, dict(username__iexact=annotator_name))
        label_arr, syl_label_enum_arr = get_syllable_labels(annotator, label_level, sids)
        nlabels = len(label_arr)
        distmat, classes_info = calc_class_dist_by_syl_features(syl_label_enum_arr, nlabels,
                                                                coordinates, method)
        dist_triu = mat2triu(distmat)
    else:
        # Without an annotator, treat every syllable as its own class
        dist_triu = distance.pdist(coordinates, 'euclidean')
        label_arr = []
        syl_label_enum_arr = []
        classes_info = []
        for sind, sid in enumerate(sids):
            label = str(sind)
            label_arr.append(label)
            syl_label_enum_arr.append(sind)
            classes_info.append([sind])

    tree = linkage(dist_triu, method='average')
    saved_dict = dict(tree=tree, dbid=dm.database.id, sids=sids, unique_labels=label_arr,
                      classes_info=classes_info)
    with open(pkl_filename, 'wb') as f:
        pickle.dump(saved_dict, f)
    return saved_dict
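# Both branches above end with a condensed (upper-triangular) distance
# vector, which is exactly the form scipy's linkage expects. A minimal
# sketch with toy data:
import numpy as np
from scipy.spatial import distance
from scipy.cluster.hierarchy import linkage

coords = np.random.rand(5, 3)
dist_triu = distance.pdist(coords, 'euclidean')   # shape (10,) == 5*4/2
tree = linkage(dist_triu, method='average')       # (n-1) x 4 linkage matrix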
def construct_ordination(task_id):
    task = get_or_wait(task_id)
    runner = TaskRunner(task)
    try:
        runner.preparing()

        cls, ord_id = task.target.split(':')
        ord_id = int(ord_id)
        assert cls == Ordination.__name__
        ord = Ordination.objects.get(id=ord_id)
        dm = ord.dm
        method_name = ord.method
        ndims = ord.ndims
        param_kwargs = Ordination.params_to_kwargs(ord.params)

        assert dm.task is None or dm.task.is_completed()
        assert method_name in methods.keys(), 'Unknown method {}'.format(method_name)
        assert 2 <= ndims <= 3, 'Only support 2 or 3 dimensional ordination'

        runner.start()
        dm_sids_path = dm.get_sids_path()
        dm_bytes_path = dm.get_bytes_path()

        sids = bytes_to_ndarray(dm_sids_path, np.int32)
        dm_data = get_rawdata_from_binary(dm_bytes_path, len(sids))

        data = zscore(dm_data)
        data[np.where(np.isnan(data))] = 0
        data[np.where(np.isinf(data))] = 0

        method = methods[method_name]
        result = method(data, ndims, **param_kwargs)

        runner.wrapping_up()

        ord_sids_path = ord.get_sids_path()
        ord_bytes_path = ord.get_bytes_path()

        ndarray_to_bytes(result, ord_bytes_path)
        ndarray_to_bytes(sids, ord_sids_path)

        runner.complete()
    except Exception as e:
        runner.error(e)
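# The task.target convention parsed above is '<ClassName>:<id>'; a minimal
# sketch of that round trip:
target = 'Ordination:42'
cls, ord_id = target.split(':')
assert cls == 'Ordination'
ord_id = int(ord_id)    # -> 42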
def post_init(self, options):
    super(Command, self).post_init(options)
    dmid = options['dmid']
    self.dm = get_or_error(DataMatrix, dict(id=dmid))

    sids_path = self.dm.get_sids_path()
    source_bytes_path = self.dm.get_bytes_path()

    self.sids = bytes_to_ndarray(sids_path, np.int32)
    self.tids = get_tids(self.sids)
    coordinates = get_rawdata_from_binary(source_bytes_path, len(self.sids))
    coordinates = drop_useless_columns(coordinates)
    coordinates = zscore(coordinates)
    coordinates[np.where(np.isinf(coordinates))] = 0
    coordinates[np.where(np.isnan(coordinates))] = 0
    self.coordinates = coordinates
def _construct_ordination(ord, runner):
    dm = ord.dm
    method_name = ord.method
    ndims = ord.ndims
    param_kwargs = Ordination.params_to_kwargs(ord.params)

    assert dm.task is None or dm.task.is_completed(), \
        'Cannot construct ordination because its DataMatrix failed'
    assert method_name in methods.keys(), 'Unknown method {}'.format(method_name)
    assert 2 <= ndims <= 3, 'Only support 2 or 3 dimensional ordination'

    runner.start()
    dm_sids_path = dm.get_sids_path()
    dm_bytes_path = dm.get_bytes_path()

    sids = bytes_to_ndarray(dm_sids_path, np.int32)
    dm_data = get_rawdata_from_binary(dm_bytes_path, len(sids))
    dm_dims = dm_data.shape[1]
    assert dm_dims >= ndims, \
        'Data has only {} dimension(s), not enough to construct a {}-dimensional ordination'.format(dm_dims, ndims)

    data = zscore(dm_data)
    data[np.where(np.isnan(data))] = 0
    data[np.where(np.isinf(data))] = 0

    method = methods[method_name]
    result = method(data, ndims, **param_kwargs)
    result = result.astype(np.float32)

    runner.wrapping_up()

    ord_sids_path = ord.get_sids_path()
    ord_bytes_path = ord.get_bytes_path()

    ndarray_to_bytes(result, ord_bytes_path)
    ndarray_to_bytes(sids, ord_sids_path)
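# The `methods` dict dispatched on above is defined elsewhere in the
# codebase; a hedged sketch of its assumed shape, with a PCA-backed entry
# (assumption: each entry is callable as method(data, ndims, **kwargs)):
from sklearn.decomposition import PCA

def _pca_ordination(data, ndims, **kwargs):
    return PCA(n_components=ndims, **kwargs).fit_transform(data)

methods = {'pca': _pca_ordination}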
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    niters = options['niters']
    profile = options.get('profile', None)
    tsv_file = profile + '.tsv'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio = get_ratios(ratio_, 2)

    open_mode = 'w'

    assert clsf_type in classifiers.keys(), 'Unknown classifier: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
    aggregators = [aggregator_map[x.name] for x in aggregations]

    enabled_features = []
    for f in features:
        if f.name in feature_map:
            enabled_features.append(f)

    features_hash = '-'.join(list(map(str, [x.id for x in enabled_features])))
    aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

    dm = DataMatrix.objects.filter(database=database, features_hash=features_hash,
                                   aggregations_hash=aggregations_hash).last()
    if dm is None:
        raise Exception('No full data matrix for database {}'.format(database_name))

    dm_sids_path = dm.get_sids_path()
    dm_tids_path = dm.get_tids_path()
    dm_bytes_path = dm.get_bytes_path()
    feature_cols = dm.get_cols_path()
    with open(feature_cols, 'r', encoding='utf-8') as f:
        col_inds = json.load(f)

    _sids = bytes_to_ndarray(dm_sids_path, np.int32)
    _sids, sort_order = np.unique(_sids, return_index=True)

    try:
        _tids = bytes_to_ndarray(dm_tids_path, np.int32)
        _tids = _tids[sort_order]
    except FileNotFoundError:
        _tids = get_tids(_sids)

    full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
    full_data = full_data[sort_order, :]

    labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
        lookup_ids_rows = np.searchsorted(_sids, sids)
        full_data = full_data[lookup_ids_rows, :]

    full_data = zscore(full_data)
    full_data[np.where(np.isnan(full_data))] = 0
    full_data[np.where(np.isinf(full_data))] = 0

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    for ftgroup_name, feature_names in ftgroup_names.items():
        if ftgroup_name == 'all':
            features = list(feature_map.values())
        else:
            features = [feature_map[x] for x in feature_names]

        ft_col_inds = []
        for feature in features:
            if feature.is_fixed_length:
                col_name = feature.name
                col_range = col_inds[col_name]
                ft_col_inds += range(col_range[0], col_range[1])
            else:
                for aggregator in aggregators:
                    col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])

        ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
        ndims = len(ft_col_inds)
        data = full_data[:, ft_col_inds]

        if source == 'pca':
            explained, data = pca_optimal(data, ndims, 0.9)
            pca_dims = data.shape[1]

        with open('/tmp/hyperopt.pkl', 'rb') as f:
            saved = pickle.load(f)

        performance_data = saved[clsf_type]
        accuracies = performance_data['accuracies']
        groups = performance_data['groups']
        params = performance_data['params']

        group_name = '{}-{}'.format(ftgroup_name, source)
        group_member_inds = np.where(groups == group_name)
        group_accuracies = accuracies[group_member_inds]
        best_acc_idx = np.argmax(group_accuracies)

        group_params = {}
        best_params = {}
        for param_name in params:
            param_values = np.array(params[param_name])
            group_param_values = param_values[group_member_inds]
            group_params[param_name] = group_param_values

            converter = converters[clsf_type][param_name]
            best_params[param_name] = converter(group_param_values[best_acc_idx])

        dp = EnumDataProvider(data, labels, balanced=True)

        nfolds = int(np.floor(1 / valid_ratio + 0.01))
        ntrials = nfolds * niters
        label_prediction_scores = [0] * ntrials
        label_hitss = [0] * ntrials
        label_missess = [0] * ntrials
        label_hitrates = np.empty((ntrials, nlabels))
        label_hitrates[:] = np.nan
        importancess = np.empty((ntrials, data.shape[1]))
        cfmats = np.ndarray((ntrials, nlabels, nlabels))

        ind = 0
        bar = Bar('Features: {}. Classifier: {} Data type: {}...'
                  .format(ftgroup_name, clsf_type, source), max=ntrials)

        for iter in range(niters):
            traintetset, _ = dp.split(0, limits=(ipc_min, ipc_max))
            traintetset.make_folds(nfolds, valid_ratio)

            for k in range(nfolds):
                trainset, testset = traintetset.get_fold(k)
                train_x = np.array(trainset.data)
                train_y = np.array(trainset.labels, dtype=np.int32)
                test_x = np.array(testset.data)
                test_y = np.array(testset.labels, dtype=np.int32)

                score, label_hits, label_misses, cfmat, importances = \
                    classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_params)

                label_prediction_scores[ind] = score
                label_hitss[ind] = label_hits
                label_missess[ind] = label_misses

                label_hitrate = label_hits / (label_hits + label_misses).astype(float)

                label_hitrates[ind, :] = label_hitrate
                importancess[ind, :] = importances
                cfmats[ind, :, :] = cfmat

                bar.next()
                ind += 1
        bar.finish()

        mean_label_prediction_scores = np.nanmean(label_prediction_scores)
        std_label_prediction_scores = np.nanstd(label_prediction_scores)
        sum_cfmat = np.nansum(cfmats, axis=0)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            if source == 'full':
                f.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    ftgroup_name, ndims, mean_label_prediction_scores,
                    std_label_prediction_scores,
                    '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
            else:
                f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    ftgroup_name, ndims, explained, pca_dims, mean_label_prediction_scores,
                    std_label_prediction_scores,
                    '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))

            f.write('Accuracy: \n')
            f.write('\t'.join(list(map(str, label_prediction_scores))))
            f.write('\n')
            f.write('\t')
            f.write('\t'.join(unique_labels))
            f.write('\n')
            for i in range(nlabels):
                label = unique_labels[i]
                cfrow = sum_cfmat[:, i]
                f.write(label)
                f.write('\t')
                f.write('\t'.join(map(str, cfrow)))
                f.write('\n')
            f.write('\n')
        open_mode = 'a'
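# A minimal sketch of the per-label hit rate computed in the loop above:
# hits and misses are per-label counts, so the ratio is per-label recall.
import numpy as np

label_hits = np.array([8, 5, 9])
label_misses = np.array([2, 5, 1])
label_hitrate = label_hits / (label_hits + label_misses).astype(float)
# -> array([0.8, 0.5, 0.9])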
def create_derived_tensor(full_tensor, annotator, dim_reduce, ndims, recreate):
    admin = get_or_error(User, dict(username__iexact='superuser'))
    full_sids_path = full_tensor.get_sids_path()
    full_bytes_path = full_tensor.get_bytes_path()

    sids = bytes_to_ndarray(full_sids_path, np.int32)
    full_data = get_rawdata_from_binary(full_bytes_path, len(sids))

    if dim_reduce != 'none':
        dim_reduce_fun = reduce_funcs[dim_reduce]
        n_feature_cols = full_data.shape[1]
        n_components = min(n_feature_cols // 2, ndims)
    else:
        dim_reduce_fun = None
        n_components = None

    derived_tensor = DerivedTensorData.objects.filter(database=full_tensor.database,
                                                      full_tensor=full_tensor,
                                                      features_hash=full_tensor.features_hash,
                                                      aggregations_hash=full_tensor.aggregations_hash,
                                                      ndims=n_components,
                                                      dimreduce=dim_reduce,
                                                      creator=admin,
                                                      annotator=annotator).first()
    if derived_tensor and not recreate:
        print('Derived tensor {} already exists. If you want to recreate, turn on flag --recreate'
              .format(derived_tensor.name))
        return derived_tensor, False

    if derived_tensor is None:
        derived_tensors_name = uuid.uuid4().hex
        derived_tensor = DerivedTensorData(name=derived_tensors_name,
                                           database=full_tensor.database,
                                           full_tensor=full_tensor,
                                           features_hash=full_tensor.features_hash,
                                           aggregations_hash=full_tensor.aggregations_hash,
                                           dimreduce=dim_reduce,
                                           ndims=n_components,
                                           creator=admin,
                                           annotator=annotator)
    derived_cfg_path = derived_tensor.get_config_path()

    if dim_reduce_fun:
        # t-SNE needs normalisation first
        if dim_reduce.startswith('tsne'):
            full_data = zscore(full_data)
            full_data[np.where(np.isnan(full_data))] = 0
            full_data[np.where(np.isinf(full_data))] = 0

        dim_reduced_data = dim_reduce_fun(full_data, n_components)
        derived_bytes_path = derived_tensor.get_bytes_path()
        ndarray_to_bytes(dim_reduced_data, derived_bytes_path)

        tensor_shape = dim_reduced_data.shape
        tensor_path = '/' + derived_bytes_path
    else:
        tensor_shape = full_data.shape
        tensor_path = '/' + full_bytes_path

    # Always write config last - to make sure it's not missing anything
    embedding = dict(
        tensorName=derived_tensor.name,
        tensorShape=tensor_shape,
        tensorPath=tensor_path,
        metadataPath=reverse('tsne-meta', kwargs={'tensor_name': derived_tensor.name}),
    )
    config = dict(embeddings=[embedding])
    write_config(config, derived_cfg_path)

    derived_tensor.save()
    return derived_tensor, True
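# The config written above follows the TensorBoard Embedding Projector
# layout: a list of embeddings, each naming its tensor, shape, data path and
# metadata path. A sketch of the structure (paths are placeholders):
config = dict(embeddings=[dict(
    tensorName='my-tensor',
    tensorShape=(1000, 2),
    tensorPath='/path/to/tensor.bytes',
    metadataPath='/path/to/metadata.tsv',
)])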
def handle(self, database_name, population_name, type, perplexity, normalised, *args, **kwargs):
    database = get_or_error(Database, dict(name__iexact=database_name))
    assert type in ['tsne2', 'tsne3', 'mds', 'mdspca']

    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.all().order_by('id')

    features_hash = '-'.join(list(map(str, features.values_list('id', flat=True))))
    aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

    full_tensor = FullTensorData.objects.filter(database=database, features_hash=features_hash,
                                                aggregations_hash=aggregations_hash).first()
    if full_tensor is None:
        raise Exception('Full feature matrix not found. Need to create FullTensor first.')

    full_sids_path = full_tensor.get_sids_path()
    full_bytes_path = full_tensor.get_bytes_path()

    full_sids = bytes_to_ndarray(full_sids_path, np.int32)
    full_data = get_rawdata_from_binary(full_bytes_path, len(full_sids))

    sids, tids = get_sids_tids(database, population_name)

    normalised_str = 'normed' if normalised else 'raw'
    if type.startswith('tsne'):
        file_name = '{}_{}_{}_{}_{}.pkl'.format(database_name, population_name, type, perplexity,
                                                normalised_str)
    else:
        file_name = '{}_{}_{}_{}.pkl'.format(database_name, population_name, type, normalised_str)

    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            saved = pickle.load(f)
            coordinate = saved['coordinate']
            stress = saved['stress']
    else:
        population_data = cherrypick_tensor_data_by_sids(full_data, full_sids,
                                                         sids).astype(np.float64)

        if normalised:
            population_data = zscore(population_data)
            population_data[np.where(np.isnan(population_data))] = 0
            population_data[np.where(np.isinf(population_data))] = 0

        if type.startswith('mds'):
            if type == 'mdspca':
                dim_reduce_func = PCA(n_components=50)
                population_data = dim_reduce_func.fit_transform(population_data, y=None)
                if hasattr(dim_reduce_func, 'explained_variance_ratio_'):
                    print('Cumulative explained variation for {} principal components: {}'
                          .format(50, np.sum(dim_reduce_func.explained_variance_ratio_)))

            similarities = squareform(pdist(population_data, 'euclidean'))
            model = MDS(n_components=3, dissimilarity='precomputed', random_state=7, verbose=1,
                        max_iter=1000)
            coordinate = model.fit_transform(similarities)
            stress = model.stress_
        else:
            ntsne_dims = int(type[4:])
            dim_reduce_func = PCA(n_components=50)
            population_data = dim_reduce_func.fit_transform(population_data, y=None)
            print('Cumulative explained variation: {}'.format(
                np.sum(dim_reduce_func.explained_variance_ratio_)))

            time_start = time.time()
            tsne = TSNE(n_components=ntsne_dims, verbose=1, perplexity=perplexity, n_iter=4000)
            coordinate = tsne.fit_transform(population_data)
            print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))
            stress = None

        with open(file_name, 'wb') as f:
            pickle.dump(dict(coordinate=coordinate, stress=stress, sids=sids, tids=tids), f,
                        protocol=pickle.HIGHEST_PROTOCOL)
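# A minimal, self-contained sketch of the PCA -> t-SNE chain above on toy
# data (the 50-component PCA step tames dimensionality before t-SNE):
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

data = np.random.rand(200, 120)
data = PCA(n_components=50).fit_transform(data)
coordinate = TSNE(n_components=2, perplexity=30, n_iter=1000).fit_transform(data)
print(coordinate.shape)    # (200, 2)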
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    profile = options['profile']
    dm_name = options['dm_name']
    tsv_file = profile + '.tsv'
    trials_file = profile + '.trials'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

    open_mode = 'w'

    assert clsf_type in classifiers.keys(), 'Unknown classifier: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    if dm_name is None:
        features = Feature.objects.all().order_by('id')
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
        aggregators = [aggregator_map[x.name] for x in aggregations]

        enabled_features = []
        for f in features:
            if f.name in feature_map:
                enabled_features.append(f)

        features_hash = '-'.join(list(map(str, [x.id for x in enabled_features])))
        aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

        dm = DataMatrix.objects.filter(database=database, features_hash=features_hash,
                                       aggregations_hash=aggregations_hash).last()
        if dm is None:
            raise Exception('No full data matrix for database {}'.format(database_name))
    else:
        dm = DataMatrix.objects.filter(database=database, name=dm_name).first()
        if dm is None:
            raise Exception('No such matrix {} for database {}'.format(dm_name, database_name))

        if dm.aggregations_hash:
            aggregations_list = dm.aggregations_hash.split('-')
            aggregators = [aggregator_map[x] for x in aggregations_list]
        else:
            aggregators = []

        features = Feature.objects.filter(id__in=dm.features_hash.split('-'))
        ftgroup_names = {'custom': list(features.values_list('name', flat=True))}

    dm_sids_path = dm.get_sids_path()
    dm_tids_path = dm.get_tids_path()
    dm_bytes_path = dm.get_bytes_path()
    feature_cols = dm.get_cols_path()
    with open(feature_cols, 'r', encoding='utf-8') as f:
        col_inds = json.load(f)

    _sids = bytes_to_ndarray(dm_sids_path, np.int32)
    _sids, sort_order = np.unique(_sids, return_index=True)

    try:
        _tids = bytes_to_ndarray(dm_tids_path, np.int32)
        _tids = _tids[sort_order]
    except FileNotFoundError:
        _tids = get_tids(_sids)

    full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
    full_data = full_data[sort_order, :]

    labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
        lookup_ids_rows = np.searchsorted(_sids, sids)
        full_data = full_data[lookup_ids_rows, :]

    full_data = zscore(full_data)
    full_data[np.where(np.isnan(full_data))] = 0
    full_data[np.where(np.isinf(full_data))] = 0

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    for ftgroup_name, feature_names in ftgroup_names.items():
        if ftgroup_name == 'all':
            features = list(feature_map.values())
        else:
            features = [feature_map[x] for x in feature_names]

        ft_col_inds = []
        for feature in features:
            if feature.is_fixed_length:
                col_name = feature.name
                col_range = col_inds[col_name]
                ft_col_inds += range(col_range[0], col_range[1])
            else:
                for aggregator in aggregators:
                    col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])

        ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
        ndims = len(ft_col_inds)
        data = full_data[:, ft_col_inds]

        if source == 'pca':
            explained, data = pca_optimal(data, ndims, 0.9)
            pca_dims = data.shape[1]

        dp = EnumDataProvider(data, labels, balanced=True)
        trainvalidset, testset = dp.split(test_ratio, limits=(ipc_min, ipc_max))

        v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
        nfolds = int(np.floor(1. / v2t_ratio + 0.01))

        params_names = []
        params_converters = []
        params_count = 0

        def loss(params):
            classifier_args = {}
            for i in range(params_count):
                param_name = params_names[i]
                param_converter = params_converters[i]
                param_value = params[i]
                classifier_args[param_name] = param_converter(param_value)

            print(classifier_args)
            score = perform_k_fold(classifier, trainvalidset, nfolds, v2t_ratio, nlabels,
                                   **classifier_args)
            return 1. - score

        n_estimators_choices = hp.uniform('n_estimators', 40, 100)
        min_samples_split_choices = hp.uniform('min_samples_split', 2, 21)
        min_samples_leaf_choices = hp.uniform('min_samples_leaf', 1, 20)
        n_features = data.shape[1]
        auto_gamma = 1 / n_features
        gamma_choices = hp.uniform('gamma', auto_gamma / 10, auto_gamma * 10)
        c_choices = hp.uniform('C', -1, 2)
        hidden_layer_size_choices = hp.uniform('hidden_layer_sizes', 100, 5000)
        n_neighbors_choices = hp.uniform('n_neighbors', 1, 10)

        choices = {
            'rf': {
                'n_estimators': (lambda x: int(np.round(x)), n_estimators_choices),
                'min_samples_split': (lambda x: int(np.round(x)), min_samples_split_choices),
                'min_samples_leaf': (lambda x: int(np.round(x)), min_samples_leaf_choices),
            },
            'svm_rbf': {
                'gamma': (float, gamma_choices),
                'C': (lambda x: 10 ** x, c_choices),
            },
            'svm_linear': {
                'C': (lambda x: 10 ** x, c_choices),
            },
            'nnet': {
                'hidden_layer_sizes': (lambda x: (int(np.round(x)),), hidden_layer_size_choices)
            },
            'knn': {
                'n_neighbors': (lambda x: int(np.round(x)), n_neighbors_choices)
            }
        }

        space = []
        for arg_name, (converter, arg_values) in choices[clsf_type].items():
            space.append(arg_values)
            params_names.append(arg_name)
            params_converters.append(converter)
            params_count += 1

        trials = Trials()
        max_evals = params_count * 30
        best = fmin(fn=loss, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
        print(best)

        with open(trials_file, 'wb') as f:
            pickle.dump(trials, f)

        best_trial = trials.best_trial
        best_trial_args_values_ = best_trial['misc']['vals']
        best_trial_args_values = {}
        for arg_name, arg_values in best_trial_args_values_.items():
            converter = choices[clsf_type][arg_name][0]
            arg_value = converter(arg_values[0])
            best_trial_args_values[arg_name] = arg_value

        model_args = ['id'] + list(best_trial_args_values.keys()) + ['accuracy']

        model_args_values = {x: [] for x in model_args}
        for idx, trial in enumerate(trials.trials):
            if trial == best_trial:
                idx = 'Best'
            trial_args_values = trial['misc']['vals']
            for arg_name in model_args:
                if arg_name == 'id':
                    model_args_values['id'].append(idx)
                elif arg_name == 'accuracy':
                    trial_accuracy = 1. - trial['result']['loss']
                    model_args_values['accuracy'].append(trial_accuracy)
                else:
                    converter = choices[clsf_type][arg_name][0]
                    val = converter(trial_args_values[arg_name][0])
                    model_args_values[arg_name].append(val)

        # Perform classification on the test set
        train_x = np.array(trainvalidset.data)
        train_y = np.array(trainvalidset.labels, dtype=np.int32)
        test_x = np.array(testset.data)
        test_y = np.array(testset.labels, dtype=np.int32)

        score, label_hits, label_misses, cfmat, importances = \
            classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)
        lb_hitrates = label_hits / (label_hits + label_misses).astype(float)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            for arg in model_args:
                values = model_args_values[arg]
                f.write('{}\t'.format(arg))
                f.write('\t'.join(map(str, values)))
                f.write('\n')

            f.write('Results using best model\'s parameters on test set\n')
            if source == 'full':
                f.write('Feature group\tNdims\tLabel prediction score\t{}\n'
                        .format('\t'.join(unique_labels)))
                f.write('{}\t{}\t{}\t{}\n'.format(ftgroup_name, ndims, score,
                                                  '\t'.join(map(str, lb_hitrates))))
            else:
                f.write('Feature group\tNdims\tPCA explained\tPCA Dims\tLabel prediction score'
                        '\t{}\n'.format('\t'.join(unique_labels)))
                f.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(ftgroup_name, ndims, explained,
                                                          pca_dims, score,
                                                          '\t'.join(map(str, lb_hitrates))))
            f.write('\n')
        open_mode = 'a'
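# A minimal, self-contained sketch of the hyperopt pattern above: a list
# space of hp.uniform variables, converters applied inside the loss, and
# TPE minimising 1 - accuracy (here a toy objective stands in for the
# k-fold score):
import numpy as np
from hyperopt import fmin, tpe, hp, Trials

space = [hp.uniform('C', -1, 2)]       # searched in log10 space, as above

def loss(params):
    c = 10 ** params[0]                # converter, mirroring the code above
    return (np.log10(c) - 1.0) ** 2    # toy objective in place of 1 - accuracy

trials = Trials()
best = fmin(fn=loss, space=space, algo=tpe.suggest, max_evals=30, trials=trials)
print(best)                            # e.g. {'C': 0.99...}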
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ratio_ = options['ratio']
    niters = options['niters']
    csv_filename = options.get('csv_filename', None)

    train_ratio, valid_ratio = get_ratios(ratio_, 2)

    assert clsf_type in classifiers.keys(), 'Unknown classifier: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

    enabled_features = []
    for f in features:
        if f.name in feature_map:
            enabled_features.append(f)

    features_hash = '-'.join(list(map(str, [x.id for x in enabled_features])))
    aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

    dm = DataMatrix.objects.filter(database=database, features_hash=features_hash,
                                   aggregations_hash=aggregations_hash).last()
    if dm is None:
        raise Exception('No full data matrix for database {}'.format(database_name))

    dm_sids_path = dm.get_sids_path()
    dm_tids_path = dm.get_tids_path()
    dm_bytes_path = dm.get_bytes_path()
    feature_cols = dm.get_cols_path()
    with open(feature_cols, 'r', encoding='utf-8') as f:
        col_inds = json.load(f)

    _sids = bytes_to_ndarray(dm_sids_path, np.int32)
    _sids, sort_order = np.unique(_sids, return_index=True)

    try:
        _tids = bytes_to_ndarray(dm_tids_path, np.int32)
        _tids = _tids[sort_order]
    except FileNotFoundError:
        _tids = get_tids(_sids)

    full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
    full_data = full_data[sort_order, :]

    labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
        lookup_ids_rows = np.searchsorted(_sids, sids)
        full_data = full_data[lookup_ids_rows, :]

    full_data = zscore(full_data)
    full_data[np.where(np.isnan(full_data))] = 0
    full_data[np.where(np.isinf(full_data))] = 0

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    if csv_filename:
        with open(csv_filename, 'w', encoding='utf-8') as f:
            if source == 'pca':
                f.write('Feature group\tAggregators\tNdims\tPCA explained\tPCA Dims'
                        '\tLabel prediction mean\tstdev\t{}\n'.format('\t'.join(unique_labels)))
            else:
                f.write('Feature group\tAggregators\tNdims\tLabel prediction mean\tstdev\t{}\n'
                        .format('\t'.join(unique_labels)))

    for ftgroup_name, feature_names in ftgroup_names.items():
        for agggroup_name, aggs in list(enabled_aggregators.items()) + [('all', None)]:
            if agggroup_name == 'all':
                aggs = [aggregator_map[x.name] for x in aggregations]
            if ftgroup_name == 'all':
                features = list(feature_map.values())
            else:
                features = [feature_map[x] for x in feature_names]

            ft_col_inds = []
            for feature in features:
                if feature.is_fixed_length:
                    col_name = feature.name
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])
                else:
                    for aggregator in aggs:
                        col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])

            ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
            ndims = len(ft_col_inds)
            data = full_data[:, ft_col_inds]

            if source == 'pca':
                explained, data = pca_optimal(data, ndims, 0.9)
                pca_dims = data.shape[1]

            dp = EnumDataProvider(data, labels, balanced=True)

            nfolds = int(np.floor(1 / valid_ratio + 0.01))
            ntrials = nfolds * niters
            label_prediction_scores = [0] * ntrials
            label_hitss = [0] * ntrials
            label_missess = [0] * ntrials
            label_hitrates = np.empty((ntrials, nlabels))
            label_hitrates[:] = np.nan
            importancess = np.empty((ntrials, data.shape[1]))
            cfmats = np.ndarray((ntrials, nlabels, nlabels))

            ind = 0
            bar = Bar('Features: {}. Aggregator: {}. Classifier: {} Data type: {}...'
                      .format(ftgroup_name, agggroup_name, clsf_type, source), max=ntrials)

            for iter in range(niters):
                traintetset, _ = dp.split(0, limits=(min_occur, int(np.floor(min_occur * 1.5))))
                traintetset.make_folds(nfolds, valid_ratio)

                for k in range(nfolds):
                    trainset, testset = traintetset.get_fold(k)
                    train_x = np.array(trainset.data)
                    train_y = np.array(trainset.labels, dtype=np.int32)
                    test_x = np.array(testset.data)
                    test_y = np.array(testset.labels, dtype=np.int32)

                    score, label_hits, label_misses, cfmat, importances = \
                        classifier(train_x, train_y, test_x, test_y, nlabels, True)

                    label_prediction_scores[ind] = score
                    label_hitss[ind] = label_hits
                    label_missess[ind] = label_misses

                    label_hitrate = label_hits / (label_hits + label_misses).astype(float)

                    label_hitrates[ind, :] = label_hitrate
                    importancess[ind, :] = importances
                    cfmats[ind, :, :] = cfmat

                    bar.next()
                    ind += 1
            bar.finish()

            mean_label_prediction_scores = np.nanmean(label_prediction_scores)
            std_label_prediction_scores = np.nanstd(label_prediction_scores)
            sum_cfmat = np.nansum(cfmats, axis=0)

            if csv_filename:
                with open(csv_filename, 'a', encoding='utf-8') as f:
                    if source == 'full':
                        f.write('{}\t{}\t{}\t{}\t{}\t{}\n'
                                .format(ftgroup_name, agggroup_name, ndims,
                                        mean_label_prediction_scores,
                                        std_label_prediction_scores,
                                        '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                    else:
                        f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'
                                .format(ftgroup_name, agggroup_name, ndims, explained, pca_dims,
                                        mean_label_prediction_scores,
                                        std_label_prediction_scores,
                                        '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                    f.write('\t')
                    f.write('\t'.join(unique_labels))
                    f.write('\n')
                    for i in range(nlabels):
                        label = unique_labels[i]
                        cfrow = sum_cfmat[:, i]
                        f.write(label)
                        f.write('\t')
                        f.write('\t'.join(map(str, cfrow)))
                        f.write('\n')
                    f.write('\n')
            else:
                print('{}/{}: {} by {}: mean = {} std = {}'
                      .format(ftgroup_name, agggroup_name, clsf_type, source,
                              mean_label_prediction_scores, std_label_prediction_scores))
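# A minimal sketch of how the fold and trial counts above are derived from
# the validation ratio (the +0.01 guards against floating-point flooring):
import numpy as np

valid_ratio = 0.2
niters = 10
nfolds = int(np.floor(1 / valid_ratio + 0.01))   # -> 5 folds
ntrials = nfolds * niters                        # -> 50 classifier runs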
def bulk_get_segment_info(segs, extras):
    """
    Return rows containing Segments' information to display in SlickGrid
    :param segs: an array of segment objects (or a QuerySet)
    :param extras: must specify the user to get the correct ExtraAttrValue columns
    :return: [row]
    """
    viewas = extras.viewas
    holdout = extras.get('_holdout', 'false') == 'true'
    user = extras.user

    if 'database' in extras:
        database_id = extras.database
        current_database = get_or_error(Database, dict(id=database_id))
    else:
        database_id = extras.tmpdb
        current_database = get_or_error(TemporaryDatabase, dict(id=database_id))

    similarity_id = extras.similarity
    current_similarity = None
    if similarity_id:
        current_similarity = get_or_error(SimilarityIndex, dict(id=similarity_id))

    rows = []
    ids = []
    if current_database is None:
        return ids, rows

    if holdout:
        ids_holder = ExtraAttrValue.objects.filter(attr=settings.ATTRS.user.hold_ids_attr,
                                                   owner_id=user.id, user=user).first()
        if ids_holder is not None and ids_holder.value != '':
            ids = ids_holder.value.split(',')
        segs = segs.filter(id__in=ids)
    elif isinstance(current_database, TemporaryDatabase):
        ids = current_database.ids
        segs = segs.filter(id__in=ids)
    else:
        segs = segs.filter(audio_file__database=current_database.id)

    values = list(segs.values_list('id', 'tid', 'start_time_ms', 'end_time_ms',
                                   'audio_file__name', 'audio_file__id', 'audio_file__quality',
                                   'audio_file__added', 'audio_file__track__name',
                                   'audio_file__track__date', 'audio_file__individual__name',
                                   'audio_file__individual__gender'))

    segids = [x[0] for x in values]
    song_ids = [x[5] for x in values]

    extra_attr_values_list = ExtraAttrValue.objects \
        .filter(user__username=viewas, attr__klass=Segment.__name__, owner_id__in=segids) \
        .values_list('owner_id', 'attr__name', 'value')

    song_extra_attr_values_list = ExtraAttrValue.objects \
        .filter(user__username=viewas, attr__klass=AudioFile.__name__, owner_id__in=song_ids) \
        .values_list('owner_id', 'attr__name', 'value')

    extra_attr_values_lookup = {}
    for id, attr, value in extra_attr_values_list:
        if id not in extra_attr_values_lookup:
            extra_attr_values_lookup[id] = {}
        extra_attr_dict = extra_attr_values_lookup[id]
        extra_attr_dict[attr] = value

    song_extra_attr_values_lookup = {}
    for id, attr, value in song_extra_attr_values_list:
        if id not in song_extra_attr_values_lookup:
            song_extra_attr_values_lookup[id] = {}
        extra_attr_dict = song_extra_attr_values_lookup[id]
        extra_attr_dict[attr] = value

    ids = np.array([x[0] for x in values], dtype=np.int32)

    if current_similarity is None:
        id2order = {}
    else:
        sim_sids_path = current_similarity.get_sids_path()
        sim_bytes_path = current_similarity.get_bytes_path()

        sim_sids = bytes_to_ndarray(sim_sids_path, np.int32).tolist()
        sim_order = np.squeeze(get_rawdata_from_binary(sim_bytes_path, len(sim_sids),
                                                       np.int32)).tolist()
        id2order = dict(zip(sim_sids, sim_order))

    for id, tid, start, end, song_name, song_id, quality, added, track, date, individual, gender \
            in values:
        sim_index = id2order.get(id, None)
        duration = end - start
        url = reverse('segmentation', kwargs={'file_id': song_id})
        url = '[{}]({})'.format(url, song_name)
        row = dict(id=id, start_time_ms=start, end_time_ms=end, duration=duration, song=url,
                   sim_index=sim_index, song_track=track, song_individual=individual, sex=gender,
                   song_quality=quality, record_date=date, song_added=added.date(),
                   spectrogram=tid)

        extra_attr_dict = extra_attr_values_lookup.get(id, {})
        song_extra_attr_dict = song_extra_attr_values_lookup.get(song_id, {})

        for attr in extra_attr_dict:
            row[attr] = extra_attr_dict[attr]

        for song_attr in song_extra_attr_dict:
            attr = 'song_{}'.format(song_attr)
            row[attr] = song_extra_attr_dict[song_attr]

        rows.append(row)
    return ids, rows
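# A minimal sketch of the similarity lookup built above: the stored order
# array is row-aligned with the stored sids, and zipping them gives each
# segment's rank in the similarity ordering (toy ids below):
import numpy as np

sim_sids = np.array([101, 102, 103], dtype=np.int32).tolist()
sim_order = np.array([2, 0, 1], dtype=np.int32).tolist()
id2order = dict(zip(sim_sids, sim_order))
id2order.get(102, None)    # -> 0: segment 102 sorts first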