def handle(self, *args, **options):
    database_name = options['database_name']
    load_from = options['load_from']
    tmp_dir = options['tmp_dir']
    format = options['format']
    window_len = options['window_len']

    extractor = extractors[format]

    if not load_from.lower().endswith('.zip'):
        load_from += '.zip'

    if not os.path.isdir(tmp_dir):
        mkdirp(tmp_dir)

    variables = read_variables(load_from)
    variables['tmp_dir'] = tmp_dir
    variables['extractor'] = extractor
    variables['is_log_psd'] = format.startswith('log_')
    variables['database_name'] = database_name
    variables['window_len'] = window_len

    factory = NDS2SAEFactory()
    factory.set_output(load_from)
    factory.learning_rate = None
    factory.learning_rate_func = None
    encoder = factory.build()
    session = encoder.recreate_session()

    showcase_segmentation(variables, encoder, session)
    session.close()
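# Hedged usage sketch: a handle() like the one above belongs to a Django
# management command and is normally invoked through call_command(). The
# command name 'showcase_segmentation' and every option value below are
# illustrative assumptions, not names confirmed by this module.
from django.core.management import call_command

call_command('showcase_segmentation',      # hypothetical command name
             database_name='bellbirds',    # illustrative values throughout
             load_from='s2s_model',        # '.zip' is appended if missing
             tmp_dir='/tmp/showcase',
             format='log_spect',
             window_len=256)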
def create_segmenter(self, variables) -> Segmenter:
    load_from = variables['load_from']
    factory = NDS2SAEFactory()
    factory.set_output(load_from)
    factory.learning_rate = None
    factory.learning_rate_func = None
    self.encoder = factory.build()
    self.session = self.encoder.recreate_session()
    return SeqAutoEncoderSegmenter(variables, self.encoder, self.session)
def train(variables, save_to):
    sids_for_training = variables['sids_for_training']
    sids_for_testing = variables['sids_for_testing']
    n_train = len(sids_for_training)
    n_test = len(sids_for_testing)
    topology = variables['topology']
    batch_size = variables['batch_size']
    n_iterations = variables['n_iterations']
    keep_prob = variables['keep_prob']
    profiles = variables['profiles']

    batch_index_limits = dict(train=n_train, test=n_test)
    sids_collections = dict(train=sids_for_training, test=sids_for_testing)

    # Caches to avoid re-reading the same spectrogram file or re-converting
    # the same mask on every batch.
    spects = {}
    windows_masked = {}

    # @profile  # noqa F821
    def get_batch(this_batch_size=10, data_type='train'):
        batch_index_limit = batch_index_limits[data_type]
        sids_collection = sids_collections[data_type]

        if this_batch_size is None:
            this_batch_size = batch_index_limit

        current_batch_index = variables['current_batch_index'][data_type]
        next_batch_index = current_batch_index + this_batch_size

        # index 0 marks the start of a new epoch: reshuffle the ids in place
        if current_batch_index == 0:
            np.random.shuffle(sids_collection)

        if next_batch_index >= batch_index_limit:
            next_batch_index = batch_index_limit
            variables['current_batch_index'][data_type] = 0
            final_batch = True
        else:
            variables['current_batch_index'][data_type] = next_batch_index
            final_batch = False

        # Slice the collection for the requested data_type (train or test)
        batch_ids = sids_collection[current_batch_index:next_batch_index]

        input_spects = []
        output_masks = []

        for sid in batch_ids:
            filepath, beg, end, window_masked = profiles[sid]

            if sid in windows_masked:
                window_masked = windows_masked[sid]
            else:
                window_masked = np.array(window_masked, dtype=np.float32)
                windows_masked[sid] = window_masked

            if filepath in spects:
                file_spect = spects[filepath]
            else:
                with open(filepath, 'rb') as f:
                    file_spect = pickle.load(f).transpose(1, 0)
                spects[filepath] = file_spect

            windowed_spect = file_spect[beg:end, :]
            input_spects.append(windowed_spect)
            output_masks.append(window_masked)

        return input_spects, output_masks, final_batch

    def train_batch_gen(batch_size):
        return get_batch(batch_size, 'train')

    def test_batch_gen(batch_size):
        return get_batch(batch_size, 'test')

    factory = NDS2SAEFactory()
    factory.set_output(save_to)
    factory.lrtype = variables['lrtype']
    factory.lrargs = variables['lrargs']
    factory.input_dim = variables['input_dims']
    factory.output_dim = variables['output_dims']
    factory.keep_prob = keep_prob
    factory.stop_pad_length = 0
    factory.go_token = -1
    factory.layer_sizes = infer_topology(topology, variables['input_dims'])
    encoder = factory.build()
    encoder.train(train_batch_gen, test_batch_gen, batch_size=batch_size,
                  n_iterations=n_iterations, display_step=100, save_step=500)
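# A minimal, self-contained sketch of the epoch-cycling pattern that
# get_batch() above implements: shuffle the ids at the start of every epoch,
# hand out consecutive slices, and flag the final (possibly short) batch so
# the caller knows the epoch has ended. The names make_batch_gen/ids are
# illustrative, not part of this codebase.
import numpy as np


def make_batch_gen(ids):
    state = dict(index=0)

    def next_batch(batch_size):
        # index == 0 marks the start of a new epoch: reshuffle in place
        if state['index'] == 0:
            np.random.shuffle(ids)
        beg = state['index']
        end = min(beg + batch_size, len(ids))
        final = end >= len(ids)
        state['index'] = 0 if final else end
        return ids[beg:end], final

    return next_batch


# Cycling through 10 ids in batches of 4 yields slices of 4, 4 and 2,
# with final=True on the last one.
gen = make_batch_gen(np.arange(10))
batch, final = gen(4)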
def train(variables, save_to):
    sids_for_training = variables['sids_for_training']
    sids_for_testing = variables['sids_for_testing']
    n_train = len(sids_for_training)
    n_test = len(sids_for_testing)
    spect_dir = variables['spect_dir']
    format = variables['format']
    topology = variables['topology']
    batch_size = variables['batch_size']
    n_iterations = variables['n_iterations']
    keep_prob = variables['keep_prob']

    batch_index_limits = dict(train=n_train, test=n_test)
    sids_collections = dict(train=sids_for_training, test=sids_for_testing)

    def get_batch(this_batch_size=10, data_type='train'):
        batch_index_limit = batch_index_limits[data_type]
        sids_collection = sids_collections[data_type]

        if this_batch_size is None:
            this_batch_size = batch_index_limit

        current_batch_index = variables['current_batch_index'][data_type]
        next_batch_index = current_batch_index + this_batch_size

        # index 0 marks the start of a new epoch: reshuffle the ids in place
        if current_batch_index == 0:
            np.random.shuffle(sids_collection)

        if next_batch_index >= batch_index_limit:
            next_batch_index = batch_index_limit
            variables['current_batch_index'][data_type] = 0
            final_batch = True
        else:
            variables['current_batch_index'][data_type] = next_batch_index
            final_batch = False

        # Slice the collection for the requested data_type (train or test)
        batch_ids = sids_collection[current_batch_index:next_batch_index]

        spects = []
        for sid in batch_ids:
            spect_path = os.path.join(spect_dir, '{}.{}'.format(sid, format))
            with open(spect_path, 'rb') as f:
                spect = pickle.load(f)
            spects.append(spect.transpose(1, 0))

        # The autoencoder reconstructs its input, so the same spectrograms
        # serve as both input and target.
        return spects, spects, final_batch

    def train_batch_gen(batch_size):
        return get_batch(batch_size, 'train')

    def test_batch_gen(batch_size):
        return get_batch(batch_size, 'test')

    factory = NDS2SAEFactory()
    # Resume from an existing checkpoint if one has been saved before.
    if os.path.isfile(save_to):
        factory.load(save_to)
    factory.lrtype = variables['lrtype']
    factory.lrargs = variables['lrargs']
    factory.input_dim = variables['dims']
    factory.output_dim = variables['dims']
    factory.keep_prob = keep_prob
    factory.stop_pad_length = 5
    factory.stop_pad_token = 0
    factory.pad_token = -2
    factory.go_token = -3
    factory.layer_sizes = infer_topology(topology, variables['dims'])
    encoder = factory.build(save_to)
    encoder.train(train_batch_gen, test_batch_gen, batch_size=batch_size,
                  n_iterations=n_iterations, display_step=100, save_step=200)
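# Hedged sketch of the on-disk spectrogram format that get_batch() above
# assumes: one pickled 2-D array per segment id, stored frequency-major
# (n_bins, n_frames) and transposed to time-major (n_frames, n_bins) before
# being fed to the sequence autoencoder. The directory, the '.spect'
# extension and the array shape below are illustrative assumptions.
import os
import pickle
import numpy as np

spect_dir = '/tmp/spects'                              # illustrative location
os.makedirs(spect_dir, exist_ok=True)
spect = np.random.rand(64, 120).astype(np.float32)     # (n_bins, n_frames)
with open(os.path.join(spect_dir, '1.spect'), 'wb') as f:
    pickle.dump(spect, f)

with open(os.path.join(spect_dir, '1.spect'), 'rb') as f:
    time_major = pickle.load(f).transpose(1, 0)        # (n_frames, n_bins)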
def handle(self, *args, **options):
    mode = options['mode']
    database_name = options['database_name']
    database_only = options['database_only']
    kernel_only = options['kernel_only']
    load_from = options['load_from']
    tmp_dir = options['tmp_dir']
    dm_name = options['dm_name']
    format = options['format']
    with_duration = options['with_duration']
    min_max_loc = options['min_max_loc']
    denormalised = options['denormalised']

    extractor = extractors[format]

    if database_name is None and database_only:
        raise Exception(
            'database_name must be provided when database_only is True')

    if denormalised and min_max_loc is None:
        raise Exception(
            'If data is denormalised, --min-max-loc must be provided')

    if mode not in ['showcase', 'dm']:
        raise Exception('--mode can only be "showcase" or "dm"')

    if mode == 'showcase':
        if dm_name is not None:
            raise Exception(
                'Can\'t accept --dm-name argument in showcase mode')
    else:
        if dm_name is None:
            raise Exception('Must provide --dm-name argument in dm mode')
        if database_name is None:
            raise Exception('database-name is required in dm mode')

    if not load_from.lower().endswith('.zip'):
        load_from += '.zip'

    if not os.path.isdir(tmp_dir):
        mkdirp(tmp_dir)

    variables = read_variables(load_from)
    variables['tmp_dir'] = tmp_dir
    variables['dm_name'] = dm_name
    variables['extractor'] = extractor
    variables['with_duration'] = with_duration
    variables['denormalised'] = denormalised

    if denormalised:
        global_min, global_max = load_global_min_max(min_max_loc)
        variables['global_min'] = global_min
        variables['global_max'] = global_max

    variables['is_log_psd'] = format.startswith('log_')

    factory = NDS2SAEFactory()
    factory.load(load_from)
    factory.learning_rate = None
    factory.learning_rate_func = None
    encoder = factory.build(load_from)
    session = encoder.recreate_session()

    if mode == 'showcase':
        showcase_reconstruct(variables, encoder, session, database_name,
                             database_only)
    else:
        encode_into_datamatrix(variables, encoder, session, database_name,
                               kernel_only)

    session.close()
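# Hedged sketch of the denormalisation implied by load_global_min_max():
# assuming values were min-max scaled into [0, 1] during training, the
# original scale is recovered as x * (max - min) + min. The pickled
# (global_min, global_max) file layout and the formula are assumptions,
# not confirmed by this module.
import pickle
import numpy as np


def load_global_min_max_sketch(path):
    with open(path, 'rb') as f:
        return pickle.load(f)    # assumed: a (global_min, global_max) pair


def denormalise(x, global_min, global_max):
    return x * (global_max - global_min) + global_min


denormalise(np.array([0.0, 0.5, 1.0]), -80.0, 0.0)     # -> [-80., -40., 0.]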
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    profile = options['profile']
    load_from = options['load_from']
    format = options['format']
    min_max_loc = options['min_max_loc']
    denormalised = options['denormalised']
    kernel_only = options['kernel_only']

    extractor = extractors[format]

    tsv_file = profile + '.tsv'
    trials_file = profile + '.trials'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

    open_mode = 'w'

    assert clsf_type in classifiers.keys(), \
        'Unknown classifier: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    if not load_from.lower().endswith('.zip'):
        load_from += '.zip'

    variables = read_variables(load_from)
    variables['extractor'] = extractor
    variables['denormalised'] = denormalised

    if denormalised:
        global_min, global_max = load_global_min_max(min_max_loc)
        variables['global_min'] = global_min
        variables['global_max'] = global_max

    variables['is_log_psd'] = format.startswith('log_')

    factory = NDS2SAEFactory()
    factory.set_output(load_from)
    factory.learning_rate = None
    factory.learning_rate_func = None
    encoder = factory.build()
    session = encoder.recreate_session()

    _sids, full_data = encode_into_data(variables, encoder, session,
                                        database_name, kernel_only)
    labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator,
                                              min_occur)

    # Drop rows whose segments have no label at the requested level
    if len(no_label_ids) > 0:
        sids, _, labels = exclude_no_labels(_sids, None, labels, no_label_ids)
        lookup_ids_rows = np.searchsorted(_sids, sids)
        full_data = full_data[lookup_ids_rows, :]

    full_data = zscore(full_data)
    full_data[np.where(np.isnan(full_data))] = 0
    full_data[np.where(np.isinf(full_data))] = 0

    ndims = full_data.shape[1]
    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    dp = EnumDataProvider(full_data, labels, balanced=True)
    trainvalidset, testset = dp.split(test_ratio, limits=(ipc_min, ipc_max))

    v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
    nfolds = int(np.floor(1. / v2t_ratio + 0.01))

    params_names = []
    params_converters = []
    params_count = 0

    def loss(params):
        classifier_args = {}
        for i in range(params_count):
            param_name = params_names[i]
            param_converter = params_converters[i]
            param_value = params[i]
            classifier_args[param_name] = param_converter(param_value)

        print(classifier_args)
        score = perform_k_fold(classifier, trainvalidset, nfolds, v2t_ratio,
                               nlabels, **classifier_args)
        # hyperopt minimises, so convert the accuracy score into a loss
        return 1. - score
    n_estimators_choices = hp.uniform('n_estimators', 40, 100)
    min_samples_split_choices = hp.uniform('min_samples_split', 2, 21)
    min_samples_leaf_choices = hp.uniform('min_samples_leaf', 1, 20)
    n_features = full_data.shape[1]
    auto_gamma = 1 / n_features
    gamma_choices = hp.uniform('gamma', auto_gamma / 10, auto_gamma * 10)
    c_choices = hp.uniform('C', -1, 2)
    hidden_layer_size_choices = hp.uniform('hidden_layer_sizes', 100, 5000)
    n_neighbors_choices = hp.uniform('n_neighbors', 1, 10)

    # Per-classifier search space: each hyperparameter maps to a converter
    # (hyperopt samples floats, converters cast to the required type/scale)
    # and its hyperopt distribution.
    choices = {
        'rf': {
            'n_estimators': (lambda x: int(np.round(x)), n_estimators_choices),
            'min_samples_split': (lambda x: int(np.round(x)), min_samples_split_choices),
            'min_samples_leaf': (lambda x: int(np.round(x)), min_samples_leaf_choices),
        },
        'svm_rbf': {
            'gamma': (float, gamma_choices),
            'C': (lambda x: 10 ** x, c_choices),
        },
        'svm_linear': {
            'C': (lambda x: 10 ** x, c_choices),
        },
        'nnet': {
            'hidden_layer_sizes': (lambda x: (int(np.round(x)),), hidden_layer_size_choices)
        },
        'knn': {
            'n_neighbors': (lambda x: int(np.round(x)), n_neighbors_choices)
        }
    }

    space = []
    for arg_name, (converter, arg_values) in choices[clsf_type].items():
        space.append(arg_values)
        params_names.append(arg_name)
        params_converters.append(converter)
        params_count += 1

    trials = Trials()
    max_evals = params_count * 10
    best = fmin(fn=loss, space=space, algo=tpe.suggest, max_evals=max_evals,
                trials=trials)
    print(best)

    with open(trials_file, 'wb') as f:
        pickle.dump(trials, f)

    best_trial = trials.best_trial
    best_trial_args_values_ = best_trial['misc']['vals']
    best_trial_args_values = {}
    for arg_name, arg_values in best_trial_args_values_.items():
        converter = choices[clsf_type][arg_name][0]
        arg_value = converter(arg_values[0])
        best_trial_args_values[arg_name] = arg_value

    model_args = ['id'] + list(best_trial_args_values.keys()) + ['accuracy']
    model_args_values = {x: [] for x in model_args}
    for idx, trial in enumerate(trials.trials):
        if trial == best_trial:
            idx = 'Best'
        trial_args_values = trial['misc']['vals']
        for arg_name in model_args:
            if arg_name == 'id':
                model_args_values['id'].append(idx)
            elif arg_name == 'accuracy':
                trial_accuracy = 1. - trial['result']['loss']
                model_args_values['accuracy'].append(trial_accuracy)
            else:
                converter = choices[clsf_type][arg_name][0]
                val = converter(trial_args_values[arg_name][0])
                model_args_values[arg_name].append(val)

    # Perform classification on the test set
    train_x = np.array(trainvalidset.data)
    train_y = np.array(trainvalidset.labels, dtype=np.int32)
    test_x = np.array(testset.data)
    test_y = np.array(testset.labels, dtype=np.int32)

    score, label_hits, label_misses, cfmat, importances = \
        classifier(train_x, train_y, test_x, test_y, nlabels, True,
                   **best_trial_args_values)
    lb_hitrates = label_hits / (label_hits + label_misses).astype(float)

    with open(tsv_file, open_mode, encoding='utf-8') as f:
        for arg in model_args:
            values = model_args_values[arg]
            f.write('{}\t'.format(arg))
            f.write('\t'.join(map(str, values)))
            f.write('\n')

        f.write('Results using best model\'s parameters on testset\n')
        f.write('Feature group\tNdims\tLabel prediction score\t{}\n'.format(
            '\t'.join(unique_labels)))
        f.write('{}\t{}\t{}\t{}\n'.format('s2senc', ndims, score,
                                          '\t'.join(map(str, lb_hitrates))))
        f.write('\n')

    open_mode = 'a'
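# A standalone sketch of the hyperopt pattern used above: fmin() minimises an
# objective over the search space with TPE, and the Trials object records
# every evaluation so the best one can be inspected afterwards. The quadratic
# objective is illustrative only; in handle() the objective is the k-fold
# classification loss.
from hyperopt import Trials, fmin, hp, tpe


def objective(params):
    x, = params                  # space is a list, so params arrives as one
    return (x - 3.0) ** 2        # minimum at x = 3


space = [hp.uniform('x', -10.0, 10.0)]
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=50, trials=trials)
print(best)                                    # e.g. {'x': 3.02...}
print(trials.best_trial['misc']['vals'])       # raw sampled values, as used above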