reload(run_lib)
plt.close('all')

n_folds_list = [5, 10, 20, 50]
n_repetitions = 5
method = 'svr'
out_path = '/Users/dedan/projects/master/results/validation/gen_score_svr'
base_path = os.path.join(os.path.dirname(__file__), '..')
config = json.load(open(os.path.join(base_path, 'config', 'validate_genscore_svr.json')))
config['data_path'] = os.path.join(base_path, 'data')

# load the features
features = run_lib.prepare_features(config)
used_glomeruli = json.load(open(os.path.join(config['data_path'], 'used_glomeruli.json')))

# collect gen_scores per glomerulus and number of CV folds
res = {g: {nf: [] for nf in n_folds_list} for g in used_glomeruli}
for glom in used_glomeruli:
    print(glom)
    config['glomerulus'] = glom
    data, targets, molids = run_lib.load_data_targets(config, features)
    config['feature_selection']['k_best'] = data.shape[1]
    for i, n_folds in enumerate(n_folds_list):
        print(n_folds)
        config['methods'][method]['n_folds'] = n_folds
        for j in range(n_repetitions):
            run_res = run_lib.run_runner(config, data, targets)
            res[glom][n_folds].append(run_res[method]['gen_score'])
json.dump(res, open(os.path.join(out_path, 'res.json'), 'w'))
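# Illustrative follow-up (assumption, not part of the original script): a minimal
# sketch of how the res structure written to res.json above could be summarized,
# assuming the layout glomerulus -> n_folds -> list of gen_scores and that numpy
# is available as np.
for glom, by_folds in res.items():
    for nf in n_folds_list:
        scores = by_folds[nf]
        print('{}, {} folds: {:.2f} +/- {:.2f}'.format(
            glom, nf, np.mean(scores), np.std(scores)))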
dtm = {}
for name, config in feat_config.items():
    base_config['features'].update(config)
    data, targets, molids = rl.load_data_targets(base_config, cache[name]['features'])
    dtm[name] = {'data': data, 'targets': targets, 'molids': molids}

# select molecules that none of the models will be trained on
all_trained = set.union(*(set(m['molids']) for m in dtm.values()))
to_predict_molids = mol_intersection - all_trained

for name, data in dtm.items():

    # fit model
    print('working on model: {}'.format(name))
    base_config['feature_selection']['k_best'] = data['data'].shape[1]
    print("use {} molecules for training".format(data['data'].shape[0]))
    tmp_res = rl.run_runner(base_config, data['data'], data['targets'], get_models=True)

    # predict the held-out molecules and store everything per model and glomerulus
    to_predict = np.array([cache[name]['features'][molid] for molid in to_predict_molids])
    res[name][glom]['predictions'] = tmp_res[method]['model'].predict(to_predict)
    res[name][glom]['cases'] = [id2door[molid] for molid in to_predict_molids]
    res[name][glom]['targets'] = data['targets']
    res[name][glom]['score'] = tmp_res[method]['gen_score']
    print('model genscore: {:.2f}\n'.format(tmp_res[method]['gen_score']))

pickle.dump(dict(res), open(os.path.join(outpath, 'predictions.pkl'), 'w'))
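# Illustrative follow-up (assumption, not part of the original script): a minimal
# sketch of reading predictions.pkl back, assuming the nested structure written
# above (model name -> glomerulus -> predictions/cases/targets/score).
loaded = pickle.load(open(os.path.join(outpath, 'predictions.pkl')))
for name, per_glom in loaded.items():
    for g, entry in per_glom.items():
        print('{} / {}: gen_score {:.2f}, {} molecules predicted'.format(
            name, g, entry['score'], len(entry['predictions'])))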
}, "randomization_test": False } used_gloms = json.load(open(os.path.join(config['data_path'], 'used_glomeruli.json'))) alone_haddad, alone_vib, together = [], [], [] for glom in used_gloms: config['glomerulus'] = glom # prepare haddad features features_h = run_lib.prepare_features(config) data_h, targets_h, molids_h = run_lib.load_data_targets(config, features_h) config['feature_selection']['k_best'] = data_h.shape[1] tmp = run_lib.run_runner(config, data_h, targets_h) print glom, tmp alone_haddad.append(tmp['svr']['gen_score']) # prepare vib100 config_spec = copy.deepcopy(config) config_spec['features']['type'] = 'spectral' config_spec['features']['kernel_width'] = 100 config_spec['features']['bin_width'] = 150 config_spec['features']['use_intensity'] = False config_spec['features']['spec_type'] = 'ir' features_v = run_lib.prepare_features(config_spec) data_v, targets_v, molids_v = run_lib.load_data_targets(config_spec, features_v) config['feature_selection']['k_best'] = data_v.shape[1] tmp = run_lib.run_runner(config, data_v, targets_v)
# overwrite optimal (param search results) parameters
for m in list(config['methods'].keys()):
    if m != method:
        del config['methods'][m]
config['methods'][method]['C'] = 1.0
del config['methods'][method]['regularization']
config['feature_selection']['k_best'] = k_best_dict[descriptor][-1]

# load features
print 'preparing features..'
features = run_lib.prepare_features(config)
data, targets, molids = run_lib.load_data_targets(config, features)

# fit model
print("use {} molecules for training".format(data.shape[0]))
tmp_res = run_lib.run_runner(config, data, targets, get_models=True)
model = tmp_res[method]['model']

# # structure plot for active targets and predictions
# active_targets = np.where(targets > active_thresh)[0]
# act_molids = [molids[i] for i in active_targets]
# active_predictions = np.where(predictions > active_thresh)[0]
# act_predict_molids = [molids_to_predict[i] for i in active_predictions]
# fig = plt.figure(figsize=(5, 5))
# plib.structure_plot(fig, (act_molids, act_predict_molids),
#                     (targets[active_targets], predictions[active_predictions]))
# fig.suptitle(glom)
# fig.savefig(os.path.join(outpath, glom + '_structures.png'))

fig = plt.figure()
import os
import json

from master.libs import run_lib

import numpy as np
import pylab as plt

config = {
    "data_path": os.path.join(os.path.dirname(__file__), "..", "data"),
    "features": {
        "type": "conventional",
        "descriptor": "all",
        "normalize": True,
        "properties_to_add": []
    },
    "feature_selection": {"method": "linear"},
    "methods": {"svr": {"C": 1.0, "n_folds": 50}},
    "randomization_test": False,
}

used_gloms = json.load(open(os.path.join(config["data_path"], "used_glomeruli.json")))

for glom in used_gloms:
    config["glomerulus"] = glom

    # load the full conventional descriptor set (reset the descriptor in case the
    # previous iteration switched it to saito_desc)
    config["features"]["descriptor"] = "all"
    features = run_lib.prepare_features(config)
    data_all, targets, _ = run_lib.load_data_targets(config, features)

    # load the saito descriptor subset for the same glomerulus
    config["features"]["descriptor"] = "saito_desc"
    data_saito, _, _ = run_lib.load_data_targets(config, features)

    np.random.seed()
    # map(np.random.shuffle, data_all.T)

    # combined feature matrix (not used in the run below)
    new_data = np.hstack((data_saito, data_all))

    config["feature_selection"]["k_best"] = data_all.shape[1]
    tmp = run_lib.run_runner(config, data_all, targets, False, False)
    print glom, tmp["svr"]["gen_score"]
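    # Illustrative variation (assumption, not part of the original script): the
    # stacked matrix new_data built above could be evaluated the same way, to
    # compare the combined descriptors against the "all" set alone.
    config["feature_selection"]["k_best"] = new_data.shape[1]
    tmp_combined = run_lib.run_runner(config, new_data, targets, False, False)
    print glom, 'combined', tmp_combined["svr"]["gen_score"]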