} }, "data_path": "/Users/dedan/projects/master/data", "glomerulus": "Or22a", "randomization_test": False } # get the molids spectra = pickle.load(open('/Users/dedan/projects/master/data/spectral_features/large_base/parsed.pckl')) _, targets, molids = run_lib.load_data_targets(config, {str(i): [] for i in spectra.keys()}) # some molids map to two CAS numbers for some molecules, use only first first_molids_idx = sorted([molids.index(m) for m in set(molids)]) targets = targets[first_molids_idx] molids = [molids[i] for i in first_molids_idx] # place the intensity values in a high-resolution vector freqs = {k: v for k, v in spectra.items() if k in molids} data_orig = flib._place_waves_in_vector(freqs, 0.01, True, 'ir') assert len(molids) == len(targets) == data_orig.shape[0] # randomization np.random.seed() map(np.random.shuffle, data_orig.T) # fit model sel_scores = run_lib.get_selection_score(config, data_orig, targets) data = flib.select_k_best(data_orig, sel_scores, 2**9) tmp_res = run_lib.run_runner(config, data, targets, get_models=True) print tmp_res['svr']['model'].oob_score_
# --- figure 1: frequency and target distributions -------------------------
fig = plt.figure()

# all frequencies of all molecules, flattened into one list
all_freqs = list(it.chain.from_iterable(freqs.values()))
ax = fig.add_subplot(3, 1, 1)
ax.hist(all_freqs, bins=400, range=(0, 4000))
ax.set_xlabel("histogram of all frequencies")

ax = fig.add_subplot(3, 1, 2)
ax.hist(targets)
ax.set_xlabel("target value histogram")

# frequency distribution of active targets
# (molecules whose target value exceeds active_thresh)
active = [m for i, m in enumerate(molids) if targets[i] > active_thresh]
act_freqs = list(
    it.chain.from_iterable(v for k, v in freqs.items() if k in active)
)
ax = fig.add_subplot(3, 1, 3)
# 40000 bins over [0, 4000] gives a bin width of 0.1
ax.hist(act_freqs, bins=40000, range=(0, 4000))
ax.set_xlabel("do ligand share bins for resolution of 0.1?")
fig.savefig(os.path.join(outpath, "distributions.png"))

# --- figure 2: relation between data and target inner distances -----------
fig = plt.figure()
ma = flib._place_waves_in_vector(freqs, 0.1, True, "ir")
target_distances, ligands = pdist_1d(targets)

# keep the 2 ** 11 best features according to the linear selection score
f_select_config = {"feature_selection": {"method": "linear"}}
sel_scores = run_lib.get_selection_score(f_select_config, ma, targets)
ma_sel = flib.select_k_best(ma, sel_scores, 2 ** 11)
feature_distances_sel = pdist(ma_sel, "cosine")

# pairs not flagged in `ligands` in blue, flagged pairs in red
ax = fig.add_subplot(1, 1, 1)
non_ligands = ~ligands
ax.plot(feature_distances_sel[non_ligands], target_distances[non_ligands], "xb")
ax.plot(feature_distances_sel[ligands], target_distances[ligands], "xr")
fig.savefig(os.path.join(outpath, "distance_relation.png"))
from master.libs import features_lib as flib
import numpy as np
import pylab as plt
from mpl_toolkits.mplot3d import Axes3D

# re-import the project libraries so that edits made during an interactive
# session are picked up
reload(run_lib)
reload(flib)

# search config: path to a JSON file given as first command line argument
# (read inside a with-block so the file handle is closed right away)
with open(sys.argv[1]) as config_file:
    config = json.load(config_file)

# load the features
features = run_lib.prepare_features(config)
# NOTE(review): other scripts unpack three values (data, targets, molids)
# from load_data_targets -- confirm which signature this script runs against
data, targets = run_lib.load_data_targets(config, features)

# keep only the k_best features according to the selection score and fit
sel_scores = run_lib.get_selection_score(config, data, targets)
data = flib.select_k_best(data, sel_scores, config['feature_selection']['k_best'])
tmp_res = run_lib.run_runner(config, data, targets, get_models=True)
model = tmp_res['svr_ens']['model']

# plot the selection scores
fig = plt.figure()
ax = fig.add_subplot(211)
ax.plot(sel_scores)

# 3D plot with dots color-coded by target value
# [1 0 4] is the order of features selected
# "('WALK_PATH_COUNTS', 'MPC04')" --> molecular path count of order 04
# "('TOPOLOGICAL', 'TI2')" --> second Mohar index TI2
# "('TWOD_AUTOCORRELATIONS', 'GATS1m')" --> Geary autocorrelation - lag 1 / weighted by atomic masses
fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111, projection='3d')
# lookup tables; the defaultdicts return '' for unknown ids
id2name = defaultdict(str, rdl.get_id2name())
with open(os.path.join(data_path, 'door2id.json')) as door2id_file:
    door2id = json.load(door2id_file)
# map the first id of every non-empty door2id entry back to its key
# (presumably a CAS number, going by the variable names -- TODO confirm)
id2cas = defaultdict(str, {val[0]: key for key, val in door2id.items() if val})
with open(os.path.join(data_path, 'daniel_set.json')) as daniel_set_file:
    daniel_set = json.load(daniel_set_file)
daniel_set_molid = [door2id[cas][0] for cas in daniel_set]

# get the best parameters from the search result
config = rdl.get_best_params(inpath, descriptor, glom, method, selection)
features = run_lib.prepare_features(config)
data, targets, molids = run_lib.load_data_targets(config, features)

# fit model
print("use {} molecules for training".format(data.shape[0]))
sel_scores = run_lib.get_selection_score(config, data, targets)
data = flib.select_k_best(data, sel_scores, config['feature_selection']['k_best'])
tmp_res = run_lib.run_runner(config, data, targets, get_models=True)
model = tmp_res[method]['model']

# compute predictions for the molecules that were not used for training
molids_to_predict = list(set(daniel_set_molid).difference(molids))
print("want to predict on {} molecules".format(len(molids_to_predict)))

# drop molecules without features; filtering once guarantees that the rows
# of data_to_predict and the entries of molids_to_predict stay aligned
molids_with_features = [m for m in molids_to_predict if len(features[m]) != 0]
data_to_predict = np.array([features[m] for m in molids_with_features])
print("found features for {} molecules".format(data_to_predict.shape[0]))
molids_to_predict = np.array(molids_with_features)

# apply the same feature selection that was used for training
data_to_predict = flib.select_k_best(data_to_predict, sel_scores, config['feature_selection']['k_best'])
assert len(data_to_predict) == len(molids_to_predict)
predictions = model.predict(data_to_predict)

# write the results to <outpath>/<glom>_daniel_predict.csv
with open(os.path.join(outpath, glom + '_daniel_predict.csv'), 'w') as f:
    writer = csv.writer(f, delimiter=',')