} }, "data_path": "/Users/dedan/projects/master/data", "glomerulus": "Or22a", "randomization_test": False } # get the molids spectra = pickle.load(open('/Users/dedan/projects/master/data/spectral_features/large_base/parsed.pckl')) _, targets, molids = run_lib.load_data_targets(config, {str(i): [] for i in spectra.keys()}) # some molids map to two CAS numbers for some molecules, use only first first_molids_idx = sorted([molids.index(m) for m in set(molids)]) targets = targets[first_molids_idx] molids = [molids[i] for i in first_molids_idx] # place the intensity values in a high-resolution vector freqs = {k: v for k, v in spectra.items() if k in molids} data_orig = flib._place_waves_in_vector(freqs, 0.01, True, 'ir') assert len(molids) == len(targets) == data_orig.shape[0] # randomization np.random.seed() map(np.random.shuffle, data_orig.T) # fit model sel_scores = run_lib.get_selection_score(config, data_orig, targets) data = flib.select_k_best(data_orig, sel_scores, 2**9) tmp_res = run_lib.run_runner(config, data, targets, get_models=True) print tmp_res['svr']['model'].oob_score_
# ---------------------------------------------------------------------------
# Figure 1: three stacked histograms, written to "distributions.png"
# ---------------------------------------------------------------------------
fig = plt.figure()

# flatten the per-molecule frequency lists into one long list
all_freqs = list(it.chain.from_iterable(freqs.values()))

# top panel: all frequencies, 400 bins over the range 0..4000
ax = fig.add_subplot(3, 1, 1)
ax.hist(all_freqs, bins=400, range=[0, 4000])
ax.set_xlabel("histogram of all frequencies")

# middle panel: distribution of the target values
ax = fig.add_subplot(3, 1, 2)
ax.hist(targets)
ax.set_xlabel("target value histogram")

# frequency distribution of active targets
# (a molid counts as active when its target exceeds active_thresh)
active = [m for i, m in enumerate(molids) if targets[i] > active_thresh]
active_spectra = [v for k, v in freqs.items() if k in active]
act_freqs = list(it.chain.from_iterable(active_spectra))

# bottom panel: 40000 bins over 0..4000, i.e. a bin width of 0.1
ax = fig.add_subplot(3, 1, 3)
ax.hist(act_freqs, bins=40000, range=[0, 4000])
ax.set_xlabel("do ligand share bins for resolution of 0.1?")

distributions_file = os.path.join(outpath, "distributions.png")
fig.savefig(distributions_file)

# ---------------------------------------------------------------------------
# Figure 2: look at the relation between data and target inner distances,
# written to "distance_relation.png"
# ---------------------------------------------------------------------------
fig = plt.figure()

# spectra placed in a vector with resolution 0.1
ma = flib._place_waves_in_vector(freqs, 0.1, True, "ir")
# `ligands` is used as an index mask on the distance vectors below;
# presumably a boolean array -- verify against pdist_1d
target_distances, ligands = pdist_1d(targets)

# keep the 2 ** 11 features selected by the "linear" selection method
f_select_config = {"feature_selection": {"method": "linear"}}
sel_scores = run_lib.get_selection_score(f_select_config, ma, targets)
ma_sel = flib.select_k_best(ma, sel_scores, 2 ** 11)
feature_distances_sel = pdist(ma_sel, "cosine")

# non-ligand entries first as blue crosses, then ligand entries as red crosses
ax = fig.add_subplot(1, 1, 1)
for mask, marker in ((~ligands, "xb"), (ligands, "xr")):
    ax.plot(feature_distances_sel[mask], target_distances[mask], marker)

distance_relation_file = os.path.join(outpath, "distance_relation.png")
fig.savefig(distance_relation_file)