}
    },
    "data_path": "/Users/dedan/projects/master/data",
    "glomerulus": "Or22a",
    "randomization_test": False
}

# get the molids
# The spectra are only loaded to learn which molecule ids exist; an empty
# feature list per id is handed to run_lib to obtain the matching targets.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
spectra_path = '/Users/dedan/projects/master/data/spectral_features/large_base/parsed.pckl'
# binary mode is the correct mode for pickles; the context manager closes the
# handle even when unpickling fails
with open(spectra_path, 'rb') as spectra_file:
    spectra = pickle.load(spectra_file)
_, targets, molids = run_lib.load_data_targets(config, {str(i): [] for i in spectra.keys()})

# some molids map to two CAS numbers for some molecules, use only first
# (remember the first position of every molid in one pass instead of calling
# list.index once per unique molid)
first_position = {}
for position, molid in enumerate(molids):
    first_position.setdefault(molid, position)
first_molids_idx = sorted(first_position.values())
targets = targets[first_molids_idx]
molids = [molids[i] for i in first_molids_idx]

# place the intensity values in a high-resolution vector
freqs = {k: v for k, v in spectra.items() if k in molids}
data_orig = flib._place_waves_in_vector(freqs, 0.01, True, 'ir')
# every remaining molecule needs exactly one target and one data row
assert len(molids) == len(targets) == data_orig.shape[0]

# randomization
# Shuffle the values of every column of data_orig independently and in place.
# NOTE(review): seed() without an argument re-seeds from a fresh entropy source,
# so the shuffling differs between runs; the shuffle is applied unconditionally.
np.random.seed()
# explicit loop instead of map(): map is lazy on Python 3 and would silently
# skip the shuffling there.
# assumes data_orig is a NumPy ndarray, so each row of data_orig.T is a view
# onto a column of data_orig -- TODO confirm
for feature_column in data_orig.T:
    np.random.shuffle(feature_column)

# fit model
# score the features, keep the 2**9 best ones and train on the reduced matrix
sel_scores = run_lib.get_selection_score(config, data_orig, targets)
data = flib.select_k_best(data_orig, sel_scores, 2**9)
tmp_res = run_lib.run_runner(config, data, targets, get_models=True)
# single-argument call form prints the same on Python 2 and Python 3
print(tmp_res['svr']['model'].oob_score_)
# One figure with three stacked histograms, written to distributions.png.
fig = plt.figure()

# top panel: all values of freqs chained together, 400 bins over [0, 4000]
all_freqs = list(it.chain.from_iterable(freqs.values()))
ax = fig.add_subplot(3, 1, 1)
ax.hist(all_freqs, bins=400, range=[0, 4000])
ax.set_xlabel("histogram of all frequencies")

# middle panel: distribution of the target values
ax = fig.add_subplot(3, 1, 2)
ax.hist(targets)
ax.set_xlabel("target value histogram")

# bottom panel: frequency distribution of active targets, i.e. molecules whose
# target exceeds active_thresh; 40000 bins over [0, 4000] gives a bin width of 0.1
active = [molids[idx] for idx in range(len(molids)) if targets[idx] > active_thresh]
active_spectra = [freqs[molid] for molid in freqs if molid in active]
act_freqs = list(it.chain.from_iterable(active_spectra))
ax = fig.add_subplot(3, 1, 3)
ax.hist(act_freqs, bins=40000, range=[0, 4000])
ax.set_xlabel("do ligand share bins for resolution of 0.1?")

distributions_path = os.path.join(outpath, "distributions.png")
fig.savefig(distributions_path)

# Scatter pairwise feature distances against pairwise target distances and
# write the figure to distance_relation.png.
fig = plt.figure()
ma = flib._place_waves_in_vector(freqs, 0.1, True, "ir")
target_distances, ligands = pdist_1d(targets)

# keep the 2 ** 11 features that score best under the "linear" selection
# method, then take cosine distances between the reduced rows
f_select_config = dict(feature_selection=dict(method="linear"))
sel_scores = run_lib.get_selection_score(f_select_config, ma, targets)
ma_sel = flib.select_k_best(ma, sel_scores, 2 ** 11)
feature_distances_sel = pdist(ma_sel, "cosine")

# entries not flagged by `ligands` are drawn first as blue crosses, flagged
# entries afterwards as red crosses
ax = fig.add_subplot(1, 1, 1)
for pair_mask, marker in ((~ligands, "xb"), (ligands, "xr")):
    ax.plot(feature_distances_sel[pair_mask], target_distances[pair_mask], marker)

distance_plot_path = os.path.join(outpath, "distance_relation.png")
fig.savefig(distance_plot_path)
from master.libs import features_lib as flib
import numpy as np
import pylab as plt
from mpl_toolkits.mplot3d import Axes3D
# pick up edits to the project libraries without restarting the interpreter
reload(run_lib)
reload(flib)

# search config
# path of the JSON config is the first command line argument; the context
# manager closes the file once it is parsed
with open(sys.argv[1]) as config_file:
    config = json.load(config_file)

# load the features
features = run_lib.prepare_features(config)

# load_data_targets is unpacked into three values (data, targets, molids) at
# its other call sites in this file; the third value is not needed here.
# NOTE(review): confirm against run_lib that three values are returned.
data, targets, _ = run_lib.load_data_targets(config, features)
sel_scores = run_lib.get_selection_score(config, data, targets)
data = flib.select_k_best(data, sel_scores, config['feature_selection']['k_best'])
tmp_res = run_lib.run_runner(config, data, targets, get_models=True)
model = tmp_res['svr_ens']['model']

# upper of two stacked panels: the selection score of every feature
fig = plt.figure()
ax = fig.add_subplot(211)
ax.plot(sel_scores)

# 3D plot with dots colour-coded by target value.
# [1 0 4] is the order in which the features were selected:
#   "('WALK_PATH_COUNTS', 'MPC04')"        --> molecular path count of order 04
#   "('TOPOLOGICAL', 'TI2')"               --> second Mohar index TI2
#   "('TWOD_AUTOCORRELATIONS', 'GATS1m')"  --> Geary autocorrelation, lag 1,
#                                              weighted by atomic masses

# single 3D axes filling a 10 x 7 inch figure
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(1, 1, 1, projection='3d')
# Lookup tables between molecule ids, names and CAS numbers.
# id2name / id2cas fall back to an empty string for unknown ids.
id2name = defaultdict(str, rdl.get_id2name())
with open(os.path.join(data_path, 'door2id.json')) as door2id_file:
    door2id = json.load(door2id_file)
# entries of door2id may be empty; only CAS numbers with at least one molid are
# inverted, and the first molid of each entry is used
id2cas = defaultdict(str, {val[0]: key for key, val in door2id.items() if val})
with open(os.path.join(data_path, 'daniel_set.json')) as daniel_set_file:
    daniel_set = json.load(daniel_set_file)

# Translate the CAS numbers of the set into molids. CAS numbers that are
# missing from door2id or have an empty entry are skipped (and reported)
# instead of aborting the script with a KeyError / IndexError.
cas_without_molid = [cas for cas in daniel_set if not door2id.get(cas)]
if cas_without_molid:
    print("no molid for {} CAS numbers, skipping them".format(len(cas_without_molid)))
daniel_set_molid = [door2id[cas][0] for cas in daniel_set if door2id.get(cas)]

# Rebuild the configuration from the best parameters of the search result and
# load the features, data matrix, targets and molids that belong to it.
config = rdl.get_best_params(inpath, descriptor, glom, method, selection)
features = run_lib.prepare_features(config)
data, targets, molids = run_lib.load_data_targets(config, features)


# Fit the model: score the features, keep the configured number of best ones
# and train on the reduced matrix.
n_training_molecules = data.shape[0]
print("use {} molecules for training".format(n_training_molecules))
sel_scores = run_lib.get_selection_score(config, data, targets)
k_best = config['feature_selection']['k_best']
data = flib.select_k_best(data, sel_scores, k_best)
tmp_res = run_lib.run_runner(config, data, targets, get_models=True)
# the trained estimator for the chosen method
model = tmp_res[method]['model']

# Compute predictions for the molecules of the set that were not part of the
# training data.
candidate_molids = list(set(daniel_set_molid).difference(molids))
print("want to predict on {} molecules".format(len(candidate_molids)))

# only molecules with a non-empty feature vector can be predicted; the same
# filtered list drives both the data rows and the molid array so they stay
# aligned
molids_with_features = [m for m in candidate_molids if len(features[m]) != 0]
data_to_predict = np.array([features[m] for m in molids_with_features])
print("found features for {} molecules".format(data_to_predict.shape[0]))
molids_to_predict = np.array(molids_with_features)

# apply the feature selection that was used for training before predicting
k_best = config['feature_selection']['k_best']
data_to_predict = flib.select_k_best(data_to_predict, sel_scores, k_best)
assert len(data_to_predict) == len(molids_to_predict)
predictions = model.predict(data_to_predict)
with open(os.path.join(outpath, glom + '_daniel_predict.csv'), 'w') as f:
    writer = csv.writer(f, delimiter=',')