def read_data_sets_mol(fname, validation_rate=0, test_rate=0, disp=False):
    """Load a molecule CSV and package it into train/validation/test sets.

    The CSV at ``fname`` is read with pandas. Four descriptor blocks are
    fetched through the ``jpd`` helpers and joined column-wise into a single
    feature matrix. The ``'exp'`` target column is binarised: values greater
    than zero become 1, everything else becomes 0.

    Args:
        fname: Path (or anything ``pd.read_csv`` accepts) of the input CSV.
        validation_rate: Forwarded to ``XY_split`` to carve out the validation
            set; the split is skipped unless the value is greater than zero.
            Presumably a fraction of the samples -- confirm against
            ``XY_split``.
        test_rate: Same as ``validation_rate`` but for the test set. The test
            split is taken first, validation is taken from what remains.
        disp: Forwarded unchanged to every ``DataSet_CSV`` constructor.

    Returns:
        A ``DataSets`` instance. ``train`` and ``IMAGE_PIXELS`` (the number of
        feature columns) are always set; ``test`` and ``validation`` are only
        set when their respective rate is greater than zero.
    """
    # NOTE(review): this file defines read_data_sets_mol more than once; the
    # definition that appears last is the one bound at import time.

    class DataSets(object):
        pass

    frame = pd.read_csv(fname)

    # Descriptor blocks, kept in the original column order.
    descriptor_blocks = [
        jpd.pd_get_xM(frame),
        jpd.pd_get_xM_MACCSkeys(frame),
        jpd.pd_get_xM_molw(frame),
        jpd.pd_get_xM_lasa(frame),
    ]
    xM = np.concatenate(descriptor_blocks, axis=1)
    n_features = xM.shape[1]

    # classification is performed: positive targets -> 1, the rest -> 0
    raw_targets = jpd.pd_get_yV(frame, y_id='exp').A1
    labels = [1 if value > 0 else 0 for value in raw_targets]

    X = np.array(xM)
    Y = np.array(labels)
    assert X.shape[0] == Y.shape[0]

    data_sets = DataSets()

    # Each positive rate peels a held-out part off the remaining data;
    # whatever is left after both passes becomes the training set.
    for split_name, rate in (("test", test_rate),
                             ("validation", validation_rate)):
        if rate > 0:
            X, Y, X_held, Y_held = XY_split(X, Y, rate)
            setattr(data_sets, split_name,
                    DataSet_CSV(X_held, Y_held, disp=disp))

    # With both rates at zero, all data ends up in the train dataset.
    data_sets.train = DataSet_CSV(X, Y, disp=disp)
    data_sets.IMAGE_PIXELS = n_features

    return data_sets
def grid_BIKE2(pdr, alphas_log, y_id='Solubility_log_mol_l'):
    """Grid-search a Ridge model on a similarity matrix plus molecular weight.

    A similarity matrix is computed with ``jpyx.calc_tm_sim_M`` from the
    fingerprint matrix returned by ``jpd.pd_get_xM(pdr, radius=6, nBits=4096)``.
    Molecular weights computed from ``pdr.SMILES`` are appended to it with
    ``jchem.add_new_descriptor``. The result is handed to ``jgrid.gs_Ridge``
    for the alpha search and then to ``jgrid.cv`` with the best alpha found.

    Args:
        pdr: Table of molecules; must expose a ``SMILES`` column and the
            target column named by ``y_id``.
        alphas_log: Forwarded to ``jgrid.gs_Ridge`` as ``alphas_log``.
        y_id: Name of the target column passed to ``jpd.pd_get_yV``.

    Returns:
        The search object returned by ``jgrid.gs_Ridge``.
    """
    # Single-argument print() calls behave the same on Python 2 and Python 3;
    # the former print statements were a SyntaxError on Python 3.
    print("BIKE with (A+B)+W")

    xM1 = jpd.pd_get_xM(pdr, radius=6, nBits=4096)
    xM2 = jpd.pd_get_xM_MACCSkeys(pdr)
    yV = jpd.pd_get_yV(pdr, y_id=y_id)

    # NOTE(review): the banner says "(A+B)", and xM joins both fingerprint
    # sets, but the similarity matrix below is built from xM1 only, so xM
    # (and therefore xM2) is never used. Behaviour is kept as-is; confirm
    # whether calc_tm_sim_M(xM) was intended.
    xM = np.concatenate((xM1, xM2), axis=1)
    A = jpyx.calc_tm_sim_M(xM1)
    print(A.shape)

    molw_l = jchem.rdkit_molwt(pdr.SMILES.tolist())
    print(np.shape(molw_l))
    A_molw = jchem.add_new_descriptor(A, molw_l)
    print(A_molw.shape)

    gs = jgrid.gs_Ridge(A_molw, yV, alphas_log=alphas_log)

    # NOTE(review): if gs is a scikit-learn GridSearchCV, grid_scores_ only
    # exists on scikit-learn < 0.20 (replaced by cv_results_) -- confirm.
    jutil.show_gs_alpha(gs.grid_scores_)
    jgrid.cv('Ridge', A_molw, yV, alpha=gs.best_params_['alpha'])

    return gs
def read_data_sets_mol(fname, validation_rate=0, test_rate=0, disp=False):
    """Build train/validation/test data sets from a molecule CSV file.

    Reads ``fname`` with pandas, concatenates four descriptor matrices
    obtained from the ``jpd`` helpers along axis 1, and turns the ``'exp'``
    target into binary class labels (1 when the value is greater than zero,
    otherwise 0).

    Args:
        fname: Input CSV location, as accepted by ``pd.read_csv``.
        validation_rate: Passed to ``XY_split`` for the validation split,
            which only happens when the value is greater than zero.
            Presumably a fraction of the samples -- confirm against
            ``XY_split``.
        test_rate: Passed to ``XY_split`` for the test split, which is done
            before the validation split and only when greater than zero.
        disp: Passed through to each ``DataSet_CSV`` that is created.

    Returns:
        A ``DataSets`` object whose ``train`` attribute holds whatever was
        not split off, whose ``IMAGE_PIXELS`` attribute is the feature count,
        and which carries ``test`` / ``validation`` attributes only for
        splits that were actually made.
    """
    # NOTE(review): read_data_sets_mol is defined more than once in this
    # file; Python keeps only the definition that appears last.

    class DataSets(object):
        pass

    table = pd.read_csv(fname)

    # Same helper calls and column order as before, gathered in one list.
    feature_parts = [
        jpd.pd_get_xM(table),
        jpd.pd_get_xM_MACCSkeys(table),
        jpd.pd_get_xM_molw(table),
        jpd.pd_get_xM_lasa(table),
    ]
    xM = np.concatenate(feature_parts, axis=1)
    feature_count = xM.shape[1]

    # classification is performed: keep only the sign of the target
    targets = jpd.pd_get_yV(table, y_id='exp').A1
    class_ids = [1 if target > 0 else 0 for target in targets]

    X = np.array(xM)
    Y = np.array(class_ids)
    assert X.shape[0] == Y.shape[0]

    data_sets = DataSets()

    # Held-out parts are removed in order: test first, then validation.
    held_out_plan = (("test", test_rate), ("validation", validation_rate))
    for attr_name, rate in held_out_plan:
        if rate > 0:
            X, Y, X_part, Y_part = XY_split(X, Y, rate)
            setattr(data_sets, attr_name,
                    DataSet_CSV(X_part, Y_part, disp=disp))

    # If both rates are zero, every sample lands in the train dataset.
    data_sets.train = DataSet_CSV(X, Y, disp=disp)
    data_sets.IMAGE_PIXELS = feature_count

    return data_sets
def grid_MLR_B(pdr, alphas_log, y_id='Solubility_log_mol_l'):
    """Grid-search a Ridge model on the MACCS-keys matrix of ``pdr``.

    The feature matrix comes from ``jpd.pd_get_xM_MACCSkeys`` and the target
    from ``jpd.pd_get_yV``. ``jgrid.gs_Ridge`` runs the alpha search, the
    scores are shown with ``jutil.show_gs_alpha``, and ``jgrid.cv`` is then
    run with the best alpha found.

    Args:
        pdr: Table of molecules passed to the ``jpd`` helpers; must contain
            the target column named by ``y_id``.
        alphas_log: Forwarded to ``jgrid.gs_Ridge`` as ``alphas_log``.
        y_id: Name of the target column passed to ``jpd.pd_get_yV``.

    Returns:
        The search object returned by ``jgrid.gs_Ridge``.
    """
    # Single-argument print() behaves the same on Python 2 and Python 3;
    # the former print statement was a SyntaxError on Python 3.
    print("MLR with B")

    # Only the MACCS keys are used as features here. The old local alias
    # "xM_molw" was misleading because no molecular weight is ever added.
    xM_keys = jpd.pd_get_xM_MACCSkeys(pdr)
    yV = jpd.pd_get_yV(pdr, y_id=y_id)

    gs = jgrid.gs_Ridge(xM_keys, yV, alphas_log=alphas_log)

    # NOTE(review): if gs is a scikit-learn GridSearchCV, grid_scores_ only
    # exists on scikit-learn < 0.20 (replaced by cv_results_) -- confirm.
    jutil.show_gs_alpha(gs.grid_scores_)
    jgrid.cv('Ridge', xM_keys, yV, alpha=gs.best_params_['alpha'])

    return gs