def make_Y_obs(kinds, target_dilution=None, imputer=None):
    """Build the observed perceptual-data matrices for the requested data sets.

    Args:
        kinds: A single kind name or an iterable of kind names; each must be
            one of 'training', 'leaderboard' or 'testset'.  Rows of the
            result are always stacked in that canonical order, regardless
            of the order given in `kinds`.
        target_dilution: Forwarded to `get_perceptual_matrices` and
            `get_perceptual_vectors`.  The special value 'gold' is handled
            here: everything is built at 'high' dilution and then column 0
            of each per-subject matrix and columns 0 and 21 of 'mean_std'
            are overwritten with the values obtained at dilution -3.
        imputer: `None` or 'median' creates a fresh median `Imputer`; any
            other value is passed through to `get_perceptual_vectors` as-is.

    Returns:
        A tuple `(Y, imputer)` where `Y` is a dict with
        'mean_std' (the `np.vstack` of each kind's 'mean_std' matrix) and
        'subject' (a dict keyed by subject number 1..49 holding the
        `np.ma.vstack` of each kind's matrix for that subject), and
        `imputer` is the imputer that was actually used.

    Raises:
        AssertionError: If any entry of `kinds` is not a known kind.  The
            check runs before any data is loaded.
    """
    if target_dilution == 'gold':
        # For actual testing, use 1/1000 dilution for intensity and
        # high dilution for everything else.
        Y, imputer = make_Y_obs(kinds, target_dilution='high',
                                imputer=imputer)
        intensity, imputer = make_Y_obs(kinds, target_dilution=-3,
                                        imputer=imputer)
        # NOTE(review): columns 0 and 21 are presumably the intensity mean
        # and intensity std (21 descriptors per half) -- confirm.
        Y['mean_std'][:, 0] = intensity['mean_std'][:, 0]
        Y['mean_std'][:, 21] = intensity['mean_std'][:, 21]
        for i in range(1, 50):
            Y['subject'][i][:, 0] = intensity['subject'][i][:, 0]
        return Y, imputer

    valid_kinds = ['training', 'leaderboard', 'testset']

    # isinstance (rather than an exact type check) so that str subclasses
    # are treated as one kind name instead of being iterated per character.
    if isinstance(kinds, str):
        kinds = [kinds]

    # Validate everything up front so a bad kind fails before any expensive
    # loading happens.  An explicit raise keeps the historical exception
    # type while still being enforced under `python -O`.
    for kind in kinds:
        if kind not in valid_kinds:
            raise AssertionError("No such kind %s" % kind)

    if imputer in [None, 'median']:
        imputer = Imputer(missing_values=np.nan, strategy='median', axis=0)

    Y = {}
    for kind in kinds:
        if kind == 'leaderboard':
            loading.format_leaderboard_perceptual_data()
        _, perceptual_data = loading.load_perceptual_data(kind)
        print("Getting basic perceptual data...")
        matrices = get_perceptual_matrices(perceptual_data,
                                           target_dilution=target_dilution)
        print("Flattening into vectors...")
        v_mean = get_perceptual_vectors(matrices, imputer=imputer,
                                        statistic='mean',
                                        target_dilution=target_dilution)
        v_std = get_perceptual_vectors(matrices, imputer=imputer,
                                       statistic='std',
                                       target_dilution=target_dilution)
        v_subject = get_perceptual_vectors(matrices, imputer=imputer,
                                           statistic=None,
                                           target_dilution=target_dilution)
        print("Assembling into matrices...")
        Y[kind] = build_Y_obs(v_mean, v_std, v_subject)

    print("Combining Y matrices...")
    # Stack in canonical order, keeping only the kinds that were requested.
    ordered_kinds = [kind for kind in valid_kinds if kind in kinds]
    Y_ = {'subject': {}}
    Y_['mean_std'] = np.vstack([Y[kind]['mean_std']
                                for kind in ordered_kinds])
    for subject in range(1, 50):
        Y_['subject'][subject] = np.ma.vstack([Y[kind]['subject'][subject]
                                               for kind in ordered_kinds])
    print("The Y['mean_std'] matrix now has shape (%dx%d) "
          % Y_['mean_std'].shape
          + "molecules by 2 x perceptual descriptors")
    print("The Y['subject'] dict now has %d matrices of shape (%dx%d) "
          % (len(Y_['subject']),
             Y_['subject'][1].shape[0],
             Y_['subject'][1].shape[1])
          + "molecules by perceptual descriptors, one for each subject")
    return Y_, imputer
def make_Y_obs(kinds, target_dilution=None, imputer=None, quiet=False):
    """Build the observed perceptual-data matrices for the requested data sets.

    Args:
        kinds: A single kind name or an iterable of kind names; each must be
            a member of the module-level `KINDS`.  Rows of the result are
            stacked in `KINDS` order, regardless of the order in `kinds`.
        target_dilution: Forwarded to `get_perceptual_matrices` and
            `get_perceptual_vectors`.  The special value 'gold' is handled
            here: everything is built at 'high' dilution and then column 0
            of each per-subject matrix and columns 0 and 21 of 'mean_std'
            are overwritten with the values obtained at dilution -3.
        imputer: `None` or 'median' creates a fresh median `Imputer`; any
            other value is passed through to `get_perceptual_vectors` as-is.
        quiet: If true, the two summary lines describing the resulting
            shapes are not printed.

    Returns:
        A tuple `(Y, imputer)` where `Y` is a dict with
        'mean_std' (the `np.vstack` of each kind's 'mean_std' matrix) and
        'subject' (a dict keyed by subject number 1..49 holding the
        `np.ma.vstack` of each kind's matrix for that subject), and
        `imputer` is the imputer that was actually used.

    Raises:
        AssertionError: If any entry of `kinds` is not in `KINDS`.  The
            check runs before any data is loaded.
    """
    if target_dilution == 'gold':
        # For actual testing, use 1/1000 dilution for intensity and
        # high dilution for everything else.
        # The first pass is always silent so the summary appears at most
        # once; the second pass honours the caller's `quiet` (previously it
        # always printed, even when the caller asked for quiet=True).
        Y, imputer = make_Y_obs(kinds, target_dilution='high',
                                imputer=imputer, quiet=True)
        intensity, imputer = make_Y_obs(kinds, target_dilution=-3,
                                        imputer=imputer, quiet=quiet)
        # NOTE(review): columns 0 and 21 are presumably the intensity mean
        # and intensity std (21 descriptors per half) -- confirm.
        Y['mean_std'][:, 0] = intensity['mean_std'][:, 0]
        Y['mean_std'][:, 21] = intensity['mean_std'][:, 21]
        for i in range(1, 50):
            Y['subject'][i][:, 0] = intensity['subject'][i][:, 0]
        return Y, imputer

    # isinstance (rather than an exact type check) so that str subclasses
    # are treated as one kind name instead of being iterated per character.
    if isinstance(kinds, str):
        kinds = [kinds]

    # Validate everything up front so a bad kind fails before any expensive
    # loading happens.  An explicit raise keeps the historical exception
    # type while still being enforced under `python -O`.
    for kind in kinds:
        if kind not in KINDS:
            raise AssertionError("No such kind %s" % kind)

    if imputer in [None, 'median']:
        imputer = Imputer(missing_values=np.nan, strategy='median', axis=0)

    Y = {}
    for kind in kinds:
        # These two kinds get a formatting pass before they are loaded.
        if kind == 'leaderboard':
            loading.format_leaderboard_perceptual_data()
        if kind == 'testset':
            loading.format_testset_perceptual_data()
        _, perceptual_data = loading.load_perceptual_data(kind)
        # Step 1: basic perceptual data.
        matrices = get_perceptual_matrices(perceptual_data,
                                           target_dilution=target_dilution)
        # Step 2: flatten into vectors (mean, std, and per-subject).
        v_mean = get_perceptual_vectors(matrices, imputer=imputer,
                                        statistic='mean',
                                        target_dilution=target_dilution)
        v_std = get_perceptual_vectors(matrices, imputer=imputer,
                                       statistic='std',
                                       target_dilution=target_dilution)
        v_subject = get_perceptual_vectors(matrices, imputer=imputer,
                                           statistic=None,
                                           target_dilution=target_dilution)
        # Step 3: assemble into matrices.
        Y[kind] = build_Y_obs(v_mean, v_std, v_subject)

    # Step 4: combine, stacking in KINDS order and keeping only the kinds
    # that were requested.
    ordered_kinds = [kind for kind in KINDS if kind in kinds]
    Y_ = {'subject': {}}
    Y_['mean_std'] = np.vstack([Y[kind]['mean_std']
                                for kind in ordered_kinds])
    for subject in range(1, 50):
        Y_['subject'][subject] = np.ma.vstack([Y[kind]['subject'][subject]
                                               for kind in ordered_kinds])
    if not quiet:
        print("The Y['mean_std'] matrix now has shape (%dx%d) "
              % Y_['mean_std'].shape
              + "molecules by 2 x perceptual descriptors")
        print("The Y['subject'] dict now has %d matrices of shape (%dx%d) "
              % (len(Y_['subject']),
                 Y_['subject'][1].shape[0],
                 Y_['subject'][1].shape[1])
              + "molecules by perceptual descriptors, one for each subject")
    return Y_, imputer
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
# NOTE(review): `sklearn.cross_validation` and `sklearn.grid_search` were
# deprecated in scikit-learn 0.18 and removed in 0.20 (superseded by
# `sklearn.model_selection`, whose ShuffleSplit has a different signature).
# This script therefore needs an older scikit-learn -- confirm the pin
# before migrating the imports.
from sklearn.cross_validation import ShuffleSplit, cross_val_score
from sklearn.grid_search import GridSearchCV

# Resolve the directory three levels above this file and put it on
# sys.path -- presumably so the `dream` and `opc_python` imports below
# resolve; they must stay after the sys.path.append call.
# `os` and `sys` are not imported in this chunk; presumably imported above.
gerkin_path = os.path.dirname(os.path.abspath(__file__))
opc_python_path = os.path.dirname(gerkin_path)
root_path = os.path.dirname(opc_python_path)
sys.path.append(root_path)

import dream
from opc_python.utils import loading
from opc_python.utils import scoring

# Load the perceptual descriptors data.
perceptual_headers, perceptual_obs_data = loading.load_perceptual_data(
    'training')
# Runs the leaderboard formatting step for its side effect; the return
# value (if any) is discarded.
loading.format_leaderboard_perceptual_data()
# Show the perceptual metadata types and perceptual descriptor names.
print(perceptual_headers)
# Show the metadata and perceptual descriptor values for the first compound.
print(perceptual_obs_data[1])
# Everything from header index 6 onward is counted as a descriptor
# (presumably the first six headers are metadata columns -- confirm).
num_descriptors = len(perceptual_headers[6:])
# Sanity check: the loaded header count must agree with the project constant.
assert num_descriptors == dream.NUM_DESCRIPTORS
num_subjects = dream.NUM_SUBJECTS
print(
    'There are %d different perceptual descriptors and %d different subjects' %
    (num_descriptors, num_subjects))
# Load the molecular descriptors data.
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
# NOTE(review): `sklearn.cross_validation` and `sklearn.grid_search` were
# deprecated in scikit-learn 0.18 and removed in 0.20 (superseded by
# `sklearn.model_selection`, whose ShuffleSplit has a different signature).
# This script therefore needs an older scikit-learn -- confirm the pin
# before migrating the imports.
from sklearn.cross_validation import ShuffleSplit,cross_val_score
from sklearn.grid_search import GridSearchCV

# Resolve the directory three levels above this file and put it on
# sys.path -- presumably so the `dream` and `opc_python` imports below
# resolve; they must stay after the sys.path.append call.
# `os` and `sys` are not imported in this chunk; presumably imported above.
gerkin_path = os.path.dirname(os.path.abspath(__file__))
opc_python_path = os.path.dirname(gerkin_path)
root_path = os.path.dirname(opc_python_path)
sys.path.append(root_path)

import dream
from opc_python.utils import loading
from opc_python.utils import scoring

# Load the perceptual descriptors data.
perceptual_headers, perceptual_obs_data = loading.load_perceptual_data('training')
# Runs the leaderboard formatting step for its side effect; the return
# value (if any) is discarded.
loading.format_leaderboard_perceptual_data()
# Show the perceptual metadata types and perceptual descriptor names.
print(perceptual_headers)
# Show the metadata and perceptual descriptor values for the first compound.
print(perceptual_obs_data[1])
# Everything from header index 6 onward is counted as a descriptor
# (presumably the first six headers are metadata columns -- confirm).
num_descriptors = len(perceptual_headers[6:])
# Sanity check: the loaded header count must agree with the project constant.
assert num_descriptors == dream.NUM_DESCRIPTORS
num_subjects = dream.NUM_SUBJECTS
print('There are %d different perceptual descriptors and %d different subjects' % (num_descriptors,num_subjects))

# Load the molecular descriptors data.
molecular_headers, molecular_data = loading.load_molecular_data()
# Only the first ten header names are shown.
print("First ten molecular descriptor types are %s" % molecular_headers[:10])