def demo(): label_name_list = 'Keck_Pria_AS_Retest' print 'label_name_list ', label_name_list # specify dataset k = 5 directory = '../../dataset/sample_fold/' file_list = [] for i in range(k): file_list.append('{}.csv'.format(i)) output_file_list = [directory + f_ for f_ in file_list] print output_file_list[0:4] train_pd = read_merged_data(output_file_list[0:4]) print output_file_list[4] test_pd = read_merged_data([output_file_list[4]]) # extract data, and split training data into training and val X_train, y_train = extract_feature_and_label(train_pd, feature_name='Fingerprints', label_name_list=label_name_list) X_test, y_test = extract_feature_and_label(test_pd, feature_name='Fingerprints', label_name_list=label_name_list) cross_validation_split = StratifiedShuffleSplit(y_train, 1, test_size=0.2, random_state=1) for t_index, val_index in cross_validation_split: X_t, X_val = X_train[t_index], X_train[val_index] y_t, y_val = y_train[t_index], y_train[val_index] print 'done data preparation' task = RF_Tester() task.get_rf(X_train, y_train, X_val, y_val, X_test, y_test) return
def demo_single_classification(): with open(config_json_file, 'r') as f: conf = json.load(f) label_name_list = conf['label_name_list'] print 'label_name_list ', label_name_list # specify dataset k = 5 directory = '../../dataset/fixed_dataset/fold_{}/'.format(k) file_list = [] for i in range(k): file_list.append('file_{}.csv'.format(i)) # merge training and test dataset dtype_list = { 'Molecule': np.str, 'SMILES': np.str, 'Fingerprints': np.str, 'Keck_Pria_AS_Retest': np.int64, 'Keck_Pria_FP_data': np.int64, 'Keck_Pria_Continuous': np.float64, 'Keck_RMI_cdd': np.float64 } output_file_list = [directory + f_ for f_ in file_list] print output_file_list[0:4] train_pd = read_merged_data(output_file_list[0:4]) print output_file_list[4] test_pd = read_merged_data([output_file_list[4]]) # extract data, and split training data into training and val X_train, y_train = extract_feature_and_label( train_pd, feature_name='Fingerprints', label_name_list=label_name_list) X_test, y_test = extract_feature_and_label(test_pd, feature_name='Fingerprints', label_name_list=label_name_list) cross_validation_split = StratifiedShuffleSplit(y_train, 1, test_size=0.2, random_state=1) for t_index, val_index in cross_validation_split: X_t, X_val = X_train[t_index], X_train[val_index] y_t, y_val = y_train[t_index], y_train[val_index] print 'done data preparation' print conf['label_name_list'] task = SingleClassification(conf=conf) task.train_and_predict(X_train, y_train, X_val, y_val, X_test, y_test, PMTNN_weight_file) store_data(transform_json_to_csv(config_json_file), config_csv_file) return
# NOTE(review): orphaned function-body fragment -- no `def` header for these statements is present at this point in the file.
# It reads `directory`, `file_list` and `label_name_list`, none of which are assigned within the fragment itself.
# Visible behaviour: merges files 0-3 as training data and file 4 as test data, takes label column 0 as the
# classification target and column 1 as the regression target, then makes one split (test_size=0.2, random_state=1)
# on the classification labels. TODO: confirm which function this body belongs to and restore its header.
dtype_list = {'Molecule': np.str, 'SMILES': np.str, 'Fingerprints': np.str, 'Keck_Pria_AS_Retest': np.int64, 'Keck_Pria_FP_data': np.int64, 'Keck_Pria_Continuous': np.float64, 'Keck_RMI_cdd': np.float64} output_file_list = [directory + f_ for f_ in file_list] print output_file_list[0:4] train_pd = read_merged_data(output_file_list[0:4]) print output_file_list[4] test_pd = read_merged_data([output_file_list[4]]) # extract data, and split training data into training and val X_train, y_train = extract_feature_and_label(train_pd, feature_name='Fingerprints', label_name_list=label_name_list) X_test, y_test = extract_feature_and_label(test_pd, feature_name='Fingerprints', label_name_list=label_name_list) y_train_classification = reshape_data_into_2_dim(y_train[:, 0]) y_train_regression = reshape_data_into_2_dim(y_train[:, 1]) y_test_classification = reshape_data_into_2_dim(y_test[:, 0]) y_test_regression = reshape_data_into_2_dim(y_test[:, 1]) cross_validation_split = StratifiedShuffleSplit(y_train_classification, 1, test_size=0.2, random_state=1) for t_index, val_index in cross_validation_split: X_t, X_val = X_train[t_index], X_train[val_index] y_t_classification, y_val_classification = y_train_classification[t_index], y_train_classification[val_index] y_t_regression, y_val_regression = y_train_regression[t_index], y_train_regression[val_index]
def demo_multi_classification(): with open(config_json_file, 'r') as f: conf = json.load(f) label_name_list = conf['label_name_list'] print 'label_name_list ', label_name_list # specify dataset k = 5 directory = '../../dataset/keck_pcba/fold_{}/'.format(k) file_list = [] for i in range(k): file_list.append('file_{}.csv'.format(i)) output_file_list = [directory + f_ for f_ in file_list] train_pd = read_merged_data(output_file_list[0:3]) train_pd.fillna(0, inplace=True) val_pd = read_merged_data(output_file_list[3:4]) val_pd.fillna(0, inplace=True) test_pd = read_merged_data(output_file_list[4:5]) test_pd.fillna(0, inplace=True) multi_name_list = train_pd.columns[-128:].tolist() multi_name_list.extend(label_name_list) print 'multi_name_list ', multi_name_list X_train, y_train = extract_feature_and_label( train_pd, feature_name='Fingerprints', label_name_list=multi_name_list) X_val, y_val = extract_feature_and_label(val_pd, feature_name='Fingerprints', label_name_list=multi_name_list) X_test, y_test = extract_feature_and_label(test_pd, feature_name='Fingerprints', label_name_list=multi_name_list) sample_weight_dir = '../../dataset/sample_weights/keck_pcba/fold_5/' file_list = [] for i in range(k): file_list.append('sample_weight_{}.csv'.format(i)) sample_weight_file = [sample_weight_dir + f_ for f_ in file_list] sample_weight_pd = read_merged_data(sample_weight_file[0:3]) _, sample_weight = extract_feature_and_label(sample_weight_pd, feature_name='Fingerprints', label_name_list=labels_list) print 'done data preparation' task = MultiClassification(conf=conf) task.train_and_predict(X_train, y_train, X_val, y_val, X_test, y_test, sample_weight=sample_weight, PMTNN_weight_file=PMTNN_weight_file, score_file=score_file) store_data(transform_json_to_csv(config_json_file), config_csv_file) whole_EF = [] for EF_ratio in task.EF_ratio_list: EF_list = task.get_EF_score_with_existing_model( X_test, y_test, PMTNN_weight_file, EF_ratio) whole_EF.append([EF_ratio]) 
whole_EF.append(EF_list) print(EF_ratio, EF_list) print print whole_EF return