def merge_and_split(select_idx, new_set, train_set, test_set, seed=None,
                    frac_train=0.8):
    """Fold selected samples from ``new_set`` into train+test and re-split.

    The current train and test sets are converted to dataframes, the rows of
    ``new_set`` picked out by ``select_idx`` are appended, and the combined
    data is randomly split into a fresh train/test pair.

    Args:
        select_idx: Indexer applied to the ``X``, ``y``, ``w`` and ``ids``
            columns of the dataframe built from ``new_set`` (presumably the
            indices of the "uncertain" molecules -- confirm with the caller).
        new_set: Dataset holding the candidate molecules for this loop.
        train_set: Current training dataset.
        test_set: Current test dataset.
        seed: Forwarded to ``RandomSplitter.train_test_split`` for a
            reproducible split.
        frac_train: Fraction of the merged data placed in the new training
            set. Defaults to 0.8, the value that was previously hard-coded.

    Returns:
        Tuple ``(new_tot_train, new_tot_test)`` as returned by
        ``splitter.train_test_split``.

    Note:
        ``to_dataframe`` and ``from_dataframe`` are helpers defined outside
        this block; they are assumed to convert between datasets and pandas
        dataframes with ``X``/``y``/``w``/``ids`` columns.
    """
    # Get dataframes of the existing train and test sets first.
    train_df = to_dataframe(train_set)
    test_df = to_dataframe(test_set)

    # Then the dataframe of all molecules from the new set in this loop.
    new_data_df = to_dataframe(new_set)

    # Filter out the selected (uncertain) molecules column by column.
    x_uncertain = new_data_df.X[select_idx]
    y_uncertain = new_data_df.y[select_idx]
    w_uncertain = new_data_df.w[select_idx]
    id_uncertain = new_data_df.ids[select_idx]

    # Re-assemble those columns side by side into one dataframe.
    uncertain_df = pd.concat(
        [x_uncertain, y_uncertain, w_uncertain, id_uncertain], axis=1)

    # Stack train, test and the selected rows into a single dataframe.
    total_df = pd.concat([train_df, test_df, uncertain_df], axis=0)
    # The stacked frames carry their own row labels; start again from 0 so
    # the combined index is consistent.
    total_df = total_df.reset_index(drop=True)

    total_set = from_dataframe(total_df)

    # Rebuild as a DiskDataset so the result matches a normally loaded disk
    # dataset. The transpose()[0] keeps the dataset shape consistent
    # (assumes X comes back with a leading singleton feature axis -- TODO
    # confirm against from_dataframe).
    final_disk_data = DiskDataset.from_numpy(
        X=total_set.X.transpose()[0],
        y=total_set.y,
        w=total_set.w,
        ids=total_set.ids)

    # Finally, randomly split the merged set (80/20 by default).
    splitter = dc.splits.RandomSplitter()
    new_tot_train, new_tot_test = splitter.train_test_split(
        final_disk_data, frac_train=frac_train, seed=seed)
    return new_tot_train, new_tot_test
# Fit models
# ROC-AUC metric in classification mode; np.mean is handed to Metric as its
# second argument (presumably the across-task reducer -- confirm against the
# DeepChem version in use).
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean,
                           mode="classification")

# 5-fold cross-validation, shuffled with a fixed seed for reproducibility.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

all_train_scores = []
all_test_scores = []

# These constants do not change between folds, so they are set once here
# rather than on every iteration.
# Number of features on conv-mols
n_feat = 75
# Batch size of models
batch_size = 50

start = timeit.default_timer()
for train_index, test_index in kf.split(X):
    # Materialise this fold's train/test partitions as DiskDatasets.
    train_dataset = DiskDataset.from_numpy(X[train_index],
                                           y[train_index, :],
                                           w[train_index, :],
                                           verbose=False)
    test_dataset = DiskDataset.from_numpy(X[test_index],
                                          y[test_index, :],
                                          w[test_index, :],
                                          verbose=False)

    # NOTE(review): the model definition below was disabled inside a
    # triple-quoted string that is not closed within this chunk. It is kept
    # here as comments so the disabled code cannot swallow what follows.
    # graph_model = dc.nn.SequentialGraph(n_feat)
    # graph_model.add(dc.nn.GraphConv(64, n_feat, activation='relu'))
    # graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    # graph_model.add(dc.nn.GraphPool())
# Unpack the pre-loaded Tox21 splits; the arrays used for cross-validation
# below are taken from the training split only.
train_dataset, valid_dataset, test_dataset = tox21_datasets
X, y, w = train_dataset.X, train_dataset.y, train_dataset.w

# 5-fold cross-validation, shuffled with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=123)
all_train_scores = []
all_test_scores = []
start = timeit.default_timer()

for train_index, test_index in kf.split(X):
    # Build this fold's datasets. Note that train_dataset / test_dataset are
    # rebound here, replacing the splits unpacked above.
    train_dataset = DiskDataset.from_numpy(
        X[train_index, :],
        y[train_index, :],
        w[train_index, :],
        verbose=False,
    )
    test_dataset = DiskDataset.from_numpy(
        X[test_index, :],
        y[test_index, :],
        w[test_index, :],
        verbose=False,
    )

    # Fit models
    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score, np.mean, mode='classification')

    # Positional arguments of the classifier: the task count and the first
    # entry of the fold's data shape.
    n_tasks = len(tox21_tasks)
    n_features = train_dataset.get_data_shape()[0]
    model = dc.models.MultiTaskClassifier(
        n_tasks,
        n_features,
        layer_sizes=[1500],
        bias_init_consts=[1.],
        dropouts=[0.5],
        penalty=0.1,
        penalty_type='l2',
        learning_rate=0.001,
        weight_init_stddevs=[0.02],
        batch_size=50,
        verbosity="high",
    )

    # Train on this fold's training partition for 10 epochs.
    model.fit(train_dataset, nb_epoch=10)