import numpy as np
from tqdm import tqdm
from sklearn.model_selection import PredefinedSplit


def neighbors(train, test, target, cv: PredefinedSplit, k=5, n_trees=10):
    # Out-of-fold nearest-neighbor features: one column per class (0 and 1).
    res_train = np.zeros((train.shape[0], 2))
    res_test = np.zeros((test.shape[0], 2))
    for i, (trn_idx, val_idx) in tqdm(enumerate(cv.split(train)),
                                      total=cv.get_n_splits()):
        target_trn = target.iloc[trn_idx]
        X_trn = train.iloc[trn_idx]
        X_val = train.iloc[val_idx]
        # Split the training fold into negative and positive examples.
        n = X_trn[target_trn == 0]
        p = X_trn[target_trn == 1]
        for j, X in enumerate([n, p]):
            u = build(X, n_trees)
            res_train[val_idx, j] = get_feat(X_val, u, k=k)
            res_test[:, j] += get_feat(test, u, k)
    # Average the test-set features over the folds.
    res_test /= cv.get_n_splits()
    return res_train, res_test
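# `build` and `get_feat` are not defined in this snippet. A minimal sketch of
# plausible implementations, assuming Annoy is the backing index (the `n_trees`
# parameter matches annoy.AnnoyIndex.build); using the mean distance to the k
# nearest neighbors as the feature is one common choice, not necessarily the
# original one.
from annoy import AnnoyIndex


def build(X, n_trees, metric='euclidean'):
    # Build an Annoy index over the rows of X.
    index = AnnoyIndex(X.shape[1], metric)
    for i, row in enumerate(np.asarray(X)):
        index.add_item(i, row)
    index.build(n_trees)
    return index


def get_feat(X, index, k=5):
    # Feature: mean distance from each row of X to its k nearest neighbors.
    feats = np.empty(X.shape[0])
    for i, row in enumerate(np.asarray(X)):
        _, dists = index.get_nns_by_vector(row, k, include_distances=True)
        feats[i] = np.mean(dists)
    return feats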
from category_encoders import TargetEncoder


def target_encoding(X_train, y_train, X_test, cols, cv_id):
    cols = list(cols)
    train_new = X_train.copy()
    test_new = X_test.copy()
    test_new[:] = 0
    cv = PredefinedSplit(cv_id)
    X_train.index = X_train.index.astype(int)
    for trn_idx, val_idx in tqdm(cv.split(X_train), total=cv.get_n_splits()):
        # Fit the encoder on the training fold only, then encode the
        # validation fold (out-of-fold) and accumulate the test encoding.
        enc = TargetEncoder(cols=cols)
        enc.fit(X_train.iloc[trn_idx], y_train[trn_idx])
        train_new.iloc[val_idx] = enc.transform(X_train.iloc[val_idx])
        test_new += enc.transform(X_test)
    # Average the test encodings over the folds.
    test_new /= cv.get_n_splits()
    train_new = train_new[cols]
    test_new = test_new[cols]
    train_new.columns = train_new.columns + '_target'
    test_new.columns = test_new.columns + '_target'
    print(list(train_new.columns))
    return train_new, test_new
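# A minimal usage sketch with assumed toy data (not from the original source):
# `cv_id` assigns each training row to a fold, exactly as PredefinedSplit
# expects, so the encoding of every row is computed out-of-fold.
import pandas as pd

X_tr = pd.DataFrame({'city': ['a', 'b', 'a', 'c'], 'x': [1, 2, 3, 4]})
y_tr = np.array([0, 1, 1, 0])
X_te = pd.DataFrame({'city': ['a', 'c'], 'x': [5, 6]})
cv_id = np.array([0, 1, 0, 1])  # two predefined folds

tr_enc, te_enc = target_encoding(X_tr, y_tr, X_te, cols=['city'], cv_id=cv_id)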
def test_predefined_split():
    cv = PredefinedSplit(np.array(list(range(4)) * 5))
    cv2 = PredefinedSplit(np.array(list(range(5)) * 4))
    assert tokenize(cv) == tokenize(cv)
    assert tokenize(cv) != tokenize(cv2)

    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol
def main(argv):
    start_time = datetime.now()
    logger.info("START")
    args = argparser.parse_args()
    inFile = args.inFile
    testFile = args.testFile
    nameModel = args.nameModel
    conf_file = args.mod
    mod = __import__(conf_file, fromlist=['*'])
    model_conf = mod.gridSearch_Model_types[nameModel]
    conf = getattr(__import__(conf_file, fromlist=[model_conf]), model_conf)
    prefix_dict = conf['prefix_dict']
    out_dict = h.outfileName(fo=args.outFile, fi=inFile,
                             prefix_dict=prefix_dict, add_date=True)
    logger.info("RUNNING WITH MOD: %s, INFILE: %s" % (conf_file, inFile))
    logger.info("LOADING THE DATA SET")
    param_grid = PARAM_DICT[nameModel]
    # scoring = {'Accuracy': make_scorer(accuracy_score), 'RMS': make_scorer(mean_squared_error)}
    scoring = {'RMS': make_scorer(r2_score)}
    X, Y, len_train, numFeatures = readFile(inFile)
    cv = None
    if testFile:
        logger.info("USING TEST FILE %s AS TEST SET FOR THE CROSS VALIDATION" % testFile)
        # Read the test file (the original read inFile again here).
        X_test, Y_test, len_train_test, numFeatures_test = readFile(testFile)
        X = pd.concat([X, X_test], ignore_index=True)
        Y = pd.concat([Y, Y_test], ignore_index=True)
        # -1 marks samples that are never used as a test fold, so the single
        # split evaluates on the appended test file only.
        cv_arr = [-1] * len_train
        cv_arr.extend([0] * len_train_test)
        cv = PredefinedSplit(test_fold=cv_arr)
        print("cv:", cv)
        print("number of folds:", cv.get_n_splits())
        for train_index, test_index in cv.split():
            print("TRAIN:", train_index, "TEST:", test_index)
        logger.info("SHAPE OF X:%s AND Y:%s AFTER APPEND", X.shape, Y.shape)
    logger.info("CREATION OF THE MODEL")
    t = TestClass(conf=conf, nm=nameModel, nf=numFeatures)
    if nameModel == 'NN':
        model = KerasClassifier(build_fn=t.createModelNN)
        X = X.values  # .as_matrix() was removed from pandas
        Y = Y.values
    else:
        model = t.selectModel()
    logger.info("START GRID SEARCH")
    grid_result = gridSearch(model, param_grid, cv, X, Y, scoring)
    logger.info("END OF GRID SEARCH")
    logger.info("PRINTING RESULTS")
    gridResults(grid_result, X, nameModel)
    SaveModel(nameModel, grid_result)
    logger.info("EXECUTED IN %f SEC" % (datetime.now() - start_time).total_seconds())
    logger.info("END")
def aggregate_fold_stats(db_paths, cv_pkl_file):
    preprocessed_db = imglmdb.multidbwrapper(sorted(db_paths))
    with open(cv_pkl_file, "rb") as pkl:
        test_fold, nested_test_folds = pickle.load(pkl)
    splitter = PredefinedSplit(test_fold)
    # Note: [{}] * n would create n references to the *same* dict; use a
    # comprehension so every fold gets its own dict.
    data = [{} for _ in range(splitter.get_n_splits())]
    for i, (nested_test_fold, (_, test_idx)) in enumerate(
            zip(nested_test_folds, splitter.split())):
        per_pixel_stats = preprocessing.compute_per_pixel_stats(
            preprocessed_db, None, idx=test_idx)
        std_per_pixel = numpy.where(per_pixel_stats[1] == 0.0, 1,
                                    per_pixel_stats[1])
        data[i]["outer"] = (per_pixel_stats[0], std_per_pixel)
        nested_splitter = PredefinedSplit(nested_test_fold)
        data[i]["nested"] = [{} for _ in range(nested_splitter.get_n_splits())]
        for j, (train_idx, val_idx) in enumerate(nested_splitter.split()):
            per_pixel_stats = preprocessing.compute_per_pixel_stats(
                preprocessed_db, None, idx=train_idx)
            std_per_pixel = numpy.where(per_pixel_stats[1] == 0.0, 1,
                                        per_pixel_stats[1])
            data[i]["nested"][j]["train"] = (per_pixel_stats[0], std_per_pixel)
            per_pixel_stats = preprocessing.compute_per_pixel_stats(
                preprocessed_db, None, idx=val_idx)
            std_per_pixel = numpy.where(per_pixel_stats[1] == 0.0, 1,
                                        per_pixel_stats[1])
            data[i]["nested"][j]["val"] = (per_pixel_stats[0], std_per_pixel)
    with open(os.path.splitext(cv_pkl_file)[0] + "_stats.pkl", "wb") as pkl:
        pickle.dump(data, pkl)
    return data
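# A quick illustration of the aliasing bug fixed above (generic Python, not
# from the original source): list multiplication copies the reference, not
# the dict, so mutating one element mutates "all" of them.
aliased = [{}] * 3
aliased[0]["k"] = 1
assert aliased == [{"k": 1}, {"k": 1}, {"k": 1}]  # all three changed

independent = [{} for _ in range(3)]
independent[0]["k"] = 1
assert independent == [{"k": 1}, {}, {}]  # only the first changed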
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by KFold.
    folds = -1 * np.ones(10)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(KFold(5, shuffle=True).split(X)):
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps_train = []
    ps_test = []
    ps = PredefinedSplit(folds)
    # n_splits is simply the number of unique folds
    assert_equal(len(np.unique(folds)), ps.get_n_splits())
    for train_ind, test_ind in ps.split():
        ps_train.append(train_ind)
        ps_test.append(test_ind)
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
    break
X_test = np.array(new_x)
y_test = np.array(new_y)
print(X_test.shape, y_test.shape, len(y_test[y_test == 0]),
      len(y_test[y_test == 1]))
assert X_test.shape[1] == tmp_shape[1]
assert X_test.shape[0] >= tmp_shape[0]
assert len(y_test[y_test == 0]) == len(y_test[y_test == 1])

# leave one person out per fold
test_fold = np.concatenate(
    [[0] * 43, [1] * 43, [2] * 43, [3] * 43, [4] * 43, [5] * 43, [6] * 43,
     [7] * 43, [-1] * ((nsubjects * (nclips - 1)) - (8 * nsubjects))])
gkf = PredefinedSplit(test_fold)
print('split train set into:', gkf.get_n_splits(), 'folds')

# We will use a Support Vector Classifier with balanced class weights
svm = SVC(class_weight='balanced')
clf_best = GridSearchCV(
    estimator=svm,
    param_grid=p_grid,
    cv=gkf,
    # note: the original also passed iid=False, which was removed in scikit-learn 0.24
    scoring=['accuracy', 'balanced_accuracy', 'f1_macro'],
    refit='f1_macro'
)
# refit with the params that give the best 'refit' value
clf_best.fit(X_train, y_train)
y_pred = clf_best.predict(X_train)
train_f1 = clf_best.best_score_
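# The hand-written test_fold above can be built more compactly; a sketch that
# mirrors the original arithmetic, assuming 8 held-out subjects with 43 clips
# each, and -1 for samples that never appear in a test fold:
n_heldout, clips_per_subject = 8, 43
rest = (nsubjects * (nclips - 1)) - (8 * nsubjects)
test_fold_alt = np.concatenate([
    np.repeat(np.arange(n_heldout), clips_per_subject),
    np.full(rest, -1),
])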
raw_test = read_idx("./Bases/MNIST/t10k-images-idx3-ubyte.gz")
test_data = raw_test.reshape(10000, 28 * 28)
test_label = read_idx("./Bases/MNIST/t10k-labels-idx1-ubyte.gz")
'''
amostra = base_MNIST.head()
base_MNIST = pd.DataFrame(data=amostra)
base_MNIST.to_excel("./Bases/MNIST.xlsx")
'''
# scale so the variables lie between 0 and 1
X_total = np.concatenate((train_data, test_data)) / 255.0
Y_total = np.concatenate((train_label, test_label))
# -1 marks the 60000 training images (never used as a test fold); 0 marks the
# 10000 test images, giving a single predefined train/test split
base_sep = np.repeat([-1, 0], [60000, 10000])
ps = PredefinedSplit(base_sep)
'''
ps.get_n_splits()
for train_index, test_index in ps.split():
    print("TRAIN:", train_index, "TEST:", test_index)
'''
# visualize the data
foto = 123
fig = plt.figure(figsize=(2, 2))
ax = fig.add_subplot(111)
ax.set_axis_off()
ax.imshow(raw_train[foto, :], cmap=plt.cm.gray_r, interpolation='nearest')
ax.set_title("target value = " + str(train_label[foto]))  # np.str was removed from NumPy
# reduce/filter the training set
idx = (train_label == 2) | (train_label == 3) | (train_label == 8)
def main():
    parser = ArgumentParser()
    add_args(parser)
    args = parser.parse_args()
    dsmoothedTraj = pickle.load(open(args.inputfile, 'rb'))
    addsmooth(dsmoothedTraj)
    # keep only trajectories with more than 100 points
    ltraj = [smo for smo in dsmoothedTraj.values() if smo.trajff.shape[0] > 100]
    print([smo.trajff.shape[0] for smo in ltraj])
    ntraj = len(ltraj)
    random.shuffle(ltraj)
    ts = pd.concat([smo.trajff for smo in ltraj])
    print(list(ts))
    # hyperparameters used for the 25.02 submissions
    # parameters = {'feature_fraction': 0.837266468665352,
    #               'learning_rate': 0.0013782873851139932,
    #               'min_child_samples': 33, 'num_leaves': 4,
    #               'reg_lambda': 5.725801055525217e-12,
    #               'subsample': 0.4944846046759285}
    # parameters = {k: [v] for k, v in parameters.items()}
    parameters = {
        'num_leaves': scipy.stats.randint(2, 11),
        'learning_rate': scipy.stats.loguniform(1e-4, 1e-2),
        'min_child_samples': scipy.stats.randint(10, 60),
        'subsample': scipy.stats.uniform(loc=0.3, scale=0.4),
        'reg_lambda': scipy.stats.loguniform(1e-14, 1e-10),
        'feature_fraction': scipy.stats.uniform(loc=0.7, scale=0.3),
    }
    print(parameters)
    lvar = [
        "error", "smoothedrawerror", "nb", "dt01", "countmeasure",
        "countmeasurecorrected", "baroAltitude"
    ] + [x for x in list(ts) if "density" in x] \
      + [x for x in list(ts) if "speed" in x] \
      + [x for x in list(ts) if "curvature" in x]
    if args.latlon:
        lvar = lvar + ["smoothedlatitude", "smoothedlongitude"]  # "nnpredlatitude", "nnpredlongitude"
    if args.dbaro:
        lvar = lvar + ["dbaroAltitude"]
    # compute folds so that each aircraft is inside only one fold
    test_fold = np.concatenate([
        np.repeat(i // 30, smo.trajff.shape[0]) for i, smo in enumerate(ltraj)
    ])  # [keep]
    ps = PredefinedSplit(test_fold)
    print("number of folds", ps.get_n_splits())
    lsensors, X = makeX(ts, lvar)
    y = makey(ts)
    model = MyLGBMClassifier(
        lsensors, feature_fraction=1, num_leaves=7, learning_rate=0.1,
        min_child_samples=10, subsample=1., reg_lambda=0.
    ) if args.classif else lgb.LGBMRegressor(
        n_estimators=4000, subsample_freq=10, random_state=0, n_jobs=1,
        objective='l2', importance_type='gain', max_bin=511)
    model = RandomizedSearchCV(model, parameters, cv=ps, n_jobs=args.n_jobs,
                               verbose=1, n_iter=args.n_iter, random_state=0)
    # 3 dirty lines below... just close your eyes and skip them
    model.argslearnmodel = args
    model.lsensors = lsensors
    model.lvar = lvar
    model.fit(X, y)
    print(model.score(X, y))
    print(model.cv_results_)
    print(model.best_params_)
    print(model.best_score_)
    if args.outputfile != '':
        with open(args.outputfile, 'wb') as f:
            pickle.dump(model, f)
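# An alternative way to keep each aircraft within a single fold is sklearn's
# GroupKFold with one group id per trajectory; a sketch (note GroupKFold
# balances fold sizes itself rather than grouping 30 trajectories per fold as
# the i // 30 test_fold above does):
from sklearn.model_selection import GroupKFold

groups = np.concatenate([
    np.repeat(i, smo.trajff.shape[0]) for i, smo in enumerate(ltraj)
])
gkf = GroupKFold(n_splits=int(np.ceil(len(ltraj) / 30)))
# gkf.split(X, y, groups) yields train/test indices that never split a trajectory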
def evaluate_on_target_systems(target_systems, training_systems, predictor,
                               pair_params, kernel_params, opt_params,
                               input_dir, estimator, feature_type, n_jobs=1,
                               perc_for_training=100):
    r"""
    Task: Evaluate rank-correlation, accuracy, etc. by learning an order
          predictor using the given set of training systems and predicting
          on the given set of target systems.

          For the evaluation we use either a repeated random split of the
          target systems' data (if fewer than 75 examples are provided for
          testing) or a cross-validation (otherwise). The hyper-parameters
          of the order predictor are optimized using a nested
          cross-validation. The routines for that can be found in the file
          'model_selection_cls.py'.

          If desired (excl_mol_by_struct_only == True), the molecular
          structures from the test set are removed from the training based
          on their molecular structure, e.g. by comparison of their InChIs,
          _even_ if these structures have been measured with another than
          the target system, i.e., another chromatographic system.

          See also the paper for details on the evaluation strategy.

    :param target_systems: list of strings, containing the target systems

    :param training_systems: list of strings, containing the training systems

    :param predictor: list of strings, containing the predictors / molecular
        features used for the model construction.

    :param pair_params: dictionary, containing the parameters used for the
        creation of the RankSVM learning pairs, e.g. minimum and maximum
        order distance.

    :param kernel_params: dictionary, containing the parameters for the
        kernels and generally for handling the input features / predictors.
        See definition of the dictionary in the __main__ of file
        'evaluation_scenario_cls.py'.

    :param opt_params: dictionary, containing the parameters controlling the
        hyper-parameter optimization, number of cross-validation splits, etc.
        See definition of the dictionary in the __main__ of file
        'evaluation_scenario_cls.py'.

    :param input_dir: string, directory containing the input data, e.g.,
        fingerprints and retention times.

    :param estimator: string, order predictor to use: either "ranksvm" or
        "svr".

    :param feature_type: string, feature type that is used for the RankSVM.
        Currently only 'difference' features are supported, i.e.,
        \phi_j - \phi_i is used for the decision. If the estimator is not
        RankSVM, but e.g. Support Vector Regression, then this parameter can
        be set to None and is ignored.

    :param n_jobs: integer, number of jobs used for the hyper-parameter
        estimation. The maximum number of used jobs is the number of inner
        splits (cross-validation or random split)!

    :param perc_for_training: scalar, percentage of the target system's data
        that is used for the training, e.g., selected by simple random
        sub-sampling. This value only affects the training process, if the
        target system is in the set of training systems.

    :return: tuple of pandas.DataFrame

        1) mapped_values: predicted order scores for each target system
            - corresponds to: w^\phi_i in the RankSVM case
            - corresponds to: the predicted retention time, in the SVR case
        2) correlations: rank correlations of the order scores for each
           target system
        3) accuracies: pairwise prediction accuracies for each target system
        4) simple_statistics: number of training and test examples, etc.
        5) grid_search_results: hyper-parameter scores for the different
           grid parameters
        6) grid_search_best_params: hyper-parameter scores for the best
           grid parameters

    NOTE: The returned results (except mapped_values and grid search results)
          are averages across the different random splits / cross-validation
          folds and repetitions.
    """
    # Variables related to the number of random / cv splits, for inner (*_cv)
    # and outer fold (*_ncv).
    n_splits_shuffle = opt_params["n_splits_shuffle"]
    n_splits_nshuffle = opt_params["n_splits_nshuffle"]
    n_splits_cv = opt_params["n_splits_cv"]
    n_splits_ncv = opt_params["n_splits_ncv"]
    n_rep = opt_params["n_rep"]

    # Should molecules be excluded from the training, if their structure appears
    # in the test _even if_ they have been measured with another system than the
    # (current) target system:
    excl_mol_by_struct_only = opt_params["excl_mol_by_struct_only"]

    # Currently only 'slack_type == "on_pairs"' is supported.
    slack_type = opt_params["slack_type"]
    if slack_type != "on_pairs":
        raise ValueError("Invalid slack type: %s" % slack_type)

    # Should all possible pairs be used for the (inner) test split during the
    # parameter estimation, regardless of the settings for 'd_upper' and
    # 'd_lower'?
    all_pairs_for_test = opt_params["all_pairs_for_test"]

    if estimator not in ["ranksvm", "svr"]:
        raise ValueError("Invalid estimator: %s" % estimator)

    # RankSVM and SVR regularization parameter
    param_grid = {"C": opt_params["C"]}

    if estimator == "svr":
        # error-tube width of the SVR
        param_grid["epsilon"] = opt_params["epsilon"]

    # Molecule kernel
    if kernel_params["kernel"] == "linear":
        kernel = "linear"
    elif kernel_params["kernel"] in ["rbf", "gaussian"]:
        param_grid["gamma"] = kernel_params["gamma"]
        kernel = "rbf"
    elif kernel_params["kernel"] == "tanimoto":
        if estimator in ["ranksvm"]:
            kernel = tanimoto_kernel
        elif estimator in ["svr"]:
            kernel = tanimoto_kernel_mat
    elif kernel_params["kernel"] == "minmax":
        if estimator in ["ranksvm"]:
            kernel = minmax_kernel
        elif estimator in ["svr"]:
            kernel = minmax_kernel_mat
    else:
        raise ValueError("Invalid kernel: %s." % kernel_params["kernel"])

    if isinstance(target_systems, str):
        target_systems = [target_systems]
    if isinstance(training_systems, str):
        training_systems = [training_systems]
    all_systems = list(set(target_systems).union(training_systems))
    assert isinstance(target_systems, list) and isinstance(training_systems, list)

    n_target_systems = len(target_systems)
    n_training_systems = len(training_systems)
    print("Target systems (# = %d): %s"
          % (n_target_systems, ",".join(target_systems)))
    print("Training systems (# = %d): %s"
          % (n_training_systems, ",".join(training_systems)))

    ## Load the target and training systems into dictionaries using
    ## (molecule, system)-keys and retention times respectively molecular
    ## features as values

    # If we use molecular descriptors, we need to scale the data, e.g. to [0, 1].
if kernel_params["scaler"] == "noscaling": scaler = None elif kernel_params["scaler"] == "minmax": scaler = MinMaxScaler() elif kernel_params["scaler"] == "std": scaler = StandardScaler() elif kernel_params["scaler"] == "l2norm": scaler = Normalizer() else: raise ValueError("Invalid scaler for the molecular features: %s" % kernel_params["scaler"]) # Handle counting MACCS fingerprints if predictor[0] == "maccsCount_f2dcf0b3": predictor_c = ["maccs"] predictor_fn = "fps_maccs_count.csv" else: predictor_c = predictor predictor_fn = None d_rts, d_features, d_system_index = OrderedDict(), OrderedDict( ), OrderedDict() for k_sys, system in enumerate(all_systems): rts, data = load_data(input_dir, system=system, predictor=predictor_c, pred_fn=predictor_fn) # Use (mol-id, system)-tupel as key keys = list(zip(rts.inchi.values, [system] * rts.shape[0])) # Values: retention time, features rts = rts.rt.values.reshape(-1, 1) data = data.drop("inchi", axis=1).values if kernel_params["poly_feature_exp"]: # If we use binary fingerprints, we can include some # interactions, e.g. x_1x_2, ... data = PolynomialFeatures(interaction_only=True, include_bias=False).fit_transform(data) # Make ordered directories d_rts[system], d_features[system] = OrderedDict(), OrderedDict() for i, key in enumerate(keys): d_rts[system][key] = rts[i, 0] d_features[system][key] = data[i, :] # Dictionary containing a unique numeric identifier for each system d_system_index[system] = k_sys if scaler is not None: if getattr(scaler, "partial_fit", None) is not None: # 'partial_fit' allows us to learn the parameters of the scaler # online. (great stuff :)) scaler.partial_fit(data) else: # We have scaler at hand, that does not allow online fitting. # This probably means, that this is a scaler, that performs # the desired scaling for each example independently, e.g. # sklearn.preprocessing.Normalizer. pass for system in target_systems: print("Target set '%s' contains %d examples." % (system, len(d_rts[system]))) # Collect all the data that is available for training. d_rts_training = join_dicts(d_rts, training_systems) d_features_training = join_dicts(d_features, training_systems) # (mol-id, system)-tuples used in the training set l_keys_training = list(d_features_training.keys()) # Data frames storing the evaluation measures mapped_values = { target_system: DataFrame() for target_system in target_systems } accuracies, correlations, simple_statistics = DataFrame(), DataFrame( ), DataFrame() grid_search_results, grid_search_best_params = DataFrame(), DataFrame() for idx_system, target_system in enumerate(target_systems): print("Process target system: %s (%d/%d)." % (target_system, idx_system + 1, len(target_systems))) # (mol-id, system)-tuples in the target set l_keys_target = list(d_features[target_system].keys()) for i_rep in range(n_rep): print("Repetition: %d/%d" % (i_rep + 1, n_rep)) # Get a random subset of the training data l_keys_training_sub = sample_perc_from_list(l_keys_training, tsystem=target_system, perc=perc_for_training, random_state=747 * i_rep) print("Training set contains %d (%f%%) examples." % (len(l_keys_training_sub), 100 * len(l_keys_training_sub) / len(l_keys_training))) for training_system in training_systems: n_train_sys_sub = sum( np.array(list(zip( *l_keys_training_sub))[1]) == training_system) n_train_sys = sum( np.array(list(zip( *l_keys_training))[1]) == training_system) print("\tSystem %s contributes %d (%f%%) examples." 
                      % (training_system, n_train_sys_sub,
                         100 * n_train_sys_sub / n_train_sys))

            # Check whether the target system has any overlap with a training system
            print("Outer validation split strategy: ", end="", flush=True)
            l_molids_training = list(zip(*l_keys_training_sub))[0]
            l_molids_target = list(zip(*l_keys_target))[0]

            if (excl_mol_by_struct_only
                    and len(set(l_molids_training) & set(l_molids_target)) == 0) \
                    or (not excl_mol_by_struct_only
                        and len(set(l_keys_training_sub) & set(l_keys_target)) == 0):
                print("Predefined split:\n"
                      "\tTraining and target do not share molecular structures "
                      "(excl_mol_by_struct_only=%d)" % excl_mol_by_struct_only)
                # All target examples get fold id 0, i.e. a single "split"
                # whose test set is the whole target set.
                cv_outer = PredefinedSplit(np.zeros(len(l_keys_target)))
            else:
                # Determine strategy for training / test splits
                if len(l_keys_target) < 75:
                    print("ShuffleSplit")
                    train_size = 0.75
                    cv_outer = ShuffleSplit(n_splits=n_splits_shuffle,
                                            train_size=train_size,
                                            test_size=(1 - train_size),
                                            random_state=320 * i_rep)
                else:
                    print("KFold")
                    cv_outer = KFold(n_splits=n_splits_cv, shuffle=True,
                                     random_state=320 * i_rep)

            # Performance evaluation using cross-validation / random splits
            for i_fold, (_, test_set) in enumerate(cv_outer.split(l_keys_target)):
                print("Outer fold: %d/%d" % (i_fold + 1, cv_outer.get_n_splits()))

                # (mol-id, system)-tuples in the test subset of the target set
                l_keys_target_test = [l_keys_target[idx] for idx in test_set]

                # Remove the test subset of the target set from the training set.
                # NOTE: The training set might contain the whole target set.
                l_molids_target_test = list(zip(*l_keys_target_test))[0]
                if excl_mol_by_struct_only:
                    l_keys_training_train = [
                        key for key in l_keys_training_sub
                        if key[0] not in l_molids_target_test
                    ]
                else:
                    l_keys_training_train = [
                        key for key in l_keys_training_sub
                        if key not in l_keys_target_test
                    ]

                if isinstance(cv_outer, PredefinedSplit):
                    print("Shuffle pre-defined split.")
                    rs_old = np.random.get_state()
                    np.random.seed(320 * i_fold)
                    # If we use the pre-defined splits, we need to shuffle
                    # ourselves. That way we prevent bias during the h-param
                    # estimation.
                    np.random.shuffle(l_keys_training_train)  # shuffles inplace
                    np.random.set_state(rs_old)

                l_molids_training_train = list(zip(*l_keys_training_train))[0]

                if excl_mol_by_struct_only:
                    assert (len(set(l_molids_target_test)
                                & set(l_molids_training_train)) == 0)
                else:
                    assert (len(set(l_keys_target_test)
                                & set(l_keys_training_train)) == 0)

                # Determine strategy for training / test splits (inner)
                print("Inner (h-param) validation split strategy: ",
                      end="", flush=True)
                if len(l_keys_training_train) < 75:
                    print("GroupShuffleSplit")
                    train_size = 0.75
                    cv_inner = GroupShuffleSplit(n_splits=n_splits_nshuffle,
                                                 train_size=train_size,
                                                 test_size=(1 - train_size),
                                                 random_state=350 * i_fold * i_rep)
                else:
                    print("GroupKFold")
                    cv_inner = GroupKFold(n_splits=n_splits_ncv)

                # Train the RankSVM: find the optimal set of hyper-parameters
                od_rts_training_train, od_features_training_train = \
                    OrderedDict(), OrderedDict()
                for key in l_keys_training_train:
                    od_rts_training_train[key] = d_rts_training[key]
                    od_features_training_train[key] = d_features_training[key]

                start_time = time.time()

                if estimator == "ranksvm":
                    best_params, cv_results, n_train_pairs, ranking_model, _, _ = \
                        find_hparan_ranksvm(
                            estimator=KernelRankSVC(
                                kernel=kernel, slack_type=slack_type,
                                random_state=319 * i_fold * i_rep),
                            fold_score_aggregation="weighted_average",
                            X=od_features_training_train,
                            y=od_rts_training_train,
                            param_grid=param_grid,
                            cv=cv_inner,
                            pair_params=pair_params,
                            n_jobs=n_jobs,
                            scaler=scaler,
                            all_pairs_as_test=all_pairs_for_test)
                elif estimator == "svr":
                    best_params, cv_results, n_train_pairs, ranking_model = \
                        find_hparam_regression(
                            estimator=SVRPairwise(kernel=kernel),
                            X=od_features_training_train,
                            y=od_rts_training_train,
                            param_grid=param_grid,
                            cv=cv_inner,
                            n_jobs=n_jobs,
                            scaler=scaler)
                else:
                    raise ValueError("Invalid estimator: %s" % estimator)

                rtime_gcv = time.time() - start_time
                print("[find_hparam_*] %.3fsec" % rtime_gcv)

                # Store the grid-search statistics for further analyses
                grid_search_results_tmp = DataFrame(cv_results)
                grid_search_results_tmp["target_system"] = target_system
                grid_search_results_tmp["training_systems"] = ";".join(training_systems)
                grid_search_results = grid_search_results.append(grid_search_results_tmp)

                grid_search_best_params_tmp = DataFrame([best_params])
                grid_search_best_params_tmp["target_system"] = target_system
                grid_search_best_params_tmp["training_systems"] = ";".join(training_systems)
                grid_search_best_params = grid_search_best_params.append(grid_search_best_params_tmp)
                print(grid_search_best_params_tmp)

                ## Do prediction for the test set
                # Calculate: w' * \phi(x_i), for all molecules i
                X_test, rts_test = [], []
                for key in l_keys_target_test:
                    rts_test.append(d_rts[target_system][key])
                    X_test.append(d_features[target_system][key])
                rts_test = np.array(rts_test).reshape(-1, 1)
                X_test = np.array(X_test)
                if scaler is not None:
                    X_test = scaler.transform(X_test)

                if estimator == "ranksvm":
                    Y_pred_test = ranking_model.predict(X_test, X_test)
                elif estimator == "svr":
                    Y_pred_test = ranking_model.predict(X_test)
                else:
                    raise ValueError("Invalid estimator: %s" % estimator)

                wTx = ranking_model.map_values(X_test)

                mapped_values[target_system] = pd.concat([
                    mapped_values[target_system],
                    DataFrame({"mapped_value": wTx,
                               "true_rt": rts_test.flatten(),
                               "inchi": l_molids_target_test})
                ], ignore_index=True)

                correlations = correlations.append(
                    {"rank_corr": sp.stats.kendalltau(wTx, rts_test)[0],
                     "spear_corr": sp.stats.spearmanr(wTx, rts_test)[0],
                     "target_system": target_system,
                     "training_system":
";".join(training_systems) }, ignore_index=True) n_train_mol = len(set(l_molids_training_train)) n_test_mol = len(set(l_molids_target_test)) n_shared_mol = len( set(l_molids_target_test) & (set(l_molids_training_train))) p_shared_mol = float(n_shared_mol) / n_test_mol # Predict: x_i > x_j or x_i < x_j for all molecule pairs (i, j) with Timer("Get prediction score"): for d_lower, d_upper in itertools.product( [0] + list(range(1, 15, 2)), 2**np.array([0, 1, 2, 3, 4, 5, 6, np.inf])): if d_lower > d_upper: continue pairs_test = get_pairs_single_system(rts_test, d_lower=d_lower, d_upper=d_upper) accuracies = accuracies.append( { "score_w": ranking_model.score_using_prediction( Y_pred_test, pairs_test, normalize=False), "score": ranking_model.score_using_prediction( Y_pred_test, pairs_test), "n_pairs_test": len(pairs_test), "target_system": target_system, "training_system": ";".join(training_systems), "d_lower": d_lower, "d_upper": d_upper, "i_rep": i_rep }, ignore_index=True) # Write out how many molecular structures are shared between the target and training systems n_test_pairs = len(pairs_test) simple_statistics = simple_statistics.append( { "n_shared_mol": n_shared_mol, "p_shared_mol": p_shared_mol, "n_train_mol": n_train_mol, "n_test_mol": n_test_mol, "n_train_pairs": n_train_pairs, "n_test_pairs": n_test_pairs, "grid_search_time": rtime_gcv, "target_system": target_system, "training_systems": ";".join(training_systems), "d_lower": d_lower, "d_upper": d_upper }, ignore_index=True) # Average the mapped values over the repetitions for target_system in target_systems: mapped_values[target_system]["mapped_value_std"] = mapped_values[ target_system]["mapped_value"] mapped_values[target_system] = mapped_values[target_system].groupby( ["inchi"], as_index=False).agg({ "mapped_value": np.mean, "mapped_value_std": np.std, "true_rt": np.unique }) # Aggregate the rows in 'correlations' to get the mean- and std-values across the folds. 
correlations["rank_corr_std"] = correlations["rank_corr"] correlations["spear_corr_std"] = correlations["spear_corr"] correlations = correlations.groupby(["target_system", "training_system"], as_index=False).agg({ "rank_corr": np.mean, "rank_corr_std": np.std, "spear_corr": np.mean, "spear_corr_std": np.std }) # Aggregate the rows in 'accuracies' to get the expected pairwise accuracy accuracies = accuracies.groupby( ["target_system", "training_system", "d_lower", "d_upper", "i_rep"], as_index=False).agg({ "score_w": np.sum, "n_pairs_test": np.sum, "score": np.mean }) accuracies["score_w"] = accuracies["score_w"] / accuracies["n_pairs_test"] accuracies.drop(["i_rep", "n_pairs_test"], axis=1, inplace=True) # Calculate expected accuracy across the repetitions accuracies["score_w_std"] = accuracies["score_w"] accuracies["score_std"] = accuracies["score"] accuracies = accuracies.groupby( ["target_system", "training_system", "d_lower", "d_upper"], as_index=False).agg({ "score_w": np.mean, "score_w_std": np.std, "score": np.mean, "score_std": np.std }) # Aggregate the simple statistics simple_statistics["n_shared_mol_std"] = simple_statistics["n_shared_mol"] simple_statistics["p_shared_mol_std"] = simple_statistics["p_shared_mol"] simple_statistics["n_train_mol_std"] = simple_statistics["n_train_mol"] simple_statistics["n_test_mol_std"] = simple_statistics["n_test_mol"] simple_statistics["n_train_pairs_std"] = simple_statistics["n_train_pairs"] simple_statistics["n_test_pairs_std"] = simple_statistics["n_test_pairs"] simple_statistics["grid_search_time_std"] = simple_statistics[ "n_test_pairs"] simple_statistics = simple_statistics.groupby( ["target_system", "training_systems", "d_lower", "d_upper"], as_index=False).agg({ "n_shared_mol": np.mean, "p_shared_mol": np.mean, "n_train_mol": np.mean, "n_test_mol": np.mean, "n_train_pairs": np.mean, "n_test_pairs": np.mean, "grid_search_time": np.mean, "n_shared_mol_std": np.std, "p_shared_mol_std": np.std, "n_train_mol_std": np.std, "n_test_mol_std": np.std, "n_train_pairs_std": np.std, "n_test_pairs_std": np.std, "grid_search_time_std": np.std }) return mapped_values, correlations, accuracies, simple_statistics, grid_search_results, grid_search_best_params
# %% --------------------
import numpy as np
from sklearn.model_selection import PredefinedSplit

# %% --------------------
X = np.array([0, 1, 2, 3, 4])
y = np.array([0, 0, 1, 1, 1])

# %% --------------------
# For example, when using a validation set, set the test_fold to 0 for all
# samples that are part of the validation set, and to -1 for all other samples.
# test_fold = [2, 2, 1, 1, -1]
test_fold = np.append(np.full(4, -1), np.full(1, 0))

# %% --------------------
ps = PredefinedSplit(test_fold)

# %% --------------------
print(ps.get_n_splits())

# %% --------------------
print(ps)

# %% --------------------
for train_index, test_index in ps.split():
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# %% --------------------
print('Real:', Counter(y_train))
print('Over:', Counter(y_over))
print('Under:', Counter(y_under))
print('Balanced:', Counter(y_bal))

###############################################################################
## Create learning model (Decision Tree) and tune hyperparameters
###############################################################################
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

### -1 indices -> train
### 0 indices -> validation
test_fold = np.repeat([-1, 0], [X_train.shape[0], X_val.shape[0]])
myPreSplit = PredefinedSplit(test_fold)
print('n_splits:', myPreSplit.get_n_splits())
for train_index, test_index in myPreSplit.split():
    print("TRAIN:", train_index, "TEST:", test_index)

parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 10, 100, 1000, 10000, 100000, 1000000, None],
    'min_samples_split': [2, 3, 4]
}
clf = DecisionTreeClassifier()
model = GridSearchCV(estimator=clf,
                     param_grid=parameters,
                     scoring='f1_weighted',
                     cv=myPreSplit,
pipeline = Pipeline([('dim_red', FunctionTransformer(validate=True)),
                     ('norm', FunctionTransformer(validate=True))])
X2 = pipeline.fit_transform(X1)

#%%
import numpy as np
from sklearn.model_selection import PredefinedSplit

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
test_fold = [-1, -1, -1, 0]
ps = PredefinedSplit(test_fold)
print(ps.get_n_splits())
print(ps)
for train_index, test_index in ps.split():
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

#%%
data = pd.DataFrame(
    data={
        'text_feat': [
            'This is my first sentence. I hope you like it',
            'Here is my second sentence. It is pretty similar.'
        ],
    return float(cm[0][0] + cm[1][1] + cm[2][2]) / (sum(cm[0]) + sum(cm[1]) + sum(cm[2]))


def sen(m):
    # sensitivity of the first class
    return m[0][0] / sum(m[0])


def ppv(m):
    # positive predictive value of the first class
    return m[0][0] / (m[0][0] + m[1][0])


# my_scorer = make_scorer(myMCC)
ps = PredefinedSplit(fold_set)
# print(ps)
print("NUMBER OF K-FOLDS =", ps.get_n_splits())  # returns the number of k-folds
print("\nThe length of the set X:", len(X), "y:", len(y))
print("\nThe length of the fold set:", len(fold_set))
print()
print(len(X))
# print(len(X[-1]))
# print("y_true")
print(len(y))
print(type(y[1]))
mySVC = SVC(C=2.0, kernel='rbf', gamma=0.5)  # build the SVC model
print()
print("TRAINING INITIALIZATION")
print()
y_pred = cross_val_predict(mySVC, X, y, cv=ps, n_jobs=2)
print()
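# With the cross-validated predictions in hand, the helper metrics above can
# be evaluated; a minimal sketch assuming sklearn's confusion_matrix (the
# original code may compute this elsewhere):
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y, y_pred)
print("sensitivity:", sen(cm))
print("PPV:", ppv(cm))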