def train_pipeline(df, pipe, clf_params, folds=10):
    """Train *pipe* via a stratified, shuffled grid search over *clf_params*.

    The dimension-reducer part of the parameter grid (pca, rfe, kbest) is
    produced by ``create_pipeline_params`` and bounded by the number of
    feature columns in *df*.

    :param df: dataframe holding both the feature columns and the label.
    :param pipe: sklearn Pipeline (dimension reducer + classifier) to tune.
    :param clf_params: classifier parameter grid, merged with the
        predefined reducer parameters.
    :param folds: number of stratified shuffle splits to evaluate.
    :return: tuple of (best estimator, best parameters).
    """
    features, target = split_features_labels(df)

    # Reducer search space must not exceed the available feature count.
    max_n = features.columns.size
    params = create_pipeline_params(clf_params, max_n)

    # Stratified shuffle splits keep the class balance in every 70/30 split.
    # FIX: pass random_state to the constructor instead of assigning the
    # attribute after construction.
    folder = StratifiedShuffleSplit(folds, test_size=.3, train_size=.7,
                                    random_state=random_state)

    # f1 scoring combines precision and recall into a single training metric.
    grid = GridSearchCV(pipe, cv=folder, n_jobs=4, param_grid=params,
                        scoring='f1')
    grid.fit(features, target)

    # FIX: return a tuple as the docstring promises (previously a list).
    return grid.best_estimator_, grid.best_params_
def run(
    word_params: Optional["CNNParams"],
    char_params: Optional["CNNParams"],
    training_sizes: List[int],
    window_sizes: List[Tuple[int, int]],
    k: int = 5,
    nocluster_dropout: float = 0.5,
    kmeans_path: str = "../clustered",
    gmm_path: str = "../clustered_gmm",
    num_clusters: int = 10,
    num_clusters_gmm: int = 10,
    use_cluster_cnn: bool = False,
    use_only_clusters: bool = False,
    use_bow: bool = False,
) -> Tuple[Results, Results]:
    """Run k-fold cross-validated training over every (training size,
    window size) combination and collect scores per model variant.

    For each present parameter set (word and/or char) up to three variants
    are evaluated per combination: an optional no-cluster baseline
    (enabled when ``nocluster_dropout >= 0``), a kmeans-clustered model and
    a GMM-clustered model.  Results are keyed as
    ``result[window_size][training_size] -> list of per-fold values``.

    :param word_params: CNN hyper-parameters for the word-level model, or
        ``None`` to skip it.
    :param char_params: CNN hyper-parameters for the char-level model, or
        ``None`` to skip it.
    :param training_sizes: training-set sizes to sweep.
    :param window_sizes: (left, right) context window sizes to sweep.
    :param k: number of stratified shuffle splits per combination.
    :param nocluster_dropout: dropout for the no-cluster baseline; a
        negative value disables the baseline variant entirely.
    :param use_bow: evaluate a bag-of-words baseline instead of the CNNs.
    :return: tuple of (word-model Results, char-model Results).
    """
    if not (word_params or char_params):
        # FIX: message previously missed its closing brace.
        print("Need at least one of {word_params, char_params}")
        return Results(None, None, None), Results(None, None, None)

    both_models = word_params and char_params

    # Accumulators: result[window_size][training_size] -> per-fold values.
    baseline = defaultdict(dict)
    dbscan = defaultdict(dict)
    gmm = defaultdict(dict)
    char_baseline = defaultdict(dict)
    char_dbscan = defaultdict(dict)
    char_gmm = defaultdict(dict)

    # FIX: the cluster-model factories previously always read
    # word_params.dropout and crashed with AttributeError when only
    # char_params was supplied; fall back to whichever set is present.
    dropout_source = word_params if word_params else char_params

    if use_cluster_cnn:
        def fn(w, n):
            return lambda r: CNNClusterLabels(r, w, n, dropout_source.dropout)
    elif use_only_clusters:
        def fn(w, n):
            return lambda r: OnlyClusterLabels(
                r, n * (sum(w) + 1), dropout_source.dropout)
    else:
        def fn(w, n):
            return lambda r: CategoricalClusterLabels(
                r, n * (sum(w) + 1), dropout_source.dropout)

    for training_size in training_sizes:
        for window_size in window_sizes:
            optim_fn = lambda p: torch.optim.Adam(p)

            # Model factories, one per evaluated variant, scheduled in the
            # same order as params_list / use_dist_list / result_order below.
            # FIX: the baseline appends were previously unconditional, so
            # with exactly one of word/char params present the counts
            # disagreed with result_order and the assert below fired.
            model_fns = []
            if word_params:
                if nocluster_dropout >= 0:
                    model_fns.append(
                        lambda r: NoClusterLabels(r, nocluster_dropout))
                model_fns += [
                    fn(window_size, num_clusters),
                    fn(window_size, num_clusters_gmm),
                ]
            if char_params:
                if nocluster_dropout >= 0:
                    model_fns.append(
                        lambda r: NoClusterLabels(r, nocluster_dropout))
                model_fns += [
                    fn(window_size, num_clusters),
                    fn(window_size, num_clusters_gmm),
                ]

            dataset, validset, testset = load_dataset(
                kmeans_path,
                gmm_path,
                num_clusters,
                num_clusters_gmm,
                window_size[0],
                window_size[1],
                old_test=True,
            )
            splitter = StratifiedShuffleSplit(
                n_splits=k,
                train_size=training_size,
                test_size=None,
                random_state=100,
            )

            # One hyper-parameter entry per variant: optional baseline +
            # kmeans + gmm for each present model.
            params_list = []
            multiplier = 3 if nocluster_dropout >= 0 else 2
            params_list += ([word_params] * multiplier) if word_params else []
            params_list += ([char_params] * multiplier) if char_params else []

            # Per-variant flag, True only for the GMM variant — presumably
            # whether to feed cluster-distance features; confirm in cross_val.
            use_dist_list: List[bool]
            if nocluster_dropout >= 0:
                use_dist_list = [False, False, True] * (2 if both_models else 1)
            else:
                use_dist_list = [False, True] * (2 if both_models else 1)

            if use_bow:
                values = cross_val_bow(k, splitter, dataset, testset=testset)
            else:
                values = cross_val(
                    k,
                    splitter,
                    model_fns,
                    use_dist_list,
                    optim_fn,
                    dataset,
                    params=params_list,
                    early_stopping=3,
                    validation_set=validset,
                    batch_size=128,
                    testset=testset,
                )

            # Map each result column back onto its accumulator, in the same
            # order the variants were scheduled above.
            result_order = []
            if word_params:
                if nocluster_dropout >= 0:
                    result_order.append(baseline)
                result_order += [dbscan, gmm]
            if char_params:
                if nocluster_dropout >= 0:
                    result_order.append(char_baseline)
                result_order += [char_dbscan, char_gmm]
            if use_bow:
                # Special case: bag-of-words yields a single result column.
                result_order = [baseline]

            num_iter = len(values[0])
            assert num_iter == len(result_order)
            for i, var in enumerate(result_order):
                var[window_size][training_size] = [v[i] for v in values]

    return (
        Results(baseline, dbscan, gmm),
        Results(char_baseline, char_dbscan, char_gmm),
    )