def stop_by_num_features_parallel(logs: ListLogListType,
                                  extractor: ExtractorFnType,
                                  metric_name: str,
                                  min_num_features: int = 50) -> bool:
    """
    Selects the best log out of a list to see if feature selection should stop

    Parameters
    ----------
    logs : list of list of list of dict
        A list of log-like lists of dictionary evaluations.

    extractor: function str -> float
        An extractor that takes a string and returns the value of that string
        on a dict

    metric_name: str
        String with the name of the column that refers to the metric column
        to be extracted

    min_num_features: int (default 50)
        The minimum number of features the model can have before stopping

    Returns
    -------
    stop: bool
        A boolean indicating whether to stop the recursion
    """
    best_log = get_best_performing_log(first(logs), extractor, metric_name)
    return stop_by_num_features([best_log], min_num_features)
def composer(self, tokens, **kwargs):
    rekey = []
    for token in tokens:
        token = [token]
        if first(token[0]) is map:
            # Wrap the mapped callable in joblib.delayed and schedule a
            # Parallel step right after the map token.
            token[0][1] = [delayed(token[0][1][0])]
            token.append([Parallel(n_jobs=self.n_jobs), [], {}])
        rekey.extend(token)
    return super().composer(rekey)
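# The joblib pattern the composer above builds on, in isolation: `delayed`
# wraps a callable into a lazy (fn, args, kwargs) triple and `Parallel`
# executes a batch of them. A minimal, self-contained sketch:
from joblib import Parallel, delayed

results = Parallel(n_jobs=2)(delayed(abs)(x) for x in [-1, -2, 3])
# results -> [1, 2, 3]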
def auto_correlation(im, **kwargs):
    windowed = weiner_khinchin_auto_correlation.copy()
    for i, token in enumerate(windowed._tokens):
        if first(token) in (np.fft.fftpack.fftn, np.fft.fftpack.ifftn):
            # Forward the keyword arguments to both FFT steps of the pipeline.
            windowed._tokens[i][2] = kwargs
    return windowed.value(im)
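# The Wiener-Khinchin theorem the pipeline above relies on, as a standalone
# sketch (`wk_auto_corr` is an illustrative name, not part of the code above):
# the autocorrelation of a signal is the inverse FFT of its power spectrum.
import numpy as np

def wk_auto_corr(im):
    spectrum = np.fft.fftn(im)
    return np.fft.ifftn(spectrum * np.conj(spectrum)).real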
def validator(train_data: pd.DataFrame,
              split_fn: SplitterFnType,
              train_fn: LearnerFnType,
              eval_fn: EvalFnType) -> ValidatorReturnType:
    """
    Splits the training data into folds given by the split function and
    performs a train-evaluation sequence on each fold by calling
    ``validator_iteration``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    split_fn : function pandas.DataFrame -> list of tuple
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array contains
        validation indexes.

    train_fn : function pandas.DataFrame -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and
        returns a predict function, a dataset with training predictions and
        training logs.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    Returns
    -------
    A list of log-like dictionary evaluations.
    """
    folds, logs = split_fn(train_data)

    def fold_iter(fold: Tuple[int, Tuple[pd.Index, pd.Index]]) -> LogType:
        (fold_num, (train_index, test_indexes)) = fold
        return validator_iteration(train_data, train_index, test_indexes,
                                   fold_num, train_fn, eval_fn)

    zipped_logs = pipe(folds, enumerate, map(fold_iter), partial(zip, logs))

    def _join_split_log(log_tuple: Tuple[LogType, LogType]) -> Tuple[LogType, LogType]:
        train_log = {}
        split_log, validator_log = log_tuple
        train_log["train_log"] = validator_log["train_log"]
        return train_log, assoc(dissoc(validator_log, "train_log"),
                                "split_log", split_log)

    train_logs, validator_logs = zip(*map(_join_split_log, zipped_logs))

    first_train_log = first(train_logs)
    return assoc(first_train_log, "validator_log", list(validator_logs))
def _get_conditions(*conditions):
    return " AND ".join(
        concatv(
            [
                "gp.docstatus = 1",
                "gp.posting_date >= %(from)s",
                "gp.posting_date <= %(to)s",
            ],
            [first(x) for x in filter(lambda x: x, conditions)],
        ))
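# A usage sketch with a hypothetical extra condition; each condition is
# assumed to be a (clause, values) tuple, and falsy entries are dropped
# by the filter before `first` pulls out the clause:
_get_conditions(("gp.party = %(party)s", {"party": "X"}), None)
# -> "gp.docstatus = 1 AND gp.posting_date >= %(from)s AND "
#    "gp.posting_date <= %(to)s AND gp.party = %(party)s"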
def test_backward_subset_feature_selection(train_df, train_fn, eval_fn,
                                           split_fn, base_extractor, metric_name):
    features_sets = {"first": ["x1", "x2"],
                     "second": ["x4", "x5"],
                     "third": ["x3", "x6"]}

    logs = backward_subset_feature_selection(
        train_df, train_fn, features_sets, split_fn, eval_fn, base_extractor,
        metric_name, num_removed_by_step=1, threshold=-1, early_stop=10,
        iter_limit=50, min_remaining_features=5)
    assert len(get_used_features(first(logs)[0])) <= 5  # stop by remaining features

    logs = backward_subset_feature_selection(
        train_df, train_fn, features_sets, split_fn, eval_fn, base_extractor,
        metric_name, num_removed_by_step=1, threshold=0, early_stop=10,
        iter_limit=1, min_remaining_features=3)
    assert len(logs) == 1  # stop by iter limit

    logs = backward_subset_feature_selection(
        train_df, train_fn, features_sets, split_fn, eval_fn, base_extractor,
        metric_name, num_removed_by_step=1, threshold=1, early_stop=2,
        iter_limit=50, min_remaining_features=1)
    assert len(logs) == 2  # stop by early_stop
def load_csv():
    '''Initialize data from csv.'''
    tablename = 'measurements'
    print("Checking if table %s exists." % tablename)
    tbl = check_table(tablename)
    if tbl is not None:
        print('Table exists, skipping. Drop it first?')
        return
    print('Table not found, initializing with csv data.')
    tbl = get_table(tablename)
    filepath = os.path.abspath(os.path.dirname(__file__))
    with open(filepath + '/data/old_entries.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # String comparison, not identity: `is not '3'` only worked by
            # accident of string interning.
            if row['site_id'] != '3':
                print('Skip entry for test_site')
                continue
            # Remove empty items from dict
            cleaned_row = dict((k, v) for k, v in row.items() if v)
            # Take date part of time string gotten from postgres
            datestr = cleaned_row['date'][:10]
            st = strptime(datestr, "%Y-%m-%d")
            entry_date = date.fromtimestamp(mktime(st))
            typeval_dict = dissoc(cleaned_row, 'id', 'date', 'site_id')
            typeval = first(typeval_dict.items())
            entry = {'type': first(typeval),
                     'value': second(typeval),
                     'date': entry_date}
            print("Adding " + str(entry))
            tbl.insert(entry)
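# How one CSV row becomes an entry, in isolation (toy row; once dissoc
# drops the bookkeeping keys, the single remaining pair is the measurement
# type and its reading):
from toolz import dissoc, first, second

row = {'id': '7', 'date': '2015-03-02 10:00:00', 'site_id': '3', 'temp': '21.5'}
typeval = first(dissoc(row, 'id', 'date', 'site_id').items())
entry_type, value = first(typeval), second(typeval)  # -> ('temp', '21.5')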
def render_stack(self, ents):
    for group in ents:
        ent = first(ents[group])
        self.config.print_fn("{}".format(
            colored.stylize(ent.file_capture(), colored.fg("green"))))
        for i, ent in enumerate(ents[group]):
            maybe_dotdot = "" if i == 0 else "\t:\n"
            self.config.print_fn("{}{}\t{}".format(
                colored.stylize(maybe_dotdot, colored.fg("dark_gray")),
                colored.stylize(ent.pos, colored.fg("dark_gray")),
                pretty(ent.line).strip(),
            ))
        self.config.print_fn("")
def subsample(graphs, targets, subsample_size=100):
    """subsample."""
    tg = zip(targets, graphs)
    num_classes = len(set(targets))
    class_graphs = groupby(lambda x: first(x), tg)
    subgraphs = []
    subtargets = []
    for y in class_graphs:
        # Integer division: a float slice index raises TypeError on Python 3.
        class_subgraphs = class_graphs[y][:subsample_size // num_classes]
        class_subgraphs = [second(x) for x in class_subgraphs]
        subgraphs += class_subgraphs
        subtargets += [y] * len(class_subgraphs)
    subgraphs, subtargets = paired_shuffle(subgraphs, subtargets)
    return list(subgraphs), list(subtargets)
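# The groupby/first/second idiom used above, in isolation (toolz, toy data):
# (target, graph) pairs are grouped by class label, then the graphs are
# pulled back out of each group.
from toolz import groupby, first, second

tg = zip([0, 0, 1], ['g1', 'g2', 'g3'])
by_class = groupby(lambda x: first(x), tg)
# by_class -> {0: [(0, 'g1'), (0, 'g2')], 1: [(1, 'g3')]}
[second(x) for x in by_class[0]]  # -> ['g1', 'g2']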
def test_feature_importance_backward_selection(train_df, train_fn, eval_fn,
                                               split_fn, base_extractor, metric_name):
    features = ["x1", "x2", "x3", "x4", "x5", "x6"]

    logs = feature_importance_backward_selection(
        train_df, train_fn, features, split_fn, eval_fn, base_extractor,
        metric_name, num_removed_by_step=1, threshold=0, early_stop=10,
        iter_limit=50, min_remaining_features=5)
    assert len(get_used_features(first(logs))) <= 5  # stop by remaining features

    logs = feature_importance_backward_selection(
        train_df, train_fn, features, split_fn, eval_fn, base_extractor,
        metric_name, num_removed_by_step=1, threshold=0, early_stop=10,
        iter_limit=1, min_remaining_features=3)
    assert len(logs) == 1  # stop by iter limit

    logs = feature_importance_backward_selection(
        train_df, train_fn, features, split_fn, eval_fn, base_extractor,
        metric_name, num_removed_by_step=1, threshold=1, early_stop=2,
        iter_limit=50, min_remaining_features=1)
    assert len(logs) == 2  # stop by early_stop
def test_poor_man_boruta_selection(train_df, holdout_df, train_fn, eval_fn,
                                   base_extractor, metric_name):
    features = ["x1", "x2", "x3", "x4", "x5", "x6"]

    logs = poor_man_boruta_selection(
        train_df, holdout_df, train_fn, features, eval_fn, base_extractor,
        metric_name, max_removed_by_step=1, threshold=0, early_stop=10,
        iter_limit=50, min_remaining_features=5)
    assert len(get_used_features(first(logs))) <= 6  # stop by remaining features

    logs = poor_man_boruta_selection(
        train_df, holdout_df, train_fn, features, eval_fn, base_extractor,
        metric_name, max_removed_by_step=1, threshold=0, early_stop=10,
        iter_limit=1, min_remaining_features=3)
    assert len(logs) == 1  # stop by iter limit

    logs = poor_man_boruta_selection(
        train_df, holdout_df, train_fn, features, eval_fn, base_extractor,
        metric_name, max_removed_by_step=1, threshold=1, early_stop=2,
        iter_limit=50, min_remaining_features=1)
    assert len(logs) == 2  # stop by early_stop
def stop_by_num_features(logs: ListLogListType,
                         min_num_features: int = 50) -> bool:
    """
    Checks for logs to see if feature selection should stop

    Parameters
    ----------
    logs : list of list of dict
        A list of log-like lists of dictionary evaluations.

    min_num_features: int (default 50)
        The minimum number of features the model can have before stopping

    Returns
    -------
    stop: bool
        A boolean indicating whether to stop the recursion
    """
    return len(get_used_features(first(logs))) <= min_num_features
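# A runnable toy of the threshold check (the flat log shape and the stub
# extractor are assumptions; the real get_used_features walks nested
# validator logs):
def _toy_used_features(log):
    return log['features']

toy_logs = [{'features': ['x1', 'x2', 'x3']}]
len(_toy_used_features(toy_logs[0])) <= 5  # -> True: selection would stop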
def __getattr__(self, attr):
    # Try to do the dataframe things first.
    try:
        value = super().__getattr__(attr)
        if isinstance(value, pandas.DataFrame):
            value = value.pipe(self.__class__)
        return value
    except AttributeError:
        pass
    super().__getattribute__(first(self._get_param_names()))
    # If it ain't a dataframe thing then
    # try each of the extensions.
    if not attr.startswith('_'):
        try:
            return self.pipe(self.env.pipes, attr)
        except Exception:
            pass
    return super().__getattr__(attr)
def balance(graphs, targets, estimator, ratio=2):
    """balance."""
    class_counts = Counter(targets)
    majority_class = None
    max_count = 0
    minority_class = None
    min_count = 1e6
    for class_key in class_counts:
        if max_count < class_counts[class_key]:
            majority_class = class_key
            max_count = class_counts[class_key]
        if min_count > class_counts[class_key]:
            minority_class = class_key
            min_count = class_counts[class_key]
    desired_size = int(min_count * ratio)
    tg = zip(targets, graphs)
    class_graphs = groupby(lambda x: first(x), tg)
    maj_graphs = [second(x) for x in class_graphs[majority_class]]
    min_graphs = [second(x) for x in class_graphs[minority_class]]
    if estimator:
        # select only the instances in the majority class that
        # have a small margin
        preds = estimator.decision_function(maj_graphs)
    else:
        # select at random
        preds = [random.random() for i in range(len(maj_graphs))]
    preds = [abs(pred) for pred in preds]
    pred_graphs = sorted(zip(preds, maj_graphs))[:desired_size]
    maj_graphs = [g for p, g in pred_graphs]
    bal_graphs = min_graphs + maj_graphs
    bal_pos = [minority_class] * len(min_graphs)
    bal_neg = [majority_class] * len(maj_graphs)
    bal_targets = bal_pos + bal_neg
    return paired_shuffle(bal_graphs, bal_targets)
def remove_features_subsets(log_list: LogListType,
                            extractor: ExtractorFnType,
                            metric_name: str,
                            num_removed_by_step: int = 1) -> List[Tuple[str, ...]]:
    """
    Performs feature selection based on the best performing model out of
    several trained models

    Parameters
    ----------
    log_list : list of dict
        A list of log-like dictionary evaluations.

    extractor: function string -> float
        An extractor that takes a string and returns the value of that string
        on a dict

    metric_name: str
        String with the name of the column that refers to the metric column
        to be extracted

    num_removed_by_step: int (default 1)
        The number of features to remove

    Returns
    -------
    keys: list of str
        The remaining keys of feature sets after choosing the current best subset
    """
    best_log = get_best_performing_log(log_list, extractor, metric_name)
    best_subset: List[str] = first(gen_dict_extract('used_subsets', best_log))

    return list(combinations(best_subset, len(best_subset) - num_removed_by_step))
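# What the final combinations step yields, in isolation (toy subset,
# num_removed_by_step=1): every way to drop one key from the best subset.
from itertools import combinations

best_subset = ["first", "second", "third"]
list(combinations(best_subset, len(best_subset) - 1))
# -> [('first', 'second'), ('first', 'third'), ('second', 'third')]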
def feature_importance_backward_selection(train_data: pd.DataFrame,
                                          param_train_fn: TuningLearnerFnType,
                                          features: List[str],
                                          split_fn: SplitterFnType,
                                          eval_fn: EvalFnType,
                                          extractor: ExtractorFnType,
                                          metric_name: str,
                                          num_removed_by_step: int = 5,
                                          threshold: float = 0.005,
                                          early_stop: int = 2,
                                          iter_limit: int = 50,
                                          min_remaining_features: int = 50,
                                          save_intermediary_fn: SaveIntermediaryFnType = None,
                                          n_jobs: int = 1) -> ListLogListType:
    """
    Performs train-evaluation iterations while subsampling the used features
    to compute statistics about feature relevance

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    param_train_fn : function (DataFrame, List of Strings) -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and a
        feature list and returns a predict function, a dataset with training
        predictions and training logs.

    features: list of str
        Elements must be columns of the train_data

    split_fn : function pandas.DataFrame -> list of tuple
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array contains
        validation indexes.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    extractor: function str -> float
        An extractor that takes a string and returns the value of that string
        on a dict

    metric_name: str
        String with the name of the column that refers to the metric column
        to be extracted

    num_removed_by_step: int (default 5)
        Number of features removed at each iteration

    threshold: float (default 0.005)
        Threshold for model performance comparison

    early_stop: int (default 2)
        Number of rounds without improvement before stopping process

    iter_limit: int (default 50)
        Maximum number of iterations before stopping

    min_remaining_features: int (default 50)
        Minimum number of features that should remain in the model. Combining
        num_removed_by_step and iter_limit accomplishes the same functionality
        as this parameter.

    save_intermediary_fn : function(log) -> save to file
        Partially defined saver function that receives a log result from a
        tuning step and appends it into a file
        Example: save_intermediary_result(save_path='tuning.pkl')

    n_jobs : int
        Number of parallel processes to spawn.

    Returns
    -------
    logs: list of list of dict
        A list of log-like lists of dictionary evaluations. Each element of
        the list is a validation step of the algorithm.
""" selector_fn = remove_by_feature_importance(num_removed_by_step=num_removed_by_step) stop_fn = aggregate_stop_funcs( stop_by_no_improvement(extractor=extractor, metric_name=metric_name, early_stop=early_stop, threshold=threshold), stop_by_iter_num(iter_limit=iter_limit), stop_by_num_features(min_num_features=min_remaining_features)) train_fn = lambda df: param_train_fn(df, features) first_logs = parallel_validator(train_data, split_fn, train_fn, eval_fn, n_jobs=n_jobs) logs = [first_logs] while not stop_fn(logs): curr_log = first(logs) new_features = selector_fn(curr_log) new_train_fn = lambda df: param_train_fn(df, new_features) next_log = parallel_validator(train_data, split_fn, new_train_fn, eval_fn, n_jobs=n_jobs) if save_intermediary_fn is not None: save_intermediary_fn(next_log) logs = [next_log] + logs return logs
def filter_first_of_type(entries, wanted_type):
    # Guard against an empty match list: not every `first` implementation
    # returns None for an empty sequence (toolz's raises).
    matches = [entry for entry in entries if entry['type'] == wanted_type]
    return first(matches) if matches else None
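# Usage sketch (toy entries):
entries = [{'type': 'note', 'text': 'a'}, {'type': 'task', 'text': 'b'}]
filter_first_of_type(entries, 'task')   # -> {'type': 'task', 'text': 'b'}
filter_first_of_type(entries, 'event')  # -> None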
def q_num(cell):
    assert cell.metadata.tags
    return first(filter(lambda t: 'q' in t, cell.metadata.tags))
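# Usage sketch with a stand-in cell object (real cells come from nbformat;
# SimpleNamespace just mimics the attribute access used above):
from types import SimpleNamespace

cell = SimpleNamespace(metadata=SimpleNamespace(tags=['q3', 'graded']))
q_num(cell)  # -> 'q3'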
def validator(train_data: pd.DataFrame,
              split_fn: SplitterFnType,
              train_fn: LearnerFnType,
              eval_fn: EvalFnType,
              perturb_fn_train: PerturbFnType = identity,
              perturb_fn_test: PerturbFnType = identity,
              predict_oof: bool = False) -> ValidatorReturnType:
    """
    Splits the training data into folds given by the split function and
    performs a train-evaluation sequence on each fold by calling
    ``validator_iteration``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    split_fn : function pandas.DataFrame -> list of tuple
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array contains
        validation indexes.

    train_fn : function pandas.DataFrame -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and
        returns a predict function, a dataset with training predictions and
        training logs.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    perturb_fn_train : PerturbFnType
        A partially defined corruption function that takes a dataset and
        returns a corrupted dataset. Perturbation applied at train-time.

    perturb_fn_test : PerturbFnType
        A partially defined corruption function that takes a dataset and
        returns a corrupted dataset. Perturbation applied at test-time.

    predict_oof : bool
        Whether to return out of fold predictions on the logs

    Returns
    -------
    A list of log-like dictionary evaluations.
    """
    folds, logs = split_fn(train_data)

    train_fn = compose(train_fn, perturb_fn_train)
    eval_fn = compose(eval_fn, perturb_fn_test)

    def fold_iter(fold: Tuple[int, Tuple[pd.Index, pd.Index]]) -> LogType:
        (fold_num, (train_index, test_indexes)) = fold
        return validator_iteration(train_data, train_index, test_indexes,
                                   fold_num, train_fn, eval_fn, predict_oof)

    zipped_logs = pipe(folds, enumerate, map(fold_iter), partial(zip, logs))

    def _join_split_log(log_tuple: Tuple[LogType, LogType]) -> Tuple[LogType, LogType]:
        train_log = {}
        split_log, validator_log = log_tuple
        train_log["train_log"] = validator_log["train_log"]
        return train_log, assoc(dissoc(validator_log, "train_log"),
                                "split_log", split_log)

    def get_perturbed_columns(perturbator: PerturbFnType) -> List[str]:
        args = inspect.getfullargspec(perturbator).kwonlydefaults
        return args['cols'] if args else []

    train_logs, validator_logs = zip(*map(_join_split_log, zipped_logs))

    first_train_log = first(train_logs)

    perturbator_log = {'perturbated_train': [],
                       'perturbated_test': []}  # type: LogType
    if perturb_fn_train != identity:
        perturbator_log['perturbated_train'] = get_perturbed_columns(perturb_fn_train)
    if perturb_fn_test != identity:
        perturbator_log['perturbated_test'] = get_perturbed_columns(perturb_fn_test)
    first_train_log = assoc(first_train_log, "perturbator_log", perturbator_log)

    return assoc(first_train_log, "validator_log", list(validator_logs))
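# Why the compose order above works: toolz.compose applies right-to-left,
# so the perturbation runs on the data before train_fn/eval_fn see it.
from toolz import compose

f = compose(lambda x: x + 1, lambda x: x * 2)
f(3)  # -> 7, i.e. (3 * 2) + 1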
def first_pane(self):
    first_row = first(self.panes.values())
    return first(first_row.values())
def test_remove_by_feature_importance(logs):
    log = first(logs)
    next_features = remove_by_feature_importance(log, num_removed_by_step=2)

    assert next_features == ["x1", "x3", "x5"]
def get_used_features(log: Dict) -> List[str]:
    return first(gen_dict_extract('features', log))
def order_feature_importance_avg_from_logs(log: Dict) -> List[str]:
    d = first(gen_dict_extract('feature_importance', log))
    return sorted(d, key=d.get, reverse=True)
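# The sort idiom in isolation (toy importance dict): sorting a dict by its
# values, descending, yields the feature names ranked by importance.
d = {'x1': 0.5, 'x2': 0.9, 'x3': 0.1}
sorted(d, key=d.get, reverse=True)  # -> ['x2', 'x1', 'x3']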
def gen_key_avgs_from_iteration(key: str, log: Dict) -> Any:
    return first(gen_dict_extract(key, log))
def modulemap(root, io):
    modules = dirs(root, io)
    return pipe(modules,
                map(lambda m: assoc({}, basename(m), io.yaml(join(m, RUNNER_YAML)))),  # noqa
                filter(lambda m: m[first(m)] is not None),
                merge)  # noqa yapf: disable
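# The toolz pipeline shape used above, with toy stand-ins for dirs/io (the
# configs dict and module names are hypothetical):
from toolz import pipe, merge, first
from toolz.curried import map, filter
from toolz.dicttoolz import assoc

configs = {"a": {"x": 1}, "b": None}
pipe(["a", "b"],
     map(lambda name: assoc({}, name, configs[name])),
     filter(lambda m: m[first(m)] is not None),
     merge)
# -> {'a': {'x': 1}}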
def backward_subset_feature_selection(train_data: pd.DataFrame,
                                      param_train_fn: TuningLearnerFnType,
                                      features_sets: Dict[str, List[str]],
                                      split_fn: SplitterFnType,
                                      eval_fn: EvalFnType,
                                      extractor: ExtractorFnType,
                                      metric_name: str,
                                      threshold: float = 0.005,
                                      num_removed_by_step: int = 3,
                                      early_stop: int = 2,
                                      iter_limit: int = 50,
                                      min_remaining_features: int = 50,
                                      save_intermediary_fn: SaveIntermediaryFnType = None,
                                      n_jobs: int = 1) -> ListLogListType:
    """
    Performs train-evaluation iterations while testing the subsets of features
    to compute statistics about the importance of each feature category

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    param_train_fn : function (pandas.DataFrame, list of str) -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and a
        feature list and returns a predict function, a dataset with training
        predictions and training logs.

    features_sets: dict of str -> list
        Each string key on the dict names a subset of columns from the
        dataset; the function will analyse the influence of each group of
        features on the model performance

    split_fn : function pandas.DataFrame -> list of tuple
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array contains
        validation indexes.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    extractor: function str -> float
        An extractor that takes a string and returns the value of that string
        on a dict

    metric_name: str
        String with the name of the column that refers to the metric column
        to be extracted

    num_removed_by_step: int (default 3)
        Number of features removed at each iteration

    threshold: float (default 0.005)
        Threshold for model performance comparison

    early_stop: int (default 2)
        Number of rounds without improvement before stopping process

    iter_limit: int (default 50)
        Maximum number of iterations before stopping

    min_remaining_features: int (default 50)
        Minimum number of features that should remain in the model. Combining
        num_removed_by_step and iter_limit accomplishes the same functionality
        as this parameter.

    save_intermediary_fn : function(log) -> save to file
        Partially defined saver function that receives a log result from a
        tuning step and appends it into a file
        Example: save_intermediary_result(save_path='tuning.pkl')

    n_jobs : int
        Number of parallel processes to spawn.

    Returns
    -------
    logs: list of list of dict
        A list of log-like lists of dictionary evaluations. Each element of
        the list is a validation step of the algorithm.
""" selector_fn = remove_features_subsets(extractor=extractor, metric_name=metric_name, num_removed_by_step=num_removed_by_step) stop_fn = aggregate_stop_funcs( stop_by_no_improvement_parallel(extractor=extractor, metric_name=metric_name, early_stop=early_stop, threshold=threshold), stop_by_iter_num(iter_limit=iter_limit), stop_by_num_features_parallel(extractor=extractor, metric_name=metric_name, min_num_features=min_remaining_features) ) used_subsets = [features_sets.keys()] used_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in used_subsets] trainers = [lambda df: param_train_fn(df, feat) for feat in used_features] first_val_logs = [parallel_validator(train_data, split_fn, train_func, eval_fn, n_jobs) for train_func in trainers] logs = [[dict(log, **{"used_subsets": list(subset)}) for log, subset in zip(first_val_logs, used_subsets)]] while not stop_fn(logs): curr_log = first(logs) new_subsets = selector_fn(curr_log) new_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in new_subsets] trainers = [lambda df: param_train_fn(df, feat) for feat in new_features] val_logs = [parallel_validator(train_data, split_fn, train_func, eval_fn, n_jobs) for train_func in trainers] new_logs = [dict(log, **{"used_subsets": subset}) for log, subset in zip(val_logs, new_subsets)] if save_intermediary_fn is not None: save_intermediary_fn(new_logs) logs = [new_logs] + logs return logs