def solve(params, set_eta, calc_d2f):
    """Solve the phase field problem with FiPy

    Args:
      params: dictionary of parameter values
      set_eta: function to set the initial value of the phase field
      calc_d2f: function to calculate the second derivative of the free
        energy function

    Returns:
      dictionary of the equation, variables and residuals
    """

    def sweep_wrapper(kwargs):
        """Wrapper for sweep function

        Ensures that residuals tuple has the residual appended to it.
        """
        return pipe(
            dissoc(kwargs, "residuals"),
            lambda x: sweep(params["dt"], calc_d2f, **x),
            lambda x: kwargs["residuals"] + (x,),
            assoc(kwargs, "residuals"),
        )

    return pipe(
        params,
        get_mesh,
        get_vars(params, set_eta),
        lambda x: assoc(x, "equation", get_eq(params, **x)),
        lambda x: assoc(x, "residuals", ()),
        iterate_(sweep_wrapper, params["fipy_iter"]),
    )

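# NOTE: a minimal sketch (not from the source) of the curried-assoc idiom
# used in sweep_wrapper above, assuming toolz.curried: assoc(d, key)
# returns a function that waits for the value, so it can close a pipe.
from toolz.curried import assoc, pipe

kwargs = {"residuals": (1.0, 0.5)}
result = pipe(
    0.25,                          # stand-in for a freshly computed residual
    lambda r: kwargs["residuals"] + (r,),
    assoc(kwargs, "residuals"),    # rebuilds the dict with the new tuple
)
assert result == {"residuals": (1.0, 0.5, 0.25)}
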
def test_sfepy():
    """Run some tests
    """
    assert np.allclose(
        run_sfepy_fake(assoc(get_params(), "delta", 0.1), (10, 10), 1.0)[1][0, 0],
        [-0.00515589, -0.00515589],
    )
    assert np.allclose(
        run_sfepy_fake(assoc(get_params(), "delta", 0.1), (10, 10), 0.1)[1][0, 0],
        [-0.00051559, -0.00051559],
    )

def split_cf_messages(format_message, var_length_key, event,
                      separator=', ', max_length=255):
    """
    Try to split cloud feed log events out into multiple events if the
    message is too long (i.e. when the variable-length field would push
    the formatted message over the maximum length).

    :param str format_message: The format string to use to format the event
    :param str var_length_key: The key in the event dictionary that contains
        the variable-length part of the formatted message.
    :param dict event: The event dictionary
    :param str separator: The separator to use to join the various elements
        that should be varied.  (e.g. if the elements in "var_length_key"
        are ["1", "2", "3"] and the separator is "; ", "var_length_key"
        will be represented as "1; 2; 3")
    :param int max_length: The maximum length of the formatted message.

    :return: `list` of event dictionaries with the formatted message and
        the split event field.
    """
    def length_calc(e):
        return len(format_message.format(**e))

    render = compose(assoc(event, var_length_key), separator.join,
                     curry(map, str))

    if length_calc(event) <= max_length:
        return [(render(event[var_length_key]), format_message)]

    events = split(render, event[var_length_key], max_length, length_calc)
    return [(e, format_message) for e in events]

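# NOTE: toy sketch (not from the source) of the `render` composition above,
# assuming a toolz-style curried assoc: stringify each element, join with
# the separator, then assoc the joined string onto a copy of the event.
# compose applies right to left.
from toolz import compose, curry
from toolz.curried import assoc

event = {'id': 7, 'items': None}
render = compose(assoc(event, 'items'), '; '.join, curry(map, str))
assert render([1, 2, 3]) == {'id': 7, 'items': '1; 2; 3'}
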
def split_list_servers(event, maxlength=event_max_length):
    """
    Split response_body in the listing-servers-detail log such that each
    log's response_body is < maxlength. Since response_body is the only
    large part of this event, it is fine to bound just that by maxlength,
    since maxlength is a generic guideline for length.

    :param dict event: Event to split
    :param int maxlength: Event JSON max length

    :return: List of (event, formatted message) tuples
    """
    message = "Listing server details succeeded"
    _json = json.dumps(event["response_body"])
    if len(_json) < maxlength:
        event["response_body"] = _json
        return [(event, message)]

    def part_json(servers):
        return json.dumps({"servers": servers})

    parts = split(part_json, event["response_body"]["servers"], maxlength, len)
    del event["response_body"]
    return [(assoc(event, "response_body", part), message) for part in parts]

def _join_split_log(
        log_tuple: Tuple[LogType, LogType]) -> Tuple[LogType, LogType]:
    train_log = {}
    split_log, validator_log = log_tuple
    train_log["train_log"] = validator_log["train_log"]
    return train_log, assoc(dissoc(validator_log, "train_log"),
                            "split_log", split_log)

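# NOTE: hypothetical input/output for _join_split_log, to make the dict
# shuffling concrete (the keys and values are illustrative, not from the
# source): the train log is pulled out, and the split log is folded in.
log_tuple = (
    {"train_size": 800},                          # split_log
    {"train_log": {"rmse": 0.1}, "fold_num": 0},  # validator_log
)
assert _join_split_log(log_tuple) == (
    {"train_log": {"rmse": 0.1}},
    {"fold_num": 0, "split_log": {"train_size": 800}},
)
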
def _process_config(self, config: Mapping) -> Mapping:
    processed_config = pipe(
        config,
        assoc(key='Tags',
              value=merge(standard_tags(self), config.get('Tags', {}))),
        # original tags take precedence if there is a conflict
        super()._process_config)
    return processed_config

def _process_config(self, config: Mapping) -> Mapping:
    tags_dict = merge(standard_tags(self), config.get('Tags', {}))
    tags_list = [{'Key': k, 'Value': v} for k, v in tags_dict.items()]
    processed_config = pipe(
        config,
        assoc(key='Tags', value=tags_list),
        super()._process_config,
    )
    return processed_config

def validator(train_data: pd.DataFrame,
              split_fn: SplitterFnType,
              train_fn: LearnerFnType,
              eval_fn: EvalFnType) -> ValidatorReturnType:
    """
    Splits the training data into folds given by the split function and
    performs a train-evaluation sequence on each fold by calling
    ``validator_iteration``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    split_fn : function pandas.DataFrame -> list of tuple
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array
        contains validation indexes.

    train_fn : function pandas.DataFrame -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and
        returns a predict function, a dataset with training predictions and
        training logs.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    Returns
    ----------
    A list of log-like dictionary evaluations.
    """
    folds, logs = split_fn(train_data)

    def fold_iter(fold: Tuple[int, Tuple[pd.Index, pd.Index]]) -> LogType:
        (fold_num, (train_index, test_indexes)) = fold
        return validator_iteration(train_data, train_index, test_indexes,
                                   fold_num, train_fn, eval_fn)

    zipped_logs = pipe(folds,
                       enumerate,
                       map(fold_iter),
                       partial(zip, logs))

    def _join_split_log(
            log_tuple: Tuple[LogType, LogType]) -> Tuple[LogType, LogType]:
        train_log = {}
        split_log, validator_log = log_tuple
        train_log["train_log"] = validator_log["train_log"]
        return train_log, assoc(dissoc(validator_log, "train_log"),
                                "split_log", split_log)

    train_logs, validator_logs = zip(*map(_join_split_log, zipped_logs))

    first_train_log = first(train_logs)
    return assoc(first_train_log, "validator_log", list(validator_logs))

def sweep_wrapper(kwargs):
    """Wrapper for sweep function

    Ensures that residuals tuple has the residual appended to it.
    """
    return pipe(
        dissoc(kwargs, "residuals"),
        lambda x: sweep(params["dt"], calc_d2f, **x),
        lambda x: kwargs["residuals"] + (x,),
        assoc(kwargs, "residuals"),
    )

def test_combined():
    """Run a combined test
    """
    assert pipe(
        get_params(),
        assoc(key="fipy_iter", value=2),
        run_main,
        get("eta"),
        np.array,
        np.sum,
        lambda x: np.allclose(x, 1515.784),
    )

def _process_config(self, config: Mapping) -> Mapping:
    tags = [{'Key': k, 'Value': v}
            for k, v in merge(standard_tags(self),
                              config.get('Tags', {})).items()]
    processed_config = pipe(
        config,
        assoc(key='Tags', value=tags),
        # original tags take precedence if there is a conflict
        super()._process_config)
    return processed_config

def scatter_drag(self, x_points: 'Array', y_points: 'Array', *,
                 show_eqn=True, options={}):
    options = tz.assoc(options, '_fig', self.figure)
    box = scatter_drag(x_points, y_points, show_eqn=show_eqn,
                       options=options)
    widget = box.children[0]
    self.widgets.append(widget)
    return self

def test_fipy():
    """Run the FiPy tests
    """
    assert pipe(
        dict(e11=0.0, e12=0.0, e22=0.0),
        lambda x: np.allclose(
            fipy_solve(
                assoc(get_params(), "fipy_iter", 2),
                set_eta(None),
                calc_d2f(get_params(), x),
            )["residuals"][-1],
            60.736145628467526,
        ),
    )

def _process_config(self, config: Mapping) -> Mapping:
    self.role = role = config['Role']
    assert isinstance(self.role, Role)
    processed_config = pipe(
        config,
        assoc(key='Role', value=role.arn),
        assoc(key='Tags',
              value=merge(standard_tags(self), config.get('Tags', {}))),
        # original tags take precedence if there is a conflict
        super()._process_config,
        dict)
    for config_key, model in self.non_creation_parameters.items():
        if config_key in processed_config:
            operation_name = getattr(
                self.service_client,
                model.create_name + '_' + model.sdk_name)
            operation_model = get_operation_model(self.service_client,
                                                  operation_name)
            value = self._process_config_value(
                None, processed_config[config_key])
            if model.is_collection:
                processed_value = [
                    self._process_config_value(operation_model.input_shape,
                                               elt)
                    for elt in value
                ]
            else:
                processed_value = self._process_config_value(
                    operation_model.input_shape.members[config_key],
                    processed_config[config_key])
            processed_config[config_key] = processed_value
    return processed_config

def fipy_iter(params, data):
    """One FiPy iteration

    Args:
      params: the parameter dictionary
      data: dictionary of the phase field and total strain fields

    Returns:
      updated data dictionary
    """
    return pipe(
        dissoc(data, "eta"),
        calc_d2f(params),
        lambda x: fipy_solve(params, set_eta(data["eta"]), x)["eta"],
        lambda x: assoc(data, "eta", x),
    )

def one_iter(params, data):
    """Do one iteration

    Args:
      params: the parameter dictionary
      data: dictionary of the phase field and strain fields

    Returns:
      dictionary of the phase field and strain fields
    """
    return pipe(
        data,
        fipy_iter(params),
        sfepy_iter(params),
        lambda x: assoc(x, "step_counter", x["step_counter"] + 1),
    )

def add_weight(answer: dict):
    def is_a_matching_question(answer):
        return pipe(
            [answer_keys.match_left, answer_keys.incorrect],
            map(lambda k: k in answer),
            any,
        )

    needs_weight = compose(
        any,
        juxt(complement(is_a_matching_question)),
    )

    if needs_weight(answer):
        return assoc(answer, answer_keys.weight,
                     int(answer.get(answer_keys.weight, 0) and 100))
    return answer

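# NOTE: the weight expression above is terse; `answer.get(key, 0) and 100`
# yields 100 when a truthy weight already exists and 0 otherwise.  Toy
# check, with a literal 'weight' key standing in for answer_keys.weight:
assert int({'weight': 55}.get('weight', 0) and 100) == 100  # truthy -> 100
assert int({}.get('weight', 0) and 100) == 0                # absent -> 0
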
def split_execute_convergence(event, max_length=event_max_length):
    """
    Try to split execute-convergence event out into multiple events if
    there are too many CLB nodes, too many servers, or too many steps.

    The problem is mainly the servers, since they take up the most space.

    Experimentally determined that logs probably get cut off at around
    75k characters, so we limit it to 50k.

    :param dict event: The 'execute-convergence' type event dictionary to
        split
    :param int max_length: The maximum length of the entire JSON-formatted
        dictionary.

    :return: `list` of `tuple` of (`dict`, `str`).  The `dict`s in the
        tuple represent the split-up event dicts, and the `str` the format
        string for each.  If the event does not need to be split, the list
        will only have one tuple.
    """
    message = "Executing convergence"
    if _json_len(event) <= max_length:
        return [(event, message)]

    events = [(event, message)]
    large_things = sorted(('servers', 'lb_nodes'),
                          key=compose(_json_len, event.get), reverse=True)

    # simplified event which serves as a base for the split out events
    base_event = keyfilter(
        lambda k: k not in ('desired', 'servers', 'lb_nodes', 'steps'),
        event)

    for thing in large_things:
        split_up_events = split(assoc(base_event, thing), event[thing],
                                max_length, _json_len)
        events.extend([(e, message) for e in split_up_events])
        del event[thing]
        if _json_len(event) <= max_length:
            break

    return events

def make_csv(columns, number, size, filename):
    return pipe(
        'data.json',
        lambda x: loadfn(x, cls=MontyDecoder)[:number],
        map(
            lambda x: assoc(
                x,
                key='formula',
                value=x['final_str'].composition.reduced_formula
            ),
        ),
        list,
        lambda x: pandas.DataFrame(x),
        lambda x: x[columns],
        lambda x: x.to_csv('tmp.csv', index=False),
        lambda _: pandas.read_csv('tmp.csv', na_values=['None', 'na']),
        # .ix was removed from pandas; .loc keeps the same label-based,
        # inclusive slicing on the freshly reset integer index
        lambda x: x.dropna().reset_index(drop=True).loc[:size],
        lambda x: x.to_csv(filename, index=False)
    )

def calc_gradient_free_energy(data):
    """Calculate the gradient free energy for one time step

    Args:
      data: dictionary of data from an output file for a given time step

    Returns:
      a float representing the gradient free energy for a given time step
    """
    func = sequence(
        lambda x: get_vars(x, set_eta(data["eta"]), get_mesh(x)),
        get("eta"),
        lambda x: x.grad.mag ** 2,
    )
    return pipe(
        data["params"].item(),
        lambda x: assoc(x, "dx", x["lx"] / x["nx"]),
        lambda x: func(x) * (x["kappa"] / 2) * calc_dx2(x),
        np.array,
        np.sum,
    )

    read_and_plot(calc_position_d)(ctx)


@cli.command()
@click.pass_context
def elastic_free_energy(ctx):
    """Command to plot the elastic free energy
    """
    read_and_plot(calc_elastic_free_energy)(ctx)


calc_dx2 = lambda x: (x["lx"] / x["nx"]) ** 2

calc_elastic_free_energy = sequence(
    lambda x: assoc(x, "params", x["params"].item()),
    lambda x: assoc(x, "dx", x["params"]["lx"] / x["params"]["nx"]),
    lambda x: assoc(x, "total_strain",
                    dict(e11=x["e11"], e22=x["e22"], e12=x["e12"])),
    lambda x: calc_elastic_f(x["params"], x["total_strain"], x["eta"])
    * calc_dx2(x["params"]),
    np.sum,
)


@cli.command()
@click.pass_context
def bulk_free_energy(ctx):
    """Command to plot the bulk free energy
    """
    read_and_plot(calc_bulk_free_energy)(ctx)

def validator(train_data: pd.DataFrame,
              split_fn: SplitterFnType,
              train_fn: LearnerFnType,
              eval_fn: EvalFnType,
              perturb_fn_train: PerturbFnType = identity,
              perturb_fn_test: PerturbFnType = identity,
              predict_oof: bool = False) -> ValidatorReturnType:
    """
    Splits the training data into folds given by the split function and
    performs a train-evaluation sequence on each fold by calling
    ``validator_iteration``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    split_fn : function pandas.DataFrame -> list of tuple
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array
        contains validation indexes.

    train_fn : function pandas.DataFrame -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and
        returns a predict function, a dataset with training predictions and
        training logs.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    perturb_fn_train : PerturbFnType
        A partially defined corruption function that takes a dataset and
        returns a corrupted dataset. Perturbation applied at train-time.

    perturb_fn_test : PerturbFnType
        A partially defined corruption function that takes a dataset and
        returns a corrupted dataset. Perturbation applied at test-time.

    predict_oof : bool
        Whether to return out of fold predictions on the logs

    Returns
    ----------
    A list of log-like dictionary evaluations.
    """
    folds, logs = split_fn(train_data)

    train_fn = compose(train_fn, perturb_fn_train)
    eval_fn = compose(eval_fn, perturb_fn_test)

    def fold_iter(fold: Tuple[int, Tuple[pd.Index, pd.Index]]) -> LogType:
        (fold_num, (train_index, test_indexes)) = fold
        return validator_iteration(train_data, train_index, test_indexes,
                                   fold_num, train_fn, eval_fn, predict_oof)

    zipped_logs = pipe(folds,
                       enumerate,
                       map(fold_iter),
                       partial(zip, logs))

    def _join_split_log(
            log_tuple: Tuple[LogType, LogType]) -> Tuple[LogType, LogType]:
        train_log = {}
        split_log, validator_log = log_tuple
        train_log["train_log"] = validator_log["train_log"]
        return train_log, assoc(dissoc(validator_log, "train_log"),
                                "split_log", split_log)

    def get_perturbed_columns(perturbator: PerturbFnType) -> List[str]:
        args = inspect.getfullargspec(perturbator).kwonlydefaults
        return args['cols'] if args else []

    train_logs, validator_logs = zip(*map(_join_split_log, zipped_logs))

    first_train_log = first(train_logs)

    perturbator_log = {'perturbated_train': [],
                       'perturbated_test': []}  # type: LogType
    if perturb_fn_train != identity:
        perturbator_log['perturbated_train'] = get_perturbed_columns(
            perturb_fn_train)
    if perturb_fn_test != identity:
        perturbator_log['perturbated_test'] = get_perturbed_columns(
            perturb_fn_test)
    first_train_log = assoc(first_train_log, "perturbator_log",
                            perturbator_log)

    return assoc(first_train_log, "validator_log", list(validator_logs))

def parallel_validator(train_data: pd.DataFrame,
                       split_fn: SplitterFnType,
                       train_fn: LearnerFnType,
                       eval_fn: EvalFnType,
                       n_jobs: int = 1,
                       predict_oof: bool = False) -> ValidatorReturnType:
    """
    Splits the training data into folds given by the split function and
    performs a train-evaluation sequence on each fold. Tries to run each
    fold in parallel using up to n_jobs processes.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    split_fn : function pandas.DataFrame -> list of tuple
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array
        contains validation indexes.

    train_fn : function pandas.DataFrame -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and
        returns a predict function, a dataset with training predictions and
        training logs.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    n_jobs : int
        Number of parallel processes to spawn.

    predict_oof : bool
        Whether to return out of fold predictions on the logs

    Returns
    ----------
    A list of log-like dictionary evaluations.
    """
    folds, logs = split_fn(train_data)

    dumped_train_fn = cloudpickle.dumps(train_fn)
    dumped_eval_fn = cloudpickle.dumps(eval_fn)

    result = Parallel(n_jobs=n_jobs, backend="threading")(
        delayed(parallel_validator_iteration)(train_data, x, dumped_train_fn,
                                              dumped_eval_fn, predict_oof)
        for x in enumerate(folds))
    gc.collect()

    train_log = {"train_log": [fold_result["train_log"]
                               for fold_result in result]}

    @curry
    def kwdissoc(d: Dict, key: str) -> Dict:
        return dissoc(d, key)

    validator_logs = pipe(
        result,
        partial(zip, logs),
        map(lambda log_tuple: assoc(log_tuple[1], "split_log", log_tuple[0])),
        map(kwdissoc(key="train_log")),
        list)

    return assoc(train_log, "validator_log", validator_logs)

def validator_iteration(data: pd.DataFrame,
                        train_index: pd.Index,
                        test_indexes: pd.Index,
                        fold_num: int,
                        train_fn: LearnerFnType,
                        eval_fn: EvalFnType,
                        predict_oof: bool = False) -> LogType:
    """
    Perform an iteration of train test split, training and evaluation.

    Parameters
    ----------
    data : pandas.DataFrame
        A Pandas' DataFrame with training and testing subsets

    train_index : numpy.Array
        The index of the training subset of `data`.

    test_indexes : list of numpy.Array
        A list of indexes of the testing subsets of `data`.

    fold_num : int
        The number of the fold in the current iteration

    train_fn : function pandas.DataFrame -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and
        returns a predict function, a dataset with training predictions and
        training logs.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    predict_oof : bool
        Whether to return out of fold predictions on the logs

    Returns
    ----------
    A log-like dictionary with the evaluations.
    """
    train_data = data.iloc[train_index]

    if train_data.shape[0] == 0:
        warnings.warn("Splitter on validator_iteration is generating an "
                      "empty training dataset. train_data.shape is %s"
                      % str(train_data.shape))

    predict_fn, train_out, train_log = train_fn(train_data)

    eval_results = []
    oof_predictions = []
    for test_index in test_indexes:
        test_predictions = predict_fn(data.iloc[test_index])
        eval_results.append(eval_fn(test_predictions))
        if predict_oof:
            oof_predictions.append(test_predictions)

    logs = {'fold_num': fold_num,
            'train_log': train_log,
            'eval_results': eval_results}

    return assoc(logs, "oof_predictions", oof_predictions) \
        if predict_oof else logs

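# NOTE: minimal sketch (not from the source) of the conditional-assoc
# idiom closing validator_iteration: extend the immutable log dict only
# when out-of-fold predictions were requested.  Assumes toolz.
from toolz import assoc

logs = {'fold_num': 0, 'eval_results': [0.42]}
predict_oof = True
final = assoc(logs, 'oof_predictions', ['preds']) if predict_oof else logs
assert 'oof_predictions' in final and 'oof_predictions' not in logs
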
def hist(self, hist_function, *, options={}, **interact_params):
    options = tz.assoc(options, '_fig', self.figure)
    box = hist(hist_function, options=options, **interact_params)
    widget = box.children[0]
    self.widgets.append(widget)
    return self

def _process_config(self, config: Mapping) -> Mapping:
    tags_dict = merge(standard_tags(self), config.get('Description', {}))
    processed_config = pipe(
        config,
        assoc(key='Description', value=tags_dict),
        super()._process_config)
    return processed_config

def rename_key(d, key_name, key_new_name):
    return assoc(dissoc(d, key_name), key_new_name, d[key_name])

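# NOTE: toy usage (not from the source): rename_key returns a copy and
# leaves the input dict untouched, since assoc/dissoc are non-mutating.
d = {'linecol': (3, 7), 'name': 'V1'}
assert rename_key(d, 'linecol', 'tgvh_linecol') == {'tgvh_linecol': (3, 7),
                                                    'name': 'V1'}
assert d == {'linecol': (3, 7), 'name': 'V1'}  # original unchanged
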
def load_regles_nodes(json_file_name):
    return pipe(
        read_ast_json_file(json_file_name),
        filter(lambda node: 'batch' in node['applications']),
        map(lambda d: assoc(d, 'source_file_name',
                            '{}.m'.format(os.path.splitext(json_file_name)[0]))),
    )

def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-d', '--debug', action='store_true', default=False,
                        help='Display debug messages')
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Increase output verbosity')
    global args
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug
        else (logging.INFO if args.verbose else logging.WARNING),
        stream=sys.stdout,
    )

    if not os.path.isdir(json_dir_path):
        os.mkdir(json_dir_path)
    if not os.path.isdir(ast_dir_path):
        os.mkdir(ast_dir_path)

    # Load variables definitions
    tgvh_infos = list(load_tgvH_file())

    # Write constants
    constant_by_name = pipe(
        tgvh_infos,
        filter(lambda val: val['type'] == 'variable_const'),
        map(lambda d: (d['name'], d['value'])),
        dict,
    )
    write_json_file(data=constant_by_name, file_name='constants.json')

    # Write variables dependencies
    regles_nodes = list(mapcat(
        load_regles_nodes,
        iter_json_file_names('chap-*.json', 'res-ser*.json')))
    dependencies_by_formula_name = dict(list(mapcat(
        dependencies_visitors.visit_node, regles_nodes)))
    write_json_file(data=dependencies_by_formula_name,
                    file_name='formulas_dependencies.json')

    # Write variables definitions
    ast_infos_by_variable_name = {}
    for regle_node in regles_nodes:
        regle_infos = {
            'regle_applications': regle_node['applications'],
            'regle_linecol': regle_node['linecol'],
            'regle_name': regle_node['name'],
            'source_file_name': regle_node['source_file_name'],
        }
        regle_tags = list(pluck('value', regle_node.get('tags', [])))
        if regle_tags:
            regle_infos['regle_tags'] = regle_tags
        for formula_node in regle_node['formulas']:
            if formula_node['type'] == 'formula':
                ast_infos_by_variable_name[formula_node['name']] = assoc(
                    regle_infos, 'formula_linecol', formula_node['linecol'])
            elif formula_node['type'] == 'pour_formula':
                for unlooped_formula_node in unloop_helpers.iter_unlooped_nodes(
                        loop_variables_nodes=formula_node['loop_variables'],
                        node=formula_node['formula'],
                        unloop_keys=['name'],
                        ):
                    pour_formula_infos = merge(regle_infos, {
                        'pour_formula_linecol': formula_node['formula']['linecol'],
                        'pour_formula_name': formula_node['formula']['name'],
                    })
                    ast_infos_by_variable_name[unlooped_formula_node['name']] = \
                        pour_formula_infos
            else:
                assert False, 'Unhandled formula_node type: {}'.format(formula_node)

    def rename_key(d, key_name, key_new_name):
        return assoc(dissoc(d, key_name), key_new_name, d[key_name])

    tgvh_infos_by_variable_name = pipe(
        tgvh_infos,
        filter(lambda d: d['type'] in ('variable_calculee', 'variable_saisie')),
        map(lambda d: rename_key(d, 'linecol', 'tgvh_linecol')),
        map(lambda d: (d['name'], d)),  # Index by name
        dict,
    )

    definition_by_variable_name = merge_with(merge,
                                             ast_infos_by_variable_name,
                                             tgvh_infos_by_variable_name)

    write_json_file(data=definition_by_variable_name,
                    file_name='variables_definitions.json')
    return 0

def line(self, x_fn, y_fn, *, options={}, **interact_params):
    options = tz.assoc(options, '_fig', self.figure)
    box = line(x_fn, y_fn, options=options, **interact_params)
    widget = box.children[0]
    self.widgets.append(widget)
    return self

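# NOTE: a minimal sketch (not from the source) of why these plotting
# helpers reach for tz.assoc: `options={}` is a shared mutable default,
# and tz.assoc returns a new dict instead of mutating it in place.
import toolz as tz

def mutating(options={}):
    options['_fig'] = 'fig'          # pollutes the shared default dict
    return options

def pure(options={}):
    return tz.assoc(options, '_fig', 'fig')  # default stays untouched

mutating()
assert mutating.__defaults__[0] == {'_fig': 'fig'}
pure()
assert pure.__defaults__[0] == {}
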
def spatial_learning_curve_splitter(train_data: pd.DataFrame,
                                    space_column: str,
                                    time_column: str,
                                    training_limit: DateType,
                                    holdout_gap: timedelta = timedelta(days=0),
                                    train_percentages: Iterable[float] = (0.25, 0.5, 0.75, 1.0),
                                    random_state: int = None) -> SplitterReturnType:
    """
    Splits the data for a spatial learning curve. Progressively adds more
    and more examples to the training in order to verify the impact of
    having more data available on a validation set.

    The validation set starts after the training set, with an optional
    time gap.

    Similar to the temporal learning curves, but with spatial increases
    in the training set.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame that will be split for learning curve
        estimation.

    space_column : str
        The name of the ID column of `train_data`.

    time_column : str
        The name of the temporal column of `train_data`.

    training_limit: datetime or str
        The date limiting the training (after which the holdout begins).

    holdout_gap: timedelta
        The gap between the end of training and the start of the holdout.
        If you have censored data, use a gap similar to the censor time.

    train_percentages: list or tuple of floats
        A list containing the percentages of IDs to use in the training.
        Defaults to (0.25, 0.5, 0.75, 1.0). For example: For the default
        value, there would be four model trainings, containing
        respectively 25%, 50%, 75%, and 100% of the IDs that are not part
        of the held out set.

    random_state : int
        A seed for the random number generator that shuffles the IDs.
    """
    if np.min(train_percentages) < 0 or np.max(train_percentages) > 1:
        raise ValueError('Train percentages must be between 0 and 1')

    if isinstance(training_limit, str):
        training_limit = datetime.strptime(training_limit, "%Y-%m-%d")

    if training_limit < train_data[time_column].min() \
            or training_limit > train_data[time_column].max():
        raise ValueError('Temporal training limit should be within datasets '
                         'temporal bounds (min and max times)')
    if timedelta(days=0) > holdout_gap:
        raise ValueError('Holdout gap cannot be negative')
    if holdout_gap >= (train_data[time_column].max() - training_limit):
        raise ValueError('After taking the gap into account, there should '
                         'be enough time for the holdout set')

    train_data = train_data.reset_index()

    # We need to sample the space column before getting its unique values
    # so their order in the DF won't matter here
    spatial_ids = train_data[space_column].sample(
        frac=1, random_state=random_state).unique()

    cumulative_ids = pipe(
        spatial_ids,
        # Get the corresponding indices for each %
        lambda ids: (np.array(train_percentages) * len(ids)).astype(int),
        # Split spatial ids by the indices
        lambda idx: np.split(spatial_ids, idx)[:-1],
        # Transform sub-arrays into sub-lists
        lambda l: map(lambda x: x.tolist(), l),
        # Drop empty sub-lists
        lambda l: filter(None, l),
        # Cumulative sum of lists
        accumulate(operator.add),
    )

    validation_set = train_data[
        train_data[time_column] > (training_limit + holdout_gap)]
    train_data = train_data[train_data[time_column] <= training_limit]

    folds = [(train_data[train_data[space_column].isin(ids)][time_column],
              validation_set[time_column]) for ids in cumulative_ids]
    folds_indices = _lc_fold_to_indexes(folds)  # final formatting with idx

    logs = [assoc(learner, "percentage", p)
            for learner, p in zip(map(_log_time_fold, folds),
                                  train_percentages)]

    return folds_indices, logs

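# NOTE: the `cumulative_ids` pipe above may be easier to follow on toy
# data; a sketch (not from the source) assuming toolz.curried:
import operator
import numpy as np
from toolz.curried import accumulate, filter, map, pipe

spatial_ids = np.array(['a', 'b', 'c', 'd'])
train_percentages = (0.25, 0.5, 0.75, 1.0)
cumulative_ids = pipe(
    spatial_ids,
    lambda ids: (np.array(train_percentages) * len(ids)).astype(int),  # [1 2 3 4]
    lambda idx: np.split(spatial_ids, idx)[:-1],  # [['a'], ['b'], ['c'], ['d']]
    lambda l: map(lambda x: x.tolist(), l),
    lambda l: filter(None, l),
    accumulate(operator.add),
)
assert list(cumulative_ids) == [['a'], ['a', 'b'], ['a', 'b', 'c'],
                                ['a', 'b', 'c', 'd']]
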
def modulemap(root, io):
    modules = dirs(root, io)
    return pipe(
        modules,
        map(lambda m: assoc({}, basename(m), io.yaml(join(m, RUNNER_YAML)))),
        filter(lambda m: m[first(m)] is not None),
        merge,
    )  # yapf: disable

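# NOTE: toy sketch (not from the source) of the modulemap shape: one
# single-key dict per module, dropping modules whose config is None,
# then merging into a single mapping.  Assumes toolz.
from toolz import assoc, first, merge

configs = [('a', {'cmd': 'run'}), ('b', None), ('c', {'cmd': 'test'})]
parts = [assoc({}, name, cfg) for name, cfg in configs]
parts = [p for p in parts if p[first(p)] is not None]
assert merge(parts) == {'a': {'cmd': 'run'}, 'c': {'cmd': 'test'}}
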
def transform_year(d):
    if 'year' in d:
        return _.assoc(d, 'year', int(d['year']))
    return d

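# NOTE: toy usage (not from the source), assuming `_` is a toolz-style
# namespace whose assoc returns a new dict: the string year is coerced
# to int, and records without a 'year' pass through unchanged.
assert transform_year({'year': '1999', 'title': 'X'}) == {'year': 1999,
                                                          'title': 'X'}
assert transform_year({'title': 'X'}) == {'title': 'X'}
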