def evaluate(self, metric, batch_size=256, num_workers=8, eval_train=False,
             eval_skip=(), save=True, **kwargs):
    """Evaluate the model on the validation set

    Args:
        metric: a function accepting (y_true, y_pred) and returning the evaluation metric(s)
        batch_size: batch size to use for evaluation
        num_workers: number of parallel workers for data loading
        eval_train: if True, also compute the evaluation metrics on the training set
        eval_skip: dataset names to skip during evaluation
        save: save the json file to the output directory
    """
    if len(kwargs) > 0:
        logger.warning(f"Extra kwargs were provided to trainer.evaluate(): {kwargs}")
    # Save the complete model -> HACK
    self.seq_model.save(os.path.join(self.output_dir, 'seq_model.pkl'))

    # construct a list of datasets to evaluate
    if eval_train:
        eval_datasets = [('train', self.train_dataset)] + self.valid_dataset
    else:
        eval_datasets = self.valid_dataset

    # skip some datasets for evaluation
    try:
        if len(eval_skip) > 0:
            logger.info(f"Using eval_skip: {eval_skip}")
            eval_datasets = [(k, v) for k, v in eval_datasets if k not in eval_skip]
    except Exception:
        logger.warning(f"eval datasets don't contain tuples. Unable to skip them using {eval_skip}")

    metric_res = OrderedDict()
    for d in eval_datasets:
        if len(d) == 2:
            dataset_name, dataset = d
            eval_metric = None  # Ignore the provided metric
        elif len(d) == 3:
            # specialized evaluation metric was passed
            dataset_name, dataset, eval_metric = d
        else:
            raise ValueError("Valid dataset needs to be a list of tuples of 2 or 3 elements "
                             "(name, dataset) or (name, dataset, metric)")
        logger.info(f"Evaluating dataset: {dataset_name}")
        metric_res[dataset_name] = self.seq_model.evaluate(dataset,
                                                           eval_metric=eval_metric,
                                                           num_workers=num_workers,
                                                           batch_size=batch_size)
    if save:
        write_json(metric_res, self.evaluation_path, indent=2)
        logger.info("Saved metrics to {}".format(self.evaluation_path))

    if self.cometml_experiment is not None:
        self.cometml_experiment.log_metrics(flatten(metric_res, separator='/'), prefix="eval/")

    if self.wandb_run is not None:
        self.wandb_run.summary.update(flatten(dict_prefix_key(metric_res, prefix="eval/"),
                                              separator='/'))

    return metric_res
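# A minimal metric compatible with evaluate() above: per the docstring it
# accepts (y_true, y_pred) and returns the metric value(s); returning a dict
# of scalars as shown here is an assumption for illustration.
def mse_metric(y_true, y_pred):
    return {"mse": float(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))}

# usage sketch (hypothetical trainer instance):
#   metric_res = trainer.evaluate(mse_metric, batch_size=128)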
def flatten_batch(batch, nested_sep="/"):
    """Convert the nested batch of numpy arrays into a dictionary of 1-dimensional numpy arrays

    Args:
        batch: batch of data
        nested_sep: separator to use for flattening the nested dictionary structure into a single key

    Returns:
        A dictionary of 1-dimensional numpy arrays.
    """
    def array2array_dict(arr):
        """Convert a numpy array into a dictionary of numpy arrays

        >>> arr = np.arange(9).reshape((1, 3, 3))
        >>> assert array2array_dict(arr)["0"]["1"][0] == arr[:, 0, 1][0]
        """
        if isinstance(arr, np.ndarray):
            if arr.ndim <= 1:
                return arr
            else:
                # recursively split along the second axis
                return collections.OrderedDict([(str(i), array2array_dict(arr[:, i]))
                                                for i in range(arr.shape[1])])
        elif isinstance(arr, pd.DataFrame):
            # one 1-d array per column
            return {k: v.values for k, v in arr.items()}
        elif (arr.__class__.__module__, arr.__class__.__name__) == ('kipoi.metadata', 'GenomicRanges'):
            return arr.to_dict()
        else:
            raise ValueError("Unknown data type: %s" % str(type(arr)))

    return flatten(map_nested(batch, array2array_dict), separator=nested_sep)
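# Usage sketch for flatten_batch above (the batch layout is hypothetical).
# A nested batch with a 2-d array becomes a flat dict of 1-d arrays keyed by
# "/"-joined paths; trailing indices enumerate the second axis.
def _flatten_batch_example():
    batch = {"inputs": {"seq": np.zeros((16, 4))}, "targets": np.zeros(16)}
    fbatch = flatten_batch(batch)
    assert set(fbatch) == {"inputs/seq/%d" % i for i in range(4)} | {"targets"}
    assert all(v.shape == (16,) for v in fbatch.values())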
def test_unflatten_with_list_issue15():
    """https://github.com/amirziai/flatten/issues/15"""
    dic = {
        "Required": {
            "a": "1",
            "b": ["1", "2", "3"],
            "c": {
                "d": {
                    "e": [[{"s1": 1}, {"s2": 2}],
                          [{"s3": 1}, {"s4": 2}]]
                }
            },
            "f": ["1", "2"]
        },
        "Optional": {
            "x": "1",
            "y": ["1", "2", "3"]
        }
    }
    dic_flatten = flatten(dic)
    actual = unflatten_list(dic_flatten)
    assert actual == dic
def test_flatten_dict(nested_dict):
    fd = flatten(nested_dict)
    assert dict(fd) == {
        'a': 1,
        'b_c': 3,
        'b_d_0': 1,
        'b_d_1': 2,
        'b_d_2': 3,
        'b_e_0_f': 1,
        'b_e_1_g': 4
    }
    assert unflatten_list(fd) == dict(nested_dict)
def test_list_and_dict():
    dic = {'a': 1, 'b': 2, 'c': [{'d': [2, 3, 4], 'e': [{'f': 1, 'g': 2}]}]}
    expected = {
        'a': 1,
        'b': 2,
        'c_0_d_0': 2,
        'c_0_d_1': 3,
        'c_0_d_2': 4,
        'c_0_e_0_f': 1,
        'c_0_e_0_g': 2
    }
    actual = flatten(dic)
    assert actual == expected
def test_blog_example():
    dic = {"a": 1, "b": 2, "c": [{"d": ['2', 3, 4], "e": [{"f": 1, "g": 2}]}]}
    expected = {
        'a': 1,
        'b': 2,
        'c_0_d_0': '2',
        'c_0_d_1': 3,
        'c_0_d_2': 4,
        'c_0_e_0_f': 1,
        'c_0_e_0_g': 2
    }
    actual = flatten(dic)
    assert actual == expected
def batch_write(self, batch):
    """Write a batch of data to the HDF5 file

    # Arguments
        batch: batch of data. Either a single `np.array` or a list/dict thereof.
    """
    fbatch = flatten(batch, separator="/")

    batch_sizes = [fbatch[k].shape[0] for k in fbatch]
    # assert all shapes are the same
    assert len(pd.Series(batch_sizes).unique()) == 1
    batch_size = batch_sizes[0]

    if self.first_pass:
        # create one resizable dataset per flattened key
        for k in fbatch:
            if fbatch[k].dtype.type == np.dtype("object"):
                import h5py
                # assume that all elements of fbatch[k] have the same dtype
                dtype = fbatch[k][0].dtype
                dtype = h5py.special_dtype(vlen=dtype)
                # TODO: h5py.special_dtype is deprecated from h5py >= 2.10;
                # eventually change to h5py.vlen_dtype
            elif fbatch[k].dtype.type in [np.string_, np.str_, np.unicode_]:
                dtype = self.string_type
            else:
                dtype = fbatch[k].dtype
            self.f.create_dataset(k,
                                  shape=(0,) + fbatch[k].shape[1:],
                                  dtype=dtype,
                                  maxshape=(None,) + fbatch[k].shape[1:],
                                  compression=self.compression,
                                  chunks=(self.chunk_size,) + fbatch[k].shape[1:])
        self.first_pass = False

    # add data to the buffer
    if self.write_buffer is None:
        self.write_buffer = [fbatch]
        self.write_buffer_size = batch_size
    else:
        self.write_buffer.append(fbatch)
        self.write_buffer_size += batch_size

    if self.write_buffer is not None and self.write_buffer_size >= self.chunk_size:
        self._flush_buffer()
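# Migration sketch for the TODO above: on h5py >= 2.10, h5py.vlen_dtype is the
# documented replacement for h5py.special_dtype(vlen=...). Both return an
# object dtype tagged with variable-length metadata for the given base type.
def _vlen_dtype_example():
    import h5py
    old = h5py.special_dtype(vlen=np.dtype("float32"))
    new = h5py.vlen_dtype(np.dtype("float32"))
    assert h5py.check_vlen_dtype(old) == h5py.check_vlen_dtype(new) == np.dtype("float32")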
def test_unflatten_with_list_deep():
    dic = {
        'a': [{
            'b': [{
                'c': [{
                    'a': 5,
                    'b': {'a': [1, 2, 3]},
                    'c': {'x': 3}
                }]
            }]
        }]
    }
    dic_flatten = flatten(dic)
    actual = unflatten_list(dic_flatten)
    assert actual == dic
def __init__(self, file_path, metadata_schema, header=True):
    self.file_path = file_path
    self.header = header
    self.first_pass = True

    f_dl_schema = flatten(metadata_schema)
    range_keys = ["metadata/" + k for k in f_dl_schema
                  if f_dl_schema[k].type == MetadataType.GENOMIC_RANGES]
    if len(range_keys) > 1:
        raise ValueError(
            "Found multiple genomic ranges in metadata: {0}. For writing to the "
            "bed file exactly one genomic range has to exist".format(range_keys))
    elif len(range_keys) == 0:
        raise ValueError(
            "Found no genomic ranges in metadata. For writing to the "
            "bed file exactly one genomic range has to exist")
    self.ranges_key = range_keys[0]
def batch_write(self, batch):
    """Write a batch of data to the HDF5 file

    # Arguments
        batch: batch of data. Either a single `np.array` or a list/dict thereof.
    """
    fbatch = flatten(batch, separator="/")

    batch_sizes = [fbatch[k].shape[0] for k in fbatch]
    # assert all shapes are the same
    assert len(pd.Series(batch_sizes).unique()) == 1
    batch_size = batch_sizes[0]

    if self.first_pass:
        # create one resizable dataset per flattened key
        for k in fbatch:
            if fbatch[k].dtype.type in [np.string_, np.str_, np.unicode_]:
                dtype = self.string_type
            else:
                dtype = fbatch[k].dtype
            self.f.create_dataset(k,
                                  shape=(0,) + fbatch[k].shape[1:],
                                  dtype=dtype,
                                  maxshape=(None,) + fbatch[k].shape[1:],
                                  compression=self.compression,
                                  chunks=(self.chunk_size,) + fbatch[k].shape[1:])
        self.first_pass = False

    # add data to the buffer
    if self.write_buffer is None:
        self.write_buffer = [fbatch]
        self.write_buffer_size = batch_size
    else:
        self.write_buffer.append(fbatch)
        self.write_buffer_size += batch_size

    if self.write_buffer is not None and self.write_buffer_size >= self.chunk_size:
        self._flush_buffer()
def test_no_flatten():
    dic = {'a': '1', 'b': '2', 'c': 3}
    expected = dic
    actual = flatten(dic)
    assert actual == expected
def train(self,
          batch_size=256,
          epochs=100,
          early_stop_patience=4,
          num_workers=8,
          train_epoch_frac=1.0,
          valid_epoch_frac=1.0,
          train_samples_per_epoch=None,
          validation_samples=None,
          train_batch_sampler=None,
          tensorboard=True):
    """Train the model

    Args:
        batch_size: batch size to use for training
        epochs: maximum number of epochs to train for
        early_stop_patience: early stopping patience
        num_workers: how many workers to use in parallel
        train_epoch_frac: if smaller than 1, then make the epoch shorter
        valid_epoch_frac: same as train_epoch_frac for the validation dataset
        train_samples_per_epoch: if given, number of training samples per epoch (overrides train_epoch_frac)
        validation_samples: if given, number of validation samples (overrides valid_epoch_frac)
        train_batch_sampler: batch sampler for training. Useful for, say, stratified sampling
        tensorboard: if True, tensorboard output will be added
    """
    if train_batch_sampler is not None:
        train_it = self.train_dataset.batch_train_iter(shuffle=False,
                                                       batch_size=1,
                                                       drop_last=None,
                                                       batch_sampler=train_batch_sampler,
                                                       num_workers=num_workers)
    else:
        train_it = self.train_dataset.batch_train_iter(batch_size=batch_size,
                                                       shuffle=True,
                                                       num_workers=num_workers)
    next(train_it)  # draw one batch so that data-loading errors surface before training
    valid_dataset = self.valid_dataset[0][1]  # take the first one
    valid_it = valid_dataset.batch_train_iter(batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=num_workers)
    next(valid_it)

    if tensorboard:
        tb = [TensorBoard(log_dir=self.output_dir)]
    else:
        tb = []

    if self.wandb_run is not None:
        from wandb.keras import WandbCallback
        wcp = [WandbCallback(save_model=False)]  # we save the model using ModelCheckpoint
    else:
        wcp = []

    # train the model
    if len(valid_dataset) == 0:
        raise ValueError("len(self.valid_dataset) == 0")

    if train_samples_per_epoch is None:
        train_steps_per_epoch = max(int(len(self.train_dataset) / batch_size * train_epoch_frac), 1)
    else:
        train_steps_per_epoch = max(int(train_samples_per_epoch / batch_size), 1)

    if validation_samples is None:
        # parametrize with valid_epoch_frac
        validation_steps = max(int(len(valid_dataset) / batch_size * valid_epoch_frac), 1)
    else:
        validation_steps = max(int(validation_samples / batch_size), 1)

    self.model.fit_generator(
        train_it,
        epochs=epochs,
        steps_per_epoch=train_steps_per_epoch,
        validation_data=valid_it,
        validation_steps=validation_steps,
        callbacks=[
            EarlyStopping(patience=early_stop_patience, restore_best_weights=True),
            CSVLogger(self.history_path)
        ] + tb + wcp
        # ModelCheckpoint(self.ckp_file, save_best_only=True)]
    )
    self.model.save(self.ckp_file)
    # self.model = load_model(self.ckp_file)  # not necessary, EarlyStopping is already restoring the best weights

    # log metrics from the best epoch
    try:
        dfh = pd.read_csv(self.history_path)
        m = dict(dfh.iloc[dfh.val_loss.idxmin()])
        if self.cometml_experiment is not None:
            self.cometml_experiment.log_multiple_metrics(m, prefix="best-epoch/")
        if self.wandb_run is not None:
            self.wandb_run.summary.update(flatten(prefix_dict(m, prefix="best-epoch/"),
                                                  separator='/'))
    except FileNotFoundError as e:
        logger.warning(e)
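# Worked example of the steps-per-epoch arithmetic in train() above (numbers
# hypothetical): shortening the epoch shrinks the number of batches Keras
# draws per epoch, so early stopping and logging react proportionally more often.
def _steps_per_epoch_example():
    dataset_len, batch_size, train_epoch_frac = 100_000, 256, 0.1
    steps = max(int(dataset_len / batch_size * train_epoch_frac), 1)
    assert steps == 39  # one shortened "epoch" = 39 batches, ~10% of the data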
def evaluate(self, metric, batch_size=256, num_workers=8, eval_train=False,
             eval_skip=(), save=True, **kwargs):
    """Evaluate the model on the validation set

    Args:
        metric: a function accepting (y_true, y_pred) and returning the evaluation metric(s)
        batch_size: batch size to use for evaluation
        num_workers: number of parallel workers for data loading
        eval_train: if True, also compute the evaluation metrics on the training set
        eval_skip: dataset names to skip during evaluation
        save: save the json file to the output directory
    """
    from copy import deepcopy
    if len(kwargs) > 0:
        logger.warning(f"Extra kwargs were provided to trainer.evaluate: {kwargs}")
    # construct a list of datasets to evaluate
    if eval_train:
        eval_datasets = [('train', self.train_dataset)] + self.valid_dataset
    else:
        eval_datasets = self.valid_dataset

    # skip some datasets for evaluation
    try:
        if len(eval_skip) > 0:
            eval_datasets = [(k, v) for k, v in eval_datasets if k not in eval_skip]
    except Exception:
        logger.warning(f"eval datasets don't contain tuples. Unable to skip them using {eval_skip}")

    metric_res = OrderedDict()
    for d in eval_datasets:
        if len(d) == 2:
            dataset_name, dataset = d
            eval_metric = metric  # use the default eval metric
        elif len(d) == 3:
            # specialized evaluation metric was passed
            dataset_name, dataset, eval_metric = d
        else:
            # TODO - this should be made more explicit with classes
            raise ValueError("Valid dataset needs to be a list of tuples of 2 or 3 elements "
                             "(name, dataset) or (name, dataset, metric)")
        logger.info(f"Evaluating dataset: {dataset_name}")
        lpreds = []
        llabels = []
        for inputs, targets in tqdm(dataset.batch_train_iter(cycle=False,
                                                             num_workers=num_workers,
                                                             batch_size=batch_size),
                                    total=len(dataset) // batch_size):
            lpreds.append(self.model.predict_on_batch(inputs))
            llabels.append(deepcopy(targets))
            del inputs
            del targets
        preds = numpy_collate_concat(lpreds)
        labels = numpy_collate_concat(llabels)
        del lpreds
        del llabels
        metric_res[dataset_name] = eval_metric(labels, preds)

    if save:
        write_json(metric_res, self.evaluation_path, indent=2)
        logger.info("Saved metrics to {}".format(self.evaluation_path))

    if self.cometml_experiment is not None:
        self.cometml_experiment.log_multiple_metrics(flatten(metric_res, separator='/'),
                                                     prefix="eval/")

    if self.wandb_run is not None:
        self.wandb_run.summary.update(flatten(prefix_dict(metric_res, prefix="eval/"),
                                              separator='/'))

    metric_res = {**self.metrics, **metric_res}
    return metric_res
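# Hedged sketch of the behavior evaluate() relies on: numpy_collate_concat is
# assumed to concatenate per-batch predictions along the first (batch) axis,
# recursing into lists/dicts of arrays. For plain arrays it reduces to:
def _collate_concat_example():
    lpreds = [np.zeros((256, 4)), np.zeros((128, 4))]
    preds = np.concatenate(lpreds, axis=0)
    assert preds.shape == (384, 4)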
def test_list():
    dic = {'a': 1, 'b': [{'c': [2, 3]}]}
    expected = {'a': 1, 'b_0_c_0': 2, 'b_0_c_1': 3}
    actual = flatten(dic)
    assert actual == expected
def test_custom_separator():
    dic = {'a': '1', 'b': '2', 'c': {'c1': '3', 'c2': '4'}}
    expected = {'a': '1', 'b': '2', 'c*c1': '3', 'c*c2': '4'}
    actual = flatten(dic, '*')
    assert actual == expected
def test_one_flatten_utf8_dif():
    a = {u'eñe': 1}
    info = dict(info=a)
    expected = {u'info_{}'.format(u'eñe'): 1}
    actual = flatten(info)
    assert actual == expected
def test_one_flatten_utf8():
    dic = {'a': '1', u'ñ': u'áéö', 'c': {u'c1': '3', 'c2': '4'}}
    expected = {'a': '1', u'ñ': u'áéö', 'c_c1': '3', 'c_c2': '4'}
    actual = flatten(dic)
    assert actual == expected
def test_one_flatten():
    dic = {'a': '1', 'b': '2', 'c': {'c1': '3', 'c2': '4'}}
    expected = {'a': '1', 'b': '2', 'c_c1': '3', 'c_c2': '4'}
    actual = flatten(dic)
    assert actual == expected
def test_unflatten_with_list_nested():
    dic = {"a": [[{"b": 1}], [{"d": 1}]]}
    dic_flatten = flatten(dic)
    actual = unflatten_list(dic_flatten)
    assert actual == dic