Example #1
    def evaluate(self, metric, batch_size=256, num_workers=8, eval_train=False, eval_skip=(), save=True, **kwargs):
        """Evaluate the model on the validation set
        Args:
          metric: a function accepting (y_true, y_pred) and returning the evaluation metric(s)
          batch_size: batch size to use during evaluation
          num_workers: number of parallel workers to use for data loading
          eval_train: if True, also compute the evaluation metrics on the training set
          eval_skip: dataset names to skip during evaluation
          save: save the json file to the output directory
        """
        if len(kwargs) > 0:
            logger.warn(f"Extra kwargs were provided to trainer.evaluate(): {kwargs}")
        # Save the complete model -> HACK
        self.seq_model.save(os.path.join(self.output_dir, 'seq_model.pkl'))

        # construct a list of datasets to evaluate
        if eval_train:
            eval_datasets = [('train', self.train_dataset)] + self.valid_dataset
        else:
            eval_datasets = self.valid_dataset

        # skip some datasets for evaluation
        try:
            if len(eval_skip) > 0:
                logger.info(f"Using eval_skip: {eval_skip}")
                eval_datasets = [(k, v) for k, v in eval_datasets if k not in eval_skip]
        except Exception:
            logger.warn(f"eval datasets don't contain tuples. Unable to skip them using {eval_skip}")

        metric_res = OrderedDict()
        for d in eval_datasets:
            if len(d) == 2:
                dataset_name, dataset = d
                eval_metric = None  # Ignore the provided metric
            elif len(d) == 3:
                # specialized evaluation metric was passed
                dataset_name, dataset, eval_metric = d
            else:
                raise ValueError("Valid dataset needs to be a list of tuples of 2 or 3 elements"
                                 "(name, dataset) or (name, dataset, metric)")
            logger.info(f"Evaluating dataset: {dataset_name}")
            metric_res[dataset_name] = self.seq_model.evaluate(dataset,
                                                               eval_metric=eval_metric,
                                                               num_workers=num_workers,
                                                               batch_size=batch_size)
        if save:
            write_json(metric_res, self.evaluation_path, indent=2)
            logger.info("Saved metrics to {}".format(self.evaluation_path))

        if self.cometml_experiment is not None:
            self.cometml_experiment.log_metrics(flatten(metric_res, separator='/'), prefix="eval/")

        if self.wandb_run is not None:
            self.wandb_run.summary.update(flatten(dict_prefix_key(metric_res, prefix="eval/"), separator='/'))

        return metric_res
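
The logging calls at the end rely on flatten to turn the nested per-dataset results into flat "dataset/metric" keys. A minimal sketch of that transformation, assuming the flatten used above behaves like flatten_json.flatten (which the test examples further down exercise) and using made-up metric values:

from flatten_json import flatten  # assumption: same behavior as the flatten imported above

metric_res = {'valid': {'auprc': 0.71, 'auroc': 0.93}}  # hypothetical results
print(flatten(metric_res, separator='/'))
# {'valid/auprc': 0.71, 'valid/auroc': 0.93}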
Example #2
def flatten_batch(batch, nested_sep="/"):
    """Convert the nested batch of numpy arrays into a dictionary of 1-dimensional numpy arrays

    Args:
      batch: batch of data
      nested_sep: What separator to use for flattening the nested dictionary structure
          into a single key

    Returns:
      A dictionary of 1-dimensional numpy arrays.
    """
    def array2array_dict(arr):
        """Convert a numpy array into a dictionary of numpy arrays

        >>> arr = np.arange(9).reshape((1, 3, 3))
        >>> assert array2array_dict(arr)["0"]["1"][0] == arr[:, 0, 1][0]
        """
        if isinstance(arr, np.ndarray):
            if arr.ndim <= 1:
                return arr
            else:
                return collections.OrderedDict([(str(i),
                                                 array2array_dict(arr[:, i]))
                                                for i in range(arr.shape[1])])
        elif isinstance(arr, pd.DataFrame):
            # "series" orient yields {column: Series}; ("records" would return a list)
            return {k: v.values for k, v in arr.to_dict("series").items()}
        elif (arr.__class__.__module__,
              arr.__class__.__name__) == ('kipoi.metadata', 'GenomicRanges'):
            return arr.to_dict()
        else:
            raise ValueError("Unknown data type: %s" % str(type(arr)))

    return flatten(map_nested(batch, array2array_dict), separator=nested_sep)
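
A minimal usage sketch with a hypothetical two-column batch: arrays with more than one dimension are split column-wise into nested dictionaries, which flatten then joins with the separator:

import numpy as np

batch = {"inputs": np.arange(8).reshape(4, 2), "targets": np.arange(4)}
flat = flatten_batch(batch)
# keys: 'inputs/0', 'inputs/1', 'targets' -- each a 1-D array of length 4
assert flat["inputs/1"].tolist() == [1, 3, 5, 7]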
Example #3
def test_unflatten_with_list_issue15():
    """https://github.com/amirziai/flatten/issues/15"""
    dic = {
        "Required": {
            "a": "1",
            "b": ["1", "2", "3"],
            "c": {
                "d": {
                    "e": [[{
                        "s1": 1
                    }, {
                        "s2": 2
                    }], [{
                        "s3": 1
                    }, {
                        "s4": 2
                    }]]
                }
            },
            "f": ["1", "2"]
        },
        "Optional": {
            "x": "1",
            "y": ["1", "2", "3"]
        }
    }
    dic_flatten = flatten(dic)
    actual = unflatten_list(dic_flatten)
    assert actual == dic
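
The round trip works because flatten encodes list positions as numeric key segments, which unflatten_list turns back into lists. A smaller sketch of the same idea, assuming the helpers come from the flatten_json package referenced in the test:

from flatten_json import flatten, unflatten_list

dic = {"a": [{"b": 1}]}
flat = flatten(dic)              # {'a_0_b': 1} -- the list index becomes a key segment
assert unflatten_list(flat) == dic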
Example #4
def test_flatten_dict(nested_dict):
    fd = flatten(nested_dict)

    assert dict(fd) == {
        'a': 1,
        'b_c': 3,
        'b_d_0': 1,
        'b_d_1': 2,
        'b_d_2': 3,
        'b_e_0_f': 1,
        'b_e_1_g': 4
    }
    assert unflatten_list(fd) == dict(nested_dict)
Example #5
def test_list_and_dict():
    dic = {'a': 1, 'b': 2, 'c': [{'d': [2, 3, 4], 'e': [{'f': 1, 'g': 2}]}]}
    expected = {
        'a': 1,
        'b': 2,
        'c_0_d_0': 2,
        'c_0_d_1': 3,
        'c_0_d_2': 4,
        'c_0_e_0_f': 1,
        'c_0_e_0_g': 2
    }
    actual = flatten(dic)
    assert actual == expected
Example #6
def test_blog_example():
    dic = {"a": 1, "b": 2, "c": [{"d": ['2', 3, 4], "e": [{"f": 1, "g": 2}]}]}
    expected = {
        'a': 1,
        'b': 2,
        'c_0_d_0': '2',
        'c_0_d_1': 3,
        'c_0_d_2': 4,
        'c_0_e_0_f': 1,
        'c_0_e_0_g': 2
    }
    actual = flatten(dic)
    assert actual == expected
Example #7
    def batch_write(self, batch):
        """Write a batch of data to bed file

        # Arguments
            batch: batch of data. Either a single `np.array` or a list/dict thereof.
        """
        fbatch = flatten(batch, separator="/")

        batch_sizes = [fbatch[k].shape[0] for k in fbatch]
        # all arrays must share the same batch (first) dimension
        assert len(set(batch_sizes)) == 1
        batch_size = batch_sizes[0]

        if self.first_pass:
            # on the first pass, create one resizable dataset per flattened key
            for k in fbatch:
                if fbatch[k].dtype.type == np.dtype("object"):
                    import h5py

                    # assume that all elements of fbatch[k] have the same dtype
                    dtype = fbatch[k][0].dtype
                    dtype = h5py.special_dtype(vlen=dtype)
                    # TODO: h5py.special_dtype is deprecated from h5py >= 2.10; eventually change to h5py.vlen_dtype
                elif fbatch[k].dtype.type in [
                        np.string_, np.str_, np.unicode_
                ]:
                    dtype = self.string_type
                else:
                    dtype = fbatch[k].dtype

                self.f.create_dataset(k,
                                      shape=(0, ) + fbatch[k].shape[1:],
                                      dtype=dtype,
                                      maxshape=(None, ) + fbatch[k].shape[1:],
                                      compression=self.compression,
                                      chunks=(self.chunk_size, ) +
                                      fbatch[k].shape[1:])
            self.first_pass = False
        # add data to the buffer
        if self.write_buffer is None:
            self.write_buffer = [fbatch]
            self.write_buffer_size = batch_size
        else:
            self.write_buffer.append(fbatch)
            self.write_buffer_size += batch_size

        if self.write_buffer is not None and self.write_buffer_size >= self.chunk_size:
            self._flush_buffer()
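
The datasets above are created empty with maxshape=(None, ...) so that rows can be appended later. A minimal sketch of that append pattern in plain h5py (file name and shapes are made up):

import h5py
import numpy as np

with h5py.File("example.h5", "w") as f:
    # start empty, resizable along the first axis, chunked for appends
    dset = f.create_dataset("x", shape=(0, 3), maxshape=(None, 3), chunks=(16, 3))
    batch = np.random.rand(8, 3)
    dset.resize(dset.shape[0] + batch.shape[0], axis=0)  # grow, then write the new rows
    dset[-batch.shape[0]:] = batch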
Example #8
def test_unflatten_with_list_deep():
    dic = {
        'a': [{
            'b': [{
                'c': [{
                    'a': 5,
                    'b': {
                        'a': [1, 2, 3]
                    },
                    'c': {
                        'x': 3
                    }
                }]
            }]
        }]
    }
    dic_flatten = flatten(dic)
    actual = unflatten_list(dic_flatten)
    assert actual == dic
Example #9
    def __init__(self, file_path, metadata_schema, header=True):
        self.file_path = file_path
        self.header = header
        self.first_pass = True

        f_dl_schema = flatten(metadata_schema)
        range_keys = [
            "metadata/" + k for k in f_dl_schema
            if f_dl_schema[k].type == MetadataType.GENOMIC_RANGES
        ]
        if len(range_keys) > 1:
            # note: implicit literal concatenation (no "+") so that .format applies to
            # the whole message; with "+" the {0} placeholder was never filled in
            raise ValueError(
                "Found multiple genomic ranges in metadata: {0}. For writing to the "
                "bed file exactly one genomic range has to exist".format(range_keys))
        elif len(range_keys) == 0:
            raise ValueError(
                "Found no genomic ranges in metadata. For writing to the "
                "bed file exactly one genomic range has to exist")
        self.ranges_key = range_keys[0]
Example #10
    def batch_write(self, batch):
        """Write a batch of data to bed file

        # Arguments
            batch: batch of data. Either a single `np.array` or a list/dict thereof.
        """
        fbatch = flatten(batch, separator="/")

        batch_sizes = [fbatch[k].shape[0] for k in fbatch]
        # all arrays must share the same batch (first) dimension
        assert len(set(batch_sizes)) == 1
        batch_size = batch_sizes[0]

        if self.first_pass:
            # on the first pass, create one resizable dataset per flattened key
            for k in fbatch:
                if fbatch[k].dtype.type in [np.string_, np.str_, np.unicode_]:
                    dtype = self.string_type
                else:
                    dtype = fbatch[k].dtype

                self.f.create_dataset(k,
                                      shape=(0, ) + fbatch[k].shape[1:],
                                      dtype=dtype,
                                      maxshape=(None, ) + fbatch[k].shape[1:],
                                      compression=self.compression,
                                      chunks=(self.chunk_size, ) +
                                      fbatch[k].shape[1:])
            self.first_pass = False
        # add data to the buffer
        if self.write_buffer is None:
            self.write_buffer = [fbatch]
            self.write_buffer_size = batch_size
        else:
            self.write_buffer.append(fbatch)
            self.write_buffer_size += batch_size

        if self.write_buffer is not None and self.write_buffer_size >= self.chunk_size:
            self._flush_buffer()
Example #11
def test_no_flatten():
    dic = {'a': '1', 'b': '2', 'c': 3}
    expected = dic
    actual = flatten(dic)
    assert actual == expected
Example #12
    def train(self,
              batch_size=256,
              epochs=100,
              early_stop_patience=4,
              num_workers=8,
              train_epoch_frac=1.0,
              valid_epoch_frac=1.0,
              train_samples_per_epoch=None,
              validation_samples=None,
              train_batch_sampler=None,
              tensorboard=True):
        """Train the model
        Args:
          batch_size:
          epochs:
          patience: early stopping patience
          num_workers: how many workers to use in parallel
          train_epoch_frac: if smaller than 1, then make the epoch shorter
          valid_epoch_frac: same as train_epoch_frac for the validation dataset
          train_batch_sampler: batch Sampler for training. Useful for say Stratified sampling
          tensorboard: if True, tensorboard output will be added
        """

        if train_batch_sampler is not None:
            train_it = self.train_dataset.batch_train_iter(
                shuffle=False,
                batch_size=1,
                drop_last=None,
                batch_sampler=train_batch_sampler,
                num_workers=num_workers)
        else:
            train_it = self.train_dataset.batch_train_iter(
                batch_size=batch_size, shuffle=True, num_workers=num_workers)
        next(train_it)  # warm up the iterator: starts the workers, surfaces data errors early
        valid_dataset = self.valid_dataset[0][1]  # take the first validation dataset
        valid_it = valid_dataset.batch_train_iter(batch_size=batch_size,
                                                  shuffle=True,
                                                  num_workers=num_workers)
        next(valid_it)  # same warm-up for the validation iterator

        if tensorboard:
            tb = [TensorBoard(log_dir=self.output_dir)]
        else:
            tb = []

        if self.wandb_run is not None:
            from wandb.keras import WandbCallback
            # the model is saved explicitly after training instead
            wcp = [WandbCallback(save_model=False)]
        else:
            wcp = []

        # train the model
        if len(valid_dataset) == 0:
            raise ValueError("len(self.valid_dataset) == 0")

        if train_samples_per_epoch is None:
            train_steps_per_epoch = max(
                int(len(self.train_dataset) / batch_size * train_epoch_frac),
                1)
        else:
            train_steps_per_epoch = max(
                int(train_samples_per_epoch / batch_size), 1)

        if validation_samples is None:
            # parametrize with valid_epoch_frac
            validation_steps = max(
                int(len(valid_dataset) / batch_size * valid_epoch_frac), 1)
        else:
            validation_steps = max(int(validation_samples / batch_size), 1)

        self.model.fit_generator(
            train_it,
            epochs=epochs,
            steps_per_epoch=train_steps_per_epoch,
            validation_data=valid_it,
            validation_steps=validation_steps,
            callbacks=[
                EarlyStopping(patience=early_stop_patience,
                              restore_best_weights=True),
                CSVLogger(self.history_path)
            ] + tb + wcp
            # ModelCheckpoint(self.ckp_file, save_best_only=True)]
        )
        self.model.save(self.ckp_file)
        # self.model = load_model(self.ckp_file)  # not necessary, EarlyStopping is already restoring the best weights

        # log metrics from the best epoch
        try:
            dfh = pd.read_csv(self.history_path)
            m = dict(dfh.iloc[dfh.val_loss.idxmin()])
            if self.cometml_experiment is not None:
                self.cometml_experiment.log_multiple_metrics(
                    m, prefix="best-epoch/")
            if self.wandb_run is not None:
                self.wandb_run.summary.update(
                    flatten(prefix_dict(m, prefix="best-epoch/"),
                            separator='/'))
        except FileNotFoundError as e:
            logger.warning(e)
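
A quick worked example of the steps-per-epoch arithmetic above, with made-up sizes:

# hypothetical numbers, just to illustrate the truncating division
len_train, batch_size, train_epoch_frac = 10_000, 256, 0.5
steps = max(int(len_train / batch_size * train_epoch_frac), 1)
assert steps == 19  # 10000 / 256 * 0.5 = 19.53..., truncated to 19; never below 1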
Example #13
    def evaluate(self,
                 metric,
                 batch_size=256,
                 num_workers=8,
                 eval_train=False,
                 eval_skip=(),
                 save=True,
                 **kwargs):
        """Evaluate the model on the validation set
        Args:
          metrics: a list or a dictionary of metrics
          batch_size:
          num_workers:
          eval_train: if True, also compute the evaluation metrics on the training set
          save: save the json file to the output directory
        """
        if len(kwargs) > 0:
            logger.warning(
                f"Extra kwargs were provided to trainer.evaluate: {kwargs}")
        # construct a list of datasets to evaluate
        if eval_train:
            eval_datasets = [('train', self.train_dataset)
                             ] + self.valid_dataset
        else:
            eval_datasets = self.valid_dataset

        try:
            if len(eval_skip) > 0:
                eval_datasets = [(k, v) for k, v in eval_datasets
                                 if k not in eval_skip]
        except Exception:
            logger.warning(
                f"eval datasets don't contain tuples. Unable to skip them using {eval_skip}"
            )

        from copy import deepcopy  # deepcopy detaches targets from the loader's batch buffers
        metric_res = OrderedDict()
        for d in eval_datasets:
            if len(d) == 2:
                dataset_name, dataset = d
                eval_metric = metric  # use the default eval metric
            elif len(d) == 3:
                # specialized evaluation metric was passed
                dataset_name, dataset, eval_metric = d
            else:
                # TODO - this should be made more explicit with classes
                raise ValueError(
                    "Valid dataset needs to be a list of tuples of 2 or 3 elements: "
                    "(name, dataset) or (name, dataset, metric)")
            logger.info(f"Evaluating dataset: {dataset_name}")
            lpreds = []
            llabels = []
            for inputs, targets in tqdm(
                    dataset.batch_train_iter(cycle=False,
                                             num_workers=num_workers,
                                             batch_size=batch_size),
                    total=len(dataset) // batch_size):
                lpreds.append(self.model.predict_on_batch(inputs))
                llabels.append(deepcopy(targets))
                del inputs
                del targets
            preds = numpy_collate_concat(lpreds)
            labels = numpy_collate_concat(llabels)
            del lpreds
            del llabels
            metric_res[dataset_name] = eval_metric(labels, preds)

        if save:
            write_json(metric_res, self.evaluation_path, indent=2)
            logger.info("Saved metrics to {}".format(self.evaluation_path))

        if self.cometml_experiment is not None:
            self.cometml_experiment.log_multiple_metrics(
                flatten(metric_res, separator='/'), prefix="eval/")

        if self.wandb_run is not None:
            self.wandb_run.summary.update(
                flatten(prefix_dict(metric_res, prefix="eval/"),
                        separator='/'))
        metric_res = {**self.metrics, **metric_res}
        return metric_res
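
The prediction loop accumulates per-batch arrays and then concatenates them. A sketch of what numpy_collate_concat effectively does in the plain-array case (assuming it concatenates along the batch axis; nested dicts/lists are handled recursively):

import numpy as np

lpreds = [np.zeros((32, 1)), np.zeros((32, 1)), np.zeros((16, 1))]  # hypothetical batches
preds = np.concatenate(lpreds, axis=0)
assert preds.shape == (80, 1)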
Example #14
def test_list():
    dic = {'a': 1, 'b': [{'c': [2, 3]}]}
    expected = {'a': 1, 'b_0_c_0': 2, 'b_0_c_1': 3}
    actual = flatten(dic)
    assert actual == expected
Example #15
def test_custom_separator():
    dic = {'a': '1', 'b': '2', 'c': {'c1': '3', 'c2': '4'}}
    expected = {'a': '1', 'b': '2', 'c*c1': '3', 'c*c2': '4'}
    actual = flatten(dic, '*')
    assert actual == expected
Example #16
def test_one_flatten_utf8_dif():
    a = {u'eñe': 1}
    info = dict(info=a)
    expected = {u'info_{}'.format(u'eñe'): 1}
    actual = flatten(info)
    assert actual == expected
Example #17
def test_one_flatten_utf8():
    dic = {'a': '1', u'ñ': u'áéö', 'c': {u'c1': '3', 'c2': '4'}}
    expected = {'a': '1', u'ñ': u'áéö', 'c_c1': '3', 'c_c2': '4'}
    actual = flatten(dic)
    assert actual == expected
Example #18
def test_one_flatten():
    dic = {'a': '1', 'b': '2', 'c': {'c1': '3', 'c2': '4'}}
    expected = {'a': '1', 'b': '2', 'c_c1': '3', 'c_c2': '4'}
    actual = flatten(dic)
    assert actual == expected
Example #19
def test_unflatten_with_list_nested():
    dic = {"a": [[{"b": 1}], [{"d": 1}]]}
    dic_flatten = flatten(dic)
    actual = unflatten_list(dic_flatten)
    assert actual == dic