Пример #1
0
def load_col_chars(
    char_enc,
    col_chars_path,
    blank_repr_div=4,
    unknown_char_extra_neg=False,
):
    """Load per-character representations from an HDF5 file.

    Every key of the HDF5 file is treated as one character class: the text
    after the key's last underscore is looked up in ``char_enc.encoder`` and
    the key's dataset is read fully into a ``torch.tensor``.

    Args:
        char_enc: Object exposing an ``encoder`` mapping (indexed here by
            character) and an ``unknown_idx`` attribute.
        col_chars_path: Path of the HDF5 file, opened read-only.
        blank_repr_div: If not None, only the first
            ``int(len(blank) / blank_repr_div)`` entries of the blank
            character's (``'~'``) representation are kept.
        unknown_char_extra_neg: If True and ``char_enc.unknown_idx`` is in
            the new encoder, the unknown character's (``'#'``)
            representation is returned as extra negatives and popped from
            the encoder.

    Returns:
        Tuple ``(nominal_enc, labels_repr, extra_negatives)`` where
        ``extra_negatives`` is None unless the unknown character was handled
        as described above.
    """
    extra_negatives = None

    with h5py.File(col_chars_path, 'r') as hf5:
        # Map each dataset name to the character encoding of its suffix.
        encoded_chars = []
        for dataset_name in hf5.keys():
            char = dataset_name.rpartition('_')[-1]
            encoded_chars.append(char_enc.encoder[char])
        nominal_enc = NominalDataEncoder(encoded_chars)

        # Read the datasets in the same key order used for the encoder.
        labels_repr = []
        for dataset_name in hf5.keys():
            labels_repr.append(torch.tensor(hf5[dataset_name][:]))

        blank_pos = nominal_enc.encoder[char_enc.encoder['~']]

        if blank_repr_div is not None:
            # Keep only a leading fraction of the blank representation.
            blank_repr = labels_repr[blank_pos]
            keep = int(len(blank_repr) / blank_repr_div)
            labels_repr[blank_pos] = blank_repr[:keep]

        # Handle unknown character as extra_negatives
        unknown_is_encoded = char_enc.unknown_idx in nominal_enc.encoder
        if unknown_is_encoded and unknown_char_extra_neg:
            unknown_pos = nominal_enc.encoder[char_enc.encoder['#']]
            extra_negatives = labels_repr[unknown_pos]

            # The unknown char is treated like the rest of the unknowns, so
            # remove it to keep the MEVM from treating it as a known class.
            # NOTE(review): labels_repr still contains the unknown char's
            # tensor after this pop -- confirm callers expect that.
            nominal_enc.pop(unknown_pos)

    return nominal_enc, labels_repr, extra_negatives
Пример #2
0
def organize_data_pts_by_logits(argmax_logits, layers):
    """Group layer outputs by their predicted label.

    Args:
        argmax_logits: Array of predicted labels; compared element-wise
            against each unique label to select entries of ``layers``.
        layers: Array indexed by the positions found in ``argmax_logits``.

    Returns:
        Tuple ``(labels_repr, nominal_encoder)``: a list with one
        ``torch.tensor`` per unique label (in ``np.unique`` order) and a
        ``NominalDataEncoder`` built from those unique labels.
    """
    unique_labels, label_counts = np.unique(argmax_logits, return_counts=True)

    logging.debug('Number of Unique Labels: %d', len(unique_labels))
    logging.debug('The unique labels: %s', unique_labels)
    logging.debug('Label counts: %s', label_counts)

    # Lets callers recover the label from the MEVM's class indexing.
    nominal_encoder = NominalDataEncoder(unique_labels)

    logging.info('Unique Labels contained within layer encoding:')

    labels_repr = []
    for label, count in zip(unique_labels, label_counts):
        logging.info('%d : %d', label, count)

        # Positions along the first axis whose prediction equals this label.
        label_indices = np.where(argmax_logits == label)[0]
        label_tensor = torch.tensor(layers[label_indices])
        labels_repr.append(label_tensor)

        logging.debug("Label `%s`'s indices = %s", label, label_indices)
        logging.debug(
            'Torch tensor shape of label %s = %s',
            label,
            label_tensor.shape,
        )

    return labels_repr, nominal_encoder
    def load(filepath, blank_idx, space_char, unknown_idx):
        """Load a saved label set and build a CharEncoder from it.

        NOTE(review): there is no ``self``/``cls`` parameter, so this is
        presumably a ``@staticmethod`` whose decorator is outside this view.

        Args:
            filepath: Passed to ``NominalDataEncoder.load`` to read the
                label set.
            blank_idx: Forwarded as the first ``CharEncoder`` argument.
            space_char: Forwarded as the second ``CharEncoder`` argument.
            unknown_idx: Forwarded as the third ``CharEncoder`` argument.

        Returns:
            A ``CharEncoder`` built from the given arguments and the list
            obtained by iterating the loaded encoder's ``encoder`` attribute.
        """
        nde = NominalDataEncoder.load(filepath)

        # TODO build CharEncoder s.t. it can simply copy the parts of the given
        # NDE
        # The parameter is named `blank_idx`; the previous reference to an
        # undefined `blank` raised NameError on every call.
        return CharEncoder(
            blank_idx,
            space_char,
            unknown_idx,
            list(nde.encoder),
        )
    def __init__(self, labels=None, max_unknown=None, *args, **kwargs):
        """Initialise the MEVM and its label encoder.

        Args:
            labels: None, an existing ``NominalDataEncoder`` (stored as-is),
                or a list / ``np.ndarray`` of labels from which a new
                ``NominalDataEncoder`` is built.
            max_unknown: Stored unchanged on ``self.max_unknown``.
            *args: Forwarded to the parent class initialiser.
            **kwargs: Forwarded to the parent class initialiser.

        Raises:
            TypeError: If ``labels`` is of any other type.
        """
        super(MEVM, self).__init__(*args, **kwargs)

        # Create a NominalDataEncoder to map class inputs to the MEVM internal
        # class representation.
        if labels is None or isinstance(labels, NominalDataEncoder):
            self.label_enc = labels
        elif isinstance(labels, (list, np.ndarray)):
            self.label_enc = NominalDataEncoder(labels)
        else:
            # The f-prefix is required so the offending type is interpolated
            # rather than printed as the literal text "{type(labels)}".
            raise TypeError(
                'Expected `labels` of types: None, list, np.ndarray, or '
                f'NominalDataEncoder, not of type {type(labels)}'
            )

        self.max_unknown = max_unknown
Пример #5
0
 def __init__(self, *args, **kwargs):
     """Create ``self.label_enc`` as a NominalDataEncoder.

     All positional and keyword arguments are forwarded unchanged to the
     ``NominalDataEncoder`` constructor.
     """
     self.label_enc = NominalDataEncoder(*args, **kwargs)
    def load(h5, labels=None, labels_dtype=None, train_hyperparams=None):
        """Restore an MEVM, including its label encoder, from HDF5 state.

        Performs the same load functionality as in MultipleEVM, but also
        restores the ordered labels for the label encoder.

        Args:
            h5: Either a path string, opened read-only with ``h5py.File``,
                or an already opened HDF5 object.
            labels: Optional labels for the encoder. When given they take
                precedence over a ``labels`` dataset stored in the file.
            labels_dtype: dtype the stored labels are cast to. When None it
                is read from the file attribute ``labels_dtype``.
            train_hyperparams: None (use the default attribute names), a
                list of attribute names to read from ``h5.attrs`` (names not
                present are skipped), or a dict passed on as-is.

        Returns:
            An ``MEVM`` built from the label encoder and hyperparameters,
            with its ``_evms`` set to the loaded EVMs.

        Raises:
            TypeError: If ``train_hyperparams`` is not None, a list, or a
                dict.
        """
        if isinstance(h5, str):
            # NOTE(review): a file opened here is never closed by this
            # function -- confirm whether the loaded EVMs still need it.
            h5 = h5py.File(h5, 'r')

        # EVMs are stored under consecutive keys "EVM-1", "EVM-2", ...;
        # loading stops at the first missing key.
        _evms = []
        evm_key = "EVM-%d" % 1
        while evm_key in h5:
            _evms.append(EVM(h5[evm_key], log_level='debug'))
            evm_key = "EVM-%d" % (len(_evms) + 1)

        # Build the NominalDataEncoder from the best available label source.
        has_stored_labels = 'labels' in h5.keys()
        if labels is not None:
            if has_stored_labels:
                logging.info(
                    '`labels` key exists in the HDF5 MEVM state file, but '
                    'labels was given explicitly to MEVM.load(). Ignoring the '
                    'labels in the HDF5 file.'
                )
            label_enc = NominalDataEncoder(labels)
        elif has_stored_labels:
            if labels_dtype is None:
                labels_dtype = np.dtype(h5.attrs['labels_dtype'])
            stored_labels = h5['labels'][:].astype(labels_dtype)
            label_enc = NominalDataEncoder(stored_labels)
        else:
            logging.warning(
                'No `labels` dataset available in given hdf5. Relying on the '
                "evm model's labels if they exist. Will fail if the MEVM "
                'state does not have any labels in each of its EVM.'
            )
            label_enc = NominalDataEncoder([evm.label for evm in _evms])

        # Default to the known training attributes when none were requested.
        if train_hyperparams is None:
            # NOTE Able to specify which to load from h5 by passing a list.
            train_hyperparams = [
                'tailsize',
                'cover_threshold',
                'distance_function',
                'distance_multiplier',
                'max_unknown',
            ]

        if not isinstance(train_hyperparams, (list, dict)):
            raise TypeError(
                '`train_hyperparams` expected type: None, list, or dict, but '
                f'recieved {type(train_hyperparams)}'
            )

        if isinstance(train_hyperparams, list):
            # Read only the requested attributes that the file really has.
            found = {}
            for attr in train_hyperparams:
                if attr in h5.attrs:
                    found[attr] = h5.attrs[attr]
            train_hyperparams = found

        mevm = MEVM(label_enc, **train_hyperparams)
        mevm._evms = _evms

        return mevm
    def fit(self, points, labels=None, extra_negatives=None):
        """Normalise the inputs and train through ``self.train()``.

        Wraps the MultipleEVM's train() (presumably inherited; the class
        header is outside this view) and updates ``self.label_enc`` when
        labels are given.

        Args:
            points: One of
                * ``np.ndarray`` aligned with ``labels`` (same length); it
                  is split into one ``torch.Tensor`` per unique label and
                  ``labels`` is replaced by the unique labels,
                * list of ``np.ndarray`` (converted to ``torch.Tensor``),
                * list of ``torch.Tensor``.
            labels: None, a list / ``np.ndarray`` of labels, or a
                ``NominalDataEncoder``. When not None it replaces
                ``self.label_enc``.
            extra_negatives: None, a ``torch.Tensor``, a 2-dimensional
                ``np.ndarray``, or a list; the latter two are converted with
                ``torch.Tensor``.

        Raises:
            TypeError: If ``points``, ``labels`` or ``extra_negatives`` has
                an unsupported type.
            ValueError: If the number of labels differs from the number of
                point groups.
        """
        points_type_msg = (
            'expected points to be of types: list(np.ndarray), '
            'list(torch.tensor), or np.ndarray with labels as an '
            'aligned list or np.ndarray'
        )

        # If points and labels are aligned sequence pair (X, y): adjust form
        if (isinstance(points, np.ndarray)
                and isinstance(labels, (list, np.ndarray))
                and len(points) == len(labels)):
            # Adjust sequence pair into list of torch.Tensors and unique labels
            unique = np.unique(labels)
            labels = np.array(labels)
            points = [torch.Tensor(points[labels == u]) for u in unique]
            labels = unique
        elif isinstance(points, list):
            if all(isinstance(pts, np.ndarray) for pts in points):
                # If list of np.ndarrays, turn into torch.Tensors
                points = [torch.Tensor(pts) for pts in points]
            elif not all(isinstance(pts, torch.Tensor) for pts in points):
                raise TypeError(points_type_msg)
        else:
            raise TypeError(points_type_msg)

        # Set encoder if labels is not None
        if labels is not None:
            if len(points) != len(labels):
                raise ValueError(' '.join([
                    'The given number of labels does not equal the number of',
                    'classes represented by the list of points.',
                    'If giving an aligned sequence pair of points and labels,',
                    'then ensure `points` is of type `np.ndarray`.',
                ]))

            if self.label_enc is not None:
                logging.debug(
                    '`encoder` is not None and is being overwritten!', )

            if isinstance(labels, NominalDataEncoder):
                self.label_enc = labels
            elif isinstance(labels, (list, np.ndarray)):
                self.label_enc = NominalDataEncoder(labels)
            else:
                # f-prefix needed so the offending type is interpolated.
                raise TypeError(
                    'Expected `labels` of types: None, list, np.ndarray, or '
                    f'NominalDataEncoder, not of type {type(labels)}'
                )

        # Ensure extra_negatives is of expected form (no labels for these).
        # None is the default and is passed through untouched; it used to
        # fall into the TypeError branch, so fit() without extra negatives
        # always failed.
        if extra_negatives is not None:
            if ((isinstance(extra_negatives, np.ndarray)
                 and len(extra_negatives.shape) == 2)
                    or isinstance(extra_negatives, list)):
                extra_negatives = torch.Tensor(extra_negatives)
            elif not isinstance(extra_negatives, torch.Tensor):
                raise TypeError(' '.join([
                    'The extra_negatives must be either None, torch.Tensor of',
                    'shape 2, or an object broadcastable to such a',
                    'torch.Tensor.',
                ]))

        # Points is now list(torch.Tensors) and encoder handled.

        # TODO handle adjust of extra negatives as a list of labels to be known
        # unknowns. For now, expects extra_negatives always of correct type.
        self.train(points, labels, extra_negatives)