Example No. 1
 def __initialize_datasets__(self, sample_data):
     self.out = {}
     sample_data = self.__parse_input_data__(sample_data)
     sample_data = self.__convert_input_data__(sample_data)
     dims = {dset: 1 if len(data.shape) == 1 else data.shape[1]
             for dset, data in iteritems(sample_data)}
     # needed for raw_datasets only
     dtypes = {
         dset: get_dtype(sample_data[dset]) for dset in self.raw_datasets}
     # init raw datasets
     for dset in self.raw_datasets:
         (group, dataset) = os.path.split(dset)
         if not group:
             group = '/'
         # FIXME at some point should become super.add_dataset(...)
         self.out[dset] = self.np2h5.add_dataset(
             group, dataset, n_columns=dims[dset],
             item_type=dtypes[dset], fixed_size=False)
     # init not fused indexed datasets, in this implementation they are all
     # encoded in the same matrix
     if self.non_fused_datasets:
         indexed_dims = [dims[dset] for dset in self.non_fused_datasets]
         indexed_levels = [len(self.indexes[dset])
                           for dset in self.non_fused_datasets]
         dim = sum(indexed_dims)
         # smallest unsigned integer dtype compatible with all
         # indexed_datasets
         d_type = type_fitting.fit_integer_type(
             max(indexed_levels), is_signed=False)
         # FIXME at some point should become super.add_dataset(...)
         self.out['indexed'] = self.np2h5.add_dataset(
             self.group, 'indexed_data', n_columns=dim, item_type=d_type, fixed_size=False)
         with h5py.File(self.filename) as f:
             # necessary to access the part of the data corresponding to a
             # particular dataset
             f[self.group].create_dataset(
                 'indexed_cumudims', data=np.cumsum(indexed_dims), dtype=np.uint64)
     # fused datasets have a separate one dimensional dataset each
     self.key_weights = {}
     for fused_dset in self.fused_datasets:
         fused_dims = np.array(
             [dims[dset] for dset in self.fused_members[fused_dset]], dtype=np.uint64)
         max_key = np.prod(
             self.nb_levels[fused_dset] ** fused_dims) - np.uint64(1)
         if max_key >= 2 ** 64:
             raise ValueError(
                 'fused dataset %s in file %s cannot be created because '
                 '64-bit keys are not sufficient to cover all possible '
                 'combinations of the fused datasets'
                 % (fused_dset, self.filename))
         # smallest unsigned integer dtype compatible
         d_type = type_fitting.fit_integer_type(max_key, is_signed=False)
         # FIXME at some point should become super.add_dataset(...)
         self.out[fused_dset] = self.np2h5.add_dataset(
             self.group, fused_dset, n_columns=1, item_type=d_type, fixed_size=False)
         nb_levels_with_multiplicity = np.concatenate(
             [np.array(n, dtype=d_type) * np.ones(d, dtype=d_type)
              for n, d in zip(self.nb_levels[fused_dset], fused_dims)])
         self.key_weights[fused_dset] = np.concatenate(
             [np.array([1], dtype=d_type),
              np.cumprod(d_type(nb_levels_with_multiplicity))[:-1]])
         with h5py.File(self.filename) as f:
             f[self.group]['fused'][fused_dset].create_dataset(
                 'key_weights', data=self.key_weights[fused_dset], dtype=d_type)
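
Note: the fused-dataset keys above are a mixed-radix encoding of the member datasets' level indices, with key_weights holding the cumulative products of the level counts. Below is a minimal standalone sketch of the idea; the level counts and helper names are illustrative only, not taken from ABXpy, and the per-column multiplicity handled by nb_levels_with_multiplicity is omitted.

import numpy as np

# three fused columns with 3, 5 and 2 levels; a row of level indices is
# packed into one unsigned integer using cumulative products, mirroring
# key_weights = concatenate([1], cumprod(levels)[:-1]) above
levels = np.array([3, 5, 2], dtype=np.uint64)
key_weights = np.concatenate(([np.uint64(1)], np.cumprod(levels)[:-1]))

def encode(row):
    # row[i] must lie in [0, levels[i]); the key lies in [0, prod(levels))
    return np.dot(np.asarray(row, dtype=np.uint64), key_weights)

def decode(key):
    # recover the per-column level indices from a key
    return [int(key // w % l) for w, l in zip(key_weights, levels)]

assert decode(encode([2, 4, 1])) == [2, 4, 1]
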
Example No. 2
File: h5io.py Project: mmmaat/ABXpy
 def __initialize_datasets__(self, sample_data):
     self.out = {}
     sample_data = self.__parse_input_data__(sample_data)
     sample_data = self.__convert_input_data__(sample_data)
     dims = {dset: 1 if len(data.shape) == 1 else data.shape[1]
             for dset, data in sample_data.iteritems()}
     # needed for raw_datasets only
     dtypes = {
         dset: get_dtype(sample_data[dset]) for dset in self.raw_datasets}
     # init raw datasets
     for dset in self.raw_datasets:
         (group, dataset) = os.path.split(dset)
         if not group:
             group = '/'
         # FIXME at some point should become super.add_dataset(...)
         self.out[dset] = self.np2h5.add_dataset(
             group, dataset, n_columns=dims[dset],
             item_type=dtypes[dset], fixed_size=False)
     # init not fused indexed datasets, in this implementation they are all
     # encoded in the same matrix
     if self.non_fused_datasets:
         indexed_dims = [dims[dset] for dset in self.non_fused_datasets]
         indexed_levels = [len(self.indexes[dset])
                           for dset in self.non_fused_datasets]
         dim = sum(indexed_dims)
         # smallest unsigned integer dtype compatible with all
         # indexed_datasets
         d_type = type_fitting.fit_integer_type(
             max(indexed_levels), is_signed=False)
         # FIXME at some point should become super.add_dataset(...)
         self.out['indexed'] = self.np2h5.add_dataset(
             self.group, 'indexed_data', n_columns=dim, item_type=d_type, fixed_size=False)
         with h5py.File(self.filename) as f:
             # necessary to access the part of the data corresponding to a
             # particular dataset
             f[self.group].create_dataset(
                 'indexed_cumudims', data=np.cumsum(indexed_dims), dtype=np.uint64)
     # fused datasets have a separate one dimensional dataset each
     self.key_weights = {}
     for fused_dset in self.fused_datasets:
         fused_dims = np.array(
             [dims[dset] for dset in self.fused_members[fused_dset]], dtype=np.uint64)
         max_key = np.prod(
             self.nb_levels[fused_dset] ** fused_dims) - np.uint64(1)
         if max_key >= 2 ** 64:
             raise ValueError(
                 'fused dataset %s in file %s cannot be created because '
                 '64-bit keys are not sufficient to cover all possible '
                 'combinations of the fused datasets'
                 % (fused_dset, self.filename))
         # smallest unsigned integer dtype compatible
         d_type = type_fitting.fit_integer_type(max_key, is_signed=False)
         # FIXME at some point should become super.add_dataset(...)
         self.out[fused_dset] = self.np2h5.add_dataset(
             self.group, fused_dset, n_columns=1, item_type=d_type, fixed_size=False)
         nb_levels_with_multiplicity = np.concatenate(
             [np.array(n, dtype=d_type) * np.ones(d, dtype=d_type)
              for n, d in zip(self.nb_levels[fused_dset], fused_dims)])
         self.key_weights[fused_dset] = np.concatenate(
             [np.array([1], dtype=d_type),
              np.cumprod(d_type(nb_levels_with_multiplicity))[:-1]])
         with h5py.File(self.filename) as f:
             f[self.group]['fused'][fused_dset].create_dataset(
                 'key_weights', data=self.key_weights[fused_dset], dtype=d_type)
Example No. 3
def collapse(scorefile, taskfile, fid):
    """Collapses the results for each triplets sharing the same on, across
    and by labels.

    """
    # We make the assumption that everything fits in memory...
    scorefid = h5py.File(scorefile)
    taskfid = h5py.File(taskfile)
    bys = taskfid['bys'][...]
    for by_idx, by in enumerate(bys):
        # print 'collapsing {0}/{1}'.format(by_idx + 1, len(bys))
        trip_attrs = taskfid['triplets']['by_index'][by_idx]

        tfrk = taskfid['regressors'][by]

        tmp = tfrk[u'indexed_data']
        indices = np.array(tmp)
        if indices.size == 0:
            continue
        tmp = scorefid['scores'][trip_attrs[0]:trip_attrs[1]]
        scores_arr = np.array(tmp)
        tmp = np.ascontiguousarray(indices).view(
            np.dtype((np.void, indices.dtype.itemsize * indices.shape[1])))
        n_indices = np.max(indices, 0) + 1
        assert np.prod(n_indices) < 18446744073709551615, "type not big enough"
        ind_type = fit_integer_type(np.prod(n_indices), is_signed=False)
        # encoding the indices of a triplet to a unique index
        new_index = indices[:, 0].astype(ind_type)
        for i in range(1, len(n_indices)):
            new_index = indices[:, i] + n_indices[i] * new_index

        permut = np.argsort(new_index)

        # collapsing the score
        sorted_scores = scores_arr[permut]
        sorted_index = new_index[permut]
        mean, unique_index, counts = unique(sorted_index, sorted_scores)

        # retrieving the triplet indices from the unique index.
        tmp = npdecode(unique_index, n_indices)

        regs = tfrk['indexed_datasets']
        indexes = []
        for reg in regs:
            indexes.append(tfrk['indexes'][reg][:])
        nregs = len(regs)

        for i, key in enumerate(tmp):
            aux = list()
            for j in range(nregs):
                aux.append(indexes[j][int(key[j])])
            score = mean[i]
            n = counts[i]
            result = aux + [by, score, int(n)]
            fid.write('\t'.join(map(str, result)) + '\n')
            # results.append(aux + [context, score, n])
            # wf_tmp.write('\t'.join(map(str, results[-1])) + '\n')
    scorefid.close()
    taskfid.close()
    del taskfid
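
The collapse above is a sort-then-reduce pattern: scores are reordered by their encoded regressor index and averaged per unique index. The project's unique() helper is not shown in these examples; the sketch below uses np.unique and np.add.reduceat as an assumed equivalent, with made-up data.

import numpy as np

# scores sharing the same encoded regressor index are averaged
new_index = np.array([3, 1, 3, 0, 1, 3])
scores = np.array([1., -1., 1., 1., 1., -1.])

permut = np.argsort(new_index, kind='stable')
sorted_index, sorted_scores = new_index[permut], scores[permut]
unique_index, starts, counts = np.unique(sorted_index,
                                         return_index=True,
                                         return_counts=True)
mean = np.add.reduceat(sorted_scores, starts) / counts
# unique_index -> [0, 1, 3], counts -> [1, 2, 3], mean -> [1., 0., 1/3]
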
Example No. 4
 def initialize_output_dsets(self, sample_data):
     # do some automatic conversion (maybe risky?)
     out = [
         o if hasattr(o, 'shape') else numpy.array(o)
         for o in sample_data[1]
     ]
     if isinstance(out, collections.Mapping):  # dict, DataFrame ...
         dim = [
             1 if len(out[o_name].shape) == 1 else out[o_name].shape[1]
             for o_name in self.output_names
         ]
         dtypes = [get_dtype(out[o_name]) for o_name in self.output_names]
     else:  # list, tuple ...
         dim = [1 if len(o.shape) == 1 else o.shape[1] for o in out]
         dtypes = [get_dtype(o) for o in out]
     with h5py.File(self.filename) as f:
         for o, d, t in zip(self.output_names, dim, dtypes):
             if o not in self.indexed_outputs:
                 f['data'].create_dataset(o, (0, d),
                                          dtype=t,
                                          chunks=(chunk_size(
                                              numpy.dtype(t).itemsize,
                                              d), d),
                                          maxshape=(None, d))
         indexed_o_dims = []
         indexed_o_levels = []
         # indexed outputs are stored in the order specified by
         # self.indexed_outputs
         for o in self.indexed_outputs:
             indexed_o_dims.append(dim[self.output_names.index(o)])
             indexed_o_levels.append(len(self.indexes[o]))
         d = sum(indexed_o_dims)
         # smallest unsigned integer dtype compatible with all
         # indexed_outputs
         t = type_fitting.fit_integer_type(max(indexed_o_levels),
                                           is_signed=False)
         f['data'].create_dataset('indexed_outputs', (0, d),
                                  dtype=t,
                                  chunks=(chunk_size(
                                      numpy.dtype(t).itemsize, d), d),
                                  maxshape=(None, d))
         # necessary to access the part of the dataset corresponding to a
         # particular output
         f['synopsis'].create_dataset('indexed_outputs_dims',
                                      data=numpy.cumsum(indexed_o_dims),
                                      dtype=numpy.int64)
Example No. 5
 def initialize_output_dsets(self, sample_data):
     # do some automatic conversion (maybe risky?)
     out = [o if hasattr(o, 'shape') else numpy.array(o)
            for o in sample_data[1]]
     if isinstance(out, collections.Mapping):  # dict, DataFrame ...
         dim = [1 if len(out[o_name].shape) == 1 else out[o_name].shape[1]
                for o_name in self.output_names]
         dtypes = [get_dtype(out[o_name]) for o_name in self.output_names]
     else:  # list, tuple ...
         dim = [1 if len(o.shape) == 1 else o.shape[1] for o in out]
         dtypes = [get_dtype(o) for o in out]
     with h5py.File(self.filename) as f:
         for o, d, t in zip(self.output_names, dim, dtypes):
             if o not in self.indexed_outputs:
                 f['data'].create_dataset(o, (0, d), dtype=t, chunks=(
                     chunk_size(numpy.dtype(t).itemsize, d), d),
                     maxshape=(None, d))
         indexed_o_dims = []
         indexed_o_levels = []
         # indexed outputs are stored in the order specified by
         # self.indexed_outputs
         for o in self.indexed_outputs:
             indexed_o_dims.append(dim[self.output_names.index(o)])
             indexed_o_levels.append(len(self.indexes[o]))
         d = sum(indexed_o_dims)
         # smallest unsigned integer dtype compatible with all
         # indexed_outputs
         t = type_fitting.fit_integer_type(
             max(indexed_o_levels), is_signed=False)
         f['data'].create_dataset(
             'indexed_outputs', (0, d), dtype=t,
             chunks=(chunk_size(numpy.dtype(t).itemsize, d), d),
             maxshape=(None, d))
         # necessary to access the part of the dataset corresponding to a
         # particular output
         f['synopsis'].create_dataset(
             'indexed_outputs_dims', data=numpy.cumsum(indexed_o_dims),
             dtype=numpy.int64)
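
Both versions of initialize_output_dsets create empty, chunked, resizable HDF5 datasets (shape (0, d), maxshape=(None, d)) that are grown later. A minimal sketch of that pattern with plain h5py follows; the file name, chunk size and dtype are arbitrary choices for the illustration.

import numpy as np
import h5py

d = 4
with h5py.File('example.h5', 'w') as f:
    # created empty along the first axis, unlimited via maxshape
    dset = f.create_dataset('data/output', shape=(0, d), maxshape=(None, d),
                            dtype=np.float64, chunks=(128, d))
    # append a block of rows by resizing, then writing into the new tail
    block = np.random.rand(10, d)
    dset.resize(dset.shape[0] + block.shape[0], axis=0)
    dset[-block.shape[0]:] = block
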
Example No. 6
File: task.py Project: mmmaat/ABXpy
    def generate_pairs(self, output=None):
        """Generate the pairs associated to the triplet list

        .. note:: This function is called by generate_triplets and should not
            be used independently
        """

        # FIXME change this to a random file name to avoid overwriting problems
        # default name for output file
        if output is None:
            (basename, _) = os.path.splitext(self.database)
            output = basename + '.abx'
        # list all pairs
        all_empty = True
        try:
            _, output_tmp = tempfile.mkstemp()
            for by, db in self.by_dbs.iteritems():
                # FIXME maybe care about this case earlier?
                if self.verbose > 0:
                    print("Writing AX/BX pairs to task file...")
                with h5py.File(output) as fh:
                    not_empty = fh['/triplets/' + str(by)].size
                if not_empty:
                    all_empty = False
                    max_ind = np.max(db.index.values)
                    pair_key_type = type_fitting.fit_integer_type(
                        (max_ind + 1) ** 2 - 1, is_signed=False)
                    with h52np.H52NP(output) as f_in:
                        with np2h5.NP2H5(output_tmp) as f_out:
                            inp = f_in.add_dataset('triplets', str(by))
                            out = f_out.add_dataset(
                                'pairs', str(by), n_columns=1,
                                item_type=pair_key_type, fixed_size=False)
                            # FIXME replace this by a for loop by making h52np
                            # implement the iterable pattern with next()
                            # outputting inp.read()
                            try:
                                while True:
                                    triplets = pair_key_type(inp.read())
                                    n = triplets.shape[0]
                                    ind = np.arange(n)
                                    i1 = 2 * ind
                                    i2 = 2 * ind + 1
                                    # would need to amend np2h5 and h52np to remove
                                    # the second dim...
                                    pairs = np.empty(
                                        shape=(2 * n, 1), dtype=pair_key_type)
                                    # FIXME change the encoding (and type_fitting)
                                    # so that A,B and B,A have the same code ...
                                    # (take a=min(a,b), b=max(a,b))
                                    # FIXME but allow a flag to control the
                                    # behavior to be able to enforce A,X and B,X
                                    # order when using asymmetrical distance
                                    # functions
                                    pairs[i1, 0] = triplets[:, 0] + (
                                        max_ind + 1) * triplets[:, 2]  # AX
                                    pairs[i2, 0] = triplets[:, 1] + (
                                        max_ind + 1) * triplets[:, 2]  # BX
                                    # FIXME do a unique here already? Do not store
                                    # the inverse mapping ? (could sort triplets on
                                    # pair1, complete pair1, sort on pair2,
                                    # complete pair 2 and shuffle ?)
                                    out.write(pairs)
                            except StopIteration:
                                pass
                    # sort pairs
                    handler = h5_handler.H5Handler(output_tmp, '/pairs/', str(by))
                    # memory: available RAM in MB, could be a param
                    memory = 1000
                    # estimate of the amount of data to be sorted
                    with h5py.File(output_tmp) as fh:
                        n = fh['/pairs/' + str(by)].shape[0]
                        i = fh['/pairs/' + str(by)].dtype.itemsize
                    amount = n * i  # in bytes
                    # harmonize units to KB:
                    memory = 1000 * memory
                    amount = amount / 1000.
                    # be conservative: aim at using no more than 3/4 the available
                    # memory
                    # if enough memory take one chunk (this will do an unnecessary
                    # full write and read of the file... could be optimized easily)
                    if amount <= 0.75 * memory:
                        # would it be beneficial to have a large o_buffer_size as
                        # well ?
                        handler.sort(buffer_size=amount)
                    # else take around 30 chunks if possible (this seems efficient
                    # given the current implementation; using a larger number of
                    # chunks efficiently might be possible if the chunk-reading
                    # part of the sort was cythonized?)
                    elif amount / 30. <= 0.75 * memory:
                        handler.sort(buffer_size=amount / 30.)
                    # else take minimum number of chunks possible given the
                    # available RAM
                    else:
                        handler.sort(buffer_size=0.75 * memory)

                    # FIXME should have a unique function directly instead of
                    # sorting + unique ?
                    with h52np.H52NP(output_tmp) as f_in:
                        with np2h5.NP2H5(output) as f_out:
                            inp = f_in.add_dataset('pairs', str(by))
                            out = f_out.add_dataset(
                                'unique_pairs', str(by), n_columns=1,
                                item_type=pair_key_type, fixed_size=False)
                            try:
                                last = -1
                                while True:
                                    pairs = inp.read()
                                    pairs = np.unique(pairs)
                                    # unique alters the shape
                                    pairs = np.reshape(pairs, (pairs.shape[0], 1))
                                    if pairs[0, 0] == last:
                                        pairs = pairs[1:]
                                    if pairs.size > 0:
                                        last = pairs[-1, 0]
                                        out.write(pairs)
                            except StopIteration:
                                pass
                    # store for later decoding
                    with h5py.File(output) as fh:
                        fh['/unique_pairs'].attrs[str(by)] = max_ind + 1
                    store = pd.HDFStore(output)
                    # use append to get the table format, which handles strings
                    # without wasting space (unlike the fixed-size format)
                    store.append('/feat_dbs/' + str(by), self.feat_dbs[by],
                                 expectedrows=len(self.feat_dbs[by]))
                    store.close()
                    # FIXME generate inverse mapping to triplets (1 and 2) ?
        finally:
            os.remove(output_tmp)
        if self.verbose > 0:
            print("done.")
Example No. 7
File: task.py Project: mmmaat/ABXpy
    def on_across_triplets(self, by, on, across, on_across_block,
                           on_across_by_values, with_regressors=True):
        """Generate all possible triplets for a given by block.

        Given an on_across_block of the database and the parameters of the \
        task, this function will generate the complete set of triplets and \
        the regressors.

        Parameters
        ----------
        by : int
            The block index
        on, across : int
            The task attributes
        on_across_block : list
            the block
        on_across_by_values : dict
            the actual values
        with_regressors : bool, optional
            By default, true

        Returns
        -------
        triplets : numpy.ndarray
            the set of triplets generated
        regressors : numpy.ndarray
            the regressors generated
        """
        # find all possible A, B, X where A and X have the 'on' feature of the
        # block and A and B have the 'across' feature of the block
        A = np.array(on_across_block, dtype=self.types[by])
        on_set = set(self.on_blocks[by].groups[on])
        # FIXME quick fix to process the case with no across, but better done
        # in a separate loop ...
        if self.across == ['#across']:
            # in this case A is a singleton and B can be anything in the by
            # block that doesn't have the same 'on' as A
            B = np.array(
                list(set(self.by_dbs[by].index).difference(on_set)),
                dtype=self.types[by])
        else:
            B = self.across_blocks[by].groups[across]
            # remove B with the same 'on' as A
            B = np.array(list(set(B).difference(A)), dtype=self.types[by])
        # remove X with the same 'across' as A

        if type(across) is tuple:
            antiacross_set = set(self.antiacross_blocks[by][across])
            X = np.array(list(antiacross_set & on_set), dtype=self.types[by])
        else:
            X = np.array(list(on_set.difference(A)), dtype=self.types[by])

        # apply singleton filters
        db = self.by_dbs[by]

        if self.filters.A:
            A = self.filters.A_filter(on_across_by_values, db, A)
        if self.filters.B:
            B = self.filters.B_filter(on_across_by_values, db, B)
        if self.filters.X:
            X = self.filters.X_filter(on_across_by_values, db, X)

        # instantiate A, B, X regressors here
        if with_regressors:
            self.regressors.set_A_regressors(on_across_by_values, db, A)
            self.regressors.set_B_regressors(on_across_by_values, db, B)
            self.regressors.set_X_regressors(on_across_by_values, db, X)

        # A, B, X can then be combined efficiently in a full (or randomly
        # sampled) factorial design
        size = len(A) * len(B) * len(X)

        if size > 0:
            ind_type = type_fitting.fit_integer_type(size, is_signed=False)
            # if sampling in the absence of triplets filters, do it here
            if self.sampling and not self.filters.ABX:
                indices = self.sampler.sample(size, dtype=ind_type)
            else:
                indices = np.arange(size, dtype=ind_type)
            # generate triplets from indices
            iX = np.mod(indices, len(X))
            iB = np.mod(np.divide(indices, len(X)), len(B))
            iA = np.divide(indices, len(B) * len(X))
            triplets = np.column_stack((A[iA], B[iB], X[iX]))

            # apply triplets filters
            if self.filters.ABX:
                triplets = self.filters.ABX_filter(
                    on_across_by_values, db, triplets)
                size = triplets.shape[0]
                # if sampling in the presence of triplets filters, do it here
                if self.sampling:
                    ind_type = type_fitting.fit_integer_type(
                        size, is_signed=False)
                    indices = self.sampler.sample(size, dtype=ind_type)
                    triplets = triplets[indices, :]
        else:
            triplets = np.empty(shape=(0, 3), dtype=self.types[by])
            indices = np.empty(shape=size, dtype=np.uint8)
            iA = indices
            iB = indices
            iX = indices

        if with_regressors:
            if self.regressors.ABX:  # instantiate ABX regressors here
                self.regressors.set_ABX_regressors(
                    on_across_by_values, db, triplets)

            # self.regressors.XXX contains either (for by and on_across_by)
            #   [[scalar_output_1_dbfun_1, scalar_output_2_dbfun_1,...],
            #    [scalar_output_1_dbfun_2, ...], ...]
            # or:
            #   [[np_array_output_1_dbfun_1, np_array_output_2_dbfun_1,...],
            #    [np_array_output_1_dbfun_2, ...], ...]
            # FIXME change manager API so that self.regressors.A contains the
            # data and not the list of dbfun_s ?
            regressors = {}
            scalar_names = self.regressors.by_names + \
                self.regressors.on_across_by_names
            scalar_regressors = self.regressors.by_regressors + \
                self.regressors.on_across_by_regressors
            for names, regs in zip(scalar_names, scalar_regressors):
                for name, reg in zip(names, regs):
                    regressors[name] = np.tile(np.array(reg),
                                               (np.size(triplets, 0), 1))
            for names, regs in zip(self.regressors.A_names,
                                   self.regressors.A_regressors):
                for name, reg in zip(names, regs):
                    regressors[name] = reg[iA]
            for names, regs in zip(self.regressors.B_names,
                                   self.regressors.B_regressors):
                for name, reg in zip(names, regs):
                    regressors[name] = reg[iB]
            for names, regs in zip(self.regressors.X_names,
                                   self.regressors.X_regressors):
                for name, reg in zip(names, regs):
                    regressors[name] = reg[iX]
            # FIXME implement this
            # for names, regs in zip(self.regressors.ABX_names,
            #                        self.regressors.ABX_regressors):
            #    for name, reg in zip(names, regs):
            #        regressors[name] = reg[indices,:]
            return triplets, regressors
        else:
            return triplets
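
The triplet generation above enumerates the A x B x X factorial design through a single flat index and splits it into iA, iB, iX (np.divide in the original presumably relied on Python 2 integer division). A sketch of that decomposition with explicit integer division; nA, nB, nX are illustrative set sizes, not names from the project.

import numpy as np

nA, nB, nX = 3, 4, 5
indices = np.arange(nA * nB * nX, dtype=np.uint32)
iX = indices % nX
iB = (indices // nX) % nB
iA = indices // (nB * nX)
# round-trip check: the flat index is recovered from (iA, iB, iX)
assert np.all(indices == (iA * nB + iB) * nX + iX)
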
Example No. 8
File: task.py Project: mmmaat/ABXpy
    def __init__(self, db_name, on, across=None, by=None, filters=None,
                 regressors=None, verbose=0):

        self.verbose = verbose
        assert os.path.exists(db_name), (
            'the item file {0} was not found'.format(db_name))

        if across is None:
            across = []
        if by is None:
            by = []
        if filters is None:
            filters = []
        if regressors is None:
            regressors = []

        # check parameters
        # using several 'on' isn't supported by the toolbox
        assert isinstance(on, basestring), \
            'ON attribute must be specified by a string'
        on = [on]
        if isinstance(across, basestring):
            across = [across]
        if isinstance(by, basestring):
            by = [by]

        if verbose:
            print("Verifying input...")

        # open database
        db, db_hierarchy, feat_db = database.load(db_name, features_info=True)

        # check that required columns are present
        cols = set(db.columns)
        message = (' argument is invalid, check that all the provided '
                   'attributes are defined in the database ' + db_name)
        # the argument of issuperset needs to be a list ...
        assert cols.issuperset(on), 'ON' + message
        assert cols.issuperset(across), 'ACROSS' + message
        assert cols.issuperset(by), 'BY' + message
        # FIXME add additional checks, for example that columns
        # in BY, ACROSS, ON are not the same ? (see task structure notes)
        # also that location columns are not used
        for col in cols:
            assert '_' not in col, \
                col + ': you cannot use underscore in column names'
            assert '#' not in col, \
                col + ": you cannot use '#' in column names"
        if verbose:
            print("Input verified")

        # if 'by' or 'across' are empty create appropriate dummy columns
        # (note that '#' is forbidden in user names for columns)
        if not by:
            db['#by'] = 0
            by = ['#by']
        if not across:
            db['#across'] = range(len(db))
            across = ['#across']
        # note that these additional columns are not in the db_hierarchy,
        # but I don't think this is problematic

        self.filters = filter_manager.FilterManager(db_hierarchy,
                                                    on, across, by,
                                                    filters)
        self.regressors = regressor_manager.RegressorManager(db,
                                                             db_hierarchy,
                                                             on, across, by,
                                                             regressors)

        self.sampling = False

        # prepare the database for generating the triplets
        self.by_dbs = {}
        self.feat_dbs = {}
        self.on_blocks = {}
        self.across_blocks = {}
        self.on_across_blocks = {}
        self.antiacross_blocks = {}
        by_groups = db.groupby(by)

        if self.verbose > 0:
            display = progress_display.ProgressDisplay()
            display.add('block', 'Preprocessing by block', len(by_groups))

        for by_key, by_frame in by_groups:
            if self.verbose > 0:
                display.update('block', 1)
                display.display()

            # allows getting the 'by' values as well as values of other
            # variables that are determined by them
            by_values = dict(by_frame.iloc[0])
            # apply 'by' filters
            if self.filters.by_filter(by_values):
                # get analogous feat_db
                by_feat_db = feat_db.iloc[by_frame.index]
                # drop indexes
                by_frame = by_frame.reset_index(drop=True)
                # reset_index to get an index relative to the 'by' db; the
                # original index could be preserved in an additional 'index'
                # column if necessary by removing the drop=True, but this
                # would add another constraint on the possible column names
                by_feat_db = by_feat_db.reset_index(drop=True)
                # apply generic filters
                by_frame = self.filters.generic_filter(by_values, by_frame)
                self.by_dbs[by_key] = by_frame
                self.feat_dbs[by_key] = by_feat_db
                self.on_blocks[by_key] = self.by_dbs[by_key].groupby(on)
                self.across_blocks[by_key] = self.by_dbs[
                    by_key].groupby(across)
                self.on_across_blocks[by_key] = self.by_dbs[
                    by_key].groupby(on + across)
                if len(across) > 1:
                    self.antiacross_blocks[by_key] = dict()
                    for across_key in (self.across_blocks[by_key]
                                       .groups.iterkeys()):
                        b = True
                        for i, col in enumerate(across):
                            b = b * (by_frame[col] != across_key[i])
                        self.antiacross_blocks[by_key][
                            across_key] = by_frame[b].index

        # store parameters
        self.database = db_name
        self.db = db
        self.db_hierarchy = db_hierarchy
        self.on = on
        self.across = across
        self.by = by

        # determine the appropriate numeric type to represent indices
        # (currently used only for numpy arrays and h5 storage, might also be
        # used for pandas frames)
        types = {}
        for key, db in self.by_dbs.iteritems():
            # len(db)-1 wouldn't work here because there could be missing indices
            # due to generic filtering
            n = np.max(db.index.values)
            types[key] = type_fitting.fit_integer_type(n, is_signed=False)

        self.types = types

        # compute some statistics about the task
        self.compute_statistics()
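
type_fitting.fit_integer_type appears throughout these examples but its body is not shown. The stand-in below is only an assumption about its stated intent (the smallest unsigned integer dtype able to hold a given maximum value, as the comments in the examples describe), not the project's actual code.

import numpy as np

def fit_unsigned_type(n):
    # pick the smallest unsigned integer dtype able to hold values up to n
    for dt in (np.uint8, np.uint16, np.uint32, np.uint64):
        if n <= np.iinfo(dt).max:
            return dt
    raise ValueError('value %d does not fit in 64-bit unsigned integers' % n)

assert fit_unsigned_type(255) is np.uint8
assert fit_unsigned_type(70000) is np.uint32
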
Example No. 9
def score(task_file, distance_file, score_file=None, score_group='scores'):
    """Calculate the score of a task and put the results in a hdf5 file.

    Parameters
    ----------
    task_file : string
        The hdf5 file containing the task (with the triplets and pairs
        generated)
    distance_file : string
        The hdf5 file containing the distances between the pairs
    score_file : string, optional
        The hdf5 file that will contain the results
    """
    if score_file is None:
        (basename_task, _) = os.path.splitext(task_file)
        (basename_dist, _) = os.path.splitext(distance_file)
        score_file = basename_task + '_' + basename_dist + '.score'
    # file verification:
    assert os.path.exists(task_file), 'Cannot find task file ' + task_file
    assert os.path.exists(distance_file), ('Cannot find distance file ' +
                                           distance_file)
    assert not os.path.exists(score_file), ('score file already exists ' +
                                            score_file)
    # with h5py.File(task_file) as t:
    #     bys = [by for by in t['triplets']]
    # FIXME skip empty by datasets, this should not be necessary anymore when
    # empty datasets are filtered at the task file generation level
    with h5py.File(task_file, 'r') as t:
        bys = t['bys'][...]
        # bys = t['feat_dbs'].keys()
        n_triplets = t['triplets']['data'].shape[0]
    with h5py.File(score_file, 'w') as s:
        s.create_dataset('scores', (n_triplets, 1), dtype=np.int8)
        for n_by, by in enumerate(bys):
            with h5py.File(task_file, 'r') as t, h5py.File(distance_file,
                                                           'r') as d:
                trip_attrs = t['triplets']['by_index'][n_by]
                pair_attrs = t['unique_pairs'].attrs[by]
                # FIXME here we make the assumption
                # that this fits into memory ...
                dis = d['distances']['data'][pair_attrs[1]:pair_attrs[2]][...]
                dis = np.reshape(dis, dis.shape[0])
                # FIXME idem + only unique_pairs used ?
                pairs = t['unique_pairs']['data'][pair_attrs[1]:pair_attrs[2]][
                    ...]
                pairs = np.reshape(pairs, pairs.shape[0])
                base = pair_attrs[0]
                pair_key_type = type_fitting.fit_integer_type((base)**2 - 1,
                                                              is_signed=False)
            with h52np.H52NP(task_file) as t:
                inp = t.add_subdataset('triplets', 'data', indexes=trip_attrs)
                idx_start = trip_attrs[0]
                for triplets in inp:
                    triplets = pair_key_type(triplets)
                    idx_end = idx_start + triplets.shape[0]

                    pairs_AX = triplets[:, 0] + base * triplets[:, 2]
                    # FIXME change the encoding (and type_fitting) so that
                    # A,B and B,A have the same code ... (take a=min(a,b),
                    # b=max(a,b))
                    pairs_BX = triplets[:, 1] + base * triplets[:, 2]
                    dis_AX = dis[np.searchsorted(pairs, pairs_AX)]

                    dis_BX = dis[np.searchsorted(pairs, pairs_BX)]
                    scores = (np.int8(dis_AX < dis_BX) -
                              np.int8(dis_AX > dis_BX))
                    # 1 if X is closer to A, -1 if X is closer to B, 0 if the
                    # distances are equal (0, 1/2, 1 are not used so that the
                    # compact np.int8 data format can be kept)
                    s['scores'][idx_start:idx_end] = np.reshape(
                        scores, (-1, 1))
                    idx_start = idx_end
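
The inner loop of score boils down to two sorted lookups with np.searchsorted followed by a signed comparison. A tiny self-contained sketch with made-up pair keys and distances:

import numpy as np

pairs = np.array([2, 5, 9, 14])        # sorted unique pair keys
dis = np.array([0.1, 0.7, 0.6, 0.4])   # distance associated with each key
pairs_AX = np.array([2, 9])
pairs_BX = np.array([5, 14])
dis_AX = dis[np.searchsorted(pairs, pairs_AX)]
dis_BX = dis[np.searchsorted(pairs, pairs_BX)]
scores = np.int8(dis_AX < dis_BX) - np.int8(dis_AX > dis_BX)
# scores -> [1, -1]: X is closer to A in the first triplet, to B in the second
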
Example No. 10
def score(task_file, distance_file, score_file=None, score_group="scores"):
    """Calculate the score of a task and put the results in a hdf5 file.

    Parameters
    ----------
    task_file : string
        The hdf5 file containing the task (with the triplets and pairs
        generated)
    distance_file : string
        The hdf5 file containing the distances between the pairs
    score_file : string, optional
        The hdf5 file that will contain the results
    """
    if score_file is None:
        (basename_task, _) = os.path.splitext(task_file)
        (basename_dist, _) = os.path.splitext(distance_file)
        score_file = basename_task + "_" + basename_dist + ".score"
    # file verification:
    assert os.path.exists(task_file), "Cannot find task file " + task_file
    assert os.path.exists(distance_file), "Cannot find distance file " + distance_file
    assert not os.path.exists(score_file), "score file already exist " + score_file
    # with h5py.File(task_file) as t:
    # bys = [by for by in t['triplets']]
    # FIXME skip empty by datasets, this should not be necessary anymore when
    # empty datasets are filtered at the task file generation level
    with h5py.File(distance_file) as d:
        bys = [by for by in d["distances"]]
    for by in bys:
        with h5py.File(task_file) as t, h5py.File(distance_file) as d:
            n = t["triplets"][by].shape[0]
            # FIXME here we make the assumption
            # that this fits into memory ...
            dis = d["distances"][by][...]
            dis = np.reshape(dis, dis.shape[0])
            # FIXME idem + only unique_pairs used ?
            pairs = t["unique_pairs"][by][...]
            pairs = np.reshape(pairs, pairs.shape[0])
            base = t["unique_pairs"].attrs[by]
            pair_key_type = type_fitting.fit_integer_type((base) ** 2 - 1, is_signed=False)
        with h52np.H52NP(task_file) as t:
            with np2h5.NP2H5(score_file) as s:
                inp = t.add_dataset("triplets", by)
                out = s.add_dataset("scores", by, n_rows=n, n_columns=1, item_type=np.int8)
                try:  # FIXME replace this by a for loop by making h52np
                    # implement the iterable pattern with next() outputting
                    # inp.read()
                    while True:  # FIXME keep the pairs in the file ?
                        triplets = pair_key_type(inp.read())
                        pairs_AX = triplets[:, 0] + base * triplets[:, 2]
                        # FIXME change the encoding (and type_fitting) so that
                        # A,B and B,A have the same code ... (take a=min(a,b),
                        # b=max(a,b))
                        pairs_BX = triplets[:, 1] + base * triplets[:, 2]
                        dis_AX = dis[np.searchsorted(pairs, pairs_AX)]
                        dis_BX = dis[np.searchsorted(pairs, pairs_BX)]
                        scores = np.int8(dis_AX < dis_BX) - np.int8(dis_AX > dis_BX)
                        # 1 if X is closer to A, -1 if X is closer to B, 0 if
                        # the distances are equal (0, 1/2, 1 are not used so
                        # that the compact np.int8 data format can be kept)
                        out.write(np.reshape(scores, (scores.shape[0], 1)))
                except StopIteration:
                    pass
Example No. 11
def score(task_file, distance_file, score_file=None, score_group='scores'):
    """Calculate the score of a task and put the results in a hdf5 file.

    Parameters
    ----------
    task_file : string
        The hdf5 file containing the task (with the triplets and pairs
        generated)
    distance_file : string
        The hdf5 file containing the distances between the pairs
    score_file : string, optional
        The hdf5 file that will contain the results
    """
    if score_file is None:
        (basename_task, _) = os.path.splitext(task_file)
        (basename_dist, _) = os.path.splitext(distance_file)
        score_file = basename_task + '_' + basename_dist + '.score'
    # file verification:
    assert os.path.exists(task_file), 'Cannot find task file ' + task_file
    assert os.path.exists(distance_file), ('Cannot find distance file ' +
                                           distance_file)
    assert not os.path.exists(score_file), ('score file already exists ' +
                                            score_file)
    # with h5py.File(task_file) as t:
    #bys = [by for by in t['triplets']]
    # FIXME skip empty by datasets, this should not be necessary anymore when
    # empty datasets are filtered at the task file generation level
    with h5py.File(task_file) as t:
        bys = t['bys'][...]
        # bys = t['feat_dbs'].keys()
        n_triplets = t['triplets']['data'].shape[0]
    with h5py.File(score_file) as s:
        s.create_dataset('scores', (n_triplets, 1), dtype=np.int8)
        for n_by, by in enumerate(bys):
            with h5py.File(task_file) as t, h5py.File(distance_file) as d:
                trip_attrs = t['triplets']['by_index'][n_by]
                pair_attrs = t['unique_pairs'].attrs[by]
                # FIXME here we make the assumption
                # that this fits into memory ...
                dis = d['distances']['data'][pair_attrs[1]:pair_attrs[2]][...]
                dis = np.reshape(dis, dis.shape[0])
                # FIXME idem + only unique_pairs used ?
                pairs = t['unique_pairs']['data'][pair_attrs[1]:pair_attrs[2]][...]
                pairs = np.reshape(pairs, pairs.shape[0])
                base = pair_attrs[0]
                pair_key_type = type_fitting.fit_integer_type((base) ** 2 - 1,
                                                              is_signed=False)
            with h52np.H52NP(task_file) as t:
                inp = t.add_subdataset('triplets', 'data', indexes=trip_attrs)
                idx_start = trip_attrs[0]
                for triplets in inp:
                    triplets = pair_key_type(triplets)
                    idx_end = idx_start + triplets.shape[0]

                    pairs_AX = triplets[:, 0] + base * triplets[:, 2]
                    # FIXME change the encoding (and type_fitting) so that
                    # A,B and B,A have the same code ... (take a=min(a,b),
                    # b=max(a,b))
                    pairs_BX = triplets[:, 1] + base * triplets[:, 2]
                    dis_AX = dis[np.searchsorted(pairs, pairs_AX)]

                    dis_BX = dis[np.searchsorted(pairs, pairs_BX)]
                    scores = (np.int8(dis_AX < dis_BX) -
                              np.int8(dis_AX > dis_BX))
                    # 1 if X is closer to A, -1 if X is closer to B, 0 if the
                    # distances are equal (0, 1/2, 1 are not used so that the
                    # compact np.int8 data format can be kept)
                    s['scores'][idx_start:idx_end] = np.reshape(scores, (-1, 1))
                    idx_start = idx_end
Example No. 12
def collapse(scorefile, taskfile, fid):
    """Collapses the results for each triplets sharing the same on, across and
    by labels.
    """
    # wf_tmp = open('tmp_pandas.txt', 'wb')
    scorefid = h5py.File(scorefile)
    taskfid = h5py.File(taskfile)
    nkeys = len(scorefid['scores'].keys())
    # results = []
    for key_idx, key in enumerate(scorefid['scores'].keys()):
        print('collapsing {0}/{1}'.format(key_idx + 1, nkeys))
        context = key

        tfrk = taskfid['regressors'][key]

        tmp = tfrk[u'indexed_data']
        indices = np.array(tmp)
        if indices.size == 0:
            continue
        tmp = scorefid['scores'][key]
        scores_arr = np.array(tmp)
        tmp = np.ascontiguousarray(indices).view(
            np.dtype((np.void, indices.dtype.itemsize * indices.shape[1])))
        n_indices = np.max(indices, 0) + 1
        if np.prod(n_indices) > 18446744073709551615:
            print "type not big enough"
        ind_type = type_fitting.fit_integer_type(np.prod(n_indices),
                                                 is_signed=False)
        # encoding the indices of a triplet to a unique index
        new_index = indices[:, 0].astype(ind_type)
        for i in range(1, len(n_indices)):
            new_index = indices[:, i] + n_indices[i] * new_index

        permut = np.argsort(new_index)
        i_unique = 0
        # collapsing the score
        key_reg = new_index[permut[0]]
        mean = np.empty((len(permut), 3))
        mean[0] = [key_reg, scores_arr[permut[0]], 0]
        i_start = 0
        for i, p in enumerate(permut[1:]):
            i += 1
            if new_index[p] != key_reg:
                mean[i_unique, 1] = (np.mean(scores_arr[permut[i_start:i]])
                                     + 1) / 2
                mean[i_unique, 2] = i - i_start
                i_start = i
                i_unique += 1
                key_reg = new_index[p]
                mean[i_unique] = [key_reg, 0, 0]

        mean[i_unique] = [key_reg, (np.mean(scores_arr[permut[i_start:i + 1]])
                                    + 1) / 2, i - i_start + 1]
        mean = np.resize(mean, (i_unique + 1, 3))

        # retrieving the triplet indices from the unique index.
        tmp = npdecode(mean[:, 0], n_indices)

        regs = tfrk['indexed_datasets']
        indexes = []
        for reg in regs:
            indexes.append(tfrk['indexes'][reg][:])
        nregs = len(regs)

        for i, key in enumerate(tmp):
            aux = list()
            for j in range(nregs):
                aux.append(indexes[j][key[j]])
                # aux.append((indexes[regs[j]])[key[j]])
            score = mean[i, 1]
            n = mean[i, 2]
            result = aux + [context, score, int(n)]
            fid.write('\t'.join(map(str, result)) + '\n')
            # results.append(aux + [context, score, n])
            # wf_tmp.write('\t'.join(map(str, results[-1])) + '\n')
    scorefid.close()
    taskfid.close()
    del taskfid
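
npdecode, used above to map the unique index back to per-regressor indices, is another helper whose body is not listed here. The sketch below inverts the encoding new_index = indices[:, 0]; new_index = indices[:, i] + n_indices[i] * new_index used in these collapse functions; it is an assumption about npdecode's intent rather than its implementation.

import numpy as np

def npdecode_sketch(keys, n_indices):
    # peel off digits from least to most significant, mirroring the encoding
    keys = np.asarray(keys, dtype=np.uint64).copy()
    out = np.empty((len(keys), len(n_indices)), dtype=np.uint64)
    for j in range(len(n_indices) - 1, -1, -1):
        out[:, j] = keys % np.uint64(n_indices[j])
        keys //= np.uint64(n_indices[j])
    return out

# ((2 * 5) + 4) * 2 + 1 == 29 encodes the index triple (2, 4, 1)
assert (npdecode_sketch([29], [3, 5, 2]) == [[2, 4, 1]]).all()
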
Example No. 13
def collapse(scorefile, taskfile, fid):
    """Collapses the results for each triplets sharing the same on, across
    and by labels.

    """
    # We make the assumption that everything fits in memory...
    scorefid = h5py.File(scorefile)
    taskfid = h5py.File(taskfile)
    bys = taskfid['bys'][...]
    for by_idx, by in enumerate(bys):
        # print 'collapsing {0}/{1}'.format(by_idx + 1, len(bys))
        trip_attrs = taskfid['triplets']['by_index'][by_idx]

        tfrk = taskfid['regressors'][by]

        tmp = tfrk[u'indexed_data']
        indices = np.array(tmp)
        if indices.size == 0:
            continue
        tmp = scorefid['scores'][trip_attrs[0]:trip_attrs[1]]
        scores_arr = np.array(tmp)
        tmp = np.ascontiguousarray(indices).view(
            np.dtype((np.void, indices.dtype.itemsize * indices.shape[1])))
        n_indices = np.max(indices, 0) + 1
        assert np.prod(n_indices) < 18446744073709551615, "type not big enough"
        ind_type = fit_integer_type(np.prod(n_indices),
                                    is_signed=False)
        # encoding the indices of a triplet to a unique index
        new_index = indices[:, 0].astype(ind_type)
        for i in range(1, len(n_indices)):
            new_index = indices[:, i] + n_indices[i] * new_index

        permut = np.argsort(new_index)

        # collapsing the score
        sorted_scores = scores_arr[permut]
        sorted_index = new_index[permut]
        mean, unique_index, counts = unique(sorted_index, sorted_scores)

        # retrieving the triplet indices from the unique index.
        tmp = npdecode(unique_index, n_indices)

        regs = tfrk['indexed_datasets']
        indexes = []
        for reg in regs:
            indexes.append(tfrk['indexes'][reg][:])
        nregs = len(regs)

        for i, key in enumerate(tmp):
            aux = list()
            for j in range(nregs):
                aux.append(indexes[j][int(key[j])])
            score = mean[i]
            n = counts[i]
            result = aux + [by, score, int(n)]
            fid.write('\t'.join(map(str, result)) + u'\n')
            # results.append(aux + [context, score, n])
            # wf_tmp.write('\t'.join(map(str, results[-1])) + '\n')
    scorefid.close()
    taskfid.close()
    del taskfid