Пример #1
0
def fps_to_arena(fps_reader, metadata=None, reorder=True, alignment=None):
    """Load every fingerprint from `fps_reader` into a new FingerprintArena.

    Each fingerprint is written into a fixed-size slot of `storage_size`
    bytes: the fingerprint byte length rounded up to a multiple of
    `alignment`, with NUL bytes filling the remainder of the slot.

    :param fps_reader: iterable of (id, fingerprint) pairs. Its `metadata`
        attribute is used when `metadata` is None.
    :param metadata: object with `num_bits` and `num_bytes` attributes, or
        None to use `fps_reader.metadata`
    :param reorder: if true, and the metadata has a bit count, build the
        arena with make_sorted_aligned_arena and keep its popcount table;
        otherwise keep the fingerprints in input order
    :param alignment: byte alignment for each fingerprint slot, or None to
        use `get_optimal_alignment(num_bits)`
    :returns: a new FingerprintArena
    :raises ValueError: if the metadata has neither `num_bits` nor
        `num_bytes`, or if a fingerprint does not have `num_bytes` bytes
    """
    if metadata is None:
        metadata = fps_reader.metadata

    num_bits = metadata.num_bits
    num_bytes = metadata.num_bytes
    if not num_bits:
        if num_bytes is None:
            raise ValueError("metadata must contain at least one of num_bits or num_bytes")
        num_bits = num_bytes * 8
    elif num_bytes is None:
        # Only the bit count was given. Derive the byte length instead of
        # failing later on "None % alignment".
        num_bytes = (num_bits + 7) // 8

    if alignment is None:
        alignment = get_optimal_alignment(num_bits)

    # Round the per-fingerprint slot size up to a multiple of the alignment.
    storage_size = num_bytes
    remainder = storage_size % alignment
    if remainder:
        pad_length = alignment - remainder
        record_padding = "\0" * pad_length
        storage_size += pad_length
    else:
        record_padding = None

    ids = []
    unsorted_fps = StringIO()
    try:
        for (fp_id, fp) in fps_reader:
            if len(fp) != num_bytes:
                raise ValueError("Fingerprint for id %r has %d bytes while the metadata says it should have %d"
                                 % (fp_id, len(fp), num_bytes))
            unsorted_fps.write(fp)
            if record_padding:
                unsorted_fps.write(record_padding)
            ids.append(fp_id)

        unsorted_arena = unsorted_fps.getvalue()
    finally:
        # Release the buffer even when a malformed fingerprint aborts the load.
        unsorted_fps.close()
        unsorted_fps = None

    # The popcount table below is sized from metadata.num_bits, so metadata
    # without a bit count is never reordered, even if reorder was requested.
    if not reorder or not metadata.num_bits:
        start_padding, end_padding, unsorted_arena = _chemfp.make_unsorted_aligned_arena(
            unsorted_arena, alignment)
        return FingerprintArena(metadata, alignment, start_padding, end_padding, storage_size,
                                unsorted_arena, "", ids)

    # Reorder. `ordering` and `popcounts` are output buffers handed to the C
    # helper; each `ordering` item exposes the original position as `.index`.
    ordering = (ChemFPOrderedPopcount*len(ids))()
    popcounts = array.array("i", (0,)*(metadata.num_bits+2))

    start_padding, end_padding, sorted_arena = _chemfp.make_sorted_aligned_arena(
        num_bits, storage_size, unsorted_arena, len(ids),
        ordering, popcounts, alignment)

    # Put the ids into the same order as the rearranged fingerprints.
    new_ids = [ids[item.index] for item in ordering]
    return FingerprintArena(metadata, alignment,
                            start_padding, end_padding, storage_size,
                            sorted_arena, popcounts.tostring(), new_ids)
Пример #2
0
    def copy(self, indices=None, reorder=None):
        """Create a new arena using either all or some of the fingerprints in this arena

        By default this creates a new arena. The fingerprint data block and ids may
        be shared with the original arena, which makes this a shallow copy. If the
        original arena is a slice, or "sub-arena" of an arena, then the copy will
        allocate new space to store just the fingerprints in the slice and use its
        own list for the ids.

        The `indices` parameter, if not None, is an iterable which contains the
        indices of the fingerprint records to copy. Duplicates are allowed, though
        discouraged.

        If indices are specified then the default `reorder=None` or a `reorder=True`
        will reorder the fingerprints for the new arena by popcount. This improves
        overall search performance. With `reorder=False`, the fingerprints will be
        in order given by the indices.

        If indices are not given, then the default is to preserve the order type of
        the original arena. Otherwise `reorder=True` will always reorder and
        `reorder=False` will leave them in the current order.

        :param indices: indices of the records to copy into the new arena
        :type indices: iterable containing integers, or None
        :param reorder: describes how to order the fingerprints
        :type reorder: True to reorder, False to leave in input order, None for default action
        :returns: a new FingerprintArena
        :raises IndexError: if one of the `indices` is rejected by the arena's range check
        """
        if reorder is None:
            if indices is None:
                # This is a pure copy. Reorder only if there are popcount indices.
                reorder = (self.popcount_indices != "")
            else:
                # The default is to go fast. If you want to preserve index order
                # then you'll need to set reorder=False
                reorder = True

        if indices is None:
            # Make a completely new arena
            # Handle the trivial case where I don't need to do anything:
            # this arena covers its whole data block and is already in the
            # requested order, so the block and the ids can be shared.
            if (self.start == 0 and
                (self.end*self.storage_size + self.start_padding + self.end_padding == len(self.arena)) and
                (not reorder or self.popcount_indices)):
                return FingerprintArena(self.metadata, self.alignment,
                                        self.start_padding, self.end_padding, self.storage_size, self.arena,
                                        self.popcount_indices, self.arena_ids,
                                        start = 0, end = self.end,
                                        id_lookup = self._id_lookup)

            # Otherwise I need to do some work
            # Make a copy of the actual fingerprints. (Which could be a subarena.)
            start = self.start_padding + self.start*self.storage_size
            end = self.start_padding + self.end*self.storage_size
            arena = self.arena[start:end]

            # If we don't have popcount_indices and don't want them ordered
            # then just do the alignment and we're done.
            if not reorder and not self.popcount_indices:
                # Don't reorder the unordered fingerprints
                start_padding, end_padding, unsorted_arena = (
                    _chemfp.make_unsorted_aligned_arena(arena, self.alignment))
                return FingerprintArena(self.metadata, self.alignment, start_padding, end_padding,
                                        self.storage_size, unsorted_arena, "", self.ids,
                                        id_lookup = self._id_lookup)

            # Either we're already sorted or we should become sorted.
            # If we're sorted then make_sorted_aligned_arena will detect
            # that and keep the old arena. Otherwise it sorts first and
            # makes a new arena block.
            current_ids = self.ids
            ordering = (ChemFPOrderedPopcount*len(current_ids))()
            popcounts = array.array("i", (0,)*(self.metadata.num_bits+2))
            start_padding, end_padding, arena = _chemfp.make_sorted_aligned_arena(
                self.metadata.num_bits, self.storage_size, arena, len(current_ids),
                ordering, popcounts, self.alignment)

            # Put the ids into the same order as the rearranged fingerprints.
            reordered_ids = [current_ids[item.index] for item in ordering]
            return FingerprintArena(self.metadata, self.alignment,
                                    start_padding, end_padding, self.storage_size,
                                    arena, popcounts.tostring(), reordered_ids)

        # On this pathway, we want to make a new arena which contains
        # selected fingerprints given indices into the old arena.

        arena = self.arena
        storage_size = self.storage_size
        start_padding = self.start_padding
        arena_ids = self.arena_ids

        # First make sure that all of the indices are in range.
        # This will also convert negative indices into positive ones.
        # Only the lookup is guarded, so an IndexError raised by the
        # `indices` iterable itself is not misreported as a bad arena index.
        new_indices = []
        range_check = self._range_check
        for i in indices:
            try:
                checked_i = range_check[i]
            except IndexError:
                raise IndexError("arena fingerprint index %d is out of range" % (i,))
            new_indices.append(checked_i)

        if reorder and self.popcount_indices:
            # There's a slight performance benefit because
            # make_sorted_aligned_arena will see that the fingerprints
            # are already in sorted order and not resort.
            # XXX Is that true? Why do a Python sort instead of a C sort?
            # Perhaps because then I don't need to copy fingerprints?
            new_indices.sort()

        # Copy the fingerprints over to a new arena block
        # NOTE(review): the checked indices are used directly as offsets into
        # self.arena and self.arena_ids without adding self.start, whereas the
        # whole-arena path above does add it. If self._range_check yields
        # positions relative to a sub-arena, this copies the wrong records
        # when self.start != 0 -- TODO confirm against _range_check.
        unsorted_fps = []
        new_ids = []
        for new_i in new_indices:
            start_offset = start_padding + new_i*storage_size
            end_offset = start_offset + storage_size
            unsorted_fps.append(arena[start_offset:end_offset])
            new_ids.append(arena_ids[new_i])

        unsorted_arena = "".join(unsorted_fps)
        unsorted_fps = None   # regain some memory

        # If the caller doesn't want ordered data, then leave it unsorted
        if not reorder:
            start_padding, end_padding, unsorted_arena = _chemfp.make_unsorted_aligned_arena(
                unsorted_arena, self.alignment)
            return FingerprintArena(self.metadata, self.alignment, start_padding, end_padding, storage_size,
                                    unsorted_arena, "", new_ids)

        # Otherwise, reorder and align the arena, along with popcount information
        ordering = (ChemFPOrderedPopcount*len(new_ids))()
        popcounts = array.array("i", (0,)*(self.metadata.num_bits+2))

        start_padding, end_padding, sorted_arena = _chemfp.make_sorted_aligned_arena(
            self.metadata.num_bits, storage_size, unsorted_arena, len(new_ids),
            ordering, popcounts, self.alignment)

        # Put the ids into the same order as the rearranged fingerprints.
        reordered_ids = [new_ids[item.index] for item in ordering]
        return FingerprintArena(self.metadata, self.alignment,
                                start_padding, end_padding, storage_size,
                                sorted_arena, popcounts.tostring(), reordered_ids)