Example #1
def _tag_bot_endpoints(tag, idx_ptrn, search_only=False):
    """ Tag all documents whose request path contained a bot endpoint (db from Babak). """
    paths = utils.get_bot_endpoints()
    batch_size = 500
    batched_paths = utils.batch_iterable(paths, n=batch_size)
    LOGGER.info(f"_tag_bot_endpoints batch_size={batch_size}...")
    searches = []
    for idx, batch in enumerate(batched_paths):
        if idx % 10 == 0:
            LOGGER.info(f"  at batch {idx}...")
        batch_search = _tag_by_paths(tag,
                                     idx_ptrn,
                                     batch,
                                     search_only=search_only)
        searches.append(batch_search)
    return searches
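
All of the examples here revolve around utils.batch_iterable, which splits an iterable into fixed-size chunks. The helper itself is not shown in these snippets; the following is a minimal sketch of what such a function might look like, under the assumption that it yields lists of at most n items with a possibly shorter final chunk.

import itertools

def batch_iterable(iterable, n):
    """Hypothetical sketch: yield successive lists of at most n items from iterable."""
    if n < 1:
        raise ValueError('n must be a positive integer.')
    it = iter(iterable)
    while True:
        batch = list(itertools.islice(it, n))
        if not batch:
            return
        yield batch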
Example #2
    def get_activations(self, list_of_seqs, custom_tensor_to_retrieve=None):
        """Gets activations where batching may be needed to avoid OOM.

    Inputs are strings of amino acids, outputs are activations from the network.

    Args:
      list_of_seqs: list of strings as input for inference.
      custom_tensor_to_retrieve: string name for a tensor to retrieve, if unset
        uses default for signature.

    Returns:
      concatenated numpy array of activations with shape [num_of_seqs, ...]
    """
        # TODO(theosanderson): inference can be made dramatically faster by sorting
        # list of_seqs by length before inference (and presumably reversing the
        # sort process afterwards)

        if not isinstance(list_of_seqs, list):
            raise ValueError('list_of_seqs must be a list of strings.')
        logging.info('Predicting for %d sequences', len(list_of_seqs))

        if list_of_seqs == []:  # pylint: disable=g-explicit-bool-comparison
            return np.array([], dtype=float)

        batches = list(utils.batch_iterable(list_of_seqs, self.batch_size))
        itr = tqdm.tqdm(batches, position=0) if self._use_tqdm else batches
        output_matrix = None

        for i, batch in enumerate(itr):
            batch_activations = self._get_activations_for_batch(
                batch, custom_tensor_to_retrieve=custom_tensor_to_retrieve)

            if output_matrix is None:
                # Allocate matrix to store all activations:
                output_shape = list(batch_activations.shape)
                output_shape[0] = len(list_of_seqs)
                output_matrix = np.zeros(output_shape, np.float16)
            starting_index = i * self.batch_size
            output_matrix[starting_index:starting_index +
                          batch_activations.shape[0]] = batch_activations

        return output_matrix
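
A hypothetical call to get_activations might look like the following; the inferrer object and the sequences are illustrative only and do not come from the snippet above.

# Hypothetical usage: `inferrer` stands in for an instance of the enclosing class.
seqs = ['MKTAYIAKQR', 'GAVLIPFMW']
activations = inferrer.get_activations(seqs)
print(activations.shape)  # first dimension equals len(seqs)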
Example #3
    def testBatchIterable(self, input_iterable, batch_size, expected):
        actual = list(utils.batch_iterable(input_iterable, batch_size))

        self.assertEqual(actual, expected)
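
The test method above takes its cases as arguments, which suggests a parameterized test. A sketch of how such cases might be declared, assuming absl's parameterized test framework, is shown below; the cases are hypothetical, since the real parameters are not included in the snippet.

from absl.testing import parameterized

class UtilsTest(parameterized.TestCase):

    # Hypothetical cases; `utils` refers to the module under test, as in the snippet.
    @parameterized.parameters(
        ([1, 2, 3, 4, 5], 2, [[1, 2], [3, 4], [5]]),
        ([], 3, []),
    )
    def testBatchIterable(self, input_iterable, batch_size, expected):
        actual = list(utils.batch_iterable(input_iterable, batch_size))

        self.assertEqual(actual, expected)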