def _tag_bot_endpoints(tag, idx_ptrn, search_only=False):
    """
    Tag all documents whose request path contained a bot endpoint
    (db from Babak).

    The endpoint paths from ``utils.get_bot_endpoints()`` are split into
    batches of 500 and each batch is handed to ``_tag_by_paths``.

    Args:
        tag: forwarded unchanged to ``_tag_by_paths``.
        idx_ptrn: forwarded unchanged to ``_tag_by_paths``.
        search_only: forwarded to ``_tag_by_paths`` as a keyword argument.

    Returns:
        A list holding the ``_tag_by_paths`` result for every batch, in
        batch order.
    """
    batch_size = 500
    path_batches = utils.batch_iterable(utils.get_bot_endpoints(), n=batch_size)
    LOGGER.info(f"_tag_bot_endpoints batch_size={batch_size}...")

    results = []
    for batch_no, path_batch in enumerate(path_batches):
        # Progress log on every tenth batch only.
        if not batch_no % 10:
            LOGGER.info(f" at batch {batch_no}...")
        results.append(
            _tag_by_paths(tag, idx_ptrn, path_batch, search_only=search_only)
        )
    return results
def get_activations(self, list_of_seqs, custom_tensor_to_retrieve=None):
    """Gets activations where batching may be needed to avoid OOM.

    Inputs are strings of amino acids, outputs are activations from the
    network.

    Args:
      list_of_seqs: list of strings as input for inference.
      custom_tensor_to_retrieve: string name for a tensor to retrieve, if
        unset uses default for signature.

    Returns:
      concatenated numpy array of activations with shape [num_of_seqs, ...],
      stored as float16. An empty input list returns an empty 1-D float
      array.

    Raises:
      ValueError: if list_of_seqs is not a list.
    """
    # TODO(theosanderson): inference can be made dramatically faster by sorting
    # list_of_seqs by length before inference (and presumably reversing the
    # sort process afterwards)
    if not isinstance(list_of_seqs, list):
        raise ValueError('list_of_seqs must be a list of strings.')
    logging.info('Predicting for %d sequences', len(list_of_seqs))

    if not list_of_seqs:
        return np.array([], dtype=float)

    # Materialize the batches so tqdm can be handed a sized sequence.
    batches = list(utils.batch_iterable(list_of_seqs, self.batch_size))
    itr = tqdm.tqdm(batches, position=0) if self._use_tqdm else batches

    output_matrix = None
    next_row = 0
    for batch in itr:
        batch_activations = self._get_activations_for_batch(
            batch, custom_tensor_to_retrieve=custom_tensor_to_retrieve)
        num_rows = batch_activations.shape[0]

        if output_matrix is None:
            # Allocate matrix to store all activations: the trailing
            # dimensions come from the first batch, the leading one is the
            # total number of sequences.
            output_shape = list(batch_activations.shape)
            output_shape[0] = len(list_of_seqs)
            output_matrix = np.zeros(output_shape, np.float16)

        # Track the write position with a running offset rather than
        # i * batch_size, so placement does not depend on every batch
        # returning exactly batch_size rows.
        output_matrix[next_row:next_row + num_rows] = batch_activations
        next_row += num_rows

    return output_matrix
def testBatchIterable(self, input_iterable, batch_size, expected):
    """Checks that utils.batch_iterable produces the expected batches.

    Args:
      input_iterable: iterable passed to utils.batch_iterable.
      batch_size: batch size passed to utils.batch_iterable.
      expected: the list the collected batches must equal.
    """
    batches = utils.batch_iterable(input_iterable, batch_size)
    self.assertEqual(list(batches), expected)