def classify(self, test_X, keep_all_class_scores, progress=None):
    # TASK: There's some work to be done here to optimize the size
    # of this split to dial the memory usage
    n_rows = test_X.shape[0]
    if n_rows < 100:
        pred_y, scores, all_class_scores = _do_predict(
            classifier=self.classifier, X=test_X
        )
    else:
        n_work_orders = n_rows // 100
        results = zap.work_orders(
            [
                Munch(classifier=self.classifier, X=X, fn=_do_predict)
                for X in np.array_split(test_X, n_work_orders, axis=0)
            ],
            _trap_exceptions=False,
            _progress=progress,
        )
        pred_y = utils.listi(results, 0)
        scores = utils.listi(results, 1)
        all_class_scores = utils.listi(results, 2)
        pred_y = np.concatenate(pred_y)
        scores = np.concatenate(scores)
        if keep_all_class_scores:
            all_class_scores = np.concatenate(all_class_scores)

    if not keep_all_class_scores:
        all_class_scores = None

    return pred_y, scores, all_class_scores
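# classify() above pulls each positional field out of the list of per-batch
# result tuples with utils.listi. The implementation is not shown in this
# excerpt; a minimal sketch of what it is assumed to do (the validate()
# methods further down also call it with a string key, so the key can be a
# positional index or a dict/attribute key):
def listi(rows, key):
    """Extract rows[i][key] for every row; key may be an index or a dict key."""
    return [row[key] for row in rows]

# e.g. listi([(1, "a"), (2, "b")], 0) == [1, 2]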
def classify(self, X, progress=None):
    check.array_t(X, ndim=2)
    n_rows = X.shape[0]
    if n_rows < 100:
        winner_y, winner_scores, runnerup_y, runnerup_scores = _do_predict(
            classifier=self.classifier, X=X
        )
    else:
        n_work_orders = n_rows // 100
        with zap.Context(progress=progress, trap_exceptions=False):
            results = zap.work_orders(
                [
                    # X_batch avoids shadowing the X argument inside the comprehension
                    Munch(classifier=self.classifier, X=X_batch, fn=_do_predict)
                    for X_batch in np.array_split(X, n_work_orders, axis=0)
                ]
            )
        winner_y = utils.listi(results, 0)
        winner_scores = utils.listi(results, 1)
        runnerup_y = utils.listi(results, 2)
        runnerup_scores = utils.listi(results, 3)
        winner_y = np.concatenate(winner_y)
        winner_scores = np.concatenate(winner_scores)
        runnerup_y = np.concatenate(runnerup_y)
        runnerup_scores = np.concatenate(runnerup_scores)

    return winner_y, winner_scores, runnerup_y, runnerup_scores
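# _do_predict is not shown in this excerpt. A plausible sketch matching the
# (winner_y, winner_scores, runnerup_y, runnerup_scores) contract of this
# variant, assuming a scikit-learn-style classifier that exposes
# predict_proba and classes_ (both names are assumptions about the real code):
import numpy as np

def _do_predict(classifier, X):
    probs = classifier.predict_proba(X)  # (n_rows, n_classes)
    order = np.argsort(probs, axis=1)    # ascending scores per row
    winner_i, runnerup_i = order[:, -1], order[:, -2]
    rows = np.arange(probs.shape[0])
    return (
        classifier.classes_[winner_i],
        probs[rows, winner_i],
        classifier.classes_[runnerup_i],
        probs[rows, runnerup_i],
    )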
def it_groups():
    df = pd.DataFrame(dict(a=[1, 1, 2, 2, 2], b=[1, 2, 3, 4, 5]))
    res = zap.df_groups(test9, df.groupby("a"))
    a = listi(res, 0)
    ap1 = listi(res, 1)
    assert a == [1, 2]
    assert ap1 == [2, 3]
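# test9 is defined elsewhere in the test module. Given the assertions above
# (a == [1, 2] and ap1 == [2, 3]), it presumably receives each group's
# DataFrame and returns the group key and the key plus one. A hypothetical
# reconstruction:
def test9(group_df):
    a = group_df.a.iloc[0]
    return a, a + 1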
def _out_of_date(parents, children, ignore_fn=None):
    """
    Check if parents are dirty compared to children.

    args:
        parents: a list or singleton of paths. If the path is a dir,
            all of the files (recursively) in the dir will be used.
        children: a list or singleton of paths. If the path is a dir,
            all of the files (recursively) in the dir will be used.
        ignore_fn: ignore any path spec for which this function returns True.

    return:
        A tuple: (out_of_date_boolean, reason)
        out_of_date_boolean: True if the newest (most recently modified)
            parent file is newer than the newest child file.
        reason: The human-readable reason why it is out of date.
    """
    parent_files_and_times = PipelineTask._parent_timestamps(parents, ignore_fn)
    child_files_and_times = PipelineTask._child_timestamps(children, ignore_fn)

    if len(parent_files_and_times) == 0:
        return False, "No parent files"

    if len(child_files_and_times) == 0:
        return True, "No child files"

    parent_times = np.array(utils.listi(parent_files_and_times, 1))
    child_times = np.array(utils.listi(child_files_and_times, 1))

    if np.max(parent_times) > np.max(child_times):

        def name_fmt(path):
            path = local.path(path)
            return f"{utils.safe_list_get(path.split(), -2, default='')}/{path.name}"

        parent_max_name = name_fmt(
            utils.listi(parent_files_and_times, 0)[np.argmax(parent_times)]
        )
        child_max_name = name_fmt(
            utils.listi(child_files_and_times, 0)[np.argmax(child_times)]
        )
        return (
            True,
            (
                f"Parent file: '{parent_max_name}' "
                f"is newer than child file: "
                f"'{child_max_name}'"
            ),
        )

    return False, "Up to date"
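# The dirty check above reduces to a single comparison of modification times.
# A self-contained illustration of the same rule using only the standard
# library (PipelineTask and local.path are only needed for gathering and
# pretty-printing the paths, not for the rule itself):
import os

def out_of_date(parent_paths, child_paths):
    parent_times = [os.path.getmtime(p) for p in parent_paths]
    child_times = [os.path.getmtime(p) for p in child_paths]
    if not parent_times:
        return False  # nothing upstream, nothing to rebuild
    if not child_times:
        return True   # outputs missing, must build
    return max(parent_times) > max(child_times)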
def _run_sim(sim_params, pep_seqs_df, name, n_peps, n_samples, progress):
    if sim_params.get("random_seed") is not None:
        # Increment so that train and test will be different
        sim_params.random_seed += 1

    np.random.seed(sim_params.random_seed)

    dyemat = ArrayResult(
        f"{name}_dyemat",
        shape=(n_peps, n_samples, sim_params.n_channels, sim_params.n_cycles),
        dtype=DyeType,
        mode="w+",
    )
    radmat = ArrayResult(
        f"{name}_radmat",
        shape=(n_peps, n_samples, sim_params.n_channels, sim_params.n_cycles),
        dtype=RadType,
        mode="w+",
    )
    recall = ArrayResult(
        f"{name}_recall",
        shape=(n_peps,),
        dtype=RecallType,
        mode="w+",
    )

    flus__remainders = zap.df_groups(
        _do_pep_sim,
        pep_seqs_df.groupby("pep_i"),
        sim_params=sim_params,
        n_samples=n_samples,
        output_dyemat=dyemat,
        output_radmat=radmat,
        output_recall=recall,
        _progress=progress,
        _trap_exceptions=False,
        _process_mode=True,
    )
    flus = np.array(utils.listi(flus__remainders, 0))
    flu_remainders = np.array(utils.listi(flus__remainders, 1))

    return dyemat, radmat, recall, flus, flu_remainders
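# ArrayResult appears to be a file-backed array (note mode="w+") that worker
# processes fill in place, so only the small (flu, remainder) tuples travel
# back through zap. A minimal sketch of that pattern using a raw numpy
# memmap; all names here are hypothetical:
import numpy as np

output = np.memmap("demo_out.dat", dtype=np.float32, mode="w+", shape=(4, 3))

def demo_worker(pep_i, output):
    # Each worker writes its own row; no large result is copied back.
    output[pep_i] = pep_i
    return pep_i  # only a tiny summary value is returned

for pep_i in range(4):
    demo_worker(pep_i, output)
output.flush()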
def region_map(im, func, n_divs=4, include_coords=False, **kwargs):
    """
    Apply the function over window regions of im.
    Regions are divisions of the LAST-TWO dimensions of im.
    """
    assert im.ndim >= 2

    results = []
    for win_im, _, _, coord in region_enumerate(im, n_divs):
        if include_coords:
            kwargs["coords"] = coord
        results += [func(win_im, **kwargs)]

    assert len(results) == n_divs * n_divs

    if isinstance(results[0], tuple):
        # The func returned a tuple of return values.
        # These have to be re-assembled into arrays with the rule
        # that all arrays of each component of the tuple
        # have to be the same size.
        n_ret_fields = len(results[0])
        result_fields = []
        for ret_field_i in range(n_ret_fields):
            # Suppose func returns a tuple( array(11, 11), array(n, 8) ).
            # For the first field you want to return a (divs, divs, 11, 11)
            # and for the second field a (divs, divs, n, 8).
            field = utils.listi(results, ret_field_i)

            # field is expected to be a list of arrays, all of the same shape
            if isinstance(field[0], np.ndarray):
                field_shape = field[0].shape
                assert all([row.shape == field_shape for row in field])
                # Stack the list into an array so it can be reshaped below
                field = np.array(field)
            elif np.isscalar(field[0]):
                # Convert to an array
                field = np.array(field)
            else:
                raise TypeError(
                    f"Unexpected return type from {func.__name__} in region_map"
                )
            field_array = field.reshape((n_divs, n_divs, *field.shape[1:]))
            result_fields += [field_array]
        results = tuple(result_fields)
    else:
        results = np.array(results)
        results = results.reshape((n_divs, n_divs, *results.shape[1:]))

    return results
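# A hypothetical driver for region_map, assuming region_enumerate tiles the
# image into an n_divs x n_divs grid of windows: compute a per-region mean
# and get back a (n_divs, n_divs) grid of scalars.
import numpy as np

im = np.random.rand(64, 64)
region_means = region_map(im, np.mean, n_divs=4)
assert region_means.shape == (4, 4)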
def validate(self):
    super().validate()

    all_dye_names = list(set([d.dye_name for d in self.dyes]))

    # No duplicate dye names
    self._validate(
        len(all_dye_names) == len(self.dyes),
        "The dye list contains a duplicate",
    )

    # No duplicate labels
    self._validate(
        len(list(set(utils.listi(self.labels, "aa")))) == len(self.labels),
        "There is a duplicate label",
    )

    # All labels have a legit dye name
    for label in self.labels:
        self._validate(
            label.dye_name in all_dye_names,
            f"Label {label.label_name} does not have a valid matching dye_name",
        )

    # Channel mappings
    mentioned_channels = {dye.channel_name: False for dye in self.dyes}
    if "channels" in self:
        # Validate that the channel mapping is complete
        for channel_name, ch_i in self.channels.items():
            self._validate(
                channel_name in mentioned_channels,
                f"Channel name '{channel_name}' was not found in dyes",
            )
            mentioned_channels[channel_name] = True
        self._validate(
            all([mentioned for _, mentioned in mentioned_channels.items()]),
            "Not all channels in dyes were enumerated in channels",
        )
    else:
        # No channel mapping: assign them in sorted-name order
        self["channels"] = {
            ch_name: i
            for i, ch_name in enumerate(sorted(mentioned_channels.keys()))
        }
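# A sketch of params data that would satisfy the channel-mapping branch
# above; every field value here is hypothetical. Two dyes on two channels,
# two labels referencing them, and an explicit channels dict that enumerates
# both channel names:
params = dict(
    dyes=[
        Munch(dye_name="dye_0", channel_name="ch_a"),
        Munch(dye_name="dye_1", channel_name="ch_b"),
    ],
    labels=[
        Munch(label_name="label_0", aa="C", dye_name="dye_0"),
        Munch(label_name="label_1", aa="K", dye_name="dye_1"),
    ],
    channels=dict(ch_a=0, ch_b=1),
)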
def validate(self):
    super().validate()

    all_dye_names = list(set([d.dye_name for d in self.dyes]))

    # No duplicate dye names
    self._validate(
        len(all_dye_names) == len(self.dyes),
        "The dye list contains a duplicate",
    )

    # No duplicate labels
    self._validate(
        len(list(set(utils.listi(self.labels, "amino_acid")))) == len(self.labels),
        "There is a duplicate label",
    )

    # All labels have a legit dye name
    for label in self.labels:
        self._validate(
            label.dye_name in all_dye_names,
            f"Label {label.label_name} does not have a valid matching dye_name",
        )
def arrays(
    fn,
    arrays_dict,
    _batch_size=None,
    _stack=False,
    _limit_slice=None,
    **kwargs,
):
    """
    Split an array by its first dimension and send each row to fn.

    The arrays_dict is one or more parallel arrays that will
    be passed to fn(). **kwargs will end up as (constant) kwargs
    to fn().

    Example:
        def myfn(a, b, c):
            return a + b + c

        a = np.array([1, 2, 3])
        b = np.array([4, 5, 6])

        res = zap.arrays(
            myfn,
            dict(a=a, b=b),
            c=1,
        )

        # This will call:
        #   myfn(1, 4, 1)
        #   myfn(2, 5, 1)
        #   myfn(3, 6, 1)
        # and res == [1+4+1, 2+5+1, 3+6+1]

    These calls are batched into parallel processes (unless
    _process_mode is False), with _batch_size rows per batch;
    if _batch_size is None it is chosen so that all cpus are used.

    When fn returns a tuple of fields, those return fields will
    be maintained.

    Example:
        def myfn(a, b, c):
            return a, b + c

        a = np.array([1, 2, 3])
        b = np.array([4, 5, 6])

        res = zap.arrays(
            myfn,
            dict(a=a, b=b),
            c=1,
        )

        # This will call as before but now:
        #   res == ([1, 2, 3], [4+1, 5+1, 6+1])

    If _stack is True then _each return field_ will be wrapped
    with np.array() before it is returned. If _stack is a list
    then np.array() is applied selectively, to only the return
    fields of your choice.

    Example:
        def myfn(a, b, c):
            return a, b + c

        a = np.array([1, 2, 3])
        b = np.array([4, 5, 6])

        res = zap.arrays(
            myfn,
            dict(a=a, b=b),
            c=1,
            _stack=True,
        )

        # This will call as before but now:
        #   res == (np.array([1, 2, 3]), np.array([4+1, 5+1, 6+1]))
        # If called with _stack=[True, False]:
        #   res == (np.array([1, 2, 3]), [4+1, 5+1, 6+1])
    """
    n_rows = len(list(arrays_dict.values())[0])
    assert all([len(a) == n_rows for a in arrays_dict.values()])

    batch_slices = make_batch_slices(n_rows, _batch_size, _limit_slice)

    result_batches = work_orders(
        _work_orders=[
            Munch(
                fn=_run_arrays,
                inner_fn=fn,
                slice=batch_slice,
                arrays_dict=arrays_dict,
                **kwargs,
            )
            for batch_slice in batch_slices
        ],
        _fn_name=fn.__name__,
    )

    if len(result_batches) == 0:
        raise ValueError("No batches were returned")
    first_batch = result_batches[0]
    if isinstance(first_batch, Exception):
        raise first_batch
    if len(first_batch) == 0:
        raise ValueError("First batch had no elements")
    first_return = first_batch[0]
    if isinstance(first_return, Exception):
        raise first_return

    assert isinstance(first_return, tuple)
    n_fields = len(first_return)

    # Unbatch: for each return field, splice the rows of every batch
    # back together in order
    unbatched = []
    for field_i in range(n_fields):
        field_rows = []
        for batch in result_batches:
            field_rows += utils.listi(batch, field_i)
        unbatched += [field_rows]

    if _stack is not None:
        if isinstance(_stack, bool):
            _stack = [_stack] * n_fields
        if isinstance(_stack, (list, tuple)):
            assert all([isinstance(s, bool) for s in _stack])
            assert len(_stack) == n_fields

        # If requested, wrap the return field in np.array()
        for field_i in range(n_fields):
            if _stack[field_i]:
                unbatched[field_i] = np.array(unbatched[field_i])

    if n_fields == 1:
        return unbatched[0]
    else:
        return tuple(unbatched)
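# make_batch_slices is not shown here. From its use above it presumably cuts
# range(n_rows) into contiguous slices of roughly _batch_size rows, optionally
# pre-clipped by _limit_slice, and when _batch_size is None picks a size that
# keeps every cpu busy (per the docstring). A hypothetical stand-in:
import os

def make_batch_slices(n_rows, batch_size=None, limit_slice=None):
    start, stop = (
        (limit_slice.start, limit_slice.stop) if limit_slice is not None else (0, n_rows)
    )
    if batch_size is None:
        batch_size = max(1, (stop - start) // (os.cpu_count() or 1))
    return [slice(i, min(i + batch_size, stop)) for i in range(start, stop, batch_size)]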
def channels(self):
    return sorted(list(set(utils.listi(self.dyes, "channel_name"))))