def determine_ref_occupancy(self, test_set: pd.Index, block_size=7500, n_jobs=1):
    """
    Function to determine the number of test cells occupying each reference sphere

    A test cell occupies a reference sphere when its pairwise distance to the
    reference cell is strictly smaller than the matching entry of ``self.sigmas``.
    Distances are computed in blocks of at most ``block_size`` reference cells.

    :param pd.Index test_set: Pandas index of test set observation names
    :param int block_size: Maximum number of reference cells per distance block
    :param int n_jobs: Forwarded to ``pairwise_distances``
    :return: Array with one count per entry of ``self.ref_set``
    :raises ValueError: If some test cells are missing from ``self.adata.obs_names``
        or if ``block_size`` is smaller than 1
    """
    if not test_set.isin(self.adata.obs_names).all():
        raise ValueError(
            'Some of the cells in the test set are not in the AnnData object. '
            'Ensure that all the test cells are in the AnnData object')
    if block_size < 1:
        raise ValueError('block_size must be a positive integer')

    # Test data
    test_data = self.adata[_find_cell_indices(self.adata, test_set), ].X

    # Compute counts in blocks
    n_ref = len(self.ref_set)
    counts = np.zeros(n_ref)

    # Fixed-size slices cover every reference cell, including the case where
    # there are fewer reference cells than block_size (the former linspace-based
    # edges collapsed to a single point there and no block was processed).
    for start in range(0, n_ref, block_size):
        block = slice(start, min(start + block_size, n_ref))
        ref_data = self.adata[self.ref_set[block], ].X
        dists = pairwise_distances(ref_data, test_data, n_jobs=n_jobs)
        # Count, per reference cell, the test cells closer than that cell's sigma
        counts[block] = (dists < self.sigmas[block, np.newaxis]).sum(axis=1)

    return counts
def find_overlap_in_dataset(data_in: np.ndarray, list_1: pd.Index, list_2: list):
    """Select the columns of ``data_in.X`` whose labels occur in both lists.

    Note that ``data_in`` must expose an ``X`` attribute that supports
    ``X[:, mask]`` indexing; the attribute is read directly.

    :param data_in: container whose ``X`` attribute is column-indexed
    :param list_1: labels aligned with the columns of ``data_in.X``
    :param list_2: labels to look for in ``list_1``
    :return: tuple of (boolean mask over ``list_1``, ``data_in.X[:, mask]``)
    :raises TypeError: if ``list_1`` is not a ``pd.Index``
    """
    # Guard clause: the mask below relies on pd.Index.isin
    if not isinstance(list_1, pd.Index):
        raise TypeError("{} is supposed to be of type [pd.Index]. "
                        "Wrong format instead: {}".format(list_1, type(list_1)))

    labels_in_first = set(list_1.tolist())
    shared_labels = list(labels_in_first.intersection(list_2))
    column_mask = list_1.isin(shared_labels)
    selected_columns = data_in.X[:, column_mask]
    return column_mask, selected_columns
def filter_genes(genes: pd.Index, species: str = 'human') -> pd.Index:
    """Return ``genes`` without ribosomal and mitochondrial entries.

    The genes to exclude are obtained from ``get_ribosomal_genes(species)``
    and ``get_mitochondrial_genes(species)``, combined with ``+``.
    """
    unwanted = get_ribosomal_genes(species) + get_mitochondrial_genes(species)
    keep_mask = ~genes.isin(unwanted)
    return genes[keep_mask]
def determine_distance(self, test_set: pd.Index):
    """
    Function to determine the fraction of unoccupied reference spheres in the test set

    :param pd.Index test_set: Pandas index of test set observation names
    :return: Tuple of (fraction of reference spheres with a zero count,
        the counts returned by ``determine_ref_occupancy``)
    :raises ValueError: If some test cells are missing from ``self.adata.obs_names``
    """
    all_cells_known = test_set.isin(self.adata.obs_names).all()
    if not all_cells_known:
        raise ValueError(
            'Some of the cells in the test set are not in the AnnData object. '
            'Ensure that all the test cells are in the AnnData object')

    occupancy = self.determine_ref_occupancy(test_set)
    n_unoccupied = np.sum(occupancy == 0)
    return n_unoccupied / len(occupancy), occupancy
def replace_multi_index_level(
    df: "classes.BeliefsDataFrame",
    level: str,
    index: pd.Index,
    intersection: bool = False,
) -> "classes.BeliefsDataFrame":
    """Replace one of the index levels of the multi-indexed DataFrame. Returns a new DataFrame object.

    Neither ``df`` nor ``index`` is modified: the frame is deep-copied and an unnamed
    ``index`` is renamed on a copy.

    :param df: a BeliefsDataFrame (or just a multi-indexed DataFrame).
    :param level: the name of the index level to replace.
    :param index: the new index. If it has no name, the name of the replaced level is used.
    :param intersection: policy for replacing the index level.
        If intersection is false then simply replace (note that the new index should have the same length as the
        old index).
        If intersection is true then add indices not contained in the old index and delete indices not contained
        in the new index. New rows have nan columns values and copies of the first row for other index levels
        (note that the resulting index is usually longer and contains values that were both in the old and new
        index, i.e. the intersection).
    :returns: a copy of ``df`` with the new MultiIndex, sorted by index.
    :raises ValueError: if intersection is false and ``index`` differs in length from ``df.index``.
    """
    # Todo: check whether timezone information is copied over correctly

    # Check input (truthiness rather than identity, so numpy booleans behave like bools)
    if not intersection and len(index) != len(df.index):
        raise ValueError(
            "Cannot simply replace multi-index level with an index of different length than the original. \n"
            "Use intersection instead?")
    if index.name is None:
        # Work on a renamed copy so the caller's index object is left untouched
        index = index.rename(level)

    new_index_values = []
    new_index_names = []
    if intersection:
        old_level_values = df.index.get_level_values(level)
        contained_in_old = index.isin(old_level_values)
        new_index_not_in_old = index[~contained_in_old]
        contained_in_new = old_level_values.isin(index)
        for name in df.index.names:
            level_values = df.index.get_level_values(name)
            if name == level:
                # For the index level that should be replaced:
                # copy old values that the new index contains, and add new values that the old index lacks
                new_index_values.append(
                    level_values[contained_in_new].append(new_index_not_in_old))
                new_index_names.append(index.name)
            else:
                # For the other index levels:
                # copy old values that the new index contains, and add the first value to the new rows
                new_row_values = pd.Index([level_values[0]] * len(new_index_not_in_old))
                new_index_values.append(
                    level_values[contained_in_new].append(new_row_values))
                new_index_names.append(name)
    else:
        for name in df.index.names:
            if name == level:
                # For the index level that should be replaced: replace with new index
                new_index_values.append(index)
                new_index_names.append(index.name)
            else:
                # For the other index levels: copy all old values
                new_index_values.append(df.index.get_level_values(name))
                new_index_names.append(name)

    # Construct new MultiIndex
    mux = pd.MultiIndex.from_arrays(new_index_values, names=new_index_names)

    df = df.copy(deep=True)
    # Apply new MultiIndex
    if intersection:
        # Reindex such that new rows get nan column values
        df = df.reindex(mux)
    else:
        # Replace the index
        df.index = mux
    return df.sort_index()
def _get_indexer(self, index: pd.Index) -> np.ndarray:
    """Return the result of ``index.isin(self.values)``: one membership flag per entry of ``index``."""
    allowed_values = self.values
    membership = index.isin(allowed_values)
    return membership