def _nearest(scdf, ocdf, suffix="_b", how=None, overlap=True, **kwargs):
    """Join rows of ``scdf`` with the rows of ``ocdf`` picked as their nearest.

    Args:
        scdf: DataFrame of query intervals; its ``Start`` and ``End``
            columns are read.
        ocdf: DataFrame of candidate intervals; its ``Start`` and ``End``
            columns and its index labels are read.
        suffix: Suffix given to ``ocdf`` columns whose names clash with
            ``scdf`` columns in the join.
        how: ``"next"`` or ``"previous"`` restricts the search to one
            helper; any other value runs both helpers and lets
            ``nearest_nonoverlapping`` choose between them.
        overlap: When truthy, ``_overlapping_for_nearest`` first splits off
            the rows it handles itself; those are prepended to the result.
        **kwargs: Forwarded unchanged to ``_overlapping_for_nearest``.

    Returns:
        The joined DataFrame with a ``Distance`` column added and the
        ``"Chromosome" + suffix`` column removed.
    """
    if overlap:
        overlapping, queries = _overlapping_for_nearest(
            scdf, ocdf, suffix, **kwargs)
    else:
        queries = scdf

    queries = sort_one_by_one(queries, "Start", "End")
    ocdf = sort_one_by_one(ocdf, "Start", "End")

    # Positional labels 0..n-1 so the matched rows can be aligned by index.
    queries.index = pd.Index(range(len(queries)))

    if how == "next":
        hit_labels, distances = _next_nonoverlapping(
            queries.End, ocdf.Start, ocdf.index.values)
    elif how == "previous":
        hit_labels, distances = _previous_nonoverlapping(
            queries.Start, ocdf.End)
    else:
        before_labels, before_distances = _previous_nonoverlapping(
            queries.Start, ocdf.End)
        after_labels, after_distances = _next_nonoverlapping(
            queries.End, ocdf.Start, ocdf.index.values)
        hit_labels, distances = nearest_nonoverlapping(
            before_labels, before_distances, after_labels, after_distances)

    # fill_value=-1 instead of np.nan, so ints are not promoted to float.
    matches = ocdf.reindex(hit_labels, fill_value=-1)
    matches.index = queries.index

    distance_column = pd.Series(distances, index=matches.index)
    distance_column = distance_column.fillna(-1).astype(int)
    matches.insert(len(matches.columns), "Distance", distance_column)

    # A label of -1 marks a query row for which no partner was found.
    hit_labels = pd.Series(hit_labels, index=matches.index)
    unmatched = hit_labels[hit_labels == -1].index
    joined = queries.drop(unmatched).join(matches, rsuffix=suffix)

    if overlap and not overlapping.empty:
        if joined.empty:
            joined = overlapping
        else:
            joined = pd.concat([overlapping, joined])

    return joined.drop("Chromosome" + suffix, axis=1)
def _nearest(self, other, strandedness, suffix="_b", how=None, overlap=True):
    """Join each row of ``self.df`` with its nearest row in ``other.df``.

    The search runs separately per group: per (Chromosome, Strand) when
    ``self.stranded`` and ``strandedness`` are both truthy, otherwise per
    Chromosome. Groups without a counterpart in ``other`` contribute no
    rows.

    Args:
        self: Object exposing ``df`` (a DataFrame) and ``stranded``.
        other: Object exposing ``df`` (a DataFrame).
        strandedness: Falsy for an unstranded search; ``"opposite"`` pairs
            each strand with the other strand of ``other``; any other
            truthy value pairs equal strands.
        suffix: Suffix given to clashing column names from ``other``.
        how: ``"next"`` or ``"previous"`` restricts the search to one
            helper; any other value runs both helpers and lets
            ``nearest_nonoverlapping`` choose between them.
        overlap: When truthy, ``_overlapping_for_nearest`` first splits off
            the rows it handles itself; those are prepended to the result.

    Returns:
        A DataFrame of joined rows with a ``Distance`` column, or an empty
        DataFrame with columns Chromosome, Start, End, Strand when nothing
        was joined.
    """
    if overlap:
        nearest_df, df_to_find_nearest_in = _overlapping_for_nearest(
            self, other, strandedness, suffix)
    else:
        df_to_find_nearest_in = self.df

    other_strand = {"+": "-", "-": "+"}

    if self.stranded and strandedness:
        # Group on chromosome and strand: keys are (chromosome, strand).
        grpby_key = ["Chromosome", "Strand"]
    else:
        grpby_key = "Chromosome"

    other_dfs = {key: group for key, group in other.df.groupby(grpby_key)}

    dfs = []
    for key, scdf in df_to_find_nearest_in.groupby(grpby_key):
        # Only tuple keys carry a strand. Testing len(key) == 2 would also
        # match two-character chromosome names such as "10" or "MT".
        if isinstance(key, tuple) and strandedness == "opposite":
            chromosome, strand = key
            other_key = chromosome, other_strand[strand]
        else:
            other_key = key

        if other_key not in other_dfs:
            continue

        ocdf = other_dfs[other_key]

        # Fresh 0..n-1 labels without mutating the group frame in place.
        scdf = scdf.reset_index(drop=True)

        if how == "next":
            r_idx, dist = _next_nonoverlapping(
                scdf.End, ocdf.Start, ocdf.index.values)
        elif how == "previous":
            r_idx, dist = _previous_nonoverlapping(
                scdf.Start, ocdf.End, ocdf.index.values)
        else:
            previous_r_idx, previous_dist = _previous_nonoverlapping(
                scdf.Start, ocdf.End, ocdf.index.values)
            next_r_idx, next_dist = _next_nonoverlapping(
                scdf.End, ocdf.Start, ocdf.index.values)
            r_idx, dist = nearest_nonoverlapping(
                previous_r_idx, previous_dist, next_r_idx, next_dist)

        # fill_value=-1 instead of np.nan, so ints are not promoted to float.
        ocdf = ocdf.reindex(r_idx, fill_value=-1)
        ocdf.index = scdf.index
        ocdf.insert(ocdf.shape[1], "Distance",
                    pd.Series(dist, index=ocdf.index).fillna(-1).astype(int))
        ocdf.drop("Chromosome", axis=1, inplace=True)

        # A label of -1 marks a row for which no partner was found.
        r_idx = pd.Series(r_idx, index=ocdf.index)
        scdf = scdf.drop(r_idx.loc[r_idx == -1].index)

        dfs.append(scdf.join(ocdf, rsuffix=suffix))

    if dfs:
        df = pd.concat(dfs)
    else:
        df = pd.DataFrame(columns="Chromosome Start End Strand".split())

    if overlap and not df.empty and not nearest_df.empty:
        df = pd.concat([nearest_df, df])
    elif overlap and not nearest_df.empty:
        df = nearest_df

    return df
def _nearest(scdf, ocdf, kwargs):
    """Join rows of ``scdf`` with the rows of ``ocdf`` picked as their nearest.

    Args:
        scdf: DataFrame of query intervals; ``Start`` and ``End`` are read,
            and ``Strand`` too when ``how`` is ``"upstream"`` or
            ``"downstream"``.
        ocdf: DataFrame of candidate intervals; ``Start`` and ``End`` are
            read. Its index is reset to 0..n-1 before use.
        kwargs: Mapping with the keys ``"overlap"``, ``"how"`` and
            ``"suffix"``. It is only consulted when both frames are
            non-empty.

            * ``how``: ``"next"`` / ``"previous"`` restrict the search to
              one helper; ``"upstream"`` / ``"downstream"`` are translated
              to one of those using the strand of the first row of
              ``scdf``; any other value runs both helpers and lets
              ``nearest_nonoverlapping`` choose between them.
            * ``overlap``: when truthy, ``_overlapping_for_nearest`` first
              splits off the rows it handles itself; those are prepended
              to the result.
            * ``suffix``: suffix given to clashing ``ocdf`` column names.

    Returns:
        The joined DataFrame without the ``"Chromosome" + suffix`` column,
        or ``None`` when either input is empty or no result was produced.
    """
    if scdf.empty or ocdf.empty:
        return None

    overlap = kwargs["overlap"]
    how = kwargs["how"]
    suffix = kwargs["suffix"]

    # Translate strand-relative directions into coordinate directions,
    # using the strand of the first row only.
    if how == "upstream":
        how = {"+": "previous", "-": "next"}[scdf.Strand.iloc[0]]
    elif how == "downstream":
        how = {"+": "next", "-": "previous"}[scdf.Strand.iloc[0]]

    ocdf = ocdf.reset_index(drop=True)

    if overlap:
        nearest_df, df_to_find_nearest_in = _overlapping_for_nearest(
            scdf, ocdf, suffix)
    else:
        df_to_find_nearest_in = scdf

    # Explicit "no result yet" marker instead of probing locals() later.
    df = None

    if not df_to_find_nearest_in.empty:
        df_to_find_nearest_in = sort_one_by_one(
            df_to_find_nearest_in, "Start", "End")
        ocdf = sort_one_by_one(ocdf, "Start", "End")

        # Positional labels 0..n-1 so matched rows can be aligned by index.
        df_to_find_nearest_in.index = pd.Index(
            range(len(df_to_find_nearest_in)))

        if how == "next":
            r_idx, dist = _next_nonoverlapping(
                df_to_find_nearest_in.End, ocdf.Start, ocdf.index.values)
        elif how == "previous":
            r_idx, dist = _previous_nonoverlapping(
                df_to_find_nearest_in.Start, ocdf.End)
        else:
            previous_r_idx, previous_dist = _previous_nonoverlapping(
                df_to_find_nearest_in.Start, ocdf.End)
            next_r_idx, next_dist = _next_nonoverlapping(
                df_to_find_nearest_in.End, ocdf.Start, ocdf.index.values)
            r_idx, dist = nearest_nonoverlapping(
                previous_r_idx, previous_dist, next_r_idx, next_dist)

        ocdf = ocdf.reindex(r_idx)
        ocdf.index = df_to_find_nearest_in.index
        ocdf = _insert_distance(ocdf, dist, suffix)

        # A label of -1 marks a row for which no partner was found.
        r_idx = pd.Series(r_idx, index=ocdf.index)
        df_to_find_nearest_in = df_to_find_nearest_in.drop(
            r_idx.loc[r_idx == -1].index)

        df = df_to_find_nearest_in.join(ocdf, rsuffix=suffix)

    if overlap and not nearest_df.empty:
        if df is not None and not df.empty:
            df = pd.concat([nearest_df, df], sort=False)
        else:
            df = nearest_df

    if df is None:
        # Nothing to search and nothing from the overlap step: report
        # "no result" the same way the empty-input guard does.
        return None

    return df.drop("Chromosome" + suffix, axis=1)