def combine_group_rows_on_char(df, group_on, combine_cols=None, char='|'):
    """
    Performs a groupby on a dataframe and then converts each group into a single row,
    joined by a character `char`.

    Primarily supports grouping on columns; other methods have not been tested.

    :param df: the dataframe to group
    :param group_on: the column name or list of column names to group by
    :param combine_cols: a list of column names to combine with a character. If None,
        all columns are combined. Providing only the columns of interest can save
        computation time.
    :param char: the character to combine the columns with. Defaults to a `|` character.
    :return: dataframe with one row per group, with the values of the different rows
        joined by the given character.
    """
    col_order = df.columns
    if type(group_on) in (str, int, float):
        group_on = [group_on]
    grouped = df.groupby(group_on)
    if combine_cols is None:
        combine_cols = find_cols_with_multi_values(grouped)
    out_df = grouped.first()
    for col in tqdm(combine_cols, desc='total_progress'):
        tqdm.pandas(desc=col)
        out_df[col] = grouped[col].progress_apply(char_combine_col, char=char)
    return out_df.reset_index()[col_order]
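# Minimal, standalone sketch of the same idea with plain pandas (hypothetical data;
# the helpers `find_cols_with_multi_values` and `char_combine_col` used above are
# assumed to come from the surrounding module and are not reproduced here):
import pandas as pd

df = pd.DataFrame({"id": [1, 1, 2], "tag": ["a", "b", "c"]})
collapsed = df.groupby("id", as_index=False).agg({"tag": lambda s: "|".join(map(str, s))})
# collapsed has one row per id, with tag values "a|b" and "c"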
def create_farm_id_translate_table(
        df: pd.DataFrame,
        columns=["kommunenr", "gaardsnummer", "bruksnummer", "festenummer"]):
    """
    For a dataframe of farmers, each identified by kommunenr, gårdsnummer, bruksnummer,
    and festenummer, returns a new dataframe with new and updated ids for every farmer,
    queried against the geonorge commune reform API.

    Settle in, this could take a while...
    """
    from tqdm.autonotebook import tqdm

    old_farms = df[columns]
    tqdm.pandas(desc="Creating translate table...", ncols=100)

    def apply_func(farm):
        return get_updated_commune_and_farm_id(*farm)

    new_farms = old_farms.progress_apply(apply_func, axis=1, result_type="broadcast")
    new_farms.columns = list(map(lambda c: c + "_new", columns))
    old_farms.columns = list(map(lambda c: c + "_old", columns))
    return old_farms.merge(new_farms, left_index=True, right_index=True)
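# Hedged sketch of the row-wise progress_apply pattern used above, with a stand-in
# lookup in place of the real geonorge query (`get_updated_commune_and_farm_id`):
import pandas as pd
from tqdm.autonotebook import tqdm

tqdm.pandas(desc="Creating translate table...", ncols=100)
farms = pd.DataFrame({"kommunenr": [301, 5001], "gaardsnummer": [1, 2],
                      "bruksnummer": [3, 4], "festenummer": [0, 0]})

def fake_lookup(kommunenr, gaardsnummer, bruksnummer, festenummer):
    # stand-in only: the real function queries the geonorge commune reform API
    return kommunenr, gaardsnummer, bruksnummer, festenummer

# result_type="broadcast" keeps the original index and column names
new_farms = farms.progress_apply(lambda row: fake_lookup(*row), axis=1, result_type="broadcast")
new_farms.columns = [c + "_new" for c in farms.columns]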
def _dask_apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, *args, **kwds):
    samp = self._obj.iloc[: self._npartitions * 2, :]
    meta = samp.apply(
        func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
    )
    try:
        if broadcast:
            result_type = "broadcast"
        elif reduce:
            result_type = "reduce"
        tmp_df = (
            dd.from_pandas(samp, npartitions=self._npartitions)
            .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
            .compute(scheduler=self._scheduler)
        )
        assert tmp_df.equals(meta)
        if self._progress_bar:
            with TQDMDaskProgressBar(desc="Dask Apply"):
                return (
                    dd.from_pandas(self._obj, npartitions=self._npartitions)
                    .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                    .compute(scheduler=self._scheduler)
                )
        else:
            return (
                dd.from_pandas(self._obj, npartitions=self._npartitions)
                .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                .compute(scheduler=self._scheduler)
            )
    except (AssertionError, AttributeError, ValueError, TypeError) as e:
        if self._progress_bar:
            tqdm.pandas(desc="Pandas Apply")
            return self._obj.progress_apply(
                func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type,
                args=args, **kwds
            )
        else:
            return self._obj.apply(
                func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type,
                args=args, **kwds
            )
def apply(self, func, convert_dtype=True, args=(), **kwds):
    """
    Apply the function to the Series using swifter
    """
    samp = self._obj.iloc[:self._npartitions * 2]

    # check if input is string or if the user is overriding the string processing default
    allow_dask_processing = True if self._allow_dask_on_strings else (samp.dtype != "object")

    if "axis" in kwds.keys():
        kwds.pop("axis")
        warnings.warn("Axis keyword not necessary because applying on a Series.")

    try:  # try to vectorize
        tmp_df = func(samp, *args, **kwds)
        assert samp.apply(func, convert_dtype=convert_dtype, args=args, **kwds).equals(tmp_df)
        return func(self._obj, *args, **kwds)
    except (AssertionError, AttributeError, ValueError, TypeError, TypingError):
        # if can't vectorize, estimate time to pandas apply
        wrapped = self._wrapped_apply(func, convert_dtype=convert_dtype, args=args, **kwds)
        n_repeats = 3
        timed = timeit.timeit(wrapped, number=n_repeats)
        samp_proc_est = timed / n_repeats
        est_apply_duration = samp_proc_est / self._SAMP_SIZE * self._obj.shape[0]

        # if pandas apply takes too long and not performing str processing, use dask
        if (est_apply_duration > self._dask_threshold) and allow_dask_processing:
            return self._dask_apply(func, convert_dtype, *args, **kwds)
        else:  # use pandas
            if self._progress_bar:
                tqdm.pandas(desc="Pandas Apply")
                return self._obj.progress_apply(func, convert_dtype=convert_dtype, args=args, **kwds)
            else:
                return self._obj.apply(func, convert_dtype=convert_dtype, args=args, **kwds)
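# Usage sketch of the public swifter accessor whose Series dispatch appears above
# (assumes the swifter package is installed; importing it registers a `.swifter`
# accessor on pandas objects, which then chooses between vectorized, dask, and
# pandas apply as in the logic shown):
import pandas as pd
import swifter  # noqa: F401  -- importing registers the accessor

series = pd.Series(range(1_000_000))
squared = series.swifter.apply(lambda x: x ** 2)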
def _apply_func(iterable, func, tqdm_obj=None):
    """
    Applies a function to an iterable immutably.
    """
    if isinstance(iterable, (pd.DataFrame, pd.Series)):
        tqdm.pandas()
        return iterable.progress_apply(func)
    else:
        def update(*args):
            tqdm_obj.update()
            return func(*args)

        return map(update, iterable)
def apply(self, func, *args, **kwds):
    """
    Apply the function to the transformed swifter object
    """
    # estimate time to pandas apply
    wrapped = self._wrapped_apply(func, *args, **kwds)
    n_repeats = 3
    timed = timeit.timeit(wrapped, number=n_repeats)
    samp_proc_est = timed / n_repeats
    est_apply_duration = samp_proc_est / self._SAMP_SIZE * self._nrows

    # if pandas apply takes too long, use dask
    if est_apply_duration > self._dask_threshold:
        return self._dask_apply(func, *args, **kwds)
    else:  # use pandas
        if self._progress_bar:
            tqdm.pandas(desc="Pandas Apply")
            return self._obj_pd.progress_apply(func, *args, **kwds)
        else:
            return self._obj_pd.apply(func, *args, **kwds)
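# Hedged sketch of the "time a sample, then extrapolate" heuristic used above
# (names here are illustrative, not swifter internals):
import timeit
import pandas as pd

df = pd.DataFrame({"x": range(10_000)})
sample = df.iloc[:1_000]
n_repeats = 3
per_run = timeit.timeit(lambda: sample.apply(lambda row: row["x"] ** 2, axis=1), number=n_repeats) / n_repeats
estimated_full_duration = per_run / len(sample) * len(df)  # seconds for a plain pandas apply on the full frame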
def assemble_library(
    spacers: pd.DataFrame,
    on_target_score_threshold: int = 0,
    off_target_score_threshold: int = 0,
    spacers_per_feature: int = 6,
) -> pd.DataFrame:
    """Creates a final list of protospacers for synthesis

    Parameters
    ----------
    spacers : :class:`~pd.DataFrame`
        Dataframe with all spacers found by :module:`~find_spacers.find_spacers`,
        scores added by :module:`~on_target_scoring.on_target_scoring` and
        :module:`~off_target_scoring.off_target_scoring`
    on_target_score_threshold : int, optional (default: 0)
        Spacers with an on-target score below this threshold will be removed
    off_target_score_threshold : int, optional (default: 0)
        Spacers with an off-target score below this threshold will be removed
    spacers_per_feature : int, optional (default: 6)
        The number of spacers to return for each gene

    Returns
    -------
    :class:`~pd.DataFrame` with the final spacer sequences for synthesis
    """
    spacers = spacers[spacers["on_target_score"] > on_target_score_threshold]
    spacers = spacers[spacers["off_target_score"] > off_target_score_threshold]
    spacers = spacers.drop(labels=["seq_hash", "hash"], axis="columns").drop_duplicates()
    if spacers_per_feature == 0:
        # the tracking columns were already dropped above, so return the filtered frame as-is
        return spacers
    else:
        tqdm.pandas(desc="Assembling library", unit="spacers")
        grouped = (
            spacers.groupby("gene_name")
            .progress_apply(lambda x: x.nlargest(spacers_per_feature, "on_target_score"))
            .reset_index(drop=True)
            # .drop(labels=["seq_hash", "hash"], axis="columns")
        )
        return grouped
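# Self-contained toy example of the per-gene "keep the top N by on-target score" step
# above (the real pipeline supplies these columns from the scoring modules):
import pandas as pd
from tqdm.autonotebook import tqdm

tqdm.pandas(desc="Assembling library", unit="spacers")
toy = pd.DataFrame({
    "gene_name": ["geneA", "geneA", "geneA", "geneB"],
    "on_target_score": [10, 40, 30, 25],
    "off_target_score": [5, 5, 5, 5],
})
top_two = (
    toy.groupby("gene_name")
    .progress_apply(lambda x: x.nlargest(2, "on_target_score"))
    .reset_index(drop=True)
)
# top_two keeps the two highest-scoring rows for geneA and the single row for geneB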
def generate_unique_filepaths(outfile=None, nrows=None):
    '''
    Create a list of unique filepaths for all case json in the PACER folder and export to .csv

    Inputs:
        - outfile (str or Path) - the output file name (.csv), relative to the project root;
          if None, no file is written
        - nrows (int) - no. of cases to use (for testing)
    Outputs:
        DataFrame of file metadata (also written to outfile if one is given)
    '''
    import pandas as pd
    tqdm.pandas()

    case_jsons = [court_dir.glob('json/*.json') for court_dir in settings.PACER_PATH.glob('*')
                  if court_dir.is_dir()]
    file_iter = chain(*case_jsons)
    df = convert_filepaths_list(file_iter=file_iter, nrows=nrows)

    # Write the file
    if outfile:
        df.to_csv(std_path(outfile))

    return df
def filtrations(df, with_dots=False):
    stopWords = set(stopwords.words('english'))

    if with_dots:
        tqdm.pandas(desc="WITH DOTS: ")
        df = df[df.lemma.progress_apply(
            lambda lemma: str(lemma) not in string.punctuation.replace('.', ''))]
    else:
        tqdm.pandas(desc="WITHOUT DOTS: ")
        df = df[df.lemma.progress_apply(
            lambda lemma: str(lemma) not in string.punctuation)]

    mask = ((~df.lemma.isin(stopWords)) & (df.ner_tag != '[]') & (df.ner_tag != '')
            & (df.lemma != '') & (df.token != ''))
    df = df[mask]
    tqdm.pandas(desc="")
    return df
def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, args=(), **kwds):
    """
    Apply the function to the DataFrame using swifter
    """
    samp = self._obj.iloc[: self._npartitions * 2, :]
    # check if input is string or if the user is overriding the string processing default
    str_processing = ("object" in samp.dtypes.values) if not self._allow_dask_on_strings else False

    try:  # try to vectorize
        tmp_df = func(samp, *args, **kwds)
        assert samp.apply(
            func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
        ).equals(tmp_df)
        return func(self._obj, *args, **kwds)
    except (AssertionError, AttributeError, ValueError, TypeError, TypingError) as e:
        # if can't vectorize, estimate time to pandas apply
        wrapped = self._wrapped_apply(
            func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
        )
        n_repeats = 3
        timed = timeit.timeit(wrapped, number=n_repeats)
        samp_proc_est = timed / n_repeats
        est_apply_duration = samp_proc_est / self._SAMP_SIZE * self._obj.shape[0]

        # if pandas apply takes too long and not performing str processing, use dask
        if (est_apply_duration > self._dask_threshold) and (not str_processing):
            if axis == 0:
                raise NotImplementedError(
                    "Swifter cannot perform axis=0 applies on large datasets.\n"
                    "Dask currently does not have an axis=0 apply implemented.\n"
                    "More details at https://github.com/jmcarpenter2/swifter/issues/10"
                )
            return self._dask_apply(func, axis, broadcast, raw, reduce, result_type, *args, **kwds)
        else:  # use pandas
            if self._progress_bar:
                tqdm.pandas(desc="Pandas Apply")
                return self._obj.progress_apply(
                    func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type,
                    args=args, **kwds
                )
            else:
                return self._obj.apply(
                    func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type,
                    args=args, **kwds
                )
# In[5]:

import pandas as pd
import urllib
import numpy as np
import json
from tqdm.autonotebook import tqdm
#%matplotlib inline
tqdm.pandas()

import dask.dataframe as dd
from dask.multiprocessing import get
from dask.diagnostics import ProgressBar

from datetime import datetime
import matplotlib.pyplot as plt
from IPython.display import display

# In[10]:
# %%
import glob
import os
from functools import partial

import pandas as pd
from pymatgen.core import Composition, Structure
from tqdm.autonotebook import tqdm

from aviary.cgcnn.utils import get_cgcnn_input
from aviary.wren.utils import count_wyks, get_aflow_label_spglib

tqdm.pandas()  # prime progress_apply functionality

final_dir = os.path.dirname(os.path.abspath(__file__))

idx_list = []
structs = []
E_vasp_list = []
meta_list = []
ht_paths = []

for f in glob.glob(final_dir + "/raw/*.poscar", recursive=True):
    task_id = f.split("/")[-1].split(".")[0]

    with open(f) as s:
        s = s.read()

    struct = Structure.from_str(s, fmt="poscar")
    lines = s.split("\n")
def additional_features(df):
    tqdm.pandas(desc="IS TITLE: ")
    df['is_title'] = df.token.progress_apply(lambda x: int(str(x).istitle()))
    tqdm.pandas(desc="CONTAINS DIGITS: ")
    # 1 when the token is not purely alphabetic (a proxy for digits and punctuation)
    df['contains_digits'] = df.token.progress_apply(lambda x: int(not str(x).isalpha()))
    tqdm.pandas(desc="WORD LENGTH: ")
    df['word_len'] = df.token.progress_apply(lambda x: len(str(x)))
    tqdm.pandas(desc="SUFFIX: ")
    df['suffix'] = df.lemma.progress_apply(lambda x: str(x)[-3:])
    tqdm.pandas(desc="PREFIX: ")
    df['prefix'] = df.lemma.progress_apply(lambda x: str(x)[0:3])
    tqdm.pandas(desc="")

    df['prev_pos_tag'] = np.roll(df.pos_tag.values, 1)
    df['prev_is_title'] = np.roll(df.is_title.values, 1)
    df['prev_contains_digits'] = np.roll(df.contains_digits.values, 1)
    df['prev_word_len'] = np.roll(df.word_len.values, 1)
    df['prev_suffix'] = np.roll(df.suffix.values, 1)
    df['prev_prefix'] = np.roll(df.prefix.values, 1)

    df['next_pos_tag'] = np.roll(df.pos_tag.values, -1)
    df['next_is_title'] = np.roll(df.is_title.values, -1)
    df['next_contains_digits'] = np.roll(df.contains_digits.values, -1)
    df['next_word_len'] = np.roll(df.word_len.values, -1)
    df['next_suffix'] = np.roll(df.suffix.values, -1)
    df['next_prefix'] = np.roll(df.prefix.values, -1)

    return df
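# Small illustration of the np.roll prev/next trick used above; note the wrap-around
# at the first and last row, which callers may want to mask out:
import numpy as np
import pandas as pd

toy = pd.DataFrame({"pos_tag": ["DT", "NN", "VB"]})
toy["prev_pos_tag"] = np.roll(toy.pos_tag.values, 1)   # ['VB', 'DT', 'NN']
toy["next_pos_tag"] = np.roll(toy.pos_tag.values, -1)  # ['NN', 'VB', 'DT']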
def create_library(
    input_sequences: str = None,
    output_library: str = None,
    reference: str = None,
    restriction_sites: str = None,
    largeindex: bool = False,
    on_target_rule_set: Optional[str] = None,
    on_target_score_threshold: int = 0,
    off_target_rule_set: Optional[str] = None,
    off_target_score_threshold: int = 0,
    off_target_count_threshold: int = 100,
    number_mismatches_to_consider: int = 3,
    nuclease: str = "SpCas9",
    spacers_per_feature: int = 9,
    reject: bool = False,
    paired: bool = False,
    number_upstream_spacers: int = 0,
    number_downstream_spacers: int = 0,
    cores: int = 0,
    chunks: int = 8,
    verbose: bool = False,
    write_early_exit: bool = False,
) -> None:
    """Build a CRISPR library
    \f
    Parameters
    ----------
    :param input_sequences:
    :param output_library:
    :param reference:
    :param restriction_sites:
    :param largeindex:
    :param on_target_rule_set:
    :param on_target_score_threshold:
    :param off_target_rule_set:
    :param off_target_score_threshold:
    :param off_target_count_threshold:
    :param number_mismatches_to_consider:
    :param nuclease:
    :param spacers_per_feature:
    :param reject:
    :param paired:
    :param number_upstream_spacers:
    :param number_downstream_spacers:
    :param cores:
    :param chunks:
    :param verbose:
    :param write_early_exit:

    Return
    ------
    None
    """
    targets = pyfaidx.Fasta(input_sequences)

    global NUCLEASES
    nuc = NUCLEASES[NUCLEASES["nuclease"] == nuclease].to_dict(orient="records")[0]

    spacers_df = find_spacers(
        itemlist=targets,
        nuclease_info=nuc,
        restriction_sites=restriction_sites,
        chunks=chunks,
    )
    if write_early_exit:
        spacers_df.to_csv("/Users/milessmith/workspace/mc_human_files/early_exit.csv")
        sys.exit(0)

    initialnumber = spacers_df.shape[0]

    # thank the gods for the tutorial at
    # https://www.machinelearningplus.com/python/parallel-processing-python/
    # scoring_pool = Pool(cores)
    chunked_spacer_dfs = np.array_split(spacers_df, chunks * 10)
    scoring_partial = partial(
        on_target_scoring,
        rule_set=on_target_rule_set,
        on_target_score_threshold=on_target_score_threshold,
    )
    spacers_df = pd.concat(p_umap(scoring_partial, chunked_spacer_dfs))
    # scoring_pool.close()
    # scoring_pool.join()
    # scoring_pool.clear()

    if spacers_df.shape[0] == 0:
        print("Sorry, no spacers matching that criteria were found")
        exit()
    else:
        if verbose:
            print(
                f"Finished scoring spacers. {spacers_df.shape[0]} of {initialnumber} "
                f"spacers have an on-target score above the cutoff threshold of "
                f"{on_target_score_threshold}."
            )

        tqdm.pandas(desc="Adding tracking hashes", unit="spacers")
        spacers_df["hash"] = spacers_df.progress_apply(lambda x: hash(tuple(x)), axis=1)

        if verbose:
            print("\nBeginning Bowtie alignment...")

        off_target_results_file = off_target_discovery(
            spacers_df=spacers_df,
            nuclease_info=nuc,
            cpus=cores,
            refgenome=reference,
            large_index_size=largeindex,
            reject=reject,
            number_mismatches_to_consider=number_mismatches_to_consider,
            verbose=verbose,
        )

        spacers_df = off_target_scoring(
            otrf=off_target_results_file,
            spacers_df=spacers_df,
            nuclease_info=nuc,
            rule_set=off_target_rule_set,
            off_target_score_threshold=off_target_score_threshold,
            off_target_count_threshold=off_target_count_threshold,
            verbose=verbose,
        )

        if paired:
            guide_library = assemble_paired_library(
                spacers=spacers_df,
                on_target_score_threshold=on_target_score_threshold,
                off_target_score_threshold=off_target_score_threshold,
                number_upstream_spacers=number_upstream_spacers,
                number_downstream_spacers=number_downstream_spacers,
            )
        else:
            guide_library = assemble_library(
                spacers=spacers_df,
                on_target_score_threshold=on_target_score_threshold,
                off_target_score_threshold=off_target_score_threshold,
                spacers_per_feature=spacers_per_feature,
            )
        guide_library.to_csv(output_library)
        print("Finished.")
def assemble_paired_library(
    spacers: pd.DataFrame,
    on_target_score_threshold: int = 100,
    off_target_score_threshold: int = 100,
    number_upstream_spacers: int = 3,
    number_downstream_spacers: int = 3,
    # min_paired_distance: int = 30,  # re-enable once I figure it out
    mix_and_match: bool = True,
) -> pd.DataFrame:
    """Creates a final list of protospacers for synthesis.

    Used to create excision libraries, where two spacers are necessary to cause cuts
    at either side of a feature. `assemble_paired_library()` will take a set of
    upstream and a set of downstream spacers, generate all permutations of those
    originating from the same feature, and assemble them into a synthetic SpCas9
    spacer array.

    Parameters
    ----------
    spacers : :class:`~pd.DataFrame`
        Dataframe with all spacers found by :module:`~find_spacers.find_spacers`,
        scores added by :module:`~on_target_scoring.on_target_scoring` and
        :module:`~off_target_scoring.off_target_scoring`
    on_target_score_threshold : int, optional (default: 100)
        Spacers with an on-target score below this threshold will be removed
    off_target_score_threshold : int, optional (default: 100)
        Spacers with an off-target score below this threshold will be removed
    number_upstream_spacers : int, optional (default: 3)
        Number of spacers upstream of a gene to use
    number_downstream_spacers : int, optional (default: 3)
        Number of spacers downstream of a gene to use
    mix_and_match : bool, optional (default: True)
        If `True`, permutations of the final upstream and downstream spacers will be
        assembled into a larger synthetic spacer array construct.

    Returns
    -------
    :class:`~pd.DataFrame` with the final spacer sequences for synthesis. If
    `mix_and_match` is `True`, then this will correspond to the spacer arrays; if
    `False`, then this will be a listing of the final upstream and downstream spacers.
    """
    spacers = spacers[spacers["on_target_score"] > on_target_score_threshold]
    spacers = spacers[spacers["off_target_score"] > off_target_score_threshold]
    upstream_spacers = spacers[spacers["gene_name"].str.contains("upstream")]
    downstream_spacers = spacers[spacers["gene_name"].str.contains("downstream")]

    tqdm.pandas(desc="finding upstream spacers with highest on-target scores")
    grouped_upstream = (
        upstream_spacers.groupby("seq_hash")
        .progress_apply(lambda x: x.nlargest(number_upstream_spacers, "on_target_score"))
        .reset_index(drop=True)
    )
    tqdm.pandas(desc="finding downstream spacers with highest on-target scores")
    grouped_downstream = (
        downstream_spacers.groupby("seq_hash")
        .progress_apply(lambda x: x.nlargest(number_downstream_spacers, "on_target_score"))
        .reset_index(drop=True)
    )

    if mix_and_match:
        original_targets = spacers["seq_hash"].drop_duplicates().values
        combo_df = pd.DataFrame(columns=[
            "gene_name",
            "feature_id",
            "strand",
            "spacer",
            "upstream_on_target_score",
            "downstream_on_target_score",
            "upstream_off_target_score",
            "downstream_off_target_score",
            "seq_hash",
            "upstream_hash",
            "downstream_hash",
        ])

        for _ in original_targets:
            tmp_upstream_spacers = grouped_upstream[grouped_upstream["seq_hash"] == _]
            tmp_downstream_spacers = grouped_downstream[grouped_downstream["seq_hash"] == _]

            for permuted_indices in product(tmp_upstream_spacers.index, tmp_downstream_spacers.index):
                upstream_index, downstream_index = permuted_indices
                instance_df = pd.DataFrame({
                    # note: str.strip() removes any of these characters from both ends,
                    # not the literal "-upstream" suffix
                    "gene_name":
                        tmp_upstream_spacers["gene_name"].drop_duplicates().item().strip("-upstream"),
                    "feature_id": tmp_upstream_spacers["feature_id"],
                    "strand": tmp_upstream_spacers["strand"],
                    "spacer": "".join([
                        BSMBI_ARM_5,
                        RIGHT_EXTRA_SPACER,
                        tmp_upstream_spacers.loc[upstream_index, "spacer"],
                        DIRECT_REPEAT,
                        LEFT_EXTRA_SPACER,
                        tmp_downstream_spacers.loc[downstream_index, "spacer"],
                        BSMBI_ARM_3,
                    ]),
                    "upstream_on_target_score": tmp_upstream_spacers.loc[upstream_index, "on_target_score"],
                    "downstream_on_target_score": tmp_downstream_spacers.loc[downstream_index, "on_target_score"],
                    "upstream_off_target_score": tmp_upstream_spacers.loc[upstream_index, "off_target_score"],
                    "downstream_off_target_score": tmp_downstream_spacers.loc[downstream_index, "off_target_score"],
                    "seq_hash": tmp_upstream_spacers.loc[upstream_index, "seq_hash"],
                    "upstream_hash": tmp_upstream_spacers.loc[upstream_index, "hash"],
                    "downstream_hash": tmp_downstream_spacers.loc[downstream_index, "hash"],
                })
                combo_df = pd.concat([combo_df, instance_df]).drop_duplicates()
        return combo_df
    else:
        return pd.concat([grouped_upstream, grouped_downstream])
import os  # needed for MODEL_PATH below

import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Phrases
from gensim.models import CoherenceModel
# from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary
from datetime import datetime

from .ploty_template import plot_title
from .eda import Documents
from . import models

from tqdm.autonotebook import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings("ignore")

global MODEL_PATH
MODEL_PATH = os.path.abspath(os.path.dirname(models.__file__))


################## TODO: Temporal Topic Modelling ##################
class DynTM:

    def __init__(self, documents_object, num_topics=None, algo=None):
        if isinstance(documents_object, Documents):
            self.doc_obj = documents_object
            self.raw_df = documents_object.raw_df
            self.stop_words = documents_object.stop_words
def off_target_scoring(
    otrf: str,
    spacers_df: pd.DataFrame,
    nuclease_info: Dict[str, Any],
    rule_set: Optional[str] = None,
    off_target_score_threshold: int = 0,
    off_target_count_threshold: Optional[int] = 100,
    verbose: bool = False,
) -> object:
    """Calculate a cumulative off-target score for a protospacer
    \f
    Parameters
    ----------
    otrf : `str`
        Path to the results from Bowtie
    spacers_df : :class:`~pandas.DataFrame`
        Dataframe containing spacers. Format should be
        `{'gene_name', 'feature_id', 'start', 'stop', 'strand', 'spacer'}`
    nuclease_info : `str`
        Dictionary series with nuclease characteristics from nuclease_list.csv
    rule_set : `str`, optional
        Rule set used by `sumofftargets` when scoring off-targets
    off_target_score_threshold : `int`
        Total off-target score threshold beyond which a spacer is rejected.
        Ranges from 0 to 100.
    off_target_count_threshold : `int`, default: 100
        Number of potential off-target alignments that should be tolerated. Spacers
        exceeding the threshold will be discarded
    verbose : `bool`

    Returns
    -------
    :class:`~pandas.DataFrame` matching the one passed to spacers_df containing
    off-target scores
    """
    bowtie_results = pd.read_csv(
        otrf,
        header=None,
        names=[
            "hash",
            "strand",
            "refseq",
            "position",
            "seq",
            "readquality",
            "aligncount",
            "mismatches",
        ],
        usecols=["hash", "mismatches"],
        dtype={"hash": "int64", "mismatches": "str"},
        na_filter=False,
        skip_blank_lines=True,
        sep="\t",
        memory_map=True,
    )

    if verbose:
        print(f"Total alignments from Bowtie: {bowtie_results.shape[0]}")

    # We need to reduce the number of spacers we examine. For the most part,
    # those with a lot of potential off-targets (>1000?) have really low
    # scores and are worthless. Some have >10,000 (!) potential off-targets
    # and should just be thrown out.
    results_count = bowtie_results.groupby("hash").agg("count").reset_index()
    filtered_results = bowtie_results[
        bowtie_results["hash"].isin(
            results_count[results_count["mismatches"] < off_target_count_threshold]["hash"]
        )
    ]

    # Keep only those spacers that have fewer than our cutoff
    spacers_df = spacers_df[spacers_df["hash"].isin(filtered_results["hash"])]

    mmpos = regex.compile("[0-9]{1,}")
    tqdm.pandas(desc="converting mismatches", unit="spacers")
    filtered_results["locations"] = filtered_results["mismatches"].progress_apply(mmpos.findall)

    tqdm.pandas(desc="collapsing mismatches", unit="spacers")
    collapsed_results = (
        filtered_results.groupby("hash")
        .progress_apply(lambda x: x["locations"].values)
        .reset_index()
        .rename(index=str, columns={0: "locations"})
    )

    tqdm.pandas(desc="scoring mismatches", unit="spacers")
    collapsed_results["off_target_score"] = collapsed_results.progress_apply(
        lambda x: sumofftargets(x["locations"], rule_set=rule_set), axis=1
    )

    spacers_df = spacers_df.merge(collapsed_results, on="hash")
    tqdm.pandas(desc="counting off-targets", unit="spacers")
    spacers_df["off_targets"] = spacers_df.progress_apply(lambda x: len(x["locations"]) - 1, axis=1)
    spacers_df = spacers_df.drop(columns=["locations"])
    spacers_df = spacers_df[spacers_df["off_target_score"] > off_target_score_threshold]
    return spacers_df
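# Simplified, self-contained sketch of the mismatch parsing and per-hash collapse
# performed above (toy Bowtie-style data; the real scoring via `sumofftargets`
# comes from the surrounding package and is not reproduced here):
import re
import pandas as pd

toy = pd.DataFrame({
    "hash": [1, 1, 2],
    "mismatches": ["3:A>G,17:C>T", "5:A>G", ""],
})
mmpos = re.compile(r"[0-9]{1,}")
toy["locations"] = toy["mismatches"].apply(mmpos.findall)  # e.g. ['3', '17']
collapsed = toy.groupby("hash")["locations"].apply(list).reset_index()
# collapsed now has one row per hash with all mismatch-position lists gathered together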