def main(args):
    """
    Find PAM sites on one chromosome for each requested Cas enzyme and save them.

    For every Cas name, two .npy files are written: one built from the first
    value returned by find_spec_pams ("for") and one from the second ("rev").

    :param args: parsed command-line arguments, dict-like; reads '<chrom>',
        '<out>', '<fasta>' and '<cas_list>' (comma-separated Cas names).
    :return: None.
    """
    # one chromosome per invocation; the script is driven by a bash wrapper
    chrom = args['<chrom>']
    # prefix prepended to every output filename
    outprefix = args['<out>']
    # Fasta object for the genome of choice (e.g. hg19), opened with as_raw=True
    genome = Fasta(args['<fasta>'], as_raw=True)
    cas_names = args['<cas_list>'].split(',')

    for cas in cas_names:
        current_cas = cas_obj.get_cas_enzyme(
            cas, os.path.join(cas_obj_path, 'CAS_LIST.txt'))
        chrom_seq = str(genome[str(chrom)])
        for_starts, rev_starts = find_spec_pams(
            current_cas, chrom_seq, orient=current_cas.primeness)

        # both output names share everything up to the orientation suffix
        stem = f'{outprefix}' + str(chrom) + '_' + str(cas)
        savestr_for = stem + '_pam_sites_for.npy'
        savestr_rev = stem + '_pam_sites_rev.npy'
        np.save(savestr_for, list(for_starts))
        np.save(savestr_rev, list(rev_starts))
def get_made_broke_pams(df, chrom, ref_genome):
    """
    Apply makes_breaks_pam to every row of a df, once per Cas enzyme.

    The Cas enzymes are taken from the module-level ``cas_list``; names that
    are not present in CAS_LIST.txt are logged and skipped. The input df is
    modified in place (a ``makes_<cas>`` and a ``breaks_<cas>`` column are
    added per enzyme) and is also returned.

    :param df: gens df generated by get_chr_tables.sh, available on EF github;
        the "pos", "ref" and "alt" columns are read.
    :param chrom: chromosome currently being analyzed.
    :param ref_genome: ref_genome fasta, pyfaidx format.
    :return: dataframe with indicators for whether each variant makes/breaks
        PAMs, pd df.
    """
    # the path is loop-invariant, so build it once
    cas_list_path = os.path.join(cas_obj_path, "CAS_LIST.txt")
    FULL_CAS_LIST = cas_obj.get_cas_list(cas_list_path)

    for cas in cas_list:
        if cas not in FULL_CAS_LIST:
            logging.info(f"Skipping {cas}, not in CAS_LIST.txt")
            continue
        current_cas = cas_obj.get_cas_enzyme(cas, cas_list_path)

        if df.empty:
            # apply(axis=1) on an empty frame hands back an empty frame, so
            # zip(*...) would iterate column labels instead of result pairs;
            # add empty columns explicitly instead.
            makes, breaks = [], []
        else:
            makes, breaks = zip(*df.apply(
                lambda row: makes_breaks_pam(
                    current_cas, chrom, row["pos"], row["ref"], row["alt"],
                    ref_genome),
                axis=1,
            ))
        df[f"makes_{cas}"] = makes
        df[f"breaks_{cas}"] = breaks
    return df
def adjusted_length(row):
    """
    Extend a (start, stop) interval by the length of the Cas enzyme's PAM.

    For rows whose "strand" is "positive" the PAM length is added to "stop";
    for any other strand value it is subtracted from "start".

    :param row: mapping with "cas_type", "strand", "start" and "stop" entries.
    :return: (start, stop) tuple with the PAM length included.
    """
    enzyme = cas_object.get_cas_enzyme(row["cas_type"])
    on_positive_strand = row["strand"] == "positive"
    begin, end = row["start"], row["stop"]
    pam_len = len(enzyme.forwardPam)

    if not on_positive_strand:
        return (begin - pam_len, end)
    return (begin, end + pam_len)
colors_formatted = [] for color in colors: colors_formatted.append( str(color[:3]).replace(' ', '').replace('[', '').replace(']', '')) # map Cas proteins to colors cas_to_colors = dict(zip(cas_list, colors_formatted)) bed_dfs = [] for chrom in chroms: for cas in cas_list: print(cas) cas_info = cas_obj.get_cas_enzyme( cas, os.path.join(cas_obj_path, 'CAS_LIST.txt')) pam_size = len(cas_info.forwardPam) for_pams = [] rev_pams = [] chroms_for = [] chroms_rev = [] with gzip.open( f'/pollard/data/projects/AlleleAnalyzer_data/pam_sites_hg38/pam_sites_hg38_txt/chr{chrom}_{cas}_pam_sites_for.txt.gz', 'rb') as f: for line in f.readlines(): for_pams.append(int(float(line.strip())) - 1) chroms_for.append('chr' + str(chrom)) with gzip.open( f'/pollard/data/projects/AlleleAnalyzer_data/pam_sites_hg38/pam_sites_hg38_txt/chr{chrom}_{cas}_pam_sites_rev.txt.gz', 'rb') as f: for line in f.readlines():
def main(args):
    """
    Annotate variants with make/break/near-PAM indicators for each Cas enzyme.

    Reads the variants table (HDF key "all"), loads the saved
    ``<chrom>_<cas>_pam_sites_for.npy`` / ``..._rev.npy`` position arrays from
    the PAM directory, flags variants that fall inside the range returned by
    get_range_upstream / get_range_downstream for any PAM site, adds the
    makes/breaks columns via get_made_broke_pams, and writes the combined
    table to ``<out>.h5``.

    :param args: parsed command-line arguments, dict-like; reads "<out>",
        "<pams_dir>", "<gens_file>", "--guide_len", "<ref_genome_fasta>" and
        "<cas>" (comma-separated Cas names).
    :return: None. Raises SystemExit (status 0) when the variants table is
        empty.
    """
    logging.info(args)
    out = args["<out>"]
    pams_dir = args["<pams_dir>"]
    gens = args["<gens_file>"]
    guide_len = int(args["--guide_len"])
    ref_genome = Fasta(args["<ref_genome_fasta>"], as_raw=True)

    # cas_list is module-level state: get_made_broke_pams reads it too.
    global cas_list
    cas_list = list(args["<cas>"].split(","))

    # Read in gens and chroms file, and see if gens file needs to be split.
    gens = pd.read_hdf(gens, "all")
    if gens.empty:
        print('No variants in this region.')
        # same exit status as exit(), without relying on the site module
        raise SystemExit

    chroms = dict(Counter(gens.chrom)).keys()
    if len(chroms) > 1:
        gens = split_gens(gens, list(chroms))
    else:
        gens = [gens]

    fasta_chrom = list(ref_genome.keys())[0].startswith("chr")
    chroms = [norm_chr(ch, fasta_chrom) for ch in list(chroms)]

    # NOTE: a check that the gens chromosomes match the FASTA keys used to
    # live here; it was disabled as too glitchy.

    # Drop Cas names that are not in CAS_LIST.txt. The list is rebuilt rather
    # than calling remove() while iterating, which skips the element that
    # follows each removal and let unsupported names slip through.
    cas_list_path = os.path.join(cas_obj_path, "CAS_LIST.txt")
    FULL_CAS_LIST = cas_obj.get_cas_list(cas_list_path)
    supported = []
    for cas in cas_list:
        if cas in FULL_CAS_LIST:
            supported.append(cas)
        else:
            logging.info(f"Skipping {cas}, not in CAS_LIST.txt")
    cas_list[:] = supported

    # save locations of PAM proximal variants to dictionary (refilled for
    # every chromosome)
    pam_prox_vars = {}
    combined_df = []
    for i, chrom in enumerate(chroms):
        chr_variants = set(gens[i]["pos"].tolist())
        for cas in cas_list:
            current_cas = cas_obj.get_cas_enzyme(cas, cas_list_path)
            logging.info(f"Evaluating {current_cas.name} at {chrom}.")
            pam_for_pos = np.load(
                os.path.join(pams_dir, f"{chrom}_{cas}_pam_sites_for.npy")
            ).tolist()
            pam_rev_pos = np.load(
                os.path.join(pams_dir, f"{chrom}_{cas}_pam_sites_rev.npy")
            ).tolist()

            # get variants within sgRNA region: for 3 prime PAMs the range
            # upstream of forward sites and downstream of reverse sites is
            # used, and vice versa for 5 prime PAMs. Any other primeness
            # value yields no proximal variants.
            if current_cas.primeness == "3'":
                site_ranges = (
                    (pam_for_pos, get_range_upstream),
                    (pam_rev_pos, get_range_downstream),
                )
            elif current_cas.primeness == "5'":
                site_ranges = (
                    (pam_for_pos, get_range_downstream),
                    (pam_rev_pos, get_range_upstream),
                )
            else:
                site_ranges = ()

            # a set is enough: the result is only used for membership tests
            cas_prox_vars = set()
            for positions, get_range in site_ranges:
                for pos in positions:
                    cas_prox_vars.update(
                        set(get_range(pos, guide_len)) & chr_variants)
            pam_prox_vars[cas] = cas_prox_vars

        chrdf = get_made_broke_pams(gens[i], chrom, ref_genome)

        cas_cols = []
        for cas in cas_list:
            chrdf[f"var_near_{cas}"] = chrdf["pos"].isin(pam_prox_vars[cas])
            cas_cols.extend(
                [f"makes_{cas}", f"breaks_{cas}", f"var_near_{cas}"])

        keepcols = ["chrom", "pos", "ref", "alt"] + cas_cols
        chrdf = chrdf[keepcols]
        combined_df.append(chrdf)

    combined_df = pd.concat(combined_df)
    combined_df.to_hdf(
        f"{out}.h5",
        "all",
        mode="w",
        format="table",
        data_columns=True,
        complib="blosc",
    )
    add_metadata(f"{out}.h5", args, os.path.basename(__file__), __version__,
                 "Annotation")
    logging.info("Done.")