def main(args):
    """Find PAM sites for every requested Cas enzyme on one chromosome.

    Keys read from ``args``:

    * ``<chrom>``    -- name of the chromosome to scan; it is passed in
      because this entry point is driven once per chromosome by a bash
      script.
    * ``<out>``      -- prefix prepended to every output file name.
    * ``<fasta>``    -- path handed to ``Fasta`` (e.g. an hg19 genome).
    * ``<cas_list>`` -- comma-separated Cas enzyme names.

    For each Cas name the forward and reverse results of
    ``find_spec_pams`` are written with ``np.save`` to
    ``<out><chrom>_<cas>_pam_sites_for.npy`` and
    ``<out><chrom>_<cas>_pam_sites_rev.npy``.
    """
    chrom = args['<chrom>']
    outprefix = args['<out>']
    genome = Fasta(args['<fasta>'], as_raw=True)
    requested_cas = args['<cas_list>'].split(',')

    # Every enzyme is looked up in the same definition file.
    cas_list_file = os.path.join(cas_obj_path, 'CAS_LIST.txt')

    for cas in requested_cas:
        enzyme = cas_obj.get_cas_enzyme(cas, cas_list_file)
        chrom_sequence = str(genome[str(chrom)])
        for_starts, rev_starts = find_spec_pams(
            enzyme, chrom_sequence, orient=enzyme.primeness)

        # Both strands share the same file-name stem.
        stem = f'{outprefix}{chrom}_{cas}_pam_sites_'
        np.save(stem + 'for.npy', list(for_starts))
        np.save(stem + 'rev.npy', list(rev_starts))
# Example #2
0
def get_made_broke_pams(df, chrom, ref_genome):
    """
    Annotate each variant in ``df`` with whether it makes or breaks a PAM.

    For every name in the module-level ``cas_list`` that also appears in
    CAS_LIST.txt, ``makes_breaks_pam`` is evaluated row by row and the two
    values it returns are stored in the new columns ``makes_<cas>`` and
    ``breaks_<cas>``. Names missing from CAS_LIST.txt are logged and skipped.
    ``df`` is modified in place and also returned.

    :param df: gens df generated by get_chr_tables.sh, available on EF github;
        the ``pos``, ``ref`` and ``alt`` columns are read here.
    :param chrom: chromosome currently being analyzed.
    :param ref_genome: ref_genome fasta, pyfaidx format.
    :return: the same dataframe with the make/break indicator columns added.
    """
    cas_list_file = os.path.join(cas_obj_path, "CAS_LIST.txt")
    known_cas = cas_obj.get_cas_list(cas_list_file)

    for cas in cas_list:
        if cas not in known_cas:
            logging.info(f"Skipping {cas}, not in CAS_LIST.txt")
            continue

        enzyme = cas_obj.get_cas_enzyme(cas, cas_list_file)

        def _row_flags(row):
            # One (makes, breaks) pair for a single variant row.
            return makes_breaks_pam(enzyme, chrom, row["pos"], row["ref"],
                                    row["alt"], ref_genome)

        per_row = df.apply(_row_flags, axis=1)
        # Transpose the per-row pairs into one sequence per indicator.
        makes, breaks = zip(*per_row)
        df[f"makes_{cas}"] = makes
        df[f"breaks_{cas}"] = breaks

    return df
def adjusted_length(row):
    """
    Return the ``(start, stop)`` pair of ``row`` widened by the PAM length.

    The enzyme is looked up from ``row["cas_type"]`` and the length of its
    ``forwardPam`` is used. When ``row["strand"]`` is ``"positive"`` the PAM
    length is added to ``stop``; for any other strand value it is subtracted
    from ``start``.
    """
    enzyme = cas_object.get_cas_enzyme(row["cas_type"])
    if row["strand"] != "positive":
        return (row["start"] - len(enzyme.forwardPam), row["stop"])
    return (row["start"], row["stop"] + len(enzyme.forwardPam))
# Example #4
0
# Render the first three components of every color (presumably R, G, B --
# confirm against where `colors` is built) as text with all spaces and
# square brackets removed.
_SPACES_AND_BRACKETS = str.maketrans('', '', ' []')
colors_formatted = [
    str(color[:3]).translate(_SPACES_AND_BRACKETS) for color in colors
]

# Pair each Cas protein with its formatted color, in list order.
cas_to_colors = {
    cas: formatted
    for cas, formatted in zip(cas_list, colors_formatted)
}

bed_dfs = []
for chrom in chroms:
    for cas in cas_list:
        print(cas)
        cas_info = cas_obj.get_cas_enzyme(
            cas, os.path.join(cas_obj_path, 'CAS_LIST.txt'))
        pam_size = len(cas_info.forwardPam)
        for_pams = []
        rev_pams = []
        chroms_for = []
        chroms_rev = []
        with gzip.open(
                f'/pollard/data/projects/AlleleAnalyzer_data/pam_sites_hg38/pam_sites_hg38_txt/chr{chrom}_{cas}_pam_sites_for.txt.gz',
                'rb') as f:
            for line in f.readlines():
                for_pams.append(int(float(line.strip())) - 1)
                chroms_for.append('chr' + str(chrom))
        with gzip.open(
                f'/pollard/data/projects/AlleleAnalyzer_data/pam_sites_hg38/pam_sites_hg38_txt/chr{chrom}_{cas}_pam_sites_rev.txt.gz',
                'rb') as f:
            for line in f.readlines():
# Example #5
0
def _collect_pam_proximal_variants(pam_for_pos, pam_rev_pos, primeness,
                                   guide_len, chr_variants):
    """Return the variant positions that fall in the guide window of any PAM.

    :param pam_for_pos: iterable of forward-strand PAM positions.
    :param pam_rev_pos: iterable of reverse-strand PAM positions.
    :param primeness: ``"3'"`` or ``"5'"``; selects which range helper is
        applied to which strand. Any other value yields an empty result.
    :param guide_len: window length passed to ``get_range_upstream`` /
        ``get_range_downstream``.
    :param chr_variants: set of variant positions on the chromosome.
    :return: set of positions from ``chr_variants`` that lie inside at least
        one window.
    """
    if primeness == "3'":
        for_range, rev_range = get_range_upstream, get_range_downstream
    elif primeness == "5'":
        for_range, rev_range = get_range_downstream, get_range_upstream
    else:
        # Unrecognized orientation: no variant is reported as proximal.
        return set()

    proximal = set()
    for pos in pam_for_pos:
        proximal.update(chr_variants.intersection(for_range(pos, guide_len)))
    for pos in pam_rev_pos:
        proximal.update(chr_variants.intersection(rev_range(pos, guide_len)))
    return proximal


def main(args):
    """Annotate variants with PAM make/break/proximity flags and save them.

    Keys read from ``args``:

    * ``<out>``              -- output path without the ``.h5`` suffix.
    * ``<pams_dir>``         -- directory holding
      ``<chrom>_<cas>_pam_sites_for.npy`` / ``..._rev.npy`` files.
    * ``<gens_file>``        -- HDF5 file whose ``"all"`` key holds the
      variants (columns ``chrom``, ``pos``, ``ref``, ``alt`` are used).
    * ``--guide_len``        -- guide length, converted with ``int``.
    * ``<ref_genome_fasta>`` -- reference genome opened with ``Fasta``.
    * ``<cas>``              -- comma-separated Cas enzyme names.

    The module-level ``cas_list`` is (re)bound to the requested names that are
    present in CAS_LIST.txt, because ``get_made_broke_pams`` reads it. For
    each chromosome and each Cas, the columns ``makes_<cas>``,
    ``breaks_<cas>`` and ``var_near_<cas>`` are produced; the per-chromosome
    tables are concatenated and written to ``<out>.h5`` under key ``"all"``,
    after which ``add_metadata`` is called on that file.

    Exits with status 0 (after printing a message) when the variant table is
    empty.
    """
    global cas_list

    logging.info(args)
    out = args["<out>"]
    pams_dir = args["<pams_dir>"]
    guide_len = int(args["--guide_len"])
    ref_genome = Fasta(args["<ref_genome_fasta>"], as_raw=True)
    cas_list = args["<cas>"].split(",")

    # Read in gens and chroms file, and see if gens file needs to be split.
    gens = pd.read_hdf(args["<gens_file>"], "all")
    if gens.empty:
        print('No variants in this region.')
        # `exit()` is an interactive helper injected by `site`; raise
        # SystemExit directly so this also works when `site` is not loaded.
        raise SystemExit(0)

    # Distinct chromosomes in order of first appearance.
    chroms = list(dict.fromkeys(gens.chrom))

    if len(chroms) > 1:
        gens = split_gens(gens, list(chroms))
    else:
        gens = [gens]

    # Match the "chr" prefix convention of the FASTA's first record name.
    fasta_chrom = list(ref_genome.keys())[0].startswith("chr")
    chroms = [norm_chr(ch, fasta_chrom) for ch in chroms]

    # NOTE: a check that the gens chromosomes match the FASTA record names
    # was tried here and deliberately left out because it was too glitchy.

    # Keep only the Cas enzymes defined in CAS_LIST.txt. A new list is built
    # instead of calling remove() during iteration, which skips the element
    # that follows every removed one.
    cas_list_file = os.path.join(cas_obj_path, "CAS_LIST.txt")
    full_cas_list = cas_obj.get_cas_list(cas_list_file)
    supported_cas = []
    for cas in cas_list:
        if cas in full_cas_list:
            supported_cas.append(cas)
        else:
            logging.info(f"Skipping {cas}, not in CAS_LIST.txt")
    cas_list = supported_cas

    combined_df = []
    for i, chrom in enumerate(chroms):
        chrom_gens = gens[i]
        chr_variants = set(chrom_gens["pos"].tolist())

        # Cas name -> positions of variants inside a guide window of a PAM.
        pam_prox_vars = {}
        for cas in cas_list:
            current_cas = cas_obj.get_cas_enzyme(cas, cas_list_file)
            logging.info(f"Evaluating {current_cas.name} at {chrom}.")

            pam_for_pos = np.load(
                os.path.join(pams_dir,
                             f"{chrom}_{cas}_pam_sites_for.npy")).tolist()
            pam_rev_pos = np.load(
                os.path.join(pams_dir,
                             f"{chrom}_{cas}_pam_sites_rev.npy")).tolist()

            pam_prox_vars[cas] = _collect_pam_proximal_variants(
                pam_for_pos, pam_rev_pos, current_cas.primeness, guide_len,
                chr_variants)

        chrdf = get_made_broke_pams(chrom_gens, chrom, ref_genome)

        for cas in cas_list:
            chrdf[f"var_near_{cas}"] = chrdf["pos"].isin(
                list(pam_prox_vars[cas]))

        # Per Cas, in order: makes_<cas>, breaks_<cas>, var_near_<cas>.
        cas_cols = [
            f"{prefix}_{cas}" for cas in cas_list
            for prefix in ("makes", "breaks", "var_near")
        ]
        keepcols = ["chrom", "pos", "ref", "alt"] + cas_cols
        combined_df.append(chrdf[keepcols])

    combined_df = pd.concat(combined_df)
    combined_df.to_hdf(
        f"{out}.h5",
        "all",
        mode="w",
        format="table",
        data_columns=True,
        complib="blosc",
    )

    add_metadata(f"{out}.h5", args, os.path.basename(__file__), __version__,
                 "Annotation")
    logging.info("Done.")