def dalignbed2dalignbedguides(cfg):
    """
    Get guide sequences from the BED file
    step#4

    :param cfg: configuration dict
    :returns cfg: configuration dict (output written to cfg['dalignbedguidesp'])
    """
    dalignbed = del_Unnamed(
        pd.read_csv(cfg['dalignbedp'], sep='\t', keep_default_na=False))
    dguides = set_index(
        del_Unnamed(
            pd.read_csv(cfg['dguidesp'], sep='\t', keep_default_na=False)),
        'guide: id')
    # if the error in human, use: `cut -f 1 data/alignment.bed.sorted.bed | sort| uniq -c | grep -v CHR | grep -v GL | grep -v KI`
    dalignbedguidesp = cfg['dalignbedguidesp']
    logging.info(basename(dalignbedguidesp))
    # skip if the output already exists, unless a re-run is forced
    if not exists(dalignbedguidesp) or cfg['force']:
        dalignbed = pd.merge(dalignbed,
                             dguides,
                             on='guide: id',
                             suffixes=('', '.1'))
        # BUGFIX: `sep` must be passed by keyword — DataFrame.to_csv accepts
        # only `path_or_buf` positionally in pandas >= 2.0, so the original
        # `to_csv(dalignbedguidesp, '\t')` raises TypeError there.
        dalignbed.to_csv(dalignbedguidesp, sep='\t')
    return cfg
def dalignbedannot2daggbyguide(cfg):
    """
    Aggregate annotations per alignment to annotations per guide.
    step#10

    :param cfg: configuration dict
    :returns cfg: configuration dict (outputs written to
        `<datatmpd>/10_daggbyguide.tsv` and cfg['dofftargetsp'])
    """
    datatmpd = cfg['datatmpd']
    dalignbedannot = del_Unnamed(
        pd.read_csv(cfg['dalignbedannotp'], sep='\t', low_memory=False))
    daggbyguidep = '{}/10_daggbyguide.tsv'.format(datatmpd)
    logging.info(basename(daggbyguidep))
    # skip if the output already exists, unless a re-run is forced
    if not exists(daggbyguidep) or cfg['force']:
        # seed the per-guide table from perfect-match alignments (NM == 0),
        # one row per guide
        daggbyguide = dalignbedannot.loc[(dalignbedannot['NM'] == 0), [
            'guide: id', 'guide+PAM sequence', 'gene names', 'gene ids',
            'transcript ids'
        ]].drop_duplicates(subset=['guide: id'])
        if len(daggbyguide) != 0:
            daggbyguide = set_index(daggbyguide, 'guide: id')
            guideids = daggbyguide.index.tolist()
            # for each guide, collapse annotation columns across all of its
            # alignments into one ';'-joined string of unique values
            for gi in range(len(guideids)):
                gid = guideids[gi]
                dalignbedannoti = dalignbedannot.loc[
                    dalignbedannot['guide: id'] == gid, :]
                # defensive: promote a 1-D result back to a DataFrame
                if len(dalignbedannoti.shape) == 1:
                    dalignbedannoti = pd.DataFrame(dalignbedannoti).T
                for col in [
                        'types', 'gene names', 'gene ids', 'transcript ids',
                        'protein ids', 'exon ids'
                ]:
                    daggbyguide.loc[gid, col] = ";".join(
                        np.unique(dalignbedannoti[col].fillna('nan').tolist()))
            from beditor.lib.get_scores import get_beditorscore_per_guide
            # fold the per-alignment scores into a single score per guide
            for guideid in daggbyguide.index:
                dalignbedannotguide = dalignbedannot.loc[(
                    dalignbedannot['guide: id'] == guideid), :]
                daggbyguide.loc[
                    guideid, 'beditor score'] = get_beditorscore_per_guide(
                        guide_seq=dalignbedannotguide['guide+PAM sequence'].
                        unique()[0],
                        strategy=dalignbedannotguide['strategy'].unique()[0],
                        align_seqs_scores=dalignbedannotguide['beditor score'],
                        BEs=cfg['BEs']
                        # test=cfg['test']
                    )
                daggbyguide.loc[guideid, 'CFD score'] = dalignbedannotguide[
                    'CFD score'].mean()  #FIXME if mean is not appropriate
            daggbyguide['beditor score (log10)'] = daggbyguide[
                'beditor score'].apply(np.log10)
            # count alternate alignments per guide by summing a column of 1s
            dalignbedannot['alternate alignments count'] = 1
            daggbyguide = daggbyguide.join(
                pd.DataFrame(
                    dalignbedannot.groupby('guide: id')
                    ['alternate alignments count'].agg('sum')))
            daggbyguide.to_csv(daggbyguidep, sep='\t')
            # the same aggregated table doubles as the off-targets output
            daggbyguide.to_csv(cfg['dofftargetsp'], sep='\t')
    return cfg
def dguides2guidessam(cfg, dguides):
    """
    Aligns guides to genome and gets SAM file
    step#1

    :param cfg: configuration dict
    :param dguides: dataframe of guides
    :returns cfg: configuration dict (FASTA/.sa/.sam files written under
        cfg['datatmpd'])
    """
    datatmpd = cfg['datatmpd']
    dguides = set_index(dguides, 'guide: id')
    guidels = dguides.loc[:, 'guide+PAM length'].unique()
    # one alignment round per distinct guide+PAM length
    for guidel in guidels:
        logging.debug(f"now aligning guides of length {guidel}")
        guidesfap = f'{datatmpd}/01_guides_guidel{guidel:02}.fa'
        logging.info(basename(guidesfap))
        if not exists(guidesfap) or cfg['force']:
            with open(guidesfap, 'w') as f:
                # NOTE(review): this writes ALL guides into each per-length
                # FASTA, not only the guides of length `guidel` — confirm
                # whether filtering by length was intended.
                for gi in dguides.index:
                    f.write('>{}\n{}\n'.format(
                        gi.replace(' ', '_'),
                        dguides.loc[gi, 'guide+PAM sequence']))
        ## BWA alignment command is adapted from cripror
        ## https://github.com/rraadd88/crisporWebsite/blob/master/crispor.py
        # BWA: allow up to X mismatches
        # maximum number of occurences in the genome to get flagged as repeats.
        # This is used in bwa samse, when converting the sam file
        # and for warnings in the table output.
        MAXOCC = 60000
        # the BWA queue size is 2M by default. We derive the queue size from MAXOCC
        # BUGFIX: use floor division. Under Python 3 the original `/` made
        # MFAC a float, so bwaM rendered as e.g. "1980000.0000000002" in the
        # shell command, which `bwa aln -m` (an integer option) rejects.
        MFAC = 2000000 // MAXOCC
        genomep = cfg['genomep']
        genomed = dirname(genomep)  # make var local, see below
        genomegffp = cfg['genomegffp']
        # increase MAXOCC if there is only a single query, but only in CGI mode
        bwaM = MFAC * MAXOCC  # -m is queue size in bwa
        guidessap = f'{datatmpd}/01_guides_guidel{guidel:02}.sa'
        logging.info(basename(guidessap))
        if not exists(guidessap) or cfg['force']:
            cmd = f"{cfg['bwa']} aln -t 1 -o 0 -m {bwaM} -n {cfg['mismatches_max']} -k {cfg['mismatches_max']} -N -l {guidel} {genomep} {guidesfap} > {guidessap} 2> {guidessap}.log"
            runbashcmd(cmd)
        guidessamp = f'{datatmpd}/01_guides_guidel{guidel:02}.sam'
        logging.info(basename(guidessamp))
        if not exists(guidessamp) or cfg['force']:
            cmd = f"{cfg['bwa']} samse -n {MAXOCC} {genomep} {guidessap} {guidesfap} > {guidessamp} 2> {guidessamp}.log"
            runbashcmd(cmd)
    return cfg
def df2features(df):
    """
    Build Biopython SeqFeature objects from a dataframe of intervals.

    cols= ini, end, name,sense
    """
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from beditor.lib.io_dfs import set_index
    colini, colend, colname, colsense = df.columns
    # normalize the frame through the project's set_index helper, then put
    # the name column back so it is addressable by label below
    df = set_index(df, colname)
    df = df.reset_index()
    features = []
    for _, row in df.iterrows():
        location = FeatureLocation(
            start=int(row[colini]),
            # FeatureLocation ends are exclusive, hence the +1
            end=int(row[colend]) + 1,
            strand=int(row[colsense]),
        )
        features.append(SeqFeature(location, type=row[colname]))
    return features
def dannotsagg2dannots2dalignbedannot(cfg):
    """
    Map aggregated annotations to guides
    step#9

    :param cfg: configuration dict
    """
    datatmpd = cfg['datatmpd']
    dannotsagg = del_Unnamed(
        pd.read_csv(cfg['dannotsaggp'], sep='\t', keep_default_na=False))
    dalignbedstats = del_Unnamed(
        pd.read_csv(cfg['dalignbedstatsp'], sep='\t', keep_default_na=False))
    dalignbedannotp = cfg['dalignbedannotp']
    logging.info(basename(dalignbedannotp))
    # nothing to do when the output exists and no re-run is forced
    if exists(dalignbedannotp) and not cfg['force']:
        return cfg
    # df2info(dalignbed)
    # df2info(dannotsagg)
    merged = dalignbedstats.set_index('id').join(
        set_index(dannotsagg, 'id'), rsuffix=' annotation')
    merged['NM'] = merged['NM'].apply(int)
    from beditor.lib.get_scores import get_beditorscore_per_alignment, get_cfdscore

    def _beditor_score(row):
        # per-alignment beditor score from mismatches, region and PAM info
        return get_beditorscore_per_alignment(
            NM=row['NM'],
            genic=row['region'] == 'genic',
            alignment=row['alignment'],
            pam_length=len(row['PAM']),
            pam_position=row['original position'],
            # test=cfg['test'],
        )

    def _cfd_score(row):
        return get_cfdscore(row['guide+PAM sequence'].upper(),
                            row['aligned sequence'].upper())

    merged['beditor score'] = merged.apply(_beditor_score, axis=1)
    # CFD can be undefined for some alignments; treat those as 0
    merged['CFD score'] = merged.apply(_cfd_score, axis=1).fillna(0)
    merged.to_csv(dalignbedannotp, sep='\t')
    return cfg
def get_seq_aminoacid(cfg, din):
    """
    Fetches sequences if mutation format is amino acid

    :param cfg: configuration dict
    :param din: input data
    :returns dsequences: dataframe with sequences (also written to
        cfg['dsequencesp']); returns None when no valid positions are found
    """
    import pyensembl
    #import ensembl object that would fetch genes
    # ensembl = pyensembl.EnsemblRelease(release=cfg['genomerelease'])
    ensembl = pyensembl.EnsemblRelease(
        species=pyensembl.species.Species.register(
            latin_name=cfg['host'],
            synonyms=[cfg['host']],
            reference_assemblies={
                cfg['genomeassembly']:
                (cfg['genomerelease'], cfg['genomerelease']),
            }),
        release=cfg['genomerelease'])
    din.index = range(len(din))
    dbedp = '{}/dbedflank.bed'.format(cfg['datad'])
    dbed = pd.DataFrame(columns=bed_colns)
    # transcripts rejected for different reasons, reported in the .err.json
    terrpositions = []
    terrnotfound = []
    terrnoncoding = []
    bedrowi = 0
    # for i in trange(len(din)-1,desc='get positions for bedtools'):
    for i in din.index:
        if din.loc[i, 'transcript: id'] in ensembl.transcript_ids():
            t = ensembl.transcript_by_id(din.loc[i, 'transcript: id'])
            # only complete protein-coding transcripts can be mapped
            if t.is_protein_coding and t.contains_start_codon and t.contains_stop_codon:
                coding_sequence_positions = tboundaries2positions(t)
                if len(coding_sequence_positions) == len(t.coding_sequence):
                    #TODO need to check if the seq made from coding_sequence_positions is same as t.coding_seqeunce
                    dcoding = t2pmapper(t, coding_sequence_positions)
                    # rows of the coding map at the mutated protein position
                    dcodingmutpos = dcoding.loc[(
                        dcoding['protein index'] == din.loc[
                            i, 'aminoacid: position']), :]
                    codon_positions = dcodingmutpos[
                        'coding sequence positions'].tolist()
                    if len(codon_positions) != 0:
                        dbed.loc[bedrowi, 'chromosome'] = t.contig
                        if cfg['test']:
                            print(din.loc[i, 'transcript: id'],
                                  codon_positions)
                        # codon boundaries depend on the strand orientation
                        if t.strand == '+':
                            dbed.loc[bedrowi,
                                     'codon start'] = codon_positions[0]
                            dbed.loc[bedrowi, 'codon end'] = codon_positions[2]
                        elif t.strand == '-':
                            dbed.loc[bedrowi,
                                     'codon start'] = codon_positions[2]
                            dbed.loc[bedrowi, 'codon end'] = codon_positions[0]
                        # 22 nt upstream / 21 nt downstream flanks (45-nt window)
                        dbed.loc[bedrowi, 'start'] = dbed.loc[
                            bedrowi, 'codon start'] - 22  #FIXME put flank in the yml
                        dbed.loc[bedrowi, 'end'] = dbed.loc[
                            bedrowi, 'codon end'] + 21  #FIXME put flank in the yml
                        dbed.loc[bedrowi, 'reference residue'] = dcodingmutpos[
                            'protein sequence'].tolist()[0]
                        dbed.loc[bedrowi, 'reference codon'] = ''.join(
                            dcodingmutpos['coding sequence'].tolist())
                        dbed.loc[bedrowi, 'strand'] = t.strand
                        # composite id used later to re-join fetched sequences
                        dbed.loc[bedrowi, 'id'] = '{}|{}|{}|{}|{}'.format(
                            din.loc[i, 'transcript: id'],
                            dbed.loc[bedrowi, 'chromosome'],
                            dbed.loc[bedrowi, 'strand'],
                            int(dbed.loc[bedrowi, 'start']),
                            int(dbed.loc[bedrowi, 'end']))
                        dbed.loc[bedrowi, 'gene: id'] = t.gene_id
                        dbed.loc[bedrowi, 'gene: name'] = t.gene.name
                        dbed.loc[bedrowi, 'protein: id'] = t.protein_id
                        dbed.loc[bedrowi, 'aminoacid: position'] = din.loc[
                            i, 'aminoacid: position']
                        # break
                        bedrowi += 1
                    else:
                        terrpositions.append(t.id)
                else:
                    terrpositions.append(t.id)
            else:
                terrnoncoding.append(t.id)
        else:
            terrnotfound.append(din.loc[i, 'transcript: id'])
            if cfg['test']:
                logging.error('not found: {}'.format(
                    din.loc[i, 'transcript: id']))
    # bail out early (with an empty output table) if nothing mapped
    if len(dbed) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no valid seqeunces found; saving an empty table.')
        saveemptytable(cfg, f"{cfg['dsequencesp']}")
        return None
    # keep only full-width windows (45 nt; see flank sizes above)
    dbed = dbed.loc[(dbed.apply(lambda x: x['end'] - x['start'] == 45,
                                axis=1)), :]  #FIXME put flank in the yml
    dbed.loc[:, 'start'] = dbed.loc[:, 'start'].astype(int)
    dbed.loc[:, 'end'] = dbed.loc[:, 'end'].astype(int)
    dbed = dbed.drop_duplicates(subset=bed_colns)
    dbed.loc[:, bed_colns].to_csv(dbedp, sep='\t', header=False, index=False)
    err2tids = {
        'terrpositions': terrpositions,
        'terrnotfound': terrnotfound,
        'terrnoncoding': terrnoncoding,
    }
    if cfg['test']:
        print(err2tids)
    with open(dbedp + '.err.json', 'w') as outfile:
        json.dump(err2tids, outfile)
    # fetch the flanked sequences with bedtools and load them back in
    bedp = f"{cfg['datad']}/dbedflank.bed"
    fastap = f"{cfg['datad']}/dbedflank.fa"
    cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {bedp} -fo {fastap}"
    runbashcmd(cmd)
    dflankfa = fa2df(fastap, ids2cols=True)
    dflankfa.loc[:, 'sequence'] = dflankfa.loc[:, 'sequence'].apply(
        lambda x: x.upper())
    dflankfa.loc[:, 'sequence: length'] = [
        len(s) for s in dflankfa['sequence']
    ]
    # bedtools -s appends "(strand)" to names; strip it to recover the id
    dflankfa.index = [idx.split('(')[0] for idx in dflankfa.index]
    dflankfa.index.name = 'id'
    dseq = set_index(dbed, 'id').join(set_index(dflankfa, 'id'), rsuffix='.1')
    # map from the output column names (keys) to source columns (values)
    dseq2compatible = {
        'aminoacid: position': 'aminoacid: position',
        'gene: id': 'gene: id',
        'gene: name': 'gene: name',
        'protein: id': 'protein: id',
        'transcript: id': 'seqid',
        'transcript: sequence': 'sequence',
        'aminoacid: wild-type': 'reference residue',
        'codon: wild-type': 'reference codon',
        'contig': 'contig',
        'strand': 'strand',
        'start': 'start',
        'end': 'end',
        'codon start': 'codon start',
        'codon end': 'codon end',
    }
    if 'amino acid mutation' in dseq:
        dseq2compatible['amino acid mutation'] = 'amino acid mutation'
    dseq.to_csv(cfg['dseqtmpp'], sep='\t')
    dseq = dseq[list(dseq2compatible.values())]
    dseq.columns = list(dseq2compatible.keys())
    # dseq.to_csv('data/dseq.csv')
    logging.info(dseq.columns.tolist())
    logging.info(din.columns.tolist())
    dseq = pd.merge(dseq.reset_index(),
                    din,
                    on=['transcript: id', 'aminoacid: position'])
    logging.info(dseq.columns.tolist())
    # NOTE(review): return value of set_index is discarded here, so this
    # call has no effect on dseq — confirm whether the index was meant to
    # be set before writing the output.
    set_index(dseq, 'id')
    if 'reverse_mutations' in cfg:
        if cfg['reverse_mutations']:
            from beditor.lib.io_dfs import dfswapcols
            dseq = dfswapcols(dseq,
                              ['aminoacid: wild-type', 'amino acid mutation'])
            dseq['codon: mutation'] = dseq['codon: wild-type'].copy()
    dseq.to_csv(f"{cfg['dsequencesp']}", sep='\t')
    del ensembl
def plot_vizbysteps(cfg):
    """
    Make step-wise visualizations of the pipeline outputs.

    Each section reads the step's output table (`cfg[stepi]` is the step's
    directory) and writes plots under `<prjd>/05_output`, skipping work when
    the plot files already exist and `cfg['force']` is not set.

    :param cfg: configuration dict
    """
    prjd = cfg['prjd']
    #make one output table and stepwise plots
    datad = f"{prjd}/05_output"
    # step2 # make submap
    stepi = 2
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_substitution_map"
    plotps = glob(plotp + '*')
    if len(plotps) == 0 or cfg['force']:
        plotpf = plotp + "_{mutation_type}.png"
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(pd.read_table(
                dstepp, keep_default_na=False)).drop_duplicates()
            # plotting very large tables is too slow; skip them
            if len(dstep) < 1000:
                logging.info('plot_submap_possibilities')
                plot_submap_possibilities(dmutagenesis=dstep,
                                          plotpf=plotpf,
                                          test=False)
            else:
                logging.warning(f'skipped: plot_submap_possibilities')
        else:
            logging.warning(f'not found: {dstepp}')
    # step3 # stats by strategies
    stepi = 3
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_stats_by_strategies.png"
    if not exists(plotp) or cfg['force']:
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(pd.read_table(
                dstepp, keep_default_na=False)).drop_duplicates()
            logging.info('plot_bar_dguides')
            plot_bar_dguides(dstep, plotp)
        else:
            logging.warning(f'not found: {dstepp}')
    # make nt_composition plot
    stepi = 3
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_nt_compositions"
    plotps = glob(plotp + '*')
    if len(plotps) == 0 or cfg['force']:
        plotpf = plotp + "_{method}.png"
        makedirs(dirname(plotp), exist_ok=True)
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(pd.read_table(
                dstepp, keep_default_na=False)).drop_duplicates()
            # dbepams=pd.read_table(f'{dirname(realpath(__file__))}/../data/dbepams.tsv')
            dbepams = pd.read_table(cfg['dbepamsp'], keep_default_na=False)
            dpam = dbepams.loc[:, cols_dpam].drop_duplicates()
            dpam = set_index(dpam, 'PAM')
            logging.info('plot_dist_dguides')
            plot_dist_dguides(dstep, dpam, plotpf)
        else:
            logging.warning(f'not found: {dstepp}')
    # make plot_dna_features_view
    stepi = 3
    plotd = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_dna_features_view"
    plotps = glob(plotd + '/*')
    if len(plotps) == 0 or cfg['force']:
        dguidesp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        # sequences come from step 1 (stepi-2)
        dsequencesp = f"{cfg[stepi-2]}/d{cfg[stepi-2].replace('/','').split('_')[-1]}.tsv"
        if exists(dguidesp):
            logging.info('plot_dna_features_view')
            plot_dna_features_view(
                cfg,
                dsequences=del_Unnamed(
                    pd.read_table(dsequencesp,
                                  keep_default_na=False)).drop_duplicates(),
                dguides=del_Unnamed(
                    pd.read_table(dguidesp,
                                  keep_default_na=False)).drop_duplicates(),
                plotd=plotd,
                more=False)
        else:
            # BUGFIX: the original logged `dstepp` here — a stale variable
            # left over from the previous section — instead of the path
            # actually checked in this branch.
            logging.warning(f'not found: {dguidesp}')
    # # step2 # make submap #FIXME get all the columns used for plotting in the dguides.
    # stepi=3
    # plotp=f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_submap_used_for_mutagenesis"
    # plotps=glob(plotp+'*')
    # if len(plotps)==0 or cfg['force']:
    #     plotpf=plotp+"_{mutation_type}.png"
    #     dstepp=f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
    #     dstep=del_Unnamed(pd.read_table(dstepp)).drop_duplicates()
    #     logging.info('plot_submap_possibilities')
    #     plot_submap_possibilities(dmutagenesis=dstep,
    #                               plotpf=plotpf,test=False)
    # step4 offtargets correlations
    stepi = 4
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_dist_beditor_score.png"
    if not exists(plotp) or cfg['force']:
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(pd.read_table(
                dstepp, keep_default_na=False)).drop_duplicates()
            logging.info('plot_dist_dofftargets')
            plot_dist_dofftargets(dstep, plotp)
        else:
            logging.warning(f'not found: {dstepp}')