def dalignbed2dalignbedguidesseq(cfg):
    """
    Get sequences from BED file
    step#6

    Merges the aligned sequences (extracted from FASTA) onto the
    guide-annotated alignment table and writes the result as TSV.

    :param cfg: configuration dict; reads 'dalignbedguidesp',
        'dalignedfastap', 'dalignbedguidesseqp' and 'force'
    :returns: cfg, unchanged (output is written to disk)
    """
    dalignbedguidesseqp = cfg['dalignbedguidesseqp']
    logging.info(basename(dalignbedguidesseqp))
    if not exists(dalignbedguidesseqp) or cfg['force']:
        # read inputs only when the output actually has to be (re)built
        dalignbedguides = del_Unnamed(
            pd.read_csv(cfg['dalignbedguidesp'], sep='\t'))
        dalignedfasta = del_Unnamed(
            pd.read_csv(cfg['dalignedfastap'], sep='\t'))
        dalignbedguidesseq = pd.merge(dalignbedguides,
                                      dalignedfasta,
                                      on='id',
                                      suffixes=('', '.2'))
        # drop alignments for which no genomic sequence could be retrieved
        dalignbedguidesseq = dalignbedguidesseq.dropna(
            subset=['aligned sequence'], axis=0)
        dalignbedguidesseq = dalignbedguidesseq.drop_duplicates()
        dalignbedguidesseq.to_csv(dalignbedguidesseqp, sep='\t')
    return cfg
def dalignbed2dalignbedguides(cfg):
    """
    Get guide seqeunces from the BED file
    step#4

    Joins the per-guide table onto the alignment BED table and writes
    the result as TSV.

    :param cfg: configuration dict; reads 'dalignbedp', 'dguidesp',
        'dalignbedguidesp' and 'force'
    :returns: cfg, unchanged (output is written to disk)
    """
    dalignbedguidesp = cfg['dalignbedguidesp']
    logging.info(basename(dalignbedguidesp))
    if not exists(dalignbedguidesp) or cfg['force']:
        # read inputs only when the output actually has to be (re)built
        dalignbed = del_Unnamed(
            pd.read_csv(cfg['dalignbedp'], sep='\t', keep_default_na=False))
        dguides = set_index(
            del_Unnamed(
                pd.read_csv(cfg['dguidesp'], sep='\t',
                            keep_default_na=False)), 'guide: id')
        # if the error in human, use: `cut -f 1 data/alignment.bed.sorted.bed | sort| uniq -c | grep -v CHR | grep -v GL | grep -v KI`
        dalignbed = pd.merge(dalignbed,
                             dguides,
                             on='guide: id',
                             suffixes=('', '.1'))
        # BUGFIX: sep was previously passed positionally
        # (`to_csv(path, '\t')`); positional args to to_csv are deprecated
        # in pandas 1.x and a TypeError in pandas >= 2.0. Pass by keyword,
        # consistent with every other step.
        dalignbed.to_csv(dalignbedguidesp, sep='\t')
    return cfg
def dalignbedannot2daggbyguide(cfg):
    """ Aggregate annotations per alignment to annotations per guide.
    step#10

    Builds one row per guide (from perfect-match alignments, NM == 0),
    collects the union of annotation ids across all of that guide's
    alignments, scores each guide, and writes the off-target table.

    :param cfg: configuration dict; reads 'datatmpd', 'dalignbedannotp',
        'BEs', 'dofftargetsp' and 'force'
    :returns: cfg, unchanged (outputs are written to disk)
    """
    datatmpd = cfg['datatmpd']
    dalignbedannot = del_Unnamed(
        pd.read_csv(cfg['dalignbedannotp'], sep='\t', low_memory=False))
    daggbyguidep = '{}/10_daggbyguide.tsv'.format(datatmpd)
    logging.info(basename(daggbyguidep))
    if not exists(daggbyguidep) or cfg['force']:
        # seed the per-guide table from perfect-match (NM == 0) alignments;
        # one row per guide id
        daggbyguide = dalignbedannot.loc[(dalignbedannot['NM'] == 0), [
            'guide: id', 'guide+PAM sequence', 'gene names', 'gene ids',
            'transcript ids'
        ]].drop_duplicates(subset=['guide: id'])
        if len(daggbyguide) != 0:
            daggbyguide = set_index(daggbyguide, 'guide: id')
            guideids = daggbyguide.index.tolist()
            # collapse annotation columns across all alignments of each guide
            # into a ';'-joined set of unique values
            for gi in range(len(guideids)):
                gid = guideids[gi]
                dalignbedannoti = dalignbedannot.loc[
                    dalignbedannot['guide: id'] == gid, :]
                # a single-row selection can degrade to a Series; coerce back
                # to a one-row DataFrame
                if len(dalignbedannoti.shape) == 1:
                    dalignbedannoti = pd.DataFrame(dalignbedannoti).T
                for col in [
                        'types', 'gene names', 'gene ids', 'transcript ids',
                        'protein ids', 'exon ids'
                ]:
                    daggbyguide.loc[gid, col] = ";".join(
                        np.unique(dalignbedannoti[col].fillna('nan').tolist()))
            from beditor.lib.get_scores import get_beditorscore_per_guide
            # per-guide beditor score aggregates the per-alignment scores
            for guideid in daggbyguide.index:
                dalignbedannotguide = dalignbedannot.loc[(
                    dalignbedannot['guide: id'] == guideid), :]
                daggbyguide.loc[
                    guideid, 'beditor score'] = get_beditorscore_per_guide(
                        guide_seq=dalignbedannotguide['guide+PAM sequence'].
                        unique()[0],
                        strategy=dalignbedannotguide['strategy'].unique()[0],
                        align_seqs_scores=dalignbedannotguide['beditor score'],
                        BEs=cfg['BEs']
                        # test=cfg['test']
                    )
                daggbyguide.loc[guideid, 'CFD score'] = dalignbedannotguide[
                    'CFD score'].mean()  #FIXME if mean is not appropriate
            daggbyguide['beditor score (log10)'] = daggbyguide[
                'beditor score'].apply(np.log10)
            # count how many alignments each guide has in total
            dalignbedannot['alternate alignments count'] = 1
            daggbyguide = daggbyguide.join(
                pd.DataFrame(
                    dalignbedannot.groupby('guide: id')
                    ['alternate alignments count'].agg('sum')))
            daggbyguide.to_csv(daggbyguidep, sep='\t')
            daggbyguide.to_csv(cfg['dofftargetsp'], sep='\t')
    return cfg
def dannotsagg2dannots2dalignbedannot(cfg):
    """ Map aggregated annotations to guides
    step#9

    Joins per-alignment stats with aggregated annotations, then computes
    the per-alignment beditor and CFD scores.

    :param cfg: configuration dict; reads 'datatmpd', 'dannotsaggp',
        'dalignbedstatsp', 'dalignbedannotp' and 'force'
    :returns: cfg, unchanged (output is written to disk)
    """
    datatmpd = cfg['datatmpd']
    dannotsagg = del_Unnamed(
        pd.read_csv(cfg['dannotsaggp'], sep='\t', keep_default_na=False))
    dalignbedstats = del_Unnamed(
        pd.read_csv(cfg['dalignbedstatsp'], sep='\t', keep_default_na=False))
    dalignbedannotp = cfg['dalignbedannotp']
    logging.info(basename(dalignbedannotp))
    if exists(dalignbedannotp) and not cfg['force']:
        # output already present and no rebuild requested
        return cfg
    # join stats and aggregated annotations on the alignment id
    dalignbedannot = dalignbedstats.set_index('id').join(
        set_index(dannotsagg, 'id'), rsuffix=' annotation')
    dalignbedannot['NM'] = dalignbedannot['NM'].apply(int)
    from beditor.lib.get_scores import get_beditorscore_per_alignment, get_cfdscore

    def _score_row(row):
        # per-alignment beditor score from mismatch count, region and PAM
        return get_beditorscore_per_alignment(
            NM=row['NM'],
            genic=row['region'] == 'genic',
            alignment=row['alignment'],
            pam_length=len(row['PAM']),
            pam_position=row['original position'],
            # test=cfg['test'],
        )

    dalignbedannot['beditor score'] = dalignbedannot.apply(_score_row, axis=1)
    # CFD score of the guide against its aligned genomic sequence;
    # missing scores default to 0
    dalignbedannot['CFD score'] = dalignbedannot.apply(
        lambda row: get_cfdscore(row['guide+PAM sequence'].upper(),
                                 row['aligned sequence'].upper()),
        axis=1).fillna(0)
    dalignbedannot.to_csv(dalignbedannotp, sep='\t')
    return cfg
def dalignbedguidesseq2dalignbedstats(cfg):
    """
    Gets scores for guides
    step#7

    Aligns each guide+PAM sequence against its genomic hit and appends the
    alignment string and score as new columns.

    :param cfg: configuration dict; reads 'dalignbedguidesseqp',
        'dalignbedstatsp' and 'force'
    :returns: cfg, unchanged (output is written to disk)
    """
    dalignbedstatsp = cfg['dalignbedstatsp']
    logging.info(basename(dalignbedstatsp))
    if not exists(dalignbedstatsp) or cfg['force']:
        # read input only when the output actually has to be (re)built
        dalignbedguidesseq = del_Unnamed(
            pd.read_csv(cfg['dalignbedguidesseqp'], sep='\t'))
        # align() returns a pair per row; expand it into two columns
        df = dalignbedguidesseq.apply(
            lambda x: align(x['guide+PAM sequence'], x['aligned sequence']),
            axis=1).apply(pd.Series)
        df.columns = ['alignment', 'alignment: score']
        dalignbedstats = dalignbedguidesseq.join(df)
        del df
        dalignbedstats.to_csv(dalignbedstatsp, sep='\t')
    return cfg
def dannots2dalignbed2dannotsagg(cfg):
    """ Aggregate annotations per guide
    step#8

    Parses the bedtools-intersect annotations, expands the GFF attribute
    column into id columns, then aggregates annotations per alignment id.

    :param cfg: configuration dict; reads 'datatmpd', 'annotationsbedp',
        'dannotsaggp' and 'force'; sets cfg['daannotp']
    :returns: cfg, with 'daannotp' added (outputs are written to disk)
    """
    datatmpd = cfg['datatmpd']
    daannotp = f'{datatmpd}/08_dannot.tsv'
    cfg['daannotp'] = daannotp
    dannotsaggp = cfg['dannotsaggp']
    logging.info(basename(daannotp))
    if ((not exists(daannotp)) and (not exists(dannotsaggp))) or cfg['force']:
        # the intersect output has BED columns followed by GFF columns;
        # GFF column names that clash with BED ones get an ' annotation' suffix
        dannots = pd.read_csv(
            cfg['annotationsbedp'],
            sep='\t',
            names=bed_colns + [
                c + ' annotation'
                if c in set(bed_colns).intersection(gff_colns) else c
                for c in gff_colns
            ],
            low_memory=False)
        dannots = del_Unnamed(dannots)
        dannots = dannots.set_index('id')
        # marker column: summed later to count annotations per alignment
        dannots['annotations count'] = 1
        # separate ids from attribute columns
        dannots = lambda2cols(dannots,
                              lambdaf=gffatributes2ids,
                              in_coln='attributes',
                              to_colns=[
                                  'gene name', 'gene id', 'transcript id',
                                  'protein id', 'exon id'
                              ])
        # human-readable 'chr:start-end(strand)' coordinate string
        dannots['annotation coordinate'] = dannots.apply(
            lambda x: '{}:{}-{}({})'.format(x['chromosome annotation'], x[
                'start annotation'], x['end annotation'], x['strand annotation'
                                                            ]),
            axis=1)
        logging.debug('or this step takes more time?')
        dannots.to_csv(daannotp, sep='\t')
    else:
        dannots = pd.read_csv(daannotp, sep='\t', low_memory=False)
        dannots = del_Unnamed(dannots)
    logging.info(basename(dannotsaggp))
    if not exists(dannotsaggp) or cfg['force']:
        # defensive reload in case the branch above did not bind dannots
        if not 'dannots' in locals():
            dannots = pd.read_table(daannotp, low_memory=False)
            dannots = del_Unnamed(dannots)
        dannots = dannots.reset_index()
        # the '- 1' removes the alignment's self-hit from the count
        dannotsagg = pd.DataFrame(
            dannots.groupby('id')['annotations count'].agg('sum')) - 1
        dannotsagg.loc[dannotsagg['annotations count'] == 0,
                       'region'] = 'intergenic'
        dannotsagg.loc[dannotsagg['annotations count'] != 0,
                       'region'] = 'genic'
        alignids = dannots['id'].unique()  #[:15]
        logging.debug('start of the slowest step')
        # per alignment id: collect the unique annotation values into
        # ';'-joined strings (pluralized column names)
        for alignidi in range(len(alignids)):
            alignid = alignids[alignidi]
            dannoti = dannots.loc[dannots['id'] == alignid, :]
            # a single-row selection can degrade to a Series; coerce back
            if len(dannoti.shape) == 1:
                dannoti = pd.DataFrame(dannoti).T
            # ignore whole-chromosome features; dedupe identical spans
            dannoti = dannoti.loc[
                dannoti['type'] != 'chromosome', :].drop_duplicates(
                    subset=['start annotation', 'end annotation'])
            for col in [
                    'type', 'gene name', 'gene id', 'transcript id',
                    'protein id', 'exon id'
            ]:
                dannotsagg.loc[alignid, col + 's'] = ";".join(
                    np.unique(dannoti[col].fillna('nan').tolist()))
        logging.debug('end of the slowest step')
        del dannots
        dannotsagg = dannotsagg.reset_index()
        dannotsagg.to_csv(dannotsaggp, sep='\t')
    return cfg
def plot_vizbysteps(cfg):
    """
    Create the stepwise visualizations for a project.

    For each pipeline step, (re)builds its plot(s) only if the plot files
    are missing or cfg['force'] is set; each section warns and skips when
    its input TSV is absent.

    :param cfg: configuration dict; reads 'prjd', 'force', 'dbepamsp' and
        the per-step directory paths stored under integer keys
    :returns: None (plots are written under <prjd>/05_output)
    """
    prjd = cfg['prjd']
    #make one output table and stepwise plots
    datad = f"{prjd}/05_output"
    # step2 # make submap
    stepi = 2
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_substitution_map"
    plotps = glob(plotp + '*')
    if len(plotps) == 0 or cfg['force']:
        plotpf = plotp + "_{mutation_type}.png"
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(
                pd.read_table(dstepp, keep_default_na=False)).drop_duplicates()
            # the submap plot does not scale to large tables
            if len(dstep) < 1000:
                logging.info('plot_submap_possibilities')
                plot_submap_possibilities(dmutagenesis=dstep,
                                          plotpf=plotpf,
                                          test=False)
            else:
                logging.warning(f'skipped: plot_submap_possibilities')
        else:
            logging.warning(f'not found: {dstepp}')
    # step3 # stats by strategies
    stepi = 3
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_stats_by_strategies.png"
    if not exists(plotp) or cfg['force']:
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(
                pd.read_table(dstepp, keep_default_na=False)).drop_duplicates()
            logging.info('plot_bar_dguides')
            plot_bar_dguides(dstep, plotp)
        else:
            logging.warning(f'not found: {dstepp}')
    # make nt_composition plot
    stepi = 3
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_nt_compositions"
    plotps = glob(plotp + '*')
    if len(plotps) == 0 or cfg['force']:
        plotpf = plotp + "_{method}.png"
        makedirs(dirname(plotp), exist_ok=True)
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(
                pd.read_table(dstepp, keep_default_na=False)).drop_duplicates()
            # dbepams=pd.read_table(f'{dirname(realpath(__file__))}/../data/dbepams.tsv')
            dbepams = pd.read_table(cfg['dbepamsp'], keep_default_na=False)
            dpam = dbepams.loc[:, cols_dpam].drop_duplicates()
            dpam = set_index(dpam, 'PAM')
            logging.info('plot_dist_dguides')
            plot_dist_dguides(dstep, dpam, plotpf)
        else:
            logging.warning(f'not found: {dstepp}')
    # make plot_dna_features_view
    stepi = 3
    plotd = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_dna_features_view"
    plotps = glob(plotd + '/*')
    if len(plotps) == 0 or cfg['force']:
        dguidesp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        dsequencesp = f"{cfg[stepi-2]}/d{cfg[stepi-2].replace('/','').split('_')[-1]}.tsv"
        if exists(dguidesp):
            logging.info('plot_dna_features_view')
            plot_dna_features_view(
                cfg,
                dsequences=del_Unnamed(
                    pd.read_table(dsequencesp,
                                  keep_default_na=False)).drop_duplicates(),
                dguides=del_Unnamed(
                    pd.read_table(dguidesp,
                                  keep_default_na=False)).drop_duplicates(),
                plotd=plotd,
                more=False)
        else:
            # BUGFIX: this branch previously logged {dstepp} — a stale path
            # from an earlier section — instead of the path it checked.
            logging.warning(f'not found: {dguidesp}')
    # TODO: re-enable the step3 submap-used-for-mutagenesis plot once all
    # the columns used for plotting are available in the dguides table.
    # step4 offtargets correlations
    stepi = 4
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_dist_beditor_score.png"
    if not exists(plotp) or cfg['force']:
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(
                pd.read_table(dstepp, keep_default_na=False)).drop_duplicates()
            logging.info('plot_dist_dofftargets')
            plot_dist_dofftargets(dstep, plotp)
        else:
            logging.warning(f'not found: {dstepp}')
def make_outputs(cfg, plot=True):
    """
    Cobines stepwise analysis files into a pretty table.

    :param cfg: main configuration dict
    :param plot: if True creates visualizations
    """
    print(f"{get_datetime()}: generating outputs")
    from beditor.lib.global_vars import stepi2colsoutput
    prjd = cfg['prjd']
    #make one output table and stepwise plots
    datad = f"{prjd}/05_output"
    makedirs(datad, exist_ok=True)
    #table
    doutputp = f"{datad}/doutput.tsv"
    #FIXME if steps are added
    if not exists(doutputp) or cfg['force']:
        from beditor.lib.io_dfs import del_Unnamed
        # ensure a stale binding from a previous pass does not leak in;
        # 'doutput' absence is used below to detect the first combined step
        if 'doutput' in locals():
            del doutput
        for stepi in range(5):
            # step 2 is always skipped; 'step2ignore' can disable one more
            if stepi != 2 and cfg['step2ignore'] != stepi:
                dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
                if exists(dstepp):
                    logging.info(f'combining {stepi}')
                    colsoutput = stepi2colsoutput[stepi]
                    dstep = del_Unnamed(
                        pd.read_table(dstepp, keep_default_na=False))
                    # for reverse mutations the step-0 table is not combined
                    if 'reverse_mutations' in cfg:
                        if cfg['reverse_mutations']:
                            if stepi == 0:
                                continue
                    # keep only the output columns this step actually has
                    colsoutput = [col for col in colsoutput if col in dstep]
                    dstep = dstep.loc[:, colsoutput]
                    if len(dstep) != 0:
                        dstep = dstep.drop_duplicates()
                        if not 'doutput' in locals():
                            # first contributing step seeds the output table
                            doutput = dstep.copy()
                            del dstep
                        else:
                            # merge on whatever columns the tables share
                            cols_on = list(
                                set(doutput.columns.tolist()).intersection(
                                    dstep.columns.tolist()))
                            if len(cols_on) != 0:
                                doutput = pd.merge(doutput,
                                                   dstep,
                                                   on=cols_on,
                                                   how='left')
                            else:
                                # no shared columns: the chain of stepwise
                                # outputs is broken, abort
                                logging.error(
                                    f'output of step {stepi-1} or {stepi} are missing.'
                                )
                                return None
                            del dstep
        if cfg['mutation_format'] == 'nucleotide':
            # codon/amino-acid/transcript columns are meaningless in
            # nucleotide mode
            doutput = doutput.drop([
                c for c in doutput
                if (('codon' in c) or ('amino' in c) or ('transcript' in c))
            ],
                                   axis=1)
        if len(doutput) != 0 and 'guide+PAM sequence' in doutput:
            from beditor.lib.io_seqs import get_polyt_length
            # polyT stretches terminate pol III transcription: report them
            doutput['length of polyT stretch'] = doutput[
                'guide+PAM sequence'].apply(lambda x: get_polyt_length(x))
        makedirs(dirname(doutputp), exist_ok=True)
        doutput.to_csv(doutputp, sep='\t')
    else:
        doutput = pd.read_table(doutputp, keep_default_na=False)
    # plot
    if plot:
        plot_vizbysteps(cfg)
    logging.info(f"Outputs are located at {datad}")
    return doutput