def enrichr(gene_list, description, out_dir, scan=None, max_terms=10, figsize=(12, 6), run_main=False):
    '''
    Performs KEGG, GO Biological Process, ENCODE/ChEA consensus TF, ChEA and OMIM Disease
    enrichment on a gene list. Uses Enrichr (via gseapy).

    Inputs
    ------
    gene_list: list of genes to perform enrichment on
    description: string description for title
    out_dir: output directory
    scan: dictionary of additional Enrichr dbs to scan (http://amp.pharm.mssm.edu/Enrichr/#stats)
    max_terms: limit the plot to this many top terms
    figsize: figure size

    Returns
    -------
    None

    '''
    out_dir = make_folder(out_dir)

    testscan = {'KEGG': 'KEGG_2016',
                'GO_biological_process': 'GO_Biological_Process_2017b',
                'ChIP-X_Consensus_TFs': 'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
                'ChEA': 'ChEA_2016',
                'OMIM_Disease': 'OMIM_Disease'
                }

    if isinstance(scan, dict):
        testscan = {**testscan, **scan}

    for nick, name in testscan.items():
        gseapy.enrichr(gene_list=gene_list,
                       figsize=figsize,
                       top_term=max_terms,
                       description=f'{description}_{nick}',
                       gene_sets=name,
                       outdir=out_dir,
                       format='png'
                       )

        out_result(f'{out_dir}{nick}.{name}.enrichr.reports.png', f'Enrichr: {nick} for {description}', run_main=run_main)

    out_list = pd.DataFrame({'Gene Name': gene_list}, index=range(len(gene_list)))
    out_list.to_excel(f'{out_dir}{description}_genes.xlsx', index=None)
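# Hypothetical usage sketch for the gseapy-based version above. The gene symbols,
# output path and the 'Reactome' entry are illustrative only; `scan` is a dict of
# nickname -> Enrichr library name pairs that extends the default panel:
#
#   enrichr(['TP53', 'MYC', 'CDKN1A'],
#           description='upregulated_genes',
#           out_dir='enrichment/',
#           scan={'Reactome': 'Reactome_2016'})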
def enrichr(gene_list, description, out_dir, log_file, scan=None, max_terms=20, figsize=(12, 6), run_main=False):
    '''
    Performs enrichment on a gene list against a panel of Enrichr gene-set libraries
    (KEGG, GO Biological Process, ENCODE/ChEA consensus TFs, ChEA, OMIM Disease).

    Inputs
    ------
    gene_list: list of genes to perform enrichment on
    description: string description for title
    out_dir: output directory
    log_file: path to log file
    scan: list of Enrichr gene-set libraries to use instead of the defaults (http://amp.pharm.mssm.edu/Enrichr/#stats)
    max_terms: limit the barplot to this many top terms
    figsize: figure size

    Returns
    -------
    None

    '''
    gene_sets = ['KEGG_2016',
                 'GO_Biological_Process_2018',
                 'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
                 'ChEA_2016',
                 'OMIM_Disease'
                 ]

    gene_sets = scan if scan is not None else gene_sets

    for gene_set in gene_sets:
        try:
            filename = f'{out_dir}{description}_{gene_set}.enrichr.txt'
            post = post_genes(gene_list, description)
            enrich(post['userListId'], filename, gene_set)
            png = enrichr_barplot(filename=filename,
                                  gene_library=gene_set,
                                  out_dir=out_dir,
                                  description=description,
                                  figsize=figsize,
                                  max_n=max_terms)
            out_result(png, f'Enrichr: {gene_set} for {description}', run_main=run_main)
        except Exception:
            output(f'Error in enrichr for {description} with {gene_set}. Skipping... \n', log_file=log_file, run_main=run_main)
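# Hypothetical usage sketch for the Enrichr API version above; the paths and library
# override are illustrative. Unlike the gseapy version, `scan` here is a list that
# replaces the default gene-set libraries entirely:
#
#   enrichr(['TP53', 'MYC', 'CDKN1A'],
#           description='downregulated_genes',
#           out_dir='enrichment/',
#           log_file='pipeline.log',
#           scan=['GO_Molecular_Function_2018'])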
def plot_col(df, title, ylabel, out='', xy=(None, None), xticks=[''], plot_type=['violin', 'swarm'], pvalue=False, compare_tags=None, log_file=None, run_main=False):
    '''
    One or two column boxplot from dataframe. Titles x axis based on column names.

    Inputs
    ------
    df: dataframe (uses first two columns)
    title: string of title
    ylabel: string of y label
    xy: if specified, x is the label column and y is the data column (default: (None, None) - data separated into two columns)
    xticks: list of xtick names (default is [''])
    pvalue: bool to perform ttest (default is False). Only works if xy=(None, None) or there are only two labels in x.
    plot_type: list of one or more of: violin, box, swarm (default: ['violin', 'swarm'])
    compare_tags: if xy and pvalue are specified and there are more than two tags in x, specify the two tags to compare, eg. ['a', 'b']
    out: parent output directory. If none, plots are saved into plots/
    log_file: log_file

    Returns
    ------
    None

    '''
    out = make_folder(f'{val_folder(out)}plots/')
    plt.clf()
    sns.set(context='paper', font='Arial', font_scale=2, style='white',
            rc={'figure.dpi': 300, 'figure.figsize': (5, 6)})

    if type(plot_type) != list:
        plot_type = plot_type.split()
    lower_plot_type = [x.lower() for x in plot_type]

    if len(lower_plot_type) == 0:
        raise IOError('Input a plot type.')
    elif True not in {x in lower_plot_type for x in ['violin', 'box', 'swarm']}:
        raise IOError('Did not recognize plot type.')

    if 'swarm' in lower_plot_type:
        if xy == (None, None):
            fig = sns.swarmplot(data=df, color='black', s=4)
        else:
            fig = sns.swarmplot(data=df, x=xy[0], y=xy[1], color='black', s=4)
    if 'violin' in lower_plot_type:
        if xy == (None, None):
            fig = sns.violinplot(data=df)
        else:
            fig = sns.violinplot(data=df, x=xy[0], y=xy[1])
    if 'box' in lower_plot_type:
        if xy == (None, None):
            fig = sns.boxplot(data=df)
        else:
            fig = sns.boxplot(data=df, x=xy[0], y=xy[1])

    fig.yaxis.set_label_text(ylabel)
    fig.set_title(title)

    if xticks:
        fig.xaxis.set_ticklabels(xticks)
        fig.xaxis.set_label_text('')
        for tick in fig.xaxis.get_ticklabels():
            tick.set_fontsize(12)

    if pvalue:
        if xy == (None, None):
            _, pvalue = stats.ttest_ind(a=df.iloc[:, 0], b=df.iloc[:, 1])
            compare_tags = df.columns
        else:
            _, pvalue = stats.ttest_ind(a=df[df[xy[0]] == compare_tags[0]][xy[1]],
                                        b=df[df[xy[0]] == compare_tags[1]][xy[1]])
        fig.text(s=f'p-value = {pvalue:.03g}, {compare_tags[0]} v {compare_tags[1]}',
                 x=0, y=-.12, transform=fig.axes.transAxes, fontsize=12)

    sns.despine()
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.17, top=0.9)
    plt.savefig(f"{out}{title.replace(' ', '_')}.png", dpi=300)

    if run_main:
        plt.close()

    out_result(f"{out}{title.replace(' ', '_')}.png", f'{title} Plot', run_main=run_main)
    output(f"{title.replace(' ', '_')}.png found in {out}", log_file=log_file, run_main=run_main)
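# Hypothetical usage sketch; the dataframe, condition labels and values below are
# illustrative only:
#
#   df = pd.DataFrame({'condition': ['WT'] * 4 + ['KO'] * 4,
#                      'signal': [0.8, 1.1, 0.9, 1.0, 1.6, 1.8, 1.5, 1.7]})
#   plot_col(df, title='Signal by condition', ylabel='Normalized signal',
#            xy=('condition', 'signal'), plot_type=['violin', 'swarm'],
#            pvalue=True, compare_tags=['WT', 'KO'])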
def spike(exp):
    '''
    If calling from a Jupyter notebook, change the matplotlib backend as needed.

    Aligns reads that did not map to the target genome to the Drosophila genome (BDGP6)
    to quantify spike-in chromatin.
    '''
    import pandas as pd

    if len(exp.spike_samples) == 0:
        output('Not processing Spike-ins', log_file=exp.log_file, run_main=exp.run_main)
        exp.tasks_complete.append('Spike')
        return exp

    # Make spike folder
    spike_folder = make_folder(f'{exp.scratch}spike/')

    output('Processing samples with drosophila spike-in chromatin.', log_file=exp.log_file, run_main=exp.run_main)

    for sample in exp.spike_samples:
        bam = exp.sample_files[sample]['bam']

        spike_command = [submission_prepend(),
                         f'samtools view -b -f 4 {bam} | samtools sort -n - | samtools fastq - > {spike_folder}{sample}.bwa_unaligned.fastq',
                         f'bowtie2 -p 8 -x {exp.genome_indicies["spike_index"]} -U {spike_folder}{sample}.bwa_unaligned.fastq -S {spike_folder}{sample}.BDGP6.sam --very-sensitive-local -k 1 --no-unal',
                         f'samtools view -b -F 4 {spike_folder}{sample}.BDGP6.sam | samtools sort - > {spike_folder}{sample}.BDGP6.bam',
                         f'picard MarkDuplicates I={spike_folder}{sample}.BDGP6.bam O={spike_folder}{sample}.BDGP6.nodup.bam M={spike_folder}{sample}.BDGP6.nodups.markdups.qc ASSUME_SORTED=TRUE VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=true',
                         f'samtools flagstat {spike_folder}{sample}.BDGP6.nodup.bam > {spike_folder}{sample}.unique_drosophila.flagstat.qc',
                         f'rm {spike_folder}{sample}.BDGP6.sam {spike_folder}{sample}.BDGP6.nodup.bam {spike_folder}{sample}*.fastq'
                         ]

        exp.job_id.append(send_job(command_list=spike_command,
                                   job_name=f"{sample}_spike",
                                   job_log_folder=exp.job_folder,
                                   q='general',
                                   mem=10000,
                                   log_file=exp.log_file,
                                   project=exp.project,
                                   cores=2,
                                   run_main=exp.run_main
                                   ))

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file, exp.run_main)

    spike_reads = pd.DataFrame(index=['spike_reads', 'genome_reads'])

    for sample in exp.spike_samples:
        qc_file = f'{spike_folder}{sample}.unique_drosophila.flagstat.qc'
        exp.sample_files[sample]['drosophila'] = qc_file

        with open(qc_file, 'r') as fp:
            spike_number = fp.read().split(' ')[0]

        with open(exp.sample_files[sample]['nodup_flagstat']) as fp:
            target_number = fp.read().split(' ')[0]

        spike_reads[sample] = [spike_number, target_number]

    exp.spike_reads = spike_reads.T
    condition_dict = pd.Series(exp.sample_df.Condition.values, index=exp.sample_df.Sample_Name).to_dict()

    exp.spike_reads['Replicate'] = [x.split('_')[-1] for x in exp.spike_reads.index.tolist()]
    exp.spike_reads['Condition'] = [condition_dict[x] for x in exp.spike_reads.index.tolist()]

    for name, spike_conditions in exp.spike_comparisons.items():
        out_dir = make_folder(f'{exp.scratch}spike/{name}')
        plot = spike_in_plot(exp.spike_reads, spike_conditions, name, out_dir)
        out_result(plot, f'{name.replace("_", " ")} Spike-In Comparison', run_main=exp.run_main)
        output(f'Spike-in comparison {name.replace("_", " ")} can be found here: {plot.replace(exp.scratch, "")}',
               log_file=exp.log_file, run_main=exp.run_main)

    output(f'Spike-in counts:\n {spike_reads.T}', log_file=exp.log_file, run_main=exp.run_main)
    output('Spike-in alignment jobs finished.', log_file=exp.log_file, run_main=exp.run_main)

    # Generate one dataframe for all spike_counts
    output(f"Spike-in processing complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n", log_file=exp.log_file, run_main=exp.run_main)

    exp.tasks_complete.append('Spike')

    return exp
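# Pipeline usage sketch (assumption): `spike` expects an experiment object that already
# carries the attributes read above (spike_samples, sample_files, genome_indicies with a
# 'spike_index', scratch, log_file, etc.) and returns the same object with spike-in read
# counts attached, so within the pipeline it is simply chained as:
#
#   exp = spike(exp)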
def overlap_three(bed_dict, overlap_name, out_folder, log_file, genome=None, run_main=False):
    '''
    Takes a dictionary of three bed-like format files.
    Merges all overlapping peaks for each bed into a master file.
    Intersects beds to merged master file.
    Performs annotations with ChIPseeker if genome is specified.
    Plots venn diagrams of peak overlaps.
    If genome is specified, also plots venn diagrams of annotated gene sets.

    Inputs
    ------
    bed_dict: dictionary of three BedTool files
    genome: 'hg38', 'hg19', 'mm10'

    Returns
    -------
    Returns a dictionary of dataframes from unique and overlap peaks.
    If genome is specified, includes a dictionary of annotated peaks.

    '''
    from collections import OrderedDict

    names = list(bed_dict.keys())
    out = make_folder(out_folder)

    output(f'Output files are found in {out}', log_file=log_file, run_main=run_main)
    output(f'A: {names[0]}, B: {names[1]}, C: {names[2]}', log_file=log_file, run_main=run_main)

    with open(f'{out}README.txt', 'w') as file:
        file.write('All peaks are unique, meaning that each peak is in only one group.\n')
        file.write('Capital letter means this sample peak is included in the overlap.\n')
        file.write('Lowercase letter means the sample is excluded in the overlap.\n\n')
        file.write(f'A: {names[0]}\nB: {names[1]}\nC: {names[2]}')

    master = bed_dict[names[0]].cat(bed_dict[names[1]]).cat(bed_dict[names[2]]).sort().merge()

    A = bed_dict[names[0]].sort().merge()
    B = bed_dict[names[1]].sort().merge()
    C = bed_dict[names[2]].sort().merge()

    sorted_dict = OrderedDict({'master': master, 'A': A, 'B': B, 'C': C})
    sorted_dict['Abc'] = master.intersect(A).intersect(B, v=True).intersect(C, v=True)
    sorted_dict['aBc'] = master.intersect(B).intersect(A, v=True).intersect(C, v=True)
    sorted_dict['ABc'] = master.intersect(A).intersect(B).intersect(C, v=True)
    sorted_dict['abC'] = master.intersect(C).intersect(A, v=True).intersect(B, v=True)
    sorted_dict['AbC'] = master.intersect(A).intersect(C).intersect(B, v=True)
    sorted_dict['aBC'] = master.intersect(B).intersect(C).intersect(A, v=True)
    sorted_dict['ABC'] = master.intersect(A).intersect(B).intersect(C)

    labTup = tuple(key for key in sorted_dict.keys())
    lenTup = tuple(len(bed) for bed in sorted_dict.values())

    output(f'{labTup}\n{lenTup}', log_file=log_file, run_main=run_main)

    plot_venn3_counts(lenTup[4:], names, f'{overlap_name} Peak', out)
    out_result(f'{out}venn_plot/{overlap_name}_Peak-overlap.png', f"{overlap_name} Peak Venn Overlap", run_main=run_main)

    for key, bed in sorted_dict.items():
        if len(bed) == 0:
            open(f'{out}{key.replace(" ", "_")}-peaks-from-mergedPeaks.bed', 'w').close()  # Can't convert an empty bed file to a dataframe
        else:
            bed2df(bed).to_csv(f"{out}{key.replace(' ', '_')}-peaks-from-mergedPeaks.bed", header=None, index=None, sep="\t")

    if bool(genome):
        output('Annotating overlapped peaks...', log_file=log_file, run_main=run_main)
        unikey = '{}_unique'
        unianno = '{}_unique_annotated'
        return_dict = annotate_peaks({unikey.format(key): bed2df(bed) for key, bed in sorted_dict.items() if len(bed) > 0},
                                     out, genome=genome, log_file=log_file, run_main=run_main)
        for key, bed in sorted_dict.items():
            if len(bed) == 0:
                return_dict[unianno.format(key)] = None

        Set1 = set() if return_dict[unianno.format('A')] is None else set(return_dict[unianno.format('A')].SYMBOL.unique().tolist())
        Set2 = set() if return_dict[unianno.format('B')] is None else set(return_dict[unianno.format('B')].SYMBOL.unique().tolist())
        Set3 = set() if return_dict[unianno.format('C')] is None else set(return_dict[unianno.format('C')].SYMBOL.unique().tolist())

        plot_venn3_set({names[0]: Set1, names[1]: Set2, names[2]: Set3}, f'{overlap_name}_annotated_genes', out)
        out_result(f'{out}venn_plot/{overlap_name}_annotated_genes-overlap.png', f"{overlap_name.replace('_', ' ')} Gene Venn Overlap", run_main=run_main)

    return_sorted_dict = {key: bed2df(bed) for key, bed in sorted_dict.items()}

    return return_sorted_dict if genome is None else {**return_sorted_dict, **return_dict}
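# Hypothetical usage sketch; the peak file names and output folder are illustrative and
# assume pybedtools' BedTool is available:
#
#   beds = {'H3K27ac': BedTool('H3K27ac_peaks.bed'),
#           'H3K4me1': BedTool('H3K4me1_peaks.bed'),
#           'ATAC': BedTool('ATAC_peaks.bed')}
#   results = overlap_three(beds, 'enhancer_marks', 'overlaps/three_way/',
#                           log_file='pipeline.log', genome='hg38')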
def overlap_two(bed_dict, overlap_name, out_folder, log_file, genome=None, run_main=False):
    '''
    Takes a dictionary of two bed-like format files.
    Merges all overlapping peaks for each bed into a master file.
    Intersects beds to merged master file.
    Performs annotations with ChIPseeker if genome is specified.
    Plots venn diagrams of peak overlaps.
    If genome is specified, also plots venn diagrams of annotated gene sets.

    Inputs
    ------
    bed_dict: dictionary of two BedTool files
    genome: 'hg38', 'hg19', 'mm10'

    Returns
    -------
    Returns a dictionary of dataframes from unique and overlap peaks.
    If genome is specified, includes a dictionary of annotated peaks.

    '''
    names = list(bed_dict.keys())

    out_folder = make_folder(out_folder)

    output(f'Output files for {overlap_name} are found in {out_folder}', log_file=log_file, run_main=run_main)

    masterfile = bed_dict[names[0]].cat(bed_dict[names[1]]).sort().merge()
    sorted_dict = {key: bed.sort().merge() for key, bed in bed_dict.items()}
    overlap_dict = {'overlap': masterfile.intersect(sorted_dict[names[0]]).intersect(sorted_dict[names[1]])}

    for key, bed in sorted_dict.items():
        other = {other_key: other_bed for other_key, other_bed in sorted_dict.items() if other_key != key}
        overlap_dict[f'{key}_unique_peak'] = masterfile.intersect(sorted_dict[key]).intersect(list(other.values())[0], v=True)

    for key, bed in overlap_dict.items():
        if len(bed) == 0:
            open(f'{out_folder}{key.replace(" ", "_")}-unique-peaks-from-mergedPeaks.bed', 'w').close()  # Can't convert an empty bed file to a dataframe
        else:
            bed2df(bed).to_csv(f'{out_folder}{key.replace(" ", "_")}-unique-peaks-from-mergedPeaks.bed', header=None, index=None, sep="\t")

    overlap_numbers = pd.Series({names[0]: len(overlap_dict[f'{names[0]}_unique_peak']),
                                 names[1]: len(overlap_dict[f'{names[1]}_unique_peak']),
                                 'overlap': len(overlap_dict['overlap'])
                                 },
                                index=[names[0], names[1], 'overlap'])

    # Venn
    plot_venn2(overlap_numbers, overlap_name.replace('_', ' '), out_folder)
    out_result(f'{out_folder}venn_plot/{overlap_name.replace(" ", "_")}-overlap.png', f"{overlap_name.replace('_', ' ')} Peak Venn Overlap", run_main=run_main)

    if bool(genome):
        # output(f'Annotating overlapping peaks for {overlap_name.replace("_", " ")}...', log_file)
        # Annotate with ChIPseeker
        unikey = '{}_unique'
        unianno = '{}_unique_annotated'
        return_dict = annotate_peaks({unikey.format(key): bed2df(bed) for key, bed in overlap_dict.items() if len(bed) > 0},
                                     out_folder, genome=genome, log_file=log_file, run_main=run_main)
        for key, bed in overlap_dict.items():
            if len(bed) == 0:
                return_dict[unianno.format(key)] = None

        Set1_unique = set() if return_dict[unianno.format(f'{names[0]}_unique_peak')] is None else set(return_dict[unianno.format(f'{names[0]}_unique_peak')].SYMBOL.unique().tolist())
        Set2_unique = set() if return_dict[unianno.format(f'{names[1]}_unique_peak')] is None else set(return_dict[unianno.format(f'{names[1]}_unique_peak')].SYMBOL.unique().tolist())
        Overlap_Set = set() if return_dict[unianno.format('overlap')] is None else set(return_dict[unianno.format('overlap')].SYMBOL.unique().tolist())

        venn2_dict = {names[0]: (Set1_unique | Overlap_Set),
                      names[1]: (Set2_unique | Overlap_Set)
                      }

        plot_name = f'{overlap_name.replace("_", " ")} Annotated Gene'
        plot_venn2_set(venn2_dict, plot_name, out_folder)
        out_result(f'{out_folder}venn_plot/{plot_name.replace(" ", "_")}-overlap.png', f"{overlap_name.replace('_', ' ')} Venn Annotated Gene Overlap", run_main=run_main)

        gene_overlaps = {}
        gene_overlaps[f'{names[0]}_unique_genes'] = Set1_unique - (Set2_unique | Overlap_Set)
        gene_overlaps[f'{names[1]}_unique_genes'] = Set2_unique - (Set1_unique | Overlap_Set)
        gene_overlaps['Overlap_Gene_Set'] = (Set1_unique & Set2_unique) | Overlap_Set

        return_dict = {key: bed2df(bed) for key, bed in overlap_dict.items()}

        for key, item in gene_overlaps.items():
            return_dict[key] = item
    else:
        return_dict = {key: bed2df(bed) for key, bed in overlap_dict.items()}

    return return_dict
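# Hypothetical usage sketch; the peak file names and output folder are illustrative and
# assume pybedtools' BedTool is available:
#
#   beds = {'treated': BedTool('treated_peaks.bed'),
#           'untreated': BedTool('untreated_peaks.bed')}
#   results = overlap_two(beds, 'treated_vs_untreated', 'overlaps/two_way/',
#                         log_file='pipeline.log', genome='hg38')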