Пример #1
0
def enrichr(gene_list,
            description,
            out_dir,
            scan=None,
            max_terms=10,
            figsize=(12, 6),
            run_main=False):
    '''
    Runs Enrichr (through gseapy) on a gene list, once per gene-set library,
    reports every resulting barplot and saves the gene list as an Excel sheet.

    Default libraries: KEGG, GO Biological Process, ENCODE/ChEA consensus TFs,
    ChEA and OMIM Disease.

    Inputs
    ------
    gene_list: list of genes to perform enrichment on
    description: string description used for titles and output file names
    out_dir: output directory (passed through make_folder)
    scan: dictionary {nickname: enrichr library name} with additional enrichr dbs to scan
          (http://amp.pharm.mssm.edu/Enrichr/#stats).  An entry whose nickname matches a
          default replaces that default.  Anything that is not a dict is ignored.
    max_terms: limit return plot to this max
    figsize: change fig size
    run_main: forwarded to out_result

    Returns
    -------

    None

    '''

    out_dir = make_folder(out_dir)

    # nickname -> Enrichr library name
    libraries = {
        'KEGG': 'KEGG_2016',
        'GO_biological_process': 'GO_Biological_Process_2017b',
        'ChIP-X_Consensus_TFs': 'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
        'ChEA': 'ChEA_2016',
        'OMIM_Disease': 'OMIM_Disease'
    }

    # User-supplied libraries extend (and may override) the defaults.
    if isinstance(scan, dict):
        libraries.update(scan)

    for nick in libraries:
        name = libraries[nick]
        report_png = f'{out_dir}{nick}.{name}.enrichr.reports.png'
        report_title = f'Enrichr: {nick} for {description}'

        gseapy.enrichr(gene_list=gene_list,
                       figsize=figsize,
                       top_term=max_terms,
                       description=f'{description}_{nick}',
                       gene_sets=name,
                       outdir=out_dir,
                       format='png')

        out_result(report_png, report_title, run_main=run_main)

    # Keep a record of the exact genes that were submitted.
    gene_table = pd.DataFrame({'Gene Name': gene_list},
                              index=range(len(gene_list)))
    excel_path = f'{out_dir}{description}_genes.xlsx'
    gene_table.to_excel(excel_path, index=None)
Пример #2
0
def enrichr(gene_list,
            description,
            out_dir,
            log_file,
            scan=None,
            max_terms=20,
            figsize=(12, 6),
            run_main=False):
    '''
    Performs Enrichr enrichment on a gene list, one gene-set library at a time,
    and reports a barplot for each library.

    Default libraries: KEGG_2016, GO_Biological_Process_2018,
    ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X, ChEA_2016 and OMIM_Disease.

    A failure for one library is logged and that library is skipped, so the
    remaining libraries are still processed.

    Inputs
    ------
    gene_list: list of genes to perform enrichment on
    description: string description for title and output file names
    out_dir: output directory (used as a string prefix for the result files)
    log_file: log file handed to output() when a library fails
    scan: optional iterable of enrichr library names (http://amp.pharm.mssm.edu/Enrichr/#stats).
          When given it is used INSTEAD of the default libraries.
          NOTE(review): a dict is iterated by its keys - confirm what callers pass.
    max_terms: limit return plot to this max
    figsize: change fig size
    run_main: forwarded to out_result() and output()

    Returns
    -------

    None

    '''

    default_gene_sets = [
        'KEGG_2016', 'GO_Biological_Process_2018',
        'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X', 'ChEA_2016',
        'OMIM_Disease'
    ]

    # The original assigned this to a misspelled name, so `scan` was ignored.
    gene_sets = scan if scan is not None else default_gene_sets

    for gene_set in gene_sets:
        try:
            filename = f'{out_dir}{description}_{gene_set}.enrichr.txt'

            # post_genes must return a mapping holding the 'userListId' that
            # the enrich() call needs.
            post = post_genes(gene_list, description)
            enrich(post['userListId'], filename, gene_set)
            png = enrichr_barplot(filename=filename,
                                  gene_library=gene_set,
                                  out_dir=out_dir,
                                  description=description,
                                  figsize=figsize,
                                  max_n=max_terms)
            out_result(png,
                       f'Enrichr: {gene_set} for {description}',
                       run_main=run_main)
        except Exception:
            # Best effort: log and move on to the next library.  `Exception`
            # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
            output(
                f'Error in enrichr for {description} with {gene_set}. Skipping... \n',
                log_file=log_file,
                run_main=run_main)
Пример #3
0
def plot_col(df,
             title,
             ylabel,
             out='',
             xy=(None, None),
             xticks=[''],
             plot_type=['violin', 'swarm'],
             pvalue=False,
             compare_tags=None,
             log_file=None,
             run_main=False):
    '''
    Violin / box / swarm plot of the columns of a dataframe, optionally
    annotated with a t-test p-value.  The figure is saved as a png named after
    the title (spaces replaced by underscores).

    Inputs
    ------
    df: dataframe.  With xy unset every column is plotted (wide format) and the
        t-test uses the first two columns.
    title: string of title (also used for the file name)
    ylabel: string of y label
    out: out parent directory.  The png is written to a 'plots/' folder beneath it.
    xy: (label column, data column) for long-format data.  (default: (None,None): Data separated into columns).
    xticks: list of xtick names.  Pass an empty list or None to keep the generated labels.
    plot_type: list (or whitespace separated string) of one or more: violin, box, swarm
    pvalue: bool to perform ttest (default is False).
    compare_tags: the two labels of the xy label column to compare, eg. ['a','b'].
        May be omitted when that column holds exactly two labels.
        Ignored (replaced by the column names) when xy is unset.
    log_file: log_file
    run_main: forwarded to out_result() and output(); the figure is closed when True

    Returns
    ------
    None

    Raises
    ------
    IOError: no or unrecognized plot type, or pvalue requested without a
        usable pair of compare_tags.
    '''

    # NOTE: the list defaults above are never mutated in this function.
    out = make_folder(f'{val_folder(out)}plots/')

    plt.clf()
    sns.set(context='paper',
            font='Arial',
            font_scale=2,
            style='white',
            rc={
                'figure.dpi': 300,
                'figure.figsize': (5, 6)
            })

    # Accept 'violin swarm' as well as any list/tuple of names.
    if isinstance(plot_type, str):
        plot_type = plot_type.split()
    lower_plot_type = [x.lower() for x in plot_type]

    if not lower_plot_type:
        raise IOError('Input a plot type.')
    if not any(kind in lower_plot_type
               for kind in ('violin', 'box', 'swarm')):
        raise IOError('Did not recognize plot type.')

    # A list [None, None] counts as "unset" just like the tuple default.
    wide_format = tuple(xy) == (None, None)

    # Resolve the comparison labels up front so a bad request fails before
    # anything is drawn, with a message that says what is missing.
    if pvalue and not wide_format:
        if compare_tags is None:
            compare_tags = list(df[xy[0]].unique())
            if len(compare_tags) != 2:
                raise IOError(
                    'Specify two compare_tags when the label column does not hold exactly two labels.'
                )
        elif len(compare_tags) < 2:
            raise IOError('compare_tags needs two labels to compare.')

    if wide_format:
        data_kwargs = {'data': df}
    else:
        data_kwargs = {'data': df, 'x': xy[0], 'y': xy[1]}

    # Draw order (swarm, violin, box) is kept from the original.
    if 'swarm' in lower_plot_type:
        fig = sns.swarmplot(color='black', s=4, **data_kwargs)
    if 'violin' in lower_plot_type:
        fig = sns.violinplot(**data_kwargs)
    if 'box' in lower_plot_type:
        fig = sns.boxplot(**data_kwargs)

    fig.yaxis.set_label_text(ylabel)
    fig.set_title(title)
    if xticks:
        fig.xaxis.set_ticklabels(xticks)
        fig.xaxis.set_label_text('')
        for tick in fig.xaxis.get_ticklabels():
            tick.set_fontsize(12)

    if pvalue:
        # Separate name: the `pvalue` flag itself is no longer overwritten.
        if wide_format:
            _, p_val = stats.ttest_ind(a=df.iloc[:, 0], b=df.iloc[:, 1])
            compare_tags = df.columns
        else:
            _, p_val = stats.ttest_ind(
                a=df[df[xy[0]] == compare_tags[0]][xy[1]],
                b=df[df[xy[0]] == compare_tags[1]][xy[1]])
        fig.text(
            s=f'p-value = {p_val:.03g}, {compare_tags[0]} v {compare_tags[1]}',
            x=0,
            y=-.12,
            transform=fig.axes.transAxes,
            fontsize=12)

    png_name = f"{title.replace(' ', '_')}.png"
    png_path = f"{out}{png_name}"

    sns.despine()
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.17, top=0.9)
    plt.savefig(png_path, dpi=300)
    if run_main:
        plt.close()

    out_result(png_path, f'{title} Plot', run_main=run_main)
    output(f"{png_name} found in {out}",
           log_file=log_file,
           run_main=run_main)
Пример #4
0
def spike(exp):
    '''
    Quantifies drosophila spike-in reads for every sample in exp.spike_samples.

    For each sample a job is submitted that takes the reads left unaligned in
    the sample bam, aligns them to the spike-in index with bowtie2, removes
    duplicates with picard and writes a samtools flagstat report.  After the
    jobs finish, the spike-in and genome read counts are collected into
    exp.spike_reads and one comparison plot is made per entry of
    exp.spike_comparisons.

    If calling from jupyter.  Change backend as needed.

    Inputs
    ------
    exp: experiment object.  Reads spike_samples, sample_files, scratch,
         genome_indicies, job_folder, project, log_file, run_main, sample_df
         and spike_comparisons; appends to job_id and tasks_complete; sets
         spike_reads and sample_files[sample]['drosophila'].

    Returns
    -------
    exp
    '''
    import pandas as pd

    if len(exp.spike_samples) == 0:
        output('Not processing Spike-ins',
               log_file=exp.log_file,
               run_main=exp.run_main)
        exp.tasks_complete.append('Spike')
        return exp

    spike_folder = make_folder(f'{exp.scratch}spike/')
    output('Processing samples with drosophila-spike in chromatin.',
           log_file=exp.log_file,
           run_main=exp.run_main)

    for sample in exp.spike_samples:
        bam = exp.sample_files[sample]['bam']

        spike_command = [
            submission_prepend(),
            f'samtools view -b -f 4 {bam} | samtools sort -n - | samtools fastq - > {spike_folder}{sample}.bwa_unaligned.fastq',
            f'bowtie2 -p 8 -x {exp.genome_indicies["spike_index"]} -U {spike_folder}{sample}.bwa_unaligned.fastq -S {spike_folder}{sample}.BDGP6.sam --very-sensitive-local -k 1 --no-unal',
            f'samtools view -b -F 4 {spike_folder}{sample}.BDGP6.sam | samtools sort - > {spike_folder}{sample}.BDGP6.bam',
            f'picard MarkDuplicates I={spike_folder}{sample}.BDGP6.bam O={spike_folder}{sample}.BDGP6.nodup.bam M={spike_folder}{sample}.BDGP6.nodups.markdups.qc ASSUME_SORTED=TRUE VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=true',
            f'samtools flagstat {spike_folder}{sample}.BDGP6.nodup.bam > {spike_folder}{sample}.unique_drosophila.flagstat.qc',
            f'rm {spike_folder}{sample}.BDGP6.sam {spike_folder}{sample}.BDGP6.nodup.bam {spike_folder}{sample}*.fastq'
        ]

        exp.job_id.append(
            send_job(command_list=spike_command,
                     job_name=f"{sample}_spike",
                     job_log_folder=exp.job_folder,
                     q='general',
                     mem=10000,
                     log_file=exp.log_file,
                     project=exp.project,
                     cores=2,
                     run_main=exp.run_main))

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file, exp.run_main)

    # One column per sample; transposed below so samples become rows.
    spike_reads = pd.DataFrame(index=['spike_reads', 'genome_reads'])

    for sample in exp.spike_samples:
        qc_file = f'{spike_folder}{sample}.unique_drosophila.flagstat.qc'
        exp.sample_files[sample]['drosophila'] = qc_file

        # The read count is the first space-separated token of each report.
        # NOTE(review): counts are stored as strings, as before - confirm
        # whether spike_in_plot expects numbers.
        with open(qc_file, 'r') as fp:
            spike_number = fp.read().split(' ')[0]

        with open(exp.sample_files[sample]['nodup_flagstat']) as fp:
            target_number = fp.read().split(' ')[0]

        spike_reads[sample] = [spike_number, target_number]

    exp.spike_reads = spike_reads.T
    condition_dict = pd.Series(exp.sample_df.Condition.values,
                               index=exp.sample_df.Sample_Name).to_dict()

    sample_names = exp.spike_reads.index.tolist()
    # Replicate is whatever follows the last underscore of the sample name.
    exp.spike_reads['Replicate'] = [x.split('_')[-1] for x in sample_names]
    # Fixed: the original read the misspelled attribute `exp.splike_reads`.
    exp.spike_reads['Condition'] = [condition_dict[x] for x in sample_names]

    for name, spike_conditions in exp.spike_comparisons.items():
        out_dir = make_folder(f'{exp.scratch}spike/{name}')
        plot = spike_in_plot(exp.spike_reads, spike_conditions, name, out_dir)
        out_result(plot,
                   f'{name.replace("_", " ")} Spike-In Comparison',
                   run_main=exp.run_main)
        # Fixed: `os.scratch` does not exist; the scratch prefix lives on exp.
        # The message is now also routed to the experiment log.
        output(
            f'Spike-in comparison {name.replace("_", " ")} can be found here: {plot.replace(exp.scratch, "")}',
            log_file=exp.log_file,
            run_main=exp.run_main)

    output(f'Spike-in counts:\n {spike_reads.T}',
           log_file=exp.log_file,
           run_main=exp.run_main)

    output('Spike-in alignment jobs finished.',
           log_file=exp.log_file,
           run_main=exp.run_main)

    output(
        f"Spike-in processing complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n",
        log_file=exp.log_file,
        run_main=exp.run_main)

    exp.tasks_complete.append('Spike')
    return exp
Пример #5
0
def overlap_three(bed_dict,
                  overlap_name,
                  out_folder,
                  log_file,
                  genome=None,
                  run_main=False):
    '''
    Takes a dictionary of three bed-like format files.
    Merges all overlapping peaks for each bed into a master file.
    Intersects beds to merged master file.
    Performs annotations with ChIPseeker if genome is specified.
    Plots venn diagrams of peak overlaps
    If genome is specified, also plots venn diagrams of annotated gene sets.

    Peak groups are named with A/B/C for the first/second/third bed: a capital
    letter means the bed is included in the group, lowercase means excluded
    (eg. 'ABc' = in A and B but not in C).  A README.txt with this key is
    written to the output folder along with one bed file per group.

    Inputs
    ------
    bed_dict:  dictionary of BedTool files (the first three keys are used, in order)
    overlap_name: name used in plot titles and file names
    out_folder: output folder (passed through make_folder)
    log_file: log file for output()
    genome: 'hg38','hg19','mm10'.  Any falsy value (None, '') skips annotation.
    run_main: forwarded to output(), out_result() and annotate_peaks()

    Returns
    -------
    Returns a dictionary of dataframes keyed 'master', 'A', 'B', 'C' and the
    seven overlap groups.
    If genome is specified, also includes the entries returned by
    annotate_peaks ('<group>_unique_annotated' is None for empty groups).
    '''
    from collections import OrderedDict

    names = list(bed_dict.keys())

    out = make_folder(out_folder)

    output(f'Output files are found in {out}',
           log_file=log_file,
           run_main=run_main)
    output(f'A: {names[0]}, B: {names[1]}, C: {names[2]}',
           log_file=log_file,
           run_main=run_main)
    with open(f'{out}README.txt', 'w') as readme:
        readme.write(
            'All peaks are unique, meaning that each peak is in only one group.\n'
        )
        readme.write(
            'Capital letter means this sample peak is included in the overlap.\n'
        )
        readme.write(
            'Lowercase letter means the sample is excluded in the overlap.\n\n'
        )
        readme.write(f'A: {names[0]}\nB: {names[1]}\nC: {names[2]}')

    master = bed_dict[names[0]].cat(bed_dict[names[1]]).cat(
        bed_dict[names[2]]).sort().merge()

    A = bed_dict[names[0]].sort().merge()
    B = bed_dict[names[1]].sort().merge()
    C = bed_dict[names[2]].sort().merge()

    # Insertion order matters: the seven groups after the first four entries
    # are passed positionally to plot_venn3_counts below.
    sorted_dict = OrderedDict({'master': master, 'A': A, 'B': B, 'C': C})
    sorted_dict['Abc'] = master.intersect(A).intersect(B, v=True).intersect(
        C, v=True)
    sorted_dict['aBc'] = master.intersect(B).intersect(A, v=True).intersect(
        C, v=True)
    sorted_dict['ABc'] = master.intersect(A).intersect(B).intersect(C, v=True)
    sorted_dict['abC'] = master.intersect(C).intersect(A, v=True).intersect(
        B, v=True)
    sorted_dict['AbC'] = master.intersect(A).intersect(C).intersect(B, v=True)
    sorted_dict['aBC'] = master.intersect(B).intersect(C).intersect(A, v=True)
    sorted_dict['ABC'] = master.intersect(A).intersect(B).intersect(C)

    labTup = tuple(sorted_dict)
    lenTup = tuple(len(bed) for bed in sorted_dict.values())

    output(f'{labTup}\n{lenTup}', log_file=log_file, run_main=run_main)

    plot_venn3_counts(lenTup[4:], names, f'{overlap_name} Peak', out)
    out_result(f'{out}venn_plot/{overlap_name}_Peak-overlap.png',
               f"{overlap_name} Peak Venn Overlap",
               run_main=run_main)

    for key, bed in sorted_dict.items():
        bed_path = f"{out}{key.replace(' ', '_')}-peaks-from-mergedPeaks.bed"
        if len(bed) == 0:
            # Can't convert empty bed file to dataframe: write an empty file.
            open(bed_path, 'w').close()
        else:
            bed2df(bed).to_csv(bed_path, header=None, index=None, sep="\t")

    # Stays empty when no genome is given, so the final merge is a no-op.
    return_dict = {}

    if genome:
        output('Annotating ovelapped peaks...',
               log_file=log_file,
               run_main=run_main)
        unikey = '{}_unique'
        unianno = '{}_unique_annotated'
        return_dict = annotate_peaks(
            {
                unikey.format(key): bed2df(bed)
                for key, bed in sorted_dict.items() if len(bed) > 0
            },
            out,
            genome=genome,
            log_file=log_file,
            run_main=run_main)
        # Empty groups were not annotated; mark them explicitly.
        for key, bed in sorted_dict.items():
            if len(bed) == 0:
                return_dict[unianno.format(key)] = None

        def gene_symbols(group):
            '''Set of gene symbols annotated to a group (empty if no peaks).'''
            annotated = return_dict[unianno.format(group)]
            if annotated is None:
                return set()
            return set(annotated.SYMBOL.unique().tolist())

        plot_venn3_set({
            names[0]: gene_symbols('A'),
            names[1]: gene_symbols('B'),
            names[2]: gene_symbols('C')
        }, f'{overlap_name}_annotated_genes', out)
        out_result(
            f'{out}venn_plot/{overlap_name}_annotated_genes-overlap.png',
            f"{overlap_name.replace('_',' ')} Gene Venn Overlap",
            run_main=run_main)

    # NOTE(review): bed2df is called on empty groups here although the file
    # writing above avoids that - confirm bed2df copes with empty beds.
    return_sorted_dict = {key: bed2df(bed) for key, bed in sorted_dict.items()}

    # Fixed: the original tested `genome is None` here but `bool(genome)`
    # above, so a falsy non-None genome raised NameError on return_dict.
    return {**return_sorted_dict, **return_dict}
Пример #6
0
def overlap_two(bed_dict,
                overlap_name,
                out_folder,
                log_file,
                genome=None,
                run_main=False):
    '''
    Takes a dictionary of two bed-like format files.
    Merges all overlapping peaks for each bed into a master file.
    Intersects beds to merged master file.
    Performs annotations with ChIPseeker if genome is specified.
    Plots venn diagrams of peak overlaps
    If genome is specified, also plots venn diagrams of annotated gene sets.

    Inputs
    ------
    bed_dict:  dictionary of BedTool files (two entries)
    overlap_name: name used in plot titles and file names
    out_folder: output folder (passed through make_folder)
    log_file: log file for output()
    genome: 'hg38','hg19','mm10'.  Any falsy value skips annotation.
    run_main: forwarded to output(), out_result() and annotate_peaks()

    Returns
    -------
    Returns a dictionary of dataframes keyed 'overlap' and '<name>_unique_peak'.
    If genome is specified, it also includes:
      - the entries returned by annotate_peaks ('<key>_unique_annotated' is
        None for empty peak groups)
      - gene sets '<name>_unique_genes' and 'Overlap_Gene_Set'
    '''

    names = list(bed_dict.keys())

    out_folder = make_folder(out_folder)

    output(f'Output files for {overlap_name} are found in {out_folder}',
           log_file=log_file,
           run_main=run_main)

    masterfile = bed_dict[names[0]].cat(bed_dict[names[1]]).sort().merge()
    sorted_dict = {key: bed.sort().merge() for key, bed in bed_dict.items()}
    overlap_dict = {
        'overlap':
        masterfile.intersect(sorted_dict[names[0]]).intersect(
            sorted_dict[names[1]])
    }
    for key in sorted_dict:
        # Unique peaks = master peaks hit by this bed but not by the other.
        other_bed = next(bed for other_key, bed in sorted_dict.items()
                         if other_key != key)
        overlap_dict[f'{key}_unique_peak'] = masterfile.intersect(
            sorted_dict[key]).intersect(other_bed, v=True)

    for key, bed in overlap_dict.items():
        bed_path = f'{out_folder}{key.replace(" ", "_")}-unique-peaks-from-mergedPeaks.bed'
        if len(bed) == 0:
            # Can't convert empty bed file to dataframe: write an empty file.
            open(bed_path, 'w').close()
        else:
            bed2df(bed).to_csv(bed_path, header=None, index=None, sep="\t")

    overlap_numbers = pd.Series(
        {
            names[0]: len(overlap_dict[f'{names[0]}_unique_peak']),
            names[1]: len(overlap_dict[f'{names[1]}_unique_peak']),
            'overlap': len(overlap_dict['overlap'])
        },
        index=[names[0], names[1], 'overlap'])

    # Venn
    plot_venn2(overlap_numbers, overlap_name.replace('_', ' '), out_folder)
    out_result(
        f'{out_folder}venn_plot/{overlap_name.replace(" ","_")}-overlap.png',
        f"{overlap_name.replace('_',' ')} Peak Venn Overlap",
        run_main=run_main)

    annotated = {}
    gene_overlaps = {}

    if genome:
        # Annotate with ChIPseeker
        unikey = '{}_unique'
        unianno = '{}_unique_annotated'
        annotated = annotate_peaks(
            {
                unikey.format(key): bed2df(bed)
                for key, bed in overlap_dict.items() if len(bed) > 0
            },
            out_folder,
            genome=genome,
            log_file=log_file,
            run_main=run_main)
        # Empty groups were not annotated; mark them explicitly.
        for key, bed in overlap_dict.items():
            if len(bed) == 0:
                annotated[unianno.format(key)] = None

        def gene_symbols(group):
            '''Set of gene symbols annotated to a group (empty if no peaks).'''
            frame = annotated[unianno.format(group)]
            if frame is None:
                return set()
            return set(frame.SYMBOL.unique().tolist())

        Set1_unique = gene_symbols(f'{names[0]}_unique_peak')
        Set2_unique = gene_symbols(f'{names[1]}_unique_peak')
        Overlap_Set = gene_symbols('overlap')

        venn2_dict = {
            names[0]: (Set1_unique | Overlap_Set),
            names[1]: (Set2_unique | Overlap_Set)
        }

        plot_name = f'{overlap_name.replace("_"," ")} Annotated Gene'
        plot_venn2_set(venn2_dict, plot_name, out_folder)
        out_result(
            f'{out_folder}venn_plot/{plot_name.replace(" ","_")}-overlap.png',
            f"{overlap_name.replace('_',' ')} Venn Annotated Gene Overlap",
            run_main=run_main)

        gene_overlaps[f'{names[0]}_unique_genes'] = Set1_unique - (
            Set2_unique | Overlap_Set)
        gene_overlaps[f'{names[1]}_unique_genes'] = Set2_unique - (
            Set1_unique | Overlap_Set)
        gene_overlaps['Overlap_Gene_Set'] = (Set1_unique
                                             & Set2_unique) | Overlap_Set

    # NOTE(review): bed2df is called on empty groups here although the file
    # writing above avoids that - confirm bed2df copes with empty beds.
    bed_frames = {key: bed2df(bed) for key, bed in overlap_dict.items()}

    # Fixed: the annotated tables used to be thrown away when return_dict was
    # rebuilt.  They go first in the merge so every key that was returned
    # before keeps exactly the value it had.
    return {**annotated, **bed_frames, **gene_overlaps}