Example #1
def main(use_config=True,
         outputdir=None,
         results=None,
         md_results=None,
         mdd_results=None,
         motif_distances=None,
         md=False,
         mdd=False,
         debug=False,
         jobid=None,
         output_type=False,
         p_cutoff=None,
         plot_format=None):
    '''This is the main script of the OUTPUT module. It creates the output
        files (txt/html tables and summary plots) associated with a TFEA run.
    '''
    start_time = time.time()
    if use_config:
        from TFEA import config
        outputdir = config.vars['OUTPUT']
        figuredir = config.vars['FIGUREDIR']
        results = config.vars['RESULTS']
        md_results = config.vars['MD_RESULTS']
        mdd_results = config.vars['MDD_RESULTS']
        motif_distances = config.vars['MOTIF_DISTANCES']
        md = config.vars['MD']
        mdd = config.vars['MDD']
        debug = config.vars['DEBUG']
        jobid = config.vars['JOBID']
        output_type = config.vars['OUTPUT_TYPE']
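        # p-values are handled as natural-log values downstream, so the
        # user-supplied PADJCUTOFF is log-transformed here to match (inferred
        # from the np.log usage here and in the ENRICHMENT module)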
        p_cutoff = np.log(config.vars['PADJCUTOFF'])
        padj_cutoff = np.log(config.vars['PADJCUTOFF'])
        label1 = config.vars['LABEL1']
        label2 = config.vars['LABEL2']
        plotall = config.vars['PLOTALL']
        singlemotif = config.vars['SINGLEMOTIF']
        plot_format = config.vars['PLOT_FORMAT']

    print("Creating output...", end=' ', flush=True, file=sys.stderr)
    TFEA_header = [
        '#TF', 'E-Score', 'Corrected E-Score', 'Events', 'GC', 'FPKM', 'P-adj',
        'Corrected P-adj'
    ]
    description = [
        'Motif Name', 'Enrichment Score',
        'Enrichment Score following GC correction',
        'Number of motif instances within analyzed regions',
        'GC-content of motif',
        'FPKM of the gene associated with the motif if an annotation is provided',
        'Adjusted P-value (Bonferroni)',
        'Adjusted P-value (Bonferroni) after GC correction'
    ]
    sort_index = [5, 3, 2, -1]
    txt_output(outputdir=outputdir,
               results=results,
               outname='results.txt',
               sortindex=sort_index,
               header=TFEA_header)
    plot.plot_global_MA(results,
                        p_cutoff=p_cutoff,
                        title='TFEA MA-Plot',
                        xlabel='$Log_{10}$(Events)',
                        ylabel='E-Score',
                        savepath=figuredir / (f'TFEA_MA.{plot_format}'),
                        plot_format=plot_format,
                        c_index=1,
                        x_index=3,
                        y_index=2,
                        p_index=-1,
                        ylimits=[-1, 1])
    # plot.plot_global_volcano(results, p_cutoff=p_cutoff, title='TFEA Volcano Plot',
    #                 xlabel='Area Under the Curve (AUC)',
    #                 ylabel='-log10(P-adj)',
    #                 savepath=figuredir / 'TFEA_volcano.png',
    #                 dpi=dpi)
    # plot.plot_global_gc(results, p_cutoff=p_cutoff, title='TFEA GC-Plot',
    #                     xlabel='Motif GC-content',
    #                     ylabel='Area Under the Curve (AUC)',
    #                     savepath=figuredir / 'TFEA_GC.png', dpi=dpi,
    #                     x_index=-3,
    #                     y_index=1,
    #                     p_index=-1)
    if md:
        header = ['#TF', 'MD-Score', 'Events', 'p-val']
        txt_output(outputdir=outputdir,
                   results=md_results,
                   outname='md_results.txt',
                   header=header,
                   sortindex=[-1],
                   log=False)
        plot.plot_global_MA(md_results,
                            p_cutoff=p_cutoff,
                            title='MD MA-Plot',
                            xlabel='Log10(Motif Hits)',
                            ylabel='MD-Score Difference',
                            savepath=figuredir / (f'MD_MA.{plot_format}'),
                            plot_format=plot_format,
                            x_index=2,
                            y_index=1,
                            p_index=-1,
                            ylimits=[-1, 1])
        plot.plot_global_volcano(md_results,
                                 p_cutoff=p_cutoff,
                                 title='MD Volcano Plot',
                                 xlabel='MD-Score Difference',
                                 ylabel='-log10(P-val)',
                                 savepath=figuredir /
                                 (f'MD_volcano.{plot_format}'),
                                 plot_format=plot_format)
    if mdd:
        header = ['#TF', 'MDD-Score', 'Events', 'p-val']
        txt_output(outputdir=outputdir,
                   results=mdd_results,
                   outname='mdd_results.txt',
                   header=header,
                   sortindex=[-1],
                   log=False)
        plot.plot_global_MA(mdd_results,
                            p_cutoff=p_cutoff,
                            title='MDD MA-Plot',
                            xlabel='Log10(Motif Hits)',
                            ylabel='Differential MD-Score Difference',
                            savepath=figuredir / (f'MDD_MA.{plot_format}'),
                            plot_format=plot_format,
                            x_index=2,
                            y_index=1,
                            p_index=-1,
                            ylimits=[-1, 1])
        plot.plot_global_volcano(mdd_results,
                                 p_cutoff=p_cutoff,
                                 title='MDD Volcano Plot',
                                 xlabel='Differential MD-Score Difference',
                                 ylabel='-log10(P-val)',
                                 savepath=figuredir /
                                 (f'MDD_volcano.{plot_format}'),
                                 plot_format=plot_format)

    total_time = time.time() - start_time
    if use_config:
        config.vars['OUTPUTtime'] = total_time
    if output_type == 'html':
        if use_config:
            # summary_html_output(config_object=config.vars, outputdir=outputdir)
            module_list = [('COMBINE', config.vars['COMBINE'],
                            config.vars['COMBINEtime']),
                           ('RANK', config.vars['RANK'],
                            config.vars['RANKtime']),
                           ('SCANNER', config.vars['SCANNER'],
                            config.vars['SCANNERtime']),
                           ('ENRICHMENT', config.vars['ENRICHMENT'],
                            config.vars['ENRICHMENTtime']),
                           ('OUTPUT', config.vars['OUTPUT_TYPE'],
                            config.vars['OUTPUTtime'])]
        else:
            module_list = []
        create_motif_result_htmls(results=results,
                                  results_header=TFEA_header,
                                  outputdir=outputdir,
                                  padj_cutoff=padj_cutoff,
                                  singlemotif=singlemotif,
                                  plotall=plotall,
                                  auc_index=2,
                                  padj_index=-1,
                                  plot_format=plot_format)
        html_output(results=results,
                    results_header=TFEA_header,
                    description=description,
                    module_list=module_list,
                    outputdir=outputdir,
                    label1=label1,
                    label2=label2,
                    padj_cutoff=padj_cutoff,
                    plotall=plotall,
                    auc_index=2,
                    padj_index=-1,
                    sortindex=sort_index,
                    plot_format=plot_format)

    print("done in: " + str(datetime.timedelta(seconds=int(total_time))),
          file=sys.stderr)

    if debug:
        multiprocess.current_mem_usage(jobid)
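
A minimal usage sketch (not part of the listed source), assuming the earlier TFEA modules have already populated config.vars as in Example #3; note that calling main(use_config=False) would additionally need figuredir, padj_cutoff, label1, label2, plotall, and singlemotif, which are only bound inside the config branch above.

# Hypothetical invocation of the OUTPUT module via the config path
from TFEA import output
output.main()  # reads OUTPUT, FIGUREDIR, RESULTS, ... from TFEA.config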
Example #2
def main(use_config=True,
         fasta_file=False,
         md_fasta1=False,
         md_fasta2=False,
         ranked_file=None,
         md_bedfile1=None,
         md_bedfile2=None,
         scanner=None,
         md=None,
         largewindow=None,
         smallwindow=None,
         genomehits=None,
         fimo_background=None,
         genomefasta=None,
         tempdir=None,
         fimo_motifs=None,
         singlemotif=None,
         fimo_thresh=None,
         debug=None,
         mdd=None,
         jobid=None,
         cpus=None):
    '''This is the main script of the SCANNER module. It returns motif distances
        to regions of interest, either by scanning fasta files on the fly using
        fimo or homer, or by running bedtools closest on a center bed file
        against a database of bed files corresponding to motif hits across the
        genome

    Parameters
    ----------
    use_config : boolean
        Whether to use a config module to assign variables.
    fasta_file : str
        Full path to a fasta file
    md_fasta1 : str
        Full path to a fasta file corresponding to a single condition. Only 
        required if md score analysis desired
    md_fasta2 : str
        Full path to a fasta file corresponding to a single condition. Only 
        required if md score analysis desired
    ranked_file : str
        Full path to a ranked bed file used in calculating background for 
        fimo scanning. Only necessary if fimo scanning desired
    scanner : str
        Scanning method desired
    md : boolean
        Whether md score analysis is desired. If True, requires bed files for
        each condition. These can be generated in the COMBINE module.
    largewindow : int
        Half-length of total window size to use when defining cutoffs for 
        how far out to measure motif distances
    smallwindow : int
        Half-length of window size to use when defining cutoffs for significant
        motif hits
    genomehits : str
        Full path to a folder containing bed files of motif hits across the 
        genome
    fimo_background : int, str, or boolean
        Defines whether to use a background file when performing fimo motif
        scanning. A user can specify an int (a custom window size),
        'smallwindow', 'largewindow', a path to an existing background file,
        or False if no background is desired.
    genomefasta : str
        Full path to a fasta file for desired genome
    tempdir : str
        Full path to a directory where files will be saved
    fimo_motifs : str
        Full path to a .meme formatted motif database
    singlemotif : str or boolean
        Whether to perform scanning on only a subset of motifs. A user can
        specify a single motif or a ',' separated list of motifs.
    fimo_thresh : str
        A float formatted as a string to be used when calling fimo to specify
        the p-value cutoff threshold
    debug : boolean
        Whether to print debug statements specifically within the multiprocess
        module

    Returns
    -------
    motif_distances : list of lists
        A list containing a list for each motif scanned. For each motif, the 
        list begins with the motif name as a string and is followed by int
        values corresponding to the motif distance for each region (ranked). A
        '.' value means the motif was not within the given region
    md_distances1 : list of lists
        A list containing a list for each motif scanned. For each motif, the 
        list begins with the motif name as a string and is followed by int
        values corresponding to the motif distance for each region (ranked). A
        '.' value means the motif was not within the given region
    md_distances2 : list of lists
        A list containing a list for each motif scanned. For each motif, the 
        list begins with the motif name as a string and is followed by int
        values corresponding to the motif distance for each region (ranked). A
        '.' value means the motif was not within the given region

    Raises
    ------
    InputError
        If an unknown scanner option is specified
    '''
    start_time = time.time()
    if use_config:
        from TFEA import config
        fasta_file = config.vars['FASTA_FILE']
        md_fasta1 = config.vars['MD_FASTA1']
        md_fasta2 = config.vars['MD_FASTA2']
        mdd_fasta1 = config.vars['MDD_FASTA1']
        mdd_fasta2 = config.vars['MDD_FASTA2']
        ranked_file = config.vars['RANKED_FILE']
        md_bedfile1 = config.vars['MD_BEDFILE1']
        md_bedfile2 = config.vars['MD_BEDFILE2']
        mdd_bedfile1 = config.vars['MDD_BEDFILE1']
        mdd_bedfile2 = config.vars['MDD_BEDFILE2']
        scanner = config.vars['SCANNER']
        md = config.vars['MD']
        largewindow = config.vars['LARGEWINDOW']
        smallwindow = config.vars['SMALLWINDOW']
        genomehits = config.vars['GENOMEHITS']
        fimo_background = config.vars['FIMO_BACKGROUND']
        genomefasta = config.vars['GENOMEFASTA']
        tempdir = config.vars['TEMPDIR']
        fimo_motifs = config.vars['FIMO_MOTIFS']
        singlemotif = config.vars['SINGLEMOTIF']
        fimo_thresh = config.vars['FIMO_THRESH']
        debug = config.vars['DEBUG']
        mdd = config.vars['MDD']
        mdd_pval = config.vars['MDD_PVAL']
        mdd_percent = config.vars['MDD_PERCENT']
        pvals = config.vars['PVALS']
        cpus = config.vars['CPUS']
        jobid = config.vars['JOBID']

    print("Scanning regions using " + scanner + "...",
          flush=True,
          file=sys.stderr)

    motif_distances = None
    md_distances1 = None
    md_distances2 = None
    mdd_distances1 = None
    mdd_distances2 = None

    if not fasta_file and scanner != 'genome hits':
        fasta_file = getfasta(bedfile=ranked_file,
                              genomefasta=genomefasta,
                              tempdir=tempdir,
                              outname='ranked_file.fa')
        if os.stat(fasta_file).st_size == 0:
            raise exceptions.FileEmptyError(
                "Error in SCANNER module. Converting RANKED_FILE to fasta failed."
            )
    if md:
        if not md_fasta1:
            md_fasta1 = getfasta(bedfile=md_bedfile1,
                                 genomefasta=genomefasta,
                                 tempdir=tempdir,
                                 outname='md1_fasta.fa')
        if not md_fasta2:
            md_fasta2 = getfasta(bedfile=md_bedfile2,
                                 genomefasta=genomefasta,
                                 tempdir=tempdir,
                                 outname='md2_fasta.fa')
        if os.stat(md_fasta1).st_size == 0 or os.stat(md_fasta2).st_size == 0:
            raise exceptions.FileEmptyError(
                "Error in SCANNER module. Converting MD bedfiles to fasta failed."
            )
    if mdd:
        if not mdd_fasta1:
            mdd_fasta1 = getfasta(bedfile=mdd_bedfile1,
                                  genomefasta=genomefasta,
                                  tempdir=tempdir,
                                  outname='mdd1_fasta.fa')
        if not mdd_fasta2:
            mdd_fasta2 = getfasta(bedfile=mdd_bedfile2,
                                  genomefasta=genomefasta,
                                  tempdir=tempdir,
                                  outname='mdd2_fasta.fa')
        if os.stat(mdd_fasta1).st_size == 0 or os.stat(
                mdd_fasta2).st_size == 0:
            raise exceptions.FileEmptyError(
                "Error in SCANNER module. Converting MDD bedfiles to fasta failed."
            )

    #FIMO
    if scanner == 'fimo':
        #Get background file, if none desired set to 'None'
        if fasta_file and fimo_background:
            background_file = fasta_markov(tempdir=tempdir,
                                           fastafile=fasta_file,
                                           order='1')
        elif fimo_background == 'largewindow':
            background_file = fimo_background_file(window=int(largewindow),
                                                   tempdir=tempdir,
                                                   bedfile=ranked_file,
                                                   genomefasta=genomefasta,
                                                   order='1')
        elif fimo_background == 'smallwindow':
            background_file = fimo_background_file(window=int(smallwindow),
                                                   tempdir=tempdir,
                                                   bedfile=ranked_file,
                                                   genomefasta=genomefasta,
                                                   order='1')
        elif type(fimo_background) == int:
            background_file = fimo_background_file(window=fimo_background,
                                                   tempdir=tempdir,
                                                   bedfile=ranked_file,
                                                   genomefasta=genomefasta,
                                                   order='1')
        elif type(fimo_background) == str:
            background_file = fimo_background
        else:
            background_file = None
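        # background_file is later passed to the fimo helper as its background
        # model (fasta_markov presumably wraps MEME's fasta-get-markov); when
        # None, the wrapper presumably omits the background option so FIMO
        # falls back to its default background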

        #Get motifs to scan through
        if singlemotif != False:
            motif_list = singlemotif.split(',')
        else:
            motif_list = fimo_motif_names(motifdatabase=fimo_motifs)

        #Perform fimo on desired motifs
        print("\tTFEA:", file=sys.stderr)
        fimo_keywords = dict(bg_file=background_file,
                             fasta_file=fasta_file,
                             tempdir=tempdir,
                             motifdatabase=fimo_motifs,
                             thresh=fimo_thresh,
                             largewindow=largewindow)

        motif_distances = multiprocess.main(function=fimo,
                                            args=motif_list,
                                            kwargs=fimo_keywords,
                                            debug=debug,
                                            jobid=jobid,
                                            cpus=cpus)

        #FIMO for md score fasta files
        if md:
            print("\tMD:", file=sys.stderr)
            fimo_keywords = dict(bg_file=background_file,
                                 fasta_file=md_fasta1,
                                 tempdir=tempdir,
                                 motifdatabase=fimo_motifs,
                                 thresh=fimo_thresh,
                                 largewindow=largewindow)
            md_distances1 = multiprocess.main(function=fimo,
                                              args=motif_list,
                                              kwargs=fimo_keywords,
                                              debug=debug,
                                              jobid=jobid,
                                              cpus=cpus)

            fimo_keywords = dict(bg_file=background_file,
                                 fasta_file=md_fasta2,
                                 tempdir=tempdir,
                                 motifdatabase=fimo_motifs,
                                 thresh=fimo_thresh,
                                 largewindow=largewindow)
            md_distances2 = multiprocess.main(function=fimo,
                                              args=motif_list,
                                              kwargs=fimo_keywords,
                                              debug=debug,
                                              jobid=jobid,
                                              cpus=cpus)

            if use_config:
                config.vars['MD_DISTANCES1'] = md_distances1
                config.vars['MD_DISTANCES2'] = md_distances2

        if mdd:
            print("\tMDD:", file=sys.stderr)
            print(f'\t Completed: 0/{len(motif_distances)} ',
                  end=' ',
                  file=sys.stderr)
            fimo_keywords = dict(bg_file=background_file,
                                 fasta_file=mdd_fasta1,
                                 tempdir=tempdir,
                                 motifdatabase=fimo_motifs,
                                 thresh=fimo_thresh,
                                 largewindow=largewindow)
            mdd_distances1 = multiprocess.main(function=fimo,
                                               args=motif_list,
                                               kwargs=fimo_keywords,
                                               debug=debug,
                                               jobid=jobid,
                                               cpus=cpus)

            fimo_keywords = dict(bg_file=background_file,
                                 fasta_file=mdd_fasta2,
                                 tempdir=tempdir,
                                 motifdatabase=fimo_motifs,
                                 thresh=fimo_thresh,
                                 largewindow=largewindow)
            mdd_distances2 = multiprocess.main(function=fimo,
                                               args=motif_list,
                                               kwargs=fimo_keywords,
                                               debug=debug,
                                               jobid=jobid,
                                               cpus=cpus)
            # mdd_distances1 = []
            # mdd_distances2 = []
            # mdd_sorted_indices = np.argsort(pvals)
            # for i, single_motif_distances in enumerate(motif_distances, 1):
            #     motif = single_motif_distances[0]
            #     mdd_distances = single_motif_distances[1:]
            #     print("pval len:", len(pvals), file=sys.stderr)
            #     print("mdd_indices len:", len(mdd_sorted_indices), file=sys.stderr)
            #     print("mdd_dist len:", len(mdd_distances), file=sys.stderr)
            #     mdd_sorted_distances = [mdd_distances[i] for i in mdd_sorted_indices]
            #     if mdd_percent != False:
            #         cutoff = int(len(mdd_sorted_distances)*mdd_percent)
            #         mdd_distances2.append([motif] + mdd_sorted_distances[:cutoff])
            #         mdd_distances1.append([motif] + mdd_sorted_distances[cutoff:])
            #     else:
            #         sorted_pvals = [pvals[i] for i in mdd_sorted_indices]
            #         cutoff = int(len([p for p in sorted_pvals if p < mdd_pval]))
            #         mdd_distances2.append([motif] + mdd_sorted_distances[:cutoff])
            #         mdd_distances1.append([motif] + mdd_sorted_distances[cutoff:])
            #     # print(f'\r\t Completed: {i}/{len(motif_distances)} ', end=' ', flush=True, file=sys.stderr)
            if use_config:
                config.vars['MDD_DISTANCES1'] = mdd_distances1
                config.vars['MDD_DISTANCES2'] = mdd_distances2

    #HOMER
    elif scanner == 'homer':
        raise exceptions.InputError(
            "Homer scanning is not supported at this time.")

    #GENOME HITS
    elif scanner == 'genome hits':
        #Get motifs to analyze
        if singlemotif == False:
            motif_list = os.listdir(genomehits)
        else:
            motif_list = [
                os.path.join(genomehits, motif)
                for motif in singlemotif.split(',')
            ]

        #Perform bedtools closest to get distances
        ranked_file = get_center(bedfile=ranked_file, outname=ranked_file)
        print("\tTFEA:", file=sys.stderr)
        bedtools_distance_keywords = dict(genomehits=genomehits,
                                          ranked_center_file=ranked_file,
                                          tempdir=tempdir,
                                          distance_cutoff=largewindow,
                                          rank_index=3)

        motif_distances = multiprocess.main(function=bedtools_closest,
                                            args=motif_list,
                                            kwargs=bedtools_distance_keywords,
                                            debug=debug,
                                            jobid=jobid,
                                            cpus=cpus)

        #GENOME HITS for md score bed files
        if md:
            print("\tMD:", file=sys.stderr)
            md_bedfile1 = get_center(bedfile=md_bedfile1, outname=md_bedfile1)
            bedtools_distance_keywords = dict(genomehits=genomehits,
                                              ranked_center_file=md_bedfile1,
                                              tempdir=tempdir,
                                              distance_cutoff=largewindow)

            md_distances1 = multiprocess.main(
                function=bedtools_closest,
                args=motif_list,
                kwargs=bedtools_distance_keywords,
                debug=debug,
                jobid=jobid,
                cpus=cpus)

            md_bedfile2 = get_center(bedfile=md_bedfile2, outname=md_bedfile2)
            bedtools_distance_keywords = dict(genomehits=genomehits,
                                              ranked_center_file=md_bedfile2,
                                              tempdir=tempdir,
                                              distance_cutoff=largewindow)

            md_distances2 = multiprocess.main(
                function=bedtools_closest,
                args=motif_list,
                kwargs=bedtools_distance_keywords,
                debug=debug,
                jobid=jobid,
                cpus=cpus)
            if use_config:
                config.vars['MD_DISTANCES1'] = md_distances1
                config.vars['MD_DISTANCES2'] = md_distances2
        if mdd:
            print("\tMDD:", file=sys.stderr)
            print(f'\t Completed: 0/{len(motif_distances)} ',
                  end=' ',
                  file=sys.stderr)
            mdd_bedfile1 = get_center(bedfile=mdd_bedfile1,
                                      outname=mdd_bedfile1)
            bedtools_distance_keywords = dict(genomehits=genomehits,
                                              ranked_center_file=mdd_bedfile1,
                                              tempdir=tempdir,
                                              distance_cutoff=largewindow)

            mdd_distances1 = multiprocess.main(
                function=bedtools_closest,
                args=motif_list,
                kwargs=bedtools_distance_keywords,
                debug=debug,
                jobid=jobid,
                cpus=cpus)

            mdd_bedfile2 = get_center(bedfile=mdd_bedfile2,
                                      outname=mdd_bedfile2)
            bedtools_distance_keywords = dict(genomehits=genomehits,
                                              ranked_center_file=mdd_bedfile2,
                                              tempdir=tempdir,
                                              distance_cutoff=largewindow)

            mdd_distances2 = multiprocess.main(
                function=bedtools_closest,
                args=motif_list,
                kwargs=bedtools_distance_keywords,
                debug=debug,
                jobid=jobid,
                cpus=cpus)
            # mdd_distances1 = []
            # mdd_distances2 = []
            # mdd_sorted_indices = np.argsort(pvals)
            # for i, single_motif_distances in enumerate(motif_distances, 1):
            #     motif = single_motif_distances[0]
            #     mdd_distances = single_motif_distances[1:]
            #     mdd_sorted_distances = [mdd_distances[i] for i in mdd_sorted_indices]
            #     if mdd_percent != False:
            #         cutoff = int(len(mdd_sorted_distances)*mdd_percent)
            #         mdd_distances2.append([motif] + mdd_sorted_distances[:cutoff])
            #         mdd_distances1.append([motif] + mdd_sorted_distances[cutoff:])
            #     else:
            #         sorted_pvals = [pvals[i] for i in mdd_sorted_indices]
            #         cutoff = int(len([p for p in sorted_pvals if p < mdd_pval]))
            #         mdd_distances2.append([motif] + mdd_sorted_distances[:cutoff])
            #         mdd_distances1.append([motif] + mdd_sorted_distances[cutoff:])
            #    # print(f'\r\t Completed: {i}/{len(motif_distances)} ', end=' ', flush=True, file=sys.stderr)
            if use_config:
                config.vars['MDD_DISTANCES1'] = mdd_distances1
                config.vars['MDD_DISTANCES2'] = mdd_distances2
    else:
        raise exceptions.InputError("SCANNER option not recognized.")

    if use_config:
        config.vars['MOTIF_DISTANCES'] = motif_distances

    total_time = time.time() - start_time
    if use_config:
        config.vars['SCANNERtime'] = total_time

    #Remove large fasta files from output folder
    # if fasta_file:
    #     fasta_file.unlink()
    # if md_fasta1:
    #     md_fasta1.unlink()
    # if md_fasta2:
    #     md_fasta2.unlink()
    # if mdd_fasta1:
    #     mdd_fasta1.unlink()
    # if mdd_fasta2:
    #     mdd_fasta2.unlink()

    print("done in: " + str(datetime.timedelta(seconds=int(total_time))),
          file=sys.stderr)

    if debug:
        multiprocess.current_mem_usage(jobid)

    return motif_distances, md_distances1, md_distances2, mdd_distances1, mdd_distances2
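
A minimal usage sketch (not part of the listed source), assuming the COMBINE and RANK modules have already populated config.vars:

# Hypothetical invocation of the SCANNER module via the config path
from TFEA import scanner
motif_distances, md1, md2, mdd1, mdd2 = scanner.main()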
Example #3
def run():
    #Imports
    #==============================================================================
    import sys
    import subprocess
    import shutil
    from pathlib import Path
    #Add TFEA srcdirectory into path
    srcdirectory = Path(__file__).absolute().parent
    sys.path.insert(0, srcdirectory)

    from TFEA import process_inputs

    #ARGUMENT PARSING
    #==============================================================================
    '''We begin by parsing user arguments. TFEA can be run in two ways and these
        are not mutually exclusive. TFEA has traditional command line flags that
        a user may specify. Additionally, a user may provide a configuration file
        (.ini) with all necessary inputs. Finally, a user may provide both a 
        configuration file and command line flags. In this case, the command line
        flags will overwrite any redundant options in the configuration file.
    '''
    #Process user inputs in a separate module
    parser = process_inputs.read_arguments()

    #Display help message when no args are passed.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    #TEST module
    #==============================================================================
    '''If test flag specified, run unittests and exit.
    '''
    test_install = parser.parse_args().TEST_INSTALL
    if test_install:
        subprocess.call(["python3", srcdirectory / 'test' / 'test_install.py'])
        sys.exit()

    test_full = parser.parse_args().TEST_FULL
    if test_full:
        sbatch = parser.parse_args().SBATCH
        if not sbatch:
            subprocess.call(
                ["python3", srcdirectory / 'test' / 'test_full.py'])
            sys.exit()
        else:
            error_file = str(srcdirectory / 'test' / 'test_files' /
                             'TFEA_test.err')
            output_file = str(srcdirectory / 'test' / 'test_files' /
                              'TFEA_test.out')
            # The --mail-user value and the submitted script path were
            # redacted in this listing; placeholders are used below.
            subprocess.call([
                "sbatch", "--error=" + error_file, "--output=" + output_file,
                "--mail-user=" + "<email>",
                str(srcdirectory / 'test' / 'test_full.py')
            ])
            print(("TFEA tests submitted as sbatch job. It can be "
                   "monitored using:\ntail -f " + error_file))
            sys.exit()

    #Rerun module
    #==============================================================================
    '''If rerun flag specified, rerun all rerun.sh files in specified directory
    '''
    rerun = parser.parse_args().RERUN
    if rerun:
        for path in rerun:
            for rerun_script in Path(path).glob('**/rerun.sh'):
                subprocess.call(["sh", rerun_script])
        sys.exit()

    #VERIFICATION OF USER INPUTS
    #==============================================================================
    '''This section of the code reads config file and user specified flags, makes
        sure these are complete and not conflicting and writes them to config.py
        within TFEA for global use across modules
    '''
    process_inputs.verify_arguments(parser=parser)

    #CREATING DIRECTORIES
    #==============================================================================
    '''TFEA creates the specified output directory if it doesn't exist. Within the
        output directory, 3 directories are created: 'temp_files', 'e_and_o', and
        'plots'. These contain temporary files, stderr and stdout files, and
        figures generated by TFEA. This is also the module where the special 
        --sbatch flag is handled
    '''
    process_inputs.create_directories(srcdirectory=srcdirectory)

    #==============================================================================
    #MAIN SCRIPT
    #==============================================================================

    #SECONDARY IMPORTS
    #==============================================================================
    import multiprocessing as mp

    from TFEA import config
    from TFEA import multiprocess

    #Print starting statements
    #==============================================================================
    print("TFEA start: ", file=sys.stderr)
    #Print multiprocessing information to stderr
    if config.vars['DEBUG']:
        mp.log_to_stderr()
        multiprocess.current_mem_usage(config.vars['JOBID'])

    #COMBINE module
    #==============================================================================
    '''This module is a pre-processing step where a user may specify how to handle
        multiple bed file inputs. The goal is to arrive at a single bed file to
        input into subsequent modules.
    '''
    if config.vars['COMBINE'] != False:
        from TFEA import combine
        combine.main()

    #RANK module
    #==============================================================================
    '''This module decides how to rank regions within the bed files. If genome
        hits specified then the ranked output will only contain the center of each
        region (since we will perform bedtools closest later)
    '''
    if config.vars['RANK'] != False:
        from TFEA import rank
        rank.main()

    #SCANNER module
    #==============================================================================
    '''This module returns motif distances to regions of interest. This is
        accomplished either by scanning regions on the fly using fimo or homer, or 
        by running bedtools closest on region centers compared to a database of
        motif hits across the genome.
    '''
    from TFEA import scanner
    scanner.main()

    #ENRICHMENT module
    #==============================================================================
    '''Where the bulk of TFEA analysis occurs. Some components of plotting module 
        are contained within this enrichment module
    '''
    from TFEA import enrichment
    enrichment.main()

    #OUTPUT module
    #==============================================================================
    '''A module to write output to either a txt or html file
    '''
    from TFEA import output
    output.main()

    print("TFEA done. Output in:", config.vars['OUTPUT'], file=sys.stderr)

    #Delete temp_files directory
    #==============================================================================
    if not config.vars['DEBUG']:
        shutil.rmtree(config.vars['TEMPDIR'])
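
A minimal entry-point sketch (an assumption, not from the listed source) showing how run() could be invoked directly:

# Hypothetical module guard; in the installed package, run() is presumably
# exposed as a console entry point instead
if __name__ == '__main__':
    run()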
Example #4
def main(use_config=True,
         motif_distances=None,
         md_distances1=None,
         md_distances2=None,
         mdd_distances1=None,
         mdd_distances2=None,
         enrichment=None,
         output_type=None,
         permutations=None,
         debug=None,
         largewindow=None,
         smallwindow=None,
         md=None,
         mdd=None,
         cpus=None,
         jobid=None,
         pvals=None,
         fcs=None,
         p_cutoff=None,
         figuredir=None,
         plotall=False,
         fimo_motifs=None,
         meta_profile_dict=None,
         label1=None,
         label2=None,
         dpi=None,
         motif_fpkm={},
         bootstrap=False,
         gc=None,
         plot_format=None):
    '''This is the main script of the ENRICHMENT module. It takes as input
        a list of distances outputted from the SCANNER module and calculates
        an enrichment score, a p-value, and in some instances an adjusted 
        p-value for each motif.
    
    Parameters
    ----------
    use_config : boolean
        Whether to use a config module to assign variables.
    motif_distances : list of lists
        A list containing a list for each motif scanned. For each motif, the 
        list begins with the motif name as a string and is followed by int
        values corresponding to the motif distance for each region (ranked). A
        '.' value means the motif was not within the given region
    md_distances1 : list of lists
        A list containing a list for each motif scanned. For each motif, the 
        list begins with the motif name as a string and is followed by int
        values corresponding to the motif distance for each region (ranked). A
        '.' value means the motif was not within the given region
    md_distances2 : list of lists
        A list containing a list for each motif scanned. For each motif, the 
        list begins with the motif name as a string and is followed by int
        values corresponding to the motif distance for each region (ranked). A
        '.' value means the motif was not within the given region
    enrichment : str
        The type of enrichment analysis to perform
    output_type : str
        Determines what some functions will output. At this point, this is mostly
        intended for debug purposes.
    permutations : int
        Number of random shuffling permutations to perform to calculate a 
        p-value
    debug : boolean
        Whether to print debug statements specifically within the multiprocess
        module
    largewindow : int
        A distance cutoff value used within auc_bgcorrect
    smallwindow : int
        A distance cutoff value used within the md score analysis
    
    Returns
    -------
    results : list of lists
        A list of lists corresponding to enrichment statistics for each motif
    md_results : list of lists
        A list of lists corresponding to md-score statistics for each motif
    '''
    start_time = time.time()
    if use_config:
        motif_distances = config.vars['MOTIF_DISTANCES']
        md_distances1 = config.vars['MD_DISTANCES1']
        md_distances2 = config.vars['MD_DISTANCES2']
        mdd_distances1 = config.vars['MDD_DISTANCES1']
        mdd_distances2 = config.vars['MDD_DISTANCES2']
        enrichment = config.vars['ENRICHMENT']
        permutations = config.vars['PERMUTATIONS']
        debug = config.vars['DEBUG']
        largewindow = config.vars['LARGEWINDOW']
        smallwindow = config.vars['SMALLWINDOW']
        pvals = config.vars['PVALS']
        fcs = config.vars['FCS']
        md = config.vars['MD']
        mdd = config.vars['MDD']
        cpus = config.vars['CPUS']
        jobid = config.vars['JOBID']
        p_cutoff = np.log(config.vars['PADJCUTOFF'])
        figuredir = config.vars['FIGUREDIR']
        plotall = config.vars['PLOTALL']
        fimo_motifs = config.vars['FIMO_MOTIFS']
        meta_profile_dict = config.vars['META_PROFILE']
        label1 = config.vars['LABEL1']
        label2 = config.vars['LABEL2']
        output_type = config.vars['OUTPUT_TYPE']
        bootstrap = config.vars['BOOTSTRAP']
        gc = config.vars['GC']
        plot_format = config.vars['PLOT_FORMAT']
        try:
            motif_fpkm = config.vars['MOTIF_FPKM']
        except KeyError:
            motif_fpkm = {}

    print("Calculating enrichment...", flush=True, file=sys.stderr)

    results = None
    md_results = None
    mdd_results = None

    if enrichment == 'auc':
        gc_correct = {}
        linear_regression = None
        if gc:
            print('\tCorrecting GC:', file=sys.stderr)
            auc_keywords = dict(fimo_motifs=fimo_motifs)
            motif_gc_auc = multiprocess.main(function=get_auc_gc,
                                             args=motif_distances,
                                             kwargs=auc_keywords,
                                             debug=debug,
                                             jobid=jobid,
                                             cpus=cpus)

            #Calculate linear regression based on AUC and GC content of motifs
            varx = np.array([i[2] for i in motif_gc_auc])
            vary = np.array([i[1] for i in motif_gc_auc])
            mask = ~np.isnan(varx) & ~np.isnan(vary)
            linear_regression = [
                x for x in stats.linregress(varx[mask], vary[mask])
            ]
            slope, intercept, _, _, _ = linear_regression
            for key, _, gc in motif_gc_auc:
                offset = slope * gc + intercept
                gc_correct[key] = offset
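            # The fitted line gives the E-Score expected purely from a motif's
            # GC content; gc_correct stores that expected offset per motif so
            # the scoring step below can presumably subtract it to produce the
            # GC-corrected E-Score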

        print('\tCalculating E-Score:', file=sys.stderr)
        # manager = Manager()
        # meta_profile_dict = manager.dict(meta_profile_dict)
        auc_keywords = dict(permutations=permutations,
                            use_config=use_config,
                            output_type=output_type,
                            pvals=pvals,
                            plotall=plotall,
                            p_cutoff=p_cutoff,
                            figuredir=figuredir,
                            largewindow=largewindow,
                            fimo_motifs=fimo_motifs,
                            meta_profile_dict=meta_profile_dict,
                            label1=label1,
                            label2=label2,
                            fcs=fcs,
                            motif_fpkm=motif_fpkm,
                            tests=len(motif_distances),
                            bootstrap=bootstrap,
                            gc_correct=gc_correct,
                            plot_format=plot_format)
        results = multiprocess.main(function=auc_simulate_and_plot,
                                    args=motif_distances,
                                    kwargs=auc_keywords,
                                    debug=debug,
                                    jobid=jobid,
                                    cpus=cpus)

        plot.plot_global_gc(results,
                            p_cutoff=p_cutoff,
                            title='TFEA GC-Plot',
                            xlabel='Motif GC-content',
                            ylabel='Non-corrected E-Score',
                            savepath=figuredir / ('TFEA_GC.' + plot_format),
                            linear_regression=linear_regression,
                            plot_format=plot_format,
                            x_index=4,
                            y_index=1,
                            c_index=2,
                            p_index=-1,
                            ylimits=[-1, 1])

        # results = list()
        # for motif_distance in motif_distances:
        #     results.append(area_under_curve(motif_distance, **auc_keywords))
        # padj_bonferroni(results)
    # elif enrichment == 'anderson-darling':
    #     results = multiprocess.main(function=anderson_darling,
    #                                     args=motif_distances, debug=debug,
    #                                     jobid=jobid, cpus=cpus)

    # elif enrichment == 'auc_bgcorrect':
    #     print('\tTFEA:', file=sys.stderr)
    #     auc_bgcorrect_keywords = dict(permutations=permutations)
    #     results = multiprocess.main(function=area_under_curve_bgcorrect,
    #                                 args=motif_distances,
    #                                 kwargs=auc_bgcorrect_keywords,
    #                                 debug=debug, jobid=jobid, cpus=cpus)

    #     padj_bonferroni(results)
    else:
        raise exceptions.InputError(
            "Enrichment option not recognized or supported.")

    if md:
        print('\tMD:', file=sys.stderr)
        md_results = calculate_md(md_distances1=md_distances1,
                                  md_distances2=md_distances2,
                                  smallwindow=smallwindow,
                                  jobid=jobid,
                                  cpus=cpus,
                                  debug=debug)
        if use_config:
            config.vars['MD_RESULTS'] = md_results
    if mdd:
        print('\tMDD:', file=sys.stderr)
        mdd_results = calculate_md(md_distances1=mdd_distances1,
                                   md_distances2=mdd_distances2,
                                   smallwindow=smallwindow,
                                   jobid=jobid,
                                   cpus=cpus,
                                   debug=debug)
        if use_config:
            config.vars['MDD_RESULTS'] = mdd_results

    if use_config:
        config.vars['RESULTS'] = results

    total_time = time.time() - start_time
    if use_config:
        config.vars['ENRICHMENTtime'] = total_time

    #Remove large meta profile file
    # meta_profile_file.unlink()
    if isinstance(meta_profile_dict, pathlib.Path):
        shutil.rmtree(meta_profile_dict, ignore_errors=True)

    print("done in: " + str(datetime.timedelta(seconds=int(total_time))),
          file=sys.stderr)

    if debug:
        multiprocess.current_mem_usage(jobid)

    return results, md_results, mdd_results
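
A minimal usage sketch (not part of the listed source), assuming the SCANNER module has populated MOTIF_DISTANCES and the related config.vars entries:

# Hypothetical invocation of the ENRICHMENT module via the config path
from TFEA import enrichment
results, md_results, mdd_results = enrichment.main()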
Example #5
def main(use_config=True,
         bed1=None,
         bed2=None,
         method=None,
         tempdir=None,
         md=None,
         largewindow=None,
         scanner=None,
         debug=False,
         label1=None,
         label2=None,
         jobid=None):
    '''This is the main script of the combine function that is called within
        TFEA. Default arguments are assigned to variables within config.vars.

    Parameters
    ----------
    use_config : boolean
        Whether to use a config module to assign variables.
    bed1 : list
        A list of strings specifying full paths to bed files corresponding to
        a single condition (replicates)
    bed2 : list
        A list of strings specifying full paths to bed files corresponding to
        a single condition (replicates)
    method : str
        Method for combining input bed files into a single bed file
    tempdir : str
        Full path to a directory where files will be saved
    md : boolean
        Whether md-score bed files are generated
    largewindow : int
        Half-length of window size to use when generating md-score related
        bed files
    scanner : str
        Scanner method to use in the SCANNER module. Only needed if md is also
        specified. If equal to 'genome hits', the generated md bed files will
        contain only one base and be centered at the middle of the region

    Returns
    -------
    None - Assigns variables within config if use_config is set to True

    Raises
    ------
    FileEmptyError
        If any resulting file is empty
    '''
    start_time = time.time()
    if use_config:
        bed1 = config.vars['BED1']
        bed2 = config.vars['BED2']
        method = config.vars['COMBINE']
        tempdir = config.vars['TEMPDIR']
        md = config.vars['MD']
        md_bedfile1 = config.vars['MD_BEDFILE1']
        md_bedfile2 = config.vars['MD_BEDFILE2']
        largewindow = config.vars['LARGEWINDOW']
        scanner = config.vars['SCANNER']
        label1 = config.vars['LABEL1']
        label2 = config.vars['LABEL2']
        debug = config.vars['DEBUG']
        jobid = config.vars['JOBID']

    print("Combining Regions...", end=' ', flush=True, file=sys.stderr)

    if md_bedfile1 and md_bedfile2:
        centered_md_bedfile1 = tempdir / 'md_bedfile1.centered.bed'
        centered_md_bedfile2 = tempdir / 'md_bedfile2.centered.bed'
        #Boolean to determine whether to generate MD bed files
        md = md and (not md_bedfile1 or not md_bedfile2)
        md_pybedtool1 = BedTool(str(md_bedfile1))
        md_pybedtool1.each(center_feature).each(
            extend_feature,
            size=largewindow).remove_invalid().saveas(centered_md_bedfile1)
        md_pybedtool2 = BedTool(str(md_bedfile2))
        md_pybedtool2.each(center_feature).each(
            extend_feature,
            size=largewindow).remove_invalid().saveas(centered_md_bedfile2)
        if use_config:
            config.vars['MD_BEDFILE1'] = centered_md_bedfile1
            config.vars['MD_BEDFILE2'] = centered_md_bedfile2

    #Use MuMerge to merge bed files
    if method == 'mumerge':
        mumerge_input = tempdir / 'mumerge_input.txt'
        combined_file = tempdir / 'combined_file.mumerge'
        #Write MuMerge input file
        # with open(mumerge_input, 'w') as F:
        #     F.write("#file\tsampid\tgroup\n")
        #     for i,bedpath in enumerate(bed1, 1):
        #         F.write(f'{bedpath}\t{label1}{i}\t{label1}\n')
        #     for i,bedpath in enumerate(bed2, 1):
        #         F.write(f'{bedpath}\t{label2}{i}\t{label2}\n')

        #MuMerge Command - output to combined_file.mumerge.bed
        combined_file = mumerge(mumerge_input,
                                combined_file,
                                bed1=bed1,
                                bed2=bed2,
                                label1=label1,
                                label2=label2)
        clean_combined_file = tempdir / 'combined_file.mumerge.clean.bed'
        combined_pybedtool = BedTool(str(combined_file))
        combined_pybedtool.remove_invalid().saveas(clean_combined_file)
        combined_file = clean_combined_file
        # combined_file = Path(str(combined_file) + '_MUMERGE.bed')

        #Perform simple merge same as merge all for md bed files
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.mumerge"
            md_mumerge_input1 = tempdir / "md_mumerge_input1.txt"
            md_bedfile1 = mumerge(md_mumerge_input1,
                                  md_bedfile1,
                                  bed1=bed1,
                                  label1=label1,
                                  label2=label2)
            md_pybedtool1 = BedTool(str(md_bedfile1))
            md_bedfile1 = tempdir / "md_bedfile1.mumerge.final.bed"
            md_pybedtool1.each(center_feature).each(
                extend_feature,
                size=largewindow).remove_invalid().saveas(md_bedfile1)
            md_bedfile2 = tempdir / "md_bedfile2.mumerge"
            md_mumerge_input2 = tempdir / "md_mumerge_input2.txt"
            md_bedfile2 = mumerge(md_mumerge_input2,
                                  md_bedfile2,
                                  bed2=bed2,
                                  label1=label1,
                                  label2=label2)
            md_pybedtool2 = BedTool(str(md_bedfile2))
            md_bedfile2 = tempdir / "md_bedfile2.mumerge.final.bed"
            md_pybedtool2.each(center_feature).each(
                extend_feature,
                size=largewindow).remove_invalid().saveas(md_bedfile2)

            # md_merged_bed1 = merge_bed(beds=bed1).each(featurefuncs.extend_fields, 4)
            # md_merged_bed2 = merge_bed(beds=bed2).each(featurefuncs.extend_fields, 4)
            # md_merged_bed1.each(center_feature).each(extend_feature, size=largewindow).remove_invalid().saveas(md_bedfile1)
            # md_merged_bed2.each(center_feature).each(extend_feature, size=largewindow).remove_invalid().saveas(md_bedfile2)

    #Merge all bed regions, for MD merge condition replicates
    elif method == 'mergeall':
        combined_file = tempdir / "combined_file.mergeall.bed"
        merged_bed = merge_bed(beds=bed1 + bed2)
        # merged_bed.each(center_feature).each(extend_feature, size=largewindow).saveas(combined_file)
        merged_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.merge.bed"
            md_bedfile2 = tempdir / "md_bedfile2.merge.bed"
            # md_merged_bed1 = merge_bed(beds=bed1).each(featurefuncs.extend_fields, 4).each(featurefuncs.rename, '1')
            # md_merged_bed2 = merge_bed(beds=bed2).each(featurefuncs.extend_fields, 4).each(featurefuncs.rename, '1')
            md_merged_bed1 = merge_bed(beds=bed1).each(
                featurefuncs.extend_fields, 4)
            md_merged_bed2 = merge_bed(beds=bed2).each(
                featurefuncs.extend_fields, 4)
            md_merged_bed1.each(center_feature).each(
                extend_feature,
                size=largewindow).remove_invalid().saveas(md_bedfile1)
            # md_merged_bed1.saveas(md_bedfile1)
            md_merged_bed2.each(center_feature).each(
                extend_feature,
                size=largewindow).remove_invalid().saveas(md_bedfile2)
            # md_merged_bed2.saveas(md_bedfile2)

    elif method == 'tfitclean':
        # combined_file = tfit_clean(beds=bed1+bed2, tempdir=tempdir)
        combined_file = tempdir / "combined_file.tfitclean.bed"
        size_cut = 200
        cleaned_bed = clean_bed(beds=bed1 + bed2, size_cut=size_cut)
        # cleaned_bed.each(center_feature).each(extend_feature, size=largewindow).saveas(combined_file)
        cleaned_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.clean.bed"
            md_bedfile2 = tempdir / "md_bedfile2.clean.bed"
            md_cleaned_bed1 = clean_bed(beds=bed1)
            md_cleaned_bed2 = clean_bed(beds=bed2)
            # md_cleaned_bed1.each(center_feature).each(extend_feature, size=largewindow).saveas(md_bedfile1)
            md_cleaned_bed1.saveas(md_bedfile1)
            # md_cleaned_bed2.each(center_feature).each(extend_feature, size=largewindow).saveas(md_bedfile2)
            md_cleaned_bed2.saveas(md_bedfile2)

    #Intersect all bed regions, for MD intersect condition replicates
    elif method == 'intersectall':
        combined_file = tempdir / 'combined_file.intersectall.bed'
        intersected_bed = intersect_bed(beds=bed1 + bed2)
        # intersected_bed.each(center_feature).each(extend_feature, size=largewindow).saveas(combined_file)
        intersected_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.intersect.bed"
            md_bedfile2 = tempdir / "md_bedfile2.intersect.bed"
            md_intersected_bed1 = intersect_bed(beds=bed1)
            md_intersected_bed2 = intersect_bed(beds=bed2)
            md_intersected_bed1.each(center_feature).each(
                extend_feature,
                size=largewindow).remove_invalid().saveas(md_bedfile1)
            # md_intersected_bed1.saveas(combined_file)
            md_intersected_bed2.each(center_feature).each(
                extend_feature,
                size=largewindow).remove_invalid().saveas(md_bedfile2)
            # md_intersected_bed2.saveas(combined_file)

    #Merge all regions, filter small regions. For MD perform this for each condition
    elif method == 'tfitremovesmall':
        # combined_file = tfit_remove_small(beds=bed1+bed2, tempdir=tempdir)
        size_cut = 200
        combined_file = tempdir / "combined_file.mergeallnosmall.bed"
        merged_bed = merge_bed(beds=bed1 + bed2)
        # merged_bed.filter(lambda b: b.stop - b.start > size_cut).each(center_feature).each(extend_feature, size=largewindow).saveas(combined_file)
        merged_bed.filter(lambda b: b.stop - b.start > size_cut).saveas(
            combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.merge.bed"
            md_bedfile2 = tempdir / "md_bedfile2.merge.bed"
            md_merged_bed1 = merge_bed(beds=bed1)
            md_merged_bed2 = merge_bed(beds=bed2)
            # md_merged_bed1.filter(lambda b: b.stop - b.start > size_cut).each(center_feature).each(extend_feature, size=largewindow).saveas(md_bedfile1)
            md_merged_bed1.filter(
                lambda b: b.stop - b.start > size_cut).saveas(md_bedfile1)
            # md_merged_bed2.filter(lambda b: b.stop - b.start > size_cut).each(center_feature).each(extend_feature, size=largewindow).saveas(md_bedfile2)
            md_merged_bed2.filter(
                lambda b: b.stop - b.start > size_cut).saveas(md_bedfile2)

    #Intersect replicates, merge conditions. For MD intersect condition replicates
    elif method == 'intersect/merge':
        # combined_file = intersect_merge_bed(bed1=bed1, bed2=bed2, tempdir=tempdir)
        combined_file = tempdir / 'combined_file.intermerge.bed'
        intersected_bed1 = intersect_bed(beds=bed1)
        intersected_bed2 = intersect_bed(beds=bed2)
        merged_bed = intersected_bed1.cat(intersected_bed2).merge().sort()
        # merged_bed.each(center_feature).each(extend_feature, size=largewindow).saveas(combined_file)
        merged_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.intersect.bed"
            md_bedfile2 = tempdir / "md_bedfile2.intersect.bed"
            md_intersected_bed1 = intersect_bed(beds=bed1).each(
                featurefuncs.extend_fields, 4).each(featurefuncs.rename, '1')
            md_intersected_bed2 = intersect_bed(beds=bed2).each(
                featurefuncs.extend_fields, 4).each(featurefuncs.rename, '1')
            md_intersected_bed1.each(center_feature).each(
                extend_feature,
                size=largewindow).remove_invalid().saveas(md_bedfile1)
            # md_intersected_bed1.saveas(md_bedfile1)
            md_intersected_bed2.each(center_feature).each(
                extend_feature,
                size=largewindow).remove_invalid().saveas(md_bedfile2)
            # md_intersected_bed2.saveas(md_bedfile2)

    else:
        raise exceptions.InputError("Error: COMBINE option not recognized.")

    #Check to make sure no files are empty
    if os.stat(combined_file).st_size == 0:
        raise exceptions.FileEmptyError(
            "Error in COMBINE module. Resulting bed file is empty.")

    if md:
        if os.stat(md_bedfile1).st_size == 0 or os.stat(
                md_bedfile2).st_size == 0:
            raise exceptions.FileEmptyError(
                "Error in COMBINE module. Resulting md bed file is empty.")
        if use_config:
            #Assign MD_BEDFILE variables in config
            config.vars['MD_BEDFILE1'] = md_bedfile1
            config.vars['MD_BEDFILE2'] = md_bedfile2

    #Assign COMBINED_FILE variable in config
    if use_config:
        config.vars['COMBINED_FILE'] = combined_file

    #Record time, print
    total_time = time.time() - start_time
    if use_config:
        config.vars['COMBINEtime'] = total_time
    print("done in: " + str(datetime.timedelta(seconds=int(total_time))),
          ". Processing",
          len(combined_file.read_text().split('\n')),
          "regions",
          file=sys.stderr)

    if debug:
        multiprocess.current_mem_usage(jobid)
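
A minimal usage sketch (not part of the listed source), assuming config.vars has already been populated by process_inputs; note that the non-config path (use_config=False) would raise a NameError at the first md_bedfile1/md_bedfile2 check, since those names are only assigned in the config branch.

# Hypothetical invocation of the COMBINE module via the config path
from TFEA import combine
combine.main()  # writes COMBINED_FILE (and MD_BEDFILE1/2) back into config.vars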