def runBioProspector(infiles, outfile, dbhandle):
    '''run BioProspector for motif discovery.

    BioProspector is run on only the top fraction of peaks given by
    PARAMS["bioprospector_proportion"] (typically the top 10%).
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=PARAMS["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
        '''
        P.run()

    os.unlink(tmpfasta)
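Every example on this page follows the same CGAT pipeline convention: the shell command is assembled in a local variable called statement containing %(name)s placeholders, and P.run() interpolates those placeholders from the caller's local variables (and PARAMS) before dispatching the job. Below is a minimal, self-contained sketch of that convention; the run() helper is a simplified stand-in for P.run(), not the real implementation.

import inspect
import subprocess


def run():
    # simplified stand-in for P.run(): interpolate the caller's locals
    # into its ``statement`` variable and execute the result in a shell
    caller_locals = inspect.currentframe().f_back.f_locals
    command = caller_locals["statement"] % caller_locals
    subprocess.check_call(command, shell=True)


def example_task():
    # illustrative file names, not taken from the pipeline
    infile, outfile = "peaks.fasta", "peaks.bioprospector"
    statement = "echo BioProspector -i %(infile)s -o %(outfile)s"
    run()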
Example 2
def permuteMATS(infile, outfiles, outdir):
    '''create directories for permutation testing.

    Creates directories for permutation testing and leaves a dummy
    init file in each directory (for timestamping).
    Only becomes active if the :term:`PARAMS` option ``permute`` is set to 1.

    Parameters
    ----------
    infile: string
        name and path of the design file

    outfiles: list
        list of unknown length, capturing all permutations
        retrospectively

    outdir: string
        directory to generate the permutations in

    permutations : int
       :term:`PARAMS`. number of directories to be generated
    '''

    if not os.path.exists(outdir):
        os.makedirs(outdir)
    for i in range(0, PARAMS["permutations"]):
        if not os.path.exists("%s/run%i.dir" % (outdir, i)):
            os.makedirs("%s/run%i.dir" % (outdir, i))
        P.touch("%s/run%i.dir/init" % (outdir, i))
Example 3
def loadGO(infile, outfile, tablename):
    """import GO results into individual tables.

    This method concatenates all the results from
    a GO analysis and uploads into a single table.

    """

    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    load_statement = P.build_load_statement(
        tablename=tablename,
        options="--allow-empty-file "
        "--add-index=category "
        "--add-index=goid ")

    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run()
Example 4
def buildPicardRnaSeqMetrics(infiles, strand, outfile):
    '''run picard:CollectRnaSeqMetrics

    Arguments
    ---------
    infiles : list
        Input filename in :term:`BAM` format and genome file in
        refFlat format
        (http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat)
    strand : string
        Strand specificity, passed to Picard's STRAND option.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3
    infile, genome = infiles

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectRnaSeqMetrics
    REF_FLAT=%(genome)s
    INPUT=%(infile)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    STRAND=%(strand)s
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run()
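For reference, the %(strand)s placeholder maps onto Picard's STRAND_SPECIFICITY argument, which accepts NONE, FIRST_READ_TRANSCRIPTION_STRAND or SECOND_READ_TRANSCRIPTION_STRAND. A hypothetical call with illustrative file names:

# unstranded library; refflat.txt is the gene model in refFlat format
buildPicardRnaSeqMetrics(("sample1.bam", "refflat.txt"),
                         "NONE",
                         "sample1.rnaseqmetrics")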
Example 5
def removeBamfiles(infiles, outfile):
    for bamfile in infiles:
        bam_index = bamfile + ".bai"
        os.unlink(bamfile)
        if os.path.exists(bam_index):
            os.unlink(bam_index)
    P.touch(outfile)
Example 7
def generatePSP(positives, negatives, outfile):
    '''generate a discriminative PSP file from
    the positives and negatives that can be used
    for discriminative MEME.'''

    psp_options = PARAMS["psp_options"]

    nseqs_pos = int(FastaIterator.count(positives))
    nseqs_neg = int(FastaIterator.count(negatives))

    if nseqs_pos < 2 or nseqs_neg < 2:
        E.warn("%s: input files do not have sufficient sequences "
               "to run psp-gen, skipping" % outfile)
        P.touch(outfile)
        return

    # get appropriate options from meme options
    if PARAMS.get("meme_revcomp", True):
        psp_options += " -revcomp"

    statement = '''psp-gen -pos %(positives)s
                           -neg %(negatives)s
                           %(psp_options)s
                   > %(outfile)s '''

    P.run()
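The PSP file generated here is intended for MEME's -psp option (position-specific priors) in the downstream discriminative motif search. A sketch of such a downstream statement; the variable names are illustrative and the surrounding task is assumed, not taken from the source.

# hypothetical downstream use of the generated PSP file
positives, psp_file, outdir = "pos.fasta", "pos.psp", "meme.dir"
statement = '''meme %(positives)s -psp %(psp_file)s
               -oc %(outdir)s %(meme_options)s'''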
Example 8
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''run Picard:CollectInsertSizeMetrics

    Collect insert size statistics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''

    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectInsertSizeMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run()
Example 9
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''gather BAM file alignment statistics using Picard '''

    job_options = getPicardOptions()
    job_threads = 3

    if getNumReadsFromBAMFile(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # Picard seems to have a problem if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitly.
    statement = '''cat %(infile)s
    | python %(scriptsdir)s/bam2bam.py -v 0
    --method=set-sequence --output-sam
    | CollectMultipleMetrics
    INPUT=/dev/stdin
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run()
Example 10
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''run Picard:CollectInsertSizeMetrics

    Collect insert size statistics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''

    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''CollectInsertSizeMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run()
Example 12
def buildPicardDuplicateStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard and keep the deduplicated
    .bam file.
    Pair duplication is properly handled, including inter-chromosomal
    cases. SE data is also handled.  These stats also contain a
    histogram that estimates the return from additional sequencing.
    No marked bam files are retained (/dev/null...).  Note that Picard
    counts reads but they are in fact alignments.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s MarkDuplicates
    INPUT=%(infile)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s.duplicate_metrics
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT;
    '''
    statement += '''samtools index %(outfile)s ;'''
    P.run()
Example 13
def buildPicardGCStats(infile, outfile, genome_file):
    """picard:CollectGCBiasMetrics

    Collect GC bias metrics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.

    """

    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''CollectGcBiasMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    CHART_OUTPUT=%(outfile)s.pdf
    SUMMARY_OUTPUT=%(outfile)s.summary
    >& %(outfile)s'''

    P.run()
Example 14
def getCpGIslandsFromUCSC(dbhandle, outfile):
    '''get CpG islands from UCSC database and save as a :term:`bed`
    formatted file.

    The name column in the bed file will be set to the UCSC name.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`bed` format.
    '''

    cc = dbhandle.cursor()
    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name
    FROM %(table)s ORDER by chrom, chromStart"""
    sql = sql % locals()

    E.debug("executing sql statement: %s" % sql)
    try:
        cc.execute(sql)
        outf = IOTools.openFile(outfile, "w")
        for data in cc.fetchall():
            outf.write("\t".join(map(str, data)) + "\n")
        outf.close()
    except Exception:
        E.warn("failed to fetch table %s - %s will be empty" % (table, outfile))
        P.touch(outfile)
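The dbhandle argument can be any DB-API style connection to a UCSC MySQL server. A minimal sketch using pymysql against UCSC's public mirror; the host and user are UCSC's documented public credentials, while the hg38 assembly is chosen purely for illustration.

import pymysql

# read-only public UCSC server; no password required
dbhandle = pymysql.connect(host="genome-mysql.soe.ucsc.edu",
                           user="genome",
                           database="hg38")
getCpGIslandsFromUCSC(dbhandle, "cpg_islands.bed")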
Example 16
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.getTempDir(".")
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
Example 17
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # variables required for statement built by FastqScreen()
    tempdir = P.getTempDir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # configure job_threads with fastq_screen_options from PARAMS
    job_threads = re.findall(r'--threads \d+', PARAMS['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))
    job_memory = "8G"

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.ini
    with IOTools.openFile(os.path.join(tempdir, "fastq_screen.conf"),
                          "w") as f:
        for i, k in list(PARAMS.items()):
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen()
    statement = m.build((infiles,), outfile)
    P.run()
    shutil.rmtree(tempdir)
    P.touch(outfile)
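To make the config loop above concrete: the slice i[22:] strips the 22-character prefix "fastq_screen_database_", so each matching PARAMS entry becomes one DATABASE line. A small sketch with hypothetical values:

# hypothetical PARAMS entries for illustration
params_example = {
    "fastq_screen_database_human": "/databases/human",
    "fastq_screen_database_mouse": "/databases/mouse",
}
for key, path in params_example.items():
    if key.startswith("fastq_screen_database"):
        print("DATABASE\t%s\t%s" % (key[22:], path))
# prints: DATABASE<tab>human<tab>/databases/human (and the mouse line)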
Example 20
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''run picard:CollectMultipleMetrics
    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''

    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectMultipleMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run()
Example 21
def buildPicardCoverageStats(infile, outfile, baits, regions):
    '''run picard:CollectHsMetrics
    Generate coverage statistics for regions of interest from a bed
    file using Picard.
    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    baits : :term:`bed` formatted file of bait regions
    regions : :term:`bed` formatted file of target regions
    '''

    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectHsMetrics
    BAIT_INTERVALS=%(baits)s
    TARGET_INTERVALS=%(regions)s
    INPUT=%(infile)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=LENIENT''' % locals()
    P.run()
Example 22
def buildPicardCoverageStats(infile, outfile, baits, regions):
    '''run picard:CalculateHSMetrics

    Generate coverage statistics for regions of interest from a bed
    file using Picard.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    baits : :term:`bed` formatted file of bait regions
    regions : :term:`bed` formatted file of target regions

    '''

    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CalculateHsMetrics
    BAIT_INTERVALS=%(baits)s
    TARGET_INTERVALS=%(regions)s
    INPUT=%(infile)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=LENIENT''' % locals()
    P.run()
Example 24
def calculateM3DSpikeClustersPvalue(infiles, outfile):
    job_options = "-l mem_free=4G -pe dedicated 1"
    design = infiles[-1]
    infiles = infiles[:-1]
    RRBS.calculateM3DSpikepvalue(infiles, outfile, design,
                                 submit=True, job_options=job_options)
    P.touch(outfile)
Example 25
def makeSummaryPlots(infile, outfile):

    job_options = "-l mem_free=48G"

    RRBS.summaryPlots(infile, outfile,
                      submit=True, job_options=job_options)
    P.touch(outfile)
Example 26
def buildPicardGCStats(infile, outfile, genome_file):
    """picard:CollectGCBiasMetrics
    Collect GC bias metrics.
    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    """

    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectGcBiasMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    CHART_OUTPUT=%(outfile)s.pdf
    SUMMARY_OUTPUT=%(outfile)s.summary
    >& %(outfile)s'''

    P.run()
Example 28
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
       to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substituteParameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.asList(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
Example 30
def mergeMeanTables(infiles, outfile):
    '''
    Collate and merge all separate tables into a single
    large table for all MZ and DZ twins
    '''

    job_memory = "300G"
    panel = outfile.split("/")[-1].split("-")[1]
    cell_type = outfile.split("/")[-1].split("mean_")[-1]
    cell_type = P.snip(cell_type, ".tsv")
    table_name = "_".join([cell_type, "mean"])
    out_dir = "/".join(outfile.split("/")[:-1])
    
    twin_id = "twin.id"

    statement = '''
    python /ifs/devel/projects/proj052/flow_pipeline/scripts/flow2twins.py
    --task=merge_flow
    --twin-id-column=%(twin_id)s
    --demographics-file=%(twins_demographics)s
    --demo-id-column=%(twins_demo_header)s
    --database=%(database)s
    --tablename=%(table_name)s
    --filter-gates="(F|S)SC-(A|H)"
    --filter-zero-arrays    
    --log=%(outfile)s.log
    --output-directory=%(out_dir)s
    --output-file-pattern=%(table_name)s
    '''

    P.run()
    P.touch(outfile)
Example 31
        def splitFiles(infile, outfile):
            """
            Arbitrarily split files into chunks for parallelisation
            """

            Timeseries.splitFiles(infile=infile,
                                  nchunks=PARAMS["resampling_chunks"],
                                  out_dir="parallel_files.dir")
            P.touch(outfile)
Example 32
def getCpGIslandsFromUCSC(dbhandle, outfile):
    '''get CpG islands from UCSC database and save as a :term:`bed`
    formatted file.

    The name column in the bed file will be set to the UCSC name.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`bed` format.
    '''

    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name
    FROM %(table)s ORDER by chrom, chromStart"""
    sql = sql % locals()

    E.debug("executing sql statement: %s" % sql)
    try:
        cc = dbhandle.execute(sql)
        outf = IOTools.openFile(outfile, "w")
        for data in cc.fetchall():
            outf.write("\t".join(map(str, data)) + "\n")
        outf.close()
    except Exception:
        E.warn("failed to fetch table %s - %s will be empty" % (table, outfile))
        P.touch(outfile)
Example 33
def runFIMO(motifs, database, outfile, exportdir, options={}):
    '''run FIMO to look for occurrences of motifs supplied in a sequence database.

    :param:`motifs` is the path to a MEME formatted motif file.
    :param:`database` is a fasta file.
    :param:`outfile` is the text output from fimo.
    :param:`exportdir` specifies the directory to put exported files (html, gff).
    :param:`options` is a dictionary: {'option': 'value'} will be passed as
                     --option=value and will overwrite options specified in
                     PARAMS.'''


    # if the motifs file is empty, then fimo will return an error;
    # this isn't very useful behaviour.
    inlines = IOTools.openFile(motifs).read()
    if not re.search("MOTIF", inlines):
        E.warn("No motifs found in %s" % motifs)
        P.touch(outfile)
        return
    else:
        E.debug("%s: %i motifs found" %
                (motifs, len(re.findall("MOTIF", inlines))))

    fimo_options = PARAMS.get("fimo_options", "")
    for option, value in options.items():
        fimo_options = re.sub("%s=\S+" % option, "", fimo_options)
        if value is None:
            fimo_options += " --%s" % option
        else:
            fimo_options += " --%s=%s" % (option, value)

    tmpout = P.getTempFilename()

    track = os.path.basename(outfile)
    exportdir = os.path.abspath(exportdir)

    xmlout = P.snip(outfile, ".txt") + ".xml"
    logfile = P.snip(outfile, ".txt") + ".log"
    gffout = os.path.join(exportdir, track + ".gff")
    htmlout = os.path.join(exportdir, track + ".html")
    statement = ''' fimo --oc %(tmpout)s
                         %(fimo_options)s
                         %(motifs)s
                         %(database)s &> %(logfile)s;
                     checkpoint;
                     mv %(tmpout)s/fimo.txt %(outfile)s;
                     checkpoint;
                     mv %(tmpout)s/fimo.xml %(xmlout)s;
                     checkpoint;
                     mv %(tmpout)s/fimo.gff %(gffout)s;
                     checkpoint;
                     mv %(tmpout)s/fimo.html %(htmlout)s;
                     checkpoint;
                     rm -r %(tmpout)s '''

    P.run()
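A hypothetical call to runFIMO overriding a single FIMO option; all file names are illustrative, and each entry of the options dict is rendered as --key=value by the loop above.

runFIMO(motifs="discovered.meme",
        database="genome.fasta",
        outfile="fimo.txt",
        exportdir="export",
        options={"thresh": "1e-5"})  # passed through as --thresh=1e-5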
Example 34
def joint_index_dexseq(infile, outfile):

    db = connect()
    db.executescript('''
             DROP INDEX IF EXISTS dexseq_results_joint;
             CREATE INDEX dexseq_results_joint
                    ON dexseq_results(groupID,featureID);''')
    P.touch(outfile)
Example 35
def subsetSequenceData(infile, outfile):
    """subset fastq files"""
    ignore_pipe_errors = True
    ignore_errors = True
    m = PipelineMapping.SubsetHead(limit=PARAMS["sample_size"])
    statement = m.build((infile,), outfile)
    P.run()
    P.touch(outfile)
Example 36
def reMergeBamfiles(infiles, sentinel):
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinel)
Example 38
def runGOFromDatabase(outfile,
                      outdir,
                      statement_fg,
                      statement_bg,
                      go_file,
                      ontology_file=None,
                      samples=1000):
    """check for GO enrichment.

    Gene lists are extracted from a database.
    This method is a wrapper for `runGO.py`.

    Arguments
    ---------
    outfile : string
        Output filename
    outdir : string
        Output directory for auxiliary files
    statement_fg : string
        SQL statement to select genes of foreground set.
    statement_bg : string
        SQL statement to select genes in background set.
    go_file : string
        Filename with Gene-to-GO assignments
    ontology_file : string
        Filename with ontology information.
    samples : int
        Number of samples for empirical FDR. If not given, use
        BH FDR.
    """

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    cc = dbhandle.cursor()
    fg = set([x[0] for x in cc.execute(statement_fg).fetchall()])
    bg = set([x[0] for x in cc.execute(statement_bg).fetchall()])

    if len(fg) == 0:
        P.touch(outfile)
        return

    fg_file = os.path.join(outdir, "foreground")
    bg_file = os.path.join(outdir, "background")
    outf = open(fg_file, "w")
    outf.write("\n".join(map(str, fg)) + "\n")
    outf.close()
    outf = open(bg_file, "w")
    outf.write("\n".join(map(str, bg)) + "\n")
    outf.close()

    runGOFromFiles(outfile,
                   outdir,
                   fg_file,
                   bg_file,
                   go_file,
                   ontology_file=ontology_file,
                   samples=samples)
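A sketch of how runGOFromDatabase might be invoked; the SQL statements, table and file names below are assumptions for illustration, not taken from the source.

runGOFromDatabase(
    outfile="go_enrichment.tsv",
    outdir="go.dir",
    statement_fg="SELECT gene_id FROM deg WHERE padj < 0.05",
    statement_bg="SELECT gene_id FROM deg",
    go_file="gene2go.tsv")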
Example 40
def poolSampleBamfiles(infiles, sentinel):
    """
    Merge filtered sample files for each tissue
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    IDR.mergeBams(infiles, outfile)

    P.touch(sentinel)
Example 43
def callPeaksOnIndividualReplicates(infile, outfile):
    infile = P.snip(infile, ".sentinel") + ".bam"
    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks
    IDR.callIDRPeaks(infile, outfile, PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"], PARAMS_PEAKCALLER)

    P.touch(outfile)
Example 44
    def genReplicateData(infile, outfile):
        """
        Split each replicate into a separate file for clustering
        within each replicate.  Relies on each replicate being the
        same across the whole time series.
        """

        outdir = outfile.split("/")[0]
        Timeseries.splitReplicates(infile=infile,
                                   axis="column",
                                   group_var="replicates",
                                   outdir=outdir)

        P.touch(outfile)
Example 45
def splitPooledBamfiles(infile, sentinel):
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
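    # P.submit() expects the module path without its extension; P.snip()
    # raises ValueError when the suffix does not match, hence the fallback
    # to stripping ".pyc" for byte-compiled modules.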
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)

    P.touch(sentinel)
Example 46
def plotDETagStats(infiles, outfile):
    '''plot differential expression stats'''

    infile, composition_file = infiles
    Expression.plotDETagStats(
        infile, outfile,
        additional_file=composition_file,
        join_columns=("contig", "start", "end"),
        additional_columns=("CpG_density",
                            "length"))

    P.touch(outfile)
Example 47
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=["compensation", "parse_gating"],
                      help="select method to perform on workspace "
                      "file.")

    parser.add_option("--gating-directory",
                      dest="gate_dir",
                      type="string",
                      help="directory to store gating dummy files")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]

    if options.method == "compensation":
        split_file = infile.split("/")
        infile = split_file[-1]
        split_file.remove(infile)
        path = "/".join(split_file)
        out_df = P52.get_compensation_matrix(path=path, infile=infile)
        out_df.to_csv(options.stdout, sep="\t")

    elif options.method == "parse_gating":
        for dfile in P52.parse_gating_file(infile):
            outfile = options.gate_dir + "/" + dfile
            P.touch(outfile)
    else:
        pass

    # write footer and output benchmark information.
    E.Stop()
Example 48
def buildPicardDuplicationStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard, the marked records
    are discarded.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''

    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # currently, MarkDuplicates cannot handle split alignments from gsnap
    # these can be identified by the custom XT tag.
    if ".gsnap.bam" in infile:
        tmpf = P.getTempFile(".")
        tmpfile_name = tmpf.name
        statement = '''samtools view -h %(infile)s
        | awk "!/\\tXT:/"
        | samtools view /dev/stdin -S -b > %(tmpfile_name)s;
        ''' % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    os.environ["CGAT_JAVA_OPTS"] = "-Xmx%s -XX:+UseParNewGC\
                                    -XX:+UseConcMarkSweepGC" % (PICARD_MEMORY)

    statement += '''MarkDuplicates
    INPUT=%(data_source)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s
    OUTPUT=/dev/null
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run()

    os.unsetenv("CGAT_JAVA_OPTS")

    if ".gsnap.bam" in infile:
        os.unlink(tmpfile_name)
Example 50
def plotPathwayGenes(infile, outfile):
    '''
    plot the genes that are differentially expressed
    and fall into pathways
    '''
    # R will not be able to plot anything if none of the
    # differentially expressed genes are associated
    # with a pathway. plot nothing if this is the case

    # the colour of each pathway should associate with the
    # track that it comes from

    # because the plots can get unwieldy with large
    # gene sets, if there are more than 10 genes
    # associated with a pathway then take the top 10.
    # This should be explained in the documentation

    col = random.sample(range(1, 600, 1), 1)[0]
    track = os.path.basename(infile).replace(".genes", "")

    if len(open(infile).readlines()) == 1:
        R('''pdf("%s")
             plot(c(0,1,2,3,4), c(0,1,2,3,4), cex = 0)
             text(2, y = 2, labels = "No genes were associated with pathways", cex = 1)
          ''' % outfile.replace(".plots", ".pdf"))
        P.touch(outfile)
    else:
        # NB. size of plot should be proportional to the
        # number of genes in the pathways
        R('''
          library("ggplot2")
          dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")
          pathways <- unique(dat$pathway)
          for (p in pathways){
              toPlot <- aggregate(l2fold~gene, dat[dat$pathway == p,], mean)
              if (regexpr("/", p)[1] != -1){
                  # "/" in name not compatible with outfile names
                  p <- sub("/", "|", p)}
              outf <- paste(paste("pathways.dir/", paste("%s", p, sep = "."), sep = ""), "genes.pdf", sep = ".")
              cols <- col2rgb(%i)
              col <- rgb(cols[1], cols[2], cols[3], maxColorValue = 255)
              toPlot$col <- col
              if (nrow(toPlot) > 10){
                  toPlot <- toPlot[order(abs(toPlot$l2fold), decreasing = T),][1:10,]}
              plot1 <- ggplot(toPlot, aes(x = gene, y = l2fold, fill = col, stat = "identity")) + geom_bar(stat = "identity") + coord_flip() + scale_fill_manual(values = toPlot$col)
              plot1 + ggtitle(p) + theme(text = element_text(size = 40, color = "black"), axis.text = element_text(colour = "Black"))
              ggsave(file = outf, width = 11, height = nrow(toPlot), limitsize = F)
          }
        ''' % (infile, track, col))
        P.touch(outfile)
Example 51
def callPeaksOnPooledReplicates(infile, outfile):
    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks on pseudoreplicates
    IDR.callIDRPeaks(infile,
                     outfile,
                     PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"],
                     PARAMS_PEAKCALLER,
                     pseudoreplicate=False)

    P.touch(outfile)
Example 52
def loadFastqc(infile, outfile):
    '''load FASTQC stats into database.'''
    track = P.snip(infile, ".fastqc")
    filename = os.path.join(
        PARAMS["exportdir"], "fastqc", track + "*_fastqc", "fastqc_data.txt")

    PipelineReadqc.loadFastqc(filename,
                              backend=PARAMS["database_backend"],
                              database=PARAMS["database_name"],
                              host=PARAMS["database_host"],
                              username=PARAMS["database_username"],
                              password=PARAMS["database_password"],
                              port=PARAMS["database_port"])
    P.touch(outfile)
Example 54
def loadFastqc(infile, outfile):
    '''load FASTQC stats into database.'''
    track = P.snip(infile.replace("processed.dir/", ""), ".fastqc")
    filename = os.path.join(
        PARAMS["exportdir"], "fastqc", track + "*_fastqc", "fastqc_data.txt")

    PipelineReadqc.loadFastqc(filename,
                              backend=PARAMS["database_backend"],
                              database=PARAMS["database_name"],
                              host=PARAMS["database_host"],
                              username=PARAMS["database_username"],
                              password=PARAMS["database_password"],
                              port=PARAMS["database_port"])
    P.touch(outfile)
Example 57
def loadMATS(infile, outfile):
    '''load rMATS results into a relational database.

    Loads rMATS results into a relational database, touching an
    empty output file instead if the load fails (e.g. the table
    is empty).

    Parameters
    ----------
    infile: :term:`tsv` file containing one type of rMATS results.
    outfile: .load file
    '''
    try:
        P.load(infile, outfile)
    except Exception:
        P.touch(outfile)
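In the CGAT pipelines a loader like this is typically wired up as a ruffus task. A hypothetical example of such wiring; the decorator arguments and file suffixes are illustrative, not from the source.

from ruffus import transform, suffix


@transform("*.mats.tsv", suffix(".mats.tsv"), ".load")
def loadAllMATS(infile, outfile):
    # delegate to the loader above; it touches an empty
    # sentinel file if the load fails
    loadMATS(infile, outfile)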