def assembleWithStringTie(infiles, outfile):
    '''filter spliced alignments with portcullis, then assemble
    transcripts from the filtered bam with stringtie.'''

    infile, reference = infiles
    basefile = os.path.basename(infile)
    job_threads = PARAMS["stringtie_threads"]
    job_memory = PARAMS["stringtie_memory"]
    tmpfile = P.get_temp_filename()
    if os.path.exists(tmpfile):
        os.unlink(tmpfile)

    statement = '''
                    portcullis full 
                            -t 1
                            -o portcullis/%(basefile)s/
                            -r %(portcullis_bedref)s
                            -b 
                            %(portcullis_fastaref)s
                            %(infile)s &&
                    mv portcullis/%(basefile)s/portcullis.filtered.bam %(tmpfile)s &&
                    rm -r portcullis/%(basefile)s/ &&
                    stringtie %(tmpfile)s
                           -p %(stringtie_threads)s
                           -G <(zcat %(reference)s)
                           %(stringtie_options)s
                           2> %(outfile)s.log
                   | gzip > %(outfile)s &&
                   rm %(tmpfile)s'''

    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
        if len(token) > 0:
            token = token[0]
        else:
            token = None

        s, infile = Sra.process_remote_BAM(
            infile,
            token,
            tmpfilename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        infile = " ".join(infile)
        statement = "; ".join([
            "mkdir -p %(tmpfilename)s", s, statement, "rm -r %(tmpfilename)s"
        ])

    P.run(statement)
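
# The statements above rely on cgatcore's deferred interpolation: P.run()
# fills the %(name)s placeholders, drawing (roughly) on the caller's local
# variables and the global PARAMS dictionary. A minimal stand-alone sketch
# of that mechanism, with hypothetical values:
def interpolate(statement, local_vars, params):
    lookup = dict(params)      # pipeline-wide parameters ...
    lookup.update(local_vars)  # ... overridden by local variables
    return statement % lookup

print(interpolate("stringtie %(tmpfile)s -p %(stringtie_threads)s",
                  {"tmpfile": "/tmp/ctmp1234"},
                  {"stringtie_threads": 4}))
# stringtie /tmp/ctmp1234 -p 4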
Example #2
def runBioProspector(infiles, outfile, dbhandle):
    '''run BioProspector for motif discovery.

    BioProspector is run on only the top proportion of peaks, as set
    by ``bioprospector_proportion``.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.get_temp_filename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=P.get_params()["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s
        -o %(outfile)s > %(outfile)s.log
        '''
        P.run(statement)

    os.unlink(tmpfasta)
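
# Note that the os.unlink() above is skipped if P.run() raises, leaking
# tmpfasta. A self-contained sketch of the safer try/finally pattern
# (standard library only, not part of the original pipeline):
import os
import tempfile

def with_cleanup(work):
    fd, path = tempfile.mkstemp()
    os.close(fd)
    try:
        return work(path)   # may raise; the unlink below still runs
    finally:
        os.unlink(path)

print(with_cleanup(os.path.exists))  # True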
Example #3
def quantifyWithSalmon(infiles, outfile):
    '''Quantify existing samples against genesets'''
    job_threads = 2
    job_memory = "16G"

    infile, gtffile = infiles
    basefile = os.path.basename(infile)
    sample_name = basefile.split(os.extsep, 1)
    sorted_bam = "sorted_bams/" + sample_name[0] + "_sorted.bam"
    gtfbase = P.snip(os.path.basename(gtffile), ".gz")
    salmonIndex = "salmon_index/" + gtfbase + ".salmon.index"
    fastq1 = P.snip(outfile, "_agg-agg-agg") + ".1.fastq"
    fastq2 = P.snip(outfile, "_agg-agg-agg") + ".2.fastq"
    salmon_options = PARAMS["salmon_quantoptions"]

    statement = '''
    samtools sort -n %(infile)s -o %(sorted_bam)s;
    samtools fastq
         -1 %(fastq1)s
         -2 %(fastq2)s
         -0 /dev/null -s /dev/null -n -F 0x900
         %(sorted_bam)s; 
    salmon quant -i %(salmonIndex)s
        --libType IU
        -1 %(fastq1)s
        -2 %(fastq2)s
        -o %(outfile)s
        %(salmon_options)s; 
    mv %(outfile)s/quant.sf %(outfile)s.sf; 
    rm %(fastq1)s; rm %(fastq2)s; rm %(sorted_bam)s 
    '''

    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")
        filename = "temp_bams/%s" % basefile
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)

        if len(token) > 0:
            token = token[0]
        else:
            token = None

        s, infile = Sra.process_remote_BAM(
            infile,
            token,
            filename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        infile = " ".join(infile)
        statement = "; ".join(
            ["mkdir %(filename)s", s, statement, "rm -r %(filename)s"])

    P.run(statement)
Example #4
    def download(self, genes=None, fields=None, scope=None, species=None):
        '''
        download an up-to-date ontology file, parse the xml data into a
        Python "ElementTree" and delete the ontology file.
        '''
        ontologyfile = P.get_temp_filename(".")
        os.system("wget -O %s %s" % (ontologyfile, self.datasource))
        tree = ET.parse(ontologyfile)
        os.remove(ontologyfile)
        self.dataset = tree
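
# os.system() plus wget ignores download failures and requires an external
# binary. A dependency-free sketch of the same download-and-parse step,
# using only the standard library (the URL is a placeholder):
import urllib.request
import xml.etree.ElementTree as ET

def download_ontology(url):
    # parse straight from the HTTP response; no temporary file needed
    with urllib.request.urlopen(url) as response:
        return ET.parse(response)

# tree = download_ontology("https://example.org/ontology.xml")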
Example #5
def runFIMO(motifs, database, outfile, exportdir, options={}):
    '''run fimo to look for occurrences of motifs supplied in a sequence
    database.

    :param:`motifs` is the path to a MEME-formatted motif file.
    :param:`database` is a fasta file.
    :param:`outfile` is the text output from fimo.
    :param:`exportdir` specifies the directory for exported files (html, gff).
    :param:`options` is a dictionary: {'option': 'value'} will be passed as
        --option=value and will overwrite options specified in PARAMS.'''

    # if the motifs file is empty, then fimo will return an error;
    # this isn't very useful behavior.

    inlines = IOTools.open_file(motifs).read()
    if not re.search("MOTIF", inlines):
        E.warn("No motifs found in %s" % motifs)
        P.touch(outfile)
        return
    else:
        E.debug("%s: %i motifs found" %
                (motifs, len(re.findall("MOTIF", inlines))))

    fimo_options = PARAMS.get("fimo_options", "")
    for option, value in options.items():
        fimo_options = re.sub(r"%s=\S+" % option, "", fimo_options)
        if value is None:
            fimo_options += " --%s" % option
        else:
            fimo_options += " --%s=%s" % (option, value)

    tmpout = P.get_temp_filename()

    track = os.path.basename(outfile)
    exportdir = os.path.abspath(exportdir)

    xmlout = P.snip(outfile, ".txt") + ".xml"
    logfile = P.snip(outfile, ".txt") + ".log"
    gffout = os.path.join(exportdir, track + ".gff")
    htmlout = os.path.join(exportdir, track + ".html")

    statement = ''' fimo --oc %(tmpout)s
                         %(fimo_options)s
                         %(motifs)s
                         %(database)s &> %(logfile)s;
                     mv %(tmpout)s/fimo.txt %(outfile)s;
                     mv %(tmpout)s/fimo.xml %(xmlout)s;
                     mv %(tmpout)s/fimo.gff %(gffout)s;
                     mv %(tmpout)s/fimo.html %(htmlout)s;
                     rm -r %(tmpout)s '''

    P.run(statement)
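
# The options loop above implements a small dict-to-flags convention:
# a value of None becomes a bare flag, anything else becomes --key=value.
# The same rule in isolation:
opts = {"thresh": "1e-4", "norc": None}
flags = " ".join(
    "--%s" % k if v is None else "--%s=%s" % (k, v)
    for k, v in opts.items())
print(flags)  # --thresh=1e-4 --norc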
Example #6
def buildRefFlat(infile, outfile):
    '''build flat geneset for Picard RnaSeqMetrics.
    '''

    tmpflat = P.get_temp_filename(".")

    statement = '''
    gtfToGenePred -genePredExt -geneNameAsName2 %(infile)s %(tmpflat)s;
    paste <(cut -f 12 %(tmpflat)s) <(cut -f 1-10 %(tmpflat)s)
    > %(outfile)s
    '''
    P.run(statement, job_memory=PARAMS["job_memory"])
    os.unlink(tmpflat)
Example #7
def loadManualAnnotations(infile, outfile):
    '''load manual annotations into the database, prefixing each line
    with the annotation name derived from the filename and adding an
    index on gene_id.'''

    tmp = P.get_temp_filename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with IOTools.open_file(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with IOTools.open_file(infile, "r") as inf:
            for line in inf:
                outf.write("%s\t%s" % (annotation, line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
Example #8
def aggregateAdaptors(infiles, outfile):
    '''
    Collate fasta files into a single contaminants file for
    adapter removal.
    '''
    tempfile = P.get_temp_filename()
    infiles = " ".join(infiles)

    statement = """
    cat %(infiles)s | fastx_reverse_complement > %(tempfile)s;
    cat %(tempfile)s %(infiles)s | fastx_collapser > %(outfile)s;
    rm -f %(tempfile)s
    """
    P.run(statement)
Example #9
    def test_job_should_write_to_explicit_temp_and_not_clean_up(self):

        outfile = os.path.join(self.work_dir, "out")
        tmpfile = P.get_temp_filename(clear=True)
        P.run("hostname > {outfile}; "
              "echo {tmpfile} > {tmpfile}; "
              "cat {tmpfile} >> {outfile}".format(outfile=outfile,
                                                  tmpfile=tmpfile),
              to_cluster=False)

        with IOTools.open_file(outfile) as outf:
            hostname = outf.readline().strip()
            tmpfile_read = outf.readline().strip()

        self.assertEqual(tmpfile, tmpfile_read)
        self.assertTrue(self.file_exists(tmpfile, hostname, expect=True))
        os.unlink(tmpfile)
Example #10
def makeCytoscapeInputs(infiles, outfile):
    infile = infiles[1]
    T = P.get_temp_filename(".")
    statement = """
    awk -F "\\t" '{printf("%%%%s\\t%%%%s\\t%%%%s\\t%%%%s\\t+1\\n",\
    $1, $12, $8, $9)}' %(infile)s > %(T)s""" % locals()
    P.run(statement)
    typ = infile.split("_")[-3]
    keep = [
        line.strip() for line in IOTools.open_file(PARAMS['cytoscape_%s' %
                                                          typ]).readlines()
    ]
    tab = pd.read_csv(T, sep="\t")
    tab = tab[tab['term_id'].isin(keep)]
    tab.columns = ['ID', 'Description', 'pvalue', 'padj', 'Phenotype']
    tab.to_csv(outfile, sep="\t", index=None)
    os.remove(T)
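
# The awk template above passes through two rounds of %-interpolation:
# once via `% locals()` here and once inside P.run(), so a literal awk
# "%s" must be spelled "%%%%s". The same escaping on a stripped-down
# template (hypothetical file name):
template = 'awk \'{printf("%%%%s\\t%%%%s\\n", $1, $12)}\' %(infile)s'
stage1 = template % {"infile": "enrichment.tsv"}  # the `% locals()` round
stage2 = stage1 % {}                              # the P.run() round
print(stage2)  # awk '{printf("%s\t%s\n", $1, $12)}' enrichment.tsv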
Example #11
def genericImportAnnotator(infiles, outfile, table, workspace, slice, subset,
                           fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.
    '''

    infile = " ".join(infiles)
    x, suffix = os.path.splitext(infiles[0])

    tmpfilename = P.get_temp_filename()

    statement = '''
    cgat annotator2tsv
    --method=fdr-table
    --fdr-method=%(fdr_method)s
    --log=%(outfile)s.log
    --regex-identifier="(.*)%(suffix)s"
    %(infile)s > %(tmpfilename)s
    '''
    P.run(statement)

    tmpfile = P.get_temp_file()

    for line in open(tmpfilename, "r"):
        if line.startswith("id"):
            line = "subset\tworkspace\tslice\t" + re.sub("^id", "track", line)
        else:
            line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
        tmpfile.write(line)
    tmpfile.close()
    tmpfilename2 = tmpfile.name

    statement = '''
    cgat csv2db %(csv2db_options)s
    --table=%(table)s
    < %(tmpfilename2)s > %(outfile)s'''

    P.run(**dict(list(locals().items()) + list(P.get_params().items())))
    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)
Example #12
def makeSalmonIndex(infile, outfile):
    # Long transcripts cause indexing to use lots of memory?
    job_memory = "64G"
    job_threads = 1

    gtf_basename = P.snip(os.path.basename(infile), ".gtf.gz")
    transcript_fasta = "salmon_index/" + gtf_basename + ".transcripts.fa"
    fastaref = PARAMS["portcullis_fastaref"]
    index_options = PARAMS["salmon_indexoptions"]
    tmpfile = P.get_temp_filename()

    statement = '''
    gunzip -c %(infile)s > %(tmpfile)s;
    gffread %(tmpfile)s -g %(fastaref)s -w %(transcript_fasta)s;
    salmon index
      -p %(job_threads)s
      %(index_options)s
      -t %(transcript_fasta)s
      -i %(outfile)s
      --perfectHash;
    rm %(tmpfile)s
    '''
    P.run(statement)
Example #13
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.

    '''

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.is_empty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    for motiffile in motiffiles:
        if IOTools.is_empty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

        of = IOTools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run(statement)

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
Example #14
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within
    a file are merged) before intersection.

    Intervals are renumbered starting from 1.
    '''
    to_cluster = True

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if IOTools.is_empty(infiles[0]) or IOTools.is_empty(infiles[1]):
            IOTools.touch_file(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run(statement)

    else:

        tmpfile = P.get_temp_filename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.is_empty(infiles[0]):
            IOTools.touch_file(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run(statement)

        for fn in infiles[1:]:
            if IOTools.is_empty(fn):
                IOTools.touch_file(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run(statement)

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
        P.run(statement)

        os.unlink(tmpfile)
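
# The incremental mergeBed/intersectBed loop above is the interval
# equivalent of folding pairwise set intersection, keeping only elements
# present in every input:
sets = [{"a", "b", "c"}, {"b", "c", "d"}, {"b", "c"}]
result = sets[0]
for s in sets[1:]:
    result &= s
print(sorted(result))  # ['b', 'c']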
Example #15
def buildSpikeResults(infile, outfile):
    '''build matrices with results from spike-in and upload
    into database.

    The method will output several files:

    .spiked.gz: Number of intervals that have been spiked-in
        for each bin of expression and fold-change

    .power.gz: Global power analysis - aggregates over all
        ranges of fold-change and expression and outputs the
        power, the proportion of intervals overall that
        could be detected as differentially methylated.

        This is a table with the following columns:

        fdr - fdr threshold
        power - power level, number of intervals detectable
        intervals - number of intervals in observed data at given
                    level of fdr and power.
        intervals_percent - percentage of intervals in observed data
              at given level of fdr and power

    The method will also upload the results into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`tsv` format. Usually the output of
        :mod:`scripts/runExpression`.
    outfile : string
        Output filename in :term:`tsv` format.

    '''

    expression_nbins = 10
    fold_nbins = 10

    spikefile = P.snip(infile, '.tsv.gz') + '.spike.gz'

    if not os.path.exists(spikefile):
        E.warn('no spike data: %s' % spikefile)
        IOTools.touch_file(outfile)
        return

    ########################################
    # output and load spiked results
    tmpfile_name = P.get_temp_filename(shared=True)

    statement = '''zcat %(spikefile)s
    | grep -e "^spike" -e "^test_id"
    > %(tmpfile_name)s
    '''
    P.run(statement)

    E.debug("outputting spiked counts")
    (spiked, spiked_d2hist_counts, xedges, yedges,
     spiked_l10average, spiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".spiked.gz",
            infile_name=tmpfile_name,
            expression_nbins=expression_nbins,
            fold_nbins=fold_nbins)

    ########################################
    # output and load unspiked results
    statement = '''zcat %(infile)s
    | grep -v -e "^spike"
    > %(tmpfile_name)s
    '''
    P.run(statement)
    E.debug("outputting unspiked counts")

    (unspiked, unspiked_d2hist_counts, unspiked_xedges,
     unspiked_yedges, unspiked_l10average, unspiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".unspiked.gz",
            infile_name=tmpfile_name,
            expression_bins=xedges,
            fold_bins=yedges)

    E.debug("computing power")

    assert (xedges == unspiked_xedges).all()

    tmpfile = IOTools.open_file(tmpfile_name, "w")
    tmpfile.write("\t".join(("expression", "fold", "fdr", "counts",
                             "percent")) + "\n")

    fdr_thresholds = [0.01, 0.05] + list(numpy.arange(0.1, 1.0, 0.1))
    power_thresholds = numpy.arange(0.1, 1.1, 0.1)

    spiked_total = float(spiked_d2hist_counts.sum().sum())
    unspiked_total = float(unspiked_d2hist_counts.sum().sum())

    outf = IOTools.open_file(outfile, "w")
    outf.write("fdr\tpower\tintervals\tintervals_percent\n")

    # significant results
    for fdr in fdr_thresholds:
        take = spiked['qvalue'] < fdr

        # compute 2D histogram in spiked data below fdr threshold
        spiked_d2hist_fdr, xedges, yedges = \
            numpy.histogram2d(spiked_l10average[take],
                              spiked_l2fold[take],
                              bins=(xedges, yedges))

        # convert to percentage of spike-ins per bin
        spiked_d2hist_fdr_normed = spiked_d2hist_fdr / spiked_d2hist_counts
        spiked_d2hist_fdr_normed = numpy.nan_to_num(spiked_d2hist_fdr_normed)

        # set values without data to -1
        spiked_d2hist_fdr_normed[spiked_d2hist_counts == 0] = -1.0

        # output to table for database upload
        for x, y in itertools.product(list(range(len(xedges) - 1)),
                                      list(range(len(yedges) - 1))):
            tmpfile.write("\t".join(
                map(str, (xedges[x], yedges[y], fdr, spiked_d2hist_fdr[x, y],
                          100.0 * spiked_d2hist_fdr_normed[x, y]))) + "\n")

        # take elements in spiked_hist_fdr above a certain threshold
        for power in power_thresholds:
            # select 2D bins at a given power level
            power_take = spiked_d2hist_fdr_normed >= power

            # select the counts in the unspiked data according
            # to this level
            power_counts = unspiked_d2hist_counts[power_take]

            outf.write("\t".join(
                map(str, (fdr, power, power_counts.sum().sum(), 100.0 *
                          power_counts.sum().sum() / unspiked_total))) + "\n")

    tmpfile.close()
    outf.close()

    # upload into table
    method = P.snip(os.path.dirname(outfile), ".dir")
    tablename = P.to_table(
        P.snip(outfile, "power.gz") + method + ".spike.load")

    P.load(tmpfile_name,
           outfile + ".log",
           tablename=tablename,
           options="--add-index=fdr")

    os.unlink(tmpfile_name)
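
# Core of the power computation above on toy 2x2 bins (numpy assumed):
# fraction of spike-ins detected per bin, with empty bins flagged as -1.
import numpy
detected = numpy.array([[4., 0.], [2., 2.]])   # spike-ins below fdr per bin
spiked_in = numpy.array([[8., 0.], [4., 2.]])  # all spike-ins per bin
with numpy.errstate(divide="ignore", invalid="ignore"):
    normed = numpy.nan_to_num(detected / spiked_in)  # 0/0 -> 0
normed[spiked_in == 0] = -1.0                        # no data -> -1
print(normed)  # [[ 0.5 -1. ]
               #  [ 0.5  1. ]]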
Example #16
def convertReadsToIntervals(bamfile,
                            bedfile,
                            filtering_quality=None,
                            filtering_dedup=None,
                            filtering_dedup_method='picard',
                            filtering_nonunique=False):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for
    counting based methods.

    This method is not appropriate for RNA-Seq.

    Optional steps include:

    For paired end data, pairs are merged and optionally
    filtered by insert size.

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set, remove reads with a quality score below given threshold.
    filtering_dedup : bool
        If True, deduplicate data.
    filtering_dedup_method : string
        Deduplication method. Possible options are ``picard`` and
        ``samtools``.
    filtering_nonunique : bool
        If True, remove non-uniquely matching reads.

    '''
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.is_paired(bamfile)
    current_file = bamfile
    # request a fresh, non-existing path so the mkdir below cannot collide
    tmpdir = P.get_temp_filename(clear=True)
    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    if filtering_quality is not None and filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view
        -q %(filtering_quality)i -b
        %(current_file)s
        2>> %%(bedfile)s.quality.log
        > %(next_file)s ''' % locals())

        nfiles += 1
        current_file = next_file

    if filtering_nonunique:

        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''cat %(current_file)s
        | cgat bam2bam
        --method=filter
        --filter-method=unique,mapped
        --log=%%(bedfile)s.nonunique.log
        2> %%(bedfile)s.nonunique.err
        > %(next_file)s ''' % locals())

        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        if filtering_dedup_method == 'samtools':
            # samtools rmdup needs explicit input/output files here, since
            # the steps are chained through temporary bam files, not pipes
            statement.append('''samtools rmdup %(current_file)s %(next_file)s
            2>> %%(bedfile)s.markdup.log ''' % locals())

        elif filtering_dedup_method == 'picard':
            statement.append('''picard MarkDuplicates
            INPUT=%(current_file)s
            OUTPUT=%(next_file)s
            ASSUME_SORTED=TRUE
            METRICS_FILE=%(bedfile)s.duplicate_metrics
            REMOVE_DUPLICATES=TRUE
            VALIDATION_STRINGENCY=SILENT
            >& %%(bedfile)s.markdup.log ''' % locals())

        nfiles += 1
        current_file = next_file

    if is_paired:
        statement.append('''cat %(current_file)s
            | cgat bam2bed
              --merge-pairs
              --min-insert-size=%(filtering_min_insert_size)i
              --max-insert-size=%(filtering_max_insert_size)i
              --log=%(bedfile)s.bam2bed.log
              -
            2> %(bedfile)s.bam2bed.err
            | cgat bed2bed
              --method=sanitize-genome
              --genome-file=%(genome_dir)s/%(genome)s
              --log=%(bedfile)s.sanitize.log
            2> %(bedfile)s.sanitize.err
            | cut -f 1,2,3,4
            | sort -k1,1 -k2,2n
            | bgzip > %(bedfile)s''')
    else:
        statement.append('''cat %(current_file)s
            | cgat bam2bed
              --log=%(bedfile)s.bam2bed.log
              -
            2> %(bedfile)s.bam2bed.err
            | cgat bed2bed
              --method=sanitize-genome
              --genome-file=%(genome_dir)s/%(genome)s
              --log=%(bedfile)s.sanitize.log
            2> %(bedfile)s.sanitize.err
            | cut -f 1,2,3,4
            | sort -k1,1 -k2,2n
            | bgzip > %(bedfile)s''')

    statement.append("tabix -p bed %(bedfile)s >& %(bedfile)s.tabix.log")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; ".join(statement)
    P.run(statement, job_memory="8G")
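
# The function above threads current_file -> next_file through numbered
# temporary bam files so that every filter is optional. The same pattern
# in miniature (hypothetical commands):
def chain_filters(infile, filters, tmpdir="tmp"):
    steps, current = ["mkdir -p %s" % tmpdir], infile
    for nfiles, cmd in enumerate(filters):
        next_file = "%s/bam_%i.bam" % (tmpdir, nfiles)
        steps.append("%s %s > %s" % (cmd, current, next_file))
        current = next_file
    return " ; ".join(steps)

print(chain_filters("in.bam", ["samtools view -q 10 -b", "dedup"]))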
Example #17
def aggregateWindowsTagCounts(infiles, outfile, regex="(.*)\..*"):
    '''aggregate output from several ``bedtools coverage`` results.

    ``bedtools coverage`` outputs the following columns for a bed4
    file::

       1 Contig
       2 Start
       3 Stop
       4 Name
       5 The number of features in A that overlapped (by at least one
         base pair) the B interval.
       6 The number of bases in B that had non-zero coverage from features in A.
       7 The length of the entry in B.
       8 The fraction of bases in B that had non-zero coverage from
         features in A.

    This method autodetects the number of columns in the :term:`infiles`
    and selects:

    * bed4: use column 5
    * bed6: use column 7
    * bed12: use column 13

    Arguments
    ---------
    infiles : list
        Input filenames with the output from ``bedtools coverage``
    outfile : string
        Output filename in :term:`tsv` format.
    regex : string
        Regular expression used to extract the track name from the
        filename.  The default removes any suffix.

    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join([
        """<( zcat %s |
              awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}')""" %
        (x, column) for x in infiles
    ])
    tmpfile = P.get_temp_filename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run(statement)

    # build track names
    tracks = [
        re.search(regex, os.path.basename(x)).groups()[0] for x in infiles
    ]

    outf = IOTools.open_file(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    # filter for uniqueness - keys with the same value as the
    # previous line will be ignored.
    last_gene = None
    c = E.Counter()
    for line in open(tmpfile, "r"):
        c.input += 1
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]

        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        if genes[0] == last_gene:
            c.duplicates += 1
            continue
        c.output += 1
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))
        last_gene = genes[0]

    outf.close()

    os.unlink(tmpfile)

    E.info("aggregateWindowsTagCounts: %s" % c)
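
# Column selection above: each infile is `bedtools coverage` output, i.e.
# the original bed columns plus four coverage columns, with the overlap
# count in the first added column -- which is what the `- 4 + 1`
# arithmetic recovers:
for name, bed_columns in (("bed4", 8), ("bed6", 10), ("bed12", 16)):
    print(name, "-> awk column", bed_columns - 4 + 1)
# bed4 -> awk column 5, bed6 -> awk column 7, bed12 -> awk column 13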
Example #18
def buildPseudogenes(infiles, outfile, dbhandle):
    '''build a set of pseudogenes.

    Transcripts are extracted from the GTF file and designated as
    pseudogenes if:

    * the gene_type or transcript_type contains the phrase
      "pseudo". This information is taken from the database.

    * the feature is 'processed_transcript' and has similarity to
      protein coding genes. Similarity is assessed by aligning the
      transcript and peptide set against each other with exonerate_.

    Pseudogenic transcripts can overlap with protein coding
    transcripts.

    Arguments
    ---------
    infiles : list
       Filenames of ENSEMBL geneset in :term:`gtf` format
       and associated peptide sequences in :term:`fasta` format.
    outfile : filename
       Output in :term:`gtf` format with inferred or annotated
       pseudogenes.
    dbhandle : object
       Database handle for extracting transcript biotypes.
    '''

    infile_gtf, infile_peptides_fasta = infiles

    # JJ - there are also 'nontranslated_CDS', but no explanation of these
    if PARAMS["genome"].startswith("dm"):
        E.warn("Ensembl dm genome annotations only contain source"
               " 'pseudogenes' - skipping exonerate step")
        statement = """zcat %(infile_gtf)s
        |awk '$2 ~ /pseudogene/'
        | gzip
        > %(outfile)s"""
        P.run(statement)
        return

    tmpfile1 = P.get_temp_filename(shared=True)

    # collect processed transcripts and save as fasta sequences
    statement = '''
    zcat %(infile_gtf)s
    | awk '$2 ~ /processed/'
    | cgat gff2fasta
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''

    P.run(statement)

    if IOTools.is_empty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        IOTools.touch_file(outfile)
        return

    model = "protein2dna"

    # map processed transcripts against peptide sequences
    statement = '''
    cat %(tmpfile1)s
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=100
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(infile_peptides_fasta)s
              --model %(model)s
              --bestn 1
              --score 200
              --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\"
              --showalignment no --showsugar no --showcigar no --showvulgar no
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run(statement)

    os.unlink(tmpfile1)

    inf = IOTools.open_file("%s.links.gz" % outfile)
    best_matches = {}
    for line in inf:
        peptide_id, transcript_id, score = line[:-1].split("\t")
        score = int(score)
        if transcript_id in best_matches and \
           best_matches[transcript_id][0] > score:
            continue
        best_matches[transcript_id] = (score, peptide_id)

    inf.close()

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    cc = dbhandle.cursor()
    known_pseudos = set([
        x[0] for x in cc.execute("""SELECT DISTINCT transcript_id
        FROM transcript_info
        WHERE transcript_biotype like '%pseudo%' OR
        gene_biotype like '%pseudo%' """)
    ])

    E.info("pseudogenes from: processed_transcripts=%i, known_pseudos=%i, "
           "intersection=%i" %
           ((len(new_pseudos), len(known_pseudos),
             len(new_pseudos.intersection(known_pseudos)))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    outf = IOTools.open_file(outfile, "w")
    inf = GTF.iterator(IOTools.open_file(infile_gtf))
    for gtf in inf:
        c.input += 1
        if gtf.transcript_id not in all_pseudos:
            continue
        c.output += 1
        outf.write("%s\n" % gtf)
    outf.close()

    E.info("exons: %s" % str(c))
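
# The best_matches loop above keeps, per transcript, the highest-scoring
# peptide hit (ties go to the later line). The same logic in miniature:
best = {}
for peptide, transcript, score in (("p1", "t1", 100),
                                   ("p2", "t1", 250),
                                   ("p3", "t2", 80)):
    if transcript in best and best[transcript][0] > score:
        continue
    best[transcript] = (score, peptide)
print(best)  # {'t1': (250, 'p2'), 't2': (80, 'p3')}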
Example #19
def buildNUMTs(infile, outfile):
    '''output set of potential nuclear mitochondrial genes (NUMTs).

    This function works by aligning the mitochondrial chromosome
    against genome using exonerate_. This can take a while.

    Arguments
    ---------
    infile : string
       Ignored.
    outfile : filename
       Output in :term:`gtf` format with potential NUMTs.

    '''
    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        IOTools.touch_file(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" % PARAMS["numts_mitochrom"])
        IOTools.touch_file(outfile)
        return

    tmpfile_mito = P.get_temp_filename(".")

    statement = '''
    cgat index_fasta
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''

    P.run(statement)

    if IOTools.is_empty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        IOTools.touch_file(outfile)
        return

    format = ("qi", "qS", "qab", "qae", "ti", "tS", "tab", "tae", "s", "pi",
              "C")

    format = "\\\\t".join(["%%%s" % x for x in format])

    # collect all results
    min_score = 100

    statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=1
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(tmpfile_mito)s
              --model affine:local
              --score %(min_score)i
              --showalignment no --showsugar no --showcigar no
              --showvulgar no
              --ryo \\"%(format)s\\n\\"
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run(statement)

    # convert to gtf
    inf = IOTools.open_file("%s.links.gz" % outfile)
    outf = IOTools.open_file(outfile, "w")

    min_score = PARAMS["numts_score"]

    c = E.Counter()

    for line in inf:
        (query_contig, query_strand, query_start, query_end, target_contig,
         target_strand, target_start, target_end, score, pid,
         alignment) = line[:-1].split("\t")

        c.input += 1
        score = int(score)
        if score < min_score:
            c.skipped += 1
            continue

        if target_strand == "-":
            target_start, target_end = target_end, target_start

        gff = GTF.Entry()
        gff.contig = target_contig
        gff.start, gff.end = int(target_start), int(target_end)
        assert gff.start < gff.end

        gff.strand = target_strand
        gff.score = int(score)
        gff.feature = "numts"
        gff.gene_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        gff.transcript_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        outf.write("%s\n" % str(gff))
        c.output += 1

    inf.close()
    outf.close()

    E.info("filtering numts: %s" % str(c))

    os.unlink(tmpfile_mito)
Example #20
def summarizeTagsWithinContext(tagfile,
                               contextfile,
                               outfile,
                               min_overlap=0.5,
                               job_memory="15G"):
    '''count occurrences of tags in genomic context.

    Examines the genomic context to which tags align.

    A tag is assigned to the genomic context that it
    overlaps by at least ``min_overlap`` (50% by default). Thus some
    reads mapping to several contexts might be dropped.

    Arguments
    ---------
    tagfile : string
        Filename with tags. The file can be :term:`bam` or :term:`bed` format.
    contextfile : string
        Filename of :term:`bed` formatted files with named intervals (BED4).
    outfile : string
        Output in :term:`tsv` format.
    min_overlap : float
        Minimum overlap (fraction) to count features as overlapping.
    job_memory : string
        Memory to reserve.
    '''

    tmpfile = P.get_temp_filename(shared=True)
    tmpfiles = ["%s_%i" % (tmpfile, x) for x in range(2)]
    statement = '''
    cgat bam_vs_bed
    --min-overlap=%(min_overlap)f
    --log=%(outfile)s.log
    %(tagfile)s %(contextfile)s
    > %(tmpfile)s_0
    '''

    P.run(statement)

    statement = '''
    printf "intergenic\\t" >> %(tmpfile)s_1'''

    P.run(statement)

    statement = '''
    bedtools intersect -a %(tagfile)s
    -b %(contextfile)s
    -bed -v | wc -l
    | xargs printf
    >> %(tmpfile)s_1
    '''
    P.run(statement)

    files = " ".join(tmpfiles)
    statement = '''
    sort --merge  %(files)s
    | gzip > %(outfile)s
    '''
    P.run(statement)

    for x in tmpfiles:
        os.unlink(x)
Example #21
def resetGTFAttributes(infile, genome, gene_ids, outfile):
    """set GTF attributes in :term:`gtf` formatted file so that they are
    compatible with cufflinks.
    This method runs cuffcompare with `infile` against itself to add
    attributes such as p_id and tss_id.
    Arguments
    ---------
    infile : string
        Filename of :term:`gtf`-formatted input file
    genome : string
       Filename (without extension) of indexed genome file
       in :term:`fasta` format.
    gene_ids : dict
       Dictionary mapping transcript ids to gene ids.
    outfile : string
       Output filename in :term:`gtf` format
    """
    tmpfile1 = P.get_temp_filename(".")
    tmpfile2 = P.get_temp_filename(".")

    #################################################
    E.info("adding tss_id and p_id")

    # The p_id attribute is set if the fasta sequence is given.
    # However, there might be some errors in cuffdiff downstream:
    #
    # cuffdiff: bundles.cpp:479: static void HitBundle::combine(const std::
    # vector<HitBundle*, std::allocator<HitBundle*> >&, HitBundle&): Assertion
    # `in_bundles[i]->ref_id() == in_bundles[i-1]->ref_id()' failed.
    #
    # I was not able to resolve this, it was a complex
    # bug dependent on both the read libraries and the input reference gtf
    # files
    job_memory = "5G"

    statement = '''
    cuffcompare -r <( gunzip < %(infile)s )
         -T
         -s %(genome)s.fa
         -o %(tmpfile1)s
         <( gunzip < %(infile)s )
         <( gunzip < %(infile)s )
    > %(outfile)s.log
    '''
    P.run(statement)

    #################################################
    E.info("resetting gene_id and transcript_id")

    # reset gene_id and transcript_id to ENSEMBL ids
    # cufflinks patch:
    # make tss_id and p_id unique for each gene id
    outf = IOTools.open_file(tmpfile2, "w")
    map_tss2gene, map_pid2gene = {}, {}
    inf = IOTools.open_file(tmpfile1 + ".combined.gtf")

    def _map(gtf, key, val, m):
        if val in m:
            while gene_id != m[val]:
                val += "a"
                if val not in m:
                    break
        m[val] = gene_id

        gtf.setAttribute(key, val)

    for gtf in GTF.iterator(inf):
        transcript_id = gtf.oId
        gene_id = gene_ids[transcript_id]
        gtf.setAttribute("transcript_id", transcript_id)
        gtf.setAttribute("gene_id", gene_id)

        # set tss_id
        try:
            tss_id = gtf.tss_id
        except AttributeError:
            tss_id = None
        try:
            p_id = gtf.p_id
        except AttributeError:
            p_id = None

        if tss_id:
            _map(gtf, "tss_id", tss_id, map_tss2gene)
        if p_id:
            _map(gtf, "p_id", p_id, map_pid2gene)

        outf.write(str(gtf) + "\n")

    outf.close()

    # sort gtf file
    PipelineGeneset.sortGTF(tmpfile2, outfile)

    # tmpfile1 must be a non-empty string; otherwise the glob below
    # would match far more than the cuffcompare output files
    assert tmpfile1
    for x in glob.glob(tmpfile1 + "*"):
        os.unlink(x)
    os.unlink(tmpfile2)
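
# _map() above makes a tss_id/p_id unique per gene by appending "a" until
# the id is either unused or already mapped to this gene. The same idea
# as a stand-alone sketch:
def disambiguate(val, gene_id, mapping):
    while val in mapping and mapping[val] != gene_id:
        val += "a"
    mapping[val] = gene_id
    return val

m = {}
print(disambiguate("TSS1", "geneA", m))  # TSS1
print(disambiguate("TSS1", "geneB", m))  # TSS1a
print(disambiguate("TSS1", "geneA", m))  # TSS1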
Example #22
def buildGenomicContext(infiles, outfile, distance=10):
    '''build a :term:`bed` formatted file with genomic context.

    The output is a bed formatted file, annotating genomic segments
    according to whether they are any of the ENSEMBL annotations.

    The function also adds the RNA and repeats annotations from the UCSC.
    The annotations can be partially or fully overlapping. Adjacent
    features (less than 10 bp apart) of the same type are merged.

    Arguments
    ---------
    infiles : list
       A list of input files to generate annotations from. The contents are
       1. ``repeats``, a :term:`gff` formatted file with repeat annotations

       2. ``rna``, a :term:`gff` formatted file with small, repetitive
          RNA annotations

       3. ``annotations``, a :term:`gtf` formatted file with genomic
            annotations, see :func:`annotateGenome`.

       4. ``geneset_flat``, a flattened gene set in :term:`gtf` format, see
            :func:`buildFlatGeneSet`.

       5. ``cpgisland_bed``, a :term:`bed` formatted file with CpG islands.

       6. ``go_tsv``, a :term:`tsv` formatted file with GO term assignments,
            used to flag ribosomal protein coding genes.

    outfile : string
       Output filename in :term:`bed` format.
    distance : int
       Merge adjacent features of the same type within this distance.

    '''

    repeats_gff, rna_gff, annotations_gtf, geneset_flat_gff, \
        cpgisland_bed, go_tsv = infiles

    tmpfile = P.get_temp_filename(shared=True)
    tmpfiles = ["%s_%i" % (tmpfile, x) for x in range(6)]

    # add ENSEMBL annotations
    statement = """
    zcat %(annotations_gtf)s
    | cgat gtf2gtf
    --method=sort --sort-order=gene
    | cgat gtf2gtf
    --method=merge-exons --log=%(outfile)s.log
    | cgat gff2bed
    --set-name=gene_biotype --is-gtf
    --log=%(outfile)s.log
    | sort -k 1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_0
    """
    P.run(statement)

    # rna
    statement = '''
    zcat %(repeats_gff)s %(rna_gff)s
    | cgat gff2bed --set-name=family --is-gtf -v 0
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_1'''
    P.run(statement)

    # add aggregate intervals for repeats
    statement = '''
    zcat %(repeats_gff)s
    | cgat gff2bed --set-name=family --is-gtf -v 0
    | awk -v OFS="\\t" '{$4 = "repeats"; print}'
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_2'''
    P.run(statement)

    # add aggregate intervals for rna
    statement = '''
    zcat %(rna_gff)s
    | cgat gff2bed --set-name=family --is-gtf -v 0
    | awk -v OFS="\\t" '{$4 = "repetetive_rna"; print}'
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_3 '''
    P.run(statement)

    # add ribosomal protein coding genes
    goids = ("GO:0003735", )

    patterns = "-e %s" % ("-e ".join(goids))

    statement = '''
    zcat %(geneset_flat_gff)s
    | cgat gtf2gtf
    --map-tsv-file=<(zcat %(go_tsv)s | grep %(patterns)s | cut -f 2 | sort | uniq)
    --method=filter --filter-method=gene
    --log=%(outfile)s.log
    | cgat gff2bed
    --log=%(outfile)s.log
    | awk -v OFS="\\t" '{$4 = "ribosomal_coding"; print}'
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_4
    '''
    P.run(statement)

    # CpG islands
    statement = '''
    zcat %(cpgisland_bed)s
    | awk '{printf("%%s\\t%%i\\t%%i\\tcpgisland\\n", $1,$2,$3 )}'
    > %(tmpfile)s_5
    '''
    P.run(statement)

    # sort and merge
    # remove strand information as bedtools
    # complains if there are annotations with
    # different number of field
    files = " ".join(tmpfiles)
    statement = '''
    sort --merge -k1,1 -k2,2n %(files)s
    | cut -f 1-4
    | gzip
    > %(outfile)s
    '''
    P.run(statement)

    for x in tmpfiles:
        os.unlink(x)
Example #23
def buildGenomicContext(infiles, outfile, distance=10, job_memory="4G"):
    '''build a :term:`bed` formatted file with genomic context.

    The output is a bed formatted file, annotating genomic segments
    according to whether they are any of the ENSEMBL annotations.
    The function also adds the RNA and repeats annotations from the UCSC.
    The annotations can be partially or fully overlapping. Adjacent
    features (less than 10 bp apart) of the same type are merged.

    Arguments
    ---------
    infiles : list
       A list of input files to generate annotations from. The contents are
       1. ``repeats``, a :term:`gff` formatted file with repeat annotations
       2. ``rna``, a :term:`gff` formatted file with small, repetitive
          RNA annotations
       3. ``annotations``, a :term:`gtf` formatted file with genomic
            annotations, see :func:`annotateGenome`.
       4. ``utr``, a :term:`gtf` formatted file with UTR annotations.
       5. ``intron``, a :term:`gtf` formatted file with intron annotations.
    outfile : string
       Output filename in :term:`bed` format.
    distance : int
       Merge adjacent features of the same type within this distance.
    '''

    repeats_gff, rna_gff, annotations_gtf, utr_gtf, intron_gtf = infiles

    tmpfile = P.get_temp_filename(shared=True)
    tmpfiles = ["%s_%i" % (tmpfile, x) for x in range(4)]

    # add ENSEMBL annotations
    statement = """
    zcat %(annotations_gtf)s
    | cgat gtf2gtf
    --method=sort --sort-order=gene
    | cgat gtf2gtf
    --method=merge-exons --log=%(outfile)s.log
    | cgat gff2bed
    --set-name=gene_biotype --is-gtf
    --log=%(outfile)s.log
    | sort -k 1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_0
    """
    P.run(statement, job_memory=job_memory)

    # rna
    statement = '''
    zcat %(repeats_gff)s %(rna_gff)s
    | cgat gff2bed --set-name=family --is-gtf -v 0
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_1'''
    P.run(statement, job_memory=job_memory)

    # utr
    statement = '''zcat %(utr_gtf)s
    | cgat gff2bed --is-gtf --set-name=feature
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_2'''
    P.run(statement, job_memory=job_memory)

    # intron
    statement = '''zcat %(intron_gtf)s
    | cgat gff2bed --is-gtf --set-name=feature
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_3'''
    P.run(statement, job_memory=job_memory)

    # sort and merge
    # remove strand information as bedtools
    # complains if there are annotations with
    # different number of field
    files = " ".join(tmpfiles)
    statement = '''
    sort --merge -k1,1 -k2,2n %(files)s
    | cut -f 1-4
    | gzip
    > %(outfile)s
    '''
    P.run(statement, job_memory=job_memory)

    for x in tmpfiles:
        os.unlink(x)