Пример #1
0
def computeOverlapCoding( infile, outfile ):
    '''compute overlap between coding markers and windows.

    This is done by setting the gene_id and transcript_id of markers to the ENSEMBL gene id
    and transcript_id that it overlaps with. Markers not overlapping an ENSEMBL gene id
    are removed.

    Two commands are run in sequence:

    1. ``gtf2gtf.py --rename=gene --apply=ensembl.diff.genes_ovl`` reads
       *infile* and writes a temporary file in the current directory.
    2. ``gff2table.py --decorator=counts`` uses that temporary file as
       ``--filename-data`` and ``windows.bed`` (converted with bed2gff.py)
       as windows, writing *outfile* and ``<outfile>.log``.

    The temporary file is removed once both commands have returned.
    '''
    
    to_cluster = True
    tmpfilename = P.getTempFilename( dir = "." )
    
    statement = '''python %(scriptsdir)s/gtf2gtf.py
    --rename=gene \
    --apply=ensembl.diff.genes_ovl \
    < %(infile)s > %(tmpfilename)s
    '''
    
    # locals and PARAMS are merged explicitly (Python 2 list concatenation of
    # items()); on duplicate keys the PARAMS entry wins because it comes last.
    P.run( **dict( locals().items() + PARAMS.items() ) )

    statement = '''python %(scriptsdir)s/gff2table.py 
    --filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) 
    --decorator=counts
    --filename-data=%(tmpfilename)s \
    --skip-empty \
    --is-gtf \
    --log=%(outfile)s.log \
    < %(genome)s.fasta > %(outfile)s'''

    P.run( **dict( locals().items() + PARAMS.items() ) )

    # NOTE(review): not reached if either P.run() call raises, so the
    # temporary file is then left behind.
    os.unlink( tmpfilename )
Пример #2
0
def buildDownstreamFlankBed(infile, outfile):
    """Build a flanking interval for each entry in a bed file.

    ``flankBed`` is run with ``-l 0 -r <geneset_flank> -s``, i.e. only a
    right-hand flank of ``PARAMS["geneset_flank"]`` bases is requested and
    strand is taken into account. The result is piped through
    ``bed2bed.py --method=filter-genome`` and written to *outfile*; a log
    goes to ``<outfile>.log``.
    """
    # `window` and `faidx` are referenced by name in the statement template.
    window = PARAMS["geneset_flank"]
    faidx = PARAMS["faidx"]
    statement = """flankBed -i %(infile)s -g %(faidx)s -l 0 -r %(window)s -s 
                   | python %(scriptsdir)s/bed2bed.py --method=filter-genome --genome-file=%(genome_dir)s/%(genome)s --log %(outfile)s.log > %(outfile)s"""
    # P.run() takes no arguments here; presumably it picks up `statement`
    # and fills the %(...)s placeholders from the caller's locals and PARAMS.
    P.run()
Пример #3
0
def countReadsWithinWindows(bedfile,
                            windowfile,
                            outfile,
                            counting_method="midpoint"):
    '''count reads given in *bedfile* within intervals in
    *windowfile*.

    Both files need to be :term:`bed` formatted. *bedfile* is read
    with ``zcat``, so it is expected to be gzip-compressed.

    Counting is done using bedtools (``coverageBed``). The counting
    method can be 'midpoint' or 'nucleotide'. With 'midpoint' every
    read is reduced to the single base at its midpoint before
    counting; with 'nucleotide' reads are passed on unchanged.

    The sorted, gzip-compressed result is written to *outfile*.

    Raises ValueError if *counting_method* is not one of the two
    supported values.
    '''
    # presumably a cluster resource request picked up by P.run()
    job_options = "-l mem_free=4G"

    if counting_method == "midpoint":
        # optional pipeline stage inserted via %(f)s below
        f = '''| awk '{a = $2+($3-$2)/2; printf("%s\\t%i\\t%i\\n", $1, a, a+1)}' '''
    elif counting_method == "nucleotide":
        f = ""
    else:
        raise ValueError("unknown counting method: %s" % counting_method)

    statement = '''
    zcat %(bedfile)s
    %(f)s
    | coverageBed -a stdin -b %(windowfile)s -split
    | sort -k1,1 -k2,2n
    | gzip
    > %(outfile)s
    '''

    P.run()
Пример #4
0
def ExtendRegion(infile, outfile):
    """Extend every interval in a gzipped bed file with slopBed.

    *infile* is decompressed, passed through ``slopBed -b 1000`` using
    the genome file given by the ``faidx`` placeholder, and the result is
    gzip-compressed into *outfile*.
    """
    # `faidx` is not a local here; presumably it is resolved from PARAMS.
    statement = """gunzip < %(infile)s 
                   | slopBed -i stdin -g %(faidx)s -b 1000  
                   | gzip
                   > %(outfile)s """
    P.run()
Пример #5
0
def getNoncodingGeneset(infile, outfile):
    """Assume that all transcripts that do not overlap the ensembl coding geneset are noncoding.

    Entries of *infile* without a same-strand overlap (``intersectBed -v -s``)
    with ``PARAMS["ensembl_transcripts"]`` are written to *outfile*. The
    number of lines in *outfile* is recorded, below a one-line label, in
    ``<outfile>.count``.
    """
    # referenced by name in the statement template
    ensembl_transcripts = PARAMS["ensembl_transcripts"]
    statement = """cat %(infile)s | intersectBed -a stdin -b %(ensembl_transcripts)s -v -s > %(outfile)s;
                   echo "transcripts without ensembl coding overlap: " > %(outfile)s.count; 
                   cat %(outfile)s | wc -l >> %(outfile)s.count;"""
    P.run()
Пример #6
0
def addMissingNoncodingTranscripts(infile, outfile):
    """Add ensembl noncoding entries that are missing from *infile*.

    Entries of ``PARAMS["ensembl_noncoding_gtf"]`` that have no same-strand,
    reciprocal full-length overlap with *infile* (``intersectBed -v -s -f 1 -r``)
    are written to ``transcripts/missing_ensembl_noncoding_transcripts.gtf``.
    That file is then concatenated with *infile*, sorted by columns 1 and 4
    and written to *outfile*.
    """
    # referenced by name in the statement template
    ensembl_noncoding = PARAMS["ensembl_noncoding_gtf"]
    # NOTE(review): the intermediate file has a fixed name and is kept.
    statement = """intersectBed -a %(ensembl_noncoding)s -b %(infile)s  -v -s -f 1 -r > transcripts/missing_ensembl_noncoding_transcripts.gtf;
                   cat %(infile)s transcripts/missing_ensembl_noncoding_transcripts.gtf | sort -k1,1 -k4,4n
                   > %(outfile)s;"""
    P.run()
def buildBAMStats( infile, outfile ):
    '''Count number of reads mapped, duplicates, etc.

    Runs ``bam2stats.py --force`` with *infile* on stdin and writes its
    stdout to *outfile*; additional outputs follow the pattern
    ``<outfile>.<name>``.
    '''
    to_cluster = USECLUSTER
    # local override of the scripts directory used by the template below
    scriptsdir = PARAMS["general_scriptsdir"]
    # "%%s" survives interpolation as a literal "%s" for bam2stats.py
    statement = '''python %(scriptsdir)s/bam2stats.py --force 
                   --output-filename-pattern=%(outfile)s.%%s < %(infile)s > %(outfile)s'''
    P.run()
Пример #8
0
def loadEffects(infile, outfile):
    '''load transcript effects into tables.

    *infile* (expected to end in ``.effects.gz``) is loaded with
    ``csv2db.py --from-zipped`` into table ``<root>_effects``, where
    ``<root>`` is *infile* without the ``.effects.gz`` suffix. For each
    of the suffixes cds, intron, splicing and translation the file
    ``<infile>.<suffix>.gz`` is loaded into ``<root>_effects_<suffix>``.
    The output of all csv2db.py calls is collected in *outfile*.
    '''

    # strip the fixed-length suffix; no check is made that it is present
    root = infile[:-len(".effects.gz")]

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
              --from-zipped \
              --index=transcript_id \
              --table=%(root)s_effects \
    < %(infile)s > %(outfile)s
    '''
    P.run()

    for suffix in ("cds", "intron", "splicing", "translation"):

        # appended (>>) so the log of the first load is preserved
        statement = '''
        gunzip < %(infile)s.%(suffix)s.gz
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
        --allow-empty
        --index=transcript_id 
        --table=%(root)s_effects_%(suffix)s 
        --ignore-column=seq_na
        --ignore-column=seq_aa
        >> %(outfile)s
        '''
        P.run()
Пример #9
0
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.

    For every file in *infiles* (expected to end in ``.dedup.bam``) the
    companion ``<name>.dupstats`` file is read. Comment lines (starting
    with ``#``) and blank lines are dropped; the first remaining line is
    taken as the header and the second as the data row. Rows of all tracks
    are joined in a temporary file, prefixed with a ``track`` column, and
    loaded with csv2db.py into the table derived from *outfile*.

    Missing stats files, and stats files without a header and data row,
    are reported with a warning and skipped.
    '''
    # Join data for all tracks into a single temporary file. A temporary
    # file (as in loadPicardAlignStats) avoids leaving a fixed-name file
    # behind in the working directory.
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        with open(statfile, "r") as inf:
            lines = [x for x in inf
                     if not x.startswith("#") and x.strip()]
        if len(lines) < 2:
            # header and data row are both required
            E.warn("File %s contains no duplication metrics" % statfile)
            continue
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    # referenced by name in the statement template
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s '''
    P.run()
    os.unlink(tmpfilename)
Пример #10
0
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.

    Every file in *infiles* (expected to end in ``.alignstats``) is read.
    Comment lines (starting with ``#``) and blank lines are dropped; the
    first remaining line is taken as the header and all following lines as
    data rows. Rows of all tracks are joined in a temporary file, prefixed
    with a ``track`` column, and loaded with csv2db.py into the table
    derived from *outfile*. The temporary file is removed afterwards.

    Missing or empty stats files are reported with a warning and skipped.
    '''
    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        with open(f, "r") as inf:
            lines = [x for x in inf
                     if not x.startswith("#") and x.strip()]
        if not lines:
            # nothing to merge; avoids an IndexError on lines[0]
            E.warn("File %s contains no alignment metrics" % f)
            continue
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for line in lines[1:]:
            outf.write("%s\t%s" % (track, line))
    outf.close()
    # referenced by name in the statement template
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
Пример #11
0
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf file
    using axtToMaf, filters maf alignments under a specified length.

    The input directory is ``PARAMS["phyloCSF_location_axt"]``; only files
    ending in ``net.axt.gz`` that do not match ``PARAMS["phyloCSF_ignore"]``
    are used. *infiles* is not read by this function. *outfile* is expected
    to end in ``.gz``; the filtered alignment and a companion
    ``*_removed.maf`` file are written uncompressed and then gzipped.
    """
    # work on the uncompressed name; gzip at the end restores the suffix
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    # space-separated, sorted list for use on the zcat command line
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate the axt files into tmpf1, then convert to maf in tmpf2
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    # filtered[0] / filtered[1] are reported below as the number of ignored
    # and retained blocks, respectively
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    # NOTE(review): the temporary files are left behind if any step above raises
    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()
Пример #12
0
def runGLAM2SCAN(infiles, outfile):
    '''run glam2scan on all intervals and motifs.

    *infiles* is a triple ``(controlfile, dbfile, motiffiles)``. The control
    file actually used is derived from *dbfile* by replacing its ``.fasta``
    suffix with ``.controlfasta``; the first element of *infiles* is
    ignored. For every motif file a ``:: motif = <name> ::`` header line is
    appended to *outfile*, followed by the glam2scan output for the
    concatenation of *dbfile* and the control file.

    Raises P.PipelineError if the derived control file does not exist.
    '''

    to_cluster = True
    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles
    # the unpacked controlfile is overwritten by the name derived from dbfile
    controlfile = dbfile[:-len(".fasta")] + ".controlfasta"
    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # start from a clean file because everything below appends
    if os.path.exists(outfile):
        os.remove(outfile)

    for motiffile in motiffiles:
        of = IOTools.openFile(outfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s ::\n" % motif)
        # close before the shell command appends to the same file
        of.close()

        statement = '''
        cat %(dbfile)s %(controlfile)s | %(execglam2scan)s -2 -n %(glam2scan_results)i n %(motiffile)s - >> %(outfile)s
        '''
        P.run()
Пример #13
0
def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Receives a MAF file containing pairwise alignments and a gtf12 file
    containing intervals. Outputs a single fasta file containing aligned
    sequence for each interval.

    *infiles* is a pair ``(bed_file, maf_file)``. The gzipped MAF file is
    first decompressed into a temporary file, which is removed after
    ``PipelineLncRNA.extractMAFGeneBlocks`` has returned.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    # the return value is logged below as the number of extracted gene models
    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    # NOTE(review): not reached if the extraction raises
    os.unlink(maf_tmp)
Пример #14
0
    def buildFilteredLncRNAGeneSet(infile, outfile):
        '''
        Depending on filtering_remove_single_exon will:
        i) remove all single exon transcripts from all lncrna models
        (transcripts)
        ii) remove lncrna loci that only contain single exon transcripts
        (loci)
        iii) leave all single-exon and multi-exon loci in outfile
        (None)

        Both *infile* and *outfile* are gzipped. Raises ValueError for
        any other value of filtering_remove_single_exon.
        '''

        if not PARAMS["filtering_remove_single_exon"]:
            E.info("Both multi-exon and single-exon lncRNA are retained!")
            statement = ("cp %(infile)s %(outfile)s")
        elif PARAMS["filtering_remove_single_exon"] == "loci":
            E.info("Warning: removing loci with only single-exon transcripts")
            # NOTE(review): grep keeps lines matching exon_status_locus "s";
            # if "s" flags single-exon entries this selects rather than
            # removes them - confirm whether "-v" (or "m") is intended.
            # The pipe before gzip was missing, which made grep read a
            # file called "gzip".
            statement = ("zcat %(infile)s |"
                         " grep 'exon_status_locus \"s\"' |"
                         " gzip > %(outfile)s")
        elif PARAMS["filtering_remove_single_exon"] == "transcripts":
            E.info("Warning: removing all single-exon"
                   " transcripts from lncRNA set")
            # NOTE(review): same question about the polarity of the match
            # as in the "loci" branch.
            statement = ("zcat %(infile)s |"
                         " grep 'exon_status \"s\"' |"
                         " gzip > %(outfile)s")
        else:
            raise ValueError("Unrecognised parameter %s"
                             % PARAMS["filtering_remove_single_exon"])
        P.run()
Пример #15
0
def exportMotifLocations( infiles, outfile ):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.

    The list of motifs is read from table ``motif_info``. For each motif
    the matches from the table of every file in *infiles* are collected in
    a temporary bed file, merged with ``mergeBed -nms`` and written to
    ``<exportdir>/motifs/<motif>.bed.gz``. The *outfile* argument is not
    written to; the name is reused for the per-motif output path.
    '''

    dbh = connect()
    cc = dbh.cursor()

    # fetched completely up front because the cursor is reused below
    motifs = [ x[0] for x in cc.execute( "SELECT motif FROM motif_info" ).fetchall()]

    
    for motif in motifs:

        tmpf = P.getTempFile(".")
        
        for infile in infiles:
            table = P.toTable(infile) 
            track = P.snip( table, "_mast" )
            # NOTE(review): the query is built by string interpolation;
            # motif and table names are assumed to be trusted.
            for x in cc.execute( """SELECT contig, start, end, '%(track)s', evalue
                                   FROM %(table)s WHERE motif = '%(motif)s' AND start IS NOT NULL""" % locals() ):
                tmpf.write( "\t".join( map(str, x) ) + "\n" )
        tmpf.close()

        # shadows the parameter; referenced by name in the statement template
        outfile = os.path.join( PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif )
        tmpfname = tmpf.name 

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink( tmpf.name )
Пример #16
0
def collectMEMEResults(tmpdir, target_path, outfile):
    '''collect output from a MEME run in tmpdir
    and copy all over to target_path

    convert images output by MEME (.eps files) to 
    .png files.

    Any existing *target_path* is removed before *tmpdir* is moved there.
    ``meme.txt`` from the moved directory is copied to *outfile*. Each
    ``*.eps`` file in *target_path* is converted with ``convert`` to a
    ``.png`` file of the same base name.
    '''

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        # NOTE(review): this also hides every other OSError from makedirs
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "meme.txt"), outfile)

    # convert images to png
    epsfiles = glob.glob(os.path.join(target_path, "*.eps"))

    for epsfile in epsfiles:
        b, ext = os.path.splitext(epsfile)
        pngfile = b + ".png"
        # one command per image; epsfile/pngfile are referenced by name
        statement = '''convert %(epsfile)s %(pngfile)s '''
        P.run()
Пример #17
0
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.

    If *infile* is empty, *outfile* is touched and nothing else is done.
    Otherwise ``tomtom`` is run against the databases listed in
    ``PARAMS["tomtom_databases"]`` with a temporary output directory; its
    stdout goes to ``<outfile>.log``. The output directory is then moved to
    ``<exportdir>/tomtom/<outfile>`` (replacing any previous content) and
    ``tomtom.txt`` from it is copied to *outfile*.
    '''

    to_cluster = True

    # Check for empty input before allocating anything, so that the early
    # return does not leave an unused temporary directory behind.
    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    tmpdir = P.getTempDir(".")
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    statement = '''
           tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def makeSegments( infile, outfile ):
    '''compute histograms over the segments in a gzipped gene set.

    Two ``gff2histogram.py --method=values`` runs are made. The first reads
    *infile* sorted by position (``gff_sort pos``) and writes *outfile*
    plus files matching ``<outfile>.<name>``. The second first sorts and
    merges transcripts with gtf2gtf.py, appends to *outfile* and writes
    files matching ``<outfile>_genes.<name>``. Both log to ``<outfile>.log``.
    '''

    to_cluster = True

    statement = '''gunzip < %(infile)s 
    | %(scriptsdir)s/gff_sort pos 
    | python %(scriptsdir)s/gff2histogram.py 
		--method=values 
		--output-filename-pattern="%(outfile)s.%%s"
		--force 
		--log=%(outfile)s.log 
    > %(outfile)s 
    '''
    P.run()

    # second pass on merged transcripts; output is appended (>>)
    statement = '''gunzip 
    < %(infile)s 
    | python %(scriptsdir)s/gtf2gtf.py --sort=position+gene
    | python %(scriptsdir)s/gtf2gtf.py --merge-transcripts 
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | python %(scriptsdir)s/gff2histogram.py 
		--method=values 
		--force 
		--output-filename-pattern="%(outfile)s_genes.%%s" 
		--log=%(outfile)s.log
    >> %(outfile)s'''
    P.run()
def loadRepeatInformation( infiles, outfile ):
    '''load per-contig repeat coverage into the database.

    *infiles* is a pair ``(repeatsfile, indexfile)``. Columns 1 and 4 of
    *indexfile* are turned into bed intervals ``<col1> 0 <col4>`` in a
    temporary file. The gzipped *repeatsfile* is converted to bed,
    intersected with those intervals using ``coverageBed``, given a header
    line and loaded with csv2db.py into the table named after *outfile*
    without its ``.load`` suffix.
    '''
    
    to_cluster = True

    table = outfile[:-len(".load")]

    repeatsfile, indexfile = infiles

    tmpfilename = P.getTempFilename( "." )

    # "%%s"/"%%i" survive interpolation as awk's own "%s"/"%i"
    statement = '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}' < %(indexfile)s > %(tmpfilename)s'''
    P.run()

    statement = '''
        gunzip < %(repeatsfile)s 
        | python %(scriptsdir)s/gff2bed.py -v 0 
        | coverageBed -a stdin -b %(tmpfilename)s
        | awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}'
        |python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --table=%(table)s 
        > %(outfile)s
    '''
    P.run()

    # NOTE(review): not reached if either command fails
    os.unlink( tmpfilename )
Пример #20
0
def buildTranscriptLevelReadCounts(infiles, outfile):
    '''count reads falling into transcripts of protein coding gene models.

    *infiles* is a pair ``(bamfile, geneset)``; *geneset* is gzipped.
    ``gtf2table.py --reporter=transcripts`` is run with the counters
    ``length``, ``read-coverage`` and either ``readpair-counts`` (paired
    BAM) or ``read-counts`` (otherwise). The gzipped table is written to
    *outfile* and a log to ``<outfile>.log``.

    .. note::
       In paired-end data sets each mate will be counted. Thus
       the actual read counts are approximately twice the fragment
       counts.

    '''
    bamfile, geneset = infiles

    # pick the counter matching the library layout
    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    statement = '''
    zcat %(geneset)s
    | python %(scriptsdir)s/gtf2table.py
          --reporter=transcripts
          --bam-file=%(bamfile)s
          --counter=length
          --prefix="exons_"
          --counter=%(counter)s
          --prefix=""
          --counter=read-coverage
          --prefix=coverage_
          --min-mapping-quality=%(counting_min_mapping_quality)i
          --multi-mapping=ignore
          --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''

    P.run()
Пример #21
0
def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard.

    Runs ``CollectAlignmentSummaryMetrics`` with *infile* as INPUT and
    *outfile* as OUTPUT.
    '''
    to_cluster = USECLUSTER
    # NOTE(review): `track` is not referenced by the statement below
    track = P.snip(os.path.basename(infile), ".bam")
    # infile/outfile are filled in immediately via locals(); the escaped
    # %%(samtools_genome)s is left as a placeholder for P.run() to resolve
    statement = '''CollectAlignmentSummaryMetrics INPUT=%(infile)s REFERENCE_SEQUENCE=%%(samtools_genome)s ASSUME_SORTED=true OUTPUT=%(outfile)s VALIDATION_STRINGENCY=SILENT ''' % locals(
    )
    P.run()
Пример #22
0
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Sequences for the track (``outfile`` without ``.bioprospector``) are
    written to a temporary fasta file by ``writeSequencesForIntervals``
    with dust masking and the proportion given by
    ``PARAMS["bioprospector_proportion"]``. If no sequences were written,
    *outfile* is touched; otherwise BioProspector is run with its output
    in *outfile* and its stdout in ``<outfile>.log``. *infiles* is not
    used.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]
    # return value is treated below as the number of sequences written
    nseq = writeSequencesForIntervals(track,
                                      tmpfasta,
                                      dbhandle,
                                      full=True,
                                      masker="dust",
                                      proportion=PARAMS["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)
    else:
        statement = '''
    BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
    '''
        P.run()

    # NOTE(review): not reached if P.run() raises
    os.unlink(tmpfasta)
Пример #23
0
def buildGeneLevelReadCounts(infiles, outfile):
    '''compute read counts and coverage of exons with reads.

    *infiles* is a pair ``(bamfile, exons)``; *exons* is gzipped.
    ``gtf2table.py --reporter=genes`` is run with the counters ``length``,
    ``read-coverage`` and either ``readpair-counts`` (paired BAM) or
    ``read-counts`` (otherwise). The gzipped table is written to *outfile*
    and a log to ``<outfile>.log``.
    '''

    bamfile, exons = infiles

    # pick the counter matching the library layout
    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    # ignore multi-mapping reads
    statement = '''
    zcat %(exons)s
    | python %(scriptsdir)s/gtf2table.py
          --reporter=genes
          --bam-file=%(bamfile)s
          --counter=length
          --prefix="exons_"
          --counter=%(counter)s
          --prefix=""
          --counter=read-coverage
          --prefix=coverage_
          --min-mapping-quality=%(counting_min_mapping_quality)i
          --multi-mapping=ignore
          --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''

    P.run()
Пример #24
0
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript

    The gzipped GTF *infile* is scanned once; the gene_id of its last
    record decides which command is run. If it contains "ENSG" the file
    is only sorted by gene; otherwise genes and transcripts are renumbered
    with patterns derived from *outfile* before sorting. The gzipped result
    is written to *outfile* and a log to ``<outfile>.log``.

    Raises ValueError if *infile* contains no GTF records.
    '''
    # Scan the records without rebuilding the command for each of them.
    # As before, the last record determines the outcome.
    has_ensembl_ids = None
    inf = IOTools.openFile(infile)
    try:
        for gtf in GTF.iterator(inf):
            has_ensembl_ids = gtf.gene_id.find("ENSG") != -1
    finally:
        inf.close()

    if has_ensembl_ids is None:
        # previously this left `statement` undefined for P.run()
        raise ValueError("no GTF records found in %s" % infile)

    if has_ensembl_ids:
        statement = '''zcat %(infile)s | grep -v "#"
                        | python %(scriptsdir)s/gtf2gtf.py 
                        --sort=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
    else:
        gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
        transcript_pattern = gene_pattern.replace("GEN", "TRAN")
        # "%%i" survives interpolation as "%i" for gtf2gtf.py
        statement = '''zcat %(infile)s | python %(scriptsdir)s/gtf2gtf.py 
                           --renumber-genes=%(gene_pattern)s%%i 
                           | python %(scriptsdir)s/gtf2gtf.py
                           --renumber-transcripts=%(transcript_pattern)s%%i 
                           | python %(scriptsdir)s/gtf2gtf.py
                           --sort=gene 
                           --log=%(outfile)s.log
                          | gzip > %(outfile)s'''

    P.run()
def runSpades(infile, outfile):
    '''
    run spades on each track

    The command is built by
    ``PipelineMetagenomeAssembly.Spades().build(infile)``; *outfile* is
    not referenced in this function.
    '''
    # presumably a cluster resource request picked up by P.run()
    job_options = " -l mem_free=30G"
    statement = PipelineMetagenomeAssembly.Spades().build(infile)
    P.run()
Пример #26
0
def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications

    For each transcript in the GTF *infile* (expected to end in
    ``.gtf.gz``) the transcript_id, gene_id and source of its first entry
    are written to a temporary file, which is loaded with csv2db.py under
    the header ``transcript_id,gene_id,class``. The table name is derived
    from *infile* without its suffix. csv2db.py output goes to *outfile*,
    its log to ``<outfile>.log``. The temporary file is removed after
    loading.
    '''
    tablename = os.path.basename(
        filenameToTablename(P.snip(infile, ".gtf.gz")))

    to_cluster = False
    # just load each transcript with its classification
    temp = P.getTempFile()
    inf = IOTools.openFile(infile)
    try:
        for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
            temp.write("%s\t%s\t%s\n" % (
                transcript[0].transcript_id,
                transcript[0].gene_id,
                transcript[0].source))
    finally:
        # release both handles even if iteration fails
        inf.close()
        temp.close()

    # referenced by name in the statement template
    inf_1 = temp.name
    statement = ("python %(scriptsdir)s/csv2db.py"
                 "  -t %(tablename)s"
                 "  --log=%(outfile)s.log"
                 "  --header=transcript_id,gene_id,class"
                 " < %(inf_1)s > %(outfile)s")
    P.run()
    # the temporary file was previously never removed
    os.unlink(inf_1)
def runSoapdenovo(infile, outfile):
    '''
    run soapdenovo

    The command is built by
    ``PipelineMetagenomeAssembly.SoapDenovo2().build(infile)``; *outfile*
    is not referenced in this function.
    '''
    # presumably a cluster resource request picked up by P.run()
    job_options = "-l mem_free=30G"
    statement = PipelineMetagenomeAssembly.SoapDenovo2().build(infile)
    P.run()
def runIdba(infile, outfile):
    '''
    run idba on each track

    The command is built by
    ``PipelineMetagenomeAssembly.Idba().build(infile)``; *outfile* is not
    referenced in this function.
    '''
    # presumably a cluster resource request picked up by P.run()
    job_options = " -l mem_free=30G"
    statement = PipelineMetagenomeAssembly.Idba().build(infile)
    P.run()
def buildAnnotations( infiles, outfile ):
    '''annotate transcripts by location (intergenic, intronic, ...)

    *infiles* is a pair ``(infile, annotation)``. The gzipped *infile* is
    sorted by gene and split on column 1 by the ``cmd-farm`` command; each
    part is processed by ``gtf2table.py`` with the position, classifier,
    length, splice, composition-na and splice-comparison counters, using
    *annotation* as ``--filename-gff``. The gzipped table is written to
    *outfile*, logs go to ``<outfile>.log``.
    '''
    
    infile, annotation = infiles

    # the quoted gtf2table.py command is an argument to %(cmd-farm)s
    statement = '''gunzip 
    < %(infile)s 
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | %(cmd-farm)s --split-at-column=1 --output-header --log=%(outfile)s.log --max-files=60 
	"python %(scriptsdir)s/gtf2table.py 
		--counter=position 
		--counter=classifier 
		--section=exons 
		--section=introns 
		--counter=length 
		--counter=splice 
		--counter=composition-na 
		--counter=splice-comparison 
		--log=%(outfile)s.log 
                --filename-format=gff
		--filename-gff=%(annotation)s 
		--genome-file=%(genome_dir)s/%(genome)s"
    | gzip
    > %(outfile)s
    '''
    P.run()
Пример #30
0
def buildBenchmarkInput(infile, outfile):
    '''build benchmark input from ``cds.fasta``.

    The distinct (transcript_id, protein_id) pairs of table
    ``peptide_info`` are written to a temporary, tab-separated file with a
    header line. ``extract_fasta.pl`` is run with *infile* as argument and
    ``cds.fasta`` on stdin; its output is piped through
    ``fasta2variants.py --is-cds`` and ``substitute_tokens.py``, which
    applies the temporary mapping file. The result is written to *outfile*
    and the temporary file is removed.
    '''

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
        cc.execute(statement)
        tmpfile.write("transcript_id\tprotein_id\n")
        tmpfile.write("\n".join(["\t".join(x) for x in cc]))
        tmpfile.write("\n")
    finally:
        # close so that buffered data is on disk before the shell command
        # reads the file, and release the database connection
        tmpfile.close()
        dbhandle.close()
    tmpfilename = tmpfile.name

    # fixes: "scripstdir" placeholder typo and the missing pipe between
    # extract_fasta.pl and fasta2variants.py
    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta 
    | python %(scriptsdir)s/fasta2variants.py --is-cds  
    | python %(scriptsdir)s/substitute_tokens.py 
             --apply=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
Пример #31
0
def buildFullGeneSet(infiles, outfile):
    '''
    produces a final gene set that can be used for 
    differential expression analysis and comparisons
    between protein coding and lncRNA transcripts

    All gzipped files in *infiles* are concatenated, every occurrence of
    "Cufflinks" is replaced by "protein_coding", and the result is sorted
    by gene with gtf2gtf.py and gzipped into *outfile*. A log is written to
    ``<outfile>.log``.
    '''
    # change the source to be in keeping with classification
    # of transcripts - f coming from cufflinks assembly
    # NOTE(review): sed replaces the text anywhere on a line, not only in
    # the source column.
    infs = " ".join(infiles)
    statement = ("zcat %(infs)s |"
                 " sed 's/Cufflinks/protein_coding/g' |"
                 " python %(scriptsdir)s/gtf2gtf.py"
                 "  --sort=gene"
                 "  --log=%(outfile)s.log |"
                 " gzip  > %(outfile)s")
    P.run()
Пример #32
0
def lowerStringencyDeNovos(infiles, outfile):
    '''Filter lower stringency de novo variants based on provided jexl expression.

    *infiles* is a pair ``(infile, pedfile)``. *pedfile* is a tab-separated
    pedigree with the columns family, sample, father, mother, sex and
    status. The sample, father and mother of the last row with status "2"
    are inserted into a GATK ``SelectVariants`` expression that is applied
    to *infile*; the selected variants are written to *outfile*.

    Raises ValueError if *pedfile* has no row with status "2".
    '''
    to_cluster = USECLUSTER
    infile, pedfile = infiles
    father = mother = child = None
    with open(pedfile) as pedf:
        pedigree = csv.DictReader(
            pedf,
            delimiter='\t',
            fieldnames=['family', 'sample', 'father', 'mother', 'sex', 'status'])
        for row in pedigree:
            if row['status'] == '2':
                father = row['father']
                mother = row['mother']
                child = row['sample']
    if child is None:
        # previously surfaced as a KeyError from the interpolation below
        raise ValueError(
            "no affected sample (status 2) found in %s" % pedfile)
    # local names are filled in immediately; the escaped %%(...)s
    # placeholders are left for P.run() to resolve
    statement = '''GenomeAnalysisTK -T SelectVariants -R %%(bwa_index_dir)s/%%(genome)s.fa --variant %(infile)s -select 'vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().0==0&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")' > %(outfile)s''' % locals(
    )
    P.run()
Пример #33
0
def alignContigsToReference(infile, outfile, param):
    '''
    align the contigs to the reference genomes
    using nucmer

    *infile* is used as the reference and *param* as the contig file.
    nucmer is run with the prefix taken from the base name of *outfile*
    without ``.delta``; the file named after the base name of *outfile* is
    then moved into ``alignment.dir``.
    '''
    # debugging output (Python 2 print statement)
    print infile, param

    to_cluster = True

    reffile, contigfile = infile, param
    pattern = P.snip(os.path.basename(outfile), ".delta")
    statement = '''nucmer -p %(pattern)s %(reffile)s %(contigfile)s'''
    P.run()
    # nucmer's output is expected in the current directory under the base name
    outf = os.path.basename(outfile)
    statement = '''mv %(outf)s alignment.dir'''
    P.run()
Пример #34
0
def filterVariants(infiles, outfile):
    '''Filter variants based on provided jexl expression.

    *infiles* is a pair ``(infile, pedfile)``. *pedfile* is a tab-separated
    pedigree with the columns family, sample, father, mother, sex and
    status. The sample, father and mother of the last row with status "2"
    are inserted into a GATK ``SelectVariants`` expression that is applied
    to *infile*; the selected variants are written to *outfile*.

    Raises ValueError if *pedfile* has no row with status "2".
    '''
    to_cluster = USECLUSTER
    infile, pedfile = infiles
    father = mother = child = None
    # open the pedigree file itself; the original passed the uninterpolated
    # literal "%(pedfile)s" to open()
    with open(pedfile) as pedf:
        pedigree = csv.DictReader(
            pedf,
            delimiter='\t',
            fieldnames=['family', 'sample', 'father', 'mother', 'sex', 'status'])
        for row in pedigree:
            if row['status'] == '2':
                father = row['father']
                mother = row['mother']
                child = row['sample']
    if child is None:
        raise ValueError(
            "no affected sample (status 2) found in %s" % pedfile)
    # local names are filled in immediately; the escaped %%(...)s
    # placeholders are left for P.run() to resolve
    statement = '''GenomeAnalysisTK -T SelectVariants -R %%(bwa_index_dir)s/%%(genome)s.fa --variant %(infile)s -select 'vc.getGenotype("%(father)s").getDP()>=10&&vc.getGenotype("%(mother)s").getDP()>=10&&vc.getGenotype("%(father)s").getAB()<0.05&&vc.getGenotype("%(mother)s").getAB()<0.05&&vc.getGenotype("%(child)s").getAB()>=0.25&&vc.getGenotype("%(child)s").getPL().0>20&&vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(child)s").getPL().2>0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(father)s").getPL().1>20&&vc.getGenotype("%(father)s").getPL().2>20&&vc.getGenotype("%(mother)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().1>20&&vc.getGenotype("%(mother)s").getPL().2>20&&vc.getGenotype("%(child)s").getAD().1>=3' > %(outfile)s''' % locals(
    )
    P.run()
Пример #35
0
def loadOverlap(infile, outfile):
    '''load results of overlap computation.

    Lines of *infile* containing the word ``na`` are dropped; the rest is
    loaded with csv2db.py into the table named after *outfile* without its
    ``_table.load`` suffix. Columns set1 and set2 are mapped to strings and
    indexed. csv2db.py output is written to *outfile*.
    '''

    tablename = outfile[:-len("_table.load")]
    statement = '''
	grep -v "\\bna\\b" 
        < %(infile)s 
        |python %(scriptsdir)s/csv2db.py %(csv2db_options)s
             --map set1:str 
             --map set2:str 
             --index=set1 
             --index=set2 
             --table=%(tablename)s
        > %(outfile)s
    '''
    P.run()
Пример #36
0
def runFrameFinder(infile, outfile):
    '''run FrameFinder

    search on both strands (-r TRUE). Note that CPC default is: only forward strand.

    *infile* is piped into framefinder, which reads it from /dev/stdin
    with the model ``data/framefinder.model`` of the CPC installation;
    the gzipped output is written to *outfile*.
    '''
    # NOTE(review): hard-coded installation path
    cpc_dir = "/ifs/apps/bio/cpc-0.9-r2"
    statement = '''
    cat %(infile)s |
    %(cpc_dir)s/libs/estate/bin/framefinder
    -r TRUE -w %(cpc_dir)s/data/framefinder.model /dev/stdin
    | gzip
     > %(outfile)s
    '''

    P.run()
Пример #37
0
def buildCodingExons( infile, outfile ):
    '''build a collection of transcripts from the protein-coding portion of the ENSEMBL gene set.

    All exons are kept

    Lines of the gzipped *infile* whose second column is "protein_coding"
    and third column is "exon" are passed through
    ``gtf2gtf.py --remove-duplicates=gene`` and gzipped into *outfile*.
    A log is written to ``<outfile>.log``.
    '''

    to_cluster = True

    statement = '''
    gunzip < %(infile)s 
    | awk '$2 == "protein_coding"' 
    | awk '$3 == "exon"' 
    | python %(scriptsdir)s/gtf2gtf.py --remove-duplicates=gene --log=%(outfile)s.log 
    | gzip > %(outfile)s
    '''
    P.run()
Пример #38
0
def makeDistances(infiles, outfile):
    '''compute distances of transcripts to genes.

    *infiles* is a pair ``(infile, annotation)``, both gzipped. *infile* is
    sorted by gene and split on column 1 by the ``cmd-farm`` command; each
    part is processed by ``gtf2table.py --counter=distance-genes`` with
    the decompressed *annotation* as ``--filename-gff``. The table is
    written uncompressed to *outfile*, logs go to ``<outfile>.log``.
    '''

    infile, annotation = infiles

    # the quoted gtf2table.py command is an argument to %(cmd-farm)s
    statement = '''gunzip
    < %(infile)s 
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | %(cmd-farm)s --split-at-column=1 --output-header --log=%(outfile)s.log --max-files=60 
	"python %(scriptsdir)s/gtf2table.py 
		--counter=distance-genes 
		--log=%(outfile)s.log 
		--filename-gff=<( gunzip < %(annotation)s ) " 
    > %(outfile)s 
    '''
    P.run()
def buildCodingGeneSet(infile, outfile):
    '''build a gene set with only protein coding 
    transcripts.

    Genes are selected via their gene biotype in the GTF file.
    Note that this set will contain all transcripts of protein
    coding genes, including processed transcripts.

    This set includes UTR and CDS.

    The selection keeps the lines of the gzipped *infile* whose second
    column equals "protein_coding" and gzips them into *outfile*.
    '''

    to_cluster = True
    statement = '''
    zcat %(infile)s | awk '$2 == "protein_coding"' | gzip > %(outfile)s
    '''
    P.run()
Пример #40
0
def convertBed2Psl(infile, outfile):
    """convert a bed to a psl file.

    The genome is looked up as ``PARAMS["<track>_genome"]`` inside
    ``PARAMS["genome_dir"]``, where ``<track>`` is *outfile* with the last
    ``len(".bed.gz")`` characters removed. The gzipped *infile* is
    converted with bed2psl.py and gzipped into *outfile*; a log is written
    to ``<outfile>.log``.

    Raises IOError if ``<genomefile>.fasta`` does not exist.
    """

    # NOTE(review): the suffix is stripped from outfile, not infile - confirm
    track = outfile[:-len(".bed.gz")]
    genomefile = os.path.join(PARAMS["genome_dir"],
                              PARAMS["%s_genome" % track])
    if not os.path.exists(genomefile + ".fasta"):
        raise IOError("genome %s does not exist" % genomefile)

    statement = """gunzip < %(infile)s 
    | python %(scriptsdir)s/bed2psl.py 
         --genome=%(genomefile)s
         --log=%(outfile)s.log 
    | gzip > %(outfile)s
    """
    P.run()
def buildAnnotatorSegments(tmpdir, infile, outfile):
    '''convert segments in bed format to annotator format
    from infile to outfile.

    The converted segments are written to ``<tmpdir>/segments``; *outfile*
    is only used to name the log file ``<outfile>.log``.

    Returns the path of the segments file.
    '''

    tmpsegments = os.path.join(tmpdir, "segments")
    to_cluster = True

    statement = '''
        python %(scriptsdir)s/bed2gff.py < %(infile)s |\
	python %(scriptsdir)s/gff2annotator.py --log=%(outfile)s.log --section=segments > %(tmpsegments)s \
    '''

    # locals and PARAMS are merged explicitly (Python 2 list concatenation
    # of items()); on duplicate keys the PARAMS entry wins.
    P.run(**dict(locals().items() + PARAMS.items()))

    return tmpsegments
Пример #42
0
def mergeDMRWindows(infile, outfile):
    '''merge overlapping windows.

    The gzipped *infile* is piped through ``medip_merge_intervals.py
    --invert``; its stdout is gzipped into *outfile*. Additional files
    follow the pattern ``<outfile>.<name>.bed.gz`` and a log is written to
    ``<outfile>.log``.
    '''

    to_cluster = True

    # "%%s" survives interpolation as a literal "%s" for the script
    statement = '''
    zcat %(infile)s
    | python %(scriptsdir)s/medip_merge_intervals.py
          --log=%(outfile)s.log
          --invert
          --output-filename-pattern=%(outfile)s.%%s.bed.gz
    | gzip
    > %(outfile)s
    '''

    P.run()
Пример #43
0
def reportTotalRNAFunctions(infiles, outfiles):
    '''report total RNA functions.

    *infiles* is a pair ``(rpkm_filename, annotations_filename)`` and
    *outfiles* a pair ``(expression_filename, diff_filename)``. All four
    names are passed, in that order, as positional arguments to
    ``report_totalRNA_annotations.py``.
    '''

    to_cluster = USECLUSTER

    rpkm_filename, annotations_filename = infiles
    expression_filename, diff_filename = outfiles
    # `rmaadir` is not a local; presumably it is resolved from PARAMS
    statement = '''
    python %(rmaadir)s/report_totalRNA_annotations.py 
           %(rpkm_filename)s 
           %(annotations_filename)s 
           %(expression_filename)s 
           %(diff_filename)s
    '''

    P.run()
Пример #44
0
def copyEnsemblDb(infile, outfile):
    '''Copy tables from the ensembl database into the rnaseq database.

    Every table listed in ``PARAMS["ensembl_tables"]`` is copied with
    ``CREATE TABLE ... AS SELECT`` from the database file
    ``PARAMS["ensembl_db"]`` (attached under the schema name ``ensembl``)
    into the database ``PARAMS["database"]``.

    ``infile`` is not read.  ``outfile`` is touched once all tables have
    been copied.  Any sqlite3 error propagates to the caller; the
    connection is closed in either case.
    '''
    table_list = P.asList(PARAMS["ensembl_tables"])
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        # One cursor is enough for the ATTACH and all the copies.
        cc = dbhandle.cursor()
        query = """ATTACH "%s" as ensembl;""" % PARAMS["ensembl_db"]
        cc.execute(query)
        for table in table_list:
            # Table names are identifiers and cannot be bound as SQL
            # parameters; they come from the pipeline configuration.
            query = """CREATE TABLE %s AS SELECT * FROM ensembl.%s;""" % (
                table, table)
            # Parenthesised single-argument form prints identically under
            # Python 2 and is valid Python 3.
            print(query)
            cc.execute(query)
        cc.close()
        dbhandle.commit()
    finally:
        # Release the database file even if a copy fails.
        dbhandle.close()
    # Placeholder is expanded later, presumably by P.run() from locals.
    statement = """touch %(outfile)s;"""
    P.run()
Пример #45
0
def buildGeneTables(infile, outfile):
    '''Build a gzipped gene table in ``outfile`` from ``infile``.

    If ``infile`` ends with ``.gff.gz`` its lines are copied unchanged
    into ``outfile`` behind a tab-separated header line naming the nine
    gff columns.  Otherwise ``infile`` is piped through fasta2table.py
    (``-s sequence``) and the result is gzipped into ``outfile``.
    '''
    if infile.endswith(".gff.gz"):
        # Open the input first so that a missing or unreadable input does
        # not leave an empty output file behind, and stream line by line
        # instead of loading the whole file into memory.  Both handles
        # are closed even if copying fails.
        with gzip.open(infile, "rb") as inf:
            with gzip.open(outfile, "wb") as outf:
                # Bytes literal: the gzip handles are binary, so this works
                # on Python 3 as well (b"" is plain str on Python 2).
                outf.write(
                    b"chr\tsource\tfeature\tstart\tend\tscore\tstrand\tframe\tattributes\n")
                for line in inf:
                    outf.write(line)
    else:
        # Placeholders are expanded later, presumably by P.run().
        statement = '''zcat %(infile)s | python %(scriptsdir)s/fasta2table.py
        -s sequence
        --log=%(outfile)s.log | gzip > %(outfile)s'''
        P.run()
Пример #46
0
def runMACS( infile, outfile ):
    '''Run macs on ``infile`` against its control track.

    The track name is ``infile`` with its last ``len("normbam")``
    characters removed; the control bam is
    ``pipeline_vitaminD.getControl(track) + ".bam"``.  If ``getControl``
    raises AssertionError the function returns without running anything
    and without creating ``outfile``.
    '''

    to_cluster = False

    track = infile[:-len("normbam")]
    try:
        control = pipeline_vitaminD.getControl( track ) + ".bam"
    except AssertionError:
        # Presumably: no control is defined for this track - confirm
        # against pipeline_vitaminD.getControl.
        return

    # ">&" sends both stdout and stderr of macs to outfile; outfile is
    # also used as the macs --name prefix.
    statement = '''
    macs -t %(infile)s -c %(control)s \
          --name=%(outfile)s \
          --format=bam --tsize=35 --bw=110 --mfold=8 --gsize=6000000 >& %(outfile)s''' 

    # Locals and PARAMS are merged and passed explicitly (Python 2 list
    # concatenation of .items()).
    P.run( **dict( locals().items() + PARAMS.items() ) )
Пример #47
0
def assignEssentialGenesToContigs(infile, outfile):
    '''Assign essential genes to contigs.

    The gzipped ``infile`` is unpacked into a temporary directory and
    searched with hmmsearch using the HMM file ``PARAMS["hmmer_hmm"]``.
    From the tabular hmmsearch output, the first three lines are skipped,
    runs of spaces are collapsed and columns 1 and 4 are written, gzipped,
    to ``outfile``.  The temporary directory is removed afterwards.
    '''
    # Not referenced by either statement below.
    dirname = os.path.dirname(infile)
    essential = PARAMS["hmmer_hmm"]
    tempdir = P.getTempDir(".")

    # Placeholders are expanded later, presumably by P.run() from locals.
    statement = '''zcat %(infile)s > %(tempdir)s/orfs.fa;
    hmmsearch --tblout %(tempdir)s/hmm.out --cut_tc
    --notextw  %(essential)s %(tempdir)s/orfs.fa;
    tail -n+4 %(tempdir)s/hmm.out | sed 's/ * / /g' | cut -f 1,4 -d " "
    | gzip > %(outfile)s'''
    P.run()
    # Clean-up runs as a second job; it is not reached if the first
    # P.run() raises.
    statement = '''rm -rf %(tempdir)s'''
    P.run()
Пример #48
0
    def buildGenomeAlignment(infile, outfile):
        '''Remove non-unique alignments from the gzipped psl ``infile``.

        The input is sorted on columns 10 and 12 and passed through
        psl2psl.py with ``--method=remove-overlapping-query``, then
        sorted on columns 14 and 16 and passed through psl2psl.py with
        ``--method=remove-overlapping-target``.  The gzipped result is
        appended (``>>``) to ``outfile``; both psl2psl.py calls are given
        the same log file ``<outfile>.log``.
        '''

        # Placeholders are expanded later, presumably by P.run().
        statement = '''gunzip < %(infile)s 
        | sort -k10,10 -k12,12n
        | python %(scriptsdir)s/psl2psl.py
        --method=remove-overlapping-query
        --log=%(outfile)s.log
        | sort -k14,14 -k16,16n
        | python %(scriptsdir)s/psl2psl.py
        --method=remove-overlapping-target
        --log=%(outfile)s.log
        | gzip
        >> %(outfile)s
        '''
        P.run()
Пример #49
0
def loadPolyphenMap(infile, outfile):
    '''Load polyphen input data into a database table.

    The data is read from ``<infile>.map`` (not from ``infile`` itself)
    and loaded with csv2db.py into the table named by
    ``P.toTable(outfile)``, with indices on snp_id, (track,
    transcript_id), (contig, pos), protein_id and transcript_id.  The
    stdout of csv2db.py is written to ``outfile``.
    '''

    table = P.toTable(outfile)
    # Placeholders are expanded later, presumably by P.run() from the
    # locals above and PARAMS (e.g. csv2db_options).
    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --index=snp_id 
              --index=track,transcript_id
              --index=contig,pos
              --index=protein_id
              --index=transcript_id
              --table=%(table)s 
    < %(infile)s.map
    > %(outfile)s
    '''
    P.run()
def createRealignIntervals(infiles, outfile):
    '''Run GATK RealignerTargetCreator and write its output to ``outfile``.

    ``infiles`` must unpack into exactly two names: the input file
    passed to ``-I`` and the reference passed to ``-R``.
    '''

    infile, reference = infiles

    # Java needs to be unloaded before running GATK, as GATK now runs on
    # Java version 7.

    # Unlike most statements in this file, this one is interpolated
    # immediately with locals(); P.run() receives the expanded text.
    statement = '''module unload apps/java/jre1.6.0_26;
    java -Xmx4g -jar
    /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
    -T RealignerTargetCreator
    -R %(reference)s
    -I %(infile)s
    -o %(outfile)s
    ''' % locals()

    P.run()
def mapReadsWithBowtieAgainstTranscriptome(infiles, outfile):
    '''Map reads with bowtie against transcriptome data.

    ``infiles`` must unpack into exactly three names: the gzipped reads
    file, the transcriptome reference file (``.fa``) and the contigs
    file handed to ``samtools import``.  ``outfile`` is the sorted bam
    file (``.bam``); it is indexed after sorting.

    The reads are unpacked into a temporary file, which is removed at
    the end of the statement.
    '''

    # Mapping permits up to one mismatch. This is sufficient as the
    # downstream filter in bams2bam requires the number of mismatches to
    # be less than the genomic number of mismatches.
    # Change this if the number of permitted mismatches for the genome
    # increases.

    # Output all valid matches in the best stratum. This will
    # inflate the file sizes due to matches to alternative transcripts
    # but otherwise matches to paralogs will be missed (and such
    # reads would be filtered out).

    # job_options / job_threads are not used in this function body;
    # presumably P.run() picks them up from the caller's locals.
    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

    # Fixes relative to the previous version of this statement:
    # * "samtools index" is now terminated with ";" like every other
    #   command, so the following "checkpoint" is no longer appended to
    #   its argument list.
    # * the perl replacement text no longer starts with "\b": inside a
    #   replacement that escape is a backspace character, not a word
    #   boundary, and ended up in the @HD header line.
    # * "\S" is written as "\\S"; the resulting command text is the same
    #   but no longer relies on an invalid Python string escape.
    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --un /dev/null
           --threads %(bowtie_threads)s
           %(transcriptome_options)s
           --best --strata -a
           %(prefix)s_cs
           %(tmpfile)s
    | python %(scriptsdir)s/bam2bam.py --sam --set-nh --log=%(outfile)s.log
    | perl -p -e "if (/^\\@HD/) { s/\\bSO:\\S+/SO:coordinate/}"
    | samtools import %(contigs)s - -
    | samtools sort - %(track)s;
    checkpoint;
    samtools index %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    '''

    P.run()
Пример #52
0
def buildCDNAFasta( infile, outfile ):
    '''Index an ENSEMBL cdna FASTA file.

    *infile* is a gzipped ENSEMBL cdna file.  Header lines are truncated
    at the first space and the sequences are piped into index_fasta.py,
    which builds the database ``dbname`` (*outfile* with its last
    ``len(".fasta")`` characters removed).  The stdout of index_fasta.py
    is written to ``<dbname>.log``.
    '''
    dbname = outfile[:-len(".fasta")]

    # The perl guard is a real regex match on header lines (/^>/).  It
    # used to be the constant string "^>", which is always true, so the
    # substitution was applied to every line instead of headers only.
    statement = '''gunzip 
    < %(infile)s
    | perl -p -e 'if (/^>/) { s/ .*//};'
    | python %(scriptsdir)s/index_fasta.py
       --force
    %(dbname)s - 
    > %(dbname)s.log
    '''

    P.run()
Пример #53
0
def buildTileStats(infile, outfile):
    '''Compute tiling window size statistics from a gzipped bed file.

    ``infile`` is streamed through gff2histogram.py (bed format, size
    data, methods ``hist`` and ``stats``).  Its stdout goes to
    ``outfile`` and it is given ``<outfile>.%s.tsv`` as the output
    filename pattern.
    '''

    # NOTE(review): the other functions in this file set ``to_cluster``;
    # ``use_cluster`` may not be a name that P.run() looks at - confirm.
    use_cluster = True

    # "%%s" keeps a literal "%s" after %-interpolation of the statement
    # (presumably performed by P.run()).
    statement = '''
    zcat %(infile)s
    | python %(scriptsdir)s/gff2histogram.py 
                   --force
                   --format=bed 
                   --data=size
                   --method=hist
                   --method=stats
                   --output-filename-pattern=%(outfile)s.%%s.tsv
    > %(outfile)s
    '''
    P.run()
Пример #54
0
def exportSequences(infile, outfile):
    '''Collect sequences for the entries of a gzipped gtf file.

    ``infile`` is sorted by gene with gtf2gtf.py, converted to fasta
    with gff2fasta.py and piped into index_fasta.py, which builds the
    database ``prefix`` (``outfile`` with its last ``len(".fasta")``
    characters removed).
    '''

    prefix = outfile[:-len(".fasta")]

    to_cluster = True
    # NOTE(review): gff2fasta.py logs to <outfile>.log and the stdout of
    # index_fasta.py is redirected to the same path - confirm that this
    # sharing is intended.
    statement = '''gunzip 
        < %(infile)s
        | python %(scriptsdir)s/gtf2gtf.py --sort=gene
	| python %(scriptsdir)s/gff2fasta.py 
		--is-gtf 
		--genome-file=%(genome_dir)s/%(genome)s
		--log=%(outfile)s.log 
	| python %(toolsdir)s/index_fasta.py --force %(prefix)s - 
        > %(outfile)s.log'''

    P.run()
Пример #55
0
def buildPeptideFasta( infile, outfile ):
    '''Index an ENSEMBL peptide FASTA file.

    *infile* is a gzipped ENSEMBL .pep.all.fa.gz file.  Header lines are
    truncated at the first space and the sequences are piped into
    index_fasta.py, which builds the database ``dbname`` (*outfile* with
    its last ``len(".fasta")`` characters removed).  The stdout of
    index_fasta.py is written to ``<dbname>.log``.
    '''
    dbname = outfile[:-len(".fasta")]

    # The perl guard is a real regex match on header lines (/^>/).  It
    # used to be the constant string "^>", which is always true, so the
    # substitution was applied to every line instead of headers only.
    statement = '''gunzip 
    < %(infile)s
    | perl -p -e 'if (/^>/) { s/ .*//};'
    | python %(scriptsdir)s/index_fasta.py
       --force
    %(dbname)s - 
    > %(dbname)s.log
    '''

    P.run()
Пример #56
0
def annotateVariantsSNPsift(infile, outfile):
    '''Add annotations to a vcf file using SnpSift.

    Two commands are chained: ``SnpSift.sh dbnsfp`` with the database
    ``PARAMS["annotation_snpsift_dbnsfp"]`` writes the intermediate file
    ``variants/<track>_temp1.vcf``, then ``SnpSift.sh annotate`` against
    a fixed 1000Genomes vcf writes ``outfile``.  ``track`` is the
    basename of ``infile`` with ``.vqsr.vcf`` snipped off.  The
    intermediate file is not removed.
    '''
    to_cluster = USECLUSTER
    job_options = "-pe dedicated 4 -R y -l mem_free=6G"
    track = P.snip(os.path.basename(infile), ".vqsr.vcf")
    dbNSFP = PARAMS["annotation_snpsift_dbnsfp"]
    # The following statement is not fully implemented yet
    #    statement = '''SnpSift.sh geneSets -v /ifs/projects/proj016/data/1000Genomes/msigdb.v4.0.symbols.gmt %(infile)s > variants/%(track)s_temp1.vcf; checkpoint;''' % locals()

    # Both parts are interpolated immediately with locals(); P.run()
    # receives the already expanded command text.
    statement = '''SnpSift.sh dbnsfp -v %(dbNSFP)s %(infile)s
    > variants/%(track)s_temp1.vcf; checkpoint;''' % locals()

    statement += '''SnpSift.sh annotate /ifs/projects/proj016/data/1000Genomes/00-All.vcf
    variants/%(track)s_temp1.vcf > %(outfile)s ;''' % locals()
    # Clean-up of the temporary vcf files is currently disabled:
    #    statement += '''rm -f variants/*temp*vcf;'''

    P.run()
Пример #57
0
def loadPolyphen(infile, outfile):
    '''Load polyphen results into a database table.

    The gzipped ``infile`` is cleaned with perl (``o_acc`` renamed to
    ``protein_id``, all runs of spaces removed, a leading ``#``
    stripped) and loaded with csv2db.py into the table named by
    ``P.toTable(outfile)``, indexed on snp_id and protein_id, with the
    ``effect`` column mapped to ``str``.  The stdout of csv2db.py is
    written to ``outfile``.
    '''

    table = P.toTable(outfile)

    # Placeholders are expanded later, presumably by P.run() from the
    # locals above and PARAMS (e.g. csv2db_options).
    statement = '''
    gunzip 
    < %(infile)s
    | perl -p -e "s/o_acc/protein_id/; s/ +//g; s/^#//;"
    |python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --index=snp_id 
              --index=protein_id
              --table=%(table)s 
              --map=effect:str
    > %(outfile)s
    '''
    P.run()
Пример #58
0
def mergeGeneLists(infiles, outfile):
    '''Merge gene lists into a single table and load it into SQLite.

    For every file in ``infiles`` a track name is derived from its
    basename (``.genelist.load`` snipped off, ``-`` and ``.`` replaced
    by ``_``); the first two characters of the track are used as the
    species.  Protein-coding genes from ``<track>_genelist`` joined to
    ``<species>.transcript_info`` on ``PARAMS["genelist_id"]`` are
    combined with UNION into the table named by ``P.toTable(outfile)``.
    That table is dropped first if it exists and is indexed on
    ``gene_id`` and ``species`` afterwards.  ``outfile`` is touched at
    the end.

    The annotation databases in ``PARAMS["annotations_db"]`` are
    attached under the names in ``PARAMS["species"]`` (paired by
    position).

    Raises ValueError if ``infiles`` is empty; sqlite3 errors propagate.
    '''
    # Fail before touching the database: with no inputs there is no
    # SELECT to build, and the existing table would only be dropped.
    if not infiles:
        raise ValueError("no gene lists to merge")

    tablename = P.toTable(outfile)
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    species_lookup = dict(zip(species_list, anno_list))
    genelist_id = PARAMS["genelist_id"]

    # One SELECT per input file; joined with UNION below.
    selects = []
    for f in infiles:
        track = P.snip(os.path.basename(f),
                       ".genelist.load").replace("-", "_").replace(".", "_")
        species = track[:2]
        selects.append(
            '''SELECT distinct t.gene_id, t.gene_name, "%(species)s" AS species
                       FROM %(track)s_genelist g, %(species)s.transcript_info t
                       WHERE g.gene_id=t.%(genelist_id)s and t.gene_biotype='protein_coding' ''' % {
                "species": species,
                "track": track,
                "genelist_id": genelist_id,
            })
    create_sql = "CREATE TABLE %s AS " % tablename + " UNION ".join(selects)

    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        # Attach every annotation database under its species name.
        # Database paths and schema names are identifiers taken from the
        # pipeline configuration and cannot be bound as SQL parameters.
        for species, species_db in species_lookup.items():
            attach_sql = "ATTACH DATABASE '%s' as %s" % (species_db, species)
            print(attach_sql)
            cc = dbhandle.cursor()
            cc.execute(attach_sql)
            cc.close()

        print(create_sql)
        cc = dbhandle.cursor()
        cc.execute("DROP TABLE IF EXISTS %s" % tablename)
        cc.execute(create_sql)
        cc.execute('''CREATE INDEX "glm_idx1" ON "%s" ("gene_id" ASC) ''' %
                   tablename)
        cc.execute('''CREATE INDEX "glm_idx2" ON "%s" ("species" ASC) ''' %
                   tablename)
        cc.close()
        dbhandle.commit()
    finally:
        # Release the database file even if one of the statements fails.
        dbhandle.close()

    # P.run() presumably picks up ``statement`` from this function's
    # locals, so the shell command must be bound to that exact name.
    statement = "touch %s" % outfile
    P.run()
Пример #59
0
def loadRepeatsRates(infile, outfile):
    '''Load repeat overlap data into a database table.

    Rows of the gzipped ``infile`` whose fourth column is greater than 0
    are kept, the columns ``exons_lengths`` and ``exons_values`` are
    removed with csv_cut.py, and the rest is loaded with csv2db.py into
    the table named by ``outfile`` with its last ``len(".load")``
    characters removed.  The stdout of csv2db.py is written to
    ``outfile``.
    '''

    table = outfile[:-len(".load")]

    # Placeholders are expanded later, presumably by P.run() from the
    # locals above and PARAMS (e.g. csv2db_options).
    statement = '''gunzip 
    < %(infile)s 
    | awk '$4 > 0'
    | python %(toolsdir)s/csv_cut.py --remove exons_lengths exons_values
    |python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --index=gene_id 
              --map=gene_id:str 
              --table=%(table)s 
              --allow-empty
    > %(outfile)s'''

    P.run()
Пример #60
0
def loadSegments(infile, outfile):
    '''Load segment statistics into database tables.

    For each of six suffixes (distances, sizes and overlaps, with and
    without the ``_genes`` prefix) the file ``<infile><suffix>`` is
    loaded with csv2db.py into the table ``<table><suffix>``, where the
    dot in the suffix is replaced by an underscore and ``table`` is
    ``outfile`` with its last ``len(".load")`` characters removed.  The
    stdout of every csv2db.py run is appended to ``outfile``.
    '''

    table = outfile[:-len(".load")]

    for x in (".distances", ".sizes", ".overlaps", "_genes.distances",
              "_genes.sizes", "_genes.overlaps"):
        # Table names cannot contain dots: ".sizes" -> "_sizes".
        y = re.sub("\.", "_", x)
        # ">>" appends, so outfile accumulates the output of all six
        # loads instead of being overwritten on each pass.
        statement = '''
        python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
        --index=gene_id 
        --map=gene_id:str 
        --table=%(table)s%(y)s 
        < %(infile)s%(x)s
        >> %(outfile)s'''

        P.run()