Exemplo n.º 1
0
    def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
        '''Build a pairwise genomic alignment from maf files.

        Every file in `infiles` is decompressed, piped through
        ``cgat maf2psl`` and ``cgat psl2psl`` and appended, gzip-compressed,
        to `outfile`.

        Arguments
        ---------
        infiles : list
            Filenames of gzip-compressed maf files.
        outfile : string
            Output filename. An existing file is removed first because the
            result of every input is appended to it.
        track : string
            Passed to maf2psl as ``--query`` and used to look up
            ``PARAMS["<track>_genome"]``.
        '''

        # Start from scratch: the loop below appends (>>) to outfile.
        try:
            os.remove(outfile)
        except OSError:
            pass

        genomefile = PARAMS["%s_genome" % track]

        # Not referenced again here; presumably a job setting that P.run()
        # reads from this frame -- confirm in P.
        to_cluster = True

        for infile in infiles:

            E.info("adding %s" % infile)

            # %(maf_master)s and %(genome)s are not locals of this function;
            # presumably they are filled from PARAMS -- confirm.
            statement = '''gunzip < %(infile)s
                 | cgat maf2psl
                      --query=%(track)s
                      --target=%(maf_master)s
                      --log=%(outfile)s.log
                 | cgat psl2psl
                      --method=filter-fasta
                      --method=sanitize
                      --queries-tsv-file=%(genomefile)s
                      --target-psl-file=%(genome)s
                      --log=%(outfile)s.log
                 | gzip
                 >> %(outfile)s
                 '''
            # P.run() takes no arguments; presumably it picks up `statement`
            # and the placeholder values from the caller's locals -- confirm.
            P.run()
Exemplo n.º 2
0
def convertPslToChain(infile, outfile):
    '''Convert a gzip-compressed psl file to a chain file.

    see http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver

    The pipeline first writes a sorted chain to
    ``<outfile>.sorted.chain.gz`` and then runs chainNet/netChainSubset
    on it to produce `outfile` (gzip-compressed).

    Arguments
    ---------
    infile : string
        Input psl file (read through gunzip); also passed to
        extractGenomes() to obtain the target and query names.
    outfile : string
        Output chain file, written through gzip.
    '''

    # Not referenced again here; presumably a job setting read by P.run().
    to_cluster = True

    target, query = extractGenomes(infile)

    # Two temporary files holding the contig sizes that chainNet needs.
    tmpfilename1 = P.getTempFilename(".")
    tmpfilename2 = P.getTempFilename(".")

    writeContigSizes(target, tmpfilename1)
    writeContigSizes(query, tmpfilename2)

    # NOTE(review): netChainSubset reads "<outfile>.sorted.chain" via zcat,
    # while the file written above it is "<outfile>.sorted.chain.gz" --
    # presumably zcat falls back to the .gz name; confirm.
    statement = '''gunzip
    < %(infile)s
    | pslSwap stdin stdout
    | cgat psl2chain --log=%(outfile)s.log
    | chainSort stdin stdout
    | gzip
    > %(outfile)s.sorted.chain.gz;
    checkpoint;
    gunzip < %(outfile)s.sorted.chain.gz
    | chainNet stdin %(tmpfilename1)s %(tmpfilename2)s stdout /dev/null
    | netChainSubset stdin <( zcat %(outfile)s.sorted.chain ) stdout
    | gzip
    > %(outfile)s'''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()

    # NOTE(review): only reached if everything above succeeded; the temp
    # files stay behind when an earlier step raises.
    os.unlink(tmpfilename1)
    os.unlink(tmpfilename2)
Exemplo n.º 3
0
def buildIndirectMaps(infile, outfile, track):
    '''Build a map between query and target, linking
    via intermediate targets.

    The chain of intermediate maps is read from ``PARAMS["<track>_path"]``.
    For every entry a file ``<entry>.over.psl.gz`` must exist; the first one
    is decompressed and each following one is applied with ``pslMap``. The
    result is gzip-compressed into `outfile`.

    Arguments
    ---------
    infile : string
        Not used in the body of this function.
    outfile : string
        Output filename (gzip-compressed).
    track : string
        Used to look up ``PARAMS["<track>_path"]``.

    Raises
    ------
    ValueError
        If one of the required ``.over.psl.gz`` files does not exist.
    '''

    # Not referenced again here; presumably a job setting read by P.run().
    to_cluster = True

    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    # Collected as a list of pipeline stages, joined with " | " below.
    statement = []

    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError(
                "required file %s for %s (stage %i) not exist." % (filename, outfile, stage))

        if stage == 0:
            # first stage only feeds the initial psl into the pipe
            statement.append( '''gunzip < %(filename)s''' % locals() )
        else:
            # later stages map the piped psl through the next map
            statement.append( '''
               pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals() )

    statement.append("gzip")

    # From here on `statement` is a single string, fully interpolated.
    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    # P.run() takes no arguments; presumably it picks up `statement` from
    # the caller's locals -- confirm in P.
    P.run()
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.

    Each input is stripped of comment (``#``) and blank lines. The header
    line of the first usable file becomes the table header, prefixed with a
    ``track`` column; every remaining line of every file is prefixed with
    the track name derived from the filename.

    Arguments
    ---------
    infiles : list
        Filenames ending in ``.dedup.alignstats``. Missing files and files
        without any data lines are skipped with a warning.
    outfile : string
        Receives the output of the load command; the table name is derived
        from it with P.toTable().
    '''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        # Close each input promptly instead of leaking one handle per file.
        with open(f, "r") as inf:
            lines = [x for x in inf if not x.startswith("#") and x.strip()]
        if not lines:
            # Previously lines[0] raised IndexError for an empty stats file.
            E.warn("File %s contains no data" % f)
            continue
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for line in lines[1:]:
            outf.write("%s\t%s" % (track, line))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()

    os.unlink(tmpfilename)
Exemplo n.º 5
0
def spikeInCounts(infiles, outfile):
    '''
    Perform spike-in across a specific range of fold changes or absolute
    count differences.  Counts table generated from original input counts
    data.

    Arguments
    ---------
    infiles : sequence
        ``infiles[0]`` is the counts file (read through zcat) and
        ``infiles[1]`` the design file passed as ``--design-tsv-file``.
    outfile : string
        Output filename, written through gzip; a log goes to
        ``<outfile>.log``.
    '''

    counts_file = infiles[0]
    design_file = infiles[1]

    # %(scriptsdir)s, %(random_seed)i and %(spike_iterations)i are not
    # locals of this function; presumably they come from PARAMS -- confirm.
    statement = '''
    zcat %(counts_file)s |
    python %(scriptsdir)s/counts2counts.py --design-tsv-file=%(design_file)s
            --method="spike" --spike-type="row" --spike-change-bin-max=3.0
            --spike-change-bin-width=0.1  --spike-change-bin-min=0.1
            --spike-initial-bin-width=1
            --spike-initial-bin-min=1 --spike-initial-bin-max=200000
            --spike-minimum=1 --spike-maximum=1000000
            --random-seed=%(random_seed)i
            --spike-iterations=%(spike_iterations)i  -v 5
            --log=%(outfile)s.log
            | gzip > %(outfile)s
    '''

    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
def buildBAMStats(infile, outfile):
    '''Count number of reads mapped, duplicates, etc.

    Runs ``cgat bam2stats`` with `infile` on stdin and stdout redirected to
    `outfile`; additional outputs use the pattern ``<outfile>.%s``.
    '''
    # Not referenced again here; presumably a job setting read by P.run().
    to_cluster = USECLUSTER
    # Not referenced by the statement below.
    scriptsdir = PARAMS["general_scriptsdir"]
    # "%%s" is an escaped percent sign so that a literal "%s" is left for
    # bam2stats after the statement has been interpolated.
    statement = '''cgat bam2stats --force-output 
                   --output-filename-pattern=%(outfile)s.%%s < %(infile)s > %(outfile)s'''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard.

    Runs ``CollectAlignmentSummaryMetrics`` with ``INPUT=infile`` and
    ``OUTPUT=outfile``.
    '''
    # Not referenced again here; presumably a job setting read by P.run().
    to_cluster = USECLUSTER
    # Not referenced by the statement below.
    track = P.snip(os.path.basename(infile), ".bam")
    # The statement is interpolated with locals() right here. "%%" collapses
    # to "%", so "%(samtools_genome)s" survives as a placeholder --
    # presumably filled later by P.run() from PARAMS; confirm.
    statement = '''CollectAlignmentSummaryMetrics INPUT=%(infile)s REFERENCE_SEQUENCE=%%(samtools_genome)s ASSUME_SORTED=true OUTPUT=%(outfile)s VALIDATION_STRINGENCY=SILENT ''' % locals(
    )
    P.run()
Exemplo n.º 8
0
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''Gather BAM file alignment statistics using Picard.

    If the BAM file contains no reads, `outfile` is only touched and no
    metrics are computed. Otherwise the BAM file is piped through
    ``bam2bam.py --method=set-sequence`` into ``CollectMultipleMetrics``.

    Arguments
    ---------
    infile : string
        Input BAM file.
    outfile : string
        Used both as Picard's ``OUTPUT=`` value and as the file that
        receives stdout/stderr of the command (``>&``).
    genome_file : string
        Passed as ``REFERENCE_SEQUENCE``.
    '''

    # Not referenced again here; presumably job settings read by P.run().
    job_options = getPicardOptions()
    job_threads = 3

    if getNumReadsFromBAMFile(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # Picard seems to have problems if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitly.
    statement = '''cat %(infile)s
    | python %(scriptsdir)s/bam2bam.py -v 0
    --method=set-sequence --output-sam
    | CollectMultipleMetrics
    INPUT=/dev/stdin
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals/PARAMS -- confirm in P.
    P.run()
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.

    For every BAM file in `infiles` the companion ``<name>.dupstats`` file
    is read and stripped of comment (``#``) and blank lines. Its first
    remaining line is the header, its second the data row; the row is
    prefixed with the track name derived from the BAM filename.

    The merged table is staged in a temporary file that is removed after
    loading. Earlier versions wrote a fixed ``dupstats.txt`` into the
    working directory and left it behind.

    Arguments
    ---------
    infiles : list
        Filenames ending in ``.dedup.bam``. Inputs whose stats file is
        missing or has no data row are skipped with a warning.
    outfile : string
        Receives the output of the load command; the table name is derived
        from it with P.toTable().
    '''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        # Close each input promptly instead of leaking one handle per file.
        with open(statfile, "r") as inf:
            lines = [x for x in inf if not x.startswith("#") and x.strip()]
        if len(lines) < 2:
            # Previously lines[0]/lines[1] raised IndexError here.
            E.warn("File %s contains no data" % statfile)
            continue
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()

    os.unlink(tmpfilename)
Exemplo n.º 10
0
def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Receives a MAF file containing pairwise alignments and a file
    containing intervals. Outputs a single fasta file containing aligned
    sequence for each interval.

    Arguments
    ---------
    infiles : tuple
        ``(bed_file, maf_file)``; the maf file is decompressed with
        ``gunzip -c`` into a temporary file first.
    outfile : string
        Passed on to PipelineLncRNA.extractMAFGeneBlocks().
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    # Not referenced again here; presumably a job setting read by P.run().
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    # The return value is logged below as a count of gene models.
    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    # NOTE(review): the temporary maf is only removed on success.
    os.unlink(maf_tmp)
    def buildRawGenomeAlignment(infiles, outfile):
        '''Build a pairwise genomic alignment from maf files.

        Every accepted file in `infiles` is decompressed, piped through
        ``maf2psl.py`` and ``psl2psl.py`` and appended, gzip-compressed, to
        `outfile`. Files with "other" or "supercontig" in their name are
        skipped.

        Arguments
        ---------
        infiles : list
            Filenames of gzip-compressed maf files.
        outfile : string
            Output filename. An existing file is removed first because the
            result of every input is appended to it.
        '''

        # Start from scratch: the loop below appends (>>) to outfile.
        try:
            os.remove(outfile)
        except OSError:
            pass

        for infile in infiles:
            # skip maf files without Hsap on top.
            if "other" in infile or "supercontig" in infile:
                continue

            E.info("adding %s" % infile)

            genome_query, genome_target = getGenomes()

            # %(scriptsdir)s, %(maf_name_query)s and %(maf_name_target)s are
            # not locals here; presumably they come from PARAMS -- confirm.
            statement = '''gunzip < %(infile)s 
             | python %(scriptsdir)s/maf2psl.py
                  --query=%(maf_name_query)s
                  --target=%(maf_name_target)s
                  --log=%(outfile)s.log
             | python %(scriptsdir)s/psl2psl.py
                  --method=filter-fasta
                  --method=sanitize
                  --queries-tsv-file=%(genome_query)s
                  --target-psl-file=%(genome_target)s
                  --log=%(outfile)s.log
             | gzip
             >> %(outfile)s
             '''
            # P.run() takes no arguments; presumably it picks up `statement`
            # and the placeholder values from the caller's locals -- confirm.
            P.run()
Exemplo n.º 12
0
    def buildFilteredLncRNAGeneSet(infile, outfile):
        '''
        Filter the lncRNA gene set by exon status.

        Depending on ``PARAMS["filtering_remove_single_exon"]`` this will:
        i) remove all single exon transcripts from all lncrna models
        (transcripts)
        ii) remove lncrna loci that only contain single exon transcripts
        (loci)
        iii) leave all single-exon and multi-exon loci in outfile
        (None)

        Arguments
        ---------
        infile : string
            gzip-compressed input gene set (read with zcat).
        outfile : string
            Output filename.

        Raises
        ------
        ValueError
            If the parameter has any other (truthy) value.
        '''

        filter_mode = PARAMS["filtering_remove_single_exon"]

        # NOTE(review): both grep commands *select* lines whose status is
        # "s" rather than excluding them with -v. Whether "s" marks the
        # entries to keep cannot be told from here; the pattern is left
        # untouched -- confirm against the annotation step.
        if not filter_mode:
            E.info("Both multi-exon and single-exon lncRNA are retained!")
            statement = ("cp %(infile)s %(outfile)s")
        elif filter_mode == "loci":
            # The two log messages were swapped relative to the docstring.
            E.info("Warning: removing loci with only single-exon transcripts")
            # A "|" was missing before gzip, so grep treated "gzip" as an
            # input filename and the output was never compressed.
            statement = ("zcat %(infile)s |"
                         " grep 'exon_status_locus \"s\"' |"
                         " gzip > %(outfile)s")
        elif filter_mode == "transcripts":
            E.info("Warning: removing all single-exon"
                   " transcripts from lncRNA set")
            statement = ("zcat %(infile)s |"
                         " grep 'exon_status \"s\"' |"
                         " gzip > %(outfile)s")
        else:
            raise ValueError("Unregocnised parameter %s"
                             % filter_mode)
        # P.run() takes no arguments; presumably it picks up `statement` and
        # the placeholder values from the caller's locals -- confirm in P.
        P.run()
Exemplo n.º 13
0
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf file
    using axtToMaf, filters maf alignments under a specified length.

    Arguments
    ---------
    infiles :
        Not used in the body; the axt files are discovered by listing
        ``PARAMS["phyloCSF_location_axt"]``.
    outfile : string
        Name of the gzip-compressed result. The ``.gz`` suffix is stripped
        for the intermediate maf file, which is gzipped in the last step
        together with a ``*_removed.maf`` companion file.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    # Keep files ending in net.axt.gz whose name does not match the ignore
    # pattern.
    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    # From here on `axt_files` is a space-separated string for the shell.
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_dir"],
                                  PARAMS["annotations_interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS["annotations_interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    # Not referenced again here; presumably a job setting read by P.run().
    to_cluster = False
    # concatenate the decompressed axt files into tmpf1, then convert them
    # to maf (tmpf2) with axtToMaf
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    # filtered[0] and filtered[1] are logged below as the number of ignored
    # and the number of output blocks.
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    # Compress both the filtered maf and the file of removed blocks.
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()
Exemplo n.º 14
0
def loadGO(infile, outfile, tablename):
    """import GO results into individual tables.

    This method concatenates all the results from
    a GO analysis and uploads into a single table.

    The results are expected as ``*.overall`` files in the directory
    ``<infile>.dir``. If that directory does not exist, `outfile` is only
    touched and nothing is loaded.

    Arguments
    ---------
    infile : string
        Name from which the results directory ``<infile>.dir`` is derived.
    outfile : string
        Receives the output of the load command.
    tablename : string
        Name of the table to load into.
    """

    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    load_statement = P.build_load_statement(
        tablename=tablename,
        options="--allow-empty-file "
        "--add-index=category "
        "--add-index=goid ")

    # %(toolsdir)s is not a local of this function; presumably it comes
    # from PARAMS -- confirm.
    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
    | %(load_statement)s
    > %(outfile)s
    '''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
Exemplo n.º 15
0
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript

    Arguments
    ---------
    infile : string
        gzip-compressed gtf file (read with zcat in the statement).
    outfile : string
        Output gtf file, written through gzip; its name without
        ``.gtf.gz`` is used to build the new identifier patterns.
    '''
    # NOTE(review): `inf` is never closed.
    inf = IOTools.openFile(infile)
    # NOTE(review): `statement` is reassigned for every record and P.run()
    # is called once after the loop, so only the LAST record decides which
    # command runs. If the file has no records, `statement` is never bound.
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            # gene_id contains "ENSG": only sort, do not renumber
            statement = '''zcat %(infile)s | grep -v "#"
                        | python %(scriptsdir)s/gtf2gtf.py
                        --method=sort --sort-order=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN",
                                                      "TRAN")
            # "%%i" is an escaped percent sign, leaving a literal "%i" in
            # the pattern handed to gtf2gtf.py after interpolation.
            statement = '''
            zcat %(infile)s | python %(scriptsdir)s/gtf2gtf.py
            --method=renumber-genes
            --pattern-identifier=%(gene_pattern)s%%i
            | python %(scriptsdir)s/gtf2gtf.py
            --method=renumber-transcripts
            --pattern-identifier=%(transcript_pattern)s%%i
            | python %(scriptsdir)s/gtf2gtf.py
            --method=sort --sort-order=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''

    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals/PARAMS -- confirm in P.
    P.run()
def mergeAndLoad(infiles, outfile, suffix):
    """Combine two-column categorical tables and load them into a database.

    The first two columns of every input are merged with
    ``combine_tables.py``, the result is transposed so that each input
    becomes one row, and the table is loaded with ``csv2db.py``.

    Arguments
    ---------
    infiles : list
        Input filenames; each must end in `suffix`.
    outfile : string
        Receives the output of the load command; the table name is derived
        from it with P.toTable().
    suffix : string
        Suffix stripped from the filenames to obtain the column names. If it
        ends in ``.gz`` the inputs are read with zcat, otherwise with cat.
    """
    header = ",".join(P.tablequote(P.snip(name, suffix)) for name in infiles)

    # Pick the reader once instead of duplicating the whole expression.
    cat_cmd = "zcat" if suffix.endswith(".gz") else "cat"
    filenames = " ".join(
        "<( %s %s | cut -f 1,2 )" % (cat_cmd, name) for name in infiles)

    tablename = P.toTable(outfile)

    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals/PARAMS -- confirm in P.
    statement = """python %(scriptsdir)s/combine_tables.py
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/track/" 
                | python %(scriptsdir)s/table2table.py --transpose
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
            """
    P.run()
def buildAllStats(infiles, outfile):
    '''
    Paste all stats files in `infiles` side by side into `outfile`.
    '''
    joined_inputs = " ".join(infiles)
    # P.run() takes no arguments; presumably it picks up `statement` from
    # the caller's locals -- confirm in P.
    statement = "paste %s > %s" % (joined_inputs, outfile)
    P.run()
def mapReadsWithBowtie(infiles, outfile):
    """Map reads with bowtie.

    The reads are decompressed into a temporary file, mapped with bowtie
    (``-C``, SAM output), piped through ``bam2bam.py --method=set-nh`` and
    gzip-compressed into `outfile`. The temporary file is removed by the
    last command of the statement.

    Arguments
    ---------
    infiles : tuple
        ``(inifile, infile)``; only `infile` (gzip-compressed reads) is
        used by the statement.
    outfile : string
        Output filename, written through gzip.
    """

    inifile, infile = infiles

    # Not referenced again here; presumably job settings read by P.run().
    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    # %(bowtie_threads)s, %(bowtie_options)s, %(bowtie_genome_dir)s,
    # %(genome)s and %(scriptsdir)s are not locals of this function;
    # presumably they come from PARAMS -- confirm.
    statement = """
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --threads %(bowtie_threads)s
           %(bowtie_options)s
           %(bowtie_genome_dir)s/%(genome)s_cs
           %(tmpfile)s
    | python %(scriptsdir)s/bam2bam.py --output-sam --method=set-nh --log=%(outfile)s.log
    | gzip
    > %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    """

    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
Exemplo n.º 19
0
def GATKBaseRecal(infile, outfile, genome, dbsnp, solid_options=""):
    '''Recalibrates base quality scores using GATK.

    Runs ``BaseRecalibrator`` into a temporary directory, applies the
    recalibration table with ``PrintReads`` to write `outfile`, and removes
    the temporary directory as the last shell command.

    Arguments
    ---------
    infile : string
        Input BAM file (``-I``).
    outfile : string
        Output of PrintReads (``-o``).
    genome : string
        Reference passed as ``-R``.
    dbsnp : string
        Passed as ``--knownSites``.
    solid_options : string
        Extra text appended to the BaseRecalibrator command line.
    '''

    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    # Not referenced again here; presumably job settings read by P.run().
    job_options = getGATKOptions()
    job_threads = 3

    # Each part is interpolated with locals() immediately.
    statement = '''GenomeAnalysisTK
                    -T BaseRecalibrator
                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s
                    --knownSites %(dbsnp)s %(solid_options)s ;
                    checkpoint ;''' % locals()

    statement += '''GenomeAnalysisTK
                    -T PrintReads -o %(outfile)s
                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s ;
                    checkpoint ;''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()
    # P.run() takes no arguments; presumably it picks up `statement` from
    # the caller's locals -- confirm in P.
    P.run()
Exemplo n.º 20
0
def buildOverlapWithEnsembl(infile, outfile, filename_bed):
    '''compute overlap of genes with intervals.

    If `filename_bed` has multiple tracks the overlap will
    be computed for each track separately.

    The output is a tab-separated table with pairs of
    overlapping features between `infile` and `filename_bed`.

    The gene set is decompressed, merged per gene with
    ``gtf2gtf.py --method=merge-transcripts``, converted to bed and
    compared against `filename_bed` with ``bed2graph.py``.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Output file in :term:`tsv` format.
    filename_bed : string
       Filename in :term:`bed` format.
    '''

    # %(scriptsdir)s is not a local of this function; presumably it comes
    # from PARAMS -- confirm.
    statement = '''gunzip
        < %(infile)s
        | python %(scriptsdir)s/gtf2gtf.py --method=merge-transcripts
        | python %(scriptsdir)s/gff2bed.py --is-gtf
        | python %(scriptsdir)s/bed2graph.py
            --output-section=name
            --log=%(outfile)s.log
            - %(filename_bed)s
        > %(outfile)s
    '''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
Exemplo n.º 21
0
def GATKReadGroups(infile, outfile, genome,
                   library="unknown", platform="Illumina",
                   platform_unit="1", track="unknown"):
    '''Reorders BAM according to reference fasta and adds read groups.

    Runs ``ReorderSam`` into a temporary directory, indexes the result,
    runs ``AddOrReplaceReadGroups`` to write `outfile`, indexes `outfile`
    and removes the temporary directory.

    Arguments
    ---------
    infile : string
        Input BAM file.
    outfile : string
        Output BAM file.
    genome : string
        Passed as ``REFERENCE=`` to ReorderSam.
    library, platform, platform_unit : string
        Values for ``RGLB``, ``RGPL`` and ``RGPU``.
    track : string
        Value for ``RGSM`` and stem of the temporary filename. If left at
        ``'unknown'`` it is derived from the basename of `infile`.
    '''

    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    # Not referenced again here; presumably job settings read by P.run().
    job_options = getGATKOptions()
    job_threads = 3

    # Each part is interpolated with locals() immediately.
    statement = '''ReorderSam
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%(genome)s
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()

    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                    checkpoint ;''' % locals()

    statement += '''AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(outfile)s
                    RGLB=%(library)s
                    RGPL=%(platform)s
                    RGPU=%(platform_unit)s
                    RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()

    statement += '''samtools index %(outfile)s ;
                    checkpoint ;''' % locals()
    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()

    # P.run() takes no arguments; presumably it picks up `statement` from
    # the caller's locals -- confirm in P.
    P.run()
Exemplo n.º 22
0
def buildPromotorRegions(infile, outfile, promotor_size=1000):
    '''annotate promotor regions from reference gene set.

    This method builds promotor regions for transcripts
    in an ENSEMBL gene set.

    The gene set is decompressed, sanitized against the genome with
    ``gff2gff.py``, converted with ``gtf2gff.py --method=promotors`` and
    gzip-compressed into `outfile`.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Filename in :term:`gff` format.
    promotor_size : int
       Size of the promotor region (nucleotides upstream
       of TSS).
    '''

    # %(scriptsdir)s, %(genome_dir)s and %(genome)s are not locals of this
    # function; presumably they come from PARAMS -- confirm.
    # The backslash after %(promotor_size)s is a Python line continuation
    # inside the string literal: it joins that line with the next one.
    statement = """
    gunzip < %(infile)s
    | python %(scriptsdir)s/gff2gff.py --method=sanitize
    --sanitize-method=genome
    --skip-missing --genome-file=%(genome_dir)s/%(genome)s
    --log=%(outfile)s.log
    | python %(scriptsdir)s/gtf2gff.py --method=promotors
    --promotor-size=%(promotor_size)s \
    --genome-file=%(genome_dir)s/%(genome)s
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    """
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
Exemplo n.º 23
0
def buildBenchmarkInput(infile, outfile):
    '''Build the benchmark input from the sequences in ``cds.fasta``.

    A transcript_id/protein_id map is read from the ``peptide_info`` table
    of the database named in ``PARAMS["database_name"]`` and written to a
    temporary file. ``cds.fasta`` is then passed through
    ``extract_fasta.pl``, ``fasta2variants.py --is-cds`` and
    ``substitute_tokens.py``, which uses the map.

    Arguments
    ---------
    infile : string
        Passed as argument to ``extract_fasta.pl``.
    outfile : string
        Output filename.
    '''

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database_name"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    # Close before the shell command reads the file, so that all buffered
    # rows are on disk.
    tmpfile.close()
    dbhandle.close()
    tmpfilename = tmpfile.name

    # Two fixes compared to the previous statement: the placeholder was
    # misspelled as %(scripstdir)s, and the "|" between extract_fasta.pl and
    # fasta2variants.py was missing.
    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta
    | python %(scriptsdir)s/fasta2variants.py --is-cds
    | python %(scriptsdir)s/substitute_tokens.py
             --map-tsv-file=%(tmpfilename)s
    > %(outfile)s
    '''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals/PARAMS -- confirm in P.
    P.run()

    os.unlink(tmpfilename)
def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''
    pulls out the multi-exonic and the single exonic lincRNA transcripts
    from the lincrna.gtf.gz

    Transcripts with more than one entry are written to
    ``<stem>.multi_exon.gtf.gz`` and transcripts with exactly one entry to
    ``<stem>.single_exon.gtf.gz``, where ``<stem>`` is `infile` without
    ``.gtf.gz``. Score and frame are written as ".".

    Arguments
    ---------
    infile : string
        gzip-compressed gtf file ending in ``.gtf.gz``.
    outfiles : list
        Expected output filenames ending in ``.gz``. For each one that does
        not exist after splitting, ``gzip`` is run on the name without
        ``.gz``.
    '''

    stem = P.snip(infile, ".gtf.gz")

    # The context managers close (and thereby flush) all three gzip streams;
    # before, they were left open and the outputs could be incomplete.
    with gzip.open(infile) as inf, \
            gzip.open(stem + ".multi_exon.gtf.gz", "w") as multi, \
            gzip.open(stem + ".single_exon.gtf.gz", "w") as single:
        for entry in GTF.transcript_iterator(GTF.iterator(inf)):
            if len(entry) > 1:
                sink = multi
            elif len(entry) == 1:
                sink = single
            else:
                continue
            for exon in entry:
                sink.write(
                    "\t".join(map(str, [exon.contig, exon.source, exon.feature,
                                        exon.start, exon.end, ".", exon.strand,
                                        "."])) +
                    "\t" + exon.attributes + "\n")

    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            # P.run() takes no arguments; presumably it picks up `statement`
            # and `outf` from the caller's locals -- confirm in P.
            P.run()
def loadSummariseReadsContributingToTranscripts(infile, outfile):
    '''
    loads the summary of reads contributing to transcripts

    `infile` is fed to ``cgat csv2db`` on stdin; the command's output goes
    to `outfile` and its log to ``<outfile>.log``.
    '''
    # "/" in the path is replaced by "_" before deriving the table name.
    tablename = P.toTable(outfile.replace("/", "_"))
    statement = '''cgat csv2db -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
Exemplo n.º 26
0
def loadTranscripts(infile, outfile):
    '''load transcripts from a GTF file into the database.

    The table will be indexed on ``gene_id`` and ``transcript_id``

    The file is decompressed, converted with ``gtf2tsv.py`` and piped into
    the load command built by P.build_load_statement().

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=gene_id "
        "--add-index=transcript_id "
        "--allow-empty-file ")

    # %(scriptsdir)s is not a local of this function; presumably it comes
    # from PARAMS -- confirm.
    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2tsv.py
    | %(load_statement)s
    > %(outfile)s'''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, return CpG content file

    Sequences whose identifier (the title up to the first space) is listed
    in `interval_names` are copied to a temporary fasta file, which is then
    summarised with ``cgat fasta2table -s na -s cpg -s length``. The
    temporary file is removed afterwards.

    Arguments
    ---------
    interval_names : string
        File with one interval name per line.
    sequence_file : string
        Fasta file to select the sequences from.
    outfile : string
        Output table; a log goes to ``<outfile>.log``.
    header_line : bool
        If true, the first line of `interval_names` is skipped.
    '''
    # Function-scope import so the block does not depend on the module
    # header for the clean-up at the end.
    import os

    with open(interval_names) as interval_file:
        if header_line:
            interval_file.readline()
        # Strip only the newline: the previous line[:-1] removed a real
        # character when the last line had no trailing newline.
        interval_set = set(line.rstrip("\n") for line in interval_file)

    temp = P.getTempFile("/ifs/scratch")
    with open(sequence_file) as fasta:
        for record in FastaIterator.iterate(fasta):
            seq_id = record.title.split(" ")[0]
            if seq_id in interval_set:
                temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
    -s na -s cpg -s length
    --log=%(outfile)s.log > %(outfile)s'''

    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()

    # The temporary fasta used to be left behind in /ifs/scratch.
    os.unlink(inf)
def loadNumberExonsLengthSummaryStats(infile, outfile):
    '''
    load the table of exon counts and transcript lengths

    `infile` is fed to ``cgat csv2db`` on stdin; the command's output goes
    to `outfile` and its log to ``<outfile>.log``.
    '''
    # "/" in the path is replaced by "_" before deriving the table name,
    # and "_stats" is appended to the result.
    tablename = P.toTable(outfile.replace("/", "_")) + "_stats"
    statement = '''cgat csv2db -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
def loadCountSingleAndMultiExonLincRNA(infile, outfile):
    '''
    load the counts for the multi and single exon lincRNA

    `infile` is fed to ``cgat csv2db`` on stdin; the command's output goes
    to `outfile` and its log to ``<outfile>.log``.
    '''
    # "/" in the path is replaced by "_" before deriving the table name.
    # NOTE(review): the suffix ".count" puts a dot into the table name --
    # confirm that csv2db and the database accept it.
    tablename = P.toTable(outfile.replace("/", "_")) + ".count"
    statement = '''cgat csv2db -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
def importRepeatsFromUCSC(infile, outfile, ucsc_database, repeattypes, genome):
    '''import repeats from a UCSC formatted file.

    The repeats are stored as a :term:`gff` formatted file.

    All tables of `ucsc_database` whose name ends in ``rmsk`` are queried
    for repeats of the requested classes. The rows are written to a
    temporary file, sorted, sanitized against the genome with
    ``cgat gff2gff`` and gzip-compressed into `outfile`.

    Arguments
    ---------
    infile : string
        Not used in the body of this function.
    outfile : string
        Output filename (gzip-compressed gff).
    ucsc_database : string
        Name of the UCSC database to connect to.
    repeattypes : string
        Comma-separated list of repeat classes to retrieve.
    genome : string
        Passed to gff2gff as ``--genome-file``.

    Raises
    ------
    ValueError
        If no ``rmsk`` table exists or no repeats were retrieved.
    '''

    repclasses = "','".join(repeattypes.split(","))

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.
    dbhandle = PipelineUCSC.connectToUCSC(
        host=PARAMS["ucsc_host"],
        user=PARAMS["ucsc_user"],
        database=ucsc_database)

    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    tmpfile = P.getTempFile(shared=True)

    total_repeats = 0
    for table in tables:
        E.info("%s: loading repeats from %s" % (ucsc_database, table))
        # `table` comes from SHOW TABLES above and `repclasses` from the
        # pipeline configuration; both are interpolated into the SQL text.
        cc = dbhandle.execute(
            """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd, '.',
            strand, '.',
            CONCAT('class \\"', repClass, '\\"; family \\"', repFamily, '\\";')
            FROM %(table)s
            WHERE repClass in ('%(repclasses)s') """ % locals())
        n = 0
        for data in cc.fetchall():
            n += 1
            tmpfile.write("\t".join(map(str, data)) + "\n")
        E.info("%s: %s=%i repeats downloaded" % (ucsc_database, table, n))
        total_repeats += n

    tmpfile.close()
    tmpfilename = tmpfile.name

    if total_repeats == 0:
        # Do not leave the empty temporary file behind.
        os.unlink(tmpfilename)
        # The exception name was misspelled before, which raised NameError
        # instead of the intended ValueError.
        raise ValueError("did not find any repeats for %s" % ucsc_database)

    statement = '''cat %(tmpfilename)s
    | %(pipeline_scriptsdir)s/gff_sort pos
    | cgat gff2gff
    --method=sanitize
    --sanitize-method=genome
    --skip-missing
    --genome-file=%(genome)s
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals/PARAMS -- confirm in P.
    P.run()

    os.unlink(tmpfilename)
Exemplo n.º 31
0
def runControlLncRNAPhyloCSF(infile, outfile):
    '''Run PhyloCSF on `infile` and write its output to `outfile`.

    Phylogeny, number of frames, extra options and the species list are
    taken from the ``phyloCSF_*`` entries of PARAMS. The species list is the
    second field of every ``a:b`` pair in
    ``PARAMS["phyloCSF_map_species_names"]``.
    '''
    phylogeny = PARAMS["phyloCSF_phylogeny"]
    n_frames = int(PARAMS["phyloCSF_n_frames"])
    # An unset/empty options entry becomes an empty string.
    options = PARAMS["phyloCSF_options"] or ""

    species = ",".join(
        pair.split(":")[1]
        for pair in PARAMS["phyloCSF_map_species_names"].split(","))

    # Not referenced again here; presumably a job setting read by P.run().
    to_cluster = True
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    statement = ("PhyloCSF %(phylogeny)s"
                 "  %(infile)s"
                 "  --frames=%(n_frames)s"
                 "  --species=%(species)s"
                 " %(options)s"
                 " > %(outfile)s")
    P.run()
Exemplo n.º 32
0
def mergeSummaries(infiles,summaryfile):
    '''Combine the per-assembly summary files into one table.

    The header (first line) of the first input is copied to `summaryfile`
    and prefixed with the columns "file" and "assembler". For every input
    its last line is appended, prefixed with the file name and the
    assembler name taken from the input path.

    Arguments
    ---------
    infiles : list
        Relative paths with at least two components. The part of the first
        component before the first "_" is the assembler, the second
        component is the file name.
    summaryfile : string
        Output file, relative to the current working directory.
    '''
    # A stray debugging print of the function object was removed here.
    cwd = os.getcwd()
    # file to store all the stats combined
    combstats = cwd + "/" + summaryfile
    in0 = cwd + "/" + infiles[0]
    # NOTE(review): the header is appended (>>), so a summary file left
    # over from an earlier run is extended rather than replaced.
    statementlist = [
        "touch {}".format(combstats),
        "head -1 {} >>{}".format(in0, combstats),
        "sed  -i '1s/^/{}\\t{}\\t /' {}".format("file", "assembler", combstats),
    ]
    # extract filenames and assembler names to add to summary text file
    for infile in infiles:
        indir = cwd + "/" + infile
        insplit = infile.split("/")
        filen = insplit[1]
        assem = insplit[0].split("_")[0]
        # just append the last line and add filename and assembler name
        statementlist.append("tail -1 {} >> {}".format(indir, combstats))
        statementlist.append(
            "sed -i '$s/^/{}\\t{}\\t /' {}".format(filen, assem, combstats))
    statement = " && ".join(statementlist)
    # P.run() takes no arguments; presumably it picks up `statement` from
    # the caller's locals -- confirm in P.
    P.run()
Exemplo n.º 33
0
def getCoverageStats(outfile):
    '''
    Grab the gene model coverage stats table
    from the mapping pipeline database

    This is a table in the report generated from a tracker,
    need to actually make this table ourselves to get
    5'/3' coverages

    Runs ``extract_stats.py --task=extract_table`` and writes its output to
    `outfile`; a log goes to ``<outfile>.log``.
    '''

    # %(cgat_scripts)s, %(mapping_db)s and %(mapping_picard_dups)s are not
    # locals of this function; presumably they come from PARAMS -- confirm.
    # NOTE(review): the table extracted is %(mapping_picard_dups)s although
    # the docstring speaks of coverage stats -- confirm this is intended.
    statement = '''
    python %(cgat_scripts)s/extract_stats.py
    --task=extract_table
    --log=%(outfile)s.log
    --database=%(mapping_db)s
    --table-name=%(mapping_picard_dups)s
    > %(outfile)s
    '''

    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
def mergeAllAssemblies(infiles, outfile):
    '''Merge assemblies with ``stringtie --merge`` and sort the result.

    Arguments
    ---------
    infiles : list
        gzip-compressed files. The LAST entry is used as the reference
        (``-G``), all others are the assemblies to merge.
    outfile : string
        Passed to ``gtf2gtf.py`` as ``-S``; stringtie's stderr and the
        gtf2gtf log (``-L``) go to ``<outfile>.log``.
    '''

    # Each file is handed to stringtie through a process substitution that
    # decompresses it on the fly.
    infiles = ["<(zcat %s)" % infile for infile in infiles]
    infiles, reference = infiles[:-1], infiles[-1]

    # Not referenced again here; presumably a job setting read by P.run().
    job_threads = PARAMS["stringtie_merge_threads"]

    # From here on `infiles` is a space-separated string for the shell.
    infiles = " ".join(infiles)

    # %(stringtie_merge_threads)s, %(stringtie_merge_options)s and
    # %(scriptsdir)s are not locals; presumably they come from PARAMS.
    statement = '''stringtie --merge
                             -G %(reference)s
                             -p %(stringtie_merge_threads)s
                             %(stringtie_merge_options)s
                             %(infiles)s
                            2> %(outfile)s.log
                   | python %(scriptsdir)s/gtf2gtf.py --method=sort
                           --sort-order=gene+transcript
                            -S %(outfile)s -L %(outfile)s.log'''

    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
Exemplo n.º 35
0
def buildRepeatsRates(infile, outfile):
    '''compute rates for individual aligned repeats.

    The gzip-compressed `infile` is decompressed and sorted, then handed in
    chunks of 10000 lines to the quoted ``cgat psl2psl | cgat psl2table``
    command via ``%(cmd-farm)s``; the result is gzip-compressed into
    `outfile`.
    '''

    genome_query, genome_target = getGenomes()

    # %(cmd-farm)s is not a local of this function; presumably it comes
    # from PARAMS -- confirm.
    statement = '''gunzip < %(infile)s
    | sort -k10,10 -k14,14 -k9,9 -k12,12n
    | %(cmd-farm)s --split-at-lines=10000 --output-header --log=%(outfile)s.log
    "cgat psl2psl
    --log=%(outfile)s.log
    --method=add-sequence
    --queries-tsv-file=%(genome_query)s
    --target-psl-file=%(genome_target)s
    | cgat psl2table
    --method=query-counts
    --method=baseml
    --baseml-model=REV"
    | gzip > %(outfile)s
    '''
    # P.run() takes no arguments; presumably it picks up `statement` and the
    # placeholder values from the caller's locals -- confirm in P.
    P.run()
Exemplo n.º 36
0
    def buildGenomeAlignment(infiles, outfile):
        '''Build a pairwise genomic alignment from axt files.

        Every file in `infiles` is decompressed, converted with
        ``axtToPsl``, swapped with ``pslSwap`` and appended,
        gzip-compressed, to `outfile`.

        Arguments
        ---------
        infiles : list
            Filenames of gzip-compressed axt files.
        outfile : string
            Output filename. An existing file is removed first because the
            result of every input is appended to it.
        '''

        # Start from scratch: the loop below appends (>>) to outfile.
        try:
            os.remove(outfile)
        except OSError:
            pass

        for infile in infiles:
            E.info("adding %s" % infile)
            # %(query)s and %(target)s are not locals of this function;
            # presumably they come from PARAMS or an enclosing scope --
            # confirm.
            statement = '''gunzip < %(infile)s
            | axtToPsl
            /dev/stdin
            %(query)s.sizes
            %(target)s.sizes
            /dev/stdout
            | pslSwap /dev/stdin /dev/stdout
            | gzip >> %(outfile)s
            '''
            # P.run() takes no arguments; presumably it picks up `statement`
            # and the placeholder values from the caller's locals -- confirm.
            P.run()
Exemplo n.º 37
0
def convertChainToPsl(infile, outfile):
    '''convert a chain file to a psl file.

    The gzip-compressed *infile* is split at lines matching ``^chain``
    by ``%(cmd-farm)s``; each chunk is converted with ``cgat chain2psl``
    and passed through ``pslSwap``.  The combined result is
    gzip-compressed into *outfile*; logs go to ``<outfile>.log``.
    '''

    # NOTE(review): presumably read by P.run() to decide where the job
    # executes -- confirm in the Pipeline module.
    to_cluster = False

    # only used for the debug message below; the template does not
    # reference either name
    target, query = extractGenomes(infile)

    E.debug("query=%s, target=%s" % (query, target))

    statement = '''gunzip
    < %(infile)s
    | %(cmd-farm)s --split-at-regex="^chain" --chunk-size=1000 --max-lines=1000000 --log=%(outfile)s.log
    " cgat chain2psl --log=%(outfile)s.log
      | pslSwap stdin stdout "
    | gzip
    >  %(outfile)s
    '''

    # NOTE(review): P.run() takes no arguments; it presumably picks up
    # `statement` and the placeholder values from this function's locals
    # and PARAMS -- confirm.
    P.run()
Exemplo n.º 38
0
def variantAnnotatorIndels(infiles, outfile):
    '''Annotate variant file using GATK VariantAnnotator

    *infiles* must unpack into exactly three items: the variant file,
    the list of BAM files and the SnpEff file.  The variant file is
    passed to GATK both as ``--variant`` and as ``-L``.  The annotated
    output is written to *outfile*.
    '''
    # NOTE(review): presumably read by P.run() to decide where the job
    # executes -- confirm in the Pipeline module.
    to_cluster = USECLUSTER
    # these three names are referenced by the template below
    infile, bamlist, effFile = infiles
    statement = '''GenomeAnalysisTK
                   -T VariantAnnotator
                   -R %(bwa_index_dir)s/%(genome)s.fa
                   -I %(bamlist)s
                   -A SnpEff --snpEffFile %(effFile)s
                   -o %(outfile)s
                   --variant %(infile)s
                   -L %(infile)s
                   -A Coverage
                   -A FisherStrand
                   -A HaplotypeScore
                   -A MappingQualityRankSumTest
                   -A ReadPosRankSumTest
                   -A AlleleBalanceBySample
                   -A RMSMappingQuality'''
    # NOTE(review): P.run() takes no arguments; it presumably picks up
    # `statement` and the placeholder values from this function's locals
    # and PARAMS -- confirm.
    P.run()
Exemplo n.º 39
0
def loadPolyphen(infile, outfile):
    '''load polyphen results.

    The gzip-compressed *infile* is decompressed, ``o_acc`` is renamed to
    ``protein_id`` and all runs of spaces are deleted, the first 55
    columns are kept and the result is piped into ``csv2db.py``.  The
    table name is derived from *outfile* with ``P.toTable``; indices are
    added on ``snp_id`` and ``protein_id``.  The loader's stdout is
    written to *outfile*.

    The comment column is ignored.
    '''

    # referenced by the template below as %(table)s
    table = P.toTable(outfile)

    statement = '''gunzip
    < %(infile)s
    | perl -p -e "s/o_acc/protein_id/; s/ +//g"
    | cut -f 1-55
    |python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --add-index=snp_id 
              --add-index=protein_id
              --table=%(table)s 
              --map=effect:str
    > %(outfile)s
    '''
    # NOTE(review): P.run() takes no arguments; it presumably picks up
    # `statement` and the placeholder values from this function's locals
    # and PARAMS -- confirm in the Pipeline module.
    P.run()
def merge_by_tissue(infiles, outfile):
    '''Merge per-tissue assemblies with ``stringtie --merge``.

    Each element of *infiles* is indexed with ``[0]`` to obtain a file
    name, which is wrapped in a ``<(zcat ...)`` process substitution.
    The first file serves as the ``-G`` reference and is also part of
    the merged inputs.  The merged output is piped into
    ``gtf2gtf.py --method=sort``, which writes *outfile*; stderr and the
    sorter's log go to ``<outfile>.log``.

    NOTE(review): ``P.run()`` is called without arguments, so it
    presumably reads ``statement``, ``job_threads`` and the ``%(...)s``
    values from this function's locals and from PARAMS.  The local names
    ``infiles`` and ``reference`` are therefore part of the contract --
    confirm against the Pipeline module before renaming them.
    '''
    zcat_fmt = "<(zcat %s)"

    # the reference is the first entry; it stays in the merge list too
    reference = zcat_fmt % infiles[0][0]
    streams = [zcat_fmt % entry[0] for entry in infiles]

    job_threads = PARAMS["stringtie_merge_threads"]

    # the template expects a single space-separated string
    infiles = " ".join(streams)

    statement = '''stringtie --merge
                             -G %(reference)s
                             -p %(stringtie_merge_threads)s
                             %(stringtie_merge_options)s
                             %(infiles)s
                            2> %(outfile)s.log
                   | python %(scriptsdir)s/gtf2gtf.py --method=sort
                           --sort-order=gene+transcript
                            -S %(outfile)s -L %(outfile)s.log'''

    P.run()
Exemplo n.º 41
0
    def buildGenomeAlignment(infiles, outfile):
        '''remove non-unique alignments in genomic infile.

        All *infiles* are decompressed together with ``zcat``, sorted on
        columns 10 and 12 and filtered with
        ``cgat psl2psl --method=remove-overlapping-query``, then sorted on
        columns 14 and 16 and filtered with
        ``--method=remove-overlapping-target``.  The gzip-compressed
        result is appended (``>>``) to *outfile*.

        NOTE(review): nothing in this function removes an existing
        *outfile* before appending -- confirm that is intended.
        '''

        # NOTE(review): presumably read by P.run() to decide where the
        # job executes -- confirm in the Pipeline module.
        to_cluster = True

        # the template expects a single space-separated string
        infiles = " ".join(infiles)

        statement = '''zcat %(infiles)s
             | sort -k10,10 -k12,12n
             | cgat psl2psl
                  --method=remove-overlapping-query
                  --log=%(outfile)s.log
             | sort -k14,14 -k16,16n
             | cgat psl2psl
                  --method=remove-overlapping-target
                  --log=%(outfile)s.log
             | gzip
             >> %(outfile)s
             '''
        # NOTE(review): P.run() takes no arguments; it presumably picks
        # up `statement` and the placeholder values from this function's
        # locals and PARAMS -- confirm.
        P.run()
Exemplo n.º 42
0
def haplotypeCaller(infile, outfile, genome, dbsnp, intervals, padding,
                    options):
    '''Call SNVs and indels using GATK HaplotypeCaller in all members of a
    family together

    Runs HaplotypeCaller in ``-ERC GVCF`` mode with a LINEAR variant
    index (parameter 128000).

    Arguments
    ---------
    infile : string
        Passed to GATK as ``-I``.
    outfile : string
        Passed to GATK as ``-o``.
    genome : string
        Passed to GATK as ``-R``.
    dbsnp : string
        Passed to GATK as ``--dbsnp``.
    intervals : string
        Passed to GATK as ``-L``.
    padding
        Passed to GATK as ``-ip``.
    options : string
        Appended verbatim to the end of the command line.
    '''
    # NOTE(review): job_options and job_threads are not used in the
    # template; they are presumably read by P.run() from this frame --
    # confirm in the Pipeline module.
    job_options = getGATKOptions()
    job_threads = 3

    # unlike most tasks in this file the template is filled in right
    # here, from this function's local variables
    statement = '''GenomeAnalysisTK
                    -T HaplotypeCaller
                    -ERC GVCF
                    -variant_index_type LINEAR
                    -variant_index_parameter 128000
                    -o %(outfile)s
                    -R %(genome)s
                    -I %(infile)s
                    --dbsnp %(dbsnp)s
                    -L %(intervals)s
                    -ip %(padding)s
                    %(options)s''' % locals()
    P.run()
Exemplo n.º 43
0
def copyBamFile(infile, outfile):
    '''Make softlinks of the bam files and index the link.

    Two commands are run one after the other: ``ln -s`` creates
    *outfile* as a symbolic link whose target is ``../<infile>``, then
    ``samtools index`` is run on *outfile*.

    Arguments
    ---------
    infile : string
        Input file in :term:`BAM` format.
    outfile : string
        Output file in :term: `BAM` format.
    '''

    # NOTE(review): the "../" prefix assumes outfile lives exactly one
    # directory below the directory infile is relative to -- confirm
    # against the caller.
    statement = '''ln -s ../%(infile)s
    %(outfile)s'''

    # NOTE(review): P.run() takes no arguments; it presumably picks up
    # the current value of `statement` and the placeholder values from
    # this function's locals and PARAMS -- confirm.
    P.run()

    # second, separate run: index through the freshly created link
    statement = '''samtools index %(outfile)s
    '''

    P.run()
Exemplo n.º 44
0
def loadRates(infile, outfile):
    '''load rates.

    Select the longest stretch for each transcript.

    The pipeline decompresses *infile*, removes the columns ``qStarts``,
    ``tStarts``, ``blockSizes``, ``qSequence`` and ``tSequence`` with
    ``csv_cut.py``, sorts by ``qName`` and by ``aligned`` (reverse
    numeric), renames ``qName`` to ``gene_id`` and lets awk print only
    the first line for each run of equal values in column 10.  The
    result is loaded with ``cgat csv2db``; the loader's stdout is written
    to *outfile*.
    '''

    # table name: outfile minus its last len(".load") characters (the
    # slice does not check that outfile really ends in ".load")
    track = outfile[:-len(".load")]
    # NOTE(review): csv_cut.py is given --log=%(outfile)s, i.e. the same
    # path that the final redirect writes to -- confirm this is intended
    # and not meant to be %(outfile)s.log.
    statement = '''
    gunzip
    < %(infile)s
    | python %(toolsdir)s/csv_cut.py
          --large --remove qStarts tStarts blockSizes qSequence tSequence --log=%(outfile)s
    | csort -k:qName: -k:aligned:rn
    | perl -p -e "s/qName/gene_id/"
    | awk '{if (l==$10) {next;} l = $10; print; }'
    |cgat csv2db %(csv2db_options)s --map gene_id:str --table=%(track)s --add-index=gene_id --allow-empty-file
    > %(outfile)s
    '''

    # NOTE(review): P.run() takes no arguments; it presumably picks up
    # `statement` and the placeholder values from this function's locals
    # and PARAMS -- confirm in the Pipeline module.
    P.run()
Exemplo n.º 45
0
def generateClusterSpikeIns(infile, outfile):
    '''Generate cluster spike-ins with ``cgat data2spike``.

    *infile* is piped into ``cgat data2spike --method=spike
    --spike-type=cluster`` using ``design.tsv`` from the working
    directory as the design file.  Output is first written to
    ``<outfile>_tmp`` and then moved to *outfile*.
    '''
    # TODO: parametrise binning in pipeline.ini (all bin and cluster
    # settings below are hard-coded)
    # NOTE(review): job_options is not used in the template; it is
    # presumably read by P.run() from this frame -- confirm.
    job_options = "-l mem_free=4G"

    # the template is filled in right here from this function's local
    # variables
    statement = '''cat %(infile)s |
    cgat data2spike --method=spike
    --design-tsv-file=design.tsv --difference-method=relative
    --spike-shuffle-column-suffix=-perc
    --spike-keep-column-suffix=-meth,-unmeth
    --spike-minimum=100 --spike-maximum=100
    --spike-output-method=seperate
    --spike-cluster-maximum-distance=150
    --spike-cluster-minimum-size=10 --spike-iterations=50
    --spike-type=cluster --spike-change-bin-min=-100
    --spike-change-bin-max=100 --spike-change-bin-width=10
    --spike-initial-bin-min=0 --spike-initial-bin-max=100
    --spike-initial-bin-width=100 --spike-subcluster-min-size=1
    --spike-subcluster-max-size=9 --spike-subcluster-bin-width=1
    > %(outfile)s_tmp; mv %(outfile)s_tmp %(outfile)s''' % locals()
    P.run()
Exemplo n.º 46
0
def runGsea(infile, outfile):
    '''
    Perform the enrichment analysis, by using gene set enrichment analysis
    (GSEA) and leading edge analysis.

    The shell statement derives a directory name from *infile* (its
    basename with a ``.processed`` suffix removed), creates that
    directory with ``mkdir``, changes into it and runs ``cgat runGSEA``
    under ``xvfb-run`` on ``../<infile>``.  *outfile* is not referenced
    by the statement.

    NOTE(review): plain ``mkdir`` fails if the directory already exists,
    which stops the ``&&`` chain -- confirm that re-runs are expected to
    start from a clean directory.
    '''
    # All settings come from PARAMS; a missing key raises KeyError here.
    geneset = PARAMS['geneset_name']
    # idtype and id_conversion are not referenced by the template below
    idtype = PARAMS['id_gsea_type']
    id_conversion = PARAMS['id_gsea_to_convert']
    min_size = PARAMS['stats_gsea_min_size']
    max_size = PARAMS['stats_gsea_max_size']
    seed = PARAMS['stats_gsea_seed']
    no = PARAMS['stats_gsea_permut']
    p_no = PARAMS['stats_gsea_display_num']
    l_no = PARAMS['stats_gsea_ngeneset']

    statement = '''dir=$(basename %(infile)s .processed | awk '{split($0,a,"/"); print a[1]}') &&
                   mkdir $dir && cd $dir &&
                   xvfb-run cgat runGSEA -f ../%(infile)s -g %(geneset)s -m %(min_size)s -x %(max_size)s
                   -s %(seed)s -n %(no)s -d %(p_no)s -l %(l_no)s'''
    # NOTE(review): P.run() takes no arguments; it presumably picks up
    # `statement` and the placeholder values from the locals above and
    # PARAMS -- confirm in the Pipeline module.
    P.run()
Exemplo n.º 47
0
def buildIntergenicRegions(infiles, outfile):
    """build a :term:`bed` file with regions not overlapping any genes.

    The geneset is decompressed, sorted and passed to ``complementBed``
    together with the chromosome sizes; the complement is
    gzip-compressed into *outfile*.

    Arguments
    ---------
    infiles : list
       - Input filename with geneset in :term:`gtf` format.
       - Input filename with chromosome sizes in :term:`tsv` format.
    outfile : string
       Output filename with genomic regions in :term:`bed` format.
    """

    # must unpack into exactly two items: geneset, chromosome sizes
    infile, contigs = infiles

    # NOTE(review): the sort keys are column 1 and column 2 (numeric).
    # If the input really is GTF, as documented above, the start
    # coordinate is in column 4, not column 2 -- confirm the input
    # format and whether the installed complementBed needs
    # position-sorted input.
    statement = '''zcat %(infile)s
    | sort -k1,1 -k2,2n
    | complementBed -i stdin -g %(contigs)s
    | gzip
    > %(outfile)s'''
    # NOTE(review): P.run() takes no arguments; it presumably picks up
    # `statement` and the placeholder values from this function's locals
    # and PARAMS -- confirm in the Pipeline module.
    P.run()
Exemplo n.º 48
0
def subtractBedFiles(infile, subtractfile, outfile):
    '''Remove from *infile* every interval that overlaps *subtractfile*.

    Shortcuts, checked in this order:

    * *subtractfile* is empty: *infile* is copied verbatim to *outfile*;
    * *infile* is empty: an empty *outfile* is created with ``P.touch``.

    Otherwise ``intersectBed -v`` keeps the non-overlapping intervals,
    the first five columns are retained, column 4 is replaced by a
    running counter, and the result is written bgzip-compressed to
    *outfile* and indexed with ``tabix -p bed``.

    NOTE(review): ``P.run()`` is called without arguments, so it
    presumably reads ``statement`` and the ``%(...)s`` values from this
    function's locals and PARAMS -- confirm in the Pipeline module.
    '''
    # nothing to subtract: the input is already the answer
    if IOTools.isEmpty(subtractfile):
        shutil.copyfile(infile, outfile)
        return

    # nothing to subtract from: produce an empty result
    if IOTools.isEmpty(infile):
        P.touch(outfile)
        return

    statement = '''
        intersectBed -v -a %(infile)s -b %(subtractfile)s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip > %(outfile)s ; tabix -p bed %(outfile)s
        '''
    P.run()
Exemplo n.º 49
0
def makeTagDirectoryChips(infile, outfile):
    '''
    This will create a tag file for each bam file
    for a CHIP-seq experiment

    *infile* is dumped to SAM with ``samtools view`` into
    ``homer/Tag.dir/``; ``makeTagDirectory`` is then run inside that
    directory on the SAM file, logging to ``<name>.makeTagChip.log``,
    where ``<name>`` is *infile* without its ``.bam`` suffix.  Finally
    ``<name>/<name>.txt`` is touched and the job sleeps for 60 seconds.
    *outfile* is not referenced by the statement.

    NOTE(review): ``P.run()`` is called without arguments, so it
    presumably reads ``statement`` and the ``%(...)s`` values
    (``bamstrip``, ``samfile``) from this function's locals and PARAMS
    -- confirm in the Pipeline module.
    '''

    # Remove the ".bam" suffix only.  str.strip(".bam") must not be used
    # here: it strips any of the characters '.', 'b', 'a', 'm' from BOTH
    # ends, so e.g. "mb1.bam" would be mangled to "1".
    bamstrip = infile[:-len(".bam")] if infile.endswith(".bam") else infile
    samfile = bamstrip + ".sam"

    statement = '''
                   samtools view %(infile)s > homer/Tag.dir/%(samfile)s &&
                   cd homer/Tag.dir/ &&
                   makeTagDirectory %(bamstrip)s
                   %(samfile)s
                   -genome %(homer_maketagdir_genome)s -checkGC
                   &> %(bamstrip)s.makeTagChip.log &&
                   touch %(bamstrip)s/%(bamstrip)s.txt &&
                   sleep 60'''

    P.run()
Exemplo n.º 50
0
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.

    A ``fastq_screen.conf`` file is written into a fresh temporary
    directory, with one ``DATABASE`` line for every PARAMS entry whose
    key starts with ``fastq_screen_database``.  The database name is the
    key with its first 22 characters removed (the 21-character prefix
    plus one separator character).  The command is built by
    ``PipelineMapping.FastqScreen`` and executed with ``P.run()``.

    The temporary directory is removed even when writing the config,
    building the statement or running it raises; *outfile* is touched
    only after a successful run.
    '''

    # variables required for statement built by FastqScreen()
    # (the names `tempdir` and `outdir` must not be changed)
    tempdir = P.getTempDir(".")
    try:
        outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

        # Create fastq_screen config file in temp directory
        # using parameters from Pipeline.ini
        conf_path = os.path.join(tempdir, "fastq_screen.conf")
        with IOTools.openFile(conf_path, "w") as f:
            for param_key, database_path in PARAMS.items():
                if param_key.startswith("fastq_screen_database"):
                    f.write("DATABASE\t%s\t%s\n" %
                            (param_key[22:], database_path))

        m = PipelineMapping.FastqScreen()
        statement = m.build((infiles, ), outfile)
        P.run()
    finally:
        # always clean up, otherwise a failed run leaks the directory
        shutil.rmtree(tempdir)
    P.touch(outfile)
Exemplo n.º 51
0
def loadGeneCoordinates(infile, outfile):
    '''merge transcripts to generate the genomic coordinates per gene
    and load

    The gzip-compressed *infile* is decompressed, processed with
    ``cgat gtf2gtf --method=merge-transcripts``, converted with
    ``cgat gtf2tsv`` and piped into the load command built by
    ``P.build_load_statement``.  The table name is derived from
    *outfile* with ``P.toTable``; the loader's stdout is written to
    *outfile*.
    '''

    # TS. remove transcript_id column as this is now meaningless
    # (the adjacent string literals below concatenate into one options
    # string)
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=gene_id "
        "--ignore-column=transcript_id "
        "--allow-empty-file ")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=merge-transcripts
    | cgat gtf2tsv
    | %(load_statement)s
    > %(outfile)s'''

    # NOTE(review): P.run() takes no arguments; it presumably picks up
    # `statement` and the placeholder values (including load_statement)
    # from this function's locals and PARAMS -- confirm.
    P.run()
Exemplo n.º 52
0
def plotHeatmap(infile, outfile):

    '''
    This tool creates a heatmap for scores associated with genomic regions.
    The program requires a matrix file generated by the tool computeMatrix.

    *infile* is passed to ``plotHeatmap -m`` and the plot is written to
    *outfile*.  All remaining options are ``deep_*`` placeholders that
    are not defined in this function.
    '''

    # concatenates the items of `infile` without a separator; a plain
    # string passes through unchanged
    infile = "".join(infile)

    statement = '''plotHeatmap -m %(infile)s
                   -o %(outfile)s
                   --outFileNameMatrix %(deep_out_namematrix)s
                   --outFileSortedRegions %(deep_out_sorted)s
                   --dpi %(deep_dpi)s
                   --colorMap %(deep_colormap)s
                   --kmeans %(deep_kmeans)s
                   --legendLocation %(deep_legendlocation)s
                   --refPointLabel %(deep_refpointlabel)s'''

    # NOTE(review): P.run() takes no arguments; it presumably picks up
    # `statement` from this function's locals and the deep_* values from
    # PARAMS -- confirm in the Pipeline module.
    P.run()
def buildSequinsReferenceTranscriptome(infiles, outfile):
    '''
    Builds a reference transcriptome from the provided GTF geneset - generates
    a fasta file containing the sequence of each feature labelled as
    "exon" in the GTF.
    --fold-at specifies the line length in the output fasta file

    *infiles* must unpack into exactly two items: the gzip-compressed
    geneset and the genome file given to ``--genome-file``.  After the
    fasta file has been written to *outfile* it is indexed with
    ``samtools faidx``; the log goes to ``<outfile>.log``.
    '''

    # both names are referenced by the template below
    infile, genome_file = infiles

    statement = '''
    zcat %(infile)s |
    awk '$3=="exon"'|
    cgat gff2fasta
    --is-gtf --genome-file=%(genome_file)s --fold-at=60 -v 0
    --log=%(outfile)s.log > %(outfile)s;
    checkpoint;
    samtools faidx %(outfile)s
    '''

    # NOTE(review): P.run() takes no arguments; it presumably picks up
    # `statement` and the placeholder values from this function's locals
    # and PARAMS -- confirm in the Pipeline module.
    P.run()
Exemplo n.º 54
0
def mapReadsWithShrimp(infiles, outfile):
    '''map reads with shrimp

    *infiles* must unpack into exactly two items; only the second one
    (the reads) is referenced by the statement.  ``gmapper-cs`` maps the
    reads against ``%(genome_dir)s/%(genome)s.fa``; its stdout is
    gzip-compressed into *outfile* and its stderr goes to
    ``<outfile>.log``.
    '''

    # inifile is not referenced by the template below
    inifile, infile = infiles

    # NOTE(review): job_options and job_threads are not used in the
    # template; they are presumably read by P.run() from this frame --
    # confirm in the Pipeline module.
    job_options = "-l mem_free=64G"
    job_threads = PARAMS["shrimp_threads"]

    # "80%%" is an escaped percent sign for the later %-interpolation;
    # %(shrimp_threads)i requires the value to format as an integer
    statement = '''
    gmapper-cs --full-threshold 80%% --threads %(shrimp_threads)i --fastq --output-report
              --sam-unaligned
              %(shrimp_options)s
              %(infile)s 
              %(genome_dir)s/%(genome)s.fa 
    2> %(outfile)s.log
    | gzip 
    > %(outfile)s 
    '''

    P.run()
Exemplo n.º 55
0
def getDuplicationStats(outfile):
    '''
    Grab the picard duplication stats table
    from the mapping pipeline database

    Runs ``cgat extract_stats --task=extract_table`` against the
    database named by ``%(mapping_db)s`` on port 3306 and writes the
    table ``%(mapping_picard_dups)s`` to *outfile*; the log goes to
    ``<outfile>.log``.
    '''

    # Only `outfile` is a local here; every other placeholder is
    # undefined in this function.
    # NOTE(review): they presumably come from PARAMS via P.run() --
    # confirm in the Pipeline module.
    statement = '''
    cgat extract_stats
    --log=%(outfile)s.log
    --task=extract_table
    --database-port=3306
    --database-backend=%(database_backend)s
    --database-hostname=%(database_host)s
    --database-username=%(database_username)s
    --database=%(mapping_db)s
    --table-name=%(mapping_picard_dups)s
    > %(outfile)s
    '''

    P.run()
Exemplo n.º 56
0
def buildReadCorrespondence(infiles, outfile):
    '''Compare several BAM files read by read with ``cgat diff_bam``.

    Every file in *infiles* is streamed through
    ``samtools view -h | hsort 0`` inside a process substitution.  The
    column headers are the file names with ``.bam`` removed by
    ``P.snip``.  The gzip-compressed result is written to *outfile*;
    the log goes to ``<outfile>.log``.

    NOTE(review): ``P.run()`` is called without arguments, so it
    presumably reads ``statement``, ``to_cluster`` and the ``%(...)s``
    values (``headers``, ``sorters``) from this function's locals and
    PARAMS -- confirm in the Pipeline module.
    '''

    to_cluster = USECLUSTER

    # comma-separated track names for --header-names
    headers = ",".join(P.snip(bam, ".bam") for bam in infiles)

    # one sorted SAM stream per input file
    sorters = " ".join(
        "<( samtools view -h %s | %s/hsort 0 )" % (bam, PARAMS["scriptsdir"])
        for bam in infiles)

    statement = '''
    cgat diff_bam
         --header-names=%(headers)s
         --log=%(outfile)s.log
       %(sorters)s
    | gzip
    > %(outfile)s
    '''

    P.run()
Exemplo n.º 57
0
def makeSailfishIndex(infile, outfile):
    '''
    Build a sailfish index from a multi-fasta file of spliced
    transcript sequences.

    ``fastq2tpm.py --method=make_index --program=sailfish`` is run with
    8 threads; the index is written into the directory part of
    *outfile* and the log to ``<outfile>.log``.

    NOTE(review): ``P.run()`` is called without arguments, so it
    presumably reads ``statement``, ``job_threads`` and the ``%(...)s``
    values (``outdir``) from this function's locals and PARAMS --
    confirm in the Pipeline module.
    '''

    job_threads = 8

    # everything before the last "/" (empty string if there is none)
    outdir = outfile.rpartition("/")[0]

    statement = '''
    python %(cgat_scripts)s/fastq2tpm.py
    --method=make_index
    --program=sailfish
    --index-fasta=%(infile)s
    --kmer-size=%(sailfish_kmer)s
    --threads=%(job_threads)s
    --output-directory=%(outdir)s
    --log=%(outfile)s.log
    '''

    P.run()
Exemplo n.º 58
0
def spikeVsGenome(infile, outfile):
    '''Summarise the number of reads mapping uniquely to spike-ins and genome.
       Compute the ratio of reads mapping to spike-ins vs genome.
       Only uniquely mapping reads are considered

       A header line is written to *outfile* first; a single data line
       with the genome count, the spike-in count and the spike-in
       fraction is then appended.  Alignments are counted as "genome"
       when column 3 matches the awk pattern ``chr*`` and as spike-in
       when it matches ``ERCC*``.

       NOTE(review): the two awk patterns are unanchored regular
       expressions, not globs (``chr*`` matches any name containing
       "ch") -- confirm whether ``^chr`` / ``^ERCC`` was meant.  awk
       also aborts with a division by zero when no alignment is
       counted at all.

       NOTE(review): ``P.run()`` is called without arguments, so it
       presumably reads ``statement`` and the ``%(...)s`` values from
       this function's locals and PARAMS -- confirm in the Pipeline
       module.'''

    # literal backslash-t sequences; expanded to tabs by `echo -e`
    header = "\\t".join(
        ["nreads_uniq_map_genome", "nreads_uniq_map_spike", "fraction_spike"])

    # `grep -w` is required: a plain `grep NH:i:1` would also accept
    # NH:i:10, NH:i:11, ... i.e. multi-mapping reads.
    statement = ''' echo -e "%(header)s" > %(outfile)s;
                    checkpoint;
                    samtools view %(infile)s
                    | grep -w NH:i:1
                    | awk 'BEGIN{OFS="\\t";ercc=0;genome=0};
                           $3~/chr*/{genome+=1};
                           $3~/ERCC*/{ercc+=1};
                           END{frac=ercc/(ercc+genome);
                               print genome,ercc,frac};'
                    >> %(outfile)s
                '''
    P.run()
Exemplo n.º 59
0
def mergeSailfishCounts(infiles, outfile):
    '''
    Merge all raw counts from sailfish across each
    condition

    The files in *infiles* are combined with ``cgat combine_tables``
    (``--columns=1 --take=5``); column names are taken from the file
    names via the regular expression ``(.+).quant``.  The merged table
    is written to *outfile* and the log to ``<outfile>.log``.
    '''

    # the template expects a single space-separated string
    infiles = " ".join(infiles)
    # NOTE(review): job_memory is not used in the template; it is
    # presumably read by P.run() from this frame -- confirm in the
    # Pipeline module.
    job_memory = "4G"

    statement = '''
    cgat combine_tables
    --columns=1
    --take=5
    --use-file-prefix
    --regex-filename='(.+).quant'
    --log=%(outfile)s.log
    %(infiles)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 60
0
def grepPrimers(infile, outfile):
    '''count occurrences of decreasing primer substrings at start of reads

    The primer sequence is ``PARAMS["grep_primer_b"]`` when "_b." occurs
    in *infile* at an index greater than 0, otherwise
    ``PARAMS["grep_primer_a"]``.  For every prefix of the primer, from
    the full length down to 6 characters, the prefix and the number of
    lines of the decompressed *infile* starting with it are appended to
    *outfile* (one shell run per prefix).  A final run appends
    "Total reads" and the line count divided by 4, then uses ``sed`` to
    join each pair of lines in *outfile* with a tab.

    NOTE(review): *outfile* is only ever appended to in this function
    -- confirm that a stale file from a previous run cannot be present.
    '''
    # NOTE(review): presumably read by P.run() to decide where the job
    # executes -- confirm in the Pipeline module.
    to_cluster = False
    primer = "a"
    # find() returns -1 when absent; "> 0" also excludes a match at the
    # very start of the name
    if infile.find("_b.") > 0:
        primer = "b"
    if primer == "a":
        primer_seq = PARAMS["grep_primer_a"]
    else:
        primer_seq = PARAMS["grep_primer_b"]

    # prefix lengths len(primer_seq), ..., 6 (the stop value 5 is
    # exclusive)
    for i in range(len(primer_seq), 5, -1):
        primer_subseq = primer_seq[:i]
        statement = '''echo "%(primer_subseq)s" >> %(outfile)s; zcat %(infile)s | grep ^%(primer_subseq)s | wc -l >> %(outfile)s;'''
        # NOTE(review): P.run() takes no arguments; it presumably picks
        # up `statement` and the placeholder values from this function's
        # locals and PARAMS -- confirm.
        P.run()

    # reformat out file
    statement = '''echo "Total reads" >> %(outfile)s; echo `zcat %(infile)s |  wc -l` / 4 | bc >> %(outfile)s;
                   sed -i '{N;s/\\n/\\t/}' %(outfile)s; '''
    P.run()