Example #2
def orthologTripleWithFeature(infile, outfile):
    '''Generate list of conserved genes associated with feature in all species '''
    tablename = "ortholog_triple_with_feature"
    #anno_base = PARAMS["annotations_dir"]
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    #db_name = PARAMS["database"]
    species_lookup = dict(zip(species_list, anno_list))

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect(PARAMS["database"])
    for species in species_lookup:
        species_db = species_lookup[species]
        #species_db = anno_base + species_genome + "/" + db_name
        cc = dbhandle.cursor()
        statement = '''ATTACH DATABASE '%(species_db)s' as %(species)s''' % locals()
        cc.execute(statement)
        cc.close()

    # Extract data from db
    cc = dbhandle.cursor()
    cc.execute("DROP TABLE IF EXISTS %(tablename)s" % locals())
    statement = '''CREATE TABLE %(tablename)s AS 
                   SELECT count(distinct o.schema) as species_count, 
                   group_concat(o.gene_id,",") as gene_ids,
                   group_concat(g.gene_name,",") as gene_names,
                   group_concat(o.schema,",") as species_list, set_id
                   FROM genelists_merged g, triple_ortholog_groups o
                   WHERE g.gene_id=o.gene_id
                   GROUP BY set_id ''' % locals()
    cc.execute(statement)
    cc.close()
    statement = "touch %s" % outfile
    P.run()
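
# The ATTACH DATABASE pattern above is what lets one query join tables across
# the per-species annotation databases. A minimal sketch of the same
# mechanism; the file name here is hypothetical, not the pipeline's own:
import sqlite3

conn = sqlite3.connect("csvdb")
conn.execute("ATTACH DATABASE 'mm_annotations.db' AS mm")
# tables in the attached database are addressed as <schema>.<table>,
# just as %(species)s.transcript_info is used elsewhere in this listing
for row in conn.execute('''SELECT g.gene_id, t.gene_name
                           FROM genelists_merged g, mm.transcript_info t
                           WHERE g.gene_id = t.gene_id LIMIT 5'''):
    print(row)
conn.close()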
Example #3
def exportConservedGeneBed(infile, outfile):
    '''export bed file for each list of conserved CAPseq genes'''
    species_list = P.asList(PARAMS["species"])
    gtf_list = P.asList(PARAMS["annotations_gtf"])
    species_lookup = dict(zip(species_list, gtf_list))
    species = infile[0:2]
    species_gtf = species_lookup[species]
    track = P.snip(os.path.basename(infile), ".export")

    #gtffile = os.path.join( PARAMS["annotations_dir"], species_genome, PARAMS["annotations_gtf"] )
    statement = '''zcat %(species_gtf)s | python %(scriptsdir)s/gtf2gtf.py --filter=gene --apply=%(infile)s --log=%(outfile)s.log
                   | python %(scriptsdir)s/gtf2gtf.py --merge-transcripts --with-utr --log=%(outfile)s.log
                   | python %(scriptsdir)s/gff2bed.py --is-gtf --name=gene_id --track=feature --log=%(outfile)s.log
                   | grep -v track > %(outfile)s;'''
    P.run()
Example #5
def buildCheckSums(infile, outfile):
    '''build checksums for files in the build directory.

    Files are uncompressed before computing the checksum
    as gzip stores meta information such as the time stamp.
    '''

    track = P.snip(infile, ".log")

    suffixes = P.asList(PARAMS.get('%s_suffixes' % track, PARAMS["suffixes"]))

    if len(suffixes) == 0:
        raise ValueError('no file types defined for test')

    regex_pattern = ".*\(%s\)" % "\|".join(suffixes)
    regex_pattern = pipes.quote(regex_pattern)

    # ignore log files as time stamps will
    # be different
    statement = '''find %(track)s.dir
    -type f
    -not -regex ".*.log"
    -regex %(regex_pattern)s
    -exec %(scriptsdir)s/cgat_file_apply.sh {} md5sum \;
    | perl -p -e "s/ +/\\t/g"
    | sort -k1,1
    > %(outfile)s'''
    P.run()
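
# The docstring's point about gzip made concrete: the gzip header embeds a
# modification time, so two files with identical payloads can differ
# byte-for-byte. A timestamp-proof checksum needs only the standard library
# (a sketch; not part of the pipeline itself):
import gzip
import hashlib

def md5_of_uncompressed(path):
    '''md5 of the decompressed payload - stable across re-compressions.'''
    md5 = hashlib.md5()
    with gzip.open(path, "rb") as inf:
        # stream in 1 MB chunks to keep memory flat on large files
        for chunk in iter(lambda: inf.read(1 << 20), b""):
            md5.update(chunk)
    return md5.hexdigest()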
Example #7
def exportConservedGeneListPerSpecies(infile, outfile):
    '''Export list of conserved genes associated with feature for each species '''

    species_list = P.asList(PARAMS["species"])
    ensembl_version = PARAMS["orthology_ensembl_version"]

    # Get gene list from database
    dbhandle = sqlite3.connect(PARAMS["database"])
    for species in species_list:
        cc = dbhandle.cursor()
        statement = '''SELECT distinct g.gene_id
                       FROM ortholog_groups g, ortholog_groups_with_feature f
                       WHERE f.set_id=g.set_id
                       AND f.species_count=6
                       AND g.schema LIKE "cgat_%(species)s%%"''' % locals()
        cc.execute(statement)

        # Write to file
        outfilename = species + ".conserved.export"
        outs = open(outfilename, "w")
        for result in cc:
            pre = ""
            for r in result:
                outs.write("%s%s" % (pre, str(r)))
                pre = "\t"
            outs.write("\n")
        cc.close()
        outs.close()

    statement = "touch %s" % outfile
    P.run()
Example #9
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.getTempDir(".")

    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
           tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
Example #10
def annotateGenesetOverlap(infile, outfile):
    '''classify intervals according to their base pair overlap with respect to different genomic features (genes, TSS, upstream/downstream flanks) '''
    to_cluster = True
    feature_list = P.asList(PARAMS["geneset_feature_list"])
    outfiles = ""
    first = True
    for feature in feature_list:
        feature_name = P.snip(os.path.basename(feature),
                              ".gtf").replace(".", "_")
        outfiles += " %(outfile)s.%(feature_name)s " % locals()
        if first:
            cut_command = "cut -f1,4,5,6,8 "
            first = False
        else:
            cut_command = "cut -f4,5,6 "
        statement = """
                cat %(infile)s
                | python %(scriptsdir)s/bed2gff.py --as-gtf
                | python %(scriptsdir)s/gtf2table.py
		                --counter=overlap
		                --counter=length
		                --log=%(outfile)s.log
		                --filename-gff=%(geneset_dir)s/%(feature)s
		                --genome-file=%(genome_dir)s/%(genome)s
                | %(cut_command)s
                | sed s/nover/%(feature_name)s_nover/g
                | sed s/pover/%(feature_name)s_pover/g
                | sed s/min/length/
                > %(outfile)s.%(feature_name)s"""
        P.run()
    # Paste output together
    statement = '''paste  %(outfiles)s > %(outfile)s'''
    P.run()
Example #14
def buildLineCounts(infile, outfile):
    '''compute line counts.

    Files are uncompressed before computing the number of lines.
    '''

    track = P.snip(infile, ".log")

    suffixes = P.asList(PARAMS.get('%s_suffixes' % track, PARAMS["suffixes"]))

    if len(suffixes) == 0:
        raise ValueError('no file types defined for test')

    regex_pattern = ".*\(%s\)" % "\|".join(suffixes)

    regex_pattern = pipes.quote(regex_pattern)

    # ignore log files as time stamps will
    # be different
    statement = '''find %(track)s.dir
    -type f
    -not -regex ".*.log"
    -regex %(regex_pattern)s
    -exec %(scriptsdir)s/cgat_file_apply.sh {} wc -l \;
    | sort -k1,1
    > %(outfile)s'''
    P.run()
Example #16
def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking
    via intermediate targets.'''

    to_cluster = True

    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []

    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError("required file %s for %s (stage %i) not exist." %
                             (filename, outfile, stage))

        if stage == 0:
            statement.append('''gunzip < %(filename)s''' % locals())
        else:
            statement.append('''
               pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals())

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
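
# For a concrete picture: with a hypothetical two-step path such as
# "hg19ToMm10,mm10ToRn5", the loop above composes the per-stage commands
# into a single shell pipeline of this shape:
#
#     gunzip < hg19ToMm10.over.psl.gz
#     | pslMap stdin <(gunzip < mm10ToRn5.over.psl.gz) stdout
#     | gzip > %(outfile)s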
Example #18
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak 
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substituteParameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.asList(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
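
# A minimal sketch of the four selection rules in the docstring, kept apart
# from the database layer; the tuple layout and names are illustrative, not
# PipelineMotifs' actual API:
def selectForMotifDiscovery(intervals, proportion, min_sequences, maxsize):
    '''intervals: (peakval, sequence) tuples; returns sequences to export.'''
    # 1. rank intervals by peakval, best first
    ranked = sorted(intervals, key=lambda x: x[0], reverse=True)
    # 3. take at least min_sequences, even when the proportion yields fewer
    n = max(int(len(ranked) * proportion), min_sequences)
    # 2. trimming to peak +/- halfwidth happens when sequences are extracted
    selected, total = [], 0
    for peakval, sequence in ranked[:n]:
        # 4. cap the total amount of sequence that is written out
        if total + len(sequence) > maxsize:
            break
        selected.append(sequence)
        total += len(sequence)
    return selected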
Example #19
def exportPairsScoreMatrix2(infile, outfile):
    species_list = P.asList(PARAMS["species"])
    outs = open(outfile, "w")
    first = True
    for species in species_list:
        dbhandle = sqlite3.connect(PARAMS["database"])
        cc = dbhandle.cursor()
        statement = ''' SELECT species, score from (
                        SELECT species2 as species, score2 as score from pairwise_ortholog_stats where species1="%(species)s"
                        UNION SELECT species1 as species, score2 as score from pairwise_ortholog_stats where species2="%(species)s"
                        UNION SELECT "%(species)s" as species,  1.0 as score)
                        ORDER BY species desc''' % locals()
        # If first write headers
        if first:
            cc.execute(statement)
            outs.write("species")
            for result in cc:
                outs.write("\t%s" % result[0])
            outs.write("\n")
            first = False
        cc.execute(statement)
        outs.write(species)
        for result in cc:
            outs.write("\t%s" % result[1])
        outs.write("\n")
        cc.close()
    outs.close()
Example #20
def findGenes(infile, outfile):
    '''Adds expression "GENE_OF_INTEREST" to the FILTER column of the vcf if variant is within a gene of interest as defined in the ini file'''
    to_cluster = USECLUSTER
    geneList = P.asList(PARAMS["annotation_genes_of_interest"])
    expression = '\'||SNPEFF_GENE_NAME==\''.join(geneList)
    statement = '''GenomeAnalysisTK -T VariantFiltration -R %%(bwa_index_dir)s/%%(genome)s.fa --variant %(infile)s --filterExpression "SNPEFF_GENE_NAME=='%(expression)s'" --filterName "GENE_OF_INTEREST" -o %(outfile)s''' % locals()
    P.run()
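
# A worked example of the filter expression assembled above, with invented
# gene names; the join separator supplies the quotes *between* genes and the
# statement's own quotes close the ends:
geneList = ["BRCA1", "TP53"]  # hypothetical annotation_genes_of_interest
expression = '\'||SNPEFF_GENE_NAME==\''.join(geneList)
print(expression)
# BRCA1'||SNPEFF_GENE_NAME=='TP53
# embedded in --filterExpression this reads:
#   SNPEFF_GENE_NAME=='BRCA1'||SNPEFF_GENE_NAME=='TP53'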
Example #21
def mergeGeneLists(infiles, outfile):
    '''Merge gene lists into single table and load into SQLite.'''

    tablename = P.toTable(outfile)
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    species_lookup = dict(zip(species_list, anno_list))

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect(PARAMS["database"])
    for species in species_lookup:
        species_db = species_lookup[species]
        #species_db = anno_base + species_genome + "/" + db_name
        cc = dbhandle.cursor()
        statement = '''ATTACH DATABASE '%(species_db)s' as %(species)s''' % locals()
        print(statement)
        cc.execute(statement)
        cc.close()

    # Build union statement
    pre = "CREATE TABLE %s AS " % tablename
    statement = ""
    for f in infiles:
        track = P.snip(os.path.basename(f),
                       ".genelist.load").replace("-", "_").replace(".", "_")
        species = track[:2]
        genelist_id = PARAMS["genelist_id"]
        statement += pre + '''SELECT distinct t.gene_id, t.gene_name, "%(species)s" AS species
                       FROM %(track)s_genelist g, %(species)s.transcript_info t
                       WHERE g.gene_id=t.%(genelist_id)s and t.gene_biotype='protein_coding' ''' % locals()
        pre = " UNION "

    print(statement)
    cc = dbhandle.cursor()
    cc.execute("DROP TABLE IF EXISTS %(tablename)s" % locals())
    cc.execute(statement)
    cc.execute('''CREATE INDEX "glm_idx1" ON "%s" ("gene_id" ASC) ''' %
               tablename)
    cc.execute('''CREATE INDEX "glm_idx2" ON "%s" ("species" ASC) ''' %
               tablename)
    cc.close()

    statement = "touch %s" % outfile
    P.run()
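
# With two hypothetical input tracks hs_foo and mm_bar (and
# genelist_id=gene_id), the loop above assembles a UNION statement of this
# shape before executing it:
#
#   CREATE TABLE <tablename> AS
#   SELECT distinct t.gene_id, t.gene_name, "hs" AS species
#          FROM hs_foo_genelist g, hs.transcript_info t
#          WHERE g.gene_id=t.gene_id and t.gene_biotype='protein_coding'
#   UNION SELECT distinct t.gene_id, t.gene_name, "mm" AS species
#          FROM mm_bar_genelist g, mm.transcript_info t
#          WHERE g.gene_id=t.gene_id and t.gene_biotype='protein_coding'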
def buildAssemblyStats(infile, outfile):
    '''
    return assembly stats from all of the assemblers that
    were used in the running of the contig assembly
    '''
    assemblers = P.asList(PARAMS.get("assemblers"))

    # connect
    dbh = sqlite3.connect(infile)
    cc = dbh.cursor()

    result = {}
    alignment_stats_names = []
    for assembler in assemblers:
        tablename = "%s_contig_summary_tsv" % assembler
        # get the contig summaries
        for data in cc.execute("""SELECT track,
                                  nscaffolds,
                                  median_length,
                                  mean_length,
                                  max_length,
                                  scaffold_length,
                                  N50
                                  FROM %s""" % tablename).fetchall():
            track = "%s_" % assembler + data[0]
            result[track] = list(data[1:])
            alignment_stats_names.append(track + "_alignment_stats")

    # get the alignment statistics - % of reads
    # mapping to contigs
    for a in alignment_stats_names:
        a = P.toTable(a + ".load")
        for data in cc.execute(
                """SELECT percent FROM %s WHERE category == 'reads_mapped'""" %
                a).fetchall():
            track = a[:-len("_alignment_stats")].replace(
                "_filtered_contigs",
                ".filtered.contigs").replace("sim_",
                                             "sim-").replace("BP_", "BP-")
            result[track].append(data[0])

    outf = open(outfile, "w")
    outf.write(
        "assembler\ttrack\tncontigs\tmedian_length\tmean_length\tmax_length\ttotal_length\tN50\tpercent_mapped\n"
    )
    for track, results in result.items():
        assembler = track.split("_")[0]
        track = track.split("_")[1].replace("-R1.filtered.contigs", "")
        outf.write("\t".join([assembler, track] + list(map(str, results))) + "\n")
    outf.close()
def getInput( track ):
    '''return a list of input tracks associated with track.

    Associations can be defined in the .ini file in the section
    [input]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::
    
       [input]
       track1.bam=input1.bam,input2.bam

    Glob expressions are permitted.

    Default tracks can be specified using a placeholder ``%``. The
    following will associate all tracks with the same bam file::

        [input]
        %=all.bam


    '''

    input_files = []
 
    # configparser by default converts option names to lower case
    fn = track.asFile()
    fn = fn.lower()

    if "input_%s" % fn in PARAMS:
        input_files.extend( P.asList( PARAMS["input_%s" % fn ] ) )
    else:
        for pattern, value in P.CONFIG.items( "input" ):
            if "%" in pattern:
                p = re.sub( "%", "\S+", pattern )
                if re.search( p, fn ):
                    input_files.extend( P.asList( value ) )

    return input_files
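
# A short sketch of the % placeholder resolution (pattern and track name
# invented): % is rewritten to the regex \S+ before matching, so a single
# [input] entry can cover many tracks:
import re

pattern = "track%"             # hypothetical key from the [input] section
fn = "track1-r1"               # a track filename, lower-cased
p = re.sub("%", r"\S+", pattern)
print(bool(re.search(p, fn)))  # True: "track1-r1" matches "track\S+"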
Example #25
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the 
    top 10% of intervals (peakval) are used. 
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), "meme",
                               outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run()

        collectMEMEResults(tmpdir, target_path, outfile)
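
# A sketch of the interval selection the docstring describes - the top
# fraction by peakval, then a fixed window around each peak. The tuple
# layout is illustrative, not the actual interval table schema:
def peakWindows(intervals, proportion=0.10, halfwidth=100):
    '''intervals: (contig, peakcenter, peakval) tuples; returns
    2 * halfwidth windows centred on the top-scoring peaks.'''
    ranked = sorted(intervals, key=lambda x: x[2], reverse=True)
    top = ranked[:max(1, int(len(ranked) * proportion))]
    return [(contig, max(0, centre - halfwidth), centre + halfwidth)
            for contig, centre, peakval in top]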
Example #27
def copyEnsemblDb(infile, outfile):
    """copy tables from ensembl database to rnaseq database"""
    table_list = P.asList(PARAMS["ensembl_tables"])
    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    query = """ATTACH "%s" as ensembl;""" % PARAMS["ensembl_db"]
    cc.execute(query)
    for table in table_list:
        cc = dbhandle.cursor()
        query = """CREATE TABLE %s AS SELECT * FROM ensembl.%s;""" % (table, table)
        print(query)
        cc.execute(query)
    cc.close()
    statement = """touch %(outfile)s;"""
    P.run()
Example #30
def runTest(infile, outfile):
    '''run a test.'''

    track = P.snip(outfile, ".log")

    pipeline_name = PARAMS.get("%s_pipeline" % track,
                               "pipeline_" + track[len("test_"):])
    pipeline_targets = ' '.join(
        P.asList(PARAMS.get("%s_target" % track, "full")))

    # do not run on cluster, mirror
    # that a pipeline is started from
    # the head node
    to_cluster = False

    statement = '''
    (cd %(track)s.dir;
    python %(pipelinedir)s/%(pipeline_name)s.py
    %(pipeline_options)s make %(pipeline_targets)s) >& %(outfile)s
    '''
    P.run()
@transform(buildEdgeRStats, suffix(".tsv"), ".load")
def loadEdgeRStats(infile, outfile):
    P.load(infile, outfile)


@follows(loadCufflinks, loadCufflinksFPKM, loadGeneLevelReadCounts)
def expression():
    pass


mapToTargets = {
    'cuffdiff': loadCuffdiffStats,
    'deseq': loadDESeqStats,
    'edger': loadEdgeRStats,
}
TARGETS_DIFFEXPRESSION = [mapToTargets[x] for x in P.asList(PARAMS["methods"])]


@follows(*TARGETS_DIFFEXPRESSION)
def diff_expression():
    pass


@follows(diff_expression)
@merge("*_stats.tsv", "de_stats.load")
def loadDEStats(infiles, outfile):
    '''load DE stats into table.'''
    P.concatenateAndLoad(infiles,
                         outfile,
                         missing_value=0,
                         regex_filename="(.*)_stats.tsv")
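
# For example, with methods=cuffdiff,edger in the ini file, P.asList yields
# ["cuffdiff", "edger"] and the lookup above selects the two matching load
# targets:
#
#   TARGETS_DIFFEXPRESSION == [loadCuffdiffStats, loadEdgeRStats]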
Example #32
def MergedGeneListStats(infile, outfile):

    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    species_lookup = dict(zip(species_list, anno_list))

    # Write to file
    header = "species\tgenes_with_feature\ttotal_genes\ttotal_conserved_genes\tconserved_genes_with_feature\tproportion_with_feature\tproportion_conserved\tproportion_conserved_with_feature"
    outs = open(outfile, "w")
    outs.write("%s\n" % (header))

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect(PARAMS["database"])
    for species in species_lookup:
        species_db = species_lookup[species]
        #species_db = anno_base + species_genome + "/" + db_name
        cc = dbhandle.cursor()
        statement = '''ATTACH DATABASE '%(species_db)s' as %(species)s''' % locals()
        print(statement)
        cc.execute(statement)
        cc.close()

        # Extract data from db
        cc = dbhandle.cursor()
        statement = '''SELECT count(distinct t.gene_id) as genes
                       FROM genelists_merged g, %(species)s.transcript_info t
                       WHERE g.gene_id=t.gene_id 
                       AND t.gene_biotype='protein_coding' ''' % locals()
        cc.execute(statement)
        result = cc.fetchall()
        genes_with_feature = str(result[0][0])
        cc.close()
        #print track + " genes_with_feature=" + genes_with_feature + "\n"

        cc = dbhandle.cursor()
        statement = '''SELECT count(distinct gene_id) as genes
                       FROM %(species)s.transcript_info where gene_biotype='protein_coding' ''' % locals()
        cc.execute(statement)
        result = cc.fetchall()
        total_genes = str(result[0][0])
        cc.close()
        #print track + " total_protein_coding_genes =" + total_genes + "\n"

        proportion_with_feature = (float(genes_with_feature) /
                                   float(total_genes)) * 100
        #print track + " proportion_with_feature =" + str(proportion_with_feature) + "%\n"

        cc = dbhandle.cursor()
        statement = '''SELECT count(distinct set_id) as genes
                       FROM ortholog_groups''' % locals()
        cc.execute(statement)
        result = cc.fetchall()
        total_conserved_genes = str(result[0][0])
        cc.close()
        #print "total_conserved_genes =" + total_conserved_genes + "\n"

        proportion_conserved = (float(total_conserved_genes) /
                                float(total_genes)) * 100
        #print track + " proportion_conserved =" + str(proportion_conserved) + "%\n"

        cc = dbhandle.cursor()
        statement = '''SELECT count(distinct t.gene_id) as genes
                       FROM genelists_merged g, %(species)s.transcript_info t, ortholog_groups o
                       WHERE g.gene_id=t.gene_id and t.gene_biotype='protein_coding' 
                       AND o.gene_id=t.gene_id''' % locals()
        cc.execute(statement)
        result = cc.fetchall()
        conserved_genes_with_feature = str(result[0][0])
        cc.close()
        #print track + " conserved_genes_with_feature=" + conserved_genes_with_feature + "\n"

        proportion_conserved_with_feature = (
            float(conserved_genes_with_feature) /
            float(total_conserved_genes)) * 100
        #print track + " proportion_conserved_with_feature =" + str(proportion_conserved_with_feature) + "%\n"

        outs.write(
            "%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n" %
            (species, genes_with_feature, total_genes, total_conserved_genes,
             conserved_genes_with_feature, proportion_with_feature,
             proportion_conserved, proportion_conserved_with_feature))

    outs.close()
Example #33
    | %(cmd-farm)s --split-at-regex="^chain" --chunksize=1000 --max-lines=1000000 --log=%(outfile)s.log
    " python %(scriptsdir)s/chain2psl.py --log=%(outfile)s.log
      | pslSwap stdin stdout "
    | gzip
    >  %(outfile)s
    '''

    P.run()

Example #34
    statement = '''
    (cd %(test_name)s.dir;
    python %(pipelines_dir)s/%(pipeline_name)s.py
    %(pipeline_options)s make full) >& %(outfile)s
    '''
    P.run()


###################################################################
###################################################################
###################################################################
# general tests
###################################################################
@files([(os.path.join(PARAMS["data_dir"], x + ".dir"),
         x + ".log")
        for x in P.asList(PARAMS["prerequisites"])])
def runPreparationTests(infile, outfile):
    '''run pre-requisite pipelines.'''
    runTest(infile, outfile)


###################################################################
###################################################################
###################################################################
# run a test
###################################################################
@follows(runPreparationTests)
@files([(x,
         os.path.basename(P.snip(x, '.dir')) + ".log")
        for x in glob.glob(
            os.path.join(PARAMS["data_dir"], "pipeline_*.dir"))
Example #35
    P.run()

    statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; > %(outfile)s'''
    P.run()

###################################################################
###################################################################
###################################################################
# do not run in parallel. run_weka.pl creates a $testfile
# that is not unique. run_weka.pl and pph2arff.pl could either
# be patched or the following jobs run in sequence.


@jobs_limit(1)
@files([(buildPolyphenFeatures, "polyphen_%s.output.gz" % x, x)
        for x in P.asList(PARAMS["polyphen_models"])])
def runPolyphen(infile, outfile, model):
    '''run POLYPHEN on feature tables to classify SNPs.
    '''

    to_cluster = True

    # options
    # -f: feature set, default is F11
    # -c: classifier, default is NBd (Naive Bayes with discretization)
    # -l: model name, default is HumDiv

    statement = '''
    %(polyphen_home)s/bin/run_weka.pl 
           -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model
           %(infile)s 
Example #36
                  "pipeline.ini" ] )

PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

###################################################################
###################################################################
###################################################################
## Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample3

METHODS = P.asList(PARAMS["methods"])

###################################################################
###################################################################
###################################################################
# if conf.py exists: execute to change the above assignments
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    exec(open("pipeline_conf.py").read())

###################################################################
###################################################################
###################################################################
def connect():
    '''connect to database.
Example #39
              --log=%(outfile)s.log
              --fdr=%(edger_fdr)f"
              | grep -v "warnings"
              | gzip
              > %(outfile)s '''

    P.run()


@follows(aggregateTiledReadCounts, mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
@files([((data, design),
         "diff_methylation/%s_%s.deseq.gz" % (P.snip(os.path.basename(data), ".counts.tsv.gz"),
                                              P.snip(os.path.basename(design), ".tsv")))
        for data, design in itertools.product(
            glob.glob("diff_methylation/*.counts.tsv.gz"),
            P.asList(PARAMS["deseq_designs"]))])
def runDESeq(infiles, outfile):
    '''estimate differential expression using DESeq.

    The final output is a table. It is slightly edited such that
    it contains a similar output and similar fdr compared to cuffdiff.
    '''

    runDE(infiles, outfile, "deseq")

#########################################################################
#########################################################################
#########################################################################


@follows(aggregateTiledReadCounts, mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
Example #40
                   > %(outfile)s.log.gz'''
    P.run()

    statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; 
                | gzip 
                > %(outfile)s'''
    P.run()

###################################################################
###################################################################
###################################################################


@files([(x, "%s_%s.output.gz" % (x[:-len(".features.gz")], y), y)
        for x, y in itertools.product(
            glob.glob("*.features.gz"), P.asList(PARAMS["polyphen_models"]))])
def runPolyphen(infile, outfile, model):
    '''run POLYPHEN on feature tables to classify SNPs.
    '''

    to_cluster = False

    # need to run in chunks for large feature files
    statement = """gunzip 
        < %(infile)s
        | %(cmd-farm)s
            --split-at-lines=10000
            --output-header
        "perl %(polyphen_home)s/bin/run_weka_cpp.pl 
           -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model
           -p 
def getAssociatedBAMFiles( track ):
    '''return a list of BAM files associated with a track.

    By default, this method searches for ``track.bam`` 
    file in the current directory and returns an offset of 0.

    Associations can be defined in the .ini file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::
    
       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets
    need to be defined in the [offsets] sections. If no offsets
    are defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::
    
       [offsets]
       track1=120,200

    returns a list of BAM files and offsets.

    Default tracks and offsets can be specified using a placeholder ``%``. The
    following will associate all tracks with the same bam file::

        [bams]
        %=all.bam


    '''
    fn = track.asFile()
    bamfiles = glob.glob("%s.bam" % fn)

    if bamfiles == []:
        if "bams_%s" % fn.lower() in PARAMS:
            for ff in P.asList(PARAMS["bams_%s" % fn.lower()]):
                bamfiles.extend(glob.glob(ff))
        else:
            for pattern, value in P.CONFIG.items("bams"):
                if "%" in pattern:
                    p = re.sub("%", r"\S+", pattern)
                    if re.search(p, fn, re.IGNORECASE):
                        bamfiles.extend(glob.glob(value))

    offsets = []
    if "offsets_%s" % fn.lower() in PARAMS:
        offsets = list(map(int, P.asList(PARAMS["offsets_%s" % fn.lower()])))
    else:
        for pattern, value in P.CONFIG.items("offsets"):
            if "%" in pattern:
                p = re.sub("%", r"\S+", pattern)
                if re.search(p, fn, re.IGNORECASE):
                    offsets.extend(map(int, value.split(",")))

    if offsets == []:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError(
            "number of BAM files %s is not the same as number of offsets: %s" %
            (str(bamfiles), str(offsets)))

    return bamfiles, offsets
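
# Illustration of the ini wiring, reusing the docstring's own values: the
# offsets string is split and cast to int, and the function guarantees
# parallel lists (zero offsets when no [offsets] entry exists):
value = "120,200"                    # [offsets] track1=120,200
offsets = list(map(int, value.split(",")))
print(offsets)                       # [120, 200]
# with no [offsets] section the fallback is [0] * len(bamfiles),
# so callers can always zip(bamfiles, offsets)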
Example #42
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
    glob.glob("*.fastq.gz"), r"(\S+).fastq.gz") +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.1.gz"), r"(\S+).fastq.1.gz")

ALL = PipelineTracks.Sample3()
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

###################################################################
# Global flags
###################################################################
# AH: added default values for assemblers and coverage_mapper
# to allow import of pipeline script
ASSEMBLERS = P.asList(PARAMS.get("assemblers", ""))
MAPPER = PARAMS.get("coverage_mapper", 'bwa')
BOWTIE = MAPPER == "bowtie"
BOWTIE2 = MAPPER == "bowtie2"
BWA = MAPPER == "bwa"


def connect():
    '''connect to database.

    This method also attaches to helper databases.
    '''
    dbh = sqlite3.connect(PARAMS["database"])
    return dbh

###################################################################
Example #43
    >  %(outfile)s
    '''

    P.run()


##################################################################################
##################################################################################
##################################################################################
## extracting alignments from maf files
##################################################################################
if "maf_dir" in PARAMS and "maf_tracks" in PARAMS:

    @files([(("%s/*.maf.gz" % PARAMS["maf_dir"]), "%sTo%s.raw.psl.gz" %
             (PARAMS["%s_label" % track], PARAMS["maf_master"]), track)
            for track in P.asList(PARAMS["maf_tracks"])])
    def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
        '''build pairwise genomic alignment from maf files.'''

        try:
            os.remove(outfile)
        except OSError:
            pass

        genomefile = PARAMS["%s_genome" % track]

        to_cluster = True

        for infile in infiles:

            E.info("adding %s" % infile)
    defaults={
        'annotations_dir': "",
        'paired_end': False})

PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py", on_error_raise=__name__ == "__main__")

###################################################################
###################################################################
###################################################################
# get options that are to be tested
cufflinks_options = {}
if "cufflinks_test_options" in PARAMS:
    options = P.asList(PARAMS["cufflinks_test_options"])
    for option in options:
        if option == "--pre-mrna-fraction" \
                or option == "--small-anchor-fraction" \
                or option == "--max-multiread-fraction":
            cufflinks_options[option] = [0, 0.5, 0.75, 1]
        elif option == "--min-isoform-fraction":
            cufflinks_options[option] = [0.05, 0.1, 0.5, 1]
        elif option == "--junc-alpha":
            cufflinks_options[option] = [0.001, 0.01, 0.1]
        elif option == "--min-frags-per-transfrag":
            cufflinks_options[option] = [1, 5, 10]
        elif option == "--overhang-tolerance":
            cufflinks_options[option] = [0, 2, 5, 8]
        elif option == "--overlap-radius":
            cufflinks_options[option] = [50, 100, 200]
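
# A minimal sketch (not part of the pipeline) of how such a grid of test
# values might be expanded into concrete option strings downstream, here
# varying one option at a time; the helper name is hypothetical:
def enumerate_test_options(option_values):
    '''yield one command-line option string per tested value.'''
    for option, values in sorted(option_values.items()):
        for value in values:
            yield "%s %s" % (option, value)

# e.g. list(enumerate_test_options({"--junc-alpha": [0.001, 0.01]}))
# == ["--junc-alpha 0.001", "--junc-alpha 0.01"]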
示例#45
0
    P.run()

    statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; > %(outfile)s'''
    P.run()

###################################################################
###################################################################
###################################################################
# Do not run these jobs in parallel: run_weka.pl creates a $testfile
# whose name is not unique. Either run_weka.pl and pph2arff.pl need
# to be patched, or the following jobs have to run in sequence.


@jobs_limit(1)
@files([(buildPolyphenFeatures, "polyphen_%s.output.gz" % x, x)
        for x in P.asList(PARAMS["polyphen_models"])])
def runPolyphen(infile, outfile, model):
    '''run POLYPHEN on feature tables to classify SNPs.
    '''

    to_cluster = True

    # options
    # -f: feature set, default is F11
    # -c: classifier, default is NBd (Naive Bayes with discretization)
    # -l: model name, default is HumDiv

    statement = '''
    %(polyphen_home)s/bin/run_weka.pl 
           -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model
           %(infile)s 
示例#46
0
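    # the statement below converts the repeats GFF to BED, counts repeat
    # overlap for each interval in the temporary BED file with coverageBed,
    # prepends a column header with awk and loads the table via csv2db.py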
    statement = '''
        gunzip < %(repeatsfile)s 
        | python %(scriptsdir)s/gff2bed.py -v 0 
        | coverageBed -a stdin -b %(tmpfilename)s
        | awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}'
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --table=%(table)s 
        > %(outfile)s
    '''
    P.run()

    os.unlink( tmpfilename )

###################################################################
###################################################################
@files( [ (PARAMS["%s_merge" % x], "%s.gtf.gz" % x) for x in P.asList(PARAMS["merge"])] +\
            [ (EXPERIMENTAL_TRACKS, PARAMS["merged"] ) ] )
def buildMergedTracks( infiles, outfile ):
    '''merge tracks.'''

    infiles = " ".join(infiles)
    statement = '''
        zcat %(infiles)s 
        | python %(scriptsdir)s/gff2psl.py 
                 --log=%(outfile)s.log 
                 --is-gtf 
                 --allow-duplicates 
        | python %(scriptsdir)s/psl2psl.py 
                 --log=%(outfile)s.log 
                 --method=rename-query 
                 --unique 
示例#47
0
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "pipeline.ini"],
    defaults={"annotations_annotations_dir": "",
              "genesets_abinitio_coding": "pruned.gtf.gz",
              "genesets_abinitio_lncrna": "pruned.gtf.gz",
              "genesets_reference": "reference.gtf.gz",
              "genesets_refcoding": "refcoding.gtf.gz",
              "genesets_previous": ""})

PARAMS = P.PARAMS
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_annotations_dir"],
    "pipeline_annotations.py",
    on_error_raise=__name__ == "__main__")
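# Note: on_error_raise above evaluates to True only when the pipeline is
# executed directly; importing this module (e.g. to build documentation)
# should then not fail if the annotations directory is missing.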
PREVIOUS = P.asList(PARAMS["genesets_previous"])

#########################################################################
#########################################################################
#########################################################################


def connect():
    '''connect to database.

    This method also attaches to helper databases.
    '''

    dbh = sqlite3.connect(PARAMS["database"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_annotations_database"])
def GeneListStats( infile, outfile ):
    '''compute summary statistics for a gene list: the number and
    proportion of protein-coding genes with the feature, of conserved
    genes, and of conserved genes with the feature.'''

    track = P.snip( os.path.basename( infile), ".genelist.load" ).replace("-","_").replace(".","_")
    species = track[:2]
    #anno_base = PARAMS["annotations_dir"]
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    #ensembl_version = PARAMS["orthology_ensembl_version"]
    species_lookup = dict(zip(species_list, anno_list))
    species_db = species_lookup[species]
    #species_db = anno_base + species_genome + "/" + PARAMS["database"]

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect( PARAMS["database"] )
    cc = dbhandle.cursor()
    statement = '''ATTACH DATABASE '%(species_db)s' as %(species)s''' % locals()
    cc.execute( statement )
    cc.close()

    # Extract data from db
    cc = dbhandle.cursor()
    statement = '''SELECT count(distinct t.gene_id) as genes
                   FROM %(track)s_genelist g, %(species)s.transcript_info t
                   WHERE g.gene_id=t.transcript_id and t.gene_biotype='protein_coding' ''' % locals()
    cc.execute( statement )
    result = cc.fetchall()
    genes_with_feature = str(result[0][0])
    cc.close()
    #print track + " genes_with_feature=" + genes_with_feature + "\n"

    cc = dbhandle.cursor()
    statement = '''SELECT count(distinct gene_id) as genes
                   FROM %(species)s.transcript_info where gene_biotype='protein_coding' ''' % locals()
    cc.execute( statement )
    result = cc.fetchall()
    total_genes = str(result[0][0])
    cc.close()
    #print track + " total_protein_coding_genes =" + total_genes + "\n"

    proportion_with_feature = (float(genes_with_feature)/float(total_genes))*100
    #print track + " proportion_with_feature =" + str(proportion_with_feature) + "%\n"

    cc = dbhandle.cursor()
    statement = '''SELECT count(distinct set_id) as genes
                   FROM ortholog_groups''' % locals()
    cc.execute( statement )
    result = cc.fetchall()
    total_conserved_genes = str(result[0][0])
    cc.close()
    #print "total_conserved_genes =" + total_conserved_genes + "\n"

    proportion_conserved = (float(total_conserved_genes)/float(total_genes))*100
    #print track + " proportion_conserved =" + str(proportion_conserved) + "%\n"

    cc = dbhandle.cursor()
    statement = '''SELECT count(distinct t.gene_id) as genes
                   FROM %(track)s_genelist g, %(species)s.transcript_info t, ortholog_groups o
                   WHERE g.gene_id=t.transcript_id and t.gene_biotype='protein_coding' 
                   AND o.gene_id=t.gene_id''' % locals()
    cc.execute( statement )
    result = cc.fetchall()
    conserved_genes_with_feature = str(result[0][0])
    cc.close()
    #print track + " conserved_genes_with_feature=" + conserved_genes_with_feature + "\n"

    proportion_conserved_with_feature = (float(conserved_genes_with_feature)/float(total_conserved_genes))*100
    #print track + " proportion_conserved_with_feature =" + str(proportion_conserved_with_feature) + "%\n"

    # Write to file
    header = "genes_with_feature\ttotal_genes\ttotal_conserved_genes\tconserved_genes_with_feature\tproportion_with_feature\tproportion_conserved\tproportion_conserved_with_feature"
    outs = open( outfile, "w")
    outs.write( "%s\n" % (header) )
    outs.write( "%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n" % (genes_with_feature, total_genes, total_conserved_genes, conserved_genes_with_feature, proportion_with_feature, proportion_conserved, proportion_conserved_with_feature) )
    outs.close()
示例#49
0
            suffix(".tsv"), 
            ".load" )
def loadEdgeRStats( infile, outfile ):
    '''load EdgeR statistics into the database.'''
    P.load( infile, outfile )

###################################################################
###################################################################
###################################################################
@follows( loadCufflinks, loadGeneLevelReadCounts )
def expression(): pass

mapToTargets = { 'cuffdiff': loadCuffdiffStats,
                 'deseq': loadDESeqStats,
                 'edger': loadEdgeRStats,
                 }
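# the "methods" ini option selects which of these loaders become pipeline
# targets, e.g. methods=cuffdiff,deseq picks loadCuffdiffStats and loadDESeqStats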
TARGETS_DIFFEXPRESSION = [ mapToTargets[x] for x in P.asList( PARAMS["methods"] ) ]

@follows( *TARGETS_DIFFEXPRESSION )
def diff_expression(): pass

###################################################################
###################################################################
###################################################################
@jobs_limit(1,"R")
@follows( mkdir("tagplots.dir"), aggregateFeatureCounts )
@files( [ (x, os.path.join( "tagplots.dir", y)) for x, y in TARGETS_DE ] )
def plotRNASEQTagData( infiles, outfile ):
    '''plot tag count data for each differential expression design.'''

    design_file = infiles[0]
    geneset_file = infiles[1]
示例#50
0
# collect fastq.gz tracks
TRACKS = PipelineTracks.Tracks( PipelineTracks.Sample3 ).loadFromDirectory(
        glob.glob( "*.fastq.gz" ), "(\S+).fastq.gz" ) +\
        PipelineTracks.Tracks( PipelineTracks.Sample3 ).loadFromDirectory(
            glob.glob( "*.fastq.1.gz" ), "(\S+).fastq.1.gz" )

ALL = PipelineTracks.Sample3()
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

###################################################################
## Global flags
###################################################################
ASSEMBLERS = P.asList(PARAMS["general_assemblers"])
METAGENOME = "meta-velvet" in ASSEMBLERS or "idba" in ASSEMBLERS or "cortex_var" in ASSEMBLERS

ASSEMBLERS = P.asList(PARAMS["assemblers"])
MAPPER = PARAMS["coverage_mapper"]
BOWTIE = MAPPER == "bowtie"
BWA = MAPPER == "bwa"


###################################################################
###################################################################
###################################################################
def connect():
    '''connect to database.

    This method also attaches to helper databases.
示例#51
0
        P.run()
    
    statement = '''
        (cd %(test_name)s.dir; python %(scriptsdir)s/%(pipeline_name)s.py 
                      %(pipeline_options)s make full) >& %(outfile)s
        ''' 
    
    P.run()

###################################################################
###################################################################
###################################################################
## general tests
###################################################################
@files( [ (os.path.join( PARAMS["data_dir"], x + ".dir"), x + ".log" ) for x in
             P.asList(PARAMS["prerequisites"]) ] )
def prepareTests( infile, outfile ):
    '''run pre-requisite pipelines.'''
    runTest( infile, outfile )

###################################################################
###################################################################
###################################################################
## run a test
###################################################################
@follows( prepareTests )
@files( [ (x, os.path.basename(x) + ".log" ) for x in \
              glob.glob( os.path.join( PARAMS["data_dir"], "pipeline_*")) ] )
def runTests( infile, outfile ):
    '''run a pipeline with test data.'''
    runTest( infile, outfile )
示例#52
0
def loadEdgeRStats(infile, outfile):
    '''load EdgeR statistics into the database.'''
    P.load(infile, outfile)


@follows(loadCufflinks,
         loadCufflinksFPKM,
         loadGeneLevelReadCounts)
def expression():
    pass

mapToTargets = {'cuffdiff': loadCuffdiffStats,
                'deseq': loadDESeqStats,
                'edger': loadEdgeRStats,
                }
TARGETS_DIFFEXPRESSION = [mapToTargets[x] for x in
                          P.asList(PARAMS["methods"])]


@follows(*TARGETS_DIFFEXPRESSION)
def diff_expression():
    pass


@follows(diff_expression)
@merge("*_stats.tsv", "de_stats.load")
def loadDEStats(infiles, outfile):
    '''load DE stats into table.'''
    P.concatenateAndLoad(infiles, outfile,
                         missing_value=0,
                         regex_filename="(.*)_stats.tsv")
示例#53
0
def GeneListStats(infile, outfile):
    '''compute summary statistics for a gene list: the number and
    proportion of protein-coding genes with the feature, of conserved
    genes, and of conserved genes with the feature.'''

    track = P.snip(os.path.basename(infile),
                   ".genelist.load").replace("-", "_").replace(".", "_")
    species = track[:2]
    #anno_base = PARAMS["annotations_dir"]
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    #ensembl_version = PARAMS["orthology_ensembl_version"]
    species_lookup = dict(zip(species_list, anno_list))
    species_db = species_lookup[species]
    #species_db = anno_base + species_genome + "/" + PARAMS["database"]

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    statement = '''ATTACH DATABASE '%(species_db)s' as %(species)s''' % locals()
    cc.execute(statement)
    cc.close()

    # Extract data from db
    cc = dbhandle.cursor()
    statement = '''SELECT count(distinct t.gene_id) as genes
                   FROM %(track)s_genelist g, %(species)s.transcript_info t
                   WHERE g.gene_id=t.transcript_id and t.gene_biotype='protein_coding' ''' % locals()
    cc.execute(statement)
    result = cc.fetchall()
    genes_with_feature = str(result[0][0])
    cc.close()
    #print track + " genes_with_feature=" + genes_with_feature + "\n"

    cc = dbhandle.cursor()
    statement = '''SELECT count(distinct gene_id) as genes
                   FROM %(species)s.transcript_info where gene_biotype='protein_coding' ''' % locals()
    cc.execute(statement)
    result = cc.fetchall()
    total_genes = str(result[0][0])
    cc.close()
    #print track + " total_protein_coding_genes =" + total_genes + "\n"

    proportion_with_feature = (float(genes_with_feature) /
                               float(total_genes)) * 100
    #print track + " proportion_with_feature =" + str(proportion_with_feature) + "%\n"

    cc = dbhandle.cursor()
    statement = '''SELECT count(distinct set_id) as genes
                   FROM ortholog_groups''' % locals()
    cc.execute(statement)
    result = cc.fetchall()
    total_conserved_genes = str(result[0][0])
    cc.close()
    #print "total_conserved_genes =" + total_conserved_genes + "\n"

    proportion_conserved = (float(total_conserved_genes) /
                            float(total_genes)) * 100
    #print track + " proportion_conserved =" + str(proportion_conserved) + "%\n"

    cc = dbhandle.cursor()
    statement = '''SELECT count(distinct t.gene_id) as genes
                   FROM %(track)s_genelist g, %(species)s.transcript_info t, ortholog_groups o
                   WHERE g.gene_id=t.transcript_id and t.gene_biotype='protein_coding' 
                   AND o.gene_id=t.gene_id''' % locals()
    cc.execute(statement)
    result = cc.fetchall()
    conserved_genes_with_feature = str(result[0][0])
    cc.close()
    #print track + " conserved_genes_with_feature=" + conserved_genes_with_feature + "\n"

    proportion_conserved_with_feature = (float(conserved_genes_with_feature) /
                                         float(total_conserved_genes)) * 100
    #print track + " proportion_conserved_with_feature =" + str(proportion_conserved_with_feature) + "%\n"

    # Write to file
    header = "genes_with_feature\ttotal_genes\ttotal_conserved_genes\tconserved_genes_with_feature\tproportion_with_feature\tproportion_conserved\tproportion_conserved_with_feature"
    outs = open(outfile, "w")
    outs.write("%s\n" % (header))
    outs.write("%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n" %
               (genes_with_feature, total_genes, total_conserved_genes,
                conserved_genes_with_feature, proportion_with_feature,
                proportion_conserved, proportion_conserved_with_feature))
    outs.close()
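
# An aside on the string-interpolated SQL above: schema and table names
# cannot be bound as sqlite3 parameters, but literal values such as the
# attached database path can be. A minimal sketch with invented values:
import sqlite3

dbh = sqlite3.connect(":memory:")
schema, species_db = "mm", ":memory:"  # hypothetical schema name and path
dbh.execute("ATTACH DATABASE ? AS %s" % schema, (species_db,))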
示例#54
0
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for a ``track.bam`` 
    file in the current directory and returns an offset of 0.

    Associations can be defined in the .ini file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. They are
    defined in the [offsets] section; if no offsets are defined,
    the method returns a list of 0 offsets.

    Offsets must be given in the same order as the BAM files::

       [offsets]
       track1=120,200

    returns a list of BAM files and offsets.

    Default tracks and offsets can be specified using a placeholder ``%``. The
    following will associate all tracks with the same bam file::

        [bams]
        %=all.bam


    '''
    fn = track.asFile()
    bamfiles = glob.glob("%s.bam" % fn)

    if bamfiles == []:
        if "bams_%s" % fn.lower() in PARAMS:
            for ff in P.asList(PARAMS["bams_%s" % fn.lower()]):
                bamfiles.extend(glob.glob(ff))
        else:
            for pattern, value in P.CONFIG.items("bams"):
                if "%" in pattern:
                    p = re.sub("%", "\S+", pattern)
                    if re.search(p, fn, re.IGNORECASE):
                        bamfiles.extend(glob.glob(value))

    offsets = []
    if "offsets_%s" % fn.lower() in PARAMS:
        offsets = map(int, P.asList(PARAMS["offsets_%s" % fn.lower()]))
    else:
        for pattern, value in P.CONFIG.items("offsets"):
            if "%" in pattern:
                p = re.sub("%", "\S+", pattern)
                if re.search(p, fn, re.IGNORECASE):
                    offsets.extend(map(int, value.split(",")))

    if offsets == []:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError(
            "number of BAM files %s is not the same as number of offsets: %s" %
            (str(bamfiles), str(offsets)))

    return bamfiles, offsets
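
# A worked illustration of the conventions documented above, using a
# hypothetical pipeline.ini (file names invented):
#
#    [bams]
#    %=all.bam
#    track1=track1.bam,track2.bam
#
#    [offsets]
#    track1=120,200
#
# For track "track1" this returns (["track1.bam", "track2.bam"], [120, 200]);
# any other track falls back to the "%" placeholder and returns
# (["all.bam"], [0]) -- provided the files exist, as they are collected
# via glob.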
示例#55
0
    # do not run on the cluster, mirroring the fact
    # that a pipeline is started from the head node
    to_cluster = False

    statement = '''
    (cd %(track)s.dir;
    python %(pipelinedir)s/%(pipeline_name)s.py
    %(pipeline_options)s make %(pipeline_targets)s) >& %(outfile)s
    '''
    P.run()


@follows(setupTests)
@files([("%s.tgz" % x, "%s.log" % x)
        for x in P.asList(PARAMS.get("prerequisites", ""))])
def runPreparationTests(infile, outfile):
    '''run pre-requisite pipelines.'''
    runTest(infile, outfile)


@follows(runPreparationTests)
@files([("%s.tgz" % x, "%s.log" % x) for x in P.CONFIG.sections()
        if x.startswith("test")
        and x not in P.asList(PARAMS.get("prerequisites", ""))])
def runTests(infile, outfile):
    '''run a pipeline with test data.'''
    runTest(infile, outfile)
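
# For illustration, a hypothetical pipeline.ini driving the two tasks above:
# every section whose name starts with "test" becomes one runTests job,
# except those listed under prerequisites, which runPreparationTests runs
# first (all names invented):
#
#    [general]
#    prerequisites=test_import
#
#    [test_import]
#
#    [test_mapping]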


@transform((runPreparationTests, runTests), suffix(".log"), ".md5")