Example No. 1
def applyThreshold(infile, fasta, threshold, max_distance=0):
    '''apply a threshold to a wig file, yielding bed-style
    (contig, start, end) intervals.'''

    c = E.Counter()

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        c.contigs += 1

        E.debug("processing %s" % contig)

        last_start, last_end = -1, 0

        for start, end, value in block_iterator(infile, contig, size):
            d = start - last_end
            if d > max_distance or value < threshold:
                if last_start >= 0:
                    yield contig, last_start, last_end
                    c.intervals += 1
                last_start = -1
            elif last_start < 0 and value >= threshold:
                last_start = start

            last_end = end

        if last_start >= 0:
            yield contig, last_start, end
            c.intervals += 1

        c.output += 1

    E.info(str(c))
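Every example on this page tallies progress with E.Counter from CGAT's Experiment module (imported as E). A minimal sketch of the behaviour these snippets rely on, attribute access that auto-initialises to zero; this is an assumed equivalent, not the actual CGAT implementation:

import collections

class Counter(collections.defaultdict):
    """dict with attribute access defaulting to 0, roughly like E.Counter."""

    def __init__(self):
        collections.defaultdict.__init__(self, int)

    def __getattr__(self, key):
        # missing keys fall back to the defaultdict, yielding 0
        return self[key]

    def __setattr__(self, key, value):
        self[key] = value

    def __str__(self):
        return ", ".join("%s=%s" % (k, v) for k, v in sorted(self.items()))

c = Counter()
c.input += 1
c["output"] += 2
print(c)  # input=1, output=2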
Example No. 2
def annotateCpGIslands(infiles, outfile):
    '''annotate transcripts by absence/presence of CpG islands
    '''
    cpgfile, tssfile = infiles
    cpg = Bed.readAndIndex(IOTools.openFile(cpgfile))

    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()
    outf = IOTools.openFile(outfile, "w")
    outf.write(
        "transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n")

    for tss in Bed.iterator(IOTools.openFile(tssfile)):
        c.tss_total += 1

        if tss.strand == "+":
            start, end = tss.start - extension_upstream, tss.start + extension_downstream
        else:
            start, end = tss.end - extension_downstream, tss.end + extension_upstream

        try:
            matches = list(cpg[tss.contig].find(start, end))
        except KeyError:
            c.promoter_without_matches += 1
            continue

        if len(matches) == 0:
            c.promoter_without_matches += 1
            continue

        c.promoter_output += 1
        for match in matches:
            c.matches_total += 1
            genome_start, genome_end, x = match

            length = genome_end - genome_start

            # get relative location of match
            if tss.strand == "+":
                relative_start = genome_start - tss.start
            else:
                relative_start = tss.end - genome_end

            relative_end = relative_start + length

            outf.write("\t".join(
                map(str, (tss.name, tss.strand, genome_start, genome_end,
                          relative_start, relative_end))) + "\n")
            c.matches_output += 1

    outf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
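A hypothetical invocation, assuming a pipeline PARAMS dictionary carrying the two search-window sizes (all names illustrative):

PARAMS = {"cpg_search_upstream": 1000,   # bases upstream of the TSS
          "cpg_search_downstream": 500}  # bases downstream

annotateCpGIslands(("cpg_islands.bed.gz", "tss.bed.gz"),  # hypothetical inputs
                   "cpg_annotation.tsv.gz")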
Example No. 3
def runAnalysis(sequences,
                arrangements,
                matrix='nr',
                qvalue_threshold=0.05):

    if matrix == 'nr':
        sense_matrix = NR
    elif matrix == "rxrvdr":
        sense_matrix = RXRVDR
    else:
        raise ValueError("unknown matrix")

    matcher = MatcherRandomisationSequence(sense_matrix)

    # find motifs in both foreground and control together
    results = []
    for x, sequence in enumerate(sequences):
        result = matcher.run(sequence,
                             arrangements,
                             qvalue_threshold=qvalue_threshold)

        for r in result:
            results.append(r._replace(sequence=x))

    nsequences = len(sequences)

    fg_filtered = combineMotifs(results)
    fg_counter = E.Counter()
    fg_seqs = set()
    co_counter = E.Counter()
    co_seqs = set()

    for x in results:
        fg_counter[x.arrangement] += 1
        fg_seqs.add(x.sequence)

    for x in fg_filtered:
        co_counter[x.arrangement] += 1
        co_seqs.add(x.sequence)

    for x in arrangements:
        print(x, fg_counter[x], co_counter[x])

    print(len(fg_seqs), len(co_seqs))

    return fg_filtered
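A hypothetical call; sequences, the arrangement names and the NR/RXRVDR matrices are module-level assumptions of this snippet:

filtered = runAnalysis(sequences,
                       arrangements=["DR1", "DR3"],  # illustrative names
                       matrix="nr",
                       qvalue_threshold=0.05)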
Example No. 4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("script", "module"),
                      help="type of tests to create [%default].")

    parser.set_defaults(method="script")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) == 0:
        raise ValueError(
            "setup_test.py requires one or more command line arguments")

    targetdir = os.path.dirname(__file__)

    counter = E.Counter()

    for arg in args:
        counter.input += 1
        script_dirname, basename = os.path.split(arg)

        dirname = os.path.join(targetdir, basename)

        if os.path.exists(dirname):
            E.warn("%s already exists - skipping" % basename)
            counter.skipped += 1
            continue

        os.mkdir(dirname)

        with open(os.path.join(dirname, "tests.yaml"), "w") as outf:
            outf.write(YAML_TEMPLATE)

        counter.created += 1

    E.info("%s" % str(counter))

    # write footer and output benchmark information.
    E.Stop()
Example No. 5
def buildGenomicFunctionalAnnotation(gtffile, dbh, outfiles):
    '''output a bed file with genomic regions with functional annotations.

    The regions for each gene are given in the gtf file.

    Each bed entry is a gene territory. Bed entries are labeled
    by functional annotations associated with a gene.

    Ambiguities in territories are resolved by outputting 
    annotations for all genes within a territory.

    The output file contains annotations for both GO and GOSlim. These
    are prefixed by ``go:`` and ``goslim:``.
    '''

    to_cluster = True

    territories_file = gtffile

    outfile_bed, outfile_tsv = outfiles

    gene2region = {}
    for gtf in GTF.iterator(IOTools.openFile(gtffile, "r")):
        gid = gtf.gene_id.split(":")
        for g in gid:
            gene2region[g] = (gtf.contig, gtf.start, gtf.end, gtf.strand)

    # IMS: connect is not in this module. dbh needs to be passed from caller
    #dbh = connect()
    cc = dbh.cursor()

    outf = P.getTempFile(".")
    c = E.Counter()
    term2description = {}
    for db in ('go', 'goslim'):
        for gene_id, go_id, description in cc.execute(
                "SELECT gene_id, go_id, description FROM %s_assignments" % db):
            try:
                contig, start, end, strand = gene2region[gene_id]
            except KeyError:
                c.notfound += 1
                continue
            outf.write("\t".join(
                map(str, (contig, start, end, "%s:%s" %
                          (db, go_id), 1, strand))) + "\n")
            term2description["%s:%s" % (db, go_id)] = description
    outf.close()
    tmpfname = outf.name
    statement = '''sort -k1,1 -k2,2n  < %(tmpfname)s | uniq | gzip > %(outfile_bed)s'''

    P.run()

    outf = IOTools.openFile(outfile_tsv, "w")
    outf.write("term\tdescription\n")
    for term, description in term2description.items():
        outf.write("%s\t%s\n" % (term, description))
    outf.close()
Example No. 6
def checkRequirementsFromAllModules():

    all_modules = sys.modules
    counter = E.Counter()
    results = []
    for module in list(sys.modules.keys()):
        if all_modules[module] is not None:
            results.extend(
                checkRequirementsFromModule(all_modules[module], counter))
    return counter, results
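The traversal is a single call; checkRequirementsFromModule, assumed to be defined alongside, does the per-module work:

counter, results = checkRequirementsFromAllModules()
print("modules checked:", counter)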
Example No. 7
def ReadGene2GOFromFile(infile, synonyms={}, obsolete={}):
    """reads GO mappings for all go_types from a
    file.

    If synonyms is given, goids in synonyms will be translated.
    Terms in *obsolete* will be discarded.

    returns two maps: gene2go maps genes to go categories
    and go2info maps go categories to information.
    """

    gene2gos = {}
    go2infos = {}
    c = E.Counter()

    for line in infile:
        if line[0] == "#":
            continue
        try:
            go_type, gene_id, goid, description, evidence = line[:-1].split(
                "\t")
        except ValueError as msg:
            raise ValueError("parsing error in line '%s': %s" %
                             (line[:-1], msg))
        if go_type == "go_type":
            continue

        c.input += 1

        if goid in synonyms:
            c.synonyms += 1
            goid = synonyms[goid]

        if goid in obsolete:
            c.obsolete += 1
            continue

        gm = GOMatch(goid, go_type, description, evidence)
        gi = GOInfo(goid, go_type, description)
        if go_type not in gene2gos:
            gene2gos[go_type] = {}
            go2infos[go_type] = {}

        gene2go = gene2gos[go_type]
        go2info = go2infos[go_type]

        if gene_id not in gene2go:
            gene2go[gene_id] = []
        gene2go[gene_id].append(gm)
        go2info[goid] = gi
        c.output += 1

    E.debug("read gene2go assignments: %s" % str(c))

    return gene2gos, go2infos
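The parser expects tab-separated lines of go_type, gene_id, goid, description and evidence, skipping # comments and a header row whose first field is go_type. A minimal call (filename hypothetical):

with IOTools.openFile("gene2go.tsv.gz") as inf:  # hypothetical filename
    gene2gos, go2infos = ReadGene2GOFromFile(inf)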
Example No. 8
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    files = glob.glob(os.path.join(os.path.dirname(__file__), "*.py"))

    files.sort()

    # try importing each script, counting successes and failures
    c = E.Counter()

    for f in files:
        # if f != "pipeline_ancestral_repeats.py" : continue
        E.debug("importing %s" % f)
        c.input += 1
        prefix, suffix = os.path.splitext(f)

        dirname, basename = os.path.split(prefix)

        if os.path.exists(prefix + ".pyc"):
            os.remove(prefix + ".pyc")

        success = False
        try:
            __import__(basename, globals(), locals())
            c.success += 1
            success = True
            options.stdout.write("PASS %s\n" % basename)
            options.stdout.flush()
        except ImportError as msg:
            c.import_fail += 1
            options.stdout.write("FAIL %s\n%s\n" % (basename, msg))
            options.stdout.flush()
            traceback.print_exc()
        except Exception as msg:
            c.other_fail += 1
            options.stdout.write("FAIL %s\n%s\n" % (basename, msg))
            options.stdout.flush()
            traceback.print_exc()
Example No. 9
def calculateSplicingIndex(bamfile, gtffile, outfile):

    bamfile = pysam.AlignmentFile(bamfile)

    counts = E.Counter()

    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(gtffile))):

        introns = GTF.toIntronIntervals(transcript)
        E.debug("Gene %s (%s), Transcript: %s, %i introns" %
                (transcript[0].gene_id, transcript[0].contig,
                 transcript[0].transcript_id, len(introns)))

        for intron in introns:
            reads = bamfile.fetch(reference=transcript[0].contig,
                                  start=intron[0],
                                  end=intron[1])

            for read in reads:
                if 'N' in read.cigarstring:
                    blocks = read.get_blocks()
                    starts, ends = zip(*blocks)
                    if intron[0] in ends and intron[1] in starts:
                        counts["Exon_Exon"] += 1
                    else:
                        counts["spliced_uncounted"] += 1
                elif (read.reference_start <= intron[0] - 3
                      and read.reference_end >= intron[0] + 3):
                    if transcript[0].strand == "+":
                        counts["Exon_Intron"] += 1
                    else:
                        counts["Intron_Exon"] += 1
                elif (read.reference_start <= intron[1] - 3
                      and read.reference_end >= intron[1] + 3):
                    if transcript[0].strand == "+":
                        counts["Intron_Exon"] += 1
                    else:
                        counts["Exon_Intron"] += 1
                else:
                    counts["unspliced_uncounted"] += 1

        E.debug("Done, counts are: " + str(counts))
    header = [
        "Exon_Exon", "Exon_Intron", "Intron_Exon", "spliced_uncounted",
        "unspliced_uncounted"
    ]

    with IOTools.openFile(outfile, "w") as outf:

        outf.write("\t".join(header) + "\n")
        outf.write("\t".join(map(str, [counts[col] for col in header])) + "\n")
Example No. 10
def imputeGO(infile_go, infile_paths, outfile):
    '''impute GO accessions.

    Input is a file with GO associations and a file
    with paths from term to ancestor (see go2fmt.pl).
    '''

    c = E.Counter()

    term2ancestors = collections.defaultdict(set)
    with IOTools.openFile(infile_paths) as inf:
        for line in inf:
            parts = line[:-1].split()
            term = parts[0]
            ancestors = [parts[x] for x in range(2, len(parts), 2)]
            # there can be multiple paths
            term2ancestors[term].update(ancestors)

    goid2description = {}
    gene2goids = collections.defaultdict(list)
    goid2type = {}
    with IOTools.openFile(infile_go) as inf:
        for line in inf:
            if line.startswith("go_type"):
                continue
            go_type, gene_id, goid, description, evidence = line[:-1].split(
                "\t")
            gene2goids[gene_id].append(goid)
            goid2description[goid] = description
            goid2type[goid] = go_type

    outf = IOTools.openFile(outfile, "w ")
    for gene_id, in_goids in gene2goids.iteritems():
        c.genes += 1
        out_goids = set(in_goids)
        for goid in in_goids:
            out_goids.update(term2ancestors[goid])
        if len(in_goids) != len(out_goids):
            c.increased += 1
        else:
            c.complete += 1

        for goid in out_goids:
            outf.write("\t".join((goid2type.get(goid, ""), gene_id, goid,
                                  goid2description.get(goid, ""), "NA")) +
                       "\n")
            c.associations += 1

    outf.close()

    E.info("%s" % str(c))
Example No. 11
def clean(files, logfile):
    '''clean up the given files.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of filenames of files to clean up.
    logfile : string
        Filename of logfile.

    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks',
              'st_ctime', 'st_dev', 'st_gid', 'st_ino',
              'st_mode', 'st_mtime', 'st_nlink',
              'st_rdev', 'st_size', 'st_uid')

    dry_run = PARAMS.get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = IOTools.openFile(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = IOTools.openFile(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = IOTools.zapFile(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write("%s\t%s\t%s\t%s\n" % (
                    fn,
                    time.asctime(time.localtime(time.time())),
                    linkdest,
                    "\t".join([str(getattr(stat, x)) for x in fields])))

    E.info("zapped: %s" % (c))
    outfile.close()

    return c
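A hypothetical call; note the function iterates over plain filenames, so any glob is expanded by the caller:

import glob

counts = clean(glob.glob("mapping/*.bam"), "zap.log")  # hypothetical paths
print(counts)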
Example No. 12
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="dry run, do not delete any files [%default]")

    parser.set_defaults(dry_run=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filenames = args

    c = E.Counter()
    for filename in filenames:
        c.checked += 1
        if os.path.exists(filename + ".log"):
            if IOTools.isComplete(filename + ".log"):
                c.complete += 1
                continue

        if IOTools.isComplete(filename):
            c.complete += 1
            continue

        c.incomplete += 1
        E.info('deleting %s' % filename)
        if options.dry_run:
            continue
        os.unlink(filename)
        c.deleted += 1

    E.info(c)

    # write footer and output benchmark information.
    E.Stop()
Example No. 13
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k",
                      "--keep-header",
                      dest="keep_header",
                      type="int",
                      help="randomize, but keep header in place [%default]")

    parser.set_defaults(keep_header=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    inf = options.stdin
    outf = options.stdout
    c = E.Counter()
    for x in range(options.keep_header):
        c.header += 1
        outf.write(inf.readline())

    lines = inf.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    for line in lines:
        outf.write(line)
    c.lines_output = len(lines)

    E.info(c)

    # write footer and output benchmark information.
    E.Stop()
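The core operation, shown standalone: keep the first keep_header lines in place and shuffle the rest:

import random

lines = ["header\n", "a\n", "b\n", "c\n"]
head, body = lines[:1], lines[1:]
random.shuffle(body)
print("".join(head + body), end="")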
Example No. 14
def buildMRBed(infile, outfile):
    '''output bed6 file with methylated regions.

    All regions are output, even the insignificant ones.

    The score is the log fold change.
    '''

    outf = IOTools.openFile(outfile, "w")
    c = E.Counter()
    for row in csv.DictReader(IOTools.openFile(infile), dialect="excel-tab"):
        c.input += 1

        contig, start, end = re.match(r"(.*):(\d+)-(\d+)",
                                      row["interval_id"]).groups()
        c.output += 1
        outf.write("\t".join((contig, start, end, str(c.input),
                              row["lfold"])) + "\n")

    outf.close()

    E.info("%s" % str(c))
Example No. 15
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--exons-file",
                      "--gtf-file",
                      dest="filename_exons",
                      type="string",
                      metavar="gtf",
                      help="gtf formatted file with non-overlapping exon "
                      "locations (required). [%default]")

    parser.set_defaults(
        filename_exons=None,
        read_length=200,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    exons = GTF.readAndIndex(
        GTF.iterator(IOTools.openFile(options.filename_exons)))

    pysam_in = pysam.AlignmentFile("-", "rb")

    nspliced = 0
    nspliced_ignored = 0
    nspliced_nooverlap = 0
    nspliced_halfoverlap = 0
    nspliced_bothoverlap = 0
    nspliced_overrun = [0] * 2 * (options.read_length + 10)
    nspliced_exact = 0
    nspliced_inexact = 0
    nunspliced = 0
    nunspliced_overlap = 0
    nunspliced_ignored = 0
    nunspliced_nooverlap = 0
    nunspliced_overrun = [0] * (options.read_length + 10)
    overrun_offset = options.read_length + 10
    ninput = 0
    nunmapped = 0

    c = E.Counter()

    def _splice_overrun(start, end, overlap):
        '''return splicesite over/underrun.

        positive values: overrun
        negative values: underrun
        0: no over/underrun
        '''

        exon_start = min([x[0] for x in overlap])
        exon_end = max([x[1] for x in overlap])

        if start <= exon_start and end > exon_start:
            # overrun at start or match
            r = exon_start - start
        elif start < exon_end and end >= exon_end:
            # overrun at end or match
            r = end - exon_end
        else:
            # underrun - distance to closest exon boundary
            r = -min(start - exon_start, exon_end - end)

        return r
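    # Worked example for _splice_overrun: with a single exon (100, 200),
    # a segment 90-150 overruns the exon start by 10 (returns 10), while
    # a segment 120-180 lies inside the exon and returns
    # -min(120 - 100, 200 - 180) = -20, an underrun of 20.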

    for read in pysam_in:
        ninput += 1
        if read.is_unmapped:
            nunmapped += 1
            continue

        # check for BAM_CREF_SKIP code in cigar string
        cigar = read.cigartuples
        is_spliced = 3 in [x[0] for x in cigar]

        contig = pysam_in.get_reference_name(read.reference_id)
        start = read.reference_start
        end = read.reference_end
        if is_spliced:
            # count both ends
            nspliced += 1

            if len(cigar) != 3:
                nspliced_ignored += 1
                continue

            start5, end5 = start, start + cigar[0][1]
            start3, end3 = end - cigar[2][1], end
            try:
                overlap3 = list(exons.get(contig, start3, end3))
                overlap5 = list(exons.get(contig, start5, end5))
            except KeyError:
                overlap3 = overlap5 = []

            ovl3 = len(overlap3)
            ovl5 = len(overlap5)
            o3 = o5 = None
            if not ovl3 and not ovl5:
                nspliced_nooverlap += 1
            elif ovl3 and not ovl5:
                nspliced_halfoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
            elif ovl5 and not ovl3:
                nspliced_halfoverlap += 1
                o5 = _splice_overrun(start5, end5, overlap5)
            else:
                # both overlap
                nspliced_bothoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
                o5 = _splice_overrun(start5, end5, overlap5)

            if o3 is not None:
                if o3 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o3)] += 1
            if o5 is not None:
                if o5 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o5)] += 1
        else:
            nunspliced += 1
            try:
                overlap = list(exons.get(contig, start, end))
            except KeyError:
                overlap = []

            if len(overlap) == 0:
                nunspliced_nooverlap += 1
            elif len(overlap) >= 1:
                nunspliced_overlap += 1
                # multiple overlap - merge exons (usually: small introns)
                exon_start = min([x[0] for x in overlap])
                exon_end = max([x[1] for x in overlap])
                ostart = max(0, exon_start - start)
                oend = max(0, end - exon_end)
                o = min(end, exon_end) - max(start, exon_start)
                overrun = ostart + oend
                nunspliced_overrun[overrun] += 1

    # output histograms
    outfile = E.openOutputFile("overrun")
    outfile.write(
        "bases\tunspliced_overrun_counts\tspliced_overrun_counts\tspliced_underrun_counts\n"
    )
    _nspliced_overrun = nspliced_overrun[overrun_offset:]
    _nspliced_underrun = nspliced_overrun[:overrun_offset + 1]
    _nspliced_underrun.reverse()
    for x, v in enumerate(
            zip(nunspliced_overrun, _nspliced_overrun, _nspliced_underrun)):
        outfile.write("%i\t%s\n" % (x, "\t".join(map(str, v))))
    outfile.close()

    # output summary
    # convert to counter
    c.input = ninput
    c.unmapped = nunmapped
    c.mapped = ninput - nunmapped

    c.unspliced = nunspliced
    c.unspliced_nooverlap = nunspliced_nooverlap
    c.unspliced_nooverrun = nunspliced_overrun[0]
    c.unspliced_overlap = nunspliced_overlap
    c.unspliced_overrun = sum(nunspliced_overrun[1:])

    c.spliced = nspliced
    c.spliced_nooverlap = nspliced_nooverlap
    c.spliced_halfoverlap = nspliced_halfoverlap
    c.spliced_bothoverlap = nspliced_bothoverlap
    c.spliced_exact = nspliced_exact
    c.spliced_inexact = nspliced_inexact
    c.spliced_ignored = nspliced_ignored
    c.spliced_underrun = sum(_nspliced_underrun[1:])
    c.spliced_overrun = sum(_nspliced_overrun[1:])

    outfile = options.stdout
    outfile.write("category\tcounts\n")
    for k, v in sorted(c.items()):
        outfile.write("%s\t%i\n" % (k, v))

    # write footer and output benchmark information.
    E.Stop()
Example No. 16
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--database",
                      dest="database",
                      type="string",
                      help="bioconductor database to use [default=%default].")

    parser.add_option("-m",
                      "--mapping",
                      dest="database",
                      type="string",
                      help="bioconductor mapping to use [default=%default].")

    parser.add_option(
        "-g",
        "--gtf-file",
        dest="filename_gtf",
        type="string",
        help="filename with the gene set in gtf format [default=%default].")

    parser.set_defaults(
        database="mouse4302.db",
        mapping="ENSEMBL",
        filename_gtf=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    prefix = options.database[:-len(".db")]

    mapping_probeset2gene = prefix + options.mapping
    mapping_probeset2loc = prefix + "CHRLOC"

    probeset2gene = getProbeset2Gene(database=options.database, )

    probeset2location = getProbeset2Location(database=options.database, )

    # gtf = GTF.readAndIndex(
    #     GTF.iterator( IOTools.openFile( options.filename_gtf ) ) )

    counts = E.Counter()

    outfile_notfound = open("notfound.table", "w")

    options.stdout.write("probeset_id\tgene_id\tngenes\n")

    for probeset, locations in probeset2location.items():
        counts.probesets += 1
        gene_ids = probeset2gene[probeset]
        if len(gene_ids) == 0:
            counts.notfound += 1
            continue

        for gene_id in gene_ids:
            options.stdout.write("%s\t%s\t%i\n" %
                                 (probeset, gene_id, len(gene_ids)))
        counts.output += 1

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.Stop()
Example No. 17
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--summarise",
                      dest="summarise",
                      type="choice",
                      choices=("level-counts", "taxa-counts", "individual"),
                      help="summarise the taxa counts - no. phyla etc")

    parser.add_option("--output-map",
                      dest="output_map",
                      action="store_true",
                      help="ouput map of taxonomy")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.output_map:
        found = []
        options.stdout.write("""Domain\t \
        kingdom\t \
        phylum\t \
        class\t \
        order\t \
        family\t \
        genus\t \
        species\n""")
        # only output the mapping file - do not continue
        # summarise regardless of the specified options
        for lca in LCA.iterate(options.stdin):

            # if bacteria or archaea the kingdom will
            # be the domain
            if lca.domain == "Bacteria" or lca.domain == "Archaea":
                kingdom = lca.domain
            else:
                kingdom = lca.kingdom

            hierarchy = [
                lca.domain, kingdom, lca.phylum, lca._class, lca.order,
                lca.family, lca.genus, lca.species
            ]
            if hierarchy in found:
                continue
            else:
                found.append(hierarchy)
                options.stdout.write("\t".join(hierarchy) + "\n")
        return

    if options.summarise == "level-counts":
        level_counts = collections.defaultdict(set)
        total = 0
        nreads_domain = 0
        nreads_kingdom = 0
        nreads_kingdom_plus = 0
        nreads_phylum = 0
        nreads_phylum_plus = 0
        nreads_class = 0
        nreads_class_plus = 0
        nreads_order = 0
        nreads_order_plus = 0
        nreads_family = 0
        nreads_family_plus = 0
        nreads_genus = 0
        nreads_genus_plus = 0
        nreads_species = 0
        nreads_species_plus = 0
        nreads_subspecies = 0
        nreads_subspecies_plus = 0

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                nreads_domain += 1
                level_counts["domain"].add(lca.domain)
            else:
                c.domain_unmapped += 1

            if lca.kingdom != "NA":
                nreads_kingdom += 1
                level_counts["kingdom"].add(lca.kingdom)
            else:
                c.kingdom_unmapped += 1

            if lca.kingdom_plus != "NA":
                nreads_kingdom_plus += 1
                level_counts["kingdom+"].add(lca.kingdom_plus)
            else:
                c.kingdom_plus_unmapped += 1

            if lca.phylum != "NA":
                nreads_phylum += 1
                level_counts["phylum"].add(lca.phylum)
            else:
                c.phylum_unmapped += 1

            if lca.phylum_plus != "NA":
                nreads_phylum_plus += 1
                level_counts["phylum+"].add(lca.phylum_plus)
            else:
                c.phylum_plus_unmapped += 1

            if lca._class != "NA":
                nreads_class += 1
                level_counts["class"].add(lca._class)
            else:
                c.class_unmapped += 1

            if lca._class_plus != "NA":
                nreads_class_plus += 1
                level_counts["class+"].add(lca._class_plus)
            else:
                c.class_plus_unmapped += 1

            if lca.order != "NA":
                nreads_order += 1
                level_counts["order"].add(lca.order)
            else:
                c.order_unmapped += 1

            if lca.order_plus != "NA":
                nreads_order_plus += 1
                level_counts["order+"].add(lca.order_plus)
            else:
                c.order_plus_unmapped += 1

            if lca.family != "NA":
                nreads_family += 1
                level_counts["family"].add(lca.family)
            else:
                c.family_unmapped += 1

            if lca.family != "NA":
                nreads_family_plus == 1
                level_counts["family+"].add(lca.family_plus)
            else:
                c.family_plus_unmapped += 1

            if lca.genus != "NA":
                nreads_genus += 1
                level_counts["genus"].add(lca.genus)
            else:
                c.genus_unmapped += 1

            if lca.genus_plus != "NA":
                nreads_genus_plus += 1
                level_counts["genus+"].add(lca.genus_plus)
            else:
                c.genus_plus_unmapped += 1

            if lca.species != "NA":
                nreads_species += 1
                level_counts["species"].add(lca.species)
            else:
                c.species_unmapped += 1

            if lca.species_plus != "NA":
                nreads_species_plus += 1
                level_counts["species+"].add(lca.species_plus)
            else:
                c.species_plus_unmapped += 1

            # removed subspecies mapping for the time
            # being

            # if lca.subspecies != "NA":
            #     nreads_subspecies += 1
            #     level_counts["subspecies"].add(lca.subspecies)
            # else:
            #     c.subspecies_unmapped += 1

            # if lca.subspecies_plus != "NA":
            #     nreads_subspecies_plus += 1
            #     level_counts["subspecies+"].add(lca.subspecies_plus)
            # else:
            #     c.subspecies_plus_unmapped += 1

        options.stdout.write("\t".join([
            "ndomain", "nkingdom", "nkingdom+", "nphylum", "nphylum+",
            "nclass", "nclass+", "norder", "norder+", "nfamily", "nfamily+",
            "ngenus", "ngenus+", "nspecies", "nspecies+", "nseqkingdom",
            "nseqkingdom+", "nseqphylum", "nseqphylum+", "nseqclass",
            "nseqclass+", "nseqorder", "nseqorder+", "nseqfamily",
            "nseqfamily+", "nseqgenus", "nseqgenus+", "nseqspecies",
            "nseqspecies+"
        ]) + "\n")

        options.stdout.write("\t".join(
            map(str, [
                len(level_counts["domain"]),
                len(level_counts["kingdom"]),
                len(level_counts["kingdom+"]),
                len(level_counts["phylum"]),
                len(level_counts["phylum+"]),
                len(level_counts["class"]),
                len(level_counts["class+"]),
                len(level_counts["order"]),
                len(level_counts["order+"]),
                len(level_counts["family"]),
                len(level_counts["family+"]),
                len(level_counts["genus"]),
                len(level_counts["genus+"]),
                len(level_counts["species"]),
                len(level_counts["species+"]), nreads_domain, nreads_kingdom,
                nreads_phylum, nreads_phylum_plus, nreads_class,
                nreads_class_plus, nreads_order, nreads_order_plus,
                nreads_family, nreads_family_plus, nreads_genus,
                nreads_genus_plus, nreads_species, nreads_species_plus
            ])) + "\n")
    elif options.summarise == "taxa-counts":
        unmapped = collections.defaultdict(int)
        total = 0
        taxa_counts = {
            "domain": collections.defaultdict(int),
            "kingdom": collections.defaultdict(int),
            "kingdom+": collections.defaultdict(int),
            "phylum": collections.defaultdict(int),
            "phylum+": collections.defaultdict(int),
            "class": collections.defaultdict(int),
            "class+": collections.defaultdict(int),
            "order": collections.defaultdict(int),
            "order+": collections.defaultdict(int),
            "family": collections.defaultdict(int),
            "family+": collections.defaultdict(int),
            "genus": collections.defaultdict(int),
            "genus+": collections.defaultdict(int),
            "species": collections.defaultdict(int),
            "species+": collections.defaultdict(int)
        }

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                taxa_counts["domain"][lca.domain] += 1
            else:
                c.domain_unmapped += 1
                unmapped["domain"] += 1
            if lca.kingdom != "NA":
                taxa_counts["kingdom"][lca.kingdom] += 1
            else:
                c.kingdom_unmapped += 1
                unmapped["kingdom"] += 1
            if lca.kingdom_plus != "NA":
                taxa_counts["kingdom+"][lca.kingdom_plus] += 1
            else:
                c.kingdom_plus_unmapped += 1
                unmapped["kingdom+"] += 1
            if lca.phylum != "NA":
                taxa_counts["phylum"][lca.phylum] += 1
            else:
                c.phylum_unmapped += 1
                unmapped["phylum"] += 1
            if lca.phylum_plus != "NA":
                taxa_counts["phylum+"][lca.phylum_plus] += 1
            else:
                c.phylum_plus_unmapped += 1
                unmapped["phylum+"] += 1
            if lca._class != "NA":
                taxa_counts["class"][lca._class] += 1
            else:
                c.class_unmapped += 1
                unmapped["class"] += 1
            if lca._class_plus != "NA":
                taxa_counts["class+"][lca._class_plus] += 1
            else:
                c.class_plus_unmapped += 1
                unmapped["class+"] += 1
            if lca.order != "NA":
                taxa_counts["order"][lca.order] += 1
            else:
                c.order_unmapped += 1
                unmapped["order"] += 1
            if lca.order_plus != "NA":
                taxa_counts["order+"][lca.order_plus] += 1
            else:
                c.order_plus_unmapped += 1
                unmapped["order+"] += 1
            if lca.family != "NA":
                taxa_counts["family"][lca.family] += 1
            else:
                c.family_unmapped += 1
                unmapped["family"] += 1
            if lca.family_plus != "NA":
                taxa_counts["family+"][lca.family_plus] += 1
            else:
                c.family_plus_unmapped += 1
                unmapped["family+"] += 1
            if lca.genus != "NA":
                taxa_counts["genus"][lca.genus] += 1
            else:
                c.genus_unmapped += 1
                unmapped["genus"] += 1
            if lca.genus_plus != "NA":
                taxa_counts["genus+"][lca.genus_plus] += 1
            else:
                c.genus_plus_unmapped += 1
                unmapped["genus+"] += 1
            if lca.species != "NA":
                taxa_counts["species"][lca.species] += 1
            else:
                c.species_unmapped += 1
                unmapped["species"] += 1
            if lca.species_plus != "NA":
                taxa_counts["species+"][lca.species_plus] += 1
            else:
                c.species_plus_unmapped += 1
                unmapped["species+"] += 1

        options.stdout.write("level\ttaxa\tcount\tproportion\trpm\n")
        for level, taxa_count in sorted(taxa_counts.items()):
            total_level = total - unmapped[level]
            for taxa, count in sorted(taxa_count.items()):
                options.stdout.write("\t".join([
                    level, taxa,
                    str(count), "{:.8}".format(float(count) /
                                               total_level), "{:.8}".
                    format(float(count) / (float(total_level) / 1000000))
                ]) + "\n")

        E.info(c)

    elif options.summarise == "individual":
        # each read is output with its respective
        # taxon assignments
        options.stdout.write("\t".join([
            "id", "domain", "kingdom", "kingdom+", "phylum", "phylum+",
            "class", "class+", "order", "order+", "family", "family+", "genus",
            "genus+", "species", "species+"
        ]) + "\n")
        for lca in LCA.iterate(options.stdin):
            options.stdout.write("\t".join([
                lca.identifier, lca.domain, lca.kingdom, lca.kingdom_plus,
                lca.phylum, lca.phylum_plus, lca._class, lca._class_plus,
                lca.order, lca.order_plus, lca.family, lca.family_plus,
                lca.genus, lca.genus_plus, lca.species, lca.species_plus
            ]) + "\n")

    # write footer and output benchmark information.
    E.Stop()
Example No. 18
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and 
    an iterator over gtf annotations.
    """

    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes), sum(contig_sizes.values()) * array.array("c").itemsize))
          # AString.AString( "a").itemsize ))

    for contig, size in contig_sizes.items():
        E.debug("allocating %s: %i bases" % (contig, size))
        # annotations[contig] = AString.AString( default_code * size )
        annotations[contig] = array.array("c", default_code * size)

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.openOutputFile("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")
    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        except KeyError as msg:
            E.warn("contig %s not found - annotation ignored" % gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]

            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig],
                        intervals,
                        is_positive,
                        code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"]
            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            start, end = cds[0][0], cds[-1][1]

            UTR5 = [x for x in exons if x[1] < start]
            UTR3 = [x for x in exons if x[0] >= end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5
                splice_code = "S"
            else:
                splice_code = "s"

            addSegments(annotations[contig],
                        UTR5,
                        is_positive,
                        "u")

            addIntrons(annotations[contig],
                       UTR5,
                       is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig],
                        UTR3,
                        is_positive,
                        "v")

            addIntrons(annotations[contig],
                       UTR3,
                       is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"],
                   is_positive)

            # add introns between CDS
            addIntrons(annotations[contig],
                       cds,
                       is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                ender = lambda x: x.end - 1
                starter = lambda x: x.start
                out_positive = "+"
            else:
                ender = lambda x: lcontig - x.start - 1
                starter = lambda x: lcontig - x.end
                out_positive = "-"
                cds.reverse()

            end = ender(cds[0])
            for c in cds[1:]:
                start = starter(c)
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" %
                                        (contig,
                                         out_positive,
                                         end,
                                         start,
                                         c.frame,
                                         c.gene_id,
                                         c.transcript_id,
                                         ))
                end = ender(c)
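A quick check of the strand-dependent coordinate flip above, with assumed values:

lcontig = 1000
start, end = 100, 200  # forward-strand, half-open exon coordinates
print(lcontig - end, lcontig - start - 1)  # 800 899: the exon seen from the reverse strand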
Example No. 19
def createGOFromGeneOntology(infile, outfile):
    """get GO assignments from Geneontology.org

    GO terms are mapped to ensembl gene names via uniprot identifiers.

    Configuration
    -------------
    geneontology_file
       Filename on geneontology database, e.g.,
       gene_association.goa_human.gz
    database_name
       Pipeline database name

    Arguments
    ---------
    infile : string
        Unused
    outfile : string
        Output filename
    """

    filename = os.path.join(os.path.dirname(outfile), "geneontology.goa.gz")
    if not os.path.exists(filename):
        statement = '''
    wget -O %(filename)s http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/%(go_geneontology_file)s?rev=HEAD
    '''

        P.run()

    # see http://www.geneontology.org/gene-associations/readme/goa.README
    Data = collections.namedtuple(
        "Data",
        "db db_object_id db_object_symbol qualifier goid dbreference evidence "
        " with_id aspect "
        " db_object_name synonym db_object_type "
        " taxon_id date assigned_by "
        " annotation_extension"
        " gene_product_form_id")

    dbh = sqlite3.connect(PARAMS["database_name"])
    cc = dbh.cursor()
    map_uniprot2ensembl = dict(
        cc.execute("SELECT DISTINCT gene_name, gene_id FROM transcript_info").
        fetchall())
    map_goid2description = dict(
        cc.execute("SELECT DISTINCT go_id, description FROM go_assignments").
        fetchall())

    aspect2name = {
        "P": "biol_process",
        "F": "mol_function",
        "C": "cell_location"
    }

    c = E.Counter()
    found_uniprot, found_genes, notfound_uniprot = set(), set(), set()
    outf = IOTools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")
    for line in IOTools.openFile(filename):
        if line.startswith("!"):
            continue
        c.input += 1
        data = Data._make(line[:-1].split("\t"))

        if data.db_object_symbol in map_uniprot2ensembl:
            gene_id = map_uniprot2ensembl[data.db_object_symbol]
            found_uniprot.add(data.db_object_symbol)
            found_genes.add(gene_id)
            outf.write(
                "%s\t%s\t%s\t%s\t%s\n" %
                (aspect2name[data.aspect], gene_id, data.goid,
                 map_goid2description.get(data.goid, ""), data.evidence))
            c.output += 1

        else:
            c.notfound += 1
            notfound_uniprot.add(data.db_object_symbol)

    c.found_genes = len(found_genes)
    c.found_uniprot = len(found_uniprot)
    c.notfound_uniprot = len(notfound_uniprot)

    E.info("%s" % str(c))
    E.info("not found=%s" % str(notfound_uniprot))
    outf.close()
Example No. 20
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome [default=%default].")

    parser.add_option(
        "--remove-regex", dest="remove_regex",
        type="string",
        help="regular expression of contigs to remove [default=None].")

    parser.add_option(
        "-e", "--gff-file", dest="gff_file", type="string",
        help="gff file to use for getting contig sizes.")

    parser.add_option(
        "-f", "--fixed-width-windows",
        dest="fixed_width_windows", type="string",
        help="fixed width windows. Supply the window size as a "
        "parameter. Optionally supply an offset.")

    parser.set_defaults(
        genome_file=None,
        remove_regex=None,
        fixed_width_windows=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.remove_regex:
        remove_regex = re.compile(options.remove_regex)
    else:
        remove_regex = None

    if options.fixed_width_windows:
        v = map(int, options.fixed_width_windows.split(","))
        if len(v) == 2:
            window_size, window_increment = v
        elif len(v) == 1:
            window_size, window_increment = v[0], v[0]
        else:
            raise ValueError(
                "could not parse window size '%s': should be size[,increment]" % options.fixed_width_windows)

    map_contig2size = {}

    if options.gff_file:
        infile = open(options.gff_file, "r")
        gff = GTF.readFromFile(infile)
        infile.close()
        for g in gff:
            try:
                map_contig2size[g.mName] = max(map_contig2size[g.mName], g.end)
            except KeyError:
                map_contig2size[g.mName] = g.end

    else:
        gff = None

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        map_contig2size = fasta.getContigSizes(with_synonyms=False)
    else:
        fasta = None

    if not map_contig2size:
        raise ValueError("no source of contig sizes supplied")

    # do sth
    counter = E.Counter()

    for contig, size in map_contig2size.items():
        size = int(size)
        counter.input += 1

        if remove_regex and remove_regex.search(contig):
            counter.skipped += 1
            continue

        if options.fixed_width_windows:
            for x in range(0, size, window_increment):
                if x + window_size > size:
                    continue
                options.stdout.write(
                    "%s\t%i\t%i\n" % (contig, x, min(size, x + window_size)))
                counter.windows += 1
        else:
            options.stdout.write("%s\t%i\t%i\n" % (contig, 0, size))
            counter.windows += 1

        counter.output += 1

    E.info(str(counter))

    # write footer and output benchmark information.
    E.Stop()
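For example, with size 10, window_size 4 and window_increment 3, the loop above emits windows (0, 4), (3, 7) and (6, 10); the start at 9 is skipped because 9 + 4 overshoots the contig:

size, window_size, window_increment = 10, 4, 3
for x in range(0, size, window_increment):
    if x + window_size > size:
        continue
    print(x, min(size, x + window_size))  # -> 0 4, 3 7, 6 10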
Example No. 21
def main(argv=sys.argv):

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("igv", ),
                      help="method to create plots with [%default]")

    parser.add_option("-d",
                      "--snapshot-dir",
                      dest="snapshotdir",
                      type="string",
                      help="directory to save snapshots in [%default]")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("png", "eps", "svg"),
                      help="output file format [%default]")

    parser.add_option("-o",
                      "--host",
                      dest="host",
                      type="string",
                      help="host that IGV is running on [%default]")

    parser.add_option("-p",
                      "--port",
                      dest="port",
                      type="int",
                      help="port that IGV listens at [%default]")

    parser.add_option(
        "-e",
        "--extend",
        dest="extend",
        type="int",
        help="extend each interval by a number of bases [%default]")

    parser.add_option("-x",
                      "--expand",
                      dest="expand",
                      type="float",
                      help="expand each region by a certain factor [%default]")

    parser.set_defaults(
        method="igv",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    E.info("connection to session on %s:%s" % (options.host, options.port))

    E.info("saving images in %s" % options.snapshotdir)
    igv = IGV.IGV(host=options.host,
                  port=options.port,
                  snapshot_dir=os.path.abspath(options.snapshotdir))

    c = E.Counter()
    for bed in Bed.iterator(options.stdin):

        c.input += 1

        # IGV cannot deal with whitespace in filenames
        name = re.sub(r"\s", "_", bed.name)

        E.info("going to %s:%i-%i for %s" %
               (bed.contig, bed.start, bed.end, name))

        start, end = bed.start, bed.end
        extend = options.extend
        if options.expand:
            d = end - start
            extend = max(extend, (options.expand * d - d) // 2)

        start -= extend
        end += extend

        igv.go("%s:%i-%i" % (bed.contig, start, end))

        fn = "%s.%s" % (name, options.format)
        E.info("writing snapshot to '%s'" % fn)
        igv.save(fn)

        c.snapshots += 1

    E.info(c)
    E.Stop()
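The --expand arithmetic grows each interval symmetrically; with expand = 2.0 a 100 bp interval gains 50 bp on each side, doubling the displayed region:

start, end, expand, extend = 1000, 1100, 2.0, 0
d = end - start
extend = max(extend, (expand * d - d) // 2)
print(start - extend, end + extend)  # 950.0 1150.0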
Example No. 22
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--min-overlap",
                      dest="min_overlap",
                      type="int",
                      help="minimum overlap")

    parser.add_option(
        "-w",
        "--pattern-window",
        dest="pattern_window",
        type="string",
        help="regular expression to extract window coordinates from "
        "test id [%default]")

    parser.add_option("-i",
                      "--invert",
                      dest="invert",
                      action="store_true",
                      help="invert direction of fold change [%default]")

    parser.set_defaults(min_overlap=10,
                        invert=False,
                        pattern_window=r"(\S+):(\d+)-(\d+)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    outfiles = IOTools.FilePool(options.output_filename_pattern)

    if options.invert:
        test_f = lambda l2fold: l2fold < 0
    else:
        test_f = lambda l2fold: l2fold > 0

    def read():

        rx_window = re.compile(options.pattern_window)
        # filter any of the DESeq/EdgeR message that end up at the top of the
        # output file

        for data in IOTools.iterate(options.stdin):

            contig, start, end = rx_window.match(data.test_id).groups()
            start, end = map(int, (start, end))

            yield DATA._make(
                (contig, start, end, data.treatment_name,
                 float(data.treatment_mean),
                 float(data.treatment_std), data.control_name,
                 float(data.control_mean), float(data.control_std),
                 float(data.pvalue), float(data.qvalue), float(data.l2fold),
                 float(data.fold), int(data.significant), data.status, 0))

    def grouper(data, distance=10):

        # `data` is an iterator - use the builtin next() and stop
        # cleanly once it is exhausted
        last = next(data)
        entries = [last]

        while 1:
            d = next(data, None)
            if d is None:
                break
            if d.contig == last.contig and d.start < last.start:
                raise ValueError("error not sorted by start")

            if ((d.contig != last.contig) or (d.start - last.end > distance)
                    or (d.status != last.status)
                    or (d.significant != last.significant)
                    or (d.l2fold * last.l2fold < 0)):
                yield entries
                entries = []

            entries.append(d)
            last = d

        yield entries

    counter = E.Counter()

    options.stdout.write("\t".join(DATA._fields) + "\n")

    # set of all sample names - used to create empty files
    samples = set()

    # need to sort by coordinate
    all_data = list(read())
    all_data.sort(key=lambda x: (x.contig, x.start))

    for group in grouper(iter(all_data), distance=options.min_overlap):

        start, end = group[0].start, group[-1].end
        assert start < end, 'start >= end: %s' % str(group)
        n = float(len(group))
        counter.input += n

        g = group[0]

        if g.l2fold < 0:
            l2fold = max([x.l2fold for x in group])
            fold = max([x.fold for x in group])
        else:
            l2fold = min([x.l2fold for x in group])
            fold = min([x.fold for x in group])

        outdata = DATA._make(
            (g.contig, start, end, g.treatment_name,
             sum([x.treatment_mean for x in group]) / n,
             max([x.treatment_std for x in group]), g.control_name,
             sum([x.control_mean
                  for x in group]) / n, max([x.control_std for x in group]),
             max([x.pvalue for x in group]), max([x.qvalue for x in group]),
             l2fold, fold, g.significant, g.status, int(n)))

        samples.add(g.treatment_name)
        samples.add(g.control_name)
        if g.significant:
            if test_f(g.l2fold):
                # fold change is in the selected direction -
                # write the interval to the treatment file
                outfiles.write(
                    g.treatment_name, "%s\t%i\t%i\t%s\t%f\n" %
                    (g.contig, g.start, g.end, g.treatment_name,
                     sum([x.treatment_mean for x in group]) / n))

            else:
                outfiles.write(
                    g.control_name, "%s\t%i\t%i\t%s\t%f\n" %
                    (g.contig, g.start, g.end, g.control_name,
                     sum([x.control_mean for x in group]) / n))

        options.stdout.write("\t".join(map(str, outdata)) + "\n")

        counter.output += 1

    for sample in samples:
        outfiles.write(sample, "")

    outfiles.close()
    E.info("%s" % counter)

    # write footer and output benchmark information.
    E.Stop()
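
A toy illustration of the grouping rule in grouper() above. Window is a hypothetical stand-in for the DATA namedtuple, carrying only the fields the rule touches: windows on the same contig within `distance` bases, with the same status and significance and the same sign of l2fold, fall into one group.

import collections

Window = collections.namedtuple(
    "Window", "contig start end status significant l2fold")

def group_windows(windows, distance=10):
    # assumes at least one window; sorts by (contig, start) first
    windows = sorted(windows, key=lambda x: (x.contig, x.start))
    group = [windows[0]]
    for w in windows[1:]:
        last = group[-1]
        if (w.contig != last.contig or w.start - last.end > distance
                or w.status != last.status
                or w.significant != last.significant
                or w.l2fold * last.l2fold < 0):
            yield group
            group = []
        group.append(w)
    yield group

wins = [Window("chr1", 0, 100, "OK", 1, 1.5),
        Window("chr1", 105, 200, "OK", 1, 2.0),   # within 10 bp: merged
        Window("chr1", 300, 400, "OK", 1, -1.0)]  # gap + sign flip: new group
assert [len(g) for g in group_windows(wins)] == [2, 1]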
Exemplo n.º 23
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('reconcile', 'merge'),
                      help="method to apply [default=%default].")

    parser.add_option("-c", "--chop", dest="chop", action="store_true",
                      help="whether or not to trim last character of "
                      "sequence name. For example sometimes ids in the first "
                      "file in the pair will end with \1 and the second "
                      "with \2. If --chop is not specified "
                      "then the results will be wrong [default=%default].")
    parser.add_option("-u", "--unpaired", dest="unpaired", action="store_true",
                      help="whether or not to write out unpaired reads "
                      "to a seperate file")

    parser.add_option("-o", "--output-pattern",
                      dest="output_pattern", type="string",
                      help="pattern for output files [default=%default].")

    parser.set_defaults(
        method="reconcile",
        chop=False,
        unpaired=False,
        output_pattern="%s.fastq.gz",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply exactly two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()

    if options.method == "reconcile":

        def getIds(infile):
            '''return ids in infile.'''
            aread = infile.readline
            while True:
                l = [aread().rstrip("\r\n") for i in range(4)]
                if not l[0]:
                    break
                r = l[0].split()[0]
                # decide if to chop read number off
                if options.chop:
                    yield r[:-1]
                else:
                    yield r

        def write(outfile, infile, take, unpaired_file=None):
            '''filter fastq files with ids in take.'''
            aread = infile.readline
            while True:
                l = [aread().rstrip("\r\n") for i in range(4)]
                if not l[0]:
                    break
                r = l[0].split()[0]
                if options.chop:
                    r = r[:-1]
                if r not in take:
                    if unpaired_file is None:
                        continue
                    else:
                        unpaired_file.write("\n".join(l) + "\n")
                else:
                    outfile.write("\n".join(l) + "\n")

        E.info("reading first in pair")
        inf1 = IOTools.openFile(fn1)
        ids1 = set(getIds(inf1))

        E.info("reading second in pair")
        inf2 = IOTools.openFile(fn2)
        ids2 = set(getIds(inf2))

        take = ids1.intersection(ids2)

        E.info("first pair: %i reads, second pair: %i reads, "
               "shared: %i reads" %
               (len(ids1),
                len(ids2),
                len(take)))

        if options.unpaired:
            unpaired_file = IOTools.openFile(
                options.output_pattern % "unpaired", "w")
        else:
            unpaired_file = None

        with IOTools.openFile(options.output_pattern % "1", "w") as outf:
            inf = IOTools.openFile(fn1)
            E.info("writing first in pair")
            write(outf, inf, take, unpaired_file)

        with IOTools.openFile(options.output_pattern % "2", "w") as outf:
            inf = IOTools.openFile(fn2)
            E.info("writing second in pair")
            write(outf, inf, take, unpaired_file)

        if options.unpaired:
            unpaired_file.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
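
A minimal sketch of the id-reconciliation idea above on in-memory records (read_ids is a hypothetical helper; gzip and file handling, done by IOTools in the real script, are left out). With chop=True the trailing read-number character is removed so that mates compare equal:

def read_ids(lines, chop=False):
    '''yield the id from every 4-line fastq record.'''
    for i in range(0, len(lines), 4):
        rid = lines[i].split()[0]
        yield rid[:-1] if chop else rid

r1 = ["@read1/1", "ACGT", "+", "IIII",
      "@read2/1", "ACGT", "+", "IIII"]
r2 = ["@read2/2", "TTTT", "+", "IIII"]

shared = set(read_ids(r1, chop=True)) & set(read_ids(r2, chop=True))
assert shared == {"@read2/"}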
Exemplo n.º 24
0
def findTATABox(infiles, outfile):
    '''find TATA box in promotors. There are several matrices to choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box
    '''

    # 1. create fasta file - look for TATA box
    #
    bedfile, genomefile = infiles

    statement = '''
    slopBed -i %(bedfile)s
            -l %(tata_search_upstream)i
            -r %(tata_search_downstream)i
            -s
            -g %(genomefile)s
    | python %(scriptsdir)s/bed2fasta.py 
       --use-strand
       --genome=%(genome_dir)s/%(genome)s
       --log=%(outfile)s.log
    > %(outfile)s.fasta
    '''

    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    # write a profile (derived from minFP_good.prf) next to the output file
    match_profile = outfile + ".prf"

    prf = '''tata.prf
prf to minimize sum of both errors - derived from minSUM.prf
 MIN_LENGTH 300
0.0
 1.000 0.716 0.780 M00216 V$TATA_C
 1.000 0.738 0.856 M00252 V$TATA_01
 1.000 0.717 0.934 M00311 V$ATATA_B
 1.000 0.711 0.784 M00320 V$MTATA_B
//
'''

    with IOTools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = '''
         %(match_executable)s
         %(match_matrix)s
         %(outfile)s.fasta
         %(outfile)s.match
         %(match_profile)s
         -u
    >> %(outfile)s.log
    '''
    P.run()

    transcript2pos = {}
    for entry in FastaIterator.iterate(IOTools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            r"(\S+)\s+(\S+):(\d+)\.\.(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH",
        "pid transfac_id pos strand core_similarity matrix_similarity sequence"
    )

    def _grouper(infile):
        r = []
        pid = None
        keep = False
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                if r:
                    yield pid, r
                r = []
                pid = re.match(r"Inspecting sequence ID\s+(\S+)",
                               line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            if not keep:
                continue
            if not line[:-1].strip():
                continue
            transfac_id, v, core_similarity, matrix_similarity, sequence = [
                x.strip() for x in line[:-1].split("|")
            ]
            pos, strand = re.match("(\d+) \((\S)\)", v).groups()
            r.append(
                MATCH._make((pid, transfac_id, int(pos), strand,
                             float(core_similarity), float(matrix_similarity),
                             sequence)))

        if pid is not None:
            yield pid, r

    offset = PARAMS["tata_search_upstream"]

    outf = IOTools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand", "start", "end",
                          "relative_start", "relative_end", "transfac_id",
                          "core_similarity", "matrix_similarity",
                          "sequence")) + "\n")

    bedf = IOTools.openFile(outfile, "w")

    c = E.Counter()
    found = set()
    for transcript_id, matches in _grouper(IOTools.openFile(outfile +
                                                            ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(
                map(str, (transcript_id, strand, genome_start, genome_end,
                          relative_start, relative_end, match.transfac_id,
                          match.core_similarity, match.matrix_similarity,
                          match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(
                map(str, (contig, genome_start, genome_end, transcript_id,
                          strand, match.matrix_similarity))) + "\n")

        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
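
A sketch of the match-to-genome coordinate conversion above, assuming, as the code does, that the match position is an offset into the extracted strand-oriented promotor sequence and `offset` is the upstream search distance (to_genome is a hypothetical helper):

def to_genome(seq_start, seq_end, strand, pos, length, offset):
    '''map a match at `pos` of the given `length` back to genome
    coordinates and to coordinates relative to the TSS.'''
    if strand == "+":
        genome_start = seq_start + pos
        relative_start = genome_start - (seq_start + offset)
    else:
        genome_start = seq_end - pos - length
        relative_start = (seq_end - offset) - (genome_start + length)
    return genome_start, genome_start + length, relative_start

# a 10 bp hit 5 bases into a + strand sequence searched 100 bp upstream
assert to_genome(1000, 1400, "+", 5, 10, 100) == (1005, 1015, -95)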
Exemplo n.º 25
0
def annotateGREATDomains(iterator, fasta, options):
    """build great domains

    extend from TSS a basal region.

    """

    gene_iterator = GTF.gene_iterator(iterator)

    counter = E.Counter()

    upstream, downstream = options.upstream, options.downstream
    radius = options.radius
    outfile = options.stdout

    regions = []
    ####################################################################
    # define basal regions for each gene
    # take all basal regions per transcript and merge them
    # Thus, the basal region of a gene might be larger than the sum
    # of options.upstream + options.downstream
    for gene in gene_iterator:
        counter.genes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)

        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        transcript_ids = []

        # collect every basal region per transcript
        for transcript in gene:
            counter.transcripts += 1
            mi, ma = min([x.start for x in transcript]), max(
                [x.end for x in transcript])
            # add range to both sides of tss
            if is_negative_strand:
                interval = ma - options.downstream, ma + options.upstream
            else:
                interval = mi - options.upstream, mi + options.downstream

            interval = (min(lcontig, max(0, interval[0])),
                        min(lcontig, max(0, interval[1])))

            regulons.append(interval)
            transcript_ids.append(transcript[0].transcript_id)

        # take first/last entry
        start, end = min(x[0] for x in regulons), max(x[1] for x in regulons)

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "greatdomain"
        gtf.start, gtf.end = start, end
        regions.append(gtf)

    regions.sort(key=lambda x: (x.contig, x.start))

    outf = IOTools.openFile("test.gff", "w")
    for x in regions:
        outf.write(str(x) + "\n")
    outf.close()

    ####################################################################
    # extend basal regions
    regions.sort(key=lambda x: (x.contig, x.start))

    # iterate within groups of overlapping basal regions
    groups = list(GTF.iterator_overlaps(iter(regions)))
    counter.groups = len(groups)

    last_end = 0
    reset = False

    for region_id, group in enumerate(groups):

        # collect basal intervals in group
        intervals = [(x.start, x.end) for x in group]

        def overlapsBasalRegion(pos):
            for start, end in intervals:
                if start == pos or end == pos:
                    continue
                if start <= pos < end:
                    return True
                if start > pos:
                    return False
            return False

        # deal with boundary cases - end of contig
        if region_id < len(groups) - 1:
            nxt = groups[region_id + 1]
            if nxt[0].contig == group[0].contig:
                next_start = min([x.start for x in nxt])
            else:
                next_start = fasta.getLength(group[0].contig)
                reset = True
        else:
            next_start = fasta.getLength(group[0].contig)
            reset = True

        # last_end = basal extension of previous group
        # next_start = basal_extension of next group

        # extend region to previous/next group: always extend
        # downstream, but extend upstream only if the basal region of an
        # interval does not already overlap another basal region
        # within the group
        save_end = 0
        for gtf in group:
            save_end = max(save_end, gtf.end)
            if gtf.strand == "+":
                if not overlapsBasalRegion(gtf.start):
                    gtf.start = max(gtf.start - radius, last_end)
                # always extend downstream
                gtf.end = min(gtf.end + radius, next_start)
            else:
                # always extend downstream
                gtf.start = max(gtf.start - radius, last_end)
                if not overlapsBasalRegion(gtf.end):
                    gtf.end = min(gtf.end + radius, next_start)
            outfile.write(str(gtf) + "\n")
            counter.regulons += 1

        if len(group) > 1:
            counter.overlaps += len(group)
        else:
            counter.nonoverlaps += 1

        if reset:
            last_end = 0
            reset = False
        else:
            last_end = save_end

    E.info("%s" % str(counter))
Exemplo n.º 26
0
def buildPolyphenInput(infiles, outfile):
    '''build polyphen input file.

    SNPS across all species are aggregated into a single
    file to avoid multiple submissions for the same variant.

    Mapping to Uniprot ids was not successful - 40% of the
    SNPs would have been lost. Hence I map to Ensembl protein
    identifiers. Note that the sequence file then needs to be
    submitted to POLYPHEN as well.

    Note that this method outputs 1-based coordinates for polyphen,
    while the coordinates in the .map file are still 0-based.

    SNPs are assigned a snp_id and a locus_id. The snp_id refers
    to the SNP within a peptide sequence while the locus_id refers
    to the genomic location. If there are alternative
    transcripts overlapping a SNP, the same SNP will get two
    snp_ids, but the same locus_id. As the peptide background might
    be different for the same SNP depending on the transcript,
    its effect needs to be predicted twice.
    '''

    statement = '''SELECT
        transcript_id,
        cds_start,
        cds_end,
        orig_codons,
        variant_codons,
        orig_na,
        variant_na,
        contig,
        snp_position
    FROM %(table)s_cds
    WHERE variant_code = '=' AND code = 'N'
    '''

    dbhandle = connect()
    cc = dbhandle.cursor()

    infiles.sort()

    # ensembl mapping
    map_transcript2id = dict(
        cc.execute("SELECT transcript_id, protein_id "
                   "FROM annotations.transcript_info "
                   "WHERE protein_id IS NOT NULL").fetchall())

    total_counts = E.Counter()
    notfound, found = set(), set()

    outf_map = open(outfile + ".map", "w")
    outf_map.write(
        "snp_id\ttrack\ttranscript_id\tprotein_id\tprotein_pos\tlocus_id\tcontig\tpos\tphase\n")

    outf = open(outfile, "w")

    snps = {}
    locus_ids = {}

    for infile in infiles:

        table = P.toTable(infile)
        track = table[:-len("_effects")]
        E.debug(statement % locals())
        cc.execute(statement % locals())

        counts = E.Counter()

        snp_id = 0
        for transcript_id, cds_start, cds_end, orig_codons, variant_codons, orig_na, variant_na, contig, pos in cc:

            counts.input += 1

            if transcript_id not in map_transcript2id:
                notfound.add(transcript_id)
                counts.not_found += 1
                continue

            if "," in variant_codons:
                counts.heterozygous += 1
                continue

            for phase in range(0, 3):
                if orig_na[phase].lower() != variant_na[phase].lower():
                    break

            pid = map_transcript2id[transcript_id]
            # one-based coordinates
            peptide_pos = int(math.floor(cds_start / 3.0)) + 1
            key = "%s-%i-%s" % (pid, peptide_pos, variant_codons)

            if key in snps:
                snp_id = snps[key]
            else:
                snp_id = len(snps)
                snps[key] = snp_id
                outf.write("snp%010i\t%s\t%i\t%s\t%s\n" %
                           (snp_id,
                            pid,
                            peptide_pos,
                            orig_codons,
                            variant_codons,
                            ))
                counts.output += 1

            locus_key = "%s-%i-%s" % (contig, pos, variant_codons)
            if locus_key not in locus_ids:
                locus_ids[locus_key] = len(locus_ids)

            # use 0-based coordinates throughout, including peptide pos
            outf_map.write("snp%010i\t%s\t%s\t%s\t%i\tloc%010i\t%s\t%i\t%i\n" %
                           (snp_id,
                            track,
                            transcript_id,
                            pid,
                            peptide_pos - 1,
                            locus_ids[locus_key],
                            contig,
                            pos,
                            phase))

            found.add(transcript_id)

        total_counts += counts

        E.info("%s: %s" % (table, str(counts)))

    outf.close()
    outf_map.close()

    E.info("%s: transcripts: %s found, %i not found" % (table,
                                                        len(found),
                                                        len(notfound)))

    E.info("total=%s, snp_ids=%i, locus_ids=%i" %
           (str(total_counts), len(snps), len(locus_ids)))
    if notfound:
        E.warn("%i transcripts had SNPs that were ignored because "
               "there was no Ensembl protein identifier" %
               len(notfound))
        E.warn("notfound: %s" % ",".join(notfound))

    statement = '''sort -k2,2 -k3,3n %(outfile)s > %(outfile)s.tmp; mv %(outfile)s.tmp %(outfile)s'''

    P.run()
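
A sketch of the CDS-to-peptide coordinate conversion used above: cds_start is a 0-based nucleotide position within the coding sequence, while polyphen expects a 1-based residue number.

import math

def peptide_position(cds_start):
    '''convert a 0-based CDS position to a 1-based peptide position.'''
    return int(math.floor(cds_start / 3.0)) + 1

assert peptide_position(0) == 1   # first base of the first codon
assert peptide_position(2) == 1   # still within the first codon
assert peptide_position(3) == 2   # second codon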
Exemplo n.º 27
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--exclusive-overlap", dest="exclusive",
        action="store_true",
        help="Intervals reported will be merged across the "
        "positive set and do not overlap any interval in any of the "
        "other sets [default=%default].")

    parser.add_option(
        "-p", "--pattern-identifier", dest="pattern_id", type="string",
        help="pattern to convert a filename "
        "to an id [default=%default].")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("merged-combinations",
                 "unmerged-combinations"),
        help="method to perform [default=%default]")

    parser.set_defaults(
        pattern_id="(.*).bed.gz",
        exclusive=False,
        method="merged-combinations",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    tags, bedfiles = [], []
    for infile in args:
        bedfiles.append(pysam.Tabixfile(infile, "r"))
        tags.append(re.search(options.pattern_id, infile).groups()[0])

    indices = list(range(len(bedfiles)))
    is_exclusive = options.exclusive

    if options.method == "merged-combinations":

        if is_exclusive:
            start = 1
        else:
            start = 2

        options.stdout.write("combination\twithout\tcounts\n")

        for ncombinants in range(start, len(bedfiles) + 1):
            for combination in itertools.combinations(indices, ncombinants):
                other = [x for x in indices if x not in combination]
                tag = ":".join([tags[x] for x in combination])
                E.debug("combination %s started" % tag)
                E.debug("other: %s" % ":".join([tags[x] for x in other]))

                other_bed = [bedfiles[x] for x in other]
                outf = IOTools.openFile(
                    E.getOutputFile(tag), "w", create_dir=True)
                c = E.Counter()
                for contig, start, end in combineMergedIntervals(
                        [bedfiles[x] for x in combination]):
                    c.found += 1
                    if is_exclusive and isContainedInOne(contig,
                                                         start,
                                                         end,
                                                         other_bed):
                        c.removed += 1
                        continue
                    c.output += 1
                    outf.write("%s\t%i\t%i\n" % (contig, start, end))

                outf.close()
                E.info("combination %s finished: %s" % (tag, c))

                options.stdout.write("%s\t%s\t%i\n" % (
                    ":".join([tags[x] for x in combination]),
                    ":".join([tags[x] for x in other]),
                    c.output))

    elif options.method == "unmerged-combinations":
        options.stdout.write("track\tcombination\twithout\tcounts\n")

        for foreground in indices:

            background = [x for x in indices if x != foreground]
            for ncombinants in range(0, len(background) + 1):
                for combination in itertools.combinations(background,
                                                          ncombinants):
                    other = [x for x in background if x not in combination]
                    combination_bed = [bedfiles[x] for x in combination]
                    other_bed = [bedfiles[x] for x in other]
                    tag = ":".join([tags[foreground]] + [tags[x]
                                                         for x in combination])

                    E.debug("fg=%i, combination=%s, other=%s" %
                            (foreground, combination, other))
                    E.debug("combination %s started" % tag)
                    E.debug("other: %s" % ":".join([tags[x] for x in other]))

                    outf = IOTools.openFile(
                        E.getOutputFile(tag), "w", create_dir=True)
                    c = E.Counter()
                    for bed in combineUnmergedIntervals(
                            bedfiles[foreground],
                            combination_bed):
                        c.found += 1
                        if is_exclusive and isContainedInOne(bed.contig,
                                                             bed.start,
                                                             bed.end,
                                                             other_bed):
                            c.removed += 1
                            continue
                        c.output += 1
                        outf.write("%s\n" % str(bed))

                    outf.close()
                    E.info("combination %s finished: %s" % (tag, c))

                    options.stdout.write("%s\t%s\t%s\t%i\n" % (
                        tags[foreground],
                        ":".join([tags[x] for x in combination]),
                        ":".join([tags[x] for x in other]),
                        c.output))

    E.Stop()
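
A toy run of the combination enumeration above for three tracks (the tags are made up). In "merged-combinations" mode without --exclusive-overlap, enumeration starts at pairs, and each combination is reported together with the left-out tracks:

import itertools

tags = ["setA", "setB", "setC"]
indices = range(len(tags))

combos = [(":".join(tags[x] for x in combination),
           ":".join(tags[x] for x in indices if x not in combination))
          for n in range(2, len(tags) + 1)
          for combination in itertools.combinations(indices, n)]

assert combos == [("setA:setB", "setC"), ("setA:setC", "setB"),
                  ("setB:setC", "setA"), ("setA:setB:setC", "")]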
Exemplo n.º 28
0
def getRefSeqFromUCSC(dbhandle, outfile, remove_duplicates=False):
    '''get refseq gene set from UCSC database and save as :term:`gtf`
    formatted file.

    Matches to ``chr_random`` are ignored (as does ENSEMBL).

    Note that this approach does not work as a gene set, as refseq
    maps are not real gene builds and unalignable parts cause
    differences that are not reconcilable.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`gtf` format. The filename
       aims to be close to the ENSEMBL gtf format.
    remove_duplicates : bool
       If True, duplicate mappings are removed.

    '''

    duplicates = set()

    if remove_duplicates:
        cc = dbhandle.cursor()
        cc.execute("""SELECT name, COUNT(*) AS c FROM refGene
        WHERE chrom NOT LIKE '%_random'
        GROUP BY name HAVING c > 1""")
        duplicates = set([x[0] for x in cc.fetchall()])
        E.info("removing %i duplicates" % len(duplicates))

    # these are forward strand coordinates
    statement = '''
    SELECT gene.name, link.geneName, link.name, gene.name2, product,
    protAcc, chrom, strand, cdsStart, cdsEnd,
    exonCount, exonStarts, exonEnds, exonFrames
    FROM refGene as gene, refLink as link
    WHERE gene.name = link.mrnaAcc
    AND chrom NOT LIKE '%_random'
    ORDER by chrom, cdsStart
    '''

    outf = IOTools.openFile(outfile, "w")

    cc = dbhandle.cursor()
    cc.execute(statement)

    SQLResult = collections.namedtuple(
        'Result',
        '''transcript_id, gene_id, gene_name, gene_id2, description,
        protein_id, contig, strand, start, end,
        nexons, starts, ends, frames''')

    counts = E.Counter()
    counts.duplicates = len(duplicates)

    for r in map(SQLResult._make, cc.fetchall()):

        if r.transcript_id in duplicates:
            continue

        starts = map(int, r.starts.split(",")[:-1])
        ends = map(int, r.ends.split(",")[:-1])
        frames = map(int, r.frames.split(",")[:-1])

        gtf = GTF.Entry()
        gtf.contig = r.contig
        gtf.source = "protein_coding"
        gtf.strand = r.strand
        gtf.gene_id = r.gene_id
        gtf.transcript_id = r.transcript_id
        gtf.addAttribute("protein_id", r.protein_id)
        gtf.addAttribute("transcript_name", r.transcript_id)
        gtf.addAttribute("gene_name", r.gene_name)

        assert len(starts) == len(ends) == len(frames)

        if gtf.strand == "-":
            starts.reverse()
            ends.reverse()
            frames.reverse()

        counts.transcripts += 1
        i = 0
        for start, end, frame in zip(starts, ends, frames):
            gtf.feature = "exon"
            counts.exons += 1
            i += 1
            gtf.addAttribute("exon_number", i)
            # frame of utr exons is set to -1 in UCSC
            gtf.start, gtf.end, gtf.frame = start, end, "."
            outf.write("%s\n" % str(gtf))

            cds_start, cds_end = max(r.start, start), min(r.end, end)
            if cds_start >= cds_end:
                # UTR exons have no CDS
                # do not expect any in UCSC
                continue
            gtf.feature = "CDS"
            # invert the frame
            frame = (3 - frame % 3) % 3
            gtf.start, gtf.end, gtf.frame = cds_start, cds_end, frame
            outf.write("%s\n" % str(gtf))

    outf.close()

    E.info("%s" % str(counts))
Exemplo n.º 29
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--source",
                      dest="source_directory",
                      type="string",
                      default=False,
                      help="The directory in which data"
                      "files are held [%default]")

    parser.add_option("-d",
                      "--dest",
                      dest="dest_directory",
                      type="string",
                      default=False,
                      help="The directory in which links"
                      "are created [%default]")

    parser.set_defaults(source_directory=None, dest_directory=".")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read a map of input files to links with sanity checks
    map_filename2link = {}
    links = set()
    for line in options.stdin:
        if line.startswith("#"):
            continue

        # ignore header
        if line.startswith("source"):
            continue

        filename, link = line[:-1].split()[:2]
        if filename in map_filename2link:
            raise ValueError("duplicate filename '%s' " % filename)
        if link in links:
            raise ValueError("duplicate link '%s' " % link)
        map_filename2link[filename] = link
        links.add(link)

    counter = E.Counter()
    counter.input = len(map_filename2link)

    def _createLink(src, dest, counter):
        src = os.path.abspath(src)
        dest = os.path.abspath(os.path.join(options.dest_directory, dest))
        if os.path.exists(dest):
            E.warn("existing symlink %s" % dest)
            counter.link_exists += 1
        elif not os.path.exists(src):
            counter.file_not_found += 1
            E.warn("did not find %s" % src)
        else:
            try:
                os.symlink(src, dest)
                counter.success += 1
            except OSError as msg:
                E.warn("could not create link %s: %s" % (dest, msg))
                counter.failed += 1

    if not options.source_directory:
        # no source directory given, filenames must have complete path
        for filename, link in list(map_filename2link.items()):
            _createLink(filename, link, counter)
    else:
        # walk through directory hierchy and create links
        # for files matching filenames in map_filename2link
        found = set()
        for dirName, subdirList, fileList in os.walk(options.source_directory):
            for f in fileList:
                if f in map_filename2link:
                    if f in found:
                        E.warn("found multiple files with "
                               "the same name %s" % f)
                    else:
                        _createLink(os.path.join(dirName, f),
                                    map_filename2link[f], counter)
                        found.add(f)
                else:
                    E.info("Filename %s not in map" % f)

        notfound = set(map_filename2link.keys()).difference(found)
        counter.notfound = len(notfound)
        if notfound:
            E.warn("did not find %i files: %s" %
                   (len(notfound), str(notfound)))

    E.info(counter)
    # write footer and output benchmark information
    E.Stop()
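
A minimal sketch of the map format read from stdin above (read_link_map is a hypothetical helper): two whitespace-separated columns, with '#' comments and a 'source' header skipped, and duplicates on either column rejected.

def read_link_map(lines):
    '''return a {filename: link} mapping with sanity checks.'''
    mapping, links = {}, set()
    for line in lines:
        if line.startswith("#") or line.startswith("source"):
            continue
        filename, link = line.split()[:2]
        if filename in mapping:
            raise ValueError("duplicate filename '%s'" % filename)
        if link in links:
            raise ValueError("duplicate link '%s'" % link)
        mapping[filename] = link
        links.add(link)
    return mapping

assert read_link_map(["# comment", "source dest",
                      "a.fastq.gz sampleA.fastq.gz"]) == {
                          "a.fastq.gz": "sampleA.fastq.gz"}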
Exemplo n.º 30
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--directory",
                      dest="directory",
                      type="string",
                      help="supply help")

    parser.set_defaults(
        directory=".",
        # using Andreas' repository in order to delay
        # changes to main repository in /ifs/devel/cgat
        basename="/ifs/devel/andreas/cgat/",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # collect a list of python modules
    module_names = [
        os.path.basename(x)[:-3]
        for x in glob.glob(options.basename + "CGAT/*.py")
    ]
    pipeline_names = [
        os.path.basename(x)[:-3]
        for x in glob.glob(options.basename + "CGATPipelines/Pipeline*.py")
    ]

    if options.directory == "CGAT":
        files_to_update = glob.glob( os.path.join( basename, "scripts/*.py" )) +\
            glob.glob( os.path.join( options.basename, "scripts/*.pyx" )) +\
            glob.glob( os.path.join( options.basename, "CGATPipelines/pipeline_*.py" ) )+\
            glob.glob( os.path.join( options.basename, "CGATPipelines/Pipeline*.py" ) )
    else:
        files_to_update = glob.glob(os.path.join(options.directory, "*.py"))

    E.info("updating %i python scripts/modules" % len(files_to_update))

    counter = E.Counter()

    for script in files_to_update:

        counter.input += 1
        print "working on", script

        inf = open(script)
        lines = inf.readlines()
        inf.close()

        # create a backup copy
        shutil.move(script, script + ".bak")

        outf = open(script, "w")
        updated = False
        for line in lines:
            if re.match("import ", line):
                if " as " in line:
                    try:
                        module, name = re.match("import (\S+) as (\S+)\s*$",
                                                line).groups()
                    except AttributeError as msg:
                        raise AttributeError(
                            "parsing error in line '%s': '%s'" %
                            (line[:-1], msg))
                    if module in module_names:
                        line = "import CGAT.%s as %s\n" % (module, name)
                        updated = True
                else:
                    try:
                        modules = re.match("import (.+)", line).groups()[0]
                    except AttributeError as msg:
                        raise AttributeError(
                            "parsing error in line '%s': '%s'" %
                            (line[:-1], msg))
                    modules = [x.strip() for x in modules.split(",")]
                    for module in modules:
                        if module in module_names:
                            outf.write("import CGAT.%s as %s\n" %
                                       (module, module))
                            updated = True
                        elif module in pipeline_names:
                            outf.write("import CGATPipelines.%s as %s\n" %
                                       (module, module))
                            updated = True
                        else:
                            outf.write("import %s\n" % module)
                    continue

            outf.write(line)
        outf.close()

        if updated:
            counter.updated += 1

    E.info("summary: %s" % str(counter))

    # write footer and output benchmark information.
    E.Stop()
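
A toy version of the import rewriting above (MODULES is a made-up stand-in for the module list collected from the repository): plain `import X` lines become package-qualified imports when X is a known module, everything else passes through unchanged.

import re

MODULES = {"Bed", "GTF"}

def rewrite(line):
    '''rewrite a plain import of a known module as a CGAT import.'''
    m = re.match(r"import (\S+)\s*$", line)
    if m and m.group(1) in MODULES:
        return "import CGAT.%s as %s\n" % (m.group(1), m.group(1))
    return line

assert rewrite("import Bed\n") == "import CGAT.Bed as Bed\n"
assert rewrite("import sys\n") == "import sys\n"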