예제 #1
0
def main(args):
    """Rename iSSAKE contigs and write a FASTA plus a tab-delimited summary.

    ``args.fasta_in`` is read twice: first to total the ``reads`` field over
    all records, then to write every record to ``args.fasta_out`` under a
    short ``contig_<n>`` id (sequence upper-cased) and one metadata row per
    record to ``args.meta``.
    """
    # "|"-delimited fields carried in each iSSAKE record name
    fields = "contig_id length reads avg_coverage seed v_region j_region".split()
    # columns of the metadata table, in output order
    out = "id v_region j_region length reads avg_coverage percent_of_total sequence".split()

    def parse_name(raw):
        # drop the labels embedded in the name, then split it into fields
        for label in ("size", "cov", "read", "seed:"):
            raw = raw.replace(label, "")
        return dict(zip(fields, raw.split("|")))

    # first pass: total reads used in the assembly (float, so the percentage
    # below is a true division)
    total = 0.
    with nopen(args.fasta_in) as fasta:
        for name, seq in read_fasta(fasta):
            total += int(parse_name(name)['reads'])

    # second pass: emit the renamed records and their metadata
    with nopen(args.fasta_in) as fasta,\
            open(args.fasta_out, 'wb') as fasta_out,\
            open(args.meta, 'wb') as meta:
        meta.write("\t".join(out) + "\n")
        for i, (name, seq) in enumerate(read_fasta(fasta)):
            record = parse_name(name)
            # shortened, sequential read name
            record['id'] = "contig_%d" % i
            record['percent_of_total'] = "%.6g" % (100 * (int(record['reads']) / total))
            record['sequence'] = seq
            meta.write("\t".join(str(record[col]) for col in out) + "\n")
            write_fasta(fasta_out, record['id'], seq.upper())
예제 #2
0
def main(prefix, name_re, min_samples, methylation_files):
    """Merge per-sample methylation files into three probe-by-sample tables.

    Writes ``{prefix}methylation.txt.gz``, ``{prefix}methylated.txt.gz`` and
    ``{prefix}counts.txt.gz``. Each starts with a ``probe`` header row naming
    the samples, followed by one ``chrom:start`` row per record produced by
    ``bed_merge``.

    Args:
        prefix: output path prefix; "." is appended unless it already ends
            with "." or "/".
        name_re: regular expression searched for in each file name; its first
            group is the sample name. The file's basename is used when the
            pattern does not match or defines no group.
        min_samples: rows with fewer than this many values > 0 (after
            ``tryfloat``) are skipped.
        methylation_files: input file paths, one per sample.
    """
    name_re = re.compile(r"%s" % name_re)
    if not prefix.endswith((".", "/")):
        prefix += "."

    def source_from_fname(fname):
        # search() returns None when nothing matches (AttributeError) and
        # groups() is empty when the pattern has no group (IndexError);
        # anything else is a real error and is allowed to propagate.
        try:
            return name_re.search(fname).groups(0)[0]
        except (AttributeError, IndexError):
            return op.basename(fname)

    handles = []
    try:
        for stem in ("methylation", "methylated", "counts"):
            handles.append(nopen('{prefix}{stem}.txt.gz'.format(
                prefix=prefix, stem=stem), 'w'))
        fhm, fhme, fhc = handles

        iterables = [gen_iterable(f, source_from_fname)
                     for f in methylation_files]
        sources = [source_from_fname(f) for f in methylation_files]

        fmt = "{chrom}:{start}\t{vals}\n"
        header = "probe\t%s" % "\t".join(sources) + "\n"
        fhm.write(header)
        fhc.write(header)
        fhme.write(header)
        for chrom, start, end, values, counts, meths in bed_merge(iterables, sources):
            if sum(tryfloat(v) > 0 for v in values) < min_samples:
                continue
            fhm.write(fmt.format(chrom=chrom, start=start,
                                 vals="\t".join(values)))
            fhc.write(fmt.format(chrom=chrom, start=start,
                                 vals="\t".join(counts)))
            fhme.write(fmt.format(chrom=chrom, start=start,
                                  vals="\t".join(meths)))
    finally:
        # always close the outputs so they are flushed even if merging fails
        for fh in handles:
            fh.close()
예제 #3
0
def main(prefix, name_re, min_samples, methylation_files):
    """Merge per-sample methylation files into three probe-by-sample tables.

    Writes ``{prefix}methylation.txt.gz``, ``{prefix}methylated.txt.gz`` and
    ``{prefix}counts.txt.gz``. Each starts with a ``probe`` header row naming
    the samples, followed by one ``chrom:start`` row per record produced by
    ``bed_merge``.

    Args:
        prefix: output path prefix; "." is appended unless it already ends
            with "." or "/".
        name_re: regular expression searched for in each file name; its first
            group is the sample name. The file's basename is used when the
            pattern does not match or defines no group.
        min_samples: rows with fewer than this many values > 0 (after
            ``tryfloat``) are skipped.
        methylation_files: input file paths, one per sample.
    """
    name_re = re.compile(r"%s" % name_re)
    if not prefix.endswith((".", "/")):
        prefix += "."

    def source_from_fname(fname):
        # search() returns None when nothing matches (AttributeError) and
        # groups() is empty when the pattern has no group (IndexError);
        # anything else is a real error and is allowed to propagate.
        try:
            return name_re.search(fname).groups(0)[0]
        except (AttributeError, IndexError):
            return op.basename(fname)

    handles = []
    try:
        for stem in ("methylation", "methylated", "counts"):
            handles.append(nopen('{prefix}{stem}.txt.gz'.format(
                prefix=prefix, stem=stem), 'w'))
        fhm, fhme, fhc = handles

        iterables = [gen_iterable(f, source_from_fname)
                     for f in methylation_files]
        sources = [source_from_fname(f) for f in methylation_files]

        fmt = "{chrom}:{start}\t{vals}\n"
        header = "probe\t%s" % "\t".join(sources) + "\n"
        fhm.write(header)
        fhc.write(header)
        fhme.write(header)
        for chrom, start, end, values, counts, meths in bed_merge(
                iterables, sources):
            if sum(tryfloat(v) > 0 for v in values) < min_samples:
                continue
            fhm.write(fmt.format(chrom=chrom, start=start,
                                 vals="\t".join(values)))
            fhc.write(fmt.format(chrom=chrom, start=start,
                                 vals="\t".join(counts)))
            fhme.write(fmt.format(chrom=chrom, start=start,
                                  vals="\t".join(meths)))
    finally:
        # always close the outputs so they are flushed even if merging fails
        for fh in handles:
            fh.close()
예제 #4
0
def main(args):
    """Pass every read of every input file, with the tag set, to ``print_record``.

    Tag sequences are loaded from the FASTA file ``args.tags`` into a
    name -> sequence dict. Each path in ``args.reads`` is then parsed as
    FASTA or FASTQ (chosen from the path text) and each record's id and
    sequence are handed to ``print_record``. With ``args.verbose`` set,
    progress is reported on stderr.
    """
    tags = {}
    if args.verbose:
        sys.stderr.write(">> reading in tag sequences...\n")
    with nopen(args.tags) as fasta:
        for name, seq in read_fasta(fasta):
            tags[name] = seq
    i = 0
    for fx in args.reads:
        if args.verbose:
            sys.stderr.write(">> processing %s...\n" % op.basename(fx))
        # process either fasta or fastq.
        # ".fa" is a substring of ".fastq", so FASTQ must be ruled out first;
        # otherwise every *.fastq file would be parsed as FASTA.
        is_fasta = ".fastq" not in fx and (".fasta" in fx or ".fa" in fx)
        if is_fasta:
            with nopen(fx) as fa:
                for f_id, f_seq in read_fasta(fa):
                    i += 1
                    if i % 1000000 == 0 and args.verbose:
                        sys.stderr.write(">> processed %d reads...\n" % i)
                    print_record(tags, f_id, f_seq)
        else:
            with nopen(fx) as fq:
                for f_id, f_seq, f_qual in read_fastq(fq):
                    i += 1
                    if i % 1000000 == 0 and args.verbose:
                        sys.stderr.write(">> processed %d reads...\n" % i)
                    print_record(tags, f_id, f_seq)
예제 #5
0
    def _set_structure(self, structure):
        """
        here, we want to intersect the query and subject bed files with the
        structure.bed file and give each set of intervals in query and bed
        that fall within (or have any overlap with) a unique, fake chromosome
        so that all shuffling is within that chromosome.
        in order to do this, we also have to create a fake genome file that
        contains the lengths of those chromosomes.

        Does nothing when `structure` is None or "". Otherwise sets
        `self.chrom` to True, replaces each non-empty attribute among
        query/subject/exclude/include with the path of a rewritten temp
        file, and sets `self.genome_file` to the path of the fake genome
        file.
        """
        if structure in (None, ""): return
        self.chrom = True  # has to be by chromosome.

        # NOTE(review): these two line counts are not used again in this
        # method.
        n_query_before = sum(1 for _ in nopen(self.query))
        n_subject_before = sum(1 for _ in nopen(self.subject))

        new_genome = open(mktemp(suffix='.fake_genome'), 'w')
        # only the first three columns of the structure file are used.
        # NOTE(review): "<(...)" looks like shell process substitution, so
        # the command below presumably has to run under a shell that
        # supports it -- confirm.
        structure = "<(cut -f 1-3 %s)" % structure
        # fake chromosome names already written to the genome file.
        seen_segs = {}
        for bed in ('query', 'subject', 'exclude', 'include'):
            # prefer the "_<bed>" attribute when present, else "<bed>".
            bed_path = getattr(self, "_" + bed, getattr(self, bed))
            if not bed_path: continue
            new_fh = open(mktemp(suffix='%s.fake' % bed), 'w')
            for toks in reader("|bedtools intersect -wo -a %s -b '%s' \
                    | sort -k4,4 -k5,5g" % (structure, bed_path),
                               header=False):
                # first 3 columns: the structure segment; the rest (minus the
                # last column) is the interval from the bed file.
                gtoks, btoks = toks[:3], toks[3:-1]  # drop the bp overlap
                # the fake chromosome is named after the segment coordinates.
                new_chrom = "_".join(gtoks)

                gtoks[1:] = map(int, gtoks[1:])
                btoks[1:3] = map(int, btoks[1:3])

                glen = gtoks[2] - gtoks[1]  # fake chrom length.
                if new_chrom.startswith('chr'): new_chrom = new_chrom[3:]
                if not new_chrom in seen_segs:
                    # save it in the genome file.
                    print >> new_genome, "\t".join((new_chrom, str(glen)))
                seen_segs[new_chrom] = True

                # with partial overlap, we'll have a negative start or an
                # end outside the genome... for now, just truncate.

                # adjust the interval to its location the new chrom.
                btoks[0] = new_chrom
                btoks[1] = max(0,
                               btoks[1] - gtoks[1])  # don't let it go below 0
                # chop to end of fake chrom.
                btoks[2] = min(btoks[2] - gtoks[1], glen - 1)
                assert 0 <= btoks[1] <= btoks[2] < glen
                btoks[1:3] = map(str, btoks[1:3])
                print >> new_fh, "\t".join(btoks)
            new_fh.close()
            # point the attribute at the rewritten file.
            setattr(self, bed, new_fh.name)
        new_genome.close()
        self.genome_file = new_genome.name
예제 #6
0
파일: shuffler.py 프로젝트: brentp/shuffler
    def _set_structure(self, structure):
        """
        Confine shuffling to the segments of a structure BED file.

        Each of the query/subject/exclude/include files that is set is
        intersected with the first three columns of `structure`. Every
        overlapping interval is rewritten onto a fake chromosome named after
        its segment ("chrom_start_end", minus a leading "chr"), with
        coordinates relative to the segment start and truncated to the
        segment. The rewritten files replace the attributes, and a fake
        genome file with the segment lengths is stored in `self.genome_file`.
        Nothing happens when `structure` is None or "".
        """
        if structure in (None, ""):
            return
        # has to be by chromosome.
        self.chrom = True

        n_query_before = sum(1 for _ in nopen(self.query))
        n_subject_before = sum(1 for _ in nopen(self.subject))

        genome_fh = open(mktemp(suffix='.fake_genome'), 'w')
        structure = "<(cut -f 1-3 %s)" % structure
        seen_segs = {}
        for bed in ('query', 'subject', 'exclude', 'include'):
            bed_path = getattr(self, "_" + bed, getattr(self, bed))
            if not bed_path:
                continue
            fake_fh = open(mktemp(suffix='%s.fake' % bed), 'w')
            cmd = "|bedtools intersect -wo -a %s -b '%s' \
                    | sort -k4,4 -k5,5g" % (structure, bed_path)
            for toks in reader(cmd, header=False):
                # segment columns, then the interval without the bp overlap
                seg, iv = toks[:3], toks[3:-1]
                fake_chrom = "_".join(seg)
                if fake_chrom.startswith('chr'):
                    fake_chrom = fake_chrom[3:]

                seg_start, seg_end = int(seg[1]), int(seg[2])
                iv_start, iv_end = int(iv[1]), int(iv[2])

                # length of the fake chromosome
                seg_len = seg_end - seg_start
                if fake_chrom not in seen_segs:
                    # first sighting: record it in the genome file
                    print >>genome_fh, "\t".join((fake_chrom, str(seg_len)))
                    seen_segs[fake_chrom] = True

                # partial overlaps would give a negative start or an end past
                # the fake chromosome, so both are truncated
                rel_start = max(0, iv_start - seg_start)
                rel_end = min(iv_end - seg_start, seg_len - 1)
                assert 0 <= rel_start <= rel_end < seg_len

                iv[0], iv[1], iv[2] = fake_chrom, str(rel_start), str(rel_end)
                print >>fake_fh, "\t".join(iv)
            fake_fh.close()
            setattr(self, bed, fake_fh.name)
        genome_fh.close()
        self.genome_file = genome_fh.name
예제 #7
0
파일: poverlap.py 프로젝트: daler/poverlap
def extend_bed(fin, fout, bases):
    """Write `fin` to `fout` with every interval widened by `bases` in total.

    Half of `bases` is subtracted from the start column and half added to
    the end column; both are floored at 0. A negative `bases` shrinks the
    interval; if that would make start exceed end, both are set to the
    midpoint.

    Args:
        fin: tab-delimited input with start/end in columns 2 and 3.
        fout: output path.
        bases: integer number of bases.

    Returns:
        The name of the written file.
    """
    # we're extending both a.bed and b.bed by this distance
    # so divide by 2. Floor division keeps the coordinates integers on
    # Python 3 as well (plain "/" would write floats such as "10.0").
    bases //= 2
    with nopen(fout, 'w') as fh:
        for toks in (l.rstrip("\r\n").split("\t") for l in nopen(fin)):
            toks[1] = max(0, int(toks[1]) - bases)
            toks[2] = max(0, int(toks[2]) + bases)
            if toks[1] > toks[2]:  # negative distances
                toks[1] = toks[2] = (toks[1] + toks[2]) // 2
            assert toks[1] <= toks[2]
            # write() instead of the Python-2-only "print >>fh" statement
            fh.write("\t".join(map(str, toks)) + "\n")
    return fh.name
예제 #8
0
def convert_reads(fq1s, fq2s, out=sys.stdout):
    """Convert the reads of one or more FASTQ files and write them to `out`.

    Args:
        fq1s: comma-separated list of first-read (or interleaved) files.
        fq2s: comma-separated list of second-read files, paired by position
            with `fq1s`; the literal "NA" means there is no second file.
        out: writable handle passed to ``convert_and_write_read``.

    A file is treated as interleaved when the first space-delimited token of
    its 1st and 5th lines match. A warning is written to stderr when more
    than 50 reads shorter than 80 bases were seen.

    Returns:
        0
    """
    # tally short reads over *all* inputs. Initialising this inside the loop
    # reset the count for every file pair and left the name unbound when
    # there was nothing to iterate.
    lt80 = 0

    for fq1, fq2 in zip(fq1s.split(","), fq2s.split(",")):
        sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2))
        fq1 = nopen(fq1)

        # examine the first five lines to detect an interleaved fastq file,
        # then rewind so those lines are read again below
        first_five = list(islice(fq1, 5))
        fq1.seek(0)

        r1_header = first_five[0]
        r2_header = first_five[-1]
        already_interleaved = (r1_header.split(' ')[0] ==
                               r2_header.split(' ')[0])

        # group the handle into 4-line records
        q1_iter = izip(*[fq1] * 4)

        if fq2 != "NA":
            fq2 = nopen(fq2)
            q2_iter = izip(*[fq2] * 4)
        else:
            if already_interleaved:
                sys.stderr.write("detected interleaved fastq\n")
            else:
                sys.stderr.write(
                    "WARNING: running bwameth in single-end mode\n")
            # placeholder mates; skipped below because their name is None
            q2_iter = repeat((None, None, None, None))

        if already_interleaved:
            selected_iter = q1_iter
        else:
            # alternate read 1, read 2, read 1, ...
            selected_iter = chain(*izip(q1_iter, q2_iter))

        for read_i, (name, seq, _, qual) in enumerate(selected_iter):
            if name is None: continue
            # read_i % 2 tells the converter which mate this is
            convert_and_write_read(name, seq, qual, read_i % 2, out)
            if len(seq) < 80:
                lt80 += 1

    out.flush()
    if lt80 > 50:
        sys.stderr.write("WARNING: %i reads with length < 80\n" % lt80)
        sys.stderr.write("       : this program is designed for long reads\n")
    return 0
예제 #9
0
def extend_bed(fin, fout, bases):
    """Write `fin` to `fout` with every interval widened by `bases` in total.

    Half of `bases` is subtracted from the start column and half added to
    the end column; both are floored at 0. A negative `bases` shrinks the
    interval; if that would make start exceed end, both are set to the
    midpoint.

    Args:
        fin: tab-delimited input with start/end in columns 2 and 3.
        fout: output path.
        bases: integer number of bases.

    Returns:
        The name of the written file.
    """
    # `bedtools slop`

    # we're extending both a.bed and b.bed by this distance
    # so divide by 2. Floor division keeps the coordinates integers on
    # Python 3 as well (plain "/" would write floats such as "10.0").
    bases //= 2
    with nopen(fout, 'w') as fh:
        for toks in (l.rstrip("\r\n").split("\t") for l in nopen(fin)):
            toks[1] = max(0, int(toks[1]) - bases)
            toks[2] = max(0, int(toks[2]) + bases)
            if toks[1] > toks[2]:  # negative distances
                toks[1] = toks[2] = (toks[1] + toks[2]) // 2
            assert toks[1] <= toks[2]
            # write() instead of the Python-2-only "print >> fh" statement
            fh.write("\t".join(map(str, toks)) + "\n")
    return fh.name
예제 #10
0
def intersect(ref, xref, peaks):
    """Intersect `peaks` with `ref` and write numbered sites to a temp BED.

    The output of ``bedtools intersect -wb`` is sorted by chromosome, gene
    and start and handed to ``grouper``. Within each group, records that are
    unique by their ``peak`` column are numbered from 1. "+" strand records
    are written immediately with that number. "-" strand records are held
    back and written afterwards, numbered downwards from the count of "-"
    records.

    Args:
        ref: reference BED path (the ``-b`` file).
        xref: optional cross-reference, converted with ``xref_to_dict`` when
            truthy and passed through to ``get_out``.
        peaks: peaks BED path (the ``-a`` file).

    Returns:
        Path of the temporary ``.bed`` file; the caller must remove it.
    """
    if xref:
        xref = xref_to_dict(xref)
    # group the output by chr->gene->start
    cmd = ("|bedtools intersect -wb -a {peaks} -b {ref} "
                "| sort -k1,1 -k8,8 -k2,2n").format(peaks=peaks, ref=ref)
    cols = ['chrom','start','stop','peak','_chrom',
                '_start','_stop','gene','_score','strand']
    # NamedTemporaryFile hands back an open file object, so no raw OS file
    # descriptor is left open (mkstemp()[1] discarded and leaked it).
    tmp = tempfile.NamedTemporaryFile(mode='wb', suffix=".bed", delete=False)
    try:
        for g in grouper(nopen(cmd), cols):
            negs = []
            for i, l in enumerate(unique_everseen(
                    g, lambda t: ret_item(t, cols, 'peak')), start=1):
                l = lparser(l, cols)
                # negative stranded sites
                if l['strand'] == "-":
                    # need to count through them up, saving l each time
                    negs.append(l)
                    continue
                # positive stranded sites
                tmp.write("\t".join(get_out(l, i, xref)) + "\n")
            for i, l in izip(count(len(negs), -1), negs):
                tmp.write("\t".join(get_out(l, i, xref)) + "\n")
    finally:
        tmp.close()
    return tmp.name
예제 #11
0
def readfx(fastx):
    """Yield ``(name, sequence, quality)`` for each record of a FASTA/FASTQ file.

    ``name`` is the header text after the leading '>' or '@' up to the first
    space. ``quality`` is None for records that have no '+' section.
    Every line has its last character removed (``l[:-1]``), i.e. a trailing
    newline is assumed.
    """
    with nopen(fastx) as fp:
        # `last` holds a header line read ahead while scanning the previous
        # record, or None when there is none.
        last = None
        while True:
            if not last:
                # skip forward to the next header line
                for l in fp:
                    if l[0] in '>@':
                        last = l[:-1]
                        break
            # no header found: end of file
            if not last: break
            name, seqs, last = last[1:].partition(" ")[0], [], None
            # collect sequence lines until the next marker line
            for l in fp:
                if l[0] in '@+>':
                    last = l[:-1]
                    break
                seqs.append(l[:-1])
            if not last or last[0] != '+':
                # no quality section follows: emit without quality
                yield name, ''.join(seqs), None
                if not last: break
            else:
                # a '+' line was seen: read quality lines until they are at
                # least as long as the sequence
                seq, leng, seqs = ''.join(seqs), 0, []
                for l in fp:
                    seqs.append(l[:-1])
                    leng += len(l) - 1
                    if leng >= len(seq):
                        last = None
                        yield name, seq, ''.join(seqs);
                        break
                if last:
                    # input ended before enough quality was read: emit the
                    # record without quality and stop
                    yield name, seq, None
                    break
예제 #12
0
def run_metric(cmd, metric=None):
    """
    Run `cmd` and reduce its output to a single number with `metric`.

    `metric` is either a shell snippet the command is piped into
    (e.g. "wc -l") or a python callable that consumes lines of input and
    returns a single int or float, e.g.

    def mymetric(fh):
        val = 0
        for line in fh:
            val += float(line.split("\t")[4])
        return val

    Per the original author, the lines sent to the metric function are the
    result of bedtools intersect -wo, so both the -a and -b intervals are
    present in each line.

    When `metric` is None, `cmd` is unpacked as a (cmd, metric) pair.
    """
    if metric is None:
        cmd, metric = cmd

    if not isinstance(metric, basestring):
        # callable metric: feed it the stdout of the running command
        child = nopen("|%s" % cmd, mode=None)
        value = metric(child.stdout)
        check_proc(child, cmd)
        assert isinstance(value, (int, float))
        return value

    # string metric: let the shell do the reduction
    pipeline = "%s | %s" % (cmd, metric)
    return float(run(pipeline))
예제 #13
0
파일: SupportFunc.py 프로젝트: wiw/pyMPFA
def GetTotalSeqRecords(input_file):
    '''
    Count the lines of a FASTQ file and return the number of sequence
    records as an integer (4 lines per record, rounded down).
    '''
    with nopen(input_file) as f:
        line_count = sum(1 for _ in f)
    # floor division: "/" would return a float on Python 3
    return line_count // 4
예제 #14
0
def readccrs(path, gerp, phast, cadd):
    """Print each region of `path` annotated with GERP, phast and CADD values.

    For every row read from `path` three comma-joined columns are added:
    ``gerp`` and ``phast`` (``.values("chr" + chrom, start, end)`` of the
    given objects) and ``cadd`` (one mean score per position returned by
    ``tabix`` on the `cadd` file for the region). The column names are
    printed before the first row.

    Args:
        path: tab-delimited file with chrom/start/end columns.
        gerp, phast: objects exposing ``values(chrom, start, end)``.
        cadd: path of a tabix-indexed CADD file; column 2 is taken as the
            position and column 6 as the score.
    """
    for i, d in enumerate(ts.reader(path, header="ordered")):
        chrom, start, end = "chr" + d['chrom'], int(d['start']), int(d['end'])
        d['gerp'] = ",".join(map(str, gerp.values(chrom, start, end)))
        d['phast'] = ",".join(map(str, phast.values(chrom, start, end)))
        region = d['chrom'] + ":" + d['start'] + "-" + d['end']

        caddvals = []
        current_pos, scores = None, []
        # NOTE(review): the filter tests the *second* character, so it only
        # skips "##" lines -- confirm that is intended.
        for toks in (x.rstrip('\r\n').split("\t")
                     for x in ts.nopen("| tabix " + cadd + " " + region)
                     if x[1] != "#"):  #TODO replace w cyvcf2
            if current_pos is not None and toks[1] != current_pos:
                # position changed: close the previous group
                caddvals.append(np.mean(scores))
                scores = []
            # every line counts towards its own position; previously the
            # first score of each new position was dropped
            scores.append(float(toks[5]))
            current_pos = toks[1]
        if scores:
            # flush the final position, which was previously never emitted
            caddvals.append(np.mean(scores))

        d['cadd'] = ",".join(map(str, caddvals))
        if i == 0:
            print("\t".join(d.keys()))
        print("\t".join(map(str, d.values())))
예제 #15
0
def CollectBarcode(indexFile, barcodeLength, readsValue, barcodeError, const_2,
                   const_2Error, regExpBc, mergeBC, reverseBC):
    """Extract barcodes from a FASTQ file and return them.

    Every read is matched against `regExpBc`. The named group "barcode" is
    kept when its length is within int(0.9 * barcodeLength) and
    int(1.1 * barcodeLength) inclusive and it contains no "N"; it is
    reverse-complemented first when `reverseBC` is true.

    Returns the raw list of barcodes when `mergeBC` is true. Otherwise
    returns the result of ``SelectionReliableBarcode`` on the barcode counts,
    after passing it to ``mainCheckBarcodeInDict`` when it has at most 10**5
    entries. `const_2` and `const_2Error` are not used here.
    """
    bcList = []
    records = supp.GetTotalSeqRecords(indexFile)
    widgets = [progressbar.Bar(left='<', marker='.', right='>')]
    bar = progressbar.ProgressBar(maxval=records, widgets=widgets).start()
    expr = regex.compile(regExpBc)
    with nopen(indexFile) as handle:
        for position, seq_record in enumerate(SeqIO.parse(handle, "fastq")):
            bar.update(float(position))
            match = expr.match(str(seq_record.seq))
            if match is None:
                continue
            barcode = match.group("barcode")
            if not (int(barcodeLength * 0.9) <= len(barcode)
                    <= int(barcodeLength * 1.1)):
                continue
            if "N" in barcode.upper():
                continue
            bcList.append(reverseComplement(barcode) if reverseBC else barcode)
    bar.finish()
    if mergeBC:
        return bcList
    bcDict = SelectionReliableBarcode(Counter(bcList), readsValue, barcodeError)
    if len(bcDict) <= 10**5:
        mainCheckBarcodeInDict(bcDict, barcodeError)
    return bcDict
예제 #16
0
def read_exons(gtf):
    """Collect the coding intervals of protein-coding transcripts from a GTF.

    Only rows whose feature (column 3) is "CDS" or "stop_codon" and whose
    source (column 2) is exactly "protein_coding" are used. Intervals are
    stored half-open with a 0-based start: [start - 1, end).

    Returns:
        A 5-tuple ``(ints, names, ids, trs, totlen)``:
        ints   - dict transcript_id -> (sorted starts, matching ends)
        names  - set of transcript_name values without their last "-" suffix
        ids    - set of gene_id values
        trs    - set of transcript_id values
        totlen - total length of the union of all intervals
    """
    transcripts = defaultdict(pyinter.IntervalSet)
    names = []
    trs, ids = [], []
    for toks in (x.rstrip('\r\n').split("\t") for x in ts.nopen(gtf) if x[0] != "#"):
        # `not in ("protein_coding")` was a substring test against a plain
        # string (no tuple), so an exact comparison is used instead.
        if toks[2] not in ("CDS", "stop_codon") or toks[1] != "protein_coding":
            continue
        start, end = map(int, toks[3:5])
        assert start <= end, toks
        attrs = toks[8]
        transcript = attrs.split('transcript_id "')[1].split('"', 1)[0]
        transcripts[transcript].add(pyinter.closedopen(start - 1, end))

        names.append(attrs.split('transcript_name "')[1].split('"', 1)[0].rsplit("-", 1)[0])
        ids.append(attrs.split('gene_id "')[1].split('"', 1)[0])
        trs.append(transcript)

    # sort by start so we can do binary search.
    # TODO: need to remove overlapping exons so we don't double-count
    ints = {}
    lens = pyinter.IntervalSet()
    for tr, ivset in transcripts.items():
        sends = sorted(ivset)
        iset = pyinter.IntervalSet(
            pyinter.closedopen(x.lower_value, x.upper_value) for x in sends)
        # the union across transcripts is what the total length is based on
        lens = lens.union(iset)
        ints[tr] = ([x.lower_value for x in sends],
                    [x.upper_value for x in sends])
    totlen = sum(x.upper_value - x.lower_value for x in lens)
    return ints, set(names), set(ids), set(trs), totlen
예제 #17
0
def subsample(infiles, outfiles, prob, seed=None):
    """Randomly keep a fraction of the reads of parallel FASTQ files.

    The input files are read in lock-step, four lines per record. Each
    record position is kept with probability `prob`, and kept records are
    written to the output file at the same index as their input file.
    A summary line is printed at the end.

    Args:
        infiles: input FASTQ paths.
        outfiles: output paths, paired by position with `infiles`.
        prob: probability of keeping a record.
        seed: optional seed for ``random``; any value other than None
            (including 0) is used.
    """
    prob = 1 - prob

    # `if seed:` silently ignored a seed of 0
    if seed is not None:
        random.seed(seed)

    in_fh = []
    out_fh = []
    try:
        for path in infiles:
            in_fh.append(nopen(path, 'rb'))
        for path in outfiles:
            out_fh.append(nopen(path, 'wb'))

        # group every handle into 4-line records; the raw handles are kept
        # so they can be closed (zip objects have no close())
        records = [zip(*[fh] * 4) for fh in in_fh]

        written = 0
        total = 0  # stays 0 when the inputs are empty
        for total, reads in enumerate(zip(*records), 1):
            if random.random() >= prob:
                written += 1
                for read, fh in zip(reads, out_fh):
                    fh.writelines(read)

        print("wrote {} of {} reads".format(written, total))
    finally:
        for fh in in_fh + out_fh:
            fh.close()
예제 #18
0
def main():
    """Parse the command line, echo an extended header if present, and run.

    Exits with status 1 after printing the help when -p or -r is missing.
    If the first line of the regions file starts with "#" or its third
    column is not all digits, it is treated as a header and printed with
    ``slk_p`` and ``slk_sidak_p`` columns appended. Returns ``run(args)``.
    """
    p = argparse.ArgumentParser(description=__doc__,
                   formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("-p", dest="pvals", help="BED containing all the p values"
                  " used to generate `regions`")
    p.add_argument("-r", dest="regions", help="BED containing all the regions")
    p.add_argument("-s", dest="step", type=int, default=50,
            help="step size for acf calculation. should be the same"
            " value as the step sent to -d arg for acf")
    p.add_argument("--mlog", dest="mlog", action="store_true",
                   default=False, help="do the correlation on the -log10 of"
                   " the p-values. Default is to do it on the raw values")
    p.add_argument("-N", dest="N", help="number of simulations to perform",
                   type=int, default=0)
    p.add_argument("-c", dest="c", help="column number containing the p-value"
                   " of interest", type=int, default=-1)
    p.add_argument("-z", dest="z", help="use z-score correction",
                    action="store_true")
    args = p.parse_args()
    if not (args.regions and args.pvals):
        import sys
        # print_help() returns None, so this exits with status 1
        sys.exit(not p.print_help())
    from toolshed import nopen
    # builtin next() works on Python 2.6+ and 3; .next() is Python 2 only
    header = next(nopen(args.regions))
    if header.startswith("#") or (not header.split("\t")[2].isdigit()):
        print("%s\tslk_p\tslk_sidak_p" % (header.rstrip("\r\n"),))
    return run(args)
예제 #19
0
def fastq_to_dict(fastq):
    """Load a FASTQ file into ``{name: {'seq': ..., 'qual': ...}}``.

    When a name occurs more than once, the last record wins.
    """
    with nopen(fastq) as fh:
        return {name: {'seq': seq, 'qual': qual}
                for name, seq, qual in read_fastx(fh)}
예제 #20
0
파일: PairedEndFunc.py 프로젝트: wiw/pyMPFA
def FastqJoinPaired(r1, r2, output_dir, gap_size, separator, mode="paired", reverse_complement=False):
    """Join mate reads into single records separated by a gap.

    One mate file is loaded with ``fastqtodict`` and the other is streamed.
    For every streamed read whose name (text before the first space) has a
    mate, a record ``seq + gap + mate_seq`` is written to
    ``output_paired.fastq``. The gap is `gap_size` "N" bases with "*"
    qualities.

    Args:
        r1, r2: FASTQ paths of the two mates.
        output_dir: directory receiving the output files.
        gap_size: number of gap characters (converted with ``int``).
        separator: passed through to ``fastqtodict``.
        mode: "R2" streams `r2` and looks mates up in `r1`; any other value
            streams `r1` and looks mates up in `r2`. Reads without a mate
            are written to ``output_unique.fastq`` unless mode is "paired".
        reverse_complement: reverse-complement the mate sequence first.

    Returns:
        Path of ``output_paired.fastq``.
    """
    # set dictionary based on mode
    if mode == "R2":
        fastqdict = fastqtodict(r1, separator)
        fastq = r2
    else:
        fastqdict = fastqtodict(r2, separator)
        fastq = r1
    gap_bind, gap_qual = "N" * int(gap_size), "*" * int(gap_size)
    p_out = os.path.join(output_dir, "output_paired.fastq")
    unq_out = os.path.join(output_dir, "output_unique.fastq")
    with nopen(fastq) as fq:
        with open(p_out, "w") as handle_p:
            with open(unq_out, "w") as handle_unq:
                for name, seq, qual in read_fastq(fq):
                    # explicitly state space to facilitate future changes
                    name = name.split(" ")[0]
                    # .get() returns None for a missing name, so the old
                    # `.get(name)[0]` raised TypeError and the
                    # `except KeyError` branch for unpaired reads never ran.
                    try:
                        mate = fastqdict.get(name)
                    except KeyError:
                        mate = None
                    if mate is None:
                        # without pairs
                        if not mode == "paired":
                            handle_unq.write("@{}\n{}\n+\n{}\n".format(name, seq, qual))
                        continue
                    cseq, cqual = mate[0], mate[1]
                    if reverse_complement:
                        cseq = reverseComplement(cseq)
                    handle_p.write("@{}\n{}{}{}\n+\n{}{}{}\n".format(name, seq, gap_bind, cseq, qual, gap_qual, cqual))
    return p_out
예제 #21
0
def readgenes(trans):
    """Map column 1 to column 2 of a tab-delimited file.

    Returns a ``defaultdict(str)``; when a key repeats, the last row wins.
    """
    genes = defaultdict(str)
    for line in ts.nopen(trans):
        cols = line.rstrip('\r\n').split("\t")
        genes[cols[0]] = cols[1]
    return genes
예제 #22
0
def genome_control_adjust_bed(bedfiles, colnum, outfh):
    """Write a BED file with its p-value column replaced by adjusted values.

    The p-values (``d['p']`` from ``bediter``) are passed to
    ``genome_control_adjust``. The file is then re-read and column `colnum`
    of each row is replaced by the matching adjusted value, formatted with
    "%.5g". A non-numeric first line is treated as a header and copied
    through unchanged.

    Args:
        bedfiles: list holding at most one BED path; with more than one the
            process exits with status 4.
        colnum: index of the p-value column.
        outfh: writable handle for the result.

    Raises:
        ValueError: if a line other than the first has a non-numeric value
            in the p-value column.
    """
    # fail fast: previously every file was read and adjusted before this
    # check rejected the input
    if len(bedfiles) > 1:
        print("can't do genomic control adjustment with more than 1 bed file",
              file=sys.stderr)
        sys.exit(4)

    c = colnum
    adj = genome_control_adjust([d['p'] for d in bediter(bedfiles, colnum)])

    # 1 when a header line was seen, so row i maps to adj[i - 1]
    diff = 0
    for bedfile in bedfiles:
        for i, toks in enumerate(line.rstrip("\r\n").split("\t")
                                 for line in ts.nopen(bedfile)):
            try:
                float(toks[c])
            except ValueError:  # header
                if i != 0:
                    raise
                print("\t".join(toks), file=outfh)
                diff = 1
                continue
            toks[c] = "%.5g" % adj[i - diff]
            print("\t".join(toks), file=outfh)
예제 #23
0
def run_metric(cmd, metric=None):
    """
    Evaluate `metric` on the output of `cmd` and return one number.

    Metric can be a string, e.g. "wc -l", in which case the command is piped
    into it and the result converted with float(); or a python callable that
    consumes lines of input and returns a single int or float, e.g.

    def mymetric(fh):
        val = 0
        for line in fh:
            val += float(line.split("\t")[4])
        return val

    The callable receives the stdout of the running command.
    NOTE(review): the previous docstring described these lines as the result
    of "bedtools intersect -wa" with both the -a and -b intervals present;
    confirm which flag the callers actually pass.

    If `metric` is None, `cmd` must be a (cmd, metric) pair.
    """
    if metric is None:
        cmd, metric = cmd

    is_shell_metric = isinstance(metric, basestring)
    if is_shell_metric:
        return float(run("%s | %s" % (cmd, metric)))

    handle = nopen("|%s" % cmd, mode=None)
    result = metric(handle.stdout)
    check_proc(handle, cmd)
    assert isinstance(result, (int, float))
    return result
예제 #24
0
def local_shuffle(bed, loc='500000'):
    """
    Randomize the location of each interval in `bed` by moving its
    start location to within `loc` bp of its current location or to
    its containing interval in `loc`.

    The shuffled intervals are printed to stdout; nothing is returned.

    Arguments:
        bed - input bed file
        loc - shuffle intervals to within this distance (+ or -).
               If not an integer, then this should be a BED file containing
               regions such that each interval in `bed` is shuffled within
               its containing interval in `loc`
    """
    from random import randint
    if str(loc).isdigit():
        dist = abs(int(loc))
        with nopen(bed) as fh:
            for toks in (l.rstrip('\r\n').split('\t') for l in fh):
                # one random offset per interval, applied to both start and
                # end; each coordinate is floored at 0.
                d = randint(-dist, dist)
                toks[1:3] = [str(max(0, int(bloc) + d)) for bloc in toks[1:3]]
                print "\t".join(toks)
    else:
        # `loc` is a file of windows within which to shuffle
        assert os.path.exists(loc)
        bed4 = mktemp()
        with open(bed4, 'w') as fh:
            # this step is so we don't have to track the number of columns in A:
            # the whole original row is packed into a 4th column joined by SEP
            for toks in reader(bed, header=False):
                fh.write("%s\t%s\n" % ("\t".join(toks[:3]), SEP.join(toks)))

        missing = 0
        # we first find the b-interval that contains each a-interval by
        # using bedtools intersect
        # (the command is filled in from the local variables bed4 and loc)
        for toks in reader("|bedtools intersect -wao -a {bed4} -b {loc}"
                           .format(**locals()), header=False):
            ajoin = toks[:4]
            a = ajoin[3].split(SEP)  # extract the full interval
            b = toks[4:]

            # the last column is 0 when the interval hit no window
            # (presumably the overlap size reported by -wao -- confirm)
            if int(b[-1]) == 0:
                missing += 1
                continue
            assert a[0] == b[0], ('chroms dont match', a, b)

            alen = int(a[2]) - int(a[1])
            # doesn't care if the new interval is completely contained in b
            astart = randint(int(b[1]), int(b[2]))

            # subtract half the time.
            # (only when astart > alen, so the other end stays positive)
            aend = (astart - alen) if randint(0, 1) == 0 and astart > alen \
                else (astart + alen)

            # write the two ends back in ascending order
            a[1], a[2] = map(str, (astart, aend) if astart < aend
                             else (aend, astart))

            print "\t".join(a)
        if missing > 0:
            # the message is filled in from the local variables
            print >> sys.stderr, ("found {missing} intervals in {bed} that "
                                  " were not contained in {loc}"
                                  .format(**locals()))
예제 #25
0
def main():
    """Parse the command line, echo an extended header if present, and run.

    Exits with status 1 after printing the help when -p or -r is missing.
    If the first line of the regions file starts with "#" or its third
    column is not all digits, it is treated as a header and printed with
    ``slk_p`` and ``slk_sidak_p`` columns appended.

    ``-c`` may be a column name or a number. A name found in the header of
    the p-values file is converted to its 1-based column number; anything
    else is converted with ``int``. Returns ``run(args)``.
    """
    p = argparse.ArgumentParser(description=__doc__,
                   formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("-p", dest="pvals", help="BED containing all the p values"
                  " used to generate `regions`")
    p.add_argument("-r", dest="regions", help="BED containing all the regions")
    p.add_argument("-s", "--step", dest="step", type=int, default=50,
            help="step size for acf calculation. should be the same"
            " value as the step sent to -d arg for acf")
    p.add_argument("-c", dest="c", help="column number containing the p-value"
                   " of interest", type=str, default=-1)
    p.add_argument("-z", dest="z", help="use z-score correction",
                    action="store_true")
    args = p.parse_args()
    if not (args.regions and args.pvals):
        import sys
        # print_help() returns None, so this exits with status 1
        sys.exit(not p.print_help())
    # builtin next() works on Python 2.6+ and 3; .next() is Python 2 only
    header = next(ts.nopen(args.regions))
    if header.startswith("#") or (not header.split("\t")[2].isdigit()):
        print("%s\tslk_p\tslk_sidak_p" % (header.rstrip("\r\n"),))

    header = ts.header(args.pvals)
    if args.c in header:
        args.c = header.index(args.c) + 1
    else:
        args.c = int(args.c)
    return run(args)
예제 #26
0
def infos(path):
    """Return the IDs of the ``##INFO`` declarations at the top of a file.

    Reading stops at the first line whose second character is not "#".
    The ID is the text between the first "ID=" and the following comma.
    """
    ids = []
    for x in ts.nopen(path):
        # slicing instead of indexing: no IndexError on a 1-character line
        if x[1:2] != "#": break
        # only real INFO declarations; a bare `"INFO" in x` also matched any
        # other header line that merely mentions INFO (e.g. in a description)
        if not x.startswith("##INFO="): continue
        ids.append(x.split("ID=")[1].split(",")[0])
    return ids
예제 #27
0
def read_regions(fregions):
    """Read a tab-delimited regions file into ``{chrom: [(start, end), ...]}``.

    Lines starting with "#" are skipped. Returns None when `fregions` is
    falsy.
    """
    if not fregions:
        return None
    by_chrom = {}
    for line in ts.nopen(fregions):
        if line[0] == "#":
            continue
        cols = line.split("\t")
        by_chrom.setdefault(cols[0], []).append((int(cols[1]), int(cols[2])))
    return by_chrom
예제 #28
0
 def protein(self):
     """Fetch this gene's protein sequence from the UCSC hgGene page.

     Non-blank response lines that do not contain ">" are stripped and
     concatenated into one string.
     """
     from toolshed import nopen
     base = "http://genome.ucsc.edu/cgi-bin/hgGene?hgg_do_getProteinSeq=1&hgg_gene="
     residues = []
     for raw in nopen(base + self.name):
         line = raw.strip()
         if line and ">" not in raw:
             residues.append(line)
     return "".join(residues)
예제 #29
0
def read_regions(fregions):
    """Group the intervals of a regions file by chromosome.

    Each non-"#" line contributes ``(int(col2), int(col3))`` to the list
    stored under its first column. Returns None when `fregions` is falsy.
    """
    if not fregions:
        return None
    regions = {}
    rows = (line.split("\t") for line in ts.nopen(fregions)
            if not line[0] == "#")
    for row in rows:
        chrom = row[0]
        if chrom not in regions:
            regions[chrom] = []
        interval = (int(row[1]), int(row[2]))
        regions[chrom].append(interval)
    return regions
예제 #30
0
파일: models.py 프로젝트: brentp/cruzdb
 def protein(self):
     """Return the protein sequence for ``self.name`` from UCSC hgGene.

     The response is read line by line; blank lines and lines containing
     ">" are dropped and the remaining stripped lines are joined.
     """
     from toolshed import nopen
     prefix = "http://genome.ucsc.edu/cgi-bin/hgGene?hgg_do_getProteinSeq=1&hgg_gene="
     url = prefix + self.name
     kept = (row.strip() for row in nopen(url)
             if row.strip() and ">" not in row)
     return "".join(kept)
예제 #31
0
def tofile(fiter, fname):
    """Write the lines of `fiter` to `fname` and delete the file at exit.

    Every line is written without its trailing "\\r"/"\\n" characters and
    followed by a single "\\n".

    Returns:
        `fname`.
    """
    fh = nopen(fname, "w")
    # the file is temporary: schedule its removal as soon as it exists, so
    # it is cleaned up even when writing fails part-way
    atexit.register(os.unlink, fname)
    try:
        for line in fiter:
            # write() instead of the Python-2-only "print >> fh" statement
            fh.write(line.rstrip("\r\n") + "\n")
    finally:
        fh.close()
    return fname
예제 #32
0
def main():
    """Split a Cytoscape attributes file into one ``.eda`` file per feature.

    The first line of the input is discarded as a header. Every other line is
    split on its first "="; the right-hand side (stripped, upper-cased) is the
    feature and the left-hand side is written as ``<lhs> = True``. A new
    ``<uniqid>.<feature>.eda`` file is started whenever the feature differs
    from the previous line's, where ``uniqid`` is the input's basename up to
    its first ".".
    """
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('attrs', help='Cytoscape attributes file of overlapping features')
    args = p.parse_args()

    attrsfile = nopen(args.attrs)
    # remove header
    attrsfile.readline()

    uniqid = os.path.basename(args.attrs).split(".", 1)[0]

    previous_feature = None
    fileout = None

    try:
        for line in attrsfile:
            attr = line.rstrip("\r\n").split("=", 1)

            # everything before the "="
            cyto_info = attr[0].strip()
            feature = attr[1].strip().upper()

            if not (previous_feature and previous_feature == feature):
                # feature changed: finish the previous output before starting
                # the next one so open handles do not accumulate
                if fileout is not None:
                    fileout.close()
                fileout = open("%s.%s.eda" % (uniqid, feature), 'w')
                fileout.write("feature%s\n" % feature)
                previous_feature = feature
            fileout.write(" ".join((cyto_info, "= True\n")))
    finally:
        if fileout is not None:
            fileout.close()
예제 #33
0
파일: cadd2vcf.py 프로젝트: xuyangy/vcfanno
def main(precision, path):
    """Print the CADD table at `path` as VCF records on stdout.

    The first input line is emitted (stripped of "#" and spaces) inside a
    ``##CADDCOMMENT`` header entry. The ``#Chrom`` line supplies the column
    names for the rows that follow. Each row is printed with QUAL 1, FILTER
    PASS and an INFO field holding ``raw`` and ``phred`` formatted to
    `precision` decimal places.
    """
    header = None

    tmpl = "{Chrom}\t{Pos}\t.\t{Ref}\t{Alt}\t1\tPASS\traw={RawScore:.%if};phred={PHRED:.%if}" % (
        precision, precision)

    # The column line is built with an explicit tab join: VCF requires every
    # field of the #CHROM line, INFO included, to be tab-separated.
    hdr = "\n".join((
        '##fileformat=VCFv4.1',
        '##INFO=<ID=raw,Number=1,Type=Float,Description="raw cadd score">',
        '##INFO=<ID=phred,Number=1,Type=Float,Description="phred-scaled cadd score">',
        '##CADDCOMMENT=<ID=comment,comment="{comment}">',
        "\t".join(("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")),
    ))

    for i, line in enumerate(ts.nopen(path)):
        line = _to_str(line)
        if i == 0:
            print(hdr.format(comment=line.strip("# ").strip()))
            continue
        if header is None and line.startswith("#Chrom"):
            header = line[1:].rstrip().split("\t")
            continue
        d = dict(zip(header, line.rstrip().split("\t")))
        d['PHRED'] = float(d['PHRED'])
        d['RawScore'] = float(d['RawScore'])
        print(tmpl.format(**d))
예제 #34
0
파일: __main__.py 프로젝트: brentp/shuffler
def tofile(fiter, fname):
    """Dump `fiter` line by line into `fname`; return `fname`.

    Trailing "\\r"/"\\n" characters are stripped from each line before it is
    written with a single newline. Once writing succeeds, removal of the file
    is scheduled for interpreter exit.
    """
    fh = nopen(fname, "w")
    try:
        for line in fiter:
            print >>fh, line.rstrip("\r\n")
    finally:
        # release the handle even if the input iterator fails part-way
        fh.close()
    atexit.register(os.unlink, fname)
    return fname
예제 #35
0
파일: bwameth.py 프로젝트: brentp/bwa-meth
def convert_reads(fq1s, fq2s, out=sys.stdout):
    """Convert FASTQ records and write them to `out`.

    `fq1s` and `fq2s` are comma-separated path lists paired up positionally;
    an `fq2s` entry of "NA" means there is no second file. Up to five lines
    of each first file are inspected, and when the first space-delimited token
    of the first and last of those lines match, the file is treated as
    interleaved. Every record is passed to ``convert_and_write_read`` along
    with ``read_i % 2``.

    A warning is written to stderr when more than 50 reads shorter than 80
    bases were seen over all inputs. Returns 0.
    """
    # tally short reads over every input pair, not just the last one
    lt80 = 0

    for fq1, fq2 in zip(fq1s.split(","), fq2s.split(",")):
        sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2))
        fq1 = nopen(fq1)

        # examine the first five lines to detect an interleaved fastq file
        first_five = list(islice(fq1, 5))

        r1_header = first_five[0]
        r2_header = first_five[-1]

        already_interleaved = r1_header.split(' ')[0] == r2_header.split(' ')[0]

        # put the peeked lines back in front of the rest of the file, then
        # group the stream into 4-line records
        q1_iter = izip(*[chain.from_iterable([first_five, fq1])] * 4)

        if fq2 != "NA":
            fq2 = nopen(fq2)
            q2_iter = izip(*[fq2] * 4)
        else:
            if already_interleaved:
                sys.stderr.write("detected interleaved fastq\n")
            else:
                sys.stderr.write("WARNING: running bwameth in single-end mode\n")
            # placeholder mates; they are skipped below because name is None
            q2_iter = repeat((None, None, None, None))

        if already_interleaved:
            selected_iter = q1_iter
        else:
            selected_iter = chain.from_iterable(izip(q1_iter, q2_iter))

        for read_i, (name, seq, _, qual) in enumerate(selected_iter):
            if name is None: continue
            convert_and_write_read(name, seq, qual, read_i % 2, out)
            if len(seq) < 80:
                lt80 += 1

    out.flush()
    if lt80 > 50:
        sys.stderr.write("WARNING: %i reads with length < 80\n" % lt80)
        sys.stderr.write("       : this program is designed for long reads\n")
    return 0
예제 #36
0
def main():
    """Print the matrisome table extended with ortholog and UniProt columns.

    The cross-reference (``get_xref``) and the UniProt library
    (``parse_uniprot_flat``) are built first. For every matrisome row, each
    ":"-separated name in the ``args.xref_col`` column is compared against the
    'orthonames' of every cross-reference record. For each match, the record's
    rat names and scores are appended to the row, followed by the annotations
    of every UniProt record whose 'ensemblp' list contains that rat name.
    Appended values are ":"-delimited and trailing ":" characters are removed
    when the row is printed.
    """
    args = get_args()

    if args.verbose:
        sys.stderr.write(">> building gene orthology cross-reference...\n")
    xref = get_xref(args.xref)

    if args.verbose:
        sys.stderr.write(">> building uniprot library...\n")
    uniprot = parse_uniprot_flat(args.uniprot)

    if args.verbose:
        sys.stderr.write(">> annotating matrisome...\n")

    # output column -> key of the uniprot record that fills it
    uniprot_columns = (
        ('r_geneid', 'geneid'),
        ('r_gene_description', 'description'),
        ('r_uniprot', 'uniprotid'),
        ('r_interpro', 'interpro'),
        ('r_refseqn', 'refseqn'),
        ('r_refseqp', 'refseqp'),
        ('r_ensg', 'ensemblg'),
        ('r_enst', 'ensemblt'),
        ('r_ensp', 'ensemblp'),
    )
    headerext = ['r_ENSRNOP', 'r_score']
    headerext.extend(column for column, _ in uniprot_columns)

    header = nopen(args.matrisome).readline().rstrip("\r\n").split("\t")
    header.extend(headerext)
    print("\t".join(header))

    for entry in reader(args.matrisome):

        # start every added column empty for this row
        for column in headerext:
            entry[column] = ""

        # the cross-reference column may hold several names delimited by ":"
        for entryname in entry[args.xref_col].split(":"):

            # full scan of the cross-reference for every name
            for uid, ddict in xref.iteritems():
                for orthoname in ddict['orthonames']:
                    if orthoname != entryname:
                        continue

                    # matching ortholog: record its rat names and scores
                    for ratname, ratscore in izip(ddict['ratnames'], ddict['ratscores']):
                        entry['r_ENSRNOP'] += "%s:" % ratname
                        entry['r_score'] += "%s:" % ratscore

                        # add the annotation of each uniprot record listing
                        # this rat protein
                        for uniqueid, uniprot_entry in uniprot.iteritems():
                            for ensemblname in uniprot_entry['ensemblp']:
                                if ensemblname != ratname:
                                    continue
                                for column, key in uniprot_columns:
                                    entry[column] += ':'.join(uniprot_entry[key]) + ":"

        print("\t".join(entry[h].rstrip(":") for h in header))
예제 #37
0
def get_read_length(fq):
    """Return the common read length found at the start of FASTQ file `fq`.

    The second line of each 4-line record is measured until 101 lengths have
    been collected or the file ends.

    Raises AssertionError unless all measured reads share a single length.
    """
    lens = []
    for i, line in enumerate(ts.nopen(fq)):
        if i % 4 == 1:
            # strip the terminator explicitly: "\r\n" endings or a final line
            # without a newline would be mis-measured by len(line) - 1
            lens.append(len(line.rstrip("\r\n")))
        if len(lens) > 100: break
    assert len(set(lens)) == 1, ("don't trim reads before sending to bwa-mips", set(lens))
    return lens[0]
예제 #38
0
def readstat(cifstat):
    """Yield a ``CifStat`` for every group of six non-blank lines in `cifstat`.

    Lines have trailing "\\r"/"\\n" characters removed and whitespace-only
    lines are skipped before grouping.

    Raises AssertionError when the final group has fewer than six lines.
    """
    with nopen(cifstat) as fh:
        clean = (x.strip("\r\n") for x in fh if x.strip())
        while True:
            rd = list(islice(clean, 6))
            if not rd:
                # a plain return ends the generator; raising StopIteration
                # here turns into RuntimeError under PEP 479
                return
            assert all(rd) and len(rd) == 6
            yield CifStat(rd)
예제 #39
0
def readfq(fq):
    """Yield a ``Fastq`` for every group of four non-blank lines in `fq`.

    Lines have trailing "\\r"/"\\n" characters removed and whitespace-only
    lines are skipped before grouping.

    Raises AssertionError when the final group has fewer than four lines.
    """
    with nopen(fq) as fh:
        fqclean = (x.strip("\r\n") for x in fh if x.strip())
        while True:
            rd = list(islice(fqclean, 4))
            if not rd:
                # a plain return ends the generator; raising StopIteration
                # here turns into RuntimeError under PEP 479
                return
            assert all(rd) and len(rd) == 4
            yield Fastq(rd)
예제 #40
0
 def __call__(self, fh):
     """Feed `fh` to ``self.command_string`` and return ``{"value": float}``.

     The lines of `fh` are first written to a temporary file via ``tofile``;
     the first line produced by ``nopen("<command> < <tmpfile>")`` is parsed
     as a float. On any failure the command string is printed and the
     exception is re-raised.
     """
     tmp_path = tofile(fh, tempfile.mktemp())
     try:
         first_line = nopen("%s < %s" % (self.command_string, tmp_path)).next()
         return {"value": float(first_line)}
     except:
         # show which command failed, then let the error propagate
         print(self.command_string)
         raise
예제 #41
0
파일: PairedEndFunc.py 프로젝트: wiw/pyMPFA
def fastqtodict(fastq, separator):
    """Return a dict mapping read name to ``[seq, qual]``.

    The key is the part of each name before the first `separator`; a later
    record with the same key replaces an earlier one.
    """
    with nopen(fastq) as handle:
        return dict(
            (name.split(separator)[0], [seq, qual])
            for name, seq, qual in read_fastq(handle))
예제 #42
0
def read_acf(acf_file):
    """Parse a tab-delimited ACF file into a sorted list of items.

    Lines starting with "#" and the line whose first column is "lag_min" are
    skipped. Every other line contributes
    ``((int(col1), int(col2)), float(col3))``; the items are returned sorted.
    """
    by_lag = {}
    for line in ts.nopen(acf_file):
        if line[0] == "#":
            continue
        cols = line.split("\t")
        if cols[0] != "lag_min":
            by_lag[(int(cols[0]), int(cols[1]))] = float(cols[2])
    return sorted(by_lag.items())
예제 #43
0
파일: __main__.py 프로젝트: brentp/shuffler
 def __call__(self, fh):
     """Run ``self.command_string`` over `fh` and wrap its first output line.

     `fh` is spooled to a temporary file with ``tofile`` and the command is
     invoked through ``nopen`` with that file redirected to its stdin. The
     first line read back is converted to float and returned under the key
     "value". If anything fails, the command string is printed before the
     exception propagates.
     """
     spooled = tofile(fh, tempfile.mktemp())
     try:
         output = nopen("%s < %s" % (self.command_string, spooled))
         result = float(output.next())
         return {"value": result}
     except:
         print(self.command_string)
         raise
예제 #44
0
def convert_reads(fq1s, fq2s, out=sys.stdout):
    """Write base-converted FASTQ records for the given read files to `out`.

    `fq1s` and `fq2s` are comma-separated path lists paired up positionally;
    an `fq2s` entry of "NA" selects single-end mode. Files are read as 4-line
    records. For the first read of a pair every "C" becomes "T", for the
    second every "G" becomes "A". The upper-cased original sequence and the
    conversion are appended to the read name as ``YS:Z:`` and ``YC:Z:`` tags.

    Exits with status 1 if a record name does not start with "@". A warning
    is written to stderr when more than 50 reads shorter than 80 bases were
    seen over all inputs. Returns 0.
    """
    # tally short reads over every input pair, not just the last one
    lt80 = 0

    for fq1, fq2 in zip(fq1s.split(","), fq2s.split(",")):
        sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2))
        fq1 = nopen(fq1)
        if fq2 != "NA":
            fq2 = nopen(fq2)
            q2_iter = izip(*[fq2] * 4)
        else:
            sys.stderr.write("WARNING: running bwameth in single-end mode\n")
            # placeholder mates; skipped below because their name is None
            q2_iter = repeat((None, None, None, None))
        q1_iter = izip(*[fq1] * 4)

        for pair in izip(q1_iter, q2_iter):
            for read_i, (name, seq, _, qual) in enumerate(pair):
                if name is None: continue
                name = name.rstrip("\r\n").split(" ")[0]
                if name[0] != "@":
                    sys.stderr.write("""ERROR!!!!
ERROR!!! FASTQ conversion failed
ERROR!!! expecting FASTQ 4-tuples, but found a record %s that doesn't start with "@"
""" % name)
                    sys.exit(1)
                # drop the mate suffix from the read name
                if name.endswith(("_R1", "_R2")):
                    name = name[:-3]
                elif name.endswith(("/1", "/2")):
                    name = name[:-2]

                seq = seq.upper().rstrip('\n')
                if len(seq) < 80:
                    lt80 += 1

                char_a, char_b = ['CT', 'GA'][read_i]
                # keep original sequence as name.
                name = " ".join((name,
                                "YS:Z:" + seq +
                                "\tYC:Z:" + char_a + char_b + '\n'))
                seq = seq.replace(char_a, char_b)
                out.write("".join((name, seq, "\n+\n", qual)))

    out.flush()
    if lt80 > 50:
        sys.stderr.write("WARNING: %i reads with length < 80\n" % lt80)
        sys.stderr.write("       : this program is designed for long reads\n")
    return 0
예제 #45
0
def fqiter(fq, n=4):
    """Yield lists of `n` consecutive non-blank lines from `fq`.

    Lines have trailing "\\r"/"\\n" characters removed and whitespace-only
    lines are skipped before grouping.

    Raises AssertionError when the final group has fewer than `n` lines.
    """
    with ts.nopen(fq) as fh:
        fqclean = (x.strip("\r\n") for x in fh if x.strip())
        while True:
            rec = list(islice(fqclean, n))
            if not rec:
                # a plain return ends the generator; raising StopIteration
                # here turns into RuntimeError under PEP 479
                return
            # compare against the requested record size, not a fixed 4
            assert all(rec) and len(rec) == n
            yield rec
예제 #46
0
def convert_reads(fq1s, fq2s, out=sys.stdout):
    """Write base-converted FASTQ records for the given read files to `out`.

    `fq1s` and `fq2s` are comma-separated path lists paired up positionally;
    an `fq2s` entry of "NA" selects single-end mode. Files are read as 4-line
    records. For the first read of a pair every "C" becomes "T", for the
    second every "G" becomes "A". The upper-cased original sequence and the
    conversion are appended to the read name as ``YS:Z:`` and ``YC:Z:`` tags.

    Exits with status 1 if a record name does not start with "@". A warning
    is written to stderr when more than 50 reads shorter than 80 bases were
    seen over all inputs. Returns 0.
    """
    # tally short reads over every input pair, not just the last one
    lt80 = 0

    for fq1, fq2 in zip(fq1s.split(","), fq2s.split(",")):
        sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2))
        fq1 = nopen(fq1)
        if fq2 != "NA":
            fq2 = nopen(fq2)
            q2_iter = izip(*[fq2] * 4)
        else:
            sys.stderr.write("WARNING: running bwameth in single-end mode\n")
            # placeholder mates; skipped below because their name is None
            q2_iter = repeat((None, None, None, None))
        q1_iter = izip(*[fq1] * 4)

        for pair in izip(q1_iter, q2_iter):
            for read_i, (name, seq, _, qual) in enumerate(pair):
                if name is None: continue
                name = name.rstrip("\r\n").split(" ")[0]
                if name[0] != "@":
                    sys.stderr.write("""ERROR!!!!
ERROR!!! FASTQ conversion failed
ERROR!!! expecting FASTQ 4-tuples, but found a record %s that doesn't start with "@"
""" % name)
                    sys.exit(1)
                # drop the mate suffix from the read name
                if name.endswith(("_R1", "_R2")):
                    name = name[:-3]
                elif name.endswith(("/1", "/2")):
                    name = name[:-2]

                seq = seq.upper().rstrip('\n')
                if len(seq) < 80:
                    lt80 += 1

                char_a, char_b = ['CT', 'GA'][read_i]
                # keep original sequence as name.
                name = " ".join(
                    (name, "YS:Z:" + seq + "\tYC:Z:" + char_a + char_b + '\n'))
                seq = seq.replace(char_a, char_b)
                out.write("".join((name, seq, "\n+\n", qual)))

    out.flush()
    if lt80 > 50:
        sys.stderr.write("WARNING: %i reads with length < 80\n" % lt80)
        sys.stderr.write("       : this program is designed for long reads\n")
    return 0
예제 #47
0
def convert_reads(fq1s, fq2s, out=sys.stdout):
    """Write base-converted FASTQ records to `out`, then flush and close it.

    `fq1s` and `fq2s` are comma-separated path lists paired up positionally;
    an `fq2s` entry of "NA" selects single-end mode. Records whose name
    contains 'ST:Z:gbs' are skipped. The conversion is C->T for the first read
    and G->A for the second, swapped when the name contains "crick" (case
    insensitive). The output name is the original name line with spaces
    turned into tabs, followed by a ``YS:Z:`` tag holding the upper-cased
    original sequence.
    """
    for fq1, fq2 in zip(fq1s.split(","), fq2s.split(",")):
        sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2))
        fq1 = nopen(fq1)
        if fq2 == "NA":
            sys.stderr.write("WARNING: running bwameth in single-end mode\n")
            q2_iter = repeat((None, None, None, None))
        else:
            fq2 = nopen(fq2)
            q2_iter = izip(*[fq2] * 4)
        q1_iter = izip(*[fq1] * 4)

        lt80 = 0
        for pair in izip(q1_iter, q2_iter):
            for read_i, (name, seq, _, qual) in enumerate(pair):
                # no mate present, or the read is tagged as GBS
                if name is None or 'ST:Z:gbs' in name:
                    continue

                # name line minus its last character, spaces turned into tabs
                original_name = name[:-1].replace(' ', '\t')

                conversions = ['CT', 'GA']
                if 'crick' in name.lower():
                    conversions.reverse()
                char_a, char_b = conversions[read_i]

                bases = seq.upper().rstrip('\n')
                if len(bases) < 80:
                    lt80 += 1

                # keep the original sequence as a tag on the name line
                tagged_name = "\t".join((original_name, "YS:Z:" + bases + '\n'))
                converted = bases.replace(char_a, char_b)
                out.write("".join((tagged_name, converted, "\n+\n", qual)))

    out.flush()
    out.close()
    if lt80 > 50:
        # the short-read tally is kept, but nothing is reported for it
        pass
예제 #48
0
파일: poverlap.py 프로젝트: daler/poverlap
def bed_sample(bed, n=100):
    """\
    choose n random lines from a bed file. uses reservoir sampling
    Arguments:
        bed - a bed file
        n - number of lines to sample
    """
    n, lines = int(n), []
    from random import randint
    with nopen(bed) as fh:
        for i, line in enumerate(nopen(fh)):
            if i < n:
                lines.append(line)
            else:
                replace_idx = randint(0, i)
                if replace_idx < n:
                    lines[replace_idx] = line
        print "".join(lines),
def process_exact_fastq(fastq, n):
    """Count identical trimmed reads in `fastq`; return the Counter.

    Each sequence is passed through ``trim_seq(seq, 4)``; results shorter
    than `n` are ignored.
    """
    counts = Counter()
    with nopen(fastq) as fh:
        for _name, seq, _qual in read_fastq(fh):
            trimmed = trim_seq(seq, 4)
            if len(trimmed) >= n:
                counts[trimmed] += 1
    return counts
예제 #50
0
def readfa(fa):
    """Yield ``(name, seq)`` for every record of FASTA file `fa`.

    Lines are grouped by whether they start with ">". The name is the first
    line of a header group without its ">" and surrounding whitespace; the
    sequence is the following group of lines, stripped and concatenated.
    """
    with nopen(fa) as fh:
        for header, group in groupby(fh, lambda line: line[0] == '>'):
            if header:
                # builtin next() works on both Python 2.6+ and Python 3,
                # unlike the Python-2-only .next() method
                line = next(group)
                name = line[1:].strip()
            else:
                seq = ''.join(line.strip() for line in group)
                yield name, seq
예제 #51
0
def _qvality(fbed_file, col_num, col_null):
    """Yield ``(qval, pep, line)`` for each data line of `fbed_file`.

    The 'p' values read via ``bediter`` from column `col_num` and from column
    `col_null` are handed to ``qvality``; its results are paired, in order,
    with the file's lines after ``drop_header`` has been applied.
    """
    from qvality import qvality

    observed = [rec['p'] for rec in bediter(fbed_file, col_num)]
    null_ps = [rec['p'] for rec in bediter(fbed_file, col_null)]
    lines = ts.nopen(fbed_file)
    drop_header(lines)
    scored = qvality(observed, null_ps, r=None)
    for (_pval, pep, qval), line in izip(scored, lines):
        yield qval, pep, line
예제 #52
0
def _qvality(fbed_file, col_num, col_null):
    """Pair ``qvality`` results with the lines of `fbed_file`.

    ``qvality`` receives the 'p' values that ``bediter`` reads from column
    `col_num` and, as nulls, those from column `col_null`. The file is then
    reopened, ``drop_header`` is applied, and ``(qval, pep, line)`` is yielded
    for each result/line pair.
    """
    from qvality import qvality

    pvalues = []
    for bed in bediter(fbed_file, col_num):
        pvalues.append(bed['p'])
    null_pvalues = []
    for bed in bediter(fbed_file, col_null):
        null_pvalues.append(bed['p'])

    handle = ts.nopen(fbed_file)
    drop_header(handle)
    for result, line in izip(qvality(pvalues, null_pvalues, r=None), handle):
        _pval, pep, qval = result
        yield qval, pep, line
예제 #53
0
def convert_reads(fq1s, fq2s, out=sys.stdout):
    """Convert paired FASTQ inputs and write them to `out`, which is closed.

    The comma-separated lists `fq1s` and `fq2s` are walked in parallel; "NA"
    in `fq2s` means no second file. Each 4-line record is skipped when its
    name contains 'ST:Z:gbs'. Otherwise the upper-cased sequence is written
    with C->T (first read) or G->A (second read) substituted, the two being
    swapped when "crick" occurs in the lower-cased name. The name line is
    emitted with spaces replaced by tabs and a ``YS:Z:`` tag carrying the
    unconverted sequence.
    """
    file_pairs = zip(fq1s.split(","), fq2s.split(","))
    for fq1, fq2 in file_pairs:
        sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2))
        fq1 = nopen(fq1)
        single_end = fq2 == "NA"
        if single_end:
            sys.stderr.write("WARNING: running bwameth in single-end mode\n")
            q2_iter = repeat((None, None, None, None))
        else:
            fq2 = nopen(fq2)
            q2_iter = izip(*[fq2] * 4)
        q1_iter = izip(*[fq1] * 4)

        lt80 = 0
        for pair in izip(q1_iter, q2_iter):
            for read_i, (name, seq, _, qual) in enumerate(pair):
                # absent mate or GBS-tagged read: nothing to write
                if name is None or 'ST:Z:gbs' in name:
                    continue

                original_name = name[:-1].replace(' ', '\t')
                if 'crick' in name.lower():
                    char_a, char_b = ['GA', 'CT'][read_i]
                else:
                    char_a, char_b = ['CT', 'GA'][read_i]

                seq = seq.upper().rstrip('\n')
                if len(seq) < 80:
                    lt80 += 1

                # the unconverted sequence travels along on the name line
                name_line = "\t".join((original_name, "YS:Z:" + seq + '\n'))
                out.write("".join(
                    (name_line, seq.replace(char_a, char_b), "\n+\n", qual)))

    out.flush()
    out.close()
    if lt80 > 50:
        # short reads are counted but deliberately not reported here
        pass
예제 #54
0
def bed_sample(bed, n=100):
    """\
    Choose n random lines from a bed file. Uses reservoir sampling.

    Arguments:
        bed - a bed file
        n - number of lines to sample
    """
    n, lines = int(n), []
    from random import randint
    with nopen(bed) as fh:
        for i, line in enumerate(nopen(fh)):
            if i < n:
                lines.append(line)
            else:
                replace_idx = randint(0, i)
                if replace_idx < n:
                    lines[replace_idx] = line
        print "".join(lines),
예제 #55
0
def counter(fname):
    """Return, for each value 0..255, the count of lines with column 5 >= it.

    `fname` may be a path or a sequence whose first item is the path; the path
    is echoed to stderr. Lines starting with "@" are skipped. Column 5 of each
    remaining tab-delimited line is tallied, and the tallies are turned into
    suffix sums, so entry q counts the lines whose value is q or higher.
    """
    if not isinstance(fname, basestring):
        fname = fname[0]
    print >>sys.stderr, fname
    qual_count = [0] * 256
    for record in nopen(fname):
        if record.startswith('@'):
            continue
        fields = record.rstrip().split("\t")
        qual_count[int(fields[4])] += 1
    # reverse, accumulate, reverse back: a running total from the top down
    return np.cumsum(qual_count[::-1])[::-1]
예제 #56
0
def main(regions,
         bams,
         reads=None,
         flags="-F%i" % (0x100 | 0x4 | 0x200 | 0x400),
         pad=100):
    r2 = open(tempfile.mktemp(), 'w')
    for toks in reader(regions, header=False):
        if toks[0][0] == "@" or not (toks[1] + toks[2]).isdigit(): continue
        toks[1] = str(max(0, int(toks[1]) - pad))
        toks[2] = str(int(toks[2]) + pad)
        print >> r2, "\t".join(toks)
    r2.flush()
    regions = r2.name
    print reads
    if reads.isdigit():
        reads = int(reads)
    elif reads != "bam":
        reads = int(
            nopen(
                "|bioawk -c fastx 'END { print NR }' %s" % reads).next()) * 2.0

    counts = {}
    colors = cycle('rgbkmy')
    bam_reads = {}

    counts = dict(pmap(count_both, ((bam, regions, flags) for bam in bams)))

    for bam in bams:
        nreads = count_bam(bam, flags) if reads == "bam" else reads
        bam_reads[bam] = nreads
        symbol = 'o' if len(set(counts[bam][0])) < 3 else '.'
        pl.plot(counts[bam][0] / float(nreads),
                counts[bam][1] / float(nreads),
                '%s%s' % (colors.next(), symbol),
                label=name(bam))

    pl.xlabel('off target')
    pl.ylabel('on target')
    pl.legend(loc='lower right')
    pl.xlim(xmin=0)
    pl.ylim(ymin=0)
    pl.show()
    os.unlink(r2.name)

    out = sys.stdout
    print >> out, "qual\tmethod\toff\ton"

    for qual in range(0, 256):
        for b in bams:
            print >> out, "{qual}\t{bam}\t{off}\t{on}".format(
                qual=qual,
                bam=name(b),
                off=counts[b][0][qual] / bam_reads[bam],
                on=counts[b][1][qual] / bam_reads[bam])
    print >> sys.stderr, "wrote", out.name