def calc_ontarget_rate(tree,
                       gene_info,
                       input_fasta,
                       sam_filename,
                       output_filename=None):
    """Write a tab-delimited probe-hit report for the alignments in a SAM file.

    One line is written per SAM record whose reference name (``sID``) is not
    ``'*'``. Each line holds the read id, the read length, a constant ``Y``
    in the ``is_fl`` column, the probe count and overlapping-base count
    returned by ``get_probe_hit``, the aligned locus formatted as
    ``sID:sStart-sEnd``, and the comma-joined gene names.

    Args:
        tree: Passed through unchanged to ``get_probe_hit``.
        gene_info: Passed through unchanged to ``get_probe_hit``.
        input_fasta: Path to the FASTA file of the reads; used only to build
            the read id -> sequence length mapping given to the SAM reader.
        sam_filename: Path to the SAM file to read.
        output_filename: Path of the report to write. When ``None`` the
            report goes to ``sys.stdout``.
    """
    # Close the FASTA handle as soon as the length table is built.
    with open(input_fasta) as fasta_handle:
        query_len_dict = {
            rec.id: len(rec.seq)
            for rec in SeqIO.parse(fasta_handle, 'fasta')
        }

    writes_to_stdout = output_filename is None
    out = sys.stdout if writes_to_stdout else open(output_filename, 'w')
    try:
        out.write(
            "read_id\tread_len\tis_fl\tnum_probe\tnum_base_overlap\tloci\tgenes\n"
        )
        # NOTE(review): the meaning of the positional ``True`` is defined by
        # GMAPSAMReader, which is not visible here -- confirm against it.
        reader = BioReaders.GMAPSAMReader(sam_filename,
                                          True,
                                          query_len_dict=query_len_dict)
        for r in reader:
            # '*' is the reference name of a record with no alignment target
            # (presumably an unmapped read); it has no locus to report.
            if r.sID == '*':
                continue
            num_probe, base_hit, genes_seen = get_probe_hit(
                tree, gene_info, r)
            out.write("{0}\t{1}\tY\t{2}\t{3}\t{4}:{5}-{6}\t{7}\n".format(
                r.qID, r.qLen, num_probe, base_hit, r.sID, r.sStart, r.sEnd,
                ",".join(genes_seen)))
    finally:
        if writes_to_stdout:
            # Never close the process-wide stdout: later prints by the caller
            # would fail. Flushing keeps the report visible immediately.
            out.flush()
        else:
            out.close()
def calc_ontarget_rate(tree,
                       gene_info,
                       input_fasta,
                       is_gtf,
                       sam_or_gtf,
                       output_filename=None):
    """Write a tab-delimited probe-hit report for a SAM or collapsed GFF/GTF file.

    One row is written per input record with the columns ``read_id``,
    ``read_len``, ``num_probe``, ``num_base_overlap``, ``loci`` and
    ``genes``. In SAM mode, records whose reference name (``sID``) is ``'*'``
    are skipped. In GTF mode ``read_len`` is always the string ``'NA'``.

    Args:
        tree: Passed through unchanged to ``get_probe_hit``.
        gene_info: Passed through unchanged to ``get_probe_hit``.
        input_fasta: Path to the read sequences. Parsed as FASTA when the
            name ends in ``.fa`` or ``.fasta`` (case-insensitive), otherwise
            as FASTQ.
        is_gtf: When true, ``sam_or_gtf`` is read with ``collapseGFFReader``;
            otherwise it is read with ``BioReaders.GMAPSAMReader``.
        sam_or_gtf: Path to the alignment (SAM) or annotation (GFF/GTF) file.
        output_filename: Path of the report to write. When ``None`` the
            report goes to ``sys.stdout``.
    """
    # ``seq_format`` rather than ``type`` so the builtin is not shadowed;
    # str.endswith accepts a tuple of suffixes.
    is_fasta = input_fasta.upper().endswith(('.FA', '.FASTA'))
    seq_format = 'fasta' if is_fasta else 'fastq'

    # NOTE: the lengths are only consumed in SAM mode, but the file is parsed
    # in both modes so that a bad ``input_fasta`` keeps failing up front.
    with open(input_fasta) as seq_handle:
        query_len_dict = {
            rec.id: len(rec.seq)
            for rec in SeqIO.parse(seq_handle, seq_format)
        }

    fields = [
        'read_id', 'read_len', 'num_probe', 'num_base_overlap', 'loci', 'genes'
    ]

    writes_to_stdout = output_filename is None
    out = sys.stdout if writes_to_stdout else open(output_filename, 'w')
    try:
        writer = DictWriter(out, fields, delimiter='\t')
        writer.writeheader()

        if is_gtf:
            reader = collapseGFFReader(sam_or_gtf)
        else:
            # NOTE(review): the meaning of the positional ``True`` is defined
            # by GMAPSAMReader, which is not visible here -- confirm.
            reader = BioReaders.GMAPSAMReader(sam_or_gtf,
                                              True,
                                              query_len_dict=query_len_dict)

        for r in reader:
            # '*' is the reference name of a SAM record with no alignment
            # target (presumably an unmapped read); nothing to report.
            if not is_gtf and r.sID == '*':
                continue

            num_probe, base_hit, genes_seen = get_probe_hit(
                tree, gene_info, r, is_gtf)

            # The two record types expose the same information under
            # different attribute names.
            if is_gtf:
                read_id, read_len = r.seqid, 'NA'
                loci = "{0}:{1}-{2}".format(r.chr, r.start, r.end)
            else:
                read_id, read_len = r.qID, r.qLen
                loci = "{0}:{1}-{2}".format(r.sID, r.sStart, r.sEnd)

            writer.writerow({
                'read_id': read_id,
                'read_len': read_len,
                'num_probe': num_probe,
                'num_base_overlap': base_hit,
                'loci': loci,
                'genes': ",".join(genes_seen),
            })
    finally:
        if writes_to_stdout:
            # Never close the process-wide stdout: later prints by the caller
            # would fail. Flushing keeps the report visible immediately.
            out.flush()
        else:
            out.close()
# Example #3
0
def summarize_GMAP_sam(input_fa_or_fq, input_sam):
    """Write a per-alignment summary of a GMAP SAM file.

    The summary is written to ``<input_sam>.summary.txt`` as a tab-delimited
    table with the columns ``id``, ``qLength``, ``qCoverage``, ``identity``
    and ``unique``. One row is written per SAM record whose reference name
    (``sID``) is not ``'*'``. ``unique`` is ``Y`` when the read id occurs in
    exactly one record of the SAM file and ``N`` otherwise; the occurrence
    count includes records that are skipped from the table.

    Args:
        input_fa_or_fq: Path to the read sequences; the format is chosen by
            ``type_fa_or_fq``.
        input_sam: Path to the SAM file. It is read twice: once to count
            records per read and once to write the table.
    """
    with open(input_fa_or_fq) as seq_handle:
        query_len = {
            rec.id: len(rec.seq)
            for rec in SeqIO.parse(seq_handle, type_fa_or_fq(input_fa_or_fq))
        }

    # First pass: how many records does each read have?
    map_count = defaultdict(int)
    for r in BioReaders.GMAPSAMReader(input_sam, True):
        map_count[r.qID] += 1
    # A set keeps the per-record membership test below O(1); a list here
    # would make the second pass quadratic in the number of multi-mapped reads.
    multi = {qid for qid, count in map_count.items() if count > 1}

    summary_path = input_sam + '.summary.txt'
    with open(summary_path, 'w') as f:
        f.write("id\tqLength\tqCoverage\tidentity\tunique\n")
        # Second pass: the length table lets the reader fill in qCoverage.
        for r in BioReaders.GMAPSAMReader(input_sam,
                                          True,
                                          query_len_dict=query_len):
            # '*' is the reference name of a record with no alignment target
            # (presumably an unmapped read); it has nothing to summarize.
            if r.sID == '*':
                continue
            uni = 'N' if r.qID in multi else 'Y'
            f.write("{0}\t{1}\t{2:.4f}\t{3:.4f}\t{4}\n".format(
                r.qID, query_len[r.qID], r.qCoverage, r.identity, uni))

    print("Output written to: {0}".format(summary_path), file=sys.stderr)
# Example #4
0
def err_correct(genome_file, sam_file, output_err_corrected_fasta, genome_dict=None):
    """Write genome-derived sequences for the aligned reads of a SAM file.

    For every SAM record whose reference name (``sID``) is not ``'*'``, the
    sequence returned by ``sp.consistute_genome_seq_from_exons`` for the
    record's segments and strand is written as a FASTA entry named after the
    read id.

    Args:
        genome_file: Path to the genome FASTA. Only read when ``genome_dict``
            is ``None``.
        sam_file: Path to the SAM file to read.
        output_err_corrected_fasta: Path of the FASTA file to write.
        genome_dict: Optional pre-loaded mapping of sequence name -> record.
            When ``None`` it is built from ``genome_file``.
    """
    if genome_dict is None:
        with open(genome_file) as genome_handle:
            genome_dict = {
                rec.name: rec
                for rec in SeqIO.parse(genome_handle, 'fasta')
            }
        # sys.stderr.write emits the same text as the old Python 2
        # ``print >> sys.stderr`` statement and also works on Python 3,
        # where that statement raises TypeError.
        sys.stderr.write("done reading {0}\n".format(genome_file))

    with open(output_err_corrected_fasta, 'w') as f:
        reader = BioReaders.GMAPSAMReader(sam_file, True)
        for r in reader:
            # '*' is the reference name of a record with no alignment target
            # (presumably an unmapped read); there is no genome sequence for it.
            if r.sID == '*':
                continue
            seq = sp.consistute_genome_seq_from_exons(
                genome_dict, r.sID, r.segments, r.flag.strand)
            f.write(">{0}\n{1}\n".format(r.qID, seq))

    sys.stderr.write(
        "output written to {0}\n".format(output_err_corrected_fasta))
def _write_match_group(sam_out, report_out, matches, read_group, gene_name):
    """Write one gene's matches to the SAM and report files, numbered in order.

    ``matches`` is sorted in place by ``(start, end)``, descending when the
    first match is on the ``-`` strand, and each match receives a 1-based
    serial number in that order.
    """
    matches.sort(key=lambda m: (m.start, m.end),
                 reverse=(matches[0].strand == '-'))
    for serial, m in enumerate(matches, 1):
        sam_out.write("{0}\tSN:Z:{1:06d}\tRG:Z:{2}\tgn:Z:{3}\n".format(
            m.record.record_line, serial, read_group, gene_name))
        # The report lists coordinates in strand orientation: on '-' the
        # end comes first. ``start + 1`` presumably converts a 0-based start
        # to 1-based -- confirm against match_w_annotation.
        if m.strand == '+':
            first, second = m.start + 1, m.end
        else:
            first, second = m.end, m.start + 1
        report_out.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(
            m.record.qID, read_group, gene_name, serial, m.strand, first,
            second))


def categorize_aln_by_annotation(gene_annotation_file,
                                 input_fasta,
                                 input_sam,
                                 output_prefix,
                                 min_overlap_bp=200,
                                 min_query_overlap=.5,
                                 min_gene_overlap=.8):
    """Tag each SAM alignment with the annotated gene it matches.

    Every record of ``input_sam`` is classified by ``match_w_annotation``
    against per-chromosome, per-strand interval trees built from the
    annotation table. Two files are written:

    * ``<output_prefix>.sam``: the SAM header followed by each record line
      with ``SN`` (serial number), ``RG`` (read group) and ``gn`` (gene name)
      tags appended.
    * ``<output_prefix>.report.txt``: a tab-delimited table with the columns
      ``id``, ``read_group``, ``gene_name``, ``serial_number``, ``strand``,
      ``start`` and ``end``.

    The read group is derived from the prefix of the match name:
    ``novel-unannotated``, ``novel-antisense``, ``novel-partial``, ``poly``
    (prefix ``poly-``) or ``single`` for everything else. Matches named
    ``novel-unannotated*`` are clustered by locus per chromosome and strand,
    and each clustered region is written as its own gene ``novel-<N>``.

    Args:
        gene_annotation_file: Tab-delimited table with the columns
            ``#Replicon Name``, ``Replicon Accession``, ``Locus tag``,
            ``Start``, ``Stop`` and ``Strand``. Rows whose ``#Replicon Name``
            is not ``chr`` are reported on stderr and ignored.
        input_fasta: Path to the FASTA file of the reads; used to build the
            read id -> length mapping given to the SAM reader.
        input_sam: Path to the SAM file to classify.
        output_prefix: Prefix for the two output files.
        min_overlap_bp: Passed through to ``match_w_annotation``.
        min_query_overlap: Passed through to ``match_w_annotation``.
        min_gene_overlap: Passed through to ``match_w_annotation``.
    """
    # chr -> strand -> IntervalTree of annotated genes
    t = defaultdict(lambda: {
        '+': IntervalTree(),
        '-': IntervalTree()
    })
    info = {}

    with open(gene_annotation_file) as annotation_handle:
        for row in DictReader(annotation_handle, delimiter='\t'):
            if row['#Replicon Name'] != 'chr':
                print("Ignore", row, file=sys.stderr)
                continue
            start, stop = int(row['Start']), int(row['Stop'])
            locus_tag = row['Locus tag']
            info[locus_tag] = (start, stop, locus_tag)
            t[row['Replicon Accession']][row['Strand']].add(
                start, stop, locus_tag)

    with open(input_fasta) as fasta_handle:
        d = {
            rec.id: len(rec.seq)
            for rec in SeqIO.parse(fasta_handle, 'fasta')
        }

    # gene name -> list of AMatch(name, strand, start, end, record)
    result = defaultdict(list)
    reader = BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d)
    for r in reader:
        ans = match_w_annotation(t, r, info, min_overlap_bp, min_query_overlap,
                                 min_gene_overlap)
        result[ans.name].append(ans)

    # chr -> strand -> ClusterTree of novel-unannotated matches; each inserted
    # interval carries its index into ``novel_list``.
    novel_ct = defaultdict(lambda: {
        '+': ClusterTree(0, 0),
        '-': ClusterTree(0, 0)
    })
    novel_list = []

    sam_path = output_prefix + '.sam'
    report_path = output_prefix + '.report.txt'
    with open(sam_path, 'w') as f, open(report_path, 'w') as f1:
        f.write(reader.header)
        f1.write(
            "id\tread_group\tgene_name\tserial_number\tstrand\tstart\tend\n")

        for k, v in result.items():
            if k.startswith('novel-unannotated'):
                # Written later, once they have been grouped by locus.
                for x in v:
                    novel_ct[x.record.sID][x.strand].insert(
                        x.start, x.end, len(novel_list))
                    novel_list.append(x)
                continue
            elif k.startswith('novel-antisense'):
                tagRG = 'novel-antisense'
            elif k.startswith('novel-partial'):
                tagRG = 'novel-partial'
            elif k.startswith('poly-'):
                tagRG = 'poly'
            else:
                tagRG = 'single'
            _write_match_group(f, f1, v, tagRG, k)

        # Now write the novel matches, one gene name per clustered region.
        novel_region_index = 1
        for strand_trees in novel_ct.values():
            for ct in strand_trees.values():
                for _start, _end, _indices in ct.getregions():
                    # The name is built per region (not per tree) so that
                    # regions on the same chromosome/strand do not share a
                    # gene name with colliding serial numbers.
                    gn = 'novel-' + str(novel_region_index)
                    region = [novel_list[ind] for ind in _indices]
                    _write_match_group(f, f1, region, "novel-unannotated", gn)
                    novel_region_index += 1

    print("Output written to:", sam_path, file=sys.stderr)
    print("Output written to:", report_path, file=sys.stderr)