def _read_lifted_bam_alpha(bed_fn, bam_fn, args):
    """Load intersected BED lines into a SQLite reads table.

    Parses every line of *bed_fn* (an intersect output stream), keeps one
    row per unique (first three fields) hit, and inserts them into the
    reads table. Returns the open connection after committing.
    """
    database = guess_database(args)
    conn = sql.create_connection()
    # Rows are keyed by read name only when --keep-name was requested.
    sql.create_reads_table(conn, "name" if args.keep_name else "sequence")
    # TODO create counts table sequence and autoincrement or from read
    cursor = conn.cursor()
    inserted = 0
    unique_hits = set()
    for bed_line in bed_fn:
        fields = _parse_intersect(bed_line, database, bed=True)
        # TODO add sequence to count table args.quant on/off name=UID or name=UID+chrom+pos
        if not fields:
            continue
        hit = ".".join(fields[:3])
        if hit in unique_hits:
            continue
        inserted += 1
        sql.insert_row_in_reads_table(cursor, fields)
        unique_hits.add(hit)
    # The dedupe set can be large; drop it before the commit.
    del unique_hits
    logger.info("Read %s lines that intersected with miRNAs." % inserted)
    conn.commit()
    # TODO this'll return conn_reads and conn_counts
    return conn
def convert(args):
    """Parse every input file against the reference annotation.

    Reads the precursor sequences and mature coordinates once, then runs
    ``read_file`` on each file in ``args.files``.
    """
    # Guessing the database validates the GTF header; the return value is
    # not needed here (the original stored it in an unused local).
    mapper.guess_database(args.gtf)
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    for fn in args.files:
        read_file(fn, precursors, matures)
def reader(args):
    """
    Realign BAM hits to miRBAse to get better accuracy and annotation.

    Dispatches each input file to the parser matching ``args.format``,
    annotates the hits against the precursor/mature references, writes a
    per-sample GFF, and finally writes a merged GFF for all samples.
    Delegates to the low-memory implementation when ``args.low_memory``.
    """
    if args.low_memory:
        read.reader(args)
        return None
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        elif args.format == "manatee":
            out_dts[fn] = manatee.read_file(fn, database, args)
        elif args.format == "optimir":
            out_dts[fn] = optimir.read_file(fn, args)
        elif args.format == "gff":
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        # These formats already return GFF-shaped dicts and skip annotate().
        if args.format not in ["isomirsea", "srnabench", "manatee", "optimir"]:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        # FIX: make_tools expects a list of tool names; the per-sample call
        # previously passed the bare string (the merged call below already
        # wrapped it in a list).
        h = header.create([sample], database, header.make_tools([args.format]))
        _write(out_dts[fn], h, fn_out, args)
    # merge all reads for all samples into one dict
    # (the original re-checked args.low_memory here, but that branch was
    # unreachable: the function already returned at the top in that mode)
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged,
           header.create(samples, database, header.make_tools([args.format])),
           fn_merged_out, args)
def test_database(self):
    """The example GFF3 header should be detected as miRBasev21."""
    from mirtop.mirna import mapper
    ns = argparse.Namespace()
    ns.gtf = "data/examples/annotate/hsa.gff3"
    ns.database = None
    detected = mapper.guess_database(ns)
    print("Database is %s" % detected)
    if detected != "miRBasev21":
        raise ValueError("%s not eq to miRBasev21" % detected)
def _read_lifted_bam(handle, reads, args, clean):
    """Accumulate lifted-over hits from an intersected BED stream.

    Feeds every parsed line into ``_analyze_lifted_line`` and, when
    *clean* is true, drops lower-score hits via ``filter.clean_hits``.
    Returns the (possibly cleaned) hits dict.
    """
    # NOTE(review): this counter is never incremented in this function,
    # so the indels log line below always reports 0.
    indels_skip = 0
    database = guess_database(args)
    precursors = args.precursors
    for bed_line in handle:
        parsed = _parse_intersect(bed_line, database, bed=True)
        reads = _analyze_lifted_line(parsed, reads, precursors, database)
    logger.info("Hits: %s" % len(reads))
    logger.info("Hits with indels %s" % indels_skip)
    if clean:
        reads = filter.clean_hits(reads)
        logger.info("Hits after clean: %s" % len(reads))
    return reads
def low_memory_genomic_bam(bam_fn, sample, out_handle, args):
    """Stream a genomic BAM through a SQLite buffer and write GFF lines.

    Converts/sorts the BAM, intersects it with the annotation, loads the
    intersected rows into SQLite, then annotates the rows grouped by
    their first column (presumably the read key — confirm against
    ``sql.select_all_reads`` ordering) and writes each group's GFF lines
    straight to *out_handle* instead of holding everything in memory.
    """
    logger.info("Reading BAM file in low memory mode.")
    logger.warning("This is under development and variants can be unexact.")
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    database = guess_database(args)
    bed_fn = os.path.join(args.out, os.path.basename(bam_fn) + ".bed")
    logger.info("Making bed file.")
    _bed(bam_fn, bed_fn)
    logger.info("Intersecting bed file.")
    intersect_fn = intersect(bed_fn, args.gtf)
    logger.info("Loading database.")
    # TODO this'll return conn_reads and conn_counts
    conn = _read_lifted_bam_alpha(intersect_fn, bam_fn, args)
    rows = sql.select_all_reads(conn)
    lines = []
    current = None
    logger.info("Analyzing database.")
    # Group consecutive rows sharing row[0]; flush each group as soon as
    # the key changes. Assumes rows arrive sorted/grouped by row[0] —
    # rows with the same key that are not adjacent would be split into
    # separate groups.
    for row in rows:
        if not current or current == row[0]:
            lines.append(row)
            current = row[0]
        else:
            # TODO counts of sequence = conn_counts.query UID
            # it could be counts only same location UID+chrom+start, or counts all UID
            reads = _read_lifted_lines(lines, precursors, database)
            ann = annotate(reads, args.matures, args.precursors, quiet=True)
            gff_lines = body.create(ann, args.database, sample, args,
                                    quiet=True)
            body.write_body_on_handle(gff_lines, out_handle)
            # Start the next group with the row that triggered the flush.
            current = row[0]
            lines = []
            lines.append(row)
    # Flush the final group (also handles an empty result set: the helpers
    # receive an empty list).
    reads = _read_lifted_lines(lines, precursors, database)
    ann = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_lines = body.create(ann, args.database, sample, args, quiet=True)
    body.write_body_on_handle(gff_lines, out_handle)
    conn.close()
    logger.info("Done")
def read_bam(bam_fn, args, clean=True):
    """Read a BAM file and perform realignment of hits.

    Args:
        *bam_fn*: a BAM file with alignments to the precursor.
        *args*: namespace; ``args.genomic`` selects the lifted-over
            (genome-coordinate) code path.
        *clean*: Use mirtop.filter.clean_hits() to remove lower score hits.

    Returns:
        *reads (dict)*: keys are read_id and values are
        *mirtop.realign.hits*
    """
    sorted_bam = _bam_sort(_sam_to_bam(bam_fn))
    reads = defaultdict(hits)
    if not args.genomic:
        # Precursor-coordinate alignments: parse the BAM directly.
        reads = _read_original_bam(sorted_bam, reads, args, clean)
        logger.info("Done.")
        return reads
    logger.warning(
        "This is under development and variants can be unexact.")
    bed_fn = os.path.join(args.out, os.path.basename(sorted_bam) + ".bed")
    logger.info("Making bed file.")
    _bed(sorted_bam, bed_fn)
    logger.info("Intersecting bed file.")
    intersect_fn = intersect(bed_fn, args.gtf)
    # logger.info("Analyzing hits.")
    # reads = _read_lifted_bam(intersect_fn, reads, args, clean)
    logger.info("Loading database.")
    conn = _read_lifted_bam_alpha(intersect_fn, sorted_bam, args)
    rows = sql.select_all_reads(conn)
    logger.info("Analyzing database.")
    reads = _read_lifted_lines(rows, args.precursors, guess_database(args))
    conn.close()
    logger.info("Done.")
    return reads
def reader(args):
    """
    Realign BAM hits to miRBAse to get better accuracy and annotation.

    Older reader variant: dispatches each input file to the parser for
    ``args.format``, annotates, writes per-sample GFF output and a final
    merged GFF for all samples.
    """
    samples = []
    out_dts = dict()
    database = mapper.guess_database(args.gtf)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    for file_name in args.files:
        if args.format != "gff":
            sample = op.splitext(op.basename(file_name))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(file_name, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(file_name, args)
        elif args.format == "srnabench":
            out_dts[file_name] = srnabench.read_file(file_name, args)
        elif args.format == "prost":
            reads = prost.read_file(file_name, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[file_name] = isomirsea.read_file(file_name, args)
        elif args.format == "gff":
            samples.extend(header.read_samples(file_name))
            out_dts[file_name] = body.read(file_name, args)
            continue
        # These formats already produced GFF-shaped dicts above.
        if args.format not in ["isomirsea", "srnabench"]:
            annotated = annotate(reads, matures, precursors)
            out_dts[file_name] = body.create(annotated, database, sample, args)
        sample_header = header.create([sample], database, "")
        _write(out_dts[file_name], sample_header, fn_out)
    # merge all reads for all samples into one dict
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged, header.create(samples, database, ""), fn_merged_out)
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation.

    Low-memory variant: each sample's GFF lines are streamed to its
    output file as they are produced instead of being held in memory.

    Raises:
        ValueError: if ``args.format`` is not supported in low-memory mode.
    """
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
            h = header.create([sample], args.database, "")
            # FIX: use a context manager so the handle is closed even when
            # an unsupported format raises ValueError below (the original
            # leaked the open handle in that case).
            with open(fn_out, 'w') as out_handle:
                print(h, file=out_handle)
                if args.format == "BAM":
                    if args.genomic:
                        low_memory_genomic_bam(fn, sample, out_handle, args)
                    else:
                        low_memory_bam(fn, sample, out_handle, args)
                elif args.format == "seqbuster":
                    seqbuster.read_file_low_memory(fn, sample, args,
                                                   out_handle)
                else:
                    raise ValueError("%s not supported for low memory"
                                     % args.format)
def annotate(fn, read_file, load=False, create=True):
    """Build GFF body lines for *fn* using the bundled example annotation.

    *read_file* is either a parser callable (default) or, when *load* is
    true, the already-parsed reads themselves. When *create* is false the
    function returns None after parsing.
    """
    import argparse
    opts = argparse.Namespace()
    opts.hairpin = "data/examples/annotate/hairpin.fa"
    opts.sps = "hsa"
    opts.gtf = "data/examples/annotate/hsa.gff3"
    opts.add_extra = True
    opts.out_format = "gtf"
    from mirtop.mirna import fasta, mapper
    opts.precursors = fasta.read_precursor(opts.hairpin, opts.sps)
    opts.matures = mapper.read_gtf_to_precursor(opts.gtf)
    opts.database = mapper.guess_database(opts.gtf)
    from mirtop.mirna import annotate
    from mirtop.gff import body
    reads = read_file if load else read_file(fn, opts)
    if create:
        ann = annotate.annotate(reads, opts.matures, opts.precursors)
        return body.create(ann, "miRBase21", "Example", opts)
def reader(args):
    """
    Realign BAM hits to miRBAse to get better accuracy and annotation.

    Oldest reader variant: parses each file per ``args.format``,
    annotates against the precursor/mature references and collects the
    GFF bodies in a dict keyed by file name.
    """
    database = mapper.guess_database(args.gtf)
    # hairpin, mirna = download_mirbase(args)
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    # check numbers of miRNA and precursors read
    # print message if numbers mismatch
    out_dts = dict()
    for fn in args.files:
        sample = op.splitext(op.basename(fn))[0]
        fn_out = op.join(args.out, sample + ".gff")
        if args.format == "BAM":
            reads = _read_bam(fn, precursors)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, precursors)
            custom = seqbuster.header()
        elif args.format == "srnabench":
            # FIX: was `srnabench.read_gile` — a typo; every other parser
            # module in this file exposes `read_file`.
            reads = srnabench.read_file(fn, precursors)
        h = header.create([sample], database, "")
        ann = annotate(reads, matures, precursors)
        out_dts[fn] = body.create(ann, database, sample, fn_out, h)
def test_database(self):
    """Guess the database from the example GFF3 and verify it."""
    from mirtop.mirna import mapper
    db = mapper.guess_database("data/examples/annotate/hsa.gff3")
    # FIX: `print "..."` is Python 2 statement syntax (a SyntaxError on
    # Python 3); use the function form like the sibling tests do.
    print("Database is %s" % db)
    if db != "miRBasev21":
        raise ValueError("%s not eq to miRBasev21" % db)
def _analyze_line(line, precursors, database, sample, sep, args):
    """Turn one intersected BED line into an annotated GFF record.

    Returns a dict with chrom/start/name/mirna and the formatted output
    columns, or None when the line is skipped (non-primary-transcript
    hit, sequence containing N, missing start, or no GFF line produced).
    """
    # Column offsets into the intersect output. NOTE(review): assumed to
    # match the bed+gtf intersect layout — confirm against intersect().
    start_idx = 10
    end_idx = 11
    attr_idx = 15
    query_name = line[3]
    sequence = line[4]
    # only working with mirbase: skip hits outside the primary transcript.
    if str(line).find(get_primary_transcript(guess_database(args))) < 0:
        return None
    logger.debug(("READ::line name:{0}").format(line))
    # Reads with ambiguous bases cannot be tuned against the precursor.
    if sequence and sequence.find("N") > -1:
        return None
    chrom = line[attr_idx].strip().split("Name=")[-1]
    start = line[1]
    end = line[2]
    strand = line[5]
    counts = float(line[6])
    # NOTE(review): `Filter` is assigned but never used in this function.
    Filter = "Pass"
    reads = dict()
    if not start:
        return None
    # Convert genome coordinates to precursor-relative coordinates;
    # minus-strand reads are measured from the precursor's 3' end.
    if strand == "+":
        start = int(start) - int(line[start_idx]) + 1
    else:
        start = int(line[end_idx]) - int(end)
    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" %
                 (iso.start, iso.end, chrom))
    # Overhang past the precursor end is only logged, not rejected.
    if len(precursors[chrom]) < start + len(sequence):
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (
                         chrom,
                         len(sequence),
                         len(precursors[chrom])))
    iso.subs, iso.add, iso.cigar = filter.tune(
        sequence, precursors[chrom], start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))
    idu = make_id(sequence)
    reads[query_name] = hits()
    reads[query_name].set_sequence(sequence)
    reads[query_name].counts = counts
    reads[query_name].sequence = sequence
    reads[query_name].set_precursor(chrom, iso)
    reads = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None
    # Index [0][4] picks the formatted GFF text out of the created record.
    line = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % line)
    if args.add_extra:
        extra = variant_with_nt(line, args.precursors, args.matures)
        line = "%s Changes %s;" % (line, extra)
    line = paste_columns(feature(line), sep=sep)
    return {'chrom': chrom,
            'start': start,
            'name': query_name,
            'mirna': reads[query_name].precursors[chrom].mirna,
            'line': [idu, chrom, counts, sample, line]}
def test_database(self):
    """The example GFF3 should be detected as miRBasev21."""
    from mirtop.mirna import mapper
    detected = mapper.guess_database("data/examples/annotate/hsa.gff3")
    print("Database is %s" % detected)
    if detected != "miRBasev21":
        raise ValueError("%s not eq to miRBasev21" % detected)