def read_file(fn, args):
    """ Read OptimiR file and convert to mirtop GFF format.

        Args:
            *fn(str)*: file name with isomiR-SEA output information.
            *database(str)*: database name.
            *args(namedtuple)*: arguments from command line.
                See *mirtop.libs.parse.add_subparser_gff()*.

        Returns:
            *reads (nested dicts)*:gff_list has the format as
                defined in *mirtop.gff.body.read()*.
    """
    database = args.database
    gtf = args.gtf
    # GTF output uses space-separated attributes; GFF uses key=value.
    sep = " " if args.out_format == "gtf" else "="
    sample = read_samples(fn)
    # reads: chrom -> start -> list of [UID, chrom, counts, sample, line]
    reads = defaultdict(dict)
    logger.debug("OPTIMIR::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            gff = feature(line)
            fixed_line = line
            # Only lines that parsed into GFF columns are converted;
            # header/comment lines fall through untouched.
            if gff.columns:
                if "Variant" not in gff.attributes:
                    gff.attributes["Variant"] = "NA"
                logger.debug("OPTIMIR::Chrom update from %s to %s" % (
                    gff.columns["chrom"], gff.attributes["Parent"]))
                # OptimiR reports the mature name as chrom; mirtop expects
                # the precursor (first Parent if several are listed).
                gff.columns["chrom"] = gff.attributes["Parent"].split(",")[0]
                fixed_line = gff.paste_columns(sep=sep)
                if args.add_extra:
                    # Append nucleotide-level changes as a Changes attribute.
                    extra = variant_with_nt(fixed_line, args.precursors,
                                            args.matures)
                    fixed_line = "%s Changes %s;" % (fixed_line, extra)
                    fixed_line = paste_columns(feature(fixed_line), sep=sep)
                counts = gff.attributes["Expression"].split(",")
                chrom = gff.columns["chrom"]
                start = gff.columns["start"]
                if start not in reads[chrom]:
                    reads[chrom][start] = []
                reads[chrom][start].append([
                    gff.attributes["UID"], gff.columns["chrom"], counts,
                    sample, fixed_line
                ])
    return reads
def read(fn, args):
    """Read GTF/GFF file and load into annotate, chrom counts, sample, line.

    Args:
        *fn(str)*: GFF file to read.
        *args(namedtuple)*: command line arguments
            (uses ``out_format`` and ``keep_name``).

    Returns:
        *lines (nested dict)*: chrom -> start -> list of
            [uid, chrom, counts, samples, line].
    """
    samples = read_samples(fn)
    lines = defaultdict(dict)
    sep = " " if args.out_format == "gtf" else "="
    corrupted_uid = 0
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            # Re-paste the line so attributes use the expected separator.
            line = paste_columns(feature(line), sep=sep)
            gff = feature(line)
            cols = gff.columns
            attr = gff.attributes
            # BUG FIX: original read attr['UID'] before checking the key
            # exists, raising KeyError on lines without a UID attribute.
            if 'UID' in attr and attr['UID'] and not read_id(attr['UID']):
                corrupted_uid += 1
                continue
            if 'UID' not in attr:
                msg = "UID not found."
                # BUG FIX: original tested "'Read' not in attr" and then
                # accessed attr['Read'] inside that branch (guaranteed
                # KeyError); same inversion for 'sequence', whose fallback
                # also encoded the absent attr['Read'] instead of
                # attr['sequence']. Membership tests are now positive.
                if 'Read' in attr:
                    if not is_sequence(attr['Read']):
                        msg = msg + " Sequence not valid in Read attribute."
                    else:
                        attr['UID'] = make_id(attr['Read'])
                elif 'sequence' in attr:
                    if not is_sequence(attr['sequence']):
                        msg = msg + " Sequence not valid in sequence attribute."
                    else:
                        attr['UID'] = make_id(attr['sequence'])
                else:
                    msg = msg + " Sequence not found in sequence attribute."
                if 'UID' not in attr:
                    logger.warning("Line is not a valid GFF3 line: %s" %
                                   line.strip())
                    logger.warning(msg)
                    # BUG FIX: without a UID the uid string below would
                    # raise KeyError; skip the line instead.
                    continue
            if cols['start'] not in lines[cols['chrom']]:
                lines[cols['chrom']][cols['start']] = []
            uid = "%s-%s-%s" % (attr['UID'], attr['Variant'], attr['Name'])
            if args.keep_name:
                uid = "%s-%s" % (uid, attr['Read'])
            lines[cols['chrom']][cols['start']].append(
                [uid, cols['chrom'], attr['Expression'].strip().split(","),
                 samples, line.strip()])
    logger.info("Lines skipped due to corrupted UID: %s" % corrupted_uid)
    return lines
def _convert_file(gff, args):
    # Convert a mirtop GFF file into a raw-data TSV with one row per read:
    # seq, mir, variant columns (mism/add/t5/t3) and per-sample counts.
    sep = "\t"
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    variant_header = sep.join(['mism', 'add', 't5', 't3'])
    gff_file = open(gff, 'r')
    out_file = os.path.join(args.out, "%s_rawData.tsv" %
                            os.path.splitext(os.path.basename(gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    with open(out_file, 'w') as outh:
        # First pass: find the COLDATA header to emit the column names,
        # then keep consuming the SAME file iterator for the data lines.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(samples_line.strip().split(
                    "COLDATA:")[1].strip().split(","))
                header = sep.join(['seq', 'mir', variant_header, samples])
                print(header, file=outh)
                break
        for mirna_line in gff_file:
            gff = feature(mirna_line)
            attr = gff.attributes
            UID = attr["UID"]
            Read = attr["Read"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            # Decode the UID back into the sequence; an undecodable UID
            # marks the line as invalid.
            try:
                Read = read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue
            expression = sep.join(attr["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if parent not in precursors:
                missing_parent += 1
                continue
            if mirna not in matures[parent]:
                missing_mirna += 1
                continue
            extra = variant_with_nt(mirna_line, precursors, matures)
            if extra == "Invalid":
                continue
            logger.debug("COUNTS::EXTRA:%s" % extra)
            # NOTE(review): this REPLACES cols_variants with the expanded
            # nucleotide changes (counts.py appends instead) — confirm the
            # header still matches the emitted columns.
            cols_variants = sep.join(_expand(extra, True))
            summary = sep.join([Read, mirna, cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)
    gff_file.close()
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
def test_alignment(self):
    """testing alignments function"""
    from mirtop.bam import bam
    from mirtop.gff.classgff import feature
    # Expected Variant attribute per SAM fixture and precursor position.
    fns = {
        "let7-last1D.sam": {56: "iso_add3p:1,iso_snv"},
        "let7-1D.sam": {5: "iso_snv,iso_3p:-5"},
        "let7-last7M1I.sam": {5: "iso_add3p:1,iso_snv_seed"},
        "let7-middle1D.sam": {5: "iso_snv_central_supp,iso_3p:-2"},
        "let7-perfect.sam": {5: "NA"},
        "let7-triming.sam": {
            5: "iso_3p:+2",
            4: "iso_5p:-1",
            6: "iso_5p:+1,iso_3p:-3"
        }
    }
    for fn in fns:
        gff = annotate("data/aligments/%s" % fn, bam.read_bam)
        for pos in gff['hsa-let-7a-1']:
            parsed = feature(gff['hsa-let-7a-1'][pos][0][4])
            observed = set(parsed.attributes['Variant'].split(","))
            expected = set(fns[fn][pos].split(","))
            if observed != expected:
                raise ValueError("Error in %s" % fn)
def test_class(self):
    """Test class to read GFF line"""
    from mirtop.gff.classgff import feature
    # One complete isomiR line with all mandatory attributes.
    raw = ("hsa-let-7a-5p\tmiRBasev21\tisomiR\t4\t25\t0\t+\t.\t"
           "Read hsa-let-7a-1_hsa-let-7a-5p_5:26_-1:-1_mut:"
           "null_add:null_x861; UID bhJJ5WJL2;"
           " Name hsa-let-7a-5p; Parent hsa-let-7a-1;"
           " Variant iso_5p:+1,iso_3p:-1; Cigar 22M;"
           " Expression 861; Filter Pass; Hits 1;")
    parsed = feature(raw)
    print(parsed.columns)
    print(parsed.attributes)
def _process(fn, out_dir):
    """Export the reads of a GFF file as FASTA (UID header, decoded sequence).

    Args:
        *fn(str)*: GFF file to read.
        *out_dir(str or None)*: output directory; when falsy, write to stdout.
    """
    outh = sys.stdout
    if out_dir:
        out_fasta = os.path.join(
            out_dir,
            "%s.fasta" % os.path.splitext(os.path.basename(fn))[0])
        outh = open(out_fasta, 'w')
    # BUG FIX: the output handle was never closed; ensure it is released
    # even if parsing raises (stdout is left untouched).
    try:
        with open(fn) as inh:
            for line in inh:
                if line.startswith("#"):
                    continue
                gff = feature(line)
                attr = gff.attributes
                # UID encodes the read sequence; decode it for the FASTA body.
                read = read_id(attr["UID"])
                print((">{0}\n{1}").format(attr["UID"], read), file=outh)
    finally:
        if outh is not sys.stdout:
            outh.close()
def _read_file(fn, precursors, matures, out_dir):
    # Export a mirtop GFF file into one seqbuster-style ``<sample>.mirna``
    # TSV per sample found in the GFF COLDATA.
    samples = read_samples(fn)
    # Write the header once per sample file; data rows are appended below.
    for sample in samples:
        with open(os.path.join(out_dir, "%s.mirna" % sample), 'w') as outh:
            print("\t".join([
                "seq", "name", "freq", "mir", "start", "end", "mism", "add",
                "t5", "t3", "s5", "s3", "DB", "precursor", "ambiguity"
            ]), file=outh)
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            gff = feature(line)
            cols = gff.columns
            attr = gff.attributes
            # Decode the UID back into the read sequence.
            read = read_id(attr["UID"])
            # Translate the Variant attribute into seqbuster trimming codes.
            t5 = variant_to_5p(precursors[attr["Parent"]],
                               matures[attr["Parent"]][attr["Name"]],
                               attr["Variant"])
            t3 = variant_to_3p(precursors[attr["Parent"]],
                               matures[attr["Parent"]][attr["Name"]],
                               attr["Variant"])
            add = variant_to_add(read, attr["Variant"])
            mature_sequence = get_mature_sequence(
                precursors[attr["Parent"]],
                matures[attr["Parent"]][attr["Name"]])
            mm = align_from_variants(read, mature_sequence, attr["Variant"])
            # Seqbuster format only supports a single mismatch per read.
            if len(mm) > 1:
                continue
            elif len(mm) == 1:
                mm = "".join(list(map(str, mm[0])))
            else:
                mm = "0"
            hit = attr["Hits"] if "Hits" in attr else "1"
            logger.debug("exporter::isomir::decode %s" %
                         [attr["Variant"], t5, t3, add, mm])
            # Error if attr["Read"] doesn't exist
            # print(cols)
            # NOTE(review): cols['source'] and cols['type'] fill the
            # "start"/"end" header columns — looks off; confirm against
            # the intended column layout.
            line = [
                read, attr["Read"], "0", attr["Name"], cols['source'],
                cols['type'], mm, add, t5, t3, "NA", "NA", "miRNA",
                attr["Parent"], hit
            ]
            # Emit one row per sample, substituting that sample's count
            # into the "freq" column (index 2).
            for sample, counts in zip(samples,
                                      attr["Expression"].split(",")):
                with open(os.path.join(out_dir, "%s.mirna" % sample),
                          'a') as outh:
                    line[2] = counts
                    print("\t".join(line), file=outh)
def lift_to_genome(line, mapper):
    """
    Map a GFF line from precursor coordinates to genomic coordinates.

    Args:
        *line(str)*: string GFF line.
        *mapper(dict)*: dict with mirna-precursor-genomic coordinates
            from mirna.mapper.read_gtf_to_mirna function.

    Returns:
        *(line)*: string with GFF line with updated chrom, start, end, strand.
    """
    features = feature(line)
    chrom, start, end, strand, id = \
        mapper[features.attributes["Name"]][features.attributes["Parent"]]
    logger.debug("LIFT2GENOME:: %s of %s found in %s(%s) " % (
        features.attributes["Name"], features.attributes["Parent"],
        chrom, strand))
    nstart = start
    nend = end
    variants = read_variant(features.attributes["Variant"])
    logger.debug("LIFT2GENOME:: variants %s " %
                 (features.attributes["Variant"]))
    # 5p shifts move the start on the + strand and the end on the - strand;
    # 3p shifts and 3p additions do the opposite.
    if 'iso_5p' in variants:
        if strand == "+":
            nstart = start + variants['iso_5p']
        else:
            nend = end - variants['iso_5p']
    if 'iso_3p' in variants:
        if strand == "+":
            nend = end + variants['iso_3p']
        else:
            nstart = start - variants['iso_3p']
    if 'iso_add3p' in variants:
        if strand == "+":
            nend = nend + variants['iso_add3p']
        else:
            nstart = nstart - variants['iso_add3p']
    logger.debug("LIFT2GENOME:: start %s to %s | end %s to %s " % (
        start, nstart, end, nend))
    features.columns['chrom'] = chrom
    # BUG FIX: original wrote the unshifted start/end, discarding the
    # isomiR offsets computed above; emit nstart/nend instead.
    features.columns['start'] = str(nstart)
    features.columns['end'] = str(nend)
    features.columns['strand'] = strand
    return features.paste_columns()
def create_line(read, name, database, args):
    # Build a single mirtop GFF line for *read*; returns the line for the
    # first precursor hit that passes the score filter (implicitly None
    # when no hit qualifies).
    # NOTE: many locals below feed .format(**locals()) — their names are
    # part of the attribute templates and must not be renamed.
    sep = " " if args.out_format == "gtf" else "="
    if args.add_extra:
        precursors = args.precursors
        matures = args.matures
    for (ps, iso) in read.precursors.items():
        p = list(ps)[0]
        if not iso.mirna:
            continue
        chrom = p
        seq = read.sequence
        seq_name = seq if not args.keep_name else name
        if iso.get_score(len(seq)) < 1:
            continue
        # Substitutions involving N are dropped as sequencing noise.
        if iso.subs:
            iso.subs = [] if "N" in iso.subs[0] else iso.subs
        idseq = read.idseq
        source = "ref_miRNA" if not iso.is_iso() else "isomiR"
        strand = iso.strand
        start, end = iso.start, iso.end
        score = iso.map_score
        mirName = iso.mirna
        preName = p
        Variant = iso.formatGFF()
        Cigar = iso.cigar
        counts = read.counts
        Filter = iso.filter
        # NOTE(review): annotation is computed but never used here —
        # confirm whether it was meant for dedup as in body.create.
        annotation = "%s.%s.%s" % (chrom, idseq, seq_name)
        # This get correctly formated with paste_columns below
        attrb = ("Read {seq_name};UID {idseq};Name {mirName};"
                 "Parent {preName};"
                 "Variant {Variant};Cigar {Cigar};"
                 "Expression {counts};"
                 "Filter {Filter};").format(**locals())
        line = ("{chrom}\t{database}\t{source}\t{start}\t{end}"
                "\t{score}\t{strand}\t.\t{attrb}").format(**locals())
        logger.debug("GFF::%s" % line)
        if args.add_extra:
            extra = variant_with_nt(line, precursors, matures)
            line = "%s Changes %s;" % (line, extra)
        line = feature(line).paste_columns(sep)
        return line
def read_reference(fn):
    """Read GFF into UID:Variant

    Args:
        *fn (str)*: GFF file.

    Returns:
        *srna (dict)*: dict with
            >>> {'UID': 'iso_snp:-2,...'}
    """
    srna = dict()
    with open(fn) as inh:
        for row in inh:
            # Skip header/comment lines.
            if row.startswith("#"):
                continue
            attributes = feature(row).attributes
            simplified = _simplify(attributes['Variant'])
            srna[attributes['UID']] = [simplified, attributes]
    return srna
def to10to11(gff_line):
    """Upgrade a GFF 1.0 line to the 1.1 vocabulary.

    Renames ``_snp``/``_add`` variant tags, flips the sign convention of
    ``iso_5p``, and re-encodes the UID with the 1.1 scheme.
    """
    updated = gff_line.replace("_snp", "_snv").replace("_add", "_add3p")
    features = feature(updated)
    if "iso_5p" in features.attributes["Variant"]:
        variants = features.attributes["Variant"].split(",")
        raw_5p = [v.split(":") for v in variants if v.startswith("iso_5p")]
        # 1.1 inverts the 1.0 sign convention for 5p trimming.
        shift = -1 * int(raw_5p[0][1])
        if shift > 0:
            shift = "+%s" % shift
        variants = [
            "iso_5p:%s" % shift if v.startswith("iso_5p") else v
            for v in variants
        ]
        features.attributes["Variant"] = ",".join(variants)
    features.attributes["UID"] = make_id(
        read_uid_10(features.attributes["UID"]))
    return features.paste_columns()
def variant_with_nt(line, precursors, matures):
    """
    Return nucleotides changes for each variant type
    using Variant attribute, precursor sequences and
    mature position.

    Returns an empty string when the miRNA/precursor is unknown, the
    string "Invalid" when the alignment fails, otherwise a
    "iso_5p:..,iso_3p:..,iso_add3p:..,iso_snv:.." summary.
    """
    gff = feature(line)
    attr = gff.attributes
    # UID encodes the read sequence; decode it for the alignment below.
    read = read_id(attr["UID"])
    # Several parents can be listed; only the first is considered.
    attr["Parent"] = attr["Parent"].split(",")[0]
    if attr["Parent"] not in matures:
        logger.warning("Parent miRNA not found in database %s" %
                       attr["Parent"])
        return ""
    if attr["Name"] not in matures[attr["Parent"]]:
        logger.warning("miRNA not found in database %s" % attr["Name"])
        return ""
    logger.debug("GFF::BODY::precursors %s" % precursors[attr["Parent"]])
    logger.debug("GFF:BODY::mature %s" %
                 matures[attr["Parent"]][attr["Name"]])
    # Translate trimming/addition variants into the actual nucleotides.
    t5 = variant_to_5p(precursors[attr["Parent"]],
                       matures[attr["Parent"]][attr["Name"]],
                       attr["Variant"])
    t3 = variant_to_3p(precursors[attr["Parent"]],
                       matures[attr["Parent"]][attr["Name"]],
                       attr["Variant"])
    add = variant_to_add(read, attr["Variant"])
    # nt=8 pads the mature sequence for alignment context.
    mature_sequence = get_mature_sequence(
        precursors[attr["Parent"]],
        matures[attr["Parent"]][attr["Name"]], nt=8)
    logger.debug("GFF::BODY::mature_sequence %s" % mature_sequence)
    mm = align_from_variants(read, mature_sequence, attr["Variant"])
    if mm == "Invalid":
        return mm
    # Flatten mismatch tuples into a compact position+change string.
    if len(mm) > 0:
        mm = "".join(["".join([str(v) for v in m]) for m in mm])
    else:
        mm = "0"
    return "iso_5p:%s,iso_3p:%s,iso_add3p:%s,iso_snv:%s" % (t5, t3, add, mm)
def _compare_to_reference(fn, reference):
    """Classify each GFF line against *reference* (UID -> [variant, attr]).

    Emits [UID, code, name, accuracy] rows where code is D (detected),
    E (extra, not in reference) or M (missed, reference-only).
    """
    same = 0
    seen = 0
    diff = list()
    extra = list()
    miss = list()
    results = list()
    seen_reference = set()
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            attr = feature(line).attributes
            uid = attr['UID']
            simplified = _simplify(attr['Variant'])
            if uid not in reference:
                # Present in the file but absent from the reference.
                extra.append("%s | extra" % line.strip())
                results.append(
                    [uid, "E", attr['Name'], _accuracy(simplified, "")])
                continue
            ref_variant, ref_attr = reference[uid]
            mirna = "Y" if attr['Name'] == ref_attr['Name'] else attr['Name']
            results.append(
                [uid, "D", mirna, _accuracy(simplified, ref_variant)])
            if simplified == ref_variant:
                same += 1
            else:
                diff.append("%s | reference: %s" % (line.strip(), ref_attr))
            seen += 1
            seen_reference.add(uid)
    # Whatever was never matched in the file counts as missed.
    for uid in reference:
        if uid not in seen_reference:
            results.append([uid, "M", "N", _accuracy("", reference[uid][0])])
            miss.append("| miss %s" % reference[uid][1])
    logger.info("Number of sequences found in reference: %s" % seen)
    logger.info("Number of sequences matches reference: %s" % same)
    logger.info("Number of sequences different than reference: %s" % len(diff))
    logger.info("Number of sequences extra sequences: %s" % len(extra))
    logger.info("Number of sequences missed sequences: %s" % len(miss))
    return results
def _calc_stats(fn):
    """ Read files and parse into categories """
    samples = _get_samples(fn)
    lines = []
    seen = set()
    passed = re.compile('pass', re.IGNORECASE)
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            gff = feature(line)
            attr = gff.attributes
            logger.debug("## STATS: attribute %s" % attr)
            # Only Filter=Pass lines contribute to the stats.
            if not passed.match(attr['Filter']):
                continue
            # Deduplicate on (UID, Variant, Name).
            key = "-".join([attr['UID'], attr['Variant'], attr['Name']])
            if key in seen:
                continue
            seen.add(key)
            lines.extend(_classify(gff.columns['type'], attr, samples))
    return _summary(lines)
def read_file(fn, args):
    """ Read isomiR-SEA file and convert to mirtop GFF format.

        Args:
            *fn(str)*: file name with isomiR-SEA output information.
            *database(str)*: database name.
            *args(namedtuple)*: arguments from command line.
                See *mirtop.libs.parse.add_subparser_gff()*.

        Returns:
            *reads (nested dicts)*:gff_list has the format as
                defined in *mirtop.gff.body.read()*.
    """
    database = args.database
    gtf = args.gtf
    sep = " " if args.out_format == "gtf" else "="
    # miRNA -> genomic/precursor coordinate map for lifting positions.
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    # Pre-computed number of hits per UID across the whole file.
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            # isomiR-SEA reports RNA; convert U->T for DNA databases.
            query_sequence = attr['TS'].replace("U", "T")
            start = int(cols[3])
            end = int(cols[4])
            isomirseq_iso = attr['ISO']
            # NOTE(review): query_sequence is always a str here, so the
            # "== None" arm can never fire — confirm intended filter.
            if query_name not in reads and query_sequence == None:
                continue
            # Skip reads containing undetermined bases.
            if query_sequence and query_sequence.find("N") > -1:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            # logger.debug("SEQBUSTER:: cigar {cigar}".format(**locals()))
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            # Derive the mirtop Variant string from the CIGAR and ISO tag.
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         " precursor {chrom}\n"
                         " name: {query_name}\n"
                         " idu: {idu}\n"
                         " start: {start}\n"
                         " cigar: {cigar}\n"
                         " iso: {isoformat}\n"
                         " variant: {isoformat}".format(**locals()))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            isotag = attr['ISO']
            # Lift genomic coordinates to precursor space when possible.
            tchrom, tstart = _genomic2transcript(map_mir[mirName], chrom,
                                                 start)
            start = start if not tstart else tstart
            chrom = chrom if not tstart else tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            fields = {
                'seq_name': query_sequence, 'idseq': idu, 'name': mirName,
                'parent': preName, 'variant': isoformat, 'cigar': cigar,
                'counts': counts, 'filter': Filter, 'hits': hit,
                'chrom': chrom, 'start': start, 'end': end,
                'database': database, 'source': source, 'score': score,
                'strand': strand
            }
            # TODO: convert to genomic if args.out_genomic
            line = feature(fields).line
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors, args.matures)
                line = "%s Changes %s;" % (line, extra)
            line = paste_columns(feature(line), sep=sep)
            if start not in reads[chrom]:
                reads[chrom][start] = []
            # Only Filter=Pass reads are kept in the output.
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append([idu, chrom, counts, sample,
                                            line])
    logger.info("Hits: %s" % reads_in)
    return reads
def convert_gff_counts(args):
    """ Reads a GFF file to produces output file containing Expression counts

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p', 'iso_add3p', 'iso_snp'])
    if args.add_extra:
        # Nucleotide-level columns require precursor/mature sequences.
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([
            variant_header, 'iso_5p_nt', 'iso_3p_nt', 'iso_add3p_nt',
            'iso_snp_nt'
        ])
    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)
    gff_file = open(args.gff, 'r')
    out_file = op.join(args.out,
                       "%s.tsv" % op.splitext(op.basename(args.gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    with open(out_file, 'w') as outh:
        # First loop: locate the COLDATA header for the sample names,
        # then continue on the SAME iterator for the data lines.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(samples_line.strip().split("COLDATA:")
                                   [1].strip().split(","))
                header = sep.join([
                    'UID', 'Read', 'miRNA', 'Variant', variant_header,
                    samples
                ])
                print(header, file=outh)
                break
        for mirna_line in gff_file:
            gff = feature(mirna_line)
            attr = gff.attributes
            UID = attr["UID"]
            Read = attr["Read"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            # Validate the UID encoding; skip undecodable ones.
            try:
                read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue
            expression = sep.join(attr["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s" % extra)
                # Append the nucleotide columns after the numeric ones.
                cols_variants = sep.join([cols_variants] +
                                         _expand(extra, True))
            summary = sep.join(
                [UID, Read, mirna, variant, cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)
    gff_file.close()
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
def _fix(line, expression):
    """Rewrite a GFF line with a new Expression attribute.

    The Read attribute is not useful when multiple samples share a line,
    so only Expression is replaced before re-pasting the columns.
    """
    gff = feature(line)
    gff.attributes['Expression'] = expression
    return paste_columns(gff, guess_format(line))
def create(reads, database, sample, args, quiet=False):
    """Read https://github.com/miRTop/mirtop/issues/9"""
    sep = " " if args.out_format == "gtf" else "="
    seen = set()
    # lines: chrom -> start -> [annotation, chrom, counts, sample, line]
    lines = defaultdict(defaultdict)
    seen_ann = {}
    filter_precursor = 0
    filter_score = 0
    n_hits = 0
    n_reads = 0
    n_seen = 0
    if args.add_extra:
        precursors = args.precursors
        matures = args.matures
    for (r, read) in reads.items():
        # Hits = number of distinct miRNAs this read maps to.
        hits = set()
        [hits.add(mature.mirna) for mature in read.precursors.values()
         if mature.mirna]
        hits = len(hits)
        if len(read.precursors) > 0:
            n_reads += 1
        for (ps, iso) in read.precursors.items():
            p = list(ps)[0]
            if not iso.mirna:
                filter_precursor += 1
                continue
            # Each (read, miRNA) pair is emitted only once.
            if (r, iso.mirna) not in seen:
                seen.add((r, iso.mirna))
                chrom = p
                seq = reads[r].sequence
                seq_name = seq if not args.keep_name else r
                if iso.get_score(len(seq)) < 1:
                    filter_score += 1
                    continue
                # Substitutions involving N are treated as noise.
                if iso.subs:
                    iso.subs = [] if "N" in iso.subs[0] else iso.subs
                idseq = reads[r].idseq
                source = "ref_miRNA" if not iso.is_iso() else "isomiR"
                strand = iso.strand
                start, end = iso.start, iso.end
                score = iso.map_score
                mirName = iso.mirna
                preName = p
                Variant = iso.formatGFF()
                Cigar = iso.cigar
                counts = read.counts
                Filter = iso.filter
                annotation = "%s.%s.%s" % (chrom, idseq, seq_name)
                # TODO: This need to be moved to use the feature class
                # It needs a dict with all variable in keys
                fields = {'seq_name': seq_name, 'idseq': idseq,
                          'name': mirName, 'parent': preName,
                          'variant': Variant, 'cigar': Cigar,
                          'counts': counts, 'filter': Filter,
                          'hits': hits, 'chrom': chrom,
                          'start': start, 'end': end,
                          'database': database, 'source': source,
                          'score': score, 'strand': strand}
                line = feature(fields).line
                logger.debug("GFF::%s" % line)
                if args.add_extra:
                    extra = variant_with_nt(line, precursors, matures)
                    line = "%s Changes %s;" % (line, extra)
                # Warn when two different N-free sequences produce the
                # same annotation key.
                if annotation in seen_ann and seq.find("N") < 0 and (
                        seen_ann[annotation].split("\t")[0].find("N") < 0):
                    logger.warning(
                        "Same isomir %s from different sequence:"
                        " \n%s and \n%s" % (annotation, line,
                                            seen_ann[annotation]))
                seen_ann[annotation] = line
                logger.debug("GFF::external %s" % iso.external)
                if start not in lines[chrom]:
                    lines[chrom][start] = []
                lines[chrom][start].append([annotation, chrom, counts,
                                            sample, line])
                logger.debug("GFF::%s" % line)
                n_hits += 1
            else:
                n_seen += 1
    if not quiet:
        logger.info("GFF miRNAs: %s" % len(lines))
        logger.info("GFF hits %s by %s reads" % (n_hits, n_reads))
        logger.info("Filtered by being duplicated: %s" % n_seen)
        logger.info("Filtered by being outside miRNA positions:"
                    " %s" % filter_precursor)
        logger.info("Filtered by being low score: %s" % filter_score)
    return lines
def read_file(folder, args):
    """ Read sRNAbench file and convert to mirtop GFF format.

        Args:
            *fn(str)*: file name with sRNAbench output information.
            *database(str)*: database name.
            *args(namedtuple)*: arguments from command line.
                See *mirtop.libs.parse.add_subparser_gff()*.

        Returns:
            *reads (nested dicts)*:gff_list has the format as
                defined in *mirtop.gff.body.read()*.
    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures
    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()
    # (sequence, mirna) -> isomiR annotation from microRNAannotation.txt
    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            # Skip reads with undetermined bases.
            if query_sequence and query_sequence.find("N") > -1:
                n_ns += 1
                continue
            # Only mature miRNA annotations are converted.
            if cols[3].find("mature") == -1:
                n_in += 1
                continue
            counts = int(cols[1])
            # Hits = number of distinct miRNAs among the $-separated hits.
            hits = len(
                set([mirna.split("#")[1] for mirna in cols[4].split("$")]))
            for nhit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                # sRNAbench positions are 1-based; convert to 0-based.
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                # NOTE(review): these two database checks only count —
                # there is no continue, so matures[chrom] below can raise
                # KeyError for unknown precursors; confirm intent.
                if chrom not in precursors or chrom not in matures:
                    n_notindb += 1
                if mirName not in matures[chrom]:
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue
                seen.add((query_sequence, mirName))
                if (query_sequence, mirName) not in source_iso:
                    continue
                isoformat = source_iso[(query_sequence, mirName)]
                # "mv" (multi-variant) is not representable in GFF.
                if isoformat == "mv":
                    n_notassign += 1
                    continue
                source = "isomiR" if isoformat != "NA" else "ref_miRNA"
                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             " precursor {chrom}\n"
                             " name: {query_name}\n"
                             " start: {start}\n"
                             " external: {isoformat}\n"
                             " hit: {hits}".format(**locals()))
                logger.debug("SRNABENCH:: start %s end %s" % (start, end))
                # Reads running past the precursor are discarded.
                if len(precursors[chrom]) < start + len(query_sequence):
                    n_out += 1
                    continue
                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                # attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                #          " Parent {preName}; Variant {isoformat};"
                #          " Cigar {cigar}; Expression {counts};"
                #          " Filter {Filter}; Hits {hits};").format(**locals())
                # line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                #         "{score}\t{strand}\t.\t{attrb}").format(**locals())
                fields = {
                    'seq_name': query_sequence, 'idseq': idu,
                    'name': mirName, 'parent': preName,
                    'variant': isoformat, 'cigar': cigar,
                    'counts': counts, 'filter': Filter, 'hits': hits,
                    'chrom': chrom, 'start': start, 'end': end,
                    'database': database, 'source': source,
                    'score': score, 'strand': strand
                }
                # TODO: convert to genomic if args.out_genomic
                line = feature(fields).line
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)
                line = paste_columns(feature(line), sep=sep)
                if start not in reads[chrom]:
                    reads[chrom][start] = []
                if Filter == "Pass":
                    n_in += 1
                    reads[chrom][start].append(
                        [idu, chrom, counts, sample, line])
    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)
    return reads
def _analyze_line(line, precursors, database, sample, sep, args):
    # Convert one BED-like intersection record (read vs. primary
    # transcript) into a mirtop GFF entry; returns None when the line
    # cannot be annotated.
    # Column layout of the intersected record:
    start_idx = 10
    end_idx = 11
    attr_idx = 15
    query_name = line[3]
    sequence = line[4]
    if str(line).find(get_primary_transcript(guess_database(args))) < 0:
        # only working with mirbase
        return None
    logger.debug(("READ::line name:{0}").format(line))
    # Skip reads with undetermined bases.
    if sequence and sequence.find("N") > -1:
        return None
    chrom = line[attr_idx].strip().split("Name=")[-1]
    start = line[1]
    end = line[2]
    strand = line[5]
    counts = float(line[6])
    Filter = "Pass"
    reads = dict()
    if not start:
        return None
    # Translate genomic coordinates into precursor-relative coordinates,
    # accounting for strand orientation.
    if strand == "+":
        start = int(start) - int(line[start_idx]) + 1
    else:
        start = int(line[end_idx]) - int(end)
    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" %
                 (iso.start, iso.end, chrom))
    # NOTE(review): this overflow case only logs — no early return;
    # confirm filter.tune handles out-of-range starts.
    if len(precursors[chrom]) < start + len(sequence):
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (
                         chrom, len(sequence), len(precursors[chrom])))
    # Refine alignment: detect substitutions/additions and build CIGAR.
    iso.subs, iso.add, iso.cigar = filter.tune(
        sequence, precursors[chrom], start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))
    idu = make_id(sequence)
    reads[query_name] = hits()
    reads[query_name].set_sequence(sequence)
    reads[query_name].counts = counts
    reads[query_name].sequence = sequence
    reads[query_name].set_precursor(chrom, iso)
    # Reuse the standard annotation/creation pipeline for this one read.
    reads = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None
    line = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % line)
    if args.add_extra:
        extra = variant_with_nt(line, args.precursors, args.matures)
        line = "%s Changes %s;" % (line, extra)
    line = paste_columns(feature(line), sep=sep)
    return {'chrom': chrom, 'start': start, 'name': query_name,
            'mirna': reads[query_name].precursors[chrom].mirna,
            'line': [idu, chrom, counts, sample, line]}