Пример #1
0
def _get_hits(fn):
    hits = Counter()
    with open(fn) as handle:
        for line in handle:
            attr = read_attributes(line, "=")
            query_sequence = attr['TS'].replace("U", "T")
            if query_sequence and query_sequence.find("N") > -1:
                continue
            idu = make_id(query_sequence)
            hits[idu] += 1
    return hits
Пример #2
0
def _get_hits(fn):
    hits = Counter()
    with open(fn) as handle:
        for line in handle:
            attr = read_attributes(line, "=")
            query_sequence = attr['TS'].replace("U", "T")
            if query_sequence and query_sequence.find("N") > -1:
                continue
            idu = make_id(query_sequence)
            hits[idu] += 1
    return hits
Пример #3
0
def read_file(fn, precursors, matures):
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            cols = line.strip().split("\t")
            attr = read_attributes(line)
            t5 = _get_5p(precursors[attr["Parent"]],
                         matures[attr["Parent"]][attr["Name"]],
                         attr["Variant"])
            t3 = _get_3p(precursors[attr["Parent"]],
                         matures[attr["Parent"]][attr["Name"]],
                         attr["Variant"])
            add = _get_add(attr["Read"], attr["Variant"])
            print[attr["Variant"], t5, t3, add]
Пример #4
0
def _read_file(fn, precursors, matures, out_dir):
    samples = read_samples(fn)
    for sample in samples:
        with open(os.path.join(out_dir, "%s.mirna" % sample), 'w') as outh:
            print("\t".join(
                ["seq", "name", "freq", "mir", "start", "end",
                 "mism", "add", "t5", "t3", "s5", "s3", "DB",
                 "precursor", "ambiguity"]), file=outh)
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            cols = line.strip().split("\t")
            attr = read_attributes(line)
            read = read_id(attr["UID"])
            t5 = variant_to_5p(precursors[attr["Parent"]],
                               matures[attr["Parent"]][attr["Name"]],
                               attr["Variant"])
            t3 = variant_to_3p(precursors[attr["Parent"]],
                               matures[attr["Parent"]][attr["Name"]],
                               attr["Variant"])
            add = variant_to_add(read,
                                 attr["Variant"])
            mature_sequence = get_mature_sequence(
                precursors[attr["Parent"]],
                matures[attr["Parent"]][attr["Name"]])
            mm = align_from_variants(read,
                                     mature_sequence,
                                     attr["Variant"])
            if len(mm) > 1:
                continue
            elif len(mm) == 1:
                mm = "".join(map(str, mm[0]))
            else:
                mm = "0"
            hit = attr["Hits"] if "Hits" in attr else "1"
            logger.debug("exporter::isomir::decode %s" % [attr["Variant"],
                                                          t5, t3, add, mm])
            # Error if attr["Read"] doesn't exist
            line = [read, attr["Read"], "0", attr["Name"], cols[1], cols[2],
                    mm, add, t5, t3, "NA", "NA", "miRNA",  attr["Parent"], hit]
            for sample, counts in zip(samples, attr["Expression"].split(",")):
                with open(os.path.join(out_dir, "%s.mirna" % sample),
                          'a') as outh:
                    line[2] = counts
                    print("\t".join(line), file=outh)
Пример #5
0
def _read_file(fn, precursors, matures, out_dir):
    samples = read_samples(fn)
    for sample in samples:
        with open(os.path.join(out_dir, "%s.mirna" % sample), 'w') as outh:
            print("\t".join([
                "seq", "name", "freq", "mir", "start", "end", "mism", "add",
                "t5", "t3", "s5", "s3", "DB", "precursor", "ambiguity"
            ]),
                  file=outh)
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            cols = line.strip().split("\t")
            attr = read_attributes(line)
            read = read_id(attr["UID"])
            t5 = variant_to_5p(precursors[attr["Parent"]],
                               matures[attr["Parent"]][attr["Name"]],
                               attr["Variant"])
            t3 = variant_to_3p(precursors[attr["Parent"]],
                               matures[attr["Parent"]][attr["Name"]],
                               attr["Variant"])
            add = variant_to_add(read, attr["Variant"])
            mature_sequence = get_mature_sequence(
                precursors[attr["Parent"]],
                matures[attr["Parent"]][attr["Name"]])
            mm = align_from_variants(read, mature_sequence, attr["Variant"])
            if len(mm) > 1:
                continue
            elif len(mm) == 1:
                mm = "".join(map(str, mm[0]))
            else:
                mm = "0"
            hit = attr["Hits"] if "Hits" in attr else "1"
            logger.debug("exporter::isomir::decode %s" %
                         [attr["Variant"], t5, t3, add, mm])
            # Error if attr["Read"] doesn't exist
            line = [
                read, attr["Read"], "0", attr["Name"], cols[1], cols[2], mm,
                add, t5, t3, "NA", "NA", "miRNA", attr["Parent"], hit
            ]
            for sample, counts in zip(samples, attr["Expression"].split(",")):
                with open(os.path.join(out_dir, "%s.mirna" % sample),
                          'a') as outh:
                    line[2] = counts
                    print("\t".join(line), file=outh)
Пример #6
0
def _calc_stats(fn):
    """
    Read files and parse into categories
    """
    samples = _get_samples(fn)
    lines = []
    seen = set()
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            cols = line.strip().split("\t")
            logger.debug("## STATS: attribute %s" % cols[8])
            attr = read_attributes(line)
            if "-".join([attr['Variant'], attr['Name']]) in seen:
                continue
            seen.add("-".join([attr['Variant'], attr['Name']]))
            lines.extend(_classify(cols[2], attr, samples))
    df = _summary(lines)
    return df
Пример #7
0
def read_file(fn, args):
    """
    Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *database(str)*: database name.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    database = args.database
    gtf = args.gtf
    sep = " " if args.out_format == "gtf" else "="
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = attr['TS'].replace("U", "T")
            start = int(cols[3])
            end = int(cols[4])
            isomirseq_iso = attr['ISO']
            if query_name not in reads and query_sequence == None:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            # logger.debug("SEQBUSTER:: cigar {cigar}".format(**locals()))
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         "  precursor {chrom}\n"
                         "  name: {query_name}\n"
                         "  idu: {idu}\n"
                         "  start: {start}\n"
                         "  cigar: {cigar}\n"
                         "  iso: {isoformat}\n"
                         "  variant: {isoformat}".format(**locals()))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            isotag = attr['ISO']
            tchrom, tstart = _genomic2transcript(map_mir[mirName], chrom,
                                                 start)
            start = start if not tstart else tstart
            chrom = chrom if not tstart else tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            fields = {
                'seq_name': query_sequence,
                'idseq': idu,
                'name': mirName,
                'parent': preName,
                'variant': isoformat,
                'cigar': cigar,
                'counts': counts,
                'filter': Filter,
                'hits': hit,
                'chrom': chrom,
                'start': start,
                'end': end,
                'database': database,
                'source': source,
                'score': score,
                'strand': strand
            }
            # TODO: convert to genomic if args.out_genomic
            line = feature(fields).line
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors, args.matures)
                line = "%s Changes %s;" % (line, extra)

            line = paste_columns(feature(line), sep=sep)
            if start not in reads[chrom]:
                reads[chrom][start] = []
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append([idu, chrom, counts, sample, line])

    logger.info("Hits: %s" % reads_in)
    return reads
Пример #8
0
def read_file(fn, database, gtf):
    """
    read bam file and perform realignment of hits
    """
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(gtf))[0]
    hits = _get_hits(fn)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = attr['TS'].replace("U", "T")
            start = int(cols[3])
            end = int(cols[4])
            isomirseq_iso = attr['ISO']
            if query_name not in reads and query_sequence == None:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            # logger.debug("SEQBUSTER:: cigar {cigar}".format(**locals()))
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            logger.debug("\nSOMIRSEA::NEW::query: {query_sequence}\n"
                         "  precursor {chrom}\n"
                         "  name: {query_name}\n"
                         "  idu: {idu}\n"
                         "  start: {start}\n"
                         "  cigar: {cigar}\n"
                         "  iso: {isoformat}\n"
                         "  variant: {isoformat}".format(**locals()))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            isotag = attr['ISO']
            tchrom, tstart = genomic2transcript(map_mir[mirName], chrom, start)
            start = start if not tstart else tstart
            chrom = chrom if not tstart else tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            attrb = (
                "Read {query_sequence}; UID {idu}; Name {mirName}; Parent {preName}; Variant {isoformat}; Isocode {isotag}; Cigar {cigar}; Expression {counts}; Filter {Filter}; Hits {hit};"
            ).format(**locals())
            res = (
                "{chrom}\t{database}\t{source}\t{start}\t{end}\t{score}\t{strand}\t.\t{attrb}"
            ).format(**locals())
            if start not in reads[chrom]:
                reads[chrom][start] = []
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append([idu, chrom, counts, sample, res])

    logger.info("Hits: %s" % reads_in)
    return reads
Пример #9
0
def read_file(fn, args):
    """
    Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *database(str)*: database name.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    database = args.database
    gtf = args.gtf
    sep = " " if args.out_format == "gtf" else "="
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = attr['TS'].replace("U", "T")
            start = int(cols[3])
            end = int(cols[4])
            isomirseq_iso = attr['ISO']
            if query_name not in reads and query_sequence == None:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            # logger.debug("SEQBUSTER:: cigar {cigar}".format(**locals()))
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         "  precursor {chrom}\n"
                         "  name: {query_name}\n"
                         "  idu: {idu}\n"
                         "  start: {start}\n"
                         "  cigar: {cigar}\n"
                         "  iso: {isoformat}\n"
                         "  variant: {isoformat}".format(**locals()))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            isotag = attr['ISO']
            tchrom, tstart = _genomic2transcript(map_mir[mirName],
                                                 chrom, start)
            start = start if not tstart else tstart
            chrom = chrom if not tstart else tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                     " Parent {preName}; Variant {isoformat};"
                     " Isocode {isotag}; Cigar {cigar}; Expression {counts};"
                     " Filter {Filter}; Hits {hit};").format(**locals())
            line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                    "{score}\t{strand}\t.\t{attrb}").format(**locals())
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors, args.matures)
                line = "%s Changes %s;" % (line, extra)

            line = paste_columns(read_gff_line(line), sep=sep)
            if start not in reads[chrom]:
                reads[chrom][start] = []
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append([idu, chrom, counts, sample, line])

    logger.info("Hits: %s" % reads_in)
    return reads