Python paste_columns примеры, mirtop.gff.body.paste_columns Python примеры использования

Пример #1

0

Показать файл

Файл: optimir.py Проект: srinivas32/mirtop

def read_file(fn, args):
    """
    Read OptimiR file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with OptimiR output information.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.
            This function reads ``out_format``, ``add_extra``,
            ``precursors`` and ``matures`` from it.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.
            Here: ``reads[chrom][start]`` is a list of
            ``[UID, chrom, counts, sample, line]`` entries.

    """
    # Attribute key/value separator depends on the requested output format.
    sep = " " if args.out_format == "gtf" else "="
    sample = read_samples(fn)
    reads = defaultdict(dict)
    logger.debug("OPTIMIR::SAMPLE::%s", sample)
    with open(fn) as handle:
        for line in handle:
            gff = feature(line)
            if not gff.columns:
                # Nothing was parsed into columns: no record to convert.
                continue

            gff.attributes.setdefault("Variant", "NA")
            parent = gff.attributes["Parent"]
            logger.debug("OPTIMIR::Chrom update from %s to %s",
                         gff.columns["chrom"], parent)
            # The first listed parent replaces the chrom column.
            gff.columns["chrom"] = parent.split(",")[0]
            fixed_line = gff.paste_columns(sep=sep)
            if args.add_extra:
                extra = variant_with_nt(fixed_line, args.precursors,
                                        args.matures)
                fixed_line = "%s Changes %s;" % (fixed_line, extra)

            # Re-parse so the final line (with any extra attribute) is
            # serialized with the requested separator.
            fixed_line = paste_columns(feature(fixed_line), sep=sep)
            counts = gff.attributes["Expression"].split(",")
            chrom = gff.columns["chrom"]
            start = gff.columns["start"]
            reads[chrom].setdefault(start, []).append([
                gff.attributes["UID"], chrom, counts,
                sample, fixed_line
            ])
    return reads

Пример #2

0

Показать файл

def read_file(fn, args):
    """
    Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.
            This function reads ``gtf``, ``out_format``, ``add_extra``,
            ``precursors`` and ``matures`` from it.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.
            Here: ``reads[chrom][start]`` is a list of
            ``[idu, chrom, counts, sample, line]`` entries; only
            records whose FILTER attribute is "Pass" are appended.

    """
    # Attribute key/value separator depends on the requested output format.
    sep = " " if args.out_format == "gtf" else "="
    map_mir = mapper.read_gtf_to_mirna(args.gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s", sample)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = query_name.replace("U", "T")
            start = int(cols[3])
            # str.replace never returns None, so test for an empty
            # sequence instead; also skip reads containing N.
            if not query_sequence or "N" in query_sequence:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            logger.debug("\nISOMIRSEA::NEW::query: %s\n"
                         "  precursor %s\n"
                         "  name: %s\n"
                         "  idu: %s\n"
                         "  start: %s\n"
                         "  cigar: %s\n"
                         "  iso: %s\n"
                         "  variant: %s",
                         query_sequence, chrom, query_name, idu, start,
                         cigar, isoformat, isoformat)
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            Filter = attr['FILTER']
            tchrom, tstart = _genomic2transcript(map_mir[mirName], chrom,
                                                 start)
            # Use the converted coordinates only when a start was returned.
            if tstart:
                start, chrom = tstart, tchrom
            end = start + len(query_sequence)
            fields = {
                'seq_name': query_sequence,
                'idseq': idu,
                'name': mirName,
                'parent': preName,
                'variant': isoformat,
                'cigar': cigar,
                'counts': counts,
                'filter': Filter,
                'hits': hits[idu],
                'chrom': chrom,
                'start': start,
                'end': end,
                'database': database,
                'source': source,
                'score': ".",
                'strand': "+"
            }
            # TODO: convert to genomic if args.out_genomic
            gff_line = feature(fields).line
            if args.add_extra:
                extra = variant_with_nt(gff_line, args.precursors,
                                        args.matures)
                gff_line = "%s Changes %s;" % (gff_line, extra)

            gff_line = paste_columns(feature(gff_line), sep=sep)
            # The position key is created even for filtered-out records,
            # as before, so the returned structure is unchanged.
            if start not in reads[chrom]:
                reads[chrom][start] = []
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append(
                    [idu, chrom, counts, sample, gff_line])

    logger.info("Hits: %s", reads_in)
    return reads

Пример #3

0

Показать файл

def _fix(line, expression):
    """
    Return *line* re-serialized with its Expression attribute replaced.

    The attribute is overwritten with *expression*; the output uses the
    format guessed from the input line.
    """
    parsed = feature(line)
    parsed.attributes['Expression'] = expression
    return paste_columns(parsed, guess_format(line))

Пример #4

0

Показать файл

Файл: merge.py Проект: miRTop/mirtop

def _fix(line, expression):
    """
    Return *line* re-serialized with its Expression attribute replaced.

    The line is parsed with ``read_gff_line``, the ``Expression`` entry of
    its ``attrb`` mapping is overwritten with *expression*, and the result
    is written back using the format guessed from the input line.
    """
    parsed = read_gff_line(line)
    attributes = parsed['attrb']
    attributes['Expression'] = expression
    return paste_columns(parsed, guess_format(line))

Пример #5

0

Показать файл

Файл: srnabench.py Проект: srinivas32/mirtop

def read_file(folder, args):
    """
    Read sRNAbench file and convert to mirtop GFF format.

    Args:
        *folder(str)*: folder with sRNAbench output. The files
            ``reads.annotation`` and ``microRNAannotation.txt``
            are read from it; its base name is used as sample name.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.
            This function reads ``out_format``, ``database``,
            ``precursors``, ``matures`` and ``add_extra`` from it.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.
            Here: ``reads[chrom][start]`` is a list of
            ``[idu, chrom, counts, sample, line]`` entries.

    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    # Attribute key/value separator depends on the requested output format.
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures

    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()

    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s", len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            # The first column is used both as read name and sequence.
            query_name = cols[0]
            query_sequence = cols[0]
            if not query_sequence:
                continue
            if "N" in query_sequence:
                n_ns += 1
                continue
            if "mature" not in cols[3]:
                # NOTE(review): skipped reads are added to n_in, which is
                # later reported as the number of hits - confirm intent.
                n_in += 1
                continue

            counts = int(cols[1])

            all_hits = cols[4].split("$")
            # Number of distinct miRNA names among the hits of this read.
            hits = len({mirna.split("#")[1] for mirna in all_hits})

            for nhit in all_hits:
                logger.debug("SRNABENCH::line hit: %s", nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                if chrom not in precursors or chrom not in matures:
                    # Without this precursor none of the lookups below can
                    # succeed, so count the hit once and skip it.
                    n_notindb += 1
                    continue
                if mirName not in matures[chrom]:
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue

                seen.add((query_sequence, mirName))

                if (query_sequence, mirName) not in source_iso:
                    continue

                isoformat = source_iso[(query_sequence, mirName)]

                if isoformat == "mv":
                    n_notassign += 1
                    continue

                source = "isomiR" if isoformat != "NA" else "ref_miRNA"

                logger.debug("SRNABENCH::query: %s\n"
                             "  precursor %s\n"
                             "  name:  %s\n"
                             "  start: %s\n"
                             "  external: %s\n"
                             "  hit: %s",
                             query_sequence, chrom, query_name, start,
                             isoformat, hits)
                logger.debug("SRNABENCH:: start %s end %s", start, end)
                if len(precursors[chrom]) < end:
                    # The read would run past the end of the precursor.
                    n_out += 1
                    continue

                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                idu = make_id(query_sequence)
                fields = {
                    'seq_name': query_sequence,
                    'idseq': idu,
                    'name': mirName,
                    'parent': chrom,
                    'variant': isoformat,
                    'cigar': cigar,
                    'counts': counts,
                    'filter': "Pass",
                    'hits': hits,
                    'chrom': chrom,
                    'start': start,
                    'end': end,
                    'database': database,
                    'source': source,
                    'score': ".",
                    'strand': "+"
                }
                # TODO: convert to genomic if args.out_genomic
                line = feature(fields).line
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(feature(line), sep=sep)
                # Every hit reaching this point has filter "Pass".
                n_in += 1
                reads[chrom].setdefault(start, []).append(
                    [idu, chrom, counts, sample, line])

    logger.info("Loaded %s reads with %s hits", len(reads), n_in)
    logger.info("Reads without precursor information: %s", n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s", n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s", n_out)

    return reads

Пример #6

0

Показать файл

Файл: manatee.py Проект: srinivas32/mirtop

def _analyze_line(line, precursors, database, sample, sep, args):
    """
    Turn one already-split input record into a mirtop GFF entry.

    Args:
        *line*: indexable record; positions 1-6, 10, 11 and 15 are read.

        *precursors*: mapping from precursor name to its sequence.

        *database*: not used here; ``args.database`` is used instead.

        *sample*: sample name stored in the returned entry.

        *sep*: attribute separator passed to ``paste_columns``.

        *args*: command line arguments; ``matures``, ``precursors``,
            ``database`` and ``add_extra`` are read from it.

    Returns:
        ``None`` when the record is skipped, otherwise a dict with the
        keys ``chrom``, ``start``, ``name``, ``mirna`` and ``line``.
    """
    pre_start_col = 10
    pre_end_col = 11
    attr_col = 15
    read_name = line[3]
    read_seq = line[4]
    primary = get_primary_transcript(guess_database(args))
    if primary not in str(line):  # only working with mirbase
        return None

    logger.debug(("READ::line name:{0}").format(line))
    if read_seq and "N" in read_seq:
        return None

    chrom = line[attr_col].strip().split("Name=")[-1]
    raw_start = line[1]
    raw_end = line[2]
    strand = line[5]
    counts = float(line[6])
    if not raw_start:
        return None
    # Position of the read relative to the precursor coordinates.
    if strand == "+":
        start = int(raw_start) - int(line[pre_start_col]) + 1
    else:
        start = int(line[pre_end_col]) - int(raw_end)

    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(read_seq))
    logger.debug("READ::From BAM start %s end %s at chrom %s" % (iso.start, iso.end, chrom))
    if len(precursors[chrom]) < start + len(read_seq):
        # Only reported; processing continues as before.
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (
                                             chrom,
                                             len(read_seq),
                                             len(precursors[chrom])))
    iso.subs, iso.add, iso.cigar = filter.tune(
        read_seq, precursors[chrom],
        start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))

    idu = make_id(read_seq)
    hit = hits()
    hit.set_sequence(read_seq)
    hit.counts = counts
    hit.sequence = read_seq
    hit.set_precursor(chrom, iso)
    reads = annotate({read_name: hit}, args.matures, args.precursors,
                     quiet=True)
    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None
    out_line = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % out_line)
    if args.add_extra:
        extra = variant_with_nt(out_line, args.precursors,
                                args.matures)
        out_line = "%s Changes %s;" % (out_line, extra)

    out_line = paste_columns(feature(out_line), sep=sep)
    result = {
        'chrom': chrom,
        'start': start,
        'name': read_name,
        'mirna': reads[read_name].precursors[chrom].mirna,
        'line': [idu, chrom, counts, sample, out_line],
    }
    return result

Пример #7

0

Показать файл

def _fix(line, expression):
    """
    Rewrite the Expression attribute of a GFF line.

    Parses *line* with ``read_gff_line``, stores *expression* under the
    ``Expression`` key of the ``attrb`` mapping, and serializes the record
    again in the format guessed from the input line.
    """
    record = read_gff_line(line)
    attrb = record['attrb']
    attrb['Expression'] = expression
    return paste_columns(record, guess_format(line))

Пример #8

0

Показать файл

Файл: isomirsea.py Проект: miRTop/mirtop

def read_file(fn, args):
    """
    Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.
            This function reads ``gtf``, ``out_format``, ``add_extra``,
            ``precursors`` and ``matures`` from it.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.
            Here: ``reads[chrom][start]`` is a list of
            ``[idu, chrom, counts, sample, line]`` entries; only
            records whose FILTER attribute is "Pass" are appended.

    """
    # Attribute key/value separator depends on the requested output format.
    sep = " " if args.out_format == "gtf" else "="
    map_mir = mapper.read_gtf_to_mirna(args.gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s", sample)
    # Constant GFF columns, referenced by name in the templates below.
    score = "."
    strand = "+"
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = query_name.replace("U", "T")
            start = int(cols[3])
            # str.replace never returns None, so test for an empty
            # sequence instead; also skip reads containing N.
            if not query_sequence or "N" in query_sequence:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isotag = attr['ISO']
            isoformat = cigar2variants(cigar, query_sequence, isotag)
            logger.debug("\nISOMIRSEA::NEW::query: %s\n"
                         "  precursor %s\n"
                         "  name: %s\n"
                         "  idu: %s\n"
                         "  start: %s\n"
                         "  cigar: %s\n"
                         "  iso: %s\n"
                         "  variant: %s",
                         query_sequence, chrom, query_name, idu, start,
                         cigar, isoformat, isoformat)
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            Filter = attr['FILTER']
            tchrom, tstart = _genomic2transcript(map_mir[mirName],
                                                 chrom, start)
            # Use the converted coordinates only when a start was returned.
            if tstart:
                start, chrom = tstart, tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            # NOTE: both templates are filled from locals(); the local
            # names used as placeholders must not be renamed or removed.
            attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                     " Parent {preName}; Variant {isoformat};"
                     " Isocode {isotag}; Cigar {cigar}; Expression {counts};"
                     " Filter {Filter}; Hits {hit};").format(**locals())
            gff_line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                        "{score}\t{strand}\t.\t{attrb}").format(**locals())
            if args.add_extra:
                extra = variant_with_nt(gff_line, args.precursors,
                                        args.matures)
                gff_line = "%s Changes %s;" % (gff_line, extra)

            gff_line = paste_columns(read_gff_line(gff_line), sep=sep)
            # The position key is created even for filtered-out records,
            # as before, so the returned structure is unchanged.
            if start not in reads[chrom]:
                reads[chrom][start] = []
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append(
                    [idu, chrom, counts, sample, gff_line])

    logger.info("Hits: %s", reads_in)
    return reads

Пример #9

0

Показать файл

Файл: srnabench.py Проект: miRTop/mirtop

def read_file(folder, args):
    """
    Read sRNAbench file and convert to mirtop GFF format.

    Args:
        *folder(str)*: folder with sRNAbench output. The files
            ``reads.annotation`` and ``microRNAannotation.txt``
            are read from it; its base name is used as sample name.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.
            This function reads ``out_format``, ``database``,
            ``precursors``, ``matures`` and ``add_extra`` from it.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.
            Here: ``reads[chrom][start]`` is a list of
            ``[idu, chrom, counts, sample, line]`` entries.

    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    # Attribute key/value separator depends on the requested output format.
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures

    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()
    # Constant GFF fields, referenced by name in the templates below.
    Filter = "Pass"
    score = "."
    strand = "+"

    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s", len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            # The first column is used both as read name and sequence.
            query_name = cols[0]
            query_sequence = cols[0]
            if not query_sequence:
                continue
            if "N" in query_sequence:
                n_ns += 1
                continue
            if "mature" not in cols[3]:
                # NOTE(review): skipped reads are added to n_in, which is
                # later reported as the number of hits - confirm intent.
                n_in += 1
                continue

            counts = int(cols[1])

            all_hits = cols[4].split("$")
            # Number of distinct miRNA names among the hits of this read.
            hit = len({mirna.split("#")[1] for mirna in all_hits})

            for nhit in all_hits:
                logger.debug("SRNABENCH::line hit: %s", nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                if chrom not in precursors or chrom not in matures:
                    # Without this precursor none of the lookups below can
                    # succeed, so count the hit once and skip it.
                    n_notindb += 1
                    continue
                if mirName not in matures[chrom]:
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue

                seen.add((query_sequence, mirName))

                if (query_sequence, mirName) not in source_iso:
                    continue

                isoformat = source_iso[(query_sequence, mirName)]

                if isoformat == "mv":
                    n_notassign += 1
                    continue

                source = "isomiR" if isoformat != "NA" else "ref_miRNA"

                logger.debug("SRNABENCH::query: %s\n"
                             "  precursor %s\n"
                             "  name:  %s\n"
                             "  start: %s\n"
                             "  external: %s\n"
                             "  hit: %s",
                             query_sequence, chrom, query_name, start,
                             isoformat, hit)
                logger.debug("SRNABENCH:: start %s end %s", start, end)
                if len(precursors[chrom]) < end:
                    # The read would run past the end of the precursor.
                    n_out += 1
                    continue

                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                idu = make_id(query_sequence)
                # NOTE: both templates are filled from locals(); the local
                # names used as placeholders must not be renamed or removed.
                attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                         " Parent {preName}; Variant {isoformat};"
                         " Cigar {cigar}; Expression {counts};"
                         " Filter {Filter}; Hits {hit};").format(**locals())
                line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                        "{score}\t{strand}\t.\t{attrb}").format(**locals())
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(read_gff_line(line), sep=sep)
                # Every hit reaching this point has filter "Pass".
                n_in += 1
                reads[chrom].setdefault(start, []).append(
                    [idu, chrom, counts, sample, line])

    logger.info("Loaded %s reads with %s hits", len(reads), n_in)
    logger.info("Reads without precursor information: %s", n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s", n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s", n_out)

    return reads

Python paste_columns примеры использования