Exemplo n.º 1
0
 def test_read_genomic(self):
     """Parse the example hsa GFF3 with the mapper and print what was loaded.

     Nothing is asserted about the parsed content; True is returned once
     loading and printing have completed.
     """
     from mirtop.mirna import mapper
     from mirtop.libs import logger

     annotation_path = "data/examples/annotate/hsa.gff3"
     logger.initialize_logger("test_read_files", True, True)
     mirna_positions = mapper.read_gtf_to_mirna(annotation_path)
     print(mirna_positions)
     # Content check is currently disabled:
     # if mirna_positions["hsa-let-7a-1"]["hsa-let-7a-5p"][0] != 5:
     #    raise ValueError("GFF is not loaded correctly.")
     return True
Exemplo n.º 2
0
 def test_read_genomic(self):
     """Load the example hsa GFF3 through the mapper and print the result.

     The content of the loaded map is not checked; the method returns True
     when loading and printing finish.
     """
     from mirtop.mirna import mapper
     from mirtop.libs import logger

     logger.initialize_logger("test_read_files", True, True)
     gff_path = "data/examples/annotate/hsa.gff3"
     loaded_map = mapper.read_gtf_to_mirna(gff_path)
     print(loaded_map)
     # Disabled content check:
     # if loaded_map["hsa-let-7a-1"]["hsa-let-7a-5p"][0] != 5:
     #    raise ValueError("GFF is not loaded correctly.")
     return True
Exemplo n.º 3
0
def _write(lines, header, fn, args=None):
    out_handle = open(fn, 'w')
    print(header, file=out_handle)
    mapper = read_gtf_to_mirna(args.gtf)
    for m in lines:
        for s in sorted(lines[m].keys()):
            for hit in lines[m][s]:
                # TODO: convert to genomic if args.out_genomic
                if args and args.out_genomic:
                    lifted = body.lift_to_genome(hit[4], mapper)
                    print(lifted, file=out_handle)
                else:
                    print(hit[4], file=out_handle)
    out_handle.close()
Exemplo n.º 4
0
    def test_spikeins(self):
        """Read the example spike-ins, check one mature sequence, export them.

        Raises ValueError when the mature sequence computed from the
        precursor and position of 'spikein-1' differs from the stored one.
        """
        from mirtop.libs import spikeins
        from mirtop.mirna.realign import get_mature_sequence

        spike_db = spikeins.read_spikeins("data/examples/spikeins/spikeins.fa")
        print(spike_db)
        first_spike = spike_db['spikein-1']
        computed_mature = get_mature_sequence(
            first_spike['precursor'], first_spike['position'], exact=True)
        if computed_mature != first_spike['mature']:
            raise ValueError("Sequences doesn't match \n%s\n%s" %
                             (computed_mature, first_spike['mature']))

        # Export the spike-ins as precursor FASTA and GFF ...
        fasta_out = "data/examples/spikeins/spikeins_pre.fasta"
        gff_out = "data/examples/spikeins/spikeins_pre.gff"
        spikeins.write_precursors(spike_db, fasta_out)
        spikeins.write_gff(spike_db, gff_out)

        # ... and read both files back with the regular loaders.
        from mirtop.mirna import mapper, fasta
        print(mapper.read_gtf_to_mirna(gff_out))
        print(fasta.read_precursor(fasta_out, None))
Exemplo n.º 5
0
def read_file(fn, args):
    """
    Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.
            Attributes read here: *gtf*, *out_format*, *add_extra* and,
            when *add_extra* is set, *precursors* and *matures*.

    Returns:
        *reads (nested dicts)*: chrom -> start -> list of
            [uid, chrom, counts, sample, line]. Only lines whose FILTER
            attribute is "Pass" are appended.

    """
    sep = " " if args.out_format == "gtf" else "="
    map_mir = mapper.read_gtf_to_mirna(args.gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = query_name.replace("U", "T")
            start = int(cols[3])
            # Sequences with ambiguous bases are not reported.
            if "N" in query_sequence:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         "  precursor {chrom}\n"
                         "  name: {query_name}\n"
                         "  idu: {idu}\n"
                         "  start: {start}\n"
                         "  cigar: {cigar}\n"
                         "  iso: {isoformat}\n"
                         "  variant: {isoformat}".format(
                             query_sequence=query_sequence,
                             chrom=chrom,
                             query_name=query_name,
                             idu=idu,
                             start=start,
                             cigar=cigar,
                             isoformat=isoformat))
            mir_name = attr['MIN'].split()[0]
            pre_name = attr['PIN'].split()[0]
            filter_tag = attr['FILTER']
            tchrom, tstart = _genomic2transcript(map_mir[mir_name], chrom,
                                                 start)
            # Keep the coordinates from the file when the conversion
            # returns no start.
            if tstart:
                chrom, start = tchrom, tstart
            fields = {
                'seq_name': query_sequence,
                'idseq': idu,
                'name': mir_name,
                'parent': pre_name,
                'variant': isoformat,
                'cigar': cigar,
                'counts': counts,
                'filter': filter_tag,
                'hits': hits[idu],
                'chrom': chrom,
                'start': start,
                # The end is recomputed from the read length; the end
                # column of the input is not used.
                'end': start + len(query_sequence),
                # The database name is taken from the second column of
                # the input line.
                'database': cols[1],
                'source': "isomiR" if isoformat != "NA" else "ref_miRNA",
                'score': ".",
                'strand': "+"
            }
            # TODO: convert to genomic if args.out_genomic
            line = feature(fields).line
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors, args.matures)
                line = "%s Changes %s;" % (line, extra)

            line = paste_columns(feature(line), sep=sep)
            # The position is registered even when the line is filtered out.
            entries = reads[chrom].setdefault(start, [])
            if filter_tag == "Pass":
                reads_in += 1
                entries.append([idu, chrom, counts, sample, line])

    logger.info("Hits: %s" % reads_in)
    return reads
Exemplo n.º 6
0
Arquivo: prost.py Projeto: smoe/mirtop
def read_file(fn, precursors, database, mirna_gtf):
    """
    Read a PROST! tab-delimited file and convert annotated rows to GFF lines.

    Args:
        *fn(str)*: file name with PROST! output; its first line is skipped
            as a header.

        *precursors*: not used by this function.

        *database(str)*: database name written in the second GFF column.

        *mirna_gtf(str)*: annotation file given to
            *mapper.read_gtf_to_mirna()*.

    Returns:
        *reads (nested dicts)*: location name -> start -> list of
            [uid, location name, counts, sample, GFF line].

    """
    reads = defaultdict(dict)
    sample = os.path.splitext(os.path.basename(fn))[0]
    map_mir = mapper.read_gtf_to_mirna(mirna_gtf)
    non_mirna = 0
    non_chromosome_mirna = 0
    # NOTE(review): never incremented in this function, so the
    # "outside precursor" log line below always reports 0.
    outside_mirna = 0
    lines_read = 0
    with open(fn) as handle:
        handle.readline()
        for line in handle:
            lines_read += 1
            cols = line.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if len(cols) < 12:
                non_mirna += 1
                continue
            # First non-empty annotation among columns 11, 13 and 15.
            # strip() above drops trailing empty columns, so the fallback
            # columns may be absent: bound-check instead of raising
            # IndexError.
            miRNA = next((cols[i] for i in (11, 13, 15)
                          if i < len(cols) and cols[i]), None)
            if miRNA is None:
                # Rows without any annotation are now counted with the
                # other rows skipped for lacking a miRNA.
                non_mirna += 1
                continue
            if "N" in query_sequence:
                continue
            # Only the first location listed in column 5 is used.
            for loc in cols[5].split(";")[:1]:
                if "-" not in loc:
                    non_chromosome_mirna += 1
                    continue
                chrom = loc.split(":")[0]
                start, end = loc.split(":")[1].split("-")
                preName, reference_start = genomic2transcript(
                    map_mir[miRNA], chrom, int(start))
                # NOTE(review): this tests the name parsed from the
                # location, not the result of the conversion above;
                # confirm whether `preName` was intended.
                if not chrom:
                    non_chromosome_mirna += 1
                    continue
                # The templates below are filled from local variable
                # names through locals(); keep the names in sync.
                logger.debug("\nPROST::NEW::query: {query_sequence}\n"
                             "  precursor {chrom}\n"
                             "  name:  {query_name}\n"
                             "  start: {start}\n"
                             "  reference_start: {reference_start}\n"
                             "  mirna: {miRNA}".format(**locals()))
                Filter = "PASS"
                hit = "NA"
                isoformat = _make_variant(cols[19:])
                idu = make_id(query_sequence)
                strand = "."
                counts = cols[9]
                cigar = "NA"
                score = "."
                source = "isomiR" if isoformat != "NA" else "ref_miRNA"
                attrb = ("Read {query_sequence}; UID {idu}; Name {miRNA}; "
                         "Parent {preName}; Variant {isoformat}; "
                         "Cigar {cigar}; Expression {counts}; "
                         "Filter {Filter}; Hits {hit};").format(**locals())
                res = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                       "{score}\t{strand}\t.\t{attrb}").format(**locals())
                reads[chrom].setdefault(start, []).append(
                    [idu, chrom, counts, sample, res])

    logger.info("Lines loaded: %s" % lines_read)
    logger.info("Skipped lines because non miRNA in line: %s" % non_mirna)
    logger.info("Skipped lines because non chromosome in GTF: %s" % non_chromosome_mirna)
    logger.info("Skipped lines because outside precursor: %s" % outside_mirna)
    logger.info("Hits: %s" % len(reads))
    return reads
Exemplo n.º 7
0
def read_file(fn, hairpins, database, mirna_gtf):
    """
    Read PROST! file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with PROST output information; its first
            line is skipped as a header.

        *hairpins(dict)*: precursor sequences indexed by precursor name.

        *database(str)*: database name. Not used by this function.

        *mirna_gtf(str)*: annotation file given to
            *mapper.read_gtf_to_mirna()* and
            *mapper.read_gtf_to_precursor()*.

    Returns:
        *reads*: dictionary where keys are read_id and values are *mirtop.realign.hits*

    """
    reads = defaultdict(hits)
    sample = os.path.splitext(os.path.basename(fn))[0]
    genomics = mapper.read_gtf_to_mirna(mirna_gtf)
    matures = mapper.read_gtf_to_precursor(mirna_gtf)
    non_mirna = 0
    non_chromosome_mirna = 0
    outside_mirna = 0
    lines_read = 0
    ann, ann_type = _group_seqs_by_ann(fn)
    with open(fn) as handle:
        handle.readline()
        for line in handle:
            lines_read += 1
            cols = line.strip().split("\t")
            # The first column is used both as read name and sequence.
            query_name = cols[0]
            query_sequence = cols[0]
            annotation = ann[query_sequence]
            if not annotation:
                non_mirna += 1
                continue
            miRNA = ann_type[annotation][1]
            preNames = ann_type[annotation][0]
            if "N" in query_sequence:
                continue
            read = reads[query_name]
            read.set_sequence(query_sequence)
            read.counts = cols[9]
            for preName in preNames.split(","):
                if preName in read.precursors:
                    continue
                if preName not in hairpins:
                    non_chromosome_mirna += 1
                    continue
                hairpin = hairpins[preName]
                reference_start = _align_to_mature(query_sequence, hairpin,
                                                   matures[preName][miRNA])
                logger.debug("\nPROST!::NEW::query: {query_sequence}\n"
                             "  precursor {preName}\n"
                             "  name:  {query_name}\n"
                             "  reference_start: {reference_start}\n"
                             "  mirna: {miRNA}".format(
                                 query_sequence=query_sequence,
                                 preName=preName,
                                 query_name=query_name,
                                 reference_start=reference_start,
                                 miRNA=miRNA))
                iso = isomir()
                iso.align = line
                iso.set_pos(reference_start, len(read.sequence))
                logger.debug("PROST!:: start %s end %s" % (iso.start, iso.end))
                # The read would run past the end of the precursor: skip
                # it and count it, so the summary logged below is
                # meaningful.
                if len(hairpin) < reference_start + len(read.sequence):
                    outside_mirna += 1
                    continue
                iso.subs, iso.add, iso.cigar = filter.tune(
                    read.sequence, hairpin, reference_start, None)
                logger.debug("PROST!::After tune start %s end %s" % (
                    iso.start, iso.end))
                # Only alignments with fewer than two substitutions are kept.
                if len(iso.subs) < 2:
                    read.set_precursor(preName, iso)
    logger.info("Lines loaded: %s" % lines_read)
    logger.info("Skipped lines because non miRNA in line: %s" % non_mirna)
    logger.info("Skipped lines because non chromosome in GTF:"
                " %s" % non_chromosome_mirna)
    logger.info("Skipped lines because outside precursor: %s" % outside_mirna)
    logger.info("Hits: %s" % len(reads))
    return reads
Exemplo n.º 8
0
def read_file(fn, database, gtf):
    """
    Read isomiR-SEA file and convert each line to a GFF line.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *database(str)*: database name. For every input line it is
            replaced by the second column of that line before the GFF
            line is built.

        *gtf(str)*: annotation file given to
            *mapper.read_gtf_to_mirna()*.

    Returns:
        *reads (nested dicts)*: chrom -> start -> list of
            [uid, chrom, counts, sample, GFF line]. Only lines whose
            FILTER attribute is "Pass" are appended.

    """
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    # The sample is named after the input file, not the annotation file.
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = query_name.replace("U", "T")
            start = int(cols[3])
            # Sequences with ambiguous bases are not reported.
            if "N" in query_sequence:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            # The templates below are filled from local variable names
            # through locals(); keep the names in sync.
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         "  precursor {chrom}\n"
                         "  name: {query_name}\n"
                         "  idu: {idu}\n"
                         "  start: {start}\n"
                         "  cigar: {cigar}\n"
                         "  iso: {isoformat}\n"
                         "  variant: {isoformat}".format(**locals()))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            isotag = attr['ISO']
            tchrom, tstart = genomic2transcript(map_mir[mirName], chrom, start)
            # Keep the coordinates from the file when the conversion
            # returns no start.
            if tstart:
                chrom, start = tchrom, tstart
            # The end is recomputed from the read length; the end column
            # of the input is not used.
            end = start + len(query_sequence)
            hit = hits[idu]
            attrb = (
                "Read {query_sequence}; UID {idu}; Name {mirName}; "
                "Parent {preName}; Variant {isoformat}; Isocode {isotag}; "
                "Cigar {cigar}; Expression {counts}; Filter {Filter}; "
                "Hits {hit};"
            ).format(**locals())
            res = (
                "{chrom}\t{database}\t{source}\t{start}\t{end}\t{score}\t"
                "{strand}\t.\t{attrb}"
            ).format(**locals())
            # The position is registered even when the line is filtered out.
            entries = reads[chrom].setdefault(start, [])
            if Filter == "Pass":
                reads_in += 1
                entries.append([idu, chrom, counts, sample, res])

    logger.info("Hits: %s" % reads_in)
    return reads
Exemplo n.º 9
0
def read_file(fn, args):
    """
    Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.
            Attributes read here: *database*, *gtf*, *out_format*,
            *add_extra* and, when *add_extra* is set, *precursors*
            and *matures*.

    Returns:
        *reads (nested dicts)*: chrom -> start -> list of
            [uid, chrom, counts, sample, line]. Only lines whose FILTER
            attribute is "Pass" are appended.

    """
    # This value is replaced by the second column of every input line
    # before it is used (see `database = cols[1]` below).
    database = args.database
    gtf = args.gtf
    # Separator handed to paste_columns(): " " for "gtf" output,
    # "=" for anything else.
    sep = " " if args.out_format == "gtf" else "="
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    # Sample name: input file name without directory and extension.
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            # The TS attribute gives both the read name and, with U
            # replaced by T, the read sequence.
            query_name = attr['TS']
            query_sequence = attr['TS'].replace("U", "T")
            start = int(cols[3])
            # Overwritten further down with start + read length.
            end = int(cols[4])
            isomirseq_iso = attr['ISO']
            # query_sequence is always a str at this point, so this
            # condition cannot be true.
            if query_name not in reads and query_sequence == None:
                continue
            # Skip sequences that contain N.
            if query_sequence and query_sequence.find("N") > -1:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            # logger.debug("SEQBUSTER:: cigar {cigar}".format(**locals()))
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            # The {placeholders} here and in the templates below are
            # filled from local variable names through locals(), so
            # renaming a local changes the output.
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         "  precursor {chrom}\n"
                         "  name: {query_name}\n"
                         "  idu: {idu}\n"
                         "  start: {start}\n"
                         "  cigar: {cigar}\n"
                         "  iso: {isoformat}\n"
                         "  variant: {isoformat}".format(**locals()))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            # The database column of the output comes from the input line.
            database = cols[1]
            # Only the first whitespace-separated token of MIN / PIN is
            # used as miRNA / precursor name.
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            isotag = attr['ISO']
            tchrom, tstart = _genomic2transcript(map_mir[mirName],
                                                 chrom, start)
            # Keep the coordinates from the file when the conversion
            # returns no start.
            start = start if not tstart else tstart
            chrom = chrom if not tstart else tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            # Ninth column: attribute list of the output line.
            attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                     " Parent {preName}; Variant {isoformat};"
                     " Isocode {isotag}; Cigar {cigar}; Expression {counts};"
                     " Filter {Filter}; Hits {hit};").format(**locals())
            line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                    "{score}\t{strand}\t.\t{attrb}").format(**locals())
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors, args.matures)
                line = "%s Changes %s;" % (line, extra)

            # Re-parse the assembled line and render it with the
            # requested attribute separator.
            line = paste_columns(read_gff_line(line), sep=sep)
            # The position is registered even when the line is filtered
            # out just below.
            if start not in reads[chrom]:
                reads[chrom][start] = []
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append([idu, chrom, counts, sample, line])

    logger.info("Hits: %s" % reads_in)
    return reads
Exemplo n.º 10
0
def create_vcf(mirgff3, precursor, gtf, vcffile):
    """
    Convert the variants described in a mirGFF3 file into a VCF file.

    Args:
        'mirgff3(str)': File with mirGFF3 format that will be converted
        'precursor(str)': Fasta format sequences of all miRNA hairpins
        'gtf(str)': Genome coordinates
        'vcffile': name of the file to be saved
    Returns:
        Nothing is returned, instead, a VCF file is generated
    Raises:
        SystemExit: when *mirgff3* cannot be opened (exit status 1).
    """
    bases = ["A", "C", "T", "G"]

    def _iso_shift(variants, tag):
        # Integer parsed from the first variant containing `tag`, read
        # from the text after len(tag) + 1 characters; 0 when no variant
        # contains the tag.
        found = [v for v in variants if tag in v]
        return int(found[0][len(tag) + 1:]) if found else 0

    # Check if the input file exists:
    try:
        gff3_file = open(mirgff3, "r", encoding="utf-8") if six.PY3 else open(
            mirgff3, "r")
    except IOError:
        print("Can't read the file %s" % mirgff3)
        sys.exit(1)
    with gff3_file:
        data = gff3_file.read()
        if six.PY2:
            data = data.decode("utf-8-sig").encode("utf-8")

    gff3_data = data.split("\n")

    ver = "v4.3"  # Current VCF version formatting
    date = datetime.datetime.now().strftime("%Y%m%d")
    # Fallback taken from any line of the file; the header scan below
    # replaces it when the leading "##" block carries the same tag.
    source = "\n".join(s for s in gff3_data
                       if "## source-ontology: " in s)[20:]
    sample_names = []
    # Scan the leading "##" block; iterating the list stops cleanly when
    # the file holds nothing but header lines.
    for header_line in gff3_data:
        if header_line[:2] != "##":
            break
        if header_line[:19] == "## source-ontology:":
            source = header_line[20:]
        elif header_line[:11] == "## COLDATA:":
            sample_names = header_line[12:].split(",")

    # NOTE(review): VCF v4.3 asks for double-quoted Description values
    # and declares GT as Type=String; the FILTER and GT lines are left
    # as they were - confirm before changing them.
    meta_lines = [
        "##fileformat=VCF%s\n" % ver,
        "##fileDate=%s\n" % date,
        "##source=%s\n" % source,
        # Closing ">" and the Number key were missing from this line.
        '##INFO=<ID=NS,Number=1,Type=Integer,'
        'Description="Number of samples">\n',
        "##FILTER=<ID=REJECT,Description='Filter not passed'>\n",
        '##FORMAT=<ID=TRC,Number=1,Type=Integer,'
        'Description="Total read count">\n',
        '##FORMAT=<ID=TSC,Number=1,Type=Integer,'
        'Description="Total SNP count">\n',
        '##FORMAT=<ID=TMC,Number=1,Type=Integer,'
        'Description="Total miRNA count">\n',
        '##FORMAT=<ID=GT,Number=1,Type=Integer,Description="Genotype">\n',
    ]
    # Column header: the fixed VCF columns followed by one per sample.
    header = "\t".join(
        ["#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"] +
        sample_names)

    all_dict = dict()  # variant key -> fields of its VCF record
    key_list = []  # variant keys in order of first appearance
    mirna_dict = dict()  # miRNA name -> {"Z": summed counts per sample}
    n_SNP = 0
    n_noSNP = 0
    hairpins = read_precursor(precursor)
    gff3 = read_gtf_to_precursor(gtf)
    gtf_dic = read_gtf_to_mirna(gtf)
    for gff_line in gff3_data:
        # Skip blank lines and every comment/header line.
        if not gff_line or gff_line.startswith("#"):
            continue
        # Parsing the gff3 mirna line:
        gff_fields = read_gff_line(gff_line)
        attrb = gff_fields['attrb']
        gtf_name = attrb['Name']
        gtf_parent = attrb['Parent']
        if gtf_parent not in gff3:
            continue
        if gtf_name not in gff3[gtf_parent]:
            continue
        parent_ini_pos = gff3[gtf_parent][gtf_name][0]
        parent_end_pos = gff3[gtf_parent][gtf_name][1]
        hairpin = hairpins[gtf_parent]
        ref_seq = hairpin[parent_ini_pos:parent_end_pos + 1]
        vcf_chrom = gtf_dic[gtf_name][gtf_parent][0]
        vcf_pos = int(gff_fields['start']) + int(
            gtf_dic[gtf_name][gtf_parent][1])
        variants = attrb['Variant'].split(",")
        logger.debug("VCF::Variant::%s" % variants)
        #  Obtaining the iso_3p, iso_add3p and iso_5p values:
        var3p = _iso_shift(variants, 'iso_3p') + _iso_shift(
            variants, 'iso_add3p')
        logger.debug("VCF::VAR_3p::%s" % var3p)
        var5p = _iso_shift(variants, 'iso_5p')
        logger.debug("VCF::VAR_5p::%s" % var5p)
        cigar = attrb["Cigar"]
        # Obtaining all the variants from the cigar:
        (key_pos, key_var, vcf_ref, vcf_alt) = cigar_2_key(
            cigar, attrb['Read'], ref_seq, vcf_pos,
            var5p, var3p, parent_ini_pos, parent_end_pos, hairpin)
        if len(key_var) == 0:
            continue

        # Adding the variants to a dictionary and calculating all the
        # fields of a vcf file format:
        raw_counts = [int(i) for i in attrb['Expression'].split(',')]
        # 1 for every sample where the expression is not 0.
        nozero_counts = [int(i > 0) for i in raw_counts]
        for s, var_type in enumerate(key_var):
            key_dict = vcf_chrom + '-' + str(key_pos[s]) + '-' + str(var_type)
            # NOTE(review): the miRNA total grows once per variant of
            # the line, not once per line - confirm this is intended.
            if gtf_name in mirna_dict:
                # Adding expression values to same mirnas
                mirna_dict[gtf_name]['Z'] = [
                    sum(x) for x in zip(mirna_dict[gtf_name]['Z'],
                                        raw_counts)
                ]
            else:
                mirna_dict[gtf_name] = {"Z": raw_counts}
            if key_dict in all_dict:
                record = all_dict[key_dict]
                if record["Type"] in bases:
                    record['X'] = [
                        sum(x) for x in zip(record['X'], nozero_counts)
                    ]
                    record['Y'] = [
                        sum(x) for x in zip(record['Y'], raw_counts)
                    ]
                continue
            key_list.append(key_dict)
            record = {
                "Chrom": vcf_chrom,
                "Position": key_pos[s],
                "mirna": gtf_name,
                "Type": var_type,
            }
            if var_type[0] in bases:
                n_SNP += 1
                record["SNP"] = True
                record["ID"] = attrb['Name'] + '-SNP' + str(n_SNP)
                record['X'] = nozero_counts
                record['Y'] = raw_counts
            else:
                n_noSNP += 1
                record["SNP"] = False
                record["ID"] = attrb['Name'] + '-nonSNP' + str(n_noSNP)
            record["Ref"] = vcf_ref[s]
            record["Alt"] = vcf_alt[s]
            record["Qual"] = "."
            record["Filter"] = attrb['Filter']
            record["Info"] = "NS=" + str(len(sample_names))
            all_dict[key_dict] = record

    #  Writing the VCF file; the context manager closes it on any error:
    with open(vcffile, "w") as vcf_file:
        vcf_file.write("".join(meta_lines))
        vcf_file.write(header)
        for s in key_list:
            record = all_dict[s]
            # Every record starts with the newline that ends the
            # previous line, so the file has no trailing newline.
            variant_line = (
                "\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
                (record["Chrom"], record["Position"], record["ID"],
                 record["Ref"], record["Alt"], record["Qual"],
                 record["Filter"], record["Info"]))
            if record["Type"] in bases:
                variant_line = variant_line + "\t" + "TRC:TSC:TMC:GT"
                samples = ""
                for n in range(len(sample_names)):
                    X = record["X"][n]
                    Y = record["Y"][n]
                    Z = mirna_dict[record["mirna"]]["Z"][n]
                    # Calculating the genotype:
                    if Y == 0:
                        GT = "0|0"
                    elif Z == Y:
                        GT = "1|1"
                    else:
                        GT = "1|0"
                    samples = samples + "\t" + str(X) + ":" + str(
                        Y) + ":" + str(Z) + ":" + GT
                variant_line = variant_line + samples
            vcf_file.write(variant_line)
Exemplo n.º 11
0
 def test_read_mir2genomic(self):
     """Load the example hsa GFF3 with the mapper and print the result."""
     from mirtop.mirna import mapper
     from mirtop.libs import logger

     gff_path = "data/examples/annotate/hsa.gff3"
     logger.initialize_logger("test_read_files", True, True)
     loaded_map = mapper.read_gtf_to_mirna(gff_path)
     print(loaded_map)