Example #1
0
 def _stat_and_correct(self, stats, folder):
     '''compute the statistics and write the final gff file'''
     for gff in os.listdir(folder):
         prefix = gff.replace("_CRISPR.gff", "")
         stats[prefix] = {"all": {"cri": 0, "re": {}}}
         gh = open(os.path.join(folder, gff), "r")
         oh = open("tmp_cri.gff", "w")
         oh.write("##gff-version 3\n")
         cr_num = 0
         re_num = 0
         first = True
         for entry in Gff3Parser().entries(gh):
             if entry.seq_id not in stats[prefix].keys():
                 stats[prefix][entry.seq_id] = {"cri": 0, "re": {}}
             if entry.feature == "CRISPR":
                 id_ = "CRISPR_" + str(cr_num)
                 attribute = ";".join(
                     ["ID=" + entry.seq_id + "_" + id_, "method=CRT"])
                 cr_num += 1
                 if first:
                     first = False
                 else:
                     if repeat not in stats[prefix][
                             entry.seq_id]["re"].keys():
                         stats[prefix][entry.seq_id]["re"][repeat] = 1
                     else:
                         stats[prefix][entry.seq_id]["re"][repeat] += 1
                     if repeat not in stats[prefix]["all"]["re"].keys():
                         stats[prefix]["all"]["re"][repeat] = 1
                     else:
                         stats[prefix]["all"]["re"][repeat] += 1
                 repeat = 0
                 stats[prefix][entry.seq_id]["cri"] += 1
                 stats[prefix]["all"]["cri"] += 1
             elif entry.feature == "repeat_unit":
                 attribute = ";".join([
                     "ID=" + entry.seq_id + "_Repeat_" + str(re_num),
                     "method=CRT", "Parent=" + id_
                 ])
                 re_num += 1
                 repeat += 1
             oh.write(
                 "\t".join([entry.info_without_attributes, attribute]) +
                 "\n")
         if not first:
             if repeat not in stats[prefix][entry.seq_id]["re"].keys():
                 stats[prefix][entry.seq_id]["re"][repeat] = 1
             else:
                 stats[prefix][entry.seq_id]["re"][repeat] += 1
             if repeat not in stats[prefix]["all"]["re"].keys():
                 stats[prefix]["all"]["re"][repeat] = 1
             else:
                 stats[prefix]["all"]["re"][repeat] += 1
         gh.close()
         oh.close()
         os.remove(os.path.join(folder, gff))
         shutil.move("tmp_cri.gff", os.path.join(folder, gff))
Example #2
0
def read_gff(input_file):
    datas = []
    gff_parser = Gff3Parser()
    f_h = open(input_file, "r")
    for entry in gff_parser.entries(f_h):
        entry.attributes["print"] = False
        datas.append(entry)
    datas = sorted(datas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return datas
Example #3
0
def read_data(inter_gff, tss_file, srna_gff, fasta, utr_detect):
    seq = {}
    inters = []
    tsss = []
    srnas = []
    fh = open(inter_gff)
    for entry in Gff3Parser().entries(fh):
        if ((entry.source == "UTR_derived") and
            (utr_detect)) or ((entry.source == "intergenic") or
                              (entry.source == "antisense")):
            inters.append(entry)
    inters = sorted(inters, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    fh.close()
    if tss_file is not None:
        fh = open(tss_file)
        for entry in Gff3Parser().entries(fh):
            tsss.append(entry)
        tsss = sorted(tsss, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
        fh.close()
    else:
        tsss = None
    if srna_gff is not None:
        fh = open(srna_gff)
        for entry in Gff3Parser().entries(fh):
            new = {}
            for key, value in entry.attributes.items():
                if "sORF" not in key:
                    new[key] = value
            entry.attributes = copy.deepcopy(new)
            srnas.append(entry)
        srnas = sorted(srnas,
                       key=lambda k: (k.seq_id, k.start, k.end, k.strand))
        fh.close()
    else:
        srnas = None
    with open(fasta, "r") as s_f:
        for line in s_f:
            line = line.strip()
            if line.startswith(">"):
                strain = line[1:]
                seq[strain] = ""
            else:
                seq[strain] = seq[strain] + line
    return inters, tsss, srnas, seq
Example #4
0
def combine(frag_file, tex_file, tolerance, output_file):
    frags = []
    norms = []
    finals = []
    out = open(output_file, "w")
    out.write("##gff-version 3\n")
    f_h = open(frag_file, "r")
    for entry in Gff3Parser().entries(f_h):
        entry.attributes["print"] = False
        frags.append(entry)
    f_h.close()
    n_h = open(tex_file, "r")
    for entry in Gff3Parser().entries(n_h):
        entry.attributes["print"] = False
        norms.append(entry)
    n_h.close()
    sort_frags = sorted(frags,
                        key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    sort_norms = sorted(norms,
                        key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    for frag in sort_frags:
        overlap = False
        for norm in sort_norms:
            overlap = compare(frag, norm, overlap, tolerance)
        if overlap:
            store(frag, "fragmented&tex_notex", finals)
        else:
            store(frag, "fragmented", finals)
    for norm in sort_norms:
        if norm.attributes["print"] is False:
            store(norm, "tex_notex", finals)
    sort_finals = sorted(finals,
                         key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    num = 0
    for tar in sort_finals:
        if tar.attributes["print"] is True:
            continue
        overlap = False
        for ref in sort_finals:
            overlap = compare(tar, ref, overlap, tolerance)
        name = '%05d' % num
        print_file(tar, out, name, num)
        num += 1
    out.close()
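A hedged usage sketch for combine (the file names and the 5-nt tolerance are illustrative assumptions; compare, store and print_file are helpers from the same module):

# merge transcripts from the fragmented and TEX+/- libraries, treating entries
# whose boundaries differ by at most the tolerance as the same transcript
# combine("transcript_fragment.gff", "transcript_tex_notex.gff", 5,
#         "transcript_merge.gff")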
Example #5
0
def read_tag_file(gff_file, ta_file, c_feature):
    region = None
    gffs = []
    tas = []
    stats = {}
    stats["All"] = {"bsae": 0, "bsbe": 0, "asae": 0,
                    "asbe": 0, "other": 0, "gene": 0}
    pre_seq_id = ""
    ta_f = open(ta_file, "r")
    for entry in Gff3Parser().entries(ta_f):
        if entry.seq_id != pre_seq_id:
            pre_seq_id = entry.seq_id
            stats[entry.seq_id] = {"bsae": 0, "bsbe": 0, "asae": 0,
                                   "asbe": 0, "other": 0, "gene": 0}
        entry.attributes = del_attributes(entry, [
                           "_".join(["associated", c_feature]),
                           "_".join(["compare", c_feature])])
        tas.append(entry)
    ta_f.close()
    g_f = open(gff_file, "r")
    for entry in Gff3Parser().entries(g_f):
        if (entry.feature == c_feature):
            ori_parents = []
            if "Parent" in entry.attributes.keys():
                parents = entry.attributes["Parent"].split(",")
                for parent in parents:
                    if "gene" in parent:
                        ori_parents.append(parent)
                if len(ori_parents) == 0:
                    entry.attributes = del_attributes(entry, ["Parent"])
                else:
                    entry.attributes["Parent"] = ",".join(ori_parents)
            if entry.seq_id in stats.keys():
                stats[entry.seq_id]["gene"] += 1
                stats["All"]["gene"] += 1
        if (entry.feature.lower() != "region") and (
                entry.feature.lower() != "source") and (
                entry.feature.lower() != "remark"):
            gffs.append(entry)
        else:
            region = entry
    g_f.close()
    tas = sorted(tas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return gffs, tas, stats, region
Example #6
0
def read_tss(tss_file):
    tsss = []
    if tss_file is not None:
        tss_f = open(tss_file, "r")
        gff_parser = Gff3Parser()
        for entry in gff_parser.entries(tss_f):
            tsss.append(entry)
        tss_f.close()
    num_tss = None
    return tsss, num_tss
Example #7
0
 def _read_gff(self, txt):
     gffs = []
     gh = open(os.path.join(self.gff_path, txt.replace(".txt", ".gff")),
               "r")
     for entry in Gff3Parser().entries(gh):
         if (entry.feature == "gene") or (entry.feature == "CDS") or (
                 entry.feature == "tRNA") or (entry.feature == "rRNA"):
             gffs.append(entry)
     gh.close()
     return gffs
Example #8
0
 def fix_ratt(self, gff_file, strain, out_file):
     out = open(out_file, "w")
     out.write("##gff-version 3\n")
     nums = {"cds": 0, "rna": 0, "gene": 0}
     genes = []
     datas = []
     check_parent = False
     self._read_gff(gff_file, genes, datas, strain)
     for data in datas:
         if data.feature == "gene":
             data = genes[nums["gene"]]
             nums["gene"] += 1
         elif (data.feature == "rRNA") or \
              (data.feature == "tRNA"):
             name = data.attributes["locus_tag"]
             data.attribute_string = ";".join([
                 "ID=rna" + str(nums["rna"]),
                 "Name=" + name, data.attribute_string])
             nums["rna"] += 1
         elif data.feature == "CDS":
             if "protein_id" in data.attributes.keys():
                 name = data.attributes["protein_id"]
             for gene in genes:
                 if ((gene.start <= data.start) and (
                         gene.end >= data.end)) or (
                         gene.attributes["locus_tag"] ==
                         data.attributes["locus_tag"]):
                     data.attribute_string = ";".join([
                         "ID=cds" + str(nums["cds"]), "Name=" + name,
                         "Parent=" + gene.attributes["ID"],
                         data.attribute_string])
                     check_parent = True
                     break
             if check_parent:
                 check_parent = False
             else:
                 data.attribute_string = ";".join([
                     "ID=cds" + str(nums["cds"]),
                     "Name=" + name, data.attribute_string])
             nums["cds"] += 1
         if "group" in data.attributes.keys():
             ref_f = open(gff_file, "r")
             for ref in Gff3Parser().entries(ref_f):
                 if "group" in ref.attributes.keys():
                     if (data.attributes["group"] ==
                             ref.attributes["group"]):
                         if (data.strand != ref.strand):
                             data.strand = ref.strand
                         break
             ref_f.close()
         out.write("\t".join([data.info_without_attributes,
                              data.attribute_string]) + "\n")
     out.close()
Example #9
0
def read_gff(gff_file, tran_file, hypo):
    trans = []
    gffs = []
    gh = open(gff_file)
    for entry in Gff3Parser().entries(gh):
        if (Helper().feature_without_notgene(entry)) and (entry.feature !=
                                                          "sORF"):
            if ("product" in entry.attributes.keys()) and (hypo):
                if "hypothetical protein" not in entry.attributes["product"]:
                    gffs.append(entry)
            else:
                gffs.append(entry)
    th = open(tran_file)
    for entry in Gff3Parser().entries(th):
        trans.append(entry)
    gffs = sorted(gffs, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    trans = sorted(trans, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    gh.close()
    th.close()
    return gffs, trans
Example #10
0
def read_gff(gff_file, features):
    gffs = []
    if not os.path.isfile(gff_file):
        filename = gff_file.split(".")
        gff_file = ".".join(filename[0:-2]) + ".gff"
    g_f = open(gff_file, "r")
    for entry in Gff3Parser().entries(g_f):
        if entry.feature in features:
            gffs.append(entry)
    gffs = sorted(gffs, key=lambda k: (k.seq_id, k.start))
    return gffs
Example #11
0
def read_gff(filename):
    gffs = []
    genes = []
    for entry in Gff3Parser().entries(open(filename)):
        if entry.feature == "gene":
            genes.append(entry)
        gffs.append(entry)
    gffs = sorted(gffs, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    if len(genes) != 0:
        genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return gffs, genes
Example #12
0
def read_gff(seq_file, gff_file, tran_file):
    genome = {}
    genes = []
    trans = []
    for entry in Gff3Parser().entries(open(gff_file)):
        if (entry.feature == "gene"):
            genes.append(entry)
    for entry in Gff3Parser().entries(open(tran_file)):
        trans.append(entry)
    with open(seq_file, "r") as q_h:
        for line in q_h:
            line = line.strip()
            if line.startswith(">"):
                strain = line[1:]
                genome[strain] = ""
            else:
                genome[strain] = genome[strain] + line
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    trans = sorted(trans, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return genes, genome, trans
Example #13
0
def read_predict_manual_gff(gff_file, args_ops):
    num = 0
    gffs = []
    f_h = open(gff_file, "r")
    for entry in Gff3Parser().entries(f_h):
        if (entry.start <= int(args_ops.gene_length)):
            num += 1
            entry.attributes["print"] = False
            gffs.append(entry)
    f_h.close()
    return num, gffs
Example #14
0
def read_gff(filename, index):
    gf = open(filename, "r")
    gff_parser = Gff3Parser()
    datas = []
    for entry in gff_parser.entries(gf):
        entry.attributes[index] = "NA"
        datas.append(entry)
    datas = sorted(datas,
                   key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    gf.close()
    return datas
Example #15
0
def check_overlap(table_file, gff_file):
    out = open(table_file + "tmp", "w")
    gffs = []
    gff_f = open(gff_file, "r")
    for entry in Gff3Parser().entries(gff_f):
        if Helper().feature_without_notgene(entry):
            gffs.append(entry)
    fh = open(table_file, "r")
    out.write("\t".join([
        "Rank", "Genome", "Name", "Start", "End", "Strand",
        "Start_with_TSS/Cleavage_site", "End_with_cleavage", "Candidates",
        "Lib_type", "Best_avg_coverage", "Track/Coverage",
        "Normalized_secondary_energy_change(by_length)", "sRNA_types",
        "Conflict_sORF", "nr_hit_number", "sRNA_hit_number",
        "nr_hit_top3|ID|e-value|score", "sRNA_hit|e-value|score",
        "Overlap_CDS_forward", "Overlap_nts_forward", "Overlap_CDS_reverse",
        "Overlap_nts_reverse", "End_with_terminator", "Associated_promoter",
        "sRNA_length"
    ]) + "\n")
    for row in csv.reader(fh, delimiter='\t'):
        if row[3] != "Start":
            overlaps = {"forward": [], "reverse": [], "CDS_f": [], "CDS_r": []}
            start = int(row[3])
            end = int(row[4])
            for gff in gffs:
                if ((gff.end < end) and (gff.end > start) and
                        (gff.start <= start)) or (
                        (gff.start > start) and (gff.start < end) and
                        (gff.end >= end)) or (
                        (gff.end >= end) and (gff.start <= start)) or (
                        (gff.end <= end) and (gff.start >= start)):
                    overlap = min(gff.end, end) - max(gff.start, start) + 1
                    percent = "{0:.0f}%".format(
                        (float(overlap) / float(end - start + 1)) * 100)
                    if gff.strand == "+":
                        overlaps["forward"].append(
                            str(overlap) + "(" + str(percent) + ")")
                        overlaps["CDS_f"].append(import_cds(gff))
                    else:
                        overlaps["reverse"].append(
                            str(overlap) + "(" + str(percent) + ")")
                        overlaps["CDS_r"].append(import_cds(gff))
            if len(overlaps["forward"]) == 0:
                overlaps["forward"] = ["NA"]
                overlaps["CDS_f"] = ["NA"]
            if len(overlaps["reverse"]) == 0:
                overlaps["reverse"] = ["NA"]
                overlaps["CDS_r"] = ["NA"]
            out.write("\t".join(row[0:19] + [
                ";".join(overlaps["CDS_f"]), ";".join(overlaps["forward"]),
                ";".join(overlaps["CDS_r"]), ";".join(overlaps["reverse"])
            ] + row[21:]) + "\n")
    gff_f.close()
    fh.close()
    out.close()
    shutil.move(table_file + "tmp", table_file)
Example #16
0
def read_gff(gff_file):
    cdss = []
    g_h = open(gff_file)
    for entry in Gff3Parser().entries(g_h):
        if (entry.feature == "CDS") or (entry.feature
                                        == "tRNA") or (entry.feature
                                                       == "rRNA"):
            cdss.append(entry)
    cdss = sorted(cdss, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    g_h.close()
    return cdss
Example #17
0
def read_gffs(gff_files, feature):
    gffs = {}
    if feature == "transcript":
        gffs["transcript"] = []
        gff_f = open(gff_files, "r")
        for entry in Gff3Parser().entries(gff_f):
            gffs["transcript"].append(entry)
        gff_f.close()
        gffs["transcript"] = sorted(gffs["transcript"],
                                    key=lambda x:
                                    (x.seq_id, x.start, x.end, x.strand))
    else:
        num = 0
        for files in gff_files:
            for gff_file in glob(files):
                gffs[num] = []
                gff_f = open(gff_file, "r")
                for entry in Gff3Parser().entries(gff_f):
                    parent = None
                    if entry.feature in (
                            "CDS", "exon", "repeat_unit",
                            "tRNA", "rRNA", "ncRNA"):
                        if "Parent" in entry.attributes.keys():
                            parent = entry.attributes["Parent"]
                    del_attributes(
                        entry,
                        ["associated_tran", "parent_tran", "Parent"])
                    if parent is not None:
                        entry.attributes["Parent"] = parent
                    entry.attributes["print"] = False
                    gffs[num].append(entry)
                gff_f.close()
                gffs[num] = sorted(gffs[num],
                                   key=lambda x:
                                   (x.seq_id, x.start, x.end, x.strand))
                num += 1
    return gffs
Example #18
0
def filter_frag(srna_table, srna_gff):
    out = open("tmp_srna.gff", "w")
    out_ta = open("tmp_srna.csv", "w")
    out.write("##gff-version 3\n")
    gffs = []
    tables = []
    gff_parser = Gff3Parser()
    g_f = open(srna_gff, "r")
    for entry in gff_parser.entries(g_f):
        gffs.append(entry)
    fh = open(srna_table, "r")
    for row in csv.reader(fh, delimiter='\t'):
        tables.append(row)
    new_gffs = []
    for gff in gffs:
        if ("UTR_type" in gff.attributes.keys()):
            if ("5utr" in gff.attributes["UTR_type"]) or (
                    "interCDS" in gff.attributes["UTR_type"]):
                for table in tables:
                    if (gff.seq_id == table[0]) and (
                            gff.start == int(table[2])) and (
                            gff.end == int(table[3])) and (
                            gff.strand == table[4]):
                        if "frag" in table[5]:
                            new_gffs.append(gff)
            elif "3utr" in gff.attributes["UTR_type"]:
                new_gffs.append(gff)
        else:
            new_gffs.append(gff)
    new_tables = []
    for table in tables:
        for gff in new_gffs:
            if (gff.seq_id == table[0]) and (
                 gff.start == int(table[2])) and (
                 gff.end == int(table[3])) and (
                 gff.strand == table[4]):
                new_tables.append(table)
                out_ta.write("\t".join(table) + "\n")
    for gff in new_gffs:
        for table in new_tables:
            if (gff.seq_id == table[0]) and (
                 gff.start == int(table[2])) and (
                 gff.end == int(table[3])) and (
                 gff.strand == table[4]):
                out.write(gff.info + "\n")
    g_f.close()
    fh.close()
    out.close()
    out_ta.close()
    os.remove(srna_gff)
    os.remove(srna_table)
    shutil.move("tmp_srna.gff", srna_gff)
    shutil.move("tmp_srna.csv", srna_table)
Example #19
0
def read_gff(gff_file):
    cdss = []
    genes = []
    g_f = open(gff_file, "r")
    for entry in Gff3Parser().entries(g_f):
        if (Helper().feature_without_notgene(entry)):
            cdss.append(entry)
        if entry.feature == "gene":
            genes.append(entry)
    cdss = sorted(cdss, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return cdss, genes
Example #20
0
def read_gff(gff_file, type_):
    cdss = []
    g_h = open(gff_file)
    for entry in Gff3Parser().entries(g_h):
        if (Helper().feature_without_notgene(entry)):
            if (type_ == "riboswitch") and (entry.feature != "riboswitch"):
                cdss.append(entry)
            elif (type_ == "thermometer") and (
                    entry.feature != "RNA_thermometer"):
                cdss.append(entry)
    cdss = sorted(cdss, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    g_h.close()
    return cdss
Example #21
0
def fill_gap(gff_file, ta_file, type_, output):
    tas = []
    genes = []
    print_list = []
    ta_f = open(ta_file, "r")
    gff_f = open(gff_file, "r")
    for entry in Gff3Parser().entries(ta_f):
        tas.append(entry)
    ta_f.close()
    tas = sorted(tas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    for entry in Gff3Parser().entries(gff_f):
        if (entry.feature == "gene") or (entry.feature == "CDS") or (
                entry.feature == "rRNA") or (entry.feature == "tRNA"):
            genes.append(entry)
    gff_f.close()
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    out = open(output, "w")
    out.write("##gff-version 3\n")
    if type_ == "overlap":
        overlap(tas, genes, print_list, out)
    elif type_ == "uni":
        uni(tas, genes, out)
Example #22
0
def fill_gap(gff_file, ta_file, type_, output, modify):
    '''compare the transcripts with the genome annotation and modify them accordingly'''
    tas = []
    genes = []
    ta_f = open(ta_file, "r")
    gff_f = open(gff_file, "r")
    for entry in Gff3Parser().entries(ta_f):
        tas.append(entry)
    ta_f.close()
    tas = sorted(tas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    for entry in Gff3Parser().entries(gff_f):
        if (entry.feature == "gene") or (entry.feature == "CDS") or (
                entry.feature == "rRNA") or (entry.feature == "tRNA"):
            genes.append(entry)
    gff_f.close()
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    out = open(output, "w")
    out.write("##gff-version 3\n")
    if type_ == "overlap":
        overlap(tas, genes, out, modify)
    elif type_ == "uni":
        uni(tas, genes, out)
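A hedged usage sketch (the file names and the modify value are illustrative assumptions; the "overlap" and "uni" modes correspond to the two branches above, and overlap and uni are helpers from the same module):

# transcripts that overlap annotated gene/CDS/rRNA/tRNA features
# fill_gap("annotation.gff", "transcript.gff", "overlap", "tmp_overlap.gff", True)
# transcripts without any overlapping annotation
# fill_gap("annotation.gff", "transcript.gff", "uni", "tmp_uni.gff", False)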
Example #23
0
def output_coverage(table_file, gff_file, cutoff_cover, stat_file, out_folder):
    out = open(os.path.join(out_folder, "tmp_srna_table"), "w")
    out_g = open(os.path.join(out_folder, "tmp_srna_gff"), "w")
    out.write("\t".join([
        "Rank", "Genome", "Name", "Start", "End", "Strand",
        "Start_with_TSS/Cleavage_site", "End_with_cleavage", "Candidates",
        "Lib_type", "Best_avg_coverage", "Best_highest_coverage",
        "Best_lower_coverage", "Track/Coverage",
        "Normalized_secondary_energy_change(by_length)",
        "UTR_derived/Intergenic", "Confliction_of_sORF", "nr_hit_number",
        "sRNA_hit_number", "nr_hit_top3|ID|e-value", "sRNA_hit|e-value",
        "Overlap_CDS", "Overlap_percent", "End_with_terminator"
    ]) + "\n")
    out_g.write("##gff-version 3\n")
    stat_out = open(stat_file, "w")
    nums = {5: 0}
    for i in range(10, 100, 10):
        nums[i] = 0
    for i in range(100, 1000, 100):
        nums[i] = 0
    for i in range(1000, 5000, 500):
        nums[i] = 0
    gffs = []
    gh = open(gff_file, "r")
    for entry in Gff3Parser().entries(gh):
        gffs.append(entry)
    fh = open(table_file, "r")
    rank = 1
    new_gffs = []
    for row in csv.reader(fh, delimiter='\t'):
        if row[0] != "rank":
            for cutoff in nums.keys():
                if float(row[10]) >= cutoff:
                    nums[cutoff] += 1
            if float(row[10]) >= cutoff_cover:
                row[0] = str(rank)
                out.write("\t".join(row) + "\n")
                rank += 1
                for gff in gffs:
                    if (row[1] == gff.seq_id) and (row[3] == str(
                            gff.start)) and (row[4] == str(
                                gff.end)) and (row[5] == gff.strand):
                        new_gffs.append(gff)
    sort_gffs = sorted(new_gffs,
                       key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    for gff in sort_gffs:
        out_g.write(gff.info + "\n")
    coverlist = sorted(nums, key=lambda key: nums[key])
    stat_out.write("coverage\tfrequency\n")
    for cover in coverlist:
        stat_out.write("\t".join([str(cover), str(nums[cover])]) + "\n")
    gh.close()
    fh.close()
    out.close()
    out_g.close()
    stat_out.close()
Example #24
0
def gen_promoter_table(input_file, output_file, tss_file, type_):
    '''generate the promoter table based on MEME or GLAM2 output'''
    tsss = []
    gff_f = open(tss_file, "r")
    for entry in Gff3Parser().entries(gff_f):
        tsss.append(entry)
    out = open(output_file, "w")
    out.write("\t".join(["Genome", "TSS_position", "TSS_strand", "Motif"]) +
              "\n")
    detect = False
    num = 1
    with open(input_file) as fh:
        for line in fh:
            line = line.strip()
            if type_ == "meme":
                if line.startswith("MOTIF"):
                    motif = line.split("MEME")[0].strip()
                    datas = motif.split(" ")
                    motif = datas[0] + "_" + datas[-1]
                    detect = False
                elif (line.startswith("Sequence name")) and (
                        line.endswith("Site")):
                    detect = True
                elif (len(line) == 0):
                    detect = False
                elif (detect) and (not line.startswith("---")):
                    tag = line.split(" ")[0]
                    datas = tag.split("_")
                    for tss in tsss:
                        if ("_".join(datas[2:])
                                in tss.seq_id) and (datas[0] == str(
                                    tss.start)) and (datas[1] == tss.strand):
                            out.write("\t".join(
                                [tss.seq_id, datas[0], datas[1], motif]) +
                                      "\n")
            elif type_ == "glam2":
                if line.startswith("*"):
                    detect = True
                    motif = "MOTIF_" + str(num)
                    num += 1
                elif len(line) == 0:
                    detect = False
                elif detect:
                    datas = line.split(" ")[0].split("_")
                    for tss in tsss:
                        if ("_".join(datas[2:])
                                in tss.seq_id) and (datas[0] == str(
                                    tss.start)) and (datas[1] == tss.strand):
                            out.write("\t".join(
                                [tss.seq_id, datas[0], datas[1], motif]) +
                                      "\n")
Example #25
0
def read_gff(gff_file):
    cdss = []
    genes = []
    g_f = open(gff_file, "r")
    for entry in Gff3Parser().entries(g_f):
        if (entry.feature == "CDS") or \
           (entry.feature == "rRNA") or \
           (entry.feature == "tRNA"):
            cdss.append(entry)
        if entry.feature == "gene":
            genes.append(entry)
    cdss = sorted(cdss, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return cdss, genes
Example #26
0
 def __init__(self, args_term):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.gff_parser = Gff3Parser()
     self.gff_path = os.path.join(args_term.gffs, "tmp")
     self.fasta_path = os.path.join(args_term.fastas, "tmp")
     self.tran_path = os.path.join(args_term.trans, "tmp")
     self.outfolder = {
         "term": os.path.join(args_term.out_folder, "gffs"),
         "csv": os.path.join(args_term.out_folder, "tables")
     }
     self.terms = {
         "all": os.path.join(self.outfolder["term"], "all_candidates"),
         "express": os.path.join(self.outfolder["term"],
                                 "expressed_candidates"),
         "best": os.path.join(self.outfolder["term"], "best_candidates"),
         "non": os.path.join(self.outfolder["term"],
                             "non_expressed_candidates")
     }
     self.csvs = {
         "all": os.path.join(self.outfolder["csv"], "all_candidates"),
         "express": os.path.join(self.outfolder["csv"],
                                 "expressed_candidates"),
         "best": os.path.join(self.outfolder["csv"], "best_candidates"),
         "non": os.path.join(self.outfolder["csv"],
                             "non_expressed_candidates")
     }
     self.combine_path = os.path.join(self.gff_path, "combine")
     self.tmps = {
         "transterm": os.path.join(os.getcwd(), "tmp_transterm"),
         "hp": "transtermhp",
         "hp_gff": "transtermhp.gff",
         "hp_path": "tmp_transterm/tmp",
         "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
         "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
         "gff": "tmp.gff",
         "folder": os.path.join(os.getcwd(), "tmp")
     }
     self.suffixs = {
         "gff": "term.gff",
         "csv": "term.csv",
         "allgff": "term_all.gff"
     }
     if args_term.srnas:
         self.srna_path = os.path.join(args_term.srnas, "tmp")
     else:
         self.srna_path = None
     self._make_gff_folder()
Example #27
0
def print_coverage(trans, out, out_gff, wigs_f, wigs_r, table_best, gff_file):
    genes = []
    if gff_file is not None:
        gff_f = open(gff_file, "r")
        for entry in Gff3Parser().entries(gff_f):
            if (entry.feature == "gene"):
                genes.append(entry)
    for tran in trans:
        infos = {}
        tran.attributes["detect_lib"] = tran.attributes["detect_lib"].replace(
            "tex_notex", "TEX+/-")
        out.write("\t".join([
            tran.seq_id, tran.attributes["Name"],
            str(tran.start),
            str(tran.end), tran.strand, tran.attributes["detect_lib"]
        ]))
        compare_ta_genes(tran, genes, out)
        print_associate("associated_tss", tran, out)
        print_associate("associated_term", tran, out)
        if tran.strand == "+":
            detect_coverage(wigs_f, tran, infos)
        else:
            detect_coverage(wigs_r, tran, infos)
        out.write("\t")
        best = -1
        best_track = ""
        best_cover = {}
        for track, cover in infos.items():
            if not table_best:
                if best != -1:
                    out.write(";")
                out.write("{0}(avg={1})".format(track, str(cover["avg"])))
            if cover["avg"] > best:
                best = cover["avg"]
                best_track = track
                best_cover = cover
        if table_best:
            out.write("{0}(avg={1})".format(best_track,
                                            str(best_cover["avg"])))
        out.write("\n")
        new_attrs = {}
        for key, value in tran.attributes.items():
            if ("high_coverage" not in key) and ("low_coverage" not in key):
                new_attrs[key] = value
        new_attrs["best_avg_coverage"] = str(best_cover["avg"])
        attribute_string = ";".join(
            ["=".join(items) for items in new_attrs.items()])
        out_gff.write(
            "\t".join([tran.info_without_attributes, attribute_string]) + "\n")
Example #28
0
def read_gff(gff_file, tss_file):
    tsss = []
    gffs = []
    gff_parser = Gff3Parser()
    fh = open(gff_file)
    for gff in gff_parser.entries(fh):
        gffs.append(gff)
    gffs = sorted(gffs, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    fh.close()
    tss_f = open(tss_file, "r")
    for tss in gff_parser.entries(tss_f):
        tsss.append(tss)
    tsss = sorted(tsss, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    tss_f.close()
    return gffs, tsss
Example #29
0
def read_file(seq_file, tran_file, gff_file):
    seq = {}
    tas = []
    genes = []
    merges = []
    with open(seq_file, "r") as f_h:
        for line in f_h:
            line = line.strip()
            if line.startswith(">"):
                strain = line[1:]
                seq[strain] = ""
            else:
                seq[strain] = seq[strain] + line
    ta_fh = open(tran_file, "r")
    for entry in Gff3Parser().entries(ta_fh):
        tas.append(entry)
        merges.append(entry)
    for entry in Gff3Parser().entries(open(gff_file)):
        if (entry.feature == "gene"):
            genes.append(entry)
            merges.append(entry)
    tas = sorted(tas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return seq, tas, merges, genes
Example #30
0
def longer_ta(ta_file, length, out_file):
    '''merge overlapping transcripts to form a complete transcript'''
    tas = []
    for entry in Gff3Parser().entries(open(ta_file)):
        tas.append(entry)
    tas = sorted(tas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    for ta_1 in tas:
        for ta_2 in tas:
            if (ta_1.seq_id == ta_2.seq_id) and (
                    ta_1.strand == ta_2.strand):
                if (ta_1.start <= ta_2.start) and (
                        ta_1.end >= ta_2.start) and (
                        ta_1.end <= ta_2.end):
                    ta_1.end = ta_2.end
                elif (ta_1.start >= ta_2.start) and (
                        ta_1.start <= ta_2.end) and (
                        ta_1.end >= ta_2.end):
                    ta_1.start = ta_2.start
                elif (ta_1.start <= ta_2.start) and (
                        ta_1.end >= ta_2.end):
                    pass
                elif (ta_1.start >= ta_2.start) and (
                        ta_1.end <= ta_2.end):
                    ta_1.start = ta_2.start
                    ta_1.end = ta_2.end
    first = True
    out = open(out_file, "w")
    out.write("##gff-version 3\n")
    num = 0
    pre_ta = None
    tas = sorted(tas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    for ta in tas:
        if (ta.end - ta.start) >= length:
            if first:
                first = False
                print_file(ta, num, out)
                num += 1
            else:
                if (ta.seq_id == pre_ta.seq_id) and (
                        ta.strand == pre_ta.strand) and (
                        ta.start == pre_ta.start) and (
                        ta.end == pre_ta.end):
                    pass
                else:
                    print_file(ta, num, out)
                    num += 1
        pre_ta = ta
    out.close()
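A hedged usage sketch (the file names and the length cutoff are illustrative assumptions; print_file is a helper from the same module):

# merge overlapping transcripts and write only those whose span (end - start)
# is at least 20 nt
# longer_ta("transcript.gff", 20, "transcript_merge.gff")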