Пример #1
0
def main(target, strain_lst):
    """
    Build a table of per-ORF scores for every strain and write it as CSV.

    For each strain the RefSeq GFF is read, every row is looked up in the
    strain's score database (annotation/prodigal/sup/<strain>.sq3) by
    (seqname, start, end), and whatever the lookup returns is merged with
    the row's orf_id. Rows without a hit keep only their orf_id.

    target: dataset name used to build the input and output paths
    strain_lst: strains to process
    output: ../data/<target>/orf2score.csv
    """
    score_columns = [
        "orf_id", "Beg", "End", "Std", "Total", "CodPot", "StrtSc", "Codon",
        "RBSMot", "Spacer", "RBSScr", "UpsScr", "TypeScr", "GCCont"
    ]

    def _collect(strain, sink):
        # append one record per GFF row to sink; return (hits, rows)
        gff_path = "/data/mitsuki/data/denovo/{}/annotation/refseq/gff/{}.gff".format(
            target, strain)
        refseq_df = read_gff(gff_path, ["orf_id"])
        db_path = "/data/mitsuki/data/denovo/{}/annotation/prodigal/sup/{}.sq3".format(
            target, strain)
        controller = ScoreDbController(db_path)

        hits = 0
        for _, gene in refseq_df.iterrows():
            record = {"orf_id": gene["orf_id"]}
            found = controller.info(gene["seqname"], gene["start"], gene["end"])
            if len(found) > 0:
                hits += 1
                record.update(found)
            sink.append(record)
        return hits, refseq_df.shape[0]

    print("START: query about {} strains".format(len(strain_lst)))
    records = []
    for strain in strain_lst:
        hits, total = _collect(strain, records)
        print("\tDONE: found information for {}/{} genes in {}".format(
            hits, total, strain))

    # fix the column order of the output table
    score_df = pd.DataFrame(records)[score_columns]

    outFilepath = "../data/{}/orf2score.csv".format(target)
    score_df.to_csv(outFilepath, index=False)
    print("DONE: output {}".format(outFilepath))
Пример #2
0
def main(strain, clusterFilepath, inFilepath, outFilepath):
    """
    Append ";family=<f1,f2,...>" to the attribute of every GFF row whose
    orf_id appears in the cluster table, then write the GFF back out.

    strain: column of the cluster table holding this strain's orf_ids
            (comma-separated per family)
    clusterFilepath: tab-separated cluster table with a "family" column
    inFilepath: input GFF carrying an orf_id attribute
    outFilepath: destination GFF
    """
    cluster_df = pd.read_csv(clusterFilepath, delimiter="\t")
    member_df = cluster_df[cluster_df[strain].notnull()]

    # key: orf_id , val: list of families the orf belongs to
    orf2family = defaultdict(list)
    for family, joined_ids in zip(member_df["family"], member_df[strain]):
        for orf_id in joined_ids.split(","):
            orf2family[orf_id].append(family)

    # add family information to attribute (assuming every row is CDS)
    gff_df = read_gff(inFilepath, additional_lst=["orf_id"])
    new_attributes = []
    n_assigned = 0
    for _, record in gff_df.iterrows():
        orf_id = record["orf_id"]
        if orf_id not in orf2family:
            # pseudogene: no sequence in FASTA, hence no family information
            new_attributes.append(record["attribute"])
            continue
        n_assigned += 1
        families = ",".join(orf2family[orf_id])
        new_attributes.append("{};family={}".format(record["attribute"],
                                                    families))

    # every orf_id should appear exactly once in gff, as orf_id is unique
    assert n_assigned == len(orf2family)

    gff_df["attribute"] = new_attributes
    write_gff(gff_df, outFilepath)
    print("DONE: output {}".format(outFilepath))
    print("\tassigned family to {}/{} CDS".format(n_assigned,
                                                  gff_df.shape[0]))
Пример #3
0
def main(target, strain_lst):
    """
    Regroup per-strain FASTA records into per-family collections and hand
    them to output_family2rec.

    For every strain the GFF supplies the orf_id -> family mapping; each
    record of the strain's .fna and .faa is then filed under the family of
    its id. A record id missing from the GFF raises KeyError.

    target: dataset name used to build the annotation directory
    strain_lst: strains to process
    """
    annotType = "refseq"
    direc = "/data/mitsuki/data/denovo/{}/annotation/{}".format(target, annotType)

    print("START: parse {} * 2 FASTA files".format(len(strain_lst)))
    family2fna = defaultdict(list)
    family2faa = defaultdict(list)
    for strain in strain_lst:
        gff_df = read_gff("{}/gff/{}.gff".format(direc, strain),
                          ["orf_id", "family"])
        id2family = {row["orf_id"]: row["family"]
                     for _, row in gff_df.iterrows()}

        # nucleotide records first, then protein records
        sources = (
            ("{}/fna/{}.fna".format(direc, strain), family2fna),
            ("{}/faa/{}.faa".format(direc, strain), family2faa),
        )
        for fasta_path, family2rec in sources:
            for rec in SeqIO.parse(fasta_path, "fasta"):
                family2rec[id2family[rec.id]].append(rec)

    print("START: output fna & faa for every family")
    targets = (
        (family2fna, "{}/family/fna".format(direc), "fna"),
        (family2faa, "{}/family/faa".format(direc), "faa"),
    )
    for family2rec, outDirec, ext in targets:
        os.makedirs(outDirec, exist_ok=True)
        output_family2rec(family2rec, outDirec, ext)
        print("\tDONE: output {} family in {}".format(len(family2rec), outDirec))
Пример #4
0
def main(inFilepath, outFilepath):
    """
    add orf_id to the attribute column of a .gff created by prodigal

    orf_id is "<seqname>_<n>", where <n> is the part of the row's ID
    attribute after its last underscore.
    """

    def _with_orf_id(row):
        # suffix of ID after the last "_" identifies the ORF within seqname
        serial = row["ID"].rsplit("_", 1)[-1]
        orf_id = "{}_{}".format(row["seqname"], serial)
        return "{};orf_id={}".format(row["attribute"], orf_id)

    gff_df = read_gff(inFilepath, additional_lst=["ID"])
    gff_df["attribute"] = [_with_orf_id(row) for _, row in gff_df.iterrows()]
    write_gff(gff_df, outFilepath)
    print("DONE: output {}".format(outFilepath))
Пример #5
0
def refseq(inFilepath, outFilepath):
    """
    rewrite the attribute column: drop Parent / ID / gene and set a new ID
    of the form "<family>(<orf_id>)", then write the result to outFilepath
    """
    drop_keys = ("Parent", "ID", "gene")

    def _rewrite(row):
        new_id = "{}({})".format(row["family"], row["orf_id"])
        # a fresh list is passed on every call, as format_attribute is opaque
        return format_attribute(row["attribute"],
                                delCol_lst=list(drop_keys),
                                addCol_dct={"ID": new_id})

    refseq_df = read_gff(inFilepath, ["family", "orf_id"])
    refseq_df["attribute"] = [_rewrite(row) for _, row in refseq_df.iterrows()]
    write_gff(refseq_df, outFilepath)
Пример #6
0
def main(strain, hitFilepath, geneFilepath, overlapFilepath):
    """
    Compute overlaps between hits and annotated genes and write them as CSV.

    strain: not used inside this function; kept for the caller's interface
    hitFilepath: CSV of hits, loaded with pandas
    geneFilepath: GFF of genes carrying orf_id and family attributes
    overlapFilepath: destination CSV, indexed by overlap_id
    """
    hit_df = pd.read_csv(hitFilepath)
    # the GFF path arrives as geneFilepath; the previous code read an
    # undefined name (gffFilepath) here
    gff_df = read_gff(geneFilepath, ["orf_id", "family"])

    ovr_df = get_overlap_df(gff_df, hit_df)
    ovr_df = add_sbjct_pos(ovr_df, gff_df)
    ovr_df = add_query_pos(ovr_df, hit_df)

    # fix both the set and the order of the output columns
    column_lst = [
        "overlap_id", "region_id", "ostart", "oend", "olength", "chr_name",
        "qstrain", "sstrain", "qfamily", "sfamily", "qstrand", "sstrand",
        "qorf_id", "sorf_id", "qostart_dna", "qostart_pro", "qoend_dna",
        "qoend_pro", "sostart_dna", "sostart_pro", "soend_dna", "soend_pro",
        "qstart", "qend", "sstart", "send", "cstart", "cend", "qosp", "qoep"
    ]
    ovr_df = ovr_df[column_lst]
    ovr_df = ovr_df.set_index("overlap_id")
    ovr_df.to_csv(overlapFilepath)
    print("DONE: {} overlaps in {}".format(ovr_df.shape[0], overlapFilepath))
Пример #7
0
def get_orf2synteny(strain_lst, gffDirec):
    """
    Describe the genomic neighbourhood of every ORF by family name.

    key: orf_id
    val: dictionary with
         "neighbor": families of up to 3 ORFs before and 3 after on the
                     same seqname (ordered by start, end)
         "left" / "right": family of the adjacent ORF on either side, with
                     the sides swapped for "-" strand ORFs; "" when absent

    Rows without family information are ignored entirely.
    """
    orf2synteny = {}
    for strain in strain_lst:
        gff_df = read_gff("{}/{}.gff".format(gffDirec, strain),
                          ["orf_id", "family"])
        # pseudogenes carry no family information, so drop them up front
        gff_df = gff_df[gff_df["family"].notnull()]

        for seqname in set(gff_df["seqname"]):
            contig_df = (gff_df[gff_df["seqname"] == seqname]
                         .copy()
                         .sort_values(by=["start", "end"])
                         .reset_index(drop=True))
            families = list(contig_df["family"])

            # after reset_index the index equals the position in families
            for idx, row in contig_df.iterrows():
                before = families[max(0, idx - 3):idx]
                after = families[idx + 1:idx + 4]
                synteny = {"neighbor": before + after, "left": "", "right": ""}

                # (adjacent family if any, side on "+", side on "-")
                flanks = (
                    (before[-1:], "left", "right"),
                    (after[:1], "right", "left"),
                )
                for adjacent, plus_side, minus_side in flanks:
                    if not adjacent:
                        continue
                    strand = row["strand"]
                    if strand == "+":
                        synteny[plus_side] = adjacent[0]
                    elif strand == "-":
                        synteny[minus_side] = adjacent[0]
                    else:
                        print("ERROR: unknown strand {}".format(strand))
                orf2synteny[row["orf_id"]] = synteny
    return orf2synteny
Пример #8
0
def main():
    """
    Write one line per gene and one line per merged exon for the chromosome
    selected on the command line.

    Each line is "gene_id gene_name chrom number start end strand", where
    number is the 1-based gene counter in the gene file and the 1-based
    exon counter (restarting for every gene) in the exon file.
    """
    options = parse_options()

    # context managers make sure both files are closed even when reading
    # the annotation or writing a line fails
    with open(options.exon_output_filename, "w") as exon_out_f, \
            open(options.gene_output_filename, "w") as gene_out_f:
        util.info.write_info(exon_out_f, options)
        util.info.write_info(gene_out_f, options)

        chrom_dict = chromosome.get_chromosome_dict(options.chrom_file)

        # only the per-chromosome gene table is needed here
        _, _, gene_chrom_dict, _ = \
            gff.read_gff(options.gff, chrom_dict,
                         region_chrom=options.chrom,
                         region_start=options.start,
                         region_end=options.end)

        genes = gene_chrom_dict[options.chrom]
        for gene_num, gene in enumerate(genes, start=1):
            exons = gene.get_merged_exons()

            gene_out_f.write("%s %s %s %d %d %d %d\n" %
                             (gene.gene_id, gene.gene_name, gene.chrom.name,
                              gene_num, gene.start, gene.end, gene.strand))

            # exons of strand -1 genes are numbered from the other end
            # NOTE(review): presumably get_merged_exons returns them in
            # ascending coordinate order -- confirm
            if gene.strand == -1:
                exons = exons[::-1]

            for exon_num, ex in enumerate(exons, start=1):
                exon_out_f.write("%s %s %s %d %d %d %d\n" %
                                 (gene.gene_id, gene.gene_name,
                                  gene.chrom.name, exon_num,
                                  ex.start, ex.end, gene.strand))
Пример #9
0
def filter_gff(inFilepath, outFilepath):
    """
    drop every row whose seqname is "train" and write the rest to outFilepath
    """
    gff_df = read_gff(inFilepath)
    filtered_df = gff_df[gff_df["seqname"] != "train"]
    # write_gff is called as (dataframe, path) at every other call site in
    # this file; the two arguments were swapped here
    write_gff(filtered_df, outFilepath)
Пример #10
0
def calc_stat(target, strain):
    """
    Sum up, over all sequences of a strain, how much sequence is genic,
    pseudogenic or intergenic and how much of each is covered by hits.

    Inputs are the strain's .dnaseq FASTA, its blastn hit CSV and its
    RefSeq GFF. GFF rows with a non-null "pseudo" attribute count as
    pseudogenes; everything not annotated is intergenic. Annotation and hit
    coordinates are turned into Interval(start - 1, end).

    Returns a dict with keys length, genic_sum, pseudo_sum, inter_sum,
    hit_sum, hit_justsum, genic_hit_sum, pseudo_hit_sum, inter_hit_sum.
    """
    seqFilepath = "/data/mitsuki/data/denovo/{}/dnaseq/{}.dnaseq".format(target, strain)
    rec_lst = list(SeqIO.parse(seqFilepath, "fasta"))

    hit_df = pd.read_csv("../blastn/result/{}/{}.csv".format(target, strain))

    gff_df = read_gff(
        "/data/mitsuki/data/denovo/{}/annotation/refseq/gff/{}.gff".format(target, strain),
        ["pseudo"])
    # without a pseudo column no row is a pseudogene
    has_pseudo = "pseudo" in gff_df.columns
    gff_df["pseudo_b"] = gff_df["pseudo"].notnull() if has_pseudo else False

    stat = dict.fromkeys([
        "length", "genic_sum", "pseudo_sum", "inter_sum", "hit_sum",
        "hit_justsum", "genic_hit_sum", "pseudo_hit_sum", "inter_hit_sum"
    ], 0)

    for rec in rec_lst:
        seqname = rec.id

        # split the annotation of this sequence into genic / pseudo
        genic_lst, pseudo_lst = [], []
        for _, row in gff_df[gff_df["seqname"] == seqname].iterrows():
            bucket = pseudo_lst if row["pseudo_b"] else genic_lst
            bucket.append(Interval(row["start"] - 1, row["end"]))
        inter_lst = interval_not(genic_lst + pseudo_lst, len(rec))

        # hits on the other strand have sstart / send swapped
        hit_lst = []
        for _, row in hit_df[hit_df["sseqid"] == seqname].iterrows():
            if row["hit_strand"] == 1:
                first, last = row["sstart"], row["send"]
            else:
                first, last = row["send"], row["sstart"]
            hit_lst.append(Interval(first - 1, last))

        stat["length"] += len(rec)
        stat["genic_sum"] += interval_sum(genic_lst)
        stat["pseudo_sum"] += interval_sum(pseudo_lst)
        stat["inter_sum"] += interval_sum(inter_lst)
        stat["hit_sum"] += interval_sum(hit_lst)
        stat["hit_justsum"] += interval_justsum(hit_lst)
        stat["genic_hit_sum"] += interval_and(genic_lst, hit_lst)
        stat["pseudo_hit_sum"] += interval_and(pseudo_lst, hit_lst)
        stat["inter_hit_sum"] += interval_and(inter_lst, hit_lst)

    return stat
Пример #11
0
def main(strain, seedFilepath, gffFilepath):
    """
    Build four simulated genomes from one seed sequence and write each to
    ../data/dnaseq/<strain>_sim<i>.dnaseq.

    The seed is cut into alternating non-coding ("nc") and coding ("c")
    regions along the rows of the GFF. Non-coding regions are copied
    unchanged; coding regions are evolved with pyvolve along the tree in
    tmp.tree, keeping their first and last three bases fixed.

    strain: prefix of the simulated genome ids
    seedFilepath: FASTA file; only its first record is used
    gffFilepath: GFF whose rows delimit the coding regions
    """
    # take the first record only
    # NOTE(review): seedRec stays undefined if the FASTA holds no record
    for record in SeqIO.parse(seedFilepath, "fasta"):
        seedRec = record
        break
    gff_df = read_gff(gffFilepath)

    # split the seed into (category, start, end, strand) regions
    # NOTE(review): assumes rows are sorted by position and do not overlap,
    # since each gap is taken from the previous row's end -- confirm
    prv = 0
    pos_lst = []
    for _, row in gff_df.iterrows():
        pos_lst.append(("nc", prv, row["start"] - 1, "+"))
        pos_lst.append(("c", row["start"] - 1, row["end"], row["strand"]))
        prv = row["end"]
    pos_lst.append(("nc", prv, len(seedRec), "+"))

    # configuration for evolution
    treeFilepath = "tmp.tree"
    mytree = pyvolve.read_tree(file=treeFilepath)
    ncm = pyvolve.Model("nucleotide")  # non-coding model
    cm = pyvolve.Model("ECMrest")  # coding model

    outputSeq_lst = [Seq("") for _ in range(4)]  # assuming tree has 4 nodes
    for pos in pos_lst:
        category, start, end, strand = pos

        # get rootSeq according to start, end, strand info
        # ("-" strand regions are evolved on their reverse complement)
        rootSeq = seedRec.seq[start:end]
        if strand == "-":
            rootSeq = rootSeq.reverse_complement()
        rootSeq = str(rootSeq)

        # get simulated sequences
        if category == "nc":
            # evolution of non-coding regions is disabled: the seed
            # sequence is copied as is into each of the four outputs
            #            partition = pyvolve.Partition(models = ncm, root_sequence = rootSeq)
            #            evolver = pyvolve.Evolver(partition = partition, tree = mytree)
            #            rec_lst = get_evolved(evolver)
            rec_lst = [SeqRecord(Seq(rootSeq)) for _ in range(4)]
        elif category == "c":
            partition = pyvolve.Partition(
                models=cm,
                root_sequence=rootSeq[3:-3])  #remove start & stop codon
            evolver = pyvolve.Evolver(partition=partition, tree=mytree)
            rec_lst = get_evolved(evolver)
            for rec in rec_lst:
                rec.seq = rootSeq[:3] + rec.seq + rootSeq[
                    -3:]  #add the original start & stop codon back
        # one simulated sequence per output genome is required
        assert len(rec_lst) == len(outputSeq_lst)

        # concat to outputSeq_lst, restoring the original orientation
        for i, rec in enumerate(rec_lst):
            simSeq = rec.seq
            if strand == "-":
                simSeq = simSeq.reverse_complement()
            outputSeq_lst[i] += simSeq

    # write every simulated genome as a single-record FASTA
    for i, outputSeq in enumerate(outputSeq_lst):
        genomeId = "{}_sim{}".format(strain, i + 1)
        outFilepath = "../data/dnaseq/{}.dnaseq".format(genomeId)
        with open(outFilepath, "w") as f:
            seqname = "{}:seq".format(genomeId)
            rec = SeqRecord(outputSeq, id=seqname, description="")
            SeqIO.write(rec, f, "fasta")
        print("DONE: output {}".format(outFilepath))
Пример #12
0
import sys
import os
import pandas as pd

sys.path.append("../helper")
from gff import read_gff
from myio import *

# Sanity check for one target: the cluster table must have the expected
# columns and every strain's GFF must carry family information.
# exit status: 0 = ok, 1 = cluster table incomplete, 2 = GFF lacks family
target = sys.argv[1]
annotType = "refseq"
strain_lst = get_strain_lst(target)

# check cluster.tsv
cluster_df = get_cluster_df(target)
for col in ["family", "lineage", "size"] + strain_lst:
    if col not in cluster_df.columns:
        # the table is located by get_cluster_df, so no file path is known
        # here; name the target instead of an undefined path variable
        print("ERROR: cluster table of {} does not have column {}".format(target, col), file=sys.stderr)
        sys.exit(1)

# check gff
for strain in strain_lst:
    gffFilepath = "/data/mitsuki/data/denovo/{}/annotation/{}/gff/{}.gff".format(target, annotType, strain)
    try:
        # only the KeyError matters here; the parsed table is discarded
        read_gff(gffFilepath, ["family"])
    except KeyError:
        print("ERROR: {} does not have family information".format(gffFilepath), file=sys.stderr)
        sys.exit(2)

# sys.exit is used instead of the exit() helper, which only exists when the
# site module is loaded
sys.exit(0)
Пример #13
0
#!/usr/bin/env python3

import sys
import os
sys.path.append("../helper")
from myio import get_strain_lst
from gff import read_gff

# Sanity check for one target: every strain needs its dnaseq / fna / faa /
# gff files, and the GFF must provide an orf_id column.
# exit status: 0 = ok, 1 = a file is missing, 2 = GFF lacks orf_id
target = sys.argv[1]
strain_lst = get_strain_lst(target, full=True)
direc = "/data/mitsuki/data/denovo/{}".format(target)
for strain in strain_lst:
    dnafp = "{}/dnaseq/{}.dnaseq".format(direc, strain)
    fnafp = "{}/annotation/refseq/fna/{}.fna".format(direc, strain)
    faafp = "{}/annotation/refseq/faa/{}.faa".format(direc, strain)
    gfffp = "{}/annotation/refseq/gff/{}.gff".format(direc, strain)

    # the GFF is checked as well, so that a missing file is reported like
    # the others instead of surfacing as a traceback from read_gff
    for fp in (dnafp, fnafp, faafp, gfffp):
        if not os.path.isfile(fp):
            print("ERROR: {} not found".format(fp), file=sys.stderr)
            sys.exit(1)

    gff_df = read_gff(gfffp, ["orf_id"])
    if "orf_id" not in gff_df.columns:
        print("ERROR: orf_id not found in {}".format(gfffp), file=sys.stderr)
        sys.exit(2)

# sys.exit is used instead of the exit() helper, which only exists when the
# site module is loaded
sys.exit(0)