예제 #1
0
def MakeGeneraFastas(fin_taxonomy,fin_repset):
    """Split representative sequences into one FASTA file per genus.

    Every sequence in the representative-set FASTA file is looked up in
    the taxonomy file by its label, and the sequences are written to
    ``g__<genus>_seqs.fasta`` files (one per genus) in the current
    working directory.

    Parameters
    ----------
    fin_taxonomy : str
        Path to the taxonomy file.  Each non-blank line must be
        ``<accession ID><TAB><lineage>``, where the lineage is a
        ``;``-separated string whose second-to-last field is the genus.
        A leading ``g__`` on the genus is removed.
    fin_repset : str
        Path to the representative-sequence FASTA file.  It is read with
        ``parse_fasta(..., ignore_comment=True)``; the labels it yields
        are used as accession IDs (parse_fasta is assumed to be the
        legacy scikit-bio parser -- confirm against the file's imports).

    Returns
    -------
    tuple
        ``(repsetdic, taxdic, taxgendic, repsetIDlist, repgenlist,
        generaSeqIDdic)`` where

        * ``repsetdic`` maps sequence label -> sequence,
        * ``taxdic`` maps accession ID -> lineage string exactly as read
          (including any trailing newline),
        * ``taxgendic`` maps accession ID -> genus,
        * ``repsetIDlist`` is ``repsetdic.keys()``,
        * ``repgenlist`` lists each genus of the representative set once,
          in first-seen order,
        * ``generaSeqIDdic`` maps genus -> list of representative IDs.

    Raises
    ------
    KeyError
        If a representative sequence label is missing from the taxonomy
        file.
    IndexError
        If a non-blank taxonomy line has no tab, or its lineage has
        fewer than two ``;``-separated fields.

    Notes
    -----
    The six returned objects are also bound to module-level globals of
    the same names.

    After the FASTA files are written, *every* empty regular file in the
    current working directory is deleted, not only files created by this
    call.
    """
    global repsetdic,taxdic,taxgendic,repsetIDlist,repgenlist,generaSeqIDdic
    repsetdic = {}
    taxdic = {}
    taxgendic = {}
    # Plain text mode replaces the old "U" flag, which Python 3.11 rejects;
    # the context manager closes both handles even if parsing fails.
    with open(fin_repset) as repset_handle, open(fin_taxonomy) as taxonomy_handle:
        for label, seq in parse_fasta(repset_handle,ignore_comment=True):
            repsetdic[label] = seq
        for line in taxonomy_handle:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.split("\t")
            accessionID = fields[0]
            taxonomyline = fields[1]
            # Genus is the second-to-last rank; the last one is the species.
            genus = taxonomyline.split(";")[-2]
            if genus.startswith("g__"):
                genus = genus[3:]
            taxgendic[accessionID] = genus
            taxdic[accessionID] = taxonomyline
    repsetIDlist = repsetdic.keys()
    repgenlist = []
    generaSeqIDdic = {}
    # Dict membership keeps the de-duplication linear while repgenlist
    # preserves first-seen order.
    for seq_id in repsetIDlist:
        genus = taxgendic[seq_id]
        if genus not in generaSeqIDdic:
            generaSeqIDdic[genus] = []
            repgenlist.append(genus)
    # IDs are grouped in taxonomy-file iteration order, as before.
    for accessionID, genus in taxgendic.items():
        if accessionID in repsetdic:
            generaSeqIDdic[genus].append(accessionID)
    from skbio.sequence import BiologicalSequence
    for genus, seq_ids in generaSeqIDdic.items():
        with open("g__"+genus+"_seqs.fasta","w") as fout:
            for seq_id in seq_ids:
                record = BiologicalSequence(repsetdic[seq_id],id=seq_id)
                fout.write(record.to_fasta(terminal_character=""))
                fout.write("\n")
    cwd = os.getcwd()
    # NOTE: this sweeps the whole working directory, not just our outputs.
    # Only regular files are considered so that dangling symlinks and
    # zero-size directories no longer abort the sweep.
    for entry in os.listdir(cwd):
        path = os.path.join(cwd, entry)
        if os.path.isfile(path) and os.path.getsize(path) < 1:
            os.remove(path)
    return repsetdic,taxdic,taxgendic,repsetIDlist,repgenlist,generaSeqIDdic
def make_genera_fastas(fin_taxonomy, fin_repset):
    """Sort ITS representative sequences into one FASTA file per genus.

    The genus of every OTU is found by looking up the label of its
    representative sequence in the Unite taxonomy file.  The sequences
    are then written to ``g__<genus>_seqs.fasta`` files in the current
    working directory, one file per genus with one or more OTUs/species
    each, so that OTUs can be compared to other OTUs of the same genus.

    Parameters
    ----------
    fin_taxonomy : str
        Path to the taxonomy file.  Each non-blank line holds an
        accession ID and a lineage separated by a tab.  The lineage is
        split on ``;`` and its second-to-last field is taken as the
        genus, with a leading ``g__`` removed.
    fin_repset : str
        Path to the representative-sequence FASTA file, in which each
        sequence represents all of the OTUs in one cluster.  It is read
        with ``parse_fasta(..., ignore_comment=True)`` and the labels it
        yields must be accession IDs present in the taxonomy file.

    Returns
    -------
    list
        ``repgenlist``: every genus found among the representative
        sequences, listed once, in first-seen order.  The same list is
        also bound to the module-level global ``repgenlist``.

    Raises
    ------
    KeyError
        If a representative sequence label is missing from the taxonomy
        file.
    IndexError
        If a non-blank taxonomy line has no tab, or its lineage has
        fewer than two ``;``-separated fields.

    Notes
    -----
    After the FASTA files are written, every empty regular file in the
    current working directory is deleted, not only files created by this
    call.

    Examples
    --------
    One representative sequence from the input FASTA file::

        >AB015922 Some_comment_ie_sample_location
        CAGAGCCAAGAGATCCGTTGTTGAAAGTTTTTTCAATTCAAGAATAAAACTTAGACTGCAAAG
        ACAACATGAGTTTGGTTTGGGTCTTTGGCGGACACGCTCCAGCCGAAGCCGGTGGGCGGCCGA
        CGCCAGTCCTCACGAACAGCGCCGACGTAGCCCGGCCCGCCAAAGCAACAAGATATAAATCGA
        CACGGGTGGGAGGGTCGACCCAGCACGC

    One taxonomy line (a single line in the file, wrapped here; the ID
    and the lineage are separated by a tab)::

        AY880934 k__Fungi;p__Basidiomycota;c__Agaricomycetes;
        o__Thelephorales;f__Thelephoraceae;g__Thelephora;
        s__Thelephora_terrestris

    """
    global repgenlist
    repsetdic = {}
    taxgendic = {}
    # Default text mode replaces the "U" flag (rejected since Python 3.11),
    # and ``with`` guarantees both inputs are closed on any error.
    with open(fin_repset) as repset_handle, \
            open(fin_taxonomy) as taxonomy_handle:
        for label, seq in parse_fasta(repset_handle, ignore_comment=True):
            repsetdic[label] = seq
        for line in taxonomy_handle:
            if not line.strip():
                # A blank line carries no record; skip it instead of
                # failing on the missing second column.
                continue
            fields = line.split("\t")
            accessionID = fields[0]
            taxonomyline = fields[1]
            # Second-to-last rank is the genus; the last is the species.
            genus = taxonomyline.split(";")[-2]
            if genus.startswith("g__"):
                genus = genus[3:]
            taxgendic[accessionID] = genus
    repgenlist = []
    generaSeqIDdic = {}
    # The dict doubles as the "already seen" set, so de-duplicating the
    # genera is linear; repgenlist keeps first-seen order.
    for seq_id in repsetdic:
        genus = taxgendic[seq_id]
        if genus not in generaSeqIDdic:
            generaSeqIDdic[genus] = []
            repgenlist.append(genus)
    # Group representative IDs by genus, in taxonomy iteration order.
    for accessionID, genus in taxgendic.items():
        if accessionID in repsetdic:
            generaSeqIDdic[genus].append(accessionID)
    from skbio.sequence import BiologicalSequence
    for genus, seq_ids in generaSeqIDdic.items():
        with open("g__" + genus + "_seqs.fasta", "w") as fout:
            for seq_id in seq_ids:
                record = BiologicalSequence(repsetdic[seq_id], id=seq_id)
                fout.write(record.to_fasta(terminal_character=""))
                fout.write("\n")
    # ``cwd`` was never defined in this function; the relative paths used
    # by the sweep only ever made sense for the process's current directory.
    cwd = os.getcwd()
    for entry in os.listdir(cwd):
        path = os.path.join(cwd, entry)
        # Regular files only: dangling symlinks and directories would
        # otherwise raise from getsize()/remove().
        if os.path.isfile(path) and os.path.getsize(path) < 1:
            os.remove(path)
    return repgenlist
def make_genera_fastas(fin_taxonomy,fin_repset):
    """Write the ITS representative sequences to per-genus FASTA files.

    Each representative sequence is assigned to a genus by matching its
    label against the accession IDs of the Unite taxonomy file.  All
    sequences of one genus end up together in
    ``g__<genus>_seqs.fasta`` (created in the current working
    directory), which allows OTUs to be compared to other OTUs from the
    same genus.

    Parameters
    ----------
    fin_taxonomy : str
        Path to the tab-separated taxonomy file: accession ID in the
        first column, ``;``-separated lineage in the second.  The genus
        is the second-to-last lineage field, without its ``g__`` prefix
        if it has one.
    fin_repset : str
        Path to the FASTA file of representative sequences (one per OTU
        cluster).  Parsed with ``parse_fasta(..., ignore_comment=True)``;
        every label it yields must be an accession ID listed in the
        taxonomy file.

    Returns
    -------
    list
        The unique genera of the representative sequences in first-seen
        order.  The list is also stored in the module-level global
        ``repgenlist``.

    Raises
    ------
    KeyError
        If the taxonomy file has no entry for a representative sequence.
    IndexError
        If a non-blank taxonomy line lacks the tab-separated lineage
        column or the lineage has fewer than two fields.

    Notes
    -----
    Once the genus files exist, all empty regular files found in the
    current working directory are removed -- including files this
    function did not create.

    Examples
    --------
    Example of one representative FASTA sequence from the input file::

        >AB015922 Some_comment_ie_sample_location
        CAGAGCCAAGAGATCCGTTGTTGAAAGTTTTTTCAATTCAAGAATAAAACTTAGACTGCAAAG
        ACAACATGAGTTTGGTTTGGGTCTTTGGCGGACACGCTCCAGCCGAAGCCGGTGGGCGGCCGA
        CGCCAGTCCTCACGAACAGCGCCGACGTAGCCCGGCCCGCCAAAGCAACAAGATATAAATCGA
        CACGGGTGGGAGGGTCGACCCAGCACGC

    Example of a taxonomy line (one physical line, wrapped here; ID and
    lineage are tab-separated)::

        AY880934 k__Fungi;p__Basidiomycota;c__Agaricomycetes;
        o__Thelephorales;f__Thelephoraceae;g__Thelephora;
        s__Thelephora_terrestris

    """
    global repgenlist
    # The "U" open mode is gone in Python 3.11; text mode is equivalent on
    # Python 3.  Both inputs are closed by the context manager on any exit.
    with open(fin_repset) as repset_file, open(fin_taxonomy) as taxonomy_file:
        repsetdic = dict(parse_fasta(repset_file,ignore_comment=True))
        taxgendic = {}
        for line in taxonomy_file:
            if not line.strip():
                # Blank lines hold no record and used to raise IndexError.
                continue
            columns = line.split("\t")
            accessionID = columns[0]
            # Lineage ends "...;<genus>;<species>", hence index -2.
            genus = columns[1].split(";")[-2]
            if genus.startswith("g__"):
                genus = genus[3:]
            taxgendic[accessionID] = genus
    # Unique genera of the representative set, first-seen order; the set
    # avoids rescanning the list for every sequence.
    repgenlist = []
    seen_genera = set()
    for seq_id in repsetdic:
        genus = taxgendic[seq_id]
        if genus not in seen_genera:
            seen_genera.add(genus)
            repgenlist.append(genus)
    generaSeqIDdic = dict((genus,[]) for genus in repgenlist)
    # Fill each genus bucket with its representative IDs, following the
    # taxonomy dict's iteration order.
    for accessionID in taxgendic:
        if accessionID in repsetdic:
            generaSeqIDdic[taxgendic[accessionID]].append(accessionID)
    from skbio.sequence import BiologicalSequence
    for genus in generaSeqIDdic:
        with open("g__"+genus+"_seqs.fasta","w") as fout:
            for seq_id in generaSeqIDdic[genus]:
                fasta_record = BiologicalSequence(repsetdic[seq_id],id=seq_id)
                fout.write(fasta_record.to_fasta(terminal_character=""))
                fout.write("\n")
    # ``cwd`` was referenced without ever being assigned here.  The sweep
    # below addresses entries relative to the current directory, so that is
    # the only directory it can correctly list.
    cwd = os.getcwd()
    for entry in os.listdir(cwd):
        candidate = os.path.join(cwd,entry)
        # Skip anything that is not a regular file (directories, broken
        # symlinks) so getsize()/remove() cannot raise on it.
        if not os.path.isfile(candidate):
            continue
        if os.path.getsize(candidate) < 1:
            os.remove(candidate)
    return repgenlist