示例#1
0
    def test_dn_ds(self):
        """Compare cal_dn_ds results with reference dN/dS values.

        The first two alignment records are compared with the NG86 and
        LWL85 methods unconditionally. The YN00 check runs only when scipy
        can be imported, and the ML check only when scipy.optimize.minimize
        can be imported as well; when they cannot, the remaining checks
        are silently skipped.
        """
        from Bio.codonalign.codonseq import cal_dn_ds

        codon_seq1 = self.aln[0]
        codon_seq2 = self.aln[1]

        def check(method, expected_dn, expected_ds):
            # Assert that `method` reproduces the reference values.
            dN, dS = cal_dn_ds(codon_seq1, codon_seq2, method=method)
            self.assertAlmostEqual(round(dN, 4), expected_dn, places=4)
            self.assertAlmostEqual(round(dS, 4), expected_ds, places=4)

        check('NG86', 0.0209, 0.0178)
        check('LWL85', 0.0203, 0.0164)

        try:
            import scipy  # noqa: F401 - availability probe only
        except ImportError:
            # Silently skip the rest of the test
            return

        # This should be present:
        from scipy.linalg import expm  # noqa: F401 - availability probe only
        check('YN00', 0.0198, 0.0222)

        # Only the import is guarded: an ImportError raised while computing
        # or asserting the ML values must surface instead of being swallowed.
        try:
            # New in scipy v0.11
            from scipy.optimize import minimize  # noqa: F401
        except ImportError:
            # TODO - Show a warning?
            return
        check('ML', 0.0194, 0.0217)
示例#2
0
    def test_dn_ds(self):
        """Check cal_dn_ds against reference dN/dS values, method by method.

        NG86 and LWL85 are always checked on the first two alignment
        records. YN00 is checked only if scipy can be imported, and ML only
        if scipy.optimize.minimize can be imported too; otherwise the rest
        of the test is silently skipped.
        """
        from Bio.codonalign.codonseq import cal_dn_ds

        first, second = self.aln[0], self.aln[1]

        def assert_rates(method, reference):
            # reference is the expected (dN, dS) pair, compared to 4 places.
            dN, dS = cal_dn_ds(first, second, method=method)
            self.assertAlmostEqual(round(dN, 4), reference[0], places=4)
            self.assertAlmostEqual(round(dS, 4), reference[1], places=4)

        for method, reference in (('NG86', (0.0209, 0.0178)),
                                  ('LWL85', (0.0203, 0.0164))):
            assert_rates(method, reference)

        try:
            import scipy  # noqa: F401 - availability probe only
        except ImportError:
            # Silently skip the rest of the test
            return

        # This should be present:
        from scipy.linalg import expm  # noqa: F401 - availability probe only
        assert_rates('YN00', (0.0198, 0.0222))

        # Keep the guarded region to the import alone so that an ImportError
        # raised during the ML computation itself is not hidden.
        try:
            # New in scipy v0.11
            from scipy.optimize import minimize  # noqa: F401
        except ImportError:
            # TODO - Show a warning?
            return
        assert_rates('ML', (0.0194, 0.0217))
示例#3
0
def calculate_piN_piS(codonseqs, method, codon_table, het=False):
    """Calculate piN, piS and piN/piS for a list of CodonSeq() objects.

    Every unordered pair (i, j) of sequences contributes
    ``x[i] * x[j] * dN`` to piN and ``x[i] * x[j] * dS`` to piS, where
    ``x = seqfreqs(codonseqs)`` and ``dN, dS`` come from cal_dn_ds.

    Arguments:
     - codonseqs    - list of CodonSeq() objects.
     - method       - method name forwarded to cal_dn_ds; also stored in
                      the result under "method".
     - codon_table  - codon table forwarded to cal_dn_ds.
     - het          - when true, a pair whose computation raises is
                      skipped; when false, the exception propagates.

    Returns a dict with the keys "seqname" (always ""), "piN", "piS",
    "piNpiS" (0 when piS is zero), "pi" (left at -1; it is not computed
    here) and "method".
    """
    from itertools import combinations

    analysis = {
        "seqname": "",
        "piN": -1,
        "piS": -1,
        "piNpiS": -1,
        "pi": -1,
        "method": method
    }
    x = seqfreqs(codonseqs)
    piN = 0
    piS = 0
    # combinations() yields (0, 1), (0, 2), ..., i.e. every i < j pair in
    # the same order as a nested index loop.
    for i, j in combinations(range(len(codonseqs)), 2):
        try:
            dN, dS = cal_dn_ds(codonseqs[i],
                               codonseqs[j],
                               codon_table=codon_table,
                               method=method)
            piN_term = x[i] * x[j] * dN
            piS_term = x[i] * x[j] * dS
        except Exception:
            if not het:
                raise
            # Best effort in het mode: ignore pairs that cannot be scored.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            continue
        piN = piN + piN_term
        piS = piS + piS_term

    analysis['piN'] = piN
    analysis['piS'] = piS
    try:
        analysis['piNpiS'] = piN / piS
    except ZeroDivisionError:
        analysis['piNpiS'] = 0

    return analysis
示例#4
0
    def get_dn_ds_matrix(self, method="NG86", codon_table=default_codon_table):
        """Build pairwise dN and dS distance matrices for the records.

        Argument:
            - method       - Method forwarded to cal_dn_ds. Available
                             methods include NG86, LWL85, YN00 and ML.
            - codon_table  - Codon table forwarded to cal_dn_ds.

        Returns a (dN matrix, dS matrix) tuple. Both are lower-triangular
        _DistanceMatrix objects labelled with the record ids, with 0.0 on
        the diagonal.
        """
        from Bio.Phylo.TreeConstruction import _DistanceMatrix as DM
        labels = [record.id for record in self._records]
        count = len(self._records)
        dn_rows = []
        ds_rows = []
        for row in range(count):
            dn_row = []
            ds_row = []
            dn_rows.append(dn_row)
            ds_rows.append(ds_row)
            # Off-diagonal entries: compare this record with each earlier one.
            for col in range(row):
                dn, ds = cal_dn_ds(self._records[row],
                                   self._records[col],
                                   method=method,
                                   codon_table=codon_table)
                dn_row.append(dn)
                ds_row.append(ds)
            # Diagonal entry: a record compared with itself.
            dn_row.append(0.0)
            ds_row.append(0.0)
        return DM(labels, matrix=dn_rows), DM(labels, matrix=ds_rows)
示例#5
0
    def get_dn_ds_matrix(self, method="NG86", codon_table=default_codon_table):
        """Compute lower-triangular dN and dS matrices over all record pairs.

        Argument:
            - method       - Method forwarded to cal_dn_ds. Available
                             methods include NG86, LWL85, YN00 and ML.
            - codon_table  - Codon table forwarded to cal_dn_ds.

        Returns a tuple of two _DistanceMatrix objects (dN first, dS
        second) labelled with the record ids; diagonal entries are 0.0.
        """
        from Bio.Phylo.TreeConstruction import _DistanceMatrix as DM

        names = [rec.id for rec in self._records]
        n_records = len(self._records)
        dn_matrix = []
        ds_matrix = []
        for i in range(n_records):
            # One (dn, ds) result per earlier record j < i, in index order.
            pairs = [
                cal_dn_ds(self._records[i], self._records[j], method=method, codon_table=codon_table)
                for j in range(i)
            ]
            # Each row ends with the 0.0 diagonal entry.
            dn_matrix.append([dn for dn, _ in pairs] + [0.0])
            ds_matrix.append([ds for _, ds in pairs] + [0.0])
        dn_dm = DM(names, matrix=dn_matrix)
        ds_dm = DM(names, matrix=ds_matrix)
        return dn_dm, ds_dm
示例#6
0
 def get_dn_ds_matrix(self, method="NG86"):
     """Return lower-triangular dN and dS distance matrices.

     ``method`` is forwarded to cal_dn_ds; available methods include
     NG86, LWL85, YN00 and ML. The result is a (dN, dS) tuple of
     _DistanceMatrix objects labelled with the record ids.
     """
     from Bio.Phylo.TreeConstruction import _DistanceMatrix as DM
     labels = [record.id for record in self._records]
     total = len(self._records)
     dn_rows = []
     ds_rows = []
     for row in range(total):
         dn_row = []
         ds_row = []
         dn_rows.append(dn_row)
         ds_rows.append(ds_row)
         for col in range(row + 1):
             if col == row:
                 # Diagonal: a record compared with itself.
                 dn_row.append(0.0)
                 ds_row.append(0.0)
                 continue
             dn, ds = cal_dn_ds(self._records[row], self._records[col],
                                method=method)
             dn_row.append(dn)
             ds_row.append(ds)
     return DM(labels, matrix=dn_rows), DM(labels, matrix=ds_rows)
def _codon_align_pair(dna1, dna2, protein1, protein2, muscle_exe):
    """Align two proteins with MUSCLE and build the matching codon alignment.

    Writes the two proteins to "outfile_unaligned.fa", runs MUSCLE into
    "outfile_aligned.aln" (both in the current directory) and returns
    ``(codon_aln, nuc1, nuc2)`` where nuc1/nuc2 are the DNA SeqRecords
    used to build the codon alignment.
    """
    # NOTE(review): the `alphabet=` arguments rely on Bio.Alphabet, which
    # newer Biopython releases no longer provide - confirm the pinned version.
    unaligned = [
        SeqRecord(Seq(str(protein1.seq), alphabet=IUPAC.protein),
                  id='protein1'),
        SeqRecord(Seq(str(protein2.seq), alphabet=IUPAC.protein),
                  id='protein2'),
    ]
    with open("outfile_unaligned.fa", "w", encoding='utf-8') as output_handle:
        for record in unaligned:
            SeqIO.write(record, output_handle, "fasta")

    muscle_cline = MuscleCommandline(muscle_exe,
                                     input="outfile_unaligned.fa",
                                     out="outfile_aligned.aln")
    muscle_cline()
    alns = AlignIO.read("outfile_aligned.aln", "fasta")

    nuc1 = SeqRecord(Seq(str(dna1.seq), alphabet=IUPAC.IUPACUnambiguousDNA()),
                     id='nuc1')
    nuc2 = SeqRecord(Seq(str(dna2.seq), alphabet=IUPAC.IUPACUnambiguousDNA()),
                     id='nuc2')
    # Assumes MUSCLE keeps the input order, so row 0 is protein1 and
    # row 1 is protein2 - TODO confirm for the MUSCLE version in use.
    prot1 = SeqRecord(Seq(str(alns[0].seq), alphabet=IUPAC.protein),
                      id='pro1')
    prot2 = SeqRecord(Seq(str(alns[1].seq), alphabet=IUPAC.protein),
                      id='pro2')
    aln = MultipleSeqAlignment([prot1, prot2])
    return codonalign.build(aln, [nuc1, nuc2]), nuc1, nuc2


def _site_distances(codon_seq1, codon_seq2):
    """Return (raw distance, third-position distance) for two aligned rows.

    Columns where either row has a "-" are dropped first. Both distances
    are mismatch proportions rounded to 3 decimals. Raises
    ZeroDivisionError when nothing is left to compare.
    """
    columns = [(a, b) for a, b in zip(codon_seq1, codon_seq2)
               if a != "-" and b != "-"]
    seq1 = "".join(a for a, _ in columns)
    seq2 = "".join(b for _, b in columns)

    mismatches = sum(1 for a, b in zip(seq1, seq2) if a != b)
    distance_brute = round(float(mismatches / len(seq1)), 3)

    # Every third character, keeping alphabetic symbols only.
    third1 = "".join(c for c in seq1[2::3] if c.isalpha())
    third2 = "".join(c for c in seq2[2::3] if c.isalpha())
    third_mismatches = sum(1 for a, b in zip(third1, third2) if a != b)
    distance_third_pos = round(float(third_mismatches / len(third2)), 3)
    return distance_brute, distance_third_pos


def divergence():
    """Print pairwise divergence statistics for two sets of paired sequences.

    Command-line arguments (read from sys.argv):
        1, 2 - FASTA files with the DNA sequences of species 1 and 2
        3, 4 - FASTA files with the protein sequences of species 1 and 2
        5    - output file; it is created/truncated, but the rows are
               printed to stdout, so the file itself stays empty
        6    - method name passed to cal_dn_ds
        7    - path to the MUSCLE executable

    The u-th records of the four files are treated as one pair. For each
    pair the proteins are aligned with MUSCLE, a codon alignment is built
    from that alignment and the DNA sequences, and one ";"-separated row is
    printed: id, dN, dS, third-position distance, raw distance, both DNA
    lengths, both GC contents, their mean and the shorter DNA length.
    When cal_dn_ds raises ValueError, ZeroDivisionError or KeyError, dN and
    dS are reported as 9.999. Any other error for a pair is reported with a
    traceback and processing continues with the next pair.
    """
    fic1dna = sys.argv[1]  # DNA sequences of species 1
    fic2dna = sys.argv[2]  # DNA sequences of species 2
    fic1prot = sys.argv[3]  # protein sequences of species 1
    fic2prot = sys.argv[4]  # protein sequences of species 2
    outfile_path = sys.argv[5]  # output file (currently left empty)
    method = sys.argv[6]  # method used by cal_dn_ds
    muscle_exe = sys.argv[7]  # path to the MUSCLE executable

    # The context manager guarantees the handle is closed even if parsing
    # or printing fails unexpectedly.
    with open(outfile_path, "w", encoding='utf-8'):
        seq1dna = list(
            SeqIO.parse(fic1dna, "fasta",
                        alphabet=IUPAC.IUPACUnambiguousDNA()))
        seq2dna = list(
            SeqIO.parse(fic2dna, "fasta",
                        alphabet=IUPAC.IUPACUnambiguousDNA()))
        seq1prot = list(SeqIO.parse(fic1prot, "fasta", alphabet=IUPAC.protein))
        seq2prot = list(SeqIO.parse(fic2prot, "fasta", alphabet=IUPAC.protein))

        print("Nombre de paires de sequences a analyser: ", len(seq1dna))

        # Header row. The last column is labelled "Mean_length" but holds
        # the shorter of the two DNA lengths.
        print("seq.id", ";", "dN", ";", "dS", ";", "Dist_third_pos", ";",
              "Dist_brute", ";", "Length_seq_1", ";", "Length_seq2", ";",
              "GC_content_seq1", ";", "GC_content_seq2", ";", "GC", ";",
              "Mean_length")

        for u in range(len(seq1dna)):
            try:
                codon_aln, nuc1, nuc2 = _codon_align_pair(
                    seq1dna[u], seq2dna[u], seq1prot[u], seq2prot[u],
                    muscle_exe)

                lengthseq1 = len(nuc1.seq)
                lengthseq2 = len(nuc2.seq)
                GCcontentseq1 = GC(nuc1.seq)
                GCcontentseq2 = GC(nuc2.seq)
                GC_mean = ((GCcontentseq1 + GCcontentseq2) / 2)
                Min_length = min(lengthseq1, lengthseq2)

                distance_brute, distance_third_pos = _site_distances(
                    codon_aln[0], codon_aln[1])

                try:
                    dN, dS = cal_dn_ds(codon_aln[0], codon_aln[1],
                                       method=method)
                except (ValueError, ZeroDivisionError, KeyError):
                    # Sentinel used when the indices cannot be computed
                    # (e.g. saturation too high).
                    dN = dS = 9.999

                print(seq1dna[u].id, ";", dN, ";", dS, ";", distance_third_pos,
                      ";", distance_brute, ";", lengthseq1, ";", lengthseq2,
                      ";", GCcontentseq1, ";", GCcontentseq2, ";", GC_mean,
                      ";", Min_length)

            except Exception:
                # Keep going with the next pair, but let KeyboardInterrupt
                # and SystemExit through (a bare except would swallow them).
                traceback.print_exc()
                # NOTE(review): seq2dna[u] raises IndexError here when the
                # second DNA file has fewer records than the first.
                print("Une erreur est survenue pour la sequence: ",
                      seq1dna[u].id, "vs", seq2dna[u].id)
示例#8
0
    # Fragment of a per-alignment loop body (the loop header is outside this
    # excerpt). Python 2 syntax: note the print statement below.

    # Position of the first name containing 'Capybara' / 'Cavia'.
    index_Cap = int([names.index(i) for i in names if 'Capybara' in i][0])
    index_Cpor = int([names.index(i) for i in names if 'Cavia' in i][0])
    # Pull each sequence out of the alignment and record the second
    # '|'-separated field of its name.
    CapCDS = aln.takeSeqs([names[index_Cap]])
    Cap_seqs.append(CapCDS.getSeqNames()[0].split('|')[1])
    CporCDS = aln.takeSeqs([names[index_Cpor]])
    Cpor_seqs.append(CporCDS.getSeqNames()[0].split('|')[1])

    # NOTE(review): under Python 2 without `from __future__ import division`
    # this is integer division, so the ratio would be 0 unless
    # len(aln) >= len(aln1) - confirm which division is in effect.
    if ((len(aln) / len(aln1)) > 0.5):
        # Only analyse alignments whose length, after the gaps have been
        # removed, is greater than 50% of the original length.
        # values()[0] indexes the dict values directly, which requires
        # values() to return a list (as in Python 2).
        CapCDSstr = str(CapCDS.todict().values()[0])
        # NOTE(review): this line repeats the previous assignment.
        CapCDSstr = str(CapCDS.todict().values()[0])
        CapCDSstr = CodonSeq(CapCDSstr)
        CporCDSstr = str(CporCDS.todict().values()[0])
        CporCDSstr = CodonSeq(CporCDSstr)
        try:
            # Record dN/dS, or 0.0 when dS is exactly zero. cal_dn_ds is
            # re-evaluated for every use of its result (up to three calls).
            if cal_dn_ds(CapCDSstr, CporCDSstr)[1] == 0.0:
                val1 = 0.0
            else:
                val1 = cal_dn_ds(CapCDSstr, CporCDSstr)[0] / cal_dn_ds(
                    CapCDSstr, CporCDSstr)[1]
            CGP.append(val1)
        # NOTE(review): bare except; on failure nothing is appended to CGP
        # although Cap_seqs and Cpor_seqs already received their entry for
        # this iteration, so the lists can end up with different lengths.
        except:
            print 'Error with: ' + os.path.split(
                sequences[s])[-1].split('_')[0]
    else:
        # Placeholder for alignments that fail the length filter.
        CGP.append('NA')
    #s = s + 1

# Convert the accumulated lists to numpy arrays.
Cpor_seqs = np.array(Cpor_seqs)
Cap_seqs = np.array(Cap_seqs)
CGP = np.array(CGP)
示例#9
0
    # Fragment of a per-alignment loop body; it starts and ends outside this
    # excerpt. Python 2 syntax: note the print statement below.

    # Record a name for each extracted sequence: the second '|'-separated
    # field for Capybara and Cavia, the full name for Rat.
    Cap_seqs.append(CapCDS.getSeqNames()[0].split('|')[1])
    CporCDS = aln.takeSeqs([names[index_Cpor]])
    Cpor_seqs.append(CporCDS.getSeqNames()[0].split('|')[1])
    RatCDS = aln.takeSeqs([names[index_Rat]])
    Rat_seqs.append(RatCDS.getSeqNames()[0])

    # NOTE(review): under Python 2 without `from __future__ import division`
    # this is integer division, so the ratio would be 0 unless
    # len(aln) >= len(aln1) - confirm which division is in effect.
    if ((len(aln) / len(aln1)) > 0.5):
        # Only analyse alignments whose length, after the gaps have been
        # removed, is greater than 50% of the original length.
        # values()[0] indexes the dict values directly, which requires
        # values() to return a list (as in Python 2).
        CapCDSstr = str(CapCDS.todict().values()[0])
        CapCDSstr = CodonSeq(CapCDSstr)
        CporCDSstr = str(CporCDS.todict().values()[0])
        CporCDSstr = CodonSeq(CporCDSstr)
        RatCDSstr = str(RatCDS.todict().values()[0])
        RatCDSstr = CodonSeq(RatCDSstr)
        try:
            # NOTE(review): the first branch assigns only val1 and the
            # second only val2, yet both are appended below. The unassigned
            # one is either undefined (NameError, swallowed by the bare
            # except) or left over from an earlier iteration.
            if cal_dn_ds(CapCDSstr, CporCDSstr)[1] == 0.0:
                val1 = 0.0
            elif cal_dn_ds(CporCDSstr, RatCDSstr)[1] == 0.0:
                val2 = 0.0
            else:
                # dN/dS for each pair; cal_dn_ds is re-evaluated for every
                # use of its result.
                val1 = cal_dn_ds(CapCDSstr, CporCDSstr)[0] / cal_dn_ds(
                    CapCDSstr, CporCDSstr)[1]
                val2 = cal_dn_ds(CporCDSstr, RatCDSstr)[0] / cal_dn_ds(
                    CporCDSstr, RatCDSstr)[1]
            CGP.append(val1)
            GPR.append(val2)
        # NOTE(review): bare except; on failure CGP and GPR may not both get
        # an entry for this iteration although the *_seqs lists already did.
        except:
            print 'Error with: ' + os.path.split(
                sequences[s])[-1].split('_')[0]
    else:
        # Placeholder for alignments that fail the length filter.
        CGP.append('NA')