Пример #1
0
                       rettype="fasta",
                       retmode="text",
                       id=gi_number[b]) as handle:
        search_results = handle.read()
        # Append the fetched record to the accumulated FASTA file.  The
        # context manager guarantees the file is flushed and closed; the
        # previous code wrote ``MSA_als.close`` without parentheses, which
        # only referenced the method and never closed the file.
        with open('MSA_als.fasta', 'a+') as MSA_als:
            MSA_als.write(search_results)
            MSA_als.write('\n')

## merge two fasta files
# NOTE(review): `overwrite` is defined outside this excerpt; presumably its
# merge() writes the combined sequences to 'final_seq.fasta', the file that
# ClustalW reads below -- confirm against its definition.
filenames = ['protein_sequence.fasta', 'MSA_als.fasta']
# call the merge function
overwrite(filenames).merge()

## MSA
# Build the ClustalW command line for final_seq.fasta and execute it.
cline = ClustalwCommandline("clustalw", infile="final_seq.fasta")
cline()

## build the tree

# Parse final_seq.dnd as a Newick tree and print it as ASCII art.
tree = Phylo.read("final_seq.dnd", "newick")
Phylo.draw_ascii(tree)

## delete useless files
if sys.argv[3] == 'no':
    os.remove('MSA_als.fasta')
    os.remove('protein_sequence.fasta')
    os.remove('final_seq.fasta')
    os.remove('final_seq.dnd')
    os.remove('final_seq.aln')
elif sys.argv[3] == 'yes':
Пример #2
0
from Bio.Align.Applications import ClustalwCommandline
               cline = ClustalwCommandline("clustalw2", infile="/Users/dazhang/Downloads/clustalw-2.1-macosx/opuntia.fastahuman.fasta")
               print(cline)
Пример #3
0
    def computeAlignment(self, id, alignment):
        """Compute a multiple sequence alignment with the requested method.

        Reads ``self.dirname + id + ".fasta"`` and writes the aligned
        sequences in FASTA format to ``<id>_clustalw.fasta``,
        ``<id>_muscle.fasta`` or ``<id>_mafft.fasta`` in the same location.
        Tool settings are looked up in ``self.parameterfile`` through ``LP``.

        Args:
            id: Base name of the input file, appended verbatim to
                ``self.dirname``.  The name shadows the builtin but is kept
                so existing callers continue to work.
            alignment: ``"clustalw"`` or ``"muscle"``; any other value runs
                MAFFT.

        Raises:
            Exception: Errors raised by the ClustalW/MUSCLE wrappers or by
                ``AlignIO.convert`` are propagated.  The exit status of the
                MAFFT shell command is not checked.
        """
        input_sequences = self.dirname + id + ".fasta"

        if alignment == "clustalw":
            gop = LP(self.parameterfile, "clustalw_gap_opening")
            gep = LP(self.parameterfile, "clustalw_gap_extension")
            d_matrix = LP(self.parameterfile, "clustalw_distance_matrix")

            output_align = self.dirname + id + ".aln"
            output_fasta = self.dirname + id + "_clustalw.fasta"
            output_tree = self.dirname + id + ".dnd"
            tool_dir = os.getcwd() + "/src/tools/clustalw/"

            def run_clustalw(executable):
                # Build and execute one ClustalW call with the given binary.
                ClustalwCommandline(executable,
                                    infile=input_sequences,
                                    outfile=output_align,
                                    newtree=output_tree,
                                    align="input",
                                    seqnos="ON",
                                    outorder="input",
                                    type="PROTEIN",
                                    pwmatrix=d_matrix,
                                    gapopen=gop,
                                    gapext=gep)()

            # Try the ".exe" binary first and fall back to the plain binary
            # when that attempt fails.  ``Exception`` (instead of a bare
            # ``except``) lets KeyboardInterrupt/SystemExit pass through.
            try:
                run_clustalw(tool_dir + "clustalw.exe")
            except Exception:
                run_clustalw(tool_dir + "clustalw")

            AlignIO.convert(output_align, "clustal", output_fasta, "fasta")
            self_cleanup = (output_align, output_tree)
            for leftover in self_cleanup:
                # Best-effort removal of intermediate files; a file that is
                # missing or locked must not abort the alignment.
                try:
                    remove(leftover)
                except OSError:
                    pass

        elif alignment == "muscle":
            iteration = LP(self.parameterfile, "muscle_max_iteration")

            output_align = self.dirname + id + "_muscle.aln"
            output_fasta = self.dirname + id + "_muscle.fasta"

            muscle = MuscleCommandline(input=input_sequences,
                                       out=output_align,
                                       clwstrict=True,
                                       maxiters=iteration)
            muscle()
            AlignIO.convert(output_align, "clustal", output_fasta, "fasta")
            try:
                remove(output_align)
            except OSError:
                pass

            # Rewrite the aligned records in the order of the input file,
            # matching records by their description line.
            organism_order = [
                record.description
                for record in SeqIO.parse(input_sequences, "fasta",
                                          IUPAC.protein)
            ]
            aligned = {}
            for record in SeqIO.parse(output_fasta, "fasta", IUPAC.protein):
                aligned[str(record.description)] = str(record.seq)

            # Build the whole text first so a missing description raises
            # before the existing output file is truncated.
            reordered = [">" + org + "\n" + aligned[org] + "\n"
                         for org in organism_order]
            with open(output_fasta, "w") as fasta:
                fasta.writelines(reordered)

        else:
            configuration = LP(self.parameterfile, "mafft_configuration")
            threads = LP(self.parameterfile, "mafft_threading")
            output_fasta = self.dirname + id + "_mafft.fasta"

            if configuration == "fftnsi":
                strategy = "--retree 2"
            else:
                strategy = "--localpair"

            # Add "--threads N" only when the setting is neither False/0 nor
            # something that cannot be converted to an integer.
            thread_option = ""
            if threads is not False and threads != 0:
                try:
                    thread_option = "--threads %i " % int(threads)
                except (TypeError, ValueError, OverflowError):
                    thread_option = ""

            command = ("mafft " + strategy +
                       " --maxiterate 1000 --inputorder " + thread_option)
            # NOTE: the command runs through the shell because of the ">"
            # redirection; paths containing spaces or shell metacharacters
            # are not quoted here.
            system(command + input_sequences + ">" + output_fasta)
Пример #4
0
#-*- coding: cp1252 -*-
"""Align a FASTA file with ClustalW and print the resulting tree.

Takes a FASTA file as input and writes a phylogenetic tree to standard
output.  Requires a locally installed ClustalW executable.
"""
import os
from Bio import Phylo
from Bio import SeqIO
from Bio.Align.Applications import ClustalwCommandline

fich = "./results/sequences_cluster1.fasta"  # input FASTA file

clustalw_exe = r"/net/cremi/login/Bureau/clustalw2"  # path to the ClustalW program

# Check for the executable before building and running the command, and
# raise explicitly: an ``assert`` would be stripped under ``python -O``.
if not os.path.isfile(clustalw_exe):
    raise IOError("Clustal W executable missing")

clustalw_cline = ClustalwCommandline(clustalw_exe, infile=fich)
stdout, stderr = clustalw_cline()

# ClustalW names the guide tree after the input file with the extension
# replaced (sequences_cluster1.dnd), not with ".dnd" appended to ".fasta".
tree = Phylo.read(os.path.splitext(fich)[0] + ".dnd", "newick")
Phylo.draw_ascii(tree)
Пример #5
0
def runClustal(fileName):
    """Align each read to its germline V gene and collect the mutated ones.

    Every FASTA record whose description names a V gene (``v_call=`` or
    ``V_gene=``) found in the module-level ``germs`` mapping is aligned
    pairwise against that germline with ClustalW (executable taken from the
    module-level ``clustalw``).  Unless ``arguments['--keep']`` is set,
    alignments with gaps that are not whole codons are discarded as likely
    frameshifts.  Surviving reads are cut down to the germline columns,
    translated and compared with the translated germline.

    Args:
        fileName: Path of the FASTA file to process.  Scratch files are
            named after it, with ``.fa`` replaced by ``_temp``.

    Returns:
        tuple: ``(total, good, mutated, sequences)`` -- records read,
        records that passed the alignment/frameshift filter, records with
        at least one amino-acid difference from germline, and the list of
        those records with ``seq`` replaced by the translated sequence.
    """
    sequences = []
    total, good, mutated = 0, 0, 0

    tempName = re.sub(r"\.fa", "_temp", fileName)
    tempFasta = "%s.fa" % tempName

    def removeTempFiles():
        # Delete every scratch file belonging to the current read.
        for f in glob.glob("%s.*" % tempName):
            os.remove(os.path.abspath(f))

    print("Starting work on %s..." % fileName)

    #load sequences and preprocess to align:
    with open(fileName, "r") as fastaHandle:
        for entry in SeqIO.parse(fastaHandle, "fasta"):

            total += 1
            gene = re.search(r"(?:v_call|V_gene)=(IG[HKL]V[^,\s]+)",
                             entry.description)
            if not gene:
                continue

            germline = gene.groups()[0]
            if germline not in germs:
                print(
                    "%s might be misassigned; %s is not in my germline library. Skipping..."
                    % (entry.id, germline))
                continue

            with open(tempFasta, "w") as handle:
                handle.write(">%s\n%s\n>%s\n%s\n" %
                             (germline, germs[germline], entry.id, entry.seq))

            clustal_cline = ClustalwCommandline(cmd=clustalw,
                                                infile=tempFasta)
            try:
                clustal_cline()
            except Exception as err:
                # Report the exception itself: the old handler printed
                # ``stderr``, which is unassigned when the call raises.
                print("Error in alignment of %s (will skip): %s" %
                      (entry.id, err))
                removeTempFiles()
                continue

            alignment = AlignIO.read("%s.aln" % tempName, "clustal")

            if not arguments['--keep']:
                #strip end gaps, they don't matter
                #full-codon indels are also fine
                #anything left over is a likely frameshift --discard!
                shift = any(
                    "-" in str(record.seq.strip("-")).replace("---", "")
                    for record in alignment)
                if shift:
                    removeTempFiles()
                    continue

            #count no-frameshift seqs for functional repertoires
            good += 1

            #now, remove gaps

            #Input order is not maintained, so we need a little
            #   kludge to check which one is the germline sequence.
            germRow = 0 if alignment[0].id == germline else 1

            #look for gaps one at a time so we don't get tripped up by shifting indices
            gap = re.search("-+", str(alignment[germRow].seq))
            while gap:
                alignment = (alignment[:, 0:gap.start()] +
                             alignment[:, gap.end():])
                gap = re.search("-+", str(alignment[germRow].seq))

            #translate to check for AA substitutions
            mySeq = alignment[1 - germRow]
            mySeq.seq = Seq.Seq(
                str(mySeq.seq).replace("-", "N"),
                Alphabet.IUPAC.ambiguous_dna).translate()
            germSeq = str(alignment[germRow].seq.translate())

            mutCount = sum(
                1 for a, b in zip(germSeq, str(mySeq.seq))
                if b != 'X' and b != "-" and b != a)

            if mutCount > 0:
                mutated += 1
                entry.seq = Seq.Seq(re.sub("^X+", "", str(mySeq.seq)),
                                    Alphabet.IUPAC.ExtendedIUPACProtein)
                sequences.append(entry)

            removeTempFiles()

    return (total, good, mutated, sequences)
# Usage: <script> BLAST_OUTPUT CONTIGS_TAXON1.fasta CONTIGS_TAXON2.fasta
# For every BLAST hit line, write the two matched sequences to a small FASTA
# file and align them with ClustalW; the .aln file is the product that is
# kept, while the temporary FASTA and .dnd files are deleted.
blastOutputFileName = argv[1]
contigFileName1 = argv[2]
contigFileName2 = argv[3]
seqRecordsTaxon1 = SeqIO.index(contigFileName1, "fasta")
seqRecordsTaxon2 = SeqIO.index(contigFileName2, "fasta")
pattern = re.compile(
    r'"(\w+)"\s+"(\w+)"\s+\S+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+(\d+)')
# group(1) is taxon1, group(2) is taxon2, group(3) is db start, group(4) is db stop
with open(blastOutputFileName, 'r') as blastOutputFileHandle:
    for line in blastOutputFileHandle:
        match = pattern.search(line)
        if match is None:
            # Header, blank or otherwise non-hit line: nothing to align.
            continue
        taxon1, taxon2 = match.group(1), match.group(2)
        pairName = 'z' + taxon1 + '-' + taxon2

        taxon2Seq = seqRecordsTaxon2[taxon2].seq
        if int(match.group(3)) >= int(match.group(4)):
            # Need the reverse-complement
            taxon2Seq = taxon2Seq.reverse_complement()

        alignFastaFileName = pairName + '.fasta'
        with open(alignFastaFileName, 'w') as alignFastaFileHandle:
            alignFastaFileHandle.write('>' + taxon1 + '\n')
            alignFastaFileHandle.write(
                str(seqRecordsTaxon1[taxon1].seq) + '\n')
            alignFastaFileHandle.write('>' + taxon2 + '\n')
            alignFastaFileHandle.write(str(taxon2Seq) + '\n')

        cmd = ClustalwCommandline(infile=alignFastaFileName)
        # call() waits for ClustalW to finish, replacing Popen + os.waitpid.
        subprocess.call(str(cmd), shell=True)
        os.remove(alignFastaFileName)
        dndFileName = pairName + '.dnd'
        os.remove(dndFileName)
Пример #7
0
def gen_alignment(input_file, seq_vector=None, n_sequences=None, algorithm='mafft', output_file='output'):
    """Align a selection of sequences from a FASTA file with an external tool.

    The selected records are written to ``pre_alignment.fna`` and aligned
    with ClustalW2, MUSCLE or MAFFT.

    Args:
        input_file: Path of an existing FASTA file.
        seq_vector: Optional list; records are kept when the last
            ``|``-separated field of their description is in it.
        n_sequences: Used when ``seq_vector`` is None; passed together with
            the parsed records to ``generator_from_iterable``.
        algorithm: ``'clustal'``, ``'muscle'`` or ``'mafft'``; must be listed
            in ``GLOBALS['SUPPORTED ALGORITHMS']``.
        output_file: Output name; the tool-specific extension (``.aln``,
            ``.fna`` or ``.fasta``) is appended when missing.

    Returns:
        The output file name including its extension, or ``None`` when the
        alignment raised an error.  The process exits with status 0 when no
        sequence is selected.
    """
    assert input_file is not None and os.path.isfile(input_file)
    assert output_file is not None

    assert seq_vector is not None or n_sequences is not None, \
        'Both arguments are None (sequence vector and number of sequences)'

    assert isinstance(seq_vector, list) or isinstance(n_sequences, int), \
        'Either one of two must be provided: sequence vector or number of sequences'

    assert algorithm in GLOBALS['SUPPORTED ALGORITHMS'], \
        'Algorithm does not match any of the currently supported MSA algorithms'

    assert isinstance(input_file, str)

    tmp_file = 'pre_alignment.fna'

    # Materialise the selection while the input file is open so the handle
    # is closed as soon as parsing is done.
    with open(input_file, 'rU') as source:
        iterable = SeqIO.parse(source, 'fasta')
        if seq_vector is not None:
            sequences = [r for r in iterable
                         if r.description.split('|')[-1] in seq_vector]
        else:
            sequences = list(generator_from_iterable(iterable, n_sequences))

    if len(sequences) == 0:
        print('No sequences were found')
        sys.exit(0)

    SeqIO.write(sequences, tmp_file, 'fasta')

    try:

        t0 = time.time()
        if algorithm == 'clustal':

            if not output_file.endswith('.aln'):
                output_file += '.aln'

            # Pass output_file as is: appending '.aln' a second time made
            # ClustalW write to a path other than the one returned.
            cline = ClustalwCommandline('clustalw2',
                                        infile=tmp_file,
                                        outfile=output_file)
        elif algorithm == 'muscle':

            if not output_file.endswith('.fna'):
                output_file += '.fna'

            alg = r"/usr/local/bin/muscle3.8.31_i86linux64"
            # NOTE(review): MUSCLE writes to a fixed path under
            # source_sequences/, not to the returned output_file -- confirm
            # whether callers rely on that location before changing it.
            cline = MuscleCommandline(alg, input=tmp_file,
                                      out='source_sequences/muscle_' + str(n_sequences) + '.fna',
                                      clwstrict=True)
        elif algorithm == 'mafft':

            if not output_file.endswith('.fasta'):
                output_file += '.fasta'

            alg = r"/usr/local/bin/mafft"
            cline = MafftCommandline(alg,
                                     input=tmp_file,
                                     clustalout=True)
        else:
            print('Unknown algorithm\n')
            sys.exit(0)

        stdout, stderr = cline()

        if algorithm == 'mafft':
            # MAFFT prints the alignment on stdout; save it ourselves.
            with open(output_file, "wb") as handle:
                handle.write(stdout)

        print('Elapsed time: ' + str((time.time() - t0) / 60))

        return output_file
    except Exception as exc:
        # ``Exception`` rather than a bare except, so the sys.exit() above
        # and Ctrl-C are no longer swallowed and misreported.
        print('Error aligning with ' + algorithm + ': ' + str(exc))
Пример #8
0
	
Also, rather than downloading files and then parsing them into Python, 
you can programmatically access sequence files via Entrez using GI numbers:

from Bio import Entrez
from Bio import SeqIO
# Entrez.email is set before the request is made.
Entrez.email = "*****@*****.**"
# Fetch the record with id "Q15878" from the "nucleotide" database in
# GenBank text format and parse it into a single record.
handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id="Q15878")
seq_record = SeqIO.read(handle, "gb")
handle.close()
# Bare expression: shows the record in an interactive session and has no
# effect when run as a script.
seq_record

## alignment
from Bio.Align.Applications import ClustalwCommandline
# create bash command
# NOTE(review): nothing in this snippet writes Q15878.fasta; the file
# presumably has to exist already -- confirm.
cline = ClustalwCommandline("clustalw", infile="Q15878.fasta")
print(cline)
# execute clustalw
stdout, stderr = cline()


## merge two fasta files
files = ['Q15878.fasta', 'Q1234.fasta']
# 'Q1234.fasta' is both an input and the output.  Read every input completely
# before opening the output: mode 'w' truncates the file, so opening it first
# destroyed its contents before they could be read.
contents = []
for name in files:
    with open(name) as infile:
        contents.append(infile.read())
with open('Q1234.fasta', 'w') as outfile:
    for text in contents:
        outfile.write(text)
        outfile.write('\n')

from Bio.Align.Applications import ClustalwCommandline
from Bio.Align.Applications import MuscleCommandline
Пример #9
0
 def alinhamento_multiplo(self):
     """Run a ClustalW multiple alignment on ``self.seq_input``.

     ``self.diretoria`` is passed as the first argument of the command
     wrapper; the wrapper's return value is discarded.
     """
     from Bio.Align.Applications import ClustalwCommandline
     comando = ClustalwCommandline(self.diretoria, infile=self.seq_input)
     comando()
Пример #10
0
# standard library
import os
import sys
import subprocess

# biopython
from Bio.Alphabet import Gapped, IUPAC
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.SubsMat import FreqTable

# create the command line to run clustalw
# this assumes you've got clustalw somewhere on your path, otherwise
# you need to pass the full path of the executable to this via cmd="..."
cline = ClustalwCommandline(infile='opuntia.fasta', outfile='test.aln')

# actually perform the alignment
return_code = subprocess.call(str(cline), shell=(sys.platform != "win32"))
assert return_code == 0, "Calling ClustalW failed"

# Parse the output
alignment = AlignIO.read("test.aln",
                         "clustal",
                         alphabet=Gapped(IUPAC.unambiguous_dna))

print alignment

print 'first description:', alignment[0].description
print 'first sequence:', alignment[0].seq
Пример #11
0
	print(align)
##better visualization
from Bio.pairwise2 import format_alignment
for align in alignments:
	print(format_alignment(*align))
#### using Align module

from Bio import Align
aligner=Align.PairwiseAligner()
print("Using now Align module")
print("Aligner:")
print(aligner)

### Example using the clustalw2 command; download it from www.clustal.org/download/current
from Bio.Align.Applications import ClustalwCommandline
# Build the command line for opuntia.fasta and show it before running it.
cmd=ClustalwCommandline("clustalw2",infile="opuntia.fasta")
print(cmd)
cmd()  # runs the command and generates the opuntia .aln and .dnd files

# NOTE(review): AlignIO is not imported in this excerpt; presumably it is
# imported earlier in the script -- confirm.
align=AlignIO.read("opuntia.aln","clustal")
print("Read align from opuntia.aln generated by clustal")
print(align)
print("Printing alignments")

# Print the sequence of every record, then the first two records again.
for a in align:
	print(a.seq)
print("Print[0][1]of align\n")
print(align[0].seq)
print(align[1].seq)

Пример #12
0
def call_clustal(string):
    """Run clustalw2 on the file *string* and return its captured stdout.

    The command goes through the shell on every platform except Windows.
    """
    command_text = str(ClustalwCommandline("clustalw2", infile=string))
    use_shell = sys.platform != "win32"
    child = subprocess.Popen(command_text,
                             shell=use_shell,
                             stdout=subprocess.PIPE)
    captured = child.communicate()
    return captured[0]
Пример #13
0
import sys
from Bio import AlignIO
from Bio.Align.Applications import ClustalwCommandline

# Exactly two arguments are required; otherwise print the usage and stop.
if len(sys.argv) != 3:
    for usage_line in ("Invalid params:",
                       "1) Multifasta input file",
                       "2) MSA output file"):
        print(usage_line)
    sys.exit(1)

in_file_name, out_file_name = sys.argv[1], sys.argv[2]

# NOTE(review): "clustalo" looks like the Clustal Omega executable, yet it is
# driven through the ClustalW wrapper and its option syntax -- confirm that
# this is intended.
clustalw_cline = ClustalwCommandline(
    "clustalo", infile=in_file_name, outfile=out_file_name)
stdout, stderr = clustalw_cline()
Пример #14
0
def ClustalW_alignment(writen_alignment:'str') -> None:
    '''Run ClustalW on the file *writen_alignment*.

    The default executable of the wrapper is used and the captured output
    is discarded.
    '''
    runner = ClustalwCommandline(infile=writen_alignment)
    _captured_out, _captured_err = runner()
Пример #15
0
    total += 1
    gene = re.search("(?:v_call|V_gene)=(IG[HKL]V[^,\s]+)", entry.description)

    if gene:
        germline = gene.groups()[0]
        if not germline in germs:
            print(
                "%s might be misassigned; %s is not in my germline library. Skipping..."
                % (entry.id, germline))
            continue

        with open("%s.fa" % fa_head, "w") as handle:
            handle.write(">%s\n%s\n>%s\n%s\n" %
                         (germline, germs[germline], entry.id, entry.seq))

        clustal_cline = ClustalwCommandline(cmd=clustalw,
                                            infile="%s.fa" % fa_head)
        try:
            stdout, stderr = clustal_cline()
        except:
            print("Error in alignment of %s (will skip): %s" %
                  (entry.id, stderr))
            for f in glob.glob("%s.*" % fa_head):
                os.remove(f)
            continue

        alignment = AlignIO.read("%s.aln" % fa_head, "clustal")
        shift = False

        for record in alignment:
            codons = re.sub(
                "---", "", str(record.seq.strip("-"))
Пример #16
0
from __future__ import print_function

import sys
import subprocess

# biopython
from Bio.Alphabet import Gapped, IUPAC
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.SubsMat import FreqTable

# Build the ClustalW invocation.  No executable is given, so clustalw must be
# on the PATH; otherwise pass its full path through cmd="...".
cline = ClustalwCommandline(infile="opuntia.fasta", outfile="test.aln")

# Run the alignment; the shell is used everywhere except on Windows.
command_text = str(cline)
use_shell = sys.platform != "win32"
return_code = subprocess.call(command_text, shell=use_shell)
assert return_code == 0, "Calling ClustalW failed"

# Load the alignment ClustalW wrote, as gapped unambiguous DNA.
gapped_dna = Gapped(IUPAC.unambiguous_dna)
alignment = AlignIO.read("test.aln", "clustal", alphabet=gapped_dna)

print(alignment)

first_record = alignment[0]
print("first description: %s" % first_record.description)
print("first sequence: %s" % first_record.seq)
Пример #17
0
def alignSeqs(sequencedict):
    """Align the sequences of every UTR and export the alignments.

    For each key of *sequencedict* the sequences are written to
    ``temp.fasta``, aligned with ``clustalw2``, converted to Stockholm
    format, extended with the secondary-structure line taken from the
    ``RNAalifold`` output and with a ``#=GF ID`` line (needed by Infernal),
    and stored as ``./StockholmAlignments/<UTR>.aln``.  The Clustal alignment
    and the FASTA input of every UTR are also appended to
    ``clustal_alignments.aln`` and ``UTRfastas.fa``; both files are emptied
    at the start of each call.

    Args:
        sequencedict: Mapping of UTR identifier to an iterable of dicts; the
            first key/value pair of each dict supplies the FASTA header and
            the sequence.
    """
    # Start both cumulative output files empty.
    for cumulative in ('clustal_alignments.aln', 'UTRfastas.fa'):
        with open(cumulative, 'w'):
            pass

    if not os.path.exists('./StockholmAlignments/'):
        os.mkdir('./StockholmAlignments')

    for UTR in sequencedict:
        UTRID = str(UTR)

        #Write fasta file from dictionary entry
        records = []
        for species in sequencedict[UTR]:
            # next(iter(...)) picks the first pair on Python 2 and 3 alike;
            # ``species.keys()[0]`` only works on Python 2.
            name, sequence = next(iter(species.items()))
            records.append('>' + str(name) + '\n' + str(sequence) + '\n')
        fastastring = ''.join(records)
        with open('temp.fasta', 'w') as fastafh:
            fastafh.write(fastastring)

        #Align fasta using clustalw
        cline = ClustalwCommandline('clustalw2', infile = 'temp.fasta')
        cline() #alignment now in temp.aln
        with open('temp.aln', 'r') as tempclustalfh:
            clustaltext = tempclustalfh.read()

        #Convert clustal to stockholm
        AlignIO.convert('temp.aln', 'clustal', 'tempstockholm.aln', 'stockholm')

        #Get secondary structure line from RNAalifold
        # universal_newlines=True makes check_output return text on Python 3
        # too, so the str-based replace/split below keep working.
        folded = subprocess.check_output(['RNAalifold', 'temp.aln'],
                                         universal_newlines=True)
        ss = folded.replace(' ', '\n', 1).split('\n')[-3]
        ssline = '#=GC SS_cons ' + ss + '\n' + '//' + '\n'

        #Replace '//' in stockholm file with secondary structure line
        replace_in_file.replace('tempstockholm.aln', '//', ssline)

        #Add ID line to file.  This is necessary for Infernal.
        titleline = '# STOCKHOLM 1.0'
        IDline = '#=GF ID ' + UTRID
        replacement = titleline + '\n' + IDline + '\n'
        replace_in_file.replace('tempstockholm.aln', '# STOCKHOLM 1.0', replacement)

        #Rename stockholm file (one small stockholm file per UTR instead of
        #one big combined file)
        os.rename('tempstockholm.aln', './StockholmAlignments/' + UTRID + '.aln')

        #Append current temp aln files to their respective alignment files
        with open('clustal_alignments.aln', 'a') as clustalfile:
            clustalfile.write(clustaltext)

        with open('UTRfastas.fa', 'a') as UTRfastafile:
            UTRfastafile.write(fastastring)
            UTRfastafile.write('\n' + '\n' + '\n')

    #Cleanup
    # Remove only what exists: with an empty sequencedict none of these files
    # is ever created and an unconditional os.remove() raised OSError.
    for leftover in ('alirna.ps', 'temp.aln', 'temp.dnd', 'temp.fasta'):
        if os.path.exists(leftover):
            os.remove(leftover)
Пример #18
0
    def showResult3(self, filepath):
        """Run the method selected in the GUI on the chosen file and show it.

        Depending on the current text of ``comboBox_3`` this either aligns
        the sequences with ClustalW2 and lists every aligned record in the
        output text box, or builds a tree (``clustering="NJ"``), draws it
        and shows the picture in the graphics view.  Any other selection
        opens a warning dialog.

        Args:
            filepath: Either the empty string, or an indexable value whose
                first item is the path of the input file.
        """
        # Clustalw2 path
        clustalw_exe = r"C:\Program Files (x86)\ClustalW2\clustalw2.exe"
        if filepath != "":
            filepath = filepath[0]
        self.ui.msaOutputText.clear()
        # Method currently chosen in the drop-down menu of the GUI
        selected_method = self.ui.comboBox_3.currentText()
        base_name = os.path.splitext(filepath)[0]
        alignment_fasta = "MSA_outalign.fasta"  # multiple alignment result
        tree_file = base_name + ".ph"  # phylogenetic tree result

        if selected_method == "Sequence Alignment":
            self.ui.graphicsView.clear()
            # Align with ClustalW2 and write the result as FASTA
            align_cmd = ClustalwCommandline(clustalw_exe,
                                            infile=filepath,
                                            type="DNA",
                                            output='FASTA',
                                            outfile=alignment_fasta)
            _align_out, _align_err = align_cmd()

            separator = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
            # Show name and aligned sequence of every record in the text box
            for aligned in SeqIO.parse(alignment_fasta, 'fasta'):
                self.ui.msaOutputText.append("Sequence ID: " +
                                             str(aligned.name))
                self.ui.msaOutputText.append("Sequence: " + str(aligned.seq))
                self.ui.msaOutputText.append(separator)
        elif selected_method == "Phylogenetic Tree":
            # Build the tree with ClustalW2
            tree_cmd = ClustalwCommandline(clustalw_exe,
                                           infile=filepath,
                                           type="DNA",
                                           clustering="NJ",
                                           tree=True,
                                           outputtree='phylip',
                                           output='FASTA')
            tool_output, _tool_err = tree_cmd()
            # `draw` comes from drawFunc.py; presumably it renders the tree
            # to tree.jpg, which is loaded right after -- confirm.
            draw(Phylo.read(tree_file, 'newick'))
            picture = pg.ImageItem(cv2.imread("tree.jpg"))
            picture.rotate(270)
            self.ui.graphicsView.addItem(picture)  # show the tree picture
            # Show what ClustalW printed about the tree in the text box
            self.ui.msaOutputText.append(tool_output)
        else:
            # No valid method selected: warn the user; nothing else to do.
            QtWidgets.QMessageBox.question(
                self, 'WARNING!', "Please Choose Method first",
                QtWidgets.QMessageBox.Ok)
    #Check the DND file was created.
    #TODO - Try and parse this with Bio.Nexus?
    if newtree_file is not None:
        tree_file = newtree_file
    else:
        #Clustalw will name it based on the input file
        tree_file = os.path.splitext(input_file)[0] + ".dnd"
    assert os.path.isfile(tree_file), \
           "Did not find tree file %s" % tree_file
    os.remove(tree_file)

    #And again, but this time using Bio.Align.Applications wrapper
    #Any filesnames with spaces should get escaped with quotes automatically.
    #Using keyword arguments here.
    cline = ClustalwCommandline(clustalw_exe,
                                infile=input_file,
                                outfile=output_file)
    assert str(eval(repr(cline))) == str(cline)
    if newtree_file is not None:
        #Test using a property:
        cline.newtree = newtree_file
        #I don't just want the tree, also want the alignment:
        cline.align = True
        assert str(eval(repr(cline))) == str(cline)
    #print cline
    output, error = cline()
    assert output.strip().startswith("CLUSTAL")
    assert error.strip() == ""
    align = AlignIO.read(output_file, "clustal")
    assert set(input_records.keys()) == set(output_records.keys())
    for record in align:
first_blossfeldia_rpl16_sequence.translate()

### BLASTING
from Bio.Blast import NCBIWWW
from Bio import SeqIO
# Run an online blastn search against the "nt" database and save the raw
# result to an XML file.
# NOTE(review): the query name below starts with an underscore, unlike the
# `first_blossfeldia_rpl16_sequence` used just above this section; it may be
# a typo for that name -- confirm.
result_handle = NCBIWWW.qblast("blastn", "nt",
                               _first_blossfeldia_rpl16_sequence)
save_file = open("blast_search_on_first_blossfeldia_rpl16_sequence.xml", "w")
save_file.write(result_handle.read())
save_file.close()
result_handle.close()

### Clustal
import os
from Bio.Align.Applications import ClustalwCommandline

# Location of the ClustalW2 executable on this (Windows) machine.
clustalw_exe = r"C:\Program Files (x86)\ClustalW2\clustalw2.exe"
clustalw_cmd_line = ClustalwCommandline(clustalw_exe,
                                        infile="opuntia_rpl16.fasta")
stdout, stderr = clustalw_cmd_line(
)  # outputs two files: opuntia_rpl16.aln and opuntia_rpl16.dnd

# Read Multiple Alignment
from Bio import AlignIO
opuntia_rpl16_alignment = AlignIO.read("opuntia_rpl16.aln", "clustal")
# Python 2 print statement.
print opuntia_rpl16_alignment

# Draw Phylo Tree
from Bio import Phylo
# The .dnd file is read as a Newick tree and drawn.
opuntia_rpl16_tree = Phylo.read("opuntia_rpl16.dnd", "newick")
Phylo.draw(opuntia_rpl16_tree)
Пример #21
0
		#BLAST code
		bashCom = '%s -p blastp -i %s -D 1 -F F -j %s -o %s' % (blast_exe, query_file, subject_file, result_file)
		#print bashCom
		os.system(bashCom)
		f = open(result_file, 'r').readlines()
		result = f[3].split('\t')
		blast_e = result[-2]
		blast_bit = float(result[-1])
	except:
		blast_e = "null"
		blast_bit = "null"
		print "Blast Error"

	#Clustal code
	try:
		cline = ClustalwCommandline("clustalw", infile = clustalw_infile)
		stdout, stderr = cline()
		alignment = AlignIO.read(clustalw_infile.replace('.fasta', '.aln'), "clustal")
		length = alignment.get_alignment_length()
		stars = alignment._star_info.count('*')
		clustal_score = float(stars)/length
	except:
		clustal_score = "null"
		print "Clustal Error"

	#Stow away the remove statement
	deletes.append('DELETE FROM scores_summary where query like "%s" and subject like "%s"' % (task[0], task[1]))

	#stow away the insert statement
	inserts.append('REPLACE INTO scores_summary (query, subject, blast_score, clustalw_score, blast_bit_score) VALUES ("%s", "%s", %s, %s, %s)' % (task[0], task[1], blast_e,clustal_score,blast_bit))
Пример #22
0
def _echo(message, res):
    """Print *message* to the console and to the open text file *res*."""
    print(message)
    print(message, file=res)


def _parse_identity(clustal_stdout):
    """Return the integer after ``Score:`` on the ``Sequences (1:2)`` line.

    The last matching line wins. Returns ``None`` when the captured ClustalW
    output contains no such line.
    """
    score = None
    for line in clustal_stdout.splitlines():
        if "Sequences (1:2)" not in line:
            continue
        _, marker, tail = line.partition("Score:")
        digits = "".join(filter(str.isdigit, tail))
        if marker and digits:
            score = int(digits)
    return score


def _alignment_column(aligned, residue_index):
    """Return the column of the gapped string *aligned* that holds the
    residue with 0-based ungapped index *residue_index*.

    When the index lies outside the sequence, the index shifted by the total
    number of gaps is returned.
    """
    seen = -1
    for column, symbol in enumerate(aligned):
        if symbol != "-":
            seen += 1
            if seen == residue_index:
                return column
    return residue_index + aligned.count("-")


def _map_catalytic_residues(catalytic_res, sequp_alignment, query):
    """Project the reference catalytic residues onto the query sequence.

    *catalytic_res* maps a residue name (one- or three-letter code) to a
    1-based position or a list of them in the reference sequence, which is
    the second row of *sequp_alignment*; the query is the first row.

    Returns a dict keyed by one-letter residue code. Each value is either a
    list of 1-based query positions, the text ``"<pos>*"`` for a query
    residue that is not among the reference residue types, or a fixed
    message when the residue type was not found in the query at all.
    """
    three_to_one = {
        "Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C",
        "Glu": "E", "Gln": "Q", "Gly": "G", "His": "H", "Ile": "I",
        "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P",
        "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V"
    }

    # Flatten the stored positions (values may be scalars or lists).
    reference_positions = []
    for value in catalytic_res.values():
        if isinstance(value, list):
            reference_positions.extend(value)
        else:
            reference_positions.append(value)

    # Convert each key by its own name; when no key is a three-letter code
    # the keys are used unchanged.
    letters = [three_to_one[name] for name in catalytic_res
               if name in three_to_one]
    if not letters:
        letters = list(catalytic_res)
    query_catalytic_res = {letter: [] for letter in letters}

    query_row = str(sequp_alignment[0].seq)
    reference_row = str(sequp_alignment[1].seq)
    for position in reference_positions:
        # Reference residue -> alignment column -> 0-based query index.
        column = _alignment_column(reference_row, int(position) - 1)
        pos_real = column - query_row[:column + 1].count("-")
        # Accept the aligned residue, then its right and left neighbours,
        # when their residue type is one of the expected ones.
        for offset in (0, 1, -1):
            letter = query[pos_real + offset]
            if letter in query_catalytic_res:
                hits = query_catalytic_res[letter]
                if isinstance(hits, str):
                    hits = [hits]
                    query_catalytic_res[letter] = hits
                hits.append(pos_real + offset + 1)
                break
        else:
            query_catalytic_res[query[pos_real]] = str(pos_real + 1) + "*"

    for letter, hits in list(query_catalytic_res.items()):
        if len(hits) == 0:
            query_catalytic_res[
                letter] = "Present in model and absent in the query."
    return query_catalytic_res


def main(folder_name, folder_answer, prot_sequence, model_prot, use_uniprot,
         clustalw_exe):
    """Predict the active-site residues of a query protein.

    Works inside ``<folder_name>/ActiveSite``: downloads the M-CSA curated
    data, resolves the model PDB id to a UniProt accession, takes catalytic
    residues either from the UniProt entry or from the best-scoring M-CSA
    entry with a matching EC number, aligns that reference to the query with
    ClustalW and writes the projected residues to ``Active_site.txt``.

    Args:
        folder_name: existing working directory.
        folder_answer: ``'NO'`` copies ``query.fasta``/``model.fasta`` from
            ``../Blast&Modeller``; ``'YES'`` writes *prot_sequence* to
            ``query.fasta`` and uses *model_prot* as the model id.
        prot_sequence: query sequence, used when *folder_answer* is 'YES'.
        model_prot: model PDB id, used when *folder_answer* is 'YES'.
        use_uniprot: ``'YES'`` to use UniProt active-site annotations when
            the entry has any; any other value uses the M-CSA data.
        clustalw_exe: ClustalW executable passed to ``ClustalwCommandline``.

    Returns:
        The UniProt accession resolved for the model protein.

    Raises:
        RuntimeError: if the ClustalW output holds no pairwise score line.
        SystemExit: via ``quit()`` when the folder is missing, UniProt cannot
            be reached, no mapping exists, or no M-CSA candidate is found.
    """
    initial_location = os.getcwd()
    if not os.path.exists(folder_name):
        print("The folder specified doesn't exist! Check again.")
        quit()
    os.chdir(folder_name)
    # Always enter ActiveSite. Entering it only when it was just created left
    # reruns in the parent folder, where the "../" paths below are wrong.
    if not os.path.exists("ActiveSite"):
        os.mkdir("ActiveSite")
    os.chdir("./ActiveSite")

    # Download M-CSA in csv format - https://www.ebi.ac.uk/thornton-srv/m-csa/
    url_atlas = "https://www.ebi.ac.uk/thornton-srv/m-csa/media/flat_files/curated_data.csv"
    print(
        "Downloading the curated data from the Mechanism and Catalytic Site atlas - www.ebi.ac.uk -."
    )
    with urllib.request.urlopen(url_atlas) as data, open(
            './curated_data.csv', 'w', encoding="utf-8") as f:
        f.write(data.read().decode())

    # Define the query sequence: reuse the Blast&Modeller files or write the
    # sequence that was passed in.
    id_1 = ''
    if folder_answer == 'NO':
        if os.path.exists("../Blast&Modeller"):
            shutil.copy("../Blast&Modeller/query.fasta", "./query.fasta")
            shutil.copy("../Blast&Modeller/model.fasta", "./model.fasta")
            with open("./model.fasta") as file:
                id_1 = file.readline().replace(">", "").rstrip().upper()
    elif folder_answer == "YES":
        with open("query.fasta", "w") as f:
            f.write(">query" + "\n" + prot_sequence)
        id_1 = model_prot

    # From pdb id to uniprot id, used in the M-CSA file.
    # NOTE(review): this looks like the legacy UniProt ID-mapping endpoint;
    # confirm it is still served before relying on it.
    print("Searching for the uniprot equivalent of your pdb file...")
    url = 'https://www.uniprot.org/uploadlists/'
    params = {'from': 'PDB_ID', 'to': 'ACC', 'format': 'tab', 'query': id_1}
    req = urllib.request.Request(
        url, urllib.parse.urlencode(params).encode('utf-8'))
    try:
        with urllib.request.urlopen(req) as f:
            response = f.read()
    except Exception:
        print(
            "There has been some problem connecting to UniPROT. Check your internet connection and try again!"
        )
        time.sleep(3)
        quit()

    fromtoid = response.decode('utf-8')
    try:
        # Third tab-separated field, up to the first newline.
        uniprotID = fromtoid.split("\t")[2].split("\n")[0]
    except IndexError:
        print(
            "It seems that your model protein does not have a uniprot equivalent... "
            "You can try with an alternate model!")
        time.sleep(3)
        quit()
    if uniprotID.isalpha():
        print("Your pdb does not appear to have a UniProt equivalent. Sorry!")
    else:
        print("Uniprot id found. Extracting information...")
    with urllib.request.urlopen("https://www.uniprot.org/uniprot/" +
                                str(uniprotID) + ".xml") as handle:
        record = SeqIO.read(handle, "uniprot-xml")

    # Extract basic info from the uniprot entry: submitted names take
    # precedence over recommended ones.
    annotations = record.annotations
    protein_type = ""
    if "submittedName_fullName" in annotations:
        protein_type = annotations["submittedName_fullName"]
    elif "recommendedName_fullName" in annotations:
        protein_type = str(annotations["recommendedName_fullName"])

    proteinECnumber = ""
    if "submittedName_ecNumber" in annotations:
        proteinECnumber = annotations['submittedName_ecNumber']
    elif "recommendedName_ecNumber" in annotations:
        proteinECnumber = annotations['recommendedName_ecNumber']
    if isinstance(proteinECnumber, list):
        proteinECnumber = str(proteinECnumber[0])

    if proteinECnumber == "":
        print("It seems the information provided by uniprot is not enough. "
              "Please, check https://www.uniprot.org/uniprot/" +
              str(uniprotID))
        print(
            "Your protein doesn't seem to have an EC number... Not much we can do without it!"
        )

    # Report what UniProt says about the model, on screen and in the file.
    if isinstance(protein_type, list):
        summary = ("\nYour model protein, " + str(id_1) +
                   ", has been identified as a " + protein_type[0] +
                   " with EC number " + str(proteinECnumber) + ".\n")
    elif isinstance(protein_type, str):
        summary = ("\nYour model protein, " + str(id_1) +
                   ", has been identified as a " +
                   protein_type.replace("[", "").replace("]", "") +
                   " with EC number " + str(proteinECnumber) + ".\n")
    else:
        summary = ("\nYour model protein, " + str(id_1) +
                   ", has EC number " + str(proteinECnumber) + ".\n")
    with open("Active_site.txt", "a") as res:
        _echo(summary, res)
    time.sleep(3)

    # In a few cases, uniprot already provides the catalytic site. Then the
    # uniprot sequence itself is the reference and the ATLAS is not used.
    act_site = []
    bind_site = []
    catalytic_res = {}
    cofactor = []

    # Feature end positions, stored as 0-based indices into record.seq.
    for feature in record.features:
        if feature.type == "active site":
            act_site.append(int(feature.location.end) - 1)
        elif feature.type == "binding site":
            bind_site.append(int(feature.location.end) - 1)

    cofactorslist = [
        "NAD", "FAD", "FMN", "SAM", "PLP", "ATP", "UTP", "ADP", "UDP", "CTP",
        "PAPS", "acetyl COA", "Zn", "Fe", "Cu", "K", "Mg", "Mo", "Ni", "Se"
    ]
    if "comment_cofactor" in annotations:
        for ncof in cofactorslist:
            for comment in annotations["comment_cofactor"]:
                if ncof in comment:
                    cofactor.append(ncof)

    if act_site and use_uniprot == 'YES':
        up_maxid = annotations["accessions"][0]
        for index in act_site:
            # Store 1-based positions in a list per residue letter: the
            # mapping step subtracts 1 again, and two catalytic residues of
            # the same type must not overwrite each other.
            catalytic_res.setdefault(record.seq[index], []).append(index + 1)
        shutil.copyfile("query.fasta", "refsequp.fasta")
        with open("refsequp.fasta", "a") as f:
            f.write("\n>" + str(up_maxid) + "\n" + str(record.seq))
    else:
        print(
            "Your model protein does not have an active site identified in UniProt. "
            "Proceeding to check in the M-CSA database...")
        time.sleep(3)
        with open("./curated_data.csv", "r", encoding="utf-8",
                  newline="") as curated_file:
            curated_rows = list(csv.reader(curated_file))

        # EC number -> uniprot id for the M-CSA entries. Look for the full
        # EC number first and fall back once to its first two fields.
        upidEC = {row[3]: str(row[1]) for row in curated_rows}
        ec_query = proteinECnumber
        searchUP_dict = {
            ec: accession
            for ec, accession in upidEC.items() if ec_query and ec_query in ec
        }
        if not searchUP_dict:
            ec_query = ".".join(proteinECnumber.split(".")[:2])
            searchUP_dict = {
                ec: accession
                for ec, accession in upidEC.items()
                if ec_query and ec_query in ec
            }

        # Keep only the first 6 characters of each uniprot id.
        uniprotids_clean = [
            accession[0:6] for accession in searchUP_dict.values()
        ]

        # Retrieve the sequences of those identifiers and store each one in
        # its own fasta file.
        upECrefseq = []
        for accession in uniprotids_clean:
            with urllib.request.urlopen("https://www.uniprot.org/uniprot/" +
                                        accession + ".xml") as handle2:
                upECrefseq.append(SeqIO.read(handle2, "uniprot-xml").seq)
        for accession, sequence in zip(uniprotids_clean, upECrefseq):
            with open(accession + ".fasta", "w") as f:
                f.write("\n>" + accession + "\n" + str(sequence))

        # Pairwise alignment of the query against every candidate.
        aligner = Align.PairwiseAligner()
        aligner.open_gap_score = -10
        aligner.extend_gap_score = -0.1
        aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")

        query_record = SeqIO.read("query.fasta", "fasta")
        alignment_scores = []
        for accession in uniprotids_clean:
            candidate = SeqIO.read(accession + ".fasta", "fasta")
            alignment_scores.append(
                aligner.align(query_record.seq, candidate.seq).score)

        # The candidate with the highest alignment score is the best hit;
        # max() raises ValueError when there were no candidates at all.
        try:
            best_index = alignment_scores.index(max(alignment_scores))
        except ValueError:
            print(
                "Could not find a protein with enough homology with your query. "
                "You can try again with a different model!")
            time.sleep(4)
            quit()
        up_maxid = uniprotids_clean[best_index]
        print("The best hit for your protein is uniprotID: " + str(up_maxid) +
              ". \n")
        time.sleep(3)
        time.sleep(2)

        # Collect the catalytic residues and cofactors annotated for the
        # best hit. Both kinds of row are matched on the same 6-character id.
        for row in curated_rows:
            if str(row[1])[0:6] != str(up_maxid):
                continue
            if row[4] == "residue":
                positions = catalytic_res.setdefault(row[5], [])
                if row[7] not in positions:
                    positions.append(row[7])
            elif row[4] == "cofactor":
                cofactor.append(row[8] + "(" + row[5] + ")")
        # Drop duplicates, keeping first-seen order.
        cofactor = list(dict.fromkeys(cofactor))

        # Fasta file with the query followed by the best hit.
        shutil.copyfile("query.fasta", "refsequp.fasta")
        with open("refsequp.fasta", "a") as f:
            f.write("\n>" + up_maxid + "\n" + str(upECrefseq[best_index]))

    # Use ClustalW for the alignment and keep its console output on disk.
    sequp_align = ClustalwCommandline(clustalw_exe,
                                      infile="refsequp.fasta",
                                      score="percent")
    stdout, stderr = sequp_align()
    with open("alignment_output.txt", "w") as clustalscore:
        clustalscore.write(stdout)

    score = _parse_identity(stdout)
    if score is None:
        raise RuntimeError(
            "ClustalW output did not contain a 'Sequences (1:2)' score line")

    print("\t Percentage of identity = " + str(score) + "\n")
    time.sleep(3)
    # Anything above 15% identity is used for the prediction.
    high_homology = score > 15
    time.sleep(3)
    if not high_homology:
        print(
            "Your query sequence alignment resulted in less than 15% identity. "
            "Prediction of the active site would not be accurate using it.\n")
        time.sleep(3)

    # From the alignment and the catalytic residues of the reference,
    # work out the positions in the query.
    query_catalytic_res = {}
    if high_homology:
        sequp_alignment = AlignIO.read("refsequp.aln", "clustal")
        print(
            "Identifying the predicted catalytic residues in your protein...\n"
        )
        time.sleep(3)
        query = SeqIO.read("query.fasta", "fasta")
        query_catalytic_res = _map_catalytic_residues(catalytic_res,
                                                      sequp_alignment, query)
        time.sleep(3)

    with open("Active_site.txt", "a") as res:
        if high_homology:
            _echo("The predicted active site is formed by:", res)
            for key, value in query_catalytic_res.items():
                _echo(
                    str(key) + "--> " +
                    str(value).replace("[", "").replace("]", ""), res)
            _echo(
                "\n* : Residue predicted in the query but not present as part of the active site of the model.\n",
                res)
            time.sleep(3)
        else:
            _echo(
                "Due to low homology, calculation of the active site failed. "
                "Maybe you could check the model pdb file information in  "
                "https://www.uniprot.org/uniprot/" + str(uniprotID) +
                " for more information.", res)
            if bind_site:
                time.sleep(3)

        # At last, general information: cofactors, binding sites, or a hint
        # derived from the EC class when neither is known.
        if len(cofactor) > 0:
            _echo("\nIdentified cofactor(s): ", res)
            for x in cofactor:
                _echo(x, res)
        elif bind_site:
            _echo(
                "Also, in Uniprot some binding sites are indicated for the pdb model in positions: ",
                res)
            _echo(",".join(str(site) for site in bind_site), res)
        elif proteinECnumber.startswith("1."):
            _echo(
                "Your protein is an oxidoreductase, so it should have a cofactor. But it could not be identified. "
                "Please, check uniprot entry " + str(up_maxid) +
                " -the best hit identified from the M-CSA database- "
                "for more information.", res)
        elif proteinECnumber.startswith("2."):
            _echo(
                "Your protein is a transferase, commonly, cofactor dependant. Please, check uniprot entry "
                + str(up_maxid) +
                " -the best hit identified- for more information.", res)
        elif proteinECnumber.startswith("3."):
            _echo("Your protein is an hydrolase.", res)
        elif proteinECnumber.startswith("4."):
            _echo("Your protein is most similar to characterised lyases.",
                  res)
        elif proteinECnumber.startswith("5."):
            _echo("Your protein seems to be an isomerase.", res)
        elif proteinECnumber.startswith("6."):
            _echo(
                "Your protein seems to be an ligases. Normally, this enzymes are ATP dependant but no cofactor could "
                "be identified. Please, check uniprot entry " + str(up_maxid) +
                " -the best hit identified- for more information.", res)

    time.sleep(3)

    # Clean the directory of intermediate files which are not necessary.
    os.remove("refsequp.dnd")
    tokeep = ["query.fasta", str(up_maxid) + ".fasta"]
    for file in glob.glob("*.fasta"):
        if file not in tokeep:
            os.remove(file)

    print(
        "\nFinished running the ActiveSiteID!\n. Your results can be found in "
        + str(os.getcwd()) +
        ", in a file named Active_site.txt, together with the alignment and the fasta files for your sequences. "
        "You can run now the Surface&Clusters module! ")

    os.chdir(initial_location)

    return uniprotID
Пример #23
0
# In[2]:


from Bio.Align.Applications import ClustalwCommandline


# In[ ]:


# ClustalW binary used for every alignment in this cell.
handler_clustal = '/usr/bin/clustalw'

# All FASTA files found in the filtered_yeast folder.
filtered_files = glob.glob("filtered_yeast/*.fasta")

# Run ClustalW once per file, keeping the captured output of the last run
# in out_log / err_log.
for file in filtered_files:
    clustal_alignment = ClustalwCommandline(handler_clustal, infile=file)
    out_log, err_log = clustal_alignment()


# In[20]:


from Bio.Phylo.TreeConstruction import *
from Bio import AlignIO, Phylo
from matplotlib import pyplot as plt


# In[21]:


def create_tree(infile, outfile = None, algorithm = 'upgma'):
# NOTE(review): this path points at a Python module inside Biopython, not at
# a MUSCLE binary -- confirm the intended executable.
muscle_exe = "//anaconda3/lib/python3.7/site-packages/Bio/Application/_Phyml.py"
cmdline = MuscleCommandline(muscle_exe, input=inputFile, out=musFile, clw=True)
print(cmdline)

#assert os.path.isfile(muscle_exe), "Muscle executable missing"
# Run the command once and keep its output. It used to be invoked a second
# time just above, which executed the aligner twice.
stdout, stderr = cmdline()

# %% test to run clustalW to try and learn commandline
import os
from Bio.Align.Applications import ClustalwCommandline
clusFileName = "bioCoralSeqMuscleOut.aln"
# NOTE(review): `path` is assigned in the last cell of this script, so that
# cell has to be run before this one.
clusFile = path + "/" + clusFileName

# NOTE(review): as above, this is a Python file rather than a ClustalW binary.
clustalw_exe = r"//anaconda3/lib/python3.7/site-packages/Bio/Application/__init__.py"
cline = ClustalwCommandline(clustalw_exe, infile=clusFile)
print(cline)
#cline()

assert os.path.isfile(clustalw_exe), "Clustal W executable missing"
stdout, stderr = cline()

# %% convert alignment file from fasta from muscle in Poseidon to phyli-relaxed
msaFile = "/muscleAlignmentCoralSeq.msa"
outFile = "/muscleAlignmentCoralSeq.phy"

import os
path = os.getcwd()
#path = "/Users/kgrabb/Documents/2018.05CoralLarvae/Genomes/Poseidon/blastResults/v2"
print(path)
print(path + msaFile)
Пример #25
0
    def extract_sequence_motifs(self,
                                X,
                                interp_steps=100,
                                save_path=None,
                                max_examples=4000,
                                mer_size=12):
        """Extract the highest-scoring k-mer of each sequence and plot motifs.

        For every example the embedded input is interpolated from an all-zero
        reference in ``interp_steps`` steps, ``self.g_nodes`` is evaluated at
        each step, neighbouring steps are averaged and the mean is multiplied
        by the difference between input and reference, giving one score per
        position. The window of ``mer_size`` positions with the largest
        summed score is kept (upper-cased, with T replaced by U).

        The collected k-mers are then written to FASTA files in *save_path*,
        aligned with ``clustalw2`` and passed to ``lib.plot.plot_weblogo``:
        once per prefix size in 100/500/1000/2000, and once each for the
        even- and odd-indexed k-mers.

        Args:
            X: three parallel iterables (node tensors, segment lengths, raw
                sequences); they are consumed with ``zip(*X)``.
            interp_steps: number of interpolation intervals.
            save_path: output directory. It is passed to ``os.path.join``,
                so the default ``None`` must be overridden by the caller.
            max_examples: stop after this many examples.
            mer_size: window length of the extracted k-mers.
        """
        counter = 0
        all_mers = []
        # NOTE(review): all_scores is collected but never used; the "top N"
        # selections below take the first N k-mers in input order. Confirm
        # whether a ranking by score was intended.
        all_scores = []

        for _node_tensor, _segment, _raw_seq in zip(*X):
            if counter >= max_examples:
                break
            _meshed_node_tensor = np.array(
                [self.embedding_vec[idx] for idx in _node_tensor])
            _meshed_reference_input = np.zeros_like(_meshed_node_tensor)
            delta = _meshed_node_tensor - _meshed_reference_input
            # interp_steps + 1 inputs on the straight line from the
            # reference to the real input.
            new_node_tensor = [
                _meshed_reference_input + i / interp_steps * delta
                for i in range(0, interp_steps + 1)
            ]

            feed_dict = {
                self.node_tensor: np.concatenate(np.array(new_node_tensor),
                                                 axis=0),
                self.max_len: _segment,
                self.segment_length: [_segment] * (interp_steps + 1),
                self.is_training_ph: False
            }

            grads = self.sess.run(self.g_nodes, feed_dict).reshape(
                (interp_steps + 1, _segment, 4))
            # Average neighbouring interpolation steps.
            grads = (grads[:-1] + grads[1:]) / 2.0
            node_scores = np.sum(np.average(grads, axis=0) * delta, axis=-1)
            mer_scores = [
                np.sum(node_scores[start:start + mer_size])
                for start in range(len(node_scores) - mer_size + 1)
            ]
            max_scores = np.max(node_scores)
            best_start = np.argmax(mer_scores)
            all_mers.append(_raw_seq[best_start:best_start +
                                     mer_size].upper().replace('T', 'U'))
            all_scores.append(max_scores)
            counter += 1

        def _align_and_plot(mers, fasta_name, aligned_name, motif_name,
                            devnull):
            # Write the k-mers to a FASTA file, align them with clustalw2
            # and draw the sequence logo of the alignment.
            fasta_path = os.path.join(save_path, fasta_name)
            with open(fasta_path, 'w') as f:
                for i, seq in enumerate(mers):
                    print('>{}'.format(i), file=f)
                    print(seq, file=f)
            # multiple sequence alignment
            out_fasta_path = os.path.join(save_path, aligned_name)
            cline = ClustalwCommandline("clustalw2",
                                        infile=fasta_path,
                                        type="DNA",
                                        outfile=out_fasta_path,
                                        output="FASTA")
            sp.call(str(cline), shell=True, stdout=devnull)
            motif_path = os.path.join(save_path, motif_name)
            lib.plot.plot_weblogo(out_fasta_path, motif_path)

        # The devnull handle is now closed when the block exits.
        with open(os.devnull, 'w') as FNULL:
            for top_rank in [100, 500, 1000, 2000]:
                # align top_rank mers
                _align_and_plot(
                    np.array(all_mers)[:top_rank],
                    'top%d_mers.fa' % (top_rank),
                    'aligned_top%d_mers.fa' % (top_rank),
                    'top%d-sequence_motif.jpg' % (top_rank), FNULL)

            _align_and_plot(all_mers[::2], 'even_mers.fa',
                            'aligned_even_mers.fa',
                            'top1000-even-sequence_motif.jpg', FNULL)
            _align_and_plot(all_mers[1::2], 'odd_mers.fa',
                            'aligned_odd_mers.fa',
                            'top1000-odd-sequence_motif.jpg', FNULL)
Пример #26
0
def fasta2select(fastafilename, is_aligned=False,
                 ref_resids=None, target_resids=None,
                 ref_offset=0, target_offset=0, verbosity=3,
                 alnfilename=None, treefilename=None, clustalw="clustalw2"):
    """Return selection strings that will select equivalent residues.

    The function aligns two sequences provided in a FASTA file and
    constructs MDAnalysis selection strings of the common atoms. When
    these two strings are applied to the two different proteins they
    will generate AtomGroups of the aligned residues.

    *fastafilename* contains the two un-aligned sequences in FASTA
    format. The reference is assumed to be the first sequence, the
    target the second. ClustalW_ produces a pairwise
    alignment (which is written to a file with suffix .aln).  The
    output contains atom selection strings that select the same atoms
    in the two structures.

    Unless *ref_offset* and/or *target_offset* are specified, the resids
    in the structure are assumed to correspond to the positions in the
    un-aligned sequence, namely the first residue has resid == 1.

    In more complicated cases (e.g. when the resid numbering in the
    structure/psf has gaps due to missing parts), simply provide the
    sequence of resids as they appear in the psf in *ref_resids* or
    *target_resids*, e.g. ::

       target_resids = [a.resid for a in trj.select_atoms('name CA')]

    (This translation table *is* combined with any value for *xxx_offset*!)

    Columns of the alignment in which either sequence has a gap are
    skipped. For the remaining columns the selection is
    ``backbone or name CB`` unless one of the two residues is a glycine
    ('G'), in which case only ``backbone`` is selected.

    :Arguments:
      *fastafilename*
         FASTA file with first sequence as reference and
         second the one to be aligned (ORDER IS IMPORTANT!)
      *is_aligned*
         False: run clustalw for sequence alignment; True: use
         the alignment in the file (e.g. from STAMP) [``False``]
      *ref_offset*
         add this number to the column number in the FASTA file
         to get the original residue number
      *target_offset*
         same for the target
      *ref_resids*
         sequence of resids as they appear in the reference structure
      *target_resids*
         sequence of resids as they appear in the target
      *verbosity*
         accepted for interface compatibility; not used in this
         function body [``3``]
      *alnfilename*
         filename of ClustalW alignment (clustal format) that is
         produced by *clustalw* when *is_aligned* = ``False``.
         ``None`` uses the name and path of *fastafilename* and
         substitutes the suffix with '.aln'.[``None``]
      *treefilename*
         filename of ClustalW guide tree (Newick format);
         if ``None`` the filename is generated from *alnfilename*
         with the suffix '.dnd' instead of '.aln' [``None``]
      *clustalw*
         path to the ClustalW (or ClustalW2) binary; only
         needed for *is_aligned* = ``False`` ["clustalw2"]

    :Returns:
      *select_dict*
          dictionary with 'reference' and 'mobile' selection string
          that can be used immediately in :func:`rms_fit_trj` as
          ``select=select_dict``.

    :Raises:
      :exc:`ValueError` if the alignment does not contain exactly two
      sequences or if the two sequences use different gap characters.
      Any exception raised while running ClustalW is logged and
      re-raised unchanged.
    """
    import Bio.SeqIO
    import Bio.AlignIO
    import Bio.Alphabet
    import numpy as np

    # NOTE(review): Bio.Alphabet is not available in recent Biopython
    # releases (removed in 1.78, to my knowledge) -- confirm which Biopython
    # version this function is expected to run against.
    protein_gapped = Bio.Alphabet.Gapped(Bio.Alphabet.IUPAC.protein)
    if is_aligned:
        logger.info("Using provided alignment %r", fastafilename)
        with open(fastafilename) as fasta:
            alignment = Bio.AlignIO.read(fasta, "fasta", alphabet=protein_gapped)
    else:
        from Bio.Align.Applications import ClustalwCommandline
        import os.path

        # derive the output file names from the input name when not given
        if alnfilename is None:
            filepath, _ = os.path.splitext(fastafilename)
            alnfilename = filepath + '.aln'
        if treefilename is None:
            filepath, _ = os.path.splitext(alnfilename)
            treefilename = filepath + '.dnd'
        run_clustalw = ClustalwCommandline(clustalw, infile=fastafilename, type="protein",
                                           align=True, outfile=alnfilename, newtree=treefilename)
        logger.debug("Aligning sequences in %r with %r.", fastafilename, clustalw)
        logger.debug("ClustalW commandline: %r", str(run_clustalw))
        try:
            stdout, stderr = run_clustalw()
        except Exception:
            # log with traceback and some help for the user, then let the
            # caller see the original exception
            logger.exception("ClustalW %r failed", clustalw)
            logger.info("(You can get clustalw2 from http://www.clustal.org/clustal2/)")
            raise
        with open(alnfilename) as aln:
            alignment = Bio.AlignIO.read(aln, "clustal", alphabet=protein_gapped)
        logger.info("Using clustalw sequence alignment %r", alnfilename)
        logger.info("ClustalW Newick guide tree was also produced: %r", treefilename)

    nseq = len(alignment)
    if nseq != 2:
        raise ValueError("Only two sequences in the alignment can be processed.")

    orig_resids = [ref_resids, target_resids]  # implicit assertion that
    # we only have two sequences in the alignment
    offsets = [ref_offset, target_offset]
    for iseq, a in enumerate(alignment):  # need iseq index to change orig_resids
        if orig_resids[iseq] is None:
            # build default: assume consecutive numbering of all
            # residues in the alignment
            GAP = a.seq.alphabet.gap_char
            length = len(a.seq) - a.seq.count(GAP)
            orig_resids[iseq] = np.arange(1, length + 1)
        else:
            orig_resids[iseq] = np.asarray(orig_resids[iseq])
    # add offsets to the sequence <--> resid translation table
    seq2resids = [resids + offset for resids, offset in zip(orig_resids, offsets)]
    del orig_resids
    del offsets

    def resid_factory(alignment, seq2resids):
        """Return a function that gives the resid for a position ipos in
        the nseq'th alignment.

        resid = resid_factory(alignment,seq2resids)
        r = resid(nseq,ipos)

        It is based on a look up table that translates position in the
        alignment to the residue number in the original
        sequence/structure.

        The first index of resid() is the alignment number, the
        second the position in the alignment.

        seq2resids translates the residues in the sequence to resid
        numbers in the psf. In the simplest case this is a linear map
        but if whole parts such as loops are omitted from the protein
        the seq2resids may have big gaps.

        Format: a sequence of two numpy arrays; the first array is for
        the reference, the second for the target. The index in each
        array gives the consecutive number of the amino acid in the
        sequence, the value the resid in the structure/psf.

        Note: assumes that alignments have same length and are padded if
        necessary.
        """
        # could maybe use Bio.PDB.StructureAlignment instead?
        nseq = len(alignment)
        t = np.zeros((nseq, alignment.get_alignment_length()), dtype=int)
        for iseq, a in enumerate(alignment):
            GAP = a.seq.alphabet.gap_char
            # The running count of non-gap characters, minus 1, is the
            # 0-based index into seq2resids of the residue at (or last seen
            # before) each column. Columns ahead of the first residue get
            # index -1; they hold a gap and are skipped when the selection
            # is assembled below.
            t[iseq, :] = seq2resids[iseq][np.cumsum(np.where(
                np.array(list(a.seq)) == GAP, 0, 1)) - 1]

        def resid(nseq, ipos, t=t):
            return t[nseq, ipos]

        return resid

    resid = resid_factory(alignment, seq2resids)

    res_list = []  # collect individual selection string
    # could collect just resid and type (with/without CB) and
    # then post-process and use ranges for continuous stretches, eg
    # ( resid 1:35 and ( backbone or name CB ) ) or ( resid 36 and backbone ) ...

    GAP = alignment[0].seq.alphabet.gap_char  # should be the same for both seqs
    if GAP != alignment[1].seq.alphabet.gap_char:
        raise ValueError("Different gap characters in sequence 'target' and 'mobile'.")
    # range (not xrange) so that the loop runs on Python 2 and Python 3
    for ipos in range(alignment.get_alignment_length()):
        aligned = list(alignment[:, ipos])
        if GAP in aligned:
            continue  # skip residue
        template = "resid %i"
        if 'G' not in aligned:
            # can use CB
            template += " and ( backbone or name CB )"
        else:
            # glycine in at least one sequence: restrict to the backbone
            template += " and backbone"
        template = "( " + template + " )"

        res_list.append([template % resid(iseq, ipos) for iseq in range(nseq)])

    # rows of res_list are alignment columns; transpose to one row per sequence
    sel = np.array(res_list).transpose()

    ref_selection = " or ".join(sel[0])
    target_selection = " or ".join(sel[1])
    return {'reference': ref_selection, 'mobile': target_selection}
Пример #27
0
import sys, re
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO

# The input FASTA file is named by the first command-line argument; both
# output names are built by appending a suffix to it.
input_fa = sys.argv[1]
output_aln = "{0}.aln".format(input_fa)
output_fa = "{0}.fa".format(output_aln)

# Align with ClustalW2; the captured console output is not used afterwards.
cmd = ClustalwCommandline('clustalw2', infile=input_fa, outfile=output_aln)
stdout, stderr = cmd()

# Read the Clustal-format alignment back and save it again as FASTA.
align = AlignIO.read(output_aln, 'clustal')
AlignIO.write(align, output_fa, 'fasta')
	seq2 = [x[:-1] for x in seq2.readlines() if '>' not in x]
	seq2 = ''.join(seq2)
	infile.write(seq2)
	return infile

# Build the ClustalW input file from the mitogenome (reverse-complemented when
# requested on the command line) and the COI sequence.
if args.ReverseComplement is None:
	infile = create_infile(mitogenome, coi)
else:
	# The return value was never used; the call is kept because it presumably
	# writes the 'RC_<name>' file that is read right below -- TODO confirm.
	reverse_complement(args.WholeMitogenome)
	# 'with' closes the handle, which the former open(...).read() left open
	with open('RC_' + str(args.WholeMitogenome)) as rc_handle:
		rc_mitogenome = rc_handle.read()
	infile = create_infile(rc_mitogenome, coi)

infile.close()
coi.close()

# Run ClustalW on the combined file. Its stdout is captured only to keep the
# console quiet, so it has to be drained: subprocess.call() with
# stdout=PIPE can block once the child fills the pipe buffer.
cline = ClustalwCommandline("clustalw", infile="infile.fasta")
proc = subprocess.Popen(str(cline),
  stdout=subprocess.PIPE,
  shell=(sys.platform != "win32"))
proc.communicate()
child = proc.returncode  # same value subprocess.call() returned before

# Count the gap characters in front of the first non-gap letter of every
# record whose id is 'COI'.
align = AlignIO.read("infile.aln", "clustal")
count = 0
for record in align:
	if record.id == 'COI':
		for letter in record.seq:
			if letter != '-':
				break
			count += 1

Пример #29
0
# Print a short report for every HSP whose E-value is below E_VALUE_THRESH.
for blast_record in blast_records:
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if not hsp.expect < E_VALUE_THRESH:
                continue  # not significant enough, nothing to report
            print('****Alignment****')
            report_lines = (
                ('sequence:', alignment.title),
                ('length:', alignment.length),
                ('e value:', hsp.expect),
            )
            for label, value in report_lines:
                print(label, value)

# All records have been processed; release the handle they came from.
result.close()

# Multiple alignment and phylogenetic tree
help(ClustalwCommandline)

# NOTE(review): this file name is spelled 'Probitina_MA.fasta' while the MUSCLE
# input below is 'Proibitina_MA.fasta' -- confirm which spelling is intended.
clustalw_input = "C:/Users/Zé Freitas/Desktop/Mestrado/Labs_Bioinf/Trabalho prático/scripts/Labs_Bioinf/Proibitina/Probitina_MA.fasta"
cline = ClustalwCommandline("clustalw2", infile=clustalw_input)
# The command line is only printed here, not executed.
print(cline)

muscle_input = "C:/Users/Zé Freitas/Desktop/Mestrado/Labs_Bioinf/Trabalho prático/scripts/Labs_Bioinf/Proibitina/Proibitina_MA.fasta"
cline = MuscleCommandline(input=muscle_input,
                          out="Proibitina_MA.aln",
                          clw=True)
# Again only printed, not executed.
print(cline)

# Read the multiple alignment file
aligned_fasta = "C:/Users/Zé Freitas/Desktop/Mestrado/Labs_Bioinf/Trabalho prático/scripts/Labs_Bioinf/Proibitina/Proibitinalinhados.fasta"
alignment = AlignIO.read(aligned_fasta, "fasta")
Пример #30
0
from Bio.Align.Applications import ClustalwCommandline
# Assemble (but do not run) the ClustalW2 command line for the FASTA input.
fasta_path = "/home/koreanraichu/lactobacillus.fasta"
cline = ClustalwCommandline("clustalw2", infile=fasta_path)
# Expected to show: clustalw2 -infile=/home/koreanraichu/lactobacillus.fasta
print(cline)
help(ClustalwCommandline)  # show the wrapper's built-in help
# Original author's note: what am I supposed to do with this... do I need to
# install it... isn't the one in the cookbook meant for Windows...