Пример #1
0
 def test_simple_clustal_strict(self):
     """Run MUSCLE on Fasta/f002 and check its strict Clustal output."""
     fasta_path = "Fasta/f002"
     self.assertTrue(os.path.isfile(fasta_path))
     expected = sorted(SeqIO.parse(fasta_path, "fasta"),
                       key=lambda rec: rec.id)
     # Build the command through the set_parameter() interface.
     cmdline = MuscleCommandline(muscle_exe)
     cmdline.set_parameter("in", fasta_path)
     # Ask for Clustal output with a CLUSTAL header; the switch is None by
     # default, which is treated as False.
     cmdline.set_parameter("clwstrict", True)
     command = str(cmdline)
     self.assertEqual(command.rstrip(),
                      _escape_filename(muscle_exe) +
                      " -in Fasta/f002 -clwstrict")
     # The repr has to evaluate back to an equivalent command line.
     self.assertEqual(str(eval(repr(cmdline))), command)
     use_shell = sys.platform != "win32"
     proc = subprocess.Popen(command,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=use_shell)
     # Parse the alignment straight from the child's stdout.
     aligned = AlignIO.read(proc.stdout, "clustal")
     aligned.sort()
     # -quiet was not given, so progress reports are expected on stderr.
     self.assertTrue(proc.stderr.read().strip().startswith("MUSCLE"))
     self.assertEqual(len(expected), len(aligned))
     for before, after in zip(expected, aligned):
         self.assertEqual(before.id, after.id)
         # Dropping the gaps must give back the input sequence.
         self.assertEqual(str(after.seq).replace("-", ""), str(before.seq))
     exit_status = proc.wait()
     self.assertEqual(exit_status, 0)
     proc.stdout.close()
     proc.stderr.close()
     del proc
Пример #2
0
def muscleProcess(threadID, filebase, outbase, treebase):
	"""Run MUSCLE for one worker and have it write a neighbor-joining tree.

	``filebase``, ``outbase`` and ``treebase`` are %-format templates that
	are filled in with ``threadID`` to obtain the input FASTA, the alignment
	output and the tree file paths. Nothing is returned.
	"""
	fasta_path = filebase % threadID
	aligned_path = outbase % threadID
	tree_path = treebase % threadID

	print("Building NJ tree from %s" % fasta_path)

	# A single iteration with neighbor-joining clustering; the first-pass
	# tree goes to tree_path.
	run_muscle = MuscleCommandline(cmd=muscle,
	                               input=fasta_path,
	                               out=aligned_path,
	                               tree1=tree_path,
	                               cluster1="neighborjoining",
	                               maxiters=1)
	# The (stdout, stderr) result is deliberately discarded.
	run_muscle()
Пример #3
0
    def align_alleles(self):
        """
        Align the extracted alleles with MUSCLE.

        Reads the sequences from ``self.unaligned_alleles`` and has MUSCLE
        write the alignment to ``self.aligned_alleles``.
        """
        logging.info('Aligning extracted alleles with MUSCLE')
        run_muscle = MuscleCommandline(out=self.aligned_alleles,
                                       input=self.unaligned_alleles)
        run_muscle()
Пример #4
0
def alignment():
    """Align ``file_ali_in`` with MUSCLE and print the resulting alignment.

    The alignment is written to ``file_ali_out`` in strict Clustal format.
    When ``muscle_loc`` is set it is used as the MUSCLE executable;
    otherwise Biopython's default command name is used.
    See http://www.drive5.com/muscle/ for MUSCLE itself.
    """
    options = dict(input=file_ali_in, out=file_ali_out, clwstrict=True)
    if muscle_loc:
        muscle_cline = MuscleCommandline(muscle_loc, **options)
    else:
        muscle_cline = MuscleCommandline(**options)
    muscle_cline()

    # The context manager closes the file even if reading fails.
    with open(file_ali_out, "r") as handle:
        # Parenthesised single-argument print works on Python 2 and 3.
        print(handle.read())
Пример #5
0
def muscle(path_to_seq):
    """Align the FASTA file at ``path_to_seq`` and save it as aligned.fasta.

    MUSCLE is run with no output file, so the alignment arrives on stdout
    and is written to ``aligned.fasta`` in the current working directory.
    """
    muscle_exe = "/afs/andrew.cmu.edu/usr23/lleung/muscle/muscle"
    muscle_cline = MuscleCommandline(muscle_exe, input=path_to_seq)
    stdout, _stderr = muscle_cline()
    # TODO: change name of alignment file
    with open("aligned.fasta", "w") as handle:
        handle.write(stdout)
Пример #6
0
def precentage_identity_readP_anntP(bm_ids, out_prec_iden, orfs_reads,
                                    annt_prot):
    """Write pairwise percent identity between read ORFs and annotated proteins.

    Args:
        bm_ids: path to a whitespace-separated mapping file; column 0 is the
            read id, column 1 the transcript id (only the text before the
            first ``|`` is used).
        out_prec_iden: output prefix. ``<prefix>.txt`` receives the table;
            ``<prefix>_transcpNoFound.txt`` and ``<prefix>_readsNoFound.txt``
            collect ids that could not be paired.
        orfs_reads: FASTA file with the read ORFs.
        annt_prot: FASTA file with the annotated proteins.

    Each pair is aligned with MUSCLE (strict Clustal output, sequences fed on
    stdin). Identity is the number of identical alignment columns times 100
    divided by the alignment length. A failure on one pair is printed and the
    loop moves on to the next pair.
    """
    header = "\t".join(["read", "txt", "per_identity", "len_alig", "match",
                        "mismatch"]) + "\n"
    with open(out_prec_iden + ".txt", "w") as pIdentity:
        pIdentity.write(header)
        indx_orfs_reads = SeqIO.index(orfs_reads, "fasta")
        indx_annt_protn = SeqIO.index(annt_prot, "fasta")
        txt_dic = get_txt_from_anntProtein(indx_annt_protn)
        in_both = 0
        no_read = 0
        no_txt = 0
        # Open the mapping file in a context manager so the handle is closed
        # (the original iterated over a bare open() and leaked it).
        with open(bm_ids) as mapping:
            for map_ids in mapping:
                df = map_ids.split()
                txt = str(df[1]).split("|")[0]
                read = str(df[0])
                if read in indx_orfs_reads and txt in txt_dic:
                    try:
                        in_both += 1
                        records = (indx_orfs_reads[read],
                                   indx_annt_protn[txt_dic[txt]])
                        handle = StringIO()
                        SeqIO.write(records, handle, "fasta")
                        muscle_cline = MuscleCommandline(clwstrict=True)
                        stdout, stderr = muscle_cline(stdin=handle.getvalue())
                        align = AlignIO.read(StringIO(stdout), "clustal")
                        target = str(align[0].seq)
                        query = str(align[1].seq)
                        match = 0
                        mismatch = 0
                        for t, q in zip(target, query):
                            if t == q:
                                match += 1
                            else:
                                mismatch += 1
                        pIdentity.write(
                            str(read) + "\t" + str(txt) + "\t" +
                            str((match * 100 / len(target))) + "\t" +
                            str(len(target)) + "\t" + str(match) + "\t" +
                            str(mismatch) + "\n")
                    except Exception as e:
                        # Best effort: report the failing pair and continue.
                        print(e)
                elif read in indx_orfs_reads:
                    # The read exists but its transcript has no protein.
                    no_txt = no_txt + 1
                    with open(out_prec_iden + "_transcpNoFound.txt",
                              "a+") as tnf:
                        tnf.write(str(txt) + "\n")
                elif txt in txt_dic:
                    # The transcript exists but the read has no ORF.
                    no_read = no_read + 1
                    with open(out_prec_iden + "_readsNoFound.txt",
                              "a+") as rnf:
                        rnf.write(str(read) + "\n")

    print("both: ", in_both, "read_no_found: ", no_read, "txt_no_found: ",
          no_txt)
Пример #7
0
def align_fasta(infname, outfname, debug=False):
    """
    Generate an alignment for the given fasta file, then trim it with R.

    Args:
        infname (str): Path to fasta to be aligned.
        outfname (str): Path the aligned fasta is written to. The R trimming
            script receives this path as both of its arguments.
        debug (bool): When True, both subprocesses are started with
            ``subprocess.CREATE_NEW_CONSOLE``.

    Raises:
        ChildProcessError: If the output file is missing or empty after both
            steps have run.
        RScriptFailedError: If the R trimming script exits non-zero.
    """
    muscle_exec = {
        "Windows": "niclassify/bin/muscle3.8.31_i86win32.exe",
        "Linux": "niclassify/bin/muscle3.8.31_i86linux64",
        "Darwin": "niclassify/bin/muscle3.8.31_i86darwin64"
    }[PLATFORM]

    alignment_call = str(
        MuscleCommandline(os.path.realpath(
            os.path.join(MAIN_PATH, muscle_exec)),
                          input=os.path.realpath(infname),
                          out=os.path.realpath(outfname)))

    print(alignment_call)

    # Only look up CREATE_NEW_CONSOLE when debugging, exactly as before, so
    # the attribute is never touched on the non-debug path.
    console_kwargs = (
        {"creationflags": subprocess.CREATE_NEW_CONSOLE} if debug else {})

    subprocess.run(alignment_call, shell=True, **console_kwargs)

    r_script = os.path.realpath(
        os.path.join(MAIN_PATH, "niclassify/core/scripts/trim_alignment.R"))

    trim_call = [R_LOC, r_script, outfname, outfname]

    proc = subprocess.run(trim_call, env=os.environ.copy(), **console_kwargs)

    # A missing output file is an alignment failure too; report it with the
    # same exception instead of letting os.stat raise FileNotFoundError.
    if not os.path.exists(outfname) or os.stat(outfname).st_size == 0:
        raise ChildProcessError("Sequence Alignment Failed")

    if proc.returncode != 0:
        raise RScriptFailedError("R TrimAlignment failed")
Пример #8
0
 def run_muscle(self, sequences_to_align, output_file_name, muscle_mode):
     """
     Build the input file and run the local MUSCLE executable on it.

     ``muscle_mode`` selects the option preset: "highest_accuracy",
     "large_datasets" or "fastest". Any other value raises KeyError.
     """
     # TODO: to insert the following options:
     #           - guide tree from:
     #               - none
     #               - first iteration
     #               - second iteration
     self.pymod.build_sequence_file(sequences_to_align,
                                    output_file_name,
                                    unique_indices_headers=True)
     alignments_dir = self.pymod.alignments_dirpath
     # Every preset shares these paths: the input FASTA, the FASTA output
     # in tree order, and the ALN output.
     options = {
         "input": os.path.join(alignments_dir, output_file_name + ".fasta"),
         "out": os.path.join(alignments_dir,
                             output_file_name + ".out_fasta"),
         "clwout": os.path.join(alignments_dir, output_file_name + ".aln"),
     }
     muscle_exec = self.tool["exe_file_path"].get_value()
     if muscle_mode == "highest_accuracy":
         # MUSCLE's own settings, nothing extra to pass.
         pass
     elif muscle_mode == "large_datasets":
         options["maxiters"] = 2
     elif muscle_mode == "fastest":
         options.update(maxiters=1,
                        diags=True,
                        sv=True,
                        distance1="kbit20_3")
     else:
         raise KeyError(muscle_mode)
     cline = MuscleCommandline(muscle_exec, **options)
     self.pymod.execute_subprocess(str(cline))
Пример #9
0
def muscle_align(fasta_in, outname):
    """
    Align ``fasta_in`` with MUSCLE, writing the result to ``outname``.

    Returns the alignment parsed back from ``outname`` as FASTA.
    """
    run_muscle = MuscleCommandline(input=fasta_in, out=outname)
    run_muscle()
    return AlignIO.read(outname, 'fasta')
Пример #10
0
def multiple_sequence_alignment(
    records,
    output_fn="/var/www/html/dl/alignment.fasta",
    format="clustal",
    id_prefix="",
    index=None,
):
    """Align ``records`` with MUSCLE and return the alignment as a DataFrame.

    Args:
        records: sequence records, or plain strings. Strings are wrapped in
            ``SeqRecord`` objects first.
        output_fn: where MUSCLE writes the aligned FASTA. ``None`` selects a
            uniquely named file in the system temp directory.
        format: unused by this function; kept for interface compatibility.
        id_prefix: prefix for generated ids (``<prefix>-000``, ...) when
            ``records`` are strings and ``index`` is None.
        index: ids to use for string ``records`` instead of generated ones.

    Returns:
        A pandas DataFrame with one row per sequence (indexed by sequence id,
        sorted) and one column per alignment position.

    The aligned file can be viewed at
    https://www.ncbi.nlm.nih.gov/projects/msaviewer/
    (e.g. https://soerendip.com/dl/alignment.fasta).
    """

    if isinstance(records[0], str):
        if index is None:
            records = [
                SeqRecord(Seq(r), id=f"{id_prefix}-{i:03.0f}")
                for i, r in enumerate(records)
            ]
        else:
            records = [
                SeqRecord(Seq(r), id=f"{_id}")
                for r, _id in zip(records, index)
            ]

    path = tempfile.gettempdir()
    job_id = "msa-" + str(uuid())
    tmp_inputs_fn = os.path.join(path, job_id + ".faa")
    if output_fn is None:
        output_fn = os.path.join(path, job_id + ".fasta")
    tmp_log = os.path.join(path, job_id + ".log")

    try:
        SeqIO.write(records, tmp_inputs_fn, "fasta")
        msa = MuscleCommandline(input=tmp_inputs_fn,
                                out=output_fn,
                                diags=True,
                                maxiters=1,
                                log=tmp_log)
        msa()
    finally:
        # The input FASTA is only scaffolding for MUSCLE; do not leave one
        # behind per call. The log file is kept for diagnostics.
        if os.path.exists(tmp_inputs_fn):
            os.remove(tmp_inputs_fn)

    with open(output_fn, "r") as file:
        align = AlignIO.read(file, "fasta")

    # Use the Stockholm rendering as a simple "<id> <sequence>" line source.
    result = []
    row_ids = []
    for line in align.format("stockholm").split("\n"):
        # Skip blank lines, the "//" terminator and "#" annotation lines.
        if not line or line.startswith(("//", "#")):
            continue
        fields = line.split(" ")
        row_ids.append(fields[0])
        result.append(list(fields[1]))

    return pd.DataFrame(np.array(result), index=row_ids).sort_index()
Пример #11
0
 def generaAln(self):
     """Close ``self.file`` and align alinear.fasta into arbol.aln.

     MUSCLE is invoked through the shell and asked for Clustal (-clw)
     output.
     """
     self.file.close()
     muscle_exe = r"C:\Users\Gerson\Downloads\muscle.exe"
     command = str(
         MuscleCommandline(muscle_exe,
                           input="alinear.fasta",
                           out="arbol.aln",
                           clw=True))
     subprocess.call(command, shell=True)
Пример #12
0
def align_with_muscle(input_fasta):
    """Align ``input_fasta`` with the bundled MUSCLE binary.

    Returns the alignment parsed from MUSCLE's standard output.
    """
    executable = Path("../bin/muscle3.8.31_i86linux64")
    run_muscle = MuscleCommandline(executable, input=input_fasta)
    # With no output file given, MUSCLE prints the alignment on stdout; the
    # second item is whatever it wrote to stderr.
    aligned_text, _messages = run_muscle()
    # StringIO lets AlignIO read the captured text as if it were a file.
    alignment_handle = StringIO(aligned_text)
    return AlignIO.read(alignment_handle, "fasta")
Пример #13
0
def muscle_largeinput(file):
    """Align ``file`` with MUSCLE, streaming the result, and print it.

    The alignment is parsed directly from the child's stdout pipe instead of
    being collected into one string first.
    """
    muscle_cline = MuscleCommandline(input=file)
    # The command is a single string with arguments, so it needs a shell
    # everywhere except Windows. The previous test against "linux2" ran it
    # without a shell on Python 2 Linux.
    child = subprocess.Popen(str(muscle_cline),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    try:
        muscle_align = AlignIO.read(child.stdout, "fasta")
    finally:
        # Release the pipes and reap the child so it is not left as a zombie.
        child.stdout.close()
        child.stderr.close()
        child.wait()
    print(muscle_align)
Пример #14
0
def quickAlign(refseq, testseq, maxiters=None, diags=None, gapopen=None):
    """Align two sequences with MUSCLE and return them keyed by name.

    Args:
        refseq, testseq: strings or objects with a ``.seq`` attribute (such
            as SeqRecord). Existing gap characters are stripped first.
        maxiters, diags, gapopen: optional MUSCLE settings, applied only
            when not None.

    Returns:
        dict mapping "ref" and "test" to the aligned sequence strings.

    Exits the interpreter via ``sys.exit`` if an input is neither a string
    nor has a ``.seq`` attribute.
    """

    def _ungapped(seq):
        # Accept plain strings first, then fall back to SeqRecord-like input.
        try:
            return re.sub("-", "", seq)
        except TypeError:
            try:
                return re.sub("-", "", str(seq.seq))
            except AttributeError:
                # give up
                sys.exit(
                    "quickAlign() requires inputs to be either strings or SeqRecord objects"
                )

    refseq = _ungapped(refseq)
    testseq = _ungapped(testseq)

    data = ">ref\n%s\n>test\n%s\n" % (refseq, testseq)

    muscle_cline = MuscleCommandline(cmd=muscle, quiet=True)
    if maxiters is not None:
        muscle_cline.maxiters = maxiters
    if diags is not None:
        # Was "diag", an undefined name that raised NameError.
        muscle_cline.diags = diags
    if gapopen is not None:
        muscle_cline.gapopen = gapopen

    stdout, stderr = muscle_cline(stdin=data)

    return {p.id: str(p.seq) for p in SeqIO.parse(StringIO(stdout), "fasta")}
Пример #15
0
 def test_with_multiple_output_formats(self):
     """Run MUSCLE once while requesting several output formats."""
     fasta_path = "Fasta/f002"
     html_path = "temp_f002.html"
     clw_path = "temp_f002.clw"
     self.assertTrue(os.path.isfile(fasta_path))
     expected = sorted(SeqIO.parse(fasta_path, "fasta"),
                       key=lambda rec: rec.id)
     # Clustal with a MUSCLE header goes to stdout; HTML and strict
     # Clustal go to their own files.
     cmdline = MuscleCommandline(
         muscle_exe,
         input=fasta_path,
         clw=True,
         htmlout=html_path,
         clwstrictout=clw_path,
     )
     command = str(cmdline)
     self.assertEqual(
         command.rstrip(),
         _escape_filename(muscle_exe) +
         " -in Fasta/f002 -clw -htmlout temp_f002.html" +
         " -clwstrictout temp_f002.clw",
     )
     # The repr has to evaluate back to an equivalent command line.
     self.assertEqual(str(eval(repr(cmdline))), command)
     proc = subprocess.Popen(
         command,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         universal_newlines=True,
         shell=(sys.platform != "win32"),
     )
     # The stdout stream carries the Clustal alignment.
     from_stdout = AlignIO.read(proc.stdout, "clustal")
     from_stdout.sort()
     # -quiet was not given, so progress reports are expected on stderr.
     self.assertTrue(proc.stderr.read().strip().startswith("MUSCLE"))
     exit_status = proc.wait()
     self.assertEqual(exit_status, 0)
     self.assertEqual(len(expected), len(from_stdout))
     for before, after in zip(expected, from_stdout):
         self.assertEqual(before.id, after.id)
     proc.stdout.close()
     proc.stderr.close()
     del proc
     # The HTML output must be a complete document.
     html_handle = open(html_path)
     html_text = html_handle.read().strip().upper()
     html_handle.close()
     self.assertTrue(html_text.startswith("<HTML"))
     self.assertTrue(html_text.endswith("</HTML>"))
     # The strict Clustal file must contain the same records.
     from_file = AlignIO.read(clw_path, "clustal")
     from_file.sort()
     self.assertEqual(len(expected), len(from_file))
     for before, after in zip(expected, from_file):
         self.assertEqual(before.id, after.id)
     os.remove(html_path)
     os.remove(clw_path)
Пример #16
0
    def find_sqce_consensus(list_of_sequences,
                            sqce_type=Constants.SEQUENCE_TYPE_DNA,
                            threshold=Constants.DEFAULT_SQCE_CONSENSUS_AMBIG_THRESHOLD,
                            fasta_end_name=''):
        """Return the gap-aware consensus of several sequences as a string.

        The sequences are written to a temporary FASTA file, aligned with
        MUSCLE, and the consensus is computed from that alignment.

        Args:
            list_of_sequences: iterable of sequence strings.
            sqce_type: Constants.SEQUENCE_TYPE_DNA or
                Constants.SEQUENCE_TYPE_PROT.
            threshold: passed to ``gap_consensus`` as its threshold.
            fasta_end_name: suffix inserted into the temporary file names.

        Raises:
            DenCellORFException: if ``sqce_type`` is neither DNA nor protein.
        """
        if sqce_type == Constants.SEQUENCE_TYPE_DNA:
            alphabet = generic_dna
            ambiguous = Constants.SEQUENCE_AMBIGUOUS_DNA_BASE

        elif sqce_type == Constants.SEQUENCE_TYPE_PROT:
            alphabet = generic_protein
            ambiguous = Constants.SEQUENCE_AMBIGUOUS_PROT_AA

        else:
            raise DenCellORFException(
                'MergeStrategy.find_sqce_consensus(): The type of sequence provided'
                + ' has to be ' + Constants.SEQUENCE_TYPE_DNA + ' or ' +
                Constants.SEQUENCE_TYPE_PROT + ' (provided type: ' +
                str(sqce_type) + ').')

        # Store the input sequences in a fasta file in order to run Muscle
        input_sequences = (SeqRecord(Seq(s, alphabet))
                           for s in list_of_sequences)

        if not os.path.exists(DefaultTemporaryFolder.TEMPORARY_FOLDER):
            os.makedirs(DefaultTemporaryFolder.TEMPORARY_FOLDER)

        input_sequences_file = os.path.join(
            DefaultTemporaryFolder.TEMPORARY_FOLDER,
            'input_sequences' + fasta_end_name + '.fasta')
        aligned_sequences_file = os.path.join(
            DefaultTemporaryFolder.TEMPORARY_FOLDER,
            'aligned_sequences' + fasta_end_name + '.fasta')

        try:
            SeqIO.write(input_sequences, input_sequences_file, 'fasta')

            # Perform the multiple sequences alignment and
            # store the output in a fasta file
            muscle_cline = MuscleCommandline(cmd='/bin/muscle',
                                             input=input_sequences_file,
                                             out=aligned_sequences_file)
            muscle_cline()

            # Read the fasta file containing aligned sequences
            align = AlignIO.read(aligned_sequences_file, 'fasta')
        finally:
            # Remove the temporary fasta files even when MUSCLE or the
            # parser fails, so failed runs do not accumulate files.
            for temp_path in (input_sequences_file, aligned_sequences_file):
                if os.path.exists(temp_path):
                    os.remove(temp_path)

        summary_align = AlignInfo.SummaryInfo(align)

        # Compute the consensus
        consensus = summary_align.gap_consensus(threshold=threshold,
                                                ambiguous=ambiguous)

        return str(consensus)
Пример #17
0
def muscle_alignment(seqs):
    """Align sequences with MUSCLE via temp.faa and return the alignment.

    The sequences are written to ``temp.faa``; MUSCLE writes ``temp.txt``,
    which is read back as FASTA.
    """
    fasta_in = 'temp.faa'
    SeqIO.write(seqs, fasta_in, "fasta")
    aligned_out = os.path.splitext(fasta_in)[0] + '.txt'
    from Bio.Align.Applications import MuscleCommandline
    run_muscle = MuscleCommandline(input=fasta_in, out=aligned_out)
    run_muscle()
    return AlignIO.read(aligned_out, 'fasta')
    def _perform_ma(self, data):
        """Run MUSCLE on ``data`` (fed through stdin) and return its stdout.

        Always passes maxiters=7; ``diags`` is added only when
        ``self.diags`` is exactly True, and ``maxhours`` only when
        ``self.maxhours`` is not None.
        """
        options = {'maxiters': 7}
        if self.diags is True:
            options['diags'] = True
        if self.maxhours is not None:
            options['maxhours'] = self.maxhours

        run_muscle = MuscleCommandline(**options)
        aligned_text, _messages = run_muscle(stdin=data)
        return aligned_text
def align_ks_domains(reference_alignment, ks_names, ks_seqs, data_dir):
    """Align query KS domain sequences to the reference KS alignment.

    The queries are first aligned among themselves with MUSCLE, then that
    alignment is profile-aligned against ``reference_alignment``. The result
    is passed through ``reformat`` into ``aligned.fasta`` in the current
    working directory.

    Args:
        reference_alignment: path to the reference KS alignment.
        ks_names: header lines for the queries, written verbatim.
        ks_seqs: the query sequences, parallel to ``ks_names``.
        data_dir: not used by this function.

    Returns:
        Path to the final alignment file.

    Calls ``sys.exit(1)`` if either MUSCLE run reports a failure.
    """
    # Set file names and write query domains to temp input file
    work_dir = os.getcwd()
    in_temp = os.path.join(work_dir, "in_seq.fasta")
    in_temp_aligned = os.path.join(work_dir, "in_seq_aligned.fasta")
    out_temp = os.path.join(work_dir, "out_seq.fasta")
    alignment_file = os.path.join(work_dir, "aligned.fasta")
    with open(in_temp, "w") as tmp_input:
        for name, seq in zip(ks_names, ks_seqs):
            tmp_input.write("%s\n%s\n" % (name, seq))

    # Generate alignment of query sequences.
    # NOTE(review): splitting the command on spaces breaks if any path
    # contains a space — confirm the working directory never does.
    muscle_cmd = str(MuscleCommandline(input=in_temp, out=in_temp_aligned))
    out, err, retcode = utils.execute(muscle_cmd.split(" "))
    # Any non-zero status is a failure, not just exit code 1.
    if retcode != 0:
        logging.error(
            "Alignment of query KS sequences with Muscle failed. Check if Muscle is installed appropriately."
        )
        sys.exit(1)

    # Align the query alignment to the reference alignment using muscle --profile
    muscle_cmd = str(
        MuscleCommandline(profile='True',
                          in1=reference_alignment,
                          in2=in_temp_aligned,
                          out=out_temp))
    out, err, retcode = utils.execute(muscle_cmd.split(" "))
    if retcode != 0:
        logging.error(
            "Alignment of query+reference KS sequences with Muscle failed. Check if Muscle is installed appropriately."
        )
        sys.exit(1)

    with open(out_temp, 'r') as handle:
        f_temp_input = handle.read()
    reformat(input=f_temp_input, out_filename=alignment_file)

    # Remove temporary files, including the intermediate query alignment
    # that was previously left behind.
    for f in [in_temp, in_temp_aligned, out_temp]:
        os.remove(f)

    return alignment_file
Пример #20
0
def muscle_align(input, output):
    """Align the file ``input`` with MUSCLE, writing the result to ``output``.

    This is best effort: if MUSCLE cannot be run or fails, a message naming
    the input file is printed and the function returns normally.
    """
    in_file = r'{0}'.format(input)
    out_file = r'{0}'.format(output)
    try:
        muscle_cline = MuscleCommandline(input=in_file, out=out_file)
        muscle_cline()
    except Exception:
        # Report the file actually being aligned. The old handler used
        # "query", a name not defined in this function, and its bare except
        # also swallowed KeyboardInterrupt and SystemExit.
        print('Imposible alinear el archivo ' + in_file + ':'
              '¿Ha comprobado sus valores de coverage e identity?')
Пример #21
0
def build_profile_hmm_for_repeats(repeats, error_rate):
    """Align ``repeats`` with MUSCLE and build a profile HMM from the result.

    Each repeat is named by its position in ``repeats``. The aligned rows
    (as strings) are handed to
    ``build_profile_hmm_pseudocounts_for_alignment`` together with
    ``error_rate``.
    """
    run_muscle = MuscleCommandline('muscle', clwstrict=True)
    fasta_entries = [
        '>%s\n' % str(position) + repeat
        for position, repeat in enumerate(repeats)
    ]
    fasta_text = '\n'.join(fasta_entries)
    clustal_text, _messages = run_muscle(stdin=fasta_text)
    aligned_rows = []
    for row in AlignIO.read(StringIO(clustal_text), "clustal"):
        aligned_rows.append(str(row.seq))

    return build_profile_hmm_pseudocounts_for_alignment(
        error_rate, aligned_rows)
Пример #22
0
    def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
        ''' takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
        note that this suppresses any compensated frameshift mutations

        Parameters:
        - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
        - prune: when True, sequences with an internal stop codon are dropped
        - verbose: when truthy, each dropped sequence is reported

        The work happens inside self.run_dir, which is created first and
        removed again afterwards. On success self.aln and
        self.sequence_lookup are set. An unsupported tool prints a message
        and returns without setting them.
        '''
        from Bio import AlignIO, SeqIO
        from Bio.SeqRecord import SeqRecord
        make_dir(self.run_dir)
        # Remember where we started instead of assuming run_dir is exactly
        # one level below the current directory.
        start_dir = os.getcwd()
        os.chdir(self.run_dir)
        try:
            # translate
            aa_seqs = {}
            bad_seq = 0
            for seq in self.seqs.values():
                tempseq = seq.seq.translate()
                # a stop anywhere but the last position is premature
                has_stop = '*' in str(tempseq)[:-1]
                # use only sequences that translate without trouble
                if not has_stop or prune == False:
                    aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
                    aa_seqs[seq.id].attributes = seq.attributes
                else:
                    if verbose:
                        print(seq.id, "has premature stops, discarding")
                bad_seq += has_stop

            print('Number of sequences with stops:', bad_seq, 'out of total',
                  len(self.seqs))
            tmpfname = 'temp_in.fasta'
            SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

            if alignment_tool == 'muscle':
                from Bio.Align.Applications import MuscleCommandline
                aligned_fname = tmpfname[:-5] + 'aligned.fasta'
                cline = MuscleCommandline(input=tmpfname, out=aligned_fname)
                cline()
                aln_aa = AlignIO.read(aligned_fname, "fasta")
            elif alignment_tool == 'mafft':
                from Bio.Align.Applications import MafftCommandline
                try:
                    from StringIO import StringIO  # Python 2
                except ImportError:
                    from io import StringIO  # Python 3
                mafft_cline = MafftCommandline(input=tmpfname)
                stdout, stderr = mafft_cline()
                aln_aa = AlignIO.read(StringIO(stdout), "fasta")
            else:
                print('Alignment tool not supported:', alignment_tool)
                return

            # generate nucleotide alignment
            self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
            self.sequence_lookup = {seq.id: seq for seq in self.aln}
            # add attributes to alignment
            for seq in self.seqs.values():
                if seq.id in self.sequence_lookup:
                    self.sequence_lookup[seq.id].attributes = seq.attributes
        finally:
            # Always restore the working directory and drop the scratch
            # directory, including on the early return above and on errors.
            os.chdir(start_dir)
            remove_dir(self.run_dir)
Пример #23
0
 def test_Muscle_simple(self):
     """Round-trip through MUSCLE with only an input and an output file."""
     cmdline = MuscleCommandline(muscle_exe,
                                 input=self.infile1,
                                 out=self.outfile1)
     # The output name contains spaces, so it is expected to be quoted.
     expected_command = (_escape_filename(muscle_exe)
                         + ' -in Fasta/f002 -out "Fasta/temp align out1.fa"')
     self.assertEqual(str(cmdline), expected_command)
     # The repr has to evaluate back to an equivalent command line.
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     stdout_text, stderr_text = cmdline()
     # The alignment goes to the output file, so stdout stays empty.
     self.assertEqual(stdout_text, "")
     self.assertTrue("ERROR" not in stderr_text)
Пример #24
0
def pool_write_microalignment(mblocknum,targetdata,extendedsourcedata,nbinitialsource,all_ids,msamethod):
    """Align the segments of one block and return them keyed by sequence id.

    Args:
        mblocknum: pair ``(i, mblock)``; ``i`` makes the temp file names
            unique, ``mblock`` maps a sequence id to a (start, end) pair.
        targetdata: iterable of ``(geneid, geneseq)`` pairs.
        extendedsourcedata: indexable collection of 4-item records whose
            first two items are the CDS id and the CDS sequence.
        nbinitialsource: how many leading entries of ``extendedsourcedata``
            to consider.
        all_ids: every id that must appear in the result.
        msamethod: "muscle" selects MUSCLE; anything else selects MAFFT.

    Returns:
        dict mapping each id to its aligned segment. Ids without a non-empty
        segment in this block map to a string of gaps as long as the
        alignment (empty when nothing was aligned).
    """
    i, mblock = mblocknum[0], mblocknum[1]
    input_muscle_file = "input_muscle.fasta"+str(i)
    output_muscle_file = "output_muscle.fasta"+str(i)

    def has_segment(seqid):
        # Only ids with a non-empty (start < end) interval take part.
        return seqid in mblock and mblock[seqid][1] > mblock[seqid][0]

    aln = {}
    try:
        nbseq = 0
        with open(input_muscle_file, "w") as input_muscle:
            for geneid, geneseq in targetdata:
                if has_segment(geneid):
                    start, end = mblock[geneid][0], mblock[geneid][1]
                    input_muscle.write(">"+geneid + "\n" + geneseq[start:end]+"\n")
                    nbseq += 1

            for j in range(nbinitialsource):
                cdsid, cdsseq, _cdsgeneid, _unused = extendedsourcedata[j]
                if has_segment(cdsid):
                    start, end = mblock[cdsid][0], mblock[cdsid][1]
                    input_muscle.write(">"+cdsid + "\n" + cdsseq[start:end]+"\n")
                    nbseq += 1

        msa = []
        if nbseq > 0:
            if msamethod == "muscle":
                muscle_cline = MuscleCommandline(input=input_muscle_file, out=output_muscle_file, gapopen=-800.0)
                muscle_cline()
            else:  # msamethod == "mafft"
                mafft_cline = MafftCommandline(input=input_muscle_file)
                stdout, _stderr = mafft_cline()
                with open(output_muscle_file, "w") as handle:
                    handle.write(stdout)
            msa = AlignIO.read(output_muscle_file, "fasta")

        length = 0
        for record in msa:
            aln[record.id] = record.seq
            length = len(record.seq)

        # Pad every id that is missing from the alignment with gaps.
        for seq_id in all_ids:
            if seq_id not in aln:
                aln[seq_id] = '-'*length
    finally:
        # Clean up the scratch files even when the aligner or parser fails.
        for temp_path in (input_muscle_file, output_muscle_file):
            if os.path.exists(temp_path):
                os.remove(temp_path)
    return aln
Пример #25
0
 def align(self):
     """Align ``self.pair_pep_file`` with the configured aligner.

     'mafft' captures MAFFT's stdout and rewrites it to
     ``self.prot_align_file`` as FASTA; 'muscle' has MUSCLE write
     ``self.prot_align_file`` itself with the clwstrict switch set. Any
     other value of ``self.align_software`` does nothing.
     """
     tool = self.align_software
     if tool == 'mafft':
         run_mafft = MafftCommandline(
             cmd=self.mafft_path, input=self.pair_pep_file, auto=True)
         aligned_text, _messages = run_mafft()
         parsed = AlignIO.read(StringIO(aligned_text), "fasta")
         AlignIO.write(parsed, self.prot_align_file, "fasta")
     elif tool == 'muscle':
         run_muscle = MuscleCommandline(
             cmd=self.muscle_path,
             input=self.pair_pep_file,
             out=self.prot_align_file,
             seqtype="protein",
             clwstrict=True)
         run_muscle()
Пример #26
0
def multialign_genomic_templates(fastafile):
	"""Uses MUSCLE to return the multialigned genomic data.

	``fastafile`` is the input FASTA path; the alignment is parsed from
	MUSCLE's standard output.
	"""
	from Bio.Align.Applications import MuscleCommandline
	# The StringIO module only exists on Python 2; fall back to io.
	try:
		from StringIO import StringIO
	except ImportError:
		from io import StringIO
	from Bio import AlignIO

	muscle_cline = MuscleCommandline(input=fastafile)
	stdout, _stderr = muscle_cline()
	return AlignIO.read(StringIO(stdout), "fasta")
Пример #27
0
def runMuscle(filePath):
    """Align ``filePath`` with MUSCLE and return the Clustal alignment.

    The alignment is written next to the input, with the extension
    replaced by ``.muscle.aln``.
    """
    stem, _extension = os.path.splitext(filePath)
    alnFilePath = stem + ".muscle.aln"
    print("[INFO] Running muscle on {}".format(filePath))

    run_muscle = MuscleCommandline(input=filePath, out=alnFilePath, clw=True)
    run_muscle()

    print("[INFO] Creating alignment from {}".format(alnFilePath))
    return AlignIO.read(alnFilePath, "clustal")
Пример #28
0
def muscleAlignment(seqs, muscle_exe="muscle"):
    '''
    Align sequences with MUSCLE entirely in memory.

    ``seqs`` is a list of records that SeqIO can write as FASTA; the
    aligned records are returned as a list, parsed from MUSCLE's stdout.
    '''
    fasta_buffer = io.StringIO()
    SeqIO.write(seqs, fasta_buffer, 'fasta')
    fasta_text = fasta_buffer.getvalue()
    run_muscle = MuscleCommandline(muscle_exe)
    aligned_text, _messages = run_muscle(stdin=fasta_text)
    aligned_records = SeqIO.parse(io.StringIO(aligned_text), 'fasta')
    return list(aligned_records)
Пример #29
0
def muscleAlign(seq_records):
    """Pipe ``seq_records`` through MUSCLE and return the parsed alignment.

    The records are written to the child's stdin as FASTA and the alignment
    is read back from its stdout; stderr is discarded.
    """
    muscle_cmd_line = MuscleCommandline()

    # Used as a context manager, Popen closes the pipes and waits for the
    # child on exit, so no zombie process or open descriptor is left behind.
    with subprocess.Popen(str(muscle_cmd_line),
                          stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.DEVNULL,
                          universal_newlines=True) as child_process:
        SeqIO.write(seq_records, child_process.stdin, 'fasta')
        # Close stdin so MUSCLE sees end-of-input and starts aligning.
        child_process.stdin.close()

        return AlignIO.read(child_process.stdout, 'fasta')
Пример #30
0
def muscleAlign(a, b):
    """Concatenate ``a``.fasta and ``b``.fasta and align them with MUSCLE.

    The merged input is written to ``<a>-<b>.fasta`` and the alignment to
    ``<a>-<b>_aligned.fasta``. The command line is printed before it runs.
    """
    merged_path = a + "-" + b + ".fasta"
    aligned_path = a + '-' + b + '_aligned.fasta'
    with open(merged_path, 'w') as outfile:
        for fname in (a + ".fasta", b + ".fasta"):
            last_line = ""
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)
                    last_line = line
            # A file without a trailing newline would otherwise glue its
            # last sequence line to the next file's header.
            if last_line and not last_line.endswith("\n"):
                outfile.write("\n")

    cline = MuscleCommandline(input=merged_path, out=aligned_path)
    # Parenthesised single-argument print works on Python 2 and 3.
    print(cline)
    os.system(str(cline))
Пример #31
0
def generated_paired_alignment():
    """Align the hard-coded interacting-pairs FASTA with MUSCLE and print it.

    The alignment is read from MUSCLE's stdout and parsed as FASTA.
    """
    # NOTE(review): the message says ClustalW, but no Clustal switch is set
    # and the output below is parsed as FASTA — confirm the intended format.
    print("Generating paired alignments in ClustalW format using MUSCLE")
    from Bio.Align.Applications import MuscleCommandline
    muscle_cline = MuscleCommandline(
        input=
        "F:\\KINEV\\fasta_files\\1FFW_A_1FFW_B_P0AE67_P07363.sffamily_interactingpairs.fasta"
    )
    stdout, _stderr = muscle_cline()
    # The StringIO module only exists on Python 2; fall back to io.
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO
    from Bio import AlignIO
    align = AlignIO.read(StringIO(stdout), "fasta")
    print(align)
Пример #32
0
def quickAlign( refseq, testseq, maxiters=None, diags=None, gapopen=None ):
	"""Align two sequences with MUSCLE and return them keyed by name.

	Both inputs are converted with ``str()`` and stripped of existing gap
	characters. ``maxiters``, ``diags`` and ``gapopen`` are applied to the
	MUSCLE command only when not None.

	Returns a dict mapping "ref" and "test" to the aligned sequences.
	"""
	# sanity check: start from ungapped strings
	refseq = re.sub( "-", "", str(refseq) )
	testseq = re.sub( "-", "", str(testseq) )

	data = ">ref\n%s\n>test\n%s\n" % (refseq, testseq)

	muscle_cline = MuscleCommandline(cmd=muscle, quiet=True)
	if maxiters is not None:
		muscle_cline.maxiters = maxiters
	if diags is not None:
		# Was "diag", an undefined name that raised NameError.
		muscle_cline.diags = diags
	if gapopen is not None:
		muscle_cline.gapopen = gapopen

	stdout, stderr = muscle_cline(stdin=data)

	aligned = dict()
	for p in SeqIO.parse(StringIO(stdout), "fasta"):
		aligned[ p.id ] = str(p.seq)
	return aligned
Пример #33
0
def align_muscle(infile_name, outfile_name, log_file):
    """Make external call to Muscle aligner.

    Args:
        infile_name: input sequence file.
        outfile_name: file MUSCLE writes the Clustal alignment to.
        log_file: log file passed to MUSCLE's ``loga`` option.

    Returns:
        dict with the child's captured ``'output'`` (stdout) and ``'error'``
        (stderr).
    """
    cline = MuscleCommandline(input=infile_name,
                              out=outfile_name,
                              clw=True,
                              loga=log_file,
                              quiet='y')
    # stderr has to be piped as well; otherwise communicate() returns None
    # for it and the 'error' entry of the report is always empty.
    child = subprocess.Popen(str(cline),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=True)
    output, error = child.communicate()
    # TODO: should set up something to parse MUSCLE errors
    return {'output': output, 'error': error}
Пример #34
0
def allign_fasta(filename = "filename",
                 extension_in = ".fasta",
                 extension_out = ".aln"):
    """
        This function requires MUSCLE from http://www.drive5.com/muscle. The
            main objective - read a FASTA file with multiple records, align
            them and save the alignment to "filename" + extension_out.
        @param filename: Base name (without extension) of the FASTA file to align.
        @param extension_in: FASTA file extension, could be .fa or similar.
        @param extension_out: Extension of the alignment file, e.g. ".aln".
        @return: False when filename is None or the input file does not exist;
            True once the MUSCLE command has been handed to os.system (its
            exit status is not checked).
    """
    if filename is None:
        return False

    input_path = filename + extension_in
    if not os.path.exists(input_path):
        return False

    # Imported only after the cheap guard checks, so invalid input is
    # rejected without loading Biopython.
    from Bio.Align.Applications import MuscleCommandline

    cline = MuscleCommandline(input=input_path, out=filename + extension_out)
    os.system(str(cline))
    return True
Пример #35
0
 def test_Muscle_profile_simple(self):
     """Profile-align two input files and check the command line round-trips."""
     cmdline = MuscleCommandline(muscle_exe)
     for name, value in (("out", self.outfile3),
                         ("profile", True),
                         ("in1", self.infile2),
                         ("in2", self.infile3)):
         cmdline.set_parameter(name, value)
     expected = (_escape_filename(muscle_exe)
                 + " -out Fasta/temp_align_out3.fa"
                 + " -profile -in1 Fasta/fa01 -in2 Fasta/f001")
     self.assertEqual(str(cmdline), expected)
     # repr() has to rebuild an object that renders the same command string
     rebuilt = eval(repr(cmdline))
     self.assertEqual(str(rebuilt), str(cmdline))
     stdout_text, stderr_text = cmdline()
     # The alignment goes to the "out" file, so stdout is expected to be empty
     self.assertEqual(stdout_text, "")
     self.assertTrue("ERROR" not in stderr_text)
     self.assertTrue(stderr_text.strip().startswith("MUSCLE"), stdout_text)
Пример #36
0
 def test_Muscle_profile_simple(self):
     """Simple round-trip through app doing a profile alignment."""
     cmdline = MuscleCommandline(muscle_exe)
     cmdline.set_parameter("out", self.outfile3)
     cmdline.set_parameter("profile", True)
     cmdline.set_parameter("in1", self.infile2)
     cmdline.set_parameter("in2", self.infile3)
     self.assertEqual(str(cmdline), muscle_exe +
                      " -out Fasta/temp_align_out3.fa" +
                      " -profile -in1 Fasta/fa01 -in2 Fasta/f001")
     # repr() has to rebuild an object that renders the same command string
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     # The first item returned carries return_code and the command line (_cl);
     # it is a result object, not a stdin handle.
     result, out_handle, err_handle = generic_run(cmdline)
     self.assertEqual(result.return_code, 0)
     self.assertEqual(out_handle.read(), "")
     # assertTrue replaces the deprecated assert_ alias
     self.assertTrue("ERROR" not in err_handle.read())
     self.assertEqual(str(result._cl), str(cmdline))
Пример #37
0
 def test_Muscle_with_options(self):
     """Run MUSCLE with a valued option and a switch set through properties."""
     cmdline = MuscleCommandline(muscle_exe)
     # "input" is accepted as an alias for "in"
     cmdline.set_parameter("input", self.infile1)
     cmdline.set_parameter("out", self.outfile2)
     # These two go through the property interface instead of set_parameter
     cmdline.noanchors = True
     cmdline.objscore = "sp"
     expected = (_escape_filename(muscle_exe)
                 + " -in Fasta/f002"
                 + " -out Fasta/temp_align_out2.fa"
                 + " -objscore sp -noanchors")
     self.assertEqual(str(cmdline), expected)
     # repr() has to rebuild an object that renders the same command string
     rebuilt = eval(repr(cmdline))
     self.assertEqual(str(rebuilt), str(cmdline))
     stdout_text, stderr_text = cmdline()
     self.assertEqual(stdout_text, "")
     self.assertTrue("ERROR" not in stderr_text)
     self.assertTrue(stderr_text.strip().startswith("MUSCLE"), stdout_text)
Пример #38
0
 def test_Muscle_with_options(self):
     """Round-trip through app with a switch and valued option."""
     cmdline = MuscleCommandline(muscle_exe)
     cmdline.set_parameter("input", self.infile1)  # "input" is alias for "in"
     cmdline.set_parameter("out", self.outfile2)
     # Use property:
     cmdline.objscore = "sp"
     cmdline.noanchors = True
     self.assertEqual(str(cmdline), muscle_exe +
                      " -in Fasta/f002" +
                      " -out Fasta/temp_align_out2.fa" +
                      " -objscore sp -noanchors")
     # repr() has to rebuild an object that renders the same command string
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     # The first item returned carries return_code and the command line (_cl);
     # it is a result object, not a stdin handle.
     result, out_handle, err_handle = generic_run(cmdline)
     self.assertEqual(result.return_code, 0)
     self.assertEqual(out_handle.read(), "")
     # assertTrue replaces the deprecated assert_ alias
     self.assertTrue("ERROR" not in err_handle.read())
     self.assertEqual(str(result._cl), str(cmdline))
Пример #39
0
 def test_Muscle_profile_simple(self):
     """Simple round-trip through app doing a profile alignment"""
     cmdline = MuscleCommandline(muscle_exe)
     cmdline.set_parameter("out", self.outfile3)
     cmdline.set_parameter("profile", True)
     cmdline.set_parameter("in1", self.infile2)
     cmdline.set_parameter("in2", self.infile3)
     self.assertEqual(str(cmdline), muscle_exe +
                      " -out Fasta/temp_align_out3.fa" +
                      " -profile -in1 Fasta/fa01 -in2 Fasta/f001")
     # repr() has to rebuild an object that renders the same command string
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     # universal_newlines=True makes the pipes return text, so the
     # comparisons against str literals below hold on Python 3 as well.
     child = subprocess.Popen(str(cmdline),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True,
                              shell=(sys.platform!="win32"))
     output, error = child.communicate()
     self.assertEqual(child.returncode, 0)
     self.assertEqual(output, "")
     # assertTrue replaces the deprecated assert_ alias
     self.assertTrue("ERROR" not in error)
     del child
Пример #40
0
 def test_Muscle_with_options(self):
     """Round-trip through app with a switch and valued option"""
     cmdline = MuscleCommandline(muscle_exe)
     cmdline.set_parameter("input", self.infile1)  # "input" is alias for "in"
     cmdline.set_parameter("out", self.outfile2)
     # Use property:
     cmdline.objscore = "sp"
     cmdline.noanchors = True
     self.assertEqual(str(cmdline), muscle_exe +
                      " -in Fasta/f002" +
                      " -out Fasta/temp_align_out2.fa" +
                      " -objscore sp -noanchors")
     # repr() has to rebuild an object that renders the same command string
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     # universal_newlines=True makes the pipes return text, so the
     # comparisons against str literals below hold on Python 3 as well.
     child = subprocess.Popen(str(cmdline),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True,
                              shell=(sys.platform!="win32"))
     output, error = child.communicate()
     self.assertEqual(child.returncode, 0)
     self.assertEqual(output, "")
     # assertTrue replaces the deprecated assert_ alias
     self.assertTrue("ERROR" not in error)
     del child
Пример #41
0
 def test_simple_clustal_strict(self):
     """Simple muscle call using strict Clustal output."""
     input_file = "Fasta/f002"
     self.assertTrue(os.path.isfile(input_file))
     # Close the input handle explicitly instead of leaving it to the
     # garbage collector.
     in_handle = open(input_file)
     try:
         records = list(SeqIO.parse(in_handle, "fasta"))
     finally:
         in_handle.close()
     # Prepare the command...
     cmdline = MuscleCommandline(muscle_exe)
     cmdline.set_parameter("in", input_file)
     # Preserve input record order (makes checking output easier)
     cmdline.set_parameter("stable", True)  # Default None treated as False!
     # Use clustal output (with a CLUSTAL header)
     cmdline.set_parameter("clwstrict", True)  # Default None treated as False!
     self.assertEqual(str(cmdline).rstrip(), muscle_exe +
                      " -in Fasta/f002 -clwstrict -stable")
     # repr() has to rebuild an object that renders the same command string
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     result, out_handle, err_handle = generic_run(cmdline)
     align = AlignIO.read(out_handle, "clustal")
     self.assertEqual(len(records),len(align))
     # Records are compared pairwise in input order, ignoring gap characters
     for old, new in zip(records, align):
         self.assertEqual(old.id, new.id)
         self.assertEqual(str(new.seq).replace("-",""), str(old.seq))
     # Didn't use -quiet so there should be progress reports on stderr,
     self.assertTrue(err_handle.read().strip().startswith("MUSCLE"))
Пример #42
0
 def test_long(self):
     """Simple muscle call using long file."""
     # Create a large input file by converting some of another example file
     temp_large_fasta_file = "temp_cw_prot.fasta"
     pir_handle = open("NBRF/Cw_prot.pir", "rU")
     try:
         records = list(SeqIO.parse(pir_handle, "pir"))[:40]
     finally:
         pir_handle.close()
     handle = open(temp_large_fasta_file, "w")
     try:
         SeqIO.write(records, handle, "fasta")
     finally:
         handle.close()
     # From here on the temporary file exists; the finally clause removes it
     # even when one of the assertions fails.
     try:
         # Prepare the command...
         cmdline = MuscleCommandline(muscle_exe)
         cmdline.set_parameter("in", temp_large_fasta_file)
         # Preserve input record order
         cmdline.set_parameter("stable", True)  # Default None treated as False!
         # Use fast options
         cmdline.set_parameter("maxiters", 1)
         cmdline.set_parameter("diags", True)  # Default None treated as False!
         # Use clustal output
         cmdline.set_parameter("clwstrict", True)  # Default None treated as False!
         # Shouldn't need this, but just to make sure it is accepted
         cmdline.set_parameter("maxhours", 0.1)
         # No progress reports to stderr
         cmdline.set_parameter("quiet", True)  # Default None treated as False!
         # TODO - Fix the trailing space!
         self.assertEqual(str(cmdline).rstrip(), muscle_exe +
                          " -in temp_cw_prot.fasta -diags -maxhours 0.1" +
                          " -maxiters 1 -clwstrict -stable -quiet")
         # repr() has to rebuild an object that renders the same command string
         self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
         result, out_handle, err_handle = generic_run(cmdline)
         align = AlignIO.read(out_handle, "clustal")
         self.assertEqual(len(records), len(align))
         for old, new in zip(records, align):
             self.assertEqual(old.id, new.id)
             self.assertEqual(str(new.seq).replace("-",""), str(old.seq))
         # See if quiet worked:
         self.assertEqual("", err_handle.read().strip())
     finally:
         os.remove(temp_large_fasta_file)
Пример #43
0
 def test_long(self):
     """Simple muscle call using long file"""
     # Create a large input file by converting some of another example file
     temp_large_fasta_file = "temp_cw_prot.fasta"
     records = list(SeqIO.parse("NBRF/Cw_prot.pir", "pir"))[:40]
     SeqIO.write(records, temp_large_fasta_file, "fasta")
     # From here on the temporary file exists; the finally clause removes it
     # even when one of the assertions fails.
     try:
         # Prepare the command...
         cmdline = MuscleCommandline(muscle_exe)
         cmdline.set_parameter("in", temp_large_fasta_file)
         # Use fast options
         cmdline.set_parameter("maxiters", 1)
         cmdline.set_parameter("diags", True)  # Default None treated as False!
         # Use clustal output
         cmdline.set_parameter("clwstrict", True)  # Default None treated as False!
         # Shouldn't need this, but just to make sure it is accepted
         cmdline.set_parameter("maxhours", 0.1)
         # No progress reports to stderr
         cmdline.set_parameter("quiet", True)  # Default None treated as False!
         self.assertEqual(str(cmdline).rstrip(), _escape_filename(muscle_exe) +
                          " -in temp_cw_prot.fasta -diags -maxhours 0.1" +
                          " -maxiters 1 -clwstrict -quiet")
         # repr() has to rebuild an object that renders the same command string
         self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
         child = subprocess.Popen(str(cmdline),
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  universal_newlines=True,
                                  shell=(sys.platform!="win32"))
         align = AlignIO.read(child.stdout, "clustal")
         # Both sides are sorted before the pairwise comparison below
         align.sort()
         records.sort(key = lambda rec: rec.id)
         self.assertEqual(len(records), len(align))
         for old, new in zip(records, align):
             self.assertEqual(old.id, new.id)
             self.assertEqual(str(new.seq).replace("-",""), str(old.seq))
         # See if quiet worked:
         self.assertEqual("", child.stderr.read().strip())
         return_code = child.wait()
         self.assertEqual(return_code, 0)
         child.stdout.close()
         child.stderr.close()
         del child
     finally:
         os.remove(temp_large_fasta_file)
Пример #44
0
	def run(self):
		"""Run MUSCLE on self.fasta, writing to self.output, with tree1 set to self.tree."""
		aligner = MuscleCommandline( input=self.fasta, out=self.output )
		# one iteration only, clustering by neighbor joining
		aligner.maxiters = 1
		aligner.cluster1 = "neighborjoining"
		aligner.tree1    = self.tree
		# the returned output is bound to a throwaway name and never used
		_discarded = aligner()
Пример #45
0
 def GetExec(self, optList, frame):
     """Build the MUSCLE command line string from the GUI state held by *frame*.

     optList -- accepted for the plugin interface; not used in this method.
     frame   -- GUI frame; its paramBoxes mapping, abet and options
                attributes are read here, and it is stored on self.frame.

     Also sets self.outfile and self.outtype. Returns the command line as
     a string; nothing is executed here.
     """
     # Respond to the "muscle" command.
     self.frame = frame
     plugin_exe = r"C:/Program Files (x86)/py27/Lib/site-packages/Muscle.exe"
     self.outfile=r".\plugins\muscle.txt"
     self.outtype="fasta"
     seq_path = r"C:\Users\francis\Documents\Monguis\BioGui\plugins\my_seq.fasta"
     boxes = self.frame.paramBoxes
     cline = MuscleCommandline(plugin_exe,out=self.outfile)
     if '1ProfileCheck' in boxes:
         if boxes['1ProfileCheck'].GetValue():
             # Profile mode uses the same sequence file for both inputs
             cline.profile = True
             cline.in1 = seq_path
             cline.in2 = seq_path
         else:
             cline.input = seq_path
     if '1DiagCheck' in boxes:
         if boxes['1DiagCheck'].GetValue():
             cline.diags=True
             # Each spin box feeds its own option. Previously all three
             # overwrote diaglength and passed the widget itself to int()
             # instead of its value.
             if "DiagLenSpin" in boxes:
                 cline.diaglength=int(boxes["DiagLenSpin"].GetValue())
             if "DiagMargSpin" in boxes:
                 cline.diagmargin=int(boxes["DiagMargSpin"].GetValue())
             if "DiagBreakSpin" in boxes:
                 cline.diagbreak=int(boxes["DiagBreakSpin"].GetValue())
         elif "GapPenSpin" in boxes:
             cline.gapopen=float(boxes["GapPenSpin"].GetValue())
         else:
             cline.input=seq_path
     # Map the frame's alphabet onto the seqtype option
     if self.frame.abet=="AA":
         cline.seqtype="protein"
     elif self.frame.abet=="DNA" or self.frame.abet=="RNA":
         cline.seqtype="nucleo"
     else:
         cline.seqtype="auto"

     # Advanced options are read from fixed positions in self.boxList
     if self.frame.options:
         cline.objscore=str(self.boxList[9].GetValue())
         cline.weight1=str(self.boxList[13].GetValue())
         cline.weight2=str(self.boxList[15].GetValue())
         cline.anchorspacing=int(self.boxList[17].GetValue())
         cline.center=float(self.boxList[19].GetValue())
         cline.hydro=int(self.boxList[21].GetValue())
         cline.hydrofactor=float(self.boxList[23].GetValue())
         cline.maxhours=float(self.boxList[25].GetValue())
         cline.maxiters=int(self.boxList[27].GetValue())
         cline.maxtrees=int(self.boxList[29].GetValue())
         cline.minbestcolscore=float(self.boxList[31].GetValue())
         cline.minsmoothscore=float(self.boxList[33].GetValue())
         cline.smoothscoreceil=float(self.boxList[35].GetValue())
         cline.smoothwindow=int(self.boxList[37].GetValue())
         cline.sueff=float(self.boxList[39].GetValue())

     return str(cline)
Пример #46
0
def main():
    """Build a DNAML tree (with ancestor inference) for the project sequences.

    Steps, all driven by module-level settings (prj_tree, prj_name, force,
    doAlign, germ_seq, natives, DNAML, inFile):
      1. Refuse to run over leftover DNAML files unless `force` is set, in
         which case they are deleted.
      2. If `doAlign`, copy the collected sequences, append the germline and
         native sequences, align with MUSCLE and point `inFile` at the result.
      3. Read the alignment, replace every id with a 10-digit number
         (originals kept in the global `lookup`) and write it as PHYLIP
         to "infile".
      4. Write the DNAML keystroke script, run DNAML in the phylo directory
         and map the numeric ids back in "outtree" and "outfile" via
         revertName.
      5. Remove DNAML's working files.
    """

    global inFile, lookup

    oldFiles = (
        glob.glob("%s/infile" % prj_tree.phylo)
        + glob.glob("%s/outtree" % prj_tree.phylo)
        + glob.glob("%s/outfile" % prj_tree.phylo)
    )
    if oldFiles and not force:
        sys.exit("Old files exist! Please use the -f flag to force overwrite.")
    for f in oldFiles:
        os.remove(f)

    if doAlign:

        to_align = "%s/%s_to_align.fa" % (prj_tree.phylo, prj_name)
        aligned = "%s/%s_aligned.afa" % (prj_tree.phylo, prj_name)

        # first create a working file to align and add the germline and natives
        shutil.copyfile("%s/%s-collected.fa" % (prj_tree.nt, prj_name), to_align)
        with open(to_align, "a") as handle:
            handle.write(">%s\n%s\n" % (germ_seq.id, germ_seq.seq))
            for n in natives.values():
                handle.write(">%s\n%s\n" % (n.id, n.seq))

        # now run muscle
        run_muscle = MuscleCommandline(input=to_align, out=aligned)
        run_muscle.maxiters = 2
        run_muscle.diags = True
        run_muscle.gapopen = -5000.0  # code requires a float
        # single-argument print() behaves the same on Python 2 and 3
        print(run_muscle)
        run_muscle()

        # change inFile variable so that remaining code is the same for both cases
        # It's probably really bad form to handle this in this way
        inFile = aligned

    # open the alignment to rename everything and find germline sequence
    # rename is to avoid possible errors with DNAML from sequence ids that are too long
    germ_pos = 1
    # NOTE(review): the "rU" mode used below was removed in Python 3.11; it is
    # kept here so behaviour on Python 2 is unchanged.
    with open(inFile, "rU") as handle:
        if doAlign:
            aln = AlignIO.read(handle, "fasta")
        else:
            try:
                aln = AlignIO.read(handle, "phylip")
            except Exception:
                # narrowed from a bare except so Ctrl-C is not reported as a format error
                sys.exit("Please make sure custom input is aligned and in PHYLIP format")

    lookup = []
    for seq in aln:
        lookup.append(seq.id)
        # the last id matching a germline-style gene name marks the outgroup position
        if re.search("(IG|VH|VK|VL|HV|KV|LV)", seq.id) is not None:
            germ_pos = len(lookup)
        seq.id = "%010d" % len(lookup)

    with open("%s/infile" % prj_tree.phylo, "w") as output:
        AlignIO.write(aln, output, "phylip")

    # now generate script for DNAML
    # J is "jumble" followed by random seed and number of times to repeat
    # O is outgroup root, followed by position of the germline in the alignment
    # 5 tells DNAML to do the ancestor inference
    # Y starts the run
    with open("%s/dnaml.in" % prj_tree.phylo, "w") as handle:
        # integer bound: random.randint rejects non-integer arguments on recent Pythons
        seed = random.randint(0, 10 ** 10) * 2 + 1  # seed must be odd
        handle.write("J\n%d\n3\nO\n%d\n5\nY\n" % (seed, germ_pos))

    # change to work directory so DNAML finds "infile" and puts the output where we expect
    # NOTE(review): the prj_tree-based paths used after this chdir presumably
    # are absolute; with relative paths they would no longer resolve - confirm.
    os.chdir(prj_tree.phylo)
    with open("%s/dnaml.in" % prj_tree.phylo, "rU") as pipe:
        subprocess.call([DNAML], stdin=pipe)

    # revert names in tree
    with open("%s/outtree" % prj_tree.phylo, "rU") as intree:
        mytree = intree.read()
    fixedtree = re.sub(r"\d{10}", revertName, mytree)
    with open("%s/%s.tree" % (prj_tree.out, prj_name), "w") as outtree:
        outtree.write(fixedtree)

    # revert names in out file
    with open("%s/outfile" % prj_tree.phylo, "rU") as instuff:
        mystuff = instuff.read()
    fixedstuff = re.sub(r"\d{10}", revertName, mystuff)
    with open("%s/%s.dnaml.out" % (prj_tree.logs, prj_name), "w") as outstuff:
        outstuff.write(fixedstuff)

    # clean up (relative names: the working directory is now prj_tree.phylo)
    os.remove("infile")
    os.remove("outfile")
    os.remove("outtree")
Пример #47
0
def buildGSSP( vgene ):
	"""Build substitution profile rows for one V gene.

	For each requested profile, the germline sequence(s) of `vgene` plus either
	all of masterList[vgene] (when --profiles is 0) or a random subset of
	--numSequences of them are aligned with MUSCLE, columns that are gaps in a
	germline row are removed, and a position-specific score matrix is built.

	Reads the module-level arguments, masterList, germList, mask, aa_list,
	prj_tree and muscle.

	Returns a list of rows, one per profile and position:
	[ vgene, profile number, position, germline residue(s), overall value or
	"None", then one frequency string per residue in aa_list ]. Returns []
	when the gene has too few sequences or is missing from germList.
	"""

	results = []

	if len(masterList[vgene]) < arguments["--numSequences"]:
		print( "Skipping %s, not enough sequences (%d)..." % ( vgene, len(masterList[vgene]) ) )
		return []

	if vgene not in germList:
		print( "Skipping %s, it's not in the germline database..." %vgene )
		return []

	# Take random overlapping subsets to generate multiple profiles
	#  need to add back a sanity check for capping the number of subsets if there's not enough raw data.
	numProfiles = arguments['--profiles']
	if arguments["--profiles"] == 0:
		numProfiles = 1

	success = 0

	# germline ids are loop-invariant, so build the lookup once instead of per record
	germIDs = set( g.id for g in germList[vgene] )

	for i in range(numProfiles):
		seqs = [] + germList[vgene] #force a copy rather than an alias
		if arguments["--profiles"] == 0:
			seqs += list(masterList[vgene])
		else:
			#get our sequence subset, add the germlines, and write them
			#   to a temporary file for alignment
			seqs += list(numpy.random.choice(masterList[vgene], size=arguments["--numSequences"], replace=False))

		tempFile = "%s/work/mGSSP/%s_profileBuilder" % (prj_tree.home, vgene)
		with open("%s.fa"%tempFile, "w") as temp:
			SeqIO.write(seqs,temp,"fasta")

		muscle_cline = MuscleCommandline(cmd=muscle, input="%s.fa"%tempFile, out="%s.aln"%tempFile)

		#try to speed up the process a little bit for large datasets
		#still going to max out at ~50k seqs per profile (probably)
		muscle_cline.maxiters	= 2
		muscle_cline.diags	= True

		try:
			stdout, stderr = muscle_cline()
		except Exception:
			#narrowed from a bare except so KeyboardInterrupt/SystemExit still stop the run
			print( "Error in alignment #%d for %s (skipping)" % (i+1, vgene) )
			for f in glob.glob("%s.*"%tempFile):
				os.remove(f)
			continue

		alignment = AlignIO.read("%s.aln"%tempFile, "fasta")#"clustal")
		success += 1

		#Input order is not maintained, so we need a little
		#   kludge to find a germline sequences. Use the
		#   first one to remove any insertions from the alignment
		germRow = 0
		for n, rec in enumerate(alignment):
			if rec.id in germIDs:
				germRow = n
				break

		#look for gaps one at a time so we don't get tripped up by shifting indices
		gap = re.search( "-+", str(alignment[germRow].seq) )
		while (gap):
			alignment = alignment[:, 0:gap.start()] + alignment[:, gap.end():]
			gap = re.search( "-+", str(alignment[germRow].seq) )

		#Now we get BioPython to make a PSSM for us. To convert that into
		#    a mutability profile, we will delete the germline residue[s]
		#    at each position (but save what they were)
		germRes = defaultdict(Counter)
		summary_align = AlignInfo.SummaryInfo(alignment)
		pssm = summary_align.pos_specific_score_matrix(chars_to_ignore=['-','X'])

		#get number of datapoints at each position (might be different than the number of sequences in the profile if there are gaps or missing data
		# do this by using sum(pos.values()) after ignoring missing data (previous line) but before dumping germline residues.
		denominator = [ sum(pos.values()) - len(germList[vgene]) for pos in pssm ]

		for germ in germList[vgene]:
			for pos, residue in enumerate(germ):
				if residue == "X":
					continue
				germRes[pos][residue] += 1
				pssm[pos][residue] = 0

		#normalize and save
		for p, pos in enumerate(pssm):
			germAA = ",".join([ x[0] for x in germRes[p].most_common() ])
			#what is left in the column once the germline residues were zeroed out
			total = sum(pos.values())
			if p < mask[vgene] or denominator[p] < arguments["--numSequences"]:
				overall = "None"
			else:
				overall = "%.5f" % (total/denominator[p])
			freqs = [ "%.5f"%(pos.get(r,0)/total) if total > 0 else "0.00" for r in aa_list ]
			results.append( [ vgene, i+1, p+1, germAA, overall ] + freqs )

		#clean up
		for f in glob.glob("%s.*"%tempFile):
			os.remove(f)

	print( "Successfully built %d/%d profiles for %s using %d sequences!" % ( success, numProfiles, vgene, len(seqs)-len(germList[vgene]) ) )
	return results