Пример #1
0
def align_sequences(input_file: str, output_file: str = "alignment.fasta") -> MultipleSeqAlignment:
    """
    Run MUSCLE on a FASTA file and return the parsed alignment.

    The assembled command line is printed before it is executed, and the
    aligned FASTA text that MUSCLE writes to stdout is also saved to disk.

    :param input_file: path of the FASTA file holding the unaligned sequences
    :param output_file: path the aligned FASTA text is written to
    :return: MultipleSeqAlignment parsed from the MUSCLE output
    """
    # The same job can be run through the EBI web service:
    # https://www.ebi.ac.uk/Tools/services/web/toolresult.ebi?jobId=muscle-I20200329-210908-0063-87869209-p2m
    # where the server-side invocation was:
    #   /nfs/public/ro/es/appbin/linux-x86_64/muscle-3.8.31/muscle -in muscle-I20200329-210908-0063-87869209-p2m.upfile -verbose -log muscle-I20200329-210908-0063-87869209-p2m.output -quiet -fasta -out muscle-I20200329-210908-0063-87869209-p2m.fasta -tree2 muscle-I20200329-210908-0063-87869209-p2m.dnd

    # The MUSCLE binary is expected one directory above the working directory.
    aligner = MuscleCommandline(
        os.path.join('..', 'muscle3.8.31_i86linux64'),
        input=input_file,
    )

    # Stop after 2 iterations: with distant sequences, attempting a finer
    # alignment leads to an error.
    aligner.maxiters = 2

    # Show the exact command that is about to run.
    print(aligner)

    # Without an "out" option MUSCLE emits the alignment on stdout.
    aligned_fasta, _ = aligner()

    # Keep a copy on disk for faster later processing or testing.
    with open(output_file, "w") as sink:
        sink.write(aligned_fasta)

    return AlignIO.read(StringIO(aligned_fasta), "fasta")
Пример #2
0
    def GetExec(self, optList, frame):
        """Build the MUSCLE command line in response to the "muscle" command.

        Stores ``frame``, the output file and the output type on the plugin,
        then translates the state of the GUI controls into MUSCLE options.

        :param optList: not used by this method
        :param frame: GUI frame providing ``paramBoxes``, ``abet`` and ``options``
        :return: the assembled command line as a string
        """
        self.frame = frame
        # NOTE(review): executable and sequence locations are hard-coded,
        # machine-specific paths.
        plugin_exe = r"C:/Program Files (x86)/py27/Lib/site-packages/Muscle.exe"
        seq_file = r"C:\Users\francis\Documents\Monguis\BioGui\plugins\my_seq.fasta"
        self.outfile = r".\plugins\muscle.txt"
        self.outtype = "fasta"
        cline = MuscleCommandline(plugin_exe, out=self.outfile)
        boxes = self.frame.paramBoxes

        # Profile mode aligns two inputs (in1/in2); otherwise a single input.
        if '1ProfileCheck' in boxes:
            if boxes['1ProfileCheck'].GetValue():
                cline.profile = True
                cline.in1 = seq_file
                cline.in2 = seq_file
            else:
                cline.input = seq_file

        if '1DiagCheck' in boxes:
            if boxes['1DiagCheck'].GetValue():
                cline.diags = True
                # Each spin control feeds its own MUSCLE option, and its
                # number must be read with GetValue() like every other
                # control here (int() of the widget itself would fail).
                if "DiagLenSpin" in boxes:
                    cline.diaglength = int(boxes["DiagLenSpin"].GetValue())
                if "DiagMargSpin" in boxes:
                    cline.diagmargin = int(boxes["DiagMargSpin"].GetValue())
                if "DiagBreakSpin" in boxes:
                    cline.diagbreak = int(boxes["DiagBreakSpin"].GetValue())
            elif "GapPenSpin" in boxes:
                cline.gapopen = float(boxes["GapPenSpin"].GetValue())
            else:
                cline.input = seq_file

        # Map the frame's alphabet onto MUSCLE's sequence type.
        if self.frame.abet == "AA":
            cline.seqtype = "protein"
        elif self.frame.abet in ("DNA", "RNA"):
            cline.seqtype = "nucleo"
        else:
            cline.seqtype = "auto"

        if self.frame.options:
            # (MUSCLE option, position of its control in boxList, converter)
            advanced = (
                ("objscore", 9, str),
                ("weight1", 13, str),
                ("weight2", 15, str),
                ("anchorspacing", 17, int),
                ("center", 19, float),
                ("hydro", 21, int),
                ("hydrofactor", 23, float),
                ("maxhours", 25, float),
                ("maxiters", 27, int),
                ("maxtrees", 29, int),
                ("minbestcolscore", 31, float),
                ("minsmoothscore", 33, float),
                ("smoothscoreceil", 35, float),
                ("smoothwindow", 37, int),
                ("sueff", 39, float),
            )
            for option, index, convert in advanced:
                setattr(cline, option, convert(self.boxList[index].GetValue()))

        return str(cline)
Пример #3
0
 def GetExec(self, optList, frame):
     """Build the MUSCLE command line in response to the "muscle" command.

     Stores ``frame``, the output file and the output type on the plugin,
     then translates the state of the GUI controls into MUSCLE options.

     :param optList: not used by this method
     :param frame: GUI frame providing ``paramBoxes``, ``abet`` and ``options``
     :return: the assembled command line as a string
     """
     self.frame = frame
     # NOTE(review): executable and sequence locations are hard-coded,
     # machine-specific paths.
     plugin_exe = r"C:/Program Files (x86)/py27/Lib/site-packages/Muscle.exe"
     seq_file = r"C:\Users\francis\Documents\Monguis\BioGui\plugins\my_seq.fasta"
     self.outfile = r".\plugins\muscle.txt"
     self.outtype = "fasta"
     cline = MuscleCommandline(plugin_exe, out=self.outfile)
     boxes = self.frame.paramBoxes

     # Profile mode aligns two inputs (in1/in2); otherwise a single input.
     if '1ProfileCheck' in boxes:
         if boxes['1ProfileCheck'].GetValue():
             cline.profile = True
             cline.in1 = seq_file
             cline.in2 = seq_file
         else:
             cline.input = seq_file

     if '1DiagCheck' in boxes:
         if boxes['1DiagCheck'].GetValue():
             cline.diags = True
             # Each spin control feeds its own MUSCLE option, and its
             # number must be read with GetValue() like every other
             # control here (int() of the widget itself would fail).
             if "DiagLenSpin" in boxes:
                 cline.diaglength = int(boxes["DiagLenSpin"].GetValue())
             if "DiagMargSpin" in boxes:
                 cline.diagmargin = int(boxes["DiagMargSpin"].GetValue())
             if "DiagBreakSpin" in boxes:
                 cline.diagbreak = int(boxes["DiagBreakSpin"].GetValue())
         elif "GapPenSpin" in boxes:
             cline.gapopen = float(boxes["GapPenSpin"].GetValue())
         else:
             cline.input = seq_file

     # Map the frame's alphabet onto MUSCLE's sequence type.
     if self.frame.abet == "AA":
         cline.seqtype = "protein"
     elif self.frame.abet in ("DNA", "RNA"):
         cline.seqtype = "nucleo"
     else:
         cline.seqtype = "auto"

     if self.frame.options:
         # (MUSCLE option, position of its control in boxList, converter)
         advanced = (
             ("objscore", 9, str),
             ("weight1", 13, str),
             ("weight2", 15, str),
             ("anchorspacing", 17, int),
             ("center", 19, float),
             ("hydro", 21, int),
             ("hydrofactor", 23, float),
             ("maxhours", 25, float),
             ("maxiters", 27, int),
             ("maxtrees", 29, int),
             ("minbestcolscore", 31, float),
             ("minsmoothscore", 33, float),
             ("smoothscoreceil", 35, float),
             ("smoothwindow", 37, int),
             ("sueff", 39, float),
         )
         for option, index, convert in advanced:
             setattr(cline, option, convert(self.boxList[index].GetValue()))

     return str(cline)
Пример #4
0
def muscleProcess(threadID, filebase, outbase, treebase):
    """Run MUSCLE for one worker and save its first-iteration tree.

    ``filebase``, ``outbase`` and ``treebase`` are %-format templates that
    are filled in with ``threadID`` to obtain the input FASTA, the alignment
    output and the tree file for this worker.
    """
    fasta, output, treeFile = (
        template % threadID for template in (filebase, outbase, treebase)
    )

    print("Building NJ tree from %s" % fasta)

    aligner = MuscleCommandline(cmd=muscle, input=fasta, out=output)
    # A single iteration with neighbor-joining clustering; the iteration-1
    # tree is written to treeFile.
    aligner.tree1 = treeFile
    aligner.cluster1 = "neighborjoining"
    aligner.maxiters = 1

    # The (stdout, stderr) result is deliberately discarded.
    aligner()
Пример #5
0
def muscleProcess(threadID, filebase, outbase, treebase):
    """Align one worker's FASTA file with MUSCLE and write its NJ tree.

    Every ``*base`` argument is a %-format template; substituting
    ``threadID`` gives the path used by this worker.
    """
    fasta = filebase % threadID
    output = outbase % threadID
    treeFile = treebase % threadID

    print("Building NJ tree from %s" % fasta)

    job = MuscleCommandline(cmd=muscle, input=fasta, out=output)
    # Iteration-1 tree goes to treeFile, built by neighbor joining, and
    # MUSCLE stops after that first iteration.
    for option, value in (
        ("tree1", treeFile),
        ("cluster1", "neighborjoining"),
        ("maxiters", 1),
    ):
        setattr(job, option, value)

    # Run MUSCLE; its captured output is not needed.
    job()
Пример #6
0
def _quickAlignInput(sequence):
    """Return ``sequence`` as a gap-free string; accepts a string or a SeqRecord-like object."""
    try:
        return re.sub("-", "", sequence)
    except TypeError:
        # not a string, probably a SeqRecord
        try:
            return re.sub("-", "", str(sequence.seq))
        except AttributeError:
            # give up
            sys.exit(
                "quickAlign() requires inputs to be either strings or SeqRecord objects"
            )


def quickAlign(refseq, testseq, maxiters=None, diags=None, gapopen=None):
    """Align two sequences with MUSCLE and return the aligned strings.

    Existing gap characters are stripped from both inputs, which are then
    piped to MUSCLE on stdin as FASTA records named "ref" and "test".

    :param refseq: reference sequence (string or SeqRecord)
    :param testseq: sequence to align against the reference (string or SeqRecord)
    :param maxiters: optional MUSCLE ``maxiters`` setting
    :param diags: optional MUSCLE ``diags`` setting
    :param gapopen: optional MUSCLE ``gapopen`` setting
    :return: dict mapping each record id in the MUSCLE output to its
        aligned sequence string
    """
    # sanity check
    refseq = _quickAlignInput(refseq)
    testseq = _quickAlignInput(testseq)

    data = ">ref\n%s\n>test\n%s\n" % (refseq, testseq)

    muscle_cline = MuscleCommandline(cmd=muscle, quiet=True)
    if maxiters is not None:
        muscle_cline.maxiters = maxiters
    if diags is not None:
        # Was assigned from the undefined name "diag", which raised
        # NameError whenever the caller supplied diags.
        muscle_cline.diags = diags
    if gapopen is not None:
        muscle_cline.gapopen = gapopen

    stdout, stderr = muscle_cline(stdin=data)

    return {p.id: str(p.seq) for p in SeqIO.parse(StringIO(stdout), "fasta")}
Пример #7
0
def quickAlign( refseq, testseq, maxiters=None, diags=None, gapopen=None ):
    """Align two sequences with MUSCLE and return the aligned strings.

    Both inputs are converted with ``str()`` and stripped of gap
    characters, then piped to MUSCLE on stdin as FASTA records named
    "ref" and "test".

    :param refseq: reference sequence
    :param testseq: sequence to align against the reference
    :param maxiters: optional MUSCLE ``maxiters`` setting
    :param diags: optional MUSCLE ``diags`` setting
    :param gapopen: optional MUSCLE ``gapopen`` setting
    :return: dict mapping each record id in the MUSCLE output to its
        aligned sequence string
    """
    # sanity check: remove any gaps left over from a previous alignment
    refseq = re.sub("-", "", str(refseq))
    testseq = re.sub("-", "", str(testseq))

    data = ">ref\n%s\n>test\n%s\n" % (refseq, testseq)

    muscle_cline = MuscleCommandline(cmd=muscle, quiet=True)
    if maxiters is not None:
        muscle_cline.maxiters = maxiters
    if diags is not None:
        # Was assigned from the undefined name "diag", which raised
        # NameError whenever the caller supplied diags.
        muscle_cline.diags = diags
    if gapopen is not None:
        muscle_cline.gapopen = gapopen

    stdout, stderr = muscle_cline(stdin=data)

    aligned = dict()
    for p in SeqIO.parse(StringIO(stdout), "fasta"):
        aligned[p.id] = str(p.seq)
    return aligned
Пример #8
0
from Bio.Align.Applications import MuscleCommandline

# Load the table of viruses, indexed by the 'Accession number' column.
# NOTE(review): `viruses` is not used in the visible part of this script.
viruses = pd.read_csv(os.path.join('..', 'data', 'viruses.csv'),
                      index_col='Accession number')

# Path of the MUSCLE executable (one directory up), including its exact file name.
muscle_exe = os.path.join("..", "muscle3.8.31_i86linux64")

# Define the MUSCLE command line; the input is ../data/downloads.fasta.
muscle_cline = MuscleCommandline(muscle_exe,
                                 input=os.path.join("..", "data",
                                                    "downloads.fasta"))

# Use 2 iterations: when sequences are far apart, attempting a finer alignment leads to an error.
muscle_cline.maxiters = 2

# Run MUSCLE to align all sequences; the captured stdout is parsed below.
stdout, stderr = muscle_cline()

# Parse the captured output as a FASTA alignment and print it.
align = AlignIO.read(StringIO(stdout), "fasta")
print(align)

# Build one row per aligned record: 0 where the column is a gap ('-'), 1 otherwise.
mat = [[0 if nucleotide == '-' else 1 for nucleotide in rec.seq]
       for rec in align]
# Two-colour map built from 'w' and 'r' -- presumably white for gaps (0) and red for residues (1).
cmap = ListedColormap(['w', 'r'])
fig, ax = plt.subplots(1, 1)
ax.matshow(mat, cmap=cmap)
# Derive the aspect ratio from the axis limits.
# NOTE(review): looks intended to draw the matrix about three times as wide as tall -- confirm.
ax.set_aspect(ax.get_xlim()[1] / ax.get_ylim()[0] / 3)
Пример #9
0
def buildGSSP(vgene):
    """Build substitution-profile rows for one V gene.

    For each requested profile the germline sequences of ``vgene`` plus a
    set of observed sequences are aligned with MUSCLE, columns that are gaps
    in a germline row are dropped, and a position-specific score matrix is
    computed. Germline residues are zeroed out so the remaining counts
    describe substitutions only.

    Reads the module-level ``masterList``, ``germList``, ``arguments``,
    ``mask``, ``aa_list``, ``prj_tree`` and ``muscle``.

    :param vgene: name of the V gene to profile
    :return: list of rows ``[vgene, profile number, position, germline
        residue(s), substitution frequency or "None"]`` followed by one
        formatted frequency per entry of ``aa_list``; an empty list when the
        gene is skipped
    """
    results = []

    if len(masterList[vgene]) < arguments["--numSequences"]:
        print("Skipping %s, not enough sequences (%d)..." %
              (vgene, len(masterList[vgene])))
        return []

    if vgene not in germList:
        print("Skipping %s, it's not in the germline database..." % vgene)
        return []

    # Take random overlapping subsets to generate multiple profiles
    #  need to add back a sanity check for capping the number of subsets if there's not enough raw data.
    numProfiles = arguments['--profiles']
    if arguments["--profiles"] == 0:
        numProfiles = 1

    # Germline ids are loop-invariant; build the lookup once.
    germ_ids = {g.id for g in germList[vgene]}

    success = 0

    for i in range(numProfiles):
        seqs = [] + germList[vgene]  # force a copy rather than an alias
        if arguments["--profiles"] == 0:
            # a single profile built from every available sequence
            seqs += list(masterList[vgene])
        else:
            # get our sequence subset, add the germlines, and write them
            #   to a temporary file for alignment
            seqs += list(
                numpy.random.choice(masterList[vgene],
                                    size=arguments["--numSequences"],
                                    replace=False))

        tempFile = "%s/work/mGSSP/%s_profileBuilder" % (prj_tree.home, vgene)
        with open("%s.fa" % tempFile, "w") as temp:
            SeqIO.write(seqs, temp, "fasta")

        muscle_cline = MuscleCommandline(cmd=muscle,
                                         input="%s.fa" % tempFile,
                                         out="%s.aln" % tempFile)

        # try to speed up the process a little bit for large datasets
        # still going to max out at ~50k seqs per profile (probably)
        muscle_cline.maxiters = 2
        muscle_cline.diags = True

        try:
            muscle_cline()
        except Exception:
            # Best effort: a failed alignment skips this profile only.
            # "except Exception" (not a bare except) lets Ctrl-C and
            # SystemExit propagate.
            print("Error in alignment #%d for %s (skipping)" % (i + 1, vgene))
            for f in glob.glob("%s.*" % tempFile):
                os.remove(f)
            continue

        alignment = AlignIO.read("%s.aln" % tempFile, "fasta")
        success += 1

        # Input order is not maintained, so we need a little
        #   kludge to find a germline sequence. Use the
        #   first one to remove any insertions from the alignment
        #   (row 0 is used if no germline id is found).
        germRow = next(
            (n for n, rec in enumerate(alignment) if rec.id in germ_ids), 0)

        # look for gaps one at a time so we don't get tripped up by shifting indices
        gap = re.search("-+", str(alignment[germRow].seq))
        while gap:
            alignment = alignment[:, 0:gap.start()] + alignment[:, gap.end():]
            gap = re.search("-+", str(alignment[germRow].seq))

        # Now we get BioPython to make a PSSM for us. To convert that into
        #    a mutability profile, we will delete the germline residue[s]
        #    at each position (but save what they were)
        germRes = defaultdict(Counter)
        summary_align = AlignInfo.SummaryInfo(alignment)
        pssm = summary_align.pos_specific_score_matrix(
            chars_to_ignore=['-', 'X'])

        # get number of datapoints at each position (might be different than
        # the number of sequences in the profile if there are gaps or missing
        # data): total counts after ignoring missing data but before dumping
        # germline residues, minus the germline sequences themselves.
        denominator = [
            sum(pos.values()) - len(germList[vgene]) for pos in pssm
        ]

        for germ in germList[vgene]:
            for pos, residue in enumerate(germ):
                if residue == "X":
                    continue
                germRes[pos][residue] += 1
                pssm[pos][residue] = 0

        # normalize and save
        for p, pos in enumerate(pssm):
            germAA = ",".join([x[0] for x in germRes[p].most_common()])
            total = sum(pos.values())  # non-germline observations left

            # Masked positions and positions with too little data get "None".
            if (p < mask[vgene]
                    or denominator[p] < arguments["--numSequences"]):
                mutability = "None"
            else:
                mutability = "%.5f" % (total / denominator[p])

            if total > 0:
                spectrum = ["%.5f" % (pos.get(r, 0) / total) for r in aa_list]
            else:
                spectrum = ["0.00" for r in aa_list]

            results.append([vgene, i + 1, p + 1, germAA, mutability] +
                           spectrum)

        # clean up
        for f in glob.glob("%s.*" % tempFile):
            os.remove(f)

    print("Successfully built %d/%d profiles for %s using %d sequences!" %
          (success, numProfiles, vgene, len(seqs) - len(germList[vgene])))
    return results
Пример #10
0
def buildGSSP( vgene ):
    """Build substitution-profile rows for one V gene.

    For each requested profile the germline sequences of ``vgene`` plus a
    set of observed sequences are aligned with MUSCLE, columns that are gaps
    in a germline row are dropped, and a position-specific score matrix is
    computed. Germline residues are zeroed out so the remaining counts
    describe substitutions only.

    Reads the module-level ``masterList``, ``germList``, ``arguments``,
    ``mask``, ``aa_list``, ``prj_tree`` and ``muscle``.

    :param vgene: name of the V gene to profile
    :return: list of rows ``[vgene, profile number, position, germline
        residue(s), substitution frequency or "None"]`` followed by one
        formatted frequency per entry of ``aa_list``; an empty list when the
        gene is skipped
    """
    results = []

    if len(masterList[vgene]) < arguments["--numSequences"]:
        print("Skipping %s, not enough sequences (%d)..." % (vgene, len(masterList[vgene])))
        return []

    if vgene not in germList:
        print("Skipping %s, it's not in the germline database..." % vgene)
        return []

    # Take random overlapping subsets to generate multiple profiles
    #  need to add back a sanity check for capping the number of subsets if there's not enough raw data.
    numProfiles = arguments['--profiles']
    if arguments["--profiles"] == 0:
        numProfiles = 1

    # Germline ids are loop-invariant; build the lookup once.
    germ_ids = {g.id for g in germList[vgene]}

    success = 0

    for i in range(numProfiles):
        seqs = [] + germList[vgene]  # force a copy rather than an alias
        if arguments["--profiles"] == 0:
            # a single profile built from every available sequence
            seqs += list(masterList[vgene])
        else:
            # get our sequence subset, add the germlines, and write them
            #   to a temporary file for alignment
            seqs += list(numpy.random.choice(masterList[vgene], size=arguments["--numSequences"], replace=False))

        tempFile = "%s/work/mGSSP/%s_profileBuilder" % (prj_tree.home, vgene)
        with open("%s.fa" % tempFile, "w") as temp:
            SeqIO.write(seqs, temp, "fasta")

        muscle_cline = MuscleCommandline(cmd=muscle, input="%s.fa" % tempFile, out="%s.aln" % tempFile)

        # try to speed up the process a little bit for large datasets
        # still going to max out at ~50k seqs per profile (probably)
        muscle_cline.maxiters = 2
        muscle_cline.diags = True

        try:
            muscle_cline()
        except Exception:
            # Best effort: a failed alignment skips this profile only.
            # "except Exception" (not a bare except) lets Ctrl-C and
            # SystemExit propagate.
            print("Error in alignment #%d for %s (skipping)" % (i + 1, vgene))
            for f in glob.glob("%s.*" % tempFile):
                os.remove(f)
            continue

        alignment = AlignIO.read("%s.aln" % tempFile, "fasta")
        success += 1

        # Input order is not maintained, so we need a little
        #   kludge to find a germline sequence. Use the
        #   first one to remove any insertions from the alignment
        #   (row 0 is used if no germline id is found).
        germRow = 0
        for n, rec in enumerate(alignment):
            if rec.id in germ_ids:
                germRow = n
                break

        # look for gaps one at a time so we don't get tripped up by shifting indices
        gap = re.search("-+", str(alignment[germRow].seq))
        while gap:
            alignment = alignment[:, 0:gap.start()] + alignment[:, gap.end():]
            gap = re.search("-+", str(alignment[germRow].seq))

        # Now we get BioPython to make a PSSM for us. To convert that into
        #    a mutability profile, we will delete the germline residue[s]
        #    at each position (but save what they were)
        germRes = defaultdict(Counter)
        summary_align = AlignInfo.SummaryInfo(alignment)
        pssm = summary_align.pos_specific_score_matrix(chars_to_ignore=['-', 'X'])

        # get number of datapoints at each position (might be different than
        # the number of sequences in the profile if there are gaps or missing
        # data): total counts after ignoring missing data but before dumping
        # germline residues, minus the germline sequences themselves.
        denominator = []
        for pos in pssm:
            denominator.append(sum(pos.values()) - len(germList[vgene]))

        for germ in germList[vgene]:
            for pos, residue in enumerate(germ):
                if residue == "X":
                    continue
                germRes[pos][residue] += 1
                pssm[pos][residue] = 0

        # normalize and save
        for p, pos in enumerate(pssm):
            germAA = ",".join([x[0] for x in germRes[p].most_common()])
            total = sum(pos.values())  # non-germline observations left

            # Masked positions and positions with too little data get "None".
            if p < mask[vgene] or denominator[p] < arguments["--numSequences"]:
                mutability = "None"
            else:
                mutability = "%.5f" % (total / denominator[p])

            if total > 0:
                spectrum = ["%.5f" % (pos.get(r, 0) / total) for r in aa_list]
            else:
                spectrum = ["0.00" for r in aa_list]

            results.append([vgene, i + 1, p + 1, germAA, mutability] + spectrum)

        # clean up
        for f in glob.glob("%s.*" % tempFile):
            os.remove(f)

    print("Successfully built %d/%d profiles for %s using %d sequences!" % (success, numProfiles, vgene, len(seqs) - len(germList[vgene])))
    return results
Пример #11
0
def main():
    """Optionally align the project sequences, then run DNAML on them.

    Steps: refuse to overwrite old DNAML files unless ``force`` is set;
    when ``doAlign`` is set, append the germline and native sequences to
    the collected sequences and align them with MUSCLE; rename every
    sequence to a 10-digit index; write the PHYLIP ``infile`` and a DNAML
    answer script; run DNAML in the phylo directory; and finally restore
    the original names in the tree and output files.

    Reads module-level configuration (``prj_tree``, ``prj_name``, ``force``,
    ``doAlign``, ``germ_seq``, ``natives``, ``DNAML``, ``revertName``) and
    sets the globals ``inFile`` and ``lookup``.
    """
    global inFile, lookup

    oldFiles = (
        glob.glob("%s/infile" % prj_tree.phylo)
        + glob.glob("%s/outtree" % prj_tree.phylo)
        + glob.glob("%s/outfile" % prj_tree.phylo)
    )
    if len(oldFiles) > 0:
        if force:
            for f in oldFiles:
                os.remove(f)
        else:
            sys.exit("Old files exist! Please use the -f flag to force overwrite.")

    if doAlign:
        to_align = "%s/%s_to_align.fa" % (prj_tree.phylo, prj_name)
        aligned = "%s/%s_aligned.afa" % (prj_tree.phylo, prj_name)

        # first create a working file to align and add the germline and natives
        shutil.copyfile("%s/%s-collected.fa" % (prj_tree.nt, prj_name), to_align)
        # "with" guarantees the handle is closed even if a write fails
        with open(to_align, "a") as handle:
            handle.write(">%s\n%s\n" % (germ_seq.id, germ_seq.seq))
            for n in natives.values():
                handle.write(">%s\n%s\n" % (n.id, n.seq))

        # now run muscle
        run_muscle = MuscleCommandline(input=to_align, out=aligned)
        run_muscle.maxiters = 2
        run_muscle.diags = True
        run_muscle.gapopen = -5000.0  # code requires a float
        # single-argument call form prints the same text on Python 2 and 3
        print(run_muscle)
        run_muscle()

        # change inFile variable so that remaining code is the same for both cases
        # It's probably really bad form to handle this in this way
        inFile = aligned

    # open the alignment to rename everything and find germline sequence
    # rename is to avoid possible errors with DNAML from sequence ids that are too long
    germ_pos = 1
    # "rU" (universal newlines) is kept as written in the original code
    with open(inFile, "rU") as handle:
        if doAlign:
            aln = AlignIO.read(handle, "fasta")
        else:
            try:
                aln = AlignIO.read(handle, "phylip")
            except Exception:
                # narrowed from a bare except so Ctrl-C is not reported as a format problem
                sys.exit("Please make sure custom input is aligned and in PHYLIP format")

    # lookup[k - 1] keeps the original id of the sequence renamed to index k
    lookup = []
    for seq in aln:
        lookup.append(seq.id)
        if re.search("(IG|VH|VK|VL|HV|KV|LV)", seq.id) is not None:
            # id looks like a germline gene name: remember its 1-based position
            germ_pos = len(lookup)
        seq.id = "%010d" % len(lookup)

    with open("%s/infile" % prj_tree.phylo, "w") as output:
        AlignIO.write(aln, output, "phylip")

    # now generate script for DNAML
    # J is "jumble" followed by random seed and number of times to repeat
    # O is outgroup root, followed by position of the germline in the alignment
    # 5 tells DNAML to do the ancestor inference
    # Y starts the run
    with open("%s/dnaml.in" % prj_tree.phylo, "w") as handle:
        # integer bound: float arguments to randint are rejected by newer Pythons
        seed = random.randint(0, 10 ** 10) * 2 + 1  # seed must be odd
        handle.write("J\n%d\n3\nO\n%d\n5\nY\n" % (seed, germ_pos))

    # change to work directory so DNAML finds "infile" and puts the output where we expect
    os.chdir(prj_tree.phylo)
    with open("%s/dnaml.in" % prj_tree.phylo, "rU") as pipe:
        subprocess.call([DNAML], stdin=pipe)

    # revert names in tree
    with open("%s/outtree" % prj_tree.phylo, "rU") as intree:
        mytree = intree.read()
    fixedtree = re.sub(r"\d{10}", revertName, mytree)
    with open("%s/%s.tree" % (prj_tree.out, prj_name), "w") as outtree:
        outtree.write(fixedtree)

    # revert names in out file
    with open("%s/outfile" % prj_tree.phylo, "rU") as instuff:
        mystuff = instuff.read()
    fixedstuff = re.sub(r"\d{10}", revertName, mystuff)
    with open("%s/%s.dnaml.out" % (prj_tree.logs, prj_name), "w") as outstuff:
        outstuff.write(fixedstuff)

    # clean up (relative paths: the working directory is prj_tree.phylo now)
    os.remove("infile")
    os.remove("outfile")
    os.remove("outtree")
Пример #12
0
def _exitOnProcessError(process, errors):
    """Exit with a diagnostic if ``process`` wrote to stderr or returned non-zero."""
    if errors != "" or process.returncode != 0:
        sys.exit("Error running '%s':\n%sExit code %d" %
                 (" ".join(process.args), errors, process.returncode))


def _runIgPhyML(model_args, opts):
    """Run IgPhyML with ``model_args + opts``; exit the program on failure.

    If the optimized binary cannot load its shared libraries, the run is
    repeated with ``igphyml_slow``, dropping the first two entries of
    ``opts`` (the threading option, which that build does not support).
    """
    s = subprocess.Popen([igphyml] + model_args + opts,
                         universal_newlines=True,
                         stderr=subprocess.PIPE)
    o, e = s.communicate()

    if re.search("error while loading shared libraries", str(e)):
        # Some libraries needed for optimized execution are missing
        #  Try again with a version compiled without optimizations
        s = subprocess.Popen(
            [igphyml_slow] + model_args + opts[2:],  # no threading option available
            universal_newlines=True,
            stderr=subprocess.PIPE)
        o, e = s.communicate()

    _exitOnProcessError(s, e)


def main():
    """Optionally align the input, then build an IgPhyML tree and infer ancestors.

    Steps: refuse to overwrite old outputs unless ``-f`` is given; when
    ``-v`` is given, append the germline and native sequences to the input
    and align with MUSCLE; trim the alignment to a whole number of codons;
    clean up sequence ids and record gap positions; run IgPhyML twice (GY94
    initial tree, then HLP17 refinement); unless ``--noAnc`` is set, run the
    ancestor reconstruction script and restore gaps in its output; finally
    move the stats and tree files to their destinations.

    Reads module-level configuration (``arguments``, ``prj_tree``,
    ``prj_name``, ``germ_seq``, ``natives``, ``muscle``, ``igphyml``,
    ``igphyml_slow``, ``reconstruct``, ``SCRIPT_FOLDER``) and may overwrite
    ``arguments['-i']``.
    """
    oldFiles = (glob.glob("%s/infile" % prj_tree.phylo) +
                glob.glob("%s/%s_igphyml.tree" % (prj_tree.out, prj_name)) +
                glob.glob("%s/%s_igphyml_stats.txt" %
                          (prj_tree.logs, prj_name)))
    if len(oldFiles) > 0:
        if arguments['-f']:
            for f in oldFiles:
                os.remove(f)
        else:
            sys.exit(
                "Old files exist! Please use the -f flag to force overwrite.")

    if arguments['-v'] is not None:

        # do alignment
        to_align = "%s/%s_to_align.fa" % (prj_tree.phylo, prj_name)
        aligned = "%s/%s_aligned.afa" % (prj_tree.phylo, prj_name)

        # first create a working file to align and add the germline and natives
        shutil.copyfile(arguments['--seqs'], to_align)
        # "with" guarantees the handle is closed even if a write fails
        with open(to_align, "a") as handle:
            handle.write("\n>%s\n%s\n" % (germ_seq.id, germ_seq.seq))
            for n in natives.values():
                handle.write(">%s\n%s\n" % (n.id, n.seq))

        # now run muscle
        run_muscle = MuscleCommandline(cmd=muscle, input=to_align, out=aligned)
        run_muscle.maxiters = 2
        run_muscle.diags = True
        run_muscle.gapopen = -5000.0  # code requires a float
        print(run_muscle)
        run_muscle()

        # this is probably bad form
        arguments['-i'] = aligned

    # open the alignment to rename everything and find germline sequence
    with open(arguments['-i'], "r") as handle:
        try:
            aln = AlignIO.read(handle, arguments['--format'])
        except Exception:
            # narrowed from a bare except so Ctrl-C is not reported as a format problem
            sys.exit("Couldn't read alignment: is %s the correct format?" %
                     arguments['--format'])

    align_len = aln.get_alignment_length()
    extra = align_len % 3
    if extra > 0:
        print("Trimming alignment to even codon length...", file=sys.stderr)
        aln = aln[:, 0:-extra]
        align_len -= extra

    # kill the fasta def line and any usearch/vsearch annotations to avoid formatting foul-ups
    germ_id = ""
    foundRoot = False
    gaps = defaultdict(list)
    for seq in aln:
        seq.id = re.sub("[;:].*", "", seq.id)
        seq.description = ""
        if re.search("(IG|VH|VK|VL|HV|KV|LV)", seq.id, re.I) is not None:
            germ_id = seq.id

        if arguments['--root'] is not None and seq.id == arguments['--root']:
            foundRoot = True

        for g in re.finditer("-+", str(seq.seq)):
            # save gap. value is a field to help me determine what's real in assignGaps
            gaps[seq.id.upper()].append({
                'start': g.start(),
                'end': g.end(),
                'value': 1
            })

    # an explicit --root overrides any germline found by name
    if arguments['--root'] is not None:
        germ_id = arguments['--root']
        if not foundRoot:
            sys.exit("Couldn't find specified root sequence %s in input file" %
                     arguments['--root'])
    elif germ_id == "":
        sys.exit(
            "Couldn't find a germline gene in the alignment, please use the --root option and try again."
        )

    infile = "%s/infile" % prj_tree.phylo
    with open(infile, "w") as output:
        AlignIO.write(aln, output, "fasta")

    # now call IgPhyML
    # fast initial tree
    opts = ["--threads", arguments['--threads']]
    if not arguments['--quick']:
        opts += ["-s", "SPR"]
    if arguments['--seed'] is not None:
        opts += ["--r_seed", arguments['--seed']]
    # set an environmental variable so that IgPhyML can find its libraries
    os.environ.update(
        {'IGPHYML_PATH': '%s/third-party/src/motifs' % SCRIPT_FOLDER})
    _runIgPhyML([
        "-i", infile, "-m", "GY", "-w", "MO", "-t", "e", "--run_id", "gy94"
    ], opts)

    # Refine tree with AID-specific hotspot motifs
    opts = ["--threads", arguments['--threads']]
    if arguments['--quick']:
        opts += ['-o', 'lr']
    else:
        opts += ['-o', 'tlr']
    if arguments['--seed'] is not None:
        opts += ["--r_seed", arguments['--seed']]

    _runIgPhyML([
        "-i", infile, "-m", "HLP17", "--root", germ_id, "-u",
        "%s/infile_igphyml_tree.txt_gy94" % prj_tree.phylo, "--motifs", "FCH",
        "--run_id", "hlp17", "--ambigfile",
        "%s/ambigfile.txt" % prj_tree.phylo
    ], opts)

    if not arguments['--noAnc']:
        # now need to set up a config file for ancestor reconstruction
        with open("%s/ar.config" % prj_tree.phylo, "w") as handle:
            # align_len is a multiple of 3 after trimming: length in codons
            handle.write("length\t%d\n" % (align_len // 3))
            handle.write("rooted\t1\noutdir\t%s\n" % prj_tree.phylo)
            handle.write("seqfile\t%s/infile\n" % prj_tree.phylo)
            handle.write("rootid\t%s\n" % germ_id)
            handle.write("igphyml\t%s/%s\n" % (SCRIPT_FOLDER, "third-party"))
            handle.write("stats\t%s/infile_igphyml_stats.txt_hlp17\n" %
                         prj_tree.phylo)
            handle.write("tree\t%s/infile_igphyml_tree.txt_hlp17\n" %
                         prj_tree.phylo)
            handle.write("ambigfile\t%s/ambigfile.txt\n" % prj_tree.phylo)
            handle.write("stem\t%s\n" % prj_name)

        s = subprocess.Popen([
            "perl", "-I",
            "%s/third-party" % SCRIPT_FOLDER, reconstruct,
            "%s/ar.config" % prj_tree.phylo
        ],
                             universal_newlines=True,
                             stderr=subprocess.PIPE)
        o, e = s.communicate()
        _exitOnProcessError(s, e)

        if len(gaps) > 0:
            # fix ancestor inference by putting gaps back in
            # start by reading in inferred sequences and reconstructing the tree
            tree = dict()
            stack = list()
            seqDict = OrderedDict()
            with open("%s/%s.MLcodons.fa" % (prj_tree.phylo, prj_name),
                      "r") as infer:
                for seq in SeqIO.parse(infer, "fasta"):
                    name = seq.id.split(";")[1]
                    seqDict[name] = seq
                    if "," in name:
                        # a name containing a comma is treated as an internal
                        # node whose children are the two most recent entries
                        tree[name] = {'id': name, 'children': stack[-2:]}
                        tree[stack.pop()]['parent'] = name
                        tree[stack.pop()]['parent'] = name
                        stack.append(name)
                    else:
                        tree[name] = {'id': name, 'children': []}
                        stack.append(name)

            # now iterate down tree to propagate gaps
            assignGaps(stack[0], tree, gaps)

            # do output
            with open("%s/%s_inferredAncestors.fa" % (prj_tree.nt, prj_name),
                      "w") as handle:
                SeqIO.write(getFinalSeqs(seqDict, gaps), handle, "fasta")
            with open("%s/%s_inferredAncestors.fa" % (prj_tree.aa, prj_name),
                      "w") as handle:
                SeqIO.write(getFinalSeqs(seqDict, gaps, trans=True), handle,
                            "fasta")

        else:
            # no gaps to restore: the reconstruction output is used as-is
            os.rename("%s/%s.MLcodons.fa" % (prj_tree.phylo, prj_name),
                      "%s/%s_inferredAncestors.fa" % (prj_tree.nt, prj_name))
            os.rename("%s/%s.MLaas.fa" % (prj_tree.phylo, prj_name),
                      "%s/%s_inferredAncestors.fa" % (prj_tree.aa, prj_name))

    # move non-sequence outputs to logical places
    os.rename("%s/infile_igphyml_stats.txt_hlp17" % prj_tree.phylo,
              "%s/%s_igphyml_stats.txt" % (prj_tree.logs, prj_name))
    os.rename("%s/infile_igphyml_tree.txt_hlp17" % prj_tree.phylo,
              "%s/%s_igphyml.tree" % (prj_tree.out, prj_name))
Пример #13
0
def main():
    """Prepare an alignment for DNAML, run DNAML, and restore sequence names.

    Steps, in order:
      1. Refuse to continue (unless ``force`` is set) when ``infile``,
         ``outtree`` or ``outfile`` from a previous run are still present
         in ``workDir``; with ``force`` they are deleted.
      2. If ``doAlign`` is set, append the germline and native sequences to
         a copy of ``seqFile`` and align everything with MUSCLE.
      3. Read the alignment (FASTA when it was produced here, relaxed
         PHYLIP otherwise), replace every sequence id with a zero-padded
         10-digit index and remember the original ids in ``lookup``.
      4. Write the renamed alignment as ``infile`` plus a ``dnaml.in``
         answer script, then run ``dnaml`` from inside ``workDir``.
      5. Copy ``outtree`` and ``outfile`` to ``outTreeFile`` and ``outFile``,
         passing every 10-digit run through ``revertName``.

    All configuration is taken from module-level names; the function takes
    no arguments and returns nothing. It exits through ``sys.exit`` when old
    output is in the way or when a user-supplied alignment cannot be parsed.
    """
    global inFile, lookup, workDir, outTreeFile, outFile, seqFile

    # files left over from a previous run in the working directory
    oldFiles = []
    for leftover in ("infile", "outtree", "outfile"):
        oldFiles += glob.glob("%s/%s" % (workDir, leftover))

    if oldFiles:
        if not force:
            sys.exit("Old files exist! Please use the -f flag to force overwrite.")
        for f in oldFiles:
            os.remove(f)

    if doAlign:
        toAlign = "%s/%s_to_align.fa" % (workDir, prj_name)
        aligned = "%s/%s_aligned.afa" % (prj_tree.phylo, prj_name)

        # first create a working file to align and add the germline and natives
        shutil.copyfile(seqFile, toAlign)
        with open(toAlign, "a") as handle:
            handle.write(">%s\n%s\n" % (germ_seq.id, germ_seq.seq))
            for n in natives.values():
                handle.write(">%s\n%s\n" % (n.id, n.seq))

        # now run muscle
        run_muscle = MuscleCommandline(input=toAlign, out=aligned)
        run_muscle.maxiters = 2
        run_muscle.diags = True
        run_muscle.gapopen = -5000.0  # code requires a float
        print(run_muscle)
        run_muscle()

        # Read back exactly the file MUSCLE was told to write. The path used
        # to be rebuilt from workDir, which only matched the MUSCLE output
        # when workDir and prj_tree.phylo were the same directory.
        inFile = aligned

    # open the alignment to rename everything and find germline sequence
    # rename is to avoid possible errors with DNAML from sequence ids that are too long
    with open(inFile, "rU") as handle:
        if doAlign:
            aln = AlignIO.read(handle, "fasta")
        else:
            try:
                aln = AlignIO.read(handle, "phylip-relaxed")
            except Exception:
                # deliberately not a bare except: Ctrl-C and SystemExit
                # must not be reported as a formatting problem
                sys.exit("Please make sure custom input is aligned and in PHYLIP format...")

    # germ_pos is the 1-based position of the germline in the alignment; it
    # stays 1 when no id matches, and the last matching id wins otherwise
    germ_pos = 1
    lookup = []
    for position, seq in enumerate(aln, 1):
        lookup.append(seq.id)
        if re.search("(IG|VH|VK|VL|HV|KV|LV)", seq.id) is not None:
            germ_pos = position
        seq.id = "%010d" % position

    with open("%s/infile" % workDir, "w") as output:
        AlignIO.write(aln, output, "phylip")

    # now generate script for DNAML
    # J is "jumble" followed by random seed and number of times to repeat
    # G is not explained by the original notes -- presumably DNAML's global
    #   rearrangements option; TODO confirm against the PHYLIP documentation
    # O is outgroup root, followed by position of the germline in the alignment
    # 5 tells DNAML to do the ancestor inference
    # Y starts the run
    with open("%s/dnaml.in" % workDir, "w") as handle:
        # integer bound instead of the float 1e10; same range of values
        seed = random.randint(0, 10 ** 10) * 2 + 1  # seed must be odd
        handle.write("J\n%d\n5\nG\nO\n%d\n5\nY\n" % (seed, germ_pos))

    # change to work directory so DNAML finds "infile" and puts the output where we expect
    origWD = os.getcwd()
    os.chdir(workDir)
    try:
        with open("dnaml.in", "rU") as pipe:
            subprocess.call([dnaml], stdin=pipe)
    finally:
        # always return to the starting directory, even if DNAML could not
        # be launched, so the relative paths used below still resolve
        os.chdir(origWD)

    # revert names in tree, then in out file
    for produced, destination in (("outtree", outTreeFile), ("outfile", outFile)):
        with open("%s/%s" % (workDir, produced), "rU") as source:
            contents = source.read()
        with open(destination, "w") as target:
            target.write(re.sub(r"\d{10}", revertName, contents))

    print("\nOutput in %s and %s\n" % (outTreeFile, outFile))
Пример #14
0
	def run(self):
		run_muscle = MuscleCommandline( input=self.fasta, out=self.output )
		run_muscle.tree1      = self.tree
		run_muscle.cluster1   = "neighborjoining"
		run_muscle.maxiters   = 1
		thisVarHidesTheOutput = run_muscle()