def remove(sequence, hmmfile, domNumber):
    """
    Deletes one domain from the input sequence, together with one adjacent
    5-residue linker when the domain has a neighbour.

    Args:
        sequence: The full gene sequence (including both domain and linker
            regions)
        hmmfile: hmm file containing hmm of domain to be removed
        domNumber: Index of the domain to remove w.r.t. the other domains

    Returns:
        The sequence with the selected domain (and at most one linker) cut out.
    """
    locate = findDomains if HMMER else findMotifs
    starts, ends = locate(sequence, hmmfile)[:2]

    cutFrom = starts[domNumber]
    cutTo = ends[domNumber] + 1

    if domNumber > 0:
        # Not the first domain: also drop the 5 residues in front of it.
        cutFrom -= 5
    elif domNumber < len(starts) - 1:
        # First domain with a successor: drop the 5 residues after it instead.
        cutTo += 5
    # Otherwise it is the only domain and no linker is removed.

    return sequence[:cutFrom] + sequence[cutTo:]
def genRandomSequence(numDoms):
    """
    Generates a random zf-C2H2 protein sequence with the given number of
    domains. Includes sequence before and after zf domain.

    A source sequence is drawn at random from the files under DATAPATH (every
    second line of the chosen file, starting with the second). The draw is
    repeated until the sequence contains at least numDoms domains and has a
    non-empty prefix and suffix around them.

    Args:
        numDoms: Number of domains the generated sequence is built from

    Returns:
        The generated sequence with gap characters, lowercase letters and the
        characters 'BJOUXZ' removed.
    """
    files = ls(DATAPATH)

    # Retry with a loop rather than recursion so that a long run of unsuitable
    # draws cannot exhaust the recursion limit.
    while True:
        # Close the file as soon as its lines have been read.
        with open(DATAPATH + choice(files)) as handle:
            records = list(handle)[1::2]

        # The original called translate() here but discarded the result, so
        # the gaps were never actually removed before the domain search.
        sequence = choice(records).strip().replace('-', '')

        starts, ends, seqs = findDomains(sequence, hmmfile)
        if len(starts) < numDoms:
            continue

        prefix = sequence[:starts[0]]
        # NOTE(review): other code in this file slices with ends[i] + 1;
        # confirm whether ends[-1] is meant to be included in the suffix.
        suffix = sequence[ends[-1]:]
        if prefix != '' and suffix != '':
            break

    linkers = [sequence[ends[i] + 1:starts[i + 1]]
               for i in range(len(starts) - 1)]

    # numDoms random domains joined by numDoms - 1 random linkers.
    pieces = []
    for _ in range(numDoms - 1):
        pieces.append(choice(seqs) + choice(linkers))
    pieces.append(choice(seqs))

    newSeq = prefix + ''.join(pieces) + suffix
    newSeq = newSeq.replace('-', '')

    #Deletes all lowercase letters
    newSeq = newSeq.translate(None, string.ascii_lowercase)

    #Deletes all illegal AA characters
    newSeq = newSeq.translate(None, 'BJOUXZ')

    return newSeq
def duplicate(sequence, hmmfile, domNumber, length):
    """
    Duplicates a run of `length` consecutive domains, starting at domain
    `domNumber`, inside the sequence.

    Everything up to the end of the last domain of the run is kept, the fixed
    linker 'TGEVK' is appended, and then the sequence is repeated from the
    start of domain `domNumber` onwards.

    Args:
        sequence: The full gene sequence (including both domain and linker
            regions)
        hmmfile: hmm file containing hmm of domain to be duplicated
        domNumber: The position of the first domain to be duplicated w.r.t.
            the other domains
        length: Length (#domains) involved in duplication

    Returns:
        sequence (str): The sequence after the specified duplication.
    """
    locate = findDomains if HMMER else findMotifs
    starts, ends = locate(sequence, hmmfile)[:2]

    lastCopied = domNumber + length - 1
    head = sequence[:ends[lastCopied] + 1]
    tail = sequence[starts[domNumber]:]

    return head + 'TGEVK' + tail
def generateIQTree():
    """
    Simulates a host/guest tree pair, writes the trees and domain sequences
    to disk, runs mlTree on them and loads the resulting tree file.

    Files written by this function: 'testtree.nwk', 'hosttree.nwk' and
    'testfasta.fa'. The file 'testfasta.fa.treefile' is read back afterwards.

    Returns:
        (hostTree, guestTree, iqtree): the simulated host tree, the pruned
        guest tree with an 'Outgroup' child attached, and the tree loaded
        from 'testfasta.fa.treefile', rooted on its 'Outgroup' node.
    """
    startingDomains = 1

    hostTree = createRandomTopology(1, 1, lambda x: x)
    guestTree, nodeMap = buildGuestTree(hostTree, s2, expfunc, .2,
                                        gaussNoise, startingDomains)
    rootSequence = grs(startingDomains)
    evolveAlongTree(hostTree, guestTree, nodeMap, rootSequence, hmmfile,
                    emissionProbs, transmat)

    locate = findDomains if HMMER else findMotifs

    names = []
    seqs = []
    for node in hostTree:
        seqs.extend(locate(node.sequence, hmmfile)[2])
        # Surviving guest leaves of this host node, ordered by position.
        survivors = sorted((leaf.position, leaf.name)
                           for leaf in findLeaves(nodeMap[node])
                           if leaf.event != 'LOSS')
        names.extend(label for _, label in survivors)

    guestTree = prune(guestTree, names)

    # Attach an outgroup node by hand as an extra child of the guest root.
    outgroup = Tree()
    outgroup.up = guestTree
    guestTree.children.append(outgroup)
    outgroup.name = 'Outgroup'

    evolvedRoot = evolveSequence(rootSequence, .1, 2, emissionProbs, hmmfile,
                                 transmat)
    outseq = locate(evolvedRoot, hmmfile)[2][0]
    outgroup.add_feature('sequence', outseq)
    seqs.insert(0, outseq)
    names.insert(0, 'Outgroup')

    guestTree.write(outfile='testtree.nwk')
    hostTree.write(outfile='hosttree.nwk')
    addRandomTrees('testtree.nwk')
    writeFasta(names, seqs, 'testfasta.fa', False)
    mlTree('testfasta.fa', 'testtree.nwk', True)

    iqtree = Tree('testfasta.fa.treefile')
    iqtree.set_outgroup(iqtree & 'Outgroup')

    return hostTree, guestTree, iqtree
def evolveSequence(sequence, rate, branchLength, emissionProbs, hmmfile, transmat):
    """
    Putting the previous steps together, simulates evolution of a full
    sequence including both domains and non-domain sequence.

    The domains are located, masked out of the sequence, and evolved with
    evolveDomain; the remaining fragments are evolved with evolveLinker
    (or evolveEmptyLinker when a fragment is empty); the pieces are then
    re-interleaved.

    Args:
        sequence (str): The full sequence to be evolved
        rate: Passed through to evolveDomain
        branchLength: Passed through to the domain and linker evolvers
        emissionProbs: matrix with dimensions (n x 20) where n is the length
            of the domain. Each row contains the probability of each aa
            appearing at that position (in pfam hmm order)
        hmmfile: hmm file used to locate the domains
        transmat (list): aa transition matrix with dimensions (20 x 20)

    Returns:
        The evolved sequence (str).

    Raises:
        Exception: whatever error occurred while re-interleaving the pieces
            (for example IndexError when there are fewer fragments than
            domains). Diagnostics are printed before it is re-raised.
    """
    # Kept only for the diagnostic dump in the except block below.
    original_sequence = sequence

    locate = findDomains if HMMER else findMotifs
    domains = locate(sequence, hmmfile)[2]

    # Mask every domain with a marker, then split on the marker to obtain the
    # non-domain fragments.
    for seq in domains:
        sequence = sequence.replace(seq, "xxx")
    sequences = sequence.split("xxx")

    # Evolve sequence fragments individually (both lists are updated in place).
    for i, domain in enumerate(domains):
        domains[i] = evolveDomain(domain, rate, branchLength, emissionProbs,
                                  transmat, hmmfile)

    for i, fragment in enumerate(sequences):
        if fragment == '':
            sequences[i] = evolveEmptyLinker(branchLength)
        else:
            sequences[i] = evolveLinker(fragment, branchLength)

    # Reassemble full sequence post evolution.
    parts = []
    try:
        for i in range(len(domains)):
            parts.append(sequences[i] + domains[i])
    except Exception:
        # Dump the state that led to the failure, then re-raise the original
        # error so its type and traceback are not lost. KeyboardInterrupt and
        # SystemExit are deliberately not intercepted.
        print(original_sequence)
        print(domains)
        print(sequences)
        printDomSeq(original_sequence, hmmfile)
        raise

    # NOTE(review): when there are exactly as many fragments as domains the
    # last domain is appended a second time here — confirm this is intended.
    parts.append(sequences[-1] if len(sequences) > len(domains) else domains[-1])

    return ''.join(parts)
def findAndAlign(hmmfile, sequence):
    """
    Finds the domains of `sequence` and keeps those that are exactly 23
    characters long and contain no lowercase characters.

    As written, the function stops after counting the retained domains and
    does not return anything.

    Args:
        hmmfile: hmm file passed to findDomains
        sequence: Sequence to search for domains
    """
    possStarts, possEnds, possSequences = findDomains(sequence, hmmfile)

    kept = [
        (possStarts[k], possEnds[k], candidate)
        for k, candidate in enumerate(possSequences)
        if len(candidate) == 23
        and not any(base.islower() for base in candidate)
    ]

    starts = [entry[0] for entry in kept]
    ends = [entry[1] for entry in kept]
    sequences = [entry[2] for entry in kept]

    origCount = len(sequences)
def genRandomSequence2(numDoms):
    """
    Generates a zf-C2H2 protein sequence with the given number of domains.
    Picks 1 domain per sequence, one sequence per orthogroup.

    numDoms source sequences with more than one domain are drawn at random
    from the '.fa' files under DATAPATH. The first one supplies the prefix
    and suffix; every drawn sequence contributes the stretch between the
    starts of its first and second domains (passed through fixzf). The whole
    draw is repeated while the prefix or suffix is empty.

    Args:
        numDoms: Number of domains in the generated sequence

    Returns:
        The generated sequence with gap characters, lowercase letters and the
        characters 'BJOUXZ' removed.
    """
    files = [name for name in ls(DATAPATH) if '.fa' in name]

    # Retry with a loop rather than recursion so that repeated unsuitable
    # draws cannot exhaust the recursion limit.
    while True:
        pool = []
        while len(pool) < numDoms:
            # Close each file as soon as its lines have been read; the
            # original leaked one handle per draw.
            with open(DATAPATH + choice(files)) as handle:
                records = list(handle)[1::2]
            candidate = choice(records).strip()
            # NOTE(review): the domain count is checked on the gapped
            # sequence, while the ungapped one is searched below — confirm
            # the two always agree.
            if len(findDomains(candidate, hmmfile)[0]) > 1:
                pool.append(candidate.replace('-', ''))

        starts, ends = findDomains(pool[0], hmmfile)[:2]
        prefix = pool[0][:starts[0]]
        suffix = pool[0][ends[-1]:]
        if prefix != '' and suffix != '':
            break

    # From the start of the first domain up to the start of the second one.
    pieces = [fixzf(pool[0][starts[0]:starts[1]])]
    for sequence in pool[1:]:
        starts = findDomains(sequence, hmmfile)[0]
        pieces.append(fixzf(sequence[starts[0]:starts[1]]))

    newSeq = prefix + ''.join(pieces) + suffix
    newSeq = newSeq.replace('-', '')
    newSeq = newSeq.translate(None, string.ascii_lowercase)
    newSeq = newSeq.translate(None, 'BJOUXZ')

    return newSeq
def seqDiff(n=10, bl=1):
    """
    Prints the first domain of a sequence obtained from grs(1), then evolves
    that sequence repeatedly and prints, for each successful round, the
    evolved first domain with changed positions highlighted in red.

    Each printed line holds: the highlighted domain, the number of changed
    domain positions, the number of changed positions in the whole sequence,
    and the rounded ratio between the two scaled by len(sequence) / 23.

    A round that raises (for example because no domain position changed,
    which makes the ratio divide by zero, or because the evolved pieces are
    shorter than the originals) is silently retried and does not count
    towards n.

    Args:
        n: Number of successful rounds to print
        bl: Branch length passed to evolveSequence
    """
    RED = '\033[91m'
    NORMAL = '\033[0m'

    seq = grs(1)
    locate = findDomains if HMMER else findMotifs
    dom = locate(seq, hmmfile)[2][0]
    print(dom)

    iterations = 0
    while iterations < n:
        try:
            temp = evolveSequence(seq, .05, bl, emissionProbs, hmmfile,
                                  transmat)
            tempdom = locate(temp, hmmfile)[2][0]

            # Index-based on purpose: a shorter evolved domain must raise and
            # trigger a retry rather than be silently truncated.
            shown = []
            nMuts = 0
            for i in range(len(dom)):
                if tempdom[i] == dom[i]:
                    shown.append(tempdom[i])
                else:
                    shown.append(RED + tempdom[i] + NORMAL)
                    nMuts += 1
            out = ''.join(shown)

            totalMuts = sum(1 for i in range(len(temp)) if temp[i] != seq[i])

            ratio = round(totalMuts / float(nMuts) / (len(temp) / 23.))
            print("%s %s %s %s" % (out, nMuts, totalMuts, ratio))
        except Exception:
            # Best-effort retry, as before, but no longer swallowing
            # KeyboardInterrupt/SystemExit so the loop can be interrupted.
            continue
        iterations += 1
def withHost(numLeaves = 4, bl = .5, hostTree = None):
    """
    Simulates a guest tree inside a host tree, evolves sequences along it and
    writes the results to 'host.nwk', 'guest.nwk' and 'sequences.fa'.

    Args:
        numLeaves: Number of leaves for the random host tree (used only when
            no hostTree is supplied)
        bl: Passed to createRandomTopology (used only when no hostTree is
            supplied)
        hostTree: Optional existing host tree to simulate within

    Returns:
        (hostTree, guestTree, names, seqs): the host tree, the guest tree
        (first child of the simulated guest root, detached from it), the
        sorted names of guest leaves whose event is not 'LOSS', and the
        domain sequences found on each host leaf, taken in host-leaf name
        order.
    """
    startingDomains = 1
    extralen = .05

    if hostTree is None:
        hostTree = createRandomTopology(numLeaves, bl, lambda x: x)
        # NOTE(review): assumed to apply only to a freshly generated host
        # tree; confirm against the original layout.
        for hostLeaf in hostTree:
            hostLeaf.dist += extralen

    def dupFunc(x, y):
        # Constant duplication function.
        return 1

    guestTree, nodeMap = buildGuestTree(hostTree, s2, dupFunc, .1, gaussNoise,
                                        startingDomains)
    for guestLeaf in guestTree:
        guestLeaf.dist += extralen

    rootSequence = gfs('60_emissions.fa', 40)
    evolveAlongTree(hostTree, guestTree, nodeMap, rootSequence, hmmfile,
                    emissionProbs, transmat)

    # Order by (position, name) first, then re-sort the bare names.
    byPosition = sorted((leaf.position, leaf.name)
                        for leaf in guestTree
                        if leaf.event != 'LOSS')
    names = sorted(label for _, label in byPosition)

    locate = findDomains if HMMER else findMotifs
    seqs = []
    for hostName in sorted(leaf.name for leaf in hostTree):
        seqs.extend(locate((hostTree & hostName).sequence, hmmfile)[2])

    for node in hostTree.traverse():
        node.del_feature('leaves')

    # Drop the simulated guest root and keep its first child as the new root.
    guestTree = guestTree.children[0]
    guestTree.up = None

    writeTree(hostTree, 'host.nwk')
    writeTree(guestTree, 'guest.nwk')
    writeFasta(names, seqs, 'sequences.fa')

    return hostTree, guestTree, names, seqs
def groupDomains(names, sequences, hmmfile):
    """
    Takes a list of input sequences from an msa and returns a list of domain
    strings for each. Leaves an empty string at position i of the jth list if
    the jth sequence does not have a copy of domain i. This aligns all
    existing domains and makes it clear which domains are present in which
    sequence.

    Domains are matched across sequences by their start offset, and each
    domain string is the 23 characters beginning at that offset.

    Example (domains marked as xxx):

        sequences = ["AAxxxAAxxxAAAAAAA", "AAAAAAAxxxAAxxxAA", "AAxxxAAAAAAAxxxAA"]
        grouped   = [[dom1, dom2, ''  ],
                     [''  , dom2, dom3],
                     [dom1, ''  , dom3]]

    Args:
        names (list): Name of each sequence, parallel to `sequences`
        sequences (list): A list of sequences
        hmmfile (str): The name of the hmmfile containing the desired domain
            model

    Returns:
        grouped (list): A list of lists of all domain sequences from each
            sequence
        domNames (list): A list of lists of domain names
            ("<sequence name>_<start>") for each domain in each sequence

        Both lists are empty when `sequences` is empty.
    """
    domStarts = [findDomains(seq, hmmfile)[0] for seq in sequences]

    # set().union(...) copes with an empty list of sequences, whereas the
    # unbound set.union(*[]) raises TypeError.
    allStarts = sorted(set().union(*domStarts))

    # Start offset -> column index, instead of repeated list.index() scans.
    column = {start: col for col, start in enumerate(allStarts)}
    width = len(allStarts)

    grouped = []
    domNames = []
    for i, startsHere in enumerate(domStarts):
        domains = [''] * width
        dnames = [''] * width
        for start in startsHere:
            col = column[start]
            domains[col] = sequences[i][start:start + 23]
            dnames[col] = names[i] + "_" + str(start)
        grouped.append(domains)
        domNames.append(dnames)

    return grouped, domNames
def selfSimilarity(name, sequence, hmmfile, heatmap=False):
    """
    Given a single sequence, checks the level of self similarity between its
    constituent domains. Optionally creates a heatmap of this similarity.

    Args:
        name (str): Currently unused by the code in this function
        sequence (str): An amino acid string representing a protein
        hmmfile (str): File path of the hmm used to find domains
        heatmap (bool): (optional, default False) If true, displays a heatmap
            of self similarity between domains on the sequence

    Returns:
        simMatrix (numpy.ndarray): Symmetric (numDomains x numDomains) matrix
            of domainSim scores. With more than one domain the scores are
            min-max normalised to [0, 1]; if every score is identical the
            matrix is all zeros rather than NaN.
    """
    domains = findDomains(sequence, hmmfile)[2]
    numDomains = len(domains)

    simMatrix = np.zeros((numDomains, numDomains))
    for i in range(numDomains):
        for j in range(i, numDomains):
            # Only the upper triangle is computed; mirror it across.
            score = domainSim(domains[i], domains[j])
            simMatrix[i, j] = score
            simMatrix[j, i] = simMatrix[i, j]

    #Normalization step
    if numDomains > 1:
        bias = np.min(simMatrix)
        scale = np.max(simMatrix) - bias
        simMatrix -= bias
        # Guard the degenerate case where all similarities are equal, which
        # previously divided 0 by 0 and filled the matrix with NaN.
        if scale != 0:
            simMatrix /= scale

    if heatmap:
        sns.heatmap(simMatrix, cmap='viridis')
        plt.show()
        plt.close()

    return simMatrix
def domainEvolution(host, guest, hmmfile, nodemap, sequence):
    """
    Simulates domain-level evolution of `sequence` along the `host` tree,
    driven by the events recorded on the `guest` tree nodes.

    The domains of `sequence` are located with findDomains and only those
    that are 23 characters long with no lowercase characters are kept. The
    host tree is then traversed; each host node takes its domain/linker state
    from the input (at the root) or from its parent after evolving it over
    the connecting branch. The guest nodes mapped to that host node are
    processed closest-first, applying their "DUPLICATION", "LOSS" or
    "SPECIATION" event to the state, and the result is stored on the host
    node as features. Progress is printed throughout.

    Args:
        host: Host tree; must support get_tree_root(), traverse() and
            add_feature() on its nodes (presumably an ETE tree — confirm)
        guest: Guest tree whose nodes carry an `event` attribute
        hmmfile: hmm file passed to findDomains and mutateDomain
        nodemap: Mapping from each host node to the collection of guest nodes
            mapped to it
        sequence: The root sequence to evolve

    Returns:
        orthogroup (list): Final sequences of the host leaves whose final
            sequence is longer than 10 characters
        bookkeeping (dict): Host leaf name -> [sequences, starts, ends]
        internalNodes (dict): Guest node name -> reconstructed sequence
        guestData (dict): Guest node name -> [sequence, start, end] of the
            domain at the current index
    """
    possStarts, possEnds, possSequences = findDomains(sequence, hmmfile)
    starts = []
    ends = []
    sequences = []
    positions = {}
    # Keep only candidates that are 23 characters long and fully non-lowercase.
    for i in range(len(possSequences)):
        if len(possSequences[i]) == 23 and any(
                base.islower() for base in possSequences[i]) == False:
            starts.append(possStarts[i])
            ends.append(possEnds[i])
            sequences.append(possSequences[i])
    origCount = len(sequences)
    print("ORIG: " + str(origCount))
    linkerStarts, linkerEnds, linkerSequences = findLinkers(
        starts, ends, sequence)
    orthogroup = []
    bookkeeping = {}
    internalNodes = {}
    guestData = {}
    print("*******************************************")
    print(host.get_tree_root())
    for node in host.traverse():
        print("HOST NODE: " + node.name)
        print("HOST NODE CHILDREN: " + str(node.children))
        # find the root sequence
        mapped_guest = nodemap[node]
        print("NODEMAP: " + str(mapped_guest))
        # Guest roots for this host node: the guest tree root at the host
        # root, otherwise every mapped guest node whose parent is not mapped
        # to the same host node.
        guestRoots = []
        if node.is_root():
            guestRoots.append(guest.get_tree_root())
        else:
            for guestNode in mapped_guest:
                if guestNode.up not in mapped_guest:
                    guestRoots.append(guestNode)
        print("GUEST ROOTS: " + str(guestRoots))
        # map guestTree node to domains
        if node.is_root():
            # The host root starts from copies of the filtered input state.
            node.add_feature('sequences', sequences[:])
            node.add_feature('starts', starts[:])
            node.add_feature('ends', ends[:])
            node.add_feature('positions', dict(positions))
            node.add_feature('linkerStarts', linkerStarts[:])
            node.add_feature('linkerEnds', linkerEnds[:])
            node.add_feature('linkerSequences', linkerSequences[:])
            for i in range(len(guestRoots)):
                guestRoots[i].add_feature('sequences', sequences[:][i])
                guestRoots[i].add_feature('starts', starts[:][i])
                guestRoots[i].add_feature('ends', ends[:][i])
        else:
            # Other host nodes inherit the parent's state, evolved over the
            # branch that connects them to the parent.
            betweenNodeLinkers = node.up.linkerSequences[:]
            betweenNodeSequences = node.up.sequences[:]
            distance = node.up.get_distance(node)
            # sequence level linker evolution
            for i in range(len(betweenNodeLinkers)):
                betweenNodeLinkers[i] = evolveLinker(betweenNodeLinkers[i], distance)
            # domain level evolution
            for i in range(len(betweenNodeSequences)):
                betweenNodeSequences[i] = mutateDomain(betweenNodeSequences[i], hmmfile, distance)
            node.add_feature('sequences', betweenNodeSequences[:])
            node.add_feature('starts', node.up.starts[:])
            node.add_feature('ends', node.up.ends[:])
            node.add_feature('positions', dict(node.up.positions))
            node.add_feature('linkerStarts', node.up.linkerStarts[:])
            node.add_feature('linkerEnds', node.up.linkerEnds[:])
            node.add_feature('linkerSequences', betweenNodeLinkers[:])
        # Working references to this node's state; several are rebound
        # further down when the state is re-sorted.
        current_sequences = node.sequences
        current_starts = node.starts
        current_ends = node.ends
        current_positions = node.positions
        current_linkerStarts = node.linkerStarts
        current_linkerEnds = node.linkerEnds
        current_linkerSequences = node.linkerSequences
        print("INPUT POSITIONS: " + str(current_positions))
        print("INPUT: " + str(sequences))
        print("MAPPING: " + str(mapped_guest))
        # initialize positions
        pos_init = 0
        # NOTE(review): `extra` is never read in this function.
        extra = dict({})
        print("-------------------------------------------")
        for root in guestRoots:
            print("CURRENTLY EXAMINING GUEST ROOT: " + root.name)
            # for a subtree by traversing from a root node,
            # see if node has children that are in the list,
            # cut off when it doesn't to form subtree
            # NOTE(review): `subtree` is built and trimmed here but is not
            # referenced again below.
            subtree = root.copy("deepcopy")
            # print(subtree.write(format=8))
            for newbie in subtree.iter_descendants():
                if newbie not in mapped_guest:
                    newbie.detach()
            # print(subtree.write(format=8))
            # initialize distances
            # `distances` holds [guest node, remaining distance] pairs.
            distances = []
            distances.append([root, 0])
            closestNode = distances[0][0]
            closestDistance = distances[0][1]
            index = 0
            # obtain domain information (to be updated later)
            if node.is_root():
                root_sequences = root.sequences
                root_starts = root.starts
                root_ends = root.ends
                # index = guestRoots.index(root)
            else:
                # Transfer the position held by this guest root's parent
                # (matched by name) to the guest root itself.
                # NOTE(review): the dict is modified while it is being
                # iterated — confirm this is safe for the expected inputs.
                for i in current_positions:
                    if i.name == root.up.name:
                        index = current_positions[i]
                        del current_positions[i]
                        current_positions[root] = index
                # index = current_positions[root.up]
                # if root.up in current_positions:
                #     del current_positions[root.up]
                root_sequences = current_sequences[index]
                root_starts = current_starts[index]
                root_ends = current_ends[index]
            if pos_init == 0 and node.is_root():
                current_positions[root] = 0
                pos_init = 1
            length = len(root_sequences)
            # NOTE(review): `count` is never read in this function.
            count = 1
            # iterate by minimum distance
            while True:
                print("EVENT TITLE: " + closestNode.event)
                # print("CURRENT POSITIONS: " + str(current_positions))
                # print(index)
                # check event node, update positions list
                if closestNode.event == "DUPLICATION":
                    # Shift every position after `index` right by one.
                    for position in current_positions:
                        if current_positions[position] > index:
                            current_positions[position] += 1
                    # NOTE(review): `is_root` is not called here (it is
                    # called as is_root() elsewhere in this function); if it
                    # is a bound method this condition is always truthy and
                    # the else branch never runs — confirm intent.
                    if node.is_root and root.is_root:
                        oldPosition = current_positions[closestNode]
                        del current_positions[closestNode]
                    else:
                        oldPosition = current_positions[closestNode.up]
                        del current_positions[closestNode.up]
                    current_positions[closestNode.children[0]] = oldPosition
                    current_positions[
                        closestNode.children[1]] = oldPosition + 1
                    linkerLength = 0
                    if index == 0:
                        linkerLength = 5
                    else:
                        # NOTE(review): assigns `linkedLength`, which is
                        # never read; `linkerLength` stays 0 on this branch —
                        # possibly a typo, confirm.
                        linkedLength = len(current_linkerSequences[index])
                    # Append the duplicated domain and linker; the state is
                    # re-sorted by start offset below.
                    current_starts.append(root_starts + linkerLength + length)
                    current_ends.append(root_ends + linkerLength + length)
                    current_sequences.append(root_sequences)
                    current_linkerStarts.append(current_linkerStarts[index] +
                                                linkerLength + length)
                    current_linkerEnds.append(current_linkerEnds[index] +
                                              linkerLength + length)
                    current_linkerSequences.append(
                        current_linkerSequences[index])
                elif closestNode.event == "LOSS":
                    # Drop the domain and linker at `index`.
                    current_starts.pop(index)
                    current_ends.pop(index)
                    current_sequences.pop(index)
                    current_linkerStarts.pop(index)
                    current_linkerEnds.pop(index)
                    current_linkerSequences.pop(index)
                    for position in current_positions:
                        if current_positions[position] > index:
                            current_positions[position] -= 1
                    # NOTE(review): indexing keys() requires it to return a
                    # list (Python 2 dict behaviour).
                    del current_positions[current_positions.keys()[index]]
                elif closestNode.event == "SPECIATION":
                    closestNodeSearch = guest & closestNode.name
                    if closestNodeSearch.up in current_positions:
                        del current_positions[closestNodeSearch.up]
                    current_positions[closestNode] = index
                # sort to maintain original order in case of duplication
                current_sequences = sortBy(current_sequences, current_starts)
                current_starts.sort()
                current_ends.sort()
                current_positions = dict(
                    sorted(current_positions.items(), key=lambda x: x[1]))
                current_linkerSequences = sortBy(current_linkerSequences,
                                                 current_linkerStarts)
                current_linkerStarts.sort()
                current_linkerEnds.sort()
                print("POSITIONS: " + str(current_positions))
                print("STARTS: " + str(current_starts))
                print("ENDS: " + str(current_ends))
                print("CLOSEST NAME AND DISTANCE:")
                print(closestNode.name)
                print(closestDistance)
                # Name of the node just processed; used as the key for
                # internalNodes once closestNode has moved on.
                guestNode = closestNode.name
                guestData[closestNode.name] = [
                    current_sequences[index], current_starts[index],
                    current_ends[index]
                ]
                print("PRE-DELETION DISTANCES:")
                print(distances)
                del distances[distances.index([closestNode, closestDistance])]
                print("POST-DELETION DISTANCES:")
                print(distances)
                closestChildren = closestNode.children
                print("CLOSEST CHILDREN:")
                print(closestChildren)
                # subtract distance to closest from every remaining considered gene
                for gene in distances:
                    gene[1] -= closestDistance
                # add the closest's children to the list
                for child in closestChildren:
                    # print(mapped_guest)
                    if child in mapped_guest:
                        distances.append([child, child.dist])
                print("UNSORTED DISTANCES: " + str(distances))
                sortedDistances = sorted(distances, key=lambda x: x[1])
                print("SORTED DISTANCES: " + str(sortedDistances))
                if len(sortedDistances) > 0:
                    # Advance to the nearest pending guest node and evolve
                    # every linker and domain over the distance to it.
                    closestNode = sortedDistances[0][0]
                    closestDistance = sortedDistances[0][1]
                    # sequence level linker evolution
                    for i in range(len(current_linkerSequences)):
                        current_linkerSequences[i] = evolveLinker(
                            current_linkerSequences[i], closestDistance)
                    # domain level evolution
                    for i in range(len(current_sequences)):
                        # NOTE(review): `old` is never read.
                        old = current_sequences[i]
                        current_sequences[i] = mutateDomain(
                            current_sequences[i], hmmfile, closestDistance)
                    distances = sortedDistances
                    # Note: this rebinds the `sequence` parameter.
                    sequence = reconstructSequence(current_starts,
                                                   current_ends,
                                                   current_sequences,
                                                   current_linkerStarts,
                                                   current_linkerEnds,
                                                   current_linkerSequences)
                    internalNodes[guestNode] = sequence
                    guestData[closestNode.name] = [
                        current_sequences[index], current_starts[index],
                        current_ends[index]
                    ]
                    print("-------------------------------------------")
                    current_positions[closestNode] = index + 1
                    index += 1
                    if closestNode not in mapped_guest:
                        break
                else:
                    # Nothing left to process under this guest root: record
                    # the final state and stop.
                    distances = sortedDistances
                    sequence = reconstructSequence(current_starts,
                                                   current_ends,
                                                   current_sequences,
                                                   current_linkerStarts,
                                                   current_linkerEnds,
                                                   current_linkerSequences)
                    internalNodes[guestNode] = sequence
                    guestData[closestNode.name] = [
                        current_sequences[index], current_starts[index],
                        current_ends[index]
                    ]
                    print("-------------------------------------------")
                    break
            print("Finished examining guest root: " + root.name)
            # print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        # Store the updated state on the host node so that its children can
        # inherit it. The linker features are not re-added here.
        node.add_feature("positions", current_positions)
        node.add_feature("sequences", current_sequences)
        node.add_feature("starts", current_starts)
        node.add_feature("ends", current_ends)
        if node.is_leaf() == 1:
            bookkeeping[node.name] = [
                current_sequences, current_starts, current_ends
            ]
        print("*******************************************")
        print("ENDING sequences: " + str(current_sequences))
        finalSequence = reconstructSequence(current_starts, current_ends,
                                            current_sequences,
                                            current_linkerStarts,
                                            current_linkerEnds,
                                            current_linkerSequences)
        node.add_feature("final", finalSequence)
        print
        print
        # Only leaf sequences longer than 10 characters join the orthogroup.
        if node.is_leaf() == True:
            if len(finalSequence) > 10:
                orthogroup.append(finalSequence)
                # orthogroup[node.name] = finalSequence
    # print(orthogroup)
    return orthogroup, bookkeeping, internalNodes, guestData