def wrapperProfileGraph(parentFile, contactFile):
    """Draw a bar graph of the SCHEMA disruption at each crossover position.

    For every parent other than the first, the alignment is reordered so
    that the first parent and that parent lead the list. The chimera
    pattern '21' is then scored with ``schema.getChimeraDisruption`` for a
    single crossover placed at each position of the first parent in turn.
    The values collected per position are summarised by their mean and
    standard deviation and handed to ``makeBarGraph``.

    Args:
        parentFile: path of the multiple sequence alignment file.
        contactFile: path of the contact file. The last four characters of
            the text before its first underscore are passed to
            ``makeBarGraph`` as the graph name.
    """
    pdbName = contactFile.split('_')[0][-4:]

    # Context managers guarantee both inputs are closed, even when a reader
    # raises. The alignment is consumed while its handle is still open.
    with open(parentFile, 'r') as alignment_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(alignment_handle)
        parents = [p for (k, p) in parent_list]
    with open(contactFile, 'r') as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)

    # One list of disruption values per position of the first parent.
    clash_data = [[] for _ in parents[0]]
    for i in range(1, len(parents)):
        print(i)
        # This reshuffles the alignment to make the first and second
        # sequences the ones analysed. It was needed as SCHEMA is limited
        # to 9 sequences.
        newList = [parents[0], parents[i]]
        newList.extend(parents[x] for x in range(1, len(parents)) if x != i)

        # Graphs for hotspots: score a single crossover at every position.
        for residue in range(len(parents[0])):
            crossovers = [residue]
            contacts = schema.getSCHEMAContactsWithCrossovers(
                pdb_contacts, newList, crossovers)
            fragments = schema.getFragments(crossovers, parents[0])
            clash_data[residue].append(
                schema.getChimeraDisruption('21', contacts, fragments, newList))

    means = [np.mean(values) for values in clash_data]
    StDev = [np.std(values) for values in clash_data]
    makeBarGraph(means, StDev, pdbName)
def frags(self, parent):
    """Return ``schema.getFragments`` for this object's crossovers and ``parent``."""
    crossover_points = self.crossovers()
    return schema.getFragments(crossover_points, parent)
def curve(results, parents, bin_width, max_samples=1e10):
    """Compute a curve of average energy versus average mutation.

    Each element of ``results`` is unpacked as
    ``(avg_energy, crossovers, l_min, l_max)``. The length limits play no
    part in the curve, so duplicate ``(avg_energy, crossovers)`` pairs are
    collapsed and scored only once. For each distinct pair the average
    mutation is obtained from ``schema.averageMutationSampled`` with
    ``max_samples``. Mutation values are binned in steps of ``bin_width``
    starting from the smallest observed value, and every bin keeps its
    lowest-energy entry (the first entry seen wins ties).

    When ``len(parents) ** (number of crossovers + 1)`` exceeds
    ``max_samples`` the sampled mutation values are treated as approximate:
    the surviving entries have their mutation recomputed with
    ``schema.averageMutation`` before being returned. They are not
    re-binned afterwards.

    Returns:
        None when ``results`` is empty; otherwise a list of
        ``(E, m, crossovers)`` tuples ordered by bin, with ``crossovers``
        as a list.
    """
    if len(results) == 0:
        # Nothing to do!
        return None

    # The crossover count is taken from the first result only.
    num_crossovers = len(results[0][1])

    # Collapse to distinct (energy, crossovers) pairs; tuples are hashable.
    distinct = set([(energy, tuple(points))
                    for (energy, points, _l_min, _l_max) in results])

    # Score the average mutation level of each distinct library.
    scored = []
    for energy, points_key in distinct:
        points = list(points_key)
        pieces = schema.getFragments(points, parents[0])
        mutation = schema.averageMutationSampled(pieces, parents, max_samples)
        scored.append((energy, mutation, points))

    mutation_levels = [entry[1] for entry in scored]
    lowest_m = min(mutation_levels)
    highest_m = max(mutation_levels)
    num_bins = int((highest_m - lowest_m) / bin_width) + 1

    # Keep the lowest-energy library of every mutation bin.
    bins = [None] * num_bins
    for entry in scored:
        energy, mutation = entry[0], entry[1]
        slot = int((mutation - lowest_m) / bin_width)
        incumbent = bins[slot]
        if not incumbent or energy < incumbent[0]:
            bins[slot] = entry
    occupied = [entry for entry in bins if entry]

    # If sampling could cover the whole library, the curve is already exact.
    library_size = len(parents) ** (num_crossovers + 1)
    if not (library_size > max_samples):
        return occupied

    # Otherwise recompute the mutation of each surviving library exactly.
    exact_curve = []
    for energy, _sampled_m, points in occupied:
        pieces = schema.getFragments(points, parents[0])
        exact_m = schema.averageMutation(pieces, parents)
        exact_curve.append((energy, exact_m, points))
    return exact_curve
def main(args):
    """Score randomly generated crossover libraries and report <E> and <m>.

    Reads the parent alignment and the contact file named in the parsed
    arguments and draws ``num_libraries`` random crossover sets on the
    identity-collapsed parents. For each library it averages the disruption
    (E) and shortest-distance mutation (m) over either every chimera or a
    random sample (with replacement) of ``max_chimeras`` chimeras. One
    line per library is written to the output file, or to standard output
    when no output file was requested.

    Args:
        args: command-line argument list; ``args[0]`` is the script path.

    Returns:
        None. Returns early when the arguments are not confirmed or when
        the collapsed alignment is too short for the requested number of
        fragments at the minimum fragment size.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        if args[0].split(os.path.sep)[-1] == "schemarandom.py":
            print_usage(args)
        return

    # Read the alignment file to create a list of parents, in file order.
    # The list is built while the handle is open; `with` then closes it.
    msa_file = arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE]
    with open(msa_file, "r") as msa_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
        parents = [p for (k, p) in parent_list]

    # Get the contacts.
    with open(arg_dict[ARG_CONTACT_FILE], "r") as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)

    # Establish connection to output: a file if one was specified,
    # otherwise standard output (which must not be closed here).
    writes_to_file = ARG_OUTPUT_FILE in arg_dict
    if writes_to_file:
        output_file = open(arg_dict[ARG_OUTPUT_FILE], "w")
    else:
        output_file = sys.stdout

    try:
        # Number of libraries to evaluate and minimum fragment size.
        num_libraries = int(arg_dict.get(ARG_NUM_LIBRARIES, 1000))
        min_length = int(arg_dict.get(ARG_MIN_FRAGMENT_SIZE, 4))

        # Number of fragments -- one more than the number of crossovers.
        num_fragments = int(arg_dict[ARG_NUM_CROSSOVERS]) + 1
        num_parents = len(parents)
        library_size = num_parents ** num_fragments

        # Never evaluate more chimeras than the library contains.
        max_chimeras = library_size
        if ARG_MAX_CHIMERAS_PER_LIBRARY in arg_dict:
            max_chimeras = min(library_size,
                               int(arg_dict[ARG_MAX_CHIMERAS_PER_LIBRARY]))

        if ARG_RANDOM_SEED in arg_dict:
            random.seed(int(arg_dict[ARG_RANDOM_SEED]))

        # Make libraries consistent with RASPP.
        (new_parents, identical_sites) = raspp.collapse_parents(parents)
        collapsed_length = len(new_parents[0])
        if collapsed_length < num_fragments * min_length:
            error_msg = (
                "Minimum diversity length of %d is too large.\n%d "
                + "fragments with diversity %d cannot be found in a "
                + "sequence of length %d (with identities removed). Aborting..."
            )
            # Report the collapsed length: it is the one that was checked
            # and the one the message describes.
            print(error_msg % (min_length, num_fragments, min_length,
                               collapsed_length))
            return

        def block_pattern(index):
            # Turn `index` into a chimera block pattern, one label per
            # fragment (e.g., 0 -> '11111111', 1 -> '11111112',
            # 2 -> '11111113'...).
            n2c = schema.base(index, num_parents)
            return "".join(["1"] * (num_fragments - len(n2c))
                           + ["%d" % (int(x) + 1,) for x in n2c])

        # time.time() is available on every Python version; time.clock()
        # was removed in Python 3.8.
        start_time = time.time()
        output_file.write("# <E>\t<m>\tcrossover points\n")

        # Draw all crossover sets first, then score them.
        random_crossovers = []
        for libnum in range(num_libraries):
            crossovers = schema.generateRandomCrossovers(
                len(new_parents[0]), num_fragments - 1, min_length)
            crossovers = raspp.translate_collapsed_indices(crossovers,
                                                           identical_sites)
            random_crossovers.append(crossovers)

        for crossovers in random_crossovers:
            fragments = schema.getFragments(crossovers, parents[0])
            filtered_contacts = schema.getSCHEMAContactsWithCrossovers(
                pdb_contacts, parents, crossovers)

            if max_chimeras < library_size:
                # Assemble a random sample of chimeras, with replacement.
                all_chimeras = [
                    block_pattern(random.randint(0, library_size - 1))
                    for n_chim in range(max_chimeras)
                ]
            else:
                # Covering the whole library: the number of parents and
                # fragments specifies all possible chimeras, regardless of
                # crossover positions, so enumerate every one of them.
                all_chimeras = [block_pattern(i) for i in range(library_size)]
                # Randomly assort the chimeras.
                # NOTE(review): assumed to belong to the exhaustive branch
                # only -- confirm against the original layout.
                random.shuffle(all_chimeras)

            # Calculate average E and m for the library or subsample.
            # `all_chimeras` holds exactly `max_chimeras` entries here.
            E_values = []
            m_values = []
            for chimera_blocks in all_chimeras:
                E_values.append(schema.getChimeraDisruption(
                    chimera_blocks, filtered_contacts, fragments, parents))
                m_values.append(schema.getChimeraShortestDistance(
                    chimera_blocks, fragments, parents))
            average_E = schema.mean(E_values)
            average_m = schema.mean(m_values)

            xover_pat = "%d " * len(crossovers)
            xover_str = xover_pat % tuple(crossovers)
            output_file.write(("%1.4f\t%1.4f\t%s\n")
                              % (average_E, average_m, xover_str))
            output_file.flush()

        total_time = time.time() - start_time
        output_file.write(
            "# Finished in %1.2f seconds (%d libraries, %d chimeras)\n"
            % (total_time, num_libraries, num_libraries * max_chimeras)
        )
    finally:
        # Close the output on every exit path, including the early return
        # above and any exception raised while scoring.
        if writes_to_file:
            output_file.close()
def main(args):
    """Write disruption (E) and/or mutation (m) values for chimeras.

    Reads the parent alignment, crossover file and contact file named in
    the parsed arguments, then calls ``outputEnergies`` for each requested
    chimera. The chimeras argument may be a list of block patterns, the
    path of a file with one pattern per line, or a single pattern. When no
    chimeras are given, every possible chimera is enumerated and the
    averages of the requested values are appended as comment lines.
    Output goes to the requested output file, or to standard output.

    Args:
        args: command-line argument list; ``args[0]`` is the script path.

    Returns:
        None. Returns early when the arguments are not confirmed.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        if args[0].split(os.path.sep)[-1] == "schemaenergy.py":
            print_usage(args)
        return

    # Flags: which columns the user asked for.
    print_E = ARG_PRINT_E in arg_dict
    print_m = ARG_PRINT_M in arg_dict

    # Read the alignment file to create a list of parents, in file order.
    # The list is built while the handle is open; `with` then closes it.
    msa_file = arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE]
    with open(msa_file, 'r') as msa_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
        parents = [p for (k, p) in parent_list]

    with open(arg_dict[ARG_CROSSOVER_FILE], 'r') as crossover_handle:
        crossovers = schema.readCrossoverFile(crossover_handle)
    fragments = schema.getFragments(crossovers, parents[0])

    # Get the contacts; they are filtered while the handle is still open.
    with open(arg_dict[ARG_CONTACT_FILE], 'r') as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)
        contacts = schema.getSCHEMAContactsWithCrossovers(
            pdb_contacts, parents, crossovers)

    # Standard output is the default sink and must not be closed here.
    writes_to_file = ARG_OUTPUT_FILE in arg_dict
    if writes_to_file:
        output_file = open(arg_dict[ARG_OUTPUT_FILE], 'w')
    else:
        output_file = sys.stdout

    try:
        # Build the per-chimera format string and the matching header line.
        output_string = '%s'
        output_file.write('# chimera')
        if print_E:
            output_string += '\t%d'
            output_file.write('\tE')
        if print_m:
            output_string += '\t%d'
            output_file.write('\tm')
        output_string += '\n'
        output_file.write('\n')

        if ARG_CHIMERAS in arg_dict:
            # Could be a) a chimera, b) a list of chimeras, or c) a file
            # of chimeras.
            chimeras = arg_dict[ARG_CHIMERAS]
            if isinstance(chimeras, list):
                # It's a list of chimeras.
                for chimera_blocks in chimeras:
                    outputEnergies(chimera_blocks, contacts, fragments,
                                   parents, output_file, output_string,
                                   print_E, print_m)
            elif os.path.isfile(chimeras):
                # It's a file of chimeras, one block pattern per line.
                with open(chimeras, 'r') as chimera_handle:
                    for line in chimera_handle:
                        outputEnergies(line.strip(), contacts, fragments,
                                       parents, output_file, output_string,
                                       print_E, print_m)
            else:
                # It's a single chimera sequence.
                outputEnergies(chimeras, contacts, fragments, parents,
                               output_file, output_string, print_E, print_m)
        else:
            # Enumerate all possible chimeras with their disruption and
            # mutation values.
            p = len(parents)
            n = len(fragments)
            Es = []
            ms = []
            total = p ** n
            # A counting loop stays lazy on both Python 2 and Python 3
            # (xrange exists only on Python 2).
            i = 0
            while i < total:
                # The next two lines turn i into a chimera block pattern
                # (e.g., 0 -> '11111111', 1 -> '11111112',
                # 2 -> '11111113'...).
                n2c = schema.base(i, p)
                chimera_blocks = ''.join(
                    ['1'] * (n - len(n2c))
                    + ['%d' % (int(x) + 1,) for x in n2c])
                (E, m) = outputEnergies(chimera_blocks, contacts, fragments,
                                        parents, output_file, output_string,
                                        print_E, print_m)
                if print_E:
                    Es.append(E)
                if print_m:
                    ms.append(m)
                i += 1
            if print_E:
                mean_str = "# Average disruption <E> = %1.4f\n" % schema.mean(Es)
                output_file.write(mean_str)
            if print_m:
                mean_str = "# Average mutation <m> = %1.4f\n" % schema.mean(ms)
                output_file.write(mean_str)
    finally:
        # Close the output on every exit path, including exceptions.
        if writes_to_file:
            output_file.close()
def frags(self, parent):
    """Return ``schema.getFragments`` for this object's crossovers and ``parent``."""
    own_crossovers = self.crossovers()
    fragment_list = schema.getFragments(own_crossovers, parent)
    return fragment_list
def curve(results, parents, bin_width, max_samples=1e10):
    """Compute a curve of average energy versus average mutation.

    Each element of ``results`` is unpacked as
    ``(avg_energy, crossovers, l_min, l_max)``. The length limits play no
    part in the curve, so duplicate ``(avg_energy, crossovers)`` pairs are
    collapsed and scored only once. For each distinct pair the average
    mutation is obtained from ``schema.averageMutationSampled`` with
    ``max_samples``. Mutation values are binned in steps of ``bin_width``
    starting from the smallest observed value, and every bin keeps its
    lowest-energy entry (the first entry seen wins ties).

    When ``len(parents) ** (number of crossovers + 1)`` exceeds
    ``max_samples`` the sampled mutation values are treated as approximate:
    the surviving entries have their mutation recomputed with
    ``schema.averageMutation`` before being returned. They are not
    re-binned afterwards.

    Returns:
        None when ``results`` is empty; otherwise a list of
        ``(E, m, crossovers)`` tuples ordered by bin, with ``crossovers``
        as a list.
    """
    if len(results) == 0:
        # Nothing to do!
        return None

    # The crossover count is taken from the first result only.
    num_crossovers = len(results[0][1])

    # Collapse to distinct (energy, crossovers) pairs; tuples are hashable.
    distinct = set([(energy, tuple(points))
                    for (energy, points, _l_min, _l_max) in results])

    # Score the average mutation level of each distinct library.
    scored = []
    for energy, points_key in distinct:
        points = list(points_key)
        pieces = schema.getFragments(points, parents[0])
        mutation = schema.averageMutationSampled(pieces, parents, max_samples)
        scored.append((energy, mutation, points))

    mutation_levels = [entry[1] for entry in scored]
    lowest_m = min(mutation_levels)
    highest_m = max(mutation_levels)
    num_bins = int((highest_m - lowest_m) / bin_width) + 1

    # Keep the lowest-energy library of every mutation bin.
    bins = [None] * num_bins
    for entry in scored:
        energy, mutation = entry[0], entry[1]
        slot = int((mutation - lowest_m) / bin_width)
        incumbent = bins[slot]
        if not incumbent or energy < incumbent[0]:
            bins[slot] = entry
    occupied = [entry for entry in bins if entry]

    # If sampling could cover the whole library, the curve is already exact.
    library_size = len(parents) ** (num_crossovers + 1)
    if not (library_size > max_samples):
        return occupied

    # Otherwise recompute the mutation of each surviving library exactly.
    exact_curve = []
    for energy, _sampled_m, points in occupied:
        pieces = schema.getFragments(points, parents[0])
        exact_m = schema.averageMutation(pieces, parents)
        exact_curve.append((energy, exact_m, points))
    return exact_curve
def main(args):
    """Write disruption (E) and/or mutation (m) values for chimeras.

    Reads the parent alignment, crossover file and contact file named in
    the parsed arguments, then calls ``outputEnergies`` for each requested
    chimera. The chimeras argument may be a list of block patterns, the
    path of a file with one pattern per line, or a single pattern. When no
    chimeras are given, every possible chimera is enumerated and the
    averages of the requested values are appended as comment lines.
    Output goes to the requested output file, or to standard output.

    Args:
        args: command-line argument list; ``args[0]`` is the script path.

    Returns:
        None. Returns early when the arguments are not confirmed.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        if args[0].split(os.path.sep)[-1] == "schemaenergy.py":
            print_usage(args)
        return

    # Flags: which columns the user asked for.
    print_E = ARG_PRINT_E in arg_dict
    print_m = ARG_PRINT_M in arg_dict

    # Read the alignment file to create a list of parents, in file order.
    # The list is built while the handle is open; `with` then closes it.
    msa_file = arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE]
    with open(msa_file, 'r') as msa_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
        parents = [p for (k, p) in parent_list]

    with open(arg_dict[ARG_CROSSOVER_FILE], 'r') as crossover_handle:
        crossovers = schema.readCrossoverFile(crossover_handle)
    fragments = schema.getFragments(crossovers, parents[0])

    # Get the contacts; they are filtered while the handle is still open.
    with open(arg_dict[ARG_CONTACT_FILE], 'r') as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)
        contacts = schema.getSCHEMAContactsWithCrossovers(
            pdb_contacts, parents, crossovers)

    # Standard output is the default sink and must not be closed here.
    writes_to_file = ARG_OUTPUT_FILE in arg_dict
    if writes_to_file:
        output_file = open(arg_dict[ARG_OUTPUT_FILE], 'w')
    else:
        output_file = sys.stdout

    try:
        # Build the per-chimera format string and the matching header line.
        output_string = '%s'
        output_file.write('# chimera')
        if print_E:
            output_string += '\t%d'
            output_file.write('\tE')
        if print_m:
            output_string += '\t%d'
            output_file.write('\tm')
        output_string += '\n'
        output_file.write('\n')

        if ARG_CHIMERAS in arg_dict:
            # Could be a) a chimera, b) a list of chimeras, or c) a file
            # of chimeras.
            chimeras = arg_dict[ARG_CHIMERAS]
            if isinstance(chimeras, list):
                # It's a list of chimeras.
                for chimera_blocks in chimeras:
                    outputEnergies(chimera_blocks, contacts, fragments,
                                   parents, output_file, output_string,
                                   print_E, print_m)
            elif os.path.isfile(chimeras):
                # It's a file of chimeras, one block pattern per line.
                with open(chimeras, 'r') as chimera_handle:
                    for line in chimera_handle:
                        outputEnergies(line.strip(), contacts, fragments,
                                       parents, output_file, output_string,
                                       print_E, print_m)
            else:
                # It's a single chimera sequence.
                outputEnergies(chimeras, contacts, fragments, parents,
                               output_file, output_string, print_E, print_m)
        else:
            # Enumerate all possible chimeras with their disruption and
            # mutation values.
            p = len(parents)
            n = len(fragments)
            Es = []
            ms = []
            total = p ** n
            # A counting loop stays lazy on both Python 2 and Python 3
            # (xrange exists only on Python 2).
            i = 0
            while i < total:
                # The next two lines turn i into a chimera block pattern
                # (e.g., 0 -> '11111111', 1 -> '11111112',
                # 2 -> '11111113'...).
                n2c = schema.base(i, p)
                chimera_blocks = ''.join(
                    ['1'] * (n - len(n2c))
                    + ['%d' % (int(x) + 1,) for x in n2c])
                (E, m) = outputEnergies(chimera_blocks, contacts, fragments,
                                        parents, output_file, output_string,
                                        print_E, print_m)
                if print_E:
                    Es.append(E)
                if print_m:
                    ms.append(m)
                i += 1
            if print_E:
                mean_str = "# Average disruption <E> = %1.4f\n" % schema.mean(Es)
                output_file.write(mean_str)
            if print_m:
                mean_str = "# Average mutation <m> = %1.4f\n" % schema.mean(ms)
                output_file.write(mean_str)
    finally:
        # Close the output on every exit path, including exceptions.
        if writes_to_file:
            output_file.close()