def main():
    """Calculate sequence-level epitope coverage of a library design.

    For every sequence in the reference FASTA, reports the fraction of its
    epitope-sized kmers that appear anywhere in the design FASTA, one line
    per reference sequence, written to the output file.
    """
    parser = argparse.ArgumentParser(
        description="Calculate sequence-level epitope coverage for "
                    "some library design.")
    parser.add_argument('-d', '--design',
                        help="Name of design file to compare against "
                             "reference file.")
    parser.add_argument('-r', '--reference',
                        help="Reference dataset from which 'design' was "
                             "created, script calculates epitope coverage "
                             "for each sequence in the reference.")
    parser.add_argument('-o', '--output',
                        help="Name of file to write output to. ")
    parser.add_argument('--delimiter', default='|',
                        help="Delimiter to place between sequence name "
                             "and its percent epitope coverage.")
    parser.add_argument('-e', '--epitope_size', type=int,
                        help="Integer size of epitopes to use for coverage "
                             "information gathering.")
    args = parser.parse_args()

    ref_names, ref_sequences = oligo.read_fasta_lists(args.reference)
    design_names, design_seqs = oligo.read_fasta_lists(args.design)

    header = "Sequence Name%sPercent Epitopes (%d-mers) Covered in design\n" % (
        args.delimiter, args.epitope_size)

    # Pool every epitope-sized kmer found anywhere in the design.
    design_kmers = set()
    for item in design_seqs:
        design_kmers |= oligo.subset_lists_iter(item, args.epitope_size, 1)

    out_strings = list()
    # BUG FIX: iterate the name/sequence pairs together instead of by index.
    for current_name, current_seq in zip(ref_names, ref_sequences):
        current_kmers = oligo.subset_lists_iter(current_seq,
                                                args.epitope_size, 1)
        if len(current_kmers) > 0:
            perc_cov = len(current_kmers & design_kmers) / len(current_kmers)
        else:
            # BUG FIX: was hard-coded to 4, which is not a valid coverage
            # fraction; a sequence too short to yield any kmers has 0 coverage.
            perc_cov = 0.0
        out_strings.append("%s%s%f\n" % (current_name, args.delimiter,
                                         perc_cov))

    with open(args.output, 'w') as out_file:
        # BUG FIX: header already ends with a newline; the old
        # '"%s\n" % header' emitted a spurious blank line after it.
        out_file.write(header)
        for item in out_strings:
            out_file.write(item)
def main():
    """Print 'filename|count', where count is the number of distinct
    9-mers found across all sequences of the FASTA file named by the
    first command-line argument.
    """
    fasta_path = sys.argv[1]
    names, sequences = oligo.read_fasta_lists(fasta_path)

    unique_kmers = set()
    for current_seq in sequences:
        unique_kmers |= oligo.subset_lists_iter(current_seq, 9, 1)

    print("%s|%d" % (fasta_path, len(unique_kmers)))
def _count_kmers_in_file(self, filename):
    """Return the number of distinct kmers (of size self._kmer_size)
    contained in the FASTA file 'filename' located in self._dir_name.
    """
    fasta_path = self._dir_name + '/' + filename
    names, sequences = oligo.read_fasta_lists(fasta_path)

    distinct_kmers = set()
    for current_seq in sequences:
        distinct_kmers |= oligo.subset_lists_iter(current_seq,
                                                  self._kmer_size, 1)
    return len(distinct_kmers)
def kmer_dict_wcounts(sequences, k):
    """Build a mapping of kmer -> occurrence count over all sequences.

    Args:
        sequences: iterable of objects exposing get_seq() returning a
            sequence string.
        k: kmer size passed through to oligo.subset_lists_iter.

    Returns:
        dict mapping each kmer to the number of times it was yielded
        across the per-sequence kmer collections.
    """
    kmer_dict = {}
    for item in sequences:
        # Idiomatic counting: dict.get replaces the separate membership
        # test + zero-initialization of the original.
        for km in oligo.subset_lists_iter(item.get_seq(), k, 1):
            kmer_dict[km] = kmer_dict.get(km, 0) + 1
    return kmer_dict
def calculate_score(ymer, comparison_dict, window_size, step_size):
    """Calculates the score of a ymer.

    The ymer is split into windows of window_size (advanced by
    step_size); the score is the sum of the first element of each
    window's list entry in comparison_dict, counting only windows whose
    entry exists and is a list.

    Returns:
        (total_score, subset_ymer) where subset_ymer is the collection
        of windows produced from the ymer.
    """
    name, subset_ymer = oligo.subset_lists_iter("", ymer, window_size,
                                                step_size)
    total = 0
    for window in subset_ymer:
        # .get returns None on a miss, which fails the isinstance check,
        # matching the original membership-test-then-lookup behavior.
        entry = comparison_dict.get(window)
        if isinstance(entry, list):
            total += entry[0]
    return total, subset_ymer
def count_kmers_in_file(filename, k, count_total=False):
    """Count the kmers of size k in a FASTA file.

    When count_total is False (the default) the result is the number of
    distinct kmers across every sequence in the file; when True it is
    the sum of each sequence's own kmer-set size.
    """
    names, sequences = oligo.read_fasta_lists(filename)

    if count_total:
        return sum(len(oligo.subset_lists_iter(seq, k, 1))
                   for seq in sequences)

    distinct = set()
    for seq in sequences:
        distinct |= oligo.subset_lists_iter(seq, k, 1)
    return len(distinct)
def main():
    """Greedy oligo library design.

    Over options.iterations random restarts, repeatedly picks the
    highest-scoring ymer (ties broken at random), adds it to the design,
    and decrements the remaining score of each xmer the chosen ymer
    covers.  The smallest design seen across restarts is written out.
    """
    usage = "usage: %prog [options]"
    option_parser = optparse.OptionParser(usage)
    add_program_options(option_parser)
    options, arguments = option_parser.parse_args()

    names, sequences = oligo.read_fasta_lists(options.query)

    # Tracks the smallest design size seen over all restarts.
    min_ymers = 999999999999999999999999999999999

    for i in range(options.iterations):
        xmer_seq_dict = {}
        # create list of Xmer sequences
        # Each valid xmer starts with a score of options.redundancy.
        # NOTE(review): the inner loop reuses the name 'index', shadowing
        # the outer sequence index — harmless here but fragile.
        for index in range(len(sequences)):
            name, sequence = oligo.subset_lists_iter(names[index],
                                                     sequences[index],
                                                     options.XmerWindowSize,
                                                     options.stepSize)
            for index in range(len(sequence)):
                if oligo.is_valid_sequence(sequence[index],
                                           options.minLength,
                                           options.percentValid):
                    value = [options.redundancy, name[index]]
                    xmer_seq_dict[sequence[index]] = value

        # create dict of Ymer sequences
        ymer_seq_dict = {}
        # Break each ymer up into subsets of xmer size
        for index in range(len(sequences)):
            name, sequence = oligo.subset_lists_iter(names[index],
                                                     sequences[index],
                                                     options.YmerWindowSize,
                                                     options.stepSize)
            for index in range(len(sequence)):
                if oligo.is_valid_sequence(sequence[index],
                                           options.minLength,
                                           options.percentValid):
                    ymer_seq_dict[sequence[index]] = name[index]

        total_ymers = len(ymer_seq_dict)
        array_design = {}   # ymer -> name, the design built this restart
        array_xmers = {}    # xmer -> times covered by chosen ymers
        to_add = []
        ymer_xmers = []
        iter_count = 0

        while True:
            #reset max score at the beginning of each iteration
            max_score = 0
            for current_ymer in ymer_seq_dict.keys():
                # calculate the score of this ymer
                score, subset_ymer = calculate_score(current_ymer,
                                                     xmer_seq_dict,
                                                     options.XmerWindowSize,
                                                     1)
                if score > max_score:
                    # New best score: restart the candidate list.
                    to_add = list()
                    max_score = score
                    to_add.append(current_ymer)
                    ymer_xmers = [subset_ymer]
                elif score == max_score:
                    to_add.append(current_ymer)
                    ymer_xmers.append(subset_ymer)

            # Break ties among equally-scoring ymers at random.
            random_index = random.choice(range(len(to_add)))
            oligo_to_remove = to_add[random_index]
            chosen_xmers = ymer_xmers[random_index]
            # array_xmers.update(chosen_xmers)
            for each in chosen_xmers:
                array_xmers[each] = array_xmers.get(each, 0) + 1

            # subtract from the score of each xmer within the chosen ymer
            for item in chosen_xmers:
                if item in xmer_seq_dict:
                    # We dont' want negative scores
                    if xmer_seq_dict[item][0] > 0:
                        xmer_seq_dict[item][0] -= 1
                else:
                    print("%s - not found in xmer dict!!!" % (item))

            iter_count += 1

            # Stop when no ymers remain or nothing left scores positively.
            if len(ymer_seq_dict) == 0 or max_score <= 0:
                print("Final design includes %d %d-mers (%.1f%% of total) " %
                      (len(array_design), options.YmerWindowSize,
                       (len(array_design) / float(total_ymers)) * 100))
                # average_redundancy = sum( xmer_seq_dict[ item ][ 0 ] for item in xmer_seq_dict ) / len( xmer_seq_dict )
                print("%d unique %d-mers in final %d-mers (%.2f%% of total)" %
                      (len(array_xmers), options.XmerWindowSize,
                       options.YmerWindowSize,
                       (float(len(array_xmers)) / len(xmer_seq_dict)) * 100))
                print("Average redundancy of %d-mers in %d-mers: %.2f" %
                      (options.XmerWindowSize, options.YmerWindowSize,
                       sum(array_xmers.values()) / float(len(array_xmers))))
                # Keep this restart's result only if it is the smallest
                # design found so far.
                if len(array_design) < min_ymers:
                    min_ymers = len(array_design)
                    best_xmer_seq_dict = xmer_seq_dict
                    del (xmer_seq_dict)
                    best_array_design = array_design
                    del (array_design)
                break

            # Move the chosen ymer from the candidate pool into the design.
            try:
                array_design[oligo_to_remove] = ymer_seq_dict[oligo_to_remove]
                del ymer_seq_dict[oligo_to_remove]
            except KeyError:
                continue

            # Periodic progress report.
            if not iter_count % 250:
                print("Current Iteration: " + str(iter_count))
                # print( "Number of output ymers: " + str( len( array_design ) ) )
                print("Current xmer dictionary score: " +
                      str(sum(item[0] for item in xmer_seq_dict.values())))

    write_outputs(best_xmer_seq_dict, options.outPut)

    names = []
    sequences = []
    # Write resulting oligos to file
    for sequence, name in best_array_design.items():
        names.append(name)
        sequences.append(sequence)
    oligo.write_fastas(names, sequences,
                       output_name=options.outPut + "_R" +
                       str(options.redundancy) + ".fasta")
def main():
    """Cluster FASTA sequences by pairwise kmer distance.

    Reads the query FASTA, computes a condensed pairwise distance vector
    from each sequence's xmer set, runs single-linkage hierarchical
    clustering, and writes each sequence's cluster assignment to the
    output file.
    """
    usage = "usage: %prog [options]"
    option_parser = optparse.OptionParser(usage)
    add_program_options(option_parser)
    options, arguments = option_parser.parse_args()

    if options.query is None:
        print("ERROR: Fasta query file must be provided.")
        sys.exit(0)

    names, sequences = oligo.read_fasta_lists(options.query)
    names, sequences = oligo.sort_sequences_by_length(names, sequences)
    cluster_dict = {}
    # Get the sequences sorted in decreasing order
    names.reverse()
    sequences.reverse()

    num_seqs = len(sequences)
    ymer_dict = {}
    for index in range(num_seqs):
        ymer_dict[sequences[index]] = oligo.subset_lists_iter(
            sequences[index], options.XmerWindowSize, 1)

    # PERF FIX: the original grew a numpy array with np.append inside a
    # doubly-nested loop (O(n^2) reallocations) seeded with an np.empty(1)
    # garbage slot that was deleted afterwards.  Accumulate in a Python
    # list and convert to an array once.
    distances = []
    for current_seq in range(num_seqs):
        for inner_index in range(current_seq + 1, num_seqs):
            distances.append(
                oligo.get_single_sequence_dist(
                    ymer_dict[sequences[current_seq]],
                    ymer_dict[sequences[inner_index]],
                    options.XmerWindowSize, 1))
    # float dtype matches the dtype np.append produced before.
    out_list = np.array(distances, dtype=float)

    if options.verbose:
        print("Distance Matrix complete")

    # Condensed distance vector -> single-linkage tree -> flat clusters.
    Z = linkage(out_list, 'single')
    cluster = fcluster(Z, options.clusters, criterion='maxclust')

    out_file = open(options.output, 'w')
    if options.verbose:
        print("Clustering Complete")

    for sequence in range(len(names)):
        if cluster[sequence] not in cluster_dict:
            cluster_dict[cluster[sequence]] = list()
        cluster_dict[cluster[sequence]].append(
            (names[sequence], sequences[sequence]))
        out_file.write("%d %s\n" % (cluster[sequence], names[sequence]))

    if options.verbose:
        display_cluster_information(cluster_dict, out_list,
                                    options.XmerWindowSize, 1, ymer_dict)
    out_file.close()
def main():
    """Cluster FASTA sequences either taxonomically or by kmer identity.

    Builds a per-sequence frozenset of kmers, dispatches to
    cluster_taxonomically or cluster_by_kmers depending on
    options.clustering, then prints summary statistics and writes the
    clusters out.
    """
    usage = "usage %prog [options]"
    option_parser = optparse.OptionParser(usage)
    add_program_options(option_parser)
    options, arguments = option_parser.parse_args()

    if options.query is None:
        print("Fasta query file must be provided.")
        sys.exit()

    names, sequences = oligo.read_fasta_lists(options.query)
    num_seqs = len(names)
    assert len(names) == len(sequences)

    sequence_dict = create_seq_dict(names, sequences)

    # ymer_dict: sequence name -> frozenset of its kmers;
    # total_ymers: union of every sequence's kmers.
    ymer_dict = {}
    total_ymers = set()
    for current_seq in range(len(sequences)):
        current_ymers = frozenset(
            oligo.subset_lists_iter(sequences[current_seq],
                                    options.kmerSize, 1))
        total_ymers |= current_ymers
        ymer_dict[names[current_seq]] = current_ymers

    if 'tax' in options.clustering:
        # Taxonomic clustering requires a lineage file.
        if options.lineage:
            clusters_with_kmers = {}
            created_clusters = cluster_taxonomically(options, sequence_dict,
                                                     ymer_dict)
        else:
            print(
                "Lineage file must be provided for taxonomic clustering, exiting"
            )
            sys.exit()
    else:
        # Kmer-identity clustering: cluster at the lowest id threshold,
        # then re-cluster oversized clusters at each higher threshold.
        sorted_ids = sorted(options.id.split(','))
        created_clusters = cluster_by_kmers(float(sorted_ids[0]),
                                            sequence_dict, ymer_dict)
        for current_id in sorted_ids[1::]:
            current_id = float(current_id)
            max_cluster_size = max(
                [item.get_num_kmers() for item in created_clusters.values()])
            if max_cluster_size > options.number:
                re_cluster_kmers(sequence_dict, ymer_dict, created_clusters,
                                 current_id, options.number)

    # Summary statistics for the final clustering.
    print("Id threshold: %s." % options.id)
    min_cluster_size, median_cluster_size, avg_cluster_size, max_cluster_size = get_cluster_stats(
        created_clusters, total_ymers)
    print("Number of unique ymers: %d." % len(total_ymers))
    print("Number of clusters: %d." % len(created_clusters.keys()))
    print("Minimum cluster size: %d." % min_cluster_size)
    print("Median cluster size: %.2f." % median_cluster_size)
    print("Average cluster size: %.2f." % avg_cluster_size)
    print("Maximum cluster size: %d." % max_cluster_size)

    write_outputs(options.output, created_clusters, options.number)
def get_kmers_with_step(self, k, step_size):
    """Return the kmers of size k from this object's sequence,
    sampled every step_size positions.
    """
    seq_data = self.sequence
    return oligo.subset_lists_iter(seq_data, k, step_size)
def seqs_to_kmers(seqs, k):
    """Return the number of distinct kmers of size k found across
    all sequences in seqs.
    """
    distinct = set()
    for current_seq in seqs:
        distinct.update(oligo.subset_lists_iter(current_seq, k, 1))
    return len(distinct)
def get_kmers(seq, k):
    """Return the kmers of size k from seq's underlying sequence string."""
    return oligo.subset_lists_iter(seq.seq, k, 1)