예제 #1
0
def main():
    """Report, per reference sequence, how much of it a design covers.

    Command-line arguments select a design FASTA file, the reference FASTA
    file it was built from, an output file, a delimiter and an epitope
    size. Every epitope-sized window found in any design sequence is pooled
    into one set. For each reference sequence the fraction of its own
    windows present in that pool is written to the output file as
    ``<name><delimiter><fraction>``, one line per sequence, after a header.

    A reference sequence that yields no windows at all is reported with a
    coverage of 0.0.
    """
    parser = argparse.ArgumentParser(
        description="Calculate sequence-level epitope coverage for "
        "some library design.")

    parser.add_argument('-d',
                        '--design',
                        help="Name of design file to compare against "
                        "reference file.")
    parser.add_argument('-r',
                        '--reference',
                        help="Reference dataset from which 'design' was "
                        "created, script calculates epitope coverage "
                        "for each sequence in the reference.")
    parser.add_argument('-o',
                        '--output',
                        help="Name of file to write output to. ")
    parser.add_argument('--delimiter',
                        default='|',
                        help="Delimiter to place between sequence name "
                        "and its percent epitope coverage.")
    parser.add_argument('-e',
                        '--epitope_size',
                        type=int,
                        help="Integer size of epitopes to use for coverage "
                        "information gathering.")

    args = parser.parse_args()

    ref_names, ref_sequences = oligo.read_fasta_lists(args.reference)
    design_names, design_seqs = oligo.read_fasta_lists(args.design)

    header = "Sequence Name%sPercent Epitopes (%d-mers) Covered in design\n" % (
        args.delimiter, args.epitope_size)

    # Pool the windows of every design sequence into a single set.
    design_kmers = set()
    for design_seq in design_seqs:
        design_kmers |= oligo.subset_lists_iter(design_seq, args.epitope_size,
                                                1)

    out_strings = []
    for current_name, current_seq in zip(ref_names, ref_sequences):
        current_kmers = oligo.subset_lists_iter(current_seq, args.epitope_size,
                                                1)

        if current_kmers:
            perc_cov = len(current_kmers & design_kmers) / len(current_kmers)
        else:
            # No windows could be taken from this sequence, so nothing can
            # be covered; report 0.0 to stay inside the 0..1 range used for
            # every other row.
            perc_cov = 0.0

        out_strings.append("%s%s%f\n" %
                           (current_name, args.delimiter, perc_cov))

    with open(args.output, 'w') as out_file:
        # The header already ends with a newline, so this leaves one blank
        # line between the header and the data rows; kept so the output
        # layout does not change for existing consumers.
        out_file.write("%s\n" % header)
        out_file.writelines(out_strings)
예제 #2
0
def main():
    """Print the number of distinct 9-mers in the FASTA file given as argv[1].

    Output is a single line of the form ``<file name>|<count>``.
    """
    fasta_path = sys.argv[1]
    _, fasta_seqs = oligo.read_fasta_lists(fasta_path)

    # Merge the per-sequence window sets into one set of unique 9-mers.
    unique_kmers = set().union(
        *(oligo.subset_lists_iter(current, 9, 1) for current in fasta_seqs))

    print("%s|%d" % (fasta_path, len(unique_kmers)))
예제 #3
0
    def _count_kmers_in_file(self, filename):
        """Return how many distinct k-mers occur in one FASTA file.

        The file is looked up under ``self._dir_name`` and the window size
        is taken from ``self._kmer_size``.
        """
        fasta_path = self._dir_name + '/' + filename
        _, file_seqs = oligo.read_fasta_lists(fasta_path)

        # Union of the window sets of every sequence in the file.
        distinct = set().union(
            *(oligo.subset_lists_iter(entry, self._kmer_size, 1)
              for entry in file_seqs))
        return len(distinct)
예제 #4
0
def kmer_dict_wcounts(sequences, k):
    """Count, for each k-mer, how many of the given sequences yield it.

    Each element of ``sequences`` must provide ``get_seq()``. The result is
    a plain dict mapping a k-mer to the number of times it was produced
    across all sequences.
    """
    counts = {}

    # Resolve every raw sequence first, then tally its windows.
    raw_seqs = [entry.get_seq() for entry in sequences]
    for raw in raw_seqs:
        for window in oligo.subset_lists_iter(raw, k, 1):
            counts[window] = counts.get(window, 0) + 1

    return counts
예제 #5
0
def calculate_score(ymer, comparison_dict, window_size, step_size):
    """Score a ymer against a dictionary of scored sub-sequences.

    The ymer is split into windows with ``oligo.subset_lists_iter``. Each
    window that is a key of ``comparison_dict`` and maps to a list adds
    that list's first element to the score.

    Returns a ``(score, windows)`` tuple, where ``windows`` is the
    collection of windows produced for the ymer.
    """
    _, windows = oligo.subset_lists_iter("", ymer, window_size, step_size)

    score = 0
    for window in windows:
        if window not in comparison_dict:
            continue
        entry = comparison_dict[window]
        # Only list-valued entries carry a score in their first slot.
        if isinstance(entry, list):
            score += entry[0]

    return score, windows
예제 #6
0
def count_kmers_in_file(filename, k, count_total=False):
    """Count the k-mers found in a FASTA file.

    With ``count_total`` false (the default) the result is the number of
    distinct k-mers across all sequences in the file. With ``count_total``
    true it is the sum of the per-sequence k-mer set sizes, so a k-mer
    shared by several sequences is counted once per sequence.
    """
    _, file_seqs = oligo.read_fasta_lists(filename)
    per_sequence = (oligo.subset_lists_iter(entry, k, 1)
                    for entry in file_seqs)

    if count_total:
        return sum(len(windows) for windows in per_sequence)

    distinct = set()
    for windows in per_sequence:
        distinct |= windows
    return len(distinct)
예제 #7
0
def main():
    """Greedily pick a set of ymers covering the xmers of the query file.

    Options are parsed with optparse (see add_program_options) and the
    sequences are read from options.query. The following is repeated
    options.iterations times:

    1. Every valid XmerWindowSize window of every sequence is stored in
       xmer_seq_dict as ``[options.redundancy, window name]``; every valid
       YmerWindowSize window is stored in ymer_seq_dict with its name.
    2. In a loop, each remaining ymer is scored with calculate_score, one
       of the top-scoring ymers is picked at random, the counter of each of
       its xmers is decremented (never below zero), and the ymer is moved
       from ymer_seq_dict into array_design.
    3. The loop stops once no ymers remain or the best score is <= 0, and
       summary statistics are printed.

    The design with the fewest ymers over all iterations is kept; its xmer
    dictionary is passed to write_outputs and its ymers are written to
    ``<outPut>_R<redundancy>.fasta``.

    NOTE(review): best_xmer_seq_dict and best_array_design are only bound
    inside the iteration loop, so options.iterations == 0 would end in a
    NameError at the final write step.
    """
    usage = "usage: %prog [options]"
    option_parser = optparse.OptionParser(usage)

    add_program_options(option_parser)

    options, arguments = option_parser.parse_args()

    names, sequences = oligo.read_fasta_lists(options.query)

    # Sentinel larger than any realistic design size, so the first finished
    # iteration always becomes the current best.
    min_ymers = 999999999999999999999999999999999

    for i in range(options.iterations):

        xmer_seq_dict = {}

        # create list of Xmer sequences
        for index in range(len(sequences)):

            name, sequence = oligo.subset_lists_iter(names[index],
                                                     sequences[index],
                                                     options.XmerWindowSize,
                                                     options.stepSize)

            # The inner loop reuses the name 'index'; the outer loop is not
            # affected because its range iterator supplies the next value.
            for index in range(len(sequence)):
                if oligo.is_valid_sequence(sequence[index], options.minLength,
                                           options.percentValid):
                    # value[0] is the remaining score of this xmer,
                    # value[1] the name of the window it came from.
                    value = [options.redundancy, name[index]]
                    xmer_seq_dict[sequence[index]] = value

        # create dict of Ymer sequences
        ymer_seq_dict = {}

        # Break each sequence up into windows of ymer size
        for index in range(len(sequences)):

            name, sequence = oligo.subset_lists_iter(names[index],
                                                     sequences[index],
                                                     options.YmerWindowSize,
                                                     options.stepSize)

            for index in range(len(sequence)):

                if oligo.is_valid_sequence(sequence[index], options.minLength,
                                           options.percentValid):
                    ymer_seq_dict[sequence[index]] = name[index]

        # Remember the starting count; ymer_seq_dict shrinks as ymers are
        # moved into the design.
        total_ymers = len(ymer_seq_dict)

        array_design = {}  # chosen ymer sequence -> name
        array_xmers = {}  # xmer -> number of chosen ymers containing it
        to_add = []  # ymers tied for the best score in the current pass
        ymer_xmers = []  # xmer collections, parallel to to_add
        iter_count = 0

        while True:
            # reset max score at the beginning of each iteration
            max_score = 0
            for current_ymer in ymer_seq_dict.keys():
                # calculate the score of this ymer
                score, subset_ymer = calculate_score(current_ymer,
                                                     xmer_seq_dict,
                                                     options.XmerWindowSize, 1)

                if score > max_score:
                    # New best score: restart the list of tied candidates.
                    to_add = list()
                    max_score = score
                    to_add.append(current_ymer)
                    ymer_xmers = [subset_ymer]
                elif score == max_score:
                    to_add.append(current_ymer)
                    ymer_xmers.append(subset_ymer)

            # Break ties at random by drawing an index, so the ymer and its
            # xmers are taken from the same position of the parallel lists.
            # NOTE(review): if to_add is empty (e.g. no valid ymers were
            # found at all) random.choice raises IndexError here.
            random_index = random.choice(range(len(to_add)))
            oligo_to_remove = to_add[random_index]
            chosen_xmers = ymer_xmers[random_index]
            #        array_xmers.update(chosen_xmers)
            for each in chosen_xmers:
                array_xmers[each] = array_xmers.get(each, 0) + 1

            # subtract from the score of each xmer within the chosen ymer
            for item in chosen_xmers:
                if item in xmer_seq_dict:
                    # We don't want negative scores
                    if xmer_seq_dict[item][0] > 0:
                        xmer_seq_dict[item][0] -= 1
                else:
                    print("%s - not found in xmer dict!!!" % (item))

            iter_count += 1

            # Termination is checked after the chosen ymer's xmers were
            # counted in array_xmers but before the ymer itself is added to
            # array_design.
            if len(ymer_seq_dict) == 0 or max_score <= 0:
                print("Final design includes %d %d-mers (%.1f%% of total) " %
                      (len(array_design), options.YmerWindowSize,
                       (len(array_design) / float(total_ymers)) * 100))
                #            average_redundancy = sum( xmer_seq_dict[ item ][ 0 ] for item in xmer_seq_dict ) / len( xmer_seq_dict )
                print("%d unique %d-mers in final %d-mers (%.2f%% of total)" %
                      (len(array_xmers), options.XmerWindowSize,
                       options.YmerWindowSize,
                       (float(len(array_xmers)) / len(xmer_seq_dict)) * 100))
                print("Average redundancy of %d-mers in %d-mers: %.2f" %
                      (options.XmerWindowSize, options.YmerWindowSize,
                       sum(array_xmers.values()) / float(len(array_xmers))))
                # Keep this iteration's result only if it used fewer ymers
                # than every previous iteration.
                if len(array_design) < min_ymers:
                    min_ymers = len(array_design)
                    best_xmer_seq_dict = xmer_seq_dict
                    del (xmer_seq_dict)
                    best_array_design = array_design
                    del (array_design)
                break

            # Move the chosen ymer from the candidate pool into the design;
            # if it is no longer in the pool, skip straight to the next pass.
            try:
                array_design[oligo_to_remove] = ymer_seq_dict[oligo_to_remove]
                del ymer_seq_dict[oligo_to_remove]
            except KeyError:
                continue

            # Progress report every 250 passes.
            if not iter_count % 250:
                print("Current Iteration: " + str(iter_count))
                #            print( "Number of output ymers: " + str( len( array_design ) ) )
                print("Current xmer dictionary score: " +
                      str(sum(item[0] for item in xmer_seq_dict.values())))

    write_outputs(best_xmer_seq_dict, options.outPut)

    names = []
    sequences = []

    # Write resulting oligos to file
    for sequence, name in best_array_design.items():
        names.append(name)
        sequences.append(sequence)

    oligo.write_fastas(names,
                       sequences,
                       output_name=options.outPut + "_R" +
                       str(options.redundancy) + ".fasta")
예제 #8
0
def main():
    """Cluster the query sequences by single-linkage over k-mer distances.

    Reads the FASTA file named by options.query, computes a distance for
    every unordered pair of sequences with oligo.get_single_sequence_dist
    on their XmerWindowSize windows, builds a single-linkage hierarchy and
    cuts it into at most options.clusters clusters. One line
    ``<cluster id> <sequence name>`` per sequence is written to
    options.output. With options.verbose set, progress messages are printed
    and display_cluster_information is called at the end.

    Exits through sys.exit when no query file was given.
    """
    usage = "usage: %prog [options]"
    option_parser = optparse.OptionParser(usage)
    add_program_options(option_parser)
    options, arguments = option_parser.parse_args()

    if options.query is None:
        print("ERROR: Fasta query file must be provided.")
        sys.exit(0)

    names, sequences = oligo.read_fasta_lists(options.query)
    names, sequences = oligo.sort_sequences_by_length(names, sequences)

    cluster_dict = {}

    # Reverse the order returned by sort_sequences_by_length (presumably
    # ascending, giving longest-first here -- TODO confirm).
    names.reverse()
    sequences.reverse()

    num_seqs = len(sequences)

    # Window set of every sequence, keyed by the sequence itself.
    ymer_dict = {}
    for sequence in sequences:
        ymer_dict[sequence] = oligo.subset_lists_iter(sequence,
                                                      options.XmerWindowSize,
                                                      1)

    # Pairwise distances in the order (0,1), (0,2), ..., (n-2,n-1), which is
    # the condensed layout scipy's linkage accepts. Collected in a list and
    # converted once, instead of growing an array with np.append on every
    # pair (which copies the whole array each time).
    distances = []
    for current_seq in range(num_seqs):
        for inner_index in range(current_seq + 1, num_seqs):
            distances.append(
                oligo.get_single_sequence_dist(
                    ymer_dict[sequences[current_seq]],
                    ymer_dict[sequences[inner_index]], options.XmerWindowSize,
                    1))

    # ravel() keeps the result one-dimensional, as np.append did.
    out_list = np.asarray(distances, dtype=float).ravel()

    if options.verbose:
        print("Distance Matrix complete")

    Z = linkage(out_list, 'single')

    cluster = fcluster(Z, options.clusters, criterion='maxclust')

    # The context manager guarantees the output file is closed even when
    # writing or the verbose report raises.
    with open(options.output, 'w') as out_file:

        if options.verbose:
            print("Clustering Complete")

        for sequence in range(len(names)):
            cluster_dict.setdefault(cluster[sequence], []).append(
                (names[sequence], sequences[sequence]))

            out_file.write("%d %s\n" % (cluster[sequence], names[sequence]))

        if options.verbose:
            display_cluster_information(cluster_dict, out_list,
                                        options.XmerWindowSize, 1, ymer_dict)
예제 #9
0
def main():
    """Cluster the query sequences taxonomically or by shared k-mers.

    Reads the FASTA file named by options.query and computes the k-mer set
    (options.kmerSize) of every sequence. If options.clustering contains
    'tax', clusters are built by cluster_taxonomically, which requires
    options.lineage. Otherwise the comma-separated identity thresholds in
    options.id are processed in ascending numeric order: the smallest one
    builds the initial clusters with cluster_by_kmers, and each later one
    triggers re_cluster_kmers while the largest cluster still holds more
    than options.number k-mers.

    Cluster statistics are printed and the clusters are handed to
    write_outputs.

    Exits through sys.exit when the query file, or the lineage file for
    taxonomic clustering, is missing.
    """

    usage = "usage %prog [options]"
    option_parser = optparse.OptionParser(usage)

    add_program_options(option_parser)

    options, arguments = option_parser.parse_args()

    if options.query is None:
        print("Fasta query file must be provided.")
        sys.exit()

    names, sequences = oligo.read_fasta_lists(options.query)

    assert len(names) == len(sequences)

    sequence_dict = create_seq_dict(names, sequences)

    ymer_dict = {}
    total_ymers = set()

    for name, sequence in zip(names, sequences):
        current_ymers = frozenset(
            oligo.subset_lists_iter(sequence, options.kmerSize, 1))
        total_ymers |= current_ymers
        ymer_dict[name] = current_ymers

    if 'tax' in options.clustering:
        if not options.lineage:
            print(
                "Lineage file must be provided for taxonomic clustering, exiting"
            )
            sys.exit()

        created_clusters = cluster_taxonomically(options, sequence_dict,
                                                 ymer_dict)
    else:
        # Convert before sorting: sorting the raw strings orders them
        # lexicographically, which misplaces values such as "100" vs "95"
        # or ".9" vs "0.8".
        id_thresholds = sorted(
            float(item) for item in options.id.split(','))

        created_clusters = cluster_by_kmers(id_thresholds[0], sequence_dict,
                                            ymer_dict)

        for current_id in id_thresholds[1:]:
            max_cluster_size = max(
                item.get_num_kmers() for item in created_clusters.values())

            # Only re-cluster while some cluster is still too large.
            if max_cluster_size > options.number:
                re_cluster_kmers(sequence_dict, ymer_dict, created_clusters,
                                 current_id, options.number)

        print("Id threshold: %s." % options.id)

    min_cluster_size, median_cluster_size, avg_cluster_size, max_cluster_size = get_cluster_stats(
        created_clusters, total_ymers)

    print("Number of unique ymers: %d." % len(total_ymers))
    print("Number of clusters: %d." % len(created_clusters.keys()))
    print("Minimum cluster size: %d." % min_cluster_size)
    print("Median cluster size: %.2f." % median_cluster_size)
    print("Average cluster size: %.2f." % avg_cluster_size)
    print("Maximum cluster size: %d." % max_cluster_size)

    write_outputs(options.output, created_clusters, options.number)
예제 #10
0
 def get_kmers_with_step(self, k, step_size):
     """Return the windows of size ``k`` taken from ``self.sequence``.

     ``step_size`` is passed through to ``oligo.subset_lists_iter``
     unchanged.
     """
     windows = oligo.subset_lists_iter(self.sequence, k, step_size)
     return windows
예제 #11
0
def seqs_to_kmers(seqs, k):
    """Return the number of distinct k-mers found across ``seqs``."""
    distinct = set().union(
        *(oligo.subset_lists_iter(entry, k, 1) for entry in seqs))
    return len(distinct)
예제 #12
0
def get_kmers(seq, k):
    """Return the k-mers of a record, read from its ``seq`` attribute."""
    return oligo.subset_lists_iter(seq.seq, k, 1)