Exemplo n.º 1
0
def build_read_kmers_index(reads_filename, kmer_ii, kmer_size):
    """
    Tally how the k-mers of every read hit the entries of ``kmer_ii``.

    Every k-mer of every read is looked up in ``kmer_ii``; when the k-mer
    itself is absent, its reverse complement (``revcompl``) is tried instead.
    K-mers found under neither orientation are skipped.  For a k-mer that is
    found, ``accumulate(kmer_ii[kmer])`` is materialised into a list:

    * more than one entry -> the k-mer is counted as ambiguous, keyed by the
      orientation that is actually present in ``kmer_ii``;
    * otherwise -> the count goes to the contig named by the first field of
      the first entry.

    Args:
        reads_filename: Path handed to ``SeqIO.ParseFastQ``.  The second
            field of each record it yields is used as the read sequence.
        kmer_ii: Mapping from k-mer to its hits (must support ``in`` and
            indexing).  Presumably the inverted index built from the
            contigs -- confirm against the caller.
        kmer_size: Length of the k-mers extracted from each read.

    Returns:
        A 3-tuple ``(ambiguous_kmer_counts, contig_counts, mean_read_length)``.
        The first two are ``defaultdict(int)``.  The mean is computed with
        ``/`` (integer division for ints under Python 2) and is ``0`` when
        the reads file yields no reads.
    """

    ambiguous_kmer_counts = defaultdict(int)
    contig_counts = defaultdict(int)

    num_reads = 0
    total_read_length = 0

    for record in SeqIO.ParseFastQ(reads_filename):
        sequence = record[1]

        # For each k-mer in the read...
        for start in xrange(0, len(sequence) - kmer_size + 1):
            kmer = sequence[start:start + kmer_size]

            # ... look it up, falling back to the reverse complement.
            if kmer not in kmer_ii:
                kmer = revcompl(kmer)
                if kmer not in kmer_ii:
                    # Neither orientation is indexed: nothing to count.
                    continue

            contigs_containing_kmer = list(accumulate(kmer_ii[kmer]))

            if len(contigs_containing_kmer) > 1:
                ambiguous_kmer_counts[kmer] += 1
            else:
                contig_counts[contigs_containing_kmer[0][0]] += 1

        # Progress indicator; '\r' keeps it on a single terminal line.
        if num_reads % 100000 == 0:
            sys.stderr.write('Processed reads:\t' + str(num_reads) + '\r')

        total_read_length += len(sequence)
        num_reads += 1

    # An empty reads file would otherwise raise ZeroDivisionError.
    if num_reads == 0:
        return ambiguous_kmer_counts, contig_counts, 0

    return ambiguous_kmer_counts, contig_counts, total_read_length / num_reads
Exemplo n.º 2
0
def assign_read_kmers_to_contigs(reads_filename, kmer_ii, kmer_size):
    """
    Given a set of reads and k-mer length, assign k-mer counts to the contigs.

    Every k-mer of every read is looked up in ``kmer_ii``; when the k-mer
    itself is absent, its reverse complement (``revcompl``) is tried instead.
    For a k-mer that is found, one of its ``kmer_ii`` entries is picked
    uniformly at random (``random.choice``) and the contig named by that
    entry's first field receives the count.  K-mers found under neither
    orientation are skipped.

    Args:
        reads_filename: Path handed to ``SeqIO.ParseFastQ``.  The second
            field of each record it yields is used as the read sequence.
        kmer_ii: Mapping from k-mer to a sequence of hits whose first field
            identifies a contig.
        kmer_size: Length of the k-mers extracted from each read.

    Returns:
        A 2-tuple ``(contig_counts, mean_read_length)``.  ``contig_counts``
        is a ``defaultdict(int)``.  The mean is computed with ``/`` (integer
        division for ints under Python 2) and is ``0`` when the reads file
        yields no reads.
    """

    contig_counts = defaultdict(int)

    num_reads = 0
    total_read_length = 0

    for record in SeqIO.ParseFastQ(reads_filename):
        sequence = record[1]

        # For each k-mer in the read...
        for start in xrange(0, len(sequence) - kmer_size + 1):
            kmer = sequence[start:start + kmer_size]

            # ... look it up, falling back to the reverse complement.
            if kmer not in kmer_ii:
                kmer = revcompl(kmer)
                if kmer not in kmer_ii:
                    # Neither orientation is indexed: nothing to count.
                    continue

            # Randomly assign the count to one of the contigs holding it.
            contig = random.choice(kmer_ii[kmer])[0]
            contig_counts[contig] += 1

        total_read_length += len(sequence)
        num_reads += 1

    # An empty reads file would otherwise raise ZeroDivisionError.
    if num_reads == 0:
        return contig_counts, 0

    return contig_counts, total_read_length / num_reads
Exemplo n.º 3
0
def assign_read_kmers_to_contigs_iterative(reads_filename, kmer_ii, kmer_size,
                                           contig_abundances):
    """
    Given a set of reads and k-mer length, assign k-mer counts to the contigs
    based on their abundances.

    Every k-mer of every read is looked up in ``kmer_ii``; when the k-mer
    itself is absent, its reverse complement (``revcompl``) is tried instead.
    For a k-mer that is found, the candidates are
    ``list(accumulate(kmer_ii[kmer]))`` and one of them is drawn with
    probability proportional to ``contig_abundances[candidate[0]]``.  The
    contig named by the chosen candidate's first field receives the count.
    K-mers found under neither orientation are skipped.

    Args:
        reads_filename: Path handed to ``SeqIO.ParseFastQ``.  The second
            field of each record it yields is used as the read sequence.
        kmer_ii: Mapping from k-mer to its hits.
        kmer_size: Length of the k-mers extracted from each read.
        contig_abundances: Mapping from contig identifier to its weight.
            Weights are assumed to be non-negative integers, since they are
            passed to ``random.randint`` -- confirm against the caller.

    Returns:
        A 2-tuple ``(contig_counts, mean_read_length)``.  ``contig_counts``
        is a ``defaultdict(int)``.  The mean is computed with ``/`` (integer
        division for ints under Python 2) and is ``0`` when the reads file
        yields no reads.
    """

    contig_counts = defaultdict(int)

    num_reads = 0
    total_read_length = 0

    for record in SeqIO.ParseFastQ(reads_filename):
        sequence = record[1]

        # For each k-mer in the read...
        for start in xrange(0, len(sequence) - kmer_size + 1):
            kmer = sequence[start:start + kmer_size]

            # ... look it up, falling back to the reverse complement.
            if kmer not in kmer_ii:
                kmer = revcompl(kmer)
                if kmer not in kmer_ii:
                    # Neither orientation is indexed: nothing to count.
                    continue

            candidates = list(accumulate(kmer_ii[kmer]))
            weights = [contig_abundances[candidate[0]]
                       for candidate in candidates]
            total_abundance = 0
            for weight in weights:
                total_abundance += weight

            if total_abundance < 1:
                # randint(1, total) has an empty range here and would raise
                # ValueError; with no abundance signal to weight by, fall
                # back to a uniform pick.
                chosen_candidate = random.choice(candidates)
            else:
                # Draw a point in [1, total] and walk the cumulative weights
                # until it is covered.
                choice = random.randint(1, total_abundance)
                curr_abundance = 0
                chosen_candidate = None
                for candidate, weight in zip(candidates, weights):
                    curr_abundance += weight
                    if curr_abundance >= choice:
                        chosen_candidate = candidate
                        break

            contig_counts[chosen_candidate[0]] += 1

        total_read_length += len(sequence)
        num_reads += 1

    # An empty reads file would otherwise raise ZeroDivisionError.
    if num_reads == 0:
        return contig_counts, 0

    return contig_counts, total_read_length / num_reads
Exemplo n.º 4
0
def _reservoir_sample_pairs(first_mate_files, second_mate_files, k,
                            debug_level):
    """
    Reservoir-sample up to ``k`` mate pairs across all input file pairs.

    Returns a list of ``(file_index, (first_mate, second_mate))`` entries,
    where ``file_index`` is the position of the originating file pair.
    """
    sample_reads = []

    # Number of pairs seen so far, counted across every file pair, so that
    # the reservoir is drawn over the union of all inputs.
    index = 0

    for file_index, (first_file, second_file) in enumerate(
            zip(first_mate_files, second_mate_files)):
        first_reader = SeqIO.ParseFastQ(first_file)
        second_reader = SeqIO.ParseFastQ(second_file)

        # We have to process the mates together in order.
        for first_mate in first_reader:
            try:
                second_mate = next(second_reader)
            except StopIteration:
                # Pairing leftovers with a stale second mate would emit
                # mismatched pairs, so stop at the shorter file.
                sys.stderr.write('Warning: ' + second_file +
                                 ' has fewer reads than ' + first_file +
                                 '; ignoring unpaired first mates.\n')
                break

            index += 1

            # Reservoir sampling algorithm: fill the reservoir first, then
            # replace a random slot with probability k / index.
            if len(sample_reads) < k:
                sample_reads.append((file_index, (first_mate, second_mate)))
            else:
                r = random.randrange(index)
                if r < k:
                    sample_reads[r] = (file_index, (first_mate, second_mate))

        if debug_level > 0:
            print('File Index: ' + str(file_index))
            print('Reads needed: ' + str(k))
            print(sample_reads)

    return sample_reads


def _write_sampled_pairs(sample_reads, num_files, output_dir):
    """
    Write each sampled pair to ``<file_index>_1.fastq`` / ``_2.fastq`` in
    ``output_dir``, creating (possibly empty) files for every input index.
    """
    first_mate_writers = []
    second_mate_writers = []

    try:
        for file_index in range(num_files):
            first_mate_writers.append(
                open(os.path.join(output_dir, str(file_index) + '_1.fastq'),
                     'w'))
            second_mate_writers.append(
                open(os.path.join(output_dir, str(file_index) + '_2.fastq'),
                     'w'))

        # Each mate is a sequence of lines; emit them newline-separated.
        for file_index, (first_mate, second_mate) in sample_reads:
            first_mate_writers[file_index].write('\n'.join(first_mate) + '\n')
            second_mate_writers[file_index].write(
                '\n'.join(second_mate) + '\n')
    finally:
        # Close everything that was opened, even if a write failed.
        for writer in first_mate_writers + second_mate_writers:
            writer.close()


def main():
    """
    Sample paired-end reads from FASTQ files and write them back out.

    Command-line options read here:
        -1 / -2: comma-separated first- and second-mate FASTQ files; both
            lists must have the same length.
        -k / --samples: comma-separated sample sizes.  For each size ``k`` a
            reservoir sample of ``k`` mate pairs is drawn across all file
            pairs and written to ``<output_dir>/<k>/<file_index>_{1,2}.fastq``.
        -o / --output_dir: base output directory (default ``./``).
        -d / --debug_level: values above 0 print the sample after each file.
    The -n, -s, -i and -t options are accepted but not used in this function.

    Exits with a non-zero status when the mate-file lists are missing or
    differ in length.
    """
    # sys.argv always contains the script name, so "no arguments" means a
    # length of 1.
    if len(sys.argv) < 2:
        print(USAGE)
        sys.exit()

    parser = OptionParser()
    parser.add_option("-n", "--num_trials", dest="num_trials", default="1000")
    parser.add_option("-s",
                      "--sample_size",
                      dest="sample_size",
                      default="10000")
    parser.add_option("-i", "--input", dest="input", default=None)
    parser.add_option("-1", "--1", dest="first_mates")
    parser.add_option("-2", "--2", dest="second_mates")
    parser.add_option("-k", "--samples", dest="samples", default=0)
    parser.add_option("-o", "--output_dir", dest="output_dir", default="./")
    parser.add_option("-t", "--trials", dest="trials", default=0)
    parser.add_option("-d", "--debug_level", dest="debug_level", default=0)
    parser.set_usage(USAGE)
    options, _ = parser.parse_args(sys.argv[1:])
    debug_level = int(options.debug_level)

    # Without this check a missing -1/-2 surfaces as an AttributeError on
    # None.split below.
    if options.first_mates is None or options.second_mates is None:
        parser.error("Both mate files (-1 and -2) are required.")

    first_mate_files = options.first_mates.split(',')
    second_mate_files = options.second_mates.split(',')

    if len(first_mate_files) != len(second_mate_files):
        sys.stderr.write("Error: Mate files need to have the same number.\n")
        sys.exit(1)

    # Handle the option of multiple samples.  str() keeps the integer
    # default (0) usable when -k is omitted.
    for samples in str(options.samples).split(','):
        samples = int(samples)

        output_dir = os.path.join(options.output_dir, str(samples))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # The read files are re-opened for every sample size.
        sample_reads = _reservoir_sample_pairs(first_mate_files,
                                               second_mate_files,
                                               samples,
                                               debug_level)

        # Write out these sample reads to file.
        _write_sampled_pairs(sample_reads, len(first_mate_files), output_dir)