Example #1
0
def get_contigs_info(contigs_file):
    """
    Builds a ContigInfo record (id, sequence length, type) for every
    sequence in the given fasta file. The contig "type" is the first
    underscore-separated token of the fasta header.
    """
    fasta_seqs = fp.read_fasta_dict(contigs_file)
    return {header: ContigInfo(header, len(seq), header.split("_")[0])
            for header, seq in fasta_seqs.iteritems()}
Example #2
0
def get_bubbles(alignment_path, contigs_info, contigs_path,
                err_mode, num_proc, min_alignment):
    """
    The main function: takes an alignment and returns bubbles

    Fans the alignment out to num_proc worker processes (each running
    _thread_worker) and aggregates the per-contig bubbles plus the
    bookkeeping counters the workers report back through the queues.
    Raises the first worker exception found on the error queue.
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_fasta_dict(contigs_path),
                                       min_alignment)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    #NOTE: SIGINT must stay ignored until after start() — children
    #inherit the signal disposition when they are forked by start(),
    #not when Process() is constructed. Starting the workers before
    #restoring the handler is what actually confines Ctrl-C delivery
    #to the main process.
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    for _ in xrange(num_proc):
        threads.append(multiprocessing.Process(target=_thread_worker,
                                               args=(aln_reader, contigs_info,
                                                     err_mode, results_queue,
                                                     error_queue)))
    for t in threads:
        t.start()
    signal.signal(signal.SIGINT, orig_sigint)

    try:
        for t in threads:
            t.join()
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()

    #workers push exceptions here instead of raising inside the child
    if not error_queue.empty():
        raise error_queue.get()

    bubbles = []
    total_long_bubbles = 0
    total_long_branches = 0
    total_empty = 0
    total_aln_errors = []
    while not results_queue.empty():
        (ctg_bubbles, num_long_bubbles,
            num_empty, num_long_branch, aln_errors) = results_queue.get()
        total_long_bubbles += num_long_bubbles
        total_long_branches += num_long_branch
        total_empty += num_empty
        total_aln_errors.extend(aln_errors)
        bubbles.extend(ctg_bubbles)

    #+1 in the denominator guards against an empty error list
    mean_aln_error = float(sum(total_aln_errors)) / (len(total_aln_errors) + 1)
    logger.debug("Alignment error rate: {0}".format(mean_aln_error))
    logger.debug("Generated {0} bubbles".format(len(bubbles)))
    logger.debug("Split {0} long bubbles".format(total_long_bubbles))
    logger.debug("Skipped {0} empty bubbles".format(total_empty))
    logger.debug("Skipped {0} bubbles with long branches".format(total_long_branches))

    return bubbles
Example #3
0
    def run(self):
        """
        One polishing stage: builds a temporary blasr reference from the
        input assembly, aligns the reads against it, and records this
        stage in the shared run description.
        """
        logger.info("Polishing genome ({0}/{1})".format(
            self.stage_id, self.args.num_iters))

        seqs = fp.read_fasta_dict(self.in_reference)
        ref_path = os.path.join(
            self.work_dir, "blasr_ref_{0}.fasta".format(self.stage_id))
        aln.make_blasr_reference(seqs, ref_path)
        aln.make_alignment(ref_path, self.args.reads, self.args.threads,
                           self.out_alignment)
        #the reference file is only needed for the alignment step
        os.remove(ref_path)

        Job.run_description["stage_name"] = self.name
        Job.run_description["stage_id"] = self.stage_id
Example #4
0
def patch_genome(alignment, reference_file, out_patched):
    """
    Groups the alignments by target contig, applies the resulting
    patches to each reference sequence and writes the corrected
    fasta to out_patched.
    """
    by_target = defaultdict(list)
    for aln in alignment:
        by_target[aln.trg_id].append(aln)

    reference = fp.read_fasta_dict(reference_file)
    patched_seqs = {}
    for target_id, target_alns in by_target.iteritems():
        #NOTE(review): helper name carries a typo ("alignmemnts");
        #it is defined elsewhere in the project, so it is kept as-is
        patches = _get_patching_alignmemnts(target_alns)
        patched_seqs[target_id] = _apply_patches(patches,
                                                 reference[target_id])

    fp.write_fasta_dict(patched_seqs, out_patched)
Example #5
0
def get_consensus(alignment_path, contigs_path, contigs_info, min_aln_length,
                  platform, num_proc):
    """
    Main function

    Fans the alignment out to num_proc worker processes (each running
    _thread_worker) and collects the per-contig consensus sequences
    they report back. Empty consensus sequences are dropped. Raises
    the first worker exception found on the error queue.
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_fasta_dict(contigs_path),
                                       min_aln_length)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    #NOTE: SIGINT must stay ignored until after start() — children
    #inherit the signal disposition when they are forked by start(),
    #not when Process() is constructed. Starting the workers before
    #restoring the handler is what actually confines Ctrl-C delivery
    #to the main process.
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    for _ in xrange(num_proc):
        threads.append(
            multiprocessing.Process(target=_thread_worker,
                                    args=(aln_reader, contigs_info, platform,
                                          results_queue, error_queue)))
    for t in threads:
        t.start()
    signal.signal(signal.SIGINT, orig_sigint)

    try:
        for t in threads:
            t.join()
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()

    #workers push exceptions here instead of raising inside the child
    if not error_queue.empty():
        raise error_queue.get()

    out_fasta = {}
    total_aln_errors = []
    while not results_queue.empty():
        ctg_id, ctg_seq, aln_errors = results_queue.get()
        total_aln_errors.extend(aln_errors)
        if len(ctg_seq) > 0:
            out_fasta[ctg_id] = ctg_seq

    #+1 in the denominator guards against an empty error list
    mean_aln_error = float(sum(total_aln_errors)) / (len(total_aln_errors) + 1)
    logger.debug("Alignment error rate: {0}".format(mean_aln_error))

    return out_fasta
Example #6
0
def concatenate_contigs(contigs_file):
    """
    Concatenates contig parts output by assembly module

    Fasta headers are underscore-separated; tokens 0-1 identify the
    contig and token 3 is the integer part index. Returns a dict
    contig_id -> sequence with the parts joined in part-index order.
    """
    #fixed local typo ("framents"); removed contig_types / cont_type /
    #contig_len, which were computed but never used or returned
    genome_fragments = fp.read_fasta_dict(contigs_file)
    by_contig = defaultdict(list)
    for header, seq in genome_fragments.iteritems():
        tokens = header.split("_")
        contig_id = tokens[0] + "_" + tokens[1]
        part_id = int(tokens[3])
        by_contig[contig_id].append((part_id, seq))

    contigs_fasta = {}
    for contig_id, contig_seqs in by_contig.iteritems():
        seqs_sorted = sorted(contig_seqs, key=lambda p: p[0])
        contigs_fasta[contig_id] = "".join(p[1] for p in seqs_sorted)

    return contigs_fasta