Example #1
def merge_bed(repeats_fpath, uncovered_fpath, insert_size, output_dirpath,
              err_path):
    combined_bed_fpath = join(output_dirpath, 'skipped_regions.bed')
    with open(combined_bed_fpath, 'w') as out:
        if exists(repeats_fpath):
            with open(repeats_fpath) as in_f:
                for line in in_f:
                    l = line.split('\t')
                    repeat_len = int(l[2]) - int(l[1])
                    if repeat_len >= insert_size:
                        out.write(line)
        if exists(uncovered_fpath):
            with open(uncovered_fpath) as in_f:
                for line in in_f:
                    out.write(line)

    sorted_bed_fpath = add_suffix(combined_bed_fpath, 'sorted')
    qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', combined_bed_fpath],
                           stdout=open(sorted_bed_fpath, 'w'),
                           stderr=open(err_path, 'a'),
                           logger=logger)
    merged_bed_fpath = add_suffix(combined_bed_fpath, 'merged')
    qutils.call_subprocess(
        [bedtools_fpath('bedtools'), 'merge', '-i', sorted_bed_fpath],
        stdout=open(merged_bed_fpath, 'w'),
        stderr=open(err_path, 'a'),
        logger=logger)
    return merged_bed_fpath
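
For readers without bedtools at hand, here is a minimal pure-Python sketch of what the sort -k1,1 -k2,2n plus bedtools merge step accomplishes for a single chromosome; the function name and in-memory interval representation are illustrative assumptions, not part of the code above.

def merge_intervals(intervals):
    # Collapse overlapping or book-ended (start, end) intervals on one chromosome,
    # mirroring the default behaviour of 'bedtools merge' on sorted input.
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# e.g. merge_intervals([(100, 250), (200, 400), (500, 600)]) -> [(100, 400), (500, 600)]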
Example #2
def align_reference(ref_fpath, output_dir, using_reads='all', calculate_coverage=False):
    required_files = []
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    if using_reads != 'all':
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and (using_reads == 'all' or using_reads == 'pe'):
        required_files.append(insert_size_fpath)

    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)

    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    correct_chr_names, sam_fpath, bam_fpath = align_single_file(ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                                                qconfig.max_threads, sam_fpath=qconfig.reference_sam,
                                                                bam_fpath=qconfig.reference_bam, required_files=required_files,
                                                                is_reference=True, alignment_only=True, using_reads=using_reads)
    if not qconfig.optimal_assembly_insert_size or qconfig.optimal_assembly_insert_size == 'auto':
        if using_reads == 'pe' and sam_fpath:
            insert_size, std_dev = calculate_insert_size(sam_fpath, output_dir, ref_name)
            if not insert_size:
                logger.info('  Failed calculating insert size.')
            else:
                qconfig.optimal_assembly_insert_size = insert_size
        elif using_reads == 'all' and is_non_empty_file(insert_size_fpath):
            try:
                insert_size = int(open(insert_size_fpath).readline())
                if insert_size:
                    qconfig.optimal_assembly_insert_size = insert_size
            except:
                pass

    if not required_files:
        return sam_fpath, bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info('  Failed detecting uncovered regions.')
        return None, None, None  # keep the three-value return shape used elsewhere in this function

    if calculate_coverage:
        bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
        bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

        if is_non_empty_file(bam_sorted_fpath):
            logger.info('  Using existing sorted BAM-file: ' + bam_sorted_fpath)
        else:
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        if not is_non_empty_file(uncovered_fpath):  # calculate_coverage is already True in this branch
            get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath,
                         correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return sam_fpath, bam_fpath, uncovered_fpath
Example #3
def check_repeats_instances(coords_fpath, repeats_fpath, use_long_reads=False):
    query_instances = defaultdict(list)
    with open(coords_fpath) as f:
        for line in f:
            fs = line.split('\t')
            contig, align_start, align_end, strand, ref_name, ref_start = \
                fs[0], fs[2], fs[3], fs[4], fs[5], fs[7]
            align_start, align_end, ref_start = map(int, (align_start, align_end, ref_start))
            align_start += 1
            ref_start += 1
            matched_bases, bases_in_mapping = map(int, (fs[9], fs[10]))
            if matched_bases > qconfig.optimal_assembly_insert_size:
                query_instances[contig].append((align_start, align_end))
    repeats_regions = defaultdict(list)
    filtered_repeats_fpath = add_suffix(repeats_fpath, 'filtered')
    with open(filtered_repeats_fpath, 'w') as out_f:
        with open(repeats_fpath) as f:
            for line in f:
                fs = line.split()
                query_id = '%s:%s-%s' % (fs[0], fs[1], fs[2])
                if query_id in query_instances and len(query_instances[query_id]) > 1:
                    mapped_repeats = sorted(list(set(query_instances[query_id][1:])))
                    merged_intervals = []
                    i_start, i_end = mapped_repeats[0]
                    merged_interval = (i_start, i_end)
                    for s, e in mapped_repeats[1:]:
                        if s <= merged_interval[1]:
                            merged_interval = (merged_interval[0], max(merged_interval[1], e))
                        else:
                            merged_intervals.append(merged_interval)
                            merged_interval = (s, e)
                    merged_intervals.append(merged_interval)
                    aligned_bases = sum([end - start + 1 for start, end in merged_intervals])
                    if aligned_bases >= (int(fs[2]) - int(fs[1])) * 0.9:
                        if use_long_reads and len(mapped_repeats) > 1:
                            solid_repeats = []
                            full_repeat_pos = int(fs[1])
                            mapped_repeats.sort(key=lambda x: (x[1], x[1] - x[0]), reverse=True)
                            cur_repeat_start, cur_repeat_end = mapped_repeats[0]
                            for repeat_start, repeat_end in mapped_repeats[1:]:
                                if (cur_repeat_start >= repeat_start - REPEAT_CONF_INTERVAL and cur_repeat_end <= repeat_end + REPEAT_CONF_INTERVAL) or \
                                        (repeat_start >= cur_repeat_start - REPEAT_CONF_INTERVAL and repeat_end <= cur_repeat_end + REPEAT_CONF_INTERVAL):
                                    cur_repeat_start, cur_repeat_end = min(repeat_start, cur_repeat_start), max(repeat_end, cur_repeat_end)
                                else:
                                    solid_repeats.append((cur_repeat_start, cur_repeat_end))
                                    cur_repeat_start, cur_repeat_end = repeat_start, repeat_end
                            solid_repeats.append((cur_repeat_start, cur_repeat_end))
                            for repeat in solid_repeats:
                                out_f.write('\t'.join((fs[0], str(repeat[0] + full_repeat_pos), str(repeat[1] + full_repeat_pos))) + '\n')
                                repeats_regions[fs[0]].append((repeat[0] + full_repeat_pos, repeat[1] + full_repeat_pos))
                        else:
                            out_f.write(line)
                            repeats_regions[fs[0]].append((int(fs[1]), int(fs[2])))
    sorted_repeats_fpath = add_suffix(repeats_fpath, 'sorted')
    qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', filtered_repeats_fpath],
                           stdout=open(sorted_repeats_fpath, 'w'), logger=logger)
    return sorted_repeats_fpath, repeats_regions
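
The filter above keeps a repeat only when its merged secondary alignments cover at least 90% of the annotated repeat span. The same check in isolation, as a sketch (the helper name and signature are made up for illustration):

def repeat_mostly_aligned(merged_intervals, repeat_start, repeat_end, min_fraction=0.9):
    # Sum the lengths of the non-overlapping alignment intervals and compare
    # them with the annotated repeat length, as in the condition above.
    aligned_bases = sum(end - start + 1 for start, end in merged_intervals)
    return aligned_bases >= (repeat_end - repeat_start) * min_fraction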
Example #4
def align_reference(ref_fpath, output_dir, using_reads='all'):
    required_files = []
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    if using_reads != 'all':
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and (using_reads == 'all' or using_reads == 'paired_end'):
        required_files.append(insert_size_fpath)

    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)

    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    correct_chr_names, sam_fpath, bam_fpath = align_single_file(ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                                                qconfig.max_threads, sam_fpath=qconfig.reference_sam,
                                                                bam_fpath=qconfig.reference_bam, required_files=required_files,
                                                                is_reference=True, alignment_only=True, using_reads=using_reads)
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath

    if not qconfig.ideal_assembly_insert_size or qconfig.ideal_assembly_insert_size == 'auto':
        if using_reads == 'paired_end' and sam_fpath:
            insert_size = calculate_insert_size(sam_fpath, output_dir, ref_name)
            if not insert_size:
                logger.info('  Failed calculating insert size.')
            else:
                qconfig.ideal_assembly_insert_size = insert_size

    if not required_files:
        return bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info('  Failed detecting uncovered regions.')
        return None, None

    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

    if is_non_empty_file(bam_sorted_fpath):
        logger.info('  Using existing sorted BAM-file: ' + bam_sorted_fpath)
    else:
        sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    if not is_non_empty_file(uncovered_fpath):
        get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath,
                     correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return bam_fpath, uncovered_fpath
Example #5
def connect_with_matepairs(bam_fpath, output_dirpath, err_fpath):
    bam_filtered_fpath = add_suffix(bam_fpath, 'filtered')
    qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                            '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath],
                           stdout=open(bam_filtered_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
    ## sort by read names
    bam_filtered_sorted_fpath = add_suffix(bam_filtered_fpath, 'sorted')
    sort_bam(bam_filtered_fpath, bam_filtered_sorted_fpath, err_fpath, logger, sort_rule='-n')
    bed_fpath = bam_to_bed(output_dirpath, 'matepairs', bam_filtered_sorted_fpath, err_fpath, logger, bedpe=True, only_intervals=True)
    matepair_regions = defaultdict(list)
    with open(bed_fpath) as bed:
        for l in bed:
            fs = l.split()
            matepair_regions[fs[0]].append((int(fs[1]), int(fs[2])))
    return matepair_regions
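
A hedged usage sketch: the returned dictionary maps each reference name to the (start, end) intervals spanned by properly paired mate-pair reads; the file names below are hypothetical.

matepair_regions = connect_with_matepairs('mp_reads.bam', 'output_dir', 'reads_stats.err')
for chrom, intervals in matepair_regions.items():
    # These intervals later decide which covered fragments can be joined (see the next example).
    print(chrom, len(intervals), 'mate-pair intervals')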
Example #6
def fill_gaps_mate_pair(bam_fpath, ref_fpath, assembly_fpath, assembly_covered_regions, output_dir, uncovered_fpath, err_fpath):
    matepair_reads_covered_regions = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=True)
    final_fasta = []
    matepair_regions = connect_with_matepairs(bam_fpath, output_dir, err_fpath)
    final_assembly_fpath = add_suffix(assembly_fpath, mp_polished_suffix)
    for name, seq in fastaparser.read_fasta(ref_fpath):
        covered_regions = list(find_overlaps(assembly_covered_regions[name], matepair_reads_covered_regions[name], overlap=50))
        total_contigs = 0
        if name not in matepair_regions or len(covered_regions) == 1:
            for region in covered_regions:
                final_fasta.append((name.split()[0] + "_" + str(total_contigs + 1), seq[region[0]: region[1]]))
                total_contigs += 1
        else:
            frags_to_merge = [covered_regions.pop(0)]
            sorted_mp_intervals = sorted(matepair_regions[name])
            while covered_regions:
                region2 = covered_regions.pop(0)
                if is_overlapped(frags_to_merge[-1], region2, sorted_mp_intervals):
                    frags_to_merge.append(region2)
                else:
                    merged_seq = merge_fragments_with_ns(seq, frags_to_merge)
                    final_fasta.append((name.split()[0] + "_" + str(total_contigs + 1), merged_seq))
                    total_contigs += 1
                    frags_to_merge = [region2]
            if frags_to_merge:
                merged_seq = merge_fragments_with_ns(seq, frags_to_merge)
                final_fasta.append((name.split()[0] + "_" + str(total_contigs + 1), merged_seq))
                total_contigs += 1
    fastaparser.write_fasta(final_assembly_fpath, final_fasta)
    return final_assembly_fpath
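
merge_fragments_with_ns is defined elsewhere in QUAST and is not shown in these excerpts; a plausible minimal sketch of the idea, joining the covered fragments of a sequence with a run of Ns (the fixed gap length is an assumption, not the real implementation):

def merge_fragments_with_ns(seq, fragments, gap_len=100):
    # Concatenate covered fragments of `seq`, inserting Ns between consecutive
    # fragments to mark the gaps that the mate pairs bridge but do not fill.
    return ('N' * gap_len).join(seq[start:end] for start, end in fragments)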
Example #7
def check_repeats_instances(coords_fpath, repeats_fpath):
    query_instances = dict()
    with open(coords_fpath) as f:
        for line in f:
            fs = line.split('\t')
            contig, align_start, align_end, strand, ref_name, ref_start = \
                fs[0], fs[2], fs[3], fs[4], fs[5], fs[7]
            align_start, align_end, ref_start = map(
                int, (align_start, align_end, ref_start))
            align_start += 1
            ref_start += 1
            matched_bases, bases_in_mapping = map(int, (fs[9], fs[10]))
            score = matched_bases
            if contig in query_instances:
                if score >= max(query_instances[contig]) * 0.8:
                    query_instances[contig].append(score)
            else:
                query_instances[contig] = [score]
    repeats_regions = defaultdict(list)
    filtered_repeats_fpath = add_suffix(repeats_fpath, 'filtered')
    with open(filtered_repeats_fpath, 'w') as out_f:
        with open(repeats_fpath) as f:
            for line in f:
                fs = line.split()
                query_id = '%s:%s-%s' % (fs[0], fs[1], fs[2])
                if query_id in query_instances and len(
                        query_instances[query_id]) > 1:
                    out_f.write(line)
                    repeats_regions[fs[0]].append((int(fs[1]), int(fs[2])))
    return filtered_repeats_fpath, repeats_regions
Example #8
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped and proper_pair')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                               stderr=open(err_fpath, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)
    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        env["PATH"] += os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        qutils.call_subprocess(['java', '-ea', '-Xmx' + str(max_mem) + 'g', '-Dsamjdk.create_index=true', '-Dsamjdk.use_async_io_read_samtools=true',
                                '-Dsamjdk.use_async_io_write_samtools=true', '-Dsamjdk.use_async_io_write_tribble=true',
                                '-cp', get_gridss_fpath(), 'gridss.CallVariants', 'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                                'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'), 'REFERENCE_SEQUENCE=' + cur_ref_fpath,
                                'WORKER_THREADS=' + str(max_threads), 'WORKING_DIR=' + vcf_output_dirpath],
                                stderr=open(err_fpath, 'a'), logger=logger, env=env)
    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        qutils.call_subprocess(['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath, 'R=' + cur_ref_fpath,
                                'INCLUDE_HEADER=TRUE'], stderr=open(err_fpath, 'a'), logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
Example #9
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                               stderr=open(err_fpath, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)
    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        env["PATH"] += os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        qutils.call_subprocess(['java', '-ea', '-Xmx' + str(max_mem) + 'g', '-Dsamjdk.create_index=true', '-Dsamjdk.use_async_io_read_samtools=true',
                                '-Dsamjdk.use_async_io_write_samtools=true', '-Dsamjdk.use_async_io_write_tribble=true',
                                '-cp', get_gridss_fpath(), 'gridss.CallVariants', 'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                                'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'), 'REFERENCE_SEQUENCE=' + cur_ref_fpath,
                                'WORKER_THREADS=' + str(max_threads), 'WORKING_DIR=' + vcf_output_dirpath],
                                stderr=open(err_fpath, 'a'), logger=logger, env=env)
    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        qutils.call_subprocess(['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath, 'R=' + cur_ref_fpath,
                                'INCLUDE_HEADER=TRUE'], stderr=open(err_fpath, 'a'), logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
Example #10
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, output_dir, max_threads, err_fpath):
    merged_bam_fpath = add_suffix(bam_fpath, 'merged')
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if is_non_empty_file(tmp_sam_fpath):
            tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
            tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
            if not is_non_empty_file(tmp_bam_sorted_fpath):
                sambamba_view(tmp_sam_fpath, tmp_bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
                sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
            tmp_bam_fpaths.append(tmp_bam_sorted_fpath)
    qutils.call_subprocess([sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), merged_bam_fpath] + tmp_bam_fpaths,
                           stderr=open(err_fpath, 'a'), logger=logger)
    qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads), '--tmpdir',
                            output_dir, merged_bam_fpath, bam_fpath],
                           stderr=open(err_fpath, 'a'), logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return merged_bam_fpath
Example #11
def run_aligner(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    bwa_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            if reads_type == 'pacbio' or reads_type == 'nanopore':
                if reads_type == 'pacbio':
                    preset = ' -ax map-pb '
                else:
                    preset = ' -ax map-ont '
                cmdline = minimap_fpath() + ' -t ' + str(max_threads) + preset + ref_fpath + ' ' + reads
            else:
                cmdline = bwa_cmd + (' -p ' if reads_type == 'pe' else ' ') + ref_fpath + ' ' + reads
        else:
            read1, read2 = reads
            cmdline = bwa_cmd + ' ' + ref_fpath + ' ' + read1 + ' ' + read2
        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        bam_fpath = output_fpath.replace('.sam', '.bam')
        if not is_non_empty_file(output_fpath):
            qutils.call_subprocess(shlex.split(cmdline), stdout=open(output_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
        if not is_non_empty_file(bam_fpath):
            sambamba_view(output_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            if reads_type == 'pe':
                bam_dedup_fpath = add_suffix(bam_fpath, 'dedup')
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads), '--tmpdir',
                                        output_dir, bam_fpath, bam_dedup_fpath],
                                        stderr=open(err_fpath, 'a'), logger=logger)
                if exists(bam_dedup_fpath):
                    shutil.move(bam_dedup_fpath, bam_fpath)
        if reads_type == 'pe':
            insert_size, std_dev = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
            if insert_size < qconfig.optimal_assembly_max_IS:
                insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.optimal_assembly_insert_size = max(insert_sizes)
        ref_name = qutils.name_from_fpath(ref_fpath)
        insert_size_fpath = join(output_dir, '..', ref_name + '.is.txt')
        with open(insert_size_fpath, 'w') as out:
            out.write(str(qconfig.optimal_assembly_insert_size))
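
The insert size stored here is read back by align_reference (see Example #2); a minimal sketch of that round trip, reusing the insert_size_fpath defined above:

# Hypothetical read-back of the stored insert size, mirroring Example #2.
with open(insert_size_fpath) as f:
    stored_insert_size = int(f.readline())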
Example #12
def get_joiners(ref_name, sam_fpath, bam_fpath, output_dirpath, err_fpath,
                using_reads):
    bam_filtered_fpath = add_suffix(bam_fpath, 'filtered')
    if not is_non_empty_file(bam_filtered_fpath):
        filter_rule = 'not unmapped and not supplementary and not secondary_alignment'
        sambamba_view(bam_fpath,
                      bam_filtered_fpath,
                      qconfig.max_threads,
                      err_fpath,
                      logger,
                      filter_rule=filter_rule)
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    if not is_non_empty_file(bam_sorted_fpath):
        sort_bam(bam_filtered_fpath,
                 bam_sorted_fpath,
                 err_fpath,
                 logger,
                 sort_rule='-n')
    bed_fpath = bam_to_bed(output_dirpath,
                           using_reads,
                           bam_sorted_fpath,
                           err_fpath,
                           logger,
                           bedpe=using_reads == 'mp')
    intervals = defaultdict(list)
    if using_reads == 'mp':
        insert_size, std_dev = calculate_insert_size(sam_fpath,
                                                     output_dirpath,
                                                     ref_name,
                                                     reads_suffix='mp')
        min_is = insert_size - std_dev
        max_is = insert_size + std_dev
    with open(bed_fpath) as bed:
        for l in bed:
            fs = l.split()
            if using_reads == 'mp' and insert_size:
                interval_len = int(fs[2]) - int(fs[1])
                if min_is <= abs(interval_len) <= max_is:
                    intervals[fs[0]].append((int(fs[1]), int(fs[2])))
            else:
                intervals[fs[0]].append((int(fs[1]), int(fs[2])))
    return intervals
Example #13
def fill_gaps_single(ref_fpath, assembly_fpath, assembly_covered_regions, uncovered_fpath):
    single_reads_covered_regions = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=True)
    final_assembly_fpath = add_suffix(assembly_fpath, single_polished_suffix)
    final_fasta = []
    for name, seq in fastaparser.read_fasta(ref_fpath):
        covered_regions = find_overlaps(assembly_covered_regions[name], single_reads_covered_regions[name], overlap=50)
        for i, region in enumerate(covered_regions):
            start, end = region
            final_fasta.append((name.split()[0] + "_" + str(i + 1), seq[start: end]))
    fastaparser.write_fasta(final_assembly_fpath, final_fasta)
    return final_assembly_fpath
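
find_overlaps is another helper not shown in these excerpts; judging from how it is used here, it intersects two sorted interval lists and keeps intersections of at least `overlap` bases. A sketch under that assumption (not the actual QUAST code):

def find_overlaps(regions_a, regions_b, overlap=50):
    # Two-pointer sweep over two sorted, non-overlapping interval lists,
    # yielding every intersection that is at least `overlap` bases long.
    i, j = 0, 0
    while i < len(regions_a) and j < len(regions_b):
        start = max(regions_a[i][0], regions_b[j][0])
        end = min(regions_a[i][1], regions_b[j][1])
        if end - start >= overlap:
            yield (start, end)
        if regions_a[i][1] < regions_b[j][1]:
            i += 1
        else:
            j += 1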
Example #14
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath):
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if is_non_empty_file(tmp_sam_fpath):
            tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
            tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
            if not is_non_empty_file(tmp_bam_sorted_fpath):
                sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
            tmp_bam_fpaths.append(tmp_bam_sorted_fpath)
    qutils.call_subprocess([sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), bam_fpath] + tmp_bam_fpaths,
                           stderr=open(err_fpath, 'a'), logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return sam_fpath
Example #15
def seq_to_kmc_db(tmp_dirpath, log_fpath, err_fpath, fasta_fpath=None, seq=None, name=None, is_ref=False,
                     intersect_with=None, kmer_fraction=1):
    can_reuse = True
    if not fasta_fpath:
        can_reuse = False
        fasta_fpath = join(tmp_dirpath, name + '.fasta')
        if is_ref:
            fasta_fpath = add_suffix(fasta_fpath, 'reference')
        with open(fasta_fpath, 'w') as out_f:
            out_f.write(seq)
    kmc_out_fpath = count_kmers(tmp_dirpath, fasta_fpath, log_fpath, err_fpath, can_reuse=can_reuse)
    if intersect_with:
        kmc_out_fpath = intersect_kmers(tmp_dirpath, [kmc_out_fpath, intersect_with], log_fpath, err_fpath)
    return kmc_out_fpath
Example #16
def align_ideal_assembly(ref_fpath, assembly_fpath, output_dir, log_fpath, err_fpath):
    sam_fpath = join(output_dir, basename(assembly_fpath) + '.sam')
    bam_fpath = sam_fpath.replace('.sam', '.bam')
    bam_mapped_fpath = add_suffix(bam_fpath, 'mapped')
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    if not is_non_empty_file(bam_fpath):
        bwa_index(ref_fpath, err_fpath, logger)
        qutils.call_subprocess([bwa_fpath('bwa'), 'mem', '-t', str(qconfig.max_threads), ref_fpath, assembly_fpath],
                               stdout=open(sam_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-S', sam_fpath], stdout=open(bam_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
    if not is_non_empty_file(bam_sorted_fpath):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-F', 'not unmapped', bam_fpath],
                               stdout=open(bam_mapped_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    cov_fpath = join(output_dir, basename(assembly_fpath) + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    ref_name = qutils.name_from_fpath(ref_fpath)
    correct_chr_names = get_correct_names_for_chroms(output_dir, ref_fpath, sam_fpath, err_fpath, assembly_fpath, logger)
    get_coverage(output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_fpath, err_fpath,
                 correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return uncovered_fpath
Example #17
def get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path, err_fpath, cov_fpath, chr_len_fpath):
    if not isfile(bedtools_fpath('bamToBed')):
        logger.info('  Failed calculating physical coverage...')
        return None
    raw_cov_fpath = add_suffix(cov_fpath, 'raw')
    if not is_non_empty_file(raw_cov_fpath):
        logger.info('  Calculating physical coverage...')
        ## keep properly mapped, unique, and non-duplicate read pairs only
        bam_filtered_fpath = join(output_dirpath, ref_name + '.filtered.bam')
        sambamba_view(bam_fpath, bam_filtered_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule='proper_pair and not supplementary and not duplicate')
        ## sort by read names
        bam_filtered_sorted_fpath = join(output_dirpath, ref_name + '.filtered.sorted.bam')
        sort_bam(bam_filtered_fpath, bam_filtered_sorted_fpath, err_fpath, logger, sort_rule='-n')
        bed_fpath = bam_to_bed(output_dirpath, ref_name, bam_filtered_sorted_fpath, err_fpath, logger, bedpe=True)
        calculate_genome_cov(bed_fpath, raw_cov_fpath, chr_len_fpath, err_fpath, logger)
    return raw_cov_fpath
Example #18
def run_bwa(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    bwa_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            cmd = bwa_cmd + (' -p ' if reads_type != 'single' else ' ') + ref_fpath + ' ' + reads
        else:
            read1, read2 = reads
            cmd = bwa_cmd + ' ' + ref_fpath + ' ' + read1 + ' ' + read2
        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        if not is_non_empty_file(output_fpath):
            qutils.call_subprocess(shlex.split(cmd), stdout=open(output_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
            if reads_type == 'paired_end':
                insert_size = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
                if insert_size < qconfig.ideal_assembly_max_IS:
                    insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.ideal_assembly_insert_size = max(insert_sizes)
Example #19
def download_wgsmaster_contigs(ref_id, ref_fpath):
    temp_fpath = add_suffix(ref_fpath, 'tmp') + '.gz'
    response = try_send_request(
        ncbi_url +
        'esummary.fcgi?db=nuccore&id=%s&rettype=text&validate=false' % ref_id)
    xml_tree = ET.fromstring(response)

    for field in xml_tree[0]:
        if field.get('Name') == 'Extra':
            download_system = field.text.split('|')[-1][:6]
            genome_version = int(field.text.split('|')[3].split('.')[-1])
            break
    fsize = None
    while genome_version != 0 and not fsize:
        try:
            fname = "%s.%s.fsa_nt.gz" % (download_system, genome_version)
            url = "ftp://ftp.ncbi.nlm.nih.gov/sra/wgs_aux/%s/%s/%s/%s" % (
                download_system[:2], download_system[2:4], download_system,
                fname)
            response = urlopen(url)
            meta = response.info()
            fsize = int(meta.getheaders("Content-length")[0])
            bsize = 1048576
        except:
            fsize = None
            if genome_version != 0:
                genome_version -= 1
    with open(temp_fpath, 'wb') as f:
        while True:
            buffer = response.read(bsize)
            if not buffer:
                break
            f.write(buffer)

    with open(ref_fpath, 'w') as f:
        subprocess.call(['gunzip', '-c', temp_fpath], stdout=f)
    os.remove(temp_fpath)
Example #20
def do(ref_fpath, original_ref_fpath, output_dirpath):
    logger.print_timestamp()
    logger.main_info("Simulating Optimal Assembly...")

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath),
                              qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath,
            reads_analyzer_dir,
            using_reads='all',
            calculate_coverage=True)
    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (
        ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename,
                                     long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    original_ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath),
        prepared_optimal_assembly_basename)

    if os.path.isfile(result_fpath) or os.path.isfile(
            ref_prepared_optimal_assembly):
        already_done_fpath = result_fpath if os.path.isfile(
            result_fpath) else ref_prepared_optimal_assembly
        logger.notice(
            '  Will reuse already generated Optimal Assembly with insert size %d (%s)'
            % (insert_size, already_done_fpath))
        return already_done_fpath

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            '  Sorry, can\'t create Optimal Assembly on this platform, skipping...'
        )
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red',
                                          red_dirpath,
                                          'red',
                                          platform_specific=True,
                                          is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning('  Sorry, can\'t create Optimal Assembly, skipping...')
        return None

    log_fpath = os.path.join(output_dirpath, 'optimal_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size,
        uncovered_fpath)
    if unique_covered_regions is None:
        logger.error(
            '  Failed to create Optimal Assembly, see log for details: ' +
            log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath,
                              bam_fpath, tmp_dir, log_fpath, join_reads)
        uncovered_regions = parse_uncovered_fpath(
            uncovered_fpath, ref_fpath, return_covered_regions=False
        ) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom],
                                                 joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom],
                                               region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq),
                                          ref_coords_to_output,
                                          repeats_regions[chrom],
                                          uncovered_regions[chrom])
    else:
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append(
                        (chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info('  ' + 'Theoretically optimal Assembly saved to ' +
                result_fpath)
    logger.notice(
        'You can copy it to ' + ref_prepared_optimal_assembly +
        ' and QUAST will reuse it in further runs against the same reference ('
        + original_ref_fpath + ')')

    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
Example #21
def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths, ref_labels, temp_output_dir, output_dir,
                         log_path, err_fpath):
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)

        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]

        if qconfig.no_sv:
            logger.info('  Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info('  Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info('  Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True
        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info('  Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info('  Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            logger.info('  Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir, log_path, err_fpath, max_threads_per_job,
                            sam_fpaths[index], bam_fpaths[index], index) for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        parallel_align_args.append((main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                    max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam, None, required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)
    if not main_ref_fpath:
        return None, None, None

    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info('  Failed searching structural variations.')
        return None, None, None

    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

    if is_non_empty_file(sam_sorted_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        if not is_non_empty_file(bam_sorted_fpath):
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)
    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(temp_output_dir, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_fpath, correct_chr_names, cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info('  Splitting SAM-file by references...')
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info('    Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None
                else:
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True

        trivial_deletions_fpath = \
            search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files, ref_labels, seq_lengths, need_ref_splitting)
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths, temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except:
                pass
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info('  Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info('  No structural variations were found.')
            else:
                logger.main_info('  Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info('  Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info('  Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
Example #22
def align_and_analyze(is_cyclic,
                      index,
                      contigs_fpath,
                      output_dirpath,
                      ref_fpath,
                      old_contigs_fpath,
                      bed_fpath,
                      parallel_by_chr=False,
                      threads=1):
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(
            output_dirpath,
            qconfig.contig_report_fname_pattern % corr_assembly_label +
            '.stdout')
        log_err_fpath = join(
            output_dirpath,
            qconfig.contig_report_fname_pattern % corr_assembly_label +
            '.stderr')
        icarus_out_fpath = join(
            output_dirpath,
            qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(
            output_dirpath,
            qconfig.contig_report_fname_pattern % corr_assembly_label +
            '.mis_contigs.info')
        unaligned_info_fpath = join(
            output_dirpath,
            qconfig.contig_report_fname_pattern % corr_assembly_label +
            '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = [
        'S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous',
        'Best_group'
    ]
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' +
                    log_out_fpath + ' and ' + os.path.basename(log_err_fpath) +
                    '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath,
                                  old_contigs_fpath, index, parallel_by_chr,
                                  threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error(
                    '  ' + qutils.index_to_str(index) +
                    'Failed aligning contigs ' +
                    qutils.label_from_fpath(contigs_fpath) +
                    ' to the reference (non-zero exit code). ' +
                    ('Run with the --debug flag to see additional information.'
                     if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(
                    qutils.index_to_str(index) + 'Alignment failed for ' +
                    contigs_fpath + ':' + coords_fpath + 'doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) +
                            'Alignment failed for ' + '\'' + assembly_label +
                            '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(
                    qutils.index_to_str(index) + 'Nothing aligned for ' +
                    contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) +
                            'Nothing aligned for ' + '\'' + assembly_label +
                            '\'.')
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    coords_file = open(coords_fpath)
    coords_filtered_file = open(coords_filtered_fpath, 'w')
    coords_filtered_file.write(coords_file.readline())
    coords_filtered_file.write(coords_file.readline())
    for line in coords_file:
        if line.strip() == '':
            break
        assert line[0] != '='
        #Clear leading spaces from nucmer output
        #Store nucmer lines in an array
        mapping = Mapping.from_line(line)
        aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    references = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        references[name] = seq
        log_out_f.write('\tLoaded [%s]\n' % name)

    #Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    snps = {}
    if qconfig.show_snps:
        prev_line = None
        for line in open_gzipsafe(show_snps_fpath):
            #print "$line";
            line = line.split()
            if not line[0].isdigit():
                continue
            if prev_line and line == prev_line:
                continue
            ref = line[10]
            ctg = line[11]
            pos = int(
                line[0]
            )  # Kolya: python don't convert int<->str types automatically
            loc = int(line[3])  # Kolya: same as above

            # if (! exists $line[11]) { die "Malformed line in SNP file.  Please check that show-snps has completed succesfully.\n$line\n[$line[9]][$line[10]][$line[11]]\n"; }
            if pos in snps.setdefault(ref, {}).setdefault(ctg, {}):
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos].append(
                    SNP(ref_pos=pos,
                        ctg_pos=loc,
                        ref_nucl=line[1],
                        ctg_nucl=line[2]))
            else:
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos] = [
                    SNP(ref_pos=pos,
                        ctg_pos=loc,
                        ref_nucl=line[1],
                        ctg_nucl=line[2])
                ]
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    regions = {}
    ref_lens = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq in references.items():
        regions.setdefault(name, []).append([1, len(seq)])
        ref_lens[name] = len(seq)
        total_regions += 1
        total_reg_len += ref_lens[name]
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f,
                         misassembly_f=misassembly_f,
                         coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file,
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic)
    if qconfig.large_genome:
        log_out_f.write('Analyzing large blocks...\n')
        large_misassembly_fpath = add_suffix(
            misassembly_fpath,
            'large_blocks') if not qconfig.space_efficient else '/dev/null'
        ca_large_output = CAOutput(stdout_f=log_out_f,
                                   misassembly_f=open(large_misassembly_fpath,
                                                      'w'),
                                   coords_filtered_f=coords_filtered_file,
                                   used_snps_f=open('/dev/null', 'w'),
                                   icarus_out_f=open('/dev/null', 'w'))
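        # Switch to the large-genome thresholds for this pass; the original values are restored right after analyze_contigs() returns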
        min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold
        qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD
        result.update(
            analyze_contigs(ca_large_output,
                            contigs_fpath,
                            '/dev/null',
                            '/dev/null',
                            aligns,
                            ref_features,
                            ref_lens,
                            is_cyclic,
                            large_misassemblies_search=True)[0])
        qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(
        analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps,
                         total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath,
                           total_indels_info, result)

    if not qconfig.space_efficient:
        # Output the misassembled contigs to a separate FASTA file
        fasta = [(name, seq)
                 for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs]
        fastaparser.write_fasta(
            join(output_dirpath,
                 qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'),
            fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(
            output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(
            output_dirpath,
            qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' +
                     qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            match = re.search(r'_length_([\d\.]+)_cov_([\d\.]+)', contig)
                            if match:
                                contig_len, contig_cov = match.group(1), match.group(2)
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' + str(aligned_len) +
                                                           '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
Exemplo n.º 24
0
def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths, ref_labels, temp_output_dir, output_dir,
                         log_path, err_fpath):
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)

        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]
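        # required_files lists the outputs that still have to be produced; it is cleared below if everything needed already exists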

        if qconfig.no_sv:
            logger.info('  Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info('  Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info('  Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True
        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info('  Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info('  Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            logger.info('  Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
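    # Align each assembly (and, if a reference is given, the reference itself) in parallel, dividing the available threads between jobs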
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir, log_path, err_fpath, max_threads_per_job,
                            sam_fpaths[index], bam_fpaths[index], index) for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        parallel_align_args.append((main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                    max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam, None, required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)
    if not main_ref_fpath:
        return None, None, None
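    # The reference alignment job was appended last, so its chromosome names and SAM/BAM paths sit at the end of the returned lists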

    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info('  Failed searching structural variations.')
        return None, None, None

    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

    if is_non_empty_file(sam_sorted_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        if not is_non_empty_file(bam_sorted_fpath):
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)
    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(temp_output_dir, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_fpath, correct_chr_names, cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info('  Splitting SAM-file by references...')
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
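        # headers now holds the SAM header lines; seq_lengths maps each reference sequence name (@SQ SN:) to its length (@SQ LN:)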
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info('    Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None
                else:
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True

        trivial_deletions_fpath = \
            search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files, ref_labels, seq_lengths, need_ref_splitting)
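        # Trivial deletions are detected directly from the sorted SAM file; if GRIDSS is available, its SV calls are merged with them into the BED file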
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths, temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except Exception:
                pass  # if GRIDSS fails, fall back to the trivial deletions only
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info('  Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info('  No structural variations were found.')
            else:
                logger.main_info('  Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info('  Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info('  Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
Exemplo n.º 25
0
def do(ref_fpath, original_ref_fpath, output_dirpath):
    logger.print_timestamp()
    logger.main_info("Generating Upper Bound Assembly...")

    if not reads_analyzer.compile_reads_analyzer_tools(logger):
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly '
            '(failed to compile necessary third-party read processing tools [bwa, bedtools, minimap2]), skipping...'
        )
        return None

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly on this platform '
            '(only linux64 and macOS are supported), skipping...')
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red',
                                          red_dirpath,
                                          'red',
                                          platform_specific=True,
                                          is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly '
            '(failed to install/download third-party repeat finding tool [Red]), skipping...'
        )
        return None

    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (
        ref_basename, qconfig.optimal_assembly_basename, insert_size)
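    # The result name encodes the reference name, the upper-bound basename and the insert size; polishing suffixes are added below when long reads or mate pairs are used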
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename,
                                     long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    original_ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    if long_reads:
        prepared_optimal_assembly_basename = add_suffix(
            prepared_optimal_assembly_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        prepared_optimal_assembly_basename = add_suffix(
            prepared_optimal_assembly_basename, mp_polished_suffix)
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath),
        prepared_optimal_assembly_basename)
    already_done_fpath = check_prepared_optimal_assembly(
        insert_size, result_fpath, ref_prepared_optimal_assembly)
    if already_done_fpath:
        return already_done_fpath

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath),
                              qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath,
            reads_analyzer_dir,
            using_reads='all',
            calculate_coverage=True)

    if qconfig.optimal_assembly_insert_size != 'auto' and qconfig.optimal_assembly_insert_size != insert_size:
        calculated_insert_size = qconfig.optimal_assembly_insert_size
        result_fpath = result_fpath.replace('is' + str(insert_size),
                                            'is' + str(calculated_insert_size))
        prepared_optimal_assembly_basename = prepared_optimal_assembly_basename.replace(
            'is' + str(insert_size), 'is' + str(calculated_insert_size))
        insert_size = calculated_insert_size
        ref_prepared_optimal_assembly = os.path.join(
            os.path.dirname(original_ref_fpath),
            prepared_optimal_assembly_basename)
        already_done_fpath = check_prepared_optimal_assembly(
            insert_size, result_fpath, ref_prepared_optimal_assembly)
        if already_done_fpath:
            return already_done_fpath

    log_fpath = os.path.join(output_dirpath, 'upper_bound_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath,
        tmp_dir,
        log_fpath,
        binary_fpath,
        insert_size,
        uncovered_fpath,
        use_long_reads=long_reads)
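    # Both structures map a chromosome name to a list of [start, end] intervals: uniquely covered regions and detected repeats, respectively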
    if unique_covered_regions is None:
        logger.error(
            '  Failed to create Upper Bound Assembly, see log for details: ' +
            log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
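        # With long reads or mate pairs, the uniquely covered regions can be scaffolded together using reads that span the gaps between them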
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath,
                              bam_fpath, tmp_dir, log_fpath, join_reads)
        uncovered_regions = parse_bed(
            uncovered_fpath) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom],
                                                 joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom],
                                               region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq),
                                          ref_coords_to_output,
                                          repeats_regions[chrom],
                                          uncovered_regions[chrom])
    else:
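        # Without joining reads, each sufficiently long uniquely covered region is written out as a separate contig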
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append(
                        (chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info('  ' + 'Theoretical Upper Bound Assembly is saved to ' +
                result_fpath)
    logger.notice(
        '(on reusing *this* Upper Bound Assembly in the *future* evaluations on *the same* dataset)\n'
        '\tThe next time, you can simply provide this file as an additional assembly (you could also rename it to UpperBound.fasta for clarity). '
        'In this case, you do not need to specify --upper-bound-assembly or provide files with reads (--pe1/pe2, etc).\n'
        '\t\tOR\n'
        '\tYou can copy ' + result_fpath + ' to ' +
        ref_prepared_optimal_assembly + '. '
        'The next time you evaluate assemblies with --upper-bound-assembly option and against the same reference ('
        + original_ref_fpath + ') and '
        'the same reads (or if you specify the insert size of the paired-end reads explicitly with --est-insert-size '
        + str(insert_size) + '), '
        'QUAST will reuse this Upper Bound Assembly.\n')

    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
Exemplo n.º 26
0
def do(ref_fpath, original_ref_fpath, output_dirpath):
    logger.print_timestamp()
    logger.main_info("Simulating Ideal Assembly...")

    uncovered_fpath = None
    if qconfig.paired_reads or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, uncovered_fpath = reads_analyzer.align_reference(ref_fpath, join(dirname(output_dirpath),
                                                                    qconfig.reads_stats_dirname), using_reads='paired_end')
    insert_size = qconfig.ideal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.ideal_assembly_default_IS
    if insert_size % 2 == 0:
        insert_size += 1
        logger.notice('  Current implementation cannot work with even insert sizes, '
                      'will use the closest odd value (%d)' % insert_size)

    ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (ref_basename, qconfig.ideal_assembly_basename, insert_size)
    if qconfig.paired_reads and qconfig.unpaired_reads:
        result_basename = add_suffix(result_basename, single_polished_suffix)
    if qconfig.paired_reads and qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    original_ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(original_ref_fpath))
    prepared_ideal_assembly_basename = '%s.%s.is%d.fasta' % (original_ref_basename, qconfig.ideal_assembly_basename, insert_size)
    ref_prepared_ideal_assembly = os.path.join(os.path.dirname(original_ref_fpath), prepared_ideal_assembly_basename)

    if os.path.isfile(result_fpath) or os.path.isfile(ref_prepared_ideal_assembly):
        already_done_fpath = result_fpath if os.path.isfile(result_fpath) else ref_prepared_ideal_assembly
        logger.notice('  Will reuse already generated Ideal Assembly with insert size %d (%s)' %
                      (insert_size, already_done_fpath))
        return already_done_fpath

    if qconfig.platform_name == 'linux_32':
        logger.warning('  Sorry, can\'t create Ideal Assembly on this platform, skipping...')
        return None

    base_aux_dir = os.path.join(qconfig.LIBS_LOCATION, 'ideal_assembly')
    configs_dir = os.path.join(base_aux_dir, 'configs')
    binary_fpath = download_external_tool('spades', os.path.join(base_aux_dir, 'bin'), 'spades', platform_specific=True)
    if not os.path.isfile(binary_fpath):
        logger.warning('  Sorry, can\'t create Ideal Assembly, skipping...')
        return None

    log_fpath = os.path.join(output_dirpath, 'spades.log')

    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    processed_ref_fpath = preprocess_reference(ref_fpath, tmp_dir, uncovered_fpath)
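    # Prepare the reference for SPAdes; the uncovered regions detected above (if any) are passed along via uncovered_fpath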

    dst_configs = os.path.join(tmp_dir, 'configs')
    main_config = os.path.join(dst_configs, 'config.info')
    dir_util._path_created = {}  # see http://stackoverflow.com/questions/9160227/dir-util-copy-tree-fails-after-shutil-rmtree
    dir_util.copy_tree(configs_dir, dst_configs, preserve_times=False)

    prepare_config_spades(main_config, insert_size, processed_ref_fpath, tmp_dir)

    log_file = open(log_fpath, 'w')
    spades_output_fpath = os.path.join(tmp_dir, 'K%d' % insert_size, 'ideal_assembly.fasta')
    logger.info('  ' + 'Running SPAdes with K=' + str(insert_size) + '...')
    return_code = qutils.call_subprocess(
        [binary_fpath, main_config], stdout=log_file, stderr=log_file, indent='    ')
    if return_code != 0 or not os.path.isfile(spades_output_fpath):
        logger.error('  Failed to create Ideal Assembly, see log for details: ' + log_fpath)
        return None

    if qconfig.mate_pairs or qconfig.unpaired_reads:
        spades_output_fpath = polish_assembly(ref_fpath, spades_output_fpath, output_dirpath, tmp_dir)

    shutil.move(spades_output_fpath, result_fpath)
    logger.info('  ' + 'Ideal Assembly saved to ' + result_fpath)
    logger.notice('You can copy it to ' + ref_prepared_ideal_assembly +
                  ' and QUAST will reuse it in further runs against the same reference (' + original_ref_fpath + ')')

    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath