Example #1
def main():
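    """Prepare the target BED file (falling back to the genome CDS when none
    is given) and run Seq2C CNV calling on all samples.
    """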
    cnf, samples, bed_fpath, output_dir = proc_args(sys.argv)
    info('Processing ' + str(len(samples)) + ' samples')

    if cnf.prep_bed is not False:
        if not bed_fpath:
            info('No input BED is specified, using CDS instead from ' + str(cnf.genome.cds))
            bed_fpath = verify_bed(cnf.genome.cds, 'CDS bed file for ' + cnf.genome.name)

        seq2c_bed_fname = basename(bed_fpath)

        bed_cols = count_bed_cols(bed_fpath)
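        # a Seq2C BED is assumed to need at least 4 columns
        # (chrom, start, end, gene name); prepare/annotate it otherwise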
        if bed_cols < 4:
            check_genome_resources(cnf)
            _, _, _, bed_fpath = prepare_beds(cnf, None, None, bed_fpath)

        try:
            copyfile(bed_fpath, join(output_dir, seq2c_bed_fname))
        except OSError:
            err(format_exc())
            info()
        else:
            info('Seq2C bed file is saved in ' + join(output_dir, seq2c_bed_fname))

    bed_fpath = verify_bed(bed_fpath, is_critical=True, description='Input BED file')
    info('Using target ' + bed_fpath)

    run_seq2c(cnf, output_dir, samples, bed_fpath, cnf.is_wgs)
Example #2
def main(args):
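    """Run UCSC liftOver on every .bed file found under InputRootDirectory,
    using the chain file for the requested build (hg38 by default), and mirror
    the directory layout under OutputRootDirectory.
    """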
    if len(args) < 2:
        critical('Usage: ' + __file__ +
                 ' InputRootDirectory OutputRootDirectory [Build=hg38]')
        sys.exit(1)

    inp_root = adjust_path(args[0])
    out_root = adjust_path(args[1])

    build = 'hg38'
    if len(args) >= 3:
        build = args[2]

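    # 'chains' (defined elsewhere in this module) is assumed to map a
    # lower-cased build name to its liftOver chain file path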
    chain_fpath = chains[build.lower()]

    for inp_dirpath, subdirs, files in os.walk(inp_root):
        for fname in files:
            if fname.endswith('.bed'):
                inp_fpath = adjust_path(join(inp_dirpath, fname))
                print(inp_fpath + ': ' +
                      str(count_bed_cols(inp_fpath)) + ' columns')

                out_dirpath = adjust_path(
                    join(out_root, relpath(inp_dirpath, inp_root)))
                safe_mkdir(out_dirpath)
                out_fpath = adjust_path(join(out_dirpath, fname))
                unlifted_fpath = adjust_path(
                    join(out_dirpath, fname + '.unlifted'))

                # liftOver should read the original BED unless it first has to
                # be cut down to 4 columns (columns 7-8 must be integers)
                cmdline = ''
                lift_inp_fpath = inp_fpath

                with open(inp_fpath) as f:
                    fs = f.readline().split('\t')
                try:
                    int(fs[6])
                    int(fs[7])
                except (IndexError, ValueError):
                    info('Cutting ' + inp_fpath)
                    cmdline += 'cut -f1,2,3,4 "{inp_fpath}" > __cut; '
                    lift_inp_fpath = '__cut'

                cmdline += liftover_fpath + ' "{lift_inp_fpath}" {chain_fpath} "{out_fpath}" "{unlifted_fpath}"'
                cmdline = cmdline.format(**locals())
                info(cmdline)
                os.system(cmdline)
                verify_file(out_fpath)
                if isfile(unlifted_fpath):
                    if getsize(unlifted_fpath) <= 0:
                        os.remove(unlifted_fpath)
                    else:
                        err('Some records were unlifted and saved to ' +
                            unlifted_fpath)
Example #3
def _read_vcf_records_per_bed_region_and_clip_vcf(cnf, vcf_fpath, bed_fpath,
                                                  region_type, sample):
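    """Intersect the VCF with the BED regions using bedtools, collect variants
    annotated as Hotspot or Deleterious per BED region, and write a clipped
    VCF that keeps only those records.
    """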
    info()
    info('Intersecting VCF ' + vcf_fpath + ' using BED ' + bed_fpath)

    vcf_columns_num = count_bed_cols(vcf_fpath)
    bed_columns_num = count_bed_cols(bed_fpath)

    vcf_bed_intersect = join(
        cnf.work_dir,
        splitext(basename(vcf_fpath))[0] + '_' + region_type +
        '_vcf_bed.intersect')
    bedtools = get_system_path(cnf, 'bedtools')
    if not cnf.reuse_intermediate or not verify_file(
            vcf_bed_intersect, silent=True, is_critical=False):
        cmdline = '{bedtools} intersect -header -a {vcf_fpath} -b {bed_fpath} -wo'.format(
            **locals())
        res = call(cnf,
                   cmdline,
                   output_fpath=vcf_bed_intersect,
                   max_number_of_tries=1,
                   exit_on_error=False)
        if not res:
            return None, None, None, None

    regions_in_order = []
    regions_set = set()
    vars_by_region = defaultdict(dict)
    var_by_site = dict()

    clipped_vcf_fpath = intermediate_fname(cnf,
                                           splitext(basename(vcf_fpath))[0],
                                           '_' + region_type + '_clip')

    with open(vcf_bed_intersect) as f, open(clipped_vcf_fpath,
                                            'w') as clip_vcf:
        for l in f:
            l = l.strip()
            if not l or l.startswith('#'):
                clip_vcf.write(l + '\n')
                continue
            fs = l.split('\t')
            chrom, pos, id_, ref, alt, qual, filt, info_fields = fs[:8]
            chrom_b, start_b, end_b, symbol, strand, feature, biotype = None, None, None, None, None, None, None
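            # 'bedtools intersect -wo' appends the full BED entry plus an
            # overlap-length column to each VCF record, so the last
            # bed_columns_num + 1 fields belong to the BED side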
            if bed_columns_num >= 8:
                (chrom_b, start_b, end_b, symbol, _,
                 strand, feature, biotype, _) = fs[-(bed_columns_num + 1):][:9]
            elif bed_columns_num >= 4:
                chrom_b, start_b, end_b, symbol, _ = fs[-(bed_columns_num + 1):][:5]
            assert chrom == chrom_b, l
            r = chrom, id_, start_b, end_b, symbol, strand, feature, biotype
            if r not in regions_set:
                regions_set.add(r)
                regions_in_order.append(r)

            cls = None
            if '=Hotspot' in info_fields:
                cls = 'Hotspot'
            if '=Deleterious' in info_fields:
                cls = 'Deleterious'
            if cls:
                var = Variant(chrom, pos, ref, alt, cls)
                vars_by_region[r][(chrom, pos, ref, alt)] = var
                var_by_site[(chrom, pos, ref, alt)] = var
                clip_vcf.write('\t'.join(
                    [chrom, pos, id_, ref, alt, qual, filt, info_fields]) +
                               '\n')

    clipped_gz_vcf_fpath = bgzip_and_tabix(cnf,
                                           clipped_vcf_fpath,
                                           max_number_of_tries=1,
                                           exit_on_error=False)

    return clipped_gz_vcf_fpath, regions_in_order, vars_by_region, var_by_site
Example #4
def __seq2c_coverage(cnf, samples, bams_by_sample, bed_fpath, is_wgs, output_fpath):
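    """Build a combined Seq2C coverage (seq2cov) file for all samples: reuse
    existing targetcov output where available, otherwise submit sambamba depth
    jobs in batches, convert the results to seq2cov format, and concatenate
    the per-sample files into output_fpath.
    """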
    if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    jobs_by_sample = dict()
    depth_output_by_sample = dict()
    seq2cov_output_by_sample = dict()
    seq2c_work_dirpath = join(cnf.work_dir, source.seq2c_name)
    safe_mkdir(seq2c_work_dirpath)
    info()

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0
    not_submitted_samples = samples
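    # Submit at most cnf.threads depth jobs per batch, waiting for each batch
    # to finish before submitting more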
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []

        for s in not_submitted_samples:
            info('*' * 50)
            info(s.name + ':')
            with with_cnf(cnf, work_dir=join(cnf.work_dir, s.name)) as cnf:
                safe_mkdir(cnf.work_dir)
                seq2cov_output_by_sample[s.name] = join(seq2c_work_dirpath, s.name + '.seq2cov.txt')

                if not cnf.reuse_intermediate and isfile(seq2cov_output_by_sample[s.name]):
                    os.remove(seq2cov_output_by_sample[s.name])

                if cnf.reuse_intermediate and verify_file(seq2cov_output_by_sample[s.name], silent=True):
                    info(seq2cov_output_by_sample[s.name] + ' exists, reusing')
                    reused_samples.append(s)
                    continue

                elif verify_file(s.targetcov_detailed_tsv, silent=True):
                    info('Using targetcov detailed output for Seq2C coverage.')
                    info(s.name + ': using targetseq output')
                    targetcov_details_to_seq2cov(cnf, s.targetcov_detailed_tsv, seq2cov_output_by_sample[s.name], s.name, is_wgs=is_wgs)
                    reused_samples.append(s)
                    continue

                else:
                    info(s.name + ': ' + s.targetcov_detailed_tsv + ' does not exist: submitting sambamba depth')
                    bam_fpath = bams_by_sample[s.name]
                    depth_output = join(seq2c_work_dirpath, s.name + '_depth' + '.txt')
                    depth_output_by_sample[s.name] = depth_output
                    if cnf.reuse_intermediate and verify_file(depth_output, silent=True):
                        info(depth_output + ' exists, reusing')
                        reused_samples.append(s)
                        continue
                    else:
                        j = sambamba_depth(cnf, bed_fpath, bam_fpath, depth_output, use_grid=True, sample_name=s.name)
                        jobs_by_sample[s.name] = j
                        submitted_samples.append(s)

                        if not j.is_done:
                            jobs_to_wait.append(j)

                        if len(jobs_to_wait) >= cnf.threads:
                            not_submitted_samples = [_s for _s in not_submitted_samples if
                                                     _s not in submitted_samples and
                                                     _s not in reused_samples]

                            if not_submitted_samples:
                                info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting for them to finish '
                                     'before submitting the remaining ' + str(len(not_submitted_samples)))
                            else:
                                info('Submitted the last ' + str(len(jobs_to_wait)) + ' jobs.')
                            info()
                            break
                        info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No depth jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished ' + str(len(jobs_to_wait)) + ' depth jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_file(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if 'work_dir' in j.__dict__ and isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples if
                                 s not in submitted_samples and
                                 s not in reused_samples]
        info()
        info('*' * 50)

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))

    # wait_for_jobs(cnf, jobs_by_sample.values())
    for s_name, seq2cov_output_fpath in seq2cov_output_by_sample.items():
        if not isfile(seq2cov_output_fpath):
            if verify_file(depth_output_by_sample[s_name], is_critical=True, description='depth_output_by_sample for ' + s_name):
                info(s_name + ': summarizing bedcoverage output ' + depth_output_by_sample[s_name])
                bed_col_num = count_bed_cols(bed_fpath)
                sambamba_depth_to_seq2cov(cnf, depth_output_by_sample[s_name], seq2cov_output_by_sample[s_name], s_name, bed_col_num)

            # script = get_script_cmdline(cnf, 'python', join('tools', 'bed_processing', 'find_ave_cov_for_regions.py'),
            #                             is_critical=True)
            # bedcov_hist_fpath = depth_output_by_sample[s_name]
            # cmdline = '{script} {bedcov_hist_fpath} {s_name} {bed_col_num}'.format(**locals())
            # j = submit_job(cnf, cmdline, s_name + '_bedcov_2_seq2cov', output_fpath=seq2cov_output_by_sample[s_name])
            # sum_jobs_by_sample[s_name] = j

    # sum_jobs_by_sample = dict()
    # info('* Submitting seq2cov output *')
    # for s_name, j in jobs_by_sample.items():
    #     if not verify_file(seq2cov_output_by_sample[s_name], silent=True):
    #         info(s_name + ': summarizing bedcoverage output ' + depth_output_by_sample[s_name])
    #
    #         script = get_script_cmdline(cnf, 'python', join('tools', 'bed_processing', 'find_ave_cov_for_regions.py'),
    #                                     is_critical=True)
    #         bedcov_hist_fpath = depth_output_by_sample[s_name]
    #         bed_col_num = count_bed_cols(seq2c_bed)
    #         cmdline = '{script} {bedcov_hist_fpath} {s_name} {bed_col_num}'.format(**locals())
    #         j = submit_job(cnf, cmdline, s_name + '_bedcov_2_seq2cov', output_fpath=seq2cov_output_by_sample[s_name])
    #         sum_jobs_by_sample[s_name] = j
    #
    # wait_for_jobs(cnf, sum_jobs_by_sample.values())

    info()
    info('Done')
    info('*' * 50)
    info()
    info('Combining seq2cov output')
    with open(output_fpath, 'w') as out:
        for s in samples:
            verify_file(seq2cov_output_by_sample[s.name], description='seq2cov_output for ' + s.name, is_critical=True)
            with open(seq2cov_output_by_sample[s.name]) as inp:
                for l in inp:
                    out.write(l)

    verify_file(output_fpath, description='__simulate_cov2cnv_w_bedtools output_fpath', is_critical=True)
    info('Saved combined seq2cov output to ' + output_fpath)
    info()
    return output_fpath
Example #5
def main():
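    """Read a BED file, group its regions by (chrom, gene name, strand), merge
    overlapping regions within each gene, and print the merged BED to stdout.
    """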
    if len(sys.argv) < 2:
        sys.exit('Usage: ' + __file__ + ' bed_file > merged_bed_file')

    summarize_by_genes = True  # len(sys.argv) > 2
    # sys.stderr.write('Setting summarize_by_genes to ' + str(summarize_by_genes) + '\n')

    num_bed_cols = count_bed_cols(sys.argv[1])
    if num_bed_cols < 3:
        sys.exit('Incorrect number of fields: ' + str(num_bed_cols) +
                 '. Should be at least 3.')
    if num_bed_cols < 7:
        summarize_by_genes = False
        sys.stderr.write(
            'less than 7 columns in BED; no summarizing by genes\n')

    gene_by_chrom_and_name = OrderedDict()

    total_lines = 0
    feature_counter = defaultdict(int)
    with open(sys.argv[1]) as inp:
        for l in inp:
            if not l:
                pass
            elif l.startswith('#') or '\t' not in l:
                sys.stdout.write(l)
            else:
                fields = l.replace('\n', '').split('\t')
                if len(fields) != num_bed_cols:
                    sys.stderr.write(
                        'Error: inconsistent number of fields. Expected ' +
                        str(num_bed_cols) + ', got ' + str(len(fields)) +
                        ' at: ' + ' | '.join(fields) + '\n')
                    sys.exit(1)
                else:
                    chrom, start, end = fields[:3]
                    start, end = int(start), int(end)
                    gname = fields[3] if num_bed_cols >= 4 else '.'
                    strand = fields[5] if num_bed_cols >= 6 else None
                    feature, biotype = (fields[6:8] if num_bed_cols >= 8
                                        else (None, None))
                    if feature:
                        feature_counter[feature] += 1

                    gene = gene_by_chrom_and_name.get((chrom, gname, strand))
                    if gene is None:
                        gene = Gene(gname, chrom, strand, 'Gene', biotype)
                        gene_by_chrom_and_name[(chrom, gname, strand)] = gene

                    if feature in ['Gene', 'Multi_Gene']:
                        # in fact '*Gene' features in BED files are optional
                        assert gene.strand == strand, str(gene) + ' strand is not ' + strand
                        if gene.already_met_gene_feature_for_this_gene:
                            # multiple records for this gene: take the lowest
                            # start and the biggest end
                            gene.start = min(gene.start, start)
                            gene.end = max(gene.end, end)
                            gene.biotype = merge_fields(gene.biotype, biotype)
                        else:
                            gene.start = start
                            gene.end = end
                            gene.biotype = biotype
                        gene.feature = feature
                        gene.already_met_gene_feature_for_this_gene = True

                    elif feature in [None, '.', 'CDS', 'Exon', 'UTR/Intron/Decay']:
                        assert gene.strand == strand, str(
                            gene) + ' strand is not ' + strand
                        gene.regions.append(
                            Exon(int(start), int(end), biotype, feature))

            total_lines += 1
            if total_lines % 10000 == 0:
                sys.stderr.write('processed ' + str(total_lines // 1000) +
                                 'k lines\n')
                sys.stderr.flush()

    sys.stderr.write('Processed ' + str(total_lines) + ' lines, found ' +
                     str(len(gene_by_chrom_and_name)) + ' unique genes.\n')
    if feature_counter:
        sys.stderr.write('Features:\n')
        for ft, cnt in feature_counter.items():
            sys.stderr.write('  ' + ft + ': ' + str(cnt) + '\n')
    sys.stderr.write('\n')

    genes = []
    for gene in gene_by_chrom_and_name.values():
        if gene.sort_regions() is not None:
            genes.append(gene)

    sys.stderr.write('Merging regions...\n')
    final_regions = []
    for gene in sorted(genes, key=lambda g: g.get_key()):
        if summarize_by_genes and gene.name != '.':
            final_regions.append((gene.chrom, gene.start, gene.end, gene.name,
                                  gene.strand, gene.feature, gene.biotype))

        merged_regions = gene.merge_regions()
        for r in merged_regions:
            final_regions.append((gene.chrom, r.start, r.end, gene.name,
                                  gene.strand, r.feature, r.biotype))

    sys.stderr.write('Merged, regions after merge: ' +
                     str(len(final_regions)) + ', saving...\n')

    for chrom, start, end, gname, strand, feature, biotype in final_regions:
        fs = [chrom, str(start), str(end), gname, '.',
              strand or '.', feature or '.', biotype or '.']
        sys.stdout.write('\t'.join(fs[:num_bed_cols]) + '\n')
    sys.stderr.write('Saved\n')