Exemplo n.º 1
0
def read_samples(sample2bam_fpath):
    """Read sample names and BAM paths from a sample listing file.

    Lines starting with ``#`` and empty lines are skipped. Every other line
    is either ``<sample name><TAB><BAM path>`` or just ``<BAM path>``; in the
    latter case the sample name is derived from the BAM file name (extension
    removed, one trailing ``-ready`` suffix removed).

    Every BAM is checked with ``verify_bam`` exactly once. Files that fail
    the check are collected and reported together through ``critical`` after
    the whole listing has been read, instead of stopping at the first one.

    :param sample2bam_fpath: path to the listing file
    :return: tuple ``(sample_names, bam_fpaths)`` of parallel lists
    """
    bam_fpaths = []
    sample_names = []
    bad_bam_fpaths = []

    info('Reading sample info from ' + sample2bam_fpath)
    with open(sample2bam_fpath) as f:
        for l in f:
            if l.startswith('#'):
                continue
            # Strip the line terminator only (also handles CRLF files).
            l = l.rstrip('\r\n')
            if not l:
                continue

            fields = l.split('\t')
            if len(fields) == 2:
                sample_name, bam_fpath = fields
            else:
                sample_name, bam_fpath = None, l

            # NOTE(review): assumes the non-critical verify_bam returns the
            # same (possibly normalised) path the critical call used to
            # return, or a falsy value on failure - confirm.
            verified_fpath = verify_bam(bam_fpath)
            if not verified_fpath:
                bad_bam_fpaths.append(bam_fpath)
                continue
            bam_fpath = verified_fpath
            bam_fpaths.append(bam_fpath)

            if sample_name is None:
                sample_name = basename(splitext(bam_fpath)[0])
                suffix = '-ready'
                if sample_name.endswith(suffix):
                    # Remove only the trailing suffix, not everything after
                    # the first occurrence of '-ready'.
                    sample_name = sample_name[:-len(suffix)]
            sample_names.append(sample_name)
            info(sample_name + ': ' + bam_fpath)

    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs:' + ', '.join(bad_bam_fpaths))

    return sample_names, bam_fpaths
Exemplo n.º 2
0
def proc_fastq(cnf, sample, l_fpath, r_fpath):
    """Optionally downsample a FASTQ pair, align it and return the BAM path.

    :param cnf: run configuration; ``downsample_to``, ``work_dir``,
        ``genome.bwa`` and ``is_pcr`` are read here
    :param sample: sample object passed through to ``downsample``/``align``
    :param l_fpath: path to the left (first) reads FASTQ
    :param r_fpath: path to the right (second) reads FASTQ
    :return: the BAM path as returned by ``verify_bam``
    """
    if cnf.downsample_to:
        info('Downsampling the reads to ' + str(cnf.downsample_to))
        # NOTE(review): `sample.nname` is kept as in the original; confirm it
        # is not a typo for `sample.name`.
        l_fpath, r_fpath = downsample(cnf,
                                      sample.nname,
                                      l_fpath,
                                      r_fpath,
                                      cnf.downsample_to,
                                      output_dir=cnf.work_dir,
                                      suffix='subset')

    sambamba = get_system_path(cnf,
                               join(get_ext_tools_dirname(), 'sambamba'),
                               is_critical=True)
    bwa = get_system_path(cnf, 'bwa')
    bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
    if not (sambamba and bwa and bammarkduplicates):
        critical(
            'sambamba, BWA, and bammarkduplicates are required to align BAM')
    info()
    info('Aligning reads to the reference')
    bam_fpath = align(cnf, sample, l_fpath, r_fpath, sambamba, bwa,
                      bammarkduplicates, cnf.genome.bwa, cnf.is_pcr)
    bam_fpath = verify_bam(bam_fpath)
    if not bam_fpath:
        # `sample` is an object (its attributes are read above), so it cannot
        # be concatenated with a string directly; fall back to the value
        # itself when it has no `name` attribute (e.g. a plain string).
        sample_label = str(getattr(sample, 'name', sample))
        critical('Sample ' + sample_label + ' was not aligned successfully.')
    return bam_fpath
 def _verify_input_file(_key):
     """Normalise the path stored under ``_key`` in ``cnf`` and validate it.

     The path is rewritten in ``cnf`` via ``adjust_path``. It must pass
     ``verify_file``; keys containing 'bam' must also pass ``verify_bam`` and
     keys containing 'bed' must also pass ``verify_bed``.

     Returns True when every applicable check passes, False otherwise.
     """
     cnf[_key] = adjust_path(cnf[_key])
     if not verify_file(cnf[_key], _key):
         return False
     # Format-specific checks, applied in the same order as before.
     for tag, is_valid in (('bam', verify_bam), ('bed', verify_bed)):
         if tag in _key and not is_valid(cnf[_key]):
             return False
     return True
Exemplo n.º 4
0
def main():
    """Align FASTQ pairs and build TargQC reports for the resulting BAMs.

    Options come from ``proc_opts``. FASTQ files are paired per sample,
    optionally downsampled, aligned in parallel (one job per sample), and
    every BAM that passes ``verify_bam`` is handed to ``run_targqc``.
    """
    cnf, output_dir, fastq_fpaths = proc_opts()

    targqc_dirpath = output_dir

    fastqs_by_sample = find_fastq_pairs(fastq_fpaths)
    samples = []
    for sname, (l, r) in fastqs_by_sample.items():
        s = source.TargQC_Sample(sname, join(cnf.output_dir, sname))
        s.l_fpath = l
        s.r_fpath = r
        samples.append(s)

    # One parallel job per sample.
    threads = len(samples)
    info('Found ' + str(len(samples)) + ' samples.')
    if not samples:
        critical('ERROR: No fastq pairs found.')
    info()

    if cnf.downsample_to == 0:
        # An explicit 0 disables downsampling.
        lefts = [s.l_fpath for s in samples]
        rights = [s.r_fpath for s in samples]
    else:
        if cnf.downsample_to is None:
            downsample_to = int(5e5)
        else:
            downsample_to = cnf.downsample_to

        info('Downsampling the reads to ' + str(downsample_to))
        lefts, rights = downsample_fastq(cnf, samples, downsample_to)

    bam_by_sample = OrderedDict()
    sambamba = get_system_path(cnf,
                               join(get_ext_tools_dirname(), 'sambamba'),
                               is_critical=True)
    bwa = get_system_path(cnf, 'bwa')
    bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
    if sambamba and bwa and bammarkduplicates:
        info()
        info('Aligning reads to the reference')
        bam_fpaths = Parallel(n_jobs=threads)(
            delayed(align)(CallCnf(cnf.__dict__), s, l, r, sambamba, bwa,
                           bammarkduplicates, cnf.genome.bwa, cnf.is_pcr)
            for s, l, r in zip(samples, lefts, rights))
        for sample, bam_fpath in zip(samples, bam_fpaths):
            if verify_bam(bam_fpath):
                bam_by_sample[sample.name] = bam_fpath
            else:
                # `sample` is a TargQC_Sample object; use its name, because
                # concatenating the object itself raises TypeError.
                err('Sample ' + sample.name + ' was not aligned successfully.')
        if not bam_by_sample:
            err('ERROR: No sample was alined.')
        else:
            info()
            # Work inside a TargQC-specific subdirectory while reporting,
            # then step back to the parent work directory.
            cnf.work_dir = join(cnf.work_dir, source.targqc_name)
            safe_mkdir(cnf.work_dir)
            info('Making TargQC reports for BAMs from reads')
            safe_mkdir(targqc_dirpath)
            run_targqc(cnf, bam_by_sample, cnf.bed, targqc_dirpath)
            cnf.work_dir = dirname(cnf.work_dir)
            info('Done TargQC')
    info()
    info('*' * 70)
def main(args):
    """Build a bcbio sample CSV (and optional Bina CSVs) from a sample table.

    ``args`` follows the usage string below: input table, output CSV, an
    optional directory with BAMs, and an optional output directory for the
    per-patient Bina CSV files.

    Samples are grouped by patient (first three barcode fields), duplicated
    descriptions are resolved through ``get_bam_version``, and every tumour
    is put into its own batch together with the patient's main normal, if
    there is one. This is Python 2 code (``print`` statements, ``iteritems``).
    """
    if len(args) < 2:
        sys.exit('Usage ' + __file__ +
                 ' input.tsv bcbio.csv [dir_with_bams] [bina_dir]')

    inp_fpath = args[0]
    verify_file(args[0], is_critical=True)

    out_fpath = args[1]
    verify_dir(dirname(adjust_path(out_fpath)), is_critical=True)

    bam_dirpath = None
    if len(args) > 2:
        bam_dirpath = args[2]
        verify_dir(adjust_path(bam_dirpath), is_critical=True)

    # bam_opt = args[2]
    # try:
    #     bam_col = int(bam_opt)
    #     bam_dirpath = None
    # except ValueError:
    #     bam_col = None
    #     verify_dir(bam_opt, is_critical=True)
    #     bam_dirpath = args[2]

    bina_dirpath = None
    if len(args) > 3:
        bina_dirpath = args[3]
        # Only the parent directory has to exist; bina_dirpath itself is
        # created with safe_mkdir further down.
        verify_dir(dirname(adjust_path(bina_dirpath)), is_critical=True)

    # filtered_bams_dirpath = adjust_path(sys.argv[3])
    # verify_dir(join(filtered_bams_dirpath, os.pardir), is_critical=True)

    # The 27 tab-separated column names of the TCGA table. The variable is
    # not read anywhere in this function; it documents the default column
    # indices used below (barcode = 1, filename = 13, reason = 26).
    columns_names = 'study	barcode	disease	disease_name	sample_type	sample_type_name	analyte_type	library_type	center	center_name	platform	platform_name	assembly	filename	 files_size 	checksum	analysis_id	aliquot_id	participant_id	sample_id	tss_id	sample_accession	published	uploaded	modified	state	reason'

    samples_by_patient = defaultdict(list)

    # Defaults for the TCGA layout; overridden below when the first line
    # does not have 27 tab-separated fields.
    delim = '\t'
    barcode_col = 1
    bam_col = 13
    is_tcga_tsv = True

    with open(inp_fpath) as fh:
        for i, l in enumerate(fh):
            if not l.strip():
                continue

            if i == 0:
                if len(l.split('\t')) == 27:
                    err('Interpreting as TCGA tsv')
                    # The first line is skipped unless its first field is
                    # exactly 'TCGA'.
                    if l.split('\t')[0] != 'TCGA': continue  # skipping header
                else:
                    # Whitespace-delimited input: detect the barcode and BAM
                    # columns from the contents of the first line.
                    delim = None
                    for j, f in enumerate(l.split()):
                        if f.startswith('TCGA'):
                            barcode_col = j
                            err('barcode col is ' + str(j))
                        if f.endswith('bam'):
                            bam_col = j
                            err('bam col is ' + str(j))
                    is_tcga_tsv = False

            fs = l.split(delim)

            barcode = fs[barcode_col].split(
                '-')  # TCGA-05-4244-01A-01D-1105-08

            sample = Sample()
            sample.bam = fs[bam_col]
            sample.bam_base_name = basename(os.path.splitext(fs[bam_col])[0])
            sample.description = fs[barcode_col]
            sample.patient = '-'.join(barcode[:3])
            if is_tcga_tsv:
                # NOTE(review): with delim='\t' the last field still carries
                # the line terminator - confirm whether that matters.
                sample.reason = fs[26]

            # The first two characters of the fourth barcode field are the
            # numeric sample type; only codes 1..19 are kept, and 10..19 are
            # treated as normals.
            sample_type = int(barcode[3][:2])
            if sample_type >= 20 or sample_type <= 0:
                continue
            sample.is_normal = 10 <= sample_type < 20
            sample.is_blood = sample_type in [
                3, 4, 9, 10
            ]  # https://tcga-data.nci.nih.gov/datareports/codeTablesReport.htm

            # A sample with the same description was already seen for this
            # patient: keep whichever has the greater BAM version.
            if any(s.description == sample.description
                   for s in samples_by_patient[sample.patient]):
                prev_sample = next(s
                                   for s in samples_by_patient[sample.patient]
                                   if s.description == sample.description)

                # comp reason
                # if 'Fileset modified' not in prev_sample.reason and 'Fileset modified' in sample.reason:
                #     err('Duplicated sample: ' + sample.description + '  Fileset modified not in old ' + prev_sample.name + ' over ' + sample.name)
                #     pass
                # elif 'Fileset modified' in prev_sample.reason and 'Fileset modified' not in sample.reason:
                #     samples_by_patient[sample.patient].remove(prev_sample)
                #     samples_by_patient[sample.patient].append(sample)
                #     err('Duplicated sample: ' + sample.description + '  Fileset modified not in new ' + sample.name + ' over ' + prev_sample.name)
                # else:
                # comp version
                prev_version = get_bam_version(prev_sample.bam_base_name)
                version = get_bam_version(sample.bam_base_name)
                err('Duplicated sample: ' + sample.description +
                    '  Resolving by version (' + ' over '.join(
                        map(str,
                            sorted([prev_version, version])[::-1])) + ')')
                if version > prev_version:
                    samples_by_patient[sample.patient].remove(prev_sample)
                    samples_by_patient[sample.patient].append(sample)
            else:
                samples_by_patient[sample.patient].append(sample)

    batches = []
    final_samples = set()

    if bina_dirpath:
        safe_mkdir(bina_dirpath)

    for patient, patient_samples in samples_by_patient.iteritems():
        tumours = [s for s in patient_samples if not s.is_normal]
        normals = [s for s in patient_samples if s.is_normal]

        # Pick the main normal: the first blood normal if any, otherwise the
        # first normal in the list.
        main_normal = None
        if len(normals) >= 1:
            if any(n.is_blood for n in normals):
                main_normal = next(n for n in normals if n.is_blood)
            else:
                main_normal = normals[0]
                if tumours:
                    # NOTE(review): the remaining normals get a batch of
                    # their own, but they are neither added to final_samples
                    # nor linked back through n.batches, so they never reach
                    # the bcbio CSV - confirm this is intended.
                    for n in normals[1:]:
                        b = Batch(n.description + '-batch')
                        b.tumour = n
                        batches.append(b)

        # One batch per tumour, paired with the main normal when available.
        for t in tumours:
            b = Batch(t.description + '-batch')
            b.tumour = t
            t.batches.add(b)
            final_samples.add(t)
            if main_normal:
                b.normal = main_normal
                main_normal.batches.add(b)
                final_samples.add(main_normal)
            batches.append(b)

        ##################
        ###### Bina ######
        if bina_dirpath:
            bina_patient_dirpath = join(bina_dirpath, patient)
            safe_mkdir(bina_patient_dirpath)
            normals_csv_fpath = join(bina_patient_dirpath, 'normals.csv')
            tumours_csv_fpath = join(bina_patient_dirpath, 'tumors.csv')

            if main_normal:
                with open(normals_csv_fpath, 'w') as f:
                    f.write('name,bam\n')
                    bam_fpath = join(
                        bam_dirpath,
                        main_normal.bam) if bam_dirpath else main_normal.bam
                    f.write(main_normal.description + ',' + bam_fpath + '\n')

            with open(tumours_csv_fpath, 'w') as f:
                f.write('name,bam\n')
                for t in tumours:
                    bam_fpath = join(bam_dirpath,
                                     t.bam) if bam_dirpath else t.bam
                    f.write(t.description + ',' + bam_fpath + '\n')

    if bina_dirpath:
        err('Saved bina CSVs to ' + bina_dirpath)

    ###########################
    ######## Bcbio CSV ########
    # Python 2 print statement; the trailing comma suppresses the newline so
    # that the BAM paths printed below continue the same output line.
    print 'bcbio_nextgen.py -w template bcbio.yaml', out_fpath,
    with open(out_fpath, 'w') as out:
        out.write('sample,description,batch,phenotype\n')
        for s in sorted(final_samples, key=lambda s: s.bam_base_name):
            out.write(','.join([
                s.bam_base_name, s.description, ';'.join(
                    sorted(b.name for b in s.batches)),
                ('normal' if s.is_normal else 'tumor')
            ]) + '\n')
            bam_fpath = join(bam_dirpath, s.bam) if bam_dirpath else s.bam

            # Only BAMs that verify and can be opened by pysam are printed.
            if verify_bam(bam_fpath, is_critical=False):
                try:
                    # NOTE(review): the opened handle is never closed in the
                    # code visible here.
                    bam = pysam.Samfile(bam_fpath, "rb")
                except ValueError:
                    err(traceback.format_exc())
                    err('Cannot read ' + bam_fpath)
                    err()
                    # n_rgs = max(1, len(bam.header.get("RG", [])))
                else:
                    print bam_fpath,
Exemplo n.º 6
0
def main(args):
    """Run the single-BAM target coverage step and report the result files.

    Command-line options and configs are read through ``read_opts_and_cnfs``;
    the BAM given with ``--bam`` is verified and passed to ``process_one``
    together with the features BED. The paths of the produced reports are
    logged, string entries among the results are checked with
    ``verify_file``, and the work directory is removed unless
    ``keep_intermediate`` is set.
    """
    padding_help = (
        'integer indicating the number of bases to extend each target region up and down-stream. '
        'Default is ' + str(defaults['coverage_reports']['padding']))

    extra_opts = [
        (['--bam'], {
            'dest': 'bam',
            'help': 'a path to the BAM file to study'}),
        (['-1'], {'dest': 'l_fpath'}),
        (['-2'], {'dest': 'r_fpath'}),
        (['--bed', '--capture', '--amplicons'], {
            'dest': 'bed',
            'help': 'a BED file for capture panel or amplicons'}),
        (['--exons', '--exome', '--features'], {
            'dest': 'features',
            'help': 'a BED file with real CDS/Exon/Gene/Transcript regions with annotations (default "features" is in system_config)'}),
        (['--exons-no-genes', '--features-no-genes'], {
            'dest': 'features_no_genes',
            'help': 'a BED file with real CDS/Exon regions with annotations, w/o Gene/Transcript records (default "features" is in system_config)'}),
        (['--original-bed'], {
            'dest': 'original_target_bed',
            'help': SUPPRESS_HELP}),
        (['--original-exons', '--original-features'], {
            'dest': 'original_features_bed',
            'help': 'original features genes bed file path (just for reporting)'}),
        (['--reannotate'], {
            'dest': 'reannotate',
            'help': 're-annotate BED file with gene names',
            'action': 'store_true',
            'default': False}),
        (['--no-prep-bed'], {
            'dest': 'prep_bed',
            'help': 'do not fix input beds and exons',
            'action': 'store_false',
            'default': True}),
        (['-e', '--extended'], {
            'dest': 'extended',
            'help': 'extended - flagged regions and missed variants',
            'action': 'store_true',
            'default': False}),
        (['--genes'], {
            'dest': 'genes',
            'help': 'custom list of genes'}),
        (['--padding'], {
            'dest': 'padding',
            'help': padding_help,
            'type': 'int'}),
        (['--no-dedup'], {
            'dest': 'no_dedup',
            'action': 'store_true',
            'help': SUPPRESS_HELP}),
        (['--downsample-to'], {
            'dest': 'downsample_to',
            'type': 'int',
            'help': SUPPRESS_HELP}),
        (['--downsampled'], {
            'dest': 'downsampled',
            'action': 'store_true',
            'help': SUPPRESS_HELP}),
        (['--fastqc-dirpath'], {
            'dest': 'fastqc_dirpath',
            'help': SUPPRESS_HELP}),
    ]

    cnf = read_opts_and_cnfs(
        extra_opts=extra_opts,
        file_keys=['bam', 'l_fpath', 'r_fpath', 'bed'],
        key_for_sample_name='bam')

    # A padding given on the command line overrides the configured one.
    if cnf.padding:
        cnf.coverage_reports.padding = cnf.padding

    check_system_resources(cnf, required=['bedtools'], optional=[])

    check_genome_resources(cnf)

    # The command-line features BED wins over the genome default.
    features_bed = adjust_path(cnf.features or cnf.genome.features)
    if not features_bed:
        info('No features BED found')
    else:
        info('Features: ' + features_bed)
        features_bed = verify_file(features_bed)

    if cnf.bed:
        cnf.bed = verify_file(cnf.bed, is_critical=True)
        info('Using amplicons/capture panel ' + cnf.bed)
    elif features_bed:
        info('WGS, taking CDS as target')

    cnf.bam = verify_bam(cnf.bam, is_critical=True)

    reports = process_one(cnf,
                          cnf.output_dir,
                          cnf.bam,
                          features_bed=features_bed,
                          features_no_genes_bed=cnf.features_no_genes)
    summary_report, gene_report = reports[:2]

    info('')
    info('*' * 70)
    if summary_report.txt_fpath:
        info('Summary report: ' + summary_report.txt_fpath)
    if gene_report and gene_report.txt_fpath:
        region_count = str(len(gene_report.rows))
        info('All regions: ' + gene_report.txt_fpath + ' (' +
             region_count + ' regions)')

    # A third entry, when present, is the report on flagged regions.
    if len(reports) > 2:
        selected_regions_report = reports[2]
        if selected_regions_report.txt_fpath:
            flagged_count = str(len(selected_regions_report.rows))
            info('Flagged regions: ' + selected_regions_report.txt_fpath +
                 ' (' + flagged_count + ' regions)')

    for entry in reports:
        if not entry:
            continue
        info('Checking expected results...')
        candidates = entry if isinstance(entry, list) else [entry]
        # Every string path is verified (no short-circuit), as before.
        missing = [p for p in candidates
                   if isinstance(p, basestring) and not verify_file(p)]
        if not missing:
            info('The results are good.')

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])
def read_samples_info_and_split(common_cnf, options, inputs):
    """Collect per-sample configurations from the options or the run info.

    Input files named by ``inputs`` are taken from ``options`` when given
    there (then ``common_cnf`` itself is the single item to process);
    otherwise the ``details`` list of ``common_cnf`` is used. For every item
    the VCF is verified, copied into the work directory (unless
    ``reuse_intermediate`` is set) and a sample entry is registered.

    :param common_cnf: configuration shared by all samples
    :param options: dict-like with command line options
    :param inputs: option keys that may hold an input file path
    :return: OrderedDict mapping sample name to its configuration
    """
    #TODO: _set_up_dirs(cnf) for each sample

    info('')
    info('Processing input details...')

    details = None
    for key in inputs:
        if options.get(key):
            common_cnf[key] = adjust_path(options[key])
            info('Using ' + common_cnf[key])
            details = [common_cnf]
    if not details:
        details = common_cnf.get('details')
    if not details:
        critical('Please, provide input ' + ', '.join(inputs) +
                 ' in command line or in run info yaml config.')

    all_samples = OrderedDict()

    for one_item_cnf in details:
        if 'vcf' not in one_item_cnf:
            # The message now names the field that is actually checked.
            critical('ERROR: A section in details does not contain field "vcf".')
        one_item_cnf['vcf'] = adjust_path(one_item_cnf['vcf'])
        verify_file(one_item_cnf['vcf'], 'Input file', is_critical=True)

        join_parent_conf(one_item_cnf, common_cnf)

        # Work on a private copy of the VCF inside the work directory.
        work_vcf = join(one_item_cnf['work_dir'], basename(one_item_cnf['vcf']))
        check_file_changed(one_item_cnf, one_item_cnf['vcf'], work_vcf)
        if not one_item_cnf.get('reuse_intermediate'):
            with open_gzipsafe(one_item_cnf['vcf']) as inp, open_gzipsafe(work_vcf, 'w') as out:
                out.write(inp.read())
        one_item_cnf['vcf'] = work_vcf

        vcf_header_samples = read_sample_names_from_vcf(one_item_cnf['vcf'])

        # MULTIPLE SAMPLES
        # NOTE(review): this branch requires an EMPTY list of header samples
        # and then iterates over that same list, so it registers nothing.
        # Together with the commented-out extract_sample call this looks like
        # per-sample splitting was switched off on purpose; the condition is
        # left unchanged - confirm before re-enabling.
        if ('samples' in one_item_cnf or one_item_cnf.get('split_samples')) and len(vcf_header_samples) == 0:
            sample_cnfs = _verify_sample_info(one_item_cnf, vcf_header_samples)

            for header_sample_name in vcf_header_samples:
                if header_sample_name not in sample_cnfs:
                    sample_cnfs[header_sample_name] = one_item_cnf.copy()

                if header_sample_name in all_samples:
                    critical('ERROR: duplicated sample name: ' + header_sample_name)

                cnf = all_samples[header_sample_name] = sample_cnfs[header_sample_name]
                cnf['name'] = header_sample_name
                if cnf.get('keep_intermediate'):
                    cnf['log'] = join(cnf['work_dir'], cnf['name'] + '.log')

                # cnf['vcf'] = extract_sample(cnf, one_item_cnf['vcf'], cnf['name'])
                info()

        # SINGLE SAMPLE
        else:
            cnf = one_item_cnf

            if 'bam' in cnf:
                cnf['bam'] = adjust_path(cnf['bam'])
                verify_bam(cnf['bam'], is_critical=True)

            # cnf['vcf'] already points at work_vcf (assigned above), so the
            # sample is named after the working copy's file name.
            cnf['name'] = splitext_plus(basename(cnf['vcf']))[0]

            if cnf.get('keep_intermediate'):
                cnf['log'] = join(cnf['work_dir'], cnf['name'] + '.log')

            all_samples[cnf['name']] = cnf

    if not all_samples:
        info('No samples.')
    else:
        info('Using samples: ' + ', '.join(all_samples) + '.')

    return all_samples
Exemplo n.º 8
0
def main():
    """Parse options, validate the input BAMs and launch TargQC for them.

    Positional arguments are BAM paths. The BED file is optional: the run
    configuration is chosen with ``is_wgs`` set when no ``--bed`` is given,
    so the BED is only verified and logged when it is present.
    """
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input.'
    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    parser.add_option('--work-dir', dest='work_dir', metavar='DIR')
    parser.add_option('--log-dir', dest='log_dir')
    parser.add_option('--only-summary',
                      dest='only_summary',
                      action='store_true')
    parser.add_option('-o',
                      dest='output_dir',
                      metavar='DIR',
                      default=join(os.getcwd(), 'targetqc'))
    parser.add_option('--reannotate',
                      dest='reannotate',
                      action='store_true',
                      default=False,
                      help='re-annotate BED file with gene names')
    parser.add_option('--dedup',
                      dest='dedup',
                      action='store_true',
                      default=False,
                      help='count duplicates in coverage metrics')
    parser.add_option('--bed',
                      dest='bed',
                      help='BED file to run targetSeq and Seq2C analysis on.')
    parser.add_option(
        '--exons',
        '--exome',
        '--features',
        dest='features',
        help=
        'Annotated CDS/Exon/Gene/Transcripts BED file to make targetSeq exon/amplicon regions reports.'
    )

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        critical('No BAMs provided to input.')

    # De-duplicate the BAM paths while keeping the command-line order, so
    # that the order of samples downstream is reproducible.
    bam_fpaths = []
    seen_fpaths = set()
    for a in args:
        fpath = abspath(a)
        if fpath not in seen_fpaths:
            seen_fpaths.add(fpath)
            bam_fpaths.append(fpath)

    bad_bam_fpaths = [fpath for fpath in bam_fpaths if not verify_bam(fpath)]
    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs:' +
                 ', '.join(bad_bam_fpaths))

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'TargQC'
    set_up_dirs(cnf)

    check_genome_resources(cnf)

    # Without --bed the run is configured as WGS above, so there is nothing
    # to verify; a critical check on a missing BED would break that mode.
    bed_fpath = None
    if cnf.bed:
        verify_bed(cnf.bed, is_critical=True)
        bed_fpath = adjust_path(cnf.bed)
        info('Using amplicons/capture panel ' + bed_fpath)

    features_bed_fpath = adjust_path(
        cnf.features) if cnf.features else adjust_path(cnf.genome.features)
    # Only log the path when there is one; concatenating a missing path
    # would raise TypeError.
    if features_bed_fpath:
        info('Features: ' + features_bed_fpath)

    genes_fpath = None
    if cnf.genes:
        genes_fpath = adjust_path(cnf.genes)
        info('Custom genes list: ' + genes_fpath)

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided is sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    info('*' * 70)
    info()

    targqc_html_fpath = run_targqc(cnf, cnf.output_dir, bam_fpaths, bed_fpath,
                                   features_bed_fpath, genes_fpath)
    if targqc_html_fpath:
        send_email(
            cnf, 'TargQC report for ' + cnf.project_name + ':\n  ' +
            targqc_html_fpath)