def read_samples(sample2bam_fpath):
    bam_fpaths = []
    sample_names = []
    bad_bam_fpaths = []

    info('Reading sample info from ' + sample2bam_fpath)
    with open(sample2bam_fpath) as f:
        for l in f:
            if l.startswith('#'):
                continue
            l = l.replace('\n', '')
            if not l:
                continue

            sample_name = None
            if len(l.split('\t')) == 2:
                sample_name, bam_fpath = l.split('\t')
            else:
                sample_name, bam_fpath = None, l

            verified_bam_fpath = verify_bam(bam_fpath)
            if not verified_bam_fpath:
                bad_bam_fpaths.append(bam_fpath)
                continue
            bam_fpath = verified_bam_fpath
            bam_fpaths.append(bam_fpath)

            if sample_name is None:
                sample_name = basename(splitext(bam_fpath)[0])
                if sample_name.endswith('-ready'):
                    sample_name = sample_name.split('-ready')[0]
            sample_names.append(sample_name)
            info(sample_name + ': ' + bam_fpath)

    if bad_bam_fpaths:
        critical('BAM files cannot be found, are empty, or are not BAMs: ' + ', '.join(bad_bam_fpaths))

    return sample_names, bam_fpaths
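# Sketch of the sample2bam file that read_samples() expects, inferred from the parsing
# above (the file names and sample names below are illustrative, not shipped with the
# repo): one BAM per line, with an optional tab-separated sample-name column; lines
# starting with '#' are skipped; a '-ready' suffix of a BAM-derived name is stripped.
#
#   # sample_name<TAB>path/to/bam, or just the BAM path:
#   syn3-tumor	/data/bams/syn3-tumor-ready.bam
#   /data/bams/syn3-normal-ready.bam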
def proc_fastq(cnf, sample, l_fpath, r_fpath):
    if cnf.downsample_to:
        info('Downsampling the reads to ' + str(cnf.downsample_to))
        l_fpath, r_fpath = downsample(cnf, sample.name, l_fpath, r_fpath,
                                      cnf.downsample_to, output_dir=cnf.work_dir, suffix='subset')

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    bwa = get_system_path(cnf, 'bwa')
    bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
    if not (sambamba and bwa and bammarkduplicates):
        critical('sambamba, BWA, and bammarkduplicates are required to align the reads into a BAM')

    info()
    info('Aligning reads to the reference')
    bam_fpath = align(cnf, sample, l_fpath, r_fpath, sambamba, bwa, bammarkduplicates,
                      cnf.genome.bwa, cnf.is_pcr)
    bam_fpath = verify_bam(bam_fpath)
    if not bam_fpath:
        critical('Sample ' + sample.name + ' was not aligned successfully.')
    return bam_fpath
def _verify_input_file(_key):
    cnf[_key] = adjust_path(cnf[_key])
    if not verify_file(cnf[_key], _key):
        return False
    if 'bam' in _key and not verify_bam(cnf[_key]):
        return False
    if 'bed' in _key and not verify_bed(cnf[_key]):
        return False
    return True
def main():
    cnf, output_dir, fastq_fpaths = proc_opts()
    targqc_dirpath = output_dir

    fastqs_by_sample = find_fastq_pairs(fastq_fpaths)
    samples = []
    for sname, (l, r) in fastqs_by_sample.items():
        s = source.TargQC_Sample(sname, join(cnf.output_dir, sname))
        s.l_fpath = l
        s.r_fpath = r
        samples.append(s)
    threads = len(samples)

    info('Found ' + str(len(samples)) + ' samples.')
    if len(samples) == 0:
        critical('ERROR: No fastq pairs found.')
    info()

    # samples = [source.TargQC_Sample(
    #     s.name,
    #     dirpath=join(targqc_dirpath, s.name),
    #     bed=cnf.bed) for s in fastq_fpaths]

    if cnf.downsample_to == 0:
        lefts = [s.l_fpath for s in samples]
        rights = [s.r_fpath for s in samples]
    else:
        if cnf.downsample_to is None:
            downsample_to = int(5e5)
        else:
            downsample_to = cnf.downsample_to
        info('Downsampling the reads to ' + str(downsample_to))
        lefts, rights = downsample_fastq(cnf, samples, downsample_to)

    bam_by_sample = OrderedDict()
    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    bwa = get_system_path(cnf, 'bwa')
    bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
    if sambamba and bwa and bammarkduplicates:
        info()
        info('Aligning reads to the reference')
        bam_fpaths = Parallel(n_jobs=threads)(
            delayed(align)(CallCnf(cnf.__dict__), s, l, r, sambamba, bwa, bammarkduplicates,
                           cnf.genome.bwa, cnf.is_pcr)
            for s, l, r in zip(samples, lefts, rights))

        for sample, bam_fpath in zip(samples, bam_fpaths):
            if verify_bam(bam_fpath):
                bam_by_sample[sample.name] = bam_fpath
            else:
                err('Sample ' + sample.name + ' was not aligned successfully.')

        if not bam_by_sample:
            err('ERROR: No sample was aligned.')
        else:
            info()
            cnf.work_dir = join(cnf.work_dir, source.targqc_name)
            safe_mkdir(cnf.work_dir)
            info('Making TargQC reports for BAMs from reads')
            safe_mkdir(targqc_dirpath)
            run_targqc(cnf, bam_by_sample, cnf.bed, targqc_dirpath)
            cnf.work_dir = dirname(cnf.work_dir)
            info('Done TargQC')

    info()
    info('*' * 70)
def main(args):
    if len(args) < 2:
        sys.exit('Usage: ' + __file__ + ' input.tsv bcbio.csv [dir_with_bams] [bina_dir]')

    inp_fpath = args[0]
    verify_file(inp_fpath, is_critical=True)

    out_fpath = args[1]
    verify_dir(dirname(adjust_path(out_fpath)), is_critical=True)

    bam_dirpath = None
    if len(args) > 2:
        bam_dirpath = args[2]
        verify_dir(adjust_path(bam_dirpath), is_critical=True)

    # bam_opt = args[2]
    # try:
    #     bam_col = int(bam_opt)
    #     bam_dirpath = None
    # except ValueError:
    #     bam_col = None
    #     verify_dir(bam_opt, is_critical=True)
    #     bam_dirpath = args[2]

    bina_dirpath = None
    if len(args) > 3:
        bina_dirpath = args[3]
        verify_dir(dirname(adjust_path(bina_dirpath)), is_critical=True)

    # filtered_bams_dirpath = adjust_path(sys.argv[3])
    # verify_dir(join(filtered_bams_dirpath, os.pardir), is_critical=True)

    columns_names = 'study barcode disease disease_name sample_type sample_type_name analyte_type library_type center center_name platform platform_name assembly filename files_size checksum analysis_id aliquot_id participant_id sample_id tss_id sample_accession published uploaded modified state reason'

    samples_by_patient = defaultdict(list)

    delim = '\t'
    barcode_col = 1
    bam_col = 13
    is_tcga_tsv = True

    with open(inp_fpath) as fh:
        for i, l in enumerate(fh):
            if not l.strip():
                continue

            if i == 0:
                if len(l.split('\t')) == 27:
                    err('Interpreting as TCGA tsv')
                    if l.split('\t')[0] != 'TCGA':
                        continue  # skipping header
                else:
                    delim = None
                    for j, f in enumerate(l.split()):
                        if f.startswith('TCGA'):
                            barcode_col = j
                            err('barcode col is ' + str(j))
                        if f.endswith('bam'):
                            bam_col = j
                            err('bam col is ' + str(j))
                    is_tcga_tsv = False

            fs = l.split(delim)

            barcode = fs[barcode_col].split('-')  # TCGA-05-4244-01A-01D-1105-08

            sample = Sample()
            sample.bam = fs[bam_col]
            sample.bam_base_name = basename(os.path.splitext(fs[bam_col])[0])
            sample.description = fs[barcode_col]
            sample.patient = '-'.join(barcode[:3])
            if is_tcga_tsv:
                sample.reason = fs[26]

            sample_type = int(barcode[3][:2])
            if sample_type >= 20 or sample_type <= 0:
                continue
            sample.is_normal = 10 <= sample_type < 20
            sample.is_blood = sample_type in [3, 4, 9, 10]  # https://tcga-data.nci.nih.gov/datareports/codeTablesReport.htm

            if any(s.description == sample.description for s in samples_by_patient[sample.patient]):
                prev_sample = next(s for s in samples_by_patient[sample.patient]
                                   if s.description == sample.description)
                # comp reason
                # if 'Fileset modified' not in prev_sample.reason and 'Fileset modified' in sample.reason:
                #     err('Duplicated sample: ' + sample.description + ' Fileset modified not in old ' + prev_sample.name + ' over ' + sample.name)
                #     pass
                # elif 'Fileset modified' in prev_sample.reason and 'Fileset modified' not in sample.reason:
                #     samples_by_patient[sample.patient].remove(prev_sample)
                #     samples_by_patient[sample.patient].append(sample)
                #     err('Duplicated sample: ' + sample.description + ' Fileset modified not in new ' + sample.name + ' over ' + prev_sample.name)
                # else:
                # comp version
                prev_version = get_bam_version(prev_sample.bam_base_name)
                version = get_bam_version(sample.bam_base_name)
                err('Duplicated sample: ' + sample.description + ' Resolving by version (' +
                    ' over '.join(map(str, sorted([prev_version, version])[::-1])) + ')')
                if version > prev_version:
                    samples_by_patient[sample.patient].remove(prev_sample)
                    samples_by_patient[sample.patient].append(sample)
            else:
                samples_by_patient[sample.patient].append(sample)

    batches = []
    final_samples = set()

    if bina_dirpath:
        safe_mkdir(bina_dirpath)

    for patient, patient_samples in samples_by_patient.iteritems():
        tumours = [s for s in patient_samples if not s.is_normal]
        normals = [s for s in patient_samples if s.is_normal]

        main_normal = None
        if len(normals) >= 1:
            if any(n.is_blood for n in normals):
                main_normal = next(n for n in normals if n.is_blood)
            else:
                main_normal = normals[0]
            if tumours:
                for n in normals[1:]:
                    b = Batch(n.description + '-batch')
                    b.tumour = n
                    batches.append(b)

        for t in tumours:
            b = Batch(t.description + '-batch')
            b.tumour = t
            t.batches.add(b)
            final_samples.add(t)
            if main_normal:
                b.normal = main_normal
                main_normal.batches.add(b)
                final_samples.add(main_normal)
            batches.append(b)

        ##################
        ###### Bina ######
        if bina_dirpath:
            bina_patient_dirpath = join(bina_dirpath, patient)
            safe_mkdir(bina_patient_dirpath)

            normals_csv_fpath = join(bina_patient_dirpath, 'normals.csv')
            tumours_csv_fpath = join(bina_patient_dirpath, 'tumors.csv')

            if main_normal:
                with open(normals_csv_fpath, 'w') as f:
                    f.write('name,bam\n')
                    bam_fpath = join(bam_dirpath, main_normal.bam) if bam_dirpath else main_normal.bam
                    f.write(main_normal.description + ',' + bam_fpath + '\n')

            with open(tumours_csv_fpath, 'w') as f:
                f.write('name,bam\n')
                for t in tumours:
                    bam_fpath = join(bam_dirpath, t.bam) if bam_dirpath else t.bam
                    f.write(t.description + ',' + bam_fpath + '\n')

    if bina_dirpath:
        err('Saved bina CSVs to ' + bina_dirpath)

    ###########################
    ######## Bcbio CSV ########
    print 'bcbio_nextgen.py -w template bcbio.yaml', out_fpath,

    with open(out_fpath, 'w') as out:
        out.write('sample,description,batch,phenotype\n')
        for s in sorted(final_samples, key=lambda s: s.bam_base_name):
            out.write(','.join([
                s.bam_base_name,
                s.description,
                ';'.join(sorted(b.name for b in s.batches)),
                ('normal' if s.is_normal else 'tumor')]) + '\n')

            bam_fpath = join(bam_dirpath, s.bam) if bam_dirpath else s.bam
            if verify_bam(bam_fpath, is_critical=False):
                try:
                    bam = pysam.Samfile(bam_fpath, "rb")
                except ValueError:
                    err(traceback.format_exc())
                    err('Cannot read ' + bam_fpath)
                    err()
                # n_rgs = max(1, len(bam.header.get("RG", [])))
            else:
                print bam_fpath,
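# Rough sketch of the bcbio sample sheet written above (illustrative values only; real
# rows come from the barcodes and BAM names in the input TSV). A barcode such as
# TCGA-05-4244-01A-01D-1105-08 yields patient 'TCGA-05-4244' and sample-type code '01'
# (codes 01-09 are tumour, 10-19 normal, per the TCGA code tables linked above):
#
#   sample,description,batch,phenotype
#   <tumour_bam_base_name>,TCGA-05-4244-01A-01D-1105-08,TCGA-05-4244-01A-01D-1105-08-batch,tumor
#   <normal_bam_base_name>,TCGA-05-4244-10A-01D-1105-08,TCGA-05-4244-01A-01D-1105-08-batch,normal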
def main(args):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--bam'], dict(
                dest='bam',
                help='a path to the BAM file to study')),

            (['-1'], dict(dest='l_fpath')),
            (['-2'], dict(dest='r_fpath')),

            (['--bed', '--capture', '--amplicons'], dict(
                dest='bed',
                help='a BED file for capture panel or amplicons')),

            (['--exons', '--exome', '--features'], dict(
                dest='features',
                help='a BED file with real CDS/Exon/Gene/Transcript regions with annotations '
                     '(default "features" is in system_config)')),

            (['--exons-no-genes', '--features-no-genes'], dict(
                dest='features_no_genes',
                help='a BED file with real CDS/Exon regions with annotations, '
                     'w/o Gene/Transcript records (default "features" is in system_config)')),

            (['--original-bed'], dict(
                dest='original_target_bed',
                help=SUPPRESS_HELP)),

            (['--original-exons', '--original-features'], dict(
                dest='original_features_bed',
                help='original features genes bed file path (just for reporting)')),

            (['--reannotate'], dict(
                dest='reannotate',
                help='re-annotate BED file with gene names',
                action='store_true',
                default=False)),

            (['--no-prep-bed'], dict(
                dest='prep_bed',
                help='do not fix input beds and exons',
                action='store_false',
                default=True)),

            (['-e', '--extended'], dict(
                dest='extended',
                help='extended - flagged regions and missed variants',
                action='store_true',
                default=False)),

            (['--genes'], dict(dest='genes', help='custom list of genes')),

            (['--padding'], dict(
                dest='padding',
                help='integer indicating the number of bases to extend each target region up and down-stream. '
                     'Default is ' + str(defaults['coverage_reports']['padding']),
                type='int')),

            (['--no-dedup'], dict(dest='no_dedup', action='store_true', help=SUPPRESS_HELP)),
            (['--downsample-to'], dict(dest='downsample_to', type='int', help=SUPPRESS_HELP)),
            (['--downsampled'], dict(dest='downsampled', action='store_true', help=SUPPRESS_HELP)),
            (['--fastqc-dirpath'], dict(dest='fastqc_dirpath', help=SUPPRESS_HELP)),
        ],
        file_keys=['bam', 'l_fpath', 'r_fpath', 'bed'],
        key_for_sample_name='bam')

    if cnf.padding:
        cnf.coverage_reports.padding = cnf.padding

    check_system_resources(cnf, required=['bedtools'], optional=[])
    check_genome_resources(cnf)

    features_bed = adjust_path(cnf.features) if cnf.features else adjust_path(cnf.genome.features)
    if features_bed:
        info('Features: ' + features_bed)
        features_bed = verify_file(features_bed)
    else:
        info('No features BED found')

    if cnf.bed:
        cnf.bed = verify_file(cnf.bed, is_critical=True)
        info('Using amplicons/capture panel ' + cnf.bed)
    elif features_bed:
        info('WGS, taking CDS as target')

    cnf.bam = verify_bam(cnf.bam, is_critical=True)

    reports = process_one(cnf, cnf.output_dir, cnf.bam,
                          features_bed=features_bed,
                          features_no_genes_bed=cnf.features_no_genes)
    summary_report, gene_report = reports[:2]

    info('')
    info('*' * 70)
    if summary_report.txt_fpath:
        info('Summary report: ' + summary_report.txt_fpath)
    if gene_report:
        if gene_report.txt_fpath:
            info('All regions: ' + gene_report.txt_fpath + ' (' + str(len(gene_report.rows)) + ' regions)')

    if len(reports) > 2:
        selected_regions_report = reports[2]
        if selected_regions_report.txt_fpath:
            info('Flagged regions: ' + selected_regions_report.txt_fpath +
                 ' (' + str(len(selected_regions_report.rows)) + ' regions)')

    for fpaths in reports:
        if fpaths:
            ok = True
            info('Checking expected results...')
            if not isinstance(fpaths, list):
                fpaths = [fpaths]
            for fpath in fpaths:
                if isinstance(fpath, basestring):
                    if not verify_file(fpath):
                        ok = False
            if ok:
                info('The results are good.')

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])
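# A possible invocation of this script, based only on the options declared above (the
# entry-point name 'targetcov.py' is an assumption; output-dir and config handling come
# from read_opts_and_cnfs and are not shown here):
#
#   targetcov.py --bam sample.bam --bed capture.bed --exons features.bed --padding 200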
def read_samples_info_and_split(common_cnf, options, inputs):
    #TODO: _set_up_dirs(cnf) for each sample

    info('')
    info('Processing input details...')

    details = None
    for key in inputs:
        if options.get(key):
            common_cnf[key] = adjust_path(options[key])
            info('Using ' + common_cnf[key])
            details = [common_cnf]
    if not details:
        details = common_cnf.get('details')
    if not details:
        critical('Please provide input ' + ', '.join(inputs) + ' in the command line or in the run info yaml config.')

    all_samples = OrderedDict()

    for one_item_cnf in details:
        if 'vcf' not in one_item_cnf:
            critical('ERROR: A section in details does not contain the field "vcf".')
        one_item_cnf['vcf'] = adjust_path(one_item_cnf['vcf'])
        verify_file(one_item_cnf['vcf'], 'Input file', is_critical=True)

        join_parent_conf(one_item_cnf, common_cnf)

        work_vcf = join(one_item_cnf['work_dir'], basename(one_item_cnf['vcf']))
        check_file_changed(one_item_cnf, one_item_cnf['vcf'], work_vcf)
        if not one_item_cnf.get('reuse_intermediate'):
            with open_gzipsafe(one_item_cnf['vcf']) as inp, open_gzipsafe(work_vcf, 'w') as out:
                out.write(inp.read())
        one_item_cnf['vcf'] = work_vcf

        vcf_header_samples = read_sample_names_from_vcf(one_item_cnf['vcf'])

        # MULTIPLE SAMPLES
        if ('samples' in one_item_cnf or one_item_cnf.get('split_samples')) and len(vcf_header_samples) == 0:
            sample_cnfs = _verify_sample_info(one_item_cnf, vcf_header_samples)
            for header_sample_name in vcf_header_samples:
                if header_sample_name not in sample_cnfs:
                    sample_cnfs[header_sample_name] = one_item_cnf.copy()

                if header_sample_name in all_samples:
                    critical('ERROR: duplicated sample name: ' + header_sample_name)

                cnf = all_samples[header_sample_name] = sample_cnfs[header_sample_name]
                cnf['name'] = header_sample_name
                if cnf.get('keep_intermediate'):
                    cnf['log'] = join(cnf['work_dir'], cnf['name'] + '.log')

                # cnf['vcf'] = extract_sample(cnf, one_item_cnf['vcf'], cnf['name'])
                info()

        # SINGLE SAMPLE
        else:
            cnf = one_item_cnf

            if 'bam' in cnf:
                cnf['bam'] = adjust_path(cnf['bam'])
                verify_bam(cnf['bam'], is_critical=True)

            cnf['name'] = splitext_plus(basename(cnf['vcf']))[0]

            if cnf.get('keep_intermediate'):
                cnf['log'] = join(cnf['work_dir'], cnf['name'] + '.log')

            cnf['vcf'] = work_vcf
            all_samples[cnf['name']] = cnf

    if not all_samples:
        info('No samples.')
    else:
        info('Using samples: ' + ', '.join(all_samples) + '.')

    return all_samples
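# Minimal sketch of a run-info YAML 'details' section this function can consume, inferred
# from the keys read above ('vcf', optional 'bam', optional 'samples'/'split_samples');
# the paths are placeholders and 'work_dir' is assumed to be filled in elsewhere:
#
#   details:
#     - vcf: /path/to/sample1.vcf.gz
#       bam: /path/to/sample1.bam
#     - vcf: /path/to/multi_sample.vcf.gz
#       split_samples: True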
def main():
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input.'
    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    parser.add_option('--work-dir', dest='work_dir', metavar='DIR')
    parser.add_option('--log-dir', dest='log_dir')
    parser.add_option('--only-summary', dest='only_summary', action='store_true')
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'targetqc'))
    parser.add_option('--reannotate', dest='reannotate', action='store_true', default=False,
                      help='re-annotate BED file with gene names')
    parser.add_option('--dedup', dest='dedup', action='store_true', default=False,
                      help='count duplicates in coverage metrics')
    parser.add_option('--bed', dest='bed', help='BED file to run targetSeq and Seq2C analysis on.')
    parser.add_option('--exons', '--exome', '--features', dest='features',
                      help='Annotated CDS/Exon/Gene/Transcripts BED file to make targetSeq exon/amplicon regions reports.')

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        critical('No BAMs provided to input.')
    bam_fpaths = list(set([abspath(a) for a in args]))

    bad_bam_fpaths = []
    for fpath in bam_fpaths:
        if not verify_bam(fpath):
            bad_bam_fpaths.append(fpath)
    if bad_bam_fpaths:
        critical('BAM files cannot be found, are empty, or are not BAMs: ' + ', '.join(bad_bam_fpaths))

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'TargQC'
    set_up_dirs(cnf)
    # cnf.name = 'TargQC_' + cnf.project_name

    check_genome_resources(cnf)

    verify_bed(cnf.bed, is_critical=True)
    bed_fpath = adjust_path(cnf.bed)
    info('Using amplicons/capture panel ' + bed_fpath)

    features_bed_fpath = adjust_path(cnf.features) if cnf.features else adjust_path(cnf.genome.features)
    info('Features: ' + features_bed_fpath)

    genes_fpath = None
    if cnf.genes:
        genes_fpath = adjust_path(cnf.genes)
        info('Custom genes list: ' + genes_fpath)

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided in the sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    info('*' * 70)
    info()

    targqc_html_fpath = run_targqc(cnf, cnf.output_dir, bam_fpaths, bed_fpath, features_bed_fpath, genes_fpath)
    if targqc_html_fpath:
        send_email(cnf, 'TargQC report for ' + cnf.project_name + ':\n  ' + targqc_html_fpath)