def _resolve_ambiguities(annotated_by_loc_by_gene, chrom_order):
    annotated = []
    for (chrom, start, end), overlaps_by_gene in annotated_by_loc_by_gene.iteritems():
        for g_name, overlaps in overlaps_by_gene.iteritems():
            consensus = Region(chrom, start, end, ref_chrom_order=chrom_order.get(chrom),
                               gene_symbol=g_name, exon='', strand='', feature='', biotype='')
            for r, overlap_size in overlaps:
                if consensus.strand:
                    # RefSeq has exons from different strands with the same gene name (e.g. CTAGE4 for hg19).
                    # Such a pair of exons may overlap with a single region, so we take the strand from the first one.
                    if consensus.strand != r.strand:
                        warn('Warning: different strands between consensus and next region (gene: ' + g_name + ')')
                    # assert consensus.strand == r.strand, 'Consensus strand is ' + \
                    #     consensus.strand + ', region strand is ' + r.strand
                else:
                    consensus.strand = r.strand
                consensus.exon = merge_fields(consensus.exon, r.exon)
                consensus.feature = merge_fields(consensus.feature, r.feature)
                consensus.biotype = merge_fields(consensus.biotype, r.biotype)
                consensus.total_merged += 1
            annotated.append(consensus)
    return annotated

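# Usage sketch (hypothetical data; Region and chrom_order come from this module's
# annotation code). The input maps a (chrom, start, end) location to, per gene symbol,
# a list of (Region, overlap_size) pairs; one consensus Region is produced per gene
# per location:
#
#     annotated_by_loc_by_gene = {
#         ('chr1', 100, 200): {
#             'TP53': [(region_a, 50), (region_b, 100)],
#         },
#     }
#     consensus_regions = _resolve_ambiguities(annotated_by_loc_by_gene, {'chr1': 0})
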
def check_genome_resources(cnf):
    if cnf.genome is None:
        critical('Please specify a genome build (one of those available in ' + cnf.sys_cnf +
                 ') using the --genome option (e.g., --genome hg38).')
    if not cnf.genomes:
        critical('"genomes" section is not specified in system config ' + cnf.sys_cnf)

    info('Genome: ' + str(cnf.genome.name))

    for key in cnf.genome.keys():
        if key != 'name' and isinstance(cnf.genome[key], basestring):
            cnf.genome[key] = adjust_system_path(cnf.genome[key])
            if not verify_obj_by_path(cnf.genome[key], key, silent=True):
                # Fall back to a gzipped version of the resource if one exists
                if not cnf.genome[key].endswith('.gz') and verify_file(cnf.genome[key] + '.gz', silent=True):
                    cnf.genome[key] = cnf.genome[key] + '.gz'

    if not cnf.genome.features or not cnf.genome.bed_annotation_features or not cnf.genome.cds:
        warn('Warning: "features", "bed_annotation_features" and "cds" must be specified '
             'in the system config (' + cnf.sys_cnf + ').')

    if not cnf.transcripts_fpath:
        cnf.transcripts_fpath = get_canonical_transcripts(cnf.genome.name, ensembl=True)

def verify_vcf(vcf_fpath, silent=False, is_critical=False):
    if not verify_file(vcf_fpath, silent=silent, is_critical=is_critical):
        return None
    debug('File ' + vcf_fpath + ' exists and is not empty')
    vcf = open_gzipsafe(vcf_fpath)
    debug('File ' + vcf_fpath + ' opened')
    l = next(vcf, None)
    if l is None:
        (critical if is_critical else err)('Error: cannot read the VCF file ' + vcf_fpath)
        return None
    if not l.startswith('##fileformat=VCF'):
        (critical if is_critical else err)('Error: VCF must start with ##fileformat=VCF: ' + vcf_fpath)
        return None

    try:
        reader = vcf_parser.Reader(vcf)
    except:
        err('Error: cannot open the VCF file ' + vcf_fpath)
        if is_critical:
            raise
    else:
        debug('File ' + vcf_fpath + ' opened as VCF')
        try:
            rec = next(reader)
        except IndexError:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('IndexError parsing VCF file ' + vcf_fpath)
            if is_critical:
                raise
        except ValueError:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('ValueError parsing VCF file ' + vcf_fpath)
            if is_critical:
                raise
        except StopIteration:
            debug('No records in the VCF file ' + vcf_fpath)
            if not silent:
                warn('VCF file ' + vcf_fpath + ' has no records.')
            return vcf_fpath
        except:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('Other error parsing VCF file ' + vcf_fpath)
            if is_critical:
                raise
        else:
            debug('A record was read from the VCF file ' + vcf_fpath)
            return vcf_fpath
        # f = open_gzipsafe(output_fpath)
        # l = f.readline()
        # if 'Cannot allocate memory' in l:
        #     f.close()
        #     f = open_gzipsafe(output_fpath)
        #     contents = f.read()
        #     if not silent:
        #         if is_critical:
        #             critical('SnpSift failed with memory issue:\n' + contents)
        #         else:
        #             err('SnpSift failed with memory issue:\n' + contents)
        #         return None
        #     f.close()
        #     return None
        # return output_fpath
    finally:
        vcf.close()

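# Usage sketch: verify_vcf() returns the path on success and None otherwise, so it
# can be used both as a check and as a pass-through (the path is hypothetical):
#
#     vcf_fpath = verify_vcf('/data/sample.vcf.gz')
#     if vcf_fpath is None:
#         err('Input VCF is not usable, skipping')
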
def finalize_one(cnf, qc_report_fpath, qc_plots_fpaths):
    if qc_report_fpath:
        info('Saved QC report to ' + qc_report_fpath)
    if qc_plots_fpaths:
        info('Saved QC plots are in: ' + ', '.join(qc_plots_fpaths))
    elif not verify_module('matplotlib'):
        warn('Warning: QC plots were not generated because matplotlib is not installed.')

def find_fastq_pairs_by_sample_names(fastq_fpaths, sample_names):
    fastq_by_sn = OrderedDict()
    for sn in sample_names:
        sn_fastq_fpaths = sorted(
            [f for f in fastq_fpaths if basename(f).startswith(sn + '_R')])
        if len(sn_fastq_fpaths) == 0:
            err('Error: no fastq found for ' + sn)
            fastq_by_sn[sn] = None
        elif len(sn_fastq_fpaths) > 2:
            critical('Error: more than 2 fastq files starting with ' + sn + '_R: ' +
                     ', '.join(sn_fastq_fpaths))
        elif len(sn_fastq_fpaths) == 1:
            warn('Warning: only a single fastq file was found for ' + sn + '. Treating as single-end reads.')
            fastq_by_sn[sn] = [
                verify_file(sn_fastq_fpaths[0], description='sn_fastq_fpaths[0] for ' + str(sn)),
                None]
        else:
            fastq_by_sn[sn] = [
                verify_file(fpath, description='fpath from sn_fastq_fpaths for ' + str(sn))
                for fpath in sn_fastq_fpaths]
    return fastq_by_sn

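# Usage sketch (hypothetical paths): for paired-end data each value is [R1, R2],
# for single-end data it is [R1, None], and for missing samples it is None:
#
#     pairs = find_fastq_pairs_by_sample_names(
#         ['/fq/s1_R1.fastq.gz', '/fq/s1_R2.fastq.gz', '/fq/s2_R1.fastq.gz'],
#         ['s1', 's2'])
#     # -> OrderedDict([('s1', ['/fq/s1_R1.fastq.gz', '/fq/s1_R2.fastq.gz']),
#     #                 ('s2', ['/fq/s2_R1.fastq.gz', None])])
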
def parse_variants(fpath):
    sample_column_name = 'Sample'
    gene_column_name = 'Gene'

    genes_per_sample = dict()
    with open(fpath) as f:
        header = f.readline().split('\t')
        if sample_column_name not in header:
            warn('"' + sample_column_name + '" is not found in ' + fpath + ' header, skipping this file!')
            return genes_per_sample
        else:
            sample_column_id = header.index(sample_column_name)
        if gene_column_name not in header:
            warn('"' + gene_column_name + '" is not found in ' + fpath + ' header, skipping this file!')
            return genes_per_sample
        else:
            gene_column_id = header.index(gene_column_name)

        for line in f:
            line = line.split('\t')
            sample = line[sample_column_id]
            gene = line[gene_column_id]
            if sample not in genes_per_sample:
                genes_per_sample[sample] = set()
            genes_per_sample[sample].add(gene)

    info('Found info for %d samples:' % len(genes_per_sample))
    for k, v in genes_per_sample.items():
        info('\t%s (%d unique genes)' % (k, len(v)))
    return genes_per_sample

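# The expected input is a tab-separated file whose header contains at least the
# "Sample" and "Gene" columns, e.g. (hypothetical file name and rows):
#
#     Sample  Gene  ...
#     s1      TP53  ...
#     s1      EGFR  ...
#
#     genes_per_sample = parse_variants('mutations.PASS.txt')  # -> {'s1': {'TP53', 'EGFR'}}
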
def merge_vcfs(cnf, vcf_fpath_by_sname, combined_vcf_fpath):
    if cnf.reuse_intermediate and isfile(combined_vcf_fpath + '.gz') \
            and verify_vcf(combined_vcf_fpath + '.gz'):
        info(combined_vcf_fpath + '.gz exists, reusing')
        return combined_vcf_fpath + '.gz'

    bcftools = get_system_path(cnf, 'bcftools')
    if not bcftools:
        info('bcftools is not found, skipping merging VCFs')
        return None

    cmdl = '{bcftools} merge --force-samples '.format(**locals())
    for sample, vcf_fpath in vcf_fpath_by_sname.iteritems():
        if vcf_fpath:
            cmdl += ' ' + vcf_fpath + ' '
    cmdl += ' -o ' + combined_vcf_fpath

    res = call(cnf, cmdl, output_fpath=combined_vcf_fpath, stdout_to_outputfile=False, exit_on_error=False)
    if res:
        info('Joined VCFs, saved into ' + combined_vcf_fpath)
        if isfile(combined_vcf_fpath + '.tx.idx'):
            try:
                os.remove(combined_vcf_fpath + '.tx.idx')
            except OSError:
                info()
        # Pass cnf as the first argument, matching the bgzip_and_tabix call in combine_vcfs() below
        return bgzip_and_tabix(cnf, combined_vcf_fpath)
    else:
        warn('Could not join VCFs')
        return None

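# Usage sketch (hypothetical sample names and paths); samples whose VCF is None
# are skipped:
#
#     merged = merge_vcfs(cnf,
#                         OrderedDict([('s1', '/vcfs/s1.vcf.gz'), ('s2', None)]),
#                         '/work/combined.vcf')
#     # -> '/work/combined.vcf.gz' on success, None if bcftools is missing or merging failed
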
def __init__(self, dirpath, az_prjname_by_subprj, samplesheet=None):
    info('Parsing the NextSeq500 project structure')
    self.kind = 'nextseq500'
    DatasetStructure.__init__(self, dirpath, az_prjname_by_subprj, samplesheet=samplesheet)
    info('az_prjname_by_subprj: ' + str(az_prjname_by_subprj))
    verify_dir(self.unaligned_dirpath, is_critical=True)

    for pname, project in self.project_by_name.items():
        az_proj_name = az_prjname_by_subprj.get(pname) if not isinstance(az_prjname_by_subprj, basestring) \
            else az_prjname_by_subprj
        if az_proj_name is None:
            if len(self.project_by_name) > 1:
                warn('Warn: cannot match subproject ' + pname + ' with project names and JIRA cases. '
                     'Please follow the SOP for a multiple-project run: '
                     'http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting')
                continue
            az_proj_name = az_prjname_by_subprj.values()[0]
        project.set_dirpath(self.unaligned_dirpath, az_proj_name)
        for sample in project.sample_by_name.values():
            sample.source_fastq_dirpath = project.dirpath
            sample.set_up_out_dirs(project.fastq_dirpath, project.fastqc_dirpath,
                                   project.downsample_targqc_dirpath)

    self.basecall_stat_html_reports = self.__get_basecall_stats_reports()
    self.get_fastq_regexp_fn = get_nextseq500_regexp

def markdup_bam(cnf, in_bam_fpath, bammarkduplicates=None):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if not bammarkduplicates:
        bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
        if not bammarkduplicates:
            warn('No biobambam bammarkduplicates, can\'t mark duplicates.')
            return None

    out_bam_fpath = add_suffix(in_bam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir, splitext_plus(basename(in_bam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = ('{bammarkduplicates} tmpfile={tmp_fpath} '
               'I={in_bam_fpath} O={out_bam_fpath}').format(**locals())
    res = call(cnf, cmdline, output_fpath=out_bam_fpath, stdout_to_outputfile=False, exit_on_error=False)
    if res:
        return out_bam_fpath
    else:
        return None

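# Usage sketch (hypothetical path); returns the path to a '.markdup'-suffixed BAM
# on success, or None if biobambam is unavailable or the call failed:
#
#     dedup_bam = markdup_bam(cnf, '/bams/sample1.bam')
#     if dedup_bam:
#         info('Marked duplicates: ' + dedup_bam)
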
@contextmanager  # assumed: the yield/finally pattern implies contextlib.contextmanager
def tmpdir():
    dirpath = make_tmpdir()
    try:
        yield dirpath
    finally:
        try:
            shutil.rmtree(dirpath)
        except OSError:
            warn('Warning: cannot clean up temporary dir ' + dirpath)

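# Usage sketch: the directory exists only within the `with` block and is removed
# (best-effort) on exit:
#
#     with tmpdir() as tmp_dirpath:
#         work_fpath = join(tmp_dirpath, 'intermediate.bed')
#         ...
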
def __find_unaligned_dir(self):
    unaligned_dirpath = join(self.dirpath, 'Unalign')
    if not verify_dir(unaligned_dirpath, description='"Unalign" directory', silent=True):
        unaligned_dirpath = None
        warn('No unalign directory')
    return unaligned_dirpath

@contextmanager  # assumed, as with tmpdir() above
def workdir(cnf):
    if cnf.work_dir:
        verify_dir(cnf.work_dir, is_critical=True)
        yield cnf.work_dir
    else:
        cnf.work_dir = make_tmpdir()
        yield cnf.work_dir
        try:
            shutil.rmtree(cnf.work_dir)
        except OSError:
            warn('Warning: cannot clean up temporary dir ' + cnf.work_dir)

def __init__(self, dirpath, az_prjname_by_subprj=None, samplesheet=None):
    info('Parsing the HiSeq project structure')
    self.kind = 'hiseq'
    DatasetStructure.__init__(self, dirpath, az_prjname_by_subprj, samplesheet=samplesheet)
    verify_dir(self.unaligned_dirpath, is_critical=True)
    self.basecall_stat_html_reports = self.__get_basecall_stats_reports()

    for pname, project in self.project_by_name.items():
        proj_dirpath = join(self.unaligned_dirpath,
                            'Project_' + pname.replace(' ', '-'))  # .replace('-', '_').replace('.', '_')
        az_proj_name = az_prjname_by_subprj.get(pname) if not isinstance(az_prjname_by_subprj, basestring) \
            else az_prjname_by_subprj
        if az_proj_name is None:
            if len(self.project_by_name) > 1:
                warn('Warn: cannot match subproject ' + pname + ' with project names and JIRA cases. '
                     'Please follow the SOP for a multiple-project run: '
                     'http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting')
                continue
            az_proj_name = az_prjname_by_subprj.values()[0]
        project.set_dirpath(proj_dirpath, az_proj_name)
        for sname, sample in project.sample_by_name.items():
            sample.source_fastq_dirpath = join(project.dirpath,
                                               'Sample_' + sname.replace(' ', '-'))  # .replace('-', '_').replace('.', '_')
            sample.set_up_out_dirs(project.fastq_dirpath, project.fastqc_dirpath,
                                   project.downsample_targqc_dirpath)

        basecalls_symlink = join(project.dirpath, 'BaseCallsReports')
        if not exists(basecalls_symlink):
            info('Creating BaseCalls symlink ' + self.basecalls_dirpath + ' -> ' + basecalls_symlink)
            try:
                os.symlink(self.basecalls_dirpath, basecalls_symlink)
            except OSError:
                err('Cannot create symlink')
                traceback.print_exc()
            else:
                info('Created')
        if exists(basecalls_symlink):
            self.basecalls_dirpath = basecalls_symlink

    self.get_fastq_regexp_fn = get_hiseq_regexp

def get_regions_coverage(cnf, samples):
    cov_thresholds = [1, 5, 10, 15, 20, 25, 30, 50, 100]
    depths_by_pos = defaultdict(lambda: [0] * len(samples))

    info()
    info('Coverage to bedgraph for ' + cnf.chrom)
    coverage_fpaths = []
    for index, sample in enumerate(samples):
        coverage_fpath = join(cnf.work_dir, sample.name + '_' + cnf.chrom + '.bedgraph')
        coverage_fpath = get_bedgraph_coverage(cnf, sample.bam, chr_len_fpath=cnf.chr_len_fpath,
                                               bed_fpath=cnf.bed, output_fpath=coverage_fpath,
                                               exit_on_error=False)
        if coverage_fpath and verify_file(coverage_fpath):
            coverage_fpaths.append(coverage_fpath)
            for line in open(coverage_fpath):
                if line.startswith('#'):
                    continue
                chrom, start, end, depth = line.split('\t')
                start, end, depth = map(int, (start, end, depth))
                for pos in xrange(start, end):
                    depths_by_pos[pos][index] = depth

    info()
    if not coverage_fpaths:
        warn(cnf.chrom + ' is not covered in all samples')
        return None

    info()
    info('Writing coverage for ' + cnf.chrom)
    write_coverage(cnf, cnf.output_dir, cnf.chrom, depths_by_pos, cov_thresholds)
    for index, sample in enumerate(samples):
        # Use cnf.chrom consistently: the bare `chrom` here was only ever defined
        # as a leftover of the bedgraph-parsing loop above
        info('Writing coverage for ' + sample.name + ', ' + cnf.chrom)
        sample_output_dirpath = join(cnf.output_dir, sample.name)
        output_fpath = join(sample_output_dirpath, cnf.chrom + '.txt.gz')
        if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            continue
        write_coverage(cnf, sample_output_dirpath, cnf.chrom, depths_by_pos,
                       cov_thresholds, sample_index=index)
        if not verify_file(output_fpath, silent=True):
            warn(sample.name + ' has no coverage at chromosome ' + cnf.chrom)
    return depths_by_pos

def create_jbrowse_symlink(genome, project_name, sample, file_fpath):
    jbrowse_data_path, _, _ = set_folders(genome)
    jbrowse_dirpath = join(jbrowse_data_path, 'tracks')
    jbrowse_project_dirpath = join(jbrowse_dirpath, project_name)

    base, ext = splitext_plus(file_fpath)
    if ext in ['.tbi', '.bai']:
        base, ext2 = splitext_plus(base)
        ext = ext2 + ext
    sym_link = join(jbrowse_project_dirpath, sample + ext)

    if not verify_dir(jbrowse_project_dirpath):
        safe_mkdir(jbrowse_project_dirpath)
    if isfile(file_fpath) and not isfile(sym_link):
        try:
            os.symlink(file_fpath, sym_link)
        except OSError:
            warn(traceback.format_exc())
    if isfile(sym_link):
        change_permissions(sym_link)
    return sym_link

def combine_vcfs(cnf, vcf_fpath_by_sname, combined_vcf_fpath, additional_parameters=''):
    gatk = get_java_tool_cmdline(cnf, 'gatk')
    if not gatk:
        info('GATK is not found, skipping merging VCFs')
        return None

    cmdl = '{gatk} -T CombineVariants -R {cnf.genome.seq} {additional_parameters}'.format(**locals())
    for s_name, vcf_fpath in vcf_fpath_by_sname.items():
        if vcf_fpath:
            cmdl += ' --variant:' + s_name + ' ' + vcf_fpath
    if ' --variant:' not in cmdl:
        err('No VCFs to combine')
        return None

    if cnf.reuse_intermediate and isfile(combined_vcf_fpath + '.gz') \
            and verify_vcf(combined_vcf_fpath + '.gz'):
        info(combined_vcf_fpath + '.gz exists, reusing')
        return combined_vcf_fpath + '.gz'

    cmdl += ' -o ' + combined_vcf_fpath
    res = call(cnf, cmdl, output_fpath=combined_vcf_fpath, stdout_to_outputfile=False, exit_on_error=False)
    if res:
        info('Joined VCFs, saved into ' + combined_vcf_fpath)
        if isfile(combined_vcf_fpath + '.tx.idx'):
            try:
                os.remove(combined_vcf_fpath + '.tx.idx')
            except OSError:
                err(traceback.format_exc())
                info()
        return bgzip_and_tabix(cnf, combined_vcf_fpath)
    else:
        warn('Could not join VCFs')
        return None

def __init__(self, dirpath, az_prjname_by_subprj, samplesheet=None):
    info('Parsing the MiSeq project structure')
    self.kind = 'miseq'
    DatasetStructure.__init__(self, dirpath, az_prjname_by_subprj, samplesheet=samplesheet)

    base_dirpath = self.unaligned_dirpath
    if not verify_dir(base_dirpath, silent=True):
        base_dirpath = self.basecalls_dirpath
    verify_dir(base_dirpath, description='Source fastq dir')

    for pname, project in self.project_by_name.items():
        proj_dirpath = join(base_dirpath, pname)
        if not verify_dir(proj_dirpath, silent=True):
            proj_dirpath = base_dirpath

        az_proj_name = az_prjname_by_subprj.get(pname) if not isinstance(az_prjname_by_subprj, basestring) \
            else az_prjname_by_subprj
        if az_proj_name is None:
            if len(self.project_by_name) > 1:
                warn('Warn: cannot match subproject ' + pname + ' with project names and JIRA cases. '
                     'Please follow the SOP for a multiple-project run: '
                     'http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting')
                continue
            az_proj_name = az_prjname_by_subprj.values()[0]
        project.set_dirpath(proj_dirpath, az_proj_name)
        for sample in project.sample_by_name.values():
            sample.source_fastq_dirpath = project.dirpath
            sample.set_up_out_dirs(project.fastq_dirpath, project.fastqc_dirpath,
                                   project.downsample_targqc_dirpath)

    self.basecall_stat_html_reports = []
    self.get_fastq_regexp_fn = get_hiseq4000_miseq_regexp

def remove_dups_picard(cnf, bam_fpath):
    picard = get_system_path(cnf, 'java', 'picard')
    if not picard:
        critical('No picard in the system')

    info('Running picard dedup for "' + basename(bam_fpath) + '"')

    dup_metrics_txt = join(cnf.work_dir, 'picard_dup_metrics.txt')
    output_fpath = intermediate_fname(cnf, bam_fpath, 'pcd_dedup')

    cmdline = '{picard} MarkDuplicates' \
              ' I={bam_fpath}' \
              ' O={output_fpath}' \
              ' METRICS_FILE={dup_metrics_txt}' \
              ' REMOVE_DUPLICATES=True' \
              ' VALIDATION_STRINGENCY=LENIENT'
    res = call(cnf, cmdline.format(**locals()), output_fpath=output_fpath,
               stdout_to_outputfile=False, exit_on_error=False)

    if res != output_fpath:  # an error occurred; try to correct the BAM and restart
        warn('Picard deduplication failed for "' + basename(bam_fpath) +
             '". Fixing the BAM and restarting Picard...')
        bam_fpath = _fix_bam_for_picard(cnf, bam_fpath)
        res = call(cnf, cmdline.format(**locals()), output_fpath=output_fpath,
                   stdout_to_outputfile=False, exit_on_error=False)

    if res == output_fpath:
        dup_rate = _parse_picard_dup_report(dup_metrics_txt)
        # Check for None first; the original order relied on Python 2 allowing None <= 1.0
        assert dup_rate is None or dup_rate <= 1.0, str(dup_rate)
        info('Duplication rate (picard): ' + str(dup_rate))
        return output_fpath
    else:
        return None

def get_sample_column_index(vcf_fpath, samplename, suppress_warn=False):
    vcf_header_samples = read_sample_names_from_vcf(vcf_fpath)

    if len(vcf_header_samples) == 0:
        return None

    if len(vcf_header_samples) == 1:
        if vcf_header_samples[0].lower() == samplename.lower():
            return 0
        else:
            return None

    name = next((name for name in vcf_header_samples if name.lower() == samplename.lower()), None)
    if name is None:
        if not suppress_warn:
            warn('No sample ' + samplename + ' in header with samples ' +
                 ', '.join(vcf_header_samples) + ' for ' + vcf_fpath)
        name = next((name for name in vcf_header_samples if name.lower() != 'none'), None)
    if name is None:
        err('All sample names are None.')
        return None
    else:
        return vcf_header_samples.index(name)

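# Usage sketch (hypothetical inputs): for a multi-sample VCF header ending in
# "... FORMAT  TUMOR  NORMAL", get_sample_column_index(vcf_fpath, 'tumor') returns 0
# (the match is case-insensitive); if nothing matches, it falls back to the first
# sample not named "none", and returns None only when no usable sample exists.
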
def detect_sys_cnf_by_location():
    if is_uk():
        res = defaults['sys_cnfs']['uk']
    elif is_sweden():
        res = defaults['sys_cnfs']['sweden']
    elif is_china():
        res = defaults['sys_cnfs']['china']
    elif is_us():
        res = defaults['sys_cnfs']['us']
    elif is_cloud():
        res = defaults['sys_cnfs']['cloud']
    elif is_local():
        res = defaults['sys_cnfs']['local']
    elif is_ace():
        res = defaults['sys_cnfs']['ace']
    elif is_chihua():
        res = defaults['sys_cnfs']['chihua']
    else:
        warn('Warning: could not detect location by hostname: ' + socket.gethostname() + '. Using local')
        res = defaults['sys_cnfs']['local']
    return res

def del_jobs(cnf, jobs_running):
    done_job_ids = [j.job_id for j in jobs_running if not j.is_done and not j.not_wait]
    if done_job_ids:
        qdel = get_system_path(cnf, 'qdel', is_critical=False)
        command = ' '.join(done_job_ids)
        if qdel:
            res = call(cnf, qdel + ' ' + command, exit_on_error=False, silent=not cnf.debug)
            if res == 0:
                info('All running jobs for this project have been deleted from the queue.')
            else:
                warn('Can\'t run qdel. Please kill the remaining jobs manually using the following command:')
                warn('  qdel ' + command)
        else:
            warn('Can\'t find qdel. Please kill the remaining jobs manually using the following command:')
            warn('  qdel ' + command)
        info()

def parse_response(res, mut):
    ok = True
    for f in ['allele_origin', 'clinical_significance', 'genomic_coordinates']:
        if f not in res:
            warn('No ' + f + ' in SolveBio for mutation ' + str(mut))
            ok = False
    if not ok:
        return None

    rec = SolvebioRecord()
    rec.clinsig = res['clinical_significance']
    if rec.clinsig.lower() == 'other':
        rec.clinsig = 'Uncertain'

    coords = res['genomic_coordinates']
    rec.url = 'https://astrazeneca.solvebio.com/variant/GRCH37-{chrom}-{start}-{stop}-{alt}'.format(
        chrom=coords['chromosome'], start=coords['start'], stop=coords['stop'], alt=res['allele'])
    return rec

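# A minimal sketch of the SolveBio response shape this parser expects (all field
# values hypothetical; `mut` is only used for the warning message):
#
#     res = {
#         'allele_origin': 'somatic',
#         'clinical_significance': 'Other',  # mapped to 'Uncertain'
#         'allele': 'T',
#         'genomic_coordinates': {'chromosome': '7', 'start': 140453136, 'stop': 140453136},
#     }
#     rec = parse_response(res, mut)  # -> SolvebioRecord with .clinsig and .url set
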
def markdup_sam(cnf, in_sam_fpath, samblaster=None):
    """Perform non-stream based deduplication of SAM input files using samblaster.
    """
    if not samblaster:
        samblaster = get_system_path(cnf, 'samblaster')
        if not samblaster:
            warn('No samblaster, can\'t mark duplicates.')
            return None

    out_sam_fpath = add_suffix(in_sam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir, splitext_plus(basename(in_sam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = '{samblaster} -i {in_sam_fpath} -o {out_sam_fpath}'.format(**locals())
    res = call(cnf, cmdline, output_fpath=out_sam_fpath, stdout_to_outputfile=False, exit_on_error=False)
    if res:
        return out_sam_fpath
    else:
        return None

def flag_stat(cnf, bam):
    output_fpath = join(cnf.work_dir, basename(bam) + '_flag_stats')
    cmdline = 'flagstat {bam}'.format(**locals())
    call_sambamba(cnf, cmdline, output_fpath=output_fpath, bam_fpath=bam, command_name='flagstat')

    stats = dict()
    with open(output_fpath) as f:
        lines = f.readlines()

    for stat, fun in [
            ('total', number_of_reads),
            ('duplicates', number_of_dup_reads),                     # '-f 1024'
            ('mapped', number_of_mapped_reads),                      # '-F 4'
            ('properly paired', number_of_properly_paired_reads)]:  # '-f 2'
        try:
            val = next(l.split()[0] for l in lines if stat in l)
        except StopIteration:
            warn('Cannot extract ' + stat + ' from flagstat output ' + output_fpath +
                 '. Trying samtools view -c...')
            val = None
        else:
            try:
                val = int(val)
            except ValueError:
                warn('Cannot parse value ' + str(val) + ' for ' + stat + ' from flagstat output ' +
                     output_fpath + '. Trying samtools view -c...')
                val = None
        if val is not None:
            stats[stat] = val
        else:
            stats[stat] = fun(cnf, bam)
    return stats

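# Usage sketch (hypothetical path): returns a dict keyed by the flagstat metric names,
# falling back to the per-metric counting helpers when a value cannot be extracted
# from the flagstat output:
#
#     stats = flag_stat(cnf, '/bams/sample1.bam')
#     # -> {'total': ..., 'duplicates': ..., 'mapped': ..., 'properly paired': ...}
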
def create_oncoprints_link(cnf, bcbio_structure, project_name=None):
    if is_us():
        loc = exposing.us
    # elif is_uk(): loc = exposing.uk
    else:
        loc = exposing.local
        return None

    if not bcbio_structure.variant_callers:
        info('No variant calling performed, not generating Oncoprints')
        return None
    clinical_report_caller = \
        bcbio_structure.variant_callers.get('vardict') or \
        bcbio_structure.variant_callers.get('vardict-java')
    if not clinical_report_caller:
        err('Warning: vardict is not in the variant callers list, not generating Oncoprints')
        return None

    step_greetings('Creating Oncoprints link')
    zhongwu_data_query_dirpath = '/home/kdld047/public_html/cgi-bin/TS'
    if not isdir(zhongwu_data_query_dirpath):
        warn('Data Query directory ' + zhongwu_data_query_dirpath + ' does not exist.')
        return None

    vardict_txt_fname = variant_filtering.mut_fname_template.format(caller_name=clinical_report_caller.name)
    vardict_txt_fpath = join(bcbio_structure.var_dirpath, vardict_txt_fname)
    cnf.mutations_fpath = add_suffix(vardict_txt_fpath, variant_filtering.mut_pass_suffix)
    cnf.seq2c_tsv_fpath = bcbio_structure.seq2c_fpath

    samples = sorted(bcbio_structure.samples)
    cnf.project_name = project_name or bcbio_structure.project_name or basename(cnf.output_dir)
    study_name = re.sub('[\.\-:&]', '_', cnf.project_name)

    check_genome_resources(cnf)

    data_query_dirpath = join(loc.dirpath, 'DataQueryTool')

    data_fpath = join(zhongwu_data_query_dirpath, study_name + '.data.txt')
    info_fpath = join(zhongwu_data_query_dirpath, study_name + '.info.txt')
    altered_genes = print_data_txt(cnf, cnf.mutations_fpath, cnf.seq2c_tsv_fpath, samples, data_fpath)
    if not altered_genes:
        err('No altered genes in ' + cnf.mutations_fpath + ' or ' + cnf.seq2c_tsv_fpath +
            ', not generating Oncoprints.')
        return None

    print_info_txt(cnf, samples, info_fpath)

    data_ext_fpath = data_fpath.replace('/home/', '/users/')
    info_ext_fpath = info_fpath.replace('/home/', '/users/')

    # optional:
    data_symlink = join(data_query_dirpath, study_name + '.data.txt')
    info_symlink = join(data_query_dirpath, study_name + '.info.txt')
    (symlink_to_ngs if is_us() else local_symlink)(data_ext_fpath, data_symlink)
    (symlink_to_ngs if is_us() else local_symlink)(info_ext_fpath, info_symlink)

    properties_fpath = join(zhongwu_data_query_dirpath, 'DataQuery.properties')
    add_data_query_properties(cnf, study_name, properties_fpath, data_ext_fpath, info_ext_fpath)

    genes = '%0D%0A'.join(altered_genes)
    data_query_url = join(loc.website_url_base, 'DataQueryTool',
                          'DataQuery.pl?'
                          'analysis=oncoprint&'
                          'study={study_name}&'
                          'gene={genes}&'
                          'order=on&'
                          'freq=50&'
                          'nocheckgenes=true&'
                          'submit=Submit'.format(**locals()))
    info()
    info('Information about the study was added to the Data Query Tool; URL: ' + data_query_url)
    return data_query_url

def write_vcfs(cnf, var_samples, output_dirpath, caller_name, vcf2txt_res_fpath, mut_res_fpath, threads_num):
    info('')
    info('-' * 70)
    info('Writing VCFs')

    variants_by_sample = defaultdict(dict)
    mutations_by_sample = defaultdict(set)

    info('Collecting passed variants...')
    with open(mut_res_fpath) as fh:
        for l in fh:
            ts = l.split('\t')
            s_name, chrom, pos, alt = ts[0], ts[1], ts[2], ts[5]
            mutations_by_sample[s_name].add((chrom, pos, alt))

    info('Collecting all vcf2txt variants...')
    with open(vcf2txt_res_fpath) as vcf2txt_f:
        pass_col = None
        for l in vcf2txt_f:
            if l.startswith('Sample'):
                pass_col = l.split('\t').index('PASS')
            else:
                ts = l.split('\t')
                s_name, chrom, pos, alt = ts[0], ts[1], ts[2], ts[5]
                filt = ts[pass_col]
                variants_by_sample[s_name][(chrom, pos, alt)] = filt

    info()
    info('Writing filtered VCFs in ' + str(threads_num) + ' threads')
    try:
        Parallel(n_jobs=threads_num) \
            (delayed(postprocess_vcf)
                (None, cnf.work_dir, var_sample, caller_name,
                 variants_by_sample[var_sample.name],
                 mutations_by_sample[var_sample.name], vcf2txt_res_fpath)
             for var_sample in var_samples)
        info('Done postprocessing all filtered VCFs.')
    except OSError:
        err(traceback.format_exc())
        warn('Running sequentially instead of in ' + str(threads_num) + ' threads')
        try:
            Parallel(n_jobs=1) \
                (delayed(postprocess_vcf)
                    (None, cnf.work_dir, var_sample, caller_name,
                     variants_by_sample[var_sample.name],
                     mutations_by_sample[var_sample.name], vcf2txt_res_fpath)
                 for var_sample in var_samples)
            info('Done postprocessing all filtered VCFs.')
        except OSError:
            err(traceback.format_exc())
            err('Cannot postprocess VCF - skipping')
            err()
    info('Filtered VCFs are written.')

def extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed):
    gene_key_set = set()
    gene_key_list = []

    info()
    info('Getting gene list')

    # if genes_fpath:
    #     with open(genes_fpath) as f:
    #         gene_key_list = [g.strip() for g in f.read().split('\n') if g]
    #         gene_key_set = set(gene_key_list)
    #     info('Using genes from ' + genes_fpath + ', filtering exons and amplicons with these genes.')
    #     if target_bed:
    #         target_bed = filter_bed_with_gene_set(cnf, target_bed, gene_key_set)
    #     if exons_bed:
    #         exons_bed = filter_bed_with_gene_set(cnf, exons_bed, gene_key_set)
    #         exons_no_genes_bed = filter_bed_with_gene_set(cnf, exons_no_genes_bed, gene_key_set)
    # else:

    if target_bed:
        info()
        gene_key_set, gene_key_list = get_gene_keys(target_bed)
        info('Using genes from the amplicons list ' + target_bed)
        if features_bed and cnf.prep_bed is not False:
            info('Trying to filter exons with these ' + str(len(gene_key_list)) + ' genes.')
            features_filt_bed = filter_bed_with_gene_set(cnf, features_bed, gene_key_set,
                                                         suffix='target_genes_1st_round')
            if not verify_file(features_filt_bed):
                info()
                warn('No gene symbols from the capture BED file were found in the features BED file. '
                     'Re-annotating target...')
                target_bed = annotate_target(cnf, target_bed)
                # info('Merging regions within genes...')
                # target_bed = group_and_merge_regions_by_gene(cnf, target_bed, keep_genes=False)
                info('Sorting amplicons_bed by (chrom, gene_name, start)')
                target_bed = sort_bed(cnf, target_bed)
                info('Getting gene names again...')
                gene_key_set, gene_key_list = get_gene_keys(target_bed)
                info()
                info('Using genes from the new amplicons list, filtering features with these genes again.')
                features_filt_bed = filter_bed_with_gene_set(cnf, features_bed, gene_key_set,
                                                             suffix='target_genes_2nd_round')
                if not verify_file(features_filt_bed):
                    critical('No gene symbols from the capture BED file were found in the features BED.')
            features_bed = features_filt_bed

            info('Filtering the full features file including gene records.')
            features_no_genes_bed = filter_bed_with_gene_set(cnf, features_no_genes_bed, gene_key_set,
                                                             suffix='target_genes')
    elif features_no_genes_bed:
        info()
        info('No target (WGS), getting the gene names from the full features list...')
        gene_key_set, gene_key_list = get_gene_keys(features_no_genes_bed)

    info()
    return gene_key_set, gene_key_list, target_bed, features_bed, features_no_genes_bed

def prepare_beds(cnf, features_bed=None, target_bed=None, seq2c_bed=None):
    if features_bed is None and target_bed is None:
        warn('No input target BED, and no features BED specified in the system config. '
             'Not making detailed per-gene reports.')
        # return None, None, None, None

    if target_bed:
        target_bed = verify_bed(target_bed, is_critical=True)
    if seq2c_bed:
        seq2c_bed = verify_bed(seq2c_bed, is_critical=True)
    if features_bed:
        features_bed = verify_bed(features_bed, is_critical=True)

    # if features_bed and target_bed and abspath(features_bed) == abspath(target_bed):
    #     warn('Same file used for exons and amplicons: ' + features_bed)

    # Features
    features_no_genes_bed = None
    if features_bed:
        # info()
        # info('Merging regions within genes...')
        # exons_bed = group_and_merge_regions_by_gene(cnf, exons_bed, keep_genes=True)
        #
        # info()
        # info('Sorting exons by (chrom, gene name, start)')
        # exons_bed = sort_bed(cnf, exons_bed)

        info()
        info('Filtering the features BED file to keep only non-gene and non-transcript records...')
        features_no_genes_bed = intermediate_fname(cnf, features_bed, 'no_genes')
        call(cnf, 'grep -vw Gene ' + features_bed + ' | grep -vw Transcript',
             output_fpath=features_no_genes_bed)

    ori_target_bed_path = target_bed
    if target_bed:
        info()
        info('Removing comments in target...')
        target_bed = remove_comments(cnf, target_bed)

        info()
        info('Cutting target to the first 4 columns...')
        target_bed = cut(cnf, target_bed, 4)

        info()
        info('Sorting target...')
        target_bed = sort_bed(cnf, target_bed)

        cols = count_bed_cols(target_bed)
        if cnf.reannotate or cols < 4:
            info()
            if not features_bed:
                critical(str(cols) + ' columns (less than 4), and no features to annotate regions '
                         '(please make sure you have set the "features" key in the corresponding genome section '
                         '(' + cnf.genome.name + ') in ' + cnf.sys_cnf + ')')
            info('cnf.reannotate is ' + str(cnf.reannotate) + ', and the number of columns in the target BED is ' +
                 str(cols) + '. Annotating the target with gene names from the "features" file ' +
                 features_bed + '...')
            target_bed = annotate_target(cnf, target_bed)

    def remove_no_anno(l, i):
        if l.split('\t')[3].strip() == '.':
            return None
        else:
            return l

    if not seq2c_bed and target_bed or seq2c_bed and seq2c_bed == ori_target_bed_path:
        info('Seq2C bed: removing regions with no gene annotation')
        seq2c_bed = target_bed
        seq2c_bed = iterate_file(cnf, seq2c_bed, remove_no_anno, suffix='filt')

    elif seq2c_bed:
        info()
        info('Removing comments in seq2c bed...')
        seq2c_bed = remove_comments(cnf, seq2c_bed)

        info()
        info('Sorting seq2c bed...')
        seq2c_bed = sort_bed(cnf, seq2c_bed)

        cols = count_bed_cols(seq2c_bed)
        if cols < 4:
            info()
            info('The number of columns in the SV bed is ' + str(cols) +
                 '. Annotating amplicons with gene names...')
            seq2c_bed = annotate_target(cnf, seq2c_bed)
        elif 8 > cols > 4:
            seq2c_bed = cut(cnf, seq2c_bed, 4)
        elif cols > 8:
            seq2c_bed = cut(cnf, seq2c_bed, 8)
        info('Filtering non-annotated entries in seq2c bed')
        seq2c_bed = iterate_file(cnf, seq2c_bed, remove_no_anno, suffix='filt')

    else:
        seq2c_bed = verify_bed(cnf.genome.cds)

    if target_bed:
        info()
        # info('Merging amplicons...')
        # target_bed = group_and_merge_regions_by_gene(cnf, target_bed, keep_genes=False)
        info('Sorting target by (chrom, gene name, start)')
        target_bed = sort_bed(cnf, target_bed)

    return features_bed, features_no_genes_bed, target_bed, seq2c_bed

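# Usage sketch (hypothetical paths; using cnf.genome.features as the features BED is
# one possible setup): all three input BEDs are optional, and the returned paths
# point to the prepared (sorted/cut/annotated) intermediate files:
#
#     features_bed, features_no_genes_bed, target_bed, seq2c_bed = \
#         prepare_beds(cnf, features_bed=cnf.genome.features, target_bed='/beds/panel.bed')
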
def _run_multisample_qualimap(cnf, output_dir, samples, targqc_full_report):
    """
    1. Generates Qualimap2 plots and puts them into plots_dirpath
    2. Adds records to targqc_full_report.plots
    """
    plots_dirpath = join(output_dir, 'plots')
    if cnf.reuse_intermediate and verify_dir(plots_dirpath) and \
            [f for f in listdir(plots_dirpath) if not f.startswith('.')]:
        info('Qualimap multisample plots exist - ' + plots_dirpath + ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if len([s.qualimap_html_fpath for s in samples if s.qualimap_html_fpath]) > 0:
            qualimap = get_system_path(cnf, interpreter_or_name=None, name='qualimap')
            if qualimap is not None and get_qualimap_type(qualimap) == 'full':
                qualimap_output_dir = join(cnf.work_dir, 'qualimap_multi_bamqc')

                _correct_qualimap_genome_results(cnf, samples)
                _correct_qualimap_insert_size_histogram(cnf, samples)

                safe_mkdir(qualimap_output_dir)
                rows = []
                for sample in samples:
                    if sample.qualimap_html_fpath:
                        rows += [[sample.name, sample.qualimap_html_fpath]]

                data_fpath = write_tsv_rows(rows, join(qualimap_output_dir, 'qualimap_results_by_sample.tsv'))
                qualimap_plots_dirpath = join(qualimap_output_dir, 'images_multisampleBamQcReport')
                cmdline = '{qualimap} multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(**locals())
                res = call(cnf, cmdline, exit_on_error=False, return_err_code=True,
                           env_vars=dict(DISPLAY=None),
                           output_fpath=qualimap_plots_dirpath, output_is_dir=True)
                if res is None or not verify_dir(qualimap_plots_dirpath):
                    warn('Warning: Qualimap for multi-sample analysis failed to finish. '
                         'TargQC will not contain plots.')
                    return None
                else:
                    if exists(plots_dirpath):
                        shutil.rmtree(plots_dirpath)
                    shutil.move(qualimap_plots_dirpath, plots_dirpath)
            else:
                warn('Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.')
                return None

    targqc_full_report.plots = []
    for plot_fpath in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fpath)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))

def retrieve_jira_info(url):
    try:
        from jira import JIRA
    except ImportError, e:
        warn('Cannot import JIRA: ' + str(e))
        return None