def read_samples(sample2bam_fpath):
    bam_fpaths = []
    sample_names = []
    bad_bam_fpaths = []

    info('Reading sample info from ' + sample2bam_fpath)
    with open(sample2bam_fpath) as f:
        for l in f:
            if l.startswith('#'):
                continue
            l = l.replace('\n', '')
            if not l:
                continue
            if len(l.split('\t')) == 2:
                sample_name, bam_fpath = l.split('\t')
            else:
                sample_name, bam_fpath = None, l

            # collect all bad BAMs first so they can be reported together below
            verified_bam_fpath = verify_bam(bam_fpath)
            if not verified_bam_fpath:
                bad_bam_fpaths.append(bam_fpath)
                continue
            bam_fpath = verified_bam_fpath
            bam_fpaths.append(bam_fpath)

            if sample_name is None:
                sample_name = basename(splitext(bam_fpath)[0])
                if sample_name.endswith('-ready'):
                    sample_name = sample_name.split('-ready')[0]
            sample_names.append(sample_name)
            info(sample_name + ': ' + bam_fpath)

    if bad_bam_fpaths:
        critical('BAM files cannot be found, are empty, or are not BAMs: ' + ', '.join(bad_bam_fpaths))

    return sample_names, bam_fpaths

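# For reference, the sample2bam input that read_samples() parses: tab-separated,
# '#' for comments, either "sample_name<TAB>bam_path" or a bare BAM path (in
# which case the sample name is derived from the file name, with a trailing
# '-ready' suffix stripped). The paths below are illustrative only:
#
#   # sample_name <TAB> bam_path
#   TumorA	/data/project/TumorA-ready.bam
#   NormalA	/data/project/NormalA-ready.bam
#   /data/project/unnamed_sample.bam
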
def check_genome_resources(cnf):
    if cnf.genome is None:
        critical('Please specify the genome build (one of those available in ' + cnf.sys_cnf +
                 ') using the --genome option (e.g., --genome hg38).')

    if not cnf.genomes:
        critical('"genomes" section is not specified in system config ' + cnf.sys_cnf)

    info('Genome: ' + str(cnf.genome.name))

    for key in cnf.genome.keys():
        if key != 'name' and isinstance(cnf.genome[key], basestring):
            cnf.genome[key] = adjust_system_path(cnf.genome[key])

            if not verify_obj_by_path(cnf.genome[key], key, silent=True):
                # fall back to a gzipped version of the resource if one exists
                if not cnf.genome[key].endswith('.gz') and verify_file(cnf.genome[key] + '.gz', silent=True):
                    cnf.genome[key] = cnf.genome[key] + '.gz'

    if not cnf.genome.features or not cnf.genome.bed_annotation_features or not cnf.genome.cds:
        warn('Warning: "features", "bed_annotation_features" and "cds" must be specified '
             'in the system config (' + cnf.sys_cnf + ').')

    if not cnf.transcripts_fpath:
        cnf.transcripts_fpath = get_canonical_transcripts(cnf.genome.name, ensembl=True)

def print_genes(genes, output_fpath, canon_only):
    regions = []
    already_added_gene_features = set()
    transcripts = []
    for g in genes:
        for tr in g.transcripts:
            if not canon_only or tr.is_canonical:
                transcripts.append(tr)

    for tr in sorted(transcripts, key=lambda _tr: _tr.get_key()):
        to_add_gene = all(tr2.biotype == 'protein_coding' for tr2 in tr.gene.transcripts
                          if (tr2.is_canonical or not canon_only)) \
            and tr.gene not in already_added_gene_features \
            and (len(tr.gene.canonical_transcripts) == 1 or len(tr.gene.transcripts) == 1)
        if to_add_gene:
            # skip gene feature for all miRNA because there are multi-domain miRNA located in different
            # places with the same gene name
            regions.append(tr.gene)
            already_added_gene_features.add(tr.gene)
        if tr.exons:
            regions.append(tr)
            for e in tr.exons:
                regions.append(e)

    info('Writing ' + str(len(regions)) + ' regions')
    with open(adjust_path(output_fpath), 'w') as all_out:
        for r in regions:
            all_out.write(str(r))

def main():
    info(' '.join(sys.argv))
    info()

    cnf, bcbio_structure = bcbio_summary_script_proc_params(
        BCBioStructure.targqc_name,
        BCBioStructure.targqc_summary_dir,
        extra_opts=[
            (['--bed', '--capture', '--amplicons'],
             dict(dest='bed',
                  help='BED file to run targetSeq and Seq2C analysis on.')),
            (['--exons', '--exome', '--features'],
             dict(dest='features',
                  help='Annotated CDS/Exons/Gene/Transcript BED file to make targetSeq exon/amplicon regions reports.')),
        ])

    bed_fpath, features_bed_fpath = adjust_path(cnf.bed), adjust_path(cnf.features)

    summarize_targqc(cnf, cnf.threads or len(bcbio_structure.samples),
                     cnf.output_dir, bcbio_structure.samples,
                     bed_fpath=bed_fpath, features_fpath=features_bed_fpath)

def set_dirpath(self, dirpath, az_project_name):
    self.dirpath = dirpath
    self.az_project_name = az_project_name
    verify_dir(self.dirpath, is_critical=True)

    merged_dirpath = join(self.dirpath, 'merged')
    if verify_dir(merged_dirpath, silent=True):
        self.merged_dir_found = True
        self.fastq_dirpath = self.fastqc_dirpath = merged_dirpath
    else:
        self.merged_dir_found = False
        self.fastq_dirpath = join(self.dirpath, 'fastq')
        self.fastqc_dirpath = join(self.fastq_dirpath, 'FastQC')
    info()

    self.comb_fastqc_fpath = join(self.fastqc_dirpath, 'FastQC.html')
    self.downsample_targqc_report_fpath = None
    self.project_report_html_fpath = None

    self.downsample_metamapping_dirpath = join(self.dirpath, 'Downsample_MetaMapping')
    self.downsample_targqc_dirpath = join(self.dirpath, 'Downsample_TargQC')
    self.downsample_targqc_report_fpath = join(self.downsample_targqc_dirpath, 'targQC.html')
    self.project_report_html_fpath = join(self.dirpath, az_project_name + '.html')

def finalize_one(cnf, qc_report_fpath, qc_plots_fpaths):
    if qc_report_fpath:
        info('Saved QC report to ' + qc_report_fpath)
    if qc_plots_fpaths:
        info('Saved QC plots are in: ' + ', '.join(qc_plots_fpaths))
    elif not verify_module('matplotlib'):
        warn('Warning: QC plots were not generated because matplotlib is not installed.')

def main():
    cnf, samples, bed_fpath, output_dir = proc_args(sys.argv)

    info('Processing ' + str(len(samples)) + ' samples')

    if cnf.prep_bed is not False:
        if not bed_fpath:
            info('No input BED is specified, using CDS instead from ' + str(cnf.genome.cds))
            bed_fpath = verify_bed(cnf.genome.cds, 'CDS bed file for ' + cnf.genome.name)

        seq2c_bed_fname = basename(bed_fpath)

        bed_cols = count_bed_cols(bed_fpath)
        if bed_cols < 4:
            check_genome_resources(cnf)
            _, _, _, bed_fpath = prepare_beds(cnf, None, None, bed_fpath)

        try:
            copyfile(bed_fpath, join(output_dir, seq2c_bed_fname))
        except OSError:
            err(format_exc())
            info()
        else:
            info('Seq2C bed file is saved in ' + join(output_dir, seq2c_bed_fname))

    bed_fpath = verify_bed(bed_fpath, is_critical=True, description='Input BED file')
    info('Using target ' + bed_fpath)

    run_seq2c(cnf, output_dir, samples, bed_fpath, cnf.is_wgs)

def _convert_vcf(inp_f, out_f):
    # Note: relies on proc_rec_fun, input_fpath, args and kwargs from the enclosing scope.
    max_bunch_size = 100000
    written_records = 0
    bunch = []

    reader = vcf_parser.Reader(inp_f)
    writer = vcf_parser.Writer(out_f, reader)

    i = 0
    while True:
        rec = next(reader, None)
        if rec is None:
            break

        rec = proc_rec_fun(Record(rec, input_fpath, i), *args, **kwargs)
        if rec:
            bunch.append(rec)
            written_records += 1

        if len(bunch) >= max_bunch_size:
            writer.write_records(bunch)
            info('Written lines: ' + str(written_records))
            bunch = []

        i += 1

    writer.write_records(bunch)
    bunch = []
    info('Written lines: ' + str(written_records))

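# A minimal sketch of the wrapper a closure like _convert_vcf() typically lives
# in, assuming a PyVCF-style vcf_parser as above; iterate_vcf_sketch and its
# body are hypothetical, mirroring the names used in the closure:
def iterate_vcf_sketch(input_fpath, output_fpath, proc_rec_fun, *args, **kwargs):
    def _convert_vcf(inp_f, out_f):
        pass  # same reader/writer bunching logic as in _convert_vcf() above

    with open(input_fpath) as inp_f:
        with open(output_fpath, 'w') as out_f:
            _convert_vcf(inp_f, out_f)
    return output_fpath
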
def leave_main_sample(cnf, vcf_fpath, samplename):
    index = get_sample_column_index(vcf_fpath, samplename)
    if index is None:
        return vcf_fpath

    # def _f1(rec):
    #     rec.samples = [sample_name]
    #     return rec
    #
    # info('Keeping SAMPLE only for the first sample (' + samplename + ')')
    # vcf_fpath = iterate_vcf(cnf, vcf_fpath, _f1, suffix=sample_name)
    # out_fpath = extract_sample(cnf, vcf_fpath, sample_name)
    # info()

    def _f(line, i):
        if line and (line.startswith('#CHROM') or line[0] != '#'):
            ts = line.split('\t')
            return '\t'.join(ts[:9] + [ts[9 + index]])
        return line

    vcf_fpath = iterate_file(cnf, vcf_fpath, _f, suffix='1sm')
    if not verify_file(vcf_fpath):
        err('Error: leave_main_sample did not generate an output file.')
        return None
    return vcf_fpath

def get_chr_lengths_from_seq(seq_fpath):
    chr_lengths = []

    if seq_fpath.endswith('.fai'):
        seq_fpath = splitext(seq_fpath)[0]

    if verify_file(seq_fpath + '.fai', silent=True):
        info('Reading genome index file (.fai) to get chromosome lengths')
        with open(adjust_path(seq_fpath + '.fai'), 'r') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    chrom, length = line.split()[0], line.split()[1]
                    # int() keeps this branch consistent with the .fa branch below
                    chr_lengths.append((chrom, int(length)))
    elif verify_file(seq_fpath, silent=True):
        info('Reading genome sequence (.fa) to get chromosome lengths')
        with open(adjust_path(seq_fpath), 'r') as handle:
            from Bio import SeqIO
            reference_records = SeqIO.parse(handle, 'fasta')
            for record in reference_records:
                chrom = record.id
                chr_lengths.append((chrom, len(record.seq)))
    else:
        critical('Can\'t find ' + seq_fpath + ' or ' + seq_fpath + '.fai')

    return chr_lengths

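# For reference, get_chr_lengths_from_seq() only uses the first two columns of
# a samtools faidx index (NAME and LENGTH); the remaining OFFSET/LINEBASES/
# LINEWIDTH columns are ignored. A typical .fai line looks like:
#
#   chr1	248956422	112	70	71
#   chr2	242193529	252513167	70	71
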
def _get_gene_transcripts_id(cnf):
    genes_dict = dict()
    transcripts_dict = dict()

    if not cnf.genome.all_transcripts:
        critical('File with transcript and gene IDs for ' + cnf.genome.name +
                 ' was not found! Heatmaps cannot be created.')
    if not verify_file(cnf.genome.all_transcripts):
        critical('File with transcript and gene IDs for ' + cnf.genome.name + ' at ' +
                 cnf.genome.all_transcripts + ' was not found! Heatmaps cannot be created.')

    info('Getting transcript IDs and gene IDs from ' + cnf.genome.all_transcripts)

    with open_gzipsafe(cnf.genome.all_transcripts) as f:
        for i, l in enumerate(f):
            if l.startswith('#'):
                continue
            chrom, _, feature, start, end, _, strand, _, props_line = l.replace('\n', '').split('\t')
            if feature != 'transcript':
                continue
            try:
                _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                                  for t in props_line.split(';') if t.strip())
            except ValueError:
                sys.stderr.write(format_exc())
                sys.stderr.write(l)
                continue  # skip malformed attribute lines instead of reusing a stale _prop_dict

            gene_symbol = _rm_quotes(_prop_dict['gene_name'])
            gene_id = _rm_quotes(_prop_dict['gene_id'])
            transcript_id = _rm_quotes(_prop_dict['transcript_id'])
            # gene = Gene(gene_symbol, chrom=chrom, gene_id=gene_id, transcript_id=transcript_id)
            genes_dict[gene_id] = gene_symbol
            transcripts_dict[transcript_id] = gene_symbol

    return genes_dict, transcripts_dict

def print_sample_tracks_info(sample_name, project_name, bam_link, bigwig_link, vcf_link, jbrowse_tracks_fpath):
    with open(jbrowse_tracks_fpath, 'a') as tracks:
        print >> tracks, '\n[ tracks.{sample_name} ]\n' \
                         '\nstoreClass = JBrowse/Store/SeqFeature/BAM' \
                         '\nurlTemplate = {bam_link}' \
                         '\nbaiUrlTemplate = {bam_link}.bai' \
                         '\nchunkSizeLimit = 100000000' \
                         '\nmaxHeight = 10000' \
                         '\ncategory = {project_name}' \
                         '\ntype = JBrowse/View/Track/Alignments2' \
                         '\nkey = {sample_name}\n'.format(**locals())
        print >> tracks, '\n[ tracks.{sample_name}_cov ]\n' \
                         '\nstoreClass = JBrowse/Store/SeqFeature/BAM' \
                         '\nurlTemplate = {bam_link}' \
                         '\nbaiUrlTemplate = {bam_link}.bai' \
                         '\nchunkSizeLimit = 100000000' \
                         '\ncategory = {project_name}' \
                         '\ntype = SNPCoverage' \
                         '\nkey = {sample_name}_coverage_bam\n'.format(**locals())
        print >> tracks, '\n[ tracks.{sample_name}_bigwig ]\n' \
                         '\nstoreClass = JBrowse/Store/SeqFeature/BigWig' \
                         '\nurlTemplate = {bigwig_link}' \
                         '\ncategory = {project_name}' \
                         '\ntype = JBrowse/View/Track/Wiggle/XYPlot' \
                         '\nautoscale = local' \
                         '\nkey = {sample_name}_coverage\n'.format(**locals())
        if vcf_link:
            print >> tracks, '\n[ tracks.{sample_name}_vcf ]\n' \
                             '\nstoreClass = JBrowse/Store/SeqFeature/VCFTabix' \
                             '\nurlTemplate = {vcf_link}' \
                             '\ncategory = {project_name}' \
                             '\ntype = JBrowse/View/Track/CanvasVariants' \
                             '\nkey = {sample_name}_variants\n'.format(**locals())
    info(sample_name + ' was successfully exported to jBrowse!')

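# For one sample this appends INI-style stanzas to the JBrowse tracks.conf.
# A representative rendering of the first stanza (sample/project/link values
# are hypothetical):
#
#   [ tracks.TumorA ]
#   storeClass = JBrowse/Store/SeqFeature/BAM
#   urlTemplate = http://jbrowse.example.com/data/TumorA.bam
#   baiUrlTemplate = http://jbrowse.example.com/data/TumorA.bam.bai
#   chunkSizeLimit = 100000000
#   maxHeight = 10000
#   category = Project_X
#   type = JBrowse/View/Track/Alignments2
#   key = TumorA
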
def join_vcf2txt_results(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath):
    info('WGS; running vcf2txt separately for each sample to save memory.')
    vcf2txt_outputs_by_vcf_fpath = OrderedDict()
    for vcf_fpath in vcf_fpath_by_sample.values():
        sample_output_fpath = add_suffix(vcf2txt_out_fpath, splitext(basename(vcf_fpath))[0])
        vcf2txt_outputs_by_vcf_fpath[vcf_fpath] = sample_output_fpath
        info()

    info('Joining vcf2txt outputs... (' + str(len(vcf2txt_outputs_by_vcf_fpath)) +
         ' out of ' + str(len(vcf_fpath_by_sample)) + ' successful), ' +
         'writing to ' + vcf2txt_out_fpath)
    with file_transaction(cnf.work_dir, vcf2txt_out_fpath) as tx:
        with open(tx, 'w') as out:
            for i, (vcf_fpath, sample_output_fpath) in enumerate(vcf2txt_outputs_by_vcf_fpath.items()):
                info('  Reading ' + sample_output_fpath)
                with open(sample_output_fpath) as inp:
                    for j, l in enumerate(inp):
                        if j == 0 and i != 0:
                            continue  # skip the header line in every file after the first
                        out.write(l)

    if verify_file(vcf2txt_out_fpath):
        info('Saved ' + vcf2txt_out_fpath)
        return vcf2txt_out_fpath
    else:
        return None

def __final_seq2c_scripts(cnf, read_stats_fpath, combined_gene_depths_fpath, output_fpath):
    cov2lr = get_script_cmdline(cnf, 'perl', join('Seq2C', 'cov2lr.pl'), is_critical=True)
    cov2lr_output = join(cnf.work_dir, splitext(basename(output_fpath))[0] + '.cov2lr.tsv')

    controls = ''
    lr2gene_opt = ''
    if cnf.controls:
        controls = '-c ' + cnf.controls  # ':'.join([adjust_path(fpath) for fpath in cnf.controls.split(':')])
        lr2gene_opt = '-c'

    cmdline = '{cov2lr} -a {controls} {read_stats_fpath} {combined_gene_depths_fpath}'.format(**locals())
    call(cnf, cmdline, cov2lr_output, exit_on_error=False)
    info()

    if not verify_file(cov2lr_output):
        return None

    seq2c_opts = cnf.seq2c_opts or ''
    lr2gene = get_script_cmdline(cnf, 'perl', join('Seq2C', 'lr2gene.pl'), is_critical=True)
    cmdline = '{lr2gene} {lr2gene_opt} {seq2c_opts} {cov2lr_output}'.format(**locals())
    res = call(cnf, cmdline, output_fpath, exit_on_error=False)
    info()

    if not verify_file(output_fpath):
        return None
    return res

def __call(cnf, cmdline, output_fpath=None):
    stdout = open(output_fpath, 'w') if output_fpath else None
    stderr = None if cnf.debug else open('/dev/null', 'w')
    if cnf.debug:
        info(cmdline)
    ret_code = subprocess.call(cmdline, shell=True, stdout=stdout, stderr=stderr, stdin=None)
    return ret_code

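# A slightly safer variant of __call(), as a sketch: os.devnull instead of the
# hard-coded /dev/null (portable), and the file handles are closed after the
# call rather than leaked. __call_safe is a hypothetical name.
def __call_safe(cnf, cmdline, output_fpath=None):
    stdout = open(output_fpath, 'w') if output_fpath else None
    stderr = None if cnf.debug else open(os.devnull, 'w')
    if cnf.debug:
        info(cmdline)
    try:
        return subprocess.call(cmdline, shell=True, stdout=stdout, stderr=stderr)
    finally:
        for handle in (stdout, stderr):
            if handle:
                handle.close()
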
def set_up_log(cnf, proc_name=None, project_name=None, project_fpath=None, output_dir=None):
    logger.proc_name = proc_name
    logger.project_name = project_name
    logger.project_fpath = project_fpath or output_dir
    logger.cnf_address = remove_quotes(cnf.email) if cnf.email else ''
    logger.smtp_host = cnf.smtp_host

    if cnf.log_dir:
        log_fname = (proc_name + '_' if proc_name else '') + (cnf.sample + '_' if cnf.sample else '') + 'log.txt'
        log_fpath = join(cnf.log_dir, log_fname)

        if file_exists(log_fpath):
            # rotate the existing log aside, stamped with its mtime
            timestamp = datetime.datetime.fromtimestamp(os.stat(log_fpath).st_mtime)
            mv_log_fpath = log_fpath + '.' + timestamp.strftime("%Y-%m-%d_%H-%M-%S")
            try:
                if isfile(mv_log_fpath):
                    os.remove(mv_log_fpath)
                if not isfile(mv_log_fpath):
                    os.rename(log_fpath, mv_log_fpath)
            except OSError:
                pass

        info('log_fpath: ' + log_fpath)
        info()
        logger.log_fpath = cnf.log = log_fpath

def _submit_job(cnf, step, sample_name='', wait_for_steps=None, threads=1, is_critical=True, **kwargs):
    tool_cmdline = get_system_path(cnf, step.interpreter, step.script, is_critical=is_critical)
    if not tool_cmdline:
        return False

    kwargs['sample_name'] = sample_name
    cmdline = tool_cmdline + ' ' + step.param_line.format(**kwargs)

    info(step.name)
    job = submit_job(cnf, cmdline, job_name=step.job_name(sample_name),
                     wait_for_steps=wait_for_steps, threads=threads)
    info()
    return job

def _symlink_vcfs(callers, datestamp_var_dirpath):
    errory = []
    for caller in callers:
        info(caller.name)
        for sample in caller.samples:
            info(sample.name)

            filt_vcf_fpath = sample.find_filt_vcf_by_callername(caller.name)
            if not verify_file(filt_vcf_fpath):
                errory.append([sample.name, caller.name, filt_vcf_fpath])
            else:
                base_filt_fpath = filt_vcf_fpath[:-3] if filt_vcf_fpath.endswith('.gz') else filt_vcf_fpath
                for fpath in [base_filt_fpath + '.gz',
                              base_filt_fpath + '.idx',
                              base_filt_fpath + '.gz.tbi']:
                    if verify_file(fpath, silent=True):
                        _symlink_to_dir(fpath, sample.dirpath)
                        # _symlink_to_dir(fpath, datestamp_var_dirpath)
                        BCBioStructure.move_vcfs_to_var(sample)
    return errory

def annotate_target(cnf, target_bed):
    output_fpath = intermediate_fname(cnf, target_bed, 'ann')
    if not cnf.genome.bed_annotation_features:
        return output_fpath

    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    features_bed = verify_bed(cnf.genome.bed_annotation_features, is_critical=True,
                              description='bed_annotation_features in system config')

    # annotate_bed_py = get_system_path(cnf, 'python', join('tools', 'bed_processing', 'annotate_bed.py'))
    # bedtools = get_system_path(cnf, 'bedtools')

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical('Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} -g {cnf.genome.name} ' \
              '-o {output_fpath} --canonical'.format(**locals())
    # cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} --reference {features_bed} ' \
    #           '--genome {cnf.genome.name} --sys-cnf {cnf.sys_cnf} --run-cnf {cnf.run_cnf} ' \
    #           '-o {output_fpath}'.format(**locals())
    call(cnf, cmdline, output_fpath, stdout_to_outputfile=False)

    output_fpath = remove_comments(cnf, output_fpath)
    return output_fpath

def launch_bedcoverage_hist(work_dir, bed, bam, chr_lengths_fpath, bedcov_output_fpath=None, bedtools='bedtools'):
    if not bedcov_output_fpath:
        bedcov_output_fpath = join(work_dir,
                                   splitext_plus(basename(bed))[0] + '__' +
                                   splitext_plus(basename(bam))[0] + '_bedcov_output.txt')

    if bam.endswith('.bam'):
        bam = bam_to_bed_nocnf(bam, bedtools)
        verify_file(bam, is_critical=True, description='BAM to BED conversion result')

    v = bedtools_version(bedtools)
    if v and v >= 24:
        cmdline = '{bedtools} coverage -sorted -g {chr_lengths_fpath} -a {bed} -b {bam} -hist'.format(**locals())
    else:
        cmdline = '{bedtools} coverage -a {bam} -b {bed} -hist'.format(**locals())
    cmdline += ' > ' + bedcov_output_fpath
    info(cmdline)
    os.system(cmdline)

    res = verify_file(bedcov_output_fpath)
    if res:
        info('Done, saved to ' + bedcov_output_fpath)
    else:
        err('Error, result is non-existent or empty')

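# For reference, `bedtools coverage -hist` appends four histogram columns to
# each target interval (depth, bases covered at that depth, interval length,
# fraction of the interval at that depth), followed by whole-target summary
# rows keyed "all". A sketch of the output for a 3-column BED:
#
#   chr1	100	200	0	10	100	0.1000000
#   chr1	100	200	5	90	100	0.9000000
#   all	0	1500	100000	0.0150000
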
def parse_variants(fpath):
    sample_column_name = 'Sample'
    gene_column_name = 'Gene'

    genes_per_sample = dict()
    with open(fpath) as f:
        # strip the trailing newline, otherwise the last column name never matches
        header = f.readline().strip().split('\t')
        if sample_column_name not in header:
            warn('"' + sample_column_name + '" is not found in ' + fpath + ' header, skipping this file!')
            return genes_per_sample
        else:
            sample_column_id = header.index(sample_column_name)
        if gene_column_name not in header:
            warn('"' + gene_column_name + '" is not found in ' + fpath + ' header, skipping this file!')
            return genes_per_sample
        else:
            gene_column_id = header.index(gene_column_name)

        for line in f:
            line = line.strip().split('\t')
            sample = line[sample_column_id]
            gene = line[gene_column_id]
            if sample not in genes_per_sample:
                genes_per_sample[sample] = set()
            genes_per_sample[sample].add(gene)

    info('Found info for %d samples:' % len(genes_per_sample))
    for k, v in genes_per_sample.items():
        info('\t%s (%d unique genes)' % (k, len(v)))

    return genes_per_sample

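# parse_variants() expects a tab-separated variants table whose header contains
# at least 'Sample' and 'Gene' columns; all other columns are ignored. An
# illustrative input:
#
#   Sample	Chr	Start	Gene
#   TumorA	chr7	55242465	EGFR
#   TumorA	chr17	7577120	TP53
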
def __init__(self, dirpath, az_prjname_by_subprj, samplesheet=None):
    info('Parsing the NextSeq500 project structure')
    self.kind = 'nextseq500'
    DatasetStructure.__init__(self, dirpath, az_prjname_by_subprj, samplesheet=samplesheet)
    info('az_prjname_by_subprj: ' + str(az_prjname_by_subprj))

    verify_dir(self.unaligned_dirpath, is_critical=True)

    for pname, project in self.project_by_name.items():
        az_proj_name = az_prjname_by_subprj.get(pname) \
            if not isinstance(az_prjname_by_subprj, basestring) else az_prjname_by_subprj
        if az_proj_name is None:
            if len(self.project_by_name) > 1:
                warn('Warn: cannot match subproject ' + pname + ' to a project name and JIRA case. '
                     'Please follow the SOP for a multiple-project run: '
                     'http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting')
                continue
            az_proj_name = az_prjname_by_subprj.values()[0]

        project.set_dirpath(self.unaligned_dirpath, az_proj_name)
        for sample in project.sample_by_name.values():
            sample.source_fastq_dirpath = project.dirpath
            sample.set_up_out_dirs(project.fastq_dirpath, project.fastqc_dirpath,
                                   project.downsample_targqc_dirpath)

    self.basecall_stat_html_reports = self.__get_basecall_stats_reports()

    self.get_fastq_regexp_fn = get_nextseq500_regexp

def _proc_file(inp_f, out_f, ctx=None):
    # Note: relies on proc_line_fun from the enclosing scope.
    max_bunch_size = 1000 * 1000
    written_lines = 0
    bunch = []

    for i, line in enumerate(inp_f):
        clean_line = line.replace('\n', '')
        if clean_line:
            if ctx:
                new_l = proc_line_fun(clean_line, i, ctx)
            else:
                new_l = proc_line_fun(clean_line, i)
            if new_l is not None:
                bunch.append(new_l + '\n')
                written_lines += 1
        else:
            bunch.append(line)
            written_lines += 1

        if len(bunch) >= max_bunch_size:
            out_f.writelines(bunch)
            info('Written lines: ' + str(written_lines))
            bunch = []

    out_f.writelines(bunch)
    info('Written lines: ' + str(written_lines))

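# A minimal usage sketch for the line-batching pattern above: a proc_line_fun
# returns None to drop a line, or the (possibly modified) line to keep it.
# _uppercase and the file names are hypothetical.
def _uppercase(line, i):
    return line.upper()

# with open('in.txt') as inp_f:
#     with open('out.txt', 'w') as out_f:
#         _proc_file(inp_f, out_f)   # with proc_line_fun=_uppercase in scope
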
def write_to_sqlite(work_dir, jira_case, project_list_fpath, country_id, project_name,
                    samples_num=None, analysis_dirpath=None, html_report_url=None):
    info('Reading project list ' + project_list_fpath)
    conn = sqlite3.connect(project_list_fpath)
    c = conn.cursor()

    pid = project_name

    d = dict()
    if analysis_dirpath:
        d['Analyses_directory_' + (country_id if not is_local() else 'US')] = analysis_dirpath
    if project_name and (analysis_dirpath or not __unquote(d.get('Name'))):
        # update only if running after bcbio, or if there is no value there at all
        d['Name'] = project_name
    if html_report_url and (analysis_dirpath or not __unquote(d.get('HTML_report_path'))):
        # update only if running after bcbio, or if there is no value there at all
        d['HTML_report_path'] = html_report_url

    if jira_case:
        d['JIRA_URL'] = jira_case.url
        # if 'Updated By' in d and __unquote(d['Updated By']):
        d['Updated_By'] = getpass.getuser()
        if jira_case.description:
            d['Description'] = jira_case.summary
        if jira_case.data_hub:
            d['Data_Hub'] = jira_case.data_hub
        if jira_case.type:
            d['Type'] = jira_case.type
        if jira_case.department:
            d['Department'] = jira_case.department
        if jira_case.division:
            d['Division'] = jira_case.division
        if jira_case.assignee:
            d['Assignee'] = jira_case.assignee
        if jira_case.reporter:
            d['Reporter'] = jira_case.reporter

    if samples_num:
        d['Sample_Number'] = str(samples_num)

    d['Datestamp'] = timestamp()

    # NB: SQLite does not support IF EXISTS ... UPDATE ... ELSE INSERT; this
    # T-SQL-style upsert is a placeholder (column lists elided) and will fail
    # as written — see the SQLite sketch below for a working alternative.
    cmdl = '''
        IF EXISTS (SELECT * FROM project WHERE PID="{pid}" AND Name="{project_name}")
            UPDATE project SET (...) WHERE PID="{pid}" AND Name="{project_name}"
        ELSE
            INSERT INTO project VALUES (...)
    '''.format(pid=pid, project_name=project_name)
    print cmdl
    c.execute(cmdl)

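# A minimal sketch of how the placeholder upsert above could be expressed in
# valid SQLite, assuming (PID, Name) is unique in the project table; the column
# names mirror the keys of the dict built above, and _upsert_project_sketch is
# a hypothetical helper:
def _upsert_project_sketch(c, pid, project_name, d):
    c.execute('INSERT OR IGNORE INTO project (PID, Name) VALUES (?, ?)', (pid, project_name))
    for column, value in d.items():
        # column names cannot be bound as parameters, so they are interpolated;
        # they come from the fixed set of keys assembled above, not user input
        c.execute('UPDATE project SET {0} = ? WHERE PID = ? AND Name = ?'.format(column),
                  (value, pid, project_name))
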
def proc_args(argv):
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls',
                      help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        parser.print_usage()
        sys.exit(1)

    if len(args) == 1 and not args[0].endswith('.bam'):
        sample_names, bam_fpaths = read_samples(
            verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict()
        for s, b in zip(sample_names, bam_fpaths):
            bam_by_sample[s] = b
    else:
        bam_by_sample = find_bams(args)

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
        for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided in the sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir

def _tracks(cnf, track_fpath, input_fpath):
    if not verify_file(track_fpath):
        return None

    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found; you probably need to specify the path in the system config, or '
            'load bcbio: . /group/ngs/bin/bcbio-prod.sh')
        return None

    # self.all_fields.append(field_name)

    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(**locals())

    assert input_fpath
    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True, overwrite=True)
    if not verify_vcf(output_fpath):
        err('Error: annotating with track ' + track_fpath + ' resulted in ' + str(output_fpath))
        return output_fpath

    # Set TRUE or FALSE for tracks
    def proc_line(line, i):
        if field_name in line:
            if not line.startswith('#'):
                fields = line.split('\t')
                info_line = fields[7]
                info_pairs = [attr.split('=') for attr in info_line.split(';')]
                info_pairs = [[pair[0], ('TRUE' if pair[1] else 'FALSE')]
                              if pair[0] == field_name and len(pair) > 1
                              else pair
                              for pair in info_pairs]
                info_line = ';'.join('='.join(pair) if len(pair) == 2 else pair[0]
                                     for pair in info_pairs)
                fields = fields[:7] + [info_line] + fields[8:]
                return '\t'.join(fields)
        return line

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')
    return verify_vcf(output_fpath, is_critical=True)

def __init(self):
    logger.info("read/write file: " + self.__file_path + ", option: " + self.__option)
    try:
        f = open(self.__file_path, self.__option, encoding='utf-8')
        self.__file = f
    except (FileNotFoundError, IOError, ValueError) as error:
        self.__is_exists_file = False
        # str(): an exception object cannot be concatenated to a str directly
        logger.error(self.__file_path + " error: " + str(error))

def add_project_to_exac(cnf):
    info('Adding project to the ExAC database')
    exac_venv_pythonpath = join(exac_venv_dir, 'bin', 'python')
    if is_local():
        exac_venv_pythonpath = 'python'
    cmdline = (exac_venv_pythonpath + ' ' + join(exac_code_dir, 'manage.py') +
               ' add_project ' + cnf.project_name + ' ' + cnf.genome.name)
    call(cnf, cmdline)

def get_padded_bed_file(cnf, bed, genome, padding):
    info('Making bed file for padded regions...')
    bedtools = get_system_path(cnf, 'bedtools')
    cmdline = '{bedtools} slop -i {bed} -g {genome} -b {padding}'.format(**locals())
    output_fpath = intermediate_fname(cnf, bed, 'padded')
    call(cnf, cmdline, output_fpath)
    return output_fpath

def _split_reference_by_priority(cnf, features_bed_fpath):
    features = ['CDS', 'Exon', 'Transcript', 'Gene']
    info('Splitting the reference file into ' + ', '.join(features))
    features_and_beds = []
    for f in features:
        # bind f via a default argument: BedTool.filter is lazy, so a plain
        # `lambda x: x[6] == f` would evaluate f only after the loop finished
        # and every filter would see the last feature name
        features_and_beds.append((f, BedTool(features_bed_fpath).filter(lambda x, ftr=f: x[6] == ftr)))
    return features_and_beds

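# The late-binding pitfall that the default argument above avoids, in isolation:
#
#   funcs = [lambda: f for f in ['a', 'b']]
#   [fn() for fn in funcs]        # -> ['b', 'b']  (all closures share the final f)
#
#   funcs = [lambda f=f: f for f in ['a', 'b']]
#   [fn() for fn in funcs]        # -> ['a', 'b']  (each default captures its own f)
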
def _make_targetcov_symlinks(samples):
    for sample in samples:
        new_link = join(dirname(dirname(sample.targetcov_detailed_txt)),
                        basename(sample.targetcov_detailed_txt))
        if exists(new_link):
            os.unlink(new_link)
        symlink_plus(sample.targetcov_detailed_txt, new_link)
        info('TargetCov TXT symlink saved to ' + new_link)

def _preprocess(cnf, bed_fpath, work_dirpath, chrom_order):
    bed_params = BedParams()
    output_fpath = __intermediate_fname(work_dirpath, bed_fpath, 'prep')

    info('preprocessing: ' + bed_fpath + ' --> ' + output_fpath)
    with open(bed_fpath, 'r') as in_f:
        with open(output_fpath, 'w') as out_f:
            for line in in_f:
                if line.startswith('#') or line.startswith('track') or line.startswith('browser'):  # header
                    bed_params.header.append(line if line.startswith('#') else '#' + line)
                else:
                    cur_ncn = BedParams.calc_n_cols_needed(line)
                    if bed_params.n_cols_needed is not None and cur_ncn != bed_params.n_cols_needed:
                        critical('number and type of columns should be the same on all lines!')
                    bed_params.n_cols_needed = cur_ncn

                    if line.startswith('chr'):
                        if bed_params.GRCh_names is not None and bed_params.GRCh_names:
                            critical('mixing of GRCh and hg chromosome names!')
                        bed_params.GRCh_names = False
                        if line.startswith('chrMT'):  # common misprint, correcting chrMT --> chrM
                            processed_line = '\t'.join(['chrM'] + line.split('\t')[1:])
                        else:
                            processed_line = line
                    elif line.split('\t')[0] in BedParams.GRCh_to_hg:  # GRCh chr names
                        if bed_params.GRCh_names is not None and not bed_params.GRCh_names:
                            critical('mixing of GRCh and hg chromosome names!')
                        bed_params.GRCh_names = True
                        processed_line = '\t'.join([BedParams.GRCh_to_hg[line.split('\t')[0]]] + line.split('\t')[1:])
                    else:
                        critical('incorrect chromosome name!')

                    entries = processed_line.strip().split('\t')
                    chrom = entries[0]
                    start = int(entries[1])
                    end = int(entries[2])
                    r = Region(chrom, chrom_order.get(chrom), start, end)
                    if r.is_control():
                        r.set_symbol(entries[3] if len(entries) > 3 else '{0}:{1}-{2}'.format(chrom, start, end))
                        r.rest = entries[4:] if len(entries) > 4 else None
                        bed_params.controls.append(r)
                    else:
                        out_f.write(processed_line)

    return output_fpath, bed_params

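# The two chromosome-naming conventions _preprocess() distinguishes, side by side:
#
#   hg-style (UCSC):       chr1  chr2 ... chr22  chrX  chrY  chrM
#   GRCh-style (Ensembl):     1     2 ...    22     X     Y    MT
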
def _annotate(bed_fpath, work_dirpath, cnf):
    annotated_files = []
    input_fpath = bed_fpath
    references = [('RefSeq', cnf.genome.features), ('Ensembl', cnf.genome.ensembl)]

    for id, (db_name, db_bed_fpath) in enumerate(references):
        output_fpath = __intermediate_fname(work_dirpath, bed_fpath, 'ann_' + db_name.lower())
        info('annotating based on {db_name}: {bed_fpath} --> {output_fpath}'.format(**locals()))
        annotate_bed_py = sys.executable + ' ' + splitext(annotate_bed.__file__)[0] + '.py'
        cmdline = '{annotate_bed_py} {input_fpath} --reference {db_bed_fpath} ' \
                  '-o {output_fpath} --genome {cnf.genome}'.format(**locals())
        __call(cnf, cmdline)

        if id < len(references) - 1:
            if cnf.debug:
                info("filtering annotated and not annotated regions into separate files:")
            only_annotated_bed = __intermediate_fname(work_dirpath, bed_fpath, 'only_ann_' + db_name.lower())
            not_annotated_bed = __intermediate_fname(work_dirpath, bed_fpath, 'not_ann_' + db_name.lower())
            with open(only_annotated_bed, 'w') as out:
                cmdline = 'grep -v -E "\.$" {output_fpath}'.format(**locals())
                if cnf.debug:
                    info(cmdline + ' > ' + only_annotated_bed)
                subprocess.call(cmdline, shell=True, stdout=out)
            with open(not_annotated_bed, 'w') as out:
                cmdline = 'grep -E "\.$" {output_fpath}'.format(**locals())
                if cnf.debug:
                    info(cmdline + ' > ' + not_annotated_bed)
                subprocess.call(cmdline, shell=True, stdout=out)
            if not cnf.debug:
                os.remove(output_fpath)
            output_fpath = only_annotated_bed
            input_fpath = not_annotated_bed

        annotated_files.append(output_fpath)
        if id != 0 and not cnf.debug:
            os.remove(input_fpath)

    return annotated_files

def _postprocess(input_fpath, annotated_fpaths, bed_params, output_bed_fpath, cnf, chrom_order):
    '''
    1. Sorts.
    2. Chooses the appropriate number of columns (4, or 8 for BEDs with primers).
    3. Removes duplicates.
    '''
    info('postprocessing (sorting, cutting, removing duplicates)')

    key_genes = []
    with open(adjust_path(cnf.key_genes), 'r') as f:
        for line in f:
            key_genes.append(line.strip())
    approved_genes = []
    if cnf.hgnc:
        with open(adjust_path(cnf.hgnc), 'r') as f:
            f.readline()  # header
            for line in f:
                approved_genes.append(line.split('\t')[0])

    Region.GRCh_names = bed_params.GRCh_names
    if cnf.output_grch:
        Region.GRCh_names = True
        if cnf.debug and not bed_params.GRCh_names:
            info('Changing chromosome names from hg-style to GRCh-style.')
    if cnf.output_hg:
        Region.GRCh_names = False
        if cnf.debug and bed_params.GRCh_names:
            info('Changing chromosome names from GRCh-style to hg-style.')
    Region.n_cols_needed = bed_params.n_cols_needed
    Region.key_genes = key_genes
    Region.approved_genes = approved_genes

    input_regions = set()  # we want only unique regions
    with open(adjust_path(input_fpath)) as f:
        for line in f:
            entries = line.strip().split('\t')
            chrom = entries[0]
            start = int(entries[1])
            end = int(entries[2])
            r = Region(chrom, chrom_order.get(chrom), start, end)
            r.set_symbol(entries[3] if len(entries) > 3 else '{0}:{1}-{2}'.format(chrom, start, end))
            r.rest = entries[4:] if len(entries) > 4 else None
            input_regions.add(r)

    annotated_regions = []
    for annotated_fpath in annotated_fpaths:
        with open(adjust_path(annotated_fpath)) as f:
            for line in f:
                entries = line.strip().split('\t')
                chrom = entries[0]
                start = int(entries[1])
                end = int(entries[2])
                r = Region(chrom, chrom_order.get(chrom), start, end)
                r.set_symbol(entries[3] if len(entries) > 3 else '{0}:{1}-{2}'.format(chrom, start, end))
                r.rest = entries[4:] if len(entries) > 4 else None
                annotated_regions.append(r)

    # starting to output result
    with open(adjust_path(output_bed_fpath), 'w') as f:
        for line in bed_params.header:
            f.write(line)

        annotated_regions.sort()
        i = 0
        prev_region = None
        not_a_gene_count = 0
        solid_regions = []
        prev_is_solid = False
        all_regions = []
        for cur_region in sorted(list(input_regions) + bed_params.controls):
            if not cur_region.is_control():
                assert annotated_regions[i] == cur_region, \
                    str(cur_region) + ' != ' + str(annotated_regions[i]) + ' (i=%d)' % i
                if annotated_regions[i].symbol != '.':
                    cur_region.set_symbol(annotated_regions[i].symbol)
                else:
                    if prev_region is None or prev_region.chrom != cur_region.chrom \
                            or not prev_region.symbol.startswith("not_a_gene"):
                        not_a_gene_count += 1
                    cur_region.set_symbol("not_a_gene_%d" % not_a_gene_count)
                i += 1

                ambiguous_regions = [cur_region]
                while i < len(annotated_regions) and annotated_regions[i] == cur_region:  # processing duplicates
                    if annotated_regions[i].symbol != '.' and annotated_regions[i].symbol != cur_region.symbol:
                        duplicate = copy.deepcopy(cur_region)
                        duplicate.set_symbol(annotated_regions[i].symbol)
                        if duplicate.type == 'approved' and cur_region.type == 'not_approved':
                            cur_region = duplicate
                            ambiguous_regions = [cur_region]
                        elif annotated_regions[i].type == 'key' and cur_region.type != 'key':
                            cur_region = duplicate
                            ambiguous_regions = [cur_region]
                            if cnf.debug:
                                info('key gene priority over approved gene was used')
                        elif annotated_regions[i].type == cur_region.type:
                            ambiguous_regions.append(duplicate)
                    i += 1

                if len(ambiguous_regions) == 1:
                    if not prev_is_solid:
                        solid_regions.append(cur_region)
                    prev_is_solid = True
                    all_regions.append(cur_region)
                else:
                    if prev_is_solid:
                        solid_regions.append(prev_region)
                    prev_is_solid = False
                    all_regions.append(ambiguous_regions)
            else:
                all_regions.append(cur_region)
            prev_region = cur_region

        # outputting results
        cur_solid_id = -1
        for entry in all_regions:
            if isinstance(entry, list):  # list of ambiguous regions
                cur_region = entry[0]
                while cur_solid_id + 1 < len(solid_regions) and cur_region > solid_regions[cur_solid_id + 1]:
                    cur_solid_id += 1
                found = False
                if cur_solid_id >= 0 and cur_region > solid_regions[cur_solid_id] \
                        and cur_region.chrom == solid_regions[cur_solid_id].chrom:
                    prev_solid = solid_regions[cur_solid_id]
                    for cur_region in entry:
                        if cur_region.symbol == prev_solid.symbol:
                            found = True
                            if cnf.debug:
                                info('gene name was chosen based on previous solid region')
                            break
                if not found and cur_solid_id + 1 < len(solid_regions) and cur_region < solid_regions[cur_solid_id + 1] \
                        and cur_region.chrom == solid_regions[cur_solid_id + 1].chrom:
                    next_solid = solid_regions[cur_solid_id + 1]
                    for cur_region in entry:
                        if cur_region.symbol == next_solid.symbol:
                            found = True
                            if cnf.debug:
                                info('gene name was chosen based on next solid region')
                            break
                if not found:
                    cur_region = entry[0]
            else:
                cur_region = entry
            f.write(str(cur_region) + '\n')  # automatically outputs correct number of columns and GRCh/hg names

def _read_args(args_list):
    options = [
        # (['-k', '--key-genes'], dict(
        #     dest='key_genes_fpath',
        #     help='list of key genes (they are at top priority when choosing one of multiple annotations)',
        #     default='/ngs/reference_data/genomes/Hsapiens/common/az_key_genes.300.txt')
        #  ),
        # (['-a', '--approved-genes'], dict(
        #     dest='approved_genes_fpath',
        #     help='list of HGNC approved genes (they are preferable when choosing one of multiple annotations)',
        #     default='/ngs/reference_data/genomes/Hsapiens/common/HGNC_gene_synonyms.txt')
        #  ),
        # (['-e', '--ensembl-bed'], dict(
        #     dest='ensembl_bed_fpath',
        #     help='reference BED file for annotation (Ensembl)')
        #  ),
        # (['-r', '--refseq-bed'], dict(
        #     dest='refseq_bed_fpath',
        #     help='reference BED file for annotation (RefSeq)')
        #  ),
        # (['-b', '--bedtools'], dict(
        #     dest='bedtools',
        #     help='path to bedtools',
        #     default='bedtools')
        #  ),
        (['-o', '--output-bed'], dict(
            dest='output_fpath')
         ),
        (['--debug'], dict(
            dest='debug',
            help='run in debug mode (verbose output, keep temporary files)',
            default=False,
            action='store_true')
         ),
        (['--output-hg'], dict(
            dest='output_hg',
            help='output chromosome names in hg-style (chrM, chr1, .., chr22, chrX, chrY)',
            default=False,
            action='store_true')
         ),
        (['--output-grch'], dict(
            dest='output_grch',
            help='output chromosome names in GRCh-style (1, .., 22, X, Y, MT)',
            default=False,
            action='store_true')
         ),
        (['-g', '--genome'], dict(
            dest='genome',
            default='hg19')
         ),
    ]

    parser = OptionParser(usage='usage: %prog [options] Input_BED_file -o Standardized_BED_file',
                          description='Script outputs a standardized version of the input BED file. '
                                      'Standardized BED: 1) has 4 or 8 fields (for BEDs with primer info); '
                                      '2) has an HGNC approved symbol in the fourth column if annotation is '
                                      'possible, and not_a_gene_X otherwise; '
                                      '3) is sorted based on chromosome name -> start -> end; '
                                      '4) has no duplicated regions (regions with the same chromosome, start and end); '
                                      'the only exception is _CONTROL_ regions.')
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    (opts, args) = parser.parse_args(args_list)

    if len(args) != 1:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    work_dirpath = tempfile.mkdtemp()
    info('Creating a temporary working directory ' + work_dirpath)
    if not exists(work_dirpath):
        os.mkdir(work_dirpath)

    input_bed_fpath = abspath(args[0])
    info('Input: ' + input_bed_fpath)

    output_bed_fpath = adjust_path(cnf.output_fpath)
    info('Writing to: ' + output_bed_fpath)

    # process configuration
    # for k, v in opts.__dict__.items():
    #     if k.endswith('fpath') and verify_file(v, is_critical=True):
    #         opts.__dict__[k] = verify_file(v, k)

    if cnf.output_grch and cnf.output_hg:
        critical('you cannot specify --output-hg and --output-grch simultaneously!')
    # if not which(opts.bedtools):
    #     info('bedtools executable not found, please specify correct path (current is %s)! '
    #          'Did you forget to execute "module load bedtools"?' % opts.bedtools)

    # if opts.debug:
    #     info('Configuration: ')
    #     for k, v in opts.__dict__.items():
    #         info('\t' + k + ': ' + str(v))
    info()

    # opts.ensembl_bed_fpath = verify_file(opts.ensembl_bed_fpath or \
    #     ('/ngs/reference_data/genomes/Hsapiens/' + opts.genome + '/bed/Exons/Exons.with_genes.bed'))
    # opts.refseq_bed_fpath = verify_file(opts.refseq_bed_fpath or \
    #     ('/ngs/reference_data/genomes/Hsapiens/' + opts.genome + '/bed/Exons/RefSeq/RefSeq_CDS_miRNA.all_features.bed'))

    return input_bed_fpath, output_bed_fpath, work_dirpath, cnf

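# An example invocation of the standardizer, assuming this module is exposed
# as standardize_bed.py (the entry-point name is hypothetical):
#
#   python standardize_bed.py Input.bed -o Standardized.bed --genome hg19 --output-grch
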