def main():
    """Entry point: build the TargQC summary for a bcbio project.

    Logs the command line, parses the standard summary-script arguments
    (extended with ``--bed`` and ``--exons`` style options) and hands the
    samples over to ``summarize_targqc``.
    """
    info(' '.join(sys.argv))
    info()

    # Extra command-line options on top of the common summary-script ones.
    bed_option = (
        ['--bed', '--capture', '--amplicons'],
        dict(dest='bed',
             help='BED file to run targetSeq and Seq2C analysis on.'))
    features_option = (
        ['--exons', '--exome', '--features'],
        dict(dest='features',
             help='Annotated CDS/Exons/Gene/Transcript BED file to make targetSeq exon/amplicon regions reports.'))

    cnf, bcbio_structure = bcbio_summary_script_proc_params(
        BCBioStructure.targqc_name,
        BCBioStructure.targqc_summary_dir,
        extra_opts=[bed_option, features_option])

    bed_fpath = adjust_path(cnf.bed)
    features_bed_fpath = adjust_path(cnf.features)

    # Fall back to one thread per sample when no thread count is configured.
    summarize_targqc(
        cnf,
        cnf.threads or len(bcbio_structure.samples),
        cnf.output_dir,
        bcbio_structure.samples,
        bed_fpath=bed_fpath,
        features_fpath=features_bed_fpath)
def get_chr_lengths_from_seq(seq_fpath):
    """Return ``[(chrom, length), ...]`` for a reference genome.

    The ``.fai`` index next to the sequence is preferred; when it is missing,
    the FASTA file itself is parsed with Biopython. If neither file can be
    verified, ``critical`` is called.

    :param seq_fpath: path to the FASTA file, or to its ``.fai`` index
    :return: list of ``(chromosome name, length)`` tuples in file order;
             the length is always an ``int`` (the index branch used to
             return it as a string, unlike the FASTA branch)
    """
    if seq_fpath.endswith('.fai'):
        seq_fpath = splitext(seq_fpath)[0]
    fai_fpath = seq_fpath + '.fai'

    chr_lengths = []

    if verify_file(fai_fpath, silent=True):
        info('Reading genome index file (.fai) to get chromosome lengths')
        with open(adjust_path(fai_fpath), 'r') as handle:
            for line in handle:
                fields = line.split()
                if not fields:
                    continue  # blank line
                # First two whitespace-separated columns: name and length.
                chr_lengths.append((fields[0], int(fields[1])))

    elif verify_file(seq_fpath, silent=True):
        info('Reading genome sequence (.fa) to get chromosome lengths')
        with open(adjust_path(seq_fpath), 'r') as handle:
            # Imported lazily so Biopython is needed only without an index.
            from Bio import SeqIO
            for record in SeqIO.parse(handle, 'fasta'):
                chr_lengths.append((record.id, len(record.seq)))

    else:
        critical('Can\'t find ' + seq_fpath + ' and ' + seq_fpath + '.fai')

    return chr_lengths
def check_dirs_and_files(cnf, file_keys=list(), dir_keys=list()):
    """Normalize and verify the file and directory paths stored in ``cnf``.

    Keys that are empty, absent from ``cnf`` or mapped to a falsy value are
    ignored. Every remaining path is passed through ``adjust_path`` and
    written back into ``cnf``.

    :param cnf: mapping-like configuration, updated in place
    :param file_keys: keys whose values must be existing files
    :param dir_keys: keys whose values must be existing directories
    :return: list of error messages (empty when everything verified)
    """
    def _is_set(_key):
        return _key and _key in cnf and cnf[_key]

    def _verify_input_file(_key):
        cnf[_key] = adjust_path(cnf[_key])
        if not verify_file(cnf[_key], _key):
            return False
        # Keys mentioning "bam"/"bed" get an additional format check.
        bam_failed = 'bam' in _key and not verify_bam(cnf[_key])
        if bam_failed:
            return False
        bed_failed = 'bed' in _key and not verify_bed(cnf[_key])
        return not bed_failed

    problems = []

    for key in file_keys:
        if not _is_set(key):
            continue
        if _verify_input_file(key):
            cnf[key] = adjust_path(cnf[key])
        else:
            problems.append('File ' + cnf[key] + ' is empty or cannot be found')

    for key in dir_keys:
        if not _is_set(key):
            continue
        cnf[key] = adjust_path(cnf[key])
        if verify_dir(cnf[key], key):
            cnf[key] = adjust_path(cnf[key])
        else:
            problems.append('Directory ' + cnf[key] + ' is empty or cannot be found')

    return problems
def main():
    """Entry point: run the FastQC preprocessing script on a remote server.

    Parses the read paths and the output directory, translates them with
    ``_proc_path``, builds the ``fastqc.py`` command line and executes it
    over SSH, echoing the remote output through ``err``.

    Exits with status 1 after printing usage when ``-1``, ``-2`` or ``-o``
    is missing (previously execution continued with empty paths).
    """
    info(' '.join(sys.argv))
    info()

    description = 'This script runs preprocessing.'
    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option('--downsample-to', dest='downsample_to', default=None, type='int',
                      help='Downsample reads to avoid excessive processing times with large files. '
                           'Default is 1 million. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()

    if not opts.left_reads_fpath or not opts.right_reads_fpath or not opts.output_dir:
        parser.print_usage()
        sys.exit(1)

    verify_file(opts.left_reads_fpath, is_critical=False)
    left_reads_fpath = adjust_path(opts.left_reads_fpath)
    verify_file(opts.right_reads_fpath, is_critical=False)
    right_reads_fpath = adjust_path(opts.right_reads_fpath)
    output_dirpath = adjust_path(opts.output_dir)
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    left_reads_fpath, right_reads_fpath, output_dirpath = \
        map(_proc_path, [left_reads_fpath, right_reads_fpath, output_dirpath])

    # NOTE(review): host and credentials are hard-coded here; they should
    # come from configuration rather than from the source code.
    ssh = connect_to_server(server_url='blue.usbod.astrazeneca.net', username='******', password='******')
    try:
        fastqc_py = get_script_cmdline(None, 'python', 'scripts/pre/fastqc.py')
        fastqc_py = fastqc_py.replace(REPORTING_SUITE_PATH_CLARITY, REPORTING_SUITE_PATH_WALTHAM)
        fastqc_py = fastqc_py.replace(PYTHON_PATH_CLARITY, PYTHON_PATH_WALTHAM)

        cmdl = '{fastqc_py} -1 {left_reads_fpath} -2 {right_reads_fpath} -o {output_dirpath}'
        if opts.sample_name:
            cmdl += ' --sample {opts.sample_name}'
        if opts.downsample_to:
            cmdl += ' --downsample-to ' + str(int(opts.downsample_to))
        # Explicit arguments instead of **locals(): only these names are used.
        cmdl = cmdl.format(
            fastqc_py=fastqc_py,
            left_reads_fpath=left_reads_fpath,
            right_reads_fpath=right_reads_fpath,
            output_dirpath=output_dirpath,
            opts=opts)
        cmdl += ' 2>&1'  # merge remote stderr into stdout
        info(cmdl)

        stdin, stdout, stderr = ssh.exec_command(cmdl)
        for l in stdout:
            err(l, ending='')
        info()
    finally:
        # Release the connection even when the remote command fails.
        ssh.close()
def print_genes(genes, output_fpath, canon_only):
    """Write gene, transcript and exon records to ``output_fpath``.

    Transcripts are taken from every gene (only canonical ones when
    ``canon_only`` is true) and processed in ``get_key()`` order. For each
    transcript an optional gene record, then the transcript and its exons
    (when it has any) are emitted; each record is written via ``__str__``.

    :param genes: iterable of gene objects exposing ``transcripts``
    :param output_fpath: destination file path
    :param canon_only: restrict output to canonical transcripts
    """
    selected = [tr
                for gene in genes
                for tr in gene.transcripts
                if not canon_only or tr.is_canonical]
    selected.sort(key=lambda _tr: _tr.get_key())

    emitted_genes = set()

    def _gene_feature_needed(tr):
        # skip gene feature for all miRNA because there are multi-domain miRNA located in different
        # places with the same gene name
        gene = tr.gene
        return all(other.biotype == 'protein_coding'
                   for other in gene.transcripts
                   if (other.is_canonical or not canon_only)) \
            and gene not in emitted_genes \
            and (len(gene.canonical_transcripts) == 1 or len(gene.transcripts) == 1)

    regions = []
    for tr in selected:
        if _gene_feature_needed(tr):
            regions.append(tr.gene)
            emitted_genes.add(tr.gene)
        if tr.exons:
            regions.append(tr)
            regions.extend(tr.exons)

    info('Writing ' + str(len(regions)) + ' regions')

    with open(adjust_path(output_fpath), 'w') as all_out:
        for region in regions:
            all_out.write(region.__str__())
def get_transcipts_with_exons_from_features(features_file, cur_chrom=None):
    """Group Exon/CDS/UTR records of a features file by transcript.

    Lines starting with ``#`` are skipped. Columns are tab-separated; the
    code reads column 0 as chromosome, 1 and 2 as integer start/stop,
    6 as feature type and 8 as transcript id.

    :param features_file: path to the (optionally gzipped) features file
    :param cur_chrom: when given, only records of this chromosome are kept
    :return: ``defaultdict(list)`` keyed by ``(transcript_id, chrom)`` with
             dicts holding ``transcript_id``, ``chrom``, ``start``, ``stop``
    """
    wanted_types = ['Exon', 'CDS', 'UTR']
    exons_by_transcript = defaultdict(list)

    with open_gzipsafe(adjust_path(features_file)) as in_f:
        for line in in_f:
            if line.startswith('#'):
                continue

            fields = line.strip('\n').split('\t')
            chrom = fields[0]
            other_chrom = cur_chrom and chrom != cur_chrom
            if other_chrom or fields[6] not in wanted_types:
                continue

            start, stop = int(fields[1]), int(fields[2])
            transcript_id = fields[8]
            exons_by_transcript[(transcript_id, chrom)].append(dict(
                transcript_id=transcript_id,
                chrom=chrom,
                start=start,
                stop=stop))

    return exons_by_transcript
def main():
    """Entry point: convert an Ensembl GTF into an exons BED on stdout.

    ``sys.argv[1]`` is the input file; ``sys.argv[2]`` is an optional HGNC
    synonyms file used to replace gene names with approved symbols. Without
    arguments the usage text is printed to stderr and the script exits
    with status 1.

    Fixes over the previous version: the HGNC argument is really optional
    now (it used to raise ``IndexError`` when omitted), and the three usage
    lines describing ``Ensembl.gtf`` end with a newline.
    """
    if len(sys.argv) < 2:
        usage_lines = (
            'The script writes all CDS, stop codon, and ncRNA exon regions for all known Ensembl genes, with associated gene symbols.\n',
            'When the gene name is found in HGNC, it get replaced with an approved name.\n',
            'If the gene is not charactirized (like LOC729737), this symbol is just kept as is.\n',
            '\n',
            'Usage:\n',
            ' ' + __file__ + ' Ensembl.gtf [HGNC_cBio_genes.tsv] [additional_feature_list] > Exons.bed\n',
            '\n',
            ' where HGNC_gene_synonyms.txt (from http://www.genenames.org/cgi-bin/download) is:\n',
            ' #Approved Symbol Previous Symbols Synonyms Chromosome Ensembl Gene ID UCSC ID(supplied by UCSC)\n',
            ' OR7E26P OR7E67P, OR7E69P, OR7E70P, OR7E68P OR1-51, OR1-72, OR1-73, OR912-95 19q13.43 ENSG00000121410 uc002qsg.3\n',
            ' ...\n',
            '\n',
            ' feature_list is by default empty, but could be transcript\n',
            '\n',
            ' and UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables) is:\n',
            ' #hg19.knownGene.name hg19.knownGene.chrom hg19.knownGene.strand hg19.knownGene.txStart hg19.knownGene.txEnd hg19.knownGene.exonCount hg19.knownGene.exonStarts hg19.knownGene.exonEnds hg19.kgXref.geneSymbol\n',
            ' uc001aaa.3 chr1 + 11873 14409 3 11873,12612,13220, 12227,12721,14409, DDX11L1\n',
            ' ...\n',
            ' or Ensembl.gtf (ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz)\n',
            ' 1 pseudogene gene 11869 14412 . + . gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";\n',
            ' 1 processed_transcript transcript 11869 14409 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";\n',
            ' ...\n',
            '\n',
            ' Writes to Exons.bed\n',
            '\n',
            'See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols\n',
        )
        for usage_line in usage_lines:
            sys.stderr.write(usage_line)
        sys.exit(1)

    input_fpath = adjust_path(sys.argv[1])
    # The synonyms file is optional, so do not index past the end of argv.
    hgnc_fpath = adjust_path(sys.argv[2]) if len(sys.argv) > 2 else None

    approved_gene_by_name = None
    # A literal '' (two quote characters) is also treated as "not provided".
    if hgnc_fpath and hgnc_fpath != "''":
        sys.stderr.write('Synonyms file provided ' + hgnc_fpath + '\n')
        approved_gene_by_name = read_hgnc_genes(hgnc_fpath)
    else:
        sys.stderr.write('No synonyms file provided, skipping approving\n')

    out = sys.stdout
    with open(input_fpath) as inp:
        inp.readline()  # the first line is skipped before processing
        _proc_ensembl(inp, out, approved_gene_by_name)
def proc_args(argv):
    """Parse the Seq2C command line and prepare configuration and samples.

    Note that ``argv`` is not consulted: options are parsed from
    ``sys.argv`` by ``OptionParser.parse_args()``.

    The positional arguments are either a single sample-to-BAM table or a
    list of BAM files. The output directory is created, directories and
    logging are set up, and, unless ``cnf.only_summary`` is set, the
    qsub runner is verified.

    :return: ``(cnf, samples, target_bed, cnf.output_dir)``
    """
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --contols sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)

    option_table = [
        (['-o'], dict(dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))),
        (['--bed'], dict(dest='bed', help='BED file to run Seq2C analysis')),
        (['-c', '--controls'], dict(dest='controls', help='Optional control sample names for Seq2C. For multiple controls, separate them using :')),
        (['--seq2c-opts'], dict(dest='seq2c_opts', help='Options for the final lr2gene.pl script.')),
        (['--no-prep-bed'], dict(dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)),
    ]
    for flags, params in option_table:
        parser.add_option(*flags, **params)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if not args:
        parser.print_usage()
        sys.exit(1)

    # A single non-BAM argument is a table mapping sample names to BAMs.
    is_sample_table = len(args) == 1 and not args[0].endswith('.bam')
    if is_sample_table:
        table_fpath = verify_file(args[0], is_critical=True, description='Input sample2bam.tsv')
        sample_names, bam_fpaths = read_samples(table_fpath)
        bam_by_sample = OrderedDict(zip(sample_names, bam_fpaths))
    else:
        bam_by_sample = find_bams(args)

    # Without a BED file the run configuration for WGS is selected.
    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = []
    for s_name, bam_fpath in bam_by_sample.items():
        samples.append(source.TargQC_Sample(
            name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath))

    info('Samples: ')
    for sample in samples:
        info(' ' + sample.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = None
    if cnf.bed:
        target_bed = verify_bed(cnf.bed, is_critical=True)

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided is sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir
def get_bed_targqc_inputs(cnf, bed_fpath=None):
    """Resolve the BED, features and custom-genes inputs for TargQC.

    :param cnf: configuration exposing ``features``, ``genome.features``
                and ``genes``
    :param bed_fpath: optional target BED; verified critically when given
    :return: ``(bed_fpath, features_bed_fpath, genes_fpath)``;
             ``genes_fpath`` is ``None`` when ``cnf.genes`` is not set
    """
    if bed_fpath:
        bed_fpath = verify_bed(
            bed_fpath, description='Input BED file', is_critical=True)
        info('Using amplicons/capture panel ' + bed_fpath)

    # An explicitly configured features file wins over the genome default.
    features_source = cnf.features or cnf.genome.features
    features_bed_fpath = adjust_path(features_source)
    if features_bed_fpath:
        info('Features: ' + features_bed_fpath)

    genes_fpath = None
    if cnf.genes:
        genes_fpath = adjust_path(cnf.genes)
        info('Custom genes list: ' + genes_fpath)

    return bed_fpath, features_bed_fpath, genes_fpath
def _verify_input_file(_key):
    """Normalize ``cnf[_key]`` and check that it points to a usable file.

    ``cnf`` is not a parameter; it is resolved from the enclosing scope.
    The path is always verified as a file; keys containing ``bam`` or
    ``bed`` additionally go through ``verify_bam`` / ``verify_bed``.

    :return: ``True`` when all applicable checks pass, ``False`` otherwise
    """
    cnf[_key] = adjust_path(cnf[_key])

    if not verify_file(cnf[_key], _key):
        return False

    # Format-specific checks, selected by a substring of the key name.
    format_checks = (('bam', verify_bam), ('bed', verify_bed))
    for marker, checker in format_checks:
        if marker in _key and not checker(cnf[_key]):
            return False

    return True
def main():
    """Entry point: filter vcf2txt results and write the output tables.

    Opens the input plus every configured output (main, rejected, FM-format,
    all-transcripts), runs ``Filtration.do_filtering`` and closes all files.

    All handles are now closed in a ``finally`` block; previously the
    rejected-mutations file was never closed and nothing was closed when
    filtering raised.
    """
    cnf, vcf2txt_res_fpath = get_args()

    info('-' * 70)
    info('Writing to ' + cnf.output_file)
    if cnf.all_transcripts_output_file:
        info('Writing info for all transcripts to ' + cnf.all_transcripts_output_file)
    if cnf.fm_output_file:
        info('Writing in FM format to ' + cnf.fm_output_file)
    if cnf.rejected_output_file:
        info('Writing rejected mutations to ' + cnf.rejected_output_file)

    f = Filtration(cnf)

    def _open_optional(fpath):
        # Optional outputs stay None when they are not configured.
        return open(adjust_path(fpath), 'w') if fpath else None

    input_f = output_f = None
    rejected_output_f = fm_output_f = all_transcripts_output_f = None
    try:
        input_f = open(verify_file(vcf2txt_res_fpath))
        output_f = open(adjust_path(cnf.output_file), 'w')
        rejected_output_f = _open_optional(cnf.rejected_output_file)
        fm_output_f = _open_optional(cnf.fm_output_file)
        all_transcripts_output_f = _open_optional(cnf.all_transcripts_output_file)

        info()
        info('-' * 70)
        info('Running filtering...')
        f.do_filtering(input_f, output_f, fm_output_f, all_transcripts_output_f, rejected_output_f)
    finally:
        for handle in (input_f, output_f, fm_output_f,
                       all_transcripts_output_f, rejected_output_f):
            if handle:
                handle.close()

    info()
    if cnf.rejected_output_file:
        info('Rejected mutations saved to ' + cnf.rejected_output_file)
    info('Saved to ' + cnf.output_file)
def sort_bed(cnf, input_bed_fpath, output_bed_fpath=None):
    """Sort a BED file by reference chromosome order, start and end.

    Chromosome order is taken from ``get_chr_lengths_from_seq`` for
    ``cnf.genome.seq``; chromosomes absent from it get order ``-1``.
    Blank lines are dropped and ``#`` lines are copied to the output as
    they are read, ahead of the sorted regions. An existing valid output is
    reused when ``cnf.reuse_intermediate`` is set.

    :param cnf: configuration (``genome.seq``, ``work_dir``,
                ``reuse_intermediate``)
    :param input_bed_fpath: BED file to sort (verified first)
    :param output_bed_fpath: destination; defaults to an intermediate file
                             name with the ``sorted`` suffix
    :return: path of the sorted BED file
    """
    input_bed_fpath = verify_bed(input_bed_fpath)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(cnf, input_bed_fpath, 'sorted')

    class Region(SortableByChrom):
        def __init__(self, chrom, start, end, other_fields, chrom_ref_order):
            SortableByChrom.__init__(self, chrom, chrom_ref_order)
            self.start = start
            self.end = end
            self.chrom_ref_order = chrom_ref_order
            self.other_fields = tuple(other_fields)

        def get_key(self):
            return self.chrom_ref_order, self.start, self.end, self.other_fields

    regions = []

    chr_lengths = get_chr_lengths_from_seq(cnf.genome.seq)
    chr_order = dict((name, idx) for idx, (name, _) in enumerate(chr_lengths))

    info('Sorting regions in ' + input_bed_fpath)
    can_reuse = cnf.reuse_intermediate and isfile(output_bed_fpath) \
        and verify_bed(output_bed_fpath)
    if can_reuse:
        info(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    with open(input_bed_fpath) as bed_in:
        with file_transaction(cnf.work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for raw_line in bed_in:
                    stripped = raw_line.strip()
                    if not stripped:
                        continue
                    if stripped.startswith('#'):
                        out.write(raw_line)
                        continue

                    fields = stripped.split('\t')
                    chrom = fields[0]
                    start = int(fields[1])
                    end = int(fields[2])
                    regions.append(Region(
                        chrom, start, end, fields[3:], chr_order.get(chrom, -1)))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    columns = [region.chrom, str(region.start), str(region.end)]
                    columns.extend(region.other_fields)
                    out.write('\t'.join(columns) + '\n')

    info('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath + '\n')
    return output_bed_fpath
def determine_run_cnf(opts, is_wgs=False, is_targetseq=False):
    """Pick the run configuration file and store it in ``opts.run_cnf``.

    An explicit ``opts.run_cnf`` is only normalized; otherwise a default is
    chosen from ``defaults``: WGS first, then deep (target) sequencing,
    then exome. The chosen file is verified critically.

    :return: the resulting ``opts.run_cnf`` path
    """
    if opts.run_cnf:
        opts.run_cnf = adjust_path(opts.run_cnf)
    else:
        if is_wgs:
            default_key = 'run_cnf_wgs'
        elif is_targetseq:
            default_key = 'run_cnf_deep_seq'
        else:
            default_key = 'run_cnf_exome_seq'
        opts.run_cnf = defaults[default_key]

    verify_file(opts.run_cnf, is_critical=True)
    debug('Using run configuration ' + opts.run_cnf)
    return opts.run_cnf
def main():
    """Entry point: run FastQC preprocessing for one pair of read files.

    Parses the command line, builds the configuration, verifies the read
    files and the parent of the output directory, then calls ``run_fastq``
    inside the ``workdir`` context and logs where the results are.
    """
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(description='This script runs preprocessing.')
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    downsample_help = ('Downsample reads to avoid excessive processing times with large files. '
                       'Default is 1 million. Set to 0 to turn off downsampling.')
    parser.add_option('--downsample-to', dest='downsample_to', default=None,
                      type='int', help=downsample_help)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), determine_run_cnf(opts))

    left_reads_fpath = verify_file(opts.left_reads_fpath, is_critical=True)
    right_reads_fpath = verify_file(opts.right_reads_fpath, is_critical=True)

    if opts.output_dir:
        output_dirpath = adjust_path(opts.output_dir)
    else:
        output_dirpath = critical('Please, specify output directory with -o')
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    with workdir(cnf):
        # Derive the sample name from the read files when it is not given.
        sample_name = cnf.sample_name or _get_sample_name(left_reads_fpath, right_reads_fpath)

        results_dirpath = run_fastq(
            cnf, sample_name, left_reads_fpath, right_reads_fpath,
            output_dirpath, downsample_to=cnf.downsample_to)
        verify_dir(results_dirpath, is_critical=True)

        info()
        info('*' * 70)
        info('Fastqc results:')
        info(' ' + results_dirpath)
def verify_bed(fpath, description='', is_critical=False, silent=False):
    """Check that ``fpath`` is an existing, well-formed BED file.

    :param fpath: path to check
    :param description: label forwarded to ``verify_file``
    :param is_critical: report problems with ``critical`` instead of ``err``
    :param silent: forwarded to ``verify_file``
    :return: the normalized path, or ``None`` when the file is missing or
             ``BedFile.checkformat`` reports an error
    """
    file_ok = verify_file(fpath, description, is_critical=is_critical, silent=silent)
    if not file_ok:
        return None

    fpath = adjust_path(fpath)
    format_error = BedFile(fpath).checkformat()
    if not format_error:
        return fpath

    report = critical if is_critical else err
    report('Error: incorrect bed file format (' + fpath + '): ' + str(format_error) + '\n')
    return None
def main():
    """Entry point: turn a coverage histogram report into seq2cov output.

    Expects three positional arguments: the depth report path, the sample
    name and the BED column count. Exits with a usage message when fewer
    are given (the old ``< 2`` guard let two arguments through and then
    failed on unpacking with ``ValueError``).
    """
    args = sys.argv[1:]
    if len(args) < 3:
        sys.exit('Usage: ' + __file__ + ' sambamba_depth_report sample_name bed_col_num')

    # Only the first three arguments are meaningful.
    bedcov_hist_fpath, sample_name, bed_col_num = args[:3]

    amplicons = summarize_bedcoverage_hist_stats(
        adjust_path(bedcov_hist_fpath), sample_name, int(bed_col_num))
    amplicons = sorted(amplicons, key=lambda a: (a.chrom, a.gene_name, a.start))

    for r in amplicons:
        r.calc_avg_depth()

    save_regions_to_seq2cov_output__nocnf(sample_name, amplicons)
def main(args):
    """Lift over every ``.bed`` file under a directory tree to another build.

    The input tree is walked recursively and mirrored under the output
    root. For each BED file the first line is inspected: unless columns 7
    and 8 both parse as integers, the file is first cut down to its first
    four columns. Unlifted records are kept in ``<name>.unlifted`` only
    when that file is non-empty.

    :param args: ``[input_root, output_root, build]``; ``build`` is optional
                 and defaults to ``hg38``

    Changes: the Python 2 ``print`` statement became a call valid in both
    Python 2 and 3, the bare ``except:`` now catches only the expected
    ``IndexError``/``ValueError``, a no-op debugging branch was dropped and
    the command template is filled with explicit arguments.
    """
    if len(args) < 2:
        critical('Usage: ' + __file__ + ' InputRootDirectory OutputRootDirectory [Build=hg38]')
        sys.exit(1)

    inp_root = adjust_path(args[0])
    out_root = adjust_path(args[1])

    build = 'hg38'
    if len(args) >= 3:
        build = args[2]

    chain_fpath = chains[build.lower()]

    for inp_dirpath, subdirs, files in os.walk(inp_root):
        for fname in files:
            if not fname.endswith('.bed'):
                continue

            inp_fpath = adjust_path(join(inp_dirpath, fname))
            print(inp_fpath + ': ' + str(count_bed_cols(inp_fpath)) + ' columns')

            out_dirpath = adjust_path(join(out_root, relpath(inp_dirpath, inp_root)))
            safe_mkdir(out_dirpath)
            out_fpath = adjust_path(join(out_dirpath, fname))
            unlifted_fpath = adjust_path(join(out_dirpath, fname + '.unlifted'))

            cmdline = ''

            with open(inp_fpath) as f:
                fs = f.readline().split('\t')
            try:
                int(fs[6])
                int(fs[7])
            except (IndexError, ValueError):
                # Fewer than 8 columns, or non-numeric columns 7/8.
                info('Cutting ' + inp_fpath)
                cmdline += 'cut -f1,2,3,4 "{inp_fpath}" > __cut; '

            # NOTE(review): the liftover step always reads "__cut", which is
            # only (re)written by the "cut" step above - confirm that this
            # is intended for files that are not cut.
            cmdline += liftover_fpath + ' __cut {chain_fpath} "{out_fpath}" "{unlifted_fpath}"'
            cmdline = cmdline.format(
                inp_fpath=inp_fpath,
                chain_fpath=chain_fpath,
                out_fpath=out_fpath,
                unlifted_fpath=unlifted_fpath)
            info(cmdline)
            os.system(cmdline)
            verify_file(out_fpath)

            if isfile(unlifted_fpath):
                if getsize(unlifted_fpath) <= 0:
                    os.remove(unlifted_fpath)
                else:
                    err('Some records were unlifted and saved to ' + unlifted_fpath)
def main():
    """Entry point: sort a BED file according to the reference genome.

    Takes one positional argument (the input BED) plus ``-o`` for the
    output path and ``-g`` for the genome; delegates to ``sort_bed``.
    """
    usage = 'Usage: ' + basename(__file__) + ' -o Output_BED_file -g hg19 Input_BED_file'
    parser = OptionParser(usage=usage)
    parser.add_option('-o', '--output-bed', dest='output_fpath')
    parser.add_option('-g', '--genome', dest='genome')

    (opts, args) = parser.parse_args(sys.argv[1:])
    if len(args) < 1:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})
    check_genome_resources(cnf)

    if not cnf.output_fpath:
        critical(parser.usage)

    input_bed_fpath = verify_bed(args[0], is_critical=True)
    output_bed_fpath = adjust_path(cnf.output_fpath)
    sort_bed(cnf, input_bed_fpath, output_bed_fpath)
def get_chr_len_fpath(cnf):
    """Return the path of a ``chr_lengths.txt`` file in the work directory.

    An existing file is reused when ``cnf.reuse_intermediate`` is set.
    Otherwise the file is written from ``get_chr_lengths_from_seq`` as
    ``<chrom>\\t<length>`` lines.

    :return: the file path, or ``None`` when ``cnf.genome.seq`` is not set
             (after reporting through ``critical``)
    """
    chr_len_fpath = join(cnf.work_dir, 'chr_lengths.txt')

    if cnf.reuse_intermediate and file_exists(chr_len_fpath):
        info(chr_len_fpath + ' exists, reusing')
        return chr_len_fpath

    if not cnf.genome.seq:
        critical('There is no "seq" key in ' + cnf.sys_cnf + ' for "' + cnf.genome.name + '" section')
        return None

    chr_lengths = get_chr_lengths_from_seq(adjust_path(cnf.genome.seq))

    with file_transaction(cnf.work_dir, chr_len_fpath) as tx:
        with open(tx, 'w') as handle:
            handle.writelines(
                name + '\t' + str(length) + '\n' for name, length in chr_lengths)

    return chr_len_fpath
def set_up_dirs(cnf, log_dir_name='log'):
    """Create the output, work and log directories and set up logging.

    ``cnf.log_dir == '-'`` disables the log directory (it is reset to
    ``None``); an unset ``cnf.log_dir`` defaults to ``log_dir_name`` inside
    the work directory.

    :param cnf: configuration, updated in place
    :param log_dir_name: name of the default log directory
    """
    if cnf.output_dir:
        cnf.output_dir = adjust_path(cnf.output_dir)
        safe_mkdir(cnf.output_dir, 'output_dir')
        info('Saving into ' + cnf.output_dir)

    set_up_work_dir(cnf)

    logging_disabled = cnf.log_dir == '-'
    if logging_disabled:
        cnf.log_dir = None
    else:
        if not cnf.log_dir:
            cnf.log_dir = join(cnf.work_dir, log_dir_name)
        safe_mkdir(cnf.log_dir)
        info('Created log dir ' + cnf.log_dir)

    set_up_log(cnf)
def _verify_sample_info(vcf_conf, vcf_header_samples):
    """Validate per-sample configuration against the VCF header.

    For every configured sample the parent configuration is merged in, the
    BAM path (if any) is verified, and the normalized path is stored back.
    Then each configured sample name must be present in
    ``vcf_header_samples``; otherwise ``critical`` is called.

    :param vcf_conf: VCF configuration mapping, optionally with ``samples``
    :param vcf_header_samples: sample names found in the VCF header
    :return: the ``samples`` mapping, or an empty ``OrderedDict``
    """
    if 'samples' in vcf_conf:
        for sample_conf in vcf_conf['samples'].values():
            join_parent_conf(sample_conf, vcf_conf)
            bam = sample_conf.get('bam')
            if bam and not verify_file(bam, 'Bam file'):
                exit()
            sample_conf['bam'] = adjust_path(bam)

    sample_cnfs = vcf_conf.get('samples') or OrderedDict()

    # compare input sample names to vcf header
    for input_sample_name in sample_cnfs:
        if input_sample_name not in vcf_header_samples:
            # The header samples are joined below; concatenating the
            # sequence itself into the message raised TypeError.
            critical('ERROR: sample ' + input_sample_name + ' is not in VCF header\n'
                     'Available samples: ' + ', '.join(vcf_header_samples))

    return sample_cnfs
def get_args():
    """Parse the command line: one input VCF path and ``-o`` output path.

    Missing input or output is reported through ``critical``.

    :return: ``(verified input path, normalized output path)``
    """
    info(' '.join(sys.argv))
    info()

    parser = OptionParser()
    parser.add_option('-o', dest='output_file')
    parser.set_usage('Usage: ' + __file__ + ' dbsnp.vcf.gz -o output_fpath')

    (opts, args) = parser.parse_args()
    if not args:
        critical("Provide the first argument - path to dbsnp VCF")
    vcf2txt_res_fpath = verify_file(args[0])

    if not opts.output_file:
        critical('Please, specify the output fpath with -o')
    info()

    output_fpath = adjust_path(opts.output_file)
    return vcf2txt_res_fpath, output_fpath
def set_up_work_dir(cnf):
    """Decide on ``cnf.work_dir`` and make sure the directory exists.

    Priority: an explicit ``cnf.work_dir`` (normalized); else ``work`` or
    ``work_<sample>`` inside ``cnf.output_dir``; else a fresh temporary
    directory from ``tempfile.mkdtemp``.
    """
    if cnf.work_dir:
        cnf.work_dir = adjust_path(cnf.work_dir)
        info('Work dir: ' + cnf.work_dir)

    elif cnf.output_dir:
        sample_suffix = '_' + cnf.sample if cnf.sample else ''
        cnf.work_dir = join(cnf.output_dir, 'work' + sample_suffix)
        info('Work dir: ' + cnf.work_dir)

    else:
        cnf.work_dir = tempfile.mkdtemp()
        info('Creating temprorary directory for work dir: ' + cnf.work_dir)

    safe_mkdir(cnf.work_dir, 'working directory')
def verify_bam(fpath, description='', is_critical=False, silent=False):
    """Check that ``fpath`` exists, is named ``*.bam`` and looks binary.

    The first three bytes are inspected: the file counts as binary when at
    least one of them is outside the set of typical text bytes.

    The file is now read in binary mode inside a ``with`` block (the handle
    used to be leaked), and the check uses ``bytearray.translate`` so it
    behaves the same on Python 2 and Python 3.

    :param fpath: path to check
    :param description: label forwarded to ``verify_file``
    :param is_critical: report problems with ``critical`` instead of ``err``
    :param silent: forwarded to ``verify_file``
    :return: the normalized path, or ``None`` when any check fails
    """
    if not verify_file(fpath, description, is_critical=is_critical, silent=silent):
        return None

    fpath = adjust_path(fpath)

    logfn = critical if is_critical else err
    if not fpath.endswith('.bam'):
        logfn('The file ' + fpath + ' is supposed to be BAM but does not have the .bam '
              'extension. Please, make sure you pass proper file.')
        return None

    # Bytes considered "text": common control characters plus 0x20..0xFF.
    text_bytes = bytearray([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100)))
    with open(fpath, 'rb') as handle:
        head = bytearray(handle.read(3))
    # Whatever survives deletion of the text bytes is binary content.
    looks_binary = bool(head.translate(None, text_bytes))

    if not looks_binary:
        logfn('The BAM file ' + fpath + ' must be a binary file.')
        return None

    return fpath
def sort_bed_by_alphabet(cnf, input_bed_fpath, output_bed_fpath=None, chr_len_fpath=None):
    """Regroup BED lines so that chromosomes appear in alphabetical order.

    Lines keep their relative order within a chromosome. Blank lines and
    lines on chromosomes not returned by ``get_chr_lengths`` are dropped;
    ``#`` lines are copied to the output as they are read. Lines are
    accumulated per chromosome in string chunks and written at the end.

    :param cnf: configuration (``work_dir`` is used for the transaction)
    :param input_bed_fpath: BED file to regroup
    :param output_bed_fpath: destination; defaults to the input name with
                             the ``sorted`` suffix
    :param chr_len_fpath: forwarded to ``get_chr_lengths``
    :return: path of the written BED file
    """
    chr_lengths = get_chr_lengths(cnf, chr_len_fpath)
    known_chroms = set([name for (name, _) in chr_lengths])

    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = add_suffix(input_bed_fpath, 'sorted')

    chunks_by_chrom = defaultdict(list)

    info('Sorting regions...')
    chunk_size = 10
    lines_in_chunk = 0

    with open(input_bed_fpath) as bed_in:
        with file_transaction(cnf.work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for raw_line in bed_in:
                    stripped = raw_line.strip()
                    if not stripped:
                        continue
                    if stripped.startswith('#'):
                        out.write(raw_line)
                        continue

                    chrom = stripped.split('\t')[0]
                    if chrom not in known_chroms:
                        continue

                    chunks = chunks_by_chrom[chrom]
                    # Open a new chunk when the counter is full or when
                    # this chromosome has no chunk yet.
                    if lines_in_chunk == chunk_size or not chunks:
                        lines_in_chunk = 0
                        chunks.append('')
                    chunks[-1] += raw_line
                    lines_in_chunk += 1

                for chrom_name in sorted(chunks_by_chrom.keys()):
                    out.writelines(chunks_by_chrom[chrom_name])

    return output_bed_fpath
def main():
    """Entry point: convert VarDict TXT output to VCF for bcbio projects.

    Parses the post-bcbio arguments, builds a ``BCBioStructure`` per
    project, prepares the work directory and converts every sample whose
    phenotype is not ``normal``.
    """
    info(' '.join(sys.argv))
    info()

    usage = 'Usage: ' + basename(__file__) + ' [-o Output_directory -c Var_caller_name] Project_directory'
    parser = OptionParser(
        description='This script converts Vardict TXT file to VCF.',
        usage=usage)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('--log-dir', dest='log_dir', default='-')
    parser.add_option('-c', '--caller', dest='caller_name', default='vardict')
    parser.add_option('-o', dest='output_dir', help='Output directory.')

    cnf, bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths, tags, is_wgs_in_bcbio, is_rnaseq \
        = process_post_bcbio_args(parser)

    if not bcbio_project_dirpaths:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    bcbio_structures = [
        BCBioStructure(cnf, project_dirpath, project_cnf, final_dirpath)
        for project_dirpath, project_cnf, final_dirpath
        in zip(bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths)]

    # Default work directory: "work" under the output directory.
    cnf.work_dir = cnf.work_dir or adjust_path(join(cnf.output_dir, 'work'))
    safe_mkdir(cnf.work_dir)

    info('')
    info('*' * 70)
    for structure in bcbio_structures:
        for sample in structure.samples:
            if sample.phenotype == 'normal':
                continue
            convert_vardict_txts_to_bcbio_vcfs(cnf, structure, sample)
def _read_args(args_list):
    """Parse the BED standardization command line.

    Exactly one positional argument (the input BED) is required; otherwise
    help is printed to stderr and the process exits with status 1. A
    temporary working directory is created with ``tempfile.mkdtemp``.
    Passing both ``--output-hg`` and ``--output-grch`` only logs a message.

    :param args_list: argument list handed to ``OptionParser.parse_args``
    :return: ``(input_bed_fpath, output_bed_fpath, work_dirpath, cnf)``
    """
    parser = OptionParser(
        usage='usage: %prog [options] Input_BED_file -o Standardized_BED_file',
        description='Scripts outputs a standardized version of input BED file. '
                    'Standardized BED: 1) has 4 or 8 fields (for BEDs with primer info);'
                    ' 2) has HGNC approved symbol in forth column if annotation is '
                    'possible and not_a_gene_X otherwise;'
                    ' 3) is sorted based on chromosome name -> start -> end;'
                    ' 4) has no duplicated regions (regions with the same chromosome, start and end), '
                    'the only exception is _CONTROL_ regions.')
    parser.add_option('-o', '--output-bed', dest='output_fpath')
    parser.add_option(
        '--debug', dest='debug', default=False, action='store_true',
        help='run in a debug more (verbose output, keeping of temporary files)')
    parser.add_option(
        '--output-hg', dest='output_hg', default=False, action='store_true',
        help='output chromosome names in hg-style (chrM, chr1, .., chr22, chrX, chrY)')
    parser.add_option(
        '--output-grch', dest='output_grch', default=False, action='store_true',
        help='output chromosome names in GRCh-style (1, .., 22, X, Y, MT)')
    parser.add_option('-g', '--genome', dest='genome', default='hg19')

    (opts, args) = parser.parse_args(args_list)
    if len(args) != 1:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    work_dirpath = tempfile.mkdtemp()
    info('Creating a temporary working directory ' + work_dirpath)
    if not exists(work_dirpath):
        os.mkdir(work_dirpath)

    input_bed_fpath = abspath(args[0])
    info('Input: ' + input_bed_fpath)

    output_bed_fpath = adjust_path(cnf.output_fpath)
    info('Writing to: ' + output_bed_fpath)

    both_styles_requested = cnf.output_grch and cnf.output_hg
    if both_styles_requested:
        info('you cannot specify --output-hg and --output-grch simultaneously!')

    info()
    return input_bed_fpath, output_bed_fpath, work_dirpath, cnf
def _read_args(args_list):
    """Read command-line options for the BED standardization script.

    Requires exactly one positional argument, the input BED file; any other
    count prints help to stderr and exits with status 1. Creates a
    temporary working directory and builds the configuration object.
    Requesting both hg-style and GRCh-style output is only logged.

    :param args_list: argument list handed to ``OptionParser.parse_args``
    :return: ``(input_bed_fpath, output_bed_fpath, work_dirpath, cnf)``
    """
    def _flag(dest, help_text):
        # Boolean switch that is off unless given on the command line.
        return dict(dest=dest, help=help_text, default=False, action='store_true')

    option_specs = (
        (('-o', '--output-bed'), dict(dest='output_fpath')),
        (('--debug',), _flag(
            'debug',
            'run in a debug more (verbose output, keeping of temporary files)')),
        (('--output-hg',), _flag(
            'output_hg',
            'output chromosome names in hg-style (chrM, chr1, .., chr22, chrX, chrY)')),
        (('--output-grch',), _flag(
            'output_grch',
            'output chromosome names in GRCh-style (1, .., 22, X, Y, MT)')),
        (('-g', '--genome'), dict(dest='genome', default='hg19')),
    )

    description = (
        'Scripts outputs a standardized version of input BED file. '
        'Standardized BED: 1) has 4 or 8 fields (for BEDs with primer info);'
        ' 2) has HGNC approved symbol in forth column if annotation is '
        'possible and not_a_gene_X otherwise;'
        ' 3) is sorted based on chromosome name -> start -> end;'
        ' 4) has no duplicated regions (regions with the same chromosome, start and end), '
        'the only exception is _CONTROL_ regions.')
    parser = OptionParser(
        usage='usage: %prog [options] Input_BED_file -o Standardized_BED_file',
        description=description)
    for flags, params in option_specs:
        parser.add_option(*flags, **params)

    (opts, positional) = parser.parse_args(args_list)
    if len(positional) != 1:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    work_dirpath = tempfile.mkdtemp()
    info('Creating a temporary working directory ' + work_dirpath)
    if not exists(work_dirpath):
        os.mkdir(work_dirpath)

    input_bed_fpath = abspath(positional[0])
    info('Input: ' + input_bed_fpath)

    output_bed_fpath = adjust_path(cnf.output_fpath)
    info('Writing to: ' + output_bed_fpath)

    if cnf.output_grch and cnf.output_hg:
        info('you cannot specify --output-hg and --output-grch simultaneously!')

    info()

    return input_bed_fpath, output_bed_fpath, work_dirpath, cnf
def _postprocess(input_fpath, annotated_fpaths, bed_params, output_bed_fpath, cnf, chrom_order):
    '''
    Merge the annotation results back into the input regions and write the
    standardized BED file:

    1. Sorts the regions.
    2. Chooses appropriate number of columns (4 or 8 for BEDs with primers).
    3. Removes duplicates.

    When one region received several different gene symbols, the symbol is
    picked by type priority ('approved' over 'not_approved', 'key' over
    non-'key'); remaining ties are resolved using the closest unambiguous
    ("solid") region on the same chromosome.
    '''
    info('postprocessing (sorting, cutting, removing duplicates)')

    with open(adjust_path(cnf.key_genes), 'r') as key_genes_file:
        key_genes = [row.strip() for row in key_genes_file]

    approved_genes = []
    if cnf.hgnc:
        with open(adjust_path(cnf.hgnc), 'r') as hgnc_file:
            hgnc_file.readline()  # skip the header row
            for row in hgnc_file:
                approved_genes.append(row.split('\t')[0])

    # Output settings are stored on Region itself, so they apply to every region.
    input_is_grch = bed_params.GRCh_names
    Region.GRCh_names = input_is_grch
    if cnf.output_grch:
        Region.GRCh_names = True
        if cnf.debug and not input_is_grch:
            info('Changing chromosome names from hg-style to GRCh-style.')
    if cnf.output_hg:
        Region.GRCh_names = False
        if cnf.debug and input_is_grch:
            info('Changing chromosome names from GRCh-style to hg-style.')
    Region.n_cols_needed = bed_params.n_cols_needed
    Region.key_genes = key_genes
    Region.approved_genes = approved_genes

    def _load_regions(bed_fpath):
        # Read a tab-separated BED file into a list of Region objects.
        loaded = []
        with open(adjust_path(bed_fpath)) as bed_file:
            for row in bed_file:
                fields = row.strip().split('\t')
                chrom, start, end = fields[0], int(fields[1]), int(fields[2])
                region = Region(chrom, chrom_order.get(chrom), start, end)
                if len(fields) > 3:
                    region.set_symbol(fields[3])
                else:
                    region.set_symbol('{0}:{1}-{2}'.format(chrom, start, end))
                region.rest = fields[4:] if len(fields) > 4 else None
                loaded.append(region)
        return loaded

    input_regions = set(_load_regions(input_fpath))  # only unique regions are kept

    annotated_regions = []
    for annotated_fpath in annotated_fpaths:
        annotated_regions.extend(_load_regions(annotated_fpath))

    with open(adjust_path(output_bed_fpath), 'w') as out:
        out.writelines(bed_params.header)

        # First pass: walk the sorted input regions and the sorted annotated
        # regions in lockstep (ann_idx points into annotated_regions).
        annotated_regions.sort()
        ann_idx = 0
        prev_region = None
        not_a_gene_count = 0
        solid_regions = []
        prev_is_solid = False
        all_regions = []
        for cur_region in sorted(list(input_regions) + bed_params.controls):
            if cur_region.is_control():
                all_regions.append(cur_region)
                prev_region = cur_region
                continue

            first_match = annotated_regions[ann_idx]
            assert first_match == cur_region, \
                str(cur_region) + ' != ' + str(first_match) + '(i=%d)' % ann_idx
            if first_match.symbol != '.':
                cur_region.set_symbol(first_match.symbol)
            else:
                # Adjacent unannotated regions on one chromosome share a number.
                starts_new_run = (
                    prev_region is None
                    or prev_region.chrom != cur_region.chrom
                    or not prev_region.symbol.startswith("not_a_gene"))
                if starts_new_run:
                    not_a_gene_count += 1
                cur_region.set_symbol("not_a_gene_%d" % not_a_gene_count)
            ann_idx += 1

            # Consume the remaining annotations of the same region.
            candidates = [cur_region]
            while ann_idx < len(annotated_regions) and annotated_regions[ann_idx] == cur_region:
                other = annotated_regions[ann_idx]
                if other.symbol != '.' and other.symbol != cur_region.symbol:
                    duplicate = copy.deepcopy(cur_region)
                    duplicate.set_symbol(other.symbol)
                    if duplicate.type == 'approved' and cur_region.type == 'not_approved':
                        cur_region = duplicate
                        candidates = [cur_region]
                    elif other.type == 'key' and cur_region.type != 'key':
                        cur_region = duplicate
                        candidates = [cur_region]
                        if cnf.debug:
                            info('key gene priority over approved gene was used')
                    elif other.type == cur_region.type:
                        candidates.append(duplicate)
                ann_idx += 1

            if len(candidates) == 1:
                if not prev_is_solid:
                    solid_regions.append(cur_region)
                prev_is_solid = True
                all_regions.append(cur_region)
            else:
                if prev_is_solid:
                    solid_regions.append(prev_region)
                prev_is_solid = False
                all_regions.append(candidates)
            prev_region = cur_region

        # Second pass: resolve the ambiguous entries and write everything out.
        solid_idx = -1
        for entry in all_regions:
            if not isinstance(entry, list):
                out.write(str(entry) + '\n')
                continue

            # entry is a list of candidate regions differing only by symbol
            chosen = entry[0]
            while solid_idx + 1 < len(solid_regions) and chosen > solid_regions[solid_idx + 1]:
                solid_idx += 1

            found = False
            if solid_idx >= 0 and chosen > solid_regions[solid_idx] \
                    and chosen.chrom == solid_regions[solid_idx].chrom:
                prev_solid = solid_regions[solid_idx]
                for chosen in entry:
                    if chosen.symbol == prev_solid.symbol:
                        found = True
                        if cnf.debug:
                            info('gene name was chosen based on previous solid region')
                        break
            if not found and solid_idx + 1 < len(solid_regions) and chosen < solid_regions[solid_idx + 1] \
                    and chosen.chrom == solid_regions[solid_idx + 1].chrom:
                next_solid = solid_regions[solid_idx + 1]
                for chosen in entry:
                    if chosen.symbol == next_solid.symbol:
                        found = True
                        if cnf.debug:
                            info('gene name was chosen based on next solid region')
                        break
            if not found:
                chosen = entry[0]

            # str(region) automatically outputs correct number of columns and GRCh/hg names
            out.write(str(chosen) + '\n')
def main():
    """Build BED files with the regions of all known genes from a gene database.

    Command line: ``<genome> <db file> <output.bed> [synonyms] [not_approved_out]``

    * ``sys.argv[1]`` - genome name; ``'hg19'`` selects the hg19 resources,
      anything else selects the hg38 ones.
    * ``sys.argv[2]`` - input database: Ensembl GTF, RefSeq GFF3 or a
      UCSC/RefSeq knownGene table (optionally gzipped).
    * ``sys.argv[3]`` - output file path; a second file with the ``canon``
      suffix is written for the canonical transcripts.
    * ``sys.argv[4]`` - optional HGNC synonyms file used to approve gene names.
    * ``sys.argv[5]`` - optional path to save the names not approved by HGNC.

    With fewer than three arguments the usage text is printed and the process
    exits with status 1.
    """
    if len(sys.argv) < 4:
        info('The script writes all CDS, stop codon, and ncRNA exon regions for all known Ensembl genes, with associated gene symbols.')
        info(' ')
        info('Usage: ')
        info(' ' + __file__ + ' hg19 db.gtf output.bed [HGNC_gene_synonyms.txt=' + us_syn_path + '] [additional_feature_list]')
        info(' ')
        info(' where HGNC_gene_synonyms.txt (from http://www.genenames.org/cgi-bin/download) is:')
        info(' #Approved Symbol Previous Symbols Synonyms Chromosome Ensembl Gene ID UCSC ID(supplied by UCSC)')
        info(' OR7E26P OR7E67P, OR7E69P, OR7E70P, OR7E68P OR1-51, OR1-72, OR1-73, OR912-95 19q13.43 ENSG00000121410 uc002qsg.3')
        info(' ... ')
        info(' ')
        info(' or DB is Ensembl GTF ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz')
        info(' 1 pseudogene gene 11869 14412 . + . gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";')
        info(' 1 processed_transcript transcript 11869 14409 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";')
        info(' ... ')
        info(' ')
        info(' or DB is RefSeq GTF ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz')
        info(' NC_000001.10 RefSeq region 1 249250621 . + . ID=id0;Name=1;Dbxref=taxon:9606;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA')
        info(' NC_000001.10 BestRefSeq gene 11874 14409 . + . ID=gene0;Name=DDX11L1;Dbxref=GeneID:100287102,HGNC:37102;description=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;gbkey=Gene;gene=DDX11L1;part=1%2F1;pseudo=true')
        info(' NC_000001.10 BestRefSeq transcript 11874 14409 . + . ID=rna0;Name=NR_046018.2;Parent=gene0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2')
        info(' NC_000001.10 BestRefSeq exon 11874 12227 . + . ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2')
        info(' ... ')
        info(' ')
        info(' or either RefSeq_knownGene.txt or UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables) is:')
        info(' #hg19.knownGene.name hg19.knownGene.chrom hg19.knownGene.strand hg19.knownGene.txStart hg19.knownGene.txEnd hg19.knownGene.exonCount hg19.knownGene.exonStarts hg19.knownGene.exonEnds hg19.kgXref.geneSymbol')
        info(' uc001aaa.3 chr1 + 11873 14409 3 11873,12612,13220, 12227,12721,14409, DDX11L1')
        info(' ... ')
        info(' ')
        info(' Writes to Exons.bed ')
        info(' ')
        info('See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols')
        sys.exit(1)

    genome_name = sys.argv[1]
    is_hg19 = genome_name == 'hg19'
    seq_fpath = hg19_seq_fpath if is_hg19 else hg38_seq_fpath
    canonical_transcripts_fpath = \
        canonical_hg19_transcripts_fpath if is_hg19 else canonical_hg38_transcripts_fpath

    # Chromosomes are ordered the same way as in the reference sequence.
    chr_lengths = get_chr_lengths_from_seq(seq_fpath)
    chr_order = {chrom: idx for idx, (chrom, _) in enumerate(chr_lengths)}

    input_fpath = verify_file(sys.argv[2])
    output_fpath = adjust_path(sys.argv[3])

    synonyms_fpath = None
    if len(sys.argv) > 4:
        synonyms_fpath = verify_file(sys.argv[4])
        info('Synonyms file provided ' + synonyms_fpath + '')
    else:
        info('No synonyms file provided, skipping approving')

    not_approved_fpath = None
    if len(sys.argv) > 5:
        not_approved_fpath = adjust_path(sys.argv[5])

    # Transcript IDs are compared without the version suffix (the part after '.').
    with open(verify_file(canonical_transcripts_fpath)) as f:
        canonical_transcripts_ids = set(line.strip().split('.')[0] for line in f)

    info('Reading the features...')
    with open_gzipsafe(input_fpath) as inp:
        inp.readline()  # the first line is consumed before parsing
        # The parser is selected by the format of the INPUT database. The
        # output is a BED file, so testing its extension (as was done before)
        # made the GTF and GFF3 parsers unreachable in normal use.
        if input_fpath.endswith(('.gtf', '.gtf.gz')):
            gene_by_name_and_chrom = _proc_ensembl_gtf(inp, output_fpath, chr_order)
        elif input_fpath.endswith(('.gff3', '.gff3.gz')):
            gene_by_name_and_chrom = _proc_refseq_gff3(inp, output_fpath, chr_order)
        else:
            gene_by_name_and_chrom = _proc_ucsc(inp, output_fpath, chr_order)

    if synonyms_fpath and synonyms_fpath != "''":
        gene_by_name_and_chrom, not_approved_gene_names = _approve(
            gene_by_name_and_chrom, synonyms_fpath)

        info('')
        info('Not approved by HGNC - ' + str(len(not_approved_gene_names)) + ' genes.')
        if not_approved_fpath:
            with open(not_approved_fpath, 'w') as f:
                f.write('#Searched as\tStatus\n')
                f.writelines(name + '\n' for name in not_approved_gene_names)
            info('Saved not approved to ' + not_approved_fpath)

    info('Found:')
    info(' ' + str(len(gene_by_name_and_chrom)) + ' genes')

    genes = gene_by_name_and_chrom.values()

    # Statistics on coding / miRNA genes and transcripts (reporting only).
    coding_and_mirna_genes = [
        g for g in genes
        if any(t.biotype in ['protein_coding', 'miRNA'] for t in g.transcripts)]

    coding_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'protein_coding' for t in g.transcripts)]
    coding_transcripts = [
        t for g in coding_and_mirna_genes for t in g.transcripts
        if t.biotype == 'protein_coding']
    mirna_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'miRNA' for t in g.transcripts)]
    mirna_transcripts = [
        t for g in coding_and_mirna_genes for t in g.transcripts
        if t.biotype == 'miRNA']
    codingmiRNA_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'miRNA' for t in g.transcripts)
        and any(t.biotype == 'protein_coding' for t in g.transcripts)]

    info(' ' + str(len(coding_genes)) + ' coding genes')
    info(' ' + str(len(coding_transcripts)) + ' coding transcripts')
    info(' ' + str(len(mirna_genes)) + ' miRNA genes')
    info(' ' + str(len(mirna_transcripts)) + ' miRNA transcripts')
    info(' ' + str(len(codingmiRNA_genes)) + ' genes with both coding and miRNA transcripts')

    info()
    info('Choosing canonical...')
    canon_genes = choose_canonical(genes, canonical_transcripts_ids)

    info()
    info('Sorting and printing all regions...')
    print_genes(genes, output_fpath, canon_only=False)

    info()
    info('Sorting and printing canonical regions...')
    canon_output_fpath = add_suffix(output_fpath, 'canon')
    print_genes(canon_genes, canon_output_fpath, canon_only=True)

    info()
    info('Saved all regions to\n ' + output_fpath + '\n ' + canon_output_fpath)
def _postprocess(input_fpath, annotated_fpaths, bed_params, output_bed_fpath, cnf, chrom_order):
    '''
    Combine the input regions with their annotations and write the final BED file.

    1. Sorts.
    2. Chooses appropriate number of columns (4 or 8 for BEDs with primers).
    3. Removes duplicates.

    Parameters:
      input_fpath       - tab-separated BED file with the regions to output
      annotated_fpaths  - tab-separated BED files with the same regions, where
                          the 4th column is a gene symbol or '.'
      bed_params        - object providing GRCh_names, n_cols_needed, header
                          and controls
      output_bed_fpath  - path of the BED file to write
      cnf               - configuration; key_genes, hgnc, output_grch,
                          output_hg and debug are read here
      chrom_order       - mapping from chromosome name to its sort order

    Raises AssertionError when the sorted annotated regions do not line up
    with the sorted input regions.
    '''
    info('postprocessing (sorting, cutting, removing duplicates)')

    # One gene name per line.
    key_genes = []
    with open(adjust_path(cnf.key_genes), 'r') as f:
        for line in f:
            key_genes.append(line.strip())
    # The approved symbol is taken from the first tab-separated column.
    approved_genes = []
    if cnf.hgnc:
        with open(adjust_path(cnf.hgnc), 'r') as f:
            f.readline()  # header
            for line in f:
                approved_genes.append(line.split('\t')[0])

    # Output settings are assigned on Region itself, not on single instances.
    # An explicit --output-grch / --output-hg overrides the style of the input;
    # when both are set, output_hg is applied last.
    Region.GRCh_names = bed_params.GRCh_names
    if cnf.output_grch:
        Region.GRCh_names = True
        if cnf.debug and not bed_params.GRCh_names:
            info('Changing chromosome names from hg-style to GRCh-style.')
    if cnf.output_hg:
        Region.GRCh_names = False
        if cnf.debug and bed_params.GRCh_names:
            info('Changing chromosome names from GRCh-style to hg-style.')
    Region.n_cols_needed = bed_params.n_cols_needed
    Region.key_genes = key_genes
    Region.approved_genes = approved_genes

    input_regions = set()  # we want only unique regions
    with open(adjust_path(input_fpath)) as f:
        for line in f:
            entries = line.strip().split('\t')
            chrom = entries[0]
            start = int(entries[1])
            end = int(entries[2])
            r = Region(chrom, chrom_order.get(chrom), start, end)
            # Regions without a name column get "chrom:start-end" as the symbol.
            r.set_symbol(entries[3] if len(entries) > 3 else '{0}:{1}-{2}'.format(chrom, start, end))
            r.rest = entries[4:] if len(entries) > 4 else None
            input_regions.add(r)

    # Unlike the input, annotated regions are kept with repetitions: one region
    # may appear several times with different symbols.
    annotated_regions = []
    for annotated_fpath in annotated_fpaths:
        with open(adjust_path(annotated_fpath)) as f:
            for line in f:
                entries = line.strip().split('\t')
                chrom = entries[0]
                start = int(entries[1])
                end = int(entries[2])
                r = Region(chrom, chrom_order.get(chrom), start, end)
                r.set_symbol(entries[3] if len(entries) > 3 else '{0}:{1}-{2}'.format(chrom, start, end))
                r.rest = entries[4:] if len(entries) > 4 else None
                annotated_regions.append(r)

    # starting to output result
    with open(adjust_path(output_bed_fpath), 'w') as f:
        for line in bed_params.header:
            f.write(line)

        # First pass: the sorted input regions and the sorted annotated regions
        # are walked in lockstep; i is the current position in annotated_regions.
        annotated_regions.sort()
        i = 0
        prev_region = None
        not_a_gene_count = 0
        # Boundaries of runs of consecutive unambiguous regions: the first
        # region of a run, and the last one before an ambiguous region.
        solid_regions = []
        prev_is_solid = False
        # Either a single Region or a list of candidate Regions (ambiguous case).
        all_regions = []
        for cur_region in sorted(list(input_regions) + bed_params.controls):
            if not cur_region.is_control():
                assert annotated_regions[i] == cur_region, str(cur_region) + ' != ' + str(annotated_regions[i]) + '(i=%d)' % i
                if annotated_regions[i].symbol != '.':
                    cur_region.set_symbol(annotated_regions[i].symbol)
                else:
                    # Consecutive unannotated regions on the same chromosome
                    # share one "not_a_gene_N" number.
                    if prev_region is None or \
                            prev_region.chrom != cur_region.chrom or not prev_region.symbol.startswith("not_a_gene"):
                        not_a_gene_count += 1
                    cur_region.set_symbol("not_a_gene_%d" % not_a_gene_count)
                i += 1
                ambiguous_regions = [cur_region]
                while i < len(annotated_regions) and annotated_regions[i] == cur_region:  # processing duplicates
                    if annotated_regions[i].symbol != '.' and annotated_regions[i].symbol != cur_region.symbol:
                        duplicate = copy.deepcopy(cur_region)
                        # NOTE(review): set_symbol presumably also updates .type - confirm in Region
                        duplicate.set_symbol(annotated_regions[i].symbol)
                        if duplicate.type == 'approved' and cur_region.type == 'not_approved':
                            # an approved symbol replaces a not approved one
                            cur_region = duplicate
                            ambiguous_regions = [cur_region]
                        elif annotated_regions[i].type == 'key' and cur_region.type != 'key':
                            # a key gene replaces any non-key symbol
                            cur_region = duplicate
                            ambiguous_regions = [cur_region]
                            if cnf.debug:
                                info('key gene priority over approved gene was used')
                        elif annotated_regions[i].type == cur_region.type:
                            # same priority: keep both candidates for the second pass
                            ambiguous_regions.append(duplicate)
                    i += 1
                if len(ambiguous_regions) == 1:
                    if not prev_is_solid:
                        solid_regions.append(cur_region)
                    prev_is_solid = True
                    all_regions.append(cur_region)
                else:
                    if prev_is_solid:
                        solid_regions.append(prev_region)
                    prev_is_solid = False
                    all_regions.append(ambiguous_regions)
            else:
                # control regions are passed through without annotation
                all_regions.append(cur_region)
            prev_region = cur_region

        # outputting results
        # Second pass: for each ambiguous entry, prefer the candidate whose
        # symbol matches the previous solid region on the same chromosome,
        # then the next one; otherwise fall back to the first candidate.
        cur_solid_id = -1
        for entry in all_regions:
            if isinstance(entry, list):  # list of ambiguous regions
                cur_region = entry[0]
                # advance to the last solid region sorting before this entry
                while cur_solid_id + 1 < len(solid_regions) and cur_region > solid_regions[cur_solid_id + 1]:
                    cur_solid_id += 1
                found = False
                if cur_solid_id >= 0 and cur_region > solid_regions[cur_solid_id] \
                        and cur_region.chrom == solid_regions[cur_solid_id].chrom:
                    prev_solid = solid_regions[cur_solid_id]
                    # the loop variable is left bound to the matching candidate on break
                    for cur_region in entry:
                        if cur_region.symbol == prev_solid.symbol:
                            found = True
                            if cnf.debug:
                                info('gene name was chosen based on previous solid region')
                            break
                if not found and cur_solid_id + 1 < len(solid_regions) and cur_region < solid_regions[cur_solid_id + 1] \
                        and cur_region.chrom == solid_regions[cur_solid_id + 1].chrom:
                    next_solid = solid_regions[cur_solid_id + 1]
                    for cur_region in entry:
                        if cur_region.symbol == next_solid.symbol:
                            found = True
                            if cnf.debug:
                                info('gene name was chosen based on next solid region')
                            break
                if not found:
                    cur_region = entry[0]
            else:
                cur_region = entry
            f.write(str(cur_region) + '\n')  # automatically outputs correct number of columns and GRCh/hg names
def proc_opts():
    """Parse the command line of the fastq pre-processing script and set up the run.

    Positional arguments are fastq files; those that fail verification are
    dropped. The function builds the configuration, prepares the work and
    log directories, sets up logging and checks the genome and system
    resources.

    :return: tuple ``(cnf, output_dir, fastq_fpaths)``
    """
    parser = OptionParser()
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option(
        '--expose-only', dest='expose_to_ngs_server_only',
        action='store_true', default=False,
        help='Only add project to the webserver')
    parser.add_option(
        '--no-expose', dest='expose',
        action='store_false', default=True,
        help='Do not expose the reports')
    parser.add_option('-o', dest='output_dir')
    parser.add_option(
        '--bed', dest='bed',
        help='BED file to run targetSeq and Seq2C analysis on.')
    parser.add_option('--downsample-to', dest='downsample_to', type='int')

    opts, positional = parser.parse_args()
    logger.is_debug = opts.debug

    if len(positional) < 1:
        critical('Usage: ' + __file__ + ' *.fq.gz -o output_dir')

    # Keep only the files that passed verification.
    fastq_fpaths = [
        checked for checked in (verify_file(raw) for raw in positional) if checked]
    info(str(len(fastq_fpaths)) + ' fastq files')

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), determine_run_cnf(opts))

    cnf.output_dir = adjust_path(cnf.output_dir)
    info('Writing to ' + str(cnf.output_dir))

    cnf.project_name = cnf.project_name or 'preproc'

    if not cnf.work_dir:
        work_root = join(cnf.output_dir, 'work')
        safe_mkdir(work_root)
        latest_link = join(work_root, 'latest')

        if cnf.reuse_intermediate:
            cnf.work_dir = latest_link
        else:
            timestamp = datetime.datetime.now().strftime("%Y-%b-%d_%H-%M")
            cnf.work_dir = join(work_root, timestamp)
            # Re-point "latest" to the new timestamped work directory.
            if islink(latest_link):
                os.remove(latest_link)
            if isdir(latest_link):
                shutil.rmtree(latest_link)
            if not exists(latest_link):
                os.symlink(basename(cnf.work_dir), latest_link)
    else:
        # An explicitly provided work directory switches the debug flag on.
        cnf.debug = True

    cnf.work_dir = adjust_path(cnf.work_dir)
    safe_mkdir(cnf.work_dir)
    cnf.log_dir = join(cnf.work_dir, 'log')
    safe_mkdir(cnf.log_dir)
    set_up_log(cnf)

    # Best effort: make the work directory group-writable.
    try:
        subprocess.call(['chmod', '-R', 'g+w', cnf.work_dir])
    except OSError:
        err(traceback.format_exc())

    if cnf.samplesheet:
        cnf.samplesheet = verify_file(cnf.samplesheet, is_critical=True)

    info(' '.join(sys.argv))
    info()
    info('Created a temporary working directory: ' + cnf.work_dir)
    if cnf.project_name:
        info('Project name: ' + cnf.project_name)
    if cnf.samplesheet:
        info('Using custom sample sheet ' + cnf.samplesheet)

    check_genome_resources(cnf)
    check_system_resources(cnf, optional=['fastq'])

    return cnf, cnf.output_dir, fastq_fpaths