def prepare_genes(self):
    """Queue the jobs that fetch and post-process the Ensembl gene annotation.

    Jobs are added in this order: a ``Curl`` download of
    ``self.ensembl_gtf_remote`` into ``<outdir>/genes/``, a ``Gunzip`` of the
    download, a ``FilterGTFChromosomes`` pass, a ``GTF2GenePred`` conversion of
    the filtered GTF and a ``FilterGTFGenes`` pass over the filtered GTF.
    The resulting paths, together with ``self.ensembl_version``, are stored
    in ``self.reference_data``.
    """
    remote_gtf = self.ensembl_gtf_remote

    # Download the compressed GTF, keeping the remote file name.
    fetch_gtf = Curl()
    fetch_gtf.remote = remote_gtf
    fetch_gtf.output = "{}/genes/{}".format(
        self.outdir, os.path.basename(remote_gtf))
    fetch_gtf.jobname = "curl-ensembl-gtf"
    fetch_gtf.is_intermediate = True
    self.add(fetch_gtf)

    # Decompress: same path without the ".gz" suffix.
    unpack_gtf = Gunzip()
    unpack_gtf.input = fetch_gtf.output
    unpack_gtf.output = stripsuffix(fetch_gtf.output, ".gz")
    unpack_gtf.is_intermediate = True
    self.add(unpack_gtf)

    # Chromosome filter: "<name>.gtf" -> "<name>.filtered.gtf".
    chrom_filter = FilterGTFChromosomes()
    chrom_filter.input = unpack_gtf.output
    chrom_filter.output = "".join(
        (stripsuffix(unpack_gtf.output, ".gtf"), ".filtered.gtf"))
    self.add(chrom_filter)

    # genePred conversion of the filtered GTF.
    to_genepred = GTF2GenePred()
    to_genepred.input = chrom_filter.output
    to_genepred.output = "".join(
        (stripsuffix(chrom_filter.output, ".gtf"), ".genepred"))
    self.add(to_genepred)

    # Genes-only variant, also derived from the filtered GTF.
    genes_only = FilterGTFGenes()
    genes_only.input = chrom_filter.output
    genes_only.output = "".join(
        (stripsuffix(chrom_filter.output, ".gtf"), ".genes-only.gtf"))
    self.add(genes_only)

    # Register results one key at a time so insertion order is unchanged.
    registered = (
        ('ensemblVersion', self.ensembl_version),
        ('genesGtf', chrom_filter.output),
        ('genesGenePred', to_genepred.output),
        ('genesGtfGenesOnly', genes_only.output),
    )
    for key, value in registered:
        self.reference_data[key] = value
def prepare_reference_genome(self):
    """Queue the jobs that unpack and index the reference genome.

    Jobs are added in this order: ``Gunzip`` of
    ``self.input_reference_sequence`` into ``<outdir>/genome/``, a ``Copy`` of
    the unpacked file into ``<outdir>/bwa/``, ``BwaIndex`` on that copy,
    ``PicardCreateSequenceDictionary`` and ``SamtoolsFaidx`` on the unpacked
    file, ``GenerateChrSizes`` from the ``.fai`` output, and a ``Copy`` of
    ``self.qdnaseq_background`` into ``<outdir>/genome/``.
    The resulting paths are stored in ``self.reference_data``.
    """
    genome_unzipped = stripsuffix(
        os.path.basename(self.input_reference_sequence), ".gz")

    gunzip_ref = Gunzip()
    gunzip_ref.input = self.input_reference_sequence
    gunzip_ref.output = "{}/genome/{}".format(self.outdir, genome_unzipped)
    self.add(gunzip_ref)

    # Base path for the ".dict" and ".chrsizes.txt" outputs. Only a trailing
    # ".fasta" is removed: str.replace() would also remove ".fasta" from
    # directory components (e.g. inside self.outdir) or from the middle of
    # the file name, sending the outputs to an unintended location.
    fasta_suffix = ".fasta"
    ref_stem = gunzip_ref.output
    if ref_stem.endswith(fasta_suffix):
        ref_stem = ref_stem[:-len(fasta_suffix)]

    # The BWA index is built on a separate copy under <outdir>/bwa/.
    copy_ref_to_bwa = Copy(
        input_file=gunzip_ref.output,
        output_file="{}/bwa/{}".format(
            self.outdir, os.path.basename(gunzip_ref.output)))
    self.add(copy_ref_to_bwa)

    bwa_index = BwaIndex()
    bwa_index.input_fasta = copy_ref_to_bwa.output
    bwa_index.output = copy_ref_to_bwa.output + ".bwt"
    bwa_index.algorithm = "bwtsw"
    self.add(bwa_index)

    create_dict = PicardCreateSequenceDictionary()
    create_dict.input = gunzip_ref.output
    create_dict.output_dict = ref_stem + ".dict"
    self.add(create_dict)

    samtools_faidx = SamtoolsFaidx()
    samtools_faidx.input_fasta = gunzip_ref.output
    samtools_faidx.output = gunzip_ref.output + ".fai"
    self.add(samtools_faidx)

    create_chrsizes = GenerateChrSizes()
    create_chrsizes.input_fai = samtools_faidx.output
    create_chrsizes.output = ref_stem + ".chrsizes.txt"
    self.add(create_chrsizes)

    copy_qdnaseq_bg = Copy(
        input_file=self.qdnaseq_background,
        output_file="{}/genome/{}".format(
            self.outdir, os.path.basename(self.qdnaseq_background)))
    self.add(copy_qdnaseq_bg)

    self.reference_data['reference_genome'] = gunzip_ref.output
    self.reference_data['reference_dict'] = create_dict.output_dict
    self.reference_data['chrsizes'] = create_chrsizes.output
    # 'bwaIndex' records the FASTA copy the index was built on, not the
    # ".bwt" file itself.
    self.reference_data['bwaIndex'] = bwa_index.input_fasta
    self.reference_data['qdnaseq_background'] = copy_qdnaseq_bg.output
def prepare_intervals(self):
    """Queue the per-capture target-interval jobs and register their outputs.

    A single ``MsiSensorScan`` job over the reference genome is added first.
    Then, for every ``*.interval_list`` file found in
    ``<genome_resources>/target_intervals/``, the file is copied to
    ``<outdir>/intervals/targets/`` and ``SlopIntervalList``,
    ``IntervalListToBed`` and ``IntersectMsiSites`` jobs are chained onto the
    copy. ``self.prepare_msings`` is called for the capture, and the optional
    ``.blacklist.bed`` and ``.purecn.txt`` companions are copied when they
    exist on disk (their entries are ``None`` otherwise). All paths are
    stored under ``self.reference_data['targets'][<capture name>]``.

    Finally ``self.prepare_cnvkit`` is called for every file in the same
    directory that ends in ``.cnn`` or has ``cnvkit-fix`` in its name.
    """
    self.reference_data['targets'] = {}
    target_intervals_dir = "{}/target_intervals/".format(
        self.genome_resources)
    interval_list_names = [
        name for name in os.listdir(target_intervals_dir)
        if name.endswith(".interval_list")
    ]

    # One scan of the reference genome, shared by every capture below.
    msi_scan = MsiSensorScan()
    msi_scan.input_fasta = self.reference_data['reference_genome']
    msi_scan.homopolymers_only = True
    msi_scan.output = "{}/intervals/msisensor-microsatellites.tsv".format(
        self.outdir)
    self.add(msi_scan)

    for interval_list_name in interval_list_names:
        source_path = "{}/target_intervals/{}".format(
            self.genome_resources, interval_list_name)
        logging.debug("Parsing intervals file {}".format(source_path))
        capture_name = stripsuffix(interval_list_name, ".interval_list")
        self.reference_data['targets'][capture_name] = {}

        # Copy the interval list into the output tree.
        stage_targets = Copy(
            input_file=source_path,
            output_file="{}/intervals/targets/{}".format(
                self.outdir, os.path.basename(source_path)))
        self.add(stage_targets)

        slop_targets = SlopIntervalList()
        slop_targets.input = stage_targets.output
        slop_targets.output = "".join((
            stripsuffix(stage_targets.output, ".interval_list"),
            ".slopped20.interval_list"))
        self.add(slop_targets)

        targets_to_bed = IntervalListToBed()
        targets_to_bed.input = slop_targets.output
        targets_to_bed.output = "".join((
            stripsuffix(slop_targets.output, ".interval_list"), ".bed"))
        self.add(targets_to_bed)

        msi_in_targets = IntersectMsiSites()
        msi_in_targets.input_msi_sites = msi_scan.output
        msi_in_targets.target_bed = targets_to_bed.output
        msi_in_targets.output_msi_sites = "".join((
            stripsuffix(targets_to_bed.output, ".bed"), ".msisites.tsv"))
        self.add(msi_in_targets)

        # Companion files sit next to the source interval list and share
        # its path minus the ".interval_list" suffix.
        source_prefix = stripsuffix(source_path, ".interval_list")
        self.prepare_msings(source_prefix, capture_name)

        capture_entry = self.reference_data['targets'][capture_name]

        # Optional blacklist: stays None unless the file exists.
        capture_entry['blacklist-bed'] = None
        blacklist_bed = source_prefix + ".blacklist.bed"
        if os.path.exists(blacklist_bed):
            stage_blacklist = Copy(
                input_file=blacklist_bed,
                output_file="{}/intervals/targets/{}".format(
                    self.outdir,
                    os.path.basename(blacklist_bed),
                ))
            self.add(stage_blacklist)
            capture_entry['blacklist-bed'] = stage_blacklist.output

        # Optional PureCN targets file.
        purecn_targets_file = source_prefix + ".purecn.txt"
        if not os.path.exists(purecn_targets_file):
            capture_entry['purecn_targets'] = None
        else:
            stage_purecn = Copy(
                input_file=purecn_targets_file,
                output_file="{}/intervals/targets/{}".format(
                    self.outdir, os.path.basename(purecn_targets_file)))
            self.add(stage_purecn)
            capture_entry['purecn_targets'] = stage_purecn.output

        # Register the remaining outputs, one key at a time, in the
        # original insertion order.
        registered = (
            ('targets-interval_list', stage_targets.output),
            ('targets-interval_list-slopped20', slop_targets.output),
            ('targets-bed-slopped20', targets_to_bed.output),
            ('msisites', msi_in_targets.output_msi_sites),
        )
        for key, value in registered:
            capture_entry[key] = value

    # Hand every .cnn / "cnvkit-fix" file in the directory to prepare_cnvkit.
    cnvkit_names = [
        name for name in os.listdir(target_intervals_dir)
        if (name.endswith(".cnn") or "cnvkit-fix" in name)
    ]
    for cnvkit_name in cnvkit_names:
        self.prepare_cnvkit(cnvkit_name)