def runCallerWorkflow(self, sample):
    """ Builds the SomaticSeq covariate table for a sample and, depending
        on `self.mode`, trains a classifier or runs the predictor on it.

        Parameters
        ----------
        sample: dict
            Must contain the 'PatientID' key.

        Returns
        -------
        The table returned by `_generateCovariateTable`. It is also stored
        as `self.trained_snp_table`.

        Raises
        ------
        ValueError
            If `self.mode` is not one of 'trainer', 'prediction', 'table'.
    """
    # Reject an unsupported mode before any work is done, rather than after
    # the complete merge/filter/table workflow has already run.
    supported_modes = ('trainer', 'prediction', 'table')
    if self.mode not in supported_modes:
        message = "'{}' is not a supported mode for SomaticSeq! ('trainer', 'prediction', 'table')".format(
            self.mode)
        raise ValueError(message)

    patientId = sample['PatientID']
    callset = self._getRawCallset()

    # Each stage of the workflow writes into its own numbered subfolder.
    vjsd_output_folder = os.path.join(self.output_folder, "0_fixed_variants")
    merged_output_folder = os.path.join(self.output_folder, "1_merged_variants")
    filtered_output_folder = os.path.join(self.output_folder, "2_filtered_variants")
    table_output_folder = os.path.join(self.output_folder, "3_tables")

    stage_folders = (
        vjsd_output_folder,
        merged_output_folder,
        filtered_output_folder,
        table_output_folder,
    )
    for stage_folder in stage_folders:
        # NOTE(review): checkDir presumably creates the folder when it is
        # missing -- confirm against filetools.
        filetools.checkDir(stage_folder)

    processed_callset = self._processVJSDFiles(
        callset,
        vjsd_output_folder,
        patientId)

    merged_raw_variant_file = self._mergeVariantFiles(
        processed_callset,
        merged_output_folder,
        patientId)

    filtered_raw_variant_file = self._filterVariantTargets(
        merged_raw_variant_file,
        filtered_output_folder,
        patientId)

    self.trained_snp_table = self._generateCovariateTable(
        sample,
        callset,
        filtered_raw_variant_file,
        table_output_folder)

    if self.mode == 'trainer':
        self.classifier = self.buildTrainer(self.trained_snp_table)
    elif self.mode == 'prediction':
        self.classifier = None
        prediction_table = self.runPredictor(self.trained_snp_table)
        # The return value is not used here; the call is kept because the
        # original workflow performed this conversion step.
        self._convertToVcf(sample, prediction_table)
    # In 'table' mode the covariate table itself is the final result.

    return self.trained_snp_table
def __init__(self, sample, truthset_options, callset_type, truthset_type, **kwargs):
    """ Configures the truthset generator and immediately builds the
        truthset by calling `runWorkflow`.

        Parameters
        ----------
        sample: dict or str
            Either the patient id itself, or a mapping with a
            'PatientID' key.
        truthset_options: Settings
            Provides the 'Programs' and 'Reference Files' sections and
            the `getPipelineFolder` method.
        callset_type: {'snp', 'indel'}
        truthset_type: {'rna', 'intersection'}
        **kwargs
            'indel_intersection' (default 2) and 'snp_intersection'
            (default 5) are read here. All keyword arguments are also
            forwarded to `runWorkflow`.
    """
    ########################### Define Common Attributes ##################
    self.debug = False

    # A plain string is taken to be the patient id itself.
    patientId = sample if isinstance(sample, str) else sample['PatientID']

    # Parameters to use when generating the truthsets.
    self.truthset_type = truthset_type
    self.indel_intersection = kwargs.get('indel_intersection', 2)
    self.snp_intersection = kwargs.get('snp_intersection', 5)
    self.min_tumor_vaf = 0.08
    self.max_normal_vaf = 0.03

    programs = truthset_options['Programs']
    self.gatk_program = programs['GATK']
    self.picard_program = programs['picard']

    reference_files = truthset_options['Reference Files']
    self.reference = reference_files['reference genome']

    # Per-patient working folder, nested under the truthset pipeline folder.
    truthset_folder = truthset_options.getPipelineFolder('truthset')
    self.output_folder = truthset_folder
    self.temp_folder = os.path.join(truthset_folder, 'callsets', patientId)
    filetools.checkDir(self.temp_folder, True)

    ######################## Generate the Truthset ########################
    self.filename = self.runWorkflow(
        patientId=patientId,
        callset_type=callset_type,
        truthset_type=truthset_type,
        truthset_options=truthset_options,
        **kwargs)
def getPipelineFolder(self, step, patientId=None, caller_name=None):
    """ Returns the folder associated with a step of the pipeline.

        Parameters
        ----------
        step: string
            Name of the pipeline step (e.g. 'callset', 'truthset',
            'somaticseq-table').
        patientId: string; default None
            Used as a subfolder name by the patient-specific steps.
        caller_name: string; default None
            Used as a subfolder name by the caller-specific steps.

        Returns
        -------
        string
            For 'reference', a fixed path that is returned unchanged.
            For every other step, a folder located under
            `self.base_pipeline_folder`, passed through
            `filetools.checkDir` before it is returned.

        Raises
        ------
        ValueError
            If `step` is not a recognised pipeline step.
    """
    # The reference folder is a fixed location; it is returned as-is
    # without being joined to the base folder or checked.
    if step == 'reference':
        return "/home/upmc/Documents/Reference/"

    # Subfolder layout (relative to the base pipeline folder) of each step.
    layout = (
        ('callset', ["3_called_variants", patientId]),
        ('variants-somatic', ["3_called_variants", patientId, caller_name]),
        ('variants-copynumber', ["4_called_cnvs", patientId, caller_name]),
        ('temporary', ['5_temporary_files', patientId]),
        ('bam-files', []),
        ('variants-rna', ['7_rna_variants', patientId]),
        ('truthset', ['truthset']),
        ('somaticseq', ['somaticseq']),
        ('somaticseq-callset', ['somaticseq', 'callsets', patientId]),
        ('somaticseq-training', ['somaticseq', 'training']),
        ('somaticseq-prediction', ['somaticseq', 'prediction']),
        ('somaticseq-table', ['somaticseq', 'tables']),
    )
    subfolders = next(
        (parts for step_name, parts in layout if step == step_name),
        None)
    if subfolders is None:
        raise ValueError(
            "'{}' is not a valid step in the pipeline!".format(step))

    pipeline_folder = os.path.join(self.base_pipeline_folder, *subfolders)
    filetools.checkDir(pipeline_folder, True)
    return pipeline_folder
def _overwriteExistingFiles(self):
    """ Deletes `self.output_folder` together with everything in it, then
        passes the folder to `filetools.checkDir`.

        A folder that does not exist yet is treated as "nothing to
        delete" instead of raising an error.
    """
    try:
        shutil.rmtree(self.output_folder)
    except FileNotFoundError:
        # Nothing has been written yet, so there is nothing to remove.
        pass
    # NOTE(review): checkDir presumably recreates the now-missing folder --
    # confirm against filetools.
    filetools.checkDir(self.output_folder)
def __init__(self, sample, options):
    """ Configures the caller from `sample` and `options`, then runs the
        complete workflow: readme creation, the caller itself, renaming
        of the output files and verification of the output.

        Parameters
        ----------
        sample: dict
            Reads the 'ExomeTargets', 'PatientID', 'SampleID' and
            'NormalID' keys.
        options: Settings
            Reads the 'Reference Files', 'Programs', 'Parameters' and
            'globals' sections. It is also indexed with tuples and
            queried through `getPipelineFile`.
            NOTE(review): the tuple lookups presumably resolve to
            pipeline folders -- confirm against the options class.
    """
    ##### Define commonly-used variables
    self.caller_name = self.__class__.__name__
    self.sample_log = options.getPipelineFile('sample log')
    self.targets = sample['ExomeTargets']
    self.reference = options['Reference Files']['reference genome']
    self.dbSNP = options['Reference Files']['dbSNP']
    self.cosmic = options['Reference Files']['COSMIC']
    self.verbose_level = options['globals']['verbose']

    # The program entry is looked up by the lower-cased class name and is
    # None when no such entry exists.
    self.program = options['Programs'].get(self.caller_name.lower())
    self.gatk_program = options['Programs']['GATK']

    self.max_cpu_threads = options['Parameters']['MAX_CORES']
    self.max_memory_usage = options['Parameters']['JAVA_MAX_MEMORY_USAGE']
    self.min_base_quality = options['Parameters']['MIN_NUCLEOTIDE_QUALITY']
    self.min_mapping_quality = options['Parameters']['MIN_MAPPING_QUALITY']
    self.min_somatic_quality = options['Parameters']['SOMATIC_QUALITY']
    self.min_coverage = options['Parameters']['MIN_COVERAGE']

    ##### Define the paths and common partial filenames
    self.output_folder = options['variants-somatic', sample['PatientID'], self.caller_name]
    self.base_prefix = "{normal}_vs_{tumor}.{prefix}".format(
        tumor=sample['SampleID'],
        normal=sample['NormalID'],
        prefix=self.caller_name.lower())
    self.abs_prefix = os.path.join(self.output_folder, self.base_prefix)
    self.temp_folder = options['temporary', sample['PatientID']]

    self.temp_files = list()
    self.full_output = []

    self.setCustomEnvironment(sample, options)

    # When overwriting, remove the previous output *before* the folders
    # are prepared. Removing it afterwards would leave the output folder
    # missing even though the readme and console log live inside it.
    if options['globals']['overwrite'] and os.path.exists(
            self.output_folder):
        shutil.rmtree(self.output_folder)

    filetools.checkDir(self.output_folder, True)
    filetools.checkDir(self.temp_folder, True)

    self.console_file = os.path.join(self.output_folder,
                                     self.caller_name + ".console_log.txt")
    self.readme_filename = os.path.join(self.output_folder,
                                        self.caller_name + ".readme.txt")

    print("\tRunning ", self.caller_name)
    # Extra debug output, intentionally disabled.
    if False:
        print("\tprogram location: ", self.program)
        print("\toutput folder: ", self.output_folder)
        print("\ttemp folder: ", self.temp_folder)
        print("\tcaller output prefix: ", self.base_prefix)

    self.createReadmeFile(sample)
    self.runCallerWorkflow(sample)
    self.renameOutputFiles()
    self.verifyOutputFiles(self.full_output)