Exemplo n.º 1
0
    def runCallerWorkflow(self, sample):
        """ Runs the staged variant workflow for one sample and returns the
            covariate table.

            The raw callset is passed through four stages, each writing into
            its own numbered subfolder of self.output_folder:
            "0_fixed_variants", "1_merged_variants", "2_filtered_variants"
            and "3_tables". What happens after the table is built depends on
            self.mode.

            Parameters
            ----------
                sample: dict
                    Must provide the 'PatientID' key. The whole mapping is
                    also forwarded to the table and vcf helpers.

            Returns
            -------
                The value stored in self.trained_snp_table.

            Raises
            ------
                ValueError
                    If self.mode is not 'table', 'trainer' or 'prediction'.
                    The check happens only after all four stages have run.
        """
        patient_id = sample['PatientID']
        raw_callset = self._getRawCallset()

        # Build every stage path first, then hand each one to checkDir, in
        # stage order.
        stage_names = ("0_fixed_variants", "1_merged_variants",
                       "2_filtered_variants", "3_tables")
        stage_folders = [
            os.path.join(self.output_folder, stage_name)
            for stage_name in stage_names
        ]
        for stage_folder in stage_folders:
            filetools.checkDir(stage_folder)
        fixed_folder, merged_folder, filtered_folder, table_folder = stage_folders

        # Stages 0-2: each one consumes the result of the previous stage.
        fixed_callset = self._processVJSDFiles(raw_callset, fixed_folder,
                                               patient_id)
        merged_variants = self._mergeVariantFiles(fixed_callset,
                                                  merged_folder, patient_id)
        filtered_variants = self._filterVariantTargets(
            merged_variants, filtered_folder, patient_id)

        # Stage 3: note that the table is built from the *raw* callset plus
        # the filtered variant file.
        self.trained_snp_table = self._generateCovariateTable(
            sample, raw_callset, filtered_variants, table_folder)

        mode = self.mode
        if mode == 'trainer':
            self.classifier = self.buildTrainer(self.trained_snp_table)
        elif mode == 'prediction':
            self.classifier = None
            predictions = self.runPredictor(self.trained_snp_table)
            # The converted vcf is not used here; the call is made and its
            # result discarded.
            self._convertToVcf(sample, predictions)
        elif mode != 'table':
            # In 'table' mode the covariate table is already the final
            # result; any other value is unsupported.
            raise ValueError(
                "'{}' is not a supported mode for SomaticSeq! ('trainer', 'prediction', 'table')".format(
                    mode))

        return self.trained_snp_table
Exemplo n.º 2
0
    def __init__(self, sample, truthset_options, callset_type, truthset_type,
                 **kwargs):
        """ Stores the truthset settings and immediately runs the workflow.

            Parameters
            ----------
                sample: dict or str
                    Either the patient id itself (str) or a mapping that
                    provides it under the 'PatientID' key.
                truthset_options: Settings
                    Read for the 'Programs' and 'Reference Files' sections
                    and for getPipelineFolder('truthset').
                callset_type: {'snp', 'indel'}
                    Forwarded to runWorkflow.
                truthset_type: {'rna', 'intersection'}
                    Stored on the instance and forwarded to runWorkflow.
                **kwargs
                    'indel_intersection' (default 2) and 'snp_intersection'
                    (default 5) are read here; every keyword is also
                    forwarded to runWorkflow.
        """
        self.debug = False

        # A bare string is taken to be the patient id itself.
        patient_id = sample if isinstance(sample, str) else sample['PatientID']
        self.truthset_type = truthset_type

        # Thresholds used when generating the truthsets.
        self.indel_intersection = kwargs.get('indel_intersection', 2)
        self.snp_intersection = kwargs.get('snp_intersection', 5)
        self.min_tumor_vaf = 0.08
        self.max_normal_vaf = 0.03

        # External programs and reference data.
        programs = truthset_options['Programs']
        self.gatk_program = programs['GATK']
        self.picard_program = programs['picard']
        self.reference = truthset_options['Reference Files']['reference genome']

        # Working folders: <truthset folder>/callsets/<patient id>.
        self.output_folder = truthset_options.getPipelineFolder('truthset')
        self.temp_folder = os.path.join(self.output_folder, 'callsets',
                                        patient_id)
        filetools.checkDir(self.temp_folder, True)

        # Generate the truthset; the workflow's return value is kept as the
        # filename.
        self.filename = self.runWorkflow(
            patientId=patient_id,
            callset_type=callset_type,
            truthset_type=truthset_type,
            truthset_options=truthset_options,
            **kwargs)
Exemplo n.º 3
0
    def getPipelineFolder(self, step, patientId=None, caller_name=None):
        """ Returns the folder used by the given pipeline step.

            Parameters
            ----------
                step: str
                    Name of the pipeline step (see the table below).
                patientId: str, optional
                    Path component used by the per-patient steps.
                caller_name: str, optional
                    Path component used by the per-caller steps.

            Returns
            -------
                str
                    For 'reference', a fixed absolute path. For every other
                    step, the step's subfolders joined onto
                    self.base_pipeline_folder, after the result has been
                    passed to filetools.checkDir(folder, True).

            Raises
            ------
                ValueError
                    If `step` is not one of the known step names.
        """
        # The reference location is fixed: it is not joined onto the base
        # pipeline folder and is not passed to checkDir.
        if step == 'reference':
            return "/home/upmc/Documents/Reference/"

        layout = {
            'callset': ["3_called_variants", patientId],
            'variants-somatic': ["3_called_variants", patientId, caller_name],
            'variants-copynumber': ["4_called_cnvs", patientId, caller_name],
            'temporary': ['5_temporary_files', patientId],
            'bam-files': [],
            'variants-rna': ['7_rna_variants', patientId],
            'truthset': ['truthset'],
            'somaticseq': ['somaticseq'],
            'somaticseq-callset': ['somaticseq', 'callsets', patientId],
            'somaticseq-training': ['somaticseq', 'training'],
            'somaticseq-prediction': ['somaticseq', 'prediction'],
            'somaticseq-table': ['somaticseq', 'tables'],
        }

        try:
            subfolders = layout[step]
        except (KeyError, TypeError):
            # TypeError covers an unhashable `step`, which must be reported
            # as an invalid step just like an unknown name.
            subfolders = None

        # Compare against None: 'bam-files' maps to an empty (falsy) list.
        if subfolders is None:
            message = "'{}' is not a valid step in the pipeline!".format(step)
            raise ValueError(message)

        pipeline_folder = os.path.join(self.base_pipeline_folder, *subfolders)
        filetools.checkDir(pipeline_folder, True)

        return pipeline_folder
Exemplo n.º 4
0
    def _overwriteExistingFiles(self):
        """ Deletes self.output_folder and everything in it, then passes the
            folder to filetools.checkDir.

            A missing output folder is not an error: there is simply nothing
            to delete.
        """
        try:
            shutil.rmtree(self.output_folder)
        except FileNotFoundError:
            # Nothing has been written yet, so there is nothing to clear.
            pass

        # NOTE(review): presumably this recreates the (now missing) folder;
        # confirm against filetools.checkDir.
        filetools.checkDir(self.output_folder)
Exemplo n.º 5
0
    def __init__(self, sample, options):
        """ Configures the caller for one sample and immediately runs it.

            Construction runs the whole workflow: the readme is created, the
            caller is executed, and the output files are renamed and verified
            before __init__ returns.

            Parameters
            ----------
                sample: dict
                    Read for the keys 'PatientID', 'SampleID', 'NormalID'
                    and 'ExomeTargets'. Also forwarded to
                    setCustomEnvironment, createReadmeFile and
                    runCallerWorkflow.
                options:
                    Project options object. Read for the 'Reference Files',
                    'Programs', 'Parameters' and 'globals' sections, for
                    getPipelineFile('sample log'), and indexed with tuples
                    to obtain the output and temporary folders.
        """
        ##### Define commonly-used variables
        self.caller_name = self.__class__.__name__
        self.sample_log = options.getPipelineFile('sample log')
        self.targets = sample['ExomeTargets']
        self.reference = options['Reference Files']['reference genome']
        self.dbSNP = options['Reference Files']['dbSNP']
        self.cosmic = options['Reference Files']['COSMIC']

        self.verbose_level = options['globals']['verbose']

        # .get(): a caller without an entry under 'Programs' gets None here.
        self.program = options['Programs'].get(self.caller_name.lower())
        self.gatk_program = options['Programs']['GATK']
        self.max_cpu_threads = options['Parameters']['MAX_CORES']
        self.max_memory_usage = options['Parameters']['JAVA_MAX_MEMORY_USAGE']
        self.min_base_quality = options['Parameters']['MIN_NUCLEOTIDE_QUALITY']
        self.min_mapping_quality = options['Parameters']['MIN_MAPPING_QUALITY']
        self.min_somatic_quality = options['Parameters']['SOMATIC_QUALITY']
        self.min_coverage = options['Parameters']['MIN_COVERAGE']

        ##### Define the paths and common partial filenames
        self.output_folder = options['variants-somatic', sample['PatientID'],
                                     self.caller_name]

        # Output files are named "<normal>_vs_<tumor>.<caller>".
        self.base_prefix = "{normal}_vs_{tumor}.{prefix}".format(
            tumor=sample['SampleID'],
            normal=sample['NormalID'],
            prefix=self.caller_name.lower())
        self.abs_prefix = os.path.join(self.output_folder, self.base_prefix)

        self.temp_folder = options['temporary', sample['PatientID']]
        self.temp_files = []
        self.full_output = []

        self.setCustomEnvironment(sample, options)

        # Clear any previous results BEFORE the folders are (re)created.
        # Deleting the output folder after checkDir, as was done previously,
        # left it missing for the readme and the workflow that follow.
        if options['globals']['overwrite'] and os.path.exists(
                self.output_folder):
            shutil.rmtree(self.output_folder)

        # NOTE(review): checkDir(path, True) is presumed to create the folder
        # when it is missing; confirm against filetools.
        filetools.checkDir(self.output_folder, True)
        filetools.checkDir(self.temp_folder, True)

        self.console_file = os.path.join(self.output_folder,
                                         self.caller_name + ".console_log.txt")
        self.readme_filename = os.path.join(self.output_folder,
                                            self.caller_name + ".readme.txt")

        print("\tRunning ", self.caller_name)

        ##### Run the caller
        self.createReadmeFile(sample)

        self.runCallerWorkflow(sample)

        self.renameOutputFiles()

        self.verifyOutputFiles(self.full_output)