def _annotate_m2_vcf(self, input_vcf_file, output_tcgamaf_file):
    """Render a VCF as a TCGA MAF with fixed tumor/normal barcodes and ONP inference enabled.

    :param input_vcf_file: path of the VCF file to read
    :param output_tcgamaf_file: path of the TCGA MAF file to write
    """
    # The barcodes have to be supplied by hand for this conversion.
    barcode_overrides = {'tumor_barcode': 'Patient0-Tumor', 'normal_barcode': 'Patient0-Normal'}

    rendering_opts = dict()
    rendering_opts[OptionConstants.COLLAPSE_FILTER_COLS] = True
    rendering_opts[OptionConstants.NO_PREPEND] = True
    rendering_opts[OptionConstants.SPLIT_ALLELIC_DEPTH] = False
    rendering_opts[OptionConstants.INFER_ONPS] = True

    m2_annotator = Annotator()

    # "." is passed as the datasource dir (an empty one) to keep the run fast.
    spec_kwargs = dict(
        datasource_dir=".",
        global_annotations=barcode_overrides,
        is_skip_no_alts=True,
        other_opts=rendering_opts,
    )
    m2_run_spec = RunSpecificationFactory.create_run_spec(
        "VCF", "TCGAMAF", input_vcf_file, output_tcgamaf_file, **spec_kwargs)

    m2_annotator.initialize(m2_run_spec)
    m2_annotator.annotate()
def testAnnotationWithMafliteWithTrailingSpaces(self):
    """Tests the ability to annotate a VCF file that contains trailing spaces in ref and alt alleles.

    Runs a VCF -> VCF annotation using the datasource directory read from
    self.config ('DEFAULT' section, "dbDir"), then checks that four expected
    record prefixes appear in the rendered output file.
    """
    db_dir = self.config.get('DEFAULT', "dbDir")
    inputFilename = os.path.join("testdata", "vcf", "example.trailing_whitespace_in_alleles.vcf")
    outputFilename = os.path.join("out", "example.trailing_whitespace_in_alleles.vcf")

    annotator = Annotator()
    from oncotator.utils.RunSpecification import RunSpecification
    run_spec = RunSpecificationFactory.create_run_spec(
        "VCF", "VCF", inputFilename, outputFilename,
        datasource_dir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_MUTATIONS,
        other_opts={'vcf_out_infer_genotypes': False})
    annotator.initialize(run_spec)
    annotator.annotate()

    # check output
    # Read through a context manager so the handle is closed even when an
    # assertion below fails (the bare open(...).read() left it to the GC).
    with open(outputFilename) as vcf_fp:
        vcf_data = vcf_fp.read()

    expected_record_prefixes = (
        '\n1\t14907\t.\tA\tG\t',
        '\n1\t14930\trs150145850\tA\tG\t',
        '\n1\t14933\trs138566748\tG\tA\t',
        '\n1\t14948\trs148911281\tG\tA\t',
    )
    for record_prefix in expected_record_prefixes:
        self.assertIn(record_prefix, vcf_data)
def test_basic_rendering(self):
    """Test that we can render a basic seg file as a gene list.

    Annotates testdata/seg/Patient0.seg.txt as segments, renders a GENE_LIST
    file and checks the segment/gene/sample columns of every output row.
    """
    inputFilename = "testdata/seg/Patient0.seg.txt"
    output_filename = "out/test_basic_rendering.gene_list.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Remove any output left over from a previous run before annotating.
    if os.path.exists(output_filename):
        os.remove(output_filename)

    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "GENE_LIST", inputFilename, output_filename,
        datasourceDir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(run_spec)
    annotator.annotate()

    # Now check the output
    output_reader = GenericTsvReader(output_filename)
    for line_dict in output_reader:
        # Dedicated assert methods report the offending value on failure,
        # unlike assertTrue(<boolean expression>).
        self.assertIsNotNone(line_dict['segment_start'])
        self.assertNotEqual(line_dict['segment_start'].strip(), "")
        self.assertIsNotNone(line_dict['segment_end'])
        self.assertNotEqual(line_dict['segment_end'].strip(), "")
        self.assertIn("gene", line_dict)
        self.assertGreater(len(line_dict["gene"]), 0)
        # Passes only when the value parses as a float and is non-zero.
        self.assertTrue(float(line_dict["segment_num_probes"]),
                        "segment_num_probes was zero: %s" % line_dict["segment_num_probes"])
        self.assertEqual(line_dict['sample'], "Patient0")
def test_full_seg_file_annotations(self):
    """Test that we can read in a seg file, do a proper full annotation, and output as SIMPLE_TSV.

    Checks that the required header columns are present and that every row
    has non-blank start/end values and a "genes" column.
    """
    inputFilename = "testdata/seg/Patient0.seg.txt"
    output_filename = "out/test_full_seg_file_annotations.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Remove any output left over from a previous run before annotating.
    if os.path.exists(output_filename):
        os.remove(output_filename)

    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "SIMPLE_TSV", inputFilename, output_filename,
        datasourceDir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(run_spec)
    annotator.annotate()

    # Now check the output
    output_reader = GenericTsvReader(output_filename)

    required_cols = ["Sample", "Num_Probes", "Segment_Mean"]
    headers = output_reader.getFieldNames()
    for rcol in required_cols:
        # assertIn names the missing column on failure.
        self.assertIn(rcol, headers)

    for line_dict in output_reader:
        self.assertIsNotNone(line_dict['start'])
        self.assertNotEqual(line_dict['start'].strip(), "")
        self.assertIsNotNone(line_dict['end'])
        self.assertNotEqual(line_dict['end'].strip(), "")
        self.assertIn("genes", line_dict)
        # NOTE(review): str.split(",") never returns an empty list, so this
        # check cannot fail; kept as-is to avoid changing what the test accepts.
        self.assertGreater(len(line_dict["genes"].split(",")), 0)
def _annotateTest(self, inputFilename, outputFilename, datasource_dir, inputFormat="MAFLITE",
                  outputFormat="TCGAMAF", default_annotations=TCGA_MAF_DEFAULTS,
                  override_annotations=None, is_skip_no_alts=False):
    """Build a run specification from the arguments, run the annotator and return its result.

    :param inputFilename: path of the file to annotate
    :param outputFilename: path of the file to render
    :param datasource_dir: datasource directory handed to the run specification
    :param inputFormat: input format name
    :param outputFormat: output format name
    :param default_annotations: passed through as the default annotations
    :param override_annotations: passed through as the global annotations;
        None is replaced by an empty dict
    :param is_skip_no_alts: passed through unchanged to the run specification
    :return: whatever ``Annotator.annotate()`` returns
    """
    self.logger.info("Initializing Annotator...")

    global_overrides = dict() if override_annotations is None else override_annotations

    test_annotator = Annotator()
    spec_kwargs = dict(
        defaultAnnotations=default_annotations,
        datasourceDir=datasource_dir,
        globalAnnotations=global_overrides,
        is_skip_no_alts=is_skip_no_alts,
    )
    test_annotator.initialize(
        RunSpecificationFactory.create_run_spec(
            inputFormat, outputFormat, inputFilename, outputFilename, **spec_kwargs))

    self.logger.info("Annotation starting...")
    return test_annotator.annotate()
def test_full_seg_file_annotations(self):
    """Test that we can read in a seg file, do a proper full annotation, and output as SIMPLE_TSV.

    Verifies the required header columns and, for each rendered row, that
    start/end are non-blank and a "genes" column exists.
    """
    inputFilename = "testdata/seg/Patient0.seg.txt"
    output_filename = "out/test_full_seg_file_annotations.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Delete a leftover output file from an earlier run, if there is one.
    if os.path.exists(output_filename):
        os.remove(output_filename)

    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "SIMPLE_TSV", inputFilename, output_filename,
        datasourceDir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(run_spec)
    annotator.annotate()

    # Now check the output
    output_reader = GenericTsvReader(output_filename)

    headers = output_reader.getFieldNames()
    for rcol in ("Sample", "Num_Probes", "Segment_Mean"):
        # assertIn reports which required column is missing.
        self.assertIn(rcol, headers)

    for line_dict in output_reader:
        for position_col in ('start', 'end'):
            self.assertIsNotNone(line_dict[position_col])
            self.assertNotEqual(line_dict[position_col].strip(), "")
        self.assertIn("genes", line_dict)
        # NOTE(review): splitting a string always yields at least one item,
        # so this assertion is vacuous; left in place to keep the test's
        # accepted outputs unchanged.
        self.assertGreater(len(line_dict["genes"].split(",")), 0)
def test_rendering_combined_to_tsv(self):
    """Run a MAFLITE -> SIMPLE_TSV annotation with ONP inference enabled.

    There are no assertions: the test passes as long as nothing raises.
    """
    maflite_path = os.path.join("testdata", "maflite", "onp_combination.maf.txt")
    tsv_path = os.path.join("out", "onp_combination.tsv")

    onp_opts = {OptionConstants.INFER_ONPS: True}
    onp_run_spec = RunSpecificationFactory.create_run_spec(
        "MAFLITE", "SIMPLE_TSV", maflite_path, tsv_path, other_opts=onp_opts)

    onp_annotator = Annotator()
    onp_annotator.initialize(onp_run_spec)
    onp_annotator.annotate()
def _annotateTest(self, inputFilename, outputFilename, datasource_dir, inputFormat="MAFLITE",
                  outputFormat="TCGAMAF", default_annotations=TCGA_MAF_DEFAULTS,
                  override_annotations=None, is_skip_no_alts=False):
    """Annotate one file with the given formats and annotation settings.

    The arguments are forwarded to ``RunSpecificationFactory.create_run_spec``;
    a missing ``override_annotations`` is treated as an empty dict.

    :return: the value returned by ``Annotator.annotate()``
    """
    log = self.logger
    log.info("Initializing Annotator...")

    if override_annotations is None:
        override_annotations = {}

    worker = Annotator()
    spec = RunSpecificationFactory.create_run_spec(
        inputFormat,
        outputFormat,
        inputFilename,
        outputFilename,
        defaultAnnotations=default_annotations,
        datasourceDir=datasource_dir,
        globalAnnotations=override_annotations,
        is_skip_no_alts=is_skip_no_alts,
    )
    worker.initialize(spec)

    log.info("Annotation starting...")
    result = worker.annotate()
    return result
def test_single_sample_onp_combiner(self):
    """Create an ONP-combined TCGA MAF from a single-sample maflite; passes if nothing raises."""
    maflite_path = 'testdata/maflite/onp.singlesample.maf.txt'
    maf_path = 'out/testSingleSampleOnpCombiner.maf'

    # The datasource dir comes from the unit test configuration.
    db_dir = TestUtils.createUnitTestConfig().get('DEFAULT', "dbDir")

    spec_kwargs = {
        'datasourceDir': db_dir,
        'other_opts': {OptionConstants.INFER_ONPS: True},
    }
    onp_run_spec = RunSpecificationFactory.create_run_spec(
        "MAFLITE", "TCGAMAF", maflite_path, maf_path, **spec_kwargs)

    onp_annotator = Annotator()
    onp_annotator.initialize(onp_run_spec)
    onp_annotator.annotate()
def test_single_sample_onp_combiner(self):
    """Smoke test: MAFLITE -> TCGAMAF with ONP inference on a single-sample input must not crash."""
    source_maflite = 'testdata/maflite/onp.singlesample.maf.txt'
    rendered_maf = 'out/testSingleSampleOnpCombiner.maf'

    unit_test_config = TestUtils.createUnitTestConfig()
    datasource_root = unit_test_config.get('DEFAULT', "dbDir")

    infer_onps = {OptionConstants.INFER_ONPS: True}
    run_spec = RunSpecificationFactory.create_run_spec(
        "MAFLITE",
        "TCGAMAF",
        source_maflite,
        rendered_maf,
        datasource_dir=datasource_root,
        other_opts=infer_onps,
    )

    combiner_annotator = Annotator()
    combiner_annotator.initialize(run_spec)
    combiner_annotator.annotate()
def _annotate_m2_vcf(self, input_vcf_file, output_tcgamaf_file):
    """Annotate a VCF into a TCGA MAF using hard-coded Patient0 barcodes.

    :param input_vcf_file: VCF path to read
    :param output_tcgamaf_file: TCGA MAF path to write
    """
    # For this conversion the barcodes are not derived; they are set here.
    global_overrides = dict(tumor_barcode='Patient0-Tumor', normal_barcode='Patient0-Normal')

    conversion_opts = {
        OptionConstants.COLLAPSE_FILTER_COLS: True,
        OptionConstants.NO_PREPEND: True,
        OptionConstants.SPLIT_ALLELIC_DEPTH: False,
        OptionConstants.INFER_ONPS: True,
    }

    converter = Annotator()
    # An empty datasource dir (".") is used in order to speed this up.
    converter.initialize(
        RunSpecificationFactory.create_run_spec(
            "VCF",
            "TCGAMAF",
            input_vcf_file,
            output_tcgamaf_file,
            datasource_dir=".",
            global_annotations=global_overrides,
            is_skip_no_alts=True,
            other_opts=conversion_opts,
        )
    )
    converter.annotate()
def testAnnotationWithMafliteWithTrailingSpaces(self):
    """Annotate a maflite file whose ref and alt alleles have trailing spaces.

    No assertions are made; the test passes when the MAFLITE -> TCGAMAF run
    completes without raising.
    """
    datasource_root = self.config.get('DEFAULT', "dbDir")

    maflite_path = os.path.join("testdata", "maflite", "example.trailing_whitespace_in_alleles.maflite")
    maf_path = os.path.join("out", "example.trailing_whitespace_in_alleles.maf.txt")

    maflite_annotator = Annotator()
    spec_kwargs = dict(
        datasource_dir=datasource_root,
        annotating_type=RunSpecification.ANNOTATE_MUTATIONS,
    )
    maflite_annotator.initialize(
        RunSpecificationFactory.create_run_spec("MAFLITE", "TCGAMAF", maflite_path, maf_path, **spec_kwargs))
    maflite_annotator.annotate()
def test_proper_conversion_vcf_to_maf_with_collapse_filter_cols(self):
    """Test FILTER col is properly rendered when using the collapse-filter-cols option.

    Renders testdata/vcf/example.vcf as a TCGA MAF and checks that every row
    has an 'i_filter' column whose value is either 'PASS' or 'q10'.
    """
    input_fname = 'testdata/vcf/example.vcf'
    output_fname = 'out/example.one_filter_col.maf.txt'

    annotator = Annotator()
    other_opts = {'collapse_filter_cols': True}

    run_spec = RunSpecificationFactory.create_run_spec(
        'VCF', 'TCGAMAF', input_fname, output_fname, other_opts=other_opts)
    annotator.initialize(run_spec)
    annotator.annotate()

    tsv_reader = GenericTsvReader(output_fname)
    for line_dict in tsv_reader:
        self.assertIn('i_filter', line_dict)
        # assertIn shows the unexpected filter value when the check fails.
        self.assertIn(line_dict['i_filter'], ['PASS', 'q10'])
def test_annotating_uniprot_test_file(self):
    """Test variants with known issues with older version of UniProt datasource.

    This test will fail if using older version of uniprot datasource (pre-2014).
    Every output row must have a UniProt_AApos other than "0"; all rows except
    the one at index 4 must have a SwissProt_entry_Id ending in "HUMAN".
    """
    db_dir = TestUtils.createUnitTestConfig().get('DEFAULT', "dbDir")
    annotator = Annotator()
    out_file_name = "out/uniprot_recovery.maf.annotated"
    runSpec = RunSpecificationFactory.create_run_spec(
        "MAFLITE", "TCGAMAF", "testdata/maflite/uniprot_recovery.maflite", out_file_name,
        datasource_dir=db_dir,
        tx_mode=TranscriptProvider.TX_MODE_BEST_EFFECT)
    annotator.initialize(runSpec)
    annotator.annotate()

    out_file_reader = GenericTsvReader(out_file_name)
    for i, line_dict in enumerate(out_file_reader):
        self.assertNotEqual(line_dict['UniProt_AApos'], "0")

        # TODO: The fourth entry is currently not picking up the uniprot entry for this.
        # Remove the "if" statement once issue #253 is addressed
        # NOTE(review): with zero-based enumerate(), index 4 is the fifth row,
        # not the fourth -- confirm which row the TODO refers to.
        if i != 4:
            entry_id = line_dict['SwissProt_entry_Id']
            self.assertTrue(entry_id.endswith("HUMAN"),
                            "SwissProt_entry_Id does not end with HUMAN at index %d: %s" % (i, entry_id))
def test_proper_conversion_vcf_to_maf_with_collapse_filter_cols(self):
    """Test FILTER col is properly rendered when using the collapse-filter-cols option.

    Converts testdata/vcf/example.vcf to a TCGA MAF with 'collapse_filter_cols'
    enabled and checks each row's 'i_filter' value is 'PASS' or 'q10'.
    """
    input_fname = 'testdata/vcf/example.vcf'
    output_fname = 'out/example.one_filter_col.maf.txt'

    annotator = Annotator()
    other_opts = {'collapse_filter_cols': True}

    # The function-local import of RunSpecification was dropped: nothing in
    # this test referenced the name.
    run_spec = RunSpecificationFactory.create_run_spec(
        'VCF', 'TCGAMAF', input_fname, output_fname, other_opts=other_opts)
    annotator.initialize(run_spec)
    annotator.annotate()

    accepted_filters = ['PASS', 'q10']
    tsv_reader = GenericTsvReader(output_fname)
    for line_dict in tsv_reader:
        self.assertIn('i_filter', line_dict)
        # assertIn reports the unexpected value instead of "False is not true".
        self.assertIn(line_dict['i_filter'], accepted_filters)
def test_onp_combiner_snp_then_multiallelic(self):
    """Smoke test: a VCF with a SNP followed by a multiallelic site must annotate without crashing."""
    vcf_path = 'testdata/vcf/infer_onp_fail_snp_then_multiallelic.vcf'
    maf_path = 'out/testSNPThenMultiallelic.maf.annotated'

    datasource_root = TestUtils.createUnitTestConfig().get('DEFAULT', "dbDir")

    onp_opts = dict()
    onp_opts[OptionConstants.INFER_ONPS] = True
    onp_opts[OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS] = True

    spec_kwargs = dict(datasource_dir=datasource_root, is_skip_no_alts=True, other_opts=onp_opts)
    run_spec = RunSpecificationFactory.create_run_spec("VCF", "TCGAMAF", vcf_path, maf_path, **spec_kwargs)

    vcf_annotator = Annotator()
    vcf_annotator.initialize(run_spec)
    vcf_annotator.annotate()
def testAnnotationWithMafliteWithTrailingSpaces(self):
    """Tests the ability to annotate a VCF file that contains trailing spaces in ref and alt alleles.

    After a VCF -> VCF annotation run, the rendered file is read back and
    searched for four expected record prefixes.
    """
    db_dir = self.config.get('DEFAULT', "dbDir")
    inputFilename = os.path.join("testdata", "vcf", "example.trailing_whitespace_in_alleles.vcf")
    outputFilename = os.path.join("out", "example.trailing_whitespace_in_alleles.vcf")

    annotator = Annotator()
    from oncotator.utils.RunSpecification import RunSpecification
    run_spec = RunSpecificationFactory.create_run_spec(
        "VCF", "VCF", inputFilename, outputFilename,
        datasource_dir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_MUTATIONS,
        other_opts={'vcf_out_infer_genotypes': False})
    annotator.initialize(run_spec)
    annotator.annotate()

    # check output
    # The with-block closes the file deterministically; the previous
    # open(...).read() never closed the handle explicitly.
    with open(outputFilename) as rendered_vcf:
        vcf_data = rendered_vcf.read()

    self.assertIn('\n1\t14907\t.\tA\tG\t', vcf_data)
    self.assertIn('\n1\t14930\trs150145850\tA\tG\t', vcf_data)
    self.assertIn('\n1\t14933\trs138566748\tG\tA\t', vcf_data)
    self.assertIn('\n1\t14948\trs148911281\tG\tA\t', vcf_data)
def testAnnotationWithMafliteWithTrailingSpaces(self):
    """Smoke test for a maflite input with trailing spaces in the ref and alt alleles.

    Runs MAFLITE -> TCGAMAF in mutation-annotation mode; nothing is asserted
    beyond the run finishing without an exception.
    """
    db_root = self.config.get('DEFAULT', "dbDir")

    input_parts = ("testdata", "maflite", "example.trailing_whitespace_in_alleles.maflite")
    source_path = os.path.join(*input_parts)
    target_path = os.path.join("out", "example.trailing_whitespace_in_alleles.maf.txt")

    runner = Annotator()
    spec = RunSpecificationFactory.create_run_spec(
        "MAFLITE",
        "TCGAMAF",
        source_path,
        target_path,
        datasource_dir=db_root,
        annotating_type=RunSpecification.ANNOTATE_MUTATIONS,
    )
    runner.initialize(spec)
    runner.annotate()
def test_annotating_uniprot_test_file(self):
    """Test variants with known issues with older version of UniProt datasource.

    This test will fail if using older version of uniprot datasource (pre-2014).
    Each rendered row must have UniProt_AApos != "0", and every row except the
    one at index 4 must have a SwissProt_entry_Id that ends with "HUMAN".
    """
    db_dir = TestUtils.createUnitTestConfig().get('DEFAULT', "dbDir")
    annotator = Annotator()
    out_file_name = "out/uniprot_recovery.maf.annotated"
    runSpec = RunSpecificationFactory.create_run_spec(
        "MAFLITE", "TCGAMAF", "testdata/maflite/uniprot_recovery.maflite", out_file_name,
        datasourceDir=db_dir,
        tx_mode=TranscriptProvider.TX_MODE_BEST_EFFECT)
    annotator.initialize(runSpec)
    annotator.annotate()

    out_file_reader = GenericTsvReader(out_file_name)
    for i, line_dict in enumerate(out_file_reader):
        # assertNotEqual prints both operands if the position is "0".
        self.assertNotEqual(line_dict['UniProt_AApos'], "0")

        # TODO: The fourth entry is currently not picking up the uniprot entry for this.
        # Remove the "if" statement once issue #253 is addressed
        # NOTE(review): index 4 from enumerate() is the fifth row; verify
        # against the "fourth entry" wording above.
        if i == 4:
            continue
        swissprot_id = line_dict['SwissProt_entry_Id']
        self.assertTrue(swissprot_id.endswith("HUMAN"),
                        "SwissProt_entry_Id does not end with HUMAN at index %d: %s" % (i, swissprot_id))
def test_rendering_with_exons(self):
    """Test that we can render a seg file that includes exons at end points.

    Renders testdata/seg/Middle_of_exon.seg.txt as a GENE_LIST and checks that
    rows whose segment_end_gene is MAPK1 report segment_end_exon "8+".
    """
    inputFilename = "testdata/seg/Middle_of_exon.seg.txt"
    output_filename = "out/test_exon_seg2.gene_list.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Remove any output left over from a previous run before annotating.
    if os.path.exists(output_filename):
        os.remove(output_filename)

    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "GENE_LIST", inputFilename, output_filename,
        datasourceDir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(run_spec)
    annotator.annotate()

    # Now check the output
    output_reader = GenericTsvReader(output_filename)
    for line_dict in output_reader:
        self.assertIsNotNone(line_dict['segment_start'])
        self.assertNotEqual(line_dict['segment_start'].strip(), "")
        if line_dict['segment_end_gene'] == "MAPK1":
            # Strip once and reuse the value for both the check and the message.
            end_exon = line_dict['segment_end_exon'].strip()
            self.assertEqual(end_exon, "8+", "Should have been 8+, but saw: %s" % end_exon)
def test_rendering_with_exons(self):
    """Test that we can render a seg file that includes exons at end points.

    Every output row must have a non-blank segment_start; rows ending in gene
    MAPK1 must report "8+" as the segment_end_exon.
    """
    inputFilename = "testdata/seg/Middle_of_exon.seg.txt"
    output_filename = "out/test_exon_seg2.gene_list.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Delete a leftover output file from an earlier run, if there is one.
    if os.path.exists(output_filename):
        os.remove(output_filename)

    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "GENE_LIST", inputFilename, output_filename,
        datasource_dir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(run_spec)
    annotator.annotate()

    # Now check the output
    output_reader = GenericTsvReader(output_filename)
    for line_dict in output_reader:
        self.assertIsNotNone(line_dict['segment_start'])
        self.assertNotEqual(line_dict['segment_start'].strip(), "")
        if line_dict['segment_end_gene'] != "MAPK1":
            continue
        # assertEqual shows both values; the custom message is kept as well.
        observed_exon = line_dict['segment_end_exon'].strip()
        self.assertEqual(observed_exon, "8+", "Should have been 8+, but saw: %s" % observed_exon)
def test_reannotating_actual_file(self):
    """Test that we can take in a file, annotate, similar to M2 process (VCF to TCGA MAF no ONPs,
    then TCGA MAF to TCGA MAF with ONPs) and collapse values.

    Step 1 renders the VCF to a midpoint TCGA MAF (three mutations expected,
    none with a '|' in i_QSS).  Step 2 re-annotates that MAF with ONP inference
    using the datasources of step 1 (two mutations expected) and compares the
    count / allele-fraction columns against hard-coded ground truth.
    """
    # This test assumes that the numeric values are not being collapsed.
    input_filename = "testdata/m2_support/phasingExample.vcf"
    midpoint_output_filename = "out/m2_support/reannotating_tcga_maf_midpoint.maf.annotated"
    output_filename = "out/m2_support/reannotating_tcga_maf.maf.annotated"

    options_step1 = {
        OptionConstants.COLLAPSE_FILTER_COLS: True,
        OptionConstants.NO_PREPEND: False,
        OptionConstants.SPLIT_ALLELIC_DEPTH: True,
        OptionConstants.INFER_ONPS: False,
    }

    # Note that this will also test collapsing numeric values.
    options_step2 = {
        OptionConstants.REANNOTATE_TCGA_MAF_COLS: True,
        OptionConstants.INFER_ONPS: True,
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
        OptionConstants.NO_PREPEND: False,
        OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True,
    }

    run_spec_step1 = RunSpecificationFactory.create_run_spec(
        "VCF", "TCGAMAF", input_filename, midpoint_output_filename,
        is_skip_no_alts=True, other_opts=options_step1,
        datasource_dir=self._determine_db_dir())
    annotator = Annotator()
    annotator.initialize(run_spec_step1)
    annotator.annotate()

    # To speed up this test, use the same datasources from step 1
    ds_list = run_spec_step1.get_datasources()

    tsv_reader = GenericTsvReader(midpoint_output_filename)
    # i stays -1 when the file has no rows, so the count check below still fails cleanly.
    i = -1
    for i, line in enumerate(tsv_reader):
        self.assertNotIn("|", line["i_QSS"],
                         "i_QSS annotation should not have a '|' in it in mutation: " + str(i + 1))
    self.assertEqual(i, 2, 'Mutation count flawed... should have been three mutations: ' + str(i + 1))

    run_spec_step2 = RunSpecificationFactory.create_run_spec_given_datasources(
        "TCGAMAF", "TCGAMAF", midpoint_output_filename, output_filename,
        other_opts=options_step2, datasource_list=ds_list)
    annotator.initialize(run_spec_step2)
    annotator.annotate()

    gt_alt_count = [80, 7]
    gt_alt_count_full = ["82|80", "7"]
    gt_ref_count = [68, 151]
    # Please note that this is not "68|68" since these were collapsed by ONP combiner.
    gt_ref_count_full = ["68", "151"]
    gt_tumor_f = [.5375, .046]
    gt_tumor_f_full = ["0.538|0.537", "0.046"]

    tsv_reader = GenericTsvReader(output_filename)
    i = -1
    for i, line in enumerate(tsv_reader):
        # Collect the offending column names so a failure shows which ones they are.
        doubly_prefixed = [ks for ks in line.keys() if ks.startswith('i_i_')]
        self.assertEqual(doubly_prefixed, [], "i_i_ prefix found.")

        if i == 0:
            self.assertIn("|", line["i_QSS"], "i_QSS tag should have a '|' in it for the first mutation")

        self.assertEqual(int(line['t_alt_count']), gt_alt_count[i])
        self.assertEqual(int(line['t_ref_count']), gt_ref_count[i])
        self.assertEqual(float(line['i_tumor_f']), gt_tumor_f[i])
        self.assertEqual(line['i_t_alt_count_full'], gt_alt_count_full[i])
        self.assertEqual(line['i_t_ref_count_full'], gt_ref_count_full[i])
        self.assertEqual(line['i_tumor_f_full'], gt_tumor_f_full[i])
    self.assertEqual(i, 1, 'Mutation count flawed... should have been two mutations: ' + str(i + 1))
def main(argv=None):  # IGNORE:C0111
    """Command line entry point: parse options, set up logging and run one annotation.

    :param argv: optional list of extra command line tokens; when given they
        are appended to ``sys.argv``.
    :return: 0 after a completed run, and also 0 when the run is interrupted
        by a KeyboardInterrupt.
    """
    from oncotator.utils.OncotatorCLIUtils import OncotatorCLIUtils
    from oncotator.Annotator import Annotator

    # Extra tokens are appended to the process-wide sys.argv
    # (presumably what parseOptions reads -- TODO confirm).
    if argv is not None:
        sys.argv.extend(argv)

    program_version = "%s" % __version__
    program_version_message = '%%(prog)s %s' % program_version

    try:
        args = parseOptions(program_version_message)
        verbose = args.verbose
        if verbose > 0:
            print("Verbose mode on")

        logFilename = args.log_name  # 'oncotator.log'

        # Create a basic logger to a file
        loggingFormat = '%(asctime)s %(levelname)s [%(name)s:%(lineno)d] %(message)s'
        logging.basicConfig(filename=logFilename, level=logging.INFO, format=loggingFormat)

        # Add a console logger to the root logger, which means that all loggers generated will have the console dump.
        #    Output on the console will be the same as what is in the log file.
        ch = logging.StreamHandler()
        ch.setLevel(logging.WARNING)
        ch.setFormatter(logging.Formatter(loggingFormat))

        if verbose:
            ch.setLevel(logging.INFO)
            print("Path:")
            print(sys.path)
            print(" ")

        logging.getLogger('').addHandler(ch)

        logger = logging.getLogger(__name__)
        logger.info("Oncotator %s", program_version)
        logger.info("Args: %s", args)
        logger.info('Log file: %s', os.path.abspath(logFilename))

        if DEBUG:
            logger.setLevel(logging.DEBUG)

        # Initiate an Oncotator session.
        inputFilename = os.path.expanduser(args.input_file)
        outputFilename = os.path.expanduser(args.output_file)
        inputFormat = args.input_format.upper()
        outputFormat = args.output_format.upper()

        datasourceDir = os.path.expanduser(args.dbDir)
        cache_url = args.cache_url
        read_only_cache = args.read_only_cache
        tx_mode = args.tx_mode
        is_skip_no_alts = args.skip_no_alt
        genome_build = args.genome_build

        # Parse annotation overrides
        commandLineManualOverrides = args.override_cli
        overrideConfigFile = args.override_config
        if overrideConfigFile is not None and not os.path.exists(overrideConfigFile):
            # Logger.warn is a deprecated alias (removed in Python 3.13); use warning().
            logger.warning("Could not find %s   ... proceeding anyway.", overrideConfigFile)
            overrideConfigFile = None
        manualOverrides = OncotatorCLIUtils.determineAllAnnotationValues(
            commandLineManualOverrides, overrideConfigFile)

        # Parse default overrides
        commandLineDefaultValues = args.default_cli
        defaultConfigFile = args.default_config
        if defaultConfigFile is not None and not os.path.exists(defaultConfigFile):
            # A missing stock default file is only informational; a missing
            # user-specified one is a warning.
            if defaultConfigFile != DEFAULT_DEFAULT_ANNOTATIONS:
                logger.warning("Could not find %s   ... proceeding anyway.", defaultConfigFile)
            else:
                logger.info("Could not find Broad-specific %s   ... proceeding without any default "
                            "annotations.  __UNKNOWN__ may appear in TCGA MAF outputs.", defaultConfigFile)
            defaultConfigFile = None
        defaultValues = OncotatorCLIUtils.determineAllAnnotationValues(
            commandLineDefaultValues, defaultConfigFile)

        # Create a run configuration to pass to the Annotator class.
        annotating_type = None
        if inputFormat == "SEG_FILE":
            annotating_type = RunSpecification.ANNOTATE_SEGMENTS
        runConfig = RunSpecificationFactory.create_run_spec(
            inputFormat, outputFormat, inputFilename, outputFilename,
            globalAnnotations=manualOverrides,
            datasourceDir=datasourceDir,
            isMulticore=(not args.noMulticore),
            defaultAnnotations=defaultValues,
            cacheUrl=cache_url,
            read_only_cache=read_only_cache,
            tx_mode=tx_mode,
            is_skip_no_alts=is_skip_no_alts,
            genomeBuild=genome_build,
            other_opts=determineOtherOptions(args),
            annotating_type=annotating_type)

        annotator = Annotator()
        annotator.initialize(runConfig)
        annotator.annotate()

        return 0
    except KeyboardInterrupt:
        ### handle keyboard interrupt ###
        return 0
def test_reannotating_actual_file(self):
    """Test that we can take in a file, annotate, similar to M2 process (VCF to TCGA MAF no ONPs,
    then TCGA MAF to TCGA MAF with ONPs) and collapse values.

    The first pass writes a midpoint TCGA MAF that must contain three
    mutations with no '|' in i_QSS.  The second pass re-annotates the midpoint
    file with ONP inference, reusing the first pass's datasources, and must
    yield two mutations matching the ground-truth lists below.
    """
    # This test assumes that the numeric values are not being collapsed.
    input_filename = "testdata/m2_support/phasingExample.vcf"
    midpoint_output_filename = "out/m2_support/reannotating_tcga_maf_midpoint.maf.annotated"
    output_filename = "out/m2_support/reannotating_tcga_maf.maf.annotated"

    options_step1 = {OptionConstants.COLLAPSE_FILTER_COLS: True, OptionConstants.NO_PREPEND: False,
                     OptionConstants.SPLIT_ALLELIC_DEPTH: True, OptionConstants.INFER_ONPS: False}

    # Note that this will also test collapsing numeric values.
    options_step2 = {OptionConstants.REANNOTATE_TCGA_MAF_COLS: True, OptionConstants.INFER_ONPS: True,
                     OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True, OptionConstants.NO_PREPEND: False,
                     OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True}

    run_spec_step1 = RunSpecificationFactory.create_run_spec(
        "VCF", "TCGAMAF", input_filename, midpoint_output_filename,
        is_skip_no_alts=True, other_opts=options_step1,
        datasource_dir=self._determine_db_dir())
    annotator = Annotator()
    annotator.initialize(run_spec_step1)
    annotator.annotate()

    # To speed up this test, use the same datasources from step 1
    ds_list = run_spec_step1.get_datasources()

    tsv_reader = GenericTsvReader(midpoint_output_filename)
    # Start at -1 so that an empty file is reported as "0 mutations" below.
    i = -1
    for i, line in enumerate(tsv_reader):
        # assertNotIn shows the i_QSS value on failure, unlike find(...) == -1.
        self.assertNotIn("|", line["i_QSS"],
                         "i_QSS annotation should not have a '|' in it in mutation: " + str(i + 1))
    self.assertEqual(i, 2, 'Mutation count flawed... should have been three mutations: ' + str(i + 1))

    run_spec_step2 = RunSpecificationFactory.create_run_spec_given_datasources(
        "TCGAMAF", "TCGAMAF", midpoint_output_filename, output_filename,
        other_opts=options_step2, datasource_list=ds_list)
    annotator.initialize(run_spec_step2)
    annotator.annotate()

    gt_alt_count = [80, 7]
    gt_alt_count_full = ["82|80", "7"]
    gt_ref_count = [68, 151]
    # Please note that this is not "68|68" since these were collapsed by ONP combiner.
    gt_ref_count_full = ["68", "151"]
    gt_tumor_f = [.5375, .046]
    gt_tumor_f_full = ["0.538|0.537", "0.046"]

    tsv_reader = GenericTsvReader(output_filename)
    i = -1
    for i, line in enumerate(tsv_reader):
        # Report the names of any doubly-prefixed columns rather than a bare boolean.
        bad_keys = [ks for ks in line.keys() if ks.startswith('i_i_')]
        self.assertEqual(bad_keys, [], "i_i_ prefix found.")

        if i == 0:
            self.assertIn("|", line["i_QSS"], "i_QSS tag should have a '|' in it for the first mutation")

        self.assertEqual(int(line['t_alt_count']), gt_alt_count[i])
        self.assertEqual(int(line['t_ref_count']), gt_ref_count[i])
        self.assertEqual(float(line['i_tumor_f']), gt_tumor_f[i])
        self.assertEqual(line['i_t_alt_count_full'], gt_alt_count_full[i])
        self.assertEqual(line['i_t_ref_count_full'], gt_ref_count_full[i])
        self.assertEqual(line['i_tumor_f_full'], gt_tumor_f_full[i])
    self.assertEqual(i, 1, 'Mutation count flawed... should have been two mutations: ' + str(i + 1))
def main(argv=None):  # IGNORE:C0111
    """Command line entry point: parse options, set up logging and run one annotation.

    :param argv: optional list of extra command line tokens; when given they
        are appended to ``sys.argv``.
    :return: 0 after a completed run, and also 0 when the run is interrupted
        by a KeyboardInterrupt.
    """
    from oncotator.utils.OncotatorCLIUtils import OncotatorCLIUtils
    from oncotator.Annotator import Annotator

    # Extra tokens are appended to the process-wide sys.argv
    # (presumably what parseOptions reads -- TODO confirm).
    if argv is not None:
        sys.argv.extend(argv)

    program_version = "%s" % __version__
    program_version_message = '%%(prog)s %s' % program_version

    try:
        args = parseOptions(program_version_message)
        verbose = args.verbose
        if verbose > 0:
            print("Verbose mode on")

        logFilename = args.log_name  # 'oncotator.log'

        # Create a basic logger to a file
        loggingFormat = '%(asctime)s %(levelname)s [%(name)s:%(lineno)d] %(message)s'
        logging.basicConfig(filename=logFilename, level=logging.INFO, format=loggingFormat)

        # Add a console logger to the root logger, which means that all loggers generated will have the console dump.
        #    Output on the console will be the same as what is in the log file.
        ch = logging.StreamHandler()
        ch.setLevel(logging.WARNING)
        ch.setFormatter(logging.Formatter(loggingFormat))

        if verbose:
            ch.setLevel(logging.INFO)
            print("Path:")
            print(sys.path)
            print(" ")

        logging.getLogger('').addHandler(ch)

        logger = logging.getLogger(__name__)
        logger.info("Oncotator %s", program_version)
        logger.info("Args: %s", args)
        logger.info('Log file: %s', os.path.abspath(logFilename))

        if DEBUG:
            logger.setLevel(logging.DEBUG)

        if not NGSLIB_INSTALLED:
            # Logger.warn is a deprecated alias (removed in Python 3.13); use warning().
            logger.warning(
                "ngslib module not installed.  Will be unable to annotate with BigWig datasources.")

        # Initiate an Oncotator session.
        inputFilename = os.path.expanduser(args.input_file)
        outputFilename = os.path.expanduser(args.output_file)
        inputFormat = args.input_format.upper()
        outputFormat = args.output_format.upper()

        datasourceDir = os.path.expanduser(args.dbDir)
        cache_url = args.cache_url
        read_only_cache = args.read_only_cache
        tx_mode = args.tx_mode
        is_skip_no_alts = args.skip_no_alt
        genome_build = args.genome_build

        # Parse annotation overrides
        commandLineManualOverrides = args.override_cli
        overrideConfigFile = args.override_config
        if overrideConfigFile is not None and not os.path.exists(overrideConfigFile):
            logger.warning("Could not find %s   ... proceeding anyway.", overrideConfigFile)
            overrideConfigFile = None
        manualOverrides = OncotatorCLIUtils.determineAllAnnotationValues(
            commandLineManualOverrides, overrideConfigFile)

        # Parse default overrides
        commandLineDefaultValues = args.default_cli
        defaultConfigFile = args.default_config
        if defaultConfigFile is not None and not os.path.exists(defaultConfigFile):
            # A missing stock default file is only informational; a missing
            # user-specified one is a warning.
            if defaultConfigFile != DEFAULT_DEFAULT_ANNOTATIONS:
                logger.warning("Could not find %s   ... proceeding anyway.", defaultConfigFile)
            else:
                logger.info("Could not find Broad-specific %s   ... proceeding without any default "
                            "annotations.  __UNKNOWN__ may appear in TCGA MAF outputs.", defaultConfigFile)
            defaultConfigFile = None
        defaultValues = OncotatorCLIUtils.determineAllAnnotationValues(
            commandLineDefaultValues, defaultConfigFile)

        # Create a run configuration to pass to the Annotator class.
        annotating_type = None
        if inputFormat == "SEG_FILE":
            annotating_type = RunSpecification.ANNOTATE_SEGMENTS
        runConfig = RunSpecificationFactory.create_run_spec(
            inputFormat, outputFormat, inputFilename, outputFilename,
            globalAnnotations=manualOverrides,
            datasourceDir=datasourceDir,
            isMulticore=(not args.noMulticore),
            defaultAnnotations=defaultValues,
            cacheUrl=cache_url,
            read_only_cache=read_only_cache,
            tx_mode=tx_mode,
            is_skip_no_alts=is_skip_no_alts,
            genomeBuild=genome_build,
            other_opts=determineOtherOptions(args),
            annotating_type=annotating_type)

        annotator = Annotator()
        annotator.initialize(runConfig)
        annotator.annotate()

        return 0
    except KeyboardInterrupt:
        ### handle keyboard interrupt ###
        return 0