def testOverwriteAnnotationsSupported(self):
    """White-box check that the factory's overwrite setting reaches created mutations.

    Mutations produced through a default MutationDataFactory must have
    ``_new_required`` set; mutations produced through a factory built with
    ``allow_overwriting=True`` must not.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.trailing_whitespace_in_alleles.vcf")

    # Both creators are built up front, before either is asked for mutations.
    strict_creator = VcfInputMutationCreator(vcf_path, MutationDataFactory())
    permissive_creator = VcfInputMutationCreator(vcf_path, MutationDataFactory(allow_overwriting=True))

    for mutation in strict_creator.createMutations():
        self.assertTrue(mutation._new_required)

    for mutation in permissive_creator.createMutations():
        self.assertFalse(mutation._new_required)
def test_mutation_combiner(self):
    """Check that combining two mutations sets attributes and annotations properly.

    Two SNPs at positions 100 and 101 are combined; the result is compared
    against a hand-built mutation spanning 100-101 whose shared annotation
    value is joined with "|".
    """
    first = MutationDataFactory.default_create(
        chr=1, start=100, end=100, ref_allele="G", alt_allele="A")
    first.createAnnotation("SomeValue", "value1", "INPUT", "STRING", "a value")

    second = MutationDataFactory.default_create(
        chr=1, start=101, end=101, ref_allele="C", alt_allele="T")
    second.createAnnotation("SomeValue", "value2", tags=["IT"])
    second.createAnnotation("AnotherValue", "5")

    factory = MutationDataFactory()
    combined = OnpQueue._combine_mutations([first, second], factory)

    expected = MutationDataFactory.default_create(
        chr=1, start=100, end=101, ref_allele="GC", alt_allele="AT")
    expected.createAnnotation("SomeValue", "value1|value2", "INPUT", "STRING", "a value", tags=["IT"])
    expected.createAnnotation("AnotherValue", "5")

    self.assertTrue(combined.attributesEqual(expected))
    self.assertEqual(combined, expected)
def _onp_ordered_combiner_test(self, inputs, expected):
    """Run ``inputs`` through an OnpQueue and compare the output with ``expected``.

    Both arguments are converted with ``self._tuples_to_MutationData``; the
    inputs are handed to the queue as an iterator and the combined output is
    checked with ``self._assert_mutation_lists_equal``.
    """
    mutation_stream = iter(self._tuples_to_MutationData(inputs))
    wanted = self._tuples_to_MutationData(expected)

    queue = OnpQueue(mutation_stream, MutationDataFactory())
    combined = list(queue.get_combined_mutations())

    self._assert_mutation_lists_equal(wanted, combined)
def test_mutation_combiner_identical_annotation(self):
    """Check that an annotation with identical values on every input is not repeated with "|"."""
    first = MutationDataFactory.default_create(
        chr=1, start=100, end=100, ref_allele="G", alt_allele="A")
    first.createAnnotation("SampleName", "John Doe")

    second = MutationDataFactory.default_create(
        chr=1, start=101, end=101, ref_allele="C", alt_allele="T")
    second.createAnnotation("SampleName", "John Doe")

    factory = MutationDataFactory()
    combined = OnpQueue._combine_mutations([first, second], factory)

    # The expected value is the single shared name, not "John Doe|John Doe".
    expected = MutationDataFactory.default_create(
        chr=1, start=100, end=101, ref_allele="GC", alt_allele="AT")
    expected.createAnnotation("SampleName", "John Doe")

    self.assertTrue(combined.attributesEqual(expected))
    self.assertEqual(combined, expected)
def testAnnotationRoundTripEmpty(self):
    """Read a VCF, annotate it with no datasources, write it, and read it back.

    The datasource directory passed to the run spec is a deliberately
    non-existent path. The test passes when the re-read file yields at least
    one mutation.
    """
    source_vcf = os.path.join("testdata", "m2_support", "NA12878.ob_filtered.vcf")
    annotated_vcf = os.path.join("out", "test_round_trip_empty_annotated.vcf")
    other_opts = {OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True}

    run_spec = RunSpecificationFactory.create_run_spec(
        "VCF", "VCF", source_vcf, annotated_vcf,
        datasource_dir="THIS_DIR_DOES_NOT_EXIST__",
        genomeBuild="hg19",
        other_opts=other_opts)

    annotator = Annotator()
    annotator.initialize(run_spec)
    written_path = annotator.annotate()

    # Re-read whatever file the annotator reports having written.
    reader = VcfInputMutationCreator(
        written_path,
        MutationDataFactory(allow_overwriting=True),
        other_options=other_opts)
    reread_mutations = list(reader.createMutations())

    self.assertTrue(len(reread_mutations) > 0)
def test_mutation_combiner_ordering(self):
    """Check that combined annotation values keep the order of the input mutations."""
    first = MutationDataFactory.default_create(
        chr=1, start=100, end=100, ref_allele="G", alt_allele="A")
    first.createAnnotation("SomeDepth", "2")
    first.createAnnotation("AnotherDepth", "1")

    second = MutationDataFactory.default_create(
        chr=1, start=101, end=101, ref_allele="C", alt_allele="T")
    second.createAnnotation("SomeDepth", "1")
    second.createAnnotation("AnotherDepth", "2")

    factory = MutationDataFactory()
    combined = OnpQueue._combine_mutations([first, second], factory)

    # Values are expected in input order ("2|1", "1|2"), i.e. not sorted.
    expected = MutationDataFactory.default_create(
        chr=1, start=100, end=101, ref_allele="GC", alt_allele="AT")
    expected.createAnnotation("SomeDepth", "2|1")
    expected.createAnnotation("AnotherDepth", "1|2")

    self.assertTrue(combined.attributesEqual(expected))
    self.assertEqual(combined, expected)
def __init__(self, sourceFilename, mutation_data_factory, configFile="", genomeBuild="hg19", other_options=None):
    """Constructor.

    Stores the mutation data factory on the instance. When
    ``mutation_data_factory`` is None, an info message is logged and a
    default-constructed MutationDataFactory is used instead.
    """
    if mutation_data_factory is None:
        logging.getLogger(__name__).info("No mutation data factory provided, using default settings.")
        mutation_data_factory = MutationDataFactory()
    self._mutation_data_factory = mutation_data_factory
def test_tnp_blank_snp(self):
    """Test a harder scenario for ONP combination.

    Six SNPs are fed to the queue: the first three share a phasing id, the
    fourth has a different phasing id, the fifth returns to the first id, and
    the sixth sits on a separate chromosome. The alt alleles of everything
    the queue yields are compared, as a whole list, against the expectation,
    so a queue that yields too few (or too many) mutations fails the test
    instead of passing silently.
    """
    # (chr, position, ref_allele, alt_allele, phasing_id)
    snp_specs = [
        (1, 100, "G", "A", "value1"),
        (1, 101, "C", "T", "value1"),
        (1, 102, "C", "T", "value1"),
        (1, 103, "C", "T", "value2"),  # Note the differing ID
        (1, 104, "C", "T", "value1"),
        (2, 105, "C", "T", "value1"),  # Note separate chromosome
    ]

    mutations = []
    for chrom, position, ref, alt, phasing_id in snp_specs:
        mut = MutationData(chr=chrom, start=position, end=position, ref_allele=ref, alt_allele=alt)
        mut.createAnnotation("phasing_id", phasing_id, "INPUT")
        mut.createAnnotation("phasing_genotype", "0|1", "INPUT")
        mutations.append(mut)

    gt_alts = ["ATT", "T", "T", "T"]

    mdf = MutationDataFactory()
    queue = OnpQueue(mutations, mdf)
    observed_alts = [mut.alt_allele for mut in queue.get_combined_mutations()]

    # Whole-list comparison checks both the values and the number of results.
    self.assertEqual(gt_alts, observed_alts)
def test_indel(self):
    """Test indel not used in onp combination no matter what the phasing info.

    Six mutations sharing one phasing id are fed to the queue; the fourth is
    an insertion (ref allele "-"). The alt alleles of everything the queue
    yields are compared, as a whole list, against the expectation, so a queue
    that yields too few (or too many) mutations fails the test instead of
    passing silently.
    """
    # (start, end, ref_allele, alt_allele); all on chr 1
    specs = [
        (100, 100, "G", "A"),
        (101, 101, "C", "T"),
        (102, 102, "C", "T"),
        (103, 104, "-", "TT"),  # Indel
        (104, 104, "C", "T"),
        (105, 105, "C", "T"),
    ]

    mutations = []
    for start, end, ref, alt in specs:
        mut = MutationData(chr=1, start=start, end=end, ref_allele=ref, alt_allele=alt)
        mut.createAnnotation("phasing_id", "value1", "INPUT")
        mut.createAnnotation("phasing_genotype", "0|1", "INPUT")
        mutations.append(mut)

    gt_alts = ["ATT", "TT", "TT"]

    mdf = MutationDataFactory()
    queue = OnpQueue(mutations, mdf)
    observed_alts = [mut.alt_allele for mut in queue.get_combined_mutations()]

    # Whole-list comparison checks both the values and the number of results.
    self.assertEqual(gt_alts, observed_alts)
def _onp_unordered_combiner_test(self, inputs, expected):
    """Run input tuples through the ONP combiner and compare with the expected tuples.

    Both ``inputs`` and ``expected`` are converted with
    ``self._tuples_to_MutationData``. The inputs are passed to an OnpQueue as
    an iterator, and the combined output is checked with
    ``self.assert_mutations_match_expected``.
    """
    mutation_stream = iter(self._tuples_to_MutationData(inputs))
    wanted = self._tuples_to_MutationData(expected)

    queue = OnpQueue(mutation_stream, MutationDataFactory())
    combined = list(queue.get_combined_mutations())

    self.assert_mutations_match_expected(wanted, combined)
def testFailureWithSpanningDeletion(self):
    """Fail with a spanning deletion unless alternates are being ignored."""
    vcf_path = os.path.join("testdata", "simple_vcf_spanning_deletion.vcf")
    creator = VcfInputMutationCreator(vcf_path, MutationDataFactory(allow_overwriting=True))

    # NOTE(review): this body only drains the mutation iterator and asserts
    # nothing itself; presumably the expected failure is declared outside the
    # visible code (e.g. a decorator) -- TODO confirm.
    seen = 0
    for _ in creator.createMutations():
        seen += 1
def test_annotation_overwriting_on(self):
    """Check that a factory built with allow_overwriting=True yields an overwritable mutation.

    The same annotation is created twice with different values; each time the
    stored value must equal the value just written, and no exception may be
    raised.
    """
    factory = MutationDataFactory(allow_overwriting=True)
    mutation = factory.create()

    for value in ("123", "456"):
        mutation.createAnnotation("blah", value)
        self.assertTrue(mutation['blah'] == value)
def testSimpleRoundTripWithoutAnnotating(self):
    """Read a VCF, write it, and read it again without changes.

    The test passes when the re-read output file yields at least one mutation.
    """
    other_opts = {OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True}
    source_vcf = os.path.join("testdata", "m2_support", "NA12878.ob_filtered.vcf")

    # First pass: load every mutation from the source file.
    reader = VcfInputMutationCreator(
        source_vcf,
        MutationDataFactory(allow_overwriting=True),
        other_options=other_opts)
    loaded = list(reader.createMutations())

    # Render them straight back out, with the same options.
    round_trip_vcf = os.path.join("out", "test_round_trip.vcf")
    renderer = VcfOutputRenderer(round_trip_vcf, otherOptions=other_opts)
    renderer.renderMutations(loaded)

    # Second pass: the rendered file must be readable and non-empty.
    rereader = VcfInputMutationCreator(
        round_trip_vcf,
        MutationDataFactory(allow_overwriting=True),
        other_options=other_opts)
    reloaded = list(rereader.createMutations())

    self.assertTrue(len(reloaded) > 0)
def testSuccesseWithSpanningDeletion(self):
    """Succeed with a spanning deletion since alternates are being ignored.

    With IS_SKIP_ALTS enabled, exactly one mutation is expected from the
    input file.
    """
    vcf_path = os.path.join("testdata", "simple_vcf_spanning_deletion.vcf")
    skip_alts_options = {InputMutationCreatorOptions.IS_SKIP_ALTS: True}
    creator = VcfInputMutationCreator(
        vcf_path,
        MutationDataFactory(allow_overwriting=True),
        other_options=skip_alts_options)

    # Drain the iterator, counting what comes out.
    seen = sum(1 for _ in creator.createMutations())

    self.assertTrue(seen == 1, "There should only have been one mutation seen, instead saw: " + str(seen))
def initializeMutFromAttributes(chr, start, end, ref_allele, alt_allele, build, mutation_data_factory=None):
    """Create a mutation from raw attributes and attach its preceding-bases annotation.

    ``chr``, ``start``, ``end`` and ``build`` are converted to strings before
    being passed to the factory's ``create``. A default MutationDataFactory is
    used when ``mutation_data_factory`` is None.

    The variant type is inferred from the created mutation's alleles:
      * xNP: the preceding-bases annotation is created with an empty value.
      * deletion: ref allele, start and end are replaced with the values from
        ``MutUtils.retrievePrecedingBasesForDeletions`` and the alt allele is
        set to "-".
      * insertion: alt allele, start and end are replaced with the values from
        ``MutUtils.retrievePrecedingBasesForInsertions`` and the ref allele is
        set to "-".
    For indels each updated field is written both as an attribute and as an
    item on the mutation, and the preceding bases are stored as an annotation.

    Returns the (possibly updated) mutation.
    """
    if mutation_data_factory is None:
        mutation_data_factory = MutationDataFactory()

    mut = mutation_data_factory.create(str(chr), str(start), str(end), ref_allele, alt_allele, str(build))
    variant_type = TranscriptProviderUtils.infer_variant_type(mut.ref_allele, mut.alt_allele)

    if TranscriptProviderUtils.is_xnp(variant_type):  # Snps and other xNPs
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME, annotationValue="")

    if variant_type == VariantClassification.VT_DEL:  # deletion
        preceding_bases, new_ref, new_start, new_end = MutUtils.retrievePrecedingBasesForDeletions(mut)
        field_updates = (
            ("ref_allele", new_ref),
            ("alt_allele", "-"),
            ("start", new_start),
            ("end", new_end),
        )
    elif variant_type == VariantClassification.VT_INS:  # insertion
        preceding_bases, new_alt, new_start, new_end = MutUtils.retrievePrecedingBasesForInsertions(mut)
        field_updates = (
            ("ref_allele", "-"),
            ("alt_allele", new_alt),
            ("start", new_start),
            ("end", new_end),
        )
    else:
        # Not an indel: nothing further to adjust.
        return mut

    # Keep the attribute view and the item view of each field in step.
    for field_name, field_value in field_updates:
        setattr(mut, field_name, field_value)
        mut[field_name] = field_value

    mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME, annotationValue=preceding_bases)
    return mut
def test_annotation_overwriting_off(self):
    """Check that a factory built with allow_overwriting=False yields a non-overwritable mutation.

    Creating the same annotation a second time must raise
    DuplicateAnnotationException.
    """
    factory = MutationDataFactory(allow_overwriting=False)
    mutation = factory.create()

    mutation.createAnnotation("blah", "123")
    self.assertTrue(mutation['blah'] == "123")

    saw_duplicate_error = False
    try:
        mutation.createAnnotation("blah", "456")
    except DuplicateAnnotationException:
        saw_duplicate_error = True

    self.assertTrue(
        saw_duplicate_error,
        "DuplicateAnnotationException should have been seen, but wasn't")
def initialize(self, run_spec):
    """Configure this instance from a RunSpecification without starting annotation.

    Copies the input creator, output renderer, manual/default annotations,
    datasources, multicore settings, skip-no-alts flag and annotating type
    from ``run_spec``, resets the cache hit/miss counters, initializes the
    cache manager, selects the annotating function, and builds a
    MutationDataFactory honouring the run spec's overwrite setting.
    """
    # Input, output and annotation presets.
    self.setInputCreator(run_spec.inputCreator)
    self.setOutputRenderer(run_spec.outputRenderer)
    self.setManualAnnotations(run_spec.manualAnnotations)
    self.setDefaultAnnotations(run_spec.defaultAnnotations)
    self._datasources = run_spec.datasources

    # Parallelism settings.
    self.setIsMulticore(run_spec.get_is_multicore())
    self.setNumCores(run_spec.get_num_cores())

    # Cache bookkeeping starts from zero on every initialize.
    self._cache_stats = dict(miss=0, hit=0)
    self._is_skip_no_alts = run_spec.get_is_skip_no_alts()
    self.initialize_cache_manager(run_spec)

    # Pick the annotating function for the configured type, falling back to
    # _annotate_mut when the type has no entry in the dispatch dict.
    self.set_annotating_type(run_spec.annotating_type)
    self._annotate_func_ptr = Annotator.ANNOTATING_FUNC_DICT.get(
        self._annotating_type, _annotate_mut)

    allow_overwriting = run_spec.is_allow_annotation_overwriting
    self._is_allow_annotation_overwriting = allow_overwriting
    self._mutation_data_factory = MutationDataFactory(allow_overwriting=allow_overwriting)
def create_run_spec_given_datasources(input_format, output_format, input_filename, output_filename,
                                      global_annotations=None, datasource_list=None, genomeBuild="hg19",
                                      is_multicore=False, num_cores=4, default_annotations=None,
                                      cache_url=None, read_only_cache=True,
                                      tx_mode=TranscriptProvider.TX_MODE_CANONICAL, is_skip_no_alts=False,
                                      other_opts=None, annotating_type=None):
    """Same as create_run_spec, but a list of datasource instances can be used.

    Typically, this method is only called by automated tests.

    ``global_annotations``, ``default_annotations``, ``datasource_list`` and
    ``other_opts`` default to empty containers when None. ``other_opts`` is
    shallow-copied before this function adds option keys to it, so the
    caller's dictionary is never modified.

    Returns an initialized RunSpecification.

    Raises RunSpecificationException when parameter validation produces a
    message at ERROR or CRITICAL level (every validation message is logged
    first).
    """
    global_annotations = dict() if global_annotations is None else global_annotations
    default_annotations = dict() if default_annotations is None else default_annotations
    datasource_list = [] if datasource_list is None else datasource_list

    # Copy rather than alias: the keys written below must not leak back into
    # a dictionary the caller may reuse for other components.
    other_opts = dict() if other_opts is None else dict(other_opts)

    # NOTE(review): for TCGAMAF input this switches REANNOTATE_TCGA_MAF_COLS on
    # whenever it is unset or falsy, so the option cannot stay off for that
    # format -- confirm this is the intended option to set here.
    if input_format == "TCGAMAF" and not other_opts.get(OptionConstants.REANNOTATE_TCGA_MAF_COLS, False):
        other_opts[OptionConstants.REANNOTATE_TCGA_MAF_COLS] = True

    other_opts[InputMutationCreatorOptions.IS_SKIP_ALTS] = is_skip_no_alts

    # Step 0 Validate given parameters and log messages. If an error or critical is found, throw an exception.
    validation_messages = RunSpecificationFactory._validate_run_spec_parameters(
        input_format, output_format, input_filename, output_filename, global_annotations,
        datasource_list, genomeBuild, is_multicore, num_cores, default_annotations,
        cache_url, read_only_cache, tx_mode, is_skip_no_alts, other_opts, annotating_type)
    for msg in validation_messages:
        logging.getLogger(__name__).log(msg.level, msg.message)
        if msg.level in (logging.ERROR, logging.CRITICAL):
            raise RunSpecificationException(msg.message)

    # Step 1 Initialize input and output
    is_allow_annotation_overwriting = other_opts.get(OptionConstants.ALLOW_ANNOTATION_OVERWRITING, False)
    mutation_data_factory = MutationDataFactory(allow_overwriting=is_allow_annotation_overwriting)

    inputCreator = OncotatorCLIUtils.create_input_creator(
        input_filename, input_format, mutation_data_factory, genomeBuild, other_opts)
    outputRenderer = OncotatorCLIUtils.create_output_renderer(output_filename, output_format, other_opts)

    result = RunSpecification()
    result.initialize(inputCreator, outputRenderer,
                      manualAnnotations=global_annotations,
                      datasources=datasource_list,
                      isMulticore=is_multicore,
                      numCores=num_cores,
                      defaultAnnotations=default_annotations,
                      cacheUrl=cache_url,
                      read_only_cache=read_only_cache,
                      is_skip_no_alts=is_skip_no_alts,
                      annotating_type=annotating_type,
                      is_allow_annotation_overwriting=is_allow_annotation_overwriting)
    return result
def test_mutation_combiner_no_mut(self):
    """Combining an empty list of mutations should return None."""
    factory = MutationDataFactory()
    no_mutations = []
    combined = OnpQueue._combine_mutations(no_mutations, factory)
    self.assertIsNone(combined)