def testChrom2HashCodeTable(self):
    """Check the codes produced by MutUtils.createChrom2HashCodeTable.

    Two chromosome lists are exercised; every expected code is compared
    against the table entry for that chromosome name.
    """
    scenarios = (
        (["1", "X", "3", "contig1", "Y", "25", "mt"], (
            ("1", 1, "For chrom 1, hash code should be 1 but it was %s."),
            ("3", 3, "For chrom 3, hash code should be 3 but it was %s."),
            ("25", 25, "For chrom 25, hash code should be 25 but it was %s."),
            ("X", 26, "For chrom X, hash code should be 26 but it was %s."),
            ("Y", 27, "For chrom Y, hash code should be 27 but it was %s."),
            ("mt", 28, "For chrom mt, hash code should be 28 but it was %s."),
            ("contig1", 29, "For chrom contig1, hash code should be 29 but it was %s."),
        )),
        (["contig1", "mt"], (
            ("mt", 3, "For chrom mt, hash code should be 3 but it was %s."),
            ("contig1", 4, "For chrom contig1, hash code should be 4 but it was %s."),
        )),
    )
    for chroms, expectations in scenarios:
        table = MutUtils.createChrom2HashCodeTable(chroms)
        for name, code, template in expectations:
            self.assertTrue(table[name] == code, template % table[name])
def test_build_ensembl_transcript_index(self):
    """Build the gtf portion of the ensembl transcript db and inspect one transcript.

    The index is written to ``out/test_ensembl_gtf.db``, reopened through
    Shove, checked for the ``YDR529C`` key, and removed at the end.
    """
    # The trimmed test inputs were produced with:
    # cat ~/oncotator_pycharm/oncotator/test/testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf | cut -f 9 | cut -f 5 --delimiter=" " | sort | uniq | sed -r "s/;//g" | sed -r "s/\"//g"
    # snR84, tK(UUU)K, YAL067C, YAL067W-A, YAL068C, YAL068W-A, YAL069W, YBR278W, YBR279W, YBR280C, YBR281C, YDR528W, YDR529C, YKR074W,
    #
    # grep -Pzo ">(snR84|tK\(UUU\)K|YAL067C|YAL067W-A|YAL068C|YAL068W-A|YAL069W|YBR278W|YBR279W|YBR280C|YBR281C|YDR528W|YDR529C|YKR074W)([A-Za-z_0-9 \:\-\n]+)" Saccharomyces_cerevisiae.EF4.71.cdna.all.fa >Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa
    #
    ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

    output_filename = "out/test_ensembl_gtf.db"
    protocol = "file"
    genome_build_factory = GenomeBuildFactory()
    genome_build_factory.build_ensembl_transcript_index(
        [ensembl_input_gtf], [ensembl_input_fasta], output_filename, protocol=protocol)
    self.assertTrue(os.path.exists(output_filename))

    shove = Shove(protocol + "://" + output_filename, "memory://")
    self.assertTrue(len(shove.keys()) > 0)
    self.assertTrue("YDR529C" in shove.keys())
    t = shove["YDR529C"]
    self.assertTrue(t.get_seq() is not None)
    # Compare by value: ``is not ""`` tested object identity, which is not a
    # reliable way to detect an empty string.
    self.assertTrue(t.get_seq() != "")
    self.assertTrue(len(t.get_cds()) > 0)
    self.assertTrue(len(t.get_exons()) > 0)
    MutUtils.removeDir(output_filename)
def _createMutation(self, record, alt_index, build):
    """Build a mutation from one VCF record and one of its alternate alleles.

    :param record: VCF record supplying CHROM, POS, REF, ALT, ID and QUAL
    :param alt_index: index into ``record.ALT`` (ignored for monomorphic records)
    :param build: genome build handed to the mutation initializer
    :return: mutation carrying id, qual, alt_allele_seen, filter and info annotations
    """
    contig = MutUtils.convertChromosomeStringToMutationDataFormat(record.CHROM)
    position = int(record.POS)

    reference = record.REF.strip()
    if reference == ".":
        reference = ""

    # A monomorphic record has no alternate, so the reference doubles as the alt.
    if record.is_monomorphic:
        alternate = reference
    else:
        alternate = str(record.ALT[alt_index]).strip()

    mutation = MutUtils.initializeMutFromAttributes(
        contig, position, position, reference, alternate, build,
        self._mutation_data_factory)

    record_id = record.ID if record.ID is not None else ""
    mutation.createAnnotation("id", record_id, "INPUT", tags=[TagConstants.ID])
    mutation.createAnnotation("qual", str(record.QUAL), "INPUT", tags=[TagConstants.QUAL])
    mutation.createAnnotation("alt_allele_seen", str(True), "INPUT")

    if self.collapse_filter_fields:
        add_filter_data = self._add_filter_data_2_mutation_single_field
    else:
        add_filter_data = self._addFilterData2Mutation
    mutation = add_filter_data(mutation, record)

    return self._addInfoData2Mutation(mutation, record, alt_index)
def testRetrievePrecedingBaseFromAnnotationForInsertions(self):
    """Round-trip insertions through the preceding-bases annotation.

    For each alternate allele the mutation is trimmed with
    retrievePrecedingBasesForInsertions, annotated with the removed bases, and
    then restored; the restored start, ref and alt must equal the originals.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # not a correct end position, but irrelevant for this test
    ref_allele = "GTC"
    build = "19"

    for alt_allele in ("GTCT", "GTCTT"):
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)
        preceding, trimmed_alt, trimmed_start, trimmed_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)

        mut.ref_allele = "-"
        mut.alt_allele = trimmed_alt
        mut.start = trimmed_start
        mut.end = trimmed_end
        mut.createAnnotation(
            annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
            annotationValue=preceding)

        restored_ref, restored_alt, restored_start = \
            MutUtils.retrievePrecedingBaseFromAnnotationForInsertions(mut)

        self.assertTrue(
            restored_start == start,
            "Mut start should be %s but was %s." % (start, restored_start))
        self.assertTrue(
            restored_ref == ref_allele,
            "Ref allele should be %s but was %s." % (ref_allele, restored_ref))
        self.assertTrue(
            restored_alt == alt_allele,
            "Alt allele should be %s but was %s." % (alt_allele, restored_alt))
def testChromosomeConversionHG19(self):
    """Test that an hg19 build with chrom = 23 or 24 gets converted to X or Y,
    while chroms 2 and 4 come back unchanged."""
    convert = MutUtils.convertChromosomeStringToMutationDataFormat
    cases = (
        ("23", "X", "chrom of 23 did not produce X: "),
        ("24", "Y", "chrom of 24 did not produce Y: "),
        ("2", "2", "chrom of 2 yielded different value: "),
        ("4", "4", "chrom of 4 yielded different value: "),
    )
    for raw, expected, prefix in cases:
        # The failure message embeds a second conversion of the same input.
        self.assertEqual(convert(raw, build="hg19"), expected,
                         prefix + convert(raw, build="hg19"))
def testSimpleRead(self):
    """ Read a good maflite file and make sure that each mutation validates """
    creator = MafliteInputMutationCreator(
        "testdata/maflite/Patient0.indel.maf.txt",
        'configs/maflite_input.config')
    # The test passes as long as validation never raises.
    for mutation in creator.createMutations():
        MutUtils.validateMutation(mutation)
def testExampleVcfDBAnnotationWithSNPExactMatch(self):
    """Annotate two SNPs from the vcf_db_exact datasource and compare the
    resulting ESP annotations with the expected Annotation objects."""
    ds_dir = os.path.join("testdata", "vcf_db_exact", "hg19")
    datasource = DatasourceFactory.createDatasource(
        os.path.join(ds_dir, "vcf_db_exact.config"), ds_dir)

    info_split = [TagConstants.INFO, TagConstants.SPLIT]
    info_not_split = [TagConstants.INFO, TagConstants.NOT_SPLIT]

    # Each scenario: (chrom, start, end, ref, alt) plus the annotations expected on it.
    scenarios = (
        (("20", "1110696", "1110696", "A", "T"), (
            ("ESP_AF", dict(value="0.667", datasourceName="ESP", dataType="Float",
                            description="Allele Frequency", tags=list(info_split), number=-1)),
            ("ESP_AC", dict(value="2,4", datasourceName="ESP", dataType="Integer",
                            description="Allele Count", tags=list(info_not_split), number=None)),
            ("ESP_H2", dict(value="False", datasourceName="ESP", dataType="Flag",
                            description="HapMap2 membership", tags=list(info_not_split), number=0)),
        )),
        (("20", "1230237", "1230237", "T", "A"), (
            ("ESP_NS", dict(value="3", datasourceName="ESP", dataType="Integer",
                            description="Number of Samples With Data", tags=list(info_not_split),
                            number=1)),
            ("ESP_AF", dict(value="", datasourceName="ESP", dataType="Float",
                            description="Allele Frequency", tags=list(info_split), number=-1)),
        )),
    )

    for (chrom, start, end, ref_allele, alt_allele), expectations in scenarios:
        mutation = MutUtils.initializeMutFromAttributes(
            chrom, start, end, ref_allele, alt_allele, "hg19")
        annotated = datasource.annotate_mutation(mutation)
        for annotation_name, fields in expectations:
            observed = annotated.getAnnotation(annotation_name)
            expected = Annotation(**fields)
            self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def _determine_matching_alt_indices(self, mut, record, build):
    """Find which alternates of a VCF record correspond to a mutation.

    :param mut: mutation being annotated
    :param record: VCF record from the datasource
    :param build: genome build used when turning each alternate into a mutation
    :return: list of indices into ``record.ALT``; ``[-1]`` for a matching
        monomorphic record; an empty list when nothing matches
    """
    matches = []
    convert = MutUtils.convertChromosomeStringToMutationDataFormat

    if record.is_monomorphic:
        contig = convert(record.CHROM)
        position = record.POS
        if self.match_mode == "exact":
            hit = mut.chr == contig and mut.ref_allele == record.REF
        else:
            hit = mut.chr == contig and int(mut.start) <= position and int(mut.end) >= position
        if hit:
            matches = [-1]
        return matches

    # One candidate mutation is built per alternate allele.
    for alt_idx, alt_allele in enumerate(record.ALT):
        ds_mut = MutUtils.initializeMutFromAttributes(
            convert(record.CHROM), record.POS, record.POS,
            str(record.REF), str(alt_allele), build)

        # Every match rule requires the same chromosome.
        if not (mut.chr == ds_mut.chr):
            continue

        if self.match_mode == "exact":
            if mut.ref_allele == ds_mut.ref_allele \
                    and mut.alt_allele == ds_mut.alt_allele \
                    and int(mut.start) == int(ds_mut.start) \
                    and int(mut.end) == int(ds_mut.end):
                matches.append(alt_idx)
            continue

        m_start, m_end = int(mut.start), int(mut.end)
        d_start, d_end = int(ds_mut.start), int(ds_mut.end)

        same_span = m_start == d_start and m_end == d_end
        starts_inside = d_start <= m_start <= d_end and m_end >= d_end
        covers = m_start <= d_start and m_end >= d_end
        ends_inside = m_start <= d_start and d_start <= m_end <= d_end
        # NOTE(review): a mutation lying strictly inside the datasource interval
        # (m_start > d_start and m_end < d_end) satisfies none of these -- confirm intended.
        if same_span or starts_inside or covers or ends_inside:
            matches.append(alt_idx)

    return matches
def testRetrievePrecedingBasesForInsertions(self):
    """Trim the shared leading bases off two insertions and check the result.

    For each alternate allele the mutation is rewritten from the values
    returned by MutUtils.retrievePrecedingBasesForInsertions, and the updated
    start, end, ref and alt are compared with the expected values. Failure
    messages are derived from the asserted values so they cannot disagree
    with what is actually checked.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # not a correct end position, but irrelevant for this test
    ref_allele = "GTC"
    build = "19"
    expected_start = 1234569
    expected_end = 1234570

    for alt_allele, expected_alt in (("GTCT", "T"), ("GTCTT", "TT")):
        mut = MutationDataFactory.default_create(chrom, start, end, ref_allele, alt_allele, build)
        preceding_bases, updated_alt_allele, updated_start, updated_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        mut.ref_allele = "-"
        mut.alt_allele = updated_alt_allele
        mut.start = updated_start
        mut.end = updated_end
        mut.createAnnotation(
            annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
            annotationValue=preceding_bases)

        self.assertTrue("_preceding_bases" in mut,
                        "_preceding_bases is missing in the mutation data.")
        self.assertTrue(mut.start == expected_start,
                        "Mut start should be %s but was %s." % (expected_start, mut.start))
        self.assertTrue(mut.end == expected_end,
                        "Mut end should be %s but was %s." % (expected_end, mut.end))
        self.assertTrue(mut.ref_allele == "-",
                        "Ref allele should be - but was %s." % mut.ref_allele)
        self.assertTrue(mut.alt_allele == expected_alt,
                        "Alt allele should be %s but was %s." % (expected_alt, mut.alt_allele))
def testSimpleRead(self):
    """ Read a good maflite file and make sure that each mutation validates """
    creator = MafliteInputMutationCreator(
        "testdata/maflite/Patient0.indel.maf.txt",
        None,
        'configs/maflite_input.config')
    # The test passes as long as validation never raises.
    for mutation in creator.createMutations():
        MutUtils.validateMutation(mutation)
def testNoUnknownAnnotations(self):
    """ Make sure that the gaf 3.0 datasource does not annotate anything with source set to Unknown """
    creator = MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt')
    datasource = TestUtils.createTranscriptProviderDatasource(self.config)
    for mutation in creator.createMutations():
        annotated = datasource.annotate_mutation(mutation)
        MutUtils.validateMutation(annotated)
        unknown = MutUtils.getUnknownAnnotations(annotated)
        nothing_unknown = len(unknown) == 0
        self.assertTrue(nothing_unknown,
                        "Unknown annotations exist in mutation: " + str(unknown))
def renderMutations(self, mutations, metadata=None, comments=None):
    """Write the header and one tab-separated row per renderable mutation.

    :param mutations: iterator of mutations; the first element is pulled with
        ``next()`` to seed the header and the field mapping
    :param metadata: consulted only when ``mutations`` yields nothing, in which
        case ``metadata.asDict()`` is handed to the header writer
    :param comments: forwarded to the header writer; defaults to an empty list
    """
    if comments is None:
        comments = []

    outputHeaders = [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
        'FORMAT', 'NORMAL', 'PRIMARY'
    ]

    # Peek at the first mutation, catching the case where there are no variants.
    # The builtin next() works for any iterator on Python 2.6+ and 3.
    try:
        m = next(mutations)
    except StopIteration:
        m = None

    if m is not None:
        fp = self._createVcfHeaderFilePtr(comments, m)
        fieldsUsed = self.alternativeDictionary.keys()
        annotations = MutUtils.getAllAttributeNames(m)
        self.fieldMap = MutUtils.createFieldsMapping(
            fieldsUsed, annotations, self.alternativeDictionary, True)
    else:
        # NOTE(review): metadata defaults to None, so an empty input without
        # metadata raises AttributeError here -- confirm callers always pass it.
        fp = self._createVcfHeaderFilePtr(comments, metadata.asDict())

    # NOTE(review): fp is not closed in this method; presumably it is owned
    # elsewhere -- confirm.
    ctr = 0
    unrenderableRows = 0
    tsvWriter = csv.DictWriter(fp, outputHeaders, delimiter="\t",
                               lineterminator="\n")

    # Write each row. The first mutation was already consumed above, so it is
    # handled here and counted exactly like the rows in the loop below.
    if m is not None:
        mutRow = self._createMutRow(m)
        if mutRow is not None:
            tsvWriter.writerow(mutRow)
        else:
            unrenderableRows += 1
        ctr += 1

    for m in mutations:
        if (ctr % 1000) == 0:
            self.logger.info("Processed " + str(ctr) + " mutations")
        mutRow = self._createMutRow(m)

        # We may not render all rows.
        if mutRow is not None:
            tsvWriter.writerow(mutRow)
        else:
            unrenderableRows += 1
        ctr += 1

    self.logger.info("Processed all " + str(ctr) +
                     " mutations. Could not render: " + str(unrenderableRows))
def testCreateGPTsvDatasource(self):
    """Install a gp_tsv datasource into a temporary directory and check the
    generated config file and md5 file."""
    datasourceFilename = "testdata/small_genome_position_tsv_ds/oreganno_trim.hg19.txt"
    datasourceType = "gp_tsv"
    datasourceName = "ORegAnno"
    datasourceFoldername = "ORegAnno"
    datasourceVersion = "UCSC Track"
    genomeBuild = "hg19"
    genomicPositionColumnNames = "hg19.oreganno.chrom,hg19.oreganno.chromStart,hg19.oreganno.chromEnd"

    tmpDir = tempfile.mkdtemp()
    destDir = os.path.join(tmpDir, datasourceFoldername, genomeBuild)
    os.makedirs(destDir)

    DatasourceInstallUtils.create_datasource(
        destDir, datasourceFilename, datasourceFoldername, datasourceName,
        datasourceType, datasourceVersion, genomicPositionColumnNames)

    parser = ConfigUtils.createConfigParser(os.path.join(destDir, "ORegAnno.config"))

    self.assertTrue(parser.has_section("general"), "general section is missing.")

    presence_checks = (
        ("type", "type option is missing in general section."),
        ("src_file", "src_file option is missing in general section."),
        ("title", "title option is missing in general section."),
        ("version", "version option is missing in general section."),
        ("genomic_position_cols", "genomic_position_cols option is missing in general section."),
    )
    for option, message in presence_checks:
        self.assertTrue(parser.has_option("general", option), message)

    # The config is expected to record only the base name of the source file.
    value_checks = (
        ("type", datasourceType, "Expected data source type is %s but was %s."),
        ("src_file", "oreganno_trim.hg19.txt", "Expected data source src_file is %s but was %s."),
        ("title", datasourceName, "Expected data source title is %s but was %s."),
        ("version", datasourceVersion, "Expected data source version is %s but was %s."),
        ("genomic_position_cols", genomicPositionColumnNames,
         "Expected data source genomic_position_cols is %s but was %s."),
    )
    for option, expected, template in value_checks:
        self.assertEqual(parser.get("general", option), expected,
                         template % (expected, parser.get("general", option)))

    md5_path = os.path.join(tmpDir, datasourceFoldername, genomeBuild + ".md5")
    self.assertTrue(os.path.exists(md5_path), "No md5 file was generated.")

    MutUtils.removeDir(tmpDir)
def testChrGLs(self):
    """ Test that mutations on unaligned transcripts can be annotated properly. I.e. when chromosome = GL....."""
    creator = MafliteInputMutationCreator('testdata/maflite/chrGLs.maf.tsv',
                                          "configs/maflite_input.config")
    datasource = TestUtils.createTranscriptProviderDatasource(self.config)
    for mutation in creator.createMutations():
        try:
            mutation = datasource.annotate_mutation(mutation)
            MutUtils.validateMutation(mutation)
        except Exception as err:
            # Any exception during annotation or validation fails the test.
            details = ("Erroneous exception was thrown: " + str(err) + "\n" +
                       traceback.format_exc())
            self.assertTrue(False, details)
        else:
            self.assertTrue(mutation['gene'] != '')
def _handleMissingAnnotations(self, m):
    """Report annotations the TCGA VCF 1.1 renderer needs but the mutation lacks.

    Annotations missing from ``TcgaVcfOutputRenderer.requiredHeaderAnnotations``
    are logged as an error and abort rendering; annotations missing from
    ``TcgaVcfOutputRenderer.requiredMutAnnotations`` only produce a warning.

    :param m: mutation to inspect
    :raises MissingAnnotationException: when a required header annotation is missing
    """
    missingHeaderAnnotations = MutUtils.retrieveMissingAnnotations(
        m, TcgaVcfOutputRenderer.requiredHeaderAnnotations)
    missingMutAnnotations = MutUtils.retrieveMissingAnnotations(
        m, TcgaVcfOutputRenderer.requiredMutAnnotations)

    if len(missingHeaderAnnotations) > 0:
        sError = "The following annotations are required for rendering a TCGA VCF 1.1, but were not found: " + str(
            missingHeaderAnnotations)
        self.logger.error(sError)
        raise MissingAnnotationException(sError)

    if len(missingMutAnnotations) > 0:
        # The message now states that the annotations are missing; the previous
        # wording had no verb. Logger.warn is a deprecated alias of warning.
        sWarning = "The following annotations are important for rendering a TCGA VCF 1.1, but were not found. Proceeding... : " + str(
            missingMutAnnotations)
        self.logger.warning(sWarning)
def testCreateIndexedVcfDatasource(self):
    """Install an indexed_vcf datasource into a temporary directory, check the
    generated config and md5 file, then read a record back from the indexed VCF."""
    datasourceFilename = "testdata/vcf/example.vcf"
    datasourceFoldername = "1000Genomes"
    datasourceName = "1000Genomes"
    datasourceType = "indexed_vcf"
    datasourceVersion = "V4.1"
    genomeBuild = "hg19"

    tmpDir = tempfile.mkdtemp()
    destDir = os.path.join(tmpDir, datasourceFoldername, genomeBuild)
    os.makedirs(destDir)

    DatasourceInstallUtils.create_datasource(
        destDir, datasourceFilename, datasourceFoldername, datasourceName,
        datasourceType, datasourceVersion)

    parser = ConfigUtils.createConfigParser(os.path.join(destDir, "1000Genomes.config"))

    self.assertTrue(parser.has_section("general"), "general section is missing.")

    presence_checks = (
        ("type", "type option is missing in general section."),
        ("src_file", "src_file option is missing in general section."),
        ("title", "title option is missing in general section."),
        ("version", "version option is missing in general section."),
    )
    for option, message in presence_checks:
        self.assertTrue(parser.has_option("general", option), message)

    # The config is expected to point at the tabix-indexed copy of the input.
    value_checks = (
        ("type", datasourceType, "Expected data source type is %s but was %s."),
        ("src_file", "example.tabix_indexed.vcf.gz", "Expected data source src_file is %s but was %s."),
        ("title", datasourceName, "Expected data source title is %s but was %s."),
        ("version", datasourceVersion, "Expected data source version is %s but was %s."),
    )
    for option, expected, template in value_checks:
        self.assertEqual(parser.get("general", option), expected,
                         template % (expected, parser.get("general", option)))

    md5_path = os.path.join(tmpDir, datasourceFoldername, genomeBuild + ".md5")
    self.assertTrue(os.path.exists(md5_path), "No md5 file was generated.")

    # Data source was created correctly
    tabixIndexedFilename = os.path.join(destDir, "example.tabix_indexed.vcf.gz")
    self.assertTrue(os.path.exists(tabixIndexedFilename), "No index file was generated.")

    reader = vcf.Reader(filename=tabixIndexedFilename, compressed=True, strict_whitespace=True)
    for entry in reader.fetch(chrom=20, start=1230237, end=1230237):
        self.assertEqual(entry.INFO["NS"], 3, "Expected %s but got %s." % (3, entry.INFO["NS"]))
        self.assertEqual(entry.INFO["DP"], 13, "Expected %s but got %s." % (13, entry.INFO["DP"]))

    MutUtils.removeDir(tmpDir)
def _determine_matching_alt_indices(self, mut, record, build):
    """Find which alternates of a VCF record correspond to a mutation.

    :param mut: mutation being annotated
    :param record: VCF record from the datasource
    :param build: genome build used when turning each alternate into a mutation
    :return: list of indices into ``record.ALT``; ``[-1]`` for a matching
        monomorphic record; an empty list when nothing matches
    """
    matches = []
    convert = MutUtils.convertChromosomeStringToMutationDataFormat

    if record.is_monomorphic:
        contig = convert(record.CHROM)
        position = record.POS
        if self.match_mode == "exact":
            hit = mut.chr == contig and mut.ref_allele == record.REF
        else:
            hit = mut.chr == contig and int(mut.start) <= position and int(mut.end) >= position
        if hit:
            matches = [-1]
        return matches

    # One candidate mutation is built per alternate allele.
    for alt_idx, alt_allele in enumerate(record.ALT):
        ds_mut = MutUtils.initializeMutFromAttributes(
            convert(record.CHROM), record.POS, record.POS,
            str(record.REF), str(alt_allele), build)

        # Every match rule requires the same chromosome.
        if not (mut.chr == ds_mut.chr):
            continue

        if self.match_mode == "exact":
            if mut.ref_allele == ds_mut.ref_allele \
                    and mut.alt_allele == ds_mut.alt_allele \
                    and int(mut.start) == int(ds_mut.start) \
                    and int(mut.end) == int(ds_mut.end):
                matches.append(alt_idx)
            continue

        m_start, m_end = int(mut.start), int(mut.end)
        d_start, d_end = int(ds_mut.start), int(ds_mut.end)

        same_span = m_start == d_start and m_end == d_end
        starts_inside = d_start <= m_start <= d_end and m_end >= d_end
        covers = m_start <= d_start and m_end >= d_end
        ends_inside = m_start <= d_start and d_start <= m_end <= d_end
        # NOTE(review): a mutation lying strictly inside the datasource interval
        # (m_start > d_start and m_end < d_end) satisfies none of these -- confirm intended.
        if same_span or starts_inside or covers or ends_inside:
            matches.append(alt_idx)

    return matches
def createMutations(self):
    """ No inputs. Returns a generator of mutations built from the specified maflite file.

    Every column in ``self._specified_fields`` becomes an 'INPUT' annotation on
    the mutation. Columns listed in ``self._reverseAlternativeDict`` are stored
    under the name they map to. A "chr" value is converted with
    MutUtils.convertChromosomeStringToMutationDataFormat.
    """
    reverseAliases = self._reverseAlternativeDict
    allColumns = self._specified_fields

    for line in self._tsvReader:

        # We only need to assign fields that are mutation attributes and have a different name in the maflite file.
        mut = self._mutation_data_factory.create(build=self._build)

        for col in allColumns:
            # Three scenarios:
            #   1) col is name of mutation data field -- simple createAnnotation
            #   2) col name is an alias for a mutation data field -- do lookup then createAnnotation
            #   3) col name is not an alias for a mutation data field -- simple createAnnotation
            if col in reverseAliases:
                key = reverseAliases[col]
                self.logger.debug(key + " found from " + col)
            else:
                # Scenario 1 and 3
                key = col

            # Make sure to convert chromosome values.
            val = line[col]
            if key == "chr":
                val = MutUtils.convertChromosomeStringToMutationDataFormat(val)
            mut.createAnnotation(key, val, 'INPUT')

        # Remove any surrounding whitespace if present.
        mut.ref_allele, mut.alt_allele = mut.ref_allele.strip(), mut.alt_allele.strip()

        # if the alt allele == ref_allele, check that this is not a case where there is an alt_allele2 that is different.
        if mut.alt_allele == mut.ref_allele:
            mut.alt_allele = self._find_alt_allele_in_other_field(line, mut.ref_allele)

        # FIXME: Support more than one alias in the reverse dictionary.  Then this line can be removed.
        # Compare by value: ``is ""`` tested object identity, which is not a
        # reliable way to detect an empty string.
        if mut.start != "" and mut.end == "":
            mut.end = mut.start
        if mut.end != "" and mut.start == "":
            mut.start = mut.end

        yield mut
def _handleMissingAnnotations(self, m):
    """Report annotations the TCGA VCF 1.1 renderer needs but the mutation lacks.

    Annotations missing from ``TcgaVcfOutputRenderer.requiredHeaderAnnotations``
    are logged as an error and abort rendering; annotations missing from
    ``TcgaVcfOutputRenderer.requiredMutAnnotations`` only produce a warning.

    :param m: mutation to inspect
    :raises MissingAnnotationException: when a required header annotation is missing
    """
    missingHeaderAnnotations = MutUtils.retrieveMissingAnnotations(
        m, TcgaVcfOutputRenderer.requiredHeaderAnnotations)
    missingMutAnnotations = MutUtils.retrieveMissingAnnotations(
        m, TcgaVcfOutputRenderer.requiredMutAnnotations)

    if len(missingHeaderAnnotations) > 0:
        sError = "The following annotations are required for rendering a TCGA VCF 1.1, but were not found: " + str(
            missingHeaderAnnotations)
        self.logger.error(sError)
        raise MissingAnnotationException(sError)

    if len(missingMutAnnotations) > 0:
        # The message now states that the annotations are missing; the previous
        # wording had no verb. Logger.warn is a deprecated alias of warning.
        sWarning = "The following annotations are important for rendering a TCGA VCF 1.1, but were not found. Proceeding... : " + str(
            missingMutAnnotations)
        self.logger.warning(sWarning)
def testNoLostMutations(self):
    """ Does a simple gaf datasource annotation run and makes sure that no mutations were lost """
    inputFilename = 'testdata/maflite/Patient0.snp.maf.txt'
    inputCreator = MafliteInputMutationCreator(inputFilename, "configs/maflite_input.config")
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    # One line of the input is not a mutation, hence the -1. The context manager
    # closes the handle, which the bare file(...) call never did.
    with open(inputFilename, 'r') as inputFile:
        numMutsInput = len(inputFile.readlines()) - 1

    mutations = inputCreator.createMutations()
    ctr = 0
    for m in mutations:
        m = gafDatasource.annotate_mutation(m)
        MutUtils.validateMutation(m)
        ctr += 1
    self.assertEqual(ctr, numMutsInput, "Gaf data source altered mutation count.")
def testChromosomeConversionHG19(self):
    """Test that an hg19 build with chrom = 23 or 24 gets converted to X or Y,
    while chroms 2 and 4 come back unchanged."""
    convert = MutUtils.convertChromosomeStringToMutationDataFormat
    cases = (
        ("23", "X", "chrom of 23 did not produce X: "),
        ("24", "Y", "chrom of 24 did not produce Y: "),
        ("2", "2", "chrom of 2 yielded different value: "),
        ("4", "4", "chrom of 4 yielded different value: "),
    )
    for raw, expected, prefix in cases:
        # The failure message embeds a second conversion of the same input.
        self.assertEqual(convert(raw, build="hg19"), expected,
                         prefix + convert(raw, build="hg19"))
def testChrom2HashCodeTable(self):
    """Check the codes produced by MutUtils.createChrom2HashCodeTable.

    Two chromosome lists are exercised; every expected code is compared
    against the table entry for that chromosome name.
    """
    scenarios = (
        (["1", "X", "3", "contig1", "Y", "25", "mt"], (
            ("1", 1, "For chrom 1, hash code should be 1 but it was %s."),
            ("3", 3, "For chrom 3, hash code should be 3 but it was %s."),
            ("25", 25, "For chrom 25, hash code should be 25 but it was %s."),
            ("X", 26, "For chrom X, hash code should be 26 but it was %s."),
            ("Y", 27, "For chrom Y, hash code should be 27 but it was %s."),
            ("mt", 28, "For chrom mt, hash code should be 28 but it was %s."),
            ("contig1", 29, "For chrom contig1, hash code should be 29 but it was %s."),
        )),
        (["contig1", "mt"], (
            ("mt", 3, "For chrom mt, hash code should be 3 but it was %s."),
            ("contig1", 4, "For chrom contig1, hash code should be 4 but it was %s."),
        )),
    )
    for chroms, expectations in scenarios:
        table = MutUtils.createChrom2HashCodeTable(chroms)
        for name, code, template in expectations:
            self.assertTrue(table[name] == code, template % table[name])
def testNoUnknownAnnotations(self):
    """ Make sure that the gaf 3.0 datasource does not annotate anything with source set to Unknown """
    creator = MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt')
    datasource = TestUtils.createTranscriptProviderDatasource(self.config)
    for mutation in creator.createMutations():
        annotated = datasource.annotate_mutation(mutation)
        MutUtils.validateMutation(annotated)
        unknown = MutUtils.getUnknownAnnotations(annotated)
        nothing_unknown = len(unknown) == 0
        self.assertTrue(nothing_unknown,
                        "Unknown annotations exist in mutation: " + str(unknown))
def retrieveExons(self, gene, padding=10, isCodingOnly=False):
    """Return the padded exon intervals recorded for a gene.

    :param gene: gene name, looked up in ``self.gene_id_idx``
    :param padding: amount subtracted from every start and added to every end
    :param isCodingOnly: when True, transcripts reported as non-coding by
        ``gaf_annotation.is_non_coding_transcript`` are skipped and coordinates
        come from ``self.getCodingTranscriptCoords``
    :return: set of ``(gene, chr, start, end)`` tuples with start and end as
        strings; an empty set when the gene is not in the index
    """
    result = set()
    geneTuple = self.gene_id_idx.get(gene, None)
    if geneTuple is None:
        return result

    contig = MutUtils.convertChromosomeStringToMutationDataFormat(geneTuple[0])
    for b in self.Transcripts.get(contig, []):
        for i in self.Transcripts[contig][b]:
            if i["gene"] != gene:
                continue
            if isCodingOnly:
                if gaf_annotation.is_non_coding_transcript(i, self):
                    continue
                genomic_coords = self.getCodingTranscriptCoords(i)
            else:
                genomic_coords = i["genomic_coords"]
            for coord in genomic_coords:
                # Coordinate pairs may be stored in either order.
                start = min(coord[0], coord[1])
                end = max(coord[0], coord[1])
                result.add((gene, i["chr"], str(start - padding), str(end + padding)))
    return result
def retrieveExons(self, gene, padding=10, isCodingOnly=False):
    """Return the padded exon intervals recorded for a gene.

    :param gene: gene name, looked up in ``self.gene_id_idx``
    :param padding: amount subtracted from every start and added to every end
    :param isCodingOnly: when True, transcripts reported as non-coding by
        ``gaf_annotation.is_non_coding_transcript`` are skipped and coordinates
        come from ``self.getCodingTranscriptCoords``
    :return: set of ``(gene, chr, start, end)`` tuples with start and end as
        strings; an empty set when the gene is not in the index
    """
    result = set()
    geneTuple = self.gene_id_idx.get(gene, None)
    if geneTuple is None:
        return result

    contig = MutUtils.convertChromosomeStringToMutationDataFormat(geneTuple[0])
    for b in self.Transcripts.get(contig, []):
        for i in self.Transcripts[contig][b]:
            if i['gene'] != gene:
                continue
            if isCodingOnly:
                if gaf_annotation.is_non_coding_transcript(i, self):
                    continue
                genomic_coords = self.getCodingTranscriptCoords(i)
            else:
                genomic_coords = i['genomic_coords']
            for coord in genomic_coords:
                # Coordinate pairs may be stored in either order.
                start = min(coord[0], coord[1])
                end = max(coord[0], coord[1])
                result.add((gene, i['chr'], str(start - padding), str(end + padding)))
    return result
def retrieve_cached_annotations(self, m):
    """Look up the cache entry stored for a mutation.

    The key is built from the mutation and this object's db dir key.

    :param m: mutation
    :return: whatever the cache returns for that key (per the original notes:
        a list of Annotations, or None on a cache miss)
    """
    db_dir_key = self.get_db_dir_key()
    variant_key = MutUtils.create_variant_key_by_mutation(m, db_dir_key)
    cache = self.get_cache()
    return cache.retrieve_from_cache(variant_key)
def _is_matching(self, mut, tsv_record): chrom = tsv_record[self.tsv_index["chrom"]] startPos = tsv_record[self.tsv_index["start"]] endPos = tsv_record[self.tsv_index["end"]] build = "hg19" if self.match_mode == "exact": if "ref" in self.tsv_index and "alt" in self.tsv_index: # ref and alt information is present ref = tsv_record[self.tsv_index["ref"]] alt = tsv_record[self.tsv_index["alt"]] if ref == "-" or alt == "-": # addresses Mutation Annotation Format based tsv records # TODO: This looks risky to be calling the MutationData constructor directly ds_mut = MutationData(chrom, startPos, endPos, ref, alt, build) else: # addresses tsv records where the input isn't a Mutation Annotation Format file ds_mut = MutUtils.initializeMutFromAttributes(chrom, startPos, endPos, ref, alt, build) if mut.chr == ds_mut.chr and mut.ref_allele == ds_mut.ref_allele \ and mut.alt_allele == ds_mut.alt_allele and int(mut.start) == int(ds_mut.start) \ and int(mut.end) == int(ds_mut.end): return True else: # do not use ref and alt information if mut.chr == chrom and int(mut.start) == int(startPos) and int(mut.end) == int(endPos): return True else: return TranscriptProviderUtils.test_overlap(int(mut.start), int(mut.end), int(startPos), int(endPos)) return False
def _is_matching(self, mut, tsv_record): chrom = tsv_record[self.tsv_index["chrom"]] startPos = tsv_record[self.tsv_index["start"]] endPos = tsv_record[self.tsv_index["end"]] build = "hg19" if self.match_mode == "exact": if "ref" in self.tsv_index and "alt" in self.tsv_index: # ref and alt information is present ref = tsv_record[self.tsv_index["ref"]] alt = tsv_record[self.tsv_index["alt"]] if ref == "-" or alt == "-": # addresses Mutation Annotation Format based tsv records # TODO: This looks risky to be calling the MutationData constructor directly ds_mut = MutationData(chrom, startPos, endPos, ref, alt, build) else: # addresses tsv records where the input isn't a Mutation Annotation Format file ds_mut = MutUtils.initializeMutFromAttributes( chrom, startPos, endPos, ref, alt, build) if mut.chr == ds_mut.chr and mut.ref_allele == ds_mut.ref_allele \ and mut.alt_allele == ds_mut.alt_allele and int(mut.start) == int(ds_mut.start) \ and int(mut.end) == int(ds_mut.end): return True else: # do not use ref and alt information if mut.chr == chrom and int( mut.start) == int(startPos) and int( mut.end) == int(endPos): return True else: return TranscriptProviderUtils.test_overlap( int(mut.start), int(mut.end), int(startPos), int(endPos)) return False
def _calculate_protein_sequence(self, exons, seq, cds_start_genomic_space, cds_stop_genomic_space, strand):
    """
    Translate the coding part of a transcript sequence.

    The genomic CDS bounds are converted with
    TranscriptProviderUtils._convert_genomic_space_to_feature_space, the
    corresponding slice of ``seq`` is passed to MutUtils.translate_sequence,
    and a single trailing '*' (presumably the stop symbol) is dropped.

    :return: the translated sequence without a trailing '*'
    """
    genomic_start = int(cds_start_genomic_space)
    genomic_stop = int(cds_stop_genomic_space)
    bounds = TranscriptProviderUtils._convert_genomic_space_to_feature_space(
        genomic_start, genomic_stop, exons, strand)
    cds_start_exon_space, cds_stop_exon_space = bounds

    coding_seq = seq[int(cds_start_exon_space):int(cds_stop_exon_space)]
    protein = MutUtils.translate_sequence(coding_seq)

    ends_with_star = len(protein) > 0 and protein[-1] == '*'
    return protein[:-1] if ends_with_star else protein
def testChrGLs(self):
    """ Test that mutations on unaligned transcripts can be annotated properly. I.e. when chromosome = GL....."""
    maf_path = 'testdata/maflite/chrGLs.maf.tsv'
    maf_config = "configs/maflite_input.config"
    inputCreator = MafliteInputMutationCreator(maf_path, maf_config)
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    for m in inputCreator.createMutations():
        try:
            m = gafDatasource.annotate_mutation(m)
            MutUtils.validateMutation(m)
        except Exception as e:
            # Fail this test because an exception was thrown
            failure = "Erroneous exception was thrown: " + str(e) + "\n" + traceback.format_exc()
            self.assertTrue(False, failure)
        # Every annotated mutation is expected to carry a gene.
        self.assertTrue(m['gene'] != '')
def testNoLostMutations(self):
    """ Does a simple gaf datasource annotation run and makes sure that no mutations were lost """
    inputFilename = 'testdata/maflite/Patient0.snp.maf.txt'
    inputCreator = MafliteInputMutationCreator(inputFilename, "configs/maflite_input.config")
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    # Count the input lines, minus one (presumably the header row).  The file
    # is opened in a context manager so the handle is closed deterministically.
    with open(inputFilename, 'r') as fp:
        numMutsInput = len(fp.readlines()) - 1

    mutations = inputCreator.createMutations()
    ctr = 0
    for m in mutations:
        m = gafDatasource.annotate_mutation(m)
        MutUtils.validateMutation(m)
        ctr += 1
    self.assertEqual(ctr, numMutsInput, "Gaf data source altered mutation count.")
def store_annotations_in_cache(self, m):
    """
    Store the cacheable annotations of mutation ``m`` under its variant key.

    Does nothing when this datasource reports itself as read-only.

    :param m: mutation
    """
    if not self.is_read_only():
        variant_key = MutUtils.create_variant_key_by_mutation(m, self.get_db_dir_key())
        cacheable = self._determine_annotations_to_cache(m)
        self._store_basic_annotations_in_cache(variant_key, cacheable)
def main():
    """
    Attempt to create the corresponding data source.

    A temporary directory (prefix ``initializeDatasource_``) is created and
    handed to ``createDatasource``; it is removed afterwards whether or not
    creation succeeded.

    :raise OSError: if removing the temporary directory fails for any reason
        other than the directory not existing.
    """
    import errno  # local import: only needed for the cleanup check below

    tmpDir = None
    try:
        tmpDir = tempfile.mkdtemp(prefix="initializeDatasource_")
        createDatasource(tmpDir)
    finally:
        # If mkdtemp itself failed there is nothing to clean up, and calling
        # removeDir(None) could mask the original error.
        if tmpDir is not None:
            try:
                MutUtils.removeDir(tmpDir)
            except OSError as exc:
                if exc.errno != errno.ENOENT:  # no such file or directory
                    raise  # re-raise exception
def createMutations(self):
    """ No inputs.  Returns a generator of mutations built from the specified maflite file.

    For every row of ``self._tsvReader`` a mutation is created through
    ``self._mutation_data_factory`` and each column in
    ``self._specified_fields`` is added as an 'INPUT' annotation.  Columns
    that appear in ``self._reverseAlternativeDict`` are stored under the
    name they map to; "chr" values are converted with
    MutUtils.convertChromosomeStringToMutationDataFormat.
    """
    aliases = self._reverseAlternativeDict
    allColumns = self._specified_fields

    for line in self._tsvReader:
        mut = self._mutation_data_factory.create(build=self._build)

        for col in allColumns:
            # A column is either an alias for a mutation data field (look the
            # real name up) or is used under its own name.
            if col in aliases:
                realKey = aliases[col]
                self.logger.debug(realKey + " found from " + col)
            else:
                realKey = col

            val = line[col]
            # Make sure to convert chromosome values.
            if realKey == "chr":
                val = MutUtils.convertChromosomeStringToMutationDataFormat(val)
            mut.createAnnotation(realKey, val, 'INPUT')

        # remove any surrounding whitespace if present
        mut.ref_allele, mut.alt_allele = mut.ref_allele.strip(), mut.alt_allele.strip()

        # if the alt allele == ref_allele, check that this is not a case where there is an
        # alt_allele2 that is different.
        if mut.alt_allele == mut.ref_allele:
            mut.alt_allele = self._find_alt_allele_in_other_field(line, mut.ref_allele)

        # FIXME: Support more than one alias in the reverse dictionary.  Then this can be removed.
        # Compare by value: identity tests against a string literal only work
        # by accident of interning (and never for unicode vs. str).
        if mut.start != "" and mut.end == "":
            mut.end = mut.start
        if mut.end != "" and mut.start == "":
            mut.start = mut.end

        yield mut
def renderMutations(self, mutations, metadata=None, comments=None):
    """
    Write the given mutations as tab-separated rows after the VCF header.

    :param mutations: iterator of mutations (it is consumed with ``next``
        and a ``for`` loop, so it must be an iterator, not a plain list)
    :param metadata: only used when there are no mutations, in which case
        ``metadata.asDict()`` is handed to the header writer
    :param comments: comments for the header; defaults to an empty list
    """
    if comments is None:
        comments = []

    outputHeaders = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'NORMAL', 'PRIMARY']

    # Peek at the first mutation and make sure to catch the case where there
    # are no variants specified.  The builtin next() works on both the
    # Python 2 and Python 3 iterator protocols.
    m = next(mutations, None)

    if m is not None:
        fp = self._createVcfHeaderFilePtr(comments, m)
        fieldsUsed = self.alternativeDictionary.keys()
        annotations = MutUtils.getAllAttributeNames(m)
        self.fieldMap = MutUtils.createFieldsMapping(fieldsUsed, annotations, self.alternativeDictionary, True)
    else:
        fp = self._createVcfHeaderFilePtr(comments, metadata.asDict())
    # NOTE(review): fp is never closed in this method -- presumably it is
    # owned and closed elsewhere; confirm.

    # Write each row:
    ctr = 0
    unrenderableRows = 0
    tsvWriter = csv.DictWriter(fp, outputHeaders, delimiter="\t", lineterminator="\n")

    # The mutation that was peeked above is rendered first.
    mutRow = self._createMutRow(m)
    if mutRow is not None:
        tsvWriter.writerow(mutRow)
        ctr += 1

    for m in mutations:
        if (ctr % 1000) == 0:
            self.logger.info("Processed " + str(ctr) + " mutations")
        mutRow = self._createMutRow(m)

        # We may not render all rows.
        if mutRow is not None:
            tsvWriter.writerow(mutRow)
        else:
            unrenderableRows += 1
        ctr += 1

    self.logger.info("Processed all " + str(ctr) + " mutations. Could not render: " + str(unrenderableRows))
def determineOtherOptions(args, logger):
    """
    Build the dictionary of additional options from the parsed arguments.

    :param args: parsed arguments; ``prepend``, ``infer_genotypes``,
        ``input_format`` and ``output_format`` are read
    :param logger: receives a warning when genotype inference is requested
        for a VCF-to-VCF run
    :return: dict keyed by OptionConstants values
    """
    opts = dict()
    opts[OptionConstants.NO_PREPEND] = not args.prepend
    opts[OptionConstants.VCF_OUT_INFER_GENOTYPES] = MutUtils.str2bool(args.infer_genotypes)

    if args.input_format == "VCF" and args.output_format == "VCF":
        if opts[OptionConstants.VCF_OUT_INFER_GENOTYPES]:
            # Logger.warn is a deprecated alias; warning() is the supported spelling.
            logger.warning("Infer genotypes option has been set to true. "
                           "Because the input is a VCF file, infer genotypes will have no effect on the output.")
    return opts
def determineOtherOptions(args):
    """
    Build the dictionary of additional options from the parsed arguments.

    :param args: parsed arguments; ``prepend``, ``infer_genotypes``,
        ``infer_onps`` and ``canonical_tx_file`` are read
    :return: dict keyed by OptionConstants values
    """
    return {
        OptionConstants.NO_PREPEND: not args.prepend,
        OptionConstants.VCF_OUT_INFER_GENOTYPES: MutUtils.str2bool(args.infer_genotypes),
        OptionConstants.INFER_ONPS: args.infer_onps,
        OptionConstants.CUSTOM_CANONICAL_TX_LIST_FILE: args.canonical_tx_file,
    }
def _fixVal(self, val, isSplit):
    """
    Replace problematic characters in a value and pass it through self._correct.

    :param val: value to sanitize
    :param isSplit: when true, "," is among the replaced characters and the
        value is corrected as a single item; otherwise ":" is among the
        replaced characters, "," is left alone and the value is split on ","
        before correction
    :return: result of self._correct on the list of pieces
    """
    # NOTE(review): replaceChrs presumably maps each character of the second
    # argument to the character at the same position of the third -- confirm.
    if isSplit:
        # "," is replaced here; ":" is not in the replaced set
        cleaned = MutUtils.replaceChrs(val, ",=;\n\t ", "|~|#__")
        pieces = [cleaned]
    else:
        # ":" is replaced here; "," is not in the replaced set and acts as the separator
        cleaned = MutUtils.replaceChrs(val, "=;\n\t :", "~|#__>")
        pieces = cleaned.split(",")
    return self._correct(pieces)
def testRetrievePrecedingBaseFromAnnotationForInsertions(self):
    """ Round-trip an insertion through the preceding-bases annotation and check it is restored. """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"

    # Same scenario twice: a one-base and a two-base insertion after "GTC".
    for alt_allele in ("GTCT", "GTCTT"):
        mut = MutationDataFactory.default_create(chrom, start, end, ref_allele, alt_allele, build)
        stripped = MutUtils.retrievePrecedingBasesForInsertions(mut)
        preceding_bases, updated_alt_allele, updated_start, updated_end = stripped

        # Put the mutation into its "stripped" form and remember the removed bases.
        mut.ref_allele = "-"
        mut.alt_allele = updated_alt_allele
        mut.start = updated_start
        mut.end = updated_end
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue=preceding_bases)

        restored = MutUtils.retrievePrecedingBaseFromAnnotationForInsertions(mut)
        updated_ref_allele, updated_alt_allele, updated_start = restored

        self.assertTrue(updated_start == start,
                        "Mut start should be %s but was %s." % (start, updated_start))
        self.assertTrue(updated_ref_allele == ref_allele,
                        "Ref allele should be %s but was %s." % (ref_allele, updated_ref_allele))
        self.assertTrue(updated_alt_allele == alt_allele,
                        "Alt allele should be %s but was %s." % (alt_allele, updated_alt_allele))
def _createMutRow(self,m): if m is None: return None # Calculate values # TODO: This is a sloppy way to do GT n_lod = self._extract_lod(m, 'init_n_lod') t_lod = self._extract_lod(m, 't_lod_fstar') # Nothing to call if both LODs are too low. if (t_lod < 0) and (n_lod < 0): return qual = max(t_lod, 0) gtN, gtT = self.genotype(n_lod, t_lod) ref,alt,new_start = MutUtils.retrievePrecedingBaseFromReference(m) ss,ssCode = self.determineSomaticStatus(gtN, gtT, qual) if ss is None or ssCode is None: return n_alt_count = self._get_annotation_value(m, 'n_alt_count', '0', is_blank_default=True) n_ref_count = self._get_annotation_value(m, 'n_ref_count', '0', is_blank_default=True) n_alt_sum = self._get_annotation_value(m, 'n_alt_sum', '0', is_blank_default=True) t_alt_count = self._get_annotation_value(m, 't_alt_count', '0', is_blank_default=True) t_ref_count = self._get_annotation_value(m, 't_ref_count', '0', is_blank_default=True) t_alt_sum = self._get_annotation_value(m, 't_alt_sum', '0', is_blank_default=True) mq0=0 normalFormat = self._generateFormatFieldWithValues(gtN, n_alt_count, n_ref_count, mq0, self._generateBQ(n_alt_count, n_alt_sum), ssCode) primaryFormat = self._generateFormatFieldWithValues(gtT, t_alt_count, t_ref_count, mq0, self._generateBQ(t_alt_count, t_alt_sum), ssCode) filterVal = self._generateFilterField(m) info = self._generateInfoField(m,filterVal, mq0, ss) #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL PRIMARY mutRow = dict() mutRow['CHROM'] = self._renderChrom(m.chr) mutRow['POS'] = new_start mutRow['ID'] = self.renderID(m) mutRow['REF'] = ref mutRow['ALT'] = alt mutRow['QUAL'] = qual mutRow['FILTER'] = filterVal mutRow['INFO'] = info mutRow['FORMAT'] = self._generateFormatField() mutRow['NORMAL'] = normalFormat mutRow['PRIMARY'] = primaryFormat return mutRow
def _createMutation(self, record, alt_index, build):
    """
    Create a mutation from one alternate allele of a VCF record.

    :param record: VCF record; CHROM, POS, REF, ALT, ID, QUAL and
        is_monomorphic are read
    :param alt_index: index into ``record.ALT`` of the allele to use
    :param build: genome build passed on to the mutation
    :return: mutation with id/qual/alt_allele_seen annotations plus the
        filter and info data added by the corresponding helpers
    """
    chrom = MutUtils.convertChromosomeStringToMutationDataFormat(record.CHROM)
    startPos = int(record.POS)
    endPos = int(record.POS)

    # A "." reference is treated as empty.
    ref = record.REF
    if ref == ".":
        ref = ""

    # Monomorphic records have no alternate allele, so alt mirrors ref.
    if record.is_monomorphic:
        alt = ref
    else:
        alt = str(record.ALT[alt_index])

    mut = MutUtils.initializeMutFromAttributes(chrom, startPos, endPos, ref, alt, build)

    record_id = record.ID if record.ID is not None else ""
    mut.createAnnotation("id", record_id, "INPUT", tags=[TagConstants.ID])
    mut.createAnnotation("qual", str(record.QUAL), "INPUT", tags=[TagConstants.QUAL])
    mut.createAnnotation("alt_allele_seen", str(True), "INPUT")

    mut = self._addFilterData2Mutation(mut, record)
    return self._addInfoData2Mutation(mut, record, alt_index)
def _correctFieldName(self, fieldName):
    """
    Replaces unwanted characters in the field name and drops a trailing "__FORMAT__".

    :param fieldName: raw field name
    :return: corrected field name
    """
    suffix = "__FORMAT__"
    # Replace whitespace and other characters
    cleaned = MutUtils.replaceChrs(fieldName, "=; :", "~|_>")
    if not cleaned.endswith(suffix):
        return cleaned
    # Drop "__FORMAT__" from the end
    return cleaned[:len(cleaned) - len(suffix)]
def determineOtherOptions(args):
    """
    Build the dictionary of additional options from the parsed arguments.

    :param args: parsed arguments; each option below is read from the
        attribute shown next to it
    :return: dict keyed by OptionConstants values
    """
    return {
        OptionConstants.NO_PREPEND: not args.prepend,
        OptionConstants.VCF_OUT_INFER_GENOTYPES: MutUtils.str2bool(args.infer_genotypes),
        OptionConstants.INFER_ONPS: args.infer_onps,
        OptionConstants.CUSTOM_CANONICAL_TX_LIST_FILE: args.canonical_tx_file,
        OptionConstants.COLLAPSE_FILTER_COLS: args.collapse_filter_cols,
        OptionConstants.REANNOTATE_TCGA_MAF_COLS: args.reannotate_tcga_maf_cols,
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: args.allow_overwriting,
        OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: args.collapse_number_annotations,
    }
def testRetrieveMissingAnnotations(self):
    """ Test simple case. """
    m = MutationDataFactory.default_create()
    for name in ("a1", "a2", "a3", "a4"):
        m.createAnnotation(name, "1")

    # Everything requested is present: nothing should be reported missing.
    result = MutUtils.retrieveMissingAnnotations(m, ["a3", "a2"])
    self.assertIsNotNone(result)
    self.assertTrue(len(result) == 0, "Result was not empty: " + str(result))

    # Three unknown names mixed with one known name.
    result = MutUtils.retrieveMissingAnnotations(m, ["zztop", "a1", "blah", "dummy"])
    self.assertTrue(result[0] == "blah", "Result was not sorted")
    all_reported = all(name in result for name in ("blah", "dummy", "zztop"))
    self.assertTrue(all_reported, "Incorrect elements (Truth: [zztop, blah, dummy]): " + str(result))
def testProteinChange(self):
    """ Test that protein change parsing of start and end works. """
    # Each tuple is test, ground truth
    testInOuts = [
        ("p.K128_R130del", ['128', '130']),
        ("p.W274G", ["274", "274"]),
        ("p.13_14AA>A", ["13", "14"]),
        ("p.G25_splice", ["25", "25"]),
        ("p.E813*", ["813", "813"]),
        ("p.SLPQPEQRPY59del", ["59", "59"]),
    ]
    for change, truth in testInOuts:
        result = MutUtils.extractProteinPosition(change)
        self.assertTrue(result != ['', ''], "Result was empty. " + str(change) + ". ")
        positions_match = result[0] == truth[0] and result[1] == truth[1]
        self.assertTrue(positions_match,
                        "Result did not match for " + str(change) + ". " + str(result) + " GT: " + str(truth))

    # An unparseable string yields a pair of empty strings.
    self.assertTrue(MutUtils.extractProteinPosition("blahblah") == ['', ''])
def _calculate_protein_sequence(self, exons, seq, cds_start_genomic_space, cds_stop_genomic_space, strand):
    """
    Produce the protein sequence for the coding region of a transcript.

    Steps: convert the genomic CDS bounds with
    TranscriptProviderUtils._convert_genomic_space_to_feature_space, slice
    ``seq`` with the converted bounds, translate the slice with
    MutUtils.translate_sequence, then strip one trailing '*' if present.

    :return: translated sequence without a trailing '*'
    """
    converted = TranscriptProviderUtils._convert_genomic_space_to_feature_space(
        int(cds_start_genomic_space), int(cds_stop_genomic_space), exons, strand)
    first, last = converted
    translated = MutUtils.translate_sequence(seq[int(first):int(last)])

    if len(translated) == 0:
        return translated
    if translated[-1] == '*':
        return translated[:-1]
    return translated
def test_previous_release_download(self):
    """Download an older ensembl transcript package.

    This test needs an internet connection and will fail w/o one.
    """
    download_dir = "out/test_ensembl_download_previous/"
    release_num = "68"

    # Start from a clean download directory.
    MutUtils.removeDir(download_dir)
    os.mkdir(download_dir)
    GenomeBuildInstallUtils.download_reference_data_from_ensembl(
        download_dir, "saccharomyces_cerevisiae", release=release_num)

    # The first downloaded file whose name carries the release number and ".cdna.".
    marker = "." + release_num + ".cdna."
    transcript_file = next(
        (name for name in os.listdir(download_dir) if marker in name), None)
    self.assertIsNotNone(transcript_file)

    statinfo = os.stat(download_dir + transcript_file)
    self.assertTrue(statinfo.st_size > 0,
                    "downloaded transcript file (" + transcript_file + ") is empty.")
def _correctFieldName(self, fieldName):
    """
    Replaces unwanted characters in the field name and drops one trailing
    "__FORMAT__" or "__INFO__" marker.

    :param fieldName: raw field name
    :return: corrected field name
    """
    # Replace whitespace and other characters
    cleaned = MutUtils.replaceChrs(fieldName, "=; :", "~|_>")

    # Only the first matching marker is removed; "__FORMAT__" is checked first.
    for marker in ("__FORMAT__", "__INFO__"):
        if cleaned.endswith(marker):
            return cleaned[:len(cleaned) - len(marker)]
    return cleaned
def testRetrieveMissingAnnotations(self):
    """ Test simple case. """
    m = MutationData()
    for present in ("a1", "a2", "a3", "a4"):
        m.createAnnotation(present, "1")

    # Both requested annotations exist, so the result must be empty (not None).
    requested = ["a3", "a2"]
    result = MutUtils.retrieveMissingAnnotations(m, requested)
    self.assertIsNotNone(result)
    self.assertTrue(len(result) == 0, "Result was not empty: " + str(result))

    # "a1" exists; the other three are expected back, "blah" first.
    requested = ["zztop", "a1", "blah", "dummy"]
    result = MutUtils.retrieveMissingAnnotations(m, requested)
    self.assertTrue(result[0] == "blah", "Result was not sorted")
    expected_missing = ("blah", "dummy", "zztop")
    self.assertTrue(all(name in result for name in expected_missing),
                    "Incorrect elements (Truth: [zztop, blah, dummy]): " + str(result))
def testProteinChange(self):
    """ Test that protein change parsing of start and end works. """
    # Maps each protein change to its ground-truth [start, end]
    cases = (
        ("p.K128_R130del", ['128', '130']),
        ("p.W274G", ["274", "274"]),
        ("p.13_14AA>A", ["13", "14"]),
        ("p.G25_splice", ["25", "25"]),
        ("p.E813*", ["813", "813"]),
        ("p.SLPQPEQRPY59del", ["59", "59"]),
    )
    for protein_change, (gt_start, gt_end) in cases:
        ground_truth = [gt_start, gt_end]
        result = MutUtils.extractProteinPosition(protein_change)
        self.assertTrue(result != ['', ''],
                        "Result was empty. " + str(protein_change) + ". ")
        self.assertTrue(
            result[0] == gt_start and result[1] == gt_end,
            "Result did not match for " + str(protein_change) + ". " + str(result) + " GT: " + str(ground_truth))

    # Input that is not a protein change gives two empty strings.
    self.assertTrue(MutUtils.extractProteinPosition("blahblah") == ['', ''])
def _yieldPartitions(self, iterable, func, fieldnameIndexes, length): """ This method parses a set of lines for a partition, applies an anonymous function that converts each line of the partition to a key-value pair where key is of type tuple, sorts the key-value pairs on the keys and then yields the partition. Through this method, we obtain several sorted chunks. :param iterable: lines of text :param func: function that converts each row of the input file to an unique key :param fieldnameIndexes: dictionary of fieldnames and corresponding indexes :param length: determines the number of lines in the buffer """ isKeyTuple = False # Take the first "length" number of items and return them as list. lines = list(itertools.islice(iterable, length)) data = collections.OrderedDict() while len(lines) > 0: pairs = [None] * len(lines) # Create a list of (key, value) pairs # Each key consists of a tuple, value is the corresponding text # Note: CSV dictionary reader is NOT used because a chunk of text is parsed at a time, rather than a # line of text for i in xrange(len(lines)): # Note: CSV dictionary reader is NOT used because a chunk of text is parsed at a time, rather than a # line of text line = lines[i] tokens = MutUtils.getTokens(line, self.delimiter, self.lineterminator) for fieldname, index in fieldnameIndexes.items(): data[fieldname] = tokens[index] key = func(data) if not isKeyTuple: isKeyTuple = isinstance(key, tuple) if not isKeyTuple: raise CallbackException( "The value returned by the callback must be a tuple. Instead, a value " "of %s was returned." % (type(key))) pairs[i] = self._Pair(key, line) partition = sorted(pairs, key=operator.attrgetter("key")) lines = list(itertools.islice(iterable, length)) yield partition
def get_protein_seq(self, transcript_id):
    """
    Translate the coding region of a transcript.

    :param transcript_id: id passed to self.get_transcript and
        self.get_transcript_seq
    :return: the translated sequence without a trailing '*', or None when
        the transcript record or its sequence is missing/empty or the record
        has no (truthy) 'cds_start'
    """
    gaf_record = self.get_transcript(transcript_id)
    tx_seq = self.get_transcript_seq(transcript_id)
    if not gaf_record or not tx_seq:
        return None

    if 'cds_start' not in gaf_record or not gaf_record['cds_start']:
        return None

    # 'cds_start' is shifted by one for slicing (presumably it is 1-based -- confirm).
    prot_seq = MutUtils.translate_sequence(
        tx_seq[gaf_record['cds_start'] - 1:gaf_record['cds_stop']])

    # Guard against an empty translation before looking at the last symbol.
    if len(prot_seq) > 0 and prot_seq[-1] == '*':
        prot_seq = prot_seq[:-1]
    return prot_seq
def _determineHeaders(self, mut, metadata): if mut is None: headers = [] else: headers = MutUtils.getAllAttributeNames(mut) if len(headers) == 0: headers = metadata.keys() # Remove headers that start with "_" for header in headers: if header.startswith("_"): headers.remove(header) return headers
def determineOtherOptions(args):
    """
    Build the dictionary of additional options from the parsed arguments.

    ``infer_genotypes`` is converted with MutUtils.str2bool and ``prepend``
    is negated; every other value is copied from ``args`` unchanged.

    :param args: parsed arguments
    :return: dict keyed by OptionConstants values
    """
    infer_genotypes = MutUtils.str2bool(args.infer_genotypes)
    option_values = (
        (OptionConstants.NO_PREPEND, not args.prepend),
        (OptionConstants.VCF_OUT_INFER_GENOTYPES, infer_genotypes),
        (OptionConstants.INFER_ONPS, args.infer_onps),
        (OptionConstants.CUSTOM_CANONICAL_TX_LIST_FILE, args.canonical_tx_file),
        (OptionConstants.COLLAPSE_FILTER_COLS, args.collapse_filter_cols),
        (OptionConstants.REANNOTATE_TCGA_MAF_COLS, args.reannotate_tcga_maf_cols),
        (OptionConstants.ALLOW_ANNOTATION_OVERWRITING, args.allow_overwriting),
        (OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS, args.collapse_number_annotations),
    )
    return dict(option_values)
def testGenotypeFieldIsHonored(self):
    """ Tests that no issues arise with genotype values >1 when multiple variants appear on one line. """
    inputFilename = os.path.join("testdata", "vcf", "example.severalGTs.vcf")
    creator = VcfInputMutationCreator(inputFilename)

    seen_with_alt = 0
    for mut in creator.createMutations():
        if not MutUtils.str2bool(mut["alt_allele_seen"]):
            continue
        # Only mutations whose alt allele was seen are checked and counted.
        self.assertTrue(mut['sample_name'] != "NA 00001")
        seen_with_alt += 1

    self.assertTrue(seen_with_alt == 7,
                    str(seen_with_alt) + " mutations with alt seen, but expected 7. './.' should not show as a variant.")