Пример #1
0
    def testChrom2HashCodeTable(self):
        """Check the hash codes assigned by MutUtils.createChrom2HashCodeTable.

        Two chromosome lists are exercised; for each one, every expected
        (chromosome, hash code) pair is compared against the returned table.
        """
        scenarios = [
            (["1", "X", "3", "contig1", "Y", "25", "mt"],
             [("1", 1), ("3", 3), ("25", 25), ("X", 26), ("Y", 27),
              ("mt", 28), ("contig1", 29)]),
            (["contig1", "mt"],
             [("mt", 3), ("contig1", 4)]),
        ]

        for chrom_names, expected_codes in scenarios:
            table = MutUtils.createChrom2HashCodeTable(chrom_names)
            for chrom_name, code in expected_codes:
                self.assertTrue(
                    table[chrom_name] == code,
                    "For chrom %s, hash code should be %s but it was %s." %
                    (chrom_name, code, table[chrom_name]))
Пример #2
0
    def test_build_ensembl_transcript_index(self):
        """Build the gtf portion of the ensembl transcript db.

        Builds an index from the trimmed yeast GTF/FASTA test files, reopens it
        through Shove and checks that transcript YDR529C has a non-empty
        sequence, CDS entries and exons.
        """
        # The trimmed test inputs were produced with the following commands:
        # cat ~/oncotator_pycharm/oncotator/test/testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf | cut -f 9 | cut -f 5 --delimiter=" " | sort | uniq | sed -r "s/;//g" | sed -r "s/\"//g"
        #  snR84, tK(UUU)K, YAL067C, YAL067W-A, YAL068C, YAL068W-A, YAL069W, YBR278W, YBR279W, YBR280C, YBR281C, YDR528W, YDR529C, YKR074W,
        #
        # grep -Pzo  ">(snR84|tK\(UUU\)K|YAL067C|YAL067W-A|YAL068C|YAL068W-A|YAL069W|YBR278W|YBR279W|YBR280C|YBR281C|YDR528W|YDR529C|YKR074W)([A-Za-z_0-9 \:\-\n]+)" Saccharomyces_cerevisiae.EF4.71.cdna.all.fa >Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa
        #
        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        output_filename = "out/test_ensembl_gtf.db"
        protocol = "file"
        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], output_filename, protocol=protocol)
        self.assertTrue(os.path.exists(output_filename))

        shove = Shove(protocol + "://" + output_filename, "memory://")
        self.assertTrue(len(shove.keys()) > 0)
        self.assertIn("YDR529C", shove.keys())
        t = shove["YDR529C"]
        seq = t.get_seq()
        self.assertTrue(seq is not None)
        # Compare by value: an identity test against a string literal says
        # nothing about whether the sequence is empty.
        self.assertNotEqual(seq, "")
        self.assertTrue(len(t.get_cds()) > 0)
        self.assertTrue(len(t.get_exons()) > 0)
        MutUtils.removeDir(output_filename)
Пример #3
0
    def _createMutation(self, record, alt_index, build):
        """Build an annotated mutation from one alternate allele of a VCF record.

        :param record: VCF record; CHROM, POS, REF, ALT, ID, QUAL and
            is_monomorphic are read here
        :param alt_index: index into record.ALT of the alternate to use
            (ignored for monomorphic records)
        :param build: genome build passed to the mutation initializer
        :return: the mutation after id/qual/alt_allele_seen annotations and the
            filter and info data have been added
        """
        chromosome = MutUtils.convertChromosomeStringToMutationDataFormat(
            record.CHROM)
        # Start and end are both taken from the record's POS.
        position = int(record.POS)

        ref_allele = record.REF.strip()
        if ref_allele == ".":
            # A "." reference is represented as an empty allele.
            ref_allele = ""

        if record.is_monomorphic:
            # No alternate allele: reuse the reference as the alt.
            alt_allele = ref_allele
        else:
            alt_allele = str(record.ALT[alt_index]).strip()

        mutation = MutUtils.initializeMutFromAttributes(
            chromosome, position, position, ref_allele, alt_allele, build,
            self._mutation_data_factory)

        record_id = record.ID if record.ID is not None else ""
        mutation.createAnnotation("id", record_id, "INPUT",
                                  tags=[TagConstants.ID])
        mutation.createAnnotation("qual", str(record.QUAL), "INPUT",
                                  tags=[TagConstants.QUAL])
        mutation.createAnnotation("alt_allele_seen", str(True), "INPUT")

        # Filter data is either collapsed into a single field or added per filter.
        if self.collapse_filter_fields:
            add_filter_data = self._add_filter_data_2_mutation_single_field
        else:
            add_filter_data = self._addFilterData2Mutation
        mutation = add_filter_data(mutation, record)

        return self._addInfoData2Mutation(mutation, record, alt_index)
Пример #4
0
    def test_build_ensembl_transcript_index(self):
        """Build the gtf portion of the ensembl transcript db.

        Builds an index from the trimmed yeast GTF/FASTA test files, reopens it
        through Shove and checks that transcript YDR529C has a non-empty
        sequence, CDS entries and exons.
        """
        # The trimmed test inputs were produced with the following commands:
        # cat ~/oncotator_pycharm/oncotator/test/testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf | cut -f 9 | cut -f 5 --delimiter=" " | sort | uniq | sed -r "s/;//g" | sed -r "s/\"//g"
        #  snR84, tK(UUU)K, YAL067C, YAL067W-A, YAL068C, YAL068W-A, YAL069W, YBR278W, YBR279W, YBR280C, YBR281C, YDR528W, YDR529C, YKR074W,
        #
        # grep -Pzo  ">(snR84|tK\(UUU\)K|YAL067C|YAL067W-A|YAL068C|YAL068W-A|YAL069W|YBR278W|YBR279W|YBR280C|YBR281C|YDR528W|YDR529C|YKR074W)([A-Za-z_0-9 \:\-\n]+)" Saccharomyces_cerevisiae.EF4.71.cdna.all.fa >Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa
        #
        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        output_filename = "out/test_ensembl_gtf.db"
        protocol = "file"
        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], output_filename, protocol=protocol)
        self.assertTrue(os.path.exists(output_filename))

        shove = Shove(protocol + "://" + output_filename, "memory://")
        self.assertTrue(len(shove.keys()) > 0)
        self.assertIn("YDR529C", shove.keys())
        t = shove["YDR529C"]
        seq = t.get_seq()
        self.assertTrue(seq is not None)
        # Compare by value: an identity test against a string literal says
        # nothing about whether the sequence is empty.
        self.assertNotEqual(seq, "")
        self.assertTrue(len(t.get_cds()) > 0)
        self.assertTrue(len(t.get_exons()) > 0)
        MutUtils.removeDir(output_filename)
Пример #5
0
    def testRetrievePrecedingBaseFromAnnotationForInsertions(self):
        """Round-trip an insertion through the preceding-bases helpers.

        For each alternate allele, the preceding bases are retrieved, the
        mutation is rewritten with ref "-" plus the updated alt/start/end and
        the preceding-bases annotation, and then
        retrievePrecedingBaseFromAnnotationForInsertions must give back the
        original start, ref allele and alt allele.
        """
        chrom = "1"
        start = 1234567
        end = 1234567  # incorrect, but doesn't matter for the purposes of testing
        ref_allele = "GTC"
        build = "19"

        for alt_allele in ("GTCT", "GTCTT"):
            mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)
            (preceding_bases, stripped_alt, shifted_start,
             shifted_end) = MutUtils.retrievePrecedingBasesForInsertions(mut)

            mut.ref_allele = "-"
            mut.alt_allele = stripped_alt
            mut.start = shifted_start
            mut.end = shifted_end
            mut.createAnnotation(
                annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                annotationValue=preceding_bases)

            (restored_ref, restored_alt, restored_start) = \
                MutUtils.retrievePrecedingBaseFromAnnotationForInsertions(mut)

            self.assertTrue(
                restored_start == start,
                "Mut start should be %s but was %s." % (start, restored_start))
            self.assertTrue(
                restored_ref == ref_allele,
                "Ref allele should be %s but was %s." %
                (ref_allele, restored_ref))
            self.assertTrue(
                restored_alt == alt_allele,
                "Alt allele should be %s but was %s." %
                (alt_allele, restored_alt))
Пример #6
0
    def testChromosomeConversionHG19(self):
        """Test that an hg19 build with chrom = 23 or 24 gets converted to X or Y.

        Chromosomes 2 and 4 are also checked to come back unchanged. Each
        input is converted once and that single result is used both for the
        assertion and for the failure message.
        """
        for raw, expected in (("23", "X"), ("24", "Y")):
            converted = MutUtils.convertChromosomeStringToMutationDataFormat(raw, build="hg19")
            self.assertEqual(converted, expected,
                             "chrom of " + raw + " did not produce " + expected + ": " + converted)

        for raw in ("2", "4"):
            converted = MutUtils.convertChromosomeStringToMutationDataFormat(raw, build="hg19")
            self.assertEqual(converted, raw,
                             "chrom of " + raw + " yielded different value: " + converted)
 def testSimpleRead(self):
     """Read a well-formed maflite file and validate every mutation it yields.

     The test passes as long as neither reading nor validation raises.
     """
     creator = MafliteInputMutationCreator(
         "testdata/maflite/Patient0.indel.maf.txt",
         'configs/maflite_input.config')

     for mutation in creator.createMutations():
         MutUtils.validateMutation(mutation)
    def testExampleVcfDBAnnotationWithSNPExactMatch(self):
        """Annotate two chr20 SNPs with the vcf_db_exact datasource and compare ESP annotations.

        Each scenario builds an hg19 mutation, annotates it, and checks that
        the listed ESP_* annotations equal the expected Annotation objects.
        """
        datasource_dir = os.path.join("testdata", "vcf_db_exact", "hg19")
        datasource = DatasourceFactory.createDatasource(
            os.path.join(datasource_dir, "vcf_db_exact.config"), datasource_dir)

        # (position, ref, alt, [(annotation name, expected Annotation kwargs), ...])
        scenarios = [
            ("1110696", "A", "T", [
                ("ESP_AF", dict(value="0.667", datasourceName="ESP", dataType="Float",
                                description="Allele Frequency",
                                tags=[TagConstants.INFO, TagConstants.SPLIT],
                                number=-1)),
                ("ESP_AC", dict(value="2,4", datasourceName="ESP", dataType="Integer",
                                description="Allele Count",
                                tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                number=None)),
                ("ESP_H2", dict(value="False", datasourceName="ESP", dataType="Flag",
                                description="HapMap2 membership",
                                tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                number=0)),
            ]),
            ("1230237", "T", "A", [
                ("ESP_NS", dict(value="3", datasourceName="ESP", dataType="Integer",
                                description="Number of Samples With Data",
                                tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                number=1)),
                ("ESP_AF", dict(value="", datasourceName="ESP", dataType="Float",
                                description="Allele Frequency",
                                tags=[TagConstants.INFO, TagConstants.SPLIT],
                                number=-1)),
            ]),
        ]

        for position, ref_allele, alt_allele, expectations in scenarios:
            mutation = MutUtils.initializeMutFromAttributes(
                "20", position, position, ref_allele, alt_allele, "hg19")
            annotated = datasource.annotate_mutation(mutation)

            for annotation_name, expected_kwargs in expectations:
                observed = annotated.getAnnotation(annotation_name)
                expected = Annotation(**expected_kwargs)
                self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
Пример #9
0
    def _determine_matching_alt_indices(self, mut, record, build):
        """
        Collect the indices of the record's alternates that match the mutation.

        :param mut: mutation to match; chr, start, end, ref_allele and
            alt_allele are compared
        :param record: datasource VCF record; CHROM, POS, REF, ALT and
            is_monomorphic are read
        :param build: genome build used when constructing the comparison
            mutation for each alternate
        :return: list of matching indices into record.ALT, [-1] when a
            monomorphic record matches, or an empty list when nothing matches
        """
        if record.is_monomorphic:
            chromosome = MutUtils.convertChromosomeStringToMutationDataFormat(
                record.CHROM)
            position = record.POS

            if self.match_mode == "exact":
                # Exact mode compares chromosome and reference allele only.
                matched = (mut.chr == chromosome
                           and mut.ref_allele == record.REF)
            else:
                # Otherwise the mutation must span the record's position.
                matched = (mut.chr == chromosome
                           and int(mut.start) <= position
                           and int(mut.end) >= position)
            return [-1] if matched else []

        matches = []
        # Compare the mutation against every alternate in the record.
        for alt_index, alt_allele in enumerate(record.ALT):
            chromosome = MutUtils.convertChromosomeStringToMutationDataFormat(
                record.CHROM)
            ds_mut = MutUtils.initializeMutFromAttributes(
                chromosome, record.POS, record.POS, str(record.REF),
                str(alt_allele), build)

            if self.match_mode == "exact":
                is_match = (mut.chr == ds_mut.chr
                            and mut.ref_allele == ds_mut.ref_allele
                            and mut.alt_allele == ds_mut.alt_allele
                            and int(mut.start) == int(ds_mut.start)
                            and int(mut.end) == int(ds_mut.end))
            elif mut.chr == ds_mut.chr:
                mut_start, ds_start = int(mut.start), int(ds_mut.start)
                mut_end, ds_end = int(mut.end), int(ds_mut.end)
                # NOTE(review): a mutation lying strictly inside the datasource
                # interval satisfies none of these clauses -- confirm intended.
                is_match = (
                    # identical interval
                    (mut_start == ds_start and mut_end == ds_end)
                    # starts inside the datasource interval, ends at/after its end
                    or (ds_start <= mut_start <= ds_end and mut_end >= ds_end)
                    # fully covers the datasource interval
                    or (mut_start <= ds_start and mut_end >= ds_end)
                    # starts at/before the datasource start, ends inside it
                    or (mut_start <= ds_start <= mut_end and mut_end <= ds_end)
                )
            else:
                is_match = False

            if is_match:
                matches.append(alt_index)

        return matches
Пример #10
0
    def testRetrievePrecedingBasesForInsertions(self):
        """Check the values produced by MutUtils.retrievePrecedingBasesForInsertions.

        For ref "GTC" at 1234567 and alts "GTCT" / "GTCTT", the rewritten
        mutation must carry the preceding-bases annotation, start 1234569,
        end 1234570, ref "-" and alt "T" / "TT" respectively. Failure
        messages are derived from the same expected values that are asserted.
        """
        chrom = "1"
        start = 1234567
        end = 1234567  # incorrect, but doesn't matter for the purposes of testing
        ref_allele = "GTC"
        build = "19"
        expected_start = 1234569
        expected_end = 1234570

        for alt_allele, expected_alt in (("GTCT", "T"), ("GTCTT", "TT")):
            mut = MutationDataFactory.default_create(chrom, start, end,
                                                     ref_allele, alt_allele,
                                                     build)
            preceding_bases, updated_alt_allele, updated_start, updated_end = \
                MutUtils.retrievePrecedingBasesForInsertions(mut)
            mut.ref_allele = "-"
            mut.alt_allele = updated_alt_allele
            mut.start = updated_start
            mut.end = updated_end
            mut.createAnnotation(
                annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                annotationValue=preceding_bases)

            self.assertTrue("_preceding_bases" in mut,
                            "_preceding_bases is missing in the mutation data.")
            self.assertEqual(
                mut.start, expected_start,
                "Mut start should be %s but was %s." %
                (expected_start, mut.start))
            self.assertEqual(
                mut.end, expected_end,
                "Mut end should be %s but was %s." % (expected_end, mut.end))
            self.assertEqual(
                mut.ref_allele, "-",
                "Ref allele should be - but was %s." % mut.ref_allele)
            self.assertEqual(
                mut.alt_allele, expected_alt,
                "Alt allele should be %s but was %s." %
                (expected_alt, mut.alt_allele))
    def testSimpleRead(self):
        """Read a well-formed maflite file and validate every mutation it yields.

        The test passes as long as neither reading nor validation raises.
        """
        creator = MafliteInputMutationCreator(
            "testdata/maflite/Patient0.indel.maf.txt",
            None,
            'configs/maflite_input.config')

        for mutation in creator.createMutations():
            MutUtils.validateMutation(mutation)
Пример #12
0
 def testNoUnknownAnnotations(self):
     """Ensure the gaf 3.0 datasource never produces annotations whose source is Unknown.

     Every mutation from the SNP maflite file is annotated, validated, and
     then checked for unknown annotations.
     """
     creator = MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt')
     datasource = TestUtils.createTranscriptProviderDatasource(self.config)

     for mutation in creator.createMutations():
         annotated = datasource.annotate_mutation(mutation)
         MutUtils.validateMutation(annotated)
         unknown = MutUtils.getUnknownAnnotations(annotated)
         self.assertTrue(
             len(unknown) == 0,
             "Unknown annotations exist in mutation: " + str(unknown))
Пример #13
0
    def renderMutations(self, mutations, metadata=None, comments=None):
        """Write the header and one tab-separated row per renderable mutation.

        The first mutation is consumed up front because the header and the
        field mapping are built from it. When there are no mutations, the
        header is built from ``metadata.asDict()`` instead and no rows are
        written.

        :param mutations: iterator of mutations to render
        :param metadata: only used (via ``asDict()``) when ``mutations`` is empty
        :param comments: comments passed to the header writer; defaults to []
        """
        if comments is None:
            comments = []

        outputHeaders = [
            'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT', 'NORMAL', 'PRIMARY'
        ]

        # Peek at the first mutation; None means no variants were specified.
        first = next(mutations, None)

        if first is not None:
            fp = self._createVcfHeaderFilePtr(comments, first)

            fieldsUsed = self.alternativeDictionary.keys()
            annotations = MutUtils.getAllAttributeNames(first)
            self.fieldMap = MutUtils.createFieldsMapping(
                fieldsUsed, annotations, self.alternativeDictionary, True)
        else:
            fp = self._createVcfHeaderFilePtr(comments, metadata.asDict())

        # NOTE(review): fp is never closed in this method -- confirm that the
        # object returned by _createVcfHeaderFilePtr is released elsewhere.
        tsvWriter = csv.DictWriter(fp,
                                   outputHeaders,
                                   delimiter="\t",
                                   lineterminator="\n")

        # Write each row:
        ctr = 0
        unrenderableRows = 0

        if first is not None:
            # The peeked mutation is counted like every other one, so the
            # final summary stays accurate even when it cannot be rendered.
            mutRow = self._createMutRow(first)
            if mutRow is not None:
                tsvWriter.writerow(mutRow)
            else:
                unrenderableRows += 1
            ctr += 1

        for m in mutations:
            if (ctr % 1000) == 0:
                self.logger.info("Processed " + str(ctr) + " mutations")
            mutRow = self._createMutRow(m)

            # We may not render all rows.
            if mutRow is not None:
                tsvWriter.writerow(mutRow)
            else:
                unrenderableRows += 1
            ctr += 1
        self.logger.info("Processed all " + str(ctr) +
                         " mutations.  Could not render: " +
                         str(unrenderableRows))
    def testCreateGPTsvDatasource(self):
        """Install a gp_tsv datasource into a temp directory and verify its config and md5.

        The generated ORegAnno.config must contain a "general" section whose
        type, src_file, title, version and genomic_position_cols options equal
        the values passed to DatasourceInstallUtils.create_datasource, and an
        md5 file must exist next to the genome build directory. The temporary
        directory is removed even when an assertion fails.
        """
        datasourceFilename = "testdata/small_genome_position_tsv_ds/oreganno_trim.hg19.txt"
        datasourceType = "gp_tsv"
        datasourceName = "ORegAnno"
        datasourceFoldername = "ORegAnno"
        datasourceVersion = "UCSC Track"
        genomeBuild = "hg19"
        genomicPositionColumnNames = "hg19.oreganno.chrom,hg19.oreganno.chromStart,hg19.oreganno.chromEnd"

        tmpDir = tempfile.mkdtemp()
        try:
            destDir = os.path.join(tmpDir, datasourceFoldername, genomeBuild)
            os.makedirs(destDir)

            DatasourceInstallUtils.create_datasource(destDir, datasourceFilename, datasourceFoldername, datasourceName,
                                                     datasourceType, datasourceVersion, genomicPositionColumnNames)

            # The config stores only the base name of the source file.
            datasourceFilename = "oreganno_trim.hg19.txt"
            expectedOptions = [
                ("type", datasourceType),
                ("src_file", datasourceFilename),
                ("title", datasourceName),
                ("version", datasourceVersion),
                ("genomic_position_cols", genomicPositionColumnNames),
            ]

            configFilename = os.path.join(destDir, "ORegAnno.config")
            configParser = ConfigUtils.createConfigParser(configFilename)
            self.assertTrue(configParser.has_section("general"), "general section is missing.")
            for option, _ in expectedOptions:
                self.assertTrue(configParser.has_option("general", option),
                                "%s option is missing in general section." % option)

            for option, expected in expectedOptions:
                actual = configParser.get("general", option)
                self.assertEqual(actual, expected,
                                 "Expected data source %s is %s but was %s."
                                 % (option, expected, actual))

            self.assertTrue(os.path.exists(os.path.join(tmpDir, datasourceFoldername, genomeBuild + ".md5")),
                            "No md5 file was generated.")
        finally:
            # Always clean up the scratch directory, including on failure.
            MutUtils.removeDir(tmpDir)
Пример #15
0
 def testChrGLs(self):
     """Check that mutations on unaligned contigs (chromosome = GL.....) can be annotated.

     Any exception raised while annotating or validating is converted into a
     test failure that includes the traceback; each annotated mutation must
     also end up with a non-empty 'gene' value.
     """
     creator = MafliteInputMutationCreator('testdata/maflite/chrGLs.maf.tsv', "configs/maflite_input.config")
     datasource = TestUtils.createTranscriptProviderDatasource(self.config)

     for mutation in creator.createMutations():
         try:
             mutation = datasource.annotate_mutation(mutation)
             MutUtils.validateMutation(mutation)
         except Exception as error:
             # Fail this test because an exception was thrown
             failure_text = "Erroneous exception was thrown: " + str(error) + "\n" + traceback.format_exc()
             self.assertTrue(False, failure_text)
         self.assertTrue(mutation['gene'] != '')
Пример #16
0
 def _handleMissingAnnotations(self, m):
     """Check a mutation for the annotations a TCGA VCF 1.1 rendering relies on.

     Missing header annotations are fatal; missing mutation annotations are
     only reported as a warning.

     :param m: mutation whose annotations are checked
     :raises MissingAnnotationException: when any annotation listed in
         TcgaVcfOutputRenderer.requiredHeaderAnnotations is missing
     """
     missingHeaderAnnotations = MutUtils.retrieveMissingAnnotations(m,
                                                                    TcgaVcfOutputRenderer.requiredHeaderAnnotations)
     missingMutAnnotations = MutUtils.retrieveMissingAnnotations(m, TcgaVcfOutputRenderer.requiredMutAnnotations)
     if len(missingHeaderAnnotations) > 0:
         sError = "The following annotations are required for rendering a TCGA VCF 1.1, but were not found: " + str(
             missingHeaderAnnotations)
         self.logger.error(sError)
         raise MissingAnnotationException(sError)
     if len(missingMutAnnotations) > 0:
         sError = "The following annotations important for rendering a TCGA VCF 1.1.  Proceeding... : " + str(
             missingMutAnnotations)
         # Logger.warn is a deprecated alias; warning() is the supported name.
         self.logger.warning(sError)
    def testCreateIndexedVcfDatasource(self):
        """Install an indexed_vcf datasource into a temp directory and verify the result.

        The generated 1000Genomes.config must contain a "general" section whose
        type, src_file, title and version options match the install arguments;
        an md5 file and the tabix-indexed VCF must exist; and every record
        fetched at 20:1230237 must have NS == 3 and DP == 13. The temporary
        directory is removed even when an assertion fails.
        """
        datasourceFilename = "testdata/vcf/example.vcf"
        datasourceFoldername = "1000Genomes"
        datasourceName = "1000Genomes"
        datasourceType = "indexed_vcf"
        datasourceVersion = "V4.1"
        genomeBuild = "hg19"

        tmpDir = tempfile.mkdtemp()
        try:
            destDir = os.path.join(tmpDir, datasourceFoldername, genomeBuild)
            os.makedirs(destDir)

            DatasourceInstallUtils.create_datasource(destDir, datasourceFilename, datasourceFoldername, datasourceName,
                                                     datasourceType, datasourceVersion)

            # The config is expected to reference the tabix-indexed copy.
            datasourceFilename = "example.tabix_indexed.vcf.gz"
            expectedOptions = [
                ("type", datasourceType),
                ("src_file", datasourceFilename),
                ("title", datasourceName),
                ("version", datasourceVersion),
            ]

            configFilename = os.path.join(destDir, "1000Genomes.config")
            configParser = ConfigUtils.createConfigParser(configFilename)
            self.assertTrue(configParser.has_section("general"), "general section is missing.")
            for option, _ in expectedOptions:
                self.assertTrue(configParser.has_option("general", option),
                                "%s option is missing in general section." % option)

            for option, expected in expectedOptions:
                actual = configParser.get("general", option)
                self.assertEqual(actual, expected,
                                 "Expected data source %s is %s but was %s."
                                 % (option, expected, actual))

            self.assertTrue(os.path.exists(os.path.join(tmpDir, datasourceFoldername, genomeBuild + ".md5")),
                            "No md5 file was generated.")

            # Data source was created correctly
            tabixIndexedFilename = os.path.join(destDir, "example.tabix_indexed.vcf.gz")
            self.assertTrue(os.path.exists(tabixIndexedFilename), "No index file was generated.")

            vcfReader = vcf.Reader(filename=tabixIndexedFilename, compressed=True, strict_whitespace=True)
            vcfRecords = vcfReader.fetch(chrom=20, start=1230237, end=1230237)
            # NOTE(review): if fetch yields no records these checks pass vacuously.
            for vcfRecord in vcfRecords:
                self.assertEqual(vcfRecord.INFO["NS"], 3, "Expected %s but got %s." % (3, vcfRecord.INFO["NS"]))
                self.assertEqual(vcfRecord.INFO["DP"], 13, "Expected %s but got %s." % (13, vcfRecord.INFO["DP"]))
        finally:
            # Always clean up the scratch directory, including on failure.
            MutUtils.removeDir(tmpDir)
Пример #18
0
    def _determine_matching_alt_indices(self, mut, record, build):
        """
        Collect the indices of the record's alternates that match the mutation.

        :param mut: mutation to match; chr, start, end, ref_allele and
            alt_allele are compared
        :param record: datasource VCF record; CHROM, POS, REF, ALT and
            is_monomorphic are read
        :param build: genome build used when constructing the comparison
            mutation for each alternate
        :return: list of matching indices into record.ALT, [-1] when a
            monomorphic record matches, or an empty list when nothing matches
        """
        if record.is_monomorphic:
            chromosome = MutUtils.convertChromosomeStringToMutationDataFormat(record.CHROM)
            position = record.POS

            if self.match_mode == "exact":
                # Exact mode compares chromosome and reference allele only.
                matched = mut.chr == chromosome and mut.ref_allele == record.REF
            else:
                # Otherwise the mutation must span the record's position.
                matched = (mut.chr == chromosome
                           and int(mut.start) <= position
                           and int(mut.end) >= position)
            return [-1] if matched else []

        matches = []
        # Compare the mutation against every alternate in the record.
        for alt_index, alt_allele in enumerate(record.ALT):
            chromosome = MutUtils.convertChromosomeStringToMutationDataFormat(record.CHROM)
            ds_mut = MutUtils.initializeMutFromAttributes(chromosome, record.POS, record.POS, str(record.REF),
                                                          str(alt_allele), build)

            if self.match_mode == "exact":
                is_match = (mut.chr == ds_mut.chr
                            and mut.ref_allele == ds_mut.ref_allele
                            and mut.alt_allele == ds_mut.alt_allele
                            and int(mut.start) == int(ds_mut.start)
                            and int(mut.end) == int(ds_mut.end))
            elif mut.chr == ds_mut.chr:
                mut_start, ds_start = int(mut.start), int(ds_mut.start)
                mut_end, ds_end = int(mut.end), int(ds_mut.end)
                # NOTE(review): a mutation lying strictly inside the datasource
                # interval satisfies none of these clauses -- confirm intended.
                is_match = (
                    # identical interval
                    (mut_start == ds_start and mut_end == ds_end)
                    # starts inside the datasource interval, ends at/after its end
                    or (ds_start <= mut_start <= ds_end and mut_end >= ds_end)
                    # fully covers the datasource interval
                    or (mut_start <= ds_start and mut_end >= ds_end)
                    # starts at/before the datasource start, ends inside it
                    or (mut_start <= ds_start <= mut_end and mut_end <= ds_end)
                )
            else:
                is_match = False

            if is_match:
                matches.append(alt_index)

        return matches
Пример #19
0
    def createMutations(self):
        """ No inputs.
        Returns a generator of mutations built from the specified maflite file.

        One mutation is created per row of ``self._tsvReader``.  Every column
        listed in ``self._specified_fields`` is stored on the mutation as an
        'INPUT' annotation; a column that is a key of
        ``self._reverseAlternativeDict`` is stored under the name it maps to.
        Values stored under "chr" are first passed through
        ``MutUtils.convertChromosomeStringToMutationDataFormat``. """

        # Membership is tested against the dictionary itself rather than a
        # keys() list, which is a linear scan per column on Python 2.
        aliases = self._reverseAlternativeDict
        allColumns = self._specified_fields

        for line in self._tsvReader:

            # We only need to assign fields that are mutation attributes and have a different name in the maflite file.
            mut = self._mutation_data_factory.create(build=self._build)

            for col in allColumns:
                # Three scenarios:
                #   1) col is name of mutation data field -- simple createAnnotation
                #   2) col name is an alias for a mutation data field -- do lookup then createAnnotation
                #   3) col name is not an alias for a mutation data field -- simple createAnnotation
                if col in aliases:
                    realKey = aliases[col]
                    self.logger.debug(realKey + " found from " + col)
                else:
                    # Scenario 1 and 3
                    realKey = col

                # Make sure to convert chromosome values.
                val = line[col]
                if realKey == "chr":
                    val = MutUtils.convertChromosomeStringToMutationDataFormat(val)
                mut.createAnnotation(realKey, val, 'INPUT')

            # Remove any surrounding whitespace from the alleles.
            mut.ref_allele, mut.alt_allele = mut.ref_allele.strip(), mut.alt_allele.strip()

            # if the alt allele == ref_allele, check that this is not a case where there is an alt_allele2 that is different.
            if mut.alt_allele == mut.ref_allele:
                mut.alt_allele = self._find_alt_allele_in_other_field(
                    line, mut.ref_allele)

            # FIXME: Support more than one alias in the reverse dictionary.  Then this line can be removed.
            # Compare by value: identity checks against a "" literal depend on
            # string interning and miss equal-but-distinct empty strings.
            if mut.start != "" and mut.end == "":
                mut.end = mut.start
            if mut.end != "" and mut.start == "":
                mut.start = mut.end

            yield mut
Пример #20
0
 def _handleMissingAnnotations(self, m):
     """Report annotations that a TCGA VCF 1.1 rendering expects but m lacks.

     Missing header annotations are logged as an error and raised as a
     MissingAnnotationException.  Missing mutation annotations are only
     logged as a warning and rendering carries on.

     :param m: object handed to MutUtils.retrieveMissingAnnotations
     :raise MissingAnnotationException: when a required header annotation is absent
     """
     renderer = TcgaVcfOutputRenderer
     absentHeaderNames = MutUtils.retrieveMissingAnnotations(m, renderer.requiredHeaderAnnotations)
     absentMutNames = MutUtils.retrieveMissingAnnotations(m, renderer.requiredMutAnnotations)

     # Header annotations are mandatory: fail loudly.
     if len(absentHeaderNames) > 0:
         message = "The following annotations are required for rendering a TCGA VCF 1.1, but were not found: " + str(absentHeaderNames)
         self.logger.error(message)
         raise MissingAnnotationException(message)

     # Mutation annotations are only advisory: warn and continue.
     if len(absentMutNames) > 0:
         message = "The following annotations important for rendering a TCGA VCF 1.1.  Proceeding... : " + str(absentMutNames)
         self.logger.warn(message)
Пример #21
0
    def testNoLostMutations(self):
        """ Does a simple gaf datasource annotation run and makes sure that no mutations were lost """
        inputFilename = 'testdata/maflite/Patient0.snp.maf.txt'
        inputCreator = MafliteInputMutationCreator(inputFilename, "configs/maflite_input.config")
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

        # Expected count is every line of the input minus one (the header line).
        # The context manager closes the handle instead of leaking it.
        with open(inputFilename, 'r') as inputFile:
            numMutsInput = len(inputFile.readlines()) - 1

        mutations = inputCreator.createMutations()
        ctr = 0
        for m in mutations:
            m = gafDatasource.annotate_mutation(m)
            MutUtils.validateMutation(m)
            ctr += 1
        self.assertEqual(ctr, numMutsInput, "Gaf data source altered mutation count.")
Пример #22
0
    def testChromosomeConversionHG19(self):
        """Check chromosome conversion for build hg19: "23" and "24" must come
        back as "X" and "Y", while "2" and "4" must come back unchanged.
        """
        convert = MutUtils.convertChromosomeStringToMutationDataFormat

        # Numeric names 23 / 24 are expected to be translated.
        self.assertEqual(convert("23", build="hg19"), "X",
                         "chrom of 23 did not produce X: " + convert("23", build="hg19"))
        self.assertEqual(convert("24", build="hg19"), "Y",
                         "chrom of 24 did not produce Y: " + convert("24", build="hg19"))

        # Other numeric names are expected to pass through untouched.
        self.assertEqual(convert("2", build="hg19"), "2",
                         "chrom of 2 yielded different value: " + convert("2", build="hg19"))
        self.assertEqual(convert("4", build="hg19"), "4",
                         "chrom of 4 yielded different value: " + convert("4", build="hg19"))
Пример #23
0
    def testChrom2HashCodeTable(self):
        """Check the hash codes MutUtils.createChrom2HashCodeTable assigns to
        a mixed chromosome list and to a list without numeric chromosomes.
        """
        # Each case is (input chromosomes, [(chrom, expected code, failure message template), ...]).
        cases = [
            (["1", "X", "3", "contig1", "Y", "25", "mt"], [
                ("1", 1, "For chrom 1, hash code should be 1 but it was %s."),
                ("3", 3, "For chrom 3, hash code should be 3 but it was %s."),
                ("25", 25, "For chrom 25, hash code should be 25 but it was %s."),
                ("X", 26, "For chrom X, hash code should be 26 but it was %s."),
                ("Y", 27, "For chrom Y, hash code should be 27 but it was %s."),
                ("mt", 28, "For chrom mt, hash code should be 28 but it was %s."),
                ("contig1", 29, "For chrom contig1, hash code should be 29 but it was %s."),
            ]),
            (["contig1", "mt"], [
                ("mt", 3, "For chrom mt, hash code should be 3 but it was %s."),
                ("contig1", 4, "For chrom contig1, hash code should be 4 but it was %s."),
            ]),
        ]

        for chromList, expectations in cases:
            table = MutUtils.createChrom2HashCodeTable(chromList)
            for chrom, expectedCode, template in expectations:
                self.assertTrue(table[chrom] == expectedCode, template % table[chrom])
Пример #24
0
 def testNoUnknownAnnotations(self):
     """ Make sure that the gaf 3.0 datasource does not annotate anything with source set to Unknown """
     creator = MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt')
     datasource = TestUtils.createTranscriptProviderDatasource(self.config)

     for mutation in creator.createMutations():
         annotated = datasource.annotate_mutation(mutation)
         MutUtils.validateMutation(annotated)

         # Every annotated mutation must come back with no unknown annotations.
         unknown = MutUtils.getUnknownAnnotations(annotated)
         self.assertTrue(len(unknown) == 0,
                         "Unknown annotations exist in mutation: " + str(unknown))
Пример #25
0
    def retrieveExons(self, gene, padding=10, isCodingOnly=False):
        """Return the padded exon intervals of every transcript of a gene.

        :param gene: gene name; looked up in ``self.gene_id_idx`` and compared
            with each transcript's "gene" entry
        :param padding: amount subtracted from each interval start and added to
            each interval end
        :param isCodingOnly: when True, transcripts for which
            ``gaf_annotation.is_non_coding_transcript`` is true are skipped and
            coordinates come from ``self.getCodingTranscriptCoords`` instead of
            the transcript's "genomic_coords"
        :return: set of (gene, chr, start, end) tuples, start and end being
            strings; an empty set when the gene is not in ``self.gene_id_idx``
        """
        result = set()
        geneTuple = self.gene_id_idx.get(gene, None)
        if geneTuple is None:
            return result

        contig = MutUtils.convertChromosomeStringToMutationDataFormat(geneTuple[0])
        for b in self.Transcripts.get(contig, []):
            for i in self.Transcripts[contig][b]:
                if i["gene"] != gene:
                    continue

                if isCodingOnly and gaf_annotation.is_non_coding_transcript(i, self):
                    continue

                if isCodingOnly:
                    genomic_coords = self.getCodingTranscriptCoords(i)
                else:
                    genomic_coords = i["genomic_coords"]

                for coord in genomic_coords:
                    # Coordinate pairs may be stored in either order.
                    start = min(coord[0], coord[1])
                    end = max(coord[0], coord[1])
                    result.add((gene, i["chr"], str(start - padding), str(end + padding)))
        return result
Пример #26
0
    def retrieveExons(self, gene, padding=10, isCodingOnly=False):
        """Return a set of (gene, chr, start, end) tuples, one per padded exon.

        :param gene: gene name; looked up in ``self.gene_id_idx`` and compared
            with each transcript's 'gene' entry
        :param padding: amount subtracted from each start and added to each end
        :param isCodingOnly: when True, skip transcripts for which
            ``gaf_annotation.is_non_coding_transcript`` is true and take the
            coordinates from ``self.getCodingTranscriptCoords``
        :return: set of 4-tuples whose start and end are strings; empty when
            the gene is not present in ``self.gene_id_idx``
        """
        result = set()
        geneTuple = self.gene_id_idx.get(gene, None)
        if geneTuple is None:
            return result

        contig = MutUtils.convertChromosomeStringToMutationDataFormat(
            geneTuple[0])
        for b in self.Transcripts.get(contig, []):
            for i in self.Transcripts[contig][b]:
                if i['gene'] != gene:
                    continue

                if isCodingOnly and gaf_annotation.is_non_coding_transcript(
                        i, self):
                    continue

                if isCodingOnly:
                    genomic_coords = self.getCodingTranscriptCoords(i)
                else:
                    genomic_coords = i['genomic_coords']

                for coord in genomic_coords:
                    # Normalize the pair so start <= end before padding.
                    start = min(coord[0], coord[1])
                    end = max(coord[0], coord[1])
                    result.add((gene, i['chr'], str(start - padding),
                                str(end + padding)))
        return result
Пример #27
0
 def retrieve_cached_annotations(self, m):
     """
     Look up the cache entry stored for a mutation.

     The key is built by MutUtils.create_variant_key_by_mutation from the
     mutation and this instance's db dir key.

     :param m: mutation
     :return: list of Annotations, or None, if cache miss.
     """
     variant_key = MutUtils.create_variant_key_by_mutation(m, self.get_db_dir_key())
     cache = self.get_cache()
     return cache.retrieve_from_cache(variant_key)
Пример #28
0
    def _is_matching(self, mut, tsv_record):
        """Decide whether one datasource TSV record matches a mutation.

        The record's chromosome, start and end are located through
        ``self.tsv_index``.  With ``self.match_mode`` equal to "exact" the
        chromosome and coordinates must be equal and, when the index also has
        "ref" and "alt" columns, so must the alleles.  For any other match mode
        the result of ``TranscriptProviderUtils.test_overlap`` on the two
        start/end pairs is returned; chromosome is not compared on that path.

        :param mut: mutation being matched
        :param tsv_record: indexable row of datasource values
        :return: True/False in exact mode, otherwise whatever test_overlap returns
        """
        columns = self.tsv_index
        chrom = tsv_record[columns["chrom"]]
        startPos = tsv_record[columns["start"]]
        endPos = tsv_record[columns["end"]]

        if self.match_mode != "exact":
            return TranscriptProviderUtils.test_overlap(int(mut.start), int(mut.end), int(startPos), int(endPos))

        if not ("ref" in columns and "alt" in columns):
            # do not use ref and alt information
            if mut.chr == chrom and int(mut.start) == int(startPos) and int(mut.end) == int(endPos):
                return True
            return False

        # ref and alt information is present
        build = "hg19"
        ref = tsv_record[columns["ref"]]
        alt = tsv_record[columns["alt"]]
        if ref == "-" or alt == "-":
            # addresses Mutation Annotation Format based tsv records
            # TODO: This looks risky to be calling the MutationData constructor directly
            ds_mut = MutationData(chrom, startPos, endPos, ref, alt, build)
        else:
            # addresses tsv records where the input isn't a Mutation Annotation Format file
            ds_mut = MutUtils.initializeMutFromAttributes(chrom, startPos, endPos, ref, alt, build)

        if (mut.chr == ds_mut.chr and mut.ref_allele == ds_mut.ref_allele
                and mut.alt_allele == ds_mut.alt_allele
                and int(mut.start) == int(ds_mut.start)
                and int(mut.end) == int(ds_mut.end)):
            return True
        return False
    def _is_matching(self, mut, tsv_record):
        """Tell whether a TSV datasource row corresponds to the given mutation.

        In "exact" match mode chromosome, start and end must be equal; when
        ``self.tsv_index`` has both "ref" and "alt" the alleles are compared
        too.  In every other mode the answer comes from
        ``TranscriptProviderUtils.test_overlap`` applied to the mutation's and
        the row's start/end (the chromosome is not checked there).

        :param mut: mutation being matched
        :param tsv_record: indexable row of datasource values
        :return: True/False in exact mode, otherwise whatever test_overlap returns
        """
        index = self.tsv_index
        chrom = tsv_record[index["chrom"]]
        startPos = tsv_record[index["start"]]
        endPos = tsv_record[index["end"]]
        build = "hg19"

        exact = self.match_mode == "exact"
        if not exact:
            return TranscriptProviderUtils.test_overlap(
                int(mut.start), int(mut.end), int(startPos), int(endPos))

        matched = False
        if "ref" in index and "alt" in index:  # ref and alt information is present
            ref = tsv_record[index["ref"]]
            alt = tsv_record[index["alt"]]
            if ref == "-" or alt == "-":  # addresses Mutation Annotation Format based tsv records
                # TODO: This looks risky to be calling the MutationData constructor directly
                ds_mut = MutationData(chrom, startPos, endPos, ref, alt,
                                      build)
            else:  # addresses tsv records where the input isn't a Mutation Annotation Format file
                ds_mut = MutUtils.initializeMutFromAttributes(
                    chrom, startPos, endPos, ref, alt, build)

            if (mut.chr == ds_mut.chr
                    and mut.ref_allele == ds_mut.ref_allele
                    and mut.alt_allele == ds_mut.alt_allele
                    and int(mut.start) == int(ds_mut.start)
                    and int(mut.end) == int(ds_mut.end)):
                matched = True
        else:  # do not use ref and alt information
            if (mut.chr == chrom
                    and int(mut.start) == int(startPos)
                    and int(mut.end) == int(endPos)):
                matched = True
        return matched
Пример #30
0
    def _calculate_protein_sequence(self, exons, seq, cds_start_genomic_space, cds_stop_genomic_space, strand):
        """Translate the coding slice of a transcript sequence.

        The genomic CDS start/stop are converted with
        ``TranscriptProviderUtils._convert_genomic_space_to_feature_space``,
        the corresponding slice of ``seq`` is handed to
        ``MutUtils.translate_sequence``, and a single trailing '*'
        (presumably the stop marker -- confirm against translate_sequence)
        is removed from the result.

        :return: translated sequence without a trailing '*'
        """
        cds_bounds = TranscriptProviderUtils._convert_genomic_space_to_feature_space(
            int(cds_start_genomic_space), int(cds_stop_genomic_space), exons, strand)
        cds_start_exon_space, cds_stop_exon_space = cds_bounds

        coding_seq = seq[int(cds_start_exon_space):int(cds_stop_exon_space)]
        protein = MutUtils.translate_sequence(coding_seq)

        if len(protein) > 0 and protein[-1] == '*':
            return protein[:-1]
        return protein
Пример #31
0
 def testChrGLs(self):
     """ Test that mutations on unaligned transcripts can be annotated properly.  I.e. when chromosome = GL....."""
     creator = MafliteInputMutationCreator('testdata/maflite/chrGLs.maf.tsv', "configs/maflite_input.config")
     datasource = TestUtils.createTranscriptProviderDatasource(self.config)

     for m in creator.createMutations():
         try:
             m = datasource.annotate_mutation(m)
             MutUtils.validateMutation(m)
         except Exception as e:
             # Any exception here is a test failure; include the traceback in the message.
             failure = "Erroneous exception was thrown: " + str(e) + "\n" + traceback.format_exc()
             self.assertTrue(False, failure)

         # Each annotated mutation must carry a non-empty gene.
         self.assertTrue(m['gene'] != '')
Пример #32
0
    def testNoLostMutations(self):
        """ Does a simple gaf datasource annotation run and makes sure that no mutations were lost """
        inputFilename = 'testdata/maflite/Patient0.snp.maf.txt'
        inputCreator = MafliteInputMutationCreator(
            inputFilename, "configs/maflite_input.config")
        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            self.config)

        # Expected count is the number of input lines minus one (the header).
        # Opening through a context manager guarantees the handle is closed.
        with open(inputFilename, 'r') as inputFile:
            numMutsInput = len(inputFile.readlines()) - 1

        mutations = inputCreator.createMutations()
        ctr = 0
        for m in mutations:
            m = gafDatasource.annotate_mutation(m)
            MutUtils.validateMutation(m)
            ctr += 1
        self.assertEqual(ctr, numMutsInput,
                         "Gaf data source altered mutation count.")
Пример #33
0
    def store_annotations_in_cache(self, m):
        """Store the cacheable annotations of a mutation.

        Does nothing when ``self.is_read_only()`` is true.  Otherwise the key
        comes from ``MutUtils.create_variant_key_by_mutation`` and the
        annotations from ``self._determine_annotations_to_cache``.

        :param m: mutation whose annotations should be cached
        """
        if not self.is_read_only():
            variant_key = MutUtils.create_variant_key_by_mutation(m, self.get_db_dir_key())
            to_cache = self._determine_annotations_to_cache(m)
            self._store_basic_annotations_in_cache(variant_key, to_cache)
Пример #34
0
def main():
    """Attempt to create the corresponding data source.

    A temporary directory with the prefix "initializeDatasource_" is created,
    handed to createDatasource, and removed afterwards whether or not the
    creation succeeded.

    :raise OSError: if removing the temporary directory fails for any reason
        other than the directory no longer existing
    """
    import errno

    tmpDir = None
    try:
        tmpDir = tempfile.mkdtemp(prefix="initializeDatasource_")
        createDatasource(tmpDir)
    finally:
        # If mkdtemp itself failed there is nothing to clean up; attempting a
        # removal with None could mask the original error.
        if tmpDir is not None:
            try:
                MutUtils.removeDir(tmpDir)
            except OSError as exc:
                if exc.errno != errno.ENOENT:  # no such file or directory
                    raise  # re-raise exception
Пример #35
0
def main():
    """Attempt to create the corresponding data source.

    Creates a temporary directory (prefix "initializeDatasource_"), passes it
    to createDatasource, and always tries to remove it afterwards.

    :raise OSError: if the temporary directory cannot be removed for a reason
        other than it being already gone
    """
    import errno

    tmpDir = None
    try:
        tmpDir = tempfile.mkdtemp(prefix="initializeDatasource_")
        createDatasource(tmpDir)
    finally:
        # Only clean up a directory that was actually created; otherwise the
        # cleanup could raise and hide the mkdtemp failure.
        if tmpDir is not None:
            try:
                MutUtils.removeDir(tmpDir)
            except OSError as exc:
                if exc.errno != errno.ENOENT:  # no such file or directory
                    raise  # re-raise exception
    def createMutations(self):
        """ No inputs.
        Returns a generator of mutations built from the specified maflite file.

        For each row of ``self._tsvReader`` a mutation is created through
        ``self._mutation_data_factory`` and every column in
        ``self._specified_fields`` is stored on it as an 'INPUT' annotation.
        Columns that are keys of ``self._reverseAlternativeDict`` are stored
        under the mapped name.  Values stored under "chr" go through
        ``MutUtils.convertChromosomeStringToMutationDataFormat`` first. """

        # Look columns up in the dictionary directly; a keys() list would be
        # scanned linearly for every column on Python 2.
        aliasLookup = self._reverseAlternativeDict
        allColumns = self._specified_fields

        for line in self._tsvReader:

            # We only need to assign fields that are mutation attributes and have a different name in the maflite file.
            mut = self._mutation_data_factory.create(build=self._build)

            for col in allColumns:
                # Three scenarios:
                #   1) col is name of mutation data field -- simple createAnnotation
                #   2) col name is an alias for a mutation data field -- do lookup then createAnnotation
                #   3) col name is not an alias for a mutation data field -- simple createAnnotation
                if col in aliasLookup:
                    realKey = aliasLookup[col]
                    self.logger.debug(realKey + " found from " + col)
                else:
                    # Scenario 1 and 3
                    realKey = col

                # Make sure to convert chromosome values.
                val = line[col]
                if realKey == "chr":
                    val = MutUtils.convertChromosomeStringToMutationDataFormat(val)
                mut.createAnnotation(realKey, val, 'INPUT')

            # Remove any surrounding whitespace from the alleles.
            mut.ref_allele, mut.alt_allele = mut.ref_allele.strip(), mut.alt_allele.strip()

            # if the alt allele == ref_allele, check that this is not a case where there is an alt_allele2 that is different.
            if mut.alt_allele == mut.ref_allele:
                mut.alt_allele = self._find_alt_allele_in_other_field(line, mut.ref_allele)

            # FIXME: Support more than one alias in the reverse dictionary.  Then this line can be removed.
            # Value comparison on purpose: "is" against a "" literal relies on
            # interning and fails for equal-but-distinct empty strings.
            if mut.start != "" and mut.end == "":
                mut.end = mut.start
            if mut.end != "" and mut.start == "":
                mut.start = mut.end

            yield mut
Пример #37
0
    def renderMutations(self, mutations, metadata=None, comments=None):
        """Write the mutations as tab-separated rows beneath a generated VCF header.

        :param mutations: iterator of mutations; its first element is consumed
            up front to build the header and the field mapping
        :param metadata: only used (through ``asDict()``) to build the header
            when ``mutations`` yields nothing
        :param comments: list passed on to ``self._createVcfHeaderFilePtr``;
            defaults to an empty list
        """
        if comments is None:
            comments = []

        outputHeaders = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'NORMAL', 'PRIMARY']

        # Peek at the first mutation and make sure to catch the case where there are no variants specified.
        # The next() builtin works for any iterator, unlike the Python 2-only .next() method.
        m = next(mutations, None)

        if m is not None:
            fp = self._createVcfHeaderFilePtr(comments, m)

            # Create a list of annotation names to map the output fields from.
            fieldsUsed = self.alternativeDictionary.keys()
            annotations = MutUtils.getAllAttributeNames(m)
            self.fieldMap = MutUtils.createFieldsMapping(fieldsUsed, annotations, self.alternativeDictionary, True)
        else:
            fp = self._createVcfHeaderFilePtr(comments, metadata.asDict())

        # Write each row:
        ctr = 0
        unrenderableRows = 0
        tsvWriter = csv.DictWriter(fp, outputHeaders, delimiter="\t", lineterminator="\n")
        mutRow = self._createMutRow(m)

        if mutRow is not None:
            tsvWriter.writerow(mutRow)
            ctr += 1
        elif m is not None:
            # The first mutation is counted like every later one when it cannot be rendered.
            unrenderableRows += 1
            ctr += 1

        for m in mutations:
            if (ctr % 1000) == 0:
                self.logger.info("Processed " + str(ctr) + " mutations")
            mutRow = self._createMutRow(m)

            # We may not render all rows.
            if mutRow is not None:
                tsvWriter.writerow(mutRow)
            else:
                unrenderableRows += 1
            ctr += 1
        self.logger.info("Processed all " + str(ctr) + " mutations.  Could not render: " + str(unrenderableRows))
Пример #38
0
def determineOtherOptions(args, logger):
    """Build the dictionary of additional options from parsed arguments.

    A warning is logged when both input and output formats are "VCF" and
    genotype inference was requested.

    :param args: object with prepend, infer_genotypes, input_format and output_format attributes
    :param logger: logger used for the warning
    :return: dict keyed by OptionConstants values
    """
    opts = {
        OptionConstants.NO_PREPEND: not args.prepend,
        OptionConstants.VCF_OUT_INFER_GENOTYPES: MutUtils.str2bool(args.infer_genotypes),
    }

    vcf_to_vcf = args.input_format == "VCF" and args.output_format == "VCF"
    if vcf_to_vcf and opts[OptionConstants.VCF_OUT_INFER_GENOTYPES]:
        logger.warn("Infer genotypes option has been set to true.  "
                    "Because the input is a VCF file, infer genotypes will have no effect on the output.")
    return opts
Пример #39
0
def determineOtherOptions(args):
    """Build the dictionary of additional options from parsed arguments.

    :param args: object with prepend, infer_genotypes, infer_onps and canonical_tx_file attributes
    :return: dict keyed by OptionConstants values
    """
    return {
        OptionConstants.NO_PREPEND: not args.prepend,
        OptionConstants.VCF_OUT_INFER_GENOTYPES: MutUtils.str2bool(args.infer_genotypes),
        OptionConstants.INFER_ONPS: args.infer_onps,
        OptionConstants.CUSTOM_CANONICAL_TX_LIST_FILE: args.canonical_tx_file,
    }
Пример #40
0
    def _fixVal(self, val, isSplit):
        """
        Replace characters in a value via MutUtils.replaceChrs, then pass it to self._correct.

        :param val: value to clean up
        :param isSplit: when true the whole value is corrected as a single
            item (and "," is among the replaced characters); otherwise the
            value is split on "," and the pieces are corrected
        :return: result of self._correct
        """
        if isSplit:
            # "," is in the replacement list here; ":" is not.
            cleaned = MutUtils.replaceChrs(val, ",=;\n\t ", "|~|#__")
            pieces = [cleaned]
        else:
            # "," is left out of the replacement list so it can be split on below.
            cleaned = MutUtils.replaceChrs(val, "=;\n\t :", "~|#__>")
            pieces = cleaned.split(",")

        return self._correct(pieces)
Пример #41
0
    def testRetrievePrecedingBaseFromAnnotationForInsertions(self):
        """Round-trip two insertions: strip the preceding bases with
        MutUtils.retrievePrecedingBasesForInsertions, store them as an
        annotation, then check that
        MutUtils.retrievePrecedingBaseFromAnnotationForInsertions restores the
        original start, ref allele and alt allele.
        """
        chrom = "1"
        start = 1234567
        end = 1234567  # incorrect, but doesn't matter for the purposed of testing
        ref_allele = "GTC"
        build = "19"

        # Same scenario with a one-base and a two-base insertion.
        for alt_allele in ("GTCT", "GTCTT"):
            mut = MutationDataFactory.default_create(chrom, start, end, ref_allele, alt_allele, build)
            stripped = MutUtils.retrievePrecedingBasesForInsertions(mut)
            preceding_bases, stripped_alt, stripped_start, stripped_end = stripped

            mut.ref_allele = "-"
            mut.alt_allele = stripped_alt
            mut.start = stripped_start
            mut.end = stripped_end
            mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                                 annotationValue=preceding_bases)

            restored = MutUtils.retrievePrecedingBaseFromAnnotationForInsertions(mut)
            updated_ref_allele, updated_alt_allele, updated_start = restored

            self.assertTrue(updated_start == start,
                            "Mut start should be %s but was %s." % (start, updated_start))
            self.assertTrue(updated_ref_allele == ref_allele,
                            "Ref allele should be %s but was %s." % (ref_allele, updated_ref_allele))
            self.assertTrue(updated_alt_allele == alt_allele,
                            "Alt allele should be %s but was %s." % (alt_allele, updated_alt_allele))
Пример #42
0
    def _createMutRow(self,m):
        """Build the dictionary for one output row, keyed by VCF column name.

        :param m: mutation to render, or None
        :return: dict with CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT,
            NORMAL and PRIMARY entries; None when m is None, when both LODs are
            negative, or when no somatic status could be determined
        """
        if m is None:
            return None

        # Calculate values
        # TODO: This is a sloppy way to do GT
        normal_lod = self._extract_lod(m, 'init_n_lod')
        tumor_lod = self._extract_lod(m, 't_lod_fstar')

        # Nothing to call if both LODs are too low.
        if tumor_lod < 0 and normal_lod < 0:
            return None

        qual = max(tumor_lod, 0)
        gtN, gtT = self.genotype(normal_lod, tumor_lod)
        ref, alt, new_start = MutUtils.retrievePrecedingBaseFromReference(m)

        ss, ssCode = self.determineSomaticStatus(gtN, gtT, qual)
        if ss is None or ssCode is None:
            return None

        # Counts and sums fall back to '0' (is_blank_default=True is passed through).
        def lookup(name):
            return self._get_annotation_value(m, name, '0', is_blank_default=True)

        n_alt_count = lookup('n_alt_count')
        n_ref_count = lookup('n_ref_count')
        n_alt_sum = lookup('n_alt_sum')
        t_alt_count = lookup('t_alt_count')
        t_ref_count = lookup('t_ref_count')
        t_alt_sum = lookup('t_alt_sum')

        mq0 = 0
        normal_bq = self._generateBQ(n_alt_count, n_alt_sum)
        normalFormat = self._generateFormatFieldWithValues(gtN, n_alt_count, n_ref_count, mq0, normal_bq, ssCode)
        primary_bq = self._generateBQ(t_alt_count, t_alt_sum)
        primaryFormat = self._generateFormatFieldWithValues(gtT, t_alt_count, t_ref_count, mq0, primary_bq, ssCode)

        filterVal = self._generateFilterField(m)
        info = self._generateInfoField(m, filterVal, mq0, ss)

        #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NORMAL	PRIMARY
        return {
            'CHROM': self._renderChrom(m.chr),
            'POS': new_start,
            'ID': self.renderID(m),
            'REF': ref,
            'ALT': alt,
            'QUAL': qual,
            'FILTER': filterVal,
            'INFO': info,
            'FORMAT': self._generateFormatField(),
            'NORMAL': normalFormat,
            'PRIMARY': primaryFormat,
        }
Пример #43
0
    def _createMutation(self, record, alt_index, build):
        """Create a mutation from a VCF record and one of its alternate alleles.

        Start and end are both set to the record's POS.  A REF of "." becomes
        an empty string.  For a monomorphic record the alt allele equals the
        ref allele.  The id, qual and alt_allele_seen annotations are added
        with source "INPUT" before filter and info data are attached.

        :param record: VCF record (CHROM, POS, REF, ALT, ID, QUAL, is_monomorphic are read)
        :param alt_index: index into record.ALT
        :param build: build passed to MutUtils.initializeMutFromAttributes
        :return: the mutation returned by self._addInfoData2Mutation
        """
        chrom = MutUtils.convertChromosomeStringToMutationDataFormat(record.CHROM)
        startPos = endPos = int(record.POS)

        ref = record.REF
        if ref == ".":
            ref = ""

        alt = ref if record.is_monomorphic else str(record.ALT[alt_index])

        mut = MutUtils.initializeMutFromAttributes(chrom, startPos, endPos, ref, alt, build)

        record_id = record.ID
        if record_id is None:
            record_id = ""
        mut.createAnnotation("id", record_id, "INPUT", tags=[TagConstants.ID])
        mut.createAnnotation("qual", str(record.QUAL), "INPUT", tags=[TagConstants.QUAL])
        mut.createAnnotation("alt_allele_seen", str(True), "INPUT")

        mut = self._addFilterData2Mutation(mut, record)
        return self._addInfoData2Mutation(mut, record, alt_index)
Пример #44
0
    def _fixVal(self, val, isSplit):
        """
        Clean a value with MutUtils.replaceChrs and hand it to self._correct.

        :param val: value to clean up
        :param isSplit: when true, "," is among the replaced characters and the
            value is corrected as one item; otherwise the value is split on ","
            and the resulting list is corrected
        :return: result of self._correct
        """
        if not isSplit:
            # "," is not replaced here because the value is split on it next.
            replaced = MutUtils.replaceChrs(val, "=;\n\t :", "~|#__>")
            return self._correct(replaced.split(","))

        # ":" is not in the replacement list for this case.
        replaced = MutUtils.replaceChrs(val, ",=;\n\t ", "|~|#__")
        return self._correct([replaced])
Пример #45
0
    def _correctFieldName(self, fieldName):
        """
        Replaces unwanted characters in the field name and removes a trailing
        "__FORMAT__" marker.

        :param fieldName: field name to clean up
        :return: corrected field name
        """
        suffix = "__FORMAT__"

        # Replace whitespace and other characters
        corrected = MutUtils.replaceChrs(fieldName, "=; :", "~|_>")
        if corrected.endswith(suffix):
            corrected = corrected[:-len(suffix)]
        return corrected
Пример #46
0
def determineOtherOptions(args):
    """
    Translate attributes of the argument object into an options dictionary.

    :param args: object exposing the option attributes read below
        (presumably the parsed command-line namespace -- confirm against the caller)
    :return: dict keyed by OptionConstants values
    """
    return {
        # The stored flag is the negation of the "prepend" argument.
        OptionConstants.NO_PREPEND: not args.prepend,
        # infer_genotypes is converted with MutUtils.str2bool before being stored.
        OptionConstants.VCF_OUT_INFER_GENOTYPES: MutUtils.str2bool(args.infer_genotypes),
        OptionConstants.INFER_ONPS: args.infer_onps,
        OptionConstants.CUSTOM_CANONICAL_TX_LIST_FILE: args.canonical_tx_file,
        OptionConstants.COLLAPSE_FILTER_COLS: args.collapse_filter_cols,
        OptionConstants.REANNOTATE_TCGA_MAF_COLS: args.reannotate_tcga_maf_cols,
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: args.allow_overwriting,
        OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: args.collapse_number_annotations,
    }
Пример #47
0
    def testRetrieveMissingAnnotations(self):
        """ Present annotations are not reported as missing; absent ones are returned sorted.
        """
        mut = MutationDataFactory.default_create()
        for name in ("a1", "a2", "a3", "a4"):
            mut.createAnnotation(name, "1")

        # Both requested annotations exist on the mutation, so nothing should be missing.
        missing = MutUtils.retrieveMissingAnnotations(mut, ["a3", "a2"])
        self.assertIsNotNone(missing)
        self.assertTrue(len(missing) == 0, "Result was not empty: " + str(missing))

        # Only "a1" exists; the other three names are expected back with "blah" first.
        missing = MutUtils.retrieveMissingAnnotations(mut, ["zztop", "a1", "blah", "dummy"])
        self.assertTrue(missing[0] == "blah", "Result was not sorted")
        allPresent = all(name in missing for name in ("blah", "dummy", "zztop"))
        self.assertTrue(allPresent, "Incorrect elements (Truth: [zztop, blah, dummy]): " + str(missing))
Пример #48
0
    def testProteinChange(self):
        """ Test that protein change parsing of start and end works.
        """
        # Each tuple is (protein change string, expected [start, end])
        testInOuts = [
            ("p.K128_R130del", ['128', '130']),
            ("p.W274G", ["274", "274"]),
            ("p.13_14AA>A", ["13", "14"]),
            ("p.G25_splice", ["25", "25"]),
            ("p.E813*", ["813", "813"]),
            ("p.SLPQPEQRPY59del", ["59", "59"])
        ]

        for proteinChange, expected in testInOuts:
            result = MutUtils.extractProteinPosition(proteinChange)
            self.assertTrue(result != ['', ''], "Result was empty.  " + str(proteinChange) + ".  ")
            self.assertTrue(result[0] == expected[0] and result[1] == expected[1],
                            "Result did not match for " + str(proteinChange) + ".  " + str(result) + "  GT: " + str(expected))

        # An unparseable string yields a pair of empty strings.
        self.assertTrue(MutUtils.extractProteinPosition("blahblah") == ['', ''])
Пример #49
0
    def _calculate_protein_sequence(self, exons, seq, cds_start_genomic_space,
                                    cds_stop_genomic_space, strand):
        """
        Translate the coding part of a transcript sequence into a protein sequence.

        :param exons: exons passed to the genomic-to-feature-space conversion
        :param seq: transcript sequence that is sliced with the converted CDS bounds
        :param cds_start_genomic_space: CDS start in genomic coordinates (converted with int())
        :param cds_stop_genomic_space: CDS stop in genomic coordinates (converted with int())
        :param strand: strand passed to the coordinate conversion
        :return: translated sequence with one trailing '*' removed, if present
        """
        genomic_start = int(cds_start_genomic_space)
        genomic_stop = int(cds_stop_genomic_space)
        cds_start_exon_space, cds_stop_exon_space = \
            TranscriptProviderUtils._convert_genomic_space_to_feature_space(
                genomic_start, genomic_stop, exons, strand)

        coding_seq = seq[int(cds_start_exon_space):int(cds_stop_exon_space)]
        protein = MutUtils.translate_sequence(coding_seq)

        # Nothing to strip when the translation is empty or does not end in '*'.
        if len(protein) == 0 or protein[-1] != '*':
            return protein
        return protein[:-1]
    def test_previous_release_download(self):
        """Download an older ensembl transcript package.  This test needs an internet connection and will fail w/o one.
        """
        release_num = "68"
        download_dir = "out/test_ensembl_download_previous/"

        # Recreate the download directory before fetching.
        MutUtils.removeDir(download_dir)
        os.mkdir(download_dir)
        GenomeBuildInstallUtils.download_reference_data_from_ensembl(download_dir, "saccharomyces_cerevisiae", release=release_num)

        # The first listed file whose name contains ".<release>.cdna." is taken as the transcript file.
        cdna_marker = "." + release_num + ".cdna."
        transcript_file = next((name for name in os.listdir(download_dir) if cdna_marker in name), None)

        self.assertIsNotNone(transcript_file)

        file_size = os.stat(download_dir + transcript_file).st_size
        self.assertTrue(file_size > 0, "downloaded transcript file (" + transcript_file + ") is empty.")
Пример #51
0
    def _correctFieldName(self, fieldName):
        """
        Clean up a field name.

        The characters "=", ";", " " and ":" are passed to MutUtils.replaceChrs together with
        "~|_>", and a trailing "__FORMAT__" or "__INFO__" marker is stripped from the result.

        :param fieldName: raw field name
        :return: corrected field name
        """
        cleaned = MutUtils.replaceChrs(fieldName, "=; :", "~|_>")
        # At most one marker is removed; "__FORMAT__" is checked first.
        for suffix in ("__FORMAT__", "__INFO__"):
            if cleaned.endswith(suffix):
                return cleaned[:len(cleaned) - len(suffix)]
        return cleaned
Пример #52
0
    def testRetrieveMissingAnnotations(self):
        """ Present annotations are not reported as missing; absent ones are returned sorted.
        """
        mutation = MutationData()
        for suffix in "1234":
            mutation.createAnnotation("a" + suffix, "1")

        # Both requested annotations exist on the mutation, so nothing should be missing.
        requested = ["a3", "a2"]
        missing = MutUtils.retrieveMissingAnnotations(mutation, requested)
        self.assertIsNotNone(missing)
        self.assertTrue(len(missing) == 0,
                        "Result was not empty: " + str(missing))

        # Only "a1" exists; the other three names are expected back with "blah" first.
        requested = ["zztop", "a1", "blah", "dummy"]
        missing = MutUtils.retrieveMissingAnnotations(mutation, requested)
        self.assertTrue(missing[0] == "blah", "Result was not sorted")
        hasAllExpected = all(name in missing for name in ["blah", "dummy", "zztop"])
        self.assertTrue(
            hasAllExpected,
            "Incorrect elements (Truth: [zztop, blah, dummy]): " + str(missing))
Пример #53
0
    def testProteinChange(self):
        """ Test that protein change parsing of start and end works.
        """
        # Each tuple is (protein change string, expected [start, end])
        testInOuts = [("p.K128_R130del", ['128', '130']),
                      ("p.W274G", ["274", "274"]),
                      ("p.13_14AA>A", ["13", "14"]),
                      ("p.G25_splice", ["25", "25"]),
                      ("p.E813*", ["813", "813"]),
                      ("p.SLPQPEQRPY59del", ["59", "59"])]

        for proteinChange, expected in testInOuts:
            result = MutUtils.extractProteinPosition(proteinChange)
            self.assertTrue(result != ['', ''],
                            "Result was empty.  " + str(proteinChange) + ".  ")
            self.assertTrue(
                result[0] == expected[0] and result[1] == expected[1],
                "Result did not match for " + str(proteinChange) + ".  " +
                str(result) + "  GT: " + str(expected))

        # An unparseable string yields a pair of empty strings.
        self.assertTrue(
            MutUtils.extractProteinPosition("blahblah") == ['', ''])
Пример #54
0
    def _yieldPartitions(self, iterable, func, fieldnameIndexes, length):
        """
        Read the input in chunks of at most "length" lines and yield each chunk as a list of
        (key, line) pairs sorted by key.

        For every line the selected fields are extracted, the callback turns them into a key, and
        the key is paired with the untouched line text. Each yielded partition is therefore one
        sorted chunk of the input.

        :param iterable: lines of text; successive islice calls are made on it, so it is expected to
            be an iterator that advances as it is consumed
        :param func: callback that maps a dictionary of field values to a key (must return a tuple)
        :param fieldnameIndexes: dictionary of fieldnames and corresponding token indexes
        :param length: maximum number of lines per partition
        :raises CallbackException: when the callback returns something other than a tuple
        """
        keyTypeVerified = False

        # Take the first "length" number of items and return them as list.
        chunk = list(itertools.islice(iterable, length))
        # The same dictionary is refilled for every line and handed to the callback.
        fields = collections.OrderedDict()

        while chunk:
            # Note: CSV dictionary reader is NOT used because a chunk of text is parsed at a time,
            # rather than a line of text
            pairs = []
            for line in chunk:
                tokens = MutUtils.getTokens(line, self.delimiter,
                                            self.lineterminator)
                for fieldname, index in fieldnameIndexes.items():
                    fields[fieldname] = tokens[index]

                key = func(fields)

                # The key type is checked until the first tuple has been seen.
                if not keyTypeVerified:
                    if not isinstance(key, tuple):
                        raise CallbackException(
                            "The value returned by the callback must be a tuple. Instead, a value "
                            "of %s was returned." % (type(key)))
                    keyTypeVerified = True

                pairs.append(self._Pair(key, line))

            partition = sorted(pairs, key=operator.attrgetter("key"))

            # The next chunk is read before the current partition is handed out.
            chunk = list(itertools.islice(iterable, length))

            yield partition
Пример #55
0
    def get_protein_seq(self, transcript_id):
        """
        Return the translated protein sequence for a transcript.

        :param transcript_id: identifier passed to get_transcript and get_transcript_seq
        :return: translated sequence with one trailing '*' removed if present, or None when the
            transcript record or its sequence is missing/empty, or when the record has no
            (truthy) 'cds_start'
        """
        gaf_record = self.get_transcript(transcript_id)
        tx_seq = self.get_transcript_seq(transcript_id)
        if not gaf_record or not tx_seq:
            return None

        if 'cds_start' not in gaf_record or not gaf_record['cds_start']:
            return None

        # The slice starts at cds_start - 1 and ends at cds_stop
        # (presumably 1-based inclusive coordinates -- confirm against the data source).
        prot_seq = MutUtils.translate_sequence(
            tx_seq[gaf_record['cds_start'] - 1:gaf_record['cds_stop']])

        # Guard against an empty translation: indexing [-1] on it would raise IndexError.
        if len(prot_seq) > 0 and prot_seq[-1] == '*':
            prot_seq = prot_seq[:-1]

        return prot_seq
Пример #56
0
    def _determineHeaders(self, mut, metadata):
        """
        Determine the header names, excluding those that start with "_".

        The attribute names of the mutation are used when a mutation is given and has any;
        otherwise the keys of the metadata are used.

        :param mut: mutation whose attribute names are used, or None
        :param metadata: mapping whose keys are the fallback headers
        :return: new list of header names that do not start with "_"
        """
        headers = [] if mut is None else MutUtils.getAllAttributeNames(mut)

        if len(headers) == 0:
            headers = metadata.keys()

        # Build a filtered list instead of calling remove() on the list being iterated:
        # removing during iteration skips the element that follows each removal, which left
        # consecutive "_"-prefixed headers in the result. This also works when keys() is a view.
        return [header for header in headers if not header.startswith("_")]
Пример #57
0
def determineOtherOptions(args):
    """
    Translate attributes of the argument object into an options dictionary.

    :param args: object exposing the option attributes read below
        (presumably the parsed command-line namespace -- confirm against the caller)
    :return: dict keyed by OptionConstants values
    """
    # The prepend flag is stored negated; infer_genotypes goes through MutUtils.str2bool.
    no_prepend = not args.prepend
    infer_genotypes = MutUtils.str2bool(args.infer_genotypes)

    settings = [
        (OptionConstants.NO_PREPEND, no_prepend),
        (OptionConstants.VCF_OUT_INFER_GENOTYPES, infer_genotypes),
        (OptionConstants.INFER_ONPS, args.infer_onps),
        (OptionConstants.CUSTOM_CANONICAL_TX_LIST_FILE, args.canonical_tx_file),
        (OptionConstants.COLLAPSE_FILTER_COLS, args.collapse_filter_cols),
        (OptionConstants.REANNOTATE_TCGA_MAF_COLS, args.reannotate_tcga_maf_cols),
        (OptionConstants.ALLOW_ANNOTATION_OVERWRITING, args.allow_overwriting),
        (OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS, args.collapse_number_annotations),
    ]
    return dict(settings)
Пример #58
0
    def testGenotypeFieldIsHonored(self):
        """
        Tests that no issues arise with genotype values >1 when multiple variants appear on one line.
        """
        vcfPath = os.path.join("testdata", "vcf", "example.severalGTs.vcf")
        mutations = VcfInputMutationCreator(vcfPath).createMutations()

        # Count the mutations flagged as having the alt allele seen; none of them may
        # belong to the sample "NA 00001".
        numAltSeen = 0
        for mutation in mutations:
            if not MutUtils.str2bool(mutation["alt_allele_seen"]):
                continue
            self.assertTrue(mutation['sample_name'] != "NA 00001")
            numAltSeen += 1

        self.assertTrue(numAltSeen == 7,
                        str(numAltSeen) + " mutations with alt seen, but expected 7.  './.' should not show as a variant.")