예제 #1
0
    def testSortFileWithSpaces(self):
        """
        Checks that a tsv file whose header names contain spaces can be sorted to a file on disk.

        Rows are keyed by the lower-cased "Gene name" value, followed by "startAA" and "endAA" converted to
        integers. Only the existence of the output file is verified.
        """
        def sortKey(row):
            # Case-insensitive gene name first, then the numeric start/end positions.
            return (row["Gene name"]).lower(), int(row["startAA"]), int(row["endAA"])

        srcPath = os.path.join("testdata", "small_cosmic_with_gp_and_gpp",
                               "small_cosmic_trimmed_for_sorting.txt.tbi.byAA")
        dstPath = os.path.join("out", "small_cosmic_trimmed_for_sorting.txt.byAA.sorted.tsv")

        sorter = TsvFileSorter(srcPath)
        sorter.sortFile(dstPath, sortKey)

        self.assertTrue(os.path.exists(dstPath), "No file was generated.")
예제 #2
0
    def testSortFile(self):
        """
        Checks that a tsv file can be sorted and written to a file on disk.

        Rows are keyed by the lower-cased "Gene_name" value, followed by "startAA" and "endAA" converted to
        integers. Only the existence of the output file is verified.
        """
        def sortKey(row):
            # Case-insensitive gene name first, then the numeric start/end positions.
            return (row["Gene_name"]).lower(), int(row["startAA"]), int(row["endAA"])

        srcPath = os.path.join("testdata", "small_cosmic_gpp", "small_cosmic_gpp.tempForSorting.tsv")
        dstPath = os.path.join("out", "small_cosmic_gpp.tempForSorting.out.tsv")

        sorter = TsvFileSorter(srcPath)
        sorter.sortFile(dstPath, sortKey)

        self.assertTrue(os.path.exists(dstPath), "No file was generated.")
예제 #3
0
 def testCallbackExceptionIncorrectType(self):
     """
     Tests that the CallbackException is raised when the input anonymous function does not return a tuple given a
     row.

     The callback used here returns a plain string. The test fails explicitly if sortFile completes without
     raising, instead of passing silently.
     """
     inputFilename = os.path.join(*["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps.tsv"])
     outputFilename = os.path.join("out", "multiple_partitions_sort_mixed_caps.tsv.sorted.out.tsv")
     tsvFileSorter = TsvFileSorter(inputFilename)
     # Deliberately returns a str rather than a tuple.
     func = lambda val: (val["Gene name"]).lower()
     try:
         tsvFileSorter.sortFile(outputFilename, func, 3)
     except CallbackException as msg:
         # assertEqual reports both strings on mismatch, unlike assertTrue(a == b).
         self.assertEqual(msg.value, "The value returned by the callback must be a tuple. Instead, a value of "
                                     "<type 'str'> was returned.", "Error msg is different.")
     else:
         # Without this branch a missing exception would go unnoticed.
         self.fail("CallbackException was not raised.")
예제 #4
0
    def getSortedTsvFilename(self, path):
        """
        Sorts the tsv file at self.filename into a new temporary file and returns that file's name.

        Rows are ordered by the chrom2HashCode table entry for their "chr" value (the table is built from
        self.chroms), then by "start" as an integer, then by "alt_allele". The unsorted file at self.filename
        is removed once sorting has finished.

        :param path: directory in which the sorted temporary file is created
        :return: name of the sorted temporary file (created with delete=False, so it is not removed automatically)
        """
        chrom2HashCode = MutUtils.createChrom2HashCodeTable(self.chroms)
        tsvFileSorter = TsvFileSorter(self.filename)

        # Only the reserved name is needed; close the handle right away so that it is not leaked and does not
        # keep the file open while the sorter writes to the same name.
        sortedTempTsvFile = tempfile.NamedTemporaryFile(dir=path, delete=False)
        sortedTempTsvFile.close()

        func = lambda val: (chrom2HashCode[val["chr"]], int(val["start"]), val["alt_allele"])
        tsvFileSorter.sortFile(sortedTempTsvFile.name, func)
        os.remove(self.filename)

        return sortedTempTsvFile.name
예제 #5
0
    def getSortedTsvFilename(self, path):
        """
        Sorts the tsv file at self.filename into a new temporary file and returns that file's name.

        Rows are ordered by the chrom2HashCode table entry for their "chr" value (the table is built from
        self.chroms), then by "start" as an integer, then by "alt_allele". The unsorted file at self.filename
        is removed once sorting has finished.

        :param path: directory in which the sorted temporary file is created
        :return: name of the sorted temporary file (created with delete=False, so it is not removed automatically)
        """
        chrom2HashCode = MutUtils.createChrom2HashCodeTable(self.chroms)
        tsvFileSorter = TsvFileSorter(self.filename)

        # Only the reserved name is needed; close the handle right away so that it is not leaked and does not
        # keep the file open while the sorter writes to the same name.
        sortedTempTsvFile = tempfile.NamedTemporaryFile(dir=path, delete=False)
        sortedTempTsvFile.close()

        func = lambda val: (chrom2HashCode[val["chr"]], int(val["start"]), val["alt_allele"])
        self.logger.debug("Sorting tmp tsv %s->%s", self.filename, sortedTempTsvFile.name)
        tsvFileSorter.sortFile(sortedTempTsvFile.name, func)
        os.remove(self.filename)

        return sortedTempTsvFile.name
예제 #6
0
    def testSortMixedCaps(self):
        """
        Tests sorting a file with mixed capitalization in the reference column.

        The sorted output must exist and its MD5 digest must match that of the expected file
        testdata/sort_mixed_caps_tsv/sort_mixed_caps_sorted.tsv.
        """
        inputFilename = os.path.join(*["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps.tsv"])
        expectedFilename = os.path.join(*["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps_sorted.tsv"])
        outputFilename = os.path.join("out", "sort_mixed_caps.tsv.sorted.out.tsv")
        tsvFileSorter = TsvFileSorter(inputFilename)
        func = lambda val: ((val["Gene name"]).lower(), int(val["startAA"]), int(val["endAA"]))
        tsvFileSorter.sortFile(outputFilename, func)

        self.assertTrue(os.path.exists(outputFilename), "No file was generated.")

        # Read both files through context managers so the handles are closed deterministically.
        with open(outputFilename, 'r') as fp:
            guessmd5 = hashlib.md5(fp.read()).hexdigest()
        with open(expectedFilename, "r") as fp:
            gtmd5 = hashlib.md5(fp.read()).hexdigest()
        # assertEqual shows both digests when they differ.
        self.assertEqual(guessmd5, gtmd5)
예제 #7
0
    def testMultiplePartitionSorting(self):
        """
        Tests that the sorting works when the partition size is small and input file must be broken into multiple
        partitions.

        sortFile is called with a third argument of 3. The sorted output must exist and its MD5 digest must match
        that of the expected file testdata/sort_mixed_caps_tsv/sort_mixed_caps_sorted.tsv.
        """
        inputFilename = os.path.join(*["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps.tsv"])
        expectedFilename = os.path.join(*["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps_sorted.tsv"])
        outputFilename = os.path.join("out", "multiple_partitions_sort_mixed_caps.tsv.sorted.out.tsv")
        tsvFileSorter = TsvFileSorter(inputFilename)
        func = lambda val: ((val["Gene name"]).lower(), int(val["startAA"]), int(val["endAA"]))
        tsvFileSorter.sortFile(outputFilename, func, 3)
        self.assertTrue(os.path.exists(outputFilename), "No file was generated.")

        # Read both files through context managers so the handles are closed deterministically.
        with open(outputFilename, "r") as fp:
            guessmd5 = hashlib.md5(fp.read()).hexdigest()
        with open(expectedFilename, "r") as fp:
            gtmd5 = hashlib.md5(fp.read()).hexdigest()
        # assertEqual shows both digests when they differ.
        self.assertEqual(guessmd5, gtmd5)
예제 #8
0
    def testSortFileWithSpaces(self):
        """
        Verifies that sorting works for a tsv file on disk whose header names contain spaces.

        The sort key is (lower-cased "Gene name", int "startAA", int "endAA"); the test asserts only that the
        output file has been created.
        """
        def byGeneThenPosition(record):
            gene = (record["Gene name"]).lower()
            return gene, int(record["startAA"]), int(record["endAA"])

        sourceParts = ("testdata", "small_cosmic_with_gp_and_gpp",
                       "small_cosmic_trimmed_for_sorting.txt.tbi.byAA")
        sourceFile = os.path.join(*sourceParts)
        sortedFile = os.path.join("out", "small_cosmic_trimmed_for_sorting.txt.byAA.sorted.tsv")

        fileSorter = TsvFileSorter(sourceFile)
        fileSorter.sortFile(sortedFile, byGeneThenPosition)

        self.assertTrue(os.path.exists(sortedFile), "No file was generated.")
예제 #9
0
    def testSortFile(self):
        """
        Verifies that a tsv file on disk can be sorted into a new file.

        The sort key is (lower-cased "Gene_name", int "startAA", int "endAA"); the test asserts only that the
        output file has been created.
        """
        def byGeneThenPosition(record):
            gene = (record["Gene_name"]).lower()
            return gene, int(record["startAA"]), int(record["endAA"])

        sourceParts = ("testdata", "small_cosmic_gpp", "small_cosmic_gpp.tempForSorting.tsv")
        sourceFile = os.path.join(*sourceParts)
        sortedFile = os.path.join("out", "small_cosmic_gpp.tempForSorting.out.tsv")

        fileSorter = TsvFileSorter(sourceFile)
        fileSorter.sortFile(sortedFile, byGeneThenPosition)

        self.assertTrue(os.path.exists(sortedFile), "No file was generated.")
예제 #10
0
 def testCallbackExceptionIncorrectType(self):
     """
     Tests that the CallbackException is raised when the input anonymous function does not return a tuple given a
     row.

     The callback used here returns a plain string. The test fails explicitly if sortFile completes without
     raising, instead of passing silently.
     """
     inputFilename = os.path.join(
         *["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps.tsv"])
     outputFilename = os.path.join(
         "out", "multiple_partitions_sort_mixed_caps.tsv.sorted.out.tsv")
     tsvFileSorter = TsvFileSorter(inputFilename)
     # Deliberately returns a str rather than a tuple.
     func = lambda val: (val["Gene name"]).lower()
     try:
         tsvFileSorter.sortFile(outputFilename, func, 3)
     except CallbackException as msg:
         # assertEqual reports both strings on mismatch, unlike assertTrue(a == b).
         self.assertEqual(
             msg.value,
             "The value returned by the callback must be a tuple. Instead, a value of "
             "<type 'str'> was returned.", "Error msg is different.")
     else:
         # Without this branch a missing exception would go unnoticed.
         self.fail("CallbackException was not raised.")
예제 #11
0
    def testSortMixedCaps(self):
        """
        Tests sorting a file with mixed capitalization in the reference column.

        The sorted output must exist and its MD5 digest must match that of the expected file
        testdata/sort_mixed_caps_tsv/sort_mixed_caps_sorted.tsv.
        """
        inputFilename = os.path.join(
            *["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps.tsv"])
        expectedFilename = os.path.join(*[
            "testdata", "sort_mixed_caps_tsv",
            "sort_mixed_caps_sorted.tsv"
        ])
        outputFilename = os.path.join("out",
                                      "sort_mixed_caps.tsv.sorted.out.tsv")
        tsvFileSorter = TsvFileSorter(inputFilename)
        func = lambda val: (
            (val["Gene name"]).lower(), int(val["startAA"]), int(val["endAA"]))
        tsvFileSorter.sortFile(outputFilename, func)

        self.assertTrue(os.path.exists(outputFilename),
                        "No file was generated.")

        # Read both files through context managers so the handles are closed deterministically.
        with open(outputFilename, 'r') as fp:
            guessmd5 = hashlib.md5(fp.read()).hexdigest()
        with open(expectedFilename, "r") as fp:
            gtmd5 = hashlib.md5(fp.read()).hexdigest()
        # assertEqual shows both digests when they differ.
        self.assertEqual(guessmd5, gtmd5)
예제 #12
0
    def testMultiplePartitionSorting(self):
        """
        Tests that the sorting works when the partition size is small and input file must be broken into multiple
        partitions.

        sortFile is called with a third argument of 3. The sorted output must exist and its MD5 digest must match
        that of the expected file testdata/sort_mixed_caps_tsv/sort_mixed_caps_sorted.tsv.
        """
        inputFilename = os.path.join(
            *["testdata", "sort_mixed_caps_tsv", "sort_mixed_caps.tsv"])
        expectedFilename = os.path.join(*[
            "testdata", "sort_mixed_caps_tsv",
            "sort_mixed_caps_sorted.tsv"
        ])
        outputFilename = os.path.join(
            "out", "multiple_partitions_sort_mixed_caps.tsv.sorted.out.tsv")
        tsvFileSorter = TsvFileSorter(inputFilename)
        func = lambda val: (
            (val["Gene name"]).lower(), int(val["startAA"]), int(val["endAA"]))
        tsvFileSorter.sortFile(outputFilename, func, 3)
        self.assertTrue(os.path.exists(outputFilename),
                        "No file was generated.")

        # Read both files through context managers so the handles are closed deterministically.
        with open(outputFilename, "r") as fp:
            guessmd5 = hashlib.md5(fp.read()).hexdigest()
        with open(expectedFilename, "r") as fp:
            gtmd5 = hashlib.md5(fp.read()).hexdigest()
        # assertEqual shows both digests when they differ.
        self.assertEqual(guessmd5, gtmd5)
                continue

            row['startAA'] = feature[1]
            row['endAA'] = feature[2]
            row['gene'] = m['gene']

            row[annotation] = feature[3]
            tsvWriter.writerow(row)

    print("Could not get uniprot seq for " + str(numTranscriptsNotInUniprot) + " transcripts.")
    print("Attempted " + str(ctr) + " muts")

    print("Creating tabix index")
    print("Creating copy of tsv file (" + output_file + ") ...")
    tabixBasedFilename = output_file + ".copy.tsv"
    shutil.copyfile(output_file, tabixBasedFilename)

    print("Sorting ...")
    tsvFileSorter = TsvFileSorter(fieldNames=['gene','startAA', 'endAA'])
    tsvFileSorter.sortFile(tabixBasedFilename, tabixBasedFilename + ".sorted")
    print("Creating actual index ...")

    # swiss_data[key].features
    #  For each feature, position 0 is name.
    #  Look for "SITE" (site), "VARIANT" (natural_variation),
    # "COMPBIAS" or "REGION" or "DOMAIN"? (region)
    #   create a line for each entry
    # Then add trembl, but only if swiss_prot has not already covered it
    #
    #   Verify with old oncotator code?
    pass
예제 #14
0
    def indexGeneProteinPosition(geneColumn, proteinInfoColumn, inputFilename, outputFilename):
        """
        Creates an intermediate temporary file that includes two additional columns, startAA and endAA,
        sorts the file, writes the sorted file to outputFilename, and then indexes the sorted file.

        Rows whose protein value is missing or blank, or for which the extracted start or end position is
        blank, are left out of the intermediate file. The sorted file is ordered by the lower-cased value of
        geneColumn, then by startAA and endAA as integers.

        :param geneColumn: name of the gene column in the inputFilename
        :param proteinInfoColumn: name of the protein change or position column. Can be of formats: p.K128_R130del
        (position 128 through 130) For more examples, see MutUtilsTest.testProteinChange()
        :param inputFilename: input tsv filename
        :param outputFilename: output filename
        :return: the value returned by TabixIndexer.index for the sorted file
        """
        startAACol = "startAA"
        endAACol = "endAA"

        # Create intermediate file.  Do not use '#' for comments, since header can start with '#'
        tsvReader = GenericTsvReader(inputFilename, commentPrepend=";")

        # These are the outputHeaders for the intermediate file.
        headers = tsvReader.getFieldNames()

        if startAACol not in headers:
            headers += [startAACol]
        if endAACol not in headers:
            headers += [endAACol]

        # The intermediate file is a named temporary file (created in tempfile's default directory); it is
        # removed when `temp` is closed, so it has to stay open until sorting has finished.
        temp = tempfile.NamedTemporaryFile()
        try:
            # The context manager closes the intermediate file even if writing a row fails.
            with open(temp.name, 'w') as csvfile:
                # Initialize the intermediate file's header.
                tsvWriter = csv.DictWriter(csvfile, headers, delimiter='\t', lineterminator='\n')
                # If a header starts with '#', strip it.
                # NOTE(review): replace() removes every '#' in such a header, not only the leading one.
                for i, header in enumerate(headers):
                    if header.startswith("#"):
                        headers[i] = header.replace("#", "")
                tsvWriter.writeheader()

                # Get indices of relevant columns.
                gene_i = headers.index(geneColumn)
                startAA_i = headers.index(startAACol)
                endAA_i = headers.index(endAACol)

                # Write each line of the intermediate file.
                for row in tsvReader:
                    protein = row[proteinInfoColumn]
                    if protein is None or not protein.strip():
                        continue
                    [startAA, endAA] = MutUtils.extractProteinPosition(protein)
                    if not startAA.strip() or not endAA.strip():
                        continue
                    row[startAACol] = startAA
                    row[endAACol] = endAA
                    tsvWriter.writerow(row)

            # Sort the intermediate tsv file. The key uses the caller-supplied gene column rather than a
            # hard-coded "Gene name", so files with a differently named gene column sort correctly.
            tsvSorter = TsvFileSorter(temp.name)
            func = lambda val: ((val[geneColumn]).lower(), int(val[startAACol]), int(val[endAACol]))

            # Use the whole file path name.
            outputFilename = os.path.abspath(outputFilename)
            tsvSorter.sortFile(outputFilename, func)
        finally:
            # Closing the NamedTemporaryFile deletes the intermediate file.
            temp.close()

        return TabixIndexer.index(destDir=os.path.dirname(os.path.abspath(outputFilename)),
                                  inputFilename=outputFilename, fileColumnNumList=[gene_i, startAA_i, endAA_i])