def testProteinChange(self):
    """Test that protein change parsing of start and end works.

    Each sample protein-change string is passed to
    MutUtils.extractProteinPosition and the returned [start, end] pair is
    compared against the expected positions.  A string that is not a
    protein change ("blahblah") is expected to yield ['', ''].
    """
    # Each tuple is (test input, ground truth [start, end]).
    testInOuts = [
        ("p.K128_R130del", ['128', '130']),
        ("p.W274G", ["274", "274"]),
        ("p.13_14AA>A", ["13", "14"]),
        ("p.G25_splice", ["25", "25"]),
        ("p.E813*", ["813", "813"]),
        ("p.SLPQPEQRPY59del", ["59", "59"]),
    ]

    for proteinChange, groundTruth in testInOuts:
        result = MutUtils.extractProteinPosition(proteinChange)

        # An empty pair means the input could not be parsed at all.
        self.assertNotEqual(result, ['', ''],
                            "Result was empty. " + str(proteinChange) + ". ")

        mismatchMsg = ("Result did not match for " + str(proteinChange) + ". " +
                       str(result) + " GT: " + str(groundTruth))
        self.assertEqual(result[0], groundTruth[0], mismatchMsg)
        self.assertEqual(result[1], groundTruth[1], mismatchMsg)

    # Unparseable input must produce the empty start/end pair.
    self.assertEqual(MutUtils.extractProteinPosition("blahblah"), ['', ''])
def testProteinChange(self):
    """Test that protein change parsing of start and end works.

    Runs MutUtils.extractProteinPosition over a table of protein-change
    strings and checks the start and end positions it returns.  Input that
    does not look like a protein change is expected to give ['', ''].
    """
    # Each tuple is (test input, ground truth [start, end]).
    testInOuts = [
        ("p.K128_R130del", ['128', '130']),
        ("p.W274G", ["274", "274"]),
        ("p.13_14AA>A", ["13", "14"]),
        ("p.G25_splice", ["25", "25"]),
        ("p.E813*", ["813", "813"]),
        ("p.SLPQPEQRPY59del", ["59", "59"]),
    ]

    for testInput, expected in testInOuts:
        result = MutUtils.extractProteinPosition(testInput)

        # Fail early with a dedicated message when nothing was parsed.
        emptyMsg = "Result was empty. " + str(testInput) + ". "
        self.assertNotEqual(result, ['', ''], emptyMsg)

        # Start and end are checked separately; both share one message.
        mismatchMsg = ("Result did not match for " + str(testInput) + ". " +
                       str(result) + " GT: " + str(expected))
        self.assertEqual(result[0], expected[0], mismatchMsg)
        self.assertEqual(result[1], expected[1], mismatchMsg)

    # Unparseable input must produce the empty start/end pair.
    self.assertEqual(MutUtils.extractProteinPosition("blahblah"), ['', ''])
def indexGeneProteinPosition(geneColumn, proteinInfoColumn, inputFilename, outputFilename):
    """
    Creates an intermediate temporary file that includes two additional columns, startAA and endAA,
    sorts the file, writes the sorted file to outputFilename, and then indexes the sorted file.

    Rows whose protein field is empty, or for which MutUtils.extractProteinPosition does not
    return both a start and an end position, are left out of the intermediate file.

    :param geneColumn: name of the gene column in the inputFilename
    :param proteinInfoColumn: name of the protein change or position column. Can be of formats:
        p.K128_R130del (position 128 through 130)
        For more examples, see MutUtilsTest.testProteinChange()
    :param inputFilename: input tsv filename
    :param outputFilename: output filename
    :return: whatever TabixIndexer.index returns for the sorted output file
    """
    startAACol = "startAA"
    endAACol = "endAA"

    # Do not use '#' for comments, since header can start with '#'
    tsvReader = GenericTsvReader(inputFilename, commentPrepend=";")

    # These are the output headers for the intermediate file.
    headers = tsvReader.getFieldNames()
    if startAACol not in headers:
        headers.append(startAACol)
    if endAACol not in headers:
        headers.append(endAACol)

    # If a header has a leading '#', get rid of it.
    # NOTE: replace() drops every '#' in such a header, not only the leading one.
    for i, header in enumerate(headers):
        if header.startswith("#"):
            headers[i] = header.replace("#", "")

    # Get indices of relevant columns.
    gene_i = headers.index(geneColumn)
    startAA_i = headers.index(startAACol)
    endAA_i = headers.index(endAACol)

    def sortKey(val):
        # Sort on the same columns that are indexed below: gene (case-insensitive), start, end.
        return val[geneColumn].lower(), int(val[startAACol]), int(val[endAACol])

    # The intermediate file is created in the default temporary directory (see tempfile) and is
    # removed when `temp` is closed, so keep it open until sorting and indexing are done.
    temp = tempfile.NamedTemporaryFile()
    try:
        # Write the header and each usable row to the intermediate file.
        with open(temp.name, 'w') as csvfile:
            tsvWriter = csv.DictWriter(csvfile, headers, delimiter='\t', lineterminator='\n')
            tsvWriter.writeheader()

            for row in tsvReader:
                protein = row[proteinInfoColumn]
                if protein is None or not protein.strip():
                    continue

                startAA, endAA = MutUtils.extractProteinPosition(protein)
                if not startAA.strip() or not endAA.strip():
                    continue

                row[startAACol] = startAA
                row[endAACol] = endAA
                tsvWriter.writerow(row)

        # Sort the intermediate tsv file into outputFilename. Use the whole file path name.
        outputFilename = os.path.abspath(outputFilename)
        tsvSorter = TsvFileSorter(temp.name)
        tsvSorter.sortFile(outputFilename, sortKey)

        return TabixIndexer.index(destDir=os.path.dirname(outputFilename),
                                  inputFilename=outputFilename,
                                  fileColumnNumList=[gene_i, startAA_i, endAA_i])
    finally:
        temp.close()