def _read_rna_blacklist(aFilename, anIsDebug):
    '''
    ' Read one blacklist file and return its entries in file order.
    '
    ' Lines that start with "#" and lines that consist only of whitespace
    ' are skipped.  Trailing carriage return / newline characters are
    ' stripped from every entry that is kept.
    '
    ' aFilename:   The blacklist file to read
    ' anIsDebug:   A flag for outputting debug messages
    '''
    entryList = list()
    fileHandler = radiaUtil.get_read_fileHandler(aFilename)

    # close the file even when reading fails half way through
    try:
        for line in fileHandler:
            # we can ignore the lines that start with # for now
            if (line.startswith("#") or line.isspace()):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (anIsDebug):
                logging.debug("RNA Blacklist: %s", line)

            entryList.append(line)
    finally:
        fileHandler.close()

    return entryList


def get_rna_genes(anRnaGeneFile, anRnaGeneFamilyFile, anIsDebug):
    '''
    ' This function parses the RNA gene and RNA gene family blacklist files.
    '
    ' Both files are read the same way: comment lines (starting with "#")
    ' and blank lines are ignored, every other line becomes one entry.
    '
    ' anRnaGeneFile:         An RNA gene file
    ' anRnaGeneFamilyFile:   An RNA gene family file
    ' anIsDebug:             A flag for outputting debug messages
    '
    ' Returns a tuple (rnaGeneList, rnaGeneFamilyList) of two lists of strings.
    '''
    rnaGeneList = _read_rna_blacklist(anRnaGeneFile, anIsDebug)
    rnaGeneFamilyList = _read_rna_blacklist(anRnaGeneFamilyFile, anIsDebug)

    return rnaGeneList, rnaGeneFamilyList
def parse_blat_output(aBlatFile, anOutputFormat, anIsDebug):
    '''
    ' This function parses the output from BLAT.  Two formats are supported:
    ' BLAST NCBI-8 and PSL.  All hit lines are grouped in a nested dictionary
    ' that is returned once the whole file has been read (this function is
    ' not a generator).
    '
    ' The query id of each hit is expected to look like
    '     rnaTumor_7_55196749_HS2144:2:1108:17342:164248
    ' i.e. underscore separated prefix, chromosome, coordinate and read name.
    ' The id is taken from the 10th column for PSL and from the 1st column
    ' for BLAST.
    '
    ' Lines consisting only of whitespace are ignored and trailing carriage
    ' return / newline characters are stripped.
    ' NOTE(review): no header lines are skipped here, so the BLAT output is
    ' presumably produced without a header - confirm against the caller.
    '
    ' aBlatFile:        An output file from BLAT
    ' anOutputFormat:   BLAST or PSL
    ' anIsDebug:        A flag for outputting debug messages
    '
    ' Returns blatHitsDict[coordinateId][prefix][readId] -> list of hit lines
    ' where coordinateId is "chrom_coordinate" and readId is made of the
    ' first four underscore separated parts of the query id.
    '
    ' Raises ValueError when a hit line is found and anOutputFormat is
    ' neither "PSL" nor "BLAST".
    '''

    # open the file
    fileHandler = radiaUtil.get_read_fileHandler(aBlatFile)
    blatHitsDict = collections.defaultdict(dict)

    # make sure the file gets closed even if a line can't be parsed
    try:
        for line in fileHandler:

            # ignore the lines that only consist of whitespace
            if (line.isspace()):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (anIsDebug):
                logging.debug("BLAT: %s", line)

            # split the line on the tab
            splitLine = line.split("\t")

            # get the query id, e.g.
            # rnaTumor_7_55196749_HS2144:2:1108:17342:164248
            if (anOutputFormat == "PSL"):
                blatId = splitLine[9]
            elif (anOutputFormat == "BLAST"):
                blatId = splitLine[0]
            else:
                # without this, the id would be undefined on the first line
                # or silently reused from the previous line
                raise ValueError("Unsupported BLAT output format: '%s', "
                                 "must be PSL or BLAST" % anOutputFormat)

            blatSplitId = blatId.split("_")
            prefix = blatSplitId[0]
            coordinateId = "_".join(blatSplitId[1:3])
            readId = "_".join(blatSplitId[0:4])

            if coordinateId not in blatHitsDict:
                blatHitsDict[coordinateId] = collections.defaultdict(dict)
            if prefix not in blatHitsDict[coordinateId]:
                blatHitsDict[coordinateId][prefix] = collections.defaultdict(list)

            blatHitsDict[coordinateId][prefix][readId].append(line)
    finally:
        fileHandler.close()

    return blatHitsDict
def get_vcf_data(aVCFFile, anIsDebug):
    '''
    ' Read a VCF file and separate its header sections from its calls.
    '
    ' Blank lines are skipped and trailing carriage return / newline
    ' characters are stripped before a line is classified:
    '   - lines starting with ##FILTER are added to the filter list
    '   - lines starting with ##INFO are added to the info list
    '   - the line starting with #CHROM is kept as the column header line
    '   - all other lines starting with # are added to the header list
    '   - every remaining line is a call and is stored (with a trailing
    '     newline) in a dictionary keyed by the value of its second column
    '
    ' NOTE(review): the calls are keyed by the coordinate only, the
    ' chromosome in the first column is not part of the key.  Presumably
    ' each input file only contains one chromosome - verify against caller.
    '
    ' aVCFFile:    A VCF file
    ' anIsDebug:   A flag for outputting debug messages
    '
    ' Returns the tuple
    ' (headerList, chromLine, infoList, filterList, coordinateDict);
    ' chromLine is None when the file has no #CHROM line.
    '''

    headerList = list()
    chromLine = None
    infoList = list()
    filterList = list()
    coordinateDict = dict()

    vcfFileHandler = radiaUtil.get_read_fileHandler(aVCFFile)

    # the handle used to be left open, make sure it always gets closed
    try:
        for line in vcfFileHandler:

            # if it is an empty line, then just continue
            if (line.isspace()):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (anIsDebug):
                logging.debug("vcfLine: %s", line)

            # if we find the FILTER section, then record the filters
            if (line.startswith("##FILTER")):
                filterList.append(line)
            # if we find the INFO section, then record the info
            elif (line.startswith("##INFO")):
                infoList.append(line)
            # if we find the column header line
            elif (line.startswith("#CHROM")):
                chromLine = line
            # all the other header lines
            elif (line.startswith("#")):
                headerList.append(line)
            # now we are to the data
            else:
                # split the line on the tab
                splitLine = line.split("\t")

                # the coordinate is the second element
                stopCoordinate = splitLine[1]
                coordinateDict[stopCoordinate] = line + "\n"
    finally:
        vcfFileHandler.close()

    return (headerList, chromLine, infoList, filterList, coordinateDict)
def load_from_file(self, fname, ci=0, sti=1, spi=2, vi=3):
    '''
    ' Read a tab separated file and pass every row to self.load_bins().
    '
    ' For every line the columns at the indexes ci, sti, spi and vi are
    ' extracted and handed to self.load_bins() as one tuple
    ' (c, st, sp, v) where st and sp are converted to int.
    ' NOTE(review): the columns presumably are chromosome, start, stop and
    ' a value/name - confirm against load_bins().
    '
    ' fname:   The file to read
    ' ci:      Zero-based index of the first tuple element (kept as string)
    ' sti:     Zero-based index of the second tuple element (int)
    ' spi:     Zero-based index of the third tuple element (int)
    ' vi:      Zero-based index of the optional fourth tuple element; when
    '          the line has no such column, the empty string is used
    '
    ' Raises IndexError when a line has no column ci, sti or spi and
    ' ValueError when the sti or spi column is not an integer.
    '''
    inFile = radiaUtil.get_read_fileHandler(fname)

    # close the file even when a line can't be parsed
    try:
        for line in inFile:
            # only strip the line terminator; slicing off the last character
            # would eat a data character when the final line has no newline
            # and would leave a "\r" behind for files with CRLF endings
            data = line.rstrip("\r\n").split('\t')

            c = data[ci]
            st = int(data[sti])
            sp = int(data[spi])

            # the value column is optional, test against the requested
            # index instead of a hard-coded column count
            if len(data) <= vi:
                v = ''
            else:
                v = data[vi]

            self.load_bins((c, st, sp, v))
    finally:
        inFile.close()
def get_vcf_data(aVcfFile, aPassOnlyFlag, anIsDebug):
    '''
    ' This function reads from a VCF input file and uses the python generator
    ' to yield the calls one line at a time.  It ignores empty lines and
    ' header lines (starting with "#") and strips trailing carriage return
    ' and newline characters.
    '
    ' The file is opened when the first line is requested and it is closed
    ' when the generator is exhausted, closed or garbage collected, or when
    ' an error occurs.
    '
    ' aVcfFile:        A VCF file
    ' aPassOnlyFlag:   If all calls should be processed or only those calls
    '                  that passed the filters thus far.  A call counts as
    '                  passed when the text "PASS" occurs anywhere in the
    '                  line, not just in the FILTER column.
    ' anIsDebug:       A flag for outputting debug messages (currently
    '                  unused, the debug output of every line is disabled)
    '
    ' Yields each selected line as a string without the line terminator.
    '''

    # open the VCF file
    fileHandler = radiaUtil.get_read_fileHandler(aVcfFile)

    # the finally clause also runs when the consumer stops iterating early,
    # so the file no longer stays open in that case
    try:
        for line in fileHandler:

            # if it is an empty line, then just continue
            # if it is a header line, then just continue
            if (line.isspace() or line.startswith("#")):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # if we are only supposed to process the passed calls
            # and this call has not passed, then skip it
            if (aPassOnlyFlag and "PASS" not in line):
                continue

            yield line
    finally:
        fileHandler.close()
def _add_rna_origin(aVcfLine):
    '''
    ' Change ORIGIN=DNA to ORIGIN=DNA,RNA unless the line already has the
    ' combined origin.
    '''
    if ("ORIGIN=DNA,RNA" not in aVcfLine):
        return aVcfLine.replace("ORIGIN=DNA", "ORIGIN=DNA,RNA")
    return aVcfLine


def _merge_vcf_handlers(aDnaFileHandler, anRnaFileHandler,
                        anOverlapsFileHandler, aNonOverlapsFileHandler,
                        anIsDebug):
    '''
    ' Merge the calls from the already opened input streams; see
    ' merge_vcf_data() for the description of the inputs and the result.
    ' aNonOverlapsFileHandler may be None.
    '''
    headerList = list()
    coordinateDict = dict()

    # the dna file has the results from the dna mpileup filter
    # the rna file has the results from the rna mpileup filter
    # the overlaps file has calls that pass in both the DNA and RNA
    # the non-overlaps file originally has calls that don't pass in the DNA
    # but pass in the RNA, these RNA calls are further filtered to eliminate
    # possible germline (dnm*/DB/GERM) calls or false positives due to
    # pseudogenes (EGPS/RTPS) and then the RNA reads are optionally run
    # through the blat filter to check for mapping uniqueness - these are
    # the RNA Rescue and RNA Editing calls

    # process all of the calls from the DNA mpileup filter
    for line in aDnaFileHandler:

        # if it is an empty line, then just continue
        if (line.isspace()):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (anIsDebug and not line.startswith("#")):
            logging.debug("DNA mpileup Line: %s", line)

        # if it is a header line, then add it to the header list
        if (line.startswith("#")):
            # keep all the header lines
            headerList.append(line + "\n")
        # now we are to the data
        else:
            # the coordinate is the second element
            stopCoordinate = line.split("\t")[1]
            coordinateDict[stopCoordinate] = line + "\n"

    # these are all the calls that pass in both the DNA and RNA
    for line in anOverlapsFileHandler:

        # if it is an empty line, then continue
        # if it is a header line, then continue
        if (line.isspace() or line.startswith("#")):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (anIsDebug and not line.startswith("#")):
            logging.debug("Overlaps file Line: %s", line)

        # the coordinate is the second element
        stopCoordinate = line.split("\t")[1]

        # if the call passed in both the RNA and DNA,
        # then adjust the origin
        if (stopCoordinate in coordinateDict):
            dnaLine = coordinateDict[stopCoordinate]
            if (anIsDebug):
                logging.debug("passed in both RNA and DNA (from overlaps " +
                              "file) changing the origin to (DNA,RNA) " +
                              "\nDNALine: %s RNALine: %s\n", dnaLine, line)
            # guarded like everywhere else in this function, otherwise a
            # line that already has the combined origin (or a coordinate
            # that is listed twice) would end up as ORIGIN=DNA,RNA,RNA
            coordinateDict[stopCoordinate] = _add_rna_origin(dnaLine)
        else:
            coordinateDict[stopCoordinate] = line + "\n"

    # loop through the RNA mpileup filtered calls
    # create 2 dictionaries: one for passing, one for non-passing
    #
    # if an RNA Rescue or RNA Editing call passes in the non-overlaps file
    # below, then we want to use the original RNA mpileup passing call to
    # overwrite the DNA call.  the non-overlaps file is really the RNA
    # mpileup passing calls that are first filtered by DNA, then grep, and
    # then blat.  the filtered by DNA part doesn't select one modType when
    # no call passes, so the final passing call has more than one modType
    # which causes problems in the next filter, therefore use the RNA
    # mpileup passing call.
    #
    # mpileup_rna_origin:
    #   9  17464495  .  G  A  0.0  PASS
    #   ...;MC=G>A;MT=TUM_EDIT;NS=3;ORIGIN=RNA;...
    # vs.
    # mpileup_rna_origin->dnaFiltered->blat:
    #   9  17464495  .  G  A  0.0  PASS
    #   ...;MC=G>A,G>A;MF=rnacall,dtmnab_dtmnbq;
    #   MFT=DNA_TUM_EDIT_G>A,DNA_SOM_G>A;MT=TUM_EDIT,SOM;NS=3;ORIGIN=RNA;...
    #
    # the non-passing dictionary will be used below to help merge filtered
    # calls when a call gets filtered by both the RNA and DNA
    rnaMpileupPassingDict = {}
    rnaMpileupNonpassingDict = {}
    for rnaLine in anRnaFileHandler:

        # if it is an empty line, then just continue
        # if it is a header line, then just continue
        if (rnaLine.isspace() or rnaLine.startswith("#")):
            continue

        # strip the carriage return and newline characters
        rnaLine = rnaLine.rstrip("\r\n")

        if (anIsDebug and not rnaLine.startswith("#")):
            logging.debug("RNA mpileup Line: %s", rnaLine)

        # split the line on the tab
        rnaLineSplit = rnaLine.split("\t")

        # the coordinate is the second element
        stopCoordinate = rnaLineSplit[1]

        # put the call in the right dict
        if "PASS" in rnaLineSplit[6]:
            rnaMpileupPassingDict[stopCoordinate] = rnaLine
        else:
            rnaMpileupNonpassingDict[stopCoordinate] = rnaLine

    # these are the RNA Rescue and RNA Editing calls after
    # the initial filtering but before filterByReadSupport.py
    if (aNonOverlapsFileHandler is not None):
        for line in aNonOverlapsFileHandler:

            # if it is an empty line, then just continue
            # if it is a header line, then just continue
            if (line.isspace() or line.startswith("#")):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (anIsDebug and not line.startswith("#")):
                logging.debug("Non-overlaps Line: %s", line)

            # split the line on the tab
            splitLine = line.split("\t")

            # the coordinate is the second element
            stopCoordinate = splitLine[1]

            # if this call passed in the RNA, then overwrite
            # the DNA call that didn't pass
            if ("PASS" in splitLine[6]):
                # if this call existed in the DNA
                if (stopCoordinate in coordinateDict):
                    dnaLine = coordinateDict[stopCoordinate]
                    # get the RNA line from the RNA mpileups passing dict
                    # NOTE(review): this raises a KeyError when a passing
                    # non-overlaps call has no passing RNA mpileup call
                    rnaLine = rnaMpileupPassingDict[stopCoordinate]

                    # if it didn't pass in the DNA
                    if ("PASS" not in dnaLine):
                        if (anIsDebug):
                            logging.debug("Overwriting non-passing DNA call " +
                                          "with passing RNA Rescue calls " +
                                          "\nDNALine: %s" +
                                          "RNALineNonOverlaps: %s" +
                                          "\nRNALineMpileup: %s\n",
                                          dnaLine, line, rnaLine)
                        coordinateDict[stopCoordinate] = rnaLine + "\n"
                    else:
                        if (anIsDebug):
                            # this call passed in both
                            logging.debug("Unusual call in non-overlaps " +
                                          "file passed in both the RNA and " +
                                          "DNA but they probably don't have " +
                                          "the same modType! \nDNALine: %s " +
                                          "RNALineNonOverlaps: %s" +
                                          "\nRNALineMpileup: %s\n",
                                          dnaLine, line, rnaLine)

                        # at this point, there are multiple events that pass
                        # all the filters.  in this case, pick the passing
                        # event in the following order:
                        # GERM, NOR_EDIT, SOM, TUM_EDIT, RNA_TUM_VAR, LOH
                        if ("GERM" in dnaLine or "SOM" in dnaLine):
                            coordinateDict[stopCoordinate] = dnaLine
                        else:
                            # every stored call ends with a newline, the
                            # RNA mpileup line has been stripped above
                            coordinateDict[stopCoordinate] = rnaLine + "\n"
                # this call didn't exist in the DNA
                else:
                    logging.warning("Call didn't exist in DNA? " +
                                    "RNALine: %s\n", line)
                    coordinateDict[stopCoordinate] = line + "\n"

            # this call didn't pass in the RNA
            else:
                if (anIsDebug):
                    logging.debug("Call didn't pass in RNA: " +
                                  "RNALine: %s\n", line)

                # if this call existed in the DNA
                if (stopCoordinate in coordinateDict):
                    dnaLine = coordinateDict[stopCoordinate]

                    # if it didn't pass in the DNA
                    if ("PASS" not in dnaLine):
                        if (anIsDebug):
                            logging.debug("RNANoPass: Didn't pass in both, " +
                                          "so change origin and merge " +
                                          "filters \nDNALine: %s " +
                                          "RNALine: %s\n", dnaLine, line)

                        # change origin
                        dnaLine = _add_rna_origin(dnaLine)
                        dnaLine = dnaLine.rstrip("\r\n")
                        dnaLineSplit = dnaLine.split("\t")

                        # merge the filters for the FILTER column
                        dnaLineSplit[6] = merge_filters(splitLine[6],
                                                        dnaLineSplit[6])
                        # merge the mod filters and filter types
                        # in the INFO column
                        dnaLineSplit[7] = merge_mod_filters(splitLine[7],
                                                            dnaLineSplit[7])

                        newDnaLine = "\t".join(dnaLineSplit) + "\n"
                        coordinateDict[stopCoordinate] = newDnaLine

                        if (anIsDebug):
                            logging.debug("RNANoPass: After change origin " +
                                          "and merge filters " +
                                          "\nFinalLine: %s\n",
                                          "\t".join(dnaLineSplit))
                    else:
                        # the DNA line contains PASS, e.g. a DNA call with
                        # MT=GERM;...;ORIGIN=DNA and an RNA call at the same
                        # coordinate with MT=TUM_EDIT;...;ORIGIN=RNA
                        logging.warning("RNANoPass: Call passed in both " +
                                        "RNA and DNA but they probably " +
                                        "don't have the same modType " +
                                        "\nDNALine: %s RNALine: %s\n",
                                        dnaLine, line)

                        # at this point, there are multiple events that
                        # pass all the filters.  in this case, pick the
                        # passing event in the following order:
                        # GERM, NOR_EDIT, SOM, TUM_EDIT, RNA_TUM_VAR, LOH
                        if ("GERM" in dnaLine or "SOM" in dnaLine):
                            coordinateDict[stopCoordinate] = dnaLine
                        else:
                            # every stored call ends with a newline, the
                            # line has been stripped above
                            coordinateDict[stopCoordinate] = line + "\n"
                # this call didn't exist in the DNA
                else:
                    logging.warning("RNANoPass: Call didn't exist in DNA? " +
                                    "RNALine: %s\n", line)
                    coordinateDict[stopCoordinate] = line + "\n"

    # these are needed for merging the RNA mpileup filters
    for (rnaStopCoordinate, rnaLine) in rnaMpileupNonpassingDict.items():

        if (anIsDebug and not rnaLine.startswith("#")):
            logging.debug("RNA mpileup non-passing Line: %s", rnaLine)

        # if this call existed in the DNA
        if (rnaStopCoordinate in coordinateDict):
            # split the line on the tab
            rnaLineSplit = rnaLine.split("\t")

            # get the original line
            dnaLine = coordinateDict[rnaStopCoordinate]
            dnaLine = dnaLine.rstrip("\r\n")
            dnaLineSplit = dnaLine.split("\t")

            # if the call didn't pass in the RNA or DNA,
            # we want to merge the filters
            if "PASS" not in dnaLineSplit[6]:

                if (anIsDebug):
                    logging.debug("Merging filters for \nDNALine: %s " +
                                  "\nRNALine: %s", dnaLine, rnaLine)

                # merge the filters for the FILTER column
                dnaLineSplit[6] = merge_filters(rnaLineSplit[6],
                                                dnaLineSplit[6])
                # merge the mod filters and filter types in the INFO column
                dnaLineSplit[7] = merge_mod_filters(rnaLineSplit[7],
                                                    dnaLineSplit[7])

                finalLine = _add_rna_origin("\t".join(dnaLineSplit))
                coordinateDict[rnaStopCoordinate] = finalLine + "\n"

                if (anIsDebug):
                    logging.debug("Merged filters \nFinalLine: %s", finalLine)

        # this call didn't exist in the DNA
        else:
            coordinateDict[rnaStopCoordinate] = rnaLine + "\n"

    return (headerList, coordinateDict)


def merge_vcf_data(aDnaFile, anRnaFile, anOverlapsFile, aNonOverlapsFile,
                   anIsDebug):
    '''
    ' Merge the DNA and RNA calls of one sample into one set of calls.
    '
    ' The calls are merged in this order:
    '   1) all calls from the DNA file are loaded, its header lines are kept
    '   2) calls from the overlaps file that also exist in the DNA get the
    '      origin ORIGIN=DNA,RNA, the other ones are added as they are
    '   3) the RNA file is split into passing and non-passing calls based
    '      on the FILTER column
    '   4) if the non-overlaps file exists, its calls are merged: a passing
    '      call replaces a non-passing DNA call with the passing call from
    '      the RNA file, a non-passing call gets its filters merged into the
    '      non-passing DNA call; if the DNA line contains PASS as well, the
    '      DNA call wins if it contains GERM or SOM
    '   5) the filters of the remaining non-passing calls from the RNA file
    '      are merged into the non-passing DNA calls, calls that don't exist
    '      yet are added
    '
    ' aDnaFile:           The calls from the DNA mpileup filter
    ' anRnaFile:          The calls from the RNA mpileup filter
    ' anOverlapsFile:     The calls that pass in both the DNA and RNA
    ' aNonOverlapsFile:   The RNA Rescue and RNA Editing calls; this file is
    '                     optional and is ignored if it doesn't exist
    ' anIsDebug:          A flag for outputting debug messages
    '
    ' Returns a tuple (headerList, coordinateDict).  headerList has the
    ' header lines of the DNA file, coordinateDict maps the coordinate (the
    ' second column) to the merged call.  All lines end with a newline.
    '
    ' Raises KeyError when a passing call from the non-overlaps file exists
    ' in the DNA but has no passing call in the RNA file.
    '''
    openHandlerList = list()

    # close whatever has been opened, even if the merge fails
    try:
        dnaFileHandler = radiaUtil.get_read_fileHandler(aDnaFile)
        openHandlerList.append(dnaFileHandler)
        rnaFileHandler = radiaUtil.get_read_fileHandler(anRnaFile)
        openHandlerList.append(rnaFileHandler)
        overlapsFileHandler = radiaUtil.get_read_fileHandler(anOverlapsFile)
        openHandlerList.append(overlapsFileHandler)

        # the non-overlaps file is optional; check for it once so that the
        # decisions to open, read and close it can't disagree
        nonOverlapsFileHandler = None
        if (os.path.isfile(aNonOverlapsFile)):
            nonOverlapsFileHandler = radiaUtil.get_read_fileHandler(
                aNonOverlapsFile)
            openHandlerList.append(nonOverlapsFileHandler)

        return _merge_vcf_handlers(dnaFileHandler, rnaFileHandler,
                                   overlapsFileHandler,
                                   nonOverlapsFileHandler, anIsDebug)
    finally:
        for openHandler in openHandlerList:
            openHandler.close()
def get_vcf_data(anId, anInputDir, anIsDebug):
    '''
    ' Load all the per-chromosome VCF files of one id from a directory.
    '
    ' Every file matching <anInputDir>/<anId>_chr*.vcf* is read (the files
    ' might be gzipped, they might not).  Header lines are collected until
    ' the first #CHROM line has been seen, after that all header lines
    ' are ignored.  The calls are grouped by the chromosome in the first
    ' column; chromosomes that are numbers and chromosomes that are
    ' letters are kept apart so that they can be sorted separately later.
    '
    ' anId:         The id that the VCF filenames start with
    ' anInputDir:   The directory that contains the VCF files
    ' anIsDebug:    A flag for outputting debug messages
    '
    ' Returns a tuple (headerDict, coordinateDict):
    '   headerDict has the keys "metadata", "format", "info", "filter" and
    '   "chrom", each holding a list of header lines
    '   coordinateDict has the keys "numbers" and "letters", each holding a
    '   dictionary from chromosome to the list of its calls
    '''
    headerDict = dict()
    for sectionName in ("metadata", "format", "info", "filter", "chrom"):
        headerDict[sectionName] = list()

    coordinateDict = {"numbers": dict(), "letters": dict()}

    # the order matters: the generic "##" prefix has to be tested after
    # the specific ones
    headerSectionList = (("##FORMAT", "format"),
                         ("##INFO", "info"),
                         ("##FILTER", "filter"),
                         ("##", "metadata"),
                         ("#CHROM", "chrom"))
    isHeaderDone = False

    # glob.glob needs the directory to end with a forward slash
    if (anInputDir.endswith("/")):
        globDir = anInputDir
    else:
        globDir = anInputDir + "/"

    for vcfFile in glob.glob(globDir + anId + "_chr*.vcf*"):

        vcfFileHandler = radiaUtil.get_read_fileHandler(vcfFile)

        for rawLine in vcfFileHandler:

            # nothing to do for empty lines
            if (rawLine.isspace()):
                continue

            # get rid of the carriage return and newline characters
            line = rawLine.rstrip("\r\n")

            if (anIsDebug):
                logging.debug("vcfLine: %s", line)

            # only the first header is recorded
            if (not isHeaderDone):
                for (linePrefix, sectionName) in headerSectionList:
                    if (line.startswith(linePrefix)):
                        headerDict[sectionName].append(line)
                        # the #CHROM line is the end of the header
                        if (sectionName == "chrom"):
                            isHeaderDone = True
                        break

            # header lines never count as data
            if (line.startswith("#")):
                continue

            # the chromosome is the first element
            chrom = line.split("\t")[0]

            if (is_number(chrom)):
                chromGroup = "numbers"
            else:
                chromGroup = "letters"

            coordinateDict[chromGroup].setdefault(chrom, list()).append(line)

        # done with this file, move onto the next one
        vcfFileHandler.close()

    return (headerDict, coordinateDict)
def get_validation_data(anInputFilename, aStatsDict, aCompareDict, aPrefix,
                        anIsDebug):
    '''
    ' Load the events from a tab separated validation file.
    '
    ' Only the first and the third column are used, they are expected to be
    ' the chromosome and the stop coordinate.  According to the column list
    ' kept with this code the file layout is:
    '   chrom, chr_start, chr_stop, ref, var, source, val_result
    ' Empty lines, lines starting with "#" and lines starting with "chrom"
    ' (the column header) are skipped.
    '
    ' Every event is counted in aStatsDict[aPrefix + "_events"] and in
    ' aStatsDict[aPrefix + "_pass_events"], all events are considered
    ' passing events.  If aPrefix is "cmp", then for every value of
    ' aCompareDict (a comma separated list of keys) the counters
    ' aStatsDict["cmp_" + value] and aStatsDict["cmp_pass_" + value] are
    ' incremented once when one of the keys is found anywhere in the line.
    '
    ' anInputFilename:   The name of the validation file
    ' aStatsDict:        A dictionary holding stats about all the
    '                    comparisons; it is updated in place and has to
    '                    provide a value for every counter that is touched
    ' aCompareDict:      The key,value pairs that should be used in the
    '                    comparison; only the values are used here
    ' aPrefix:           "rad" for the RADIA files, "cmp" for the compare
    '                    files
    ' anIsDebug:         A flag for outputting debug messages (currently
    '                    unused)
    '
    ' Returns a tuple (outputDict, aStatsDict) where outputDict maps
    ' "<chrom>_<stopCoordinate>" to the line of the event.
    '''

    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    # close the file even if a line can't be parsed or a counter is missing
    try:
        for line in inputFileHandler:

            # if it is an empty line or header line, then just continue
            if (line.isspace() or
                    line.startswith("#") or
                    line.startswith("chrom")):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # split the line on the tab
            splitLine = line.split("\t")

            # these are 0-based
            chrom = splitLine[0]
            stopCoordinate = splitLine[2]

            # add the coordinate to the output
            outputDict[chrom + "_" + stopCoordinate] = line

            # keep track of the number of total events per file
            aStatsDict[aPrefix + "_events"] += 1
            # all events are considered passing events
            aStatsDict[aPrefix + "_pass_events"] += 1

            # the comparison events are only counted for the compare files
            if (aPrefix != "cmp"):
                continue

            # keep track of the total number of comparison events (blck,
            # dbSnp, etc.) per file.  there can be multiple keys for one
            # filter such as blq and bldp for blacklists
            for cmpKeyString in aCompareDict.values():
                # search for each one of the individual keys
                for cmpKey in cmpKeyString.split(","):
                    if (cmpKey in line):
                        # count it using the keyString; all events are
                        # passing, so the pass counter is always updated too
                        aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                        aStatsDict[aPrefix + "_pass_" + cmpKeyString] += 1
                        # only count it once
                        break
    finally:
        inputFileHandler.close()

    return (outputDict, aStatsDict)
def get_vcf_data(anInputFilename, aStatsDict, aCompareDict, aPrefix,
                 anIsDebug):
    '''
    ' Load the passing SNP events from a VCF file and count the comparison
    ' keys that are found in it.
    '
    ' Empty lines and lines starting with "#" are skipped.  A call is added
    ' to the output if aPrefix is "rad" or "cmp" and the line contains
    ' "PASS", "SNP" and at least one of "SOM", "EDIT", "RNA_TUM_VAR" or
    ' "RNA_NOR_VAR".  All of these are plain text searches on the whole
    ' line.
    '
    ' For every key,value pair in aCompareDict the key ("rad") or the value
    ' ("cmp") is split on the comma.  When the first of these keys is found
    ' in the line, aStatsDict[aPrefix + "_" + keyString] is incremented and,
    ' if the line also contains "PASS" and "SNP",
    ' aStatsDict[aPrefix + "_pass_" + keyString] as well.
    '
    ' anInputFilename:   The name of the VCF file
    ' aStatsDict:        A dictionary holding stats about all the
    '                    comparisons, it is updated in place
    ' aCompareDict:      The key,value pairs that should be used in the
    '                    comparison
    ' aPrefix:           "rad" for the RADIA files, "cmp" for the compare
    '                    files
    ' anIsDebug:         A flag for outputting debug messages (not used)
    '
    ' Returns a tuple (outputDict, aStatsDict) where outputDict maps
    ' "<chrom>_<coordinate>" to the line of the call.
    '''
    eventTypeList = ("SOM", "EDIT", "RNA_TUM_VAR", "RNA_NOR_VAR")

    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    for rawLine in inputFileHandler:

        # nothing to do for empty lines and header lines
        if (rawLine.isspace() or rawLine.startswith("#")):
            continue

        # get rid of the carriage return and newline characters
        line = rawLine.rstrip("\r\n")

        # columnHeaders = ["CHROM", "POS", "ID", "REF", "ALT",
        #                  "QUAL", "FILTER", "INFO", "FORMAT"]
        fieldList = line.split("\t")
        chrom = fieldList[0]
        stopCoordinate = fieldList[1]

        isPassingSnp = ("PASS" in line and "SNP" in line)

        # the thing that is being compared to has to have the
        # smaller/limited amount i.e. only the passing events,
        # otherwise everything will be found
        if ((aPrefix == "rad" or aPrefix == "cmp") and
                isPassingSnp and
                any(eventType in line for eventType in eventTypeList)):
            outputDict[chrom + "_" + stopCoordinate] = line

        # keep track of the total number of comparison events (blck, dbSnp,
        # etc.) per file.  there can be multiple keys for one filter such
        # as blq and bldp for blacklists
        for (radKeyString, cmpKeyString) in aCompareDict.iteritems():

            # the RADIA files use the keys, the compare files the values
            if (aPrefix == "rad"):
                keyString = radKeyString
            elif (aPrefix == "cmp"):
                keyString = cmpKeyString
            else:
                continue

            for singleKey in keyString.split(","):
                if (singleKey not in line):
                    continue

                # count it using the keyString
                aStatsDict[aPrefix + "_" + keyString] += 1

                # if this is a passing line, count it as passing as well
                if (isPassingSnp):
                    aStatsDict[aPrefix + "_pass_" + keyString] += 1

                # only count it once
                break

    inputFileHandler.close()

    return (outputDict, aStatsDict)
def get_vcf_data(aVcfFile, aPassOnlyFlag, anIsDebug):
    '''
    ' This function reads from a .vcf input file and uses the python
    ' generator to yield the parsed information one call at a time.  It
    ' ignores empty lines and header lines (starting with "#") and strips
    ' trailing carriage return and newline characters.
    '
    ' The file is opened when the first call is requested and it is closed
    ' when the generator is exhausted, closed or garbage collected, or when
    ' an error occurs.
    '
    ' aVcfFile:        A VCF file
    ' aPassOnlyFlag:   If all calls should be processed or only those calls
    '                  that passed the filters thus far.  A call counts as
    '                  passed when the text "PASS" occurs anywhere in the
    '                  line, not just in the FILTER column.
    ' anIsDebug:       A flag for outputting debug messages
    '
    ' Yields one tuple per call:
    '   (chrom, stopCoordinate, idList, refList, altList, score, filterSet,
    '    infoDict, restOfLine)
    ' stopCoordinate is an int, filterSet is the FILTER column split on ";",
    ' infoDict maps each INFO key to the list of its comma separated values
    ' (["True"] for keys without a value) and restOfLine has the FORMAT and
    ' sample columns joined by tabs.  The other elements are the unparsed
    ' column strings.
    '''

    # open the VCF file
    fileHandler = radiaUtil.get_read_fileHandler(aVcfFile)

    # the finally clause also runs when the consumer stops iterating early
    try:
        for line in fileHandler:

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (anIsDebug):
                logging.debug("VCF: %s", line)

            # if it is an empty line, then just continue.  the line has
            # already been stripped, so a blank line is the empty string
            # here and "".isspace() is False - test for both
            if (not line or line.isspace()):
                continue

            # these are header lines, so just continue
            if (line.startswith("#")):
                continue

            # if we are only supposed to process the passed calls
            # and this call has not passed, then skip it
            if (aPassOnlyFlag and "PASS" not in line):
                continue

            # split the line on the tab
            splitLine = line.split("\t")

            chrom = splitLine[0]
            # the coordinate is the second element
            stopCoordinate = int(splitLine[1])
            idList = splitLine[2]
            refList = splitLine[3]
            altList = splitLine[4]
            score = splitLine[5]
            filterSet = set(splitLine[6].split(";"))
            infoList = splitLine[7].split(";")

            infoDict = collections.defaultdict(list)
            for info in infoList:
                # only split on the first "=" so that a value which
                # contains an equal sign is not cut off
                (infoKey, separator, infoValue) = info.partition("=")

                # some keys are just singular without a value
                # (e.g. DB, SOMATIC, etc.)
                if (not separator):
                    infoDict[infoKey] = ["True"]
                else:
                    # the value can be a comma separated list
                    infoDict[infoKey] = infoValue.split(",")

            # yield all the information about the current coordinate
            yield (chrom, stopCoordinate, idList, refList, altList, score,
                   filterSet, infoDict, "\t".join(splitLine[8:]))
    finally:
        fileHandler.close()
def main():
    '''
    ' Command line entry point: flag the calls of a VCF file that hit a gene
    ' on the RNA gene blacklist or a gene family on the RNA gene family
    ' blacklist.
    '
    ' The three required arguments are the VCF file, the RNA gene blacklist
    ' file and the RNA gene family blacklist file.  The result is written to
    ' the output file (-o) or to STDOUT.
    '
    ' The header lines are copied and the two new FILTER definitions (rgene
    ' and rgfam) are inserted in front of the first ##FILTER or ##INFO line.
    ' For every processed call the effects in the EFF key of the INFO column
    ' are inspected (UPSTREAM and DOWNSTREAM effects are ignored):
    '   - rgene is added if an entry of the gene blacklist is contained in
    '     the gene name of an effect
    '   - rgfam is added if the transcript biotype of an effect is on the
    '     gene family blacklist
    ' Unless -c is specified, calls that have not passed all the filters
    ' thus far are copied to the output without being processed.
    '
    ' Exits with status 1 if the arguments are wrong or if the file checks
    ' fail and raises a ValueError for an invalid log level.
    '''

    # create the usage statement
    usage = "usage: python %prog vcfFile rnaGeneFile rnaGeneFamilyFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), %default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls",
        action="store_false", default=True, dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters thus far are processed, include this argument if all of the VCF calls should be processed")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 14, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts the options as well, so make sure that
    # all three required arguments are really there before indexing them
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_vcfFilename = str(i_cmdLineArgs[0])
    i_rnaGeneFilename = str(i_cmdLineArgs[1])
    i_rnaGeneFamilyFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # format the message here, ValueError doesn't do it for us
        raise ValueError(
            "Invalid log level: '%s' must be one of the following: DEBUG, INFO, WARNING, ERROR, CRITICAL"
            % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("rnaGeneFilename=%s", i_rnaGeneFilename)
        logging.debug("rnaGeneFamilyFilename=%s", i_rnaGeneFamilyFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("passedOnly?=%s", i_passedVCFCallsOnlyFlag)

    # check for any errors; both the output file and the log file have to
    # be checked, so append them instead of replacing the list
    i_writeFilenameList = []
    if (i_outputFilename is not None):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [
        i_vcfFilename, i_rnaGeneFilename, i_rnaGeneFamilyFilename
    ]

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream, fall back to STDOUT.  using one stream for
    # both cases guarantees that the same bytes are written to both
    i_outputFileHandler = None
    i_outputStream = sys.stdout
    if (i_outputFilename is not None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)
        i_outputStream = i_outputFileHandler

    # get the RNA gene blacklists
    (i_rnaGeneList, i_rnaGeneFamilyList) = get_rna_genes(
        i_rnaGeneFilename, i_rnaGeneFamilyFilename, i_debug)
    # the gene families are only used for exact lookups
    i_rnaGeneFamilySet = set(i_rnaGeneFamilyList)

    hasAddedHeader = False
    vcfHeader = "##FILTER=<ID=rgene,Description=\"This gene is on the RNA gene blacklist\">\n"
    vcfHeader += "##FILTER=<ID=rgfam,Description=\"This gene family is on the RNA gene family blacklist\">\n"

    # these don't change from line to line
    effectRegEx = re.compile("(\\w).*\\({1}")
    ignoreEffectsList = ["UPSTREAM", "DOWNSTREAM"]

    i_vcfFileHandler = radiaUtil.get_read_fileHandler(i_vcfFilename)

    # close the files even if a line can't be parsed
    try:
        for line in i_vcfFileHandler:

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (i_debug):
                logging.debug("vcfLine: %s", line)

            # if it is an empty line, then just continue.  the line has
            # already been stripped, so a blank line is the empty string
            # here and "".isspace() is False - test for both
            if (not line or line.isspace()):
                continue

            # if we find the FILTER section, then add the filters from here
            elif ((not hasAddedHeader) and
                  (line.startswith("##FILTER") or
                   line.startswith("##INFO"))):
                hasAddedHeader = True
                # the header already ends with a newline
                i_outputStream.write(vcfHeader)
                i_outputStream.write(line + "\n")

            # these lines are from previous scripts in the pipeline,
            # so output them
            elif (line.startswith("#")):
                i_outputStream.write(line + "\n")

            # if we are only supposed to process the passed calls
            # and this call has not passed, then just output it
            elif (i_passedVCFCallsOnlyFlag and "PASS" not in line):
                i_outputStream.write(line + "\n")

            # now we are to the data
            else:
                # split the line on the tab
                splitLine = line.split("\t")

                filterSet = set(splitLine[6].split(";"))

                # if there are no filters so far, then clear the list
                if (len(filterSet) == 1 and "PASS" in filterSet):
                    filterSet = set()

                # parse the info column and create a dict
                infoList = splitLine[7].split(";")
                infoDict = collections.defaultdict(list)
                for info in infoList:
                    # only split on the first "=" so that a value which
                    # contains an equal sign is not cut off
                    (infoKey, separator, infoValue) = info.partition("=")

                    # some keys are just singular without a value
                    # (e.g. DB, SOMATIC, etc.)
                    if (not separator):
                        infoDict[infoKey] = ["True"]
                    else:
                        # the value can be a comma separated list
                        infoDict[infoKey] = infoValue.split(",")

                # this is an empty list if the call has no EFF key
                effectList = infoDict["EFF"]

                isRnaBlacklistGene = False
                isRnaBlacklistGeneFamily = False

                for rawEffect in effectList:
                    rawEffect = rawEffect.rstrip(")")
                    iterator = effectRegEx.finditer(rawEffect)

                    # for each match object in the iterator
                    for match in iterator:
                        # the match is the effect name including the "(",
                        # the rest are the "|" separated effect details
                        effect = match.group()
                        rawEffect = rawEffect.replace(effect, "")
                        effect = effect.rstrip("(")

                        if (effect in ignoreEffectsList):
                            continue

                        effectParts = rawEffect.split("|")
                        # 0=effectImpact, 1=functionalClass, 2=codonChange,
                        # 3=aaChange, 4=aaLength, 5=geneName,
                        # 6=transcriptBiotype, 7=geneCoding, 8=ensembleId,
                        # 9=exonNumber, 10=genotypeNumber
                        geneName = effectParts[5]
                        transcriptBiotype = effectParts[6]

                        # the RNA gene list can have "RP11" and that
                        # should filter out any gene with RP11 in it
                        for rnaGene in i_rnaGeneList:
                            if (rnaGene in geneName):
                                isRnaBlacklistGene = True
                                break

                        if (transcriptBiotype in i_rnaGeneFamilySet):
                            isRnaBlacklistGeneFamily = True

                output = ["\t".join(splitLine[0:6])]

                # if the filters should be applied
                if (isRnaBlacklistGene):
                    filterSet.add("rgene")
                if (isRnaBlacklistGeneFamily):
                    filterSet.add("rgfam")

                # if there are no filters so far, then this call passes
                if (len(filterSet) == 0):
                    filterSet.add("PASS")

                output.append(";".join(filterSet))
                output.append("\t".join(splitLine[7:]))

                i_outputStream.write("\t".join(output) + "\n")
    finally:
        # close the files, but leave STDOUT alone
        i_vcfFileHandler.close()
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return
def get_simulation_data(anInputFilename, aStatsDict, aCompareDict,
                        aPrefix, anIsDebug):
    '''
    ' This function loads a tab-delimited simulation file and indexes every
    ' data line by "<chrom>_<mutation position>".  The chromosome is taken
    ' from the 2nd column and the mutation position from the 6th column.
    ' Blank lines and lines starting with "#" are skipped.
    '
    ' anInputFilename: The name of the simulation file to read
    ' aStatsDict: A dictionary of counters that is updated in place
    ' aCompareDict: The key,value pairs that should be used in the
    '    comparison; only the values are used, and only when aPrefix is "cmp"
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages (unused here)
    '
    ' Returns the tuple (coordinate -> line dictionary, aStatsDict).
    '''
    simulationFile = radiaUtil.get_read_fileHandler(anInputFilename)
    coordinateDict = {}

    for rawLine in simulationFile:

        # nothing to do for comments and blank lines
        if (rawLine.startswith("#") or rawLine.isspace()):
            continue

        record = rawLine.rstrip("\r\n")
        fields = record.split("\t")

        # column 2 is the chromosome, column 6 is the mutation position
        coordinate = fields[1] + "_" + fields[5]

        # report coordinates that show up more than once; the last one wins
        if coordinate in coordinateDict:
            logging.debug(record + coordinateDict[coordinate])
        coordinateDict[coordinate] = record

        # every simulated event counts as an event and as a passing event
        aStatsDict[aPrefix + "_events"] += 1
        aStatsDict[aPrefix + "_pass_events"] += 1

        # one filter can be described by several comma separated keys
        # (e.g. blq and bldp for blacklists); count the filter at most once
        for (_radKeyString, cmpKeyString) in aCompareDict.iteritems():
            if (aPrefix != "cmp"):
                continue
            if any(cmpKey in record for cmpKey in cmpKeyString.split(",")):
                aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                aStatsDict[aPrefix + "_pass_" + cmpKeyString] += 1

    simulationFile.close()
    return (coordinateDict, aStatsDict)
def parse_blat_output(aBlatFile, anOutputFormat, anIsDebug):
    '''
    ' This generator parses the output from BLAT.  Two formats are supported:
    ' BLAST NCBI-8 and PSL.  Consecutive lines whose query id shares the same
    ' first three "_"-separated parts (e.g. rnaTumor_7_55196749) are grouped
    ' together, and one dictionary is yielded per group.  The dictionary maps
    ' the first four "_"-separated parts of the query id to the list of BLAT
    ' lines for that read.  Blank lines and lines starting with "#" are
    ' ignored and trailing carriage-return/newline characters are stripped.
    '
    ' A final dictionary is always yielded once the input is exhausted, so an
    ' input without any hits yields a single empty dictionary.
    '
    ' aBlatFile: A output file from BLAT
    ' anOutputFormat: BLAST or PSL
    ' anIsDebug: A flag for outputting debug messages (unused here)
    '
    ' Raises a ValueError if anOutputFormat is neither "PSL" nor "BLAST".
    '''
    # open the file
    fileHandler = radiaUtil.get_read_fileHandler(aBlatFile)
    blatHitsDict = collections.defaultdict(list)
    previousPrefix = ""

    # the try/finally guarantees that the file is closed when the input is
    # exhausted, when an error occurs, or when the generator is abandoned
    try:
        for line in fileHandler:

            # if it is an empty line or a header line, then just continue
            if (line.isspace() or line.startswith("#")):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # split the line on the tab
            splitLine = line.split("\t")

            # get the coordinate data =
            # rnaTumor_7_55196749_HS2144:2:1108:17342:164248
            if (anOutputFormat == "PSL"):
                # the PSL output has a bunch of header lines that we want to
                # skip: if the first column can't be converted into an int,
                # then it is not a data line
                try:
                    int(splitLine[0])
                except ValueError:
                    continue
                blatId = splitLine[9]
            elif (anOutputFormat == "BLAST"):
                blatId = splitLine[0]
            else:
                # without this, blatId would be unbound or left over
                # from a previous line
                raise ValueError("Unsupported BLAT output format '%s', "
                                 "expected 'PSL' or 'BLAST'" %
                                 (anOutputFormat,))

            blatIdParts = blatId.split("_")
            prefix = "_".join(blatIdParts[0:3])
            readId = "_".join(blatIdParts[0:4])

            # a new prefix after at least one hit means that the previous
            # group is complete, so yield it.  a fresh dictionary is started
            # instead of clearing the old one, so that a consumer holding on
            # to a yielded group doesn't see it emptied later on.
            if (prefix != previousPrefix and blatHitsDict):
                yield blatHitsDict
                blatHitsDict = collections.defaultdict(list)

            previousPrefix = prefix
            blatHitsDict[readId].append(line)
    finally:
        fileHandler.close()

    # this one is needed to yield the very last group when all
    # lines of the BLAT output have been processed
    yield blatHitsDict
    return
def get_maf_data(anInputFilename, aStatsDict, aCompareDict, aPrefix, anIsDebug):
    '''
    ' This function reads a tab-delimited MAF-style file and indexes every
    ' data line by "<chrom>_<stop coordinate>".  The chromosome is read from
    ' the 5th column and the stop coordinate from the 7th column, so each
    ' data line must have at least 7 columns.  Blank lines and lines starting
    ' with "#" are skipped.  When a coordinate occurs more than once, the
    ' last line wins.
    '
    ' anInputFilename: The name of the file to read
    ' aStatsDict: A dictionary of counters that is updated in place
    ' aCompareDict: The key,value pairs that should be used in the
    '    comparison: the keys are comma separated RADIA keys, the values are
    '    the corresponding comma separated compare keys
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages (unused here)
    '
    ' Returns the tuple (coordinate -> line dictionary, aStatsDict).
    '''
    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    # make sure the file gets closed even when a line can't be parsed
    try:
        for line in inputFileHandler:

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # skip blank lines.  after stripping, a blank line is the empty
            # string and "".isspace() is False, so test with strip() instead.
            # skip the header lines as well.
            if (not line.strip() or line.startswith("#")):
                continue

            # now we are to the data
            # split the line on the tab
            splitLine = line.split("\t")
            chrom = splitLine[4]
            stopCoordinate = splitLine[6]

            # add the coordinate to the output
            outputDict[chrom + "_" + stopCoordinate] = line

            # keep track of the number of total events per file
            aStatsDict[aPrefix + "_events"] += 1
            # all events are considered passing events
            aStatsDict[aPrefix + "_pass_events"] += 1

            # keep track of the total number of comparison events
            # (blck, dnSnp, etc.) per file.  there can be multiple keys for
            # one filter such as blq and bldp for blacklists.
            for (radKeyString, cmpKeyString) in aCompareDict.items():
                if (aPrefix == "rad"):
                    keyString = radKeyString
                elif (aPrefix == "cmp"):
                    keyString = cmpKeyString
                else:
                    continue

                # count the filter only once, even if several keys match
                if any(key in line for key in keyString.split(",")):
                    aStatsDict[aPrefix + "_" + keyString] += 1
                    # NOTE(review): this match is case-sensitive - confirm
                    # that the input really spells it "SOMATIC"
                    if ("SOMATIC" in line):
                        aStatsDict[aPrefix + "_pass_" + keyString] += 1
    finally:
        inputFileHandler.close()

    return (outputDict, aStatsDict)
def get_maf_data(anInputFilename, aStatsDict, aCompareDict, aPrefix, anIsDebug):
    '''
    ' This function loads a tab-delimited MAF-style file and indexes every
    ' data line by "<chrom>_<stop coordinate>", using the 5th column as the
    ' chromosome and the 7th column as the stop coordinate.  Blank lines and
    ' lines starting with "#" are skipped.
    '
    ' anInputFilename: The name of the file to read
    ' aStatsDict: A dictionary of counters that is updated in place
    ' aCompareDict: The key,value pairs that should be used in the
    '    comparison (comma separated RADIA keys -> comma separated cmp keys)
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages (unused here)
    '
    ' Returns the tuple (coordinate -> line dictionary, aStatsDict).
    '''
    mafFile = radiaUtil.get_read_fileHandler(anInputFilename)
    coordinateDict = {}

    for rawLine in mafFile:

        # comments and blank lines carry no events
        if (rawLine.startswith("#") or rawLine.isspace()):
            continue

        record = rawLine.rstrip("\r\n")
        fields = record.split("\t")

        # index the line by chromosome and stop coordinate
        coordinateDict[fields[4] + "_" + fields[6]] = record

        # every line counts as an event and as a passing event
        aStatsDict[aPrefix + "_events"] += 1
        aStatsDict[aPrefix + "_pass_events"] += 1

        # a filter can be described by several comma separated keys
        # (e.g. blq and bldp for blacklists); count it at most once per line
        for (radKeyString, cmpKeyString) in aCompareDict.iteritems():

            # pick the key string that belongs to this kind of file
            if (aPrefix == "rad"):
                keyString = radKeyString
            elif (aPrefix == "cmp"):
                keyString = cmpKeyString
            else:
                continue

            if not any(key in record for key in keyString.split(",")):
                continue

            aStatsDict[aPrefix + "_" + keyString] += 1
            # only the somatic lines are counted as passing for this filter
            if ("SOMATIC" in record):
                aStatsDict[aPrefix + "_pass_" + keyString] += 1

    mafFile.close()
    return (coordinateDict, aStatsDict)
def get_vcf_data(aVcfFile, aHeaderFile, aPassOnlyFlag, anIsDebug):
    '''
    ' This generator reads a .vcf header file followed by a .vcf input file
    ' and yields the information from the input file one data line at a time.
    ' It ignores blank lines and strips trailing carriage-return/newline
    ' characters.  The parameters of the "vcfGenerator" header line are
    ' collected from the header file first; a "vcfGenerator" line in the
    ' .vcf file itself replaces them.  If no such line is found, an empty
    ' dictionary is yielded for the parameters.
    '
    ' aVcfFile: A VCF file
    ' aHeaderFile: A file with VCF header lines that is read up to the first
    '    non-header line
    ' aPassOnlyFlag: If all calls should be processed or only those calls
    '    that passed the filters thus far
    ' anIsDebug: A flag for outputting debug messages (unused here)
    '
    ' Yields the tuple (chrom, stopCoordinate, idList, refList, altList,
    ' score, filterSet, infoDict, restOfLine, generatorParamsDict) where
    ' restOfLine is the tab-joined FORMAT and sample columns.
    '''

    def parse_generator_params(aGeneratorLine):
        # drop the 16 leading characters ("##vcfGenerator=<") and the
        # last character (the closing ">")
        paramsDict = {}
        for param in aGeneratorLine[16:-1].split(","):
            # only split on the first "=" so that values which
            # contain an "=" themselves don't break the parsing
            (key, value) = param.split("=", 1)
            paramsDict[key] = value.rstrip(">").lstrip("<")
        return paramsDict

    # make sure that there is always something to yield,
    # even if none of the files has a vcfGenerator line
    generatorParamsDict = {}

    # open the header file
    fileHandler = radiaUtil.get_read_fileHandler(aHeaderFile)
    try:
        for line in fileHandler:

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # skip blank lines.  after stripping, a blank line is the empty
            # string and "".isspace() is False, so test with strip() instead.
            if (not line.strip()):
                continue
            # the column headers are not needed here
            elif ("#CHROM" in line):
                continue
            # if we find the vcfGenerator line, then create the dict of params
            elif ("vcfGenerator" in line):
                generatorParamsDict = parse_generator_params(line)
            # if we are done with the header, then stop
            elif (not line.startswith("#")):
                break
    finally:
        fileHandler.close()

    # open the VCF file
    fileHandler = radiaUtil.get_read_fileHandler(aVcfFile)

    # the try/finally closes the file when the input is exhausted, when
    # a line can't be parsed, or when the generator is abandoned early
    try:
        for line in fileHandler:

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # skip blank lines and the column headers
            if (not line.strip() or "#CHROM" in line):
                continue

            # if we find the vcfGenerator line, then create the dict of params
            if ("vcfGenerator" in line):
                generatorParamsDict = parse_generator_params(line)
                continue

            # these are header lines, so just continue
            if (line.startswith("#")):
                continue

            # if we are only suppose to process the passed calls
            # and this call has not passed, then skip it
            if (aPassOnlyFlag and "PASS" not in line):
                continue

            # split the line on the tab
            splitLine = line.split("\t")

            # the coordinate is the second element
            chrom = splitLine[0]
            stopCoordinate = int(splitLine[1])
            idList = splitLine[2].split(";")
            refList = splitLine[3].split(",")
            altList = splitLine[4].split(",")
            score = float(splitLine[5])
            filterSet = set(splitLine[6].split(";"))

            # parse the info column and create a dict
            infoDict = collections.defaultdict(list)
            for info in splitLine[7].split(";"):
                keyValueList = info.split("=")
                # some keys are just singular without
                # a value (e.g. DB, SOMATIC, etc.)
                if (len(keyValueList) == 1):
                    infoDict[keyValueList[0]] = ["True"]
                else:
                    # the value can be a comma separated list
                    infoDict[keyValueList[0]] = keyValueList[1].split(",")

            # yield all the information about the current coordinate
            yield (chrom, stopCoordinate, idList, refList, altList, score,
                   filterSet, infoDict, "\t".join(splitLine[8:]),
                   generatorParamsDict)
    finally:
        fileHandler.close()

    return
def filter_events(aTCGAId, aChrom, aBedFilename, aVCFFilename,
                  anOutputFilename, aFilterName, aFilterField,
                  anIncludeOverlapInfo, anIncludeFilterName, anIdField,
                  anIncludeId, anIncludeCount, aFilterHeaderLine,
                  aBinSize, anIsDebug):
    '''
    ' This function loads the regions of a .bed file and then goes through a
    ' .vcf file line by line to check whether each variant overlaps with one
    ' of the regions.  Depending on anIncludeOverlapInfo, either the
    ' overlapping variants or the non-overlapping variants are annotated
    ' with the filter name and/or id; all other variants are output
    ' unchanged.  Every variant is output, either to anOutputFilename or
    ' to STDOUT when no output filename is given.
    '
    ' aTCGAId: The TCGA Id for this sample (only used for logging)
    ' aChrom: The chromosome being filtered (only used for logging)
    ' aBedFilename: A .bed file with the regions to check against
    ' aVCFFilename: A .vcf file with the variants to check
    ' anOutputFilename: An output file for the variants, or None for STDOUT
    ' aFilterName: The name of the filter
    ' aFilterField: The field where the filter name should be included
    '    (e.g. INFO or FILTER)
    ' anIncludeOverlapInfo: If true, the overlapping variants are annotated,
    '    otherwise the non-overlapping variants are annotated
    ' anIncludeFilterName: A flag specifying whether the filter name should
    '    be added to the annotated variants
    ' anIdField: The field where the ID should be specified (e.g. ID or INFO)
    ' anIncludeId: A flag specifying whether the id should be included in
    '    the output or not
    ' anIncludeCount: A flag specifying whether the number of overlaps
    '    should be included in the output or not
    ' aFilterHeaderLine: A filter header line that is handed to the vcf
    '    generator
    ' aBinSize: The bin size that pybed is initialized with
    ' anIsDebug: A flag for outputting debug messages
    '''
    # load the filtering regions
    filterPybed = pybed(binsize=aBinSize)
    filterPybed.load_from_file(aBedFilename)

    # open the input, and the output if one was specified
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(aVCFFilename)
    writeToFile = (anOutputFilename is not None)
    i_outputFileHandler = None
    if (writeToFile):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(anOutputFilename)

    def output_event(anEventLine):
        # one event per line, either to the output file or to STDOUT
        if (writeToFile):
            i_outputFileHandler.write(anEventLine + "\n")
        else:
            sys.stdout.write(anEventLine + "\n")

    # create the generator for the vcf file
    vcfGenerator = get_vcf_data(i_vcfFileHandler, i_outputFileHandler,
                                aFilterHeaderLine, anIsDebug)

    overlappingEvents = 0
    nonOverlappingEvents = 0
    totalEvents = 0
    startTime = time.time()

    for vcfEvent in vcfGenerator:
        (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate, vcf_id, vcf_ref,
         vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_restLine,
         vcf_line) = vcfEvent

        totalEvents += 1

        if (anIsDebug):
            logging.debug("VCF: %s", vcf_line)

        # check if this vcf coordinate overlaps with the filter coordinates
        (isOverlap, idValue, count) = filterPybed.overlaps_with(
            (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate),
            anIncludeCount)

        # overlapping events are annotated when overlap info was requested,
        # non-overlapping events are annotated when it was not requested
        if (isOverlap):
            overlappingEvents += 1
            annotateEvent = anIncludeOverlapInfo
        else:
            nonOverlappingEvents += 1
            annotateEvent = not anIncludeOverlapInfo

        # nothing to add, so the original line is output as is
        if (not annotateEvent):
            output_event(vcf_line)
            continue

        # alter the filter and id name if appropriate
        if (anIncludeFilterName):
            (vcf_filter, vcf_info) = add_filter(
                vcf_filter, vcf_info, aFilterName, aFilterField,
                anIncludeCount, count, anIncludeId, anIdField, idValue)
        if (anIncludeId and anIdField == "ID"):
            vcf_id = add_id(vcf_id, idValue)

        # put the event back together and output it
        outputList = (vcf_chr, str(vcf_stopCoordinate), vcf_id, vcf_ref,
                      vcf_alt, vcf_qual, vcf_filter, vcf_info)
        output_event("\t".join(outputList) + "\t" + "\t".join(vcf_restLine))

    elapsedSecs = time.time() - startTime
    logging.info("Chrom %s and Id %s: Total time=%s hrs, %s mins, %s secs",
                 aChrom, aTCGAId, (elapsedSecs / (3600)), (elapsedSecs / 60),
                 elapsedSecs)

    # sanity check on the counts
    if (overlappingEvents + nonOverlappingEvents == totalEvents):
        summaryMessage = ("For chrom %s and Id %s: %s (overlapping events) + "
                          "%s (non-overlapping events) = %s")
    else:
        summaryMessage = ("filterByPybed Warning: For chrom %s and Id %s: %s "
                          "(overlapping events) + %s (non-overlapping "
                          "events) = %s")
    logging.info(summaryMessage, aChrom, aTCGAId, overlappingEvents,
                 nonOverlappingEvents, totalEvents)

    # close the files
    i_vcfFileHandler.close()
    if (writeToFile):
        i_outputFileHandler.close()
    return
def get_simulation_data(anInputFilename, aStatsDict, aCompareDict,
                        aPrefix, anIsDebug):
    '''
    ' The simulation files have 11 fields: mutation type, chromosome, start,
    ' end, target AF, mutation position, base change, coverage in, coverage
    ' out, actual AF, highest AF of anything linked.  This function indexes
    ' every data line by "<chromosome>_<mutation position>".  Blank lines
    ' and lines starting with "#" are skipped.  When a coordinate occurs
    ' more than once, both lines are logged at debug level and the last
    ' one wins.
    '
    ' anInputFilename: The name of the simulation file to read
    ' aStatsDict: A dictionary of counters that is updated in place
    ' aCompareDict: The key,value pairs that should be used in the
    '    comparison; only the values are used, and only when aPrefix is "cmp"
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages (unused here)
    '
    ' Returns the tuple (coordinate -> line dictionary, aStatsDict).
    '''
    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    # make sure the file gets closed even when a line can't be parsed
    try:
        for line in inputFileHandler:

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # skip blank lines.  after stripping, a blank line is the empty
            # string and "".isspace() is False, so test with strip() instead.
            # skip the header lines as well.
            if (not line.strip() or line.startswith("#")):
                continue

            # now we are to the data
            # split the line on the tab
            splitLine = line.split("\t")
            chrom = splitLine[1]
            mutPosition = splitLine[5]
            coordinate = chrom + "_" + mutPosition

            # report coordinates that occur more than once
            if coordinate in outputDict:
                logging.debug(line + outputDict[coordinate])

            # add the coordinate to the output
            outputDict[coordinate] = line

            # keep track of the number of total events per file
            aStatsDict[aPrefix + "_events"] += 1
            # all events are considered passing events
            aStatsDict[aPrefix + "_pass_events"] += 1

            # the comparison keys are only counted for the compare files
            if (aPrefix != "cmp"):
                continue

            # keep track of the total number of comparison events
            # (blck, dnSnp, etc.) per file.  there can be multiple keys for
            # one filter such as blq and bldp for blacklists.
            for cmpKeyString in aCompareDict.values():
                # count the filter only once, even if several keys match
                if any(cmpKey in line for cmpKey in cmpKeyString.split(",")):
                    aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                    aStatsDict[aPrefix + "_pass_" + cmpKeyString] += 1
    finally:
        inputFileHandler.close()

    return (outputDict, aStatsDict)
def get_vcf_data(anInputFilename, aStatsDict, aCompareDict, aPrefix, anIsDebug):
    '''
    ' This function reads a .vcf file and indexes the passing SNP calls by
    ' "<chrom>_<coordinate>" using the first two columns.  A line is only
    ' added to the output when aPrefix is "rad" or "cmp" and the line
    ' contains "PASS", "SNP", and at least one of "SOM", "EDIT",
    ' "RNA_TUM_VAR" or "RNA_NOR_VAR".  Blank lines and lines starting with
    ' "#" are skipped.
    '
    ' The per-filter counters in aStatsDict are updated for every data line;
    ' the overall "_events" and "_pass_events" counters are not updated by
    ' this function.
    '
    ' anInputFilename: The name of the .vcf file to read
    ' aStatsDict: A dictionary of counters that is updated in place
    ' aCompareDict: The key,value pairs that should be used in the
    '    comparison: the keys are comma separated RADIA keys, the values are
    '    the corresponding comma separated compare keys
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages (unused here)
    '
    ' Returns the tuple (coordinate -> line dictionary, aStatsDict).
    '''
    # a call has to carry at least one of these tags to be output
    callTypes = ("SOM", "EDIT", "RNA_TUM_VAR", "RNA_NOR_VAR")

    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    # make sure the file gets closed even when a line can't be parsed
    try:
        for line in inputFileHandler:

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # skip blank lines.  after stripping, a blank line is the empty
            # string and "".isspace() is False, so test with strip() instead.
            # skip the header lines as well.
            if (not line.strip() or line.startswith("#")):
                continue

            # now we are to the data
            # split the line on the tab
            splitLine = line.split("\t")
            chrom = splitLine[0]
            stopCoordinate = splitLine[1]

            isPassingSnp = ("PASS" in line and "SNP" in line)

            # the thing that is being compared to has to have the
            # smaller/limited amount i.e. only the passing som events,
            # otherwise everything will be found
            if (aPrefix in ("rad", "cmp") and isPassingSnp and
                    any(callType in line for callType in callTypes)):
                # add the coordinate to the output
                outputDict[chrom + "_" + stopCoordinate] = line

            # keep track of the total number of comparison events
            # (blck, dnSnp, etc.) per file.  there can be multiple keys for
            # one filter such as blq and bldp for blacklists.
            for (radKeyString, cmpKeyString) in aCompareDict.items():
                if (aPrefix == "rad"):
                    keyString = radKeyString
                elif (aPrefix == "cmp"):
                    keyString = cmpKeyString
                else:
                    continue

                # count the filter only once, even if several keys match
                if any(key in line for key in keyString.split(",")):
                    aStatsDict[aPrefix + "_" + keyString] += 1
                    # if this is a passing line, call it using the keyString
                    if (isPassingSnp):
                        aStatsDict[aPrefix + "_pass_" + keyString] += 1
    finally:
        inputFileHandler.close()

    return (outputDict, aStatsDict)
def get_validation_data(anInputFilename, aStatsDict, aCompareDict,
                        aPrefix, anIsDebug):
    '''
    ' This function reads a tab-delimited validation file and indexes every
    ' data line by "<chrom>_<stop coordinate>", using the 1st column as the
    ' chromosome and the 3rd column as the stop coordinate.  Blank lines,
    ' lines starting with "#", and the column header line starting with
    ' "chrom" are skipped.
    '
    ' anInputFilename: The name of the validation file to read
    ' aStatsDict: A dictionary of counters that is updated in place
    ' aCompareDict: The key,value pairs that should be used in the
    '    comparison; only the values are used, and only when aPrefix is "cmp"
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages (unused here)
    '
    ' Returns the tuple (coordinate -> line dictionary, aStatsDict).
    '''
    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    # make sure the file gets closed even when a line can't be parsed
    try:
        for line in inputFileHandler:

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # skip blank lines.  after stripping, a blank line is the empty
            # string and "".isspace() is False, so test with strip() instead.
            if (not line.strip()):
                continue

            # skip the lines from previous scripts in the
            # pipeline and the column header line
            if (line.startswith("#") or line.startswith("chrom")):
                continue

            # now we are to the data
            # split the line on the tab
            # columnHeaders = ["chrom", "chr_start", "chr_stop", "ref",
            #                  "var", "source", "val_result"]
            splitLine = line.split("\t")
            chrom = splitLine[0]
            stopCoordinate = splitLine[2]

            # add the coordinate to the output
            outputDict[chrom + "_" + stopCoordinate] = line

            # keep track of the number of total events per file
            aStatsDict[aPrefix + "_events"] += 1
            # all events are considered passing events
            aStatsDict[aPrefix + "_pass_events"] += 1

            # the comparison keys are only counted for the compare files
            if (aPrefix != "cmp"):
                continue

            # keep track of the total number of comparison events
            # (blck, dnSnp, etc.) per file.  there can be multiple keys for
            # one filter such as blq and bldp for blacklists.
            for cmpKeyString in aCompareDict.values():
                # count the filter only once, even if several keys match
                if any(cmpKey in line for cmpKey in cmpKeyString.split(",")):
                    aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                    aStatsDict[aPrefix + "_pass_" + cmpKeyString] += 1
    finally:
        inputFileHandler.close()

    return (outputDict, aStatsDict)
def filter_events(aTCGAId, aChrom, aBedFilename, aVCFFilename, anOutputFilename, aFilterName, aFilterField, anIncludeOverlapInfo, anIncludeFilterName, anIdField, anIncludeId, anIncludeCount, aFilterHeaderLine, aBinSize, anIsDebug):
    '''
    ' This function walks through a .vcf file and, for every variant, asks a
    ' pybed index built from a .bed file whether the variant overlaps one of
    ' the .bed regions.  Depending on anIncludeOverlapInfo, either the
    ' overlapping variants or the non-overlapping variants get annotated
    ' (filter name and/or id added); all other variants are written out
    ' unchanged.  Every variant is output, nothing is dropped here.
    '
    ' Example: a .bed file with exon coordinates and anIncludeOverlapInfo
    ' set annotates the variants inside of the exons.  A .bed file with the
    ' accessible genome and anIncludeOverlapInfo not set annotates the
    ' variants outside of the accessible regions.
    '
    ' aTCGAId: The TCGA Id for this sample (only used in the log messages)
    ' aChrom: The chromosome being filtered (only used in the log messages)
    ' aBedFilename: A .bed file with at least 3 columns specifying the chrom,
    '     start, and stop coordinates and possibly a 4th column with an id
    ' aVCFFilename: A .vcf file with the variants to check
    ' anOutputFilename: The output file, or None to write to STDOUT
    ' aFilterName: The name of the filter
    ' aFilterField: The field where the filter name should be included
    '     (e.g. INFO or FILTER)
    ' anIncludeOverlapInfo: When true the overlapping variants are
    '     annotated, otherwise the non-overlapping variants are annotated
    ' anIncludeFilterName: A flag specifying whether the filter name should
    '     be added to the annotated variants
    ' anIdField: The field where the ID should be specified (e.g. ID or INFO)
    ' anIncludeId: A flag specifying whether the id should be included in
    '     the output or not
    ' anIncludeCount: A flag specifying whether the number of overlaps
    '     should be included in the output or not
    ' aFilterHeaderLine: A filter header line that is handed to the VCF
    '     generator together with the output handler
    ' aBinSize: The size of the interval between each bin
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''
    # build the lookup structure from the .bed file
    filterPybed = pybed(binsize=aBinSize)
    filterPybed.load_from_file(aBedFilename)

    # open the input, and the output when a filename was given
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(aVCFFilename)
    writeToFile = (anOutputFilename is not None)
    i_outputFileHandler = None
    if (writeToFile):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(anOutputFilename)

    vcfGenerator = get_vcf_data(i_vcfFileHandler, i_outputFileHandler, aFilterHeaderLine, anIsDebug)

    # counters for the summary that is logged at the end
    numOverlaps = 0
    numNonOverlaps = 0
    numEvents = 0
    startTime = time.time()

    for vcfRecord in vcfGenerator:
        (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate, vcf_id, vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_restLine, vcf_line) = vcfRecord
        numEvents += 1

        if (anIsDebug):
            logging.debug("VCF: %s", vcf_line)

        # ask the index whether this position hits one of the regions
        (isOverlap, idValue, count) = filterPybed.overlaps_with((vcf_chr, vcf_startCoordinate, vcf_stopCoordinate), anIncludeCount)

        # overlapping events are annotated when overlap info is wanted,
        # non-overlapping events are annotated when it is not wanted
        if (isOverlap):
            numOverlaps += 1
            annotateEvent = anIncludeOverlapInfo
        else:
            numNonOverlaps += 1
            annotateEvent = not anIncludeOverlapInfo

        if (annotateEvent):
            # alter the filter and id name if appropriate
            if (anIncludeFilterName):
                (vcf_filter, vcf_info) = add_filter(vcf_filter, vcf_info, aFilterName, aFilterField, anIncludeCount, count, anIncludeId, anIdField, idValue)
            if (anIncludeId and anIdField == "ID"):
                vcf_id = add_id(vcf_id, idValue)

            # rebuild the line from the (possibly altered) columns
            leadingColumns = [vcf_chr, str(vcf_stopCoordinate), vcf_id, vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info]
            outputLine = "\t".join(leadingColumns) + "\t" + "\t".join(vcf_restLine)
        else:
            # nothing to add, the original line is output as it is
            outputLine = vcf_line

        if (writeToFile):
            i_outputFileHandler.write(outputLine + "\n")
        else:
            print >> sys.stdout, outputLine

    elapsedSecs = time.time() - startTime
    logging.info("Chrom %s and Id %s: Total time=%s hrs, %s mins, %s secs", aChrom, aTCGAId, (elapsedSecs/(3600)), (elapsedSecs/60), elapsedSecs)

    # sanity check on the counters
    if (numOverlaps + numNonOverlaps != numEvents):
        logging.info("filterByPybed Warning: For chrom %s and Id %s: %s " + "(overlapping events) + %s (non-overlapping events) = %s", aChrom, aTCGAId, numOverlaps, numNonOverlaps, numEvents)
    else:
        logging.info("For chrom %s and Id %s: %s (overlapping events) + " + "%s (non-overlapping events) = %s", aChrom, aTCGAId, numOverlaps, numNonOverlaps, numEvents)

    # close the files
    i_vcfFileHandler.close()
    if (writeToFile):
        i_outputFileHandler.close()

    return
def main():
    '''
    ' Command line entry point.  Reads a VCF file plus an RNA gene blacklist
    ' and an RNA gene family blacklist, and adds the "rgene" and/or "rgfam"
    ' filter to every processed call whose EFF annotation (INFO column,
    ' presumably in the snpEff format - verify against the producer of the
    ' VCF) names a blacklisted gene or a blacklisted transcript biotype.
    '
    ' Header lines are copied to the output.  The two ##FILTER header lines
    ' for rgene and rgfam are written right before the first ##FILTER line
    ' of the input; if the input has no ##FILTER line, they are not added.
    ' Unless -c/--allVCFCalls is given, lines that do not contain the text
    ' "PASS" are copied to the output without being processed.
    '
    ' Exits with status 1 when the command line is incomplete or when
    ' radiaUtil.check_for_argv_errors() reports a problem with the files.
    ' Raises a ValueError when the log level is not a valid logging level.
    '''
    # create the usage statement
    usage = "usage: python %prog vcfFile rnaGeneFile rnaGeneFamilyFile [Opts]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        default=sys.stdout,
        dest="outputFilename",
        metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    # NOTE: without -g, logging.basicConfig() is called without a filename
    # and the messages go to the default logging stream
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel",
        default="WARNING",
        metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename",
        metavar="LOG_FILE",
        help="the name of the log file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls",
        action="store_false",
        default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
             "thus far are processed, include this argument if all of the " +
             "VCF calls should be processed")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 14, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the argv length check above also counts the options, so make sure
    # that all three positional arguments are really there
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_vcfFilename = str(i_cmdLineArgs[0])
    i_rnaGeneFilename = str(i_cmdLineArgs[1])
    i_rnaGeneFamilyFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly

    # the default for the output is the sys.stdout object itself.  it must
    # not be converted with str(), otherwise the default can no longer be
    # recognized and a file named after the repr of stdout gets created
    i_outputFilename = i_cmdLineOptions.outputFilename
    i_writeToStdout = (i_outputFilename is None or
                       i_outputFilename is sys.stdout)
    if (not i_writeToStdout):
        i_outputFilename = str(i_outputFilename)

    # try to get any optional parameters with no defaults
    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # format the message here, an exception does not interpolate
        # additional arguments the way the logging calls do
        raise ValueError("Invalid log level: '%s' must be one of the " %
                         i_logLevel +
                         "following: DEBUG, INFO, WARNING, ERROR, CRITICAL")

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("rnaGeneFilename=%s", i_rnaGeneFilename)
        logging.debug("rnaGeneFamilyFilename=%s", i_rnaGeneFamilyFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("passedOnly?=%s", i_passedVCFCallsOnlyFlag)

    # check for any errors.  both the output file and the log file have to
    # be checked, so the log file is added to the list and does not
    # replace the output file
    i_writeFilenameList = []
    if (not i_writeToStdout):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [i_vcfFilename,
                          i_rnaGeneFilename,
                          i_rnaGeneFamilyFilename]

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # get the RNA gene blacklists
    (i_rnaGeneList, i_rnaGeneFamilyList) = get_rna_genes(
        i_rnaGeneFilename, i_rnaGeneFamilyFilename, i_debug)
    # the family names are compared for equality, so a set is sufficient
    i_rnaGeneFamilySet = frozenset(i_rnaGeneFamilyList)

    # these do not change from line to line, so create them only once
    effectRegEx = re.compile("(\\w).*\\({1}")
    ignoreEffectsList = ["UPSTREAM", "DOWNSTREAM"]

    # open the input stream
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(i_vcfFilename)

    # open the output stream
    if (i_writeToStdout):
        i_outputFileHandler = sys.stdout
    else:
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    hasAddedFilterHeader = False

    try:
        for line in i_vcfFileHandler:

            if (i_debug):
                logging.debug("vcfLine: %s", line)

            # if it is an empty line, then just continue
            if (line.isspace()):
                continue

            # if we find the FILTER section, then add the filters from here
            elif ((not hasAddedFilterHeader) and
                  (line.startswith("##FILTER"))):
                hasAddedFilterHeader = True
                i_outputFileHandler.write(
                    "##FILTER=<ID=rgene,Description=\"This gene is on the " +
                    "RNA gene blacklist\">\n")
                i_outputFileHandler.write(
                    "##FILTER=<ID=rgfam,Description=\"This gene family is on " +
                    "the RNA gene family blacklist\">\n")
                i_outputFileHandler.write(line)

            # these lines are from previous scripts in the pipeline,
            # so output them
            elif (line.startswith("#")):
                i_outputFileHandler.write(line)

            # if we are only suppose to process the passed calls
            # and this call has not passed, then just output it.
            # the whole line is searched for "PASS", not only the
            # FILTER column
            elif (i_passedVCFCallsOnlyFlag and "PASS" not in line):
                i_outputFileHandler.write(line)

            # now we are to the data
            else:
                # strip the carriage return and newline characters
                line = line.rstrip("\r\n")

                # split the line on the tab
                splitLine = line.split("\t")

                filterSet = set(splitLine[6].split(";"))

                # if there are no filters so far, then clear the list
                if (len(filterSet) == 1 and "PASS" in filterSet):
                    filterSet = set()

                # parse the info column and create a dict
                infoDict = collections.defaultdict(list)
                for info in splitLine[7].split(";"):
                    keyValueList = info.split("=")
                    # some keys are just singular without a value
                    # (e.g. DB, etc.)
                    if (len(keyValueList) == 1):
                        infoDict[keyValueList[0]] = ["True"]
                    else:
                        # the value can be a comma separated list
                        infoDict[keyValueList[0]] = keyValueList[1].split(",")

                isRnaBlacklistGene = False
                isRnaBlacklistGeneFamily = False

                # a missing EFF key gives an empty list
                for rawEffect in infoDict["EFF"]:

                    rawEffect = rawEffect.rstrip(")")

                    # the match is the effect name up to and including the
                    # last "(", what is left are the "|" separated parts
                    for match in effectRegEx.finditer(rawEffect):
                        effect = match.group()
                        rawEffect = rawEffect.replace(effect, "")
                        effect = effect.rstrip("(")

                        if (effect in ignoreEffectsList):
                            continue

                        effectParts = rawEffect.split("|")
                        # effectImpact = effectParts[0]
                        # functionalClass = effectParts[1]
                        # codonChange = effectParts[2]
                        # aaChange = effectParts[3]
                        # aaLength = effectParts[4]
                        geneName = effectParts[5]
                        transcriptBiotype = effectParts[6]
                        # geneCoding = effectParts[7]
                        # ensembleId = effectParts[8]
                        # exonNumber = effectParts[9]
                        # genotypeNumber = effectParts[10]

                        # the RNA gene list can have "RP11" and that
                        # should filter out any gene with RP11 in it
                        if (any(rnaGene in geneName
                                for rnaGene in i_rnaGeneList)):
                            isRnaBlacklistGene = True

                        if (transcriptBiotype in i_rnaGeneFamilySet):
                            isRnaBlacklistGeneFamily = True

                output = ["\t".join(splitLine[0:6])]

                # if the filter should be applied
                if (isRnaBlacklistGene):
                    filterSet.add("rgene")

                # if the filter should be applied
                if (isRnaBlacklistGeneFamily):
                    filterSet.add("rgfam")

                # if there are no filters so far, then this call passes
                if (len(filterSet) == 0):
                    filterSet.add("PASS")

                output.append(";".join(filterSet))
                output.append("\t".join(splitLine[7:]))

                # the handler is either the output file or sys.stdout
                i_outputFileHandler.write("\t".join(output) + "\n")
    finally:
        # close the files, but leave sys.stdout open
        i_vcfFileHandler.close()
        if (not i_writeToStdout):
            i_outputFileHandler.close()

    return