Exemplo n.º 1
def get_rna_genes(anRnaGeneFile, anRnaGeneFamilyFile, anIsDebug):
    '''
    ' This function parses the RNA gene and RNA gene family blacklist files.
    ' Blank lines and lines starting with "#" are skipped.  Every other line
    ' is stripped of its trailing carriage return and newline characters and
    ' collected in file order.
    '
    ' anRnaGeneFile:  An RNA gene file
    ' anRnaGeneFamilyFile:  An RNA gene family file
    ' anIsDebug: A flag for outputting debug messages through logging
    '
    ' Returns the tuple (rnaGeneList, rnaGeneFamilyList).
    '''

    # open both files up front so that a missing file is reported
    # before any parsing work is done
    geneFileHandler = radiaUtil.get_read_fileHandler(anRnaGeneFile)
    try:
        geneFamilyFileHandler = radiaUtil.get_read_fileHandler(
            anRnaGeneFamilyFile)
        try:
            rnaGeneList = _read_blacklist_lines(geneFileHandler, anIsDebug)
            rnaGeneFamilyList = _read_blacklist_lines(geneFamilyFileHandler,
                                                      anIsDebug)
        finally:
            geneFamilyFileHandler.close()
    finally:
        # always release the handle, even when parsing fails
        geneFileHandler.close()

    return rnaGeneList, rnaGeneFamilyList


def _read_blacklist_lines(aFileHandler, anIsDebug):
    '''
    ' Collect the non-comment, non-blank lines of an open blacklist file.
    '
    ' aFileHandler: An open, iterable file handle
    ' anIsDebug: A flag for outputting debug messages through logging
    '
    ' Returns a list with one stripped line per entry.
    '''
    blacklist = list()

    for line in aFileHandler:

        # we can ignore the lines that start with # for now
        if (line.startswith("#") or line.isspace()):
            continue

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (anIsDebug):
            logging.debug("RNA Blacklist: %s", line)

        blacklist.append(line)

    return blacklist
Exemplo n.º 3
def parse_blat_output(aBlatFile, anOutputFormat, anIsDebug):
    '''
    ' This function parses the output from BLAT.  Two formats are supported:
    ' BLAST NCBI-8 and PSL.  It ignores empty lines and strips the trailing
    ' carriage return and newline characters.  All hits are grouped in a
    ' nested dictionary:
    '
    '     blatHitsDict[coordinateId][prefix][readId] -> list of hit lines
    '
    ' The three keys are derived from the "_"-separated query name, e.g.
    ' rnaTumor_7_55196749_HS2144:2:1108:17342:164248
    '     prefix       = field 0             (rnaTumor)
    '     coordinateId = fields 1-2 joined   (7_55196749)
    '     readId       = fields 0-3 joined
    '
    ' aBlatFile:  An output file from BLAT
    ' anOutputFormat:  "BLAST" or "PSL"
    ' anIsDebug: A flag for outputting debug messages through logging
    '
    ' Returns the nested dictionary described above.
    ' Raises ValueError when anOutputFormat is not "BLAST" or "PSL".
    '
    ' NOTE(review): header lines are not skipped, so PSL input presumably
    ' has to be produced without a header - confirm against the caller.
    '''

    # the query name lives in a different column for each format;
    # reject unknown formats here instead of failing later on an
    # unbound variable
    if (anOutputFormat == "PSL"):
        blatIdIndex = 9
    elif (anOutputFormat == "BLAST"):
        blatIdIndex = 0
    else:
        raise ValueError("Unsupported BLAT output format: '%s', " +
                         "must be BLAST or PSL" % anOutputFormat
                         if False else
                         "Unsupported BLAT output format: '%s', "
                         "must be BLAST or PSL" % anOutputFormat)

    # open the file
    fileHandler = radiaUtil.get_read_fileHandler(aBlatFile)
    blatHitsDict = collections.defaultdict(dict)

    try:
        for line in fileHandler:

            # we can ignore the empty lines
            if (line.isspace()):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (anIsDebug):
                logging.debug("BLAT: %s", line)

            # split the line on the tab and get the query name
            # = rnaTumor_7_55196749_HS2144:2:1108:17342:164248
            blatId = line.split("\t")[blatIdIndex]

            blatSplitId = blatId.split("_")
            prefix = blatSplitId[0]
            coordinateId = "_".join(blatSplitId[1:3])
            readId = "_".join(blatSplitId[0:4])

            if coordinateId not in blatHitsDict:
                blatHitsDict[coordinateId] = collections.defaultdict(dict)
            if prefix not in blatHitsDict[coordinateId]:
                blatHitsDict[coordinateId][prefix] = (
                    collections.defaultdict(list))

            # record the hit for this read
            blatHitsDict[coordinateId][prefix][readId].append(line)
    finally:
        # always release the handle, even when parsing fails
        fileHandler.close()

    return blatHitsDict
Exemplo n.º 4
def get_vcf_data(aVCFFile, anIsDebug):
    '''
    ' This function reads a VCF file and sorts its lines into header
    ' sections and data.  Empty lines are ignored and the trailing carriage
    ' return and newline characters are stripped.
    '
    ' aVCFFile:  A VCF file
    ' anIsDebug: A flag for outputting debug messages through logging
    '
    ' Returns the tuple
    ' (headerList, chromLine, infoList, filterList, coordinateDict):
    '     headerList:     all "#" lines that are not FILTER, INFO or #CHROM
    '     chromLine:      the "#CHROM" line or None when there is none
    '     infoList:       the "##INFO" lines
    '     filterList:     the "##FILTER" lines
    '     coordinateDict: the data lines keyed by the second column; a later
    '                     line with the same coordinate replaces the earlier
    '
    ' Header entries are stored stripped, like chromLine; data lines are
    ' stored with a trailing newline.
    ' NOTE(review): confirm that the callers expect header entries without
    ' a trailing newline.
    '''

    headerList = list()
    chromLine = None
    infoList = list()
    filterList = list()
    coordinateDict = dict()

    vcfFileHandler = radiaUtil.get_read_fileHandler(aVCFFile)

    try:
        for line in vcfFileHandler:

            # if it is an empty line, then just continue
            if (line.isspace()):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (anIsDebug):
                logging.debug("vcfLine: %s", line)

            # if we find the FILTER section, then record the filters
            if (line.startswith("##FILTER")):
                filterList.append(line)

            # if we find the INFO section, then record the info
            elif (line.startswith("##INFO")):
                infoList.append(line)

            # if we find the column header line
            elif (line.startswith("#CHROM")):
                chromLine = line

            # any other header line
            elif (line.startswith("#")):
                headerList.append(line)

            # now we are to the data
            else:
                # split the line on the tab
                splitLine = line.split("\t")

                # the coordinate is the second element
                stopCoordinate = splitLine[1]
                coordinateDict[stopCoordinate] = line + "\n"
    finally:
        # always release the handle, even when parsing fails
        vcfFileHandler.close()

    return (headerList, chromLine, infoList, filterList, coordinateDict)
Exemplo n.º 7
def get_vcf_data(aVcfFile, aPassOnlyFlag, anIsDebug):
    '''
    ' This function reads from a VCF input file and uses the python generator
    ' to yield the data one line at a time.  It ignores empty lines and
    ' header lines (anything starting with "#") and strips the trailing
    ' carriage return and newline characters.
    '
    ' aVcfFile:         A VCF file
    ' aPassOnlyFlag:    If True, only lines containing the text "PASS" are
    '                   yielded.  The test is made against the whole line,
    '                   not just the FILTER column.
    ' anIsDebug:        A flag for outputting debug messages; currently
    '                   unused by this function
    '
    ' Yields each selected data line as a string without a line ending.
    '''

    # open the VCF file
    fileHandler = radiaUtil.get_read_fileHandler(aVcfFile)

    try:
        for line in fileHandler:

            # if it is an empty line, then just continue
            # if it is a header line, then just continue
            if (line.isspace() or line.startswith("#")):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # if we are only supposed to process the passed calls
            # and this call has not passed, then skip it
            if (aPassOnlyFlag and "PASS" not in line):
                continue

            yield line
    finally:
        # runs when the file is exhausted and also when the consumer
        # stops early and the generator is closed
        fileHandler.close()

Exemplo n.º 9
def merge_vcf_data(aDnaFile, anRnaFile, anOverlapsFile,
                   aNonOverlapsFile, anIsDebug):

    # Merges the calls of four VCF-like inputs into one dictionary that is
    # keyed by the coordinate column (the second tab-separated field).
    #
    # aDnaFile:          calls from the DNA mpileup filter; also the only
    #                    source of the header lines
    # anRnaFile:         calls from the RNA mpileup filter
    # anOverlapsFile:    calls that pass in both the DNA and the RNA
    # aNonOverlapsFile:  RNA Rescue and RNA Editing calls; only read when
    #                    the path is an existing file
    # anIsDebug:         a flag for outputting debug messages via logging
    #
    # Returns the tuple (headerList, coordinateDict).
    #
    # NOTE(review): the dictionary key does not include the chromosome, so
    # each input presumably holds a single chromosome - confirm.
    #
    # NOTE(review): many statements in this function are visibly incomplete
    # (conditions without a body, calls that stop in the middle of their
    # argument list, branches whose "else:" line is gone).  Each spot is
    # flagged inline.  The function does not parse as written and has to be
    # restored from the upstream source before it can be used.

    # open the input files
    dnaFileHandler = radiaUtil.get_read_fileHandler(aDnaFile)
    rnaFileHandler = radiaUtil.get_read_fileHandler(anRnaFile)
    overlapsFileHandler = radiaUtil.get_read_fileHandler(anOverlapsFile)
    # NOTE(review): the call below is cut off after the opening parenthesis;
    # the argument (presumably aNonOverlapsFile) is missing
    if (os.path.isfile(aNonOverlapsFile)):
        nonOverlapsFileHandler = radiaUtil.get_read_fileHandler(

    headerList = list()
    coordinateDict = dict()

    # the dna file has the results from the dna mpileup filter
    # the rna file has the results from the rna mpileup filter
    # the overlaps file has calls that pass in both the DNA and RNA
    # the non-overlaps file originally has calls that don't pass in the DNA but
    # pass in the RNA, these RNA calls are further filtered to eliminate
    # possible germline (dnm*/DB/GERM) calls or false positives due to
    # pseudogenes (EGPS/RTPS) and then the RNA reads are optionally run through
    # the blat filter to check for mapping uniqueness - these are the
    # RNA Rescue and RNA Editing calls

    # process all of the calls from the DNA mpileup filter
    for line in dnaFileHandler:

        # if it is an empty line, then just continue
        # NOTE(review): the body of this "if" (presumably "continue")
        # is missing
        if (line.isspace()):

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (anIsDebug and not line.startswith("#")):
            logging.debug("DNA mpileup Line: %s", line)

        # if it is a header line, then add it to the header list
        if (line.startswith("#")):

            # keep all the header lines
            headerList.append(line + "\n")

        # now we are to the data
        # NOTE(review): an "else:" appears to be missing here; as written
        # the data handling below sits inside the header branch
            # split the line on the tab
            splitLine = line.split("\t")

            # the coordinate is the second element
            stopCoordinate = splitLine[1]
            coordinateDict[stopCoordinate] = line + "\n"

    # these are all the calls that pass in both the DNA and RNA
    for line in overlapsFileHandler:

        # if it is an empty line, then continue
        # if it is a header line, then continue
        # NOTE(review): the body of this "if" (presumably "continue")
        # is missing
        if (line.isspace() or line.startswith("#")):

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (anIsDebug and not line.startswith("#")):
            logging.debug("Overlaps file Line: %s", line)

        # now we are to the data
        # split the line on the tab
        splitLine = line.split("\t")

        # the coordinate is the second element
        stopCoordinate = splitLine[1]

        # if the call passed in both the RNA and DNA,
        # then adjust the origin
        if (stopCoordinate in coordinateDict):
            dnaLine = coordinateDict[stopCoordinate]
            if (anIsDebug):
                logging.debug("passed in both RNA and DNA (from overlaps " +
                              "file) changing the origin to (DNA,RNA) " +
                              "\nDNALine: %s RNALine: %s\n",
                              dnaLine, line)
            dnaLine = dnaLine.replace("ORIGIN=DNA", "ORIGIN=DNA,RNA")
            coordinateDict[stopCoordinate] = dnaLine
            # NOTE(review): the next assignment immediately overwrites the
            # adjusted DNA line stored above; it presumably belongs to a
            # missing "else:" branch for calls that are not in the DNA
            coordinateDict[stopCoordinate] = line + "\n"

    # loop through the RNA mpileup filtered calls
    # create 2 dictionaries:  one for passing, one for non-passing
    # if an RNA Rescue or RNA Editing call passes in the anRnaNonOverlapsFile
    # below, then we want to use the original RNA mpileup passing call to
    # overwrite the DNA call. the non-overlaps file is really the RNA mpileup
    # passing calls that are first filtered by DNA, then grep, and then blat.
    # the filtered by DNA part doesn't select one modType when no call passes,
    # so the final passing call has more than one modType which causes problems
    # in the next filter, therefore use the RNA mpileup passing call.
    # mpileup_rna_origin:
    # 9       17464495        .       G       A       0.0     PASS
    #    AC=5;AF=0.1;AN=2;BQ=39;DP=49;FA=0.1;INS=0;DEL=0;;MC=G>A;
    #    0/0:22:22,0:1.0,0.0:0:0:0:1:32,0:0.68,0.0
    #    0/0:10:9,1:0.9,0.1:0:0:0:0:29,13:0.67,1.0
    #    0/1:17:13,4:0.76,0.24:0:0:0:4:60,32:0.92,0.5
    # vs.
    # mpileup_rna_origin->dnaFiltered->blat:
    # 9       17464495        .       G       A       0.0     PASS
    #    AC=5;AF=0.1;AN=2;BQ=39;DP=49;FA=0.1;INS=0;DEL=0;;MC=G>A,G>A;
    #    MF=rnacall,dtmnab_dtmnbq;MFT=DNA_TUM_EDIT_G>A,DNA_SOM_G>A;
    #    0/0:22:22,0:1.0,0.0:0:0:0:1:32,0:0.68,0.0
    #    0/0:10:9,1:0.9,0.1:0:0:0:0:29,13:0.67,1.0
    #    0/1:17:13,4:0.76,0.24:0:0:0:4:60,32:0.92,0.5
    # when merging a call that passes in the non-overlaps (dnaFiltered or blat)
    # file, replace the DNA call, with the original RNA mpileup passing call
    # the non-passing dictionary will be used below to help merge filtered
    # calls when a call gets filtered by both the RNA and DNA

    rnaMpileupPassingDict = {}
    rnaMpileupNonpassingDict = {}
    for rnaLine in rnaFileHandler:

        # if it is an empty line, then just continue
        # if it is a header line, then just continue
        # NOTE(review): the body of this "if" (presumably "continue")
        # is missing
        if (rnaLine.isspace() or rnaLine.startswith("#")):

        # strip the carriage return and newline characters
        rnaLine = rnaLine.rstrip("\r\n")

        if (anIsDebug and not rnaLine.startswith("#")):
            logging.debug("RNA mpileup Line: %s", rnaLine)

        # now we are to the data
        # split the line on the tab
        rnaLineSplit = rnaLine.split("\t")

        # the coordinate is the second element
        stopCoordinate = rnaLineSplit[1]

        # put the call in the right dict
        # NOTE(review): as written a passing call lands in BOTH dicts; an
        # "else:" before the non-passing assignment appears to be missing
        if "PASS" in rnaLineSplit[6]:
            rnaMpileupPassingDict[stopCoordinate] = rnaLine
            rnaMpileupNonpassingDict[stopCoordinate] = rnaLine

    # these are the RNA Rescue and RNA Editing calls after
    # the initial filtering but before filterByReadSupport.py
    if (os.path.isfile(aNonOverlapsFile)):
        for line in nonOverlapsFileHandler:

            # if it is an empty line, then just continue
            # if it is a header line, then just continue
            # NOTE(review): the body of this "if" (presumably "continue")
            # is missing
            if (line.isspace() or line.startswith("#")):

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (anIsDebug and not line.startswith("#")):
                logging.debug("Non-overlaps Line: %s", line)

            # now we are to the data
            # split the line on the tab
            splitLine = line.split("\t")

            # the coordinate is the second element
            stopCoordinate = splitLine[1]

            # if this call passed in the RNA, then overwrite
            # the DNA call that didn't pass
            if ("PASS" in splitLine[6]):
                # if this call existed in the DNA
                if (stopCoordinate in coordinateDict):
                    dnaLine = coordinateDict[stopCoordinate]
                    # get the RNA line from the RNA mpileups passing dict
                    # (a plain lookup: raises KeyError when the coordinate
                    # has no passing RNA mpileup call)
                    rnaLine = rnaMpileupPassingDict[stopCoordinate]
                    # if it didn't pass in the DNA
                    if ("PASS" not in dnaLine):
                        if (anIsDebug):
                            logging.debug("Overwriting non-passing DNA call " +
                                          "with passing RNA Rescue calls " +
                                          "\nDNALine: %s" +
                                          "RNALineNonOverlaps: %s" +
                                          "\nRNALineMpileup: %s\n",
                                          dnaLine, line, rnaLine)
                        coordinateDict[stopCoordinate] = rnaLine + "\n"
                        # NOTE(review): an "else:" (the call passed in the
                        # DNA as well) appears to be missing before the
                        # statements below
                        if (anIsDebug):
                            # this call passed in both
                            logging.debug("Unusual call in non-overlaps " +
                                          "file passed in both the RNA and " +
                                          "DNA but they probably don't have " +
                                          "the same modType! \nDNALine: %s " +
                                          "RNALineNonOverlaps: %s" +
                                          "\nRNALineMpileup: %s\n",
                                          dnaLine, line, rnaLine)
                        # at this point, there are multiple events that pass
                        # all the filters. in this case, pick the passing
                        # event in the following order:
                        # GERM, NOR_EDIT, SOM, TUM_EDIT, RNA_TUM_VAR, LOH
                        # NOTE(review): the second assignment overwrites the
                        # first; an "else:" appears to be missing.  rnaLine
                        # was stripped above, so it is stored here without
                        # a trailing newline, unlike the other entries
                        if ("GERM" in dnaLine or "SOM" in dnaLine):
                            coordinateDict[stopCoordinate] = dnaLine
                            coordinateDict[stopCoordinate] = rnaLine
                # this call didn't exist in the DNA
                # NOTE(review): an "else:" appears to be missing here
                    logging.warning("Call didn't exist in DNA? " +
                                    "RNALine: %s\n", line)
                    coordinateDict[stopCoordinate] = line + "\n"
            # this call didn't pass in the RNA
            # NOTE(review): an "else:" appears to be missing here
                if (anIsDebug):
                    logging.debug("Call didn't pass in RNA: " +
                                  "RNALine: %s\n", line)

                # if this call existed in the DNA
                if (stopCoordinate in coordinateDict):
                    dnaLine = coordinateDict[stopCoordinate]
                    # if it didn't pass in the DNA
                    if ("PASS" not in dnaLine):
                        if (anIsDebug):
                            logging.debug("RNANoPass:  Didn't pass in both, " +
                                          "so change origin and merge " +
                                          "filters \nDNALine: %s " +
                                          "RNALine: %s\n", dnaLine, line)
                        # change origin
                        # NOTE(review): the replace() call below is cut off
                        # after its first argument
                        if ("ORIGIN=DNA,RNA" not in dnaLine):
                            dnaLine = dnaLine.replace("ORIGIN=DNA",
                        dnaLine = dnaLine.rstrip("\r\n")
                        dnaLineSplit = dnaLine.split("\t")

                        # merge the filters for the FILTER column
                        # NOTE(review): the call is cut off after its first
                        # argument; merge_filters is defined outside this
                        # block
                        dnaLineSplit[6] = merge_filters(splitLine[6],

                        # merge the mod filters and filter types
                        # in the INFO column
                        # NOTE(review): the call is cut off after the opening
                        # parenthesis; merge_mod_filters is defined outside
                        # this block
                        dnaLineSplit[7] = merge_mod_filters(

                        newDnaLine = "\t".join(dnaLineSplit) + "\n"
                        coordinateDict[stopCoordinate] = newDnaLine

                        # NOTE(review): the debug call below is cut off
                        # before its last argument and closing parenthesis
                        if (anIsDebug):
                            logging.debug("RNANoPass:  After change origin " +
                                          "and merge filters " +
                                          "\nFinalLine: %s\n",
                        # NOTE(review): an "else:" (the DNA call passed)
                        # appears to be missing before the example below
                        # this call passed in both:
                        # DNALine: 17 4857042 .   T   A,G,C   0.0 PASS
                        #    AB=A,G,C;AC=10,5,8211;AF=0.0,0.0,0.98;AN=4;
                        #    BQ=31;DP=8379;FA=0.98;INS=0;DEL=0;;MC=T>A;
                        #    MT=GERM;NS=3;ORG_ISO_AD=16_2_1_2615,
                        #    18_3_1_2791,18_1_2_2805;ORIGIN=DNA;
                        #    RS_GEN_POS=17:4854383-4860426,
                        #    17:4854383-4860426,17:4854383-4860426;
                        #    RS_NAME=NM_001193503,NM_001976,NM_053013;
                        #    RS_ORG_POS=313,484,442;RS_STRAND=+,+,+;
                        #    SB=0.74;SS=1;START=1;STOP=0;VT=SNP
                        #    GT:DP:AD:AF:INS:DEL:START:STOP:BQ:SB
                        #    0/1:36:31,4,1,0:0.86,0.11,0.03,0.0:0:0:1:0:
                        #        29,28,3,0:0.39,0.5,1.0,0.0
                        #    0/0:70:70,0,0,0:1.0,0.0,0.0,0.0:0:0:0:0:
                        #        31,0,0,0:0.56,0.0,0.0,0.0
                        #    3/3:8273:52,6,4,8211:0.01,0.0,0.0,0.99:0:0:0:0:
                        #        45,12,12,58:0.94,1.0,1.0,0.97
                        # RNALine: 17    4857042 .   T   A,G,C   0.0 PASS
                        #    AB=A,G,C;AC=10,5,8211;AF=0.0,0.0,0.98;AN=4;
                        #    BQ=31;DP=8379;FA=0.98;INS=0;DEL=0;;MC=T>C;
                        #    MT=TUM_EDIT;NS=3;ORG_ISO_AD=16_2_1_2615,
                        #    18_3_1_2791,18_1_2_2805;ORIGIN=RNA;
                        #    RS_GEN_POS=17:4854383-4860426,
                        #    17:4854383-4860426,17:4854383-4860426;
                        #    RS_NAME=NM_001193503,NM_001976,NM_053013;
                        #    RS_ORG_POS=313,484,442;RS_STRAND=+,+,+;
                        #    SB=0.74;SS=5;START=1;STOP=0;VT=SNP
                        #    GT:DP:AD:AF:INS:DEL:START:STOP:BQ:SB
                        #    0/1:36:31,4,1,0:0.86,0.11,0.03,0.0:0:0:1:0:
                        #        29,28,3,0:0.39,0.5,1.0,0.0
                        #    0/0:70:70,0,0,0:1.0,0.0,0.0,0.0:0:0:0:0:
                        #        31,0,0,0:0.56,0.0,0.0,0.0
                        #    3/3:8273:52,6,4,8211:0.01,0.0,0.0,0.99:0:0:0:0:
                        #        45,12,12,58:0.94,1.0,1.0,0.97
                        logging.warning("RNANoPass:  Call passed in both " +
                                        "RNA and DNA but they probably " +
                                        "don't have the same modType " +
                                        "\nDNALine: %s RNALine: %s\n",
                                        dnaLine, line)
                        # at this point, there are multiple events that
                        # pass all the filters. in this case, pick the
                        # passing event in the following order:
                        # GERM, NOR_EDIT, SOM, TUM_EDIT, RNA_TUM_VAR, LOH
                        # NOTE(review): the second assignment overwrites the
                        # first; an "else:" appears to be missing.  "line"
                        # was stripped above, so it is stored here without
                        # a trailing newline
                        if ("GERM" in dnaLine or "SOM" in dnaLine):
                            coordinateDict[stopCoordinate] = dnaLine
                            coordinateDict[stopCoordinate] = line
                # this call didn't exist in the DNA
                # NOTE(review): an "else:" appears to be missing here
                    logging.warning("RNANoPass:  Call didn't exist in DNA? " +
                                    "RNALine: %s\n", line)
                    coordinateDict[stopCoordinate] = line + "\n"

    # these are needed for merging the RNA mpileup filters
    # (dict.iteritems() exists on Python 2 only)
    for (rnaStopCoordinate, rnaLine) in rnaMpileupNonpassingDict.iteritems():

        if (anIsDebug and not rnaLine.startswith("#")):
            logging.debug("RNA mpileup non-passing Line: %s", rnaLine)

        # if this call existed in the DNA and
        # the user wants the merged calls
        if (rnaStopCoordinate in coordinateDict):

            # split the line on the tab
            rnaLineSplit = rnaLine.split("\t")

            # get the original line
            dnaLine = coordinateDict[rnaStopCoordinate]
            dnaLine = dnaLine.rstrip("\r\n")
            dnaLineSplit = dnaLine.split("\t")

            # if the call didn't pass in the RNA or DNA,
            # we want to merge the filters
            if "PASS" not in dnaLineSplit[6]:
                if (anIsDebug):
                    logging.debug("Merging filters for \nDNALine: %s " +
                                  "\nRNALine: %s", dnaLine, rnaLine)

                # merge the filters for the FILTER column
                # NOTE(review): the call is cut off after its first argument
                dnaLineSplit[6] = merge_filters(rnaLineSplit[6],

                # merge the mod filters and filter types in the INFO column
                # NOTE(review): the call is cut off after its first argument
                dnaLineSplit[7] = merge_mod_filters(rnaLineSplit[7],

                finalLine = "\t".join(dnaLineSplit)
                # NOTE(review): the replace() call below is cut off after
                # its first argument
                if ("ORIGIN=DNA,RNA" not in finalLine):
                    finalLine = finalLine.replace("ORIGIN=DNA",

                coordinateDict[rnaStopCoordinate] = finalLine + "\n"
                if (anIsDebug):
                    logging.debug("Merged filters \nFinalLine: %s", finalLine)
        # this call didn't exist in the DNA
        # NOTE(review): an "else:" appears to be missing here
            coordinateDict[rnaStopCoordinate] = rnaLine + "\n"

    # NOTE(review): the body of this "if" is missing (presumably closing the
    # non-overlaps handle); none of the handles opened at the top is closed
    # in the visible code
    if (os.path.isfile(aNonOverlapsFile)):

    return (headerList, coordinateDict)
Exemplo n.º 10
def get_vcf_data(anId, anInputDir, anIsDebug):

    # Reads every file matching "<anInputDir>/<anId>_chr*.vcf*" and returns
    # the tuple (headerDict, coordinateDict).
    #
    # anId:        the prefix of the VCF file names
    # anInputDir:  the directory holding the per-chromosome VCF files
    # anIsDebug:   a flag for outputting debug messages via logging
    #
    # headerDict has the keys "metadata", "format", "info", "filter" and
    # "chrom", each holding a list.  coordinateDict has the keys "numbers"
    # and "letters", each holding a dict with one list per chromosome name.
    #
    # NOTE(review): the statements that fill these lists are missing from
    # the visible code, as are several branch bodies; each gap is flagged
    # inline.  The function does not parse as written.

    # for each file that starts with this id
        # load the first file to get the header
        # get the coordinates for all

    processedHeader = False
    headerDict = dict()
    headerDict["metadata"] = list()
    headerDict["format"] = list()
    headerDict["info"] = list()
    headerDict["filter"] = list()
    headerDict["chrom"] = list()
    coordinateDict = dict()
    coordinateDict["numbers"] = dict()
    coordinateDict["letters"] = dict()

    # if the input directory doesn't end with a forward slash,
    # then add one so that glob.glob will work
    if (not anInputDir.endswith("/")):
        anInputDir = anInputDir + "/"

    # for each vcf file
    # they might be gzipped, they might not
    for vcfFile in (glob.glob(anInputDir + anId + "_chr*.vcf*")):

        # open the file
        vcfFileHandler = radiaUtil.get_read_fileHandler(vcfFile)

        for line in vcfFileHandler:

            # if it is an empty line, then just continue
            # NOTE(review): the body of this "if" (presumably "continue")
            # is missing
            if (line.isspace()):

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (anIsDebug):
                logging.debug("vcfLine: %s", line)

            # if we haven't processed the header yet, then do it here
            # (processedHeader is never reset, so only the header of the
            # first file that is read gets processed)
            if (not processedHeader):
                # extract the metadata
                # NOTE(review): the bodies of the first four branches are
                # missing; presumably each one stored the line in the
                # matching headerDict list, and the "#CHROM" branch also
                # stored the line in headerDict["chrom"] - confirm
                if (line.startswith("##FORMAT")):
                elif (line.startswith("##INFO")):
                elif (line.startswith("##FILTER")):
                elif (line.startswith("##")):
                elif (line.startswith("#CHROM")):
                    # now we've processed the header
                    processedHeader = True

            # NOTE(review): as written the data handling below runs for
            # header lines; presumably this "if" skipped header lines and
            # the handling sat in a missing "else:" branch - confirm
            if (line.startswith("#")):
                # split the line on the tab
                splitLine = line.split("\t")

                # the chromosome is the first element
                chrom = splitLine[0]

                # we want to sort everything at the end, so keep track
                # of the chroms that are numbers and letters separately
                # (is_number is defined outside this block)
                # NOTE(review): the "else:" before the "letters" handling
                # and the statements that store the line in the per-chrom
                # lists appear to be missing
                if (is_number(chrom)):
                    if chrom not in coordinateDict["numbers"]:
                        coordinateDict["numbers"][chrom] = list()
                    if chrom not in coordinateDict["letters"]:
                        coordinateDict["letters"][chrom] = list()

        # close the file and move onto the next one
        # NOTE(review): the close call itself is missing

    return (headerDict, coordinateDict)
Exemplo n.º 11
def get_validation_data(anInputFilename, aStatsDict, aCompareDict,
                        aPrefix, anIsDebug):
    # Reads a tab-separated validation file, stores every data line in a
    # dictionary keyed by "<chrom>_<stop coordinate>" (columns 0 and 2) and
    # updates the event counters in aStatsDict in place.
    # Returns the tuple (outputDict, aStatsDict).
    #
    # NOTE(review): the quote-prefixed lines below look like the body of a
    # docstring whose opening and closing triple quotes are missing; as
    # written they are unterminated string literals and do not parse.
    ' The validation files must have at least 10 fields:  chrom, coordinate, id
    ' references, alts, quality score, filters, infos, format, and summary info
    ' for at least one .bam file.
    ' anInputFileHandler: The input stream for the file
    ' aStatsDict: A dictionary holding stats about all the comparisons
    ' aCompareDict: The key,value pairs that should be used in the comparison
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files
    ' anIsDebug: A flag for outputting debug messages to STDERR

    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    for line in inputFileHandler:

        # if it is an empty line or header line, then just continue
        # NOTE(review): this condition is cut off after the second "or";
        # the last operand, the closing "):" and the body (presumably
        # "continue") are missing
        if (line.isspace() or
            line.startswith("#") or

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        # if (anIsDebug):
        #    logging.debug("Validation Line: %s", line)

        # now we are to the data
        # split the line on the tab
        splitLine = line.split("\t")

        # get the fields to yield
        # columnHeaders = ["chrom", "chr_start", "chr_stop",
        #                  "ref", "var", "source", "val_result"]
        # these are 0-based
        chrom = splitLine[0]
        # startCoordinate = splitLine[1]
        stopCoordinate = splitLine[2]
        # ref = splitLine[3]
        # variantAllele = splitLine[4]
        # center = splitLine[5]
        # valResult = splitLine[6]

        # add the coordinate to the output
        outputDict[chrom + "_" + stopCoordinate] = line

        # keep track of the number of total events per file
        # (a plain "+=": aStatsDict presumably supplies a default of 0 for
        # missing keys, e.g. a defaultdict(int) - confirm against caller)
        aStatsDict[aPrefix + "_events"] += 1

        # all events are considered passing events
        aStatsDict[aPrefix + "_pass_events"] += 1

        # keep track of the total number of comparison events (blck, dnSnp,
        # etc.) per file. there can be multiple keys for one filter such
        # as blq and bldp for blacklists
        # (dict.iteritems() exists on Python 2 only; radKeyString is not
        # used in this function and nothing is counted here unless
        # aPrefix is "cmp")
        for (radKeyString, cmpKeyString) in aCompareDict.iteritems():
            if (aPrefix == "cmp"):
                # break up the string to get the individual keys
                cmpKeyList = cmpKeyString.split(",")
                # search for each one of them
                for cmpKey in cmpKeyList:
                    # if we find one
                    if (cmpKey in line):
                        # count it using the keyString
                        aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                        # if this is a passing line,
                        # call it using the keyString
                        # (this repeats the enclosing test, so it is always
                        # true here - every event counts as passing)
                        if (cmpKey in line):
                            statKey = aPrefix + "_pass_" + cmpKeyString
                            aStatsDict[statKey] += 1
                        # only count it once
                        # NOTE(review): the "break" that this comment
                        # announces is missing; as written a line matching
                        # several keys is counted once per matching key


    # NOTE(review): inputFileHandler is not closed in the visible code
    return (outputDict, aStatsDict)
Exemplo n.º 12
def get_vcf_data(anInputFilename, aStatsDict, aCompareDict,
                 aPrefix, anIsDebug):
    '''
    ' This function reads a .vcf file, collects the passing SNP events that
    ' should take part in a comparison and counts the comparison events.
    '
    ' The .vcf files must have at least 10 fields:  chromosome, coordinate, id
    ' references, alts, quality score, filters, infos, format, and summary info
    ' for at least one .bam file.
    '
    ' anInputFilename: The name of the .vcf file
    ' aStatsDict: A dictionary holding stats about all the comparisons; it is
    '             updated in place with "+=", so it has to supply a value
    '             for every key that gets counted
    ' aCompareDict: The key,value pairs that should be used in the comparison;
    '               the keys are the comma-separated "rad" search strings,
    '               the values are the comma-separated "cmp" search strings
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files; with
    '          any other prefix nothing is collected or counted
    ' anIsDebug: A flag for outputting debug messages; currently unused by
    '            this function
    '
    ' All of the tests ("PASS", "SNP", the modification types and the search
    ' strings) are substring tests against the whole line.
    '
    ' Returns the tuple (outputDict, aStatsDict) where outputDict maps
    ' "<chrom>_<coordinate>" to the stripped line.
    '''

    # the modification types that are part of a comparison
    modTypes = ("SOM", "EDIT", "RNA_TUM_VAR", "RNA_NOR_VAR")

    # the "rad" search strings are the keys of the compare dict, the "cmp"
    # search strings are the values; they don't change from line to line,
    # so pick them once
    if (aPrefix == "rad"):
        keyStrings = list(aCompareDict.keys())
    elif (aPrefix == "cmp"):
        keyStrings = list(aCompareDict.values())
    else:
        keyStrings = list()
    isKnownPrefix = (aPrefix == "rad" or aPrefix == "cmp")

    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    try:
        for line in inputFileHandler:

            # if it is an empty line or header line, then just continue
            if (line.isspace() or line.startswith("#")):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # now we are to the data
            # split the line on the tab
            # columnHeaders = ["CHROM", "POS", "ID", "REF", "ALT",
            #                  "QUAL", "FILTER", "INFO", "FORMAT"]
            splitLine = line.split("\t")
            chrom = splitLine[0]
            stopCoordinate = splitLine[1]

            isPassingSnp = ("PASS" in line and "SNP" in line)

            # the thing that is being compared to has to have the
            # smaller/limited amount i.e. only the passing events,
            # otherwise everything will be found
            if (isKnownPrefix and isPassingSnp and
                    any(modType in line for modType in modTypes)):
                # add the coordinate to the output
                outputDict[chrom + "_" + stopCoordinate] = line

            # keep track of the total number of comparison events (blck,
            # dnSnp, etc.) per file. there can be multiple keys for one
            # filter such as blq and bldp for blacklists
            for keyString in keyStrings:
                # break up the string to get the individual keys
                # and search for each one of them
                for key in keyString.split(","):
                    # if we find one
                    if (key in line):
                        # count it using the keyString
                        aStatsDict[aPrefix + "_" + keyString] += 1
                        # if this is a passing line,
                        # count it using the keyString
                        if (isPassingSnp):
                            aStatsDict[aPrefix + "_pass_" + keyString] += 1
                        # only count it once
                        break
    finally:
        # always release the handle, even when parsing fails
        inputFileHandler.close()

    return (outputDict, aStatsDict)
Exemplo n.º 14
def main():
    # Command-line entry point of the RNA blacklist filter.  From what is
    # visible here it: reads three positional arguments (VCF file, RNA gene
    # file, RNA gene family file) plus optional output/log settings, loads
    # the two blacklists with get_rna_genes(), streams the VCF, emits two
    # extra ##FILTER header lines (rgene, rgfam), and for every processed
    # call inspects the EFF entries of the INFO column to decide whether the
    # gene name or the transcript biotype is blacklisted.
    #
    # NOTE(review): this copy of the function is truncated - a number of
    # statements (option definitions, else-branches, if-bodies, the logging
    # configuration, the filter application) are missing, so it does not
    # parse as-is.  Each gap is flagged below; restore from the upstream
    # source before relying on it.
    # NOTE(review): the "print >> sys.stdout" statements are Python 2 syntax.

    # create the usage statement
    usage = "usage: python %prog vcfFile rnaGeneFile rnaGeneFamilyFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # NOTE(review): the add_option(...) calls are truncated; only trailing
    # help-text fragments survive.  The options read further down are
    # outputFilename, logLevel, logFilename and passedVCFCallsOnly.
        help="the name of the output file, STDOUT by default")
        "the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), %default by default"
        help="the name of the log file, STDOUT by default")
        "by default only the VCF calls that have passed all filters thus far are processed, include this argument if all of the VCF calls should be processed"

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 14, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    # NOTE(review): the body of this "if" is missing (presumably prints the
    # usage and exits - confirm).
    if (i_argLength not in i_possibleArgLengths):

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()
    i_vcfFilename = str(i_cmdLineArgs[0])
    i_rnaGeneFilename = str(i_cmdLineArgs[1])
    i_rnaGeneFamilyFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename != None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename != None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # NOTE(review): the raise is cut off after the message; the "%s"
        # placeholder has no argument and the call is never closed.
        raise ValueError(
            "Invalid log level: '%s' must be one of the following:  DEBUG, INFO, WARNING, ERROR, CRITICAL",

    # set up the logging
    # NOTE(review): the logging configuration calls are truncated; only the
    # datefmt arguments of what look like two alternative calls (with and
    # without a log file) remain, and the "else:" between them is missing.
    if (i_logFilename != None):
                            datefmt='%m/%d/%Y %I:%M:%S %p')
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("rnaGeneFilename=%s", i_rnaGeneFilename)
        logging.debug("rnaGeneFamilyFilename=%s", i_rnaGeneFamilyFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("passedOnly?=%s", i_passedVCFCallsOnlyFlag)

    # check for any errors
    # NOTE(review): when both an output file and a log file are given, the
    # second assignment replaces the list instead of extending it, so only
    # the log filename is kept - looks unintended, confirm.
    i_writeFilenameList = []
    if (i_outputFilename != None):
        i_writeFilenameList = [i_outputFilename]
    if (i_logFilename != None):
        i_writeFilenameList = [i_logFilename]

    # NOTE(review): the list literal is never closed and the
    # check_for_argv_errors(...) call plus its if-body are truncated.
    i_readFilenameList = [
        i_vcfFilename, i_rnaGeneFilename, i_rnaGeneFamilyFilename

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList,

    # open the output stream
    i_outputFileHandler = None
    if (i_outputFilename != None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    # get the RNA gene blacklists
    # NOTE(review): the first half of the tuple target is missing; the code
    # below reads i_rnaGeneList, so that is presumably the first element.
     i_rnaGeneFamilyList) = get_rna_genes(i_rnaGeneFilename,
                                          i_rnaGeneFamilyFilename, i_debug)

    # the two extra FILTER header lines are emitted once, next to the first
    # ##FILTER or ##INFO line that is encountered
    hasAddedHeader = False
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(i_vcfFilename)
    vcfHeader = "##FILTER=<ID=rgene,Description=\"This gene is on the RNA gene blacklist\">\n"
    vcfHeader += "##FILTER=<ID=rgfam,Description=\"This gene family is on the RNA gene family blacklist\">\n"

    for line in i_vcfFileHandler:

        # strip the carriage return and newline characters
        line = line.rstrip("\r\n")

        if (i_debug):
            logging.debug("vcfLine: %s", line)

        # if it is an empty line, then just continue
        # NOTE(review): the if-body is missing.  Also, because the newline
        # was already stripped, a truly empty line is "" and "".isspace()
        # is False, so this test only catches whitespace-only lines.
        if (line.isspace()):
        # if we find the FILTER section, then add the filters from here
        elif ((not hasAddedHeader)
              and (line.startswith("##FILTER") or line.startswith("##INFO"))):
            hasAddedHeader = True
            # NOTE(review): an "else:" (and probably a write of vcfHeader to
            # the output file) is missing; as shown, the prints would run
            # in the file-output case only.
            if (i_outputFileHandler != None):
                i_outputFileHandler.write(line + "\n")
                print >> sys.stdout, vcfHeader
                print >> sys.stdout, line

        # these lines are from previous scripts in the pipeline, so output them
        elif (line.startswith("#")):
            # NOTE(review): "else:" missing before the print
            if (i_outputFileHandler != None):
                i_outputFileHandler.write(line + "\n")
                print >> sys.stdout, line

        # if we are only suppose to process the passed calls
        # and this call has not passed, then skip it
        # (the line is still echoed unchanged to the output)
        elif (i_passedVCFCallsOnlyFlag and "PASS" not in line):
            # NOTE(review): "else:" missing before the print
            if (i_outputFileHandler != None):
                i_outputFileHandler.write(line + "\n")
                print >> sys.stdout, line

        # now we are to the data
        # NOTE(review): the "else:" that opens this branch is missing

            # split the line on the tab
            splitLine = line.split("\t")

            # column 7 (index 6) holds the ";"-separated filters
            filterSet = set(splitLine[6].split(";"))

            # if there are no filters so far, then clear the list
            if (len(filterSet) == 1 and "PASS" in filterSet):
                filterSet = set()

            # parse the info column and create a dict
            infoList = splitLine[7].split(";")
            infoDict = collections.defaultdict(list)
            for info in infoList:
                keyValueList = info.split("=")
                # some keys are just singular without a value (e.g. DB, SOMATIC, etc.)
                # NOTE(review): the "else:" between the two assignments is
                # missing; as shown, the second line would raise IndexError
                # for flag-only keys.
                if (len(keyValueList) == 1):
                    infoDict[keyValueList[0]] = ["True"]
                    # the value can be a comma separated list
                    infoDict[keyValueList[0]] = keyValueList[1].split(",")

            # each EFF entry is matched from its first word character up to
            # and including an opening parenthesis; the matched text is the
            # effect name, the remainder holds the "|"-separated details
            effectList = infoDict["EFF"]
            effectRegEx = re.compile("(\\w).*\\({1}")
            ignoreEffectsList = ["UPSTREAM", "DOWNSTREAM"]

            isRnaBlacklistGene = False
            isRnaBlacklistGeneFamily = False

            for rawEffect in effectList:
                rawEffect = rawEffect.rstrip(")")
                iterator = effectRegEx.finditer(rawEffect)

                # for each match object in the iterator
                for match in iterator:
                    effect = match.group()
                    rawEffect = rawEffect.replace(effect, "")
                    effect = effect.rstrip("(")

                # NOTE(review): the if-body is missing (presumably skips
                # UPSTREAM/DOWNSTREAM effects - confirm).
                if (effect in ignoreEffectsList):

                effectParts = rawEffect.split("|")
                #effectImpact = effectParts[0]
                #functionalClass = effectParts[1]
                #codonChange = effectParts[2]
                #aaChange = effectParts[3]
                #aaLength = effectParts[4]
                geneName = effectParts[5]
                transcriptBiotype = effectParts[6]
                #geneCoding = effectParts[7]
                #ensembleId = effectParts[8]
                #exonNumber = effectParts[9]
                #genotypeNumber = effectParts[10]

                # the RNA gene list can have "RP11" and that
                # should filter out any gene with RP11 in it
                # (substring match on the gene name)
                for rnaGene in i_rnaGeneList:
                    if (rnaGene in geneName):
                        isRnaBlacklistGene = True

                # the family blacklist is matched exactly against the biotype
                if (transcriptBiotype in i_rnaGeneFamilyList):
                    isRnaBlacklistGeneFamily = True

            # the first six columns are copied through unchanged
            output = ["\t".join(splitLine[0:6])]

            # NOTE(review): the bodies of the next three "if" statements and
            # the statements that append the filter and remaining columns to
            # "output" are missing.
            # if the filter should be applied
            if (isRnaBlacklistGene):
            # if the filter should be applied
            if (isRnaBlacklistGeneFamily):

            # if there are no filters so far, then this call passes
            if (len(filterSet) == 0):



            # NOTE(review): "else:" missing before the print
            if (i_outputFilename != None):
                i_outputFileHandler.write("\t".join(output) + "\n")
                print >> sys.stdout, "\t".join(output)

    # close the files
    # NOTE(review): the close calls themselves are missing
    if (i_outputFilename != None):

Exemplo n.º 15
def get_simulation_data(anInputFilename, aStatsDict, aCompareDict,
                        aPrefix, anIsDebug):
    '''
    ' Load the simulated events from a tab-separated file and tally them.
    '
    ' The simulation files have 11 fields:  mutation type, chrom, start, end,
    ' target AF, mutation position, base change, coverage in, coverage out,
    ' actual AF, highest AF of anything linked.  Only the chromosome
    ' (index 1) and the mutation position (index 5) are read here.
    '
    ' anInputFilename: The name of the file, opened through
    '                  radiaUtil.get_read_fileHandler()
    ' aStatsDict: A dictionary holding stats about all the comparisons.  It
    '             is updated in place with "+= 1", so every touched key must
    '             already exist or the mapping must supply a default (e.g.
    '             collections.defaultdict(int))
    ' aCompareDict: The key,value pairs that should be used in the
    '               comparison; each value is a comma-separated key string
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files; the
    '          per-key counters are only updated for "cmp"
    ' anIsDebug: A flag for outputting debug messages (unused here, kept
    '            for interface compatibility)
    '
    ' Returns (outputDict, aStatsDict) where outputDict maps
    ' "<chrom>_<mutPosition>" to the stripped line.  Lines with fewer than
    ' six fields raise an IndexError.
    '''

    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    # the per-key counters are only maintained for the compare files, so
    # the key strings can be collected once up front
    if (aPrefix == "cmp"):
        cmpKeyStringList = list(aCompareDict.values())
    else:
        cmpKeyStringList = []

    try:
        for line in inputFileHandler:

            # if it is an empty line or header line, then just continue
            if (not line.strip() or line.startswith("#")):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # split the line on the tab
            splitLine = line.split("\t")
            coordinateKey = splitLine[1] + "_" + splitLine[5]

            # a coordinate that was seen before is logged and overwritten
            if coordinateKey in outputDict:
                logging.debug(line + outputDict[coordinateKey])

            # add the coordinate to the output
            outputDict[coordinateKey] = line

            # keep track of the number of total events per file;
            # all events are considered passing events
            aStatsDict[aPrefix + "_events"] += 1
            aStatsDict[aPrefix + "_pass_events"] += 1

            # keep track of the total number of comparison events (blck,
            # dnSnp, etc.) per file.  there can be multiple keys for one
            # filter such as blq and bldp for blacklists, but a line is
            # only counted once per key string
            for cmpKeyString in cmpKeyStringList:
                if any(cmpKey in line for cmpKey in cmpKeyString.split(",")):
                    aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                    aStatsDict[aPrefix + "_pass_" + cmpKeyString] += 1
    finally:
        # release the handle even if a malformed line raises
        inputFileHandler.close()

    return (outputDict, aStatsDict)
Exemplo n.º 16
def parse_blat_output(aBlatFile, anOutputFormat, anIsDebug):
    '''
    ' This function parses the output from BLAT.  Two formats are supported:
    ' BLAST NCBI-8 and PSL.  It groups all of the information from one query
    ' sequence and uses the python generator to yield the information.  It
    ' ignores empty lines and "#" lines and strips trailing \\r\\n characters.
    '
    ' The query id (column index 9 for PSL, index 0 for BLAST) is split on
    ' "_": the first three parts form the grouping prefix and the first
    ' four parts form the read id used as the dictionary key.
    '
    ' aBlatFile:         A output file from BLAT
    ' anOutputFormat:    BLAST or PSL; anything else raises a ValueError
    ' anIsDebug:         A flag for outputting debug messages
    '
    ' Yields one collections.defaultdict(list) per run of consecutive lines
    ' sharing the same prefix, mapping the read id to its lines.  A final
    ' dictionary is always yielded, so an input without hits yields one
    ' empty dictionary.
    '''

    # open the file
    fileHandler = radiaUtil.get_read_fileHandler(aBlatFile)
    blatHitsDict = collections.defaultdict(list)
    previousPrefix = ""

    try:
        for line in fileHandler:

            # if it is an empty line, then just continue
            # if it is a header line, then just continue
            if (line.isspace() or line.startswith("#")):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # split the line on the tab
            splitLine = line.split("\t")

            # get the coordinate data =
            # rnaTumor_7_55196749_HS2144:2:1108:17342:164248
            if (anOutputFormat == "PSL"):
                # the PSL output has a bunch of header lines that we want
                # to skip: if the first column can't be converted into an
                # int, then skip
                try:
                    int(splitLine[0])
                except ValueError:
                    continue
                blatId = splitLine[9]
            elif (anOutputFormat == "BLAST"):
                blatId = splitLine[0]
            else:
                # fail with a clear message instead of an unbound blatId
                raise ValueError(
                    "Unsupported BLAT output format '%s': expected PSL "
                    "or BLAST" % anOutputFormat)

            idParts = blatId.split("_")
            prefix = "_".join(idParts[0:3])
            readId = "_".join(idParts[0:4])

            # this catches all of the matches except the first one
            if (prefix == previousPrefix):
                if (anIsDebug):
                    logging.debug("prefixes match, current=%s, prev=%s",
                                  prefix, previousPrefix)
            # if the prefixes don't match and the blatHitsDict is not empty:
            # we've reached a new set of blat hits, so yield the previous
            # ones
            elif blatHitsDict:
                if (anIsDebug):
                    logging.debug("new prefix=%s, prev=%s",
                                  prefix, previousPrefix)
                    logging.debug("yielding len blatHits=%s",
                                  len(blatHitsDict))
                # yield the blat hits for this prefix
                yield blatHitsDict
                # start a fresh dict for the next matches, so that a
                # consumer still holding the yielded one keeps its content
                blatHitsDict = collections.defaultdict(list)
                previousPrefix = prefix
                if (anIsDebug):
                    logging.debug("after yield current=%s, prev=%s",
                                  prefix, previousPrefix)
            # if the prefixes don't match, and the blatHitsDict is empty:
            # this is the first line, so just set the previous prefix
            else:
                previousPrefix = prefix

            # in all three cases the current line belongs to the current
            # group.  NOTE(review): storing the stripped raw line is
            # reconstructed from the comments - confirm what the consumers
            # of the yielded dictionary expect.
            blatHitsDict[readId].append(line)
    finally:
        # also runs when the consumer stops iterating early
        fileHandler.close()

    # this one is needed to yield the very last blatHitsDict when all
    # lines have been processed
    yield blatHitsDict
Exemplo n.º 17
def get_maf_data(anInputFilename, aStatsDict, aCompareDict, aPrefix, anIsDebug):
    '''
    ' Load the events from a tab-separated MAF file and tally them.
    '
    ' Only two columns are read from each data line: index 4 is used as the
    ' chromosome and index 6 as the stop coordinate, so a data line needs at
    ' least 7 fields (shorter lines raise an IndexError).  Empty lines and
    ' lines starting with "#" are skipped.
    '
    ' anInputFilename: The name of the file, opened through
    '                  radiaUtil.get_read_fileHandler()
    ' aStatsDict: A dictionary holding stats about all the comparisons.  It
    '             is updated in place with "+= 1", so every touched key must
    '             already exist or the mapping must supply a default (e.g.
    '             collections.defaultdict(int))
    ' aCompareDict: The key,value pairs that should be used in the
    '               comparison; keys and values are comma-separated key
    '               strings
    ' aPrefix: "rad" for the RADIA files (the keys of aCompareDict are
    '          searched), "cmp" for the compare files (the values are
    '          searched); any other prefix only updates the event totals
    ' anIsDebug: A flag for outputting debug messages (unused here, kept
    '            for interface compatibility)
    '
    ' Returns (outputDict, aStatsDict) where outputDict maps
    ' "<chrom>_<stopCoordinate>" to the stripped line.
    '''
    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    # pick the side of the comparison dict that belongs to this prefix
    if (aPrefix == "rad"):
        keyStringList = list(aCompareDict.keys())
    elif (aPrefix == "cmp"):
        keyStringList = list(aCompareDict.values())
    else:
        keyStringList = []

    try:
        for line in inputFileHandler:
            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # skip empty lines and the lines from previous scripts in the
            # pipeline.  after the rstrip above an empty line is "" and
            # "".isspace() is False, hence the strip() test
            if (not line.strip() or line.startswith("#")):
                continue

            # now we are to the data: split the line on the tab
            splitLine = line.split("\t")
            chrom = splitLine[4]
            stopCoordinate = splitLine[6]

            # add the coordinate to the output
            outputDict[chrom + "_" + stopCoordinate] = line
            # keep track of the number of total events per file;
            # all events are considered passing events
            aStatsDict[aPrefix + "_events"] += 1
            aStatsDict[aPrefix + "_pass_events"] += 1

            # a comparison event only counts as passing when the line
            # contains "SOMATIC" (case-sensitive)
            isSomatic = ("SOMATIC" in line)

            # keep track of the total number of comparison events (blck,
            # dnSnp, etc.) per file.  there can be multiple keys for one
            # filter such as blq and bldp for blacklists, but a line is
            # only counted once per key string
            for keyString in keyStringList:
                if any(key in line for key in keyString.split(",")):
                    aStatsDict[aPrefix + "_" + keyString] += 1
                    if (isSomatic):
                        aStatsDict[aPrefix + "_pass_" + keyString] += 1
    finally:
        # release the handle even if a malformed line raises
        inputFileHandler.close()

    return (outputDict, aStatsDict)
Exemplo n.º 19
def _parse_vcf_generator_params(aLine):
    '''
    ' Return the key/value pairs of a "##vcfGenerator=<k=v,k=<v>,...>" line.
    ' aLine: A header line without the trailing newline; it is assumed to
    '        start with the 16 characters "##vcfGenerator=<" and to end with
    '        ">", because both are removed by position
    '''
    # drop the 16 character prefix and the final closing bracket
    paramsString = aLine[16:len(aLine) - 1]
    paramsDict = {}
    for param in paramsString.split(","):
        # only split on the first "=" so values containing "=" survive
        (key, _, value) = param.partition("=")
        if (not key):
            continue
        # values may be wrapped in their own angle brackets
        paramsDict[key] = value.rstrip(">").lstrip("<")
    return paramsDict


def get_vcf_data(aVcfFile, aHeaderFile, aPassOnlyFlag, anIsDebug):
    '''
    ' This function reads from a .vcf input file and uses the python generator
    ' to yield the information one line at a time.  It ignores empty lines and
    ' strips trailing \\r\\n characters.
    '
    ' The header file is scanned first, up to its first non-header line, for a
    ' vcfGenerator line.  A vcfGenerator line found later in the VCF file
    ' itself replaces the parameters taken from the header file.  If neither
    ' file has one, an empty dictionary is yielded.
    '
    ' aVcfFile:  A VCF file
    ' aHeaderFile:  A file holding the VCF header
    ' aPassOnlyFlag:  If all calls should be processed or only those calls
    '                 that passed the filters thus far (lines without the
    '                 text "PASS" are skipped when set)
    ' anIsDebug: A flag for outputting debug messages (unused here, kept
    '            for interface compatibility)
    '
    ' Yields (chrom, stopCoordinate, idList, refList, altList, score,
    ' filterSet, infoDict, <columns from index 8 on joined by tabs>,
    ' generatorParamsDict).  The position is converted with int() and the
    ' score with float(), so non-numeric values raise a ValueError.
    '''
    generatorParamsDict = {}

    # open the header file
    fileHandler = radiaUtil.get_read_fileHandler(aHeaderFile)
    try:
        for line in fileHandler:
            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # if it is an empty line, then just continue.  after the rstrip
            # an empty line is "" and "".isspace() is False, hence strip()
            if (not line.strip()):
                continue
            # the column headers carry nothing that is yielded
            elif ("#CHROM" in line):
                continue
            # if we find the vcfGenerator line, then create the dict of params
            elif ("vcfGenerator" in line):
                generatorParamsDict = _parse_vcf_generator_params(line)
            # if we are done with the header, then stop
            elif (not line.startswith("#")):
                break
    finally:
        fileHandler.close()

    # open the VCF file
    fileHandler = radiaUtil.get_read_fileHandler(aVcfFile)
    try:
        for line in fileHandler:
            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # if it is an empty line, then just continue
            if (not line.strip()):
                continue
            # the column headers carry nothing that is yielded
            elif ("#CHROM" in line):
                continue
            # if we find the vcfGenerator line, then create the dict of params
            elif ("vcfGenerator" in line):
                generatorParamsDict = _parse_vcf_generator_params(line)
                continue
            # these are header lines, so just continue
            elif (line.startswith("#")):
                continue
            # if we are only suppose to process the passed calls
            # and this call has not passed, then skip it
            elif (aPassOnlyFlag and "PASS" not in line):
                continue

            # split the line on the tab
            splitLine = line.split("\t")

            # the coordinate is the second element
            chrom = splitLine[0]
            stopCoordinate = int(splitLine[1])
            idList = splitLine[2].split(";")
            refList = splitLine[3].split(",")
            altList = splitLine[4].split(",")
            score = float(splitLine[5])
            filterSet = set(splitLine[6].split(";"))

            infoDict = collections.defaultdict(list)
            for info in splitLine[7].split(";"):
                (key, separator, value) = info.partition("=")
                if (separator):
                    # the value can be a comma separated list
                    infoDict[key] = value.split(",")
                else:
                    # some keys are just singular without a value
                    # (e.g. DB, SOMATIC, etc.)
                    infoDict[key] = ["True"]

            # yield all the information about the current coordinate
            yield (chrom, stopCoordinate, idList, refList, altList, score,
                   filterSet, infoDict, "\t".join(splitLine[8:]),
                   generatorParamsDict)
    finally:
        # also runs when the consumer stops iterating early
        fileHandler.close()
Exemplo n.º 21
def get_simulation_data(anInputFilename, aStatsDict, aCompareDict, aPrefix, anIsDebug):
    '''
    ' Load the simulated events from a tab-separated file and tally them.
    '
    ' The simulation files have 11 fields:  mutation type, chromosome, start,
    ' end, target AF, mutation position, base change, coverage in, coverage
    ' out, actual AF, highest AF of anything linked.  Only the chromosome
    ' (index 1) and the mutation position (index 5) are read here; lines with
    ' fewer than six fields raise an IndexError.  Empty lines and lines
    ' starting with "#" are skipped.
    '
    ' anInputFilename: The name of the file, opened through
    '                  radiaUtil.get_read_fileHandler()
    ' aStatsDict: A dictionary holding stats about all the comparisons.  It
    '             is updated in place with "+= 1", so every touched key must
    '             already exist or the mapping must supply a default (e.g.
    '             collections.defaultdict(int))
    ' aCompareDict: The key,value pairs that should be used in the
    '               comparison; each value is a comma-separated key string
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files; the
    '          per-key counters are only updated for "cmp"
    ' anIsDebug: A flag for outputting debug messages (unused here, kept
    '            for interface compatibility)
    '
    ' Returns (outputDict, aStatsDict) where outputDict maps
    ' "<chrom>_<mutPosition>" to the stripped line.
    '''
    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    # the per-key counters are only maintained for the compare files
    if (aPrefix == "cmp"):
        cmpKeyStringList = list(aCompareDict.values())
    else:
        cmpKeyStringList = []

    try:
        for line in inputFileHandler:
            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # skip empty lines and the lines from previous scripts in the
            # pipeline.  after the rstrip above an empty line is "" and
            # "".isspace() is False, hence the strip() test
            if (not line.strip() or line.startswith("#")):
                continue

            # now we are to the data: split the line on the tab
            splitLine = line.split("\t")
            coordinateKey = splitLine[1] + "_" + splitLine[5]

            # a coordinate that was seen before is logged and overwritten
            if coordinateKey in outputDict:
                logging.debug(line + outputDict[coordinateKey])

            # add the coordinate to the output
            outputDict[coordinateKey] = line
            # keep track of the number of total events per file;
            # all events are considered passing events
            aStatsDict[aPrefix + "_events"] += 1
            aStatsDict[aPrefix + "_pass_events"] += 1

            # keep track of the total number of comparison events (blck,
            # dnSnp, etc.) per file.  there can be multiple keys for one
            # filter such as blq and bldp for blacklists, but a line is
            # only counted once per key string
            for cmpKeyString in cmpKeyStringList:
                if any(cmpKey in line for cmpKey in cmpKeyString.split(",")):
                    aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                    aStatsDict[aPrefix + "_pass_" + cmpKeyString] += 1
    finally:
        # release the handle even if a malformed line raises
        inputFileHandler.close()

    return (outputDict, aStatsDict)
Exemplo n.º 23
def get_validation_data(anInputFilename, aStatsDict, aCompareDict, aPrefix, anIsDebug):
    '''
    ' Load the events from a tab-separated validation file and tally them.
    '
    ' The expected columns (0-based) are: chrom, chr_start, chr_stop, ref,
    ' var, source, val_result.  Only chrom (index 0) and chr_stop (index 2)
    ' are read; lines with fewer than three fields raise an IndexError.
    ' Empty lines, lines starting with "#" and the column header line
    ' (starting with "chrom") are skipped.
    '
    ' anInputFilename: The name of the file, opened through
    '                  radiaUtil.get_read_fileHandler()
    ' aStatsDict: A dictionary holding stats about all the comparisons.  It
    '             is updated in place with "+= 1", so every touched key must
    '             already exist or the mapping must supply a default (e.g.
    '             collections.defaultdict(int))
    ' aCompareDict: The key,value pairs that should be used in the
    '               comparison; each value is a comma-separated key string
    ' aPrefix: "rad" for the RADIA files, "cmp" for the compare files; the
    '          per-key counters are only updated for "cmp"
    ' anIsDebug: A flag for outputting debug messages (unused here, kept
    '            for interface compatibility)
    '
    ' Returns (outputDict, aStatsDict) where outputDict maps
    ' "<chrom>_<chr_stop>" to the stripped line.
    '''
    inputFileHandler = radiaUtil.get_read_fileHandler(anInputFilename)
    outputDict = {}

    # the per-key counters are only maintained for the compare files
    if (aPrefix == "cmp"):
        cmpKeyStringList = list(aCompareDict.values())
    else:
        cmpKeyStringList = []

    try:
        for line in inputFileHandler:
            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # skip empty lines.  after the rstrip above an empty line is ""
            # and "".isspace() is False, hence the strip() test
            if (not line.strip()):
                continue
            # skip the lines from previous scripts in the pipeline ("#")
            # and the column header line ("chrom")
            if (line.startswith("#") or line.startswith("chrom")):
                continue

            # now we are to the data: split the line on the tab
            splitLine = line.split("\t")
            chrom = splitLine[0]
            stopCoordinate = splitLine[2]

            # add the coordinate to the output
            outputDict[chrom + "_" + stopCoordinate] = line
            # keep track of the number of total events per file;
            # all events are considered passing events
            aStatsDict[aPrefix + "_events"] += 1
            aStatsDict[aPrefix + "_pass_events"] += 1

            # keep track of the total number of comparison events (blck,
            # dnSnp, etc.) per file.  there can be multiple keys for one
            # filter such as blq and bldp for blacklists, but a line is
            # only counted once per key string.  every matching line is
            # also a passing line
            for cmpKeyString in cmpKeyStringList:
                if any(cmpKey in line for cmpKey in cmpKeyString.split(",")):
                    aStatsDict[aPrefix + "_" + cmpKeyString] += 1
                    aStatsDict[aPrefix + "_pass_" + cmpKeyString] += 1
    finally:
        # release the handle even if a malformed line raises
        inputFileHandler.close()

    return (outputDict, aStatsDict)
def parse_blat_output(aBlatFile, anOutputFormat, anIsDebug):
    '''
    ' This function parses the output from BLAT.  Two formats are supported:
    ' BLAST NCBI-8 and PSL.  It groups all of the information from one query
    ' sequence and uses the python generator to yield the information.  It
    ' ignores empty lines and "#" lines and strips trailing \\r\\n characters.
    '
    ' The query id (column index 9 for PSL, index 0 for BLAST) is split on
    ' "_": the first three parts form the grouping prefix and the first
    ' four parts form the read id used as the dictionary key.
    '
    ' aBlatFile:         A output file from BLAT
    ' anOutputFormat:    BLAST or PSL; anything else raises a ValueError
    ' anIsDebug:         A flag for outputting debug messages
    '
    ' Yields one collections.defaultdict(list) per run of consecutive lines
    ' sharing the same prefix, mapping the read id to its lines.  A final
    ' dictionary is always yielded, so an input without hits yields one
    ' empty dictionary.
    '''

    # open the file
    fileHandler = radiaUtil.get_read_fileHandler(aBlatFile)
    blatHitsDict = collections.defaultdict(list)
    previousPrefix = ""

    try:
        for line in fileHandler:

            # if it is an empty line, then just continue
            # if it is a header line, then just continue
            if (line.isspace() or line.startswith("#")):
                continue

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            # split the line on the tab
            splitLine = line.split("\t")

            # get the coordinate data =
            # rnaTumor_7_55196749_HS2144:2:1108:17342:164248
            if (anOutputFormat == "PSL"):
                # the PSL output has a bunch of header lines that we want
                # to skip: if the first column can't be converted into an
                # int, then skip
                try:
                    int(splitLine[0])
                except ValueError:
                    continue
                blatId = splitLine[9]
            elif (anOutputFormat == "BLAST"):
                blatId = splitLine[0]
            else:
                # fail with a clear message instead of an unbound blatId
                raise ValueError(
                    "Unsupported BLAT output format '%s': expected PSL "
                    "or BLAST" % anOutputFormat)

            idParts = blatId.split("_")
            prefix = "_".join(idParts[0:3])
            readId = "_".join(idParts[0:4])

            # this catches all of the matches except the first one
            if (prefix == previousPrefix):
                if (anIsDebug):
                    logging.debug("prefixes match, current=%s, prev=%s",
                                  prefix, previousPrefix)
            # if the prefixes don't match and the blatHitsDict is not empty:
            # we've reached a new set of blat hits, so yield the previous
            # ones
            elif blatHitsDict:
                if (anIsDebug):
                    logging.debug("new prefix=%s, prev=%s",
                                  prefix, previousPrefix)
                    logging.debug("yielding len blatHits=%s",
                                  len(blatHitsDict))
                # yield the blat hits for this prefix
                yield blatHitsDict
                # start a fresh dict for the next matches, so that a
                # consumer still holding the yielded one keeps its content
                blatHitsDict = collections.defaultdict(list)
                previousPrefix = prefix
                if (anIsDebug):
                    logging.debug("after yield current=%s, prev=%s",
                                  prefix, previousPrefix)
            # if the prefixes don't match, and the blatHitsDict is empty:
            # this is the first line, so just set the previous prefix
            else:
                previousPrefix = prefix

            # in all three cases the current line belongs to the current
            # group.  NOTE(review): storing the stripped raw line is
            # reconstructed from the comments - confirm what the consumers
            # of the yielded dictionary expect.
            blatHitsDict[readId].append(line)
    finally:
        # also runs when the consumer stops iterating early
        fileHandler.close()

    # this one is needed to yield the very last blatHitsDict when all
    # lines have been processed
    yield blatHitsDict
Exemplo n.º 27
def get_vcf_data(anId, anInputDir, anIsDebug):
    '''
    Gather the header and the data lines from all of the per-chromosome
    VCF files that belong to one Id.

    The files are located with the glob pattern
    "<anInputDir>/<anId>_chr*.vcf*", so both plain and compressed files
    are picked up as long as radiaUtil.get_read_fileHandler() can open them.

    anId:       The prefix shared by all of the VCF files to be loaded
    anInputDir: The directory holding the VCF files, with or without a
                trailing forward slash
    anIsDebug:  A flag for outputting debug messages via logging

    Returns a tuple (headerDict, coordinateDict):
      headerDict:     maps "metadata", "format", "info", "filter" and
                      "chrom" to the list of matching header lines.  Only
                      the header of the first file that is read is kept.
      coordinateDict: maps "numbers" and "letters" to a dict of
                      chrom -> list of data lines, so that the caller can
                      sort the numeric and non-numeric chroms separately.
    '''

    # the header is only taken from the first file that is processed
    processedHeader = False
    headerDict = dict()
    headerDict["metadata"] = list()
    headerDict["format"] = list()
    headerDict["info"] = list()
    headerDict["filter"] = list()
    headerDict["chrom"] = list()
    coordinateDict = dict()
    coordinateDict["numbers"] = dict()
    coordinateDict["letters"] = dict()

    # if the input directory doesn't end with a forward slash,
    # then add one so that glob.glob will work
    if (not anInputDir.endswith("/")):
        anInputDir = anInputDir + "/"

    # for each vcf file
    # they might be gzipped, they might not
    for vcfFile in glob.glob(anInputDir + anId + "_chr*.vcf*"):

        # open the file
        vcfFileHandler = radiaUtil.get_read_fileHandler(vcfFile)

        # make sure the file gets closed even if a line can't be parsed
        try:
            for line in vcfFileHandler:

                # if it is an empty line, then just continue
                if (line.isspace()):
                    continue

                # strip the carriage return and newline characters
                line = line.rstrip("\r\n")

                if (anIsDebug):
                    logging.debug("vcfLine: %s", line)

                if (line.startswith("#")):
                    # if we haven't processed the header yet, then do it
                    # here, the more specific prefixes have to be tested
                    # before the generic "##" one
                    if (not processedHeader):
                        if (line.startswith("##FORMAT")):
                            headerDict["format"].append(line)
                        elif (line.startswith("##INFO")):
                            headerDict["info"].append(line)
                        elif (line.startswith("##FILTER")):
                            headerDict["filter"].append(line)
                        elif (line.startswith("##")):
                            headerDict["metadata"].append(line)
                        elif (line.startswith("#CHROM")):
                            headerDict["chrom"].append(line)
                            # now we've processed the header
                            processedHeader = True

                    # header lines never carry coordinate data
                    continue

                # the chrom is the first tab-separated element
                chrom = line.split("\t")[0]

                # we want to sort everything at the end, so keep track
                # of the chroms that are numbers and letters separately
                if (is_number(chrom)):
                    chromDict = coordinateDict["numbers"]
                else:
                    chromDict = coordinateDict["letters"]

                if chrom not in chromDict:
                    chromDict[chrom] = list()
                chromDict[chrom].append(line)
        finally:
            # close the file and move onto the next one
            vcfFileHandler.close()

    return (headerDict, coordinateDict)