Exemplo n.º 1
0
class CasperQuick:
    """Collect the targets found in requested regions of a CASPER sequence file.

    Every requested region is resolved to a ``(chromosome, start, end)``
    location, the sequence file is scanned for that chromosome, and the
    decompressed target tuples are gathered in ``allTargets`` before being
    written to ``<output_file_path>quickresults.txt``.
    """

    def __init__(self, casper_seq_file, output_file_path, ofa):
        """Remember the input/output locations; no file is touched here.

        Args:
            casper_seq_file: path of the sequence file that will be scanned.
            output_file_path: prefix placed verbatim in front of
                ``"quickresults.txt"`` when the results are written.
            ofa: stored as ``off_target_all``; this class never reads it.
        """
        self.csffile = casper_seq_file
        self.ST = SeqTranslate()
        self.allTargets = {}
        self.location = tuple()
        self.output = output_file_path
        self.off_target_all = ofa

    def loadGenesandTargets(self, rk):
        """Collect the targets of every region in ``rk`` and write the report.

        Args:
            rk: iterable of regions.  A tuple is used directly as the
                ``(chromosome, start, end)`` location; any other value is
                handed to ``Kegg().gene_locator`` to obtain that location.

        Side effects:
            Fills ``self.allTargets`` (keyed by ``str(region)``), leaves the
            last resolved location in ``self.location`` and finally calls
            ``printoutresultstofile``.
        """
        for region_kegg in rk:
            key = str(region_kegg)
            self.allTargets[key] = []
            if isinstance(region_kegg, tuple):
                self.location = region_kegg
            else:
                self.location = Kegg().gene_locator(region_kegg)
            self._collect_targets(self.allTargets[key])

        self.printoutresultstofile()

    def _collect_targets(self, found):
        """Append to ``found`` the targets of ``self.location`` read from the sequence file."""
        with open(self.csffile) as myfy:
            # iter(readline, '') stops cleanly at end of file.
            for line in iter(myfy.readline, ''):
                if line.find('CHROMOSOME') == -1:
                    continue
                # The chromosome identifier is whatever follows '#' on the header line.
                if line[line.find("#") + 1:-1] != str(self.location[0]):
                    continue

                start = int(self.location[1])
                end = int(self.location[2])

                # Skip forward until the first line at or beyond the start position.
                curpos = 0
                while curpos < start:
                    line = myfy.readline()
                    if not line:  # region starts past the end of the file
                        return
                    curpos = self.ST.decompress64(line.split(',')[0])

                # NOTE(review): the line that ended the skip above is not
                # collected, and the first line at/after `end` is collected.
                # That window is kept exactly as before; confirm whether the
                # intended range is [start, end) instead.
                while curpos < end:
                    line = myfy.readline()
                    if not line:  # region ends past the end of the file
                        break
                    target = self.ST.decompress_csf_tuple(line.rstrip("\n"))
                    curpos = target[0]
                    found.append(target)
                # Only the first matching chromosome header is processed.
                break

    def printoutresultstofile(self):
        """Write ``allTargets`` to ``<self.output>quickresults.txt``.

        For every region its key is written on one line, followed by one
        ``field0,field1,field2`` line per collected target.
        """
        out = self.output + "quickresults.txt"
        with open(out, 'w') as f:
            for item, targets in self.allTargets.items():
                f.write(item)
                f.write('\n')
                for target in targets:
                    insert = str(target[0]) + "," + str(target[1]) + "," + str(
                        target[2]) + '\n'
                    f.write(insert)
Exemplo n.º 2
0
class OffTargetAlgorithm:
    """Score candidate off-target sites for a list of guide sequences.

    Constructing the object performs the whole run: the guide list is read,
    the mismatch penalty matrix is loaded from the ``CASPERinfo`` file in the
    current working directory, every ``.cspr`` reference file is scanned and
    the hits above ``threshold`` are written to an ``off_results*.txt`` file.
    """

    # Number of leading positions compared by the scoring functions.
    SCORED_LENGTH = 19

    _INT_TO_CHAR = {0: 'A', 1: 'T', 2: 'C', 3: 'G'}
    _CHAR_TO_INT = {'A': 0, 'T': 1, 'C': 2, 'G': 3}

    def __init__(self, threshold, endo, base_org, csf_file, other_orgs,
                 casperofflist, output_path):
        """Run the complete off-target analysis.

        Args:
            threshold: hits are kept only when their score is strictly greater.
            endo: endonuclease name; part of every reference file name.
            base_org: name of the base organism (first reference genome).
            csf_file: path of the base organism file; everything before the
                first occurrence of ``base_org`` is used as the directory prefix.
            other_orgs: further organism names to scan.
            casperofflist: path of the file listing the guide sequences.
            output_path: prefix of the result file name.
        """
        self.ST = SeqTranslate()
        self.rSequences = []
        self.get_rseqs(casperofflist)
        self.mypath = csf_file[:csf_file.find(base_org)]
        self.ref_genomes = [base_org]
        self.ref_genomes += other_orgs
        self.endo = endo
        self.threshold = threshold
        # Shared by all scoring functions; set by get_scores().
        self.dSequence = str()

        # Base pairs (guide base + target base) that count as a mismatch;
        # also the row order used when filling the penalty matrix.
        self.matrixKeys = [
            "GT", "AC", "GG", "TG", "TT", "CA", "CT", "GA", "AA", "AG", "TC",
            "CC"
        ]
        self.matrix = {}
        self.fill_matrix()

        # Hits are accumulated here (keyed by guide sequence) before writing.
        self.output_data = dict()
        for myseq in self.rSequences:
            self.output_data[myseq[0]] = list()

        self._scan_genomes()
        self._write_results(output_path)

    def _scan_genomes(self):
        """Compare every guide with every target line of every reference file."""
        for sequence in self.rSequences:
            print(sequence)
            for genome in self.ref_genomes:
                # Defined up front so a target line seen before any
                # CHROMOSOME header cannot raise NameError.
                curchrom = ""
                cspr_name = self.mypath + genome + self.endo + ".cspr"
                with open(cspr_name, 'r') as f:
                    # Iterating the file ends at EOF even when the REPEATS
                    # marker is missing.
                    for line in f:
                        if line.find("CHROMOSOME") != -1:
                            curchrom = line[line.find("#") + 1:-1]
                            print("Finished checking " + curchrom)
                            continue
                        if line[0:-1] == "REPEATS":
                            break
                        # Decompress once and reuse for the filter and the score.
                        candidate = self.ST.decompress_csf_tuple(line)[1]
                        if not self.critical_similarity(sequence[0], candidate):
                            continue
                        print('found a similarity')
                        seqscore = self.get_scores(sequence[1], candidate)
                        if seqscore > self.threshold:
                            self.output_data[sequence[0]].append(
                                (str(curchrom),
                                 self.ST.decompress_csf_tuple(line[:-1]),
                                 int(seqscore * 100), genome))

    def _write_results(self, output_path):
        """Write ``output_data`` to ``<output_path>off_results<time>.txt``."""
        out_name = (output_path + "off_results" +
                    str(datetime.datetime.now().time()) + '.txt')
        with open(out_name, 'w') as out:
            out.write(
                "Off-target sequences identified.  Scores are between 0 and 1.  A higher value indicates greater "
                "probability of off-target activity at that location.\n")
            for sequence, hits in self.output_data.items():
                out.write(sequence + "\n")
                for chrom, target, score, genome in hits:
                    outloc = chrom + "," + str(target[0]) + "," + target[1]
                    out.write(genome + "," + outloc + "\t" +
                              str(score / 100) + '\n')

    def get_rseqs(self, offlist):
        """Read the guide list and append ``[guide, reversed_guide]`` pairs to ``rSequences``.

        The first line of the file is skipped unconditionally.  Reading stops
        at an ``END`` line (``EN`` is accepted too, which is what the previous
        newline-chopping logic matched) or at end of file.  Blank lines are
        ignored.  The reversed entry is the guide without its last character,
        reversed.

        Args:
            offlist: path of the guide list file.
        """
        targets = list()
        with open(offlist, 'r') as cofile:
            cofile.readline()
            for raw in cofile:
                entry = raw.rstrip("\n")
                if entry in ("END", "EN"):
                    break
                if entry:
                    targets.append(entry)
        for tar in targets:
            self.rSequences.append([tar, tar[:-1][::-1]])

    def get_scores(self, rseq, dseq):
        """Return the combined score of guide ``rseq`` against target ``dseq``.

        Stores the reverse complement of ``dseq`` in ``self.dSequence`` and
        returns ``sqrt(hsu) + step + qual ** 6``.

        NOTE(review): ``IUPAC.unambiguous_dna`` presumably comes from
        ``Bio.Alphabet``, which newer Biopython releases no longer ship --
        confirm the Biopython version this file targets.
        """
        self.dSequence = Seq(dseq, IUPAC.unambiguous_dna).reverse_complement()
        hsu = self.get_hsu_score(rseq)
        qual = self.get_qualt_score(rseq)
        step = self.qualt_step_score(rseq)
        return (math.sqrt(hsu) + step) + pow(qual, 6)

    def fill_matrix(self):
        """Load the mismatch penalty rows from the ``CASPERinfo`` file.

        The section starts after the first line beginning with ``H`` and ends
        at the first line beginning with ``-`` (or at end of file).  Rows are
        tab separated and assigned to ``matrixKeys`` in order.

        Raises:
            ValueError: when no line starting with ``H`` exists.
            IndexError: when the section holds more rows than ``matrixKeys``.
        """
        with open('CASPERinfo', 'r') as f:
            for l in f:
                if l[0] == "H":
                    break
            else:
                raise ValueError(
                    "CASPERinfo contains no line starting with 'H'")
            i = 0
            for l in f:
                if l[0] == '-':
                    break
                # Drop the line break so the last column is a clean number.
                self.matrix[self.matrixKeys[i]] = l.rstrip("\n").split("\t")
                i += 1

    def _mismatch_positions(self, rSequence):
        """Return the scored positions whose guide/target base pair is a mismatch."""
        return [
            i for i in range(self.SCORED_LENGTH)
            if str(rSequence[i]) + str(self.dSequence[i]) in self.matrixKeys
        ]

    def get_hsu_score(self, rSequence):
        """Return the product of the matrix penalties of all mismatched positions."""
        score = 1.0
        for i in self._mismatch_positions(rSequence):
            lookup = str(rSequence[i]) + str(self.dSequence[i])
            # Matrix columns are stored in the opposite order of the positions.
            score *= float(self.matrix[lookup][18 - i])
        return score

    def get_qualt_score(self, rSequence):
        """Return a 0..1 score where a mismatch at position ``i`` costs ``1 / (i + 1)``."""
        score = 3.5477
        for i in self._mismatch_positions(rSequence):
            score -= 1.0 / (i + 1)
        return score / 3.5477

    def qualt_step_score(self, rSequence):
        """Return 1.0 minus a stepped penalty (0.1, 0.05, 0.0125) per mismatch."""
        score = 1.0
        for i in self._mismatch_positions(rSequence):
            if i < 6:
                score -= 0.1
            elif i < 12:
                score -= 0.05
            else:
                score -= 0.0125
        return score

    def separation_score(self, rSequence):
        """Return ``1 - delta / 19`` where delta reflects the spread of 2-4 mismatches.

        delta is the distance between the first and last mismatch divided by
        the number of mismatches.  The four-mismatch case previously reused
        the three-mismatch formula (ignoring the fourth position); it now
        follows the same rule as the other cases.  With fewer than two or
        more than four mismatches delta is 0.
        """
        misses = self._mismatch_positions(rSequence)
        delta = 0
        if 2 <= len(misses) <= 4:
            delta = (misses[-1] - misses[0]) / float(len(misses))
        return 1.0 - (delta / 19.0)

    def critical_similarity(self, cseq1, cseq2, max_mismatches=4):
        """Return False as soon as more than ``max_mismatches`` positions differ.

        Only the positions both sequences share (the shorter length) are
        compared.
        """
        mismatches = 0
        for first, second in zip(cseq1, cseq2):
            if first != second:
                mismatches += 1
                if mismatches > max_mismatches:
                    return False
        return True

    def int_to_char(self, i):
        """Map 0, 1, 2, 3 to 'A', 'T', 'C', 'G' (KeyError otherwise)."""
        return self._INT_TO_CHAR[i]

    def char_to_int(self, c):
        """Map 'A', 'T', 'C', 'G' to 0, 1, 2, 3 (KeyError otherwise)."""
        return self._CHAR_TO_INT[c]
Exemplo n.º 3
0
class CSPRparser:
    """Reader for a CSPR file: header lines, chromosome sections and the REPEATS section.

    The file is expected to start with three header lines (genome,
    karystats, misc), followed by chromosome sections and a line reading
    ``REPEATS`` that introduces the repeats section.
    """

    def __init__(self, inputFileName):
        """Initialise every container used by the parser; no file is read here.

        Args:
            inputFileName: path of the CSPR file the instance works on.
        """
        self.multiSum = 0  # multitargeting sum, filled by read_repeats
        self.multiCount = 0  # multitargeting count, filled by read_repeats
        self.seqTrans = SeqTranslate()  # used to decompress the data
        # One list per chromosome: header line first, then the decompressed targets.
        self.chromesomeList = list()
        # Integers parsed from the karystats line.
        self.karystatsList = list()
        self.genome = ""  # genome name
        self.misc = ""  # anything from the misc line
        self.repeats = {}  # seed -> number of repeats (see read_repeats)
        self.seeds = {}  # seed -> list of raw repeat fields (see read_repeats)
        self.dec_tup_data = {}  # seed -> decompressed repeat tuples
        self.chromesomesSelectedList = list()
        # Population analysis data (see popParser):
        #   key = the seed
        #   value = tuples (org name, chrom #, location, sequence, pam, score, strand, endo)
        self.popData = {}
        self.unique_targets = 0  # result of the last uniq_seq_count call

        # file path variable
        self.fileName = inputFileName

    def gen_lib_parser(self, genDict, endo):
        """Return ``{gene: targets}`` for the gen_lib window.

        Args:
            genDict: maps a gene to a sequence whose first three items are
                passed to ``read_targets`` as the position tuple.
            endo: endonuclease name forwarded to ``read_targets``.
        """
        retDict = dict()
        for gene, location in genDict.items():
            retDict[gene] = self.read_targets(
                '', (location[0], location[1], location[2]), endo)
        return retDict

    def read_first_lines(self):
        """Read the three header lines into ``genome``, ``misc`` and ``karystatsList``.

        For each line, the text starting two characters after the first
        ``':'`` is kept (the trailing line break is preserved).  Every
        comma-terminated number of the karystats line is stored as an int.
        ``karystatsList`` is emptied first so repeated calls do not
        duplicate its content.
        """
        with open(self.fileName, 'r') as fileStream:
            genomeLine = fileStream.readline()
            karyLine = fileStream.readline()
            miscLine = fileStream.readline()

        self.genome = genomeLine[genomeLine.find(':') + 2:]
        self.misc = miscLine[miscLine.find(':') + 2:]

        self.karystatsList.clear()
        karyText = karyLine[karyLine.find(':') + 2:]
        # Only tokens followed by a comma are numbers; whatever trails the
        # last comma (normally just the line break) is ignored.
        for token in karyText.split(',')[:-1]:
            self.karystatsList.append(int(token))

    def get_chromesome_names(self):
        """Collect the chromosome header lines into ``chromesomesSelectedList``.

        Also refreshes the header data through ``read_first_lines``.  Every
        line containing ``'>'`` before the ``REPEATS`` line (or the end of
        the file) is stored.

        Returns:
            The raw genome line and the raw misc line.
        """
        self.read_first_lines()
        self.chromesomesSelectedList.clear()

        with open(self.fileName, 'r') as fileStream:
            retGen = fileStream.readline()
            fileStream.readline()  # karystats line, not needed here
            retMisc = fileStream.readline()

            # iter(readline, '') also stops at end of file, so a missing
            # REPEATS line cannot hang the loop.
            for buffer in iter(fileStream.readline, ''):
                if buffer == 'REPEATS\n':
                    break
                if '>' in buffer:
                    self.chromesomesSelectedList.append(buffer)

        return retGen, retMisc

    def read_chromesome(self, endo):
        """Read every chromosome section into ``chromesomeList``.

        Each entry is a list whose first item is the raw header line and
        whose remaining items are the decompressed target lines.  The first
        line after the three file header lines always opens a section; later
        sections start at lines beginning with ``'>'``.  Reading stops at the
        ``REPEATS`` line or at end of file.

        Args:
            endo: endonuclease name forwarded to the decompressor.
        """
        self.chromesomeList.clear()
        tempList = list()
        with open(self.fileName, 'r') as fileStream:
            # ignore the first 3 lines
            fileStream.readline()
            fileStream.readline()
            fileStream.readline()

            for lineNumber, bufferString in enumerate(
                    iter(fileStream.readline, '')):
                if bufferString == "REPEATS\n":
                    break
                if lineNumber == 0 or bufferString[0] == '>':
                    # A new chromosome starts: store the finished one.
                    if tempList:
                        self.chromesomeList.append(tempList)
                    tempList = [bufferString]
                else:
                    tempList.append(
                        self.seqTrans.decompress_csf_tuple(bufferString,
                                                           endo=endo))
        if tempList:
            self.chromesomeList.append(tempList)

    @staticmethod
    def _read_repeats_section(fileStream):
        """Skip to the line after ``REPEATS`` and return the remaining lines.

        Returns ``['']`` when the marker is missing.
        """
        for buf in iter(fileStream.readline, ''):
            if buf == "REPEATS\n":
                break
        return fileStream.read().split('\n')

    def read_repeats(self, endoChoice):
        """Parse the REPEATS section of the file.

        The section is read as pairs of lines: a compressed seed followed by
        a tab-separated list of repeats.  For every seed:

        * ``repeats[seed]`` is the number of non-empty repeat entries,
        * ``seeds[seed]`` holds the comma-split fields of each entry,
        * ``dec_tup_data[seed]`` holds the decompressed tuple of each entry.

        ``multiSum`` and ``multiCount`` are accumulated over all entries.
        All of these are reset at the start of the call so the results always
        describe one read of the file.

        Args:
            endoChoice: endonuclease name; key into ``seqTrans.endo_info``.
        """
        seedLength = int(self.seqTrans.endo_info[endoChoice][1])

        # clear what is already in there
        self.repeats.clear()
        self.seeds.clear()
        self.dec_tup_data.clear()
        self.multiSum = 0
        self.multiCount = 0

        # only read the repeats section of the file
        with open(self.fileName, 'r') as fileStream:
            split_info = self._read_repeats_section(fileStream)

        # parse the info now and store it in the correct dictionaries
        for index in range(0, len(split_info) - 1, 2):
            seed = self.seqTrans.decompress64(split_info[index],
                                              slength=seedLength)
            repeat = split_info[index + 1].split("\t")

            self.repeats[seed] = 0
            self.seeds[seed] = []
            self.dec_tup_data[seed] = []
            for item in repeat:
                if item == "":
                    continue
                self.repeats[seed] += 1
                sequence = item.split(',')
                self.seeds[seed].append(sequence)

                # Fields 1..3 of the entry plus the seed sequence form the
                # string handed to the tuple decompressor.
                temp = sequence[1:4]
                temp.append(
                    str(
                        self.seqTrans.decompress64(seed,
                                                   toseq=True,
                                                   slength=seedLength)))
                self.dec_tup_data[seed].append(
                    self.seqTrans.decompress_csf_tuple(",".join(temp),
                                                       bool=True,
                                                       endo=endoChoice))
                self.multiSum += self.seqTrans.decompress64(
                    sequence[3], slength=seedLength)
                self.multiCount += 1

    def popParser(self, cspr_file, endoChoice):
        """Parse the repeats of a population CSPR file into ``popData``.

        Args:
            cspr_file: path of the file to read (``self.fileName`` is not used).
            endoChoice: endonuclease name; key into ``seqTrans.endo_info``.

        Returns:
            A pair ``(retNumber, referenceList)``: the integer after the last
            comma of the genome line, and the ``(first, second)`` field pairs
            parsed from the ``|``-separated misc line.
        """
        self.popData.clear()
        seedLength = self.seqTrans.endo_info[endoChoice][1]

        referenceList = list()

        with open(cspr_file, 'r') as file_stream:
            genomeLine = file_stream.readline()
            file_stream.readline()  # karystats line, not needed here
            misc_line = file_stream.readline()
            split_info = self._read_repeats_section(file_stream)

        # parse the genome line
        retNumber = int(genomeLine.split(',')[-1])

        # parse the miscellaneous line and get the data we want out of it
        usefulData = misc_line[misc_line.find(':') + 2:].split('|')
        usefulData.pop()  # the piece after the last '|' is not an entry
        for entry in usefulData:
            temp = entry.split(',')
            referenceList.append((temp[0], temp[1]))

        for index in range(0, len(split_info) - 1, 2):
            # get the seed and repeat line
            seed_d = self.seqTrans.decompress64(split_info[index],
                                                slength=int(seedLength),
                                                toseq=True)
            repeat = split_info[index + 1].split('\t')

            # if the seed is not in the dict, put it in there
            if seed_d not in self.popData:
                self.popData[seed_d] = list()

            for item in repeat:
                if item == '':
                    continue
                chrom = item[:item.find(',')]
                temp = item.split(',')[1:4]
                temp.append(str(seed_d))
                tempTuple = self.seqTrans.decompress_csf_tuple(
                    ",".join(temp), bool=True, endo=endoChoice)
                # The chromosome number is a 1-based index into referenceList.
                orgName = referenceList[int(chrom) - 1][0]

                self.popData[seed_d].append((
                    orgName,
                    chrom,
                    tempTuple[0],
                    tempTuple[1],
                    tempTuple[2],
                    tempTuple[3],
                    tempTuple[4],
                    tempTuple[5],
                ))

        return retNumber, referenceList

    def read_all(self, endo=None):
        """Read the header lines, the chromosomes and the repeats.

        Args:
            endo: endonuclease name required by ``read_chromesome`` and
                ``read_repeats``.

        Raises:
            TypeError: when ``endo`` is not supplied (the call could never
                succeed without it).
        """
        if endo is None:
            raise TypeError(
                "read_all() needs the endonuclease name: read_all(endo)")
        print("Reading First Lines.")
        self.read_first_lines()
        print("Reading Chromesomes.")
        self.read_chromesome(endo)
        print("Reading Repeats.")
        self.read_repeats(endo)

    def get_whole_file(self):
        """Return the entire file content as one string."""
        with open(self.fileName) as fileStream:
            return fileStream.read()

    def read_targets(self, genename, pos_tuple, endo):
        """Return the decompressed targets of one region of one chromosome.

        ``self.genome`` and ``self.misc`` are set to the raw first and third
        lines of the file.  The karystats line is not stored.

        Args:
            genename: not used.
            pos_tuple: ``(chromosome, start, end)``.  The chromosome section
                is the first line containing ``"(<chromosome>)"``; targets
                are collected from the first position ``>= start`` up to, but
                not including, the first position ``>= end``.
            endo: endonuclease name; key into ``seqTrans.endo_info`` and
                forwarded to the decompressor.

        Returns:
            The list of decompressed target tuples (empty if nothing matched).
        """
        retList = list()
        with open(self.fileName) as fileStream:
            self.genome = fileStream.readline()
            fileStream.readline()
            self.misc = fileStream.readline()

            header = fileStream.readline()

            # get the sequence length for the decompressor
            seqLength = self.seqTrans.endo_info[endo][2]

            # Find the right chromosome/scaffold.
            chromTag = "(" + str(pos_tuple[0]) + ")"
            while header != "" and header.find(chromTag) == -1:
                header = fileStream.readline()
            if header == "":
                print("Error: the target could not be found in this file!")
                return retList

            started = False
            for myline in iter(fileStream.readline, ''):
                # The next chromosome header or the repeats marker ends the
                # section; so does the end of the file.
                if myline[0] == '>' or myline == "REPEATS\n":
                    break
                position = self.seqTrans.decompress64(myline.split(",")[0],
                                                      slength=seqLength)
                if not started:
                    if position < pos_tuple[1]:
                        continue
                    started = True
                if position >= pos_tuple[2]:
                    break
                retList.append(
                    self.seqTrans.decompress_csf_tuple(myline, endo=endo))

        return retList

    def uniq_seq_count(self):
        """Count the 6-item entries of ``chromesomeList``.

        Header lines are strings and are never counted, even when they happen
        to be six characters long.  The result is also stored in
        ``self.unique_targets``.
        """
        self.unique_targets = 0
        for chromo in self.chromesomeList:
            for data in chromo:
                if not isinstance(data, str) and len(data) == 6:
                    self.unique_targets += 1
        return self.unique_targets