def __init__(self, casper_seq_file, output_file_path, ofa):
    """Remember the input/output locations and create empty result holders.

    Args:
        casper_seq_file: stored unchanged as ``self.csffile``.
        output_file_path: stored unchanged as ``self.output``.
        ofa: stored unchanged as ``self.off_target_all``.
    """
    # Caller-supplied settings.
    self.csffile = casper_seq_file
    self.output = output_file_path
    self.off_target_all = ofa

    # Translator helper plus the containers that get filled later.
    self.ST = SeqTranslate()
    self.allTargets = dict()
    self.location = ()
def __init__(self, inputFileName):
    """Set up an empty parser state bound to ``inputFileName``.

    Args:
        inputFileName: path of the file to parse; stored as ``self.fileName``.
    """
    # File path variable.
    self.fileName = inputFileName

    # Multitargeting sum and count, carried over from the previous version
    # of make_graphs.
    self.multiSum = 0
    self.multiCount = 0

    # SeqTranslate instance, used for decompressing the data.
    self.seqTrans = SeqTranslate()

    # List of lists for the chromosomes; currently used by both
    # read_chromesomes and read_targets.
    self.chromesomeList = []
    # List of ints holding the karyStats, used by get_chrom_length.
    self.karystatsList = []
    self.chromesomesSelectedList = []

    # Genome name and whatever was on the misc line.
    self.genome = ""
    self.misc = ""

    # Number of repeats, and which chromosomes are repeats; see read_repeats.
    self.repeats = dict()
    self.seeds = dict()
    self.dec_tup_data = dict()

    # Data for population analysis:
    #   key   = the seed
    #   value = tuple (org name, chrom #, location, sequence, pam, score,
    #                  strand, endo)
    self.popData = dict()
def __init__(self):
    """Load the library prompt UI, wire its buttons and reset all state."""
    # Qt set-up.
    super(genLibrary, self).__init__()
    uic.loadUi('library_prompt.ui', self)
    self.setWindowTitle('Generate Library')
    self.setWindowIcon(Qt.QIcon('cas9image.png'))

    # Button connections, in the same order as they were originally made.
    button_handlers = (
        (self.cancel_button, self.cancel_function),
        (self.BrowseButton, self.browse_function),
        (self.submit_button, self.submit_data),
    )
    for button, handler in button_handlers:
        button.clicked.connect(handler)
    self.progressBar.setValue(0)

    # State variables.
    self.anno_data = {}
    self.cspr_file = ''
    self.parser = CSPRparser('')
    self.kegg_nonKegg = ''
    self.gen_lib_dict = {}
    self.S = SeqTranslate()
    self.cspr_data = {}
    self.Output = {}
    self.off_tol = .05
    self.off_max_misMatch = 4
    self.off_target_running = False

    # Number-of-genes combo box offers 1..10.
    for gene_count in range(1, 11):
        self.numGenescomboBox.addItem(str(gene_count))

    # Minimum on-target combo box offers 20..70.
    for min_on in range(20, 71):
        self.minON_comboBox.addItem(str(min_on))
def __init__(self, parent=None):
    """Load the multitargeting window, connect its signals and reset state.

    Args:
        parent: accepted for signature compatibility; it is not forwarded
            to the base-class constructor.
    """
    super(Multitargeting, self).__init__()
    uic.loadUi('multitargetingwindow.ui', self)
    self.setWindowIcon(QtGui.QIcon("cas9image.png"))

    # SeqTranslate object used throughout the class.
    self.sq = SeqTranslate()

    # The three graph views.
    self.chart_view_chro_bar = QChartView()
    self.chart_view_repeat_bar = QChartView()
    self.chart_view_repeat_line = QChartView()

    self.data = ""
    self.shortHand = ""
    self.chromo_length = []

    # Listeners for changing the seed sequence or the .cspr file.
    for combo in (self.max_chromo, self.min_chromo):
        combo.currentIndexChanged.connect(self.fill_seed_id_chrom)
    self.chromo_seed.currentIndexChanged.connect(self.chro_bar_data)
    self.Analyze_Button.clicked.connect(self.make_graphs)

    # "Go back" button and the CASPER tool bar action.
    self.back_button.clicked.connect(self.go_back)
    self.actionCASPER.triggered.connect(self.changeto_main)

    # Statistics storage variables.
    self.max_repeats = 1
    self.average = self.median = self.mode = 0
    self.average_unique = self.average_rep = 0
    self.bar_coords = list()
    self.seed_id_seq_pair = dict()
    self.positions = list()

    # Parser object.
    self.parser = CSPRparser("")

    # Flags read by fill_seed_id_chrom / chro_bar_data before they act.
    self.ready_chromo_min_max = self.ready_chromo_make_graph = True

    self.directory = 'Cspr files'
    self.info_path = os.getcwd()

    # Graphics scenes; mouse events on the first view are filtered by self.
    self.scene = QtWidgets.QGraphicsScene()
    self.graphicsView.setScene(self.scene)
    self.scene2 = QtWidgets.QGraphicsScene()
    self.graphicsView_2.setScene(self.scene2)
    self.graphicsView.viewport().installEventFilter(self)
class CasperQuick:
    """Collect the targets of a CASPER sequence file that fall in given regions.

    Results are accumulated in ``self.allTargets`` (region key -> list of
    decompressed target tuples) and written to ``<output>quickresults.txt``.
    """

    def __init__(self, casper_seq_file, output_file_path, ofa):
        """Store the file locations and create empty result containers.

        Args:
            casper_seq_file: path of the sequence file that is scanned.
            output_file_path: prefix to which ``quickresults.txt`` is appended.
            ofa: stored as ``self.off_target_all``; not used in this class.
        """
        self.csffile = casper_seq_file
        self.ST = SeqTranslate()
        self.allTargets = {}
        self.location = tuple()
        self.output = output_file_path
        self.off_target_all = ofa

    def loadGenesandTargets(self, rk):
        """Load the targets for every region in ``rk`` and write the results.

        Args:
            rk: iterable of regions. A tuple is used directly as
                ``(chromosome, start, end)``; anything else is resolved
                through ``Kegg().gene_locator``.
        """
        for region_kegg in rk:
            region_key = str(region_kegg)
            self.allTargets[region_key] = []
            if isinstance(region_kegg, tuple):
                self.location = region_kegg
            else:
                self.location = Kegg().gene_locator(region_kegg)

            chromosome = str(self.location[0])
            start = int(self.location[1])
            end = int(self.location[2])

            # The context manager closes the file even if decompression raises.
            with open(self.csffile) as myfy:
                while True:
                    line = myfy.readline()
                    if line == '':
                        break
                    if line.find('CHROMOSOME') == -1:
                        continue
                    # Checks to see if it is on the right chromosome.
                    if line[line.find("#") + 1:-1] != chromosome:
                        continue

                    # Skip forward to the start of the region.
                    # NOTE(review): the first line at or beyond `start` is
                    # consumed here and never stored; kept as in the original.
                    curpos = 0
                    while curpos < start:
                        line = myfy.readline()
                        if line == '':  # EOF before the region starts
                            break
                        curpos = self.ST.decompress64(line.split(',')[0])

                    # Collect targets until the end of the region is passed.
                    while curpos < end:
                        raw = myfy.readline()
                        if raw == '':  # EOF inside the region
                            break
                        target = self.ST.decompress_csf_tuple(raw[:-1])
                        curpos = target[0]
                        self.allTargets[region_key].append(target)
                    break

        # NOTE(review): written once after all regions; the file is opened in
        # 'w' mode, so the final content matches rewriting it per region.
        self.printoutresultstofile()

    def printoutresultstofile(self):
        """Write every region key followed by its targets to quickresults.txt.

        Each target line holds the first three fields of the target tuple,
        comma separated.
        """
        out = self.output + "quickresults.txt"
        with open(out, 'w') as f:
            for item, targets in self.allTargets.items():
                f.write(item)
                f.write('\n')
                for target in targets:
                    f.write(
                        str(target[0]) + "," + str(target[1]) + ","
                        + str(target[2]) + '\n')
def decode_targets(self):
    """Read the target lines of ``self.filename`` into ``self.targets``.

    The first line of the file is skipped. Every following line, up to (but
    not including) the ``REPEATS`` marker or the end of the file, is split
    at its first comma into a compressed location and a compressed sequence;
    both are decompressed and appended to ``self.targets`` as a
    ``(location, sequence)`` tuple.
    """
    # One translator is enough for the whole file.
    translator = SeqTranslate()

    with open(self.filename) as f:
        # Make sure to recognize chromosome number: the first line is a
        # header and is not parsed as a target.
        if f.readline().rstrip("\n") == "REPEATS":
            return

        for raw in iter(f.readline, ''):
            data = raw.rstrip("\n")
            if data == "REPEATS":
                # The marker ends the target section and is not a target.
                break

            # Parse location and sequence.
            midpoint = data.find(',')
            location = data[:midpoint]
            sequence = data[midpoint + 1:]

            # Decompress the location and sequence information.
            location = translator.decompress64(location, toseq=False)
            sequence = translator.decompress64(sequence, toseq=True)

            # Add location to storage vector.
            self.targets.append((location, sequence))
def __init__(self, output_path, base_org_path, base_org, endo, other_genomes,
             csize):
    """Build the organism subsets, then run the comparison and write it out.

    Args:
        output_path: stored as ``self.output_path``.
        base_org_path: stored as ``self.db_path``.
        base_org: organism added to ``other_genomes`` to form the full set.
        endo: stored as ``self.endo``.
        other_genomes: the remaining organisms. The list is copied, so the
            caller's object is no longer modified.
        csize: largest subset size; subsets of size 2..csize get a bucket.
    """
    # Initialize SeqTranslate object.
    self.ST = SeqTranslate()
    self.output_path = output_path

    # Sorted list of every organism, built without mutating the argument.
    self.organisms = sorted([*other_genomes, base_org])
    self.db_path = base_org_path

    # This sets the size of the subsets. NOTE: DO NOT SET THIS TO A LARGE
    # NUMBER IF THERE ARE A LOT OF ORGANISMS.
    self.combo_size = csize

    # Dictionary of dictionaries.
    # Key1: generic total sequence  Key2: org  Value: position
    self.searchableseqs = {}

    # Container that stores all the sequences seen in the combination of
    # organisms defined by the key. An example key would be (sce, yli) for
    # the shared sequences between S.cerevisiae and Y.lipolytica.
    self.buckets = {}
    # Initialize self.buckets with the tuple of every organism subset.
    for i in range(2, csize + 1):
        for subset in itertools.combinations(self.organisms, i):
            self.buckets[subset] = []
            print(subset)

    self.endo = endo

    # The object that is iterated over to decompress the output into
    # readable form.
    self.compressed_output = {}

    # Generates the sequence lists.
    for org in self.organisms:
        print(org)
        self.make_lists(org)

    # Runs the comparison.
    self.create_comparison()
    self.write_to_file()
def __init__(self, info_path):
    """Load the New Genome window, connect its widgets and reset job state.

    Args:
        info_path: stored unchanged as ``self.info_path``.
    """
    super(NewGenome, self).__init__()
    uic.loadUi('NewGenome.ui', self)
    self.setWindowTitle('New Genome')
    self.k = KEGG()
    self.info_path = info_path

    #---Button Modifications---#
    self.setWindowIcon(Qt.QIcon("cas9image.png"))
    first_buttons = (
        (self.whatsthisButton, self.whatsthisclicked),
        (self.KeggSearchButton, self.updatekegglist),
        (self.resetButton, self.reset),
        (self.submitButton, self.submit),
        (self.browseForFile, self.selectFasta),
        (self.NCBI_File_Search, self.prep_ncbi_search),
    )
    for button, handler in first_buttons:
        button.clicked.connect(handler)

    self.JobsQueueBox.setReadOnly(True)
    self.output_browser.setText("Waiting for program initiation...")
    self.CompletedJobs.setText(" ")

    self.contButton.clicked.connect(self.continue_to_main)
    self.comboBoxEndo.currentIndexChanged.connect(self.endo_settings)
    self.runButton.clicked.connect(self.run_jobs)
    self.clearButton.clicked.connect(self.clear_job_queue)
    self.viewStatButton.setEnabled(False)

    # Job bookkeeping.
    self.JobsQueue = list()  # holds Job classes.
    self.Endos = {}
    self.file = ""

    # External process; stdout and stderr are merged into one channel.
    self.process = QtCore.QProcess()
    self.process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
    self.process.finished.connect(self.upon_process_finishing)

    self.seqTrans = SeqTranslate()
    self.first = False

    # Show functionalities on window.
    self.fillEndo()
    #self.show()
    self.num_chromo_next = False
def get_instances(self):
    """Read the REPEATS section of ``self.file_name`` into the instance maps.

    For every repeated seed the decompressed seed becomes a key of
    ``self.BAD_instances`` whose value is a list of
    ``(chromosome, location, sequence)`` tuples. Afterwards one
    ``(seed, locations, count)`` tuple per seed is appended to
    ``self.sorted_instances``.

    Reading stops at the ``END_OF_FILE`` marker or at the real end of the
    file, whichever comes first.
    """
    ST = SeqTranslate()
    # NOTE(review): `path` is not defined in this method; it has to exist at
    # module scope for this call to work.
    os.chdir(path)

    with open(self.file_name, 'r') as f:
        # Skip everything up to and including the REPEATS marker.
        for x in iter(f.readline, ''):
            if x == 'REPEATS\n':
                print("reached repeat sequences")
                break

        while True:
            t = f.readline()
            # An empty string is EOF; the marker may or may not carry "\n".
            if t == '' or t.rstrip('\n') == 'END_OF_FILE':
                print("reached end of repeat sequences")
                break
            ukey = t[:-1]  # takes away the "\n" in the string
            key = ST.decompress64(ukey, slength=20, toseq=True)
            key = ST.fill_As(key, 16)
            self.BAD_instances[key] = list()

            # Add sequences and locations to the list.
            v = f.readline().split('\t')[:-1]
            for item in v:
                loctup = item.split(',')
                chrom = loctup[0]
                location = ST.decompress64(loctup[1])
                seq = ST.decompress64(loctup[2][1:], slength=20, toseq=True)
                # When A's get lost in the compression this fills them back in.
                seq = ST.fill_As(seq, 4)
                self.BAD_instances[key].append((chrom, location, seq))

    print("currently sorting")
    for key, locations in self.BAD_instances.items():
        # sequence, location, size
        self.sorted_instances.append((key, locations, len(locations)))
class Multitargeting(QtWidgets.QMainWindow):
    """Main window of the multitargeting analysis.

    Shows repeat statistics of a .cspr file as three plots plus a per-seed
    chromosome map whose marks can be hovered for details.
    """

    # NOTE(review): class-level mutable containers are shared by every
    # instance; they are only touched by the legacy "not used" methods below.
    BAD_instances = {}
    sorted_instances = []

    def __init__(self, parent=None):
        """Load the UI, connect the signals and reset all statistics.

        Args:
            parent: accepted but not forwarded to the base class.
        """
        super(Multitargeting, self).__init__()
        uic.loadUi('multitargetingwindow.ui', self)
        self.setWindowIcon(QtGui.QIcon("cas9image.png"))
        # Storage containers for the repeats and seed sequences
        self.sq = SeqTranslate()  # SeqTranslate object used in class

        # Initializes the three graphs
        self.chart_view_chro_bar = QChartView()
        self.chart_view_repeat_bar = QChartView()
        self.chart_view_repeat_line = QChartView()

        self.data = ""
        self.shortHand = ""
        self.chromo_length = list()

        # Listeners for changing the seed sequence or the .cspr file
        self.max_chromo.currentIndexChanged.connect(self.fill_seed_id_chrom)
        self.min_chromo.currentIndexChanged.connect(self.fill_seed_id_chrom)
        self.chromo_seed.currentIndexChanged.connect(self.chro_bar_data)
        self.Analyze_Button.clicked.connect(self.make_graphs)

        # go back to main button
        self.back_button.clicked.connect(self.go_back)

        # Tool Bar options
        self.actionCASPER.triggered.connect(self.changeto_main)

        # Statistics storage variables
        self.max_repeats = 1
        self.average = 0
        self.median = 0
        self.mode = 0
        self.average_unique = 0
        self.average_rep = 0
        self.bar_coords = []
        self.seed_id_seq_pair = {}
        self.positions = []

        # parser object
        self.parser = CSPRparser("")

        # Flags that mute the combo-box slots while the boxes are refilled.
        self.ready_chromo_min_max = True
        self.ready_chromo_make_graph = True
        self.directory = 'Cspr files'
        self.info_path = os.getcwd()

        self.scene = QtWidgets.QGraphicsScene()
        self.graphicsView.setScene(self.scene)
        self.scene2 = QtWidgets.QGraphicsScene()
        self.graphicsView_2.setScene(self.scene2)
        self.graphicsView.viewport().installEventFilter(self)

    def eventFilter(self, source, event):
        """Show the details of the mark under the mouse in the second view.

        On a mouse move over the chromosome view, every entry of
        ``self.bar_coords`` within one x unit of the cursor (and within its
        y range) causes the second scene to be rebuilt with one text line
        per mark drawn at that same position.
        """
        if (event.type() == QtCore.QEvent.MouseMove
                and source is self.graphicsView.viewport()):
            coord = self.graphicsView.mapToScene(event.pos())
            for bar in self.bar_coords:
                x = bar[1]
                y1 = bar[2]
                y2 = bar[3]
                on_bar_x = (coord.x() == x or coord.x() == x + 1
                            or coord.x() == x - 1)
                if on_bar_x and (coord.y() >= y1 and coord.y() <= y2):
                    # All marks that were drawn at exactly this position.
                    listtemp = [
                        a for a in self.bar_coords
                        if (x == a[1] and y1 == a[2] and y2 == a[3])
                    ]
                    self.scene2 = QtWidgets.QGraphicsScene()
                    self.graphicsView_2.setScene(self.scene2)
                    lines = []
                    for item in listtemp:
                        ind = item[0]
                        seq = str(self.seq_data[ind])
                        seed_id = self.seed_id_seq_pair[seq]
                        temp = self.parser.dec_tup_data[seed_id]
                        temp = temp[ind]
                        lines.append(
                            'Location: ' + str(temp[0]) + ' | Seq: '
                            + str(temp[1]) + ' | PAM: ' + str(temp[2])
                            + ' | SCR: ' + str(temp[3]) + ' | DIRA: '
                            + str(temp[4]))
                    # One line per mark, no trailing newline.
                    text = self.scene2.addText('\n'.join(lines))
                    font = QtGui.QFont()
                    font.setBold(True)
                    font.setPointSize(9)
                    text.setFont(font)
        return Qt.QWidget.eventFilter(self, source, event)

    def launch(self, path):
        """Switch to ``path``, load the available files and draw the graphs."""
        os.chdir(path)
        self.directory = path
        self.get_data()
        self.make_graphs()

    def get_data(self):
        """Fill the organism and endonuclease drop-downs from the .cspr files.

        The organism name is taken from the first line of each file
        (characters from index 8 up to the newline); the short name and the
        endonuclease come from the ``<short>_<endo>.cspr`` file name.
        """
        onlyfiles = [
            f for f in os.listdir(self.directory)
            if os.path.isfile(os.path.join(self.directory, f))
        ]
        print(onlyfiles)
        orgsandendos = {}
        shortName = {}
        self.endo_drop.clear()
        for file in onlyfiles:
            if file.find('.cspr') != -1:
                # Dropping 4 characters leaves a trailing '.', which is
                # stripped again before the names are shown.
                newname = file[0:-4]
                s = newname.split('_')
                # Open relative to the directory that was listed, and close
                # the handle as soon as the header line has been read.
                with open(os.path.join(self.directory, file)) as hold:
                    buf = hold.readline()
                species = buf[8:buf.find('\n')]
                endo = str(s[1])
                if species not in shortName:
                    shortName[species] = s[0]
                if species in orgsandendos:
                    orgsandendos[species].append(endo)
                else:
                    orgsandendos[species] = [endo]
                    if self.organism_drop.findText(species) == -1:
                        self.organism_drop.addItem(species)
        self.data = orgsandendos
        self.shortHand = shortName
        # .get() keeps an empty directory from raising KeyError.
        temp = self.data.get(str(self.organism_drop.currentText()), [])
        self.endo_drop.addItems([endo.strip('.') for endo in temp])
        self.organism_drop.currentIndexChanged.connect(self.changeEndos)

    def changeEndos(self):
        """Refill the endonuclease drop-down for the selected organism."""
        self.endo_drop.clear()
        temp = self.data[str(self.organism_drop.currentText())]
        temp1 = []
        for i in temp:
            i = i.strip('.')
            temp1.append(i)
            print(i)
        print(temp1)
        self.endo_drop.addItems(temp1)

    def make_graphs(self):
        """Parse the selected .cspr file and refresh plots and statistics."""
        # get the correct file name
        self.chromo_length.clear()
        file_name = self.shortHand[self.organism_drop.currentText(
        )] + "_" + self.endo_drop.currentText()
        if self.directory.find("/") != -1:
            file = (self.directory + "/" + file_name + ".cspr")
        else:
            file = (self.directory + "\\" + file_name + ".cspr")

        # set up parser, and get the repeats and carry stats
        self.parser.fileName = file
        print(self.endo_drop.currentText())
        self.parser.read_repeats(self.endo_drop.currentText())
        self.parser.read_chromesome(self.endo_drop.currentText())
        self.parser.read_first_lines()
        self.chromo_length = self.parser.karystatsList

        # calculations and setting the windows
        # A file without repeats has multiCount == 0; report 0 instead of
        # raising ZeroDivisionError.
        if self.parser.multiCount:
            self.average_rep = self.parser.multiSum / self.parser.multiCount
        else:
            self.average_rep = 0
        self.plot_repeats_vs_seeds()
        self.bar_seeds_vs_repeats()
        self.fill_min_max()
        self.nbr_seq.setText(str(len(self.parser.seeds)))
        self.nbr_unq.setText(str(self.parser.uniq_seq_count()))
        self.avg_rep.setText(str(self.average))
        self.med_rep.setText(str(self.median))
        self.mode_rep.setText(str(self.mode))
        self.scr_lbl.setText(str(self.average_rep))

    # fill in chromo bar visualization
    def chro_bar_data(self):
        """Decompress every seed and redraw the per-chromosome displays.

        Builds ``{sequence: {chromosome: [locations]}}`` from
        ``self.parser.seeds`` and passes it to ``chro_bar_create`` and
        ``fill_Chromo_Text``. Does nothing while the seed box is refilled.
        """
        if self.ready_chromo_make_graph == False:
            return
        dic_info = {}
        seqLength = int(self.sq.endo_info[self.endo_drop.currentText()][1])
        for seed in self.parser.seeds:
            sequence = str(
                self.sq.decompress64(seed, slength=seqLength, toseq=True))
            self.seed_id_seq_pair[sequence] = seed
            per_chromo = dic_info[sequence] = {}
            for repeat in self.parser.seeds[seed]:
                per_chromo.setdefault(repeat[0], []).append(
                    self.sq.decompress64(repeat[1]))
        self.chro_bar_create(dic_info)
        self.fill_Chromo_Text(dic_info)

    # fill in chromo bar visualization
    def fill_Chromo_Text(self, info):
        """Draw one bar per chromosome with a red mark per repeat location.

        Args:
            info: ``{sequence: {chromosome: [locations]}}``; only the entry
                of the currently selected seed is drawn.

        Each mark is recorded in ``self.bar_coords`` as
        ``[index, x, y1, y2]`` so that ``eventFilter`` can find it again.
        """
        current = self.chromo_seed.currentText()
        chromo_pos = {}
        self.seq_data = []
        self.positions.clear()
        for chromo in info[current]:
            pos = []
            for position in info[current][chromo]:
                self.seq_data.append(current)
                # Scale the location to the 485-unit wide drawing area.
                scaled = position / self.chromo_length[int(chromo) - 1]
                scaled = int(scaled * 485)
                self.positions.append(scaled)
                pos.append(scaled)
            chromo_pos[chromo] = pos

        self.scene = QtWidgets.QGraphicsScene()
        self.graphicsView.setScene(self.scene)
        self.bar_coords.clear()  # clear bar_coords list before creating visual
        ind = 0
        for row, chromo in enumerate(chromo_pos):
            pen_blk = QtGui.QPen(QtCore.Qt.black)
            pen_red = QtGui.QPen(QtCore.Qt.red)
            pen_blk.setWidth(3)
            pen_red.setWidth(3)

            # Each row is 25 high plus a gap of 10 (row 0 starts at y = 0).
            top = (row * 25) + 10 * row
            text = self.scene.addText(str(chromo))
            font = QtGui.QFont()
            font.setBold(True)
            font.setPointSize(10)
            text.setFont(font)
            text.setPos(0, top)
            self.scene.addRect(40, top, 525, 25, pen_blk)

            for k in chromo_pos[chromo]:
                self.scene.addLine(k + 40, top + 3, k + 40, top + 22, pen_red)
                # index value, x value, y1, y2
                self.bar_coords.append([ind, k + 40, top + 3, top + 22])
                ind += 1

    # creates bar graph num of repeats vs. chromsome
    # this graphs is connected to the repeats_vs_chromo.py file
    # to represent the widget space in the UI file
    def chro_bar_create(self, info):
        """Plot the number of repeats per chromosome for the selected seed.

        Args:
            info: ``{sequence: {chromosome: [locations]}}``.
        """
        x1 = []
        y1 = []
        lentemp = 0
        for chromo in info[self.chromo_seed.currentText()]:
            y1.append(len(info[self.chromo_seed.currentText()][chromo]))
            x1.append(chromo)
            if (int(chromo) > lentemp):
                lentemp = int(chromo)

        # clear the old graph
        self.repeats_vs_chromo.canvas.axes.clear()
        # x_pos used to format the addition of more bars appropriately
        x_pos = [i for i, _ in enumerate(x1)]

        # loop fixes when there is too many xlabels and they start running
        # together, replaces some with an empty string to space out the labels
        if (len(x_pos) > 20):
            temp = 0
            for i in x_pos:
                if (i == 0):
                    temp += 1
                else:
                    if (temp < len(str(lentemp)) + 2):
                        x1[i] = ""
                        temp += 1
                    else:
                        temp = 0

        # the following statements are plottings / formatting for the graph
        self.repeats_vs_chromo.canvas.axes.bar(x_pos, y1, align='center')
        self.repeats_vs_chromo.canvas.axes.yaxis.set_major_locator(
            MaxNLocator(integer=True))
        self.repeats_vs_chromo.canvas.axes.set_ylim(0, max(y1) + 1)
        self.repeats_vs_chromo.canvas.axes.set_xticks(x_pos)
        self.repeats_vs_chromo.canvas.axes.set_xticklabels(x1)
        self.repeats_vs_chromo.canvas.axes.set_xlabel('Chromosome')
        self.repeats_vs_chromo.canvas.axes.set_ylabel('Number of Repeats')
        self.repeats_vs_chromo.canvas.draw()

    # plots the sequences per Number Repeats bar graph
    # this graph is connected to the seeds_vs_repeats_bar.py file
    # to represent the wdiget space in the UI file
    def bar_seeds_vs_repeats(self):
        """Plot how many sequences exist for each repeat count.

        Also stores the rounded mean repeat count in ``self.average``.
        Repeat counts whose bar would be at most 1% of the tallest bar are
        left out of the plot.
        """
        data = {}
        total = 0
        for seed in self.parser.repeats:
            number = self.parser.repeats[seed]
            total += int(number)
            data[number] = data.get(number, 0) + 1
        data = self.order_high_low_rep(data)
        # No repeats at all: keep the average at 0 instead of dividing by 0.
        if self.parser.repeats:
            self.average = round(total / len(self.parser.repeats))
        else:
            self.average = 0

        holder = []
        repeats = []
        tallest = 0
        for number in data:
            if data[number] > tallest:
                tallest = data[number]
            if (data[number] / tallest) > .01:
                holder.append(data[number])
                repeats.append(number)

        # clear graph space
        self.seeds_vs_repeats_bar.canvas.axes.clear()
        # xpos used to handle appropriate formatting for more bars being added
        x_pos = [i for i, _ in enumerate(repeats)]

        # the following are plotting / formatting for the graph
        self.seeds_vs_repeats_bar.canvas.axes.bar(x_pos, holder)
        self.seeds_vs_repeats_bar.canvas.axes.set_xticks(x_pos)
        self.seeds_vs_repeats_bar.canvas.axes.set_xticklabels(repeats)
        self.seeds_vs_repeats_bar.canvas.axes.set_xlabel('Number of Repeats')
        self.seeds_vs_repeats_bar.canvas.axes.set_ylabel('Number of Sequences')
        self.seeds_vs_repeats_bar.canvas.axes.set_title(
            'Number of Sequences per Number of Repeats')

        # rects are all the bar objects in the graph
        rects = self.seeds_vs_repeats_bar.canvas.axes.patches
        rect_vals = []
        # create a (hidden) height annotation for each bar
        for rect in rects:
            height = rect.get_height()
            temp = self.seeds_vs_repeats_bar.canvas.axes.text(
                rect.get_x() + rect.get_width() / 2,
                height,
                '%d' % int(height),
                ha='center',
                va='bottom')
            temp.set_visible(False)
            rect_vals.append(temp)

        # while the cursor hovers over a bar its height annotation is shown,
        # otherwise it is hidden
        def on_plot_hover(event):
            for rect, label in zip(rects, rect_vals):
                label.set_visible(bool(rect.contains(event)[0]))
            self.seeds_vs_repeats_bar.canvas.draw()

        # statement to detect cursor hovering over the bars
        self.seeds_vs_repeats_bar.canvas.mpl_connect('motion_notify_event',
                                                     on_plot_hover)
        # must redraw after every change
        self.seeds_vs_repeats_bar.canvas.draw()

    # plots the repeats per ID number graph as line graph
    # this graph is connected to the repeats_vs_seeds_line.py file
    # to represent the widget space in the UI file
    def plot_repeats_vs_seeds(self):
        """Plot the repeat count of every seed, highest first.

        Side effects: sets ``self.mode``, ``self.median`` and, through
        ``order``, ``self.max_repeats``.
        """
        data = {}
        for seed in self.parser.repeats:
            number = self.parser.repeats[seed]
            data[number] = data.get(number, 0) + 1

        # Index of the middle seed; does not change inside the loop.
        middle = int(round(len(self.parser.repeats) / 2))
        most_common = 0
        y1 = []
        x1 = []
        index = 0
        for number in self.order(data):
            if int(data[number]) > most_common:
                most_common = int(data[number])
                self.mode = number
            hold = 0
            while hold < data[number]:
                if index == middle:
                    self.median = number
                x1.append(index)
                y1.append(number)
                index = index + 1
                hold += 1

        # clear axes
        self.repeats_vs_seeds_line.canvas.axes.clear()
        # the following are for plotting / formatting
        self.repeats_vs_seeds_line.canvas.axes.plot(x1, y1)
        self.repeats_vs_seeds_line.canvas.axes.set_xlabel('Seed ID Number')
        self.repeats_vs_seeds_line.canvas.axes.set_ylabel('Number of Repeats')
        self.repeats_vs_seeds_line.canvas.axes.set_title(
            'Number of Repeats per Seed ID Number')
        # always redraw at the end
        self.repeats_vs_seeds_line.canvas.draw()

    # fills min and max dropdown windows
    def fill_min_max(self, run_seed_fill=True):
        """Fill the min box with 1..max_repeats and the max box in reverse.

        Args:
            run_seed_fill: when true, ``fill_seed_id_chrom`` runs afterwards.
        """
        # Mute fill_seed_id_chrom while the boxes emit change signals.
        self.ready_chromo_min_max = False
        index = 1
        self.max_chromo.clear()
        self.min_chromo.clear()
        while index < self.max_repeats + 1:
            self.min_chromo.addItem(str(index))
            self.max_chromo.addItem(str(self.max_repeats + 1 - index))
            index += 1
        self.ready_chromo_min_max = True
        if run_seed_fill:
            self.fill_seed_id_chrom()

    # fill_seed_id_chrom will fill the seed ID dropdown, and create the
    # chromosome graph
    def fill_seed_id_chrom(self):
        """Fill the seed drop-down with the seeds inside the min/max range.

        An inverted range, or a range without any seed, shows a message box,
        resets both range boxes and starts over.
        """
        if self.ready_chromo_min_max == False:
            return
        if int(self.min_chromo.currentText()) > int(
                self.max_chromo.currentText()):
            self.ready_chromo_min_max = False
            self.max_chromo.clear()
            self.min_chromo.clear()
            self.ready_chromo_min_max = True
            self.fill_min_max(False)
            QtWidgets.QMessageBox.question(
                self, "Maximum cant be less than Minimum",
                "The Minimum number of repeats cant be more than the Maximum",
                QtWidgets.QMessageBox.Ok)
            self.fill_seed_id_chrom()
            return

        # Mute chro_bar_data while the seed box is refilled.
        self.ready_chromo_make_graph = False
        self.chromo_seed.clear()
        found = False
        seqLength = int(self.sq.endo_info[self.endo_drop.currentText()][1])
        lowest = int(self.min_chromo.currentText())
        highest = int(self.max_chromo.currentText())
        for seed in self.parser.repeats:
            if lowest <= self.parser.repeats[seed] <= highest:
                found = True
                self.chromo_seed.addItem(
                    str(
                        self.sq.decompress64(seed,
                                             slength=seqLength,
                                             toseq=True)))
        if not found:
            QtWidgets.QMessageBox.question(
                self, "No matches found",
                "No seed that is within the specifications could be found",
                QtWidgets.QMessageBox.Ok)
            self.ready_chromo_min_max = False
            self.max_chromo.clear()
            self.min_chromo.clear()
            self.ready_chromo_min_max = True
            self.fill_min_max(False)
            self.fill_seed_id_chrom()
            return
        self.ready_chromo_make_graph = True
        self.chro_bar_data()

    def order(self, data_par):
        """Return the keys of ``data_par`` from highest to lowest.

        The highest key, if there is one, is stored in ``self.max_repeats``.
        """
        ordered = sorted(data_par, reverse=True)
        if ordered:
            self.max_repeats = ordered[0]
        return ordered

    def order_high_low_rep(self, dictionary):
        """Return a copy of ``dictionary`` ordered by value, highest first.

        Entries with equal values keep their original relative order
        (``sorted`` is stable, also with ``reverse=True``).
        """
        return dict(
            sorted(dictionary.items(), key=lambda pair: pair[1], reverse=True))

    # connects to view->CASPER to switch back to the main CASPER window
    def changeto_main(self):
        """Show the main CASPER window and hide this one."""
        GlobalSettings.mainWindow.show()
        self.hide()

    # connects to go back button in bottom left to switch back to the main
    # CASPER window
    def go_back(self):
        """Show the main CASPER window and hide this one."""
        GlobalSettings.mainWindow.show()
        self.hide()

    #-----------------------NOT USED----------------------------#
    def get_instances(self):
        """Read the REPEATS section of ``self.file_name`` (legacy, not used).

        Fills ``BAD_instances`` (seed -> list of (chromosome, location,
        sequence)) and appends ``(seed, locations, count)`` tuples to
        ``sorted_instances``. Stops at ``END_OF_FILE`` or at the real end of
        the file.
        """
        ST = SeqTranslate()
        # NOTE(review): `path` is not defined in this method; it has to
        # exist at module scope for this call to work.
        os.chdir(path)
        with open(self.file_name, 'r') as f:
            for x in iter(f.readline, ''):
                if x == 'REPEATS\n':
                    print("reached repeat sequences")
                    break
            while True:
                t = f.readline()
                if t == '' or t.rstrip('\n') == 'END_OF_FILE':
                    print("reached end of repeat sequences")
                    break
                ukey = t[:-1]  # takes away the "\n" in the string
                key = ST.decompress64(ukey, slength=20, toseq=True)
                key = ST.fill_As(key, 16)
                self.BAD_instances[key] = list()
                # Add sequences and locations to the list
                v = f.readline().split('\t')[:-1]
                for item in v:
                    loctup = item.split(',')
                    chrom = loctup[0]
                    location = ST.decompress64(loctup[1])
                    seq = ST.decompress64(loctup[2][1:],
                                          slength=20,
                                          toseq=True)
                    # when A's get lost in the compression this fills them
                    # back in
                    seq = ST.fill_As(seq, 4)
                    self.BAD_instances[key].append((chrom, location, seq))
        print("currently sorting")
        for key in self.BAD_instances:
            size = len(self.BAD_instances[key])
            # sequence, location, size
            self.sorted_instances.append((key, self.BAD_instances[key], size))

    # not used
    # Returns the container self.sorted_instances but removes all "single"
    # repeats. Old Code to fix an off-by-1 error
    def return_all_seqs(self):
        """Return the sorted instances that occur more than once."""
        return [
            instance for instance in self.sorted_instances if instance[2] > 1
        ]

    # not used
    def return_sorted(self):
        """Print the repeated instances by count, then a count histogram."""
        sorted_seqs = sorted(self.sorted_instances,
                             key=operator.itemgetter(2),
                             reverse=True)
        amounts = {}
        for instance in sorted_seqs:
            if instance[2] > 1:
                amounts[instance[2]] = amounts.get(instance[2], 0) + 1
                print(
                    str(instance[0]) + "," + str(instance[2]) + "," +
                    str(instance[1]))
        for element in amounts:
            print("Number of seed sequences with " + str(element) +
                  " appearances: " + str(amounts[element]))

    # not used
    def return_positions(self):
        """Group repeated locations into +/-1000 windows and print them.

        Returns:
            The windows as ``[chromosome, begin, end, hits, seeds]`` lists,
            sorted by hit count, highest first.
        """
        # chromosme, beginning of range, end of range, and number of hits
        positions_mapped = []
        for instance in self.sorted_instances:
            if instance[2] > 1:
                for pos in instance[1]:
                    chrom = pos[0]
                    loc = int(pos[1])
                    # check to see if its already in the map
                    need_new = True
                    for position in positions_mapped:
                        if chrom == position[0]:
                            if position[1] < loc < position[2]:
                                position[3] += 1
                                position[4].append(instance[0])
                                need_new = False
                                print("position added")
                    if need_new:
                        newtuple = [
                            chrom, loc - 1000, loc + 1000, 1,
                            [" ", instance[0]]
                        ]
                        positions_mapped.append(newtuple)
        sorted_positions = sorted(positions_mapped,
                                  key=operator.itemgetter(3),
                                  reverse=True)
        for element in sorted_positions:
            print(
                str(element[0]) + "," + str(element[1]) + "," +
                str(element[2]) + "," + str(element[3]))
        for element in sorted_positions:
            sequences = ""
            for sequence in element[4]:
                sequences += sequence + ","
            print(sequences)
        return sorted_positions

    # not used
    def int_to_char(self, i):
        """Map 0, 1, 2, 3 to 'A', 'T', 'C', 'G'."""
        switcher = {0: 'A', 1: 'T', 2: 'C', 3: 'G'}
        return switcher[i]

    # ----------------------------------------------------------#

    # this function calls the closingWindow class.
    def closeEvent(self, event):
        """Run the main window's close function, then accept the event."""
        GlobalSettings.mainWindow.closeFunction()
        event.accept()
def __init__(self, threshold, endo, base_org, csf_file, other_orgs,
             casperofflist, output_path):
    """Run the off-target search for every query and write the results.

    Args:
        threshold: minimum score a hit must exceed to be reported.
        endo: endonuclease name; part of every ``.cspr`` file name.
        base_org: base organism; ``csf_file`` up to this name is used as the
            directory prefix of all genome files.
        csf_file: path of the base organism's file.
        other_orgs: further organisms to search.
        casperofflist: file with the query sequences (see ``get_rseqs``).
        output_path: prefix of the ``off_results<time>.txt`` output file.
    """
    self.ST = SeqTranslate()
    self.rSequences = []
    self.get_rseqs(casperofflist)
    self.mypath = csf_file[:csf_file.find(base_org)]
    self.ref_genomes = [base_org]
    self.ref_genomes += other_orgs
    self.endo = endo
    self.threshold = threshold
    # global to class so that all scoring functions can use it
    self.dSequence = str()

    # This is for autofilling the HsuMatrix
    self.matrixKeys = [
        "GT", "AC", "GG", "TG", "TT", "CA", "CT", "GA", "AA", "AG", "TC", "CC"
    ]
    self.matrix = {}
    self.fill_matrix()

    # This is where the data is stored before it is written
    self.output_data = dict()
    for myseq in self.rSequences:
        self.output_data[myseq[0]] = list()

    # BEGIN RUNNING THROUGH SEQUENCES
    for sequence in self.rSequences:
        print(sequence)
        for genome in self.ref_genomes:
            # The context manager closes each genome file; iterating the
            # file also ends the scan at EOF when no REPEATS line exists.
            with open(self.mypath + genome + self.endo + ".cspr", 'r') as f:
                for line in f:
                    if line.find("CHROMOSOME") != -1:
                        curchrom = line[line.find("#") + 1:-1]
                        print("Finished checking " + curchrom)
                        continue
                    if line[0:-1] == "REPEATS":
                        break
                    # Decompress once for both the similarity test and the
                    # scoring.
                    candidate = self.ST.decompress_csf_tuple(line)[1]
                    # Checks for a signifcant number of mismatches:
                    if self.critical_similarity(sequence[0], candidate):
                        # This is where the real fun begins: off target
                        # analysis
                        print('found a similarity')
                        seqscore = self.get_scores(sequence[1], candidate)
                        if seqscore > self.threshold:
                            self.output_data[sequence[0]].append(
                                (str(curchrom),
                                 self.ST.decompress_csf_tuple(line[:-1]),
                                 int(seqscore * 100), genome))
    # END SEQUENCES RUN

    # Output the data acquired:
    # NOTE(review): the time stamp contains ':' characters, which are not
    # valid in Windows file names; the name is left unchanged here.
    out_name = (output_path + "off_results" +
                str(datetime.datetime.now().time()) + '.txt')
    with open(out_name, 'w') as out:
        # Fixed: "0" instead of the letter "O", and the space that was
        # missing between "greater" and "probability".
        out.write(
            "Off-target sequences identified. Scores are between 0 and 1. "
            "A higher value indicates greater "
            "probability of off-target activity at that location.\n")
        for sequence in self.output_data:
            out.write(sequence + "\n")
            for off_target in self.output_data[sequence]:
                outloc = off_target[0] + "," + str(
                    off_target[1][0]) + "," + off_target[1][1]
                out.write(off_target[3] + "," + outloc + "\t" +
                          str(off_target[2] / 100) + '\n')
class OffTargetAlgorithm:
    """Score candidate off-target sites of query sequences in .cspr files.

    The whole analysis runs in the constructor: queries are read, every
    reference genome file is scanned, and hits scoring above the threshold
    are written to a text file.
    """

    def __init__(self, threshold, endo, base_org, csf_file, other_orgs,
                 casperofflist, output_path):
        """Run the off-target search for every query and write the results.

        Args:
            threshold: minimum score a hit must exceed to be reported.
            endo: endonuclease name; part of every ``.cspr`` file name.
            base_org: base organism; ``csf_file`` up to this name is used as
                the directory prefix of all genome files.
            csf_file: path of the base organism's file.
            other_orgs: further organisms to search.
            casperofflist: file with the query sequences (``get_rseqs``).
            output_path: prefix of the ``off_results<time>.txt`` output file.
        """
        self.ST = SeqTranslate()
        self.rSequences = []
        self.get_rseqs(casperofflist)
        self.mypath = csf_file[:csf_file.find(base_org)]
        self.ref_genomes = [base_org]
        self.ref_genomes += other_orgs
        self.endo = endo
        self.threshold = threshold
        # global to class so that all scoring functions can use it
        self.dSequence = str()

        # This is for autofilling the HsuMatrix
        self.matrixKeys = [
            "GT", "AC", "GG", "TG", "TT", "CA", "CT", "GA", "AA", "AG", "TC",
            "CC"
        ]
        self.matrix = {}
        self.fill_matrix()

        # This is where the data is stored before it is written
        self.output_data = dict()
        for myseq in self.rSequences:
            self.output_data[myseq[0]] = list()

        # BEGIN RUNNING THROUGH SEQUENCES
        for sequence in self.rSequences:
            print(sequence)
            for genome in self.ref_genomes:
                # The context manager closes each genome file; iterating the
                # file also ends the scan at EOF when no REPEATS line exists.
                with open(self.mypath + genome + self.endo + ".cspr",
                          'r') as f:
                    for line in f:
                        if line.find("CHROMOSOME") != -1:
                            curchrom = line[line.find("#") + 1:-1]
                            print("Finished checking " + curchrom)
                            continue
                        if line[0:-1] == "REPEATS":
                            break
                        # Decompress once for both the similarity test and
                        # the scoring.
                        candidate = self.ST.decompress_csf_tuple(line)[1]
                        # Checks for a signifcant number of mismatches:
                        if self.critical_similarity(sequence[0], candidate):
                            # This is where the real fun begins: off target
                            # analysis
                            print('found a similarity')
                            seqscore = self.get_scores(sequence[1], candidate)
                            if seqscore > self.threshold:
                                self.output_data[sequence[0]].append(
                                    (str(curchrom),
                                     self.ST.decompress_csf_tuple(line[:-1]),
                                     int(seqscore * 100), genome))
        # END SEQUENCES RUN

        # Output the data acquired:
        # NOTE(review): the time stamp contains ':' characters, which are
        # not valid in Windows file names; the name is left unchanged here.
        out_name = (output_path + "off_results" +
                    str(datetime.datetime.now().time()) + '.txt')
        with open(out_name, 'w') as out:
            # Fixed: "0" instead of the letter "O", and the space that was
            # missing between "greater" and "probability".
            out.write(
                "Off-target sequences identified. Scores are between 0 and 1. "
                "A higher value indicates greater "
                "probability of off-target activity at that location.\n")
            for sequence in self.output_data:
                out.write(sequence + "\n")
                for off_target in self.output_data[sequence]:
                    outloc = off_target[0] + "," + str(
                        off_target[1][0]) + "," + off_target[1][1]
                    out.write(off_target[3] + "," + outloc + "\t" +
                              str(off_target[2] / 100) + '\n')

    def get_rseqs(self, offlist):
        """Read the query sequences from ``offlist`` into ``self.rSequences``.

        The first line is skipped. Each following line, minus its last
        character, is one target. Reading stops at the end of the file or at
        the end marker: ``EN`` (what an ``END`` without newline becomes
        after the last character is dropped) or a line reading ``END``.

        Every target is stored as ``[target, reversed]`` where ``reversed``
        is the target without its last character, in reverse order.
        """
        targets = []
        with open(offlist, 'r') as cofile:
            cofile.readline()
            for raw in iter(cofile.readline, ''):
                t = raw[:-1]
                if t == 'EN' or raw.strip() == 'END':
                    break
                targets.append(t)
        # The compressed seed/tail values computed here before were never
        # used, so they are no longer calculated.
        for tar in targets:
            self.rSequences.append([tar, tar[0:-1][::-1]])

    def get_scores(self, rseq, dseq):
        """Return the combined off-target score of ``rseq`` against ``dseq``.

        ``self.dSequence`` is set to the reverse complement of ``dseq`` and
        the score is ``sqrt(hsu) + step + qual ** 6``.
        """
        # NOTE(review): Bio.Alphabet (IUPAC) was removed in Biopython 1.78;
        # this call needs an older Biopython - confirm the pinned version.
        self.dSequence = Seq(dseq, IUPAC.unambiguous_dna).reverse_complement()
        hsu = self.get_hsu_score(rseq)
        qual = self.get_qualt_score(rseq)
        step = self.qualt_step_score(rseq)
        output = ((math.sqrt(hsu) + step) + pow(qual, 6))
        return output

    def fill_matrix(self):
        """Load the score matrix from the ``CASPERinfo`` file.

        Lines are skipped until one starts with ``H``. The lines after it,
        up to one starting with ``-`` (or the end of the file), are split at
        tabs and stored under ``self.matrixKeys`` in order. The last
        character of the 19th value of every row (the newline) is removed.
        """
        with open('CASPERinfo', 'r') as f:
            # Find the header line of the matrix section.
            for l in iter(f.readline, ''):
                if l.startswith("H"):
                    break
            i = 0
            for l in iter(f.readline, ''):
                if l.startswith('-'):
                    break
                self.matrix[self.matrixKeys[i]] = l.split("\t")
                i += 1
        for element in self.matrix:
            self.matrix[element][18] = self.matrix[element][18][0:-1]

    def get_hsu_score(self, rSequence):
        """Multiply the matrix penalties of all pairs found in matrixKeys.

        Only the first 19 positions are compared; position ``i`` uses column
        ``18 - i`` of the matrix row.
        """
        score = 1.0
        for i in range(0, 19):
            rnt = rSequence[i]
            dnt = self.dSequence[i]
            lookup = str(rnt) + str(dnt)
            if lookup in self.matrixKeys:
                hsu = self.matrix[lookup][18 - i]
                score *= float(hsu)
        return score

    def get_qualt_score(self, rSequence):
        """Return 1 minus the position-weighted penalties, normalised.

        A pair found in ``matrixKeys`` at position ``i`` costs
        ``1 / (i + 1)``; the total is divided by 3.5477.
        """
        score = 3.5477
        for i in range(0, 19):
            lookup = rSequence[i] + self.dSequence[i]
            if lookup in self.matrixKeys:
                score -= 1.0 / (i + 1)
        return score / 3.5477

    def qualt_step_score(self, rSequence):
        """Return 1.0 minus a stepped penalty per pair found in matrixKeys.

        Positions 0-5 cost 0.1, positions 6-11 cost 0.05 and the remaining
        positions cost 0.0125 each.
        """
        score = 1.0
        for i in range(0, 19):
            lookup = rSequence[i] + self.dSequence[i]
            if lookup in self.matrixKeys:
                if i < 6:
                    score -= 0.1
                elif i < 12:
                    score -= 0.05
                elif i < 20:
                    score -= 0.0125
        return score

    def separation_score(self, rSequence):
        """Return ``1 - delta / 19`` from the spacing of the flagged pairs.

        ``delta`` stays 0 unless exactly 2, 3 or 4 pairs are found in
        ``matrixKeys``.
        """
        misses = []
        delta = 0
        for i in range(0, 19):
            lookup = rSequence[i] + self.dSequence[i]
            if lookup in self.matrixKeys:
                misses.append(i)
        if len(misses) == 2:
            delta = (misses[1] - misses[0]) / 2.0
        if len(misses) == 3:
            delta = ((misses[1] - misses[0]) + (misses[2] - misses[1])) / 3.0
        # NOTE(review): the 4-miss case repeats the 3-miss formula and never
        # reads misses[3]; looks like a copy/paste slip - confirm intent.
        if len(misses) == 4:
            delta = ((misses[1] - misses[0]) + (misses[2] - misses[1])) / 3.0
        retval = 1.0 - (delta / 19.0)
        return retval

    # If there is more than four mismatches it returns false, else it will
    # return true
    def critical_similarity(self, cseq1, cseq2):
        """Return False once the sequences differ in five positions.

        Only the overlapping prefix of the two sequences is compared.
        """
        mismatches = 0
        for left, right in zip(cseq1, cseq2):
            if left != right:
                mismatches += 1
                if mismatches == 5:
                    return False
        return True

    def int_to_char(self, i):
        """Map 0, 1, 2, 3 to 'A', 'T', 'C', 'G'."""
        switcher = {0: 'A', 1: 'T', 2: 'C', 3: 'G'}
        return switcher[i]

    def char_to_int(self, c):
        """Map 'A', 'T', 'C', 'G' to 0, 1, 2, 3."""
        switcher = {'A': 0, 'T': 1, 'C': 2, 'G': 3}
        return switcher[c]
def __init__(self, parent=None):
    """Build the multitargeting window and wire up its widgets.

    Loads ``multitargetingwindow.ui`` from ``GlobalSettings.appdir``,
    creates the three chart views, connects the widget signals,
    initialises the statistics fields, centres the window on the
    available desktop geometry and finally hides it.

    Args:
        parent: Accepted for signature compatibility only; it is NOT
            forwarded to the base-class constructor.
    """
    super(Multitargeting, self).__init__()
    uic.loadUi(GlobalSettings.appdir + 'multitargetingwindow.ui', self)
    self.setWindowIcon(QtGui.QIcon(GlobalSettings.appdir + "cas9image.png"))

    # SeqTranslate object used in class
    self.sq = SeqTranslate()

    # Initializes the three graphs
    self.chart_view_chro_bar = QChartView()
    self.chart_view_repeat_bar = QChartView()
    self.chart_view_repeat_line = QChartView()

    self.data = ""
    self.shortHand = ""
    self.chromo_length = list()

    # Listeners for changing the seed sequence or the .cspr file
    self.chromo_seed.currentIndexChanged.connect(self.seed_chromo_changed)
    self.update_min_max.clicked.connect(self.update)
    self.Analyze_Button.clicked.connect(self.make_graphs)

    # go back to main button
    self.back_button.clicked.connect(self.go_back)

    # Tool Bar options
    self.actionCASPER.triggered.connect(self.changeto_main)

    # Statistics storage variables
    self.max_repeats = 1
    self.average = 0
    self.median = 0
    self.mode = 0
    self.average_unique = 0
    self.average_rep = 0
    self.bar_coords = []
    self.seed_id_seq_pair = {}

    # parser object (no file attached yet)
    self.parser = CSPRparser("")

    self.ready_chromo_min_max = True
    self.ready_chromo_make_graph = True
    self.directory = 'Cspr files'
    self.info_path = os.getcwd()

    # Graphics scenes backing the two graphics views; this object also
    # filters events arriving at the first view's viewport.
    self.scene = QtWidgets.QGraphicsScene()
    self.graphicsView.setScene(self.scene)
    self.scene2 = QtWidgets.QGraphicsScene()
    self.graphicsView_2.setScene(self.scene2)
    self.graphicsView.viewport().installEventFilter(self)

    self.loading_window = loading_window()

    # Center window on the available desktop geometry.
    # (The previous unused ``screen = QGuiApplication.screenAt(...)``
    # lookup was dropped: its result was never read.)
    self.mwfg = self.frameGeometry()
    self.cp = QtWidgets.QDesktopWidget().availableGeometry().center()
    self.mwfg.moveCenter(self.cp)
    self.move(self.mwfg.topLeft())

    self.hide()
def __init__(self, info_path):
    """Build the "New Genome" window and wire up its widgets.

    Loads ``NewGenome.ui`` from ``GlobalSettings.appdir``, styles the three
    step group boxes, connects buttons / toolbar actions, prepares the
    ``QProcess`` used to run jobs, configures the jobs table and creates the
    follow-up prompt dialog.

    Args:
        info_path: Stored unchanged on ``self.info_path``.
    """
    super(NewGenome, self).__init__()
    uic.loadUi(GlobalSettings.appdir + 'NewGenome.ui', self)
    # The title used to be set twice in a row; once is enough.
    self.setWindowTitle('New Genome')
    self.info_path = info_path

    # ---Style Modifications---#
    # The same sheet is reused for each step box by swapping the object name.
    groupbox_style = (
        ' QGroupBox:title{subcontrol-origin: margin; left: 10px;'
        ' padding: 0 5px 0 5px;}'
        ' QGroupBox#Step1{border: 2px solid rgb(111,181,110);'
        ' border-radius: 9px; font: 15pt "Arial"; font: bold;'
        ' margin-top: 10px;}'
    )
    self.Step1.setStyleSheet(groupbox_style)
    self.Step2.setStyleSheet(groupbox_style.replace("Step1", "Step2"))
    self.Step3.setStyleSheet(groupbox_style.replace("Step1", "Step3"))

    # ---Button Modifications---#
    self.setWindowIcon(Qt.QIcon(GlobalSettings.appdir + "cas9image.png"))
    self.resetButton.clicked.connect(self.reset)
    self.submitButton.clicked.connect(self.submit)
    self.browseForFile.clicked.connect(self.selectFasta)
    self.remove_job.clicked.connect(self.remove_from_queue)
    self.output_browser.setText("Waiting for program initiation...")
    self.contButton.clicked.connect(self.continue_to_main)
    self.comboBoxEndo.currentIndexChanged.connect(self.endo_settings)
    self.runButton.clicked.connect(self.run_jobs_wrapper)
    self.clearButton.clicked.connect(self.clear_job_queue)

    self.JobsQueue = []  # holds Job classes.
    self.Endos = dict()
    self.file = ""

    # External process used to run jobs; stdout and stderr are merged.
    self.process = QtCore.QProcess()
    self.process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
    self.process.finished.connect(self.upon_process_finishing)

    self.seqTrans = SeqTranslate()
    self.exit = False
    self.first = False

    # show functionalities on window
    self.fillEndo()
    self.num_chromo_next = False

    # Jobs Table: whole-row multi-selection, read-only, no grid.
    self.job_Table.setShowGrid(False)
    self.job_Table.horizontalHeader().setSectionsClickable(True)
    self.job_Table.setSelectionBehavior(
        QtWidgets.QAbstractItemView.SelectRows)
    self.job_Table.setEditTriggers(
        QtWidgets.QAbstractItemView.NoEditTriggers)
    self.job_Table.setSelectionMode(
        QtWidgets.QAbstractItemView.MultiSelection)
    self.job_Table.setSizeAdjustPolicy(
        QtWidgets.QAbstractScrollArea.AdjustToContents)

    self.fin_index = 0

    # Geometry used for centering the window.
    # NOTE(review): unlike other windows, nothing here calls
    # moveCenter()/move(); presumably done elsewhere - confirm.
    self.mwfg = self.frameGeometry()
    self.cp = QtWidgets.QDesktopWidget().availableGeometry().center()

    # Progress bookkeeping
    self.total_chrom_count = 0
    self.perc_increase = 0
    self.progress = 0

    # toolbar button actions
    self.visit_repo.triggered.connect(self.visit_repo_func)
    self.go_ncbi.triggered.connect(self.open_ncbi_web_page)
    # Second handler on the same signal (endo_settings is connected above).
    self.comboBoxEndo.currentIndexChanged.connect(self.changeEndos)

    # ncbi tool
    self.NCBI_File_Search.clicked.connect(self.open_ncbi_tool)

    # These inputs start out disabled.
    self.seed_length.setEnabled(False)
    self.five_length.setEnabled(False)
    self.three_length.setEnabled(False)
    self.repeats_box.setEnabled(False)

    # user prompt class
    self.goToPrompt = goToPrompt()
    self.goToPrompt.goToMain.clicked.connect(self.continue_to_main)
    self.goToPrompt.goToMT.clicked.connect(self.continue_to_MT)
    self.goToPrompt.goToPop.clicked.connect(self.continue_to_pop)
class genLibrary(QtWidgets.QDialog):
    """Dialog that writes a guide library file for a set of annotated genes.

    Targets are handled as tuples indexed as
    ``(location, sequence, pam, score, strand, extra)``; after an off-target
    run, index 5 holds the off-target score read from ``temp_off.txt``.
    """

    # Placeholder "previous target" used before any target has been chosen.
    _NO_PREV_TARGET = (0, "xyz", 'abc', 1, "-")

    def __init__(self):
        """Load the UI, connect the buttons and initialise state."""
        # qt stuff
        super(genLibrary, self).__init__()
        uic.loadUi('library_prompt.ui', self)
        self.setWindowTitle('Generate Library')
        self.setWindowIcon(Qt.QIcon('cas9image.png'))

        # button connections
        self.cancel_button.clicked.connect(self.cancel_function)
        self.BrowseButton.clicked.connect(self.browse_function)
        self.submit_button.clicked.connect(self.submit_data)
        self.progressBar.setValue(0)

        # variables
        self.anno_data = dict()
        self.cspr_file = ''
        self.parser = CSPRparser('')
        self.kegg_nonKegg = ''
        self.gen_lib_dict = dict()
        self.S = SeqTranslate()
        self.cspr_data = dict()
        self.Output = dict()
        self.off_tol = .05
        self.off_max_misMatch = 4
        self.off_target_running = False

        # set the numbers for the num genes combo box item (1..10)
        for i in range(10):
            self.numGenescomboBox.addItem(str(i + 1))

        # set the numbers for the minOn combo box (20..70)
        for i in range(19, 70):
            self.minON_comboBox.addItem(str(i + 1))

    def launch(self, annotation_data, org_file, anno_type):
        """Populate the dialog for one organism and show it.

        Args:
            annotation_data: dictionary with the data for the annotations
                searched for (currently MainWindow's searches dict).
            org_file: the cspr file of the organism in use.
            anno_type: ``'kegg'`` selects the KEGG dictionary builder; any
                other value selects the non-KEGG builder.
        """
        self.cspr_file = org_file
        self.anno_data = annotation_data
        self.kegg_nonKegg = anno_type
        self.parser.fileName = self.cspr_file
        self.process = QtCore.QProcess()

        # Default output name: text between the last '/' and the first '.'
        # of the cspr path, plus '_lib.txt'.
        index1 = self.cspr_file.find('.')
        index2 = self.cspr_file.rfind('/')
        self.filename_input.setText(
            self.cspr_file[index2 + 1:index1] + '_lib.txt')
        self.output_path.setText(GlobalSettings.CSPR_DB + "/")

        # depending on the type of file, build the dictionary accordingly
        if self.kegg_nonKegg == 'kegg':
            self.build_dict_kegg_version()
        else:
            self.build_dict_non_kegg()

        # get the data from the cspr file
        self.cspr_data = self.parser.gen_lib_parser(
            self.gen_lib_dict,
            GlobalSettings.mainWindow.endoChoice.currentText())

        self.show()

    def closeEvent(self, event):
        """Route the window's 'x' button through ``cancel_function``."""
        closeWindow = self.cancel_function()

        # -2 means off-targeting is running and the user chose not to
        # cancel it, so the close request is ignored.
        if closeWindow == -2:
            event.ignore()
        else:
            event.accept()

    def compress_file_off(self):
        """Write every target in ``cspr_data`` to ``off_compressed.txt``.

        Location, sequence, PAM and score are passed through
        ``SeqTranslate.compress(..., 64)``; the strand is written as is.
        """
        path = GlobalSettings.CSPR_DB + "/off_compressed.txt"
        # ``with`` guarantees the handle is closed even if compress() raises.
        with open(path, 'w') as f:
            for gene in self.cspr_data:
                for target in self.cspr_data[gene]:
                    loc = self.S.compress(target[0], 64)
                    seq = self.S.compress(target[1], 64)
                    pam = self.S.compress(target[2], 64)
                    score = self.S.compress(target[3], 64)
                    strand = target[4]
                    output = (str(loc) + ',' + str(seq) + str(strand)
                              + str(pam) + ',' + score)
                    f.write(output + '\n')

    def parse_off_file(self):
        """Read ``temp_off.txt`` and store each score on its target.

        Each non-empty line other than ``AVG OUTPUT`` is split on ``':'``
        into ``sequence:score``. Every target tuple in ``cspr_data`` is then
        rebuilt with index 5 replaced by the score for its sequence.

        Raises:
            KeyError: if a target's sequence has no entry in the file.
        """
        with open(GlobalSettings.CSPR_DB + '/temp_off.txt') as f:
            file_data = f.read().split('\n')

        # get the data from the file
        scoreDict = dict()
        for entry in file_data:
            if entry == 'AVG OUTPUT' or entry == '':
                continue
            buffer = entry.split(':')
            scoreDict[buffer[0]] = buffer[1]

        # update cspr_data
        for gene in self.cspr_data:
            targets = self.cspr_data[gene]
            for i in range(len(targets)):
                old = targets[i]
                targets[i] = (old[0], old[1], old[2], old[3], old[4],
                              scoreDict[old[1]])

    def get_offTarget_data(self, num_targets, minScore, spaceValue,
                           output_file, fiveseq):
        """Start the external off-target program, then build the library.

        The command is launched on ``self.process``; its stdout drives the
        progress bar and, when it finishes, the results are parsed and
        ``generate`` is called with the arguments given here.

        NOTE: paths are built with Windows separators; changes may be
        needed to get this to work on other operating systems.
        """
        self.perc = False
        self.bool_temp = False
        self.running = False

        # when finished, parse the off file, and then generate the lib
        def finished():
            if self.off_target_running:
                self.progressBar.setValue(100)
                self.parse_off_file()
                did_work = self.generate(num_targets, minScore, spaceValue,
                                         output_file, fiveseq)
                self.off_target_running = False
                if did_work != -1:
                    self.cancel_function()
                    os.remove(GlobalSettings.CSPR_DB + '/off_compressed.txt')
                    os.remove(GlobalSettings.CSPR_DB + '/temp_off.txt')

        # as off-targeting outputs things, update the off-target progress bar
        def progUpdate(p):
            # str() of the byte array looks like "b'...'"; strip the wrapper.
            line = str(p.readAllStandardOutput())
            line = line[2:]
            line = line[:len(line) - 1]
            for lines in filter(None, line.split(r'\r\n')):
                is_banner = lines.find(
                    "Running Off Target Algorithm for") != -1
                if is_banner and not self.perc:
                    self.perc = True
                if self.perc and not self.bool_temp and not is_banner:
                    lines = lines[32:]
                    lines = lines.replace("%", "")
                    try:
                        num = float(lines)
                    except ValueError:
                        # Not a percentage line; an exception escaping a
                        # slot would be far worse than skipping it.
                        continue
                    if num <= 99.5:
                        # setValue takes an int; passing a float is
                        # rejected on newer Python versions.
                        self.progressBar.setValue(int(num))
                    else:
                        self.bool_temp = True

        app_path = GlobalSettings.appdir
        # Escaped backslashes: '\O' is an invalid escape sequence; the
        # resulting string is unchanged.
        exe_path = app_path + '\\OffTargetFolder\\OT'
        exe_path = '"' + exe_path + '" '
        data_path = ('"' + GlobalSettings.CSPR_DB.replace('/', '\\')
                     + "\\off_compressed.txt" + '" ')
        compressed = r' True '
        cspr_path = '"' + self.cspr_file.replace('/', '\\') + '" '
        output_path = ('"' + GlobalSettings.CSPR_DB.replace('/', '\\')
                       + '\\temp_off.txt" ')
        CASPER_info_path = r' "' + app_path + '\\CASPERinfo' + '" '
        num_of_mismathes = self.off_max_misMatch
        tolerance = self.off_tol
        detailed_output = " False "
        avg_output = "True"

        # set the off_target_running to true, to keep the user from closing
        # the window while it is running
        self.off_target_running = True

        # create command string
        cmd = (exe_path + data_path + compressed + cspr_path + output_path
               + CASPER_info_path + str(num_of_mismathes) + ' '
               + str(tolerance) + detailed_output + avg_output)

        self.process.readyReadStandardOutput.connect(
            partial(progUpdate, self.process))
        self.progressBar.setValue(0)
        QtCore.QTimer.singleShot(100, partial(self.process.start, cmd))
        self.process.finished.connect(finished)

    def _show_error(self, message):
        """Show ``message`` in the standard "Error" box with an Ok button."""
        QtWidgets.QMessageBox.question(self, "Error", message,
                                       QtWidgets.QMessageBox.Ok)

    def submit_data(self):
        """Validate the dialog's inputs and start library generation.

        Does nothing while off-targeting is running. On invalid input an
        error box is shown and the method returns without side effects.
        With off-targeting checked, the targets are compressed and the
        off-target run is started; otherwise ``generate`` is called directly
        and the dialog is closed unless it returned -1.
        """
        if self.off_target_running:
            return

        output_file = self.output_path.text() + self.filename_input.text()
        minScore = int(self.minON_comboBox.currentText())
        num_targets = int(self.numGenescomboBox.currentText())
        fiveseq = ''
        to_csv = self.to_csv_checkbox.isChecked()

        # error check for csv or txt files
        if not output_file.endswith('.txt') and not to_csv:
            if output_file.endswith('.csv'):
                output_file = output_file.replace('.csv', '.txt')
            else:
                output_file = output_file + '.txt'
        elif to_csv:
            if output_file.endswith('.txt'):
                output_file = output_file.replace('.txt', '.csv')
            elif not output_file.endswith('.csv'):
                output_file = output_file + '.csv'

        # error checking for the space value: empty defaults to 15, and
        # anything else must be digits only (hence never negative)
        space_text = self.space_line_edit.text()
        if space_text == '':
            spaceValue = 15
        elif space_text.isdigit():
            spaceValue = int(space_text)
        else:
            self._show_error(
                "Please enter integers only for space between guides.")
            return
        # if space value is more than 200, default to 200
        if spaceValue > 200:
            spaceValue = 200

        # get the fiveprimseq data and error check it
        five_text = self.fiveprimeseq.text()
        if five_text != '':
            if not five_text.isalpha():
                self._show_error(
                    "Please make sure only the letters A, T, G, or C are added into 5' End specificity box."
                )
                return
            fiveseq = five_text

        # get the targeting range data, and error check it here
        start_text = self.start_target_range.text()
        end_text = self.end_target_range.text()
        if not start_text.isdigit() or not end_text.isdigit():
            self._show_error(
                "Error: Please make sure that the start and end target ranges are numbers only."
                " Please make sure that start is 0 or greater, and end is 100 or less. "
            )
            return
        if int(start_text) >= int(end_text):
            self._show_error(
                "Please make sure that the start number is always less than the end number"
            )
            return

        if self.find_off_Checkbox.isChecked():
            # The text must look numeric (digits only, or containing '.')
            # AND actually parse; e.g. "1.2.3" is rejected instead of
            # raising ValueError.
            max_off_text = self.maxOFF_comboBox.text()
            max_off = None
            if max_off_text != '' and (max_off_text.isdigit()
                                       or '.' in max_off_text):
                try:
                    max_off = float(max_off_text)
                except ValueError:
                    max_off = None
            if max_off is None:
                self._show_error(
                    "Please enter only numbers for Maximum Off-Target Score. It cannot be left blank"
                )
                return
            # make sure it is between 0 and .5
            if not 0.0 < max_off < .5:
                self._show_error(
                    "Please enter a max off target score between 0 and .5!")
                return

            # compress the data (once, after validation), then run
            # off-targeting
            self.compress_file_off()
            self.get_offTarget_data(num_targets, minScore, spaceValue,
                                    output_file, fiveseq)
        else:
            # actually call the generate function
            did_work = self.generate(num_targets, minScore, spaceValue,
                                     output_file, fiveseq)
            if did_work != -1:
                self.cancel_function()

    def cancel_function(self):
        """Reset every field and hide the window.

        If off-targeting is running the user is asked to confirm first.

        Returns:
            -2 if the user declined to cancel a running off-target job,
            otherwise None.
        """
        if self.off_target_running:
            error = QtWidgets.QMessageBox.question(
                self, "Off-Targeting is running",
                "Off-Targetting is running. Closing this window will cancel that process, and return to the main window. .\n\n"
                "Do you wish to continue?",
                QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No,
                QtWidgets.QMessageBox.No)
            if error == QtWidgets.QMessageBox.No:
                return -2
            # Clear the flag BEFORE killing so the finished-handler does
            # not try to parse results of the aborted run.
            self.off_target_running = False
            self.process.kill()

        self.cspr_file = ''
        self.anno_data = dict()
        self.kegg_nonKegg = ''
        self.filename_input.setText('')
        self.output_path.setText('')
        self.gen_lib_dict.clear()
        self.cspr_data.clear()
        self.Output.clear()
        self.start_target_range.setText('0')
        self.end_target_range.setText('100')
        self.space_line_edit.setText('15')
        self.to_csv_checkbox.setChecked(False)
        self.find_off_Checkbox.setChecked(False)
        self.modifyParamscheckBox.setChecked(False)
        self.maxOFF_comboBox.setText('')
        self.fiveprimeseq.setText('')
        self.off_target_running = False
        self.progressBar.setValue(0)
        self.output_all_checkbox.setChecked(False)
        self.hide()

    def browse_function(self):
        """Let the user pick an output folder; store it in ``output_path``."""
        if self.off_target_running:
            return

        # get the folder
        filed = QtWidgets.QFileDialog()
        mydir = QtWidgets.QFileDialog.getExistingDirectory(
            filed, "Open a Folder", GlobalSettings.CSPR_DB,
            QtWidgets.QFileDialog.ShowDirsOnly)
        if not os.path.isdir(mydir):
            return

        # make sure to append the '/' to the folder path
        self.output_path.setText(mydir + "/")

    def build_dict_kegg_version(self):
        """Fill ``gen_lib_dict`` from KEGG-style annotation data.

        For each gene the stored value is ``[e[0], e[2], e[3], e[1]]`` of
        its annotation entry ``e``; when a gene has several entries the
        last one wins.
        """
        for search in self.anno_data:
            for gene in self.anno_data[search]:
                for entry in self.anno_data[search][gene]:
                    self.gen_lib_dict[gene] = [
                        entry[0], entry[2], entry[3], entry[1]
                    ]

    def build_dict_non_kegg(self):
        """Fill ``gen_lib_dict`` from feature_table / gbff / gff data.

        The key is the first ';'-separated field of the gene name (with
        entry field 3 appended for 'hypothetical protein'), then '||', then
        the last ';'-separated field. The value is ``[e[1], e[3], e[4],
        e[5]]`` of the gene's first annotation entry ``e``.
        """
        for search in self.anno_data:
            for gene in self.anno_data[search]:
                entry = self.anno_data[search][gene][0]
                descript = gene.split(';')
                temp_descript = descript[0]
                if temp_descript == 'hypothetical protein':
                    temp_descript = temp_descript + " " + str(entry[3])
                temp_descript = temp_descript + '||' + descript[-1]

                self.gen_lib_dict[temp_descript] = [
                    entry[1], entry[3], entry[4], entry[5]
                ]

    @staticmethod
    def _drop_targets(target_list, should_drop, deleted):
        """Remove targets matching ``should_drop`` from ``target_list``.

        Walks the list back to front so popping is safe; each removed
        target is also appended to ``deleted`` unless that is None.
        """
        for i in range(len(target_list) - 1, -1, -1):
            if should_drop(target_list[i]):
                if deleted is not None:
                    deleted.append(target_list[i])
                target_list.pop(i)

    def generate(self, num_targets_per_gene, score_limit, space, output_file,
                 fiveseq):
        """Filter each gene's targets, pick guides and write the library.

        Args:
            num_targets_per_gene: maximum number of guides kept per gene.
            score_limit: targets whose score (index 3) is below this are
                dropped.
            space: minimum distance between the locations of consecutive
                chosen guides.
            output_file: path of the file to write.
            fiveseq: if non-empty, sequences must start with it
                (compared upper-cased).

        Returns:
            -1 if the output file could not be opened (PermissionError);
            None otherwise, including when another exception was printed.

        NOTE(review): the per-gene lists inside ``self.cspr_data`` are
        reversed and filtered in place, so a second call on the same data
        does not start from the original lists.
        """
        modify = self.modifyParamscheckBox.isChecked()
        find_off = self.find_off_Checkbox.isChecked()
        output_all = self.output_all_checkbox.isChecked()
        deletedDict = dict()

        # check and see if we need to search based on target_range
        startNum = float(self.start_target_range.text())
        endNum = float(self.end_target_range.text())
        checkStartandEndBool = False
        if startNum != 0.0 or endNum != 100.0:
            startNum = startNum / 100
            endNum = endNum / 100
            checkStartandEndBool = True

        poly_t = re.compile("T{5,10}")
        five_prefix = fiveseq.upper()
        maxScore = float(self.maxOFF_comboBox.text()) if find_off else None

        for gene in self.gen_lib_dict:
            target_list = self.cspr_data[gene]
            gene_info = self.gen_lib_dict[gene]

            # Reverse the target list if the gene is on negative strand
            if gene_info[3] == "-":
                target_list.reverse()

            # Dropped targets are remembered only when the user selected
            # 'modify search parameters'.
            deleted = None
            if modify:
                deletedDict[gene] = list()
                deleted = deletedDict[gene]

            # Filter out the guides with low scores and runs of 5+ T's
            self._drop_targets(
                target_list,
                lambda t: t[3] < score_limit
                or poly_t.search(t[1]) is not None,
                deleted)

            # check for the fiveseq
            if fiveseq != '':
                self._drop_targets(
                    target_list,
                    lambda t: not t[1].startswith(five_prefix),
                    deleted)

            # check the target range: relative position inside the gene
            if checkStartandEndBool:
                gene_start = gene_info[1]
                totalDistance = gene_info[2] - gene_info[1]
                self._drop_targets(
                    target_list,
                    lambda t: not (startNum
                                   <= (t[0] - gene_start) / totalDistance
                                   <= endNum),
                    deleted)

            # with off-targeting, drop targets above the selected max score
            if find_off:
                self._drop_targets(
                    target_list,
                    lambda t: maxScore < float(t[5]),
                    deleted)

            # Now generating the targets
            self.Output[gene] = list()
            i = 0
            vec_index = 0
            prev_target = self._NO_PREV_TARGET
            while i < num_targets_per_gene:
                if len(target_list) == 0 or vec_index >= len(target_list):
                    break
                # Skip over targets closer than ``space`` to the previous
                # pick, swapping in a better-scoring neighbour on the way.
                while abs(target_list[vec_index][0] - prev_target[0]) < space:
                    if (target_list[vec_index][3] > prev_target[3]
                            and prev_target != self._NO_PREV_TARGET):
                        self.Output[gene].remove(prev_target)
                        self.Output[gene].append(target_list[vec_index])
                        prev_target = target_list[vec_index]
                    vec_index += 1
                    # check and see if there will be an indexing error
                    if vec_index >= len(target_list) - 1:
                        vec_index = vec_index - 1
                        break
                # Add the new target to the output and add another to i
                self.Output[gene].append(target_list[vec_index])
                prev_target = target_list[vec_index]
                i += 1
                vec_index += 1

        # With 'modify search parameters', top genes up from the dropped
        # targets; those get a '*' appended to their strand field.
        if modify:
            for gene in self.Output:
                if len(self.Output[gene]) < num_targets_per_gene:
                    for dropped in deletedDict[gene]:
                        if len(self.Output[gene]) == num_targets_per_gene:
                            break
                        self.Output[gene].append(
                            (dropped[0], dropped[1], dropped[2], dropped[3],
                             dropped[4] + '*', dropped[5]))

        # Now output to the file
        if find_off and output_all:
            header = 'Gene Name,Sequence,On-Target Score,Off-Target Score,Location,PAM,Strand\n'
        elif output_all:
            header = 'Gene Name,Sequence,On-Target Score,Location,PAM,Strand\n'
        elif find_off:
            header = 'Gene Name,Sequence,Off-Target Score\n'
        else:
            header = 'Gene Name,Sequence\n'

        to_csv = self.to_csv_checkbox.isChecked()
        try:
            # ``with`` closes the file even when a write fails part-way.
            with open(output_file, 'w') as f:
                f.write(header)
                for essential in self.Output:
                    for number, target in enumerate(self.Output[essential],
                                                    start=1):
                        # targets that did not match the user's parameters
                        # are tagged with two leading asterisks
                        if '*' in target[4]:
                            tag_id = "**" + essential + "-" + str(number)
                        else:
                            tag_id = essential + "-" + str(number)
                        if to_csv:
                            tag_id = tag_id.replace(',', '')

                        if find_off and output_all:
                            fields = [tag_id, target[1], str(target[3]),
                                      str(target[5]), str(target[0]),
                                      target[2], target[4][0]]
                        elif output_all:
                            fields = [tag_id, target[1], str(target[3]),
                                      str(target[0]), target[2],
                                      target[4][0]]
                        elif find_off:
                            fields = [tag_id, target[1], str(target[5])]
                        else:
                            fields = [tag_id, target[1]]
                        f.write(','.join(fields) + '\n')
        except PermissionError:
            QtWidgets.QMessageBox.question(
                self, "File Cannot Open",
                "This file cannot be opened. Please make sure that the file is not opened elsewhere and try again.",
                QtWidgets.QMessageBox.Ok)
            return -1
        except Exception as e:
            print(e)
            return
class Compare_Orgs:
    """Find target sequences shared between organisms' .cspr files.

    Constructing the object runs the whole pipeline: every organism's file
    is parsed, sequences are bucketed by the sorted tuple of organisms that
    contain them, and a summary file is written.
    """

    def __init__(self, output_path, base_org_path, base_org, endo,
                 other_genomes, csize):
        """Run the comparison.

        Args:
            output_path: prefix (directory incl. trailing separator) of the
                summary file.
            base_org_path: prefix of the directory holding the .cspr files.
            base_org: organism name added to ``other_genomes``.
            endo: endonuclease name used in the file names.
            other_genomes: names of the other organisms. The list is copied,
                not modified.
            csize: largest organism-subset size that gets a bucket.
        """
        # initialize SeqTranslate object
        self.ST = SeqTranslate()
        self.output_path = output_path

        # Work on a copy: appending to the caller's list would leak
        # base_org back into the caller's data.
        self.organisms = sorted(list(other_genomes) + [base_org])
        self.db_path = base_org_path

        # This sets the size of the subsets. NOTE: DO NOT SET THIS TO A
        # LARGE NUMBER IF THERE ARE A LOT OF ORGANISMS
        self.combo_size = csize

        # Dictionary of dictionaries.
        # Key1: generic total sequence  Key2: org  Value: list of
        # (chromosome, location) pairs
        self.searchableseqs = {}

        # Sequences seen in exactly the combination of organisms given by
        # the key, e.g. (sce, yli) for sequences shared between those two.
        self.buckets = {}
        # Initialize the buckets with every organism subset of size
        # 2..csize
        for i in range(2, csize + 1):
            for subset in itertools.combinations(self.organisms, i):
                self.buckets[subset] = []
                print(subset)

        self.endo = endo

        # The object that is iterated over to decompress the output into
        # readable form
        self.compressed_output = {}

        # Generates the sequence lists
        for org in self.organisms:
            print(org)
            self.make_lists(org)

        # Runs the comparison
        self.create_comparison()
        self.write_to_file()

    def make_lists(self, org):
        """Parse one organism's .cspr file into ``searchableseqs``.

        The file ``<db_path><org>_<endo>.cspr`` is read in two sections:
        unique targets up to the ``REPEATS`` line, then seed / location
        line pairs up to a line containing ``END_OF_FIL``. Reaching the end
        of the file in either section stops parsing instead of failing or
        looping forever.
        """
        name1 = self.db_path + org + "_" + self.endo + ".cspr"
        with open(name1, 'r') as f:
            curchrom = 0
            f.readline()  # first line (genome id) is not used

            # Section 1: unique targets, grouped under '>' chromosome lines
            while True:
                position = f.readline()
                if not position:
                    break  # truncated file: no REPEATS marker
                if position.startswith(">"):
                    curchrom += 1
                    print(curchrom)
                    continue
                content = position.rstrip("\n")
                if content == "REPEATS":
                    break
                # adds to the positions container the unique position with
                # organism and chromosome as keys
                line = content.split(",")
                # change line into generic (no "+" or "-")
                totseq = self.ST.to_generic_compressed(line[1])
                self.add_to_sequence_matrix(totseq, org, curchrom, line[0])

            # Section 2: repeated seeds, each followed by a tab-separated
            # line of "chromosome,location,tail" items
            while True:
                raw_seed = f.readline()
                if not raw_seed:
                    break  # end of file without an end marker
                seedseq = raw_seed.rstrip("\n")
                if seedseq.find("END_OF_FIL") != -1:
                    break
                taillocs = f.readline().split('\t')[:-1]
                for item in taillocs:
                    loctup = item.split(',')
                    # first character of the tail field is skipped
                    totseq = self.ST.to_generic_compressed(
                        [seedseq, loctup[2][1:]])
                    self.add_to_sequence_matrix(totseq, org, loctup[0],
                                                loctup[1])

    def add_to_sequence_matrix(self, totseq, org, chrom, location):
        """Record that ``totseq`` occurs in ``org`` at (chrom, location)."""
        per_org = self.searchableseqs.setdefault(totseq, {})
        per_org.setdefault(org, []).append((chrom, location))

    def int_to_char(self, i):
        """Map 0, 1, 2, 3 to 'A', 'T', 'C', 'G'."""
        switcher = {0: 'A', 1: 'T', 2: 'C', 3: 'G'}
        return switcher[i]

    def revcom(self, sequence):
        """Return the reverse complement of ``sequence``.

        A/T and G/C are swapped, N stays N, and any other character is
        kept unchanged.
        """
        change = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
        return "".join(change.get(nt, nt) for nt in reversed(sequence))

    def create_comparison(self):
        """Put every multi-organism sequence in its bucket.

        A sequence found in more than one organism is appended as
        ``(sequence, {org: [locations]})`` to the bucket keyed by the
        sorted tuple of those organisms, if such a bucket exists.
        """
        for sequence, per_org in self.searchableseqs.items():
            if len(per_org) <= 1:
                continue
            # Make sure the tuple is in the right order
            orgs = tuple(sorted(per_org))
            if orgs in self.buckets:
                # copy the location lists so buckets do not alias the index
                locations = {org: list(locs)
                             for org, locs in per_org.items()}
                self.buckets[orgs].append((sequence, locations))

    def write_to_file(self):
        """Write one "<organism tuple> <count>" line per bucket.

        The file is ``<output_path>compare_<org>_..._<endo>.txt``; each
        bucket line is followed by an empty line. Only the counts are
        written, not the sequences themselves.
        """
        filename = self.output_path + "compare_"
        for org in self.organisms:
            filename += org + "_"
        filename += self.endo + ".txt"
        with open(filename, 'w') as f:
            for key in self.buckets:
                f.write(str(key) + " " + str(len(self.buckets[key])) + "\n")
                f.write("\n")
class CSPRparser:
    """Reader for the text file named by ``inputFileName``.

    The layout below is what the methods of this class rely on:

    * line 1: genome line (text after ``': '`` is the genome name)
    * line 2: karystats line (comma-terminated integers after ``': '``)
    * line 3: misc line (text after ``': '``)
    * chromosome sections: a header line containing ``'>'`` followed by
      compressed target lines
    * a line that is exactly ``REPEATS``
    * alternating lines: a compressed seed, then a tab-separated list of
      comma-separated repeat records

    All decompression is delegated to ``SeqTranslate``.
    """

    # Line that separates the chromosome sections from the repeats section.
    _REPEATS_MARKER = "REPEATS\n"

    def __init__(self, inputFileName):
        """Store the file path and initialise every container the readers fill.

        Args:
            inputFileName: path of the file the parsing methods will open.
        """
        # multitargeting sum / count (carried over from the old make_graphs);
        # both are only ever increased by read_repeats
        self.multiSum = 0
        self.multiCount = 0

        # SeqTranslate instance used for decompressing the data
        self.seqTrans = SeqTranslate()

        # list of lists, one per chromosome: [header line, target, target, ...]
        self.chromesomeList = list()
        # ints parsed from the karystats line
        self.karystatsList = list()
        self.genome = ""  # genome name
        self.misc = ""  # anything from the misc line

        # seed -> number of repeats (see read_repeats)
        self.repeats = {}
        # seed -> list of the raw comma-split repeat records (see read_repeats)
        self.seeds = {}
        # seed -> list of decompressed repeat tuples (see read_repeats)
        self.dec_tup_data = {}
        # header lines collected by get_chromesome_names
        self.chromesomesSelectedList = list()

        # data for population analysis
        #   key   = the seed
        #   value = list of tuples
        #           (org name, chrom #, location, sequence, pam, score, strand, endo)
        self.popData = {}

        # result of the last uniq_seq_count call
        self.unique_targets = 0

        # file path variable
        self.fileName = inputFileName

    # ------------------------------------------------------------------ #
    # private helpers
    # ------------------------------------------------------------------ #
    @staticmethod
    def _is_target_line(line):
        """Return True when ``line`` is neither EOF, a '>' header, nor the REPEATS marker."""
        return (line != "" and line != CSPRparser._REPEATS_MARKER
                and line[0] != '>')

    @staticmethod
    def _read_repeats_section(file_stream):
        """Advance ``file_stream`` past the REPEATS marker and return the rest split on newlines.

        Prints an error and returns an empty list when the marker is missing,
        instead of reading past EOF forever.
        """
        line = file_stream.readline()
        while line != "" and line != CSPRparser._REPEATS_MARKER:
            line = file_stream.readline()
        if line == "":
            print("Error: no REPEATS section could be found in this file!")
            return []
        return file_stream.read().split('\n')

    # ------------------------------------------------------------------ #
    # public API
    # ------------------------------------------------------------------ #
    def gen_lib_parser(self, genDict, endo):
        """Parser used by the gen_lib window.

        Args:
            genDict: mapping of gene -> sequence whose first three items are
                passed to read_targets as the position tuple.
            endo: endonuclease key forwarded to read_targets.

        Returns:
            dict mapping each gene to the list returned by read_targets.
        """
        return {
            gene: self.read_targets(
                '', (genDict[gene][0], genDict[gene][1], genDict[gene][2]),
                endo)
            for gene in genDict
        }

    def read_first_lines(self):
        """Read the first three lines of the file.

        Stores the text after ``': '`` of line 1 in ``self.genome`` and of
        line 3 in ``self.misc`` (trailing newline included), and replaces the
        contents of ``self.karystatsList`` with the comma-terminated integers
        of line 2.

        Raises:
            ValueError: if a karystats field is not an integer.
        """
        with open(self.fileName, 'r') as file_stream:
            genome_line = file_stream.readline()
            kary_line = file_stream.readline()
            misc_line = file_stream.readline()

        self.genome = genome_line[genome_line.find(':') + 2:]
        self.misc = misc_line[misc_line.find(':') + 2:]

        # Only fields that are terminated by a comma are kept, hence [:-1].
        kary_fields = kary_line[kary_line.find(':') + 2:].split(',')[:-1]
        # Slice-assign so repeated calls do not duplicate the values and any
        # outside reference to the list stays valid.
        self.karystatsList[:] = [int(field) for field in kary_fields]

    def get_chromesome_names(self):
        """Collect every header line (a line containing '>') before the REPEATS marker.

        Also refreshes genome, misc and karystats through read_first_lines.
        The headers are stored in ``self.chromesomesSelectedList``.

        Returns:
            tuple ``(genome_line, misc_line)``: the raw first and third lines.
        """
        self.read_first_lines()
        self.chromesomesSelectedList.clear()

        with open(self.fileName, 'r') as file_stream:
            retGen = file_stream.readline()
            file_stream.readline()  # karystats line, handled by read_first_lines
            retMisc = file_stream.readline()

            line = file_stream.readline()
            # Stop at the marker, or at EOF when the marker is missing.
            while line != "" and line != self._REPEATS_MARKER:
                if '>' in line:
                    self.chromesomesSelectedList.append(line)
                line = file_stream.readline()

        return retGen, retMisc

    def read_chromesome(self, endo):
        """Read all chromosome sections into ``self.chromesomeList``.

        Each entry is a list whose first item is the section's header line and
        whose remaining items are the values returned by
        ``SeqTranslate.decompress_csf_tuple`` for each target line.

        Args:
            endo: endonuclease key forwarded to decompress_csf_tuple.
        """
        self.chromesomeList.clear()

        with open(self.fileName, 'r') as file_stream:
            # ignore the first 3 lines
            for _ in range(3):
                file_stream.readline()

            line = file_stream.readline()
            while line != "" and line != self._REPEATS_MARKER:
                # The line that opens a section is kept verbatim as its header.
                chromosome = [line]
                line = file_stream.readline()
                while self._is_target_line(line):
                    chromosome.append(
                        self.seqTrans.decompress_csf_tuple(line, endo=endo))
                    line = file_stream.readline()
                self.chromesomeList.append(chromosome)

    def read_repeats(self, endoChoice):
        """Read the repeats section of the file.

        Fills three dictionaries, all keyed by the decompressed seed:
            ``self.repeats``       number of repeat records for the seed
            ``self.seeds``         the raw comma-split repeat records
            ``self.dec_tup_data``  the decompressed tuple of each record

        ``self.multiSum`` and ``self.multiCount`` are increased for every
        record; they are NOT reset here, so they accumulate across calls.

        Args:
            endoChoice: key into ``SeqTranslate.endo_info``; item [1] of that
                entry is used as the seed length.
        """
        seedLength = int(self.seqTrans.endo_info[endoChoice][1])

        # clear what is already in there
        self.repeats.clear()
        self.seeds.clear()
        self.dec_tup_data.clear()

        with open(self.fileName, 'r') as file_stream:
            split_info = self._read_repeats_section(file_stream)

        # Lines come in pairs: seed line, then its repeat records.
        for index in range(0, len(split_info) - 1, 2):
            seed = self.seqTrans.decompress64(split_info[index],
                                              slength=seedLength)
            self.repeats[seed] = 0
            self.seeds[seed] = []
            self.dec_tup_data[seed] = []

            for item in split_info[index + 1].split("\t"):
                if item == "":
                    continue
                self.repeats[seed] += 1
                record = item.split(',')
                self.seeds[seed].append(record)

                # Rebuild a record for decompress_csf_tuple: fields 1..3 of
                # the repeat plus the seed rendered as a sequence.
                fields = record[1:4]
                fields.append(
                    str(self.seqTrans.decompress64(seed, toseq=True,
                                                   slength=seedLength)))
                self.dec_tup_data[seed].append(
                    self.seqTrans.decompress_csf_tuple(",".join(fields),
                                                       bool=True,
                                                       endo=endoChoice))

                self.multiSum += self.seqTrans.decompress64(
                    record[3], slength=seedLength)
                self.multiCount += 1

    def popParser(self, cspr_file, endoChoice):
        """Parse the repeats of ``cspr_file`` for population analysis.

        ``self.popData`` is cleared and refilled: each decompressed seed maps
        to a list of tuples
        ``(org name, chrom, t[0], t[1], t[2], t[3], t[4], t[5])`` where ``t``
        is the value returned by decompress_csf_tuple and the organism name is
        looked up in the reference list by ``int(chrom) - 1``.

        Args:
            cspr_file: path of the file to read (``self.fileName`` is not used).
            endoChoice: key into ``SeqTranslate.endo_info``.

        Returns:
            tuple ``(retNumber, referenceList)``: the integer in the last
            comma-separated field of the first line, and the list of
            ``(field0, field1)`` pairs taken from the '|'-separated entries of
            the misc line.
        """
        self.popData.clear()
        seedLength = self.seqTrans.endo_info[endoChoice][1]

        with open(cspr_file, 'r') as file_stream:
            genome_fields = file_stream.readline().split(',')
            file_stream.readline()  # second line is not used here
            retNumber = int(genome_fields[-1])

            # parse the miscellaneous line and get the data we want out of it
            misc_line = file_stream.readline()
            entries = misc_line[misc_line.find(':') + 2:].split('|')
            entries.pop()  # whatever follows the last '|' is not an entry
            referenceList = []
            for entry in entries:
                parts = entry.split(',')
                referenceList.append((parts[0], parts[1]))

            split_info = self._read_repeats_section(file_stream)

        for index in range(0, len(split_info) - 1, 2):
            # get the seed and repeat line
            seed_d = self.seqTrans.decompress64(split_info[index],
                                                slength=int(seedLength),
                                                toseq=True)

            # if the seed is not in the dict, put it in there
            if seed_d not in self.popData:
                self.popData[seed_d] = list()

            for item in split_info[index + 1].split('\t'):
                if item == '':
                    continue
                chrom = item[:item.find(',')]
                fields = item.split(',')[1:4]
                fields.append(str(seed_d))
                tempTuple = self.seqTrans.decompress_csf_tuple(
                    ",".join(fields), bool=True, endo=endoChoice)
                orgName = referenceList[int(chrom) - 1][0]
                self.popData[seed_d].append((
                    orgName,
                    chrom,
                    tempTuple[0],
                    tempTuple[1],
                    tempTuple[2],
                    tempTuple[3],
                    tempTuple[4],
                    tempTuple[5],
                ))

        return retNumber, referenceList

    def read_all(self, endo=None):
        """Read the header lines, the chromosome sections and the repeats.

        Args:
            endo: endonuclease key passed to read_chromesome and read_repeats.
                It is required; the default only keeps the old zero-argument
                signature importable.

        Raises:
            TypeError: if ``endo`` is not supplied.
        """
        if endo is None:
            raise TypeError(
                "read_all() requires the endonuclease name used to "
                "decompress the file")
        print("Reading First Lines.")
        self.read_first_lines()
        print("Reading Chromesomes.")
        self.read_chromesome(endo)
        print("Reading Repeats.")
        self.read_repeats(endo)

    def get_whole_file(self):
        """Return the entire content of the file as one string."""
        with open(self.fileName) as file_stream:
            return file_stream.read()

    def read_targets(self, genename, pos_tuple, endo):
        """Return the decompressed targets of one chromosome inside a location window.

        Also stores the raw first line in ``self.genome`` and the raw third
        line in ``self.misc``. The karystats line is not stored.

        Args:
            genename: unused.
            pos_tuple: ``(chromosome number, start, end)``. The chromosome is
                found by searching for ``"(<number>)"`` in the lines; targets
                with ``start <= location < end`` are returned.
            endo: key into ``SeqTranslate.endo_info``; item [2] of that entry
                is used as the length passed to decompress64.

        Returns:
            list of the values returned by decompress_csf_tuple. Empty (after
            printing an error) when the chromosome is not found.
        """
        retList = list()

        with open(self.fileName) as file_stream:
            self.genome = file_stream.readline()
            file_stream.readline()
            self.misc = file_stream.readline()

            header = file_stream.readline()

            # get the sequence length for the decompressor
            seqLength = self.seqTrans.endo_info[endo][2]

            def location_of(line):
                # The location is the first comma-separated field of the line.
                return self.seqTrans.decompress64(line.split(",")[0],
                                                  slength=seqLength)

            # Find the right chromosome/scaffold.
            chrom_tag = "(" + str(pos_tuple[0]) + ")"
            while header != "" and header.find(chrom_tag) == -1:
                header = file_stream.readline()
            if header == "":
                print("Error: the target could not be found in this file!")
                return retList

            # Skip the targets located before the window. The scan stops at
            # the end of the chromosome section (next header, REPEATS marker
            # or EOF) so it can neither spin at EOF nor decompress a header.
            myline = file_stream.readline()
            while (self._is_target_line(myline)
                   and location_of(myline) < pos_tuple[1]):
                myline = file_stream.readline()

            # Collect the targets that fall inside the window.
            while (self._is_target_line(myline)
                   and location_of(myline) < pos_tuple[2]):
                retList.append(
                    self.seqTrans.decompress_csf_tuple(myline, endo=endo))
                myline = file_stream.readline()

        return retList

    def uniq_seq_count(self):
        """Count the 6-item target entries in ``self.chromesomeList``.

        Header lines are stored as strings in the same lists; they are skipped
        so a header that happens to be six characters long is not counted.
        The result is also stored in ``self.unique_targets``.

        Returns:
            int: number of entries of length 6 that are not strings.
        """
        self.unique_targets = sum(
            1
            for chromo in self.chromesomeList
            for data in chromo
            if not isinstance(data, str) and len(data) == 6)
        return self.unique_targets