def __init__(self, database: str, strand: str, mismatch: int, cas9: str):
    """
    Store the run parameters and open the SQL helper.

    The working directory of the process is switched to the directory that
    contains ``../main.py`` (resolved against the current working directory
    at call time) before the SQL helper is created.

    :param database: database name handed to ``SQL``
    :param strand: strand selector, stored as ``self.strand``
    :param mismatch: mismatch count, stored as ``self.mismatch``
    :param cas9: cas9 identifier, stored as ``self.cas9``
    """
    main_script = os.path.abspath("../main.py")
    self.root = os.path.dirname(main_script)
    # NOTE: process-wide side effect; everything after this resolves relative
    # paths against self.root.
    os.chdir(self.root)
    self.sql = SQL(database=database)
    self.strand, self.mismatch, self.cas9 = strand, mismatch, cas9
def grna_display_runner(self):
    """
    Ask the user which guide RNA's to display, run the CRISPRi worker and
    load its results into the four result tables.

    Flow:
      1. Show ``DisplayGuideRNA``; do nothing if the dialog is rejected.
      2. Blank the result tables and read the dialog options.
      3. Check that every requested gene and mask exists in the ``genes``
         table of the chosen database; the first missing name is reported
         in a message box and the method stops.
      4. Start ``CrisprInterference_worker`` on ``self.threadingPool`` and
         keep the GUI alive while it runs.
      5. Read ``candidates.txt``, ``backup.txt``, ``dropped.txt`` and
         ``offtargets.txt`` from ``<root>/temp``, show them, report the
         requested genes that produced no candidate, and delete the temp
         directory.

    :return: None
    """
    display_grna = DisplayGuideRNA(database_list=self.availible_databases,
                                   cas9_list=self.availible_cas9)
    if not display_grna.exec_():
        return None

    def reset_progress():
        self.main_progressbar_value = 0
        self.main_progressbar.setValue(self.main_progressbar_value)

    def normalise(names):
        # Names are compared lower-cased and without underscores.
        return [name.replace("_", "").lower() for name in names]

    result_views = (self.display_candidates, self.display_backup,
                    self.display_dropped, self.display_offtargets)

    # Blank out every result table while the new request is processed.
    holder = PandasModel(pd.DataFrame({'': []}))
    for view in result_views:
        view.setModel(holder)
    self.database_querried = False
    self.statusBar().showMessage("Preparing ...")
    self.main_progressbar_value += 1
    self.main_progressbar.setValue(self.main_progressbar_value)
    user_options = display_grna.out()

    # exist_ok avoids the separate listdir() check and the race it implies.
    tempdir = os.path.join(self.root, "temp")
    os.makedirs(tempdir, exist_ok=True)

    database = str(user_options['organism']).replace(" ", "_")
    mismatch = user_options['max_mismatch']
    max_grna = user_options['max_grna_count']
    max_primer_len = user_options['max_primer_len']
    user_cas9 = user_options['cas9']
    user_pam_tolerance = user_options['pam_tolerance']
    user_fiveprime = user_options['nucleotides_5']
    user_threeprime = user_options['nucleotides_3']
    gene_mask_dictionary = {
        'genes': normalise(user_options['genes']),
        'masks': normalise(user_options['masks'])
    }

    sqlrunner = SQL(database=database)
    headers = sqlrunner.custom_sql("SELECT header FROM genes").to_dict('list')
    known_headers = set(headers['header'])

    # Genes are validated before masks; only the first unknown name is reported.
    for group in ('genes', 'masks'):
        for name in gene_mask_dictionary[group]:
            if name not in known_headers:
                db = database.replace("_", " ")
                QtWidgets.QMessageBox.about(self, "Error", f"{name} was not found in {db}")
                reset_progress()
                return None

    if mismatch == "":
        QtWidgets.QMessageBox.about(self, "Error", "First search guide RNA's")
        reset_progress()
        return None

    # Strand is r for reverse
    worker = CrisprInterference_worker(database=database,
                                       mismatch=mismatch,
                                       strand='r',
                                       max_grna=max_grna,
                                       genes_masks=gene_mask_dictionary,
                                       max_primer_size=max_primer_len,
                                       cas9_organism=user_cas9,
                                       pam_tolerance=user_pam_tolerance,
                                       fiveprime_nucleotides=user_fiveprime,
                                       threeprime_nucleotides=user_threeprime)
    self.threadingPool.start(worker)

    # Poll the pool so the event loop keeps running; the bar creeps towards
    # 90 % while the worker is busy. The sleep runs on every iteration so
    # the loop never degenerates into a busy spin once the bar is capped.
    while self.threadingPool.activeThreadCount() == 1:
        self.statusBar().showMessage("Gathering guide RNA's...")
        QtWidgets.QApplication.processEvents()
        if self.main_progressbar_value < 90:
            self.main_progressbar_value += 1
            self.main_progressbar.setValue(self.main_progressbar_value)
        time.sleep(0.8)

    if self.threadingPool.waitForDone():
        self.statusBar().showMessage("Gathering data ...")
        self.candidate_gRNA_df = pd.read_csv(
            filepath_or_buffer=os.path.join(self.root, "temp", "candidates.txt"), sep=",")
        self.backup_gRNA_df = pd.read_csv(
            filepath_or_buffer=os.path.join(self.root, "temp", "backup.txt"), sep=",")
        self.dropped_gRNA_df = pd.read_csv(
            filepath_or_buffer=os.path.join(self.root, "temp", "dropped.txt"), sep=",")
        self.offtarget_df = pd.read_csv(
            filepath_or_buffer=os.path.join(self.root, "temp", "offtargets.txt"), sep=",")

        models = [PandasModel(frame) for frame in (self.candidate_gRNA_df,
                                                   self.backup_gRNA_df,
                                                   self.dropped_gRNA_df,
                                                   self.offtarget_df)]

        # Run the bar up to 100 % before the tables are swapped in.
        while self.main_progressbar_value < 100:
            self.main_progressbar_value += 1
            self.statusBar().showMessage("Formatting for display...")
            self.main_progressbar.setValue(self.main_progressbar_value)
            time.sleep(0.01)

        for view, model in zip(result_views, models):
            view.setModel(model)

        self.database_querried = True
        reset_progress()
        self.statusBar().showMessage("Ready")

        # Requested genes for which no candidate guide RNA came back.
        hits = set(self.candidate_gRNA_df['genes'])
        missed = list(set(gene_mask_dictionary['genes']) - hits)
        EOSpopup(missed_genes=missed).exec_()
        shutil.rmtree(tempdir)
    return None
def grna_search_runner(self):
    """
    Search guide RNA's

    Shows ``SearchGrnaDialog``, validates the answers, exports the genome,
    gene and GFF tables of the chosen organism from
    ``<root>/databases/<organism>.db`` into ``<root>/temp``, writes the
    tab-separated ``config.txt``, runs ``FindgRNA_worker`` on
    ``self.threadingPool`` and finally hands ``Summary.xls`` and
    ``OfftargetAnalysis.txt`` (expected under ``temp/global_gRNA``) to
    ``Database.create_gRNA_database``. The temp directory is removed at
    the end.

    Validation failures are reported in a message box and abort the run
    before anything is written to disk.

    :return: None
    """
    search_grna = SearchGrnaDialog(database_list=self.availible_databases,
                                   cas9_list=self.availible_cas9)
    if not search_grna.exec_():
        return None

    def advance_progress():
        self.main_progressbar_value += 1
        self.main_progressbar.setValue(self.main_progressbar_value)

    def write_fasta(path, table):
        # One ">header" line followed by one sequence line per record.
        with open(path, 'w') as fasta_handle:
            for header, sequence in zip(table['header'], table['sequence']):
                fasta_handle.write(">" + header + '\n')
                fasta_handle.write(sequence + '\n')

    user_data = search_grna.out()
    chosen_org = user_data['organism']
    is_circular = user_data['chromosome']
    mismatch = user_data['mismatch']
    pam = user_data['pam']
    tax_id = user_data['taxonomy_id']
    cores = user_data['cores']
    chosen_org_modified = chosen_org.replace(" ", "_")

    if is_circular not in {"TRUE", "FALSE"}:
        QtWidgets.QMessageBox.about(self, "Error", "chromosome needs to be TRUE or FALSE")
        return None

    sql_runner = SQL(database=chosen_org_modified)
    searched_pams = sql_runner.list_pams()
    if pam in searched_pams:
        QtWidgets.QMessageBox.about(self, "Error",
                                    f"PAM {pam} has already been searched, choose a different pam")
        return None

    if tax_id == "":
        QtWidgets.QMessageBox.about(self, "Error", "Taxonomy id required")
        return None

    # The config needs exactly "<genus> <species> <strain>"; check it here
    # instead of failing on the unpacking after temp files were written.
    name_parts = chosen_org.split(" ")
    if len(name_parts) != 3:
        QtWidgets.QMessageBox.about(self, "Error",
                                    f"Organism name '{chosen_org}' must consist of genus, species and strain")
        return None
    genus, species, strain = name_parts
    species = species.lower()

    # Match the strain against the installed BSgenome packages. The package
    # name is the part before the first "_" and its last dotted component is
    # compared with the strain. The last matching entry wins.
    bsgenome_package = None
    for objects in self.availible_bsgenome:
        package_name = objects.split("_")[0]
        if package_name.split(".")[-1] == strain:
            bsgenome_package = str(package_name)
    if bsgenome_package is None:
        # Previously this surfaced as an UnboundLocalError further down.
        QtWidgets.QMessageBox.about(self, "Error",
                                    f"No BSgenome package found for strain {strain}")
        return None

    self.statusBar().showMessage("Creating guide RNA database...")
    advance_progress()

    # makedirs creates temp/ and temp/global_gRNA in one go and tolerates
    # directories left behind by an earlier, interrupted run.
    tempdir = os.path.join(self.root, "temp")
    global_gRNA = os.path.join(tempdir, "global_gRNA")
    os.makedirs(global_gRNA, exist_ok=True)
    advance_progress()

    # One connection for all three reads, always closed afterwards.
    connection = sqlite3.connect(os.path.join(self.root, "databases", f"{chosen_org_modified}.db"))
    try:
        tempfa = pd.read_sql("SELECT * FROM genome", connection)
        tempgff = pd.read_sql("SELECT * FROM gff_file", connection)
        tempgenes = pd.read_sql("SELECT * FROM genes", connection)
    finally:
        connection.close()

    tempgff.to_csv(os.path.join(tempdir, f"{chosen_org_modified}.gff"),
                   header=False, index=False, sep="\t")
    advance_progress()

    write_fasta(os.path.join(tempdir, f"{chosen_org_modified}.fasta"), tempfa)
    advance_progress()

    write_fasta(os.path.join(tempdir, f"{chosen_org_modified}_genes.fasta"), tempgenes)
    advance_progress()

    config_file = {
        'organism': [f"{genus} {species} {strain}"],
        'taxonomy_id': [tax_id],
        'circular_chromosome': [is_circular],
        'input_file': [os.path.join(tempdir, f"{chosen_org_modified}_genes.fasta")],
        'gff_file': [os.path.join(tempdir, f"{chosen_org_modified}.gff")],
        'find_gRNA_with_cutsites': ["FALSE"],
        'find_paired_gRNA': ["FALSE"],
        'BSgenome': [bsgenome_package],
        'chromosomes_to_search': ["all"],
        'min_gap': [0],
        'max_gap': [20],
        'gRNA_size': [20],
        'max_mismatch_gRNA': [mismatch],
        'PAM_sequence': [pam],
        'PAM_length': [len(pam)],
        'n.cores': [cores],
        'scoring_method': ["CFDscore"]
    }
    config_file = pd.DataFrame(config_file)
    config_file.to_csv(os.path.join(tempdir, "config.txt"), index=False, sep="\t")

    findgRNA_worker = FindgRNA_worker()
    self.threadingPool.start(findgRNA_worker)

    # Keep the event loop running while the worker is busy.
    while self.threadingPool.activeThreadCount() == 1:
        self.statusBar().showMessage("predicting all guide RNA's...")
        QtWidgets.QApplication.processEvents()
        if self.main_progressbar_value <= 90:
            self.main_progressbar_value += 1
            self.main_progressbar.setValue(self.main_progressbar_value)
        time.sleep(2)

    if self.threadingPool.waitForDone():
        self.statusBar().showMessage("buidling global gRNA database...")
        database = Database(database=chosen_org_modified)
        database.create_gRNA_database(summary=os.path.join(global_gRNA, "Summary.xls"),
                                      offtarget=os.path.join(global_gRNA, "OfftargetAnalysis.txt"),
                                      config_file=os.path.join(tempdir, "config.txt"))

        # Stop at 100 instead of overshooting to 101.
        while self.main_progressbar_value < 100:
            advance_progress()

        shutil.rmtree(tempdir)
        self.main_progressbar_value = 0
        self.main_progressbar.setValue(self.main_progressbar_value)
        self.statusBar().showMessage("Ready")
    return None
def run(self):
    """
    Execute ``self.sql_query`` against ``self.database`` and write the
    result as a comma separated file to ``<root>/temp/query.txt``
    (header row included, index omitted).
    """
    query_result = SQL(database=self.database).custom_sql(statement=self.sql_query)
    destination = os.path.join(self.root, "temp", "query.txt")
    query_result.to_csv(destination, header=True, index=False, sep=",")
def run(self):
    """
    Build the candidate / backup / dropped guide RNA tables for CRISPR
    interference and write them, together with their off-targets, to
    ``<root>/temp``.

    Steps, in order:
      1. Load the global gRNA table for ``self.mismatch`` and, when genes
         were requested, narrow it with ``get_targeted_data``.
      2. Let ``RefineCripri.cripr_interference`` split it into candidates,
         backup and dropped, then annotate the three tables.
      3. Load the off-targets for ``self.mismatch``; keep rows that have an
         annotation, are not on the ``+`` strand and whose annotation
         (underscores removed) differs from the ``gene`` column.
      4. For candidates and backup separately: collect their off-targets,
         apply ``negate_pam_mismatch`` and move gRNA's to dropped via
         ``move_grna_by_offtargets`` (masks are passed through).
      5. Rebalance candidates/backup, force an A/G base, compute primer
         length and GC content, and design primers.
      6. Write ``candidates.txt``, ``backup.txt``, ``dropped.txt`` and
         ``offtargets.txt`` as comma separated files.

    The off-target file carries a ``from`` column ("candidates" or
    "backup") whenever at least one off-target row exists; otherwise it
    contains only the header of the off-target table.
    """
    sqlrunner = SQL(database=os.path.join(self.root, "databases", self.database))
    # Flagged by the original author as a rate limiting step.
    gRNA_db = sqlrunner.get_global_gRNA(mismatch=str(self.mismatch))

    if self.gene_mask_dict['genes']:
        query_data = self.get_targeted_data(dataframe=gRNA_db, gene_mask_dict=self.gene_mask_dict)
    else:
        query_data = gRNA_db

    multifasta = sqlrunner.get_gene_multifasta()
    gRNA_runner = RefineCripri(grna_dataframe=query_data,
                               strand=self.strand,
                               fasta_dataframe=multifasta,
                               cas9=self.cas9_organism,
                               offtarget_ids=sqlrunner.custom_sql("SELECT name, strand FROM global_offtarget"))
    candidates, backup, dropped = gRNA_runner.cripr_interference()
    candidates, backup, dropped = map(self.utils.annotate_dataframe, [candidates, backup, dropped])

    offtargets = sqlrunner.get_offtargets_by_mismatch(mismatch=self.mismatch)
    offtargets = offtargets.dropna(subset=['annotation'])
    # .copy() so the column assignment below writes to an independent frame
    # instead of a slice of the query result.
    offtargets = offtargets[offtargets['strand'] != '+'].copy()
    offtargets['annotation'] = offtargets['annotation'].apply(
        lambda x: x.replace("_", "") if isinstance(x, str) else x)
    offtargets = offtargets.query("gene != annotation").reset_index(drop=True)
    offtarget_ids = list(set(offtargets['name']))

    def screen_offtargets(grna_frame, dropped_frame, has_offtargets):
        # Resolve the off-targets of one gRNA table; returns the updated
        # table, the updated dropped table and the off-target rows found.
        if not has_offtargets:
            return grna_frame, dropped_frame, pd.DataFrame(columns=offtargets.columns)
        shared_ids = list(set(grna_frame['names']) & set(offtarget_ids))
        found = self.grab_offtargets(query=grna_frame, offtargets=offtargets,
                                     offtarget_ids=offtarget_ids)
        grna_frame = self.negate_pam_mismatch(grna_dataframe=grna_frame,
                                              offtarget_dataframe=found,
                                              target_ids=shared_ids)
        grna_frame, dropped_frame = self.move_grna_by_offtargets(grna_dataframe=grna_frame,
                                                                 dropped_dataframe=dropped_frame,
                                                                 offtarget_dataframe=found,
                                                                 masks=self.gene_mask_dict['masks'])
        return grna_frame, dropped_frame, pd.DataFrame(found)

    candidates_has_offtargets = self.list_comparison(list1=candidates['names'], list2=offtarget_ids)
    backup_has_offtargets = self.list_comparison(list1=backup['names'], list2=offtarget_ids)

    candidates, dropped, candidates_offtargets = screen_offtargets(
        candidates, dropped, candidates_has_offtargets)
    backup, dropped, backup_offtargets = screen_offtargets(
        backup, dropped, backup_has_offtargets)

    # Re-rank: move gRNA's between candidates and backup where required.
    candidates, backup = self.scan_maxmismatches(candidates=candidates, backup=backup)
    candidates, backup = self.force_max_grna_in_candidates(candidates=candidates, backup=backup,
                                                           max_grna=self.max_grna)
    candidates = self.force_ag_base(dataframe=candidates, max_primer_size=self.max_primer_size)
    backup = self.force_ag_base(dataframe=backup, max_primer_size=self.max_primer_size)
    candidates, backup, dropped = map(self.calculate_primer_len, [candidates, backup, dropped])
    candidates, backup, dropped = map(self.calculate_gc_content, [candidates, backup, dropped])
    candidates = self.design_primers(dataframe=candidates, cas9=self.cas9_organism,
                                     fiveprime=self.fiveprime, threeprime=self.threeprime)
    backup = self.design_primers(dataframe=backup, cas9=self.cas9_organism,
                                 fiveprime=self.fiveprime, threeprime=self.threeprime)
    candidates, backup, dropped = map(pd.DataFrame, [candidates, backup, dropped])

    # Tag each non-empty off-target table with its origin and stack them.
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
    tagged_offtargets = []
    for frame, origin in ((candidates_offtargets, "candidates"), (backup_offtargets, "backup")):
        if not frame.empty:
            frame['from'] = origin
            tagged_offtargets.append(frame)
    if tagged_offtargets:
        final_offtargets = pd.concat(tagged_offtargets, ignore_index=True)
    else:
        final_offtargets = pd.DataFrame(columns=offtargets.columns)

    outputs = {
        "candidates.txt": candidates,
        "backup.txt": backup,
        "dropped.txt": dropped,
        "offtargets.txt": final_offtargets,
    }
    for filename, frame in outputs.items():
        frame.to_csv(os.path.join(self.root, "temp", filename), header=True, index=False, sep=",")
def force_ag_base(self, dataframe, max_primer_size):
    """
    Extend guide RNA's whose first base is not A/G until they start on one.

    For every row whose ``gRNA`` does not start with ``a``/``g`` the gene
    sequence is fetched and complemented (bases other than a/c/g/t become
    ``N``), and the reversed gRNA is located in it. Starting ``len(PAM) +
    20`` bases after the match, the complemented strand is scanned for the
    first ``a``/``g``; the gRNA is then replaced by the stretch from the
    match up to and including that base, reversed and upper-cased.

    Each rewritten row also gets ``score`` lowered by 0.5 and ``notes`` set
    to ``"PASS"`` if it contained ``"position 20 is"``.

    Rows are left untouched when the gRNA already starts with A/G, is
    empty, cannot be found in the gene, or no A/G base is found inside the
    scanned window.

    This modifies ``dataframe`` in place and also returns it.

    :param dataframe: column-indexable table with ``genes``, ``gRNA``,
        ``PAM``, ``score`` and ``notes``; rows are addressed as
        ``dataframe[column][idx]`` with ``idx`` counted from 0
    :param max_primer_size: number of extra bases that may be scanned when
        the window fits inside the gene
    :return: the same ``dataframe`` object
    """
    primer_len = 20  # fixed primer length
    complement = {'g': 'c', 'a': 't', 'c': 'g', 't': 'a'}
    sqlrunner = None  # opened lazily: only needed when a row must be fixed

    def apply_extension(row, strand, start, last):
        # NOTE(review): item-wise assignment is kept so the method works on
        # any column-indexable container (callers appear to convert to a
        # DataFrame only afterwards); with pandas copy-on-write a DataFrame
        # input would need .loc instead. TODO confirm the concrete type.
        grna_out = strand[start:last + 1][::-1]
        dataframe['gRNA'][row] = grna_out.upper()
        dataframe['score'][row] = dataframe['score'][row] - 0.5
        if "position 20 is" in dataframe['notes'][row]:
            dataframe['notes'][row] = "PASS"

    for idx, genes in enumerate(dataframe['genes']):
        grna = str(dataframe['gRNA'][idx]).lower()
        if not grna or grna[0] in ('a', 'g'):
            continue

        if sqlrunner is None:
            sqlrunner = SQL(database=os.path.join(self.root, "databases", self.database))
        gene_sequence = str(sqlrunner.get_gene_sequence(gene=genes)).lower()
        sequence_swapped = "".join(complement.get(base, "N") for base in gene_sequence)
        sequence_len = len(sequence_swapped)

        loc_in_gene = sequence_swapped.find(grna[::-1])
        if loc_in_gene == -1:
            # Not found: -1 must not be used as a slice start.
            continue

        pam_len = len(str(dataframe['PAM'][idx]))
        primer_wo_pam_stop = loc_in_gene + pam_len + primer_len

        if primer_wo_pam_stop + max_primer_size < sequence_len:
            # The whole scan window lies inside the gene.
            search_positions = range(primer_wo_pam_stop, primer_wo_pam_stop + max_primer_size + 1)
        elif primer_wo_pam_stop == sequence_len:
            # Window starts exactly at the gene end: take everything up to
            # the end, regardless of the final base.
            apply_extension(idx, sequence_swapped, loc_in_gene, sequence_len - 1)
            continue
        else:
            # Window would run past the gene end: scan the remaining bases
            # only (the scan starts one base after the window start, as in
            # the original implementation) and never index past the end.
            search_positions = range(primer_wo_pam_stop + 1, sequence_len)

        for target_base_location in search_positions:
            if sequence_swapped[target_base_location] in ('a', 'g'):
                apply_extension(idx, sequence_swapped, loc_in_gene, target_base_location)
                break

    return dataframe