def full_uniref_check(enzyme_type_obj):
    """Re-validate every UniRef50 cluster stored for an enzyme type.

    Each stored cluster is loaded through ``UniRef_Parser`` using its
    ``enzyme_name``.  When ``check_id_match`` reports a mismatch the cluster
    is deleted, after the ``blast`` field of every sequence listed in its
    ``result_of_blasts_for`` has been reset to ``None``.

    If the enzyme type had any clusters at all, its SSN record (when one
    exists) and the enzyme type itself are marked ``'Queued for update'``.

    Args:
        enzyme_type_obj: Enzyme type document whose UniRef50 clusters are
            checked.
    """
    unirefs = UniRef50.objects(enzyme_type=enzyme_type_obj).select_related()

    if len(unirefs) != 0:
        for ur in unirefs:
            print(f'Checking {ur.enzyme_name}..')
            ref_parser = UniRef_Parser()
            ref_parser.load_xml(ur.enzyme_name)
            # Pause between lookups - presumably to throttle requests to the
            # remote service; TODO confirm.
            time.sleep(0.2)

            # Explicit ``== False`` kept on purpose: switching to ``not``
            # would also treat a ``None`` return as a mismatch and delete
            # the cluster.  NOTE(review): confirm check_id_match's return
            # type before simplifying.
            if ref_parser.check_id_match(ur.enzyme_name) == False:  # noqa: E712
                print(
                    f"{ur.enzyme_name} doesnt match cluster id online, deleting.."
                )
                # Detach the sequences from the cluster before removing it.
                for seq in ur.result_of_blasts_for:
                    seq.blast = None
                    seq.save()
                ur.delete()

        # A single lookup replaces the former "len(query) then query[0]" pair.
        ssn_record = SSN_record.objects(enzyme_type=enzyme_type_obj).first()
        if ssn_record is not None:
            ssn_record.status = 'Queued for update'
            ssn_record.save()

        enzyme_type_obj.bioinformatics_status = 'Queued for update'
        enzyme_type_obj.save()

    print(f"Full UniRef50 update complete for {enzyme_type_obj.enzyme_type}")
def clear_all_bioinformatics_data():
    """Wipe all stored bioinformatics results and reset related state.

    Steps performed:

    * every enzyme type gets ``bioinformatics_status = 'Idle'``;
    * every sequence gets ``blast`` and ``alignments_made`` set to ``None``;
    * the ``UniRef50``, ``SSN_record``, ``UniRef90``, ``Alignment`` and
      ``SeqSimNet`` collections are dropped;
    * the ``analysis/analysis_data/ssn`` and
      ``analysis/analysis_data/all_by_all_blast`` directories (relative to
      ``Path(__file__).parents[3]``) are emptied and recreated.

    Returns:
        The ``jsonify`` response carrying a ``result`` payload with
        ``status`` ``'success'``.
    """
    for enz in EnzymeType.objects():
        enz.bioinformatics_status = 'Idle'
        enz.save()

    for seq in Sequence.objects():
        seq.blast = None
        seq.alignments_made = None
        seq.save()

    for collection in (UniRef50, SSN_record, UniRef90, Alignment, SeqSimNet):
        collection.drop_collection()

    analysis_data_root = Path(__file__).parents[3] / 'analysis' / 'analysis_data'
    for dir_name in ('ssn', 'all_by_all_blast'):
        data_dir = analysis_data_root / dir_name
        # The directory may not exist yet (e.g. nothing has been computed so
        # far); an unconditional rmtree would raise after the collections
        # have already been dropped.
        if data_dir.is_dir():
            shutil.rmtree(data_dir)
        # parents=True also covers a missing ``analysis_data`` folder.
        data_dir.mkdir(parents=True, exist_ok=True)

    print('ALL BIOINFORMATICS DATA DELETED')

    result = {'status': 'success', 'msg': 'Done', 'issues': []}
    return jsonify(result=result)
def task_check_ssn_status():
    """Queue SSN expansion jobs for enzyme types whose SSN is out of date.

    The task works in three stages:

    1. For every enzyme type, any SSN records beyond the first are deleted.
    2. If the blast, process-blasts and alignment queues on ``current_app``
       are not all empty, the queue lengths are printed and nothing else
       happens.
    3. Otherwise ``ssn_tasks.task_expand_ssn`` is enqueued on the alignment
       queue for

       * every SSN record that is not ``'Complete'`` while its enzyme type
         is ``'Complete'`` and has at least one UniRef50 entry, and
       * every ``'Complete'`` enzyme type that has no SSN record but does
         have UniRef50 entries or non-ignored sequences with a non-empty
         ``sequence``.
    """
    # Stage 1: keep a single SSN record per enzyme type.
    for enzyme_type in EnzymeType.objects():
        ssn_query = list(SSN_record.objects(enzyme_type=enzyme_type))
        if len(ssn_query) > 1:
            print(
                f'Warning - multiple ssn records for {enzyme_type} - deleting extras'
            )
            for extra_record in ssn_query[1:]:
                extra_record.delete()

    # Stage 2: read each queue once so the decision and the report agree.
    num_blast_jobs = len(current_app.blast_queue.jobs)
    num_process_jobs = len(current_app.process_blasts_queue.jobs)
    num_alignment_jobs = len(current_app.alignment_queue.jobs)

    if num_blast_jobs + num_process_jobs + num_alignment_jobs != 0:
        print(f"Length blast queue = {num_blast_jobs}")
        print(f"Length process blast queue = {num_process_jobs}")
        print(f"Length alignment queue = {num_alignment_jobs}")
        return

    # Stage 3a: existing SSN records that need refreshing.
    print('Checking ssn status')
    for ssn_r in SSN_record.objects().select_related():
        if ssn_r.status == 'Complete':
            continue
        if ssn_r.enzyme_type.bioinformatics_status != 'Complete':
            continue
        # Only the number of entries matters here, so count on the server
        # instead of loading every document.
        if UniRef50.objects(enzyme_type=ssn_r.enzyme_type).count() == 0:
            continue

        enzyme_type = ssn_r.enzyme_type.enzyme_type
        job_name = f"{enzyme_type}_expand_ssn"
        current_app.alignment_queue.enqueue(
            ssn_tasks.task_expand_ssn, enzyme_type, job_id=job_name)
        print(f'Queued SSN job for {enzyme_type}')

    # Stage 3b: completed enzyme types that have no SSN record at all.
    # The set of enzyme types owning an SSN record does not depend on the
    # loop variable, so it is fetched once instead of once per enzyme type.
    enzyme_types_with_ssn = SSN_record.objects().distinct('enzyme_type')

    for enz_type_obj in EnzymeType.objects():
        if enz_type_obj.bioinformatics_status != 'Complete':
            continue
        if enz_type_obj in enzyme_types_with_ssn:
            continue

        num_unirefs = UniRef50.objects(enzyme_type=enz_type_obj).count()
        candidate_seqs = Sequence.objects(
            db.Q(enzyme_type=enz_type_obj.enzyme_type)
            & db.Q(bioinformatics_ignore__ne=True))
        biocatdb_seqs = [
            seq for seq in candidate_seqs
            if seq.sequence != '' and seq.sequence is not None
        ]

        if num_unirefs + len(biocatdb_seqs) == 0:
            continue

        print(
            f"No SSN for {enz_type_obj.enzyme_type}, but blasts are complete and sequences present.. creating SSN."
        )
        job_name = f"{enz_type_obj.enzyme_type}_expand_ssn"
        current_app.alignment_queue.enqueue(
            ssn_tasks.task_expand_ssn,
            enz_type_obj.enzyme_type,
            job_id=job_name)
def _get_db_object(self):
    """ Return the SSN record for this enzyme type, building a new one if none is stored.

    A newly built record is only constructed here, not saved.
    """
    existing = SSN_record.objects(enzyme_type=self.enzyme_type_obj)

    if len(existing) != 0:
        return existing[0]

    return SSN_record(enzyme_type=self.enzyme_type_obj)
def check_blast_status(enzyme_type):
    """Update an enzyme type's bioinformatics status from its sequences.

    Only reviewed sequences that are not flagged ``bioinformatics_ignore``
    are considered.

    * If any of them has ``blast`` set to ``None`` the enzyme type is marked
      ``'Queued for update'``.
    * Otherwise, if the enzyme type is not already ``'Complete'``, it is
      marked ``'Complete'`` and - when exactly one SSN record exists for it -
      that record is marked ``'Queued for update'``.

    Args:
        enzyme_type: Name of the enzyme type (the ``enzyme_type`` field
            value).

    Raises:
        IndexError: If no ``EnzymeType`` document matches ``enzyme_type``.
    """
    seqs = Sequence.objects(
        db.Q(enzyme_type=enzyme_type)
        & db.Q(bioinformatics_ignore__ne=True)
        & db.Q(reviewed=True))
    enz_type_obj = EnzymeType.objects(enzyme_type=enzyme_type)[0]

    # One pending sequence is enough to decide; save once rather than once
    # per un-blasted sequence.
    if any(seq.blast is None for seq in seqs):
        enz_type_obj.bioinformatics_status = 'Queued for update'
        enz_type_obj.save()
        return

    if enz_type_obj.bioinformatics_status == 'Complete':
        # Already complete: nothing changes, the SSN record is left alone.
        return

    enz_type_obj.bioinformatics_status = 'Complete'
    enz_type_obj.save()

    # The SSN is re-queued only on the transition to 'Complete'.
    ssn_q = SSN_record.objects(enzyme_type=enz_type_obj)
    if len(ssn_q) == 1:
        ssn_record = ssn_q[0]
        ssn_record.status = 'Queued for update'
        ssn_record.save()
def clear_empty_ssns():
    """Delete SSN records whose enzyme type has nothing to build a network from.

    A record is removed when its enzyme type has no UniRef50 entries and no
    sequences with a non-empty, non-null ``sequence`` that are not flagged
    ``sequence_unavailable``.

    Returns:
        The ``jsonify`` response carrying a ``result`` payload with
        ``status`` ``'success'``.
    """
    for ssn_r in SSN_record.objects().select_related():
        enzyme_type_obj = ssn_r.enzyme_type

        usable_sequence_filter = (
            db.Q(enzyme_type=enzyme_type_obj.enzyme_type)
            & db.Q(sequence__ne="")
            & db.Q(sequence__ne=None)
            & db.Q(sequence_unavailable__ne=True)
        )

        num_unirefs = len(UniRef50.objects(enzyme_type=enzyme_type_obj))
        num_biocat_seqs = len(Sequence.objects(usable_sequence_filter))

        if num_unirefs + num_biocat_seqs == 0:
            ssn_r.delete()

    return jsonify(
        result={'status': 'success', 'msg': f'Empty SSNs removed', 'issues': []}
    )
def mark_not_aligned():
    """Flag every sequence of an enzyme type as not aligned and re-queue its SSN.

    The enzyme type name is read from the ``enzyme_type`` form field.  Each
    matching sequence gets ``alignments_made = False``; the enzyme type's SSN
    record, if one exists, is marked ``'Queued for update'``.

    Returns:
        The ``jsonify`` response carrying a ``result`` payload with
        ``status`` ``'success'``.

    Raises:
        IndexError: If no ``EnzymeType`` document matches the submitted name.
    """
    enzyme_type = request.form['enzyme_type']

    for seq in Sequence.objects(enzyme_type=enzyme_type):
        seq.alignments_made = False
        seq.save()

    enz_type_obj = EnzymeType.objects(enzyme_type=enzyme_type)[0]

    # An enzyme type may not have an SSN record yet; indexing the empty
    # query with [0] used to raise IndexError in that case.
    ssn_record = SSN_record.objects(enzyme_type=enz_type_obj).first()
    if ssn_record is not None:
        ssn_record.status = 'Queued for update'
        ssn_record.save()

    result = {'status': 'success', 'msg': 'Done', 'issues': []}
    return jsonify(result=result)
def set_choices(self):
    """Populate ``self.enzyme_type.choices`` with enzyme types that have an SSN record.

    Each choice is a ``(enzyme_type, "enzyme_type - full_name")`` tuple and
    the choices are sorted by enzyme type name.
    """
    types_with_ssn = SSN_record.objects().distinct('enzyme_type')

    enzyme_descriptions = {}
    list_enzyme_types = []
    for enz_type in EnzymeType.objects():
        if enz_type not in types_with_ssn:
            continue
        enzyme_descriptions[enz_type.enzyme_type] = (
            f"{enz_type.enzyme_type} - {enz_type.full_name}"
        )
        list_enzyme_types.append(enz_type.enzyme_type)

    self.enzyme_type.choices = [
        (key, enzyme_descriptions[key]) for key in sorted(list_enzyme_types)
    ]
def bioinformatics_admin_page():
    """Render the bioinformatics admin page with per-enzyme-type statistics.

    For every enzyme type (ordered by name) the following is collected:

    * ``biostat``: its ``bioinformatics_status``;
    * ``ssn``: the status of its first SSN record, or the string ``'None'``;
    * ``enzyme_numbers``: the number of sequences (``'biocatdb'``) and
      UniRef50 entries (``'uniref'``);
    * ``blasted_enz_types``: the percentage of its sequences whose ``blast``
      is not ``None``, rounded to zero decimals (``0`` when there are none).

    ``num_jobs`` is the ``count`` of the ``StartedJobRegistry`` for the blast
    queue.

    Returns:
        The rendered ``bioinformatics/bioinformatics_admin.html`` template.
    """
    enzyme_types = EnzymeType.objects().order_by('enzyme_type')

    biostat = {}
    ssn = {}
    enzyme_numbers = {}
    enz_type_dict = {}

    # A single pass gathers everything; the sequences of each enzyme type
    # are loaded once and reused for both the total and the blast tally.
    for enz_type_obj in enzyme_types:
        enz_type = enz_type_obj.enzyme_type

        biostat[enz_type] = enz_type_obj.bioinformatics_status

        ssn_record = SSN_record.objects(enzyme_type=enz_type_obj).first()
        ssn[enz_type] = ssn_record.status if ssn_record is not None else 'None'

        seqs = list(Sequence.objects(enzyme_type=enz_type))
        num_seqs = len(seqs)
        enzyme_numbers[enz_type] = {
            'biocatdb': num_seqs,
            'uniref': UniRef50.objects(enzyme_type=enz_type_obj).count(),
        }

        num_blasted = sum(1 for seq in seqs if seq.blast is not None)
        if num_blasted != 0:
            # num_blasted > 0 implies num_seqs > 0, so the division is safe.
            enz_type_dict[enz_type] = round((num_blasted / num_seqs) * 100, 0)
        else:
            enz_type_dict[enz_type] = 0

    registry = StartedJobRegistry(queue=current_app.blast_queue)
    num_jobs = registry.count

    return render_template('bioinformatics/bioinformatics_admin.html',
                           blasted_enz_types=enz_type_dict,
                           biostat=biostat,
                           ssn=ssn,
                           num_jobs=num_jobs,
                           enzyme_numbers=enzyme_numbers)
def ssn_object():
    """Return the SSN status and summary numbers for the requested enzyme type.

    The enzyme type name is read from the ``enzyme_type`` form field.  The
    response's ``result`` holds the SSN record's ``status``, the number of
    sequences and UniRef50 entries for the enzyme type, and one description
    string per entry in the record's ``num_at_alignment_score``.

    Returns:
        The ``jsonify`` response carrying the ``result`` payload.

    Raises:
        IndexError: If the enzyme type or its SSN record does not exist.
    """
    enzyme_type = request.form['enzyme_type']
    enzyme_type_obj = EnzymeType.objects(enzyme_type=enzyme_type)[0]
    ssn_obj = SSN_record.objects(enzyme_type=enzyme_type_obj)[0]

    num_biocatdb = Sequence.objects(enzyme_type=enzyme_type).count()
    num_uniref = UniRef50.objects(enzyme_type=enzyme_type_obj).count()

    precalc_choices = {}
    for score, clusters in ssn_obj.num_at_alignment_score.items():
        # idt is indexed as a pair; the text presents it as "average ± spread".
        idt = ssn_obj.identity_at_alignment_score[score]
        precalc_choices[score] = (
            f"{score}, {clusters} clusters, avg identity {idt[0]} ± {idt[1]}"
        )

    return jsonify(result={
        'status': ssn_obj.status,
        'num_biocatdb': num_biocatdb,
        'num_uniref': num_uniref,
        'precalculated': precalc_choices,
    })