def _get_structure_from_params(params):
    """Validate structure-prediction parameters and run the prediction.

    Aborts with 400 when the sequence is missing/too long/invalid or when
    an unknown predictor is requested; otherwise returns the predicted
    structure from ``get_structure``.
    """
    chosen_predictor = params.get('predictor', "colabfold")
    sequence = params.get('sequence')

    # Guard clause: reject missing, overlong (>500 residues) or
    # invalid-character sequences before doing any work.
    sequence_ok = bool(sequence) and len(sequence) <= 500 \
        and check_valid_sequence(sequence)
    if not sequence_ok:
        return abort(400, "Sequence is too long or contains invalid characters.")

    if chosen_predictor not in {"colabfold"}:
        return abort(400, "Invalid predictor specified")

    return get_structure(chosen_predictor, sequence)
def post(self):
    """Validate the posted sequence and compute its residue landscape.

    Always uses the ``prottrans_t5_xl_u50`` model; aborts with 400 on a
    missing, overlong (>2000) or invalid sequence.
    """
    payload = request.json
    seq = payload.get('sequence')

    # Guard clause: bad input short-circuits with HTTP 400.
    if not seq or len(seq) > 2000 or not check_valid_sequence(seq):
        return abort(400, "Sequence is too long or contains invalid characters.")

    return get_residue_landscape(model_name='prottrans_t5_xl_u50',
                                 sequence=seq)
def post(self):
    """Embed the posted sequence via a celery worker and return an HDF5 file.

    Dispatches to the task matching the requested model, blocks on the
    result, packs the embedding into an in-memory HDF5 file and sends it
    back as an attachment. Aborts with 400 on bad input or unknown model.
    """
    payload = request.json
    seq = payload.get('sequence')

    if not seq or len(seq) > 2000 or not check_valid_sequence(seq):
        return abort(400, "Sequence is too long or contains invalid characters.")

    # Map the requested model name to its celery task.
    task = {
        'seqvec': get_seqvec_embeddings_sync,
        'prottrans_bert_bfd': get_protbert_embeddings_sync
    }.get(payload.get('model', 'seqvec'))
    if not task:
        return abort(400, f"Model '{payload.get('model')}' isn't available.")

    # time_limit && soft_time_limit limit the execution time. Expires limits the queuing time.
    async_result = task.apply_async(args=[seq],
                                    time_limit=60 * 5,
                                    soft_time_limit=60 * 5,
                                    expires=60 * 60)
    embedding = np.array(async_result.get())

    h5_buffer = io.BytesIO()
    with h5py.File(h5_buffer, "w") as embeddings_file:
        embeddings_file.create_dataset("sequence", data=embedding)
    # This simulates closing the file and re-opening it. Otherwise the
    # cursor will already be at the end of the file when flask tries to
    # read the contents, and it will think the file is empty.
    h5_buffer.seek(0)

    return send_file(h5_buffer,
                     attachment_filename="embeddings_file.h5",
                     as_attachment=True)
def post(self):
    """Embed the posted sequence synchronously and return it as an HDF5 file.

    Aborts with 400 on a missing, overlong (>2000) or invalid sequence;
    otherwise writes the embedding into an in-memory HDF5 container and
    streams it back as an attachment.
    """
    payload = request.json
    seq = payload.get('sequence')

    if not seq or len(seq) > 2000 or not check_valid_sequence(seq):
        return abort(400, "Sequence is too long or contains invalid characters.")

    embedding = get_embedding(payload.get('model', 'seqvec'), seq)

    out = io.BytesIO()
    with h5py.File(out, "w") as embeddings_file:
        embeddings_file.create_dataset("sequence", data=embedding)
    # Rewind the buffer — this simulates closing and re-opening the file.
    # Without it the cursor sits at the end and flask would read an
    # apparently empty file.
    out.seek(0)

    return send_file(out,
                     attachment_filename="embeddings_file.h5",
                     as_attachment=True)
def _get_annotations_from_params(params):
    """Validate the sequence in *params*, predict annotations for it and
    render them in the requested output format.

    Supported formats:
      - "protvista-predictprotein": protvista track dict (sequence + features)
      - "legacy" (default): GO term -> max reliability-index dicts
      - "go-predictprotein": per-ontology lists of term/score dicts
      - "full": raw annotation dict

    Aborts with 400 on invalid input or an unknown format.
    """
    sequence = params.get('sequence')
    # Reject missing, overlong (>2000 residues) or invalid sequences.
    if not sequence or len(sequence) > 2000 or not check_valid_sequence(
            sequence):
        return abort(400,
                     "Sequence is too long or contains invalid characters.")

    model_name = params.get('model', 'seqvec')
    annotations = get_features(model_name, sequence)
    annotations['sequence'] = sequence

    format = params.get('format', 'legacy')
    only_closest_k = params.get('only_closest_k', True)
    if only_closest_k == True:
        # Trim each GO ontology down to the closest hits only.
        _filter_ontology(annotations, "predictedBPO")
        _filter_ontology(annotations, "predictedCCO")
        _filter_ontology(annotations, "predictedMFO")

    if format == "protvista-predictprotein":
        source = Source(url=request.url,
                        id="sync",
                        name=f"bio_embeddings using {model_name}")
        evidence = Evidence(source=source, )

        protvista_features = dict()
        protvista_features['sequence'] = sequence
        protvista_features['features'] = list()

        if annotations.get('predictedDSSP8'):
            protvista_features['features'].extend(
                annotations_to_protvista_converter(
                    features_string=annotations['predictedDSSP8'],
                    evidences=[evidence],
                    type=f"SECONDARY_STRUCTURE_8_STATES_({model_name})",
                    feature_enum=SecondaryStructure))
        if annotations.get('predictedDSSP3'):
            protvista_features['features'].extend(
                annotations_to_protvista_converter(
                    features_string=annotations['predictedDSSP3'],
                    evidences=[evidence],
                    type=f"SECONDARY_STRUCTURE_3_STATES_({model_name})",
                    feature_enum=SecondaryStructure))
        # BUG FIX: this guard previously re-checked 'predictedDSSP3' while
        # rendering 'predictedDisorder', so the disorder track was gated on
        # the wrong key (and could KeyError when DSSP3 was present but
        # disorder was not). Gate on 'predictedDisorder' itself.
        if annotations.get('predictedDisorder'):
            protvista_features['features'].extend(
                annotations_to_protvista_converter(
                    features_string=annotations['predictedDisorder'],
                    evidences=[evidence],
                    type=f"DISORDER_({model_name})",
                    feature_enum=Disorder))
        return protvista_features
    elif format == "legacy":
        # Collapse duplicate GO terms, keeping the highest reliability index.
        predictedCCO = {}
        predictedBPO = {}
        predictedMFO = {}
        for prediction in annotations['predictedCCO']:
            predictedCCO[prediction['GO_Term']] = max(
                predictedCCO.get(prediction['GO_Term'], -1), prediction['RI'])
        for prediction in annotations['predictedBPO']:
            predictedBPO[prediction['GO_Term']] = max(
                predictedBPO.get(prediction['GO_Term'], -1), prediction['RI'])
        for prediction in annotations['predictedMFO']:
            predictedMFO[prediction['GO_Term']] = max(
                predictedMFO.get(prediction['GO_Term'], -1), prediction['RI'])
        annotations['predictedCCO'] = predictedCCO
        annotations['predictedBPO'] = predictedBPO
        annotations['predictedMFO'] = predictedMFO
        return annotations
    elif format == "go-predictprotein":
        # Score is the reliability index scaled to 0-100.
        mapping_function = lambda x: {
            "gotermid": x['GO_Term'],
            "gotermname": x['GO_Name'],
            "gotermscore": round(x['RI'] * 100)
        }
        predictedCCO = {
            "ontology": "Cellular Component Ontology",
            "goTermWithScore":
            list(map(mapping_function, annotations['predictedCCO']))
        }
        predictedBPO = {
            "ontology": "Biological Process Ontology",
            "goTermWithScore":
            list(map(mapping_function, annotations['predictedBPO']))
        }
        predictedMFO = {
            "ontology": "Molecular Function Ontology",
            "goTermWithScore":
            list(map(mapping_function, annotations['predictedMFO']))
        }
        return [predictedBPO, predictedCCO, predictedMFO]
    elif format == "full":
        return annotations
    else:
        abort(400, f"Wrong format passed: {format}")
def _get_annotations_from_params(params):
    """Validate the sequence in *params*, predict annotations (optionally
    merged with residue-landscape output for the T5 model) and render them
    in the requested output format.

    Supported formats: "protvista-predictprotein", "legacy" (default),
    "go-predictprotein" and "full". Aborts with 400 on invalid input or
    an unknown format.
    """
    sequence = params.get('sequence')
    # Reject missing, overlong (>2000 residues) or invalid sequences.
    if not sequence or len(sequence) > 2000 or not check_valid_sequence(
            sequence):
        return abort(400,
                     "Sequence is too long or contains invalid characters.")
    model_name = params.get('model', 'prottrans_t5_xl_u50')
    annotations = get_features(model_name, sequence)

    # Residue-landscape results are only produced for the T5 model, and
    # only when its dedicated queue is actually available.
    if model_name == 'prottrans_t5_xl_u50' and 'prott5_residue_landscape_annotations' in get_queues(
    ):
        residue_landscape_output = get_residue_landscape(model_name=model_name,
                                                         sequence=sequence)

        # merge the output of the residue landscape into the feature dict
        # add the meta information
        for key in residue_landscape_output['meta']:
            annotations['meta'][key] = residue_landscape_output['meta'][key]

        residue_landscape_output.pop('meta', None)

        # add all the remaining information
        for key in residue_landscape_output:
            annotations[key] = residue_landscape_output[key]

    annotations['sequence'] = sequence

    format = params.get('format', 'legacy')
    only_closest_k = params.get('only_closest_k', True)
    if only_closest_k == True:
        # Trim each GO ontology down to the closest hits only.
        _filter_ontology(annotations, "predictedBPO")
        _filter_ontology(annotations, "predictedCCO")
        _filter_ontology(annotations, "predictedMFO")

    if format == "protvista-predictprotein":
        source = Source(url=request.url,
                        id="sync",
                        name=f"bio_embeddings using {model_name}")
        evidence = Evidence(source=source, )

        protvista_features = dict()
        protvista_features['sequence'] = sequence
        protvista_features['features'] = list()

        # Each present annotation is converted into a protvista track;
        # absent keys are simply skipped.
        if annotations.get('predictedDSSP8'):
            protvista_features['features'].extend(
                annotations_to_protvista_converter(
                    features_string=annotations['predictedDSSP8'],
                    evidences=[evidence],
                    type=f"SECONDARY_STRUCTURE_8_STATES_({model_name})",
                    feature_enum=SecondaryStructure))
        if annotations.get('predictedDSSP3'):
            protvista_features['features'].extend(
                annotations_to_protvista_converter(
                    features_string=annotations['predictedDSSP3'],
                    evidences=[evidence],
                    type=f"SECONDARY_STRUCTURE_3_STATES_({model_name})",
                    feature_enum=SecondaryStructure))
        if annotations.get('predictedDisorder'):
            protvista_features['features'].extend(
                annotations_to_protvista_converter(
                    features_string=annotations['predictedDisorder'],
                    evidences=[evidence],
                    type=f"DISORDER_({model_name})",
                    feature_enum=Disorder))
        if annotations.get('predictedBindingMetal'):
            protvista_features['features'].extend(
                annotations_to_protvista_converter(
                    features_string=annotations['predictedBindingMetal'],
                    evidences=[evidence],
                    type=f"BINDING_METAL_({model_name})",
                    feature_enum=BindingResidues))
        if annotations.get('predictedBindingNucleicAcids'):
            protvista_features['features'].extend(
                annotations_to_protvista_converter(
                    features_string=annotations[
                        'predictedBindingNucleicAcids'],
                    evidences=[evidence],
                    type=f"BINDING_NUCLEIC_ACIDS_({model_name})",
                    feature_enum=BindingResidues))
        if annotations.get('predictedBindingSmallMolecules'):
            protvista_features['features'].extend(
                annotations_to_protvista_converter(
                    features_string=annotations[
                        'predictedBindingSmallMolecules'],
                    evidences=[evidence],
                    type=f"BINDING_SMALL_MOLECULES_({model_name})",
                    feature_enum=BindingResidues))
        if annotations.get('predictedTransmembrane'):
            protvista_features['features'].extend(
                annotations_to_protvista_converter(
                    features_string=annotations['predictedTransmembrane'],
                    evidences=[evidence],
                    type=f"TRANSMEMBRANE_({model_name})",
                    feature_enum=MembraneResidues))
        return protvista_features
    elif format == "legacy":
        # Collapse duplicate GO terms, keeping the highest reliability index.
        predictedCCO = {}
        predictedBPO = {}
        predictedMFO = {}
        for prediction in annotations['predictedCCO']:
            predictedCCO[prediction['GO_Term']] = max(
                predictedCCO.get(prediction['GO_Term'], -1), prediction['RI'])
        for prediction in annotations['predictedBPO']:
            predictedBPO[prediction['GO_Term']] = max(
                predictedBPO.get(prediction['GO_Term'], -1), prediction['RI'])
        for prediction in annotations['predictedMFO']:
            predictedMFO[prediction['GO_Term']] = max(
                predictedMFO.get(prediction['GO_Term'], -1), prediction['RI'])
        annotations['predictedCCO'] = predictedCCO
        annotations['predictedBPO'] = predictedBPO
        annotations['predictedMFO'] = predictedMFO
        return annotations
    elif format == "go-predictprotein":
        # Score is the reliability index scaled to 0-100.
        mapping_function = lambda x: {
            "gotermid": x['GO_Term'],
            "gotermname": x['GO_Name'],
            "gotermscore": round(x['RI'] * 100)
        }
        predictedCCO = {
            "ontology": "Cellular Component Ontology",
            "goTermWithScore":
            list(map(mapping_function, annotations['predictedCCO']))
        }
        predictedBPO = {
            "ontology": "Biological Process Ontology",
            "goTermWithScore":
            list(map(mapping_function, annotations['predictedBPO']))
        }
        predictedMFO = {
            "ontology": "Molecular Function Ontology",
            "goTermWithScore":
            list(map(mapping_function, annotations['predictedMFO']))
        }
        return [predictedBPO, predictedCCO, predictedMFO]
    elif format == "full":
        return annotations
    else:
        abort(400, f"Wrong format passed: {format}")
def post(self):
    """Predict annotations for the posted sequence via a celery worker.

    Dispatches to the task matching the requested model, waits for the
    result, then returns either protvista-formatted tracks or the raw
    annotation dict depending on the requested format. Aborts with 400
    on bad input or unknown model.
    """
    payload = request.json
    seq = payload.get('sequence')

    if not seq or len(seq) > 2000 or not check_valid_sequence(seq):
        return abort(400, "Sequence is too long or contains invalid characters.")

    chosen_model = payload.get('model', 'seqvec')
    task_by_model = {
        'seqvec': get_seqvec_annotations_sync,
        'prottrans_bert_bfd': get_protbert_annotations_sync
    }
    task = task_by_model.get(chosen_model)
    if not task:
        return abort(400, f"Model '{chosen_model}' isn't available.")

    # time_limit && soft_time_limit limit the execution time. Expires limits the queuing time.
    async_result = task.apply_async(args=[seq],
                                    time_limit=60 * 5,
                                    soft_time_limit=60 * 5,
                                    expires=60 * 60)
    annotations = async_result.get()
    annotations['sequence'] = seq

    # Anything other than the protvista format gets the raw annotations.
    if payload.get('format', 'legacy') != "protvista-predictprotein":
        return annotations

    evidence = Evidence(source=Source(
        url=request.url,
        id="sync",
        name=f"bio_embeddings using {chosen_model}"), )

    # Build the three protvista tracks table-driven, in the same order as
    # before: DSSP8, DSSP3, disorder.
    track_specs = [
        ('predictedDSSP8',
         f"SECONDARY_STRUCTURE_8_STATES_({chosen_model})",
         SecondaryStructure),
        ('predictedDSSP3',
         f"SECONDARY_STRUCTURE_3_STATES_({chosen_model})",
         SecondaryStructure),
        ('predictedDisorder', f"DISORDER_({chosen_model})", Disorder),
    ]
    tracks = list()
    for key, track_type, enum in track_specs:
        tracks.extend(
            annotations_to_protvista_converter(
                features_string=annotations[key],
                evidences=[evidence],
                type=track_type,
                feature_enum=enum))

    return {'sequence': seq, 'features': tracks}