예제 #1
0
def _get_structure_from_params(params):
    """Validate a structure-prediction request and run the predictor.

    Reads ``predictor`` (defaulting to ``"colabfold"``) and ``sequence``
    from *params*. Calls ``abort(400, ...)`` when the sequence is missing,
    longer than 500 characters or rejected by ``check_valid_sequence``, and
    again when the predictor is not a supported one. Otherwise returns
    whatever ``get_structure(predictor, sequence)`` returns.
    """
    supported_predictors = {"colabfold"}

    predictor = params.get('predictor', "colabfold")
    sequence = params.get('sequence')

    # The sequence is checked before the predictor, so a bad sequence is
    # reported even when the predictor is also invalid.
    sequence_ok = (sequence and len(sequence) <= 500
                   and check_valid_sequence(sequence))
    if not sequence_ok:
        return abort(400, "Sequence is too long or contains invalid characters.")

    if predictor not in supported_predictors:
        return abort(400, "Invalid predictor specified")

    return get_structure(predictor, sequence)
예제 #2
0
    def post(self):
        """Return the residue landscape for the sequence in the JSON body.

        Reads ``sequence`` from ``request.json``. Calls ``abort(400, ...)``
        when it is missing, longer than 2000 characters or rejected by
        ``check_valid_sequence``. Otherwise returns the result of
        ``get_residue_landscape`` for the fixed model
        ``'prottrans_t5_xl_u50'``.
        """
        payload = request.json
        sequence = payload.get('sequence')

        sequence_ok = (sequence and len(sequence) <= 2000
                       and check_valid_sequence(sequence))
        if not sequence_ok:
            return abort(400, "Sequence is too long or contains invalid characters.")

        return get_residue_landscape(sequence=sequence,
                                     model_name='prottrans_t5_xl_u50')
예제 #3
0
    def post(self):
        """Embed the posted sequence and send it back as an HDF5 attachment.

        Reads ``sequence`` and ``model`` (default ``'seqvec'``) from
        ``request.json``. Calls ``abort(400, ...)`` for a missing, too long
        (> 2000 characters) or invalid sequence, and for a model name that
        has no task in the dispatch table. The selected task is run through
        ``apply_async`` and its result is written to an in-memory HDF5 file
        under the dataset name ``"sequence"``.
        """
        params = request.json
        sequence = params.get('sequence')

        sequence_ok = (sequence and len(sequence) <= 2000
                       and check_valid_sequence(sequence))
        if not sequence_ok:
            return abort(400, "Sequence is too long or contains invalid characters.")

        embedding_tasks = {
            'seqvec': get_seqvec_embeddings_sync,
            'prottrans_bert_bfd': get_protbert_embeddings_sync,
        }
        task = embedding_tasks.get(params.get('model', 'seqvec'))
        if not task:
            return abort(400,
                         f"Model '{params.get('model')}' isn't available.")

        # time_limit and soft_time_limit bound the execution time;
        # expires bounds how long the job may wait in the queue.
        execution_limit = 60 * 5
        queue_limit = 60 * 60
        job = task.apply_async(args=[sequence],
                               time_limit=execution_limit,
                               soft_time_limit=execution_limit,
                               expires=queue_limit)
        embedding = np.array(job.get())

        buffer = io.BytesIO()
        with h5py.File(buffer, "w") as h5_out:
            h5_out.create_dataset("sequence", data=embedding)

        # Rewind: after writing, the cursor sits at the end of the buffer,
        # and flask would otherwise read it as an empty file.
        buffer.seek(0)

        return send_file(buffer,
                         as_attachment=True,
                         attachment_filename="embeddings_file.h5")
예제 #4
0
    def post(self):
        """Embed the posted sequence and send it back as an HDF5 attachment.

        Reads ``sequence`` and ``model`` (default ``'seqvec'``) from
        ``request.json``. Calls ``abort(400, ...)`` for a missing, too long
        (> 2000 characters) or invalid sequence. The result of
        ``get_embedding(model_name, sequence)`` is written to an in-memory
        HDF5 file under the dataset name ``"sequence"``.
        """
        payload = request.json
        sequence = payload.get('sequence')

        sequence_ok = (sequence and len(sequence) <= 2000
                       and check_valid_sequence(sequence))
        if not sequence_ok:
            return abort(400, "Sequence is too long or contains invalid characters.")

        model_name = payload.get('model', 'seqvec')
        embedding = get_embedding(model_name, sequence)

        buffer = io.BytesIO()
        with h5py.File(buffer, "w") as h5_out:
            h5_out.create_dataset("sequence", data=embedding)

        # Rewind: after writing, the cursor sits at the end of the buffer,
        # and flask would otherwise read it as an empty file.
        buffer.seek(0)

        return send_file(buffer,
                         as_attachment=True,
                         attachment_filename="embeddings_file.h5")
예제 #5
0
def _get_annotations_from_params(params):
    """Compute annotations for a sequence and render them in one format.

    Parameters read from *params* (a mapping):
        sequence: required; must be non-empty, at most 2000 characters and
            accepted by ``check_valid_sequence``.
        model: model name handed to ``get_features`` (default ``'seqvec'``).
        format: one of ``'legacy'`` (default), ``'full'``,
            ``'go-predictprotein'`` or ``'protvista-predictprotein'``.
        only_closest_k: when equal to ``True`` (the default), the three GO
            ontologies are passed through ``_filter_ontology`` first.

    Returns:
        * ``'protvista-predictprotein'``: a dict with ``sequence`` and a
          ``features`` list built by ``annotations_to_protvista_converter``
          from whichever per-residue predictions are present.
        * ``'legacy'``: the annotations dict, with each GO ontology replaced
          by a ``{GO_Term: highest RI}`` mapping.
        * ``'go-predictprotein'``: a list of three ontology dicts in the
          order BPO, CCO, MFO.
        * ``'full'``: the annotations dict unchanged.

    Calls ``abort(400, ...)`` for an invalid sequence or unknown format.
    """
    sequence = params.get('sequence')

    if not sequence or len(sequence) > 2000 or not check_valid_sequence(
            sequence):
        return abort(400,
                     "Sequence is too long or contains invalid characters.")

    model_name = params.get('model', 'seqvec')

    annotations = get_features(model_name, sequence)
    annotations['sequence'] = sequence

    format = params.get('format', 'legacy')
    only_closest_k = params.get('only_closest_k', True)

    # Kept as an equality test on purpose: only values equal to True switch
    # filtering on, so a truthy non-boolean such as the string "false"
    # coming from JSON does not.
    if only_closest_k == True:
        for ontology in ("predictedBPO", "predictedCCO", "predictedMFO"):
            _filter_ontology(annotations, ontology)

    if format == "protvista-predictprotein":
        source = Source(url=request.url,
                        id="sync",
                        name=f"bio_embeddings using {model_name}")

        evidence = Evidence(source=source, )

        # (annotation key, protvista feature type, enum used for decoding)
        per_residue_tracks = (
            ('predictedDSSP8',
             f"SECONDARY_STRUCTURE_8_STATES_({model_name})",
             SecondaryStructure),
            ('predictedDSSP3',
             f"SECONDARY_STRUCTURE_3_STATES_({model_name})",
             SecondaryStructure),
            ('predictedDisorder',
             f"DISORDER_({model_name})",
             Disorder),
        )

        features = []
        for key, track_type, feature_enum in per_residue_tracks:
            # Every track is gated on its OWN key. The disorder track used
            # to be gated on 'predictedDSSP3', which raised KeyError when
            # disorder was missing and dropped it when DSSP3 was missing.
            if annotations.get(key):
                features.extend(
                    annotations_to_protvista_converter(
                        features_string=annotations[key],
                        evidences=[evidence],
                        type=track_type,
                        feature_enum=feature_enum))

        return {'sequence': sequence, 'features': features}

    elif format == "legacy":
        # Collapse each ontology's prediction list to {GO_Term: best RI}.
        # All three are computed before any is written back, so a failure
        # part-way through leaves the annotations untouched.
        collapsed = {}
        for ontology in ('predictedCCO', 'predictedBPO', 'predictedMFO'):
            best_ri = {}
            for prediction in annotations[ontology]:
                term = prediction['GO_Term']
                best_ri[term] = max(best_ri.get(term, -1), prediction['RI'])
            collapsed[ontology] = best_ri

        for ontology, best_ri in collapsed.items():
            annotations[ontology] = best_ri

        return annotations

    elif format == "go-predictprotein":

        def to_go_entry(prediction):
            # The score is the RI multiplied by 100 and rounded.
            return {
                "gotermid": prediction['GO_Term'],
                "gotermname": prediction['GO_Name'],
                "gotermscore": round(prediction['RI'] * 100)
            }

        predictedCCO = {
            "ontology": "Cellular Component Ontology",
            "goTermWithScore":
            [to_go_entry(p) for p in annotations['predictedCCO']]
        }
        predictedBPO = {
            "ontology": "Biological Process Ontology",
            "goTermWithScore":
            [to_go_entry(p) for p in annotations['predictedBPO']]
        }
        predictedMFO = {
            "ontology": "Molecular Function Ontology",
            "goTermWithScore":
            [to_go_entry(p) for p in annotations['predictedMFO']]
        }

        return [predictedBPO, predictedCCO, predictedMFO]
    elif format == "full":
        return annotations
    else:
        # Returned for consistency with the sequence-validation abort above.
        return abort(400, f"Wrong format passed: {format}")
예제 #6
0
def _get_annotations_from_params(params):
    """Compute annotations for a sequence and render them in one format.

    Reads ``sequence``, ``model`` (default ``'prottrans_t5_xl_u50'``),
    ``format`` (default ``'legacy'``) and ``only_closest_k`` (default
    ``True``) from *params*. Calls ``abort(400, ...)`` when the sequence is
    missing, longer than 2000 characters or rejected by
    ``check_valid_sequence``, and when the format is not recognised.

    For the default model, and only when the
    ``'prott5_residue_landscape_annotations'`` queue is listed by
    ``get_queues()``, the output of ``get_residue_landscape`` is merged
    into the annotations (its ``meta`` entries into ``annotations['meta']``,
    everything else at top level).

    Return value by format: ``'protvista-predictprotein'`` gives a dict
    with ``sequence`` and ``features``; ``'legacy'`` gives the annotations
    with each GO ontology collapsed to ``{GO_Term: highest RI}``;
    ``'go-predictprotein'`` gives a list ``[BPO, CCO, MFO]``; ``'full'``
    gives the annotations as they are.
    """
    sequence = params.get('sequence')

    sequence_ok = (sequence and len(sequence) <= 2000
                   and check_valid_sequence(sequence))
    if not sequence_ok:
        return abort(400,
                     "Sequence is too long or contains invalid characters.")

    model_name = params.get('model', 'prottrans_t5_xl_u50')
    annotations = get_features(model_name, sequence)

    is_default_model = model_name == 'prottrans_t5_xl_u50'
    if is_default_model and 'prott5_residue_landscape_annotations' in get_queues():
        landscape = get_residue_landscape(model_name=model_name,
                                          sequence=sequence)

        # Meta information goes into the existing meta dict ...
        for meta_key, meta_value in landscape['meta'].items():
            annotations['meta'][meta_key] = meta_value
        landscape.pop('meta', None)

        # ... and every remaining entry is copied to the top level.
        for key, value in landscape.items():
            annotations[key] = value

    annotations['sequence'] = sequence

    format = params.get('format', 'legacy')
    only_closest_k = params.get('only_closest_k', True)

    if only_closest_k == True:
        for ontology in ("predictedBPO", "predictedCCO", "predictedMFO"):
            _filter_ontology(annotations, ontology)

    if format == "protvista-predictprotein":
        evidence = Evidence(source=Source(
            url=request.url,
            id="sync",
            name=f"bio_embeddings using {model_name}"), )

        # (annotation key, protvista feature type, enum used for decoding)
        tracks = (
            ('predictedDSSP8',
             f"SECONDARY_STRUCTURE_8_STATES_({model_name})",
             SecondaryStructure),
            ('predictedDSSP3',
             f"SECONDARY_STRUCTURE_3_STATES_({model_name})",
             SecondaryStructure),
            ('predictedDisorder',
             f"DISORDER_({model_name})",
             Disorder),
            ('predictedBindingMetal',
             f"BINDING_METAL_({model_name})",
             BindingResidues),
            ('predictedBindingNucleicAcids',
             f"BINDING_NUCLEIC_ACIDS_({model_name})",
             BindingResidues),
            ('predictedBindingSmallMolecules',
             f"BINDING_SMALL_MOLECULES_({model_name})",
             BindingResidues),
            ('predictedTransmembrane',
             f"TRANSMEMBRANE_({model_name})",
             MembraneResidues),
        )

        features = []
        for key, track_type, feature_enum in tracks:
            # Absent or empty predictions simply contribute no features.
            if not annotations.get(key):
                continue
            features.extend(
                annotations_to_protvista_converter(
                    features_string=annotations[key],
                    evidences=[evidence],
                    type=track_type,
                    feature_enum=feature_enum))

        return {'sequence': sequence, 'features': features}

    if format == "legacy":
        # Compute all three collapsed mappings before writing any back.
        collapsed = {}
        for ontology in ('predictedCCO', 'predictedBPO', 'predictedMFO'):
            best_ri = {}
            for prediction in annotations[ontology]:
                term = prediction['GO_Term']
                best_ri[term] = max(best_ri.get(term, -1), prediction['RI'])
            collapsed[ontology] = best_ri

        for ontology, best_ri in collapsed.items():
            annotations[ontology] = best_ri

        return annotations

    if format == "go-predictprotein":

        def to_go_entry(prediction):
            return {
                "gotermid": prediction['GO_Term'],
                "gotermname": prediction['GO_Name'],
                "gotermscore": round(prediction['RI'] * 100)
            }

        cellular = {
            "ontology": "Cellular Component Ontology",
            "goTermWithScore":
            [to_go_entry(p) for p in annotations['predictedCCO']]
        }
        biological = {
            "ontology": "Biological Process Ontology",
            "goTermWithScore":
            [to_go_entry(p) for p in annotations['predictedBPO']]
        }
        molecular = {
            "ontology": "Molecular Function Ontology",
            "goTermWithScore":
            [to_go_entry(p) for p in annotations['predictedMFO']]
        }

        return [biological, cellular, molecular]

    if format == "full":
        return annotations

    abort(400, f"Wrong format passed: {format}")
예제 #7
0
    def post(self):
        """Annotate the posted sequence with the requested model.

        Reads ``sequence``, ``model`` (default ``'seqvec'``) and ``format``
        (default ``'legacy'``) from ``request.json``. Calls
        ``abort(400, ...)`` for a missing, too long (> 2000 characters) or
        invalid sequence, and for a model name with no task in the dispatch
        table. The selected task is run through ``apply_async`` and the
        sequence is added to its result.

        For ``'protvista-predictprotein'`` the DSSP8, DSSP3 and disorder
        predictions are converted into a ``{'sequence', 'features'}`` dict;
        any other format returns the annotations as they are.
        """
        payload = request.json
        sequence = payload.get('sequence')

        sequence_ok = (sequence and len(sequence) <= 2000
                       and check_valid_sequence(sequence))
        if not sequence_ok:
            return abort(
                400, "Sequence is too long or contains invalid characters.")

        model_name = payload.get('model', 'seqvec')
        annotation_tasks = {
            'seqvec': get_seqvec_annotations_sync,
            'prottrans_bert_bfd': get_protbert_annotations_sync,
        }
        task = annotation_tasks.get(model_name)
        if not task:
            return abort(400, f"Model '{model_name}' isn't available.")

        # time_limit and soft_time_limit bound the execution time;
        # expires bounds how long the job may wait in the queue.
        execution_limit = 60 * 5
        queue_limit = 60 * 60
        job = task.apply_async(args=[sequence],
                               time_limit=execution_limit,
                               soft_time_limit=execution_limit,
                               expires=queue_limit)
        annotations = job.get()
        annotations['sequence'] = sequence

        output_format = payload.get('format', 'legacy')
        if output_format != "protvista-predictprotein":
            return annotations

        evidence = Evidence(source=Source(
            url=request.url,
            id="sync",
            name=f"bio_embeddings using {model_name}"), )

        # (annotation key, protvista feature type, enum used for decoding)
        tracks = (
            ('predictedDSSP8',
             f"SECONDARY_STRUCTURE_8_STATES_({model_name})",
             SecondaryStructure),
            ('predictedDSSP3',
             f"SECONDARY_STRUCTURE_3_STATES_({model_name})",
             SecondaryStructure),
            ('predictedDisorder',
             f"DISORDER_({model_name})",
             Disorder),
        )

        features = []
        for key, track_type, feature_enum in tracks:
            # Indexed directly: all three predictions are required here.
            features.extend(
                annotations_to_protvista_converter(
                    features_string=annotations[key],
                    evidences=[evidence],
                    type=track_type,
                    feature_enum=feature_enum))

        return {'sequence': sequence, 'features': features}