示例#1
0
def read_protein_from_file(file_pointer):
    """Parse one protein record from an open text file.

    The record consists of bracketed header lines (``[ID]``, ``[PRIMARY]``,
    ``[EVOLUTIONARY]``, ``[SECONDARY]``, ``[TERTIARY]``, ``[MASK]``), each
    followed by its payload line(s).  Lines matching no header are skipped.

    Returns:
        The dict of sections collected so far as soon as a blank line is read,
        or ``None`` when end-of-file is reached first (whatever was collected
        up to that point is dropped).
    """
    # Secondary-structure letters map to the integers 0..7 in this order.
    dssp_codes = dict(zip('LHBEGITS', range(8)))
    mask_codes = {'-': 0, '+': 1}
    record = {}

    def payload():
        # The next line minus its final character (the newline terminator).
        return file_pointer.readline()[:-1]

    def float_rows(count):
        # The next `count` lines, each split on whitespace and parsed as floats.
        return [[float(token) for token in file_pointer.readline().split()]
                for _ in range(count)]

    # readline() yields '' only at end-of-file, which ends the iteration.
    for header in iter(file_pointer.readline, ''):
        if header == '\n':
            return record
        if header == '[ID]\n':
            record['id'] = payload()
        elif header == '[PRIMARY]\n':
            record['primary'] = encode_primary_string(payload())
        elif header == '[EVOLUTIONARY]\n':
            record['evolutionary'] = float_rows(21)
        elif header == '[SECONDARY]\n':
            record['secondary'] = [dssp_codes[letter] for letter in payload()]
        elif header == '[TERTIARY]\n':
            # 3 dimension
            record['tertiary'] = float_rows(3)
        elif header == '[MASK]\n':
            record['mask'] = [mask_codes[symbol] for symbol in payload()]
    return None
def read_protein_from_file(file_pointer):
    """Read the next protein record from an open text file.

    A record is a series of bracketed header lines, each followed by its
    payload:

    * ``[ID]``           -> ``'id'``: the next line without its newline.
    * ``[PRIMARY]``      -> ``'primary'``: the next line (without its newline)
      passed through ``encode_primary_string``.
    * ``[EVOLUTIONARY]`` -> ``'evolutionary'``: the next 21 lines, each split
      on whitespace and converted to a list of floats.
    * ``[SECONDARY]``    -> ``'secondary'``: one integer code (0-7) per
      character of the next line.
    * ``[TERTIARY]``     -> ``'tertiary'``: the next 3 lines, each split on
      whitespace and converted to a list of floats.
    * ``[MASK]``         -> ``'mask'``: 0 for ``-`` and 1 for ``+``, one per
      character of the next line.

    Lines that match none of the headers are skipped.

    Returns:
        A dict holding the sections seen so far when a blank line is read, or
        ``None`` when end-of-file is reached before a blank line.

    Raises:
        KeyError: a ``[SECONDARY]`` or ``[MASK]`` payload contains a character
            that has no code.
        ValueError: an ``[EVOLUTIONARY]`` or ``[TERTIARY]`` token is not a
            valid float.
    """
    dict_ = {}
    _dssp_dict = {
        'L': 0,
        'H': 1,
        'B': 2,
        'E': 3,
        'G': 4,
        'I': 5,
        'T': 6,
        'S': 7,
    }
    _mask_dict = {'-': 0, '+': 1}

    while True:
        next_line = file_pointer.readline()
        if next_line == '[ID]\n':
            # [:-1] drops the trailing newline of the payload line.
            dict_['id'] = file_pointer.readline()[:-1]
        elif next_line == '[PRIMARY]\n':
            dict_['primary'] = encode_primary_string(
                file_pointer.readline()[:-1])
        elif next_line == '[EVOLUTIONARY]\n':
            # 21 consecutive lines of whitespace-separated numbers.
            dict_['evolutionary'] = [
                [float(step) for step in file_pointer.readline().split()]
                for _ in range(21)
            ]
        elif next_line == '[SECONDARY]\n':
            dict_['secondary'] = [
                _dssp_dict[dssp] for dssp in file_pointer.readline()[:-1]
            ]
        elif next_line == '[TERTIARY]\n':
            # 3 dimension: one line of numbers per dimension.
            dict_['tertiary'] = [
                [float(coord) for coord in file_pointer.readline().split()]
                for _ in range(3)
            ]
        elif next_line == '[MASK]\n':
            dict_['mask'] = [
                _mask_dict[aa] for aa in file_pointer.readline()[:-1]
            ]
        elif next_line == '\n':
            # A blank line terminates the record.
            return dict_
        elif next_line == '':
            # readline() returns '' only at end-of-file.
            return None
示例#3
0
def read_protein_from_file(file_pointer):
    """Read one protein record from an open text file into a dict.

    Each recognised header line (``[ID]``, ``[PRIMARY]``, ``[EVOLUTIONARY]``,
    ``[SECONDARY]``, ``[TERTIARY]``, ``[MASK]``) is followed by its payload,
    which is stored under the lower-case section name.  Secondary-structure
    letters (the DSSP alphabet ``L H B E G I T S``) are stored as the integers
    0-7, mask symbols ``-``/``+`` as 0/1.  Unrecognised lines are ignored.

    Returns:
        The dict of parsed sections when a blank line is reached, or ``None``
        if end-of-file comes first.
    """
    secondary_codes = {
        'L': 0, 'H': 1, 'B': 2, 'E': 3,
        'G': 4, 'I': 5, 'T': 6, 'S': 7,
    }
    mask_codes = {'-': 0, '+': 1}

    def next_payload():
        # Payload line with its last character (the newline) removed.
        return file_pointer.readline()[:-1]

    def next_float_rows(n_rows):
        # Consume n_rows lines; each becomes a list of floats.
        rows = []
        for _ in range(n_rows):
            rows.append(list(map(float, file_pointer.readline().split())))
        return rows

    # Header line -> (result key, callable that consumes and parses the payload).
    section_readers = {
        '[ID]\n': ('id', next_payload),
        '[PRIMARY]\n': (
            'primary', lambda: encode_primary_string(next_payload())),
        '[EVOLUTIONARY]\n': ('evolutionary', lambda: next_float_rows(21)),
        '[SECONDARY]\n': (
            'secondary',
            lambda: [secondary_codes[letter] for letter in next_payload()]),
        # 3 dimension
        '[TERTIARY]\n': ('tertiary', lambda: next_float_rows(3)),
        '[MASK]\n': (
            'mask', lambda: [mask_codes[symbol] for symbol in next_payload()]),
    }

    parsed = {}
    while True:
        line = file_pointer.readline()
        if line == '':
            # End-of-file before the terminating blank line.
            return None
        if line == '\n':
            return parsed
        if line in section_readers:
            key, read_section = section_readers[line]
            parsed[key] = read_section()
示例#4
0
def main():
    """Run a saved model on one hard-coded sequence and write the result as PDB.

    Loads the model stored at a fixed path under ``output/models``, encodes the
    sequence with ``encode_primary_string``, calls the model, and passes the
    predicted dihedral angles at index 0 of the second axis to
    ``get_structure_from_angles`` / ``write_to_pdb``.
    """
    model_path = "output/models/2019-01-30_00_38_46-TRAIN-LR0_01-MB1.model"
    input_sequences = [
        "SRSLVISTINQISEDSKEFYFTLDNGKTMFPSNSQAWGGEKFENGQRAFVIFNELEQPVNGYDYNIQVRDITKVLTKEIVTMDDEE"
        "NTEEKIGDDKINATYMWISKDKKYLTIEFQYYSTHSEDKKHFLNLVINNKDNTDDEYINLEFRHNSERDSPDHLGEGYVSFKLDKI"
        "EEQIEGKKGLNIRVRTLYDGIKNYKVQFP",
    ]

    # NOTE(review): torch.load relies on pickle; only load trusted model files.
    model = torch.load(model_path)

    input_sequences_encoded = [
        torch.LongTensor(encode_primary_string(sequence))
        for sequence in input_sequences
    ]

    # Only the dihedral angles are used; the other two outputs are ignored.
    model_outputs = model(input_sequences_encoded)
    predicted_dihedral_angles, _predicted_backbone_atoms, _batch_sizes = \
        model_outputs

    structure = get_structure_from_angles(input_sequences_encoded[0],
                                          predicted_dihedral_angles[:, 0])
    write_to_pdb(structure, "myprediction")

    print("Wrote prediction to output/protein_myprediction.pdb")
示例#5
0
def predict():
    """Export a saved model to ONNX using one hard-coded example sequence.

    The model file is the entry of ``output/models/*`` with the largest
    ``os.path.getctime`` value.  The encoded example sequence is handed to
    ``onnx_from_model`` together with the fixed output path
    ``./tests/output/openprotein.onnx``.

    Raises:
        ValueError: ``output/models`` has no entries (``max`` of an empty list).
    """
    # '*' matches every entry; narrow the pattern (e.g. '*.csv') if needed.
    candidate_paths = glob.glob('output/models/*')
    model_path = max(candidate_paths, key=os.path.getctime)

    print("Generating ONNX from model:", model_path)
    # NOTE(review): torch.load relies on pickle; only load trusted model files.
    model = torch.load(model_path)

    input_sequences = [
        "SRSLVISTINQISEDSKEFYFTLDNGKTMFPSNSQAWGGEKFENGQRAFVIFNELEQPVNGYDYNIQVRDITKVLTKEIVTMDDEE"
        "NTEEKIGDDKINATYMWISKDKKYLTIEFQYYSTHSEDKKHFLNLVINNKDNTDDEYINLEFRHNSERDSPDHLGEGYVSFKLDKI"
        "EEQIEGKKGLNIRVRTLYDGIKNYKVQFP",
    ]
    input_sequences_encoded = [
        torch.IntTensor(encode_primary_string(sequence))
        for sequence in input_sequences
    ]

    print("Exporting to ONNX...")

    output_path = "./tests/output/openprotein.onnx"
    onnx_from_model(model, input_sequences_encoded, output_path)

    print("Wrote ONNX to", output_path)
示例#6
0
def prediction():
    """Command-line entry point: predict a structure for ``--input_sequence``.

    Arguments (read from ``sys.argv``; unknown arguments are ignored):

    * ``--input_sequence``: sequence string to predict (required).
    * ``--model_path``: model file to load.  When omitted, the entry of
      ``output/models/*`` with the largest ``os.path.getctime`` is used.
    * ``--use_gpu``: boolean given as text (``true``/``false``, ``1``/``0``,
      ``yes``/``no``, ...); defaults to False.

    The model is called on the encoded sequence.  If it returns an empty list
    in place of dihedral angles, the angles are derived from the predicted
    backbone atoms with ``calculate_dihedral_angles_over_minibatch``.  The
    structure is then written through ``write_to_pdb`` under the name
    ``"prediction"``.

    Raises:
        SystemExit: argparse rejects the command line (missing sequence,
            unparsable ``--use_gpu`` value, or no model file available).
    """

    def _parse_bool(text):
        # argparse's ``type=bool`` calls bool() on the raw string, so every
        # non-empty value -- including "False" -- would be parsed as True.
        lowered = text.strip().lower()
        if lowered in ('1', 'true', 't', 'yes', 'y', 'on'):
            return True
        if lowered in ('', '0', 'false', 'f', 'no', 'n', 'off'):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got %r" % text)

    parser = argparse.ArgumentParser(
        description="OpenProtein - Prediction CLI")
    parser.add_argument('--input_sequence', dest='input_sequence')
    # The default is resolved after parsing so that an explicit --model_path
    # works even when output/models is empty or absent.
    parser.add_argument('--model_path', dest='model_path', default=None)
    parser.add_argument('--use_gpu',
                        dest='use_gpu',
                        default=False,
                        type=_parse_bool)

    args, _ = parser.parse_known_args()

    if args.input_sequence is None:
        parser.error("--input_sequence is required")

    model_path = args.model_path
    if model_path is None:
        list_of_files = glob.glob('output/models/*')
        if not list_of_files:
            parser.error("no model found under output/models/; "
                         "pass --model_path explicitly")
        model_path = max(list_of_files, key=os.path.getctime)

    print("Using model:", model_path)

    # NOTE(review): torch.load relies on pickle; only load trusted model files.
    model = torch.load(model_path)

    input_sequences = [args.input_sequence]

    input_sequences_encoded = [
        torch.IntTensor(encode_primary_string(aa)) for aa in input_sequences
    ]

    predicted_dihedral_angles, predicted_backbone_atoms, batch_sizes = \
        model(input_sequences_encoded)

    # An empty list means the model produced no angles; checking the type
    # first avoids comparing a tensor against a list with ``==``.
    if (isinstance(predicted_dihedral_angles, list)
            and not predicted_dihedral_angles):
        predicted_dihedral_angles, _ = calculate_dihedral_angles_over_minibatch(
            predicted_backbone_atoms, batch_sizes, args.use_gpu)

    write_to_pdb(
        get_structure_from_angles(input_sequences_encoded[0],
                                  predicted_dihedral_angles[:, 0]),
        "prediction")

    print("Wrote prediction to output/protein_prediction.pdb")
示例#7
0
def read_protein_from_file(file_pointer):
    """Read one protein record and trim it to the residues that have coordinates.

    Bracketed header lines (``[ID]``, ``[PRIMARY]``, ``[EVOLUTIONARY]``,
    ``[SECONDARY]``, ``[TERTIARY]``, ``[MASK]``) are each followed by their
    payload, which is stored under the lower-case section name.
    Secondary-structure letters (DSSP alphabet ``L H B E G I T S``) are stored
    as the integers 0-7 and mask symbols ``-``/``+`` as 0/1.

    When the ``[MASK]`` section is read, the mask decides what happens to the
    sections parsed so far:

    * a run of masked-out residues between resolved ones, or no resolved
      residue at all: the record is left untouched and flagged for discarding;
    * otherwise: masked-out residues at either end are cut from ``primary``,
      ``secondary`` (if present), ``mask``, every ``evolutionary`` row and
      every ``tertiary`` row (three values per residue).

    The trimming works on sections already parsed, so ``[MASK]`` has to come
    after ``[PRIMARY]``, ``[EVOLUTIONARY]`` and ``[TERTIARY]`` in the record.

    A record without a ``[SECONDARY]`` section gets a ``secondary`` list filled
    with 8 (a value outside the 0-7 codes), one entry per ``primary`` element.

    Returns:
        ``(record, missing_internal_aa)`` when a blank line or end-of-file
        closes a record; ``(None, False)`` at end-of-file when nothing was
        read.  ``missing_internal_aa`` is False for records without a
        ``[MASK]`` section.

    Raises:
        KeyError: a section required at that point is missing (for example a
            blank line before any ``[PRIMARY]``/``[SECONDARY]`` section), or a
            payload contains a character that has no code.
    """
    dict_ = {}
    _dssp_dict = {'L': 0, 'H': 1, 'B': 2, 'E': 3, 'G': 4, 'I': 5, 'T': 6, 'S': 7}
    _mask_dict = {'-': 0, '+': 1}
    # Bound up front: a record without a [MASK] section never reaches the mask
    # analysis, yet both return paths below report this flag.
    missing_internal_aa = False

    while True:
        next_line = file_pointer.readline()
        if next_line == '[ID]\n':
            # [:-1] drops the trailing newline of the payload line.
            dict_['id'] = file_pointer.readline()[:-1]
        elif next_line == '[PRIMARY]\n':
            dict_['primary'] = encode_primary_string(file_pointer.readline()[:-1])
        elif next_line == '[EVOLUTIONARY]\n':
            dict_['evolutionary'] = [
                [float(step) for step in file_pointer.readline().split()]
                for _residue in range(21)]
        elif next_line == '[SECONDARY]\n':
            dict_['secondary'] = [_dssp_dict[dssp] for dssp in file_pointer.readline()[:-1]]
        elif next_line == '[TERTIARY]\n':
            # 3 dimension
            dict_['tertiary'] = [
                [float(coord) for coord in file_pointer.readline().split()]
                for _axis in range(3)]
        elif next_line == '[MASK]\n':
            mask = [_mask_dict[aa] for aa in file_pointer.readline()[:-1]]
            dict_['mask'] = mask
            mask_str = ''.join(map(str, mask))

            write_out("-------------")
            # The id is looked up in the record so a missing [ID] section does
            # not raise an unbound-variable error here.
            write_out("Reading the protein " + dict_.get('id', ''))
            # Check for missing AA coordinates
            if re.search(r'1+0+1+', mask_str) is not None:       # indicates missing coordinates
                missing_internal_aa = True
                write_out("One or more internal coordinates missing. Protein is discarded.")
            elif re.search(r'^0*$', mask_str) is not None:       # indicates no coordinates at all
                missing_internal_aa = True
                write_out("One or more internal coordinates missing. It will be discarded.")
            else:
                missing_internal_aa = False
                if mask[0] == 0:
                    write_out("Missing coordinates in the N-terminal end. Truncating protein.")
                # investigate when the sequence with coordinates start and finish
                sequence_start = mask_str.index('1')
                sequence_end = len(mask)     # no C-terminal truncation unless found below
                c_term_gap = mask_str.find('10')
                if c_term_gap != -1:         # missing coords in the C-term end
                    sequence_end = c_term_gap + 1
                    write_out("Missing coordinates in the C-term end. Truncating protein.")
                write_out("Analyzing amino acids", sequence_start + 1, "-", sequence_end)

                # split lists in dict to have the seq with coords
                # separated from what should not be analysed
                if 'secondary' in dict_:
                    dict_['secondary'] = dict_['secondary'][sequence_start:sequence_end]
                dict_['primary'] = dict_['primary'][sequence_start:sequence_end]
                dict_['mask'] = mask[sequence_start:sequence_end]
                dict_['evolutionary'] = [
                    row[sequence_start:sequence_end] for row in dict_['evolutionary']]
                # Each residue occupies three consecutive values per tertiary row.
                dict_['tertiary'] = [
                    row[sequence_start * 3:sequence_end * 3] for row in dict_['tertiary']]

        elif next_line in ('\n', ''):
            # '' is end-of-file; with nothing parsed there is no record left.
            if next_line == '' and not dict_:
                return None, False
            if 'secondary' not in dict_:
                dict_['secondary'] = [8] * len(dict_['primary'])
            else:
                # NOTE(review): looks like leftover debug output -- confirm
                # before removing.
                print("*" * 10, dict_['secondary'])
            return dict_, missing_internal_aa
示例#8
0
# This file is part of the OpenProtein project.
#
# @author Jeppe Hallgren
#
# For license information, please see the LICENSE file in the root directory.

import torch

from util import encode_primary_string, get_structure_from_angles, write_to_pdb, \
    calculate_dihedral_angles_over_minibatch

# Sequence(s) to predict, one string per protein.
input_sequences = [
    "SRSLVISTINQISEDSKEFYFTLDNGKTMFPSNSQAWGGEKFENGQRAFVIFNELEQPVNGYDYNIQVRDITKVLTKEIVTMDDEENTEEKIGDDKINATYMWISKDKKYLTIEFQYYSTHSEDKKHFLNLVINNKDNTDDEYINLEFRHNSERDSPDHLGEGYVSFKLDKIEEQIEGKKGLNIRVRTLYDGIKNYKVQFP",
]
# Serialized model to run the prediction with.
model_path = "output/models/2019-01-30_00_38_46-TRAIN-LR0_01-MB1.model"

# NOTE(review): torch.load relies on pickle; only load trusted model files.
model = torch.load(model_path)

# One LongTensor per input sequence, built from encode_primary_string's output.
# NOTE(review): "senquences" is a typo; the name is kept as-is so that any
# code referring to this module-level variable keeps working.
input_senquences_encoded = [
    torch.LongTensor(encode_primary_string(aa)) for aa in input_sequences
]

# The model returns three values; only the dihedral angles are used below.
predicted_dihedral_angles, predicted_backbone_atoms, batch_sizes = \
    model(input_senquences_encoded)

# Build the structure for the first sequence from the angles at index 0 of the
# second axis and write it out under the name "myprediction".
write_to_pdb(
    get_structure_from_angles(
        input_senquences_encoded[0],
        predicted_dihedral_angles[:, 0],
    ),
    "myprediction",
)

print("Wrote prediction to output/protein_myprediction.pdb")