def test_compute_aa_composition_result_complex_heteropolymer():
    """Each residue of the four-letter sequence 'AWGY' should score 0.25."""
    composition = compute_aa_composition('AWGY')
    # Every residue occurs exactly once, so all four values are expected to
    # be equal.
    for residue in ('A', 'W', 'G', 'Y'):
        assert composition[residue] == 0.25
def predict_protein_location(protein_sequence: str) -> str:
    """
    Predict the location label of a single protein sequence.

    The amino-acid composition of the sequence is computed with
    ``compute_aa_composition``, wrapped in a one-row DataFrame and passed to
    the ``predict`` method of the module-level ``model``.

    Parameters
    ----------
    protein_sequence:str
        protein sequence to classify.

    Returns
    -------
    protein_location:str
        'Membrane' when the model prediction equals 1, 'Cytoplasm' otherwise.
    """
    # NOTE(review): `model` is not defined inside this function; it is
    # presumably a module-level object loaded elsewhere -- confirm.
    features = pd.DataFrame([compute_aa_composition(protein_sequence)])
    predicted = model.predict(features)
    return 'Membrane' if predicted == 1 else 'Cytoplasm'
def run_model(file_path: str, model_path: str) -> pd.DataFrame:
    """
    Run a membrane protein prediction on a FASTA file.

    Parameters
    ----------
    file_path:str
        path to proteins in FASTA format.
    model_path:str
        path to trained model in pickle format

    Returns
    -------
    df_predictions:pd.DataFrame
        Pandas DataFrame with one row per FASTA record and two columns:
        'id' (the record identifier) and 'membrane' (the value returned by
        the model's ``predict`` for that record). When the FASTA file holds
        no records, an empty DataFrame with the same two columns is returned.
    """
    # NOTE: unpickling can execute arbitrary code; only load model files that
    # come from a trusted source.
    with open(model_path, 'rb') as handle:
        model = pickle.load(handle)

    # Collect one feature row per record and build the DataFrame once at the
    # end. Growing a DataFrame row by row is quadratic, and DataFrame.append
    # no longer exists in pandas >= 2.0.
    rows = []
    # The parser is lazy, so it has to be consumed while the file is open;
    # the context manager guarantees the handle is closed afterwards.
    with open(file_path) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            aa_composition = compute_aa_composition(str(record.seq))
            # Copy into a fresh dict so the helper's return value is not
            # mutated and every row is an independent object.
            rows.append({**aa_composition, 'id': record.id})

    df_predictions = pd.DataFrame(columns=['id', 'membrane'])
    if not rows:
        # Nothing to predict: return the empty frame instead of failing on
        # the missing 'id' column below.
        return df_predictions

    # NOTE(review): feature columns follow the key order of the composition
    # dicts -- confirm this matches the column order the model was trained on.
    df_aa_composition = pd.DataFrame(rows)
    X = df_aa_composition.drop(['id'], axis=1)
    ids = df_aa_composition['id']

    y_pred = model.predict(X)

    df_predictions['id'] = ids
    df_predictions['membrane'] = y_pred
    return df_predictions
def test_compute_aa_composition_result_simple_homopolymer():
    """A sequence made only of 'A' should report a value of 1 for 'A'."""
    homopolymer = 'AAAAAA'
    composition = compute_aa_composition(homopolymer)
    fraction_of_a = composition['A']
    assert fraction_of_a == 1
def test_compute_aa_composition_return_type():
    """The composition of a sequence should be returned as a dict."""
    result = compute_aa_composition('AWGY')
    assert isinstance(result, dict)