def main():
    """Classify a test sequence and print it next to the reference.

    The two file names come from ``parse_args()``; both are loaded with
    ``utils.read_sequence``. The model is loaded with ``utils.read_json``
    and handed, together with the test sequence, to
    ``DecisionTree.classify_sequence``. The test sequence, the prediction
    and the reference sequence are then passed to ``utils.print_alignment``.
    """
    # Resolve the two inputs and load their contents.
    query_path, reference_path = parse_args()
    query_sequence = utils.read_sequence(query_path)
    reference_sa = utils.read_sequence(reference_path)

    # Load the stored model.
    model = utils.read_json()

    # Predict, then show the prediction alongside the reference.
    prediction = DecisionTree.classify_sequence(query_sequence, model)
    utils.print_alignment(query_sequence, prediction, reference_sa)
def needleman_wunsch(x, iterative_method, output_dir, proteins_dir):
    """Globally align one pair of proteins and save the alignment.

    Args:
        x: A three-item iterable; the first item is ignored and the other
            two are passed to ``read_sequence`` as the protein names.
        iterative_method: When truthy, ``_needleman_wunsch_iterative`` is
            used and ``'iterative'`` is appended to the output name;
            otherwise ``_needleman_wunsch`` is used.
        output_dir: Directory joined with the generated output name.
        proteins_dir: Directory passed through to ``read_sequence``.

    The output name is the ``[1:5]`` slice of each header, the word
    ``'global'`` and, optionally, ``'iterative'``, joined by underscores.
    """
    _ignored, first_name, second_name = x
    first_header, first_seq = read_sequence(first_name, proteins_dir)
    second_header, second_seq = read_sequence(second_name, proteins_dir)

    # NOTE(review): the [1:5] slice presumably extracts a 4-character ID
    # that follows a leading '>' in the header -- confirm against the data.
    name_parts = [first_header[1:5], second_header[1:5], 'global']

    # Pick the implementation first, then make a single call.
    if iterative_method:
        name_parts.append('iterative')
        align = _needleman_wunsch_iterative
    else:
        align = _needleman_wunsch
    first_aligned, second_aligned, score = align(first_seq, second_seq)

    destination = os.path.join(output_dir, '_'.join(name_parts))
    save_alignment(first_aligned, second_aligned, score, destination)
def smith_waterman(x, iterative_method, output_dir, proteins_dir):
    """Locally align one pair of proteins and save the alignment.

    Args:
        x: A three-item iterable; the first item is ignored and the other
            two are passed to ``read_sequence`` as the protein names.
        iterative_method: When truthy, ``_smith_waterman_iterative`` is
            used and ``'iterative'`` is appended to the output name;
            otherwise ``_smith_waterman`` is used.
        output_dir: Directory joined with the generated output name.
        proteins_dir: Directory passed through to ``read_sequence``.

    The output name is the ``[1:5]`` slice of each header, the word
    ``'local'`` and, optionally, ``'iterative'``, joined by underscores.
    """
    _ignored, first_name, second_name = x
    first_header, first_seq = read_sequence(first_name, proteins_dir)
    second_header, second_seq = read_sequence(second_name, proteins_dir)

    # NOTE(review): the [1:5] slice presumably extracts a 4-character ID
    # that follows a leading '>' in the header -- confirm against the data.
    name_parts = [first_header[1:5], second_header[1:5], 'local']

    # Pick the implementation first, then make a single call.
    if iterative_method:
        name_parts.append('iterative')
        align = _smith_waterman_iterative
    else:
        align = _smith_waterman
    first_aligned, second_aligned, score = align(first_seq, second_seq)

    destination = os.path.join(output_dir, '_'.join(name_parts))
    save_alignment(first_aligned, second_aligned, score, destination)
def parse_args():
    """Read the two sequences named on the command line.

    ``sys.argv[1]`` and ``sys.argv[2]`` are each passed to
    ``utils.read_sequence``.

    Returns:
        A ``(sequence_1, sequence_2)`` tuple holding whatever
        ``utils.read_sequence`` returned for the two arguments.

    Raises:
        SystemExit: After printing ``err_msg``, with exit status 1, when
            fewer than two arguments were supplied or when reading either
            file fails.
    """
    if len(sys.argv) < 3:
        print(err_msg)
        # Non-zero status so shells and scripts can detect the usage error.
        sys.exit(1)

    try:
        sequence_1 = utils.read_sequence(sys.argv[1])
        sequence_2 = utils.read_sequence(sys.argv[2])
    except Exception:
        # File parsing has failed. Catch Exception rather than using a bare
        # ``except`` so KeyboardInterrupt and SystemExit still propagate.
        print(err_msg)
        sys.exit(1)

    return sequence_1, sequence_2
def build_feature_matrix(self):
    """Append one labelled amino-acid record per training residue.

    For every name in ``self.fasta_train`` the sequence is read from
    ``self.fasta_dir`` and its companion ``.sa`` file (same name with
    ``'.fasta'`` replaced by ``'.sa'``) from ``self.sa_dir``. Each residue
    is upper-cased and turned into a record by ``get_amino_acid``. The
    record's ``'rsa-label'`` entry is set from ``rsa_labels`` using the
    ``.sa`` character at the same position, and the record is appended to
    ``self.feature_matrix``.
    """
    for fasta_name in self.fasta_train:
        residues = utils.read_sequence(fasta_name, self.fasta_dir)
        sa_name = fasta_name.replace('.fasta', '.sa')
        exposure = utils.read_sequence(sa_name, self.sa_dir)

        for position, residue in enumerate(residues):
            # Build the amino-acid record for this residue.
            record = get_amino_acid(residue.upper())
            # Attach the label found at the same position in the .sa file.
            record['rsa-label'] = rsa_labels[exposure[position]]
            self.feature_matrix.append(record)
def evaluate_model(self):
    """Count prediction outcomes over the test set and report them.

    For every name in ``self.fasta_test`` the sequence is read from
    ``self.fasta_dir`` and its companion ``.sa`` file (same name with
    ``'.fasta'`` replaced by ``'.sa'``) from ``self.sa_dir``. Each residue
    is classified with ``self.walk_tree`` and compared with the label that
    ``rsa_labels`` gives for the ``.sa`` character at the same position.
    The resulting counts are passed to ``self.calculate_eval_metrics``.
    """
    # Keeping track of metrics (true condition, predicted condition)
    metrics = {
        (0, 0): 0,  # True negative
        (0, 1): 0,  # False positive
        (1, 0): 0,  # False negative
        (1, 1): 0,  # True positive
    }

    # For each fasta file in the testing data, walk the tree for each
    # amino acid.
    for fasta_name in self.fasta_test:
        fasta = utils.read_sequence(fasta_name, self.fasta_dir)
        sa = utils.read_sequence(fasta_name.replace('.fasta', '.sa'), self.sa_dir)

        for index, amino_acid in enumerate(fasta):
            expected_result = rsa_labels[sa[index]]
            # Upper-case the residue exactly as build_feature_matrix does,
            # so test residues are featurised the same way as training ones.
            calculated_result = self.walk_tree(get_amino_acid(amino_acid.upper()))
            metrics[expected_result, calculated_result] += 1

    self.calculate_eval_metrics(metrics)