def test_mast(self):
    """Run MAST on the fixture inputs and compare with the pickled reference.

    MAST is run with ``self.input_memefile`` against ``self.input_seqfile``
    into ``self.output``; the resulting ``mast.txt`` is parsed and must equal
    the dict unpickled from ``self.ref_output``. ``self.success`` is set to
    True only after the comparison passes.
    """
    out_dir = self.output
    meme_interface.run_mast(self.input_memefile, self.input_seqfile, out_dir)
    actual = bio_interface.parse_mast_file(os.path.join(out_dir, "mast.txt"))

    with open(self.ref_output, 'rb') as handle:
        expected = pickle.load(handle)

    self.assertDictEqual(actual, expected)
    self.success = True
def build_output_matrix_from_aligned(selected_seqs, aligned_seqs, output,
                                     composition=None):
    """Write a residue-count matrix built from MAST hits of an aligned motif.

    A MEME file is built from ``aligned_seqs``, MAST is run with it against a
    cleaned copy of ``selected_seqs``, and each reported motif is converted to
    a sub-sequence with ``extract_subseq_from_seqs``. Residues found in
    ``generic.AA1_TO_I`` are counted per position into a 30 x 20 integer
    matrix, which is written to ``output`` with one row per line and every
    value followed by a single space.

    Example inputs used during development (all under ``paths.ROOT``):
    ``mg_seqs.fasta`` / ``mg_aligned.txt`` and
    ``efhand_aligned_seqs.fasta`` / ``efhand_aligned.txt``.

    :param selected_seqs: path to the fasta file that MAST is run against.
    :param aligned_seqs: path to the aligned sequences defining the motif.
    :param output: path the count matrix is written to.
    :param composition: path to a composition file; when None, one is
        generated from ``uniprot_full.fasta`` under ``paths.ROOT``.
    :return: number of sub-sequences that were counted into the matrix.
    """
    final_len = 30  # length of the motif actually written to the matrix

    # A composition file should normally be supplied; build one otherwise.
    if composition is None:
        composition = os.path.join(paths.ROOT, "uniprot_full_composition.txt")
        build_composition.build(
            os.path.join(paths.ROOT, "uniprot_full.fasta"), composition)

    # Intermediate artefacts.
    mast_dir = paths.MEME_MAST_FOLDER
    meme_txt = paths.TMP_FILE_TEMPLATE.format(1)
    cleaned_fasta = paths.TMP_FILE_TEMPLATE.format(2)

    init_len = preprocess._get_motif_len_from_aligned(aligned_seqs)
    clean_fasta_alphabet.screen(fasta_path=selected_seqs,
                                output_path=cleaned_fasta)
    filter_seqs.delete_short_seqs(cleaned_fasta, threshold=final_len)
    build_meme_from_aligned.build(aligned_seqs, meme_txt, composition)
    meme_interface.run_mast(meme_txt, cleaned_fasta, mast_dir)

    hits_by_acc = meme_interface.extract_motifs_mast_uniprot(
        os.path.join(mast_dir, 'mast.txt'), init_len)
    seq_by_acc = get_pname_seq.parse_raw(cleaned_fasta)

    # Collect one sub-sequence per usable MAST hit.
    matched = []
    for acc, hits in hits_by_acc.items():
        if acc not in seq_by_acc:
            continue
        for hit in hits:
            fragment = extract_subseq_from_seqs(
                hit, seq_by_acc[acc], init_len, final_len)
            if fragment:
                matched.append(fragment)

    # Tally residues per motif position; unknown characters are ignored.
    aa_to_col = generic.AA1_TO_I
    counts = np.zeros((final_len, 20), dtype=int)
    for fragment in matched:
        for pos, residue in enumerate(fragment):
            if residue in aa_to_col:
                counts[pos][aa_to_col[residue]] += 1

    with open(output, 'w') as handle:
        for row in counts:
            handle.write("".join(str(cell) + " " for cell in row) + "\n")

    return len(matched)
def get_motif_pos_from_output_matrix(output_matrix, num_seqs, pdb_seq_file,
                                     output, motif_len=30):
    """Locate motif positions in PDB chains starting from a count matrix.

    Steps, in order:
      1. Run converge on ``output_matrix`` to produce ``output_meme.txt``
         under ``paths.ROOT``.
      2. Run MAST with that file against ``pdb_seq_file`` and read the
         (pdb_id, cid) keys from the resulting diagrams.
      3. Fetch each chain's sequence through ``pdb_interface.get_seq_for``,
         skipping chains that return None or raise, and write the remaining
         sequences to a temporary fasta file.
      4. Run MAST again on that fasta file, adjust the diagrams with
         ``meme_interface._adjust_motif_diagram`` and pickle the result,
         keyed by pdb_id, to ``output``.

    :param output_matrix: path to the matrix handed to ``conv_interface.run``.
    :param num_seqs: sequence count handed to ``conv_interface.run``.
    :param pdb_seq_file: fasta file used for the first MAST run.
    :param output: path of the pickle that receives the motif positions.
    :param motif_len: motif length used for converge, for the short-sequence
        filter and for the diagram adjustment.
    """
    from converge import conv_interface

    meme_out = os.path.join(paths.ROOT, "output_meme.txt")
    mast_dir = paths.MEME_MAST_FOLDER

    conv_interface.run(output_matrix, motif_len, num_seqs, meme_out)
    meme_interface.run_mast(meme_out, pdb_seq_file, mast_dir)
    first_pass = _get_motif_diagram_mast_pdb(os.path.join(mast_dir, 'mast.txt'))

    # Pull the sequence of every chain MAST reported, logging failures.
    chain_seqs = dict()
    for idx, (pdb_id, cid) in enumerate(first_pass.keys()):
        if i_is_progress_tick := (idx % 10 == 0):
            print(idx)
        try:
            chain_seq = pdb_interface.get_seq_for(pdb_id, cid=cid)
        except Exception as e:
            logging.error(f"get_seq_for() fails for pdb_id/cid {pdb_id}/{cid}. "
                          f"Skipping.")
            logging.error(f"Traceback: <{traceback.format_exc()}>")
            logging.error(f"Error_msg: <{e}>\n\n")
            continue
        if chain_seq is None:
            continue
        chain_seqs[(pdb_id, cid)] = chain_seq

    # Keep a pickled copy of the fetched sequences.
    cache_path = paths.TMP_FILE_TEMPLATE.format("tmp_pdb_cid_seq.pkl")
    with open(cache_path, 'wb') as handle:
        pickle.dump(chain_seqs, handle, pickle.HIGHEST_PROTOCOL)

    chain_seqs = OrderedDict(sorted(chain_seqs.items()))
    chain_fasta = paths.TMP_FILE_TEMPLATE.format(12)
    with open(chain_fasta, 'w') as handle:
        for (pdb_id, cid), chain_seq in chain_seqs.items():
            handle.write(f">{pdb_id}_{cid}\n")
            handle.write(chain_seq + "\n")

    # NOTE(review): this screens pdb_seq_file, not the fasta file written
    # just above, although only the latter is used from here on -- confirm
    # which file was intended.
    clean_fasta_alphabet.screen(pdb_seq_file)
    filter_seqs.delete_short_seqs(chain_fasta, motif_len)

    meme_interface.run_mast(meme_out, chain_fasta, mast_dir)
    second_pass = _get_motif_diagram_mast_pdb(os.path.join(mast_dir, 'mast.txt'))
    adjusted = meme_interface._adjust_motif_diagram(second_pass, motif_len)

    # One entry per pdb_id; a later chain of the same pdb_id overwrites an
    # earlier one.
    by_pdb = defaultdict(dict)
    for (pdb_id, cid), markers in adjusted.items():
        entry = by_pdb[pdb_id]
        entry['sno_markers'] = markers
        entry['cid'] = cid
    by_pdb = OrderedDict(sorted(by_pdb.items()))

    with open(output, 'wb') as handle:
        pickle.dump(by_pdb, handle, pickle.HIGHEST_PROTOCOL)
def setup_meme_suite_mast():
    """Run MAST on the test inputs and pickle the parsed diagrams.

    MAST output goes to an ``output_mast`` folder inside the debug folder
    returned by ``generic.setup_debug_folder``; the parsed ``mast.txt`` is
    pickled to ``paths_test.REF_MAST_DIAGRAMS``.
    """
    debug_dir = generic.setup_debug_folder(paths_test.DEBUG)
    mast_dir = os.path.join(debug_dir, "output_mast")

    seq_path = paths_test.MEME_TEST_SEQ
    meme_path = paths_test.REF_MEME_TXT
    reference_pickle = paths_test.REF_MAST_DIAGRAMS

    meme_interface.run_mast(meme_path, seq_path, mast_dir)
    parsed = bio_interface.parse_mast_file(os.path.join(mast_dir, "mast.txt"))

    with open(reference_pickle, 'wb') as handle:
        pickle.dump(parsed, handle, pickle.HIGHEST_PROTOCOL)
def find_mast(pname_cid_map, seq_file, ref_meme_txt, motif_len, meme_folder):
    """Run MAST with a reference MEME file and return assembled motif positions.

    MAST is run with ``ref_meme_txt`` against ``seq_file`` into
    ``meme_folder``. The motifs extracted from the resulting ``mast.txt`` are
    passed through ``_delete_gapped_motifs`` and then combined with
    ``pname_cid_map`` by ``_assemble_motif_pos``.

    :param pname_cid_map: mapping handed to ``_assemble_motif_pos``.
    :param seq_file: sequence file MAST is run against.
    :param ref_meme_txt: path (str) to an existing MEME file.
    :param motif_len: motif length, an int >= 1.
    :param meme_folder: folder that receives the MAST output.
    :return: whatever ``_assemble_motif_pos`` produces.
    :raises AssertionError: if ``motif_len`` or ``ref_meme_txt`` is invalid.
    """
    # Type checks run before value checks so that a non-numeric motif_len
    # fails validation instead of raising TypeError from the comparison.
    assert isinstance(motif_len, int), \
        f"motif_len must be an int, got {type(motif_len).__name__}"
    assert motif_len >= 1, f"motif_len must be >= 1, got {motif_len}"
    assert isinstance(ref_meme_txt, str), \
        f"ref_meme_txt must be a str, got {type(ref_meme_txt).__name__}"
    assert os.path.isfile(ref_meme_txt), \
        f"ref_meme_txt is not an existing file: {ref_meme_txt}"

    meme_interface.run_mast(ref_meme_txt, seq_file, meme_folder)
    mast_txt_path = os.path.join(meme_folder, 'mast.txt')
    seq_motif_map = meme_interface.extract_motifs_mast(mast_txt_path,
                                                       motif_len)
    seq_motif_map = _delete_gapped_motifs(seq_motif_map, seq_file)
    return _assemble_motif_pos(seq_motif_map, pname_cid_map)
def _build_seq_motif_map(process, tmp_output_folder, seq_file, motif_len,
                         num_p=1, ref_meme_txt=None):
    """Run MEME or MAST on ``seq_file`` and return the extracted motif map.

    :param process: ``'meme'`` to run MEME, ``'mast'`` to run MAST.
    :param tmp_output_folder: folder that receives the tool output
        (``meme.txt`` or ``mast.txt`` is read back from it).
    :param seq_file: sequence file handed to the tool.
    :param motif_len: motif length handed to the extraction routine.
    :param num_p: passed to ``meme_interface.run_meme``; unused for MAST.
    :param ref_meme_txt: MEME file used by MAST; required when
        ``process == 'mast'``, unused for MEME.
    :return: the map produced by ``extract_motifs_meme`` /
        ``extract_motifs_mast``.
    :raises ValueError: if ``process`` is not recognised, or if MAST is
        requested without ``ref_meme_txt``.
    """
    if process == 'meme':
        meme_interface.run_meme(seq_file, tmp_output_folder, num_p)
        meme_txt_path = os.path.join(tmp_output_folder, 'meme.txt')
        return meme_interface.extract_motifs_meme(meme_txt_path, motif_len)

    if process == 'mast':
        # Fail early with a clear message rather than handing None to MAST.
        if ref_meme_txt is None:
            raise ValueError("process='mast' requires ref_meme_txt.")
        meme_interface.run_mast(ref_meme_txt, seq_file, tmp_output_folder)
        mast_txt_path = os.path.join(tmp_output_folder, 'mast.txt')
        return meme_interface.extract_motifs_mast(mast_txt_path, motif_len)

    # ValueError is still an Exception, so existing broad handlers keep working.
    raise ValueError(
        f"Unknown process {process!r}; expected 'meme' or 'mast'.")
def seq_aligned_converge_testing():
    """Check conv_rewrite against the original converge (written 16/10/2019).

    The comparison is mainly about the tracking of maxS. The aligned
    sequences are turned into a MEME file, that file is converted with
    meme_to_conv so it can serve as seed matrix, and converge is then run on
    the count matrix derived from the MAST hits.

    The aligned sequences must not contain gaps.
    todo: test for that?
    """
    root = paths.ROOT

    # Known matched sequences.
    seq_fasta = os.path.join(root, 'mg_seqs.fasta')
    aligned_txt = os.path.join(root, 'mg_aligned.txt')
    final_len = 30  # length of the actual motif
    matrix_path = os.path.join(paths.USER_OUTPUT,
                               "seq_aligned_converge_output.txt")

    # A composition file should normally be supplied; it is generated here.
    composition = os.path.join(root, "uniprot_full_composition.txt")
    build_composition.build(os.path.join(root, "uniprot_full.fasta"),
                            composition)

    # Intermediate artefacts.
    mast_dir = paths.MEME_MAST_FOLDER
    meme_txt = paths.TMP_FILE_TEMPLATE.format(1)
    cleaned_fasta = paths.TMP_FILE_TEMPLATE.format(2)

    init_len = preprocess._get_motif_len_from_aligned(aligned_txt)
    clean_fasta_alphabet.screen(fasta_path=seq_fasta,
                                output_path=cleaned_fasta)
    filter_seqs.delete_short_seqs(cleaned_fasta, threshold=final_len)
    build_meme_from_aligned.build(aligned_txt, meme_txt, composition)
    meme_interface.run_mast(meme_txt, cleaned_fasta, mast_dir)

    hits_by_acc = meme_interface.extract_motifs_mast_uniprot(
        os.path.join(mast_dir, 'mast.txt'), init_len)
    seq_by_acc = get_pname_seq.parse_raw(cleaned_fasta)

    # Collect one sub-sequence per usable MAST hit.
    matched = []
    for acc, hits in hits_by_acc.items():
        if acc not in seq_by_acc:
            continue
        for hit in hits:
            fragment = extract_subseq_from_seqs(
                hit, seq_by_acc[acc], init_len, final_len)
            if fragment:
                matched.append(fragment)

    # Write the matched fragments out as a fasta-style file and build a MEME
    # file from them.
    matched_path = os.path.join(root, "aligned")
    with open(matched_path, 'w') as handle:
        for idx, fragment in enumerate(matched):
            handle.write(">RAND_{}\n".format(idx))
            handle.write(fragment + "\n")

    testing_meme = os.path.join(root, "output_meme_for_testing")
    build_meme_from_aligned.build(matched_path, testing_meme, composition)

    from converge import conv_interface

    seed_path = os.path.join(root, "conv_seed")
    conv_interface.convert_meme_to_conv(testing_meme, composition, seed_path,
                                        minimal=True)

    # Tally residues per motif position; unknown characters are ignored.
    aa_to_col = generic.AA1_TO_I
    counts = np.zeros((final_len, 20), dtype=int)
    for fragment in matched:
        for pos, residue in enumerate(fragment):
            if residue in aa_to_col:
                counts[pos][aa_to_col[residue]] += 1

    # Comma-separated rows, no trailing comma.
    with open(matrix_path, 'w') as handle:
        for row in counts:
            handle.write(",".join(str(cell) for cell in row) + "\n")

    print(f"Kmatches is {len(matched)}.")

    # Next, run converge. Assumes the proteome and blosum are already encoded.
    converge_meme = os.path.join(root, "output_meme.txt")
    conv_interface.run(matrix_path, final_len, len(matched), converge_meme)
def run_prosite_aligned(seq_file, aligned_seq_file, output, storage_path=None,
                        motif_len=14):
    """Derive PDB motif positions from a sequence file and its alignment.

    We start off with a seq_file and an aligned_seq_file.
    We derive the matrix from aligned_seq_file.
    We derive the composition from seq_file.
    We put both together into a meme.txt.

    We take seq_file, and extract from it the relevant .pdb files and the
    corresponding cid. So acc+seq => pdb+cid.

    We therefore select the sequences for which a corresponding .pdb+cid
    exists, and output them into a cropped_seqfile.

    We run mast on this cropped_seqfile, to obtain motif_pos.

    Finally we output this motif_pos using the pdb+cid from before.

    Desired intermediate files:
        meme.txt from aligned_seq
        acc=>pdb+cid map
        cropped_seqfile.fasta
        acc=>motif_pos
        pdb+cid=>motif_pos

    :param seq_file: sequence file; must exist.
    :param aligned_seq_file: aligned sequences; must exist.
    :param output: path of the pickle that receives the motif positions.
    :param storage_path: existing folder in which intermediate files are
        kept. When None, temporary paths are used and the intermediates are
        moved to ``paths.TRASH`` at the end.
    :param motif_len: motif length handed to
        ``meme_interface.extract_motifs_mast_uniprot``. Defaults to 14, the
        value that used to be hard-coded.
    """
    generic.quit_if_missing(seq_file)
    generic.quit_if_missing(aligned_seq_file)
    generic.warn_if_exist(output)

    if storage_path is None:
        composition_file = paths.TMP_FILE_TEMPLATE.format('composition.txt')
        meme_txt = paths.TMP_FILE_TEMPLATE.format('meme_from_aligned.txt')
        meme_folder = paths.MEME_MAST_FOLDER
        cropped_seq_file = paths.TMP_FILE_TEMPLATE.format('cropped_seqs.fasta')
    else:
        generic.quit_if_missing(storage_path, filetype='folder')
        composition_file = os.path.join(storage_path, 'composition.txt')
        meme_txt = os.path.join(storage_path, 'meme_from_aligned.txt')
        meme_folder = os.path.join(storage_path, 'meme_mast_folder')
        cropped_seq_file = os.path.join(storage_path, 'cropped_seqs.fasta')

    # Remove any existing composition file before rebuilding it.
    if os.path.isfile(composition_file):
        os.remove(composition_file)
    build_composition.build(seq_file, composition_file)
    build_meme_from_aligned.build(aligned_seq_file, meme_txt, composition_file)

    acc_seq_map = get_pname_seq.parse_raw(seq_file)
    acc_ids = list(acc_seq_map.keys())
    acc_pdb_map = uniprot_id_converter.convert("ACC", "PDB_ID", acc_ids)

    pdb_seq_map = dict()
    # because pdb => acc mapping may not be 1-1, we retain the original maps
    mapped_pdb_acc = dict()
    for acc_id, seq in acc_seq_map.items():
        if acc_id in acc_pdb_map:
            pdb_id = acc_pdb_map[acc_id]
            mapped_pdb_acc[pdb_id] = acc_id
            pdb_seq_map[pdb_id] = seq

    pdb_cid_map = find_cid_from_pname.find(pdb_seq_map)
    acc_pdb_cid_map = {
        mapped_pdb_acc[pdb]: (pdb, cid) for pdb, cid in pdb_cid_map.items()
    }

    # Keep only the sequences for which a pdb+cid was found.
    cropped_acc_list = list(acc_pdb_cid_map.keys())
    keep_only_acc(cropped_acc_list, seq_file, cropped_seq_file)

    meme_interface.run_mast(meme_txt, cropped_seq_file, meme_folder)
    mast_txt_path = os.path.join(meme_folder, 'mast.txt')
    acc_motif_map = meme_interface.extract_motifs_mast_uniprot(
        mast_txt_path, motif_len)
    acc_motif_map = motif_finder._delete_gapped_motifs_uniprot(
        acc_motif_map, cropped_seq_file)

    # Re-key the motif positions by pdb_id, carrying the chain id along.
    pdb_motif_pos = defaultdict(dict)
    for acc, motif_pos in acc_motif_map.items():
        pdb_id, cid = acc_pdb_cid_map[acc]
        pdb_motif_pos[pdb_id]['sno_markers'] = motif_pos
        pdb_motif_pos[pdb_id]['cid'] = cid

    with open(output, 'wb') as file:
        pickle.dump(pdb_motif_pos, file, -1)

    # Temporary intermediates are discarded; stored ones are left in place.
    if storage_path is None:
        shutil.move(composition_file, paths.TRASH)
        shutil.move(meme_txt, paths.TRASH)
        shutil.move(meme_folder, paths.TRASH)
        shutil.move(cropped_seq_file, paths.TRASH)