def build(seq_path, output):
    """
    Build a composition file by running the executable configured as
    paths.FASTA_2_MARKOV_EXEC in -protein mode.

    The executable reads seq_path on stdin and its stdout is redirected
    to output.

    :param seq_path: sequence file fed to the executable; must exist
    :param output: file to write; an existing file at this path is removed
        before the executable runs
    :raises subprocess.CalledProcessError: if the executable exits with a
        non-zero status
    """
    import shlex

    generic.quit_if_missing(seq_path)
    generic.warn_if_exist(output)
    if os.path.isfile(output):
        os.remove(output)

    # The stdin/stdout redirections need a shell, so the two paths are
    # quoted to survive spaces and shell metacharacters. The executable
    # string is left untouched so that it may still carry its own arguments.
    command = (f"{paths.FASTA_2_MARKOV_EXEC} -protein "
               f"< {shlex.quote(str(seq_path))} "
               f"> {shlex.quote(str(output))}")

    # The shell creates `output` even when the executable fails, so the
    # existence check below cannot detect a failed run on its own;
    # check=True makes a non-zero exit status raise instead.
    subprocess.run(command, shell=True, check=True)
    generic.quit_if_missing(output)
def main(input_path, output_path):
    """
    Load a pickled motif-position map, compute descriptors from it with
    descr_main.calculate, and pickle the result.

    Input: motif_pos_pkl
    output: descr_file.pkl

    :param input_path: pickle file holding the motif-position map
    :param output_path: pickle file the computed descriptors are written to
    """
    # todo: if pdb_file is empty (0 bytes), for some reason load_pdb_data
    # does not throw exception
    logs.set_logging_level()

    # Every run starts from an empty <ROOT>/data/store directory.
    store_dir = os.path.join(paths.ROOT, 'data', 'store')
    if os.path.isdir(store_dir):
        logging.warning("Store dir exists, deleting.")
        shutil.rmtree(store_dir)
    os.mkdir(store_dir)

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # feed this function pickles produced by this project.
    with open(input_path, 'rb') as file:
        motif_pos_map = pickle.load(file)

    timecheck = time()
    descrs = descr_main.calculate(motif_pos_map)
    # Measure once so that the printed and the logged value agree.
    elapsed = time() - timecheck
    print(f"Time taken: {elapsed}")
    logging.debug(f"Time taken: {elapsed}")

    # Check the file that is actually about to be written, not the
    # project-wide default location.
    generic.warn_if_exist(output_path)
    # Switching back to pkl to avoid false float comparison failures.
    with open(output_path, "wb") as file:
        pickle.dump(descrs, file, -1)
def find_motifs_mast(pname_cid_path, seq_file, ref_meme_txt, motif_len, output,
                     meme_folder=paths.MEME_MAST_FOLDER):
    """
    Load the pickled pname->cid map, hand it to motif_finder.find_mast and
    pickle whatever that returns into output.

    :param pname_cid_path: paths.PNAME_CID
    :param seq_file: paths.FULL_SEQS
    :param ref_meme_txt: paths.REF_MEME_TXT
    :param motif_len: 13
    :param output: paths.MOTIF_POS
    :param meme_folder: forwarded unchanged to motif_finder.find_mast
    """
    assert motif_len >= 1
    assert isinstance(motif_len, int)

    generic.quit_if_missing(pname_cid_path)
    with open(pname_cid_path, 'rb') as handle:
        cid_by_pname = pickle.load(handle)
    _test_seq_cid_map(cid_by_pname)

    found = motif_finder.find_mast(
        cid_by_pname,
        seq_file,
        ref_meme_txt,
        motif_len,
        meme_folder=meme_folder,
    )

    generic.warn_if_exist(output)
    with open(output, 'wb') as handle:
        # A negative protocol number selects the highest available protocol.
        pickle.dump(found, handle, -1)
def setUp(self):
    """Bind the reference fixture paths and two scratch-file paths."""
    # Reference files taken from paths_test.
    self.meme_full = paths_test.ORIG_MEME_FOR_CONVERT
    self.meme_minimal = paths_test.REF_MEME_FROM_CONV
    self.ref_matrix = paths_test.REF_CONV_MATRIX
    self.ref_composition = paths_test.REF_CONV_COMPOSITION

    # Two scratch files derived from the shared template.
    self.tmp_1, self.tmp_2 = (
        paths_test.TMP_FILE_TEMPLATE.format(index) for index in (1, 2)
    )
    for scratch in (self.tmp_1, self.tmp_2):
        generic.warn_if_exist(scratch)
def _write_matrix_file(matrix_ordered, output):
    """
    For meme_suite matrix2meme.

    Each entry of matrix_ordered becomes one line of space-separated values;
    lines are joined with newlines and no trailing newline is written.
    """
    rendered_rows = [" ".join(map(str, row)) for row in matrix_ordered]
    text = "\n".join(rendered_rows)

    generic.warn_if_exist(output)
    with open(output, 'w') as handle:
        handle.write(text)
def parse_extract_ioncom(input_file, pname_cid_path):
    """
    Parse an ioncom extract with extract_parser.parse_ioncom and pickle the
    resulting pname->cid map.

    :param input_file: paths.IONCOM_EXTRACT
    :param pname_cid_path: paths.PNAME_CID
    """
    generic.quit_if_missing(input_file)

    cid_by_pname = extract_parser.parse_ioncom(input_file)
    _test_seq_cid_map(cid_by_pname)

    generic.warn_if_exist(pname_cid_path)
    with open(pname_cid_path, 'wb') as handle:
        # A negative protocol number selects the highest available protocol.
        pickle.dump(cid_by_pname, handle, -1)
def run_mast(meme_txt, fasta_filename, mast_output, mast_exec=paths.MAST_EXEC):
    """
    Run the mast executable on fasta_filename using the motifs in meme_txt.

    :param meme_txt: motif file passed to mast; must exist
    :param fasta_filename: sequence file passed to mast; must exist
    :param mast_output: output folder, passed to mast via -oc
    :param mast_exec: command used to invoke mast
    :raises Exception: if mast exits with a non-zero return code
    """
    import shlex

    generic.quit_if_missing(meme_txt)
    generic.quit_if_missing(fasta_filename)
    generic.warn_if_exist(mast_output, filetype='folder')

    # Quote the path arguments so spaces or shell metacharacters cannot break
    # (or hijack) the command. mast_exec is left as-is so it may still carry
    # its own arguments.
    command = (f"{mast_exec} -oc {shlex.quote(str(mast_output))} -mt 0.0001 "
               f"{shlex.quote(str(meme_txt))} "
               f"{shlex.quote(str(fasta_filename))}")
    return_code = subprocess.run(command, shell=True).returncode
    if return_code != 0:
        logging.error("run_mast() failed.")
        logging.error(f"Command: <{command}>")
        # Same exception type as before, but now it says what went wrong.
        raise Exception(
            f"run_mast() failed with return code {return_code}: <{command}>")
def download_no_convert(acc_list, output):
    """
    Download the fasta entry of every accession in acc_list from uniprot and
    write them, in order, into a single file.

    The download is best-effort: an accession whose entry cannot be fetched
    or decoded is logged and skipped.

    :param acc_list: iterable of accession ids, used as-is in the url
    :param output: path of the combined file to write
    """
    import logging

    generic.warn_if_exist(output)
    with open(output, 'w') as file:
        for acc in acc_list:
            url = f"https://www.uniprot.org/uniprot/{acc}.fasta"
            # Only the fetch/decode is best-effort. `except Exception` (not a
            # bare except) lets KeyboardInterrupt/SystemExit through, so a
            # long download can still be aborted.
            try:
                with contextlib.closing(request.urlopen(url)) as contents:
                    fasta_text = contents.read().decode("utf-8")
            except Exception as err:
                logging.warning("Skipping <%s>: %s", url, err)
                continue
            # Kept outside the try so a failing write is not silently dropped.
            file.write(fasta_text)
def parse_extract_prosite(input_file, pname_cid_path):
    """
    Parse a prosite extract with extract_parser.parse_prosite (using
    prosite_pdb_list.pdb_list) and pickle the resulting pname->cid map.

    :param input_file: paths.PROSITE_EXTRACT
    :param pname_cid_path: paths.PNAME_CID
    """
    generic.quit_if_missing(input_file)

    cid_by_pname = extract_parser.parse_prosite(
        input_file,
        prosite_pdb_list.pdb_list,
    )
    _test_seq_cid_map(cid_by_pname)

    generic.warn_if_exist(pname_cid_path)
    with open(pname_cid_path, 'wb') as handle:
        # A negative protocol number selects the highest available protocol.
        pickle.dump(cid_by_pname, handle, -1)
def download(pdb_list, output):
    """
    Convert pdb ids to accessions with uniprot_id_converter, then download
    the fasta entry of every distinct accession from uniprot into one file.

    The download is best-effort: an accession whose entry cannot be fetched
    or decoded is logged and skipped.

    :param pdb_list: pdb ids handed to uniprot_id_converter.convert
    :param output: path of the combined file to write
    """
    import logging

    pdb_acc_map = uniprot_id_converter.convert("PDB_ID", "ACC", pdb_list)
    generic.warn_if_exist(output)
    with open(output, 'w') as file:
        # Several pdb ids may map to the same accession; fetch each once.
        acc_unique = set(pdb_acc_map.values())
        for acc in acc_unique:
            url = f"https://www.uniprot.org/uniprot/{acc}.fasta"
            # Only the fetch/decode is best-effort. `except Exception` (not a
            # bare except) lets KeyboardInterrupt/SystemExit through, so a
            # long download can still be aborted.
            try:
                with contextlib.closing(request.urlopen(url)) as contents:
                    fasta_text = contents.read().decode("utf-8")
            except Exception as err:
                logging.warning("Skipping <%s>: %s", url, err)
                continue
            # Kept outside the try so a failing write is not silently dropped.
            file.write(fasta_text)
def run_meme_single(fasta_filename, motif_len, meme_output, num_p=1,
                    meme_exec=paths.MEME_EXEC):
    """
    Run the meme executable on fasta_filename to find a single protein motif
    of width motif_len (-nmotifs 1, -mod oops).

    :param fasta_filename: sequence file passed to meme; must exist
    :param motif_len: motif width passed via -w; int >= 1
    :param meme_output: output folder, passed to meme via -oc
    :param num_p: value passed to meme via -p
    :param meme_exec: command used to invoke meme
    :raises Exception: if meme exits with a non-zero return code
    """
    import shlex

    assert motif_len >= 1
    assert isinstance(motif_len, int)
    generic.quit_if_missing(fasta_filename)
    generic.warn_if_exist(meme_output, filetype='folder')

    # Quote the path arguments so spaces or shell metacharacters cannot break
    # (or hijack) the command. meme_exec is left as-is so it may still carry
    # its own arguments.
    command = (f"{meme_exec} -w {motif_len} -p {num_p} -protein -nmotifs 1 "
               f"-mod oops -oc {shlex.quote(str(meme_output))} "
               f"{shlex.quote(str(fasta_filename))}")
    return_code = subprocess.run(command, shell=True).returncode
    if return_code != 0:
        logging.error("run_meme_single() failed.")
        logging.error(f"Command: <{command}>")
        # Same exception type as before, but now it says what went wrong.
        raise Exception(
            f"run_meme_single() failed with return code {return_code}: "
            f"<{command}>")
def run_prosite_mast(extract_path, motif_len, ref_meme_txt, output,
                     pdb_folder=paths.PDB_FOLDER, storage_path=None):
    """
    Prosite pipeline ending in mast: parse the extract, download and trim by
    pdb, build and filter the sequence file, then find motifs against
    ref_meme_txt.

    With storage_path=None the intermediate files live at the default
    paths.* locations and are moved to paths.TRASH once the run finishes.
    Otherwise they are written inside storage_path and left there.

    :param extract_path: extract file handed to parse_extract_prosite
    :param motif_len: 13
    :param ref_meme_txt: paths.REF_MEME_TXT
    :param output: paths.PID_PDB_MAP
    :param pdb_folder: folder handed to the pdb download/trim/seq steps
    :param storage_path: optional existing folder for intermediate files
    :return: None
    """
    generic.quit_if_missing(extract_path)
    generic.quit_if_missing(ref_meme_txt)
    generic.warn_if_exist(output)

    keep_intermediates = storage_path is not None
    if keep_intermediates:
        generic.quit_if_missing(storage_path, filetype='folder')
        pname_cid_path = os.path.join(storage_path, 'pname_cid_map.pkl')
        seq_path = os.path.join(storage_path, 'seqs.fasta')
        meme_folder = os.path.join(storage_path, 'meme_folder')
    else:
        pname_cid_path = paths.PNAME_CID
        seq_path = paths.FULL_SEQS
        meme_folder = paths.MEME_MAST_FOLDER

    # Each step consumes what the previous one wrote to disk.
    parse_extract_prosite(extract_path, pname_cid_path)
    download_pdb(pname_cid_path, pdb_folder)
    trim_pnames_based_on_pdb(pname_cid_path, pdb_folder)
    create_seq(pname_cid_path, seq_path, pdb_folder)
    filter_seq_file(seq_path, threshold=31)
    find_motifs_mast(pname_cid_path, seq_path, ref_meme_txt, motif_len,
                     output, meme_folder)

    if not keep_intermediates:
        for leftover in (pname_cid_path, seq_path, meme_folder):
            shutil.move(leftover, paths.TRASH)
def run_prosite_meme(extract_path, motif_len, output, num_p=7,
                     pdb_folder=paths.PDB_FOLDER, storage_path=None):
    """
    Prosite pipeline ending in meme: parse the extract, download and trim by
    pdb, build and filter the sequence file, then find motifs with meme.

    With storage_path=None the intermediate files live at the default
    paths.* locations and are moved to paths.TRASH once the run finishes.
    Otherwise they are written inside storage_path and left there.

    :param extract_path: paths.PROSITE_EXTRACT
    :param motif_len: 13
    :param output: paths.PID_PDB_MAP
    :param num_p: int >= 1, forwarded to find_motifs_meme
    :param pdb_folder: folder handed to the pdb download/trim/seq steps
    :param storage_path: optional existing folder for intermediate files
    """
    generic.quit_if_missing(extract_path)
    generic.warn_if_exist(output)
    assert isinstance(num_p, int)
    assert num_p >= 1

    keep_intermediates = storage_path is not None
    if keep_intermediates:
        generic.quit_if_missing(storage_path, filetype='folder')
        pname_cid_path = os.path.join(storage_path, 'pname_cid_map.pkl')
        seq_path = os.path.join(storage_path, 'seqs.fasta')
        meme_folder = os.path.join(storage_path, 'meme_folder')
    else:
        pname_cid_path = paths.PNAME_CID
        seq_path = paths.FULL_SEQS
        meme_folder = paths.MEME_MAST_FOLDER

    # Each step consumes what the previous one wrote to disk.
    parse_extract_prosite(extract_path, pname_cid_path)
    download_pdb(pname_cid_path, pdb_folder)
    trim_pnames_based_on_pdb(pname_cid_path, pdb_folder)
    create_seq(pname_cid_path, seq_path, pdb_folder)
    filter_seq_file(seq_path, threshold=31)
    find_motifs_meme(pname_cid_path, seq_path, motif_len, output,
                     meme_folder, num_p)

    if not keep_intermediates:
        for leftover in (pname_cid_path, seq_path, meme_folder):
            shutil.move(leftover, paths.TRASH)
def build_descr(input_path, output_path):
    """
    Load a pickled motif-position map, compute descriptors from it with
    descr_main.calculate, and pickle the result.

    :param input_path: pickle file holding the motif-position map
    :param output_path: pickle file the computed descriptors are written to
    """
    logs.set_logging_level()

    # Every run starts from an empty <ROOT>/data/store directory.
    store_dir = os.path.join(paths.ROOT, 'data', 'store')
    if os.path.isdir(store_dir):
        logging.warning("Store dir exists, deleting.")
        shutil.rmtree(store_dir)
    os.mkdir(store_dir)

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # feed this function pickles produced by this project.
    with open(input_path, 'rb') as file:
        motif_pos_map = pickle.load(file)

    timecheck = time()
    descrs = descr_main.calculate(motif_pos_map)
    # Measure once so that the printed and the logged value agree.
    elapsed = time() - timecheck
    print(f"Time taken: {elapsed}")
    logging.debug(f"Time taken: {elapsed}")

    # Check the file that is actually about to be written, not the
    # project-wide default location.
    generic.warn_if_exist(output_path)
    # Switching back to pkl to avoid false float comparison failures.
    with open(output_path, "wb") as file:
        pickle.dump(descrs, file, -1)
def setUp(self):
    """Bind the reference fixture paths and one scratch-file path."""
    # Reference files taken from paths_test.
    self.matrix = paths_test.REF_CONV_MATRIX
    self.composition = paths_test.REF_CONV_COMPOSITION
    self.ref_meme = paths_test.REF_MEME_FROM_CONV

    # Single scratch file derived from the shared template.
    scratch = paths_test.TMP_FILE_TEMPLATE.format(1)
    self.tmp_1 = scratch
    generic.warn_if_exist(scratch)
def run_prosite_aligned(seq_file, aligned_seq_file, output, storage_path=None):
    """
    Starting from seq_file and aligned_seq_file, produce a pickled
    pdb => {'sno_markers', 'cid'} map in output.

    Steps:
      1. Derive the composition from seq_file and the matrix from
         aligned_seq_file, and combine both into a meme.txt.
      2. Map each accession in seq_file to a pdb id and find the matching
         cid, i.e. acc+seq => pdb+cid.
      3. Keep only the sequences for which a pdb+cid exists and write them
         to a cropped sequence file.
      4. Run mast on the cropped sequence file to obtain the motif
         positions per accession.
      5. Re-key those motif positions by pdb id, together with the cid.

    Intermediate files:
        meme.txt from aligned_seq
        acc=>pdb+cid map
        cropped_seqfile.fasta
        acc=>motif_pos
        pdb+cid=>motif_pos

    With storage_path=None the on-disk intermediates use temporary/default
    locations and are moved to paths.TRASH at the end; otherwise they are
    written inside storage_path and left there.
    """
    generic.quit_if_missing(seq_file)
    generic.quit_if_missing(aligned_seq_file)
    generic.warn_if_exist(output)

    use_tmp_locations = storage_path is None
    if use_tmp_locations:
        composition_file = paths.TMP_FILE_TEMPLATE.format('composition.txt')
        meme_txt = paths.TMP_FILE_TEMPLATE.format('meme_from_aligned.txt')
        meme_folder = paths.MEME_MAST_FOLDER
        cropped_seq_file = paths.TMP_FILE_TEMPLATE.format('cropped_seqs.fasta')
    else:
        generic.quit_if_missing(storage_path, filetype='folder')
        composition_file = os.path.join(storage_path, 'composition.txt')
        meme_txt = os.path.join(storage_path, 'meme_from_aligned.txt')
        meme_folder = os.path.join(storage_path, 'meme_mast_folder')
        cropped_seq_file = os.path.join(storage_path, 'cropped_seqs.fasta')

    # Step 1: composition + aligned matrix => meme.txt
    if os.path.isfile(composition_file):
        os.remove(composition_file)
    build_composition.build(seq_file, composition_file)
    build_meme_from_aligned.build(aligned_seq_file, meme_txt, composition_file)

    # Step 2: acc+seq => pdb+cid
    acc_seq_map = get_pname_seq.parse_raw(seq_file)
    acc_pdb_map = uniprot_id_converter.convert(
        "ACC", "PDB_ID", list(acc_seq_map.keys()))

    # because pdb => acc mapping may not be 1-1, we retain the original maps
    pdb_seq_map = {}
    mapped_pdb_acc = {}
    for acc_id, seq in acc_seq_map.items():
        if acc_id not in acc_pdb_map:
            continue
        pdb_id = acc_pdb_map[acc_id]
        mapped_pdb_acc[pdb_id] = acc_id
        pdb_seq_map[pdb_id] = seq

    pdb_cid_map = find_cid_from_pname.find(pdb_seq_map)
    acc_pdb_cid_map = {}
    for pdb, cid in pdb_cid_map.items():
        acc_pdb_cid_map[mapped_pdb_acc[pdb]] = (pdb, cid)

    # Step 3: crop the sequence file to the accessions that were resolved
    keep_only_acc(list(acc_pdb_cid_map.keys()), seq_file, cropped_seq_file)

    # Step 4: mast => acc=>motif_pos
    meme_interface.run_mast(meme_txt, cropped_seq_file, meme_folder)
    mast_txt_path = os.path.join(meme_folder, 'mast.txt')
    acc_motif_map = meme_interface.extract_motifs_mast_uniprot(
        mast_txt_path, 14)
    acc_motif_map = motif_finder._delete_gapped_motifs_uniprot(
        acc_motif_map, cropped_seq_file)

    # Step 5: re-key by pdb id
    pdb_motif_pos = defaultdict(dict)
    for acc, motif_pos in acc_motif_map.items():
        pdb_id, cid = acc_pdb_cid_map[acc]
        entry = pdb_motif_pos[pdb_id]
        entry['sno_markers'] = motif_pos
        entry['cid'] = cid

    with open(output, 'wb') as handle:
        # A negative protocol number selects the highest available protocol.
        pickle.dump(pdb_motif_pos, handle, -1)

    if use_tmp_locations:
        for leftover in (composition_file, meme_txt, meme_folder,
                         cropped_seq_file):
            shutil.move(leftover, paths.TRASH)