def test_mast(self):
    """Check that a MAST run reproduces the pickled reference diagrams.

    MAST is run with ``self.input_memefile`` and ``self.input_seqfile`` into
    ``self.output``; the resulting ``mast.txt`` is parsed and compared with
    the object unpickled from ``self.ref_output``. ``self.success`` is only
    set once the comparison has passed.
    """
    meme_interface.run_mast(self.input_memefile, self.input_seqfile,
                            self.output)
    parsed_diagrams = bio_interface.parse_mast_file(
        os.path.join(self.output, "mast.txt"))
    with open(self.ref_output, 'rb') as ref_handle:
        expected_diagrams = pickle.load(ref_handle)
    self.assertDictEqual(parsed_diagrams, expected_diagrams)
    self.success = True
示例#2
0
def build_output_matrix_from_aligned(selected_seqs, aligned_seqs, output,
                                     composition=None):
    """
    Build a per-position residue count matrix from MAST hits and save it.

    A MEME file is built from ``aligned_seqs``, MAST is run with it against a
    screened copy of ``selected_seqs``, and a sub-sequence is extracted for
    every reported motif. Residues present in ``generic.AA1_TO_I`` are
    tallied into a 30 x 20 integer matrix, which is written to ``output``
    one row per line, with a single space after every value.

    :param selected_seqs: fasta path of the sequences to search.
    :param aligned_seqs: path of the aligned sequences seeding the motif.
    :param output: path the count matrix is written to.
    :param composition: path of a composition file. When None, one is built
        from ``uniprot_full.fasta`` under ``paths.ROOT``.
    :return: number of sub-sequences that were tallied into the matrix.
    """
    # Length of the motif that ends up in the matrix
    final_len = 30
    if composition is None:
        # No composition supplied: generate one from the full uniprot fasta.
        composition = os.path.join(paths.ROOT, "uniprot_full_composition.txt")
        build_composition.build(
            os.path.join(paths.ROOT, "uniprot_full.fasta"), composition)

    # Intermediate artefacts
    mast_dir = paths.MEME_MAST_FOLDER
    meme_path = paths.TMP_FILE_TEMPLATE.format(1)
    screened_fasta = paths.TMP_FILE_TEMPLATE.format(2)

    init_len = preprocess._get_motif_len_from_aligned(aligned_seqs)
    clean_fasta_alphabet.screen(fasta_path=selected_seqs,
                                output_path=screened_fasta)
    filter_seqs.delete_short_seqs(screened_fasta, threshold=final_len)

    build_meme_from_aligned.build(aligned_seqs, meme_path, composition)

    meme_interface.run_mast(meme_path, screened_fasta, mast_dir)
    hits_by_acc = meme_interface.extract_motifs_mast_uniprot(
        os.path.join(mast_dir, 'mast.txt'), init_len)

    seq_by_acc = get_pname_seq.parse_raw(screened_fasta)
    windows = []
    for acc, hits in hits_by_acc.items():
        # Only accessions that are present in the screened fasta are usable.
        if acc in seq_by_acc:
            for hit in hits:
                window = extract_subseq_from_seqs(hit, seq_by_acc[acc],
                                                  init_len, final_len)
                if window:
                    windows.append(window)

    counts = np.zeros((final_len, 20), dtype=int)
    for window in windows:
        for pos, residue in enumerate(window):
            # Characters without an index in AA1_TO_I are ignored.
            if residue in generic.AA1_TO_I:
                counts[pos, generic.AA1_TO_I[residue]] += 1

    with open(output, 'w') as handle:
        for row in counts:
            handle.write("".join(str(count) + " " for count in row))
            handle.write("\n")
    return len(windows)
示例#3
0
def get_motif_pos_from_output_matrix(output_matrix, num_seqs,
                                     pdb_seq_file, output, motif_len=30):
    """
    Locate motif positions on PDB chains, starting from a converge matrix.

    Steps, in order:
      1. ``conv_interface.run`` turns ``output_matrix`` into a MEME file
         (``output_meme.txt`` under ``paths.ROOT``).
      2. MAST is run with that MEME file against ``pdb_seq_file``.
      3. For every (pdb_id, cid) key reported, the sequence is fetched with
         ``pdb_interface.get_seq_for``. Lookups that return None are skipped
         silently; lookups that raise are logged and skipped.
      4. The fetched sequences are pickled to a temp file, written out as
         fasta, filtered for short sequences, and MAST is run again on them.
      5. The adjusted motif diagram from the second run is pickled to
         ``output`` as an OrderedDict keyed by pdb_id, each value holding
         ``'sno_markers'`` and ``'cid'``. Because the key is the pdb_id
         alone, a later chain of the same pdb_id replaces an earlier one.

    :param output_matrix: passed through to ``conv_interface.run``.
    :param num_seqs: passed through to ``conv_interface.run``.
    :param pdb_seq_file: sequence file used for the first MAST run.
    :param output: path of the pickle holding the final motif positions.
    :param motif_len: motif length used by converge, the short-sequence
        filter and the diagram adjustment.
    """
    from converge import conv_interface

    meme_from_conv = os.path.join(paths.ROOT, "output_meme.txt")

    conv_interface.run(output_matrix, motif_len, num_seqs, meme_from_conv)

    # First pass: MAST against the caller-supplied sequence file.
    meme_interface.run_mast(meme_from_conv, pdb_seq_file,
                            paths.MEME_MAST_FOLDER)
    first_pass_hits = _get_motif_diagram_mast_pdb(
        os.path.join(paths.MEME_MAST_FOLDER, 'mast.txt'))

    chain_seqs = {}
    for count, (pdb_id, cid) in enumerate(first_pass_hits.keys()):
        # Progress indicator, every tenth entry.
        if count % 10 == 0:
            print(count)
        try:
            fetched = pdb_interface.get_seq_for(pdb_id, cid=cid)
        except Exception as err:
            logging.error(f"get_seq_for() fails for pdb_id/cid {pdb_id}/{cid}. "
                          f"Skipping.")
            logging.error(f"Traceback: <{traceback.format_exc()}>")
            logging.error(f"Error_msg: <{err}>\n\n")
            continue
        if fetched is not None:
            chain_seqs[(pdb_id, cid)] = fetched

    seq_pickle_path = paths.TMP_FILE_TEMPLATE.format("tmp_pdb_cid_seq.pkl")
    with open(seq_pickle_path, 'wb') as handle:
        pickle.dump(chain_seqs, handle, pickle.HIGHEST_PROTOCOL)

    chain_seqs = OrderedDict(sorted(chain_seqs.items()))
    fetched_fasta = paths.TMP_FILE_TEMPLATE.format(12)
    with open(fetched_fasta, 'w') as handle:
        for (pdb_id, cid), chain_seq in chain_seqs.items():
            handle.write(f">{pdb_id}_{cid}\n")
            handle.write(chain_seq + "\n")

    # NOTE(review): this screens the original pdb_seq_file, not the fasta
    # written just above, although only the latter is used from here on.
    # Confirm whether that is intended.
    clean_fasta_alphabet.screen(pdb_seq_file)
    filter_seqs.delete_short_seqs(fetched_fasta, motif_len)

    # Second pass: MAST against the sequences fetched from the PDB files.
    meme_interface.run_mast(meme_from_conv, fetched_fasta,
                            paths.MEME_MAST_FOLDER)
    second_pass_hits = _get_motif_diagram_mast_pdb(
        os.path.join(paths.MEME_MAST_FOLDER, 'mast.txt'))
    adjusted_hits = meme_interface._adjust_motif_diagram(second_pass_hits,
                                                         motif_len)

    by_pdb = defaultdict(dict)
    for (pdb_id, cid), markers in adjusted_hits.items():
        entry = by_pdb[pdb_id]
        entry['sno_markers'] = markers
        entry['cid'] = cid
    by_pdb = OrderedDict(sorted(by_pdb.items()))

    with open(output, 'wb') as handle:
        pickle.dump(by_pdb, handle, pickle.HIGHEST_PROTOCOL)
def setup_meme_suite_mast():
    """Run MAST on the test inputs and pickle the parsed diagrams.

    MAST output goes to an ``output_mast`` folder inside the debug folder
    returned by ``generic.setup_debug_folder``. The parsed ``mast.txt`` is
    pickled to ``paths_test.REF_MAST_DIAGRAMS``.
    """
    mast_dir = os.path.join(generic.setup_debug_folder(paths_test.DEBUG),
                            "output_mast")
    seq_fixture = paths_test.MEME_TEST_SEQ
    meme_fixture = paths_test.REF_MEME_TXT
    pickle_target = paths_test.REF_MAST_DIAGRAMS

    meme_interface.run_mast(meme_fixture, seq_fixture, mast_dir)
    parsed = bio_interface.parse_mast_file(os.path.join(mast_dir, "mast.txt"))
    with open(pickle_target, 'wb') as handle:
        pickle.dump(parsed, handle, pickle.HIGHEST_PROTOCOL)
def find_mast(pname_cid_map, seq_file, ref_meme_txt, motif_len, meme_folder):
    """Run MAST with a reference MEME file and assemble motif positions.

    :param pname_cid_map: mapping handed to ``_assemble_motif_pos`` together
        with the motifs that were found.
    :param seq_file: sequence file MAST is run against; also used when
        dropping gapped motifs.
    :param ref_meme_txt: path of an existing MEME text file (must be a str).
    :param motif_len: motif length, an int >= 1.
    :param meme_folder: folder MAST writes to; ``mast.txt`` is read from it.
    :return: whatever ``_assemble_motif_pos`` produces for the ungapped
        motifs.
    :raises AssertionError: if ``motif_len`` or ``ref_meme_txt`` fail the
        checks below.
    """
    # Argument checks, evaluated in this order.
    assert motif_len >= 1
    assert isinstance(motif_len, int)
    assert isinstance(ref_meme_txt, str)
    assert os.path.isfile(ref_meme_txt)

    meme_interface.run_mast(ref_meme_txt, seq_file, meme_folder)
    raw_hits = meme_interface.extract_motifs_mast(
        os.path.join(meme_folder, 'mast.txt'), motif_len)
    ungapped_hits = _delete_gapped_motifs(raw_hits, seq_file)
    return _assemble_motif_pos(ungapped_hits, pname_cid_map)
def _build_seq_motif_map(process,
                         tmp_output_folder,
                         seq_file,
                         motif_len,
                         num_p=1,
                         ref_meme_txt=None):
    if process == 'meme':
        meme_interface.run_meme(seq_file, tmp_output_folder, num_p)
        meme_txt_path = os.path.join(tmp_output_folder, 'meme.txt')
        seq_motif_map = meme_interface.extract_motifs_meme(
            meme_txt_path, motif_len)
    elif process == 'mast':
        meme_interface.run_mast(ref_meme_txt, seq_file, tmp_output_folder)
        mast_txt_path = os.path.join(tmp_output_folder, 'mast.txt')
        seq_motif_map = meme_interface.extract_motifs_mast(
            mast_txt_path, motif_len)
    else:
        raise Exception
    return seq_motif_map
示例#7
0
def seq_aligned_converge_testing():
    '''
    16/10/2019
    Harness for checking that conv_rewrite gives the same output as the
    original converge, mainly with respect to the tracking of maxS.

    Aligned sequences are run through make_meme, the result is converted
    with meme_to_conv and used as a seed matrix, and converge is then run
    with the count matrix built from the same aligned sequences.

    Inputs are the fixed ``mg_seqs.fasta`` / ``mg_aligned.txt`` files under
    ``paths.ROOT``; the count matrix is written as comma-separated rows to
    ``seq_aligned_converge_output.txt`` under ``paths.USER_OUTPUT``.
    '''
    # aligned sequences need to have no gap
    # todo: test for that?

    # Known matched sequences
    seq_fasta = os.path.join(paths.ROOT, 'mg_seqs.fasta')
    aligned_input = os.path.join(paths.ROOT, 'mg_aligned.txt')
    # Length of actual motif
    final_len = 30
    matrix_path = os.path.join(paths.USER_OUTPUT,
                               "seq_aligned_converge_output.txt")
    # composition file should be provided, can be generated as below:
    composition_path = os.path.join(paths.ROOT,
                                    "uniprot_full_composition.txt")
    build_composition.build(os.path.join(paths.ROOT, "uniprot_full.fasta"),
                            composition_path)

    # Intermediate paths
    mast_dir = paths.MEME_MAST_FOLDER
    meme_path = paths.TMP_FILE_TEMPLATE.format(1)
    screened_fasta = paths.TMP_FILE_TEMPLATE.format(2)

    init_len = preprocess._get_motif_len_from_aligned(aligned_input)
    clean_fasta_alphabet.screen(fasta_path=seq_fasta,
                                output_path=screened_fasta)
    filter_seqs.delete_short_seqs(screened_fasta, threshold=final_len)

    build_meme_from_aligned.build(aligned_input, meme_path, composition_path)

    meme_interface.run_mast(meme_path, screened_fasta, mast_dir)
    hits_by_acc = meme_interface.extract_motifs_mast_uniprot(
        os.path.join(mast_dir, 'mast.txt'), init_len)

    seq_by_acc = get_pname_seq.parse_raw(screened_fasta)
    windows = []
    for acc, hits in hits_by_acc.items():
        # Only accessions that are present in the screened fasta are usable.
        if acc in seq_by_acc:
            for hit in hits:
                window = extract_subseq_from_seqs(hit, seq_by_acc[acc],
                                                  init_len, final_len)
                if window:
                    windows.append(window)

    # Dump the extracted windows as fasta and build a MEME file from them.
    windows_fasta = os.path.join(paths.ROOT, "aligned")
    with open(windows_fasta, 'w') as handle:
        for idx, window in enumerate(windows):
            handle.write(f">RAND_{idx}\n")
            handle.write(window + "\n")

    meme_for_testing = os.path.join(paths.ROOT, "output_meme_for_testing")
    build_meme_from_aligned.build(windows_fasta, meme_for_testing,
                                  composition_path)

    from converge import conv_interface
    seed_matrix_path = os.path.join(paths.ROOT, "conv_seed")
    conv_interface.convert_meme_to_conv(meme_for_testing,
                                        composition_path, seed_matrix_path,
                                        minimal=True)

    counts = np.zeros((final_len, 20), dtype=int)
    for window in windows:
        for pos, residue in enumerate(window):
            # Characters without an index in AA1_TO_I are ignored.
            if residue in generic.AA1_TO_I:
                counts[pos, generic.AA1_TO_I[residue]] += 1

    with open(matrix_path, 'w') as handle:
        for row in counts:
            handle.write(",".join(str(count) for count in row) + "\n")
    print(f"Kmatches is {len(windows)}.")

    #     next, we run converge.
    #     assumption that proteome, blosum is already encoded
    converge_meme_out = os.path.join(paths.ROOT, "output_meme.txt")

    conv_interface.run(matrix_path, final_len, len(windows),
                       converge_meme_out)
示例#8
0
def run_prosite_aligned(seq_file, aligned_seq_file, output, storage_path=None,
                        motif_len=14):
    """
    Derive motif positions on PDB chains from a seq_file and an
    aligned_seq_file.

    We derive matrix from aligned_seq_file
    We derive composition from seq_file
    We put both together into a meme.txt

    We take seq_file, and extract from it the relevant .pdb files and the
    corresponding cid. So acc+seq => pdb+cid

    We therefore select the seq files for which a corresponding .pdb+cid
    exists, and output it into a cropped_seqfile

    We run mast on this cropped_seqfile, to obtain motif_pos.

    Finally we output this motif_pos using the pdb+cid from before.

    Intermediate files:
    composition.txt from seq_file
    meme_from_aligned.txt from aligned_seq
    cropped_seqs.fasta
    the mast output folder

    :param seq_file: path of the sequence file; must exist.
    :param aligned_seq_file: path of the aligned sequences; must exist.
    :param output: path the pickled ``{pdb_id: {'sno_markers', 'cid'}}``
        mapping is written to.
    :param storage_path: existing folder in which the intermediate files are
        kept. When None, temp locations are used and the intermediates are
        moved to ``paths.TRASH`` at the end.
    :param motif_len: motif length handed to
        ``meme_interface.extract_motifs_mast_uniprot``. Defaults to 14, the
        value that used to be hard-coded.
    """
    generic.quit_if_missing(seq_file)
    generic.quit_if_missing(aligned_seq_file)
    generic.warn_if_exist(output)

    use_tmp_storage = storage_path is None
    if use_tmp_storage:
        composition_file = paths.TMP_FILE_TEMPLATE.format('composition.txt')
        meme_txt = paths.TMP_FILE_TEMPLATE.format('meme_from_aligned.txt')
        meme_folder = paths.MEME_MAST_FOLDER
        cropped_seq_file = paths.TMP_FILE_TEMPLATE.format('cropped_seqs.fasta')
    else:
        generic.quit_if_missing(storage_path, filetype='folder')
        composition_file = os.path.join(storage_path, 'composition.txt')
        meme_txt = os.path.join(storage_path, 'meme_from_aligned.txt')
        meme_folder = os.path.join(storage_path, 'meme_mast_folder')
        cropped_seq_file = os.path.join(storage_path, 'cropped_seqs.fasta')

    # Remove any existing composition file before it is rebuilt.
    if os.path.isfile(composition_file):
        os.remove(composition_file)
    build_composition.build(seq_file, composition_file)
    build_meme_from_aligned.build(aligned_seq_file, meme_txt, composition_file)

    acc_seq_map = get_pname_seq.parse_raw(seq_file)
    acc_pdb_map = uniprot_id_converter.convert("ACC", "PDB_ID",
                                               list(acc_seq_map))

    pdb_seq_map = {}
    # because pdb => acc mapping may not be 1-1, we retain the original maps
    mapped_pdb_acc = {}
    for acc_id, seq in acc_seq_map.items():
        if acc_id not in acc_pdb_map:
            continue
        pdb_id = acc_pdb_map[acc_id]
        mapped_pdb_acc[pdb_id] = acc_id
        pdb_seq_map[pdb_id] = seq

    pdb_cid_map = find_cid_from_pname.find(pdb_seq_map)
    acc_pdb_cid_map = {
        mapped_pdb_acc[pdb]: (pdb, cid)
        for pdb, cid in pdb_cid_map.items()
    }

    # Keep only the accessions for which a pdb+cid was found.
    keep_only_acc(list(acc_pdb_cid_map), seq_file, cropped_seq_file)
    meme_interface.run_mast(meme_txt, cropped_seq_file, meme_folder)
    mast_txt_path = os.path.join(meme_folder, 'mast.txt')
    acc_motif_map = meme_interface.extract_motifs_mast_uniprot(
        mast_txt_path, motif_len)
    acc_motif_map = motif_finder._delete_gapped_motifs_uniprot(
        acc_motif_map, cropped_seq_file)

    pdb_motif_pos = defaultdict(dict)
    for acc, motif_pos in acc_motif_map.items():
        pdb_id, cid = acc_pdb_cid_map[acc]
        pdb_motif_pos[pdb_id]['sno_markers'] = motif_pos
        pdb_motif_pos[pdb_id]['cid'] = cid
    with open(output, 'wb') as file:
        pickle.dump(pdb_motif_pos, file, -1)

    if use_tmp_storage:
        # Temp intermediates are not kept; park them in the trash folder.
        for leftover in (composition_file, meme_txt, meme_folder,
                         cropped_seq_file):
            shutil.move(leftover, paths.TRASH)