Example #1
def complex_to_pairs(complex, source_type, get_pairs, output_dir):
    pairs_txt = output_dir + '/pairs.txt'
    name = complex.name
    logging.info("Working on {:}".format(name))
    pairs, num_subunits = get_pairs(complex)
    casp_capri_addon_message = '; selecting pair with most inter-chain interactions'
    logging_message = "For complex {:} found {:} pairs out of {:} chains"
    logging_message += casp_capri_addon_message if source_type == 'casp_capri' and num_subunits > 1 else ''
    logging.info(logging_message.format(name, len(pairs), num_subunits))
    sub_dir = output_dir + '/' + db.get_pdb_code(name)[1:3]
    f = name
    if ('mut' in f) and ('mut' not in db.get_pdb_code(name)):
        pdb = db.get_pdb_code(name) + f[f.rfind('_') + 1: f.find('.')]
        sub_dir = output_dir + '/' + pdb
    with sem:
        if len(pairs) > 0:
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir, exist_ok=True)
            with open(pairs_txt, 'a') as f:
                f.write(name + '\n')

    if source_type == 'casp_capri':
        pair_with_most_interactions = pairs[0]
        for pair in pairs:
            if len(pair.pos_idx) > len(pair_with_most_interactions.pos_idx):
                pair_with_most_interactions = pair
        pairs = [pair_with_most_interactions]
        assert len(pairs) == 1, 'For CASP-CAPRI complexes, the pair with the most interactions must be the only pair selected'
    for i, pair in enumerate(pairs):
        output_dill = "{:}/{:}_{:}.dill".format(sub_dir, name, i)
        write_pair_as_dill(pair, output_dill)
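The `db.get_pdb_code(name)[1:3]` slice shards output files into subdirectories named after the middle two characters of the PDB code. A minimal sketch of that layout, assuming the complex name starts with its four-character PDB code (the helper name is illustrative):

import os

def sharded_output_path(output_dir, name, index, ext='.dill'):
    # Hypothetical helper: '1abc_...' with index 0 -> '<output_dir>/ab/1abc_..._0.dill'.
    pdb_code = name[:4]  # assumption: the complex name begins with its PDB code
    sub_dir = os.path.join(output_dir, pdb_code[1:3])
    os.makedirs(sub_dir, exist_ok=True)
    return os.path.join(sub_dir, '{}_{}{}'.format(name, index, ext))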
Example #2
def find_of_type(pdb_name_query, pdb_dataset, receptor, bound, style):
    """Get matching partner of provided file."""
    pdb_code = db.get_pdb_code(pdb_name_query)
    results = None
    for pdb_name in db.get_structures_filenames(pdb_dataset):
        if db.get_pdb_code(pdb_name) == pdb_code:
            if is_of_type(pdb_name, style, receptor=receptor, bound=bound):
                results = pdb_name
    return results
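A hedged usage sketch; the dataset path and filename below are illustrative, and `is_of_type` is assumed to test the receptor/ligand and bound/unbound flags encoded in the filename:

# Hypothetical call: find the bound counterpart of an unbound receptor file.
match = find_of_type('11as_r_u.pdb', '/path/to/db5/structures',
                     receptor=True, bound=True, style='db5')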
Example #3
def _generate_clean_unbound_bound(filename_u, filename_b, results_dir, style):
    """Perform alignment on unbound and bound files to standardize them."""
    b2r_chain, u2r_chain, b2r_res, u2r_res = \
        _get_chain_mapping(filename_u, filename_b, style)

    aug_pdb_code_u = get_pdb_code_with_partner_and_binding(filename_u)
    if style == 'db5':
        aug_pdb_code_b = get_pdb_code_with_partner_and_binding(filename_b)
        pdb_extension = 'pdb'
    elif style == 'dockground':
        partner = _get_partner(filename_u)
        aug_pdb_code_b = db.get_pdb_code(filename_b) + '_' + partner + '_b'
        pdb_extension = db.get_pdb_type(filename_b)

    output_filename_u = results_dir + '/' + aug_pdb_code_u + '_cleaned.' + \
        pdb_extension
    output_filename_b = results_dir + '/' + aug_pdb_code_b + '_cleaned.' + \
        pdb_extension
    _generate_reference(
        filename_b, b2r_chain, b2r_res, output_filename_b, style)
    _generate_reference(
        filename_u, u2r_chain, u2r_res, output_filename_u, style)

    output_mapping_u = results_dir + '/' + aug_pdb_code_u + '_toref.pkl'
    output_mapping_b = results_dir + '/' + aug_pdb_code_b + '_toref.pkl'
    _generate_mapping(u2r_chain, u2r_res, output_mapping_u)
    _generate_mapping(b2r_chain, b2r_res, output_mapping_b)
Example #4
def get_pdb_code_with_binding(pdb_filename):
    """
    Get pdb code with binding state annotated.

    e.g. 11as_r_u.pdb would give 11as_u
    """
    return db.get_pdb_code(pdb_filename) + '_' + _get_binding(pdb_filename)
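`_get_binding` is not shown here; given the `11as_r_u.pdb` convention in the docstring, it presumably extracts the trailing bound/unbound token. A minimal sketch under that assumption (the `_sketch` suffix marks it as a stand-in):

import os

def _get_binding_sketch(pdb_filename):
    # Assumed convention: <code>_<partner>_<binding>.<ext>, e.g. '11as_r_u.pdb' -> 'u'.
    stem = os.path.basename(pdb_filename).split('.')[0]
    return stem.split('_')[2]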
Example #5
def get_pdb_code_with_partner(pdb_filename):
    """
    Get pdb code with partner annotated.

    e.g. 11as_r_u.pdb would give 11as_r
    """
    return db.get_pdb_code(pdb_filename) + '_' + _get_partner(pdb_filename)
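Likewise, `_get_partner` presumably returns the receptor/ligand token from the same filename convention; a stand-in sketch under that assumption:

import os

def _get_partner_sketch(pdb_filename):
    # Assumed convention: <code>_<partner>_<binding>.<ext>, e.g. '11as_r_u.pdb' -> 'r'.
    stem = os.path.basename(pdb_filename).split('.')[0]
    return stem.split('_')[1]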
Example #6
def map_all_pssms(pdb_dataset, blastdb, output_dir, num_cpus):
    ext = '.pkl'
    requested_filenames = \
        db.get_structures_filenames(pdb_dataset, extension=ext)
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(output_dir,
                                                     extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    work_filenames = [
        x[0] for x in db.get_all_filenames(work_keys,
                                           pdb_dataset,
                                           extension=ext,
                                           keyer=lambda x: db.get_pdb_name(x))
    ]

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".pkl")

    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)
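Examples #6 through #9 share the same resumable-batch pattern: diff the requested keys against the already-produced keys and submit only the remainder. A condensed sketch of that pattern (names are illustrative):

def keys_left_to_do(requested_filenames, produced_filenames, get_key):
    # Work = requested keys minus keys already produced, preserving request order.
    produced = {get_key(f) for f in produced_filenames}
    return [get_key(f) for f in requested_filenames
            if get_key(f) not in produced]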
Example #7
def parse_all(pdb_dataset, output_dir, num_cpus):
    """Parse pdb dataset (pdb files) to pandas dataframes."""
    requested_filenames = db.get_structures_filenames(pdb_dataset)
    produced_filenames = db.get_structures_filenames(
        output_dir, extension='.pkl')

    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    work_filenames = [x[0] for x in
                      db.get_all_filenames(work_keys, pdb_dataset, enforcement=2)]

    logging.info("{:} requested keys, {:} produced keys, {:} work keys"
                 .format(len(requested_keys), len(produced_keys),
                         len(work_keys)))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")

    inputs = [(key, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(parse, inputs, num_cpus)
Example #8
def main(pair_dir, tfrecord_dir, num_cpus):
    """Run write_pairs on all provided complexes."""
    requested_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = \
        db.get_structures_filenames(tfrecord_dir, extension='.tfrecord')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]

    work_keys = [key for key in requested_keys if key not in produced_keys]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    work_filenames = [
        x[0]
        for x in db.get_all_filenames(work_keys, pair_dir, extension='.dill')
    ]

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = tfrecord_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".tfrecord")

    inputs = [(i, o) for i, o in zip(work_filenames, output_filenames)]
    par.submit_jobs(pairs_to_tfrecord, inputs, num_cpus)
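`par.submit_jobs` is the fan-out primitive used throughout these examples. Its implementation is not shown; a minimal stand-in with `multiprocessing`, assuming it simply applies the worker to each argument tuple and collects the results:

import multiprocessing as mp

def submit_jobs_sketch(worker, inputs, num_cpus):
    # Hypothetical stand-in for par.submit_jobs: run worker(*args) for every tuple in inputs.
    with mp.Pool(num_cpus) as pool:
        return pool.starmap(worker, inputs)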
Example #9
def main(pair_dir, to_keep_dir, output_dir, num_cpus):
    """Run write_pairs on all provided complexes."""
    to_keep_filenames = \
        db.get_structures_filenames(to_keep_dir, extension='.txt')
    if len(to_keep_filenames) == 0:
        logging.warning(
            "There is no to_keep file in {:}. All pair files from {:} "
            "will be copied into {:}".format(to_keep_dir, pair_dir,
                                             output_dir))

    to_keep_df = __load_to_keep_files_into_dataframe(to_keep_filenames)
    logging.info("There are {:} rows, cols in to_keep_df".format(
        to_keep_df.shape))

    logging.info("Looking for all pairs in {:}".format(pair_dir))
    work_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    work_keys = [db.get_pdb_name(x) for x in work_filenames]
    logging.info("Found {:} pairs in {:}".format(len(work_keys), output_dir))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".dill")

    inputs = [(i, o, to_keep_df)
              for i, o in zip(work_filenames, output_filenames)]
    ncopied = 0
    ncopied += np.sum(par.submit_jobs(process_pairs_to_keep, inputs, num_cpus))
    logging.info("{:} out of {:} pairs was copied".format(
        ncopied, len(work_keys)))
Example #10
def complex_to_pairs(complex, get_pairs, output_dir):
    pairs_txt = output_dir + '/pairs.txt'
    name = complex.name
    logging.info("Working on {:}".format(name))
    pairs, num_subunits = get_pairs(complex)
    logging.info("For complex {:} found {:} pairs out of {:} chains"
                 .format(name, len(pairs), num_subunits))
    sub_dir = output_dir + '/' + db.get_pdb_code(name)[1:3]
    f = name
    if ('mut' in f) and ('mut' not in db.get_pdb_code(name)):
        pdb = db.get_pdb_code(name) + f[f.rfind('_') + 1: f.find('.')]
        sub_dir = output_dir + '/' + pdb
    with sem:
        if len(pairs) > 0:
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir)
            with open(pairs_txt, 'a') as f:
                f.write(name + '\n')

    for i, pair in enumerate(pairs):
        output_dill = "{:}/{:}_{:}.dill".format(sub_dir, name, i)
        write_pair_as_dill(pair, output_dill)
Example #11
def main(raw_pdb_dir, pruned_pairs_dir, output_dir, neighbor_def, cutoff,
         num_cpus):
    """Run postprocess_pruned_pairs on all provided complexes."""
    logging.info("Looking for all pairs in {:}".format(pruned_pairs_dir))
    work_filenames = \
        db.get_structures_filenames(pruned_pairs_dir, extension='.dill')
    work_keys = [db.get_pdb_name(x) for x in work_filenames]
    logging.info("Found {:} pairs in {:}".format(len(work_keys), output_dir))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".dill")

    inputs = [(raw_pdb_dir, neighbor_def, cutoff, i, o)
              for i, o in zip(work_filenames, output_filenames)]
    n_copied = 0
    n_copied += np.sum(
        par.submit_jobs(postprocess_pruned_pairs, inputs, num_cpus))
    logging.info("{:} out of {:} pairs was copied".format(
        n_copied, len(work_keys)))
Example #12
def get_complex_pdb_codes(pdb_dataset):
    """Get complexes in provided directory."""
    complexes = set()
    for structure in db.get_structures_filenames(pdb_dataset):
        complexes.add(db.get_pdb_code(structure))
    return complexes
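A hedged usage example (the directory path is illustrative):

# A dataset containing 11as_r_u.pdb and 11as_l_b.pdb yields {'11as'}.
codes = get_complex_pdb_codes('/path/to/pdb_dataset')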
Example #13
def map_all_profile_hmms(pkl_dataset, pruned_dataset, output_dir, hhsuite_db,
                         num_cpu_jobs, num_cpus_per_job, source_type, num_iter,
                         rank, size, write_file):
    ext = '.pkl'
    if write_file:
        if source_type.lower() == 'rcsb':
            # Filter out pairs that did not survive pruning previously to reduce complexity
            pruned_pdb_names = [
                db.get_pdb_name(filename)
                for filename in db.get_structures_filenames(pruned_dataset,
                                                            extension='.dill')
            ]
            requested_filenames = [
                os.path.join(pkl_dataset,
                             db.get_pdb_code(pruned_pdb_name)[1:3],
                             pruned_pdb_name.split('_')[0] + ext)
                for pruned_pdb_name in pruned_pdb_names
            ]
        else:  # DB5 does not employ pair pruning, so there are no pairs to filter
            requested_filenames = [
                filename
                for filename in db.get_structures_filenames(pkl_dataset,
                                                            extension=ext)
            ]

        # Filter DB5 filenames to unbound type and get all work filenames
        requested_filenames = [
            filename for filename in requested_filenames
            if (source_type.lower() == 'db5' and '_u_' in filename) or (
                source_type.lower() in
                ['rcsb', 'evcoupling', 'casp_capri', 'input'])
        ]
        requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
        produced_filenames = db.get_structures_filenames(output_dir,
                                                         extension='.pkl')
        produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
        work_keys = [key for key in requested_keys if key not in produced_keys]
        establish_pdb_code_case = lambda pdb_code, source_type: pdb_code.lower() \
            if source_type.lower() == 'casp_capri' \
            else pdb_code.upper()
        work_filenames = [
            os.path.join(
                pkl_dataset,
                establish_pdb_code_case(db.get_pdb_code(work_key),
                                        source_type)[1:3], work_key + ext)
            for work_key in work_keys
        ]

        # Remove any duplicate filenames
        work_filenames = list(set(work_filenames))
        logging.info(
            "{:} requested keys, {:} produced keys, {:} work filenames".format(
                len(requested_keys), len(produced_keys), len(work_filenames)))

        if source_type.lower() == 'input':
            # Directly generate profile HMM features after aggregating input filenames
            logging.info("{:} work filenames".format(len(work_filenames)))

            output_filenames = []
            for pdb_filename in work_filenames:
                sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
                if not os.path.exists(sub_dir):
                    os.makedirs(sub_dir, exist_ok=True)
                output_filenames.append(sub_dir + '/' +
                                        db.get_pdb_name(pdb_filename) + '.pkl')

            inputs = [(num_cpus_per_job, key, output, hhsuite_db, source_type,
                       num_iter)
                      for key, output in zip(work_filenames, output_filenames)]
            par.submit_jobs(map_profile_hmms, inputs, num_cpu_jobs)
        else:
            # Write out a local file containing all work filenames
            temp_df = pd.DataFrame({'filename': work_filenames})
            temp_df.to_csv(f'{source_type}_work_filenames.csv')
            logging.info(
                'File containing work filenames written to storage. Exiting...'
            )

    # Read from previously-created work filenames CSV
    else:
        work_filenames = pd.read_csv(
            f'{source_type}_work_filenames.csv').iloc[:, 1].to_list()
        work_filenames = list(
            set(work_filenames))  # Remove any duplicate filenames

        # Reserve an equally-sized portion of the full work load for a given rank in the MPI world
        work_filename_rank_batches = slice_list(work_filenames, size)
        work_filenames = work_filename_rank_batches[rank]

        logging.info("{:} work filenames".format(len(work_filenames)))

        output_filenames = []
        for pdb_filename in work_filenames:
            sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir, exist_ok=True)
            output_filenames.append(sub_dir + '/' +
                                    db.get_pdb_name(pdb_filename) + '.pkl')

        inputs = [(num_cpus_per_job, key, output, hhsuite_db, source_type,
                   num_iter)
                  for key, output in zip(work_filenames, output_filenames)]
        par.submit_jobs(map_profile_hmms, inputs, num_cpu_jobs)
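`slice_list` is assumed to split the full work list into `size` near-equal batches so that each MPI rank processes one slice; a minimal sketch under that contract:

def slice_list_sketch(items, num_slices):
    # Split items into num_slices contiguous, near-equal batches (one per MPI rank).
    base, remainder = divmod(len(items), num_slices)
    batches, start = [], 0
    for i in range(num_slices):
        end = start + base + (1 if i < remainder else 0)
        batches.append(items[start:end])
        start = end
    return batches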
Example #14
def map_all_pssms(pkl_dataset, pruned_dataset, blastdb, output_dir, num_cpus,
                  source_type, rank, size):
    ext = '.pkl'
    if source_type.lower() == 'rcsb':
        # Filter out pairs that did not survive pruning previously to reduce complexity
        pruned_pdb_names = [
            db.get_pdb_name(filename)
            for filename in db.get_structures_filenames(pruned_dataset,
                                                        extension='.dill')
        ]
        requested_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(pruned_pdb_name)[1:3],
                         pruned_pdb_name.split('_')[0] + ext)
            for pruned_pdb_name in pruned_pdb_names
        ]
    else:  # DB5 does not employ pair pruning, so there are no pairs to filter
        requested_filenames = [
            filename for filename in db.get_structures_filenames(pkl_dataset,
                                                                 extension=ext)
        ]

    # Filter DB5 filenames to unbound type and get all work filenames
    requested_filenames = [
        filename for filename in requested_filenames
        if (source_type.lower() == 'db5' and '_u_' in filename) or
        (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri'])
    ]
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(output_dir,
                                                     extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    if source_type.lower() == 'rcsb' or source_type.lower() == 'casp_capri':
        work_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(work_key)[1:3], work_key + ext)
            for work_key in work_keys
        ]
    else:
        work_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(work_key)[1:3].upper(),
                         work_key + ext) for work_key in work_keys
        ]

    # Remove any duplicate filenames
    work_filenames = list(set(work_filenames))

    # Reserve an equally-sized portion of the full work load for a given rank in the MPI world
    work_filename_rank_batches = slice_list(work_filenames, size)
    work_filenames = work_filename_rank_batches[rank]

    logging.info(
        "{:} requested keys, {:} produced keys, {:} work filenames".format(
            len(requested_keys), len(produced_keys), len(work_filenames)))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir, exist_ok=True)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".pkl")

    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)
Example #15
def map_all_protrusion_indices(psaia_dir, psaia_config_file, pdb_dataset,
                               pkl_dataset, pruned_dataset, output_dir,
                               source_type):
    ext = '.pkl'
    if source_type.lower() == 'rcsb':
        # Filter out pairs that did not survive pruning previously to reduce complexity
        pruned_pdb_names = [
            db.get_pdb_name(filename)
            for filename in db.get_structures_filenames(pruned_dataset,
                                                        extension='.dill')
        ]
        requested_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(pruned_pdb_name)[1:3],
                         pruned_pdb_name.split('_')[0] + ext)
            for pruned_pdb_name in pruned_pdb_names
        ]
    else:  # DB5 does not employ pair pruning, so there are no pairs to filter
        requested_filenames = [
            filename for filename in db.get_structures_filenames(pkl_dataset,
                                                                 extension=ext)
        ]

    # Filter DB5 filenames to unbound type and get all work filenames
    requested_filenames = [
        filename for filename in requested_filenames
        if (source_type.lower() == 'db5' and '_u_' in filename) or
        (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri', 'input'])
    ]
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    requested_pdb_codes = [db.get_pdb_code(x) for x in requested_filenames]
    produced_filenames_path = os.path.join(output_dir, 'PSAIA',
                                           source_type.upper())
    produced_filenames = [
        path.as_posix()
        for path in Path(produced_filenames_path).rglob('*.tbl')
    ]
    produced_keys = [db.get_pdb_code(x) for x in produced_filenames]
    work_keys = [
        key for key, pdb_code in zip(requested_keys, requested_pdb_codes)
        if pdb_code not in produced_keys
    ]
    format_pdb_code_for_inputs = lambda pdb_code, source_type: pdb_code[1:3] \
        if source_type.lower() in ['input'] \
        else pdb_code.upper()
    if source_type.lower() == 'rcsb' or source_type.lower() == 'casp_capri':
        work_filenames = [
            os.path.join(pdb_dataset,
                         db.get_pdb_code(work_key)[1:3], work_key)
            for work_key in work_keys
        ]
    else:
        work_filenames = [
            os.path.join(
                pdb_dataset,
                format_pdb_code_for_inputs(db.get_pdb_code(work_key),
                                           source_type), work_key)
            for work_key in work_keys
        ]

    # Remove any duplicate filenames
    work_filenames = list(set(work_filenames))

    # Report how many PDB files remain to be processed with PSAIA
    logging.info("{:} PDB files to process with PSAIA".format(
        len(work_filenames)))

    # Create a comprehensive file list for PSAIA to process (single-threaded) for the requested features (e.g. protrusion)
    file_list_file = os.path.join(output_dir, 'PSAIA', source_type.upper(),
                                  'pdb_list.fls')
    with open(file_list_file, 'w') as file:
        for requested_pdb_filename in work_filenames:
            file.write(f'{requested_pdb_filename}\n')

    inputs = [(psaia_dir, psaia_config_file, file_list_file)]
    par.submit_jobs(map_protrusion_indices, inputs,
                    1)  # PSAIA is inherently single-threaded in execution