Example #1
def _lookup(self, file_path):
    # Map a structure file to its (target, decoy) key and score row.
    target = util.get_target_name(file_path)
    decoy = util.get_decoy_name(file_path)
    key = (target, decoy)
    if key in self._scores.index:
        return key, self._scores.loc[key]
    return None, None
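For this lookup to work, self._scores must be a DataFrame indexed by a (target, decoy) MultiIndex. A minimal sketch of building such a frame from the tab-delimited label files written by run_tmscore_per_target (Example #3); the load_scores helper name is hypothetical:

import pandas as pd

def load_scores(label_files):
    """Hypothetical construction of the score table assumed by _lookup.

    Each .dat file is a tab-delimited table with 'target' and 'decoy'
    columns, as written by run_tmscore_per_target below.
    """
    frames = [pd.read_csv(f, sep='\t') for f in label_files]
    scores = pd.concat(frames, ignore_index=True)
    # Index by (target, decoy) so `key in scores.index` and
    # `scores.loc[key]` behave as _lookup expects.
    return scores.set_index(['target', 'decoy']).sort_index()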
Example #2
def main(data_dir, target_list, labels_dir, struct_format,
         num_cpus, overwrite, tmscore_exe):
    """ Compute rmsd, tm-score, gdt-ts, gdt-ha of decoy structures
    """
    logger = logging.getLogger(__name__)
    logger.info("Compute rmsd, tm-score, gdt-ts, gdt-ha of decoys in {:}".format(
        data_dir))

    os.makedirs(labels_dir, exist_ok=True)

    with open(target_list, 'r') as f:
        requested_filenames = \
            [os.path.join(labels_dir, '{:}.dat'.format(x.strip())) for x in f]
    logger.info("{:} requested keys".format(len(requested_filenames)))

    produced_filenames = []
    if not overwrite:
        produced_filenames = [f for f in fi.find_files(labels_dir, 'dat')
                              if 'targets' not in f]
    logger.info("{:} produced keys".format(len(produced_filenames)))

    inputs = []
    for filename in requested_filenames:
        if filename in produced_filenames:
            continue
        target_name = util.get_target_name(filename)
        target_dir = os.path.join(data_dir, target_name)
        inputs.append((tmscore_exe, filename, target_name,
                       target_dir, struct_format))

    logger.info("{:} work keys".format(len(inputs)))
    par.submit_jobs(run_tmscore_per_target, inputs, num_cpus)
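par.submit_jobs is not shown in this excerpt. A minimal sketch of what such a helper might look like, assuming it simply maps a worker function over argument tuples with a process pool; the real implementation may add logging or error handling:

import multiprocessing as mp

def submit_jobs(func, inputs, num_cpus):
    """Apply func to each argument tuple, optionally in parallel."""
    if num_cpus <= 1:
        # Run serially; easier to debug and profile.
        for args in inputs:
            func(*args)
    else:
        with mp.Pool(num_cpus) as pool:
            pool.starmap(func, inputs)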
Example #3
def run_tmscore_per_target(tmscore_exe, output_filename, target_name,
                           target_dir, struct_format):
    '''
    Run TM-score to compare all decoy structures of a target with its
    native structure. Write the result into a tab-delimited file with
    the following headers:
        <target>  <decoy>  <rmsd>  <gdt_ts>  <gdt_ha>  <tm>
    '''
    native = os.path.join(target_dir, '{:}.{:}'.format(
        target_name, struct_format))
    decoys = fi.find_files(target_dir, struct_format)
    logging.info("Running tm-scores for {:} with {:} decoys".format(
        target_name, len(decoys)))
    rows = []
    for decoy in decoys:
        result = run_tmscore_per_structure(tmscore_exe, decoy, native)
        if result is None:
            logging.warning("Skip target {:} decoy {:} due to failure".format(
                target_name, decoy))
            continue
        rmsd, tm, gdt_ts, gdt_ha = result
        rows.append([util.get_target_name(decoy), util.get_decoy_name(decoy),
                     rmsd, gdt_ts, gdt_ha, tm])
    df = pd.DataFrame(
        rows,
        columns=['target', 'decoy', 'rmsd', 'gdt_ts', 'gdt_ha', 'tm'])
    df = df.sort_values(
            ['rmsd', 'gdt_ts', 'gdt_ha', 'tm', 'decoy'],
            ascending=[True, False, False, False, False]).reset_index(drop=True)
    # Write to file
    df.to_csv(output_filename, sep='\t', index=False)
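run_tmscore_per_structure is referenced but not shown. A sketch of one plausible implementation, assuming the standard text output of the TM-score program; the exact field labels may differ across TM-score versions, so treat the regexes as assumptions:

import re
import subprocess

def run_tmscore_per_structure(tmscore_exe, decoy, native):
    """Run TM-score on one decoy/native pair.

    Returns (rmsd, tm, gdt_ts, gdt_ha), or None on failure.
    """
    try:
        output = subprocess.check_output(
            [tmscore_exe, decoy, native], universal_newlines=True)
    except (subprocess.CalledProcessError, OSError):
        return None
    # Field labels as printed by common TM-score builds (assumption).
    patterns = {
        'rmsd': r'RMSD of\s+the common residues\s*=\s*([0-9.]+)',
        'tm': r'TM-score\s*=\s*([0-9.]+)',
        'gdt_ts': r'GDT-TS-score\s*=\s*([0-9.]+)',
        'gdt_ha': r'GDT-HA-score\s*=\s*([0-9.]+)',
    }
    scores = {}
    for name, pattern in patterns.items():
        match = re.search(pattern, output)
        if match is None:
            return None
        scores[name] = float(match.group(1))
    return scores['rmsd'], scores['tm'], scores['gdt_ts'], scores['gdt_ha']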
Example #4
def _lookup(self, file_path):
    target = util.get_target_name(file_path)
    decoy = util.get_decoy_name(file_path)
    key = (target, decoy)
    if key in self._scores.index:
        # .loc[key] can return multiple rows if the index has duplicates;
        # keep the first row and convert it to a plain dict of floats.
        score = self._scores.loc[key].head(1).astype(np.float64).squeeze().to_dict()
        return key, score
    return key, None
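The extra .head(1).astype(np.float64).squeeze().to_dict() chain guards against duplicate (target, decoy) rows: .loc[key] returns a DataFrame rather than a Series when the index is not unique, so taking the first row and squeezing it yields a Series that converts cleanly to a dict. A small illustration with made-up scores:

import numpy as np
import pandas as pd

scores = pd.DataFrame(
    [['T0860', 'decoy_01', 2.1, 0.72],
     ['T0860', 'decoy_01', 2.1, 0.72]],  # duplicate key on purpose
    columns=['target', 'decoy', 'rmsd', 'tm'],
).set_index(['target', 'decoy'])

key = ('T0860', 'decoy_01')
# .loc[key] is a 2-row DataFrame here; head(1) + squeeze() collapses it
# to a Series, and to_dict() yields {'rmsd': 2.1, 'tm': 0.72}.
score = scores.loc[key].head(1).astype(np.float64).squeeze().to_dict()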
Example #5
def casp_ensembler(pdb_files):
    targets = col.defaultdict(list)
    for f in pdb_files:
        target_name = util.get_target_name(f)
        targets[target_name].append(f)

    # target_name -> (decoy_name -> filename)
    ensembles = {}
    for target_name, files in targets.items():
        subunits = {util.get_decoy_name(f): f for f in files}
        ensembles[target_name] = subunits

    return ensembles
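A usage sketch with hypothetical CASP-style file names, showing the nested mapping the function returns; this assumes util.get_target_name / util.get_decoy_name parse names of this shape:

pdb_files = [
    'T0860/T0860_server01.pdb',
    'T0860/T0860_server02.pdb',
    'T0861/T0861_server01.pdb',
]
ensembles = casp_ensembler(pdb_files)
# Expected shape (exact keys depend on the util name parsers):
# {'T0860': {'T0860_server01': 'T0860/T0860_server01.pdb',
#            'T0860_server02': 'T0860/T0860_server02.pdb'},
#  'T0861': {'T0861_server01': 'T0861/T0861_server01.pdb'}}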
Example #6
def gen_splits(target_list, input_dir, output_sharded_train, output_sharded_val,
               output_sharded_test, splitby, test_years, train_years, val_years,
               train_size, val_size, test_size,
               train_decoy_size, val_decoy_size, test_decoy_size,
               exclude_natives, shuffle, random_seed):
    """ Generate train/val/test sets from the input dataset. """
    targets_df = pd.read_csv(
        target_list, delimiter=r'\s+', engine='python').dropna()

    files = fi.find_files(input_dir, dt.patterns['pdb'])
    structures_df = pd.DataFrame(
        [[util.get_target_name(f), util.get_decoy_name(f), f] for f in files],
        columns=['target', 'decoy', 'path'])
    # Remove duplicates
    structures_df = structures_df.drop_duplicates(
        subset=['target', 'decoy'], keep='first').reset_index(drop=True)
    structures_df = pd.merge(structures_df, targets_df, on='target')

    # Keep only (target, year) pairs that also appear in structures_df
    targets_df = structures_df[['target', 'year']].drop_duplicates(
        keep='first').reset_index(drop=True)

    if splitby == 'random':
        targets_train, targets_val, targets_test = split_targets_random(
            targets_df, train_size, val_size, test_size, shuffle, random_seed)
    elif splitby == 'year':
        targets_train, targets_val, targets_test = split_targets_by_year(
            targets_df, test_years, train_years, val_years, val_size,
            shuffle, random_seed)
    else:
        raise ValueError('Unrecognized splitby option %s' % splitby)

    print('Generating dataset: train ({:} targets), val ({:} targets), '
          'test ({:} targets)'.format(len(targets_train), len(targets_val),
                                      len(targets_test)))

    train_set, val_set, test_set = generate_train_val_targets_tests(
        structures_df, targets_train, targets_val, targets_test,
        train_decoy_size, val_decoy_size, test_decoy_size,
        exclude_natives, random_seed)

    print('Finished generating dataset: train ({:} decoys), val ({:} decoys), '
          'test ({:} decoys)'.format(len(train_set), len(val_set), len(test_set)))

    for (output_sharded, dataset) in [(output_sharded_train, train_set),
                                      (output_sharded_val, val_set),
                                      (output_sharded_test, test_set)]:
        print('\nWriting out dataset to {:}'.format(output_sharded))
        files = dataset.path.unique()
        create_sharded_dataset(files, output_sharded)
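The split helpers are not shown in this excerpt. A minimal sketch of split_targets_random, assuming the size arguments are target counts and that shuffling uses the given seed; split_targets_by_year would filter on the 'year' column instead:

import numpy as np

def split_targets_random(targets_df, train_size, val_size, test_size,
                         shuffle, random_seed):
    """Partition targets into disjoint train/val/test sets by row."""
    indices = np.arange(len(targets_df))
    if shuffle:
        np.random.RandomState(random_seed).shuffle(indices)
    train_end = train_size
    val_end = train_size + val_size
    test_end = val_end + test_size
    train = targets_df.iloc[indices[:train_end]]
    val = targets_df.iloc[indices[train_end:val_end]]
    test = targets_df.iloc[indices[val_end:test_end]]
    return train, val, test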
Example #7
def train_model(sess, args):
    # tf Graph input
    # Subgrid maps for each residue in a protein
    logging.debug('Create input placeholder...')
    grid_size = subgrid_gen.grid_size(args.grid_config)
    channel_size = subgrid_gen.num_channels(args.grid_config)
    feature_placeholder = tf.placeholder(
        tf.float32, [None, grid_size, grid_size, grid_size, channel_size],
        name='main_input')
    label_placeholder = tf.placeholder(tf.float32, [None, 1], 'label')

    # Placeholder for model parameters
    training_placeholder = tf.placeholder(tf.bool,
                                          shape=[],
                                          name='is_training')
    conv_drop_rate_placeholder = tf.placeholder(tf.float32,
                                                name='conv_drop_rate')
    fc_drop_rate_placeholder = tf.placeholder(tf.float32, name='fc_drop_rate')
    top_nn_drop_rate_placeholder = tf.placeholder(tf.float32,
                                                  name='top_nn_drop_rate')

    # Define loss and optimizer
    logging.debug('Define loss and optimizer...')
    predict_op, loss_op = conv_model(feature_placeholder, label_placeholder,
                                     training_placeholder,
                                     conv_drop_rate_placeholder,
                                     fc_drop_rate_placeholder,
                                     top_nn_drop_rate_placeholder, args)
    logging.debug('Generate training ops...')
    train_op = model.training(loss_op, args.learning_rate)

    # Initialize the variables (i.e. assign their default value)
    logging.debug('Initializing global variables...')
    init = tf.global_variables_initializer()

    # Create saver and summaries.
    logging.debug('Initializing saver...')
    saver = tf.train.Saver(max_to_keep=100000)
    logging.debug('Finished initializing saver...')

    def __loop(generator, mode, num_iters):
        tf_dataset, next_element = batch_dataset_generator(
            generator, args, is_testing=(mode == 'test'))

        structs, losses, preds, labels = [], [], [], []
        epoch_loss = 0
        progress_format = mode + ' loss: {:6.6f}'

        # Loop over all batches (one batch holds all features for one protein)
        num_batches = int(math.ceil(float(num_iters) / args.batch_size))
        #print('Running {:} -> {:} iters in {:} batches (batch size: {:})'.format(
        #    mode, num_iters, num_batches, args.batch_size))
        with tqdm.tqdm(total=num_batches, desc=progress_format.format(0)) as t:
            for i in range(num_batches):
                try:
                    struct_, feature_, label_ = sess.run(next_element)
                    # Only run the optimizer in train mode; val/test passes
                    # must not update the weights.
                    fetches = [predict_op, loss_op]
                    if mode == 'train':
                        fetches.append(train_op)
                    feed_dict = {
                        feature_placeholder: feature_,
                        label_placeholder: label_,
                        training_placeholder: (mode == 'train'),
                        conv_drop_rate_placeholder:
                            args.conv_drop_rate if mode == 'train' else 0.0,
                        fc_drop_rate_placeholder:
                            args.fc_drop_rate if mode == 'train' else 0.0,
                        top_nn_drop_rate_placeholder:
                            args.top_nn_drop_rate if mode == 'train' else 0.0,
                    }
                    pred, loss = sess.run(fetches, feed_dict=feed_dict)[:2]
                    epoch_loss += (np.mean(loss) - epoch_loss) / (i + 1)
                    structs.extend(struct_.astype(str))
                    losses.append(loss)
                    preds.extend(pred)
                    labels.extend(label_)

                    t.set_description(progress_format.format(epoch_loss))
                    t.update(1)
                except (tf.errors.OutOfRangeError, StopIteration):
                    logging.info("\nEnd of dataset at iteration {:}".format(i))
                    break

        def __concatenate(array):
            # np.concatenate fails on an empty or ragged list; fall back to
            # returning the list unchanged in that case.
            try:
                return np.concatenate(array)
            except ValueError:
                return array

        structs = __concatenate(structs)
        preds = __concatenate(preds)
        labels = __concatenate(labels)
        losses = __concatenate(losses)
        return structs, preds, labels, losses, epoch_loss

    # Run the initializer
    logging.debug('Running initializer...')
    sess.run(init)
    logging.debug('Finished running initializer...')

    ##### Training + validation
    if not args.test_only:
        prev_val_loss, best_val_loss = float("inf"), float("inf")

        if args.max_targets_train is None and args.max_decoys_train is None:
            train_num_structs = args.train_sharded.get_num_structures(
                ['ensemble', 'subunit'])
        elif args.max_targets_train is None:
            train_num_structs = (args.train_sharded.get_num_keyed()
                                 * args.max_decoys_train)
        elif args.max_decoys_train is None:
            assert False, 'max_targets_train requires max_decoys_train'
        else:
            train_num_structs = args.max_targets_train * args.max_decoys_train

        if args.max_targets_val is None and args.max_decoys_val is None:
            val_num_structs = args.val_sharded.get_num_structures(
                ['ensemble', 'subunit'])
        elif args.max_targets_val is None:
            val_num_structs = (args.val_sharded.get_num_keyed()
                               * args.max_decoys_val)
        elif args.max_decoys_val is None:
            assert False, 'max_targets_val requires max_decoys_val'
        else:
            val_num_structs = args.max_targets_val * args.max_decoys_val

        train_num_structs *= args.repeat_gen
        #val_num_structs *= args.repeat_gen

        logging.info(
            "Start training with {:} structs for train and {:} structs for val per epoch"
            .format(train_num_structs, val_num_structs))

        def _save():
            ckpt = saver.save(sess,
                              os.path.join(args.output_dir, 'model-ckpt'),
                              global_step=epoch)
            return ckpt

        run_info_filename = os.path.join(args.output_dir, 'run_info.json')
        run_info = {}

        def __update_and_write_run_info(key, val):
            run_info[key] = val
            with open(run_info_filename, 'w') as f:
                json.dump(run_info, f, indent=4)

        per_epoch_val_losses = []
        for epoch in range(1, args.num_epochs + 1):
            random_seed = args.random_seed  #random.randint(1, 10e6)
            logging.info('Epoch {:} - random_seed: {:}'.format(
                epoch, args.random_seed))

            logging.debug('Creating train generator...')
            train_generator_callable = functools.partial(
                feature_psp.dataset_generator,
                args.train_sharded,
                args.grid_config,
                score_type=args.score_type,
                shuffle=args.shuffle,
                repeat=args.repeat_gen,
                max_targets=args.max_targets_train,
                max_decoys=args.max_decoys_train,
                max_dist_threshold=300.0,
                random_seed=random_seed)

            logging.debug('Creating val generator...')
            val_generator_callable = functools.partial(
                feature_psp.dataset_generator,
                args.val_sharded,
                args.grid_config,
                score_type=args.score_type,
                shuffle=args.shuffle,
                repeat=1,  #*args.repeat_gen,
                max_targets=args.max_targets_val,
                max_decoys=args.max_decoys_val,
                max_dist_threshold=300.0,
                random_seed=random_seed)

            # Training
            train_structs, train_preds, train_labels, _, curr_train_loss = __loop(
                train_generator_callable, 'train', num_iters=train_num_structs)
            # Validation
            val_structs, val_preds, val_labels, _, curr_val_loss = __loop(
                val_generator_callable, 'val', num_iters=val_num_structs)

            per_epoch_val_losses.append(curr_val_loss)
            __update_and_write_run_info('val_losses', per_epoch_val_losses)

            if args.use_best or args.early_stopping:
                if curr_val_loss < best_val_loss:
                    # Found new best epoch.
                    best_val_loss = curr_val_loss
                    ckpt = _save()
                    __update_and_write_run_info('val_best_loss', best_val_loss)
                    __update_and_write_run_info('best_ckpt', ckpt)
                    logging.info("New best {:}".format(ckpt))

            if epoch == args.num_epochs and not args.use_best:
                # Final epoch and we keep only the last checkpoint.
                ckpt = _save()
                __update_and_write_run_info('best_ckpt', ckpt)
                logging.info("Last checkpoint {:}".format(ckpt))

            if args.save_all_ckpts:
                # Save at every checkpoint
                ckpt = _save()
                logging.info("Saving checkpoint {:}".format(ckpt))

            ## Save last train and val results
            train_df = pd.DataFrame(
                np.array([train_structs, train_labels, train_preds]).T,
                columns=['structure', 'true', 'pred'],
            )
            train_df['target'] = train_df.structure.apply(
                lambda x: psp_util.get_target_name(x))
            train_df.to_pickle(
                os.path.join(args.output_dir,
                             'train_result-{:}.pkl'.format(epoch)))
            __stats('Train Epoch {:}'.format(epoch), train_df)

            val_df = pd.DataFrame(
                np.array([val_structs, val_labels, val_preds]).T,
                columns=['structure', 'true', 'pred'],
            )
            val_df['target'] = val_df.structure.apply(
                lambda x: psp_util.get_target_name(x))
            val_df.to_pickle(
                os.path.join(args.output_dir,
                             'val_result-{:}.pkl'.format(epoch)))
            __stats('Val Epoch {:}'.format(epoch), val_df)

            if args.early_stopping and curr_val_loss >= prev_val_loss:
                logging.info("Validation loss stopped decreasing, stopping...")
                break
            else:
                prev_val_loss = curr_val_loss

        logging.info("Finished training")

    ##### Testing
    logging.debug("Run testing")
    if not args.test_only:
        to_use = run_info['best_ckpt'] if args.use_best else ckpt
    else:
        with open(os.path.join(args.model_dir, 'run_info.json')) as f:
            run_info = json.load(f)
        to_use = run_info['best_ckpt']
        saver = tf.train.import_meta_graph(to_use + '.meta')

    logging.info("Using {:} for testing".format(to_use))
    saver.restore(sess, to_use)

    test_generator_callable = functools.partial(
        feature_psp.dataset_generator,
        args.test_sharded,
        args.grid_config,
        score_type=args.score_type,
        shuffle=args.shuffle,
        repeat=1,
        max_targets=args.max_targets_test,
        max_decoys=args.max_decoys_test,
        max_dist_threshold=None,
        random_seed=args.random_seed)

    if args.max_targets_test is None and args.max_decoys_test is None:
        test_num_structs = args.test_sharded.get_num_structures(
            ['ensemble', 'subunit'])
    elif args.max_targets_test is None:
        test_num_structs = (args.test_sharded.get_num_keyed()
                            * args.max_decoys_test)
    elif args.max_decoys_test is None:
        assert False, 'max_targets_test requires max_decoys_test'
    else:
        test_num_structs = args.max_targets_test * args.max_decoys_test

    logging.info("Start testing with {:} structs".format(test_num_structs))

    test_structs, test_preds, test_labels, _, test_loss = __loop(
        test_generator_callable, 'test', num_iters=test_num_structs)
    logging.info("Finished testing")

    test_df = pd.DataFrame(
        np.array([test_structs, test_labels, test_preds]).T,
        columns=['structure', 'true', 'pred'],
    )

    test_df['target'] = test_df.structure.apply(
        lambda x: psp_util.get_target_name(x))
    test_df.to_pickle(os.path.join(args.output_dir, 'test_result.pkl'))
    __stats('Test', test_df)
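__stats is referenced but not defined in this excerpt. For structure quality assessment it is typically a per-target correlation report; a sketch under that assumption. Note the frames above are built via np.array(...).T, which casts values to strings, hence the astype:

import logging

def __stats(mode, df):
    """Log global and mean per-target correlations of true vs. pred."""
    df = df.astype({'true': float, 'pred': float})
    pearson = df['true'].corr(df['pred'], method='pearson')
    spearman = df['true'].corr(df['pred'], method='spearman')
    per_target = df.groupby('target').apply(
        lambda g: g['true'].corr(g['pred'], method='spearman')).mean()
    logging.info(
        '{:}: Pearson {:.3f}, Spearman {:.3f}, mean per-target Spearman {:.3f}'
        .format(mode, pearson, spearman, per_target))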