Example #1
def get_data_loader(args, data_subset, n):
    """
    Given a subset of a dataset as a python dictionary file to make
    predictions from, this function selects n items at random from that
    dataset to predict. It then returns a DataLoader for those items,
    along with a list of ids.
    """
    if not args.rnn:
        collate = paired_collate_fn
    else:
        collate = paired_collate_fn_with_len
    if n == 0:
        train_loader = torch.utils.data.DataLoader(
            ProteinDataset(
                seqs=data_subset['seq'],
                crds=data_subset['crd'],
                angs=data_subset['ang'],
                ),
            num_workers=2,
            batch_size=1,
            collate_fn=collate,
            shuffle=False)
        return train_loader, data_subset["ids"]

    # We just want to predict a few examples
    to_predict = set([s.upper() for s in np.random.choice(data_subset["ids"], n)])  # ["2NLP_D", "3ASK_Q", "1SZA_C"]
    will_predict = []
    ids = []
    seqs = []
    angs = []
    crds = []
    for i, prot in enumerate(data_subset["ids"]):
        if prot.upper() in to_predict and prot.upper() not in will_predict:
            seqs.append(data_subset["seq"][i])
            angs.append(data_subset["ang"][i])
            crds.append(data_subset["crd"][i])
            ids.append(prot)
            will_predict.append(prot.upper())
    assert (len(seqs) == n and len(angs) == n) or (len(seqs) == len(angs) and len(seqs) < n)

    data_loader = torch.utils.data.DataLoader(
        ProteinDataset(
            seqs=seqs,
            angs=angs,
            crds=crds),
        num_workers=2,
        batch_size=1,
        collate_fn=collate,
        shuffle=False)
    return data_loader, ids
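
A minimal usage sketch for get_data_loader above; the args namespace, the torch.load call, the dataset file name, and the 'train' key are assumptions inferred from this snippet, not part of the original example.

import argparse
import torch

# Hypothetical driver: args only needs the .rnn flag read by get_data_loader,
# and data is assumed to be a dict of subsets with 'seq', 'ang', 'crd', 'ids'.
args = argparse.Namespace(rnn=False)
data = torch.load("dataset.pt")              # assumed file name
loader, ids = get_data_loader(args, data["train"], n=3)
for protein_id, batch in zip(ids, loader):   # batch_size=1, shuffle=False keeps order
    print(protein_id)
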
Example #2
def setup_data_loaders(batch_size=128, use_cuda=False):
    train_set = ProteinDataset()
    test_set = ProteinDataset()
    #print("dataloader1")
    kwargs = {'num_workers': 1, 'pin_memory': use_cuda}
    train_loader = DataLoader(dataset=train_set,
                              batch_size=batch_size,
                              shuffle=True,
                              **kwargs)
    test_loader = DataLoader(dataset=test_set,
                             batch_size=batch_size,
                             shuffle=False,
                             **kwargs)
    #print("daatloader")
    return train_loader, test_loader
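
A short sketch of how these loaders might be consumed; the batch layout returned by ProteinDataset here is not shown in the snippet, so the loop below only iterates and counts batches and is an assumption, not part of the original example.

import torch

# Hypothetical usage of setup_data_loaders defined above.
use_cuda = torch.cuda.is_available()
train_loader, test_loader = setup_data_loaders(batch_size=64, use_cuda=use_cuda)
print(len(train_loader), "training batches,", len(test_loader), "test batches")
for batch in train_loader:
    # Exact unpacking depends on ProteinDataset.__getitem__, which this
    # example does not show; we stop after the first batch.
    break
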

Example #3
from albumentations import HorizontalFlip, VerticalFlip, Rotate


if __name__ == '__main__':
    # params
    params = yaml.safe_load(open('config.yaml'))

    # data transforms
    main_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(255 * params['mean'], 255 * params['std'])
    ])

    # defining test dataset
    test_dataset = ProteinDataset(img_csv_path=params['img_csv_path_test'],
                                  mode='test',
                                  data_path=params['data_path'],
                                  depth=3,
                                  img_size=512,
                                  transform=None)

    # defining dataloader
    test_loader = DataLoader(test_dataset,
                             shuffle=False,
                             batch_size=1,
                             pin_memory=True,
                             num_workers=4)

    # defining training components
    model = create_model(model_name=params['model_name'],
                         n_classes=params['n_classes'],
                         device=device,
                         multi_gpu=params['multi_gpu'],
Example #4
    cfg_dict = yaml.load(cfg_file, Loader=yaml.FullLoader)

MULTIGPU = cfg_dict.get('multigpu', True)
MAX_EPOCH = cfg_dict.get('max_epoch', 30)
CHECKPOINT_DIR = cfg_dict.get('checkpoint_dir', 'checkpoint')
GENERATOR = cfg_dict.get('generator', {})
DISCRIMINATOR = cfg_dict.get('discriminator', {})
LOSS = cfg_dict.get('loss', {})
GENERATOR_BATCH_SIZE = GENERATOR.get('batch_size', 1)
CLASS_NUMBER = GENERATOR.get('output_channel', 10)

# Load data & Build dataset
TEST_DIR = os.path.join('data', 'test')
TEST_FEATURE_DIR = os.path.join(TEST_DIR, 'feature')
TEST_LABEL_DIR = os.path.join(TEST_DIR, 'label')
test_dataset = ProteinDataset(TEST_FEATURE_DIR, TEST_LABEL_DIR)
test_dataloader = DataLoader(test_dataset,
                             batch_size=GENERATOR_BATCH_SIZE,
                             shuffle=True,
                             collate_fn=collate_fn)

# Build model from configs
generator = ContactMapGenerator(GENERATOR)
discriminator = ContactMapDiscriminator(DISCRIMINATOR)

# Define Criterion
LOSS_ALPHA = LOSS.get(
    'alpha', [0.25, 0.25, 0.25, 0.25, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75])
LOSS_BETA = LOSS.get('beta', 1.0)
LOSS_GAMMA = LOSS.get('gamma', 2.0)
LOSS_LAMBDA = LOSS.get('lambda', 1.0)
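
The alpha/gamma naming above suggests a focal-style weighting, but the actual criterion built from these values is not shown in this snippet. The sketch below is only a hedged illustration of how LOSS_ALPHA and LOSS_GAMMA could enter a per-class focal term; the function name and formula are assumptions, not the original code.

import torch
import torch.nn.functional as F

# Hypothetical focal-style criterion; NOT the criterion used in the original
# example, just an assumed illustration of the alpha/gamma hyperparameters.
def focal_like_loss(logits, targets, alpha=LOSS_ALPHA, gamma=LOSS_GAMMA):
    log_probs = F.log_softmax(logits, dim=1)                       # (N, C)
    log_pt = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)  # log p of true class
    pt = log_pt.exp()
    alpha_t = logits.new_tensor(alpha)[targets]                    # per-class weight
    return (-alpha_t * (1.0 - pt) ** gamma * log_pt).mean()
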
Example #5
MULTIGPU = cfg_dict.get('multigpu', True)
MAX_EPOCH = cfg_dict.get('max_epoch', 30)
CHECKPOINT_DIR = cfg_dict.get('checkpoint_dir', 'checkpoint')
GENERATOR = cfg_dict.get('generator', {})
DISCRIMINATOR = cfg_dict.get('discriminator', {})
TRAINING_CFG = cfg_dict.get('training', {})
LOSS = cfg_dict.get('loss', {})
GENERATOR_BATCH_SIZE = GENERATOR.get('batch_size', 1)
CLASS_NUMBER = GENERATOR.get('output_channel', 10)

# Load data & Build dataset
TRAIN_DIR = os.path.join('data', 'train')
TRAIN_FEATURE_DIR = os.path.join(TRAIN_DIR, 'feature')
TRAIN_LABEL_DIR = os.path.join(TRAIN_DIR, 'label')
train_dataset = ProteinDataset(TRAIN_FEATURE_DIR, TRAIN_LABEL_DIR)
train_dataloader = DataLoader(train_dataset,
                              batch_size=GENERATOR_BATCH_SIZE,
                              shuffle=True,
                              collate_fn=collate_fn)

VAL_DIR = os.path.join('data', 'val')
VAL_FEATURE_DIR = os.path.join(VAL_DIR, 'feature')
VAL_LABEL_DIR = os.path.join(VAL_DIR, 'label')
val_dataset = ProteinDataset(VAL_FEATURE_DIR, VAL_LABEL_DIR)
val_dataloader = DataLoader(val_dataset,
                            batch_size=GENERATOR_BATCH_SIZE,
                            shuffle=True,
                            collate_fn=collate_fn)

# Build model from configs
Example #6
        # If the residue is labelled as LIP, show it as a sphere
        if lip_indexes[i] == 1:
            cmd.show_as(
                "spheres",
                "chain {} and resi {}".format(residues[i].get_full_id()[2],
                                              residues[i].id[1]))

    # color by B-factor values
    cmd.spectrum("b", palette="rainbow", selection="(all)")


#######################################################
#################  MAIN PROGRAM   #####################
#######################################################
# parse the dataset
prot_dataset = ProteinDataset()
prot_dataset.parse()

# create the top-level parser
parser = argparse.ArgumentParser(prog='lips_predictor')
subparsers = parser.add_subparsers(help='sub-command help')

# create the parser for the "downpdb" command
parser_down_pdb = subparsers.add_parser('downpdb', help='Download pdb files.')
parser_down_pdb.add_argument(
    '-a',
    '--all',
    default=False,
    action='store_true',
    help='Download pdb file for every entry in the dataset.')
parser_down_pdb.add_argument('-i',
Example #7
from datetime import datetime as dt

if __name__ == '__main__':
    # params
    params = yaml.safe_load(open('config.yaml'))

    # data transforms
    main_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(255 * params['mean'], 255 * params['std'])
    ])

    # defining train/val datasets
    train_dataset = ProteinDataset(img_csv_path=params['img_csv_path_train'],
                                   mode='train',
                                   data_path=params['data_path'],
                                   depth=params['depth'],
                                   img_size=params['image_size'],
                                   transform=main_transforms)

    val_dataset = ProteinDataset(img_csv_path=params['img_csv_path_val'],
                                 data_path=params['data_path'],
                                 mode='val',
                                 depth=params['depth'],
                                 img_size=params['image_size'],
                                 transform=main_transforms)

    # defining dataloaders
    weights = pd.read_csv(params['img_csv_path_train']).Weight.values
    train_loader = DataLoader(train_dataset,
                              sampler=WeightedRandomSampler(
                                  weights=weights, num_samples=len(weights)),
Example #8
data = torch.load(opt.data)

to_predict = np.random.choice(data[dataset]["ids"], 3)  # ["2NLP_D", "3ASK_Q", "1SZA_C"]
actual_order = []
seqs = []
angs = []
for i, prot in enumerate(data[dataset]["ids"]):
    if prot.upper() in to_predict:
        seqs.append(data[dataset]["seq"][i])
        angs.append(data[dataset]["ang"][i])
        actual_order.append(prot)
assert len(seqs) == 3 and len(angs) == 3

data_loader = torch.utils.data.DataLoader(
    ProteinDataset(
        seqs=seqs,
        angs=angs),
    num_workers=2,
    batch_size=1,
    collate_fn=paired_collate_fn,
    shuffle=False)

cords_list = []
losses = []
norm_losses = []

# TODO: make batch_level predictions?
with torch.no_grad():
    for batch in tqdm(data_loader, mininterval=2,
                      desc=' - (Evaluation ', leave=False):
        # prepare data