Example #1
def weighted_random_samplers(train_z, test_z):
    """[summary]

    Args:
        train_z ([type]): [description]
        test_z ([type]): [description]

    Returns:
        [type]: [description]
    """

    # Prepare weighted random sampler: 
    train_target_list = torch.tensor(train_z).type(torch.LongTensor)
    test_target_list = torch.tensor(test_z).type(torch.LongTensor)

    # Number of classes and their frequencies
    train_targets_, train_class_count = get_class_distribution(train_target_list)
    test_targets_, test_class_count = get_class_distribution(test_target_list)

    # Compute class weights
    train_class_weights = 1. / torch.tensor(train_class_count, dtype=torch.float)
    test_class_weights = 1. / torch.tensor(test_class_count, dtype=torch.float)

    # Assign weights to original target list
    train_class_weights_all = train_class_weights[train_target_list - 1]  # targets start at 1; subtract 1 for 0-based indexing
    test_class_weights_all = test_class_weights[test_target_list - 1]

    # Weighted samplers
    train_weighted_sampler = WeightedRandomSampler(
        weights=train_class_weights_all,
        num_samples=len(train_class_weights_all),
        replacement=True
    )
    test_weighted_sampler = WeightedRandomSampler(
        weights=test_class_weights_all,
        num_samples=len(test_class_weights_all),
        replacement=True
    )
    return train_weighted_sampler, test_weighted_sampler
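A minimal usage sketch for the samplers returned above (hypothetical; the snippet does not show the surrounding data, so train_z, test_z, train_dataset and test_dataset here are placeholder names):

from torch.utils.data import DataLoader

train_sampler, test_sampler = weighted_random_samplers(train_z, test_z)
train_loader = DataLoader(train_dataset, batch_size=64, sampler=train_sampler)  # sampler replaces shuffle=True
test_loader = DataLoader(test_dataset, batch_size=64, sampler=test_sampler)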
Example #2
def get_loader(train_image,
               train_mask,
               val_image,
               val_mask,
               image_size=224,
               batch_size=2,
               num_workers=2,
               augmentation_flag=False,
               weights_sample=None):
    """Builds and returns Dataloader."""
    # train loader
    dataset_train = SIIMDataset(train_image, train_mask, image_size,
                                augmentation_flag)
    # val loader; the validation set must keep augmentation_flag=False
    dataset_val = SIIMDataset(val_image,
                              val_mask,
                              image_size,
                              augmentation_flag=False)

    # Decide, based on weights_sample, whether to apply weighted sampling to the training set
    if weights_sample:
        if os.path.exists('weights_sample.pkl'):
            print('Extract weights of sample from: weights_sample.pkl...')
            with open('weights_sample.pkl', 'rb') as f:
                weights = pickle.load(f)
        else:
            print('Calculating weights of sample...')
            weights = weight_mask(dataset_train, weights_sample)
            with open('weights_sample.pkl', 'wb') as f:
                pickle.dump(weights, f)
        sampler = WeightedRandomSampler(weights,
                                        num_samples=len(dataset_train),
                                        replacement=True)
        train_data_loader = DataLoader(dataset_train,
                                       batch_size=batch_size,
                                       num_workers=num_workers,
                                       sampler=sampler,
                                       pin_memory=True)
    else:
        train_data_loader = DataLoader(dataset_train,
                                       batch_size=batch_size,
                                       num_workers=num_workers,
                                       shuffle=True,
                                       pin_memory=True)

    val_data_loader = DataLoader(dataset_val,
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 shuffle=False,
                                 pin_memory=True)

    return train_data_loader, val_data_loader
Example #3
 def __init__(self,
              num_samples,
              num_items,
              batch_size,
              drop_last,
              update_callback,
              replacement=True):
     self.num_items = num_items
     weighted_sampler = WeightedRandomSampler(torch.ones(num_items),
                                              num_samples, replacement)
     self.sampler = BatchSampler(weighted_sampler, batch_size, drop_last)
     self.update_callback = update_callback
     self.update_callback.connect_sampler(self)
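Since the weights above are all ones, the wrapped sampler draws item indices uniformly with replacement; a self-contained sketch of the same pattern (outside the original class, values chosen only for illustration):

import torch
from torch.utils.data import WeightedRandomSampler, BatchSampler

weighted = WeightedRandomSampler(torch.ones(10), num_samples=6, replacement=True)
batches = BatchSampler(weighted, batch_size=3, drop_last=False)
for batch in batches:
    print(batch)   # e.g. [7, 2, 2] then [9, 0, 4]: lists of indices, duplicates possible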
Example #4
 def train_process(self, train_data_iterator, train_dataset_loader,
                   batch_size, class_group, kwargs):
     weights = self.make_weights_for_balanced_classes(
         train_data_iterator.dataset.indices, class_group)
     train_sampler = WeightedRandomSampler(weights,
                                           num_samples=7000,
                                           replacement=True)
     train_iterator = torch.utils.data.DataLoader(train_dataset_loader,
                                                  batch_size=batch_size,
                                                  sampler=train_sampler,
                                                  shuffle=False,
                                                  **kwargs)
     self.train_data_iterator = train_iterator
Example #5
    def sample(self, batch_size):
        self.beta = min(1. - self.epsilon, self.beta + self.beta_annealing)
        sampler = WeightedRandomSampler(self.priorities[:self._n, 0],
                                        batch_size)
        indices = list(sampler)
        batch = self._sample(indices)

        p = self.priorities[indices] / np.sum(self.priorities[:self._n])
        weights = (self._n * p)**-self.beta
        weights /= np.max(weights)
        weights = torch.FloatTensor(weights).to(self.device)

        return batch, indices, weights
Example #6
    def weightedRandomSampler(self, labels):
        #Class Weighting
        labels_unique, counts = np.unique(labels, return_counts=True)
        # print('Unique labels: {}'.format(labels_unique))

        class_weights = np.zeros(np.max(labels_unique) + 1, dtype=float)

        for k, c in enumerate(counts):
            class_weights[labels_unique[k]] = sum(counts) / c
        # Assign weight to each input sample
        example_weights = [class_weights[e] for e in labels]
        sampler = WeightedRandomSampler(example_weights, len(labels))
        return sampler
Example #7
File: dataset.py  Project: Tomiinek/WaveRNN
    def __init__(self, data_source):
        label_freq = {}
        for idx in range(len(data_source)):
            label = data_source.items[idx]['language']
            if label in label_freq: label_freq[label] += 1
            else: label_freq[label] = 1

        total = float(sum(label_freq.values()))
        weights = [
            total / label_freq[data_source.items[idx]['language']]
            for idx in range(len(data_source))
        ]
        self._sampler = WeightedRandomSampler(weights, len(weights))
Example #8
def function9():
    dataset4 = DogCat2("../data/dogcat/", transforms=transform2)
    # Dog images (label 1) are drawn with twice the probability of cat images; only the ratio of the weights matters, not their absolute values
    weights = [2 if label == 1 else 1 for data, label in dataset4]
    print(weights)
    """[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2]"""
    sampler = WeightedRandomSampler(weights, num_samples=9, replacement=True)
    dataloader3 = DataLoader(dataset=dataset4, batch_size=3, sampler=sampler)
    for datas, labels in dataloader3:
        print(labels.tolist())
    """replacement=True表示有样本是被重复返回的。
    [1, 0, 0]
    [1, 0, 1]
    [1, 1, 1]
    """
    sampler2 = WeightedRandomSampler(weights=weights,
                                     num_samples=8,
                                     replacement=False)
    dataloader4 = DataLoader(dataset=dataset4, batch_size=4, sampler=sampler2)
    for datas, labels in dataloader4:
        print(labels.tolist())
        """
Example #9
    def __init__(self, class_weights, class_idxs, batch_size, n_batches):
        self.sample_idxs = []
        for idxs in class_idxs:
            self.sample_idxs.extend(idxs)

        sample_weights = []
        for c, weight in enumerate(class_weights):
            sample_weights.extend([weight] * len(class_idxs[c]))

        self.sampler = WeightedRandomSampler(sample_weights,
                                             batch_size,
                                             replacement=True)
        self.n_batches = n_batches
Example #10
    def train_dataloader(self):
        train_dataset = graphs(split_root, subset='nci_train').shuffle()
        class_sample_count = train_dataset.balance_sampler()
        weights = 1 / torch.Tensor(class_sample_count)
        samples_weights = weights[train_dataset.targets]
        sampler = WeightedRandomSampler(samples_weights,
                                        num_samples=len(samples_weights),
                                        replacement=True)

        return DataLoader(train_dataset,
                          self.hp.batch_size,
                          sampler=sampler,
                          num_workers=6, drop_last=True)
Example #11
 def _get_uniform_group_sampler(self, dataset):
     group_counts, group_labels = (
         dataset.get_class_counts("subclass"),
         dataset.get_labels("subclass"),
     )
     group_weights = np.array(
         [len(dataset) / c if c != 0 else 0 for c in group_counts])
     group_weights /= np.sum(group_weights)
     weights = group_weights[np.array(group_labels)]
     sampler = WeightedRandomSampler(weights,
                                     num_samples=len(dataset),
                                     replacement=True)
     return sampler, group_weights
Example #12
def get_sampler_with_random_imbalance(skew_val, num_samples, n_classes, labels):
    classes = list(range(n_classes))
    class_probas = get_class_probas(classes, skew_val)
    labels = np.asarray(labels)
    weights = np.zeros(num_samples)
    for cls in classes:
        weights[labels == cls] = class_probas[cls]
    weights = weights / np.linalg.norm(weights)
    print(class_probas)
    return WeightedRandomSampler(weights, num_samples, replacement=True)
Example #13
def class_imbalance_sampler(labels):
    """
    create a weighted sampler
    :param labels: labels of the data
    :return: sampler
    """
    class_count = torch.bincount(labels.squeeze()).type('torch.DoubleTensor')
    class_weighting = 1. / class_count
    sample_weights = class_weighting[labels]
    sampler = WeightedRandomSampler(sample_weights,
                                    len(labels),
                                    replacement=True)
    return sampler
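A toy check of the helper above (labels chosen only for illustration); with 1-D integer labels the single class-1 sample gets three times the weight of each class-0 sample:

import torch

labels = torch.tensor([0, 0, 0, 1])
sampler = class_imbalance_sampler(labels)
# class_count = [3, 1] -> class_weighting = [1/3, 1.0] -> sample_weights = [1/3, 1/3, 1/3, 1.0]
print(list(sampler))   # 4 indices drawn with replacement; index 3 is picked about half the time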
Example #14
    def get_dl(self, ds, shuffle):
        if ds is None: return None

        if shuffle:
            sampler = WeightedRandomSampler(ds.frequency, len(ds))
        else:
            sampler = SequentialSampler(ds)

        return DataLoader(ds,
                          batch_size=self.bs,
                          sampler=sampler,
                          num_workers=self.num_workers,
                          pin_memory=False)
Example #15
def main():
    args = Arg()
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:%d" % args.cuda if use_cuda else "cpu")
    print('device = {}'.format(device), file = sys.stderr)
    BATCH_SIZE = args.batch_size
    # setup logging
    logging.basicConfig(
        level=logging.INFO, 
        format='%(message)s', 
        handlers=[logging.FileHandler(args.log_name, 'w'), logging.StreamHandler(sys.stdout)]
    )
    logging.info(args)
    
    # set up tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)
    
    # load data
    train_path = os.path.join(args.train_data, 'ca_data/*')
    dev_path = os.path.join(args.dev_data, 'ca_data/*')
    
    print('Load train data...', file=sys.stderr)
    QAExamples_train, train_index_list = preprocess_parent(train_path, tokenizer, args.overlap_k, (0 if args.overlap_k != 0 else 1), merge_type=args.merge_type)
    QAFeatures_train, QAdataset_train = QAExamples_to_QAFeatureDataset(QAExamples_train, tokenizer, 'train')
    print('DONE', file=sys.stderr)
    
    print('Load dev data...', file=sys.stderr)
    QAExamples_dev, dev_index_list = preprocess_parent(dev_path, tokenizer, 0, 1, merge_type=args.merge_type)
    QAFeatures_dev, QAdataset_dev = QAExamples_to_QAFeatureDataset(QAExamples_dev, tokenizer, 'train')
    print('DONE', file=sys.stderr)
    
    # train model
    if args.train is not None:
        if args.use_sampler:
            sampler = WeightedRandomSampler(
                    [args.ratio if feature.start_position != 0 else 1 for feature in QAFeatures_train],
                    args.round * BATCH_SIZE,
                    replacement=True
            )
            train_dataloader = DataLoader(QAdataset_train, batch_size=BATCH_SIZE, sampler=sampler)
        else:
            train_dataloader = DataLoader(QAdataset_train, batch_size=BATCH_SIZE, shuffle=True)
        dev_dataloader = DataLoader(QAdataset_dev, batch_size=BATCH_SIZE, shuffle=False)
        
        model = Model(args.pretrained_model, model_type=args.train, kernel_size=args.kernel_size)
        if args.hw2_QA_bert is not None:
            model.model.load_state_dict(torch.load(args.hw2_QA_bert, map_location=device)['bert_state_dict'])

        optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=1e-8)
        criterion = nn.CrossEntropyLoss()
        Train(model, train_dataloader, dev_dataloader, dev_index_list, QAExamples_dev, QAFeatures_dev, tokenizer, criterion, optimizer, device, args.model_name, args.dev_ref_file, epochs=args.epochs)
Example #16
    def train(self, train_dataset, test_dataset, model):
        weights = make_weights_for_balanced_classes(train_dataset.targets)
        sampler = WeightedRandomSampler(weights, len(weights))
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=self.batch_size,
            sampler=sampler,
            num_workers=8,
        )
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=self.batch_size,
            num_workers=8,
        )

        criterion_triplet = OnlineTripleLoss(
            margin=self.triplet_margin,
            sampling_strategy=self.triplet_sampling_strategy,
        )
        criterion_classifier = CrossEntropyLoss()

        optimizer_triplet = Adam(
            params=model.feature_extractor.parameters(),
            lr=self.learning_rate_triplet,
        )
        optimizer_classifier = Adam(
            params=model.classifier.parameters(),
            lr=self.learning_rate_classify,
        )
        print("Training with Triplet loss")
        for i in range(self.epochs_triplet):
            self._train_epoch_triplet(
                model,
                train_dataloader,
                optimizer_triplet,
                criterion_triplet,
                i + 1,
            )
            save_embedding_umap(model, train_dataloader, test_dataloader,
                                self.exp_folder, i + 1)
        print("Training the classifier")
        for i in range(self.epochs_classifier):
            self._train_epoch_classify(
                model,
                train_dataloader,
                optimizer_classifier,
                criterion_classifier,
                i + 1,
            )
            self._test_epoch_(model, test_dataloader, criterion_classifier,
                              i + 1)
Example #17
File: test_auto.py  Project: gucifer/ignite
    def _test(data):
        if sampler_name is None:
            sampler = None
        elif sampler_name == "WeightedRandomSampler":
            sampler = WeightedRandomSampler(weights=torch.ones(100),
                                            num_samples=100)
        elif sampler_name == "DistributedSampler":
            sampler = DistributedSampler(data,
                                         num_replicas=ws,
                                         rank=idist.get_rank())
        else:
            raise RuntimeError(f"Unknown sampler name: {sampler_name}")

        # Test auto_dataloader
        assert idist.get_world_size(
        ) == ws, f"{idist.get_world_size()} vs {ws}"

        shuffle = sampler is None if not isinstance(data,
                                                    IterableDataset) else False
        dataloader = auto_dataloader(data,
                                     batch_size=batch_size,
                                     num_workers=num_workers,
                                     sampler=sampler,
                                     shuffle=shuffle)

        assert isinstance(dataloader, dl_type)
        if hasattr(dataloader, "_loader"):
            dataloader = dataloader._loader
        if ws < batch_size:
            assert dataloader.batch_size == batch_size // ws
        else:
            assert dataloader.batch_size == batch_size
        if ws <= num_workers:
            assert dataloader.num_workers == (num_workers + nproc - 1) // nproc
        else:
            assert dataloader.num_workers == num_workers

        if isinstance(data, IterableDataset):
            sampler_type = _InfiniteConstantSampler
        elif ws > 1:
            if sampler is None or isinstance(sampler, DistributedSampler):
                sampler_type = DistributedSampler
            else:
                sampler_type = DistributedProxySampler
        else:
            sampler_type = RandomSampler if sampler is None else type(sampler)

        assert isinstance(dataloader.sampler, sampler_type)

        if isinstance(dataloader, DataLoader):
            assert dataloader.pin_memory == ("cuda" in idist.device().type)
Example #18
def get_weighted_sampler(targets):
    """Get the weighted sampler based on class balance for data loader"""

    class_weights = 1 / targets.value_counts()
    class_index = targets.value_counts().index
    weights_map = {
        class_index[i]: class_weights.iloc[i]
        for i in range(len(class_index))
    }
    class_weights_all = targets.map(weights_map).tolist()

    return WeightedRandomSampler(weights=class_weights_all,
                                 num_samples=len(class_weights_all),
                                 replacement=True)
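The helper above expects a pandas Series (it relies on value_counts and map); a toy call might look like this, with all values illustrative:

import pandas as pd

targets = pd.Series([0, 0, 0, 1, 1, 2])        # imbalanced toy labels
sampler = get_weighted_sampler(targets)
# class weights: {0: 1/3, 1: 1/2, 2: 1.0}; each sample carries its class weight,
# so batches drawn with this sampler are roughly class-balanced.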
Example #19
 def on_train_begin(self, **kwargs):
     ds, dl = self.data.train_ds, self.data.train_dl
     self.labels = ds.y.items
     assert np.issubdtype(self.labels.dtype,
                          np.integer), "Can only oversample integer values"
     _, self.label_counts = np.unique(self.labels, return_counts=True)
     if self.weights is None:
         self.weights = torch.DoubleTensor(
             (1 / self.label_counts)[self.labels])
     self.total_len_oversample = int(self.data.c *
                                     np.max(self.label_counts))
     sampler = WeightedRandomSampler(self.weights,
                                     self.total_len_oversample)
     self.data.train_dl = dl.new(shuffle=False, sampler=sampler)
Example #20
def get_weighted_train_sampler(dataset, classes_weight, n_samples=25000):

    assert isinstance(classes_weight, np.ndarray) and classes_weight.ndim == 2
    weights = np.ones((len(dataset),))

    y = np.zeros(len(dataset), dtype=np.int64)
    for i, dp in enumerate(dataset):
        y[i] = dp[1] - 1

    for c, w in classes_weight:
        indices = np.where(y == int(c))[0]
        weights[indices] = w
    sampler = WeightedRandomSampler(weights, num_samples=n_samples)
    return sampler
Example #21
def get_balanced_loader(data, **kwargs):

    counts = get_class_counts(data)
    weights = 1. / (1e-5 + counts)
    weights[counts == 0] = 0.
    weights = torch.from_numpy(weights / weights.sum()).float()

    print('Class Counts', counts)
    print('Weights', weights)

    sampler = WeightedRandomSampler(weights, kwargs.get('batch_size'))
    loader = DataLoader(data, sampler=sampler, **kwargs)

    return loader
Example #22
def sampler(dataset, opt):
    # Multiclass imbalanced dataset
    if opt.select_clf == 0:
        balance_factor = 20

        data = np.array(dataset.data)
        dis = data[:, 1]
        sev = data[:, -1]

        total = len(dis)
        samplesWeight = np.zeros(total)

        for d in range(5):
            for s in range(5):
                targets_sum = sum(
                    [a and b for a, b in zip(dis == d, sev == s)])

                idx = np.where([a and b for a, b in zip(dis == d, sev == s)])
                samplesWeight[idx] = 1 / (
                    (targets_sum + balance_factor) / total)

    elif opt.select_clf < 3:

        data = np.array(dataset.data)
        labels = data[:, 1] if opt.select_clf == 1 else data[:, -1]

        total = len(labels)
        samplesWeight = np.zeros(total)

        for i in range(5):
            targets_sum = sum(labels == i)
            idx = np.where(labels == i)
            samplesWeight[idx] = 1 / ((targets_sum) / total)

    # Others
    else:
        targets = np.array([x[1] for x in dataset.samples])
        total = len(targets)

        samplesWeight = np.zeros(total)

        for t in np.unique(targets):
            idx = np.where(targets == t)[0]

            samplesWeight[idx] = 1 / (len(idx) / total)

    samplesWeight = samplesWeight / sum(samplesWeight)
    samplesWeight = torch.from_numpy(samplesWeight).double()

    return WeightedRandomSampler(samplesWeight, len(samplesWeight))
Example #23
def main(params):
    # reproducibility
    torch.manual_seed(0)
    np.random.seed(0)
    torch.backends.cudnn.deterministic = False  # allow cuDNN to pick the fastest (possibly non-deterministic) algorithm

    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
    print('using device : ', device)

    # prepare dataset
    train_transform = get_train_transform()
    train_dataset = VRPDataset(params['dataset'], 'train', transform=train_transform)
    target_transformer = Target_Transformer()
    train_dataset.set_target_transformer(target_transformer)
    if params['weighted_loss'] == 'weighted_sample':
        sample_weights = pickle.load(open(os.path.join('data', params['dataset'], 'sample_weights.pkl'), 'rb'))
        weighted_sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(train_dataset), replacement=True)
        train_loader = data.DataLoader(train_dataset, batch_size=params['batch_size'], sampler=weighted_sampler, num_workers=8)
    else:
        train_loader = data.DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True, num_workers=8)

    train_global_tensor = pickle.load(open(os.path.join('data', params['dataset'], 'train_global_tensor.pkl'), 'rb'))
    U, S = hosvd_dcmp(train_global_tensor, params['epsilon'])
    pre_rank = np.array([train_dataset.num_entity, train_dataset.num_entity, train_dataset.num_predicate])
    new_rank = (U[0].shape[1], U[1].shape[1], U[2].shape[1])
    if params['model_name'] == 'TCN':
        net = TCNet(pre_rank, new_rank, U, params)
    else:
        print('unrecognized model name {}'.format(params['model_name']))
    net = net.to(device)

    # define loss function and optimizer
    if params['weighted_loss'] == 'weighted_label':
        label_weights = pickle.load(open(os.path.join('data', params['dataset'], 'label_weights.pkl'), 'rb'))
        label_weights = torch.from_numpy(label_weights.flatten()).to(device)
        criterion = TagTogLoss(label_weights)
    else:
        criterion = TagTogLoss()

    model_params = list(net.parameters())


    optimizer = optim.SGD(model_params, lr=params['lr'], momentum=params['momentum'], weight_decay=params['weight_decay'])
    # train
    train(train_loader, net, criterion, optimizer, device)

    # save model
    save_model_path = os.path.join('models', params['model_file_name'])
    torch.save(net.state_dict(), save_model_path)
Example #24
def imbalanced_loader(X_train,y_train,X_test,y_test,valid_size=.05,batch_size=512): # Split train into train + validation 
    """
    Get trainloader, validloader, and testloader for model training. This 
    creates equal training batches but naturally balanced validation and testing 
    sets. Note the testing set was previously augmented to get better per class metrics 
    
    Outputs: dataloader + testloader, where dataloader =  {"train": trainloader, "val": validloader}

    """
    warnings.filterwarnings("ignore") #torch bug
    print ('Getting Data... {}% Validation Set\n'.format(int(np.around(valid_size*100))))
    
    num_train = len(X_train)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))
    train_idx, valid_idx = indices[split:], indices[:split]
    
    print("Batch Size:",batch_size)

    print('\nTrain Len=',len(train_idx),', Validation Len=',len(valid_idx), 'Test Len=',len(y_test))
                                                                                        
    class_sample_count = np.array([len(np.where(y_train[train_idx]==t)[0]) for t in np.unique(y_train[train_idx])])
    weight = 1. / class_sample_count
    samples_weight = np.array([weight[t] for t in y_train[train_idx]])
    samples_weight = torch.from_numpy(samples_weight)
    train_sampler = WeightedRandomSampler(samples_weight.double(), len(samples_weight))
    trainDataset = torch.utils.data.TensorDataset(torch.FloatTensor(X_train[train_idx]), torch.LongTensor(y_train[train_idx].astype(int)))
    train_sampler= torch.utils.data.BatchSampler(sampler=train_sampler, batch_size=batch_size, drop_last=True)
    trainloader = torch.utils.data.DataLoader(dataset = trainDataset, batch_size=batch_size, num_workers=1, sampler= train_sampler)

    valDataset = torch.utils.data.TensorDataset(torch.FloatTensor(X_train[valid_idx]), torch.LongTensor(y_train[valid_idx].astype(int)))
    sampler = torch.utils.data.RandomSampler(valDataset)
    sampler= torch.utils.data.BatchSampler(sampler, batch_size, drop_last=True)
    validloader = torch.utils.data.DataLoader(dataset = valDataset, batch_size=batch_size, num_workers=1,sampler=sampler)

    testset=[]
    for i,x in enumerate(X_test):
        testset.append((torch.from_numpy(x),torch.tensor([y_test[i]])))
    
    #testloader = torch.utils.data.DataLoader(dataset = testDataset, batch_size=batch_size, shuffle=False, num_workers=1) 
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                           shuffle=False, num_workers=1)

    print("")
    dataloader = {"train": trainloader, "val": validloader}
    print('Train Size Batched=',int(len(dataloader['train'].dataset)/batch_size),', Validation Size Batched=',int(len(dataloader['val'].dataset)/batch_size),', Test Size Batched=',len(testloader))
    
    warnings.resetwarnings()
    return dataloader,testloader
Example #25
File: loc2vec.py  Project: hwasiti/loc2vec
def main():
    cuda = torch.cuda.is_available()

    anchor_transform = transforms.Compose([
        transforms.RandomAffine(degrees=90, translate=(0.25, 0.25)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.CenterCrop(128),
        transforms.Resize(IMG_SIZE),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    train_transforms = transforms.Compose([
        transforms.Resize(IMG_SIZE),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    #  Let's use 12 while developing as it reduces the start time.
    dset_train = GeoTileDataset(TILE_FILE,
                                transform=train_transforms,
                                center_transform=anchor_transform)

    pd_files = dset_train.get_file_df()
    weights = pd_files.frequency
    train_sampler = WeightedRandomSampler(weights, len(dset_train))
    # Should numworkers be 1?
    kwargs = {'num_workers': 8, 'pin_memory': True} if cuda else {}
    online_train_loader = DataLoader(dset_train,
                                     batch_size=BATCH_SIZE,
                                     sampler=train_sampler,
                                     **kwargs)

    model = Loc2Vec()
    if cuda:
        model.cuda()
    # if torch.cuda.device_count() > 1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     model = nn.DataParallel(model)
    loss_fn = OnlineTripletLoss(MARGIN, HardestNegativeTripletSelector(MARGIN),
                                SemihardNegativeTripletSelector(MARGIN),
                                RandomNegativeTripletSelector(MARGIN))

    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = lr_scheduler.StepLR(optimizer, 16, gamma=0.1, last_epoch=-1)

    fit(online_train_loader, online_train_loader, model, loss_fn, optimizer,
        scheduler, N_EPOCHS, cuda, LOG_INTERVAL)
Example #26
def create_a1_dataset(config, train_config):
    ds_train, ds_valid = _create_ds(config, train_config)

    if train_config.balanced_sampling:
        sampler = WeightedRandomSampler(
            weights=ds_train.sample_weights,
            num_samples=train_config.batch_size,
            replacement=train_config.replacement,)
    else:
        sampler = RandomSampler(
            data_source=ds_train,
            num_samples=train_config.batch_size,
            replacement=True,)

    return sampler, ds_train, ds_valid
Example #27
File: train.py  Project: oncoml/camelyon
def load_data(train_dir, val_dir):
    ImageFile.LOAD_TRUNCATED_IMAGES = True

    train_transforms = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(train_mean, train_std)
    ])
    val_transforms = transforms.Compose([
        transforms.Resize(256),
        transforms.ToTensor(),
        transforms.Normalize(val_mean, val_std)
    ])

    train_data = datasets.ImageFolder(root=train_dir,
                                      transform=train_transforms)
    val_data = datasets.ImageFolder(root=val_dir, transform=val_transforms)

    train_weights = calculate_weight(train_data.imgs, len(train_data.classes))
    train_weights = torch.DoubleTensor(train_weights)
    val_weights = calculate_weight(val_data.imgs, len(val_data.classes))
    val_weights = torch.DoubleTensor(val_weights)

    train_sampler = WeightedRandomSampler(train_weights, len(train_weights))
    val_sampler = WeightedRandomSampler(val_weights, len(val_weights))

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=16,
                                               sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=16,
                                             sampler=val_sampler)

    return train_loader, val_loader
Example #28
def class_balanced_sampler(labels, num_samples_per_class=10):
    class_counts = np.bincount(labels)
    total_samples = len(labels)
    sample_weights = np.zeros_like(labels).astype(np.float32)
    for idx, label in enumerate(labels):
        sample_weights[idx] = total_samples / class_counts[label]
    # return sample_weights
    # sampler = WeightedRandomSampler(weights=sample_weights,
    #     num_samples=total_samples)

    # mimic test distribution
    num_samples = len(class_counts) * num_samples_per_class
    sampler = WeightedRandomSampler(weights=sample_weights,
        num_samples=num_samples)
    return sampler
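An illustrative call of the function above (toy labels; numbers chosen only for the example): with 3 classes and num_samples_per_class=10 the sampler yields 30 roughly class-balanced indices per epoch, independent of the dataset size.

import numpy as np

labels = np.array([0] * 100 + [1] * 10 + [2] * 5)   # heavily imbalanced toy labels
sampler = class_balanced_sampler(labels, num_samples_per_class=10)
print(len(list(sampler)))   # 30 = 3 classes * 10 samples, drawn with replacement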
Example #29
    def train_ens(self, dataset, learning_rate = 1e-3, epochs=50, batch_size=50, optim="Adam", loss_fn=nn.MSELoss(), split=0.9, preprocess=True):
        """
        Train the neural network.
        if preprocess = False
            dataset is a list of tuples to train on, where the first value in the tuple is the training data (should be implemented as a torch tensor), and the second value in the tuple
            is the label/action taken
        if preprocess = True
            dataset is simply the raw output of generate data (X, U)
        Epochs is the number of times to train on the given training data,
        batch_size is a hyperparameter dictating how large a batch to use for training,
        optim is the optimizer to use (options are "Adam", "SGD")
        split is train/test split ratio
        """
        if preprocess:
            dataset = self.preprocess(dataset[0], dataset[1])
            print('Length of dataset is:', len(dataset))


        num_samples = len(dataset)
        weights = (1/(num_samples+1))*np.ones((int(split*num_samples)))
        # TODO: Update datasets by sampling with replacement for each net

        # Make random sampling with replacement by using an evenly weighted random sampler with replacement
        sampler = WeightedRandomSampler(weights, num_samples, replacement=True)

        # Training loader has the sampler, testing does not matter.
        trainLoader = DataLoader(dataset[:int(split*len(dataset))], sampler = sampler, batch_size=batch_size)
        testLoader = DataLoader(dataset[int(split*len(dataset)):], batch_size=batch_size)


        # TODO: Train each net separately
        #Unclear if we should be using SGD or ADAM? Papers seem to say ADAM works better

        # train each net
        errors = []
        for i, net in enumerate(self.nets):
            if(optim=="Adam"):
                optimizer = torch.optim.Adam(super(NeuralNet, net).parameters(), lr=learning_rate)
            elif(optim=="SGD"):
                optimizer = torch.optim.SGD(super(NeuralNet, net).parameters(), lr=learning_rate)
            else:
                raise ValueError(optim + " is not a valid optimizer type")
            print('Training net ', i+1)
            error = net._optimize(loss_fn, optimizer, epochs, batch_size, trainLoader, testLoader)
            errors.append(error)
            print('-------------------------------------------------------')
        print(np.shape(errors))
        return errors
Example #30
def main():        
    dataroot = '~/dataset/'
    
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    train_dataset = torchvision.datasets.CIFAR10(root=dataroot, train=True, download=True, transform=transform_train)
    test_dataset = torchvision.datasets.CIFAR10(root=dataroot, train=False, download=True, transform=transform_test)
    
    train_size = len(train_dataset)
    
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=b_size, shuffle=False, num_workers=2)

    model = ResNet(9, res_option='A', use_dropout=True).to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.0001)

    #temp_list = [0.05, 0.1, 0.5, 1, 2, 5, 10]
    temp_list = np.arange(0.05, 0.5, 0.01).tolist() + np.arange(0.5, 10, 0.2).tolist() 
    for epoch in range(1, n_epochs + 1):
        if (epoch - 1) // 2 < len(temp_list):
            T = temp_list[(epoch - 1) // 2]
        else:
            T = temp_list[-1]
        print('Set temperature: %f' % T)
        train_loader_per_class = []
        for c in range(10):
            mask = np.zeros(train_size)
            mask[np.array(label_to_idx[c])] = 1
            a_vec = _softmax(-lid_vec / T, mask)
            sampler = WeightedRandomSampler(a_vec,
                                            num_samples=b_size // 10,
                                            replacement=False)
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=b_size // 10, shuffle=False,
                                                    sampler=sampler)
            train_loader_per_class.append(train_loader)
        train(model, device, train_loader_per_class, test_loader, optimizer, epoch)