Exemplo n.º 1
0
    def select_and_reveal(self, label_manager, K):
        """
        Reveal up to K labels per group by passing indices to label_manager.

        Wrapper for the select() function implemented by subclasses.
        Indices stored in ``self._prior_selections`` are used first (in order,
        at most K per group); whatever per-group budget is left is filled by
        ``self.select``.  The chosen indices are passed to
        ``label_manager.reveal_labels`` and written to
        ``{self.log_dir}/selected_ids.csv`` with ``mode=self.mode``.

        Args:
            label_manager: object providing ``unlabeled_metadata_array``,
                ``unlabeled_indices`` and ``reveal_labels``.
            K: number of examples to reveal per group; ``0`` is a no-op.
        """
        if K == 0:
            return
        groups = self.select_grouper.metadata_to_group(
            label_manager.unlabeled_metadata_array)
        group_ids = groups.unique().int().tolist()
        # Per-group budget, aligned with group_ids.
        remaining = (torch.ones(len(group_ids)) * K).int().tolist()
        reveal = []
        for idx in self._prior_selections:
            i = label_manager.unlabeled_indices.index(idx)
            g_ind = group_ids.index(groups[i])
            if remaining[g_ind] > 0:
                reveal.append(idx)
                remaining[g_ind] -= 1
            if sum(remaining) == 0:
                break

        # Build the membership set once instead of once per filtered element.
        revealed = set(reveal)
        self._prior_selections = [
            idx for idx in self._prior_selections if idx not in revealed
        ]

        if sum(remaining) > 0:
            unlabeled_indices = torch.tensor(label_manager.unlabeled_indices)
            reveal = reveal + self.select(label_manager, remaining,
                                          unlabeled_indices, groups, group_ids)

        label_manager.reveal_labels(reveal)
        save_array(reveal,
                   csv_path=f"{self.log_dir}/selected_ids.csv",
                   mode=self.mode)
    def __init__(self, w):
        '''
        Prepare a weighted random stream of message-version IDs.

        # Parameter
        -------------
        w: np.array
            Weighted random condition; one weight per message version.

        The constructor stores the weights, their normalised form and a
        one-off random draw of sum(w) version IDs (``self.shuffle``), and
        resets the read position ``self.current_id`` to 0.
        '''
        self.w = w
        self.N = len(w)
        # Message versions are identified by the integers 0 .. N-1.
        self.setN = np.arange(self.N)
        self.M = sum(w)
        # Plain "/" could truncate to an array of 0s, hence numpy's true divide.
        self.norm_w = np.true_divide(w, self.M)

        # The M messages are drawn from the N versions exactly once.
        drawn = np.random.choice(self.setN, self.M, p=self.norm_w)
        if self.M >= 6:
            # Larger draws are round-tripped through utils.save_array /
            # utils.load_array ("shuffle.bc") to speed up computation.
            utils.save_array("shuffle.bc", drawn)
            drawn = utils.load_array("shuffle.bc")
        self.shuffle = drawn

        # Advanced by the message function each time a message is taken.
        self.current_id = 0
Exemplo n.º 3
0
    def load_data(self,
                  serialized_data_folder='./',
                  path_to_vehicle_folder='vehicles/',
                  path_to_non_vehicle_folder='non-vehicles/'):
        """
        Fill ``self.X`` and ``self.y`` from cached arrays or from image folders.

        If both ``X.dat`` and ``y.dat`` exist under ``serialized_data_folder``
        they are loaded with ``load_array``.  Otherwise the data is built with
        ``self._get_X_y`` (non-vehicle class 0 first, then vehicle class 1),
        concatenated, and written back to those two paths with ``save_array``.
        """
        x_cache = '{0}/{1}'.format(serialized_data_folder, 'X.dat')
        y_cache = '{0}/{1}'.format(serialized_data_folder, 'y.dat')

        if os.path.exists(x_cache) and os.path.exists(y_cache):
            print('Loading from serialized ...')
            self.X = load_array(x_cache)
            self.y = load_array(y_cache)
            print('Done reading serialized arrays')
            return

        print('Creating data from image folders')
        # (folder, class label) pairs; non-vehicles come first in X and y.
        sources = ((path_to_non_vehicle_folder, 0),
                   (path_to_vehicle_folder, 1))
        features, targets = [], []
        for folder, class_id in sources:
            part_X, part_y = self._get_X_y(folder, class_id)
            features.append(part_X)
            targets.append(part_y)

        self.X = np.concatenate(tuple(features))
        self.y = np.concatenate(tuple(targets))
        print(
            'Data created successfully, creating serialized numpy arrays')
        save_array(x_cache, self.X)
        save_array(y_cache, self.y)
        print('Done saving arrays')
Exemplo n.º 4
0
def ensemble():
    """
    Predict on the test set with every saved checkpoint and store the results.

    Changes the working directory to MODEL_DIR, globs each pattern in
    ``w_file_matcher``, builds the architecture named by the checkpoint's
    filename prefix, loads the weights and runs ``make_preds`` on
    ``test_loader``.  The per-model predictions are saved to PRED_FILE_RAW
    and their unweighted mean to PRED_FILE.

    Raises:
        ValueError: if a matched checkpoint name starts with none of the
            known architecture prefixes (this used to fail later on an
            unbound ``model`` variable).
    """
    # (filename prefix, model factory, ensemble weight)
    known_models = (
        ('dense161', create_dense161, 1),
        ('dense169', create_dense169, 0.8),
        ('dense201', create_dense201, 1),
        ('res50', create_res50, 0.9),
        ('res101', create_res101, 0.9),
        ('res152', create_res152, 0.9),
        ('vgg16', create_vgg16, 0.2),
        ('vgg19', create_vgg19, 0.7),
        ('inceptionv3', create_inceptionv3, 0.8),
    )

    preds_raw = []
    os.chdir(MODEL_DIR)
    total_weight = 0
    preds_w = None
    for match_str in w_file_matcher:
        for w_file in glob.glob(match_str):
            full_w_file = MODEL_DIR + '/' + w_file
            entry = next((candidate for candidate in known_models
                          if w_file.startswith(candidate[0])), None)
            if entry is None:
                raise ValueError(
                    'unrecognised model prefix in weight file: ' + full_w_file)
            factory, weight = entry[1], entry[2]

            model, _ = factory()
            model.load_state_dict(torch.load(full_w_file))
            print(full_w_file)

            pred = np.array(make_preds(model, test_loader))
            preds_raw.append(pred)

            # Weighted sum is accumulated for the alternative weighted
            # average (preds_w / total_weight); it is not what gets saved.
            if preds_w is None:
                preds_w = np.zeros(pred.shape)
            preds_w += pred * weight
            total_weight += weight

            del model

    save_array(PRED_FILE_RAW, preds_raw)
    # The stored ensemble is the plain mean over models.
    preds = np.mean(preds_raw, axis=0)

    save_array(PRED_FILE, preds)
Exemplo n.º 5
0
    def save_precomputed_conv_models(self):
        """
        Save the precomputed train/validation features to per-run files.

        File names embed ``self.runID``.  Returns ``self`` so calls can be
        chained.
        """
        suffix = "_features." + self.runID + ".h5"
        train_file = "precomputed_trn" + suffix
        val_file = "precomputed_val" + suffix

        save_array(train_file, self.train_precomputed)
        save_array(val_file, self.val_precomputed)
        print("models saved to files: ", train_file, " and ", val_file)

        return self
Exemplo n.º 6
0
def save_layers(PV, IBA):
	"""Read layer 0 of the PV and IBA inputs and save each under data/."""
	# (input, destination) pairs, processed in this order: PV first, then IBA.
	jobs = (
		(PV, "data/PV/X_cells_only.bc"),
		(IBA, "data/IBA1/X_cells_only.bc"),
	)
	for stack, destination in jobs:
		nucleus_channel = read_layer(stack, 0)
		utils.save_array(destination, nucleus_channel)
Exemplo n.º 7
0
def train_and_test(no_of_epochs=4):
    """
    Train a Vgg16 model, predict on the test folder and save the results.

    Predictions and the test batches' file names are written under
    RESULTS_DIR.  Returns ``(batches, preds, vgg)``.
    """
    bs = 64
    vgg = Vgg16()
    train_model(vgg, DATA_DIR, bs, no_of_epochs)

    test_dir = DATA_DIR + '/test'
    batches, preds = test_model(vgg, test_dir, batch_size=bs)

    preds_path = RESULTS_DIR + '/test_preds'
    save_array(preds_path, preds)
    names_path = RESULTS_DIR + '/filenames'
    save_array(names_path, batches.filenames)

    return batches, preds, vgg
Exemplo n.º 8
0
def ensemble():
    """
    Average the test predictions of four models and save them.

    All four models are created first (each factory is called with ``True``),
    then each predicts on ``test_loader``.  The mean is saved to PRED_FILE
    and its first ten rows are printed.
    """
    factories = (create_res101, create_res152, create_dense201,
                 create_dense161)
    members = []
    for factory in factories:
        net, _ = factory(True)
        members.append(net)

    member_preds = [np.array(make_preds(net, test_loader)) for net in members]

    preds = np.mean(member_preds, axis=0)
    save_array(PRED_FILE, preds)
    print(preds[:10])
Exemplo n.º 9
0
def find_best_weather():
    """
    Search for a weather cutoff ``d`` that improves the validation F2 score
    and, if one is found, apply it to the test predictions.

    Loads thresholds (THRESHOLD_FILE_ENS), validation labels (VAL_LABELS)
    and validation predictions (PRED_VAL); only columns 0-3 (the weather
    block) are used.  Candidates start at d = 0.5 and advance in steps of
    0.1 while d < 1, each scored after ``get_one_weather``.  When the best
    score beats the baseline by more than 1e-5, the weather columns of the
    PRED_FILE predictions are replaced and saved to PRED_WEATHER.
    """
    thr = load_array(THRESHOLD_FILE_ENS)
    labels = load_array(VAL_LABELS)
    preds = load_array(PRED_VAL)

    print(labels.shape)
    weather = preds[:, 0:4]
    y = labels[0, :, 0:4]
    print(y.shape)
    print(weather.shape)
    thr = thr[0:4]

    def mf(p):
        """Binarise ``p`` with the per-column thresholds and return F2."""
        p2 = np.zeros_like(p)
        for i in range(4):
            # The np.int alias was removed from NumPy; builtin int is the
            # same dtype request.
            p2[:, i] = (p[:, i] > thr[i]).astype(int)
        return fbeta_score(y, p2, beta=2, average='samples')

    base_score = mf(weather)
    print('base score:{}'.format(base_score))
    max_score = base_score
    d = 0.5
    best_d = 0.5
    # NOTE: d accumulates floating-point error, so the final candidate may
    # land just below 1.0 rather than stopping at 0.9.
    while d < 1:
        w = get_one_weather(weather, thr, d)
        score = mf(w)
        print('score{}, d:{}'.format(score, d))
        if score > max_score:
            max_score = score
            best_d = d
        d += 0.1

    print('best d:{}'.format(best_d))
    w1 = force_one_weather(weather, thr)
    score1 = mf(w1)
    print('force one weather score:{}'.format(score1))

    if max_score > base_score + 0.00001:
        test_preds = load_array(PRED_FILE)
        test_w = test_preds[:, 0:4]
        w = get_one_weather(test_w, thr, best_d)
        test_preds[:, 0:4] = w

        save_array(PRED_WEATHER, test_preds)
Exemplo n.º 10
0
def find_best_threshold():
    """
    Optimise the F2 thresholds on the validation predictions and save them.

    Before optimising, each label set in VAL_LABELS is compared element by
    element with the one before it; on the first mismatch an error is
    printed and the process exits.  The thresholds are computed from the
    first label set and saved to THRESHOLD_FILE_ENS.
    """
    preds = load_array(PRED_VAL)
    labels = load_array(VAL_LABELS)
    print(np.array(labels).shape)

    for run in range(1, len(labels)):
        current = labels[run]
        previous = labels[run - 1]
        for row in range(len(current)):
            for col in range(len(current[row])):
                if current[row][col] != previous[row][col]:
                    print('error, check labels failed')
                    exit()

    best = optimise_f2_thresholds(labels[0], preds)
    print('best threshold:')
    print(best)
    save_array(THRESHOLD_FILE_ENS, best)
Exemplo n.º 11
0
 def save(self, filedir, filename, master_params):
     """
     Create ``filedir`` if needed and save ``master_params`` into it.

     The target path is ``filedir + filename`` (no separator is inserted).
     Returns whatever ``utils.save_array`` returns.
     """
     try:
         os.makedirs(filedir)
     except OSError as err:
         # An already existing directory is fine; anything else is not.
         already_there = err.errno == errno.EEXIST
         if not already_there:
             raise
     target = filedir + filename
     return utils.save_array(target, master_params)
Exemplo n.º 12
0
def calc_val_feats():
    """
    Precompute conv features for the validation set and save them.

    Batches come from 'data/valid/' without shuffling or augmentation.
    Features and one-hot labels are written to data/results/.
    """
    print("===== (VALID) Precalc validation conv features =====")
    extractor = PrecalcFeats()
    val_batches = create_batches('data/valid/', shuffle=False, use_da=False)
    print("    (precalc) calculating features...")
    feats = extractor.calc_feats_on_batch(val_batches)
    labels = to_categorical(val_batches.classes)

    # save
    results_stem = "data/results/conv_val_"
    labels_file = results_stem + "labels.h5"
    feats_file = results_stem + "feats.h5"
    save_array(labels_file, labels)
    save_array(feats_file, feats)

    report = (
        ("    (precalc) feats: %s", (feats.shape, )),
        ("    (precalc) saved feats to: %s", feats_file),
        ("    (precalc) saved labels to: %s", labels_file),
    )
    for template, value in report:
        print(template % value)
Exemplo n.º 13
0
def precalculate_conv_output(model, train_batches, valid_batches):
    """
    Run ``model`` over the train and validation batches and save the outputs.

    Each output is computed with ``model.predict_generator`` over
    ``batches.nb_sample`` samples, its shape is echoed, and both arrays are
    saved under MODEL_PATH.  Returns ``(train_features, valid_features)``.
    """
    click.echo('Precalculating convolutional layer outputs...')

    outputs = {}
    for split, batches in (('train', train_batches),
                           ('valid', valid_batches)):
        outputs[split] = model.predict_generator(batches, batches.nb_sample)
        click.echo('%s_features shape: %s' % (split, outputs[split].shape))

    click.echo('Saving data...')
    for split in ('train', 'valid'):
        target = os.path.join(MODEL_PATH, split + '_convlayer_features.bc')
        utils.save_array(target, outputs[split])

    return outputs['train'], outputs['valid']
Exemplo n.º 14
0
def calc_train_da_feats():
    """
    Precompute conv features for five data-augmented passes over the
    training set.

    Each pass creates fresh augmented batches from 'data/train/' and writes
    its features and one-hot labels to data/results/da<pass>_conv_*.h5.
    """
    n_passes = 5
    print("===== (TRAIN) Precalc data-augmented conv features =====")

    extractor = PrecalcFeats()
    for pass_no in range(n_passes):
        print("===== data-aug: %d =====" % pass_no)
        batches = create_batches('data/train/', shuffle=False, use_da=True)
        print("    (precalc) calculating features...")
        feats = extractor.calc_feats_on_batch(batches)
        labels = to_categorical(batches.classes)

        # save
        stem = "data/results/da%d_conv_" % pass_no
        labels_file = stem + "labels.h5"
        feats_file = stem + "feats.h5"
        save_array(labels_file, labels)
        save_array(feats_file, feats)
        print("    (precalc) feats: %s" % (feats.shape, ))
        print("    (precalc) saved feats to: %s" % feats_file)
        print("    (precalc) saved labels to: %s" % labels_file)
Exemplo n.º 15
0
def ensemble(model_name, file_name, tta=False):
    """
    Predict on the test set with one model and save the averaged result.

    Args:
        model_name: name passed to ``create_model``.
        file_name: output file name, placed under ``settings.PREDICT_DIR``.
        tta: when true, the loader is built with ``tta=True`` and 20
            prediction rounds are averaged; otherwise a single round is run
            on a loader built with ``tta=False``.
    """
    model = create_model(model_name)
    test_set = data_loader.get_test_set()

    rounds = 20 if tta else 1

    # Forward the caller's flag: this was hard-coded to tta=True, so a
    # tta=False call still went through the TTA loader.
    loader = data_loader.get_test_loader(model, test_set, tta=tta)

    preds_raw = [np.array(make_preds(model, loader)) for _ in range(rounds)]
    preds = np.mean(preds_raw, axis=0)

    save_array(settings.PREDICT_DIR + os.sep + file_name, preds)
Exemplo n.º 16
0
 def plot(self, figsize=(12, 8)):
     """
     Plot loss against learning rate (log x-axis) and save the artefacts.

     The first 10 and last 5 recorded points are dropped.  The trimmed
     histories and the figure are written under
     ../experiment/lr_find_edsr2, named after ``self.steps`` and
     ``self.epoch``; the figure is then shown.
     """
     plt.figure(figsize=figsize)
     plt.ylabel("loss", fontsize=16)
     plt.xlabel("learning rate (log scale)", fontsize=16)
     plt.xscale("log")
     plt.tick_params(axis='x', which='minor')

     lr_trimmed = self.lr_history[10:-5]
     loss_trimmed = self.loss_history[10:-5]
     plt.plot(lr_trimmed, loss_trimmed)

     out_dir = '../experiment/lr_find_edsr2'
     run_tag = f'steps_{self.steps}_epoch{self.epoch}'
     utils.save_array(f'{out_dir}/lr_history_{run_tag}.bc', lr_trimmed)
     utils.save_array(f'{out_dir}/loss_history_{run_tag}.bc', loss_trimmed)
     plt.savefig(f'{out_dir}/lr_find_{run_tag}.png', bbox_inches='tight')
     plt.show()
Exemplo n.º 17
0
def ensemble():
    """
    Predict with every checkpoint matched by ``w_file_matcher`` and save
    both the raw per-model predictions and their mean.

    The architecture name is the part of the checkpoint file name before
    the first underscore.
    """
    all_preds = []

    for pattern in w_file_matcher:
        os.chdir(MODEL_DIR)
        for w_file in glob.glob(pattern):
            checkpoint = MODEL_DIR + '/' + w_file
            arch = w_file.partition('_')[0]
            print(checkpoint)
            net = create_model(arch)
            net.load_state_dict(torch.load(checkpoint))

            all_preds.append(np.array(make_preds(net)))
            del net

    save_array(PRED_FILE_RAW, all_preds)
    mean_preds = np.mean(all_preds, axis=0)
    save_array(PRED_FILE, mean_preds)
Exemplo n.º 18
0
def ensemble_val_data():
    """
    Predict on the validation data with every matched checkpoint.

    Saves the raw per-model predictions (PRED_VAL_RAW), their mean
    (PRED_VAL) and the per-model labels (VAL_LABELS).
    Returns ``(mean_predictions, labels)``.
    """
    all_preds, all_labels = [], []

    for pattern in w_file_matcher:
        os.chdir(MODEL_DIR)
        for w_file in glob.glob(pattern):
            checkpoint = MODEL_DIR + '/' + w_file
            arch = w_file.partition('_')[0]
            print(checkpoint)
            net = create_model(arch)
            net.load_state_dict(torch.load(checkpoint))

            pred, y = make_preds_val(net)
            all_preds.append(pred)
            all_labels.append(y)
            del net

    save_array(PRED_VAL_RAW, all_preds)
    mean_preds = np.mean(all_preds, axis=0)
    save_array(PRED_VAL, mean_preds)
    save_array(VAL_LABELS, all_labels)
    return mean_preds, all_labels
Exemplo n.º 19
0
def ensemble():
    """
    Predict on the test set with every saved checkpoint and store the results.

    Changes the working directory to MODEL_DIR, globs each pattern in
    ``w_file_matcher``, builds the architecture named by the checkpoint's
    filename prefix, loads the weights and runs ``make_preds`` on
    ``test_loader``.  The per-model predictions are saved to PRED_FILE_RAW
    and their mean to PRED_FILE.

    Raises:
        ValueError: if a matched checkpoint name starts with none of the
            known architecture prefixes (this used to fail later on an
            unbound ``model`` variable).
    """
    # (filename prefix, model factory)
    known_models = (
        ('dense161', create_dense161),
        ('dense169', create_dense169),
        ('dense201', create_dense201),
        ('res50', create_res50),
        ('res101', create_res101),
        ('res152', create_res152),
        ('vgg16', create_vgg16),
        ('vgg19', create_vgg19),
        ('inceptionv3', create_inceptionv3),
    )

    preds_raw = []
    os.chdir(MODEL_DIR)
    for match_str in w_file_matcher:
        for w_file in glob.glob(match_str):
            full_w_file = MODEL_DIR + '/' + w_file
            factory = next((make for prefix, make in known_models
                            if w_file.startswith(prefix)), None)
            if factory is None:
                raise ValueError(
                    'unrecognised model prefix in weight file: ' + full_w_file)

            model, _ = factory()
            model.load_state_dict(torch.load(full_w_file))
            print(full_w_file)

            preds_raw.append(np.array(make_preds(model, test_loader)))

            del model

    save_array(PRED_FILE_RAW, preds_raw)
    preds = np.mean(preds_raw, axis=0)
    save_array(PRED_FILE, preds)
Exemplo n.º 20
0
def test_ensemble(models):
    """
    Average test predictions over all models and five augmented passes.

    Each pass creates a fresh augmented test batch through
    ``models[0].create_test_batches(use_da=True)``; every model predicts on
    it and the predictions are averaged over the models.  The pass averages
    are averaged again, saved and returned.  Accumulators are 1000 x 8.
    """
    n_samples, n_classes, n_passes = 1000, 8, 5
    out_shape = (n_samples, n_classes)
    preds = np.zeros(out_shape)

    for pass_no in range(n_passes):
        # make test batch randomly with data aug
        print("====== data-aug test batch: %d ======" % pass_no)
        test_batches = models[0].create_test_batches(use_da=True)
        pass_total = np.zeros(out_shape)

        for model_no, member in enumerate(models):
            print("====== running test model: %d ======" % model_no)
            pass_total = pass_total + member.test_on_batch(test_batches)

        preds = preds + pass_total / len(models)

    preds /= n_passes
    save_array('data/results/ensemble_dn512_ep20_da_test_preds.h5', preds)
    return preds
Exemplo n.º 21
0
def test_ensemble(models):
    """
    Average test predictions over all models and five augmented passes.

    Each pass creates fresh augmented batches from 'data/test/'; every model
    predicts on them and the predictions are averaged over the models.  The
    pass averages are averaged again, saved to
    'submits/resnet_ft_ens_preds.gz' and returned.  Accumulators are
    1000 x 8.
    """
    n_samples, n_classes, n_passes = 1000, 8, 5
    out_shape = (n_samples, n_classes)
    preds = np.zeros(out_shape)

    for pass_no in range(n_passes):
        # make test batch randomly with data aug
        print("====== data-aug test batch: %d ======" % pass_no)
        test_batches = create_batches('data/test/', shuffle=False, use_da=True)
        pass_total = np.zeros(out_shape)

        for model_no, member in enumerate(models):
            print("====== running test model: %d ======" % model_no)
            pass_total = pass_total + member.test_on_batch(test_batches)

        preds = preds + pass_total / len(models)

    preds /= n_passes
    save_array('submits/resnet_ft_ens_preds.gz', preds)
    return preds
Exemplo n.º 22
0
def test_ensemble(models):
    """
    Average test predictions over all models and five precomputed
    augmented feature sets.

    Pass ``k`` loads 'data/results/da<k>_conv_test_feats.h5'; every model's
    ``test`` output on it is averaged over the models.  The pass averages
    are averaged again, saved to 'data/results/ensemble_dense_preds.h5' and
    returned.  Accumulators are 1000 x 8.
    """
    n_samples, n_classes, n_passes = 1000, 8, 5
    out_shape = (n_samples, n_classes)
    preds = np.zeros(out_shape)

    for pass_no in range(n_passes):
        # make test batch randomly with data aug
        print("====== data-aug test batch: %d ======" % pass_no)
        pass_total = np.zeros(out_shape)
        feats_path = "data/results/da%d_conv_test_feats.h5" % pass_no
        conv_test_feat = load_array(feats_path)

        for model_no, member in enumerate(models):
            print("====== running test model: %d ======" % model_no)
            pass_total = pass_total + member.test(conv_test_feat)

        preds = preds + pass_total / len(models)

    preds /= n_passes
    save_array('data/results/ensemble_dense_preds.h5', preds)
    return preds
Exemplo n.º 23
0
def save_pseudo_if_needed(y_pseudo,
                          split,
                          dataset,
                          epoch,
                          config,
                          is_best,
                          force_save=False):
    """
    Write pseudolabels to CSV files according to the save settings in config.

    Nothing is written when ``config.save_pseudo_step`` is falsy, when
    ``y_pseudo`` is None, or when ``split`` is not in ``config.save_splits``.
    For the 'NoisyStudent' algorithm a single 'pseudo.csv' is written.
    Otherwise up to three files are written: a per-epoch file (on
    ``force_save`` or every ``save_pseudo_step`` epochs), a 'last' file
    (``config.save_last``) and a 'best' file (``config.save_best`` and
    ``is_best``).
    """
    if not config.save_pseudo_step:
        return
    if y_pseudo is None:
        return
    if split not in config.save_splits:
        return

    prefix = get_pred_prefix(dataset, config)

    if config.algorithm == 'NoisyStudent':
        # save on first epoch; pseudolabels are constant
        save_array(y_pseudo, prefix + 'pseudo.csv')
        return

    def _write(tag):
        save_array(y_pseudo, prefix + f'epoch:{tag}_pseudo.csv')

    if force_save or (config.save_pseudo_step is not None and
                      (epoch + 1) % config.save_pseudo_step == 0):
        _write(epoch)
    if config.save_last:
        _write('last')
    if config.save_best and is_best:
        _write('best')
Exemplo n.º 24
0
def save_pred_if_needed(y_pred,
                        split,
                        dataset,
                        epoch,
                        config,
                        is_best,
                        force_save=False):
    """
    Write predictions to CSV files according to the save settings in config.

    Nothing is written when ``config.save_pred_step`` is falsy or when
    ``split`` is not in ``config.save_splits``.  Otherwise up to three files
    are written: a per-epoch file (on ``force_save`` or every
    ``save_pred_step`` epochs), a 'last' file (``config.save_last``) and a
    'best' file (``config.save_best`` and ``is_best``).
    """
    if not config.save_pred_step:
        return
    if split not in config.save_splits:
        return

    prefix = get_pred_prefix(dataset, config)

    def _write(tag):
        save_array(y_pred, prefix + f'epoch:{tag}_pred.csv')

    if force_save or (config.save_pred_step is not None and
                      (epoch + 1) % config.save_pred_step == 0):
        _write(epoch)
    if config.save_last:
        _write('last')
    if config.save_best and is_best:
        _write('best')
Exemplo n.º 25
0
# Optionally add the "ti" feature block to the feature names and to the
# lists of train/test matrices.
if use_ti:
    feature_names.extend(feature_names_ti)
    X_train.append(X_ti_train)
    X_test.append(X_ti_test)

feature_names = np.array(feature_names)
# Combine the collected feature blocks into one matrix per split.
# NOTE(review): presumably scipy.sparse.hstack, given the .todense() calls
# below -- confirm against the file's imports.
X_train = hstack(X_train)
X_test = hstack(X_test)
print("done assembling features in %fs" % (time() - t0))

# <codecell>

# Models we will use
# Both matrices are densified; the dense test matrix is also saved under
# the dataset_version directory.
X_train = X_train.todense()
X_test = X_test.todense()
utils.save_array("%s/X_test" % dataset_version, X_test)

# Starts empty; nothing in the visible code appends to it.
fitted = []


def grid_search(estimator):
    """
    Set up a GridSearchCV for one estimator specification.

    ``estimator`` is indexed positionally: item 1 is used as the estimator
    object and item 2 as its parameter grid.

    NOTE(review): ``t_name``, ``e_name`` and ``scoring`` are not defined in
    the visible body -- presumably module-level globals; confirm.  The
    visible part ends after the GridSearchCV is constructed.
    """
    # try:
    # pipeline = Pipeline([(estimator[0], estimator[1])])
    pipeline = estimator[1]
    # GridSearchCV receives the grid wrapped in a one-element list.
    param_grid = [estimator[2]]
    # print()
    print("Performing grid search for %s %s" % (t_name, e_name))
    # print(str(type(pipeline)))
    # print("parameters:")
    pprint(param_grid)
    # n_jobs=-1: use all available processors.
    clf = GridSearchCV(pipeline, param_grid, n_jobs=-1, verbose=0, scoring=scoring)
Exemplo n.º 26
0
# One ImageFolder dataset per split, each with its own "v3" transform.
dsets_v3 = {
    x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms_v3[x])
    for x in ['train', 'valid']
}
# Matching data loaders; both splits are shuffled and use 4 workers.
dset_loaders_v3 = {
    x: torch.utils.data.DataLoader(dsets_v3[x],
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=4)
    for x in ['train', 'valid']
}

# NOTE(review): sizes and class names are read from `dsets`, not from the
# `dsets_v3` defined above -- `dsets` is presumably defined earlier in the
# file; confirm this is intended.
dset_sizes = {x: len(dsets[x]) for x in ['train', 'valid']}
dset_classes = dsets['train'].classes
# Persist the class-name list.
save_array(CLASSES_FILE, dset_classes)

use_gpu = torch.cuda.is_available()

# (accuracy, weight-file path) tuples of the checkpoints written so far;
# appended to by save_weights().
w_files_training = []


def save_weights(acc, model, epoch, max_num=3):
    """
    Save a checkpoint named after the model name, epoch and accuracy.

    While fewer than ``max_num`` checkpoints are tracked in
    ``w_files_training``, the state dict is written under MODEL_DIR and the
    (acc, path) pair is recorded.

    NOTE(review): the handling once ``max_num`` checkpoints exist is not
    fully visible here.
    """
    f_name = '{}_{}_{:.5f}.pth'.format(model.name, epoch, acc)
    w_file_path = os.path.join(MODEL_DIR, f_name)
    if len(w_files_training) < max_num:
        # Still room: record and write the checkpoint, then stop.
        w_files_training.append((acc, w_file_path))
        torch.save(model.state_dict(), w_file_path)
        return
    # NOTE(review): `min` shadows the builtin of the same name inside this
    # function.
    min = 10.0
    index_min = -1
Exemplo n.º 27
0
        tfidf_ti.transform(pairs[ti + "_b"]))
    # Row-wise comparison features, then centred by the mean and scaled by
    # the standard deviation.
    comps_diffs_df = pairs.apply(utils.compare, axis=1)
    comps_diffs = comps_diffs_df - comps_diffs_df.mean()
    comps_diffs = np.array(comps_diffs / comps_diffs.std())

    # Target: True where the "_a" and "_b" values of the `un` column match.
    y = np.array(pairs[un + "_a"] == pairs[un + "_b"])

    feature_names_comps = np.array(comps_diffs_df.columns.values)
    feature_names_ab = np.array(tfidf_ab.get_feature_names())
    feature_names_ti = np.array(tfidf_ti.get_feature_names())

    # Everything below is written into the dataset_version directory.
    if not os.path.exists(dataset_version):
        os.makedirs(dataset_version)

    utils.save_csr("%s/X_ab" % dataset_version, ab_diffs)
    utils.save_csr("%s/X_ti" % dataset_version, ti_diffs)
    utils.save_array("%s/X_comps" % dataset_version, comps_diffs)
    utils.save_array("%s/y" % dataset_version, y)

    utils.save_array("%s/feature_names_ab" % dataset_version, feature_names_ab)
    utils.save_array("%s/feature_names_ti" % dataset_version, feature_names_ti)
    utils.save_array("%s/feature_names_comps" % dataset_version,
                     feature_names_comps)
    # NOTE(review): this pickle is written to the same
    # "<dataset_version>/feature_names_comps" path that the save_array call
    # just above used -- looks unintended; confirm the intended file name.
    pairs[['pmid_a', 'pmid_b', un + '_a',
           un + '_b']].to_pickle("%s/feature_names_comps" % dataset_version)

    # NOTE(review): a `with open(...)` block would close the file even if
    # write() raises.
    text_file = open("%s/description.txt" % dataset_version, "w")
    text_file.write(dataset_description)
    text_file.close()
    print("done with %s" % dataset_version)
from mnist_sequence_api import MNIST_Sequence_API
import numpy as np
from utils import save_array, load_array

seq_len = 5  # generate sequences of this length
# Shared sequence generator used by generate_data() below.
api_object = MNIST_Sequence_API()


def generate_data(n, seq_len, image_width, spacing_range=(0, 0)):
    """
    Build ``n`` random digit-sequence images together with their labels.

    For each sample, ``seq_len`` digits are drawn uniformly from 0-9 and
    rendered by the module-level ``api_object``; the image is mapped through
    ``(255 - pixel) / 255``.  Returns ``(inputs, labels)`` as numpy arrays.
    """
    images = []
    digit_rows = []
    for _ in range(n):
        digits = np.random.randint(0, 10, seq_len)
        rendered = api_object.generate_mnist_sequence(digits, spacing_range,
                                                      image_width)
        # normalize the data
        images.append((255 - rendered) / 255)
        digit_rows.append(digits)
    return np.array(images), np.array(digit_rows)


# Training set: 500 sequences; the image width is 28 pixels per digit.
n_train = 500
inputs, labels = generate_data(n_train, seq_len, 28 * seq_len)
save_array(inputs, "data/train_inputs.bc")
save_array(labels, "data/train_labels.bc")

# Second set: 250 sequences.
# NOTE(review): the count is named n_validation but the files are written
# as data/test_*.bc -- confirm which name is intended.
n_validation = 250
inputs, labels = generate_data(n_validation, seq_len, 28 * seq_len)
save_array(inputs, "data/test_inputs.bc")
save_array(labels, "data/test_labels.bc")
Exemplo n.º 29
0
def run_active_learning(selection_fn,
                        datasets,
                        grouper,
                        config,
                        general_logger,
                        full_dataset=None):
    """
    Reveal labels on the target split and register the derived splits.

    Steps, in order:
      1. ``selection_fn.select_and_reveal`` reveals ``config.n_shots`` labels
         through the target split's label manager.
      2. The labeled dataset is built: either the label manager's labeled
         subset, or (``config.use_source_labeled``) a WILDSSubset of
         ``full_dataset`` over the train indices, optionally preceded by the
         labeled target indices.
      3. The unlabeled indices are dumped to ``unlabeled_test_ids.csv``.
      4. New entries are added to ``datasets``: the labeled split, the
         augmented unlabeled split, the eval-transform unlabeled split and,
         for the 'fmow' dataset, a de-duplicated "disjoint" eval split.
      5. For 'NoisyStudent', the initial pseudolabels are saved.

    Returns:
        Tuple of the labeled split name and the augmented unlabeled split
        name.
    """
    target = config.target_split
    label_manager = datasets[target]['label_manager']

    # First run selection function
    selection_fn.select_and_reveal(label_manager=label_manager,
                                   K=config.n_shots)
    general_logger.write(
        f"Total Labels Revealed: {label_manager.num_labeled}\n")

    # Build the labeled dataset, optionally concatenating labeled source
    # examples to the labeled target examples.
    if not config.use_source_labeled:
        labeled_dataset = label_manager.get_labeled_subset()
    else:
        assert full_dataset is not None
        if config.use_target_labeled:
            # target points at front
            indices = np.concatenate(
                (label_manager.labeled_indices,
                 datasets['train']['dataset'].indices)).astype(int)
        else:
            # We allow optionally ignoring the target examples entirely
            indices = datasets['train']['dataset'].indices
        labeled_dataset = WILDSSubset(full_dataset, indices,
                                      label_manager.labeled_train_transform)

    if config.upsample_target_labeled:
        # upsample target labels (compared to src labels) using a weighted
        # sampler: group by split and set uniform_over_groups=True
        labeled_grouper = CombinatorialGrouper(dataset=full_dataset,
                                               groupby_fields=['split'])
        labeled_config = copy(config)
        labeled_config.uniform_over_groups = True
    else:
        labeled_grouper, labeled_config = grouper, config

    # Dump unlabeled indices to file
    save_array(label_manager.unlabeled_indices,
               csv_path=f'{config.log_dir}/unlabeled_test_ids.csv')

    # Names of the splits registered below.
    labeled_name = f'labeled_{target}'
    unlabeled_name = f'unlabeled_{target}'
    augmented_name = f'unlabeled_{target}_augmented'
    disjoint_name = f'unlabeled_{target}_disjoint'

    # Training split: labeled test
    datasets[labeled_name] = configure_split_dict(
        data=labeled_dataset,
        split=labeled_name,
        split_name=labeled_name,
        get_train=True,
        verbose=True,
        grouper=labeled_grouper,
        batch_size=config.batch_size,
        config=labeled_config)

    # Training split: unlabeled test
    datasets[augmented_name] = configure_split_dict(
        data=label_manager.get_unlabeled_subset(train=True),
        split=augmented_name,
        split_name=augmented_name,
        get_train=True,
        get_eval=True,
        grouper=grouper,
        batch_size=config.unlabeled_batch_size,
        verbose=True,
        config=config)

    # Eval split: unlabeled test, eval transform
    datasets[unlabeled_name] = configure_split_dict(
        data=label_manager.get_unlabeled_subset(train=False,
                                                return_pseudolabels=False),
        split=unlabeled_name,
        split_name=unlabeled_name,
        get_eval=True,
        grouper=None,
        verbose=True,
        batch_size=config.unlabeled_batch_size,
        config=config)

    # Special de-duplicated eval set for fmow
    if config.dataset == 'fmow':
        disjoint_unlabeled_indices = fmow_deduplicate_locations(
            negative_indices=label_manager.labeled_indices,
            superset_indices=label_manager.unlabeled_indices,
            config=config)
        save_array(disjoint_unlabeled_indices,
                   csv_path=f'{config.log_dir}/disjoint_ids.csv')
        # build disjoint split
        disjoint_eval_dataset = WILDSSubset(full_dataset,
                                            disjoint_unlabeled_indices,
                                            label_manager.eval_transform)
        datasets[disjoint_name] = configure_split_dict(
            data=disjoint_eval_dataset,
            split=disjoint_name,
            split_name=disjoint_name,
            get_eval=True,
            grouper=None,
            verbose=True,
            batch_size=config.unlabeled_batch_size,
            config=config)

    # Save NoisyStudent pseudolabels initially
    if config.algorithm == 'NoisyStudent':
        save_pseudo_if_needed(label_manager.unlabeled_pseudolabel_array,
                              unlabeled_name, datasets[unlabeled_name], None,
                              config, None)
        if disjoint_name in datasets:
            # Positions of the disjoint indices within the unlabeled list.
            positions = [
                label_manager.unlabeled_indices.index(i)
                for i in disjoint_unlabeled_indices
            ]
            save_pseudo_if_needed(
                label_manager.unlabeled_pseudolabel_array[positions],
                disjoint_name, datasets[disjoint_name], None, config, None)

    # return names of train_split, unlabeled_split
    return labeled_name, augmented_name
Exemplo n.º 30
0
            # No cached training data was loaded: read the raw text and
            # embed it with the model selected by the command-line flags.
            if not loaded:
                train_text = utils.read_text(args.d)
                utils.logger.info("train text reading finished")

                if args.use_d2v:
                    # doc2vec path: tokenize, then compute vectors using the
                    # model at args.dm.
                    train_tokens = utils.tokenize_paragraph_d2v(train_text)
                    utils.logger.info("train text tokenizing finished")
                    train_data = utils.compute_paragraph_doc2vec(
                        train_tokens,
                        vector_size=args.vector_size,
                        model_path=args.dm,
                        load_model=True,
                        predict=True)
                    utils.logger.info("train data doc2vec computing finished")
                    if utils.is_path_creatable(args.dd):
                        utils.save_array(args.dd, train_data)
                    # NOTE(review): this success message is logged even when
                    # the path check above failed and nothing was saved.
                    utils.logger.info("save doc2vec train data successfully")

                elif args.use_w2v:
                    # word2vec path: same flow, using the model at args.wm.
                    train_tokens = utils.tokenize_paragraph_w2v(train_text)
                    utils.logger.info("train text tokenizing finished")
                    train_data = utils.compute_paragraph_word2vec(
                        train_tokens,
                        vector_size=args.vector_size,
                        model_path=args.wm,
                        load_model=True,
                        predict=True)
                    utils.logger.info("train data word2vec computing finished")
                    if utils.is_path_creatable(args.wd):
                        utils.save_array(args.wd, train_data)
                    # NOTE(review): logged even when nothing was saved (see
                    # the doc2vec branch).
                    utils.logger.info("save word2vec train data successfully")
Exemplo n.º 31
0
    # Absolute TF-IDF differences between the "_a" and "_b" texts of each
    # pair, for the `ab` and `ti` columns respectively.
    ab_diffs = np.abs(tfidf_ab.transform(pairs[ab+"_a"]) - tfidf_ab.transform(pairs[ab+"_b"]))
    ti_diffs = np.abs(tfidf_ti.transform(pairs[ti+"_a"]) - tfidf_ti.transform(pairs[ti+"_b"]))
    # Row-wise comparison features, then centred by the mean and scaled by
    # the standard deviation.
    comps_diffs_df = pairs.apply(utils.compare, axis=1)
    comps_diffs = comps_diffs_df - comps_diffs_df.mean()
    comps_diffs = np.array(comps_diffs / comps_diffs.std())

    # Target: True where the "_a" and "_b" values of the `un` column match.
    y = np.array(pairs[un+"_a"] == pairs[un+"_b"])

    feature_names_comps = np.array(comps_diffs_df.columns.values)
    feature_names_ab = np.array(tfidf_ab.get_feature_names())
    feature_names_ti = np.array(tfidf_ti.get_feature_names())

    # Everything below is written into the dataset_version directory.
    if not os.path.exists(dataset_version):
        os.makedirs(dataset_version)

    utils.save_csr("%s/X_ab" % dataset_version, ab_diffs)
    utils.save_csr("%s/X_ti" % dataset_version, ti_diffs)
    utils.save_array("%s/X_comps" % dataset_version, comps_diffs)
    utils.save_array("%s/y" % dataset_version, y)

    utils.save_array("%s/feature_names_ab" % dataset_version, feature_names_ab)
    utils.save_array("%s/feature_names_ti" % dataset_version, feature_names_ti)
    utils.save_array("%s/feature_names_comps" % dataset_version, feature_names_comps)
    # NOTE(review): this pickle is written to the same
    # "<dataset_version>/feature_names_comps" path that the save_array call
    # just above used -- looks unintended; confirm the intended file name.
    pairs[['pmid_a', 'pmid_b', un+'_a', un+'_b']].to_pickle("%s/feature_names_comps" % dataset_version)


    # NOTE(review): a `with open(...)` block would close the file even if
    # write() raises.
    text_file = open("%s/description.txt" % dataset_version, "w")
    text_file.write(dataset_description)
    text_file.close()
    print("done with %s" % dataset_version)
Exemplo n.º 32
0
# Per-row latitude arrays: element 1 of every polyline point.
train['LATITUDE'] = pd.Series([
    np.array([point[1] for point in poly], dtype=np.float32)
    for poly in polyline
])

# In[150]:

# Per-row longitude arrays: element 0 of every polyline point.
train['LONGITUDE'] = pd.Series([
    np.array([point[0] for point in poly], dtype=np.float32)
    for poly in polyline
])

# In[157]:

# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# ndarray and is available on old and new pandas alike.
utils.save_array(data_path + 'train/train.bc', train.values)

# In[158]:

utils.save_array(data_path + 'train/meta_train.bc', meta.values)

# ## Further Feature Engineering

# After converting 'csv_to_hdf5.py' functionality to pandas, I saved that array and then simply constructed the rest of the features as specified in the paper using pandas. I didn't bother seeing how the author did it as it was extremely obtuse and involved the fuel module.

# In[424]:

train = pd.DataFrame(utils.load_array(data_path + 'train/train.bc'),
                     columns=[
                         'TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND',
                         'TAXI_ID', 'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA',