def select_and_reveal(self, label_manager, K):
    """
    Labels K examples per group by passing label_manager the indices of
    examples to reveal. Wrapper for the select() function implemented by
    subclasses.
    """
    if K == 0:
        return
    groups = self.select_grouper.metadata_to_group(
        label_manager.unlabeled_metadata_array)
    group_ids = groups.unique().int().tolist()
    remaining = (torch.ones(len(group_ids)) * K).int().tolist()
    reveal = []
    # First satisfy each group's budget from selections carried over
    # from previous rounds.
    for idx in self._prior_selections:
        i = label_manager.unlabeled_indices.index(idx)
        g = groups[i]
        g_ind = group_ids.index(g)
        if remaining[g_ind] > 0:
            reveal.append(idx)
            remaining[g_ind] -= 1
        if sum(remaining) == 0:
            break
    self._prior_selections = [
        idx for idx in self._prior_selections if idx not in set(reveal)
    ]
    # Fill any remaining budget via the subclass's selection strategy.
    if sum(remaining) > 0:
        unlabeled_indices = torch.tensor(label_manager.unlabeled_indices)
        reveal = reveal + self.select(label_manager, remaining,
                                      unlabeled_indices, groups, group_ids)
    label_manager.reveal_labels(reveal)
    save_array(reveal,
               csv_path=f"{self.log_dir}/selected_ids.csv",
               mode=self.mode)
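# Hedged sketch of the select() hook that subclasses implement. The signature
# mirrors the call inside select_and_reveal() above; the random strategy and
# the class/base-class names are illustrative assumptions, not the original
# code.
class RandomSelectionFunction(SelectionFunction):
    def select(self, label_manager, remaining, unlabeled_indices, groups,
               group_ids):
        reveal = []
        for g, budget in zip(group_ids, remaining):
            # Candidate indices belonging to this group, randomly permuted.
            candidates = unlabeled_indices[groups == g]
            perm = torch.randperm(len(candidates))
            reveal.extend(candidates[perm][:budget].tolist())
        return reveal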
def __init__(self, w):
    '''
    Generates random messages.

    # Parameters
    -------------
    w: np.array
        Weights for randomly choosing among the message versions.
    '''
    self.w = w
    self.N = len(w)
    # Assign each message version a unique ID from 0 to N-1.
    self.setN = np.arange(self.N)
    self.M = sum(w)
    # With integer weights, w / self.M would floor to an array of 0s,
    # so use numpy's true division.
    self.norm_w = np.true_divide(w, self.M)
    # Pre-draw all M messages from the N versions in a single pass.
    if self.M < 6:
        self.shuffle = np.random.choice(self.setN, self.M, p=self.norm_w)
    else:
        # Use bcolz to store arrays > 1MB to speed up computation.
        utils.save_array("shuffle.bc",
                         np.random.choice(self.setN, self.M, p=self.norm_w))
        self.shuffle = utils.load_array("shuffle.bc")
    # Every time the message function takes a random message, current_id
    # advances to the next pre-drawn message.
    self.current_id = 0
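# Hedged sketch of the "message function" the constructor's comment refers
# to: return the next pre-drawn message ID and advance current_id. The
# wrap-around behaviour is an assumption for illustration.
def message(self):
    sample = self.shuffle[self.current_id]
    self.current_id = (self.current_id + 1) % self.M
    return sample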
def load_data(self, serialized_data_folder='./',
              path_to_vehicle_folder='vehicles/',
              path_to_non_vehicle_folder='non-vehicles/'):
    path_to_X_dat_file = '{0}/{1}'.format(serialized_data_folder, 'X.dat')
    path_to_y_dat_file = '{0}/{1}'.format(serialized_data_folder, 'y.dat')
    X_dat_file_exists = os.path.exists(path_to_X_dat_file)
    y_dat_file_exists = os.path.exists(path_to_y_dat_file)
    if X_dat_file_exists and y_dat_file_exists:
        print('Loading from serialized ...')
        self.X = load_array(path_to_X_dat_file)
        self.y = load_array(path_to_y_dat_file)
        print('Done reading serialized arrays')
    else:
        print('Creating data from image folders')
        non_vehicle_class, vehicle_class = 0, 1
        non_vehicle_X, non_vehicle_y = self._get_X_y(
            path_to_non_vehicle_folder, non_vehicle_class)
        vehicle_X, vehicle_y = self._get_X_y(path_to_vehicle_folder,
                                             vehicle_class)
        self.X = np.concatenate((non_vehicle_X, vehicle_X))
        self.y = np.concatenate((non_vehicle_y, vehicle_y))
        print('Data created successfully, creating serialized numpy arrays')
        save_array(path_to_X_dat_file, self.X)
        save_array(path_to_y_dat_file, self.y)
        print('Done saving arrays')
def ensemble():
    preds_raw = []
    os.chdir(MODEL_DIR)
    total_weight = 0
    preds_w = None
    for match_str in w_file_matcher:
        w_files = glob.glob(match_str)
        for w_file in w_files:
            weight = 0
            full_w_file = MODEL_DIR + '/' + w_file
            if w_file.startswith('dense161'):
                model, _ = create_dense161()
                weight = 1
            elif w_file.startswith('dense169'):
                model, _ = create_dense169()
                weight = 0.8
            elif w_file.startswith('dense201'):
                model, _ = create_dense201()
                weight = 1
            elif w_file.startswith('res50'):
                model, _ = create_res50()
                weight = 0.9
            elif w_file.startswith('res101'):
                model, _ = create_res101()
                weight = 0.9
            elif w_file.startswith('res152'):
                model, _ = create_res152()
                weight = 0.9
            elif w_file.startswith('vgg16'):
                model, _ = create_vgg16()
                weight = 0.2
            elif w_file.startswith('vgg19'):
                model, _ = create_vgg19()
                weight = 0.7
            elif w_file.startswith('inceptionv3'):
                model, _ = create_inceptionv3()
                weight = 0.8
            else:
                # Skip weight files that match no known architecture;
                # falling through here would reuse (or fail to define)
                # `model`.
                continue
            model.load_state_dict(torch.load(full_w_file))
            print(full_w_file)
            pred = make_preds(model, test_loader)
            pred = np.array(pred)
            preds_raw.append(pred)
            if preds_w is None:
                preds_w = np.zeros(pred.shape)
            preds_w += pred * weight
            total_weight += weight
            del model
    save_array(PRED_FILE_RAW, preds_raw)
    preds = np.mean(preds_raw, axis=0)
    # preds = preds_w / total_weight  # weighted-average alternative
    save_array(PRED_FILE, preds)
def save_precomputed_conv_models(self):
    fName1 = "precomputed_trn_features." + self.runID + ".h5"
    fName2 = "precomputed_val_features." + self.runID + ".h5"
    save_array(fName1, self.train_precomputed)
    save_array(fName2, self.val_precomputed)
    print("models saved to files: ", fName1, " and ", fName2)
    return self
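# Hedged counterpart sketch for restoring the two arrays saved above,
# assuming the same load_array helper used elsewhere in this codebase:
def load_precomputed_conv_models(self):
    self.train_precomputed = load_array(
        "precomputed_trn_features." + self.runID + ".h5")
    self.val_precomputed = load_array(
        "precomputed_val_features." + self.runID + ".h5")
    return self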
def save_layers(PV, IBA):
    # save PV
    nucleus_channel_PV = read_layer(PV, 0)
    utils.save_array("data/PV/X_cells_only.bc", nucleus_channel_PV)
    # save IBA
    nucleus_channel_IBA1 = read_layer(IBA, 0)
    utils.save_array("data/IBA1/X_cells_only.bc", nucleus_channel_IBA1)
def train_and_test(no_of_epochs=4):
    batch_size = 64
    vgg = Vgg16()
    train_model(vgg, DATA_DIR, batch_size, no_of_epochs)
    batches, preds = test_model(vgg, DATA_DIR + '/test',
                                batch_size=batch_size)
    save_array(RESULTS_DIR + '/test_preds', preds)
    save_array(RESULTS_DIR + '/filenames', batches.filenames)
    return batches, preds, vgg
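# Hedged follow-on sketch: the two arrays saved above are typically reloaded
# to build a Kaggle-style submission. write_submission, the two-column
# (cat/dog-style) prediction layout, and the clipping bounds are assumptions,
# not the original code.
import numpy as np

def write_submission(csv_path=RESULTS_DIR + '/submission.csv'):
    preds = load_array(RESULTS_DIR + '/test_preds')
    filenames = load_array(RESULTS_DIR + '/filenames')
    ids = [f.split('/')[-1].split('.')[0] for f in filenames]
    # Clip probabilities away from 0 and 1 to soften the log-loss penalty.
    clipped = np.clip(preds[:, 1], 0.02, 0.98)
    with open(csv_path, 'w') as out:
        out.write('id,label\n')
        for i, p in zip(ids, clipped):
            out.write('%s,%.4f\n' % (i, p))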
def ensemble():
    res101, _ = create_res101(True)
    res152, _ = create_res152(True)
    dense201, _ = create_dense201(True)
    dense161, _ = create_dense161(True)
    pred1 = np.array(make_preds(res101, test_loader))
    pred2 = np.array(make_preds(res152, test_loader))
    pred3 = np.array(make_preds(dense201, test_loader))
    pred4 = np.array(make_preds(dense161, test_loader))
    preds = np.mean([pred1, pred2, pred3, pred4], axis=0)
    save_array(PRED_FILE, preds)
    print(preds[:10])
def find_best_weather():
    thr = load_array(THRESHOLD_FILE_ENS)
    labels = load_array(VAL_LABELS)
    preds = load_array(PRED_VAL)
    print(labels.shape)
    # The first four columns are the mutually exclusive weather classes.
    weather = preds[:, 0:4]
    y = labels[0, :, 0:4]
    print(y.shape)
    print(weather.shape)
    thr = thr[0:4]

    def mf(p):
        p2 = np.zeros_like(p)
        for i in range(4):
            # np.int is deprecated in modern NumPy; plain int is equivalent.
            p2[:, i] = (p[:, i] > thr[i]).astype(int)
        score1 = fbeta_score(y, p2, beta=2, average='samples')
        return score1

    base_score = mf(weather)
    print('base score:{}'.format(base_score))
    max_score = base_score
    d = 0.5
    best_d = 0.5
    best_w = weather
    # Grid-search the discount factor d over [0.5, 1.0).
    while d < 1:
        w = get_one_weather(weather, thr, d)
        score = mf(w)
        print('score:{}, d:{}'.format(score, d))
        if score > max_score:
            max_score = score
            best_d = d
            best_w = w
        d += 0.1
    print('best d:{}'.format(best_d))
    w1 = force_one_weather(weather, thr)
    score1 = mf(w1)
    print('force one weather score:{}'.format(score1))
    # Only rewrite the test predictions if tuning beat the baseline.
    if max_score > base_score + 0.00001:
        test_preds = load_array(PRED_FILE)
        test_w = test_preds[:, 0:4]
        w = get_one_weather(test_w, thr, best_d)
        test_preds[:, 0:4] = w
        save_array(PRED_WEATHER, test_preds)
def find_best_threshold():
    preds = load_array(PRED_VAL)
    labels = load_array(VAL_LABELS)
    print(np.array(labels).shape)
    # Sanity check: every model in the ensemble must have seen the
    # validation data in the same order, i.e. all label arrays must match.
    for i in range(1, len(labels)):
        for j in range(len(labels[i])):
            for k in range(len(labels[i][j])):
                if labels[i][j][k] != labels[i - 1][j][k]:
                    print('error, check labels failed')
                    exit()
    x = optimise_f2_thresholds(labels[0], preds)
    print('best threshold:')
    print(x)
    save_array(THRESHOLD_FILE_ENS, x)
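# optimise_f2_thresholds is assumed to follow the common Planet-competition
# recipe: coordinate ascent over one per-class threshold at a time, keeping
# the value that maximizes the sample-averaged F2 score. A hedged sketch:
import numpy as np
from sklearn.metrics import fbeta_score

def optimise_f2_thresholds_sketch(y, p, resolution=100):
    n_classes = y.shape[1]
    x = [0.2] * n_classes  # a common starting point
    for i in range(n_classes):
        best_t, best_score = x[i], 0.0
        for step in range(resolution):
            t = step / float(resolution)
            x[i] = t
            p2 = (p > np.array(x)).astype(int)
            score = fbeta_score(y, p2, beta=2, average='samples')
            if score > best_score:
                best_t, best_score = t, score
        x[i] = best_t
    return x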
def save(self, filedir, filename, master_params):
    try:
        os.makedirs(filedir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    return utils.save_array(filedir + filename, master_params)
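# Usage sketch (path and filename are illustrative). Note that filedir must
# end with a path separator, since save() concatenates the two arguments
# rather than joining them:
# saver.save('checkpoints/run1/', 'master_params.bc', master_params)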
def calc_val_feats():
    print("===== (VALID) Precalc validation conv features =====")
    pcf = PrecalcFeats()
    batches = create_batches('data/valid/', shuffle=False, use_da=False)
    print(" (precalc) calculating features...")
    feats = pcf.calc_feats_on_batch(batches)
    labels = to_categorical(batches.classes)
    # save
    labels_file = "data/results/conv_val_labels.h5"
    feats_file = "data/results/conv_val_feats.h5"
    save_array(labels_file, labels)
    save_array(feats_file, feats)
    print(" (precalc) feats: %s" % (feats.shape, ))
    print(" (precalc) saved feats to: %s" % feats_file)
    print(" (precalc) saved labels to: %s" % labels_file)
def precalculate_conv_output(model, train_batches, valid_batches):
    click.echo('Precalculating convolutional layer outputs...')
    train_features = model.predict_generator(train_batches,
                                             train_batches.nb_sample)
    click.echo('train_features shape: %s' % (train_features.shape, ))
    valid_features = model.predict_generator(valid_batches,
                                             valid_batches.nb_sample)
    click.echo('valid_features shape: %s' % (valid_features.shape, ))
    click.echo('Saving data...')
    utils.save_array(os.path.join(MODEL_PATH, 'train_convlayer_features.bc'),
                     train_features)
    utils.save_array(os.path.join(MODEL_PATH, 'valid_convlayer_features.bc'),
                     valid_features)
    return train_features, valid_features
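# Hedged follow-on sketch: precomputed conv features are normally fed to a
# small dense "top" model trained on its own, which is far cheaper than
# backpropagating through the convolutional layers. Layer sizes and the
# class count below are illustrative.
from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout

def build_top_model(input_shape, nb_classes=2):
    model = Sequential([
        Flatten(input_shape=input_shape),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(nb_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# top = build_top_model(train_features.shape[1:])
# top.fit(train_features, train_labels,
#         validation_data=(valid_features, valid_labels))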
def calc_train_da_feats():
    nb_augm = 5
    print("===== (TRAIN) Precalc data-augmented conv features =====")
    pcf = PrecalcFeats()
    for aug in range(nb_augm):
        print("===== data-aug: %d =====" % aug)
        batches = create_batches('data/train/', shuffle=False, use_da=True)
        print(" (precalc) calculating features...")
        feats = pcf.calc_feats_on_batch(batches)
        labels = to_categorical(batches.classes)
        # save
        labels_file = "data/results/da%d_conv_labels.h5" % aug
        feats_file = "data/results/da%d_conv_feats.h5" % aug
        save_array(labels_file, labels)
        save_array(feats_file, feats)
        print(" (precalc) feats: %s" % (feats.shape, ))
        print(" (precalc) saved feats to: %s" % feats_file)
        print(" (precalc) saved labels to: %s" % labels_file)
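# test_ensemble() further down loads "data/results/da%d_conv_test_feats.h5",
# which implies a test-side counterpart of the function above; a hedged
# sketch mirroring calc_train_da_feats():
def calc_test_da_feats():
    nb_augm = 5
    pcf = PrecalcFeats()
    for aug in range(nb_augm):
        batches = create_batches('data/test/', shuffle=False, use_da=True)
        feats = pcf.calc_feats_on_batch(batches)
        save_array("data/results/da%d_conv_test_feats.h5" % aug, feats)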
def ensemble(model_name, file_name, tta=False):
    preds_raw = []
    model = create_model(model_name)
    test_set = data_loader.get_test_set()
    # With test-time augmentation, average predictions over 20 passes.
    # NOTE: the original only built a loader in the TTA branch, leaving
    # `loader` undefined when tta=False; build it in both cases instead.
    rounds = 20 if tta else 1
    loader = data_loader.get_test_loader(model, test_set, tta=tta)
    for index in range(rounds):
        predictions = np.array(make_preds(model, loader))
        preds_raw.append(predictions)
    preds = np.mean(preds_raw, axis=0)
    save_array(settings.PREDICT_DIR + os.sep + file_name, preds)
def plot(self, figsize=(12, 8)):
    fig = plt.figure(figsize=figsize)
    plt.ylabel("loss", fontsize=16)
    plt.xlabel("learning rate (log scale)", fontsize=16)
    plt.xscale("log")
    plt.tick_params(axis='x', which='minor')
    # Drop the first 10 and last 5 points, where the LR sweep is noisy.
    plt.plot(self.lr_history[10:-5], self.loss_history[10:-5])
    utils.save_array(
        f'../experiment/lr_find_edsr2'
        f'/lr_history_steps_{self.steps}_epoch{self.epoch}.bc',
        self.lr_history[10:-5])
    utils.save_array(
        f'../experiment/lr_find_edsr2'
        f'/loss_history_steps_{self.steps}_epoch{self.epoch}.bc',
        self.loss_history[10:-5])
    plt.savefig(
        f'../experiment/lr_find_edsr2/lr_find_steps_'
        f'{self.steps}_epoch{self.epoch}.png',
        bbox_inches='tight')
    plt.show()
def ensemble():
    preds_raw = []
    for match_str in w_file_matcher:
        os.chdir(MODEL_DIR)
        w_files = glob.glob(match_str)
        for w_file in w_files:
            full_w_file = MODEL_DIR + '/' + w_file
            # The architecture is encoded as the first token of the weight
            # filename, e.g. "res50_..." -> create_model('res50').
            mname = w_file.split('_')[0]
            print(full_w_file)
            model = create_model(mname)
            model.load_state_dict(torch.load(full_w_file))
            pred = make_preds(model)
            pred = np.array(pred)
            preds_raw.append(pred)
            del model
    save_array(PRED_FILE_RAW, preds_raw)
    preds = np.mean(preds_raw, axis=0)
    save_array(PRED_FILE, preds)
def ensemble_val_data():
    preds_raw = []
    labels = []
    for match_str in w_file_matcher:
        os.chdir(MODEL_DIR)
        w_files = glob.glob(match_str)
        for w_file in w_files:
            full_w_file = MODEL_DIR + '/' + w_file
            mname = w_file.split('_')[0]
            print(full_w_file)
            model = create_model(mname)
            model.load_state_dict(torch.load(full_w_file))
            pred, y = make_preds_val(model)
            preds_raw.append(pred)
            labels.append(y)
            del model
    save_array(PRED_VAL_RAW, preds_raw)
    preds = np.mean(preds_raw, axis=0)
    save_array(PRED_VAL, preds)
    save_array(VAL_LABELS, labels)
    return preds, labels
def ensemble():
    preds_raw = []
    os.chdir(MODEL_DIR)
    for match_str in w_file_matcher:
        w_files = glob.glob(match_str)
        for w_file in w_files:
            full_w_file = MODEL_DIR + '/' + w_file
            if w_file.startswith('dense161'):
                model, _ = create_dense161()
            elif w_file.startswith('dense169'):
                model, _ = create_dense169()
            elif w_file.startswith('dense201'):
                model, _ = create_dense201()
            elif w_file.startswith('res50'):
                model, _ = create_res50()
            elif w_file.startswith('res101'):
                model, _ = create_res101()
            elif w_file.startswith('res152'):
                model, _ = create_res152()
            elif w_file.startswith('vgg16'):
                model, _ = create_vgg16()
            elif w_file.startswith('vgg19'):
                model, _ = create_vgg19()
            elif w_file.startswith('inceptionv3'):
                model, _ = create_inceptionv3()
            else:
                # Skip unrecognized weight files; falling through would
                # reuse (or fail to define) `model`.
                continue
            model.load_state_dict(torch.load(full_w_file))
            print(full_w_file)
            pred = make_preds(model, test_loader)
            pred = np.array(pred)
            preds_raw.append(pred)
            del model
    save_array(PRED_FILE_RAW, preds_raw)
    preds = np.mean(preds_raw, axis=0)
    save_array(PRED_FILE, preds)
def test_ensemble(models):
    nb_test_samples = 1000
    nb_classes = 8
    nb_augmentations = 5
    preds = np.zeros((nb_test_samples, nb_classes))
    for test_run in range(nb_augmentations):
        # make test batch randomly with data aug
        print("====== data-aug test batch: %d ======" % test_run)
        test_batches = models[0].create_test_batches(use_da=True)
        preds_aug = np.zeros((nb_test_samples, nb_classes))
        for ind, m in enumerate(models):
            print("====== running test model: %d ======" % ind)
            _preds = m.test_on_batch(test_batches)
            preds_aug = preds_aug + _preds
        preds_aug /= len(models)
        preds = preds + preds_aug
    preds /= nb_augmentations
    save_array('data/results/ensemble_dn512_ep20_da_test_preds.h5', preds)
    return preds
def test_ensemble(models):
    nb_test_samples = 1000
    nb_classes = 8
    nb_augmentations = 5
    preds = np.zeros((nb_test_samples, nb_classes))
    for test_run in range(nb_augmentations):
        # make test batch randomly with data aug
        print("====== data-aug test batch: %d ======" % test_run)
        test_batches = create_batches('data/test/', shuffle=False,
                                      use_da=True)
        preds_aug = np.zeros((nb_test_samples, nb_classes))
        for ind, m in enumerate(models):
            print("====== running test model: %d ======" % ind)
            _preds = m.test_on_batch(test_batches)
            preds_aug = preds_aug + _preds
        preds_aug /= len(models)
        preds = preds + preds_aug
    preds /= nb_augmentations
    save_array('submits/resnet_ft_ens_preds.gz', preds)
    return preds
def test_ensemble(models):
    nb_test_samples = 1000
    nb_classes = 8
    nb_augmentations = 5
    preds = np.zeros((nb_test_samples, nb_classes))
    for test_run in range(nb_augmentations):
        # make test batch randomly with data aug
        print("====== data-aug test batch: %d ======" % test_run)
        preds_aug = np.zeros((nb_test_samples, nb_classes))
        conv_test_feat = load_array("data/results/da%d_conv_test_feats.h5" %
                                    test_run)
        for ind, m in enumerate(models):
            print("====== running test model: %d ======" % ind)
            _preds = m.test(conv_test_feat)
            preds_aug = preds_aug + _preds
        preds_aug /= len(models)
        preds = preds + preds_aug
    preds /= nb_augmentations
    save_array('data/results/ensemble_dense_preds.h5', preds)
    return preds
def save_pseudo_if_needed(y_pseudo, split, dataset, epoch, config, is_best,
                          force_save=False):
    if (not config.save_pseudo_step) or (y_pseudo is None) or (
            split not in config.save_splits):
        return
    prefix = get_pred_prefix(dataset, config)
    if config.algorithm == 'NoisyStudent':
        # Save on the first epoch only; pseudolabels are constant.
        save_array(y_pseudo, prefix + 'pseudo.csv')
    else:
        if force_save or (config.save_pseudo_step is not None and
                          (epoch + 1) % config.save_pseudo_step == 0):
            save_array(y_pseudo, prefix + f'epoch:{epoch}_pseudo.csv')
        if config.save_last:
            save_array(y_pseudo, prefix + 'epoch:last_pseudo.csv')
        if config.save_best and is_best:
            save_array(y_pseudo, prefix + 'epoch:best_pseudo.csv')
def save_pred_if_needed(y_pred, split, dataset, epoch, config, is_best,
                        force_save=False):
    if (not config.save_pred_step) or (split not in config.save_splits):
        return
    prefix = get_pred_prefix(dataset, config)
    if force_save or (config.save_pred_step is not None and
                      (epoch + 1) % config.save_pred_step == 0):
        save_array(y_pred, prefix + f'epoch:{epoch}_pred.csv')
    if config.save_last:
        save_array(y_pred, prefix + 'epoch:last_pred.csv')
    if config.save_best and is_best:
        save_array(y_pred, prefix + 'epoch:best_pred.csv')
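# Hedged usage sketch: both savers are meant to be called once per epoch
# from an evaluation loop; the variable names here are illustrative.
# for epoch in range(config.n_epochs):
#     ...train, evaluate, compute y_pred and is_best...
#     save_pred_if_needed(y_pred, 'val', datasets['val'], epoch, config,
#                         is_best)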
if use_ti:
    feature_names.extend(feature_names_ti)
    X_train.append(X_ti_train)
    X_test.append(X_ti_test)

feature_names = np.array(feature_names)
X_train = hstack(X_train)
X_test = hstack(X_test)

print("done assembling features in %fs" % (time() - t0))

# <codecell>

# Models we will use
X_train = X_train.todense()
X_test = X_test.todense()
utils.save_array("%s/X_test" % dataset_version, X_test)

fitted = []


def grid_search(estimator):
    pipeline = estimator[1]
    param_grid = [estimator[2]]
    print("Performing grid search for %s %s" % (t_name, e_name))
    pprint(param_grid)
    clf = GridSearchCV(pipeline, param_grid, n_jobs=-1, verbose=0,
                       scoring=scoring)
dsets_v3 = {
    x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms_v3[x])
    for x in ['train', 'valid']
}
dset_loaders_v3 = {
    x: torch.utils.data.DataLoader(dsets_v3[x],
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=4)
    for x in ['train', 'valid']
}
dset_sizes = {x: len(dsets[x]) for x in ['train', 'valid']}
dset_classes = dsets['train'].classes

save_array(CLASSES_FILE, dset_classes)

use_gpu = torch.cuda.is_available()

w_files_training = []


def save_weights(acc, model, epoch, max_num=3):
    f_name = '{}_{}_{:.5f}.pth'.format(model.name, epoch, acc)
    w_file_path = os.path.join(MODEL_DIR, f_name)
    # Keep at most max_num weight files; below the cap, simply save.
    if len(w_files_training) < max_num:
        w_files_training.append((acc, w_file_path))
        torch.save(model.state_dict(), w_file_path)
        return
    min = 10.0
    index_min = -1
ti_diffs = np.abs(tfidf_ti.transform(pairs[ti + "_a"]) -
                  tfidf_ti.transform(pairs[ti + "_b"]))

comps_diffs_df = pairs.apply(utils.compare, axis=1)
# Standardize the comparison features.
comps_diffs = comps_diffs_df - comps_diffs_df.mean()
comps_diffs = np.array(comps_diffs / comps_diffs.std())
y = np.array(pairs[un + "_a"] == pairs[un + "_b"])

feature_names_comps = np.array(comps_diffs_df.columns.values)
feature_names_ab = np.array(tfidf_ab.get_feature_names())
feature_names_ti = np.array(tfidf_ti.get_feature_names())

if not os.path.exists(dataset_version):
    os.makedirs(dataset_version)
utils.save_csr("%s/X_ab" % dataset_version, ab_diffs)
utils.save_csr("%s/X_ti" % dataset_version, ti_diffs)
utils.save_array("%s/X_comps" % dataset_version, comps_diffs)
utils.save_array("%s/y" % dataset_version, y)
utils.save_array("%s/feature_names_ab" % dataset_version, feature_names_ab)
utils.save_array("%s/feature_names_ti" % dataset_version, feature_names_ti)
utils.save_array("%s/feature_names_comps" % dataset_version,
                 feature_names_comps)
# NOTE: the original pickled this frame to "feature_names_comps", clobbering
# the array saved just above; write it to a distinct (illustrative) name.
pairs[['pmid_a', 'pmid_b', un + '_a', un + '_b']].to_pickle(
    "%s/pair_ids" % dataset_version)

text_file = open("%s/description.txt" % dataset_version, "w")
text_file.write(dataset_description)
text_file.close()
print("done with %s" % dataset_version)
from mnist_sequence_api import MNIST_Sequence_API
import numpy as np
from utils import save_array, load_array

seq_len = 5  # generate sequences of this length
api_object = MNIST_Sequence_API()


def generate_data(n, seq_len, image_width, spacing_range=(0, 0)):
    inputs = []
    labels = []
    for i in range(n):
        seq_values = np.random.randint(0, 10, seq_len)
        seq = api_object.generate_mnist_sequence(seq_values, spacing_range,
                                                 image_width)
        seq = (255 - seq) / 255  # normalize the data
        inputs.append(seq)
        labels.append(seq_values)
    return np.array(inputs), np.array(labels)


n_train = 500
inputs, labels = generate_data(n_train, seq_len, 28 * seq_len)
save_array(inputs, "data/train_inputs.bc")
save_array(labels, "data/train_labels.bc")

n_validation = 250
inputs, labels = generate_data(n_validation, seq_len, 28 * seq_len)
save_array(inputs, "data/test_inputs.bc")
save_array(labels, "data/test_labels.bc")
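# Hedged round-trip check: reload the saved arrays and inspect their shapes
# (28-pixel-tall digits laid out side by side would give 28 x 28*seq_len
# images, but the exact layout depends on the API):
train_inputs = load_array("data/train_inputs.bc")
train_labels = load_array("data/train_labels.bc")
print(train_inputs.shape)  # roughly (n_train, 28, 28 * seq_len)
print(train_labels.shape)  # (n_train, seq_len)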
def run_active_learning(selection_fn, datasets, grouper, config,
                        general_logger, full_dataset=None):
    label_manager = datasets[config.target_split]['label_manager']

    # First run the selection function
    selection_fn.select_and_reveal(label_manager=label_manager,
                                   K=config.n_shots)
    general_logger.write(
        f"Total Labels Revealed: {label_manager.num_labeled}\n")

    # Concatenate labeled source examples to labeled target examples
    if config.use_source_labeled:
        assert full_dataset is not None
        # We allow optionally ignoring the target examples entirely
        if not config.use_target_labeled:
            indices = datasets['train']['dataset'].indices
        else:
            indices = np.concatenate(
                (label_manager.labeled_indices,
                 datasets['train']['dataset'].indices)).astype(
                     int)  # target points at front
        labeled_dataset = WILDSSubset(full_dataset, indices,
                                      label_manager.labeled_train_transform)
    else:
        labeled_dataset = label_manager.get_labeled_subset()

    if config.upsample_target_labeled:
        # Upsample target labels (compared to src labels) using a weighted
        # sampler; do this by grouping by split and then using
        # --uniform_over_groups=True
        labeled_grouper = CombinatorialGrouper(dataset=full_dataset,
                                               groupby_fields=['split'])
        labeled_config = copy(config)
        labeled_config.uniform_over_groups = True
    else:
        labeled_config = config
        labeled_grouper = grouper

    # Dump unlabeled indices to file
    save_array(label_manager.unlabeled_indices,
               csv_path=f'{config.log_dir}/unlabeled_test_ids.csv')

    # Add new splits to datasets dict
    ## Training Splits
    ### Labeled test
    datasets[f'labeled_{config.target_split}'] = configure_split_dict(
        data=labeled_dataset,
        split=f'labeled_{config.target_split}',
        split_name=f'labeled_{config.target_split}',
        get_train=True,
        verbose=True,
        grouper=labeled_grouper,
        batch_size=config.batch_size,
        config=labeled_config)
    ### Unlabeled test
    datasets[
        f'unlabeled_{config.target_split}_augmented'] = configure_split_dict(
            data=label_manager.get_unlabeled_subset(train=True),
            split=f"unlabeled_{config.target_split}_augmented",
            split_name=f"unlabeled_{config.target_split}_augmented",
            get_train=True,
            get_eval=True,
            grouper=grouper,
            batch_size=config.unlabeled_batch_size,
            verbose=True,
            config=config)
    ## Eval Splits
    ### Unlabeled test, eval transform
    datasets[f'unlabeled_{config.target_split}'] = configure_split_dict(
        data=label_manager.get_unlabeled_subset(train=False,
                                                return_pseudolabels=False),
        split=f"unlabeled_{config.target_split}",
        split_name=f"unlabeled_{config.target_split}",
        get_eval=True,
        grouper=None,
        verbose=True,
        batch_size=config.unlabeled_batch_size,
        config=config)

    ## Special de-duplicated eval set for fmow
    if config.dataset == 'fmow':
        disjoint_unlabeled_indices = fmow_deduplicate_locations(
            negative_indices=label_manager.labeled_indices,
            superset_indices=label_manager.unlabeled_indices,
            config=config)
        save_array(disjoint_unlabeled_indices,
                   csv_path=f'{config.log_dir}/disjoint_ids.csv')
        # Build disjoint split
        disjoint_eval_dataset = WILDSSubset(full_dataset,
                                            disjoint_unlabeled_indices,
                                            label_manager.eval_transform)
        datasets[
            f'unlabeled_{config.target_split}_disjoint'] = configure_split_dict(
                data=disjoint_eval_dataset,
                split=f'unlabeled_{config.target_split}_disjoint',
                split_name=f'unlabeled_{config.target_split}_disjoint',
                get_eval=True,
                grouper=None,
                verbose=True,
                batch_size=config.unlabeled_batch_size,
                config=config)

    # Save NoisyStudent pseudolabels initially
    if config.algorithm == 'NoisyStudent':
        save_pseudo_if_needed(label_manager.unlabeled_pseudolabel_array,
                              f'unlabeled_{config.target_split}',
                              datasets[f'unlabeled_{config.target_split}'],
                              None, config, None)
        if f'unlabeled_{config.target_split}_disjoint' in datasets:
            save_pseudo_if_needed(
                label_manager.unlabeled_pseudolabel_array[[
                    label_manager.unlabeled_indices.index(i)
                    for i in disjoint_unlabeled_indices
                ]], f'unlabeled_{config.target_split}_disjoint',
                datasets[f'unlabeled_{config.target_split}_disjoint'], None,
                config, None)

    # Return the names of the train split and the unlabeled split
    return (f'labeled_{config.target_split}',
            f"unlabeled_{config.target_split}_augmented")
if not loaded:
    train_text = utils.read_text(args.d)
    utils.logger.info("train text reading finished")
    if args.use_d2v:
        train_tokens = utils.tokenize_paragraph_d2v(train_text)
        utils.logger.info("train text tokenizing finished")
        train_data = utils.compute_paragraph_doc2vec(
            train_tokens,
            vector_size=args.vector_size,
            model_path=args.dm,
            load_model=True,
            predict=True)
        utils.logger.info("train data doc2vec computing finished")
        if utils.is_path_creatable(args.dd):
            utils.save_array(args.dd, train_data)
            utils.logger.info("saved doc2vec train data successfully")
    elif args.use_w2v:
        train_tokens = utils.tokenize_paragraph_w2v(train_text)
        utils.logger.info("train text tokenizing finished")
        train_data = utils.compute_paragraph_word2vec(
            train_tokens,
            vector_size=args.vector_size,
            model_path=args.wm,
            load_model=True,
            predict=True)
        utils.logger.info("train data word2vec computing finished")
        if utils.is_path_creatable(args.wd):
            utils.save_array(args.wd, train_data)
            utils.logger.info("saved word2vec train data successfully")
ab_diffs = np.abs(tfidf_ab.transform(pairs[ab + "_a"]) -
                  tfidf_ab.transform(pairs[ab + "_b"]))
ti_diffs = np.abs(tfidf_ti.transform(pairs[ti + "_a"]) -
                  tfidf_ti.transform(pairs[ti + "_b"]))

comps_diffs_df = pairs.apply(utils.compare, axis=1)
# Standardize the comparison features.
comps_diffs = comps_diffs_df - comps_diffs_df.mean()
comps_diffs = np.array(comps_diffs / comps_diffs.std())
y = np.array(pairs[un + "_a"] == pairs[un + "_b"])

feature_names_comps = np.array(comps_diffs_df.columns.values)
feature_names_ab = np.array(tfidf_ab.get_feature_names())
feature_names_ti = np.array(tfidf_ti.get_feature_names())

if not os.path.exists(dataset_version):
    os.makedirs(dataset_version)
utils.save_csr("%s/X_ab" % dataset_version, ab_diffs)
utils.save_csr("%s/X_ti" % dataset_version, ti_diffs)
utils.save_array("%s/X_comps" % dataset_version, comps_diffs)
utils.save_array("%s/y" % dataset_version, y)
utils.save_array("%s/feature_names_ab" % dataset_version, feature_names_ab)
utils.save_array("%s/feature_names_ti" % dataset_version, feature_names_ti)
utils.save_array("%s/feature_names_comps" % dataset_version,
                 feature_names_comps)
# NOTE: as above, the original pickled this frame to "feature_names_comps",
# clobbering the saved array; use a distinct (illustrative) name instead.
pairs[['pmid_a', 'pmid_b', un + '_a', un + '_b']].to_pickle(
    "%s/pair_ids" % dataset_version)

text_file = open("%s/description.txt" % dataset_version, "w")
text_file.write(dataset_description)
text_file.close()
print("done with %s" % dataset_version)
train['LATITUDE'] = pd.Series([
    np.array([point[1] for point in poly], dtype=np.float32)
    for poly in polyline
])

# In[150]:

train['LONGITUDE'] = pd.Series([
    np.array([point[0] for point in poly], dtype=np.float32)
    for poly in polyline
])

# In[157]:

utils.save_array(data_path + 'train/train.bc', train.as_matrix())

# In[158]:

utils.save_array(data_path + 'train/meta_train.bc', meta.as_matrix())

# ## Further Feature Engineering

# After converting the 'csv_to_hdf5.py' functionality to pandas, I saved
# that array and then constructed the rest of the features as specified in
# the paper using pandas. I didn't bother looking into how the author did
# it, as it was extremely obtuse and involved the fuel module.

# In[424]:

train = pd.DataFrame(utils.load_array(data_path + 'train/train.bc'),
                     columns=[
                         'TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL',
                         'ORIGIN_STAND', 'TAXI_ID', 'TIMESTAMP', 'DAY_TYPE',
                         'MISSING_DATA',