def eval_loss_cpu_thread(tuple_shape):
    global EVAL_LOSS_CPU_IN_QUEUE
    global EVAL_LOSS_GPU_IN_QUEUE
    current_epoch = EPOCH
    meta = load_csv(
        os.path.join(SHUFFLED_ROOT,
                     '{}_{:03d}.csv'.format(OTHER_REF_SET, current_epoch)))
    xy = get_xy(meta)
    ref_tree = KDTree(xy)
    yaw = np.array(meta['yaw'], dtype=float)
    while True:
        t = time()
        original_indices = EVAL_LOSS_CPU_IN_QUEUE.get()
        if EPOCH != current_epoch:
            current_epoch = EPOCH
            meta = load_csv(
                os.path.join(
                    SHUFFLED_ROOT,
                    '{}_{:03d}.csv'.format(OTHER_REF_SET, current_epoch)))
            xy = get_xy(meta)
            ref_tree = KDTree(xy)
            yaw = np.array(meta['yaw'], dtype=float)
        distances, image_info, used_indices = get_tuple(
            original_indices, tuple_shape, False, meta, xy, yaw, ref_tree)
        if len(image_info) == TUPLES_PER_BATCH * sum(tuple_shape):
            images = load_images(image_info)
            EVAL_LOSS_GPU_IN_QUEUE.put((distances, images), block=True)
        EVAL_LOSS_CPU_IN_QUEUE.task_done()
        print('Loaded eval loss tuples in {}s.'.format(time() - t))
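# Note: get_xy() is used throughout this module but not defined in this section.
# A minimal sketch of the assumed behaviour (hypothetical, not necessarily the
# original implementation): it reads the 'easting'/'northing' columns from the
# meta dict returned by load_csv() and returns an N x 2 float array. The column
# order (easting, northing) is an assumption.
def get_xy(meta):
    """Return an N x 2 array of (easting, northing) coordinates as floats."""
    return np.array([(float(e), float(n))
                     for e, n in zip(meta['easting'], meta['northing'])])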
def sample_anchors(shuffled_root, cluster_root, out_root, s, mode, r, epoch):
    train_meta = load_csv(
        os.path.join(shuffled_root, '{}_{}_{:03d}.csv'.format(s, mode, epoch)))
    train_xy = get_xy(train_meta)
    out_file = os.path.join(out_root, '{}_{}_{}_{:03d}.csv'.format(s, mode, r, epoch))
    if not os.path.exists(out_file):
        ref_meta = load_csv(
            os.path.join(cluster_root, '{}_{}_{}.csv'.format(s, mode, r)))
        ref_xy = get_xy(ref_meta)

        # Sample reference images (random image within r/2 of each reference location)
        ref_tree = KDTree(train_xy)
        ref_neighbors = ref_tree.query_radius(ref_xy, r=1, return_distance=False)
        anchors = [np.random.choice(potential_anchors)
                   for potential_anchors in ref_neighbors]
        np.random.shuffle(anchors)
        anchor_indices = {'idx': anchors}
        save_csv(anchor_indices, out_file)
    else:
        anchor_indices = load_csv(out_file)
    anchor_xy = np.array([train_xy[int(i), :] for i in anchor_indices['idx']])
    out_img = os.path.join(out_root, '{}_{}_{}_{}.png'.format(s, mode, r, epoch))
    plt.clf()
    f, (ax1) = plt.subplots(1, 1, sharey=False)
    f.set_figheight(50)
    f.set_figwidth(50)
    ax1.scatter(anchor_xy[:, 0], anchor_xy[:, 1], c=np.arange(len(anchor_xy)))
    plt.savefig(out_img)
def get_l_based_fixed_localization_reference(in_root, out_root, s, r):
    out_txt = os.path.join(out_root, '{}_ref_l_{}.txt'.format(s, int(r)))
    out_csv = os.path.join(out_root, '{}_ref_l_{}.csv'.format(s, int(r)))
    if not os.path.exists(out_csv):
        meta = load_csv(os.path.join(
            in_root, '{}_ref.csv'.format(s)))  # Not using query locations for this
        l = np.array(meta['l'], dtype=float).reshape(-1, 1)
        ll = np.arange(math.floor(l[-1]), step=r).reshape(-1, 1)
        l_tree = KDTree(l)
        i_l = l_tree.query(ll, return_distance=False, k=1)
        i_l = np.squeeze(i_l)
        save_txt('\n'.join(['{}'.format(i) for i in i_l]), out_txt)
        selected_meta = dict()
        for key in meta.keys():
            selected_meta[key] = [meta[key][i] for i in i_l]
        save_csv(selected_meta, out_csv)
    else:
        selected_meta = load_csv(out_csv)
    out_folder = os.path.join(out_root, '{}_ref_l_{}'.format(s, int(r)))
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)
    for i, (d, f, t) in tqdm(
            enumerate(zip(selected_meta['date'], selected_meta['folder'],
                          selected_meta['t']))):
        f = int(f)
        img = load_img(img_path((d, f, t)))
        save_img(img, os.path.join(
            out_folder, '{:04d}_{}_{:02d}_{}.png'.format(i, d, f, t)))
def evaluate_localization(global_step, ref_set_name, query_set_name, mode,
                          out_name, tuple_shape, writer, epoch):
    # Get ref features
    ref_meta = load_csv(
        os.path.join(LOC_REF_ROOT, '{}_{}.csv'.format(ref_set_name, EVAL_REF_R)))
    num_ref = len(ref_meta['t'])
    padding = np.zeros(TUPLES_PER_BATCH * sum(tuple_shape) -
                       (num_ref % (TUPLES_PER_BATCH * sum(tuple_shape))),
                       dtype=int)
    ref_image_info = [(ref_meta['date'][i], ref_meta['folder'][i], ref_meta['t'][i])
                      for i in np.concatenate((np.arange(num_ref), padding))]
    ref_features = extract_features(ref_image_info, tuple_shape)
    ref_features = np.array(ref_features[0:num_ref])
    ref_image_info = ref_image_info[0:num_ref]
    ref_xy = get_xy(ref_meta)

    # Get query features
    query_meta = load_csv(
        os.path.join(SHUFFLED_ROOT, '{}_{:03d}.csv'.format(query_set_name, epoch)))
    test_number = (global_step // EVAL_STEP)
    query_indices = np.arange(test_number * NUM_EVAL_QUERIES,
                              (test_number + 1) * NUM_EVAL_QUERIES) \
        % len(query_meta['t'])
    padding = np.zeros(TUPLES_PER_BATCH * sum(tuple_shape) -
                       (NUM_EVAL_QUERIES % (TUPLES_PER_BATCH * sum(tuple_shape))),
                       dtype=int)
    query_image_info = [(query_meta['date'][i], query_meta['folder'][i], query_meta['t'][i])
                        for i in np.concatenate((query_indices, padding))]
    query_features = np.array(extract_features(
        query_image_info, tuple_shape))[:len(query_indices), :]
    query_xy = np.array(
        [xy for i, xy in enumerate(get_xy(query_meta)) if i in query_indices])

    ref_feature_tree = KDTree(ref_features)
    nearest_latent_dists, nearest_latent_indices = ref_feature_tree.query(
        query_features, k=5)
    ref_xy_tree = KDTree(ref_xy)
    nearest_d_dist, nearest_d_indices = ref_xy_tree.query(query_xy, k=1)

    # CPU part of evaluation is done asynchronously
    worker = Thread(target=evaluate_localization_thread,
                    args=(global_step, mode, nearest_d_dist, nearest_d_indices,
                          nearest_latent_indices, out_name, query_image_info,
                          query_xy, ref_image_info, ref_xy, writer))
    worker.setDaemon(True)
    worker.start()
    return
def create_reference(s):
    date = getattr(sys.modules[__name__], '{}_ref_date'.format(s))
    out_file = os.path.join(out_root, '{}_{}_geodesic.csv'.format(s, date))
    if not os.path.exists(out_file):
        data = load_csv(os.path.join(in_root, 'clean_{}.csv'.format(s)))
        ref_data = dict()
        for key in data.keys():
            ref_data[key] = [
                e for e, d in zip(data[key], data['date']) if d == date
            ]
        ref_xy = [(float(x), float(y))
                  for x, y in zip(ref_data['easting'], ref_data['northing'])]
        ref_d = [0] + [
            math.sqrt((p[0] - q[0])**2 + (p[1] - q[1])**2)
            for p, q in zip(ref_xy[1:], ref_xy[:-1])
        ]
        ref_l = [sum(ref_d[:i]) for i in range(1, len(ref_data['date']) + 1)]
        vmin = min(ref_l)
        vmax = max(ref_l)
        ref_data['l'] = ref_l
        ref_yaw = np.array(ref_data['yaw'], dtype=float)
        plot_results(ref_xy, ref_yaw, ref_l, date, ref_data, s, vmin, vmax)
        save_csv(ref_data, out_file)
def get_tags(tag_root):
    tags = dict()
    all_tags = []
    for date in os.listdir(tag_root):
        tags[date] = load_csv(os.path.join(tag_root, date, 'tags.csv'))
        all_tags = list(set(all_tags + tags[date]))
    return tags, all_tags
def merge_dates(in_root, ins_root, out_root):
    # Find all dates with INS data (not all images have INS, but all INS should have images)
    all_dates = sorted(
        os.listdir(ins_root))  # Sort to make sure we always get the same order
    first = True
    all_info = dict()
    for date in all_dates:
        split_file = os.path.join(in_root, '{}.csv'.format(date))
        if not os.path.exists(split_file):
            print('Missing {}.'.format(split_file))
            continue
        date_info = load_csv(split_file)

        # Add date column
        num_entries = len(date_info['easting'])
        rep_date = [date] * num_entries
        date_info['date'] = rep_date

        if first:
            all_info = date_info
            first = False
        else:
            for key in all_info.keys():
                all_info[key] = all_info[key] + date_info[key]
    out_file = os.path.join(out_root, 'merged.csv')
    save_csv(all_info, out_file)
def get_eval_loss(global_step, test_writer, epoch):
    meta = load_csv(
        os.path.join(SHUFFLED_ROOT, '{}_{:03d}.csv'.format(OTHER_REF_SET, epoch)))
    test_number = (global_step // EVAL_STEP)
    actual_num_eval_queries = (NUM_EVAL_QUERIES // TUPLES_PER_BATCH) * TUPLES_PER_BATCH
    test_indices = np.arange(
        test_number * actual_num_eval_queries,
        (test_number + 1) * actual_num_eval_queries) % len(meta['t'])
    batched_indices = np.reshape(test_indices, (-1, TUPLES_PER_BATCH))

    # Start queues
    for index_batch in batched_indices:
        EVAL_LOSS_CPU_IN_QUEUE.put(index_batch)

    # Wait for completion & order output
    EVAL_LOSS_CPU_IN_QUEUE.join()
    EVAL_LOSS_GPU_IN_QUEUE.join()
    eval_losses = list(EVAL_LOSS_GPU_OUT_QUEUE.queue)
    EVAL_LOSS_GPU_OUT_QUEUE.queue.clear()
    if len(eval_losses) > 0:
        summary = tf.Summary()
        loss = np.mean(eval_losses)
        summary.value.add(tag='loss', simple_value=loss)
        log('Other region loss: {}'.format(loss))
        test_writer.add_summary(summary, global_step)
    else:
        log('Evaluated but got no valid losses.')
def downsize_images(task_id, max_side, img_root, ins_root, tar_root,
                    out_img_root, out_root, cams):
    # Find all dates with INS data (not all images have INS, but all INS should have images)
    all_dates = sorted(
        os.listdir(ins_root))  # Sort to make sure we always get the same order
    date = all_dates[int(task_id) - 1]
    print(date)
    out_file = os.path.join(out_root, 'img_info_{}'.format(max_side),
                            '{}.csv'.format(date))
    if os.path.exists(out_file):
        print('Output already exists.')
        return
    imgs = load_csv(os.path.join(img_root, date, 'stereo.timestamps'),
                    has_header=False, delimiter=' ', keys=['t', 'folder'])
    cam = oxford_camera.CameraModel(cams, '/stereo/centre/')
    exposures = [0] * len(imgs['t'])
    max_folder = max(np.array(imgs['folder'], dtype=int))
    if date == '2015-09-02-10-37-32':
        max_folder = 4  # Folders 5 and 6 are missing from the website
    imgs['t'] = [t for f, t in zip(imgs['folder'], imgs['t']) if int(f) <= max_folder]
    imgs['folder'] = [f for f in imgs['folder'] if int(f) <= max_folder]
    for folder in range(1, max_folder + 1):
        filename = os.path.join(tar_root,
                                '{}_stereo_centre_{:02d}.tar'.format(date, folder))
        print(filename)
        if not os.path.exists(filename):
            print("MISSING!!")
            save_txt(txt=filename, mode='a',
                     out_file=os.path.join(out_root, 'missing.txt'))
            continue  # Skip missing archives instead of failing on tarfile.open
        with tarfile.open(filename) as archive:
            print(archive)
            for entry in archive.getmembers():
                img_name = os.path.basename(entry.name)
                if '.png' not in img_name:
                    continue
                ts = img_name.split('.')[0]
                img_path = entry.name
                with archive.extractfile(archive.getmember(img_path)) as file:
                    timer = time.time()
                    index = imgs['t'].index(ts)  # Assuming that timestamps are not ordered
                    try:
                        img = oxford_image.load_image(file, cam)  # One file has an unloadable image...
                        img = resize_img(img, max_side)
                        exposures[index] = sum(np.array(img).flatten())
                        out_img_folder = os.path.join(
                            out_img_root,
                            '{}_stereo_centre_{:02d}'.format(date, folder))
                        if not os.path.exists(out_img_folder):
                            os.makedirs(out_img_folder)
                        out_img_path = os.path.join(out_img_folder, img_name)
                        save_img(img, out_img_path)
                        print('Processed {} in {}s.'.format(ts, time.time() - timer))
                    except Exception:
                        del exposures[index]
                        del imgs['t'][index]
                        del imgs['folder'][index]
    imgs['exposure'] = exposures
    save_csv(imgs, out_file)
def get_splits(task_id, grids, in_root, ins_root, out_root):
    # Find all dates with INS data (not all images have INS, but all INS should have images)
    all_dates = sorted(
        os.listdir(ins_root))  # Sort to make sure we always get the same order
    date = all_dates[int(task_id) - 1]
    print(date)
    out_file = os.path.join(out_root, '{}.csv'.format(date))
    if os.path.exists(out_file):
        print('Already calculated {}.'.format(out_file))
        return
    xy_file = os.path.join(in_root, '{}.csv'.format(date))
    if not os.path.exists(xy_file):
        print('Missing {}.'.format(xy_file))
        return
    xy = load_csv(xy_file)
    X = [0 if math.isnan(float(e)) else int(float(e) - 619500.0) for e in xy['easting']]
    Y = [0 if math.isnan(float(n)) else int(5736480.0 - float(n)) for n in xy['northing']]
    out_img_grid = os.path.join(out_root, '{}_grid.png'.format(date))
    draw_grid(X, Y, out_img_grid)
    out_img_scatter = os.path.join(out_root, '{}_scatter.png'.format(date))
    plt.clf()
    plt.scatter(np.array(xy['easting'], dtype=float),
                np.array(xy['northing'], dtype=float),
                c=np.array(xy['yaw'], dtype=float))
    plt.savefig(out_img_scatter)
    for grid_name in grids.keys():
        grid = cv2.imread(grids[grid_name])
        grid = np.asarray(grid, dtype=np.uint8)  # Fix for failing img loading
        in_fold = list()
        for x, y in zip(X, Y):
            if x < 0 or y < 0 or x >= grid.shape[1] or y >= grid.shape[0]:
                in_fold.append(0)
            elif grid[y, x, 0] > 0:  # All color channels are the same
                in_fold.append(1)
            else:
                in_fold.append(0)
        xy[grid_name] = in_fold
    max_assigned = [a1 + a2 + a3 for a1, a2, a3
                    in zip(xy['train'], xy['test'], xy['val'])]
    assert max(max_assigned) <= 1, 'Please increase in_fold grid threshold.'
    for grid_name in grids.keys():
        X_g = [x for x, in_fold in zip(X, xy[grid_name]) if in_fold == 1]
        Y_g = [y for y, in_fold in zip(Y, xy[grid_name]) if in_fold == 1]
        print('Found {} imgs in {} for {}.'.format(len(X_g), grid_name, date))
        out_img_file = os.path.join(out_root, '{}_{}.png'.format(date, grid_name))
        draw_grid(X_g, Y_g, out_img_file)
    save_csv(xy, out_file)
def train_cpu_thread(tuple_shape):
    global TRAIN_CPU_IN_QUEUE
    global TRAIN_GPU_IN_QUEUE
    global USED_IMAGE_LOCK
    global USED_IMAGES
    current_epoch = EPOCH
    meta = load_csv(
        os.path.join(SHUFFLED_ROOT,
                     '{}_{:03d}.csv'.format(LOCAL_REF_SET, current_epoch)))
    xy = get_xy(meta)
    ref_tree = KDTree(xy)
    yaw = np.array(meta['yaw'], dtype=float)
    while True:
        t = time()
        original_indices = TRAIN_CPU_IN_QUEUE.get()
        if EPOCH != current_epoch:
            current_epoch = EPOCH
            meta = load_csv(
                os.path.join(
                    SHUFFLED_ROOT,
                    '{}_{:03d}.csv'.format(LOCAL_REF_SET, current_epoch)))
            xy = get_xy(meta)
            ref_tree = KDTree(xy)
            yaw = np.array(meta['yaw'], dtype=float)
        distances, image_info, used_indices = get_tuple(
            original_indices, tuple_shape, True, meta, xy, yaw, ref_tree)
        if len(image_info) == TUPLES_PER_BATCH * sum(tuple_shape):
            images = load_images(image_info)
            TRAIN_GPU_IN_QUEUE.put((distances, images), block=True)
            with USED_IMAGE_LOCK:
                USED_IMAGES.update(used_indices)
        else:
            log('Faulty training batch... ')
            log(image_info)
        TRAIN_CPU_IN_QUEUE.task_done()
        print('Loaded train tuples in {}s.'.format(time() - t))
def get_greedy_fixed_localization_reference(in_root, out_root, s, r):
    out_file = os.path.join(out_root, '{}_greedy_{}_ref.txt'.format(s, r))
    if not os.path.exists(out_file):
        meta = load_csv(os.path.join(
            in_root, '{}_ref.csv'.format(s)))  # Not using query locations for this
        xy = np.array([(e, n) for e, n in zip(meta['northing'], meta['easting'])],
                      dtype=float)
        ref_ids = greedy(xy, 1)
        print(len(ref_ids))
        save_txt('\n'.join(['{}'.format(i) for i in ref_ids]), out_file)
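# Note: greedy() is not defined in this section. A minimal sketch of the assumed
# behaviour (hypothetical, not necessarily the original implementation): greedily
# keep every point that is at least min_dist away from all previously kept points,
# and return the indices of the kept points.
def greedy(xy, min_dist):
    """Greedy spatial subsampling: indices of points spaced >= min_dist apart."""
    kept = []
    for i, p in enumerate(xy):
        if all(np.linalg.norm(p - xy[j]) >= min_dist for j in kept):
            kept.append(i)
    return kept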
def infer():
    with tf.Graph().as_default() as graph:
        print("In Graph")
        ops, tuple_shape = build_inference_model()
        sess = restore_weights()

        # For better gpu utilization, loading processes and gpu inference are done in separate threads.
        # Start CPU threads
        num_loader_threads = 6
        for i in range(num_loader_threads):
            worker = Thread(target=cpu_thread)
            worker.setDaemon(True)
            worker.start()

        # Start GPU threads
        worker = Thread(target=gpu_thread, args=(sess, ops))
        worker.setDaemon(True)
        worker.start()

        csv_file = os.path.join(CSV_ROOT, '{}.csv'.format(SET))
        meta = load_csv(csv_file)
        num = len(meta['path'])

        # Clean list
        padding = [0 for i in range(IMAGES_PER_PASS - (num % IMAGES_PER_PASS))]
        image_info = [(meta['path'][i])
                      for i in np.concatenate((np.arange(num), np.array(padding)))]
        padded_num = len(image_info)
        batched_indices = np.reshape(np.arange(padded_num),
                                     (-1, TUPLES_PER_BATCH * sum(tuple_shape)))
        batched_image_info = np.reshape(
            image_info, (-1, TUPLES_PER_BATCH * sum(tuple_shape)))
        for batch_indices, batch_image_info in zip(batched_indices, batched_image_info):
            CPU_IN_QUEUE.put((batch_indices, batch_image_info))

        # Wait for completion & order output
        CPU_IN_QUEUE.join()
        GPU_IN_QUEUE.join()
        feature_pairs = list(GPU_OUT_QUEUE.queue)
        GPU_OUT_QUEUE.queue.clear()
        features = [[]] * padded_num
        for pair in feature_pairs:
            for i, f in zip(pair[0], pair[1]):
                features[i] = f
        features = features[:num]
        save_pickle(
            features, os.path.join(OUT_ROOT, '{}_{}.pickle'.format(SET, OUT_NAME)))
def cluster(in_root, out_root, s, mode, r):
    out_file = os.path.join(out_root, '{}_{}_{}.pickle'.format(s, mode, r))
    meta_file = os.path.join(in_root, '{}_{}_000.csv'.format(s, mode))
    meta = load_csv(meta_file)
    if not os.path.exists(out_file):
        date = getattr(sys.modules[__name__], '{}_ref_date'.format(s))
        temp_meta = dict()
        for key in meta.keys():
            temp_meta[key] = [
                e for e, d in zip(meta[key], meta['date']) if d in date
            ]
        t_idx = np.argsort(temp_meta['t'])
        date_meta = dict()
        for key in meta.keys():
            date_meta[key] = [temp_meta[key][i] for i in t_idx]
        print(len(date_meta['t']))
        xy = get_xy(date_meta)
        ref_xy = [xy[0, :]]
        ref_idx = [0]
        for i in tqdm(range(len(date_meta['t']))):
            if sum((xy[i, :] - ref_xy[-1])**2) > r**2:
                ref_xy.append(xy[i, :])
                ref_idx.append(i)
        ref_xy = np.array(ref_xy)
        save_pickle([ref_xy, date_meta, ref_idx], out_file)
    else:
        ref_xy, date_meta, ref_idx = load_pickle(out_file)
    print('{}: {}'.format(s, len(ref_idx)))
    out_img = os.path.join(out_root, '{}_{}_{}.png'.format(s, mode, r))
    plt.clf()
    f, (ax1) = plt.subplots(1, 1, sharey=False)
    f.set_figheight(50)
    f.set_figwidth(50)
    ax1.scatter(ref_xy[:, 0], ref_xy[:, 1], c=np.arange(len(ref_xy)))
    plt.savefig(out_img)
    out_meta = dict()
    for key in meta.keys():
        out_meta[key] = [date_meta[key][i] for i in ref_idx]
    out_file = os.path.join(out_root, '{}_{}_{}.csv'.format(s, mode, r))
    save_csv(out_meta, out_file)
def shuffle(in_root, out_root, s, mode, num_epochs):
    meta = load_csv(os.path.join(in_root, '{}_{}.csv'.format(
        s, mode)))  # Not using query locations for this
    for e in range(num_epochs):
        out_file = os.path.join(out_root, '{}_{}_{:03d}.csv'.format(s, mode, e))
        if os.path.exists(out_file):
            print('{} exists. Not recalculating.'.format(out_file))
        else:
            print('Shuffling {}.'.format(out_file))
            shuffled_indices = np.random.permutation(len(meta['t']))
            shuffled_meta = dict()
            for key in meta.keys():
                shuffled_meta[key] = [meta[key][i] for i in shuffled_indices]
            save_csv(shuffled_meta, out_file)
def set_aside_queries(in_root, folds, query_dates):
    num_per_fold = dict()
    for fold in folds:
        clean_file = os.path.join(in_root, '{}.csv'.format(fold))
        data = load_csv(clean_file)
        query_out = clean_file.replace(fold, '{}_query'.format(fold))
        ref_out = clean_file.replace(fold, '{}_ref'.format(fold))
        query_data = dict()
        ref_data = dict()
        for key in data.keys():
            query_data[key] = [el for el, date in zip(data[key], data['date'])
                               if date in query_dates]
            ref_data[key] = [el for el, date in zip(data[key], data['date'])
                             if date not in query_dates]
        num_per_fold['{}_query'.format(fold)] = len(query_data['t'])
        num_per_fold['{}_ref'.format(fold)] = len(ref_data['t'])
        save_csv(query_data, query_out)
        save_csv(ref_data, ref_out)
    save_csv(num_per_fold, os.path.join(in_root, 'num_per_fold.csv'))
def merge_parametrized(in_root, folds, cols_to_keep, out_root):
    files = os.listdir(in_root)
    meta_info = dict()
    full_data = dict()
    for c in cols_to_keep:
        full_data[c] = []
    for fold in folds:
        data = dict()
        date_count = dict()
        for c in cols_to_keep:
            data[c] = []
        fold_files = [f for f in files if f.split('_')[0] == fold]
        for file in fold_files:
            if '.csv' in file:
                date_data = load_csv(os.path.join(in_root, file))
                if len(date_data['t']) < 100:
                    # Dates with very few images indicate bad l alignment or bad INS estimates
                    continue
                for c in cols_to_keep:
                    data[c].extend(date_data[c])
                    full_data[c].extend(date_data[c])
                date_count[file.split('_')[1]] = len(date_data['t'])
        out_file = os.path.join(out_root, '{}.csv'.format(fold))
        save_csv(data, out_file)
        meta_info[fold] = len(data['t'])
        save_csv(date_count,
                 os.path.join(out_root, '{}_date_count.csv'.format(fold)))
    out_file = os.path.join(out_root, 'full.csv')
    save_csv(full_data, out_file)
    meta_info['full'] = len(full_data['t'])
    save_csv(meta_info, os.path.join(out_root, 'meta.csv'))
def train_one_epoch(sess, epoch, writers, saver, part_saver, tuple_shape):
    global GLOBAL_STEP
    global GLOBAL_STEP_LOCK
    global CACHED_FEATURE_LOCK
    global CACHED_FEATURE_INDICES
    global CACHED_FEATURES
    global CACHED_FEATURE_TREE
    global USED_IMAGE_LOCK
    global USED_IMAGES
    global REF_FEATURE_LOCK
    global REF_FEATURES
    global TRAIN_XY
    global TRAIN_XY_LOCK
    train_meta = load_csv(
        os.path.join(SHUFFLED_ROOT, '{}_{:03d}.csv'.format(LOCAL_REF_SET, epoch)))
    train_xy = get_xy(train_meta)
    with TRAIN_XY_LOCK:
        TRAIN_XY = train_xy
    anchor_indices = np.array(load_csv(
        os.path.join(
            ANCHOR_ROOT,
            '{}_{}_{:03d}.csv'.format(LOCAL_REF_SET, TRAIN_REF_R, epoch)))['idx'],
                              dtype=int)
    mining_count = 0
    for step in np.arange(len(anchor_indices), step=TUPLES_PER_BATCH):
        print(step)
        if step % EVAL_STEP == 0:
            TRAIN_CPU_IN_QUEUE.join()
            TRAIN_GPU_IN_QUEUE.join()
            log('EVALUATING')
            with GLOBAL_STEP_LOCK:
                global_step = GLOBAL_STEP  # Some steps produce invalid tuples, and are therefore skipped
            save_path = saver.save(sess, os.path.join(OUT_DIR, "checkpoint"),
                                   global_step=global_step)
            out_name = '{:02d}_{}'.format(epoch, os.path.basename(save_path))

            # Get loss for other region
            log('Calculating test loss.')
            get_eval_loss(global_step, writers['other'], epoch)

            # Test localization on other region
            evaluate_localization(global_step, OTHER_REF_SET, OTHER_QUERY_SET,
                                  'other', out_name, tuple_shape,
                                  writers['other'], epoch)

            # Evaluate localization on training region
            evaluate_localization(global_step, LOCAL_REF_SET, LOCAL_QUERY_SET,
                                  'local', out_name, tuple_shape,
                                  writers['local'], epoch)

        if step % MINING_STEP == 0:
            TRAIN_CPU_IN_QUEUE.join()
            TRAIN_GPU_IN_QUEUE.join()
            log('Caching features for hard negative mining.')
            mining_indices = np.arange(
                mining_count * MINING_CACHE_SIZE,
                (mining_count + 1) * MINING_CACHE_SIZE) % len(train_meta['t'])
            anchors_to_mine = np.array(anchor_indices[step:np.min(
                [step + MINING_STEP, len(anchor_indices)])])
            mining_indices = np.concatenate([mining_indices, anchors_to_mine])
            num_to_mine = len(mining_indices)
            padding = np.zeros(TUPLES_PER_BATCH * sum(tuple_shape) -
                               (num_to_mine % (TUPLES_PER_BATCH * sum(tuple_shape))),
                               dtype=int)
            image_info = [(train_meta['date'][i], train_meta['folder'][i], train_meta['t'][i])
                          for i in np.concatenate((mining_indices, padding))]
            with CACHED_FEATURE_LOCK:
                CACHED_FEATURES = np.array(
                    extract_features(image_info, tuple_shape)[:num_to_mine])
                CACHED_FEATURE_INDICES = mining_indices
                CACHED_FEATURE_TREE = KDTree(CACHED_FEATURES)
            mining_count = mining_count + 1

        if step % SAVE_STEP == 0:
            TRAIN_CPU_IN_QUEUE.join()
            TRAIN_GPU_IN_QUEUE.join()
            with GLOBAL_STEP_LOCK:
                global_step = GLOBAL_STEP  # Some steps produce invalid tuples, and are therefore skipped
            log('Saving model.')
            part_saver.save(sess, os.path.join(OUT_DIR, "part-checkpoint"),
                            global_step=global_step)

        # Train one step:
        TRAIN_CPU_IN_QUEUE.put(anchor_indices[step:step + TUPLES_PER_BATCH])

    # Finish training at end of epoch
    TRAIN_CPU_IN_QUEUE.join()
    TRAIN_GPU_IN_QUEUE.join()
mkdir(out_root)

# Oxford
place = 'oxford'


def img_path(info):
    date = info[0]
    folder = info[1]
    t = info[2]
    return os.path.join('datasets/oxford_512',
                        '{}_stereo_centre_{:02d}'.format(date, int(folder)),
                        '{}.png'.format(t))


# Preselected reference
preselected_ref = os.path.join(fs_root(),
                               'data/learnlarge/shuffled/train_ref_000.csv')
p_meta = load_csv(preselected_ref)
p_meta['path'] = [img_path((d, f, t))
                  for d, f, t in zip(p_meta['date'], p_meta['folder'], p_meta['t'])]
idxs_to_keep = np.linspace(0, len(p_meta['path']), num=N_SAMPLES,
                           endpoint=False, dtype=int)
for key in p_meta.keys():
    p_meta[key] = [p_meta[key][i] for i in idxs_to_keep]
save_csv(p_meta, os.path.join(out_root, '{}_pca.csv'.format(place)))

# Cold
place = 'cold'


def parse_cold_folder(path, pattern):
    all_files = get_recursive_file_list(path, pattern)
    all_files, TXYA = parse_file_list(all_files)
def get_grad_cam():
    with tf.Graph().as_default() as graph:
        print("In Graph")
        ops, tuple_shape = build_inference_model()
        sess = restore_weights()
        print('\n'.join([n.name for n in tf.all_variables()]))

        # For better gpu utilization, loading processes and gpu inference are done in separate threads.
        # Start CPU threads
        num_loader_threads = 3
        for i in range(num_loader_threads):
            worker = Thread(target=cpu_thread)
            worker.setDaemon(True)
            worker.start()

        worker = Thread(target=save_thread)
        worker.setDaemon(True)
        worker.start()

        # Start GPU threads
        worker = Thread(target=gpu_thread, args=(sess, ops))
        worker.setDaemon(True)
        worker.start()

        ref_meta = load_csv(REF_CSV)
        query_meta = load_csv(QUERY_CSV)
        ref_xy = get_xy(ref_meta)
        query_xy = get_xy(query_meta)
        [top_i, top_g_dists, top_f_dists, gt_i, gt_g_dist, ref_idx] = load_pickle(TOP_N_PICKLE)
        top_n = np.array(top_i)
        num = len(query_meta['path'])

        # Fewer queries for speed
        last_xy = query_xy[0, :]
        selected = [0]
        if QUERY_CSV.startswith('pittsburgh'):
            selected = np.linspace(0, num, 500, dtype=int)
        else:
            if 'freiburg' in QUERY_CSV:
                r = 0.5
            else:
                r = 2
            for i in range(num):
                if sum((query_xy[i, :] - last_xy)**2) > r**2:
                    last_xy = query_xy[i, :]
                    selected.append(i)
            selected = np.array(selected, dtype=int)
        xy_dists = pairwise_distances(query_xy, ref_xy, metric='euclidean')

        # Clean list
        image_info = [(query_meta['path'][i], ref_meta['path'][top_n[i, 0]])
                      for i in selected]
        image_dist = [(np.linalg.norm(query_xy[i] - ref_xy[top_n[i, 0]]))
                      for i in selected]
        batched_indices = np.reshape(selected, (-1, TUPLES_PER_BATCH))
        batched_image_info = np.reshape(image_info, (-1, TUPLES_PER_BATCH * 2))
        batched_distances = np.reshape(image_dist, (-1, TUPLES_PER_BATCH))
        for batch_indices, batch_image_info, batched_distance in zip(
                batched_indices, batched_image_info, batched_distances):
            CPU_IN_QUEUE.put(
                (batch_indices, batch_image_info, batched_distance))

        # Wait for completion & order output
        CPU_IN_QUEUE.join()
        GPU_IN_QUEUE.join()
        GPU_OUT_QUEUE.join()
def get_top_n():
    # Check if complete:
    ld_checkpoints = get_checkpoints('obm')
    ld_cp_names = []
    for cp in ld_checkpoints:
        cp_name = cp.split('/')[-2]
        cp_name = ''.join(os.path.basename(cp_name).split('.'))  # Removing '.'
        cp_name += '_e{}'.format(cp[-1])
        ld_cp_names.append(cp_name)
    if any([x in QUERY_LV_PICKLE for x in ld_cp_names]):
        L = [0.0, 0.3, 1.0, 5.0]
        D = [64, 128, 256, 512, 1024, 2048, 4096]
    else:
        L = [0.0]
        D = [256]
    complete = True
    for l in L:
        for d in D:
            out_folder = os.path.join(OUT_ROOT, 'l{}_dim{}'.format(l, d))
            name = ''.join(os.path.basename(QUERY_LV_PICKLE).split('.')[:-1])
            out_pickle = os.path.join(out_folder, '{}.pickle'.format(name))
            if not os.path.exists(out_pickle):
                complete = False
                break
        if not complete:
            break
    if complete:
        print('Skipping complete {}'.format(QUERY_LV_PICKLE))
        return

    ref_meta = load_csv(REF_CSV)
    query_meta = load_csv(QUERY_CSV)
    full_ref_xy = get_xy(ref_meta)
    full_query_xy = get_xy(query_meta)
    num_q = full_query_xy.shape[0]
    pca_f = np.array(load_pickle(PCA_LV_PICKLE))
    full_ref_f = np.array(load_pickle(REF_LV_PICKLE))
    full_query_f = np.array(load_pickle(QUERY_LV_PICKLE))
    full_xy_dists = pairwise_distances(full_query_xy, full_ref_xy, metric='euclidean')
    for d in D:
        print(d)
        pca = PCA(whiten=True, n_components=d)
        pca = pca.fit(pca_f)
        pca_ref_f = pca.transform(full_ref_f)
        pca_query_f = pca.transform(full_query_f)
        for l in L:
            print(l)
            out_folder = os.path.join(OUT_ROOT, 'l{}_dim{}'.format(l, d))
            mkdir(out_folder)
            name = ''.join(os.path.basename(QUERY_LV_PICKLE).split('.')[:-1])
            out_pickle = os.path.join(out_folder, '{}.pickle'.format(name))
            if os.path.exists(out_pickle):
                print('{} already exists. Skipping.'.format(out_pickle))
                continue
            ref_idx = [0]
            for i in range(len(full_ref_xy)):
                if sum((full_ref_xy[i, :] - full_ref_xy[ref_idx[-1], :])**2) >= l**2:
                    ref_idx.append(i)
            if len(ref_idx) < N:
                continue
            ref_f = np.array([pca_ref_f[i, :] for i in ref_idx])
            xy_dists = np.array([full_xy_dists[:, i] for i in ref_idx]).transpose()
            print('Building tree')
            ref_tree = KDTree(ref_f)
            print('Retrieving')
            top_f_dists, top_i = np.array(
                ref_tree.query(pca_query_f, k=N, return_distance=True,
                               sort_results=True))
            top_f_dists = np.array(top_f_dists)
            top_i = np.array(top_i, dtype=int)
            top_g_dists = [[xy_dists[q, r] for r in top_i[q, :]]
                           for q in range(num_q)]
            gt_i = np.argmin(xy_dists, axis=1)
            gt_g_dist = np.min(xy_dists, axis=1)

            # Translate to original indices
            top_i = [[ref_idx[r] for r in top_i[q, :]] for q in range(num_q)]
            gt_i = [ref_idx[r] for r in gt_i]
            save_pickle(
                [top_i, top_g_dists, top_f_dists, gt_i, gt_g_dist, ref_idx],
                out_pickle)
def get_top_n():
    ref_meta = load_csv(REF_CSV)
    query_meta = load_csv(QUERY_CSV)
    full_ref_xy = get_xy(ref_meta)
    full_query_xy = get_xy(query_meta)
    num_q = full_query_xy.shape[0]
    pca_f = np.array(load_pickle(PCA_LV_PICKLE))
    full_ref_f = np.array(load_pickle(REF_LV_PICKLE))
    full_query_f = np.array(load_pickle(QUERY_LV_PICKLE))
    full_xy_dists = pairwise_distances(full_query_xy, full_ref_xy, metric='euclidean')
    for d in DIMS:
        print(d)
        pca = PCA(whiten=True, n_components=d)
        pca = pca.fit(pca_f)
        pca_ref_f = pca.transform(full_ref_f)
        pca_query_f = pca.transform(full_query_f)
        for l in L:
            print(l)
            out_folder = os.path.join(OUT_ROOT, 'l{}_dim{}'.format(l, d))
            mkdir(out_folder)
            name = ''.join(os.path.basename(QUERY_LV_PICKLE).split('.')[:-1])
            out_pickle = os.path.join(out_folder, '{}.pickle'.format(name))
            if os.path.exists(out_pickle):
                print('{} already exists. Skipping.'.format(out_pickle))
                continue
            ref_idx = [0]
            for i in range(len(full_ref_xy)):
                if sum((full_ref_xy[i, :] - full_ref_xy[ref_idx[-1], :])**2) >= l**2:
                    ref_idx.append(i)
            if len(ref_idx) < N:
                continue
            ref_f = np.array([pca_ref_f[i, :] for i in ref_idx])
            xy_dists = np.array([full_xy_dists[:, i] for i in ref_idx]).transpose()
            print('Building tree')
            ref_tree = KDTree(ref_f)
            print('Retrieving')
            top_f_dists, top_i = np.array(
                ref_tree.query(pca_query_f, k=N, return_distance=True,
                               sort_results=True))
            top_f_dists = np.array(top_f_dists)
            top_i = np.array(top_i, dtype=int)
            top_g_dists = [[xy_dists[q, r] for r in top_i[q, :]]
                           for q in range(num_q)]
            gt_i = np.argmin(xy_dists, axis=1)
            gt_g_dist = np.min(xy_dists, axis=1)

            # Translate to original indices
            top_i = [[ref_idx[r] for r in top_i[q, :]] for q in range(num_q)]
            gt_i = [ref_idx[r] for r in gt_i]
            save_pickle(
                [top_i, top_g_dists, top_f_dists, gt_i, gt_g_dist, ref_idx],
                out_pickle)
def interpolate_xy(task_id, in_root, ins_root, out_root):
    # Find all dates with INS data (not all images have INS, but all INS should have images)
    all_dates = sorted(
        os.listdir(ins_root))  # Sort to make sure we always get the same order
    date = all_dates[int(task_id) - 1]
    out_file = os.path.join(out_root, '{}.csv'.format(date))
    if os.path.exists(out_file):
        # print('Already calculated {}.'.format(out_file))
        return
    imgs_file = os.path.join(in_root, '{}.csv'.format(date))
    if not os.path.exists(imgs_file):
        print('Missing {}: {}.'.format(task_id, imgs_file))
        return
    imgs = load_csv(imgs_file)
    ins = load_csv(os.path.join(ins_root, date, 'gps', 'ins.csv'))
    ins_ts = np.array(ins['timestamp'], dtype=int).reshape(
        (-1, 1))  # num_samples x num_features
    img_ts = np.array(imgs['t'], dtype=int).reshape((-1, 1))
    northing = np.array(ins['northing'], dtype=float)
    easting = np.array(ins['easting'], dtype=float)
    yaw = np.array(ins['yaw'], dtype=float)  # Yaw range: 0-2pi
    status = ins['ins_status']

    # INS measures are roughly 3 times more frequent than images
    mean_td_img = np.mean(
        [img_ts[i, 0] - img_ts[i - 1, 0] for i in range(1, img_ts.shape[0])])
    mean_td_ins = np.mean(
        [ins_ts[i, 0] - ins_ts[i - 1, 0] for i in range(1, ins_ts.shape[0])])
    print('Found {} times more ins measures than images.'.format(mean_td_img / mean_td_ins))
    print('The mean time between ins measures is {}.'.format(mean_td_ins))
    print('The mean time between img measures is {}.'.format(mean_td_img))

    ins_ts_tree = KDTree(ins_ts)
    d_closest, i_closest = ins_ts_tree.query(img_ts, 2)
    img_northing = [
        lin_ip(northing[i_c[0]], northing[i_c[1]], d_c[0], d_c[1])
        for d_c, i_c in zip(d_closest, i_closest)
    ]
    img_easting = [
        lin_ip(easting[i_c[0]], easting[i_c[1]], d_c[0], d_c[1])
        for d_c, i_c in zip(d_closest, i_closest)
    ]
    img_yaw = [
        lin_ip(yaw[i_c[0]], yaw[i_c[1]], d_c[0], d_c[1]) % (2 * pi)
        for d_c, i_c in zip(d_closest, i_closest)
    ]  # Yaw range: 0-2pi

    # Remove interpolations of unclean ins states
    ins_good = [0] * len(img_easting)
    for j, i_c in enumerate(i_closest):
        if status[i_c[0]] == 'INS_SOLUTION_GOOD' and status[
                i_c[1]] == 'INS_SOLUTION_GOOD':
            ins_good[j] = 1
    imgs['northing'] = img_northing
    imgs['easting'] = img_easting
    imgs['ins_good'] = ins_good
    imgs['yaw'] = img_yaw
    ic1 = [i_c[0] for i_c in i_closest]
    ic2 = [i_c[1] for i_c in i_closest]
    tn1 = [ins_ts[i, 0] for i in ic1]
    tn2 = [ins_ts[i, 0] for i in ic2]
    imgs['ic1'] = ic1  # Index of closest ins point
    imgs['ic2'] = ic2
    imgs['tn1'] = tn1  # Timestamp of closest ins point
    imgs['tn2'] = tn2
    save_csv(imgs, out_file)
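# Note: lin_ip() is used for the pose interpolation above and in parametrize()
# below, but is not defined in this section. A minimal sketch, assuming it does
# an inverse-distance-weighted linear interpolation between the two nearest
# samples (hypothetical, not necessarily the original implementation):
def lin_ip(v1, v2, d1, d2):
    """Interpolate v1 and v2 given their distances d1 and d2 to the target."""
    if d1 + d2 == 0:
        return v1
    return (v1 * d2 + v2 * d1) / (d1 + d2)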
def img_path(info):
    date = info[0]
    folder = info[1]
    t = info[2]
    return os.path.join('datasets/oxford',
                        '{}_stereo_centre_{:02d}'.format(date, int(folder)),
                        '{}.png'.format(t))


# Preselected reference
preselected_ref = os.path.join(
    fs_root(), 'data/learnlarge/clean_merged_parametrized/test.csv')
ref_date = '2014-12-02-15-30-08'
p_meta = load_csv(preselected_ref)
for key in p_meta.keys():
    p_meta[key] = [
        e for e, d in zip(p_meta[key], p_meta['date']) if d == ref_date
    ]
p_meta['path'] = [
    img_path((d, f, t))
    for d, f, t in zip(p_meta['date'], p_meta['folder'], p_meta['t'])
]
ref_xy = get_xy(p_meta)
save_csv(p_meta, os.path.join(list_out_root, '{}_ref.csv'.format(place)))
ax = axs[0, 2]
ax.plot(ref_xy[:, 0], ref_xy[:, 1],
        label='{} overcast reference images'.format(len(ref_xy)),
def clean(in_root, out_root, folds, cols_to_keep):
    merged_file = os.path.join(in_root, 'merged.csv')
    meta_file = os.path.join(out_root, 'meta.csv')
    meta_info = dict()
    merged = load_csv(merged_file)

    # Original number of imgs
    meta_info['total_imgs'] = len(merged['exposure'])

    # Valid ins
    valid_ins = np.array(merged['ins_good'], dtype=int)
    meta_info['valid_ins'] = sum(valid_ins)

    # Valid location on grid
    valid_grid = np.array(merged['full'], dtype=int)
    meta_info['valid_grid'] = sum(valid_grid)

    # Analyse and clean exposure
    # Visual inspection shows that images below 50'000'000 are very dark and above 110'000'000 very light
    exposures = np.array(merged['exposure'], dtype=float)
    low_exposure = np.percentile(exposures, 1)
    high_exposure = np.percentile(exposures, 99)
    print('Lo: {} \nHi: {}'.format(low_exposure, high_exposure))
    plt.clf()
    plt.hist(exposures, bins=10000, histtype='step')
    plt.xticks(rotation=90)
    plt.savefig(os.path.join(out_root, 'exposures.pdf'))
    valid_exposure = [
        1 if low_exposure < e < high_exposure else 0 for e in exposures
    ]
    meta_info['valid_exposures'] = sum(valid_exposure)

    # Manual cleaning
    valid_date = [1 if d not in bad_dates else 0 for d in merged['date']]
    meta_info['valid_date'] = sum(valid_date)

    # Get fully valid
    fully_valid = np.array(valid_exposure) * np.array(valid_grid) * np.array(
        valid_ins) * np.array(valid_date)
    meta_info['fully_valid'] = sum(fully_valid)

    # Save for different folds
    for fold in folds:
        fold_valid = np.array(fully_valid) * np.array(merged[fold], dtype=int)
        meta_info['valid_{}'.format(fold)] = sum(fold_valid)
        out_data = dict()
        for col in cols_to_keep:
            out_col = [e for e, v in zip(merged[col], fold_valid) if v == 1]
            out_data[col] = out_col
        clean_file = os.path.join(out_root, 'clean_{}.csv'.format(fold))
        save_csv(out_data, clean_file)

        # Plot fold exposure:
        fold_exposure = [e for e, v in zip(exposures, fold_valid) if v == 1]
        plt.clf()
        plt.hist(fold_exposure, bins=10000, histtype='step')
        plt.xticks(rotation=90)
        plt.savefig(os.path.join(out_root, 'exposures_{}.pdf'.format(fold)))
    save_csv(meta_info, meta_file)
    dict_to_bar(meta_info, os.path.join(out_root, 'meta_info.pdf'))
if not os.path.exists(out_root):
    os.makedirs(out_root)
if not os.path.exists(log_root):
    os.makedirs(log_root)

settings = list()

# Get settings:
sets = ['train', 'test', 'val']
for s in sets:
    dates = sorted(
        list(
            set(
                load_csv(os.path.join(in_root,
                                      'clean_{}.csv'.format(s)))['date'])))
    for date in dates:
        if not (s == 'val' and date in ['2014-05-14-13-59-05',
                                        '2014-05-14-13-53-47']):  # Wrong direction
            settings.append((s, date))

if task_id == -1:
    for s in sets:
        create_reference(s)
    create_array_job(len(settings), log_root)
else:
    setting = settings[task_id - 1]
    parametrize(s=setting[0], date=setting[1])
def clean_parametrization(in_root, folds, cols_to_keep, out_root):
    full_data = dict()
    full_ref_data = dict()
    full_query_data = dict()
    for key in cols_to_keep:
        full_data[key] = []
        full_ref_data[key] = []
        full_query_data[key] = []
    meta = dict()
    for s in folds:
        ref_data = load_csv(os.path.join(in_root, '{}_ref.csv'.format(s)))
        query_data = load_csv(os.path.join(
            in_root, '{}_query.csv'.format(s)))  # Not used to detect ref outliers
        for key in ['l', 'northing', 'easting']:
            ref_data[key] = np.array(ref_data[key], dtype=float)
            query_data[key] = np.array(query_data[key], dtype=float)
        l_max = max(ref_data['l'])
        num_bins = math.ceil(l_max)
        ref_member_path = os.path.join(out_root,
                                       '{}_ref_bin_raw_members.pickle'.format(s))
        if not os.path.exists(ref_member_path):
            bin_members = [[i for i in range(len(ref_data['t']))
                            if math.floor(ref_data['l'][i]) == j]
                           for j in tqdm(range(num_bins))]
            save_pickle(bin_members, ref_member_path)
        else:
            bin_members = load_pickle(ref_member_path)
        ref_bin_xy_path = os.path.join(out_root,
                                       '{}_ref_bin_raw_xy.pickle'.format(s))
        if not os.path.exists(ref_bin_xy_path):
            ref_bin_xy = [
                np.median(np.array([[ref_data['easting'][i], ref_data['northing'][i]]
                                    for i in bin_members[j]]), axis=0)
                if len(bin_members[j]) else np.array([-1, -1])
                for j in tqdm(range(num_bins))]
            save_pickle(ref_bin_xy, ref_bin_xy_path)
        else:
            ref_bin_xy = load_pickle(ref_bin_xy_path)
        meta['{}_ref'.format(s)], clean_ref_data = find_and_remove_errors(
            'ref', out_root, ref_bin_xy, ref_data, s)

        # Cleaning query files to allow for more efficient testing, should not influence performance
        # (other than possibly excluding faulty gps/ins 'ground truth', which we don't want anyways)
        meta['{}_query'.format(s)], clean_query_data = find_and_remove_errors(
            'query', out_root, ref_bin_xy, query_data, s)
        fold_clean_data = dict()
        for key in clean_ref_data.keys():
            fold_clean_data[key] = []
            fold_clean_data[key].extend(clean_ref_data[key])
            fold_clean_data[key].extend(clean_query_data[key])
            full_data[key].extend(clean_ref_data[key])
            full_data[key].extend(clean_query_data[key])
            full_ref_data[key].extend(clean_ref_data[key])
            full_query_data[key].extend(clean_query_data[key])
        save_csv(fold_clean_data, os.path.join(out_root, '{}.csv'.format(s)))
    save_csv(full_data, os.path.join(out_root, 'full.csv'))
    save_csv(full_ref_data, os.path.join(out_root, 'full_ref.csv'))
    save_csv(full_query_data, os.path.join(out_root, 'full_query.csv'))
    save_csv(meta, os.path.join(out_root, 'meta.csv'))
def parametrize(s, date):
    ref_date = getattr(sys.modules[__name__], '{}_ref_date'.format(s))
    ref_file = os.path.join(out_root, '{}_{}_geodesic.csv'.format(s, ref_date))
    data = load_csv(os.path.join(in_root, 'clean_{}.csv'.format(s)))
    ref_data = load_csv(ref_file)
    ref_xy = [(float(x), float(y))
              for x, y in zip(ref_data['easting'], ref_data['northing'])]
    ref_l = np.array(ref_data['l'], dtype=float)
    ref_yaw = np.array(ref_data['yaw'], dtype=float)
    ref_tree = KDTree(np.array(ref_xy))
    vmin = min(ref_l)
    vmax = max(ref_l)
    date_data = dict()
    for key in data.keys():
        date_data[key] = [
            e for e, d in zip(data[key], data['date']) if d == date
        ]
    date_xy = [(float(x), float(y))
               for x, y in zip(date_data['easting'], date_data['northing'])]
    date_d = [0] + [
        math.sqrt((p[0] - q[0])**2 + (p[1] - q[1])**2)
        for p, q in zip(date_xy[1:], date_xy[:-1])
    ]
    date_l = [sum(date_d[:i]) for i in range(1, len(date_d) + 1)]
    date_yaw = np.array(date_data['yaw'], dtype=float)
    matched_l = np.zeros(len(date_yaw))
    matchable = []
    r = 20
    if s == 'val':
        r = 100
    date_ni, date_nd = ref_tree.query_radius(np.array(date_xy), r=r,
                                             return_distance=True,
                                             sort_results=True)
    current_l = 0
    latest_valid = 0
    for j, (yaw, ni, nd) in enumerate(zip(date_yaw, date_ni, date_nd)):
        if len(ni) < 2:
            continue
        angle_neighbors = [
            i for i in range(len(ni))
            if abs(yaw - ref_yaw[ni[i]]) % (2 * math.pi) < math.pi / 3
        ]
        ni = [ni[i] for i in angle_neighbors]
        nd = [nd[i] for i in angle_neighbors]
        if len(ni) < 2:
            continue
        potential_l = np.array([ref_l[i] for i in ni])
        if j == 0:
            threshold = 40
            if s == 'val':
                threshold = 5
            km = KMeans(n_clusters=2, random_state=0).fit(potential_l.reshape(-1, 1))
            if abs(km.cluster_centers_[0] - km.cluster_centers_[1]) > threshold:
                closest_center = km.predict(
                    np.array(current_l).reshape(-1, 1))[0]
                assignments = km.labels_
                l_neighbors = [
                    i for i, a in zip(range(len(ni)), assignments)
                    if a == closest_center
                ]
            else:
                l_neighbors = range(len(ni))
        else:
            l_neighbors = [
                i for i, l in enumerate(potential_l)
                if abs(current_l - date_l[latest_valid] + date_l[j] - l) < 500
            ]
        ni = [ni[i] for i in l_neighbors]
        nd = [nd[i] for i in l_neighbors]
        if len(ni) < 2:
            continue
        interp_l = lin_ip(ref_l[ni[0]], ref_l[ni[1]], nd[0], nd[1])
        current_l = interp_l
        latest_valid = j
        matched_l[j] = interp_l
        print(interp_l)
        matchable.append(j)
    if len(matchable) > 0:
        date_data['l'] = matched_l
        for key in ref_data.keys():
            date_data[key] = [date_data[key][i] for i in matchable]
        plot_results(date_xy, date_yaw, date_l, date, date_data, s, vmin, vmax)
        out_file = os.path.join(out_root, '{}_{}_geodesic.csv'.format(s, date))
        save_csv(date_data, out_file)
def plot_statistics(in_root, out_root, folds, tag_root):
    date_tags, all_tags = get_tags(tag_root)
    for fold in folds:
        print('Plotting {} statistics.'.format(fold))
        clean_file = os.path.join(in_root, '{}.csv'.format(fold))
        data = load_csv(clean_file)

        # Images per date
        images_per_date = Counter(data['date'])
        save_csv(images_per_date,
                 os.path.join(out_root, 'images_per_date_{}.csv'.format(fold)))
        dict_to_bar(images_per_date,
                    os.path.join(out_root, 'images_per_date_{}.pdf'.format(fold)))

        # Images/dates per tag, month and hour
        images_per_tag = dict.fromkeys(all_tags, 0)
        images_per_month = dict.fromkeys(range(1, 13), 0)
        images_per_hour = dict.fromkeys(range(0, 24), 0)
        dates_per_tag = dict.fromkeys(all_tags, 0)
        dates_per_month = dict.fromkeys(range(1, 13), 0)
        dates_per_hour = dict.fromkeys(range(0, 24), 0)
        for date in images_per_date.keys():
            month = int(date[5:7])
            hour = int(date[11:13])
            images_per_month[month] = images_per_date[date] + images_per_month[month]
            images_per_hour[hour] = images_per_date[date] + images_per_hour[hour]
            dates_per_month[month] = 1 + dates_per_month[month]
            dates_per_hour[hour] = 1 + dates_per_hour[hour]
            for tag in date_tags[date]:
                images_per_tag[tag] = images_per_date[date] + images_per_tag[tag]
                dates_per_tag[tag] = 1 + dates_per_tag[tag]
        save_csv(images_per_tag,
                 os.path.join(out_root, 'images_per_tag_{}.csv'.format(fold)))
        dict_to_bar(images_per_tag,
                    os.path.join(out_root, 'images_per_tag_{}.pdf'.format(fold)))
        save_csv(images_per_month,
                 os.path.join(out_root, 'images_per_month_{}.csv'.format(fold)))
        dict_to_bar(images_per_month,
                    os.path.join(out_root, 'images_per_month_{}.pdf'.format(fold)))
        new_months = OrderedDict()
        months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
                  'August', 'September', 'October', 'November', 'December']
        for i in range(12):
            new_months[months[i]] = images_per_month[i + 1]
        save_csv(new_months,
                 os.path.join(out_root, 'images_per_month_pretty_{}.csv'.format(fold)))
        dict_to_bar(new_months,
                    os.path.join(out_root, 'images_per_month_pretty_{}.pdf'.format(fold)))
        save_csv(images_per_hour,
                 os.path.join(out_root, 'images_per_hour_{}.csv'.format(fold)))
        dict_to_bar(images_per_hour,
                    os.path.join(out_root, 'images_per_hour_{}.pdf'.format(fold)))
        new_hours = OrderedDict()
        for i in range(6, 22):
            new_hours['{:02d}:00'.format(i)] = images_per_hour[i]
        save_csv(new_hours,
                 os.path.join(out_root, 'images_per_pretty_hour_{}.csv'.format(fold)))
        dict_to_bar(new_hours,
                    os.path.join(out_root, 'images_per_pretty_hour_{}.pdf'.format(fold)))
        save_csv(dates_per_tag,
                 os.path.join(out_root, 'dates_per_tag_{}.csv'.format(fold)))
        dict_to_bar(dates_per_tag,
                    os.path.join(out_root, 'dates_per_tag_{}.pdf'.format(fold)))
        save_csv(dates_per_month,
                 os.path.join(out_root, 'dates_per_month_{}.csv'.format(fold)))
        dict_to_bar(dates_per_month,
                    os.path.join(out_root, 'dates_per_month_{}.pdf'.format(fold)))
        save_csv(dates_per_hour,
                 os.path.join(out_root, 'dates_per_hour_{}.csv'.format(fold)))
        dict_to_bar(dates_per_hour,
                    os.path.join(out_root, 'dates_per_hour_{}.pdf'.format(fold)))
def get_top_n():
    name = os.path.basename(QUERY_LV_PICKLE).split('.')[0]
    print(name)
    sampling = 1
    out_png_1 = os.path.join(
        OUT_ROOT, '{}_top{}_t{}_path_{}_s{}.pdf'.format(name, N, T, PERPLEXITY, sampling))
    out_png_1c = os.path.join(
        OUT_ROOT, '{}_top{}_t{}_ct_{}_s{}.pdf'.format(name, N, T, PERPLEXITY, sampling))
    out_pickle = os.path.join(
        OUT_ROOT, '{}_top{}_t{}_{}_s{}.pickle'.format(name, N, T, PERPLEXITY, sampling))
    if os.path.exists(out_pickle):
        print('{} already exists. Skipping.'.format(out_pickle))
        return
    pca_f = np.array(load_pickle(PCA_LV_PICKLE))
    pca = PCA(whiten=True, n_components=256)
    pca = pca.fit(pca_f)
    query_meta = load_csv(QUERY_CSV)
    query_xy = get_xy(query_meta)[::sampling]
    l_query_f = np.array(load_pickle(QUERY_LV_PICKLE))
    l_query_f = l_query_f[::sampling, :]
    query_f = pca.transform(l_query_f)
    Y = TSNE(n_components=2, perplexity=PERPLEXITY).fit_transform(query_f)
    Y[:, 0] = (Y[:, 0] - min(Y[:, 0])) / (max(Y[:, 0]) - min(Y[:, 0]))
    Y[:, 1] = (Y[:, 1] - min(Y[:, 1])) / (max(Y[:, 1]) - min(Y[:, 1]))
    plt.clf()
    plt.figure(figsize=(3, 3))
    x = [p[0] for p in query_xy]
    y = [p[1] for p in query_xy]
    x_max = np.max(x)
    x_min = np.min(x)
    y_max = np.max(y)
    y_min = np.min(y)
    x_span = float(x_max - x_min)
    y_span = float(y_max - y_min)
    query_color = [(0, float(p[1] - y_min) / y_span, float(p[0] - x_min) / x_span)
                   for p in query_xy]
    s1 = plt.scatter(x, y, c=query_color, s=2)
    s1.set_rasterized(True)
    plt.savefig(out_png_1, bbox_inches='tight', pad_inches=0)
    plt.clf()
    plt.figure(figsize=(3, 3))
    s2 = plt.scatter(Y[:, 0], Y[:, 1], c=query_color, s=2)
    s2.set_rasterized(True)
    plt.savefig(out_png_1c, bbox_inches='tight', pad_inches=0)