def main(args):
    """Merge per-patch marker CSVs found under ``args.path``, keep the
    lowest-distance copy of each named point, and write the cleaned set
    to ``args.outdir/cleaned.marker``.

    With ``args.debug`` set, also writes ``debug_cleaned.csv`` carrying a
    small integer substack id per point.
    """
    path = utils.add_trailing_slash(args.path)
    outdir = utils.add_trailing_slash(args.outdir)
    debug = args.debug
    utils.make_dir(outdir)
    files = utils.get_filenames(path)
    # One CSV per patch; stack them all into a single frame.
    data_frames_list = [pd.read_csv(path + f) for f in files]
    big_data_frame = pd.concat(data_frames_list)
    # Sort by distance first so drop_duplicates keeps the lowest-distance
    # row for every point name.
    # NOTE(review): DataFrame.sort was removed in pandas 0.20 — switch to
    # sort_values if this ever runs against a newer pandas.
    sorted_big_data_frame = big_data_frame.sort(parameters.distance_col)
    final_data_frame = sorted_big_data_frame.drop_duplicates(parameters.name_col)
    final_data_frame.to_csv(outdir + 'cleaned.marker', index=False)
    if debug:
        debug_df = pd.read_csv(outdir + 'cleaned.marker')
        substacks = np.empty(len(debug_df.index))
        names = debug_df[parameters.name_col]
        # Assign a distinct small integer id to each substack name.
        lut = dict()
        count = 0
        for index in debug_df.index:
            substack = utils.extract_substack(names[index])
            if substack not in lut:
                count += 1
                lut[substack] = count
            substacks[index] = lut[substack]
        debug_df[parameters.substack_col] = substacks
        # Bug fix: to_csv has no 'col' keyword (the original raised
        # TypeError in debug mode); the column subset is 'columns'.
        debug_df.to_csv(outdir + 'debug_cleaned.csv',
                        columns=[parameters.x_col,
                                 parameters.y_col,
                                 parameters.z_col,
                                 parameters.distance_col,
                                 parameters.substack_col],
                        index=False)
def main(args):
    """Merge per-patch marker CSVs, keep the lowest-distance copy of each
    named point and write the cleaned result to ``outdir/cleaned.marker``
    (plus a reduced ``debug_cleaned.csv`` when ``args.debug`` is set).
    """
    path = utils.add_trailing_slash(args.path)
    outdir = utils.add_trailing_slash(args.outdir)
    debug = args.debug
    utils.make_dir(outdir)
    files = utils.get_filenames(path)
    # One CSV per patch; concatenate them into a single frame.
    data_frames_list = [pd.read_csv(path + f) for f in files]
    big_data_frame = pd.concat(data_frames_list)
    # Sort by distance so drop_duplicates keeps the lowest-distance row
    # per point name.
    # NOTE(review): DataFrame.sort was removed in pandas 0.20; newer
    # pandas needs sort_values.
    sorted_big_data_frame = big_data_frame.sort(parameters.distance_col)
    final_data_frame = sorted_big_data_frame.drop_duplicates(
        parameters.name_col)
    final_data_frame.to_csv(outdir + 'cleaned.marker', index=False)
    # NOTE(review): the two statements below look suspect and their result
    # ('duplicates') is never used: np.unique over a whole DataFrame and
    # integer-array indexing of a DataFrame (which selects columns, not
    # rows) are unlikely to do what the names suggest — verify intent.
    indices = np.setdiff1d(
        np.arange(len(sorted_big_data_frame)),
        np.unique(sorted_big_data_frame, return_index=True)[1])
    duplicates = np.unique(sorted_big_data_frame[indices])
    #sorted_big_data_frame.to_csv(outdir + 'cleaned.marker', index=False)
    if debug:
        debug_df = pd.read_csv(outdir + 'cleaned.marker')
        # Map each distinct substack name to a small integer id.
        substacks = np.empty(len(debug_df.index))
        names = debug_df[parameters.name_col]
        lut = dict()
        count = 0
        for index in debug_df.index:
            substack = utils.extract_substack(names[index])
            if substack not in lut:
                count += 1
                lut[substack] = count
            substacks[index] = lut[substack]
        # NOTE(review): 'substacks' is computed but no longer written out —
        # the lines that used it are commented below; dead work.
        #debug_df[parameters.substack_col] = substacks
        #debug_df.to_csv(outdir + 'debug_cleaned.csv',
        #col=[parameters.x_col,
        #parameters.y_col,
        #parameters.z_col,
        #parameters.density_col,
        #parameters.distance_col,
        #parameters.substack_col],
        #index=False)
        # NOTE(review): as_matrix was removed in pandas 1.0 (use .values
        # on a column-sliced frame instead).
        cols = debug_df.as_matrix([
            parameters.x_col, parameters.y_col, parameters.z_col,
            parameters.density_col, parameters.distance_col
        ])
        cleaned_df = pd.DataFrame(
            data={
                parameters.x_col: cols[:, 0],
                parameters.y_col: cols[:, 1],
                parameters.z_col: cols[:, 2],
                parameters.density_col: cols[:, 3],
                parameters.distance_col: cols[:, 4]
            })
        cleaned_df.to_csv(outdir + 'debug_cleaned.csv', index=False)
def main(args):
    """Concatenate per-patch marker files, deduplicate points by name and
    dump the cleaned markers (plus an optional debug CSV) under the
    output directory.
    """
    in_dir = utils.add_trailing_slash(args.path)
    out_dir = utils.add_trailing_slash(args.outdir)
    debug = args.debug
    utils.make_dir(out_dir)
    frames = []
    for filename in utils.get_filenames(in_dir):
        frames.append(pd.read_csv(in_dir + filename))
    merged = pd.concat(frames)
    # Lowest-distance row wins when several rows share a point name.
    ordered = merged.sort(parameters.distance_col)
    deduped = ordered.drop_duplicates(parameters.name_col)
    deduped.to_csv(out_dir + 'cleaned.marker', index=False)
    # Indices of rows that are repeats of an earlier occurrence.
    first_seen = np.unique(ordered, return_index=True)[1]
    dup_rows = np.setdiff1d(np.arange(len(ordered)), first_seen)
    duplicates = np.unique(ordered[dup_rows])
    if debug:
        debug_df = pd.read_csv(out_dir + 'cleaned.marker')
        # Label every row with a small integer id per substack name.
        labels = np.empty(len(debug_df.index))
        names = debug_df[parameters.name_col]
        ids = {}
        for row in debug_df.index:
            key = utils.extract_substack(names[row])
            if key not in ids:
                ids[key] = len(ids) + 1
            labels[row] = ids[key]
        wanted = [parameters.x_col, parameters.y_col, parameters.z_col,
                  parameters.density_col, parameters.distance_col]
        values = debug_df.as_matrix(wanted)
        reduced = pd.DataFrame(data=dict(zip(wanted, values.T)))
        reduced.to_csv(out_dir + 'debug_cleaned.csv', index=False)
def main(args):
    """Keep only the points whose surface-distance penalty is at most
    ``args.threshold`` and write them (minus the distance column) to
    ``outdir/threshold_<threshold>.marker``.
    """
    data_file = args.data_file
    outdir = utils.add_trailing_slash(args.outdir)
    # Bug fix: the original read 'hreshold = args.threshold', so the
    # 'threshold' name used below was undefined (NameError at runtime).
    threshold = args.threshold
    utils.make_dir(outdir)
    data_frame = pd.read_csv(data_file)
    filtered_data_frame = data_frame[
        data_frame[parameters.distance_col] <= threshold].drop(
            parameters.distance_col, axis=1)
    filtered_data_frame.to_csv(
        outdir + 'threshold_' + repr(threshold) + '.marker', index=False)
def main(args):
    """Threshold the marker points by their surface-distance penalty and
    write the survivors, without the distance column, to
    ``outdir/threshold_<threshold>.marker``.
    """
    data_file = args.data_file
    outdir = utils.add_trailing_slash(args.outdir)
    # Bug fix: was 'hreshold = args.threshold' — every later use of
    # 'threshold' raised NameError.
    threshold = args.threshold
    utils.make_dir(outdir)
    data_frame = pd.read_csv(data_file)
    keep = data_frame[parameters.distance_col] <= threshold
    filtered_data_frame = data_frame[keep].drop(parameters.distance_col, axis=1)
    out_name = outdir + 'threshold_' + repr(threshold) + '.marker'
    filtered_data_frame.to_csv(out_name, index=False)
def main(args): data_file = args.data_file outdir = utils.add_trailing_slash(args.outdir) outdir_seeds = outdir + 'seeds/' outdir_nn = outdir + 'nn/' utils.make_dir(outdir) utils.make_dir(outdir_seeds) utils.make_dir(outdir_nn) for folder in xrange(parameters.jobs): utils.make_dir(outdir_seeds + repr(folder)) data_frame = pd.read_csv(data_file) points_matrix = data_frame.as_matrix( [parameters.x_col, parameters.y_col, parameters.z_col]) name = data_frame[parameters.name_col] data_substacks = utils.points_to_substack(points_matrix, name) seeds = list() global_kdtree = cKDTree(points_matrix) for substack, data in data_substacks.iteritems(): X = np.vstack(data) X = np.float64(X) kdtree = cKDTree(X) _, index = kdtree.query(np.mean(X, axis=0)) _, centroid = global_kdtree.query(X[index, :]) seeds.append(centroid) print len(seeds) n_neighbors = graph_utils.compute_minimum_nearest_neighbors(points_matrix) with open(outdir_nn + repr(n_neighbors), 'w') as nn_file: nn_file.close() folder = 0 while len(seeds) > 0: seed = seeds.pop() with open(outdir_seeds + repr(folder) + '/' + repr(seed), 'w') as seed_file: seed_file.close() folder = (folder + 1) % parameters.jobs
def main(args): data_file = args.data_file outdir = utils.add_trailing_slash(args.outdir) outdir_seeds = outdir + 'seeds/' outdir_nn = outdir + 'nn/' utils.make_dir(outdir) utils.make_dir(outdir_seeds) utils.make_dir(outdir_nn) for folder in xrange(parameters.jobs): utils.make_dir(outdir_seeds + repr(folder)) data_frame = pd.read_csv(data_file) points_matrix = data_frame.as_matrix([parameters.x_col, parameters.y_col, parameters.z_col]) name = data_frame[parameters.name_col] data_substacks = utils.points_to_substack(points_matrix, name) seeds = list() global_kdtree = cKDTree(points_matrix) for substack, data in data_substacks.iteritems(): X = np.vstack(data) X = np.float64(X) kdtree = cKDTree(X) _, index = kdtree.query(np.mean(X, axis=0)) _, centroid = global_kdtree.query(X[index, :]) seeds.append(centroid) print len(seeds) n_neighbors = graph_utils.compute_minimum_nearest_neighbors(points_matrix) with open(outdir_nn + repr(n_neighbors), 'w') as nn_file: nn_file.close() folder = 0 while len(seeds) > 0: seed = seeds.pop() with open(outdir_seeds + repr(folder) + '/' + repr(seed), 'w') as seed_file: seed_file.close() folder = (folder + 1) % parameters.jobs
def main(args):
    """For every seed, grow a geodesic patch, embed it in 2-D with
    Isomap, rebuild it with LOWESS and write per-point surface-distance
    penalties to ``outdir/<seed>.marker``.

    Patches of one or two points are skipped as likely false positives;
    patches whose embedding raises ValueError are skipped too.  With
    ``args.debug`` set, intermediate artifacts (embeddings,
    reconstructions, patch CSVs, skipped/faulty patches) are saved under
    dedicated subfolders.
    """
    data_file = args.data_file
    outdir = utils.add_trailing_slash(args.outdir)
    utils.make_dir(outdir)
    data_frame = pd.read_csv(data_file)
    max_distance = args.max_distance
    n_neighbors = args.n_neighbors
    seeds_folder = utils.add_trailing_slash(args.seeds_folder)
    sigma = args.sigma
    debug = args.debug
    if debug:
        # Debug-only output folders (created lazily, only when needed).
        outdir_embeddings = outdir + 'embeddings/'
        outdir_reconstructions = outdir + 'reconstructions/'
        outdir_csvs = outdir + 'csv_patches/'
        outdir_single_points = outdir + 'single_points/'
        outdir_faulty = outdir + 'faulty/'
        utils.make_dir(outdir_embeddings)
        utils.make_dir(outdir_reconstructions)
        utils.make_dir(outdir_csvs)
        utils.make_dir(outdir_single_points)
        utils.make_dir(outdir_faulty)
    # Seed files are named by the integer index of the seed point.
    seeds = utils.get_filenames(seeds_folder)
    for seed in seeds:
        # NOTE(review): X and name are loop-invariant (data_frame never
        # changes) and 'name' is never used — both could be hoisted/removed.
        X = data_frame.as_matrix([parameters.x_col, parameters.y_col, parameters.z_col])
        name = data_frame[parameters.name_col]
        patch_maker = PatchMaker(X, int(seed), n_neighbors, max_distance)
        patch = patch_maker.patch_data()
        if len(patch) == 1:
            # A 1-point patch cannot be embedded; treat as false positive.
            print "There is one point in patch from seed " + seed + " with geodesic radius " + repr(max_distance)
            print "Most likely a false positive, skipping..."
            if debug:
                print "Saving patch with one point for debug purposes..."
                single_frame_patch = data_frame[data_frame.index.isin(patch)]
                single_frame_patch.to_csv(outdir_single_points + seed + '.csv', index=False)
            continue
        elif len(patch) == 2:
            # Same for 2-point patches: no surface to fit.
            print "There are two points in patch from seed " + seed + " with geodesic radius " + repr(max_distance)
            print "Most likely two false positives, skipping..."
            if debug:
                print "Saving patch with two points for debug purposes..."
                single_frame_patch = data_frame[data_frame.index.isin(patch)]
                single_frame_patch.to_csv(outdir_single_points + seed + '.csv', index=False)
            continue
        # Rows of the full frame belonging to this patch.
        data_frame_patch = data_frame[data_frame.index.isin(patch)]
        X_patch = data_frame_patch.as_matrix([parameters.x_col, parameters.y_col, parameters.z_col])
        n_neighbors_patch = graph_utils.compute_minimum_nearest_neighbors(X_patch)
        iso = IsomapEmbedder(n_neighbors_patch)
        try:
            # 2-D Isomap embedding of the 3-D patch.
            points_2d = iso.compute(X_patch)
        except ValueError:
            print "Processing seed " + seed + "..."
            print "Got a strange ValueError due to sparse representation, skipping the patch..."
            if debug:
                print "Saving faulty patch for debug purposes..."
                data_frame_patch.to_csv(outdir_faulty + seed + '.csv', index=False)
            continue
        # Gaussian weights between embedded points; zero self-weights so a
        # point does not dominate its own LOWESS fit.
        metric = EuclideanMetric()
        kernel = GaussianKernel(sigma, metric)
        weights = kernel.compute_multiple(points_2d)
        np.fill_diagonal(weights, 0)
        low = Lowess(metric, parameters.robust_iter)
        # Rebuild 3-D coordinates from the 2-D embedding.
        points_3d_rebuilt = low.fit_transform(points_2d, X_patch, weights)
        surface_cleaner = SurfaceCleaner(metric)
        # Penalty = distance between each original point and its rebuilt
        # counterpart on the fitted surface.
        surface_distance_penalty = surface_cleaner.compute_distances(X_patch, points_3d_rebuilt)
        data_frame_patch[parameters.distance_col] = surface_distance_penalty
        data_frame_patch.to_csv(outdir + seed + '.marker', index=False)
        if debug:
            embed_df = pd.DataFrame(data={parameters.x_col: points_2d[:, 0], parameters.y_col: points_2d[:, 1], parameters.distance_col: surface_distance_penalty})
            embed_df.to_csv(outdir_embeddings + seed + '.csv', index=False)
            rebuild_df = pd.DataFrame(data={parameters.x_col: points_3d_rebuilt[:, 0], parameters.y_col: points_3d_rebuilt[:, 1], parameters.z_col: points_3d_rebuilt[:, 2], parameters.distance_col: surface_distance_penalty})
            rebuild_df.to_csv(outdir_reconstructions + seed + '.csv', index=False)
            data_frame_patch.to_csv(outdir_csvs + seed + '.csv', columns=[parameters.x_col, parameters.y_col, parameters.z_col, parameters.distance_col], index=False)
def main(args): data_file = args.data_file outdir = utils.add_trailing_slash(args.outdir) utils.make_dir(outdir) data_frame = pd.read_csv(data_file) max_distance = args.max_distance n_neighbors = args.n_neighbors seeds_folder = utils.add_trailing_slash(args.seeds_folder) sigma = args.sigma debug = args.debug if debug: outdir_embeddings = outdir + 'embeddings/' outdir_reconstructions = outdir + 'reconstructions/' outdir_csvs = outdir + 'csv_patches/' outdir_single_points = outdir + 'single_points/' outdir_faulty = outdir + 'faulty/' utils.make_dir(outdir_embeddings) utils.make_dir(outdir_reconstructions) utils.make_dir(outdir_csvs) utils.make_dir(outdir_single_points) utils.make_dir(outdir_faulty) seeds = utils.get_filenames(seeds_folder) for seed in seeds: X = data_frame.as_matrix( [parameters.x_col, parameters.y_col, parameters.z_col]) name = data_frame[parameters.name_col] patch_maker = PatchMaker(X, int(seed), n_neighbors, max_distance) patch = patch_maker.patch_data() if len(patch) == 1: print "There is one point in patch from seed " + seed + " with geodesic radius " + repr( max_distance) print "Most likely a false positive, skipping..." if debug: print "Saving patch with one point for debug purposes..." single_frame_patch = data_frame[data_frame.index.isin(patch)] single_frame_patch.to_csv(outdir_single_points + seed + '.csv', index=False) continue elif len(patch) == 2: print "There are two points in patch from seed " + seed + " with geodesic radius " + repr( max_distance) print "Most likely two false positives, skipping..." if debug: print "Saving patch with two points for debug purposes..." 
single_frame_patch = data_frame[data_frame.index.isin(patch)] single_frame_patch.to_csv(outdir_single_points + seed + '.csv', index=False) continue data_frame_patch = data_frame[data_frame.index.isin(patch)] X_patch = data_frame_patch.as_matrix( [parameters.x_col, parameters.y_col, parameters.z_col]) n_neighbors_patch = graph_utils.compute_minimum_nearest_neighbors( X_patch) iso = IsomapEmbedder(n_neighbors_patch) try: points_2d = iso.compute(X_patch) except ValueError: print "Processing seed " + seed + "..." print "Got a strange ValueError due to sparse representation, skipping the patch..." if debug: print "Saving faulty patch for debug purposes..." data_frame_patch.to_csv(outdir_faulty + seed + '.csv', index=False) continue metric = EuclideanMetric() kernel = GaussianKernel(sigma, metric) weights = kernel.compute_multiple(points_2d) np.fill_diagonal(weights, 0) low = Lowess(metric, parameters.robust_iter) points_3d_rebuilt = low.fit_transform(points_2d, X_patch, weights) surface_cleaner = SurfaceCleaner(metric) surface_distance_penalty = surface_cleaner.compute_distances( X_patch, points_3d_rebuilt) data_frame_patch[parameters.distance_col] = surface_distance_penalty data_frame_patch.to_csv(outdir + seed + '.marker', index=False) if debug: embed_df = pd.DataFrame( data={ parameters.x_col: points_2d[:, 0], parameters.y_col: points_2d[:, 1], parameters.distance_col: surface_distance_penalty }) embed_df.to_csv(outdir_embeddings + seed + '.csv', index=False) rebuild_df = pd.DataFrame( data={ parameters.x_col: points_3d_rebuilt[:, 0], parameters.y_col: points_3d_rebuilt[:, 1], parameters.z_col: points_3d_rebuilt[:, 2], parameters.distance_col: surface_distance_penalty }) rebuild_df.to_csv(outdir_reconstructions + seed + '.csv', index=False) data_frame_patch.to_csv(outdir_csvs + seed + '.csv', columns=[ parameters.x_col, parameters.y_col, parameters.z_col, parameters.distance_col ], index=False)