def main(args):
    path = utils.add_trailing_slash(args.path)
    outdir = utils.add_trailing_slash(args.outdir)
    debug = args.debug
    
    utils.make_dir(outdir)
    
    files = utils.get_filenames(path)
    
    # Merge all input CSVs, then keep a single row per name: sorting by the
    # distance column first means drop_duplicates retains the lowest-distance
    # row for each name.
    data_frames_list = [pd.read_csv(path + f) for f in files]
    big_data_frame = pd.concat(data_frames_list)
    sorted_big_data_frame = big_data_frame.sort_values(parameters.distance_col)
    final_data_frame = sorted_big_data_frame.drop_duplicates(parameters.name_col)
    final_data_frame.to_csv(outdir + 'cleaned.marker', index=False)
    
    if debug:
        debug_df = pd.read_csv(outdir + 'cleaned.marker')
        substacks = np.empty(len(debug_df.index), dtype=int)
        names = debug_df[parameters.name_col]
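        # Map each substack identifier (extracted from the point name) to a
        # compact integer id, numbered in order of first appearance.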
        lut = dict()
        count = 0
        for index in debug_df.index:
            substack = utils.extract_substack(names[index])
            if substack not in lut:
                count += 1
                lut[substack] = count
            substacks[index] = lut[substack]
        debug_df[parameters.substack_col] = substacks
        debug_df.to_csv(outdir + 'debug_cleaned.csv',
                columns=[parameters.x_col,
                    parameters.y_col,
                    parameters.z_col,
                    parameters.distance_col,
                    parameters.substack_col],
                index=False)
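Each snippet only defines main(args), so a driver is needed to call it. Below is a minimal sketch of the argument parser the first example appears to expect; the flag names are assumptions inferred from the attributes read off args (path, outdir, debug) and are not confirmed by the source.

import argparse

def parse_args():
    # Hypothetical CLI wrapper: only the attribute names come from the code
    # above, the flag spellings and help texts are assumed.
    parser = argparse.ArgumentParser(
        description='Merge marker CSVs and keep the best row per name.')
    parser.add_argument('--path', required=True,
                        help='Directory containing the input CSV files.')
    parser.add_argument('--outdir', required=True,
                        help='Directory where cleaned.marker is written.')
    parser.add_argument('--debug', action='store_true',
                        help='Also write debug_cleaned.csv.')
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_args())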
def main(args):
    path = utils.add_trailing_slash(args.path)
    outdir = utils.add_trailing_slash(args.outdir)
    debug = args.debug

    utils.make_dir(outdir)

    files = utils.get_filenames(path)

    data_frames_list = [pd.read_csv(path + f) for f in files]
    big_data_frame = pd.concat(data_frames_list)
    sorted_big_data_frame = big_data_frame.sort_values(parameters.distance_col)
    final_data_frame = sorted_big_data_frame.drop_duplicates(
        parameters.name_col)
    final_data_frame.to_csv(outdir + 'cleaned.marker', index=False)

    # Indices of rows whose name already appeared earlier in the sorted frame
    # (np.unique returns first occurrences); the result is currently unused.
    names_array = sorted_big_data_frame[parameters.name_col].values
    indices = np.setdiff1d(
        np.arange(len(names_array)),
        np.unique(names_array, return_index=True)[1])
    duplicates = np.unique(names_array[indices])
    #sorted_big_data_frame.to_csv(outdir + 'cleaned.marker', index=False)

    if debug:
        debug_df = pd.read_csv(outdir + 'cleaned.marker')
        substacks = np.empty(len(debug_df.index))
        names = debug_df[parameters.name_col]
        lut = dict()
        count = 0
        for index in debug_df.index:
            substack = utils.extract_substack(names[index])
            if substack not in lut:
                count += 1
                lut[substack] = count
            substacks[index] = lut[substack]
        #debug_df[parameters.substack_col] = substacks
        #debug_df.to_csv(outdir + 'debug_cleaned.csv',
        #col=[parameters.x_col,
        #parameters.y_col,
        #parameters.z_col,
        #parameters.density_col,
        #parameters.distance_col,
        #parameters.substack_col],
        #index=False)
        cols = debug_df.as_matrix([
            parameters.x_col, parameters.y_col, parameters.z_col,
            parameters.density_col, parameters.distance_col
        ])
        cleaned_df = pd.DataFrame(
            data={
                parameters.x_col: cols[:, 0],
                parameters.y_col: cols[:, 1],
                parameters.z_col: cols[:, 2],
                parameters.density_col: cols[:, 3],
                parameters.distance_col: cols[:, 4]
            })
        cleaned_df.to_csv(outdir + 'debug_cleaned.csv', index=False)
Example #3
def main(args):
    path = utils.add_trailing_slash(args.path)
    outdir = utils.add_trailing_slash(args.outdir)
    debug = args.debug

    utils.make_dir(outdir)

    files = utils.get_filenames(path)

    data_frames_list = [pd.read_csv(path + f) for f in files]
    big_data_frame = pd.concat(data_frames_list)
    sorted_big_data_frame = big_data_frame.sort_values(parameters.distance_col)
    final_data_frame = sorted_big_data_frame.drop_duplicates(parameters.name_col)
    final_data_frame.to_csv(outdir + 'cleaned.marker', index=False)

    # Indices of rows whose name already appeared earlier in the sorted frame; currently unused.
    names_array = sorted_big_data_frame[parameters.name_col].values
    indices = np.setdiff1d(np.arange(len(names_array)), np.unique(names_array, return_index=True)[1])
    duplicates = np.unique(names_array[indices])
    #sorted_big_data_frame.to_csv(outdir + 'cleaned.marker', index=False)

    if debug:
        debug_df = pd.read_csv(outdir + 'cleaned.marker')
        substacks = np.empty(len(debug_df.index))
        names = debug_df[parameters.name_col]
        lut = dict()
        count = 0
        for index in debug_df.index:
            substack = utils.extract_substack(names[index])
            if substack not in lut:
                count += 1
                lut[substack] = count
            substacks[index] = lut[substack]
        #debug_df[parameters.substack_col] = substacks
        #debug_df.to_csv(outdir + 'debug_cleaned.csv',
                #col=[parameters.x_col,
                    #parameters.y_col,
                    #parameters.z_col,
                    #parameters.density_col,
                    #parameters.distance_col,
                    #parameters.substack_col],
                #index=False)
        cols = debug_df.as_matrix([parameters.x_col, parameters.y_col, parameters.z_col,parameters.density_col,parameters.distance_col])
        cleaned_df = pd.DataFrame(data={parameters.x_col: cols[:, 0],
            parameters.y_col: cols[:, 1],
            parameters.z_col: cols[:, 2],
            parameters.density_col: cols[:, 3],
            parameters.distance_col: cols[:, 4]})
        cleaned_df.to_csv(outdir + 'debug_cleaned.csv', index=False)
Example #4
def main(args):
    data_file = args.data_file
    outdir = utils.add_trailing_slash(args.outdir)
    threshold = args.threshold
    
    utils.make_dir(outdir)
    
    data_frame = pd.read_csv(data_file)
    filtered_data_frame = data_frame[data_frame[parameters.distance_col] <= threshold].drop(parameters.distance_col, axis=1)
    filtered_data_frame.to_csv(outdir + 'threshold_' + repr(threshold) + '.marker', index=False)
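As a concrete illustration (assuming a threshold of 0.5): the output would be named threshold_0.5.marker and contain only the rows whose distance column is at most 0.5, with that column dropped from the file.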
Example #5
def main(args):
    data_file = args.data_file
    outdir = utils.add_trailing_slash(args.outdir)
    threshold = args.threshold

    utils.make_dir(outdir)

    data_frame = pd.read_csv(data_file)
    filtered_data_frame = data_frame[
        data_frame[parameters.distance_col] <= threshold].drop(
            parameters.distance_col, axis=1)
    filtered_data_frame.to_csv(outdir + 'threshold_' + repr(threshold) +
                               '.marker',
                               index=False)
Example #6
def main(args):
    data_file = args.data_file
    outdir = utils.add_trailing_slash(args.outdir)

    outdir_seeds = outdir + 'seeds/'
    outdir_nn = outdir + 'nn/'

    utils.make_dir(outdir)
    utils.make_dir(outdir_seeds)
    utils.make_dir(outdir_nn)

    for folder in xrange(parameters.jobs):
        utils.make_dir(outdir_seeds + repr(folder))

    data_frame = pd.read_csv(data_file)

    points_matrix = data_frame.as_matrix(
        [parameters.x_col, parameters.y_col, parameters.z_col])
    name = data_frame[parameters.name_col]

    data_substacks = utils.points_to_substack(points_matrix, name)

    seeds = list()
    global_kdtree = cKDTree(points_matrix)
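    # For every substack, take the point closest to that substack's centroid
    # and store its index in the global point set; these indices are the seeds.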
    for substack, data in data_substacks.iteritems():
        X = np.vstack(data)
        X = np.float64(X)
        kdtree = cKDTree(X)
        _, index = kdtree.query(np.mean(X, axis=0))
        _, centroid = global_kdtree.query(X[index, :])
        seeds.append(centroid)

    print len(seeds)
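    # Estimate the neighbour count (presumably the smallest k that keeps the
    # kNN graph connected) and record it as an empty file named after the value;
    # each seed index is likewise written as an empty file, spread round-robin
    # across one sub-folder per job.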
    n_neighbors = graph_utils.compute_minimum_nearest_neighbors(points_matrix)

    with open(outdir_nn + repr(n_neighbors), 'w') as nn_file:
        nn_file.close()

    folder = 0
    while len(seeds) > 0:
        seed = seeds.pop()
        with open(outdir_seeds + repr(folder) + '/' + repr(seed),
                  'w') as seed_file:
            seed_file.close()
        folder = (folder + 1) % parameters.jobs
Example #7
def main(args):
    data_file = args.data_file
    outdir = utils.add_trailing_slash(args.outdir)

    outdir_seeds = outdir + 'seeds/'
    outdir_nn = outdir + 'nn/'

    utils.make_dir(outdir)
    utils.make_dir(outdir_seeds)
    utils.make_dir(outdir_nn)

    for folder in xrange(parameters.jobs):
        utils.make_dir(outdir_seeds + repr(folder))

    data_frame = pd.read_csv(data_file)

    points_matrix = data_frame.as_matrix([parameters.x_col, parameters.y_col, parameters.z_col])
    name = data_frame[parameters.name_col]

    data_substacks = utils.points_to_substack(points_matrix, name)

    seeds = list()
    global_kdtree = cKDTree(points_matrix)
    for substack, data in data_substacks.iteritems():
        X = np.vstack(data)
        X = np.float64(X)
        kdtree = cKDTree(X)
        _, index = kdtree.query(np.mean(X, axis=0))
        _, centroid = global_kdtree.query(X[index, :])
        seeds.append(centroid)

    print len(seeds)
    n_neighbors = graph_utils.compute_minimum_nearest_neighbors(points_matrix)

    with open(outdir_nn + repr(n_neighbors), 'w') as nn_file:
        nn_file.close()

    folder = 0
    while len(seeds) > 0:
        seed = seeds.pop()
        with open(outdir_seeds + repr(folder) + '/' + repr(seed), 'w') as seed_file:
            seed_file.close()
        folder = (folder + 1) % parameters.jobs
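The empty files written above are what the patch-extraction examples below read back: utils.get_filenames(seeds_folder) returns the bare file names, which are the seed indices and are converted with int(seed) before a patch is grown around each one.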
Example #8
def main(args):
    data_file = args.data_file
    outdir = utils.add_trailing_slash(args.outdir)
    
    utils.make_dir(outdir)
    
    data_frame = pd.read_csv(data_file)
    
    max_distance = args.max_distance
    n_neighbors = args.n_neighbors
    seeds_folder = utils.add_trailing_slash(args.seeds_folder)
    sigma = args.sigma
    debug = args.debug
    
    if debug:
        outdir_embeddings = outdir + 'embeddings/'
        outdir_reconstructions = outdir + 'reconstructions/'
        outdir_csvs = outdir + 'csv_patches/'
        outdir_single_points = outdir + 'single_points/'
        outdir_faulty = outdir + 'faulty/'
        utils.make_dir(outdir_embeddings)
        utils.make_dir(outdir_reconstructions)
        utils.make_dir(outdir_csvs)
        utils.make_dir(outdir_single_points)
        utils.make_dir(outdir_faulty)
    
    seeds = utils.get_filenames(seeds_folder)
    
    for seed in seeds:
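        # Per-seed pipeline: grow a geodesic patch around the seed, embed it in
        # 2-D with Isomap, rebuild a smooth 3-D surface with weighted Lowess,
        # and score every patch point by its distance to that surface.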
        X = data_frame.as_matrix([parameters.x_col, parameters.y_col, parameters.z_col])
        name = data_frame[parameters.name_col]
        
        patch_maker = PatchMaker(X, int(seed), n_neighbors, max_distance)
        patch = patch_maker.patch_data()
        
        if len(patch) == 1:
            print "There is one point in patch from seed " + seed + " with geodesic radius " + repr(max_distance)
            print "Most likely a false positive, skipping..."
            if debug:
                print "Saving patch with one point for debug purposes..."
                single_frame_patch = data_frame[data_frame.index.isin(patch)]
                single_frame_patch.to_csv(outdir_single_points + seed + '.csv', index=False)
            continue
        elif len(patch) == 2:
            print "There are two points in patch from seed " + seed + " with geodesic radius " + repr(max_distance)
            print "Most likely two false positives, skipping..."
            if debug:
                print "Saving patch with two points for debug purposes..."
                single_frame_patch = data_frame[data_frame.index.isin(patch)]
                single_frame_patch.to_csv(outdir_single_points + seed + '.csv', index=False)
            continue
        
        data_frame_patch = data_frame[data_frame.index.isin(patch)]
        
        X_patch = data_frame_patch.as_matrix([parameters.x_col, parameters.y_col, parameters.z_col])
        
        n_neighbors_patch = graph_utils.compute_minimum_nearest_neighbors(X_patch)
        
        iso = IsomapEmbedder(n_neighbors_patch)
        
        try:
            points_2d = iso.compute(X_patch)
        except ValueError:
            print "Processing seed " + seed + "..."
            print "Got a strange ValueError due to sparse representation, skipping the patch..."
            if debug:
                print "Saving faulty patch for debug purposes..."
                data_frame_patch.to_csv(outdir_faulty + seed + '.csv', index=False)
            continue
        
        metric = EuclideanMetric()
        kernel = GaussianKernel(sigma, metric)
        
        weights = kernel.compute_multiple(points_2d)
        np.fill_diagonal(weights, 0)
        
        low = Lowess(metric, parameters.robust_iter)
        
        points_3d_rebuilt = low.fit_transform(points_2d, X_patch, weights)
        
        # Distance of each original point from the rebuilt surface; this is the
        # value the cleaning and thresholding scripts above filter on.
        surface_cleaner = SurfaceCleaner(metric)
        surface_distance_penalty = surface_cleaner.compute_distances(X_patch, points_3d_rebuilt)
        
        data_frame_patch[parameters.distance_col] = surface_distance_penalty
        data_frame_patch.to_csv(outdir + seed + '.marker', index=False)
        
        if debug:
            embed_df = pd.DataFrame(data={parameters.x_col: points_2d[:, 0],
                parameters.y_col: points_2d[:, 1],
                parameters.distance_col: surface_distance_penalty})
            embed_df.to_csv(outdir_embeddings + seed + '.csv', index=False)
            
            rebuild_df = pd.DataFrame(data={parameters.x_col: points_3d_rebuilt[:, 0],
                parameters.y_col: points_3d_rebuilt[:, 1],
                parameters.z_col: points_3d_rebuilt[:, 2],
                parameters.distance_col: surface_distance_penalty})
            rebuild_df.to_csv(outdir_reconstructions + seed + '.csv', index=False)
            
            data_frame_patch.to_csv(outdir_csvs + seed + '.csv', columns=[parameters.x_col,
                parameters.y_col,
                parameters.z_col,
                parameters.distance_col], index=False)
Example #9
def main(args):
    data_file = args.data_file
    outdir = utils.add_trailing_slash(args.outdir)

    utils.make_dir(outdir)

    data_frame = pd.read_csv(data_file)

    max_distance = args.max_distance
    n_neighbors = args.n_neighbors
    seeds_folder = utils.add_trailing_slash(args.seeds_folder)
    sigma = args.sigma
    debug = args.debug

    if debug:
        outdir_embeddings = outdir + 'embeddings/'
        outdir_reconstructions = outdir + 'reconstructions/'
        outdir_csvs = outdir + 'csv_patches/'
        outdir_single_points = outdir + 'single_points/'
        outdir_faulty = outdir + 'faulty/'
        utils.make_dir(outdir_embeddings)
        utils.make_dir(outdir_reconstructions)
        utils.make_dir(outdir_csvs)
        utils.make_dir(outdir_single_points)
        utils.make_dir(outdir_faulty)

    seeds = utils.get_filenames(seeds_folder)

    for seed in seeds:
        X = data_frame.as_matrix(
            [parameters.x_col, parameters.y_col, parameters.z_col])
        name = data_frame[parameters.name_col]

        patch_maker = PatchMaker(X, int(seed), n_neighbors, max_distance)
        patch = patch_maker.patch_data()

        if len(patch) == 1:
            print "There is one point in patch from seed " + seed + " with geodesic radius " + repr(
                max_distance)
            print "Most likely a false positive, skipping..."
            if debug:
                print "Saving patch with one point for debug purposes..."
                single_frame_patch = data_frame[data_frame.index.isin(patch)]
                single_frame_patch.to_csv(outdir_single_points + seed + '.csv',
                                          index=False)
            continue
        elif len(patch) == 2:
            print "There are two points in patch from seed " + seed + " with geodesic radius " + repr(
                max_distance)
            print "Most likely two false positives, skipping..."
            if debug:
                print "Saving patch with two points for debug purposes..."
                single_frame_patch = data_frame[data_frame.index.isin(patch)]
                single_frame_patch.to_csv(outdir_single_points + seed + '.csv',
                                          index=False)
            continue

        data_frame_patch = data_frame[data_frame.index.isin(patch)]

        X_patch = data_frame_patch.as_matrix(
            [parameters.x_col, parameters.y_col, parameters.z_col])

        n_neighbors_patch = graph_utils.compute_minimum_nearest_neighbors(
            X_patch)

        iso = IsomapEmbedder(n_neighbors_patch)

        try:
            points_2d = iso.compute(X_patch)
        except ValueError:
            print "Processing seed " + seed + "..."
            print "Got a strange ValueError due to sparse representation, skipping the patch..."
            if debug:
                print "Saving faulty patch for debug purposes..."
                data_frame_patch.to_csv(outdir_faulty + seed + '.csv',
                                        index=False)
            continue

        metric = EuclideanMetric()
        kernel = GaussianKernel(sigma, metric)

        weights = kernel.compute_multiple(points_2d)
        np.fill_diagonal(weights, 0)

        low = Lowess(metric, parameters.robust_iter)

        points_3d_rebuilt = low.fit_transform(points_2d, X_patch, weights)

        surface_cleaner = SurfaceCleaner(metric)
        surface_distance_penalty = surface_cleaner.compute_distances(
            X_patch, points_3d_rebuilt)

        data_frame_patch[parameters.distance_col] = surface_distance_penalty
        data_frame_patch.to_csv(outdir + seed + '.marker', index=False)

        if debug:
            embed_df = pd.DataFrame(
                data={
                    parameters.x_col: points_2d[:, 0],
                    parameters.y_col: points_2d[:, 1],
                    parameters.distance_col: surface_distance_penalty
                })
            embed_df.to_csv(outdir_embeddings + seed + '.csv', index=False)

            rebuild_df = pd.DataFrame(
                data={
                    parameters.x_col: points_3d_rebuilt[:, 0],
                    parameters.y_col: points_3d_rebuilt[:, 1],
                    parameters.z_col: points_3d_rebuilt[:, 2],
                    parameters.distance_col: surface_distance_penalty
                })
            rebuild_df.to_csv(outdir_reconstructions + seed + '.csv',
                              index=False)

            data_frame_patch.to_csv(outdir_csvs + seed + '.csv',
                                    columns=[
                                        parameters.x_col, parameters.y_col,
                                        parameters.z_col,
                                        parameters.distance_col
                                    ],
                                    index=False)