Example #1
 def __init__(self, labels, optical_flow_path, segments_path, batch_size=BATCH_SIZE, shuffle=True):
     self.batch_size = batch_size
     self.labels = labels
     self.optical_flow = da.from_npy_stack(optical_flow_path)
     self.segments = da.from_npy_stack(segments_path)
     self.shuffle = shuffle
     self.on_epoch_end()
Example #2
def get_or_create_array(config, npy_stack_dir=None):
    """ Load or create Dask Array for tests. You can specify a test case too.

    If file exists the function returns the array.
    If chunk_shape given the function rechunk the array before returning it.
    If file does not exist it will be created using "shape" parameter.

    Arguments (from config object):
    ----------
        file_path: File containing the array, will be created if does not exist.
        chunk_shape: 
        shape: Shape of the array to create if does not exist.
        test_case: Test case. If None, returns the test array.
        nb_chunks: Number of chunks to treat in the test case.
        overwrite: Use the chunk_shape to create a new array, overwriting if file_path already used.
        split_file: for the test case 'split'
    """

    file_path = config.array_filepath
    if not os.path.isfile(file_path):
        raise FileNotFoundError(file_path)

    # get the file and rechunk logically using a chosen chunk shape, or dask default
    if npy_stack_dir:
        arr = da.from_npy_stack(dirname=npy_stack_dir, mmap_mode=None)
    else:
        if config.chunks_shape:
            arr = get_dask_array_from_hdf5(
                file_path, logic_chunks_shape=config.chunks_shape)
        else:
            arr = get_dask_array_from_hdf5(file_path)  # TODO: see what happens
    return arr
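
For orientation, a hypothetical call sketch for the helper above. The SimpleNamespace and the paths are placeholders standing in for the project's real config object, and get_dask_array_from_hdf5 is assumed to be the project's own helper already imported alongside get_or_create_array:

from types import SimpleNamespace

# Placeholder config; the real project passes its own config object carrying these attributes.
config = SimpleNamespace(
    array_filepath='data/original_array.hdf5',  # must already exist, otherwise FileNotFoundError is raised
    chunks_shape=(1400, 1400, 350),             # or None to keep the HDF5 file's default chunking
)

# Load directly from an .npy stack directory...
arr = get_or_create_array(config, npy_stack_dir='data/npy_stack')
# ...or fall back to the HDF5 file referenced by the config:
arr = get_or_create_array(config)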
Example #3
def measuring():

    global sky
    global dirty
    global psf
     
    list_schedule = []
    list_compute = []
    list_total = []
    list_load = []
    n = 5
    
    for i in range(1):
        start_time1 = time.time()
        #sky_npy, sky = load_data(os.path.split(os.path.split(os.getcwd())[0])[0] + '/sky.npy', n)
        #dirty_npy, dirty = load_data(os.path.split(os.path.split(os.getcwd())[0])[0] + '/dirty.npy', n)
        #psf_npy, psf = load_data(os.path.split(os.path.split(os.getcwd())[0])[0] + '/psf.npy', n)
        sky = da.from_npy_stack(os.getcwd() + '/skyStack')
        dirty = da.from_npy_stack(os.getcwd() + '/dirtyStack')
        psf = da.from_npy_stack(os.getcwd() + '/psfStack')
        end_time1 = time.time()
        
        start_time2 = time.time()
        scheduling()
        end_time2 = time.time()
	
        #pbar = ProgressBar()
        #with pbar, Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler() as cprof:	
        start_time3 = time.time()        
        quad.compute()
        end_time3 = time.time()
        
        list_load.append(end_time1 - start_time1)
        list_schedule.append(end_time2 - start_time2)
        list_compute.append(end_time3 - start_time3)
        list_total.append(end_time3 - start_time1)
        #visualize([prof, rprof, cprof])
    
    print('number of dimensions: {}'.format(n))
    print('load time: {}'.format(round(sum(list_load)/len(list_load), 4)))
    print('scheduling time: {}'.format(round(sum(list_schedule)/len(list_schedule), 4)))
    print('compute time: {}'.format(round(sum(list_compute)/len(list_compute), 4)))
    print('total time: {}'.format(round(sum(list_total)/len(list_total), 4)))
Example #4
 def _tile_generator(self):
     if str(self.input_path).endswith('zarr'):
         level_array = da.from_zarr(
             str(self.input_path)
         )
     else:
         level_array = da.from_npy_stack(
             str(self.input_path)
         )
     block_array = level_array.blocks[0,0].compute()
     yield block_array
Example #5
def dask_player(video, segmentation, optical_flow):
    Cache(2e6).register()  # Turn cache on globally
    video = cv2.VideoCapture(video)
    segmentation = da.from_npy_stack(segmentation)
    optical_flow = da.from_npy_stack(optical_flow)

    bar = Bar('Frame', max=video.get(cv2.CAP_PROP_FRAME_COUNT))
    bar.index = OFFSET
    video.set(cv2.CAP_PROP_POS_FRAMES, OFFSET)
    while video.isOpened():
        video_frame = video.read()[1]
        optical_flow_frame = optical_flow[bar.index].compute()
        segmentation_frame = segmentation[bar.index].compute()
        cv2.imshow('video', video_frame)
        cv2.imshow('segmentation', segmentation_frame)
        cv2.imshow('optical_flow', visualize_optical_flow(optical_flow_frame))
        import pdb
        pdb.set_trace()
        cv2.waitKey(27)
        bar.next()
Example #6
def get_data(data_loc: str) -> tuple:
    data_folders = [
        os.path.join(data_loc, x) for x in os.listdir(data_loc)
        if 'cb46fd46' in x
    ]
    out_data = None
    out_labels = None
    out_times = None

    for data_folder in tqdm(data_folders):
        all_data_fol = os.path.join(data_folder, 'data')
        label_folder = os.path.join(data_folder, 'labels')
        times_folder = os.path.join(data_folder, 'times')

        if out_data is None:
            out_data = da.from_npy_stack(all_data_fol)
        else:
            out_data = da.concatenate(
                [out_data, da.from_npy_stack(all_data_fol)])

        if out_labels is None:
            out_labels = da.from_npy_stack(label_folder)
        else:
            out_labels = da.concatenate(
                [out_labels, da.from_npy_stack(label_folder)])

        if out_times is None:
            out_times = da.from_npy_stack(times_folder)
        else:
            out_times = da.concatenate(
                [out_times, da.from_npy_stack(times_folder)])

    return out_data.compute(), out_labels.compute(), out_times.compute()
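
A note on the pattern above: concatenating inside the loop rebuilds the Dask graph pair by pair. A behavior-equivalent sketch (same folders, same os/tqdm/da imports assumed) that collects the per-folder stacks in lists and concatenates each one once:

def get_data(data_loc: str) -> tuple:
    data_folders = [
        os.path.join(data_loc, x) for x in os.listdir(data_loc)
        if 'cb46fd46' in x
    ]
    data_parts, label_parts, time_parts = [], [], []
    for data_folder in tqdm(data_folders):
        # Lazily open each per-folder .npy stack and collect it.
        data_parts.append(da.from_npy_stack(os.path.join(data_folder, 'data')))
        label_parts.append(da.from_npy_stack(os.path.join(data_folder, 'labels')))
        time_parts.append(da.from_npy_stack(os.path.join(data_folder, 'times')))

    # Concatenate once per output, then materialize.
    return (da.concatenate(data_parts).compute(),
            da.concatenate(label_parts).compute(),
            da.concatenate(time_parts).compute())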
Example #7
def test_to_npy_stack():
    x = np.arange(5*10*10).reshape((5, 10, 10))
    d = da.from_array(x, chunks=(2, 4, 4))

    dirname = mkdtemp()
    try:
        da.to_npy_stack(dirname, d, axis=0)
        assert os.path.exists(os.path.join(dirname, '0.npy'))
        assert (np.load(os.path.join(dirname, '1.npy')) == x[2:4]).all()

        e = da.from_npy_stack(dirname)
        assert eq(d, e)
    finally:
        shutil.rmtree(dirname)
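
As a side note on what this round-trip relies on: da.to_npy_stack writes one .npy file per chunk along the chosen axis plus an info metadata file, and da.from_npy_stack reads that directory back into an equivalent array. A minimal sketch outside the test harness (temporary directory, illustrative shapes):

import os
from tempfile import mkdtemp

import numpy as np
import dask.array as da

d = da.from_array(np.arange(5 * 10 * 10).reshape((5, 10, 10)), chunks=(2, 4, 4))
dirname = mkdtemp()
da.to_npy_stack(dirname, d, axis=0)
# Axis 0 is split into chunks of sizes (2, 2, 1), so three block files plus the metadata file are expected:
print(sorted(os.listdir(dirname)))  # ['0.npy', '1.npy', '2.npy', 'info']
e = da.from_npy_stack(dirname)
assert (e.compute() == d.compute()).all()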
Example #8
def test_to_npy_stack():
    x = np.arange(5 * 10 * 10).reshape((5, 10, 10))
    d = da.from_array(x, chunks=(2, 4, 4))

    dirname = mkdtemp()
    try:
        da.to_npy_stack(dirname, d, axis=0)
        assert os.path.exists(os.path.join(dirname, '0.npy'))
        assert (np.load(os.path.join(dirname, '1.npy')) == x[2:4]).all()

        e = da.from_npy_stack(dirname)
        assert eq(d, e)
    finally:
        shutil.rmtree(dirname)
Example #9
def benchmark_dask_map_block_1d_sparse_distributed(filename,
                                                   cutoff=15,
                                                   direct_output=True):
    global point_array
    func_name = sys._getframe().f_code.co_name
    results = []
    #cache = Chest(path=os.path.join(BASE_DIRECTORY, "cache"), available_memory=98e9)

    cluster_details = client.ncores()
    number_nodes = len(cluster_details.keys())
    number_cores = sum(cluster_details.values())
    with dask.set_options(get=client.get):
        start = time.time()
        point_array = da.from_npy_stack(filename)
        chunk_size = point_array.chunks[0][0]
        end_read = time.time()
        results.append(
            "%s,dask-distributed, %s, %d, %d, comet, read_file, %.4f" %
            (filename, func_name, number_nodes, number_cores,
             end_read - start))
        dist_matrix = da.zeros((point_array.shape[0], point_array.shape[0]),
                               chunks=(chunk_size, point_array.shape[0]))
        """map_block_distances operates on point_array """
        da_res = dist_matrix.map_blocks(map_blocks_1d_sparse,
                                        chunks=(chunk_size, 3),
                                        dtype='int')
        res = da_res.compute()
        end_compute = time.time()
        results.append(
            "%s,dask-distributed, %s, %d, %d, comet, compute, %.4f" %
            (filename, func_name, number_nodes, number_cores,
             end_compute - end_read))
        results.append("%s,dask-distributed, %s, %d, %d, comet, total, %.4f" %
                       (filename, func_name, number_nodes, number_cores,
                        end_compute - start))
        print("\n".join(results))
Example #10
import numpy as np
from keras.models import load_model
from dask import array as da

hybrid_model = load_model('results/hybrid_model.hdf5')

def predict(block):
    if block.shape == (1,1,1,1):
        return block
    return hybrid_model.predict(block)

if __name__ == '__main__':
    from multiprocessing.pool import ThreadPool
    import dask
    with dask.config.set(pool=ThreadPool(8)):
        segments = da.from_npy_stack('data/comma_ai/test_segments')[1:] # drop first segment
        optical_flow = da.from_npy_stack('data/comma_ai/test_optical_flow')

        frame = da.concatenate([optical_flow, segments.reshape((10797, 480, 640, 1))], axis=3)
        predicted_speeds = None
        for block in frame.blocks:
            if predicted_speeds is None:
                predicted_speeds = hybrid_model.predict(block.compute())
            else:
                predicted_speeds = np.concatenate([predicted_speeds, hybrid_model.predict(block.compute())])
            print('processed block')

        # duplicate first prediction
        with open('test.txt', 'w+') as f:
            np.savetxt(f,predicted_speeds)
Example #11
        type=str,
        metavar='savedir',
        help='Directory in which to save model and evaluation plots')

    #parse the arguments
    args = parser.parse_args()
    impaths_file = args.impaths_file
    labels_fpath = args.labels_fpath
    savedir = args.savedir

    #make sure the savedir exists
    if not os.path.isdir(savedir):
        os.mkdir(savedir)

    #load the dask array
    impaths = da.from_npy_stack(impaths_file)

    #load the labels array
    gt_labels = np.load(labels_fpath)

    #sanity check that the number of labels and impaths are the same
    assert (len(impaths) == len(gt_labels)
            ), "Number of impaths and labels are different!"

    #it's expected that the gt_labels were generated within a Jupyter notebook by
    #using the corrector.py labeling utilities
    #in that case the labels are text with the possible options of "informative", "uninformative", and "none"
    #those with the label "none" are considered the unlabeled set and we make predictions
    #about their labels using the random forest that we train on the labeled images
    good_indices = np.where(gt_labels == 'informative')[0]
    bad_indices = np.where(gt_labels == 'uninformative')[0]
Example #12
import os
import cv2
import numpy as np
import pickle
from keras.models import load_model
from dask import array as da
from progress.bar import Bar

optical_flow_model = load_model('results/optical_flow_baseline.hdf5')

if __name__ == '__main__':
    from multiprocessing.pool import ThreadPool
    import dask
    with dask.config.set(pool=ThreadPool(8)):
        optical_flow = da.from_npy_stack('data/comma_ai/test_optical_flow')

        predicted_speeds = None
        for block in optical_flow.blocks:
            if predicted_speeds is None:
                predicted_speeds = optical_flow_model.predict(block.compute())
            else:
                predicted_speeds = np.concatenate([
                    predicted_speeds,
                    optical_flow_model.predict(block.compute())
                ])
            print('processed block')

        # duplicate first prediction
        with open('test.txt', 'w+') as f:
            np.savetxt(f, predicted_speeds)
Example #13
def learn_clusters(n_clust):
    client = Client(n_workers=4, processes=True)

    # 1. Learn clusters

    # Full set
    kmeans_path = 'Clustering/KMeans/n{}posts.joblib'.format(n_clust)

    array = da.from_npy_stack(npy_stack_path)
    kmeans = KMeans(n_clusters=n_clust)

    # Learn on a part of set
    # array = np.load('Clustering/npy_post_vecs_part/0.npy')
    # kmeans = SKMeans(n_clusters=n_clust)

    print('Fitting')
    kmeans.fit(array)

    del array
    # Dump centroids to the disk

    # Dump as a sklearn object, for (maybe) faster prediction and fewer problems
    skmeans = SKMeans(n_clusters=n_clust)
    skmeans.cluster_centers_ = kmeans.cluster_centers_
    skmeans._n_threads = _openmp_effective_n_threads()
    dump(skmeans, kmeans_path)
    del kmeans, skmeans

    # dump(kmeans, kmeans_path) # For learning on a part of set
    # del kmeans
    print('Fitted')

    # 3. Turn posts into clusters
    kmeans_path = 'Clustering/KMeans/n{}posts.joblib'.format(n_clust)

    df = dd.read_parquet('preprocessed.parquet')
    df = df.map_partitions(df_to_vector_predict,
                           kmeans_path,
                           meta={
                               'user_id': int,
                               'post_id': int,
                               'text': object,
                               'type': str,
                               'date': str,
                               'cluster': int
                           })
    df.to_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))
    print('Clustered')

    # 2.5. Filter outdated posts out. (Next time, write the parsing date to user_info.)
    # For each user find his last like and filter out likes that are older than the last + half a year
    df = dd.read_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))
    print('Original df len: {}'.format(len(df)))

    year = 31536000  # One year in Unix-timestamp seconds
    kyear = 20
    break_time = kyear * year  # Window of likes to keep (originally 0.75*year, i.e. a quarter to a year)
    last_like = df['date'].max().compute()  # The set was fully collected on 8 June 2020

    df = df[df['date'] > last_like - break_time]  # Keep only likes newer than the last like minus break_time
    print('max_date: {} '.format(df['date'].max().compute()))
    print('min date: {}'.format(df['date'].min().compute()))
    print('Filtered df len: {}'.format(len(df)))
    print('Likes have been filtered out by date')

    # 3. Group clusters by user_id and turn them into a single vector for each user

    # df = dd.read_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust)) # INSTEAD OF FILTER!

    # - Count the number of text likes for each user (and later merge with user_info)
    count = df.drop(columns=['post_id', 'type', 'date', 'cluster']).groupby(
        'user_id')['text'].count().compute()
    count.rename('text_likes', inplace=True)

    # Generate meta
    meta = {'user_id': int}
    for i in range(n_clust):
        meta[i] = float

    df = df.map_partitions(
        lambda df_part: kt.clusters_to_vector(df_part, n_clust), meta=meta)

    df.to_parquet(
        'Clustering/KMeans/n{}posts-cluster_vecs.parquet'.format(n_clust))

    # 5. Merge clusters and user_info dataframes. (Working with pandas frames)
    df_info = pd.read_csv('users_info.csv')

    df_info = df_info.merge(count, on='user_id', how='inner')
    del count

    df = pd.read_parquet(
        'Clustering/KMeans/n{}posts-cluster_vecs.parquet'.format(n_clust))

    df = df_info.merge(
        df, on='user_id', how='inner'
    )  # Merge user info and cluster vectors (a possible source of mistakes)

    df.to_csv('Clustering/KMeans/n{}-final_dataset-{}year.csv'.format(
        n_clust, kyear))
    print('Final dataset has been saved')
    del df_info

    # Filter some users out
    # df = pd.read_csv('Clustering/KMeans/n{}-final_dataset.csv'.format(n_clust)).drop(columns=['Unnamed: 0']) # TESTING

    df = df.loc[(df['text_likes'] > 100) & (df['text_likes'] < 1000)]

    df['bdate'] = df['bdate'].apply(
        lambda bd: time.mktime(datetime.strptime(bd, "%d.%m.%Y").timetuple()))

    # Clean up the dataset
    df = df.drop(columns=[
        'posts_n', 'text_likes', 'status', 'sex', 'smoking', 'alcohol',
        'parth_id', 'country', 'city', 'user_id'
    ]).dropna().reset_index(drop=True)

    # 6. Train a Linear Regression model
    regr = LinearRegression()
    R2 = train(df, regr)

    client.close()
    return R2
Example #14
if __name__ == "__main__":
    from distributed import Client, LocalCluster
    import dask.dataframe as df
    import dask.array as da

    cluster = LocalCluster()
    client = Client(cluster)

    matches = da.from_npy_stack("/spell/data/")
    matches = df.from_array(matches)

    # IMPORTANT: the repartition below is optional; drop it if you want a partitioned (multi-file) write
    matches.repartition(npartitions=1).to_csv("predictions.csv")
Example #15
    volume_paths = glob(os.path.join(volume_dir, '*'))
    print(f'Found {len(volume_paths)} volumes in {volume_dir}')

    #extract the volume names
    #NOTE: this is the same code used to generate the names
    #from cross_section3d.py
    volume_names = []
    for vp in volume_paths:
        fext = vp.split('.')[-1] if vp[-6:] != 'nii.gz' else 'nii.gz'
        volume_names.append(vp.split('/')[-1].split(f'.{fext}')[0])

    volume_names = np.array(volume_names)

    #convert the filtered impaths to numpy straightaway
    #dask.array doesn't have good support for string operations
    filtered_impaths = da.from_npy_stack(filtered_impaths_file).compute()

    #the first thing that we need to do is to isolate
    #images from 3d source datasets. during creation we
    #gave 2d files the handy identifier -LOC-2d-
    source3d = np.where(
        np.core.defchararray.find(filtered_impaths, '-LOC-2d') == -1)
    print(
        f'Isolated {len(source3d[0])} images from 3d volumes out of {len(filtered_impaths)}'
    )

    #overwrite filtered_impaths to save space
    #and sort the results such that images from the same
    #source datasets are grouped together
    filtered_impaths = np.sort(filtered_impaths[source3d])