Example #1
def to_hdf5(wrapper, save_as=None):
    if save_as is None:
        save_as = wrapper.dataset_name.replace(' ', '_') + '.h5'

    f = None

    print wrapper.dataset_name

    try:
        f = tables.openFile(save_as, mode='w')

        train_group = f.createGroup('/', 'train', 'train set')
        test_group = f.createGroup('/', 'test', 'test set')

        img_groups = (train_group, None)

        test_idx = None
        dsets = [range(len(wrapper)), None]

        if wrapper.get_standard_train_test_splits() is not None:
            dsets[0], dsets[1] = wrapper.get_standard_train_test_splits()
            img_groups = (img_groups[0], test_group)

        for some_idx, testing in enumerate(dsets):
            if testing is None:
                break
            img_group = img_groups[some_idx]
            dset = dsets[some_idx]

            img_table = f.createCArray(img_group,
                                       'img',
                                       tables.StringAtom(itemsize=1),
                                       shape=(len(dset), 96 * 96 * 3))
            label_table = f.createCArray(img_group,
                                         'label',
                                         tables.Float64Atom(),
                                         shape=(len(dset),
                                                len(keypoints_names), 2))

            for i, _ in enumerate(dset):
                img, label = crop_face(
                    PIL.Image.open(wrapper.get_original_image_path(i)),
                    wrapper.get_bbox(i), wrapper.get_eyes_location(i),
                    wrapper.get_keypoints_location(i))

                img_table[i, :] = [x for x in img.tostring()]
                # NOTE: the image bytes are stored as a C string, so they must be
                # written as strings of size 1 (an embedded \0 would otherwise truncate them)
                for name in keypoints_names:
                    if name in label:
                        point = label[name]
                        label_table[i, keypoints_names.index(name), :] = [
                            point[0], point[1]
                        ]
                    else:
                        label_table[i,
                                    keypoints_names.index(name), :] = [-1, -1]

    finally:
        if f is not None:
            f.close()
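The snippet above uses the legacy PyTables 2.x camelCase API (openFile, createGroup, createCArray) and a Python 2 print statement. Below is a minimal sketch of the same file layout with the current PyTables 3 names; n_images, keypoints_names and the 96 x 96 x 3 image size are placeholders, and the wrapper/crop_face machinery is omitted.

import numpy as np
import tables

n_images = 100                                   # assumed number of images
keypoints_names = ['left_eye', 'right_eye']      # assumed keypoint list

with tables.open_file('dataset.h5', mode='w') as f:
    train_group = f.create_group('/', 'train', 'train set')
    img_table = f.create_carray(train_group, 'img',
                                tables.StringAtom(itemsize=1),
                                shape=(n_images, 96 * 96 * 3))
    label_table = f.create_carray(train_group, 'label',
                                  tables.Float64Atom(),
                                  shape=(n_images, len(keypoints_names), 2))
    # Rows would then be filled as in the loop above, e.g.
    #   img_table[i, :] = np.frombuffer(img.tobytes(), dtype='S1')
    #   label_table[i, k, :] = [x, y]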
Example #2
import numpy as np
import tables as tb
from os.path import getsize
from time import time


def email_array(num, rate):
    emails = np.empty(num, dtype='S4')
    emails[:] = 'spam'
    eggind = np.random.randint(0, num, int(num * (1.0 - rate)))
    emails[eggind] = 'eggs'
    return emails


# 1. Create an HDF5 file which contains a single emails array, uncompressed.  
#    Close this file and use the getsize() function to report the size on disk.
#    Time how long this took and print that out as well.

NUM = 10000000
RATE = 0.75
emails = email_array(NUM, RATE)

t = time()
with tb.openFile('uncompressed.h5', 'w') as f:
    earray = f.createEArray('/', 'emails', tb.StringAtom(4), (0,), expectedrows=NUM)
    earray.append(emails)
tdelta = time() - t

msg = "The uncompressed array is {0} bytes and took {1} ms to write."
print msg.format(getsize('uncompressed.h5'), tdelta * 1000)



# 2. Repeat step 1 but with zlib compression at level 1.

filters = tb.Filters(complib='zlib', complevel=1)

t = time()
with tb.openFile('zlib1.h5', 'w') as f:
    earray = f.createEArray('/', 'emails', tb.StringAtom(4), (0,),
                            filters=filters, expectedrows=NUM)
    earray.append(emails)
tdelta = time() - t

msg = "The zlib1 compressed array is {0} bytes and took {1} ms to write."
print msg.format(getsize('zlib1.h5'), tdelta * 1000)
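As a hedged follow-up sketch (assuming both files written above are still on disk, and using the PyTables 3 spelling open_file), the stored strings can be read back and the on-disk sizes compared directly:

import tables as tb
from os.path import getsize

for fname in ('uncompressed.h5', 'zlib1.h5'):
    with tb.open_file(fname, 'r') as f:
        arr = f.root.emails
        print(fname, getsize(fname), 'bytes,', len(arr), 'rows, first row:', arr[0])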
Example #3
def create_all_arrays(h5, expectedrows=1000):
    """
    Utility functions used by both create_song_file and create_aggregate_files,
    creates all the EArrays (empty).
    INPUT
       h5   - hdf5 file, open with write or append permissions
              metadata and analysis groups already exist!
    """
    # group metadata arrays
    group = h5.root.metadata
    h5.createEArray(where=group,
                    name='similar_artists',
                    atom=tables.StringAtom(20, shape=()),
                    shape=(0, ),
                    title=ARRAY_DESC_SIMILAR_ARTISTS)
    h5.createEArray(group,
                    'artist_terms',
                    tables.StringAtom(256, shape=()), (0, ),
                    ARRAY_DESC_ARTIST_TERMS,
                    expectedrows=expectedrows * 40)
    h5.createEArray(group,
                    'artist_terms_freq',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_ARTIST_TERMS_FREQ,
                    expectedrows=expectedrows * 40)
    h5.createEArray(group,
                    'artist_terms_weight',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_ARTIST_TERMS_WEIGHT,
                    expectedrows=expectedrows * 40)
    # group analysis arrays
    group = h5.root.analysis
    h5.createEArray(where=group,
                    name='segments_start',
                    atom=tables.Float64Atom(shape=()),
                    shape=(0, ),
                    title=ARRAY_DESC_SEGMENTS_START)
    h5.createEArray(group,
                    'segments_confidence',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_SEGMENTS_CONFIDENCE,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'segments_pitches',
                    tables.Float64Atom(shape=()), (0, 12),
                    ARRAY_DESC_SEGMENTS_PITCHES,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'segments_timbre',
                    tables.Float64Atom(shape=()), (0, 12),
                    ARRAY_DESC_SEGMENTS_TIMBRE,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'segments_loudness_max',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_SEGMENTS_LOUDNESS_MAX,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'segments_loudness_max_time',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_SEGMENTS_LOUDNESS_MAX_TIME,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'segments_loudness_start',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_SEGMENTS_LOUDNESS_START,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'sections_start',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_SECTIONS_START,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'sections_confidence',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_SECTIONS_CONFIDENCE,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'beats_start',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_BEATS_START,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'beats_confidence',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_BEATS_CONFIDENCE,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'bars_start',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_BARS_START,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'bars_confidence',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_BARS_CONFIDENCE,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'tatums_start',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_TATUMS_START,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group,
                    'tatums_confidence',
                    tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_TATUMS_CONFIDENCE,
                    expectedrows=expectedrows * 300)
    # group musicbrainz arrays
    group = h5.root.musicbrainz
    h5.createEArray(where=group,
                    name='artist_mbtags',
                    atom=tables.StringAtom(256, shape=()),
                    shape=(0, ),
                    title=ARRAY_DESC_ARTIST_MBTAGS,
                    expectedrows=expectedrows * 5)
    h5.createEArray(group,
                    'artist_mbtags_count',
                    tables.IntAtom(shape=()), (0, ),
                    ARRAY_DESC_ARTIST_MBTAGS_COUNT,
                    expectedrows=expectedrows * 5)
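create_all_arrays expects the metadata, analysis and musicbrainz groups to exist already. The following is a minimal driver sketch using the same legacy 2.x API as the snippet; the ARRAY_DESC_* title constants are assumed to be defined in the same module.

import tables

h5 = tables.openFile('song.h5', mode='w')
h5.createGroup('/', 'metadata', 'song metadata')
h5.createGroup('/', 'analysis', 'song analysis')
h5.createGroup('/', 'musicbrainz', 'musicbrainz data')
create_all_arrays(h5, expectedrows=1000)
h5.close()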
Example #4
import tables
import numpy

fileh = tables.open_file('earray1.h5', mode='w')
a = tables.StringAtom(itemsize=8)
# Use ``a`` as the object type for the enlargeable array.
array_c = fileh.create_earray(fileh.root, 'array_c', a, (0, ), "Chars")
array_c.append(numpy.array(['a' * 2, 'b' * 4], dtype='S8'))
array_c.append(numpy.array(['a' * 6, 'b' * 8, 'c' * 10], dtype='S8'))

# Read the string ``EArray`` we have created on disk.
for s in array_c:
    print 'array_c[%s] => %r' % (array_c.nrow, s)
# Close the file.
fileh.close()
Example #5
 def test_from_dtype_03(self):
     with self.assertWarns(Warning):
         atom1 = tb.Atom.from_dtype(np.dtype('U5'), dflt=b'hello')
     atom2 = tb.StringAtom(itemsize=5, shape=(), dflt=b'hello')
     self.assertEqual(atom1, atom2)
     self.assertEqual(str(atom1), str(atom2))
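For comparison, a small sketch of the mapping that does not warn: a bytes dtype converts to an equivalent StringAtom directly.

import numpy as np
import tables as tb

atom = tb.Atom.from_dtype(np.dtype('S5'))
print(atom)                                 # StringAtom(itemsize=5, shape=(), dflt=b'')
print(atom == tb.StringAtom(itemsize=5))    # True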
Example #6
File: disflow_bystep.py  Project: Dad0u/phd
def calc_flow(
        file_list,
        out_rel='optflow_rel.hdf',  # pair by pair flow (cannot be None)
        out_total='optflow_total.hdf',  # Cumulated flow
        out_res='optflow_res.hdf',  # Residual
        complevel=0,
        complevel_res=0,
        complevel_tot=0,
        use_last=True,
        open_func=lambda s: cv2.imread(s, 0),
        # Preset medium
        finest_scale=0,
        gd_iterations=25,
        patch_size=4,
        patch_stride=1,
        alpha=20,
        delta=5,
        gamma=1,
        iterations=20):

    infos = dict()
    infos['start_time'] = str(datetime.datetime.now())
    infos['dir'] = os.getcwd()
    infos['host'] = os.uname()[1]
    infos['algo'] = 'Disflow-rel'
    infos['algo_version'] = version
    infos['finest_scale'] = finest_scale
    infos['gd_iterations'] = gd_iterations
    infos['patch_size'] = patch_size
    infos['patch_stride'] = patch_stride
    infos['alpha'] = alpha
    infos['delta'] = delta
    infos['gamma'] = gamma
    infos['iterations'] = iterations
    infos['opencv'] = cv2.__version__
    infos_s = str(infos).encode('utf-8')
    o_img = open_func(file_list[0])
    height, width = o_img.shape
    size = 8 * height * width * len(file_list) / 2**20
    output_size = 2 * size if out_total else size
    if out_res:
        output_size += size / 2  # Same type as the fields, but only 1 component
    print("Estimated output size: {:.2f} MB".format(output_size))

    # Opening the main output file
    hrel = tables.open_file(unique_name(out_rel), 'w')

    # Creating the infos node
    hrel.create_array(hrel.root, 'infos', [infos_s])

    # If compression is asked, create the filter
    filt = tables.Filters(complevel=complevel) if complevel else None
    # Create the array at the node 'table'
    arr = hrel.create_earray(hrel.root,
                             'table',
                             tables.Float32Atom(), (0, height, width, 2),
                             expectedrows=len(file_list),
                             filters=filt)
    # Create the array of the names of the images
    max_size = max([len(i.encode('utf-8')) for i in file_list])
    names = hrel.create_earray(hrel.root,
                               'names',
                               tables.StringAtom(max_size), (0, 2),
                               expectedrows=len(file_list))
    res_arr = hrel.create_earray(hrel.root,
                                 'res',
                                 tables.Float32Atom(), (0, ),
                                 expectedrows=len(file_list) - 1)

    # If asked, create the file and array for the residual
    if out_res:
        hres = tables.open_file(unique_name(out_res), 'w')
        filt_r = tables.Filters(complevel=complevel_res) if complevel_res\
            else None
        arr_r = hres.create_earray(hres.root,
                                   'table',
                                   tables.Float32Atom(), (0, height, width),
                                   expectedrows=len(file_list),
                                   filters=filt_r)
        names_r = hres.create_earray(hres.root,
                                     'names',
                                     tables.StringAtom(max_size), (0, 2),
                                     expectedrows=len(file_list) - 1)

    # If asked, create the total (cumulated field)
    if out_total:
        htot = tables.open_file(unique_name(out_total), 'w')
        filt_t = tables.Filters(complevel=complevel_tot) if complevel_tot\
            else None
        arr_t = htot.create_earray(htot.root,
                                   'table',
                                   tables.Float32Atom(), (0, height, width, 2),
                                   expectedrows=len(file_list),
                                   filters=filt_t)
        names_t = htot.create_earray(htot.root,
                                     'names',
                                     tables.StringAtom(max_size), (0, 2),
                                     expectedrows=len(file_list) - 1)

    # Creating the optflow class
    dis = dis_class()
    dis.setFinestScale(finest_scale)
    dis.setGradientDescentIterations(gd_iterations)
    dis.setPatchSize(patch_size)
    dis.setPatchStride(patch_stride)
    dis.setVariationalRefinementAlpha(alpha)
    dis.setVariationalRefinementDelta(delta)
    dis.setVariationalRefinementGamma(gamma)
    dis.setVariationalRefinementIterations(iterations)

    r = None
    t0 = t2 = time()
    total = np.zeros((height, width, 2), dtype=np.float32)
    # Main loop (can catch kb interrupt)
    fb = file_list[0]
    imb = open_func(fb)
    ai = Async_iter(file_list[1:], open_func)
    try:
        for i, (f, img) in enumerate(zip(file_list[1:], ai)):
            fa = fb
            ima = imb
            fb = f
            imb = img
            print("Image {}/{}: {}".format(i + 1, len(file_list), fb))
            # Adding the names of the two images
            names.append([[fa.encode('utf-8'), fb.encode('utf-8')]])
            # Opening the second image
            print("Computing optflow...")
            # Should we initialize the field ?
            r = dis.calc(ima, imb, r if use_last else None)
            # Adding the result to the table
            arr.append(r[None])
            # Computing the residual
            print("Done. Computing residual...")
            res = get_res(ima, imb, r)
            res_arr.append(np.array([scalar_res(res)]))
            if out_res:
                arr_r.append(res[None])
                names_r.append([[fa.encode('utf-8'), fb.encode('utf-8')]])
            if out_total:
                total = compose(total, r)
                arr_t.append(total[None])
                names_t.append([[fa.encode('utf-8'), fb.encode('utf-8')]])
            print("Done.")
            t1 = t2
            t2 = time()
            print("Last loop took {}".format(format_time(t2 - t1)))
            print("  ETA1 {}".format(
                format_time((t2 - t1) * (len(file_list) - i - 1))))
            print("Elapsed time: {}".format(format_time(t2 - t0)))
            print("  ETA2 {}".format(
                format_time((t2 - t0) / (i + 1) * (len(file_list) - i - 1))))
    except KeyboardInterrupt:
        print("Interrupted !")  # Support de la reprise ?
        ai.terminate()

    print("Correlation finished !")
    hrel.create_array(hrel.root, 'elapsed', [time() - t0])
    print("Closing main hdf file..")
    hrel.close()
    print("Done.")
    if out_res:
        print("Closing residual hdf file..")
        hres.close()
        print("Done.")
    if out_total:
        print("Closing total flow hdf file..")
        htot.close()
        print("Done.")
Example #7
h5file.setNodeAttr(group_arr, 'type', 'element')
h5file.setNodeAttr(group_arr, 'entityType', 'face')

#write data
ft_id = h5file.createGroup(h5file.root, 'floatingType')
ars_id = h5file.createGroup(ft_id, 'data3D')
h5file.setNodeAttr(ars_id, 'floatingType', 'arraySet')
h5file.setNodeAttr(ars_id, 'label', 'Data on triangles')
data_arr = h5file.createCArray(ars_id,
                               'data',
                               tables.Float64Atom(),
                               numpy.shape(elttypes),
                               filters=filters)
i = 0
for array in data_arr:
    data_arr[i] = numpy.float64(i)
    i += 1
h5file.setNodeAttr(data_arr, 'label', 'Current on element')
h5file.setNodeAttr(data_arr, 'physicalNature', 'electricCurrent')
h5file.setNodeAttr(data_arr, 'unit', 'ampere')

ds_id = h5file.createGroup(ars_id, 'ds')
dim1_arr = h5file.createCArray(ds_id,
                               'dim1',
                               tables.StringAtom(42), (1, ),
                               filters=filters)
dim1_arr[0] = '/mesh/trianglesMesh/tmesh/group/triangles'
h5file.setNodeAttr(dim1_arr, 'label', 'mesh elements')
h5file.setNodeAttr(dim1_arr, 'physicalNature', 'meshEntity')
h5file.close()
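The snippet above relies on the legacy setNodeAttr/createCArray/createGroup names. Below is a reduced sketch of the same attribute handling with the PyTables 3 API; the file name, array size and attribute values are illustrative only.

import numpy
import tables

with tables.open_file('amelet_sketch.h5', 'w') as h5file:
    ft_id = h5file.create_group(h5file.root, 'floatingType')
    ars_id = h5file.create_group(ft_id, 'data3D')
    h5file.set_node_attr(ars_id, 'floatingType', 'arraySet')
    data_arr = h5file.create_carray(ars_id, 'data', tables.Float64Atom(), (16,))
    data_arr[:] = numpy.arange(16, dtype=numpy.float64)
    h5file.set_node_attr(data_arr, 'unit', 'ampere')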
Example #8
 def test_init_parameters_02(self):
     atom1 = tb.StringAtom(itemsize=12)
     atom2 = atom1.copy(itemsize=100, shape=(2, 2))
     self.assertEqual(atom2,
                      tb.StringAtom(itemsize=100, shape=(2, 2), dflt=b''))
Example #9
def train(nthreads,
          maindir,
          output,
          testartists,
          npicks,
          winsize,
          finaldim,
          trainsongs=None,
          typecompress='picks'):
    """
    Main function to do the training
    Do the main pass with the number of given threads.
    Then, reads the tmp files, creates the main output, delete the tmpfiles.
    INPUT
      - nthreads     - number of threads to use
      - maindir      - dir of the MSD, where to find song files
      - output       - main model, contains everything to perform KNN
      - testartists  - set of artists to ignore
      - npicks       - number of samples to pick per song
      - winsize      - window size (in beats) of a sample
      - finaldim     - final dimension of the sample, something like 5?
      - trainsongs   - list of songs to use for training
      - typecompress - 'picks', 'corrcoeff' or 'cov'
    RETURN
       - nothing
    """
    # sanity checks
    if os.path.isfile(output):
        print 'ERROR: file', output, 'already exists.'
        return
    # initial time
    t1 = time.time()
    # do main pass
    tmpfiles = process_filelist_train_main_pass(nthreads,
                                                maindir,
                                                testartists,
                                                npicks,
                                                winsize,
                                                finaldim,
                                                trainsongs=trainsongs,
                                                typecompress=typecompress)
    if tmpfiles is None:
        print 'Something went wrong, tmpfiles are None'
        return
    # intermediate time
    t2 = time.time()
    stimelen = str(datetime.timedelta(seconds=t2 - t1))
    print 'Main pass done after', stimelen
    sys.stdout.flush()
    # find approximate number of rows per tmpfiles
    h5 = tables.openFile(tmpfiles[0], 'r')
    nrows = h5.root.data.year.shape[0] * len(tmpfiles)
    h5.close()
    # create output
    output = tables.openFile(output, mode='a')
    group = output.createGroup("/", 'data',
                               'KNN MODEL FILE FOR YEAR RECOGNITION')
    output.createEArray(group,
                        'feats',
                        tables.Float64Atom(shape=()), (0, finaldim),
                        'feats',
                        expectedrows=nrows)
    output.createEArray(group,
                        'year',
                        tables.IntAtom(shape=()), (0, ),
                        'year',
                        expectedrows=nrows)
    output.createEArray(group,
                        'track_id',
                        tables.StringAtom(18, shape=()), (0, ),
                        'track_id',
                        expectedrows=nrows)
    # aggregate temp files
    for tmpf in tmpfiles:
        h5 = tables.openFile(tmpf)
        output.root.data.year.append(h5.root.data.year[:])
        output.root.data.track_id.append(h5.root.data.track_id[:])
        output.root.data.feats.append(h5.root.data.feats[:])
        h5.close()
        # delete tmp file
        os.remove(tmpf)
    # close output
    output.close()
    # final time
    t3 = time.time()
    stimelen = str(datetime.timedelta(seconds=t3 - t1))
    print 'Whole training done after', stimelen
    # done
    return
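A hedged sketch for inspecting the resulting model file, using the PyTables 3 spelling; 'knn_model.h5' is a stand-in for whatever was passed as output, and the /data group with its three EArrays is the one created above.

import tables

with tables.open_file('knn_model.h5', 'r') as h5:
    feats = h5.root.data.feats[:]         # (nrows, finaldim) float64
    years = h5.root.data.year[:]          # (nrows,) int
    track_ids = h5.root.data.track_id[:]  # (nrows,) 18-byte strings
    print(feats.shape, years.min(), years.max())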
Example #10
def process_filelist_train(filelist=None,
                           testartists=None,
                           tmpfilename=None,
                           npicks=None,
                           winsize=None,
                           finaldim=None,
                           typecompress='picks'):
    """
    Main function, process all files in the list (as long as their artist
    is not in testartist)
    INPUT
       filelist     - a list of song files
       testartists  - set of artist ID that we should not use
       tmpfilename  - where to save our processed features
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       typecompress - one of 'picks' (win of btchroma), 'corrcoeff' (correlation coefficients),
                      'cov' (covariance)
    """
    # sanity check
    for arg in locals().values():
        assert arg is not None, 'process_filelist_train, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    # create outputfile
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
    output.createEArray(group,
                        'feats',
                        tables.Float64Atom(shape=()), (0, finaldim),
                        '',
                        expectedrows=len(filelist))
    output.createEArray(group,
                        'year',
                        tables.IntAtom(shape=()), (0, ),
                        '',
                        expectedrows=len(filelist))
    output.createEArray(group,
                        'track_id',
                        tables.StringAtom(18, shape=()), (0, ),
                        '',
                        expectedrows=len(filelist))
    # random projection
    ndim = 12  # fixed in this dataset
    if typecompress == 'picks':
        randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
    elif typecompress == 'corrcoeff' or typecompress == 'cov':
        randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
    elif typecompress == 'avgcov':
        randproj = RANDPROJ.proj_point5(90, finaldim)
    else:
        assert False, 'Unknown type of compression: ' + str(typecompress)
    # iterate over files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        # verbose
        if cnt_f % 50000 == 0:
            print 'training... checking file #', cnt_f
        # check file
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        year = GETTERS.get_year(h5)
        track_id = GETTERS.get_track_id(h5)
        h5.close()
        if year <= 0 or artist_id in testartists:
            continue
        # we have a train artist with a song year, we're good
        bttimbre = get_bttimbre(f)
        if typecompress == 'picks':
            if bttimbre is None:
                continue
            # we even have normal features, awesome!
            processed_feats = CBTF.extract_and_compress(bttimbre,
                                                        npicks,
                                                        winsize,
                                                        finaldim,
                                                        randproj=randproj)
        elif typecompress == 'corrcoeff':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.corr_and_compress(timbres,
                                                     finaldim,
                                                     randproj=randproj)
        elif typecompress == 'cov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.cov_and_compress(timbres,
                                                    finaldim,
                                                    randproj=randproj)
        elif typecompress == 'avgcov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.avgcov_and_compress(timbres,
                                                       finaldim,
                                                       randproj=randproj)
        else:
            assert False, 'Unknown type of compression: ' + str(typecompress)
        # save them to tmp file
        n_p_feats = processed_feats.shape[0]
        output.root.data.year.append(np.array([year] * n_p_feats))
        output.root.data.track_id.append(np.array([track_id] * n_p_feats))
        output.root.data.feats.append(processed_feats)
    # we're done, close output
    output.close()
    return
Example #11
    def saveTo(self, file, where, name):
        """Creates SetArray which is a pickled version of this annotation
        """

        atom = tables.StringAtom(flavor='numpy')
        node = file.createVLArray(where, name, atom)
Example #12
def proc_data(pattern, f_ex, params, l_path, d_path, conv_table, CGN):
    # get list of audio and transcript files
    audio_files = [d_path + "/" + x for x in list_files(d_path)]
    audio_files.sort()

    label_files = [l_path + "/" + x for x in list_files(l_path)]
    label_files.sort()

    # create h5 file for the processed data
    data_file = tables.open_file(params[5] + '.h5', mode='a')

    # create pytable atoms
    # if we want filterbanks the feature size is #filters+1 for energy x3 for delta and double delta
    if params[4] == True:
        feature_shape = (params[1] + 1)
        # features are three times bigger if deltas are used
        if params[6] == True:
            feature_shape = feature_shape * 3
    # if we make MFCCs we take the first 12 cepstral coefficients and energy + delta double delta = 39 features
    else:
        feature_shape = (39)
    f_atom = tables.Float64Atom()
    # N.B. label size is hard coded. It provides phoneme and 7 articulatory feature
    # labels
    l_atom = tables.StringAtom(itemsize=5)
    # create a feature and label group branching of the root node
    features = data_file.create_group("/", 'features')
    labels = data_file.create_group("/", 'labels')
    # create a dictionary from the conv table
    cgndict = phoneme_dict(conv_table)

    # check if the audio and transcript files match
    if check_files(audio_files, label_files, f_ex):

        # len(audio_files)
        for x in range(0, len(audio_files)):  #len(audio_files)
            print('processing file ' + str(x))
            # create new leaf nodes in the feature and label groups for every audio file
            f_table = data_file.create_earray(features,
                                              audio_files[x][-12:-4],
                                              f_atom, (0, feature_shape),
                                              expectedrows=100000)
            l_table = data_file.create_earray(labels,
                                              audio_files[x][-12:-4],
                                              l_atom, (0, 8),
                                              expectedrows=100000)

            # read audio samples
            input_data = read(audio_files[x])
            # sampling frequency
            fs = input_data[0]
            # get window and frameshift size in samples
            s_window = int(fs * params[2])
            s_shift = int(fs * params[3])

            # create mfccs
            [mfcc,
             frame_nrs] = get_mfcc(input_data, params[0], params[1], s_window,
                                   s_shift, params[4], params[6])

            # read datatranscript
            trans = parse_transcript(pattern, label_files[x], CGN)
            # convert phoneme transcript to articulatory feature transcript
            l_trans = label_transcript(trans, fs, cgndict)
            nframes = mfcc.shape[0]
            # label frames using the labelled transcript
            l_data = numpy.array(label_frames(nframes, l_trans, s_shift))

            # append new data to the tables
            f_table.append(mfcc)
            l_table.append(l_data)
    else:
        print('audio and transcript files do not match')
    # close the output file
    data_file.close()
    return (mfcc, l_data)
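A hedged sketch for walking the per-file feature and label tables written above; the output name is params[5] + '.h5' in the snippet, so 'processed_data.h5' here is only an assumption.

import tables

with tables.open_file('processed_data.h5', 'r') as f:
    for feat_node in f.root.features:
        label_node = f.get_node('/labels/' + feat_node.name)
        feats = feat_node[:]     # (n_frames, feature_shape) float64
        labels = label_node[:]   # (n_frames, 8) byte strings: phoneme + 7 articulatory labels
        print(feat_node.name, feats.shape, labels.shape)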
Example #13
    def test_base_class_factory(self):
        cls_props_target = [('tags', (set,)),
                            ('target_id', (np.str_,)),
                            ('name', (np.str_,)),
                            ('position', (np.ndarray, np.float_)),
                            ('position_error', (np.ndarray, np.float_)),
                            ('description', (np.str_,))]

        _Target = _base_class_factory('_Target', class_type='base',
                                      class_properties=cls_props_target)
        cls_props_dqt = [('tags', (set,)), ('name', (np.str_,)),
                         ('reference', (np.str_,))]

        _DataQualityType = _base_class_factory('_DataQualityType', 'base',
                                               class_properties=cls_props_dqt)

        cls_props_rawdt = [('tags', (set,)),
                           ('inc_angle', (np.ndarray, np.float_)),
                           ('inc_angle_error', (np.ndarray, np.float_)),
                           ('bearing', (np.ndarray, np.float_)),
                           ('bearing_error', (np.ndarray, np.float_)),
                           ('position', (np.ndarray, np.float_)),
                           ('position_error', (np.ndarray, np.float_)),
                           ('path_length', (np.ndarray, np.float_)),
                           ('path_length_error', (np.ndarray, np.float_)),
                           ('d_var', (np.ndarray, np.float_)),
                           ('ind_var', (np.ndarray, np.float_)),
                           ('datetime', (np.ndarray, datetime.datetime)),
                           ('data_quality', (np.ndarray, np.float_)),
                           ('integration_time', (np.ndarray, np.float_)),
                           ('no_averages', (np.float_,)),
                           ('temperature', (np.float_,)),
                           ('user_notes', (np.str_,))]
        cls_refr_rawdt = [('instrument', (_Instrument,)),
                          ('target', (_Target,)),
                          ('type', (_RawDataType,)),
                          ('data_quality_type', (np.ndarray,
                                                 _DataQualityType))]

        filename = tempfile.mktemp()
        h5f = tables.open_file(filename, 'w')
        h5f.create_earray('/', 'hash', tables.StringAtom(itemsize=28), (0,))

        TargetBuffer = _buffer_class_factory('TargetBuffer',
                                             class_properties=cls_props_target)
        DataQualityTypeBuffer = \
                _buffer_class_factory('DataQualityTypeBuffer',
                                      class_properties=cls_props_dqt)

        dtb1 = DataQualityTypeBuffer(name='q-measure 1')
        dtb2 = DataQualityTypeBuffer(name='q-measure 2')

        tb = TargetBuffer(name='White Island', position=(177.2, -37.5, 50))

        group_name = _Target.__name__.strip('_')
        h5f.create_group('/', group_name)
        rid = ResourceIdentifier()
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            group = h5f.create_group('/'+group_name, str(rid))
        t = _Target(group, tb)

        rid = ResourceIdentifier()
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            group = h5f.create_group('/'+group_name, str(rid))
        dt1 = _DataQualityType(group, dtb1)

        rid = ResourceIdentifier()
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            group = h5f.create_group('/'+group_name, str(rid))
        dt2 = _DataQualityType(group, dtb2)

        _RawData = _base_class_factory('_RawData', class_type='base',
                                       class_properties=cls_props_rawdt,
                                       class_references=cls_refr_rawdt)

        RawDataBuffer = _buffer_class_factory('RawDataBuffer',
                                              class_properties=cls_props_rawdt,
                                              class_references=cls_refr_rawdt)
        rdb = RawDataBuffer(d_var=np.zeros((1, 2048)), ind_var=np.arange(2048),
                            datetime=['2017-01-10T15:23:00'], no_averages=23,
                            user_notes='something', target=t,
                            data_quality_type=[dt1, dt2])
        group_name = _RawData.__name__.strip('_')
        h5f.create_group('/', group_name)
        rid = ResourceIdentifier()
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            group = h5f.create_group('/'+group_name, str(rid))
        rd = _RawData(group, rdb)
        np.testing.assert_array_equal(rd.d_var[:], np.zeros((1, 2048)))
        with self.assertRaises(AttributeError):
            rd.something = 'something'
        with self.assertRaises(AttributeError):
            rd.d_var = np.ones((1, 2048))
        self.assertEqual(rd.user_notes, 'something')
        self.assertEqual(rd.no_averages, 23)
        np.testing.assert_array_equal(rd.target.position[:],
                                      np.array([177.2, -37.5, 50.]))
        self.assertEqual(rd.target.name, 'White Island')
        self.assertEqual(rd.data_quality_type[1].name, 'q-measure 2')
Example #14
def put_string(h5, where, name, string, ext_name):
    '''Put a string in an HDF5 file.'''

    atom = tables.StringAtom(itemsize=len(string))
    ca = h5.createCArray(where, name, atom, (1, ), ext_name)
    ca[:] = string[:]
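put_string uses the legacy createCArray name. Below is a minimal modern sketch of the same idea, with a made-up file name and contents.

import tables


def put_string_v3(h5, where, name, string, title):
    """Store one fixed-length string in a length-1 CArray."""
    atom = tables.StringAtom(itemsize=len(string))
    ca = h5.create_carray(where, name, atom, (1,), title)
    ca[0] = string


with tables.open_file('notes.h5', 'w') as h5:
    put_string_v3(h5, h5.root, 'notes', 'experiment 42', 'free-form notes')
    print(h5.root.notes[0])   # b'experiment 42'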
Example #15
root = fileh.root
vlarray = fileh.create_vlarray(root,
                               'vlarray1',
                               tables.Int32Atom(),
                               "ragged array of ints",
                               filters=tables.Filters(1))
# Append some (variable length) rows:
vlarray.append(numpy.array([5, 6]))
vlarray.append(numpy.array([5, 6, 7]))
vlarray.append([5, 6, 9, 8])

# Now, do the same with native Python strings.
vlarray2 = fileh.create_vlarray(root,
                                'vlarray2',
                                tables.StringAtom(itemsize=2),
                                "ragged array of strings",
                                filters=tables.Filters(1))
vlarray2.flavor = 'python'
# Append some (variable length) rows:
vlarray2.append(['5', '66'])
vlarray2.append(['5', '6', '77'])
vlarray2.append(['5', '6', '9', '88'])

# Test with lists of bidimensional vectors
vlarray3 = fileh.create_vlarray(root, 'vlarray3',
                                tables.Int64Atom(shape=(2, )),
                                "Ragged array of vectors")
a = numpy.array([[1, 2], [1, 2]], dtype=numpy.int64)
vlarray3.append(a)
vlarray3.append(numpy.array([[1, 2], [3, 4]], dtype=numpy.int64))
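A short continuation sketch, assuming the fileh handle and the three VLArrays from the excerpt above are still open, showing that each row reads back with its own length:

for row in vlarray:
    print(row)         # [5 6], then [5 6 7], then [5 6 9 8]
print(vlarray2[1])     # the three strings of the second row, as a plain Python list
print(vlarray3[1])     # a (2, 2) int64 array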
Example #16
 def test_init_parameters_01(self):
     atom1 = tb.StringAtom(itemsize=12)
     atom2 = atom1.copy()
     self.assertEqual(atom1, atom2)
     self.assertEqual(str(atom1), str(atom2))
     self.assertIsNot(atom1, atom2)
Example #17
import random

######################################################################################################################################

os.chdir(base_dir + '/' + dataname + '/' + experiment)

seed = random.randrange(
    sys.maxsize
)  #get a random seed so that we can reproducibly do the cross validation setup
random.seed(seed)  # set the seed
#print(f"random seed (note down for reproducibility): {seed}")

img_dtype = tables.UInt8Atom(
)  # dtype in which the images will be saved, this indicates that images will be saved as unsigned int 8 bit, i.e., [0,255]
filenameAtom = tables.StringAtom(
    itemsize=255
)  #create an atom to store the filename of the image, just in case we need it later.

files = glob.glob(
    '../imgs_json_masks/*.png'
)  # create a list of the files, in this case we're only interested in files which have masks so we can use supervised learning

#create training and validation stages and split the files appropriately between them
phases = {}
phases["train"], phases["val"] = next(
    iter(
        model_selection.ShuffleSplit(n_splits=1,
                                     test_size=test_set_size).split(files)))

print(f"\tDataset: {dataname}")
print(f"\tExperiment: {experiment}")
Example #18
 def test_init_parameters_03(self):
     atom1 = tb.StringAtom(itemsize=12)
     self.assertRaises(TypeError, atom1.copy, foobar=42)
Example #19
def calc_flow(original_image,
              file_list,
              out_file='optflow.hdf',
              out_res='optflow_res.hdf',
              complevel=0,
              complevel_res=0,
              use_last=True,
              open_func=lambda s: cv2.imread(s, 0),
              # Preset medium
              finest_scale=0,
              gd_iterations=25,
              patch_size=4,
              patch_stride=1,
              alpha=20,
              delta=5,
              gamma=1,
              iterations=20):

  infos = dict()
  infos['start_time'] = str(datetime.datetime.now())
  infos['dir'] = os.getcwd()
  infos['host'] = os.uname()[1]
  infos['algo'] = 'Disflow'
  infos['algo_version'] = version
  infos['finest_scale'] = finest_scale
  infos['gd_iterations'] = gd_iterations
  infos['patch_size'] = patch_size
  infos['patch_stride'] = patch_stride
  infos['alpha'] = alpha
  infos['delta'] = delta
  infos['gamma'] = gamma
  infos['iterations'] = iterations
  infos['opencv'] = cv2.__version__
  infos_s = str(infos).encode('utf-8')
  o_img = open_func(original_image)
  height, width = o_img.shape
  output_size = 8 * height * width * len(file_list) / 2**20
  if out_res:
    output_size *= 1.5  # Residual has the same type but half the values
  print("Estimated output size: {:.2f} MB".format(output_size))

  # Opening the main output file
  h = tables.open_file(unique_name(out_file), 'w')

  # Creating the infos node
  h.create_array(h.root, 'infos', [infos_s])

  # If compression is asked, create the filter
  filt = tables.Filters(complevel=complevel) if complevel else None
  # Create the array at the node 'table'
  arr = h.create_earray(h.root, 'table', tables.Float32Atom(),
                        (0, height, width, 2),
                        expectedrows=len(file_list),
                        filters=filt)
  # Create the array of the names of the images
  max_size = max([len(i.encode('utf-8'))
                 for i in file_list + [original_image]])
  names = h.create_earray(h.root, 'names', tables.StringAtom(max_size),
                          (0, 2), expectedrows=len(file_list))
  res_arr = h.create_earray(h.root, 'res', tables.Float32Atom(), (0,),
                            expectedrows=len(file_list))

  # If asked, create the file and array for the residual
  if out_res:
    h_res = tables.open_file(unique_name(out_res), 'w')
    filt_r = tables.Filters(complevel=complevel_res) if complevel_res\
        else None
    arr_r = h_res.create_earray(h_res.root, 'table', tables.Float32Atom(),
                                (0, height, width), expectedrows=len(file_list),
                                filters=filt_r)
    names_r = h_res.create_earray(h_res.root, 'names',
                                  tables.StringAtom(max_size), (0, 2),
                                  expectedrows=len(file_list))
  # Creating the optflow class
  dis = dis_class()
  dis.setFinestScale(finest_scale)
  dis.setGradientDescentIterations(gd_iterations)
  dis.setPatchSize(patch_size)
  dis.setPatchStride(patch_stride)
  dis.setVariationalRefinementAlpha(alpha)
  dis.setVariationalRefinementDelta(delta)
  dis.setVariationalRefinementGamma(gamma)
  dis.setVariationalRefinementIterations(iterations)

  r = None
  t0 = t2 = time()
  # Main loop (can catch kb interrupt)
  ai = Async_iter(file_list, open_func)
  try:
    for i, (f, img) in enumerate(zip(file_list, ai)):
      print("Image {}/{}: {}".format(i + 1, len(file_list), f))
      # Adding the names of the two images
      names.append([[original_image.encode('utf-8'), f.encode('utf-8')]])
      # Opening the second image
      print("Computing optflow...")
      # Should we initialize the field ?
      if use_last:
        r = dis.calc(o_img, img, r)
      else:
        r = dis.calc(o_img, img, None)
      # Adding the result to the table
      arr.append(r[None])
      # Computing the residual
      print("Done. Computing residual...")
      res = get_res(o_img, img, r)
      res_arr.append(np.array([scalar_res(res)]))
      if out_res:
        arr_r.append(res[None])
        names_r.append([[original_image.encode('utf-8'), f.encode('utf-8')]])
      print("Done.")
      t1 = t2
      t2 = time()
      print("Last loop took {}".format(format_time(t2 - t1)))
      print("  ETA1 {}".format(format_time(
          (t2 - t1) * (len(file_list) - i - 1))))
      print("Elapsed time: {}".format(format_time(t2 - t0)))
      print("  ETA2 {}".format(
          format_time((t2 - t0) / (i + 1) * (len(file_list) - i - 1))))
  except KeyboardInterrupt:
    print("Interrupted !")  # Support de la reprise ?
    ai.terminate()

  print("Correlation finished !")
  h.create_array(h.root, 'elapsed', [time() - t0])
  print("Closing main hdf file..")
  h.close()
  print("Done.")
  if out_res:
    print("Closing residual hdf file..")
    h_res.close()
    print("Done.")
Example #20
 def test_from_kind_04(self):
     atom1 = tb.Atom.from_kind('string', itemsize=5, dflt=b'hello')
     atom2 = tb.StringAtom(itemsize=5, shape=(), dflt=b'hello')
     self.assertEqual(atom1, atom2)
     self.assertEqual(str(atom1), str(atom2))
Example #21
                   atom=a,
                   shape=(0, CF.NUMLAT, CF.NUMLON),
                   title=CF.DATA_TITLE)

# Create the EArray for grid information, and populate it.
a = tables.Float64Atom()
myh5.create_earray(myh5.root,
                   name="gridInfo",
                   atom=a,
                   shape=(0, len(CF.INFO_LIST)),
                   title=CF.GRIDINFO_TITLE)
mydatum = np.array([CF.INFO_LIST])  # Note extra dimension, for enumeration.
myh5.root.gridInfo.append(mydatum)

# Create the EArray for date-time information.
a = tables.StringAtom(itemsize=CF.DATETIME_SIZE)
myh5.create_earray(myh5.root,
                   name="dateTime",
                   atom=a,
                   shape=(0, CF.DATETIME_NUM),
                   title=CF.DATETIME_TITLE)
if CF.VERBOSE:
    print("HDF5 object before being populated:")
    print(myh5)

##### Loop over the specified directories.  #####

idxstart = len(CF.DATAFILE_HEADER)

# Outer loop is over per-date folders.
for d in range(len(DIR_TOREAD)):
Example #22
def Run(args):
    in_fn_list = args.in_fn
    out_fn = args.out_fn
    platform = args.platform
    pileup = args.pileup

    global param
    float_type = 'int32'
    if pileup:
        import shared.param_p as param
    else:
        import shared.param_f as param
        float_type = 'int8'

    tensor_shape = param.ont_input_shape if platform == 'ont' else param.input_shape

    # select all match prefix if file path not exists
    tables.set_blosc_max_threads(64)
    int_atom = tables.Atom.from_dtype(np.dtype(float_type))
    string_atom = tables.StringAtom(itemsize=param.no_of_positions + 50)
    long_string_atom = tables.StringAtom(itemsize=5000)  # max alt_info length
    table_file = tables.open_file(out_fn, mode='w', filters=FILTERS)
    table_file.create_earray(where='/',
                             name='position_matrix',
                             atom=int_atom,
                             shape=[0] + tensor_shape,
                             filters=FILTERS)
    table_file.create_earray(where='/',
                             name='position',
                             atom=string_atom,
                             shape=(0, 1),
                             filters=FILTERS)
    table_file.create_earray(where='/',
                             name='label',
                             atom=int_atom,
                             shape=(0, param.label_size),
                             filters=FILTERS)
    table_file.create_earray(where='/',
                             name='alt_info',
                             atom=long_string_atom,
                             shape=(0, 1),
                             filters=FILTERS)

    table_dict = utils.update_table_dict()
    total_compressed = 0

    for f in in_fn_list:
        print("[INFO] Merging file {}".format(f))
        fi = tables.open_file(f, mode='r')
        assert (len(fi.root.label) == len(fi.root.position) == len(
            fi.root.position_matrix) == len(fi.root.alt_info))
        for index in range(len(fi.root.label)):
            table_dict['label'].append(fi.root.label[index])
            table_dict['position'].append(fi.root.position[index])
            table_dict['position_matrix'].append(
                fi.root.position_matrix[index])
            table_dict['alt_info'].append(fi.root.alt_info[index])

            total_compressed += 1

            if total_compressed % 500 == 0 and total_compressed > 0:
                table_dict = utils.write_table_file(table_file, table_dict,
                                                    tensor_shape,
                                                    param.label_size,
                                                    float_type)

            if total_compressed % 50000 == 0:
                print("[INFO] Compressed %d tensor" % (total_compressed),
                      file=sys.stderr)
        fi.close()

    if total_compressed % 500 != 0 and total_compressed > 0:
        table_dict = utils.write_table_file(table_file, table_dict,
                                            tensor_shape, param.label_size,
                                            float_type)
        print("[INFO] Compressed %d tensor" % (total_compressed),
              file=sys.stderr)

    table_file.close()
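A small hedged sketch for spot-checking the merged output; 'merged.h5' stands in for out_fn, and the node names follow the create_earray calls above.

import tables

with tables.open_file('merged.h5', 'r') as f:
    n = len(f.root.label)
    print("[INFO] merged %d tensors" % n)
    print(f.root.position_matrix.shape)       # [n] + tensor_shape
    print(f.root.position[0], f.root.alt_info[0])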
Example #23
File: fah.py  Project: jchodera/fahmunge
def strip_water(allatom_filename,
                protein_filename,
                protein_atom_indices,
                min_num_frames=1):
    """Strip water (or other) atoms from a Core17, Core18, or OCore FAH HDF5 trajectory.

    Parameters
    ----------
    allatom_filename : str
        Path to HDF5 trajectory with all atoms.  This trajectory must have been generated by
        concatenate_core17 or concatenate_siegetank--e.g. it must include
        extra metadata that lists the XTC files (bzipped or in OCore directories) that
        have already been processed.  This file will not be modified.
    protein_filename : str
        Path to HDF5 trajectory with just the protein atoms.  This trajectory must have been generated by
        concatenate_core17 or concatenate_siegetank--e.g. it must include
        extra metadata that lists the XTC files (bzipped or in OCore directories) that
        have already been processed.  This file will be appended to.
    protein_atom_indices : np.ndarray, dtype='int'
        List of atom indices to extract from allatom HDF5 file.
    min_num_frames : int, optional, default=1
        Skip if below this number.

    """
    # Check integrity of trajectory if it exists.
    delete_trajectory_if_broken(allatom_filename)

    if not os.path.exists(allatom_filename):
        print("Skipping, %s not found" % allatom_filename)
        return

    trj_allatom = HDF5TrajectoryFile(allatom_filename, mode='r')

    print('all-atom trajectory %s has %d frames' %
          (allatom_filename, len(trj_allatom)))
    if len(trj_allatom) < min_num_frames:
        print("Must have at least %d frames in %s to proceed!" %
              (min_num_frames, allatom_filename))
        del trj_allatom
        return

    if hasattr(trj_allatom.root, "processed_filenames"):
        key = "processed_filenames"  # Core17, Core18 style data
    elif hasattr(trj_allatom.root, "processed_directories"):
        key = "processed_directories"  # Siegetank style data
    else:
        raise (ValueError("Can't find processed files in %s" %
                          allatom_filename))

    # Check integrity of trajectory if it exists.
    delete_trajectory_if_broken(protein_filename)

    # Open the stripped trajectory.
    trj_protein = HDF5TrajectoryFile(protein_filename, mode='a')

    try:
        trj_protein._create_earray(where='/',
                                   name=key,
                                   atom=tables.StringAtom(1024),
                                   shape=(0, ))
        trj_protein.topology = trj_allatom.topology.subset(
            protein_atom_indices)
    except tables.NodeError:
        pass

    n_frames_allatom = len(trj_allatom)
    try:
        n_frames_protein = len(trj_protein)
    except tables.NoSuchNodeError:
        n_frames_protein = 0

    filenames_allatom = getattr(trj_allatom.root, key)
    filenames_protein = getattr(trj_protein._handle.root,
                                key)  # Hacky workaround of MDTraj bug #588

    n_files_allatom = len(filenames_allatom)
    n_files_protein = len(filenames_protein)
    print(
        "Found %d,%d filenames and %d,%d frames in %s and %s, respectively." %
        (n_files_allatom, n_files_protein, n_frames_allatom, n_frames_protein,
         allatom_filename, protein_filename))

    if n_frames_protein > n_frames_allatom:
        raise (ValueError(
            "Found more frames in protein trajectory (%d) than allatom trajectory (%d)"
            % (n_frames_protein, n_frames_allatom)))

    if n_files_protein > n_files_allatom:
        raise (ValueError(
            "Found more filenames in protein trajectory (%d) than allatom trajectory (%d)"
            % (n_files_protein, n_files_allatom)))

    if n_frames_protein == n_frames_allatom or n_files_allatom == n_files_protein:
        if not (n_frames_protein == n_frames_allatom
                and n_files_allatom == n_files_protein):
            raise (ValueError(
                "The trajectories must match in BOTH n_frames and n_filenames or NEITHER."
            ))
        else:
            print("Same number of frames and filenames found, skipping.")
            del trj_allatom, trj_protein
            return

    trj_allatom.seek(
        n_frames_protein)  # Jump forward past what we've already stripped.
    coordinates, time, cell_lengths, cell_angles, velocities, kineticEnergy, potentialEnergy, temperature, alchemicalLambda = trj_allatom.read(
    )
    trj_protein.write(
        coordinates=coordinates[:, protein_atom_indices],
        time=time,
        cell_lengths=cell_lengths,
        cell_angles=cell_angles)  # Ignoring the other fields for now, TODO.

    filenames_protein.append(filenames_allatom[n_files_protein:])
    del trj_allatom, trj_protein
Example #24
import tables as tb
import numpy as np

fileh = tb.open_file('earray1.h5', mode='w')
a = tb.StringAtom(itemsize=8)
# Use ``a`` as the object type for the enlargeable array.
array_c = fileh.create_earray(fileh.root, 'array_c', a, (0, ), "Chars")
array_c.append(np.array(['a' * 2, 'b' * 4], dtype='S8'))
array_c.append(np.array(['a' * 6, 'b' * 8, 'c' * 10], dtype='S8'))

# Read the string ``EArray`` we have created on disk.
for s in array_c:
    print(f'array_c[{array_c.nrow}] => {s!r}')
# Close the file.
fileh.close()
Example #25
def save_images_h5(units, stimulus_list, name, frame_log, video_file, append):
    """Assumes each group is three stimuli with image in second position.
    
    Concatenate second stimuli with first 0.5s of third stimuli"""
    # open first so if there's a problem we don't waste time
    compression_level = 3
    dset_filter = tables.filters.Filters(complevel=compression_level,
                                         complib='blosc:zstd')
    with tables.open_file(name + ".h5", 'w') as h5:
        class_resolver = get_classes_from_stimulus_list(stimulus_list)
        nclasses = len(class_resolver)
        frames, image_classes = glia.get_images_from_vid(
            stimulus_list, frame_log, video_file)

        image_class_num = list(
            map(lambda x: class_resolver[str(x)], image_classes))
        idx_sorted_order = np.argsort(image_class_num)

        # save mapping of class_num target to class metadata
        # this way h5.root.image_classes[n] will give the class metadata string
        logger.info("create class_resolver with max string of 256")
        resolver = h5.create_carray(h5.root, "image_classes",
                                    tables.StringAtom(itemsize=256),
                                    (nclasses, ))
        img_class_array = np.array(image_classes,
                                   dtype="S256")[idx_sorted_order]
        for i, image_class in enumerate(img_class_array):
            resolver[i] = image_class

        atom = tables.Atom.from_dtype(frames[0].dtype)
        images = h5.create_carray(h5.root,
                                  "images",
                                  atom, (nclasses, *frames[0].shape),
                                  filters=dset_filter)

        frames = np.array(frames)
        nFrames = len(frames)
        for i, idx in enumerate(idx_sorted_order):
            if idx >= nFrames:
                logger.warn(
                    f"skipping class {image_classes[idx]} as no accompanying frame. This should only occur if experiment stopped early."
                )
                continue
            images[i] = frames[idx]

        print("finished saving images")
        get_image_responses = glia.compose(
            # returns a list
            partial(glia.create_experiments,
                    stimulus_list=stimulus_list,
                    progress=True,
                    append_lifespan=append),
            partial(glia.group_by, key=lambda x: x["metadata"]["group"]),
            glia.group_dict_to_list,
            glia.f_filter(partial(glia.group_contains, "IMAGE")),
            # truncate to 0.5s
            glia.f_map(lambda x: [x[1], truncate(x[2], 0.5)]),
            glia.f_map(glia.merge_experiments),
            partial(glia.group_by, key=lambda x: x["metadata"]["cohort"]),
            # glia.f_map(f_flatten)
        )

        image_responses = get_image_responses(units)
        ncohorts = len(image_responses)
        ex_cohort = glia.get_value(image_responses)
        images_per_cohort = len(ex_cohort)
        print("images_per_cohort", images_per_cohort)
        duration = ex_cohort[0]["lifespan"]

        d = int(np.ceil(duration * 1000))  # 1ms bins
        logger.info(f"ncohorts: {ncohorts}")
        # import pdb; pdb.set_trace()

        logger.info(f"nclasses: {nclasses}")
        if nclasses < 256:
            class_dtype = np.dtype('uint8')
        else:
            class_dtype = np.dtype('uint16')

        class_resolver_func = lambda c: class_resolver[str(c)]

        # determine shape
        experiments = glia.flatten_group_dict(image_responses)
        nE = len(experiments)
        d = int(np.ceil(duration * 1000))  # 1ms bins
        data_shape = (nE, d, Unit.nrow, Unit.ncol, Unit.nunit)

        print(f"writing to {name}.h5 with zstd compression...")
        data = h5.create_carray("/",
                                "data",
                                tables.Atom.from_dtype(np.dtype('uint8')),
                                shape=data_shape,
                                filters=dset_filter)
        target = h5.create_carray("/",
                                  "target",
                                  tables.Atom.from_dtype(class_dtype),
                                  shape=(nE, ),
                                  filters=dset_filter)

        glia.experiments_to_h5(experiments,
                               data,
                               target,
                               partial(get_image_class_from_stim,
                                       class_resolver=class_resolver_func),
                               append,
                               class_dtype=class_dtype)
Example #26
    def load_toHDF5(self, hdf5_file=None, verbose=-1):
        # initialize the list of features and labels
        n_samples = len(self.ids)
        filters = tables.Filters(complevel=5, complib='blosc')
        image_storage = hdf5_file.create_earray(hdf5_file.root,
                                                'imdata',
                                                tables.Float32Atom(),
                                                shape=self.image_data_shape,
                                                filters=filters,
                                                expectedrows=n_samples)

        if self.problem_type == "Classification":
            truth_storage = hdf5_file.create_earray(
                hdf5_file.root,
                'truth',
                tables.StringAtom(itemsize=15),
                shape=self.truth_data_shape,
                filters=filters,
                expectedrows=n_samples)
        elif self.problem_type == "Segmentation":
            truth_storage = hdf5_file.create_earray(
                hdf5_file.root,
                'truth',
                tables.UInt8Atom(),
                shape=self.truth_data_shape,
                filters=filters,
                expectedrows=n_samples)

        # loop over the input images
        for (i, imagePath) in enumerate(self.data_files):
            # load the image and extract the class label assuming
            # that our path has the following format:
            # /path/to/dataset/{class}/{image}.jpg
            if self.problem_type == "Classification":
                subject_name = imagePath[0].split(os.path.sep)[-2]
                if subject_name in self.ids:
                    images = self.get_images(in_files=imagePath,
                                             image_shape=self.input_shape,
                                             label_indices=len(imagePath) - 1)
                    label = imagePath[0].split(os.path.sep)[-3]
                    subject_data = [image for image in images]
                    image_storage.append(np.asarray(subject_data)[np.newaxis])
                    truth_storage.append(np.asarray(label)[np.newaxis])

            elif self.problem_type == "Segmentation":
                subject_name = imagePath[0].split(os.path.sep)[-2]

                if subject_name in self.ids:
                    images = self.get_images(in_files=imagePath,
                                             image_shape=self.input_shape,
                                             label_indices=len(imagePath) - 1,
                                             slice_number=self.slice_number)
                    subject_data = [image for image in images]
                    image_storage.append(
                        np.asarray(subject_data[:self.n_channels])[np.newaxis])
                    # DEBUG
                    #image = np.asarray(subject_data[:self.n_channels])
                    truth_storage.append(
                        np.asarray(subject_data[self.n_channels],
                                   dtype=np.uint8)[np.newaxis][np.newaxis])

            # elif self.problem_type is "Regression":
            #    image = cv2.imread(imagePath)

            # show an update every `verbose` images
            if verbose > 0 and i > 0 and (i + 1) % verbose == 0:
                print("[INFO] processed {}/{}".format(i + 1, len(self.ids)))
        return (image_storage)
Example #27

# **1. Create an HDF5 file which contains a single emails array, uncompressed. Close this file and use the getsize() function to report the size on disk.
# Time how long this took and print that out as well.**

# In[2]:

NUM = 10000000
RATE = 0.75
emails = email_array(NUM, RATE)

t = time()
with tb.open_file('uncompressed.h5', 'w') as f:
    earray = f.create_earray('/',
                             'emails',
                             tb.StringAtom(4), (0, ),
                             expectedrows=NUM)
    earray.append(emails)
tdelta = time() - t

msg = "The uncompressed array is {0} bytes and took {1} ms to write."
print(msg.format(getsize('uncompressed.h5'), tdelta * 1000))

# **2. Repeat step 1 but with zlib compression at level 1.**

# In[3]:

filters = tb.Filters(complib='zlib', complevel=1)

t = time()
with tb.open_file('zlib1.h5', 'w') as f: