def datref(datfile, outfile):
    dataset = bark.read_sampled(datfile)
    data, params = dataset.data, dataset.attrs
    outparams = params.copy()
    bark.write_sampled(outfile, data, **outparams)
    outdset = bark.read_sampled(outfile, 'r+')
    out = outdset.data
    # determine reference coefficient
    n_channels = len(params["columns"])
    coefs = np.zeros((n_channels, len(range(0, len(out), BUF))))
    power = np.zeros_like(coefs)
    for ith, i in enumerate(range(0, len(out), BUF)):
        for c in range(n_channels):
            refs = np.delete(data[i:i + BUF, :], c, axis=1)  # remove col c
            ref = np.mean(refs, axis=1)
            x = data[i:i + BUF, c]
            coefs[c, ith] = np.dot(x, ref) / np.dot(ref, ref)
    best_C = np.zeros(n_channels)
    for c in range(n_channels):
        c_coefs = coefs[c, :]
        c_power = power[c, :]
        mask = c_power >= np.percentile(c_power, 90)
        best_C[c] = np.nanmean(c_coefs[mask])
    print("best reference coefficients: {}".format(best_C))
    for i, c in enumerate(best_C):
        outparams['columns'][i]['reference_coefficient'] = float(c)
    for i in range(0, len(out), BUF):
        for c in range(n_channels):
            refs = np.delete(data[i:i + BUF, :], c, axis=1)  # remove col c
            ref = np.mean(refs, axis=1)
            x = data[i:i + BUF, c]
            out[i:i + BUF, c] = (data[i:i + BUF, c]
                                 - best_C[c] * np.median(refs, axis=1))
    bark.write_metadata(outfile, **outparams)
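# --- Illustrative sketch (not part of datref above). The per-block value
# coefs[c, ith] = np.dot(x, ref) / np.dot(ref, ref) is the ordinary
# least-squares coefficient for predicting a channel from its reference,
# i.e. the a minimizing ||x - a * ref||**2. The data below is purely
# hypothetical; the function only checks that identity against np.linalg.lstsq.
def _reference_coefficient_sketch():
    import numpy as np
    rng = np.random.default_rng(0)
    ref = rng.normal(size=1000)                  # stand-in reference trace
    x = 0.8 * ref + 0.1 * rng.normal(size=1000)  # stand-in channel data
    a = np.dot(x, ref) / np.dot(ref, ref)        # same formula as datref
    a_lstsq = np.linalg.lstsq(ref[:, None], x, rcond=None)[0][0]
    assert np.isclose(a, a_lstsq)
    return a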
def main(datfile, labelfile, outfile=None, shortcutfile=None, use_ops=True):
    if not labelfile:
        labelfile = os.path.splitext(datfile)[0] + '.csv'
    kill_shortcuts(plt)
    sampled = bark.read_sampled(datfile)
    assert len(sampled.attrs['columns']) == 1
    labels = bark.read_events(labelfile)
    labeldata = to_seconds(labels).data.to_dict('records')
    if len(labeldata) == 0:
        print('{} has no data'.format(labelfile))
        return
    shortcuts = build_shortcut_map(shortcutfile)
    opsfile = labelfile + '.ops.json'
    opstack = load_opstack(opsfile, labelfile, labeldata, use_ops)
    if not outfile:
        outfile = os.path.splitext(labelfile)[0] + '_edit.csv'
    plt.figure()
    # The oscillogram and spectrogram each get
    # three times the vertical space of the minimap.
    osc_ax = plt.subplot2grid((7, 1), (0, 0), rowspan=3)
    spec_ax = plt.subplot2grid((7, 1), (3, 0), rowspan=3, sharex=osc_ax)
    map_ax = plt.subplot2grid((7, 1), (6, 0))
    # SegmentReviewer is a context manager to ensure a save prompt
    # on exit; see SegmentReviewer.__exit__.
    with SegmentReviewer(osc_ax, spec_ax, map_ax, sampled, opstack, shortcuts,
                         outfile, labels.attrs, opsfile) as reviewer:
        reviewer.connect()
        plt.show(block=True)
def datenrich(dat, out, label_file, window):
    dataset = bark.read_sampled(dat)
    data, params = dataset.data, dataset.attrs
    rate = params["sampling_rate"]
    total_samples = data.shape[0]
    # cut out labelled segments
    label_dset = bark.read_events(label_file)
    for x in label_dset.data.itertuples():
        assert x.start > 0
        assert x.start * rate < total_samples
        assert x.stop > 0
        assert x.stop * rate < total_samples
        if x.start - window < 0:
            print('warning, cannot place a full window at beginning of data')
    segs, newlabels = get_segments(label_dset.data, window)
    # convert to samples
    segs = np.array(segs * rate, dtype=int)
    # write to new file
    with open(out, "wb") as outfp:
        for start, stop in segs:
            assert stop > 0
            assert start < total_samples
            assert start >= 0
            if stop >= total_samples:
                print('warning, cannot place a full window at end of data')
                stop = total_samples - 1
            outfp.write(data[start:stop, :].tobytes())
    bark.write_metadata(out, **params)
    bark.write_events(
        os.path.splitext(out)[0] + ".csv", newlabels, **label_dset.attrs)
def _run():
    ''' Function for getting commandline args.'''
    import argparse
    p = argparse.ArgumentParser(description='''
    Create a segment label file from a 2D categorical probability series.
    Uses method from Koumura & Okanoya 2016.
    First the most likely syllable is created. Then from these threshold
    crossings, any short gaps are annealed, and any short syllables are
    removed.
    ''')
    p.add_argument('cat', help='name of a sampled dataset')
    p.add_argument('out', help='name of output event dataset')
    p.add_argument('--min-syl',
                   help='minimum syllable length in ms, default: {}'.format(
                       default_min_syl),
                   type=int,
                   default=default_min_syl)
    p.add_argument('--min-silent',
                   help='minimum silence length in ms, default: {}'.format(
                       default_min_silent),
                   type=int,
                   default=default_min_silent)
    args = p.parse_args()
    sampled = bark.read_sampled(args.cat)
    sr = sampled.sampling_rate
    decoder = sampled.attrs['decoder']
    main(sampled.data, sr, decoder, args.out, args.min_syl, args.min_silent)
def main(dat, csv, thresh, is_std, order=default_order, min_dist=0):
    if is_std:
        std = compute_std(dat)
        threshs = thresh * std
    else:
        # make threshs a vector if it's a scalar
        n_channels = bark.read_sampled(dat).data.shape[1]
        threshs = np.ones(n_channels) * thresh
    print('thresholds:', threshs)
    s = stream.read(dat)
    pad_len = order
    with open(csv, 'w') as fp:
        fp.write('channel,start\n')
        for (channel, sample) in stream_spikes(s, threshs, pad_len, order,
                                               min_dist * s.sr):
            fp.write('{},{}\n'.format(channel, sample / s.sr))
    bark.write_metadata(csv,
                        datatype=1000,
                        columns={'channel': {'units': None},
                                 'start': {'units': 's'}},
                        thresholds=threshs,
                        order=order,
                        source=dat)
def rb_select():
    p = argparse.ArgumentParser(description='''
    Select a subset of channels from a sampled dataset
    ''')
    p.add_argument('dat', help='dat file')
    p.add_argument('-o', '--out', help='name of output datfile')
    p.add_argument('-c', '--channels',
                   help='''channels to extract,
                   zero indexed channel numbers
                   unless --col-attr is set, in which case
                   channels are metadata values''',
                   nargs='+',
                   required=True)
    p.add_argument('--col-attr',
                   help='name of column attribute to select channels with')
    args = p.parse_args()
    fname, outfname, channels, col_attr = (args.dat, args.out, args.channels,
                                           args.col_attr)
    stream = bark.read_sampled(fname).toStream()
    if col_attr:
        columns = stream.attrs['columns']
        rev_attr = {col[col_attr]: idx
                    for idx, col in columns.items()
                    if col_attr in col}  # so you can tag only some channels
        channels = [rev_attr[c] for c in channels]
    else:
        channels = [int(c) for c in channels]
    stream[channels].write(outfname)
def datref(datfile, outfile):
    shutil.copyfile(datfile, outfile)
    shutil.copyfile(datfile + '.meta.yaml', outfile + '.meta.yaml')
    outdset = bark.read_sampled(outfile, 'r+')
    out = outdset.data
    # determine reference coefficient
    n_samples, n_channels = out.shape
    coefs = np.zeros((n_channels, len(range(0, n_samples, BUF))))
    power = np.zeros_like(coefs)
    for ith, i in enumerate(range(0, n_samples, BUF)):
        total_mean = np.mean(out[i:i + BUF, :], axis=1)
        for c in range(n_channels):
            x = out[i:i + BUF, c]
            # this way we avoid re-calculating the entire mean for each channel
            ref = (total_mean * n_channels - x) / (n_channels - 1)
            coefs[c, ith] = np.dot(x, ref) / np.dot(ref, ref)
    best_C = np.zeros(n_channels)
    for c in range(n_channels):
        c_coefs = coefs[c, :]
        c_power = power[c, :]
        mask = c_power >= np.percentile(c_power, 90)
        best_C[c] = np.nanmean(c_coefs[mask])
    print("best reference coefficients: {}".format(best_C))
    for i, c in enumerate(best_C):
        outdset.attrs['columns'][i]['reference_coefficient'] = float(c)
    # we want to avoid re-calculating the median from scratch for each channel
    # unfortunately, the "new median after removing an element" calculation
    # is less succinct than for the mean
    if n_channels % 2 == 0:
        median_idx = [int(n_channels / 2) - 1, int(n_channels / 2)]
        idx_smaller = [median_idx[0] + 1]  # new median if elt removed < median
        idx_equal = [median_idx[0]]        # new median if elt removed == median
        idx_greater = [median_idx[0]]      # new median if elt removed > median
    else:
        median_idx = [int(n_channels / 2)]
        idx_smaller = [median_idx[0], median_idx[0] + 1]
        idx_equal = [median_idx[0] - 1, median_idx[0] + 1]
        idx_greater = [median_idx[0] - 1, median_idx[0]]
    for i in range(0, n_samples, BUF):
        sorted_buffer = np.sort(out[i:i + BUF, :], axis=1)
        total_medians = np.mean(sorted_buffer[:, median_idx], axis=1)
        new_med_smaller = np.mean(sorted_buffer[:, idx_smaller], axis=1)
        new_med_equal = np.mean(sorted_buffer[:, idx_equal], axis=1)
        new_med_greater = np.mean(sorted_buffer[:, idx_greater], axis=1)
        for c in range(n_channels):
            less = np.less(out[i:i + BUF, c], total_medians)
            equal = np.equal(out[i:i + BUF, c], total_medians)
            greater = np.greater(out[i:i + BUF, c], total_medians)
            out[i:i + BUF, c][less] = (out[i:i + BUF, c][less]
                                       - best_C[c] * new_med_smaller[less])
            out[i:i + BUF, c][equal] = (out[i:i + BUF, c][equal]
                                        - best_C[c] * new_med_equal[equal])
            out[i:i + BUF, c][greater] = (out[i:i + BUF, c][greater]
                                          - best_C[c] * new_med_greater[greater])
    bark.write_metadata(outfile, **outdset.attrs)
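# --- Illustrative check (not part of datref above). The leave-one-out median
# index arithmetic used in datref can be validated against np.median applied
# to an explicitly deleted channel. The channel count and data below are
# hypothetical; the even/odd index rules mirror the ones above.
def _leave_one_out_median_check(n_channels=16, seed=1):
    import numpy as np
    rng = np.random.default_rng(seed)
    row = rng.normal(size=n_channels)  # one sample across all channels
    s = np.sort(row)
    if n_channels % 2 == 0:
        median_idx = [n_channels // 2 - 1, n_channels // 2]
        idx_smaller = [median_idx[0] + 1]
        idx_equal = [median_idx[0]]
        idx_greater = [median_idx[0]]
    else:
        median_idx = [n_channels // 2]
        idx_smaller = [median_idx[0], median_idx[0] + 1]
        idx_equal = [median_idx[0] - 1, median_idx[0] + 1]
        idx_greater = [median_idx[0] - 1, median_idx[0]]
    total_median = np.mean(s[median_idx])
    for c in range(n_channels):
        if row[c] < total_median:
            fast = np.mean(s[idx_smaller])
        elif row[c] > total_median:
            fast = np.mean(s[idx_greater])
        else:
            fast = np.mean(s[idx_equal])
        # the index shortcut agrees with recomputing the median from scratch
        assert np.isclose(fast, np.median(np.delete(row, c)))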
def _helper_test(fname_wav, fname_dat, dir_path, data):
    rate = 48000
    # 1. Create wav file
    wavfile.write(fname_wav, rate, data)
    assert os.path.exists(fname_wav), 'File Not Found: test.wav'
    # 2. Generate .dat and .dat.meta.yaml files
    attrs = {"name": "hello bark", "project": "bark"}
    dat_file = dfw.dat_from_wav(fname_wav, fname_dat, **attrs)
    assert os.path.exists(fname_dat), 'File Not Found: test.dat'
    assert os.path.exists(os.path.join(dir_path, 'test.dat.meta.yaml')), \
        'File Not Found: test.dat.meta.yaml'
    # 3. Compare data, dtype, and rate in the .dat file
    assert np.array_equal(data, bark.read_sampled(fname_dat).data), \
        'Data in .wav and .dat files does not match'
    assert data.dtype == bark.read_sampled(fname_dat).data.dtype, \
        'dtypes do not match'
    assert rate == bark.read_sampled(fname_dat).sampling_rate, \
        'Sampling rates do not match'
    assert 'hello bark' == bark.read_sampled(fname_dat).attrs['name'], \
        'name attribute does not match'
    assert 'bark' == bark.read_sampled(fname_dat).attrs['project'], \
        'project attribute does not match'
def read(fname, chunksize=2e6, **kwargs):
    """ input: the filename of a raw binary file
        should have an associated meta file
        returns a Stream
    """
    bark_obj = bark.read_sampled(fname)
    data = bark_obj.data
    sr = bark_obj.attrs["sampling_rate"]
    kwargs.update(bark_obj.attrs)
    return Stream(data, sr=sr, chunksize=chunksize, attrs=kwargs)
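# --- Usage sketch (hypothetical filename). The Stream returned by read() is
# consumed chunk by chunk, as datchunk does, and exposes the sampling rate as
# '.sr', as the spike-detection code does; no new API is assumed here.
def _stream_read_example():
    s = read('probe.dat', chunksize=1e6)  # 'probe.dat' is a placeholder
    print('sampling rate:', s.sr)
    n_samples = 0
    for chunk in s:  # each chunk is a (samples, channels) numpy array
        n_samples += chunk.shape[0]
    print('total samples streamed:', n_samples)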
def main(datfile, trigfile, outfile, wavfiles):
    common_sr = 22050  # everything is resampled to this
    # get wav envelopes
    stim_names, stim_envs = wav_envelopes(wavfiles, common_sr)
    mic_dset = bark.read_sampled(datfile)
    mic_sr = mic_dset.sampling_rate
    starts = bark.read_events(trigfile).data.start
    # get most likely stimulus for each trigger time
    labels = classify_stimuli(mic_dset.data, mic_sr, starts,
                              stim_names, stim_envs, common_sr)
    stops = get_stops(labels, starts, stim_names, stim_envs, common_sr)
    write(outfile, starts, stops, labels)
def test_read_sampled(tmpdir):
    test_write_sampled(tmpdir)  # create 'test_sampled'
    path = os.path.join(tmpdir.strpath, "test_sampled")
    assert os.path.exists(path)
    assert os.path.exists(path + ".meta.yaml")
    dset = bark.read_sampled(path)
    assert isinstance(dset, bark.SampledData)
    assert isinstance(dset.path, str)
    assert isinstance(dset.attrs, dict)
    assert isinstance(dset.data, np.memmap)
    assert np.allclose(np.zeros((10, 3)), dset.data)
    assert np.allclose(dset.data.shape, (10, 3))
def test_write(tmpdir):
    fname = os.path.join(tmpdir.strpath, "mydat")
    columns = bark.sampled_columns(data1)
    attrs = dict(sampling_rate=100, columns=columns, fluffy="cat")
    a = Stream(data1, attrs=attrs)
    a.write(fname)
    sdataset = bark.read_sampled(fname)
    sdata = sdataset.data
    assert eq(data1, sdata)
    sattrs = sdataset.attrs
    for key in attrs:
        assert attrs[key] == sattrs[key]
def getfiles():
    file = FileDialog()
    files = file.openFileNamesDialog()
    if not files:
        sys.exit(app.exec_())
    sampled = [bark.read_sampled(f) for f in files]
    readonlylabelfile = file.openFileNameDialog()
    if not readonlylabelfile:
        import pandas as pd
        origin_labels = pd.DataFrame()
    else:
        origin_labels = bark.read_events(readonlylabelfile).data
    return files, sampled, origin_labels
def test_datchunk(tmpdir):
    from bark.tools import barkutils
    CHUNK = 350
    TOTAL_SIZE = 1000
    data = np.array([range(TOTAL_SIZE),
                     range(TOTAL_SIZE, 2 * TOTAL_SIZE)]).transpose()
    params = dict(sampling_rate=30000,
                  units="mV",
                  unit_scale=0.025,
                  extra="barley")
    dset = bark.write_sampled(os.path.join(tmpdir.strpath, "test.dat"),
                              data=data,
                              **params)
    barkutils.datchunk(dset.path, CHUNK, use_seconds=False, one_cut=True)
    first_fn = os.path.join(tmpdir.strpath, "test-chunk-0.dat")
    second_fn = os.path.join(tmpdir.strpath, "test-chunk-1.dat")
    assert os.path.exists(first_fn)
    assert os.path.exists(second_fn)
    first = bark.read_sampled(first_fn)
    second = bark.read_sampled(second_fn)
    assert (first.data == dset.data[:CHUNK, :]).all()
    assert first.attrs.pop('offset') == 0
    assert first.attrs == dset.attrs
    assert (second.data == dset.data[CHUNK:TOTAL_SIZE, :]).all()
    assert second.attrs.pop('offset') == CHUNK
    assert second.attrs == dset.attrs
    del first, second
    os.remove(first_fn)
    os.remove(second_fn)
    assert not os.path.exists(first_fn)
    assert not os.path.exists(second_fn)
    barkutils.datchunk(dset.path, CHUNK, use_seconds=False, one_cut=False)
    third_fn = os.path.join(tmpdir.strpath, "test-chunk-2.dat")
    assert os.path.exists(first_fn)
    assert os.path.exists(second_fn)
    assert os.path.exists(third_fn)
    first = bark.read_sampled(first_fn)
    second = bark.read_sampled(second_fn)
    third = bark.read_sampled(third_fn)
    assert (first.data == dset.data[:CHUNK, :]).all()
    assert (second.data == dset.data[CHUNK:2 * CHUNK, :]).all()
    assert (third.data == dset.data[2 * CHUNK:, :]).all()
def rb_to_wave_clus():
    import argparse
    p = argparse.ArgumentParser(prog="dat2wave_clus",
                                description="""
    Converts a raw binary file to a wave_clus compatible matlab file
    """)
    p.add_argument("dat", help="dat file")
    p.add_argument("-o", "--out", help="name of output .mat file")
    opt = p.parse_args()
    from scipy.io import savemat
    dataset = bark.read_sampled(opt.dat)
    savemat(opt.out,
            {'data': dataset.data.T,
             'sr': dataset.attrs['sampling_rate']},
            appendmat=False)
def main(datfile):
    kill_shortcuts(plt)
    sampled = bark.read_sampled(datfile)
    # assert len(sampled.attrs['columns']) == 1
    plt.figure()
    # The oscillogram and spectrogram each get
    # three times the vertical space of the minimap.
    osc_ax = plt.subplot2grid((7, 1), (0, 0), rowspan=3)
    spec_ax = plt.subplot2grid((7, 1), (3, 0), rowspan=3, sharex=osc_ax)
    map_ax = plt.subplot2grid((7, 1), (6, 0), rowspan=1)
    # SegmentReviewer is a context manager to ensure a save prompt
    # on exit; see SegmentReviewer.__exit__.
    with SegmentReviewer(osc_ax, spec_ax, map_ax, sampled) as reviewer:
        reviewer.connect()
        plt.show(block=True)
def read_files(bird_dir, load_events):
    ''' bird_dir: location of data
        load_events: if true, also load matching csvs

        Reads raw files for testing and training.
        Returns a list of sampled datasets and a list of event datasets.
    '''
    data_files = glob(join(bird_dir, "*.dat"))
    print('number of files: ', len(data_files))
    sampled_dsets = [bark.read_sampled(dfile) for dfile in data_files]
    if not load_events:
        return sampled_dsets
    target_files = [splitext(x)[0] + ".csv" for x in data_files]
    event_dsets = [bark.read_events(tfile) for tfile in target_files]
    return sampled_dsets, event_dsets
def datchunk(dat, stride, use_seconds, one_cut):
    def write_chunk(chunk, attrs, i):
        filename = "{}-chunk-{}.dat".format(basename, i)
        attrs['offset'] = stride * i
        bark.write_sampled(filename, chunk, **attrs)

    attrs = bark.read_metadata(dat)
    if use_seconds:
        stride = stride * attrs['sampling_rate']
    stride = int(stride)
    basename = os.path.splitext(dat)[0]
    if one_cut:
        sds = bark.read_sampled(dat)
        write_chunk(sds.data[:stride, :], attrs, 0)
        write_chunk(sds.data[stride:, :], attrs, 1)
    else:
        for i, chunk in enumerate(stream.read(dat, chunksize=stride)):
            write_chunk(chunk, attrs, i)
def _main():
    args = _parse_args(sys.argv[1:])
    spike_ds = bark.read_events(args.spikes)
    stim_time_ds = bark.read_events(args.stimtimes)
    if args.stim:
        if os.path.splitext(args.stim)[-1] == '.wav':
            sr, stim = scipy.io.wavfile.read(args.stim)
            stimulus = Stimulus(args.name, stim, sr)
        else:
            stim = bark.read_sampled(args.stim)
            stimulus = Stimulus(args.name, stim.data, stim.sampling_rate)
    else:
        stimulus = None
    title_str = '"{}"-aligned spike raster, unit {}'
    fn_str = '{}_aligned_raster_unit_{}.{}'
    for unit in set(spike_ds['name']):
        f = aligned_raster(spike_ds[spike_ds['name'] == unit]['start'],
                           stim_time_ds,
                           args.name,
                           padding=(args.bef, args.aft),
                           title=title_str.format(args.name, unit),
                           stim_data=stimulus)
        f.savefig(fn_str.format(args.name, unit, args.ext))
        plt.close(f)
def datartifact(datfile, outfile, std_lim):
    assert datfile != outfile
    copyfile(datfile, outfile)
    copyfile(datfile + ".meta.yaml", outfile + ".meta.yaml")
    dataset = read_sampled(datfile)
    data, params = dataset.data, dataset.attrs
    out_dataset = read_sampled(outfile, mode="r+")
    out, outparams = out_dataset.data, dataset.attrs
    n_channels = len(params["columns"])
    # compute standard deviation
    stds = np.std(data[0:BUF * 50], axis=0)
    print("standard deviations: {}".format(stds))
    # find locations of artifacts
    pos_artifacts = [[] for x in range(n_channels)]
    neg_artifacts = [[] for x in range(n_channels)]
    assert len(stds) == n_channels
    for i in range(0, len(out), BUF):
        for c in range(n_channels):
            x = data[i:i + BUF, c].copy().flatten()
            x[x < stds[c] * std_lim] = 0
            peaks, = argrelmax(x)
            pos_artifacts[c] += [int(pe) + i for pe in peaks]
            x = data[i:i + BUF, c].copy().flatten()
            x[x > -stds[c] * std_lim] = 0
            peaks, = argrelmin(x)
            neg_artifacts[c] = neg_artifacts[c] + [int(pe) + i for pe in peaks]
    # remove artifacts
    print("{}\t negative artifacts".format([len(x) for x in neg_artifacts]))
    print("locations: {}".format(
        [np.array(x) / params["sampling_rate"] for x in neg_artifacts]))
    print("{}\t positive artifacts".format([len(x) for x in pos_artifacts]))
    print("locations: {}".format(
        [np.array(x) / params["sampling_rate"] for x in pos_artifacts]))
    for c in range(n_channels):
        print([data[x, c] / stds[c] for x in pos_artifacts[c]])
        print([data[x, c] / stds[c] for x in neg_artifacts[c]])
    make_artifact_plots(data, outfile, pos_artifacts, neg_artifacts, stds)
    for chan in range(n_channels):
        for samp in pos_artifacts[chan]:
            out[samp, chan] = 0
            t = samp + 1
            while out[t, chan] > stds[chan]:
                out[t, chan] = 0
                t += 1
            t = samp - 1
            while out[t, chan] > stds[chan]:
                out[t, chan] = 0
                t -= 1
        for samp in neg_artifacts[chan]:
            out[samp, chan] = 0
            t = samp + 1
            while out[t, chan] < -stds[chan]:
                out[t, chan] = 0
                t += 1
            t = samp - 1
            while out[t, chan] < -stds[chan]:
                out[t, chan] = 0
                t -= 1
def readfiles(outfile=None, shortcutfile=None, use_ops=True):
    """Read all files from the FileDialog and create any that are missing.

    If no .dat files are selected, exit. Automatically finds a label file
    named '[dat_name]_split.csv'; if it does not exist, creates a new one
    with a customized label and a .meta file, then creates the opstack and
    outfiles.

    Returns:
        origin_labels, trace_num, channelname, gap, sampled, opstack,
        shortcuts, outfile, labels.attrs, opsfile
    """
    gap = 0
    file = FileDialog()
    files = file.openFileNamesDialog()
    if not files:
        sys.exit(app.exec_())
    files.reverse()
    sampled = [bark.read_sampled(f) for f in files]
    readonlylabelfile = file.openFileNameDialog()
    if not readonlylabelfile:
        import pandas as pd
        origin_labels = pd.DataFrame()
    else:
        origin_labels = bark.read_events(readonlylabelfile).data
    trace_num = len(files)
    dat = files[0]
    labelfile = os.path.splitext(dat)[0] + '_split.csv'
    exist = os.path.exists(labelfile)
    kill_shortcuts(plt)
    opsfile = labelfile + '.ops.json'
    metadata = labelfile + '.meta.yaml'
    if not os.path.exists(labelfile):
        write_metadata(labelfile)
    if not os.path.exists(labelfile):
        showDia = Input()
        gap = int(showDia.showDialog())
        start = 0
        end = int(
            round(len(sampled[0].data) / sampled[0].attrs["sampling_rate"]))
        trace_num = len(sampled)
        createlabel(labelfile, start, end, gap)
    labels = bark.read_events(labelfile)
    labeldata = to_seconds(labels).data.to_dict('records')
    if len(labeldata) == 0:
        print('{} contains no intervals.'.format(labelfile))
        return
    opstack = load_opstack(opsfile, labelfile, labeldata, use_ops)
    if not gap:
        if len(opstack.events) == 0:
            print('opstack is empty. Please delete {}.'.format(opsfile))
            return
        gap = opstack.events[0]['stop'] - opstack.events[0]['start']
    shortcuts = build_shortcut_map(shortcutfile)
    # create a new outfile
    if not outfile:
        outfile = os.path.splitext(labelfile)[0] + '_edit.csv'
    channelname = []
    import re
    for name in files:
        searchObj = re.search(r'(.*)/(.*).dat', name, re.M | re.I)
        channelname.append(searchObj.group(2))
    return (origin_labels, trace_num, channelname, gap, sampled, opstack,
            shortcuts, outfile, labels.attrs, opsfile)