def append_duplicated_spikes(data_dir, output_dir, groupnum, idxs,
        n_samples=24):
    """Appends a fake neuron of duplicated spikes.

    This is useful for testing whether some of the spikes are all in one
    part of the cluster, which might suggest drift or bad clustering.

    data_dir : klusters directory of original data (will not be modified)
    output_dir : klusters directory containing a copy of the original data
        (THIS ONE WILL BE MODIFIED!) Copy all clu, fet, res, etc. files
        over to the new directory first.
    groupnum : tetrode number, i.e., extension of the klusters files to modify
    idxs : indexes of spikes to duplicate as a new cluster
        This function doesn't know which unit you are trying to clone (if
        any), so the indexes should be indexes into ALL of the spikes from
        the group.

    It will extract the times, features, and waveforms of the indexed spikes,
    then append them to the end of the same files in output_dir. The new
    cluster has an ID one greater than the previous max.
    """
    # find files
    kfs1 = KKFileSchema.coerce(data_dir)
    kfs2 = KKFileSchema.coerce(output_dir)

    # Duplicate clu: the duplicated spikes become a new cluster whose ID
    # is one greater than the current maximum
    clu = kkpandas.kkio.read_clufile(kfs1.clufiles[groupnum])
    newclunum = clu.max() + 1
    newclu = pandas.concat(
        [clu, pandas.Series(newclunum * np.ones(len(idxs)), dtype=np.int)],
        ignore_index=True)
    kkpandas.kkio.write_clufile(newclu, kfs2.clufiles[groupnum])

    # Duplicate res
    res = kkpandas.kkio.read_resfile(kfs1.resfiles[groupnum])
    newres = pandas.concat([res, res.ix[idxs]], ignore_index=True)
    kkpandas.kkio.write_resfile(newres, kfs2.resfiles[groupnum])

    # Duplicate fet
    fet = kkpandas.kkio.read_fetfile(kfs1.fetfiles[groupnum])
    newfet = pandas.concat([fet, fet.ix[idxs]], ignore_index=True)
    kkpandas.kkio.write_fetfile(newfet, kfs2.fetfiles[groupnum])

    # Duplicate spk, passing the n_samples argument through rather than a
    # hardcoded value so non-default spike lengths work
    spk = kkpandas.kkio.read_spkfile(kfs1.spkfiles[groupnum],
        n_samples=n_samples, n_spikes=fet.shape[0])
    newspk = np.concatenate([spk, spk[idxs, :]], axis=0)
    kkpandas.kkio.write_spkfile(newspk, kfs2.spkfiles[groupnum])
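# A minimal usage sketch for append_duplicated_spikes, assuming a klusters
# directory './sorted' whose clu/fet/res/spk files have already been copied
# to './sorted_test'. The paths, group number, and spike indexes here are
# hypothetical, for illustration only.
def _example_append_duplicated_spikes():
    # Clone the first 100 spikes of tetrode group 1 as a fake cluster, then
    # inspect it in klusters to see where those spikes fall in feature space
    idxs = np.arange(100)
    append_duplicated_spikes('./sorted', './sorted_test', groupnum=1,
        idxs=idxs)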
def load_spikes(data_dir, group, samp_rate, n_samp, n_chan):
    '''Loads the feature, cluster, and spike files in KlustaKwik format and
    pulls out the features, spike times, and spike waveforms for each cluster.

    Arguments
    ---------
    data_dir : path to the directory with the KlustaKwik files
    group : the group number you want to load
    samp_rate : the sampling rate of the recording in samples per second
    n_samp : number of samples for each stored spike in the spike file
    n_chan : number of channels stored in the spike file

    Returns
    -------
    out : dict of numpy structured arrays
        A dictionary of the clusters. The keys are the cluster numbers.
        The values are numpy structured arrays with fields 'times',
        'waveforms', and 'pca' which give the timestamp, tetrode waveform,
        and PCA-reduced values, respectively, for each spike in the cluster.
    '''
    from KKFileSchema import KKFileSchema
    import kkio

    # Get the clustered data from KlustaKwik files
    kfs = KKFileSchema.coerce(data_dir)

    # Get the spike features, time stamps, cluster labels, and waveforms
    feat = kkio.read_fetfile(kfs.fetfiles[group])
    features = feat.values[:, :-1]
    time_stamps = feat.time.values
    cluster_labels = kkio.read_clufile(kfs.clufiles[group])
    spikes = kkio.read_spkfile(kfs.spkfiles[group])

    # Reshape the flat spike samples into one row per spike, then reorder
    # each row from sample-major to channel-major
    spikes = spikes.reshape((len(spikes) / (n_chan * n_samp),
        (n_chan * n_samp)))
    for ii, spike in enumerate(spikes):
        spikes[ii] = spike.reshape((n_chan, n_samp),
            order='F').reshape(n_chan * n_samp)

    # Convert integer samples to voltage
    INT_TO_VOLT = 4096.0 / 2.0 ** 15  # uV per bit
    spikes = spikes * INT_TO_VOLT

    # Group the spike indices by cluster id
    cluster_ids = np.unique(cluster_labels.values)
    cluster_indices = {cid: np.where(cluster_labels.values == cid)[0]
        for cid in cluster_ids}

    # Build one structured array per cluster, sorted by spike time
    clusters = dict.fromkeys(cluster_ids)
    dtypes = [('times', 'f8'), ('waveforms', 'f8', n_chan * n_samp),
        ('pca', 'f8', len(features[0]))]
    for cid, indices in cluster_indices.iteritems():
        clusters[cid] = np.zeros(len(indices), dtype=dtypes)
        clusters[cid]['times'] = time_stamps[indices] / np.float(samp_rate)
        clusters[cid]['waveforms'] = spikes[indices]
        clusters[cid]['pca'] = features[indices]
        clusters[cid].sort(order=['times'])
    return clusters
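# A usage sketch for the structured-array version of load_spikes, assuming a
# hypothetical directory './sorted' recorded at 30 kHz with 24 samples per
# spike on 4 channels. Field access on the returned arrays is plain numpy.
def _example_load_spikes_structured():
    clusters = load_spikes('./sorted', group=1, samp_rate=30000.,
        n_samp=24, n_chan=4)
    for cid, cluster in clusters.iteritems():
        # Each value is a structured array, already sorted by spike time
        first_spike_s = cluster['times'][0]
        mean_waveform = cluster['waveforms'].mean(axis=0)
        n_features = cluster['pca'].shape[1]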
def flush(kfs_or_path, verbose=False):
    """Remove any memoized file (basename.kkp) from the directory."""
    # Coerce to file schema
    kfs = KKFileSchema.coerce(kfs_or_path)

    # Find the memoized file
    to_delete = kfs.basename + '.kkp'

    # Delete it if it exists
    if os.path.exists(to_delete):
        if verbose:
            print "deleting", to_delete
        os.remove(to_delete)
    else:
        if verbose:
            print "no memoized files to delete"
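# Usage sketch: drop any stale memoized kkp file before reloading with fresh
# parameters, assuming a hypothetical klusters directory './sorted'.
def _example_flush():
    flush('./sorted', verbose=True)
    data = from_KK('./sorted', save_memoized=True)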
def read_all_from_group(basename='.', group=1, n_samples=-1, n_spikes=-1,
        n_channels=-1):
    """Read the res, clu, fet, and spk files for one group.

    Returns a dict with keys 'res', 'clu', 'fet', and 'spk' containing the
    loaded spike times, cluster labels, features, and waveforms.
    """
    d = {}
    kfs = KKFileSchema.coerce(basename)

    res = read_resfile(kfs.resfiles[group])
    d['res'] = res
    clu = read_clufile(kfs.clufiles[group])
    d['clu'] = clu
    fet = read_fetfile(kfs.fetfiles[group])
    d['fet'] = fet

    # If the spike count wasn't specified, infer it from the res file
    if n_spikes == -1:
        n_spikes = len(res)
    spk = read_spkfile(kfs.spkfiles[group], n_spikes=n_spikes,
        n_channels=n_channels, n_samples=n_samples)
    d['spk'] = spk

    return d
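# Usage sketch for read_all_from_group, assuming a hypothetical directory
# './sorted' with 4-channel spk files. n_spikes is inferred from the res
# file, so only the channel count (or sample count) need be given.
def _example_read_all_from_group():
    d = read_all_from_group('./sorted', group=1, n_channels=4)
    res, clu, fet, spk = d['res'], d['clu'], d['fet'], d['spk']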
def load_spiketimes(kfs_or_path, group, fs=None):
    """Given KKFileSchema or path to one, load spike times from group

    Returns Series
    """
    kfs = KKFileSchema.coerce(kfs_or_path)

    # check if res files exist, which are faster to load
    if 'res' in kfs.available_filetypes:
        spiketimes = read_resfile(kfs.resfiles[group])
    elif 'fet' in kfs.available_filetypes:
        spiketimes = read_fetfile(kfs.fetfiles[group])[SPIKE_TIME_COLUMN_NAME]
    else:
        raise ValueError("no available method to grab spike times")

    # optionally convert to seconds
    if fs:
        spiketimes = spiketimes / float(fs)

    return spiketimes
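# Usage sketch for load_spiketimes, assuming a hypothetical directory
# './sorted' sampled at 30 kHz. Passing fs converts the integer sample
# indices to seconds.
def _example_load_spiketimes():
    samples = load_spiketimes('./sorted', group=1)             # in samples
    seconds = load_spiketimes('./sorted', group=1, fs=30000.)  # in seconds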
def from_KK(basename='.', groups_to_get=None, group_multiplier=None, fs=None,
        verify_unique_clusters=True, add_group_as_column=True,
        load_memoized=False, save_memoized=False,
        also_get_features=False, also_get_waveforms=False,
        n_samples=-1, n_channels=-1):
    """Main function for loading KlustaKwik data.

    basename : path to, or basename of, files
    groups_to_get : int or list of groups to get, otherwise get all groups
    group_multiplier : if None, the cluster ids are used as-is
        if int, then the group number times this multiplier is added to
        the cluster id. This is useful if groups contain the same cluster
        ids but you want them to have unique labels.
    fs : if None, the times are returned as integer numbers of samples
        otherwise, they are divided by this number
    verify_unique_clusters : if True, check that there are no overlapping
        cluster ids across groups
    add_group_as_column : if True, then the returned value has a column
        for the group from which the spike came.
    also_get_features, also_get_waveforms : if True, then the returned
        value has columns for these as well.
    n_samples, n_channels : only necessary if also_get_waveforms. Only one
        of these two parameters is necessary in that case.

    Memoization
    ---
    Loading is faster with the binary pandas save and load functions than
    with the ASCII KlustaKwik format. For this reason you can specify that
    the data be saved as a pandas file, or loaded from a pandas file.

    These options now default to False because of the potential for
    accidental misuse: no checking is done whether the current parameters
    are the same as the ones in effect when the memoization was done.

    load_memoized : if a file like basename.kkp exists, load this DataFrame
        and return. Note that all other parameters (except basename) are
        ignored.
    save_memoized : the data will be written to a file like basename.kkp
        after loading.

    Returns: DataFrame with columns 'unit', 'time', and optionally 'group'
    """
    memoized_filename = None  # to be determined later, if necessary

    # load files like basename
    try:
        kfs = KKFileSchema.coerce(basename)
    except ValueError:
        # This occurs when no spike files are found, but there might still
        # be kkp files, so fall back on loading the memoized data
        load_memoized = True
        memoized_filename = glob.glob(os.path.join(basename, '*.kkp'))[0]

    # try to load memoized
    if load_memoized:
        if memoized_filename is None:
            memoized_filename = kfs.basename + '.kkp'
        try:
            data = pandas.load(memoized_filename)
            return_early = True
        except IOError:
            return_early = False
        if return_early:
            return data

    # which groups to get
    if groups_to_get:
        if not hasattr(groups_to_get, '__len__'):
            groups_to_get = [groups_to_get]
    else:
        groups_to_get = kfs.groups

    # get each group
    group_d = {}
    for group in groups_to_get:
        spiketimes = load_spiketimes(kfs, group, fs)
        if 'clu' in kfs.available_filetypes:
            unit_ids = read_clufile(kfs.clufiles[group])
        else:
            # No clu file, so label every spike with its group number.
            # Use a named Series ('unit', matching read_clufile) so that
            # unit_ids.name works in the DataFrame construction below.
            unit_ids = pandas.Series(
                np.ones(len(spiketimes), dtype=np.int) * group, name='unit')
        if group_multiplier:
            unit_ids += group_multiplier * group

        # concatenate into data frame and add to dict
        if add_group_as_column:
            group_d[group] = pandas.DataFrame({
                spiketimes.name: spiketimes,
                unit_ids.name: unit_ids,
                'group': np.ones(len(spiketimes), dtype=np.int) * group})
        else:
            group_d[group] = pandas.DataFrame({
                spiketimes.name: spiketimes,
                unit_ids.name: unit_ids})

        # optionally get features too
        if also_get_features:
            assert 'fet' in kfs.available_filetypes

            # Read the feature file
            fetfile = kfs.fetfiles[group]
            features = read_fetfile(fetfile, guess_time_column=True,
                return_nfeatures=False)

            # Pop off the time column since we don't need it
            features.pop('time')

            # Concatenate to df for this group
            assert len(features) == len(group_d[group])
            group_d[group] = pandas.concat([group_d[group], features],
                axis=1)

        # optionally get waveforms too
        if also_get_waveforms:
            assert 'spk' in kfs.available_filetypes

            # Read the spike file. We know the number of spikes, but we
            # need either the number of samples or the number of channels.
            spkfile = kfs.spkfiles[group]
            waveforms = read_spkfile(spkfile, n_spikes=len(group_d[group]),
                n_samples=n_samples, n_channels=n_channels)

            # Flatten, convert to dataframe, and concatenate to result
            nsamptot = waveforms.shape[1] * waveforms.shape[2]
            waveforms_df = pandas.DataFrame(
                waveforms.swapaxes(1, 2).reshape(
                    waveforms.shape[0], nsamptot),
                columns=['wf%d' % n for n in range(nsamptot)])
            group_d[group] = pandas.concat([group_d[group], waveforms_df],
                axis=1)

    # optionally check that no cluster id appears in more than one group
    if verify_unique_clusters:
        clusters_by_group = [
            set(np.unique(np.asarray(groupdata.unit)))
            for groupdata in group_d.values()]

        if len(clusters_by_group) > 0:
            # find number of unique clusters
            # will error here if no clusters found
            n_unique_clusters = len(set.union(*clusters_by_group))
            n_total_clusters = sum([len(g) for g in clusters_by_group])
            if n_unique_clusters != n_total_clusters:
                raise ValueError("got %d overlapping clusters" %
                    (n_total_clusters - n_unique_clusters))

    # turn the per-group frames into one giant dataframe for everybody
    sorted_keys = sorted(group_d.keys())
    data = pandas.concat([group_d[key] for key in sorted_keys],
        ignore_index=True)

    if save_memoized:
        # memoized_filename is still None if load_memoized was False
        if memoized_filename is None:
            memoized_filename = kfs.basename + '.kkp'
        data.save(memoized_filename)

    return data
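# Usage sketch for from_KK, assuming a hypothetical directory './sorted'
# with two tetrode groups sampled at 30 kHz. group_multiplier=100 keeps
# cluster ids unique across groups (e.g., cluster 2 of group 1 becomes 102).
def _example_from_KK():
    data = from_KK('./sorted', fs=30000., group_multiplier=100,
        save_memoized=True)
    # data is a DataFrame with columns 'unit', 'time', and 'group';
    # subsequent calls can pass load_memoized=True to skip the ASCII parse
    spike_times_of_unit = data[data.unit == 102].time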
def load_spikes(data_dir, group, samp_rate, n_samp, n_chan):
    '''Loads the feature, cluster, and spike files in KlustaKwik format and
    pulls out the features, spike times, and spike waveforms for each cluster.

    Parameters
    ----------
    data_dir : path to the directory with the KlustaKwik files
    group : the group number you want to load
    samp_rate : the sampling rate of the recording in samples per second
    n_samp : number of samples for each stored spike in the spike file
    n_chan : number of channels stored in the spike file

    Returns
    -------
    out : dict
        out['features'] : dictionary of clustered features
        out['times'] : dictionary of clustered spike times
        out['waveforms'] : dictionary of clustered spike waveforms
    '''
    # Get the clustered data from KlustaKwik files
    kfs = KKFileSchema.coerce(data_dir)

    # Get the features and spike time stamps
    feat = kkio.read_fetfile(kfs.fetfiles[group])
    features = feat.values[:, :-1]
    time_stamps = feat.time.values

    # Get spike cluster labels
    clu = kkio.read_clufile(kfs.clufiles[group])

    # Get the spike waveforms
    spikes = kkio.read_spkfile(kfs.spkfiles[group])

    # Reshape the spike waveforms into a useful form: one row per spike,
    # reordered from sample-major to channel-major
    spikes = spikes.reshape((len(spikes) / (n_chan * n_samp),
        (n_chan * n_samp)))
    for ii, spike in enumerate(spikes):
        spikes[ii] = spike.reshape((n_chan, n_samp),
            order='F').reshape(n_chan * n_samp)

    # Convert spike waveforms into voltage
    spikes = spikes * (8192.0 / 2. ** 16)

    # Cluster numbers
    cluster_nums = np.unique(clu.values)

    # Group the indices by cluster
    cluster_ind = [np.nonzero(clu.values == n)[0] for n in cluster_nums]

    # Get the spike times, features, and waveforms for each cluster
    times = [time_stamps[ind] / np.float(samp_rate) for ind in cluster_ind]
    feats = [features[ind] for ind in cluster_ind]
    spks = [spikes[ind] for ind in cluster_ind]

    # Make dictionaries keyed on the cluster number whose values are the
    # spike times, features, and waveforms in that cluster, respectively
    clustered_times = dict(zip(cluster_nums, times))
    clustered_features = dict(zip(cluster_nums, feats))
    clustered_waveforms = dict(zip(cluster_nums, spks))

    # Make sure the spike times for each cluster are sorted correctly
    for cluster_times in clustered_times.itervalues():
        cluster_times.sort()

    out_dict = {'features': clustered_features,
        'times': clustered_times,
        'waveforms': clustered_waveforms}
    return out_dict
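# Usage sketch for the dict-of-dicts version of load_spikes, under the same
# hypothetical recording parameters as above (30 kHz, 24 samples, 4 channels).
def _example_load_spikes_dicts():
    out = load_spikes('./sorted', group=1, samp_rate=30000., n_samp=24,
        n_chan=4)
    for cid in out['times']:
        times = out['times'][cid]          # sorted spike times in seconds
        waveforms = out['waveforms'][cid]  # one flattened waveform per row
        features = out['features'][cid]    # feature values per spike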
def from_KK(basename='.', groups_to_get=None, group_multiplier=None, fs=None,
        verify_unique_clusters=True, add_group_as_column=True,
        load_memoized=False, save_memoized=False):
    """Main function for loading KlustaKwik data.

    basename : path to, or basename of, files
    groups_to_get : int or list of groups to get, otherwise get all groups
    group_multiplier : if None, the cluster ids are used as-is
        if int, then the group number times this multiplier is added to
        the cluster id. This is useful if groups contain the same cluster
        ids but you want them to have unique labels.
    fs : if None, the times are returned as integer numbers of samples
        otherwise, they are divided by this number
    verify_unique_clusters : if True, check that there are no overlapping
        cluster ids across groups
    add_group_as_column : if True, then the returned value has a column
        for the group from which the spike came.

    Memoization
    ---
    Loading is faster with the binary pandas save and load functions than
    with the ASCII KlustaKwik format. For this reason you can specify that
    the data be saved as a pandas file, or loaded from a pandas file.

    These options now default to False because of the potential for
    accidental misuse: no checking is done whether the current parameters
    are the same as the ones in effect when the memoization was done.

    load_memoized : if a file like basename.kkp exists, load this DataFrame
        and return. Note that all other parameters (except basename) are
        ignored.
    save_memoized : the data will be written to a file like basename.kkp
        after loading.

    Returns: DataFrame with columns 'unit', 'time', and optionally 'group'
    """
    # load files like basename
    kfs = KKFileSchema.coerce(basename)

    # try to load memoized
    memoized_filename = kfs.basename + '.kkp'
    if load_memoized:
        try:
            data = pandas.load(memoized_filename)
            return_early = True
        except IOError:
            return_early = False
        if return_early:
            return data

    # which groups to get
    if groups_to_get:
        if not hasattr(groups_to_get, '__len__'):
            groups_to_get = [groups_to_get]
    else:
        groups_to_get = kfs.groups

    # get each group
    group_d = {}
    for group in groups_to_get:
        spiketimes = load_spiketimes(kfs, group, fs)
        if 'clu' in kfs.available_filetypes:
            unit_ids = read_clufile(kfs.clufiles[group])
        else:
            # No clu file, so label every spike with its group number.
            # Use a named Series ('unit', matching read_clufile) so that
            # unit_ids.name works in the DataFrame construction below.
            unit_ids = pandas.Series(
                np.ones(len(spiketimes), dtype=np.int) * group, name='unit')
        if group_multiplier:
            unit_ids += group_multiplier * group

        # concatenate into data frame and add to dict
        if add_group_as_column:
            group_d[group] = pandas.DataFrame(
                {spiketimes.name: spiketimes,
                unit_ids.name: unit_ids,
                'group': np.ones(len(spiketimes), dtype=np.int) * group})
        else:
            group_d[group] = pandas.DataFrame(
                {spiketimes.name: spiketimes,
                unit_ids.name: unit_ids})

    # optionally check that no cluster id appears in more than one group
    if verify_unique_clusters:
        clusters_by_group = [
            set(np.unique(np.asarray(groupdata.unit)))
            for groupdata in group_d.values()]
        n_unique_clusters = len(set.union(*clusters_by_group))
        n_total_clusters = sum([len(g) for g in clusters_by_group])
        if n_unique_clusters != n_total_clusters:
            raise ValueError("got %d overlapping clusters" %
                (n_total_clusters - n_unique_clusters))

    # turn the per-group frames into one giant dataframe for everybody
    sorted_keys = sorted(group_d.keys())
    data = pandas.concat([group_d[key] for key in sorted_keys],
        ignore_index=True)

    if save_memoized:
        data.save(memoized_filename)

    return data