Exemplo n.º 1
0
def branch_cluster(features, *, branch_depth=2, npca=10):
    """
    Cluster the columns of `features`, then recursively re-cluster each
    sufficiently large cluster and merge the results into one labeling.

    Parameters
    ----------
    features : ndarray
        Feature array whose columns are the samples (sub-clusters are
        selected with ``features[:, indices]``).
    branch_depth : int
        Number of clustering levels still allowed. With a value <= 1 the
        top-level labels are returned without any further splitting.
    npca : int
        Forwarded unchanged to `cluster`.

    Returns
    -------
    ndarray
        One int64 label per sample. An empty `features` yields an empty
        array.

    Raises
    ------
    Exception
        If `cluster` produces a negative label. The features are written
        to a debug file first so the failure can be reproduced.
    """
    if features.size == 0:
        return np.array([])

    # Clusters with this many members or fewer are kept as they are.
    min_size_to_try_split = 20

    top_labels = cluster(features, npca=npca).ravel().astype('int64')
    if np.min(top_labels) < 0:
        debug_path = '/tmp/isosplit5-debug-features.mda'
        mdaio.writemda32(features, debug_path)
        raise Exception(
            'Unexpected error in isosplit5. Features written to {}'.format(
                debug_path))

    num_top = int(np.max(top_labels))
    if num_top <= 1 or branch_depth <= 1:
        return top_labels

    merged = np.zeros(top_labels.shape, dtype='int64')
    next_offset = 0  # number of output labels handed out so far
    for top_label in range(1, num_top + 1):
        members = np.where(top_labels == top_label)[0]

        if len(members) <= min_size_to_try_split:
            # Too small to split: the cluster becomes one output label.
            merged[members] = next_offset + 1
            next_offset += 1
            continue

        sub_labels = branch_cluster(features[:, members],
                                    branch_depth=branch_depth - 1,
                                    npca=npca)
        num_sub = int(np.max(sub_labels))
        # Shift the sub-labels so they follow the labels already assigned.
        merged[members] = next_offset + sub_labels
        next_offset += num_sub

    return merged
Exemplo n.º 2
0
def bandpass_filter(timeseries,
                    timeseries_out,
                    samplerate,
                    freq_min,
                    freq_max,
                    freq_wid=1000,
                    padding=3000,
                    chunk_size=3000 * 10,
                    num_processes=os.cpu_count()):
    """
    Apply a bandpass filter to a multi-channel timeseries

    The input is split into chunks along the time axis and the chunks are
    handed to a pool of worker processes running `filter_chunk`.

    Parameters
    ----------
    timeseries : INPUT
        MxN raw timeseries array (M = #channels, N = #timepoints)

    timeseries_out : OUTPUT
        Filtered output (MxN array)

    samplerate : float
        The sampling rate in Hz
    freq_min : float
        The lower endpoint of the frequency band (Hz)
    freq_max : float
        The upper endpoint of the frequency band (Hz)
    freq_wid : float
        The optional width of the roll-off (Hz)
    padding : int
        Passed to `filter_chunk` through the shared options; presumably the
        number of extra timepoints read around each chunk -- confirm in
        `filter_chunk`.
    chunk_size : int
        Number of timepoints per chunk.
    num_processes : int
        Number of worker processes in the pool.

    Returns
    -------
    bool
        Always True once every chunk has been processed.
    """
    global g_shared_data, g_opts

    X = mdaio.DiskReadMda(timeseries)
    M = X.N1()  # Number of channels
    N = X.N2()  # Number of timepoints

    num_chunks = int(np.ceil(N / chunk_size))
    print('Chunk size: {}, Padding: {}, Num chunks: {}, Num processes: {}'.
          format(chunk_size, padding, num_chunks, num_processes))

    opts = {
        "timeseries": timeseries,
        "timeseries_out": timeseries_out,
        "samplerate": samplerate,
        "freq_min": freq_min,
        "freq_max": freq_max,
        "freq_wid": freq_wid,
        "chunk_size": chunk_size,
        "padding": padding,
        "num_processes": num_processes,
        "num_chunks": num_chunks
    }

    # The module-level state must be in place before the pool is created,
    # as in the original ordering, so the workers can read it.
    g_shared_data = SharedChunkInfo(num_chunks)
    g_opts = opts

    # Start the output as an empty Mx0 array (presumably filter_chunk
    # appends its result to this file -- confirm in filter_chunk).
    mdaio.writemda32(np.zeros([M, 0]), timeseries_out)

    # pool.map blocks until every chunk is done, so leaving the `with`
    # block only reaps idle workers instead of leaking them.
    with multiprocessing.Pool(processes=num_processes) as pool:
        pool.map(filter_chunk, range(num_chunks), chunksize=1)
    return True
Exemplo n.º 3
0
def whiten(*,
        timeseries,timeseries_out,
        chunk_size=30000*10,num_processes=os.cpu_count()
        ):
    """
    Whiten a multi-channel timeseries

    A channel covariance estimate is accumulated from a subset of chunks,
    a whitening matrix is derived from its SVD, and every chunk is then
    passed together with that matrix to `whiten_chunk` in a worker pool.

    Parameters
    ----------
    timeseries : INPUT
        MxN raw timeseries array (M = #channels, N = #timepoints)

    timeseries_out : OUTPUT
        Whitened output (MxN array)

    chunk_size : int
        Number of timepoints per chunk.
    num_processes : int
        Number of worker processes in each pool.

    Returns
    -------
    bool
        Always True once every chunk has been processed.
    """
    global g_opts, g_shared_data

    X = mdaio.DiskReadMda(timeseries)
    M = X.N1()  # Number of channels
    N = X.N2()  # Number of timepoints

    # Roughly this many chunks are sampled for the covariance estimate.
    num_chunks_for_computing_cov_matrix = 10

    num_chunks = int(np.ceil(N / chunk_size))
    print ('Chunk size: {}, Num chunks: {}, Num processes: {}'.format(chunk_size,num_chunks,num_processes))

    opts = {
        "timeseries": timeseries,
        "timeseries_out": timeseries_out,
        "chunk_size": chunk_size,
        "num_processes": num_processes,
        "num_chunks": num_chunks
    }
    # Must be set before the pool is created so the workers can read it.
    g_opts = opts

    step = int(np.maximum(1, np.floor(num_chunks / num_chunks_for_computing_cov_matrix)))
    cov_chunk_nums = range(0, num_chunks, step)
    with multiprocessing.Pool(processes=num_processes) as pool:
        AAt_matrices = pool.map(compute_AAt_matrix_for_chunk, cov_chunk_nums, chunksize=1)

    # Normalize by the number of timepoints actually covered by the sampled
    # chunks; the final chunk of the file can be shorter than chunk_size.
    # NOTE(review): assumes compute_AAt_matrix_for_chunk(num) returns the
    # un-normalized product over timepoints
    # [num*chunk_size, min(N, (num+1)*chunk_size)) -- confirm in that helper.
    num_timepoints_sampled = sum(
        min(N, (num + 1) * chunk_size) - num * chunk_size
        for num in cov_chunk_nums)

    AAt = np.zeros((M, M), dtype='float64')
    for M0 in AAt_matrices:
        AAt += M0 / num_timepoints_sampled

    U, S, Ut = np.linalg.svd(AAt, full_matrices=True)

    # Whitening matrix built from the SVD factors: U * diag(1/sqrt(S)) * Ut
    W = (U @ np.diag(1 / np.sqrt(S))) @ Ut

    g_shared_data = SharedChunkInfo(num_chunks)
    # Start the output as an empty Mx0 array (presumably whiten_chunk
    # appends its result to this file -- confirm in whiten_chunk).
    mdaio.writemda32(np.zeros([M, 0]), timeseries_out)

    # starmap blocks until all chunks are done, so leaving the `with`
    # block only reaps idle workers instead of leaking them.
    with multiprocessing.Pool(processes=num_processes) as pool:
        pool.starmap(whiten_chunk, [(num, W) for num in range(0, num_chunks)], chunksize=1)

    return True
Exemplo n.º 4
0
def test_mask_out_artifacts():
    """
    End-to-end check of `mask_out_artifacts` on a synthetic single-channel
    recording.

    Gaussian noise with three high-amplitude bursts is written to a
    temporary .mda file, the masking is run, and every sample inside the
    bursts is expected to be zero in the output.

    Returns
    -------
    bool
        True if all artifact samples were zeroed, False otherwise. The
        temporary files are removed in either case, including when the
        pipeline raises.
    """
    # Create noisy array
    samplerate = int(48e3)
    duration = 30  # seconds
    n_samples = samplerate * duration
    noise_amplitude = 5
    noise = noise_amplitude * np.random.normal(0, 1, n_samples)

    # Add artifacts at 10%, 20% and 70% of the recording
    artifact_shift = np.array([int(n_samples * 0.10), int(n_samples * 0.20), int(n_samples * 0.70)])
    n_artifacts = len(artifact_shift)
    artifact_duration = int(0.2 * samplerate)  # samples
    artifact_signal = np.zeros((n_artifacts, artifact_duration))
    for i in np.arange(n_artifacts):
        artifact_signal[i, :] = noise_amplitude * np.random.normal(0, 6, artifact_duration)

    # One row of sample indices per artifact, shifted to its start position
    artifact_indices = np.tile(np.arange(artifact_duration), (n_artifacts, 1))
    artifact_indices += artifact_shift.reshape((-1, 1))

    artifacts = np.zeros_like(noise)
    for i, indices in enumerate(artifact_indices):
        artifacts[indices] = artifact_signal[i, :]

    signal = noise + artifacts

    timeseries = 'test_mask.mda'
    timeseries_out = 'masked.mda'

    try:
        # write as mda
        mdaio.writemda32(signal.reshape((1, -1)), timeseries)

        # run the mask artefacts
        mask_out_artifacts(timeseries=timeseries, timeseries_out=timeseries_out, threshold=6, chunk_size=2000,
                           num_write_chunks=150)

        # check that they are gone
        masked_data = mdaio.readmda(timeseries_out).reshape((-1, 1))
    finally:
        # Do not leave temporary files behind, even on failure.
        for fname in (timeseries, timeseries_out):
            if os.path.exists(fname):
                os.remove(fname)

    indices_masked = int(np.count_nonzero(masked_data[artifact_indices, 0] == 0))
    total_indices_to_mask = artifact_indices.size

    if indices_masked == total_indices_to_mask:
        print('Artifacts 100% masked')
        return True

    print('Artifacts %.2f%% masked' % (100*(indices_masked/total_indices_to_mask)))
    return False
Exemplo n.º 5
0
def test_compute_templates():
    """
    Smoke test for `compute_templates`.

    Writes a random timeseries and a random firings array to 'tmp.mda' and
    'tmp2.mda', runs `compute_templates` into 'tmp3.mda', and checks that
    the call reports success and that the stored templates have shape
    (channels, clip size, clusters).

    Returns
    -------
    bool
        True when both assertions pass.
    """
    num_channels, num_timepoints, num_clusters = 5, 1000, 6
    clip_size, num_events = 50, 100

    timeseries_path = 'tmp.mda'
    firings_path = 'tmp2.mda'
    templates_path = 'tmp3.mda'

    raw = np.random.rand(num_channels, num_timepoints)
    mdaio.writemda32(raw, timeseries_path)

    # Row 1 holds timestamps in 1..N, row 2 holds labels in 1..K.
    firings = np.zeros((3, num_events))
    firings[1, :] = 1 + np.random.randint(num_timepoints, size=(1, num_events))
    firings[2, :] = 1 + np.random.randint(num_clusters, size=(1, num_events))
    mdaio.writemda64(firings, firings_path)

    ret = compute_templates(timeseries=timeseries_path,
                            firings=firings_path,
                            templates_out=templates_path,
                            clip_size=clip_size)
    assert ret

    templates0 = mdaio.readmda(templates_path)
    expected_shape = (num_channels, clip_size, num_clusters)
    assert templates0.shape == expected_shape
    return True
Exemplo n.º 6
0
def compute_templates(*, timeseries, firings, templates_out, clip_size=100):
    """
    Compute templates (average waveforms) for the clusters defined by the
    labeled events in firings, and write them to an mda file.

    Parameters
    ----------
    timeseries : INPUT
        Path of timeseries mda file (MxN) from which to draw the event clips
        (snippets) for computing the templates. M is number of channels,
        N is number of timepoints.
    firings : INPUT
        Path of firings mda file (RxL) where R>=3 and L is the number of
        events. Second row are timestamps, third row are integer labels.

    templates_out : OUTPUT
        Path of output mda file (MxTxK). T=clip_size, K=maximum cluster
        label. Note that empty clusters will correspond to a template of
        all zeros.

    clip_size : int
        (Optional) clip size, aka snippet size, number of timepoints in a
        single template

    Returns
    -------
    The value returned by `mdaio.writemda32` for the output file.
    """
    # The actual computation lives in compute_templates_helper; this
    # wrapper only persists its result.
    helper_kwargs = dict(timeseries=timeseries,
                         firings=firings,
                         clip_size=clip_size)
    templates = compute_templates_helper(**helper_kwargs)
    write_result = mdaio.writemda32(templates, templates_out)
    return write_result
Exemplo n.º 7
0
    return True


# Metadata attributes attached to the function object.
bandpass_filter.name = 'ephys.bandpass_filter'
bandpass_filter.version = '0.1'

if __name__ == "__main__":
    # Demo: convert two .mat recordings to .mda, bandpass-filter each one,
    # and read the filtered results back in.

    samplerate = int(3e4)
    freq_min = 250
    freq_max = 6000
    data_dir = '../ephys_preprocessing/'

    # --- raw channel-1 data ---
    timeseries = os.path.join(data_dir, 'raw_data_ch1.mda')
    timeseries_out = os.path.join(data_dir, 'filtered_raw_data_ch1.mda')
    raw_data_ch1 = np.asarray(
        sio.loadmat(os.path.join(data_dir, 'raw_data_ch1.mat'))['data'])
    mdaio.writemda32(raw_data_ch1, timeseries)
    bandpass_filter(timeseries, timeseries_out, samplerate, freq_min, freq_max)
    filtered_data = mdaio.readmda(timeseries_out)

    # --- detrended channel-1 data (stored under the 'copy' key) ---
    timeseries = os.path.join(data_dir, 'detrended_data_ch1.mda')
    timeseries_out = os.path.join(data_dir, 'filtered_detrended_data_ch1.mda')
    detrended_data_ch1 = np.asarray(
        sio.loadmat(os.path.join(data_dir, 'detrended_data_ch1.mat'))['copy'])
    mdaio.writemda32(detrended_data_ch1, timeseries)
    bandpass_filter(timeseries, timeseries_out, samplerate, freq_min, freq_max)
    filtered_data_detrended = mdaio.readmda(timeseries_out)
Exemplo n.º 8
0
def synthesize_timeseries(*,
                          firings='',
                          waveforms='',
                          timeseries_out,
                          noise_level=1,
                          samplerate=30000,
                          duration=60,
                          waveform_upsamplefac=1,
                          amplitudes_row=0):
    """
    Synthesize an electrophysiology timeseries from a set of ground-truth firing events and waveforms

    Gaussian noise is generated for every channel, and for each firing
    event the waveform of its unit is added at the event time.

    Parameters
    ----------
    firings : INPUT
        (Optional) The path of firing events file in .mda format, or an
        already-loaded array. RxL where R>=3 and L is the number of events.
        Second row is the timestamps, third row is the integer labels
        (1-based: label k selects waveforms[:, :, k-1]).
    waveforms : INPUT
        (Optional) The path of (possibly upsampled) waveforms file in .mda
        format, or an already-loaded array. Mx(T*waveform_upsamplefac)xK,
        where M is the number of channels, T is the clip size, and K is the
        number of units.

    timeseries_out : OUTPUT
        The output path for the new timeseries. MxN. If empty/falsy, nothing
        is written and the array is returned instead.

    noise_level : double
        (Optional) Standard deviation of the simulated background noise added to the timeseries
    samplerate : double
        (Optional) Sample rate for the synthetic dataset in Hz
    duration : double
        (Optional) Duration of the synthetic dataset in seconds. The number
        of timepoints will be duration*samplerate. If that product is 0, the
        length is derived from the latest event time plus one clip.
    waveform_upsamplefac : int
        (Optional) The upsampling factor corresponding to the input waveforms. (avoids digitization artifacts)
    amplitudes_row : int
        (Optional) If positive, this is the (1-based) row in the firings array where the amplitude scale factors are found. Otherwise, use all 1's

    Returns
    -------
    The result of `mdaio.writemda32` when `timeseries_out` is given,
    otherwise the synthesized MxN array.
    """
    num_timepoints = np.int64(samplerate * duration)
    waveform_upsamplefac = int(waveform_upsamplefac)

    if isinstance(waveforms, str):
        if waveforms:
            W = mdaio.readmda(waveforms)
        else:
            # No waveforms given: 4 channels, clip size 100, zero units.
            W = np.zeros((4, 100 * waveform_upsamplefac, 0))
    else:
        W = waveforms

    if isinstance(firings, str):
        if firings:
            F = mdaio.readmda(firings)
        else:
            F = np.zeros((3, 0))
    else:
        F = firings

    times = F[1, :]
    labels = F[2, :].astype('int')

    M, TT = W.shape[0], W.shape[1]
    T = int(TT / waveform_upsamplefac)  # clip size at the output sample rate
    Tmid = int(np.ceil((T + 1) / 2 - 1))  # index of the clip's center sample

    N = num_timepoints
    if N == 0:
        if times.size == 0:
            N = T
        else:
            # Event times may be fractional floats; the array length must
            # be an integer.
            N = int(np.ceil(np.max(times))) + T

    X = np.random.randn(M, N) * noise_level

    for j, (t0, k0) in enumerate(zip(times, labels)):
        amp0 = 1
        if amplitudes_row > 0:
            amp0 = F[amplitudes_row - 1, j]
        # Labels are 1-based, so label k0 maps to waveform index k0 - 1.
        waveform0 = W[:, :, k0 - 1]
        # The fractional part of the event time selects which phase of the
        # upsampled waveform is subsampled.
        frac_offset = int(np.floor((t0 - np.floor(t0)) * waveform_upsamplefac))
        tstart = np.int64(np.floor(t0)) - Tmid
        # Events whose clip would run off either end are skipped.
        if (0 <= tstart) and (tstart + T <= N):
            X[:, tstart:tstart + T] += (
                waveform0[:, frac_offset::waveform_upsamplefac] * amp0)

    if timeseries_out:
        return mdaio.writemda32(X, timeseries_out)
    return X
Exemplo n.º 9
0
def mask_out_artifacts(*,
                       timeseries,
                       timeseries_out,
                       threshold=6,
                       chunk_size=2000,
                       num_write_chunks=150,
                       num_processes=os.cpu_count()):
    """
    Masks out artifacts. Each chunk will be analyzed, and if the square root of the
    RSS of the chunk is above threshold, all the samples in this chunk (and neighboring chunks)
    will be set to zero.

    Parameters
    ----------
    timeseries : INPUT
        MxN raw timeseries array (M = #channels, N = #timepoints)

    timeseries_out : OUTPUT
        masked output (MxN array)

    threshold : int
        Number of standard deviations away from the mean to consider as artifacts (default of 6).
    chunk_size : int
        This chunk size will be the number of samples that will be set to zero if the square root RSS of this chunk is above threshold.
    num_write_chunks : int
        How many chunks will be simultaneously written to the timeseries_out path (default of 150).
    num_processes : int
        Number of worker processes in the pool.

    Returns
    -------
    bool
        False if threshold, chunk_size or num_write_chunks is zero
        (nothing is processed); True once all chunks have been handled.
    """
    global g_opts, g_shared_data

    if threshold == 0 or chunk_size == 0 or num_write_chunks == 0:
        print(
            "Problem with input parameters. Either threshold, num_write_chunks, or chunk_size is zero.\n"
        )
        return False

    write_chunk_size = chunk_size * num_write_chunks

    opts = {
        "timeseries": timeseries,
        "timeseries_out": timeseries_out,
        "chunk_size": chunk_size,
        "num_processes": num_processes,
        "num_write_chunks": num_write_chunks,
        "write_chunk_size": write_chunk_size,
    }
    g_opts = opts

    X = mdaio.DiskReadMda(timeseries)

    M = X.N1()  # Number of channels
    N = X.N2()  # Number of timepoints

    # compute norms of chunks
    num_chunks = int(np.ceil(N / chunk_size))
    num_write = int(np.ceil(N / write_chunk_size))

    norms = np.zeros((M, num_chunks))  # num channels x num_chunks

    for i in np.arange(num_chunks):
        t1 = int(i * chunk_size)  # first timepoint of the chunk
        t2 = int(np.minimum(N,
                            (t1 + chunk_size)))  # last timepoint of chunk (+1)

        chunk = X.readChunk(i1=0, N1=X.N1(), i2=t1,
                            N2=t2 - t1).astype(np.float32)  # Read the chunk

        norms[:, i] = np.sqrt(np.sum(chunk**2,
                                     axis=1))  # num_channels x num_chunks

    # determine which chunks to use: 1 = keep, 0 = mask. A chunk flagged on
    # any channel is masked for all channels.
    use_it = np.ones(num_chunks)

    for m in np.arange(M):
        vals = norms[m, :]

        sigma0 = np.std(vals)
        mean0 = np.mean(vals)

        artifact_indices = np.where(vals > mean0 + sigma0 * threshold)[0]

        # positions (within artifact_indices) of artifacts that have a left
        # neighbor, so that index - 1 never goes negative
        negIndBool = np.where(artifact_indices > 0)[0]

        # positions of artifacts that have a right neighbor, so that
        # index + 1 never causes an IndexError
        maxIndBool = np.where(artifact_indices < num_chunks - 1)[0]

        use_it[artifact_indices] = 0
        use_it[artifact_indices[negIndBool] -
               1] = 0  # don't use the neighbor chunks either
        use_it[artifact_indices[maxIndBool] +
               1] = 0  # don't use the neighbor chunks either

        print("For channel %d: mean=%.2f, stdev=%.2f, chunk size = %d\n" %
              (m, mean0, sigma0, chunk_size))

    g_shared_data = SharedChunkInfo(num_write)

    mdaio.writemda32(
        np.zeros([M, 0]), timeseries_out
    )  # create initial file w/ empty array so we can append to it

    # Each task receives one write-chunk number plus the keep/mask flags of
    # the analysis chunks it contains. starmap blocks until all tasks are
    # done, so leaving the `with` block only reaps idle workers instead of
    # leaking them.
    with multiprocessing.Pool(processes=num_processes) as pool:
        pool.starmap(
            mask_chunk,
            [(num, use_it[num * num_write_chunks:(num + 1) * num_write_chunks])
             for num in range(0, num_write)],
            chunksize=1)

    # These are counts of chunks, which the message reports as a
    # percentage of timepoints.
    num_timepoints_used = sum(use_it)
    num_timepoints_not_used = sum(use_it == 0)
    print("Using %.2f%% of all timepoints.\n" %
          (num_timepoints_used * 100.0 /
           (num_timepoints_used + num_timepoints_not_used)))
    return True
Exemplo n.º 10
0
def synthesize_random_waveforms(*,
                                waveforms_out=None,
                                geometry_out=None,
                                M=5,
                                T=500,
                                K=20,
                                upsamplefac=13,
                                timeshift_factor=3,
                                average_peak_amplitude=10):
    """
    Synthesize random waveforms for use in creating a synthetic timeseries dataset

    Parameters
    ----------
    waveforms_out : OUTPUT
        Path to waveforms mda file. Mx(T*upsamplefac)xK
    geometry_out : OUTPUT
        (Optional) Path to geometry csv file. Only written when
        waveforms_out is also given.
    M : int
        (Optional) Number of channels
    T : int
        (Optional) Number of timepoints for a waveform, before upsampling
    K : int
        (Optional) Number of waveforms to synthesize
    upsamplefac : int
        (Optional) used for upsampling the waveforms to avoid discretization artifacts
    timeshift_factor : int
        (Optional) Controls amount of timeshift between waveforms on different channels for each template
    average_peak_amplitude : float
        (Optional) used to scale the peak spike amplitude

    Returns
    -------
    True when waveforms_out is given (the files are written); otherwise the
    tuple (waveforms, geometry).
    """
    # Four-element mean / standard-deviation parameters for the durations
    # and amplitudes handed to synthesize_single_waveform.
    avg_durations = np.array([200, 10, 30, 200])
    avg_amps = np.array([0.5, 10, -1, 0])
    rand_durations_stdev = np.array([10, 4, 6, 20])
    rand_amps_stdev = np.array([0.2, 3, 0.5, 0])
    amp_factor_lo, amp_factor_hi = np.array([0.5, 1])
    geom_spread_coef1 = 0.2
    geom_spread_coef2 = 1

    # Default geometry: channels at x = 1..M, y = 0.
    geometry = np.zeros((2, M))
    geometry[0, :] = np.arange(1, M + 1)

    neuron_locations = get_default_neuron_locations(M, K, geometry)

    num_samples = T * upsamplefac
    WW = np.zeros((M, num_samples, K))

    for unit in range(K):
        for chan in range(M):
            offset = neuron_locations[:, unit] - geometry[:, chan]
            dist = np.sqrt(np.sum(offset ** 2))

            # Durations are floored at 1 before being scaled to the
            # upsampled time base.
            jittered_durations = avg_durations + np.random.randn(1, 4) * rand_durations_stdev
            durations0 = np.maximum(np.ones(avg_durations.shape), jittered_durations) * upsamplefac
            amps0 = avg_amps + np.random.randn(1, 4) * rand_amps_stdev

            waveform0 = synthesize_single_waveform(N=num_samples, durations=durations0, amps=amps0)
            # Shift in time proportionally to the unit-to-channel distance.
            waveform0 = np.roll(waveform0, int(timeshift_factor * dist * upsamplefac))
            waveform0 = waveform0 * np.random.uniform(amp_factor_lo, amp_factor_hi)
            # Attenuate with distance from the unit.
            WW[chan, :, unit] = waveform0 / (geom_spread_coef1 + dist * geom_spread_coef2)

    # Rescale so that the mean per-unit peak equals average_peak_amplitude.
    peaks = np.max(np.abs(WW), axis=(0, 1))
    WW = WW / np.mean(peaks) * average_peak_amplitude

    if not waveforms_out:
        return (WW, geometry)

    mdaio.writemda32(WW, waveforms_out)
    if geometry_out:
        np.savetxt(geometry_out, geometry.transpose(), delimiter=",", fmt="%g")
    return True