import numpy as np
import scipy.integrate
import scipy.ndimage

# package-local modules referenced below
import utils
import info


def integrator_solve(df):
    cum_vec = np.array(np.cumsum(df['ct']))
    binheaders = utils.get_column_headers(df)
    n_bins = 1000
    n_batches = len(binheaders)
    f_binned = np.zeros((n_bins, n_batches))
    # n_bins + 1 edges give n_bins integration intervals
    bins = np.linspace(cum_vec[-1]/n_bins - 1, cum_vec[-1] - 1, n_bins + 1, dtype=int)
    for i in range(n_bins):
        for j in range(n_batches):
            batch_name = binheaders[j]
            # integrand_1 is assumed to be defined elsewhere in the module
            f_binned[i, j] = scipy.integrate.quad(integrand_1, bins[i], bins[i+1])[0]
    # smooth along the bin axis, then normalize to a joint distribution
    f_reg = scipy.ndimage.gaussian_filter1d(f_binned, 0.04*n_bins, axis=0)
    f_reg = f_reg/f_reg.sum()

    # compute marginal probabilities
    p_b = np.sum(f_reg, axis=1)
    p_s = np.sum(f_reg, axis=0)

    # finally sum to compute the MI
    MI = 0
    for j in range(n_batches):
        for i in range(n_bins):
            if f_reg[i, j] != 0:
                MI = MI + f_reg[i, j]*np.log2(f_reg[i, j]/(p_b[i]*p_s[j]))
    return MI
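
# The nested loop above is the standard discrete mutual-information sum,
# MI = sum_ij p(i,j) * log2( p(i,j) / (p_b(i)*p_s(j)) ), applied to the
# smoothed joint table f_reg. Below is a minimal, vectorized sketch of the
# same calculation; it is standalone, and the function name is illustrative
# rather than part of the package.
def mutual_info_from_joint(joint):
    '''Discrete MI (in bits) of a joint probability table whose rows index
        bins and columns index batches; the table is assumed to sum to 1.'''
    p_b = joint.sum(axis=1, keepdims=True)   # marginal of the row (bin) variable
    p_s = joint.sum(axis=0, keepdims=True)   # marginal of the column (batch) variable
    mask = joint > 0                         # skip zero cells, as the loop above does
    ratio = joint[mask] / (p_b @ p_s)[mask]
    return np.sum(joint[mask] * np.log2(ratio))
# Calling it on the f_reg built inside integrator_solve reproduces the loop's result.
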
def alt2(df):
    '''This is our standard mutual information calculator, called when
        lm=mi. It is currently the fastest, but it requires building a large
        matrix and therefore uses a lot of memory; it cannot run the mpra
        data set on my computer.'''
    n_bins=1000
    n_seqs = len(df.index)
    
    
    binheaders = utils.get_column_headers(df)
    ct_vec = np.array(df['ct'],dtype=int)
    
    n_batches = len(binheaders)
    '''Create a huge matrix in which each row is one instance of a sequence's
        bin profile (normalized such that the row sums to 1). The number of
        rows each sequence gets equals its total number of counts.'''
    f = np.repeat(np.array(df[binheaders]),ct_vec,axis=0)
    zlen = f.shape[0]
    #do a cumulative sum
    f_binned = f.cumsum(axis=0)
    #Now we are going to bin. First find bin edges.
    bins = np.linspace(zlen/n_bins-1,zlen-1,n_bins,dtype=int)
    '''We want to basically sum our original f matrix within each bin. We currently
        have a cumulative sum of this, so first we will select the entries at
        each bin edge.'''
    f_binned = f_binned[bins,:]
    #subtract off previous entry to get only summation within each range
    f_binned[1:,:] = np.subtract(f_binned[1:,:],f_binned[:-1,:])
    #smooth along the bin axis with a gaussian filter
    f_reg = scipy.ndimage.gaussian_filter1d(f_binned,0.04*n_bins,axis=0)
    #normalize so the table sums to 1
    f_reg = f_reg/f_reg.sum()

    # compute marginal probabilities
    p_b = np.sum(f_reg,axis=1)
    p_s = np.sum(f_reg,axis=0)

    # finally sum to compute the MI
    MI = 0
    for j in range(n_batches):
        for i in range(n_bins):
            if f_reg[i,j] != 0:
                MI = MI + f_reg[i,j]*np.log2(f_reg[i,j]/(p_b[i]*p_s[j]))
    return MI
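
# The cumulative-sum / edge-selection / subtraction steps above are a fast
# way of summing the rows of f within each bin. A small standalone
# demonstration on made-up data (nothing here comes from the package):
def _demo_cumsum_binning():
    rng = np.random.default_rng(0)
    f_demo = rng.random((12, 3))             # 12 repeated "reads", 3 batches
    n_bins_demo = 4
    zlen = f_demo.shape[0]
    edges = np.linspace(zlen/n_bins_demo - 1, zlen - 1, n_bins_demo, dtype=int)  # [2, 5, 8, 11]
    f_binned = f_demo.cumsum(axis=0)[edges, :]
    f_binned[1:, :] = f_binned[1:, :] - f_binned[:-1, :]
    # Each row of f_binned now equals the column-wise sum of f_demo over that bin.
    assert np.allclose(f_binned[1], f_demo[3:6].sum(axis=0))
    return f_binned
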
def alt4(df, coarse_graining_level = 0.01):
    '''
    MI ESTIMATOR EDITED BY JBK 
    Used when lm=memsaver 
    REQUIRES TESTING AND PROFILING.
    '''
    n_groups=500
    n_seqs = len(df.index)
    binheaders = utils.get_column_headers(df)
    n_batches = len(binheaders)
    cts_grouped = np.zeros([n_groups,n_batches])
    group_num = 0
    frac_empty = 1.0
    
    # work on a copy containing only the count columns and the model predictions
    tmp_df = df[binheaders+['val']].copy()

    # Speed computation by coarse-graining model predictions
    if coarse_graining_level:
        assert type(coarse_graining_level)==float
        assert coarse_graining_level > 0
        vals = tmp_df['val'].values
        scale = np.std(vals)
        coarse_vals = np.floor((vals/scale)/coarse_graining_level)
        tmp_df['val'] = coarse_vals
        grouped = tmp_df.groupby('val')
        grouped_tmp_df = grouped.sum()
        grouped_tmp_df.sort_index(inplace=True)
    else:
        grouped_tmp_df = tmp_df
        grouped_tmp_df.sort_values(by='val',inplace=True)
    # Get ct_xxx columns
    ct_df = grouped_tmp_df[binheaders].astype(float)
    cts_per_group = ct_df.sum(axis=0).sum()/n_groups
    # Histogram counts in groups. This is a bit tricky
    group_vec = np.zeros(n_batches)
    for i,row in ct_df.iterrows():
        row_ct_tot = row.sum()
        row_ct_vec = row.values
        row_frac_vec = row_ct_vec/row_ct_tot 

        while row_ct_tot >= cts_per_group*frac_empty:
            group_vec = group_vec + row_frac_vec*(cts_per_group*frac_empty)
            row_ct_tot -= cts_per_group*frac_empty

            # Only do once per group_num
            cts_grouped[group_num,:] = group_vec.copy() 
            # Reset for new group_num
            group_num += 1
            frac_empty = 1.0
            group_vec[:] = 0.0
        group_vec += row_frac_vec*row_ct_tot
        
        frac_empty -= row_ct_tot/cts_per_group
    if group_num == n_groups-1:
        cts_grouped[group_num,:] = group_vec.copy()
    elif group_num == n_groups:
        pass
    else:
        raise ValueError(\
            'group_num=%d does not match n_groups=%d'%(group_num,n_groups))
    # Smooth the empirical distribution with a gaussian filter
    f_reg = scipy.ndimage.gaussian_filter1d(cts_grouped,0.04*n_groups,axis=0)

    # Return mutual information
    return info.mutualinfo(f_reg)
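
# The coarse-graining branch above pools sequences whose rescaled model
# predictions land on the same grid point of width coarse_graining_level,
# shrinking the table that then has to be grouped. A standalone toy
# demonstration of that step (the DataFrame and its columns are made up,
# not part of the package):
def _demo_coarse_graining(coarse_graining_level=0.01):
    import pandas as pd
    rng = np.random.default_rng(1)
    toy = pd.DataFrame({'val': rng.normal(size=1000),
                        'ct_0': rng.integers(0, 5, size=1000),
                        'ct_1': rng.integers(0, 5, size=1000)})
    vals = toy['val'].values
    # Rescale by the standard deviation, floor onto the grid, then pool counts.
    toy['val'] = np.floor((vals/np.std(vals))/coarse_graining_level)
    return toy.groupby('val').sum().sort_index()   # one row per coarse-grained value
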
def alt4(df, coarse_graining_level = 0.01,return_freg=False):
    '''
    MI ESTIMATOR EDITED BY JBK 
    Used when lm=memsaver 
    REQUIRES TESTING AND PROFILING.
    '''
    n_groups=500
    n_seqs = len(df.index)
    binheaders = utils.get_column_headers(df)
    n_batches = len(binheaders)
    cts_grouped = np.zeros([n_groups,n_batches])
    group_num = 0
    frac_empty = 1.0
    
    # work on a copy containing only the count columns and the model predictions
    tmp_df = df[binheaders+['val']].copy()

    # Speed computation by coarse-graining model predictions
    if coarse_graining_level:
        assert type(coarse_graining_level)==float
        assert coarse_graining_level > 0
        vals = tmp_df['val'].values
        scale = np.std(vals)
        coarse_vals = np.floor((vals/scale)/coarse_graining_level)
        tmp_df['val'] = coarse_vals
        grouped = tmp_df.groupby('val')
        grouped_tmp_df = grouped.sum()
        grouped_tmp_df.sort_index(inplace=True)
    else:
        grouped_tmp_df = tmp_df
        grouped_tmp_df.sort_values(by='val',inplace=True)
    # Get ct_xxx columns
    ct_df = grouped_tmp_df[binheaders].astype(float)
    cts_per_group = ct_df.sum(axis=0).sum()/n_groups
    # Histogram counts in groups. This is a bit tricky
    group_vec = np.zeros(n_batches)
    for i,row in ct_df.iterrows():
        row_ct_tot = row.sum()
        row_ct_vec = row.values
        row_frac_vec = row_ct_vec/row_ct_tot 

        while row_ct_tot >= cts_per_group*frac_empty:
            group_vec = group_vec + row_frac_vec*(cts_per_group*frac_empty)
            row_ct_tot -= cts_per_group*frac_empty

            # Only do once per group_num
            cts_grouped[group_num,:] = group_vec.copy() 
            # Reset for new group_num
            group_num += 1
            frac_empty = 1.0
            group_vec[:] = 0.0
        group_vec += row_frac_vec*row_ct_tot
        
        frac_empty -= row_ct_tot/cts_per_group
    if group_num == n_groups-1:
        cts_grouped[group_num,:] = group_vec.copy()
    elif group_num == n_groups:
        pass
    else:
        raise ValueError(\
            'group_num=%d does not match n_groups=%d'%(group_num,n_groups))
    # Smooth the empirical distribution with a gaussian filter
    f_reg = scipy.ndimage.gaussian_filter1d(cts_grouped,0.08*n_groups,axis=0)

    # Return mutual information
    if return_freg:
        return info.mutualinfo(f_reg), f_reg
    else:
        return info.mutualinfo(f_reg)
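
# The row loop above distributes the sorted counts into n_groups groups of
# (nearly) equal total counts, splitting a row proportionally when it
# straddles a group boundary. Below is a coarser, standalone sketch of the
# same idea that assigns whole rows to groups by their cumulative total,
# without the proportional split; the data are made up.
def _demo_equal_count_grouping(n_groups=10):
    rng = np.random.default_rng(2)
    ct = rng.integers(0, 20, size=(100, 3)).astype(float)  # counts per sorted row, 3 batches
    cum = np.cumsum(ct.sum(axis=1))
    edges = np.linspace(0, cum[-1], n_groups + 1)
    group_of_row = np.clip(np.searchsorted(edges, cum, side='left') - 1, 0, n_groups - 1)
    cts_grouped = np.zeros((n_groups, ct.shape[1]))
    np.add.at(cts_grouped, group_of_row, ct)   # accumulate each row into its group
    return cts_grouped
# Typical use of alt4: pass a dataframe with 'val' and ct_* columns; with
# return_freg=True it also returns the smoothed joint table f_reg.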