Example #1
def NB_PCAclassification(train_set,test_set,train_ann,test_ann):

    global path, train_perc

    #Create z-scored data
    normalized_train = zscore(train_set)
    
    #create classifier object
    gaussian_nb=GaussianNB()
        
    #Create PCA object and fit it on the z-scored training data
    pca = PCA(n_components=20)
    pca.fit(normalized_train)
    normalized_train = pca.transform(normalized_train)

    #train the NB classifier
    gaussian_nb.fit(normalized_train, train_ann)


    #store the classifier and the pca object
    if train_perc<1.0:
    	pickle.dump( gaussian_nb, open(path+"GaussianNB_classifier.p", "wb+" ) )
    	pickle.dump( pca, open(path+"PCA_object.p", "wb+"))

    #apply the same z-scoring and PCA to the test data, then test the NB classifier
    normalized_test = zscore(test_set)
    test = pca.transform(normalized_test)
    results = gaussian_nb.predict(test)

    cm = confusion_matrix(test_ann, results)

    print 'CONFUSION MATRIX = {}'.format(cm)
    return metrics(cm)
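
For reference, a minimal self-contained sketch of the same z-score + PCA + Gaussian naive Bayes flow, written with a scikit-learn Pipeline so the training statistics are reused on the test split; the toy data and variable names are illustrative, not taken from the example above:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=300, n_features=40, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# z-score, reduce to 20 components, then fit the naive Bayes classifier
clf = make_pipeline(StandardScaler(), PCA(n_components=20), GaussianNB())
clf.fit(X_train, y_train)
print(confusion_matrix(y_test, clf.predict(X_test)))
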
Example #2
def clean_confound(RS, COG, confmat):
    '''
    z-score the confounds, add their squares, and regress them out of RS and COG
    '''

    # regress out confound
    z_confound = zscore(confmat)
    # squared measures to help account for potentially nonlinear effects of these confounds
    z2_confound = z_confound**2
    conf_mat = np.hstack((z_confound, z2_confound))

    # Handle nan in z scores
    conf_mat = np.nan_to_num(conf_mat)

    # clean signal
    RS_clean = clean(zscore(RS),
                     confounds=conf_mat,
                     detrend=False,
                     standardize=False)
    COG_clean = clean(zscore(COG),
                      confounds=conf_mat,
                      detrend=False,
                      standardize=False)

    return RS_clean, COG_clean, conf_mat
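
A minimal usage sketch for the helper above, assuming it is in scope together with its imports (numpy as np, scipy's zscore, and nilearn.signal.clean imported as clean); the array shapes are illustrative:

import numpy as np

rng = np.random.RandomState(0)
RS = rng.randn(50, 10)       # e.g. resting-state features, subjects x features
COG = rng.randn(50, 4)       # e.g. cognitive scores
confmat = rng.randn(50, 3)   # e.g. age, motion, education

RS_clean, COG_clean, conf_mat = clean_confound(RS, COG, confmat)
print(RS_clean.shape, COG_clean.shape, conf_mat.shape)  # (50, 10) (50, 4) (50, 6)
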
Example #3
def stadardize(data, drop_vars, drop):
    if drop == 1:
        temp_data = data.drop(drop_vars, axis=1, inplace=False)
        data = po.DataFrame(zscore(temp_data), columns=temp_data.columns)
    else:
        data = po.DataFrame(zscore(data), columns=data.columns)
    return data
Example #4
def manifold_plot(man,
                  fpkmMatrix,
                  samples,
                  standardize=3,
                  log=True,
                  show_text=False,
                  sep='_',
                  legend_loc='best',
                  legend_size=14):
    # man: the instance of a manifold algorithm
    ## preprocessing of the fpkmMatrix
    if log:
        fpkmMatrix = np.log10(fpkmMatrix + 1.)
    if standardize == 2:  # standardize along rows/genes
        fpkmMatrix = zscore(fpkmMatrix, axis=1)
    elif standardize == 1:  # standardize along cols/samples
        fpkmMatrix = zscore(fpkmMatrix, axis=0)

    fpkmMatrix = man.fit_transform(fpkmMatrix.T)
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    scatter_proxies = []
    labels_show = []
    groups = {}
    conditions = list(set([s.split(sep)[0] for s in samples]))

    for row, label in zip(fpkmMatrix, samples):
        label_show = label.split(sep)[0]
        idx = conditions.index(label_show)
        ax.scatter(row[0],
                   row[1],
                   label='label',
                   color=COLORS10[idx],
                   visible=not show_text,
                   s=50,
                   marker='o')
        if label_show not in labels_show:
            labels_show.append(label_show)
            scatter1_proxy = Line2D([0], [0],
                                    ls="none",
                                    c=COLORS10[idx],
                                    marker='o')
            scatter_proxies.append(scatter1_proxy)
        if show_text:
            ax.text(row[0], row[1], label, \
             ha='center', va='center', rotation=0, color=COLORS10[idx], size='large')

    ax.legend(scatter_proxies,
              labels_show,
              numpoints=1,
              frameon=True,
              loc=legend_loc,
              prop={'size': legend_size})
    ax.set_xlabel('M1', fontsize=20)
    ax.set_ylabel('M2', fontsize=20)
    enlarge_tick_fontsize(ax, 14)
    fig.tight_layout()
    plt.show()

    return
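
The preprocessing steps above can be reproduced in isolation; a standalone sketch with t-SNE standing in for `man` (the matrix sizes are made up and the plotting helpers such as COLORS10 are omitted):

import numpy as np
from scipy.stats import zscore
from sklearn.manifold import TSNE

rng = np.random.RandomState(0)
fpkmMatrix = rng.rand(200, 12)                    # genes x samples
fpkmMatrix = np.log10(fpkmMatrix + 1.)
fpkmMatrix = zscore(fpkmMatrix, axis=1)           # standardize along rows/genes

man = TSNE(n_components=2, perplexity=5, random_state=0)
embedding = man.fit_transform(fpkmMatrix.T)       # one 2-D point per sample
print(embedding.shape)                            # (12, 2)
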
Example #5
def export_for_check_intercoder(data,field, folder, trinairize=True,trinary_cutoff=0.5, beta=1, errors=False, normalization='zscore'):
    # Load support libraries
    import sklearn.metrics
    import sklearn.utils
    from scipy.stats.mstats import zscore


    # Define helper function to parse results
    def parse_PRFS_result(predicted,true,beta):
        result = {}

        output = sklearn.metrics.precision_recall_fscore_support(true,predicted, beta=beta)
        fbeta_label = "f{beta}".format(beta=beta)
        # Parse the output
        labels = sklearn.utils.multiclass.unique_labels(true,predicted)
        n_predicted = [sum(predicted==label) for label in labels]
        output = (*output,n_predicted)
        output_fields = ["precision","recall",fbeta_label,"support",'n_predicted']
        for output_field,row in zip(output_fields,output):
            result[output_field] = dict(zip(labels,row))

        result['precision'].update({'global': sklearn.metrics.precision_score(true,predicted, average='weighted')})
        result['recall'].update({'global': sklearn.metrics.recall_score(true,predicted, average='weighted')})
        result[fbeta_label].update({'global': sklearn.metrics.fbeta_score(true, predicted, beta=beta, average='weighted')})
        result['support'].update({'global':len(predicted[~predicted.isnull()])})

        return result

    if normalization=="min-max":
        # min-max scale metrics to push them into a [-1,1] interval, assuming a minimum upper-value bound of 1
        colnames = [name for name in data.columns if field+'_' in name and not '_err' in name]
        data = data[colnames] / data[colnames].abs().max().map(lambda x: max(x,1))

    # Select appropriate subset of data for quality metrics
    gold_field = "{field}_gold".format(field=field)
    data = data[~data[gold_field].isnull()]

    # Run the quality report function over each column
    if not errors:
        cols = [col for col in data.columns if field+"_" in col and not '_gold' in col and not "_err" in col]
    df_results = {}
    for col in cols:
        df_results[col+'_gold'] = data[~data[col].isnull()&~data[gold_field].isnull()][gold_field]
        df_results[col] = data[~data[col].isnull()&~data[gold_field].isnull()][col]

        # Trinarize if so required to cast
        # task as a three-class classification problem
        if trinairize:
            df_results[col+'_gold'] = pandas.Series(zscore(df_results[col+'_gold'])).map(make_trinary)
            if normalization=='zscore':
                df_results[col] = pandas.Series(zscore(df_results[col])).map(make_trinary).map(int)
            else:
                df_results[col] = df_results[col].map(make_trinary).map(int)
                
        print(col)
        
    df_results = pandas.DataFrame(df_results)
    df_results.to_csv(folder+'/'+field+'_for_intercoder.csv')
        
    return 
Example #6
def dtw(x, y, dist, l=1, warp=1, z_normalize=False):

    if z_normalize:
        x = zscore(x)
        y = zscore(y)

    series_len = len(x)
    distance_cost = np.full((series_len + 1, series_len + 1), np.inf)
    distance_cost[0, 0] = 0
    ident = int(l * series_len)

    pairs = distance_cost[1:, 1:]
    for i in range(series_len):
        for j in range(max(0, i - ident), min(series_len, i + ident + 1)):
            pairs[i, j] = dist(x[i], y[j])

    pairwise_distances = pairs.copy()
    for i in range(1, series_len + 1):
        for j in range(max(1, i - ident), min(series_len + 1, i + ident + 1)):
            min_list = []
            for k in range(1, warp + 1):
                i_k = max(i - k, 0)
                j_k = max(j - k, 0)
                min_list += [distance_cost[i_k, j], distance_cost[i, j_k], distance_cost[i_k, j_k]]
            distance_cost[i, j] += min(min_list)
    
    path, path_cost = _traceback(distance_cost)
            
    return path_cost, path, distance_cost[1:, 1:], pairwise_distances
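
The cost recursion above can be checked on a toy pair of series; a standalone sketch of the same dynamic-programming step without the band constraint or the `_traceback` helper (not shown in this example), using absolute difference as the local distance:

import numpy as np
from scipy.stats import zscore

x = zscore(np.array([1., 2., 3., 4., 3., 2.]))
y = zscore(np.array([1., 1., 2., 4., 4., 2.]))
n = len(x)

cost = np.full((n + 1, n + 1), np.inf)
cost[0, 0] = 0.0
for i in range(1, n + 1):
    for j in range(1, n + 1):
        d = abs(x[i - 1] - y[j - 1])                 # local distance
        cost[i, j] = d + min(cost[i - 1, j],         # insertion
                             cost[i, j - 1],         # deletion
                             cost[i - 1, j - 1])     # match
print(cost[n, n])                                    # accumulated DTW cost
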
Example #7
def dtw_improved(x, y, dist, warp=1, l=0.3, zscr=False):
    if zscr:
        x = zscore(x)
        y = zscore(y)
    r, c = len(x), len(y)
    lc = int(round(c * l))
    D0 = zeros((r + 1, c + 1))
    D0[0, 1:] = inf
    D0[1:, 0] = inf
    # D0[0, 0] = 0
    D = D0[1:, 1:]  # view
    D[0:, 0:] = inf

    a1, a2 = 0, 0

    for i in range(r + c - 1):
        t1 = threading.Thread(target=count_lines, args=(0, 3, x, y, copy.copy(a1), copy.copy(a2), dist, l, D, D0, warp))
        t2 = threading.Thread(target=count_lines, args=(1, 3, x, y, copy.copy(a1), copy.copy(a2), dist, l, D, D0, warp))
        t3 = threading.Thread(target=count_lines, args=(2, 3, x, y, copy.copy(a1), copy.copy(a2), dist, l, D, D0, warp))

        t1.start()
        t2.start()
        t3.start()

        t1.join()
        t2.join()
        t3.join()

        a1 = min(a1 + 1, r - 1)
        a2 = max(0, a1 - lc) + max((i + 2) - r, 0)




    # for i in range(r):
    #     for j in range(max(i - lc, 0), min(i + lc, c)):
    #         #             if (c >= r - lc and c <= r + lc):
    #         D[i, j] = dist(x[i], y[j])
    # #             else:
    # #                 D1[i, j] = inf
    # print(D0)
    # print("-----")
    # print(D)
    C = D.copy()
    # for i in range(r):
    #     for j in range(max(i - lc, 0), min(i + lc, c)):
    #         min_list = [D0[i, j]]
    #         for k in range(1, warp + 1):
    #             i_k = min(i + k, r - 1)
    #             j_k = min(j + k, c - 1)
    #             min_list += [D0[i_k, j], D0[i, j_k]]
    #         D[i, j] += min(min_list)
    if len(x) == 1:
        path = zeros(len(y)), range(len(y))
    elif len(y) == 1:
        path = range(len(x)), zeros(len(x))
    else:
        path = _traceback(D0)

    return D[-1, -1] / sum(D.shape), C, D, path
Example #8
def zscore_patient_adjmats():
	M_S1 = zscore(np.loadtxt('/home/despoB/kaihwang/Rest/NotBackedUp/ParMatrices/Tha_176_Gordon_333_cortical_corrmat'),None)
	M_S2 = zscore(np.loadtxt('/home/despoB/kaihwang/Rest/NotBackedUp/ParMatrices/Tha_128_Gordon_333_cortical_corrmat'), None)
	M_S3 = zscore(np.loadtxt('/home/despoB/kaihwang/Rest/NotBackedUp/ParMatrices/Tha_168_Gordon_333_cortical_corrmat'), None)
	M_S4 = zscore(np.loadtxt('/home/despoB/kaihwang/Rest/NotBackedUp/ParMatrices/Tha_163_Gordon_333_cortical_corrmat'), None)
	Patient_AdjMats = np.dstack((M_S1, M_S2, M_S3, M_S4))
	return Patient_AdjMats
Example #9
 def tra_linear_regression(self):
     tree_prepared = self.pipeline_processing()
     tree_labeled = self.outcome_processing()
     print(type(tree_prepared))
     model = sm.OLS(tree_labeled, tree_prepared).fit()
     z_model = sm.OLS(zscore(tree_labeled), zscore(tree_prepared)).fit()
     print(model.summary())
     print(z_model.summary())
Example #10
def scipy_z_transfer(df_input, direct):
    df = df_input.copy()
    if direct == 'c':
        for k in range(len(df.columns)):
            df.iloc[:, k] = mt.zscore(df.iloc[:, k], ddof=1)
    else:
        for k in range(len(df.index)):
            df.iloc[k, :] = mt.zscore(df.iloc[k, :], ddof=1)
    return df
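
The column-wise case is equivalent to a one-liner in plain pandas, shown here as a sketch with a random DataFrame (sample-standard-deviation z-scores, matching ddof=1 above):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(6, 3), columns=list('abc'))
df_z = (df - df.mean()) / df.std(ddof=1)   # column-wise z-scores, like direct == 'c' above
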
Example #11
def PCA_3d_plot(fpkmMatrix, samples, standardize=3, log=True, show_text=False, sep='_', legend_loc='best', legend_size=14):
	# standardize: whether to apply a z-score transformation to the log10-transformed FPKM
	pca = PCA(n_components=None)
	## preprocessing of the fpkmMatrix
	if log:
		fpkmMatrix = np.log10(fpkmMatrix + 1.)	
	if standardize == 2: # standardize along rows/genes
		fpkmMatrix = zscore(fpkmMatrix, axis=1)
	elif standardize == 1: # standardize along cols/samples
		fpkmMatrix = zscore(fpkmMatrix, axis=0)
	
	## remove genes with NaNs
	fpkmMatrix = fpkmMatrix[~np.isnan(np.sum(fpkmMatrix, axis=1))]
	## get variance captured
	pca.fit(fpkmMatrix.T)
	variance_explained = pca.explained_variance_ratio_[0:3]
	variance_explained *= 100
	## compute PCA and plot
	pca = PCA(n_components=3)
	pca_transformed = pca.fit_transform(fpkmMatrix.T)
	fig = plt.figure(figsize=(9,9))
	ax = fig.add_subplot(111, projection='3d')
	labels_show = []
	scatter_proxies = []
	groups = {}
	conditions = list(set([s.split(sep)[0] for s in samples]))

	colors = COLORS10
	if len(conditions) > 10:
		colors = COLORS20
	if len(conditions) > 20:
		r = lambda: random.randint(0,255)
		colors = ['#%02X%02X%02X' % (r(),r(),r()) for i in range(len(conditions))]		

	for row, label in zip(pca_transformed, samples):
		label_show = label.split(sep)[0]
		idx = conditions.index(label_show)
		ax.scatter(row[0], row[1], row[2], label='label', color=colors[idx], s=50, marker='o')
		if label_show not in labels_show:
			labels_show.append(label_show)
			scatter1_proxy = Line2D([0],[0], ls="none", c=colors[idx], marker='o')
			scatter_proxies.append(scatter1_proxy)			
		if show_text:	
			ax.text(row[0], row[1]-5, row[2]-5, label.split(sep)[1], \
				ha='center', va='center', rotation=0, color=colors[idx], size='large')

	ax.set_xlabel('PC1 (%.2f'%variance_explained[0] + '%' + ' variance captured)', fontsize=16)
	ax.set_ylabel('PC2 (%.2f'%variance_explained[1] + '%' + ' variance captured)', fontsize=16)
	ax.set_zlabel('PC3 (%.2f'%variance_explained[2] + '%' + ' variance captured)', fontsize=16)
	ax.legend(scatter_proxies, labels_show, numpoints=1, frameon=True,loc='upper left',prop={'size':legend_size})
	fig.tight_layout()
	plt.show()
Example #12
def standardize_pow_mat(stripped_pow_mat, events, sessions, outsample_session=None, outsample_list=None):
    zpow_mat = np.array(stripped_pow_mat)
    outsample_mask = None
    for session in sessions:
        sess_event_mask = (events.session == session)
        if session == outsample_session:
            outsample_mask = (events.list == outsample_list) & sess_event_mask
            insample_mask = ~outsample_mask & sess_event_mask
            zpow_mat[outsample_mask] = zmap(zpow_mat[outsample_mask], zpow_mat[insample_mask], axis=0, ddof=1)
            zpow_mat[insample_mask] = zscore(zpow_mat[insample_mask], axis=0, ddof=1)
        else:
            zpow_mat[sess_event_mask] = zscore(zpow_mat[sess_event_mask], axis=0, ddof=1)
    return zpow_mat, outsample_mask
Example #13
    def cleanClusters(faces, similarityMatrix, labels):
        #first remove outlying clusters
        indices = groupLabels(labels)
        inter_cluster_variances = list()
        toRemove = list()
        for k, v in indices.iteritems():
            print('cluster {0}'.format(k))
            inter_cluster_variances.append(
                sum(sum(np.power(similarityMatrix[:, v][v, :], 2), 1)) /
                (len(v) - 1))

        inter_cluster_zscores = zscore(inter_cluster_variances)
        toRemove_inter_cluster = list()
        for index in range(0, len(inter_cluster_zscores)):
            if inter_cluster_zscores[index] <= (
                    -1) or inter_cluster_zscores[index] >= 1:
                toRemove_inter_cluster.append(index)

        for i in toRemove_inter_cluster:
            toRemove.extend(indices.pop(i, None))

        similarityMatrix = np.delete(similarityMatrix, toRemove, 0)
        similarityMatrix = np.delete(similarityMatrix, toRemove, 1)
        labels = np.delete(labels, toRemove)
        faces = np.delete(np.array(faces), toRemove, 0)
        print(inter_cluster_zscores, toRemove_inter_cluster)

        #them remove the individual images
        silhouetteSamples = zscore(
            silhouette_samples(similarityMatrix, labels, metric='precomputed'))
        print(silhouetteSamples)
        below = (silhouetteSamples <= (-1))
        above = (silhouetteSamples >= 1)
        toRemove = list()
        for index in range(0, len(below)):
            if below[index]:
                toRemove.append(index)

        for index in range(0, len(above)):
            if above[index]:
                toRemove.append(index)

        toRemove.sort()
        print('toRemove', toRemove)

        similarityMatrix = np.delete(similarityMatrix, toRemove, 0)
        similarityMatrix = np.delete(similarityMatrix, toRemove, 1)
        labels = np.delete(labels, toRemove)
        faces = np.delete(np.array(faces), toRemove, 0)
        return (faces, similarityMatrix, labels)
Example #14
def discard_outliers(Hshifts, Cshifts, H_thresh=15.0, C_thresh=10.0):
    """
    Returns a boolean array with True if points are outliers and False 
    otherwise.

    Parameters:
    -----------
        points : An numobservations by numdimensions array of observations
        thresh : The modified z-score to use as a threshold. Observations with
            a modified z-score (based on the median absolute deviation) greater
            than this value will be classified as outliers.

    Returns:
    --------
        mask : A numobservations-length boolean array.

    References:
    ----------
        Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
        Handle Outliers", The ASQC Basic References in Quality Control:
        Statistical Techniques, Edward F. Mykytka, Ph.D., Editor. 
    """
    
    
    Hzscores = zscore(Hshifts)
    Hp_values = norm.sf(abs(Hzscores))*2
    adj_Hp_values = p_adjust_bh(Hp_values)
    final_discard_H = adj_Hp_values < 10e-100
    
    Czscores = zscore(Cshifts)
    Cp_values = norm.sf(abs(Czscores))*2
    adj_Cp_values = p_adjust_bh(Cp_values)
    final_discard_C = adj_Cp_values < 10e-10
    
    if len(Cshifts.shape) == 1:
        Cshifts = Cshifts[:,None]
    if len(Hshifts.shape) == 1:
        Hshifts = Hshifts[:,None]
    
    outliers = final_discard_C | final_discard_H    # boolean array, True if this value either in Cshifts or Hshifts is an outlier
    i=0
    new_Cshifts_list, new_Hshifts_list = [], []
    for C, H in zip(Cshifts, Hshifts):
        if outliers[i] == False:
            new_Cshifts_list.append(C[0])
            new_Hshifts_list.append(H[0])
        i += 1

    return np.array(new_Hshifts_list), np.array(new_Cshifts_list)
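
The reference cited above describes the modified z-score, which this body does not actually compute; for completeness, a sketch of that statistic (0.6745 is the 0.75 quantile of the standard normal and 3.5 is the commonly cited cutoff; the sample values are made up):

import numpy as np

def modified_zscore(points):
    # M_i = 0.6745 * (x_i - median) / MAD; flag |M_i| > 3.5 as outliers
    med = np.median(points)
    mad = np.median(np.abs(points - med))
    return 0.6745 * (points - med) / mad

shifts = np.array([7.1, 7.2, 7.3, 7.2, 12.0])
print(np.abs(modified_zscore(shifts)) > 3.5)   # only the 12.0 entry is flagged
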
Example #15
def embed_hierarchy(x, yy):
    # mean feature by level of abstract
    [x1,y1] = group_mean_x(x, yy['1'])
    [x2,y2] = group_mean_x(x, yy['2'])
    [x3,y3] = group_mean_x(x, yy['3'])
    # x4 = x.todense()
    # y4 = np.array(yy['1'])

    # concatenate x,y for embedding, h is the level (1,2,or 3)
    xh = np.concatenate((x1,x2,x3))
    yh = np.concatenate((y1,y2,y3))
    h  = np.concatenate((np.ones(len(y1))*1, np.ones(len(y2))*2, np.ones(len(y3))*3))

    # low-D embedding
    print('start embedding')
    xh_ld = embed(xh)

    # zscore for plotting
    xh_ld_z = zscore(xh_ld, axis=0)

    # coloring scheme
    mycolor = gen_distinct_color(len(y1))
    y_to_c = dict(zip([y[0] for y in y1], mycolor))   # y to color dictionary, based on top level
    ch = [y_to_c[i] for i in [y[0] for y in yh]]  # get color for every data point

    embed_results = {'xh_ld':xh_ld, 'xh_ld_z':xh_ld_z, 'xh':xh, 'yh':yh, 'h':h, 'ch':ch,'y1':y1}

    return embed_results
Example #16
def LDAclassification(train_set,test_set,train_ann,test_ann):

    global path, train_perc

    #Create z-scored data
    normalized_train = zscore(train_set)

    classifier = lda.LDA('lsqr')

    #train the LDA classifier on the z-scored data
    classifier.fit(normalized_train, train_ann)

    #store the trained classifier
    if train_perc<1.0:
    	pickle.dump( classifier, open(path+"LDA_classifier.p", "wb+" ) )
        
    results = classifier.predict(zscore(test_set))
 
    #res2 = classifier.predict()


    cm = confusion_matrix(test_ann, results)

    print 'CONFUSION MATRIX = {}'.format(cm)
    return metrics(cm)
Example #17
    def perform_regression(self, regressor_list):
        regressor_list = zscore(regressor_list, axis=1)
        regression_results = RegressionModel.load(regressor_list, "linear").fit(self.data)
        b = regression_results.select("betas").pack()
        rsq = regression_results.select("stats").pack()

        return regression_results, b, rsq
Example #18
def data_zcore_norm(data):
    '''z-score normalise the ['High', 'Open', 'Low', 'Close', 'Volume'] columns of the stock data
    '''
    data_copy = pd.DataFrame.copy(data, deep=True)
    for col in ['High', 'Open', 'Low', 'Close', 'Volume']:
        data_copy[col] = zscore(data[col])
    return data_copy
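
A minimal usage sketch, assuming the data_zcore_norm above and its imports (pandas as pd, scipy's zscore) are in scope; the values are random placeholders:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(20, 5),
                  columns=['High', 'Open', 'Low', 'Close', 'Volume'])
df_norm = data_zcore_norm(df)
print(df_norm.mean().round(6))   # each column is now centred near 0
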
Example #19
def forward_model(folder_out, folder_audio, model, feature_extractor):
    if not os.path.exists(os.path.join(folder_out, "clusters")):
        os.makedirs(os.path.join(folder_out, "clusters"))

    LOGGER.info("Saving results")
    for root, _, files in os.walk(folder_audio):
        LOGGER.info("Saving in: " + folder_out)
        for file in files:
            path_to_file = os.path.join(root, file)
            try:
                features_file = feature_extractor.get_feature_from_file(path_to_file)
            except Exception as exception:
                LOGGER.warning("There is a problem with: " + path_to_file)
                LOGGER.warning(exception)
                continue

            # normalize the feature
            features = mstats.zscore(features_file, axis=1, ddof=1)
            features = np.transpose(features)
            clusters = model.predic_clusters(features)

            # save results
            file_out, _ = os.path.splitext(file)
            path_out_forwarded = os.path.join(folder_out, "clusters/" + root.replace(folder_audio, ''))
            if not os.path.exists(path_out_forwarded):
                os.makedirs(path_out_forwarded)

            path_out_forwarded = os.path.join(path_out_forwarded, file_out + ".txt")
            np.savetxt(path_out_forwarded, clusters, delimiter=" ", fmt='%i',)
Example #20
def resample(data,
             source_to_target_ratio,
             ZSCORE,
             resample_method='sinc_best',
             N_channels_max=128):

    ######################
    # If downsampling by an integer, just anti-alias and subsample??
    ######################

    # 128 is the max for the underlying library
    N_channels_max = min(N_channels_max, 128)
    N_channels = data.shape[1]
    data_mat = None

    for i0 in np.arange(0, N_channels, N_channels_max):
        iF = np.min((i0 + N_channels_max, N_channels))
        resampler = samplerate.Resampler(resample_method, channels=iF - i0)
        data_chunk = resampler.process(data[:, i0:iF],
                                       1 / source_to_target_ratio,
                                       end_of_input=True)
        data_mat = (data_chunk if data_mat is None else np.concatenate(
            (data_mat, data_chunk), axis=1))
    if ZSCORE:
        data_mat = zscore(data_mat)

    return data_mat
Example #21
def test_grad_fourier(x, in_channels, filter_sz, n_filters, t, X):
        sz2 = filter_sz**2
        x_in = copy.deepcopy(x)
        x_shape = x.shape
        x = np.float32(x.reshape((in_channels*(filter_sz**2), n_filters)))
        x = zscore(x,axis=0)
        x = x.reshape(x_shape)

        t_start = time.time()

        ################ fourier
        grad_f = np.zeros((in_channels, sz2, sz2, n_filters))
        Xx_sum = np.zeros(sz2)
        l = 0
        for channel in range(in_channels):
                for filter in range(n_filters):
                        x = x_in.reshape((in_channels, sz2, n_filters))[channel][:,filter]
                        Xx = np.dot(X,x)
                        Xx_sum += Xx
                        l += np.abs(Xx)
                        sign_mat = np.ones_like(Xx) - 2*(Xx < 0)
                        grad_f[channel][:,:,filter] = X * sign_mat[:,np.newaxis]
        sign_mat2 = np.ones(sz2) - 2*(t > l)
        grad_f = (grad_f*sign_mat2[np.newaxis][:,:,np.newaxis,np.newaxis]).sum(1).ravel()
        fourier_loss = np.sum(np.abs(t - l))

        #########

        grad = grad_f
        loss = fourier_loss

        #print loss, fourier_loss, np.max(x_in)
        return np.double(loss), np.double(grad)
Example #22
def zscore_signals(signalArray):

    signalArray_zscore = np.zeros_like(signalArray)

    signalArray_zscore = mstats.zscore(signalArray)

    return signalArray_zscore
Example #23
    def _preprocess_data(self):
        """ process the raw data according to epoch info

        This is done in rank 0 which has the raw_data read in
        Average the activity within epochs and z-scoring within subject.
        Write the results to self.processed_data,
        which is a 4D array of averaged epoch by epoch processed data
        Also write the labels to self.label as a 1D numpy array
        """
        logger.info("mask size: %d" % np.sum(self.mask))
        num_epochs = len(self.epoch_info)
        (d1, d2, d3, _) = self.raw_data[0].shape
        self.processed_data_ = np.empty([d1, d2, d3, num_epochs])
        self.labels_ = np.empty(num_epochs)
        subject_count = [0]  # counting the epochs per subject for z-scoring
        cur_sid = -1
        # averaging
        for idx, epoch in enumerate(self.epoch_info):
            self.labels_[idx] = epoch[0]
            if cur_sid != epoch[1]:
                subject_count.append(0)
                cur_sid = epoch[1]
            subject_count[-1] += 1
            self.processed_data_[:, :, :, idx] = np.mean(self.raw_data[cur_sid][:, :, :, epoch[2] : epoch[3]], axis=3)
        # z-scoring
        cur_epoch = 0
        for i in subject_count:
            if i > 1:
                self.processed_data_[:, :, :, cur_epoch : cur_epoch + i] = zscore(
                    self.processed_data_[:, :, :, cur_epoch : cur_epoch + i], axis=3, ddof=0
                )
            cur_epoch += i
        # if zscore fails (standard deviation is zero),
        # set all values to be zero
        self.processed_data_ = np.nan_to_num(self.processed_data_)
Example #24
    def remove_outliers(self,
                        points_sr,
                        thresh=2.5,
                        window_length=5,
                        polyorder=3,
                        tz='Asia/Shanghai'):
        """
        Description: remove outliers by savgol_filter

        Parameters: points_sr: pandas.Series
                    thresh: float
                    window_length: int, odd number
                    polyorder: int
                    tz: str

        Returns: pandas.Series
        """
        points = points_sr.values
        points_filtered = savgol_filter(points,
                                        window_length,
                                        polyorder,
                                        mode='nearest')
        points_zscored = zscore(points - points_filtered)
        for i, score in enumerate(points_zscored):
            if abs(score) > thresh:
                points_sr[i] = np.nan

        return points_sr
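
The same filter-then-z-score idea works outside the class; a standalone sketch with an injected spike (the threshold, window length and polynomial order mirror the defaults above):

import numpy as np
import pandas as pd
from scipy.signal import savgol_filter
from scipy.stats import zscore

points_sr = pd.Series(np.sin(np.linspace(0, 6, 50)))
points_sr.iloc[20] = 10.0                                    # inject an outlier
resid = points_sr.values - savgol_filter(points_sr.values, 5, 3, mode='nearest')
points_sr[np.abs(zscore(resid)) > 2.5] = np.nan              # points far from the smoothed curve become NaN
print(points_sr.isna().sum())
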
Example #25
    def tra_linear_regression(self):
        tree_prepared = self.convert_dataframe_to_ndarray()
        tree_labeled = self.outcome()
        #tree_prepared=sm.add_constant(tree_prepared)
        model = sm.OLS(tree_labeled, tree_prepared).fit()
        z_model = sm.OLS(zscore(tree_labeled), zscore(tree_prepared)).fit()

        print(model.summary())

        with open('testt_PL.txt', 'wt') as f:
            print(z_model.summary(), file=f)
        tree_predict = model.predict(tree_prepared)
        z_tree_predict = z_model.predict(zscore(tree_prepared))
        res = tree_labeled - tree_predict
        #return zscore(tree_labeled),z_tree_predict
        return tree_labeled, tree_predict, res
Example #26
def check_cluster(cluster):
    n = len(cluster)
    if n < 2:
        return True, []

    # Run k_means on two centers
    children, labels, _ = k_means(cluster, 2)

    # Let v = c1 - c2 be a d-dimensional vector that connects the two centers. This is the direction that k-means
    # believes to be important for clustering.
    v = children[1]-children[0]

    # Then project X onto v: x'_i = <x_i, v> / ||v||^2. X' is a 1-dimensional
    # representation of the data projected onto v.
    x_prime = [np.dot(point, v) for point in cluster]

    # Transform X' so that it has mean 0 and variance 1.
    x_prime = zscore(x_prime)

    # Let z_i = F(x'_(i)). If A2*(Z) is in the range of non-critical values at confidence level alpha, then accept H0,
    # keep the original center, and discard {c1, c2}. Otherwise, reject H0 and keep {c1, c2} in place of the original
    # center.
    a2, critical, sig = anderson(x_prime)
    a2 *= (1+4.0/n-25.0/(n**2))

    return a2 < critical[0], children
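
The projection test the comments describe can be exercised directly; a standalone sketch on obviously bimodal data, with scikit-learn's KMeans standing in for the k_means helper used above:

import numpy as np
from scipy.stats import anderson, zscore
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
cluster = np.vstack([rng.randn(100, 2), rng.randn(100, 2) + 5.0])   # two well-separated groups

centers = KMeans(n_clusters=2, n_init=10, random_state=0).fit(cluster).cluster_centers_
v = centers[1] - centers[0]                 # direction k-means considers important
x_prime = zscore(cluster @ v)               # 1-D projection, standardized
res = anderson(x_prime)
# a statistic far above the critical values rejects normality, i.e. keep the split
print(res.statistic, res.critical_values)
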
Example #27
def apply_stcs(method='dSPM', event='LLst'):
    
    '''
       Normalize the individual STCs and average them across subjects.
        
       Parameters
       ----------
       method: string
          'dSPM' or 'MNE'.
       event: string
          the event name in the experimental conditions.
    '''
    import glob
    from scipy.signal import detrend
    from scipy.stats.mstats import zscore
    fn_list = glob.glob(subjects_dir+'/fsaverage/%s_ROIs/*/*,evtW_%s_bc-lh.stc' % (method, event))
    stcs = []
    for fname in fn_list:
        stc = mne.read_source_estimate(fname)
        #stc = stc.crop(tmin, tmax)
        cal_data = stc.data
        dt_data = detrend(cal_data, axis=-1)
        zc_data = zscore(dt_data, axis=-1)
        stc.data.setfield(zc_data, np.float32)
        stcs.append(stc)
    stcs = np.array(stcs)
    stc_avg = np.sum(stcs, axis=0)/stcs.shape[0]
    fn_avg = subjects_dir+'/fsaverage/%s_ROIs/%s' %(method,event)
    stc_avg.save(fn_avg, ftype='stc')
Example #28
def get_similarity_timeserie(path, name, condition, time, **kwargs):
    
    TR = 1.
    for arg in kwargs:
        if arg == 'TR':
            TR = np.float(kwargs[arg])
            
    file_list = os.listdir(path)
    
    file_list = [f for f in file_list if f.find(name) != -1 
                                        and f.find('_'+condition) != -1 
                                        and f.find(time) != -1 
                                        ]

    total_data = []
    for f in file_list:
        
        print(os.path.join(path, f))
        
        data = np.loadtxt(os.path.join(path, f), delimiter=',')
        data = np.sqrt(data.T)
        
        data_z = zscore(data, axis=1)
        
        total_data.append(data_z)
    
    ts = TimeSeries(np.vstack(total_data), sampling_interval=TR)
    
    return ts
Example #29
def filter_by_pvalue_strand_lag(ratios, pcutoff, pvalues, output, no_correction, name, singlestrand):
    """Filter DPs by strang lag and pvalue"""

    if not singlestrand:
        zscore_ratios = zscore(ratios)
        ratios_pass = np.where(np.bitwise_and(zscore_ratios > -2, zscore_ratios < 2) == True, True, False)
    if not no_correction:
        pv_pass = [True] * len(pvalues)
        pvalues = map(lambda x: 10**-x, pvalues)
        
        _output_BED(name + '-uncor', output, pvalues, pv_pass)
        _output_narrowPeak(name + '-uncor', output, pvalues, pv_pass)
        
        pv_pass, pvalues = multiple_test_correction(pvalues, alpha=pcutoff)
    else:
        pv_pass = np.where(np.asarray(pvalues) >= -log10(pcutoff), True, False)
    
    if not singlestrand:
        filter_pass = np.bitwise_and(ratios_pass, pv_pass)
        assert len(pv_pass) == len(ratios_pass)
    else:
        filter_pass = pv_pass
    
    assert len(output) == len(pvalues)
    assert len(filter_pass) == len(pvalues)
    
    return output, pvalues, filter_pass
Example #30
 def test_zscore(self):
     # This is not in R, so tested by using:
     #     (testcase[i]-mean(testcase,axis=0)) / sqrt(var(testcase)*3/4)
     y = mstats.zscore(self.testcase)
     desired = ma.fix_invalid([-1.3416407864999, -0.44721359549996,
                               0.44721359549996, 1.3416407864999, np.nan])
     assert_almost_equal(desired, y, decimal=12)
Example #31
def _normalize_for_correlation(data, axis):
    """normalize the data before computing correlation

    The data will be z-scored and divided by sqrt(n)
    along the assigned axis

    Parameters
    ----------
    data: 2D array

    axis: int
        specify which dimension of the data should be normalized

    Returns
    -------
    data: 2D array
        the normalized data
    """
    shape = data.shape
    data = zscore(data, axis=axis, ddof=0)
    # if zscore fails (standard deviation is zero),
    # set all values to be zero
    data = np.nan_to_num(data)
    data = data / math.sqrt(shape[axis])
    return data
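
The division by sqrt(n) is what turns a later matrix product into Pearson correlations; a quick numerical check of that property (numpy/scipy only, the shapes are arbitrary):

import math
import numpy as np
from scipy.stats import zscore

rng = np.random.RandomState(0)
data = rng.randn(30, 5)                                   # e.g. 30 TRs x 5 voxels
norm = np.nan_to_num(zscore(data, axis=0, ddof=0)) / math.sqrt(data.shape[0])
print(np.allclose(norm.T @ norm, np.corrcoef(data, rowvar=False)))   # True
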
Example #32
def _normalize_for_correlation(data, axis, return_nans=False):
    """normalize the data before computing correlation

    The data will be z-scored and divided by sqrt(n)
    along the assigned axis

    Parameters
    ----------
    data: 2D array

    axis: int
        specify which dimension of the data should be normalized

    return_nans: bool, default:False
        If False, return zeros for NaNs; if True, return NaNs

    Returns
    -------
    data: 2D array
        the normalized data
    """
    shape = data.shape
    data = zscore(data, axis=axis, ddof=0)
    # if zscore fails (standard deviation is zero),
    # optionally set all values to be zero
    if not return_nans:
        data = np.nan_to_num(data)
    data = data / math.sqrt(shape[axis])
    return data
Example #33
    def __init__(self, data, bgdata, crop=None, fill=False, normalize=True,
                 down_sampling=0):
        super(DataPrep, self).__init__()

        self.gap_position = None
        self.gap_length = None
        self._data = data
        self._bgdata = bgdata

        dmin = data.position.min()
        self.position = data.position.copy() - dmin
        self.counts = data.counts.copy()

        if crop is not None:
            if len(crop) != 2:
                raise ValueError(("Cropping parameter requires a sequence "
                                  "of lenght 2"))
            self._crop(*crop)

        if normalize:
            self.counts = zscore(self.counts)

        if fill:
            self._fill_missing()

        if down_sampling != 0:
            self._down_sample(down_sampling)

        if self.gap_length is None:
            self._gap_lenght(self.position)
Example #34
def zscore_mag_div_matrix(mat):
    for col in xrange(mat.shape[1]):
        if col % 50000 == 0:
            print ' {}% done'.format(np.round(float(col)/float(mat.shape[1]),2)* 100)
        mat[:,col] = zscore(mat[:,col]) 
        mat[:,col] = mat[:,col] / np.sqrt( sum( [ x**2 for x in mat[:,col] ] ) )
    return mat
Example #35
def regress_subject(f, gradients, n_jobs=1):
    """
        This function estimates coefficients of a linear model that fits gradients to individual volumes.
        It will fit all gradients to each volume.

        Inputs:
        f: volume time series (V x T)
        gradients: matrix containing gradients in columns (V x Ng)

        Outputs:
        Ng x T matrix with linear regression coefficients
        """

    from sklearn.linear_model import LinearRegression
    from scipy.stats.mstats import zscore

    # Load the data
    d = nib.load(f).get_data()

    # Z-score
    d_z = zscore(d, axis=0)

    # Regress
    m = LinearRegression(fit_intercept=True, n_jobs=n_jobs)
    m.fit(d_z.T, gradients)

    return m.coef_.T
Example #36
def export_scatterplots(data, field, tool_order, folder, standardization=False):
    from scipy.stats.mstats import zscore
    import seaborn as sns
    import matplotlib.pyplot as plt

    data = data[[field+"_"+t for t in tool_order]+[field+"_gold",field+"_top3"]].dropna()
    cols = data.columns

    if standardization:
        colnames = [name for name in data.columns if field+'_' in name and not '_err' in name]
        if standardization=="min-max":
            data_std = ((data[colnames] - data[colnames].min()) / (data[colnames].max() - data[colnames].min()))*2-1
        elif standardization == "z-score":
            data_std = data.copy()
            for col in colnames:
                data_std[col] = zscore(data[col])
        else:
            data_std = data
        if 'ID' in data.columns: # for normal data
            data_std['ID'] = data['ID']
        else: # for aggregated data, create placeholder IDs
            data_std['ID'] = [i for i in range(len(data_std))]
        data_std['%s_recessie' %field] = data['%s_recessie' %field]
        print(data_std.describe().transpose())
        data_std.to_csv(folder+'/'+field+'_' + standardization + '_for_scatterplots.csv')

    else:
        print(data.describe().transpose())
        data.to_csv(folder+'/'+field+'_' + 'unstandardized' + '_for_scatterplots.csv')
	
    

    return
Example #37
def _findTriggerEnd(reference_signal, window=101, prominence=1, zscoring=True):
    """Uses z-scoring of rolling standard deviation to
    find the end trigger from the camera to synch audio and video.

    :param reference_signal: reference signal in audio data
    :type reference_signal: numpy.ndarray
    :param window: window size, defaults to 101
    :type window: int, optional
    :param prominence: prominence for peak finding, defaults to 1
    :type prominence: int, optional
    :param zscoring: enables z-scoring of data before peak finding, defaults to True
    :type zscoring: bool, optional
    :return: peak location and std signal
    :rtype: tuple(int, numpy.ndarray)
    """
    std = _rolling_std_numba(reference_signal, window)

    if zscoring:
        std = zscore(std)

    else:
        std = (std - std.min()) / (std.max() - std.min())

    peaks = find_peaks(std, prominence=prominence)[0]

    if len(peaks):
        return peaks[0], std

    else:
        return False
Example #38
def test_F(F):
    sparsity = 0
    sse = 0
    batch = 9001
    for batch in range(9001, 9001+3):  # 9011):
        batch_data = np.load('/storage/batch128_img138_full/data_batch_' + str(batch))['data'].reshape((3,138,138,128)).transpose((3,1,2,0))[:,66:66+7,66:66+7]
        for step in range(128):
            patch = batch_data[step].ravel()
            patch = zscore(patch)
            Ft = pinv(F)
            sse += np.sum((patch - np.dot(Ft, np.dot(F,patch)))**2)
            sparsity += np.sum(np.abs(np.dot(F,patch)))
    l, g = test_grad_transpose(F.T, 3, 7, n_out)
    lf, gf = test_grad_fourier_l1(F.T, 3, 7, n_out, X, X2)
    lc, gc = test_grad_channel_corr(F.T, 3, 7, n_out)
    ls, gs = test_grad_second_order(F.T, 3, 7, n_out, c_mat_input)
    loss = sse + lambds*sparsity + lambdt*l + lambdf*lf + lambdc*lc

    backprop_corr = pearsonr(1-pdist(F.T,'correlation'), backprop_rdm)[0]

    print 'recon:',sse, 'sparsity:',lambdf*sparsity, 'transpose:',lambdt*l, 'fourier:',lambdf*lf, 'loss:',loss, 'transpose: ',np.mean(np.abs(1-pdist(F.T,'correlation'))),  img_t
    print 'channel corr:',lambdc*lc, 'second order:',lambds*ls, 'backprop corr:',backprop_corr
    sparsities.append(sparsity)
    sses.append(sse)
    transposes.append(l)
    fouriers.append(lf)
    losses.append(loss)
    channel_corrs.append(lc)
    second_orders.append(ls)
    backprop_corrs.append(backprop_corr)
Example #39
def _normalize_for_correlation(data, axis):
    """normalize the data before computing correlation

    The data will be z-scored and divided by sqrt(n)
    along the assigned axis

    Parameters
    ----------
    data: 2D array

    axis: int
        specify which dimension of the data should be normalized

    Returns
    -------
    data: 2D array
        the normalized data
    """
    shape = data.shape
    data = zscore(data, axis=axis, ddof=0)
    # if zscore fails (standard deviation is zero),
    # set all values to be zero
    data = np.nan_to_num(data)
    data = data / math.sqrt(shape[axis])
    return data
Example #40
def get_similarity_timeserie(path, name, condition, time, **kwargs):
    
    TR = 1.
    for arg in kwargs:
        if arg == 'TR':
            TR = np.float(kwargs[arg])
            
    file_list = os.listdir(path)
    
    file_list = [f for f in file_list if f.find(name) != -1 
                                        and f.find('_'+condition) != -1 
                                        and f.find(time) != -1 
                                        ]

    total_data = []
    for f in file_list:
        
        print os.path.join(path, f)
        
        data = np.loadtxt(os.path.join(path, f), delimiter=',')
        data = np.sqrt(data.T)
        
        data_z = zscore(data, axis=1)
        
        total_data.append(data_z)
    
    ts = TimeSeries(np.vstack(total_data), sampling_interval=TR)
    
    return ts
Example #41
def filter_by_pvalue_strand_lag(ratios, pcutoff, pvalues, output, no_correction, name, singlestrand):
    """Filter DPs by strang lag and pvalue"""
    if not singlestrand:
        zscore_ratios = zscore(ratios)
        ratios_pass = np.where(np.bitwise_and(zscore_ratios > -2, zscore_ratios < 2) == True, True, False)
    if not no_correction:
        pv_pass = [True] * len(pvalues)
        pvalues = map(lambda x: 10**-x, pvalues)
        
        _output_BED(name + '-uncor', output, pvalues, pv_pass)
        _output_narrowPeak(name + '-uncor', output, pvalues, pv_pass)
        
        pv_pass, pvalues = multiple_test_correction(pvalues, alpha=pcutoff)
    else:
        pv_pass = np.where(np.asarray(pvalues) >= -log10(pcutoff), True, False)
    
    if not singlestrand:
        filter_pass = np.bitwise_and(ratios_pass, pv_pass)
        assert len(pv_pass) == len(ratios_pass)
    else:
        filter_pass = pv_pass
    
    assert len(output) == len(pvalues)
    assert len(filter_pass) == len(pvalues)
    
    return output, pvalues, filter_pass
Example #42
 def test_zscore(self):
     # This is not in R, so tested by using:
     #     (testcase[i]-mean(testcase,axis=0)) / sqrt(var(testcase)*3/4)
     y = mstats.zscore(self.testcase)
     desired = ma.fix_invalid([-1.3416407864999, -0.44721359549996,
                               0.44721359549996, 1.3416407864999, np.nan])
     assert_almost_equal(desired, y, decimal=12)
Example #43
def frequency_harmonies(x, settings):
    """
    This was used by Michael Hills for the seizure detection competition in 2014 in Kaggle.
    See https://github.com/MichaelHills/seizure-detection/raw/master/seizure-detection.pdf

    :param x: the input signal. Its size is (number of channels, samples).
    :param settings: a dictionary including the "freq_harmonies_max_freq".
    :return:
    """

    time = [0, 0, 0]
    t = timer()
    m_x = x - np.mean(x, axis=1, keepdims=True)
    x_mgn = np.log10(
        np.absolute(
            np.fft.rfft(m_x, axis=1)[:,
                                     1:settings["freq_harmonies_max_freq"]]))
    time[0] = timer() - t
    x_zscored = mstats.zscore(x_mgn, axis=1)
    channels_correlations = fch.calc_corr(x_zscored)
    eigs = fch.calc_eigens(channels_correlations)
    time[1] = timer() - t
    time[2] = time[1]

    channels_corrs_eig_values = eigs["lambda"]
    channels_corrs_eigs_vectors = eigs["vectors"]
    results = fch.fill_results(
        ["frequency_harmonies", "lambdas", "eigen_vectors"],
        [x_mgn, channels_corrs_eig_values, channels_corrs_eigs_vectors],
        "frequency_harmonise", time, settings["is_normalised"])
    return results
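
The fch helpers are not shown in this example; a standalone sketch of the same pipeline (de-mean, log FFT magnitude, z-score, channel correlations, eigenvalues) using only numpy and scipy, with an illustrative max frequency of 48:

import numpy as np
from scipy.stats import mstats

x = np.random.randn(16, 400)                               # channels x samples
m_x = x - np.mean(x, axis=1, keepdims=True)
x_mgn = np.log10(np.absolute(np.fft.rfft(m_x, axis=1)[:, 1:48]))
x_zscored = mstats.zscore(x_mgn, axis=1)
channels_correlations = np.corrcoef(x_zscored)             # channel x channel correlation matrix
eig_values = np.sort(np.linalg.eigvalsh(channels_correlations))
print(eig_values[-3:])                                     # the largest eigenvalues
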
Example #44
    def __distance(self):
        """
		Compute the average euclidean distance from members of the cluster
		taking the weighted label into account
		"""
        self._data_distance = []
        for x in range(0, len(self._data)):
            cluster = self._pred_cluster[x]
            n = len(self._cluster_member[cluster])
            population = self._cluster_member[cluster]

            if n > 10:
                population = random.sample(self._cluster_member[cluster], 10)
                n = 10

            dist = 0

            for y in self._cluster_member[cluster]:
                vx = np.array(self._data[x] +
                              [self._labels[x] * self._label_weight])
                vy = np.array(self._data[y] +
                              [self._labels[y] * self._label_weight])
                d = euclidean(vx, vy)
                dist += d * d
            self._data_distance.append(dist / n)

        self._data_distance = zscore(self._data_distance)
Example #45
def word_party_correlations(folder='model'):
    stopwords = codecs.open("stopwords.txt", "r", "utf-8").readlines()[5:]
    stops = map(lambda x: x.lower().strip(), stopwords)

    # using now stopwords and filtering out digits
    bow = TfidfVectorizer(min_df=2)
    datafn = folder + '/textdata/rawtext.pickle'
    data = cPickle.load(open(datafn))
    bow = bow.fit(chain.from_iterable(data.values()))

    # create numerical labels
    Y = hstack(
        map((lambda x: ones(len(data[data.keys()[x]])) * x), range(len(data))))

    # create data matrix
    for key in data.keys():
        data[key] = bow.transform(data[key])

    X = vstack(data.values())

    # map sentiment vector to bow space
    words = load_sentiment()
    sentiment_vec = zeros(X.shape[1])
    for key in words.keys():
        if bow.vocabulary_.has_key(key):
            sentiment_vec[bow.vocabulary_[key]] = words[key]

    # do sentiment analysis
    sentiments = X.dot(sentiment_vec)

    # compute label-BoW-tfidf-feature correlation
    lb = LabelBinarizer()
    partylabels = zscore(lb.fit_transform(Y), axis=0)
    # sentiment  vs party correlation
    sentVsParty = corrcoef(partylabels.T, sentiments)[-1, :-1]
    fn = folder + '/sentiment_vs_party.json'

    for key in range(len(data.keys())):
        print "Sentiment vs Party %s: %0.2f" % (data.keys()[key],
                                                sentVsParty[key])

    json.dump(dict(zip(data.keys(), sentVsParty)), open(fn, 'wb'))

    wordidx2word = dict(zip(bow.vocabulary_.values(), bow.vocabulary_.keys()))
    allcors = dict(zip(data.keys(), [[]] * len(data.keys())))
    # this is extremely cumbersome and slow, ...
    # but computing the correlations naively on the matrices
    # requires densifying the matrix X, which is memory intense
    for partyidx in range(len(data.keys())):
        cors_words = []
        print 'Computing correlations for %s' % data.keys()[partyidx]
        for wordidx in range(X.shape[-1]):
            cors = corrcoef(X[:, wordidx].todense().flatten(),
                            partylabels[:, partyidx])[1, 0]
            if abs(cors) > .01:
                cors_words.append((wordidx2word[wordidx], cors))
        allcors[data.keys()[partyidx]] = dict(cors_words)
    fn = folder + '/words_correlations.json'
    json.dump(dict(allcors), open(fn, 'wb'))
Example #46
def word_party_correlations(folder='model'):
    stopwords = codecs.open("stopwords.txt", "r", "utf-8").readlines()[5:]
    stops = map(lambda x:x.lower().strip(),stopwords)

    # using now stopwords and filtering out digits
    bow = TfidfVectorizer(min_df=2)
    datafn = folder+'/textdata/rawtext.pickle'
    data = cPickle.load(open(datafn))
    bow = bow.fit(chain.from_iterable(data.values()))

    # create numerical labels
    Y = hstack(map((lambda x: ones(len(data[data.keys()[x]]))*x),range(len(data))))
    
    # create data matrix
    for key in data.keys():
        data[key] = bow.transform(data[key])
    
    X = vstack(data.values())
    
    # map sentiment vector to bow space
    words = load_sentiment()
    sentiment_vec = zeros(X.shape[1])
    for key in words.keys():
        if bow.vocabulary_.has_key(key):
            sentiment_vec[bow.vocabulary_[key]] = words[key]
 
    # do sentiment analysis
    sentiments = X.dot(sentiment_vec)    

    # compute label-BoW-tfidf-feature correlation
    lb = LabelBinarizer()
    partylabels = zscore(lb.fit_transform(Y),axis=0)
    # sentiment  vs party correlation
    sentVsParty = corrcoef(partylabels.T,sentiments)[-1,:-1]
    fn = folder+'/sentiment_vs_party.json'
    
    for key in range(len(data.keys())):
        print "Sentiment vs Party %s: %0.2f"%(data.keys()[key],sentVsParty[key])
    
    json.dump(dict(zip(data.keys(),sentVsParty)),open(fn,'wb'))
 
    wordidx2word = dict(zip(bow.vocabulary_.values(),bow.vocabulary_.keys()))
    allcors = dict(zip(data.keys(),[[]]*len(data.keys())))
    # this is extremely cumbersome and slow, ...
    # but computing the correlations naively on the matrices
    # requires densifying the matrix X, which is memory intense
    for partyidx in range(len(data.keys())):
        cors_words = []
        print 'Computing correlations for %s'%data.keys()[partyidx]
        for wordidx in range(X.shape[-1]):
            cors = corrcoef(X[:,wordidx].todense().flatten(),partylabels[:,partyidx])[1,0]
            if abs(cors)>.01:
                cors_words.append((wordidx2word[wordidx],cors))
        allcors[data.keys()[partyidx]] = dict(cors_words)   
    fn = folder+'/words_correlations.json' 
    json.dump(dict(allcors),open(fn,'wb'))
Example #47
def zscorenormalize(values):
    return zscore(filter_one_d_array(values,3))
    

# # tests and chill    
# arr = np.array([0, 2, 3, 3, 4, 6, 10, 15, 97])
# arr2 = [2, 80, 6, 3] 
# print (median_filter(arr, 5))
# print (filter_one_d_array(arr,5))
# print (zscorenormalize(arr))
Example #48
def create_epoch():
    row = 12
    col = 5
    mat = prng.rand(row, col).astype(np.float32)
    mat = zscore(mat, axis=0, ddof=0)
    # if zscore fails (standard deviation is zero),
    # set all values to be zero
    mat = np.nan_to_num(mat)
    mat = mat / math.sqrt(mat.shape[0])
    return mat
Example #49
def kpca_cluster(data,nclusters=100,ncomponents=40,topwhat=10,zscored=False):
    '''

    Computes clustering of bag-of-words vectors of articles

    INPUT
    data        iterable of raw article texts
    nclusters   number of clusters

    '''
    from sklearn.cluster import KMeans
    # filtering out some noise words
    stops = map(lambda x:x.lower().strip(),open('stopwords.txt').readlines()[6:])

    # vectorize non-stopwords 
    bow = TfidfVectorizer(min_df=2,stop_words=stops)
    X = bow.fit_transform(data)

    # creating bow-index-to-word map
    idx2word = dict(zip(bow.vocabulary_.values(),bow.vocabulary_.keys()))

    # using now stopwords and filtering out digits
    print 'Computing pairwise distances' 
    K = pairwise_distances(X,metric='l2',n_jobs=1)
    perc = 50.0
    width = percentile(K.flatten(),perc)

    # KPCA transform bow vectors
    Xc = KernelPCA(n_components=ncomponents,kernel='rbf',gamma=width).fit_transform(X)
    
    if zscored:
        Xc = zscore(Xc)
    
    # compute clusters
    km = KMeans(n_clusters=nclusters).fit(Xc)
    Xc = km.predict(Xc)

    clusters = []
    for icluster in range(nclusters):
        nmembers = (Xc==icluster).sum()
        if True:#nmembers < len(data) / 5.0 and nmembers > 1: # only group clusters big enough but not too big
            members = (Xc==icluster).nonzero()[0]
            topwordidx = array(X[members,:].sum(axis=0))[0].argsort()[-topwhat:][::-1]
            topwords = ' '.join([idx2word[wi] for wi in topwordidx])
            meanDist = triu(pairwise_distances(X[members,:],metric='l2',n_jobs=1)).sum()
            meanDist = meanDist / (len(members) + (len(members)**2 - len(members))/2.0)
            # print u'Cluster %d'%icluster + u' %d members'%nmembers + u' mean Distance %f'%meanDist + u'\n\t'+topwords
            clusters.append({
                'name':'Cluster-%d'%icluster,
                'description': topwords,
                'members': list(members),
                'meanL2Distances': meanDist
                })

    return clusters
Example #50
def soma_lfp(ns, ts, N, T, tau=0.002, dt=.001, norm=True):
    """Simulate LFP (1d) bu convlution with an 'alpha' kernel.

    Parameters
    ----------

    ns : array-list (1d)
        Neuron codes (integers)
    ts : array-list (1d, seconds)
        Spike times
    tau : numeric (default: 0.002)
        The alpha estimate time constant
    dt : numeric (default: 0.001, seconds)
        Step time 
    """
    spikes = to_spikes(ns, ts, T, N, dt)

    if spikes.ndim > 2:
        raise ValueError("spikes must be 1 of 2d")
    if tau < 0:
        raise ValueError("tau must be > 0")
    if dt < 0:
        raise ValueError("dt must be > 0")

    # Enforce col orientation if 1d
    if spikes.ndim == 1:
        spikes = spikes[:, np.newaxis]

    # 10 x tau (10 half lives) should be enough to span the
    # interesting parts of g, the alpha function we are
    # using to convert broadband firing to LFP
    # a technique we are borrowing from:
    #
    # http://www.ncbi.nlm.nih.gov/pubmed/20463210
    #
    # then abusing a bit (too much?).
    #
    # We want 10*tau but we have to resample to dt time first
    n_alpha_samples = ((tau * 10) / dt)
    t0 = np.linspace(0, tau * 10, n_alpha_samples)

    # Define the alpha (g notation borrow from BV's initial code)
    gmax = 0.1
    g = gmax * (t0 / tau) * np.exp(-(t0 - tau) / tau)

    # make LFP
    spsum = spikes.astype(np.float).sum(1)
    spsum /= spsum.max()

    lfps = np.convolve(spsum, g)[0:spikes.shape[0]]

    if norm:
        lfps = zscore(lfps)

    return lfps
Example #51
def prepare_mvpa_data(data_dir, extension, mask_file, epoch_file):
    """ obtain the data for activity-based model training and prediction

    Average the activity within epochs and z-scoring within subject.

    Parameters
    ----------
    data_dir: str
        the path to all subject files
    extension: str
        the file extension, usually nii.gz or nii
    mask_file: str
        the absolute path of the mask file,
        we apply the mask right after reading a file for saving memory
    epoch_file: str
        the absolute path of the epoch file

    Returns
    -------
    processed\_data: 2D array in shape [num_voxels, num_epochs]
        averaged epoch by epoch processed data

    labels: 1D array
        contains labels of the data
    """
    activity_data = read_activity_data(data_dir, extension, mask_file)
    epoch_list = np.load(epoch_file)
    epoch_info = generate_epochs_info(epoch_list)
    num_epochs = len(epoch_info)
    (d1, _) = activity_data[0].shape
    processed_data = np.empty([d1, num_epochs])
    labels = np.empty(num_epochs)
    subject_count = [0]  # counting the epochs per subject for z-scoring
    cur_sid = -1
    # averaging
    for idx, epoch in enumerate(epoch_info):
        labels[idx] = epoch[0]
        if cur_sid != epoch[1]:
            subject_count.append(0)
            cur_sid = epoch[1]
        subject_count[-1] += 1
        processed_data[:, idx] = np.mean(activity_data[cur_sid][:, epoch[2] : epoch[3]], axis=1)
    # z-scoring
    cur_epoch = 0
    for i in subject_count:
        if i > 1:
            processed_data[:, cur_epoch : cur_epoch + i] = zscore(
                processed_data[:, cur_epoch : cur_epoch + i], axis=1, ddof=0
            )
        cur_epoch += i
    # if zscore fails (standard deviation is zero),
    # set all values to be zero
    processed_data = np.nan_to_num(processed_data)

    return processed_data, labels
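
A toy illustration of the "average within epochs, then z-score within subject" step for a single subject; the epoch tuples follow the (label, subject id, start TR, end TR) layout implied above:

import numpy as np
from scipy.stats import zscore

voxels_by_TR = np.random.rand(100, 40)                     # one subject: 100 voxels x 40 TRs
epoch_info = [(0, 0, 0, 10), (1, 0, 10, 20), (0, 0, 20, 30), (1, 0, 30, 40)]

averaged = np.stack([voxels_by_TR[:, start:end].mean(axis=1)
                     for (_, _, start, end) in epoch_info], axis=1)   # voxels x epochs
averaged = np.nan_to_num(zscore(averaged, axis=1, ddof=0))            # z-score across this subject's epochs
print(averaged.shape)                                                 # (100, 4)
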
Example #52
def perform_PCA(fpkmMatrix, standardize=3, log=True):
	## preprocessing of the fpkmMatrix
	if log:
		fpkmMatrix = np.log10(fpkmMatrix + 1.)
	if standardize == 2: # standardize along rows/genes
		fpkmMatrix = zscore(fpkmMatrix, axis=1)
	elif standardize == 1: # standardize along cols/samples
		fpkmMatrix = zscore(fpkmMatrix, axis=0)

	## remove genes with NaNs
	fpkmMatrix = fpkmMatrix[~np.isnan(np.sum(fpkmMatrix, axis=1))]

	pca = PCA(n_components=None)
	## get variance captured
	pca.fit(fpkmMatrix.T)
	variance_explained = pca.explained_variance_ratio_[0:3]
	variance_explained *= 100
	## compute PCA and plot
	pca_transformed = pca.transform(fpkmMatrix.T)
	return variance_explained, pca_transformed
Example #53
def get_bold_signals (image, mask, TR, 
                      normalize=True, 
                      ts_extraction='mean', 
                      filter_par=None, 
                      roi_values=None):
    '''
    Image and mask must be in nibabel format
    '''
    
    mask_data = np.int_(mask.get_data())
    if roi_values is None:
        labels = np.unique(mask_data)[1:]
    else:
        labels = np.int_(roi_values)
    
    final_data = []
    #print labels
    for v in labels[:]:
        #print str(v)
        data = image.get_data()[mask_data == v]
        
        if normalize == True:
            data = zscore(data, axis = 1)
            data[np.isnan(data)] = 0

        if ts_extraction=='mean':
            #assert np.mean(data, axis=0) == data.mean(axis=0)
            data = data.mean(axis=0)
        elif ts_extraction=='pca':
            if data.shape[0] > 0:
                data = PCA(n_components=1).fit_transform(data.T)
                data = np.squeeze(data)
            else:
                data = data.mean(axis=0)
                
        ts = TimeSeries(data, sampling_interval=float(TR))
        
        if filter_par is not None:
            
            upperf = filter_par['ub']
            lowerf = filter_par['lb']
            
            F = FilterAnalyzer(ts, ub=upperf, lb=lowerf)
            
            ts = TimeSeries(F.fir.data, sampling_interval=float(TR))
            
            del F
        
        final_data.append(ts.data)

    del data
    del mask_data
    del ts
    return TimeSeries(np.vstack(final_data), sampling_interval=float(TR))
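
A hedged call sketch for get_bold_signals; the file names are placeholders, and it assumes the snippet's own dependencies (nibabel, nitime's TimeSeries and FilterAnalyzer, scipy's zscore, sklearn's PCA) are importable.

import nibabel as nib

bold_img = nib.load('bold.nii.gz')     # hypothetical 4D functional image
atlas_img = nib.load('atlas.nii.gz')   # hypothetical labelled ROI mask
roi_ts = get_bold_signals(bold_img, atlas_img, TR=2.0,
                          normalize=True,
                          ts_extraction='mean',
                          filter_par={'lb': 0.01, 'ub': 0.1})
print(roi_ts.data.shape)               # (n_rois, n_timepoints)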
Exemplo n.º 54
0
 def get_arc(self, word):
     '''
     Implements the neighbourhood size/density algorithm described in [1].

     The algorithm extends the proposal of Shaoul & Westbury (2006) by relativising the
     threshold for EACH word: we ask whether a potential neighbour stands closer to the
     target word, or further away, than the average pairing of the target with any other
     word. To compute the semantic distances we take A ⋅ w_i for each word (see the
     get_neighbourhood method with topn=0), where A is the ∣V∣ × D or the ∣V∣ × ∣V∣
     vocabulary matrix and w_i is the target word vector. Since every vector in the
     matrix has been normalised to unit length (see the _init method), this operation is
     equivalent to taking the cosine similarity between the word in question and every
     other word in the vocabulary.
     The resulting ∣V∣-dimensional vector therefore holds the similarity values between
     the word in question and all the other words in the matrix, and obtaining
     descriptive statistics for this distribution is trivial. Converting the vector of
     similarities into z-scores, we keep all the scores above a predefined threshold and
     thereby obtain the neighbourhood size and density of each word, taking into account
     its similarity to all the other words in the lexicon.

     The number of standard deviations above which a word counts as a neighbour does not
     have to be set explicitly; the list_ variable reports the size and the density of
     the neighbourhood in predefined steps.

     [1] Alikaniotis D. (2014) Approximating semantic structures using high-dimensional lexical spaces
     '''
     list_ = np.arange(-5, 10, 1) ## range and step of reporting
     ans_list = ARCObject(word, list_)
     most_similar = self.get_neighbourhood(word, topn=0) ## get all
     zscores = zscore([sim for (w, sim) in most_similar])
     self.nl = zip(most_similar, zscores)
     ans_ncount = ans_arc = 0
     it = iter(self.nl)
     high_ind = -1
     high_val = list_[high_ind]
     try:
         low_ind = -2
         low_val = list_[low_ind]
     except IndexError:
         print "Please use a list that contains more than two items"
         raise
     for i, (k, v) in enumerate(it):
         if v < list_[0]:
             return ans_list
         while v < high_val and v < low_val:
             ans_list[low_val] = tuple([ans_ncount, ans_arc / i if ans_ncount != 0 else 0])
             ans_ncount = 0
             low_ind -= 1
             high_ind -= 1
             low_val, high_val = list_[low_ind], list_[high_ind]
         ans_ncount += 1
         ans_arc += k[1]
     return ans_list
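
A standalone sketch of the idea spelled out in the docstring, on assumed toy vectors: with unit-length rows, the matrix-vector product gives cosine similarities, and their z-scores let us count neighbours any number of standard deviations out.

import numpy as np
from scipy.stats import zscore

np.random.seed(0)
A = np.random.randn(1000, 50)                   # toy |V| x D vocabulary matrix
A /= np.linalg.norm(A, axis=1, keepdims=True)   # normalise each row to unit length
w = A[0]                                        # target word vector
sims = A.dot(w)                                 # cosine similarity to every word
z = zscore(sims)
for k in (1, 2, 3):
    # neighbourhood size k standard deviations out (the target itself included)
    print(k, int(np.sum(z > k)))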
Exemplo n.º 55
0
def _separate_epochs(activity_data, epoch_list):
    """ create data epoch by epoch

    Separate data into epochs of interest specified in epoch_list
    and z-score them for computing correlation

    Parameters
    ----------
    activity_data: list of 2D array in shape [nVoxels, nTRs]
        the masked activity data organized in voxel*TR formats of all subjects
    epoch_list: list of 3D array in shape [condition, nEpochs, nTRs]
        specification of epochs and conditions
        assuming all subjects have the same number of epochs
        len(epoch_list) equals the number of subjects

    Returns
    -------
    raw_data: list of 2D array in shape [epoch length, nVoxels]
        the data organized in epochs
        and z-scored in preparation of correlation computation
        len(raw_data) equals the number of epochs
    labels: list of 1D array
        the condition labels of the epochs
        len(labels) equals the number of epochs
    """
    time1 = time.time()
    raw_data = []
    labels = []
    for sid in range(len(epoch_list)):
        epoch = epoch_list[sid]
        for cond in range(epoch.shape[0]):
            sub_epoch = epoch[cond, :, :]
            for eid in range(epoch.shape[1]):
                r = np.sum(sub_epoch[eid, :])
                if r > 0:   # there is an epoch in this condition
                    # mat is row-major
                    # regardless of the order of activity_data[sid]
                    mat = activity_data[sid][:, sub_epoch[eid, :] == 1]
                    mat = np.ascontiguousarray(mat.T)
                    mat = zscore(mat, axis=0, ddof=0)
                    # if zscore fails (standard deviation is zero),
                    # set all values to be zero
                    mat = np.nan_to_num(mat)
                    mat = mat / math.sqrt(r)
                    raw_data.append(mat)
                    labels.append(cond)
    time2 = time.time()
    logger.debug(
        'epoch separation done, takes %.2f s' %
        (time2 - time1)
    )
    return raw_data, labels
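
A sketch on assumed toy data showing why each epoch is z-scored over time and divided by sqrt(nTRs): after that normalisation, plain dot products between voxel columns reproduce their Pearson correlations, which is what the downstream correlation computation relies on.

import math
import numpy as np
from scipy.stats import zscore

np.random.seed(0)
epoch = np.random.randn(20, 5)                 # 20 TRs x 5 voxels (toy)
mat = zscore(epoch, axis=0, ddof=0) / math.sqrt(epoch.shape[0])
corr_fast = mat.T.dot(mat)                     # correlations via dot products only
corr_ref = np.corrcoef(epoch, rowvar=False)    # reference Pearson matrix
print(np.allclose(corr_fast, corr_ref))        # True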
Exemplo n.º 56
0
def prepare_mvpa_data(images, conditions, mask):
    """Prepare data for activity-based model training and prediction.

    Average the activity within each epoch and z-score within each subject.

    Parameters
    ----------
    images: Iterable[SpatialImage]
        Data.
    conditions: List[UniqueLabelConditionSpec]
        Condition specification.
    mask: np.ndarray
        Mask to apply to each image.

    Returns
    -------
    processed_data: 2D array in shape [num_voxels, num_epochs]
        averaged epoch by epoch processed data
    labels: 1D array
        contains labels of the data
    """
    activity_data = list(mask_images(images, mask, np.float32))
    epoch_info = generate_epochs_info(conditions)
    num_epochs = len(epoch_info)
    (d1, _) = activity_data[0].shape
    processed_data = np.empty([d1, num_epochs])
    labels = np.empty(num_epochs)
    subject_count = [0]  # counting the epochs per subject for z-scoring
    cur_sid = -1
    # averaging
    for idx, epoch in enumerate(epoch_info):
        labels[idx] = epoch[0]
        if cur_sid != epoch[1]:
            subject_count.append(0)
            cur_sid = epoch[1]
        subject_count[-1] += 1
        processed_data[:, idx] = \
            np.mean(activity_data[cur_sid][:, epoch[2]:epoch[3]],
                    axis=1)
    # z-scoring
    cur_epoch = 0
    for i in subject_count:
        if i > 1:
            processed_data[:, cur_epoch:cur_epoch + i] = \
                zscore(processed_data[:, cur_epoch:cur_epoch + i],
                       axis=1, ddof=0)
        cur_epoch += i
    # if zscore fails (standard deviation is zero),
    # set all values to be zero
    processed_data = np.nan_to_num(processed_data)

    return processed_data, labels
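
A hedged sketch of feeding prepare_mvpa_data-style output into a classifier: the arrays below are stand-ins for processed_data and labels, and the linear SVM is an arbitrary illustrative choice; the point is that processed_data is [num_voxels, num_epochs] and must be transposed for scikit-learn.

import numpy as np
from sklearn.svm import SVC

np.random.seed(0)
processed_data = np.random.randn(500, 40)      # stand-in: 500 voxels x 40 epochs
labels = np.repeat([0, 1], 20)                 # stand-in condition labels
X = processed_data.T                           # scikit-learn wants [epochs, voxels]
clf = SVC(kernel='linear').fit(X, labels)
print(clf.score(X, labels))                    # training accuracy, sanity check only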
Exemplo n.º 57
0
def create_epoch(idx, num_voxels):
    row = 12
    col = num_voxels
    mat = prng.rand(row, col).astype(np.float32)
    # impose a pattern to even epochs
    if idx % 2 == 0:
        mat = np.sort(mat, axis=0)
    mat = zscore(mat, axis=0, ddof=0)
    # if zscore fails (standard deviation is zero),
    # set all values to be zero
    mat = np.nan_to_num(mat)
    mat = mat / math.sqrt(mat.shape[0])
    return mat
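
A minimal sketch of calling create_epoch; the snippet relies on a module-level prng (plus numpy, math, and zscore imports), so those are assumed to live in the same module here, and num_voxels is arbitrary.

import numpy as np

prng = np.random.RandomState(42)   # assumed module-level RNG used by create_epoch
epochs = [create_epoch(i, num_voxels=64) for i in range(4)]
print(epochs[0].shape)             # (12, 64)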