def perform_pyrcca_cca(X: np.ndarray, Y: np.ndarray):
    """Fit a linear (non-kernel) regularized CCA between two views.

    The two views must have the same number of rows.  The number of
    canonical components requested is the smallest of the shared row count
    and the two column counts.

    Args:
        X: First view, indexed as ``X.shape[0]`` rows by ``X.shape[1]`` columns.
        Y: Second view, with the same number of rows as ``X``.

    Returns:
        The ``rcca.CCA`` object after ``train([X, Y])`` has been called.

    Raises:
        AssertionError: If the row counts of ``X`` and ``Y`` differ.
    """
    assert X.shape[0] == Y.shape[0]
    # Components cannot exceed the sample count or either feature count.
    component_count = min(X.shape[0], X.shape[1], Y.shape[1])
    model = rcca.CCA(kernelcca=False, reg=1e4, numCC=component_count)
    model.train([X, Y])
    return model
def lc_rcca(datasets, kernelcca=True, reg=0.1, numCC=2, verbose=False):
    """Train a pyrcca CCA model and return its first canonical correlation.

    Args:
        datasets: The views to correlate, passed unchanged to ``CCA.train``
            (the original comment says two subsets, X and Y).
        kernelcca: Forwarded to ``rcca.CCA``.
        reg: Regularization value forwarded to ``rcca.CCA``.
        numCC: Number of canonical components forwarded to ``rcca.CCA``.
        verbose: Forwarded to ``rcca.CCA``.  Previously this argument was
            accepted but ignored, so the library default applied instead.

    Returns:
        Tuple ``(corr_firstVariate, cca)``: the first entry of
        ``cca.cancorrs`` and the trained ``rcca.CCA`` object.
    """
    cca = rcca.CCA(kernelcca=kernelcca, reg=reg, numCC=numCC, verbose=verbose)
    cca.train(datasets)
    # Correlation of the first canonical variate.
    corr_firstVariate = cca.cancorrs[0]
    return corr_firstVariate, cca
def __compute_fitness(self):
    """Store the mean out-of-sample CCA correlation for every dataset.

    For each dataset in ``DATASETS`` and each ``k`` in its ``k_range``, every
    repeat/fold combination is evaluated: a one-component CCA is trained on
    the exposures and gene-expression rows of all folds but one, and
    ``__compute_correlation`` is evaluated on the held-out fold.  The mean
    over all of these values, rounded to four decimals, is written to
    ``self.fitness_scores[dataset_name]``.  The scores are printed at the end.
    """
    scores_per_dataset = {}
    for dataset in DATASETS:
        name = dataset.dataset_name
        fold_scores = []
        scores_per_dataset[name] = fold_scores
        expression = dataset.train_ge_data
        for k in dataset.k_range:
            exposures = self.train_exposures[name][k]
            for rep in np.arange(REPEATS):
                folds = dataset.cca_folds['train'][rep]
                fold_count = len(folds)
                for held_out in np.arange(fold_count):
                    # Train on every fold except the held-out one.
                    kept_folds = np.setdiff1d(np.arange(fold_count), held_out)
                    train_rows = np.concatenate([folds[f] for f in kept_folds])
                    val_rows = folds[held_out]

                    model = rcca.CCA(reg=1e-4, numCC=1, verbose=False)
                    model.train([exposures[train_rows],
                                 expression[train_rows]])
                    fold_scores.append(
                        self.__compute_correlation(model,
                                                   exposures[val_rows],
                                                   expression[val_rows]))
        self.fitness_scores[name] = np.round(np.mean(fold_scores), 4)
    print('Correlation:', self.fitness_scores)
def main():
    """Load two aligned ``.npy`` matrices and fit a two-component linear CCA.

    Command-line options:
        --out: Parsed but not used by the code visible here.
        --matrices: Comma-delimited ``.npy`` file names; the first two are
            loaded.

    Exits with a usage error (status 2) when fewer than two matrix files are
    given, instead of failing later with an unrelated load/index error.
    """
    parser = OptionParser()
    parser.add_option('--out', dest='out')
    parser.add_option(
        '--matrices',
        dest='ms',
        default='',
        help='Comma delimited, .npy files, nodes should be aligned')
    opts, args = parser.parse_args()

    # ''.split(',') yields [''], so drop empty entries before counting.
    matrices = [name.strip() for name in opts.ms.split(',') if name.strip()]
    if len(matrices) < 2:
        parser.error('--matrices requires two comma-delimited .npy files')
    m1 = np.load(matrices[0])
    m2 = np.load(matrices[1])

    # Set up Pyrcca
    cca = rcca.CCA(kernelcca=False, numCC=2, reg=0.5)

    # Find canonical components
    # NOTE(review): `training` is not used in the code visible here.
    training = cca.train([m1, m2])
def run_cca(data1, data2, test1, test2, numCC, reg):
    """Train a linear regularized CCA and plot held-out prediction correlations.

    The model is trained on ``[data1, data2]`` and validated on
    ``[test1, test2]``.  The per-dimension validation correlations of both
    datasets are drawn as a grouped bar chart that is saved to
    ``<PROJDIR>/Prediction.png``.

    Args:
        data1, data2: Training views passed to ``CCA.train``.
        test1, test2: Held-out views passed to ``CCA.validate``.
        numCC: Number of canonical components.
        reg: Regularization value.

    Returns:
        Whatever ``cca.train([data1, data2])`` returns.  The model used to be
        trained a second time just to produce this value; the result of the
        single training call is returned instead.
    """
    cca = rcca.CCA(kernelcca=False, numCC=numCC, reg=reg)
    # Find canonical components
    trained = cca.train([data1, data2])

    # Test on held-out data
    corrs = cca.validate([test1, test2])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plot0 = ax.bar(np.arange(corrs[0].shape[0]), corrs[0], 0.3,
                   color="steelblue")
    # Second series is shifted so the bars sit side by side.
    plot1 = ax.bar(np.arange(corrs[1].shape[0]) + 0.35, corrs[1], 0.3,
                   color="orangered")
    ax.legend([plot0[0], plot1[0]], ["Dataset 1", "Dataset 2"])
    ax.set_ylabel("Prediction correlation")
    ax.set_xticks(np.arange(0, corrs[0].shape[0], 20) + 0.325)
    ax.set_xticklabels(["%d" % i for i in range(0, corrs[0].shape[0], 20)])
    ax.set_xlabel("Test data m=113")
    # NOTE(review): the figure is left open after saving; close it here if no
    # caller relies on showing it afterwards.
    fig.savefig(str(PROJDIR) + '/Prediction.png', dpi=fig.dpi)
    return trained
def cca_rcca(spikes, stimulus, filter_length, n_components, regularization,
             whiten):
    """Run a linear regularized CCA between spikes and stimulus.

    Args:
        spikes: Response matrix; its second dimension divided by
            ``filter_length[1]`` gives the cell count.
        stimulus: Stimulus matrix handed to ``CCA.train`` as the second view.
        filter_length: Pair indexed as ``[0]`` (stimulus) and ``[1]``
            (response).
        n_components: Number of canonical components.
        regularization: Regularization value for ``rcca.CCA``.
        whiten: If true, ``whiten_data`` is applied to the spikes first and
            the resulting rotation is applied back to the response weights.

    Returns:
        Tuple ``(resp_comps, stim_comps, cancorrs)``: response weights
        reshaped to ``(n_components, ncells, filter_length[1])``, stimulus
        weights reshaped to ``(n_components, 2, filter_length[0])``, and
        ``cca.cancorrs``.
    """
    # Cell count is taken from the spikes before any whitening.
    cell_count = int(spikes.shape[1] / filter_length[1])

    if whiten:
        spikes, rotation = whiten_data(spikes)

    model = rcca.CCA(kernelcca=False, reg=regularization, numCC=n_components)
    model.train([spikes, stimulus])

    response_weights = model.ws[0]
    if whiten:
        # Derotate so the weights refer to the original response axes.
        response_weights = rotation @ response_weights

    resp_comps = np.swapaxes(response_weights, 1, 0).reshape(
        (n_components, cell_count, filter_length[1]))
    stim_comps = np.swapaxes(model.ws[1], 1, 0).reshape(
        (n_components, 2, filter_length[0]))

    return resp_comps, stim_comps, model.cancorrs
def fit_cca_clf_loop(
        h_load_file: str,
        n_seeds: int = 3,
        rescale: bool = True,
        save_file: str = None,
        **kwargs,
):
    """Grid-search a linear-kernel CCA followed by an L1 logistic regression.

    For every seed, cross-validation fold, number of canonical components,
    CCA regularization and classifier ``C``, the CCA is trained on the
    (centered) training views, both splits are projected through ``cca.ws``,
    the classifier is fitted, and three rows (``mcc``, ``bal_acc``,
    ``pred_r``) are appended to a long-format results frame.

    Args:
        h_load_file: Forwarded to ``prepare_cca_data``.
        n_seeds: Number of seeds; seed ``i`` is ``2**i`` and at least one seed
            is always used.
        rescale: If true, the centered training views are additionally
            divided by ``sqrt(n_components)`` before CCA training.
        save_file: Result file name; defaults to
            ``results_cca_clf_<now()>.df``.
        **kwargs: Overrides for keys of ``default_args``; unknown keys are
            ignored.

    Returns:
        Tuple ``(results, best, default_args)`` where ``best`` is the output
        of ``extract_best_hyperparams(results, metric='mcc', verbose=True)``.
    """
    save_file = 'results_cca_clf_{}.df'.format(
        now()) if save_file is None else save_file

    # NOTE(review): 'global_normalize' is stored here but never read in this
    # function.
    default_args = {
        'seeds': [int(2**i) for i in range(max(1, n_seeds))],
        'min_nb_trials': 100,
        'target': True,
        'global_normalize': True,
        'augment_data': False,
        'xv_folds': 5,
        'timepoint': 45,
        'num_ccs': np.arange(5, 91, 5),
        'cca_regs': np.logspace(-3, -1.5, num=20),
        'clf_regs': np.logspace(-3, -1.4, num=20),
        'clf_max_iter': int(1e3),
        'clf_tol': 1e-4,
    }
    # Only keys that already exist in default_args can be overridden.
    for k in default_args:
        if k in kwargs:
            default_args[k] = kwargs[k]

    results = pd.DataFrame()
    # This filter is installed on the warnings module and is not undone here.
    warnings.filterwarnings('ignore', category=RuntimeWarning)
    for random_state in tqdm(default_args['seeds']):
        np.random.seed(random_state)
        for fold in tqdm(range(default_args['xv_folds']), leave=False):
            data_trn, data_tst = prepare_cca_data(
                h_load_file=h_load_file,
                min_nb_trials=default_args['min_nb_trials'],
                timepoint=default_args['timepoint'],
                target=default_args['target'],
                normalize_mode='zscore',
                augment=default_args['augment_data'],
                xv_folds=default_args['xv_folds'],
                which_fold=fold,
                verbose=False,
            )
            train_list, y_trn = data_trn['processed'], data_trn['labels']
            test_list, y_tst = data_tst['processed'], data_tst['labels']

            for n_components in tqdm(default_args['num_ccs'], leave=False):
                # Each view is centered by its overall mean (a scalar).
                train_list_centered = [
                    (item - item.mean()) /
                    np.sqrt(n_components) if rescale else item - item.mean()
                    for item in train_list
                ]
                for reg in tqdm(default_args['cca_regs'], leave=False):
                    cca = rcca.CCA(
                        kernelcca=True,
                        ktype='linear',
                        numCC=n_components,
                        reg=reg,
                        cutoff=1e-15,
                        verbose=False,
                    ).train(train_list_centered)

                    # Mean absolute validation correlation over all views.
                    testcorrs = cca.validate(test_list)
                    corrs = []
                    for item in testcorrs:
                        corrs.append(np.mean(np.abs(item)))
                    pred_r = np.mean(corrs)

                    # NOTE(review): the projections use the un-centered
                    # train_list/test_list, not train_list_centered - confirm
                    # this is intended.
                    x_trn = [x @ w for x, w in zip(train_list, cca.ws)]
                    x_tst = [x @ w for x, w in zip(test_list, cca.ws)]
                    x_trn, x_tst = tuple(map(np.concatenate, [x_trn, x_tst]))

                    for C in default_args['clf_regs']:
                        clf = LogisticRegression(
                            C=C,
                            penalty='l1',
                            solver='liblinear',
                            class_weight='balanced',
                            max_iter=default_args['clf_max_iter'],
                            tol=default_args['clf_tol'],
                            random_state=random_state,
                        ).fit(x_trn, y_trn)
                        y_pred = clf.predict(x_tst)

                        balacc = balanced_accuracy_score(y_tst, y_pred)
                        mcc = matthews_corrcoef(y_tst, y_pred)

                        # One row per metric, same hyperparameters on each.
                        data_dict = {
                            'seed': [random_state] * 3,
                            'fold': [fold] * 3,
                            'n_components': [n_components] * 3,
                            'cca_reg': [reg] * 3,
                            'clf_reg': [C] * 3,
                            'metric': ['mcc', 'bal_acc', 'pred_r'],
                            'value': [mcc, balacc, pred_r],
                        }
                        results = pd.concat(
                            [results,
                             pd.DataFrame.from_dict(data_dict)])
            # NOTE(review): the nesting level of this intermediate save was
            # reconstructed from a whitespace-collapsed source - confirm.
            save_obj(obj=results,
                     file_name=save_file,
                     save_dir='./results',
                     mode='df',
                     verbose=False)

    results = reset_df(results)
    save_obj(obj=results,
             file_name=save_file,
             save_dir='./results',
             mode='df',
             verbose=True)
    best = extract_best_hyperparams(results, metric='mcc', verbose=True)
    return results, best, default_args
def get_best_cca_clf(
        h_load_file: str,
        best: dict,
        min_nb_trials: int = -1,
        time_range: range = range(45, 46),
        target: bool = True,
        global_normalize: bool = True,
        augment_data: bool = False,
        xv_folds: int = 5,
        which_fold: int = 0,
        random_sate: int = 42,
):
    """Fit one CCA + logistic-regression pipeline with given hyperparameters.

    Args:
        h_load_file: Forwarded to ``prepare_cca_data``.
        best: Mapping with the keys ``'cca_reg'``, ``'n_components'`` and
            ``'clf_reg'``.
        min_nb_trials, time_range, target, global_normalize, augment_data,
        xv_folds, which_fold: Forwarded to ``prepare_cca_data``.
        random_sate: ``random_state`` of the classifier (parameter name kept
            as spelled for compatibility).

    Returns:
        Dict with the train/test data, the trained ``cca`` and ``clf``, and
        the components returned by ``extract_components`` for both splits.
    """
    data_trn, data_tst = prepare_cca_data(
        h_load_file=h_load_file,
        min_nb_trials=min_nb_trials,
        target=target,
        global_normalize=global_normalize,
        augment_data=augment_data,
        xv_folds=xv_folds,
        which_fold=which_fold,
        time_range=time_range,
        verbose=False,
    )
    train_list = data_trn['processed']
    y_trn = data_trn['labels']
    test_list = data_tst['processed']
    y_tst = data_tst['labels']

    cca = rcca.CCA(
        kernelcca=True,
        ktype='linear',
        reg=best['cca_reg'],
        numCC=best['n_components'],
        verbose=False,
    )
    # Training views are scaled down by sqrt(number of components).
    scale = np.sqrt(best['n_components'])
    cca.train([item / scale for item in train_list])

    # Mean absolute validation correlation, averaged over the views.
    pred_r = np.mean(
        [np.mean(np.abs(item)) for item in cca.validate(test_list)])

    # Project every view through its CCA weights and stack the views.
    x_trn = np.concatenate([x @ w for x, w in zip(train_list, cca.ws)])
    x_tst = np.concatenate([x @ w for x, w in zip(test_list, cca.ws)])

    clf = LogisticRegression(
        random_state=random_sate,
        C=best['clf_reg'],
        penalty='l1',
        solver='liblinear',
        class_weight='balanced',
        max_iter=int(1e4),
        tol=1e-6,
    )
    clf = clf.fit(x_trn, y_trn)
    y_pred = clf.predict(x_tst)

    balacc = balanced_accuracy_score(y_tst, y_pred)
    mcc = matthews_corrcoef(y_tst, y_pred)

    template = ("[PROGRESS] fitting done. results:\n"
                "corr: {:.3f}, balanced accuracy: {:.3f}, mcc: {:.3f}")
    print(template.format(pred_r, balacc, mcc))

    comps_trn, comps_df_trn = extract_components(data_trn, cca)
    comps_tst, comps_df_tst = extract_components(data_tst, cca)

    return {
        'data_trn': data_trn,
        'data_tst': data_tst,
        'cca': cca,
        'clf': clf,
        'comps_trn': comps_trn,
        'comps_tst': comps_tst,
        'comps_df_trn': comps_df_trn,
        'comps_df_tst': comps_df_tst,
    }
def cca_omb_components(exp: str,
                       stim_nr: int,
                       n_components: int = 6,
                       regularization=None,
                       filter_length=None,
                       maxframes=None,
                       shufflespikes: bool = False,
                       savedir: str = None,
                       savefig: bool = True,
                       sort_by_nspikes: bool = True,
                       select_cells: list = None,
                       plot_first_ncells: int = None,
                       whiten: bool = False):
    """
    Analyze OMB responses using canonical correlation analysis and plot the
    results.

    Parameters
    ---
    n_components:
        Number of components that will be requested from the CCA analysis.
        More numbers mean the algorithm will stop at a later point. That
        means components of analyses with fewer n_components are going to be
        identical to the first n components of the higher-number component
        analyses.
    regularization:
        The regularization parameter to be passed onto rcca.CCA. None is
        treated as 0.
    filter_length:
        The length of the time window to be considered in the past for the
        stimulus and the responses. Can be different for stimulus and
        response, if a tuple is given (stimulus first, response second).
    maxframes: int
        Number of frames to load in the experiment object. Used to avoid
        memory and performance issues.
    shufflespikes: bool
        Whether to randomize the spikes, to validate the results.
    savedir: str
        Custom directory to save the figures and data files. If None, will
        be saved in the experiment directory under an appropriate path.
    savefig: bool
        Whether to save the figures.
    sort_by_nspikes: bool
        Whether to sort the cell weights array by the number of spikes
        during the stimulus.
    select_cells: list
        A list of indexes for the subset of cells to perform the analysis
        for.
    plot_first_ncells: int
        Number of cells to plot in the cell plots.
    whiten: bool
        Whether to pass the packed spikes through ``whiten_data`` before the
        CCA and rotate the resulting cell weights back afterwards. This
        name used to be read without being defined in the function.
    """
    if regularization is None:
        regularization = 0

    cca = rcca.CCA(kernelcca=False, reg=regularization, numCC=n_components)

    st = OMB(exp, stim_nr, maxframes=maxframes)

    if filter_length is None:
        filter_length = st.filter_length

    # A single integer applies to both the stimulus and the response.
    if isinstance(filter_length, (int, np.integer)):
        filter_length = (filter_length, filter_length)

    if isinstance(savedir, str):
        savedir = Path(savedir)

    if savedir is None:
        savedir = st.stim_dir / 'CCA'
    savedir.mkdir(exist_ok=True, parents=True)

    spikes = st.allspikes()
    # Count the spikes per cell BEFORE centering: once each row has its mean
    # removed, its sum is ~0 and carries no information for sorting.
    nspikes_percell = spikes.sum(axis=1)
    # Set the mean to zero for spikes
    spikes -= spikes.mean(axis=1)[:, None]

    if select_cells is not None:
        select_cells = np.asarray(select_cells)
        spikes = spikes[select_cells]
        nspikes_percell = nspikes_percell[select_cells]
        st.nclusters = len(select_cells)
        # Convert to list for better string representation
        # np.array is printed as "array([....])"
        # with newline characters which is problematic in filenames
        select_cells = list(select_cells)

    if shufflespikes:
        spikes = spikeshuffler.shufflebyrow(spikes)

    figsavename = f'{n_components=}_{shufflespikes=}_{select_cells=}_{regularization=}_{filter_length=}_{whiten=}'
    # If the file name gets too long due to the list of selected cells,
    # summarize it.
    if len(figsavename) > 200:
        figsavename = f'{n_components=}_{shufflespikes=}_select_cells={len(select_cells)}cells-index{select_cells[0]}to{select_cells[-1]}_{regularization=}_{filter_length=}_{whiten=}'

    stimulus = mft.packdims(st.bgsteps, filter_length[0])
    spikes = mft.packdims(spikes, filter_length[1])

    if whiten:
        spikes, spikes_rotation = whiten_data(spikes)

    cca.train([spikes, stimulus])

    spikes_res = cca.ws[0]
    # Derotate the data to be able to interpret the responses
    if whiten:
        spikes_res = spikes_rotation @ spikes_res

    cells = np.swapaxes(spikes_res, 1, 0)
    cells = cells.reshape((n_components, st.nclusters, filter_length[1]))

    nsp_argsorted = np.argsort(nspikes_percell)
    cells_sorted_nsp = cells[:, nsp_argsorted, :]

    if sort_by_nspikes:
        cells_toplot = cells_sorted_nsp
    else:
        cells_toplot = cells

    if plot_first_ncells is not None:
        cells_toplot = cells_toplot[:, :plot_first_ncells, ...]

    # The stimulus weights hold the x filter first, then the y filter.
    motionfilt_x = cca.ws[1][:filter_length[0]].T
    motionfilt_y = cca.ws[1][filter_length[0]:].T

    motionfilt_r, motionfilt_theta = mft.cart2pol(motionfilt_x, motionfilt_y)
    #%%
    # NOTE(review): the plotting below indexes components by subplot
    # position, so it relies on the grid from plf.numsubplots having exactly
    # n_components cells - confirm for values other than the default.
    nrows, ncols = plf.numsubplots(n_components)
    fig_cells, axes_cells = plt.subplots(nrows, ncols, figsize=(10, 10))

    for i in range(n_components):
        ax = axes_cells.flat[i]
        im = ax.imshow(cells[i, :],
                       cmap='RdBu_r',
                       vmin=asc.absmin(cells),
                       vmax=asc.absmax(cells),
                       aspect='auto',
                       interpolation='nearest')
        ax.set_title(f'{i}')
    fig_cells.suptitle(f'Cells default order {shufflespikes=}')
    if savefig:
        fig_cells.savefig(savedir / f'{figsavename}_cells_default_order.pdf')
    plt.close(fig_cells)

    nsubplots = plf.numsubplots(n_components)
    height_list = [1, 1, 1, 3]  # ratios of the plots in each component

    # Create a time vector for the stimulus plots
    t_stim = -np.arange(0, filter_length[0] * st.frame_duration,
                        st.frame_duration)[::-1] * 1000
    t_response = -np.arange(0, filter_length[1] * st.frame_duration,
                            st.frame_duration)[::-1] * 1000
    xtick_loc_params = dict(nbins=4, steps=[2, 5, 10], integer=True)

    nsubrows = len(height_list)
    height_ratios = nsubplots[0] * height_list
    fig, axes = plt.subplots(nrows=nsubplots[0] * nsubrows,
                             ncols=nsubplots[1],
                             gridspec_kw={'height_ratios': height_ratios},
                             figsize=(11, 10))

    for row, ax_row in enumerate(axes):
        for col, ax in enumerate(ax_row):
            # Each component occupies nsubrows consecutive rows of one column.
            mode_i = int(row / nsubrows) * nsubplots[1] + col
            ax.set_yticks([])
            # Plot motion filters
            if row % nsubrows == 0:
                ax.plot(t_stim, motionfilt_x[mode_i, :],
                        marker='o', markersize=1)
                ax.plot(t_stim, motionfilt_y[mode_i, :],
                        marker='o', markersize=1)
                if col == 0:
                    ax.set_ylabel('Motion', rotation=0,
                                  ha='right', va='center')
                ax.set_ylim(cca.ws[1].min(), cca.ws[1].max())

                # Draw a horizontal line for zero and prevent rescaling of
                # x-axis
                xlims = ax.get_xlim()
                ax.hlines(0, *ax.get_xlim(), colors='k',
                          linestyles='dashed', alpha=0.3)
                ax.set_xlim(*xlims)

                ax.xaxis.set_major_locator(MaxNLocator(**xtick_loc_params))

                if not mode_i == 0 or filter_length[0] == filter_length[1]:
                    ax.xaxis.set_ticklabels([])
                else:
                    ax.tick_params(axis='x', labelsize=8)

            # Plot magnitude of motion
            elif row % nsubrows == 1:
                ax.plot(t_stim, motionfilt_r[mode_i, :],
                        color='k', marker='o', markersize=1)
                if col == 0:
                    ax.set_ylabel('Magnitude', rotation=0,
                                  ha='right', va='center')
                ax.set_ylim(motionfilt_r.min(), motionfilt_r.max())
                ax.xaxis.set_ticklabels([])
                ax.xaxis.set_major_locator(MaxNLocator(**xtick_loc_params))
            # Plot direction of motion
            elif row % nsubrows == 2:
                ax.plot(t_stim, motionfilt_theta[mode_i, :],
                        color='r', marker='o', markersize=1)
                if mode_i == 0:
                    ax.yaxis.set_ticks([-np.pi, 0, np.pi])
                    ax.yaxis.set_ticklabels(['-π', 0, 'π'])
                ax.xaxis.set_ticklabels([])
                ax.xaxis.set_major_locator(MaxNLocator(**xtick_loc_params))
            # Plot cell weights
            elif row % nsubrows == nsubrows - 1:
                im = ax.imshow(cells_toplot[mode_i, :],
                               cmap='RdBu_r',
                               vmin=asc.absmin(cells),
                               vmax=asc.absmax(cells),
                               aspect='auto',
                               interpolation='nearest',
                               extent=[
                                   t_response[0], t_response[-1], 0,
                                   cells_toplot.shape[1]
                               ])
                ax.xaxis.set_major_locator(MaxNLocator(**xtick_loc_params))
                if row == axes.shape[0] - 1:
                    ax.set_xlabel('Time before spike [ms]')
                else:
                    ax.xaxis.set_ticklabels([])

                plf.integerticks(ax, 5, which='y')
                if col == 0:
                    ax.set_ylabel(
                        f'Cells\n{"(sorted nsp)"*sort_by_nspikes}\n{("(first " + str(plot_first_ncells)+ " cells)")*(type(plot_first_ncells) is int) }',
                        rotation=0,
                        ha='right',
                        va='center')
                else:
                    ax.yaxis.set_ticklabels([])
                if mode_i == n_components - 1:
                    plf.colorbar(im)
            # Add ticks on the right side of the plots
            if col == nsubplots[1] - 1 and row % nsubrows != nsubrows - 1:
                plf.integerticks(ax, 3, which='y')
                ax.yaxis.tick_right()

    fig.suptitle(
        f'CCA components of {st.exp_foldername}\n{shufflespikes=} {n_components=}\n{sort_by_nspikes=}\n'
        + f'{select_cells=} {regularization=} {filter_length=}')
    fig.subplots_adjust(wspace=0.1, hspace=0.3)
    if savefig:
        fig.savefig(savedir / f'{figsavename}_cellsandcomponents.pdf')
    plt.close(fig)

    #%%
    fig_corrs = plt.figure()
    plt.plot(cca.cancorrs, 'ko')
    plt.xlabel('Component index')
    plt.ylabel('Correlation')
    plt.title(f'Cannonical correlations {shufflespikes=}')
    if savefig:
        fig_corrs.savefig(savedir / f'{figsavename}_correlation_coeffs.pdf')
    plt.close(fig_corrs)

    fig_nlt, axes_nlt = plt.subplots(nrows, ncols, figsize=(10, 10))
    for i, ax in enumerate(axes_nlt.flatten()):
        # Project both views onto the i-th canonical direction.
        generator_motion = stimulus @ cca.ws[1][..., i]
        generator_cells = spikes @ cca.ws[0][..., i]

        nonlinearity, bins = nlt.calc_nonlin(generator_cells,
                                             generator_motion)
        ax.plot(bins, nonlinearity, 'k')

        # Output arrays are sized from the first result.
        if i == 0:
            all_nonlinearities = np.empty((n_components, *nonlinearity.shape))
            all_bins = np.empty((n_components, *bins.shape))
        all_nonlinearities[i, ...] = nonlinearity
        all_bins[i, ...] = bins

    # Use one common pair of axis limits for every nonlinearity panel.
    nlt_xlims = []
    nlt_ylims = []
    for ax in axes_nlt.flatten():
        nlt_xlims.extend(ax.get_xlim())
        nlt_ylims.extend(ax.get_ylim())
    nlt_maxx, nlt_minx = max(nlt_xlims), min(nlt_xlims)
    nlt_maxy, nlt_miny = max(nlt_ylims), min(nlt_ylims)

    for i, axes_row in enumerate(axes_nlt):
        for j, ax in enumerate(axes_row):
            if i == nrows - 1:
                ax.set_xlabel('Generator (motion)')
            if j == 0:
                ax.set_ylabel('Generator (cells)')
            else:
                ax.yaxis.set_ticklabels([])
            ax.set_xlim([nlt_minx, nlt_maxx])
            ax.set_ylim([nlt_miny, nlt_maxy])

    fig_nlt.suptitle(f'Nonlinearities\n{figsavename}')
    if savefig:
        fig_nlt.savefig(savedir / f'{figsavename}_nonlinearity.png')
    plt.close(fig_nlt)

    # Saved explicitly rather than through locals() so the set of stored
    # arrays is visible here.
    datadict = {
        'n_components': n_components,
        'cells': cells,
        'motionfilt_x': motionfilt_x,
        'motionfilt_y': motionfilt_y,
        'motionfilt_r': motionfilt_r,
        'motionfilt_theta': motionfilt_theta,
        'cells_sorted_nsp': cells_sorted_nsp,
        'select_cells': select_cells,
        'regularization': regularization,
        'filter_length': filter_length,
    }
    np.savez(savedir / figsavename, **datadict)
def run_cca_loop(n_seeds: int = 10, reg: float = 0.1, **kwargs):
    """Run linear-kernel CCA over a grid of simulated datasets.

    Every combination of sample size, dimensionality flag, seed, noise
    distribution flag, sigma, number of experiments and min/max cell count
    (only pairs with ``max_nc > min_nc``) gets one simulation from
    ``create_cca_simulation``; a CCA is trained on its ``'x_train'`` and the
    output of ``visualize_cca_results`` is recorded as one row.

    Args:
        n_seeds: Number of seeds; seed ``i`` is ``2**i``.
        reg: Regularization value passed to ``rcca.CCA``.
        **kwargs: Overrides for keys of ``default_args``.  Lists, tuples and
            arrays are converted to lists; any other value is wrapped in a
            one-element list.  Unknown keys are ignored.

    Returns:
        Tuple ``(df, default_args)`` where ``df`` is ``reset_df`` applied to
        the concatenated per-run rows.
    """
    default_args = {
        'sample_sizes': [int(10 ** i) for i in range(2, 5)],
        'seeds': [int(2 ** i) for i in range(n_seeds)],
        'sigmas': np.linspace(0, 5, num=11),
        'normal': [True],
        'three_d': [False],
        'num_expts': [10, 20, 30],
        'min_ncs': [int(2**i) for i in range(1, 7)],
        'max_ncs': [int(4**i) for i in range(1, 7)],
    }
    for k in default_args:
        if k in kwargs:
            default_args[k] = list(kwargs[k]) if isinstance(
                kwargs[k], (list, tuple, np.ndarray)) else [kwargs[k]]

    # Rows are collected and concatenated once at the end; concatenating
    # inside the innermost loop re-copies all previous rows on every run.
    frames = []
    for n_samples in tqdm(default_args['sample_sizes']):
        for three_d in default_args['three_d']:
            # The source signal depends only on sample size and dimension.
            train, test = generate_source_signal(n_samples=n_samples,
                                                 three_d=three_d)
            for seed in tqdm(default_args['seeds'], leave=False):
                for normal in default_args['normal']:
                    for sigma in default_args['sigmas']:
                        for num_expts in default_args['num_expts']:
                            for min_nc in default_args['min_ncs']:
                                larger_max_ncs = [
                                    item for item in default_args['max_ncs']
                                    if item > min_nc
                                ]
                                for max_nc in larger_max_ncs:
                                    # create sim
                                    sim = create_cca_simulation(
                                        train=train,
                                        test=test,
                                        num_expts=num_expts,
                                        min_num_cells=min_nc,
                                        max_num_cells=max_nc,
                                        n_samples=n_samples,
                                        three_d=three_d,
                                        angle_spacing=1.0,
                                        magnitude_range=None,
                                        sigma=sigma,
                                        normal=normal,
                                        seed=seed,
                                    )

                                    # fit CCA
                                    cca = rcca.CCA(
                                        kernelcca=True,
                                        ktype='linear',
                                        reg=reg,
                                        numCC=sim['metadata']['dim_z'],
                                        verbose=False,
                                    )
                                    cca.train(sim['x_train'])

                                    # get results
                                    results = visualize_cca_results(
                                        cca, sim, verbose=False)
                                    results.update({
                                        'n_samples': n_samples,
                                        'three_d': three_d,
                                        'seed': seed,
                                        'normal': normal,
                                        'sigma': sigma,
                                        'num_expts': num_expts,
                                        'min_nc': min_nc,
                                        'max_nc': max_nc,
                                    })
                                    results = {
                                        k: [v] for k, v in results.items()
                                    }
                                    frames.append(
                                        pd.DataFrame.from_dict(results))

    df = pd.concat(frames) if frames else pd.DataFrame()
    return reset_df(df), default_args
'Collaborative Teachers %', 'Supportive Environment %', 'Effective School Leadership %', 'Strong Family-Community Ties %', 'Trust %','Average ELA Proficiency', 'Average Math Proficiency']] # drop missing values data = data.dropna() # separate X and Y groups X = data[['Collaborative Teachers %', 'Supportive Environment %', 'Strong Family-Community Ties %', 'Trust %']] Y = data[['Average ELA Proficiency', 'Average Math Proficiency']] for col in X.columns: X[col] = X[col].str.strip('%') X[col] = X[col].astype('int') # Standardise the data from sklearn.preprocessing import StandardScaler sc = StandardScaler(with_mean=True, with_std=True) X_sc = sc.fit_transform(X) Y_sc = sc.fit_transform(Y) import rcca nComponents = 2 # min(p,q)=2 cca = rcca.CCA(kernelcca = False, reg = 0., numCC = nComponents,) # train on data cca.train([X_sc, Y_sc]) print('Canonical Correlation Per Component Pair:',cca.cancorrs) print('% Shared Variance:',cca.cancorrs**2)
# Open the input video and read its frame rate and frame count.
# NOTE(review): the helpers below take no arguments and presumably read these
# names (videoObj, fps, audio_features, cca, ...) as globals - confirm before
# renaming or releasing anything here.
videoObj = cv2.VideoCapture("input/" + input_video)
fps = videoObj.get(cv2.CAP_PROP_FPS)
num_frames = int(videoObj.get(cv2.CAP_PROP_FRAME_COUNT))

video_features = extract_video_features()
audio_features = extract_audio_features()

# Calculates potential black frames at the beginning of videos and removes
# related audio features: when there are at least as many audio feature
# entries as video frames, the surplus is dropped from the start.
if len(audio_features) >= num_frames:
    black_frames = abs(num_frames - len(audio_features))
    audio_features = audio_features[black_frames:]
else:
    black_frames = 0

# Loads the CCA model from "Model.hdf5" in the working directory.
cca = rcca.CCA()
cca.load("Model.hdf5")

print("Detecting speakers...")
speakers = detect_speakers()

print("Building output...")
build_output()

# Clean up "input": remove every sub-directory and every file except the
# input video itself.
for file in os.listdir("input"):
    if os.path.isdir("input/" + file):
        shutil.rmtree("input/" + file)
    elif not file == input_video:
        os.remove("input/" + file)
def _CCA(self):
    """Fit a linear regularized CCA on ``self.X`` and ``self.Y``.

    Uses ``self.reg`` as regularization and ``self.n_components`` as the
    number of canonical components.

    Returns:
        Tuple ``(x_weights, cancorrs, y_weights)``: the weights of the first
        view, the canonical correlations, and the weights of the second view.
    """
    model = rcca.CCA(kernelcca=False,
                     reg=self.reg,
                     numCC=self.n_components)
    model.train([self.X, self.Y])
    x_weights = model.ws[0]
    y_weights = model.ws[1]
    return x_weights, model.cancorrs, y_weights
def cca(training: List[EmbeddingCollection],
        test: List[EmbeddingCollection],
        ncomponents: int,
        reg=.01,
        verbose=False):
    """Applies CCA to extract canonical components.

    Args:
      training: A list where each item is a collection of `EmbeddingMatrix`
        objects used to train CCA.
      test: A list of the same size as `training` but where the collections
        are used for testing. May be None or empty to skip the test step.
      ncomponents: Number of canonical components.
      reg: Regularization parameter.
      verbose: Sets the pyrcca.CCA's verbosity.

    Returns:
      A tuple consisting of:
      - List[EmbeddingCollection] containing principal components for the
        training sets.
      - None or List[EmbeddingCollection] containing principal components for
        the test sets.
      - An `rcca.CCA` object.

    Raises:
      ValueError: If fewer than two training collections are given, if the
        collections differ in length, if any embedding matrix has `items`
        different from the first one of its set, or if `test` is non-empty
        and has a different number of collections than `training`.
    """
    # Validate the arguments.
    if len(training) < 2:
        raise ValueError('Expected at least 2 training collections')
    reference = training[0]
    reference_items = reference[0].items
    # The reference collection is checked as well, matching the test-set
    # validation below, which also covers its first collection.
    for collection in training:
        if len(collection) != len(reference):
            raise ValueError(
                'Number of embedding matrices within each collection is not consistent in the training set'
            )
        for em in collection:
            if em.items != reference_items:
                raise ValueError(
                    'Training embedding matrices do not have the same number of items'
                )

    has_test = test is not None and len(test) != 0
    if has_test:
        if len(test) != len(training):
            raise ValueError(
                'Number of collections must be the same in the training and test sets'
            )
        test_items = test[0][0].items
        for collection in test:
            for em in collection:
                if em.items != test_items:
                    raise ValueError(
                        'Test embedding matrices do not have the same number of items'
                    )

    _cca = rcca.CCA(kernelcca=False,
                    reg=reg,
                    numCC=ncomponents,
                    verbose=verbose)

    # Training.
    training_set, items = _cca_pack(training)
    _cca.train(training_set)
    training_cc_collection = _cca_unpack(training, _cca.comps, items)

    if not has_test:
        return training_cc_collection, None, _cca

    # Test: project every packed test view through its trained weights.
    test_set, items = _cca_pack(test)
    comps = np.array([np.dot(d, w) for d, w in zip(test_set, _cca.ws)])
    test_cc_collection = _cca_unpack(test, comps, items)

    return training_cc_collection, test_cc_collection, _cca
def fit_cca_loop(
        h_load_file: str,
        rescale: bool = False,
        save_file: str = None,
        **kwargs,
):
    """Grid-search linear-kernel CCA hyperparameters by validation correlation.

    For every cross-validation fold, number of canonical components,
    regularization value and cutoff, a CCA is trained on the centered
    training views and validated on the test views; the mean absolute
    validation correlation is appended to the results frame as ``pred_r``.

    Args:
        h_load_file: Forwarded to ``prepare_cca_data``.
        rescale: If true, the centered training views are additionally
            divided by ``sqrt(n_components)``.
        save_file: Result file name; defaults to ``results_cca_<now()>.df``.
        **kwargs: Overrides for keys of ``default_args``; unknown keys are
            ignored.

    Returns:
        Tuple ``(results, default_args)``.
    """
    save_file = 'results_cca_{}.df'.format(
        now()) if save_file is None else save_file

    default_args = {
        'min_nb_trials': 100,
        'target': True,
        'global_normalize': True,
        'augment_data': False,
        'xv_folds': 5,
        'time_range': range(45, 46),
        'num_ccs': np.arange(5, 91, 5),
        'cca_regs': np.logspace(-3, -1.5, num=20),
        'cutoffs': np.logspace(-18, -12, num=3),
    }
    # Only keys that already exist in default_args can be overridden.
    for k in default_args:
        if k in kwargs:
            default_args[k] = kwargs[k]

    results = pd.DataFrame()
    # This filter is installed on the warnings module and is not undone here.
    warnings.filterwarnings('ignore', category=RuntimeWarning)
    for fold in tqdm(range(default_args['xv_folds']), leave=False):
        data_trn, data_tst = prepare_cca_data(
            h_load_file=h_load_file,
            min_nb_trials=default_args['min_nb_trials'],
            target=default_args['target'],
            global_normalize=default_args['global_normalize'],
            augment_data=default_args['augment_data'],
            xv_folds=default_args['xv_folds'],
            which_fold=fold,
            time_range=default_args['time_range'],
            verbose=False,
        )
        # The labels are unpacked but not used in this function.
        train_list, y_trn = data_trn['processed'], data_trn['labels']
        test_list, y_tst = data_tst['processed'], data_tst['labels']

        for n_components in tqdm(default_args['num_ccs'], leave=False):
            # Each view is centered by its overall mean (a scalar).
            train_list_centered = [
                (item - item.mean()) /
                np.sqrt(n_components) if rescale else item - item.mean()
                for item in train_list
            ]
            for reg in tqdm(default_args['cca_regs'], leave=False):
                for cutoff in tqdm(default_args['cutoffs'], leave=False):
                    cca = rcca.CCA(
                        kernelcca=True,
                        ktype='linear',
                        numCC=n_components,
                        reg=reg,
                        cutoff=cutoff,
                        verbose=False,
                    )
                    cca.train(train_list_centered)

                    # Mean absolute validation correlation over all views.
                    testcorrs = cca.validate(test_list)
                    corrs = []
                    for item in testcorrs:
                        corrs.append(np.mean(np.abs(item)))
                    pred_r = np.mean(corrs)

                    data_dict = {
                        'fold': [fold],
                        'n_components': [n_components],
                        'cca_reg': [reg],
                        'cutoff': [cutoff],
                        'metric': ['pred_r'],
                        'value': [pred_r],
                    }
                    results = pd.concat(
                        [results, pd.DataFrame.from_dict(data_dict)])
        # NOTE(review): the nesting level of this intermediate save was
        # reconstructed from a whitespace-collapsed source - confirm.
        save_obj(obj=results,
                 file_name=save_file,
                 save_dir='./results',
                 mode='df',
                 verbose=False)

    results = reset_df(results, downcast='none')
    save_obj(obj=results,
             file_name=save_file,
             save_dir='./results',
             mode='df',
             verbose=True)
    # TODO: reimplement extract best hyperparams so that it works for this too
    return results, default_args
# For every orbit: load its dataset, split it in half, train a linear CCA on
# the first half and print the validation correlations on the second half.
# NOTE(review): the extent of the loop body was reconstructed from a
# whitespace-collapsed source; everything below is assumed to be inside it.
for index in range(SET_PARAMS.Number_of_multiple_orbits):
    Y, Y_buffer, X, X_buffer, Orbit = Dataset_order(
        index,
        direction,
        binary_set,
        buffer,
        categorical_num,
        use_previously_saved_models,
        columns_compare=["Earth x", "Earth y", "Earth z"],
        columns_compare_to=[
            "Angular momentum of wheels x", "Angular momentum of wheels y",
            "Angular momentum of wheels z"
        ])
    All_orbits.append(Orbit)

    # Split each dataset into two halves: training set and test set
    # (the split point is int(nSamples / 2); nSamples is defined elsewhere).
    train1 = Y[:int(nSamples / 2)]
    train2 = X[:int(nSamples / 2)]
    test1 = Y[int(nSamples / 2):]
    test2 = X[int(nSamples / 2):]

    # Create a cca object as an instantiation of the CCA object class.
    # No regularization (reg=0.) and two canonical components.
    cca = rcca.CCA(kernelcca=False, reg=0., numCC=2)

    # Use the train() method to find a CCA mapping between the two training sets.
    cca.train([train1, train2])

    # Use the validate() method to test how well the CCA mapping generalizes to the test data.
    # For each dimension in the test data, correlations between predicted and actual data are computed.
    testcorrs = cca.validate([test1, test2])
    print(testcorrs)
# Normalize a_matrix with preprocessing.normalize (v_matrix is used as it
# arrives here).
a_matrix = preprocessing.normalize(a_matrix)

train_v = np.array(v_matrix)
train_a = np.array(a_matrix)

print("Number of examples: ", len(train_v))

# The largest number of components tried is the smaller of the two column
# counts.
max_numCCs = np.shape(train_v)[1] if np.shape(train_v)[1] < np.shape(
    train_a)[1] else np.shape(train_a)[1]
# The sweep starts at 3 components; the list is empty if max_numCCs < 3.
numCCs = list(range(3, max_numCCs + 1))
reg_coeffs = [100, 10, 1, 0, 0.1, 0.01, 0.001, 0.0001, 0.00001]

print("START TRAINING")

# Train and save one model per (kernel, ktype, reg, numCC) combination.
# Kernel CCA is tried with the "poly" and "gaussian" kernels; linear CCA is
# run with ktype=None.
for kernel in [True, False]:
    ktypes = ["poly", "gaussian"] if kernel else [None]
    for ktype in ktypes:
        for coeff in reg_coeffs:
            for numCC in numCCs:
                cca = rcca.CCA(kernelcca=kernel,
                               numCC=numCC,
                               reg=coeff,
                               ktype=ktype)
                cca.train([train_v, train_a])
                # The file name encodes every swept setting; feat_type is
                # defined elsewhere.
                cca.save("TrainingModels/" + feat_type + "_" + str(kernel) +
                         "_" + str(ktype) + "_" + str(numCC) + "_" +
                         str(coeff) + ".hdf5")
                # print("Model: "+f_type+"_"+str(kernel)+"_"+str(ktype)+"_"+str(numCC)+"_"+str(coeff))
                # print("Cancorrs:", cca.cancorrs)
                # print("----------------------------------")