def main(params_file, output_dir, output_code, datasets, **kwargs):
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    data_for_spca, specimen_ids = ld.load_h5_data(
        "C:\\Users\\SMest\\fv_ nmmouse_human.h5",
        metadata_file=None,
        limit_to_cortical_layers=None,
        id_file=None,
        params_file="C:\\Users\\SMest\\source\\repos\\drcme\\drcme\\bin\\default_spca_params.json")
    imp = SimpleImputer(missing_values=0, strategy='mean', copy=False)
    for l, m in data_for_spca.items():
        if type(m) == np.ndarray:
            nu_m = np.nan_to_num(m)
            p = np.nonzero(nu_m[:, :])[1]
            p = max(p)
            nu_m = nu_m[:, :p]
            print(l)
            print(p)
            data_for_spca[l] = imp.fit_transform(nu_m)
            #data_for_spca[l] = nu_m
    data_objects.append(data_for_spca)
    specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    specimen_ids = np.hstack(specimen_ids_list)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error("Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(
            len(specimen_ids), data_for_spca[first_key].shape[0]))
    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results,
                os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(os.path.join(output_dir, "spca_components_used_{:s}.json".format(output_code)), "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
def main(params_file, output_dir, output_code, datasets, norm_type, **kwargs):
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = KNNImputer(copy=False)
    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]
        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if type(m) == np.ndarray:
                nu_m = m
                p = np.nonzero(nu_m[:, :])[1]
                p = max(p)
                print(l)
                print(p)
                data_for_spca[l] = nu_m
                if 'EXTRA' not in ds["fv_h5_file"]:
                    data_for_spca[l] = nu_m * -1
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                do[k] = normalize_ds(do[k], norm_type)
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k], do[k] = equal_ar_size(data_for_spca[k], do[k], k, i)
                do[k] = normalize_ds(do[k], norm_type)
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
            np.savetxt(output_fld + k + str(i) + '.csv', do[k], delimiter=",", fmt='%12.5f')
            np.savetxt(output_fld + k + str(i) + 'mean.csv',
                       np.vstack((np.nanmean(do[k], axis=0), np.nanstd(do[k], axis=0))),
                       delimiter=",", fmt='%12.5f')
    specimen_ids = np.hstack(specimen_ids_list)

    ### Now run through again and impute missing:
    for l in data_for_spca:
        nu_m = data_for_spca[l]
        nu_m = imp.fit_transform(nu_m)
        data_for_spca[l] = nu_m

    ## Outlier Elim?
    #specimen_ids, data_for_spca = outlierElim(specimen_ids, data_for_spca)
    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error("Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(
            len(specimen_ids), data_for_spca[first_key].shape[0]))
    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results,
                os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(os.path.join(output_dir, "spca_components_used_{:s}.json".format(output_code)), "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
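# The helpers normalize_ds() and equal_ar_size() used above are defined elsewhere in this
# module and are not shown here. The sketches below are hypothetical stand-ins that only
# illustrate the assumed call signatures (an array plus a norm_type code, and two feature
# blocks trimmed to compatible widths before vstack); the real helpers may use different
# normalization modes and trimming rules.
def _normalize_ds_sketch(arr, norm_type):
    """Hypothetical: 0 = no-op, 1 = per-column z-score, 2 = per-column min-max scaling."""
    if norm_type == 1:
        return (arr - np.nanmean(arr, axis=0)) / np.nanstd(arr, axis=0)
    if norm_type == 2:
        col_min = np.nanmin(arr, axis=0)
        return (arr - col_min) / (np.nanmax(arr, axis=0) - col_min)
    return arr


def _equal_ar_size_sketch(a, b, key, i):
    """Hypothetical: clip both blocks to the narrower column count so they can be stacked."""
    width = min(a.shape[1], b.shape[1])
    return a[:, :width], b[:, :width]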
def main(params_file, output_dir, output_code, datasets, norm_type, labels_file,
         spca_file, **kwargs):
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = KNNImputer(copy=False)
    pad_len = 0
    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]
        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if type(m) == np.ndarray:
                nu_m = m
                p = np.nonzero(nu_m[:, :])[1]
                p = max(p)
                print(l)
                print(p)
                # Track the widest feature block so shorter blocks can be zero-padded later
                if p > pad_len:
                    pad_len = p
                data_for_spca[l] = nu_m
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    HPARAMS = HParams()
    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                do[k] = norm.normalize_ds(do[k], norm_type)
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k], do[k] = equal_ar_size(data_for_spca[k], do[k], k, i)
                do[k] = norm.normalize_ds(do[k], norm_type)
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
            np.savetxt(output_fld + k + str(i) + '.csv', do[k], delimiter=",", fmt='%12.5f')
            np.savetxt(output_fld + k + str(i) + 'mean.csv',
                       np.vstack((np.nanmean(do[k], axis=0), np.nanstd(do[k], axis=0))),
                       delimiter=",", fmt='%12.5f')
    specimen_ids = np.hstack(specimen_ids_list)

    labels = pd.read_csv(labels_file, index_col=0)
    df_s = pd.read_csv(spca_file, index_col=0)
    #labels['0'] = labelnoise(df_s, labels)
    train_ind = np.where(labels['0'] > -1)[0]
    pred_ind = np.where(labels['0'] == -1)[0]
    train_id = specimen_ids[train_ind]
    pred_id = specimen_ids[pred_ind]
    train_label = labels.iloc[train_ind]
    pred_label = labels.iloc[pred_ind]

    ### Now run through again and impute missing:
    train_data = {}
    pred_data = {}
    for l in data_for_spca:
        nu_m = data_for_spca[l]
        nu_m = imp.fit_transform(nu_m)
        if nu_m.shape[1] < pad_len:
            pad_wid = (pad_len - nu_m.shape[1]) + 1
            nu_m = np.hstack((nu_m, np.zeros((nu_m.shape[0], pad_wid))))
        train_data[l] = nu_m[train_ind]
        pred_data[l] = nu_m[pred_ind]
        data_for_spca[l] = nu_m

    ## Outlier Elim?
    #specimen_ids, data_for_spca = outlierElim(specimen_ids, data_for_spca)

    ## Form our datasets for training
    HPARAMS.input_shape = [pad_len + 1, 1, len(data_for_spca)]
    full_data = np.hstack([data_for_spca[i] for i in sorted(data_for_spca.keys())])
    train_data = np.hstack([train_data[i] for i in sorted(train_data.keys())])
    pred_data = np.hstack([pred_data[i] for i in sorted(pred_data.keys())])
    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error("Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(
            len(specimen_ids), data_for_spca[first_key].shape[0]))

    ## Write the data to a record for use with 'graph params'
    writer = tf.io.TFRecordWriter(output_dir + 'train_data.tfr')
    for id, data, label in zip(train_id, train_data, train_label.values):
        example = nsl_tools.create_example(data, label, id)
        writer.write(example.SerializeToString())
    writer.close()
    writer = tf.io.TFRecordWriter(output_dir + 'pred_data.tfr')
    for id, data, label in zip(specimen_ids, full_data, labels.values):
        example = nsl_tools.create_example(data, label, id)
        writer.write(example.SerializeToString())
    writer.close()
    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Define the Keras TensorBoard callback.
    logdir = os.path.join("logs", "fit", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    # Unlabeled cells (-1) get a placeholder class one past the largest real label
    full_labels = np.where(labels.values != -1, labels.values,
                           (np.unique(labels.values)[-1] + 1))

    # Split into validation / test / train datasets
    train_dataset = tf.data.Dataset.from_tensor_slices({
        'waves': train_data,
        'label': np.ravel(train_label.values)
    }).shuffle(2000).batch(HPARAMS.batch_size)
    train_size = train_data.shape[0] // HPARAMS.batch_size
    test_fraction = 0.3
    test_size = int(test_fraction * train_size)
    test_dataset = train_dataset.take(test_size)
    train_dataset = train_dataset.skip(test_size)
    train_size = train_size - test_size
    validation_fraction = 0.6
    validation_size = int(validation_fraction * train_size)
    train_size = train_size - validation_size
    print('taking val: ' + str(validation_size) + ' test: ' + str(test_size) +
          ' train: ' + str(train_size))
    validation_dataset = train_dataset.take(validation_size)
    train_dataset = train_dataset.skip(validation_size)

    nsl_tools.HPARAMS.max_seq_length = train_data.shape[1]
    base_model = nsl_tools.build_base_model()

    # Wrap the model with adversarial regularization.
    adv_config = nsl.configs.make_adv_reg_config(multiplier=0.2, adv_step_size=0.05)
    adv_model = nsl.keras.AdversarialRegularization(base_model, adv_config=adv_config)

    # Compile, train, and evaluate.
    adv_model.compile(optimizer='adam',
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                      metrics=['accuracy'])
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
    history = adv_model.fit(train_dataset, validation_data=validation_dataset, epochs=15)
    print('### FIT COMPLETE ### TESTING')
    acc = adv_model.evaluate(test_dataset, verbose=1)
    np.savetxt('full_acc.csv', acc, delimiter=",", fmt='%12.5f')
    pred_labels_prob = adv_model.predict({'waves': full_data, 'label': full_labels})
    pred_labels = np.argmax(pred_labels_prob, axis=1)

    logging.info("Saving results...")
    labels['0'] = pred_labels
    labels.to_csv(output_code + '_NSL_pred_adv_learn.csv')

    ##### GRAPH NETWORK
    ###nsl_tools.save_for_gam(full_data, full_labels)
    #nsl_tools.build_graph(df_s, output_dir + 'embed.tsv')
    #spca_pack_nbrs.pack_nbrs(
    #    output_dir + '/train_data.tfr',
    #    output_dir + '/pred_data.tfr',
    #    output_dir + 'embed.tsv',
    #    output_dir + '/nsl_train_data.tfr',
    #    add_undirected_edges=True,
    #    max_nbrs=6)
    #predictions = nsl_tools.graph_nsl(output_dir + '/nsl_train_data.tfr', output_dir + '/pred_data.tfr', train_data)
    #pred_labels = np.argmax(predictions, axis=1)
    #logging.info("Saving results...")
    #labels['0'] = pred_labels
    #labels.to_csv(output_code + '_NSL_pred_graph_learn.csv')
    logging.info("Done.")
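# nsl_tools.build_base_model() is not shown in this file. The sketch below is a hypothetical
# stand-in (layer sizes invented) showing the kind of model AdversarialRegularization expects:
# batches arrive as dicts, the wrapper pops the 'label' key (its default label key) and
# perturbs the remaining 'waves' features, so the base model simply maps the flat feature
# vector to class probabilities.
def _build_base_model_sketch(max_seq_length, n_classes):
    return tf.keras.Sequential([
        tf.keras.layers.Input(shape=(max_seq_length,), name='waves'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(n_classes, activation='softmax'),
    ])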
def main(params_file, output_dir, output_code, datasets, norm_type, labels_file,
         spca_file, **kwargs):
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = SimpleImputer(missing_values=0, strategy='mean', copy=False)
    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]
        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if type(m) == np.ndarray:
                nu_m = np.nan_to_num(m)
                p = np.nonzero(nu_m[:, :])[1]
                p = max(p)
                nu_m = nu_m[:, :p]
                print(l)
                print(p)
                nu_m = imp.fit_transform(nu_m)
                data_for_spca[l] = normalize_ds(nu_m, norm_type)
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)
    specimen_ids = np.hstack(specimen_ids_list)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k], do[k] = equal_ar_size(data_for_spca[k], do[k], k, i)
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])

    ## Outlier Elim?
    #specimen_ids, data_for_spca = outlierElim(specimen_ids, data_for_spca)
    df_s = pd.read_csv(spca_file, index_col=0)
    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error("Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(
            len(specimen_ids), data_for_spca[first_key].shape[0]))

    labels = pd.read_csv(labels_file, index_col=0)
    print(labels)
    print(labels.values)
    uni_labels = np.unique(labels.values)
    ids_list = labels.index.values
    if labels.shape[0] == ids_list.shape[0]:
        print("Same Ids loaded... Proceeding")
    logging.info("Proceeding with %d cells", len(specimen_ids))

    for p in data_for_spca:
        labels_means = pd.DataFrame()
        arr_data = data_for_spca[p]
        for x in uni_labels:
            indx = np.where(labels['0'] == x)[0]
            row, col = arr_data[indx].shape
            n_co = np.full(col, row)
            mean = pd.Series(data=np.mean(arr_data[indx], axis=0),
                             name=('Cluster ' + str(x) + ' mean'))
            std = pd.Series(data=np.std(arr_data[indx], axis=0),
                            name=('Cluster ' + str(x) + ' std'))
            n = pd.Series(data=n_co, name=('Cluster ' + str(x) + ' n'))
            labels_means = labels_means.append(mean, ignore_index=True)
            labels_means = labels_means.append(std, ignore_index=True)
            labels_means = labels_means.append(n, ignore_index=True)
        labels_means.to_csv(output_fld + p + '_cluster_mean.csv')

    train_df, test_df, labels_2, _ = train_test_split(df_s, labels)
    rf = RandomForestClassifier(n_estimators=500, oob_score=True, random_state=0)
    #per = multiclass.OneVsOneClassifier(RandomForestClassifier(n_estimators=500, oob_score=True,
    #                                    random_state=0), n_jobs=-1).fit(train_df.values, labels.to_numpy().flatten())
    rf.fit(train_df.values, labels_2.to_numpy().flatten())
    logging.info("OOB score: {:f}".format(rf.oob_score_))
    pred_labels = rf.predict(test_df.values)
    feat_import = rf.feature_importances_
    print(rf.oob_score_)
    logging.debug("Saving results")
    #pd.DataFrame(pred_labels, index=test_df.index.values).to_csv('rf_predictions.csv')
    pd.DataFrame(feat_import).to_csv('rf_feat_importance.csv')

    ### Now compute for labeled data
    train_ind = np.where(labels['0'] > -1)[0]
    labeled = labels.iloc[train_ind]
    labeled_df_s = df_s.iloc[train_ind]
    train_df, test_df, labels_2, labels_3 = train_test_split(labeled_df_s, labeled)
    clf1 = LogisticRegression(random_state=1, max_iter=1000)
    clf2 = RandomForestClassifier(n_estimators=500, random_state=1)
    clf3 = GaussianNB()
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                            voting='soft', n_jobs=-1)
    eclf.fit(train_df, np.ravel(labels_2.values))
    fit_score = eclf.score(test_df, np.ravel(labels_3.values))
    print(fit_score)

    params = {
        'lr__C': np.linspace(1.0, 1000.0, 10),
        'rf__n_estimators': np.linspace(20, 1000, 10, dtype=np.int64)
    }
    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5, n_jobs=-1, verbose=1)
    grid.fit(train_df, np.ravel(labels_2.values))
    fit_score = grid.score(test_df, np.ravel(labels_3.values))
    print("grid search params")
    print(fit_score)
    grid_CV = grid.best_estimator_

    full_acc = np.arange(15, dtype=np.float64)
    PARAMS = grid.best_estimator_
    for i, a in enumerate(full_acc):
        train_df, test_df, labels_2, labels_3 = train_test_split(labeled_df_s, labeled,
                                                                 test_size=0.6,
                                                                 train_size=0.28)
        clf1 = LogisticRegression(random_state=1, max_iter=1000)
        clf2 = RandomForestClassifier(n_estimators=500, random_state=1)
        clf3 = GaussianNB()
        eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                                voting='soft', n_jobs=-1)
        eclf.fit(train_df, np.ravel(labels_2.values))
        full_acc[i] = eclf.score(test_df, np.ravel(labels_3.values))
    np.savetxt('full_acc.csv', full_acc, delimiter=",", fmt='%12.5f')

    _, _, pvalue = permutation_test_score(grid_CV, train_df, np.ravel(labels_2.values),
                                          n_jobs=-1)
    print("pvalue: " + str(pvalue))

    fclf = OneVsOneClassifier(grid, n_jobs=-1)
    fclf.fit(train_df, np.ravel(labels_2.values))
    fit_score = fclf.score(test_df, np.ravel(labels_3.values))
    y_pred = fclf.predict(test_df)
    print(fit_score)
    print(metrics.classification_report(y_pred, np.ravel(labels_3.values)))
    pred_labels = fclf.predict(df_s.values)
    pd.DataFrame(pred_labels, index=df_s.index.values).to_csv('full_predictions.csv')

    feat_import_by_label = np.hstack((0, np.full(feat_import.shape[0], np.nan)))
    for i in permutations(uni_labels, 2):
        indx_1 = np.where((labels['0'] == i[0]))[0]
        indx_2 = np.where((labels['0'] == i[1]))[0]
        indx = np.hstack((indx_1, indx_2))
        if indx.shape[0] >= 100:
            print(indx.shape[0])
            df_s_temp = df_s.iloc[indx]
            labels_s_temp = labels.iloc[indx]
            train_df, test_df, labels_2, _ = train_test_split(df_s_temp, labels_s_temp)
            rf = RandomForestClassifier(n_estimators=500, oob_score=True, random_state=0)
            #per = multiclass.OneVsOneClassifier(RandomForestClassifier(n_estimators=500, oob_score=True,
            #                                    random_state=0), n_jobs=-1).fit(train_df.values, labels.to_numpy().flatten())
            rf.fit(train_df.values, labels_2.to_numpy().flatten())
            logging.info("OOB score: {:f}".format(rf.oob_score_))
            pred_labels = rf.predict(test_df.values)
            feat_import = rf.feature_importances_
            print(str(i) + ' ' + str(rf.oob_score_))
            logging.debug("Saving results")
            feat_import_by_label = np.vstack(
                (feat_import_by_label, np.hstack((str(i), np.ravel(feat_import)))))
            del rf
    pd.DataFrame(feat_import_by_label).to_csv(output_fld + 'label_rf_feat_importance.csv')
    logging.info("Done.")
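# The grid-search parameter names above follow scikit-learn's nested "estimator__parameter"
# convention: 'lr__C' tunes C on the VotingClassifier's 'lr' member and 'rf__n_estimators'
# tunes the forest size on its 'rf' member. A compact, self-contained illustration on
# synthetic toy data (for illustration only, not part of the analysis):
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

X_toy, y_toy = make_classification(n_samples=200, n_features=10, random_state=0)
voter = VotingClassifier(estimators=[('lr', LogisticRegression(max_iter=1000)),
                                     ('rf', RandomForestClassifier(random_state=1)),
                                     ('gnb', GaussianNB())],
                         voting='soft')
search = GridSearchCV(voter,
                      param_grid={'lr__C': [0.1, 1.0, 10.0],
                                  'rf__n_estimators': [50, 100]},
                      cv=3)
search.fit(X_toy, y_toy)
print(search.best_params_, search.best_score_)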
def main(params_file, output_dir, output_code, datasets, norm_type, **kwargs):
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = KNNImputer(copy=False)
    dataset_no = []
    for i, ds in enumerate(datasets):
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]
        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if type(m) == np.ndarray:
                nu_m = m
                p = np.nonzero(nu_m[:, :])[1]
                p = max(p)
                filename = ds["fv_h5_file"]
                if 'INTRA' not in filename:
                    nu_m = nu_m[:, 30:-1]
                #else:
                #    nu_m = nu_m * -1
                print(l)
                print(p)
                data_for_spca[l] = nu_m[:, 94:]
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)
        dataset_no = np.hstack((dataset_no, np.full(specimen_ids.shape[0], i)))

    truncate = []
    for i, do in enumerate(data_objects[0]):
        ### FIND THE ARGMIN
        argmin = []
        for l in np.arange(len(data_objects)):
            argmin = np.hstack((argmin,
                                np.nanargmin(norm.normalize_ds(data_objects[l][do], 4))))

    data_for_spca = {}
    data_for_spca_nonorm = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca_nonorm[k] = do[k]
                do[k] = norm.normalize_ds(do[k], norm_type)
                data_for_spca[k] = do[k]
            else:
                #data_for_spca_nonorm[k], do[k] = norm.center_on_m(data_for_spca_nonorm[k], do[k])
                #data_for_spca[k], do[k] = norm.equal_ar_size(data_for_spca[k], do[k])
                #data_for_spca_nonorm[k], do[k] = equal_ar_size(data_for_spca_nonorm[k], do[k], k, i)
                #data_for_spca[k] = norm.normalize_ds(data_for_spca[k], norm_type)
                #_, do[k] = norm.shift_means(data_for_spca_nonorm[k], do[k])
                #data_for_spca_nonorm[k] = np.vstack([data_for_spca_nonorm[k], do[k]])
                do[k] = norm.normalize_ds(do[k], norm_type)
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
            np.savetxt(output_fld + k + str(i) + '.csv', do[k], delimiter=",", fmt='%12.5f')
            np.savetxt(output_fld + k + str(i) + 'mean.csv',
                       np.vstack((np.nanmean(do[k], axis=0), np.nanstd(do[k], axis=0))),
                       delimiter=",", fmt='%12.5f')
    specimen_ids = np.hstack(specimen_ids_list)

    ### Now run through again and impute missing:
    for l in data_for_spca:
        nu_m = data_for_spca[l]
        nu_m = imp.fit_transform(nu_m)
        nu_m = norm.normalize_ds(nu_m, 1)
        data_for_spca[l] = nu_m

    ## Outlier Elim?
    #specimen_ids, data_for_spca = outlierElim(specimen_ids, data_for_spca)
    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error("Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(
            len(specimen_ids), data_for_spca[first_key].shape[0]))
    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results,
                os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))

    # Embed the sPCA components with UMAP and color points by source dataset
    row = int(len(combo_df.index) / 2)
    df_2 = combo_df.iloc[row:]
    df_1 = combo_df.iloc[:row]
    _df = umap.combined_umap(df_1, df_2)
    cmap = cm.get_cmap('tab10')
    _df.plot.scatter(x='x', y='y', c=dataset_no, cmap=cmap)
    plt.show()
    _df.to_csv(output_dir + 'umap_' + output_code + '.csv')

    with open(os.path.join(output_dir, "spca_components_used_{:s}.json".format(output_code)), "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
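# umap.combined_umap() is a project helper whose implementation is not shown here. Assuming
# it embeds the two halves of the component matrix into one shared 2-D space and returns a
# DataFrame with 'x' and 'y' columns (which the scatter call above relies on), a rough
# stand-in using the umap-learn package could look like the hypothetical sketch below:
import umap as umap_learn  # umap-learn package, aliased to avoid clashing with the helper

def _combined_umap_sketch(df_1, df_2, random_state=0):
    reducer = umap_learn.UMAP(n_components=2, random_state=random_state)
    reducer.fit(df_1.values)  # learn the embedding on the first half
    emb = reducer.transform(pd.concat([df_1, df_2]).values)  # project both halves into it
    return pd.DataFrame(emb, columns=['x', 'y'],
                        index=np.concatenate([df_1.index.values, df_2.index.values]))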
def main(orig_transform_file, orig_datasets, new_datasets, params_file, output_file,
         use_noise, **kwargs):
    spca_zht_params, _ = ld.define_spca_parameters(params_file)
    spca_results = joblib.load(orig_transform_file)
    imp = SimpleImputer(missing_values=0, strategy='mean', copy=False)

    # These arguments should be parameterized
    orig_data_objects = []
    orig_specimen_ids_list = []
    for ds in orig_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]
        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if type(m) == np.ndarray:
                nu_m = np.nan_to_num(m)
                p = np.nonzero(nu_m[:, :])[1]
                p = max(p)
                nu_m = nu_m[:, :p]
                print(l)
                print(p)
                nu_m = imp.fit_transform(nu_m)
                data_for_spca[l] = nu_m
        orig_data_objects.append(data_for_spca)
        orig_specimen_ids_list.append(specimen_ids)

    orig_data_for_spca = {}
    for i, do in enumerate(orig_data_objects):
        for k in do:
            if k not in orig_data_for_spca:
                orig_data_for_spca[k] = do[k]
            else:
                orig_data_for_spca[k] = np.vstack([orig_data_for_spca[k], do[k]])
    orig_specimen_ids = np.hstack(orig_specimen_ids_list)
    logging.info("Original datasets had {:d} cells".format(len(orig_specimen_ids)))

    orig_mean, orig_std = orig_mean_and_std_for_zscore_h5(spca_results, orig_data_for_spca,
                                                          spca_zht_params)

    new_data_objects = []
    new_specimen_ids_list = []
    for ds in new_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]
        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if type(m) == np.ndarray:
                nu_m = np.nan_to_num(m)
                p = np.nonzero(nu_m[:, :])[1]
                p = max(p)
                nu_m = nu_m[:, :p]
                print(l)
                print(p)
                nu_m = imp.fit_transform(nu_m)
                data_for_spca[l] = nu_m
        new_data_objects.append(data_for_spca)
        new_specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(new_data_objects):
        for k in do:
            if k not in data_for_spca:
                _, do[k] = equal_ar_size(orig_data_for_spca[k], do[k], k, i)
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    new_ids = np.hstack(new_specimen_ids_list)

    logging.info("Applying transform to {:d} new cells".format(len(new_ids)))
    new_combo = spca_transform_new_data_h5(spca_results, data_for_spca, spca_zht_params,
                                           orig_mean, orig_std)
    new_combo_df = pd.DataFrame(new_combo, index=new_ids)
    new_combo_df.to_csv(output_file)
def main(params_file, output_dir, output_code, datasets, **kwargs):
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]
        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    specimen_ids = np.hstack(specimen_ids_list)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error("Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(
            len(specimen_ids), data_for_spca[first_key].shape[0]))
    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results,
                os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(os.path.join(output_dir, "spca_components_used_{:s}.json".format(output_code)), "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
def main(orig_transform_file, orig_datasets, new_datasets, params_file, output_file, **kwargs):
    """ Main runner function for script.

    See :class:`SpcaTransformParameters` for argument descriptions.
    """
    spca_zht_params, _ = ld.define_spca_parameters(params_file)
    spca_results = joblib.load(orig_transform_file)

    # Load original data sets
    orig_data_objects = []
    orig_specimen_ids_list = []
    for ds in orig_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]
        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            need_ramp_spike=ds["need_ramp_spike"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        orig_data_objects.append(data_for_spca)
        orig_specimen_ids_list.append(specimen_ids)

    # Must be a dict so feature blocks can be keyed and stacked
    orig_data_for_spca = {}
    for i, do in enumerate(orig_data_objects):
        for k in do:
            if k not in orig_data_for_spca:
                orig_data_for_spca[k] = do[k]
            else:
                orig_data_for_spca[k] = np.vstack([orig_data_for_spca[k], do[k]])
    orig_specimen_ids = np.hstack(orig_specimen_ids_list)
    logging.info("Original datasets had {:d} cells".format(len(orig_specimen_ids)))

    orig_mean, orig_std = orig_mean_and_std_for_zscore(spca_results, orig_data_for_spca,
                                                       spca_zht_params)

    new_data_objects = []
    new_specimen_ids_list = []
    for ds in new_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]
        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            need_ramp_spike=ds["need_ramp_spike"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        new_data_objects.append(data_for_spca)
        new_specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(new_data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    new_ids = np.hstack(new_specimen_ids_list)

    logging.info("Applying transform to {:d} new cells".format(len(new_ids)))
    new_combo = spca_transform_new_data(spca_results, data_for_spca, spca_zht_params,
                                        orig_mean, orig_std)
    new_combo_df = pd.DataFrame(new_combo, index=new_ids)
    new_combo_df.to_csv(output_file)
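# Each of these main() functions expects `datasets` (or `orig_datasets` / `new_datasets`) to
# be a list of dicts carrying the keys accessed above. The entry below is an illustrative
# placeholder (file names and field values are hypothetical, not from the real analysis):
example_datasets = [
    {
        "fv_h5_file": "fv_dataset_1.h5",   # feature-vector HDF5 file
        "metadata_file": None,             # optional metadata file
        "dendrite_type": "all",            # assumed values: e.g. "all", "spiny", "aspiny"
        "allow_missing_structure": False,
        "allow_missing_dendrite": False,
        "need_ramp_spike": True,           # only read by the transform script above
        "limit_to_cortical_layers": [],    # empty list -> no layer restriction
        "id_file": None,                   # optional file listing specimen IDs to keep
    },
]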