import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# read_dataset() and read_embeddings() are project-local helpers and are
# assumed to be imported alongside the functions below.


def plot_avg_similarities(dataset_name, save_file=False):
    """Bar plot of the mean ground-truth and per-embedding cosine similarities."""
    paths = [path for path in os.listdir('results/similarity')
             if path.startswith(dataset_name)]
    # First bar: the ground-truth ('GT') similarity from the dataset itself.
    values = [np.mean(read_dataset(dataset_name)['gt_sim'].values)]
    embeddings = ['GT']
    for path in paths:
        values.append(np.nanmean(
            pd.read_csv(f'results/similarity/{path}')['cosine_sim'].values))
        # Build a short label from the underscore-separated file name parts;
        # numberbatch files carry the corpus in a different position.
        emb_name = path.split('_')
        if emb_name[1] == 'numberbatch':
            embeddings.append(
                f'{emb_name[2][0].upper()}-{emb_name[1][0].upper()}-{emb_name[3]}')
        else:
            embeddings.append(
                f'{emb_name[1][0].upper()}-{emb_name[2][0].upper()}-{emb_name[3]}')
    data = pd.DataFrame()
    data['embeddings'] = embeddings
    data['similarities'] = values
    sns.set(style='darkgrid', context='poster', font='Verdana')
    f, ax = plt.subplots()
    sns.barplot(x='embeddings', y='similarities', ax=ax, data=data)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=75)
    ax.axhline(0, color='k', clip_on=False)
    plt.ylim(0, 10)
    # Annotate each bar with its value, rotated so neighbouring labels fit.
    for bar, value in zip(ax.patches, data['similarities'].values):
        text_x = bar.get_x() + bar.get_width() / 2.0
        text_y = bar.get_height() + 0.025
        text = f'{round(value, 5)}'
        ax.text(text_x, text_y, text, fontsize=20, ha='center', va='bottom',
                rotation=90, color='k')
    sns.despine(bottom=True)
    plt.title(dataset_name)
    if save_file:
        figure = plt.gcf()
        figure.set_size_inches(10, 8)
        plt.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.3)
        plt.savefig(f'results/img/{dataset_name}_avg_sim.png')
    else:
        plt.show()
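
# Usage sketch ('men' is only an example dataset name; any dataset with a
# gt_sim column and matching CSVs under results/similarity/ works):
#
#     plot_avg_similarities('men', save_file=True)
#
# With save_file=True the figure is written to results/img/ instead of being
# shown interactively.
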
def calculate_cosine_similarity(dataset_name, emb_name, emb_type, emb_size):
    """Score every word pair of a dataset with the embedding's cosine similarity."""
    cosine = list()
    dataset = read_dataset(dataset_name)
    embeddings = read_embeddings(dataset_name, emb_name, emb_type, emb_size)
    for _, row in dataset.iterrows():
        word1, word2 = row['word1'].lower(), row['word2'].lower()
        if word1 in embeddings and word2 in embeddings:
            vec1 = embeddings[word1]
            vec2 = embeddings[word2]
            # Scale from [-1, 1] to the 0-10 range of the gt_sim ratings.
            cosine.append(
                round(cosine_similarity([vec1], [vec2])[0][0] * 10, 2))
        else:
            # Out-of-vocabulary pair: keep the row, but mark it as missing.
            cosine.append(None)
    dataset['cosine_sim'] = cosine
    dataset.to_csv(
        f'results/similarity/{dataset_name}_{emb_name}_{emb_type}_{emb_size}_cosine.csv',
        index=False)
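
# Usage sketch (the embedding identifiers are examples; they just need to
# match what read_embeddings expects):
#
#     calculate_cosine_similarity('men', 'glove', 'wiki', '300')
#
# Cosine similarity lies in [-1, 1]; scaling by 10 puts it roughly on the
# 0-10 scale of the gt_sim ratings, and out-of-vocabulary pairs are stored
# as None so that np.nanmean can skip them when averaging.
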
def plot_similarity(dataset_name, embeddings):
    """Line plot comparing per-pair gt_sim against each embedding's cosine_sim."""
    dataset = read_dataset(dataset_name)
    data = pd.DataFrame()
    # Long-format columns: pair index, embedding label, similarity value.
    pairs = list(range(len(dataset['gt_sim'])))
    embedding_names = ['gt_similarity'] * len(dataset['gt_sim'])
    similarities = list(dataset['gt_sim'].values)
    for embedding in embeddings:
        dataset = pd.read_csv(
            f'results/similarity/{dataset_name}_{embedding}_cosine.csv')
        pairs += list(range(len(dataset['cosine_sim'])))
        embedding_names += [embedding] * len(dataset['cosine_sim'])
        similarities += list(dataset['cosine_sim'].values)
    data['pairs'] = pairs
    data['embeddings'] = embedding_names
    data['similarities'] = similarities
    sns.set(style='darkgrid', context='poster', font='Verdana', font_scale=0.5)
    sns.lineplot(x='pairs', y='similarities', hue='embeddings',
                 style='embeddings', dashes=False, data=data)
    plt.show()
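
# Usage sketch; each list entry must match the '{emb_name}_{emb_type}_{emb_size}'
# part of a CSV written by calculate_cosine_similarity (example names only):
#
#     plot_similarity('men', ['glove_wiki_300', 'numberbatch_en_300'])
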
def process_dataset(self):
    if self.dataset:
        logging.debug("Processing dataset.")
        X, Y = preprocess.read_dataset(self.dataset, balanced=True)
        print(X.shape)
        Y = to_categorical(Y)
        logging.debug("X example: %s\ny example: %s", X[0], Y[0])
        X_train, X_val, X_test, y_train, y_val, y_test = \
            preprocess.split_dataset(X, Y)
        self.num_steps = X.shape[1]
    elif self.train_path:
        # Train/validation come from the training file; the test set is
        # read separately from test_path.
        X_train, y_train = preprocess.read_set(self.train_path)
        X_train, X_val, y_train, y_val = preprocess.split_dataset(
            X_train, y_train, test_size=0.2, validation=False)
        X_test, y_test = preprocess.read_set(self.test_path)
        self.num_steps = X_train.shape[1]
    else:
        # Without this guard the return below would raise UnboundLocalError.
        raise ValueError('Either self.dataset or self.train_path must be set.')
    return X_train, X_val, X_test, y_train, y_val, y_test
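
# Call sketch (hypothetical -- assumes an instance with either `dataset` or
# `train_path`/`test_path` configured, per the branches above):
#
#     X_train, X_val, X_test, y_train, y_val, y_test = model.process_dataset()
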
def anta_normalize(x, y):
    """Preprocess an scRNA-seq read-count matrix into a normalized AnnData."""
    y = y.astype(np.int32)
    adata = sc.AnnData(x)
    adata.obs['Group'] = y
    adata = read_dataset(adata, transpose=False, test_split=False, copy=True)
    # Library-size factors, input scaling and log1p transform.
    adata = process_normalize(adata, size_factors=True, normalize_input=True,
                              logtrans_input=True)
    print(adata.X.shape)
    x_sd = adata.X.std(0)
    x_sd_median = np.median(x_sd)
    print("median of gene sd: %.5f" % x_sd_median)
    x = adata.X.astype(np.float32)
    # The raw (pre-normalization) counts are returned alongside the
    # normalized matrix.
    raw_data = adata.raw.X
    return x, y, adata.obs.size_factors, raw_data
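
# Usage sketch, mirroring the script below (the HDF5 path is an example
# only; the file must hold an 'X' count matrix and a 'Y' label vector):
#
#     data_mat = h5py.File('data/counts.h5', 'r')
#     x, y, size_factors, raw_counts = anta_normalize(
#         np.array(data_mat['X']), np.array(data_mat['Y']))
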
args = parser.parse_args()

optimizer1 = Adam(amsgrad=True)
optimizer2 = 'adadelta'

# load dataset
data_mat = h5py.File(args.data_file, 'r')
x = np.array(data_mat['X'])
y = np.array(data_mat['Y'])
adata = sc.AnnData(x)
adata.obs['Group'] = y
adata = read_dataset(adata, transpose=False, test_split=False, copy=True)
adata = normalize(adata, size_factors=True, normalize_input=True,
                  logtrans_input=True)
input_size = adata.n_vars
print(adata.X.shape)
print(y.shape)
x_sd = adata.X.std(0)
x_sd_median = np.median(x_sd)
print("median of gene sd: %.5f" % x_sd_median)
def process_dataset(self):
    X, Y = preprocess.read_dataset(self.dataset, balanced=True)
    return preprocess.split_dataset(X, Y, validation=False)
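
# Leaner variant: with validation=False, split_dataset returns a single
# train/test split -- four arrays, judging from its use in the class above:
#
#     X_train, X_test, y_train, y_test = model.process_dataset()
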
p_train = 0.8
np.random.seed(42)
filename = 'data_info.csv'
add_dir(save_dir_public_train)
add_dir(save_dir_public_test)
add_dir(save_dir_private_train)
add_dir(save_dir_private_test)

# dataset_1 is already public (the old atlas dataset): all of it must stay
# public. The new datasets are private and may be split between the private
# and public pools.
atlas_dataset = 'dataset_1'
new_datasets = ['dataset_4', 'dataset_5']

# read datasets
data_info_atlas = read_dataset(atlas_dataset)
assert data_info_atlas is not None
file_list_atlas = list_files(data_info_atlas)
file_list_new = []
for dataset in new_datasets:
    data_info_new = read_dataset(dataset)
    assert data_info_new is not None
    file_list_new.extend(list_files(data_info_new))
n_atlas = len(file_list_atlas)
n_new = len(file_list_new)
n_total = n_atlas + n_new

# calculate the number of samples required in each dataset: at least 60% of
# all files go public, but never fewer than the (already public) atlas files
n_public = max(int(0.6 * n_total), n_atlas)
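
# Worked example of the quota arithmetic (toy numbers): with n_atlas = 500
# and n_new = 300, n_total = 800 and int(0.6 * n_total) = 480, so
# n_public = max(480, 500) = 500 -- the already-public atlas files alone
# satisfy the 60% public quota and no new-dataset files need to go public.
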
    save_dir = os.path.join(save_dir, 'public')
else:
    dataset_name = 'dataset_2'
    filename = 'private.csv'
    save_dir = os.path.join(save_dir, 'private')
if save_preprocessed_data:
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
path_analysis = 'data/data_analysis'
path_raw = os.path.join(path_analysis, filename)
if not os.path.exists(path_analysis):
    os.mkdir(path_analysis)
data_info = read_dataset(dataset_name)
assert data_info is not None
file_list = list_files(data_info)

# columns of the per-file analysis table
column_names = [
    'RawPath', 'T1_filename', 'n_lesions', 'RawSize_x', 'RawSize_y',
    'RawSize_z', 'RawLesionSize', 'AverageGrey'
]
if save_preprocessed_data:
    column_names.extend(
        ['NewPath', 'NewT1_name', 'NewMask_name', 'NewAverageGrey'])
data = []
idx = 1
if not len(file_list):
    print(f'No data files found in {data_info["raw_dir"]}')