import os
from glob import glob
import numpy as np
from tqdm import tqdm
from sklearn.utils import shuffle

# Loop over the word-embedding models, running one encoding analysis per embedding.
for word2vec_vec, word2vec_name in zip(word2vec_vecs, word2vec_names):
    csv_filename = os.path.join(
        saving_dir,
        '{} {} {} {} {} {}.csv'.format(
            experiment, here, sub_name, roi_name, condition, word2vec_name))
    processed = glob(os.path.join(saving_dir, '*.csv'))
    if csv_filename in processed:
        # Skip results that were already computed in a previous run.
        print(csv_filename)
    else:
        if n_splits >= 100:
            # Custom partitioning: hold out whole (id, words) groups per fold.
            idxs_test = utils.customized_partition(df_data, ['id', 'words'], n_splits=n_splits)
            while utils.check_train_test_splits(idxs_test):
                # Re-partition until the test folds pass the sanity check.
                idxs_test = utils.customized_partition(df_data, ['id', 'words'], n_splits=n_splits)
            # Training indices for each fold are everything not held out in that fold.
            idxs_train = [shuffle(np.array([idx for idx in np.arange(df_data.shape[0])
                                            if idx not in idx_test]))
                          for idx_test in idxs_test]
            # idxs_train = [utils.check_train_balance(df_data, idx_train, list(label_map.keys()))
            #               for idx_train in tqdm(idxs_train)]
            cv = zip(idxs_train, idxs_test)
        else:
            # Fewer folds: fall back to scikit-learn's group-aware shuffle split.
            from sklearn.model_selection import GroupShuffleSplit
            cv = GroupShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=12345)
            idxs_train, idxs_test = [], []
            for idx_train, idx_test in cv.split(BOLD, targets, groups=groups):
                idxs_train.append(idx_train)
                idxs_test.append(idx_test)
        # Look up the embedding vector of each stimulus word (case-insensitive).
        embedding_features = np.array([word2vec_vec[word.lower()] for word in df_data['words']])
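# `utils.customized_partition` and `utils.check_train_test_splits` are project
# helpers whose source is not shown in this fragment. Below is a minimal sketch
# of the behavior the calling code appears to assume (group-level test folds
# plus a duplicate-fold check), assuming `df_data` is a pandas DataFrame. This
# is an illustration only, not the project's actual implementation.
import numpy as np

def customized_partition_sketch(df_data, grouping_cols, n_splits, test_frac=0.2, seed=None):
    """Sample test folds so that rows sharing a group label stay together."""
    rng = np.random.RandomState(seed)
    groups = df_data.groupby(grouping_cols).indices  # group key -> row positions
    keys = list(groups)
    folds = []
    for _ in range(n_splits):
        held_out = rng.choice(len(keys),
                              size=max(1, int(len(keys) * test_frac)),
                              replace=False)
        folds.append(np.concatenate([groups[keys[k]] for k in held_out]))
    return folds

def check_train_test_splits_sketch(idxs_test):
    """Return True (i.e. "re-partition") if any two test folds are identical."""
    for i in range(len(idxs_test)):
        for j in range(i + 1, len(idxs_test)):
            if np.array_equal(np.sort(idxs_test[i]), np.sort(idxs_test[j])):
                return True
    return False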
from sklearn.preprocessing import StandardScaler
from mvpa2.mappers.fx import mean_group_sample  # PyMVPA

# Preprocessing: drop zero-variance voxels, then z-score the remaining ones.
# Assumes `variance_threshold` (e.g. sklearn's VarianceThreshold) was
# instantiated earlier in the script.
variance_threshold.fit(BOLD)
scaler = StandardScaler()
scaler.fit(variance_threshold.transform(BOLD))

csv_filename = os.path.join(
    saving_dir,
    '{} {} {} {} {}.csv'.format(experiment, here, sub_name, roi_name, condition))
processed = glob(os.path.join(saving_dir, '*.csv'))
if csv_filename in processed:
    # Skip results that were already computed in a previous run.
    print(csv_filename)
else:
    print('partitioning ...')
    idxs_train, idxs_test = utils.get_train_test_splits(dataset, label_map, n_splits)
    while utils.check_train_test_splits(idxs_test):
        # Re-partition until the test folds pass the sanity check.
        idxs_train, idxs_test = utils.get_train_test_splits(dataset, label_map, n_splits)
    for word2vec_name, word2vec_features in zip(word2vec_names, word2vec_vecs):
        r_squares, scores = [], []
        for fold, (idx_train, idx_test) in tqdm(enumerate(zip(idxs_train, idxs_test)),
                                                total=n_splits):
            if average:
                # Average repeated presentations of the same stimulus within a run.
                tr = dataset[idx_train].get_mapped(
                    mean_group_sample(['chunks', 'id'], order='occurrence'))
            else:
                tr = dataset[idx_train]
            # Test samples are averaged the same way as the train branch.
            te = dataset[idx_test].get_mapped(
                mean_group_sample(['chunks', 'id'], order='occurrence'))
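# The fragment is truncated mid-loop. A hedged sketch of one way the per-fold
# fit/score step could look for an encoding model of this shape (word2vec
# features predicting preprocessed BOLD), using plain scikit-learn. The ridge
# model and per-voxel r-squared scoring are illustrative assumptions, not the
# original code; `X_*`/`y_*` would be built from `tr`/`te` and
# `word2vec_features` in the loop above.
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

def fit_score_fold_sketch(X_train, y_train, X_test, y_test):
    """Fit a ridge encoding model on one fold; return per-voxel r-squared."""
    model = Ridge(alpha=1.0).fit(X_train, y_train)
    preds = model.predict(X_test)
    # One r-squared per voxel; callers can keep the vector or average it,
    # e.g. r_squares.append(fit_score_fold_sketch(...).mean())
    return r2_score(y_test, preds, multioutput='raw_values')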