def main(n_images, n_tissues, n_patches, patch_size, model_file_id):
    logger.info('Initializing cluster_classify script')
    dataset = Dataset(n_tissues=n_tissues, n_images=n_images)

    # Sample patches and the GTEx image IDs they were drawn from
    patches, GTEx_IDs = dataset.sample_data(patch_size, n_patches)
    image_objs = [Image(x) for x in GTEx_IDs]

    # Build a unique identifier for this feature configuration
    dataset_name = ''.join(s for s in str(dataset) if s.isalnum())
    features_ID = dataset_name + f'_{n_patches}_{patch_size}_{n_images}' \
        + model_file_id

    features = generate_features(features_ID, patches, model_file_id)

    # Aggregate patch-level features per image (mean pooling over GTEx IDs)
    a_features, a_image_objs = aggregate_features(
        dataset_name, features, image_objs, 'GTEx_IDs', np.mean)

    # Restrict to lung tissue and train the downstream classifiers
    lung_features, lung_image_objs = subselect_tissue(
        dataset_name, 'Lung', features, image_objs)
    train_classifiers(dataset_name, features_ID, lung_features,
                      lung_image_objs, 'GTEx_IDs', retrain=True)
def run_naive_bayes_bow_vocabulary(nbr, str_list):
    avg_f1 = 0
    avg_accuracy = 0
    # Average metrics over 10 random train/test splits
    for _ in range(10):
        dataset = Dataset(categories)
        dataset.split_train_bayers(nbr)
        # Bag-of-words counts over the fixed, precomputed vocabulary
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])
        clf = MultinomialNB().fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_accuracy /= 10
    avg_f1 /= 10
    str_list.extend([
        "NB BOW voc Avg f1: " + str(avg_f1),
        "NB BOW voc Avg acc: " + str(avg_accuracy)
    ])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
def run_lp_bow_vocabulary(nbr, str_list, gamma):
    avg_f1 = 0
    avg_accuracy = 0
    # Average metrics over 10 random splits with `nbr` labeled documents
    for _ in range(10):
        dataset = Dataset(categories)
        dataset.split_train_true(nbr)
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])
        # Label propagation with an RBF kernel over bag-of-words vectors
        clf = LabelPropagation(kernel='rbf', gamma=gamma).fit(
            vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_accuracy /= 10
    avg_f1 /= 10
    str_list.extend([
        "RBF BOW voc Avg f1: " + str(avg_f1),
        "RBF BOW voc Avg acc: " + str(avg_accuracy)
    ])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
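# Usage sketch (not from the original source): collect the averaged metrics
# from several benchmark runs into one report list. Assumes the module-level
# `categories` list and the imports used by the functions above are in place;
# the label count (10) and gamma value (5) are illustrative only.
if __name__ == '__main__':
    results = []
    run_naive_bayes_bow_vocabulary(10, results)
    run_lp_bow_vocabulary(10, results, gamma=5)
    print('\n'.join(results))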
def main(n_tissues, n_images, n_patches, patch_size, model_type, param_string):
    np.random.seed(42)
    os.makedirs('data/images', exist_ok=True)
    dataset = Dataset(n_tissues=n_tissues, n_images=n_images)
    logger.debug('Initializing train script')

    params = extract_params(param_string)
    params['patch_size'] = patch_size

    # Sample training patches and the image IDs they came from
    patches_data, imageIDs_data = dataset.sample_data(patch_size, int(n_patches))

    if model_type == 'concrete_vae':
        from dependencies.vae_concrete.vae_concrete import VAE
        m = VAE(latent_cont_dim=256)
        m.fit(patches_data, num_epochs=20)
    else:
        # Resolve the model class by name (a class imported at module level)
        Model = eval(model_type)
        m = Model(inner_dim=params['inner_dim'])

    # Shuffle patches and image IDs with the same permutation
    N = patches_data.shape[0]
    assert N == imageIDs_data.shape[0]
    p = np.random.permutation(N)
    patches_data, imageIDs_data = patches_data[p], imageIDs_data[p]

    m.train_on_data(patches_data, params)
    m.save()
def main():
    logger.info('Initializing debug script')
    dataset = Dataset(n_tissues=6, n_images=10)
    patches_data, imageIDs_data = dataset.sample_data(128, 50)

    # Write each (inverted) patch out as a PNG for inspection in CellProfiler.
    # Note: scipy.misc.imsave requires scipy < 1.2.
    for i in tqdm(range(len(imageIDs_data))):
        GTEx_ID = imageIDs_data[i]
        idx = i % 50
        scipy.misc.imsave(
            f'data/cellprofiler/patches/{i:04d}_{GTEx_ID}_{idx}.png',
            255 - patches_data[i])
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    avg_f1 = 0
    avg_accuracy = 0
    # Average metrics over 10 random splits with `nbr` labeled documents
    for _ in range(10):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        # Rebuild the test documents against the runtime (labeled-only) vocabulary
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])
        # Label spreading with a KNN kernel over bag-of-words vectors
        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(
            vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_accuracy /= 10
    avg_f1 /= 10
    str_list.extend([
        "KNN BOW runtime voc Avg f1: " + str(avg_f1),
        "KNN BOW runtime voc Avg acc: " + str(avg_accuracy)
    ])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
def run_naive_bayes_tfidf_runtime_vocabulary(nbr, str_list):
    avg_f1 = 0
    avg_accuracy = 0
    # Average metrics over 10 random train/test splits
    for _ in range(10):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_bayers(nbr)
        # Rebuild the test documents against the runtime (labeled-only) vocabulary
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = TfidfVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])
        clf = MultinomialNB().fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_accuracy /= 10
    avg_f1 /= 10
    str_list.extend([
        "NB TF-IDF runtime voc Avg f1: " + str(avg_f1),
        "NB TF-IDF runtime voc Avg acc: " + str(avg_accuracy)
    ])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
def main(n_tissues, n_images, n_patches, patch_size, model_file):
    logger.info('Initializing inspect script')
    dataset = Dataset(n_tissues=n_tissues, n_images=n_images)
    patches_data, imageIDs_data = dataset.sample_data(patch_size, 15)

    # Pick K random patches to compare originals against reconstructions
    K = 5
    N = patches_data.shape[0]
    idx = np.random.choice(range(N), K)
    patches = patches_data[idx]

    if model_file:
        # Inspect a single model: originals on the top row, reconstructions below
        fig = plt.figure()
        figsize = 128
        figure = np.zeros((figsize * 2, figsize * K, 3))
        model = load_model(MODEL_PATH + f'{model_file}.pkl')
        decoded_patches = model.predict(patches)
        fig.suptitle(model_file, fontsize=10)
        for i in range(K):
            figure[0:figsize,
                   i * figsize:(i + 1) * figsize, :] = deprocess(patches[i])
            figure[figsize:2 * figsize,
                   i * figsize:(i + 1) * figsize, :] = deprocess(decoded_patches[i])
        plt.imshow(figure)
        fig.savefig(f'figures/{model_file}.png', bbox_inches='tight')
    else:
        # Inspect every saved model: two rows (originals, reconstructions) per model
        model_files = sorted(os.listdir(MODEL_PATH))
        n = len(model_files)
        fig, ax = plt.subplots(2 * n, K, figsize=(8, 4 * n))
        for k, model_file in enumerate(model_files):
            model_name = model_file.replace('.pkl', '')
            model = load_model(MODEL_PATH + f'{model_name}.pkl')
            logger.debug(f'Generating decodings for {model_file}')
            decoded_patches = model.predict(patches)
            for i in range(K):
                ax[2 * k][i].imshow(deprocess(patches[i]))
                ax[2 * k][i].axis('off')
                if i == K // 2:
                    ax[2 * k][i].set_title(model_file)
                ax[2 * k + 1][i].imshow(deprocess(decoded_patches[i]))
                ax[2 * k + 1][i].axis('off')
        plt.savefig('figures/all_models.png')
def run_lp_tfidf(nbr, str_list, neighbors):
    avg_f1 = 0
    avg_accuracy = 0
    # Average metrics over 10 random splits with `nbr` labeled documents
    for _ in range(10):
        dataset = Dataset(categories)
        dataset.split_train_true(nbr)
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(dataset.train['data'])
        # Label spreading with a KNN kernel over TF-IDF vectors
        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(
            vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_accuracy /= 10
    avg_f1 /= 10
    str_list.extend([
        "KNN TF-IDF Avg f1: " + str(avg_f1),
        "KNN TF-IDF Avg acc: " + str(avg_accuracy)
    ])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
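# Sketch (not in the original source): sweep the KNN neighbour count for the
# TF-IDF label-spreading benchmark above and gather all averaged scores in one
# list. The default label count (50) and the candidate neighbour values are
# illustrative assumptions, not values taken from the original experiments.
def sweep_lp_tfidf_neighbors(nbr=50, candidates=(3, 5, 10, 20)):
    report = []
    for n in candidates:
        run_lp_tfidf(nbr, report, n)
    return report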
def process(categories):
    # Fetch, lemmatize and strip stop words for each category, then write the
    # preprocessed documents to disk
    for category in categories:
        trainingdata = fetch_20newsgroups(subset='train',
                                          remove=('headers', 'footers', 'quotes'),
                                          categories=[category])
        testdata = fetch_20newsgroups(subset='test',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=[category])
        lemmatize_newsgroup(trainingdata, testdata, category)
        remove_stopwords(trainingdata)
        remove_stopwords(testdata)
        print_docs(trainingdata, testdata, category)

    # Reload the preprocessed corpus, prune very frequent and very rare words,
    # and write the reduced document versions
    dataset = Dataset(categories)
    dataset.load_preprocessed_V1(categories)
    remove_frequent_and_infrequent_words(dataset.train)
    print_docs_reduced_feature_count(dataset, categories)
    print_v2_docs(categories)
    print_v2_test_docs_vocabulary(categories)
def print_v2_test_docs_vocabulary_labeled(categories):
    removed_test = 0
    print("Printing docs...")
    for category in categories:
        out_path = ('../assets/20newsgroups/test2vocabulary_labeled/'
                    'newsgroups_test_' + category + '.txt')
        in_path = ('../assets/20newsgroups/test/newsgroups_test_'
                   + category + '.txt')
        with open(out_path, 'w') as f:
            lines = [line.rstrip('\n') for line in open(in_path)]

            # Vocabulary built from the labeled training documents only
            dataset = Dataset(categories)
            vectorizer = CountVectorizer(
                vocabulary=voc.get_vocabulary_only_labeled(categories))
            vectorizer.fit_transform(dataset.train['data'])
            vocabulary = vectorizer.vocabulary_

            for j in range(len(lines)):
                lines[j] = re.sub(r'[^\w]', " ", lines[j])
                lines[j] = re.sub(r'\b[a-zA-Z]\b', " ", lines[j])
                lines[j] = re.sub(r'[ \t]+', " ", lines[j])  # collapse extra spaces/tabs
                lines[j] = lines[j].strip() + "\n"

                # Keep the document only if it contains at least one vocabulary word
                remove_doc = True
                for word in lines[j].split():
                    if word in vocabulary:
                        remove_doc = False
                        break
                if len(lines[j]) > 4 and not remove_doc:
                    f.write(lines[j])
                else:
                    removed_test += 1
    print("Printing finished")
    print("Removed testing doc:", removed_test)
def main(n_images, n_tissues):
    os.makedirs('data/images', exist_ok=True)
    logger.info('Initializing download script')
    dataset = Dataset(n_images=n_images, n_tissues=n_tissues)
    dataset.download()
def main(n_images, n_tissues):
    os.makedirs('data/patches', exist_ok=True)
    logger.info('Initializing patches script')
    dataset = Dataset(n_images=n_images, n_tissues=n_tissues)
    dataset.get_patchcoordfiles()
'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian',
'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
'talk.religion.misc']
"""
categories = [
    'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'
]

# initialize dataset
dataset = Dataset(categories)
dataset.load_preprocessed(categories)
dataset.split_train_true(10)
print_v2_test_docs_vocabulary_labeled(categories)
dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)

dataset_knn = Dataset(categories)
dataset_knn.load_preprocessed_vocabulary_in_use(categories)
dataset_knn.split_train_true(10)
print_v2_test_docs_vocabulary_labeled(categories)
dataset_knn.load_preprocessed_test_vocabulary_labeled_in_use(categories)

# feature extraction
vectorizer_rbf = TfidfVectorizer(
    vocabulary=voc.get_vocabulary_only_labeled(categories))
vectorizer_knn = TfidfVectorizer(
'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns',
'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
"""
categories = ['rec.autos', 'rec.motorcycles',
              'rec.sport.baseball', 'rec.sport.hockey']

# initialize dataset
dataset_rbf = Dataset(categories)
dataset_rbf.split_train_true(100)
dataset_knn = Dataset(categories)
dataset_knn.split_train_true(100)

# feature extraction
vectorizer_rbf = TfidfVectorizer()
vectorizer_knn = TfidfVectorizer()
vectors_rbf = vectorizer_rbf.fit_transform(dataset_rbf.train['data'])
vectors_knn = vectorizer_knn.fit_transform(dataset_knn.train['data'])

# classification
# use max_iter=10 when 20 categories
clf_rbf = LabelPropagation(kernel='rbf', gamma=5).fit(
    vectors_rbf.todense(), dataset_rbf.train['target'])
clf_knn = LabelSpreading(kernel='knn', n_neighbors=10).fit(
    vectors_knn.todense(), dataset_knn.train['target'])

test_vec_rbf = vectorizer_rbf.transform(dataset_rbf.test['data'])
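# Evaluation sketch (not part of the original fragment, which is cut off
# here): score both semi-supervised classifiers on their held-out test sets,
# mirroring the metrics used by the benchmark functions above.
test_vec_knn = vectorizer_knn.transform(dataset_knn.test['data'])
pred_rbf = clf_rbf.predict(test_vec_rbf.todense())
pred_knn = clf_knn.predict(test_vec_knn.todense())
print("RBF f1:", metrics.f1_score(dataset_rbf.test['target'], pred_rbf, average='macro'))
print("KNN f1:", metrics.f1_score(dataset_knn.test['target'], pred_knn, average='macro'))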
    'credit_card_balance.csv.zip': 'mean',
    'installments_payments.csv.zip': 'min',
    'POS_CASH_balance.csv.zip': 'mean',
    'bureau.csv.zip': 'max'
})
df_test = proj_utils.load_data(train=False, supp_dict={
    'previous_application.csv.zip': 'max',
    'credit_card_balance.csv.zip': 'mean',
    'installments_payments.csv.zip': 'min',
    'POS_CASH_balance.csv.zip': 'mean',
    'bureau.csv.zip': 'max'
})

data = Dataset(df_train, df_test, 'TARGET')

# Clean and transform data
data.preprocess()

# Determine initial feature importances
data.ae_train_model(model=LGBMClassifier())

# Auto-discover ratios weighted by feature importance
data.autoengineer_ratios()

#############################################################

models = {}
for m in M:
    # Define a model
    models[m] = {}