    )
    print("%s model analysis completion time: %.3f s = %.3f min"
          % (dnn_model, (end - start), (end - start) / 60.0))
    print("==================================================================\n")


if __name__ == "__main__":
    path = os.getcwd()[:os.getcwd().rfind('/')]
    to_write_filename = path + '/stats/dnn_models_analysis.txt'
    utils.initialize_writer(to_write_filename)

    # Load the train and test sets for the selected dataset
    dataset = "ghosh"
    train_data, _, train_labels, test_data, _, test_labels = data_proc.get_dataset(dataset)

    # Alternatively, if other experiments with the data are to be made (on Ghosh's dataset),
    # load different tokens (grammatical, strict, filtered, etc.) and train on those
    """
    train_filename = "train_sample.txt"
    test_filename = "test_sample.txt"
    train_data = utils.load_file(path + "/res/tokens/tokens_clean_original_" + train_filename)
    test_data = utils.load_file(path + "/res/tokens/tokens_clean_original_" + test_filename)
    train_labels = [int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" + train_filename)]
    test_labels = [int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" + test_filename)]
    """

    # Transform the output into categorical data
        loss=loss_op,
        train_op=train_op,
        training_hooks=train_hook_list,
        eval_metric_ops={'accuracy': accuracy})
    return estim_specs


# Visualize the progress of the training steps
tf.logging.set_verbosity(tf.logging.INFO)

# Download and extract the dataset if it is not available locally
data_processing.try_download_and_extract()

# This is the MNIST case; for other datasets (Cifar10, Cifar100) check the loader
mnist = data_processing.get_dataset()

config = None
# Log device placement
# config = tf.estimator.RunConfig(session_config=tf.ConfigProto(log_device_placement=True))

# Create Estimator with model directory [XXX] Model Directory check
model = tf.estimator.Estimator(model_fn,
                               model_dir="/Users/jwl1993/model_dir",
                               config=config)

# Training input; if num_epochs == None, training runs until max_step is reached
input_fn = tf.estimator.inputs.numpy_input_fn(
    x=mnist.train.images, y=mnist.train.labels,
    batch_size=batch_size, num_epochs=None, shuffle=True)

_ = model.train(input_fn, steps=number_of_iteration)
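
# Hedged follow-up sketch (not in the original script): evaluate the trained
# estimator on the test split. This assumes get_dataset() returns the standard
# MNIST container with a .test attribute exposing images and labels.
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=mnist.test.images, y=mnist.test.labels,
    batch_size=batch_size, shuffle=False)
eval_results = model.evaluate(eval_input_fn)
print("Test accuracy: %.4f" % eval_results['accuracy'])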
import pandas as pd
import data_processing as data
import HBA_analysis as hba
import math
from pathlib import Path

adult_exp = data.get_dataset(dataset='adult', probes_strategy='reannotator')
fetal_exp = data.get_dataset(dataset='fetal', probes_strategy='reannotator')

negraes = data.get_genelist('negraes')
duncan = data.get_genelist('duncan')
lutterAN = data.get_genelist('lutterAN')
lutterBN = data.get_genelist('lutterBN')

results_dir = Path('./results')
results_dir.mkdir(exist_ok=True)


def add_sig_marks(df):
    """Adds markers to the brain names column: ** for pFDR < 0.05 and * for p < 0.05."""
    # add ** if FDR < 0.05 and AUROC > 0.5
    mask = (df['pFDR'] < 0.05) & (df['AUROC'] > 0.5)
    df.loc[mask, 'brain area'] = df.loc[mask, 'brain area'].apply(lambda x: str(x) + ' **')
    # add * if FDR > 0.05 but p < 0.05 and AUROC > 0.5
    mask = (df['pFDR'] > 0.05) & (df['p'] < 0.05) & (df['AUROC'] > 0.5)
    df.loc[mask, 'brain area'] = df.loc[mask, 'brain area'].apply(lambda x: str(x) + ' *')
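
# Illustrative usage example with hypothetical data (not from the original script),
# showing how the masks above annotate the 'brain area' column in place.
example = pd.DataFrame({
    'brain area': ['hypothalamus', 'cerebellum', 'amygdala'],
    'AUROC': [0.62, 0.58, 0.49],
    'p': [0.001, 0.03, 0.2],
    'pFDR': [0.01, 0.08, 0.5],
})
add_sig_marks(example)
# -> 'hypothalamus **' (pFDR < 0.05), 'cerebellum *' (p < 0.05 only), 'amygdala' unchanged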
decision_tree.

Compares sklearn scores and decision_tree on one of the UCI datasets.
Shows accuracy measurements and the number of rules used in each classifier
before and after post-pruning (only decision_tree; sklearn does not provide
this functionality [December 2018]).
"""
from sklearn.tree import DecisionTreeClassifier

from data_processing import convert_to_numerical, get_dataset, split_data
from decision_tree import DecisionTree, accuracy_score

# READ DATA AND RENAME COLUMNS
df = get_dataset(
    './car.data',
    'http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data')
df.columns = [
    'buying', 'maintenance', 'doors', 'people', 'lug_boot', 'safety', 'class'
]

# CONVERT STRING VALUES TO THEIR NUMERICAL COUNTERPARTS (FASTER CALCULATION)
convert_to_numerical(df,
                     columns=[
                         'buying', 'maintenance', 'doors', 'people',
                         'lug_boot', 'safety', 'class'
                     ],
                     inplace=True)

# SPLIT DATASET INTO TRAINING, VALIDATION, TESTING
training, validation, test = split_data(df, inplace=True)
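
# Illustrative sketch under assumptions: fit sklearn's DecisionTreeClassifier on
# the training split and report validation accuracy as one side of the comparison.
# The custom DecisionTree is assumed to follow the same fit/predict interface, so
# it could be scored the same way with the accuracy_score imported above.
X_train, y_train = training.iloc[:, :-1], training.iloc[:, -1]
X_val, y_val = validation.iloc[:, :-1], validation.iloc[:, -1]

sk_clf = DecisionTreeClassifier()
sk_clf.fit(X_train, y_train)
print("sklearn validation accuracy: %.4f" % sk_clf.score(X_val, y_val))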
# -*- coding: utf-8 -*-
"""example.

Example usage of functionalities provided in:
    data_processing
    decision_tree
"""
from sklearn.metrics import accuracy_score

from binary_decision_tree import DecisionTree
from data_processing import convert_to_numerical, get_dataset, split_data
from random_forests import RandomForest

df = get_dataset(
    './kr-vs-kp.data',
    'https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.data'
)
df.columns = list(range(37))
convert_to_numerical(df, columns=list(range(37)), inplace=True)
training, validation, test = split_data(df, inplace=True)

training_X = training.iloc[:, :-1]
training_y = training.iloc[:, -1]

clf = RandomForest()
clf.fit(training_X, training_y)

validation_X = validation.iloc[:, :-1]
validation_y = validation.iloc[:, -1]
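
# Minimal follow-up sketch: score the forest on the held-out validation split.
# This assumes the custom RandomForest exposes a sklearn-style predict() method;
# adjust to the actual random_forests API if it differs.
validation_pred = clf.predict(validation_X)
print("Validation accuracy: %.4f" % accuracy_score(validation_y, validation_pred))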
import sqlalchemy as db
from sqlalchemy.orm import sessionmaker

from tables import Base, State, Tornado, Industry, Gdp
from data_processing import get_dataset

dataset = get_dataset()

metadata = db.MetaData()
engine = db.create_engine('sqlite:///test.db', echo=True)
connection = engine.connect()
Base.metadata.create_all(engine)

Session = sessionmaker(bind=engine)
session = Session()

sts = [State(**i) for i in dataset.states.to_dict(orient="records")]
session.add_all(sts)

tds = [Tornado(**i) for i in dataset.tornados.to_dict(orient="records")]
session.add_all(tds)

inds = [Industry(**i) for i in dataset.industries.to_dict(orient="records")]
session.add_all(inds)

gdps = [Gdp(**i) for i in dataset.gdp.to_dict(orient="records")]
session.add_all(gdps)

session.commit()
session.close()
connection.close()
engine.dispose()
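
# Illustrative verification sketch (not part of the original load script):
# reopen a session and count the rows that were just inserted into each table.
session = Session()
for mapped_class in (State, Tornado, Industry, Gdp):
    print(mapped_class.__name__, session.query(mapped_class).count())
session.close()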
print("Shape of the x train set (%d, %d)" % (len(x_train_scaled), len(x_train_scaled[0]))) print("Shape of the x test set (%d, %d)" % (len(x_test_scaled), len(x_test_scaled[0]))) # Run the model on the selection of features made start = time.time() utils.run_supervised_learning_models(x_train_scaled, y_train, x_test_scaled, y_test) end = time.time() print("Completion time of the Linear SVM model: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0)) if __name__ == "__main__": path = os.getcwd()[:os.getcwd().rfind('/')] to_write_filename = path + '/stats/ml_analysis.txt' utils.initialize_writer(to_write_filename) dataset = "ghosh" # can be "ghosh", "riloff", "sarcasmdetection" and "ptacek" train_tokens, train_pos, train_labels, test_tokens, test_pos, test_labels = data_proc.get_dataset( dataset) run_baseline = False if run_baseline: baseline(train_tokens, train_labels, test_tokens, test_labels) else: ml_model(train_tokens, train_pos, train_labels, test_tokens, test_pos, test_labels)
fetal_svgs = ['0893_101892619.svg', '1097_101892615.svg', '1352_101892610.svg']
fetal_brainstem_svgs = ['0391_102182817.svg', '0639_102182810.svg']
human_diagram = 'human_diagram.svg'

# input directories
svg_dir = Path('./data/svg')
adult_dir = svg_dir / 'slices' / 'adult'
fetal_dir = svg_dir / 'slices' / 'fetal21'
fetal_brainstem_dir = svg_dir / 'slices' / 'fetal21_brainstem'

# define output directory
figures_dir = Path('./figures')
figures_dir.mkdir(exist_ok=True)

# get data
adult_exp = data.get_dataset('adult', 'reannotator')
fetal_exp = data.get_dataset('fetal', 'reannotator')
negraes = data.get_genelist('negraes')

# create tables to match AUC values to structures
adult_lookup = svg_utils.create_auc_lookup(exp_df=adult_exp, gene_list=negraes, ontology='adult')
fetal_lookup = svg_utils.create_auc_lookup(exp_df=fetal_exp, gene_list=negraes, ontology='fetal')
adult_lookup = adult_lookup.rename(index=str, columns={"AUROC": "AUC"})
fetal_lookup = fetal_lookup.rename(index=str, columns={"AUROC": "AUC"})

svg_utils.modify_svg(svg_dir / human_diagram,