def process_pubmed(output_graph):
    """
    Scans the pubmed titles for known drugs and records each match
    in the output graph
    """
    pubmed_df = read_csv(PREPROCESSED_PUBMED_FILEPATH)
    drugs_df = read_csv(PREPROCESSED_DRUGS_FILE_PATH)
    drugs_list = drugs_df['drug'].tolist()
    for _, row in pubmed_df.iterrows():
        drugs_found = find_drugs_in_title(row['title'], drugs_list)
        for drug in drugs_found:
            output_graph.add_pubmed(drug, row['id'], row['date'])
            output_graph.add_journal(drug, row['journal'], row['date'])
def process_clinical_trial(output_graph):
    """
    Scans the clinical trial titles for known drugs and records each match
    in the output graph
    """
    clinical_trial_df = read_csv(PREPROCESSED_CLINICAL_TRIALS_FILE_PATH)
    drugs_df = read_csv(PREPROCESSED_DRUGS_FILE_PATH)
    drugs_list = drugs_df['drug'].tolist()
    for _, row in clinical_trial_df.iterrows():
        drugs_found = find_drugs_in_title(row['scientific_title'], drugs_list)
        for drug in drugs_found:
            output_graph.add_clinical_trial(drug, row['id'], row['date'])
            output_graph.add_journal(drug, row['journal'], row['date'])
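# `find_drugs_in_title` is shared by the two functions above but not defined
# in this section. A minimal sketch, assuming a simple substring match on the
# already-lowercased titles and drug names (a production version would want
# whole-word matching):
def find_drugs_in_title(title, drugs_list):
    title = str(title)
    return [drug for drug in drugs_list if drug in title]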
def preprocess_drugs():
    """
    Loads the drugs file and preprocesses it

    :returns: a dataframe with clean data on drugs
    :rtype: pandas.DataFrame
    """
    drugs_df = read_csv(DRUGS_INPUT_FILE_PATH)
    drugs_df['drug'] = drugs_df['drug'].str.lower()
    return drugs_df
def preprocess_pubmed_csv():
    """
    Loads the pubmed csv file and preprocesses it

    :returns: a dataframe with clean data on pubmed
    :rtype: pandas.DataFrame
    """
    pubmed_df = read_csv(PUBMED_CSV_FILE_PATH)
    pubmed_df['title'] = pubmed_df['title'].str.lower()
    return pubmed_df
def main():
    filename = "training_data.csv"
    n_hidden_nodes = [5]
    l_rate = 0.6
    n_epochs = 800
    n_folds = 4

    print("Neural network model:\n n_hidden_nodes = {}".format(n_hidden_nodes))
    print(" l_rate = {}".format(l_rate))
    print(" n_epochs = {}".format(n_epochs))
    print(" n_folds = {}".format(n_folds))

    print("\nReading '{}'...".format(filename))
    X, y = utils.read_csv(filename)
    utils.normalize(X)
    N, d = X.shape
    n_classes = len(np.unique(y))
    print(" X.shape = {}".format(X.shape))
    print(" y.shape = {}".format(y.shape))
    print(" n_classes = {}".format(n_classes))

    idx_all = np.arange(0, N)
    idx_folds = utils.crossval_folds(N, n_folds, seed=1)

    acc_train, acc_test = list(), list()
    print("\nTraining and cross-validating...")
    for i, idx_test in enumerate(idx_folds):
        idx_train = np.delete(idx_all, idx_test)
        X_train, y_train = X[idx_train], y[idx_train]
        X_test, y_test = X[idx_test], y[idx_test]

        model = NeuralNetwork(n_input=d, n_output=n_classes,
                              n_hidden_nodes=n_hidden_nodes)
        model.train(X_train, y_train, l_rate=l_rate, n_epochs=n_epochs)

        y_train_predict = model.predict(X_train)
        y_test_predict = model.predict(X_test)

        acc_train.append(100 * np.sum(y_train == y_train_predict) / len(y_train))
        acc_test.append(100 * np.sum(y_test == y_test_predict) / len(y_test))

        print(" Fold {}/{}: train acc = {:.2f}%, test acc = {:.2f}% "
              "(n_train = {}, n_test = {})".format(
                  i + 1, n_folds, acc_train[-1], acc_test[-1],
                  len(X_train), len(X_test)))

    print("\nAvg train acc = {:.2f}%".format(sum(acc_train) / float(len(acc_train))))
    print("Avg test acc = {:.2f}%".format(sum(acc_test) / float(len(acc_test))))
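# `utils.normalize` is called above for its side effect, so it presumably
# scales X in place. A minimal sketch, assuming per-column min-max scaling
# (the guard against constant columns is an assumption):
import numpy as np

def normalize(X):
    X_min = X.min(axis=0)
    X_max = X.max(axis=0)
    X -= X_min
    X /= np.maximum(X_max - X_min, 1e-12)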
def main():
    # ===================================
    # Settings
    # ===================================
    csv_filename = "data/creditcard.csv"
    hidden_layers = [5]
    eta = 0.1
    n_epochs = 500
    n_folds = 3

    X, y, n_classes = utils.read_csv(csv_filename, target_name="Class")
    N, d = X.shape
    print(" -> X.shape = {}, y.shape = {}, n_classes = {}\n".format(
        X.shape, y.shape, n_classes))

    print("Running")
    idx_all = np.arange(0, N)
    idx_folds = utils.crossval_folds(N, n_folds, seed=1)

    acc_train, acc_valid = list(), list()
    print("Cross-validation")
    for i, idx_valid in enumerate(idx_folds):
        idx_train = np.delete(idx_all, idx_valid)
        X_train, y_train = X[idx_train], y[idx_train]
        X_valid, y_valid = X[idx_valid], y[idx_valid]

        model = NeuralNetwork(input_dim=d, output_dim=n_classes,
                              hidden_layers=hidden_layers, seed=1)
        model.train(X_train, y_train, eta=eta, n_epochs=n_epochs)

        ypred_train = model.predict(X_train)
        ypred_valid = model.predict(X_valid)

        acc_train.append(100 * np.sum(y_train == ypred_train) / len(y_train))
        acc_valid.append(100 * np.sum(y_valid == ypred_valid) / len(y_valid))

        # confusion-matrix counts on the positive (fraud) class
        TP = np.sum((y_valid == ypred_valid) & (y_valid == 1))
        TN = np.sum((y_valid == ypred_valid) & (y_valid == 0))
        FP = np.sum((y_valid != ypred_valid) & (y_valid == 0))  # negatives predicted positive
        FN = np.sum((y_valid != ypred_valid) & (y_valid == 1))  # positives predicted negative
        print("TP: " + str(TP))
        print("TN: " + str(TN))
        print("FP: " + str(FP))
        print("FN: " + str(FN))

        precision = calculate_precision(TP, FP)
        recall = calculate_recall(TP, FN)
        print(str(f1_score(recall, precision)))

        print(" Fold {}/{}: acc_train = {:.2f}%, acc_valid = {:.2f}% "
              "(n_train = {}, n_valid = {})".format(
                  i + 1, n_folds, acc_train[-1], acc_valid[-1],
                  len(X_train), len(X_valid)))

    print(" -> acc_train_avg = {:.2f}%, acc_valid_avg = {:.2f}%".format(
        sum(acc_train) / float(len(acc_train)),
        sum(acc_valid) / float(len(acc_valid))))
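# `calculate_precision`, `calculate_recall`, and `f1_score` are used above
# but not defined in this section. Minimal sketches from the standard
# definitions (the zero-division guards are an assumption):
def calculate_precision(TP, FP):
    return TP / (TP + FP) if (TP + FP) > 0 else 0.0

def calculate_recall(TP, FN):
    return TP / (TP + FN) if (TP + FN) > 0 else 0.0

def f1_score(recall, precision):
    # harmonic mean of precision and recall
    denom = precision + recall
    return 2 * precision * recall / denom if denom > 0 else 0.0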
def train(_start_temp, _end_temp, _eq_number, _cool_number, nodes_number, stride):
    """
    Trains the model by applying the optimization by simulated annealing
    """
    input_data = read_csv(nodes_number, stride)  # renamed from `input` to avoid shadowing the builtin
    weights = np.zeros((nodes_number, nodes_number))
    start_temp = _start_temp
    end_temp = _end_temp
    T = start_temp
    energy = mse(weights, input_data, nodes_number, stride)
    eq_number = _eq_number
    cool_parameter = _cool_number

    energies = [energy]
    best_energies = [energy]
    best_weights = weights
    best_energy = energy

    while T >= end_temp:
        print(T)
        # stay at the same temperature (in equilibrium) for eq_number iterations
        for _ in range(eq_number):
            weights, energy = annealing(weights, energy, T, input_data,
                                        nodes_number, stride)
        T = decrease_temp(T, cool_parameter)
        if energy < best_energy:
            best_energy = energy
            best_weights = weights
        energies.append(energy)
        best_energies.append(best_energy)

    plt.plot(energies, label="Energy")
    plt.plot(best_energies, label="Best energy")
    plt.xlabel("Epochs")
    plt.ylabel("MSE")
    plt.legend()
    plt.show()
    return best_energy
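# `annealing` and `decrease_temp` are external to the snippet above. Common
# choices -- assumed here, not confirmed by the source -- are a Metropolis
# acceptance step and a geometric cooling schedule, reusing the snippet's
# `mse` helper:
def annealing(weights, energy, T, input_data, nodes_number, stride):
    # propose a small random perturbation of the weights
    candidate = weights + np.random.normal(0.0, 0.1, weights.shape)
    candidate_energy = mse(candidate, input_data, nodes_number, stride)
    # always accept improvements; accept worse moves with prob. exp(-dE/T)
    delta = candidate_energy - energy
    if delta < 0 or np.random.rand() < np.exp(-delta / T):
        return candidate, candidate_energy
    return weights, energy

def decrease_temp(T, cool_parameter):
    # geometric cooling: T_{k+1} = alpha * T_k with 0 < alpha < 1
    return T * cool_parameter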
def preprocess_clinical_trials():
    """
    Loads the clinical_trials file and preprocesses it

    :returns: a dataframe with clean data on clinical_trials
    :rtype: pandas.DataFrame
    """
    clinical_trials_df = read_csv(CLINICAL_TRIALS_INPUT_FILE_PATH)
    clinical_trials_df['scientific_title'] = clinical_trials_df[
        'scientific_title'].str.lower()
    # Remove byte-like characters
    clinical_trials_df['scientific_title'] = clinical_trials_df[
        'scientific_title'].apply(
            lambda x: x.replace('\\xc3', '').replace('\\xb1', ''))
    clinical_trials_df['journal'] = clinical_trials_df['journal'].astype(
        str).apply(lambda x: x.replace('\\xc3', '').replace('\\x28', ''))
    return clinical_trials_df
def main():
    # ===================================
    # Settings
    # ===================================
    csv_filename = "data/Leeds02.csv"
    hidden_layers = [5]  # number of nodes in hidden layers i.e. [layer1, layer2, ...]
    eta = 0.1  # learning rate
    n_epochs = 400  # number of training epochs
    n_folds = 4  # number of folds for cross-validation
    seed_crossval = 1  # seed for cross-validation
    seed_weights = 1  # seed for NN weight initialization

    # ===================================
    # Read csv data + normalize features
    # ===================================
    print("Reading '{}'...".format(csv_filename))
    X, y, n_classes = utils.read_csv(csv_filename, target_name="y", normalize=True)
    N, d = X.shape
    print(" -> X.shape = {}, y.shape = {}, n_classes = {}\n".format(
        X.shape, y.shape, n_classes))

    print("Neural network model:")
    print(" input_dim = {}".format(d))
    print(" hidden_layers = {}".format(hidden_layers))
    print(" output_dim = {}".format(n_classes))
    print(" eta = {}".format(eta))
    print(" n_epochs = {}".format(n_epochs))
    print(" n_folds = {}".format(n_folds))
    print(" seed_crossval = {}".format(seed_crossval))
    print(" seed_weights = {}\n".format(seed_weights))

    # ===================================
    # Create cross-validation folds
    # ===================================
    idx_all = np.arange(0, N)
    idx_folds = utils.crossval_folds(N, n_folds, seed=seed_crossval)  # list of fold index arrays

    # ===================================
    # Train/evaluate the model on each fold
    # ===================================
    acc_train, acc_valid = list(), list()  # training/validation accuracy scores
    print("Cross-validating with {} folds...".format(len(idx_folds)))
    for i, idx_valid in enumerate(idx_folds):
        # Collect training and validation data from folds
        idx_train = np.delete(idx_all, idx_valid)
        X_train, y_train = X[idx_train], y[idx_train]
        X_valid, y_valid = X[idx_valid], y[idx_valid]

        # Build neural network classifier model and train
        model = NeuralNetwork(input_dim=d, output_dim=n_classes,
                              hidden_layers=hidden_layers, seed=seed_weights)
        model.train(X_train, y_train, eta=eta, n_epochs=n_epochs)

        # Make predictions for training and validation data
        ypred_train = model.predict(X_train)
        ypred_valid = model.predict(X_valid)

        # Compute training/validation accuracy score from predicted values
        acc_train.append(100 * np.sum(y_train == ypred_train) / len(y_train))
        acc_valid.append(100 * np.sum(y_valid == ypred_valid) / len(y_valid))

        # Print cross-validation result
        print(" Fold {}/{}: acc_train = {:.2f}%, acc_valid = {:.2f}% "
              "(n_train = {}, n_valid = {})".format(
                  i + 1, n_folds, acc_train[-1], acc_valid[-1],
                  len(X_train), len(X_valid)))

    # ===================================
    # Print results
    # ===================================
    print(" -> acc_train_avg = {:.2f}%, acc_valid_avg = {:.2f}%".format(
        sum(acc_train) / float(len(acc_train)),
        sum(acc_valid) / float(len(acc_valid))))
def main(model_name, data_file, i_days, mva, batch_size, p_days):
    """
    Args:
        model_name (string): model name
        data_file (string): data file
        i_days (int): number of input days per test sequence
        mva (bool): whether to apply 7-day moving average
        batch_size (int): number of test sequences
        p_days (int): number of prediction days per test sequence
    """
    # initialize model
    if model_name == 'arma':
        model = ARMA()
    elif model_name == 'seird':
        model = SEIRD()
    elif model_name == 'gamma':
        model_type = 'default'  # there is no other model type for now
        delta1 = 11
        delta2 = 18
        delta3 = 14
        p = 0.02
        num_past_days = 7
        model = GAMMA(model_type, delta1, delta2, delta3, p,
                      num_past_days)  # config file needs to be added
    elif model_name == 'gamma_l1':
        model_type = 'default'  # there is no other model type for now
        delta1 = 11
        delta2 = 18
        delta3 = 14
        p = 0.02
        num_past_days = 7
        lbd = 1000
        model = GAMMA_L1(model_type, delta1, delta2, delta3, p,
                         num_past_days, lbd)  # config file needs to be added
    elif model_name == 'gamma_2':
        model_type = 'default'  # there is no other model type for now
        delta1 = 11
        delta2 = 18
        delta3 = 14
        p = 0.02
        num_past_days = 7
        model = GAMMA_2(model_type, delta1, delta2, delta3, p,
                        num_past_days)  # config file needs to be added
    else:
        raise ValueError('Invalid model type: {}'.format(model_name))

    # load datafile
    data_dir = './datasets/processed/'
    data, dates, columns = utils.read_csv(data_dir + data_file)

    # apply moving average
    if mva:
        data, dates = utils.moving_average(data, dates, days=7)

    # split up data into batch_size train+test sequences
    datasplit = utils.train_test_split_multi(data, dates,
                                             train_days=i_days,
                                             test_days=p_days,
                                             batch_size=batch_size, seed=0)
    train, test, train_dates, test_dates = datasplit
    B = train.shape[0]  # batch size, i.e. the number of train+test sequences

    # fit and predict on each sequence
    c_preds, h_preds, d_preds = [], [], []
    for i in range(B):
        # refit model with new sequence
        model.fit(train[i])
        # predict days
        c_preds.append(model.predict_cases(p_days))
        h_preds.append(model.predict_hospitalizations(p_days))
        d_preds.append(model.predict_deaths(p_days))

    # evaluate metrics
    print("Model:", model_name)
    c_true, h_true, d_true = test[..., 0], test[..., 1], test[..., 2]
    for (pred, true, name) in [(c_preds, c_true, "Cases"),
                               (h_preds, h_true, "Hospitalizations"),
                               (d_preds, d_true, "Deaths")]:
        print(f'{name}...')
        # skip if no prediction
        if pred[0] is None:
            print("%s: no predictions" % (name))
            continue
        # batch predictions
        pred_batch = np.stack(pred)
        # run metrics on batches
        rmses = utils.rmse(true, pred_batch)
        maes = utils.mae(true, pred_batch)
        mapes = utils.mape(true, pred_batch)
        # report mean and std around mean (std / sqrt(B))
        for (metric, name) in [(rmses, "RMSE"), (maes, "MAE"), (mapes, "MAPE")]:
            print('%s: %f \\pm %f' % (name, metric.mean(),
                                      metric.std() / (len(metric) ** 0.5)))

    # plotting
    utils.plotting(train, test, c_preds, h_preds, d_preds, model_name)
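# `utils.rmse`, `utils.mae`, and `utils.mape` are not defined in this
# section. Minimal sketches, assuming each reduces over the time axis and
# returns one score per test sequence (shape (B,)), which matches the
# .mean()/.std()/len() usage above:
import numpy as np

def rmse(true, pred):
    return np.sqrt(np.mean((true - pred) ** 2, axis=-1))

def mae(true, pred):
    return np.mean(np.abs(true - pred), axis=-1)

def mape(true, pred):
    # expressed in percent; assumes true is nonzero
    return 100 * np.mean(np.abs((true - pred) / true), axis=-1)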
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

from src.generate_data import parse_web_page
from src.utils import read_csv


def parallelize(func, iterable, use_thread=True, *args, **kwargs):
    pool_executor = ThreadPoolExecutor if use_thread else ProcessPoolExecutor
    with pool_executor() as executor:
        data = list(executor.map(func, iterable))
    return data


if __name__ == "__main__":
    data = read_csv("data/summary/common.csv")
    urls = [row["doc_url"] for row in data if row["doc_url"]]
    selected_urls = urls[:100]

    start = time.time()
    norm_result = [parse_web_page(url) for url in selected_urls]
    done = time.time()
    print(f"Done, initial {start} -> {done} = {done - start}")

    start = done
    thread_result = parallelize(parse_web_page, selected_urls, use_thread=True)
    done = time.time()
    print(f"Done, Threading {start} -> {done} = {done - start}")

    # start = time.time()
    # parallelize(parse_web_page, selected_urls, use_thread=False)
    # # parallelize(print, selected_urls, use_thread=False)
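# Design note: ThreadPoolExecutor is the likely better fit here because
# parse_web_page is presumably network-bound, so threads can overlap the I/O
# waits despite the GIL. ProcessPoolExecutor adds process start-up and
# pickling overhead and pays off mainly for CPU-bound work; it also requires
# the mapped function and its arguments to be picklable.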
def main():
    # ===================================
    # Settings
    # ===================================
    filename = "data/seeds_dataset.csv"
    n_hidden_nodes = [5]  # nodes in hidden layers i.e. [n_nodes_1, n_nodes_2, ...]
    l_rate = 0.6  # learning rate
    n_epochs = 800  # number of training epochs
    n_folds = 4  # number of folds for cross-validation

    print("Neural network model:\n n_hidden_nodes = {}".format(n_hidden_nodes))
    print(" l_rate = {}".format(l_rate))
    print(" n_epochs = {}".format(n_epochs))
    print(" n_folds = {}".format(n_folds))

    # ===================================
    # Read data (X,y) and normalize X
    # ===================================
    print("\nReading '{}'...".format(filename))
    X, y = utils.read_csv(filename)  # read as matrix of floats and int
    utils.normalize(X)  # normalize
    N, d = X.shape  # extract shape of X
    n_classes = len(np.unique(y))
    print(" X.shape = {}".format(X.shape))
    print(" y.shape = {}".format(y.shape))
    print(" n_classes = {}".format(n_classes))

    # ===================================
    # Create cross-validation folds
    # These are a list of a list of indices for each fold
    # ===================================
    idx_all = np.arange(0, N)
    idx_folds = utils.crossval_folds(N, n_folds, seed=1)

    # ===================================
    # Train and evaluate the model on each fold
    # ===================================
    acc_train, acc_test = list(), list()  # training/test accuracy scores
    print("\nTraining and cross-validating...")
    for i, idx_test in enumerate(idx_folds):
        # Collect training and test data from folds
        idx_train = np.delete(idx_all, idx_test)
        X_train, y_train = X[idx_train], y[idx_train]
        X_test, y_test = X[idx_test], y[idx_test]

        # Build neural network classifier model and train
        model = NeuralNetwork(n_input=d, n_output=n_classes,
                              n_hidden_nodes=n_hidden_nodes)
        model.train(X_train, y_train, l_rate=l_rate, n_epochs=n_epochs)

        # Make predictions for training and test data
        y_train_predict = model.predict(X_train)
        y_test_predict = model.predict(X_test)

        # Compute training/test accuracy score from predicted values
        acc_train.append(100 * np.sum(y_train == y_train_predict) / len(y_train))
        acc_test.append(100 * np.sum(y_test == y_test_predict) / len(y_test))

        # Print cross-validation result
        print(" Fold {}/{}: train acc = {:.2f}%, test acc = {:.2f}% "
              "(n_train = {}, n_test = {})".format(
                  i + 1, n_folds, acc_train[-1], acc_test[-1],
                  len(X_train), len(X_test)))

    # ===================================
    # Print results
    # ===================================
    print("\nAvg train acc = {:.2f}%".format(sum(acc_train) / float(len(acc_train))))
    print("Avg test acc = {:.2f}%".format(sum(acc_test) / float(len(acc_test))))
import src.utils as utils

# Settings
csv_filename = "data/seeds_dataset.csv"
hidden_layers = [5]  # number of nodes in hidden layers i.e. [layer1, layer2, ...]
eta = 0.1  # learning rate
n_epochs = 400  # number of training epochs
n_folds = 4  # number of folds for cross-validation
seed_crossval = 1  # seed for cross-validation
seed_weights = 1  # seed for NN weight initialization

# Read csv data + normalize features
print("Reading '{}'...".format(csv_filename))
X, y, n_classes = utils.read_csv(csv_filename, target_name="y", normalize=True)
print(" -> X.shape = {}, y.shape = {}, n_classes = {}\n".format(
    X.shape, y.shape, n_classes))
N, d = X.shape

print("Neural network model:")
print(" input_dim = {}".format(d))
print(" hidden_layers = {}".format(hidden_layers))
print(" output_dim = {}".format(n_classes))
print(" eta = {}".format(eta))
print(" n_epochs = {}".format(n_epochs))
print(" n_folds = {}".format(n_folds))
print(" seed_crossval = {}".format(seed_crossval))
print(" seed_weights = {}\n".format(seed_weights))

# Create cross-validation folds
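# `utils.crossval_folds`, used throughout these scripts, is not shown in this
# section. A minimal sketch, assuming it shuffles the indices 0..N-1 with the
# given seed and splits them into n_folds roughly equal index arrays:
import numpy as np

def crossval_folds(N, n_folds, seed=1):
    rng = np.random.RandomState(seed)
    idx = rng.permutation(N)
    return np.array_split(idx, n_folds)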
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging

from src.domain.euromillions.rules import Rules
from src.domain.euromillions.compute import Compute
# import argparse
from src import utils

FORMAT = "%(asctime)s %(name)s %(levelname)s - %(message)s"
logging.basicConfig(format=FORMAT, level=logging.INFO)
logging.getLogger("winner").setLevel(logging.INFO)

history_file = "assets/history/euromillions-1354.csv"

if __name__ == "__main__":
    logging.info("let's check for a winner")
    numbers = utils.read_csv(file=history_file, depth=5)
    for n in numbers:
        logging.info(n)
    rules = Rules()
    cmp = Compute(rules, numbers)
    cmp.compute()
    unknown_file = os.path.join('', uc_configs['trajectory'])
    is_trajectory = True
else:
    print("Please determine a correct value for your use case")
    sys.exit()

error_handler.handle_wrong_arguments(known_file, unknown_file)
known_filename, known_file_extension = os.path.splitext(known_file)
unknown_filename, unknown_file_extension = os.path.splitext(unknown_file)

trajectory_list = []
leak_lists = []
# read trajectories and leaking source files
if known_file_extension == '.csv':
    leak_lists = utils.read_csv(known_file)
elif known_file_extension == '.json':
    leak_lists = get_matrix_geojson(known_file)
if unknown_file_extension == '.csv':
    trajectory_list = utils.read_csv(unknown_file)
elif unknown_file_extension == '.json':
    trajectory_list = get_matrix_geojson(unknown_file)

print('checking requirements of known stations...')
error_handler.check_requirements(leak_lists[0], is_random)
print('checking requirements of trajectories or unknown stations...')
error_handler.check_requirements(trajectory_list[0], is_random)

# initialize variables
unknown_stations = trajectory_list
sensors_payload = []
import sys, os
sys.path.append('./')

import numpy as np
import matplotlib.pyplot as plt

from src import utils

# load data from processed csv
path_loc = './datasets/processed/sf.csv'
data, dates, columns = utils.read_csv(path_loc)
print("Data columns: ", columns)
print("Max Daily Deaths: ", data[:, 2].max())

# without averaging
train, test, train_dates, test_dates = utils.train_test_split(data, dates)
plt.figure()
plt.title('SF Data w/o moving average')
plt.plot(train_dates, train)
plt.plot(test_dates, test)
plt.legend(columns)
plt.show()

# with 7-day moving average
mva_data, mva_dates = utils.moving_average(data, dates, days=7)
train, test, train_dates, test_dates = utils.train_test_split(mva_data, mva_dates)
plt.figure()
plt.title('SF Data w/ 7-day moving average')
plt.plot(train_dates, train)
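# `utils.moving_average` is not shown here. A minimal sketch, assuming a
# trailing `days`-long mean over each column that trims the dates so the two
# return values stay aligned (the trimming behavior is an assumption):
def moving_average(data, dates, days=7):
    kernel = np.ones(days) / days
    # convolve each column with a flat kernel; 'valid' drops the first
    # days - 1 rows where the window is incomplete
    smoothed = np.column_stack(
        [np.convolve(data[:, j], kernel, mode='valid')
         for j in range(data.shape[1])])
    return smoothed, dates[days - 1:]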