import argparse
import os
import random
from pathlib import Path

# download_data, parse_data, split and preprocess come from this project's own modules;
# their import paths are not shown here.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=Path, required=True, help='Root directory for data')
    parser.add_argument('--seed', type=int, default=18, help='Random seed')
    args = parser.parse_args()

    root = args.input
    if not os.path.exists(root):
        os.makedirs(root)

    # Download raw data at root
    download_data(root)

    # Parse data into groups
    domains = [['Solo Cello', ''], ['Solo Violin', ''], ['Solo Piano', 'Beethoven']]
    parsed_dir = root / 'parsed'
    parse_data(root, parsed_dir, domains)

    # Split data into train-val-test
    random.seed(args.seed)
    split_dir = root / 'split'
    for input_path in parsed_dir.glob("*/"):
        basename = os.path.basename(input_path)
        output_path = Path(split_dir / basename)
        split(input_path, output_path, train_ratio=0.8, val_ratio=0.1, filetype='wav', copy=True)

    # Preprocess data
    preproc_dir = root / 'preprocessed'
    preprocess(split_dir, preproc_dir)
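# A minimal usage sketch: run the script from the command line. The script name below is
# an assumption; the arguments are the ones defined by the parser in main().
#
#   python prepare_dataset.py --input data/ --seed 18
#
if __name__ == '__main__':
    main()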
def horizon_search():
    '''
    Experiment to demonstrate the effect of the predictive horizon on model performance on the test set.
    Trains models at different values of predictive horizon (N) in weeks. Set experiment options in config.yml.
    '''

    # Load relevant values from config
    with open(os.getcwd() + "/config.yml", 'r') as input_stream:
        cfg = yaml.full_load(input_stream)
    N_MIN = cfg['HORIZON_SEARCH']['N_MIN']
    N_MAX = cfg['HORIZON_SEARCH']['N_MAX']
    N_INTERVAL = cfg['HORIZON_SEARCH']['N_INTERVAL']

    test_metrics_df = pd.DataFrame()
    for n in range(N_MIN, N_MAX + N_INTERVAL, N_INTERVAL):
        print('** n = ', n, ' of ', N_MAX)

        # Preprocess data. Avoid recomputing ground truths and classifying features after the first iteration.
        if n == N_MIN:
            preprocess(n_weeks=n, calculate_gt=True, classify_cat_feats=True, load_ct=False)
        elif cfg['TRAIN']['MODEL_DEF'] == 'hifis_rnn_mlp':
            preprocess(n_weeks=n, calculate_gt=True, classify_cat_feats=False, load_ct=True)
        else:
            preprocess(n_weeks=n, calculate_gt=False, classify_cat_feats=False, load_ct=True)

        # Conduct cross validation at this prediction horizon
        callbacks = define_callbacks(cfg)
        if cfg['TRAIN']['MODEL_DEF'] == 'hifis_rnn_mlp':
            results_df = nested_cross_validation(cfg, callbacks, None)    # If time series data, do nested CV
        else:
            results_df = kfold_cross_validation(cfg, callbacks, None)     # If not time series data, do k-fold CV
        results_df = results_df[0:-2]                                     # Remove rows for mean and std dev
        results_df.drop('Fold', axis=1, inplace=True)                     # Remove fold column
        results_df.insert(0, 'n', n)                                      # Add prediction horizon to test results
        test_metrics_df = pd.concat([test_metrics_df, results_df])        # Append results from this value of n

    # Save results
    test_metrics_df.to_csv(cfg['PATHS']['HORIZON_SEARCH'] + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                           sep=',', header=True, index=False)

    # Plot results
    plot_horizon_search(test_metrics_df, cfg['PATHS']['IMAGES'])
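# A minimal sketch of the config.yml entries that horizon_search() reads. The key names are
# taken from the code above; the values shown are illustrative assumptions only.
#
#   HORIZON_SEARCH:
#     N_MIN: 1             # Smallest predictive horizon to try, in weeks
#     N_MAX: 52            # Largest predictive horizon to try, in weeks
#     N_INTERVAL: 13       # Step between successive horizons, in weeks
#   TRAIN:
#     MODEL_DEF: 'hifis_rnn_mlp'    # The time-series model definition, which triggers nested CV
#   PATHS:
#     HORIZON_SEARCH: 'results/experiments/horizon_search'
#     IMAGES: 'results/generated_images/'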
import os
import yaml
import argparse

from src.data.preprocess import preprocess

parser = argparse.ArgumentParser()
parser.add_argument('--miladatadir', type=str, help="Mila dataset directory")
parser.add_argument('--fig1datadir', type=str, help="Figure 1 dataset directory")
parser.add_argument('--rsnadatadir', type=str, help="RSNA dataset directory")
parser.add_argument('--preprocesseddir', type=str, help="Directory for preprocessed output")
args = parser.parse_args()

cfg = yaml.full_load(open(os.getcwd() + "/config.yml", 'r'))    # Load config data

# Point the config's dataset paths at the directories passed on the command line
cfg['PATHS']['MILA_DATA'] = args.miladatadir
cfg['PATHS']['FIGURE1_DATA'] = args.fig1datadir
cfg['PATHS']['RSNA_DATA'] = args.rsnadatadir
cfg['PATHS']['PROCESSED_DATA'] = args.preprocesseddir
cfg['PATHS']['TRAIN_SET'] = cfg['PATHS']['PROCESSED_DATA'] + '/' + cfg['PATHS']['TRAIN_SET'].split('/')[-1]
cfg['PATHS']['VAL_SET'] = cfg['PATHS']['PROCESSED_DATA'] + '/' + cfg['PATHS']['VAL_SET'].split('/')[-1]
cfg['PATHS']['TEST_SET'] = cfg['PATHS']['PROCESSED_DATA'] + '/' + cfg['PATHS']['TEST_SET'].split('/')[-1]

preprocess(cfg)
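# Example invocation (a sketch; the script name and directory paths are assumptions,
# only the argument names come from the parser above):
#
#   python azure/preprocess_step.py --miladatadir data/mila --fig1datadir data/figure1 \
#       --rsnadatadir data/rsna --preprocesseddir data/preprocessed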
# Run preprocessing.
if os.getenv("AML_PARAMETER_PIPELINE") == 'train':
    # Training pipeline: write data info and column transformers to the preprocessed output directory
    cfg['PATHS']['DATA_INFO'] = args.preprocessedoutputdir + '/' + cfg['PATHS']['DATA_INFO'].split('/')[-1]
    cfg['PATHS']['ORDINAL_COL_TRANSFORMER'] = args.preprocessedoutputdir + '/' + \
        cfg['PATHS']['ORDINAL_COL_TRANSFORMER'].split('/')[-1]
    cfg['PATHS']['OHE_COL_TRANSFORMER_MV'] = args.preprocessedoutputdir + '/' + \
        cfg['PATHS']['OHE_COL_TRANSFORMER_MV'].split('/')[-1]
    cfg['PATHS']['OHE_COL_TRANSFORMER_SV'] = args.preprocessedoutputdir + '/' + \
        cfg['PATHS']['OHE_COL_TRANSFORMER_SV'].split('/')[-1]
    preprocessed_data = preprocess(cfg=cfg, n_weeks=None, include_gt=True, calculate_gt=True,
                                   classify_cat_feats=True, load_ct=False)     # Preprocessing for training
else:
    # Inference pipeline: read data info and column transformers from the inference directory
    cfg['PATHS']['DATA_INFO'] = args.inferencedir + cfg['PATHS']['DATA_INFO'].split('/')[-1]
    cfg['PATHS']['OHE_COL_TRANSFORMER_SV'] = args.inferencedir + cfg['PATHS']['OHE_COL_TRANSFORMER_SV'].split('/')[-1]
    cfg['PATHS']['ORDINAL_COL_TRANSFORMER'] = args.inferencedir + cfg['PATHS']['ORDINAL_COL_TRANSFORMER'].split('/')[-1]
    cfg['PATHS']['OHE_COL_TRANSFORMER_MV'] = args.inferencedir + cfg['PATHS']['OHE_COL_TRANSFORMER_MV'].split('/')[-1]
    preprocessed_data = preprocess(cfg=cfg, n_weeks=0, include_gt=False, calculate_gt=False,
                                   classify_cat_feats=False, load_ct=True)     # Preprocessing for inference
         ys=list(grid_search_res[metric_name].values()),
         labels=list(grid_search_res[metric_name].keys()),
         title=metric_name.name,
         xaxis='num_topics',
         yaxis='score',
         file=None)  # f'selection_{metric_name}')


PATH_TOTAL_RAW = Path('data/processed/total.csv')
PATH_TOTAL_PROCESSED = Path('data/processed/total.csv')
PATH = Path(os.getcwd())
filename = 'contents.csv'

# Load the cached preprocessed corpus if it exists; otherwise preprocess the raw corpus and cache it
if filename in os.listdir(PATH / 'data/processed'):
    df = pd.read_csv(PATH / 'data/processed' / filename)
else:
    df = pd.read_csv(PATH / 'data/raw' / filename)
    df['preproc'] = preprocess(df['content'].values, stem=True)
    df.to_csv(PATH / 'data/processed' / filename)

df.dropna(inplace=True)
df = df[~df['title'].str.contains('Калып:')]    # Remove template pages ('Калып:' is the Kyrgyz 'Template:' prefix)
documents = df['preproc'].values

count_vect = CountVectorizer(input='content')       # , stop_words=STOPWORDS)
tf_idf_vect = TfidfVectorizer(input='content')      # , stop_words=STOPWORDS)
vect_model = count_vect

main(documents, vect_model, true_labels=df['topic'])
def predict_and_explain_set(cfg=None, data_path=None, save_results=True, give_explanations=True,
                            include_feat_values=True, processed_df=None):
    '''
    Preprocess a raw dataset. Then get model predictions and corresponding LIME explanations.
    :param cfg: Custom config object
    :param data_path: Path to look for raw data
    :param save_results: Flag specifying whether to save the prediction results to disk
    :param give_explanations: Flag specifying whether to provide LIME explanations with predictions spreadsheet
    :param include_feat_values: Flag specifying whether to include client feature values with predictions spreadsheet
    :param processed_df: Dataframe of preprocessed data. data_path will be ignored if passed.
    :return: Dataframe of prediction results, including explanations.
    '''

    # Load project config data
    if cfg is None:
        cfg = yaml.full_load(open(os.getcwd() + "/config.yml", 'r'))

    # Load data transformers
    scaler_ct = load(cfg['PATHS']['SCALER_COL_TRANSFORMER'])
    ohe_ct_sv = load(cfg['PATHS']['OHE_COL_TRANSFORMER_SV'])

    # Get preprocessed data
    if processed_df is not None:
        df = processed_df
        if cfg['TRAIN']['DATASET_TYPE'] == 'static_and_dynamic':
            indexes = np.array(pd.concat([df.pop('ClientID'), df.pop('Date')], axis=1))
        else:
            indexes = np.array(df.pop('ClientID'))
    else:
        if data_path is None:
            data_path = cfg['PATHS']['RAW_DATA']
        # Preprocess the data, using pre-existing sklearn transformers and classification of categorical features.
        df = preprocess(n_weeks=0, include_gt=False, calculate_gt=False, classify_cat_feats=False, load_ct=True,
                        data_path=data_path)
        indexes = np.array(df.index)

    # Ensure DataFrame does not contain ground truth (could happen if custom preprocessed data is passed)
    if 'GroundTruth' in df.columns:
        df.drop('GroundTruth', axis=1, inplace=True)

    # Load feature mapping information (from preprocessing)
    data_info = yaml.full_load(open(cfg['PATHS']['DATA_INFO'], 'r'))

    # Convert dataset to numpy array
    X = np.array(df)

    # Restore the model and LIME explainer from their respective serializations
    explainer = dill.load(open(cfg['PATHS']['LIME_EXPLAINER'], 'rb'))
    model = load_model(cfg['PATHS']['MODEL_TO_LOAD'], compile=False)

    # Get the predictive horizon that this model was trained on. It's embedded within the model name.
    n_weeks = int(model._name.split('_')[1].split('-')[0])

    # Load LIME and prediction constants from config
    model_def = cfg['TRAIN']['MODEL_DEF'].upper()
    NUM_SAMPLES = cfg['LIME'][model_def]['NUM_SAMPLES']
    NUM_FEATURES = cfg['LIME'][model_def]['NUM_FEATURES']
    THRESHOLD = cfg['PREDICTION']['THRESHOLD']
    CLASS_NAMES = cfg['PREDICTION']['CLASS_NAMES']

    # Define column names of the DataFrame representing the prediction results
    col_names = ['ClientID', 'Predictive Horizon [weeks]', 'At risk of chronic homelessness',
                 'Probability of chronic homelessness [%]']

    # Add columns for client explanation
    if give_explanations:
        for i in range(NUM_FEATURES):
            col_names.extend(['Explanation ' + str(i + 1), 'Weight ' + str(i + 1)])

    # Add columns for client feature values
    if include_feat_values:
        col_names.extend(list(df.columns))
    rows = []

    # Predict and explain all items in dataset
    print('Predicting and explaining examples.')
    for i in tqdm(range(X.shape[0])):

        # Predict this example
        x = np.expand_dims(X[i], axis=0)
        y = np.squeeze(predict_instance(x, model, ohe_ct_sv, scaler_ct).T, axis=1)
        prediction = 1 if y[1] >= THRESHOLD else 0      # Model's classification
        predicted_class = CLASS_NAMES[prediction]
        client_id = indexes[i][0] if cfg['TRAIN']['DATASET_TYPE'] == 'static_and_dynamic' else indexes[i]
        row = [int(client_id), n_weeks, predicted_class, y[1] * 100]

        # Explain this prediction
        if give_explanations:
            x = sp.sparse.csr_matrix(X[i])
            explanation = predict_and_explain(x, model, explainer, ohe_ct_sv, scaler_ct, NUM_FEATURES, NUM_SAMPLES)
            exp_tuples = explanation.as_list()
            for exp_tuple in exp_tuples:
                row.extend(list(exp_tuple))
            if len(exp_tuples) < NUM_FEATURES:
                row.extend([''] * (2 * (NUM_FEATURES - len(exp_tuples))))   # Fill with empty space if explanation too small

        # Add client's feature values
        if include_feat_values:
            client_vals = list(df.loc[indexes[i], :])
            for idx in data_info['SV_CAT_FEATURE_IDXS']:
                ordinal_encoded_val = int(client_vals[idx])
                client_vals[idx] = data_info['SV_CAT_VALUES'][idx][ordinal_encoded_val]
            row.extend(client_vals)
        rows.append(row)

    # Convert results to a Pandas dataframe and save
    results_df = pd.DataFrame(rows, columns=col_names)
    if save_results:
        results_path = cfg['PATHS']['BATCH_PREDICTIONS'] + datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv'
        results_df.to_csv(results_path, columns=col_names, index_label=False, index=False)
    return results_df
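# A minimal usage sketch, assuming the project's config.yml points at a raw dataset and at
# serialized model/explainer artifacts; the column names printed below are the ones defined
# in col_names above.
if __name__ == '__main__':
    batch_results = predict_and_explain_set(save_results=True, give_explanations=True,
                                            include_feat_values=False)
    print(batch_results[['ClientID', 'Probability of chronic homelessness [%]']].head())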