def preprocess(data, pipeline_automator):
    print('\n\nPreprocessing...')
    start = time()

    # Get the cleaned data
    cleaned_data = clean_data(data, pipeline_automator)

    # Get the corpus text
    corpus_text = get_corpus_text(data)

    # Get the list of words
    words = get_words(corpus_text)

    # Get the corpus tokens
    corpus_tokens = get_tokens(corpus_text)

    # Get the bigrams, trigrams, collocations, and lemmas in the data
    bigrams = get_bigrams(corpus_tokens)
    trigrams = get_trigrams(corpus_tokens)
    collocations2 = get_bigram_collocations(corpus_tokens, pipeline_automator)
    collocations3 = get_trigram_collocations(corpus_tokens, pipeline_automator)
    lemmas = get_lemmas(cleaned_data, pipeline_automator)

    if pipeline_automator.parameters['remove_sub_terms']:
        lemmas, collocations2, collocations3 = remove_redundant_terms(
            lemmas, collocations2, collocations3)

    # The terms (lemmas and collocations) that the feature selection step will choose from.
    terms = lemmas + collocations2 + collocations3

    # Store all of the metadata generated during preprocessing.
    pipeline_automator.metadata['ngrams']['words'] = words
    pipeline_automator.metadata['words_count'] = len(words)
    pipeline_automator.metadata['lemmas'] = lemmas
    pipeline_automator.metadata['lemma_count'] = len(lemmas)
    pipeline_automator.metadata['text'] = corpus_text
    pipeline_automator.metadata['ngrams']['bigrams'] = list(bigrams)
    pipeline_automator.metadata['ngrams']['trigrams'] = list(trigrams)
    pipeline_automator.metadata['bigram_collocations'] = collocations2
    pipeline_automator.metadata['bigram_collocation_count'] = len(collocations2)
    pipeline_automator.metadata['trigram_collocations'] = collocations3
    pipeline_automator.metadata['trigram_collocation_count'] = len(collocations3)
    pipeline_automator.metadata['terms'] = terms
    pipeline_automator.metadata['term_count'] = len(terms)

    stop = time()
    time_elapsed = get_time_string(stop - start)
    pipeline_automator.metadata['preprocessing_time'] = time_elapsed

    return cleaned_data
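
# --- Illustration only ------------------------------------------------------------
# get_bigram_collocations / get_trigram_collocations are project helpers whose
# internals are not shown here. The sketch below is an assumption about how such a
# helper could score bigram collocations with NLTK; it is not the pipeline's actual
# implementation, and the frequency cutoff is a placeholder.
def example_bigram_collocations(corpus_tokens, top_n=50, min_freq=3):
    """Minimal sketch: rank bigrams by pointwise mutual information (PMI)."""
    from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
    measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(corpus_tokens)
    finder.apply_freq_filter(min_freq)      # drop bigrams seen fewer than min_freq times
    return finder.nbest(measures.pmi, top_n)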
def tune_models(promising_models, pipeline_automator):
    print('\n\nModel Tuning...')
    start = time()

    # Take our dict of promising models and perform a randomized hyperparameter search.
    training_data = pipeline_automator.metadata['training_data']
    n_features = training_data.shape[1] - 1  # the last column is the target column
    n_hyperparam_combos = pipeline_automator.parameters['n_hyperparam_combos']
    model_tuning_cv_folds_count = pipeline_automator.parameters['model_tuning_cv_folds_count']
    include_ensemble_voters = pipeline_automator.parameters['include_ensemble_voters']
    model_tuning_scoring = pipeline_automator.parameters['model_tuning_scoring']
    random_state = pipeline_automator.parameters['random_state']
    np.random.seed(random_state)

    X, y = training_data[:, :n_features], training_data[:, n_features]
    X = X.astype(np.float64)
    y = y.reshape((y.shape[0], 1))
    y = np.array(y.T)[0]

    tuned_models = {}
    ignored_models = {
        'Gaussian Process Classifier':
            "This model does not need hyperparameter tuning because its hyperparameters were optimized at creation.",
        'Naive Bayes Classifier':
            "We have not determined whether there are hyperparameters to tune for this model.",
        'Discriminant Analysis Classifier':
            "We have not determined whether there are hyperparameters to tune for this model.",
        'RBF SVM Classifier':
            "The model auto-tunes gamma.",
        'Polynomial SVM Classifier':
            "Tuned already."
    }

    for model_name in tqdm(promising_models, 'Tuning Promising Models...'):
        model = promising_models[model_name]
        if model_name not in ignored_models:
            print('Beginning Tuning Process for', model_name, '...')
            param_distribution = get_parameter_distribution(
                model_name, random_state=random_state, n_iter=n_hyperparam_combos)
            random_search = RandomizedSearchCV(model,
                                               param_distribution,
                                               n_iter=n_hyperparam_combos,
                                               cv=model_tuning_cv_folds_count,
                                               scoring=model_tuning_scoring,
                                               refit=True,
                                               n_jobs=-1,
                                               random_state=random_state)
            random_search.fit(X, y)
            clf = random_search.best_estimator_
            tuned_models[model_name] = clf
        else:
            print('Skipped tuning of', model_name, 'for the following reason:',
                  ignored_models[model_name])
            model.fit(X, y)
            tuned_models[model_name] = model
            clf = model
        print('Best Tuning:')
        print(clf)

    stop = time()
    time_elapsed = get_time_string(stop - start)
    pipeline_automator.metadata['model_tuning_time'] = time_elapsed

    return tuned_models
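
# --- Illustration only ------------------------------------------------------------
# get_parameter_distribution is a project helper; the distributions it actually
# returns are not shown here. The sketch below shows the kind of search space
# RandomizedSearchCV expects for one model type; the parameter ranges are
# assumptions, not the project's real values.
def example_parameter_distribution(model_name):
    """Minimal sketch of a RandomizedSearchCV search space for a random forest."""
    from scipy.stats import randint, uniform
    if model_name == 'Random Forest Classifier':
        return {
            'n_estimators': randint(50, 500),    # number of trees
            'max_depth': randint(2, 30),         # maximum tree depth
            'max_features': uniform(0.1, 0.8),   # fraction of features considered per split
            'min_samples_leaf': randint(1, 10),
        }
    raise KeyError(f'No example distribution defined for {model_name}')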
def get_promising_models(feature_selected_data, pipeline_automator):
    """
    Automates the process of selecting a good classifier, tuning its hyperparameters, and
    training and testing the model. Because this process is very slow, we start by limiting
    the data that the models are evaluated on. The current version takes roughly 15 minutes
    to run on 1500 records, which is acceptable for testing purposes, all things considered
    (it trains dozens of classifiers on 1500 records and hundreds of features).
    """
    print('\n\nModel Selection...')
    start = time()

    random_state = pipeline_automator.parameters['random_state']
    np.random.seed(random_state)

    n_records = feature_selected_data.shape[0]
    n_features = feature_selected_data.shape[1] - 1  # n_features = n_cols - 1 (the last column is the target column)
    split_type = pipeline_automator.parameters['data_splitting']

    """
    1) Split our data into a development (training) set used for model selection and tuning,
       and a validation set used to see how well our models classify never-before-seen
       examples after tuning. We want to use a lot of data to develop good models, but we
       also need to set aside enough data to test how well they generalize to unseen examples.
    """
    print('Splitting our data into training and testing sets...')
    testing_set_size = pipeline_automator.parameters['testing_set_size']
    train_data, test_data = split_data(feature_selected_data,
                                       test_size=testing_set_size,
                                       type_=split_type,
                                       random_state=random_state)
    n_examples_in_training_set = len(train_data)
    pipeline_automator.metadata['training_data'] = train_data
    pipeline_automator.metadata['training_data_size'] = n_examples_in_training_set
    pipeline_automator.metadata['testing_data'] = test_data
    pipeline_automator.metadata['testing_data_size'] = len(test_data)
    print('total examples:', len(feature_selected_data),
          '\ttraining examples:', len(train_data),
          '\ttesting examples:', len(test_data), end='\n\n')

    """
    2) Enforce the model selection data limit. Model selection is very slow: we presently
       have 13 classifier types, and each classifier is cloned cv_folds times and trained on
       the training data. For cv=10, that is 130 classifiers being trained and tested.
       Training all of those models on the entire data set would not finish in a reasonable
       amount of time, so we work around this by using a subset of the training set whenever
       it has more records than model_selection_data_limit. Otherwise, we use the entire
       training set to select our models.
    """
    model_selection_data_limit = pipeline_automator.parameters['model_selection_data_limit']
    if model_selection_data_limit < n_examples_in_training_set:
        print('Splitting our data because it exceeds the model_selection_data_limit parameter...')
        # Draw the model selection subset from the training data so the held-out test set stays unseen.
        rest_of_the_data_set, evaluation_set = split_data(train_data,
                                                          test_size=model_selection_data_limit,
                                                          type_=split_type,
                                                          random_state=random_state)
        print(len(rest_of_the_data_set), 'saved for later +', len(evaluation_set), 'evaluation')
    else:
        rest_of_the_data_set = []
        evaluation_set = train_data

    """
    3) Split our model selection data into feature matrix X and label vector y:
    """
    X, y = evaluation_set[:, :n_features], evaluation_set[:, n_features]
    X = X.astype(np.float64)
    y = y.reshape((y.shape[0], 1))
    y = np.array(y.T)[0]

    """
    4) Place the classifier-generating functions into a list of callables that we iterate
       through and call one at a time on the training data X and y. Each one trains its
       classifier and uses cross-validation to assess how good that particular classifier
       type is at classifying the data. A confusion matrix is computed for each, from which
       accuracy, precision, recall, etc. are calculated. Depending on the needs of the
       classifier (maximize precision vs. recall, balanced, etc.), we select the appropriate
       rating indicator and rank the classifiers by their performance.
    """
    # List of callables used to generate each classifier:
    cv_folds = pipeline_automator.parameters['model_selection_cv_folds_count']
    classifiers_getters = [
        get_sgd_classifier,
        get_knn_classifier,
        get_linear_svm_classifier,
        get_polynomial_svm_classifier,
        get_rbf_svm_classifier,
        get_decision_tree_classifier,
        get_random_forest_classifier,
        get_extra_trees_classifier,
        get_adaboost_forest_classifier,
        get_mlp_classifier
    ]
    classifiers = [
        get_classifier(X, y, random_state=random_state, cv_folds=cv_folds)
        for get_classifier in classifiers_getters
    ]
    classifier_dictionary = {name: clf for name, clf, results in classifiers}

    """
    5) Choose the most promising models to send to the next phase: model tuning.
    """
    # Rank the classifiers by their F1 score (for now); the metric could be chosen, or even
    # automated, as a pipeline hyperparameter later on.
    performance_metric = pipeline_automator.parameters['model_selection_performance_metric']
    ranked_classifiers = sorted(classifiers, key=lambda x: x[-1][performance_metric], reverse=True)
    for name, classifier, results in ranked_classifiers:
        print(f"{name:<35s}\tF1-score:{results['f1_macro']:<10f}")
    print()

    # Filter out the classifiers with low performance.
    min_performance = pipeline_automator.parameters['model_selection_min_performance']
    print('Filtering models that do not meet the model selection minimum performance:',
          performance_metric, '>=', min_performance, '...')
    ranked_classifiers = list(filter(lambda x: x[-1][performance_metric] >= min_performance, classifiers))
    ranked_classifiers = sorted(ranked_classifiers, key=lambda x: x[-1][performance_metric], reverse=True)
    if len(ranked_classifiers) > 0:
        for name, classifier, results in ranked_classifiers:
            print(f"{name:<35s}\tF1-score:{results['f1_macro']:<10f}")
        print()
    else:
        print('None of the models met the minimum performance of',
              performance_metric, '>=', min_performance)

    # Keep at most the top n_promising_models_to_select models.
    n_promising_models_to_select = min(
        pipeline_automator.parameters['n_promising_models_to_select'],
        len(ranked_classifiers))
    promising_models = ranked_classifiers[:n_promising_models_to_select]
    print('Selecting up to the top', len(promising_models), 'models...')
    if len(ranked_classifiers) > 0:
        for name, classifier, results in promising_models:
            print(f"{name:<35s}\tF1-score:{results['f1_macro']:<10f}")
        print()
    else:
        print('None of the models met the minimum performance of',
              performance_metric, '>=', min_performance)
    promising_models = {name: classifier for name, classifier, results in promising_models}

    stop = time()
    time_elapsed = get_time_string(stop - start)
    pipeline_automator.metadata['model_selection_time'] = time_elapsed

    return promising_models
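
# --- Illustration only ------------------------------------------------------------
# Each get_*_classifier helper appears to return a (name, classifier, results) tuple.
# The sketch below shows one plausible shape for such a helper, built on scikit-learn's
# cross_validate; the metric keys mirror the ones read above, everything else is an
# assumption about the project's helpers, not their actual code.
def example_get_sgd_classifier(X, y, random_state=None, cv_folds=10):
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import cross_validate
    clf = SGDClassifier(random_state=random_state)
    scores = cross_validate(clf, X, y, cv=cv_folds,
                            scoring=('f1_macro', 'precision_macro', 'recall_macro'))
    results = {
        'f1_macro': scores['test_f1_macro'].mean(),
        'precision': scores['test_precision_macro'].mean(),
        'recall': scores['test_recall_macro'].mean(),
    }
    clf.fit(X, y)  # refit on the full model selection set before returning
    return 'SGD Classifier', clf, results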
def evaluate_best_models(tuned_models, pipeline_automator):
    performance_metric = pipeline_automator.parameters['final_min_performance_metric']
    min_performance = pipeline_automator.parameters['final_min_performance']

    # Seed the random number generator to keep the output reproducible.
    random_state = pipeline_automator.parameters['random_state']
    np.random.seed(random_state)

    print('\n\nModel Evaluation...')
    start = time()

    # Prepare the feature matrix and label vector for the final evaluation.
    test_data = pipeline_automator.metadata['testing_data']
    n_features = test_data.shape[1] - 1  # the last column is the target column
    X, y = test_data[:, :n_features], test_data[:, n_features]
    X = X.astype(np.float64)
    y = y.reshape((y.shape[0], 1))
    y = np.array(y.T)[0]

    # For each tuned model (already trained), compute its metrics on the test data and
    # select the model with the best F1-score.
    classifiers = []
    for model_name in tuned_models:
        model = tuned_models[model_name]
        y_pred = model.predict(X)

        cm = confusion_matrix(y, y_pred)
        balanced_accuracy = balanced_accuracy_score(y, y_pred)
        # pos_label is ignored when macro-averaging, so it is omitted here.
        precision = precision_score(y, y_pred, average='macro')
        recall = recall_score(y, y_pred, average='macro')
        f1 = f1_score(y, y_pred, average='macro')
        results = {
            'confusion matrix': cm,
            'balanced accuracy': balanced_accuracy,
            'precision': precision,
            'recall': recall,
            'f1_macro': f1
        }

        print('Final Evaluation of Tuned', model_name, ':')
        print(cm)
        print('balanced accuracy:', balanced_accuracy)
        print(classification_report(y, y_pred))
        print()
        classifiers.append((model_name, model, results))

    ranked_classifiers = sorted(classifiers, key=lambda x: x[-1]['f1_macro'], reverse=True)
    for name, classifier, results in ranked_classifiers:
        print(f"{name:<35s}\tF1-score:{results['f1_macro']:<10f}")
    print()

    best_model_name, best_model, best_model_results = ranked_classifiers[0]
    if best_model_results[performance_metric] >= min_performance:
        print('The best model Pipeline Automator found was the', best_model_name, '.')
        print('results:')
        print(best_model_results['confusion matrix'])
        print('balanced accuracy:', best_model_results['balanced accuracy'])
        print('precision:', best_model_results['precision'])
        print('recall:', best_model_results['recall'])
        print('f1-score:', best_model_results['f1_macro'])
        print()
    else:
        print('No model had the minimum performance of', performance_metric, '>=',
              min_performance, '. Please adjust the pipeline parameters and try again.\n')
        best_model_name, best_model, best_model_results = None, None, None

    stop = time()
    time_elapsed = get_time_string(stop - start)
    pipeline_automator.metadata['model_evaluation_time'] = time_elapsed

    return best_model_name, best_model, best_model_results
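
# --- Illustration only ------------------------------------------------------------
# Quick reminder of what the macro-averaged scores above mean: each class gets an
# unweighted vote, so a minority class counts as much as the majority class. The tiny
# worked example below uses made-up labels ('Other' is invented for the example), not
# pipeline data.
def example_macro_f1():
    from sklearn.metrics import f1_score
    y_true = ['Complaint', 'Complaint', 'Complaint', 'Other']
    y_pred = ['Complaint', 'Complaint', 'Other', 'Other']
    per_class = f1_score(y_true, y_pred, average=None)   # one F1 per class
    macro = f1_score(y_true, y_pred, average='macro')    # unweighted mean of the above
    assert abs(macro - per_class.mean()) < 1e-12
    return per_class, macro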
def feature_select(data, pipeline_automator):
    print('\n\nFeature Selection...')
    start = time()

    # Seed the random number generator for reproducibility.
    random_state = pipeline_automator.parameters['random_state']
    np.random.seed(random_state)

    # Transform the cleaned data and terms into a TF-IDF matrix with L2 normalization
    # applied to the row vectors.
    terms = pipeline_automator.metadata['terms']
    print('Getting TF-IDF Matrix...')
    tfidf_matrix, tfidf_terms = get_tfidf_matrix(data, pipeline_automator, print_matrix=True)

    # Add the meta-features to the TF-IDF matrix.
    print('Adding Meta-Features...')
    meta_features_matrix, meta_features_col_names = get_meta_features_matrix(data)
    if meta_features_matrix is not None:
        X, y = tfidf_matrix[:, :-1], tfidf_matrix[:, -1]
        X = X.astype(np.float64)
        y = y.reshape((y.shape[0], 1))
        y = np.array(y.T)[0]
        meta_features_tfidf_matrix = np.column_stack([X, meta_features_matrix, y])
        features = tfidf_terms + meta_features_col_names
        print('tfidf + meta features shape:', meta_features_tfidf_matrix.shape)
    else:
        meta_features_tfidf_matrix = tfidf_matrix
        features = tfidf_terms  # no meta-features available, so the features are just the terms
    features_count = int(meta_features_tfidf_matrix.shape[1] - 1)  # exclude the label column

    # Selected Features Matrix:
    # Use the specified feature selection metric and the number of features to keep to
    # determine which terms (features) to select.
    print('Performing Univariate Feature Selection...')
    feature_selection_metric = pipeline_automator.parameters['feature_selection_metric']
    # n_selected_features is the ratio of the available features to keep.
    n_features_to_keep = int(features_count * pipeline_automator.parameters['n_selected_features'])
    top_features, top_features_matrix = univariate_feature_selection(
        meta_features_tfidf_matrix, n_features_to_keep, feature_selection_metric,
        features, pipeline_automator)
    print('reduced tfidf shape:', top_features_matrix.shape)

    if pipeline_automator.parameters['remove_zero_length_vectors']:
        # Some records may not contain any of the selected features and should thus be ignored.
        top_features_matrix = remove_zero_length_vectors(top_features_matrix)

    print('Selected Features Matrix shape:', top_features_matrix.shape)
    print(top_features_matrix)
    for feature in top_features:
        print(feature)

    if pipeline_automator.parameters['use_L2_row_normalization']:
        # The remaining vectors are normalized one last time to take the meta-features into account.
        top_features_matrix = normalize_matrix(top_features_matrix)

    # Cache the metadata.
    zero_row_vector_count = len(data) - len(top_features_matrix)
    feature_selection_matrix_shape = top_features_matrix.shape
    pipeline_automator.metadata['features'] = features
    pipeline_automator.metadata['features_count'] = len(features)
    pipeline_automator.metadata['selected_features'] = top_features
    pipeline_automator.metadata['selected_features_count'] = n_features_to_keep
    pipeline_automator.metadata['zero_row_vector_count'] = zero_row_vector_count
    pipeline_automator.metadata['feature_selected_matrix_shape'] = feature_selection_matrix_shape
    pipeline_automator.metadata['feature_selected_matrix'] = top_features_matrix

    stop = time()
    time_elapsed = get_time_string(stop - start)
    pipeline_automator.metadata['feature_selection_time'] = time_elapsed

    # Return the selected-terms TF-IDF, L2-scaled matrix representation of the data.
    return top_features_matrix
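
# --- Illustration only ------------------------------------------------------------
# univariate_feature_selection is a project helper; the sketch below shows one way such
# a helper could be built on scikit-learn's SelectKBest. Mapping the pipeline's
# 'feature_selection_metric' parameter to chi2 is an assumption (chi2 also requires
# non-negative feature values, which holds for TF-IDF but may not for meta-features).
def example_univariate_selection(matrix, n_features_to_keep, feature_names):
    import numpy as np
    from sklearn.feature_selection import SelectKBest, chi2
    X, y = matrix[:, :-1].astype(np.float64), matrix[:, -1]
    selector = SelectKBest(chi2, k=n_features_to_keep).fit(X, y)
    mask = selector.get_support()                               # boolean mask of kept columns
    top_features = [name for name, keep in zip(feature_names, mask) if keep]
    top_matrix = np.column_stack([selector.transform(X), y])    # re-append the label column
    return top_features, top_matrix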
""" This is the main file of Pipeline Automator. Run this file using the following the command in a terminal/command prompt window: python main.py "file_name.csv" description_column label_column """ start = time() commandline_arguments = argv[1:] if len(commandline_arguments) != 3: raise ValueError( "Must specify 3 arguments: the file name, the description column name, and the target column name." ) file_name, feature_column, label_column = commandline_arguments parameters = { 'feature_col_name': feature_column, 'label_col_name': label_column } # Initialize the pipeline and display the parameters used... pipeline = PipelineAutomator(file_name, parameters) # Commence the first run cycle through the pipeline... pipeline.display_parameters() record_type_classifier = pipeline.generate_model() pipeline.display_metadata() # Compute and show the run time: stop = time() time_elapsed = get_time_string(stop - start) print('Time elapsed:', time_elapsed)