def run(): ## Load example data frame dataframe = pd.read_csv("../data/spanish/train.tsv", sep="\t") train_sequences = dataframe['tweet'].values.tolist() train_targets = dataframe['offensive'].values print(train_sequences[0:3]) print(train_targets[0:3]) #Possible metrics: ['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'] autoBOTLibObj = autoBOTLib.GAlearner(train_sequences, train_targets, scoring_metric="accuracy", representation_type="neurosymbolic", time_constraint=8).evolve() autoBOTLib.store_autobot_model( autoBOTLibObj, "../stored_models/example_spanish_model.pickle") fitness_summary = autoBOTLibObj.visualize_fitness( image_path="./spanish_fitness.png") importances_local, importances_global = autoBOTLibObj.feature_type_importances( ) final_learners = autoBOTLibObj.summarise_final_learners() ## storing the results for analysis importances_local.to_csv("spanish_local.tsv", sep="\t") importances_global.to_csv("spanish_global.tsv", sep="\t") final_learners.to_csv("final_learners.tsv", sep="\t") fitness_summary.to_csv("fitness_summary.tsv", sep="\t")
def run(): ## Load example data frame dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t").iloc[:300] train_sequences = dataframe['text_a'] train_targets_c1 = dataframe['label'].values.tolist() train_targets_c2 = [ 0 if len(x) < 100 else 1 for x in train_sequences.values ] joint_target_space = [[train_targets_c1[enx], train_targets_c2[enx]] for enx in range(len(train_targets_c1))] autoBOTLibObj = autoBOTLib.GAlearner( train_sequences, joint_target_space, representation_type= "neurosymbolic-lite", ## See the documentation for all possible representation types. n_fold_cv=3, memory_storage="memory2", sparsity=0.1, learner_preset="test", upsample= False, ## Suitable for imbalanced data - randomized upsampling tends to help. time_constraint=0.1).evolve( strategy="evolution" ) ## strategy = "direct-learning" trains a single learner. test_sequences = pd.read_csv("../data/insults/test.tsv", sep="\t")["text_a"] predictions = autoBOTLibObj.predict(test_sequences) prob_predictions = autoBOTLibObj.predict_proba(test_sequences) print(predictions) print(prob_predictions) autoBOTLibObj.generate_report(output_folder="./report/", job_id="MLC")
def test_minimal_mlc():
    ## Load example data frame
    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
    train_sequences = dataframe['text_a']
    train_targets_c1 = dataframe['label'].values.tolist()
    train_targets_c2 = [
        0 if len(x) < 100 else 1 for x in train_sequences.values
    ]
    joint_target_space = [[train_targets_c1[enx], train_targets_c2[enx]]
                          for enx in range(len(train_targets_c1))]

    autoBOTLibObj = autoBOTLib.GAlearner(
        train_sequences,
        joint_target_space,
        representation_type="symbolic",  ## See the documentation for all possible representation types.
        n_fold_cv=3,
        sparsity=0.1,
        upsample=False,  ## Suitable for imbalanced data - randomized upsampling tends to help.
        time_constraint=0.2).evolve(
            strategy="direct-learning")  ## strategy = "direct-learning" trains a single learner.

    dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
    test_sequences = dataframe2['text_a']
    predictions = autoBOTLibObj.predict(test_sequences)
    prob_predictions = autoBOTLibObj.predict_proba(test_sequences)
    print(predictions)
    print(prob_predictions)
    autoBOTLibObj.generate_report(output_folder="./report/",
                                  job_id="as9y0gb98ss")
def test_minimal():
    ## Load example data frame
    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t").iloc[:500]
    train_sequences = dataframe['text_a']
    train_targets = dataframe['label']

    autoBOTLibObj = autoBOTLib.GAlearner(
        train_sequences,
        train_targets,
        representation_type="symbolic",  ## See the documentation for all possible representation types.
        n_fold_cv=3,
        memory_storage="memory2",
        sparsity=0.1,
        upsample=False,  ## Suitable for imbalanced data - randomized upsampling tends to help.
        time_constraint=0.1).evolve(
            strategy="evolution")  ## strategy = "direct-learning" trains a single learner.

    dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
    test_sequences = dataframe2['text_a']
    predictions = autoBOTLibObj.predict(test_sequences)
    prob_predictions = autoBOTLibObj.predict_proba(test_sequences)
    print(predictions)
    print(prob_predictions)
    autoBOTLibObj.generate_report(output_folder="./report/",
                                  job_id="as9y0gb98s")
def run():
    ## Load example data frame
    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
    train_sequences = dataframe['text_a']
    train_targets = dataframe['label']

    autoBOTLibObj = autoBOTLib.GAlearner(
        train_sequences,
        train_targets,
        learner_preset="mini-l1",
        validation_type="train_test",  ## This parallelizes at the individual (not learner) level -> additional memory overhead, as shown in the paper.
        validation_percentage=0.15,
        num_cpu=10,
        representation_type="neurosymbolic-lite",  ## The lightweight representation space; the full "neurosymbolic" space additionally includes sentence-transformers.
        time_constraint=0.1).evolve(
            strategy="evolution")  ## strategy = "direct-learning" trains a single learner.

    dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
    test_sequences = dataframe2['text_a']
    predictions = autoBOTLibObj.predict(test_sequences)
    prob_predictions = autoBOTLibObj.predict_proba(test_sequences)
    print(predictions)
    print(prob_predictions)

    importances_local, importances_global = autoBOTLibObj.feature_type_importances()
    print(importances_global)
    print(importances_local)
    importances_local.to_csv("local_insults.tsv", sep="\t")

    topic_df = autoBOTLibObj.get_topic_explanation()
    print(topic_df)
def run():
    ## Load example data frame
    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
    train_sequences = dataframe['text_a'].values.tolist()
    train_targets = dataframe['label'].values

    autoBOTLibObj = autoBOTLib.GAlearner(
        train_sequences,  # input sequences
        train_targets,  # target space
        time_constraint=3,  # time in hours
        num_cpu="all",  # number of CPUs to use
        latent_dim=768,  ## latent dim for neural representations
        sparsity=0.1,  ## latent_dim/sparsity dim for sparse representations
        task_name="example test",  # task identifier
        scoring_metric="f1",  # sklearn-compatible scoring metric as the fitness
        hof_size=3,  # size of the hall of fame
        top_k_importances=25,  # how many top features to output as final ranking
        memory_storage="./memory",  # triplet base for concept features
        representation_type="neurosymbolic")  # or symbolic or neural

    autoBOTLibObj.evolve(
        nind=8,  ## population size
        strategy="evolution",  ## optimization strategy
        crossover_proba=0.6,  ## crossover rate
        mutpb=0.4)  ## mutation rate

    ## Persistence demonstration (how to store models for further use)
    autoBOTLib.store_autobot_model(
        autoBOTLibObj, "../stored_models/example_insults_model.pickle")
    autoBOTLibObj = autoBOTLib.load_autobot_model(
        "../stored_models/example_insults_model.pickle")

    dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
    test_sequences = dataframe2['text_a'].values.tolist()
    test_targets = dataframe2['label'].values
    predictions = autoBOTLibObj.predict(test_sequences)
    print(predictions)

    performance = autoBOTLib.compute_metrics(
        "first_run_task_name", predictions,
        test_targets)  ## compute F1, acc and F1_acc (as in GLUE)

    ## Visualize performance
    print(performance)

    ## Visualize importances (global -> type, local -> individual features)
    importances_local, importances_global = autoBOTLibObj.feature_type_importances()
    print(importances_global)
    print(importances_local)

    final_learners = autoBOTLibObj.summarise_final_learners()
    print(final_learners)

    ## Visualize the fitness trace
    fitness_summary = autoBOTLibObj.visualize_fitness(
        image_path="./fitness_new.png")
    print(fitness_summary)
def test_minimal():
    ## Load example data frame
    dataframe = pd.read_csv("./data/insults/train.tsv", sep="\t")
    train_sequences = dataframe['text_a'].values.tolist()
    train_targets = dataframe['label'].values

    autoBOTLibObj = autoBOTLib.GAlearner(train_sequences,
                                         train_targets,
                                         time_constraint=0.1).evolve()

    dataframe2 = pd.read_csv("./data/insults/test.tsv", sep="\t")
    test_sequences = dataframe2['text_a'].values.tolist()
    predictions = autoBOTLibObj.predict(test_sequences)
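## Optional follow-up (a sketch, not part of the original example): score the
## minimal model against the held-out labels. This assumes test.tsv carries a
## 'label' column, as it does in the other examples in this section, and uses
## scikit-learn's accuracy_score.
def evaluate_minimal(autoBOTLibObj):
    from sklearn.metrics import accuracy_score

    dataframe2 = pd.read_csv("./data/insults/test.tsv", sep="\t")
    test_sequences = dataframe2['text_a'].values.tolist()
    test_targets = dataframe2['label'].values
    predictions = autoBOTLibObj.predict(test_sequences)
    print(accuracy_score(test_targets, predictions))  # fraction of correct predictions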
def test_initializations(fold_number, representation_type, sparsity,
                         time_constraint):
    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
    train_sequences = dataframe['text_a']
    train_targets = dataframe['label']

    autoBOTLibObj = autoBOTLib.GAlearner(
        train_sequences,
        train_targets,
        representation_type=representation_type,  ## See the documentation for all possible representation types.
        n_fold_cv=fold_number,
        memory_storage="memory2",
        sparsity=sparsity,
        upsample=False,  ## Suitable for imbalanced data - randomized upsampling tends to help.
        time_constraint=time_constraint)
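## Usage sketch for the parametrized initialization test above. The grid values
## below are illustrative choices, not prescribed by the library; any
## representation type listed in the documentation can be substituted.
def run_initialization_grid():
    for representation_type in ["symbolic", "neurosymbolic-lite"]:
        for fold_number in [3, 5]:
            test_initializations(fold_number=fold_number,
                                 representation_type=representation_type,
                                 sparsity=0.1,
                                 time_constraint=0.1)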
def test_custom_features():
    ## Load example data frame
    dataframe = pd.read_csv("./data/insults/train.tsv", sep="\t")
    train_sequences = dataframe['text_a'].values.tolist()
    train_targets = dataframe['label'].values

    ## Let's say we wish to use only the following two text-to-feature transformer objects
    tfidf_word_unigram = TfidfVectorizer(ngram_range=(1, 2),
                                         sublinear_tf=False,
                                         max_features=100)
    tfidf_char_bigram = TfidfVectorizer(analyzer='char',
                                        ngram_range=(1, 2),
                                        max_features=100)

    ## Note: You can use any transformer class implemented in accordance with the
    ## scikit-learn API (.fit, .transform, .fit_transform, .get_feature_names, etc.).

    ## Next, put them into a list. Note the use of the text_col class.
    custom_features = [
        ('word_features',
         pipeline.Pipeline([
             ('s1', autoBOTLib.feature_constructors.text_col(key='no_stopwords')),
             ('word_tfidf_unigram', tfidf_word_unigram)
         ])),
        ('char_features',
         pipeline.Pipeline([
             ('s2', autoBOTLib.feature_constructors.text_col(key='no_stopwords')),
             ('char_tfidf_bigram', tfidf_char_bigram)
         ]))
    ]

    ## Finally, pass this list via the custom_transformer_pipeline argument.
    autoBOTLibObj = autoBOTLib.GAlearner(
        train_sequences,
        train_targets,
        time_constraint=1,
        custom_transformer_pipeline=custom_features).evolve()

    dataframe2 = pd.read_csv("./data/insults/test.tsv", sep="\t")
    test_sequences = dataframe2['text_a'].values.tolist()
    predictions = autoBOTLibObj.predict(test_sequences)
def run():
    ## Load example data frame
    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
    train_sequences = dataframe['text_a']
    train_targets = dataframe['label']
    reptype = "neurosymbolic"

    autoBOTLibObj = autoBOTLib.GAlearner(
        train_sequences,
        train_targets,
        representation_type=reptype,  ## See the documentation for all possible representation types.
        n_fold_cv=3,
        framework="torch",
        memory_storage="memory",
        learner_preset="default",
        verbose=1,
        sparsity=0.1,
        visualize_progress=True,  ## Stores progress as a PROGRESS_{generation}.pdf file
        upsample=False,  ## Suitable for imbalanced data - randomized upsampling tends to help.
        time_constraint=1).evolve(
            strategy="evolution",
            nind=3)  ## strategy = "direct-learning" trains a single learner.

    # Store
    autoBOTLib.store_autobot_model(autoBOTLibObj, "model.pickle")

    # Load (the loaded model is used for the predictions below)
    autoBOTLibObj = autoBOTLib.load_autobot_model("model.pickle")

    # Predict
    dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
    test_sequences = dataframe2['text_a']
    predictions = autoBOTLibObj.predict(test_sequences)
    autoBOTLibObj.predict_proba(test_sequences)

    # autoBOTLibObj.generate_report(output_folder="./report/",
    #                               job_id="REPORTNEW")

    test_classes = dataframe2['label'].values.tolist()
    output_classification_results(predictions,
                                  test_classes,
                                  "./predictions/TORCH.json",
                                  model_spec={})
def run():
    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
    train_sequences = dataframe['text_a'].iloc[0:20]
    train_targets = dataframe['label'].iloc[0:20]

    autoBOTLibObj = autoBOTLib.GAlearner(
        train_sequences,
        train_targets,
        time_constraint=0.1).evolve(representation_step_only=True)

    input_instance_embedding = autoBOTLibObj.transform(train_sequences)

    all_feature_names = []
    for transformer in autoBOTLibObj.vectorizer.named_steps[
            'union'].transformer_list:
        features = transformer[1].steps[1][1].get_feature_names()
        all_feature_names += features

    assert input_instance_embedding.shape[1] == len(all_feature_names)
    print(input_instance_embedding.shape)
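## A possible follow-up (sketch): pair the transformed matrix with the collected
## feature names for inspection. This assumes the transform output is a SciPy
## sparse matrix, hence the .toarray() call.
def inspect_embedding(input_instance_embedding, all_feature_names):
    embedding_df = pd.DataFrame(input_instance_embedding.toarray(),
                                columns=all_feature_names)
    print(embedding_df.head())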
def run():
    ## Load example data frame
    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
    train_sequences = dataframe['text_a'].values.tolist()
    train_targets = dataframe['label'].values

    ## Define custom transformer classes as in the example above
    tfidf_word_unigram = TfidfVectorizer(ngram_range=(1, 2),
                                         sublinear_tf=False,
                                         max_features=100)
    tfidf_char_bigram = TfidfVectorizer(analyzer='char',
                                        ngram_range=(1, 2),
                                        max_features=100)
    custom_features = [
        ('word_features_custom',
         pipeline.Pipeline([
             ('s1', autoBOTLib.feature_constructors.text_col(key='no_stopwords')),
             ('word_tfidf_unigram', tfidf_word_unigram)
         ])),
        ('char_features_custom',
         pipeline.Pipeline([
             ('s2', autoBOTLib.feature_constructors.text_col(key='no_stopwords')),
             ('char_tfidf_bigram', tfidf_char_bigram)
         ]))
    ]

    ## Finally, use the flag "combine_with_existing_representation" to append the
    ## new transformer pipeline to an existing one (e.g., neurosymbolic). This way,
    ## you can easily extend the current autoBOTLib!
    autoBOTLibObj = autoBOTLib.GAlearner(
        train_sequences,
        train_targets,
        time_constraint=1,
        representation_type="neurosymbolic",
        custom_transformer_pipeline=custom_features,
        combine_with_existing_representation=True).evolve()

    dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
    test_sequences = dataframe2['text_a'].values.tolist()
    predictions = autoBOTLibObj.predict(test_sequences)
def run():
    jid = secrets.token_hex(nbytes=16)
    df_path = None  ## Path to the input data frame (left unspecified here)

    ## Load example data frame
    dataframe = pd.read_csv(df_path, sep="\t")
    train_sequences = None  ## Fill in the input sequences for your data set
    train_targets = None  ## Fill in the corresponding targets
    print(len(train_sequences))
    print(len(train_targets))
    classx = "genericTargetName"

    autoBOTObj = autoBOTLib.GAlearner(
        train_sequences,  # input sequences
        train_targets,  # target space
        time_constraint=1,  # time in hours
        num_cpu=32,  # number of CPUs to use
        sparsity=0.1,
        task_name="example test",  # task identifier
        scoring_metric="f1",  # sklearn-compatible scoring metric as the fitness
        hof_size=3,  # size of the hall of fame
        top_k_importances=25,  # how many top features to output as final ranking
        memory_storage="./memory",  # triplet base for concept features
        representation_type="neurosymbolic")  # or symbolic or neural

    autoBOTObj.evolve(
        nind=8,  ## population size
        strategy="evolution",  ## optimization strategy
        crossover_proba=0.6,  ## crossover rate
        mutpb=0.4)  ## mutation rate

    autoBOTLib.store_autobot_model(autoBOTObj,
                                   f"./models/{jid}_{classx}_model.pickle")

    test_sequences = None
    autoBOTObj = autoBOTLib.load_autobot_model(
        f"./models/{jid}_{classx}_model.pickle")
    autoBOTObj.predict(test_sequences)
def run(): dataframe = pd.read_csv("../data/depression/train.tsv", sep="\t") train_sequences = dataframe['text_a'][0:] train_targets = dataframe['label'][0:] autoBOTLibObj = autoBOTLib.GAlearner( train_sequences, train_targets, time_constraint=0.1).evolve(representation_step_only=True) input_instance_embedding = autoBOTLibObj.transform(train_sequences) print(input_instance_embedding.shape) transf = umap.UMAP() embedding = transf.fit_transform(input_instance_embedding) sns.scatterplot(embedding[:, 0], embedding[:, 1], hue=train_targets, palette="coolwarm") plt.gca().set_aspect('equal', 'datalim') plt.title( f'UMAP-based document projection ({input_instance_embedding.shape[1]}D -> 2D)', fontsize=12) plt.show() #or store with plt.savefig("path.pdf", dpi=300)
def run():
    ## Load example data frame
    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
    train_sequences = dataframe['text_a'].values.tolist()
    train_targets = dataframe['label'].values

    ## The syntax for specifying a learner and the hyperparameter space!
    ## These are the hyperparameters to be explored for each representation.
    classifier_hyperparameters = {
        "loss": ["hinge"],
        "penalty": ["elasticnet"],
        "alpha": [0.01, 0.001],
        "l1_ratio": [0, 0.001, 1]
    }

    ## This is the classifier compatible with the hyperparameters.
    custom_classifier = SGDClassifier()

    autoBOTLibObj = autoBOTLib.GAlearner(
        train_sequences,  # input sequences
        train_targets,  # target space
        time_constraint=1,  # time in hours
        num_cpu=4,  # number of CPUs to use
        task_name="example test",  # task identifier
        hof_size=3,  # size of the hall of fame
        top_k_importances=25,  # how many top features to output as final ranking
        memory_storage="./memory",
        representation_type="symbolic",  # or neurosymbolic or neural
        learner=custom_classifier,
        learner_hyperparameters=classifier_hyperparameters)

    autoBOTLibObj.evolve(
        nind=10,  ## population size
        strategy="evolution",  ## optimization strategy
        crossover_proba=0.6,  ## crossover rate
        mutpb=0.4)  ## mutation rate
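## Sketch of the natural next step: once evolved, the object with the custom
## learner exposes the same predict interface used throughout these examples
## (the insults test split is assumed, as above).
def predict_with_custom_learner(autoBOTLibObj):
    dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
    test_sequences = dataframe2['text_a'].values.tolist()
    return autoBOTLibObj.predict(test_sequences)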
tfidf_word_unigram = TfidfVectorizer(ngram_range=(1, 2),
                                     sublinear_tf=False,
                                     max_features=100)
tfidf_char_bigram = TfidfVectorizer(analyzer='char',
                                    ngram_range=(1, 2),
                                    max_features=100)

## Note: You can use any transformer class implemented in accordance with the
## scikit-learn API (.fit, .transform, .fit_transform, .get_feature_names, etc.).

## Next, put them into a list. Note the use of the text_col class.
custom_features = [
    ('word_features',
     pipeline.Pipeline([
         ('s1', autoBOTLib.feature_constructors.text_col(key='no_stopwords')),
         ('word_tfidf_unigram', tfidf_word_unigram)
     ])),
    ('char_features',
     pipeline.Pipeline([
         ('s2', autoBOTLib.feature_constructors.text_col(key='no_stopwords')),
         ('char_tfidf_bigram', tfidf_char_bigram)
     ]))
]

## Finally, pass this list via the custom_transformer_pipeline argument.
autoBOTLibObj = autoBOTLib.GAlearner(
    train_sequences,
    train_targets,
    time_constraint=1,
    custom_transformer_pipeline=custom_features).evolve()

dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
test_sequences = dataframe2['text_a'].values.tolist()
predictions = autoBOTLibObj.predict(test_sequences)
## A simple example showcasing the minimal use case of autoBOTLib on the insults classification data.
import autoBOTLib
import pandas as pd

## Load example data frame
dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
train_sequences = dataframe['text_a'].values.tolist()
train_targets = dataframe['label'].values

autoBOTLibObj = autoBOTLib.GAlearner(train_sequences,
                                     train_targets,
                                     time_constraint=0.1).evolve()

dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
test_sequences = dataframe2['text_a'].values.tolist()
predictions = autoBOTLibObj.predict(test_sequences)
## These are the hyperparameters to be explored for each representation.
classifier_hyperparameters = {
    "loss": ["hinge"],
    "penalty": ["elasticnet"],
    "alpha": [0.01, 0.001],
    "l1_ratio": [0, 0.001, 1]
}

## This is the classifier compatible with the hyperparameters.
custom_classifier = SGDClassifier()

autoBOTLibObj = autoBOTLib.GAlearner(
    train_sequences,  # input sequences
    train_targets,  # target space
    time_constraint=1,  # time in hours
    num_cpu=4,  # number of CPUs to use
    task_name="example test",  # task identifier
    hof_size=3,  # size of the hall of fame
    top_k_importances=25,  # how many top features to output as final ranking
    memory_storage="./memory",
    representation_type="symbolic",  # or neurosymbolic or neural
    classifier=custom_classifier,
    classifier_hyperparameters=classifier_hyperparameters)

autoBOTLibObj.evolve(
    nind=10,  ## population size
    strategy="evolution",  ## optimization strategy
    crossover_proba=0.6,  ## crossover rate
    mutpb=0.4)  ## mutation rate
tfidf_word_unigram = TfidfVectorizer(ngram_range=(1, 2),
                                     sublinear_tf=False,
                                     max_features=100)
tfidf_char_bigram = TfidfVectorizer(analyzer='char',
                                    ngram_range=(1, 2),
                                    max_features=100)

custom_features = [
    ('word_features_custom',
     pipeline.Pipeline([
         ('s1', autoBOTLib.feature_constructors.text_col(key='no_stopwords')),
         ('word_tfidf_unigram', tfidf_word_unigram)
     ])),
    ('char_features_custom',
     pipeline.Pipeline([
         ('s2', autoBOTLib.feature_constructors.text_col(key='no_stopwords')),
         ('char_tfidf_bigram', tfidf_char_bigram)
     ]))
]

## Finally, use the flag "combine_with_existing_representation" to append the new
## transformer pipeline to an existing one (e.g., neurosymbolic). This way, you can
## easily extend the current autoBOTLib!
autoBOTLibObj = autoBOTLib.GAlearner(
    train_sequences,
    train_targets,
    time_constraint=1,
    representation_type="neurosymbolic",
    custom_transformer_pipeline=custom_features,
    combine_with_existing_representation=True).evolve()

dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
test_sequences = dataframe2['text_a'].values.tolist()
predictions = autoBOTLibObj.predict(test_sequences)
import autoBOTLib
import pandas as pd

## Load example data frame
dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
train_sequences = dataframe['text_a'].values.tolist()
train_targets = dataframe['label'].values

autoBOTLibObj = autoBOTLib.GAlearner(
    train_sequences,  # input sequences
    train_targets,  # target space
    time_constraint=1,  # time in hours
    num_cpu="all",  # number of CPUs to use
    latent_dim=512,  ## latent dim for neural representations
    sparsity=0.05,  ## latent_dim/sparsity dim for sparse representations
    task_name="example test",  # task identifier
    scoring_metric="f1",  # sklearn-compatible scoring metric as the fitness
    hof_size=3,  # size of the hall of fame
    top_k_importances=25,  # how many top features to output as final ranking
    memory_storage="./memory",  # triplet base for concept features
    representation_type="neurosymbolic")  # or symbolic or neural

autoBOTLibObj.evolve(
    nind=8,  ## population size
    strategy="evolution",  ## optimization strategy
    crossover_proba=0.6,  ## crossover rate
    mutpb=0.4)  ## mutation rate

## Persistence demonstration (how to store models for further use)
autoBOTLib.store_autobot_model(
    autoBOTLibObj, "../stored_models/example_insults_model.pickle")
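## Loading the stored model back (a sketch mirroring the persistence example
## earlier in this section; the path matches the store call above):
autoBOTLibObj = autoBOTLib.load_autobot_model(
    "../stored_models/example_insults_model.pickle")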