import pickle
from functools import partial

import numpy as np  # needed if the loguniform line below is re-enabled
from hyperopt import Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope


def main(args, max_evals):
    # Hyperparameter space
    model_path = f"./checkpoint/{args.experiment_id}_ckpt.pth"
    trials_path = f"./results/{args.experiment_id}_trials.p"
    display_step = args.iterations // 100 if args.iterations > 1000 else 10  # relative display steps
    space = {
        # ------------------------------------- Architecture -------------------------------------#
        'experiment_id': hp.choice(label='experiment_id', options=[args.experiment_id]),
        'input_size': hp.choice(label='input_size', options=[SIZE]),
        'n_classes': hp.choice(label='n_classes', options=[4_000]),
        # ----------------------------- Optimization / Regularization ----------------------------#
        'iterations': hp.choice(label='iterations', options=[args.iterations]),
        'display_step': scope.int(hp.choice(label='display_step', options=[display_step])),
        'batch_size': scope.int(hp.choice(label='batch_size', options=[512])),
        # 'initial_lr': hp.loguniform(label='lr', low=np.log(5e-3), high=np.log(0.1)),
        'initial_lr': scope.float(hp.choice(label='initial_lr', options=[0.1])),
        'lr_decay': scope.float(hp.choice(label='lr_decay', options=[0.5])),
        'adjust_lr_step': hp.choice(label='adjust_lr_step', options=[300_000 // 3]),
        'weight_decay': hp.choice(label='weight_decay', options=[5e-4]),
        'with_center_loss': hp.choice(label='with_center_loss', options=[bool(args.with_center_loss)]),
        'initial_clr': hp.choice(label='initial_clr', options=[0.01, 0.05, 0.1, 0.5]),
        'alpha': hp.choice(label='alpha', options=[0.1, 0.01]),
        # 'display_step': scope.int(hp.choice(label='eval_epochs', options=[3_000])),
        # -------------------------------------- Others --------------------------------------#
        'path': hp.choice(label='path', options=[model_path]),
        'trials_path': hp.choice(label='trials_path', options=[trials_path]),
        'random_seed': scope.int(hp.quniform('random_seed', 1, 10, 1)),
    }

    # Hyperparameter search
    trials = Trials()
    fmin_objective = partial(fit_and_log, trials=trials, verbose=True)
    best_model = fmin(fmin_objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)

    # Save output
    with open(trials_path, "wb") as f:
        pickle.dump(trials, f)
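
# The objective bound to fmin above (fit_and_log) is defined elsewhere. Below is a
# minimal, hypothetical sketch of the contract hyperopt expects from it: the function
# receives one sampled configuration and returns a dict with a "loss" to minimize and
# a STATUS_OK flag. The toy loss over 'initial_lr' and 'alpha' is a placeholder, not
# the original training logic.
from hyperopt import STATUS_OK


def fit_and_log_sketch(params, trials=None, verbose=False):
    # Stand-in "training": a toy loss over two sampled hyperparameters
    loss = (params["initial_lr"] - 0.05) ** 2 + params["alpha"]
    if verbose and trials is not None:
        print("evals so far:", len(trials.trials))
    return {"loss": loss, "status": STATUS_OK}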
def train(self, training_file, rare_thresh=100, as_text=True, tune_mode=None, size=None, clf_params=None, chosen_clf=None):
    """
    Train the EnsembleSentencer. Note that the underlying estimators are assumed to be pretrained already.

    :param training_file: File in DISRPT shared task .conll format
    :param rare_thresh: Rank of rarest word to include (rarer items are replaced with POS)
    :param as_text: Boolean, whether the input is a string, rather than a file name to read
    :param tune_mode: None for no tuning, "paramwise", "full", or "hyperopt"
    :param size: Sample size to use when tuning
    :param clf_params: Dictionary of classifier parameters
    :param chosen_clf: Classifier instance to train; defaults to DEFAULTCLF
    :return: Best classifier and parameters when tuning, else None
    """

    if tune_mode is not None and size is None:
        size = 5000
        sys.stderr.write("o No sample size set - setting size to 5000\n")

    if clf_params is None:
        # Default classifier parameters
        clf_params = {"n_estimators": 100, "min_samples_leaf": 3, "random_state": 42}
    if chosen_clf is None:
        chosen_clf = DEFAULTCLF

    data_encoded, data_x, data_y, cat_labels, num_labels, multicol_dict, firsts, lasts, top_n_words = self.read_data(training_file, rare_thresh=rare_thresh, as_text=as_text)

    sys.stderr.write("o Learning...\n")

    if tune_mode == "hyperopt":
        from hyperopt import hp
        from hyperopt.pyll.base import scope
        dev_file = training_file.replace("_train", "_dev")
        _, val_x, val_y, _, _, _, _, _, _ = self.read_data(dev_file, rare_thresh=rare_thresh, as_text=False, no_cache=True)
        space = {
            'average': hp.choice('average', ["micro", "weighted"]),
            'n_estimators': scope.int(hp.quniform('n_estimators', 50, 150, 10)),
            'max_depth': scope.int(hp.quniform('max_depth', 3, 35, 1)),
            'eta': scope.float(hp.quniform('eta', 0.01, 0.2, 0.01)),
            'gamma': scope.float(hp.quniform('gamma', 0.01, 0.2, 0.01)),
            'colsample_bytree': hp.choice('colsample_bytree', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
            'subsample': hp.choice('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
            'clf': hp.choice('clf', ["xgb"]),
        }
        best_clf, best_params = hyper_optimize(data_x, data_y, val_x=val_x, val_y=val_y, space=space)
        return best_clf, best_params
    elif tune_mode is not None:
        best_params = {}
        best_params_by_clf = defaultdict(dict)
        # Tune individual params separately for speed, or do complete grid search if building final model
        params_list = [{"n_estimators": [75, 100, 125]},
                       {'max_depth': [7, 10, 15, None]},
                       {"min_samples_split": [2, 5, 10]},
                       {"min_samples_leaf": [1, 2, 3]}]
        if tune_mode == "full":
            # Flatten dictionary if doing full CV
            params_list = [{k: v for d in params_list for k, v in d.items()}]
        best_score = -10000
        for clf in [RandomForestClassifier(), ExtraTreesClassifier(), GradientBoostingClassifier()]:
            for params in params_list:
                base_params = copy.deepcopy(clf_params)  # Copy default params
                if clf.__class__.__name__ != "GradientBoostingClassifier":
                    base_params.update({"n_jobs": 4, "oob_score": True, "bootstrap": True})
                for p in params:
                    if p in base_params:  # Ensure base_params don't conflict with grid search params
                        base_params.pop(p)
                clf.set_params(**base_params)
                grid = GridSearchCV(clf, params, cv=3, n_jobs=3, error_score="raise", refit=False, scoring="f1")
                grid.fit(data_x, data_y)
                if tune_mode == "full":
                    if grid.best_score_ > best_score:
                        best_score = grid.best_score_
                        best_clf = clf
                        for param in params:
                            best_params[param] = grid.best_params_[param]
                else:
                    if grid.best_score_ > best_score:
                        # Track the best-scoring classifier seen so far
                        best_score = grid.best_score_
                        best_clf = clf
                    for param in params:
                        best_params_by_clf[clf.__class__.__name__][param] = grid.best_params_[param]
        if tune_mode == "paramwise":
            best_params = best_params_by_clf[best_clf.__class__.__name__]
        else:
            best_params["best_score"] = best_score

        clf_name = best_clf.__class__.__name__
        with io.open(segmenters_dir + os.sep + "params" + os.sep + "EnsembleConnective" + self.auto + "_best_params.tab", 'a', encoding="utf8") as bp:
            corpus = os.path.basename(training_file).split("_")[0]
            for k, v in best_params.items():
                bp.write("\t".join([corpus, clf_name, k, str(v)]) + "\n")
        self.clf = best_clf
        return best_clf, best_params
    else:
        clf = chosen_clf
        clf.set_params(**clf_params)
        if clf.__class__.__name__ != "GradientBoostingClassifier":
            clf.set_params(**{"n_jobs": 3, "oob_score": True, "bootstrap": True})
        clf.set_params(**{"random_state": 42})
        clf.fit(data_x, data_y)
        self.clf = clf

    feature_names = cat_labels + num_labels
    zipped = zip(feature_names, clf.feature_importances_)
    sorted_zip = sorted(zipped, key=lambda x: x[1], reverse=True)
    sys.stderr.write("o Feature importances:\n\n")
    for name, importance in sorted_zip:
        sys.stderr.write(name + "=" + str(importance) + "\n")
    # sys.stderr.write("\no OOB score: " + str(clf.oob_score_) + "\n")

    sys.stderr.write("\no Serializing model...\n")
    joblib.dump((clf, num_labels, cat_labels, multicol_dict, top_n_words, firsts, lasts), self.model, compress=3)
from functools import partial

import numpy as np
from hyperopt import Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope


def BayesSearch(X, y):
    """Search hyperparameters for the globally selected MODEL using TPE."""
    global MODEL
    if MODEL == "log_reg":
        param_space = {
            "solver": hp.choice("solver", ["newton-cg", "saga", "lbfgs"]),
            "max_iter": scope.int(hp.uniform("max_iter", 100, 1500)),
            "C": scope.float(hp.lognormal("C", 0.0001, 3)),
        }
    elif MODEL == "sgd":
        param_space = {
            "loss": hp.choice("loss", ["log", "modified_huber"]),
            "penalty": hp.choice("penalty", ["l2", "l1", "elasticnet"]),
            "alpha": scope.float(hp.uniform("alpha", 0.001, 1)),
            "max_iter": scope.int(hp.uniform("max_iter", 100, 1500)),
        }
    elif MODEL == "rftree":
        param_space = {
            "max_depth": scope.int(hp.quniform("max_depth", 6, 15, 1)),
            "n_estimators": scope.int(hp.quniform("n_estimators", 100, 1000, 1)),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            "max_features": hp.choice("max_features", ["auto", "log2"]),
            "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 6, 100, 1)),
            "min_samples_split": scope.int(hp.quniform("min_samples_split", 6, 100, 1)),
            # 'bootstrap': hp.choice('bootstrap', [True, False]),
        }
    elif MODEL == "extree":
        param_space = {
            "max_depth": scope.int(hp.quniform("max_depth", 5, 25, 1)),
            "n_estimators": scope.int(hp.quniform("n_estimators", 100, 2000, 1)),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            "max_features": hp.choice("max_features", ["auto", "log2"]),
            "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 3, 100, 1)),
            "min_samples_split": scope.int(hp.quniform("min_samples_split", 3, 100, 1)),
            # 'bootstrap': hp.choice('bootstrap', [True, False]),
        }
    elif MODEL == "lgbm":
        param_space = {
            "num_leaves": scope.int(hp.uniform("num_leaves", 10, 1000)),
            "max_depth": scope.int(hp.uniform("max_depth", 6, 100)),
            "cat_smooth": scope.int(hp.uniform("cat_smooth", 1, 100)),
            "subsample": scope.float(hp.uniform("subsample", 0.4, 1)),
            "colsample_bytree": scope.float(hp.uniform("colsample_bytree", 0.4, 1)),
            # "subsample_freq": scope.int(hp.uniform("subsample_freq", 1, 20)),
            "min_child_samples": scope.int(hp.uniform("min_child_samples", 2, 100)),
            "min_split_gain": scope.float(hp.loguniform("min_split_gain", np.log(0.001), np.log(10))),
            "reg_alpha": scope.float(hp.loguniform("reg_alpha", np.log(0.001), np.log(10))),
            "reg_lambda": scope.float(hp.loguniform("reg_lambda", np.log(0.001), np.log(10))),
        }
    elif MODEL == "xgbm":
        param_space = {
            "max_depth": scope.int(hp.quniform("max_depth", 6, 10, 1)),
            "subsample": scope.float(hp.uniform("subsample", 0.4, 1)),
            "colsample_bytree": scope.float(hp.uniform("colsample_bytree", 0.4, 1)),
            "gamma": scope.int(hp.quniform("gamma", 0, 20, 1)),
            "reg_alpha": scope.float(hp.uniform("reg_alpha", 0.01, 1)),
            "reg_lambda": scope.float(hp.uniform("reg_lambda", 0.01, 1)),
            # "scale_pos_weight": scope.float(hp.uniform("scale_pos_weight", 0.001, 1)),
        }
    else:
        raise ValueError("Unknown MODEL: " + str(MODEL))

    # Optimize the objective over the chosen space
    trials = Trials()
    optimization_function = partial(optimize, X=X, y=y)
    result = fmin(
        fn=optimization_function,
        space=param_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=trials,
        verbose=1,
    )
    print("Best Result is:", "_" * 10, result)
    return result, trials
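
# The `optimize` callable bound with functools.partial above is defined elsewhere.
# A minimal sketch consistent with this usage is shown below for the MODEL == "rftree"
# case: cross-validate the sampled parameters and return a loss for hyperopt to
# minimize. The name optimize_sketch and the CV/scoring choices are assumptions; the
# other MODEL branches would map to their own estimators the same way.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


def optimize_sketch(params, X, y):
    model = RandomForestClassifier(**params)
    # hyperopt minimizes, so return the negated mean CV accuracy
    return -cross_val_score(model, X, y, cv=3, scoring="accuracy").mean()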
def train(self, train_file, lexicon_file=None, freq_file=None, test_prop=0.1, output_importances=False,
          dump_model=False, cross_val_test=False, output_errors=False, ablations=None,
          dump_transformed_data=False, do_shuffle=True, conf=None):
    """
    :param train_file: File with segmentations to train on in one of the two formats described in make_prev_next()
    :param lexicon_file: Tab delimited lexicon file with full forms in first column and POS tag in second column (multiple rows per form possible)
    :param freq_file: Tab delimited file with segment forms and their frequencies as integers in two columns
    :param test_prop: (0.0 -- 0.99) Proportion of shuffled data to test on
    :param output_importances: Whether to print feature importances (only if test proportion > 0.0)
    :param dump_model: Whether to dump the trained model to disk via joblib
    :param cross_val_test: Whether to perform cross-validation for hyperparameter optimization
    :param output_errors: Whether to output prediction errors to a file 'errs.txt'
    :param ablations: Comma separated string of feature names to ablate, e.g. "freq_ratio,prev_grp_pos,next_grp_pos"
    :param dump_transformed_data: If True, transform data to a pandas dataframe and write to disk, then quit
        (useful to train other approaches on the same features, e.g. a DNN classifier)
    :param do_shuffle: Whether training data is shuffled after context extraction but before the test partition
        is created (this has no effect if training on the whole training corpus)
    :param conf: Configuration file for training (by default: <MODELNAME>.conf)
    :return: None
    """
    import timing  # project-local timing module

    self.read_conf_file(file_name=conf)
    pos_lookup = read_lex(self.short_pos, lexicon_file)
    self.pos_lookup = pos_lookup
    conf_file_parser = self.conf_file_parser
    letter_config = LetterConfig(self.letters, self.conf["vowels"], self.pos_lookup)

    np.random.seed(42)

    if lexicon_file is None:
        print("i WARN: No lexicon file provided, learning purely from examples")

    seg_table = io.open(train_file, encoding="utf8").read()
    seg_table = seg_table.replace("\r", "").strip()
    for c in self.conf["diacritics"]:  # TODO: configurable diacritic removal
        pass
        # seg_table = seg_table.replace(c, "")
    seg_table = seg_table.split("\n")

    sys.stderr.write("o Encoding training data\n")

    # Validate training data
    non_tab_lines = 0
    non_tab_row = 0
    for r, line in enumerate(seg_table):
        if line.count("\t") < 1:
            non_tab_lines += 1
            non_tab_row = r
    if non_tab_lines > 0:
        sys.stderr.write("FATAL: found " + str(non_tab_lines) + " rows in training data not containing tab\n")
        sys.stderr.write("       Last occurrence at line: " + str(non_tab_row) + "\n")
        sys.exit()

    # Make into four cols: prev \t next \t current \t segmented
    # (unless already receiving such a table, for shuffled datasets)
    if seg_table[0].count("\t") == 1:
        seg_table = make_prev_next(seg_table)

    # Ensure OOV symbol is in data
    seg_table = ["_\t_\t_\t_"] + seg_table

    data_y = []
    words = []
    all_encoded_groups = []

    encoding_cache = {}
    non_ident_segs = 0

    shuffle_mapping = list(range(len(seg_table)))
    zipped = list(zip(seg_table, shuffle_mapping))

    # Shuffle table to sample across entire dataset if desired
    if do_shuffle and False:  # NOTE: shuffling is currently disabled
        random.Random(24).shuffle(zipped)
        seg_table, shuffle_mapping = zip(*zipped)

    headers = bg2array("_________", prev_group="_", next_group="_", print_headers=True, is_test=1, grp_id=1, config=letter_config)

    word_idx = -1
    bug_rows = []

    # Collect segment frequencies, normalized to proportions
    freqs = defaultdict(float)
    total_segs = 0.0
    flines = io.open(freq_file, encoding="utf8").read().replace("\r", "").split("\n") if freq_file is not None else []
    for l in flines:
        if l.count("\t") == 1:
            w, f = l.split("\t")
            freqs[w] += float(f)
            total_segs += float(f)

    for u in freqs:
        freqs[u] = freqs[u] / total_segs

    # Don't use freqs if they're empty
    if len(freqs) == 0:
        sys.stderr.write("o No segment frequencies provided, adding 'freq_ratio' to ablated features\n")
        if ablations is None:
            ablations = "freq_ratio"
        elif "freq_ratio" not in ablations:
            ablations += ",freq_ratio"

    step = int(1 / test_prop) if test_prop > 0 else 0
    test_indices = list(range(len(seg_table)))[0::step] if step > 0 else []
    test_rows = []

    for row_idx, row in enumerate(seg_table):
        is_test = 1 if row_idx in test_indices else 0

        prev_group, next_group, bound_group, segmentation = row.split("\t")
        if bound_group != "|":
            if len(bound_group) != len(segmentation.replace("|", "")):  # Ignore segmentations that also normalize
                non_ident_segs += 1
                bug_rows.append((row_idx, bound_group, segmentation.replace("|", "")))
                continue

        if dump_transformed_data:
            if is_test:
                test_rows.append(bound_group + "\t" + segmentation)

        word_idx += 1
        words.append(bound_group)

        group_type = "_".join([x for x in [prev_group, next_group, bound_group] if x != ""])
        if group_type in encoding_cache:  # No need to encode, an identical featured group has already been seen
            encoded_group = encoding_cache[group_type]
            for c in encoded_group:
                c[headers.index("is_test")] = is_test  # Make sure that this group's test index is correctly assigned
        else:
            encoded_group = bg2array(bound_group, prev_group=prev_group, next_group=next_group, is_test=is_test,
                                     grp_id=word_idx, config=letter_config, train=True, freqs=freqs)
            encoding_cache[group_type] = encoded_group
        all_encoded_groups += encoded_group
        data_y += segs2array(segmentation)

    sys.stderr.write("o Finished encoding " + str(len(data_y)) + " chars (" + str(len(seg_table)) + " groups, " + str(len(encoding_cache)) + " group types)\n")

    if non_ident_segs > 0:
        with io.open("bug_rows.txt", 'w', encoding="utf8") as f:
            f.write("\n".join([str(r) + ": " + g + "<>" + s for r, g, s in sorted([[shuffle_mapping[x], g, s] for x, g, s in bug_rows])]) + "\n")

        sys.stderr.write("i WARN: found " + str(non_ident_segs) + " rows in training data where left column characters are not identical to right column characters\n")
        sys.stderr.write("        Row numbers dumped to: bug_rows.txt\n")
        sys.stderr.write("        " + str(non_ident_segs) + " rows were ignored in training\n\n")

    data_y = np.array(data_y)

    # Remove features switched off in .conf file
    for label in self.conf["unused"]:
        if label in cat_labels:
            cat_labels.remove(label)
        if label in num_labels:
            num_labels.remove(label)

    # Handle temporary ablations if specified in option -a
    if ablations is not None:
        sys.stderr.write("o Applying ablations\n")
        if len(ablations) > 0 and ablations != "none":
            abl_feats = ablations.split(",")
            sys.stderr.write("o Ablating features:\n")
            for feat in abl_feats:
                found = False
                if feat in cat_labels:
                    cat_labels.remove(feat)
                    found = True
                elif feat in num_labels:
                    num_labels.remove(feat)
                    found = True
                if found:
                    sys.stderr.write("\t" + feat + "\n")
                else:
                    sys.stderr.write("\tERR: can't find ablation feature " + feat + "\n")
                    sys.exit()

    sys.stderr.write("o Creating dataframe\n")
    data_x = pd.DataFrame(all_encoded_groups, columns=headers)

    if dump_transformed_data:
        data_x["resp"] = data_y
        import csv
        to_remove = ["is_test", "grp_id"]  # Columns to remove from transformed data dump
        out_cols = [col for col in headers if col not in to_remove] + ["resp"]  # Add the response column as 'resp'
        data_x.iloc[data_x.index[data_x["is_test"] == 0]].to_csv("rftokenizer_train_featurized.tab", sep="\t", quotechar="", quoting=csv.QUOTE_NONE, encoding="utf8", index=False, columns=out_cols)
        data_x.iloc[data_x.index[data_x["is_test"] == 1]].to_csv("rftokenizer_test_featurized.tab", sep="\t", quotechar="", quoting=csv.QUOTE_NONE, encoding="utf8", index=False, columns=out_cols)
        # Dump raw test rows to compare with the gold solution
        with io.open("rftokenizer_test_gold.tab", "w", encoding="utf8") as gold:
            gold.write("\n".join(test_rows) + "\n")
        sys.stderr.write("o Wrote featurized train/test set and gold test to rftokenizer_*.tab\n")
        sys.exit()

    data_x_enc, multicol_dict = multicol_fit_transform(data_x, pd.Index(cat_labels))

    if test_prop > 0:
        sys.stderr.write("o Generating train/test split with test proportion " + str(test_prop) + "\n")

    data_x_enc["boundary"] = data_y
    strat_train_set = data_x_enc.iloc[data_x_enc.index[data_x_enc["is_test"] == 0]]
    strat_test_set = data_x_enc.iloc[data_x_enc.index[data_x_enc["is_test"] == 1]]

    sys.stderr.write("o Transforming data to numerical array\n")
    train_x = strat_train_set[cat_labels + num_labels].values
    train_y = strat_train_set["boundary"]
    train_y_bin = np.where(strat_train_set['boundary'] == 0, 0, 1)

    if test_prop > 0:
        test_x = strat_test_set[cat_labels + num_labels].values
        test_y_bin = np.where(strat_test_set['boundary'] == 0, 0, 1)
        bound_grp_idx = np.array(strat_test_set['grp_id'])

        from sklearn.dummy import DummyClassifier
        d = DummyClassifier(strategy="most_frequent")
        d.fit(train_x, train_y_bin)
        pred = d.predict(test_x)
        print("o Majority baseline:")
        print("\t" + str(accuracy_score(test_y_bin, pred)))

    # Classifier used in 2018 paper:
    # clf = ExtraTreesClassifier(n_estimators=250, max_features=None, n_jobs=3, random_state=42)
    # Use xgboost for slightly better accuracy than the paper
    from xgboost import XGBClassifier
    clf = XGBClassifier(n_estimators=230, n_jobs=3, random_state=42, max_depth=17, subsample=1.0,
                        colsample_bytree=0.6, eta=.07, gamma=.09)

    if cross_val_test:
        # Tune hyperparameters via TPE search
        from hyperopt import hp
        from hyperopt.pyll import scope

        space = {
            'n_estimators': scope.int(hp.quniform('n_estimators', 100, 250, 10)),
            'max_depth': scope.int(hp.quniform('max_depth', 8, 35, 1)),
            'eta': scope.float(hp.quniform('eta', 0.01, 0.2, 0.01)),
            'gamma': scope.float(hp.quniform('gamma', 0.01, 0.2, 0.01)),
            'colsample_bytree': hp.choice('colsample_bytree', [0.6, 0.7, 0.8, 1.0]),
            'subsample': hp.choice('subsample', [0.6, 0.7, 0.8, 0.9, 1.0]),
            'clf': hp.choice('clf', ["xgb"]),
        }

        if test_prop > 0:
            best_clf, best_params = hyper_optimize(train_x, train_y_bin, val_x=test_x, val_y=test_y_bin, space=space, max_evals=20)
        else:
            best_clf, best_params = hyper_optimize(train_x, train_y_bin, val_x=None, val_y=None, space=space, max_evals=100)
        clf = best_clf
        print("\nBest parameters:\n" + 30 * "=")
        print(best_params)
        sys.exit()

    sys.stderr.write("o Learning...\n")
    clf.fit(train_x, train_y_bin)

    if test_prop > 0:
        pred = clf.predict(test_x)
        j = -1
        for i, row in strat_test_set.iterrows():
            j += 1
            if row["idx"] + 1 == row["len_bound_group"]:  # The last character of a group can never be a boundary
                pred[j] = 0

        print("o Binary clf accuracy:")
        print("\t" + str(accuracy_score(test_y_bin, pred)))

        group_results = defaultdict(lambda: 1)
        for i in range(len(pred)):
            grp = bound_grp_idx[i]
            if test_y_bin[i] != pred[i]:
                group_results[grp] = 0

        correct = 0
        total = 0
        for grp in set(bound_grp_idx):
            if group_results[grp] == 1:
                correct += 1
            total += 1
        print("o Perfect bound group accuracy:")
        print("\t" + str(float(correct) / total))

        errs = defaultdict(int)
        for i, word in enumerate(words):
            if i in group_results:
                if group_results[i] == 0:
                    errs[word] += 1

        if output_errors:
            print("o Writing prediction errors to errs.txt")
            with io.open("errs.txt", 'w', encoding="utf8") as f:
                for err in errs:
                    f.write(err + "\t" + str(errs[err]) + "\n")
    else:
        print("o Test proportion is 0%, skipping evaluation")

    if output_importances:
        feature_names = cat_labels + num_labels
        zipped = zip(feature_names, clf.feature_importances_)
        sorted_zip = sorted(zipped, key=lambda x: x[1], reverse=True)
        print("o Feature importances:\n")
        for name, importance in sorted_zip:
            print(name, "=", importance)

    if dump_model:
        plain_dict_pos_lookup = {}
        plain_dict_pos_lookup.update(pos_lookup)
        joblib.dump((clf, num_labels, cat_labels, multicol_dict, plain_dict_pos_lookup, freqs, conf_file_parser),
                    self.lang + ".sm" + str(sys.version_info[0]), compress=3)
        print("o Dumped trained model to " + self.lang + ".sm" + str(sys.version_info[0]))
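
# Why the scope.int / scope.float wrappers in the search spaces above: hp.quniform
# samples floats (e.g. 230.0), which estimators like XGBClassifier reject for integer
# parameters such as n_estimators. scope.int inserts a cast into the search graph so
# sampled values arrive as ints. A quick self-contained check:
from hyperopt import hp
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample

int_space = {"n_estimators": scope.int(hp.quniform("n_estimators", 100, 250, 10))}
print(type(sample(int_space)["n_estimators"]))  # expected: <class 'int'> rather than float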
def train(self, training_file, rare_thresh=200, clf_params=None, chosen_feats=None, tune_mode=None,
          size=None, as_text=True, multitrain=False, chosen_clf=DEFAULTCLF):
    """
    :param training_file: File in DISRPT shared task .conll format
    :param rare_thresh: Rank of rarest word to include (rarer items are replaced with POS)
    :param clf_params: Dictionary of classifier parameters
    :param chosen_feats: List of feature names to force a subset of selected features to be used
    :param tune_mode: None for no grid search, "paramwise" to tune each hyperparameter separately,
        or "full" for complete grid (best but slowest)
    :param size: Sample size to optimize variable importance with
    :param as_text: Boolean, whether the input is a string, rather than a file name to read
    :param multitrain: Whether to serialize cross-fold training predictions
    :param chosen_clf: Classifier instance to train when not tuning
    :return:
    """

    if tune_mode is not None and size is None:
        size = 5000
        sys.stderr.write("o No sample size set - setting size to 5000\n")

    if clf_params is None:
        # Default classifier parameters
        clf_params = {"n_estimators": 150, "min_samples_leaf": 3, "random_state": 42}
        if DEFAULTCLF.__class__.__name__ not in ["GradientBoostingClassifier", "CatBoostClassifier", "XGBClassifier"]:
            clf_params.update({"n_jobs": 4, "oob_score": True, "bootstrap": True})

    data_encoded, data_x, data_y, cat_labels, num_labels, multicol_dict, firsts, lasts, top_n_words = self.read_data(
        training_file, size, as_text=as_text, rare_thresh=rare_thresh, chosen_feats=chosen_feats)
    sys.stderr.write("o Learning...\n")

    if tune_mode is not None:
        # Randomly select |size| samples for training and leave the rest for validation, max |size| samples
        data_x = data_encoded[cat_labels + num_labels + ["label"]].sample(frac=1, random_state=42)
        data_y = np.where(data_x['label'] == "_", 0, 1)
        data_x = data_x[cat_labels + num_labels]
        if len(data_y) > 2 * size:
            val_x = data_x[size:2 * size]
            val_y = data_y[size:2 * size]
        else:
            val_x = data_x[size:]
            val_y = data_y[size:]
        data_x = data_x[:size]
        data_y = data_y[:size]

    if tune_mode == "importances":
        sys.stderr.write("o Measuring correlation of categorical variables\n")
        theil_implications = report_theils_u(val_x, cat_labels)
        for var1, var2 in theil_implications:
            if var1 in cat_labels and var2 in cat_labels and var2 != "word":
                drop_var = var2
                u = theil_implications[(var1, var2)]
                sys.stderr.write("o Removed feature " + drop_var + " due to Theil's U " + str(u)[:6] + " of " + var1 + "->" + var2 + "\n")
                cat_labels.remove(drop_var)

        sys.stderr.write("o Measuring correlation of numerical variables\n")
        cor_mat = report_correlations(val_x[num_labels], thresh=0.95)
        for var1, var2 in cor_mat:
            if var1 in num_labels and var2 in num_labels:
                drop_var = var2  # if imp[var1] > imp[var2] else var1
                if drop_var == "word":
                    continue
                corr_level = cor_mat[(var1, var2)]
                sys.stderr.write("o Removed feature " + drop_var + " due to correlation " + str(corr_level) + " of " + var1 + ":" + var2 + "\n")
                num_labels.remove(drop_var)

        return cat_labels, num_labels

    if tune_mode in ["paramwise", "full"]:
        # Grid search
        best_clf, best_params = grid_search(data_x, data_y, tune_mode, clf_params)
        clf_name = best_clf.__class__.__name__
        self.clf = best_clf
        return best_clf, best_params
    elif tune_mode == "hyperopt":
        # TPE guided random search
        from hyperopt import hp
        from hyperopt.pyll.base import scope
        val_x, val_y = None, None
        if self.corpus_dir is not None:
            dev_file = self.corpus_dir + os.sep + self.corpus + "_dev.conll"
            _, val_x, val_y, _, _, _, _, _, _ = self.read_data(
                dev_file, size, as_text=False, rare_thresh=rare_thresh, chosen_feats=chosen_feats)
        space = {
            'n_estimators': scope.int(hp.quniform('n_estimators', 100, 250, 10)),
            'max_depth': scope.int(hp.quniform('max_depth', 3, 30, 1)),
            'eta': scope.float(hp.quniform('eta', 0.01, 0.2, 0.01)),
            'gamma': scope.float(hp.quniform('gamma', 0.01, 0.2, 0.01)),
            'colsample_bytree': hp.choice('colsample_bytree', [0.4, 0.5, 0.6, 0.7, 1.0]),
            'subsample': hp.choice('subsample', [0.5, 0.6, 0.7, 0.8, 1.0]),
            'clf': hp.choice('clf', ["xgb"]),
        }
        # Validate on the dev set if one was read above
        best_clf, best_params = hyper_optimize(data_x.values, data_y, val_x=val_x, val_y=val_y, space=space, max_evals=20)
        return best_clf, best_params
    else:  # No hyperparameter optimization
        clf = chosen_clf if chosen_clf is not None else DEFAULTCLF
        sys.stderr.write("o Setting params " + str(clf_params) + "\n")
        clf.set_params(**clf_params)
        if clf.__class__.__name__ not in ["GradientBoostingClassifier", "CatBoostClassifier", "XGBClassifier"]:
            clf.set_params(**{"n_jobs": 3, "oob_score": True, "bootstrap": True})
        if clf.__class__.__name__ in ["XGBClassifier"]:
            clf.set_params(**{"n_jobs": 3})
        clf.set_params(**{"random_state": 42})
        if multitrain:
            multitrain_preds = get_multitrain_preds(clf, data_x, data_y, self.multifolds)
            multitrain_preds = "\n".join(multitrain_preds.strip().split("\n")[1:-1])  # Remove OOV tokens at start and end
            with io.open(script_dir + os.sep + "multitrain" + os.sep + self.name + self.auto + '_' + self.corpus, 'w', newline="\n") as f:
                sys.stderr.write("o Serializing multitraining predictions\n")
                f.write(multitrain_preds)
        if clf.__class__.__name__ == "CatBoostClassifier":
            clf.fit(data_x, data_y, cat_features=list(range(len(cat_labels))))
        else:
            clf.fit(data_x, data_y)
        self.clf = clf

    feature_names = cat_labels + num_labels
    sys.stderr.write("o Using " + str(len(feature_names)) + " features\n")

    zipped = zip(feature_names, clf.feature_importances_)
    sorted_zip = sorted(zipped, key=lambda x: x[1], reverse=True)
    sys.stderr.write("o Feature Gini importances:\n\n")
    for name, importance in sorted_zip:
        sys.stderr.write(name + "=" + str(importance) + "\n")

    if self.clf.__class__.__name__ not in ["GradientBoostingClassifier", "CatBoostClassifier", "XGBClassifier"]:
        sys.stderr.write("\no OOB score: " + str(clf.oob_score_) + "\n\n")

    if tune_mode == "permutation":
        # Filter features based on permutation importance score threshold
        imp = permutation_importances(clf, val_x, val_y)
        for var, score in imp.items():
            if score < 0 and var != "word":
                sys.stderr.write("o Dropping feature " + var + " due to low permutation importance of " + str(score) + "\n")
                if var in cat_labels:
                    cat_labels.remove(var)
                elif var in num_labels:
                    num_labels.remove(var)
        sys.stderr.write("o Measuring correlation of numerical variables\n")
        cor_mat = report_correlations(val_x[num_labels])
        for var1, var2 in cor_mat:
            if var1 in num_labels and var2 in num_labels:
                drop_var = var2 if imp[var1] > imp[var2] else var1
                if drop_var == "word":
                    continue
                corr_level = cor_mat[(var1, var2)]
                sys.stderr.write("o Removed feature " + drop_var + " due to correlation " + str(corr_level) + " of " + var1 + ":" + var2 + "\n")
                num_labels.remove(drop_var)
        return cat_labels, num_labels

    sys.stderr.write("\no Serializing model...\n")
    joblib.dump((clf, num_labels, cat_labels, multicol_dict, top_n_words, firsts, lasts), self.model, compress=3)
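
# `permutation_importances` above is project-defined; the sketch below shows the usual
# technique (score drop when each feature is permuted on held-out data) using sklearn's
# built-in permutation_importance. This is an assumption about its behavior and return
# shape (a dict of feature name -> mean importance), not the project's implementation.
from sklearn.inspection import permutation_importance


def permutation_importances_sketch(clf, val_x, val_y, feature_names):
    result = permutation_importance(clf, val_x, val_y, n_repeats=5, random_state=42)
    return dict(zip(feature_names, result.importances_mean))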