def train_classifier(X_train, y_train, base_learners, method, cl_names):
    """Fit the model selected by ``method`` and pickle its training-set confidence scores.

    ``'AdaBoost'``, ``'AdaCC'``/``'AdaN-CC'`` variants and ``'RareBoost'`` are
    fitted directly.  Every other method is handled as a cost-sensitive
    competitor: one ``train_competitors`` process is started per candidate
    cost ratio and the best-scoring classifier found in the result files
    under ``temp_preds_cdf/`` is kept.

    The array ``numpy.asarray(clf.get_confidence_scores(X_train))`` is pickled
    to ``temp/<method>``.  Nothing is returned.

    Args:
        X_train: Training features, passed unchanged to the estimators.
        y_train: Training labels; iterated to count class frequencies.
        base_learners: Forwarded to the estimators as ``n_estimators``.
        method: Algorithm name; also used to build the file names.
        cl_names: Unused; kept so existing callers keep working.

    Raises:
        ValueError: If ``y_train`` is empty when the cost grid is needed.
        RuntimeError: If no worker left a result file behind.
    """
    if method == 'AdaBoost':
        clf = CostSensitiveAlgorithms(algorithm='AdaBoost', n_estimators=base_learners)
        clf.fit(X_train, y_train)
    elif 'AdaCC' in method or 'AdaN-CC' in method:
        clf = AdaCC(n_estimators=base_learners, algorithm=method)
        clf.fit(X_train, y_train)
    elif 'RareBoost' in method:
        clf = RareBoost(n_estimators=base_learners)
        clf.fit(X_train, y_train)
    else:
        # Rank classes by frequency.  The previous code took the *largest
        # label* as the minority class, which made majority == minority
        # whenever the largest label was also the most frequent one.
        class_counts = Counter(y_train).most_common()
        if not class_counts:
            raise ValueError("y_train is empty; cannot determine class frequencies")
        majority = class_counts[0][0]
        minority = class_counts[-1][0]

        # NOTE: the int ``7`` is kept on purpose: ``str(ratio)`` is part of
        # the result file name.
        ratios = [1., 2., 3., 4., 5., 6., 7, 8., 9., 9.5, 9.9]

        os.makedirs('temp_preds_cdf', exist_ok=True)

        workers = []
        for ratio in ratios:
            worker = Process(target=train_competitors,
                             args=(X_train, y_train, base_learners, method,
                                   majority, minority, ratio))
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()

        best_score = -1
        clf = None
        for ratio in ratios:
            result_path = 'temp_preds_cdf/' + method + str(ratio)
            # A missing file means that ratio produced no result; skip it.
            # These pickles are presumably written by ``train_competitors``
            # (confirm) and must never come from an untrusted source.
            try:
                with open(result_path, 'rb') as filehandle:
                    temp = pickle.load(filehandle)
            except FileNotFoundError:
                continue
            os.remove(result_path)
            # Item 0 is read as the score, item 1 as the fitted classifier.
            if temp[0] > best_score:
                best_score = temp[0]
                clf = temp[1]

        if clf is None:
            raise RuntimeError(
                "no usable result found in temp_preds_cdf/ for method %r" % (method,))

    # The output directory was never created before, unlike temp_preds_cdf.
    os.makedirs('temp', exist_ok=True)
    with open('temp/' + method, 'wb') as filehandle:
        pickle.dump(numpy.asarray(clf.get_confidence_scores(X_train)), filehandle)
def train_classifier(X_train, y_train, base_learners, method, cl_names):
    """Fit the model selected by ``method`` and pickle it to ``temp_features/<method>``.

    ``'AdaBoost'``, ``'AdaCC'``/``'AdaN-CC'`` variants and ``'RareBoost'`` are
    fitted directly.  Every other method is fitted once per candidate cost
    ratio with ``CostSensitiveAlgorithms``; the candidate with the highest
    training-set F1 score is kept (ties go to the later ratio).

    Args:
        X_train: Training features, passed unchanged to the estimators.
        y_train: Training labels; iterated to count class frequencies.
        base_learners: Forwarded to the estimators as ``n_estimators``.
        method: Algorithm name; also used as the output file name.
        cl_names: Unused; kept so existing callers keep working.

    Raises:
        ValueError: If ``y_train`` is empty when the cost grid is needed.
        RuntimeError: If no cost ratio yielded a usable classifier.
    """
    if method == 'AdaBoost':
        clf = CostSensitiveAlgorithms(algorithm='AdaBoost', n_estimators=base_learners)
        clf.fit(X_train, y_train)
    elif 'AdaCC' in method or 'AdaN-CC' in method:
        clf = AdaCC(n_estimators=base_learners, algorithm=method)
        clf.fit(X_train, y_train)
    elif 'RareBoost' in method:
        clf = RareBoost(n_estimators=base_learners)
        clf.fit(X_train, y_train)
    else:
        import logging

        log = logging.getLogger(__name__)

        # Rank classes by frequency.  The previous code took the *largest
        # label* as the minority class, which made majority == minority
        # whenever the largest label was also the most frequent one.
        class_counts = Counter(y_train).most_common()
        if not class_counts:
            raise ValueError("y_train is empty; cannot determine class frequencies")
        majority = class_counts[0][0]
        minority = class_counts[-1][0]

        ratios = [1., 2., 3., 4., 5., 6., 7, 8., 9., 10.]
        best_score = -1
        best_clf = None
        for ratio in ratios:
            # Best effort: one failing ratio must not abort the search, but
            # the failure is logged instead of being silently dropped, and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            try:
                candidate = CostSensitiveAlgorithms(
                    n_estimators=base_learners,
                    algorithm=method,
                    class_weight={minority: 1, majority: ratio / 10.})
                candidate.fit(X_train, y_train)
                # Candidates reporting ``error == 1`` are discarded.
                if candidate.error == 1:
                    continue
                score = f1_score(y_train, candidate.predict(X_train))
            except Exception:
                log.warning("training %s with cost ratio %s failed",
                            method, ratio, exc_info=True)
                continue
            if score >= best_score:
                best_score = score
                best_clf = candidate

        if best_clf is None:
            raise RuntimeError(
                "no cost ratio produced a usable classifier for method %r" % (method,))
        clf = best_clf

    with open('temp_features/' + method, 'wb') as filehandle:
        pickle.dump(clf, filehandle)
def train_and_predict(X_train, y_train, X_test, base_learners, method, cl_names):
    """Train ``method``, then pickle timing stats and test-set predictions.

    Two files are written under ``temp_preds/``:

    * ``stats_<method>``: ``[cpu_seconds]`` for the parameter-free methods,
      or ``[cpu_seconds, best_score, best_ratio]`` for methods whose cost
      ratio is tuned.
    * ``<method>``: ``[clf.predict(X_test), clf.predict_proba(X_test)]``.

    Methods other than the AdaCC / AdaN-CC variants are trained on a
    stratified 60% split of the training data; the remaining 40% is the
    validation set used to pick the cost ratio (AdaMEC variants here, other
    competitors inside ``train_competitors``).

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict on.
        base_learners: Forwarded to the estimators as ``n_estimators``.
        method: Algorithm name; also used to build the file names.
        cl_names: Unused; kept so existing callers keep working.

    Raises:
        ValueError: If the labels are empty when the cost grid is needed.
        RuntimeError: If no competitor worker left a result file behind.
    """
    # Bug fix: this guard used ``or``, which is true for every method name,
    # so AdaCC / AdaN-CC also lost 40% of their training data to a
    # validation set they never use.
    if "AdaCC" not in method and "AdaN-CC" not in method:
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train, y_train, test_size=0.4, stratify=y_train)

    def save_outputs(model, stats):
        # Stats first, predictions second: same order as before.
        with open('temp_preds/stats_' + method, 'wb') as filehandle:
            pickle.dump(stats, filehandle)
        with open('temp_preds/' + method, 'wb') as filehandle:
            pickle.dump([model.predict(X_test), model.predict_proba(X_test)], filehandle)

    def fit_and_save(build_model):
        # The timed region covers construction and fitting only.
        started = process_time()
        model = build_model()
        model.fit(X_train, y_train)
        elapsed = process_time() - started
        save_outputs(model, [elapsed])

    # Parameter-free methods: fit once, no cost search.
    if method == 'AdaBoost':
        fit_and_save(lambda: CostSensitiveAlgorithms(n_estimators=base_learners,
                                                     algorithm=method))
        return
    if 'AdaCC' in method:
        fit_and_save(lambda: AdaCC(n_estimators=base_learners, algorithm=method))
        return
    if 'AdaN-CC' in method:
        fit_and_save(lambda: AdaCC(n_estimators=base_learners,
                                   algorithm=method.replace("N-", ""),
                                   amortised=False))
        return
    if method == 'RareBoost':
        fit_and_save(lambda: RareBoost(n_estimators=base_learners))
        return

    # Rank classes by frequency.  The previous code took the *largest label*
    # as the minority class, which made majority == minority whenever the
    # largest label was also the most frequent one.
    class_counts = Counter(y_train).most_common()
    if not class_counts:
        raise ValueError("y_train is empty; cannot determine class frequencies")
    majority = class_counts[0][0]
    minority = class_counts[-1][0]

    # NOTE: the int ``7`` is kept on purpose: ``str(ratio)`` is part of the
    # result file name used by the competitor workers.
    ratios = [1., 2., 3., 4., 5., 6., 7, 8., 9., 10.]

    if 'AdaMEC' in method:
        # 'AdaMEC_Cal' names also contain 'AdaMEC'; they use the calibrated
        # class, whose ``set_costs`` additionally takes the training labels.
        calibrated = 'AdaMEC_Cal' in method

        started = process_time()
        if calibrated:
            clf = AdaMEC_Cal(n_estimators=base_learners, algorithm=method)
        else:
            clf = AdaMEC(n_estimators=base_learners, algorithm=method)
        clf.fit(X_train, y_train)

        def apply_cost(cost):
            class_weight = {minority: 1, majority: cost / 10.}
            if calibrated:
                clf.set_costs(y_train, class_weight)
            else:
                clf.set_costs(class_weight)

        # The model is fitted once; only the costs change per candidate.
        best_score = -1
        best_cost = ratios[0]
        for cost in ratios:
            apply_cost(cost)
            score = f1_score(y_valid, clf.predict(X_valid))
            if best_score < score:
                best_score = score
                best_cost = cost
        apply_cost(best_cost)
        elapsed = process_time() - started

        save_outputs(clf, [elapsed, best_score, best_cost])
        return

    # Remaining competitors: one worker process per cost ratio.
    workers = []
    for ratio in ratios:
        worker = Process(target=train_competitors,
                         args=(X_train, y_train, X_valid, y_valid, base_learners,
                               method, majority, minority, ratio))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    best_score = -1
    best_ratio = 0
    total_time = 0
    clf = None
    for ratio in ratios:
        result_path = 'temp_preds/' + method + str(ratio)
        # A missing file means that ratio produced no result; skip it.
        # These pickles are presumably written by ``train_competitors``
        # (confirm) and must never come from an untrusted source.
        try:
            with open(result_path, 'rb') as filehandle:
                temp = pickle.load(filehandle)
        except FileNotFoundError:
            continue
        os.remove(result_path)
        # Item 0 is read as the score, item 1 as the classifier and item 2
        # as that worker's training time.
        total_time += temp[2]
        if temp[0] > best_score:
            best_ratio = ratio
            best_score = temp[0]
            clf = temp[1]

    if clf is None:
        raise RuntimeError(
            "no usable result found in temp_preds/ for method %r" % (method,))

    save_outputs(clf, [total_time, best_score, best_ratio])
    return
def train_and_predict(X_train, y_train, base_learners, method):
    """Train ``method`` in debug mode and pickle its per-round diagnostics.

    The diagnostics go to ``temp_preds_AdaCC/<method>``:

    * RareBoost: ``[_class_weights_pos, _class_weights_neg, training_error,
      estimator_weights_pos, estimator_weights_neg]``.
    * Every other method: ``[_class_weights_pos, _class_weights_neg,
      training_error, estimator_alphas_]``.

    Methods that match none of the named branches are handled as
    cost-sensitive competitors: one ``train_competitors`` process is started
    per candidate cost ratio and the best-scoring classifier found in the
    result files is kept.

    Args:
        X_train: Training features, passed unchanged to the estimators.
        y_train: Training labels; iterated to count class frequencies.
        base_learners: Forwarded to the estimators as ``n_estimators``.
        method: Algorithm name; also used to build the file names.

    Raises:
        ValueError: If ``y_train`` is empty when the cost grid is needed.
        RuntimeError: If no competitor worker left a result file behind.
    """
    if method == 'AdaBoost':
        clf = CostSensitiveAlgorithms(algorithm='AdaBoost', n_estimators=base_learners,
                                      debug=True)
        clf.fit(X_train, y_train)
    elif 'AdaCC' in method:
        clf = AdaCC(n_estimators=base_learners, algorithm=method, debug=True)
        clf.fit(X_train, y_train)
    elif 'AdaN-CC' in method or 'AdaN-AC' in method:
        # Only the 'AdaN-AC' spelling was matched before, so 'AdaN-CC*'
        # names fell through to the competitor grid below.  Both spellings
        # are accepted now.
        # NOTE(review): 'AdaN-AC' looks like a stale name - confirm whether
        # it is still used by any caller.
        clf = AdaCC(n_estimators=base_learners, algorithm=method.replace("N-", ""),
                    debug=True, amortised=False)
        clf.fit(X_train, y_train)
    elif 'RareBoost' in method:
        clf = RareBoost(n_estimators=base_learners, debug=True)
        clf.fit(X_train, y_train)
        # RareBoost exposes separate positive/negative estimator weights, so
        # it is dumped here with its own layout.
        with open('temp_preds_AdaCC/' + method, 'wb') as filehandle:
            pickle.dump([
                clf._class_weights_pos,
                clf._class_weights_neg,
                clf.training_error,
                clf.estimator_weights_pos,
                clf.estimator_weights_neg,
            ], filehandle)
        return
    else:
        # Rank classes by frequency.  The previous code took the *largest
        # label* as the minority class, which made majority == minority
        # whenever the largest label was also the most frequent one.
        class_counts = Counter(y_train).most_common()
        if not class_counts:
            raise ValueError("y_train is empty; cannot determine class frequencies")
        majority = class_counts[0][0]
        minority = class_counts[-1][0]

        # NOTE: the int ``7`` is kept on purpose: ``str(ratio)`` is part of
        # the result file name.
        ratios = [1., 2., 3., 4., 5., 6., 7, 8., 9., 10.]

        workers = []
        for ratio in ratios:
            worker = Process(target=train_competitors,
                             args=(X_train, y_train, base_learners, method,
                                   majority, minority, ratio))
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()

        best_score = -1
        clf = None
        for ratio in ratios:
            result_path = 'temp_preds_AdaCC/' + method + str(ratio)
            # A missing file means that ratio produced no result; skip it.
            # These pickles are presumably written by ``train_competitors``
            # (confirm) and must never come from an untrusted source.
            try:
                with open(result_path, 'rb') as filehandle:
                    temp = pickle.load(filehandle)
            except FileNotFoundError:
                continue
            os.remove(result_path)
            # Item 0 is read as the score, item 1 as the fitted classifier.
            if temp[0] > best_score:
                best_score = temp[0]
                clf = temp[1]

        if clf is None:
            raise RuntimeError(
                "no usable result found in temp_preds_AdaCC/ for method %r" % (method,))

    with open('temp_preds_AdaCC/' + method, 'wb') as filehandle:
        pickle.dump([
            clf._class_weights_pos,
            clf._class_weights_neg,
            clf.training_error,
            clf.estimator_alphas_,
        ], filehandle)
def train_and_predict(X_train, y_train, base_learners, method):
    """Train ``method`` and dump the fitted model to ``boundary_temp_preds/<method>``.

    ``'AdaBoost'``, ``'AdaCC'`` variants and ``'RareBoost'`` are fitted
    directly.  ``'AdaMEC'`` variants are fitted once with ``AdaMEC_Cal`` and
    then given the cost ratio with the best training-set F1 score.  Every
    other method is handled as a cost-sensitive competitor: one
    ``train_competitors`` process is started per candidate cost ratio and the
    best-scoring model found in the result files is kept.

    The model is written with ``joblib.dump``.  In the competitor branch
    ``None`` is dumped when no worker left a result file behind.

    Args:
        X_train: Training features, passed unchanged to the estimators.
        y_train: Training labels; iterated to count class frequencies.
        base_learners: Forwarded to the estimators as ``n_estimators``.
        method: Algorithm name; also used to build the file names.

    Raises:
        ValueError: If ``y_train`` is empty when the cost grid is needed.
    """

    def ranked_classes():
        # Rank classes by frequency.  The previous code took the *largest
        # label* as the minority class, which made majority == minority
        # whenever the largest label was also the most frequent one.
        class_counts = Counter(y_train).most_common()
        if not class_counts:
            raise ValueError("y_train is empty; cannot determine class frequencies")
        return class_counts[0][0], class_counts[-1][0]

    if method == 'AdaBoost':
        clf = CostSensitiveAlgorithms(algorithm='AdaBoost', n_estimators=base_learners)
        clf.fit(X_train, y_train)
    elif 'AdaCC' in method:
        clf = AdaCC(n_estimators=base_learners, algorithm=method)
        clf.fit(X_train, y_train)
    elif 'AdaMEC' in method:
        majority, minority = ranked_classes()
        ratios = [1., 2., 3., 4., 5., 6., 7, 8., 9., 10.]

        clf = AdaMEC_Cal(n_estimators=base_learners, algorithm=method)
        clf.fit(X_train, y_train)

        # The model is fitted once; only the costs change per candidate.
        best_score = -1
        best_cost = ratios[0]
        for cost in ratios:
            clf.set_costs(y_train, {minority: 1, majority: cost / 10.})
            score = f1_score(y_train, clf.predict(X_train))
            if best_score < score:
                best_score = score
                best_cost = cost
        clf.set_costs(y_train, {minority: 1, majority: best_cost / 10.})
    elif 'RareBoost' in method:
        clf = RareBoost(n_estimators=base_learners)
        clf.fit(X_train, y_train)
    else:
        majority, minority = ranked_classes()
        # NOTE: the int ``7`` is kept on purpose: ``str(ratio)`` is part of
        # the result file name.
        ratios = [1., 2., 3., 4., 5., 6., 7, 8., 9., 9.99]

        workers = []
        for ratio in ratios:
            worker = Process(target=train_competitors,
                             args=(X_train, y_train, base_learners, method,
                                   majority, minority, ratio))
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()

        best_score = -1
        predictor = None
        for ratio in ratios:
            result_path = 'boundary_temp_preds/' + method + str(ratio)
            # A missing file means that ratio produced no result; skip it.
            # These pickles are presumably written by ``train_competitors``
            # (confirm) and must never come from an untrusted source.
            try:
                with open(result_path, 'rb') as filehandle:
                    temp = pickle.load(filehandle)
            except FileNotFoundError:
                continue
            os.remove(result_path)
            # Item 0 is read as the score, item 1 as the fitted model.
            if temp[0] > best_score:
                best_score = temp[0]
                predictor = temp[1]

        # ``predictor`` may still be None here; it is dumped as is.
        with open('boundary_temp_preds/' + method, 'wb') as filehandle:
            joblib.dump(predictor, filehandle)
        return

    with open('boundary_temp_preds/' + method, 'wb') as filehandle:
        joblib.dump(clf, filehandle)