def load_final_dict(classifier, metric):
    """Load mean final-experiment scores for one classifier and metric.

    Reads `results.csv`, keeps rows marked 'Final', extracts the classifier
    name and the requested metric from the dict-repr columns, and averages
    scores per (Dataset, Classifier, Algorithm). Writes a rounded per-dataset
    summary CSV to `RESULTS_PATH / '<classifier>_<metric>.csv'` and returns
    an OrderedDict mapping each algorithm to its list of per-dataset means
    (unrounded), ordered like `datasets.names('final')`.
    """
    csv_path = RESULTS_PATH / 'results.csv'

    df = pd.read_csv(csv_path)
    df = df[df['Description'] == 'Final']

    # 'Scores' and 'Parameters' hold Python dict reprs with single quotes;
    # swap them for double quotes so json.loads can parse them (assumes the
    # values themselves contain no quotes).
    df['Scores'] = df['Scores'].str.replace('\'', '"')
    df['Parameters'] = df['Parameters'].str.replace('\'', '"')

    df['Classifier'] = df['Parameters'].apply(
        lambda x: json.loads(x)['classifier'])
    df[metric] = df['Scores'].apply(lambda x: json.loads(x)[metric])

    df = df.drop(['Parameters', 'Description', 'Scores'], axis=1)
    df = df[df['Classifier'] == classifier]
    df = df.groupby(['Dataset', 'Classifier',
                     'Algorithm'])[metric].agg('mean').reset_index()

    def mean_score(algorithm, dataset):
        # Single shared lookup; previously this filtering was duplicated in
        # both loops below and only the second one had the sanity check.
        scores = df[(df['Algorithm'] == algorithm)
                    & (df['Dataset'] == dataset)][metric]
        assert len(scores) == 1
        return list(scores)[0]

    # Rounded summary table, one row per dataset, one column per algorithm.
    rows = []
    for dataset in datasets.names('final'):
        row = [dataset]
        for algorithm in ALGORITHMS:
            row.append(np.round(mean_score(algorithm, dataset), 4))
        rows.append(row)

    ds = pd.DataFrame(rows, columns=['Dataset'] + ALGORITHMS)
    ds.to_csv(RESULTS_PATH / ('%s_%s.csv' % (classifier, metric)),
              index=False)

    # Unrounded measurements for downstream statistical analysis.
    measurements = OrderedDict()
    for algorithm in ALGORITHMS:
        measurements[algorithm] = []
        for dataset in datasets.names('final'):
            measurements[algorithm].append(mean_score(algorithm, dataset))

    return measurements
def prepare_df():
    """Build a long-format frame relating RBU's per-dataset rank to the
    percentage of each minority-sample type.

    Loads (or extracts) dataset info, then for every classifier/metric pair
    reads the per-dataset summary CSV, ranks the algorithms row-wise, and
    emits one record per (dataset, minority type).
    """
    try:
        info = pd.read_csv(RESULTS_PATH / 'dataset_info.csv')
    except FileNotFoundError:
        print('Extracting dataset info...')
        info = extract(verbose=False)

    # Keep only final-partition datasets; expose the name under 'Dataset'
    # so it can be merged with the per-classifier result tables.
    info = info[info['name'].isin(
        datasets.names('final'))].reset_index(drop=True)
    info['Dataset'] = info['name']

    records = []
    for clf in CLASSIFIERS:
        for metric in METRICS:
            res = pd.read_csv(RESULTS_PATH / ('%s_%s.csv' % (clf, metric)))
            # Rank algorithms per dataset (row-wise, best score = rank 1);
            # keep RBU's rank only.
            res['rank'] = list(res.rank(axis=1, ascending=False)['RBU'])
            res = pd.merge(res, info, on='Dataset')
            assert len(res) == 30
            for ptype in PTYPES:
                records.extend(
                    [row['name'], ptype, clf, metric,
                     row['percentage_%s' % ptype], row['rank']]
                    for _, row in res.iterrows()
                )
    return pd.DataFrame(
        records,
        columns=['name', 'type', 'clf', 'metric', 'percentage [%]', 'rank'])
def prepare_df():
    """Assemble (Dataset, Classifier, Metric, Rank, DI) records.

    For each classifier/metric pair, loads the final score table, ranks the
    resamplers row-wise, keeps PA's rank, and joins it with the dataset
    difficulty index (DI) from the dataset-info table.
    """
    try:
        info = pd.read_csv(RESULTS_PATH / 'dataset_info.csv')
    except FileNotFoundError:
        print('Extracting dataset info...')
        info = extract(verbose=False)

    # Restrict to the datasets in use and rename 'Name' -> 'Dataset' so the
    # frame can be merged with the score tables.
    info = info[info['Name'].isin(datasets.names())].reset_index(drop=True)
    info['Dataset'] = info['Name']
    info = info.drop('Name', axis=1)

    records = []
    for clf in CLASSIFIERS:
        for metric in METRICS:
            scores = load_final_dict(clf, metric)
            # Rank resamplers per dataset (best score = rank 1); keep PA's.
            scores['Rank'] = list(scores.rank(axis=1, ascending=False)['PA'])
            merged = pd.merge(scores, info, on='Dataset')
            assert len(merged) == 60
            records.extend(
                [row['Dataset'], clf, metric, row['Rank'], row['DI']]
                for _, row in merged.iterrows()
            )
    return pd.DataFrame(
        records, columns=['Dataset', 'Classifier', 'Metric', 'Rank', 'DI'])
def load_final_dict(classifier, metric):
    """Return a Dataset x Resampler table of mean scores.

    Filters `results_final.csv` to one classifier/metric, averages the score
    over folds per (Dataset, Resampler), and returns a DataFrame with one
    row per dataset and one column per resampler (rounded to 4 decimals).
    """
    df = pd.read_csv(RESULTS_PATH / 'results_final.csv')
    df = df[(df['Classifier'] == classifier) & (df['Metric'] == metric)]
    df = df.groupby(['Dataset', 'Classifier', 'Resampler',
                     'Metric'])['Score'].agg('mean').reset_index()

    def mean_for(dataset, resampler):
        match = df[(df['Resampler'] == resampler)
                   & (df['Dataset'] == dataset)]
        # Exactly one averaged score must exist per pair.
        assert len(match) == 1
        return np.round(list(match['Score'])[0], 4)

    table = [
        [dataset] + [mean_for(dataset, resampler)
                     for resampler in RESAMPLERS]
        for dataset in datasets.names()
    ]
    return pd.DataFrame(table, columns=['Dataset'] + RESAMPLERS)
def main(argv):
    """Write tfrecord files for every dataset, optionally in parallel."""
    output_dir = os.path.join("datasets", "tfrecords")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Every adaptation problem we can generate.
    problem_names = datasets.names()

    if not FLAGS.parallel:
        for problem in problem_names:
            save_dataset(problem, output_dir)
        return

    # TensorFlow would error out with every process trying to grab ~90% of
    # the GPU memory at once, so force the parallel jobs onto the CPU.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    cores = None if FLAGS.jobs == 0 else FLAGS.jobs
    run_job_pool(save_dataset,
                 [(problem, output_dir) for problem in problem_names],
                 cores=cores)
def load_final_dict(classifier, metric):
    """Return per-algorithm lists of mean scores across all datasets.

    Filters `results_pa_rb_comparison.csv` to one classifier/metric and, for
    each algorithm, averages the 10 fold scores of every dataset. The
    returned OrderedDict preserves ALGORITHMS order; each value list follows
    `datasets.names()` order.
    """
    df = pd.read_csv(RESULTS_PATH / 'results_pa_rb_comparison.csv')
    df = df[(df['Classifier'] == classifier) & (df['Metric'] == metric)]

    def dataset_mean(algorithm, dataset):
        scores = df[(df['Resampler'] == algorithm)
                    & (df['Dataset'] == dataset)]['Score']
        # One score per fold is expected.
        assert len(scores) == 10
        return np.mean(scores)

    measurements = OrderedDict()
    for algorithm in ALGORITHMS:
        measurements[algorithm] = [
            dataset_mean(algorithm, dataset)
            for dataset in datasets.names()
        ]
    return measurements
pd.DataFrame(np.c_[X, y]).to_csv(csv_path, index=False, header=header) else: raise NotImplementedError if __name__ == '__main__': logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument('-dataset', type=str, choices=datasets.names(), required=True) parser.add_argument('-partition', type=int, choices=[1, 2, 3, 4, 5], required=True) parser.add_argument('-fold', type=int, choices=[1, 2], required=True) parser.add_argument('-mode', type=str, choices=['OVA', 'OVO'], default='OVA') parser.add_argument('-output_path', type=str, default=DEFAULT_ROOT_OUTPUT_PATH) parser.add_argument('-energy', type=float, default=0.25) parser.add_argument('-cleaning_strategy',
import sys
import os

# Make the parent directory importable (project-local modules live there).
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

import databases
import datasets

# Enqueue one preliminary RBO trial for every combination of dataset, fold,
# classifier and RBO hyper-parameters (n_steps, gamma); the remaining RBO
# parameters are fixed.
for ds_name in datasets.names('preliminary'):
    for fold_idx in range(1, 11):
        for clf_name in ['KNN', 'CART', 'SVM', 'NB']:
            for steps in [1000, 2000, 4000, 8000, 16000]:
                for gamma_value in [0.001, 0.01, 0.1, 1.0, 10.0]:
                    databases.add_to_pending({
                        'Algorithm': 'RBO',
                        'Parameters': {
                            'gamma': gamma_value,
                            'n_steps': steps,
                            'step_size': 0.001,
                            'stop_probability': 0.0,
                            'criterion': 'balance',
                            'classifier': clf_name
                        },
                        'Dataset': ds_name,
                        'Fold': fold_idx,
                        'Description': 'Preliminary'
                    })
def extract(verbose=True):
    """Compute dataset statistics: imbalance ratio and minority-type split.

    For every dataset in the 'preliminary' and 'final' partitions, counts
    the minority samples that are safe / borderline / rare / outlier based
    on how many of their 5 nearest neighbors belong to the majority class.
    Saves the combined table to `results/dataset_info.csv` and, when
    `verbose`, prints it as LaTeX table rows.

    Returns the combined DataFrame.
    """
    dfs = []
    for partition in ['preliminary', 'final']:
        rows = []
        for name in tqdm(datasets.names(partition)):
            dataset = datasets.load(name)
            (X_train, y_train), (X_test, y_test) = dataset[0][0], dataset[0][1]
            X = np.concatenate([X_train, X_test])
            y = np.concatenate([y_train, y_test])
            n_samples = X.shape[0]
            n_features = X.shape[1]
            majority_class = Counter(y).most_common()[0][0]
            n_majority_samples = Counter(y).most_common()[0][1]
            n_minority_samples = Counter(y).most_common()[1][1]
            imbalance_ratio = np.round(n_majority_samples / n_minority_samples, 2)
            # 6 neighbors = the query point itself plus its 5 real neighbors.
            # Keyword argument required: sklearn >= 1.2 rejects positional
            # constructor arguments for NearestNeighbors.
            knn = NearestNeighbors(n_neighbors=6).fit(X)
            n_safe = 0
            n_borderline = 0
            n_rare = 0
            n_outliers = 0
            for X_i, y_i in zip(X, y):
                if y_i == majority_class:
                    continue
                else:
                    # Drop the first neighbor: it is the sample itself.
                    indices = knn.kneighbors([X_i], return_distance=False)[0, 1:]
                    n_majority_neighbors = sum(y[indices] == majority_class)
                    # Categorize by number of majority-class neighbors
                    # (0-1 safe, 2-3 borderline, 4 rare, 5 outlier).
                    if n_majority_neighbors in [0, 1]:
                        n_safe += 1
                    elif n_majority_neighbors in [2, 3]:
                        n_borderline += 1
                    elif n_majority_neighbors == 4:
                        n_rare += 1
                    elif n_majority_neighbors == 5:
                        n_outliers += 1
                    else:
                        raise ValueError
            n_total = n_safe + n_borderline + n_rare + n_outliers
            percentage_safe = np.round(n_safe / n_total * 100, 2)
            percentage_borderline = np.round(n_borderline / n_total * 100, 2)
            percentage_rare = np.round(n_rare / n_total * 100, 2)
            percentage_outlier = np.round(n_outliers / n_total * 100, 2)
            rows.append([
                name, imbalance_ratio, n_samples, n_features, percentage_safe,
                percentage_borderline, percentage_rare, percentage_outlier
            ])
        df = pd.DataFrame(rows,
                          columns=[
                              'name', 'imbalance_ratio', 'n_samples',
                              'n_features', 'percentage_safe',
                              'percentage_borderline', 'percentage_rare',
                              'percentage_outlier'
                          ])
        df = df.sort_values('imbalance_ratio')
        dfs.append(df)
    df = pd.concat(dfs).reset_index(drop=True)
    df.to_csv(Path(__file__).parent / 'results' / 'dataset_info.csv',
              index=False)
    if verbose:
        for i, row in df.iterrows():
            # Escape underscores for LaTeX; '\\_' replaces the previous
            # invalid escape sequence '\_' (same runtime string, no
            # SyntaxWarning).
            row = [str(v).replace('_', '\\_') for v in row]
            print(' & '.join(row) + ' \\\\')
            # Separate the two partitions (20 preliminary datasets first).
            if i == 19:
                print('\\midrule')
    return df
def evaluate_trial(resampler_name, fold):
    """Evaluate one resampler on every dataset for a single CV fold.

    Resamples the training split with the requested oversampler, trains four
    classifiers (CART, KNN, SVM, MLP) and records Precision/Recall/AUC/G-mean
    on the test split. Each trial is written to
    `results_final/<dataset>_<fold>_<resampler>.csv`; trials whose CSV
    already exists are skipped.
    """
    RESULTS_PATH = Path(__file__).parents[0] / 'results_final'
    RANDOM_STATE = 42

    resamplers = {
        'SMOTE': sv.SMOTE(random_state=RANDOM_STATE),
        'polynom-fit-SMOTE': sv.polynom_fit_SMOTE(random_state=RANDOM_STATE),
        'Lee': sv.Lee(random_state=RANDOM_STATE),
        'SMOBD': sv.SMOBD(random_state=RANDOM_STATE),
        'G-SMOTE': sv.G_SMOTE(random_state=RANDOM_STATE),
        'LVQ-SMOTE': sv.LVQ_SMOTE(random_state=RANDOM_STATE),
        'Assembled-SMOTE': sv.Assembled_SMOTE(random_state=RANDOM_STATE),
        'SMOTE-TomekLinks': sv.SMOTE_TomekLinks(random_state=RANDOM_STATE),
        'RBO': RBO(random_state=RANDOM_STATE),
        'PA': PA(random_state=RANDOM_STATE)
    }

    # Hoisted out of the loops: the scoring functions depend on neither the
    # dataset nor the classifier (previously rebuilt per classifier), and
    # the output directory only needs to be created once (previously per
    # trial).
    scoring_functions = {
        'Precision': metrics.precision,
        'Recall': metrics.recall,
        'AUC': metrics.auc,
        'G-mean': metrics.g_mean
    }
    RESULTS_PATH.mkdir(exist_ok=True, parents=True)

    for dataset_name in datasets.names():
        # Fresh classifier instances per dataset.
        classifiers = {
            'CART': DecisionTreeClassifier(random_state=RANDOM_STATE),
            'KNN': KNeighborsClassifier(n_neighbors=3),
            'SVM': SVC(kernel='rbf', random_state=RANDOM_STATE),
            'MLP': MLPClassifier(random_state=RANDOM_STATE)
        }

        trial_name = f'{dataset_name}_{fold}_{resampler_name}'
        trial_path = RESULTS_PATH / f'{trial_name}.csv'
        if trial_path.exists():
            continue

        logging.info(f'Evaluating {trial_name}...')

        dataset = datasets.load(dataset_name)
        (X_train, y_train), (X_test, y_test) = dataset[fold][0], dataset[fold][1]

        resampler = resamplers[resampler_name]

        # Only binary problems are supported.
        assert len(np.unique(y_train)) == len(np.unique(y_test)) == 2

        X_train, y_train = resampler.sample(X_train, y_train)

        rows = []
        for classifier_name, classifier in classifiers.items():
            clf = classifier.fit(X_train, y_train)
            predictions = clf.predict(X_test)
            for scoring_function_name, scoring_function in scoring_functions.items():
                score = scoring_function(y_test, predictions)
                rows.append([
                    dataset_name, fold, classifier_name, resampler_name,
                    scoring_function_name, score
                ])

        columns = [
            'Dataset', 'Fold', 'Classifier', 'Resampler', 'Metric', 'Score'
        ]
        pd.DataFrame(rows, columns=columns).to_csv(trial_path, index=False)
def extract(k=5, verbose=True):
    """Compute per-dataset difficulty index (DI) and basic statistics.

    DI is the mean fraction of majority-class samples among the k nearest
    neighbors of each minority sample. Saves the table (sorted by DI) to
    `results/dataset_info.csv` and, when `verbose`, prints it as a
    two-column LaTeX table.

    Returns the DataFrame (with DI/IR formatted as strings when verbose).
    """
    rows = []
    columns = ['Name', 'DI', 'IR', 'Samples', 'Features']
    for name in tqdm(datasets.names()):
        dataset = datasets.load(name)
        (X_train, y_train), (X_test, y_test) = dataset[0][0], dataset[0][1]
        X = np.concatenate([X_train, X_test])
        y = np.concatenate([y_train, y_test])
        n_samples = X.shape[0]
        n_features = X.shape[1]
        majority_class = Counter(y).most_common()[0][0]
        n_majority_samples = Counter(y).most_common()[0][1]
        n_minority_samples = Counter(y).most_common()[1][1]
        imbalance_ratio = np.round(n_majority_samples / n_minority_samples, 2)
        # k + 1 neighbors: the query point itself is returned first.
        # Keyword argument required: sklearn >= 1.2 rejects positional
        # constructor arguments for NearestNeighbors.
        knn = NearestNeighbors(n_neighbors=k + 1).fit(X)
        difficulty_coefficients = []
        for X_i, y_i in zip(X, y):
            if y_i == majority_class:
                continue
            else:
                # Drop the first neighbor: it is the sample itself.
                indices = knn.kneighbors([X_i], return_distance=False)[0, 1:]
                n_majority_neighbors = sum(y[indices] == majority_class)
                difficulty_coefficients.append(n_majority_neighbors / k)
        difficulty_index = np.round(np.mean(difficulty_coefficients), 3)
        rows.append(
            [name, difficulty_index, imbalance_ratio, n_samples, n_features])
    df = pd.DataFrame(rows, columns=columns)
    df = df.sort_values('DI')
    df.to_csv(Path(__file__).parent / 'results' / 'dataset_info.csv',
              index=False)
    if verbose:
        for column in ['DI', 'IR']:
            df[column] = df[column].map(lambda x: f'{x:.2f}')
        # Print two datasets per LaTeX row (row i next to row i + 30).
        for i in range(30):
            row = [str(df.iloc[i][c]) for c in columns]
            if i + 30 < len(df):
                row += [str(df.iloc[i + 30][c]) for c in columns]
            else:
                row += ['' for _ in columns]
            print(' & '.join(row).replace('_', '\\_') + ' \\\\')
    return df
import sys import os sys.path.append(os.path.join(os.path.dirname(__file__), '..')) import databases import datasets REFERENCE_ALGORITHMS = [ 'RUS', 'AKNN', 'CC', 'CNN', 'ENN', 'IHT', 'NCL', 'NM', 'OSS', 'RENN', 'TL', 'ROS', 'SMOTE', 'Bord', 'RBO', 'SMOTE+TL', 'SMOTE+ENN' ] for dataset in datasets.names('final'): for fold in range(1, 11): for classifier in ['KNN', 'CART', 'SVM', 'NB']: trial = { 'Algorithm': 'RBU', 'Parameters': { 'gamma': [0.01, 0.1, 1.0, 10.0], 'ratio': [0.5, 0.75, 1.0], 'classifier': classifier }, 'Dataset': dataset, 'Fold': fold, 'Description': 'Final' } databases.add_to_pending(trial) for algorithm in REFERENCE_ALGORITHMS:
Note: sets CUDA_VISIBLE_DEVICES= so that it doesn't use the GPU. """ import os import numpy as np import tensorflow as tf import matplotlib.pyplot as plt from absl import app from absl import flags from PIL import Image import datasets FLAGS = flags.FLAGS flags.DEFINE_enum("source", None, datasets.names(), "What dataset to use as the source") flags.DEFINE_enum("target", "", [""] + datasets.names(), "What dataset to use as the target") flags.DEFINE_boolean("test", False, "Show test images instead of training images") flags.mark_flag_as_required("source") def display(name, images, labels, max_number=16, office=False, save_images=False):
def evaluate_trial(ratio, fold):
    """Evaluate the PA resampler at one ratio on every dataset for one fold.

    Resamples the training split with PA at the given ratio, trains four
    classifiers (CART, KNN, SVM, MLP) and records Precision/Recall/AUC/G-mean
    on the test split. Results go to
    `results_ratio/<dataset>_<fold>_<ratio>.csv`; existing trials are
    skipped, and datasets where resampling raises RuntimeError are skipped
    (deliberate best-effort).
    """
    RESULTS_PATH = Path(__file__).parents[0] / 'results_ratio'
    RANDOM_STATE = 42

    # Hoisted out of the loops: the scoring functions depend on neither the
    # dataset nor the classifier (previously rebuilt per classifier), and
    # the output directory only needs to be created once (previously per
    # trial).
    scoring_functions = {
        'Precision': metrics.precision,
        'Recall': metrics.recall,
        'AUC': metrics.auc,
        'G-mean': metrics.g_mean
    }
    RESULTS_PATH.mkdir(exist_ok=True, parents=True)

    for dataset_name in datasets.names():
        # Fresh classifier instances per dataset.
        classifiers = {
            'CART': DecisionTreeClassifier(random_state=RANDOM_STATE),
            'KNN': KNeighborsClassifier(n_neighbors=3),
            'SVM': SVC(kernel='rbf', random_state=RANDOM_STATE),
            'MLP': MLPClassifier(random_state=RANDOM_STATE)
        }

        trial_name = f'{dataset_name}_{fold}_{ratio}'
        trial_path = RESULTS_PATH / f'{trial_name}.csv'
        if trial_path.exists():
            continue

        logging.info(f'Evaluating {trial_name}...')

        dataset = datasets.load(dataset_name)
        (X_train, y_train), (X_test, y_test) = dataset[fold][0], dataset[fold][1]

        resampler = PA(ratio=ratio, random_state=RANDOM_STATE)

        # Only binary problems are supported.
        assert len(np.unique(y_train)) == len(np.unique(y_test)) == 2

        try:
            X_train, y_train = resampler.sample(X_train, y_train)
        except RuntimeError:
            # PA can fail for some dataset/ratio combinations; skip them.
            continue

        rows = []
        for classifier_name, classifier in classifiers.items():
            clf = classifier.fit(X_train, y_train)
            predictions = clf.predict(X_test)
            for scoring_function_name, scoring_function in scoring_functions.items():
                score = scoring_function(y_test, predictions)
                rows.append([
                    dataset_name, fold, classifier_name, ratio,
                    scoring_function_name, score
                ])

        columns = ['Dataset', 'Fold', 'Classifier', 'Ratio', 'Metric', 'Score']
        pd.DataFrame(rows, columns=columns).to_csv(trial_path, index=False)