def prepare_data(dataset_name, n_minority_samples=20, scaler='MinMax'):
    dataset = datasets.load(dataset_name)
    (X_train, y_train), (X_test, y_test) = dataset[0][0], dataset[0][1]
    X, y = np.concatenate([X_train, X_test]), np.concatenate([y_train, y_test])

    if n_minority_samples is not None:
        minority_class = Counter(y).most_common()[1][0]
        majority_class = Counter(y).most_common()[0][0]

        n_minority = Counter(y).most_common()[1][1]
        n_majority = Counter(y).most_common()[0][1]

        X, y = RandomUnderSampler(
            sampling_strategy={
                minority_class: np.min([n_minority, n_minority_samples]),
                majority_class: n_majority
            },
            random_state=42,
        ).fit_sample(X, y)

    X = TSNE(n_components=2, random_state=42).fit_transform(X)

    if scaler == 'MinMax':
        X = MinMaxScaler().fit_transform(X)
    elif scaler == 'Standard':
        X = StandardScaler().fit_transform(X)
    else:
        raise NotImplementedError

    return X, y
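# A minimal usage sketch of prepare_data above (the dataset name 'glass' is only
# illustrative, and numpy, Counter, RandomUnderSampler, TSNE and the sklearn scalers
# are assumed to be imported as in the surrounding module):
X_2d, y_sub = prepare_data('glass', n_minority_samples=10, scaler='Standard')
# at most 10 minority samples are kept, the features are embedded in 2-D with t-SNE,
# and the embedding is standardized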
Example #2
def dump_ds(kind):
    ps = qu.Params(**(ds_small if kind == 'small' else ds_large))
    ss = [s for s in qd.dump(ps, f'/tmp/q/data/{kind}')]
    ds = qd.load(ps, shards=ss).map(qd.adapter)
    for i, _ in enumerate(ds):
        pass
    print(f'dumped {i + 1} batches of {ps.dim_batch} samples each')
Example #3
def run(model_settings, dataset_settings, _log):
    _log.info('dataset_settings: ' + str(dataset_settings))
    _log.info('model_settings: ' + str(model_settings))
    dataset = datasets.load(dataset_settings)
    model_settings.update({'dataset': dataset})
    model = models.load(model_settings)

    train(model, dataset)
    evaluate(model, dataset)
Example #4
def run(model_settings, dataset_settings, num_experiments, _log):
    _log.info('dataset_settings: ' + str(dataset_settings))
    _log.info('model_settings: ' + str(model_settings))

    ex.info['evaluations'] = []
    for i in range(num_experiments):
        dataset = datasets.load(dataset_settings)
        model_settings.update({'dataset': dataset})
        model = models.load(model_settings)
        train(model, dataset)
        ex.info['evaluations'].append(evaluate(model, dataset))
def run(model_settings, dataset_settings, num_experiments, _log):
    _log.info('dataset_settings: ' + str(dataset_settings))
    _log.info('model_settings: ' + str(model_settings))
    ex.info['evaluations'] = []
    for i in range(1, num_experiments+1):
        print('#'*10, 'Run', i, '#'*10)
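        # the training-set fraction grows each run (1/N, 2/N, ..., 1.0), tracing a
        # learning curve over num_experiments runs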
        dataset_settings['train_size'] = i/num_experiments
        dataset = datasets.load(dataset_settings)
        model_settings.update({'dataset': dataset})
        model = models.load(model_settings)
        train(model, dataset)
        ex.info['evaluations'].append(evaluate(model, dataset))
    ex.info['sota'] = dataset.sota
def run(model_settings, dataset_settings, _log):
    _log.info('dataset_settings: ' + str(dataset_settings))
    dataset = datasets.load(dataset_settings)

    model_settings.update({
        'input_shape': dataset.input_shape,
        'num_classes': dataset.num_classes,
    })
    _log.info('model_settings: ' + str(model_settings))
    model = models.load(model_settings)

    train(model, dataset)
    evaluate(model, dataset)
Example #7
def load_experiment(path, alignment='luminance'):
    """
    Load an FHD experiment located at given path.

    Parameters
    ----------
    path : str
        The path of the experiment to load.
    alignment : str, default = 'luminance'
        Default alignment of layers for the FHDs.

    Returns
    -------
    experiment : Bunch
        The loaded FHD experiment.

    Notes
    -----
    FHistograms are scaled between [0, 1] globally (no loss of information) and
    independently for shapes and spatial relations.

    """
    path = os.path.normpath(path)
    dataset = datasets.load(path.split('/')[-2])
    n_layers = int(path.split('/')[-1].split('-')[0])

    if alignment not in ALIGNMENTS:
        raise ValueError("Incorrect alignment.")

    fhd_files = sorted(glob.glob(os.path.join(path, '*/fhd.txt')))
    fhds = np.array([
        from_file(fhd_file, n_layers, alignment=alignment)
        for fhd_file in fhd_files
    ])

    # Feature scaling (shapes and spatial relations independently)
    shapes = np.vstack([fhd[np.diag_indices(n_layers)] for fhd in fhds])
    spatials = np.vstack([fhd[np.triu_indices(n_layers, 1)] for fhd in fhds])
    for fhd in fhds:
        fhd[np.diag_indices(n_layers)] -= shapes.min()
        fhd[np.diag_indices(n_layers)] /= (shapes.max() - shapes.min())
        fhd[np.triu_indices(n_layers, 1)] -= spatials.min()
        fhd[np.triu_indices(n_layers, 1)] /= (spatials.max() - spatials.min())

    experiment = dataset
    experiment['path'] = path
    experiment['n_layers'] = n_layers
    experiment['fhds'] = fhds
    return experiment
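# A small sketch of the scaling step above, assuming 3 layers: shape descriptors sit
# on the diagonal of each FHD matrix and spatial-relation descriptors in its strict
# upper triangle, and each group is min-max scaled with statistics pooled over every
# FHD in the experiment (values below are only illustrative):
import numpy as np

fhd_demo = np.arange(9.0).reshape(3, 3)             # one illustrative 3x3 FHD matrix
shape_entries = fhd_demo[np.diag_indices(3)]        # diagonal -> [0., 4., 8.]
spatial_entries = fhd_demo[np.triu_indices(3, 1)]   # upper triangle -> [1., 2., 5.]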
Example #8
def test_create_package():
    acc = datasets.run('iris.csv')
    candidate = acc.candidates[-1]

    example = autom8.create_example_input(
        pipeline=candidate.pipeline,
        dataset=datasets.load('iris.csv'),
        indices=acc.test_indices[1:3],
    )

    package_bytes = autom8.create_package(
        package_name='autom8-test',
        pipeline=candidate.pipeline,
        example_input=example,
        extra_notes='foo bar baz',
    )

    with zipfile.ZipFile(io.BytesIO(package_bytes)) as z:
        assert sorted(z.namelist()) == sorted([
            'autom8-test/.dockerignore',
            'autom8-test/Dockerfile',
            'autom8-test/LICENSE',
            'autom8-test/Makefile',
            'autom8-test/README.md',
            'autom8-test/pipeline.pickle',
            'autom8-test/requirements.txt',
            'autom8-test/service.py',
            'autom8-test/tests.py',
        ])

        def read(name):
            with z.open(f'autom8-test/{name}') as f:
                return f.read().decode('utf-8')

        assert 'requirements.txt' in read('Dockerfile')
        assert 'MIT License' in read('LICENSE')

        with z.open('autom8-test/pipeline.pickle') as f:
            pipeline = pickle.load(f)

        readme = read('README.md')
        assert 'foo bar baz' in readme

        sample_input = _extract_json(readme, '--data \'')
        expected_output = _extract_json(readme, '\nThis will return:\n')
        received_output = pipeline.run(sample_input['rows'])
        assert expected_output['predictions'] == received_output.predictions
Example #9
def save_dataset(dataset_name, output_dir, seed=0):
    """ Save single dataset """
    train_filename = os.path.join(output_dir,
        tfrecord_filename(dataset_name, "train"))
    valid_filename = os.path.join(output_dir,
        tfrecord_filename(dataset_name, "valid"))
    test_filename = os.path.join(output_dir,
        tfrecord_filename(dataset_name, "test"))

    # Skip if they already exist
    if os.path.exists(train_filename) \
            and os.path.exists(valid_filename) \
            and os.path.exists(test_filename):
        if FLAGS.debug:
            print("Skipping:", train_filename, valid_filename, test_filename,
               "already exist")
        return

    if FLAGS.debug:
        print("Saving dataset", dataset_name)
    dataset, dataset_class = datasets.load(dataset_name)

    # Skip if already normalized/bounded, e.g. UCI HAR datasets
    already_normalized = dataset_class.already_normalized

    # Split into training/valid datasets
    valid_data, valid_labels, train_data, train_labels = \
        valid_split(dataset.train_data, dataset.train_labels, seed=seed)

    # Calculate normalization only on the training data
    if FLAGS.normalize != "none" and not already_normalized:
        normalization = datasets.calc_normalization(train_data, FLAGS.normalize)

        # Apply the normalization to the training, validation, and testing data
        train_data = datasets.apply_normalization(train_data, normalization)
        valid_data = datasets.apply_normalization(valid_data, normalization)
        test_data = datasets.apply_normalization(dataset.test_data, normalization)
    else:
        test_data = dataset.test_data

    # Saving
    write(train_filename, train_data, train_labels)
    write(valid_filename, valid_data, valid_labels)
    write(test_filename, test_data, dataset.test_labels)
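# The block above fits normalization statistics on the training split only and then
# applies them to the training, validation and test splits, avoiding leakage from the
# held-out data. A minimal sketch of the same pattern using scikit-learn on toy arrays
# (StandardScaler merely stands in for datasets.calc_normalization /
# datasets.apply_normalization, whose exact behaviour is not shown here):
import numpy as np
from sklearn.preprocessing import StandardScaler

toy_train = np.array([[0.0], [2.0], [4.0]])
toy_valid = np.array([[1.0], [3.0]])
toy_test = np.array([[5.0]])
toy_scaler = StandardScaler().fit(toy_train)        # statistics from training data only
toy_train, toy_valid, toy_test = (toy_scaler.transform(a)
                                  for a in (toy_train, toy_valid, toy_test))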
def run_test():
    for current_dataset in Datasets:
        for current_fold in range (1,11):

            df_train, df_test, ds_infos = ds.load(source_path, current_dataset, current_fold)
            X_train, y_train = df_train.iloc[:, :-1].values, df_train.iloc[:, -1].values
            X_test, y_test = df_test.iloc[:, :-1].values, df_test.iloc[:, -1].values

            for v in range(0, 3, 2):

                fileName = current_dataset + '.BA' + str(current_fold) + '.O5.T10.V' + str(v)
                results_dir = os.path.join(output_path, 'trees10', current_dataset,'Results_' + current_dataset)
                curr_dir = os.path.join(output_path, 'trees10', current_dataset, fileName)
                source_dir = Path(curr_dir)
                files = source_dir.glob('*.tree')
                resultFile = os.path.join(results_dir,fileName + '.results.txt')
                f = open(resultFile, 'w')

                for file in files:
                    str_file = str(file)
                    pos = str_file.find('.D')
                    pos2 = str_file.find('.tree')
                    depth = str_file[pos+2:pos2]
                    born_againO5 = tree_io.classifier_from_file(str_file, X_train, y_train, pruning=False)

                    # BornAgainNew
                    banew_test_pred = born_againO5.predict(X_test)
                    banew_train_pred = born_againO5.predict(X_train)
                    report_banew = classification_report(y_test, banew_test_pred, output_dict=True)

                    test_acc = report_banew['accuracy']
                    test_F1 = report_banew['weighted avg']['f1-score']

                    f.write(depth)
                    f.write(" " + str(test_acc))
                    f.write(" " + str(test_F1))
                    f.write("\n")

                f.close()
                print(fileName)
                tree_view.plotStatistics(resultFile)
Example #11
def main(argv):
    # Don't bother using the GPU for this
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

    # Input data
    if FLAGS.target != "":
        source_dataset, target_dataset = datasets.load_da(
            FLAGS.source, FLAGS.target)
    else:
        source_dataset = datasets.load(FLAGS.source)
        target_dataset = None

    if not FLAGS.test:
        source_data = source_dataset.train_images
        source_labels = source_dataset.train_labels
        target_data = target_dataset.train_images \
            if target_dataset is not None else None
        target_labels = target_dataset.train_labels \
            if target_dataset is not None else None
    else:
        source_data = source_dataset.test_images
        source_labels = source_dataset.test_labels
        target_data = target_dataset.test_images \
            if target_dataset is not None else None
        target_labels = target_dataset.test_labels \
            if target_dataset is not None else None

    display("Source",
            source_data,
            source_labels,
            office="office_" in FLAGS.source)

    if target_dataset is not None:
        display("Target",
                target_data,
                target_labels,
                office="office_" in FLAGS.target)

    plt.show()
Example #12
from flask import Flask
from datasets import load, load_buildings, load_invest
import os

app = Flask(__name__)

app.url_map.strict_slashes = False
_dir = os.path.dirname(os.path.abspath(__file__))
app.template_folder = os.path.join(_dir, "templates")
app.static_folder = os.path.join(_dir, "static")
app.config['UPLOAD_FOLDER'] = os.path.join(_dir, "upload")

power = load()
invest = load_invest()
apart = load_buildings()
Example #13
def monks(task_type, param_grid, model_assessment=False):
    # this file contains the whole dataset; we rely on it instead of the provided
    # splitting so that we can simulate our own hold-out split
    dataset = ds.load('datasets/'+ task_type + '.test', 'monks')
    dataset.shuffle() # because data are taken randomly in monks-*.train
    # simple hold-out strategy 
    # ~123 elements for training set as in the original splitting (monks-1, monks-3)
    splitting = 43/100
    if task_type == 'monks-2':
        # monks-2 uses ~169 elements in the training set
        splitting = 59/100

    trainvalset, testset = dataset.split(splitting)
    # validation set is half of training set
    trainset, validationset = trainvalset.split(66.6/100)
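    # rough split sizes, assuming the monks-*.test files contain the full 432-pattern
    # truth table: 432 * 0.43 ≈ 186 patterns go to training+validation, of which
    # 66.6% ≈ 124 form the training set (the ~123 mentioned above); for monks-2,
    # 432 * 0.59 ≈ 255 and 255 * 0.666 ≈ 170 (the ~169 mentioned above)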
    

    for params in ms.grid_search(param_grid):
        
        # a non-positive batch size means use the entire training set as a single batch
        params['batch_size'] = params['batch_size'] if params['batch_size'] > 0 else trainset.size()
        print(params)
        
        epochs = params['epochs'] # value taken from the monks problem paper
        batch_size = params['batch_size']

        # run the training several times to be independent of the random weight
        # initialization and to get a bias-variance estimate (ensemble averaging)
        # when doing inference on the test set
        runs_number = 3 # 5 can be used as well
        for r in range(runs_number):
            # initialize several instances of the model so that the metrics can be
            # averaged over them
            nn.from_parameters(params, 'sigmoid', 'sigmoid')
            model = nn.build()

            ms.add_model(model) 
        
        ms.set_datasets(trainset,validationset)

        start_time = time.time()
        for e in range(epochs):
            printProgressBar(e + 1, epochs, prefix = 'Training:', suffix = 'Complete')

            # for each model we initialized above
            for model_id, model in ms.models():
                # doing one step of training
                model.fit(trainset,batch_size,e)
                    
                # computing the output values for this training step
                train_outputs = model.forward_dataset(trainset)
                val_outputs = model.forward_dataset(validationset)

                # compute the metrics
                ms.compute_error(model_id, train_outputs, val_outputs)
                ms.compute_other(model_id, train_outputs, val_outputs, metrics=['acc'],threshold=0.5)

        training_time = time.time() - start_time
        print("TRAINING TIME " + str(training_time) + " seconds") 

        # getting the average of errors and accuracy         
        avg_tr_error, avg_val_error = ms.avg_mse()
        avg_tr_acc, avg_val_acc = ms.avg_acc()
        # precision and recall will be used during model assessment (see below)
        final_accuracy = avg_val_acc[-1]

        res.set_task(task_type)

        plt = res.plot_mse(epochs, avg_tr_error, avg_val_error, params, final_accuracy)
        msepath = res.save_plot(plt,'mse')
        
        plt = res.plot_acc(epochs,avg_tr_acc,avg_val_acc,params)
        res.save_plot(plt,'acc')
        
        # adding the result
        res.add_result(avg_tr_error[-1], avg_val_error[-1], params['batch_size'], params['weights_bound'], params['learning_rate'] , params['momentum_alpha'], final_accuracy, msepath)
        
        if not model_assessment:
            # cleaning model selection for next run
            ms.clean()

    res.add_result_header('mse_tr' , 'mse_val','batch_s','weights', 'lr','m_alpha', 'acc', 'path')     
    res.save_results()
    
    # WARNING: this block must be executed only once, and only after model selection,
    # otherwise the test set would be invalidated
    if model_assessment:
        # here we want to use the testset to assess the model performances
        trained_models = [m  for _, m in  ms.models()]
        voted_outputs = []
        avg_outputs = []
        for batch in testset.batch(1):
            for pattern in batch:
                tmp_voted_outputs = []
                tmp_real_outputs = []
                for m in trained_models:
                    class_out , real_out = m.classify(pattern[1],threshold=0.5)
                    tmp_voted_outputs.append( class_out )
                    tmp_real_outputs.append(real_out)
                
                # take the most frequent prediction (majority vote)
                voted_outputs.append(mode(tmp_voted_outputs))
                # we get the average output to compute the error
                avg_outputs.append([mean(tmp_real_outputs)])

        metrics = ms.get_metrics()
        target_outputs = [ x[0] for x in testset.data_set[:,2]]
        # computing acc, rec and precision for the testset
        acc = metrics.accuracy(voted_outputs,target_outputs)
        recall = metrics.recall(voted_outputs, target_outputs)
        precision = metrics.precision(voted_outputs, target_outputs)

        mse = metrics.mean_square_error(avg_outputs, testset.data_set[:,2])
        
        print("ACCURACY " + str(acc))
        print("PRECISION " + str(precision))
        print("RECALL " + str(recall))
        print("MSE " + str(mse))
Example #14
def __init__(self, username, password, download_directory=None):
    self.data_dir = self.setup_data_dir(download_directory)
    self.datasets = datasets.load()
    res = self.login(username, password)
Example #15
        "Russian Federation":"Russia",
        "Congo, Dem. Rep.":"Congo (Kinshasa)",
        "Venezuela, RB":"Venezuela",
        "St. Lucia":"Saint Lucia",
        "St. Vincent and the Grenadines":"Saint Vincent and the Grenadines",
        "Congo, Rep.":"Republic of the Congo",
        "Bahamas, The":"The Bahamas",
        "Gambia, The":"The Gambia"
    }
    for t in trans:
        s["Country/Region"] = s["Country/Region"].replace(t, trans[t])
    return s


if __name__ == "__main__":
    dsets = datasets.load()
    covid = datasets.combine(dsets)
    if (os.path.isfile(WDI_FILE)) :
        warnings.warn("Reading cached WDI data from disk, delete file to download updated")
        wdi = pd.read_pickle(WDI_FILE)
    else :
        wdi = covid.drop(columns=["Date","Province/State","Lat","Long", datasets.CONFIRMED,"deaths","recoveries"]).drop_duplicates()
        for id in INDICES_USED:
            s = wb.download(indicator=id, country="all", start=2005, end=2019).reset_index()

            # use most recent non missing value
            s = s.dropna().groupby("country").last()
            s = s.drop(columns="year").reset_index()

            # match country names to covid data
            s = s.rename(columns={"country":"Country/Region"})
parser.add_argument('-name')
parser.add_argument('-iteration', type=int)

args = vars(parser.parse_args())

print('Running iteration #%d for dataset %s...' %
      (args['iteration'], args['name']))

RESULTS_PATH = os.path.join(os.path.dirname(__file__), 'results')
TRIAL_PATH = os.path.join(RESULTS_PATH, args['name'])

for path in [RESULTS_PATH, TRIAL_PATH]:
    if not os.path.exists(path):
        os.mkdir(path)

partitions = load(args['name'])

for i in range(5):
    partition = partitions[i]

    for j in range(2):
        X_train, y_train = partition[j % 2]
        X_test, y_test = partition[(j + 1) % 2]
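        # the two halves of each partition swap train/test roles between the two
        # inner iterations, i.e. a 5x2 cross-validation style evaluation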

        mask = select(X_train, y_train, verbose=True)

        base_score = score(X_train, y_train, X_test, y_test,
                           RandomForestClassifier())
        selection_score = score(X_train[:, mask], y_train, X_test[:, mask],
                                y_test, RandomForestClassifier())
        features = []
Example #17
def evaluate_trial(resampler_name, fold):
    RESULTS_PATH = Path(__file__).parents[0] / 'results_final'
    RANDOM_STATE = 42

    resamplers = {
        'SMOTE': sv.SMOTE(random_state=RANDOM_STATE),
        'polynom-fit-SMOTE': sv.polynom_fit_SMOTE(random_state=RANDOM_STATE),
        'Lee': sv.Lee(random_state=RANDOM_STATE),
        'SMOBD': sv.SMOBD(random_state=RANDOM_STATE),
        'G-SMOTE': sv.G_SMOTE(random_state=RANDOM_STATE),
        'LVQ-SMOTE': sv.LVQ_SMOTE(random_state=RANDOM_STATE),
        'Assembled-SMOTE': sv.Assembled_SMOTE(random_state=RANDOM_STATE),
        'SMOTE-TomekLinks': sv.SMOTE_TomekLinks(random_state=RANDOM_STATE),
        'RBO': RBO(random_state=RANDOM_STATE),
        'PA': PA(random_state=RANDOM_STATE)
    }

    for dataset_name in datasets.names():
        classifiers = {
            'CART': DecisionTreeClassifier(random_state=RANDOM_STATE),
            'KNN': KNeighborsClassifier(n_neighbors=3),
            'SVM': SVC(kernel='rbf', random_state=RANDOM_STATE),
            'MLP': MLPClassifier(random_state=RANDOM_STATE)
        }

        trial_name = f'{dataset_name}_{fold}_{resampler_name}'
        trial_path = RESULTS_PATH / f'{trial_name}.csv'

        if trial_path.exists():
            continue

        logging.info(f'Evaluating {trial_name}...')

        dataset = datasets.load(dataset_name)

        (X_train, y_train), (X_test,
                             y_test) = dataset[fold][0], dataset[fold][1]

        resampler = resamplers[resampler_name]

        assert len(np.unique(y_train)) == len(np.unique(y_test)) == 2

        X_train, y_train = resampler.sample(X_train, y_train)

        rows = []

        for classifier_name in classifiers.keys():
            classifier = classifiers[classifier_name]

            clf = classifier.fit(X_train, y_train)
            predictions = clf.predict(X_test)

            scoring_functions = {
                'Precision': metrics.precision,
                'Recall': metrics.recall,
                'AUC': metrics.auc,
                'G-mean': metrics.g_mean
            }

            for scoring_function_name in scoring_functions.keys():
                score = scoring_functions[scoring_function_name](y_test,
                                                                 predictions)
                row = [
                    dataset_name, fold, classifier_name, resampler_name,
                    scoring_function_name, score
                ]
                rows.append(row)

        columns = [
            'Dataset', 'Fold', 'Classifier', 'Resampler', 'Metric', 'Score'
        ]

        RESULTS_PATH.mkdir(exist_ok=True, parents=True)

        pd.DataFrame(rows, columns=columns).to_csv(trial_path, index=False)
def extract(k=5, verbose=True):
    rows = []
    columns = ['Name', 'DI', 'IR', 'Samples', 'Features']

    for name in tqdm(datasets.names()):
        dataset = datasets.load(name)

        (X_train, y_train), (X_test, y_test) = dataset[0][0], dataset[0][1]

        X = np.concatenate([X_train, X_test])
        y = np.concatenate([y_train, y_test])

        n_samples = X.shape[0]
        n_features = X.shape[1]

        majority_class = Counter(y).most_common()[0][0]

        n_majority_samples = Counter(y).most_common()[0][1]
        n_minority_samples = Counter(y).most_common()[1][1]

        imbalance_ratio = np.round(n_majority_samples / n_minority_samples, 2)

        knn = NearestNeighbors(k + 1).fit(X)

        difficulty_coefficients = []

        for X_i, y_i in zip(X, y):
            if y_i == majority_class:
                continue
            else:
                indices = knn.kneighbors([X_i], return_distance=False)[0, 1:]
                n_majority_neighbors = sum(y[indices] == majority_class)

                difficulty_coefficients.append(n_majority_neighbors / k)

        difficulty_index = np.round(np.mean(difficulty_coefficients), 3)

        rows.append(
            [name, difficulty_index, imbalance_ratio, n_samples, n_features])

    df = pd.DataFrame(rows, columns=columns)
    df = df.sort_values('DI')

    df.to_csv(Path(__file__).parent / 'results' / 'dataset_info.csv',
              index=False)

    if verbose:
        for column in ['DI', 'IR']:
            df[column] = df[column].map(lambda x: f'{x:.2f}')

        for i in range(30):
            row = [str(df.iloc[i][c]) for c in columns]

            if i + 30 < len(df):
                row += [str(df.iloc[i + 30][c]) for c in columns]
            else:
                row += ['' for _ in columns]

            print(' & '.join(row).replace('_', '\\_') + ' \\\\')

    return df
Example #19
def run():
    while True:
        trial = pull_pending()

        if trial is None:
            break

        params = eval(trial['Parameters'])

        logging.info(trial)

        dataset = load(trial['Dataset'])
        fold = int(trial['Fold']) - 1

        (X_train, y_train), (X_test,
                             y_test) = dataset[fold][0], dataset[fold][1]

        n_minority = Counter(y_train).most_common()[1][1]
        n_majority = Counter(y_train).most_common()[0][1]

        imblearn_ratios = [
            ((n_majority - n_minority) * ratio + n_minority) / n_majority
            for ratio in [0.5, 0.75, 1.0]
        ]
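        # worked example of the mapping above: with n_majority = 90 and
        # n_minority = 10, ratios 0.5, 0.75 and 1.0 give sampling strategies of
        # (80*0.5 + 10)/90 ≈ 0.56, (80*0.75 + 10)/90 ≈ 0.78 and 90/90 = 1.0,
        # i.e. the minority count is raised to 50, 70 and 90 samples, closing
        # 50%, 75% and 100% of the gap to the majority class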

        clf = {
            'NB': NB(),
            'KNN': KNN(),
            'SVM': SVM(gamma='scale'),
            'CART': CART()
        }[params['classifier']]

        if (trial['Algorithm'] is None) or (trial['Algorithm'] == 'None'):
            algorithm = None
        else:
            algorithms = {
                'AKNN':
                ResamplingCV(AKNN, clf, n_neighbors=[1, 3, 5, 7]),
                'Bord':
                ResamplingCV(SMOTE,
                             clf,
                             kind=['borderline1'],
                             k_neighbors=[1, 3, 5, 7, 9],
                             m_neighbors=[5, 10, 15],
                             sampling_strategy=imblearn_ratios),
                'CC':
                ResamplingCV(CC, clf, sampling_strategy=imblearn_ratios),
                'CNN':
                ResamplingCV(CNN, clf, n_neighbors=[1, 3, 5, 7]),
                'ENN':
                ResamplingCV(ENN, clf, n_neighbors=[1, 3, 5, 7]),
                'IHT':
                ResamplingCV(IHT,
                             clf,
                             sampling_strategy=imblearn_ratios,
                             cv=[2]),
                'NCL':
                ResamplingCV(NCL, clf, n_neighbors=[1, 3, 5, 7]),
                'NM':
                ResamplingCV(NM, clf, n_neighbors=[1, 3, 5, 7]),
                'OSS':
                ResamplingCV(OSS, clf, n_neighbors=[1, 3, 5, 7]),
                'RBO':
                ResamplingCV(RBO,
                             clf,
                             gamma=[0.01, 0.1, 1.0, 10.0],
                             ratio=[0.5, 0.75, 1.0]),
                'RBU':
                ResamplingCV(RBU,
                             clf,
                             gamma=params.get('gamma'),
                             ratio=params.get('ratio')),
                'RENN':
                ResamplingCV(RENN, clf, n_neighbors=[1, 3, 5, 7]),
                'ROS':
                ResamplingCV(ROS, clf, sampling_strategy=imblearn_ratios),
                'RUS':
                ResamplingCV(RUS, clf, sampling_strategy=imblearn_ratios),
                'SMOTE':
                ResamplingCV(SMOTE,
                             clf,
                             k_neighbors=[1, 3, 5, 7, 9],
                             sampling_strategy=imblearn_ratios),
                'SMOTE+ENN':
                ResamplingCV(
                    SMOTEENN,
                    clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'SMOTE+TL':
                ResamplingCV(
                    SMOTETomek,
                    clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'TL':
                TL(),
            }

            algorithm = algorithms.get(trial['Algorithm'])

            if algorithm is None:
                raise NotImplementedError

        if algorithm is not None:
            X_train, y_train = algorithm.fit_sample(X_train, y_train)

        clf = clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        scores = {
            'Precision': metrics.precision(y_test, predictions),
            'Recall': metrics.recall(y_test, predictions),
            'F-measure': metrics.f_measure(y_test, predictions),
            'AUC': metrics.auc(y_test, predictions),
            'G-mean': metrics.g_mean(y_test, predictions)
        }

        submit_result(trial, scores)
Example #20
File: run.py Project: zhouying3/RBO
def run():
    while True:
        trial = pull_pending()

        if trial is None:
            break

        params = eval(trial['Parameters'])

        print(trial)

        clf = eval(params['classifier'])()

        if trial['Algorithm'] == 'RBO':
            algorithm = RBO(gamma=params['gamma'], n_steps=params['n_steps'], step_size=params['step_size'],
                            stop_probability=params['stop_probability'], criterion=params['criterion'])
        elif trial['Algorithm'] == 'RBOSelection':
            if params['measure'] == 'AUC':
                measure = metrics.roc_auc_score
            else:
                raise NotImplementedError

            algorithm = RBOSelection(classifier=clf, measure=measure, gammas=params['gammas'], n_steps=params['n_steps'],
                                     step_size=params['step_size'], stop_probability=params['stop_probability'],
                                     criterion=params['criterion'])
        elif (trial['Algorithm'] is None) or (trial['Algorithm'] == 'None'):
            algorithm = None
        else:
            algorithms = {
                'SMOTE': SMOTE(),
                'SMOTE+ENN': SMOTEENN(),
                'SMOTE+TL': SMOTETomek(),
                'Bord': SMOTE(kind='borderline1'),
                'ADASYN': ADASYN(),
                'NCL': NCL()
            }

            algorithm = algorithms.get(trial['Algorithm'])

            if algorithm is None:
                raise NotImplementedError

        dataset = load(trial['Dataset'], noise_type=params.get('noise_type', None),
                       noise_level=params.get('noise_level', 0.0))
        fold = int(trial['Fold']) - 1

        (X_train, y_train), (X_test, y_test) = dataset[fold][0], dataset[fold][1]

        labels = np.unique(y_test)
        counts = [len(y_test[y_test == label]) for label in labels]
        minority_class = labels[np.argmin(counts)]

        if algorithm.__class__ in [SMOTE, SMOTEENN, SMOTETomek]:
            train_labels = np.unique(y_train)
            train_counts = [len(y_train[y_train == train_label]) for train_label in train_labels]
            train_minority_class = train_labels[np.argmin(train_counts)]
            algorithm.k = algorithm.k_neighbors = np.min([len(y_train[y_train == train_minority_class]) - 1, 5])

        if algorithm is not None:
            X_train, y_train = algorithm.fit_sample(X_train, y_train)

        clf = clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        g_mean = 1.0

        for label in np.unique(y_test):
            idx = (y_test == label)
            g_mean *= metrics.accuracy_score(y_test[idx], predictions[idx])

        g_mean = np.sqrt(g_mean)
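        # the loop above multiplies per-class accuracies (recalls), so for a binary
        # problem with recalls of, say, 0.90 and 0.64 the geometric mean is
        # sqrt(0.90 * 0.64) ≈ 0.76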

        scores = {
            'Accuracy': metrics.accuracy_score(y_test, predictions),
            'Average accuracy': np.mean(metrics.recall_score(y_test, predictions, average=None)),
            'Precision': metrics.precision_score(y_test, predictions, pos_label=minority_class),
            'Recall': metrics.recall_score(y_test, predictions, pos_label=minority_class),
            'F-measure': metrics.f1_score(y_test, predictions, pos_label=minority_class),
            'AUC': metrics.roc_auc_score(y_test, predictions),
            'G-mean': g_mean
        }

        submit_result(trial, scores)
Example #21
def cup(param_grid):
    dataset = ds.load('datasets/ML-CUP19-TR.csv', 'CUP')
    # for the final training we combine the previous training set and validation set
    # so that more data is available
    trainset, testset = dataset.split(75 / 100)
    # data normalization

    params = next(ms.grid_search(param_grid))
    print(params)
    params['batch_size'] = params[
        'batch_size'] if params['batch_size'] > 0 else trainset.size()

    epochs = params['epochs']
    batch_size = params['batch_size']

    runs_number = 1
    for run in range(runs_number):
        nn.from_parameters(params, 'sigmoid', 'linear')
        model = nn.build()
        ms.add_model(model)

    ms.set_datasets(trainset, testset)

    start = time.time()
    for e in range(epochs):
        ppb(e + 1, epochs, prefix='Training', suffix='Completed')
        for model_id, model in ms.models():
            model.fit(trainset, batch_size, e)

            train_outputs = model.forward_dataset(trainset)
            test_outputs = model.forward_dataset(testset)

        ms.compute_error(model_id,
                         train_outputs,
                         test_outputs,
                         metrics=['mse', 'mee'])

    training_time = time.time() - start
    print('TRAINING TIME: ' + str(training_time) + 'seconds')

    avg_tr_mse, avg_ts_mse = ms.avg_mse()
    avg_tr_mee, avg_ts_mee = ms.avg_mee()

    res.set_task('CUP')
    plt = res.plot_mse(epochs, avg_tr_mse, avg_ts_mse, params, label2='test')
    msepath = res.save_plot(plt, 'mse')

    plt = res.plot_mee(epochs, avg_tr_mee, avg_ts_mee, params, label2='test')
    res.save_plot(plt, 'mee')

    print("TRAINING MSE " + str(avg_tr_mse[-1]))
    print("TRAINING MEE " + str(avg_tr_mee[-1]))

    # here we want to use the testset to assess the model performances
    trained_models = [m for _, m in ms.models()]
    avg_outputs = []
    for batch in testset.batch(1):
        for pattern in batch:
            tmp_real_outputs_x = []
            tmp_real_outputs_y = []
            for m in trained_models:
                real_out = m.feed_forward(pattern[1])
                tmp_real_outputs_x.append(real_out[0])
                tmp_real_outputs_y.append(real_out[1])

                # we get the average output to compute the error
                avg_outputs.append(
                    [mean(tmp_real_outputs_x),
                     mean(tmp_real_outputs_y)])

    metrics = ms.get_metrics()
    mse = metrics.mean_square_error(avg_outputs, testset.data_set[:, 2])
    mee = metrics.mean_euclidian_error(avg_outputs, testset.data_set[:, 2])

    print("MSE " + str(mse))
    print("MEE " + str(mee))

    blindds = ds.load_blind('datasets/ML-CUP19-TS.csv', 'CUP')

    avg_outputs = []
    for batch in blindds.batch(1):
        for pattern in batch:
            tmp_real_outputs_x = []
            tmp_real_outputs_y = []
            for m in trained_models:
                real_out = m.feed_forward(pattern[1])
                tmp_real_outputs_x.append(real_out[0])
                tmp_real_outputs_y.append(real_out[1])

                # we get the average output to compute the error
                avg_outputs.append(
                    [mean(tmp_real_outputs_x),
                     mean(tmp_real_outputs_y)])

    with open("report/poxebur_wikilele_ML-CUP-TS.csv", "a+") as cupfile:
        # cleaning the file
        cupfile.seek(0)
        cupfile.truncate()

        cupfile.write("# Leonardo Frioli Luigi Quarantiello \n")
        cupfile.write("# poxebur_wikilele \n")
        cupfile.write("# ML-CUP19 \n")
        cupfile.write("# 10/01/2020 \n")

        for i in range(len(avg_outputs)):
            cupfile.write(
                str(i + 1) + ", " + str(avg_outputs[i][0]) + ", " +
                str(avg_outputs[i][1]) + "\n")
Example #22
def cup(param_grid):
    dataset = ds.load('datasets/ML-CUP19-TR.csv', 'CUP')
    # 25% testset, 75% training set + validationset
    trainvalset, testset = dataset.split(75 / 100)
    # with simple hold-out, the validation set is half the size of the training set
    trainset, validationset = trainvalset.split(66.6 / 100)

    for params in ms.grid_search(param_grid):
        params['batch_size'] = params[
            'batch_size'] if params['batch_size'] > 0 else trainset.size()
        print(params)

        epochs = params['epochs']
        batch_size = params['batch_size']

        runs_number = 1
        for run in range(runs_number):
            nn.from_parameters(params, 'sigmoid', 'linear')
            model = nn.build()
            ms.add_model(model)

        ms.set_datasets(trainset, validationset)

        start = time.time()
        for e in range(epochs):
            ppb(e + 1, epochs, prefix='Training', suffix='Completed')

            for model_id, model in ms.models():
                model.fit(trainset, batch_size, e)

                train_outputs = model.forward_dataset(trainset)
                val_outputs = model.forward_dataset(validationset)

                ms.compute_error(model_id,
                                 train_outputs,
                                 val_outputs,
                                 metrics=['mse', 'mee'])

        training_time = time.time() - start
        print('TRAINING TIME: ' + str(training_time) + 'seconds')

        avg_tr_mse, avg_val_mse = ms.avg_mse()
        avg_tr_mee, avg_val_mee = ms.avg_mee()

        res.set_task('CUP')
        plt = res.plot_mse(epochs, avg_tr_mse, avg_val_mse, params)
        msepath = res.save_plot(plt, 'mse')

        plt = res.plot_mee(epochs, avg_tr_mee, avg_val_mee, params)
        res.save_plot(plt, 'mee')

        res.add_result(avg_tr_mse[-1], avg_val_mse[-1], avg_tr_mee[-1],
                       avg_val_mee[-1], params['epochs'], params['batch_size'],
                       params['weights_bound'], params['learning_rate'],
                       params['momentum_alpha'], params['use_nesterov'],
                       params['regularization_lambda'], msepath)
        ms.clean()

    res.add_result_header('mse_tr', 'mse_val', 'mee_tr', 'mee_val', 'batch_s',
                          'weights', 'lr', 'm_alpha', 'nesterov', 'r_lambda',
                          'path')
    res.save_results()
Example #23
from KarfNN.layer import Dense, Dropout
from KarfNN.models import Karf

from datasets import load

def toDummies(df, Columns):
    for Column in Columns:
        new_df = pandas.get_dummies(df[Column], prefix=Column)
        df = pandas.concat([df, new_df], axis=1)
    df = df.drop(Columns, axis=1)
    return df
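# for example, toDummies(df, ['Species']) replaces the Species column with one 0/1
# indicator column per distinct species value (one-hot encoding), which is how the
# target vector is encoded below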

np.random.seed(1)

df = load("Iris")

# shuffle data
data = df.iloc[np.random.permutation(len(df))]

# split data to X and y and code Species names to numbers
X = data.drop(["Id", "Species"], axis=1).astype(float)
y = data[["Species"]]

# OneHot encoding for output vector
y = toDummies(y,["Species"])

# split data to training sets and testing sets
train_split = int(len(X) * 0.75)

Xtrain = X[:train_split].values
def run_average():

    for v in range(0, 3, 2):
        for current_dataset in Datasets:

            avgAccRF = avgAccBA = avgAccBANew = avgF1RF = avgF1BA = avgF1BANew = 0.0

            for current_fold in range(1, 11):
                df_train, df_test, ds_infos = ds.load(source_path, current_dataset, current_fold)
                X_train, y_train = df_train.iloc[:, :-1].values, df_train.iloc[:, -1].values
                X_test, y_test = df_test.iloc[:, :-1].values, df_test.iloc[:, -1].values
                fileName = current_dataset + '.RF' + str(current_fold) + ".txt"
                random_forest_file = os.path.join(source_path, 'resources', 'forests', current_dataset, fileName)
                random_forest = tree_io.classifier_from_file(random_forest_file, X_train, y_train, pruning=False)
                fileName = current_dataset + '.BA' + str(current_fold) + '.O0.T10.tree'
                born_again_O0_file = os.path.join(output_path, 'trees10', current_dataset, fileName)
                born_againO0 = tree_io.classifier_from_file(born_again_O0_file, X_train, y_train, pruning=False)
                # print(born_again_O0_file)
                fileName = current_dataset + '.BA' + str(current_fold) + '.O5.T10.V' + str(v) + '.tree'
                born_again_O5_file = os.path.join(output_path, 'trees10', current_dataset, 'ExactDepth', fileName)
                print(born_again_O5_file)
                born_againO5 = tree_io.classifier_from_file(born_again_O5_file, X_train, y_train, pruning=False)

                # RandomForest
                rf_test_pred = random_forest.predict(X_test)
                rf_train_pred = random_forest.predict(X_train)
                report_rf = classification_report(y_test, rf_test_pred, output_dict=True)
                report_rf_train = classification_report(y_train, rf_train_pred, output_dict=True)

                # BornAgain
                ba_test_pred = born_againO0.predict(X_test)
                ba_train_pred = born_againO0.predict(X_train)
                report_ba = classification_report(y_test, ba_test_pred, output_dict=True)
                report_ba_train = classification_report(y_train, ba_train_pred, output_dict=True)

                # BornAgainNew
                banew_test_pred = born_againO5.predict(X_test)
                banew_train_pred = born_againO5.predict(X_train)
                report_banew = classification_report(y_test, banew_test_pred, output_dict=True)
                report_banew_train = classification_report(y_train, banew_train_pred, output_dict=True)

                add_report(df, 'RandomForest', report_rf_train, report_rf)
                add_report(df, 'BornAgain', report_ba_train, report_ba)
                add_report(df, 'BornAgainNew', report_banew_train, report_banew)

                avgAccRF = avgAccRF + report_rf_train['accuracy']
                avgF1RF = avgF1RF + report_rf['weighted avg']['f1-score']
                avgAccBA = avgAccBA + report_ba_train['accuracy']
                avgF1BA = avgF1BA + report_ba['weighted avg']['f1-score']
                avgAccBANew = avgAccBANew + report_banew_train['accuracy']
                avgF1BANew = avgF1BANew + report_banew['weighted avg']['f1-score']

            print("Average RF Accuracy and F1 in " + current_dataset + " with value " + str(v) + " : " + str(
                avgAccRF / 10) + " " + str(avgF1RF / 10))
            print("Average BA Accuracy and F1 in " + current_dataset + " with value " + str(v) + " : " + str(
                avgAccBA / 10) + " " + str(avgF1BA / 10))
            print("Average BANew Accuracy and F1 in " + current_dataset + " with value " + str(v) + " : " + str(
                avgAccBANew / 10) + " " + str(avgF1BANew / 10))

        a = pd.DataFrame(data=df, index=None)
        path = output_path + '/ResultsV' + str(v) + '.xlsx'
        a.to_excel(path)
Example #25
def evaluate_trial(ratio, fold):
    RESULTS_PATH = Path(__file__).parents[0] / 'results_ratio'
    RANDOM_STATE = 42

    for dataset_name in datasets.names():
        classifiers = {
            'CART': DecisionTreeClassifier(random_state=RANDOM_STATE),
            'KNN': KNeighborsClassifier(n_neighbors=3),
            'SVM': SVC(kernel='rbf', random_state=RANDOM_STATE),
            'MLP': MLPClassifier(random_state=RANDOM_STATE)
        }

        trial_name = f'{dataset_name}_{fold}_{ratio}'
        trial_path = RESULTS_PATH / f'{trial_name}.csv'

        if trial_path.exists():
            continue

        logging.info(f'Evaluating {trial_name}...')

        dataset = datasets.load(dataset_name)

        (X_train, y_train), (X_test,
                             y_test) = dataset[fold][0], dataset[fold][1]

        resampler = PA(ratio=ratio, random_state=RANDOM_STATE)

        assert len(np.unique(y_train)) == len(np.unique(y_test)) == 2

        try:
            X_train, y_train = resampler.sample(X_train, y_train)
        except RuntimeError:
            continue

        rows = []

        for classifier_name in classifiers.keys():
            classifier = classifiers[classifier_name]

            clf = classifier.fit(X_train, y_train)
            predictions = clf.predict(X_test)

            scoring_functions = {
                'Precision': metrics.precision,
                'Recall': metrics.recall,
                'AUC': metrics.auc,
                'G-mean': metrics.g_mean
            }

            for scoring_function_name in scoring_functions.keys():
                score = scoring_functions[scoring_function_name](y_test,
                                                                 predictions)
                row = [
                    dataset_name, fold, classifier_name, ratio,
                    scoring_function_name, score
                ]
                rows.append(row)

        columns = ['Dataset', 'Fold', 'Classifier', 'Ratio', 'Metric', 'Score']

        RESULTS_PATH.mkdir(exist_ok=True, parents=True)

        pd.DataFrame(rows, columns=columns).to_csv(trial_path, index=False)
Example #26
from __future__ import division, print_function
import numpy as np
import utils
from model01 import MLPModel01
from metrics import performance_report
import datasets

n_categories = 2  # implicit in prepare_data (maybe parameterise)
lookahead = 1
window = 60
sym = 'USDJPY'

# In[21]:

X_train, Y_train, prices_train, _ = datasets.load(
    datasets.filename('DS2', lookahead, window, sym, 2009))

X_dev, Y_dev, prices_dev, _ = datasets.load(
    datasets.filename('DS2', lookahead, window, sym, 2010))
# sample 50k records from 2010 as dev set
dev_idx = np.random.choice(len(X_dev), 50000, replace=False)
X_dev, Y_dev, prices_dev = X_dev.ix[dev_idx], Y_dev.ix[
    dev_idx], prices_dev.ix[dev_idx]

X_test, Y_test, prices_test, _ = datasets.load(
    datasets.filename('DS2', lookahead, window, sym, 2011))

# In[23]:

print("train", X_train.shape)
print("dev", X_dev.shape)
Example #27
def extract(verbose=True):
    dfs = []

    for partition in ['preliminary', 'final']:
        rows = []

        for name in tqdm(datasets.names(partition)):
            dataset = datasets.load(name)

            (X_train, y_train), (X_test, y_test) = dataset[0][0], dataset[0][1]

            X = np.concatenate([X_train, X_test])
            y = np.concatenate([y_train, y_test])

            n_samples = X.shape[0]
            n_features = X.shape[1]

            majority_class = Counter(y).most_common()[0][0]

            n_majority_samples = Counter(y).most_common()[0][1]
            n_minority_samples = Counter(y).most_common()[1][1]

            imbalance_ratio = np.round(n_majority_samples / n_minority_samples,
                                       2)

            knn = NearestNeighbors(6).fit(X)

            n_safe = 0
            n_borderline = 0
            n_rare = 0
            n_outliers = 0
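            # each minority sample is categorized by how many of its 5 nearest
            # neighbours belong to the majority class: 0-1 -> safe, 2-3 -> borderline,
            # 4 -> rare, 5 -> outlier (a neighbourhood-based minority-example taxonomy)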

            for X_i, y_i in zip(X, y):
                if y_i == majority_class:
                    continue
                else:
                    indices = knn.kneighbors([X_i], return_distance=False)[0,
                                                                           1:]
                    n_majority_neighbors = sum(y[indices] == majority_class)

                    if n_majority_neighbors in [0, 1]:
                        n_safe += 1
                    elif n_majority_neighbors in [2, 3]:
                        n_borderline += 1
                    elif n_majority_neighbors == 4:
                        n_rare += 1
                    elif n_majority_neighbors == 5:
                        n_outliers += 1
                    else:
                        raise ValueError

            n_total = n_safe + n_borderline + n_rare + n_outliers

            percentage_safe = np.round(n_safe / n_total * 100, 2)
            percentage_borderline = np.round(n_borderline / n_total * 100, 2)
            percentage_rare = np.round(n_rare / n_total * 100, 2)
            percentage_outlier = np.round(n_outliers / n_total * 100, 2)

            rows.append([
                name, imbalance_ratio, n_samples, n_features, percentage_safe,
                percentage_borderline, percentage_rare, percentage_outlier
            ])

        df = pd.DataFrame(rows,
                          columns=[
                              'name', 'imbalance_ratio', 'n_samples',
                              'n_features', 'percentage_safe',
                              'percentage_borderline', 'percentage_rare',
                              'percentage_outlier'
                          ])
        df = df.sort_values('imbalance_ratio')

        dfs.append(df)

    df = pd.concat(dfs).reset_index(drop=True)
    df.to_csv(Path(__file__).parent / 'results' / 'dataset_info.csv',
              index=False)

    if verbose:
        for i, row in df.iterrows():
            row = [str(v).replace('_', '\\_') for v in row]

            print(' & '.join(row) + ' \\\\')

            if i == 19:
                print('\\midrule')

    return df
    eval_model = Evaluate(scan_model)
    results = eval_model.evaluate(np.array(test_x), np.array(test_y), task='continuous',folds=10, metric='loss')
    return np.array([inverse_transform(scaler,result) for result in results])

if __name__ == "__main__":

    
    base_dir = os.getcwd()
    start_time = datetime.now()
    experiment_name = start_time.strftime("%m_%d_%Y_%H_%M_%S")


    
    scaler = MinMaxScaler()
    dataset = 'hele_norge'
    train_x, train_y, validation_x, validation_y, test_x, test_y, scaler = datasets.load(f'../input/{dataset}.csv', scaler)
    
    #Save scaler for future predictions:
    joblib.dump(scaler, f'../talos_training/{dataset}.scaler')


    round_lim = 30
    
    if len(sys.argv) == 2:
        print("10-feature training initialized")
        features = ['boligtype_Leilighet', 'boligtype_Enebolig', 'bruksareal', 'boligtype_Tomannsbolig', 'postnummer', 'boligtype_Rekkehus', 
        'neighborhood_environment_demographics_housingage_10-30', 'neighborhood_environment_demographics_housingprices_0-2000000', 'neighborhood_environment_demographics_housingage_30-50',
        'eieform_Andel']
        parameters = {'activation_1':['relu', 'elu'],
        'activation_2':['relu', 'elu'],
        'activation_3':['relu', 'elu'],
def prepare(dataset,
            partition,
            fold,
            mode='OVA',
            output_path=DEFAULT_ROOT_OUTPUT_PATH,
            energy=0.25,
            cleaning_strategy='translate',
            selection_strategy='proportional',
            p_norm=1.0,
            method='sampling'):
    logging.info('Processing fold %dx%d of dataset "%s"...' %
                 (partition, fold, dataset))

    output_path = Path(output_path) / dataset
    output_path.mkdir(parents=True, exist_ok=True)

    (X_train, y_train), (X_test,
                         y_test) = datasets.load(dataset, partition, fold)

    header = pd.read_csv(DEFAULT_DATA_PATH / 'folds' / dataset /
                         ('%s.%d.%d.train.csv' %
                          (dataset, partition, fold))).columns

    if mode == 'OVA':
        logging.info('Training distribution before resampling: %s.' %
                     Counter(y_train))

        X_train, y_train = algorithms.MultiClassCCR(
            energy=energy,
            cleaning_strategy=cleaning_strategy,
            selection_strategy=selection_strategy,
            p_norm=p_norm,
            method=method).fit_sample(X_train, y_train)

        logging.info('Training distribution after resampling: %s.' %
                     Counter(y_train))

        csv_path = output_path / ('%s.%d.%d.train.oversampled.csv' %
                                  (dataset, partition, fold))

        pd.DataFrame(np.c_[X_train, y_train]).to_csv(csv_path,
                                                     index=False,
                                                     header=header)
    elif mode == 'OVO':
        classes = np.unique(np.concatenate([y_train, y_test]))

        for i in range(len(classes)):
            for j in range(i + 1, len(classes)):
                logging.info('Resampling class %s vs. class %s.' %
                             (classes[i], classes[j]))

                indices = ((y_train == classes[i]) | (y_train == classes[j]))

                X, y = X_train[indices].copy(), y_train[indices].copy()

                logging.info('Training distribution before resampling: %s.' %
                             Counter(y))

                X, y = algorithms.CCR(energy=energy,
                                      cleaning_strategy=cleaning_strategy,
                                      selection_strategy=selection_strategy,
                                      p_norm=p_norm).fit_sample(X, y)

                logging.info('Training distribution after resampling: %s.' %
                             Counter(y))

                csv_path = output_path / (
                    '%s.%d.%d.train.oversampled.%dv%d.csv' %
                    (dataset, partition, fold, classes[i], classes[j]))

                pd.DataFrame(np.c_[X, y]).to_csv(csv_path,
                                                 index=False,
                                                 header=header)
    else:
        raise NotImplementedError
Example #30
def get_datasets():
    return datasets.load()