Example #1
def get_best_estimator_for_specific_classifier(X_train, y_train, specific,
                                               n_jobs, grid_size, fold):

    # Create the classifier
    clf = MetaLazyClassifier(specific_classifier=specific,
                             select_features=False,
                             n_neighbors=200,
                             n_jobs=n_jobs,
                             grid_size=grid_size)

    tuned_parameters = {
        'weight_function': ['cosine', 'inverse', 'dwknn'],
        'number_of_cooccurrences': [5, 15]
    }

    # first we find the best configuration in general
    print('GRID SEARCH FOR FOLD {}'.format(fold))
    start_grid = time.time()
    grid = GridSearchCV(clf,
                        tuned_parameters,
                        cv=3,
                        scoring='f1_macro',
                        n_jobs=1)
    grid.fit(X_train, y_train)
    end = time.time()
    print('GENERAL - Total grid time: {}'.format((end - start_grid)))
    print('GENERAL - Best score was {} with \n {}'.format(
        grid.best_score_, grid.best_estimator_))

    estimator = grid.best_estimator_
    print('GENERAL - Best param was {}\n'.format(grid.best_params_))

    return estimator
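A minimal call sketch (hypothetical argument values; `X_train`/`y_train` as produced by the `DatasetReader` used in the later examples):

best = get_best_estimator_for_specific_classifier(
    X_train, y_train, specific='logistic', n_jobs=-1, grid_size=5000, fold=0)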
Example #2
def get_best_version_for_each_dataset(dataset, n_jobs, grid_size):
    '''
    Return the MetaLazyClassifier with the best configuration found for the given dataset.

    :param dataset: dataset name (e.g. '20ng', 'acm', 'webkb')
    :param n_jobs: number of jobs to run in parallel
    :param grid_size: size of the sample used for the hyperparameter search
    :return: a configured MetaLazyClassifier
    '''
    if dataset in ['logistic_200_inverse', '20ng', 'reut', 'reut90']:
        return MetaLazyClassifier(select_features=False, n_jobs=n_jobs, grid_size=grid_size,
                                  specific_classifier='logistic', weight_function='inverse', n_neighbors=200,
                                  number_of_cooccurrences=10)
    elif dataset in ['acm', 'logistic_200_cosine']:
        return MetaLazyClassifier(select_features=False, n_jobs=n_jobs, grid_size=grid_size,
                                  specific_classifier='logistic', weight_function='cosine', n_neighbors=200,
                                  number_of_cooccurrences=10)
    elif dataset in ['stanford', 'stanford_tweets', 'logistic_100_inverse']:
        return MetaLazyClassifier(select_features=False, n_jobs=n_jobs, grid_size=grid_size,
                                  specific_classifier='logistic', weight_function='inverse', n_neighbors=100,
                                  number_of_cooccurrences=10)
    elif dataset in ['4uni', 'webkb', 'extrarf_200_inverse']:
        return MetaLazyClassifier(select_features=False, n_jobs=n_jobs, grid_size=grid_size,
                                  specific_classifier='extrarf', weight_function='inverse', n_neighbors=350,
                                  number_of_cooccurrences=10)
    elif dataset in ['yelp', 'yelp_reviews', 'extrarf_200_cosine']:
        return MetaLazyClassifier(select_features=False, n_jobs=n_jobs, grid_size=grid_size,
                                  specific_classifier='extrarf', weight_function='cosine', n_neighbors=200,
                                  number_of_cooccurrences=10)
    elif dataset in ['nb_200_cosine']:
        return MetaLazyClassifier(select_features=False, n_jobs=n_jobs, grid_size=grid_size,
                                  specific_classifier='nb', weight_function='cosine', n_neighbors=200,
                                  number_of_cooccurrences=10)
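For example, a hypothetical lookup (the dataset name must match one of the keys above); '20ng' resolves to the logistic weaker classifier with 'inverse' weighting and 200 neighbours:

clf = get_best_version_for_each_dataset(dataset='20ng', n_jobs=-1, grid_size=5000)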
Example #3
def get_estimator(specific, weight, cooc, oversampling):

    weight_value = 'inverse' if weight == 1 else 'none'
    cooc_value = 10 if cooc == 1 else 0

    # Create the classifier
    clf = MetaLazyClassifier(specific_classifier=specific,
                             select_features=False,
                             n_neighbors=200,
                             weight_function=weight_value,
                             number_of_cooccurrences=cooc_value,
                             oversample=oversampling,
                             n_jobs=3)
    return clf
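The integer flags map onto MetaLazy parameters: weight=1 selects the 'inverse' weight function (otherwise 'none'), and cooc=1 enables 10 co-occurrence features (otherwise 0). A hypothetical call:

clf = get_estimator(specific='nb', weight=1, cooc=0, oversampling=False)
# naive Bayes weaker classifier, inverse weighting, co-occurrence features disabled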
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')

    args = parser.parse_args()
    path = args.p

    dataset_reader = DatasetReader(path)

    fold = 0
    result = {'25': [], '200': []}

    start = time.time()

    while dataset_reader.has_next():
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        for N in [25, 200]:
            print('{} Neighbours'.format(N))
            clf = MetaLazyClassifier(
                n_neighbors=N,
                select_features=False,
                weight_function='inverse',
                log_time_file='/home/lfmendes/data/mestrado/metalazy/results/tempos2/logtimes{}_{}.json'.format(N, fold))
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            # print(classification_report(y_pred=y_pred, y_true=y_test))
            result[str(N)].append(
                f1_score(y_true=y_test, y_pred=y_pred, average='macro'))

            clf.flush_log_time_file()
        fold = fold + 1

    print('\n\n ---------\n EXPERIMENT RESULT \n ---------')
    print(result)
    for N in ['25', '200']:
        print('{}: {}'.format(N, np.mean(np.array(result[N]))))

    end = time.time()
    print(end - start)
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    parser.add_argument('-o', help='path to the output directory')
    parser.add_argument(
        '-j',
        help='number of jobs to run in parallel. Use -1 for all. Default: -1')
    parser.add_argument(
        '-g',
        help='Size of the sample for the hyperparameter search. Default: 5000')

    args = parser.parse_args()

    output_path = args.o
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    path = args.p

    n_jobs = -1
    if args.j:
        n_jobs = int(args.j)

    grid_size = 5000
    if args.g:
        grid_size = int(args.g)

    dataset_reader = DatasetReader(path)

    fold = 0
    result = []
    times = []

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))
        start_fold = time.time()

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # Create the classifier (this script grid-searches an ExtraTrees baseline)
        clf = ExtraTreesClassifier(n_jobs=-1)

        tuned_parameters = [{
            'criterion': ['gini', 'entropy'],
            'max_features': ['log2', 'sqrt'],
            'class_weight': ['balanced', None],
            'n_estimators': [100, 200]
        }]

        print('GENERAL STARTING')
        start_grid = time.time()
        grid = GridSearchCV(clf,
                            tuned_parameters,
                            cv=3,
                            scoring='f1_macro',
                            n_jobs=1)
        grid.fit(X_train, y_train)
        end = time.time()
        print('GENERAL - Total grid time: {}'.format((end - start_grid)))
        print('GENERAL - Best score was {} with \n {}'.format(
            grid.best_score_, grid.best_estimator_))

        # Fit the train data
        fit(grid.best_estimator_, X_train, y_train, time_dic)

        # Predict
        y_pred = predict(grid.best_estimator_, X_test, time_dic)

        print(str(grid.best_estimator_))
        # MetaLazy exposes its chosen weak classifier as `.weaker`; plain sklearn
        # estimators (like the ExtraTrees baseline here) do not, hence the getattr
        print(str(getattr(grid.best_estimator_, 'weaker', None)))
        # Save the result
        result.append({
            'macro': f1_score(y_true=y_test, y_pred=y_pred, average='macro'),
            'micro': f1_score(y_true=y_test, y_pred=y_pred, average='micro'),
            'config': str(grid.best_estimator_),
            'best_clf': str(getattr(grid.best_estimator_, 'weaker', None)),
        })

        print('Macro: {}'.format(
            f1_score(y_true=y_test, y_pred=y_pred, average='macro')))
        print('Micro: {}'.format(
            f1_score(y_true=y_test, y_pred=y_pred, average='micro')))
        times.append(time_dic)
        fold = fold + 1
        end_fold = time.time()
        print('Total fold time: {}'.format((end_fold - start_fold)))
        print('train size {}'.format(X_train.shape))
        print('test size {}'.format(X_test.shape))
        print()

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    result_dataframe = pd.DataFrame(data=result)
    print(result_dataframe.head(10))
    result_dataframe.to_csv(output_path + '/result.csv', index=False)

    times_dataframe = pd.DataFrame(data=times)
    print(times_dataframe.head(10))
    times_dataframe.to_csv(output_path + '/times.csv', index=False)
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    parser.add_argument('-o', help='path to the output directory')
    parser.add_argument('-j', help='number of jobs to run in parallel. Use -1 for all. Default: -1')
    parser.add_argument('-g', help='Size of the sample for the hyperparameter search. Default: 5000')

    args = parser.parse_args()

    output_path = args.o
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    path = args.p

    n_jobs = -1
    if args.j:
        n_jobs = int(args.j)

    grid_size = 5000
    if args.g:
        grid_size = int(args.g)

    dataset_reader = DatasetReader(path)

    fold = 0
    result = []
    times = []

    configurations = {'specific_classifier': [0, 1],
                      'weight': [0, 1],
                      'cooccurrence': [0, 1]}

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # Create the classifier
        clf = MetaLazyClassifier(select_features=False,
                                 n_jobs=n_jobs,
                                 grid_size=grid_size)

        # for each fold we vary the weight function, the number of co-occurrences and the classifier choice
        for specific in configurations['specific_classifier']:
            for weight in configurations['weight']:
                for cooccurrence in configurations['cooccurrence']:
                    print(
                        'Running for specific {}, weight {} and cooccurrence {}'.format(specific, weight, cooccurrence))

                    tuned_parameters = choose_tunning_parameters(specific=specific, weight=weight,
                                                                 coccurrence=cooccurrence)

                    print('GENERAL STARTING')
                    start_grid = time.time()
                    grid = GridSearchCV(clf, tuned_parameters, cv=3, scoring='f1_macro', n_jobs=1)
                    grid.fit(X_train, y_train)
                    end = time.time()
                    print('GENERAL - Total grid time: {}'.format((end - start_grid)))
                    print('GENERAL - Best score was {} with \n {}'.format(grid.best_score_, grid.best_estimator_))

                    # Fit the train data
                    fit(grid.best_estimator_, X_train, y_train, time_dic)

                    # Predict
                    y_pred = predict(grid.best_estimator_, X_test, time_dic)

                    print(str(grid.best_estimator_))
                    print(str(grid.best_estimator_.weaker))
                    # Save the result
                    result.append({
                        'macro': f1_score(y_true=y_test, y_pred=y_pred, average='macro'),
                        'micro': f1_score(y_true=y_test, y_pred=y_pred, average='micro'),
                        'config': str(grid.best_estimator_),
                        'best_clf': str(grid.best_estimator_.weaker),
                    })

                    configuration = {'weight': weight, 'specific': specific, 'cooc': cooccurrence}

                    result[-1].update(configuration)

        # the scores below refer to the last configuration evaluated in this fold
        print('Macro: {}'.format(f1_score(y_true=y_test, y_pred=y_pred, average='macro')))
        print('Micro: {}'.format(f1_score(y_true=y_test, y_pred=y_pred, average='micro')))
        times.append(time_dic)
        fold = fold + 1

        result_dataframe = pd.DataFrame(data=result)
        print(result_dataframe.head(10))
        result_dataframe.to_csv(output_path + '/result_factorial.csv', index=False)

        times_dataframe = pd.DataFrame(data=times)
        print(times_dataframe.head(10))
        times_dataframe.to_csv(output_path + '/times.csv', index=False)

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    result_dataframe = pd.DataFrame(data=result)
    print(result_dataframe.head(10))
    result_dataframe.to_csv(output_path + '/result_factorial.csv', index=False)

    times_dataframe = pd.DataFrame(data=times)
    print(times_dataframe.head(10))
    times_dataframe.to_csv(output_path + '/times.csv', index=False)
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    parser.add_argument('-o', help='path to the output directory')
    parser.add_argument('-c', help='classifier')
    parser.add_argument('-k', help='number of neighbours')
    parser.add_argument('-w', help='weight function')
    parser.add_argument('-f', help='number of co-occurrence features. Default: 10')
    parser.add_argument('-j', help='number of jobs to run in parallel. Use -1 for all. Default: -1')
    parser.add_argument('-g', help='Size of the sample for the hyperparameter search. Default: 5000')

    args = parser.parse_args()

    output_path = args.o
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    path = args.p
    k = int(args.k)
    weight_function = args.w
    classifier_name = args.c

    n_cooc = 10
    if args.f:
        n_cooc = int(args.f)

    n_jobs = -1
    if args.j:
        n_jobs = int(args.j)

    grid_size = 5000
    if args.g:
        grid_size = int(args.g)

    dataset_reader = DatasetReader(path)

    fold = 0
    result = []
    times = []

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # Create the classifier
        clf = MetaLazyClassifier(specific_classifier=classifier_name, n_neighbors=k, select_features=False,
                                 weight_function=weight_function, n_jobs=n_jobs,
                                 grid_size=grid_size, number_of_cooccurrences=n_cooc)

        # Fit the train data
        fit(clf, X_train, y_train, time_dic)

        # Predict
        y_pred = predict(clf, X_test, time_dic)

        print(str(clf))
        print(str(clf.weaker))
        # Save the result
        result.append({
            'macro': f1_score(y_true=y_test, y_pred=y_pred, average='macro'),
            'micro': f1_score(y_true=y_test, y_pred=y_pred, average='micro'),
            'config': str(clf),
            'best_clf': str(clf.weaker),
        })

        print('Macro: {}'.format(f1_score(y_true=y_test, y_pred=y_pred, average='macro')))
        print('Micro: {}'.format(f1_score(y_true=y_test, y_pred=y_pred, average='micro')))
        times.append(time_dic)
        fold = fold + 1

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    result_dataframe = pd.DataFrame(data=result)
    print(result_dataframe.head(10))
    result_dataframe.to_csv(output_path + '/result.csv', index=False)

    times_dataframe = pd.DataFrame(data=times)
    print(times_dataframe.head(10))
    times_dataframe.to_csv(output_path + '/times.csv', index=False)
Example #8
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
# MetaLazyClassifier import is assumed here; adjust the module path to your installation
from metalazy.classifiers.metalazy import MetaLazyClassifier

import warnings
warnings.filterwarnings("ignore", message="Numerical issues were encountered ")

iris = datasets.load_iris()
X = iris.data
y = iris.target

# divide into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

clf = MetaLazyClassifier(n_neighbors=1)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

knn = KNeighborsClassifier(n_neighbors=25)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print(classification_report(y_test, y_pred))
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    parser.add_argument('-o', help='path to the output directory')
    parser.add_argument('-j', help='number of jobs to run in parallel. Use -1 for all. Default: -1')
    parser.add_argument('-g', help='Size of the sample for the hyperparameter search. Default: 5000')
    parser.add_argument('-d',
                        help='Use the default parameters for this dataset. Omit to run the grid search instead')
    parser.add_argument('-t', help='Limit test size: for each fold, only use this number of instances')

    args = parser.parse_args()

    output_path = args.o
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    path = args.p

    n_jobs = -1
    if args.j:
        n_jobs = int(args.j)

    grid_size = 5000
    if args.g:
        grid_size = int(args.g)

    dataset = None
    if args.d:
        dataset = args.d

    test_size_limit = None
    if args.t:
        test_size_limit = int(args.t)


    dataset_reader = DatasetReader(path)

    fold = 0
    result = []
    times = []

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        if test_size_limit:
            X_test = X_test[0:test_size_limit]
            y_test = y_test[0:test_size_limit]

        if dataset is None:
            # Create the classifier
            estimator = MetaLazyClassifier(select_features=False,
                                           n_jobs=n_jobs,
                                           grid_size=grid_size)

            tuned_parameters = choose_tunning_parameters(specific=1, weight=1, coccurrence=1)

            print(tuned_parameters)

            # first we find the best configuration in general
            print('GRID SEARCH FOR FOLD {}'.format(fold))
            start_grid = time.time()
            grid = GridSearchCV(estimator, tuned_parameters, cv=3, scoring='f1_macro', n_jobs=1)
            grid.fit(X_train, y_train)
            end = time.time()
            time_dic['grid'] = (end - start_grid)
            print('GENERAL - Total grid time: {}'.format((end - start_grid)))
            print('GENERAL - Best score was {} with \n {}'.format(grid.best_score_, grid.best_estimator_))

            estimator = grid.best_estimator_
            print('GENERAL - Best param was {}\n'.format(grid.best_params_))
        else:
            print('Using default dataset parameters')
            estimator = get_best_version_for_each_dataset(dataset=dataset, n_jobs=n_jobs, grid_size=grid_size)
        print(estimator)

        estimator.log_time_file = output_path + '/log_times_{}.json'.format(fold)

        # Fit the train data
        fit(estimator, X_train, y_train, time_dic)

        # Predict
        y_pred = predict(estimator, X_test, time_dic)

        print('\nWeaker Classifier used:')
        print(str(estimator.weaker))
        # Save the result
        result.append({
            'macro': f1_score(y_true=y_test, y_pred=y_pred, average='macro'),
            'micro': f1_score(y_true=y_test, y_pred=y_pred, average='micro'),
            'config': str(estimator),
            'best_clf': str(estimator.weaker),
            'fold': str(fold),
        })

        print('Macro: {}'.format(f1_score(y_true=y_test, y_pred=y_pred, average='macro')))
        print('Micro: {}'.format(f1_score(y_true=y_test, y_pred=y_pred, average='micro')))
        times.append(time_dic)
        fold = fold + 1

        result_dataframe = pd.DataFrame(data=result)
        print(result_dataframe.head(10))
        result_dataframe.to_csv(output_path + '/result_tunning_time.csv', index=False)

        times_dataframe = pd.DataFrame(data=times)
        print(times_dataframe.head(10))
        times_dataframe.to_csv(output_path + '/times.csv', index=False)

        estimator.flush_log_time_file()

        # FIXME: uncomment to run a single fold only
        # break

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    result_dataframe = pd.DataFrame(data=result)
    print(result_dataframe.head(10))
    result_dataframe.to_csv(output_path + '/result_tunning_time.csv', index=False)

    times_dataframe = pd.DataFrame(data=times)
    print(times_dataframe.head(10))
    times_dataframe.to_csv(output_path + '/times.csv', index=False)
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    parser.add_argument('-o', help='path to the output directory')
    parser.add_argument(
        '-j',
        help='number of jobs to run in parallel. Use -1 for all. Default: -1')
    parser.add_argument(
        '-g',
        help='Size of the sample for the hyperparameter search. Default: 5000')

    args = parser.parse_args()

    output_path = args.o
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    path = args.p

    n_jobs = -1
    if args.j:
        n_jobs = int(args.j)

    grid_size = 5000
    if args.g:
        grid_size = int(args.g)

    dataset_reader = DatasetReader(path)

    fold = 0
    times = []

    specific_classifier = ['nb', 'logistic', 'extrarf']
    configurations = {'weight': [0, 1], 'cooccurrence': [0, 1]}

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        result_df = pd.DataFrame()

        # for each fold we vary the specific classifier
        for specific in specific_classifier:

            print('Running for specific {}'.format(specific))

            # all-off baseline: co-occurrence features and instance weighting disabled
            estimator = MetaLazyClassifier(specific_classifier=specific,
                                           select_features=False,
                                           n_jobs=n_jobs,
                                           number_of_cooccurrences=0,
                                           weight_function='none',
                                           grid_size=grid_size)

            print(estimator)

            # Fit the train data
            fit(estimator, X_train, y_train, time_dic)

            # Predict
            y_pred = predict(estimator, X_test, time_dic)

            # Save the result
            result_df[specific] = y_pred

        result_df['y_test'] = y_test

        times.append(time_dic)

        print(result_df.head(10))
        # write before incrementing so the file name matches the fold that was run
        result_df.to_csv(output_path +
                         '/result_oracle_off_fold_{}.csv'.format(fold),
                         index=False)
        fold = fold + 1

        times_dataframe = pd.DataFrame(data=times)
        print(times_dataframe.head(10))
        times_dataframe.to_csv(output_path + '/times.csv', index=False)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    times_dataframe = pd.DataFrame(data=times)
    print(times_dataframe.head(10))
    times_dataframe.to_csv(output_path + '/times.csv', index=False)