def fit(self, X, y=None):
    """Load the pretrained fastText model for the configured language."""
    path_dict = dict(english='crawl-300d-2M-subword.bin',
                     french='cc.fr.300.bin',
                     hungarian='cc.hu.300.bin')

    if self.language not in path_dict:
        raise AttributeError('language %s has not been downloaded yet' %
                             self.language)

    # X and y are ignored: fitting only loads the .bin model from disk.
    self.ft_model = load_model(
        os.path.join(get_data_path(), 'fastText',
                     path_dict[self.language]))
    return self
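
For context, a minimal usage sketch of the method above, assuming it belongs to a scikit-learn-style transformer (hypothetically named PretrainedFastText here; the class definition is not part of this excerpt):

# Hypothetical class name; only its fit method is shown above.
emb = PretrainedFastText(language='french')
emb.fit(X=None)  # X and y are unused: fit only loads cc.fr.300.bin
vec = emb.ft_model.get_word_vector('bonjour')  # 300-d fastText vector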
Example #2
# Imports needed by this example. The project-local helpers (get_data_path,
# Data, choose_nrows, select_cross_val, select_scaler, instanciate_estimators,
# get_column_action, fit_predict_fold, verify_if_exists, array2list,
# method2str, write_json) and the globals clf_seed, sample_seed and
# shuffle_seed come from the surrounding benchmark code.
import datetime
import os
import socket
import time

import numpy as np
from joblib import Parallel, delayed


def fit_predict_categorical_encoding(
    datasets,
    str_preprocess,
    encoders,
    classifiers,
    reduction_methods,
    n_components,
    test_size,
    n_splits,
    n_jobs,
    results_path,
    model_path=None,
    custom_cv=None,
):
    """
    Learning with dirty categorical variables.
    """
    path = get_data_path()
    results_path = os.path.join(path, results_path)
    if model_path is not None:  # guard: os.path.join(path, None) raises
        model_path = os.path.join(path, model_path)
    if not os.path.exists(results_path):
        os.makedirs(results_path)
    for dataset in datasets:
        n_rows = choose_nrows(dataset_name=dataset)
        for encoder in encoders:
            print("Dataset: %s" % dataset)
            data = Data(dataset).get_df()
            data.preprocess(n_rows=n_rows, str_preprocess=str_preprocess)
            special_col = [
                col for col in data.col_action
                if data.col_action[col] == "Special"
            ][0]
            if isinstance(encoder, list):
                # One extra copy of the 'Special' column per additional
                # encoder in the list, each tagged with its own encoder.
                for i, enc in enumerate(encoder):
                    print(enc)
                    if i == 0:
                        data.col_action[special_col] = "Special"
                    else:
                        new_col = "%s_%d" % (special_col, i)
                        data.df[new_col] = data.df[special_col].copy()
                        data.col_action[new_col] = enc
                        data.xcols.append(new_col)
            for reduction_method in reduction_methods:
                print("Data shape: %d, %d" % data.df.shape)
                cv = select_cross_val(data.clf_type,
                                      n_splits,
                                      test_size,
                                      custom_cv=custom_cv,
                                      col_name=special_col)
                scaler = select_scaler()

                # Define classifiers
                clfs = instanciate_estimators(
                    data.clf_type,
                    classifiers,
                    clf_seed,
                    y=data.df.loc[:, data.ycol].values,
                    model_path=model_path,
                )

                for i, clf in enumerate(clfs):
                    print("Prediction column: {}\n"
                          "Task type: {}\n"
                          "Classifier: {}\n"
                          "Encoder: {}\n"
                          "Dimension reduction: {},{}".format(
                              data.ycol, data.clf_type, clf, encoder,
                              reduction_method, n_components))

                    # clf may be a grid-search wrapper (with .estimator) or
                    # a bare estimator; record the underlying name/params.
                    try:
                        clf_name = clf.estimator.__class__.__name__
                        clf_params = clf.estimator.get_params()
                    except AttributeError:
                        clf_name = clf.__class__.__name__
                        clf_params = clf.get_params()
                    results_dict = {
                        "dataset": data.name,
                        "n_splits": n_splits,
                        "test_size": test_size,
                        "n_rows": n_rows,
                        "encoder": encoder,
                        "str_preprocess": str_preprocess,
                        "clf": [classifiers[i], clf_name, clf_params],
                        "ShuffleSplit": [cv.__class__.__name__],
                        "scaler": [scaler.__class__.__name__,
                                   scaler.get_params()],
                        "sample_seed": sample_seed,
                        "shuffleseed": shuffle_seed,
                        "col_action": data.col_action,
                        "clf_type": data.clf_type,
                        "dimension_reduction": [reduction_method,
                                                n_components],
                    }

                    if verify_if_exists(results_path, results_dict):
                        print("Prediction already exists.\n")
                        continue

                    start = time.time()
                    if isinstance(encoder, str):
                        column_action = get_column_action(
                            data.col_action,
                            data.xcols,
                            encoder,
                            reduction_method,
                            n_components,
                            data.clf_type,
                        )
                    elif isinstance(encoder, list):
                        column_action = get_column_action(
                            data.col_action,
                            data.xcols,
                            encoder[0],
                            reduction_method,
                            n_components,
                            data.clf_type,
                        )
                    pred = Parallel(n_jobs=n_jobs)(delayed(fit_predict_fold)(
                        data,
                        scaler,
                        column_action,
                        clf,
                        encoder,
                        reduction_method,
                        n_components,
                        fold,
                        cv.n_splits,
                        train_index,
                        test_index,
                    ) for fold, (train_index, test_index) in enumerate(
                        cv.split(data.df, data.df[data.ycol].values)))
                    pred = np.array(pred)
                    results = {
                        "fold": list(pred[:, 0]),
                        "n_train_samples": list(pred[:, 1]),
                        "n_train_features": list(pred[:, 2]),
                        "score": list(pred[:, 3]),
                        "encoding_time": list(pred[:, 4]),
                        "training_time": list(pred[:, 5]),
                    }
                    results_dict["results"] = results

                    # Saving results
                    pc_name = socket.gethostname()
                    now = "".join([
                        c for c in str(datetime.datetime.now()) if c.isdigit()
                    ])
                    filename = "%s_%s_%s_%s_%s.json" % (
                        pc_name,
                        data.name,
                        classifiers[i],
                        encoder,
                        now,
                    )
                    results_file = os.path.join(results_path, filename)
                    results_dict = array2list(results_dict)

                    # Patch for Nystroem + ridge: a GridSearchCV over a
                    # Pipeline holds objects that JSON cannot serialize,
                    # so stringify the "clf" entry.
                    if clf.__class__.__name__ == "GridSearchCV":
                        if clf.estimator.__class__.__name__ == "Pipeline":
                            results_dict["clf"] = method2str(
                                results_dict["clf"])

                    write_json(results_dict, results_file)
                    print("Prediction time: %.1f s." % (time.time() - start))
                    print("Saving results to: %s\n" % results_file)
Example #3
File: main.py  Project: theishantha/dsa2
    "open_payments",
    "traffic_violations",
    "federal_election",
    "public_procurement",
    "building_permits",
    "road_safety",
    "met_objects",
    "drug_directory",
    "wine_reviews",
]
n_jobs = 20
n_splits = 20
test_size = 1.0 / 3
str_preprocess = True
n_components = 100
results_path = os.path.join(get_data_path(), "results", "jmlr2019_2")
# results_path = os.path.join(get_data_folder(), 'results',
#                             'kdd_2019_only_cats')
classifiers = [
    # 'NystroemRidgeCV',
    # 'L2RegularizedLinearModel',
    # 'EigenProGaussian160',
    # 'EigenProPolynomial',
    # 'XGB',
    # 'LGBM',
    # 'KNN',
    "MLPGridSearchCV",
]
###############################################################################

# Probabilistic topic models without dimensionality reduction #################