def fit(self, X, y=None):
    """Load the model file matching ``self.language`` into ``self.ft_model``.

    ``X`` and ``y`` are accepted but not used by this method; it only
    resolves the model file name for the configured language and loads it
    with ``load_model`` from the ``fastText`` sub-directory of
    ``get_data_path()``.

    Parameters
    ----------
    X : ignored
    y : ignored, optional

    Returns
    -------
    self

    Raises
    ------
    AttributeError
        If ``self.language`` has no known model file.
    """
    path_dict = {
        'english': 'crawl-300d-2M-subword.bin',
        'french': 'cc.fr.300.bin',
        'hungarian': 'cc.hu.300.bin',
    }
    # Membership is tested on the dict itself; ``.keys()`` adds nothing.
    if self.language not in path_dict:
        # AttributeError is kept (rather than ValueError) because callers
        # may already be catching it.
        raise AttributeError(
            'language %s has not been downloaded yet' % self.language)
    self.ft_model = load_model(
        os.path.join(get_data_path(), 'fastText', path_dict[self.language]))
    return self
def fit_predict_categorical_encoding(
    datasets,
    str_preprocess,
    encoders,
    classifiers,
    reduction_methods,
    n_components,
    test_size,
    n_splits,
    n_jobs,
    results_path,
    model_path=None,
    custom_cv=None,
):
    """
    Learning with dirty categorical variables.

    For every (dataset, encoder, reduction method, classifier) combination,
    the cross-validation folds are evaluated with ``fit_predict_fold``
    through ``Parallel`` and one JSON file describing the run is written
    into ``results_path``. Combinations for which ``verify_if_exists``
    reports an existing result are skipped.

    Parameters
    ----------
    datasets : iterable
        Dataset names, given to ``Data`` and ``choose_nrows``.
    str_preprocess
        Forwarded to ``data.preprocess`` and stored in the result metadata.
    encoders : iterable of (str or list)
        A ``str`` is used directly for the "Special" column. For a ``list``,
        the first entry is used for the "Special" column and each further
        entry is assigned as the action of a copy of that column named
        ``"<special_col>_<i>"``.
    classifiers : sequence
        Classifier identifiers given to ``instanciate_estimators``; indexed
        in parallel with the estimators it returns.
    reduction_methods : iterable
        Each value is forwarded to ``get_column_action`` and
        ``fit_predict_fold``.
    n_components
        Forwarded together with the reduction method.
    test_size, n_splits
        Forwarded to ``select_cross_val`` and stored in the result metadata.
    n_jobs
        Passed to ``Parallel``.
    results_path : str
        Joined onto ``get_data_path()``; the directory is created if missing.
    model_path : str, optional
        When given, joined onto ``get_data_path()`` and forwarded to
        ``instanciate_estimators``; ``None`` is forwarded unchanged.
    custom_cv
        Forwarded to ``select_cross_val``.

    Raises
    ------
    IndexError
        If a dataset has no column whose action is ``"Special"``.
    TypeError
        If an encoder is neither a ``str`` nor a ``list``.

    Notes
    -----
    Reads the module-level names ``clf_seed``, ``sample_seed`` and
    ``shuffle_seed``.
    """
    path = get_data_path()
    results_path = os.path.join(path, results_path)
    # ``model_path`` is optional: os.path.join() raises TypeError on None,
    # so it is only resolved against the data directory when provided.
    if model_path is not None:
        model_path = os.path.join(path, model_path)
    if not os.path.exists(results_path):
        os.makedirs(results_path)

    for dataset in datasets:
        n_rows = choose_nrows(dataset_name=dataset)
        for encoder in encoders:
            print("Dataset: %s" % dataset)
            # Data is reloaded for every encoder because the list-encoder
            # branch below mutates ``data.df``, ``col_action`` and ``xcols``.
            data = Data(dataset).get_df()
            data.preprocess(n_rows=n_rows, str_preprocess=str_preprocess)
            special_col = [
                col for col in data.col_action
                if data.col_action[col] == "Special"
            ][0]
            if isinstance(encoder, list):
                for i, enc in enumerate(encoder):
                    print(enc)
                    if i == 0:
                        # First encoder keeps working on the original column.
                        data.col_action[special_col] = "Special"
                    else:
                        # Every additional encoder gets its own copy of the
                        # special column, registered as a new input column.
                        new_col = "%s_%d" % (special_col, i)
                        data.df[new_col] = data.df[special_col].copy()
                        data.col_action[new_col] = enc
                        data.xcols.append(new_col)

            for reduction_method in reduction_methods:
                print("Data shape: %d, %d" % data.df.shape)
                cv = select_cross_val(
                    data.clf_type,
                    n_splits,
                    test_size,
                    custom_cv=custom_cv,
                    col_name=special_col,
                )
                scaler = select_scaler()

                # Define classifiers
                clfs = instanciate_estimators(
                    data.clf_type,
                    classifiers,
                    clf_seed,
                    y=data.df.loc[:, data.ycol].values,
                    model_path=model_path,
                )

                for i, clf in enumerate(clfs):
                    print(
                        "{}: {} \n{}: {} \n{}: {} \n{}: {} \n{}: {},{}".format(
                            "Prediction column",
                            data.ycol,
                            "Task type",
                            str(data.clf_type),
                            "Classifier",
                            clf,
                            "Encoder",
                            encoder,
                            "Dimension reduction",
                            reduction_method,
                            n_components,
                        ))

                    # Objects exposing ``.estimator`` (e.g. the GridSearchCV
                    # case patched further down) are described by the
                    # estimator they wrap; anything else by itself.
                    try:
                        described = clf.estimator
                        clf_name = described.__class__.__name__
                        clf_params = described.get_params()
                    except AttributeError:
                        clf_name = clf.__class__.__name__
                        clf_params = clf.get_params()

                    results_dict = {
                        "dataset": data.name,
                        "n_splits": n_splits,
                        "test_size": test_size,
                        "n_rows": n_rows,
                        "encoder": encoder,
                        "str_preprocess": str_preprocess,
                        "clf": [classifiers[i], clf_name, clf_params],
                        "ShuffleSplit": [cv.__class__.__name__],
                        "scaler": [
                            scaler.__class__.__name__,
                            scaler.get_params(),
                        ],
                        "sample_seed": sample_seed,
                        "shuffleseed": shuffle_seed,
                        "col_action": data.col_action,
                        "clf_type": data.clf_type,
                        "dimension_reduction": [reduction_method, n_components],
                    }

                    if verify_if_exists(results_path, results_dict):
                        print("Prediction already exists.\n")
                        continue

                    start = time.time()

                    # Only the first encoder of a list drives the column
                    # actions; the others were attached to copied columns.
                    if isinstance(encoder, str):
                        main_encoder = encoder
                    elif isinstance(encoder, list):
                        main_encoder = encoder[0]
                    else:
                        # Previously this fell through and used an unbound
                        # (or stale, from an earlier iteration) value.
                        raise TypeError(
                            "encoder must be a str or a list, got %s"
                            % type(encoder).__name__)
                    column_action = get_column_action(
                        data.col_action,
                        data.xcols,
                        main_encoder,
                        reduction_method,
                        n_components,
                        data.clf_type,
                    )

                    pred = Parallel(n_jobs=n_jobs)(
                        delayed(fit_predict_fold)(
                            data,
                            scaler,
                            column_action,
                            clf,
                            encoder,
                            reduction_method,
                            n_components,
                            fold,
                            cv.n_splits,
                            train_index,
                            test_index,
                        )
                        for fold, (train_index, test_index) in enumerate(
                            cv.split(data.df, data.df[data.ycol].values))
                    )
                    # One row per fold; columns are unpacked by position.
                    pred = np.array(pred)
                    results_dict["results"] = {
                        "fold": list(pred[:, 0]),
                        "n_train_samples": list(pred[:, 1]),
                        "n_train_features": list(pred[:, 2]),
                        "score": list(pred[:, 3]),
                        "encoding_time": list(pred[:, 4]),
                        "training_time": list(pred[:, 5]),
                    }

                    # Saving results
                    pc_name = socket.gethostname()
                    # Timestamp reduced to its digits to keep file names
                    # free of separators.
                    now = "".join(
                        c for c in str(datetime.datetime.now()) if c.isdigit())
                    filename = "%s_%s_%s_%s_%s.json" % (
                        pc_name,
                        data.name,
                        classifiers[i],
                        encoder,
                        now,
                    )
                    results_file = os.path.join(results_path, filename)
                    results_dict = array2list(results_dict)

                    # patch for nystrom + ridge
                    if (clf.__class__.__name__ == "GridSearchCV"
                            and clf.estimator.__class__.__name__ == "Pipeline"):
                        results_dict["clf"] = method2str(results_dict["clf"])

                    write_json(results_dict, results_file)
                    print("prediction time: %.1f s." % (time.time() - start))
                    print("Saving results to: %s\n" % results_file)
"open_payments", "traffic_violations", "federal_election", "public_procurement", "building_permits", "road_safety", "met_objects", "drug_directory", "wine_reviews", ] n_jobs = 20 n_splits = 20 test_size = 1.0 / 3 str_preprocess = True n_components = 100 results_path = os.path.join(get_data_path(), "results", "jmlr2019_2") # results_path = os.path.join(get_data_folder(), 'results', # 'kdd_2019_only_cats') classifiers = [ # 'NystroemRidgeCV', # 'L2RegularizedLinearModel', # 'EigenProGaussian160', # 'EigenProPolynomial', # 'XGB', # 'LGBM', # 'KNN', "MLPGridSearchCV" ] ############################################################################### # Probabilistic topic models without dimensionality reduction #################
def fit_predict_categorical_encoding(datasets, str_preprocess, encoders,
                                     classifiers, reduction_methods,
                                     n_components, test_size, n_splits,
                                     n_jobs, results_path, model_path=None,
                                     custom_cv=None):
    '''
    Learning with dirty categorical variables.

    Loops over every dataset, encoder, reduction method and classifier,
    evaluates the cross-validation folds with ``fit_predict_fold`` through
    ``Parallel`` and writes one JSON file per combination into
    ``results_path``. A combination is skipped when ``verify_if_exists``
    reports that its result is already stored.

    Parameters
    ----------
    datasets : iterable
        Dataset names, given to ``Data`` and ``choose_nrows``.
    str_preprocess
        Forwarded to ``data.preprocess`` and stored in the result metadata.
    encoders : iterable of (str or list)
        A ``str`` is applied to the 'Special' column. For a ``list``, the
        first entry is applied to the 'Special' column and each further
        entry becomes the action of a copy of that column named
        ``'<special_col>_<i>'``.
    classifiers : sequence
        Classifier identifiers given to ``instanciate_estimators``; indexed
        in parallel with the estimators it returns.
    reduction_methods : iterable
        Each value is forwarded to ``get_column_action`` and
        ``fit_predict_fold``.
    n_components
        Forwarded together with the reduction method.
    test_size, n_splits
        Forwarded to ``select_cross_val`` and stored in the result metadata.
    n_jobs
        Passed to ``Parallel``.
    results_path : str
        Joined onto ``get_data_path()``; the directory is created if missing.
    model_path : str, optional
        When given, joined onto ``get_data_path()`` and forwarded to
        ``instanciate_estimators``; ``None`` is forwarded unchanged.
    custom_cv
        Forwarded to ``select_cross_val``.

    Raises
    ------
    IndexError
        If a dataset has no column whose action is ``'Special'``.
    TypeError
        If an encoder is neither a ``str`` nor a ``list``.

    Notes
    -----
    Reads the module-level names ``clf_seed``, ``sample_seed`` and
    ``shuffle_seed``.
    '''
    path = get_data_path()
    results_path = os.path.join(path, results_path)
    # The default ``model_path=None`` must not reach os.path.join(), which
    # raises TypeError on None; resolve the path only when one was given.
    if model_path is not None:
        model_path = os.path.join(path, model_path)
    if not os.path.exists(results_path):
        os.makedirs(results_path)

    for dataset in datasets:
        n_rows = choose_nrows(dataset_name=dataset)
        for encoder in encoders:
            print('Dataset: %s' % dataset)
            # A fresh Data object per encoder: the list-encoder branch
            # below adds columns to ``data.df`` and ``data.xcols``.
            data = Data(dataset).get_df()
            data.preprocess(n_rows=n_rows, str_preprocess=str_preprocess)
            special_col = [
                col for col in data.col_action
                if data.col_action[col] == 'Special'
            ][0]
            if isinstance(encoder, list):
                for i, enc in enumerate(encoder):
                    print(enc)
                    if i == 0:
                        # The first encoder works on the original column.
                        data.col_action[special_col] = 'Special'
                    else:
                        # Additional encoders each get a copy of the
                        # special column, added as a new input column.
                        new_col = '%s_%d' % (special_col, i)
                        data.df[new_col] = data.df[special_col].copy()
                        data.col_action[new_col] = enc
                        data.xcols.append(new_col)

            for reduction_method in reduction_methods:
                print('Data shape: %d, %d' % data.df.shape)
                cv = select_cross_val(data.clf_type, n_splits, test_size,
                                      custom_cv=custom_cv,
                                      col_name=special_col)
                scaler = select_scaler()

                # Define classifiers
                clfs = instanciate_estimators(
                    data.clf_type, classifiers, clf_seed,
                    y=data.df.loc[:, data.ycol].values,
                    model_path=model_path)

                for i, clf in enumerate(clfs):
                    print(
                        '{}: {} \n{}: {} \n{}: {} \n{}: {} \n{}: {},{}'.format(
                            'Prediction column', data.ycol,
                            'Task type', str(data.clf_type),
                            'Classifier', clf,
                            'Encoder', encoder,
                            'Dimension reduction', reduction_method,
                            n_components))

                    # An object exposing ``.estimator`` (such as the
                    # GridSearchCV case patched below) is recorded through
                    # the estimator it wraps; anything else through itself.
                    try:
                        inner = clf.estimator
                        clf_name = inner.__class__.__name__
                        clf_params = inner.get_params()
                    except AttributeError:
                        clf_name = clf.__class__.__name__
                        clf_params = clf.get_params()

                    results_dict = {
                        'dataset': data.name,
                        'n_splits': n_splits,
                        'test_size': test_size,
                        'n_rows': n_rows,
                        'encoder': encoder,
                        'str_preprocess': str_preprocess,
                        'clf': [classifiers[i], clf_name, clf_params],
                        'ShuffleSplit': [cv.__class__.__name__],
                        'scaler': [scaler.__class__.__name__,
                                   scaler.get_params()],
                        'sample_seed': sample_seed,
                        'shuffleseed': shuffle_seed,
                        'col_action': data.col_action,
                        'clf_type': data.clf_type,
                        'dimension_reduction': [reduction_method,
                                                n_components],
                    }

                    if verify_if_exists(results_path, results_dict):
                        print('Prediction already exists.\n')
                        continue

                    start = time.time()

                    # For a list of encoders only the first one selects the
                    # column actions; the rest live on the copied columns.
                    if isinstance(encoder, str):
                        main_encoder = encoder
                    elif isinstance(encoder, list):
                        main_encoder = encoder[0]
                    else:
                        # Without this guard the code below would use an
                        # unbound or stale ``column_action``.
                        raise TypeError(
                            'encoder must be a str or a list, got %s'
                            % type(encoder).__name__)
                    column_action = get_column_action(
                        data.col_action, data.xcols, main_encoder,
                        reduction_method, n_components, data.clf_type)

                    pred = Parallel(n_jobs=n_jobs)(
                        delayed(fit_predict_fold)(
                            data, scaler, column_action, clf, encoder,
                            reduction_method, n_components, fold,
                            cv.n_splits, train_index, test_index)
                        for fold, (train_index, test_index) in enumerate(
                            cv.split(data.df, data.df[data.ycol].values)))
                    # One row per fold; columns are read by position.
                    pred = np.array(pred)
                    results_dict['results'] = {
                        'fold': list(pred[:, 0]),
                        'n_train_samples': list(pred[:, 1]),
                        'n_train_features': list(pred[:, 2]),
                        'score': list(pred[:, 3]),
                        'encoding_time': list(pred[:, 4]),
                        'training_time': list(pred[:, 5]),
                    }

                    # Saving results
                    pc_name = socket.gethostname()
                    # Keep only the digits of the timestamp for the name.
                    now = ''.join(
                        c for c in str(datetime.datetime.now())
                        if c.isdigit())
                    filename = (
                        '%s_%s_%s_%s_%s.json' %
                        (pc_name, data.name, classifiers[i], encoder, now))
                    results_file = os.path.join(results_path, filename)
                    results_dict = array2list(results_dict)

                    # patch for nystrom + ridge
                    if (clf.__class__.__name__ == 'GridSearchCV' and
                            clf.estimator.__class__.__name__ == 'Pipeline'):
                        results_dict['clf'] = method2str(
                            results_dict['clf'])

                    write_json(results_dict, results_file)
                    print('prediction time: %.1f s.' % (time.time() - start))
                    print('Saving results to: %s\n' % results_file)
'open_payments', 'traffic_violations', 'federal_election', 'public_procurement', 'building_permits', 'road_safety', 'met_objects', 'drug_directory', 'wine_reviews', ] n_jobs = 20 n_splits = 20 test_size = 1./3 str_preprocess = True n_components = 100 results_path = os.path.join(get_data_path(), 'results', 'jmlr2019_2') # results_path = os.path.join(get_data_folder(), 'results', # 'kdd_2019_only_cats') classifiers = [ # 'NystroemRidgeCV', # 'L2RegularizedLinearModel', # 'EigenProGaussian160', # 'EigenProPolynomial', # 'XGB', # 'LGBM', # 'KNN', 'MLPGridSearchCV', ] ############################################################################### # Probabilistic topic models without dimensionality reduction #################