def test_nonexistent_columns_explicit_fail(simple_dataframe):
    """
    If a nonexistent column is selected, KeyError is raised.
    """
    mapper = DataFrameMapper(None)
    with pytest.raises(KeyError):
        mapper._get_col_subset(simple_dataframe, ["nonexistent_feature"])
def scale_vars(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
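# A minimal usage sketch for scale_vars above, assuming the imports shown here
# and a hypothetical toy frame: fit the mapper on the training frame, then pass
# it back in so validation data is scaled with the training statistics.
import warnings

import pandas as pd
import sklearn.exceptions
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

train = pd.DataFrame({'x': [1.0, 2.0, 3.0], 'y': ['a', 'b', 'c']})
valid = pd.DataFrame({'x': [2.0, 4.0], 'y': ['a', 'c']})
mapper = scale_vars(train, None)  # fits on train and scales it in place
scale_vars(valid, mapper)         # reuses the fitted mapper on validation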
def preprocess_train(train):
    train_y = train['count']
    train_y1 = train['casual']
    train_y2 = train['registered']
    preprocess_data(train)
    mapper = DataFrameMapper([
        ('hour', None),
        ('season', preprocessing.LabelBinarizer()),
        ('holiday', None),
        ('workingday', None),
        ('weather', preprocessing.LabelBinarizer()),
        ('temp', None),
        ('atemp', None),
        ('humidity', None),
        ('windspeed', None),
        ('weekday', None),
        ('is_sunday', None),
        ('bad_weather', None),
        ('year', None),
    ])
    train_X = mapper.fit_transform(train)
    return train_X, train_y, train_y1, train_y2, mapper
def compute_cross_correlation_score(df, clfs, preprocess_scaling=True, nFold=10):
    """
    Run stratified n-fold cross-validation and return the scores and
    classification results for every classifier and fold.
    :param df:
    :param clfs:
    :param preprocess_scaling:
    :param nFold:
    :return:
    """
    to_sklearn_features = DataFrameMapper([('features', sklearn.feature_extraction.DictVectorizer())])
    data_X = to_sklearn_features.fit_transform(df)
    data_Y = df.expected_class
    skf = cross_validation.StratifiedKFold(data_Y, n_folds=nFold)
    classification_results = []
    scores = []
    for num, (train_index, test_index) in enumerate(skf):
        X_train, X_test = data_X[train_index], data_X[test_index]
        Y_train, Y_test = data_Y[train_index], data_Y[test_index]
        print("Len train {}, len test {}".format(Y_train.size, Y_test.size))
        cross_valid_data = Cross_validation_split(X_train, X_test, Y_train, Y_test)
        cross_valid_data = preprocess(cross_valid_data,
                                      preprocess_scaling=preprocess_scaling,
                                      preprocess_correlation=False)
        for clf in clfs:
            score, classification = generate_score(clf, cross_valid_data, fold=num)
            scores.append(score)
            classification_results.append(classification)
    return scores, classification_results
def test_list_transformers_single_arg(simple_dataframe):
    """
    Multiple transformers can be specified in a list even if some of them
    only accept one X argument instead of two (X, y).
    """
    mapper = DataFrameMapper([("a", [MockXTransformer()])])
    # doesn't fail
    mapper.fit_transform(simple_dataframe)
def test_simple_df(simple_dataframe):
    """
    Get a dataframe from a simple mapped dataframe
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', None)], df_out=True)
    transformed = mapper.fit_transform(df)
    assert type(transformed) == pd.DataFrame
    assert len(transformed["a"]) == len(simple_dataframe["a"])
def test_transformed_names_complex_alias(complex_dataframe):
    """
    If we specify an alias for a multiple-output column, it is used for the
    output
    """
    df = complex_dataframe
    mapper = DataFrameMapper([('target', LabelBinarizer(), {'alias': 'new'})])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['new_a', 'new_b', 'new_c']
def test_get_col_subset_single_column_array(simple_dataframe):
    """
    Selecting a single column should return a 1-dimensional numpy array.
    """
    mapper = DataFrameMapper(None)
    array = mapper._get_col_subset(simple_dataframe, "a")
    assert type(array) == np.ndarray
    assert array.shape == (len(simple_dataframe["a"]),)
def test_transformed_names_binarizer(complex_dataframe):
    """
    Get transformed names of features in the `transformed_names_` attribute
    for a transformation that multiplies the number of columns
    """
    df = complex_dataframe
    mapper = DataFrameMapper([('target', LabelBinarizer())])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['target_a', 'target_b', 'target_c']
def test_transformed_names_simple(simple_dataframe):
    """
    Get transformed names of features in the `transformed_names_` attribute
    for a simple transformation
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', None)])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['a']
def test_mapper(self):
    domain = CategoricalDomain()
    df = DataFrame([{"X": "2", "y": 2}, {"X": "1"}, {"X": "3"}])
    mapper = DataFrameMapper([
        ("X", [domain, LabelBinarizer()]),
        ("y", None)
    ])
    mapper.fit_transform(df)
    self.assertEqual(numpy.array(["1", "2", "3"]).tolist(), domain.data_.tolist())
def test_transformed_names_simple_alias(simple_dataframe):
    """
    If we specify an alias for a single output column, it is used for the
    output
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', None, {'alias': 'new_name'})])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['new_name']
def test_default_none_names():
    """
    If default=None, column names are returned unmodified.
    """
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
    mapper = DataFrameMapper([], default=None)
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['a', 'b']
class Transformer(object):
    """
    The purpose of this class is to take a dataframe and transform it into
    a numpy-array-compatible format.
    """

    def __init__(self, config):
        self.__config = config
        self.__mapper = None
        self.__label_encoder_adapter = TransformerAdapter(LabelEncoderMissingValuesTransformer())

    def prepare(self, dataframe):
        """
        Takes the already cleaned dataframe, splits it into train and test,
        and returns the train and test sets as numpy arrays. If the problem
        is supervised, the target column will be the last one of the
        returned arrays.
        """
        mapping = DataFrameMapCreator().get_mapping_from_config(self.__config)
        self.__mapper = DataFrameMapper(mapping)
        train, test = split_dataframe_train_test(
            dataframe, self.__config.get_option_parameter("split", "train_percentage"))
        return self.__get_correct_return_parameters(train, test)

    def __get_correct_return_parameters(self, train, test):
        model = self.__config.get_data_model()
        train_transformed = self.__mapper.fit_transform(train)
        test_transformed = self.__mapper.transform(test)
        if model.has_target():
            return self.__add_target_data(train_transformed, train), \
                   self.__add_target_data(test_transformed, test)
        else:
            return train_transformed, test_transformed

    def __add_target_data(self, transformed_data, original_data):
        """
        Picks up the target data from the original_data and appends it as a
        column to the transformed_data. Both arguments are expected to be
        np.arrays.
        """
        model = self.__config.get_data_model()
        target_feature = model.find_target_feature()
        name = target_feature.get_name()
        if target_feature.is_categorical():
            target_row = original_data[name]
            target = self.__label_encoder_adapter.transform(target_row)
        else:
            target = original_data[name].values.astype(type_name_to_data_type("float"))
        target = target[..., None]
        return np.hstack((transformed_data, target))

    def apply(self, dataframe):
        return self.__mapper.transform(dataframe)
def test_fit_with_optional_y_arg(complex_dataframe):
    """
    Transformers with an optional y argument in the fit method are handled
    correctly
    """
    df = complex_dataframe
    mapper = DataFrameMapper([(['feat1', 'feat2'], MockTClassifier())])
    # doesn't fail
    mapper.fit(df[['feat1', 'feat2']], df['target'])
def test_mapper(self):
    domain = ContinuousDomain()
    df = DataFrame([{"X1": 2.0, "X2": 2, "y": 2.0},
                    {"X1": 1.0, "X2": 0.5},
                    {"X1": 3.0, "X2": 3.5}])
    mapper = DataFrameMapper([
        (["X1", "X2"], [domain, StandardScaler()]),
        ("y", None)
    ])
    mapper.fit_transform(df)
    self.assertEqual(numpy.array([1.0, 0.5]).tolist(), domain.data_min_.tolist())
    self.assertEqual(numpy.array([3.0, 3.5]).tolist(), domain.data_max_.tolist())
def test_binarizer2_df():
    """
    Check level names from LabelBinarizer with just one output column
    """
    df = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'a']})
    mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
    transformed = mapper.fit_transform(df)
    cols = transformed.columns
    assert len(cols) == 1
    assert cols[0] == 'target'
def test_get_col_subset_single_column_list(simple_dataframe):
    """
    Selecting a list of columns (even if the list contains a single element)
    should return a 2-dimensional numpy array.
    """
    mapper = DataFrameMapper(None)
    array = mapper._get_col_subset(simple_dataframe, ["a"])
    assert type(array) == np.ndarray
    assert array.shape == (len(simple_dataframe["a"]), 1)
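# The string-vs-list selector distinction exercised by
# test_get_col_subset_single_column_array and the test above also determines
# what a transformer receives. A minimal sketch (the toy frame is
# hypothetical): a list selector yields the 2-D input StandardScaler expects.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

df = pd.DataFrame({'a': [1.0, 2.0, 3.0]})
mapper = DataFrameMapper([(['a'], StandardScaler())])  # list -> 2-D column
out = mapper.fit_transform(df)
assert out.shape == (3, 1)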
def test_default_transformer():
    """
    If default=Transformer, this transformer is applied to the columns that
    are not explicitly selected.
    """
    df = pd.DataFrame({'a': [1, np.nan, 3]})
    mapper = DataFrameMapper([], default=Imputer())
    transformed = mapper.fit_transform(df)
    assert (transformed[:, 0] == np.array([1., 2., 3.])).all()
def test_sparse_off(simple_dataframe):
    """
    If the resulting features are sparse but the "sparse" argument of the
    mapper is False, return a non-sparse matrix.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([("a", ToSparseTransformer())], sparse=False)
    dmatrix = mapper.fit_transform(df)
    assert type(dmatrix) != sparse.csr.csr_matrix
def test_list_transformers_old_unpickle(simple_dataframe):
    mapper = DataFrameMapper(None)
    # simulate a mapper that was created with < 1.0.0 code
    mapper.features = [("a", [MockXTransformer()])]
    mapper_pickled = pickle.dumps(mapper)
    loaded_mapper = pickle.loads(mapper_pickled)
    transformer = loaded_mapper.features[0][1]
    assert isinstance(transformer, TransformerPipeline)
    assert isinstance(transformer.steps[0][1], MockXTransformer)
def test_sparse_features(simple_dataframe):
    """
    If any of the extracted features is sparse and the "sparse" argument is
    True, the hstacked result is also sparse.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([("a", ToSparseTransformer())], sparse=True)
    dmatrix = mapper.fit_transform(df)
    assert type(dmatrix) == sparse.csr.csr_matrix
def test_multiindex_df(multiindex_dataframe_incomplete):
    """
    Get a dataframe from a multiindex dataframe with missing data
    """
    df = multiindex_dataframe_incomplete
    mapper = DataFrameMapper([([c], Imputer()) for c in df.columns], df_out=True)
    transformed = mapper.fit_transform(df)
    assert len(transformed) == len(multiindex_dataframe_incomplete)
    for c in df.columns:
        assert len(transformed[str(c)]) == len(df[c])
def test_transformed_names_transformers_list(complex_dataframe):
    """
    When using a list of transformers, use them in inverse order to get the
    transformed names
    """
    df = complex_dataframe
    mapper = DataFrameMapper([
        ('target', [LabelBinarizer(), MockXTransformer()])
    ])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['target_a', 'target_b', 'target_c']
def test_onehot_df():
    """
    Check level ids from one-hot encoding
    """
    df = pd.DataFrame({'target': [0, 0, 1, 1, 2, 3, 0]})
    mapper = DataFrameMapper([(['target'], OneHotEncoder())], df_out=True)
    transformed = mapper.fit_transform(df)
    cols = transformed.columns
    assert len(cols) == 4
    assert cols[0] == 'target_0'
    assert cols[3] == 'target_3'
def test_fit_transform(simple_dataframe):
    """
    Check that custom fit_transform methods of the transformers are invoked.
    """
    df = simple_dataframe
    mock_transformer = Mock()
    # returns something of measurable length but does nothing
    mock_transformer.fit_transform.return_value = np.array([1, 2, 3])
    mapper = DataFrameMapper([("a", mock_transformer)])
    mapper.fit_transform(df)
    assert mock_transformer.fit_transform.called
def test_default_false():
    """
    If default=False, non-explicitly-selected columns are discarded.
    """
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
    mapper = DataFrameMapper([
        ('b', None)
    ], default=False)
    transformed = mapper.fit_transform(df)
    assert transformed.shape == (3, 1)
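# Taken together, the default=None, default=Transformer, and default=False
# tests above pin down the mapper's fallthrough behaviour. A minimal summary
# sketch (the toy frame is hypothetical):
import pandas as pd
from sklearn_pandas import DataFrameMapper

df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
# default=False (the default): unselected columns are dropped.
assert DataFrameMapper([('a', None)], default=False).fit_transform(df).shape == (3, 1)
# default=None: unselected columns are passed through untransformed.
assert DataFrameMapper([('a', None)], default=None).fit_transform(df).shape == (3, 2)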
def test_unselected_columns():
    """
    _unselected_columns returns a list of the columns not appearing in the
    features of the mapper but present in the given dataframe.
    """
    df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
    mapper = DataFrameMapper([
        ('a', None),
        (['a', 'b'], None)
    ])
    assert 'c' in mapper._unselected_columns(df)
def scale_X(X, dataset):
    if dataset == 'noYelp':
        X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
    else:
        # use the sklearn-pandas DataFrameMapper to scale only the non-binary columns
        scaled_cols = ['yelp_rating', 'yelp_reviews', 'risk', 'insp_badge',
                       'crime_count', '311_count', 'construction_count',
                       'avg_high_temp', 'time_diff', 'prev_crit_viol']
        binary_cols = [
            'Burgers', 'Convenience Stores', 'Sandwiches', 'Wine & Spirits',
            'adultentertainment', 'afghani', 'african', 'apartments', 'asianfusion',
            'bagels', 'bakeries', 'bangladeshi', 'bars', 'bbq', 'beerbar',
            'beergardens', 'belgian', 'brasseries', 'breakfast_brunch', 'breweries',
            'british', 'buffets', 'burgers', 'burmese', 'cafes', 'cafeteria',
            'cajun', 'catering', 'cheesesteaks', 'chicken_wings', 'chinese',
            'chocolate', 'churches', 'cocktailbars', 'coffee', 'coffeeroasteries',
            'comfortfood', 'cookingschools', 'creperies', 'cuban', 'cupcakes',
            'danceclubs', 'delis', 'desserts', 'diners', 'discountstore',
            'divebars', 'donuts', 'drugstores', 'ethiopian', 'ethnicmarkets',
            'falafel', 'foodtrucks', 'french', 'gastropubs', 'gelato', 'german',
            'gluten_free', 'golf', 'gourmet', 'greek', 'grocery', 'gyms', 'halal',
            'healthtrainers', 'hookah_bars', 'hotdog', 'hotdogs', 'hotels',
            'icecream', 'indpak', 'irish', 'irish_pubs', 'italian', 'japanese',
            'jazzandblues', 'juicebars', 'korean', 'landmarks', 'latin', 'lawyers',
            'lebanese', 'libraries', 'lounges', 'mediterranean', 'mexican',
            'mideastern', 'mini_golf', 'modern_european', 'musicvenues',
            'newamerican', 'nonprofit', 'pakistani', 'peruvian', 'pianobars',
            'pizza', 'publicservicesgovt', 'pubs', 'puertorican', 'restaurants',
            'salad', 'salvadoran', 'sandwiches', 'seafood', 'social_clubs',
            'soulfood', 'soup', 'southern', 'spanish', 'sports_clubs',
            'sportsbars', 'steak', 'sushi', 'tapas', 'tapasmallplates', 'tea',
            'tex-mex', 'thai', 'tobaccoshops', 'tradamerican', 'turkish',
            'vegetarian', 'venues', 'vietnamese', 'wholesale_stores', 'wine_bars']
        mapper = DataFrameMapper([([col], StandardScaler()) for col in scaled_cols] +
                                 [(col, None) for col in binary_cols])
        X_scaled = pd.DataFrame(mapper.fit_transform(X.copy()), columns=X.columns)
    print("\n data scaled\n")
    return X_scaled
def test_fit_transform_equiv_mock(simple_dataframe):
    """
    Check for equivalent results between the fit_transform and the
    fit-then-transform code paths in DataFrameMapper, using the mock
    transformer which does not implement a custom fit_transform.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', MockXTransformer())])
    transformed_combined = mapper.fit_transform(df)
    transformed_separate = mapper.fit(df).transform(df)
    assert np.all(transformed_combined == transformed_separate)
def main():
    """ Run script """
    options = getArgumentParser().parse_args()

    # Make output dir
    dir_path = os.getcwd()
    out_dir = options.outdir
    path = os.path.join(dir_path, out_dir)
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)
    os.chdir(path)

    all_data = pd.read_csv(options.infile)
    all_data = all_data[all_data['w'] > 0]
    #all_data['w'] = all_data['w'].abs()

    # Variables of interest
    var = ['met_tight_tst_et', 'met_tight_tst_phi', 'mT', 'ph_pt', 'dphi_mety_ll',
           'AbsPt', 'Ptll', 'mllg', 'lep1pt', 'lep2pt', 'mll', 'metsig_tst',
           'Ptllg', 'dphi_met_ph']
    varw = var + ['w']
    units = ['GeV', 'Radians', 'GeV', 'GeV', 'Radians', '', 'GeV', 'GeV', 'GeV',
             'GeV', 'GeV', r'$\sqrt{GeV}$', 'GeV', 'Radians']

    df_bkg = all_data[all_data['event'] == 0][var]
    df_sig = all_data[all_data['event'] == 1][var]
    #for i in range(0, len(var)):
    #    makePlots(df_bkg, df_sig, var[i], units[i], cuts=[])

    # Split into training and testing set and multiply by weights
    X = all_data
    y = all_data['event']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    X_train = X_train[varw]
    X_test = X_test[varw]
    wtrain = X_train['w']
    wtest = X_test['w']
    cols = X_train.columns
    itrain = X_train.index
    itest = X_test.index

    # Fit the scaler on the training set only and reuse it on the test set
    mapper = DataFrameMapper([(cols, StandardScaler())])
    scaled_train = mapper.fit_transform(X_train.copy())
    scaled_test = mapper.transform(X_test.copy())
    X_train = pd.DataFrame(scaled_train, index=itrain, columns=cols)
    X_test = pd.DataFrame(scaled_test, index=itest, columns=cols)
    X_train = X_train.drop(['w'], axis=1)
    X_test = X_test.drop(['w'], axis=1)

    model = MLPClassifier(max_iter=2000, activation='relu', alpha=0.06,
                          hidden_layer_sizes=(120, 75), learning_rate='adaptive',
                          momentum=0.9, solver='sgd', batch_size=50,
                          learning_rate_init=0.05)
    '''
    mlp = MLPClassifier(max_iter=2000, batch_size=50, momentum=0.9)
    param_grid = {
        'hidden_layer_sizes': [(sp_randint.rvs(100, 600, 1), sp_randint.rvs(100, 600, 1),),
                               (sp_randint.rvs(100, 600, 1),)],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['sgd', 'adam', 'lbfgs'],
        'alpha': uniform(0.0001, 0.9),
        'learning_rate': ['constant', 'adaptive']}
    #{'alpha': 0.06640793542453478, 'batch_size': 50, 'hidden_layer_sizes': (117, 74), 'learning_rate': 'adaptive', 'learning_rate_init': 0.05421689357774788, 'momentum': 0.9, 'solver': 'sgd'}
    #{'alpha': 0.015786202068122347, 'hidden_layer_sizes': (197,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.010660992530318792, 'solver': 'sgd'}
    parameter_space = {
        'hidden_layer_sizes': [(sp_randint.rvs(100, 600, 1), sp_randint.rvs(100, 600, 1),),
                               (sp_randint.rvs(100, 600, 1),)],
        #'momentum': [0.9, 0.95, 0.99],
        'solver': ['sgd', 'adam', 'lbfgs'],
        'alpha': uniform(0.0001, 0.1),
        'learning_rate_init': uniform(0.0001, 0.1),
        'learning_rate': ['constant', 'adaptive', 'invscaling']
        #'batch_size': [10, 50, 200]
    }
    scores = ['roc_auc']
    sys.stdout = open('model_cv.txt', 'wt')
    for score in scores:
        clf = RandomizedSearchCV(mlp, parameter_space, cv=3, scoring=score, n_jobs=-1, n_iter=25)
        clf.fit(X_train, y_train)
        print(score)
        print()
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()
        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()
    '''

    model = model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    probs = model.predict_proba(X_test)

    metrics.plot_roc_curve(model, X_test, y_test, sample_weight=wtest)
    plt.savefig("roc.pdf")
    metrics.plot_precision_recall_curve(model, X_test, y_test, sample_weight=wtest)
    plt.savefig("prec_recall.pdf")
    #compare_train_test(model, X_train, y_train, X_test, y_test)
    plot_probs(y_test, probs, "nn")

    # Show output BDT score plot
    '''
    fig, ax = plt.subplots(1, 1)
    twoclass_output = model.decision_function(X_test)
    train_output = model.decision_function(X_train)
    class_names = ["Signal", "Background"]
    plot_colors = ['red', 'blue']
    for i, n, c in zip(range(2), class_names, plot_colors):
        ax.hist(twoclass_output[i], bins=50, range=[-5, 5], facecolor=c,
                label='Test %s' % n, alpha=.5, edgecolor=c)
        ax.hist(train_output[i], bins=50, range=[-5, 5], label='Train %s' % n,
                fill=False, linestyle='--', edgecolor=c)
    ax.legend(loc='upper right')
    ax.set_ylabel('Samples')
    ax.set_xlabel('Score')
    ax.set_title('Decision Scores')
    plt.savefig('bdt_train_test_output_scores.pdf')
    '''

    sys.stdout = open('model_out.txt', 'wt')
    print('Accuracy:')
    print(metrics.accuracy_score(y_test, predictions, sample_weight=wtest))
    print("ROC:")
    print(metrics.roc_auc_score(y_test, probs[:, 1], sample_weight=wtest))
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, predictions, sample_weight=wtest))
    print(metrics.classification_report(y_test, predictions, sample_weight=wtest))
parameters = {
    'eta': 0.3,
    'silent': True,                 # option for logging
    'objective': 'multi:softprob',  # error evaluation for multiclass tasks
    'num_class': 3,                 # number of classes to predict
    'max_depth': 3                  # depth of the trees in the boosting process
}
num_round = 20  # the number of training iterations
model = xgb.XGBClassifier(**parameters)
# model.fit(X_train, y_train)
# preds = model.predict(X_test)
default_mapper = DataFrameMapper([(i, None) for i in feat_names])
pipeline = PMMLPipeline([('mapper', default_mapper), ("classifier", model)])
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
y_test_trans = np.array([_[0] for _ in y_test.values])
print(precision_score(y_test, preds, average='macro'))  # computed per class, then averaged
print(precision_score(y_test, preds, average='micro'))  # computed globally, ignoring class
sklearn2pmml(pipeline, "iris_v2.pmml", with_repr=True)
# sklearn2pmml(estimator=model, mapper=default_mapper, pmml='iris_v2.xml')
# In[10]:

from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.linear_model import LinearRegression
import numpy as np

xtrain0 = train[['season', 'hour', 'holiday', 'workingday', 'humidity',
                 'windspeed', 'weather', 'temp']]
ytrain = train['count']
mapper = DataFrameMapper([('season', LabelBinarizer()),
                          ('hour', LabelBinarizer()),
                          ('holiday', LabelBinarizer()),
                          ('workingday', LabelBinarizer()),
                          (['humidity'], StandardScaler()),   # list selector -> 2-D input for scalers
                          (['windspeed'], StandardScaler()),
                          ('weather', LabelBinarizer()),
                          (['temp'], StandardScaler())])
xtrain1 = mapper.fit_transform(xtrain0)
model = LinearRegression()
model.fit(xtrain1, ytrain)
print(model)
print(model.coef_)

# In[11]:

model.score(xtrain1, ytrain)

# In[12]:
from sklearn import preprocessing

df = pd.read_csv('data/bio_stats.csv')
df['college'].value_counts()
target = 'college'
X = df.drop(target, axis=1)
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
mapper = DataFrameMapper([
    ('player_height', LabelBinarizer()),
    ('player_weight', LabelBinarizer()),
    ('country', LabelBinarizer()),
    ('draft_year', LabelBinarizer()),
    ('draft_round', LabelBinarizer()),
    ('draft_number', LabelBinarizer())], df_out=True)
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)
model = LogisticRegression(max_iter=500).fit(Z_train, y_train)
model.score(Z_train, y_train)
model.score(Z_test, y_test)
model = RandomForestClassifier().fit(Z_train, y_train)
model.score(Z_train, y_train)
model.score(Z_test, y_test)
token_2 = []
token_3 = []
token_4 = messages['length']
token_5 = []
token_6 = []
count_vect = CountVectorizer()
x_counts = count_vect.fit(messages_data)
x_int = count_vect.transform(messages_data)
x_int = list(x_int)
data = preprocessing_text()
labels = ['message', 'f1', 'f2', 'f3', 'f4', 'f5']
df = pd.DataFrame.from_records(data, columns=labels)
mapper = DataFrameMapper([(['f1', 'f2', 'f3', 'f4', 'f5'], None),
                          ('message', CountVectorizer(binary=True, ngram_range=(1, 2)))])
X = mapper.fit_transform(df)
print("X " + str(X))
print("X " + str(X.shape))
trainset, testset, trainlabel, testlabel = train_test_split(
    X, messages_labels, test_size=0.33, random_state=42)
SVM = svm.SVC()
SVM.fit(trainset, trainlabel)
predicted_values_svm = SVM.predict(testset)
print(predicted_values_svm)
acurracy_SVM = accuracy_score(testlabel, predicted_values_svm)
print("acurracy_SVM " + str(acurracy_SVM))
confusion_matrix_SVM = confusion_matrix(testlabel, predicted_values_svm)
        ]
    )
    features_def = features_def + categorical_feature_def

if numerical_features and len(numerical_features) > 0:
    for feature in numerical_features:
        numerical_feature_def = gen_features(
            columns=[[feature]],
            classes=[
                {'class': SimpleImputer, 'strategy': 'mean'},
                {'class': StandardScaler},
            ]
        )
        features_def = features_def + numerical_feature_def

preprocess = ('Preprocess', DataFrameMapper(features_def, df_out=True))
estimator = ('Estimator', RandomForestClassifier())
steps = [preprocess, estimator]
pipeline = Pipeline(steps=steps)
model = pipeline.fit(X, y)
y_pred = pipeline.predict(test_df)
test_df['Survived'] = y_test
test_df['prediction'] = y_pred
# Inspect the prediction results
print(f"Prediction results: {test_df[['PassengerId', 'Survived', 'prediction']]}")
y = df[target]
X = df.drop(target, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# DataFrame Mapper
mapper = DataFrameMapper(
    [
        # ('region', LabelBinarizer()),
        (['year'], StandardScaler()),
        # ('manufacturer', [CategoricalImputer(), LabelBinarizer()]),
        ('model', [CategoricalImputer()]),
        ('cylinders', [CategoricalImputer(), LabelBinarizer()]),
        ('fuel', [CategoricalImputer(), LabelBinarizer()]),
        (['odometer'], [SimpleImputer(), StandardScaler()]),
        # ('title_status', [CategoricalImputer(), LabelBinarizer()]),
        ('transmission', [CategoricalImputer(), LabelBinarizer()]),
        # (['vin'], StandardScaler()),
        # ('type', [CategoricalImputer(), LabelBinarizer()]),
        ('paint_color', [CategoricalImputer(), LabelBinarizer()]),
        ('condition', [CategoricalImputer(), LabelBinarizer()]),
    ],
    df_out=True)
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

# GridSearchCV to find best params for the pipe
     Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                     ('scaler', StandardScaler())]))
    for f in numerical
]
categorical_transformations = [([f], OneHotEncoder(handle_unknown='ignore', sparse=False))
                               for f in categorical]
transformations_pipeline = numeric_transformations + categorical_transformations

# Append classifier algorithm to preprocessing pipeline.
# Now we have a full prediction pipeline.
model_pipeline = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations_pipeline)),
                                 ('classifier', LogisticRegression(C=args.C,
                                                                   solver=args.solver,
                                                                   penalty=args.penalty,
                                                                   l1_ratio=args.l1_ratio))])
# Check the Scikit-Learn docs to see the hyper-parameters available for LogisticRegression:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

# +
# Split data into train and test
x_train, x_test, y_train, y_test = train_test_split(attritionXData, target, test_size=0.2,
    ([f], Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                          ('scaler', StandardScaler())]))
    for f in numerical
]
categorical_transformations = [([f], OneHotEncoder(handle_unknown='ignore', sparse=False))
                               for f in categorical]
transformations = numeric_transformations + categorical_transformations

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations)),
                      ('classifier', LogisticRegression(solver='lbfgs'))])

# Split data into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(attritionXData, target, test_size=0.2,
                                                    random_state=0, stratify=target)

# write x_test out as a pickle file for later visualization
x_test_pkl = 'x_test.pkl'
with open(x_test_pkl, 'wb') as file:
    joblib.dump(value=x_test, filename=os.path.join('./outputs/', x_test_pkl))
def get_jewellery_data():
    data = pd.read_csv(r"../../../sample/jewellery_sample.csv")
    return data


def replace_foreign_characters(s):
    return re.sub(r'[^\x00-\x7f]', r'', s)


if __name__ == '__main__':
    samples = get_jewellery_data()
    X = samples.drop(['id'], axis=1)
    X['name'] = X['name'].apply(lambda x: replace_foreign_characters(x))
    X['description'] = X['description'].apply(lambda x: replace_foreign_characters(x))
    Y = samples["id"]
    print("data done!")
    pipeline = Pipeline([
        ('mapper', DataFrameMapper([
            ('name', TfidfVectorizer(norm=None, analyzer="word",
                                     max_features=200, stop_words="english")),
            ('description', TfidfVectorizer(norm=None, analyzer="word",
                                            max_features=600, stop_words="english"))
        ])),
        ('model', SVC(max_iter=10000)),  # train on TF-IDF vectors with an SVM classifier
    ])
    print("model set done!")
    pipeline.fit(X, Y)
    print("model fit done!")
    joblib.dump(pipeline, "../../../model/model_for_jewellery_second.joblib")
    print("model to JobLib done!")
prod_ratings.product_id = prod_ratings.product_id.apply(lambda x: prodid2idx[x])
prod_ratings.user_id = prod_ratings.user_id.apply(lambda x: userid2idx[x])
n_users = prod_ratings.user_id.nunique()
n_prods = prod_ratings.product_id.nunique()
# print(n_users, n_prods)


def round_rating(number):
    """Round a number to the closest half integer"""
    return np.round(number * 2) / 2


mapper = DataFrameMapper([(['product_count'], MinMaxScaler())], df_out=True)
# apply the mapper to each user and concatenate the results
dfs = [
    np.round(mapper.fit_transform(prod_ratings[prod_ratings.user_id == u].copy()), 1)
    for u in range(n_users)
]
prod_ratings['product_score'] = pd.concat(dfs).reset_index(drop=True) * 4 + 1
prod_ratings['product_score'] = round_rating(prod_ratings['product_score'])  # .astype(int)
# print(prod_ratings.shape)
# print(prod_ratings.head(20))
g = prod_ratings.groupby('user_id')['product_score'].count()
cols_standardize = ["x0", "x3", "x4", "x6"]
cols_leave = ["x1", "x7"]
cols_categorical = ["x2", "x5"]

if len(cols_categorical) > 0:
    num_embeddings = [len(df_train[cat].unique()) + 1 for cat in cols_categorical]
    embedding_dims = [math.ceil(n_emb / 2) for n_emb in num_embeddings]
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    leave = [(col, None) for col in cols_leave]
    categorical = [(col, OrderedCategoricalLong()) for col in cols_categorical]
    x_mapper_float = DataFrameMapper(standardize + leave)
    x_mapper_long = DataFrameMapper(categorical)
    x_fit_transform = lambda df: tt.tuplefy(
        x_mapper_float.fit_transform(df).astype(np.float32),
        x_mapper_long.fit_transform(df))
    x_transform = lambda df: tt.tuplefy(
        x_mapper_float.transform(df).astype(np.float32),
        x_mapper_long.transform(df))
else:
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    leave = [(col, None) for col in cols_leave]
    x_mapper = DataFrameMapper(standardize + leave)

data_file_name = os.path.join('./data/', args.dataset + '.pickle')
if os.path.exists(data_file_name):
    with open(data_file_name, 'rb') as f:
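# A minimal sketch of how the two-mapper pattern above is typically applied,
# assuming df_train and df_test are pandas DataFrames with the columns listed
# above: the float mapper feeds the network's dense inputs, the long mapper
# feeds its embedding layers.
x_train = x_fit_transform(df_train)  # fits both mappers on the training data
x_test = x_transform(df_test)        # reuses the fitted mappers
# x_train is a torchtuples pair: (float32 array of dense features,
# integer array of category codes for the embedding layers)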
                    'Sensor2_entropy_250', 'Sensor3_entropy_250', 'Sensor4_entropy_250',
                    'Sensor5_entropy_250', 'Sensor6_entropy_250', 'Sensor7_entropy_250',
                    'Sensor8_entropy_250', 'Sensor9_entropy_250', 'Sensor10_entropy_250',
                    'Sensor11_entropy_250', 'Sensor12_entropy_250', 'Sensor13_entropy_250',
                    'Sensor14_entropy_250', 'Sensor15_entropy_250', 'Sensor16_entropy_250',
                    'Sensor17_entropy_250', 'Sensor18_entropy_250', 'Sensor19_entropy_250',
                    'Sensor20_entropy_250', 'Sensor21_entropy_250']
response_column = 'RUL'

# Parsing data
training_data = training_frame[training_columns]
target_data = training_frame[response_column]
testing_data = testing_frame[training_columns]
ground_truth_data = testing_frame[response_column]

# Setting up mapper
df_mapper = DataFrameMapper([(training_columns, None), (response_column, None)])

# Train data - pandas to sklearn
data = df_mapper.fit_transform(training_frame)
# train
x = data[:, 0:108]
# response
y = data[:, 108]

# Test data - pandas to sklearn
test = df_mapper.fit_transform(testing_frame)
# test
tX = test[:, 0:108]
# ground truth
tY = test[:, 108]
    'texture_std_dev', 'perimeter_std_dev', 'area_std_dev', 'smoothness_std_dev',
    'compactness_std_dev', 'concavity_std_dev', 'concave_points_std_dev',
    'symmetry_std_dev', 'Worst_texture', 'Worst_perimeter', 'Worst_area',
    'Worst_smoothness', 'Worst_compactness', 'Worst_concavity',
    'Worst_concave_points', 'Worst_symmetry', 'Tumor_Size', 'Lymph_Node_Status'
]
'''
These dropped parameters are highly correlated variables; keeping them could
introduce multicollinearity, which has a negative impact on the accuracy of
the model.
'''
featureEngineered_dataset = dataset.drop(dropped_params, axis=1)
featureEngineered_dataset.head()

mapper = DataFrameMapper([(featureEngineered_dataset.columns, StandardScaler())])
scaled_features = mapper.fit_transform(featureEngineered_dataset.copy())
scaled_features_df = pd.DataFrame(scaled_features,
                                  index=featureEngineered_dataset.index,
                                  columns=featureEngineered_dataset.columns)
'''
scaled_features_df is the dataset on which feature engineering has been performed
'''
scaled_features_df.describe()

i = 4


def running_and_evaluating_model(x, y):
# Input dec csv file from the current folder
data = pd.read_csv('final_deceleration_mavg-co.csv', index_col=0)

# Removing the parameters which are not used for clustering
traindf = data.drop(['LA array', 'FileName', 'V2', 'T1', 'T2', 'D2-D1',
                     'Avg LA', 'yaw array', 'mavg_jerk'], axis=1)

# Conversion formula is a*x + b, where x is the parameter after scaling
a = traindf.max(axis=0) - traindf.min(axis=0)
b = traindf.min(axis=0)

# Scaling data using MinMaxScaler: formula = (X - Min) / (Max - Min)
mapper1 = DataFrameMapper([(traindf.columns, MinMaxScaler())])
scaled_features = mapper1.fit_transform(traindf.copy())
scnum_train = pd.DataFrame(scaled_features, index=traindf.index, columns=traindf.columns)
# scnum_train.describe()

# Elbow method for finding the optimal number of clusters:
# average the WCSS over 15 random seed values for every cluster count
wcss_avg = []
for i in range(1, 15):
    wcss_k = []
    for j in range(1, 11):
        km = KMeans(n_clusters=i,
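# A small verification sketch for the a*x + b conversion noted above:
# MinMaxScaler computes x_scaled = (X - min) / (max - min), so the originals
# can be recovered as x_scaled * a + b, with a = max - min and b = min.
recovered = scnum_train * a + b
# (recovered - traindf).abs().max() should be ~0 up to floating-point error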
mapper = DataFrameMapper(
    [
        ('SOILCLASS', None),          # Soil classification
        ('LANDCOV', None),            # Land coverage class from GlobCover
        ('ELEVATION', None),          # Terrain elevation from a DEM
        ('SLOPE_PERCENTAGE', None),   # Terrain slope from a DEM
        ('ASPECT', None),             # Terrain aspect from a DEM
        ('PROFILE_CURVATURE', None),  # Terrain profile curvature from a DEM
        ('CLEAN_ID', None),           # Unique identifier for each measurement point
        ('TIMESTRR', None),           # Date for the measurement
        ('LONWGS84_x', None),         # Longitude coordinates for measurement points
        ('LATWGS84_x', None),         # Latitude coordinates for measurement points
        ('DEPTH', None),              # Depth of the measurement
        ('UHDICM.f', None),           # Upper horizon depth
        ('LHDICM.f', None),           # Lower horizon depth
        ('DEPTH.f', None),            # Depth of the measurement
        ('UHDICM', None),             # Upper horizon depth
        ('LHDICM', None),             # Lower horizon depth
        ('CRFVOL', None),             # Coarse fragments volumetric in %
        ('SNDPPT', None),             # Sand content (50-2000 micrometer) mass fraction in %
        ('SLTPPT', None),             # Silt content (2-50 micrometer) mass fraction in %
        ('CLYPPT', None),             # Clay content (0-2 micrometer) mass fraction in %
        ('BLD', None),                # Bulk density (fine earth) in kg / cubic meter
        ('PHIHOX', None),             # Soil pH x 10 in H2O
        ('PHIKCL', None),             # pH measured in a potassium chloride (KCl) solution
        ('ORCDRC', None),             # Soil organic carbon content (fine earth fraction) in permilles
        ('CECSUM', None)
    ],
    df_out=True)
# Create a boolean mask for categorical columns
categorical_feature_mask = X.dtypes == object

# Get list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer
numeric_imputation_mapper = DataFrameMapper(
    [([numeric_feature], SimpleImputer(strategy="median"))
     for numeric_feature in non_categorical_columns],
    input_df=True,
    df_out=True
)

# Apply categorical imputer
categorical_imputation_mapper = DataFrameMapper(
    [(category_feature, CategoricalImputer())
     for category_feature in categorical_columns],
    input_df=True,
    df_out=True
)

# Combine the numeric and categorical transformations
numeric_categorical_union = FeatureUnion([
    ("num_mapper", numeric_imputation_mapper),
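# The FeatureUnion above is cut off mid-list. A hedged sketch of how such a
# union is typically completed (the "cat_mapper" step name is an assumption):
#
# numeric_categorical_union = FeatureUnion([
#     ("num_mapper", numeric_imputation_mapper),
#     ("cat_mapper", categorical_imputation_mapper)
# ])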
# The Pipeline constructor takes a list of name/estimator pairs defining a sequence
# of steps. All but the last estimator must be transformers (they must have a
# fit_transform() method).
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn_pandas import DataFrameMapper

num_pipeline = Pipeline([
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())])
housing_num_tr = num_pipeline.fit_transform(housing_num)

mapper1 = DataFrameMapper([
    ("ocean_proximity", [LabelBinarizer()])], sparse=False)

# cat_pipeline = Pipeline([
#     ('labeler', StringIndexer()),
#     ('encoder', OneHotEncoder(handle_unknown='ignore'))])
#
# housing_cat_tr = cat_pipeline.fit_transform(housing)
# print(housing_cat_tr)

# housing_cat = housing["ocean_proximity"]
# housing_cat_encoded, housing_categories = housing_cat.factorize()
#
# encoder = OneHotEncoder()
# # transform factorized categorical data from the housing set and reshape it,
# # since fit_transform expects a 2D array.
# housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
# print(housing_cat_1hot)
    'description_length', 'num_of_features', 'day_created', 'manager_quality',
    'building_quality', 'hour_created', 'day_of_week_created']

# convert target label into numerical (ordinal)
target_conversion = {'low': 0, 'medium': 1, 'high': 2}
y_train = X_train.interest_level.map(target_conversion).values
y_test = X_test.interest_level.map(target_conversion).values

X_train_cut = X_train[features_to_use]
X_test_cut = X_test[features_to_use]

# mapping scaler to keep dataset in a dataframe (cannot do inverse using this function)
scaler = DataFrameMapper([(X_train_cut.columns, StandardScaler())])
#scaler = StandardScaler()

# learn scale parameters from the final training set and apply to training and test sets
X_train_scaled = scaler.fit_transform(X_train_cut)
X_test_scaled = scaler.transform(X_test_cut)

# turn numpy arrays back to pandas dataframes (retaining column names)
X_train_df = pd.DataFrame(X_train_scaled, index=X_train_cut.index, columns=X_train_cut.columns)
X_test_df = pd.DataFrame(X_test_scaled, index=X_test_cut.index, columns=X_test_cut.columns)

# In[19]:

#==============================================================================
# Modeling and evaluation
#==============================================================================
# impute missing values for continuous features
imputable_cont_features = ['Age', 'Fare']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_train[imputable_cont_features])
print(cont_imputer.statistics_)
titanic_train[imputable_cont_features] = cont_imputer.transform(titanic_train[imputable_cont_features])

# impute missing values for categorical features
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_train['Embarked'])
print(cat_imputer.fill_)
titanic_train['Embarked'] = cat_imputer.transform(titanic_train['Embarked'])

encodable_columns = ['Sex', 'Embarked', 'Pclass']
feature_defs = [(col_name, preprocessing.LabelEncoder()) for col_name in encodable_columns]
mapper = DataFrameMapper(feature_defs)
mapper.fit(titanic_train)
titanic_train[encodable_columns] = mapper.transform(titanic_train)

titanic_train1 = titanic_train.drop(['PassengerId', 'Name', 'Cabin', 'Ticket', 'Survived'], axis=1)
one_hot_encoder = preprocessing.OneHotEncoder(categorical_features=np.array([0, 1, 6]))
one_hot_encoder.fit(titanic_train1)
print(one_hot_encoder.n_values_)
titanic_train2 = one_hot_encoder.transform(titanic_train1).toarray()

scaler = preprocessing.StandardScaler()
scaler.fit(titanic_train2)
X_train = scaler.transform(titanic_train2)
y_train = titanic_train[['Survived']]
def impute_categorical_features(df, features):
    feature_defs = []
    for col_name in features:
        feature_defs.append((col_name, CategoricalImputer()))
    mapper = DataFrameMapper(feature_defs, input_df=True, df_out=True)
    df[features] = mapper.fit_transform(df[features])
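# A minimal usage sketch for impute_categorical_features above, assuming an
# sklearn-pandas version that still ships CategoricalImputer and a
# hypothetical toy frame: NaNs in each listed column are replaced in place
# with that column's most frequent value.
import numpy as np
import pandas as pd
from sklearn_pandas import CategoricalImputer, DataFrameMapper

df = pd.DataFrame({'color': ['red', np.nan, 'red'], 'size': ['S', 'M', np.nan]})
impute_categorical_features(df, ['color', 'size'])
print(df)  # the NaN in 'color' becomes 'red'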
def pandasToSkLearn(panda_frame, training_features, response_feature):
    df_mapper = DataFrameMapper([(training_features, None), (response_feature, None)])
    parsed_frame = df_mapper.fit_transform(panda_frame)
    return parsed_frame
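# A usage sketch for pandasToSkLearn above (the toy frame is hypothetical):
# with None transformers the mapper simply concatenates the selected columns
# into one numpy array, training features first, response last.
import pandas as pd

frame = pd.DataFrame({'f1': [1, 2], 'f2': [3, 4], 'target': [0, 1]})
arr = pandasToSkLearn(frame, ['f1', 'f2'], 'target')
assert arr.shape == (2, 3)  # columns: f1, f2, target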
def build_dataset(dataframe, num_features, scaler, include_id=False):
    x = dataframe.iloc[:, :-1]
    y = dataframe[['redshift']]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
    if include_id:
        x_train = x_train.iloc[:, :-1]
        x_val = x_val.iloc[:, :-1]
        x_test_ids = x_test.iloc[:, -1]
    chunks = 2
    if num_features > 10:
        chunks = num_features / 5
    if 5 < num_features < 16:
        x_train_ugriz, x_train_errs, x_train_experrs, _ = ugriz_errs_split(x_train, chunks)
        x_val_ugriz, x_val_errs, x_val_experrs, _ = ugriz_errs_split(x_val, chunks)
        x_test_ugriz, x_test_errs, x_test_experrs, _ = ugriz_errs_split(x_test, chunks)
        if scaler is not None:
            mapper = DataFrameMapper([(x_train_ugriz.columns, scaler)])
            x_train_ugriz_s = mapper.fit_transform(x_train_ugriz)
            x_train_ugriz = pd.DataFrame(x_train_ugriz_s, index=x_train_ugriz.index,
                                         columns=x_train_ugriz.columns)
            x_val_ugriz_s = mapper.transform(x_val_ugriz)
            x_val_ugriz = pd.DataFrame(x_val_ugriz_s, index=x_val_ugriz.index,
                                       columns=x_val_ugriz.columns)
            x_test_ugriz_s = mapper.transform(x_test_ugriz)
            x_test_ugriz = pd.DataFrame(x_test_ugriz_s, index=x_test_ugriz.index,
                                        columns=x_test_ugriz.columns)
        if chunks == 2:
            x_train = pd.concat([x_train_ugriz, x_train_errs], axis=1)
            x_val = pd.concat([x_val_ugriz, x_val_errs], axis=1)
            x_test = pd.concat([x_test_ugriz, x_test_errs], axis=1)
        else:
            x_train = pd.concat([x_train_ugriz, x_train_errs, x_train_experrs], axis=1)
            x_val = pd.concat([x_val_ugriz, x_val_errs, x_val_experrs], axis=1)
            x_test = pd.concat([x_test_ugriz, x_test_errs, x_test_experrs], axis=1)
    elif num_features > 15:
        x_train_ugriz, x_train_errs, x_train_experrs, x_train_expmags = ugriz_errs_split(x_train, chunks)
        x_val_ugriz, x_val_errs, x_val_experrs, x_val_expmags = ugriz_errs_split(x_val, chunks)
        x_test_ugriz, x_test_errs, x_test_experrs, x_test_expmags = ugriz_errs_split(x_test, chunks)
        if scaler is not None:
            mapper = DataFrameMapper([(x_train_ugriz.columns, scaler)])
            x_train_ugriz_s = mapper.fit_transform(x_train_ugriz)
            x_train_ugriz = pd.DataFrame(x_train_ugriz_s, index=x_train_ugriz.index,
                                         columns=x_train_ugriz.columns)
            x_val_ugriz_s = mapper.transform(x_val_ugriz)
            x_val_ugriz = pd.DataFrame(x_val_ugriz_s, index=x_val_ugriz.index,
                                       columns=x_val_ugriz.columns)
            x_test_ugriz_s = mapper.transform(x_test_ugriz)
            x_test_ugriz = pd.DataFrame(x_test_ugriz_s, index=x_test_ugriz.index,
                                        columns=x_test_ugriz.columns)
        x_train = pd.concat([x_train_ugriz, x_train_errs, x_train_experrs, x_train_expmags], axis=1)
        x_val = pd.concat([x_val_ugriz, x_val_errs, x_val_experrs, x_val_expmags], axis=1)
        x_test = pd.concat([x_test_ugriz, x_test_errs, x_test_experrs, x_test_expmags], axis=1)
    else:
        if scaler is not None:
            mapper = DataFrameMapper([(x_train.columns, scaler)])
            x_train_s = mapper.fit_transform(x_train)
            x_train = pd.DataFrame(x_train_s, index=x_train.index, columns=x_train.columns)
            x_val_s = mapper.transform(x_val)
            x_val = pd.DataFrame(x_val_s, index=x_val.index, columns=x_val.columns)
            x_test_s = mapper.transform(x_test)
            x_test = pd.DataFrame(x_test_s, index=x_test.index, columns=x_test.columns)
    if include_id:
        x_test = pd.concat([x_test, x_test_ids], axis=1)
    return x_train, y_train, x_test, y_test, x_val, y_val, scaler
features = DataFrameMapper([
    (['VocabularyRichness', 'Egotest', 'WordPerLine', 'WordLenght'], None),
    ('CleanLyrics', CountVectorizer(analyzer="word", ngram_range=(1, 1),
                                    tokenizer=None, preprocessor=None,
                                    stop_words=None, max_features=250)),
    ('PosTag', CountVectorizer(analyzer="word", ngram_range=(2, 2),
                               tokenizer=None, preprocessor=None,
                               stop_words=None, max_features=250)),
    ('PosWord', CountVectorizer(analyzer="word", ngram_range=(1, 1),
                                tokenizer=None, preprocessor=None,
                                stop_words=None, max_features=250)),
    ('RidTag', CountVectorizer(analyzer="word", ngram_range=(2, 2),
                               tokenizer=None, preprocessor=None,
                               stop_words=None, max_features=250)),
    ('RidTagOnly', CountVectorizer(analyzer="word", ngram_range=(4, 4),
                                   tokenizer=None, preprocessor=None,
                                   stop_words=None, max_features=25))])
titanic_train.Cabin = titanic_train.Cabin.map(lambda x: x[0])

# size of families (including the passenger)
titanic_train['FamilySize'] = titanic_train.Parch + titanic_train.SibSp + 1

cat_features = ['Sex', 'Embarked', 'Pclass', 'Cabin', 'Title']
cont_features = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize']
feature_defs = []
for col_name in cat_features:
    feature_defs.append((col_name, MyLabelBinarizer()))
for col_name in cont_features:
    feature_defs.append((col_name, None))
mapper = DataFrameMapper(feature_defs, input_df=True, df_out=True)
mapper.fit(titanic_train)
X_train = mapper.transform(titanic_train)
y_train = titanic_train['Survived']

kfold = model_selection.StratifiedKFold(n_splits=10)
random_state = 100
rf_classifier = ensemble.RandomForestClassifier(random_state=random_state)
rf_grid = {
    'max_depth': list(range(7, 14)),
    'n_estimators': list(range(10, 100, 10)),
    'min_samples_split': list(range(4, 11)),
    'min_samples_leaf': list(range(2, 5))
}
grid_rf_classifier = model_selection.GridSearchCV(rf_classifier,
#print(audit_df.head(5))
audit_X = audit_df[audit_df.columns.difference(["Adjusted"])]
audit_y = audit_df["Adjusted"]

scalar_mapper = DataFrameMapper([
    ("Education", [CategoricalDomain(), LabelBinarizer(), SelectKBest(chi2, k=3)]),
    ("Employment", [CategoricalDomain(), LabelBinarizer(), SelectKBest(chi2, k=3)]),
    ("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectKBest(chi2, k=3)]),
    ("Age", [ContinuousDomain(),
             CutTransformer(bins=[17, 28, 37, 47, 83], labels=["q1", "q2", "q3", "q4"]),
             LabelBinarizer()]),
    ("Hours", ContinuousDomain()),
    ("Income", ContinuousDomain()),
    (["Hours", "Income"], Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))
])
interaction_mapper = DataFrameMapper([
    ("Gender", [CategoricalDomain(), LabelBinarizer()]),
    ("Marital", [CategoricalDomain(), LabelBinarizer()])
])
classifier = XGBClassifier()
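# A hedged sketch of how the two mappers above might feed one model: combine
# them with a FeatureUnion ahead of the classifier (the step names here are
# assumptions, not the original code).
from sklearn.pipeline import FeatureUnion
from sklearn2pmml.pipeline import PMMLPipeline

pipeline = PMMLPipeline([
    ("union", FeatureUnion([("scalar_mapper", scalar_mapper),
                            ("interaction_mapper", interaction_mapper)])),
    ("classifier", classifier)
])
pipeline.fit(audit_X, audit_y)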
major = pd.read_csv(r"C:\Users\钟顺民\Desktop\4.csv", sep=",", encoding='ISO-8859-1') \
    .dropna().groupby('id', as_index=False, group_keys=False) \
    .apply(typicalsamling, typicalNDict_Major)

# Split out the data
X = major.drop(['id'], axis=1)
Y = major["id"]

pipeline = PMMLPipeline([
    ('mapper', DataFrameMapper([
        ('name', TfidfVectorizer(norm=None, analyzer="word", max_features=200,
                                 tokenizer=Splitter())),
        ('description', TfidfVectorizer(norm=None, analyzer="word", max_features=600,
                                        tokenizer=Splitter()))
    ])),
    ('model', SVC(max_iter=10000)),  # train on TF-IDF vectors with an SVM classifier
])
pipeline.fit(X, Y)

c = pd.read_csv(r"C:\Users\钟顺民\Desktop\4.csv", sep=',',
                encoding='ISO-8859-1').dropna().sample(n=200)
prediction = pipeline.predict(c.drop(['id'], axis=1))
t = c['id']
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer

train_set = pd.read_csv("train.csv", index_col="PassengerId")
test_set = pd.read_csv("test.csv", index_col="PassengerId")

mapper = DataFrameMapper(
    [
        ("Age", None),
        ("Fare", None),
        #("Embarked", [CategoricalImputer(), LabelBinarizer()]),  # at first it didn't work well
        ("Sex", LabelEncoder()),
        ("Pclass", None),
        ("SibSp", None),
    ],
    df_out=False)

pipeline = Pipeline([
    ("mapper", mapper),
    ("imputer", Imputer()),
    ("scaler", StandardScaler()),
    #("classifier", SGDClassifier(random_state=42, n_jobs=4)),  # Stochastic Gradient Descent
    ("classifier", RandomForestClassifier(random_state=42, n_jobs=4))
])

train_set_labels = train_set["Survived"]
pipeline.fit(train_set, train_set_labels)
from sklearn2pmml.pipeline import PMMLPipeline
import pandas

df = pandas.read_csv("audit.csv")

cat_columns = ["Education", "Employment", "Marital", "Occupation"]
cont_columns = ["Age", "Hours", "Income"]

df_X = df[cat_columns + cont_columns]
df_y = df["Adjusted"]

mapper = DataFrameMapper(
    [(cat_column, [CategoricalDomain(invalid_value_treatment="as_is"), LabelBinarizer()])
     for cat_column in cat_columns] +
    [([cont_column], [ContinuousDomain(invalid_value_treatment="as_is"), StandardScaler()])
     for cont_column in cont_columns])
selector = SelectKBest()
classifier = LogisticRegression(multi_class="ovr", penalty="elasticnet",
                                solver="saga", max_iter=1000)

pipeline = PMMLPipeline([("mapper", mapper), ("selector", selector), ("classifier", classifier)])

param_grid = {
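# The param_grid literal above is cut off. A hedged sketch of the usual next
# step (these parameter names and values are hypothetical, not the original
# grid):
#
# param_grid = {"selector__k": [5, 10, "all"]}
# searcher = GridSearchCV(pipeline, param_grid)
# searcher.fit(df_X, df_y)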
# get features and thermal sensation
y = data_new['sensation']
x = data_new[['temperature', 'humidity', 'skin', 'clothing']]

# In[6]:

import sklearn.preprocessing, sklearn.decomposition, sklearn.linear_model, sklearn.pipeline, sklearn.metrics
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import datasets, svm
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn_pandas import DataFrameMapper

# In[7]:

mapper = DataFrameMapper([(['temperature'], None), (['humidity'], None),
                          (['skin'], None), (['clothing'], None)])
mapper.fit_transform(x.copy())

# count the number of samples with thermal sensation 3
mask = (y == 3)
len(y[mask])

# In[21]:

#clf = svm.SVC(kernel='linear')
C = 1
#clf = svm.SVC(kernel='poly', degree=3, C=C)
clf = svm.SVC(kernel='rbf', gamma=0.7, C=C)
pipe = sklearn.pipeline.Pipeline([('featurize', mapper), ('svc', clf)])
#np.round(cross_val_score(pipe, X=data_new.copy(), y=data_new.comfort, scoring='r2'), 2)
cross_val_score(pipe, X=x.copy(), y=y, scoring='r2', cv=5)