def loadData(path="../data/", k=5, log='add', pca_n=0, SEED=34):
    from pandas import DataFrame, read_csv
    from numpy import log as ln
    from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in 0.20
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    train = read_csv(path + "train.csv")
    test = read_csv(path + "test.csv")
    id = test.id
    target = train.target

    encoder = LabelEncoder()
    target_nnet = encoder.fit_transform(target).astype('int32')

    feat_names = [x for x in train.columns if x.startswith('feat')]
    train = train[feat_names].astype(float)
    test = test[feat_names]

    if log == 'add':
        for v in feat_names:
            train[v + '_log'] = ln(train[v] + 1)
            test[v + '_log'] = ln(test[v] + 1)
    elif log == 'replace':
        for v in feat_names:
            train[v] = ln(train[v] + 1)
            test[v] = ln(test[v] + 1)

    if pca_n > 0:
        from sklearn.decomposition import PCA
        pca = PCA(pca_n)
        train = pca.fit_transform(train)
        test = pca.transform(test)

    scaler = StandardScaler()
    scaler.fit(train)
    train = DataFrame(scaler.transform(train),
                      columns=['feat_' + str(x) for x in range(train.shape[1])])
    test = DataFrame(scaler.transform(test),
                     columns=['feat_' + str(x) for x in range(train.shape[1])])

    # KFold's API changed in modern scikit-learn: the fold count is n_splits
    # and the folds come from .split()
    cv = KFold(n_splits=k, shuffle=True, random_state=SEED).split(train)

    return train, test, target, target_nnet, id, cv, encoder
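# Usage sketch for loadData (added for illustration; assumes the Otto-style
# train.csv/test.csv layout the function reads, and the ../data/ path is ours):
if __name__ == '__main__':
    train, test, target, target_nnet, test_id, cv, encoder = loadData(path="../data/", k=5)
    for fold, (tr_idx, va_idx) in enumerate(cv):
        X_tr, X_va = train.iloc[tr_idx], train.iloc[va_idx]
        y_tr, y_va = target_nnet[tr_idx], target_nnet[va_idx]
        print("fold %d: %d train / %d valid rows" % (fold, len(X_tr), len(X_va)))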
def prepare_items_features(user_items_csv, out_dir):
    import os
    import numpy as np
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder

    array = np.loadtxt(user_items_csv, delimiter='|', dtype=np.dtype(np.uint64))

    # Re-index each categorical column to a compact 0..n-1 range.
    le = LabelEncoder()
    col1 = le.fit_transform(array[:, 1].T)
    col2 = le.fit_transform(array[:, 2].T)
    col3 = le.fit_transform(array[:, 3].T)
    col4 = le.fit_transform(array[:, 4].T)
    columns = np.array([col1, col2, col3, col4]).T

    enc = OneHotEncoder()
    print(array[:10])
    encoded = np.c_[array[:, 0], enc.fit_transform(columns).toarray()]
    print(encoded[:10])
    print(encoded.shape)

    # Aggregate the one-hot rows per user (rows are assumed grouped by user id).
    user_id = encoded[0][0]
    rows = []
    current = np.zeros(encoded.shape[1] - 1)
    for i in range(encoded.shape[0]):
        if encoded[i][0] != user_id:
            rows.append(np.concatenate([[user_id], current]))
            user_id = encoded[i][0]
            # start the new user's accumulator from this row; the original
            # reset to zeros silently dropped each user's first row
            current = encoded[i, 1:].copy()
        else:
            current = np.sum([current, encoded[i, 1:]], axis=0)
    rows.append(np.concatenate([[user_id], current]))

    array = np.array(rows)
    print(array.shape)
    # serialize the per-user feature matrix
    np.save(os.path.join(out_dir, "user_items"), array)
def transformTestData(self, train_data, test_data):
    # Select the right features for both training and testing data
    X_train, y_train = self.__selectRelevantFeatures(train_data)
    X_test, y_test = self.__selectRelevantFeatures(test_data)

    # Transform categorical variables into integer labels
    marital_le = LabelEncoder()
    occupation_le = LabelEncoder()
    relationship_le = LabelEncoder()
    race_le = LabelEncoder()
    sex_le = LabelEncoder()
    transformers = [marital_le, occupation_le, relationship_le, race_le, sex_le]
    for i in range(len(transformers)):
        X_train[:, i] = transformers[i].fit_transform(X_train[:, i])
        X_test[:, i] = transformers[i].transform(X_test[:, i])

    # Dummy-code the categorical variables. OneHotEncoder's categorical_features
    # argument was removed in scikit-learn 0.22; ColumnTransformer replaces it.
    from sklearn.compose import ColumnTransformer
    dummy_code = ColumnTransformer(
        [('onehot', OneHotEncoder(), list(range(5)))],
        remainder='passthrough', sparse_threshold=0)
    X_train = dummy_code.fit_transform(X_train)
    X_test = dummy_code.transform(X_test)

    # Normalize all features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Encode y
    class_le = LabelEncoder()
    y_train = class_le.fit_transform(y_train)
    y_test = class_le.transform(y_test)
    # print(class_le.transform(["<=50K", ">50K"]))

    return X_train, X_test, y_train, y_test
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=conf['neighbours'],
                               weights='distance', metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)

    # inverse_transform expects a 1-D array in current scikit-learn, so decode
    # the flattened top-3 indices and reshape back
    top3 = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]
    pred_labels = le.inverse_transform(top3.ravel()).reshape(top3.shape)
    return pred_labels, row_ids
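# The ravel/reshape decoding above in isolation: LabelEncoder.inverse_transform
# expects a 1-D array in current scikit-learn, so 2-D top-k index arrays are
# flattened first (the toy labels below are ours):
import numpy as np
from sklearn.preprocessing import LabelEncoder

le_demo = LabelEncoder().fit(["a", "b", "c"])
top3_idx = np.array([[2, 0, 1], [1, 2, 0]])  # e.g. argsort output, shape (n, 3)
top3_lab = le_demo.inverse_transform(top3_idx.ravel()).reshape(top3_idx.shape)
print(top3_lab)  # [['c' 'a' 'b'] ['b' 'c' 'a']]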
def train(cls, X, y, word_sim_metric, classifier=LinearSVC,
          feature_num=10, feature_type='sim', verbose=True):
    if isinstance(classifier, type):
        classifier = classifier()

    labels = LabelEncoder()
    y_train = labels.fit_transform(y)

    @timeit
    def build():
        # materialize the pairs: Python 3's zip is a one-shot iterator
        corpus = list(zip(X, y))
        model = Pipeline([
            ('preprocessor', TextPreprocessor(corpus, word_sim_metric,
                                              feature_num, feature_type)),
            ('vectorizer', DictVectorizer()),
            ('classifier', classifier),
        ])
        model.fit(X, y_train)
        return model

    if verbose:
        print("Building the model")
    model, secs = build()
    if verbose:
        print("Complete model building in {:0.3f} seconds".format(secs))

    return cls(labels, model)
def buildTreeClassifier(predictorColumns, structurestable='structures.csv',
                        targetcolumn='pointGroup', md=None):
    """
    Build a random-forest classifier to predict a structure feature from
    compositional data. Returns the model trained on all data, a confusion
    matrix computed on a held-out split, an average cross-validated accuracy
    score, and the fitted label encoder.
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if 'fracNobleGas' in df.columns:
        df = df[df['fracNobleGas'] <= 0]

    s = StandardScaler()
    le = LabelEncoder()
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = le.fit_transform(df[targetcolumn].values)

    rfc = RandomForestClassifier(max_depth=md)
    acc = mean(cross_val_score(rfc, X, y))

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfc.fit(X_train, y_train)
    y_predict = rfc.predict(X_test)
    cm = confusion_matrix(y_test, y_predict)
    cm = pd.DataFrame(cm, columns=le.classes_, index=le.classes_)

    rfc.fit(X, y)
    return rfc, cm, round(acc, 2), le
def load_train_data(path):
    print("Loading Train Data")
    df = pd.read_csv(path)

    # Remove the subsampling below to run on the full set - be careful, you
    # need more than 8GB RAM
    rows = np.random.choice(df.index.values, 40000)
    df = df.loc[rows]  # .ix was removed from pandas; .loc does the same here

    labels = df.target
    df = df.drop('target', axis=1)
    df = df.drop('ID', axis=1)

    # Junk cols - some feature engineering needed here
    df = df.fillna(-1)

    X = df.values.copy().astype(np.float32)

    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)

    # shuffle X and y together: shuffling X alone (as the original did)
    # breaks the row/label alignment
    perm = np.random.permutation(len(X))
    X, y = X[perm], y[perm]

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X, y, encoder, scaler
def load_otto_group():
    """
    Loads and returns several variables for the data set from Kaggle's Otto Group
    Product Classification competition.
    Link: https://www.kaggle.com/c/otto-group-product-classification-challenge

    Returns
    ----------
    data : array-like
        Pandas data frame containing the entire data set.
    X : array-like
        Training input samples.
    y : array-like
        Target values.
    """
    file_location = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'data', 'otto_group.zip')
    z = ZipFile(file_location)
    data = pd.read_csv(z.open('train.csv'))
    data = data.set_index('id')

    # move the label to the first position
    cols = data.columns.tolist()
    cols = cols[-1:] + cols[0:-1]
    data = data[cols]

    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].values

    # transform the labels from strings to integers
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    return data, X, y
def auto_alpha2num(self, col):
    from sklearn.preprocessing import LabelEncoder
    # use a fresh encoder per column; reusing one instance (as the original
    # did) leaves classes_ fitted only on the last column, so the mapping
    # cannot be reversed later
    for i in col:
        self.df[i] = LabelEncoder().fit_transform(self.df[i])
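# Variant sketch (ours): if the encodings ever need to be reversed, keep one
# fitted encoder per column instead of discarding them.
def auto_alpha2num_keep(self, col):
    from sklearn.preprocessing import LabelEncoder
    self.encoders_ = {}  # hypothetical attribute, one encoder per column
    for i in col:
        le = LabelEncoder()
        self.df[i] = le.fit_transform(self.df[i])
        self.encoders_[i] = le  # later: self.encoders_[i].inverse_transform(...)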
def process_one_cell(df_cell_train, df_cell_test):
    # Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    row_ids = df_cell_test.index

    # Feature engineering on x and y
    df_cell_train.loc[:, 'x'] *= 500.0
    df_cell_train.loc[:, 'y'] *= 1000.0
    df_cell_test.loc[:, 'x'] *= 500.0
    df_cell_test.loc[:, 'y'] *= 1000.0

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance,
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)

    # decode top-3 through a 1-D view: inverse_transform rejects 2-D input
    # in current scikit-learn
    top3 = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]
    pred_labels = le.inverse_transform(top3.ravel()).reshape(top3.shape)
    return pred_labels, row_ids
class Classifier(BaseEstimator):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.clf = None

    def fit(self, X, y):
        X = self.scaler.fit_transform(X.astype(np.float32))
        y = self.label_encoder.fit_transform(y).astype(np.int32)
        dtrain = xgb.DMatrix(X, label=y.astype(np.float32))
        param = {'objective': 'multi:softprob',
                 'eval_metric': 'mlogloss',
                 'nthread': 4,
                 'num_class': 9,
                 'colsample_bytree': 0.55,
                 'subsample': 0.85,
                 'gamma': 0.95,
                 'min_child_weight': 3.0,
                 'eta': 0.05,
                 'max_depth': 12}
        num_round = 400  # fewer rounds to train faster
        # num_round = 820
        self.clf = xgb.train(param, dtrain, num_round)

    def predict(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        label_index_array = np.argmax(self.clf.predict(dtest), axis=1)
        return self.label_encoder.inverse_transform(label_index_array)

    def predict_proba(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)
def test_multiclass_classifier_class_weight():
    """tests multiclass with class weights for each class"""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, classes=np.unique(y), y=y)
    sample_weight = class_weight_[le.fit_transform(y)]

    coef1 = []
    intercept1 = []
    coef2 = []
    intercept2 = []
    for cl in classes:
        y_encoded = np.ones(n_samples)
        y_encoded[y != cl] = -1

        spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight)
        spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight,
                                              sparse=True)
        coef1.append(spweights1)
        intercept1.append(spintercept1)
        coef2.append(spweights2)
        intercept2.append(spintercept2)

    coef1 = np.vstack(coef1)
    intercept1 = np.array(intercept1)
    coef2 = np.vstack(coef2)
    intercept2 = np.array(intercept2)

    for i, cl in enumerate(classes):
        assert_array_almost_equal(clf1.coef_[i].ravel(), coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)
        assert_array_almost_equal(clf2.coef_[i].ravel(), coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
def load_kernel_matrix(data_path='data', study='wl_kernel', verbose=True):
    """Load an already-computed kernel matrix.

    Parameters
    ----------
    data_path: string
        Path to the data folder.
    study: string
        Name of the folder containing the study, e.g. 'wl_kernel', which
        contains the WL kernel matrix.
    verbose: bool
    """
    path_k_matrix = os.path.join(data_path, 'precomputed_kernels', study,
                                 'k_matrix.csv')
    path_cls = os.path.join(data_path, 'precomputed_kernels', study,
                            'class_labels.csv')
    K = np.loadtxt(path_k_matrix)
    y = np.loadtxt(path_cls)
    le = LabelEncoder()
    y = le.fit_transform(y)
    if verbose:
        print('n_samples: %s, n_samples_by_class: (%s - %s)'
              % (len(y), len(y[y == 0]), len(y[y == 1])))
    return K, y
def main():
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')

    # fit the encoder on train and test together so every category is known
    enc = LabelEncoder()
    joined = pd.concat((train['Product_Info_2'], test['Product_Info_2']), axis=0)
    enc.fit(joined)
    train['Product_Info_2'] = enc.transform(train['Product_Info_2'])
    test['Product_Info_2'] = enc.transform(test['Product_Info_2'])

    X_train = train.drop('Response', axis=1).values
    y_train = train['Response'].values
    X_test = test.values

    mdl = xgb.XGBRegressor(learning_rate=0.05, n_estimators=200,
                           subsample=0.5, max_depth=6, silent=False)
    mdl.fit(X_train, y_train)

    # clip the regression output into the valid 1..8 response range
    preds = mdl.predict(X_test)
    preds = [min(max(1, int(round(pred))), 8) for pred in preds]

    sub = pd.DataFrame({'Id': test['Id'], 'Response': preds})
    sub.to_csv('submissions/xgb.csv', index=False)
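# The concat-then-fit pattern above generalizes to any categorical column that
# both frames share; a small helper sketch (the name encode_shared is ours):
def encode_shared(train, test, col):
    """Fit one encoder on the union of train/test values so the test frame
    never contains labels the encoder has not seen."""
    enc = LabelEncoder()
    enc.fit(pd.concat((train[col], test[col]), axis=0))
    train[col] = enc.transform(train[col])
    test[col] = enc.transform(test[col])
    return enc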
def main(X_fname, Y_fname, result_fname=None):
    le = LabelEncoder()
    moves = pandas.read_csv(Y_fname, index_col=0)
    Y = moves.values.ravel()
    Y = le.fit_transform(Y)
    X = io.mmread(X_fname)
    print(X.shape, Y.shape, len(le.classes_))

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)
    xg_train = xgboost.DMatrix(X_train, label=y_train)
    xg_test = xgboost.DMatrix(X_test, label=y_test)

    # use softmax multi-class classification
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.002
    param['max_depth'] = 7
    param['nthread'] = 7
    param['num_class'] = len(le.classes_)
    param['eval_metric'] = 'merror'
    evals = [(xg_train, 'train'), (xg_test, 'eval')]

    # Train xgboost
    print("Training")
    t1 = time.time()
    bst = xgboost.train(param, xg_train, 500, evals, early_stopping_rounds=3)
    t2 = time.time()
    print(t2 - t1)

    if result_fname is None:
        result_fname = str(datetime.now())
    bst.save_model("%s.bst" % result_fname)
def plot_model_decision_surface(clf, train_features, train_labels,
                                plot_step=0.02, cmap=plt.cm.RdYlBu,
                                markers=None, alphas=None, colors=None):
    if train_features.shape[1] != 2:
        raise ValueError("X_train should have exactly 2 columns!")

    x_min, x_max = (train_features[:, 0].min() - plot_step,
                    train_features[:, 0].max() + plot_step)
    y_min, y_max = (train_features[:, 1].min() - plot_step,
                    train_features[:, 1].max() + plot_step)
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    clf_est = clone(clf)
    clf_est.fit(train_features, train_labels)
    if hasattr(clf_est, 'predict_proba'):
        Z = clf_est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    else:
        Z = clf_est.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=cmap)

    le = LabelEncoder()
    y_enc = le.fit_transform(train_labels)
    n_classes = len(le.classes_)
    plot_colors = ''.join(colors) if colors else [None] * n_classes
    label_names = le.classes_
    markers = markers if markers else [None] * n_classes
    alphas = alphas if alphas else [None] * n_classes

    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y_enc == i)
        plt.scatter(train_features[idx, 0], train_features[idx, 1],
                    c=color, label=label_names[i], cmap=cmap,
                    edgecolors='black', marker=markers[i], alpha=alphas[i])
    plt.legend()
    plt.show()
def load_input_files(self, **kwargs):
    """
    Loads both the file containing training data and the file containing data
    for prediction, and encodes the target labels to integers. For training
    data, the LabelEncoder used for the target labels is returned in the
    output args rather than stored directly, so that it is only saved if
    training ends without errors.

    Inputs:
    - files_paths (string): path to the input files.
    - training_data (bool): whether the files contain training data or data
      for making predictions.

    Outputs:
    - LabelEncoder (LabelEncoder) (optional): encodes the labels of the
      target variable to integers.
    """
    input_data = kwargs['input_data']
    input_files_dir = kwargs['input_files_dir']
    input_file_path = input_files_dir + input_data['database']
    df = pd.read_csv(input_file_path)
    training_data = kwargs.pop('training_data', False)

    # For training data, assign an integer to each possible target label in
    # the dataset by fitting a LabelEncoder.
    if training_data:
        le = LabelEncoder()
        col_name = df.columns[4]
        df[col_name] = le.fit_transform(df[col_name])
        data = {}
        data['features'] = df[df.columns[0:4]].values
        data['targets'] = df[df.columns[4]].values
        self.feature_names = list(df.columns[0:4])
        self.target_name = df.columns[4]
        out_args = {}
        out_args['LabelEncoder'] = le
        return data, out_args
    # if the data is for making predictions
    else:
        data = {}
        # ensure that the columns are in the correct order
        data['features'] = df[self.feature_names].values
        out_args = {}
        return data, out_args
def ml_target(dataset):
    """ Takes a dataset and returns the target in a numpy.array ready for
    machine learning. Mainly transforms non-numerical variables (columns)
    to numbers.

    Parameters
    ----------
    copper.Dataset

    Returns
    -------
    (label_encoder, np.array)

    Notes
    -----
    If the dataset has more than one variable with role=TARGET, the first
    one is selected.
    """
    cols = dataset.filter_cols(role=dataset.TARGET)
    assert len(cols) > 0, 'No target variables on Dataset'
    if len(cols) > 1:
        import warnings
        warnings.warn("Dataset contains more than one target, %s was chosen"
                      % cols[0])

    # np.int/np.float were removed from NumPy; test the dtype kind instead
    if np.issubdtype(dataset[cols[0]].dtype, np.number):
        return None, dataset[cols[0]].values
    else:
        le = LabelEncoder()
        encoded = le.fit_transform(dataset[cols[0]].values)
        return le, encoded
def __call__(self, X_train, X_test, y_train, y_test):
    X = np.vstack([X_train, X_test])
    y = np.hstack([y_train, y_test])
    le = LabelEncoder()
    y = le.fit_transform(y)

    kmeans = KMeans(
        n_clusters=len(np.unique(y)),
        n_init=self.kmeans__n_init,
        random_state=self.random_state,
    )
    kmeans.fit(X)

    r = distance.cdist(kmeans.cluster_centers_, kmeans.cluster_centers_)
    h = np.exp(-r / (self.sig**2))
    N = confusion_matrix(y, kmeans.labels_)

    wN = np.zeros(h.shape)
    for l in range(wN.shape[0]):      # label
        for c in range(wN.shape[0]):  # cluster
            for j in range(wN.shape[0]):
                wN[l, c] += h[l, c] * N[l, j]
    return wN.max(axis=0).sum() / wN.sum()
def multicol_fit_transform(dframe, columns):
    if isinstance(columns, list):
        columns = np.array(columns)

    encoder_dict = {}
    # ndarrays to hold LabelEncoder().classes_, the encoders themselves, and
    # the labels for each column; shapes match the specified `columns`
    all_classes_ = np.ndarray(shape=columns.shape, dtype=object)
    all_encoders_ = np.ndarray(shape=columns.shape, dtype=object)
    all_labels_ = np.ndarray(shape=columns.shape, dtype=object)

    for idx, column in enumerate(columns):
        # instantiate a LabelEncoder, then fit and transform this column
        le = LabelEncoder()
        dframe.loc[:, column] = le.fit_transform(dframe.loc[:, column].values)
        encoder_dict[column] = le
        # record the `classes_` in our ndarray container
        all_classes_[idx] = (column,
                             np.array(le.classes_.tolist(), dtype=object))
        all_encoders_[idx] = le
        all_labels_[idx] = le

    multicol_dict = {"encoder_dict": encoder_dict,
                     "all_classes_": all_classes_,
                     "all_encoders_": all_encoders_,
                     "columns": columns}
    return dframe, multicol_dict
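# Sketch of the matching inverse operation (function name is ours), reading the
# per-column encoders back out of multicol_dict:
def multicol_inverse_transform(dframe, multicol_dict):
    # walk the stored encoders and map the integer codes back to labels
    for column, le in multicol_dict["encoder_dict"].items():
        dframe.loc[:, column] = le.inverse_transform(
            dframe.loc[:, column].values.astype(int))
    return dframe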
def train(args):
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    # .as_matrix() was removed from pandas; .values is the replacement
    labels = pd.read_csv(fname, header=None).values[:, 1]
    # keep only the directory name, which carries the identity label
    labels = [os.path.split(os.path.dirname(p))[1] for p in labels]
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).values

    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)

    param_grid = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
    ]
    svm = GridSearchCV(
        SVC(probability=True), param_grid, verbose=4, cv=5, n_jobs=16
    ).fit(embeddings, labelsNum)
    print("Best estimator: {}".format(svm.best_estimator_))
    print("Best score on left out data: {:.2f}".format(svm.best_score_))

    # pickle needs a binary file handle
    with open("{}/classifier.pkl".format(args.workDir), 'wb') as f:
        pickle.dump((le, svm), f)
def load_data(filename="Feat_normalized.csv"):
    '''
    Load training data and labels from a csv file.
    Return the feature matrix, training labels, the label encoder, and the
    feature names.
    http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
    http://stackoverflow.com/questions/21589177/using-multiple-features-with-scikit-learn?rq=1
    Labels could just be the names:
    http://stackoverflow.com/questions/13300160/non-integer-class-labels-scikit-learn?rq=1
    '''
    df = pd.read_csv(filename, index_col=0)
    lb = LabelEncoder()
    labels = lb.fit_transform(df.index.values)
    print("labels: %s %s" % (type(labels), labels))
    features = df.values
    # This could be done more elegantly; check the index number for later filtering.
    feature_names = df.columns.values  # ndarray, no labels index here
    print("%s features: " % (len(feature_names)))
    print('encoded labels: %s' % (set(labels)))
    return (features, labels, lb, feature_names)
def prep_data(df_train, df_test, test_size=0.2):
    print(" ---- Start data prep")
    df_train = df_train.dropna(subset=['X1'])
    df_train['X1'] = (df_train['X1'].replace(r'[\%,)]', '', regex=True)
                                    .replace('[(]', '-', regex=True).astype(float))
    labels = df_train['X1'].values
    id_test = df_test['X2']
    piv_train = df_train.shape[0]

    df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
    del df_all['X1'], df_all['X2'], df_all['X3'], df_all['X10'], df_all['X16'], df_all['X18']
    df_all['X23'] = df_all['X23'].map(lambda x: str(x)[:-3])
    df_all['X15'] = df_all['X15'].map(lambda x: str(x)[:-3])
    for col in ['X4', 'X5', 'X6']:
        df_all[col] = (df_all[col].replace(r'[\$,)]', '', regex=True)
                                  .replace('[(]', '-', regex=True).astype(float))
    df_all['X30'] = (df_all['X30'].replace(r'[\%,)]', '', regex=True)
                                  .replace('[(]', '-', regex=True).astype(float))

    df_f = feature_engineering(df_all)
    vals = df_f.values
    X = vals[:piv_train]
    # X1 is a numeric target here, so keep the raw values; the original also
    # built a LabelEncoder whose output was immediately overwritten by labels
    y = labels
    X_test = vals[piv_train:]
    # sklearn.cross_validation is gone; model_selection.train_test_split
    # replaces it, and the test_size argument is now actually honoured
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size)
    print(" ---- end data prep")
    return X_train, X_valid, y_train, y_valid, X_test, id_test
def to_numeric(self, columns=[]):
    for c in columns:
        le = LabelEncoder()  # fresh encoder per column
        self.M[:, c] = le.fit_transform(self.M[:, c])
    self.M = self.M.astype(float)  # np.float was removed from NumPy
    return self
def test_vote_soft():
    X, y, test_X, test_Y = get_test_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")

    # zip returns a lazy iterator on Python 3, so materialize it before indexing
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))

    # soft vote: sum the per-classifier test probabilities, then take the argmax
    from functools import reduce  # reduce moved to functools on Python 3
    # train_attr = reduce(lambda a, b: a + b, train_probs)
    test_attr = reduce(lambda a, b: a + b, test_probs)
    pred = test_attr.idxmax(1)

    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)
    print(metrics.accuracy_score(test_Y, pred))
def train(self):
    input_dir = get_config().get('Classification', 'TrainingInputPath')
    self.logger.info("Loading features")

    file_name = os.path.join(input_dir, 'labels.csv')
    # .as_matrix() was removed from pandas; .values is the replacement
    labels = pd.read_csv(file_name, header=None).values[:, 1]
    # keep only the directory name, which carries the class label
    labels = [os.path.split(os.path.dirname(p))[1] for p in labels]

    label_encoder = LabelEncoder().fit(labels)
    labels_encoded = label_encoder.transform(labels)
    num_classes = len(label_encoder.classes_)

    file_name = os.path.join(input_dir, 'reps.csv')
    features = pd.read_csv(file_name, header=None).values

    self.logger.info("Training for {} classes.".format(num_classes))
    clf = SVC(C=1, kernel='linear', probability=True)

    # TODO: Try a previous LDA
    try:
        lda = int(get_config().get("Classification", "LDADim"))
    except ValueError:
        lda = None
    if lda:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=lda)), ('clf', clf_final)])

    clf.fit(features, labels_encoded)

    file_name = os.path.join(input_dir, 'classifier.pkl')
    self.logger.info("Saving classifier to '{}'".format(file_name))
    with open(file_name, 'wb') as f:  # pickle needs a binary handle
        pickle.dump((label_encoder, clf), f)
def labele(tbl, cols='all'):
    from sklearn.preprocessing import LabelEncoder as LE
    if cols == 'all':
        cols = tbl.columns
    for ac in cols:  # iterate the requested columns, not all of them
        le = LE()    # fresh encoder per column; might have to return these
        tbl.loc[:, ac] = le.fit_transform(tbl[ac])
    return tbl
def prepare_labels(y):
    # From here: https://www.kaggle.com/pestipeti/keras-cnn-starter
    values = np.array(y)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)
    return integer_encoded, label_encoder
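# Usage sketch (ours): the returned encoder maps integer predictions back to
# the original string labels; train_df and the trained model are assumed context.
y_int, le = prepare_labels(train_df['Id'])
# ... train a model on y_int, predict integer classes pred_int ...
# pred_ids = le.inverse_transform(pred_int)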
def customEncode(df):
    global labelencoder
    le = LabelEncoder()
    le.fit(df['OutcomeType'])
    df['OutcomeType'] = le.transform(df['OutcomeType'])
    labelencoder = le  # keep the fitted encoder around for decoding later
    return df
def test_hard_vote():
    X, y, test_X, test_Y = get_test_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")

    # zip returns a lazy iterator on Python 3, so materialize it before indexing
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    # train_probs = probs[0]
    test_probs = probs[1]
    print(len(test_probs))

    preds = [x.idxmax(1) for x in test_probs]
    pred = np.zeros(len(preds[0]), dtype=np.int8)
    print(len(pred))
    for i in range(len(preds[0])):
        votes = [p[i] for p in preds]
        print(votes)
        pred[i] = max(set(votes), key=votes.count)  # majority vote
        print(pred[i])

    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)
    print(metrics.accuracy_score(test_Y, pred))
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

if __name__ == "__main__":
    pd.set_option('display.width', 300)
    # thousands: str, default None - the thousands separator, e.g. "," or "."
    data = pd.read_csv('../dataset/tel.csv', skipinitialspace=True, thousands=',')
    print('Raw data:\n', data.head(10))
    # print('data.columns = \n', data.columns)

    # Label-encode each column's categories, e.g. Married/Unmarried become 0 and 1
    le = LabelEncoder()  # encodes labels as 0..n-1, e.g. 5 classes map to 0/1/2/3/4
    for col in data.columns:
        data[col] = le.fit_transform(data[col])
    print('Processed data 1:\n', data.head(10))

    # Age binning: mark each age by the half-open bin it falls in, e.g.
    # [-1, 6) -> 0, [6, 12) -> 1, [12, 18) -> 2; the bin labels are arbitrary
    # but there must be one fewer label than bin edges
    bins = [-1, 6, 12, 18, 24, 35, 50, 70]
    data['age'] = pd.cut(data['age'], bins=bins, labels=np.arange(len(bins) - 1))
    # print('Processed data 2:\n', data['age'])

    # Columns to log-transform
    columns_log = ['income', 'tollten', 'longmon', 'tollmon', 'equipmon',
                   'cardmon', 'wiremon', 'longten', 'tollten', 'equipten',
                   'cardten', 'wireten']
    # MinMaxScaler rescales each feature individually so that, on the training
    # set, it lies within the given range, i.e. between 0 and 1
    mms = MinMaxScaler()
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split  # cross_validation was removed
from sklearn.neighbors import KNeighborsRegressor

# work on a copy to avoid pandas' SettingWithCopyWarning
dfProcess = df[['MAKE', 'year', 'mileage', 'engine cc', 'selling_price']].copy()
dfProcess['selling_price'] = pd.to_numeric(dfProcess['selling_price'])
data = dfProcess.values

# 1 - split features and target
X = data[:, 0:-1]
Y = data[:, -1]
Y.ravel()

# 2 - integer-encode each feature column
encoder = LabelEncoder()
for i in range(X.shape[1]):
    X[:, i] = encoder.fit_transform(X[:, i])

# 3 - scale to [0, 1]
minmax = MinMaxScaler()
X = minmax.fit_transform(X)

# 4 - train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.9,
                                                    random_state=42)

# 5 - model
model = KNeighborsRegressor(n_neighbors=5)
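# 6 - continuation sketch (ours), fitting and scoring the regressor defined above:
model.fit(X_train, Y_train)
print("R^2 on the held-out split:", model.score(X_test, Y_test))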
ds.loc[ds['SibSp'] == 1, ['Parch']] = 1
dssub.loc[dssub['SibSp'] == 1, ['Parch']] = 1
pid = sub.PassengerId.values

# handling missing values
# for Age
ds.Age = ds.Age.fillna(ds.Age.median())
dssub.Age = dssub.Age.fillna(dssub.Age.median())
# for Embarked
ds.Embarked = ds.Embarked.fillna('S')
dssub.Embarked = dssub.Embarked.fillna('S')

X_all = np.concatenate((X, X_sub), axis=0)
y = dataset.loc[:, 'Survived'].values

# Handling categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelencoder_X = LabelEncoder()
X_all[:, 1] = labelencoder_X.fit_transform(X_all[:, 1])
X_all[:, 5] = labelencoder_X.fit_transform(X_all[:, 5])
# OneHotEncoder's categorical_features argument was removed in scikit-learn
# 0.22; ColumnTransformer one-hot encodes columns 0 and 5 and passes the rest through
onehotencoder = ColumnTransformer([('onehot', OneHotEncoder(), [0, 5])],
                                  remainder='passthrough', sparse_threshold=0)
X_all = onehotencoder.fit_transform(X_all)
X = X_all[:891, :]
X_sub = X_all[891:, :]

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33,
                                                  random_state=1)

from keras.models import Sequential
from keras.layers import Dense
model = Sequential()
import pandas as pd

titanic = pd.read_csv('titanic.csv', encoding="shift-jis")
titanic = titanic.drop(['name', 'row.names'], axis=1)
mean = round(titanic['age'].mean(), 2)
titanic['age'].fillna(mean, inplace=True)
titanic.fillna("", inplace=True)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in titanic.columns.values.tolist():
    if i == 'age':
        pass
    else:
        titanic[i] = le.fit_transform(titanic[i])

from sklearn.model_selection import train_test_split
titanic_target = titanic['survived']
titanic_data = titanic.drop(['survived'], axis=1)
yX = titanic_target
yX = pd.concat([yX, titanic_data], axis=1)
yX.to_csv('temp.csv', encoding='utf-8')
X_train, X_test, y_train, y_test = train_test_split(
    titanic_data, titanic_target, test_size=0.2, random_state=54, shuffle=True)

from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=382, max_depth=None,
                           min_samples_split=2, random_state=8)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

dic = dict(zip(titanic_data.columns, clf.feature_importances_))
for item in sorted(dic.items(), key=lambda x: x[1], reverse=True):
    print(item[0], round(item[1], 4))
from matplotlib import pyplot as plt
%matplotlib inline

cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df.columns = [c.replace(' ', '_') for c in df.columns]
df['target'] = cancer.target
df['target'] = df.target.replace({0: 'malignant', 1: 'benign'})

# separation
target = 'target'
X = df.drop(target, axis=1)
y = df[target]
le = LabelEncoder()
y = le.fit_transform(y)

# experiment
feature = 'worst_concave_points'
p = 50
threshold = np.percentile(X[feature], p)  # use the percentile set above
feature_cuts = np.where(X[feature] > threshold, 'left', 'right')
# materialize the zip: pandas needs a concrete sequence on Python 3
decision = pd.DataFrame(list(zip(X[feature], feature_cuts, y)),
                        columns=['feature', 'cut', 'y'])
majority = decision.groupby('cut')['y'].mean()
# BUG: maybe could be the same, or maybe doesn't get rounded?
import numpy as np
from seqlearn.evaluation import bio_f_score
from seqlearn.hmm import MultinomialHMM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_validate

from data import *
from epam_nlp import CustomHMM, get_bio_f1

DATA_PATH = Path('../data')
RAW_DATA_PATH = DATA_PATH / 'processed.tsv'

df = load_data(RAW_DATA_PATH, nrows=1000)
X, y, lengths = get_X_y_lengths(df, cols_to_keep={'token'})

le = LabelEncoder()
ohe = OneHotEncoder(handle_unknown='ignore')
clf = CustomHMM(y=y)
pipeline = Pipeline([('one_hot', ohe), ('hmm', clf)])

cv = get_cv(lengths=lengths)
res = cross_validate(pipeline, X.reshape(-1, 1), y, cv=cv, n_jobs=1,
                     scoring=get_bio_f1)
print(res)

# cv = get_cv(X, y, lengths)
# i = 1
# scores = []
# Importing header files
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Code starts here

# Removing `,` from the column
data['Installs'] = data['Installs'].str.replace(',', '', regex=False)

# Removing `+` from the column (regex=False: a bare `+` is not a valid regex)
data['Installs'] = data['Installs'].str.replace('+', '', regex=False)

# Converting the column to `int` datatype
data['Installs'] = data['Installs'].astype(int)

# Creating a label encoder object
le = LabelEncoder()

# Label encoding the column to reduce the effect of a large range of values
data['Installs'] = le.fit_transform(data['Installs'])

# Setting figure size
plt.figure(figsize=(10, 10))

# Plotting a regression plot between Rating and Installs
sns.regplot(x="Installs", y="Rating", color='teal', data=data)

# Setting the title of the plot
plt.title('Rating vs Installs [RegPlot]', size=20)

# Code ends here
# The original sliced a single column (iloc[:, 1:2]), which breaks the
# imputation and encoding steps below; take all feature columns instead.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Replace missing values: substitute NaN with the column average
# (sklearn.preprocessing.Imputer was removed; SimpleImputer replaces it)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# categorical data: encode to integer variables
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# OneHotEncoder's categorical_features argument is gone; ColumnTransformer
# one-hot encodes column 0 and passes the rest through
onehotEncoder = ColumnTransformer([('onehot', OneHotEncoder(), [0])],
                                  remainder='passthrough', sparse_threshold=0)
X = onehotEncoder.fit_transform(X)

# Encoding the dependent variable if needed, from Yes/No to 1 and 0
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Splitting the dataset into the Training set and Test set
"""from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
from sklearn.pipeline import Pipeline

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# load dataset
dataframe = read_csv('dataset/sonar.csv', header=None, delimiter=',')
dataframe = dataframe.values

# split into input (X) and output (Y) variables
X = dataframe[:, :-1].astype(float)
Y = dataframe[:, -1]

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

# baseline model
def create_baseline():
    '''
    Larger   60 inputs -> [60] -> 1 output
    Larger   60 inputs -> [60 -> 30] -> 1 output
    Smaller  60 inputs -> [30] -> 1 output
import pandas as pd
dataset = pd.read_csv('enter your dataset')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)
print(score)

from sklearn.model_selection import cross_val_score
score = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
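# cross_val_score returns one accuracy per fold; a short summary step (ours):
import numpy as np
print("CV accuracy: %.3f +/- %.3f" % (np.mean(score), np.std(score)))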
data['DailyRate'].hist(bins=20)
sns.countplot(x=data['Attrition'], data=data, hue=data['Gender'])
sns.countplot(x=data['MaritalStatus'], hue=data['Attrition'], data=data)
sns.barplot(y=data['JobRole'], x=data['JobSatisfaction'], estimator=np.mean,
            data=data)

"""Label Encoding"""
from sklearn.preprocessing import LabelEncoder

cat_object = ['Attrition', 'BusinessTravel', 'Department', 'EducationField',
              'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
le = LabelEncoder()
for obj in cat_object:
    data[obj] = le.fit_transform(data[obj])
data.dtypes

corr = data.corr()
f, ax = plt.subplots(figsize=(16, 9))
sns.heatmap(corr, xticklabels=corr.columns.values,
            yticklabels=corr.columns.values, square=True)

data.drop(['Over18'], axis=1, inplace=True)

k = 10
cols = corr.nlargest(k, 'Attrition')['Attrition'].index
cm = np.corrcoef(data[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, fmt='.2f',
                 annot_kws={'size': 10}, yticklabels=cols.values,
                 xticklabels=cols.values)
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4:5].values

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelencoder_X = LabelEncoder()
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])
# categorical_features was removed from OneHotEncoder in scikit-learn 0.22;
# ColumnTransformer one-hot encodes column 3 and passes the rest through
onehotencoder = ColumnTransformer([('onehot', OneHotEncoder(), [3])],
                                  remainder='passthrough', sparse_threshold=0)
X = onehotencoder.fit_transform(X)

# Splitting into a training and a test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25,
                                                    random_state=0)

from sklearn.linear_model import LinearRegression as LR
regressor = LR()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
regressor.score(X_test, y_test)

# note: with several feature columns this draws one line per column
plt.plot(X_test, y_test, color='g')
def extract_features_from_model(base_model, model_name, feature_shape):
    label_encoder = None
    # loop over the data splits
    for split in (config.TRAIN, config.TEST, config.VAL):
        # grab all image paths in the current split
        p = os.path.sep.join([config.MODEL_DATASET_PATH, split])
        imagePaths = list(paths.list_images(p))

        # randomly shuffle the image paths and then extract the class labels
        # from the file paths. It is more efficient to shuffle the classes
        # now, instead of during the training.
        random.shuffle(imagePaths)

        # get the labels in the same order as the shuffled image paths, e.g.
        # path/dataset/training/nonfood/0_123.jpg - index -2 picks 'nonfood'
        labels = [imagePath.split(os.path.sep)[-2] for imagePath in imagePaths]

        # if the label encoder is None, create it
        if label_encoder is None:
            label_encoder = LabelEncoder()
            label_encoder.fit(labels)

        # open the output CSV file for writing
        Path(config.BASE_CSV_PATH.format(model_name)).mkdir(parents=True,
                                                            exist_ok=True)
        csvPath = os.path.sep.join(
            [config.BASE_CSV_PATH.format(model_name), f"{split}.csv"])
        csv = open(csvPath, "w")

        # feed the images through the model in batches that match the batch
        # size to get the resulting feature vectors
        for (b, i) in enumerate(range(0, len(imagePaths), config.BATCH_SIZE)):
            # extract the batch of images and labels, then initialize the
            # list of actual images that will be passed through the network
            # for feature extraction
            logger.info(
                f"Processing batch {b+1}/{int(np.ceil(len(imagePaths)/float(config.BATCH_SIZE)))}"
            )
            batchPaths = imagePaths[i:i + config.BATCH_SIZE]
            batchLabels = label_encoder.transform(labels[i:i + config.BATCH_SIZE])
            batchImages = []

            for imagePath in batchPaths:
                # load the input image using the keras helper utility while
                # ensuring the image is resized to 224x224 pixels
                image = load_img(imagePath, target_size=(224, 224))
                image = img_to_array(image)

                # preprocess the image by:
                # 1 - expanding the dimensions, because the model expects an
                #     array of arrays of image values while `image` is
                #     currently a single array
                image = np.expand_dims(image, axis=0)
                # 2 - subtracting the mean RGB pixel intensity from the
                #     ImageNet dataset
                image = imagenet_utils.preprocess_input(image)

                # add the image to the batch collection
                batchImages.append(image)

            # at this point we are ready to pass the images through the model
            # network to extract the features, in this case a vector of size
            # 7*7*512; use the outputs as our actual features, then reshape
            # them into a flattened volume
            batchImages = np.vstack(batchImages)
            # recall our base_model has the front FCN layer REMOVED, so we are
            # getting the output of the convolutional network
            features = base_model.predict(batchImages,
                                          batch_size=config.BATCH_SIZE)
            # reshape features into an array of arrays
            features = features.reshape((features.shape[0], feature_shape))

            # loop over the class labels and extracted features
            for (label, vec) in zip(batchLabels, features):
                # construct a row that consists of the class label and the
                # extracted features
                vec = ",".join([str(v) for v in vec])
                csv.write(f"{label},{vec}\n")

        # close file
        csv.close()

    Path(config.LE_PATH.format(model_name)).mkdir(parents=True, exist_ok=True)
    f = open(config.LE_FILE.format(model_name), "wb")
    f.write(pickle.dumps(label_encoder))
    f.close()
    train_df[col + '_target_mean'] = train_df[col].map(temp_dict)
    test_df[col + '_target_mean'] = test_df[col].map(temp_dict)

# %% [code]
########################### Encode Str columns
for col in list(train_df):
    if train_df[col].dtype == 'O':
        print(col)
        train_df[col] = train_df[col].fillna('unseen_before_label')
        test_df[col] = test_df[col].fillna('unseen_before_label')
        train_df[col] = train_df[col].astype(str)
        test_df[col] = test_df[col].astype(str)

        le = LabelEncoder()
        le.fit(list(train_df[col]) + list(test_df[col]))
        train_df[col] = le.transform(train_df[col])
        test_df[col] = le.transform(test_df[col])

        train_df[col] = train_df[col].astype('category')
        test_df[col] = test_df[col].astype('category')

# %% [code]
########################### TransactionAmt
# Let's add some kind of client uID based on cardID and addr columns.
# The value will be very specific for each client, so we need to remove it
# from the final features. But we can use it for aggregations.
train_df['uid'] = train_df['card1'].astype(str) + '_' + train_df[
    'card2'].astype(str) + '_' + train_df['card3'].astype(
        kind='bar'
    )  # the bar chart is unreliable because some countries end up not showing
    plt.show()

def yearCrisis():
    pd.crosstab(x.country, y).plot(
        kind='bar'
    )  # the bar chart is unreliable because some countries end up not showing
    plt.show()

'''In order to handle our data we replace the crisis / no_crisis values in
banking crisis with 0s and 1s'''
le = LabelEncoder()
x['country'] = le.fit_transform(x['country'])  # countries are labelled from 0 to 13
y = le.fit_transform(y)  # same goes for the crisis / no_crisis values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1000,
                                                    random_state=0)
# print(y_test)

# LOGISTIC REGRESSION
lr = LogisticRegression()
lr.fit(x_train, y_train)
def create_handler():
    train_x = []
    train_y = []
    train_idx = []
    test_x = []
    test_y = []
    test_idx = []
    print(csv_original)

    if problem_type.active == 1:
        notifier.text = """ Making DB - Regression """
    else:
        notifier.text = """ Making DB - Classification """

    if len(param_key.value) == 0:
        slide = round(slider_window.value)
        xs = csv_original[param_x.value].values
        ys = csv_original[param_y.value].values
        train_ratio = round(slider_train_ratio.value)
        if train_ratio != 0:
            # floor division: a float is not a valid slice index on Python 3
            train_x_set = xs[:xs.shape[0] * train_ratio // 100]
            train_y_set = ys[:ys.shape[0] * train_ratio // 100]
            for start, stop in zip(range(0, train_x_set.shape[0] - slide),
                                   range(slide, train_x_set.shape[0])):
                train_x.append(train_x_set[start:start + slide])
                train_y.append(train_y_set[start:start + slide][-1])
                train_idx.append('0')
            train_x = np.asarray(train_x)
            train_x = np.swapaxes(train_x, 1, 2)
            train_x = np.expand_dims(train_x, -1)
            train_idx = np.asarray(train_idx)
        if train_ratio != 100:
            test_x_set = xs[xs.shape[0] * train_ratio // 100:]
            test_y_set = ys[ys.shape[0] * train_ratio // 100:]
            for start, stop in zip(range(0, test_x_set.shape[0] - slide),
                                   range(slide, test_x_set.shape[0])):
                test_x.append(test_x_set[start:start + slide])
                test_y.append(test_y_set[start:start + slide][-1])
                test_idx.append('1')
            test_x = np.asarray(test_x)
            test_x = np.swapaxes(test_x, 1, 2)
            test_x = np.expand_dims(test_x, -1)
            test_idx = np.asarray(test_idx)
        train_x = np.asarray(train_x)
        test_x = np.asarray(test_x)
        all_y = train_y + test_y
        data_train = {}
        data_test = {}
        if problem_type.active == 1:
            train_y = np.asarray(train_y)
            train_y = np.expand_dims(train_y, -1)
            test_y = np.asarray(test_y)
            test_y = np.expand_dims(test_y, -1)
        elif problem_type.active == 0:
            # integer-encode the class labels, then one-hot them for Keras
            encoder = LabelEncoder()
            encoder.fit(all_y)
            encoded_y = encoder.transform(all_y)
            category_y = np_utils.to_categorical(encoded_y)
            labels = []
            for y in all_y:
                if y not in labels:
                    labels.append(y)
            data_train['labels'] = np.asarray(labels)
            data_test['labels'] = np.asarray(labels)
            train_y = category_y[:train_x.shape[0]]
            test_y = category_y[train_x.shape[0]:]
        data_train['x'] = train_x
        data_train['y'] = train_y
        data_train['key1'] = train_idx
        data_train['slideing_window'] = slider_window.value  # key name kept as written
        data_test['x'] = test_x
        data_test['y'] = test_y
        data_test['key1'] = test_idx
        data_test['slideing_window'] = slider_window.value
        if problem_type.active == 1:
            print("Regression")
            target_dir = 'Regression/'
        elif problem_type.active == 0:
            print("Classification")
            target_dir = 'Classification/'
        time_window = '[' + str(round(slider_window.value)) + ']'
        if train_ratio != 0:
            np.save("./np/" + target_dir + time_window + text_title.value
                    + "_train.npy", data_train)
        if train_ratio != 100:
            np.save("./np/" + target_dir + time_window + text_title.value
                    + "_test.npy", data_test)

    elif len(param_key.value) == 1:
        key1_list = list(csv_original[param_key.value[0]].unique())
        train_ratio = round(slider_train_ratio.value)
        if train_ratio == 0:
            train_key = []
            test_key = key1_list[int(len(key1_list) * train_ratio / 100):]
        elif train_ratio == 100:
            train_key = key1_list[:int(len(key1_list) * train_ratio / 100)]
            test_key = []
        else:
            train_key = key1_list[:int(len(key1_list) * train_ratio / 100)]
            test_key = key1_list[int(len(key1_list) * train_ratio / 100):]
        for key in train_key:
            num_elements = csv_original[csv_original[param_key.value[0]] == key].shape[0]
            slide = round(slider_window.value)
            if num_elements < slide:
                continue
            xs = csv_original[csv_original[param_key.value[0]] == key][param_x.value].values
            ys = csv_original[csv_original[param_key.value[0]] == key][param_y.value].values
            for start, stop in zip(range(0, num_elements - slide),
                                   range(slide, num_elements)):
                train_x.append(xs[start:start + slide])
                train_y.append(ys[start:start + slide][-1])
                train_idx.append(key)
        for key in test_key:
            num_elements = csv_original[csv_original[param_key.value[0]] == key].shape[0]
            slide = round(slider_window.value)
            if num_elements < slide:
                continue
            xs = csv_original[csv_original[param_key.value[0]] == key][param_x.value].values
            ys = csv_original[csv_original[param_key.value[0]] == key][param_y.value].values
            for start, stop in zip(range(0, num_elements - slide),
                                   range(slide, num_elements)):
                test_x.append(xs[start:start + slide])
                test_y.append(ys[start:start + slide][-1])
                test_idx.append(key)
        all_y = train_y + test_y
        train_x = np.asarray(train_x)
        if train_ratio != 0:
            train_x = np.swapaxes(train_x, 1, 2)
            train_x = np.expand_dims(train_x, -1)
            train_idx = np.asarray(train_idx)
        test_x = np.asarray(test_x)
        if train_ratio != 100:
            test_x = np.swapaxes(test_x, 1, 2)
            test_x = np.expand_dims(test_x, -1)
            test_idx = np.asarray(test_idx)
        data_train = {}
        data_test = {}
        if problem_type.active == 1:
            train_y = np.asarray(train_y)
            train_y = np.expand_dims(train_y, -1)
            test_y = np.asarray(test_y)
            test_y = np.expand_dims(test_y, -1)
        elif problem_type.active == 0:
            encoder = LabelEncoder()
            encoder.fit(all_y)
            encoded_y = encoder.transform(all_y)
            category_y = np_utils.to_categorical(encoded_y)
            labels = []
            for y in all_y:
                if y not in labels:
                    labels.append(y)
            data_train['labels'] = np.asarray(labels)
            data_test['labels'] = np.asarray(labels)
            train_y = category_y[:train_x.shape[0]]
            test_y = category_y[train_x.shape[0]:]
        data_train['x'] = train_x
        data_train['y'] = train_y
        data_train['key1'] = train_idx
        data_train['slideing_window'] = slider_window.value
        data_test['x'] = test_x
        data_test['y'] = test_y
        data_test['key1'] = test_idx
        data_test['slideing_window'] = slider_window.value
        print(train_x.shape)
        print(train_y.shape)
        print(test_x.shape)
        print(test_y.shape)
        if problem_type.active == 1:
            print("Regression")
            target_dir = 'Regression/'
        elif problem_type.active == 0:
            print("Classification")
            target_dir = 'Classification/'
        print(train_x.shape)
        time_window = '[' + str(round(slider_window.value)) + ']'
        if train_ratio == 0:
            np.save("./np/" + target_dir + time_window + text_title.value
                    + "_test.npy", data_test)
        elif train_ratio == 100:
            np.save("./np/" + target_dir + time_window + text_title.value
                    + "_train.npy", data_train)
        else:
            np.save("./np/" + target_dir + time_window + text_title.value
                    + "_train.npy", data_train)
            np.save("./np/" + target_dir + time_window + text_title.value
                    + "_test.npy", data_test)

    elif len(param_key.value) == 2:
        keys_list = csv_original[param_key.value].drop_duplicates()
        train_ratio = round(slider_train_ratio.value)
        if train_ratio == 0:
            train_key = []
            test_key = keys_list.iloc[int(len(keys_list) * slider_train_ratio.value / 100):]
        elif train_ratio == 100:
            train_key = keys_list.iloc[:int(len(keys_list) * slider_train_ratio.value / 100)]
            test_key = []
        else:
            train_key = keys_list.iloc[:int(len(keys_list) * slider_train_ratio.value / 100)]
            test_key = keys_list.iloc[int(len(keys_list) * slider_train_ratio.value / 100):]
        if train_ratio != 0:
            for index, row in train_key.iterrows():
                key1 = row[param_key.value[0]]
                key2 = row[param_key.value[1]]
                cond1 = csv_original[param_key.value[0]] == key1
                # the original compared key2 against param_key.value[0] here,
                # which can never match both keys; the test branch below uses
                # value[1], so do the same
                cond2 = csv_original[param_key.value[1]] == key2
                csv_target = csv_original[cond1 & cond2]
                num_elements = csv_target.shape[0]
                slide = round(slider_window.value)  # int needed for range()/slicing
                if num_elements < slide:
                    continue
                xs = csv_target[param_x.value].values
                ys = csv_target[param_y.value].values
                for start, stop in zip(range(0, num_elements - slide),
                                       range(slide, num_elements)):
                    train_x.append(xs[start:start + slide])
                    train_y.append(ys[start:start + slide][-1])
                    train_idx.append(str(key1) + "_" + str(key2))
        if train_ratio != 100:
            for index, row in test_key.iterrows():
                key1 = row[param_key.value[0]]
                key2 = row[param_key.value[1]]
                cond1 = csv_original[param_key.value[0]] == key1
                cond2 = csv_original[param_key.value[1]] == key2
                csv_target = csv_original[cond1 & cond2]
                num_elements = csv_target.shape[0]
                slide = round(slider_window.value)
                if num_elements < slide:
                    continue
                xs = csv_target[param_x.value].values
                ys = csv_target[param_y.value].values
                for start, stop in zip(range(0, num_elements - slide),
                                       range(slide, num_elements)):
                    test_x.append(xs[start:start + slide])
                    test_y.append(ys[start:start + slide][-1])
                    test_idx.append(str(key1) + "_" + str(key2))
        all_y = train_y + test_y
        train_x = np.asarray(train_x)
        if train_ratio != 0:
            train_x = np.swapaxes(train_x, 1, 2)
            train_x = np.expand_dims(train_x, -1)
            train_idx = np.asarray(train_idx)
        test_x = np.asarray(test_x)
        if train_ratio != 100:
            test_x = np.swapaxes(test_x, 1, 2)
            test_x = np.expand_dims(test_x, -1)
            test_idx = np.asarray(test_idx)
        data_train = {}
        data_test = {}
        if problem_type.active == 1:
            train_y = np.asarray(train_y)
            train_y = np.expand_dims(train_y, -1)
            test_y = np.asarray(test_y)
            test_y = np.expand_dims(test_y, -1)
        elif problem_type.active == 0:
            encoder = LabelEncoder()
            encoder.fit(all_y)
            encoded_y = encoder.transform(all_y)
            category_y = np_utils.to_categorical(encoded_y)
            labels = []
            for y in all_y:
                if y not in labels:
                    labels.append(y)
            data_train['labels'] = np.asarray(labels)
            data_test['labels'] = np.asarray(labels)
            train_y = category_y[:train_x.shape[0]]
            test_y = category_y[train_x.shape[0]:]
        data_train['x'] = train_x
        data_train['y'] = train_y
        data_train['key1'] = train_idx
        data_train['slideing_window'] = slider_window.value
        data_test['x'] = test_x
        data_test['y'] = test_y
        data_test['key1'] = test_idx
        data_test['slideing_window'] = slider_window.value
        print(train_x.shape)
        print(train_y.shape)
        print(test_x.shape)
        print(test_y.shape)
        if problem_type.active == 1:
            print("Regression")
            target_dir = 'Regression/'
        elif problem_type.active == 0:
            print("Classification")
            target_dir = 'Classification/'
        print(train_x.shape)
        time_window = '[' + str(round(slider_window.value)) + ']'
        if train_ratio == 0:
            np.save("./np/" + target_dir + time_window + text_title.value
                    + "_test.npy", data_test)
        elif train_ratio == 100:
            np.save("./np/" + target_dir + time_window + text_title.value
                    + "_train.npy", data_train)
        else:
            np.save("./np/" + target_dir + time_window + text_title.value
                    + "_train.npy", data_train)
            np.save("./np/" + target_dir + time_window + text_title.value
                    + "_test.npy", data_test)

    notifier.text = """ DB creation complete """
def train_log_reg(filename, sentiment_words_file, seed=42):
    train_df = load_finphrase(filename)

    # Samples
    pd.set_option("display.max_colwidth", None)  # -1 is deprecated
    logging.debug(train_df.sample(n=20, random_state=seed))

    # Encode the label
    le = LabelEncoder()
    le.fit(train_df["label"])
    train_df["label"] = le.transform(train_df["label"])
    logging.debug(list(le.classes_))
    logging.debug(train_df["label"])

    corpus = create_corpus(train_df)
    # visualize_frequent_words(corpus, stop_words)
    # generate_word_cloud(corpus, stop_words)

    # Load sentiment data
    sentiment_df = pd.read_csv(sentiment_words_file)
    # Make all words lower case
    sentiment_df["word"] = sentiment_df["word"].str.lower()
    sentiments = sentiment_df["sentiment"].unique()
    sentiment_df.groupby(by=["sentiment"]).count()
    sentiment_dict = {
        sentiment: sentiment_df.loc[sentiment_df["sentiment"] == sentiment][
            "word"
        ].values.tolist()
        for sentiment in sentiments
    }

    columns = [
        "tone_score",
        "word_count",
        "n_pos_words",
        "n_neg_words",
        "pos_words",
        "neg_words",
    ]

    # Analyze tone for the original text dataframe
    print(train_df.shape)
    tone_lmdict = [
        tone_count_with_negation_check(sentiment_dict, x.lower())
        for x in tqdm(train_df["sentence"], total=train_df.shape[0])
    ]
    tone_lmdict_df = pd.DataFrame(tone_lmdict, columns=columns)
    train_tone_df = pd.concat(
        [train_df, tone_lmdict_df.reindex(train_df.index)], axis=1
    )
    train_tone_df.head()

    # Show correlations with the label
    plt.figure(figsize=(10, 6))
    corr_columns = ["label", "n_pos_words", "n_neg_words"]
    sns.heatmap(
        train_tone_df[corr_columns].astype(float).corr(),
        cmap="coolwarm",
        annot=True,
        fmt=".2f",
        vmin=-1,
        vmax=1,
    )
    # plt.show()

    # Note: this first tone-feature split is superseded by the TF-IDF split below
    Y_data = train_tone_df["label"]
    X_data = train_tone_df[["tone_score", "n_pos_words", "n_neg_words"]]
    # Train test split (shuffle=False would reserve the most recent rows for test)
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X_data.values, Y_data.values, test_size=0.2, shuffle=True
    )

    # Tokenize
    tokenized, tokenized_text, bow, vocab, id2vocab, token_ids = tokenize_df(
        train_tone_df, col="sentence", lemma=True, stopwords=True, tokenizer="NLTK"
    )
    sns.distplot([len(x) for x in tokenized_text])

    # X and Y data used
    Y_data = train_tone_df["label"]
    X_data = tokenized_text
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X_data, Y_data.values, test_size=0.2, shuffle=True
    )

    pipeline = Pipeline(
        [("vec", TfidfVectorizer(analyzer="word")), ("clf", LogisticRegression())]
    )
    pipeline.fit(X_train, Y_train)
    pred_train = pipeline.predict(X_train)
    pred_test = pipeline.predict(X_test)

    # Define metrics: use accuracy and macro F1 to evaluate the model
    def metric(y_true, y_pred):
        acc = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, average="macro")
        return acc, f1

    acc, f1 = metric(Y_train, pred_train)
    logging.info("Training - acc: %.8f, f1: %.8f" % (acc, f1))
    acc, f1 = metric(Y_test, pred_test)
    logging.info("Test - acc: %.8f, f1: %.8f" % (acc, f1))
    return pipeline
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv'
dataframe = read_csv(url, header=None)
data = dataframe.values
# separate into input and output elements
X, y = data[:, :-1], data[:, -1]
# minimally prepare dataset
X = X.astype('float')
y = LabelEncoder().fit_transform(y.astype('str'))
# define the modeling pipeline
model = LogisticRegression(solver='liblinear')
# here we are normalizing
scaler = MinMaxScaler()
# the pipeline ensures scaling happens after splitting - here, inside each
# cross-validation fold
pipeline = Pipeline([('s', scaler), ('m', model)])
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model
m_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# summarize the result
print('Accuracy: %.3f (%.3f)' % (mean(m_scores), std(m_scores)))
class DistanceClassifier(BaseEstimator):
    """Distance-based classifier (DistanceClassifier) for feature construction
    in machine learning"""

    def __init__(self, d='mahalanobis'):
        """Sets up the DistanceClassifier algorithm

        Parameters
        ----------
        d: ('mahalanobis' or 'euclidean')
            Type of distance calculation to use

        Returns
        -------
        None
        """
        # Save params to be recalled later by get_params()
        self.params = locals()  # must be placed before any local variable definitions
        self.params.pop('self')
        self.d = d
        self.mu = None
        self.Z = None
        self.le = LabelEncoder()

    def fit(self, features, classes):
        """Constructs the DistanceClassifier from the provided training data

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix
        classes: array-like {n_samples}
            List of class labels for prediction

        Returns
        -------
        None
        """
        # class labels
        classes = self.le.fit_transform(classes)
        # group the data by class label
        X = []
        self.mu = []
        self.Z = []
        for i in np.unique(classes):
            X.append(features[classes == i])
            self.mu.append(np.mean(X[i], axis=0))
            if self.d == 'mahalanobis':
                self.Z.append(np.cov(X[i].transpose()))
        return self

    def predict(self, features):
        """Predict class outputs for an unlabelled feature set"""
        # get distance of features to class clusters
        distances = [self._distance(x) for x in features]
        # assign the class label belonging to the smallest distance
        class_predict = [np.argmin(d) for d in distances]
        return self.le.inverse_transform(class_predict)

    def _distance(self, x):
        """returns distance measures for features"""
        distance = np.empty([len(self.mu)])
        for i in np.arange(len(self.mu)):
            if self.d == 'mahalanobis' and self.is_invertible(self.Z[i]):
                distance[i] = (x - self.mu[i]).dot(
                    np.linalg.inv(self.Z[i])).dot((x - self.mu[i]).transpose())
            else:
                distance[i] = (x - self.mu[i]).dot((x - self.mu[i]).transpose())
        return distance

    def fit_predict(self, features, classes):
        """Convenience function that fits the provided data then predicts
        the class labels

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix
        classes: array-like {n_samples}
            List of true class labels

        Returns
        ----------
        array-like: {n_samples}
            Predicted class labels for the provided feature matrix
        """
        self.fit(features, classes)
        return self.predict(features)

    def score(self, features, classes, scoring_function=accuracy_score,
              **scoring_function_kwargs):
        """Estimates the accuracy of the predictions from the fitted model

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix to predict from
        classes: array-like {n_samples}
            List of true class labels

        Returns
        -------
        accuracy_score: float
            The estimated accuracy of the predictions
        """
        if not self.mu:
            raise ValueError('The DistanceClassifier model must be fit before score() can be called')
        return scoring_function(classes, self.predict(features),
                                **scoring_function_kwargs)

    def get_params(self, deep=None):
        """Get parameters for this estimator

        This function is necessary for DistanceClassifier to work as a drop-in
        estimator in, e.g., sklearn.model_selection.cross_val_score

        Parameters
        ----------
        deep: unused
            Only implemented to maintain interface for sklearn

        Returns
        -------
        params: mapping of string to any
            Parameter names mapped to their values
        """
        return self.params

    def is_invertible(self, X):
        """checks if Z is invertible"""
        if len(X.shape) == 2:
            return (X.shape[0] == X.shape[1]
                    and np.linalg.matrix_rank(X) == X.shape[0])
        return False
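# Quick usage sketch (ours) on synthetic blobs, exercising the scikit-learn
# style interface the class exposes:
import numpy as np
from sklearn.datasets import make_blobs

X_demo, y_demo = make_blobs(n_samples=200, centers=3, random_state=0)
dc = DistanceClassifier(d='euclidean')
pred = dc.fit_predict(X_demo, y_demo)
print("training accuracy:", dc.score(X_demo, y_demo))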
import matplotlib.pyplot as plt
import seaborn as sns

cols = list(cat_df)

fig, axes = plt.subplots(nrows=2, ncols=2)

for i in range(0, 2):
    for j in range(0, 2):
        sns.countplot(x=X_train[cols[i * 2 + j]], hue=y_train, ax=axes[i, j])

# --------------
# Importing header files
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Code starts here
for i in list(cat_df):
    # fill missing values before encoding (fillna returns a copy, so reassign)
    X_train[i] = X_train[i].fillna('NA')
    X_test[i] = X_test[i].fillna('NA')
    # fit the encoder on the training column and reuse it on the test column
    # so both sets share the same integer labels
    # (assumes the test column contains no categories unseen in training)
    le = LabelEncoder()
    X_train[i] = le.fit_transform(X_train[i])
    X_test[i] = le.transform(X_test[i])

y_train.replace({'No': 0, 'Yes': 1}, inplace=True)
y_test.replace({'No': 0, 'Yes': 1}, inplace=True)
# Code ends here

from sklearn.metrics import accuracy_score

model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
y_preds = model.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_preds))
import pickle
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

# load the target and device ids (binary mode so the pickles load under
# both Python 2 and 3)
target = pickle.load(open("../generated/group.p", "rb"))
device_id = pickle.load(open("../generated/device_id.p", "rb"))

trainDevices = pd.read_csv("../../../data/gender_age_train.csv", usecols=["device_id"])
indexes = pd.read_csv("../generated/raddarIndices.csv")
indexes = pd.merge(trainDevices, indexes, how="left", on="device_id",
                   left_index=True).reset_index().drop(["index"], axis=1)

##################
# Pre Processing
##################
targetEncoder = LabelEncoder()
target = targetEncoder.fit_transform(target)
skfTarget = target.copy()
# one-hot encode the integer labels for the softmax output
target = np_utils.to_categorical(target)

##################
# Build Model
##################
def modelBuilder():
    model = Sequential()
    model.add(Dense(200, input_dim=train.shape[1], init='normal', activation="tanh"))
    model.add(Dropout(0.4))
    # input_dim is only meaningful on the first layer, so it is omitted here
    model.add(Dense(70, init='normal'))
    model.add(PReLU())
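    # A hedged completion sketch -- the original snippet cuts off after the
    # PReLU layer. The output width is taken from the one-hot target so no
    # class count is hard-coded; the dropout rate and optimizer choice here
    # are assumptions, not taken from the original.
    model.add(Dropout(0.2))
    model.add(Dense(target.shape[1], init='normal', activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adadelta',
                  metrics=['accuracy'])
    return model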
df['Sentence'] = df['Sentence'].map(lambda x: clean_text(x))

# In[12]:

X, y = df['Sentence'], df['Emotions']

# ### Ex5: Transform y to one-hot-encoding

# In[15]:

onehot_y = None
# YOUR CODE HERE
from keras.utils import np_utils

encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)
onehot_y = np_utils.to_categorical(encoded_y)

# In[16]:

# collect the vocabulary; the set holds each unique token exactly once
# (the original printed len() of a throwaway list, i.e. the total token
# count rather than the unique word count)
vocab = set()
for s in X.values:
    for el in s.split(' '):
        vocab.add(el)
print("Total Unique words:", len(vocab))

# In[17]:

# distribution of sentence lengths (in tokens)
l = [len(s) for s in X.values]
counts = Counter(l)
plt.bar(list(counts.keys()), list(counts.values()))
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
import numpy as np

np.set_printoptions(precision=2)

# Load the sonar dataset.
sonar = pd.read_excel('../Datasets/sonar.xlsx', sheetname=0)

X = sonar.iloc[:, 0:(sonar.shape[1] - 1)]
le = LabelEncoder()
y = le.fit_transform(sonar.iloc[:, (sonar.shape[1] - 1)])
class_names = le.classes_

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Grid Search
# Choose the SVM parameters to be tested
params = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000]
}]
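# A hedged continuation sketch (the original snippet ends at the parameter
# grid): fit the grid search and report held-out performance. cv=5 and
# scoring='accuracy' are assumptions, not taken from the original.
clf = GridSearchCV(svm.SVC(), params, cv=5, scoring='accuracy')
clf.fit(X_train, y_train)
print('Best parameters:', clf.best_params_)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=class_names))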
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Activation, Dropout, Dense, Flatten
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

# Load the dataset
dataset = pd.read_csv("../../Datasets/RAVDESS/speechActorDataset.csv")

# Split features and labels (copy so the original frame is untouched)
datasetCopy = dataset.copy()
labels = datasetCopy['label']
features = datasetCopy.drop(columns='label')

# Train Test Split (70/30); slicing from trainIndex keeps every row --
# the original `trainIndex + 1:-1` silently dropped two samples
trainIndex = int(len(features) * 0.7)
train_features = features[:trainIndex]
train_labels = labels[:trainIndex]
test_features = features[trainIndex:]
test_labels = labels[trainIndex:]

lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(train_labels))
# reuse the fitted encoder so train and test share the same label mapping
# (refitting on the test labels, as the original did, can scramble columns;
# this assumes every class also appears in the test slice)
y_test = np_utils.to_categorical(lb.transform(test_labels))

# Add a channel dimension for the 1D-CNN model
x_traincnn = np.expand_dims(train_features, axis=2)
x_testcnn = np.expand_dims(test_features, axis=2)

model = Sequential()
model.add(Conv1D(256, 5, padding='same', input_shape=(216, 1)))
model.add(Activation('relu'))
model.add(Conv1D(128, 5, padding='same'))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(MaxPooling1D(pool_size=(8)))
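# A hedged completion sketch -- the original snippet stops at the pooling
# layer. The remaining layers, optimizer, and training settings are
# assumptions; the output width comes from y_train so the class count is
# not hard-coded.
model.add(Flatten())
model.add(Dense(y_train.shape[1]))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop',
              metrics=['accuracy'])
model.fit(x_traincnn, y_train, batch_size=16, epochs=50,
          validation_data=(x_testcnn, y_test))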
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import _name_estimators


class WeightAverageEnsembleClassifier(BaseEstimator, ClassifierMixin):
    """
    Custom ensemble classifier that combines its base classifiers by
    majority vote or by weighted averaging of their predicted probabilities.
    Inherits from the scikit-learn estimator base classes
    BaseEstimator and ClassifierMixin.
    """

    def __init__(self, classifiers, weights=None, train_modes=None,
                 vote_method="probability_vote", clone=False):
        """
        Args :
            classifiers : list <classifier object>
                List of base classifier objects.
            weights : list <float>
                Weight assigned to each base classifier.
                Attribute of the same name as the __init__() argument.
            vote_method : str ( "majority_vote" or "probability_vote" )
                How the ensemble makes its final decision.
                Attribute of the same name as the __init__() argument.
                "majority_vote"    : majority vote of the base classifiers
                                     (= argmax() over class-label counts)
                "probability_vote" : weighted average of the base classifiers'
                                     predicted probabilities
                                     (= argmax() over class membership probabilities)
        """
        self.classifiers = classifiers
        self.fitted_classifiers = classifiers
        # avoid the mutable-default pitfall (the original used weights=[]
        # and train_modes=[], which are shared across instances and break
        # np.average / np.bincount); None means "unweighted" downstream
        self.weights = weights if weights else None
        self.n_classes = 0
        self.n_classifier = len(classifiers)
        self.train_modes = list(train_modes) if train_modes else ["train"] * len(classifiers)
        self.vote_method = vote_method
        self.clone = clone
        self.encoder = LabelEncoder()

        # map each object given in classifiers to a generated name
        if classifiers is not None:
            self.named_classifiers = {key: value for key, value
                                      in _name_estimators(classifiers)}
        else:
            self.named_classifiers = {}

        for i, named_classifier in enumerate(self.named_classifiers):
            print("name {} : {}".format(
                i, self.named_classifiers[named_classifier]))

    def fit(self, X_train, y_train, X_valid=None, y_valid=None):
        # Use LabelEncoder so the class labels start at 0; this matters for
        # the np.argmax() calls inside self.predict().
        self.encoder.fit(y_train)
        y_train = self.encoder.transform(y_train)
        self.n_classes = len(self.encoder.classes_)

        # fit each base classifier set in self.classifiers
        # (a clone(clf) when self.clone is set)
        self.fitted_classifiers = []
        for c, clf in enumerate(self.classifiers):
            if self.train_modes[c] == "train":
                if self.clone:
                    # clone() : build a fresh estimator with the same parameters
                    fitted_clf = clone(clf).fit(X_train, y_train, X_valid, y_valid)
                else:
                    fitted_clf = clf.fit(X_train, y_train, X_valid, y_valid)
            else:
                fitted_clf = clf
            self.fitted_classifiers.append(fitted_clf)

        return self

    def predict(self, X_test):
        #------------------------------------------------------------------
        # vote_method == "probability_vote": weighted probability averaging
        #------------------------------------------------------------------
        if self.vote_method == "probability_vote":
            # np.argmax() with axis=1 picks, per sample (row), the index of
            # the largest averaged class probability
            vote_results = np.argmax(self.predict_proba(X_test), axis=1)
        #------------------------------------------------------------------
        # vote_method == "majority_vote": majority vote over class labels
        #------------------------------------------------------------------
        else:
            # collect each base classifier's predict() output
            predictions = [clf.predict(X_test)
                           for clf in self.fitted_classifiers]

            # transpose so each row holds one sample's predictions
            predictions = np.asarray(predictions).T

            # per sample, sum the (weighted) votes for each class label and
            # return the index of the label with the largest total
            vote_results = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1,
                arr=predictions)

        # map the encoded labels back to the original class labels
        vote_results = self.encoder.inverse_transform(vote_results)
        return vote_results

    def predict_proba(self, X_test):
        # collect each base classifier's predict_proba() output
        predict_probas = []
        for clf in self.fitted_classifiers:
            predict_probas.append(clf.predict_proba(X_test))

        # shape = [n_classifiers, n_samples, n_classes]
        predict_probas = np.asarray(predict_probas)

        # weighted average over the classifiers
        ave_probas = np.average(predict_probas, axis=0, weights=self.weights)

        return ave_probas
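# A minimal usage sketch for WeightAverageEnsembleClassifier (not part of
# the original code). Because fit() above forwards X_valid/y_valid to each
# base classifier's fit(), plain scikit-learn estimators are wrapped in a
# hypothetical adapter so the extra arguments are accepted and ignored.
# The iris data is illustrative.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

class SklearnFitWrapper:
    """Hypothetical adapter: swallows the X_valid/y_valid arguments."""
    def __init__(self, est):
        self.est = est
    def fit(self, X, y, X_valid=None, y_valid=None):
        self.est.fit(X, y)
        return self
    def predict(self, X):
        return self.est.predict(X)
    def predict_proba(self, X):
        return self.est.predict_proba(X)

iris = load_iris()
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target, random_state=0)

ensemble = WeightAverageEnsembleClassifier(
    classifiers=[SklearnFitWrapper(LogisticRegression()),
                 SklearnFitWrapper(DecisionTreeClassifier())],
    weights=[0.7, 0.3],
    vote_method="probability_vote")
ensemble.fit(X_tr, y_tr)
print("ensemble accuracy:", np.mean(ensemble.predict(X_te) == y_te))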
"""
Created on Sun Feb 17 13:33:36 2019

@author: sidha
"""

## Importing The Libraries
import pandas as pd
import matplotlib.pyplot as plt

## Importing a File
dataset = pd.read_csv("Iris.csv")
X = dataset.iloc[:, 1:5].values
Y = dataset.iloc[:, -1].values

## Encoding the Categorical Variable
from sklearn.preprocessing import LabelEncoder

labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

## Visualising
## Parallel Coordinates
from pandas.plotting import parallel_coordinates

plt.figure(figsize=(15, 10))
parallel_coordinates(dataset.drop("Id", axis=1), "Species")
plt.title("Parallel Coordinates Plot", fontsize=20, fontweight="bold")
plt.xlabel("Features", fontsize=15)
plt.ylabel("Features Values", fontsize=15)
plt.legend(loc=1, prop={"size": 15}, frameon=True, shadow=True,
           facecolor="White")
X[0] # In[6]: Y[0] # ## Preprocess the Data # In[7]: from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler le_Y = LabelEncoder() Y = le_Y.fit_transform(Y) Y = Y.reshape(len(Y), 1) ohe = OneHotEncoder(categorical_features=[0]) Y = ohe.fit_transform(Y).toarray() Y[0] # In[8]: sc_X = StandardScaler() X = sc_X.fit_transform(X) # In[9]:
def __encode_data(self, dataframe, label_to_encode):
    """Encode the given label column of the dataframe as integer classes."""
    y = dataframe[label_to_encode]
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)
    return y
""" import numpy as np import matplotlib.pyplot as plt import pandas as pd """## Preprocess""" dataset = pd.read_csv('Churn_Modelling.csv') X = dataset.iloc[:, 3:13].values y = dataset.iloc[:, 13].values X # Encoding categorical data from sklearn.preprocessing import LabelEncoder, OneHotEncoder # Country labelencoder_X_1 = LabelEncoder() X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1]) # Sex : male, female labelencoder_X_2 = LabelEncoder() X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2]) # encoded numeric values -> categorical onehotencoder = OneHotEncoder(categorical_features=[1]) X = onehotencoder.fit_transform(X).toarray() X = X[:, 1:] """### dataset""" from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
import pandas as pd
import lightgbm as lgb
import catboost as cat
import ngboost as ng
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


def main():
    train_0 = pd.read_csv('train.csv')
    test_0 = pd.read_csv('test.csv')
    #print(train_0.head(10))
    header = [
        'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
        'MaritalStatus', 'OverTime'
    ]

    # Drop features with no predictive value
    user_id = test_0['user_id']
    train_0 = train_0.drop(['user_id', 'EmployeeCount', 'Over18'], axis=1)
    test_0 = test_0.drop(['user_id', 'EmployeeCount', 'Over18'], axis=1)

    # Encode categorical features; fit on train and reuse the encoder on test
    # (assumes the test columns contain no unseen categories)
    for index in header:
        LE = LabelEncoder()
        train_0[index] = LE.fit_transform(train_0[index])
        test_0[index] = LE.transform(test_0[index])

    LE = LabelEncoder()
    label_0 = LE.fit_transform(train_0['Attrition'])
    train_0 = train_0.drop(['Attrition'], axis=1)

    # train_test_split returns (X_train, X_test, y_train, y_test); the
    # original names (train_x, train_y, label_x, label_y) hid that ordering
    X_train, X_test, y_train, y_test = train_test_split(train_0,
                                                        label_0,
                                                        test_size=0.3,
                                                        random_state=1)

    # Standardization
    # LGBM parameter tuning
    parameters = {
        'max_depth': [15, 20, 25],
        'learning_rate': [0.01, 0.05],
        'feature_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
        'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
        'bagging_freq': [2, 4, 5, 6, 8],
        'lambda_l1': [0.6, 0.7, 0.8],
        'lambda_l2': [0, 15, 35],
        'cat_smooth': [1, 10, 15]
    }

    LGB = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metric='auc',
        verbose=0,
        learning_rate=0.01,
        num_leaves=35,
        feature_fraction=0.8,
        bagging_fraction=0.7,
        bagging_freq=2,
        lambda_l1=0.8,
        lambda_l2=0,
        max_depth=15,
        #silent = False
        cat_smooth=1)

    # gsearch = GridSearchCV(LGB, param_grid=parameters, scoring='roc_auc', cv = 3)
    # gsearch.fit(train_0, label_0)
    #
    # print("Best score: %0.3f" % gsearch.best_score_)
    # print("Best parameters set:")
    # best_parameters = gsearch.best_estimator_.get_params()
    # for param_name in sorted(parameters.keys()):
    #     print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # LGB.fit(train_0, label_0)
    # predict = LGB.predict_proba(test_0)[:,1]
    #
    # test_0['Attrition'] = predict
    # test_0['user_id'] = user_id
    # test_0[['user_id','Attrition']].to_csv('submit_lgb.csv', index = False)

    LGB.fit(X_train, y_train)
    predict_lgb = LGB.predict_proba(X_test)[:, 1]
    print("LGB auc:%0.6lf" % metrics.roc_auc_score(y_test, predict_lgb))

    SVM = SVC(kernel='rbf', probability=True, C=0.2)
    SVM.fit(X_train, y_train)
    predict_svm = SVM.predict_proba(X_test)[:, 1]
    print("SVM auc:%0.6lf" % metrics.roc_auc_score(y_test, predict_svm))

    CAT = cat.CatBoostClassifier()
    CAT.fit(X_train, y_train)
    # use a distinct name; the original overwrote predict_svm here
    predict_cat = CAT.predict_proba(X_test)[:, 1]
    print("cat auc:%0.6lf" % metrics.roc_auc_score(y_test, predict_cat))

    NG = ng.NGBClassifier()
    NG.fit(X_train, y_train)
    predict_ng = NG.pred_dist(X_test)
    predict_ng = predict_ng.probs[1, :]
    print("NG auc:%0.6lf" % metrics.roc_auc_score(y_test, predict_ng))
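    # A hedged follow-up sketch (not in the original): blend the four models'
    # probabilities with a simple unweighted average and score the blend.
    # Equal weights are an assumption; tuning them on a validation fold
    # would be the usual next step.
    blend = (predict_lgb + predict_svm + predict_cat + predict_ng) / 4.0
    print("blend auc:%0.6lf" % metrics.roc_auc_score(y_test, blend))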