def get_test(dim=128, maxlen=500, name='test.csv', events=None):
    X_train = pd.read_csv(path + name, dtype={'device_id': str})
    X_train["app_lab"] = X_train["device_id"].map(events)
    X_train.fillna('0 ', inplace=True)
    x_train = X_train["app_lab"].values

    phone_brand_device_model = pd.read_csv(path + 'phone_brand_device_model.csv',
                                           dtype={'device_id': str})
    phone_brand_device_model.drop_duplicates('device_id', keep='first', inplace=True)
    phone_brand_le = LabelEncoder()
    phone_brand_device_model['phone_brand'] = phone_brand_le.fit_transform(
        phone_brand_device_model['phone_brand'])
    device_model_le = LabelEncoder()
    # use the dedicated device_model encoder (the original mistakenly reused phone_brand_le)
    phone_brand_device_model['device_model'] = device_model_le.fit_transform(
        phone_brand_device_model['device_model'])

    # join on the key column only; passing left_index=True together with `on` is invalid
    X_train = pd.merge(X_train, phone_brand_device_model, how='left', on='device_id')
    X_train.fillna(0, inplace=True)
    phone_brand = X_train['phone_brand'].values
    device_model = X_train['device_model'].values

    x_train = [x.split(' ') for x in x_train]
    for i in range(len(x_train)):
        x_train[i] = [np.int8(idx) for idx in x_train[i] if idx != 'nan' and idx != '']
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_train = [x_train, phone_brand, device_model]
    return x_train
def load_data():
    train_list = []
    for line in open('../data/train_clean.json', 'r'):
        train_list.append(json.loads(line))
    train = pd.DataFrame(train_list)
    # train_work = train[names[-1]]
    test_list = []
    for line in open('../data/test_clean.json', 'r'):
        test_list.append(json.loads(line))
    test = pd.DataFrame(test_list)

    print('--- NLP on major: simply keep the first segmented word')
    le = LabelEncoder()
    print(len(set(train['major'])))
    train['major'] = train['major'].apply(
        lambda x: " ".join(jieba.cut(x, cut_all=False)).split()[0]
        if x is not None and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')
    test['major'] = test['major'].apply(
        lambda x: " ".join(jieba.cut(x, cut_all=False)).split()[0]
        if x is not None and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')
    print(len(set(train['major'])))
    # fit on the union of train and test so the 'major' encoding is consistent
    le.fit(list(train['major']) + list(test['major']))
    train['major'] = le.transform(train['major'])
    test['major'] = le.transform(test['major'])

    le = LabelEncoder()
    train['gender'] = le.fit_transform(train['gender'])
    names = train.columns
    le = LabelEncoder()
    test['gender'] = le.fit_transform(test['gender'])

    del train['_id']
    del test['_id']
    train = train.fillna(0)
    test = test.fillna(0)
    # test['age'] = test['age'].apply(lambda x: int(x.replace(u'岁', '').encode('ascii')))
    return train, test
def label_encode_train_test_sets(train, test):
    """Label encode the 'supplier' and 'bracket_pricing' features for both the train and test sets."""
    test_suppliers = np.sort(pd.unique(test.supplier.ravel()))
    print("Test suppliers shape & elements: ", test_suppliers.shape, test_suppliers)
    train_suppliers = np.sort(pd.unique(train.supplier.ravel()))
    print("Train suppliers shape & elements: ", train_suppliers.shape, train_suppliers)

    # Merge 'supplier' values from both datasets first because we want the
    # encoding to be consistent across both.
    # http://docs.scipy.org/doc/numpy/reference/generated/numpy.sort.html
    supplier_ids = []
    supplier_ids.extend(train_suppliers)
    supplier_ids.extend(test_suppliers)
    supplier_ids = np.sort(np.unique(supplier_ids))
    print("Merged supplier_ids.shape: ", supplier_ids.shape)
    # print("supplier_ids.elements: ", supplier_ids)

    # Fit the label encoder on the merged array, then transform train and test individually.
    print("Performing label encoding on supplier column...")
    label_e = LabelEncoder()
    label_e.fit(supplier_ids)
    train['supplier'] = label_e.transform(train['supplier'])
    test['supplier'] = label_e.transform(test['supplier'])

    # Perform label encoding on 'bracket_pricing'
    print("Performing label encoding on bracket_pricing column...")
    train['bracket_pricing'] = label_e.fit_transform(train['bracket_pricing'])
    test['bracket_pricing'] = label_e.fit_transform(test['bracket_pricing'])
    return train, test
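The union-fit pattern above generalizes to any categorical column shared by train and test. A minimal helper along those lines (a sketch with our own function name, not part of the original code):

def encode_consistently(train, test, column):
    """Fit one LabelEncoder on the union of train/test values, then transform both."""
    le = LabelEncoder()
    le.fit(pd.concat([train[column], test[column]], ignore_index=True))
    train[column] = le.transform(train[column])
    test[column] = le.transform(test[column])
    return le

Note that 'bracket_pricing' above is encoded with two independent fit_transform calls, which only stays consistent when both sets happen to contain the same sorted unique values; a union fit like this is the safer default.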
def prepare_items_features(user_items_csv, out_dir):
    array = np.loadtxt(user_items_csv, delimiter='|', dtype=np.dtype(np.uint64))
    le = LabelEncoder()
    col1 = le.fit_transform(array[:, 1].T)
    col2 = le.fit_transform(array[:, 2].T)
    col3 = le.fit_transform(array[:, 3].T)
    col4 = le.fit_transform(array[:, 4].T)
    columns = np.array([col1, col2, col3, col4]).T
    enc = OneHotEncoder()
    print(array[:10])
    encoded = np.c_[array[:, 0], enc.fit_transform(columns).toarray()]
    print(encoded[:10])
    print(encoded.shape)

    # aggregate the one-hot rows per user
    user_id = encoded[0][0]
    rows = []
    current = np.zeros(encoded.shape[1] - 1)
    for i in range(encoded.shape[0]):
        if encoded[i][0] != user_id:
            rows.append(np.concatenate([[user_id], current]))
            user_id = encoded[i][0]
            current = np.zeros(encoded.shape[1] - 1)
        # always accumulate row i; the original only summed in the else-branch,
        # silently dropping the first row of every user after the first
        current = np.sum([current, encoded[i, 1:]], axis=0)
    rows.append(np.concatenate([[user_id], current]))
    array = np.array(rows)
    print(array.shape)

    # serialize the aggregated array
    np.save(os.path.join(out_dir, "user_items"), array)
def process_raw_label():
    df = pd.DataFrame([
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1']])
    df.columns = ['color', 'size', 'price', 'classlabel']
    print(df)

    # map the ordinal 'size' feature by hand
    size_mapping = {'XL': 3, 'L': 2, 'M': 1}
    df['size'] = df['size'].map(size_mapping)
    print(df)

    # map class labels to integers and back again
    class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
    print(class_mapping)
    df['classlabel'] = df['classlabel'].map(class_mapping)
    print(df)
    inv_class_mapping = {v: k for k, v in class_mapping.items()}
    df['classlabel'] = df['classlabel'].map(inv_class_mapping)
    print(df)

    # the same with LabelEncoder
    class_le = LabelEncoder()
    y = class_le.fit_transform(df['classlabel'].values)
    print(y)

    x = df[['color', 'size', 'price']].values
    print(x)
    color_le = LabelEncoder()
    x[:, 0] = color_le.fit_transform(x[:, 0])
    print('label encoder\n', x)

    # one-hot encode the color column (categorical_features requires sklearn < 0.22)
    ohe = OneHotEncoder(categorical_features=[0], sparse=False)
    x = ohe.fit_transform(x)
    print(x)
    print(pd.get_dummies(df[['price', 'color', 'size']]))
def train_test(self, X, y, X_test):
    """Two-stage training: nn0 separates classes {1, 2, 3} from the rest,
    nn1 discriminates within {1, 2, 3}, and nn2 within the remaining classes."""
    le = LabelEncoder()
    id_123 = np.logical_or(np.logical_or(y == 1, y == 2), y == 3)
    y0 = np.zeros(len(y), dtype=np.int32)
    y0[id_123] = 1
    X0 = np.copy(X)
    y0 = le.fit_transform(y0).astype(np.int32)
    X1 = X[id_123]
    y1 = y[id_123]
    y1 = le.fit_transform(y1).astype(np.int32)
    X2 = X[np.logical_not(id_123)]
    y2 = y[np.logical_not(id_123)]
    y2 = le.fit_transform(y2).astype(np.int32)

    print('working on nn0...')
    self.nn0.max_epochs = self.early_stopping0.best_valid_epoch
    self.nn0.verbose = 0
    self.nn0.fit(X0, y0)
    y0_pred = self.nn0.predict_proba(X_test)

    print('working on nn1...')
    self.nn1.max_epochs = self.early_stopping1.best_valid_epoch
    self.nn1.verbose = 0
    self.nn1.fit(X1, y1)
    y1_pred = self.nn1.predict_proba(X_test)

    print('working on nn2...')
    self.nn2.max_epochs = self.early_stopping2.best_valid_epoch
    self.nn2.verbose = 0
    self.nn2.fit(X2, y2)
    y2_pred = self.nn2.predict_proba(X_test)

    # combine the stage probabilities into the 9-class prediction
    y_pred = np.zeros((y0_pred.shape[0], 9))
    y_pred[:, 0] = y0_pred[:, 0] * y2_pred[:, 0]
    y_pred[:, 1] = y0_pred[:, 1] * y1_pred[:, 0]
    y_pred[:, 2] = y0_pred[:, 1] * y1_pred[:, 1]
    y_pred[:, 3] = y0_pred[:, 1] * y1_pred[:, 2]
    y_pred[:, 4] = y0_pred[:, 0] * y2_pred[:, 1]
    y_pred[:, 5] = y0_pred[:, 0] * y2_pred[:, 2]
    y_pred[:, 6] = y0_pred[:, 0] * y2_pred[:, 3]
    y_pred[:, 7] = y0_pred[:, 0] * y2_pred[:, 4]
    y_pred[:, 8] = y0_pred[:, 0] * y2_pred[:, 5]
    yp0 = y_pred

    # average with a calibrated flat classifier
    self.cal_clf.fit(X, y)
    yp1 = self.cal_clf.predict_proba(X_test)
    y_pred = (yp0 + yp1) / 2.
    return y_pred
def label_encoding(self, x: pd.DataFrame, y: pd.DataFrame, services: list) -> (pd.DataFrame, pd.DataFrame):
    le = LabelEncoder()
    le = le.fit(services)
    x['service'] = le.transform(x['service'])
    # note: each fit_transform below refits the same encoder, so after the loop
    # le.classes_ (printed at the end) reflects the labels of y, not 'service'
    for feature in ["protocol_type", "flag"]:
        x[feature] = le.fit_transform(x[feature])
    y = le.fit_transform(y)
    print(le.classes_)
    return x, y
def test_label_encoder_fit_transform():
    """Test fit_transform"""
    le = LabelEncoder()
    ret = le.fit_transform([1, 1, 4, 5, -1, 0])
    assert_array_equal(ret, [2, 2, 3, 4, 0, 1])

    le = LabelEncoder()
    ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])
    assert_array_equal(ret, [1, 1, 2, 0])
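A companion round-trip check in the same style (a sketch, assuming the same assert_array_equal helper is in scope) could also pin down inverse_transform:

def test_label_encoder_inverse_transform():
    """Sketch: inverse_transform should undo fit_transform."""
    le = LabelEncoder()
    encoded = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])
    assert_array_equal(le.inverse_transform(encoded),
                       ["paris", "paris", "tokyo", "amsterdam"])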
def X_train_generatetor_infinite(dim=128, maxlen=500, batch_size=128, name="X_train.csv", events=None):
    X_train = pd.read_csv(path + name)
    group_le = LabelEncoder()
    group_lb = LabelBinarizer()
    # fit the encoders once on the full label column; the encoded values
    # themselves are not needed here
    labels = group_le.fit_transform(X_train['group'].values)
    labels = group_lb.fit_transform(labels)
    del labels

    ##################
    # Phone Brand
    ##################
    phone_brand_device_model = pd.read_csv(path + 'phone_brand_device_model.csv',
                                           dtype={'device_id': str})
    phone_brand_device_model.drop_duplicates('device_id', keep='first', inplace=True)
    phone_brand_le = LabelEncoder()
    phone_brand_device_model['phone_brand'] = phone_brand_le.fit_transform(
        phone_brand_device_model['phone_brand'])
    device_model_le = LabelEncoder()
    # use the dedicated device_model encoder (the original reused phone_brand_le)
    phone_brand_device_model['device_model'] = device_model_le.fit_transform(
        phone_brand_device_model['device_model'])

    while 1:
        data = pd.read_csv(path + name, iterator=True, chunksize=batch_size,
                           dtype={'device_id': str})
        for X_train in data:
            # join on device_id only
            X_train = pd.merge(X_train, phone_brand_device_model, how='left', on='device_id')
            phone_brand = X_train['phone_brand'].values
            device_model = X_train['device_model'].values
            X_train["app_lab"] = X_train["device_id"].map(events)
            y_train = X_train['group'].values
            # avoid chained assignment when mapping gender to {0, 1}
            X_train['gender'] = X_train['gender'].map({'M': 1, 'F': 0})
            y_train_gender = X_train['gender'].values
            y_train_age = X_train['age'].values
            # take log transformation
            y_train_age = np.log(y_train_age)
            X_train.fillna('0 ', inplace=True)
            y_train = group_le.transform(y_train)
            y_train = group_lb.transform(y_train)
            x_train = X_train["app_lab"].values
            x_train = [x.split(' ') for x in x_train]
            for i in range(len(x_train)):
                x_train[i] = [np.int8(idx) for idx in x_train[i] if idx != 'nan' and idx != '']
            x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
            x_train = [x_train, phone_brand, device_model]
            y_train = [y_train, y_train_gender, y_train_age]
            yield (x_train, y_train)
def process_data(trainDF, testDF):
    # drop unused columns from train/test, then concatenate the two frames
    trainDF.drop(['Descript', 'Resolution'], axis=1, inplace=True)
    testDF.drop(['Id'], axis=1, inplace=True)
    labels = trainDF['Category'].copy()
    y = trainDF['Category'].copy()
    combi = pd.concat([trainDF.drop(['Category'], axis=1), testDF])
    combi['Month'], combi['Day'], combi['Hour'] = zip(*combi['Dates'].apply(extract_time))
    combi.drop(['Dates'], axis=1, inplace=True)
    combi['intesect'] = combi['Address'].apply(lambda x: 1 if '/' in x else 0)
    combi['Wake'] = combi['Hour'].apply(lambda x: 1 if (int(x) >= 8 and int(x) <= 23) else 0)

    addresses = sorted(combi['Address'].unique())
    categories = sorted(trainDF['Category'].unique())
    addr_counts = combi.groupby('Address').size()
    cat_counts = trainDF.groupby('Category').size()
    addr_cat_counts = trainDF.groupby(['Address', 'Category']).size()

    # extract features from addresses and categories with counts-based log-odds
    # (count featurizing, cf. https://msdn.microsoft.com/en-us/library/azure/dn913056.aspx)
    logoddsPA = {}
    logodds = {}
    PA = cat_counts / float(len(trainDF))
    default_logodds = np.log(PA / (1 - PA))
    for addr in addresses:
        PA = addr_counts[addr] / float(len(combi))
        logoddsPA[addr] = np.log(PA / (1.0 - PA))
        logodds[addr] = deepcopy(default_logodds)
        if addr in addr_cat_counts.keys():
            for cat in addr_cat_counts[addr].keys():
                if addr_cat_counts[addr][cat] >= 2 and addr_cat_counts[addr][cat] < addr_counts[addr]:
                    PA = addr_cat_counts[addr][cat] / float(addr_counts[addr])
                    logodds[addr][categories.index(cat)] = np.log(PA / (1.0 - PA))
        logodds[addr] = pd.Series(logodds[addr])
        logodds[addr].index = range(len(categories))
    combi['LogoddsPA'] = combi['Address'].apply(lambda x: logoddsPA[x])
    logodds_features = combi['Address'].apply(lambda x: logodds[x])
    logodds_features.columns = ["logodds" + str(x) for x in range(len(categories))]
    combi_full = pd.concat([combi, logodds_features], axis=1)

    xy_scaler = StandardScaler()
    combi_full[['X', 'Y']] = xy_scaler.fit_transform(combi_full[['X', 'Y']])

    # label encoding
    lbe = LabelEncoder()
    combi_full['DayOfWeek'] = lbe.fit_transform(combi_full['DayOfWeek'])
    combi_full['PdDistrict'] = lbe.fit_transform(combi_full['PdDistrict'])
    combi_full['Wake'] = combi_full['Hour'].apply(lambda x: 1 if (int(x) >= 8 and int(x) <= 23) else 0)
    # keep='last' replaces the deprecated take_last=True
    combi_full["IsDup"] = pd.Series(combi_full.duplicated() | combi_full.duplicated(keep='last')).apply(int)
    combi_full.drop(['Address'], axis=1, inplace=True)
    y = lbe.fit_transform(y)

    # with xgboost, the features do not need dummy encoding
    # ohe = OneHotEncoder(categorical_features=[0, 1, 4, 5, 6])
    # data = ohe.fit_transform(combi_full.values)
    train = combi_full.values[:878049, :]
    test = combi_full.values[878049:, :]
    return train, test, y, lbe.classes_
def execute(self, data):
    print('started label encoding step')
    le = LabelEncoder()
    # refitting the same encoder per column is fine here: only the transformed
    # values are kept, not the per-column mappings
    output_array = le.fit_transform(data[self.column_list[0]])
    for i in range(1, len(self.column_list)):
        output_array = np.column_stack([output_array,
                                        le.fit_transform(data[self.column_list[i]])])
    otherCols = set(data.columns).difference(set(self.column_list))
    df1 = data[list(otherCols)]
    df2 = pd.DataFrame(output_array, columns=self.column_list)
    df1 = df1.join(df2, how='left')
    print('finished label encoding step')
    return df1
def data_preprocess(df):
    dropLst = ['Unnamed: 0', 'STATION_NAME', 'STATISTICAL_CODE_DESCRIPTION', 'CrimeCat']
    df['STREET'] = df['STREET'].apply(get_rid_num)
    df['ZIP'] = df['ZIP'].apply(int)
    le = LabelEncoder()
    df['STREET'] = le.fit_transform(df['STREET'])
    df['CITY'] = le.fit_transform(df['CITY'])
    feature_names = df.drop(dropLst, axis=1).columns
    X = df.drop(dropLst, axis=1).values
    y = df['CrimeCat'].values
    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    return X, y, feature_names
def fit_transform(self, dframe):
    """
    Fit label encoder and return encoded labels.

    Access individual column classes via indexing `self.all_classes_`,
    individual column encoders via `self.all_encoders_`, and individual
    column encoded labels via `self.all_labels_`.
    """
    df = dframe.copy()
    if self.columns is not None:
        # if columns are provided, iterate through and get `classes_`;
        # these ndarrays hold LabelEncoder().classes_ etc. for each column and
        # should match the shape of the specified `columns`
        self.all_classes_ = np.ndarray(shape=self.columns.shape, dtype=object)
        self.all_encoders_ = np.ndarray(shape=self.columns.shape, dtype=object)
        self.all_labels_ = np.ndarray(shape=self.columns.shape, dtype=object)
        for idx, column in enumerate(self.columns):
            # instantiate LabelEncoder, then fit and transform labels in the column
            le = LabelEncoder()
            df.loc[:, column] = le.fit_transform(df.loc[:, column].values)
            # append the `classes_` to our ndarray container
            self.all_classes_[idx] = (column,
                                      np.array(le.classes_.tolist(), dtype=object))
            self.all_encoders_[idx] = le
            self.all_labels_[idx] = le
    else:
        # no columns specified; assume all are to be encoded
        self.columns = df.iloc[:, :].columns
        self.all_classes_ = np.ndarray(shape=self.columns.shape, dtype=object)
        for idx, column in enumerate(self.columns):
            le = LabelEncoder()
            df.loc[:, column] = le.fit_transform(df.loc[:, column].values)
            self.all_classes_[idx] = (column,
                                      np.array(le.classes_.tolist(), dtype=object))
            self.all_encoders_[idx] = le
    return df
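A usage sketch for this method (the enclosing class is not shown in the snippet; we assume a hypothetical wrapper, here called MultiColumnLabelEncoder, whose constructor stores `columns` on the instance):

mcle = MultiColumnLabelEncoder(columns=np.array(['color', 'size']))  # hypothetical name
df = pd.DataFrame({'color': ['red', 'blue', 'red'],
                   'size': ['S', 'M', 'S'],
                   'price': [1.0, 2.0, 3.0]})
encoded = mcle.fit_transform(df)
# per-column mappings are then available, e.g. mcle.all_classes_[0]
# -> ('color', array(['blue', 'red'], dtype=object))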
def transformTestData(self, train_data, test_data):
    # Select the right features for both training and testing data
    X_train, y_train = self.__selectRelevantFeatures(train_data)
    X_test, y_test = self.__selectRelevantFeatures(test_data)

    # Transform categorical variables into integer labels; each encoder is fit
    # on the training column only and reused, never refit, on the test column
    martial_le = LabelEncoder()
    occupation_le = LabelEncoder()
    relationship_le = LabelEncoder()
    race_le = LabelEncoder()
    sex_le = LabelEncoder()
    transformers = [martial_le, occupation_le, relationship_le, race_le, sex_le]
    for i in range(len(transformers)):
        X_train[:, i] = transformers[i].fit_transform(X_train[:, i])
        X_test[:, i] = transformers[i].transform(X_test[:, i])

    # Dummy code categorical variables
    dummy_code = OneHotEncoder(categorical_features=range(5))
    X_train = dummy_code.fit_transform(X_train).toarray()
    X_test = dummy_code.transform(X_test).toarray()

    # Normalize all features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Encode y
    class_le = LabelEncoder()
    y_train = class_le.fit_transform(y_train)
    y_test = class_le.transform(y_test)
    # print(class_le.transform(["<=50K", ">50K"]))
    return X_train, X_test, y_train, y_test
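Because each encoder above is fit on the training column only, transform() raises a ValueError on a category that appears only in the test set. One defensive variant (a sketch, not the original code; it shares only the category vocabulary across the two sets, not target information) replaces the per-column loop with a union fit:

for i, enc in enumerate(transformers):
    enc.fit(np.concatenate([X_train[:, i], X_test[:, i]]))
    X_train[:, i] = enc.transform(X_train[:, i])
    X_test[:, i] = enc.transform(X_test[:, i])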
def load_otto_group():
    """
    Loads and returns several variables for the data set from Kaggle's
    Otto Group Product Classification competition.
    Link: https://www.kaggle.com/c/otto-group-product-classification-challenge

    Returns
    ----------
    data : array-like
        Pandas data frame containing the entire data set.

    X : array-like
        Training input samples.

    y : array-like
        Target values.
    """
    file_location = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'data', 'otto_group.zip')
    z = ZipFile(file_location)
    data = pd.read_csv(z.open('train.csv'))
    data = data.set_index('id')

    # move the label to the first position
    cols = data.columns.tolist()
    cols = cols[-1:] + cols[0:-1]
    data = data[cols]

    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].values

    # transform the labels from strings to integers
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    return data, X, y
def buildTreeClassifier(predictorColumns, structurestable='structures.csv',
                        targetcolumn='pointGroup', md=None):
    """
    Build a random-forest classifier to predict a structure feature from
    compositional data. Returns the model trained on all data, a confusion
    matrix computed from a held-out split, an average cross-validation
    accuracy, and the fitted label encoder.
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if 'fracNobleGas' in df.columns:
        df = df[df['fracNobleGas'] <= 0]

    s = StandardScaler()
    le = LabelEncoder()
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = le.fit_transform(df[targetcolumn].values)

    rfc = RandomForestClassifier(max_depth=md)
    acc = mean(cross_val_score(rfc, X, y))

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfc.fit(X_train, y_train)
    y_predict = rfc.predict(X_test)
    cm = confusion_matrix(y_test, y_predict)
    cm = pd.DataFrame(cm, columns=le.classes_, index=le.classes_)

    rfc.fit(X, y)
    return rfc, cm, round(acc, 2), le
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.
    """
    # Working on df_train: keep only places with at least `th` occurrences
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=conf['neighbours'], weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    # top-3 predicted classes, mapped back to the original place_ids
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
def process_one_cell(df_cell_train, df_cell_test):
    # Working on df_train: keep only places with at least 8 occurrences
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    row_ids = df_cell_test.index

    # Feature engineering on x and y
    df_cell_train.loc[:, 'x'] *= 500.0
    df_cell_train.loc[:, 'y'] *= 1000.0
    df_cell_test.loc[:, 'x'] *= 500.0
    df_cell_test.loc[:, 'y'] *= 1000.0

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance,
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
def load_kernel_matrix(data_path='data', study='wl_kernel', verbose=True):
    """Load an already computed kernel matrix.

    Parameters:
    ---------
    data_path: string
        Path to the data folder.
    study: string
        Name of the folder containing the study, e.g. 'wl_kernel',
        which contains the WL kernel matrix.
    verbose: bool
    """
    path_k_matrix = os.path.join(data_path, 'precomputed_kernels', study, 'k_matrix.csv')
    path_cls = os.path.join(data_path, 'precomputed_kernels', study, 'class_labels.csv')
    K = np.loadtxt(path_k_matrix)
    y = np.loadtxt(path_cls)
    le = LabelEncoder()
    y = le.fit_transform(y)
    if verbose:
        print('n_samples: %s, n_samples_by_class: (%s - %s)'
              % (len(y), len(y[y == 0]), len(y[y == 1])))
    return K, y
def load_train_data(path):
    print("Loading Train Data")
    df = pd.read_csv(path)

    # Remove the sampling below to run on the full set locally
    # (careful: you need more than 8GB RAM)
    rows = np.random.choice(df.index.values, 40000)
    df = df.loc[rows]  # .ix is deprecated; .loc works since rows are index values
    # df = df.sample(n=40000)

    labels = df.target
    df = df.drop('target', axis=1)
    df = df.drop('ID', axis=1)  # Junk cols - some feature engineering needed here
    df = df.fillna(-1)
    X = df.values.copy()

    # the original shuffled X in place here, which silently misaligned the rows
    # with `labels`; shuffle features and labels with a single permutation instead
    perm = np.random.permutation(len(X))
    X = X[perm].astype(np.float32)
    labels = labels.values[perm]

    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X, y, encoder, scaler
class Classifier(BaseEstimator):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.clf = None

    def fit(self, X, y):
        X = self.scaler.fit_transform(X.astype(np.float32))
        y = self.label_encoder.fit_transform(y).astype(np.int32)
        dtrain = xgb.DMatrix(X, label=y.astype(np.float32))

        param = {'objective': 'multi:softprob', 'eval_metric': 'mlogloss'}
        param['nthread'] = 4
        param['num_class'] = 9
        param['colsample_bytree'] = 0.55
        param['subsample'] = 0.85
        param['gamma'] = 0.95
        param['min_child_weight'] = 3.0
        param['eta'] = 0.05
        param['max_depth'] = 12
        num_round = 400  # to be faster??
        # num_round = 820
        self.clf = xgb.train(param, dtrain, num_round)

    def predict(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        label_index_array = np.argmax(self.clf.predict(dtest), axis=1)
        return self.label_encoder.inverse_transform(label_index_array)

    def predict_proba(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)
def main(X_fname, Y_fname, result_fname=None):
    le = LabelEncoder()
    moves = pandas.read_csv(Y_fname, index_col=0)
    Y = moves.values.ravel()
    Y = le.fit_transform(Y)
    X = io.mmread(X_fname)
    print(X.shape, Y.shape, len(le.classes_))

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)
    xg_train = xgboost.DMatrix(X_train, label=y_train)
    xg_test = xgboost.DMatrix(X_test, label=y_test)

    # use softmax multi-class classification
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.002
    param['max_depth'] = 7
    param['nthread'] = 7
    param['num_class'] = len(le.classes_)
    param['eval_metric'] = 'merror'
    evals = [(xg_train, 'train'), (xg_test, 'eval')]

    # Train xgboost
    print("Training")
    t1 = time.time()
    bst = xgboost.train(param, xg_train, 500, evals, early_stopping_rounds=3)
    t2 = time.time()
    print(t2 - t1)

    if result_fname is None:
        result_fname = str(datetime.now())
    bst.save_model("%s.bst" % result_fname)
def plot_model_decision_surface(clf, train_features, train_labels,
                                plot_step=0.02, cmap=plt.cm.RdYlBu,
                                markers=None, alphas=None, colors=None):
    if train_features.shape[1] != 2:
        raise ValueError("X_train should have exactly 2 columns!")

    x_min, x_max = (train_features[:, 0].min() - plot_step,
                    train_features[:, 0].max() + plot_step)
    y_min, y_max = (train_features[:, 1].min() - plot_step,
                    train_features[:, 1].max() + plot_step)
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    clf_est = clone(clf)
    clf_est.fit(train_features, train_labels)
    if hasattr(clf_est, 'predict_proba'):
        Z = clf_est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    else:
        Z = clf_est.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=cmap)

    le = LabelEncoder()
    y_enc = le.fit_transform(train_labels)
    n_classes = len(le.classes_)
    plot_colors = ''.join(colors) if colors else [None] * n_classes
    label_names = le.classes_
    markers = markers if markers else [None] * n_classes
    alphas = alphas if alphas else [None] * n_classes

    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y_enc == i)
        plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color,
                    label=label_names[i], cmap=cmap, edgecolors='black',
                    marker=markers[i], alpha=alphas[i])
    plt.legend()
    plt.show()
def load_input_files(self, **kwargs):
    """
    Loads both the file containing training data and the file containing data
    for prediction, and encodes the target labels as integers.

    For training data, the LabelEncoder used to encode the target labels is
    returned in the output args rather than stored directly, because it should
    only be saved if training finishes without errors.

    Inputs:
        - input_files_dir (string): path to the input files.
        - training_data (bool): whether the file contains training data or
          data for making predictions.

    Outputs:
        - LabelEncoder (LabelEncoder) (optional): encodes the labels of the
          target variable as integers.
    """
    input_data = kwargs['input_data']
    input_files_dir = kwargs['input_files_dir']
    input_file_path = input_files_dir + input_data['database']
    df = pd.read_csv(input_file_path)
    training_data = kwargs.pop('training_data', False)

    # For training data we assign an integer to each possible target label
    # in the dataset by fitting a LabelEncoder.
    if training_data:
        le = LabelEncoder()
        col_name = df.columns[4]
        df[col_name] = le.fit_transform(df[col_name])
        data = {}
        data['features'] = df[df.columns[0:4]].values
        data['targets'] = df[df.columns[4]].values
        self.feature_names = list(df.columns[0:4])
        self.target_name = df.columns[4]
        out_args = {}
        out_args['LabelEncoder'] = le
        return data, out_args
    # otherwise the data is for making predictions
    else:
        data = {}
        # ensure that the columns are in the correct order
        data['features'] = df[self.feature_names].values
        out_args = {}
        return data, out_args
def ml_target(dataset):
    """
    Takes a dataset and returns the target in a numpy.array ready for machine
    learning, transforming a non-numeric target column to numbers.

    Parameters
    ----------
    dataset : copper.Dataset

    Returns
    -------
    (label_encoder, np.array)

    Notes
    -----
    If the dataset has more than one variable with role=TARGET, the first one
    is selected.
    """
    cols = dataset.filter_cols(role=dataset.TARGET)
    assert len(cols) > 0, 'No target variables on Dataset'
    if len(cols) > 1:
        import warnings
        warnings.warn("Dataset contains more than one target, %s was chosen" % cols[0])

    # np.int/np.float were removed from numpy; test for numeric dtypes instead
    if np.issubdtype(dataset[cols[0]].dtype, np.number):
        return None, dataset[cols[0]].values
    else:
        le = LabelEncoder()
        encoded = le.fit_transform(dataset[cols[0]].values)
        return le, encoded
def __call__(self, X_train, X_test, y_train, y_test):
    X = np.vstack([X_train, X_test])
    y = np.hstack([y_train, y_test])
    le = LabelEncoder()
    y = le.fit_transform(y)

    kmeans = KMeans(
        n_clusters=len(np.unique(y)),
        n_init=self.kmeans__n_init,
        random_state=self.random_state,
    )
    kmeans.fit(X)

    # Gaussian similarity between pairs of cluster centres
    r = distance.cdist(kmeans.cluster_centers_, kmeans.cluster_centers_)
    h = np.exp(-r / (self.sig**2))

    N = confusion_matrix(y, kmeans.labels_)
    wN = np.zeros(h.shape)
    for l in range(wN.shape[0]):      # label
        for c in range(wN.shape[0]):  # cluster
            for j in range(wN.shape[0]):
                wN[l, c] += h[l, c] * N[l, j]
    return wN.max(axis=0).sum() / wN.sum()
def test_multiclass_classifier_class_weight():
    """tests multiclass with classweights for each class"""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]

    coef1 = []
    intercept1 = []
    coef2 = []
    intercept2 = []
    for cl in classes:
        y_encoded = np.ones(n_samples)
        y_encoded[y != cl] = -1

        spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight)
        spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight,
                                              sparse=True)
        coef1.append(spweights1)
        intercept1.append(spintercept1)
        coef2.append(spweights2)
        intercept2.append(spintercept2)

    coef1 = np.vstack(coef1)
    intercept1 = np.array(intercept1)
    coef2 = np.vstack(coef2)
    intercept2 = np.array(intercept2)

    for i, cl in enumerate(classes):
        assert_array_almost_equal(clf1.coef_[i].ravel(), coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)

        assert_array_almost_equal(clf2.coef_[i].ravel(), coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
def auto_alpha2num(self, col):
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    # one encoder is refit per column; only the encoded values are kept
    for i in col:
        self.df[i] = le.fit_transform(self.df[i])
    return
def load_data(filename="Feat_normalized.csv"):
    '''
    Load training data from a csv file, derive labels from its index, and
    return the feature matrix, the training labels, and the encoder for the labels.

    http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
    http://stackoverflow.com/questions/21589177/using-multiple-features-with-scikit-learn?rq=1
    Labels could just be the names:
    http://stackoverflow.com/questions/13300160/non-integer-class-labels-scikit-learn?rq=1
    '''
    df = pd.read_csv(filename, index_col=0)
    lb = LabelEncoder()
    labels = lb.fit_transform(df.index.values)
    print("labels: %s %s" % (type(labels), labels))
    features = df.values
    # labels = LabelEncoder.transform(np.asarray(df['labels'].values))
    # This could be done more elegantly; check index numbers for later filtering.
    # TODO: is pop needed? list of column values?
    feature_names = df.columns.values  # no pop (ndarray, no labels index here)
    print("%s features: " % (len(feature_names)))
    # classes = label_encoder.transform(np.asarray(df['labels']))
    print('encoded labels: %s' % (set(labels)))
    # print("feature_names: %s" % (feature_names))
    return (features, labels, lb, feature_names)
def multicol_fit_transform(dframe, columns):
    if isinstance(columns, list):
        columns = np.array(columns)

    encoder_dict = {}
    # ndarrays to hold LabelEncoder().classes_ etc. for each column;
    # should match the shape of the specified `columns`
    all_classes_ = np.ndarray(shape=columns.shape, dtype=object)
    all_encoders_ = np.ndarray(shape=columns.shape, dtype=object)
    all_labels_ = np.ndarray(shape=columns.shape, dtype=object)
    for idx, column in enumerate(columns):
        # instantiate a LabelEncoder, then fit and transform labels in the column
        le = LabelEncoder()
        dframe.loc[:, column] = le.fit_transform(dframe.loc[:, column].values)
        encoder_dict[column] = le
        # append the `classes_` to our ndarray container
        all_classes_[idx] = (column, np.array(le.classes_.tolist(), dtype=object))
        all_encoders_[idx] = le
        all_labels_[idx] = le

    multicol_dict = {"encoder_dict": encoder_dict,
                     "all_classes_": all_classes_,
                     "all_encoders_": all_encoders_,
                     "columns": columns}
    return dframe, multicol_dict
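A quick usage sketch (toy data of our own) showing how the returned multicol_dict can be used to recover the original values:

df = pd.DataFrame({'city': ['ny', 'sf', 'ny'],
                   'tier': ['a', 'b', 'a'],
                   'value': [1, 2, 3]})
df_enc, info = multicol_fit_transform(df, ['city', 'tier'])
city_le = info['encoder_dict']['city']
original_city = city_le.inverse_transform(df_enc['city'])  # array(['ny', 'sf', 'ny'], ...)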
def fit(self, X, y):
    """Fit Gaussian process classification model

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Training data

    y : array-like, shape = (n_samples,)
        Target values, must be binary

    Returns
    -------
    self : returns an instance of self.
    """
    if self.kernel is None:  # Use an RBF kernel as default
        self.kernel_ = C(1.0, constant_value_bounds="fixed") \
            * RBF(1.0, length_scale_bounds="fixed")
    else:
        self.kernel_ = clone(self.kernel)

    self.rng = check_random_state(self.random_state)

    self.X_train_ = np.copy(X) if self.copy_X_train else X

    # Encode class labels and check that it is a binary classification problem
    label_encoder = LabelEncoder()
    self.y_train_ = label_encoder.fit_transform(y)
    self.classes_ = label_encoder.classes_
    if self.classes_.size > 2:
        raise ValueError("%s supports only binary classification. "
                         "y contains classes %s"
                         % (self.__class__.__name__, self.classes_))
    elif self.classes_.size == 1:
        raise ValueError("{0:s} requires 2 classes; got {1:d} class".format(
            self.__class__.__name__, self.classes_.size))

    if self.optimizer is not None and self.kernel_.n_dims > 0:
        # Choose hyperparameters based on maximizing the log-marginal
        # likelihood (potentially starting from several initial values)
        def obj_func(theta, eval_gradient=True):
            if eval_gradient:
                lml, grad = self.log_marginal_likelihood(
                    theta, eval_gradient=True)
                return -lml, -grad
            else:
                return -self.log_marginal_likelihood(theta)

        # First optimize starting from theta specified in kernel
        optima = [self._constrained_optimization(obj_func, self.kernel_.theta,
                                                 self.kernel_.bounds)]

        # Additional runs are performed from log-uniform chosen initial theta
        if self.n_restarts_optimizer > 0:
            if not np.isfinite(self.kernel_.bounds).all():
                raise ValueError(
                    "Multiple optimizer restarts (n_restarts_optimizer>0) "
                    "requires that all bounds are finite.")
            bounds = self.kernel_.bounds
            for iteration in range(self.n_restarts_optimizer):
                theta_initial = np.exp(self.rng.uniform(bounds[:, 0], bounds[:, 1]))
                optima.append(
                    self._constrained_optimization(obj_func, theta_initial, bounds))

        # Select result from the run with minimal (negative) log-marginal likelihood
        lml_values = list(map(itemgetter(1), optima))
        self.kernel_.theta = optima[np.argmin(lml_values)][0]
        self.log_marginal_likelihood_value_ = -np.min(lml_values)
    else:
        self.log_marginal_likelihood_value_ = \
            self.log_marginal_likelihood(self.kernel_.theta)

    # Precompute quantities required for predictions which are independent
    # of actual query points
    K = self.kernel_(self.X_train_)
    _, (self.pi_, self.W_sr_, self.L_, _, _) = \
        self._posterior_mode(K, return_temporaries=True)

    return self
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 3].values

# take care of missing data (Imputer requires sklearn < 0.22)
from sklearn.preprocessing import Imputer
# create the imputer, then fit it to the matrix
imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# dummy variables for the country name; the column must be label-encoded
# first, otherwise OneHotEncoder fails on string values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()

# encoding the categorical yes/no target
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

# split train and test set, 20% for test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

data = pd.read_excel(r'../initial datasets/LabeledDataset.xlsx', sheet_name='Scenario 2')
data = data.sample(frac=1, random_state=42)
data['commentText'] = data['commentText'].astype(str)
data['replies.commentText'] = data['replies.commentText'].astype(str)

# Replace empty cells in commentText with np.nan
data['commentText'].replace(['nan'], np.nan, inplace=True)
# Replace NaN values in commentText with values in replies.commentText
data['commentText'] = data['commentText'].fillna(data['replies.commentText'])

# Encode labels (strings -> numbers)
encoder = LabelEncoder()
data['Label'] = encoder.fit_transform(data['Label'])

# Rename columns and select text and class info
data.rename(columns={'commentText': 'text', 'Label': 'class'}, inplace=True)
df = data[['text', 'class']]
df.to_excel('../feature datasets/ar/labels.xlsx', index_label="index")
# mapping ordinal features
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df['size'] = df['size'].map(size_mapping)
print(df)

class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classLabel']))}
print(class_mapping)
df['classLabel'] = df['classLabel'].map(class_mapping)
print(df)
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classLabel'] = df['classLabel'].map(inv_class_mapping)
print(df)

class_encoder = LabelEncoder()
y = class_encoder.fit_transform(df['classLabel'].values)
print(y)
print(class_encoder.inverse_transform(y))

x = df[['color', 'size', 'price']].values
class_encoder = LabelEncoder()
x[:, 0] = class_encoder.fit_transform(x[:, 0])
print(x)

# one-hot encoding (categorical_features requires sklearn < 0.22)
one_encoder = OneHotEncoder(categorical_features=[0])
print(one_encoder.fit_transform(x).toarray())
# this one-hot is more readable
print(pd.get_dummies(df[['price', 'color', 'size']]))
label_list.append(item[1])

print('Features in Training Set: {}'.format(len(training_set)))
print('Invalid Features in Training set: {}'.format(
    len(training_set) - len(feature_list)))

X = np.array(feature_list)
# Fit a per-column scaler
X_scaler = StandardScaler().fit(X)
# Apply the scaler to X
X_train = X_scaler.transform(X)
y_train = np.array(label_list)

# Convert label strings to numerical encoding
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

# Create classifier
clf = svm.SVC(kernel='linear')

# Set up 5-fold cross-validation
kf = model_selection.KFold(
    # len(X_train),
    n_splits=5,
    shuffle=True,
    random_state=1)

# Perform cross-validation
scores = model_selection.cross_val_score(cv=kf,
                                         estimator=clf,
                                         X=X_train,
                                         y=y_train,
                                         scoring='accuracy')
print('Scores: ' + str(scores))
corpus = []
y = []
for i in content:
    analysis = TextBlob(i)
    if analysis.polarity == 0:
        y.append(0)
    elif analysis.polarity > 0:
        y.append(1)
    elif analysis.polarity < 0:
        y.append(-1)
y = pd.Series(y)
u = pd.Series(y)

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(content).toarray()
# y = dataset.iloc[:, 1]

# Binarize the output
# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
# n_samples, n_features = X.shape
# X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# Feature scaling
# Fitting Naive Bayes to the training set
# (sklearn.cross_validation was removed; use model_selection instead)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
dataset = pd.read_csv('datasets/Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 3].values

# Handling the missing data (Imputer requires sklearn < 0.22)
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)  # compute the mean
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])  # use the mean to replace missing values

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# column 0 is the feature that needs one-hot encoding
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

# sklearn.cross_validation was removed; use model_selection instead
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# scaling is needed on both the training set and the test set; fit the scaler
# on the training data only, then reuse it for the test data
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)  # fixed: the original refit the scaler on X_test
                'creative_is_voicead', 'creative_is_js']
for i in bool_feature:
    data[i] = data[i].astype(int)

# label encoding
advert_feature = ['advert_id', 'adid', 'orderid', 'advert_industry_inner', 'advert_name',
                  'campaign_id', 'creative_id', 'creative_type', 'creative_tp_dnf',
                  'advert_industry_inner_0']
media_feature = ['app_cate_id', 'f_channel', 'app_id', 'inner_slot_id', 'app_paid',
                 'inner_slot_id_1']
content_feature = ['city', 'province', 'nnt', 'devtype', 'osv', 'os']
# still to be processed: user_tags, make, model
label_feature = advert_feature + media_feature + content_feature
num_feature = ['creative_width', 'creative_height', 'hour', 'day']

label_enc = LabelEncoder()
for label in label_feature:
    data[label] = label_enc.fit_transform(data[label].astype(str))
print("label encoding is finished")

onehot_feature = label_feature
predict = data[data['click'] == -1].drop('click', axis=1)
predice_click = predict[['instance_id']]
predice_click['predicted_score'] = 0
train_all = data[data['click'] != -1]
train_y = train_all.click.values
train_x = train_all.drop('click', axis=1)

if os.path.exists(path + '/feature/base_train_csr1.npz') and True:
    base_train_csr = sparse.load_npz(path + '/feature/base_train_csr1.npz').tocsr().astype(bool)
    base_predict_csr = sparse.load_npz(path + '/feature/base_predict_csr1.npz').tocsr().astype(bool)
else:
    base_train_csr = sparse.csr_matrix((len(train_x), 0))
import matplotlib.pyplot as plt
import pandas as pd

# Importing dataset
dataset = pd.read_csv('Churn_Modelling.csv')
dataset

# In[2]:
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]  # Avoiding dummy variable trap!

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
# import data
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
sample = pd.read_csv('./sampleSubmission.csv')

features = train.loc[:, 'feat_1':'feat_93'].values
labels = train['target'].values
categories = [
    'Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
    'Class_6', 'Class_7', 'Class_8', 'Class_9'
]

# encode the string labels as integers
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(labels)

# Train an SVM; different kernels can be tried ('rbf', 'poly', 'linear', 'sigmoid')
clf = SVC(C=1.0, kernel='sigmoid', probability=True)
clf.fit(features, integer_encoded)

# prediction
test = pd.read_csv('./test.csv')
test_features = test.loc[:, 'feat_1':'feat_93'].values
predictions = clf.predict_proba(test_features)
# print(predictions)
# enc = OneHotEncoder()
# onehot = enc.fit_transform(predictions.reshape(-1, 1)).toarray()

# create submission file
# Importing the dataset
dataset = pd.read_csv('Salary_Classification.csv')
temp = dataset.values
print(temp)

features = dataset.iloc[:, :-1].values
print(features)
labels = dataset.iloc[:, -1].values
print(labels)

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
features[:, 0] = labelencoder.fit_transform(features[:, 0])

# One Hot Encoding
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features=[0])
features = onehotencoder.fit_transform(features).toarray()
print(features)

# Avoiding the Dummy Variable Trap by dropping the first column
features = features[:, 1:]
print(features)

# We are not performing prediction today, but we have to identify the most
# important columns, for which we will use the statsmodels library rather than sklearn
# Fare
d_train["Fare"] = d_train["Fare"].fillna(d_train.Fare.mean())
d_test["Fare"] = d_test["Fare"].fillna(d_test.Fare.mean())

# Age
d_train["Age"] = d_train["Age"].fillna(d_train.Age.mean())
d_test["Age"] = d_test["Age"].fillna(d_test.Age.mean())

d_train = d_train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Parch', 'SibSp'], axis=1)
d_test = d_test.drop(['Name', 'Ticket', 'Cabin', 'Parch', 'SibSp'], axis=1)

from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
labels = ['title']
for label in labels:
    # fit on the union of train and test values so both sets share one mapping;
    # the original called fit_transform on each set separately, which can assign
    # different integers to the same title
    LE.fit(pd.concat([d_train[label], d_test[label]], ignore_index=True))
    d_train[label] = LE.transform(d_train[label])
    d_test[label] = LE.transform(d_test[label])

y_train = d_train["Survived"]
X_train = d_train.drop("Survived", axis=1)
X_test = d_test.drop(d_test.columns[[0, 1]], axis=1).copy()

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import accuracy_score

K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=1)
def _cross_val_predict(estimator, X, y=None, *, groups=None, cv=None,
                       n_jobs=None, verbose=0, fit_params=None,
                       pre_dispatch='2*n_jobs', method='predict', safe=True):
    """This is a fork of :meth:`~sklearn.model_selection.cross_val_predict`
    that allows non-safe cloning of the models for each fold.

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.

    X : array-like of shape (n_samples, n_features)
        The data to fit. Can be, for example a list, or an array at least 2d.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs), \
            default=None
        The target variable to try to predict in the case of supervised
        learning.

    groups : array-like of shape (n_samples,), default=None
        Group labels for the samples used while splitting the dataset into
        train/test set. Only used in conjunction with a "Group" :term:`cv`
        instance (e.g., :class:`GroupKFold`).

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - int, to specify the number of folds in a `(Stratified)KFold`,
        - CV splitter,
        - An iterable yielding (train, test) splits as arrays of indices.

        For int/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.22
            ``cv`` default value if None changed from 3-fold to 5-fold.

    n_jobs : int, default=None
        The number of CPUs to use to do the computation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : int, default=0
        The verbosity level.

    fit_params : dict, default=None
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int or str, default='2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

        - None, in which case all the jobs are immediately created and
          spawned. Use this for lightweight and fast-running jobs, to
          avoid delays due to on-demand spawning of the jobs
        - An int, giving the exact number of total jobs that are spawned
        - A str, giving an expression as a function of n_jobs,
          as in '2*n_jobs'

    method : str, default='predict'
        Invokes the passed method name of the passed estimator. For
        method='predict_proba', the columns correspond to the classes
        in sorted order.

    safe : bool, default=True
        Whether to clone with safe option.

    Returns
    -------
    predictions : ndarray
        This is the result of calling ``method``
    """
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    test_indices = np.concatenate([test for _, test in splits])
    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    # If classification methods produce multiple columns of output,
    # we need to manually encode classes to ensure consistent column ordering.
    encode = method in ['decision_function', 'predict_proba',
                        'predict_log_proba'] and y is not None
    if encode:
        y = np.asarray(y)
        if y.ndim == 1:
            le = LabelEncoder()
            y = le.fit_transform(y)
        elif y.ndim == 2:
            y_enc = np.zeros_like(y, dtype=int)
            for i_label in range(y.shape[1]):
                y_enc[:, i_label] = LabelEncoder().fit_transform(y[:, i_label])
            y = y_enc

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    # TODO: the API of the private scikit-learn `_fit_and_predict` has changed
    # between 0.23.2 and 0.24. For this to work with <0.24, we need to add a
    # case analysis based on the sklearn version.
    predictions = parallel(
        delayed(_fit_and_predict)(clone(estimator, safe=safe), X, y,
                                  train, test, verbose, fit_params, method)
        for train, test in splits)

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    elif encode and isinstance(predictions[0], list):
        # `predictions` is a list of method outputs from each fold. If each of
        # those is also a list, then treat this as a multioutput-multiclass
        # task. We need to separately concatenate the method outputs for each
        # label into an `n_labels` long list.
        n_labels = y.shape[1]
        concat_pred = []
        for i_label in range(n_labels):
            label_preds = np.concatenate([p[i_label] for p in predictions])
            concat_pred.append(label_preds)
        predictions = concat_pred
    else:
        predictions = np.concatenate(predictions)

    if isinstance(predictions, list):
        return [p[inv_test_indices] for p in predictions]
    else:
        return predictions[inv_test_indices]
# print(A.x)
print(B.x)

class C(A, B):
    pass

print(C.x)

Data preprocessing
  Missing values: dropna / fillna
  Converting attributes to numbers:
    map them manually, or use a LabelEncoder (note: it must be instantiated first):
      from sklearn.preprocessing import LabelEncoder
      df['column'] = LabelEncoder().fit_transform(df['column'])
    dummy variables: pd.get_dummies returns a DataFrame to concat with the original df
  Filter the data to remove outliers
  Normalization:
    from sklearn.preprocessing import MinMaxScaler, StandardScaler
    df2 = StandardScaler().fit_transform(df)
Choosing the training data: X features, y labels
Train/test split for validation:
  from sklearn.model_selection import train_test_split
  X_train, X_test, y_train, y_test = train_test_split(X, y)
Choosing a model:
  from sklearn.naive_bayes import GaussianNB
  from sklearn.tree import DecisionTreeClassifier
  from sklearn.neighbors import KNeighborsClassifier
print('total empty value items are', data.isnull().sum())  # per-column count of null values

# drop the purchased column from the data set
# features = features.drop(labels=features.columns[[7]], axis=1)  # method one
# features.head()
features = data.iloc[:, 0:2].values  # method 2
labels = data.iloc[:, -1].values
dt = pd.DataFrame(features)  # just for an overview, convert the features into a data frame

label_encode = LabelEncoder()
first_encode = label_encode.fit_transform(features[:, 1])
features[:, 1] = first_encode
print(len(pd.unique(first_encode)))  # number of unique values in first_encode
# data[:, 1].unique()
# print(pd.value_counts(pd.unique(data[:, 1])))

one_hot_encode = OneHotEncoder()
first_encode = np.reshape(first_encode, newshape=(len(first_encode), 1))
hot_e = one_hot_encode.fit_transform(first_encode)
print(hot_e)

# all string columns have to be converted to categorical codes before one-hot encoding
sec_hot_code = OneHotEncoder(categorical_features=[1])
features = sec_hot_code.fit_transform(features).toarray()
def labels(tags):
    le = LabelEncoder()
    tags = le.fit_transform(tags)
    categories = len(np.unique(tags))
    y = np_utils.to_categorical(tags, categories)
    return y
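A usage sketch (assuming np_utils comes from keras.utils, as the function body implies):

tags = ['cat', 'dog', 'cat', 'bird']
y = labels(tags)
# y has shape (4, 3): one row per tag, one one-hot column per distinct class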
                type=int, default=-1, help="# of jobs for k-NN")
args = vars(ap.parse_args())

print("[INFO] loading images...")
imagePaths = list(paths.list_images(args["dataset"]))

sp = SimplePreprocessor(32, 32)
sdl = SimpleDatasetLoader(preprocessors=[sp])
(data, labels) = sdl.load(imagePaths, verbose=500)
data = data.reshape((data.shape[0], 3072))
print("[INFO] features matrix: {:.1f}MB".format(data.nbytes / (1024 * 1000.0)))

le = LabelEncoder()
labels = le.fit_transform(labels)
(trainX, testX, trainY, testY) = train_test_split(data, labels,
                                                  test_size=0.25, random_state=42)

print("[INFO] evaluating k-NN classifier...")
model = KNeighborsClassifier(n_neighbors=args["neighbors"], n_jobs=args["jobs"])
model.fit(trainX, trainY)
print(classification_report(testY, model.predict(testX), target_names=le.classes_))
# Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

####### DATA PREPROCESSING ########

# Importing Dataset
dataset = pd.read_csv("Churn_Modelling.csv")
x = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_x_1 = LabelEncoder()
x[:, 1] = labelencoder_x_1.fit_transform(x[:, 1])
labelencoder_x_2 = LabelEncoder()
x[:, 2] = labelencoder_x_2.fit_transform(x[:, 2])
onehotencoder = OneHotEncoder(categorical_features=[1])
x = onehotencoder.fit_transform(x).toarray()
x = x[:, 1:]

# Splitting the dataset into training set and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
@author: BrysDom
"""

# Importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features=[3])
X = onehotencoder.fit_transform(X).toarray()

# Avoiding the Dummy Variable Trap
X = X[:, 1:]

# Splitting the dataset into Training and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Feature scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)"""
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Taking care of missing data (Imputer requires sklearn < 0.22)
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Create the training set and the test set
# (sklearn.cross_validation was removed; use model_selection instead)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature scaling
    image = cv2.resize(image, (IMAGE_DIMS[1], IMAGE_DIMS[0]))
    image = img_to_array(image)
    data.append(image)

    # extract the class label from the image path and update the labels list
    label = imagePath.split(os.path.sep)[-2]
    labels.append(label)

# scale the raw pixel intensities to the range [0, 1]
data = np.array(data, dtype="float") / 255.0
labels = np.array(labels)
print("[INFO] data matrix: {:.2f}MB".format(data.nbytes / (1024 * 1000.0)))

# encode the labels as integers, then as one-hot vectors
lb = LabelEncoder()
labels = lb.fit_transform(labels)
labels = to_categorical(labels, 2)

# partition the data into training and testing splits, using 80% for
# training and the remaining 20% for testing
(trainX, testX, trainY, testY) = train_test_split(data, labels,
                                                  test_size=0.2, random_state=42)

# construct the image generator for data augmentation
aug = ImageDataGenerator(rotation_range=25, width_shift_range=0.2,
                         height_shift_range=0.2, shear_range=0.2, zoom_range=0.2,
                         horizontal_flip=True, fill_mode="nearest")

# initialize the model
print("[INFO] compiling the model...")
model = SmallerVGGNet.build(width=IMAGE_DIMS[1], height=IMAGE_DIMS[0],
                            depth=IMAGE_DIMS[2], classes=len(lb.classes_))
opt = Adam(lr=INIT_LR, decay=INIT_LR / EPOCHS)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])

# train model
        'Geography': Geography,
        'Gender': Gender,
        'Age': Age,
        'Tenure': Tenure,
        'Balance': Balance,
        'NumberofProducts': NumberofProducts,
        'HasCrcard': HasCrcard,
        'IsActiveMember': IsActiveMember,
        'EstimatedSalary': EstimatedSalary
    }
    features = pd.DataFrame(data, index=[0])
    return features

input_df = user_input_features()

objList = input_df.select_dtypes(include="object").columns
for feat in objList:
    # `le` must be the LabelEncoder fitted on the training data; calling
    # fit_transform on the single-row input (as the original did) re-fits the
    # encoder on one value and produces encodings the model has never seen
    input_df[feat] = le.transform(input_df[feat].astype(str))

df = input_df[:1]

if st.button('Predict'):
    load_clf = pickle.load(open('predictionrfc.pkl', 'rb'))
    st.subheader('User Input features')
    st.write(df)
    prediction = load_clf.predict(df)
    skip = np.array(['Defaulter', 'Defaulter'])
    st.write(skip[prediction])
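For the `le.transform` call above to work, the encoders fitted at training time have to be available at prediction time. A sketch of one way to arrange that (the file name and dict layout are assumptions, not the app's actual code):

import pickle

# at training time (hypothetical): encoders = {'Geography': geo_le, 'Gender': gender_le}
# pickle.dump(encoders, open('label_encoders.pkl', 'wb'))

# at prediction time:
encoders = pickle.load(open('label_encoders.pkl', 'rb'))  # hypothetical file
for feat in objList:
    input_df[feat] = encoders[feat].transform(input_df[feat].astype(str))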
dataDF = pd.DataFrame()
dataDF['text'] = texts
dataDF['label'] = labels
print(dataDF.head())
print("total examples %s" % len(labels))

# split the dataset into training and test datasets
X_train, X_test, y_train, y_test = train_test_split(dataDF['text'], dataDF['label'],
                                                    random_state=24, test_size=0.2)

# encode the target variable; fit on the training labels once and reuse the same
# mapping for the test labels so the integer codes stay consistent
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)
onehotencoder = OneHotEncoder(sparse=False)
y_train = onehotencoder.fit_transform(y_train.reshape(-1, 1))
y_test = onehotencoder.transform(y_test.reshape(-1, 1))

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(dataDF['text'])
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
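# [Editor's sketch] Why the test labels use transform() rather than fit_transform():
# refitting on the test split can assign different integer codes whenever the split's
# label set differs. A runnable toy illustration:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder().fit(['neg', 'neu', 'pos'])      # fitted on training labels
print(enc.transform(['pos', 'neg']))                 # -> [2 0], training-time mapping
print(LabelEncoder().fit_transform(['pos', 'neg']))  # -> [1 0], an inconsistent mapping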
test[9].isna().sum()
test[10].isna().sum()

del test[0]
del test[7]

X = test.values

from sklearn.preprocessing import LabelEncoder
lab = LabelEncoder()
X[:, 0] = lab.fit_transform(X[:, 0])
X[:, 1] = lab.fit_transform(X[:, 1])
X[:, 4] = lab.fit_transform(X[:, 4])
X[:, -1] = lab.fit_transform(X[:, -1])

test2 = pd.DataFrame(X)
test2.isnull().sum()

# Handling 5th column which has multiple categorical values in one cell
df = test2
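# [Editor's sketch] One common way to expand a column whose cells hold several
# categorical values at once is pandas' str.get_dummies; the comma separator here
# is an assumption, since the original file's delimiter isn't shown:
import pandas as pd

s = pd.Series(['a,b', 'b', 'a,c'])  # stand-in for the multi-valued 5th column
print(s.str.get_dummies(sep=','))   # one indicator column per distinct value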
    (trainLabels, testLabels) = (labels[:split], labels[split:])

    # create the training and testing bunches
    training = Bunch(name="training", data=trainData, target=trainLabels)
    testing = Bunch(name="testing", data=testData, target=testLabels)

    # return a tuple of the training, testing bunches, and original labels
    return (training, testing, labels)

(training, testing, names) = load_sunplusit_faces(facePath, min_faces=faces_min, test_size=test_size)

le = LabelEncoder()
le.fit(training.target)

#recognizer = cv2.face.createLBPHFaceRecognizer(radius=2, neighbors=16, grid_x=8, grid_y=8)
recognizer = cv2.face.LBPHFaceRecognizer_create(radius=2, neighbors=16, grid_x=8, grid_y=8)
print("[INFO] training face recognizer...")
recognizer.train(training.data, le.transform(training.target))

predictions = []
confidence = []

# loop over the test data
for i in range(0, len(testing.data)):
    print("{} of {}".format(i, len(testing.data)))
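    # [Editor's sketch] The loop body presumably scores each test image; OpenCV's
    # LBPH recognizer returns an (integer label, confidence) pair per image, and
    # le.inverse_transform maps the integer back to a name:
    #(pred, conf) = recognizer.predict(testing.data[i])
    #predictions.append(le.inverse_transform([pred])[0])
    #confidence.append(conf)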
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans

# Load the data
#data = pd.read_csv('Mall_Customers.csv', encoding='gbk')
data = pd.read_csv('CarPrice_Assignment.csv')
train_x = data[[
    'car_ID', 'symboling', 'carlength', 'carwidth', 'carheight', 'curbweight',
    'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower',
    'peakrpm', 'citympg', 'highwaympg', 'price'
]].copy()  # copy() avoids pandas' SettingWithCopyWarning in the encoding step below

# LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_x['car_ID'] = le.fit_transform(train_x['car_ID'])

# Normalize to the [0, 1] range
min_max_scaler = preprocessing.MinMaxScaler()
train_x = min_max_scaler.fit_transform(train_x)
pd.DataFrame(train_x).to_csv('temp.csv', index=False)
#print(train_x)

### Cluster with KMeans
kmeans = KMeans(n_clusters=2)
kmeans.fit(train_x)
predict_y = kmeans.predict(train_x)

# Merge the clustering result back into the original data
result = pd.concat((data, pd.DataFrame(predict_y)), axis=1)
result.rename({0: u'聚类结果'}, axis=1, inplace=True)  # '聚类结果' = 'clustering result'
print(result)
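# [Editor's sketch] n_clusters=2 is an arbitrary choice; the silhouette score is one
# common way to compare candidate cluster counts on the same scaled matrix:
from sklearn.metrics import silhouette_score
for k in range(2, 8):
    labels_k = KMeans(n_clusters=k).fit_predict(train_x)
    print(k, silhouette_score(train_x, labels_k))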
# reviews_per_month: replace null with 0
df["reviews_per_month"] = df["reviews_per_month"].fillna(0)

# last_review: convert it to a numeric value (days since the earliest review)
df["last_review"] = pd.to_datetime(df["last_review"], infer_datetime_format=True)
earliest_last_review = df["last_review"].min()  # Series.min skips NaT, unlike builtin min
df["last_review"] = df["last_review"].fillna(earliest_last_review)
df["last_review"] = df["last_review"].apply(
    lambda review_date: review_date.toordinal() - earliest_last_review.toordinal())

# neighbourhood: label encoding
neighbourhood_encoder = LabelEncoder()
neighbourhood_labels = neighbourhood_encoder.fit_transform(df["neighbourhood"])
df["neighbourhood"] = neighbourhood_labels
# retain the mapping of neighbourhood and encoded values
# neighbourhood_dict = dict(zip(neighbourhood_encoder.classes_, range(len(neighbourhood_encoder.classes_))))

# room_type: label encoding
room_encoder = LabelEncoder()
room_labels = room_encoder.fit_transform(df["room_type"])
df["room_type"] = room_labels
# retain the mapping of room_type and encoded values
# room_dict = dict(zip(room_encoder.classes_, range(len(room_encoder.classes_))))

# convert the target to log(1 + feature)
df["price"] = np.log1p(df["price"])

#######################
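# [Editor's sketch] log1p compresses the skewed price distribution; predictions made
# on this scale go back to the original units with the exact inverse, np.expm1:
import numpy as np

prices = np.array([50.0, 100.0, 400.0])
assert np.allclose(np.expm1(np.log1p(prices)), prices)  # round trip recovers prices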
a = pd.get_dummies(test_set[['flag', 'protocol_type']])
test_set = pd.concat([test_set, a], axis=1)
test_set = test_set.drop(['Unnamed: 0', 'flag', 'protocol_type'], axis=1)
#train_set['flag']=le.fit_transform(train_set['flag'])

# exclude the 'label' attribute from the set
test_set.drop(['label'], inplace=True, axis=1)

# feature scaling
test_set = test_set.astype(float)
from sklearn.preprocessing import MinMaxScaler
for each_column in test_set:
    test_set[each_column] = MinMaxScaler().fit_transform(
        test_set[each_column].values.reshape(len(test_set), -1))

# save to file
full_test_set = pd.concat([test_set, test_labels], axis=1)
full_test_set.to_csv('~/dataset/preprocessed_test_dos_kdd99.csv')

# transform labels to a numeric representation
test_labels = le.fit_transform(test_labels)

# train model
#from sklearn.neighbors import KNeighborsClassifier
#clf=KNeighborsClassifier(n_neighbors=5,algorithm='ball_tree',leaf_size=500)
#from time import time
#t0=time()
#clf.fit(features,labels)
#tt=time()-t0
#print('classifier trained in {} seconds'.format(round(tt,3)))
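# [Editor's sketch] Fitting a fresh MinMaxScaler on the test set gives it a scale
# independent of the training set's; fitting once on the training data and reusing
# transform() keeps the two sets comparable (train_set is assumed from the earlier,
# unshown part of this script):
#scaler = MinMaxScaler().fit(train_set)
#train_set[:] = scaler.transform(train_set)
#test_set[:] = scaler.transform(test_set)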
# Exploring the target variable; our target variable is the Member Type column
data['Member Type'].value_counts()

# Checking for missing values in any column/feature
data.isnull().sum()

# Convert the categorical values to numerical to allow us to perform plotting
# import the LabelEncoder class
from sklearn.preprocessing import LabelEncoder

# Create a list with categorical predictors
cat_var = ['Start station', 'End station', 'Bike Number', 'Member Type']

# Instantiate LabelEncoder
le = LabelEncoder()

# A for loop to transform the categorical values to numerical values
for n in cat_var:
    data[n] = le.fit_transform(data[n])

# Checking the types of the predictors afterwards
data.dtypes

# Explore the relationship between duration and member type
data.plot(x='Duration', y='Member Type', style='*')
plt.title('Duration of Bike Use')
plt.xlabel('Duration')
plt.ylabel('Member Type')
plt.show()

# Explore the relationship between start station and member type
data.plot(x='Start station', y='Member Type', style='*')
plt.title('Start station by Member Type')
plt.xlabel('Start station')
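# [Editor's sketch] Reusing one LabelEncoder in the loop above works because each
# column is transformed immediately, but every fit overwrites classes_, so earlier
# columns can no longer be decoded. Keeping one fitted encoder per column preserves
# the mappings; a runnable toy illustration:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

toy = pd.DataFrame({'a': ['x', 'y'], 'b': ['p', 'q']})
encoders = {col: LabelEncoder().fit(toy[col]) for col in toy.columns}
for col in toy.columns:
    toy[col] = encoders[col].transform(toy[col])
print(encoders['a'].classes_)  # the mapping for column 'a' is still recoverable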
# Class distribution
sns.countplot(data['class'])

## Feature distribution
for i in data.columns[:-1]:
    plt.figure(figsize=(12, 6))
    plt.title("For feature '%s'" % i)
    sns.countplot(data[i], hue=data['class'])

# Modelling starts here with label encoding
le = LabelEncoder()
for i in data.columns:
    data[i] = le.fit_transform(data[i])

## X and y variables
X = data[data.columns[:-1]]
y = data['class']

## Train test split and building the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22)
logreg = LogisticRegression(solver='newton-cg', multi_class='multinomial')
logreg.fit(X_train, y_train)
pred = logreg.predict(X_test)
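# [Editor's sketch] The snippet stops at prediction; a typical next step is to score
# the held-out split with scikit-learn's standard metrics:
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))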