def feature_engineering_solution():
    """Cross-validate a random forest on the full engineered feature set.

    Relies on module-level helpers: load_data, feature_engineering, cross_v,
    get_rf, and sklearn's `preprocessing` module.
    """
    train = load_data('train.csv')
    test = load_data('test.csv')
    # Encode string class labels as integers.
    le = preprocessing.LabelEncoder()
    le.fit(train['target'])
    train['target'] = le.transform(train['target'])
    feature_cols = [col for col in train.columns if col not in ['target', 'id']]
    X_train = feature_engineering(train[feature_cols])
    X_test = feature_engineering(test[feature_cols])
    # Keep every engineered column (historical CV notes:
    # std 0.607958003167, mean 0.615741311533).
    feature_cols = list(X_train.columns)
    X_train = X_train[feature_cols]
    X_test = X_test[feature_cols]
    y = train['target']
    test_ids = test['id']
    # BUG FIX: Python 2 `print` statements replaced with the print() function.
    print('feature_engineering_solution')
    cross_v(get_rf(), X_train.values, y.values)  # previously ~0.600017926514
def main(n_train=357, n_total=447, n_features=57):
    """Train a RandomForestRegressor and write a Kaggle-style submission.

    GENERALIZATION (backward compatible): the previously hard-coded split
    boundaries (rows 0:357 train, 357:447 test) and feature count (columns
    1:58) are now defaulted parameters.

    Args:
        n_train: number of leading rows used for training.
        n_total: total number of rows (train + test).
        n_features: number of feature columns following the target column.
    """
    df = feature_engineering()
    df_train = df.iloc[0:n_train, :]
    df_test = df.iloc[n_train:n_total, :]
    # Column 0 is the target; columns 1..n_features are predictors.
    X = df_train.iloc[:, 1:n_features + 1]
    y = df_train.iloc[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1 / 4, random_state=0)
    # RandomForestRegressor — or choose another model in model.py.
    RF = RandomForestRegressor(max_depth=2, random_state=0,
                               max_features='sqrt', n_estimators=100)
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
    # Refit on all labeled data before predicting the held-out rows.
    RF.fit(X, y)
    test = df_test.iloc[:, 1:n_features + 1]
    y_pred = RF.predict(test)
    df = pd.DataFrame(y_pred, columns=['playtime_forever'])
    df['id'] = df.index
    df = df[['id', 'playtime_forever']]
    df.to_csv(r'./result/submission.csv', index=False)
def get_data(datafile):
    """Load a JSON dataset, engineer features, oversample, and split.

    Returns the (X_train, X_test, y_train, y_test) tuple from make_split.
    """
    raw = pd.read_json(datafile)
    engineered = feature_engineering(raw)   # cleaning + feature engineering
    balanced = oversampling(engineered)     # rebalance the classes
    return make_split(balanced)             # train/test split tuple
def on_button_click1(self, df):
    """Copy the user-selected dataset next to the app, preprocess it, and model it.

    Reads the dataset path from the GUI entry widget, copies the file locally,
    fills missing values, engineers features, then hands off to the models module.
    """
    import os  # local import so this edit is self-contained

    dataset_location = self.entry1.get()
    # BUG FIX: derive the file name portably with os.path.basename instead of
    # splitting on '/' (the old code kept the full path on Windows-style
    # backslash paths).
    filename = os.path.basename(dataset_location)
    dest_loc = "./" + filename
    shutil.copyfile(dataset_location, dest_loc)
    preprocessing.missing_value_filling(filename)
    df = feature_engineering.feature_engineering(df)
    # NOTE(review): 'classfication' is misspelled but kept byte-for-byte —
    # models.model() may match on this exact string; confirm before fixing.
    todo = 'classfication'  # nlp module will be sending this
    models.model(df, todo)
def main(paths):
    """Preprocess the train/test CSVs at paths[0]/paths[1] and write
    '*-processed' copies under ./data/."""
    df_train = pd.read_csv(paths[0])
    df_test = pd.read_csv(paths[1])
    print('Read {}'.format(paths))

    # Index both frames by passenger id.
    df_train = df_train.set_index('PassengerId')
    df_test = df_test.set_index('PassengerId')

    # Apply the same engineering/extraction pipeline to both splits.
    for step in (fea_eng.feature_engineering, fea_ext.feature_extraction):
        df_train = step(df_train)
        df_test = step(df_test)

    # Impute the single missing Fare in the test set with its median.
    df_test.Fare = df_test.Fare.fillna(df_test.Fare.median())
    df_train, df_test = fea_ext.process_age(df_train, df_test)

    # Raw/intermediate columns no longer needed after extraction.
    drop_cols = ['Name', 'Ticket', 'Cabin', 'Age',
                 'Sex', 'Embarked', 'Title', 'Surname']
    df_train = df_train.drop(drop_cols, axis=1)
    df_test = df_test.drop(drop_cols, axis=1)

    # Save files with a "-processed" suffix.
    df_train.to_csv('./data/train-processed.csv')
    df_test.to_csv('./data/test-processed.csv')
def get_data(datafile):
    '''
    Loads raw data from a json file into a pandas data frame, performs
    feature selection, feature engineering, oversampling for the minority
    class and splits the data into training and test sets.
    '''
    # Load → clean/engineer → oversample, as one pipeline expression.
    df = oversampling(feature_engineering(pd.read_json(datafile)))
    # Split into train/test sets.
    X_train, X_test, y_train, y_test = make_split(df)
    return X_train, X_test, y_train, y_test
def feature_selection_solution():
    """Cross-validate and fit a random forest on a pruned feature set, then
    write a probability submission file.

    Relies on module-level helpers: load_data, feature_engineering, cross_v,
    get_rf, write_submission, and sklearn's `preprocessing` module.
    """
    train = load_data('train.csv')
    test = load_data('test.csv')
    # Encode string class labels as integers.
    le = preprocessing.LabelEncoder()
    le.fit(train['target'])
    train['target'] = le.transform(train['target'])
    feature_cols = [col for col in train.columns if col not in ['target', 'id']]
    X_train = feature_engineering(train[feature_cols])
    X_test = feature_engineering(test[feature_cols])
    # Drop columns previously found to hurt the CV score
    # (historical notes: mean 0.5953, std 0.5936, nonzero 0.5974;
    # without feat_6/82/84: 0.6036; baseline 0.6001).
    dropped = ['mean', 'std', 'nonzero', 'feat_6', 'feat_82', 'feat_84']
    feature_cols = [col for col in X_train.columns if col not in dropped]
    X_train = X_train[feature_cols]
    X_test = X_test[feature_cols]
    # BUG FIX: Python 2 `print` statements replaced with the print() function.
    print(X_train.columns)
    y = train['target']
    test_ids = test['id']
    print('feature_selection_solution')
    cross_v(get_rf(), X_train.values, y.values)
    clf = get_rf()
    clf.fit(X_train, y)
    preds = clf.predict_proba(X_test)
    write_submission(test_ids, preds,
                     'submissions/feature_selection_rf100_84_82_6_nofg.csv')
from feature_engineering import feature_engineering
from feature_selection import feature_selection
from Models import linear_model, xgb_model
# BUG FIX: the script used `argparse` but imported a non-existent
# `argparser` module.
import argparse
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_dataset', help='address of train dataset')
    parser.add_argument('--test_dataset', help='address of test dataset')
    parser.add_argument('--model', help='model')
    # BUG FIX: argument values must be read from parse_args(); the original
    # read attributes off the parser object itself.
    args = parser.parse_args()
    train_dataset = args.train_dataset
    test_dataset = args.test_dataset
    model = args.model

    feature_engineering(train_dataset, test_dataset)
    feature_selection()

    # BUG FIX: the original had `if model=='xbg'` (typo for 'xgb') and a bare
    # `elif:` with no condition — a syntax error. Unknown/absent model names
    # now fall through to running both models.
    if model == 'linear':
        linear_model()
    elif model == 'xgb':
        xgb_model()
    else:
        linear_model()
        xgb_model()
from tools import load_data
from sklearn.metrics import log_loss
from sklearn.calibration import CalibratedClassifierCV
from feature_engineering import feature_engineering
# BUG FIX: `ensemble` is used in get_rf() below but was never imported.
from sklearn import ensemble
from sklearn import cross_validation
from tools import cross_v
import matplotlib.pyplot as plt

plt.style.use('ggplot')


def get_rf():
    """Return an untrained 100-tree random forest classifier."""
    forest = ensemble.RandomForestClassifier(n_estimators=100)
    return forest


# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# this script needs an old sklearn, or should migrate to
# sklearn.model_selection (train_test_split / StratifiedKFold).
train = load_data('train.csv')
feature_cols = [col for col in train.columns if col not in ['id', 'target']]
X_train = feature_engineering(train[feature_cols]).values
y = train['target'].values
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_train, y, test_size=0.33, random_state=1)
skf = cross_validation.StratifiedKFold(y_train, n_folds=10, random_state=42)
calibration_method = 'isotonic'
clf = get_rf()
ccv = CalibratedClassifierCV(base_estimator=clf, method=calibration_method,
                             cv=skf)
# Calibrated variant kept for reference:
# ccv.fit(X_train, y_train)
# pred = ccv.predict_proba(X_test)
clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)
score = log_loss(y_test, pred)  # previously observed: 0.487707826761
# fill_null_values : fill no-value to the column model fill_null_values(updated_data, 'model', 'no-value') # eds : barchart for model bar_chart(updated_data['model'], 'model') # describe-data: display null counts display_null_counts(updated_data) # ----------------------------------------------------------- # eds : barchart for fuelType bar_chart(updated_data['fuelType'], 'fuelType') # fill_null_values : fill no-value to the column fuelType fill_null_values(updated_data, 'fuelType', 'benzin') # eds : barchart for fuelType bar_chart(updated_data['fuelType'], 'fuelType') # describe-data: display null counts display_null_counts(updated_data) #------------------------------FEATURE Engineering------------------------------# updated_data = feature_engineering(updated_data) #-----------------------------Prepare Data for Training------------------------# x_train, x_test, y_train, y_test = prepare_data(updated_data, 'price') # -----------------------------Modelling-Random Forests-------------------------# model_random_forests(x_train, y_train, x_test, y_test) # -----------------------------Modelling-Linear Regression-------------------------# model_linear_regression(x_train, y_train, x_test, y_test) print( '\n\n#----------------------THE END OF THE PROJECT----------------------#' )
import argparse
import numpy as np
import pandas as pd
import xgboost as xgb
import pickle
from feature_engineering import feature_engineering

# Parsing script arguments
parser = argparse.ArgumentParser(description='Process input')
parser.add_argument('tsv_path', type=str, help='tsv file path')
args = parser.parse_args()

# Reading input TSV
data = pd.read_csv(args.tsv_path, sep="\t")
data_X, data_Y = feature_engineering(data, test=True)

# Load model: a throwaway XGBRegressor is created only to reuse its class
# name for the pickle path; the instance is immediately replaced.
# NOTE(review): pickle.load executes arbitrary code from the file — only
# load model files you trust.
model = xgb.XGBRegressor()
with open(f"models/{model.__class__.__name__}.pkl", 'rb') as f:
    model = pickle.load(f)

# Prediction: expm1 inverts a log1p transform — presumably applied to the
# revenue target during training; confirm against the training script.
pred = np.expm1(model.predict(data_X))
prediction_df = pd.DataFrame(columns=['id', 'revenue'])
prediction_df['id'] = data['id']
prediction_df['revenue'] = pred

# Export prediction results
prediction_df.to_csv("prediction.csv", index=False, header=False)
import pandas as pd
from feature_engineering import feature_engineering
from Modeling1 import Modeling1

# Local, machine-specific data directory — adjust per environment.
DATA_PATH = r'C:\Users\Davis\Desktop\Dat and aud Hotel'
# header=0: first CSV row holds the column names.
dft = pd.read_csv(DATA_PATH+r'\train.csv', header=0)

# Pipeline: engineer features, then train and get feature importances.
X_train,y_train =feature_engineering(dft)
feature_imp=Modeling1(X_train,y_train)
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from feature_engineering import feature_engineering,\
    test_train_split, create_X_and_y


def read_data(filename):
    """Load a JSON file into a DataFrame.

    BUG FIX: call pd.read_json (bare `read_json` was undefined) and actually
    return the frame (the original returned None).
    """
    return pd.read_json(filename)


# BUG FIX: a stray dangling `if __name__` fragment (a syntax error in the
# original) was removed.
if __name__ == '__main__':
    # TODO(review): `filename` was undefined in the original too — wire it to
    # argv/config before running. Need to figure out how to generalize load.
    df = read_data(filename)
    # Run data through pipeline for features and transformations
    df_features = feature_engineering(df)
    # Create X and y for splits, models
    X, y = create_X_and_y(df_features)
    # Create test and training sets
    X_train, X_val, y_train, y_val = test_train_split(X, y)
    # Scale to [0, 1]; MultinomialNB requires non-negative features.
    min_max = MinMaxScaler()
    min_max.fit(X_train)
    X_scaled = min_max.transform(X_train)
    # Instantiate, fit Naive Bayes model
    nb = MultinomialNB()
    # BUG FIX: fit on X_scaled — `scaled_data` was an undefined name.
    nb.fit(X_scaled, y_train)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from data_loading import read_co_data
from feature_engineering import feature_engineering
from model_evaluation import model_evaluation
from model_visualization import model_visualization

if __name__ == '__main__':
    # Read the raw data
    raw_data = read_co_data()
    # Feature engineering
    fed_data = feature_engineering(raw_data)
    # Feature vector: every column except the last.
    X = fed_data.take(list(range(fed_data.shape[1] - 1)), axis=1)
    # Target: the last column, flattened to 1-D.
    y = np.ravel(fed_data.take([fed_data.shape[1] - 1], axis=1))
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=0)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    # Define a BP (back-propagation) neural network regressor.
    reg = MLPRegressor(solver='lbfgs', alpha=1e-5,
                       hidden_layer_sizes=(15, ), random_state=1)
    # Training (continues beyond this chunk)
def main():
    """Grid-search the candidate classifiers on engineered training features."""
    train = load_data('train.csv')
    # Everything except the id and the label is a predictor.
    excluded = ('target', 'id')
    predictors = [c for c in train.columns if c not in excluded]
    engineered = feature_engineering(train[predictors])
    labels = train['target']
    grid_search(engineered, labels, get_clfs())
def train(fold):
    """Train an XGBoost classifier on one CV fold and persist the model,
    its column order, and the per-column label encoders.

    Args:
        fold: the kfold index held out for validation.
    """
    df = pd.read_csv(config.training_file)
    ## fill all NaNs
    # the dataset has labelled all NaNs with ' ?' already
    df[df == ' ?'] = np.nan
    ## label encoding
    ### define numerical columns (plus the bookkeeping columns kfold and y)
    num_feas = [
        'age', 'wage per hour', 'capital gains', 'capital losses',
        'dividends from stocks', 'num persons worked for employer',
        'own business or self employed', 'weeks worked in year', 'kfold', 'y'
    ]
    ## only for this competition, mapping them to 0 and 1 as suggested by the organizer
    df.y = df.y.map({' - 50000.': 0, ' 50000+.': 1})
    ## define all categorical features
    cat_feas = [i for i in df.columns if i not in num_feas]
    ## call the feature engineering function for categorical features
    df = feature_engineering(df, cat_feas)
    ## all features
    features = [i for i in df.columns if i not in ('kfold', 'y')]
    ## fill all NaNs with NONE
    # NOTE(review): astype(str) converts NaN to the string 'nan' first, so
    # the subsequent fillna('NONE') likely never fires — confirm intent.
    for col in features:
        if col not in num_feas:
            df.loc[:, col] = df[col].astype(str).fillna(
                'NONE')  # fill all NaNs with None
    ## label encoding each column
    ## add each encoder to the dictionary (saved later for inference)
    encoder = {}
    for col in features:
        if col not in num_feas:
            lbl = preprocessing.LabelEncoder()
            lbl.fit(df[col])
            df.loc[:, col] = lbl.transform(df[col])
            encoder[col] = lbl
    ## create data for training and validation
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    ## prepare data for training
    x_train = df_train.drop(['kfold', 'y'], axis=1).values
    # presumably config.target == 'y'; verify against config.
    y_train = df_train[config.target].values
    ## similarly, we prepare data for testing
    x_valid = df_valid.drop(['kfold', 'y'], axis=1).values
    y_valid = df_valid[config.target].values
    ## initialize a model
    model = xgb.XGBClassifier(n_jobs=-1)
    print('Training starting!!!')
    ## fit
    model.fit(x_train, y_train)
    ## predict on validation dataset (probability of the positive class)
    valid_preds = model.predict_proba(x_valid)[:, 1]
    ## get roc auc score
    auc = metrics.roc_auc_score(y_valid, valid_preds)
    ## print auc
    print(f"Fold = {fold}, AUC = {auc}")
    ## save the model
    joblib.dump(model,
                os.path.join(config.model_output, f'xgb_fe_fold{fold}.bin'))
    ## save the columns used to fit the model
    joblib.dump(
        df_train.drop(['kfold', 'y'], axis=1).columns,
        os.path.join(config.model_output, f'xgb_fe_cols_fold{fold}.pkl'))
    ## save the label encoder
    joblib.dump(
        encoder,
        os.path.join(config.model_output, f'xgb_fe_encoder_fold{fold}.pkl'))
# BUG FIX: argparse and pandas are used below but were not imported in this
# chunk.
import argparse

import pandas as pd

from training import train, run_kaggle_submission
import constants as CN

# NOTE(review): feature_engineering, get_dataloaders and get_test_data are
# called below but not imported here — confirm they are brought in outside
# this view before relying on this script.

# Parse arguments
parser = argparse.ArgumentParser()
# If the flag is present, throw the log in the "trash" folder: for testing phases
parser.add_argument("-t", action="store_false")
# Specify the number of epochs
parser.add_argument("-n", default=100, type=int)
# Say whether or not the results should be submitted to Kaggle
parser.add_argument("--kaggle", action="store_true", default=False)
args = parser.parse_args()

# Load and preprocess data
train_df = pd.read_csv(CN.TRAIN_FILE)
train_df = feature_engineering(train_df)
train_loader, val_loader = get_dataloaders(train_df=train_df)

test_df = pd.read_csv(CN.TEST_FILE)
test_df = feature_engineering(test_df)
test_df, test_tensors = get_test_data(test_df)

# Hyper-parameters
lr = 0.001
n_epochs = args.n
dropout_rate = 0.0
regul = 0.00

submit_to_kaggle = args.kaggle
if submit_to_kaggle:
    print("Kaggle submission is enabled")
else:
    print("Kaggle submission is disabled")
def semi_sup_cf():
    """Semi-supervised self-training over the custom feature set.

    For each (classifier, name) pair in the module-level `classifiers` /
    `classifiers_names` (defined outside this view — confirm), repeatedly:
    fit on the labeled pool, pseudo-label unlabeled rows predicted with
    probability > 0.99, move them into the labeled pool, and stop when no
    confident rows remain; then evaluate, plot, and save the model.
    """
    # One-off data-prep code retained, commented out, as in the original:
    # df_review = pd.read_csv("./Datasets/labeled_reviews.csv")
    # df_tweet = preprocess_file("./Datasets/test.csv")
    # df_tweet = df_tweet.rename(columns={"tweet": "comment"})
    # df_review = df_review[['comment', 'class']]
    # df = pd.concat([df_tweet, df_review])
    # freq_inverted = create_inverted_frequency_dict(df, "comment")
    # json_dict = json.dumps(freq_inverted)
    # f = open("dict_inverted.json", "w")
    # f.write(json_dict)
    # f.close()

    # Load the precomputed inverted-frequency dictionary.
    with open('./dict_inverted.json') as f:
        freq_inverted = json.load(f)
        f.close()  # redundant inside `with`, kept as in the original
    for classifier, name in zip(classifiers, classifiers_names):
        model = make_pipeline(classifier)
        # Balanced labeled sample: 1000 reviews + 1000 tweets.
        df_labeled_train, df_labeled_test = train_test_balanced_reviews_tweets(
            1000, 1000)
        df_labeled_train = feature_engineering(df_labeled_train, "comment",
                                               freq_inverted)
        df_labeled_test = feature_engineering(df_labeled_test, "comment",
                                              freq_inverted)
        df_unlabeled = pd.read_csv("unlabeled_reviews.csv")
        df_unlabeled = feature_engineering(df_unlabeled, "comment",
                                           freq_inverted)
        # Non-empty sentinel so the loop-exit test is well defined on entry.
        high_prob = [1]
        i = 0
        while True:
            # Fit on the current labeled pool (five custom features).
            model.fit(
                df_labeled_train[[
                    'spoken', "rarity", "meanings", "lexical", "emoticon"
                ]].to_numpy(), df_labeled_train["class"].to_numpy())
            predicted_categories = model.predict(df_unlabeled[[
                'spoken', "rarity", "meanings", "lexical", "emoticon"
            ]].to_numpy())
            predicted_categories_prob = model.predict_proba(df_unlabeled[[
                'spoken', "rarity", "meanings", "lexical", "emoticon"
            ]].to_numpy())
            # Column 0/1 of predict_proba: probability of each class.
            prob_false = predicted_categories_prob[:, 0]
            prob_true = predicted_categories_prob[:, 1]
            df_prob = pd.DataFrame([])
            df_prob['predicted'] = predicted_categories
            df_prob['prob_false'] = prob_false
            df_prob['prob_true'] = prob_true
            # Align with the unlabeled frame so .loc selections match rows.
            df_prob.index = df_unlabeled.index
            # Rows predicted with > 0.99 confidence for either class.
            high_prob = pd.concat([
                df_prob.loc[df_prob['prob_false'] > 0.99],
                df_prob.loc[df_prob['prob_true'] > 0.99]
            ],
                                  axis=0)
            # Pseudo-label the confident rows and add them to the train pool.
            pseudos = df_unlabeled.loc[high_prob.index]
            pseudos["class"] = high_prob['predicted']
            df_labeled_train = pd.concat([
                df_labeled_train, pseudos[[
                    'spoken', "rarity", "meanings", "lexical",
                    "emoticon", 'class'
                ]]
            ],
                                         axis=0)
            df_unlabeled = df_unlabeled.drop(index=high_prob.index)
            # Stop when nothing is left to label or nothing was confident.
            if len(df_unlabeled) == 0 or len(high_prob) == 0:
                test = model.predict(df_labeled_test[[
                    'spoken', "rarity", "meanings", "lexical", "emoticon"
                ]].to_numpy())
                report = classification_report(
                    df_labeled_test["class"].to_numpy(), test,
                    output_dict=True)
                plot(report, f"{name}_cf.png", f"{name} with custom features")
                joblib.dump(model, f"{name}_cf.sav")
                break
            i += 1