# tile = np.moveaxis(tile, -1, 0)
# tile = np.expand_dims(tile, axis=0)
#
# # Scale to [0, 1]
# tile = tile / 255
#
# # Embed tile
# tile = torch.from_numpy(tile).float()
# tile = Variable(tile)
#
# if cuda: tile = tile.cuda()
# z = tilenet.encode(tile)
# if cuda: z = z.cpu()
# z = z.data.numpy()
#
# X[idx,:] = z
# t1 = time()
# print('Embedded {} tiles: {:0.3f}s'.format(config.n_tiles, t1-t0))

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split the tile embeddings and train an RF classifier
X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=0.2)
rf = RandomForestClassifier(n_estimators=1000, max_depth=10,
                            max_features='sqrt', random_state=1)
rf.fit(X_trn, y_trn)
# RandomForestClassifier has no eval(); score() returns mean accuracy on the held-out split
print("Validation accuracy:", rf.score(X_val, y_val))
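# Optional follow-up (not in the original script): per-class precision/recall on the
# same held-out split, since overall accuracy can hide class imbalance. This sketch
# assumes the fitted `rf` and the X_val / y_val arrays produced by train_test_split above.
from sklearn.metrics import classification_report

y_val_pred = rf.predict(X_val)
print(classification_report(y_val, y_val_pred))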
import pandas as pd
import torch
import torch.nn.functional as F
import torch.optim as optim
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

import lib


def main():
    print("Loading train data from {}".format(args.raw_data))
    df = pd.read_csv(args.raw_data)
    df_train_input_sc, df_train_target, df_test_input_sc, df_test_target = lib.clear_data(df, args)

    if args.algo == 'decisiontree':
        # min_samples_leaf: 0.05, min_samples_split: 10, class_weight: None, splitter: best,
        # max_features: 10, criterion: entropy, max_depth: 7 for all features
        # min_samples_leaf: 0.05, min_samples_split: 3, class_weight: None, splitter: best,
        # max_features: 8, criterion: entropy, max_depth: 6 for discrete features
        model = tree.DecisionTreeClassifier(min_samples_leaf=0.05,
                                            min_samples_split=3,
                                            class_weight=None,
                                            splitter="best",
                                            max_features=8,
                                            criterion="entropy",
                                            max_depth=6)
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    if args.algo == 'randomforest':
        # random_state: 42, n_estimators: 1000, criterion: gini, max_depth: 7, bootstrap: True,
        # max_features: 5, min_samples_leaf: 7, min_samples_split: 7 for all features
        # random_state: 42, n_estimators: 100, criterion: gini, max_depth: 7, bootstrap: True,
        # max_features: 5, min_samples_leaf: 7, min_samples_split: 7 for discrete features
        model = RandomForestClassifier(random_state=42,  # params for using all features
                                       n_estimators=1000,
                                       criterion="gini",
                                       max_depth=7,
                                       bootstrap=True,
                                       max_features=5,
                                       min_samples_leaf=7,
                                       min_samples_split=7)
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    if args.algo == 'logisticregression':
        # penalty: l1, random_state: 42, C: 0.05, tol: 0.01, intercept_scaling: 3,
        # fit_intercept: True, max_iter: 10 for all features
        # penalty: l2, random_state: 42, C: 0.05, tol: 0.1, intercept_scaling: 1,
        # fit_intercept: True, max_iter: 10 for discrete features
        model = LogisticRegression(penalty="l1",
                                   solver="liblinear",  # liblinear supports the l1 penalty
                                   random_state=42,
                                   C=0.05,
                                   tol=0.01,
                                   intercept_scaling=3,
                                   fit_intercept=True,
                                   max_iter=10)
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    if args.algo == 'ADA':
        model = AdaBoostClassifier()
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    if args.algo == 'XGB':
        model = XGBClassifier()
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    if args.algo == 'FFN':
        model = lib.FFN(df_train_input_sc.shape[1], args.output_dim, args.num_classes)
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
        dataloader = lib.DataLoader(df_train_input_sc, df_train_target, args.batchsize)

        # Training
        model.train()
        for epoch in range(args.num_epochs):
            sum_loss = 0
            cnt = 0
            for it, (input_data, target_data) in enumerate(dataloader):
                cnt += 1
                input_data = torch.Tensor(input_data)
                target_data = torch.LongTensor(target_data)
                optimizer.zero_grad()
                logit = model(input_data)
                loss = F.nll_loss(logit, target_data)
                pred = logit.data.max(1)[1]
                sum_loss += loss.item()
                loss.backward()
                optimizer.step()
            print("Epoch: {} - loss: {}".format(epoch, float(sum_loss) / cnt))

        # Testing
        model.eval()
        with torch.no_grad():
            input_data_test = torch.Tensor(df_test_input_sc)
            target_data_test = torch.LongTensor(df_test_target)
            logit = model(input_data_test)
            loss = F.nll_loss(logit, target_data_test)
            y_pred = logit.data.max(1)[1]

    print(classification_report(df_test_target, y_pred))
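# A minimal sketch of the CLI wiring that main() relies on; the argument names mirror
# the args.* attributes used above, but the defaults are illustrative assumptions, not
# values from the original project, and lib.clear_data() may expect further options.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Train and evaluate a classifier")
    parser.add_argument('--raw_data', type=str, required=True,
                        help="Path to the raw CSV read by main()")
    parser.add_argument('--algo', type=str, default='randomforest',
                        choices=['decisiontree', 'randomforest', 'logisticregression',
                                 'ADA', 'XGB', 'FFN'])
    # FFN-only hyperparameters
    parser.add_argument('--output_dim', type=int, default=64)
    parser.add_argument('--num_classes', type=int, default=2)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--num_epochs', type=int, default=20)
    return parser.parse_args()

if __name__ == '__main__':
    args = parse_args()  # main() and lib.clear_data() read this module-level `args`
    main()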