def main(data):
    df = pd.read_csv(data).fillna("UNK")
    # Re-bucket the raw age labels via the NEWAGE mapping.
    df["age*"] = df["age"].map(NEWAGE)

    provider1 = df[df["provider"] == "provider1"]
    provider2 = df[df["provider"] == "provider2"]

    X_tr, X_te, y_tr, y_te = train_test_split(provider1, provider1["age*"])

    # Balance the training classes by undersampling the majority ones.
    rs = RandomUnderSampler(random_state=0)
    X_tr_resampled, y_tr_resampled = rs.fit_resample(X_tr, y_tr)

    model = build_model()
    model.fit(X_tr_resampled, y_tr_resampled)
    # model.fit(X_tr, y_tr)
    print("Done fitting")

    print("Train set")
    print(classification_report(y_tr, model.predict(X_tr)))
    print("Valid set")
    print(classification_report(y_te, model.predict(X_te)))
    print("Hold-out set")
    print(classification_report(provider2["age*"], model.predict(provider2)))
def test_baseline(data):
    # Just check that the feature pipeline and the model run end to end.
    x, lags = lag_columns(data)
    x = lag_average(x, cols=lags)
    x = lag_ewm(x, cols=lags)
    model = build_model().fit(x, x["sold_qty_units_1_weeks_ago"])
    model.predict(x)
def main(data, use_grid_cv):  # use_grid_cv is accepted but not used in this snippet
    df = pd.read_csv(data)

    # Convert to date format and extract calendar features.
    df.date_of_day = df.date_of_day.apply(pd.to_datetime)
    df['month'] = df.date_of_day.apply(lambda x: x.month)
    df['day'] = df.date_of_day.apply(lambda x: x.day)

    xx, lag_col_names = lag_columns(df, n_intervals=1)
    # Apply the moving averages to the lags, not to the target itself!
    x = lag_average(xx, cols=lag_col_names)
    x = lag_ewm(x, cols=lag_col_names)
    print(x.columns)

    print("Build model")
    x = x.sort_values(by='date_of_day')
    X_tr, X_te, y_tr, y_te = train_test_split_rounded(x, x["sold_qty_units"], test_size=6)
    print('train period: from {} to {}'.format(X_tr.date_of_day.min(), X_tr.date_of_day.max()))
    print('evaluation period: from {} to {}'.format(X_te.date_of_day.min(), X_te.date_of_day.max()))

    # Remove the first examples whose rolling features are still NaN -> needed for linear models.
    train_idx = ~np.isnan(
        X_tr.store_count_1_weeks_ago__2_weeks_window_size).values.reshape(-1)
    X_tr = X_tr[train_idx]
    y_tr = y_tr[train_idx]

    model = build_model()
    model.fit(X_tr, np.log(y_tr))  # fit in log space to force positive predictions
    print("Done fitting")

    y_tr_predicted = np.exp(model.predict(X_tr))
    y_te_predicted = np.exp(model.predict(X_te))
    print("Train set:")
    print("\tRMS:", sqrt(mean_squared_error(y_tr, y_tr_predicted)))
    print("Valid set:")
    print("\tRMS:", sqrt(mean_squared_error(y_te, y_te_predicted)))
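# The lag helpers used above (`lag_columns`, `lag_average`, `lag_ewm`) are not
# defined in this snippet. Below is a minimal sketch of plausible
# implementations, inferred from the generated column name
# `store_count_1_weeks_ago__2_weeks_window_size`; the default column set and
# window sizes are assumptions, not the repo's actual configuration.
import numpy as np
import pandas as pd


def lag_columns(df, cols=("sold_qty_units", "store_count"), n_intervals=1):
    # Shift each column back by 1..n_intervals weeks so the model never sees
    # the current-week value of the target.
    lag_col_names = []
    for col in cols:
        for n in range(1, n_intervals + 1):
            name = f"{col}_{n}_weeks_ago"
            df[name] = df[col].shift(n)
            lag_col_names.append(name)
    return df, lag_col_names


def lag_average(df, cols, windows=(2, 4)):
    # Rolling means over the lagged columns (not over the raw target).
    for col in cols:
        for w in windows:
            df[f"{col}__{w}_weeks_window_size"] = df[col].rolling(w).mean()
    return df


def lag_ewm(df, cols, span=4):
    # Exponentially weighted means of the lagged columns.
    for col in cols:
        df[f"{col}__ewm_span_{span}"] = df[col].ewm(span=span).mean()
    return df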
def main(data):
    df = pd.read_csv(data).fillna("UNK")
    # Per-example sample weights looked up from the age label.
    df["sample_weight"] = df["age"].map(SAMPLE_WEIGHTS)

    provider1 = df[df["provider"] == "provider1"]
    provider2 = df[df["provider"] == "provider2"]

    X_tr, X_te, y_tr, y_te = train_test_split(provider1, provider1["age"])

    model = build_model()
    # Route the weights to the LogisticRegression step inside the pipeline.
    model.fit(X_tr, y_tr, logisticregression__sample_weight=X_tr["sample_weight"])
    print("Done fitting")

    print("Train set:")
    print(classification_report(y_tr, model.predict(X_tr)))
    print("Valid set:")
    print(classification_report(y_te, model.predict(X_te)))
    print("Hold-out (provider2) set:")
    print(classification_report(provider2["age"], model.predict(provider2)))
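# `build_model()` is referenced throughout these scripts but not defined in
# this snippet. A minimal sketch of what it could look like: the fit call above
# routes `logisticregression__sample_weight` into the pipeline, which implies a
# make_pipeline whose final step is a LogisticRegression. The column names used
# in the ColumnTransformer are illustrative assumptions, not the actual schema.
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder


def build_model():
    # One-hot encode the categorical feature columns, drop everything else
    # (including the target and bookkeeping columns such as sample_weight).
    preprocess = make_column_transformer(
        (OneHotEncoder(handle_unknown="ignore"), ["provider"]),  # assumed columns
        remainder="drop",
    )
    return make_pipeline(preprocess, LogisticRegression(max_iter=1000))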
def main(data):
    df = pd.read_csv(data).fillna("UNK")
    provider1 = df[df["provider"] == "provider1"]

    X_tr, X_te, y_tr, y_te = train_test_split(provider1, provider1["age"])

    model = build_model()
    train_sizes, train_scores, test_scores = learning_curve(
        model,
        X_tr,
        y_tr,
        cv=2,
        scoring="f1_micro",
        n_jobs=-1,
    )

    curve(train_sizes, train_scores, label="train", color="b")
    curve(train_sizes, test_scores, label="test", color="orange")
    plt.ylabel("f1 score")
    plt.xlabel("Number of training samples")
    plt.ylim(0.2, 0.6)
    plt.legend()
    plt.show()
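# `curve()` is a small plotting helper that is not defined in this snippet. A
# minimal sketch, assuming it draws the mean CV score per training size with a
# shaded one-standard-deviation band (learning_curve returns one row per
# training size and one column per CV fold).
import matplotlib.pyplot as plt
import numpy as np


def curve(train_sizes, scores, label, color):
    mean = scores.mean(axis=1)
    std = scores.std(axis=1)
    plt.plot(train_sizes, mean, label=label, color=color)
    plt.fill_between(train_sizes, mean - std, mean + std, color=color, alpha=0.2)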
def main_worker(rank, nprocs, args, cfg):
    best_acc1 = 0.0
    torch.cuda.set_device(rank)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:12345',
                            world_size=args.nprocs,
                            rank=rank)

    if is_master_proc(args) and args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # Create the model and move it to this process's GPU.
    model = build_model(cfg)
    model.cuda(rank)

    # Define the loss function (criterion) and optimizer.
    criterion = nn.BCELoss().cuda(rank)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=cfg.SOLVER.BASE_LR,
                                momentum=cfg.SOLVER.MOMENTUM,
                                weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    # model, optimizer = amp.initialize(model, optimizer)
    model = DDP(model, device_ids=[rank])
    cudnn.benchmark = True

    # Data loading code.
    train_dataloader = construct_loader(cfg, 'train')
    val_dataloader = construct_loader(cfg, 'val')

    # Get logger and TensorBoard writer.
    logger = get_logger(args)
    writer = SummaryWriter()

    for epoch in range(cfg.SOLVER.MAX_EPOCH):
        # Train for one epoch.
        loss = train(train_dataloader, model, criterion, optimizer, epoch,
                     rank, args, logger, cfg)

        # Evaluate on the validation set.
        acc1 = validate(val_dataloader, model, criterion, rank, args, logger, cfg)

        # Remember the best acc@1 and save a checkpoint (rank 0 only).
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        if rank == 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.module.state_dict(),
                    'best_acc1': best_acc1,
                },
                is_best)

        writer.add_scalar('Loss', loss, epoch)
        writer.add_scalar('Accuracy', acc1, epoch)
        writer.flush()

    writer.close()
    print("Train finished! See training information in log/ and training "
          "results in runs/ with the tensorboard command")
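# `save_checkpoint()` is called above but not defined in this snippet. A
# minimal sketch following the common PyTorch pattern of saving the latest
# state and copying it aside when it is the best so far; the file names here
# are assumptions.
import shutil
import torch


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    # Always persist the latest state; keep a separate copy of the best one.
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')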
def test_handles_model(data):
    # Check that the pipeline fits and predicts end to end.
    model = build_model().fit(data, data["age"])
    model.predict(data)
def train(rank, world_size, args, cfg):
    dist.init_process_group(backend='nccl',
                            init_method=args.init_method,
                            world_size=world_size,
                            rank=rank)
    # dist.init_process_group(backend='nccl', rank=rank, )
    torch.cuda.set_device(args.local_rank)

    seed = int(time.time() * 256)
    torch.manual_seed(seed)

    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
    # Attach the file handler up front so the whole run is written to logs/.
    logger.addHandler(logging.FileHandler('logs/{}_log.txt'.format(
        time.strftime(r"%Y-%m-%d-%H_%M_%S", time.localtime()))))

    # ================================================
    # 2) get data and load data
    # ================================================
    train_dataloader = construct_loader(cfg, 'train')
    val_dataloader = construct_loader(cfg, 'val')

    # ================================================
    # 3) init model/loss/optimizer
    # ================================================
    model = build_model(cfg)
    model.cuda()
    optimizer = optim.Adam(model.parameters(),
                           lr=cfg.SOLVER.BASE_LR,
                           weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    model, optimizer = amp.initialize(model, optimizer)
    model = torch.nn.parallel.DistributedDataParallel(model)
    cudnn.benchmark = True
    # F.cross_entropy is a function; keep a reference instead of calling it.
    loss_function = F.cross_entropy

    # ================================================
    # 4) train loop
    # ================================================
    print("|------------------------|")
    print("| train on train dataset |")
    print("|------------------------|")
    early_stopping = EarlyStopping(20, verbose=True,
                                   path='checkpoints/model.pth',
                                   trace_func=logging.info)
    writer = SummaryWriter()
    start_time = time.time()

    for epoch in range(args.n_epochs):
        train_loss_lst, val_loss_lst = [], []
        train_acc_lst, val_acc_lst = [], []

        model.train()
        for i, (train_data, train_label) in enumerate(train_dataloader):
            if cfg.NUM_GPU:
                # .cuda() is not in-place; reassign the moved tensors.
                train_data = train_data.cuda(non_blocking=True)
                train_label = train_label.cuda(non_blocking=True)
            torch.distributed.barrier()

            optimizer.zero_grad()
            # Forward + backward + optimize.
            train_outputs = model(train_data)
            train_loss = loss_function(train_outputs, train_label.long())
            adjust_lr(optimizer, epoch, cfg.SOLVER.BASE_LR)
            with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            # train_loss.backward()
            optimizer.step()

            train_acc_lst.append(accuracy(train_outputs, train_label.long()))
            train_loss_lst.append(train_loss.item())

        # Average over the number of batches, not the last batch index.
        train_avg_loss = sum(train_loss_lst) / len(train_loss_lst)
        train_avg_acc = sum(train_acc_lst) / len(train_acc_lst)

        # ================================================
        # 5) evaluate on validation dataset
        # ================================================
        model.eval()
        with torch.no_grad():
            for val_data, val_label in val_dataloader:
                val_data = val_data.cuda(non_blocking=True)
                val_label = val_label.cuda(non_blocking=True)
                val_outputs = model(val_data)
                val_loss = F.cross_entropy(val_outputs, val_label.long())
                val_acc_lst.append(accuracy(val_outputs, val_label))
                val_loss_lst.append(val_loss.item())
        val_avg_acc = sum(val_acc_lst) / len(val_acc_lst)
        val_avg_loss = sum(val_loss_lst) / len(val_loss_lst)

        logging.info(
            "Train Phase, Epoch:{}, Train_avg_loss:{}, Val_avg_loss:{}, "
            "Train_avg_acc:{}, Val_avg_acc:{}".format(
                epoch, train_avg_loss, val_avg_loss, train_avg_acc, val_avg_acc))

        early_stopping(val_avg_loss, model)
        if early_stopping.early_stop:
            print('|------- Early Stop ------|')
            end_time = time.time()
            logging.info("Total spend time: {}s".format(end_time - start_time))
            break

        writer.add_scalar('Loss', train_avg_loss, epoch)
        writer.add_scalar('Accuracy', train_avg_acc, epoch)
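# `adjust_lr()` and `accuracy()` are used in the training loop above but not
# defined in this snippet. Minimal sketches: a step-decay schedule and a top-1
# accuracy metric; the decay factor and interval are assumptions.
import torch


def adjust_lr(optimizer, epoch, base_lr, decay=0.1, every=30):
    # Decay the learning rate by `decay` every `every` epochs.
    lr = base_lr * (decay ** (epoch // every))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(outputs, labels):
    # Fraction of samples whose argmax prediction matches the label.
    preds = outputs.argmax(dim=1)
    return (preds == labels).float().mean().item()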