Example #1
0
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


def main(data):
    df = pd.read_csv(data).fillna("UNK")
    # Bucket the raw ages into the coarser "age*" target classes.
    df["age*"] = df["age"].map(NEWAGE)

    provider1 = df[df["provider"] == "provider1"]
    provider2 = df[df["provider"] == "provider2"]

    X_tr, X_te, y_tr, y_te = train_test_split(provider1, provider1["age*"])

    # Undersample the majority classes so training is class-balanced.
    rs = RandomUnderSampler(random_state=0)
    X_tr_resampled, y_tr_resampled = rs.fit_resample(X_tr, y_tr)

    model = build_model()
    model.fit(X_tr_resampled, y_tr_resampled)
    # model.fit(X_tr, y_tr)  # unbalanced alternative

    print("Done fitting")

    print("Train set")
    print(classification_report(y_tr, model.predict(X_tr)))

    print("Valid set")
    print(classification_report(y_te, model.predict(X_te)))

    print("Hold-out set")
    print(classification_report(provider2["age*"], model.predict(provider2)))
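Neither NEWAGE nor build_model appears in the snippet; a minimal sketch of what they could look like, assuming NEWAGE buckets raw age strings into coarser classes and build_model returns a scikit-learn pipeline. The bucket labels and pipeline steps are illustrative assumptions, not the original definitions (though the step name logisticregression matches the fit parameter used in Example #4).

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# Hypothetical age buckets used as the "age*" classification target.
NEWAGE = {"18-24": "young", "25-34": "young", "35-54": "middle", "55+": "senior"}

def build_model():
    # One-hot encode the string-valued features, then fit a linear classifier.
    return make_pipeline(OneHotEncoder(handle_unknown="ignore"),
                         LogisticRegression(max_iter=1000))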
Example #2
0
def test_baseline(data):
    # Just check if the pipeline works
    x, lags = lag_columns(data)
    x = lag_average(x, cols=lags)
    x = lag_ewm(x, cols=lags)

    model = build_model().fit(x, x["sold_qty_units_1_weeks_ago"])
    model.predict(x)
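lag_columns, lag_average, and lag_ewm are project helpers that never appear in these snippets; a minimal sketch under assumptions: lag_columns shifts selected columns back by whole weeks, and the two smoothers operate on the lag columns only. The *_k_weeks_ago and __2_weeks_window_size naming follows Example #3; the lagged columns and window sizes are assumptions.

import pandas as pd

def lag_columns(df, n_intervals=1, cols=("sold_qty_units", "store_count")):
    # Shift each column back by 1..n_intervals weeks and collect the new names.
    df = df.copy()
    lag_names = []
    for col in cols:
        for k in range(1, n_intervals + 1):
            name = "{}_{}_weeks_ago".format(col, k)
            df[name] = df[col].shift(k)
            lag_names.append(name)
    return df, lag_names

def lag_average(df, cols, window=2):
    # Rolling mean over the lag columns (never the raw target, to avoid leakage).
    df = df.copy()
    for c in cols:
        df["{}__{}_weeks_window_size".format(c, window)] = df[c].rolling(window).mean()
    return df

def lag_ewm(df, cols, span=2):
    # Exponentially weighted mean over the same lag columns.
    df = df.copy()
    for c in cols:
        df["{}__ewm_span_{}".format(c, span)] = df[c].ewm(span=span).mean()
    return df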
Example #3
0
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error


def main(data, use_grid_cv):
    df = pd.read_csv(data)

    # Convert to datetime and extract calendar features.
    df.date_of_day = pd.to_datetime(df.date_of_day)
    df['month'] = df.date_of_day.dt.month
    df['day'] = df.date_of_day.dt.day

    xx, lag_col_names = lag_columns(df, n_intervals=1)

    # Apply the moving averages to the lags, not to the target itself!
    x = lag_average(xx, cols=lag_col_names)
    x = lag_ewm(x, cols=lag_col_names)
    print(x.columns)

    print("Build model")
    x = x.sort_values(by='date_of_day')
    X_tr, X_te, y_tr, y_te = train_test_split_rounded(x,
                                                      x["sold_qty_units"],
                                                      test_size=6)

    print('train period: from {} to {}'.format(X_tr.date_of_day.min(),
                                               X_tr.date_of_day.max()))
    print('evaluation period: from {} to {}'.format(X_te.date_of_day.min(),
                                                    X_te.date_of_day.max()))

    # Drop the leading rows whose rolling features are still NaN; linear
    # models cannot handle missing values.
    train_idx = X_tr.store_count_1_weeks_ago__2_weeks_window_size.notna().values
    X_tr = X_tr[train_idx]
    y_tr = y_tr[train_idx]

    model = build_model()

    # Fit on log targets; exponentiating the predictions keeps them positive.
    model.fit(X_tr, np.log(y_tr))
    print("Done fitting")

    print("Train set:")
    y_tr_predicted = np.exp(model.predict(X_tr))
    y_te_predicted = np.exp(model.predict(X_te))

    print("\tRMS:", sqrt(mean_squared_error(y_tr, y_tr_predicted)))

    print("Valid set:")
    print("\tRMS:", sqrt(mean_squared_error(y_te, y_te_predicted)))
Example #4
0
def main(data):
    df = pd.read_csv(data).fillna("UNK")
    df["sample_weight"] = df["age"].map(SAMPLE_WEIGHTS)

    provider1 = df[df["provider"] == "provider1"]
    provider2 = df[df["provider"] == "provider2"]

    X_tr, X_te, y_tr, y_te = train_test_split(provider1, provider1["age"])
    model = build_model()
    # Pipeline.fit routes `logisticregression__sample_weight` to the fit()
    # of the pipeline's logisticregression step.
    model.fit(X_tr,
              y_tr,
              logisticregression__sample_weight=X_tr["sample_weight"])

    print("Done fitting")

    print("Train set:")
    print(classification_report(y_tr, model.predict(X_tr)))

    print("Valid set:")
    print(classification_report(y_te, model.predict(X_te)))

    print("Hold-out (provider2) set:")
    print(classification_report(provider2["age"], model.predict(provider2)))
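SAMPLE_WEIGHTS is defined elsewhere; a minimal sketch, assuming it maps each age class to a weight that upweights the rarer classes. The keys reuse the hypothetical buckets from the Example #1 sketch and the values are illustrative, not taken from the source.

# Hypothetical per-class weights: rarer age classes get larger weights so the
# classifier does not simply ignore them.
SAMPLE_WEIGHTS = {
    "18-24": 4.0,
    "25-34": 1.0,
    "35-54": 1.0,
    "55+": 2.5,
}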
Example #5
0
def main(data):
    df = pd.read_csv(data).fillna("UNK")
    provider1 = df[df["provider"] == "provider1"]

    X_tr, X_te, y_tr, y_te = train_test_split(provider1, provider1["age"])
    model = build_model()

    train_sizes, train_scores, test_scores = learning_curve(
        model,
        X_tr,
        y_tr,
        cv=2,
        scoring="f1_micro",
        n_jobs=-1,
    )

    curve(train_sizes, train_scores, label="train", color="b")
    curve(train_sizes, test_scores, label="test", color="orange")

    plt.ylabel("f1 score")
    plt.xlabel("Number of training samples")
    plt.ylim(0.2, 0.6)
    plt.legend()
    plt.show()
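curve is a small project plotting helper, not part of matplotlib; a minimal sketch, assuming it draws the mean cross-validation score per training-set size with a shaded one-standard-deviation band (the band is an assumption):

import matplotlib.pyplot as plt

def curve(sizes, scores, label, color):
    # `scores` from learning_curve has shape (n_sizes, n_cv_folds).
    mean = scores.mean(axis=1)
    std = scores.std(axis=1)
    plt.plot(sizes, mean, label=label, color=color)
    plt.fill_between(sizes, mean - std, mean + std, color=color, alpha=0.2)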
Example #6
0
import random
import warnings

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.backends import cudnn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter


def main_worker(rank, nprocs, args, cfg):
    best_acc1 = 0.0
    torch.cuda.set_device(rank)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:12345',
                            world_size=args.nprocs,
                            rank=rank)


    if is_master_proc(args) and args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # create model
    model = build_model(cfg)
    model.cuda(rank)


    # define loss function (criterion) and optimizer
    criterion = nn.BCELoss().cuda(rank)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=cfg.SOLVER.BASE_LR,
                                momentum=cfg.SOLVER.MOMENTUM,
                                weight_decay=cfg.SOLVER.WEIGHT_DECAY)

    # model, optimizer = amp.initialize(model, optimizer)
    model = DDP(model, device_ids=[rank])

    cudnn.benchmark = True

    # Data loading code
    train_dataloader = construct_loader(cfg, 'train')
    val_dataloader = construct_loader(cfg, 'val')

    # get logger
    logger = get_logger(args)
    # TensorBoard writer
    writer = SummaryWriter()

    for epoch in range(cfg.SOLVER.MAX_EPOCH):
        # train for one epoch
        loss = train(train_dataloader, model, criterion, optimizer, epoch, rank, args, logger, cfg)

        # evaluate on validation set
        acc1 = validate(val_dataloader, model, criterion, rank, args, logger, cfg)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if rank == 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.module.state_dict(),
                    'best_acc1': best_acc1,
                }, is_best)

        writer.add_scalar('Loss', loss, epoch)
        writer.add_scalar('Accuracy', acc1, epoch)
        writer.flush()

    # Close the writer only after the last epoch; closing it inside the loop
    # would drop all later scalars.
    writer.close()
    print("Training finished! See training information in log/ and training "
          "results in runs/ via the tensorboard command.")
Example #7
0
def test_handles_model(data):
    # Check that the pipeline works
    model = build_model().fit(data, data["age"])
    model.predict(data)
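The test_* functions read like pytest smoke tests, so the data argument is presumably a fixture; a minimal sketch, assuming a tiny in-memory frame with the columns these snippets use (column names and values are dummies):

import pandas as pd
import pytest

@pytest.fixture
def data():
    # Minimal frame with the columns the classification snippets expect.
    return pd.DataFrame({
        "provider": ["provider1", "provider2"] * 4,
        "age": ["18-24", "55+"] * 4,
    })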
Example #8
0
import logging
import time

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from apex import amp
from torch.backends import cudnn
from torch.utils.tensorboard import SummaryWriter


def train(rank, world_size, args, cfg):
    dist.init_process_group(backend='nccl',
                            init_method=args.init_method,
                            world_size=world_size,
                            rank=rank)
    torch.cuda.set_device(args.local_rank)

    seed = int(time.time() * 256)
    torch.manual_seed(seed)

    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
    # Attach a timestamped file handler to the root logger so the logging.info
    # output below is also written under logs/.
    logging.getLogger().addHandler(
        logging.FileHandler('logs/{}_log.txt'.format(
            time.strftime(r"%Y-%m-%d-%H_%M_%S", time.localtime()))))

    # ================================================
    # 2) get data and load data
    # ================================================
    train_dataloader = construct_loader(cfg, 'train')
    val_dataloader = construct_loader(cfg, 'val')

    # ================================================
    # 3) init model/loss/optimizer
    # ================================================

    model = build_model(cfg)
    model.cuda()
    optimizer = optim.Adam(model.parameters(),
                           lr=cfg.SOLVER.BASE_LR,
                           weight_decay=cfg.SOLVER.WEIGHT_DECAY)

    model, optimizer = amp.initialize(model, optimizer)

    model = torch.nn.parallel.DistributedDataParallel(model)

    cudnn.benchmark = True

    # F.cross_entropy is a plain function and cannot be called without
    # arguments or moved to a device; use the module form instead.
    loss_function = nn.CrossEntropyLoss().cuda()

    # ================================================
    # 4) train loop
    # ================================================
    print("|------------------------|")
    print("| train on train dataset |")
    print("|------------------------|")

    early_stopping = EarlyStopping(20,
                                   verbose=True,
                                   path='checkpoints/model.pth',
                                   trace_func=logging.info)
    writer = SummaryWriter()
    start_time = time.time()
    for epoch in range(args.n_epochs):
        train_loss_lst = []
        val_loss_lst = []
        train_acc_lst = []
        val_acc_lst = []
        model.train()
        for i, train_dataset in enumerate(train_dataloader):
            train_data, train_label = train_dataset

            if cfg.NUM_GPU:
                # .cuda() is not in-place; the result must be assigned back.
                train_data = train_data.cuda(non_blocking=True)
                train_label = train_label.cuda(non_blocking=True)
                torch.distributed.barrier()

            optimizer.zero_grad()

            # forward + backward + optimize
            train_outputs = model(train_data)
            train_loss = loss_function(train_outputs, train_label.long())

            adjust_lr(optimizer, epoch, cfg.SOLVER.BASE_LR)

            with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            # train_loss.backward()
            optimizer.step()

            train_acc = accuracy(train_outputs, train_label.long())

            train_acc_lst.append(train_acc)
            # .item() detaches the loss so the computation graph is freed.
            train_loss_lst.append(train_loss.item())

        # Average over the number of batches, not the last loop index.
        train_avg_loss = sum(train_loss_lst) / len(train_loss_lst)
        train_avg_acc = sum(train_acc_lst) / len(train_acc_lst)
        # ================================================
        # 5) evaluate on validation dataset
        # ================================================

        model.eval()
        with torch.no_grad():  # no gradients needed during validation
            for v, val_dataset in enumerate(val_dataloader):
                val_data, val_label = val_dataset
                val_data = val_data.cuda(non_blocking=True)
                val_label = val_label.cuda(non_blocking=True)

                val_outputs = model(val_data)
                val_loss = loss_function(val_outputs, val_label.long())
                val_acc = accuracy(val_outputs, val_label)

                val_acc_lst.append(val_acc)
                val_loss_lst.append(val_loss.item())

        val_avg_acc = sum(val_acc_lst) / len(val_acc_lst)
        val_avg_loss = sum(val_loss_lst) / len(val_loss_lst)
        logging.info(
            "Train Phase, Epoch: {}, Train_avg_loss: {}, Val_avg_loss: {}, "
            "Train_avg_acc: {}, Val_avg_acc: {}".format(
                epoch, train_avg_loss, val_avg_loss, train_avg_acc,
                val_avg_acc))
        early_stopping(val_avg_loss, model)
        if early_stopping.early_stop:
            print('|------- Early Stop ------|')
            end_time = time.time()
            logging.info("Total spend time:{}s".format(end_time - start_time))
            break

        writer.add_scalar('Loss', train_avg_loss, epoch)
        writer.add_scalar('Accuracy', train_avg_acc, epoch)

    writer.close()