Example #1
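All of the examples below are main() excerpts and rely on module-level context that is not shown on this page: imports, configuration constants and a run_lgb training helper. A minimal sketch of that assumed context follows; the names are taken from the snippets themselves, but the values are placeholders rather than the authors' actual settings.

import gc
import os

import numpy as np
import pandas as pd

import data_util   # project-local helpers: load_features, seve_model, reduce_mem_usage
import logger      # project-local logging wrapper

SEED = 127          # placeholder random seed
path = 'mini_data'  # placeholder sub-directory under features/
TARGET = 'target'   # placeholder label column (Example #3)
FEATURES_LIST = []  # feature files to load (called feature_list in Examples #3 and #7)
USE_COLS = []       # feature columns fed to the model
QS = []             # question-level columns kept after the tag merge (Example #6)

# Each script also defines run_lgb(...), which returns a fitted LightGBM model,
# a feature-importance object (fi) and the validation predictions.
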
def main():
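    """Baseline: load the train/valid feature files, sub-sample train, fit LightGBM and write OOF predictions."""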

    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')

    train = data_util.load_features(FEATURES_LIST,
                                    path=f'features/{path}',
                                    train_valid='train')
    valid = data_util.load_features(FEATURES_LIST,
                                    path=f'features/{path}',
                                    train_valid='valid')

    train = train.sample(15_000_000, random_state=SEED)

    # qs = pd.read_csv('./data/output/question_lsi50.csv')
    # qs = qs[['question_id','tags_lsi']]
    # qs = qs.rename(columns={'question_id':'content_id'})
    # train = pd.merge(train,qs,on='content_id',how='left')
    # valid = pd.merge(valid,qs,on='content_id',how='left')

    # train = data_util.reduce_mem_usage(train)
    # valid = data_util.reduce_mem_usage(valid)

    LOG.info(
        f'train_size:{train[USE_COLS].shape} valid_size:{valid[USE_COLS].shape}'
    )

    model, fi, valid['pred'] = run_lgb(train=train, valid=valid, LOG=LOG)
    data_util.seve_model(model, fi, file_name)

    valid[['row_id', 'pred']].to_feather(f'./data/oof/{file_name}.feather')
Example #2
def main():
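    """Baseline run: load train/valid features, sub-sample train, train with run_lgb and save the OOF predictions."""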

    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')

    train = data_util.load_features(FEATURES_LIST,
                                    path=f'features/{path}',
                                    train_valid='train')
    valid = data_util.load_features(FEATURES_LIST,
                                    path=f'features/{path}',
                                    train_valid='valid')

    train = train.sample(15_000_000, random_state=SEED)
    # train = data_util.reduce_mem_usage(train)
    # valid = data_util.reduce_mem_usage(valid)

    LOG.info(
        f'train_size:{train[USE_COLS].shape} valid_size:{valid[USE_COLS].shape}'
    )

    model, fi, valid['pred'] = run_lgb(train=train, valid=valid, LOG=LOG)
    data_util.seve_model(model, fi, file_name)

    valid[['row_id', 'pred']].to_feather(f'./data/oof/{file_name}.feather')
Example #3
def main():
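    """Sample 10% of the data, carve ~2% of that sample off as validation by row_id, then train and save the model."""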

    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')

    train_df = data_util.load_features(feature_list)
    train_df = train_df.sample(frac=0.10, random_state=127)
    valid_df = train_df.sample(frac=0.02, random_state=127)
    valid_id = valid_df['row_id']
    train_df = train_df[~train_df['row_id'].isin(valid_id)]

    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    LOG.info(f'train shape : {train_df.shape}')
    LOG.info(f'valid shape : {valid_df.shape}')

    train_x = train_df[USE_COLS]
    train_y = train_df[TARGET]
    valid_x = valid_df[USE_COLS]
    valid_y = valid_df[TARGET]

    lgb_model, fi, valid_df['pred'] = run_lgb(train_x=train_x,
                                              train_y=train_y,
                                              valid_x=valid_x,
                                              valid_y=valid_y,
                                              LOG=LOG)

    data_util.seve_model(lgb_model, fi, file_name)

    # NOTE: in this excerpt the selected columns are neither assigned nor written out.
    valid_df[['user_id', 'pred']]
Example #4
def main():
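    """Baseline plus a question-community feature merged in from question_cmnts.csv, then train and write OOF predictions."""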

    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')

    train = data_util.load_features(FEATURES_LIST,
                                    path=f'features/{path}',
                                    train_valid='train')
    valid = data_util.load_features(FEATURES_LIST,
                                    path=f'features/{path}',
                                    train_valid='valid')

    qs = pd.read_csv('./data/input/question_cmnts.csv')
    qs.columns = ['content_id', 'community']

    train = pd.merge(train, qs, on='content_id', how='left')
    valid = pd.merge(valid, qs, on='content_id', how='left')

    # train = data_util.reduce_mem_usage(train)
    # valid = data_util.reduce_mem_usage(valid)

    LOG.info(
        f'train_size:{train[USE_COLS].shape} valid_size:{valid[USE_COLS].shape}'
    )

    model, fi, valid['pred'] = run_lgb(train=train, valid=valid, LOG=LOG)
    data_util.seve_model(model, fi, file_name)

    valid[['row_id', 'pred']].to_feather(f'./data/oof/{file_name}.feather')
Example #5
def main():
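    """Quick baseline on the mini_data feature set; trains via run_lgb and saves the model (no OOF file is written)."""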

    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')

    train = data_util.load_features(FEATURES_LIST, path='features/mini_data', train_valid='train')
    valid = data_util.load_features(FEATURES_LIST, path='features/mini_data', train_valid='valid')

    LOG.info(f'train_size:{train.shape} valid_size:{valid.shape}')

    model, fi, valid_pred = run_lgb(train=train, valid=valid, LOG=LOG)
    data_util.seve_model(model, fi, file_name)
Example #6
def main():
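    """Baseline plus one-hot encoded question tags merged in from questions.csv, then train and write OOF predictions."""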

    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')

    train = data_util.load_features(FEATURES_LIST, path=f'features/{path}', train_valid='train')
    valid = data_util.load_features(FEATURES_LIST, path=f'features/{path}', train_valid='valid')

    train = train.sample(15_000_000, random_state=SEED)

    # Expand the space-separated question tags into 188 one-hot columns.
    questions = pd.read_csv('./data/input/questions.csv')
    lst = []
    for tags in questions["tags"]:
        ohe = np.zeros(188)
        if str(tags) != "nan":
            for tag in tags.split():
                # Row int(tag) of the identity matrix is the one-hot vector for that tag.
                ohe += np.eye(188)[int(tag)]
        lst.append(ohe)
    tags_df = pd.DataFrame(lst, columns=[f"tag_{i}" for i in range(188)]).astype(int)

    questions = pd.concat([questions, tags_df], axis=1)
    questions = questions.rename(columns={'question_id': 'content_id'})

    questions = questions[QS + ['content_id']]

    train = pd.merge(train, questions, on='content_id', how='left')
    valid = pd.merge(valid, questions, on='content_id', how='left')

    # qs = pd.read_csv('./data/output/question_lsi50.csv')
    # qs = qs[['question_id','tags_lsi']]
    # qs = qs.rename(columns={'question_id':'content_id'})
    # train = pd.merge(train,qs,on='content_id',how='left')
    # valid = pd.merge(valid,qs,on='content_id',how='left')

    # train = data_util.reduce_mem_usage(train)
    # valid = data_util.reduce_mem_usage(valid)

    LOG.info(f'train_size:{train[USE_COLS].shape} valid_size:{valid[USE_COLS].shape}')

    model, fi, valid['pred'] = run_lgb(train=train, valid=valid, LOG=LOG)
    data_util.seve_model(model, fi, file_name)

    valid[['row_id', 'pred']].to_feather(f'./data/oof/{file_name}.feather')
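
For reference, the tag expansion above rebuilds a 188x188 identity matrix on every inner loop iteration. A vectorized sketch of the same transformation is shown below; it assumes each question lists a given tag at most once, so the 0/1 indicators match the summed one-hot rows.

import pandas as pd

# Toy stand-in for ./data/input/questions.csv: 'tags' is a space-separated string or NaN.
questions = pd.DataFrame({'question_id': [0, 1, 2], 'tags': ['3 17', None, '42']})

# One 0/1 column per tag that occurs; questions with no tags become an all-zero row.
dummies = questions['tags'].fillna('').str.get_dummies(sep=' ')
dummies.columns = [f'tag_{int(c)}' for c in dummies.columns]

# Align to the fixed tag_0 .. tag_187 layout used in the example above.
tags_df = dummies.reindex(columns=[f'tag_{i}' for i in range(188)], fill_value=0).astype(int)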
Example #7
def main():
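    """Baseline using a pre-computed CV split (cv1): select train/valid rows by row_id, then train and save the model."""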

    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')

    train_df = data_util.load_features(feature_list)
    train_index = pd.read_feather('./data/train_valid/cv1_train.feather')
    valid_index = pd.read_feather('./data/train_valid/cv1_valid.feather')
    train = train_df[train_df['row_id'].isin(train_index['row_id'])]
    valid = train_df[train_df['row_id'].isin(valid_index['row_id'])]

    del train_df
    gc.collect()

    LOG.info(f'train_size:{train.shape} valid_size:{valid.shape}')

    model, fi, valid_pred = run_lgb(train=train, valid=valid, LOG=LOG)
    data_util.seve_model(model, fi, file_name)