예제 #1
0
            dataset, tasktype))
    # Output directory for the trained model artifacts; the default name is
    # built from the dataset and tasktype values.
    parser.add_argument('--model-dir',
                        default='check_{}_{}_model/'.format(dataset, tasktype))
    # Learning task: defaults to regression when tasktype is 'r', and to
    # classification for any other tasktype.
    parser.add_argument(
        '--mode',
        choices=['classification', 'regression'],
        default='regression' if tasktype == 'r' else 'classification')
    args = parser.parse_args()

    # Create the model directory, including any missing parent directories.
    # exist_ok=True makes this a no-op when the directory is already there and
    # avoids the race of a separate exists() check followed by mkdir().
    os.makedirs(args.model_dir, exist_ok=True)

    start_time = time.time()

    print('Dataset:', args.train_csv)
    # load_data returns four values; the last one is not needed here.
    df_X, df_y, model_config, _ = load_data(args.train_csv)

    # Record the selected task mode in the config before it is saved.
    model_config['mode'] = args.mode

    # Pickle the model config into the model directory.
    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'wb') as fout:
        pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

    # Hold out an evaluation set for use during training: one tenth of the
    # rows, capped at 1000.
    # NOTE(review): size is 0 when df_X has fewer than 10 rows -- confirm that
    # train_test_split accepts test_size=0, or guard against tiny datasets.
    size = min(df_X.shape[0] // 10, 1000)
    X_train, X_eval, y_train, y_eval = train_test_split(df_X,
                                                        df_y,
                                                        test_size=size)

    train_dir = 'catboost_info/'
    if os.path.exists(train_dir):
예제 #2
0
    # All three options are required: the test CSV to read, the CSV to write
    # predictions to, and the directory containing model_config.pkl.
    parser.add_argument('--test-csv', required=True)
    # NOTE: argparse.FileType('w') opens (and truncates) the output file while
    # the arguments are parsed, i.e. before any data is loaded.
    parser.add_argument('--prediction-csv',
                        type=argparse.FileType('w'),
                        required=True)
    parser.add_argument('--model-dir', required=True)
    args = parser.parse_args()

    start_time = time.time()

    # Load the pickled model config from the model directory.
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only point --model-dir at a directory you trust.
    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'rb') as fin:
        model_config = pickle.load(fin)

    # Load the test CSV using the saved config; only the first and the last
    # of the four returned values are used.
    X_scaled, _, _, df = load_data(args.test_csv,
                                   datatype='test',
                                   cfg=model_config)

    # When the saved config flags a time leakage, the columns named by
    # id_col / dt_col / num_col are overwritten with the series stored in the
    # config, the frame is passed through use_time_leakage(), and missing
    # values in 'prediction' are replaced with 0.
    # NOTE(review): presumably use_time_leakage() is what produces the
    # 'prediction' column -- confirm against its definition.
    if model_config['time_leakage']['is_leakage']:
        df[model_config['time_leakage']
           ['id_col']] = model_config['time_leakage']['id_series']
        df[model_config['time_leakage']
           ['dt_col']] = model_config['time_leakage']['dt_series']
        df[model_config['time_leakage']
           ['num_col']] = model_config['time_leakage']['num_series']
        df = use_time_leakage(df, model_config['time_leakage'])
        df['prediction'] = df['prediction'].fillna(0)
    else:
        model = model_config['model']
        #df = pd.read_csv(args.test_csv, usecols=['line_id',])
        #print(args.test_csv)