Exemplo n.º 1
0
def run_class(depth, num_est, train_transformed, labels, test_transformed, evaluate=True, file_name=None, jobs=1):
    """Train a random forest on the transformed training data.

    With ``evaluate=True`` the classifier is cross-validated and the scores
    are returned together with the comma-joined set of feature-name
    prefixes.  With ``evaluate=False`` the forest is fitted and test-set
    class probabilities are written out via ``create_submission``.
    """
    # Encode the string labels as consecutive integer ids (sorted order).
    class_names = sorted(set(labels))
    class_to_index = {name: idx for idx, name in enumerate(class_names)}
    encoded_labels = [class_to_index[label] for label in labels]

    forest = RandomForestClassifier(max_depth=depth, verbose=2, n_jobs=jobs, n_estimators=num_est)

    print("=======================================")

    if evaluate:
        scores = cross_validation(forest, train_transformed, encoded_labels)
        print(scores, scores.mean())
        prefixes = sorted({column.split('_')[0] for column in train_transformed.columns})
        return scores, scores.mean(), ', '.join(prefixes)

    # Submission mode: fit, predict, and release the large frames as soon
    # as they are no longer needed to keep peak memory down.
    forest.fit(train_transformed, encoded_labels)
    del train_transformed
    del encoded_labels
    test_prediction = forest.predict_proba(test_transformed)
    del test_transformed
    create_submission(test_prediction, file_name)
Exemplo n.º 2
0
def run_class(depth,
              num_est,
              train_transformed,
              labels,
              test_transformed,
              evaluate=True,
              file_name=None,
              jobs=1):
    """Fit a RandomForestClassifier on the transformed features.

    Returns ``(scores, mean_score, feature_prefixes)`` in evaluation mode;
    in submission mode writes test-set probabilities via
    ``create_submission`` and returns ``None``.
    """
    # Map each distinct label to its rank in sorted order.
    ordered = sorted(set(labels))
    index_of = dict(zip(ordered, range(len(ordered))))
    y = [index_of[lab] for lab in labels]

    model = RandomForestClassifier(max_depth=depth,
                                   verbose=2,
                                   n_jobs=jobs,
                                   n_estimators=num_est)

    print("=======================================")

    if not evaluate:
        model.fit(train_transformed, y)
        # Drop the big frames as soon as possible to reduce peak memory.
        del train_transformed
        del y
        probabilities = model.predict_proba(test_transformed)
        del test_transformed
        create_submission(probabilities, file_name)
        return

    scores = cross_validation(model, train_transformed, y)
    print(scores, scores.mean())
    prefix_set = {name.split('_')[0] for name in train_transformed.columns}
    return scores, scores.mean(), ', '.join(sorted(prefix_set))
Exemplo n.º 3
0
    # Apply the transformer to the raw frame to get all derived columns.
    result = transformer.transform_frame(train_frame)

    # Despite the name, this regex SELECTS the columns that are kept as
    # input features (Dates/PdDistrict/DayOfWeek/Resolution/X/Y prefixes).
    not_regex = "^Dates|^PdDistrict|^DayOfWeek|^Resolution|^X|^Y"
    train_transformed = result.filter(regex=not_regex)
    label_transformed = None
    # NOTE(review): `train` looks like a flag of the enclosing function
    # (labels only exist for the training split) — its def is not visible
    # in this chunk, so confirm against the full definition.
    if train: label_transformed = result.filter(regex="^Category")

    return train_transformed, label_transformed


# Build the transformed training features and the label columns.
train_transformed, label_transformed = transform_set('train.csv')

print(train_transformed.columns)
print(label_transformed.columns)

# One-vs-rest logistic regression: one binary model per label column.
clf = OneVsRestClassifier(LogisticRegression(random_state=0))
train_prediction = clf.fit(train_transformed,
                           label_transformed).predict(train_transformed)

# mkdirs(data_path('serialized/model1/'))
# joblib.dump(clf, data_path('serialized/model1/model1.pkl'))

# print(cross_validation(clf, train_transformed, label_transformed))

# Transform the test split with the same pipeline (no labels available).
test_transformed, _ = transform_set("test.csv", train=False)

print("-=======================================")

# Per-class probabilities for every test row become the submission.
test_prediction = clf.predict_proba(test_transformed)
create_submission(test_prediction, "submission1.csv")
Exemplo n.º 4
0
# Depth-limited random forest fitted on the pre-transformed features.
clf = RandomForestClassifier(max_depth=10)
clf.fit(train_transformed, label_transformed)
# train_prediction = clf.predict(train_transformed)

# mkdirs(data_path('serialized/model1/'))
# joblib.dump(clf, data_path('serialized/model1/model1.pkl'))

# print(cross_validation(clf, train_transformed, label_transformed))

test_transformed, _ = transform_set("test.csv", train=False)

print("=======================================")


# NOTE(review): with multi-column (indicator) labels, predict_proba
# returns one (n_samples, 2) array per label column — t[i][1] is the
# positive-class probability of sample i for that column. Confirm the
# label shape upstream.
test_prediction = clf.predict_proba(test_transformed)
print(test_prediction[0])


# Sanity prints: sum over label columns of sample 0's two probabilities.
print(sum([t[0][0] for t in test_prediction]))
print(sum([t[0][1] for t in test_prediction]))

# Transpose: row i holds the positive-class probability of each label
# column for sample i (i ranges over samples, not columns).
reshaped = [[t[i][1] for t in test_prediction] for i in range(len(test_prediction[0]))]

print(reshaped)


print(len(test_prediction), test_prediction[0].shape)
create_submission(reshaped, "submission12.csv")


# score on kaggle = 2.60553
Exemplo n.º 5
0
# clf = OneVsRestClassifier(LogisticRegression(random_state=0))
# Depth-limited random forest fitted on the pre-transformed features.
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=10)
clf.fit(train_transformed, label_transformed)
# train_prediction = clf.predict(train_transformed)

# mkdirs(data_path('serialized/model1/'))
# joblib.dump(clf, data_path('serialized/model1/model1.pkl'))

# print(cross_validation(clf, train_transformed, label_transformed))

# Transform the held-out test split with the same pipeline (no labels).
test_transformed, _ = transform_set("test.csv", train=False)

print("=======================================")

# NOTE(review): with multi-column (indicator) labels, predict_proba
# returns one (n_samples, 2) array per label column — confirm upstream.
test_prediction = clf.predict_proba(test_transformed)
print(test_prediction[0])

# Sanity prints: sum over label columns of sample 0's two probabilities.
print(sum([t[0][0] for t in test_prediction]))
print(sum([t[0][1] for t in test_prediction]))

# Transpose: row i holds the positive-class probability of each label
# column for sample i (i ranges over samples, not columns).
reshaped = [[t[i][1] for t in test_prediction]
            for i in range(len(test_prediction[0]))]

print(reshaped)

print(len(test_prediction), test_prediction[0].shape)
create_submission(reshaped, "submission12.csv")

# score on kaggle = 2.60553
Exemplo n.º 6
0
def main():
    """Run adversarial validation, then CV-train a model and save artifacts.

    Pipeline: parse CLI args, load config and data, load features, run a
    LightGBM train-vs-test discriminator (adversarial validation, with a
    feature-importance plot), then train the configured model with CV and
    write submission.csv, oof.csv and output.json to the model directory.
    """
    # =========================================
    # === Settings
    # =========================================
    logger = get_logger(__name__)
    logger.info('Settings')

    # Command-line arguments: config path plus an optional debug flag.
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='./configs/model_0.json')
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    logger.info(f'config: {args.config}')
    logger.info(f'debug: {args.debug}')

    # Load config with a context manager so the handle is closed promptly
    # (the previous `json.load(open(...))` leaked the file object).
    with open(args.config) as f:
        config = json.load(f)
    config.update({'args': {'config': args.config, 'debug': args.debug}})

    if config["model"]["name"] == "lightgbm":
        config["model"]["model_params"]["nthread"] = cpu_count()

    # Create a directory for model output; parents/exist_ok make this safe
    # when the output root is missing or the directory already exists.
    model_no = pathlib.Path(args.config).stem
    model_output_dir = (pathlib.Path(config['dataset']['output_directory']) /
                        model_no)
    model_output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f'model_output_dir: {str(model_output_dir)}')
    logger.debug(f'model_output_dir exists: {model_output_dir.exists()}')
    config.update({'model_output_dir': str(model_output_dir)})

    # =========================================
    # === Loading data
    # =========================================
    logger.info('Loading data')

    # Get train and test
    input_dir = pathlib.Path(config['dataset']['input_directory'])
    train = pd.read_csv(input_dir / 'train.csv')
    test = pd.read_csv(input_dir / 'test.csv')

    # Target values come straight from the train frame.
    target_column = config['data_type']['target']
    y_train = train[target_column].values

    # =========================================
    # === Loading features
    # =========================================
    logger.info('Loading features')

    x_train, x_test = load_features(config)
    feature_name = x_test.columns
    logger.debug(f'number of features: {len(feature_name)}')

    # =========================================
    # === Adversarial Validation
    # =========================================
    logger.info("adversarial validation")
    # Work on copies: the previous version aliased x_train/x_test and
    # injected a 'target' column into the real feature frames as a side
    # effect, which then required reloading the features afterwards.
    train_adv = x_train.copy()
    test_adv = x_test.copy()
    train_adv['target'] = 0  # 0 = train row, 1 = test row
    test_adv['target'] = 1
    train_test_adv = pd.concat([train_adv, test_adv], axis=0,
                               sort=False).reset_index(drop=True)

    # Random split for the discriminator's own validation.
    train_set, val_set = train_test_split(train_test_adv,
                                          test_size=0.33,
                                          random_state=71,
                                          shuffle=True)
    x_train_adv = train_set[feature_name]
    y_train_adv = train_set['target']
    x_val_adv = val_set[feature_name]
    y_val_adv = val_set['target']
    logger.debug(f'the number of train set: {len(x_train_adv)}')
    logger.debug(f'the number of valid set: {len(x_val_adv)}')

    train_lgb = lgb.Dataset(x_train_adv, label=y_train_adv)
    val_lgb = lgb.Dataset(x_val_adv, label=y_val_adv)
    lgb_model_params = config["adversarial_validation"]["lgb_model_params"]
    lgb_train_params = config["adversarial_validation"]["lgb_train_params"]
    clf = lgb.train(lgb_model_params,
                    train_lgb,
                    valid_sets=[train_lgb, val_lgb],
                    valid_names=['train', 'valid'],
                    **lgb_train_params)

    # Plot the 20 most discriminative features — high gain here means the
    # feature separates train from test (potential drift or leakage).
    feature_imp = pd.DataFrame(sorted(
        zip(clf.feature_importance(importance_type='gain'), feature_name)),
                               columns=['value', 'feature'])
    plt.figure(figsize=(20, 10))
    sns.barplot(x='value',
                y='feature',
                data=feature_imp.sort_values(by='value',
                                             ascending=False).head(20))
    plt.title('LightGBM Features')
    plt.tight_layout()
    plt.savefig(model_output_dir / "feature_importance_adv.png")
    plt.close()  # release the figure instead of keeping it open

    # Record the discriminator score and top importances in the config
    # so they end up in output.json.
    config.update({
        'adversarial_validation_result': {
            'score':
            clf.best_score,
            'feature_importances':
            feature_imp.set_index("feature").sort_values(
                by="value", ascending=False).head(20).to_dict()["value"]
        }
    })

    # =========================================
    # === Train model and predict
    # =========================================
    logger.info('Train model and predict')

    # x_train/x_test were not mutated above (copies were used), so the
    # features do not need to be reloaded here.

    # Stratified folds configured by the cv section of the config.
    folds_ids = Fold(
        n_splits=config['cv']['n_splits'],
        shuffle=config['cv']['shuffle'],
        random_state=config['cv']['random_state']).get_stratifiedkfold(
            x_train, y_train)

    # Train and predict
    model_name = config['model']['name']
    model_cls = model_map[model_name]
    params = config['model']
    runner = Runner(model_cls, params, model_output_dir,
                    f'Train_{model_cls.__name__}')

    # CV training; evals_result is merged into config so output.json
    # records the scores alongside the settings that produced them.
    oof_preds, evals_result = runner.train_cv(x_train, y_train, folds_ids)
    config.update(evals_result)
    test_preds = runner.predict_cv(x_test)

    # =========================================
    # === Make submission file
    # =========================================
    sub = create_submission(test, test_preds, target_column)
    sub.to_csv(model_output_dir / 'submission.csv', index=False, header=True)

    # =========================================
    # === Save files
    # =========================================
    save_path = model_output_dir / 'output.json'
    json_dump(config, save_path)

    # Out-of-fold predictions, kept for later stacking/analysis.
    pd.DataFrame(oof_preds,
                 columns=["target"]).to_csv(model_output_dir / 'oof.csv',
                                            index=False,
                                            header=True)
Exemplo n.º 7
0
# All crime-category names, sorted; index in this list is the class id.
classes = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
           'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING',
           'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
           'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY',
           'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY',
           'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']

category = train_frame['Category']
mapping = {clazz: num for (num, clazz) in enumerate(classes)}

# Majority-class baseline: always predict the most frequent category.
most_freq_class = Counter(category).most_common()[0][0]

# Class ids: constant (majority) prediction vs. the true labels.
predicted = category.apply(lambda cat: mapping[most_freq_class])
expected = category.apply(lambda cat: mapping[cat])

mlb = MultiLabelBinarizer()

# Binarize to indicator rows so sklearn's metrics accept them.
expected_b = mlb.fit_transform(to_singleton(expected))
predicted_b = mlb.transform(to_singleton(predicted))

# Frequency table of the training categories.
for (clazz, count) in Counter(category).most_common():
    print("{}\t{}".format(clazz, count))

# todo: use validation.py
print("Accuracy on training: {}".format(accuracy_score(expected_b, predicted_b)))
print("Log los on training: {}".format(log_loss(expected_b, predicted_b)))

# Every test row gets the same indicator row as its prediction.
# NOTE(review): assumes `submission_size` is the test-set row count —
# defined elsewhere, confirm.
test_prediction = np.full((submission_size, len(predicted_b[0])), predicted_b[0])
create_submission(test_prediction, 'baseline_sub.csv')

Exemplo n.º 8
0
    # Apply the transformer to the raw frame to get all derived columns.
    result = transformer.transform_frame(train_frame)

    # Despite the name, this regex SELECTS the columns that are kept as
    # input features (Dates/PdDistrict/DayOfWeek/Resolution/X/Y prefixes).
    not_regex = "^Dates|^PdDistrict|^DayOfWeek|^Resolution|^X|^Y"
    train_transformed = result.filter(regex=not_regex)
    label_transformed = None
    # NOTE(review): `train` looks like a flag of the enclosing function
    # (labels only exist for the training split) — its def is not visible
    # in this chunk, so confirm against the full definition.
    if train: label_transformed = result.filter(regex="^Category")

    return train_transformed, label_transformed

# Build the transformed training features and the label columns.
train_transformed, label_transformed = transform_set('train.csv')

print(train_transformed.columns)
print(label_transformed.columns)

# One-vs-rest logistic regression: one binary model per label column.
clf = OneVsRestClassifier(LogisticRegression(random_state=0))
train_prediction = clf.fit(train_transformed, label_transformed).predict(train_transformed)

# mkdirs(data_path('serialized/model1/'))
# joblib.dump(clf, data_path('serialized/model1/model1.pkl'))

# print(cross_validation(clf, train_transformed, label_transformed))

# Transform the test split with the same pipeline (no labels available).
test_transformed, _ = transform_set("test.csv", train=False)

print("-=======================================")

# Per-class probabilities for every test row become the submission.
test_prediction = clf.predict_proba(test_transformed)
create_submission(test_prediction, "submission1.csv")


Exemplo n.º 9
0
def main():
    """CV-train a model on tabular + spectrum features and save artifacts.

    Pipeline: parse CLI args, load config, read train/test CSVs, join raw
    and fitted 512-point spectrum curves per spectrum file, preprocess the
    tabular features (drop layout coordinates, one-hot two label-encoded
    columns, standardize numerics), then train with CV and write
    submission.csv, oof.csv and output.json to the model directory.
    """
    # =========================================
    # === Settings
    # =========================================
    logger = get_logger(__name__)
    logger.info('Settings')

    # Command-line arguments: config path plus an optional debug flag.
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='./configs/model_1dcnn_0.json')
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    logger.info(f'config: {args.config}')
    logger.info(f'debug: {args.debug}')

    # Load config with a context manager so the handle is closed promptly
    # (the previous `json.load(open(...))` leaked the file object).
    with open(args.config) as f:
        config = json.load(f)
    config.update({'args': {'config': args.config, 'debug': args.debug}})

    if config["model"]["name"] == "lightgbm":
        config["model"]["model_params"]["nthread"] = cpu_count()

    # Create a directory for model output; parents/exist_ok make this safe
    # when the output root is missing or the directory already exists.
    model_no = pathlib.Path(args.config).stem
    model_output_dir = (pathlib.Path(config['dataset']['output_directory']) /
                        model_no)
    model_output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f'model_output_dir: {str(model_output_dir)}')
    logger.debug(f'model_output_dir exists: {model_output_dir.exists()}')
    config.update({'model_output_dir': str(model_output_dir)})

    # =========================================
    # === Loading data
    # =========================================
    logger.info('Loading data')

    # Get train and test
    input_dir = pathlib.Path(config['dataset']['input_directory'])
    train = pd.read_csv(input_dir / 'train.csv')
    test = pd.read_csv(input_dir / 'test.csv')

    # Raw and fitted spectra: one 512-point curve per spectrum file.
    spectrum = pd.read_csv(input_dir / 'spectrum_stack.csv')
    spectrum_fitting = pd.read_csv(input_dir / 'spectrum_fitting_stack.csv')
    wv_cols = [f"wavelength_{i}" for i in range(512)]
    wv_fit_cols = [f"fitting_wavelength_{i}" for i in range(512)]

    # Left-join both spectrum tables onto train/test by file name.
    train_spectrum = pd.merge(train,
                              spectrum,
                              on="spectrum_filename",
                              how="left")
    test_spectrum = pd.merge(test,
                             spectrum,
                             on="spectrum_filename",
                             how="left")
    train_spectrum = pd.merge(train_spectrum,
                              spectrum_fitting,
                              on="spectrum_filename",
                              how="left")
    test_spectrum = pd.merge(test_spectrum,
                             spectrum_fitting,
                             on="spectrum_filename",
                             how="left")

    # Normalize each raw spectrum by its own (row-wise) std deviation.
    train_std = np.std(train_spectrum[wv_cols].values, axis=1, keepdims=True)
    test_std = np.std(test_spectrum[wv_cols].values, axis=1, keepdims=True)
    train_spectrum[wv_cols] = train_spectrum[wv_cols].values / train_std
    test_spectrum[wv_cols] = test_spectrum[wv_cols].values / test_std

    # Keep only the curve columns for the final feature concat below.
    spectrum_cols = wv_cols + wv_fit_cols
    train_spectrum = train_spectrum[spectrum_cols]
    test_spectrum = test_spectrum[spectrum_cols]

    # Target values come straight from the train frame.
    target_column = config['data_type']['target']
    y_train = train[target_column].values

    # =========================================
    # === Loading features
    # =========================================
    logger.info('Loading features')

    x_train, x_test = load_features(config)
    feature_name = x_test.columns
    logger.debug(f'number of features: {len(feature_name)}')

    # =========================================
    # === features preprocess
    # =========================================
    # Stack train on top of test so every transform sees both splits.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    x_total = pd.concat([x_train, x_test]).reset_index(drop=True)

    # Drop the raw layout coordinate columns.
    remove_features = [c for c in x_total.columns if "layout_x" in c]
    remove_features += [c for c in x_total.columns if "layout_y" in c]
    x_total.drop(columns=remove_features, inplace=True)

    # One-hot two label-encoded categoricals, then zero-fill missing.
    x_total = pd.get_dummies(
        x_total, columns=["LabelEncoding_exc_wl", "LabelEncoding_layout_a"])
    x_total.fillna(0, inplace=True)

    # Standardize every column that is not label-encoded (the dummy
    # columns keep the "LabelEncoding_" prefix, so they stay untouched).
    from sklearn.preprocessing import StandardScaler
    numeric_features = [
        c for c in x_total.columns if "LabelEncoding_" not in c
    ]
    sc = StandardScaler()
    x_total[numeric_features] = sc.fit_transform(x_total[numeric_features])

    # Split the stacked frame back into train/test by row count.
    x_train = x_total.iloc[:len(train)]
    x_test = x_total.iloc[len(train):].reset_index(drop=True)

    # Append the spectrum curves as extra columns.
    x_train = pd.concat([x_train, train_spectrum], axis=1)
    x_test = pd.concat([x_test, test_spectrum], axis=1)
    logger.debug(f'number of features with spec in train: {x_train.shape}')
    logger.debug(f'number of features with spec in test: {x_test.shape}')

    # =========================================
    # === Train model and predict
    # =========================================
    logger.info('Train model and predict')

    # Stratified folds configured by the cv section of the config.
    folds_ids = Fold(
        n_splits=config['cv']['n_splits'],
        shuffle=config['cv']['shuffle'],
        random_state=config['cv']['random_state']).get_stratifiedkfold(
            x_train, y_train)

    # Train and predict
    model_name = config['model']['name']
    model_cls = model_map[model_name]
    params = config['model']
    runner = Runner(model_cls, params, model_output_dir,
                    f'Train_{model_cls.__name__}')

    # CV training; evals_result is merged into config so output.json
    # records the scores alongside the settings that produced them.
    oof_preds, evals_result = runner.train_cv(x_train, y_train, folds_ids)
    config.update(evals_result)
    test_preds = runner.predict_cv(x_test)

    # =========================================
    # === Make submission file
    # =========================================
    sub = create_submission(test, test_preds, target_column)
    sub.to_csv(model_output_dir / 'submission.csv', index=False, header=True)

    # =========================================
    # === Save files
    # =========================================
    save_path = model_output_dir / 'output.json'
    json_dump(config, save_path)

    # Out-of-fold predictions, kept for later stacking/analysis.
    pd.DataFrame(oof_preds,
                 columns=["target"]).to_csv(model_output_dir / 'oof.csv',
                                            index=False,
                                            header=True)