Example #1
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from kaggle_tools.feature_extraction import FeatureColumnsExtractor
import settings
# StringToInt, HighOrderFeatures and DataFrameMapper are project-local
# transformers; their import paths are not shown in these excerpts.

def get_interactions_features_pipeline():
    categorical_preprocessors = []
    for feature in settings.CATEGORICAL:
        categorical_preprocessors.append((
            feature,
            'C',
            Pipeline([
                ('string-to-int', StringToInt()),
                # ('higher', HighOrderFeatures())
                # ('one-hot', OneHotEncoder())
            ])))

    # features = []
    # for feature in settings.FEATURES:
    #     if feature in settings.CATEGORICAL:
    #         features.append(feature + '_C')
    #     else:
    #         features.append(feature)
    # print(features)

    # all_preprocessors = []
    # for feature in features:
    #     all_preprocessors.append(
    #         (feature, '', OneHotEncoder())
    #     )
    # print(categorical_preprocessors[0])
    pipeline = Pipeline([
        ('original', FeatureColumnsExtractor(settings.FEATURES)),
        ('string_to_int->one_hot',
         DataFrameMapper(categorical_preprocessors,
                         return_df=True,
                         rest_unchanged=False)),
        ('higher-order', HighOrderFeatures()),
        ('one-hot', OneHotEncoder()),
        # ('high-correlations', HighCorrelationFilter(threshold=0.95)), #0.3507
        # ('one-hot', DataFrameMapper(
        #     all_preprocessors
        # , return_df=True, rest_unchanged=True))
    ])
    return pipeline
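HighOrderFeatures itself is not shown in these excerpts. Below is a minimal sketch of the idea it appears to implement, assuming it forms pairwise interactions of the integer-coded categorical columns so that the subsequent OneHotEncoder can expand each pair into its own indicator columns; the class name PairwiseInteractions and its details are hypothetical:

import numpy as np
from itertools import combinations
from sklearn.base import BaseEstimator, TransformerMixin

class PairwiseInteractions(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # cardinality of each column, used to build a collision-free
        # combined code for every pair of values
        self.cards_ = np.asarray(X).max(axis=0) + 1
        return self

    def transform(self, X):
        X = np.asarray(X)
        pairs = [X[:, i] * self.cards_[j] + X[:, j]
                 for i, j in combinations(range(X.shape[1]), 2)]
        return np.column_stack([X] + pairs)

For example, two columns holding values (1, 2) with cardinalities (3, 4) map to the single code 1 * 4 + 2 = 6, which then one-hot encodes to a distinct indicator for that particular value combination.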
Example #2
from __future__ import division, print_function

import time
import pandas as pd
from sklearn.pipeline import FeatureUnion
from sklearn.cross_validation import KFold

from kaggle_tools.feature_extraction import FeatureColumnsExtractor
import settings
import feature_sets  # the module shown in Example #6
# `nonlinearity` and get_estimation_pipeline() are defined elsewhere in
# this script.

if __name__ == '__main__':
    orig_dataset = pd.read_csv(settings.TRAIN_FILE)
    # sample_mask = np.zeros((orig_dataset.shape[0],), dtype=np.bool_)
    # sample_idx = sample_without_replacement(orig_dataset.shape[0], orig_dataset.shape[0] * 1.0, random_state=42)
    # sample_mask[sample_idx] = True

    before = time.time()
    fcols = [col for col in orig_dataset.columns if col in settings.FEATURES]
    catconversion = FeatureUnion([feature_sets.CATEGORICAL_CONVERSION],
                                 n_jobs=1)

    dataset = pd.DataFrame(data=catconversion.fit_transform(orig_dataset),
                           columns=fcols,
                           index=orig_dataset.index)
    target = FeatureColumnsExtractor(
        settings.TARGET).fit_transform(orig_dataset).apply(nonlinearity)

    print('original dataset shape:', dataset.shape)

    # union = get_feature_union()
    # dataset = union.fit_transform(dataset, target)

    print('preprocessed dataset shape:', dataset.shape)
    print('preprocessing time: ', time.time() - before)

    # cv = KFold(len(target), n_folds=10, random_state=2, shuffle=False)
    cv = KFold(len(target), n_folds=4, random_state=2, shuffle=False)
    # from src.cross_validation import RepeatedKFold
    # cv = RepeatedKFold(len(target), n_folds=4, n_repeats=2, random_state=3)

    estimators_pipeline = get_estimation_pipeline()
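    # The excerpt is cut off at this point. Given the `cv` object and the
    # pipeline just built, the natural continuation is a cross-validated
    # scoring run; the sketch below is hypothetical, not the original call.
    from sklearn.cross_validation import cross_val_score
    scores = cross_val_score(estimators_pipeline, dataset, target,
                             cv=cv, scoring='mean_squared_error', n_jobs=1)
    print('CV score:', scores.mean(), '+/-', scores.std())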
Example #3
from sklearn.svm import LinearSVR

def get_estimation_pipeline():
    # Reconstructed head: only the trailing arguments and `return` survived
    # extraction. LinearSVR is an assumption here, consistent with the
    # 'squared_epsilon_insensitive' loss and with the SVM submission file
    # written below (settings.SUBMIT_SVM_REDUCED).
    pipeline = Pipeline([
        ('svr', LinearSVR(random_state=22,
                          loss='squared_epsilon_insensitive'))])
    return pipeline


def overall_pipeline():
    return Pipeline([
        ('features', get_feature_union()),
        # ('filters', get_filters()),
        ('estimators', get_estimation_pipeline())
    ])


if __name__ == '__main__':
    original_dataset = pd.read_csv(settings.TRAIN_FILE)
    target = FeatureColumnsExtractor(settings.TARGET).fit_transform(
        original_dataset).apply(lambda x: np.sqrt(x))

    pipeline = overall_pipeline()
    pipeline.fit(original_dataset, target)

    original_test_set = pd.read_csv(settings.TEST_FILE)
    predictions = pipeline.predict(original_test_set)

    output = pd.DataFrame({
        'Id': original_test_set['Id'],
        'Hazard': predictions
    })
    output.to_csv(settings.SUBMIT_SVM_REDUCED,
                  index=False,
                  header=True,
                  columns=['Id', 'Hazard'])
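Note that Example #3 fits on sqrt(Hazard) but writes the raw predictions without squaring back. Since x -> x**2 is monotonic for non-negative predictions, this leaves any rank-based evaluation unchanged; if predictions on the original Hazard scale were ever needed, the inverse transform is one line (a sketch, not part of the original script):

predictions_hazard_scale = np.square(np.clip(predictions, 0, None))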
Example #4
from __future__ import division, print_function
# noinspection PyUnresolvedReferences
from py3compatibility import *

from kaggle_tools.feature_extraction import FeatureColumnsExtractor
import pandas as pd
import numpy as np
import settings
submission_files = [settings.SUBMIT_RIDGE_SQRT_REDUCED, settings.SUBMIT_RIDGE_LOG]  # , settings.SUBMIT_RIDGE_DIRECT]
weights = [0.5, 0.5]  # , 0.3]

dfs = []
for f in submission_files:
    df = pd.read_csv(f)
    dfs.append(FeatureColumnsExtractor(settings.TARGET).fit_transform(df).values)

submissions = np.array(dfs).T
print(submissions)


stacked_predictions = np.average(submissions, axis=1, weights=weights)
print(stacked_predictions)
# the Id column is identical across submission files, so the last frame
# read in the loop is reused here
output = pd.DataFrame({'Id': df['Id'],
                       'Hazard': stacked_predictions})
output.to_csv(settings.SUBMIT_FILE_STACKED,
              index=False,
              header=True,
              columns=['Id', 'Hazard'])
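Because the blended files come from models trained on differently transformed targets (sqrt vs. log), their raw scores live on different scales, and a plain average lets the wider-scaled file dominate. For a rank-based metric, averaging ranks instead of raw values sidesteps this; a hedged sketch using scipy, not part of the original script:

from scipy.stats import rankdata

rank_averaged = np.average(
    np.column_stack([rankdata(col) for col in submissions.T]),
    axis=1, weights=weights)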
Example #5
from __future__ import division
from __future__ import print_function

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from kaggle_tools.feature_extraction import FeatureColumnsExtractor

import settings
from src.main import get_preprocessing_pipeline, get_estimation_pipeline, get_interactions_features_pipeline

dataset = pd.read_csv(settings.TRAIN_FILE)
target = FeatureColumnsExtractor(settings.TARGET).fit_transform(dataset)

# dataset = get_preprocessing_pipeline().fit_transform(dataset)
dataset = get_interactions_features_pipeline().fit_transform(dataset)

if hasattr(dataset, 'toarray'):
    dataset = dataset.toarray()

print(dataset.shape)
devs = np.std(dataset, axis=0)
plt.bar(np.arange(devs.shape[0]), np.sort(devs))
plt.show()
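The sorted bar plot makes near-constant columns easy to spot; after one-hot encoding, rare category combinations show up as columns with tiny standard deviation. If the goal is to drop them rather than just inspect them, sklearn's VarianceThreshold does so directly (a sketch, threshold chosen arbitrarily):

from sklearn.feature_selection import VarianceThreshold

reduced = VarianceThreshold(threshold=1e-3).fit_transform(dataset)
print('after variance filtering:', reduced.shape)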
Example #6
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from kaggle_tools.feature_extraction import FeatureColumnsExtractor
import settings
# StringToInt and TransformFeatureSet are project-local transformers;
# their import paths are not shown in these excerpts.


class Identity(BaseEstimator, TransformerMixin):
    """Pass-through transformer: returns X unchanged. Useful as a no-op
    branch inside a Pipeline or FeatureUnion."""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X


CATEGORICAL_CONVERSION = ('Categorical-Conversion',
                          Pipeline([
                              ('original',
                               FeatureColumnsExtractor(settings.FEATURES)),
                              ('StringToInt',
                               TransformFeatureSet(settings.CATEGORICAL,
                                                   transformer=StringToInt())),
                          ]))

DIRECT = (
    'Direct',
    Pipeline([
        ('identity', TransformFeatureSet(settings.FEATURES, Identity()))
        # ('original', FeatureColumnsExtractor(settings.FEATURES)),
        # ('StringToInt', TransformFeatureSet(settings.CATEGORICAL, transformer=StringToInt())),
    ]))
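Both tuples are already in the (name, transformer) form that FeatureUnion expects, which is exactly how Example #2 consumes CATEGORICAL_CONVERSION. A short usage sketch combining both branches (hypothetical, not in the original):

from sklearn.pipeline import FeatureUnion

union = FeatureUnion([CATEGORICAL_CONVERSION, DIRECT], n_jobs=1)
# combined = union.fit_transform(raw_dataframe)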
Example #7
from __future__ import division, print_function

import pandas as pd
import numpy as np
from kaggle_tools.feature_extraction import FeatureColumnsExtractor
from src.main import get_whole_dataset, get_feature_union
import settings

orig_dataset = pd.read_csv(settings.TRAIN_FILE)
target = FeatureColumnsExtractor(settings.TARGET).fit_transform(orig_dataset)

# np.empty(()) is passed as a dummy target; the transformers ignore y
dataset = get_feature_union().fit_transform(get_whole_dataset(), np.empty(()))

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.backends.backend_pdf import PdfPages
pp = PdfPages('summary.pdf')

for column in sorted(dataset.columns):
    # if column == settings.TARGET:
    #     continue

    # column = 'Age'
    fig = plt.figure()
    title_string = "'{0}' summary".format(column)
    if column in settings.CATEGORICAL:
        title_string += ' CATEGORICAL'
    fig.suptitle(title_string, fontsize=14)
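    # The excerpt ends here. Given the PdfPages object opened above, the
    # loop presumably renders each column and appends the figure to
    # summary.pdf; a hypothetical completion (the actual per-column plot
    # is not shown, a histogram is assumed):
    ax = fig.gca()
    ax.hist(np.asarray(dataset[column], dtype=float), bins=50)
    pp.savefig(fig)
    plt.close(fig)

pp.close()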