Exemplo n.º 1
0
    # 'parch',
    # 'sibsp',
    # 'embarked',
    # 'fare',
    # 'age',
    # 'cabin',
    # 'ticket',
    # 'name',
]

# Preprocess Data
# NOTE(review): return values are ignored, so preprocessing_titanic
# evidently mutates each record set in place -- confirm against its definition.
preprocessing_titanic(analysis_set, ignore_fields, target_field)
preprocessing_titanic(cheat_set, ignore_fields, target_field)

# Get Targets
# balance=False keeps the original class distribution. For the held-out
# "cheat" set only the target labels are kept (feature rows discarded via _).
balanced_set, targets = get_targets(analysis_set, target_field, balance=False)
_,      cheat_targets = get_targets(cheat_set, target_field, balance=False)

# Python 2 print statements: report the mean target value of each set
# (for 0/1 labels this doubles as the positive-class rate).
print "\nTarget mean = %s" % np.average(targets)
print "Cheat Target mean = %s\n" % np.average(cheat_targets)

# Dense (non-sparse) vectorizer turning dict-of-fields records into a
# numeric feature matrix.
DV = feature_extraction.DictVectorizer(sparse=False)

# CF = CollaborativeFilter()
# collaborated = CF.fit_transform(balanced_set, targets)

# Feature-engineering pipeline over custom project transformers.
# NOTE(review): this statement is truncated in this excerpt -- the steps
# list (and the closing brackets) continues past the chunk boundary.
PL = pipeline.Pipeline(steps=[
    ("collab",          CollaborativeFilter(L=1)),
    ("pctcats",         PercentileCategorizer({'fare': 10, 'age': 10, 'ticket_number': 10})),
    # ("lowcount",        LowCountTrimmer(threshold=0, criteria='field')),
Exemplo n.º 2
0
# Target column and columns excluded from the feature set for the
# SF-crime dataset; the commented-out entries are fields that were
# previously ignored and have since been re-enabled.
target_field = 'category'
ignore_fields = ['descript', 'resolution', 'id',
    # 'dayofweek',
    # 'pddistrict',
    # 'dates',
    'x',
    'y',
    'address',
]

# Drop the ignored columns. NOTE(review): the return value is unused, so
# remove_ignored presumably mutates training_set in place -- confirm.
remove_ignored(training_set, ignore_fields)
# remove_ignored(test_set, ignore_fields)

# Project-local feature preprocessing (in place, by the same convention).
preprocess_sf_crime(training_set)

# Separate feature records from their target labels.
train_wo_tgts, train_targets_wo_multiclass = get_targets(training_set, target_field)
train_targets = train_targets_wo_multiclass
# train_targets = split_multiclass(train_targets_wo_multiclass)

# Going away from pipeline - not enough need to chain things together.
print "DictVectorizer"
# Dense one-hot / numeric encoding of the dict records into a matrix.
DV = feature_extraction.DictVectorizer(sparse=False)
train_inputs = DV.fit_transform(train_wo_tgts)
print train_inputs.shape

# Single 50/50 train / cross-validation split driven by the target labels.
# NOTE(review): Splitter is a project wrapper (n_iter/test_size suggest
# sklearn's StratifiedShuffleSplit API) yielding index arrays -- the
# .shape prints below imply tr/cv are numpy arrays; confirm at its definition.
SSS = Splitter(train_targets_wo_multiclass, n_iter=1, test_size=0.5)
for tr, cv in SSS:
    print "Split"
    train_idx, cv_idx = tr, cv
    print train_idx.shape
    print cv_idx.shape