# 'parch', # 'sibsp', # 'embarked', # 'fare', # 'age', # 'cabin', # 'ticket', # 'name', ] # Preprocess Data preprocessing_titanic(analysis_set, ignore_fields, target_field) preprocessing_titanic(cheat_set, ignore_fields, target_field) # Get Targets balanced_set, targets = get_targets(analysis_set, target_field, balance=False) _, cheat_targets = get_targets(cheat_set, target_field, balance=False) print "\nTarget mean = %s" % np.average(targets) print "Cheat Target mean = %s\n" % np.average(cheat_targets) DV = feature_extraction.DictVectorizer(sparse=False) # CF = CollaborativeFilter() # collaborated = CF.fit_transform(balanced_set, targets) PL = pipeline.Pipeline(steps=[ ("collab", CollaborativeFilter(L=1)), ("pctcats", PercentileCategorizer({'fare': 10, 'age': 10, 'ticket_number': 10})), # ("lowcount", LowCountTrimmer(threshold=0, criteria='field')),
# --- SF-crime training setup: preprocess, vectorize, and split ---
# The label column to predict.
target_field = 'category'
# Columns stripped from the training records before vectorizing. The
# commented entries are features toggled in/out during experiments.
ignore_fields = ['descript',
                 'resolution',
                 'id',
                 # 'dayofweek',
                 # 'pddistrict',
                 # 'dates',
                 'x',
                 'y',
                 'address',
                 ]

# Drop the ignored columns in place (remove_ignored is project-local).
remove_ignored(training_set, ignore_fields)
# remove_ignored(test_set, ignore_fields)
# Project-local feature engineering on the raw records.
preprocess_sf_crime(training_set)

# Separate features from labels. train_targets_wo_multiclass keeps the
# single multiclass label column; the split_multiclass expansion into
# one-vs-rest columns is currently disabled.
train_wo_tgts, train_targets_wo_multiclass = get_targets(training_set, target_field)
train_targets = train_targets_wo_multiclass
# train_targets = split_multiclass(train_targets_wo_multiclass)

# Going away from pipeline - not enough need to chain things together.
print "DictVectorizer"
# One-hot encode the dict-of-features records into a dense ndarray
# (sparse=False returns an ndarray rather than a scipy sparse matrix).
DV = feature_extraction.DictVectorizer(sparse=False)
train_inputs = DV.fit_transform(train_wo_tgts)
print train_inputs.shape

# Single stratified-style 50/50 train/CV split over the multiclass labels
# (Splitter is project-local; presumably wraps an sklearn shuffle-split —
# confirm). With n_iter=1 the loop runs once, capturing the index arrays.
SSS = Splitter(train_targets_wo_multiclass, n_iter=1, test_size=0.5)
for tr, cv in SSS:
    print "Split"
    train_idx, cv_idx = tr, cv
    print train_idx.shape
    print cv_idx.shape