# NOTE(review): this chunk reached us flattened onto a single physical line,
# which is not valid Python. Statement boundaries and indentation below are
# reconstructed from the token stream — verify against the original script.
#
# `entry`, `ignored`, and `table` are not defined in this chunk; presumably
# this fragment sits inside a per-row loop over the dataset and `table` is a
# translate table (likely None in Python 2 usage). TODO: confirm upstream.
if 'name' not in ignored:
    if 'name' in entry:
        if entry['name']:
            # Python-2 str.translate(table, deletechars): strip punctuation,
            # lowercase, and split the passenger name into word tokens.
            name_tokens = entry['name'].translate(table, string.punctuation).lower().split()
            # One-hot encode each name token as its own binary feature,
            # then drop the raw free-text field.
            for t in name_tokens:
                entry['name=' + t] = 1.0
            del entry['name']

target_field = 'survived'
training_file = 'data/train.csv'
test_file = 'data/test.csv'
other_file = 'data/titanic3.csv'

# Load the Kaggle train/test splits plus the full titanic3 reference set,
# dropping the free-text 'name' and 'ticket' columns at load time.
training, headers = csv_to_row_dicts(training_file, ['name', 'ticket'])
test, _ = csv_to_row_dicts(test_file, ['name', 'ticket'])
other, _ = csv_to_row_dicts(other_file, ['name', 'ticket'])

# Cross-reference the Kaggle splits against titanic3; the matched test rows
# carry labels, hence the "cheat" naming.
compare_data(other, training)
cheat_test = compare_data(other, test)

analysis_set = (training.values())
cheat_set = (cheat_test.values())
full_set = analysis_set + cheat_set

# Delete Uninteresting Variables
ignore_fields = ['boat', 'home.dest', 'body', 'passengerid', 'survived',
                 # 'sex',
                 # 'pclass',
                 # 'parch',
                 ]  # NOTE(review): list was truncated in this chunk — the
                    # closing bracket is reconstructed; confirm no trailing
                    # entries were lost.
# NOTE(review): this chunk reached us flattened onto a single physical line;
# statement boundaries and indentation below are reconstructed — verify
# against the original script.
from ml_utils.data.sklearn_compatible import PercentileCategorizer
from ml_utils.learners.utils import multiclass_prediction
from ml_utils.metrics.metrics import logloss
from ml_utils.analysis.analysis import val_frequency_hist, analyze_date_format, write_val_hist
from py_utils.utils import is_num
import string, csv, re
from calendar import monthrange
import numpy as np
from sklearn import pipeline, feature_extraction, ensemble, linear_model, tree
# NOTE(review): sklearn.cross_validation was deprecated in 0.18 and removed
# in 0.20; the modern equivalent of LabelShuffleSplit is
# sklearn.model_selection.GroupShuffleSplit. Left unchanged because the
# target sklearn version for this script is unknown — TODO: confirm and
# migrate.
from sklearn.cross_validation import LabelShuffleSplit as Splitter

training_file = 'data/train.csv'
test_file = 'data/test.csv'

# Load the training rows as dicts keyed by row id (csv_to_row_dicts is a
# project helper; row_limit=0 presumably means "no limit" — TODO confirm).
training, headers = csv_to_row_dicts(training_file, display=True, row_limit=0)
# test, _ = csv_to_row_dicts(test_file, display=True)
training_set = training.values()
# test_set = test.values()
if 0:
    # Dead branch: `test_set` is commented out above, so this would raise
    # NameError if ever enabled. Kept (disabled) to preserve behavior.
    full_set = training_set + test_set

# Accumulators for the address-matching pass below.
Addresses = {}
Mismatches = []
N_mismatches = 0

for entry in training_set:
    add_data = {'address': '', 'x': '', 'y': ''}
    for key in entry:
        if key in ['address', 'x', 'y']:
            if is_num(entry[key]):
                # Normalize numeric coordinates to 2-decimal strings so
                # they can be compared/bucketed as text.
                add_data[key] = "%.2f" % float(entry[key])
    # NOTE(review): the outer loop body appears truncated at this point in
    # the chunk — the code that consumes add_data / fills Addresses,
    # Mismatches, N_mismatches is not visible here.