import copy
import string
from calendar import monthrange
from collections import Counter

import numpy as np
from scipy import stats


def preprocess_sf_crime(data):
    for entry in data:
        if 'dates' in entry:
            date, time = entry['dates'].split()
            year, month, day = date.split('-')
            # monthrange() returns (first weekday, days in month); compare
            # against the day count, not the tuple.
            if (len(year) != 4 or int(month) > 12 or
                    int(day) > monthrange(int(year), int(month))[1]):
                errmsg = "Invalid date format: %s-%s-%s" % (year, month, day)
                raise Exception(errmsg)
            hour, minute, second = time.split(':')
            # Times are 24-hour, so hours run 0-23 and minutes/seconds 0-59.
            if int(hour) > 23 or int(minute) > 59 or int(second) > 59:
                errmsg = "Invalid time format: %s:%s:%s" % (hour, minute, second)
                raise Exception(errmsg)
            entry['year'] = year
            entry['month'] = month
            entry['hour'] = hour
            del entry['dates']
        if 'x' in entry and is_num(entry['x']):
            entry['x'] = float(entry['x'])
        if 'y' in entry and is_num(entry['y']):
            entry['y'] = float(entry['y'])
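
# preprocess_sf_crime and the functions below rely on two small helpers,
# is_num and checkEqual, whose definitions are not shown in this section.
# The versions here are plausible stand-ins inferred from the call sites,
# not the project's own code.
def is_num(value):
    # True when the value can be parsed as a float.
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False


def checkEqual(values):
    # True when every element of the sequence is identical (or it is empty).
    return len(set(values)) <= 1


def _demo_preprocess_sf_crime():
    # Hypothetical single-row input; the 'dates'/'x'/'y' keys match what the
    # function reads, but the values are made up.
    rows = [{'dates': '2015-05-13 23:53:00', 'x': '-122.42', 'y': '37.77'}]
    preprocess_sf_crime(rows)
    # rows[0] now carries string 'year'/'month'/'hour' fields and float 'x'/'y'.
    return rows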

def get_targets(data, target_field, balance=False, split_multiclass=False):
    if balance:
        # Second pass: group the entries by target value to get class statistics.
        target_groups = {}
        for entry in data:
            if target_field and target_field in entry:
                # Balancing classes
                target_val = entry[target_field]
                if target_val not in target_groups:
                    target_groups[target_val] = []
                target_groups[target_val].append(entry)
        # Resample to balance out the classes: duplicate one random entry of
        # the smallest class at a time until all class counts are equal.
        while not checkEqual(map(lambda x: len(target_groups[x]), target_groups)):
            class_summary = map(lambda x: [x, len(target_groups[x])], target_groups)
            min_target = sorted(class_summary, key=lambda x: x[1])[0][0]
            target_groups[min_target].append(
                copy.deepcopy(np.random.choice(target_groups[min_target]))
            )
        # Add the balanced groups back to the original data.
        new_data = []
        for t in target_groups:
            new_data += copy.deepcopy(target_groups[t])
        data = new_data

    # Generate the array of targets.
    targets = []
    # Sentinel entry so the first real class label maps to 0.0.
    target_dict = {'Isurehopethiswillneverbearealclass': -1.0}
    for entry in data:
        if target_field and target_field in entry:
            value = entry[target_field]
            if is_num(value):
                # Two-class / numeric targets are used as-is.
                targets.append(float(value))
            else:
                # Multi-class and text-field targets map to 0.0, 1.0, 2.0, ...
                if value not in target_dict:
                    target_dict[value] = max(target_dict.values()) + 1.0
                targets.append(target_dict[value])
            del entry[target_field]
    return data, np.array(targets)
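
def _demo_get_targets():
    # Hypothetical rows; the field name and values are made up for
    # illustration and are not from the SF crime data itself.
    rows = [{'category': 'LARCENY', 'x': 1.0},
            {'category': 'LARCENY', 'x': 2.0},
            {'category': 'ASSAULT', 'x': 3.0}]
    # With balance=True the minority class is oversampled to two entries, so
    # four rows come back with targets such as [0.0, 0.0, 1.0, 1.0] (the class
    # codes depend on dict iteration order).
    rows, targets = get_targets(rows, 'category', balance=True)
    return rows, targets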

def fit(self, data, targets=[]):
    # Collect the numeric values observed for each stated feature.
    numeric_fields = {}
    for row in data:
        for field in self.features:
            if field in row and is_num(row[field]):
                if field not in numeric_fields:
                    numeric_fields[field] = []
                numeric_fields[field].append(float(row[field]))
    # Compute the requested quantile boundaries for each percentile field;
    # skip fields for which no numeric values were seen.
    for field in self.features:
        if field in self.percentiles and field in numeric_fields:
            self.percentiles[field] = stats.mstats.mquantiles(
                numeric_fields[field],
                prob=[float(x) / float(self.features[field])
                      for x in range(1, self.features[field] + 1)]
            )

def transform(self, data, targets=[]):
    new_data = []
    # Iterate through rows...
    for row in data:
        new_row = copy.deepcopy(row)
        # Stated numeric features...
        for field in self.percentiles:
            # If the current field is in the row and it is non-empty...
            if field in new_row and new_row[field] and is_num(new_row[field]):
                current_val = float(new_row[field])
                new_row[field] = current_val
                # Locate the correct percentile: the smallest boundary that
                # contains the value.
                for p in self.percentiles[field]:
                    if current_val <= p:
                        new_row[field + '_percentile'] = "%s" % p
                        break
                else:
                    # Value exceeds every boundary; fall back to the largest one.
                    new_row[field + '_percentile'] = "%s" % p
            else:
                new_row[field + '_percentile'] = ""
        new_data.append(new_row)
    return new_data
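
# fit and transform above read two attributes from self: self.features maps a
# field name to the number of quantile bins requested, and self.percentiles
# maps a field name to the boundaries fit computes. The enclosing class is not
# shown in this section, so the container and demo below are only a minimal
# sketch under those assumptions.
class _PercentileConfigSketch(object):
    def __init__(self, features):
        self.features = features                            # field -> bin count
        self.percentiles = dict((f, []) for f in features)  # field -> boundaries


def _demo_percentiles():
    rows = [{'x': '%d' % i} for i in range(1, 101)]
    cfg = _PercentileConfigSketch({'x': 4})
    fit(cfg, rows)               # fills cfg.percentiles['x'] with quartile cut points
    return transform(cfg, rows)  # each row gains an 'x_percentile' bucket label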

def preprocessing_titanic(data, ignored=[], target_field=''):
    table = string.maketrans("", "")
    # First pass: preprocess the input data.
    for entry in data:
        # Ignoring features
        for field in ignored:
            if field in entry and field != target_field:
                del entry[field]
        # Deriving new features
        if 'age' not in ignored:
            entry['age_est'] = ''
            if 'age' in entry and entry['age']:
                entry['age_est'] = float(entry['age'])
                # Ages ending in .5 are estimates; keep those only in age_est.
                if entry['age'].endswith('.5'):
                    entry['age'] = ''
                else:
                    entry['age'] = float(entry['age'])
            else:
                entry['age'] = ''
        if 'sibsp' not in ignored and 'parch' not in ignored:
            entry['family'] = ''
            if 'sibsp' in entry and 'parch' in entry and entry['sibsp'] and entry['parch']:
                entry['family'] = "%s" % (int(entry['sibsp']) + int(entry['parch']))
        if 'cabin' not in ignored:
            entry['cabin_deck'] = ''
            entry['cabin_prefix'] = ''
            entry['cabin_port_starboard'] = ''
            if 'cabin' in entry:
                if entry['cabin']:
                    cabins = entry['cabin'].split()
                    # Most common deck letter among the listed cabins.
                    entry['cabin_deck'] = Counter(map(lambda x: x[0], cabins)).most_common(1)[0][0]
                    single_letter = filter(lambda x: len(x) == 1, cabins)
                    entry['cabin_prefix'] = single_letter[0] if single_letter else ''
                    # Odd/even room numbers hint at the port/starboard side.
                    rooms = filter(lambda x: len(x) > 1 and is_num(x[1:]), cabins)
                    port_starboard = Counter(
                        map(lambda x: "%s" % (float(x[1:]) % 2), rooms)
                    ).most_common(1)[0][0] if rooms else ''
                    entry['cabin_port_starboard'] = port_starboard
                del entry['cabin']
        if 'ticket' not in ignored:
            entry['ticket_number'] = ''
            entry['ticket_text'] = ''
            if 'ticket' in entry:
                if entry['ticket']:
                    split = entry['ticket'].split()
                    if len(split) > 1:
                        number = split[-1]
                        text = ''.join(split[:-1]).translate(table, string.punctuation).lower()
                    elif len(split) == 1:
                        number = split[0]
                        text = ''
                    else:
                        number = ''
                        text = ''
                    entry['ticket_number'] = number
                    entry['ticket_text'] = text
                del entry['ticket']
        if 'name' not in ignored:
            if 'name' in entry:
                if entry['name']:
                    # One-hot encode the individual name tokens.
                    name_tokens = entry['name'].translate(table, string.punctuation).lower().split()
                    for t in name_tokens:
                        entry['name=' + t] = 1.0
                del entry['name']
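
# A small usage sketch for preprocessing_titanic (Python 2, like the rest of
# the file). The record below is a made-up passenger; only the lowercase field
# names ('age', 'sibsp', 'parch', 'cabin', 'ticket', 'name', ...) come from the
# function itself, everything else is illustrative. The function mutates the
# entries in place and returns nothing.
def _demo_preprocessing_titanic():
    passenger = {'name': 'Braund, Mr. Owen Harris', 'age': '22', 'sibsp': '1',
                 'parch': '0', 'ticket': 'A/5 21171', 'cabin': 'C85 C87',
                 'fare': '7.25', 'survived': '1'}
    preprocessing_titanic([passenger], ignored=['fare'], target_field='survived')
    # passenger now has e.g. age_est=22.0, family='1', cabin_deck='C',
    # ticket_number='21171', ticket_text='a5', and name=<token> indicator fields.
    return passenger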

test_file = 'data/test.csv'

training, headers = csv_to_row_dicts(training_file, display=True, row_limit=0)
# test, _ = csv_to_row_dicts(test_file, display=True)

training_set = training.values()
# test_set = test.values()
if 0:
    full_set = training_set + test_set

# Check whether each address always maps to the same (rounded) coordinates.
Addresses = {}
Mismatches = []
N_mismatches = 0
for entry in training_set:
    add_data = {'address': '', 'x': '', 'y': ''}
    for key in entry:
        if key in ['address', 'x', 'y']:
            if is_num(entry[key]):
                add_data[key] = "%.2f" % float(entry[key])
            else:
                add_data[key] = "%s" % entry[key]
    if add_data['address'] not in Addresses:
        Addresses[add_data['address']] = [add_data]
    elif add_data not in Addresses[add_data['address']]:
        Addresses[add_data['address']].append(add_data)
    else:
        N_mismatches += 1
        # Record each offending address once.
        if add_data['address'] not in Mismatches:
            Mismatches.append(add_data['address'])
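
# csv_to_row_dicts is a project helper whose definition does not appear in this
# section. Judging from the call sites above, it returns (rows, headers), where
# rows maps a row index to a dict of lowercased column name -> value. The
# stand-in below is a rough sketch under those assumptions (the display and
# row_limit behaviour in particular is guessed), written for the same Python 2
# environment as the rest of the file.
import csv

def _csv_to_row_dicts_sketch(path, display=False, row_limit=0):
    rows = {}
    with open(path, 'rb') as f:
        reader = csv.reader(f)
        headers = [h.strip().lower() for h in next(reader)]
        for i, line in enumerate(reader):
            if row_limit and i >= row_limit:
                break
            rows[i] = dict(zip(headers, line))
            if display and i == 0:
                print(rows[i])
    return rows, headers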