示例#1
0
def preprocess_sf_crime(data):
    """Split each entry's timestamp into parts and coerce coordinates, in place.

    For every entry that has a ``'dates'`` key holding a
    ``"YYYY-MM-DD HH:MM:SS"`` string, the string is validated, the ``'year'``,
    ``'month'`` and ``'hour'`` keys are set to the corresponding substrings
    (kept as strings, e.g. ``'05'``), and ``'dates'`` is removed.  Values
    stored under ``'x'`` and ``'y'`` are converted to ``float`` when the
    ``is_num`` helper accepts them.

    Args:
        data: iterable of mutable dict entries; each entry is modified in
            place.

    Returns:
        None.

    Raises:
        ValueError: if the timestamp does not split into the expected parts,
            a component is not an integer, or a date/time component is out of
            range.
    """
    for entry in data:
        if 'dates' in entry:
            date, time = entry['dates'].split()
            year, month, day = date.split('-')

            month_num = int(month)
            day_num = int(day)
            # monthrange() returns (weekday_of_first_day, days_in_month); only
            # the second item is the upper bound for the day.  The month is
            # range-checked first so monthrange() never sees an invalid month.
            date_ok = (
                len(year) == 4
                and 1 <= month_num <= 12
                and 1 <= day_num <= monthrange(int(year), month_num)[1]
            )
            if not date_ok:
                errmsg = "Invalid date format: %s-%s-%s" % (year, month, day)
                raise ValueError(errmsg)

            hour, minute, second = time.split(':')

            # 24-hour clock: hours 0-23, minutes and seconds 0-59.
            time_ok = (
                0 <= int(hour) <= 23
                and 0 <= int(minute) <= 59
                and 0 <= int(second) <= 59
            )
            if not time_ok:
                errmsg = "Invalid time format: %s:%s:%s" % (hour, minute, second)
                raise ValueError(errmsg)

            entry['year'] = year
            entry['month'] = month

            entry['hour'] = hour
            del entry['dates']

        if 'x' in entry and is_num(entry['x']):
            entry['x'] = float(entry['x'])

        if 'y' in entry and is_num(entry['y']):
            entry['y'] = float(entry['y'])
示例#2
0
def get_targets(data, target_field, balance=False, split_multiclass=False):
    """Separate the target column from ``data`` and encode it numerically.

    Args:
        data: iterable of dict entries.
        target_field: key holding the target value; entries without it
            contribute no target.  A falsy ``target_field`` yields no targets.
        balance: when true, entries are grouped by target value and every
            group is oversampled (random draws with replacement, using the
            global ``np.random`` state) until all groups are as large as the
            largest one.  The returned data is then a deep copy containing
            only the entries that carried ``target_field``.
        split_multiclass: accepted for interface compatibility; not used.

    Returns:
        ``(data, targets)`` where ``targets`` is a NumPy array.  Values that
        the ``is_num`` helper accepts are stored as ``float(value)``; any other
        value is mapped to a float class code (0.0, 1.0, ...) in order of
        first appearance.  ``target_field`` is deleted from every entry of the
        returned data (when ``balance`` is false these are the caller's own
        entries).
    """
    if balance:
        # Group the labelled entries by target value.
        target_groups = {}
        for entry in data:
            if target_field and target_field in entry:
                target_groups.setdefault(entry[target_field], []).append(entry)

        # Oversample each group up to the size of the largest one.  Growing
        # every group straight to the target size avoids re-scanning all the
        # groups after each single appended sample.
        group_sizes = [len(members) for members in target_groups.values()]
        largest = max(group_sizes) if group_sizes else 0
        for members in target_groups.values():
            while len(members) < largest:
                pick = members[np.random.randint(len(members))]
                members.append(copy.deepcopy(pick))

        # Rebuild the data set from (copies of) the balanced groups.
        new_data = []
        for members in target_groups.values():
            new_data += copy.deepcopy(members)

        data = new_data

    # Generate the array of targets.
    targets = []
    class_codes = {}
    for entry in data:
        if target_field and target_field in entry:
            value = entry[target_field]
            if is_num(value):
                # Numeric targets are used as-is.
                targets.append(float(value))
            else:
                # Text targets get the next free code; len() gives 0.0, 1.0,
                # ... without reserving any label as a sentinel.
                if value not in class_codes:
                    class_codes[value] = float(len(class_codes))

                targets.append(class_codes[value])

            del entry[target_field]

    return data, np.array(targets)
示例#3
0
    def fit(self, data, targets=[]):
        """Compute quantile cut points for the tracked numeric features.

        Every row of ``data`` is scanned and, for each field named in
        ``self.features``, the values accepted by the ``is_num`` helper are
        collected as floats.  Then, for each feature that already has a key in
        ``self.percentiles``, that entry is replaced by the result of
        ``stats.mstats.mquantiles`` evaluated at the probabilities
        ``1/n, 2/n, ..., n/n`` where ``n = self.features[field]``.

        ``targets`` is accepted but not used.  Returns ``None``.

        Raises:
            KeyError: if a feature present in ``self.percentiles`` had no
                numeric value in ``data``.
        """
        samples = {}
        for record in data:
            for name in self.features:
                if name not in record or not is_num(record[name]):
                    continue
                samples.setdefault(name, []).append(float(record[name]))

        for name in self.features:
            if name not in self.percentiles:
                continue

            # Look the samples up first so a missing field fails before any
            # probability level is computed.
            values = samples[name]
            bins = self.features[name]
            levels = [float(step) / float(bins) for step in range(1, bins + 1)]
            self.percentiles[name] = stats.mstats.mquantiles(values, prob=levels)
示例#4
0
    def transform(self, data, targets=[]):
        """Return copies of the rows annotated with ``<field>_percentile`` keys.

        For every field in ``self.percentiles`` each output row gets a
        ``field + '_percentile'`` entry:

        * if the row holds a non-empty value that the ``is_num`` helper
          accepts, the value is replaced by its ``float`` and the entry is the
          string form of the first cut point that is >= the value (or of the
          last cut point when the value exceeds them all);
        * otherwise, or when no cut points are stored for the field, the
          entry is ``""``.

        The input rows are deep-copied and never modified.  ``targets`` is
        accepted but not used.

        Returns:
            list of the new row dicts, in input order.
        """
        new_data = []
        for row in data:
            new_row = copy.deepcopy(row)
            for field in self.percentiles:
                label_key = field + '_percentile'
                new_row[label_key] = ""

                if field not in new_row:
                    continue

                raw = new_row[field]
                # Only genuinely empty values ('' / None / empty containers)
                # are skipped; a numeric zero is a real observation.
                if not raw and raw != 0:
                    continue
                if not is_num(raw):
                    continue

                current_val = float(raw)
                new_row[field] = current_val

                # Pick the first cut point covering the value, falling back to
                # the last one.  ``chosen`` stays None when there are no cut
                # points, so the label stays "" instead of reusing a loop
                # variable left over from a previous field.
                chosen = None
                for p in self.percentiles[field]:
                    chosen = p
                    if current_val <= p:
                        break

                if chosen is not None:
                    new_row[label_key] = "%s" % chosen

            new_data.append(new_row)

        return new_data
示例#5
0
def _strip_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed."""
    # A character filter behaves the same for byte and unicode strings and on
    # Python 2 and 3, unlike the two-argument ``str.translate`` form.
    return ''.join(ch for ch in text if ch not in string.punctuation)


def _cabin_features(cabin):
    """Return ``(deck, prefix, port_starboard)`` for a raw cabin string."""
    cabins = cabin.split()
    if not cabins:
        # Whitespace-only value: nothing to derive.
        return '', '', ''

    # Most frequent leading letter among the listed cabins.
    deck = Counter(c[0] for c in cabins).most_common(1)[0][0]

    # First token that is a bare single character, if any.
    single_letter = [c for c in cabins if len(c) == 1]
    prefix = single_letter[0] if single_letter else ''

    # Parity of the room number ("0.0" or "1.0"), by majority vote.
    rooms = [c for c in cabins if len(c) > 1 and is_num(c[1:])]
    if rooms:
        parities = Counter("%s" % (float(c[1:]) % 2) for c in rooms)
        port_starboard = parities.most_common(1)[0][0]
    else:
        port_starboard = ''

    return deck, prefix, port_starboard


def _ticket_features(ticket):
    """Return ``(number, text)`` for a raw ticket string."""
    parts = ticket.split()
    if not parts:
        return '', ''
    if len(parts) == 1:
        return parts[0], ''
    # Last token is the number; the rest is the lower-cased textual prefix
    # with punctuation removed.
    return parts[-1], _strip_punctuation(''.join(parts[:-1])).lower()


def preprocessing_titanic(data, ignored=(), target_field=''):
    """Derive Titanic features on every entry of ``data``, in place.

    Per entry, in this order:

    * every field listed in ``ignored`` is deleted, except ``target_field``;
    * ``age``: ``age_est`` becomes ``float(age)``; ``age`` itself becomes
      ``''`` when its text ends in ``.5`` and ``float(age)`` otherwise.  Both
      are ``''`` when age is missing or empty;
    * ``family``: string sum of ``sibsp`` and ``parch`` when both are present
      and non-empty, else ``''``;
    * ``cabin`` is replaced by ``cabin_deck``, ``cabin_prefix`` and
      ``cabin_port_starboard``;
    * ``ticket`` is replaced by ``ticket_number`` and ``ticket_text``;
    * ``name`` is replaced by one ``'name=<token>'`` key (value ``1.0``) per
      lower-cased, punctuation-free token.

    A derivation is skipped entirely when its source field is in ``ignored``
    (``family`` is skipped when either ``sibsp`` or ``parch`` is ignored).

    Args:
        data: iterable of mutable dict entries; modified in place.
        ignored: collection of field names to drop.
        target_field: field name that is never dropped.

    Returns:
        None.
    """
    # First pass, preprocess input data
    for entry in data:
        # Ignoring Features
        for field in ignored:
            if field in entry and field != target_field:
                del entry[field]

        # Deriving New Features
        if 'age' not in ignored:
            raw_age = entry.get('age')
            if raw_age:
                entry['age_est'] = float(raw_age)
                # str() lets an already-numeric age through the suffix test.
                if str(raw_age).endswith('.5'):
                    entry['age'] = ''
                else:
                    entry['age'] = float(raw_age)
            else:
                entry['age_est'] = ''
                entry['age'] = ''

        if 'sibsp' not in ignored and 'parch' not in ignored:
            entry['family'] = ''
            if entry.get('sibsp') and entry.get('parch'):
                entry['family'] = "%s" % (int(entry['sibsp']) + int(entry['parch']))

        if 'cabin' not in ignored:
            deck, prefix, port_starboard = '', '', ''
            if 'cabin' in entry:
                if entry['cabin']:
                    deck, prefix, port_starboard = _cabin_features(entry['cabin'])
                del entry['cabin']

            entry['cabin_deck'] = deck
            entry['cabin_prefix'] = prefix
            entry['cabin_port_starboard'] = port_starboard

        if 'ticket' not in ignored:
            number, text = '', ''
            if 'ticket' in entry:
                if entry['ticket']:
                    number, text = _ticket_features(entry['ticket'])
                del entry['ticket']

            entry['ticket_number'] = number
            entry['ticket_text'] = text

        if 'name' not in ignored and 'name' in entry:
            if entry['name']:
                for token in _strip_punctuation(entry['name']).lower().split():
                    entry['name=' + token] = 1.0

            del entry['name']
示例#6
0
test_file = 'data/test.csv'
# Load the training rows.  ``training_file`` and ``csv_to_row_dicts`` are
# defined outside this section of the file.
training,   headers = csv_to_row_dicts(training_file, display=True, row_limit=0)
# test,       _       = csv_to_row_dicts(test_file,     display=True)

training_set = training.values()
# test_set = test.values()

# Disabled diagnostic (``if 0``): groups the distinct (address, x, y)
# combinations seen per address and tracks addresses with repeated rows.
if 0:
    # NOTE(review): ``test_set`` is only assigned in the commented-out line
    # above, so this raises NameError if the block is ever enabled as-is.
    full_set = training_set + test_set

    Addresses = {};    Mismatches = [];    N_mismatches = 0
    for entry in training_set:
        # Normalise the three fields to strings; numeric values are rounded to
        # two decimals so nearly identical coordinates compare equal.
        add_data = {'address': '', 'x': '', 'y': ''}
        for key in entry:
            if key in ['address', 'x', 'y']:
                if is_num(entry[key]):
                    add_data[key] = "%.2f" % float(entry[key])
                else:
                    add_data[key] = "%s"   % entry[key]

        if add_data['address'] not in Addresses:
            Addresses[add_data['address']] = [add_data]
        elif add_data not in Addresses[add_data['address']]:
            Addresses[add_data['address']].append(add_data)
        else:
            # NOTE(review): this branch is reached when the exact same
            # combination was already recorded -- confirm that is what
            # "mismatch" is meant to count.
            N_mismatches += 1
            # Record each such address once.  The list used to be replaced by
            # a one-element list here, discarding earlier addresses, while
            # already-known addresses were appended again.
            if add_data['address'] not in Mismatches:
                Mismatches.append(add_data['address'])