Example #1
def main(load=True):
    indices = ['X', 'Y', 'PdDistrictInt']
    train = sfc.get_data('data/trim_1e4.csv', drop_data=True)[indices]
    all = sfc.get_data('data/all.csv', drop_data=True)
    knn = Juristictions()
    if load:
        knn.load()
    else:
        knn.train(train)
        knn.save()
    data = knn.outside_juristiction(all)
    sfc.write_data(data, 'data/outside_pd.csv', comment='Outside juristiction')
    return
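
Juristictions itself is not shown on this page; from the calls above it must expose train, save, load, and outside_juristiction. Below is a minimal sketch of that surface, assuming scikit-learn's KNeighborsClassifier for the coordinate-to-district model and pickle for persistence (both are assumptions, not the project's actual implementation):

import pickle

from sklearn.neighbors import KNeighborsClassifier


class Juristictions(object):
    '''Sketch only: predict PdDistrictInt from (X, Y) and flag records
    whose reported district disagrees with the prediction.'''

    def __init__(self, filename='data/juristictions.pkl'):
        self.filename = filename
        self.knn = KNeighborsClassifier(n_neighbors=5)

    def train(self, data):
        self.knn.fit(data[['X', 'Y']], data['PdDistrictInt'])

    def save(self):
        with open(self.filename, 'wb') as f:
            pickle.dump(self.knn, f)

    def load(self):
        with open(self.filename, 'rb') as f:
            self.knn = pickle.load(f)

    def outside_juristiction(self, data):
        # Rows whose recorded district differs from the spatial prediction
        predicted = self.knn.predict(data[['X', 'Y']])
        return data[data['PdDistrictInt'] != predicted]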
Example #2
def make_trimmed():
    train = sfc.get_data("data/train.csv")
    trimmed = trim_categories(train)
    # Formatting
    formatter = sfc.DataFormat()
    formatter.add_columns_enumerate(trimmed)
    # formatter.add_columns_resolution(trimmed) # Almost certainly not used
    formatter.add_columns_time(trimmed)
    # Make the actual datasets
    make_dataset(
        trimmed,
        "data/trim_1e4.csv",
        size=10000,
        comment="Random set of training data. Selected Categories, 1e4 records",
    )
    make_dataset(
        trimmed,
        "data/trim_1e5.csv",
        size=100000,
        comment="Random set of training data. Selected Categories, 1e5 records",
    )
    make_dataset(
        trimmed,
        "data/trim_1e6.csv",
        size=1000000,
        comment="Random set of training data. Selected Categories, 1e6 records",
    )
    make_dataset(
        trimmed,
        "data/trim.csv",
        comment="Random set of training data. Selected Categories, all records",
    )
    return
Example #3
def make_full():
    train = sfc.get_data("data/train.csv")
    # Formatting
    formatter = sfc.DataFormat()
    formatter.add_columns_enumerate(train)
    formatter.add_columns_time(train)
    make_dataset(train, "data/all.csv", comment="All training data")
    return
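
The sfc helpers are assumed throughout this page. Judging from the calls, get_data reads a CSV into a pandas.DataFrame and write_data writes one back with an optional comment header. A minimal sketch under those assumptions (the real module, including the drop_data flag, may well differ):

import pandas


def get_data(filename, drop_data=False):
    # Assumed behaviour: load a CSV, skipping leading '#' comment lines
    data = pandas.read_csv(filename, comment='#')
    # drop_data presumably discards columns unused downstream; left as a no-op here
    return data


def write_data(data, filename, comment=''):
    # Assumed behaviour: write the comment as a '#' header line, then the CSV
    with open(filename, 'w') as f:
        if comment:
            f.write('# {}\n'.format(comment))
        data.to_csv(f, index=False)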
Example #4
def make_full():
    sfc.msg('Make full dataset')
    train = sfc.get_data('data/train.csv')
    # Formatting
    formatter = sfc.DataFormat()
    train = formatter.add_columns_enumerate(train)
    train = formatter.add_columns_time(train)
    train = formatter.add_weather(train)
    make_dataset(train, 'data/all.csv',
                 comment='All training data')
    return
Example #5
def make_test():
    sfc.msg('Make test dataset')
    test = sfc.get_data('data/test.csv')
    # Formatting
    formatter = sfc.DataFormat()
    test = formatter.add_columns_enumerate(test)
    test = formatter.add_columns_time(test)
    test = formatter.add_weather(test)
    make_dataset(test, 'data/test_format.csv',
                 comment='Formatted test data')
    test = test.sort('Id', ascending=True).reset_index(drop=True)
    return test
Example #6
 def get(self):
     '''Pretty much the initialization and setup'''
     # If filename exists, then load, else make it
     if not os.path.exists(self.filename):
         adds = sfc.get_data('data/train.csv').Address.unique()
         adds = pandas.Series([self.add_code(x) for x in adds]).unique()
         self.code = dict(zip(adds, range(len(adds))))
         with open(self.filename, 'w') as f:
             pickle.dump(self.code, f)
     else:
         with open(self.filename, 'r') as f:
             self.code = pickle.load(f)
     return
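
A note on the pickle calls above: text-mode 'w'/'r' handles work under Python 2, which this code targets (see xrange and iteritems elsewhere on this page), but Python 3's pickle requires binary mode. A portable sketch:

import pickle


def save_code(code, filename):
    # Binary mode is required by pickle under Python 3 (and harmless under 2)
    with open(filename, 'wb') as f:
        pickle.dump(code, f)


def load_code(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)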
Example #7
def plot1():
    fig = pl.figure()
    data = sfc.get_data('data/all.csv')
    data.Category = data.Category.map(lambda x: x.capitalize())
    data_day = data[data.Darkness == 0]
    data_dark = data[data.Darkness == 2]
    data_day = data_day.groupby('Category').size()
    data_dark = data_dark.groupby('Category').size()
    data_day = data_day.map(lambda x: float(x) / data_day.sum())
    data_dark = data_dark.map(lambda x: float(x) / data_dark.sum())
    data_day.plot(kind='bar', label='Light', color='b', alpha=0.5)
    data_dark.plot(kind='bar', label='Dark', color='g', alpha=0.5)
    fig.subplots_adjust(bottom=0.40)
    pl.legend()
    return
Example #8
 def plot(self, *args):
     indices = ['Category']
     fig = pl.figure()
     cols = ['b', 'g', 'r', 'm']
     for col, (filename, label) in zip(cols, args):
         data = sfc.get_data(filename)[indices]
         data.Category = data.Category.map(lambda x: x.capitalize())
         hist = data.groupby('Category').size()
         hist = hist.map(lambda x: float(x)/hist.sum())
         hist.plot(kind='bar', color=col, alpha=0.5, label=label)
     fig.subplots_adjust(bottom=0.40)
     locs, labels = pl.xticks()
     #pl.setp(labels, rotation=20)
     pl.legend()
     pl.savefig('plots/categories_outside_pd.pdf')
     pl.close(fig)
     return
Example #9
def make_trimmed():
    sfc.msg('Make trimmed datasets')
    train = sfc.get_data('data/train.csv')
    trimmed = trim_categories(train)
    # Formatting
    formatter = sfc.DataFormat()
    trimmed = formatter.add_columns_enumerate(trimmed)
    trimmed = formatter.add_columns_time(trimmed)
    trimmed = formatter.add_weather(trimmed)
    # Make the actual datasets
    make_dataset(trimmed, 'data/trim_1e4.csv', size=10000,
                 comment='Random set of training data. ' \
                 'Selected Categories, 1e4 records')
    #make_dataset(trimmed, 'data/trim_1e5.csv', size=100000,
                 #comment='Random set of training data. ' \
                 #'Selected Categories, 1e5 records')
    #make_dataset(trimmed, 'data/trim.csv',
                 #comment='Random set of training data. ' \
                 #'Selected Categories, all records')
    return
Example #10
def make_dataset(input, output, comment='', verbose=False, size=None):
    data = None
    # Get the input data
    if isinstance(input, str):
        if not os.path.exists(input):
            input2 = os.path.join('data', input)
            if not os.path.exists(input2):
                raise IOError('Neither {} nor {} exist'.format(input, input2))
            input = input2
        data = sfc.get_data(input)
    elif isinstance(input, pandas.DataFrame):
        data = copy.deepcopy(input)
    else:
        raise IOError('Cannot deal with a {}'.format(type(input)))
    # Shrink to random records
    if size is not None and size < len(data):
        random.seed(sfc._SEED)
        data = data.ix[sorted(random.sample(xrange(len(data)), size))]
        data = data.reset_index(drop=True)  # assign: reset_index returns a new frame
    sfc.write_data(data, output, comment=comment)
    return
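
The sampling step above draws size distinct row positions without replacement and sorts them, so the subsample keeps the original row order. On pandas 0.16.1+, the same subsample can be sketched with DataFrame.sample (the seed argument name is an assumption):

def sample_rows(data, size, seed):
    # n distinct rows without replacement; sort_index keeps the original order
    if size is None or size >= len(data):
        return data
    return data.sample(n=size, random_state=seed).sort_index().reset_index(drop=True)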
Example #11
 def add(self, filename):
     finger = 0
     data = sfc.get_data(filename)
     weather = pandas.read_csv(self.weather,
                               infer_datetime_format=True,
                               parse_dates=['DateTime'])
     # make both ascending in date
     weather = weather.sort('DateTime', ascending=True)
     weather = weather.reset_index(drop=True)
     data = data.reindex(index=data.index[::-1])
     data = data.reset_index(drop=True)
     new_cols = {k: [None] * len(data) for k in weather.columns}
     for i, date in enumerate(data.Dates):
         while abs(weather.DateTime[finger] -
                   date) > abs(weather.DateTime[finger + 1] - date):
             finger += 1
         for k in weather.columns:
             new_cols[k][i] = weather[k][finger]
     new_data = data.join(pandas.DataFrame(new_cols))
      new_data = new_data.reindex(index=new_data.index[::-1])  # assign: reindex returns a copy
     return new_data.reset_index(drop=True)
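
The while loop above is a two-pointer nearest-timestamp join: both tables are sorted by time, and a single finger index walks the weather table exactly once. On pandas 0.20+, the same alignment can be sketched with merge_asof, assuming Dates has already been parsed to datetimes:

import pandas


def add_weather(data, weather_csv):
    weather = pandas.read_csv(weather_csv, parse_dates=['DateTime'])
    # merge_asof matches each record to the nearest weather timestamp;
    # both inputs must be sorted on their join keys
    data = data.sort_values('Dates').reset_index(drop=True)
    weather = weather.sort_values('DateTime').reset_index(drop=True)
    return pandas.merge_asof(data, weather, left_on='Dates',
                             right_on='DateTime', direction='nearest')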
Example #12

def main(load=True):
    indices = ['X', 'Y', 'PdDistrictInt']
    train = sfc.get_data('data/trim_1e4.csv', drop_data=True)[indices]
    all = sfc.get_data('data/all.csv', drop_data=True)
    knn = Juristictions()
    if load:
        knn.load()
    else:
        knn.train(train)
        knn.save()
    data = knn.outside_juristiction(all)
    sfc.write_data(data, 'data/outside_pd.csv', comment='Outside juristiction')
    return


###############################################################################


if __name__ == "__main__":
    #main(False)
    knn = Juristictions()
    knn.load()
    #knn.plot(('data/outside_pd.csv', 'Outside PD'), ('data/all.csv', 'all'))
    data = sfc.get_data('data/all.csv')
    all = knn.add_outside_juristiction(data)
    sfc.write_data(all, 'data/all.csv')

###############################################################################
Example #13
    bdt_real_2 = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=8),
        n_estimators=10,
        learning_rate=1
    )

    #bdt_real = DecisionTreeClassifier(max_depth=None, min_samples_split=1,
                                      #random_state=6065)

    bdt_real = BaggingClassifier(base_estimator=bdt_real_2,
                                 random_state=6065,
                                 n_estimators=100)

    #bdt_real = RandomForestClassifier(random_state=6065,
                                      #n_estimators=200)

    #bdt_real = ExtraTreesClassifier(random_state=6065,
                                    #min_samples_split=5,
                                    #n_estimators=200)

    bdt_real.fit(X_train, y_train)
    y_predict = pandas.Series(bdt_real.predict(X_test))
    print len(y_predict[y_predict == y_test])
    print len(y_predict)
    return bdt_real

bdt = train_classifiers(sfc.get_data('data/all.csv'))
test = sfc.get_data('data/test.csv')
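
# Editor's note: inside train_classifiers, the two raw-count prints could be
# condensed with scikit-learn's accuracy_score, e.g.:
#
#     from sklearn.metrics import accuracy_score
#     print accuracy_score(y_test, y_predict)  # fraction predicted correctly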

Example #14
        t1, t2 = bin_times.index[0], bin_times.index[-1]
        m = (t1.hour*60 + t1.minute + t2.hour*60 + t2.minute) / 2
        all_bins.append(bin_times.sum())
        time_range.append(datetime.time(m/60, m%60))
    df_out = pandas.concat(all_bins, axis=1).T
    df_out.index = time_range
    return df_out
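
# Worked example of the bin-midpoint arithmetic above: a bin spanning
# t1 = 00:00 to t2 = 00:29 gives m = (0 + 0 + 0 + 29) / 2 = 14, which is
# labelled datetime.time(14 / 60, 14 % 60) == datetime.time(0, 14).
# Both divisions rely on Python 2 integer division; under Python 3 use //.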


def sort_categories_by_frequency(cats):
    order = sorted(cats, key=lambda x: len(cats[x]))
    out = OrderedDict()
    for i in order:
        out[i] = cats[i]
    return out


###############################################################################

if __name__ == "__main__":
    df = sfc.get_data('data/trim_1e5.csv', drop_data=True)
    cats = sfc.data2dict(df, 'Category')
    pds = sfc.data2dict(df, 'PdDistrict')
    plotter = Plot2D()
    #plotter.plot(cats)
    plotter.plot_scatter(pds, 'PDs')
    pds_theft = {k: v[v.Category == 'VEHICLE THEFT'] for k, v in pds.iteritems()}
    plotter.plot_scatter(pds_theft, 'PDs_VehicleTheft')

###############################################################################