Exemplo n.º 1
0
def splitDataByTimespans(datalist, timespan, dateinfoname="eval_date"):
    """Bucket datalist entries by the timespan-sized interval they fall in.

    Intervals start at the earliest date found in the data and step forward
    by `timespan` until just past the latest date. Each entry is placed in
    the first interval whose [start, end) range contains its date.

    Args:
        datalist: iterable of dict-like records, each holding a date under
            the key `dateinfoname`.
        timespan: interval length, in whatever format generateDateRange /
            generateLaterDate accept (e.g. "1D", "8W").
        dateinfoname: key under which each record stores its date.

    Returns:
        defaultdict mapping each interval start date to the list of records
        whose date falls within that interval.

    Note: raises IndexError on an empty datalist (date_list[0]), same as
    the original behavior.
    """
    print("Performing splitDataByTimespans")
    date_list = sorted({d[dateinfoname] for d in datalist})
    earliest_date = date_list[0]
    latest_date = date_list[-1]
    daterange_list = generateDateRange(start=earliest_date,
                                       end=latest_date + _day,
                                       step=timespan)
    # Precompute each interval's end once, instead of calling
    # generateLaterDate for every (record, interval) pair below.
    interval_bounds = [(t, generateLaterDate(t, timespan))
                       for t in daterange_list]
    data_by_daterange = defaultdict(list)
    for d in datalist:
        d_time = d[dateinfoname]
        for t_start, t_end in interval_bounds:
            if t_start <= d_time < t_end:
                data_by_daterange[t_start].append(d)
                break
    print("Ending splitDataByTimespans")
    return data_by_daterange
Exemplo n.º 2
0
# Time parameters
# The best single date to define an experiment by is the start of the test
#  data. The training data will be from a given time range fully up to but
#  not including that date, while the test data will be from a given time
#  range starting on that date. If we wish to compare different sizes of
#  training or test data, then the best comparison would be against the
#  other experiments with this same date as the cutoff between the training
#  and testing data, regardless of the sizes of those data sets.

# Of all planned experiments, earliest start of a test data set
earliest_test_date = "2013-01-01"
# Compact form: strip the dashes, then drop the century digits,
#  e.g. "2013-01-01" -> "130101"
earliest_test_date_str = earliest_test_date.replace("-", "")[2:]
# Time between earliest experiment and latest experiment
test_date_range = "5Y"
# Latest start of a test data set, calculated from above 2 variables
latest_test_date = generateLaterDate(earliest_test_date, test_date_range)

# Length of training data
train_len = "8W"
#train_len_sweep = ["4W"] #multi-option not fully implemented
# Length of testing data
test_len = "1D"
#test_len_sweep = ["1D","3D","7D"] #multi-option not fully implemented

# Time step between different experiments
#test_date_step = "1D"
# We have currently decided to step forward the experiment so that test sets
#  do not overlap, the reasoning being roughly: why would we bother evaluating
#  a model on 7 days of data if we're about to retrain the model 1 day later?
#  In the future it is possible this may change if we find a compelling reason
#  otherwise, or may add an option to override this choice.
Exemplo n.º 3
0
# Number of cells covering the region of interest.
# NOTE(review): cellcoordlist_region, clock_time, start_times, num_exp and
#  time_len are presumably defined earlier in the file — not visible here.
num_cells_region = len(cellcoordlist_region)

print("...loaded region and data subset.\nTime taken: {}".format(time.time() - clock_time))



# Wall-clock timer for the whole batch of experiments, plus a list intended
#  to collect per-experiment timings.
all_exp_clock_time = time.time()
exp_clock_time_list = []

print("Starting experiments...")

# Run one experiment per start time; each covers a window of length time_len.
for exp_index, start_time in enumerate(start_times):

    # Start-of-experiment timestamp, for per-run timing.
    exp_clock_time = time.time()

    end_time = generateLaterDate(start_time, time_len)

    print("Exp {}/{}, start {} end {}".format(exp_index+1, 
                                              num_exp, start_time, end_time))




    ### SELECT TRAINING DATA


    # Get subset of data for training — events falling between start_time
    #  and end_time (exact boundary semantics depend on
    #  getTimedPointsInTimeRange; confirm against its definition).
    points_crime_region_train = getTimedPointsInTimeRange(points_crime_region, 
                                                      start_time, 
                                                      end_time)
    
Exemplo n.º 4
0
# Per-experiment bookkeeping (appended to elsewhere, presumably).
exp_times = []

# One experiment per day of January 2018; each start_test is the first day
#  of that experiment's test data.
start_test_list = generateDateRange("2018-01-01", "2018-02-01", "1D")

total_num_exp = len(start_test_list)
for exp_index, start_test in enumerate(start_test_list):

    exp_start_time = time.time()

    # Progress report every 10 experiments.
    if exp_index % 10 == 0:
        print("Running experiment {}/{}...".format(exp_index, total_num_exp))

    # Declare time ranges of training and testing data: training spans the
    #  train_len period immediately before start_test; testing spans the
    #  test_len period starting at start_test.
    end_train = start_test
    start_train = generateEarlierDate(end_train, train_len)
    end_test = generateLaterDate(start_test, test_len)

    # NOTE(review): test_data_dates is presumably initialised earlier in
    #  the file — not visible here.
    test_data_dates.append(start_test)

    #print(start_train, end_train, start_test, end_test)

    ### SELECT TRAINING DATA

    # Get subset of data for training
    points_crime_region_train = getTimedPointsInTimeRange(
        points_crime_region, start_train, end_train)

    #print(len(points_crime_region_train.timestamps))

    training_data = points_crime_region_train
    # NOTE(review): debug print left in — consider removing.
    print(type(training_data))
Exemplo n.º 5
0
def make_knox_info_file(
    datadir,
    in_csv_file_name,
    out_knox_file_name,
    geojson_file_name,
    crime_types,
    num_knox_iterations,
    knox_sbin_size,
    knox_sbin_num,
    knox_tbin_size,
    knox_tbin_num,
    earliest_exp_time,
    num_exp,
    time_step,
    time_len,
    csv_date_format="%m/%d/%Y %I:%M:%S %p",
    csv_longlat=False,
    csv_epsg=None,
    csv_infeet=True,
    csv_has_header=True,
):
    """Run Knox analyses over a series of sliding time windows and write the
    results to a text file.

    Loads crime events of the requested types from a CSV in `datadir`,
    restricts them to the region polygon from `geojson_file_name`, then for
    each of `num_exp` windows (each `time_len` long, stepped by `time_step`,
    the first ending at `earliest_exp_time`) computes a Knox statistic over
    the spatial/temporal bins and appends the results to
    `out_knox_file_name` inside `datadir`.

    Args:
        datadir: data directory; `~` and redundant separators are normalised.
        in_csv_file_name / out_knox_file_name / geojson_file_name: file names
            resolved relative to datadir.
        crime_types: comma-separated crime types (parsed by splitCommaArgs).
        num_knox_iterations: Monte Carlo iteration count for the Knox runs.
        knox_sbin_size, knox_sbin_num: spatial bin width and count.
        knox_tbin_size, knox_tbin_num: temporal bin width and count.
        earliest_exp_time: end of the first experiment's time window.
        num_exp: number of experiment windows.
        time_step: offset between successive windows.
        time_len: length of each window.
        csv_*: CSV parsing options passed through to loadGenericData.

    Returns:
        None. Side effect: writes the Knox results file and progress output.
    """

    def _write_stat_grid(fout, header, stat_fn):
        # Write one labelled matrix: rows are temporal bins, columns are
        # spatial bins, values produced by stat_fn(sbin_index, tbin_index).
        # Shared by the statistic / median / p-value sections below.
        fout.write(header)
        for i in range(knox_tbin_num):
            fout.write(" ".join(
                str(stat_fn(j, i)) for j in range(knox_sbin_num)))
            fout.write("\n")

    # Normalised and derived parameters

    # Normalised data directory
    datadir = os.path.expanduser(os.path.normpath(datadir))

    # Full paths to files
    in_csv_full_path = os.path.join(datadir, in_csv_file_name)
    geojson_full_path = os.path.join(datadir, geojson_file_name)

    # Set of relevant crime types in the data
    crime_type_set = set(splitCommaArgs(crime_types))

    # Spatial and temporal bandwidth bins
    knox_sbins = makeBins(knox_sbin_size, knox_sbin_num)
    knox_tbins = makeBins(knox_tbin_size, knox_tbin_num)

    # Each window starts time_len before its end; the first window ends at
    # earliest_exp_time.
    earliest_start_time = generateEarlierDate(earliest_exp_time, time_len)
    print(f"First time window is from {earliest_start_time} to {earliest_exp_time}")
    start_times = generateDateRange(start=earliest_start_time,
                                    step=time_step,
                                    num=num_exp)

    out_file_path = os.path.join(datadir, out_knox_file_name)

    print(f"outfile: {out_file_path}")

    # Obtain crime data points, and region polygon

    # Obtain all crimes (of relevant types) from input data
    points_crime = loadGenericData(in_csv_full_path,
                                   crime_type_set=crime_type_set,
                                   date_format_csv=csv_date_format,
                                   longlat=csv_longlat,
                                   epsg=csv_epsg,
                                   infeet=csv_infeet,
                                   has_header=csv_has_header)

    # Obtain polygon from geojson file (which should have been pre-processed)
    region_polygon = gpd.read_file(geojson_full_path).unary_union

    # Get subset of input crime that occurred within region
    points_crime_region = open_cp.geometry.intersect_timed_points(
        points_crime, region_polygon)

    total_num_events = len(points_crime_region.timestamps)

    print(f"Successfully obtained data, with {total_num_events} events.")

    # Do Knox runs and store info in file

    print(f"Opening file {out_file_path} for writing.")
    with open(out_file_path, "w") as fout:

        chkpt_0 = time.time()
        for exp_index, start_time in enumerate(start_times):

            chkpt_1 = time.time()

            end_time = generateLaterDate(start_time, time_len)

            print(f"Time span: {start_time} to {end_time}")

            ### SELECT TRAINING DATA

            chkpt_2 = time.time()
            print("Getting data subset...")
            # Get subset of data for training
            points_crime_region_train = getTimedPointsInTimeRange(
                points_crime_region, start_time, end_time)
            print(f"...Got data subset. ({time.time()-chkpt_2:.4f})")

            num_events = len(points_crime_region_train.timestamps)

            print(f"Number of events in timespan: {num_events}")

            chkpt_3 = time.time()
            print("Calculating Knox...")
            knox_result = getKnoxResult(points_crime_region_train,
                                        num_knox_iterations, knox_sbins,
                                        knox_tbins)
            print(f"...Calculated Knox. ({time.time()-chkpt_3:.4f})")

            chkpt_4 = time.time()
            print(f"Writing to file {out_file_path} ...")
            # Per-run header: window bounds, window length, event count.
            for value in (start_time, end_time, time_len, num_events):
                fout.write(str(value))
                fout.write("\n")
            fout.write("Spatial bins (columns):")
            fout.write("\n")
            fout.write(str(knox_sbins))
            fout.write("\n")
            fout.write("Temporal bins (rows):")
            fout.write("\n")
            fout.write(str(knox_tbins))
            fout.write("\n")
            # Three matrices per run, all in the same row/column layout.
            _write_stat_grid(fout, "Knox Statistics\n",
                             knox_result.statistic)
            _write_stat_grid(fout, "Monte Carlo Medians\n",
                             lambda j, i: statistics.median(
                                 knox_result.distribution(j, i)))
            _write_stat_grid(fout, "P Values\n", knox_result.pvalue)
            fout.write("\n")
            print(f"...Wrote to file. ({time.time()-chkpt_4:.4f})")
            print(f"Time for this run: {time.time()-chkpt_1:.4f}")

    # chkpt_0 is assigned inside the `with` block but before the loop, so it
    # is always defined here even when start_times is empty.
    print(f"Number of runs: {len(start_times)}")
    print(f"Number of bins per run: {len(knox_sbins) * len(knox_tbins)}")
    print(f"Overall time: {time.time()-chkpt_0:.4f}")