def splitDataByTimespans(datalist, timespan, dateinfoname="eval_date"):
    """Group data points into consecutive timespan buckets.

    Parameters
    ----------
    datalist : iterable of dict-like
        Each element must be indexable by `dateinfoname`, yielding a
        comparable date/time value.
    timespan : str
        Step size understood by the project's `generateDateRange` /
        `generateLaterDate` helpers (e.g. "1W" — assumed; confirm format).
    dateinfoname : str
        Key under which each element stores its date.

    Returns
    -------
    defaultdict(list)
        Maps each bucket's start date to the list of elements whose date
        falls in [start, start + timespan).
    """
    print("Performing splitDataByTimespans")
    # Robustness: an empty input would otherwise crash on date_list[0].
    if not datalist:
        print("Ending splitDataByTimespans")
        return defaultdict(list)
    date_list = sorted({d[dateinfoname] for d in datalist})
    earliest_date = date_list[0]
    latest_date = date_list[-1]
    # end is exclusive in generateDateRange, so extend by one day to make
    # sure the latest observed date lands inside some bucket.
    daterange_list = generateDateRange(start=earliest_date,
                                       end=latest_date + _day,
                                       step=timespan)
    # Hoist the loop-invariant bucket end dates: the original recomputed
    # generateLaterDate(t, timespan) for every (point, bucket) pair.
    range_bounds = [(t, generateLaterDate(t, timespan)) for t in daterange_list]
    data_by_daterange = defaultdict(list)
    for d in datalist:
        d_time = d[dateinfoname]
        for t, t_end in range_bounds:
            if t <= d_time < t_end:
                data_by_daterange[t].append(d)
                break
    print("Ending splitDataByTimespans")
    return data_by_daterange
# ---- Time parameters ----
# An experiment is best identified by the first day of its test data: the
# training window runs fully up to (but not including) that date, while the
# test window starts on it.  When comparing different training or test
# sizes, experiments sharing this same cutoff date are the fair comparison,
# regardless of how large their data windows are.

# Earliest date on which any planned experiment's test data begins
earliest_test_date = "2013-01-01"
# Compact form of that date, e.g. "2013-01-01" -> "130101"
earliest_test_date_str = earliest_test_date.replace("-", "")[2:]
# Span between the earliest experiment and the latest experiment
test_date_range = "5Y"
# Latest start of a test data set, derived from the two settings above
latest_test_date = generateLaterDate(earliest_test_date, test_date_range)
# Length of the training window
train_len = "8W"
#train_len_sweep = ["4W"]  # multi-option sweep not fully implemented
# Length of the testing window
test_len = "1D"
#test_len_sweep = ["1D","3D","7D"]  # multi-option sweep not fully implemented
# Time step between consecutive experiments
#test_date_step = "1D"
# We currently step experiments forward so that test sets do not overlap,
# the reasoning being roughly: why evaluate a model on 7 days of data if we
# are about to retrain it 1 day later?  This choice may change — or gain an
# override option — if a compelling reason appears.
# Number of grid cells inside the study region.
num_cells_region = len(cellcoordlist_region)
# clock_time is presumably started earlier in the script — confirm upstream.
print("...loaded region and data subset.\nTime taken: {}".format(time.time() - clock_time))
# Wall-clock bookkeeping across all experiments and per experiment.
all_exp_clock_time = time.time()
exp_clock_time_list = []
print("Starting experiments...")
# One experiment per start time; start_times and time_len are defined
# elsewhere in the script (NOTE(review): loop body may continue beyond
# this fragment).
for exp_index, start_time in enumerate(start_times):
    exp_clock_time = time.time()
    # Window is [start_time, start_time + time_len).
    end_time = generateLaterDate(start_time, time_len)
    print("Exp {}/{}, start {} end {}".format(exp_index+1, num_exp, start_time, end_time))
    ### SELECT TRAINING DATA
    # Get subset of data for training
    points_crime_region_train = getTimedPointsInTimeRange(points_crime_region, start_time, end_time)
# Per-experiment timing records.
exp_times = []
# One experiment per day in January 2018; each date is the first day of
# that experiment's test data.
start_test_list = generateDateRange("2018-01-01", "2018-02-01", "1D")
total_num_exp = len(start_test_list)
# NOTE(review): loop body may continue beyond this fragment; train_len,
# test_len and test_data_dates are defined elsewhere in the script.
for exp_index, start_test in enumerate(start_test_list):
    exp_start_time = time.time()
    # Progress report every 10th experiment only, to limit log noise.
    if exp_index % 10 == 0:
        print("Running experiment {}/{}...".format(exp_index, total_num_exp))
    # Declare time ranges of training and testing data: training ends
    # exactly where testing begins, so the two windows never overlap.
    end_train = start_test
    start_train = generateEarlierDate(end_train, train_len)
    end_test = generateLaterDate(start_test, test_len)
    test_data_dates.append(start_test)
    #print(start_train, end_train, start_test, end_test)
    ### SELECT TRAINING DATA
    # Get subset of data for training
    points_crime_region_train = getTimedPointsInTimeRange(
        points_crime_region, start_train, end_train)
    #print(len(points_crime_region_train.timestamps))
    training_data = points_crime_region_train
    print(type(training_data))
def make_knox_info_file(
        datadir,
        in_csv_file_name,
        out_knox_file_name,
        geojson_file_name,
        crime_types,
        num_knox_iterations,
        knox_sbin_size,
        knox_sbin_num,
        knox_tbin_size,
        knox_tbin_num,
        earliest_exp_time,
        num_exp,
        time_step,
        time_len,
        csv_date_format="%m/%d/%Y %I:%M:%S %p",
        csv_longlat=False,
        csv_epsg=None,
        csv_infeet=True,
        csv_has_header=True,
        ):
    """Run a series of Knox experiments and write their results to a file.

    Loads crime events of the requested types from a CSV, clips them to the
    region polygon in the geojson file, then for each of `num_exp` time
    windows (each `time_len` long, stepped by `time_step`, the first ending
    at `earliest_exp_time`) computes a Knox statistic over the given
    spatial/temporal bins and appends a plain-text record — window bounds,
    event count, bins, statistics, Monte Carlo medians and p-values — to
    `out_knox_file_name` inside `datadir`.

    Side effects: overwrites the output file; prints progress/timing.
    """

    def _write_matrix(fout, value_at):
        # Write one bandwidth-pair matrix: temporal bins as rows (i),
        # spatial bins as columns (j), space-separated values per row.
        # Extracted because the original repeated this loop three times.
        for i in range(knox_tbin_num):
            fout.write(" ".join([str(value_at(j, i))
                                 for j in range(knox_sbin_num)]))
            fout.write("\n")

    # Normalised and derived parameters
    # Normalised data directory
    datadir = os.path.expanduser(os.path.normpath(datadir))
    # Full paths to files
    in_csv_full_path = os.path.join(datadir, in_csv_file_name)
    geojson_full_path = os.path.join(datadir, geojson_file_name)
    # Set of relevant crime types in the data
    crime_type_set = set(splitCommaArgs(crime_types))
    # Spatial and temporal bandwidth bins
    knox_sbins = makeBins(knox_sbin_size, knox_sbin_num)
    knox_tbins = makeBins(knox_tbin_size, knox_tbin_num)
    # First window starts time_len before the earliest experiment cutoff.
    earliest_start_time = generateEarlierDate(earliest_exp_time, time_len)
    # (Fixed: the original backslash-continued f-string leaked source
    # indentation into this message.)
    print(f"First time window is from "
          f"{earliest_start_time} to {earliest_exp_time}")
    start_times = generateDateRange(start=earliest_start_time,
                                    step=time_step,
                                    num=num_exp)
    out_file_path = os.path.join(datadir, out_knox_file_name)
    print(f"outfile: {out_file_path}")

    # Obtain crime data points, and region polygon
    # Obtain all crimes (of relevant types) from input data
    points_crime = loadGenericData(in_csv_full_path,
                                   crime_type_set=crime_type_set,
                                   date_format_csv=csv_date_format,
                                   longlat=csv_longlat,
                                   epsg=csv_epsg,
                                   infeet=csv_infeet,
                                   has_header=csv_has_header)
    # Obtain polygon from geojson file (which should have been pre-processed)
    # NOTE(review): unary_union is deprecated in newer geopandas in favour
    # of union_all() — confirm the pinned version before switching.
    region_polygon = gpd.read_file(geojson_full_path).unary_union
    # Get subset of input crime that occurred within region
    points_crime_region = open_cp.geometry.intersect_timed_points(
        points_crime, region_polygon)
    total_num_events = len(points_crime_region.timestamps)
    print(f"Successfully obtained data, with {total_num_events} events.")

    # Do Knox runs and store info in file
    print(f"Opening file {out_file_path} for writing.")
    with open(out_file_path, "w") as fout:
        chkpt_0 = time.time()
        for exp_index, start_time in enumerate(start_times):
            chkpt_1 = time.time()
            end_time = generateLaterDate(start_time, time_len)
            print(f"Time span: {start_time} to {end_time}")

            ### SELECT TRAINING DATA
            chkpt_2 = time.time()
            print("Getting data subset...")
            # Get subset of data for training
            points_crime_region_train = getTimedPointsInTimeRange(
                points_crime_region, start_time, end_time)
            print(f"...Got data subset. ({time.time()-chkpt_2:.4f})")
            num_events = len(points_crime_region_train.timestamps)
            print(f"Number of events in timespan: {num_events}")

            chkpt_3 = time.time()
            print("Calculating Knox...")
            knox_result = getKnoxResult(points_crime_region_train,
                                        num_knox_iterations,
                                        knox_sbins,
                                        knox_tbins)
            print(f"...Calculated Knox. ({time.time()-chkpt_3:.4f})")

            chkpt_4 = time.time()
            print(f"Writing to file {out_file_path} ...")
            # Header lines for this experiment's record.
            for line in (str(start_time),
                         str(end_time),
                         str(time_len),
                         str(num_events),
                         "Spatial bins (columns):",
                         str(knox_sbins),
                         "Temporal bins (rows):",
                         str(knox_tbins)):
                fout.write(line)
                fout.write("\n")
            fout.write("Knox Statistics\n")
            _write_matrix(fout, knox_result.statistic)
            fout.write("Monte Carlo Medians\n")
            _write_matrix(
                fout,
                lambda j, i: statistics.median(knox_result.distribution(j, i)))
            fout.write("P Values\n")
            _write_matrix(fout, knox_result.pvalue)
            # Blank line terminates this experiment's record.
            fout.write("\n")
            print(f"...Wrote to file. ({time.time()-chkpt_4:.4f})")
            print(f"Time for this run: {time.time()-chkpt_1:.4f}")
    # Summary timing after all runs complete.
    print(f"Number of runs: {len(start_times)}")
    print(f"Number of bins per run: {len(knox_sbins) * len(knox_tbins)}")
    print(f"Overall time: {time.time()-chkpt_0:.4f}")