def get_data_g_forces_v1(model_id, driver_id, repeat, test=False, min_df=1,
                         ngram_range=(1, 10), digitize=0):
    """Vectorize per-ride g-force sequences into n-gram count matrices.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.

    With ``test=True`` the first set is everything ``get_rides(driver_id)``
    returns and the second is ``BIG_CHUNK_TEST * repeat`` random rides.
    Otherwise the driver's rides are split by ``get_rides_split`` and each
    part is extended with random rides: ``BIG_CHUNK * repeat`` (seeded) for
    the training part, ``SMALL_CHUNK`` (drawn without the seeded generator)
    for the testing part.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        min_df: forwarded to ``CountVectorizer``.
        ngram_range: forwarded to ``CountVectorizer``.
        digitize: when truthy, step of the ``range(0, 800, digitize)`` bin
            edges passed to ``np.digitize``; when falsy no binning is done.

    Returns:
        ``(set1, set2)``; the vectorizer is fitted on the first set and then
        applied to the second.
    """
    def to_text(ride):
        # Optionally bin the raw values before turning them into a string.
        forces = util.get_g_forces(ride)
        if digitize:
            forces = np.digitize(forces, range(0, 800, digitize))
        return util.get_list_string(forces)

    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if test:
        first = list(access.get_rides(driver_id))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id))
        first = own_train + foreign_train    # training side
        second = own_test + foreign_test     # testing side

    first_docs = [to_text(ride) for ride in first]
    second_docs = [to_text(ride) for ride in second]

    counter = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    first_matrix = counter.fit_transform(first_docs)
    second_matrix = counter.transform(second_docs)
    return first_matrix, second_matrix
def get_data_heading(model_id, driver_id, repeat, test=False,
                     moving_average_window=3, stops=False, version=1):
    """Vectorize ride headings into n-gram count matrices.

    The driver's rides are split by ``get_rides_split`` and each part is
    extended with random rides: ``BIG_CHUNK * repeat`` drawn with a generator
    seeded by ``driver_id + model_id`` for the training part, and
    ``SMALL_CHUNK`` drawn without that generator for the testing part.

    Training rides are processed with ``variations=True``; the per-ride
    results are flattened so every variation becomes its own training
    document.  Testing rides yield one document each.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random training rides.
        test: must be falsy; this feature set has no test mode.
        moving_average_window: forwarded to ``heading.get_ride_heading``.
        stops: forwarded to ``heading.get_ride_heading``.
        version: forwarded to ``heading.get_ride_heading``.

    Returns:
        ``(set1, set2)``; the vectorizer is fitted on the training documents
        and then applied to the testing documents.

    Raises:
        NotImplementedError: if ``test`` is truthy.  It is a subclass of
            ``Exception``, so callers catching ``Exception`` still work.
    """
    # Fail fast, before any data access, with a message that says why.
    if test:
        raise NotImplementedError(
            'get_data_heading does not support test=True')

    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    driver_train, driver_test = da.get_rides_split(
        driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(
        settings.BIG_CHUNK * repeat, driver_id, seed=rng))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    train_rides = driver_train + other_train  # used for training
    test_rides = driver_test + other_test     # used for testing

    heading_options = dict(moving_average_window=moving_average_window,
                           stops=stops, version=version)

    train_variations = [
        heading.get_ride_heading(ride, variations=True, **heading_options)
        for ride in train_rides
    ]
    test_docs = [
        util.get_list_string(heading.get_ride_heading(ride, **heading_options))
        for ride in test_rides
    ]

    # Flatten lazily: one document per variation of every training ride.
    train_docs = [
        util.get_list_string(variation)
        for variation in itertools.chain.from_iterable(train_variations)
    ]

    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15),
                                 max_df=1000000)
    set1 = vectorizer.fit_transform(train_docs)
    set2 = vectorizer.transform(test_docs)
    return set1, set2
def get_data_fft(model_id, driver_id, repeat, test=False, version=1):
    """Build FFT feature arrays for two sets of rides.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.

    With ``test=True`` the first set is everything ``get_rides(driver_id)``
    returns and the second is ``BIG_CHUNK_TEST * repeat`` random rides.
    Otherwise the driver's rides are split by ``get_rides_split`` and each
    part is extended with random rides: ``BIG_CHUNK * repeat`` (seeded) for
    the training part, ``SMALL_CHUNK`` (drawn without the seeded generator)
    for the testing part.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        version: ``1`` uses ``util.fft``; any other value uses
            ``util.fft_strip``.

    Returns:
        ``(set1, set2)`` as ``numpy`` arrays, one entry per ride.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if test:
        first = list(access.get_rides(driver_id))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id))
        first = own_train + foreign_train
        second = own_test + foreign_test

    # Choose the transform once instead of branching around both loops.
    transform = util.fft if version == 1 else util.fft_strip
    first_features = [transform(ride) for ride in first]
    second_features = [transform(ride) for ride in second]
    return np.array(first_features), np.array(second_features)
def get_data_acc4acc(model_id, driver_id, repeat, test=False, version=1):
    """Vectorize ``util.get_acc4acc_words`` output into n-gram count matrices.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.

    With ``test=True`` the first set is everything ``get_rides(driver_id)``
    returns and the second is ``BIG_CHUNK_TEST * repeat`` random rides.
    Otherwise the driver's rides are split by ``get_rides_split`` and each
    part is extended with random rides: ``BIG_CHUNK * repeat`` (seeded) for
    the training part, ``SMALL_CHUNK`` (drawn without the seeded generator)
    for the testing part.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        version: forwarded to ``util.get_acc4acc_words``; it also sets the
            longest n-gram (15 for version 1, 20 otherwise).

    Returns:
        ``(set1, set2)``; the vectorizer is fitted on the first set and then
        applied to the second.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if test:
        first = list(access.get_rides(driver_id))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id))
        first = own_train + foreign_train
        second = own_test + foreign_test

    def to_words(ride):
        return util.get_acc4acc_words(ride, step=3, version=version)

    first_docs = [to_words(ride) for ride in first]
    second_docs = [to_words(ride) for ride in second]

    if version == 1:
        longest_ngram = 15
    else:
        longest_ngram = 20

    counter = CountVectorizer(min_df=1, ngram_range=(1, longest_ngram))
    first_matrix = counter.fit_transform(first_docs)
    second_matrix = counter.transform(second_docs)
    return first_matrix, second_matrix
def get_data_g_forces_v6(model_id, driver_id, repeat, test=False, version=1):
    """Vectorize ``util.get_g_forces_v4`` output into n-gram count matrices.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.

    With ``test=True`` the first set is everything ``get_rides(driver_id)``
    returns and the second is ``BIG_CHUNK_TEST * repeat`` random rides.
    Otherwise the driver's rides are split by ``get_rides_split`` and each
    part is extended with random rides: ``BIG_CHUNK * repeat`` (seeded) for
    the training part, ``SMALL_CHUNK`` (drawn without the seeded generator)
    for the testing part.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        version: forwarded to ``util.get_g_forces_v4``.

    Returns:
        ``(set1, set2)``; the vectorizer (n-grams of length 1 to 20) is fitted
        on the first set and then applied to the second.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if test:
        first = list(access.get_rides(driver_id))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id))
        first = own_train + foreign_train    # training side
        second = own_test + foreign_test     # testing side

    def to_words(ride):
        return util.get_g_forces_v4(ride, version=version)

    first_docs = [to_words(ride) for ride in first]
    second_docs = [to_words(ride) for ride in second]

    counter = CountVectorizer(min_df=1, ngram_range=(1, 20))
    first_matrix = counter.fit_transform(first_docs)
    second_matrix = counter.transform(second_docs)
    return first_matrix, second_matrix
def get_data_basic_accel(model_id, driver_id, repeat, test=False, version=1):
    """Build ``util.build_features_acc`` feature arrays for two ride sets.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.

    With ``test=True`` the first set is everything ``get_rides(driver_id)``
    returns and the second is ``BIG_CHUNK_TEST * repeat`` random rides.
    Otherwise the driver's rides are split by ``get_rides_split`` and each
    part is extended with random rides: ``BIG_CHUNK * repeat`` (seeded) for
    the training part, ``SMALL_CHUNK`` (drawn without the seeded generator)
    for the testing part.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        version: forwarded to ``util.build_features_acc``.

    Returns:
        ``(set1, set2)`` as ``numpy`` arrays, one entry per ride.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if test:
        first = list(access.get_rides(driver_id))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id))
        first = own_train + foreign_train
        second = own_test + foreign_test

    def featurize(ride):
        return util.build_features_acc(ride, version=version)

    first_features = [featurize(ride) for ride in first]
    second_features = [featurize(ride) for ride in second]
    return np.array(first_features), np.array(second_features)
def get_data_heading(model_id, driver_id, repeat, test=False,
                     moving_average_window=3, stops=False, version=1):
    """Vectorize ride headings into n-gram count matrices.

    The driver's rides are split by ``get_rides_split`` and each part is
    extended with random rides: ``BIG_CHUNK * repeat`` drawn with a generator
    seeded by ``driver_id + model_id`` for the training part, and
    ``SMALL_CHUNK`` drawn without that generator for the testing part.

    Training rides are processed with ``variations=True``; the per-ride
    results are flattened so every variation becomes its own training
    document.  Testing rides yield one document each.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random training rides.
        test: must be falsy; this feature set has no test mode.
        moving_average_window: forwarded to ``heading.get_ride_heading``.
        stops: forwarded to ``heading.get_ride_heading``.
        version: forwarded to ``heading.get_ride_heading``.

    Returns:
        ``(set1, set2)``; the vectorizer is fitted on the training documents
        and then applied to the testing documents.

    Raises:
        NotImplementedError: if ``test`` is truthy.  It is a subclass of
            ``Exception``, so callers catching ``Exception`` still work.
    """
    # Fail fast, before any data access, with a message that says why.
    if test:
        raise NotImplementedError(
            'get_data_heading does not support test=True')

    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    driver_train, driver_test = da.get_rides_split(
        driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(
        settings.BIG_CHUNK * repeat, driver_id, seed=rng))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    train_rides = driver_train + other_train  # used for training
    test_rides = driver_test + other_test     # used for testing

    heading_options = dict(moving_average_window=moving_average_window,
                           stops=stops, version=version)

    train_variations = [
        heading.get_ride_heading(ride, variations=True, **heading_options)
        for ride in train_rides
    ]
    test_docs = [
        util.get_list_string(heading.get_ride_heading(ride, **heading_options))
        for ride in test_rides
    ]

    # Flatten lazily: one document per variation of every training ride.
    train_docs = [
        util.get_list_string(variation)
        for variation in itertools.chain.from_iterable(train_variations)
    ]

    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15),
                                 max_df=1000000)
    set1 = vectorizer.fit_transform(train_docs)
    set2 = vectorizer.transform(test_docs)
    return set1, set2
def get_data_movements_accel(model_id, driver_id, repeat, test=False, step=3,
                             tf=False, extra=((1, 15), 2), version=1):
    """Vectorize ``util.build_features4`` output into n-gram matrices.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.  Random rides and the driver split are
    requested with ``segments=False``.

    With ``test=True`` the first set is everything ``get_rides(driver_id)``
    returns and the second is ``BIG_CHUNK_TEST * repeat`` random rides.
    Otherwise the driver's rides are split by ``get_rides_split`` and each
    part is extended with random rides: ``BIG_CHUNK * repeat`` (seeded) for
    the training part, ``SMALL_CHUNK`` (drawn without the seeded generator)
    for the testing part.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        step: forwarded to ``util.build_features4``.
        tf: truthy selects ``TfidfVectorizer``, falsy ``CountVectorizer``.
        extra: ``(ngram_range, min_df)`` pair for the vectorizer.
        version: forwarded to ``util.build_features4``.

    Returns:
        ``(set1, set2)``; the vectorizer is fitted on the first set and then
        applied to the second.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    ngram_range, min_df = extra

    if test:
        first = list(access.get_rides(driver_id))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id,
            segments=False, seed=rng))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK, segments=False)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id,
            segments=False, seed=rng))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id, segments=False))
        first = own_train + foreign_train
        second = own_test + foreign_test

    def to_words(ride):
        return util.build_features4(ride, step=step, version=version)

    first_docs = [to_words(ride) for ride in first]
    second_docs = [to_words(ride) for ride in second]

    # Only the selected vectorizer class is looked up.
    vectorizer_cls = TfidfVectorizer if tf else CountVectorizer
    vectorizer = vectorizer_cls(min_df=min_df, ngram_range=ngram_range)
    first_matrix = vectorizer.fit_transform(first_docs)
    second_matrix = vectorizer.transform(second_docs)
    return first_matrix, second_matrix
def get_data_segment_angles(model_id, driver_id, repeat, test=False,
                            segment_version=1, extra=((1, 1), 2)):
    """Vectorize (segment, angle, segment) tokens into n-gram count matrices.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.  All rides are requested in segment
    form (``segments=True`` / ``get_rides_segments``) with
    ``version=segment_version``.

    With ``test=True`` the first set is everything
    ``get_rides_segments(driver_id)`` returns and the second is
    ``BIG_CHUNK_TEST * repeat`` random rides.  Otherwise the driver's rides
    are split by ``get_rides_split`` and each part is extended with random
    rides: ``BIG_CHUNK * repeat`` (seeded) for the training part,
    ``SMALL_CHUNK`` (drawn without the seeded generator) for the testing part.

    Each ride is read as ``ride[0]`` and ``ride[1]``; presumably the segment
    values and the angles between consecutive segments (confirm against
    ``DataAccess``).  One ``'a_angle_b'`` token is produced per consecutive
    pair of ``ride[0]`` entries.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        segment_version: forwarded as ``version`` to the data-access calls.
        extra: ``(ngram_range, min_df)`` pair for ``CountVectorizer``.

    Returns:
        ``(set1, set2)``; the vectorizer is fitted on the first set and then
        applied to the second.
    """
    def to_document(ride):
        segments, angles = ride[0], ride[1]
        # ``range`` rather than the Python-2-only ``xrange`` so the function
        # runs on both Python 2 and Python 3.
        tokens = [
            '%s_%s_%s' % (segments[i - 1], angles[i - 1], segments[i])
            for i in range(1, len(segments))
        ]
        return util.get_list_string(tokens)

    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra

    if test:
        set1 = list(da.get_rides_segments(driver_id, version=segment_version))
        set2 = list(da.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id,
            segments=True, version=segment_version, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(
            driver_id, settings.BIG_CHUNK,
            segments=True, version=segment_version)
        other_train = list(da.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id,
            segments=True, version=segment_version, seed=rng))
        other_test = list(da.get_random_rides(
            settings.SMALL_CHUNK, driver_id,
            segments=True, version=segment_version))
        set1 = driver_train + other_train
        set2 = driver_test + other_test

    set1 = [to_document(ride) for ride in set1]
    set2 = [to_document(ride) for ride in set2]

    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
def get_data_segment_lengths(model_id, driver_id, repeat, test=False,
                             segment_version=1, extra=((1, 8), 1)):
    """Vectorize the first component of each segmented ride into n-grams.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.  All rides are requested in segment
    form (``segments=True`` / ``get_rides_segments``) with
    ``version=segment_version``.

    With ``test=True`` the first set is everything
    ``get_rides_segments(driver_id)`` returns and the second is
    ``BIG_CHUNK_TEST * repeat`` random rides.  Otherwise the driver's rides
    are split by ``get_rides_split`` and each part is extended with random
    rides: ``BIG_CHUNK * repeat`` (seeded) for the training part,
    ``SMALL_CHUNK`` (drawn without the seeded generator) for the testing part.

    Only ``ride[0]`` of every ride is kept (the segment lengths, going by the
    function name) and turned into text with ``util.get_list_string``.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        segment_version: forwarded as ``version`` to the data-access calls.
        extra: ``(ngram_range, min_df)`` pair for ``CountVectorizer``.

    Returns:
        ``(set1, set2)``; the vectorizer is fitted on the first set and then
        applied to the second.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    ngram_range, min_df = extra
    segment_options = dict(segments=True, version=segment_version)

    if test:
        first = list(access.get_rides_segments(
            driver_id, version=segment_version))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id,
            seed=rng, **segment_options))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK, **segment_options)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id,
            seed=rng, **segment_options))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id, **segment_options))
        first = own_train + foreign_train
        second = own_test + foreign_test

    # Keep the first component only, then render it as text.
    first_docs = [util.get_list_string(ride[0]) for ride in first]
    second_docs = [util.get_list_string(ride[0]) for ride in second]

    counter = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    first_matrix = counter.fit_transform(first_docs)
    second_matrix = counter.transform(second_docs)
    return first_matrix, second_matrix
def get_data_segment_lengths(model_id, driver_id, repeat, test=False,
                             segment_version=1, extra=((1, 8), 1)):
    """Vectorize the first component of each segmented ride into n-grams.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.  All rides are requested in segment
    form (``segments=True`` / ``get_rides_segments``) with
    ``version=segment_version``.

    With ``test=True`` the first set is everything
    ``get_rides_segments(driver_id)`` returns and the second is
    ``BIG_CHUNK_TEST * repeat`` random rides.  Otherwise the driver's rides
    are split by ``get_rides_split`` and each part is extended with random
    rides: ``BIG_CHUNK * repeat`` (seeded) for the training part,
    ``SMALL_CHUNK`` (drawn without the seeded generator) for the testing part.

    Only ``ride[0]`` of every ride is kept (the segment lengths, going by the
    function name) and turned into text with ``util.get_list_string``.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        segment_version: forwarded as ``version`` to the data-access calls.
        extra: ``(ngram_range, min_df)`` pair for ``CountVectorizer``.

    Returns:
        ``(set1, set2)``; the vectorizer is fitted on the first set and then
        applied to the second.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    ngram_range, min_df = extra
    segment_options = dict(segments=True, version=segment_version)

    if test:
        first = list(access.get_rides_segments(
            driver_id, version=segment_version))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id,
            seed=rng, **segment_options))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK, **segment_options)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id,
            seed=rng, **segment_options))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id, **segment_options))
        first = own_train + foreign_train
        second = own_test + foreign_test

    # Keep the first component only, then render it as text.
    first_docs = [util.get_list_string(ride[0]) for ride in first]
    second_docs = [util.get_list_string(ride[0]) for ride in second]

    counter = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    first_matrix = counter.fit_transform(first_docs)
    second_matrix = counter.transform(second_docs)
    return first_matrix, second_matrix
def get_data_segment_angles_v2(model_id, driver_id, repeat, test=False,
                               segment_version=1, extra=((1, 3), 1)):
    """Vectorize (segment, angle) tokens into n-gram count matrices.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.  All rides are requested in segment
    form (``segments=True`` / ``get_rides_segments``) with
    ``version=segment_version``.

    With ``test=True`` the first set is everything
    ``get_rides_segments(driver_id)`` returns and the second is
    ``BIG_CHUNK_TEST * repeat`` random rides.  Otherwise the driver's rides
    are split by ``get_rides_split`` and each part is extended with random
    rides: ``BIG_CHUNK * repeat`` (seeded) for the training part,
    ``SMALL_CHUNK`` (drawn without the seeded generator) for the testing part.

    Each ride is read as ``ride[0]`` and ``ride[1]``; presumably the segment
    values and the angles that follow them (confirm against ``DataAccess``).
    One ``'segment_angle'`` token is produced for every entry of ``ride[0]``
    except the last.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        segment_version: forwarded as ``version`` to the data-access calls.
        extra: ``(ngram_range, min_df)`` pair for ``CountVectorizer``.

    Returns:
        ``(set1, set2)``; the vectorizer is fitted on the first set and then
        applied to the second.
    """
    def to_document(ride):
        segments, angles = ride[0], ride[1]
        # ``range`` rather than the Python-2-only ``xrange`` so the function
        # runs on both Python 2 and Python 3.  The index still runs over
        # 1..len-1, so the final segment is not emitted.
        tokens = [
            '%s_%s' % (segments[i - 1], angles[i - 1])
            for i in range(1, len(segments))
        ]
        return util.get_list_string(tokens)

    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra

    if test:
        set1 = list(da.get_rides_segments(driver_id, version=segment_version))
        set2 = list(da.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id,
            segments=True, version=segment_version, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(
            driver_id, settings.BIG_CHUNK,
            segments=True, version=segment_version)
        other_train = list(da.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id,
            segments=True, version=segment_version, seed=rng))
        other_test = list(da.get_random_rides(
            settings.SMALL_CHUNK, driver_id,
            segments=True, version=segment_version))
        set1 = driver_train + other_train
        set2 = driver_test + other_test

    set1 = [to_document(ride) for ride in set1]
    set2 = [to_document(ride) for ride in set2]

    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
def get_data_movements_v1(model_id, driver_id, repeat, test=False, step=5,
                          tf=False, version=1, extra=((1, 5), 2)):
    """Vectorize ``util.build_features3`` output into n-gram matrices.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.  Random rides and the driver split are
    requested with ``segments=False``.

    With ``test=True`` the first set is everything ``get_rides(driver_id)``
    returns and the second is ``BIG_CHUNK_TEST * repeat`` random rides.
    Otherwise the driver's rides are split by ``get_rides_split`` and each
    part is extended with random rides: ``BIG_CHUNK * repeat`` (seeded) for
    the training part, ``SMALL_CHUNK`` (drawn without the seeded generator)
    for the testing part.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        step: forwarded to ``util.build_features3``.
        tf: truthy selects ``TfidfVectorizer``, falsy ``CountVectorizer``.
        version: forwarded to ``util.build_features3``.
        extra: ``(ngram_range, min_df)`` pair for the vectorizer.

    Returns:
        ``(set1, set2)``; the vectorizer is fitted on the first set and then
        applied to the second.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    ngram_range, min_df = extra

    if test:
        first = list(access.get_rides(driver_id))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id,
            segments=False, seed=rng))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK, segments=False)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id,
            segments=False, seed=rng))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id, segments=False))
        first = own_train + foreign_train
        second = own_test + foreign_test

    def to_words(ride):
        return util.build_features3(ride, step=step, version=version)

    first_docs = [to_words(ride) for ride in first]
    second_docs = [to_words(ride) for ride in second]

    # Only the selected vectorizer class is looked up.
    vectorizer_cls = TfidfVectorizer if tf else CountVectorizer
    vectorizer = vectorizer_cls(min_df=min_df, ngram_range=ngram_range)
    first_matrix = vectorizer.fit_transform(first_docs)
    second_matrix = vectorizer.transform(second_docs)
    return first_matrix, second_matrix
def get_data_basic_accel(model_id, driver_id, repeat, test=False, version=1):
    """Build ``util.build_features_acc`` feature arrays for two ride sets.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.

    With ``test=True`` the first set is everything ``get_rides(driver_id)``
    returns and the second is ``BIG_CHUNK_TEST * repeat`` random rides.
    Otherwise the driver's rides are split by ``get_rides_split`` and each
    part is extended with random rides: ``BIG_CHUNK * repeat`` (seeded) for
    the training part, ``SMALL_CHUNK`` (drawn without the seeded generator)
    for the testing part.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        version: forwarded to ``util.build_features_acc``.

    Returns:
        ``(set1, set2)`` as ``numpy`` arrays, one entry per ride.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if test:
        first = list(access.get_rides(driver_id))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id))
        first = own_train + foreign_train
        second = own_test + foreign_test

    def featurize(ride):
        return util.build_features_acc(ride, version=version)

    first_features = [featurize(ride) for ride in first]
    second_features = [featurize(ride) for ride in second]
    return np.array(first_features), np.array(second_features)
def get_data_g_forces_v1(model_id, driver_id, repeat, test=False, min_df=1,
                         ngram_range=(1, 10), digitize=0):
    """Vectorize per-ride g-force sequences into n-gram count matrices.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.

    With ``test=True`` the first set is everything ``get_rides(driver_id)``
    returns and the second is ``BIG_CHUNK_TEST * repeat`` random rides.
    Otherwise the driver's rides are split by ``get_rides_split`` and each
    part is extended with random rides: ``BIG_CHUNK * repeat`` (seeded) for
    the training part, ``SMALL_CHUNK`` (drawn without the seeded generator)
    for the testing part.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        min_df: forwarded to ``CountVectorizer``.
        ngram_range: forwarded to ``CountVectorizer``.
        digitize: when truthy, step of the ``range(0, 800, digitize)`` bin
            edges passed to ``np.digitize``; when falsy no binning is done.

    Returns:
        ``(set1, set2)``; the vectorizer is fitted on the first set and then
        applied to the second.
    """
    def to_text(ride):
        # Optionally bin the raw values before turning them into a string.
        forces = util.get_g_forces(ride)
        if digitize:
            forces = np.digitize(forces, range(0, 800, digitize))
        return util.get_list_string(forces)

    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if test:
        first = list(access.get_rides(driver_id))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id))
        first = own_train + foreign_train    # training side
        second = own_test + foreign_test     # testing side

    first_docs = [to_text(ride) for ride in first]
    second_docs = [to_text(ride) for ride in second]

    counter = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    first_matrix = counter.fit_transform(first_docs)
    second_matrix = counter.transform(second_docs)
    return first_matrix, second_matrix
def get_data_dist_acc(model_id, driver_id, repeat, test=False):
    """Vectorize ``util.get_distance_acc_words`` output into n-gram counts.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.

    With ``test=True`` the first set is everything ``get_rides(driver_id)``
    returns and the second is ``BIG_CHUNK_TEST * repeat`` random rides.
    Otherwise the driver's rides are split by ``get_rides_split`` and each
    part is extended with random rides: ``BIG_CHUNK * repeat`` (seeded) for
    the training part, ``SMALL_CHUNK`` (drawn without the seeded generator)
    for the testing part.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.

    Returns:
        ``(set1, set2)``; the vectorizer (n-grams of length 1 to 15) is fitted
        on the first set and then applied to the second.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if test:
        first = list(access.get_rides(driver_id))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id))
        first = own_train + foreign_train
        second = own_test + foreign_test

    def to_words(ride):
        return util.get_distance_acc_words(ride, step=3)

    first_docs = [to_words(ride) for ride in first]
    second_docs = [to_words(ride) for ride in second]

    counter = CountVectorizer(min_df=1, ngram_range=(1, 15))
    first_matrix = counter.fit_transform(first_docs)
    second_matrix = counter.transform(second_docs)
    return first_matrix, second_matrix
def get_data_g_forces_v6(model_id, driver_id, repeat, test=False, version=1):
    """Vectorize ``util.get_g_forces_v4`` output into n-gram count matrices.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.

    With ``test=True`` the first set is everything ``get_rides(driver_id)``
    returns and the second is ``BIG_CHUNK_TEST * repeat`` random rides.
    Otherwise the driver's rides are split by ``get_rides_split`` and each
    part is extended with random rides: ``BIG_CHUNK * repeat`` (seeded) for
    the training part, ``SMALL_CHUNK`` (drawn without the seeded generator)
    for the testing part.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        version: forwarded to ``util.get_g_forces_v4``.

    Returns:
        ``(set1, set2)``; the vectorizer (n-grams of length 1 to 20) is fitted
        on the first set and then applied to the second.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if test:
        first = list(access.get_rides(driver_id))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id))
        first = own_train + foreign_train    # training side
        second = own_test + foreign_test     # testing side

    def to_words(ride):
        return util.get_g_forces_v4(ride, version=version)

    first_docs = [to_words(ride) for ride in first]
    second_docs = [to_words(ride) for ride in second]

    counter = CountVectorizer(min_df=1, ngram_range=(1, 20))
    first_matrix = counter.fit_transform(first_docs)
    second_matrix = counter.transform(second_docs)
    return first_matrix, second_matrix
def get_data_fft(model_id, driver_id, repeat, test=False, version=1):
    """Build FFT feature arrays for two sets of rides.

    A ``random.Random`` seeded with ``driver_id + model_id`` is handed to the
    seeded ``get_random_rides`` calls.

    With ``test=True`` the first set is everything ``get_rides(driver_id)``
    returns and the second is ``BIG_CHUNK_TEST * repeat`` random rides.
    Otherwise the driver's rides are split by ``get_rides_split`` and each
    part is extended with random rides: ``BIG_CHUNK * repeat`` (seeded) for
    the training part, ``SMALL_CHUNK`` (drawn without the seeded generator)
    for the testing part.

    Args:
        model_id: added to ``driver_id`` to form the generator seed.
        driver_id: driver whose rides are loaded.
        repeat: multiplier for the number of random rides.
        test: selects the loading mode described above.
        version: ``1`` uses ``util.fft``; any other value uses
            ``util.fft_strip``.

    Returns:
        ``(set1, set2)`` as ``numpy`` arrays, one entry per ride.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if test:
        first = list(access.get_rides(driver_id))
        second = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        own_train, own_test = access.get_rides_split(
            driver_id, settings.BIG_CHUNK)
        foreign_train = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        foreign_test = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id))
        first = own_train + foreign_train
        second = own_test + foreign_test

    # Choose the transform once instead of branching around both loops.
    transform = util.fft if version == 1 else util.fft_strip
    first_features = [transform(ride) for ride in first]
    second_features = [transform(ride) for ride in second]
    return np.array(first_features), np.array(second_features)