def get_data_g_forces_v1(model_id, driver_id, repeat, test=False, min_df=1, ngram_range=(1,10), digitize=0):
  """Build bag-of-ngram features from per-ride g-force "documents".

  When `digitize` is nonzero the g-forces are bucketed into bins of width
  `digitize` over [0, 800) before being joined into a string. The
  CountVectorizer is fitted on the training rides only.
  Returns (train_matrix, test_matrix) as sparse matrices.
  """
  def to_document(ride, bucket_width):
    # One space-separated string of (optionally bucketed) g-forces.
    forces = util.get_g_forces(ride)
    if bucket_width:
      forces = np.digitize(forces, range(0, 800, bucket_width))
    return util.get_list_string(forces)

  rng = random.Random(x=driver_id + model_id)
  da = DataAccess()
  if test:
    # first half of the train set
    train_rides = list(da.get_rides(driver_id))
    # second half of the train set
    test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
  else:
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
    train_rides = driver_train + other_train  # used for training
    test_rides = driver_test + other_test  # used for testing

  train_docs = [to_document(r, digitize) for r in train_rides]
  test_docs = [to_document(r, digitize) for r in test_rides]

  vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
  return vectorizer.fit_transform(train_docs), vectorizer.transform(test_docs)
# Exemplo n.º 2
# 0
def segment_driver(driver_id):
  '''Generate the segments in settings.SEGMENTS_FOLDER[1] for one driver.

  Each ride is smoothed, simplified with the Ramer-Douglas-Peucker
  algorithm, decomposed into per-segment lengths, durations and turn
  angles, and the bucketed values are written via DataAccess.
  '''
  da = DataAccess()
  for ride_id_minus_1, ride in enumerate(da.get_rides(driver_id)):
    ride_id = ride_id_minus_1 + 1
    if da.skip_segment(driver_id, ride_id):
      continue

    # enrich each point with its timestamp, then apply the
    # Ramer-Douglas-Peucker algorithm to simplify the polyline
    ride = [p + [i] for i, p in enumerate(smoothen(ride))]
    ride = rdp(ride, epsilon=10)

    # per-segment geometry; range() instead of the Python-2-only xrange(),
    # consistent with segment_driver_v2
    lengths = [util.euclidian_distance(ride[i-1], ride[i]) for i in range(1, len(ride))]
    times = [ride[i][2] - ride[i-1][2] for i in range(1, len(ride))]
    angles = [util.get_angle(ride[i-2], ride[i-1], ride[i]) for i in range(2, len(ride))]

    # bucket the values (log scale for lengths and times)
    lengths = util.bucket(np.log(lengths), 25, [2.2,8])
    times = util.bucket(np.log(times), 20, [1,5.5])
    angles = util.bucket(angles, 30, [0,180])

    # write results
    da.write_ride_segments(driver_id, ride_id, lengths, times, angles)

  logging.info('finished segmenting driver %s' % driver_id)
def get_data_movements_accel(model_id, driver_id, repeat, test=False, step=3, tf=False, extra=((1,15),2), version=1):
  """Vectorize movement/acceleration words (util.build_features4) as ngrams.

  `extra` packs (ngram_range, min_df); `tf` switches count weighting to
  tf-idf. Returns (train_matrix, test_matrix) as sparse matrices.
  """
  rng = random.Random(x=driver_id + model_id)
  da = DataAccess()
  ngram_range, min_df = extra

  if test:
    train_rides = list(da.get_rides(driver_id))
    test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, segments=False, seed=rng))
  else:
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK, segments=False)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, segments=False, seed=rng))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id, segments=False))
    train_rides = driver_train + other_train
    test_rides = driver_test + other_test

  train_docs = [util.build_features4(r, step=step, version=version) for r in train_rides]
  test_docs = [util.build_features4(r, step=step, version=version) for r in test_rides]

  # pick the vectorizer class, then configure it identically either way
  vectorizer_cls = TfidfVectorizer if tf else CountVectorizer
  vectorizer = vectorizer_cls(min_df=min_df, ngram_range=ngram_range)
  return vectorizer.fit_transform(train_docs), vectorizer.transform(test_docs)
# Exemplo n.º 4
# 0
def get_data_movements_v1(model_id,
                          driver_id,
                          repeat,
                          test=False,
                          step=5,
                          tf=False,
                          version=1,
                          extra=((1, 5), 2)):
    """Vectorize movement words (util.build_features3) into ngram features.

    `extra` packs (ngram_range, min_df); `tf` selects tf-idf weighting.
    Returns (train_matrix, test_matrix) as sparse matrices.
    """
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra

    if test:
        train_rides = list(da.get_rides(driver_id))
        test_rides = list(da.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, segments=False, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(
            driver_id, settings.BIG_CHUNK, segments=False)
        other_train = list(da.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, segments=False, seed=rng))
        other_test = list(da.get_random_rides(
            settings.SMALL_CHUNK, driver_id, segments=False))
        train_rides = driver_train + other_train
        test_rides = driver_test + other_test

    # keep only lengths and convert to text
    train_docs = [util.build_features3(r, step=step, version=version) for r in train_rides]
    test_docs = [util.build_features3(r, step=step, version=version) for r in test_rides]

    vectorizer_cls = TfidfVectorizer if tf else CountVectorizer
    vectorizer = vectorizer_cls(min_df=min_df, ngram_range=ngram_range)
    return vectorizer.fit_transform(train_docs), vectorizer.transform(test_docs)
def test_model_heading(model_id, driver_id, Model, get_data, repeat):
  '''Produce cross-validated predictions for one driver from heading ngrams.

  Builds heading "documents" (one four-pack of variations per ride),
  vectorizes them with ngram counts fitted on this driver's rides, then
  runs settings.FOLDS-fold CV over the driver's 200 rides.
  Returns (driver_id, predictions) with one prediction per ride.
  '''
  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()

  set1 = list(da.get_rides(driver_id)) # first half of the train set
  set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed)) # second half of the train set

  # heading_v2 gets a wider smoothing window than the other heading getters
  moving_average_window = 6 if get_data == get_data_heading_v2 else 3
  # each ride becomes a four-pack of heading variation sequences
  set1 = [heading.get_ride_heading(ride, variations=True, \
      moving_average_window=moving_average_window) for ride in set1]
  set2 = [heading.get_ride_heading(ride, variations=True, \
      moving_average_window=moving_average_window) for ride in set2]

  # join each variation's headings into one space-separated document
  set1 = [[util.get_list_string(r) for r in four_pack] for four_pack in set1]
  set2 = [[util.get_list_string(r) for r in four_pack] for four_pack in set2]

  # vocabulary is fitted on the first variation of the driver's own rides only
  vectorizer = CountVectorizer(min_df=2, ngram_range=(1,15), max_df=1000000)
  vectorizer.fit([r[0] for r in set1])
  rides = [[vectorizer.transform([r])[0] for r in four_pack] for four_pack in set1]
  other_rides = [[vectorizer.transform([r])[0] for r in four_pack] for four_pack in set2]
  other_rides = list(itertools.chain(*other_rides))

  # object array of shape (n_rides, 4) holding sparse row vectors, so
  # fancy-indexing by fold indices works below
  rides = np.array(rides)

  # labels for the stacked training matrix: 1s first, then 0s, sized to
  # match the vstack below (4 variations per ride, repeated `repeat` times)
  trainY = [1] * settings.BIG_CHUNK_TEST * 4 * repeat + [0] * settings.BIG_CHUNK_TEST * 4 * repeat
  # NOTE(review): KFold(n, n_folds=...) is the pre-0.18 sklearn API — confirm
  # the pinned scikit-learn version supports it
  kf = KFold(200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id)
  predictions = ['bug'] * 200  # sentinel; every slot must be overwritten below
  for train_fold, test_fold in kf:
    trainX = rides[train_fold]
    # stack all variations of the in-fold rides (repeated) plus the
    # other-driver rides into one sparse training matrix
    trainX = scipy.sparse.vstack(
        list(itertools.chain(*trainX)) * repeat + \
        other_rides
    )
    # predict only on the first heading variation of each held-out ride
    testX = scipy.sparse.vstack([r[0] for r in rides[test_fold]])

    assert(trainX.shape[0] == len(trainY))
    assert(testX.shape[0] == settings.SMALL_CHUNK_TEST)

    model = Model(trainX, trainY, driver_id)
    fold_predictions = model.predict(testX)
    for i, v in enumerate(test_fold):
      predictions[v] = fold_predictions[i]

  predictions = np.array(predictions)
  if settings.ENABLE_CACHE:
    util.cache_results(Model, get_data, driver_id, True, predictions, repeat)
  return driver_id, predictions
def get_data_basic_accel(model_id, driver_id, repeat, test=False, version=1):
  """Dense acceleration feature vectors via util.build_features_acc.

  Returns (train_array, test_array) as numpy arrays.
  """
  rng = random.Random(x=driver_id + model_id)
  da = DataAccess()
  if test:
    train_rides = list(da.get_rides(driver_id))
    test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
  else:
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
    train_rides = driver_train + other_train
    test_rides = driver_test + other_test

  train_features = [util.build_features_acc(r, version=version) for r in train_rides]
  test_features = [util.build_features_acc(r, version=version) for r in test_rides]
  return np.array(train_features), np.array(test_features)
# Exemplo n.º 7
# 0
def get_data_g_forces_v1(model_id,
                         driver_id,
                         repeat,
                         test=False,
                         min_df=1,
                         ngram_range=(1, 10),
                         digitize=0):
    """Bag-of-ngram features over per-ride g-force "documents".

    When `digitize` is nonzero, g-forces are bucketed into bins of width
    `digitize` over [0, 800) before being joined into a string.
    Returns (train_matrix, test_matrix) as sparse matrices.
    """
    def ride_to_text(ride, bucket_width):
        # One space-separated document of (optionally bucketed) g-forces.
        forces = util.get_g_forces(ride)
        if bucket_width:
            forces = np.digitize(forces, range(0, 800, bucket_width))
        return util.get_list_string(forces)

    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()
    if test:
        # first half of the train set
        train_rides = list(da.get_rides(driver_id))
        # second half of the train set
        test_rides = list(da.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK)
        other_train = list(da.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        train_rides = driver_train + other_train  # used for training
        test_rides = driver_test + other_test  # used for testing

    train_docs = [ride_to_text(r, digitize) for r in train_rides]
    test_docs = [ride_to_text(r, digitize) for r in test_rides]

    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    return vectorizer.fit_transform(train_docs), vectorizer.transform(test_docs)
def get_data_dist_acc(model_id, driver_id, repeat, test=False):
  """Ngram counts over distance/acceleration words (step=3).

  Returns (train_matrix, test_matrix) as sparse matrices.
  """
  rng = random.Random(x=driver_id + model_id)
  da = DataAccess()
  if test:
    train_rides = list(da.get_rides(driver_id))
    test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
  else:
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
    train_rides = driver_train + other_train
    test_rides = driver_test + other_test

  train_docs = [util.get_distance_acc_words(r, step=3) for r in train_rides]
  test_docs = [util.get_distance_acc_words(r, step=3) for r in test_rides]

  vectorizer = CountVectorizer(min_df=1, ngram_range=(1,15))
  return vectorizer.fit_transform(train_docs), vectorizer.transform(test_docs)
def get_data_g_forces_v6(model_id, driver_id, repeat, test=False, version=1):
  """Ngram counts over g-force words produced by util.get_g_forces_v4.

  Returns (train_matrix, test_matrix) as sparse matrices.
  """
  rng = random.Random(x=driver_id + model_id)
  da = DataAccess()
  if test:
    # first half of the train set
    train_rides = list(da.get_rides(driver_id))
    # second half of the train set
    test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
  else:
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
    train_rides = driver_train + other_train  # used for training
    test_rides = driver_test + other_test  # used for testing

  train_docs = [util.get_g_forces_v4(r, version=version) for r in train_rides]
  test_docs = [util.get_g_forces_v4(r, version=version) for r in test_rides]

  vectorizer = CountVectorizer(min_df=1, ngram_range=(1,20))
  return vectorizer.fit_transform(train_docs), vectorizer.transform(test_docs)
# Exemplo n.º 10
# 0
def get_data_fft(model_id, driver_id, repeat, test=False, version=1):
  """Dense FFT feature vectors per ride.

  `version == 1` uses util.fft, any other value uses util.fft_strip.
  Returns (train_array, test_array) as numpy arrays.
  """
  rng = random.Random(x=driver_id + model_id)
  da = DataAccess()
  if test:
    train_rides = list(da.get_rides(driver_id))
    test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
  else:
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
    train_rides = driver_train + other_train
    test_rides = driver_test + other_test

  # select the transform once instead of branching over both sets
  transform = util.fft if version == 1 else util.fft_strip
  train_features = [transform(r) for r in train_rides]
  test_features = [transform(r) for r in test_rides]

  return np.array(train_features), np.array(test_features)
# Exemplo n.º 11
# 0
def segment_driver_v2(driver_id):
    '''Generate the segments in settings.SEGMENTS_FOLDER[2] for one driver.'''
    da = DataAccess()
    for ride_index, ride in enumerate(da.get_rides(driver_id)):
        ride_id = ride_index + 1
        if da.skip_segment(driver_id, ride_id, version=2):
            continue

        # enrich each point with its timestamp, then simplify the polyline
        # with the Ramer-Douglas-Peucker algorithm
        stamped = [point + [t] for t, point in enumerate(ride)]
        simplified = rdp(stamped, epsilon=4)

        # per-segment geometry between consecutive simplified points
        lengths = [util.euclidian_distance(a, b)
                   for a, b in zip(simplified, simplified[1:])]
        times = [b[2] - a[2] for a, b in zip(simplified, simplified[1:])]
        angles = [util.get_angle(simplified[i - 2], simplified[i - 1], simplified[i])
                  for i in range(2, len(simplified))]

        # histogram the raw values into fixed bins; the huge final edge on
        # lengths/times acts as a catch-all bin for outliers
        lengths = np.histogram(lengths,
                               bins=list(range(0, 700, 20)) + [1000000000])[0]
        times = np.histogram(times,
                             bins=list(range(0, 60, 4)) + [1000000000])[0]
        angles = np.histogram(angles, bins=list(range(0, 181, 20)))[0]

        # write results
        da.write_ride_segments(driver_id, ride_id, lengths, times, angles,
                               version=2)

    logging.info('finished segmenting driver %s' % driver_id)
# Exemplo n.º 12
# 0
def get_data_basic(model_id,
                   driver_id,
                   repeat,
                   test=False,
                   normalized=False,
                   version=1):
    """Dense per-ride feature vectors via util.build_features.

    Returns (train_array, test_array) as numpy arrays.
    """
    def featurize(ride):
        # Single place for the feature call so both sets stay in sync.
        return util.build_features(ride, normalized=normalized, version=version)

    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()
    if test:
        train_rides = list(da.get_rides(driver_id))
        test_rides = list(da.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK)
        other_train = list(da.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        train_rides = driver_train + other_train
        test_rides = driver_test + other_test

    train_features = [featurize(r) for r in train_rides]
    test_features = [featurize(r) for r in test_rides]
    return np.array(train_features), np.array(test_features)
# Exemplo n.º 13
# 0
def segment_driver_v2(driver_id):
  '''Generate the segments in settings.SEGMENTS_FOLDER[2] for one driver.

  Rides are enriched with timestamps, simplified with the
  Ramer-Douglas-Peucker algorithm, decomposed into segment lengths,
  durations and turn angles, histogrammed, and written via DataAccess.
  '''
  da = DataAccess()
  for ride_id_minus_1, ride in enumerate(da.get_rides(driver_id)):
    ride_id = ride_id_minus_1 + 1
    if da.skip_segment(driver_id, ride_id, version=2):
      continue

    # enrich each point with its timestamp, then apply the
    # Ramer-Douglas-Peucker algorithm to simplify the polyline
    ride = [p + [i] for i, p in enumerate(ride)]
    ride = rdp(ride, epsilon=4)

    # range() instead of the Python-2-only xrange()
    lengths = [util.euclidian_distance(ride[i-1], ride[i]) for i in range(1, len(ride))]
    times = [ride[i][2] - ride[i-1][2] for i in range(1, len(ride))]
    angles = [util.get_angle(ride[i-2], ride[i-1], ride[i]) for i in range(2, len(ride))]

    # range() is not a list in Python 3, so wrap it before concatenating the
    # huge catch-all outlier bin edge
    lengths = np.histogram(lengths, bins=list(range(0, 700, 20)) + [1000000000])[0]
    times = np.histogram(times, bins=list(range(0, 60, 4)) + [1000000000])[0]
    angles = np.histogram(angles, bins=list(range(0, 181, 20)))[0]

    # write results
    da.write_ride_segments(driver_id, ride_id, lengths, times, angles, version=2)

  logging.info('finished segmenting driver %s' % driver_id)