def get_data_g_forces_v1(model_id, driver_id, repeat, test=False, min_df=1,
                         ngram_range=(1, 10), digitize=0):
    """Vectorize per-ride g-force sequences into n-gram count matrices.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` that is
            handed to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose rides are loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 is every ride from ``get_rides(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides. Otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides (presumably from other drivers --
            confirm in ``DataAccess``).
        min_df: Forwarded to ``CountVectorizer``.
        ngram_range: Forwarded to ``CountVectorizer``.
        digitize: When truthy, g-forces are binned with ``np.digitize`` over
            ``range(0, 800, digitize)`` before being turned into text.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform``, set2 from
        ``transform`` on the same vectorizer.
    """
    def ride_to_text(ride):
        # Closure over ``digitize``: only bin the g-forces when requested.
        forces = util.get_g_forces(ride)
        if not digitize:
            return util.get_list_string(forces)
        binned = np.digitize(forces, range(0, 800, digitize))
        return util.get_list_string(binned)

    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    if not test:
        own_fit, own_eval = access.get_rides_split(driver_id,
                                                   settings.BIG_CHUNK)
        extra_fit = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(settings.SMALL_CHUNK,
                                                  driver_id))
        fit_rides = own_fit + extra_fit      # used for training
        eval_rides = own_eval + extra_eval   # used for testing
    else:
        fit_rides = list(access.get_rides(driver_id))
        eval_rides = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))

    fit_docs = [ride_to_text(r) for r in fit_rides]
    eval_docs = [ride_to_text(r) for r in eval_rides]

    counter = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    fit_matrix = counter.fit_transform(fit_docs)
    return fit_matrix, counter.transform(eval_docs)
예제 #2
0
def get_data_heading(model_id,
                     driver_id,
                     repeat,
                     test=False,
                     moving_average_window=3,
                     stops=False,
                     version=1):
    """Vectorize ride heading sequences into n-gram count matrices.

    Training rides are expanded: ``heading.get_ride_heading`` is called with
    ``variations=True`` and every sequence it returns for a ride becomes its
    own training document. Test rides are converted with one call each, without
    the ``variations`` argument.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` call.
        driver_id: Driver whose rides are split into train/test parts.
        repeat: Multiplier on the number of random training rides.
        test: Must be falsy; test mode is not supported here.
        moving_average_window: Forwarded to ``heading.get_ride_heading``.
        stops: Forwarded to ``heading.get_ride_heading``.
        version: Forwarded to ``heading.get_ride_heading``.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform`` on the training
        documents, set2 from ``transform`` on the test documents.

    Raises:
        NotImplementedError: If ``test`` is truthy.
    """
    if test:
        # Fail fast, before any data access, and say why.
        raise NotImplementedError(
            'get_data_heading does not support test=True')

    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()

    driver_train, driver_test = da.get_rides_split(driver_id,
                                                   settings.BIG_CHUNK)
    other_train = list(
        da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
    # Note: this draw is not given the seeded generator.
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    train_rides = driver_train + other_train  # used for training
    test_rides = driver_test + other_test  # used for testing

    heading_kwargs = dict(moving_average_window=moving_average_window,
                          stops=stops,
                          version=version)
    train_variations = [
        heading.get_ride_heading(ride, variations=True, **heading_kwargs)
        for ride in train_rides
    ]
    test_docs = [
        util.get_list_string(heading.get_ride_heading(ride, **heading_kwargs))
        for ride in test_rides
    ]

    # Flatten: each variation of each training ride is a separate document.
    train_docs = [
        util.get_list_string(sequence)
        for sequence in itertools.chain.from_iterable(train_variations)
    ]

    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15), max_df=1000000)
    set1 = vectorizer.fit_transform(train_docs)
    set2 = vectorizer.transform(test_docs)

    return set1, set2
예제 #3
0
def get_data_fft(model_id, driver_id, repeat, test=False, version=1):
    """Build FFT feature arrays for two sets of rides.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose rides are loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 is every ride from ``get_rides(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides.
        version: ``1`` selects ``util.fft``; any other value selects
            ``util.fft_strip``.

    Returns:
        ``(np.array(set1), np.array(set2))`` of per-ride features.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if not test:
        own_fit, own_eval = access.get_rides_split(driver_id,
                                                   settings.BIG_CHUNK)
        extra_fit = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(settings.SMALL_CHUNK,
                                                  driver_id))
        first_rides = own_fit + extra_fit
        second_rides = own_eval + extra_eval
    else:
        first_rides = list(access.get_rides(driver_id))
        second_rides = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))

    # Pick the transform once instead of duplicating the comprehensions.
    transform = util.fft if version == 1 else util.fft_strip
    first_features = [transform(r) for r in first_rides]
    second_features = [transform(r) for r in second_rides]

    return np.array(first_features), np.array(second_features)
예제 #4
0
def get_data_acc4acc(model_id, driver_id, repeat, test=False, version=1):
    """Vectorize ``util.get_acc4acc_words`` documents into n-gram counts.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose rides are loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 is every ride from ``get_rides(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides.
        version: Forwarded to ``util.get_acc4acc_words``; it also sets the
            upper n-gram bound (15 for version 1, 20 otherwise).

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform``, set2 from
        ``transform`` on the same ``CountVectorizer``.
    """
    def load_ride_sets():
        if test:
            return (list(access.get_rides(driver_id)),
                    list(access.get_random_rides(
                        settings.BIG_CHUNK_TEST * repeat, driver_id,
                        seed=rng)))
        own_fit, own_eval = access.get_rides_split(driver_id,
                                                   settings.BIG_CHUNK)
        extra_fit = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(settings.SMALL_CHUNK,
                                                  driver_id))
        return own_fit + extra_fit, own_eval + extra_eval

    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    first_rides, second_rides = load_ride_sets()

    first_docs = [util.get_acc4acc_words(r, step=3, version=version)
                  for r in first_rides]
    second_docs = [util.get_acc4acc_words(r, step=3, version=version)
                   for r in second_rides]

    if version == 1:
        upper_n = 15
    else:
        upper_n = 20
    counter = CountVectorizer(min_df=1, ngram_range=(1, upper_n))
    first_matrix = counter.fit_transform(first_docs)
    return first_matrix, counter.transform(second_docs)
예제 #5
0
def get_data_g_forces_v6(model_id, driver_id, repeat, test=False, version=1):
    """Vectorize ``util.get_g_forces_v4`` documents into n-gram counts.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose rides are loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 is every ride from ``get_rides(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides.
        version: Forwarded to ``util.get_g_forces_v4``.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform``, set2 from
        ``transform`` on a ``CountVectorizer(min_df=1, ngram_range=(1, 20))``.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if not test:
        own_fit, own_eval = access.get_rides_split(driver_id,
                                                   settings.BIG_CHUNK)
        extra_fit = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(settings.SMALL_CHUNK,
                                                  driver_id))
        first_rides = own_fit + extra_fit      # used for training
        second_rides = own_eval + extra_eval   # used for testing
    else:
        first_rides = list(access.get_rides(driver_id))
        second_rides = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))

    def to_documents(rides):
        return [util.get_g_forces_v4(r, version=version) for r in rides]

    first_docs = to_documents(first_rides)
    second_docs = to_documents(second_rides)

    counter = CountVectorizer(min_df=1, ngram_range=(1, 20))
    first_matrix = counter.fit_transform(first_docs)
    second_matrix = counter.transform(second_docs)
    return first_matrix, second_matrix
예제 #6
0
def get_data_basic_accel(model_id, driver_id, repeat, test=False, version=1):
    """Build ``util.build_features_acc`` feature arrays for two ride sets.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose rides are loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 is every ride from ``get_rides(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides.
        version: Forwarded to ``util.build_features_acc``.

    Returns:
        ``(np.array(set1), np.array(set2))`` of per-ride features.
    """
    def load_ride_sets():
        if test:
            return (list(access.get_rides(driver_id)),
                    list(access.get_random_rides(
                        settings.BIG_CHUNK_TEST * repeat, driver_id,
                        seed=rng)))
        own_fit, own_eval = access.get_rides_split(driver_id,
                                                   settings.BIG_CHUNK)
        extra_fit = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(settings.SMALL_CHUNK,
                                                  driver_id))
        return own_fit + extra_fit, own_eval + extra_eval

    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    first_rides, second_rides = load_ride_sets()

    first_features = [util.build_features_acc(r, version=version)
                      for r in first_rides]
    second_features = [util.build_features_acc(r, version=version)
                       for r in second_rides]
    return np.array(first_features), np.array(second_features)
def get_data_heading(model_id, driver_id, repeat, test=False, moving_average_window=3, stops=False, version=1):
    """Vectorize ride heading sequences into n-gram count matrices.

    For training rides ``heading.get_ride_heading`` is called with
    ``variations=True``; each sequence it returns becomes a separate training
    document. For test rides it is called once per ride without
    ``variations``.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` call.
        driver_id: Driver whose rides are split into train/test parts.
        repeat: Multiplier on the number of random training rides.
        test: Must be falsy; test mode is not supported here.
        moving_average_window: Forwarded to ``heading.get_ride_heading``.
        stops: Forwarded to ``heading.get_ride_heading``.
        version: Forwarded to ``heading.get_ride_heading``.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform`` on the training
        documents, set2 from ``transform`` on the test documents.

    Raises:
        NotImplementedError: If ``test`` is truthy.
    """
    if test:
        # Reject unsupported mode up front with an explanatory message.
        raise NotImplementedError('get_data_heading does not support test=True')

    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()

    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
    # Note: this draw is not given the seeded generator.
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    train_rides = driver_train + other_train  # used for training
    test_rides = driver_test + other_test  # used for testing

    def ride_heading(ride, **extra):
        return heading.get_ride_heading(
            ride,
            moving_average_window=moving_average_window,
            stops=stops,
            version=version,
            **extra
        )

    train_variations = [ride_heading(ride, variations=True) for ride in train_rides]
    test_docs = [util.get_list_string(ride_heading(ride)) for ride in test_rides]

    # Flatten: every variation of every training ride is its own document.
    train_docs = [
        util.get_list_string(sequence)
        for sequence in itertools.chain.from_iterable(train_variations)
    ]

    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15), max_df=1000000)
    set1 = vectorizer.fit_transform(train_docs)
    set2 = vectorizer.transform(test_docs)

    return set1, set2
def get_data_movements_accel(model_id, driver_id, repeat, test=False, step=3,
                             tf=False, extra=((1, 15), 2), version=1):
    """Vectorize ``util.build_features4`` documents into n-gram matrices.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose rides are loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 is every ride from ``get_rides(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides. Random rides and the split are
            requested with ``segments=False``.
        step: Forwarded to ``util.build_features4``.
        tf: Truthy selects ``TfidfVectorizer``; otherwise ``CountVectorizer``.
        extra: Pair ``(ngram_range, min_df)`` for the vectorizer.
        version: Forwarded to ``util.build_features4``.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform``, set2 from
        ``transform`` on the same vectorizer.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    ngrams, min_doc_freq = extra

    if not test:
        own_fit, own_eval = access.get_rides_split(
            driver_id, settings.BIG_CHUNK, segments=False)
        extra_fit = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, segments=False, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id, segments=False))
        first_rides = own_fit + extra_fit
        second_rides = own_eval + extra_eval
    else:
        first_rides = list(access.get_rides(driver_id))
        second_rides = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, segments=False,
            seed=rng))

    first_docs = [util.build_features4(ride, step=step, version=version)
                  for ride in first_rides]
    second_docs = [util.build_features4(ride, step=step, version=version)
                   for ride in second_rides]

    vectorizer_cls = TfidfVectorizer if tf else CountVectorizer
    model = vectorizer_cls(min_df=min_doc_freq, ngram_range=ngrams)

    first_matrix = model.fit_transform(first_docs)
    return first_matrix, model.transform(second_docs)
예제 #9
0
def get_data_segment_angles(model_id,
                            driver_id,
                            repeat,
                            test=False,
                            segment_version=1,
                            extra=((1, 1), 2)):
    """Vectorize three-part segment tokens into n-gram count matrices.

    Each loaded record ``d`` is turned into tokens of the form
    ``'%s_%s_%s' % (d[0][i - 1], d[1][i - 1], d[0][i])`` for ``i`` from 1 to
    ``len(d[0]) - 1``. Presumably ``d[0]`` holds segment lengths and ``d[1]``
    the angles between consecutive segments -- confirm in ``DataAccess``.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose segment data is loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 comes from ``get_rides_segments(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides. All requests use ``segments=True``
            where the call accepts it.
        segment_version: Passed as ``version`` to every ``DataAccess`` call.
        extra: Pair ``(ngram_range, min_df)`` for ``CountVectorizer``.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform``, set2 from
        ``transform`` on the same vectorizer.
    """
    def to_document(record):
        # ``range`` rather than ``xrange`` so this runs on Python 2 and 3.
        tokens = [
            '%s_%s_%s' % (record[0][i - 1], record[1][i - 1], record[0][i])
            for i in range(1, len(record[0]))
        ]
        return util.get_list_string(tokens)

    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra

    if test:
        set1 = list(da.get_rides_segments(driver_id, version=segment_version))
        set2 = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                driver_id,
                                segments=True,
                                version=segment_version,
                                seed=seed))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK,
                                                       segments=True,
                                                       version=segment_version)
        other_train = list(
            da.get_random_rides(settings.BIG_CHUNK * repeat,
                                driver_id,
                                segments=True,
                                version=segment_version,
                                seed=seed))
        # Note: this draw is not given the seeded generator.
        other_test = list(
            da.get_random_rides(settings.SMALL_CHUNK,
                                driver_id,
                                segments=True,
                                version=segment_version))

        set1 = driver_train + other_train
        set2 = driver_test + other_test

    # One text document per record.
    set1 = [to_document(d) for d in set1]
    set2 = [to_document(d) for d in set2]

    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
예제 #10
0
def get_data_segment_lengths(model_id, driver_id, repeat, test=False,
                             segment_version=1, extra=((1, 8), 1)):
    """Vectorize the first component of each segment record into n-gram counts.

    Only ``record[0]`` of every loaded record is kept (the original comment
    calls these "lengths") and converted to text with
    ``util.get_list_string``.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose segment data is loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 comes from ``get_rides_segments(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides.
        segment_version: Passed as ``version`` to every ``DataAccess`` call.
        extra: Pair ``(ngram_range, min_df)`` for ``CountVectorizer``.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform``, set2 from
        ``transform`` on the same vectorizer.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    ngrams, min_doc_freq = extra

    if not test:
        own_fit, own_eval = access.get_rides_split(
            driver_id, settings.BIG_CHUNK,
            segments=True, version=segment_version)
        extra_fit = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id,
            segments=True, version=segment_version, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id,
            segments=True, version=segment_version))
        first_records = own_fit + extra_fit
        second_records = own_eval + extra_eval
    else:
        first_records = list(
            access.get_rides_segments(driver_id, version=segment_version))
        second_records = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id,
            segments=True, version=segment_version, seed=rng))

    # Drop everything but the first component of each record.
    first_lengths = [record[0] for record in first_records]
    second_lengths = [record[0] for record in second_records]

    # Render each sequence as a text document.
    first_docs = [util.get_list_string(seq) for seq in first_lengths]
    second_docs = [util.get_list_string(seq) for seq in second_lengths]

    counter = CountVectorizer(min_df=min_doc_freq, ngram_range=ngrams)
    first_matrix = counter.fit_transform(first_docs)
    return first_matrix, counter.transform(second_docs)
def get_data_segment_lengths(model_id, driver_id, repeat, test=False, segment_version=1, extra=((1, 8), 1)):
    """Turn the first component of each segment record into n-gram counts.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose segment data is loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 comes from ``get_rides_segments(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides.
        segment_version: Passed as ``version`` to every ``DataAccess`` call.
        extra: Pair ``(ngram_range, min_df)`` for ``CountVectorizer``.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform``, set2 from
        ``transform`` on the same vectorizer.
    """
    def load_record_sets():
        if test:
            return (
                list(access.get_rides_segments(driver_id, version=segment_version)),
                list(access.get_random_rides(
                    settings.BIG_CHUNK_TEST * repeat,
                    driver_id,
                    segments=True,
                    version=segment_version,
                    seed=rng,
                )),
            )
        own_fit, own_eval = access.get_rides_split(
            driver_id,
            settings.BIG_CHUNK,
            segments=True,
            version=segment_version,
        )
        extra_fit = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat,
            driver_id,
            segments=True,
            version=segment_version,
            seed=rng,
        ))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(
            settings.SMALL_CHUNK,
            driver_id,
            segments=True,
            version=segment_version,
        ))
        return own_fit + extra_fit, own_eval + extra_eval

    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    ngrams, min_doc_freq = extra

    first_records, second_records = load_record_sets()

    # Keep only record[0] (the "lengths" per the original comment).
    first_lengths = [rec[0] for rec in first_records]
    second_lengths = [rec[0] for rec in second_records]

    # Convert each kept sequence to text.
    first_docs = [util.get_list_string(seq) for seq in first_lengths]
    second_docs = [util.get_list_string(seq) for seq in second_lengths]

    counter = CountVectorizer(min_df=min_doc_freq, ngram_range=ngrams)
    first_matrix = counter.fit_transform(first_docs)
    second_matrix = counter.transform(second_docs)
    return first_matrix, second_matrix
def get_data_segment_angles_v2(model_id, driver_id, repeat, test=False, segment_version=1, extra=((1, 3), 1)):
    """Vectorize two-part segment tokens into n-gram count matrices.

    Each loaded record ``d`` is turned into tokens of the form
    ``'%s_%s' % (d[0][i - 1], d[1][i - 1])`` for ``i`` from 1 to
    ``len(d[0]) - 1``; the last element of ``d[0]`` is therefore never used.
    Presumably ``d[0]`` holds segment lengths and ``d[1]`` angles -- confirm
    in ``DataAccess``.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose segment data is loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 comes from ``get_rides_segments(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides.
        segment_version: Passed as ``version`` to every ``DataAccess`` call.
        extra: Pair ``(ngram_range, min_df)`` for ``CountVectorizer``.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform``, set2 from
        ``transform`` on the same vectorizer.
    """
    def to_document(record):
        # ``range`` rather than ``xrange`` so this runs on Python 2 and 3.
        tokens = [
            '%s_%s' % (record[0][i - 1], record[1][i - 1])
            for i in range(1, len(record[0]))
        ]
        return util.get_list_string(tokens)

    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra

    if test:
        set1 = list(da.get_rides_segments(driver_id, version=segment_version))
        set2 = list(da.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat,
            driver_id,
            segments=True,
            version=segment_version,
            seed=seed
        ))
    else:
        driver_train, driver_test = da.get_rides_split(
            driver_id,
            settings.BIG_CHUNK,
            segments=True,
            version=segment_version
        )
        other_train = list(da.get_random_rides(
            settings.BIG_CHUNK * repeat,
            driver_id,
            segments=True,
            version=segment_version,
            seed=seed
        ))
        # Note: this draw is not given the seeded generator.
        other_test = list(da.get_random_rides(
            settings.SMALL_CHUNK,
            driver_id,
            segments=True,
            version=segment_version
        ))

        set1 = driver_train + other_train
        set2 = driver_test + other_test

    # One text document per record.
    set1 = [to_document(d) for d in set1]
    set2 = [to_document(d) for d in set2]

    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
예제 #13
0
def get_data_movements_v1(model_id, driver_id, repeat, test=False, step=5,
                          tf=False, version=1, extra=((1, 5), 2)):
    """Vectorize ``util.build_features3`` documents into n-gram matrices.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose rides are loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 is every ride from ``get_rides(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides. Random rides and the split are
            requested with ``segments=False``.
        step: Forwarded to ``util.build_features3``.
        tf: Truthy selects ``TfidfVectorizer``; otherwise ``CountVectorizer``.
        version: Forwarded to ``util.build_features3``.
        extra: Pair ``(ngram_range, min_df)`` for the vectorizer.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform``, set2 from
        ``transform`` on the same vectorizer.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    ngrams, min_doc_freq = extra

    if not test:
        own_fit, own_eval = access.get_rides_split(
            driver_id, settings.BIG_CHUNK, segments=False)
        extra_fit = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, segments=False, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(
            settings.SMALL_CHUNK, driver_id, segments=False))
        first_rides = own_fit + extra_fit
        second_rides = own_eval + extra_eval
    else:
        first_rides = list(access.get_rides(driver_id))
        second_rides = list(access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, segments=False,
            seed=rng))

    # Build one feature document per ride.
    first_docs = [util.build_features3(ride, step=step, version=version)
                  for ride in first_rides]
    second_docs = [util.build_features3(ride, step=step, version=version)
                   for ride in second_rides]

    vectorizer_cls = TfidfVectorizer if tf else CountVectorizer
    model = vectorizer_cls(min_df=min_doc_freq, ngram_range=ngrams)

    first_matrix = model.fit_transform(first_docs)
    second_matrix = model.transform(second_docs)
    return first_matrix, second_matrix
def get_data_basic_accel(model_id, driver_id, repeat, test=False, version=1):
    """Compute ``util.build_features_acc`` features for two sets of rides.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose rides are loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 is every ride from ``get_rides(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides.
        version: Forwarded to ``util.build_features_acc``.

    Returns:
        ``(np.array(set1), np.array(set2))`` of per-ride features.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if not test:
        own_fit, own_eval = access.get_rides_split(driver_id, settings.BIG_CHUNK)
        extra_fit = list(access.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(settings.SMALL_CHUNK, driver_id))
        first_rides = own_fit + extra_fit
        second_rides = own_eval + extra_eval
    else:
        first_rides = list(access.get_rides(driver_id))
        second_rides = list(access.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))

    def featurize(rides):
        return [util.build_features_acc(r, version=version) for r in rides]

    first_features = featurize(first_rides)
    second_features = featurize(second_rides)
    return np.array(first_features), np.array(second_features)
예제 #15
0
def get_data_g_forces_v1(model_id, driver_id, repeat, test=False, min_df=1,
                         ngram_range=(1, 10), digitize=0):
    """Turn per-ride g-force sequences into n-gram count matrices.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose rides are loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 is every ride from ``get_rides(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides.
        min_df: Forwarded to ``CountVectorizer``.
        ngram_range: Forwarded to ``CountVectorizer``.
        digitize: When truthy, g-forces are binned with ``np.digitize`` over
            ``range(0, 800, digitize)`` before conversion to text.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform``, set2 from
        ``transform`` on the same vectorizer.
    """
    def load_ride_sets():
        if test:
            return (list(access.get_rides(driver_id)),
                    list(access.get_random_rides(
                        settings.BIG_CHUNK_TEST * repeat, driver_id,
                        seed=rng)))
        own_fit, own_eval = access.get_rides_split(driver_id,
                                                   settings.BIG_CHUNK)
        extra_fit = list(access.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(settings.SMALL_CHUNK,
                                                  driver_id))
        # First element is used for training, second for testing.
        return own_fit + extra_fit, own_eval + extra_eval

    def as_text(ride):
        values = util.get_g_forces(ride)
        if digitize:
            # Replace raw values by their bin index.
            values = np.digitize(values, range(0, 800, digitize))
        return util.get_list_string(values)

    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    first_rides, second_rides = load_ride_sets()

    first_docs = [as_text(r) for r in first_rides]
    second_docs = [as_text(r) for r in second_rides]

    counter = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    first_matrix = counter.fit_transform(first_docs)
    second_matrix = counter.transform(second_docs)
    return first_matrix, second_matrix
def get_data_dist_acc(model_id, driver_id, repeat, test=False):
    """Vectorize ``util.get_distance_acc_words`` documents into n-gram counts.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose rides are loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 is every ride from ``get_rides(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform``, set2 from
        ``transform`` on a ``CountVectorizer(min_df=1, ngram_range=(1, 15))``.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    if not test:
        own_fit, own_eval = access.get_rides_split(driver_id, settings.BIG_CHUNK)
        extra_fit = list(access.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(settings.SMALL_CHUNK, driver_id))
        first_rides = own_fit + extra_fit
        second_rides = own_eval + extra_eval
    else:
        first_rides = list(access.get_rides(driver_id))
        second_rides = list(access.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))

    first_docs = [util.get_distance_acc_words(r, step=3) for r in first_rides]
    second_docs = [util.get_distance_acc_words(r, step=3) for r in second_rides]

    counter = CountVectorizer(min_df=1, ngram_range=(1, 15))
    first_matrix = counter.fit_transform(first_docs)
    return first_matrix, counter.transform(second_docs)
def get_data_g_forces_v6(model_id, driver_id, repeat, test=False, version=1):
    """Turn ``util.get_g_forces_v4`` documents into n-gram count matrices.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose rides are loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 is every ride from ``get_rides(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides.
        version: Forwarded to ``util.get_g_forces_v4``.

    Returns:
        ``(set1, set2)``: set1 comes from ``fit_transform``, set2 from
        ``transform`` on a ``CountVectorizer(min_df=1, ngram_range=(1, 20))``.
    """
    def load_ride_sets():
        if test:
            return (
                list(access.get_rides(driver_id)),
                list(access.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng)),
            )
        own_fit, own_eval = access.get_rides_split(driver_id, settings.BIG_CHUNK)
        extra_fit = list(access.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(settings.SMALL_CHUNK, driver_id))
        # First element is used for training, second for testing.
        return own_fit + extra_fit, own_eval + extra_eval

    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    first_rides, second_rides = load_ride_sets()

    first_docs = [util.get_g_forces_v4(r, version=version) for r in first_rides]
    second_docs = [util.get_g_forces_v4(r, version=version) for r in second_rides]

    counter = CountVectorizer(min_df=1, ngram_range=(1, 20))
    first_matrix = counter.fit_transform(first_docs)
    return first_matrix, counter.transform(second_docs)
def get_data_fft(model_id, driver_id, repeat, test=False, version=1):
    """Compute FFT features for two sets of rides.

    Args:
        model_id: Added to ``driver_id`` to seed the ``random.Random`` passed
            to the seeded ``get_random_rides`` calls.
        driver_id: Driver whose rides are loaded.
        repeat: Multiplier on the number of random rides requested.
        test: If truthy, set1 is every ride from ``get_rides(driver_id)`` and
            set2 is ``BIG_CHUNK_TEST * repeat`` random rides; otherwise the
            driver's rides are split by ``get_rides_split`` and each part is
            extended with random rides.
        version: ``1`` uses ``util.fft``; anything else uses
            ``util.fft_strip``.

    Returns:
        ``(np.array(set1), np.array(set2))`` of per-ride features.
    """
    def load_ride_sets():
        if test:
            return (
                list(access.get_rides(driver_id)),
                list(access.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng)),
            )
        own_fit, own_eval = access.get_rides_split(driver_id, settings.BIG_CHUNK)
        extra_fit = list(access.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        # Note: this draw is not given the seeded generator.
        extra_eval = list(access.get_random_rides(settings.SMALL_CHUNK, driver_id))
        return own_fit + extra_fit, own_eval + extra_eval

    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    first_rides, second_rides = load_ride_sets()

    if version != 1:
        first_features = [util.fft_strip(r) for r in first_rides]
        second_features = [util.fft_strip(r) for r in second_rides]
    else:
        first_features = [util.fft(r) for r in first_rides]
        second_features = [util.fft(r) for r in second_rides]

    return np.array(first_features), np.array(second_features)