Example #1
def segment_driver(driver_id):
  ''' this generates the segments in settings.SEGMENTS_FOLDER[1] '''
  da = DataAccess()
  for ride_id_minus_1, ride in enumerate(da.get_rides(driver_id)):
    ride_id = ride_id_minus_1 + 1
    if da.skip_segment(driver_id, ride_id):
      continue

    # apply the Ramer-Douglas-Peucker algorithm
    ride = [p + [i]  for i, p in enumerate(smoothen(ride))] # enrich with timestamp
    ride = rdp(ride, epsilon=10)

    lengths = [util.euclidian_distance(ride[i-1], ride[i]) for i in xrange(1, len(ride))]
    times = [ride[i][2] - ride[i-1][2] for i in xrange(1, len(ride))]
    angles = [util.get_angle(ride[i-2], ride[i-1], ride[i]) for i in xrange(2, len(ride))]

    # bucket the values
    lengths = util.bucket(np.log(lengths), 25, [2.2,8]) # [int(l) for l in lengths]
    times = util.bucket(np.log(times), 20, [1,5.5]) # [int(t) for t in times]
    angles = util.bucket(angles, 30, [0,180]) # [int(a) for a in angles]

    # write results
    da.write_ride_segments(driver_id, ride_id, lengths, times, angles)

  logging.info('finished segmenting driver %s' % driver_id)
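A minimal sketch of the bucketing helper this example leans on, assuming util.bucket clips each value into the given [lo, hi] range and maps it to an integer bucket index in [0, n_buckets - 1] (hypothetical; the real util module is not part of these examples):

import numpy as np

def bucket(values, n_buckets, value_range):
    # hypothetical reconstruction of util.bucket
    lo, hi = value_range
    clipped = np.clip(values, lo, hi)
    # scale into [0, n_buckets) and floor to integer indices
    indices = ((clipped - lo) / (hi - lo) * n_buckets).astype(int)
    return np.minimum(indices, n_buckets - 1).tolist()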
Example #2
def get_data_heading(model_id, driver_id, repeat, test=False, moving_average_window=3, stops=False, version=1):
  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()
  if test:
    raise Exception

  driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
  other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
  other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

  set1 = driver_train + other_train # used for training
  set2 = driver_test + other_test # used for testing

  set1 = [heading.get_ride_heading(ride, variations=True, \
      moving_average_window=moving_average_window, stops=stops, version=version) for ride in set1]
  set2 = [util.get_list_string(heading.get_ride_heading(ride, \
      moving_average_window=moving_average_window, stops=stops, version=version)) for ride in set2]

  set1 = list(itertools.chain(*set1))

  set1 = [util.get_list_string(r) for r in set1]

  vectorizer = CountVectorizer(min_df=2, ngram_range=(1,15), max_df=1000000)
  set1 = vectorizer.fit_transform(set1)
  set2 = vectorizer.transform(set2)

  return set1, set2
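A hedged sketch of how these matrices could feed a classifier downstream, assuming variations=True yields four heading variants per ride (the "four_pack" naming in the test_model_heading examples suggests this); the project's own Model classes are not shown, so a plain scikit-learn classifier stands in:

from sklearn.linear_model import LogisticRegression

repeat = 2
trainX, testX = get_data_heading(model_id=1, driver_id=100, repeat=repeat)
# rows of trainX: the driver's rides first (four variants each),
# then the randomly sampled rides from other drivers
trainY = ([1] * (settings.BIG_CHUNK * 4) +
          [0] * (settings.BIG_CHUNK * repeat * 4))
clf = LogisticRegression().fit(trainX, trainY)
driver_probability = clf.predict_proba(testX)[:, 1]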
Example #3
def get_data_heading(model_id,
                     driver_id,
                     repeat,
                     test=False,
                     moving_average_window=3,
                     stops=False,
                     version=1):
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    if test:
        raise Exception

    driver_train, driver_test = da.get_rides_split(driver_id,
                                                   settings.BIG_CHUNK)
    other_train = list(
        da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    set1 = driver_train + other_train  # used for training
    set2 = driver_test + other_test  # used for testing

    set1 = [heading.get_ride_heading(ride, variations=True, \
        moving_average_window=moving_average_window, stops=stops, version=version) for ride in set1]
    set2 = [util.get_list_string(heading.get_ride_heading(ride, \
        moving_average_window=moving_average_window, stops=stops, version=version)) for ride in set2]

    set1 = list(itertools.chain(*set1))

    set1 = [util.get_list_string(r) for r in set1]

    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15), max_df=1000000)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)

    return set1, set2
Example #4
def save_threads(youtube: YouTube,
                 da: DataAccess,
                 from_vid: str,
                 dry_run: bool = True):
    for video in da.gen_all_videos_in_order(from_vid):
        vid = video["id"]
        vtitle = video["snippet"]["title"]

        print()
        print(f"Processing {vtitle}...")
        if da.have_comments_for_video(vid):
            print(f'We\'ve already got comments for "{vtitle}".')
            print("Skipping...")
            continue

        if not dry_run:
            threads = youtube.get_comment_threads_for_video(vid)

            with open(os.path.join(ROOT_DIR, "db", "commentThreads",
                                   f"{vid}.json"),
                      mode="w") as f:
                f.write(json.dumps(threads))
        else:
            print("\t(Dry run)")

        print(f'Threads for "{vtitle}" saved.')
        print()
        print("------------------------------------------------------------")

        # Give a little delay between batches.
        # - DOS paranoia.
        sleep(1)
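da.have_comments_for_video presumably just checks whether the per-video JSON dump already exists on disk; a hedged sketch consistent with the write path used above (the real DataAccess method is not shown):

import os

def have_comments_for_video(self, vid: str) -> bool:
    # mirrors the path save_threads writes to
    path = os.path.join(ROOT_DIR, "db", "commentThreads", f"{vid}.json")
    return os.path.exists(path)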
Example #5
    def export_video_ids_json(self):
        da = DataAccess()
        videos = da.get_all_videos(sort=True)

        vids = [video["id"] for video in videos]

        with open(os.path.join(self.export_dir, "video_ids.txt"), mode="w") as f:
            f.write(json.dumps(vids, indent=2))
Example #6
def test_model_heading(model_id, driver_id, Model, get_data, repeat):
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()

    set1 = list(da.get_rides(driver_id))  # first half of the train set
    set2 = list(
        da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                            driver_id,
                            seed=seed))  # second half of the train set

    moving_average_window = 6 if get_data == get_data_heading_v2 else 3
    set1 = [heading.get_ride_heading(ride, variations=True, \
        moving_average_window=moving_average_window) for ride in set1]
    set2 = [heading.get_ride_heading(ride, variations=True, \
        moving_average_window=moving_average_window) for ride in set2]

    set1 = [[util.get_list_string(r) for r in four_pack] for four_pack in set1]
    set2 = [[util.get_list_string(r) for r in four_pack] for four_pack in set2]

    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15), max_df=1000000)
    vectorizer.fit([r[0] for r in set1])
    rides = [[vectorizer.transform([r])[0] for r in four_pack]
             for four_pack in set1]
    other_rides = [[vectorizer.transform([r])[0] for r in four_pack]
                   for four_pack in set2]
    other_rides = list(itertools.chain(*other_rides))

    rides = np.array(rides)

    trainY = [1] * settings.BIG_CHUNK_TEST * 4 * repeat + [
        0
    ] * settings.BIG_CHUNK_TEST * 4 * repeat
    kf = KFold(200,
               n_folds=settings.FOLDS,
               shuffle=True,
               random_state=driver_id)
    predictions = ['bug'] * 200  # sentinel values; every slot is overwritten by a fold prediction
    for train_fold, test_fold in kf:
        trainX = rides[train_fold]
        trainX = scipy.sparse.vstack(
            list(itertools.chain(*trainX)) * repeat + \
            other_rides
        )
        testX = scipy.sparse.vstack([r[0] for r in rides[test_fold]])

        assert (trainX.shape[0] == len(trainY))
        assert (testX.shape[0] == settings.SMALL_CHUNK_TEST)

        model = Model(trainX, trainY, driver_id)
        fold_predictions = model.predict(testX)
        for i, v in enumerate(test_fold):
            predictions[v] = fold_predictions[i]

    predictions = np.array(predictions)
    if settings.ENABLE_CACHE:
        util.cache_results(Model, get_data, driver_id, True, predictions,
                           repeat)
    return driver_id, predictions
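Note that KFold(200, n_folds=..., shuffle=..., random_state=...) is the pre-0.18 scikit-learn API, where the KFold object is itself iterable. On current scikit-learn the equivalent loop would look roughly like this (a sketch, not a drop-in patch for the snippet above):

import numpy as np
from sklearn.model_selection import KFold

kf = KFold(n_splits=settings.FOLDS, shuffle=True, random_state=driver_id)
for train_fold, test_fold in kf.split(np.arange(200)):
    pass  # same fold handling as above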
Example #7
    def export_video_ids_tsv(self):
        da = DataAccess()
        videos = da.get_all_videos(sort=True)
        with open(
            os.path.join(self.export_dir, "vids.tsv"), mode="w", encoding="utf-8"
        ) as f:
            f.write(f"video_id\tvideo_title\n")

            for video in videos:
                vtitle = video["snippet"]["title"]
                f.write(f"{video['id']}\t{vtitle}\n")
Example #8
def evaluate_iex_stocks():
    print('Started evaluating IEX stocks ...', flush=True)
    filename = 'sandp_top_250'
    os.environ["IEX_API_KEY"] = config.data_iex_api_key
    data_access = DataAccess(filename)
    global iex_stocks
    iex_stocks = evaluate_stocks('iex', f'{filename}.csv', data_access)
Example #9
def test_model_heading(model_id, driver_id, Model, get_data, repeat):
  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()

  set1 = list(da.get_rides(driver_id)) # first half of the train set
  set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed)) # second half of the train set

  moving_average_window = 6 if get_data == get_data_heading_v2 else 3
  set1 = [heading.get_ride_heading(ride, variations=True, \
      moving_average_window=moving_average_window) for ride in set1]
  set2 = [heading.get_ride_heading(ride, variations=True, \
      moving_average_window=moving_average_window) for ride in set2]

  set1 = [[util.get_list_string(r) for r in four_pack] for four_pack in set1]
  set2 = [[util.get_list_string(r) for r in four_pack] for four_pack in set2]

  vectorizer = CountVectorizer(min_df=2, ngram_range=(1,15), max_df=1000000)
  vectorizer.fit([r[0] for r in set1])
  rides = [[vectorizer.transform([r])[0] for r in four_pack] for four_pack in set1]
  other_rides = [[vectorizer.transform([r])[0] for r in four_pack] for four_pack in set2]
  other_rides = list(itertools.chain(*other_rides))

  rides = np.array(rides)

  trainY = [1] * settings.BIG_CHUNK_TEST * 4 * repeat + [0] * settings.BIG_CHUNK_TEST * 4 * repeat
  kf = KFold(200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id)
  predictions = ['bug'] * 200  # sentinel values; every slot is overwritten by a fold prediction
  for train_fold, test_fold in kf:
    trainX = rides[train_fold]
    trainX = scipy.sparse.vstack(
        list(itertools.chain(*trainX)) * repeat + \
        other_rides
    )
    testX = scipy.sparse.vstack([r[0] for r in rides[test_fold]])

    assert(trainX.shape[0] == len(trainY))
    assert(testX.shape[0] == settings.SMALL_CHUNK_TEST)

    model = Model(trainX, trainY, driver_id)
    fold_predictions = model.predict(testX)
    for i, v in enumerate(test_fold):
      predictions[v] = fold_predictions[i]

  predictions = np.array(predictions)
  if settings.ENABLE_CACHE:
    util.cache_results(Model, get_data, driver_id, True, predictions, repeat)
  return driver_id, predictions
Example #10
 def test_sum_expenses_two_rows(self):
     da = DataAccess(database=TEST_DB)
     da.insert(**TEST_ROW_1)
     da.insert(**TEST_ROW_2)
     sum_result = da.sum_expenses()
     da.close()
     self.assertAlmostEqual(sum_result, 26.99)
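Taken together with test_sum_expenses_one_row below (Example #37), the assertions pin TEST_ROW_1's amount to 12.75 and TEST_ROW_2's to 14.24; a hedged sketch of the fixtures, with field names borrowed from insert_expense in Example #25 (the actual test data is not shown):

TEST_ROW_1 = {
    'description': 'lunch',           # hypothetical
    'amount': 12.75,                  # fixed by the assertions
    'file_path': '/tmp/receipt1.png',
    'date': '2020-01-01',
}
TEST_ROW_2 = {
    'description': 'taxi',            # hypothetical
    'amount': 14.24,                  # 12.75 + 14.24 == 26.99
    'file_path': '/tmp/receipt2.png',
    'date': '2020-01-02',
}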
Example #11
def get_data_fft(model_id, driver_id, repeat, test=False, version=1):
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    if test:
        set1 = list(da.get_rides(driver_id))
        set2 = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                driver_id,
                                seed=seed))

    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK)
        other_train = list(
            da.get_random_rides(settings.BIG_CHUNK * repeat,
                                driver_id,
                                seed=seed))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

        set1 = driver_train + other_train
        set2 = driver_test + other_test

    if version == 1:
        set1 = [util.fft(ride) for ride in set1]
        set2 = [util.fft(ride) for ride in set2]
    else:
        set1 = [util.fft_strip(ride) for ride in set1]
        set2 = [util.fft_strip(ride) for ride in set2]

    return np.array(set1), np.array(set2)
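A plausible reading of util.fft, assuming it turns a ride's (x, y) trace into a fixed-length magnitude spectrum of the per-second speeds (hypothetical; the real helper is not shown, and fft_strip presumably differs only in pre-processing):

import numpy as np

def fft(ride, n_coeffs=50):
    # hypothetical reconstruction of util.fft
    pts = np.asarray(ride, dtype=float)
    # per-second speed from consecutive GPS points
    speeds = np.hypot(np.diff(pts[:, 0]), np.diff(pts[:, 1]))
    spectrum = np.abs(np.fft.rfft(speeds))
    # fixed-length feature vector: truncate or zero-pad
    out = np.zeros(n_coeffs)
    n = min(n_coeffs, len(spectrum))
    out[:n] = spectrum[:n]
    return out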
Example #12
def get_data_g_forces_v6(model_id, driver_id, repeat, test=False, version=1):
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    if test:
        set1 = list(da.get_rides(driver_id))  # first half of the train set
        set2 = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                driver_id,
                                seed=seed))  # second half of the train set
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK)
        other_train = list(
            da.get_random_rides(settings.BIG_CHUNK * repeat,
                                driver_id,
                                seed=seed))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

        set1 = driver_train + other_train  # used for training
        set2 = driver_test + other_test  # used for testing

    set1 = [util.get_g_forces_v4(ride, version=version) for ride in set1]
    set2 = [util.get_g_forces_v4(ride, version=version) for ride in set2]

    vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 20))
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)

    return set1, set2
Example #13
def get_data_basic_accel(model_id, driver_id, repeat, test=False, version=1):
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    if test:
        set1 = list(da.get_rides(driver_id))
        set2 = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                driver_id,
                                seed=seed))

    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK)
        other_train = list(
            da.get_random_rides(settings.BIG_CHUNK * repeat,
                                driver_id,
                                seed=seed))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

        set1 = driver_train + other_train
        set2 = driver_test + other_test

    set1 = [util.build_features_acc(ride, version=version) for ride in set1]
    set2 = [util.build_features_acc(ride, version=version) for ride in set2]
    return np.array(set1), np.array(set2)
Example #14
def get_data_acc4acc(model_id, driver_id, repeat, test=False, version=1):
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    if test:
        set1 = list(da.get_rides(driver_id))
        set2 = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                driver_id,
                                seed=seed))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK)
        other_train = list(
            da.get_random_rides(settings.BIG_CHUNK * repeat,
                                driver_id,
                                seed=seed))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

        set1 = driver_train + other_train
        set2 = driver_test + other_test

    set1 = [
        util.get_acc4acc_words(ride, step=3, version=version) for ride in set1
    ]
    set2 = [
        util.get_acc4acc_words(ride, step=3, version=version) for ride in set2
    ]

    max_ngram = 15 if version == 1 else 20
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1, max_ngram))
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)

    return set1, set2
Example #15
def del_user():
    if request.method == 'GET':
        uuid = session['user']['user_id']
        if DA.delete_user_from_uuid(uuid):
            session.pop('user', None)
            return redirect(url_for('auth.login'))
        else:
            flash("Errore nella cancellazione del profilo!")
            # fall back to the profile page so the view never returns None
            return redirect(url_for('main.profilo'))
Example #16
    def get_videos_for_pitems(self, pitems: List) -> List:
        print("Requesting videos for playlist items.")

        vids: List[str] = [
            pitem["contentDetails"]["videoId"] for pitem in pitems
        ]
        data = []

        # Filter out videos we already have.
        da = DataAccess()
        vids = [vid for vid in vids if not da.have_video(vid)]

        for items in gen_resources_for_ids(
                self.youtube.videos,
                vids,
                part="snippet,statistics",
        ):
            data += items

        return data
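gen_resources_for_ids presumably batches the id list so each request stays within the YouTube Data API's 50-id-per-request limit; a hedged sketch (the real helper and the YouTube wrapper are not shown):

def gen_resources_for_ids(endpoint, ids, part, batch_size=50):
    # hypothetical reconstruction; yields one list of resources per batch
    for i in range(0, len(ids), batch_size):
        batch = ids[i:i + batch_size]
        response = endpoint().list(part=part, id=",".join(batch)).execute()
        yield response.get("items", [])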
Example #17
File: bow.py Project: Keesiu/meta-kaggle
def segment_driver(driver_id):
    ''' this generates the segments in settings.SEGMENTS_FOLDER[1] '''
    da = DataAccess()
    for ride_id_minus_1, ride in enumerate(da.get_rides(driver_id)):
        ride_id = ride_id_minus_1 + 1
        if da.skip_segment(driver_id, ride_id):
            continue

        # apply the Ramer-Douglas-Peucker algorithm
        ride = [p + [i]
                for i, p in enumerate(smoothen(ride))]  # enrich with timestamp
        ride = rdp(ride, epsilon=10)

        lengths = [
            util.euclidian_distance(ride[i - 1], ride[i])
            for i in xrange(1, len(ride))
        ]
        times = [ride[i][2] - ride[i - 1][2] for i in xrange(1, len(ride))]
        angles = [
            util.get_angle(ride[i - 2], ride[i - 1], ride[i])
            for i in xrange(2, len(ride))
        ]

        # bucket the values
        lengths = util.bucket(np.log(lengths), 25,
                              [2.2, 8])  # [int(l) for l in lengths]
        times = util.bucket(np.log(times), 20,
                            [1, 5.5])  # [int(t) for t in times]
        angles = util.bucket(angles, 30, [0, 180])  # [int(a) for a in angles]

        # write results
        da.write_ride_segments(driver_id, ride_id, lengths, times, angles)

    logging.info('finished segmenting driver %s' % driver_id)
Example #18
def get_data_segment_angles(model_id,
                            driver_id,
                            repeat,
                            test=False,
                            segment_version=1,
                            extra=((1, 1), 2)):
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra

    if test:
        set1 = list(da.get_rides_segments(driver_id, version=segment_version))
        set2 = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                driver_id,
                                segments=True,
                                version=segment_version,
                                seed=seed))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK,
                                                       segments=True,
                                                       version=segment_version)
        other_train = list(
            da.get_random_rides(settings.BIG_CHUNK * repeat,
                                driver_id,
                                segments=True,
                                version=segment_version,
                                seed=seed))
        other_test = list(
            da.get_random_rides(settings.SMALL_CHUNK,
                                driver_id,
                                segments=True,
                                version=segment_version))

        set1 = driver_train + other_train
        set2 = driver_test + other_test

    # create features for each (segment, angle, segment) tuple
    set1 = [[
        '%s_%s_%s' % (d[0][i - 1], d[1][i - 1], d[0][i])
        for i in xrange(1, len(d[0]))
    ] for d in set1]
    set2 = [[
        '%s_%s_%s' % (d[0][i - 1], d[1][i - 1], d[0][i])
        for i in xrange(1, len(d[0]))
    ] for d in set2]

    set1 = [util.get_list_string(d) for d in set1]
    set2 = [util.get_list_string(d) for d in set2]

    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
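Each feature token above encodes a (segment length, turn angle, next segment length) triple; a tiny worked example of the token construction:

d = ([3, 5, 2], [12, 20])  # (bucketed lengths, bucketed angles)
tokens = ['%s_%s_%s' % (d[0][i - 1], d[1][i - 1], d[0][i])
          for i in range(1, len(d[0]))]
print(tokens)  # ['3_12_5', '5_20_2']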
Example #19
def get_data_g_forces_v1(model_id, driver_id, repeat, test=False, min_df=1, ngram_range=(1,10), digitize=0):
  def process(ride, digitize):
    g_forces = util.get_g_forces(ride)
    if digitize:
      g_forces = np.digitize(g_forces, range(0, 800, digitize))
    return util.get_list_string(g_forces)

  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()
  if test:
    set1 = list(da.get_rides(driver_id)) # first half of the train set
    set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed)) # second half of the train set
  else:
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    set1 = driver_train + other_train # used for training
    set2 = driver_test + other_test # used for testing

  set1 = [process(ride, digitize) for ride in set1]
  set2 = [process(ride, digitize) for ride in set2]

  vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
  set1 = vectorizer.fit_transform(set1)
  set2 = vectorizer.transform(set2)

  return set1, set2
Example #20
def get_data_movements_accel(model_id, driver_id, repeat, test=False, step=3, tf=False, extra=((1,15),2), version=1):
  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()
  ngram_range, min_df = extra

  if test:
    set1 = list(da.get_rides(driver_id))
    set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, segments=False, seed=seed))
  else:
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK, segments=False)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, segments=False, seed=seed))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id, segments=False))

    set1 = driver_train + other_train
    set2 = driver_test + other_test

  set1 = [util.build_features4(r, step=step, version=version) for r in set1]
  set2 = [util.build_features4(r, step=step, version=version) for r in set2]

  if tf:
    vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
  else:
    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)

  set1 = vectorizer.fit_transform(set1)
  set2 = vectorizer.transform(set2)

  return set1, set2
Example #21
def main():
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    # Get the API key as a CLI arg.
    api_key = sys.argv[1]
    if not api_key:
        raise Exception("No API key provided.")

    # Get credentials and create an API client
    youtube = YouTube(api_key)

    # Do stuff.
    da = DataAccess()

    pitems_dict = da.get_pitems_dict(OTHER_PLAYLIST_IDS)

    current_vid = "lM28rfsHge0"
    # save_threads(youtube, da, from_vid=current_vid, dry_run=False)
    # save_all_playlist_items(youtube, OTHER_PLAYLIST_IDS, dry_run=False)
    save_all_videos(youtube, pitems_dict, dry_run=False)
Example #22
def segment_driver_v2(driver_id):
  ''' this generates the segments in settings.SEGMENTS_FOLDER[2] '''
  da = DataAccess()
  for ride_id_minus_1, ride in enumerate(da.get_rides(driver_id)):
    ride_id = ride_id_minus_1 + 1
    if da.skip_segment(driver_id, ride_id, version=2):
      continue

    # apply the Ramer-Douglas-Peucker algorithm
    ride = [p + [i]  for i, p in enumerate(ride)] # enrich with timestamp
    ride = rdp(ride, epsilon=4)

    lengths = [util.euclidian_distance(ride[i-1], ride[i]) for i in xrange(1, len(ride))]
    times = [ride[i][2] - ride[i-1][2] for i in xrange(1, len(ride))]
    angles = [util.get_angle(ride[i-2], ride[i-1], ride[i]) for i in xrange(2, len(ride))]

    lengths = np.histogram(lengths, bins=range(0, 700, 20) + [1000000000])[0]
    times = np.histogram(times, bins=range(0, 60, 4) + [1000000000])[0]
    angles = np.histogram(angles, bins=range(0, 181, 20))[0]

    # write results
    da.write_ride_segments(driver_id, ride_id, lengths, times, angles, version=2)

  logging.info('finished segmenting driver %s' % driver_id)
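The histograms above turn variable-length segment statistics into fixed-length count vectors; for instance, the angle counts over 20-degree bins:

import numpy as np

angles = [5, 12, 88, 91, 179]
counts = np.histogram(angles, bins=range(0, 181, 20))[0]
print(counts)  # [2 0 0 0 2 0 0 0 1]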
Example #23
def get_data_segment_lengths(model_id,
                             driver_id,
                             repeat,
                             test=False,
                             segment_version=1,
                             extra=((1, 8), 1)):
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra

    if test:
        set1 = list(da.get_rides_segments(driver_id, version=segment_version))
        set2 = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                driver_id,
                                segments=True,
                                version=segment_version,
                                seed=seed))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK,
                                                       segments=True,
                                                       version=segment_version)
        other_train = list(
            da.get_random_rides(settings.BIG_CHUNK * repeat,
                                driver_id,
                                segments=True,
                                version=segment_version,
                                seed=seed))
        other_test = list(
            da.get_random_rides(settings.SMALL_CHUNK,
                                driver_id,
                                segments=True,
                                version=segment_version))

        set1 = driver_train + other_train
        set2 = driver_test + other_test

    # keep only lengths
    set1 = [d[0] for d in set1]
    set2 = [d[0] for d in set2]

    # convert to text
    set1 = [util.get_list_string(d) for d in set1]
    set2 = [util.get_list_string(d) for d in set2]

    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
Example #24
def get_data_movements_v1(model_id,
                          driver_id,
                          repeat,
                          test=False,
                          step=5,
                          tf=False,
                          version=1,
                          extra=((1, 5), 2)):
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra

    if test:
        set1 = list(da.get_rides(driver_id))
        set2 = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                driver_id,
                                segments=False,
                                seed=seed))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK,
                                                       segments=False)
        other_train = list(
            da.get_random_rides(settings.BIG_CHUNK * repeat,
                                driver_id,
                                segments=False,
                                seed=seed))
        other_test = list(
            da.get_random_rides(settings.SMALL_CHUNK,
                                driver_id,
                                segments=False))

        set1 = driver_train + other_train
        set2 = driver_test + other_test

    # keep only lengths and convert to text
    set1 = [util.build_features3(r, step=step, version=version) for r in set1]
    set2 = [util.build_features3(r, step=step, version=version) for r in set2]

    if tf:
        vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    else:
        vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)

    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)

    return set1, set2
Example #25
 def insert_expense(self):
     insert_data_access = DataAccess()
     insert_dict = {
         'description': self.description.get(),
         'amount': self.amount.get(),
         'file_path': self.receipt.get(),
         'date': self.date.get(),
     }
     insert_data_access.insert(**insert_dict)
     insert_data_access.close()
     self.sum += float(self.amount.get())
     self.sum_label.set(format_sum_string(self.sum))
     self.add_window.destroy()
Example #26
def get_data_segment_lengths(model_id, driver_id, repeat, test=False, segment_version=1, extra=((1,8),1)):
  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()
  ngram_range, min_df = extra

  if test:
    set1 = list(da.get_rides_segments(driver_id, version=segment_version))
    set2 = list(da.get_random_rides(
        settings.BIG_CHUNK_TEST * repeat,
        driver_id,
        segments=True,
        version=segment_version,
        seed=seed
    ))
  else:
    driver_train, driver_test = da.get_rides_split(
        driver_id,
        settings.BIG_CHUNK,
        segments=True,
        version=segment_version
    )
    other_train = list(da.get_random_rides(
        settings.BIG_CHUNK * repeat,
        driver_id,
        segments=True,
        version=segment_version,
        seed=seed
    ))
    other_test = list(da.get_random_rides(
        settings.SMALL_CHUNK,
        driver_id,
        segments=True,
        version=segment_version
    ))

    set1 = driver_train + other_train
    set2 = driver_test + other_test

  # keep only lengths
  set1 = [d[0] for d in set1]
  set2 = [d[0] for d in set2]

  # convert to text
  set1 = [util.get_list_string(d) for d in set1]
  set2 = [util.get_list_string(d) for d in set2]

  vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
  set1 = vectorizer.fit_transform(set1)
  set2 = vectorizer.transform(set2)
  return set1, set2
Example #27
def get_data_segment_angles_v2(model_id, driver_id, repeat, test=False, segment_version=1, extra=((1,3),1)):
  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()
  ngram_range, min_df = extra

  if test:
    set1 = list(da.get_rides_segments(driver_id, version=segment_version))
    set2 = list(da.get_random_rides(
        settings.BIG_CHUNK_TEST * repeat,
        driver_id,
        segments=True,
        version=segment_version,
        seed=seed
    ))
  else:
    driver_train, driver_test = da.get_rides_split(
        driver_id,
        settings.BIG_CHUNK,
        segments=True,
        version=segment_version
    )
    other_train = list(da.get_random_rides(
        settings.BIG_CHUNK * repeat,
        driver_id,
        segments=True,
        version=segment_version,
        seed=seed
    ))
    other_test = list(da.get_random_rides(
        settings.SMALL_CHUNK,
        driver_id,
        segments=True,
        version=segment_version
    ))

    set1 = driver_train + other_train
    set2 = driver_test + other_test

  # create features for each (segment, angle, segment) tuple
  set1 = [['%s_%s' % (d[0][i-1], d[1][i-1]) for i in xrange(1, len(d[0]))] for d in set1]
  set2 = [['%s_%s' % (d[0][i-1], d[1][i-1]) for i in xrange(1, len(d[0]))] for d in set2]

  set1 = [util.get_list_string(d) for d in set1]
  set2 = [util.get_list_string(d) for d in set2]

  vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
  set1 = vectorizer.fit_transform(set1)
  set2 = vectorizer.transform(set2)
  return set1, set2
Example #28
def get_data_g_forces_v1(model_id,
                         driver_id,
                         repeat,
                         test=False,
                         min_df=1,
                         ngram_range=(1, 10),
                         digitize=0):
    def process(ride, digitize):
        g_forces = util.get_g_forces(ride)
        if digitize:
            g_forces = np.digitize(g_forces, range(0, 800, digitize))
        return util.get_list_string(g_forces)

    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    if test:
        set1 = list(da.get_rides(driver_id))  # first half of the train set
        set2 = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                driver_id,
                                seed=seed))  # second half of the train set
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK)
        other_train = list(
            da.get_random_rides(settings.BIG_CHUNK * repeat,
                                driver_id,
                                seed=seed))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

        set1 = driver_train + other_train  # used for training
        set2 = driver_test + other_test  # used for testing

    set1 = [process(ride, digitize) for ride in set1]
    set2 = [process(ride, digitize) for ride in set2]

    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)

    return set1, set2
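The digitize branch above coarsens raw g-force values into fixed-width bins; for example, with digitize=50 each value maps to the index of its 50-unit bin:

import numpy as np

g_forces = [12, 49, 50, 430, 799]
print(np.digitize(g_forces, range(0, 800, 50)))
# -> [ 1  1  2  9 16]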
Example #29
def get_data_basic_accel(model_id, driver_id, repeat, test=False, version=1):
  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()
  if test:
    set1 = list(da.get_rides(driver_id))
    set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed))

  else:
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    set1 = driver_train + other_train
    set2 = driver_test + other_test

  set1 = [util.build_features_acc(ride, version=version) for ride in set1]
  set2 = [util.build_features_acc(ride, version=version) for ride in set2]
  return np.array(set1), np.array(set2)
Example #30
def get_data_g_forces_v6(model_id, driver_id, repeat, test=False, version=1):
  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()
  if test:
    set1 = list(da.get_rides(driver_id)) # first half of the train set
    set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed)) # second half of the train set
  else:
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    set1 = driver_train + other_train # used for training
    set2 = driver_test + other_test # used for testing

  set1 = [util.get_g_forces_v4(ride, version=version) for ride in set1]
  set2 = [util.get_g_forces_v4(ride, version=version) for ride in set2]

  vectorizer = CountVectorizer(min_df=1, ngram_range=(1,20))
  set1 = vectorizer.fit_transform(set1)
  set2 = vectorizer.transform(set2)

  return set1, set2
Example #31
def get_data_dist_acc(model_id, driver_id, repeat, test=False):
  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()
  if test:
    set1 = list(da.get_rides(driver_id))
    set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed))
  else:
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    set1 = driver_train + other_train
    set2 = driver_test + other_test

  set1 = [util.get_distance_acc_words(ride, step=3) for ride in set1]
  set2 = [util.get_distance_acc_words(ride, step=3) for ride in set2]

  vectorizer = CountVectorizer(min_df=1, ngram_range=(1,15))
  set1 = vectorizer.fit_transform(set1)
  set2 = vectorizer.transform(set2)

  return set1, set2
Example #32
def get_data_fft(model_id, driver_id, repeat, test=False, version=1):
  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()
  if test:
    set1 = list(da.get_rides(driver_id))
    set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed))

  else:
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    set1 = driver_train + other_train
    set2 = driver_test + other_test

  if version == 1:
    set1 = [util.fft(ride) for ride in set1]
    set2 = [util.fft(ride) for ride in set2]
  else:
    set1 = [util.fft_strip(ride) for ride in set1]
    set2 = [util.fft_strip(ride) for ride in set2]

  return np.array(set1), np.array(set2)
Example #33
File: bow.py Project: Keesiu/meta-kaggle
def segment_driver_v2(driver_id):
    ''' this generates the segments in settings.SEGMENTS_FOLDER[2] '''
    da = DataAccess()
    for ride_id_minus_1, ride in enumerate(da.get_rides(driver_id)):
        ride_id = ride_id_minus_1 + 1
        if da.skip_segment(driver_id, ride_id, version=2):
            continue

        # apply the Ramer-Douglas-Peucker algorithm
        ride = [p + [i] for i, p in enumerate(ride)]  # enrich with timestamp
        ride = rdp(ride, epsilon=4)

        lengths = [
            util.euclidian_distance(ride[i - 1], ride[i])
            for i in range(1, len(ride))
        ]
        times = [ride[i][2] - ride[i - 1][2] for i in range(1, len(ride))]
        angles = [
            util.get_angle(ride[i - 2], ride[i - 1], ride[i])
            for i in range(2, len(ride))
        ]

        lengths = np.histogram(lengths,
                               bins=list(range(0, 700, 20)) + [1000000000])[0]
        times = np.histogram(times,
                             bins=list(range(0, 60, 4)) + [1000000000])[0]
        angles = np.histogram(angles, bins=list(range(0, 181, 20)))[0]

        # write results
        da.write_ride_segments(driver_id,
                               ride_id,
                               lengths,
                               times,
                               angles,
                               version=2)

    logging.info('finished segmenting driver %s' % driver_id)
Example #34
def mod_targa():
    targa = request.form['targa']
    uuid = session['user']['user_id']
    if not DA.change_targa(uuid, targa):
        flash("Errore in modifica targa!")
    return redirect(url_for('main.profilo'))
Example #35
 def test_insert_extra_fields(self):
     da = DataAccess(database=TEST_DB)
     da.insert(**TEST_ROW_3)
     self.assertEqual(_get_row_count(), 1)
Example #36
 def test_insert_no_kwargs_raises_InvalidDataError(self):
     da = DataAccess(database=TEST_DB)
     self.assertRaises(InvalidDataError, da.insert)
Example #37
 def test_sum_expenses_one_row(self):
     da = DataAccess(database=TEST_DB)
     da.insert(**TEST_ROW_1)
     sum_result = da.sum_expenses()
     da.close()
     self.assertEqual(sum_result, 12.75)
Example #38
 def __init__(self):
     connection_string = SvcConfig.db_connection_str
     data_access = DataAccess(connection_string)
     self.data_access = data_access
Example #39
    def __init__(self, db, model_table_name, model_table, hh_table_name, hh_table, pp_table_name, pp_table, 
                 land_table_name, land_table, business_sector_table_name, business_sector_table, 
                 policy_table_name, policy_table, stat_table_name, stat_table, start_year):
        '''
        Initialize the society class;
        '''
        
        # Set the start year and current time stamps
        self.start_year = start_year
        self.current_year = start_year
        
        # Create a dictionary of model parameters, indexed by Variable_Name with Variable_Value as the content
        self.model_parameters_dict = dict()        
        # Fill in the model parameters dictionary from the model table (fetched from DB)
        for record in model_table:
            self.model_parameters_dict[record.Variable_Name] = record.Variable_Value
          
        
        # Get the variable lists for household, person, land, business sector, policy, and statistics classes 
        self.hh_var_list = DataAccess.get_var_list(db, hh_table_name)
        self.pp_var_list = DataAccess.get_var_list(db, pp_table_name)
        self.land_var_list = DataAccess.get_var_list(db, land_table_name)
        self.business_sector_var_list = DataAccess.get_var_list(db, business_sector_table_name)
        self.policy_var_list = DataAccess.get_var_list(db, policy_table_name)
        self.stat_var_list = DataAccess.get_var_list(db, stat_table_name)


        # Initialize the land instances, and create a land dictionary to store all the land parcels
        # Note that the land parcels here do not necessarily belong to any household. i.e. land_parcel.HID could be "None".
        self.land_dict = dict()
        for land in land_table:
            land_parcel = Land(land, self.land_var_list, self.current_year)
            self.land_dict[land_parcel.ParcelID] = land_parcel # Indexed by ParcelID


        # Initialize the household instances (household capital property and land class instances are initialized at the initialization of household class);
        # And create a dictionary to store them, indexed by HID.
        self.hh_dict = dict()        
        # Add household instances to hh_dict
        for hh in hh_table:
            hh_temp = Household(hh, self.hh_var_list, self.current_year, db, pp_table_name, pp_table, 
                                self.land_dict, self.model_parameters_dict)
            self.hh_dict[hh_temp.HID] = hh_temp # Indexed by HID

        
        # Initialize the business sector instances;
        # And create a dictionary to store them, indexed by sector name.
        self.business_sector_dict = dict()
        for sector in business_sector_table:
            sector_temp = BusinessSector(sector, self.business_sector_var_list)
            self.business_sector_dict[sector_temp.SectorName] = sector_temp # Indexed by SectorName
            
        
        # Initialize the policy program instances;
        # And create a dictionary to store them, indexed by policy program type.
        self.policy_dict = dict()
        for program in policy_table:
            program_temp = Policy(program, self.policy_var_list)
            self.policy_dict[program_temp.PolicyType] = program_temp # Indexed by PolicyType
        
                        
        # Create a statistics dictionary, indexed by variable names. To be filled later in the Statistics class.
        self.stat_dict = dict()
        
        # Create some variables to record the confiscated (due to ownerlessness) money and land
        self.ownerless_land = list()
        self.ownerless_money = 0
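For orientation, a hedged sketch of what DataAccess.get_var_list presumably returns, given that Household.__init__ below iterates the result as (field_name, column_index) pairs (the real DataAccess is not shown):

def get_var_list(db, table_name):
    # hypothetical reconstruction: one (field_name, column_index) pair per column
    cursor = db.cursor()
    cursor.execute('SELECT * FROM %s WHERE 1 = 0' % table_name)
    return [(col[0], i) for i, col in enumerate(cursor.description)]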
Example #40
def evaluate_sandp500_stocks():
    print('Started evaluating STOOQ.com stocks ...', flush=True)
    data_access = DataAccess('sandp500')
    global sandp500_stocks
    sandp500_stocks = evaluate_stocks('stooq', 'sandp500.csv', data_access)
Example #41
def evaluate_fse_stocks():
    print('Started evaluating QUANDL stocks ...', flush=True)
    data_access = DataAccess('quandl_fse_stocks')
    global fse_stocks
    fse_stocks = evaluate_stocks('quandl', 'quandl_fse_stocks.csv',
                                 data_access)
Example #42
def get_sum():
    sum_data_access = DataAccess()
    sum_result = sum_data_access.sum_expenses()
    sum_data_access.close()
    return sum_result
Example #43
    def __init__(self, record, VarList, current_year, db, pp_table_name, pp_table, land_dict, model_parameters):
        '''
        Construct the household class from the household table in the DB, and then add some other user-defined attributes.

        record - a record in the household table in the DB.       
        VarList - the variable (or field) list of the household table in the DB,
        a list of (paramName, paramOrder) pairs, e.g. [(paramName1, paramOrder1), (paramName2, paramOrder2), ...]
        
        Also initialize the household's own person instances here from the person table in the DB.
        Also initialize the household's capital properties instance here.     
        '''       
        
        # Set the attributes (var) and their values (record) from the household table in the DB.
        for var in VarList:
            setattr(self, var[0], record[var[1]])

        # Set the household and person variables (attributes) lists
        self.hh_var_list = VarList
        self.pp_var_list = DataAccess.get_var_list(db, pp_table_name)


        # Set the current time stamp
        self.StatDate = current_year
        
        
        # Define own persons (members) dict of the household, indexed by PID
        self.own_pp_dict = dict()

        
        # Add respective persons into the persons dict of the household
        for pp in pp_table:
            if pp.HID == self.HID:
                pp_temp = Person(pp, self.pp_var_list, current_year)
                self.own_pp_dict[pp_temp.PID] = pp_temp # Indexed by PID
        

        
        # Initialize the household's capital properties class instance
        self.own_capital_properties = CapitalProperty(self, land_dict, model_parameters)
        
        # Define an empty list of available business sectors,
        # and another empty list of the business sectors that the household is in during the current year
        self.own_av_business_sectors = list()
        self.own_current_sectors = list()
        
        # Define an empty list of participant policy programs
        self.own_policy_programs = list()


        # Initialize the household's energy class instance
        self.energy = Energy()
        
        
        # Define a variable indicating the household's preference type
        '''
        1 - Max Labor, Min Risk;
        2 - Min Labor, Min Risk;
        3 - Max Labor, Max Risk;
        4 - Min Labor, Max Risk;
        '''
        self.hh_preference_type = int()
        
        # Also define a variable indicating the preference toward risks
        '''
        True - risk aversion; False - risk appetite;
        '''
        self.hh_risk_type = True
        
        
        # Define a variable indicating the household's business type
        '''
        0 - agriculture only
        1 - agriculture plus one other business sector; or a single business sector that is not agriculture
        2 - agriculture plus more than one other business sector
        '''
        self.business_type = int()
        
        # Define a switch variable indicating whether the household is dissolved in the current year
        self.is_dissolved_this_year = False
def init_db():
    try:
        global db_client
        db_client = DataAccess.get_instance().get_db_client()
    except Exception as e:
        logging.critical("Problem occured while connecting db. {}".format(e))