def segment_driver(driver_id):
    '''Generate the v1 ride segments stored in settings.SEGMENTS_FOLDER[1].

    For every ride of `driver_id`: smooth the trace, simplify it with the
    Ramer-Douglas-Peucker algorithm (epsilon=10), derive per-segment
    lengths / durations / turn angles, and bucket them into coarse integer
    bins before persisting.
    '''
    da = DataAccess()
    for ride_id_minus_1, ride in enumerate(da.get_rides(driver_id)):
        # ride ids are 1-based on disk, enumerate() is 0-based
        ride_id = ride_id_minus_1 + 1
        if da.skip_segment(driver_id, ride_id):
            continue  # already segmented
        # apply the Ramer-Douglas-Peucker algorithm
        ride = [p + [i] for i, p in enumerate(smoothen(ride))]  # enrich with timestamp
        ride = rdp(ride, epsilon=10)
        # distances/durations between consecutive RDP points, and the angle
        # at each interior point
        lengths = [util.euclidian_distance(ride[i-1], ride[i]) for i in xrange(1, len(ride))]
        times = [ride[i][2] - ride[i-1][2] for i in xrange(1, len(ride))]
        angles = [util.get_angle(ride[i-2], ride[i-1], ride[i]) for i in xrange(2, len(ride))]
        # bucket the values (log scale for the heavy-tailed length/time dists)
        lengths = util.bucket(np.log(lengths), 25, [2.2, 8])  # [int(l) for l in lengths]
        times = util.bucket(np.log(times), 20, [1, 5.5])  # [int(t) for t in times]
        angles = util.bucket(angles, 30, [0, 180])  # [int(a) for a in angles]
        # write results
        da.write_ride_segments(driver_id, ride_id, lengths, times, angles)
    logging.info('finished segmenting driver %s' % driver_id)
def get_data_heading(model_id, driver_id, repeat, test=False, moving_average_window=3, stops=False, version=1):
    """Build heading-n-gram train/test count matrices for one driver.

    Returns (set1, set2): set1 mixes the driver's training rides (with
    heading variations, flattened) with seeded random other-driver rides;
    set2 mixes the held-out driver rides with other-driver rides.

    Raises:
        NotImplementedError: in test mode, which was never implemented for
            this extractor (the original raised a bare ``Exception``; the
            replacement is still an ``Exception`` subclass, so existing
            ``except Exception`` callers are unaffected).
    """
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    if test:
        raise NotImplementedError('test mode is not supported by get_data_heading')
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
    # NOTE(review): this sample is unseeded, unlike other_train — confirm intended
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
    set1 = driver_train + other_train  # used for training
    set2 = driver_test + other_test  # used for testing
    # training rides get rotation variations (a multi-pack per ride)
    set1 = [heading.get_ride_heading(ride, variations=True,
                                     moving_average_window=moving_average_window,
                                     stops=stops, version=version)
            for ride in set1]
    set2 = [util.get_list_string(heading.get_ride_heading(ride,
                                                          moving_average_window=moving_average_window,
                                                          stops=stops, version=version))
            for ride in set2]
    set1 = list(itertools.chain(*set1))  # flatten the variation packs
    set1 = [util.get_list_string(r) for r in set1]
    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15), max_df=1000000)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
def get_data_heading(model_id, driver_id, repeat, test=False, moving_average_window=3, stops=False, version=1):
    """Build heading-n-gram train/test count matrices for one driver.

    set1 mixes the driver's training rides (with heading variations,
    flattened) with seeded random other-driver rides; set2 mixes the
    held-out driver rides with other-driver rides.

    Raises a bare Exception in test mode — test mode was never implemented
    for this extractor.
    """
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    if test:
        raise Exception
    driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
    other_train = list(
        da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
    # NOTE(review): unseeded sample, unlike other_train — confirm intended
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
    set1 = driver_train + other_train  # used for training
    set2 = driver_test + other_test  # used for testing
    # training rides get variations (a multi-pack per ride)
    set1 = [heading.get_ride_heading(ride, variations=True, \
        moving_average_window=moving_average_window, stops=stops,
        version=version) for ride in set1]
    set2 = [util.get_list_string(heading.get_ride_heading(ride, \
        moving_average_window=moving_average_window, stops=stops,
        version=version)) for ride in set2]
    set1 = list(itertools.chain(*set1))  # flatten the variation packs
    set1 = [util.get_list_string(r) for r in set1]
    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15), max_df=1000000)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
def save_threads(youtube: YouTube, da: DataAccess, from_vid: str, dry_run: bool = True):
    """Fetch and persist comment threads for every video from `from_vid` on.

    Videos whose comments are already on disk are skipped. With
    dry_run=True nothing is fetched or written — progress is only printed.
    A one-second pause between videos keeps the request rate polite.
    """
    for entry in da.gen_all_videos_in_order(from_vid):
        video_id = entry["id"]
        title = entry["snippet"]["title"]
        print()
        print(f"Processing {title}...")

        if da.have_comments_for_video(video_id):
            print(f'We\'ve already got comments for "{title}".')
            print("Skipping...")
            continue

        if dry_run:
            print("\t(Dry run)")
        else:
            payload = youtube.get_comment_threads_for_video(video_id)
            target = os.path.join(ROOT_DIR, "db", "commentThreads", f"{video_id}.json")
            with open(target, mode="w") as out:
                out.write(json.dumps(payload))

        print(f'Threads for "{title}" saved.')
        print()
        print("------------------------------------------------------------")
        # Give a little delay between batches.
        # - DOS paranoia.
        sleep(1)
def export_video_ids_json(self):
    """Dump the ids of all (sorted) videos as a JSON array under export_dir.

    NOTE(review): the payload is JSON but the file is named
    "video_ids.txt" — confirm the .txt extension is intended.
    """
    da = DataAccess()
    videos = da.get_all_videos(sort=True)
    vids = [video["id"] for video in videos]
    with open(os.path.join(self.export_dir, "video_ids.txt"), mode="w") as f:
        f.write(json.dumps(vids, indent=2))
def test_model_heading(model_id, driver_id, Model, get_data, repeat):
    """Cross-validate `Model` on heading n-grams for one driver.

    Builds a vocabulary from the first heading variation of each of the
    driver's rides, then runs settings.FOLDS-fold CV over the 200 rides:
    each fold trains on the remaining driver rides (all 4 variations,
    repeated `repeat` times, label 1) plus random other-driver rides
    (label 0), and predicts on the held-out rides.

    Returns (driver_id, per-ride predictions array).
    """
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    set1 = list(da.get_rides(driver_id))  # first half of the train set
    set2 = list(
        da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                            driver_id,
                            seed=seed))  # second half of the train set
    # get_data_heading_v2 was tuned with a wider smoothing window
    moving_average_window = 6 if get_data == get_data_heading_v2 else 3
    # each ride becomes a "four pack" of heading variations
    set1 = [heading.get_ride_heading(ride, variations=True, \
        moving_average_window=moving_average_window) for ride in set1]
    set2 = [heading.get_ride_heading(ride, variations=True, \
        moving_average_window=moving_average_window) for ride in set2]
    set1 = [[util.get_list_string(r) for r in four_pack] for four_pack in set1]
    set2 = [[util.get_list_string(r) for r in four_pack] for four_pack in set2]
    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15), max_df=1000000)
    # fit the vocabulary on the first variation of each driver ride only
    vectorizer.fit([r[0] for r in set1])
    rides = [[vectorizer.transform([r])[0] for r in four_pack]
             for four_pack in set1]
    other_rides = [[vectorizer.transform([r])[0] for r in four_pack]
                   for four_pack in set2]
    other_rides = list(itertools.chain(*other_rides))
    rides = np.array(rides)
    trainY = [1] * settings.BIG_CHUNK_TEST * 4 * repeat + [
        0
    ] * settings.BIG_CHUNK_TEST * 4 * repeat
    kf = KFold(200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id)
    # sentinel: any 'bug' left after the folds means a ride was never scored
    predictions = ['bug'] * 200
    for train_fold, test_fold in kf:
        trainX = rides[train_fold]
        trainX = scipy.sparse.vstack(
            list(itertools.chain(*trainX)) * repeat + \
            other_rides
        )
        # only the first (un-rotated) variation is scored
        testX = scipy.sparse.vstack([r[0] for r in rides[test_fold]])
        assert (trainX.shape[0] == len(trainY))
        assert (testX.shape[0] == settings.SMALL_CHUNK_TEST)
        model = Model(trainX, trainY, driver_id)
        fold_predictions = model.predict(testX)
        for i, v in enumerate(test_fold):
            predictions[v] = fold_predictions[i]
    predictions = np.array(predictions)
    if settings.ENABLE_CACHE:
        util.cache_results(Model, get_data, driver_id, True, predictions, repeat)
    return driver_id, predictions
def export_video_ids_tsv(self):
    """Export all (sorted) video ids and titles as a tab-separated file.

    Writes "vids.tsv" under self.export_dir with a header row followed by
    one ``id<TAB>title`` line per video.
    """
    da = DataAccess()
    videos = da.get_all_videos(sort=True)
    with open(
        os.path.join(self.export_dir, "vids.tsv"), mode="w", encoding="utf-8"
    ) as f:
        # plain literal — the original used an f-string with no placeholders
        f.write("video_id\tvideo_title\n")
        for video in videos:
            vtitle = video["snippet"]["title"]
            # NOTE(review): a title containing a tab or newline would corrupt
            # the TSV — consider csv.writer(dialect='excel-tab') if possible.
            f.write(f"{video['id']}\t{vtitle}\n")
def evaluate_iex_stocks():
    """Evaluate the S&P-top-250 stock list via the IEX data source.

    Stores the result in the module-level ``iex_stocks`` global.
    """
    print('Started evaluating IEX stocks ...', flush=True)
    filename = 'sandp_top_250'
    os.environ["IEX_API_KEY"] = config.data_iex_api_key
    data_access = DataAccess(filename)
    global iex_stocks
    # Use the stock-list name for the csv path; previously `filename` was a
    # dead variable and the f-string contained no placeholder.
    iex_stocks = evaluate_stocks('iex', f'{filename}.csv', data_access)
def test_model_heading(model_id, driver_id, Model, get_data, repeat):
    """Cross-validate `Model` on heading n-grams for one driver.

    Builds a vocabulary from the first heading variation of each of the
    driver's rides, then runs settings.FOLDS-fold CV over the 200 rides:
    each fold trains on the remaining driver rides (all 4 variations,
    repeated `repeat` times, label 1) plus random other-driver rides
    (label 0), and predicts on the held-out rides.

    Returns (driver_id, per-ride predictions array).
    """
    seed = random.Random(x=driver_id+model_id)
    da = DataAccess()
    set1 = list(da.get_rides(driver_id))  # first half of the train set
    set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed))  # second half of the train set
    # get_data_heading_v2 was tuned with a wider smoothing window
    moving_average_window = 6 if get_data == get_data_heading_v2 else 3
    # each ride becomes a "four pack" of heading variations
    set1 = [heading.get_ride_heading(ride, variations=True, \
        moving_average_window=moving_average_window) for ride in set1]
    set2 = [heading.get_ride_heading(ride, variations=True, \
        moving_average_window=moving_average_window) for ride in set2]
    set1 = [[util.get_list_string(r) for r in four_pack] for four_pack in set1]
    set2 = [[util.get_list_string(r) for r in four_pack] for four_pack in set2]
    vectorizer = CountVectorizer(min_df=2, ngram_range=(1,15), max_df=1000000)
    # fit the vocabulary on the first variation of each driver ride only
    vectorizer.fit([r[0] for r in set1])
    rides = [[vectorizer.transform([r])[0] for r in four_pack] for four_pack in set1]
    other_rides = [[vectorizer.transform([r])[0] for r in four_pack] for four_pack in set2]
    other_rides = list(itertools.chain(*other_rides))
    rides = np.array(rides)
    trainY = [1] * settings.BIG_CHUNK_TEST * 4 * repeat + [0] * settings.BIG_CHUNK_TEST * 4 * repeat
    kf = KFold(200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id)
    # sentinel: any 'bug' left after the folds means a ride was never scored
    predictions = ['bug'] * 200
    for train_fold, test_fold in kf:
        trainX = rides[train_fold]
        trainX = scipy.sparse.vstack(
            list(itertools.chain(*trainX)) * repeat + \
            other_rides
        )
        # only the first (un-rotated) variation is scored
        testX = scipy.sparse.vstack([r[0] for r in rides[test_fold]])
        assert(trainX.shape[0] == len(trainY))
        assert(testX.shape[0] == settings.SMALL_CHUNK_TEST)
        model = Model(trainX, trainY, driver_id)
        fold_predictions = model.predict(testX)
        for i, v in enumerate(test_fold):
            predictions[v] = fold_predictions[i]
    predictions = np.array(predictions)
    if settings.ENABLE_CACHE:
        util.cache_results(Model, get_data, driver_id, True, predictions, repeat)
    return driver_id, predictions
def test_sum_expeneses_two_rows(self):
    """sum_expenses() over two inserted rows returns their combined amount.

    NOTE(review): "expeneses" is a typo for "expenses"; kept as-is because
    renaming would change the test id.
    """
    da = DataAccess(database=TEST_DB)
    da.insert(**TEST_ROW_1)
    da.insert(**TEST_ROW_2)
    sum_result = da.sum_expenses()
    da.close()
    # assertAlmostEqual guards against float rounding in the SQL SUM
    self.assertAlmostEqual(sum_result, 26.99)
def get_data_fft(model_id, driver_id, repeat, test=False, version=1):
    """FFT feature arrays (train, test) for one driver.

    version=1 uses util.fft; any other version uses util.fft_strip.
    In test mode the split is all driver rides vs. seeded random
    other-driver rides; otherwise the driver's rides are split and padded
    with other-driver samples on both sides.
    """
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        train_rides = list(da.get_rides(driver_id))
        eval_rides = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        train_rides = driver_train + list(
            da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        eval_rides = driver_test + list(
            da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    transform = util.fft if version == 1 else util.fft_strip
    return (np.array([transform(r) for r in train_rides]),
            np.array([transform(r) for r in eval_rides]))
def get_data_g_forces_v6(model_id, driver_id, repeat, test=False, version=1):
    """G-force word (v4 representation) n-gram matrices (train, test)."""
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        train_rides = list(da.get_rides(driver_id))  # first half of the train set
        eval_rides = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id,
                                seed=rng))  # second half of the train set
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        train_rides = driver_train + list(
            da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        eval_rides = driver_test + list(
            da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    train_docs = [util.get_g_forces_v4(r, version=version) for r in train_rides]
    eval_docs = [util.get_g_forces_v4(r, version=version) for r in eval_rides]
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 20))
    return vectorizer.fit_transform(train_docs), vectorizer.transform(eval_docs)
def get_data_basic_accel(model_id, driver_id, repeat, test=False, version=1):
    """Dense acceleration feature arrays (train, test) for one driver."""
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        train_rides = list(da.get_rides(driver_id))
        eval_rides = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        train_rides = driver_train + list(
            da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        eval_rides = driver_test + list(
            da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    return (np.array([util.build_features_acc(r, version=version) for r in train_rides]),
            np.array([util.build_features_acc(r, version=version) for r in eval_rides]))
def get_data_acc4acc(model_id, driver_id, repeat, test=False, version=1):
    """Acceleration-word n-gram count matrices (train, test) for one driver."""
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        train_rides = list(da.get_rides(driver_id))
        eval_rides = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        train_rides = driver_train + list(
            da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        eval_rides = driver_test + list(
            da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    train_docs = [util.get_acc4acc_words(r, step=3, version=version) for r in train_rides]
    eval_docs = [util.get_acc4acc_words(r, step=3, version=version) for r in eval_rides]

    # version 1 was tuned with shorter n-grams than later versions
    max_ngram = 15 if version == 1 else 20
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1, max_ngram))
    return vectorizer.fit_transform(train_docs), vectorizer.transform(eval_docs)
def del_user():
    """Delete the logged-in user's profile and log them out.

    On success the session is cleared and the user is sent to the login
    page. On failure an error message is flashed and the user is sent back
    to their profile (the original fell through and returned None, which
    Flask turns into a 500).
    """
    if request.method == 'GET':
        uuid = session['user']['user_id']
        if DA.delete_user_from_uuid(uuid):
            # (removed a dead re-read of session['user']['user_id'])
            session.pop('user', None)
            return redirect(url_for('auth.login'))
        else:
            flash("Errore nella cancellazione del profilo!")
            # mirror mod_targa's failure handling: back to the profile page
            return redirect(url_for('main.profilo'))
def get_videos_for_pitems(self, pitems: List) -> List:
    """Fetch video resources (snippet + statistics) for playlist items,
    skipping any video that is already present in the local store.
    """
    print("Requesting videos for playlist items.")
    da = DataAccess()

    wanted = [pitem["contentDetails"]["videoId"] for pitem in pitems]
    # Filter out videos we already have.
    wanted = [vid for vid in wanted if not da.have_video(vid)]

    videos: List = []
    for batch in gen_resources_for_ids(
        self.youtube.videos,
        wanted,
        part="snippet,statistics",
    ):
        videos.extend(batch)
    return videos
def segment_driver(driver_id):
    '''Generate the v1 ride segments stored in settings.SEGMENTS_FOLDER[1].

    For every ride of `driver_id`: smooth the trace, simplify it with the
    Ramer-Douglas-Peucker algorithm (epsilon=10), derive per-segment
    lengths / durations / turn angles, and bucket them into coarse integer
    bins before persisting.
    '''
    da = DataAccess()
    for ride_id_minus_1, ride in enumerate(da.get_rides(driver_id)):
        # ride ids are 1-based on disk, enumerate() is 0-based
        ride_id = ride_id_minus_1 + 1
        if da.skip_segment(driver_id, ride_id):
            continue  # already segmented
        # apply the Ramer-Douglas-Peucker algorithm
        ride = [p + [i] for i, p in enumerate(smoothen(ride))]  # enrich with timestamp
        ride = rdp(ride, epsilon=10)
        # distances/durations between consecutive RDP points, and the angle
        # at each interior point
        lengths = [
            util.euclidian_distance(ride[i - 1], ride[i])
            for i in xrange(1, len(ride))
        ]
        times = [ride[i][2] - ride[i - 1][2] for i in xrange(1, len(ride))]
        angles = [
            util.get_angle(ride[i - 2], ride[i - 1], ride[i])
            for i in xrange(2, len(ride))
        ]
        # bucket the values (log scale for the heavy-tailed length/time dists)
        lengths = util.bucket(np.log(lengths), 25,
                              [2.2, 8])  # [int(l) for l in lengths]
        times = util.bucket(np.log(times), 20,
                            [1, 5.5])  # [int(t) for t in times]
        angles = util.bucket(angles, 30, [0, 180])  # [int(a) for a in angles]
        # write results
        da.write_ride_segments(driver_id, ride_id, lengths, times, angles)
    logging.info('finished segmenting driver %s' % driver_id)
def get_data_segment_angles(model_id, driver_id, repeat, test=False, segment_version=1, extra=((1, 1), 2)):
    """Count matrices over (length, angle, length) segment-tuple features.

    Rides come from the pre-computed segment store (`segment_version`).
    `extra` is (ngram_range, min_df) for the CountVectorizer. Returns the
    (train, test) sparse matrices.
    """
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra
    if test:
        set1 = list(da.get_rides_segments(driver_id, version=segment_version))
        set2 = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                driver_id,
                                segments=True,
                                version=segment_version,
                                seed=seed))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK,
                                                       segments=True,
                                                       version=segment_version)
        other_train = list(
            da.get_random_rides(settings.BIG_CHUNK * repeat,
                                driver_id,
                                segments=True,
                                version=segment_version,
                                seed=seed))
        other_test = list(
            da.get_random_rides(settings.SMALL_CHUNK,
                                driver_id,
                                segments=True,
                                version=segment_version))
        set1 = driver_train + other_train
        set2 = driver_test + other_test
    # create features for each (segment, angle, segment) tuple
    # d[0] holds the segment lengths, d[1] the angles between them
    set1 = [[
        '%s_%s_%s' % (d[0][i - 1], d[1][i - 1], d[0][i])
        for i in xrange(1, len(d[0]))
    ] for d in set1]
    set2 = [[
        '%s_%s_%s' % (d[0][i - 1], d[1][i - 1], d[0][i])
        for i in xrange(1, len(d[0]))
    ] for d in set2]
    set1 = [util.get_list_string(d) for d in set1]
    set2 = [util.get_list_string(d) for d in set2]
    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
def get_data_g_forces_v1(model_id, driver_id, repeat, test=False, min_df=1, ngram_range=(1,10), digitize=0):
    """G-force word n-gram count matrices (train, test) for one driver.

    `digitize` > 0 bins the raw g-force values into buckets of that width
    (over the range 0..800) before building the text representation.
    """
    def process(ride, digitize):
        # one text "document" of g-force words per ride
        g_forces = util.get_g_forces(ride)
        if digitize:
            g_forces = np.digitize(g_forces, range(0, 800, digitize))
        return util.get_list_string(g_forces)
    seed = random.Random(x=driver_id+model_id)
    da = DataAccess()
    if test:
        set1 = list(da.get_rides(driver_id))  # first half of the train set
        set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed))  # second half of the train set
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        set1 = driver_train + other_train  # used for training
        set2 = driver_test + other_test  # used for testing
    set1 = [process(ride, digitize) for ride in set1]
    set2 = [process(ride, digitize) for ride in set2]
    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
def get_data_movements_accel(model_id, driver_id, repeat, test=False, step=3, tf=False, extra=((1, 15), 2), version=1):
    """Movement+acceleration word n-gram matrices (train, test).

    `extra` is (ngram_range, min_df) for the vectorizer; `tf` switches from
    raw counts to tf-idf weighting.
    """
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra

    if test:
        train_rides = list(da.get_rides(driver_id))
        eval_rides = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id,
                                segments=False, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK,
                                                       segments=False)
        train_rides = driver_train + list(
            da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id,
                                segments=False, seed=rng))
        eval_rides = driver_test + list(
            da.get_random_rides(settings.SMALL_CHUNK, driver_id, segments=False))

    train_docs = [util.build_features4(r, step=step, version=version) for r in train_rides]
    eval_docs = [util.build_features4(r, step=step, version=version) for r in eval_rides]

    vectorizer_cls = TfidfVectorizer if tf else CountVectorizer
    vectorizer = vectorizer_cls(min_df=min_df, ngram_range=ngram_range)
    return vectorizer.fit_transform(train_docs), vectorizer.transform(eval_docs)
def main():
    """Entry point: build the YouTube client and refresh the local store."""
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    # Get the API key as a CLI arg. The original indexed sys.argv[1]
    # unconditionally, so a missing argument raised IndexError instead of
    # the intended message.
    api_key = sys.argv[1] if len(sys.argv) > 1 else ""
    if not api_key:
        raise Exception("No API key provided.")

    # Get credentials and create an API client
    youtube = YouTube(api_key)

    # Do stuff.
    da = DataAccess()
    pitems_dict = da.get_pitems_dict(OTHER_PLAYLIST_IDS)
    # Resume point for the (currently disabled) comment-thread crawl below.
    current_vid = "lM28rfsHge0"
    # save_threads(youtube, da, from_vid=current_vid, dry_run=False)
    # save_all_playlist_items(youtube, OTHER_PLAYLIST_IDS, dry_run=False)
    save_all_videos(youtube, pitems_dict, dry_run=False)
def segment_driver_v2(driver_id):
    '''Generate the v2 ride segments stored in settings.SEGMENTS_FOLDER[2].

    Unlike v1, rides are not smoothed first, RDP uses a tighter epsilon=4,
    and lengths/times/angles are reduced to fixed-size histograms rather
    than bucketed sequences.
    '''
    da = DataAccess()
    for ride_id_minus_1, ride in enumerate(da.get_rides(driver_id)):
        # ride ids are 1-based on disk, enumerate() is 0-based
        ride_id = ride_id_minus_1 + 1
        if da.skip_segment(driver_id, ride_id, version=2):
            continue  # already segmented
        # apply the Ramer-Douglas-Peucker algorithm
        ride = [p + [i] for i, p in enumerate(ride)]  # enrich with timestamp
        ride = rdp(ride, epsilon=4)
        lengths = [util.euclidian_distance(ride[i-1], ride[i]) for i in xrange(1, len(ride))]
        times = [ride[i][2] - ride[i-1][2] for i in xrange(1, len(ride))]
        angles = [util.get_angle(ride[i-2], ride[i-1], ride[i]) for i in xrange(2, len(ride))]
        # fixed-size histograms; the final huge bin catches outliers
        lengths = np.histogram(lengths, bins=range(0, 700, 20) + [1000000000])[0]
        times = np.histogram(times, bins=range(0, 60, 4) + [1000000000])[0]
        angles = np.histogram(angles, bins=range(0, 181, 20))[0]
        # write results
        da.write_ride_segments(driver_id, ride_id, lengths, times, angles, version=2)
    logging.info('finished segmenting driver %s' % driver_id)
def get_data_segment_lengths(model_id, driver_id, repeat, test=False, segment_version=1, extra=((1, 8), 1)):
    """Count matrices over segment-length sequences for one driver.

    Rides come from the pre-computed segment store (`segment_version`);
    only the length component (d[0]) of each segmented ride is used.
    `extra` is (ngram_range, min_df) for the CountVectorizer.
    """
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra
    if test:
        set1 = list(da.get_rides_segments(driver_id, version=segment_version))
        set2 = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                driver_id,
                                segments=True,
                                version=segment_version,
                                seed=seed))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK,
                                                       segments=True,
                                                       version=segment_version)
        other_train = list(
            da.get_random_rides(settings.BIG_CHUNK * repeat,
                                driver_id,
                                segments=True,
                                version=segment_version,
                                seed=seed))
        other_test = list(
            da.get_random_rides(settings.SMALL_CHUNK,
                                driver_id,
                                segments=True,
                                version=segment_version))
        set1 = driver_train + other_train
        set2 = driver_test + other_test
    # keep only lengths
    set1 = [d[0] for d in set1]
    set2 = [d[0] for d in set2]
    # convert to text
    set1 = [util.get_list_string(d) for d in set1]
    set2 = [util.get_list_string(d) for d in set2]
    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
def get_data_movements_v1(model_id, driver_id, repeat, test=False, step=5, tf=False, version=1, extra=((1, 5), 2)):
    """Movement-word n-gram matrices (train, test) for one driver.

    `extra` is (ngram_range, min_df) for the vectorizer; `tf` switches from
    raw counts to tf-idf weighting.
    """
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra

    if test:
        train_rides = list(da.get_rides(driver_id))
        eval_rides = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id,
                                segments=False, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id,
                                                       settings.BIG_CHUNK,
                                                       segments=False)
        train_rides = driver_train + list(
            da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id,
                                segments=False, seed=rng))
        eval_rides = driver_test + list(
            da.get_random_rides(settings.SMALL_CHUNK, driver_id, segments=False))

    # keep only lengths and convert to text
    train_docs = [util.build_features3(r, step=step, version=version) for r in train_rides]
    eval_docs = [util.build_features3(r, step=step, version=version) for r in eval_rides]

    vectorizer_cls = TfidfVectorizer if tf else CountVectorizer
    vectorizer = vectorizer_cls(min_df=min_df, ngram_range=ngram_range)
    return vectorizer.fit_transform(train_docs), vectorizer.transform(eval_docs)
def insert_expense(self):
    """Persist the expense from the add-dialog, update the running sum,
    and close the dialog window.

    The DB connection is now released in a ``finally`` block, so a failing
    insert() no longer leaks it (previously close() was skipped on error).
    """
    insert_dict = {
        'description': self.description.get(),
        'amount': self.amount.get(),
        'file_path': self.receipt.get(),
        'date': self.date.get(),
    }
    insert_data_access = DataAccess()
    try:
        insert_data_access.insert(**insert_dict)
    finally:
        insert_data_access.close()
    self.sum += float(self.amount.get())
    self.sum_label.set(format_sum_string(self.sum))
    self.add_window.destroy()
def get_data_segment_lengths(model_id, driver_id, repeat, test=False, segment_version=1, extra=((1,8),1)):
    """Count matrices over segment-length sequences for one driver.

    Rides come from the pre-computed segment store (`segment_version`);
    only the length component (d[0]) of each segmented ride is used.
    `extra` is (ngram_range, min_df) for the CountVectorizer.
    """
    seed = random.Random(x=driver_id+model_id)
    da = DataAccess()
    ngram_range, min_df = extra
    if test:
        set1 = list(da.get_rides_segments(driver_id, version=segment_version))
        set2 = list(da.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id,
            segments=True, version=segment_version, seed=seed
        ))
    else:
        driver_train, driver_test = da.get_rides_split(
            driver_id, settings.BIG_CHUNK,
            segments=True, version=segment_version
        )
        other_train = list(da.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id,
            segments=True, version=segment_version, seed=seed
        ))
        other_test = list(da.get_random_rides(
            settings.SMALL_CHUNK, driver_id,
            segments=True, version=segment_version
        ))
        set1 = driver_train + other_train
        set2 = driver_test + other_test
    # keep only lengths
    set1 = [d[0] for d in set1]
    set2 = [d[0] for d in set2]
    # convert to text
    set1 = [util.get_list_string(d) for d in set1]
    set2 = [util.get_list_string(d) for d in set2]
    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
def get_data_segment_angles_v2(model_id, driver_id, repeat, test=False, segment_version=1, extra=((1,3),1)):
    """Count matrices over (segment-length, angle) pair features.

    Like get_data_segment_angles but each feature token pairs a segment
    length with its following angle only ('len_angle'), not the next
    segment. `extra` is (ngram_range, min_df) for the CountVectorizer.
    """
    seed = random.Random(x=driver_id+model_id)
    da = DataAccess()
    ngram_range, min_df = extra
    if test:
        set1 = list(da.get_rides_segments(driver_id, version=segment_version))
        set2 = list(da.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id,
            segments=True, version=segment_version, seed=seed
        ))
    else:
        driver_train, driver_test = da.get_rides_split(
            driver_id, settings.BIG_CHUNK,
            segments=True, version=segment_version
        )
        other_train = list(da.get_random_rides(
            settings.BIG_CHUNK * repeat, driver_id,
            segments=True, version=segment_version, seed=seed
        ))
        other_test = list(da.get_random_rides(
            settings.SMALL_CHUNK, driver_id,
            segments=True, version=segment_version
        ))
        set1 = driver_train + other_train
        set2 = driver_test + other_test
    # create features for each (segment, angle, segment) tuple
    # d[0] holds the segment lengths, d[1] the angles between them
    set1 = [['%s_%s' % (d[0][i-1], d[1][i-1]) for i in xrange(1, len(d[0]))] for d in set1]
    set2 = [['%s_%s' % (d[0][i-1], d[1][i-1]) for i in xrange(1, len(d[0]))] for d in set2]
    set1 = [util.get_list_string(d) for d in set1]
    set2 = [util.get_list_string(d) for d in set2]
    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
def get_data_g_forces_v1(model_id, driver_id, repeat, test=False, min_df=1, ngram_range=(1, 10), digitize=0):
    """G-force word n-gram count matrices (train, test) for one driver.

    `digitize` > 0 bins the raw g-force values into buckets of that width
    (over the range 0..800) before building the text representation.
    """
    def to_document(ride):
        # one text "document" of g-force words per ride
        forces = util.get_g_forces(ride)
        if digitize:
            forces = np.digitize(forces, range(0, 800, digitize))
        return util.get_list_string(forces)

    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        train_rides = list(da.get_rides(driver_id))  # first half of the train set
        eval_rides = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id,
                                seed=rng))  # second half of the train set
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        train_rides = driver_train + list(
            da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        eval_rides = driver_test + list(
            da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    train_docs = [to_document(r) for r in train_rides]
    eval_docs = [to_document(r) for r in eval_rides]
    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    return vectorizer.fit_transform(train_docs), vectorizer.transform(eval_docs)
def get_data_basic_accel(model_id, driver_id, repeat, test=False, version=1):
    """Dense acceleration feature arrays (train, test) for one driver."""
    seed = random.Random(x=driver_id+model_id)
    da = DataAccess()
    if test:
        set1 = list(da.get_rides(driver_id))
        set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        set1 = driver_train + other_train
        set2 = driver_test + other_test
    set1 = [util.build_features_acc(ride, version=version) for ride in set1]
    set2 = [util.build_features_acc(ride, version=version) for ride in set2]
    return np.array(set1), np.array(set2)
def get_data_g_forces_v6(model_id, driver_id, repeat, test=False, version=1):
    """G-force word (v4 representation) n-gram matrices (train, test)."""
    seed = random.Random(x=driver_id+model_id)
    da = DataAccess()
    if test:
        set1 = list(da.get_rides(driver_id))  # first half of the train set
        set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed))  # second half of the train set
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        set1 = driver_train + other_train  # used for training
        set2 = driver_test + other_test  # used for testing
    set1 = [util.get_g_forces_v4(ride, version=version) for ride in set1]
    set2 = [util.get_g_forces_v4(ride, version=version) for ride in set2]
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1,20))
    set1 = vectorizer.fit_transform(set1)
    set2 = vectorizer.transform(set2)
    return set1, set2
def get_data_dist_acc(model_id, driver_id, repeat, test=False):
    """Distance/acceleration word n-gram count matrices (train, test)."""
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        train_rides = list(da.get_rides(driver_id))
        eval_rides = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        train_rides = driver_train + list(
            da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        eval_rides = driver_test + list(
            da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    train_docs = [util.get_distance_acc_words(r, step=3) for r in train_rides]
    eval_docs = [util.get_distance_acc_words(r, step=3) for r in eval_rides]
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 15))
    return vectorizer.fit_transform(train_docs), vectorizer.transform(eval_docs)
def get_data_fft(model_id, driver_id, repeat, test=False, version=1):
    """FFT feature arrays (train, test) for one driver.

    version=1 uses util.fft; any other version uses util.fft_strip.
    """
    seed = random.Random(x=driver_id+model_id)
    da = DataAccess()
    if test:
        set1 = list(da.get_rides(driver_id))
        set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        set1 = driver_train + other_train
        set2 = driver_test + other_test
    if version == 1:
        set1 = [util.fft(ride) for ride in set1]
        set2 = [util.fft(ride) for ride in set2]
    else:
        set1 = [util.fft_strip(ride) for ride in set1]
        set2 = [util.fft_strip(ride) for ride in set2]
    return np.array(set1), np.array(set2)
def segment_driver_v2(driver_id):
    '''Generate the v2 ride segments stored in settings.SEGMENTS_FOLDER[2].

    Unlike v1, rides are not smoothed first, RDP uses a tighter epsilon=4,
    and lengths/times/angles are reduced to fixed-size histograms rather
    than bucketed sequences.
    '''
    da = DataAccess()
    for ride_id_minus_1, ride in enumerate(da.get_rides(driver_id)):
        # ride ids are 1-based on disk, enumerate() is 0-based
        ride_id = ride_id_minus_1 + 1
        if da.skip_segment(driver_id, ride_id, version=2):
            continue  # already segmented
        # apply the Ramer-Douglas-Peucker algorithm
        ride = [p + [i] for i, p in enumerate(ride)]  # enrich with timestamp
        ride = rdp(ride, epsilon=4)
        lengths = [
            util.euclidian_distance(ride[i - 1], ride[i])
            for i in range(1, len(ride))
        ]
        times = [ride[i][2] - ride[i - 1][2] for i in range(1, len(ride))]
        angles = [
            util.get_angle(ride[i - 2], ride[i - 1], ride[i])
            for i in range(2, len(ride))
        ]
        # fixed-size histograms; the final huge bin catches outliers
        lengths = np.histogram(lengths,
                               bins=list(range(0, 700, 20)) + [1000000000])[0]
        times = np.histogram(times, bins=list(range(0, 60, 4)) + [1000000000])[0]
        angles = np.histogram(angles, bins=list(range(0, 181, 20)))[0]
        # write results
        da.write_ride_segments(driver_id,
                               ride_id,
                               lengths,
                               times,
                               angles,
                               version=2)
    logging.info('finished segmenting driver %s' % driver_id)
def mod_targa():
    """Update the logged-in user's licence plate from the posted form,
    then return to the profile page (flashing an error on failure).
    """
    new_plate = request.form['targa']
    user_id = session['user']['user_id']

    updated = DA.change_targa(user_id, new_plate)
    if not updated:
        flash("Errore in modifica targa!")

    return redirect(url_for('main.profilo'))
def test_insert_extra_fields(self):
    """insert() with extra fields (TEST_ROW_3) still writes exactly one row."""
    da = DataAccess(database=TEST_DB)
    da.insert(**TEST_ROW_3)
    # release the connection like the other tests do (was leaked here)
    da.close()
    self.assertEqual(_get_row_count(), 1)
def test_insert_no_kwargs_raises_InvalidDataError(self):
    """insert() with no fields must raise InvalidDataError."""
    da = DataAccess(database=TEST_DB)
    try:
        self.assertRaises(InvalidDataError, da.insert)
    finally:
        # release the connection like the other tests do (was leaked here)
        da.close()
def test_sum_expenses_one_row(self):
    """A single inserted row sums to exactly its amount."""
    access = DataAccess(database=TEST_DB)
    access.insert(**TEST_ROW_1)
    total = access.sum_expenses()
    access.close()
    self.assertEqual(total, 12.75)
def __init__(self):
    """Wire up the data-access layer using the configured connection string."""
    self.data_access = DataAccess(SvcConfig.db_connection_str)
def __init__(self, db, model_table_name, model_table, hh_table_name, hh_table,
             pp_table_name, pp_table, land_table_name, land_table,
             business_sector_table_name, business_sector_table,
             policy_table_name, policy_table, stat_table_name, stat_table,
             start_year):
    """Initialize the society from the DB-backed tables.

    Builds the model-parameter dictionary, the per-table variable lists,
    and the dictionaries of Land, Household, BusinessSector and Policy
    instances, plus bookkeeping for statistics and ownerless assets.
    """
    # Time stamps: the simulation starts at start_year.
    self.start_year = start_year
    self.current_year = start_year

    # Model parameters, indexed by Variable_Name.
    self.model_parameters_dict = {record.Variable_Name: record.Variable_Value
                                  for record in model_table}

    # Variable (field) lists for each table-backed class.
    self.hh_var_list = DataAccess.get_var_list(db, hh_table_name)
    self.pp_var_list = DataAccess.get_var_list(db, pp_table_name)
    self.land_var_list = DataAccess.get_var_list(db, land_table_name)
    self.business_sector_var_list = DataAccess.get_var_list(db, business_sector_table_name)
    self.policy_var_list = DataAccess.get_var_list(db, policy_table_name)
    self.stat_var_list = DataAccess.get_var_list(db, stat_table_name)

    # Land parcels, indexed by ParcelID. A parcel need not belong to any
    # household (its HID may be "None").
    self.land_dict = dict()
    for land_record in land_table:
        parcel = Land(land_record, self.land_var_list, self.current_year)
        self.land_dict[parcel.ParcelID] = parcel

    # Households, indexed by HID. Capital property and per-household land
    # instances are created inside Household.__init__.
    self.hh_dict = dict()
    for hh_record in hh_table:
        household = Household(hh_record, self.hh_var_list, self.current_year,
                              db, pp_table_name, pp_table, self.land_dict,
                              self.model_parameters_dict)
        self.hh_dict[household.HID] = household

    # Business sectors, indexed by SectorName.
    self.business_sector_dict = dict()
    for sector_record in business_sector_table:
        sector = BusinessSector(sector_record, self.business_sector_var_list)
        self.business_sector_dict[sector.SectorName] = sector

    # Policy programs, indexed by PolicyType.
    self.policy_dict = dict()
    for policy_record in policy_table:
        program = Policy(policy_record, self.policy_var_list)
        self.policy_dict[program.PolicyType] = program

    # Statistics, indexed by variable name; filled later by the Statistics class.
    self.stat_dict = dict()

    # Money and land confiscated due to ownerlessness.
    self.ownerless_land = list()
    self.ownerless_money = 0
def evaluate_sandp500_stocks():
    """Evaluate the S&P 500 stock list (STOOQ.com data) and cache it globally."""
    global sandp500_stocks
    print('Started evaluating STOOQ.com stocks ...', flush=True)
    sandp500_stocks = evaluate_stocks('stooq', 'sandp500.csv',
                                      DataAccess('sandp500'))
def evaluate_fse_stocks():
    """Evaluate the FSE stock list (QUANDL data) and cache it globally."""
    global fse_stocks
    print('Started evaluating QUANDL stocks ...', flush=True)
    fse_stocks = evaluate_stocks('quandl', 'quandl_fse_stocks.csv',
                                 DataAccess('quandl_fse_stocks'))
def get_sum():
    """Return the total of all expenses.

    Opens a fresh DataAccess connection, sums the expenses, and guarantees
    the connection is closed even if the query raises.
    """
    da = DataAccess()
    try:
        return da.sum_expenses()
    finally:
        # Bug fix: the original leaked the connection when sum_expenses()
        # raised, because close() was only reached on success.
        da.close()
def __init__(self, record, VarList, current_year, db, pp_table_name, pp_table,
             land_dict, model_parameters):
    """Construct a household from its row in the DB household table.

    record - one record of the household table in the DB.
    VarList - the household table's variable (field) list; each entry is
        indexed as entry[0] (attribute name) and entry[1] (position within
        record) — presumably produced by DataAccess.get_var_list; confirm
        its exact shape there.
    Also builds the household's own Person instances from the person table
    and its CapitalProperty instance.
    """
    # Copy every DB field onto the instance.
    for entry in VarList:
        setattr(self, entry[0], record[entry[1]])

    # Field lists for the household and person tables.
    self.hh_var_list = VarList
    self.pp_var_list = DataAccess.get_var_list(db, pp_table_name)

    # Current time stamp.
    self.StatDate = current_year

    # Members of this household, indexed by PID.
    self.own_pp_dict = dict()
    for person_record in pp_table:
        if person_record.HID == self.HID:
            member = Person(person_record, self.pp_var_list, current_year)
            self.own_pp_dict[member.PID] = member

    # The household's capital properties.
    self.own_capital_properties = CapitalProperty(self, land_dict, model_parameters)

    # Business sectors available to the household, and those it is in this year.
    self.own_av_business_sectors = list()
    self.own_current_sectors = list()

    # Policy programs the household participates in.
    self.own_policy_programs = list()

    # The household's energy accounting instance.
    self.energy = Energy()

    # Preference type:
    #   1 - Max Labor, Min Risk;  2 - Min Labor, Min Risk;
    #   3 - Max Labor, Max Risk;  4 - Min Labor, Max Risk.
    self.hh_preference_type = int()

    # Risk attitude: True - risk aversion; False - risk appetite.
    self.hh_risk_type = True

    # Business type:
    #   0 - agriculture only;
    #   1 - agriculture plus one other sector, or a single non-agriculture sector;
    #   2 - agriculture plus more than one other sector.
    self.business_type = int()

    # Whether the household is dissolved in the current year.
    self.is_dissolved_this_year = False
def init_db():
    """Initialise the module-level db_client, logging critically on failure."""
    global db_client
    try:
        db_client = DataAccess.get_instance().get_db_client()
    except Exception as e:
        # Best-effort startup: record the failure instead of propagating it.
        logging.critical("Problem occured while connecting db. {}".format(e))