def basic_stats(fake_input_content):
    """Compose a Markdown block with overall DB statistics.

    :param fake_input_content: unused; presumably only present to satisfy a
        Dash callback signature — TODO confirm against the callback wiring
    :return: a dcc.Markdown component summarizing video/tag/channel counts
    """
    if fake_input_content:
        pass
    conn = sqlite_connection(get_db_path())
    unique_vids = execute_query(
        conn, "SELECT count(*) FROM videos "
              "WHERE NOT videos.id = 'unknown';")[0][0]
    total_opened = execute_query(conn, "SELECT count(*) FROM "
                                       "videos_timestamps;")[0][0]
    total_channels = execute_query(conn,
                                   "SELECT count(*) FROM channels;")[0][0]
    total_tags = execute_query(conn,
                               "SELECT count(*) FROM videos_tags;")[0][0]
    unique_tags = execute_query(conn, "SELECT count(*) FROM tags;")[0][0]
    conn.close()
    # Guard against ZeroDivisionError when the DB has no identified videos yet
    avg_tags = round(total_tags / unique_vids, 1) if unique_vids else 0
    mark_down = dcc.Markdown(
        dedent(f'''
        **{unique_vids}** unique videos (and some with no identifying info available), opened/watched **{total_opened}** times

        **{total_tags}** tags across all videos (**{unique_tags}** unique), an average of **{avg_tags}** per video

        **{total_channels}** channels
        '''))
    return mark_down
def setup_tables(conn: sqlite3.Connection, api_auth):
    """Create every table in TABLE_SCHEMAS (if missing), then seed the
    categories (via the YT API) and the static topics mapping."""
    for table in TABLE_SCHEMAS:
        ddl = 'CREATE TABLE IF NOT EXISTS ' + TABLE_SCHEMAS[table]
        execute_query(conn, ddl)
    insert_or_refresh_categories(conn, api_auth, True)
    insert_topics(conn)
    conn.commit()
def insert_topics(conn: sqlite3.Connection):
    """Insert the static topic-id -> topic-name pairs, ignoring conflicts."""
    insert_query = generate_insert_query('topics',
                                         columns=TOPICS_COLUMNS,
                                         on_conflict_ignore=True)
    for topic_id, topic_name in topics.items():
        execute_query(conn, insert_query, (topic_id, topic_name))
    conn.commit()
def _show_front_end_data(fe_data: dict, conn):
    """
    Composes a basic summary shown at the end of adding Takeout or
    updating records
    """
    # Refresh the two row counts straight from the DB
    count_queries = {
        'records_in_db': 'SELECT count(*) from videos',
        'timestamps': 'SELECT count(*) from videos_timestamps',
    }
    for key, query in count_queries.items():
        fe_data[key] = execute_query(conn, query)[0][0]

    at_start = fe_data.get('at_start', None)
    if at_start is not None:
        # how many rows this run added, relative to the starting count
        fe_data['inserted'] = fe_data['records_in_db'] - at_start

    if DBProcessState.stage:
        add_sse_event(event='stop')
    add_sse_event(json.dumps(fe_data), 'stats')
def v_scatter_summary(hover_data: dict):
    """Build a Markdown summary card for the scatter-plot point under the
    cursor.

    :param hover_data: Dash hover payload; the video id is expected in
        points[0]['customdata']
    :return: dcc.Markdown with title/channel/stats, or None when there is
        no hovered point or no matching DB row
    """
    if hover_data:
        point_of_interest = hover_data['points'][0].get('customdata', None)
    else:
        point_of_interest = None
    if point_of_interest:
        conn = sqlite_connection(get_db_path())
        # Views, Ratio, Likes, Dislikes, Title, Channel, Upload date
        # (was a bare triple-quoted string, i.e. a no-op statement)
        query = '''SELECT v.title, c.title, v.published_at, v.view_count,
                   v.like_count, v.dislike_count,
                   (v.like_count * 1.0 / dislike_count),
                   v.status, v.last_updated
                   FROM videos v JOIN channels c ON v.channel_id = c.id
                   WHERE v.id = ?'''
        rows = execute_query(conn, query, (point_of_interest, ))
        conn.close()
        if not rows:
            # stale customdata / id not in DB; previously this IndexError'd
            return None
        r = rows[0]
        views = add_commas_to_num(r[3])
        likes = add_commas_to_num(r[4]) if r[4] else None
        dislikes = add_commas_to_num(r[5]) if r[5] else None
        status = f'**Status:** {r[7]}'
        if r[7] == 'active':
            status += f', last checked on {r[8][:10]}'
        like_dislike_ratio = round(r[6], 2) if r[6] else 'n/a'
        return dcc.Markdown(
            dedent(f'''**Title:** {r[0]}

            **Channel:** {r[1]}

            **Published:** {r[2][:10]}

            **Views:** {views}

            **Likes:** {likes}

            **Dislikes:** {dislikes}

            **Ratio:** {like_dislike_ratio}

            **ID:** {point_of_interest}

            {status}
            '''))
def delete_dead_video(conn: sqlite3.Connection, video_id, verbose=False):
    """Drop a video id from dead_videos_ids; True on success, None otherwise."""
    removed = execute_query(conn,
                            '''DELETE FROM dead_videos_ids WHERE id = ?''',
                            (video_id, ))
    if not removed:
        return None
    if verbose:
        logger.info(f'Removed {video_id!r} from dead videos due to '
                    f'retrieving some identifying info for the record')
    return True
def add_tags_to_table_and_videos(conn: sqlite3.Connection, tags: list,
                                 video_id: str, existing_tags: dict,
                                 existing_videos_tags_records: dict = None,
                                 verbose=False):
    """Ensure each tag exists in the tags table and attach it to the video.

    existing_tags (tag -> tag id) is updated in place as new tags are
    inserted; existing_videos_tags_records (video id -> list of tag ids),
    when provided, is used to skip links that are already in the DB.
    """
    tag_id_lookup = 'SELECT id FROM tags WHERE tag = ?'
    for tag in tags:
        if tag in existing_tags:
            tag_id = existing_tags[tag]
        else:
            tag_id = None
            # brand-new tag: insert it, then fetch the id SQLite assigned
            if add_tag(conn, tag):
                if verbose:
                    logger.info(f'Added tag {tag!r}')
                tag_id = execute_query(conn, tag_id_lookup, (tag, ))[0][0]
                existing_tags[tag] = tag_id
        if not tag_id:
            continue
        if existing_videos_tags_records:
            known_tag_ids = existing_videos_tags_records.get(video_id)
            if known_tag_ids and tag_id not in known_tag_ids:
                if add_tag_to_video(conn, tag_id, video_id) and verbose:
                    logger.info(f'Added {tag!r} to {video_id!r}')
        else:
            # duplicate tags are possible in a record, but happen
            # rarely and are allowed to throw some integrity errors
            # (caught and logged)
            if add_tag_to_video(conn, tag_id, video_id) and verbose:
                logger.info(f'Added {tag!r} to {video_id!r}')
def add_topic_to_video(conn: sqlite3.Connection, topic: str, video_id: str,
                       verbose=False):
    """Link a topic to a video; True on success, None otherwise."""
    inserted = execute_query(conn, add_topic_to_video_query,
                             (video_id, topic))
    if not inserted:
        return None
    if verbose:
        logger.info(f'Added topic {topic!r} to {video_id!r}')
    return True
def delete_time(conn: sqlite3.Connection, watched_at: str, video_id: str,
                verbose=False):
    """Remove one watch timestamp from a video; True on success."""
    removed = execute_query(conn, delete_time_query, (video_id, watched_at))
    if not removed:
        return None
    if verbose:
        logger.info(f'Removed timestamp {watched_at} from {video_id!r}')
    return True
def add_tag_to_video(conn: sqlite3.Connection, tag_id: int, video_id: str,
                     verbose=False):
    """Attach an existing tag id to a video; True on success."""
    linked = execute_query(conn, add_tag_to_video_query, (video_id, tag_id))
    if not linked:
        return None
    if verbose:
        logger.info(f'Added tag id# {tag_id} to {video_id!r}')
    return True
def add_video(conn: sqlite3.Connection, cols_vals: dict, verbose=False):
    """Insert one video row; column names come from cols_vals' keys.

    Returns True on success, None otherwise.
    """
    insert_query = generate_insert_query('videos', list(cols_vals.keys()))
    if not execute_query(conn, insert_query, tuple(cols_vals.values())):
        return None
    if verbose:
        logger.info(f'Added video; id# {cols_vals["id"]}, '
                    f'title {cols_vals.get("title")!r}')
    return True
def add_time(conn: sqlite3.Connection, watched_at: str, video_id: str,
             verbose=False):
    """Attach a watch timestamp to a video; True on success.

    The third execute_query argument (False) is passed positionally, same
    as the original call site.
    """
    added = execute_query(conn, add_time_to_video_query,
                          (video_id, watched_at), False)
    if not added:
        return None
    if verbose:
        logger.info(f'Added timestamp {watched_at} to {video_id!r}')
    return True
def insert_or_refresh_categories(conn: sqlite3.Connection, api_auth,
                                 refresh: bool = True):
    """Gets the video categories info from YT API.

    Wipes and repopulates the categories table when refresh is True or the
    table is empty; otherwise does nothing.
    """
    query_string = generate_insert_query('categories',
                                         columns=CATEGORIES_COLUMNS,
                                         on_conflict_ignore=True)
    if refresh or not execute_query(conn, 'SELECT * FROM categories'):
        # Only hit the network API when the table will actually be
        # repopulated (previously this call ran unconditionally)
        categories = youtube.get_categories(api_auth)
        if execute_query(conn,
                         'DELETE FROM categories WHERE id IS NOT NULL;'):
            for category_dict in categories['items']:
                snippet = category_dict['snippet']
                id_ = category_dict['id']
                channel_id = snippet['channelId']
                title = snippet['title']
                assignable = str(snippet['assignable'])
                etag = category_dict['etag']
                execute_query(conn, query_string,
                              (id_, channel_id, title, assignable, etag))
            conn.commit()
def update_video(conn: sqlite3.Connection, cols_vals: dict, verbose=False):
    """Updates all the fields that are passed"""
    # 'id' is consumed here: it becomes the WHERE parameter, not a SET field
    video_id = cols_vals.pop('id')
    update_query = generate_unconditional_update_query(list(cols_vals.keys()))
    params = (*cols_vals.values(), video_id)
    if not execute_query(conn, update_query, params):
        return None
    if verbose:
        logger.info(f'Updated video {video_id!r}')
    return True
def update_channel(conn: sqlite3.Connection, channel_id: str,
                   channel_name: str, old_name: str, verbose=False):
    """Rename a channel record; True on success.

    old_name is only used for the log message.
    """
    rename_query = '''UPDATE channels SET title = ?
                       WHERE id = ?'''
    if not execute_query(conn, rename_query, (channel_name, channel_id)):
        return None
    if verbose:
        logger.info(f'Updated channel name; id# {channel_id}, '
                    f'from {old_name!r} to {channel_name!r}')
    return True
def update_db(project_path: str, cutoff: int, logging_verbosity: int):
    """Drive a full record-update pass, streaming progress over SSE.

    :param project_path: directory holding the DB and the 'api_key' file
    :param cutoff: minimum record age (passed through to update_videos)
    :param logging_verbosity: verbosity level for update_videos
    :raises: re-raises API-key, quota, database and path errors after
        reporting them on the 'errors' SSE channel
    """
    import sqlite3
    progress.clear()
    DBProcessState.percent = '0.0'
    DBProcessState.stage = 'Updating...'
    add_sse_event(DBProcessState.stage, 'stage')
    db_path = join(project_path, DB_NAME)
    conn = sqlite_connection(db_path)
    front_end_data = {
        'updated': 0,
        'failed_api_requests': 0,
        'newly_inactive': 0,
        'records_in_db': execute_query(conn,
                                       'SELECT count(*) from videos')[0][0]
    }
    try:
        api_auth = youtube.get_api_auth(
            load_file(join(project_path, 'api_key')).strip())
        if DBProcessState.exit_thread_check():
            return
        for record in write_to_sql.update_videos(conn, api_auth, cutoff,
                                                 logging_verbosity):
            if DBProcessState.exit_thread_check():
                break
            # record: (percent, passed, updated, newly_inactive,
            #          newly_active, deleted)
            DBProcessState.percent = str(record[0])
            add_sse_event(f'{DBProcessState.percent} {record[1]}')
            front_end_data['updated'] = record[2]
            front_end_data['newly_inactive'] = record[3]
            front_end_data['newly_active'] = record[4]
            front_end_data['deleted'] = record[5]
        _show_front_end_data(front_end_data, conn)
    except youtube.ApiKeyError:
        add_sse_event(f'{flash_err} Missing or invalid API key', 'errors')
        raise
    except youtube.ApiQuotaError:
        add_sse_event(
            f'API quota/rate limit exceeded, see '
            f'<a href="https://console.developers.google.com/apis/'
            f'api/youtube.googleapis.com/overview" target="_blank">'
            f'here</a>', 'errors')
        raise
    except (sqlite3.OperationalError, sqlite3.DatabaseError) as e:
        add_sse_event(f'{flash_err} Fatal database error - {e!r}', 'errors')
        raise
    except FileNotFoundError:
        add_sse_event(f'{flash_err} Invalid database path', 'errors')
        raise
    finally:
        # Previously closed only on the success path; every handler above
        # re-raises, so the connection leaked on any error (and on the
        # early return). finally guarantees release.
        conn.close()
def add_channel(conn: sqlite3.Connection, channel_id: str,
                channel_name: str = None, verbose=False) -> bool:
    """Insert a channel row (id, and name when known); True on success."""
    values = [channel_id]
    if channel_name:
        values.append(channel_name)
    # Only as many columns as we have values (id alone, or id + title)
    insert_query = generate_insert_query('channels',
                                         CHANNEL_COLUMNS[:len(values)],
                                         on_conflict_ignore=True)
    if not execute_query(conn, insert_query, tuple(values)):
        return None
    if verbose:
        logger.info(f'Added channel; id# {channel_id}, '
                    f'name {channel_name!r}')
    return True
def update_videos(conn: sqlite3.Connection, api_auth,
                  update_age_cutoff=86400, verbosity=1):
    """Refresh stale video records from the YouTube API.

    Generator: yields progress tuples of
    (percent, records_passed, updated, newly_inactive, newly_active, deleted)
    at sub-percentage intervals so the caller can stream progress.

    :param conn: open DB connection; row_factory is temporarily set to
        sqlite3.Row and restored to None at the end
    :param api_auth: authenticated YouTube API resource
    :param update_age_cutoff: only records whose last_updated is older than
        this many seconds are re-checked (default: one day)
    :param verbosity: 1..3; higher levels log progressively more detail
    """
    verbosity_level_1 = verbosity >= 1
    verbosity_level_2 = verbosity >= 2
    verbosity_level_3 = verbosity >= 3
    records_passed, updated, newly_inactive, newly_active, deleted = [0] * 5
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()
    # Candidate records: skip unidentifiable/auto-generated titles and
    # videos already known to be deleted
    cur.execute(
        """SELECT id, last_updated FROM videos
        WHERE title NOT IN (?, ?) AND NOT status = ?
        ORDER BY last_updated;""", ('unknown', 'YouTube Music', 'deleted'))
    records = {k: v for k, v in cur.fetchall()}
    # Caches to avoid per-record lookups during the loop
    cur.execute("""SELECT * FROM channels WHERE title is not NULL;""")
    channels = {k: v for k, v in cur.fetchall()}
    cur.execute("""SELECT * FROM tags;""")
    existing_tags = {v: k for k, v in cur.fetchall()}  # tag -> tag id
    cur.execute("""SELECT * FROM videos_tags""")
    existing_videos_tags = {}  # video id -> [tag ids]
    for video_tag_entry in cur.fetchall():
        existing_videos_tags.setdefault(video_tag_entry[0], [])
        existing_videos_tags[video_tag_entry[0]].append(video_tag_entry[1])
    cur.execute("""SELECT * FROM videos_topics""")
    existing_topics_tags = {}  # video id -> [topic ids]
    for video_topic_entry in cur.fetchall():
        existing_topics_tags.setdefault(video_topic_entry[0], [])
        existing_topics_tags[video_topic_entry[0]].append(video_topic_entry[1])
    cur.close()
    now = datetime.utcnow()  # for determining if the record is old enough
    dt_strp = datetime.strptime
    dt_format = '%Y-%m-%d %H:%M:%S'
    records_filtered_by_age = [
        k for k, v in records.items()
        if (now - dt_strp(v, dt_format)).total_seconds() > update_age_cutoff
    ]
    sub_percent, sub_percent_int = calculate_subpercentage(
        len(records_filtered_by_age))
    commit_interval = calculate_commit_interval(sub_percent_int)
    commit_interval_counter = 0
    del records
    if verbosity_level_1:
        logger.info(f'\nStarting records\' updating...\n' + '-' * 100)
    for record in records_filtered_by_age:
        records_passed += 1
        if records_passed % sub_percent_int == 0:
            yield ((records_passed // sub_percent) / 10, records_passed,
                   updated, newly_inactive, newly_active, deleted)
        record = execute_query(conn, 'SELECT * FROM videos WHERE id = ?',
                               (record, ))
        record = dict(record[0])
        video_id = record['id']
        # Up to 5 attempts with growing back-off (0.01 * attempt**attempt s)
        for attempt in range(1, 6):
            api_response = youtube.get_video_info(video_id, api_auth)
            time.sleep(0.01 * attempt**attempt)
            if api_response:
                if api_response['items']:
                    api_video_data = wrangle_video_record(
                        api_response['items'])
                    if len(api_video_data) >= 7:
                        # a record must have at least 7 fields after
                        # going through wrangle_video_record, otherwise it's a
                        # record of a deleted video with no valid data
                        api_video_data.pop('published_at', None)
                        if 'channel_title' not in api_video_data:
                            # the video is somehow available through API
                            # (though some data is missing), but not on
                            # YouTube
                            pass
                        else:
                            if record['status'] == 'inactive':
                                record['status'] = 'active'
                                newly_active += 1
                                if verbosity_level_1:
                                    logger.info(
                                        f'{get_record_id_and_title(record)},'
                                        f' is now active')
                        record.update(api_video_data)
                    else:
                        record['status'] = 'deleted'
                        deleted += 1
                        if verbosity_level_1:
                            logger.info(
                                f'{get_record_id_and_title(record)}, '
                                f'is now deleted from YT')
                else:
                    # empty 'items': the video exists nowhere via the API
                    if record['status'] == 'active':
                        record['status'] = 'inactive'
                        newly_inactive += 1
                        if verbosity_level_1:
                            logger.info(
                                f'{get_record_id_and_title(record)}, '
                                f'is now inactive')
                record['last_updated'] = datetime.utcnow().replace(
                    microsecond=0)
                break
            else:
                continue
        if 'tags' in record:
            tags = record.pop('tags')
            add_tags_to_table_and_videos(conn, tags, video_id, existing_tags,
                                         existing_videos_tags,
                                         verbosity_level_3)
            # perhaps, the record should also be checked for tags that have
            # been removed from the updated version and have them removed
            # from the DB as well. However, keeping a fuller record, despite
            # what the video's uploader/author might think about its
            # accuracy, seems like a better option
        channel_id = record['channel_id']
        if 'channel_title' in record:
            channel_title = record.pop('channel_title')
            try:
                if channel_title != channels[channel_id]:
                    update_channel(conn, channel_id, channel_title,
                                   channels[channel_id], verbosity_level_1)
                    channels[channel_id] = channel_title
            except KeyError:
                """The channel now has a different ID... it's a thing.
                One possible reason for this is large channels, belonging to
                large media companies, getting split off into smaller
                channels. That's what it looked like when I came across it.
                Only encountered this once in ~19k of my own records."""
                add_channel(conn, channel_id, channel_title,
                            verbosity_level_2)
        else:
            # Less than a handful of videos were not available on YouTube,
            # but were available through API, with channel id, but no
            # channel title. In Takeout, these had no channel title or id,
            # but had regular title/id. Very strange.
            if channel_id not in channels:
                add_channel(conn, channel_id)
        if 'relevant_topic_ids' in record:
            topics_list = record.pop('relevant_topic_ids')
            # NOTE(review): topics are only added for videos that already
            # have at least one topic row — confirm this is intended
            if existing_topics_tags.get(video_id):
                for topic in topics_list:
                    if topic not in existing_topics_tags[video_id]:
                        add_topic_to_video(conn, topic, video_id,
                                           verbosity_level_2)
        if update_video(conn, record, verbosity_level_3):
            updated += 1
        commit_interval_counter += 1
        # Commit in batches rather than per record
        if commit_interval_counter == commit_interval:
            conn.commit()
            commit_interval_counter = 0
    conn.commit()
    execute_query(conn, 'VACUUM')
    conn.row_factory = None
    results = {
        'records_processed': records_passed,
        'records_updated': updated,
        'newly_inactive': newly_inactive,
        'newly_active': newly_active,
        'deleted_from_youtube': deleted
    }
    if verbosity_level_1:
        logger.info(json.dumps(results, indent=4))
        logger.info('\n' + '-' * 100 + f'\nUpdating finished')
def add_dead_video(conn: sqlite3.Connection, video_id):
    """Record a video id in dead_videos_ids; returns execute_query's result."""
    params = (video_id, )
    return execute_query(conn, add_dead_video_query, params)
def populate_db(takeout_path: str, project_path: str, logging_verbosity: int):
    """Parse Takeout watch-history file(s) and insert the records into the
    DB, streaming progress/warnings/errors over SSE.

    :param takeout_path: directory with watch-history.html file(s)
    :param project_path: directory holding the DB and the 'api_key' file
    :param logging_verbosity: verbosity level passed to insert_videos
    :raises: re-raises path, API-key, quota and database errors after
        reporting them on the 'errors' SSE channel
    """
    if DBProcessState.exit_thread_check():
        return
    progress.clear()
    DBProcessState.percent = '0'
    DBProcessState.stage = 'Processing watch-history.html file(s)...'
    add_sse_event(DBProcessState.stage, 'stage')
    records = {}
    try:
        # get_all_records yields progress tuples while parsing, then a
        # final dict with the parsed results
        for f in get_all_records(takeout_path, project_path):
            if DBProcessState.exit_thread_check():
                return
            if isinstance(f, tuple):
                DBProcessState.percent = f'{f[0]} {f[1]}'
                add_sse_event(DBProcessState.percent, 'takeout_progress')
            else:
                try:
                    records = f['videos']
                    if len(records) == 1:  # 1 because of the empty unknown rec
                        add_sse_event(
                            'No records found in the provided '
                            'watch-history.html file(s). '
                            'Something is very wrong.', 'errors')
                        return
                except KeyError:
                    add_sse_event(
                        f'No watch-history.html files found in '
                        f'{takeout_path!r}', 'errors')
                    return
                failed_entries = f['failed_entries']
                if failed_entries:
                    add_sse_event(
                        f'Couldn\'t parse {len(failed_entries)} '
                        f'entries; dumped to parse_fails.json '
                        f'in project directory', 'warnings')
                failed_files = f['failed_files']
                if failed_files:
                    add_sse_event(
                        'The following files could not be '
                        'processed:', 'warnings')
                    for ff in failed_files:
                        add_sse_event(ff, 'warnings')
                total_ts = f['total_timestamps']
                total_v = f['total_videos']
                add_sse_event(
                    f'Videos / timestamps found: '
                    f'{total_v} / {total_ts}', 'info')
    except FileNotFoundError:
        add_sse_event(
            f'Invalid/non-existent path for watch-history.html files',
            'errors')
        raise
    if DBProcessState.exit_thread_check():
        return
    db_path = join(project_path, DB_NAME)
    conn = sqlite_connection(db_path, types=True)
    front_end_data = {'updated': 0}
    try:
        api_auth = youtube.get_api_auth(
            load_file(join(project_path, 'api_key')).strip())
        write_to_sql.setup_tables(conn, api_auth)
        # Count before inserting, so the front end can report how many
        # rows this run added
        records_at_start = execute_query(conn,
                                         'SELECT count(*) from videos')[0][0]
        if not records_at_start:
            front_end_data['at_start'] = 0
        else:
            front_end_data['at_start'] = records_at_start
        DBProcessState.percent = '0.0'
        add_sse_event(f'{DBProcessState.percent} 1')
        DBProcessState.stage = ('Inserting video records/timestamps from '
                                'Takeout...')
        add_sse_event(DBProcessState.stage, 'stage')
        for record in write_to_sql.insert_videos(conn, records, api_auth,
                                                 logging_verbosity):
            if DBProcessState.exit_thread_check():
                break
            DBProcessState.percent = str(record[0])
            add_sse_event(f'{DBProcessState.percent} {record[1]}')
            front_end_data['updated'] = record[2]
        _show_front_end_data(front_end_data, conn)
        # NOTE(review): _show_front_end_data already emits the 'stop' and
        # 'stats' events; the repeat below looks redundant — confirm
        if DBProcessState.stage:
            add_sse_event(event='stop')
        add_sse_event(json.dumps(front_end_data), 'stats')
        conn.close()
    except youtube.ApiKeyError:
        add_sse_event(f'Missing or invalid API key', 'errors')
        raise
    except youtube.ApiQuotaError:
        add_sse_event(
            f'API quota/rate limit exceeded, see '
            f'<a href="https://console.developers.google.com/apis/'
            f'api/youtube.googleapis.com/overview" target="_blank">'
            f'here</a>', 'errors')
        raise
    except (sqlite3.OperationalError, sqlite3.DatabaseError) as e:
        add_sse_event(f'Fatal database error - {e!r}', 'errors')
        raise
    except FileNotFoundError:
        add_sse_event(f'Invalid database path', 'errors')
        raise
    # NOTE(review): on the success path this is a second close (harmless
    # for sqlite3); on error paths the handlers re-raise before reaching
    # it, so the connection is not closed there — confirm if intended
    conn.close()
def add_tag(conn: sqlite3.Connection, tag: str, verbose=False):
    """Insert a tag into the tags table; True on success, None otherwise."""
    if not execute_query(conn, add_tag_query, (tag, )):
        return None
    if verbose:
        logger.info(f'Added tag {tag!r}')
    return True