示例#1
0
def basic_stats(fake_input_content):
    if fake_input_content:
        pass
    conn = sqlite_connection(get_db_path())

    unique_vids = execute_query(
        conn, "SELECT count(*) FROM videos "
        "WHERE NOT videos.id = 'unknown';")[0][0]
    total_opened = execute_query(conn, "SELECT count(*) FROM "
                                 "videos_timestamps;")[0][0]
    total_channels = execute_query(conn,
                                   "SELECT count(*) FROM channels;")[0][0]
    total_tags = execute_query(conn, "SELECT count(*) FROM videos_tags;")[0][0]
    unique_tags = execute_query(conn, "SELECT count(*) FROM tags;")[0][0]
    conn.close()
    mark_down = dcc.Markdown(
        dedent(f'''
    **{unique_vids}** unique videos 
    (and some with no identifying info available), opened/watched 
    **{total_opened}** times    
    **{total_tags}** tags across all videos (**{unique_tags}** unique), 
    an average of **{round(total_tags / unique_vids, 1)}** per video    
    **{total_channels}** channels
    '''))

    return mark_down
示例#2
0
def setup_tables(conn: sqlite3.Connection, api_auth):
    for schema in TABLE_SCHEMAS:
        create_schema_ = 'CREATE TABLE IF NOT EXISTS ' + TABLE_SCHEMAS[schema]
        execute_query(conn, create_schema_)

    insert_or_refresh_categories(conn, api_auth, True)
    insert_topics(conn)

    conn.commit()
示例#3
0
def insert_topics(conn: sqlite3.Connection):
    query_string = generate_insert_query('topics',
                                         columns=TOPICS_COLUMNS,
                                         on_conflict_ignore=True)
    for k, v in topics.items():
        insert_tuple = (k, v)
        execute_query(conn, query_string, insert_tuple)

    conn.commit()
示例#4
0
def _show_front_end_data(fe_data: dict, conn):
    """
    Composes a basic summary shown at the end of adding Takeout or updating
    records
    """
    fe_data['records_in_db'] = execute_query(
        conn, 'SELECT count(*) from videos')[0][0]
    fe_data['timestamps'] = execute_query(
        conn, 'SELECT count(*) from videos_timestamps')[0][0]
    at_start = fe_data.get('at_start', None)
    if at_start is not None:
        fe_data['inserted'] = fe_data['records_in_db'] - at_start
    if DBProcessState.stage:
        add_sse_event(event='stop')
    add_sse_event(json.dumps(fe_data), 'stats')
示例#5
0
def v_scatter_summary(hover_data: dict):
    if hover_data:
        point_of_interest = hover_data['points'][0].get('customdata', None)
    else:
        point_of_interest = None

    if point_of_interest:
        conn = sqlite_connection(get_db_path())
        '''Views, Ratio, Likes, Dislikes, Title, Channel, Upload date'''
        query = '''SELECT v.title, c.title, v.published_at, v.view_count,
        v.like_count, v.dislike_count, (v.like_count * 1.0 / dislike_count),
        v.status, v.last_updated FROM
        videos v JOIN channels c ON v.channel_id = c.id
        WHERE v.id = ?'''
        r = execute_query(conn, query, (point_of_interest, ))[0]
        conn.close()
        views = add_commas_to_num(r[3])
        likes = add_commas_to_num(r[4]) if r[4] else None
        dislikes = add_commas_to_num(r[5]) if r[5] else None
        status = f'**Status:** {r[7]}'
        if r[7] == 'active':
            status += f', last checked on {r[8][:10]}'

        like_dislike_ratio = round(r[6], 2) if r[6] else 'n/a'
        return dcc.Markdown(
            dedent(f'''**Title:** {r[0]}    
                    **Channel:** {r[1]}    
                    **Published:** {r[2][:10]}    
                    **Views:** {views}    
                    **Likes:** {likes}    
                    **Dislikes:** {dislikes}    
                    **Ratio:** {like_dislike_ratio}    
                    **ID:** {point_of_interest}    
                    {status}
                '''))
示例#6
0
def delete_dead_video(conn: sqlite3.Connection, video_id, verbose=False):
    if execute_query(conn, '''DELETE FROM dead_videos_ids WHERE id = ?''',
                     (video_id, )):
        if verbose:
            logger.info(f'Removed {video_id!r} from dead videos due to '
                        f'retrieving some identifying info for the record')
        return True
示例#7
0
def add_tags_to_table_and_videos(conn: sqlite3.Connection,
                                 tags: list,
                                 video_id: str,
                                 existing_tags: dict,
                                 existing_videos_tags_records: dict = None,
                                 verbose=False):

    id_query_string = 'SELECT id FROM tags WHERE tag = ?'
    for tag in tags:
        if tag not in existing_tags:
            tag_id = None
            if add_tag(conn, tag):
                if verbose:
                    logger.info(f'Added tag {tag!r}')
                tag_id = execute_query(conn, id_query_string, (tag, ))[0][0]
                existing_tags[tag] = tag_id

        else:
            tag_id = existing_tags[tag]
        if tag_id:
            if existing_videos_tags_records:
                if existing_videos_tags_records.get(video_id):
                    if tag_id not in existing_videos_tags_records[video_id]:
                        if add_tag_to_video(conn, tag_id,
                                            video_id) and verbose:
                            logger.info(f'Added {tag!r} to {video_id!r}')
            else:
                if add_tag_to_video(conn, tag_id, video_id) and verbose:
                    # duplicate tags are possible in a record, but happen
                    # rarely and are allowed to throw some integrity errors
                    # (caught and logged)
                    logger.info(f'Added {tag!r} to {video_id!r}')
示例#8
0
def add_topic_to_video(conn: sqlite3.Connection,
                       topic: str,
                       video_id: str,
                       verbose=False):
    if execute_query(conn, add_topic_to_video_query, (video_id, topic)):
        if verbose:
            logger.info(f'Added topic {topic!r} to {video_id!r}')
        return True
示例#9
0
def delete_time(conn: sqlite3.Connection,
                watched_at: str,
                video_id: str,
                verbose=False):
    if execute_query(conn, delete_time_query, (video_id, watched_at)):
        if verbose:
            logger.info(f'Removed timestamp {watched_at} from {video_id!r}')
        return True
示例#10
0
def add_tag_to_video(conn: sqlite3.Connection,
                     tag_id: int,
                     video_id: str,
                     verbose=False):
    if execute_query(conn, add_tag_to_video_query, (video_id, tag_id)):
        if verbose:
            logger.info(f'Added tag id# {tag_id} to {video_id!r}')
        return True
示例#11
0
def add_video(conn: sqlite3.Connection, cols_vals: dict, verbose=False):
    query_string = generate_insert_query('videos', list(cols_vals.keys()))
    values = cols_vals.values()
    if execute_query(conn, query_string, tuple(values)):
        if verbose:
            logger.info(f'Added video; id# {cols_vals["id"]}, '
                        f'title {cols_vals.get("title")!r}')
        return True
示例#12
0
def add_time(conn: sqlite3.Connection,
             watched_at: str,
             video_id: str,
             verbose=False):
    if execute_query(conn, add_time_to_video_query, (video_id, watched_at),
                     False):
        if verbose:
            logger.info(f'Added timestamp {watched_at} to {video_id!r}')
        return True
示例#13
0
def insert_or_refresh_categories(conn: sqlite3.Connection,
                                 api_auth,
                                 refresh: bool = True):
    """Gets the video categories info from YT API."""
    categories = youtube.get_categories(api_auth)
    query_string = generate_insert_query('categories',
                                         columns=CATEGORIES_COLUMNS,
                                         on_conflict_ignore=True)
    if refresh or not execute_query(conn, 'SELECT * FROM categories'):
        if execute_query(conn, 'DELETE FROM categories WHERE id IS NOT NULL;'):
            for category_dict in categories['items']:
                id_ = category_dict['id']
                channel_id = category_dict['snippet']['channelId']
                title = category_dict['snippet']['title']
                assignable = str(category_dict['snippet']['assignable'])
                etag = category_dict['etag']
                execute_query(conn, query_string,
                              (id_, channel_id, title, assignable, etag))
        conn.commit()
示例#14
0
def update_video(conn: sqlite3.Connection, cols_vals: dict, verbose=False):
    """Updates all the fields that are passed"""
    video_id = cols_vals.pop('id')
    query_string = generate_unconditional_update_query(list(cols_vals.keys()))
    values = list(cols_vals.values())
    values.append(video_id)
    if execute_query(conn, query_string, tuple(values)):
        if verbose:
            logger.info(f'Updated video {video_id!r}')
        return True
示例#15
0
def update_channel(conn: sqlite3.Connection,
                   channel_id: str,
                   channel_name: str,
                   old_name: str,
                   verbose=False):
    if execute_query(
            conn, '''UPDATE channels SET title = ?
                     WHERE id = ?''', (channel_name, channel_id)):
        if verbose:
            logger.info(f'Updated channel name; id# {channel_id}, '
                        f'from {old_name!r} to {channel_name!r}')
        return True
示例#16
0
def update_db(project_path: str, cutoff: int, logging_verbosity: int):
    import sqlite3

    progress.clear()
    DBProcessState.percent = '0.0'
    DBProcessState.stage = 'Updating...'
    add_sse_event(DBProcessState.stage, 'stage')
    db_path = join(project_path, DB_NAME)
    conn = sqlite_connection(db_path)
    front_end_data = {
        'updated': 0,
        'failed_api_requests': 0,
        'newly_inactive': 0,
        'records_in_db': execute_query(conn,
                                       'SELECT count(*) from videos')[0][0]
    }
    try:
        api_auth = youtube.get_api_auth(
            load_file(join(project_path, 'api_key')).strip())
        if DBProcessState.exit_thread_check():
            return
        for record in write_to_sql.update_videos(conn, api_auth, cutoff,
                                                 logging_verbosity):
            if DBProcessState.exit_thread_check():
                break
            DBProcessState.percent = str(record[0])
            add_sse_event(f'{DBProcessState.percent} {record[1]}')
            front_end_data['updated'] = record[2]
            front_end_data['newly_inactive'] = record[3]
            front_end_data['newly_active'] = record[4]
            front_end_data['deleted'] = record[5]

        _show_front_end_data(front_end_data, conn)
    except youtube.ApiKeyError:
        add_sse_event(f'{flash_err} Missing or invalid API key', 'errors')
        raise
    except youtube.ApiQuotaError:
        add_sse_event(
            f'API quota/rate limit exceeded, see '
            f'<a href="https://console.developers.google.com/apis/'
            f'api/youtube.googleapis.com/overview" target="_blank">'
            f'here</a>', 'errors')
        raise
    except (sqlite3.OperationalError, sqlite3.DatabaseError) as e:
        add_sse_event(f'{flash_err} Fatal database error - {e!r}', 'errors')
        raise
    except FileNotFoundError:
        add_sse_event(f'{flash_err} Invalid database path', 'errors')
        raise

    conn.close()
示例#17
0
def add_channel(conn: sqlite3.Connection,
                channel_id: str,
                channel_name: str = None,
                verbose=False) -> bool:
    values = [channel_id]
    if channel_name:
        values.append(channel_name)
    query_string = generate_insert_query('channels',
                                         CHANNEL_COLUMNS[:len(values)],
                                         on_conflict_ignore=True)
    if execute_query(conn, query_string, tuple(values)):
        if verbose:
            logger.info(f'Added channel; id# {channel_id}, '
                        f'name {channel_name!r}')
        return True
示例#18
0
def update_videos(conn: sqlite3.Connection,
                  api_auth,
                  update_age_cutoff=86400,
                  verbosity=1):
    verbosity_level_1 = verbosity >= 1
    verbosity_level_2 = verbosity >= 2
    verbosity_level_3 = verbosity >= 3
    records_passed, updated, newly_inactive, newly_active, deleted = [0] * 5
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()
    cur.execute(
        """SELECT id, last_updated FROM videos
                   WHERE title NOT IN (?, ?) AND NOT status = ?
                   ORDER BY last_updated;""",
        ('unknown', 'YouTube Music', 'deleted'))
    records = {k: v for k, v in cur.fetchall()}
    cur.execute("""SELECT * FROM channels WHERE title is not NULL;""")
    channels = {k: v for k, v in cur.fetchall()}
    cur.execute("""SELECT * FROM tags;""")
    existing_tags = {v: k for k, v in cur.fetchall()}
    cur.execute("""SELECT * FROM videos_tags""")
    existing_videos_tags = {}
    for video_tag_entry in cur.fetchall():
        existing_videos_tags.setdefault(video_tag_entry[0], [])
        existing_videos_tags[video_tag_entry[0]].append(video_tag_entry[1])
    cur.execute("""SELECT * FROM videos_topics""")
    existing_topics_tags = {}
    for video_topic_entry in cur.fetchall():
        existing_topics_tags.setdefault(video_topic_entry[0], [])
        existing_topics_tags[video_topic_entry[0]].append(video_topic_entry[1])
    cur.close()

    now = datetime.utcnow()  # for determining if the record is old enough
    dt_strp = datetime.strptime
    dt_format = '%Y-%m-%d %H:%M:%S'
    records_filtered_by_age = [
        k for k, v in records.items()
        if (now - dt_strp(v, dt_format)).total_seconds() > update_age_cutoff
    ]
    sub_percent, sub_percent_int = calculate_subpercentage(
        len(records_filtered_by_age))
    commit_interval = calculate_commit_interval(sub_percent_int)
    commit_interval_counter = 0
    del records

    if verbosity_level_1:
        logger.info(f'\nStarting records\' updating...\n' + '-' * 100)
    for record in records_filtered_by_age:
        records_passed += 1
        if records_passed % sub_percent_int == 0:
            yield ((records_passed // sub_percent) / 10, records_passed,
                   updated, newly_inactive, newly_active, deleted)
        record = execute_query(conn, 'SELECT * FROM videos WHERE id = ?',
                               (record, ))
        record = dict(record[0])
        video_id = record['id']

        for attempt in range(1, 6):
            api_response = youtube.get_video_info(video_id, api_auth)
            time.sleep(0.01 * attempt**attempt)
            if api_response:
                if api_response['items']:
                    api_video_data = wrangle_video_record(
                        api_response['items'])
                    if len(api_video_data) >= 7:
                        # a record must have at least 7 fields after
                        # going through wrangle_video_record, otherwise it's a
                        # record of a deleted video with no valid data
                        api_video_data.pop('published_at', None)
                        if 'channel_title' not in api_video_data:
                            # the video is somehow available through API
                            # (though some data is missing), but not on YouTube
                            pass
                        else:
                            if record['status'] == 'inactive':
                                record['status'] = 'active'
                                newly_active += 1
                                if verbosity_level_1:
                                    logger.info(
                                        f'{get_record_id_and_title(record)}, '
                                        f'is now active')
                        record.update(api_video_data)
                    else:
                        record['status'] = 'deleted'
                        deleted += 1
                        if verbosity_level_1:
                            logger.info(f'{get_record_id_and_title(record)}, '
                                        f'is now deleted from YT')
                else:
                    if record['status'] == 'active':
                        record['status'] = 'inactive'
                        newly_inactive += 1
                        if verbosity_level_1:
                            logger.info(f'{get_record_id_and_title(record)}, '
                                        f'is now inactive')
                record['last_updated'] = datetime.utcnow().replace(
                    microsecond=0)
                break
        else:
            continue

        if 'tags' in record:
            tags = record.pop('tags')
            add_tags_to_table_and_videos(conn, tags, video_id, existing_tags,
                                         existing_videos_tags,
                                         verbosity_level_3)
            # perhaps, the record should also be checked for tags that have
            # been removed from the updated version and have them removed from
            # the DB as well. However, keeping a fuller record, despite what
            # the video's uploader/author might think about its accuracy,
            # seems like a better option
        channel_id = record['channel_id']
        if 'channel_title' in record:
            channel_title = record.pop('channel_title')
            try:
                if channel_title != channels[channel_id]:
                    update_channel(conn, channel_id, channel_title,
                                   channels[channel_id], verbosity_level_1)
                    channels[channel_id] = channel_title
            except KeyError:
                """The channel now has a different ID... it's a thing.
                One possible reason for this is large channels, belonging to 
                large media companies, getting split off into smaller
                channels. That's what it looked like when I came across it.
                
                Only encountered this once in ~19k of my own records."""
                add_channel(conn, channel_id, channel_title, verbosity_level_2)
        else:
            # Less than a handful of videos were not available on YouTube, but
            # were available through API, with channel id, but no channel title.
            # In Takeout, these had no channel title or id, but had regular
            # title/id. Very strange.
            if channel_id not in channels:
                add_channel(conn, channel_id)

        if 'relevant_topic_ids' in record:
            topics_list = record.pop('relevant_topic_ids')
            if existing_topics_tags.get(video_id):
                for topic in topics_list:
                    if topic not in existing_topics_tags[video_id]:
                        add_topic_to_video(conn, topic, video_id,
                                           verbosity_level_2)

        if update_video(conn, record, verbosity_level_3):
            updated += 1

        commit_interval_counter += 1
        if commit_interval_counter == commit_interval:
            conn.commit()
            commit_interval_counter = 0

    conn.commit()
    execute_query(conn, 'VACUUM')
    conn.row_factory = None

    results = {
        'records_processed': records_passed,
        'records_updated': updated,
        'newly_inactive': newly_inactive,
        'newly_active': newly_active,
        'deleted_from_youtube': deleted
    }
    if verbosity_level_1:
        logger.info(json.dumps(results, indent=4))
        logger.info('\n' + '-' * 100 + f'\nUpdating finished')
示例#19
0
def add_dead_video(conn: sqlite3.Connection, video_id):
    return execute_query(conn, add_dead_video_query, (video_id, ))
示例#20
0
def populate_db(takeout_path: str, project_path: str, logging_verbosity: int):

    if DBProcessState.exit_thread_check():
        return

    progress.clear()

    DBProcessState.percent = '0'
    DBProcessState.stage = 'Processing watch-history.html file(s)...'
    add_sse_event(DBProcessState.stage, 'stage')
    records = {}
    try:
        for f in get_all_records(takeout_path, project_path):
            if DBProcessState.exit_thread_check():
                return
            if isinstance(f, tuple):
                DBProcessState.percent = f'{f[0]} {f[1]}'
                add_sse_event(DBProcessState.percent, 'takeout_progress')
            else:
                try:
                    records = f['videos']
                    if len(records) == 1:  # 1 because of the empty unknown rec
                        add_sse_event(
                            'No records found in the provided '
                            'watch-history.html file(s). '
                            'Something is very wrong.', 'errors')
                        return
                except KeyError:
                    add_sse_event(
                        f'No watch-history.html files found in '
                        f'{takeout_path!r}', 'errors')
                    return

                failed_entries = f['failed_entries']
                if failed_entries:
                    add_sse_event(
                        f'Couldn\'t parse {len(failed_entries)} '
                        f'entries; dumped to parse_fails.json '
                        f'in project directory', 'warnings')
                failed_files = f['failed_files']
                if failed_files:
                    add_sse_event(
                        'The following files could not be '
                        'processed:', 'warnings')
                    for ff in failed_files:
                        add_sse_event(ff, 'warnings')

                total_ts = f['total_timestamps']
                total_v = f['total_videos']
                add_sse_event(
                    f'Videos / timestamps found: '
                    f'{total_v} / {total_ts}', 'info')

    except FileNotFoundError:
        add_sse_event(
            f'Invalid/non-existent path for watch-history.html files',
            'errors')
        raise

    if DBProcessState.exit_thread_check():
        return

    db_path = join(project_path, DB_NAME)
    conn = sqlite_connection(db_path, types=True)
    front_end_data = {'updated': 0}
    try:
        api_auth = youtube.get_api_auth(
            load_file(join(project_path, 'api_key')).strip())
        write_to_sql.setup_tables(conn, api_auth)
        records_at_start = execute_query(conn,
                                         'SELECT count(*) from videos')[0][0]
        if not records_at_start:
            front_end_data['at_start'] = 0
        else:
            front_end_data['at_start'] = records_at_start

        DBProcessState.percent = '0.0'
        add_sse_event(f'{DBProcessState.percent} 1')
        DBProcessState.stage = ('Inserting video records/timestamps from '
                                'Takeout...')
        add_sse_event(DBProcessState.stage, 'stage')

        for record in write_to_sql.insert_videos(conn, records, api_auth,
                                                 logging_verbosity):

            if DBProcessState.exit_thread_check():
                break

            DBProcessState.percent = str(record[0])
            add_sse_event(f'{DBProcessState.percent} {record[1]}')
            front_end_data['updated'] = record[2]

        _show_front_end_data(front_end_data, conn)
        if DBProcessState.stage:
            add_sse_event(event='stop')
        add_sse_event(json.dumps(front_end_data), 'stats')
        conn.close()
    except youtube.ApiKeyError:
        add_sse_event(f'Missing or invalid API key', 'errors')
        raise
    except youtube.ApiQuotaError:
        add_sse_event(
            f'API quota/rate limit exceeded, see '
            f'<a href="https://console.developers.google.com/apis/'
            f'api/youtube.googleapis.com/overview" target="_blank">'
            f'here</a>', 'errors')
        raise

    except (sqlite3.OperationalError, sqlite3.DatabaseError) as e:
        add_sse_event(f'Fatal database error - {e!r}', 'errors')
        raise
    except FileNotFoundError:
        add_sse_event(f'Invalid database path', 'errors')
        raise

    conn.close()
示例#21
0
def add_tag(conn: sqlite3.Connection, tag: str, verbose=False):
    if execute_query(conn, add_tag_query, (tag, )):
        if verbose:
            logger.info(f'Added tag {tag!r}')
        return True