Example #1
def close_db_connection(db_connection, db_cursor):

    log_manager.debug_global("Closing DB connection and cursor ...")

    if db_cursor and not db_cursor.closed:
        db_cursor.close()

    if db_connection and db_connection.closed == 0:
        db_connection.close()
Example #2
def stream_from_db_with_predictions(ske_config, db_config, index_table_name):

    log_manager.debug_global("Streaming from DB with predictions ...")

    db_connection = None
    db_cursor = None

    (db_connection,
     db_cursor) = db_manager.open_db_connection(db_config, db_connection,
                                                db_cursor)

    try:

        while True:

            db_cursor.execute(
                sql.SQL(
                    'SELECT *, ("AF: Social Companions" + "AF: Soziale Medien") AS AF_SC_SM '
                    'FROM {index_table_name} '
                    'WHERE already_annotated = FALSE '
                    'AND already_selected = FALSE '  # left-over from the old system
                    "AND ((selected_on IS NULL) OR (selected_on < (NOW() - INTERVAL '2 days'))) "
                    'ORDER BY AF_SC_SM ASC '
                    'LIMIT 1').format(
                        index_table_name=sql.Identifier(index_table_name)))

            result = db_cursor.fetchone()

            if result is None:
                # no more texts match the selection query; stop streaming
                break

            url = result["url"]

            _select_text(db_connection, db_cursor, index_table_name, 'url',
                         url)

            options = _preselect_options(result)

            ske_doc = ske_manager.get_doc_from_url(ske_config, url)

            yield {
                "text": ske_doc["text"],
                "options": options['cats_as_options'],
                "accept": options['options_accepted'],
                "meta": {
                    "url": url,
                    "scores": options['scores_text']
                }
            }

    except Exception as ex:
        log_manager.info_global(f"Error while streaming from DB: {ex}")

    finally:
        db_manager.close_db_connection(db_connection, db_cursor)
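
A minimal consumption sketch (hedged: the table name is a placeholder, and ske_config/db_config are assumed to be set up as elsewhere in this module). The function is a generator of Prodigy-style task dicts, so it can be driven with next():

stream = stream_from_db_with_predictions(ske_config, db_config, "index1_table")
task = next(stream)
print(task["meta"]["url"], task["accept"])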
Example #3
def _select_text(db_connection, db_cursor, index_table_name, pk_column_name,
                 pk_value):

    log_manager.debug_global(
        f"  Updating table '{index_table_name}' with the info that we selected this text at this time ..."
    )

    db_cursor.execute(
        sql.SQL("UPDATE {index_table_name} SET "
                # "already_selected = TRUE, " # left-over from the old system
                "selected_on = NOW() "
                "WHERE {pk} = %(pk_value)s ").format(
                    index_table_name=sql.Identifier(index_table_name),
                    pk=sql.Identifier(pk_column_name)),
        {'pk_value': pk_value})
    db_connection.commit()
Example #4
def _preselect_options(result):

    log_manager.debug_global("  Preselecting predicted options ...")

    # options for the user to select
    cats_as_options = get_cats_as_options([
        c for c in result.keys()
        if c.startswith('AF:') or c.startswith('T:') or c.startswith('TI:')
    ] + ['VR: enthalten', 'VR: nicht enthalten'])

    # options that are preselected based on model predictions
    options_accepted = []

    # meta information
    scores_text = ""

    for o in cats_as_options:

        cat = o["text"]

        # for predictions from index1
        if cat in result:
            cat_pred = result[cat]

            # TODO: this is only correct for independent labels!
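            # (hedged note: with mutually exclusive labels, preselecting only the
            #  argmax category would be safer than thresholding each score independently)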
            if cat_pred > 0.5:

                options_accepted.append(o["id"])

            scores_text += cat + ": " + str(round(cat_pred, 4)) + ", "

        # for index2
        elif cat == "VR: enthalten":
            # all scores > 0 should be marked as "VR enthalten"
            # all texts streamed from index2 will have a score > 0
            options_accepted.append(o["id"])

    return {
        'cats_as_options': cats_as_options,
        'options_accepted': options_accepted,
        'scores_text': scores_text
    }
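
For illustration (a hedged toy input; real rows come from the index1 query in Example #2): given a result row like {'url': 'https://example.org/1', 'AF: Social Companions': 0.91}, the options are built from the 'AF: Social Companions' column plus the two VR options. options_accepted ends up holding the ids of both 'AF: Social Companions' (0.91 > 0.5) and 'VR: enthalten' (that elif branch fires whenever the VR option is absent from the row), and scores_text becomes "AF: Social Companions: 0.91, ".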
Example #5
def write_df_to_db(df, index_table_name, db_config):

    log_manager.debug_global("Creating SqlAlchemy engine ...")

    # note: this positional URL(...) call matches SQLAlchemy < 1.4;
    # on SQLAlchemy 1.4+ the equivalent is sqlalchemy.engine.url.URL.create(...)
    engine = sqlalchemy.create_engine(
        sqlalchemy.engine.url.URL('postgresql+psycopg2',
                                  host=db_config['host'],
                                  port=db_config['port'],
                                  username=db_config['user'],
                                  password=db_config['password'],
                                  database=db_config['dbname']))

    try:
        log_manager.debug_global(
            f"Writing DataFrame to DB {index_table_name} ...")

        df.to_sql(index_table_name, engine, if_exists='append')

    except ValueError as e:
        log_manager.info_global(
            f"Can't write DataFrame to DB table {index_table_name}: {e}"
        )

    finally:
        log_manager.debug_global("Disposing of SqlAlchemy engine ...")

        engine.dispose()
Example #6
def create_table(db_connection, db_cursor, table_name):

    try:

        log_manager.debug_global(f"Dropping table {table_name} ...")

        db_cursor.execute(
            sql.SQL("""
                DROP TABLE IF EXISTS {table};
            """).format(table=sql.Identifier(table_name)))

        log_manager.debug_global(f"Creating table {table_name} ...")

        sql_stmt = sql.SQL("""
            CREATE TABLE {table} (
                {docid} varchar NOT NULL,
                {pos} varchar NOT NULL,
                {url} varchar NULL,
                CONSTRAINT ske_docid_pos_pk PRIMARY KEY ({docid}),
                CONSTRAINT ske_docid_pos_un_pos UNIQUE ({pos}),
                CONSTRAINT ske_docid_pos_un_url UNIQUE ({url})
            );
        """).format(table=sql.Identifier(table_name),
                    docid=sql.Identifier('docid'),
                    pos=sql.Identifier('pos_mara002'),
                    url=sql.Identifier('url_index1'))

        db_cursor.execute(sql_stmt)

        db_connection.commit()

    except Exception as e:
        log_manager.info_global(f"There was an error: {e}")

        log_manager.debug_global(
            f"This was the SQL string: \n{sql_stmt.as_string(db_connection)}")

        log_manager.debug_global("Rolling back DB operations ...")
        db_connection.rollback()

        raise e

    return
Example #7
def open_db_connection(db_config, db_connection=None, db_cursor=None) -> Tuple[connection, RealDictCursor]:

    log_manager.debug_global("Checking for DB connection ...")

    if not db_connection or db_connection.closed != 0:

        log_manager.debug_global("Opening DB connection ...")
        db_connection = psycopg2.connect(
            host=db_config["host"],
            user=db_config["user"],
            database=db_config["dbname"],
            password=db_config["password"],
            port=db_config["port"]
        )

    if (db_connection and db_connection.closed == 0) and (not db_cursor or db_cursor.closed):

        log_manager.debug_global("Opening DB cursor ...")
        db_cursor = db_connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

    return db_connection, db_cursor
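
A minimal usage sketch (hedged: host and credentials are placeholder values; the keys match those read above). Passing an already-open pair back in is a no-op:

db_config = {"host": "localhost", "port": 5432, "user": "postgres",
             "password": "secret", "dbname": "annotations"}

db_connection, db_cursor = open_db_connection(db_config)
# a second call with the open pair returns the same connection and cursor
db_connection, db_cursor = open_db_connection(db_config, db_connection, db_cursor)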
Example #8
def create_tables(db_config, index1_table_name, index2_table_names):

    (db_connection, db_cursor) = db_manager.open_db_connection(db_config)

    try:

        log_manager.debug_global("Dropping tables ...")
        db_cursor.execute(
            sql.SQL("""
                DROP TABLE IF EXISTS {table_keywords}, {table_scores}, {table_tokens} CASCADE;
                DROP INDEX IF EXISTS {score_idx} CASCADE;
            """).format(
                table_keywords=sql.Identifier(index2_table_names['keywords']),
                table_scores=sql.Identifier(index2_table_names['scores']),
                table_tokens=sql.Identifier(index2_table_names['tokens']),
                score_idx=sql.Identifier(
                    'index_2__mara002__lmvr_scores_score_rarity_diversity_idx')
            ))

        # table 1: keywords
        log_manager.debug_global(
            f"Creating table {index2_table_names['keywords']} ...")

        db_cursor.execute(
            sql.SQL("""
                CREATE TABLE {table} (
                    {pk} varchar NOT NULL,
                    corpus_count int4 NOT NULL,
                    category varchar NOT NULL,
                    CONSTRAINT index_2__mara002__lmvr_keywords_pk PRIMARY KEY ({pk})
                );
            """).format(table=sql.Identifier(index2_table_names['keywords']),
                        pk=sql.Identifier('keyword_id')))

        # table 2: texts + scores
        log_manager.debug_global(
            f"Creating table {index2_table_names['scores']} ...")

        db_cursor.execute(
            sql.SQL("""
                CREATE TABLE {table} (
                    {pk} varchar NOT NULL,
                    {score1} numeric NOT NULL,
                    already_annotated bool NULL,
                    selected_on timestamptz NULL,
                    CONSTRAINT index_2__mara002__lmvr_scores_pk PRIMARY KEY ({pk})
                );
                CREATE INDEX index_2__mara002__lmvr_scores_score_rarity_diversity_idx
                    ON {table}
                    USING btree
                    ({score1} DESC);
            """).format(table=sql.Identifier(index2_table_names['scores']),
                        pk=sql.Identifier('docid'),
                        score1=sql.Identifier('score_rarity_diversity')))

        # table 3: keywords in texts
        log_manager.debug_global(
            f"Creating table {index2_table_names['tokens']} ...")

        db_cursor.execute(
            sql.SQL("""
                CREATE TABLE {table} (
                    {fk_texts} varchar NOT NULL,
                    {fk_kw} varchar NOT NULL,
                    token_count int4 NOT NULL DEFAULT 0,
                    CONSTRAINT index_2__mara002__lmvr_tokens_pk PRIMARY KEY ({fk_texts}, {fk_kw}),
                    CONSTRAINT index_2__mara002__lmvr_tokens_fk FOREIGN KEY ({fk_texts})
                        REFERENCES {table_texts}({fk_texts})
                        ON UPDATE CASCADE
                        ON DELETE CASCADE,
                    CONSTRAINT index_2__mara002__lmvr_tokens_fk_keyword FOREIGN KEY ({fk_kw})
                        REFERENCES {table_kw}({fk_kw})
                        ON UPDATE CASCADE
                        ON DELETE CASCADE
                );
            """).format(
                table=sql.Identifier(index2_table_names['tokens']),
                table_texts=sql.Identifier(index2_table_names['scores']),
                fk_texts=sql.Identifier('docid'),
                table_kw=sql.Identifier(index2_table_names['keywords']),
                fk_kw=sql.Identifier('keyword_id')))

        db_connection.commit()

    except Exception as e:

        db_connection.rollback()
        raise e

    finally:
        db_manager.close_db_connection(db_connection, db_cursor)

    return  # TODO: Is this empty return on purpose?
Example #9
def run(data_path, db_config, index1_table_name, index2_table_names,
        ske_config):

    start = datetime.datetime.now()
    log_manager.info_global("--------------------------------")
    log_manager.info_global(
        f"{start.strftime('[%y-%m-%d %H:%M:%S]')} START INDEXING\n")

    log_manager.info_global("Creating DB tables ...")

    create_tables(db_config, index1_table_name, index2_table_names)

    log_manager.info_global("Creating DataFrames from original CSV files ...")

    # 1. set up the keywords dataframe
    log_manager.debug_global("Creating DataFrame for keywords ...")
    keyword_df = read_keyword_df(data_path)

    # store the keywords df to the database
    log_manager.debug_global("Writing keywords DF to DB ...")
    write_df_to_db(
        keyword_df.drop(columns=['csv_tokens', 'csv_types'], inplace=False),
        index2_table_names['keywords'], db_config)

    # 2. set up the text token counts dataframe
    log_manager.debug_global("Creating DataFrame for token counts ...")
    token_df = pd.DataFrame()

    # in doc_df, we create a column for each keyword
    # and fill it with that keyword's token count in the given document
    bar = create_progress_bar('Calculating total of tokens per text',
                              keyword_df.shape[0])

    for kw in keyword_df.itertuples():
        # kw is a Pandas object representing the row
        # we find the token counts in the CSV file stored in the column 'csv_tokens' of keyword_df
        temp_df = pd.read_csv(f'{data_path}/CSV/{kw.csv_tokens}',
                              sep='\t',
                              skiprows=8,
                              names=['docid', 'token', 'token_count'],
                              usecols=['docid', 'token_count'])
        # we need to group by doc id and sum all the token counts for various shapes of the token
        temp_df = temp_df.groupby(['docid'], as_index=False).sum()

        # add a column
        temp_df['keyword_id'] = kw.Index

        temp_df = temp_df.set_index(['keyword_id', 'docid'],
                                    verify_integrity=True)
        # 1st index: keyword_id, because this allows for fewer lookups when calculating the scores

        # we append the rows to token_df
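        # (DataFrame.append was removed in pandas 2.0; the modern equivalent is
        #  token_df = pd.concat([token_df, temp_df], verify_integrity=True))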
        token_df = token_df.append(temp_df, verify_integrity=True)

        bar.next()
    bar.finish()

    # Don't write token_df to the DB yet because it has an FK constraint referencing doc_df.

    # 3. set up the texts dataframe
    log_manager.debug_global("Creating DataFrame for texts ...")

    # we use this file only to get a complete list of doc ids
    doc_df = pd.read_csv(f'{data_path}/mara002_kvr_all.docids.counts.csv',
                         sep='\t',
                         names=['types_count', 'docid'],
                         usecols=['docid'])
    doc_df['score_rarity_diversity'] = 0.0
    doc_df['already_annotated'] = False
    doc_df['selected_on'] = None
    doc_df = doc_df.set_index('docid')

    # Calculate scores
    log_manager.debug_global("Calculating scores for texts ...")

    doc_df = score_rarity_diversity(doc_df, keyword_df, token_df)

    # Write doc_df to DB
    log_manager.debug_global("Writing DF for texts to DB ...")

    write_df_to_db(doc_df, index2_table_names['scores'], db_config)

    # Now we can write token_df to the DB.
    log_manager.debug_global("Writing DF for tokens to DB ...")

    write_df_to_db(token_df, index2_table_names['tokens'], db_config)

    # all done!
    end = datetime.datetime.now()
    log_manager.info_global(
        f"{end.strftime('[%y-%m-%d %H:%M:%S]')} DONE INDEXING, duration: {end-start}"
    )

    return  # TODO: Is this empty return on purpose?
Example #10
def score_rarity_diversity(doc_df, keyword_df, token_df):
    # This algorithm favors rare keywords over frequent keywords,
    #   and many types over many tokens,
    #   but also many tokens over few tokens.
    #
    # score(text) =
    #   sum for each keyword k:
    #     sum for n from 1 to the token count of k in text:
    #       (1/corpus token count of k) * (1/n)
    #
    # A keyword with a high token count in the corpus will yield a smaller coefficient, and vice versa,
    #   thus favoring rarity.
    # A text t1 where keyword k appears n times will have a lower score
    #   than a text t2 where k appears n+1 times, if t1 and t2 are otherwise identical,
    #   thus favoring higher token counts.
    # A text t1 where keyword k1 appears n times and keyword k2 appears m times,
    #   where k1 and k2 have the same corpus token count, will have a higher score
    #   than a text t2 where k1 appears n+l times and k2 appears m-l times,
    #   thus favoring diversity.
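    #
    # A worked toy example (hedged; numbers are made up for illustration):
    #   keyword k1 with corpus token count 10, appearing twice in text t:
    #     (1/10)*(1/1) + (1/10)*(1/2) = 0.15
    #   keyword k2 with corpus token count 100, appearing once in t:
    #     (1/100)*(1/1) = 0.01
    #   score(t) = 0.15 + 0.01 = 0.16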

    log_manager.debug_global("Calculating rarity/diversity scores ...")

    # We select the column 'score_rarity_diversity', which as of now contains only 0s.
    # This returns a Series object whose index is the docids (the index of doc_df).
    scores = doc_df['score_rarity_diversity']

    bar = create_progress_bar('Computing scores per keyword',
                              keyword_df.shape[0])

    # iterate over rows in keyword_df
    for kw, data in keyword_df.iterrows():
        # kw is the label of the row (the keyword_id)
        # data is a Series of the values in this row

        # get this keyword's corpus token count
        # we will use this to calculate its inverse frequency
        kw_freq = data.corpus_count

        # get this keyword's token count per text
        try:
            # token_df has a MultiIndex: 1st the keyword_id, 2nd the docid
            # We select all rows with keyword_id = kw. This returns a DataFrame.
            # Then we select only the column 'token_count'. This returns a Series.
            tokencounts = token_df.loc[kw]['token_count']
            # tokencounts is a Series, indexed with docid,
            #   containing as values the token counts of kw in the given docid

        except KeyError:
            # this keyword occurs in no document: contribute zero for every docid
            tokencounts = pd.Series(index=doc_df.index, data=0)

        # This is the formula:
        def calculate_score(token_count, kw_freq):
            return sum(
                map(lambda x: pow(kw_freq, -1) * pow(x, -1),
                    range(1,
                          int(token_count) + 1)))
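        # (equivalently: the partial harmonic sum H(token_count), scaled by 1/kw_freq)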

        # Apply this function to the token counts of the current keyword.
        scores = scores.add(tokencounts.apply(calculate_score,
                                              args=(kw_freq, )),
                            fill_value=0.0)

        bar.next()

    bar.finish()

    # feed the temporary Series back into the table
    doc_df['score_rarity_diversity'] = scores

    # sort by highest score
    doc_df = doc_df.sort_values(by='score_rarity_diversity', ascending=False)

    return doc_df
Example #11
def stream_from_db_with_lmvr_keywords(ske_config, db_config, index1_table_name,
                                      index2_table_names,
                                      ske_translation_table_name):

    log_manager.debug_global("Streaming from database (index2) ...")

    # open db connection
    db_connection = None
    db_cursor = None

    (db_connection,
     db_cursor) = db_manager.open_db_connection(db_config, db_connection,
                                                db_cursor)
    # Don't know where to close the DB connection!

    while True:

        db_cursor.execute(
            sql.SQL("""
                SELECT *
                FROM {idx2_table} AS idx2
                INNER JOIN {ske_table} AS ske
                    ON ske.{ske_fk_idx2} = idx2.{idx2_fk_ske}
                INNER JOIN {idx1_table} AS idx1
                    ON idx1.{idx1_pk} = ske.{ske_fk_idx1}
                WHERE   idx1.already_annotated = FALSE
                    AND idx2.already_annotated = FALSE
                    AND idx1.already_selected = FALSE
                    AND ((idx1.selected_on IS NULL) OR (idx1.selected_on < (NOW() - INTERVAL '2 days')))
                    AND ((idx2.selected_on IS NULL) OR (idx2.selected_on < (NOW() - INTERVAL '2 days')))
                ORDER BY idx2.score_rarity_diversity DESC
                LIMIT 1
            """).format(idx2_table=sql.Identifier(
                index2_table_names['scores']),
                        idx2_fk_ske=sql.Identifier('docid'),
                        ske_table=sql.Identifier(ske_translation_table_name),
                        ske_fk_idx2=sql.Identifier('docid'),
                        ske_fk_idx1=sql.Identifier('url_index1'),
                        idx1_table=sql.Identifier(index1_table_name),
                        idx1_pk=sql.Identifier('url')))

        result = db_cursor.fetchone()

        if result is None:
            # no more texts match the selection query; stop streaming
            break

        # log_manager.debug_global(f"Result={result}")

        url = result['url']
        docid = result['docid']
        # log_manager.debug_global(f"Selected text with url={url}, docid={docid}")

        # Store the information that this URL is getting selected now
        _select_text(db_connection, db_cursor, index1_table_name, 'url', url)
        _select_text(db_connection, db_cursor, index2_table_names['scores'],
                     'docid', docid)

        # Calculate the preselection options based on model predictions
        # (Will be empty if there are no predictions for this URL)
        options = _preselect_options(result)

        # Get this text's LMVR token counts
        db_cursor.execute(
            sql.SQL("""
                SELECT keyword_id, token_count
                FROM {tokens_table}
                WHERE docid = %(docid)s
                    AND token_count > 0
            """).format(tokens_table=sql.Identifier(
                index2_table_names['tokens']), ), {'docid': docid})
        lmvr_count = {
            row['keyword_id']: int(row['token_count'])
            for row in db_cursor.fetchall()
        }
        lmvr_count_text = json.dumps(lmvr_count,
                                     ensure_ascii=False,
                                     sort_keys=True)

        # retrieving the text
        ske_doc = ske_manager.get_doc_from_url(ske_config, url)

        log_manager.debug_global("  Feeding this text into prodigy ...")

        yield {
            "text": ske_doc["text"],
            "options": options['cats_as_options'],
            "accept": options['options_accepted'],
            "meta": {
                "docid": result['docid'],
                "url": url,
                "category scores": options['scores_text'],
                "LMVR count": lmvr_count_text,
                "LMVR score": result['score_rarity_diversity']
            }
        }
Example #12
        def on_exit(controller):

            log_manager.debug_global("Prodigy: exiting ...")

            db_manager.close_db_connection(db_connection, db_cursor)
Example #13
def run_recipe(db, stream, dataset_name, db_config, index1_table_name,
               index2_table_names):

    import prodigy

    @prodigy.recipe("cats_recipe")
    def choice():

        db_connection = None
        db_cursor = None

        # custom function to run when an annotation is complete
        def update(examples):

            log_manager.debug_global("Prodigy: updating ...")

            nonlocal db_connection
            nonlocal db_cursor

            db_connection, db_cursor = db_manager.open_db_connection(
                db_config, db_connection, db_cursor)

            assert db_connection and db_connection.closed == 0  # 0 means 'open'
            assert db_cursor and not db_cursor.closed

            for example in examples:
                try:

                    if index1_table_name and 'url' in example['meta']:
                        url = example['meta']['url']

                        log_manager.debug_global(
                            f"Storing annotation meta info for url={url} in table {index1_table_name} ..."
                        )

                        db_cursor.execute(
                            sql.SQL("UPDATE {index_table_name} "
                                    "SET already_annotated = TRUE "
                                    "WHERE {pk} = %(value)s").format(
                                        index_table_name=sql.Identifier(
                                            index1_table_name),
                                        pk=sql.Identifier('url')),
                            {'value': url})

                    # TODO: this could be made safer to ensure
                    # that index2 won't be updated accidentally with 'already_annotated'
                    # when we are actually only streaming from index1.
                    #
                    # Currently the stream from index1 does not set 'docid' in example['meta'],
                    # but this may not be good to rely on.
                    if index2_table_names and 'docid' in example['meta']:
                        docid = example['meta']['docid']

                        log_manager.debug_global(
                            f"Storing annotation meta info for docid={docid} in table {index2_table_names['scores']} ..."
                        )

                        db_cursor.execute(
                            sql.SQL("UPDATE {index_table_name} "
                                    "SET already_annotated = TRUE "
                                    "WHERE {pk} = %(value)s").format(
                                        index_table_name=sql.Identifier(
                                            index2_table_names['scores']),
                                        pk=sql.Identifier('docid')),
                            {'value': docid})

                    db_connection.commit()

                except Exception as ex:

                    log_manager.info_global(
                        f"Error storing an annotation in the database: {ex}")

                    db_connection.rollback()

        # custom function to run when the user exits prodigy
        # TODO: it is not ideal to close the database connection here because there might be multiple users.
        # But it won't hurt: the connection can be reopened at the next update,
        # and there is no better hook to put it in; see https://prodi.gy/docs/custom-recipes
        # At least, placed here, it will close the connection once the last user stops annotating.
        def on_exit(controller):

            log_manager.debug_global("Prodigy: exiting ...")

            db_manager.close_db_connection(db_connection, db_cursor)

        return {
            "view_id": "choice",
            "dataset": dataset_name,
            "stream": stream,
            "db": db,
            "update": update,
            "on_exit": on_exit,
        }

    log_manager.debug_global("Starting up the prodigy server ...")

    prodigy.serve(
        "cats_recipe",
        host="0.0.0.0",
        choice_style="multiple",
    )
Example #14
        def update(examples):

            log_manager.debug_global("Prodigy: updating ...")

            nonlocal db_connection
            nonlocal db_cursor

            db_connection, db_cursor = db_manager.open_db_connection(
                db_config, db_connection, db_cursor)

            assert db_connection and db_connection.closed == 0  # 0 means 'open'
            assert db_cursor and not db_cursor.closed

            for example in examples:
                try:

                    if index1_table_name and 'url' in example['meta']:
                        url = example['meta']['url']

                        log_manager.debug_global(
                            f"Storing annotation meta info for url={url} in table {index1_table_name} ..."
                        )

                        db_cursor.execute(
                            sql.SQL("UPDATE {index_table_name} "
                                    "SET already_annotated = TRUE "
                                    "WHERE {pk} = %(value)s").format(
                                        index_table_name=sql.Identifier(
                                            index1_table_name),
                                        pk=sql.Identifier('url')),
                            {'value': url})

                    # TODO: this could be made safer to ensure
                    # that index2 won't be updated accidentally with 'already_annotated'
                    # when we are actually only streaming from index1.
                    #
                    # Currently the stream from index1 does not set 'docid' in example['meta'],
                    # but this may not be good to rely on.
                    if index2_table_names and 'docid' in example['meta']:
                        docid = example['meta']['docid']

                        log_manager.debug_global(
                            f"Storing annotation meta info for docid={docid} in table {index2_table_names['scores']} ..."
                        )

                        db_cursor.execute(
                            sql.SQL("UPDATE {index_table_name} "
                                    "SET already_annotated = TRUE "
                                    "WHERE {pk} = %(value)s").format(
                                        index_table_name=sql.Identifier(
                                            index2_table_names['scores']),
                                        pk=sql.Identifier('docid')),
                            {'value': docid})

                    db_connection.commit()

                except Exception as ex:

                    log_manager.info_global(
                        f"Error storing an annotation in the database: {ex}")

                    db_connection.rollback()
Example #15
def run(ske_config,
        db_config,
        docid_table_name,
        index1_table_name,
        index2_table_names,
        should_drop_create_table=False):

    (db_connection, db_cursor) = db_manager.open_db_connection(db_config)

    if should_drop_create_table:

        create_table(db_connection, db_cursor, docid_table_name)

    # Direction 1: look for URLs that are not yet in the translation table

    # Hannes says that pos -> docid is faster than docid -> pos
    # because the SKE uses pos as internal indices

    log_manager.debug_global("Looking for URLs ...")
    url_records = select_urls_from_index1(db_cursor, docid_table_name,
                                          index1_table_name)
    log_manager.info_global(f"Found {len(url_records)} URLs to be converted. ")

    if len(url_records) > 0:

        ske_manager.create_session(ske_config)

        progressbar = progress.bar.Bar(
            'Converting URLs to docid',
            max=len(url_records),
            suffix='%(index)d/%(max)d done, ETA: %(eta_td)s h')

        for record in url_records:
            url = record['url']
            pos = ske_manager.get_pos_from_url(url)
            docid = ske_manager.get_docid_from_pos(
                ske_config, pos)  # this calls the API endpoint 'fullref'
            insert_into_table(db_connection, db_cursor, docid_table_name,
                              docid, pos, url)
            progressbar.next()

        progressbar.finish()

    # Direction 2: look for docids that are not yet in the translation table

    log_manager.debug_global("Looking for docids ...")
    docid_records = select_docids_from_index2(db_cursor, docid_table_name,
                                              index2_table_names)
    log_manager.debug_global(
        f"Found {len(docid_records)} docids to be converted.")

    if len(docid_records) > 0:

        ske_manager.create_session(ske_config)

        progressbar = progress.bar.Bar(
            'Converting docids to URLs',
            max=len(docid_records),
            suffix='%(index)d/%(max)d done, ETA: %(eta_td)s h')

        for record in docid_records:
            docid = record['docid']
            pos = ske_manager.get_pos_from_docid(
                ske_config, docid)  # this calls the API endpoint 'first'
            url = ske_manager.get_url_from_pos(ske_config, pos)
            insert_into_table(db_connection, db_cursor, docid_table_name,
                              docid, pos, url)
            progressbar.next()

        progressbar.finish()

    # All set!

    ske_manager.close_session()

    db_manager.close_db_connection(db_connection, db_cursor)

    return