def close_db_connection(db_connection, db_cursor):
    log_manager.debug_global("Closing DB connection and cursor ...")

    if db_cursor and not db_cursor.closed:
        db_cursor.close()

    if db_connection and db_connection.closed == 0:
        db_connection.close()
def stream_from_db_with_predictions(ske_config, db_config, index_table_name):
    log_manager.debug_global("Streaming from DB with predictions ...")

    db_connection = None
    db_cursor = None
    (db_connection, db_cursor) = db_manager.open_db_connection(db_config, db_connection, db_cursor)

    try:
        while True:
            db_cursor.execute(
                sql.SQL(
                    'SELECT *, ("AF: Social Companions" + "AF: Soziale Medien") AS AF_SC_SM '
                    'FROM {index_table_name} '
                    'WHERE already_annotated = FALSE '
                    'AND already_selected = FALSE '  # left-over from the old system
                    "AND ((selected_on IS NULL) OR (selected_on < (NOW() - INTERVAL '2 days'))) "
                    'ORDER BY AF_SC_SM ASC '
                    'LIMIT 1').format(
                        index_table_name=sql.Identifier(index_table_name)))

            result = db_cursor.fetchone()
            url = result["url"]

            _select_text(db_connection, db_cursor, index_table_name, 'url', url)

            options = _preselect_options(result)

            ske_doc = ske_manager.get_doc_from_url(ske_config, url)

            yield {
                "text": ske_doc["text"],
                "options": options['cats_as_options'],
                "accept": options['options_accepted'],
                "meta": {
                    "url": url,
                    "scores": options['scores_text']
                }
            }
    except Exception as ex:
        log_manager.info_global(f"Error while streaming from DB: {ex}")
    finally:
        db_manager.close_db_connection(db_connection, db_cursor)
def _select_text(db_connection, db_cursor, index_table_name, pk_column_name, pk_value):
    log_manager.debug_global(
        f" Updating table '{index_table_name}' with the info that we selected this text at this time ..."
    )

    db_cursor.execute(
        sql.SQL("UPDATE {index_table_name} SET "
                # "already_selected = TRUE, "  # left-over from the old system
                "selected_on = NOW() "
                "WHERE {pk} = %(pk_value)s ").format(
                    index_table_name=sql.Identifier(index_table_name),
                    pk=sql.Identifier(pk_column_name)),
        {'pk_value': pk_value})

    db_connection.commit()
def _preselect_options(result):
    log_manager.debug_global(" Preselecting predicted options ...")

    # options for the user to select
    cats_as_options = get_cats_as_options([
        c for c in result.keys()
        if c.startswith('AF:') or c.startswith('T:') or c.startswith('TI:')
    ] + ['VR: enthalten', 'VR: nicht enthalten'])

    # options that are preselected based on model predictions
    options_accepted = []

    # meta information
    scores_text = ""

    for o in cats_as_options:
        cat = o["text"]

        # for predictions from index1
        if cat in result:
            cat_pred = result[cat]
            # TODO: this is only correct for independent labels!
            if cat_pred > 0.5:
                options_accepted.append(o["id"])
            scores_text += cat + ": " + str(round(cat_pred, 4)) + ", "

        # for index2
        elif cat == "VR: enthalten":
            # all scores > 0 should be marked as "VR enthalten"
            # all texts streamed from index2 will have a score > 0
            options_accepted.append(o["id"])

    return {
        'cats_as_options': cats_as_options,
        'options_accepted': options_accepted,
        'scores_text': scores_text
    }
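# Illustration (not part of the original module): a minimal sketch of the
# preselection rule above under assumed shapes. get_cats_as_options is assumed
# to return dicts with at least "id" and "text" keys, as implied by the
# accesses in _preselect_options; only categories scoring above 0.5 are
# preselected.
def _example_preselect():
    example_options = [                       # assumed option shape
        {"id": 0, "text": "AF: Social Companions"},
        {"id": 1, "text": "AF: Soziale Medien"},
    ]
    example_result = {                        # assumed prediction scores from index1
        "AF: Social Companions": 0.91,
        "AF: Soziale Medien": 0.12,
    }
    accepted = [o["id"] for o in example_options
                if example_result.get(o["text"], 0.0) > 0.5]
    return accepted  # -> [0]: only the category above the 0.5 threshold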
def write_df_to_db(df, index_table_name, db_config):
    log_manager.debug_global("Creating SqlAlchemy engine ...")
    engine = sqlalchemy.create_engine(
        sqlalchemy.engine.url.URL('postgresql+psycopg2',
                                  host=db_config['host'],
                                  port=db_config['port'],
                                  username=db_config['user'],
                                  password=db_config['password'],
                                  database=db_config['dbname']))

    try:
        log_manager.debug_global(
            f"Writing DataFrame to DB {index_table_name} ...")
        df.to_sql(index_table_name, engine, if_exists='append')
    except ValueError:
        log_manager.info_global(
            f"Can't write DataFrame to Database: table {index_table_name} already exists"
        )
    finally:
        log_manager.debug_global("Disposing of SqlAlchemy engine ...")
        engine.dispose()
def create_table(db_connection, db_cursor, table_name):
    # initialized up front so the except block can reference sql_stmt
    # even if the DROP statement fails before it is assigned
    sql_stmt = None
    try:
        log_manager.debug_global(f"Dropping table {table_name} ...")
        db_cursor.execute(
            sql.SQL("""
                DROP TABLE IF EXISTS {table};
            """).format(table=sql.Identifier(table_name)))

        log_manager.debug_global(f"Creating table {table_name} ...")
        sql_stmt = sql.SQL("""
            CREATE TABLE {table} (
                {docid} varchar NOT NULL,
                {pos} varchar NOT NULL,
                {url} varchar NULL,
                CONSTRAINT ske_docid_pos_pk PRIMARY KEY ({docid}),
                CONSTRAINT ske_docid_pos_un_pos UNIQUE ({pos}),
                CONSTRAINT ske_docid_pos_un_url UNIQUE ({url})
            );
        """).format(table=sql.Identifier(table_name),
                    docid=sql.Identifier('docid'),
                    pos=sql.Identifier('pos_mara002'),
                    url=sql.Identifier('url_index1'))
        db_cursor.execute(sql_stmt)

        db_connection.commit()
    except Exception as e:
        log_manager.info_global(f"There was an error: {e}")
        if sql_stmt is not None:
            log_manager.debug_global(
                f"This was the SQL string: \n{sql_stmt.as_string(db_connection)}")
        log_manager.debug_global("Rolling back DB operations ...")
        db_connection.rollback()
        raise e

    return
def open_db_connection(db_config, db_connection=None, db_cursor=None) -> Tuple[connection, RealDictCursor]:
    log_manager.debug_global("Checking for DB connection ...")

    if not db_connection or db_connection.closed != 0:
        log_manager.debug_global("Opening DB connection ...")
        db_connection = psycopg2.connect(
            host=db_config["host"],
            user=db_config["user"],
            database=db_config["dbname"],
            password=db_config["password"],
            port=db_config["port"]
        )

    if (db_connection and db_connection.closed == 0) and (not db_cursor or db_cursor.closed):
        log_manager.debug_global("Opening DB cursor ...")
        db_cursor = db_connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

    return db_connection, db_cursor
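# Usage sketch (illustration only, not part of the original module): how the
# open/close pair above is typically used together. The db_config values are
# assumptions; the keys match the ones read above (host, port, user, password,
# dbname).
def _example_open_close_usage():
    example_db_config = {
        "host": "localhost",   # assumed value
        "port": 5432,          # assumed value
        "user": "annotator",   # assumed value
        "password": "secret",  # assumed value
        "dbname": "mara002",   # assumed value
    }
    db_connection, db_cursor = open_db_connection(example_db_config)
    try:
        db_cursor.execute("SELECT 1 AS ok")
        # RealDictCursor returns rows as dicts, e.g. {'ok': 1}
        print(db_cursor.fetchone())
    finally:
        close_db_connection(db_connection, db_cursor)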
def create_tables(db_config, index1_table_name, index2_table_names):
    (db_connection, db_cursor) = db_manager.open_db_connection(db_config)

    try:
        log_manager.debug_global("Dropping tables ...")
        db_cursor.execute(
            sql.SQL("""
                DROP TABLE IF EXISTS {table_keywords}, {table_scores}, {table_tokens} CASCADE;
                DROP INDEX IF EXISTS {score_idx} CASCADE;
            """).format(
                table_keywords=sql.Identifier(index2_table_names['keywords']),
                table_scores=sql.Identifier(index2_table_names['scores']),
                table_tokens=sql.Identifier(index2_table_names['tokens']),
                score_idx=sql.Identifier(
                    'index_2__mara002__lmvr_scores_score_rarity_diversity_idx')
            ))

        # table 1: keywords
        log_manager.debug_global(
            f"Creating table {index2_table_names['keywords']} ...")
        db_cursor.execute(
            sql.SQL("""
                CREATE TABLE {table} (
                    {pk} varchar NOT NULL,
                    corpus_count int4 NOT NULL,
                    category varchar NOT NULL,
                    CONSTRAINT index_2__mara002__lmvr_keywords_pk PRIMARY KEY ({pk})
                );
            """).format(table=sql.Identifier(index2_table_names['keywords']),
                        pk=sql.Identifier('keyword_id')))

        # table 2: texts + scores
        log_manager.debug_global(
            f"Creating table {index2_table_names['scores']} ...")
        db_cursor.execute(
            sql.SQL("""
                CREATE TABLE {table} (
                    {pk} varchar NOT NULL,
                    {score1} numeric NOT NULL,
                    already_annotated bool NULL,
                    selected_on timestamptz NULL,
                    CONSTRAINT index_2__mara002__lmvr_scores_pk PRIMARY KEY ({pk})
                );
                CREATE INDEX index_2__mara002__lmvr_scores_score_rarity_diversity_idx
                    ON {table} USING btree ({score1} DESC);
            """).format(table=sql.Identifier(index2_table_names['scores']),
                        pk=sql.Identifier('docid'),
                        score1=sql.Identifier('score_rarity_diversity')))

        # table 3: keywords in texts
        log_manager.debug_global(
            f"Creating table {index2_table_names['tokens']} ...")
        db_cursor.execute(
            sql.SQL("""
                CREATE TABLE {table} (
                    {fk_texts} varchar NOT NULL,
                    {fk_kw} varchar NOT NULL,
                    token_count int4 NOT NULL DEFAULT 0,
                    CONSTRAINT index_2__mara002__lmvr_tokens_pk PRIMARY KEY ({fk_texts}, {fk_kw}),
                    CONSTRAINT index_2__mara002__lmvr_tokens_fk FOREIGN KEY ({fk_texts})
                        REFERENCES {table_texts}({fk_texts}) ON UPDATE CASCADE ON DELETE CASCADE,
                    CONSTRAINT index_2__mara002__lmvr_tokens_fk_keyword FOREIGN KEY ({fk_kw})
                        REFERENCES {table_kw}({fk_kw}) ON UPDATE CASCADE ON DELETE CASCADE
                );
            """).format(
                table=sql.Identifier(index2_table_names['tokens']),
                table_texts=sql.Identifier(index2_table_names['scores']),
                fk_texts=sql.Identifier('docid'),
                table_kw=sql.Identifier(index2_table_names['keywords']),
                fk_kw=sql.Identifier('keyword_id')))

        db_connection.commit()
    except Exception as e:
        db_connection.rollback()
        raise e
    finally:
        db_manager.close_db_connection(db_connection, db_cursor)

    return  # TODO: Is this empty return on purpose?
def run(data_path, db_config, index1_table_name, index2_table_names, ske_config):
    start = datetime.datetime.now()
    log_manager.info_global("--------------------------------")
    log_manager.info_global(
        f"{start.strftime('[%y-%m-%d %H:%M:%S]')} START INDEXING\n")

    log_manager.info_global("Creating DB tables ...")
    create_tables(db_config, index1_table_name, index2_table_names)

    log_manager.info_global("Creating DataFrames from original CSV files ...")

    # 1. set up the keywords dataframe
    log_manager.debug_global("Creating DataFrame for keywords ...")
    keyword_df = read_keyword_df(data_path)

    # store the keywords df to the database
    log_manager.debug_global("Writing keywords DF to DB ...")
    write_df_to_db(
        keyword_df.drop(columns=['csv_tokens', 'csv_types'], inplace=False),
        index2_table_names['keywords'], db_config)

    # 2. set up the text token counts dataframe
    log_manager.debug_global("Creating DataFrame for token counts ...")
    token_df = pd.DataFrame()

    # in token_df, we create a row for each keyword/document pair
    # and fill it with that keyword's token count in the given document
    bar = create_progress_bar('Calculating total of tokens per text',
                              keyword_df.shape[0])

    for kw in keyword_df.itertuples():
        # kw is a Pandas object representing the row

        # we find the token counts in the CSV file stored in the column 'csv_tokens' of keyword_df
        temp_df = pd.read_csv(f'{data_path}/CSV/{kw.csv_tokens}',
                              sep='\t',
                              skiprows=8,
                              names=['docid', 'token', 'token_count'],
                              usecols=['docid', 'token_count'])

        # we need to group by doc id and sum all the token counts for various shapes of the token
        temp_df = temp_df.groupby(['docid'], as_index=False).sum()

        # add a column
        temp_df['keyword_id'] = kw.Index

        temp_df = temp_df.set_index(['keyword_id', 'docid'], verify_integrity=True)
        # 1st index: keyword_id, because this allows for fewer lookups when calculating the scores

        # we append the rows to token_df
        # (note: DataFrame.append was removed in pandas 2.0; pd.concat is the modern equivalent)
        token_df = token_df.append(temp_df, verify_integrity=True)

        bar.next()
    bar.finish()

    # Don't write token_df to the DB yet because it has a FK constraint to doc_df.

    # 3. set up the texts dataframe
    log_manager.debug_global("Creating DataFrame for texts ...")

    # we use this file only to get a complete list of doc ids
    doc_df = pd.read_csv(f'{data_path}/mara002_kvr_all.docids.counts.csv',
                         sep='\t',
                         names=['types_count', 'docid'],
                         usecols=['docid'])
    doc_df['score_rarity_diversity'] = 0.0
    doc_df['already_annotated'] = False
    doc_df['selected_on'] = None
    doc_df = doc_df.set_index('docid')

    # Calculate scores
    log_manager.debug_global("Calculating scores for texts ...")
    doc_df = score_rarity_diversity(doc_df, keyword_df, token_df)

    # Write doc_df to DB
    log_manager.debug_global("Writing DF for texts to DB ...")
    write_df_to_db(doc_df, index2_table_names['scores'], db_config)

    # Now we can write token_df to the DB.
    log_manager.debug_global("Writing DF for tokens to DB ...")
    write_df_to_db(token_df, index2_table_names['tokens'], db_config)

    # all done!
    end = datetime.datetime.now()
    log_manager.info_global(
        f"{end.strftime('[%y-%m-%d %H:%M:%S]')} DONE INDEXING, duration: {end-start}"
    )

    return  # TODO: Is this empty return on purpose?
def score_rarity_diversity(doc_df, keyword_df, token_df):
    # This algorithm favors rare keywords over frequent keywords,
    # and many types over many tokens,
    # but also many tokens over few tokens.
    #
    # score(text) =
    #   sum for each keyword k:
    #     sum for n from 1 to the token count of k in text:
    #       (1 / corpus token count of k) * (1 / n)
    #
    # A keyword with a high token count in the corpus will yield a smaller coefficient, and vice versa,
    # thus favoring rarity.
    # A text t1 where keyword k appears n times will have a lower score
    # than a text t2 where k appears n+1 times, if t1 and t2 are otherwise identical,
    # thus favoring higher token counts.
    # A text t1 where keyword k1 appears n times and keyword k2 appears m times,
    # where k1 and k2 have the same corpus token count, will have a higher score
    # than a text t2 where k1 appears n+l times and k2 appears m-l times,
    # thus favoring diversity.

    log_manager.debug_global("Calculating rarity/diversity scores ...")

    # We select the column 'score_rarity_diversity', which as of now contains only 0s.
    # This returns a Series object whose index is the docids (the index of doc_df).
    scores = doc_df['score_rarity_diversity']

    bar = create_progress_bar('Computing scores per keyword', keyword_df.shape[0])

    # iterate over rows in keyword_df
    for kw, data in keyword_df.iterrows():
        # kw is the label of the row (the keyword_id)
        # data is a Series of the values in this row

        # get this keyword's corpus token count
        # we will use this to calculate its inverse frequency
        kw_freq = data.corpus_count

        # get this keyword's token count per text
        try:
            # token_df has a MultiIndex: 1st the keyword_id, 2nd the docid
            # We select all rows with keyword_id = kw. This returns a DataFrame.
            # Then we select only the column 'token_count'. This returns a Series.
            tokencounts = token_df.loc[kw]['token_count']
            # tokencounts is a Series, indexed with docid,
            # containing as values the token counts of kw in the given docid
        except KeyError:
            tokencounts = pd.Series(index=doc_df.index, data=0)

        # This is the formula:
        def calculate_score(token_count, kw_freq):
            return sum(
                map(lambda x: pow(kw_freq, -1) * pow(x, -1),
                    range(1, int(token_count) + 1)))

        # Apply this function to the token counts of the current keyword.
        scores = scores.add(tokencounts.apply(calculate_score, args=(kw_freq, )),
                            fill_value=0.0)

        bar.next()
    bar.finish()

    # feed the temporary Series back into the table
    doc_df['score_rarity_diversity'] = scores

    # sort by highest score
    doc_df = doc_df.sort_values(by='score_rarity_diversity', ascending=False)

    return doc_df
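# Worked example (illustration only): the score of a single toy document under
# the formula documented above. Suppose keyword k1 has corpus_count 10 and
# appears 3 times in the text, and keyword k2 has corpus_count 1000 and appears
# 2 times (all values assumed for illustration). Then
#   score = (1/10) * (1/1 + 1/2 + 1/3) + (1/1000) * (1/1 + 1/2)
#         = 0.1 * 1.8333...           + 0.001 * 1.5
#         ≈ 0.1848
# The rare keyword k1 dominates the score, and each additional token of the
# same keyword contributes less than the previous one.
def _example_rarity_diversity_score():
    corpus_counts = {'k1': 10, 'k2': 1000}     # assumed toy corpus counts
    token_counts_in_text = {'k1': 3, 'k2': 2}  # assumed toy per-text token counts

    score = 0.0
    for kw, count in token_counts_in_text.items():
        for n in range(1, count + 1):
            score += (1 / corpus_counts[kw]) * (1 / n)
    return score  # ≈ 0.1848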
def stream_from_db_with_lmvr_keywords(ske_config, db_config, index1_table_name,
                                      index2_table_names, ske_translation_table_name):
    log_manager.debug_global("Streaming from database (index2) ...")

    # open db connection
    db_connection = None
    db_cursor = None
    (db_connection, db_cursor) = db_manager.open_db_connection(db_config, db_connection, db_cursor)
    # Don't know where to close the DB connection!

    while True:
        db_cursor.execute(
            sql.SQL("""
                SELECT *
                FROM {idx2_table} AS idx2
                INNER JOIN {ske_table} AS ske
                    ON ske.{ske_fk_idx2} = idx2.{idx2_fk_ske}
                INNER JOIN {idx1_table} AS idx1
                    ON idx1.{idx1_pk} = ske.{ske_fk_idx1}
                WHERE idx1.already_annotated = FALSE
                    AND idx2.already_annotated = FALSE
                    AND idx1.already_selected = FALSE
                    AND ((idx1.selected_on IS NULL) OR (idx1.selected_on < (NOW() - INTERVAL '2 days')))
                    AND ((idx2.selected_on IS NULL) OR (idx2.selected_on < (NOW() - INTERVAL '2 days')))
                ORDER BY idx2.score_rarity_diversity DESC
                LIMIT 1
            """).format(idx2_table=sql.Identifier(index2_table_names['scores']),
                        idx2_fk_ske=sql.Identifier('docid'),
                        ske_table=sql.Identifier(ske_translation_table_name),
                        ske_fk_idx2=sql.Identifier('docid'),
                        ske_fk_idx1=sql.Identifier('url_index1'),
                        idx1_table=sql.Identifier(index1_table_name),
                        idx1_pk=sql.Identifier('url')))

        result = db_cursor.fetchone()
        # log_manager.debug_global(f"Result={result}")

        url = result['url']
        docid = result['docid']
        # log_manager.debug_global(f"Selected text with url={url}, docid={docid}")

        # Store the information that this URL is getting selected now
        _select_text(db_connection, db_cursor, index1_table_name, 'url', url)
        _select_text(db_connection, db_cursor, index2_table_names['scores'], 'docid', docid)

        # Calculate the preselection options based on model predictions
        # (Will be empty if there are no predictions for this URL)
        options = _preselect_options(result)

        # Get this text's LMVR token counts
        db_cursor.execute(
            sql.SQL("""
                SELECT keyword_id, token_count
                FROM {tokens_table}
                WHERE docid = %(docid)s AND token_count > 0
            """).format(tokens_table=sql.Identifier(index2_table_names['tokens'])),
            {'docid': docid})
        lmvr_count = {
            row['keyword_id']: int(row['token_count'])
            for row in db_cursor.fetchall()
        }
        lmvr_count_text = json.dumps(lmvr_count, ensure_ascii=False, sort_keys=True)

        # retrieving the text
        ske_doc = ske_manager.get_doc_from_url(ske_config, url)

        log_manager.debug_global(" Feeding this text into prodigy ...")

        yield {
            "text": ske_doc["text"],
            "options": options['cats_as_options'],
            "accept": options['options_accepted'],
            "meta": {
                "docid": result['docid'],
                "url": url,
                "category scores": options['scores_text'],
                "LMVR count": lmvr_count_text,
                "LMVR score": result['score_rarity_diversity']
            }
        }
def run_recipe(db, stream, dataset_name, db_config, index1_table_name, index2_table_names):
    import prodigy

    @prodigy.recipe("cats_recipe")
    def choice():
        db_connection = None
        db_cursor = None

        # custom function to run when an annotation is complete
        def update(examples):
            log_manager.debug_global("Prodigy: updating ...")

            nonlocal db_connection
            nonlocal db_cursor
            db_connection, db_cursor = db_manager.open_db_connection(
                db_config, db_connection, db_cursor)

            assert db_connection and db_connection.closed == 0  # 0 means 'open'
            assert db_cursor and not db_cursor.closed

            for example in examples:
                try:
                    if index1_table_name and 'url' in example['meta']:
                        url = example['meta']['url']
                        log_manager.debug_global(
                            f"Storing annotation meta info for url={url} in table {index1_table_name} ..."
                        )
                        db_cursor.execute(
                            sql.SQL("UPDATE {index_table_name} "
                                    "SET already_annotated = TRUE "
                                    "WHERE {pk} = %(value)s").format(
                                        index_table_name=sql.Identifier(index1_table_name),
                                        pk=sql.Identifier('url')),
                            {'value': url})

                    # TODO: this could be made safer to ensure
                    # that index2 won't be updated accidentally with 'already_annotated'
                    # when we are actually only streaming from index1.
                    #
                    # Currently the stream from index1 does not set 'docid' in example['meta'],
                    # but this may not be good to rely on.
                    if index2_table_names and 'docid' in example['meta']:
                        docid = example['meta']['docid']
                        log_manager.debug_global(
                            f"Storing annotation meta info for docid={docid} in table {index2_table_names['scores']} ..."
                        )
                        db_cursor.execute(
                            sql.SQL("UPDATE {index_table_name} "
                                    "SET already_annotated = TRUE "
                                    "WHERE {pk} = %(value)s").format(
                                        index_table_name=sql.Identifier(index2_table_names['scores']),
                                        pk=sql.Identifier('docid')),
                            {'value': docid})

                    db_connection.commit()
                except Exception as ex:
                    log_manager.info_global(
                        f"Error storing an annotation in the database: {ex}")
                    db_connection.rollback()

        # custom function to run when the user exits prodigy
        # TODO: it is not ideal to put the closing of the database connection here because there might be multiple users.
        # but also, it won't hurt because the connection can be reopened at the next update,
        # and there is no better function to put it; see https://prodi.gy/docs/custom-recipes
        # at least, put here, it will close the connection when the last user stops annotating.
        def on_exit(controller):
            log_manager.debug_global("Prodigy: exiting ...")
            db_manager.close_db_connection(db_connection, db_cursor)

        return {
            "view_id": "choice",
            "dataset": dataset_name,
            "stream": stream,
            "db": db,
            "update": update,
            "on_exit": on_exit,
        }

    log_manager.debug_global("Starting up the prodigy server ...")
    prodigy.serve(
        "cats_recipe",
        host="0.0.0.0",
        choice_style="multiple",
    )
def run(ske_config, db_config, docid_table_name, index1_table_name,
        index2_table_names, should_drop_create_table=False):
    (db_connection, db_cursor) = db_manager.open_db_connection(db_config)

    if should_drop_create_table:
        create_table(db_connection, db_cursor, docid_table_name)

    # Direction 1: look for URLs that are not yet in the translation table
    # Hannes says that pos -> docid is faster than docid -> pos
    # because the SKE uses pos as internal indices
    log_manager.debug_global("Looking for URLs ...")
    url_records = select_urls_from_index1(db_cursor, docid_table_name, index1_table_name)
    log_manager.info_global(f"Found {len(url_records)} URLs to be converted.")

    if len(url_records) > 0:
        ske_manager.create_session(ske_config)

        progressbar = progress.bar.Bar(
            'Converting URLs to docid',
            max=len(url_records),
            suffix='%(index)d/%(max)d done, ETA: %(eta_td)s h')

        for record in url_records:
            url = record['url']
            pos = ske_manager.get_pos_from_url(url)
            docid = ske_manager.get_docid_from_pos(
                ske_config, pos)  # this calls the API endpoint 'fullref'
            insert_into_table(db_connection, db_cursor, docid_table_name, docid, pos, url)
            progressbar.next()

        progressbar.finish()

    # Direction 2: look for docids that are not yet in the translation table
    log_manager.debug_global("Looking for docids ...")
    docid_records = select_docids_from_index2(db_cursor, docid_table_name, index2_table_names)
    log_manager.debug_global(
        f"Found {len(docid_records)} docids to be converted.")

    if len(docid_records) > 0:
        ske_manager.create_session(ske_config)

        progressbar = progress.bar.Bar(
            'Converting docids to URLs',
            max=len(docid_records),
            suffix='%(index)d/%(max)d done, ETA: %(eta_td)s h')

        for record in docid_records:
            docid = record['docid']
            pos = ske_manager.get_pos_from_docid(
                ske_config, docid)  # this calls the API endpoint 'first'
            url = ske_manager.get_url_from_pos(ske_config, pos)
            insert_into_table(db_connection, db_cursor, docid_table_name, docid, pos, url)
            progressbar.next()

        progressbar.finish()

    # All set!
    ske_manager.close_session()
    db_manager.close_db_connection(db_connection, db_cursor)

    return