def _get_db_escaped_story_sentence_dicts(
        db: DatabaseHandler,
        story: dict,
        sentences: List[str],
) -> List[Dict[str, str]]:
    """Given a list of text sentences, return a list of sentences with properly escaped values for insertion.

    :param db: database handler used to escape the string / timestamp values
    :param story: story dict; reads the 'language', 'stories_id', 'media_id' and 'publish_date' keys
    :param sentences: list of the story's sentences, in order
    :return: list of dicts (one per sentence) whose values are already DB-escaped strings, suitable for
        direct interpolation into an INSERT / VALUES statement
    """
    story = decode_object_from_bytes_if_needed(story)
    sentences = decode_object_from_bytes_if_needed(sentences)

    sentence_dicts = []

    # enumerate() supplies the 0-based sentence number instead of a manually maintained counter
    for sentence_num, sentence in enumerate(sentences):

        # Identify the language of each of the sentences
        sentence_lang = language_code_for_text(sentence)
        if (sentence_lang or '') != (story['language'] or ''):
            # Mark the language as unknown if the results for the sentence are not reliable
            if not identification_would_be_reliable(text=sentence):
                sentence_lang = ''

        sentence_dicts.append({
            'sentence': db.quote_varchar(sentence),
            'language': db.quote_varchar(sentence_lang),
            'sentence_number': str(sentence_num),
            'stories_id': str(story['stories_id']),
            'media_id': str(story['media_id']),
            'publish_date': db.quote_timestamp(story['publish_date']),
        })

    return sentence_dicts
def _insert_story_sentences(
        db: DatabaseHandler,
        story: dict,
        sentences: List[str],
        no_dedup_sentences: bool = False,
) -> List[str]:
    """Insert the story sentences into story_sentences, optionally skipping duplicate sentences by setting is_dup = 't'
    to the found duplicates that are already in the table.

    :param db: database handler
    :param story: story dict; reads the 'stories_id', 'media_id' and 'publish_date' keys
    :param sentences: sentences to insert for the story
    :param no_dedup_sentences: if True, skip deduplication against sentences already in the table
    :return: list of sentences that were inserted into the table
    """
    story = decode_object_from_bytes_if_needed(story)
    sentences = decode_object_from_bytes_if_needed(sentences)
    if isinstance(no_dedup_sentences, bytes):
        no_dedup_sentences = decode_object_from_bytes_if_needed(no_dedup_sentences)
    no_dedup_sentences = bool(int(no_dedup_sentences))

    stories_id = story['stories_id']
    media_id = story['media_id']

    # Story's publish date is the same for all the sentences, so we might as well pass it as a constant
    escaped_story_publish_date = db.quote_date(story['publish_date'])

    if len(sentences) == 0:
        log.warning(f"Story sentences are empty for story {stories_id}")
        return []

    if no_dedup_sentences:
        log.debug(f"Won't de-duplicate sentences for story {stories_id} because 'no_dedup_sentences' is set")
        dedup_sentences_statement = """
            -- Nothing to deduplicate, return empty list
            SELECT NULL
            WHERE 1 = 0
        """
    else:
        # Limit to unique sentences within a story
        sentences = _get_unique_sentences_in_story(sentences)

        # Set is_dup = 't' to sentences already in the table, return those to be later skipped on INSERT of new
        # sentences
        dedup_sentences_statement = f"""
            -- noinspection SqlResolve
            UPDATE story_sentences
            SET is_dup = 't'
            FROM new_sentences
            WHERE public.half_md5(story_sentences.sentence) = public.half_md5(new_sentences.sentence)
              AND public.week_start_date(story_sentences.publish_date::date) =
                  public.week_start_date({escaped_story_publish_date})
              AND story_sentences.media_id = new_sentences.media_id
            RETURNING story_sentences.sentence
        """

    # Convert to list of dicts (values escaped for insertion into database)
    sentence_dicts = _get_db_escaped_story_sentence_dicts(db=db, story=story, sentences=sentences)

    # Ordered list of columns
    story_sentences_columns = sorted(sentence_dicts[0].keys())
    str_story_sentences_columns = ', '.join(story_sentences_columns)

    # List of sentences (in predefined column order); values are pre-escaped by
    # _get_db_escaped_story_sentence_dicts() so they can be interpolated directly
    new_sentences_sql = [
        "({})".format(', '.join(sentence_dict[column] for column in story_sentences_columns))
        for sentence_dict in sentence_dicts
    ]
    str_new_sentences_sql = "\n{}".format(",\n".join(new_sentences_sql))

    # sometimes the big story_sentences query below deadlocks sticks in an idle state, holding this lock so we set a
    # short idle timeout for postgres just while we do this query. the timeout should not kick in while the
    # big story_sentences query is actively processing, so we can set it pretty short. we usually set this timeout
    # to 0 globally, but just to be safe store and reset the pre-existing value.
    idle_timeout = db.query("SHOW idle_in_transaction_session_timeout").flat()[0]
    db.query("SET idle_in_transaction_session_timeout = 5000")

    db.query('SET citus.max_adaptive_executor_pool_size TO 64')

    try:
        sql = f"""
            -- noinspection SqlType,SqlResolve
            WITH new_sentences ({str_story_sentences_columns}) AS (VALUES
                -- New sentences to potentially insert
                {str_new_sentences_sql}
            )

            -- Either list of duplicate sentences already found in the table or return an empty list if deduplication is
            -- disabled
            --
            -- The query assumes that there are no existing sentences for this story in the "story_sentences" table, so
            -- if you are reextracting a story, DELETE its sentences from "story_sentences" before running this query.
            {dedup_sentences_statement}
        """

        log.debug(f"Running 'UPDATE story_sentences SET is_dup' query:\n{sql}")
        duplicate_sentences = db.query(sql).flat()

        duplicate_sentences = [db.quote_varchar(sentence) for sentence in duplicate_sentences]

        sql = f"""
            -- noinspection SqlType,SqlResolve
            WITH new_sentences ({str_story_sentences_columns}) AS (VALUES
                {str_new_sentences_sql}
            ),

            duplicate_sentences AS (
                SELECT unnest(ARRAY[{', '.join(duplicate_sentences)}]::TEXT[]) AS sentence
            )

            INSERT INTO story_sentences (language, media_id, publish_date, sentence, sentence_number, stories_id)
            SELECT language, media_id, publish_date, sentence, sentence_number, stories_id
            FROM new_sentences
            WHERE sentence NOT IN (
                -- Skip the ones for which we've just set is_dup = 't'
                SELECT sentence
                FROM duplicate_sentences
            )
            RETURNING story_sentences.sentence
        """

        log.debug(f"Running 'INSERT INTO story_sentences' query:\n{sql}")
        inserted_sentences = db.query(sql).flat()

    finally:
        # Restore the pre-existing idle timeout even if one of the queries above raised, so the short 5 s
        # timeout does not leak into the rest of the session.
        db.query("SET idle_in_transaction_session_timeout = %(a)s", {'a': idle_timeout})

    return inserted_sentences