Example #1
0
def _get_db_escaped_story_sentence_dicts(
        db: DatabaseHandler,
        story: dict,
        sentences: List[str],
) -> List[Dict[str, str]]:
    """Given a list of text sentences, return a list of sentence dicts with values escaped for SQL insertion.

    Each returned dict has the keys 'sentence', 'language', 'sentence_number',
    'stories_id', 'media_id' and 'publish_date'; string/timestamp values are
    quoted via the DatabaseHandler so they can be interpolated into a VALUES
    list, and numeric values are stringified.

    :param db: database handler used for quoting values.
    :param story: story dict; 'language', 'stories_id', 'media_id' and
        'publish_date' keys are read.
    :param sentences: sentences of the story, in order; each gets a 0-based
        'sentence_number'.
    :return: list of escaped sentence dicts, one per input sentence.
    """
    story = decode_object_from_bytes_if_needed(story)
    sentences = decode_object_from_bytes_if_needed(sentences)

    sentence_dicts = []

    # enumerate() replaces the original manual counter; numbering stays 0-based
    for sentence_num, sentence in enumerate(sentences):

        # Identify the language of each of the sentences
        sentence_lang = language_code_for_text(sentence)
        if (sentence_lang or '') != (story['language'] or ''):
            # Mark the language as unknown if the results for the sentence are not reliable
            if not identification_would_be_reliable(text=sentence):
                sentence_lang = ''

        sentence_dicts.append({
            'sentence': db.quote_varchar(sentence),
            'language': db.quote_varchar(sentence_lang),
            'sentence_number': str(sentence_num),
            'stories_id': str(story['stories_id']),
            'media_id': str(story['media_id']),
            'publish_date': db.quote_timestamp(story['publish_date']),
        })

    return sentence_dicts
Example #2
0
def _insert_story_sentences(
    db: DatabaseHandler,
    story: dict,
    sentences: List[str],
    no_dedup_sentences: bool = False,
) -> List[str]:
    """Insert the story sentences into story_sentences, optionally skipping duplicate sentences by setting is_dup = 't'
    to the found duplicates that are already in the table.

    Assumes no rows for this story exist yet in "story_sentences"; when
    re-extracting a story, DELETE its old sentences first.

    :param db: database handler.
    :param story: story dict; 'stories_id', 'media_id' and 'publish_date' are read.
    :param sentences: sentences to insert, in story order.
    :param no_dedup_sentences: if true, skip deduplication and insert everything.

    Returns list of sentences that were inserted into the table.
    """

    story = decode_object_from_bytes_if_needed(story)
    sentences = decode_object_from_bytes_if_needed(sentences)
    # no_dedup_sentences may arrive bytes-encoded (e.g. b'0' / b'1') from a
    # Perl/serialization boundary; normalize it to a real bool
    if isinstance(no_dedup_sentences, bytes):
        no_dedup_sentences = decode_object_from_bytes_if_needed(
            no_dedup_sentences)
    no_dedup_sentences = bool(int(no_dedup_sentences))

    stories_id = story['stories_id']
    media_id = story['media_id']

    # Story's publish date is the same for all the sentences, so we might as well pass it as a constant
    escaped_story_publish_date = db.quote_date(story['publish_date'])

    # Nothing to insert for an empty sentence list
    if len(sentences) == 0:
        log.warning(f"Story sentences are empty for story {stories_id}")
        return []

    if no_dedup_sentences:
        log.debug(
            f"Won't de-duplicate sentences for story {stories_id} because 'no_dedup_sentences' is set"
        )

        # Placeholder statement with the same "shape" as the dedup UPDATE
        # below (it is spliced into the same WITH query) that returns no rows
        dedup_sentences_statement = """

            -- Nothing to deduplicate, return empty list
            SELECT NULL
            WHERE 1 = 0

        """

    else:

        # Limit to unique sentences within a story
        sentences = _get_unique_sentences_in_story(sentences)

        # Set is_dup = 't' to sentences already in the table, return those to be later skipped on INSERT of new
        # sentences.  Duplicates are matched by half_md5(sentence) + media_id
        # within the same publish week (week_start_date).
        dedup_sentences_statement = f"""

            -- noinspection SqlResolve
            UPDATE story_sentences
            SET is_dup = 't'
            FROM new_sentences
            WHERE public.half_md5(story_sentences.sentence) = public.half_md5(new_sentences.sentence)
              AND public.week_start_date(story_sentences.publish_date::date) = public.week_start_date({escaped_story_publish_date})
              AND story_sentences.media_id = new_sentences.media_id
            RETURNING story_sentences.sentence

        """

    # Convert to list of dicts (values escaped for insertion into database)
    sentence_dicts = _get_db_escaped_story_sentence_dicts(db=db,
                                                          story=story,
                                                          sentences=sentences)

    # Ordered list of columns (alphabetical; must match the column order used
    # in both WITH clauses below)
    story_sentences_columns = sorted(sentence_dicts[0].keys())
    str_story_sentences_columns = ', '.join(story_sentences_columns)

    # List of sentences (in predefined column order), rendered as
    # "(v1, v2, ...)" rows for a VALUES list; values are pre-escaped above
    new_sentences_sql = []
    for sentence_dict in sentence_dicts:
        new_sentence_sql = []
        for column in story_sentences_columns:
            new_sentence_sql.append(sentence_dict[column])
        new_sentences_sql.append(f"({', '.join(new_sentence_sql)})")
    str_new_sentences_sql = "\n{}".format(",\n".join(new_sentences_sql))

    # sometimes the big story_sentences query below deadlocks sticks in an idle state, holding this lock so we set a
    # short idle timeout for postgres just while we do this query. the timeout should not kick in while the
    # big story_sentences query is actively processing, so we can set it pretty short. we usually set this timeout
    # to 0 globally, but just to be safe store and reset the pre-existing value.
    # NOTE(review): the saved timeout is only restored on the success path at
    # the bottom; if a query raises, the session keeps the 5000 ms value.  A
    # try/finally restore would itself fail inside an aborted transaction, so
    # any fix needs care — confirm intended behavior with the DB layer.
    idle_timeout = db.query(
        "SHOW idle_in_transaction_session_timeout").flat()[0]
    db.query("SET idle_in_transaction_session_timeout = 5000")

    db.query('SET citus.max_adaptive_executor_pool_size TO 64')

    # Step 1: mark pre-existing duplicates (or select nothing when dedup is
    # disabled) and collect the duplicate sentence texts
    sql = f"""
        -- noinspection SqlType,SqlResolve
        WITH new_sentences ({str_story_sentences_columns}) AS (VALUES
            -- New sentences to potentially insert
            {str_new_sentences_sql}
        )

        -- Either list of duplicate sentences already found in the table or return an empty list if deduplication is
        -- disabled
        --
        -- The query assumes that there are no existing sentences for this story in the "story_sentences" table, so
        -- if you are reextracting a story, DELETE its sentences from "story_sentences" before running this query.
        {dedup_sentences_statement}

    """
    log.debug(f"Running 'UPDATE story_sentences SET is_dup' query:\n{sql}")
    duplicate_sentences = db.query(sql).flat()

    # Re-escape the returned sentence texts for interpolation into the ARRAY
    # literal of the INSERT query below
    duplicate_sentences = [
        db.quote_varchar(sentence) for sentence in duplicate_sentences
    ]

    # Step 2: insert every new sentence that was not just flagged as a
    # duplicate, returning the inserted sentence texts
    sql = f"""
        -- noinspection SqlType,SqlResolve
        WITH new_sentences ({str_story_sentences_columns}) AS (VALUES
            {str_new_sentences_sql}
        ),
        duplicate_sentences AS (
            SELECT unnest(ARRAY[{', '.join(duplicate_sentences)}]::TEXT[]) AS sentence
        )
        INSERT INTO story_sentences (language, media_id, publish_date, sentence, sentence_number, stories_id)
        SELECT language, media_id, publish_date, sentence, sentence_number, stories_id
        FROM new_sentences
        WHERE sentence NOT IN (
            -- Skip the ones for which we've just set is_dup = 't'
            SELECT sentence
            FROM duplicate_sentences
        )
        RETURNING story_sentences.sentence
    """
    log.debug(f"Running 'INSERT INTO story_sentences' query:\n{sql}")
    inserted_sentences = db.query(sql).flat()

    # Restore the idle timeout that was in effect before this function ran
    db.query("SET idle_in_transaction_session_timeout = %(a)s",
             {'a': idle_timeout})

    return inserted_sentences