Example #1
0
def test_identification_would_be_reliable_digits():
    """Digit-heavy strings must be reported as unreliable for language identification."""
    digits_only = '0000000000000000000000'
    mostly_digits = '000000000000000aaaaaaa'

    # A string of nothing but digits carries no language signal.
    assert identification_would_be_reliable(text=digits_only) is False

    # Digits outnumbering letters is still not enough signal.
    assert identification_would_be_reliable(text=mostly_digits) is False
def test_identification_would_be_reliable():
    """Empty / None input is unreliable; sample sentences of every enabled language are reliable."""
    assert identification_would_be_reliable(text='') is False
    # noinspection PyTypeChecker
    assert identification_would_be_reliable(text=None) is False

    # Every enabled language's sample sentence should identify reliably.
    for code in LanguageFactory.enabled_languages():
        sample = LanguageFactory.language_for_code(code).sample_sentence()
        assert identification_would_be_reliable(text=sample)
Example #3
0
def test_identification_would_be_reliable():
    """Reject empty / None text; accept each enabled language's sample sentence."""
    # Degenerate inputs can never be identified reliably.
    assert identification_would_be_reliable(text='') is False
    # noinspection PyTypeChecker
    assert identification_would_be_reliable(text=None) is False

    enabled = LanguageFactory.enabled_languages()
    for code in enabled:
        lang = LanguageFactory.language_for_code(code)
        sentence = lang.sample_sentence()
        assert identification_would_be_reliable(text=sentence)
Example #4
0
def _get_db_escaped_story_sentence_dicts(
        db: DatabaseHandler,
        story: dict,
        sentences: List[str],
) -> List[Dict[str, str]]:
    """
    Given a list of text sentences, return a list of sentences with properly escaped values for insertion.

    :param db: Database handler (used for quoting varchar / timestamp values).
    :param story: Story dict; reads 'language', 'stories_id', 'media_id' and 'publish_date' keys.
    :param sentences: Story's sentences in order; order determines 'sentence_number'.
    :return: One dict of DB-escaped values per input sentence.
    """
    story = decode_object_from_bytes_if_needed(story)
    sentences = decode_object_from_bytes_if_needed(sentences)

    # The story's language is loop-invariant, so normalize it once up front.
    story_language = story['language'] or ''

    sentence_dicts = []

    # enumerate() replaces the original manually-incremented sentence counter.
    for sentence_num, sentence in enumerate(sentences):

        # Identify the language of each of the sentences
        sentence_lang = language_code_for_text(sentence)
        if (sentence_lang or '') != story_language:
            # Mark the language as unknown if the results for the sentence are not reliable
            if not identification_would_be_reliable(text=sentence):
                sentence_lang = ''

        sentence_dicts.append({
            'sentence': db.quote_varchar(sentence),
            'language': db.quote_varchar(sentence_lang),
            'sentence_number': str(sentence_num),
            'stories_id': str(story['stories_id']),
            'media_id': str(story['media_id']),
            'publish_date': db.quote_timestamp(story['publish_date']),
        })

    return sentence_dicts
Example #5
0
    async def identify_story_bcp47_language_code(
            self, stories_id: int) -> Optional[str]:
        """Guess the story's language from its title + description and return it as a BCP 47 code, or None."""
        log.info(f"Identifying story language for story {stories_id}...")

        db = connect_to_db_or_raise()

        story = db.find_by_id(table='stories', object_id=stories_id)
        if not story:
            raise McPermanentError(f"Story {stories_id} was not found.")

        # Podcast episodes typically come with title and description set so try guessing from that
        sample_text = f"{story['title']}\n{html_strip(story['description'])}"

        bcp_47_language_code = None
        if identification_would_be_reliable(text=sample_text):
            # Detect ISO 639-1 code, then convert it to a BCP 47 identifier.
            bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier(
                iso_639_1_code=language_code_for_text(text=sample_text),
                url_hint=story['url'],
            )

        log.info(
            f"Language code for story {stories_id} is {bcp_47_language_code}")

        return bcp_47_language_code
    def __next__(self) -> List[str]:
        """Return list of next sentence's words to be added to the word2vec vector."""

        if self.__copy_to is None:
            raise StopIteration

        raw_line = self.__copy_to.get_line()

        if raw_line is None:
            # COPY TO is exhausted -- close it and stop iterating.
            self.__copy_to.end()
            self.__copy_to = None
            raise StopIteration

        sentence = raw_line.strip()

        self.__sentence_counter += 1
        if self.__sentence_counter % 1000 == 0:
            log.info("Feeding sentence %d..." % self.__sentence_counter)

        if not sentence:
            return []

        # Detect the sentence's language; fall back to the default language.
        language = None
        if identification_would_be_reliable(sentence):
            detected_code = language_code_for_text(sentence)
            language = LanguageFactory.language_for_code(detected_code)
        if language is None:
            language = LanguageFactory.default_language()

        words = language.split_sentence_to_words(sentence)

        return words if words else []
Example #7
0
    def __next__(self) -> List[str]:
        """Fetch the next sentence and split it into words for the word2vec vector."""

        next_sentence = self.__next_sentence()
        if next_sentence is None:
            raise StopIteration

        stripped = next_sentence.strip()
        if not stripped:
            return []

        # Try to detect the sentence's language; otherwise use the default one.
        language = None
        if identification_would_be_reliable(stripped):
            detected_code = language_code_for_text(stripped)
            language = LanguageFactory.language_for_code(detected_code)
        if language is None:
            language = LanguageFactory.default_language()

        tokenized = language.split_sentence_to_words(stripped)
        if not tokenized:
            return []

        return tokenized
    def __next__(self) -> List[str]:
        """Return the next sentence's word list to be added to the word2vec vector."""

        sentence = self.__next_sentence()
        if sentence is None:
            # Sentence source is exhausted.
            raise StopIteration

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        # Pick a language for tokenization, defaulting when detection is unreliable.
        if identification_would_be_reliable(sentence):
            language = LanguageFactory.language_for_code(
                language_code_for_text(sentence))
        else:
            language = None

        if language is None:
            language = LanguageFactory.default_language()

        words = language.split_sentence_to_words(sentence)

        return words if len(words) else []
Example #9
0
def fetch_and_store_episode(
        db: DatabaseHandler,
        stories_id: int,
        config: Optional[PodcastFetchEpisodeConfig] = None) -> None:
    """
    Choose a viable story enclosure for podcast, fetch it, transcode if needed, store to GCS, and record to DB.

    1) Determines the episode's likely language by looking into its title and description, converts the language code to
       BCP 47;
    2) Using enclosures from "story_enclosures", chooses the one that looks like a podcast episode the most;
    3) Fetches the chosen enclosure;
    4) Transcodes the file (if needed) by:
        a) converting it to an audio format that the Speech API can support, and / or
        b) discarding video stream from the media file, and / or
        c) discarding other audio streams from the media file;
    5) Reads the various parameters, e.g. sample rate, of the episode audio file;
    6) Uploads the episode audio file to Google Cloud Storage;
    7) Adds a row to "podcast_episodes".

    Adding a job to submit the newly created episode to Speech API (by adding a RabbitMQ job) is up to the caller.

    :param db: Database handler.
    :param stories_id: Story ID for the story to operate on.
    :param config: (optional) Podcast fetcher configuration object (useful for testing).
    :raises McStoryNotFoundException: if the story does not exist.
    :raises McPodcastNoViableStoryEnclosuresException: if no usable enclosure is found.
    :raises McPodcastEnclosureTooBigException: if the enclosure exceeds MAX_ENCLOSURE_SIZE.
    :raises McPodcastFileStoreFailureException: if the temporary directory cannot be created.
    :raises McPodcastFileFetchFailureException: if the fetched file is empty.
    :raises McPodcastGCSStoreFailureException: if GCS cleanup after a DB failure fails.
    :raises McPodcastPostgreSQLException: if the final database insert fails.
    """

    if not config:
        config = PodcastFetchEpisodeConfig()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McStoryNotFoundException(f"Story {stories_id} was not found.")

    # Try to determine language of the story
    story_title = story['title']
    story_description = html_strip(story['description'])
    sample_text = f"{story_title}\n{story_description}"

    iso_639_1_language_code = None
    if identification_would_be_reliable(text=sample_text):
        iso_639_1_language_code = language_code_for_text(text=sample_text)

    # Default to English when the language could not be identified reliably
    if not iso_639_1_language_code:
        iso_639_1_language_code = 'en'

    # Convert to BCP 47 identifier
    bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier(
        iso_639_1_code=iso_639_1_language_code,
        url_hint=story['url'],
    )

    # Find the enclosure that might work the best
    best_enclosure = podcast_viable_enclosure_for_story(db=db,
                                                        stories_id=stories_id)
    if not best_enclosure:
        raise McPodcastNoViableStoryEnclosuresException(
            f"There were no viable enclosures found for story {stories_id}")

    # Reject oversized enclosures up front (length may be unset, hence the guard)
    if best_enclosure.length:
        if best_enclosure.length > MAX_ENCLOSURE_SIZE:
            raise McPodcastEnclosureTooBigException(
                f"Chosen enclosure {best_enclosure} is too big.")

    try:
        temp_dir = tempfile.mkdtemp('fetch_and_store')
    except Exception as ex:
        raise McPodcastFileStoreFailureException(
            f"Unable to create temporary directory: {ex}")

    # Fetch enclosure
    input_filename = 'input_file'
    input_file_path = os.path.join(temp_dir, input_filename)
    log.info(f"Fetching enclosure {best_enclosure} to {input_file_path}...")
    fetch_big_file(url=best_enclosure.url,
                   dest_file=input_file_path,
                   max_size=MAX_ENCLOSURE_SIZE)
    log.info(f"Done fetching enclosure {best_enclosure} to {input_file_path}")

    if os.stat(input_file_path).st_size == 0:
        # Might happen with misconfigured webservers
        raise McPodcastFileFetchFailureException(
            f"Fetched file {input_file_path} is empty.")

    # Transcode if needed
    input_file_obj = TranscodeTempDirAndFile(temp_dir=temp_dir,
                                             filename=input_filename)
    transcoded_file_obj = transcode_media_file_if_needed(
        input_media_file=input_file_obj)

    # Unset the variable so that we don't accidentally use it later
    del input_filename, temp_dir

    if input_file_obj != transcoded_file_obj:
        # Function did some transcoding and stored everything in yet another file

        # Remove the input file
        _cleanup_temp_dir(temp=input_file_obj)

        # Consider the transcoded file the new input file
        input_file_obj = transcoded_file_obj

    # (Re)read the properties of either the original or the transcoded file
    media_info = media_file_info(media_file_path=input_file_obj.temp_full_path)
    best_audio_stream = media_info.best_supported_audio_stream()

    # Store input file to GCS
    try:
        gcs = GCSStore(config=config)
        gcs_uri = gcs.store_object(
            local_file_path=input_file_obj.temp_full_path,
            object_id=str(stories_id),
            mime_type=best_audio_stream.audio_codec_class.mime_type(),
        )

    except Exception as ex:

        log.error(
            f"Unable to store episode file '{input_file_obj.temp_full_path}' for story {stories_id}: {ex}"
        )

        # Clean up, then raise further
        _cleanup_temp_dir(temp=input_file_obj)

        raise ex

    # Clean up the locally stored file as we don't need it anymore
    _cleanup_temp_dir(temp=input_file_obj)

    # Insert everything to the database
    # (upsert so that re-fetching the same story updates the existing row)
    try:
        db.query(
            """
            INSERT INTO podcast_episodes (
                stories_id,
                story_enclosures_id,
                gcs_uri,
                duration,
                codec,
                sample_rate,
                bcp47_language_code
            ) VALUES (
                %(stories_id)s,
                %(story_enclosures_id)s,
                %(gcs_uri)s,
                %(duration)s,
                %(codec)s,
                %(sample_rate)s,
                %(bcp47_language_code)s            
            ) ON CONFLICT (stories_id) DO UPDATE SET
                story_enclosures_id = %(story_enclosures_id)s,
                gcs_uri = %(gcs_uri)s,
                duration = %(duration)s,
                codec = %(codec)s,
                sample_rate = %(sample_rate)s,
                bcp47_language_code = %(bcp47_language_code)s
        """, {
                'stories_id':
                stories_id,
                'story_enclosures_id':
                best_enclosure.story_enclosures_id,
                'gcs_uri':
                gcs_uri,
                'duration':
                best_audio_stream.duration,
                'codec':
                best_audio_stream.audio_codec_class.postgresql_enum_value(),
                'sample_rate':
                best_audio_stream.sample_rate,
                'bcp47_language_code':
                bcp_47_language_code,
            })

    except Exception as ex_db:

        # Try to delete object on GCS first
        try:
            gcs.delete_object(object_id=str(stories_id))
        except Exception as ex_gcs:
            # We should be able to delete it as we've just uploaded it
            raise McPodcastGCSStoreFailureException((
                f"Unable to clean up story's {stories_id} audio file from GCS after database insert failure; "
                f"database insert exception: {ex_db}; "
                f"GCS exception: {ex_gcs}"))

        raise McPodcastPostgreSQLException(
            f"Failed inserting episode for story {stories_id}: {ex_db}")
def test_identification_would_be_reliable_digits():
    """Strings dominated by digits must be rejected as unreliable."""
    # Nothing but digits.
    assert identification_would_be_reliable(
        text='0000000000000000000000') is False

    # Digits outnumbering letters.
    assert identification_would_be_reliable(
        text='000000000000000aaaaaaa') is False