Exemplo n.º 1
0
    async def identify_story_bcp47_language_code(
            self, stories_id: int) -> Optional[str]:
        """
        Guess the story's language from its title and description.

        :param stories_id: Story to identify the language for.
        :return: BCP 47 language identifier, or None if the sample text was
            too small / ambiguous for a reliable guess.
        :raise McPermanentError: If the story doesn't exist.
        """
        log.info(f"Identifying story language for story {stories_id}...")

        db = connect_to_db_or_raise()

        story = db.find_by_id(table='stories', object_id=stories_id)
        if not story:
            raise McPermanentError(f"Story {stories_id} was not found.")

        # Podcast episodes typically come with title and description set so try guessing from that
        sample_text = "\n".join([
            story['title'],
            html_strip(story['description']),
        ])

        detected_code = None
        if identification_would_be_reliable(text=sample_text):
            iso_639_1_code = language_code_for_text(text=sample_text)

            # Convert to BCP 47 identifier
            detected_code = iso_639_1_code_to_bcp_47_identifier(
                iso_639_1_code=iso_639_1_code,
                url_hint=story['url'],
            )

        log.info(
            f"Language code for story {stories_id} is {detected_code}")

        return detected_code
Exemplo n.º 2
0
    async def fetch_enclosure_to_gcs(self, stories_id: int,
                                     enclosure: StoryEnclosureDict) -> None:
        """
        Download the story's enclosure and upload it to Google Cloud Storage.

        :param stories_id: Story whose enclosure is being fetched.
        :param enclosure: Enclosure (as a dict) chosen for this story.
        :raise McPermanentError: If the fetched file turns out to be empty.
        """
        log.info(f"Fetching enclosure to GCS for story {stories_id}")
        log.debug(f"Best enclosure for story {stories_id}: {enclosure}")

        story_enclosure = StoryEnclosure.from_dict(enclosure)

        with tempfile.TemporaryDirectory(
                prefix='fetch_enclosure_to_gcs') as temp_dir:

            local_path = os.path.join(temp_dir, 'raw_enclosure')
            fetch_big_file(
                url=story_enclosure.url,
                dest_file=local_path,
                max_size=self.config.max_enclosure_size(),
            )

            # Might happen with misconfigured webservers
            if not os.stat(local_path).st_size:
                raise McPermanentError(
                    f"Fetched file {local_path} is empty.")

            GCSStore(bucket_config=self.config.raw_enclosures()).upload_object(
                local_file_path=local_path,
                object_id=str(stories_id),
            )

        log.info(f"Done fetching enclosure to GCS for story {stories_id}")
Exemplo n.º 3
0
    def download_object(self, object_id: str, local_file_path: str) -> None:
        """
        Download a GCS object into a local file.

        :param object_id: Object ID of an object that should be downloaded.
        :param local_file_path: Local file that the object should be stored to.
        :raise McProgrammingError: If the destination already exists or the
            object ID is empty.
        :raise McPermanentError: If no such object exists in the bucket.
        :raise McTransientError: If the download itself fails.
        """
        # NOTE: check order matters for which error a caller sees first
        if os.path.isfile(local_file_path):
            raise McProgrammingError(
                f"Local file '{local_file_path}' already exists.")

        if not object_id:
            raise McProgrammingError("Object ID is unset.")

        log.debug(
            f"Downloading object ID {object_id} to '{local_file_path}'...")

        if not self.object_exists(object_id=object_id):
            raise McPermanentError(f"Object ID {object_id} was not found.")

        try:
            self._blob_from_object_id(object_id=object_id).download_to_filename(
                filename=local_file_path)
        except Exception as ex:
            # Treat any GCS client failure as transient so callers may retry
            raise McTransientError(
                f"Unable to download object ID {object_id} to '{local_file_path}': {ex}"
            )
Exemplo n.º 4
0
def fetch_big_file(url: str, dest_file: str, max_size: int = 0) -> None:
    """
    Fetch a huge file from an URL to a local file.

    Streams the response in 64 KiB chunks so the whole file never has to fit
    in memory; cleans up the partial destination file on any failure.

    Raises one of the _AbstractFetchBigFileException exceptions.

    :param url: URL that points to a huge file.
    :param dest_file: Destination path to write the fetched file to.
    :param max_size: If >0, limit the file size to a defined number of bytes.
    :raise: ProgrammingError on unexpected fatal conditions.
    """

    if os.path.exists(dest_file):
        # Something's wrong with the code
        raise McProgrammingError(f"Destination file '{dest_file}' already exists.")

    try:

        # Using "requests" as our UserAgent doesn't support writing directly to files
        with requests.get(url, stream=True) as r:
            r.raise_for_status()

            bytes_read = 0

            with open(dest_file, 'wb') as f:
                for chunk in r.iter_content(chunk_size=65536):
                    # Filter out keep-alive new chunks
                    if chunk:

                        bytes_read += len(chunk)
                        if max_size and bytes_read > max_size:
                            raise McPermanentError(f"The file is bigger than the max. size of {max_size}")

                        f.write(chunk)
                        f.flush()

    except McPermanentError:

        __cleanup_dest_file(dest_file=dest_file)

        # Bare re-raise keeps the original traceback without adding a frame
        raise

    except requests.exceptions.RequestException as ex:

        __cleanup_dest_file(dest_file=dest_file)

        # Chain the cause so the underlying HTTP error isn't lost
        raise McTransientError(f"'requests' exception while fetching {url}: {ex}") from ex

    except Exception as ex:

        __cleanup_dest_file(dest_file=dest_file)

        raise McTransientError(f"Unable to fetch and store {url}: {ex}") from ex

    if not os.path.isfile(dest_file):
        __cleanup_dest_file(dest_file=dest_file)

        # There should be something here so in some way it is us that have messed up
        raise McProgrammingError(f"Fetched file {dest_file} is not here after fetching it.")
Exemplo n.º 5
0
    async def transcribe_episode(self, stories_id: int) -> None:
        """
        Run the full transcription pipeline for a single story's episode.

        Identifies the language, picks and fetches the best enclosure,
        transcodes it, submits it to the Speech API, waits out the operation,
        and finally stores the transcript and queues extraction.

        :param stories_id: Story whose episode should be transcribed.
        :raise McPermanentError: If no viable enclosure exists or the episode
            is longer than the configured maximum duration.
        """
        language_code = await self.activities.identify_story_bcp47_language_code(
            stories_id)
        if language_code is None:
            # Default to English in case there wasn't enough sizable text in title / description to make a good guess
            language_code = 'en'

        enclosure = await self.activities.determine_best_enclosure(stories_id)
        if not enclosure:
            raise McPermanentError(
                f"No viable enclosure found for story {stories_id}")

        await self.activities.fetch_enclosure_to_gcs(stories_id, enclosure)

        metadata_dict = await self.activities.fetch_transcode_store_episode(
            stories_id)
        metadata = MediaFileInfoAudioStream.from_dict(metadata_dict)

        max_duration = PodcastTranscribeEpisodeConfig().max_duration()
        if metadata.duration > max_duration:
            raise McPermanentError(
                f"Episode's duration ({metadata.duration} s) exceeds max. duration ({max_duration} s)"
            )

        speech_operation_id = await self.activities.submit_transcribe_operation(
            stories_id,
            metadata_dict,
            language_code,
        )

        # Wait for Google Speech API to finish up transcribing
        await Workflow.sleep(int(metadata.duration * 1.1))

        await self.activities.fetch_store_raw_transcript_json(
            stories_id, speech_operation_id)
        await self.activities.fetch_store_transcript(stories_id)
        await self.activities.add_to_extraction_queue(stories_id)
Exemplo n.º 6
0
    async def determine_best_enclosure(
            self, stories_id: int) -> Optional[StoryEnclosureDict]:
        """
        Pick the most suitable enclosure for the story.

        :param stories_id: Story to pick the enclosure for.
        :return: Chosen enclosure serialized to a dict.
        :raise McPermanentError: If no viable enclosure exists or the chosen
            one exceeds the configured maximum size.
        """
        log.info(f"Determining best enclosure for story {stories_id}...")

        db = connect_to_db_or_raise()

        # Find the enclosure that might work the best
        chosen_enclosure = viable_story_enclosure(db=db, stories_id=stories_id)
        if not chosen_enclosure:
            raise McPermanentError(
                f"There were no viable enclosures found for story {stories_id}"
            )

        # Only enclosures with a declared length can be size-checked here
        if chosen_enclosure.length and \
                chosen_enclosure.length > self.config.max_enclosure_size():
            raise McPermanentError(
                f"Chosen enclosure {chosen_enclosure} is too big.")

        log.info(f"Done determining best enclosure for story {stories_id}")
        log.debug(f"Best enclosure for story {stories_id}: {chosen_enclosure}")

        return chosen_enclosure.to_dict()
Exemplo n.º 7
0
def transcode_file_if_needed(input_file: str, output_file: str) -> bool:
    """
    Transcode file (if needed) to something that Speech API will support.

    * If input has a video stream, it will be discarded;
    * If input has more than one audio stream, others will be discarded leaving only one (preferably the one that Speech
      API can support);
    * If input doesn't have an audio stream in Speech API-supported codec, it will be transcoded to lossless
      FLAC 16 bit in order to preserve quality;
    * If the chosen audio stream has multiple channels (e.g. stereo or 5.1), it will be mixed into a single (mono)
      channel as Speech API supports multi-channel recognition only when different voices speak into each of the
      channels.

    :param input_file: Input media file to consider for transcoding.
    :param output_file: If we decide to transcode, output media file to transcode to.
    :return: True if file had to be transcoded into "output_file", or False if input file can be used as it is.
    :raise McProgrammingError: If the input file doesn't exist.
    :raise McPermanentError: If the file has no audio streams at all.
    """

    if not os.path.isfile(input_file):
        raise McProgrammingError(f"File '{input_file}' does not exist.")

    # Independently from what <enclosure /> has told us, identify the file type again ourselves
    media_info = media_file_info(media_file_path=input_file)

    if not media_info.audio_streams:
        raise McPermanentError(
            "Downloaded file doesn't appear to have any audio streams.")

    # Accumulate ffmpeg arguments; if nothing gets added, no transcoding is needed
    ffmpeg_args = []

    supported_audio_stream = media_info.best_supported_audio_stream()
    if supported_audio_stream:
        log.info("Found a supported audio stream")

        # Test if there is more than one audio stream
        if len(media_info.audio_streams) > 1:
            log.info(
                "Found other audio streams besides the supported one, will discard those"
            )

            # Keep the container format of the supported stream's codec
            ffmpeg_args.extend([
                '-f',
                supported_audio_stream.audio_codec_class.
                ffmpeg_container_format()
            ])

            # Select all audio streams
            ffmpeg_args.extend(['-map', '0:a'])

            for stream in media_info.audio_streams:
                # Deselect the unsupported streams
                if stream != supported_audio_stream:
                    ffmpeg_args.extend(
                        ['-map', f'-0:a:{stream.ffmpeg_stream_index}'])

    # If a stream of a supported codec was not found, transcode it to FLAC 16 bit in order to not lose any quality
    else:
        log.info(
            "None of the audio streams are supported by the Speech API, will transcode to FLAC"
        )

        # Map first audio stream to input 0
        ffmpeg_args.extend(['-map', '0:a:0'])

        # Transcode to FLAC (16 bit) in order to not lose any quality
        ffmpeg_args.extend(['-acodec', 'flac'])
        ffmpeg_args.extend(['-f', 'flac'])
        ffmpeg_args.extend(['-sample_fmt', 's16'])

        # Ensure that we end up with mono audio
        ffmpeg_args.extend(['-ac', '1'])

    # If there's video in the file, remove it
    if media_info.has_video_streams:
        # Discard all video streams
        ffmpeg_args.extend(['-map', '-0:v'])

    if not ffmpeg_args:
        # No need to transcode -- caller should use the input file as-is
        return False

    log.info(f"Transcoding '{input_file}' to '{output_file}'...")

    # I wasn't sure how to map outputs in "ffmpeg-python" library so here we call ffmpeg directly
    ffmpeg_command = ['ffmpeg', '-nostdin', '-hide_banner', '-i', input_file
                      ] + ffmpeg_args + [output_file]
    log.debug(f"FFmpeg command: {ffmpeg_command}")
    subprocess.check_call(ffmpeg_command)

    log.info(f"Done transcoding '{input_file}' to '{output_file}'")

    return True
Exemplo n.º 8
0
def media_file_info(media_file_path: str) -> MediaFileInfo:
    """
    Read audio / video media file information, or raise if it can't be read.

    Probes the file with ffprobe (via the "ffmpeg" library), collects every
    audio stream whose channel count and duration can be determined, and
    notes whether any video streams are present. Streams that can't be
    parsed are logged and skipped rather than failing the whole probe.

    :param media_file_path: Full path to media file.
    :return: MediaFileInfo object.
    """
    if not os.path.isfile(media_file_path):
        # Input file should exist at this point; if it doesn't, we have probably messed up something in the code
        raise McProgrammingError(
            f"Input file {media_file_path} does not exist.")

    try:
        file_info = ffmpeg.probe(media_file_path)
        if not file_info:
            raise Exception("Returned metadata is empty.")
    except Exception as ex:
        raise McPermanentError(
            f"Unable to read metadata from file {media_file_path}: {ex}")

    if 'streams' not in file_info:
        # FFmpeg should come up with some sort of a stream in any case
        raise McProgrammingError("Returned probe doesn't have 'streams' key.")

    # Test if one of the audio streams is of one of the supported codecs
    audio_streams = []
    has_video_streams = False
    for stream in file_info['streams']:
        if stream['codec_type'] == 'audio':

            try:
                audio_channel_count = int(stream['channels'])
                if audio_channel_count == 0:
                    raise Exception("Audio channel count is 0")
            except Exception as ex:
                log.warning(
                    f"Unable to read audio channel count from stream {stream}: {ex}"
                )
                # Just skip this stream if we can't figure it out
                continue

            audio_codec_class = None

            # We'll need to transcode audio files with more than one channel count anyway,
            # so only mono streams are matched against the supported codec classes
            if audio_channel_count == 1:
                for codec in _SUPPORTED_CODEC_CLASSES:
                    if codec.ffmpeg_stream_is_this_codec(ffmpeg_stream=stream):
                        audio_codec_class = codec
                        break

            try:

                if 'duration' in stream:
                    # 'duration': '3.766621'
                    duration = math.floor(float(stream['duration']))

                elif 'DURATION' in stream.get('tags', {}):
                    # Fallback: some files expose duration only as a tag,
                    # e.g. 'DURATION': '00:00:03.824000000'
                    duration_parts = stream['tags']['DURATION'].split(':')
                    if len(duration_parts) != 3:
                        raise McPermanentError(
                            f"Unable to parse 'DURATION': {duration_parts}")

                    hh = int(duration_parts[0])
                    mm = int(duration_parts[1])
                    ss_ms = duration_parts[2].split('.')

                    if len(ss_ms) == 1:
                        ss = int(ss_ms[0])
                        ms = 0
                    elif len(ss_ms) == 2:
                        ss = int(ss_ms[0])
                        ms = int(ss_ms[1])
                    else:
                        raise McPermanentError(
                            f"Unable to parse 'DURATION': {duration_parts}")

                    # Any fractional second counts as one extra whole second (round up)
                    duration = hh * 3600 + mm * 60 + ss + (1 if ms > 0 else 0)

                else:
                    raise McPermanentError(
                        f"Stream doesn't have duration: {stream}")

                audio_stream = MediaFileInfoAudioStream(
                    ffmpeg_stream_index=stream['index'],
                    audio_codec_class=audio_codec_class,
                    duration=duration,
                    audio_channel_count=audio_channel_count,
                    sample_rate=int(stream['sample_rate']),
                )
                audio_streams.append(audio_stream)

            except Exception as ex:
                # Just skip this stream if we can't figure it out
                log.warning(
                    f"Unable to read audio stream data for stream {stream}: {ex}"
                )

        elif stream['codec_type'] == 'video':
            has_video_streams = True

    return MediaFileInfo(
        audio_streams=audio_streams,
        has_video_streams=has_video_streams,
    )