from ekstep_data_pipelines.common.file_system.gcp_file_systen import GCPFileSystem from ekstep_data_pipelines.common.utils import get_logger from ekstep_data_pipelines.common import BaseProcessor from ekstep_data_pipelines.data_marker.constants import ( CONFIG_NAME, FILTER_CRITERIA, LANDING_BASE_PATH, SOURCE_BASE_PATH, ) from ekstep_data_pipelines.data_marker.data_filter import DataFilter from ekstep_data_pipelines.data_marker.data_mover import MediaFilesMover ESTIMATED_CPU_SHARE = 0.02 Logger = get_logger("Data marker") class DataMarker(BaseProcessor): """ 1. Load Configuration 2. Filter data baased on criteria 2. Tag/Mark data in the DB 3. Move marked data """ @staticmethod def get_instance(data_processor_instance, gcs_instance, **kwargs): return DataMarker(data_processor_instance, gcs_instance, **kwargs) def __init__(self, postgres_client, gcs_instance, **kwargs): self.postgres_client = postgres_client
import pandas as pd from ekstep_data_pipelines.common.utils import get_logger Logger = get_logger("DataFilter") class DataFilter(object): def exclude_audio_ids(self, utterances, audio_ids): excluding_audio_ids = filter(lambda t: t[0] not in audio_ids, utterances) return excluding_audio_ids def exclude_speaker_ids(self, utterances, speaker_ids): excluding_speaker_ids = filter(lambda t: t[0] not in speaker_ids, utterances) return excluding_speaker_ids def by_utterance_duration(self, utterances, filters): by_utterance_duration = filter( lambda t: filters["lte"] >= t[2] >= filters["gte"], utterances) return by_utterance_duration def by_snr(self, utterances, filters): by_snr_utterances = filter( lambda t: filters["lte"] >= t[4] >= filters["gte"], utterances) return by_snr_utterances def by_duration( self, utterances,
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import ( BaseTranscriptionSanitizer, ) from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import ( TranscriptionSanitizationError, ) from ekstep_data_pipelines.common.utils import get_logger import re LOGGER = get_logger("PunjabiSanitizer") class PunjabiSanitizer(BaseTranscriptionSanitizer): VALID_CHARS = "[ ਼ ਂ ੍ੑ ਾ ਿ ੀ ੁ ੂ ੇ ੈ ੋੰੱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲਲ਼ਵਸ਼ਸਹਖ਼-ੜਫ਼]+" PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।" @staticmethod def get_instance(**kwargs): return PunjabiSanitizer() def __init__(self, *args, **kwargs): pass def sanitize(self, transcription): LOGGER.info("Sanitizing transcription:" + transcription) transcription = transcription.strip() transcription = self.replace_bad_char(transcription) transcription = transcription.strip()
from ekstep_data_pipelines.audio_processing.generate_hash import ( get_hash_code_of_audio_file, ) from ekstep_data_pipelines.audio_processing.constants import ( CONFIG_NAME, REMOTE_RAW_FILE, CHUNKING_CONFIG, SNR_CONFIG, REMOTE_PROCESSED_FILE_PATH, MASTER_META_DATA_FILE_PATH, SNR_DONE_FOLDER_PATH, DUPLICATE_AUDIO_FOLDER_PATH, ) from ekstep_data_pipelines.common.utils import get_logger from ekstep_data_pipelines.common import BaseProcessor Logger = get_logger("Audio Processor") class AudioProcessor(BaseProcessor): """ Class for breaking a downloaded file into smaller chunks of audio files as well as filtering out files with more than an acceptable level of Sound to Noise Ratio(or SNR) """ DEFAULT_DOWNLOAD_PATH = "/tmp/audio_processing_raw" @staticmethod def get_instance(data_processor, gcs_instance, audio_commons, catalogue_dao, **kwargs): return AudioProcessor(data_processor, gcs_instance, audio_commons,
import os from google.cloud import speech_v1 from google.cloud.speech_v1 import enums from ekstep_data_pipelines.common.audio_commons.transcription_clients.transcription_client_errors import ( GoogleTranscriptionClientError, ) from ekstep_data_pipelines.common.utils import get_logger LOGGER = get_logger("GoogleTranscriptionClient") class GoogleTranscriptionClient(object): @staticmethod def get_instance(config_dict): google_config_dict = config_dict.get("common", {}).get( "google_transcription_client", {}) return GoogleTranscriptionClient(**google_config_dict) def __init__(self, **config_dict): self.language = config_dict.get("language", "hi-IN") self.sample_rate = config_dict.get("sample_rate", 16000) self.channels = config_dict.get("audio_channel_count", 1) self.bucket = config_dict.get("bucket") self._client = None def make_directories(self, path): if not os.path.exists(path): LOGGER(f"Directory {path} not does already exist") os.makedirs(path) LOGGER.info("Directory %s created successfully", path)
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import ( BaseTranscriptionSanitizer, ) from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import ( TranscriptionSanitizationError, ) from ekstep_data_pipelines.common.utils import get_logger import re LOGGER = get_logger("KannadaTranscriptionSanitizer") class KannadaSanitizer(BaseTranscriptionSanitizer): VALID_CHARS = "[ ಂ-ಃಅ-ಋಎ-ಐಒ-ನಪ-ರಲ-ಳವ-ಹಾ-ೄೆ-ೈೊ-್ೲ]+" PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।" @staticmethod def get_instance(**kwargs): return KannadaSanitizer() def __init__(self, *args, **kwargs): pass def sanitize(self, transcription): LOGGER.info("Sanitizing transcription:" + transcription) transcription = transcription.strip() transcription = self.replace_bad_char(transcription) transcription = transcription.strip()
from ekstep_data_pipelines.common.utils import get_logger from ekstep_data_pipelines.audio_analysis.speaker_analysis.speaker_clustering import ( create_speaker_clusters, ) from ekstep_data_pipelines.audio_analysis.speaker_analysis.file_cluster_mapping import ( speaker_to_file_name_map, ) Logger = get_logger("AudioSpeakerClusteringProcessor") def create_embeddings( local_audio_download_path, dir_pattern, embed_file_path, fs_interface, npz_bucket_destination_path, source_name, ): is_uploaded = fs_interface.upload_to_location(embed_file_path, npz_bucket_destination_path) if is_uploaded: Logger.info("npz file uploaded to :" + npz_bucket_destination_path) else: Logger.info("npz file could not be uploaded to :" + npz_bucket_destination_path) file_map_dict, noise_file_map_dict = create_speaker_clusters( embed_file_path, source_name) speaker_to_file_name = speaker_to_file_name_map(file_map_dict) Logger.info("total speakers:" + str(len(speaker_to_file_name)))
import re from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import ( BaseTranscriptionSanitizer, ) from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import ( TranscriptionSanitizationError, ) from ekstep_data_pipelines.common.utils import get_logger LOGGER = get_logger("MalayalamTranscriptionSanitizer") class MalayalamSanitizer(BaseTranscriptionSanitizer): VALID_CHARS = "[ ം-ഃഅ-ഋഎ-ഐഒ-നപ-ഺാ-ൃെ-ൈൊ-്ൺ-ൾ]+" PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।" @staticmethod def get_instance(**kwargs): return MalayalamSanitizer() def __init__(self, *args, **kwargs): pass def sanitize(self, transcription): LOGGER.info("Sanitizing transcription:%s", transcription) transcription = transcription.strip() transcription = self.replace_bad_char(transcription) transcription = transcription.strip() if len(transcription) == 0:
# import signal import sys import multiprocessing import os from concurrent.futures import ThreadPoolExecutor from ekstep_data_pipelines.common.utils import get_logger from ekstep_data_pipelines.common import BaseProcessor from ekstep_data_pipelines.audio_embedding.create_embeddings import ( encode_each_batch) LOGGER = get_logger("AudioEmbeddingProcessor") ESTIMATED_CPU_SHARE = 0.1 class AudioEmbedding(BaseProcessor): """ Class to identify speaker for each utterance in a source """ local_txt_path = "./audio_speaker_cluster/file_path/" local_audio_path = "./audio_speaker_cluster/audio_files/" embed_file_path = "./audio_speaker_cluster/embed_file_path/" @staticmethod def get_instance(data_processor, **kwargs): return AudioEmbedding(data_processor, **kwargs) def __init__(self, data_processor, **kwargs):
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import ( BaseTranscriptionSanitizer, ) from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import ( TranscriptionSanitizationError, ) from ekstep_data_pipelines.common.utils import get_logger import re LOGGER = get_logger("IndianEnglishSanitizer") class IndianEnglishSanitizer(BaseTranscriptionSanitizer): VALID_CHARS = "[ a-zA-Z0-9']" PUNCTUATION = '!"#%&()*+,./;<=>?@[\\]^_`{|}~ред' @staticmethod def get_instance(**kwargs): return IndianEnglishSanitizer() def __init__(self, *args, **kwargs): pass def sanitize(self, transcription): LOGGER.info("Sanitizing transcription:" + transcription) transcription = transcription.strip() transcription = self.replace_bad_char(transcription) transcription = transcription.strip()
import pandas as pd from ekstep_data_pipelines.common.utils import get_logger from sqlalchemy import text from ekstep_data_pipelines.common.dao.constants import ( GET_UNIQUE_ID, IS_EXIST, COMMAND_WITH_LICENSE, COMMAND_WITHOUT_LICENSE, LICENSE, ) LOGGER = get_logger("CatalogueDao") class CatalogueDao: def __init__(self, postgres_client): self.postgres_client = postgres_client def get_utterances(self, audio_id): parm_dict = {"audio_id": audio_id} utterances = self.postgres_client.execute_query( "select utterances_files_list from media_metadata_staging where audio_id = :audio_id", **parm_dict, ) return json.loads(utterances[0][0]) if len(utterances) > 0 else [] def get_utterances_by_source(self, source, status):
import re from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import ( BaseTranscriptionSanitizer, ) from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import ( TranscriptionSanitizationError, ) from ekstep_data_pipelines.common.utils import get_logger LOGGER = get_logger("UrduTranscriptionSanitizer") class UrduSanitizer(BaseTranscriptionSanitizer): VALID_CHARS = "[ ء-آؤئ-بت-غف-قل-نؤٹپچڈڑژکگںھہیے-ۓ]+" PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।" @staticmethod def get_instance(**kwargs): return UrduSanitizer() def __init__(self, *args, **kwargs): pass def sanitize(self, transcription): LOGGER.info("Sanitizing transcription:%s", transcription) transcription = transcription.strip() transcription = self.replace_bad_char(transcription) transcription = transcription.strip() if len(transcription) == 0:
import json import os import shutil import subprocess import pandas as pd from ekstep_data_pipelines.audio_language_identification.audio_language_inference import ( infer_language, ) from ekstep_data_pipelines.audio_processing.audio_duration import calculate_duration from ekstep_data_pipelines.common.utils import get_logger LOGGER = get_logger("Snr") class SNR: """ Util object for performing SNR analysis over different """ MAX_DURATION = 15 @staticmethod def get_instance(initialization_dict): feat_language_identification = initialization_dict.get( "audio_processor_config", {} ).get("feat_language_identification", False) LOGGER.info( "Running with feat_language_identification=%s", str(feat_language_identification), )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer,
)
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("GujratiTranscriptionSanitizer")


class GujratiSanitizer(BaseTranscriptionSanitizer):
    """Placeholder sanitizer for Gujarati transcriptions.

    No sanitization rules are implemented here: ``sanitize`` ignores its
    input and returns ``None``.
    """

    @staticmethod
    def get_instance(**kwargs):
        """Return a new ``GujratiSanitizer``; keyword arguments are ignored."""
        return GujratiSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and discard any arguments; the instance holds no state."""

    def sanitize(self, transcription):
        """Leave ``transcription`` untouched and return ``None``."""
        return None
from concurrent.futures import ThreadPoolExecutor

from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("MediaFilesMover")


class MediaFilesMover(object):
    """Move media files to a landing location using a pool of worker threads."""

    def __init__(self, file_system, concurrency):
        """Store the collaborators used by ``move_media_files``.

        Args:
            file_system: object exposing ``mv_file(file, target_dir)``.
            concurrency: maximum number of worker threads to use.
        """
        self.file_system = file_system
        self.concurrency = concurrency

    def move_media_files(self, files, landing_path_with_source):
        """Move every path in ``files`` below ``landing_path_with_source``.

        The two directory components immediately above the file name are
        appended to ``landing_path_with_source`` to form each file's target
        directory. All moves are submitted to a thread pool and this method
        blocks until they have finished.

        A failed move does not abort the others and is not re-raised; it is
        logged as an error so that it no longer disappears silently.

        Args:
            files: iterable of "/"-separated file paths.
            landing_path_with_source: base directory the files are moved to.
        """
        Logger.info("using concurrency:%s", str(self.concurrency))
        submitted = []
        # The context manager guarantees shutdown(wait=True) even when the
        # submit loop itself raises, so worker threads are never leaked.
        with ThreadPoolExecutor(max_workers=self.concurrency) as worker_pool:
            for file in files:
                relative_audio_id_clean_path = "/".join(file.split("/")[-3:-1])
                landing_path = (
                    f"{landing_path_with_source}/{relative_audio_id_clean_path}"
                )
                future = worker_pool.submit(
                    self.file_system.mv_file, file, landing_path
                )
                submitted.append((file, future))

        # Every future is complete once the pool has been shut down; report
        # the ones that raised instead of dropping their exceptions.
        for file, future in submitted:
            error = future.exception()
            if error is not None:
                Logger.error("Failed to move file %s: %s", file, error)
import re from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import ( BaseTranscriptionSanitizer, ) from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import ( TranscriptionSanitizationError, ) from ekstep_data_pipelines.common.utils import get_logger LOGGER = get_logger("BengaliTranscriptionSanitizer") class BengaliSanitizer(BaseTranscriptionSanitizer): VALID_CHARS = "[ ঁ-ঃঅ-ঋএ-ঐও-নপ-রলশ-হ়া-্ে-ৈো-ৎয়]+" PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।" @staticmethod def get_instance(**kwargs): return BengaliSanitizer() def __init__(self, *args, **kwargs): pass def sanitize(self, transcription): LOGGER.info("Sanitizing transcription:%s", transcription) transcription = transcription.strip() transcription = self.replace_bad_char(transcription) transcription = transcription.strip() if len(transcription) == 0:
import glob import os import subprocess import collections import contextlib import sys import wave import webrtcvad import sox from ekstep_data_pipelines.common.utils import get_logger Logger = get_logger("Chunking Util") class ChunkingConversionUtil: re_chunking_aggressiveness = 3 @staticmethod def get_instance(): return ChunkingConversionUtil() def convert_to_wav(self, input_dir, output_dir=None, ext="mp4"): Logger.info(f"Convert all the files in {input_dir} to wav") audio_paths = glob.glob(input_dir + "/*." + ext) Logger.info(f"Files to be completed: {audio_paths}") if len(audio_paths) < 1:
class ACTIONS: DATA_MARKING = "data_marking" AUDIO_PROCESSING = "audio_processing" AUDIO_TRANSCRIPTION = "audio_transcription" AUDIO_ANALYSIS = "audio_analysis" AUDIO_CATALOGUER = "audio_cataloguer" class FILE_SYSTEMS: GOOGLE = "google" LOCAL = "local" LOGGER = get_logger("EKSTEP_PROCESSOR") ACTIONS_LIST = [ ACTIONS.DATA_MARKING, ACTIONS.AUDIO_PROCESSING, ACTIONS.AUDIO_TRANSCRIPTION, ACTIONS.AUDIO_ANALYSIS, ACTIONS.AUDIO_CATALOGUER, ] FILES_SYSTEMS_LIST = [FILE_SYSTEMS.GOOGLE, FILE_SYSTEMS.LOCAL] # config_bucket = 'ekstepspeechrecognition-dev' parser = argparse.ArgumentParser( description="Util for data processing for EkStep") parser.add_argument( "-b",
from azure.cognitiveservices import speech from ekstep_data_pipelines.common.utils import get_logger from ekstep_data_pipelines.common.audio_commons.transcription_clients.transcription_client_errors import ( AzureTranscriptionClientError, ) LOGGER = get_logger("AzureTranscriptionClient") class AzureTranscriptionClient(object): @staticmethod def get_instance(config_dict): azure_config_dict = config_dict.get("common", {}).get( "azure_transcription_client", {}) return AzureTranscriptionClient(**azure_config_dict) def __init__(self, **kwargs): self.speech_key = kwargs.get("speech_key") self.service_region = kwargs.get("service_region") self.language = kwargs.get("language", "hi-IN") self.speech_config = speech.SpeechConfig(subscription=self.speech_key, region=self.service_region) def generate_transcription(self, language, source_file_path): try: result = self.speech_to_text(source_file_path) except RuntimeError as error: raise AzureTranscriptionClientError(error) return result.text def speech_to_text(self, audio_file_path): audio_input = speech.audio.AudioConfig(filename=audio_file_path)
import os from os import listdir from os.path import isfile, join from google.cloud import storage from ekstep_data_pipelines.common.infra_commons.storage import BaseStorageInterface from concurrent.futures import ThreadPoolExecutor from ekstep_data_pipelines.common.infra_commons.storage.exceptions import ( FileNotFoundException, PathDoesNotExist, ) from ekstep_data_pipelines.common.utils import get_logger from tqdm import tqdm Logger = get_logger("GoogleStorage") class GoogleStorage(BaseStorageInterface): def __init__(self, **kwargs): self._client = None def get_bucket_from_path(self, path) -> str: if not path: return None splitted_path = list(filter(None, path.split("/"))) if len(splitted_path) < 1: return None return splitted_path[0]
from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("GCPFileSystem")


class GCPFileSystem:
    """File-system style helper (ls / mv) built on a GCP operations object."""

    def __init__(self, gcp_operations):
        """Keep the operations object that every call is delegated to.

        Args:
            gcp_operations: object providing ``list_blobs_in_a_path``,
                ``check_path_exists`` and ``move_blob``.
        """
        self.gcp_operations = gcp_operations

    def ls(self, dir_path):
        """Return the ``name`` of every blob listed under ``dir_path``."""
        paths = self.gcp_operations.list_blobs_in_a_path(dir_path)
        return [path.name for path in paths]

    def mv(self, source_dir, target_dir, is_dir=True):
        """Move every file listed under ``source_dir`` into ``target_dir``.

        When ``is_dir`` is true and the source path does not exist, the
        condition is logged and nothing is moved.
        """
        # NOTE(review): ``self`` (this GCPFileSystem) is passed explicitly as
        # the first argument to a bound method, which looks unintended.
        # The signature of check_path_exists is not visible from here, so the
        # call is left unchanged - confirm against its definition.
        if is_dir and not self.gcp_operations.check_path_exists(
                self, source_dir):
            Logger.info("source dir does not exist:%s", source_dir)
            return
        files = self.ls(source_dir)
        for file in files:
            self.mv_file(file, target_dir)

    def mv_file(self, file, target_dir):
        """Move one blob into ``target_dir``, keeping only its base name.

        The destination is ``target_dir`` + "/" + the last path component of
        ``file``. It is built from the base name rather than by string
        replacement, because replacing the parent directory text also
        rewrote file names that contain it (``data/data.wav`` became
        ``target/target.wav``) and garbled paths that have no "/" at all.
        """
        file_name = file.rpartition("/")[2]
        destination_blob_name = f"{target_dir}/{file_name}"
        Logger.info("Moving file %s --> %s", file, destination_blob_name)
        self.gcp_operations.move_blob(file, destination_blob_name)
import re from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import ( BaseTranscriptionSanitizer, ) from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import ( TranscriptionSanitizationError, ) from ekstep_data_pipelines.common.utils import get_logger LOGGER = get_logger("TeluguTranscriptionSanitizer") class TeluguSanitizer(BaseTranscriptionSanitizer): VALID_CHARS = "[ ం-ఃఅ-ఌఎ-ఐఒ-నప-ళవ-హా-ౄె-ైొ-్ౠ]+" PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।" @staticmethod def get_instance(**kwargs): return TeluguSanitizer() def __init__(self, *args, **kwargs): pass def sanitize(self, transcription): LOGGER.info("Sanitizing transcription:%s", transcription) transcription = transcription.strip() transcription = self.replace_bad_char(transcription) transcription = transcription.strip()
import multiprocessing import os import yaml import shutil, glob from os import listdir from os.path import isfile, join from google.cloud import storage from concurrent.futures import ThreadPoolExecutor import datetime from ekstep_data_pipelines.common.utils import get_logger Logger = get_logger("GCS Operations") class CloudStorageOperations: @staticmethod def get_instance(config_dict, **kwargs): gcs_instance = CloudStorageOperations(config_dict, **kwargs) return gcs_instance def __init__(self, config_dict, **kwargs): self.config_dict = config_dict self._bucket = None self._client = None @property def client(self): if self._client: return self._client self._client = storage.Client()
import re from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import ( BaseTranscriptionSanitizer, ) from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import ( TranscriptionSanitizationError, ) from ekstep_data_pipelines.common.utils import get_logger LOGGER = get_logger("AssameseTranscriptionSanitizer") class AssameseSanitizer(BaseTranscriptionSanitizer): VALID_CHARS = "[ ঁ-ঃঅ-ঋএ-ঐও-চচ-নপ-যলশ-হা-ৃে-ৈো-ৎৗড়-ঢ়য়-ৠৰ-ৱ৺]+" PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।" @staticmethod def get_instance(**kwargs): return AssameseSanitizer() def __init__(self, *args, **kwargs): pass def sanitize(self, transcription): LOGGER.info("Sanitizing transcription:%s", transcription) transcription = transcription.strip() transcription = self.replace_bad_char(transcription) transcription = transcription.strip()
AUDIO_LANGUAGE, ) from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import ( TranscriptionSanitizationError, ) from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import ( get_transcription_sanitizers, ) from ekstep_data_pipelines.common.audio_commons.transcription_clients.transcription_client_errors import ( AzureTranscriptionClientError, GoogleTranscriptionClientError, ) from ekstep_data_pipelines.common.file_utils import get_file_name from ekstep_data_pipelines.common.utils import get_logger from ekstep_data_pipelines.common import BaseProcessor import os LOGGER = get_logger("audio_transcription") class AudioTranscription(BaseProcessor): LOCAL_PATH = None @staticmethod def get_instance(data_processor, gcs_instance, audio_commons, catalogue_dao, **kwargs): return AudioTranscription(data_processor, gcs_instance, audio_commons, catalogue_dao, **kwargs) def __init__(self, data_processor, gcs_instance, audio_commons, catalogue_dao, **kwargs): self.data_processor = data_processor self.gcs_instance = gcs_instance
MAX_LOAD_DATE_FOR_MEDIA_QUERY, INSERT_INTO_MEDIA_TABLE_QUERY, INSERT_UNIQUE_SPEAKER_QUERY, GET_AUDIO_ID_QUERY, DEFULT_QUERY_FOR_INSERT_INTO_MAPPING_TABLE, GET_SPEAKER_ID_QUERY, FETCH_QUERY_WHERE_SPEAKER_IS_NULL, DEFAULT_INSERT_QUERY, DEFAULT_UPDATE_QUERY_FOR_NORMALIZED_FLAG, GET_LOAD_TIME_FOR_AUDIO_QUERY, GET_UTTERANCES_LIST_OF_AUDIO_ID, ) from ekstep_data_pipelines.common import BaseProcessor from ekstep_data_pipelines.common.utils import get_logger Logger = get_logger("Audio_cataloguer") class AudioCataloguer(BaseProcessor): """ docstring cataloguer """ @staticmethod def get_instance(data_processor): return AudioCataloguer(data_processor) def __init__(self, data_processor): self.data_processor = data_processor def process(self, **kwargs):
import hashlib

from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("hash_code")


def get_hash_code_of_audio_file(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is read in fixed-size chunks inside a ``with`` block, so the
    handle is always closed and large audio files are never loaded into
    memory in one piece. The digest is identical to hashing the whole
    content at once.

    Args:
        file_path: path of the file to hash.

    Returns:
        The MD5 digest as a lowercase hex string.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    md5_hash = hashlib.md5()
    with open(file_path, "rb") as audio_file:
        # iter() with a sentinel stops at the empty bytes returned on EOF.
        for chunk in iter(lambda: audio_file.read(65536), b""):
            md5_hash.update(chunk)
    digest = md5_hash.hexdigest()
    LOGGER.info("Given file is %s and hash is %s", file_path, digest)
    return digest
import re from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import ( BaseTranscriptionSanitizer, ) from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import ( TranscriptionSanitizationError, ) from ekstep_data_pipelines.common.utils import get_logger LOGGER = get_logger("HindiTranscriptionSanitizer") class HindiSanitizer(BaseTranscriptionSanitizer): VALID_CHARS = "[ ँ-ःअ-ऋए-ऑओ-नप-रलव-ह़ा-ृे-ॉो-्0-9क़-य़ ॅ]" PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।" @staticmethod def get_instance(**kwargs): return HindiSanitizer() def __init__(self, *agrs, **kwargs): pass def sanitize(self, transcription: str): LOGGER.info("Sanitizing transcription:" + transcription) transcription = ( transcription.strip() ) # removes spaces from the starting and ending of transcription if ":" in transcription: raise TranscriptionSanitizationError("transcription has :")
from ekstep_data_pipelines.audio_analysis.audio_embeddings.gender_inference import (
    load_model,
    get_prediction_from_npz_file,
)
from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("analyse_speakers")


def analyse_gender(embed_file_path):
    """Run the gender model over an embeddings file.

    Loads the classifier stored at a fixed path inside the package and
    returns whatever ``get_prediction_from_npz_file`` produces for
    ``embed_file_path``.
    """
    Logger.info("Start analyse gender")
    model_path = "ekstep_data_pipelines/audio_analysis/models/clf_svc.sav"
    classifier = load_model(model_path)
    return get_prediction_from_npz_file(classifier, embed_file_path)
import librosa
import sox

from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("audio_duration")


def calculate_duration(input_filepath):
    """Return the duration of ``input_filepath`` as reported by sox.

    The value returned by ``sox.file_info.duration`` is logged and passed
    back unchanged.
    """
    duration = sox.file_info.duration(input_filepath)
    LOGGER.info("Duration for input_filepath:%s : %s",
                input_filepath, str(duration))
    return duration


def calculate_duration_librosa(input_filepath):
    """Return the duration of ``input_filepath`` computed with librosa.

    The audio is loaded with librosa's default settings and the duration is
    derived from the loaded samples together with the sample rate that
    ``librosa.load`` returned.
    """
    y, sr = librosa.load(input_filepath)
    # Keyword arguments are required: newer librosa releases reject a
    # positional ``y``. Passing ``sr`` keeps the result tied to the rate the
    # samples were actually loaded at instead of an implicit default.
    return librosa.get_duration(y=y, sr=sr)