def test_filter_files_doc():
    """The ``.doc`` entry is expected to be missing from the filtered list."""
    from utilities.utilities import Utilities

    file_names = ['file.txt', 'hello.doc', 'file.wav']
    kept = Utilities().filter_files(file_names)
    assert kept == ['file.txt', 'file.wav']
def test_get_ref_set():
    """Only '2.txt' and '3.txt' are expected back from a mixed listing."""
    from utilities.utilities import Utilities

    listing = ['hello.wav', 'foo.doc', 'bar.flac', '2.txt', '3.txt']
    ref_set = Utilities()._get_ref_set(listing)
    assert ref_set == {'2.txt', '3.txt'}
def test_get_audio_set():
    """Only the three ``.wav`` names are expected back from a mixed listing."""
    from utilities.utilities import Utilities

    listing = ['a.wav', 'foo.txt', 'c.txt', 'b.wav', 'c.wav', 'a.txt', 'b.txt']
    audio_set = Utilities().get_audio_set(listing)
    assert audio_set == {'a.wav', 'b.wav', 'c.wav'}
def test_get_root_filename_from_file_name():
    """A bare file name 'file.doc' is expected to yield the root 'file'."""
    from utilities.utilities import Utilities

    root_name = Utilities().get_root_filename('file.doc')
    assert root_name == 'file'
def test_filter_files_orphan_ref():
    """'orphan.txt', which has no same-named audio entry, is expected to be dropped."""
    from utilities.utilities import Utilities

    file_names = ['blah.txt', 'blah.ogg', 'orphan.txt']
    kept = Utilities().filter_files(file_names)
    assert kept == ['blah.txt', 'blah.ogg']
def test_clean_url(self):
    """The URL at the end of the comment is expected to be removed by clean_url."""
    raw_comment = (
        "vom 13. bis 16. mai findet in Coburg das 34. BMW Veteranentreffen statt,"
        "Infos auch unter http://www.facebook.com/TourismusCoburg?v=app_2344061033&ref=ts#!/event.php"
        "?eid=110512678986182&index=1"
    )
    without_url = "vom 13. bis 16. mai findet in Coburg das 34. BMW Veteranentreffen statt,Infos auch unter"

    self.assertEqual(Utilities().clean_url(raw_comment), without_url)
def test_is_valid_file_extension_true2():
    """The upper-case extension 'MP3' is expected to be reported as valid."""
    from utilities.utilities import Utilities

    validator = Utilities()
    outcome = validator._is_valid_file_extension('MP3')
    # Equality rather than identity, mirroring the comparison this test has always made.
    assert outcome == True  # noqa: E712
def test_is_valid_file_extension_false():
    """The extension 'doc' is expected to be reported as not valid."""
    from utilities.utilities import Utilities

    validator = Utilities()
    outcome = validator._is_valid_file_extension('doc')
    # Equality rather than identity, mirroring the comparison this test has always made.
    assert outcome == False  # noqa: E712
def test_get_count_of_word_instances():
    """Each distinct word is expected to map to the number of times it occurs."""
    from utilities.utilities import Utilities

    words = ['hello', 'hi', 'hello', 'there', 'hello']
    counts = Utilities().get_count_of_word_instances(words)
    assert counts == {'hello': 3, 'hi': 1, 'there': 1}
def test_get_root_filename_from_uri():
    """A full ``gs://`` URI is expected to yield just the root 'file'."""
    from utilities.utilities import Utilities

    root_name = Utilities().get_root_filename('gs://foo/bar/baz/file.wav')
    assert root_name == 'file'
def test_string_to_enum_speex_with_header_byte():
    """'SPEEX_WITH_HEADER_BYTE' is expected to map to the enum member of the same name."""
    from utilities.utilities import Utilities
    from google.cloud.speech_v1p1beta1 import enums

    audio_encoding = enums.RecognitionConfig.AudioEncoding
    converted = Utilities().string_to_enum('SPEEX_WITH_HEADER_BYTE')
    assert converted == audio_encoding.SPEEX_WITH_HEADER_BYTE
def test_string_to_enum_ogg_opus():
    """The mixed-case text 'OGG_opus' is expected to map to OGG_OPUS."""
    from utilities.utilities import Utilities
    from google.cloud.speech_v1p1beta1 import enums

    audio_encoding = enums.RecognitionConfig.AudioEncoding
    converted = Utilities().string_to_enum('OGG_opus')
    assert converted == audio_encoding.OGG_OPUS
def test_string_to_enum_amr_wb():
    """The mixed-case text 'amr_WB' is expected to map to AMR_WB."""
    from utilities.utilities import Utilities
    from google.cloud.speech_v1p1beta1 import enums

    audio_encoding = enums.RecognitionConfig.AudioEncoding
    converted = Utilities().string_to_enum('amr_WB')
    assert converted == audio_encoding.AMR_WB
def test_string_to_enum_unspecified():
    """An empty string is expected to map to ENCODING_UNSPECIFIED."""
    from utilities.utilities import Utilities
    from google.cloud.speech_v1p1beta1 import enums

    audio_encoding = enums.RecognitionConfig.AudioEncoding
    converted = Utilities().string_to_enum('')
    assert converted == audio_encoding.ENCODING_UNSPECIFIED
def test_append_uri_2():
    """A URI that already ends in '/' is expected to be joined by plain concatenation."""
    from utilities.utilities import Utilities

    file_name = 'this.wav'
    folder_uri = 'gs://foo/bar/'
    joined = Utilities().append_uri(folder_uri, file_name)
    assert joined == folder_uri + file_name
def test_detect_language_langdetect(self):
    """Print every line of ``comments.txt`` next to the language detected for it.

    This is a smoke run rather than a check: nothing is asserted, the
    detector's answer is only printed after the input line.
    """
    cleaner = Utilities()
    with open("comments.txt") as f:
        for line in f:
            # Double quotes are removed before the line reaches the detector.
            line = line.replace('"', "")
            lang = cleaner.check_language_languagedetect(line)
            # Single-argument print() produces the same output on Python 2
            # and 3; the old ``print x`` statement is a SyntaxError on 3.
            print(line)
            print(lang)
def test_clean_multiple_punctuations(self):
    """Runs of repeated punctuation are expected to collapse to a single mark."""
    cases = [
        (
            "schön zu sehen das sowas in coburg stattfindet...vielleicht besuche ich mal wieder die alte"
            " heimat.. gut zu wissen ;)",
            "schön zu sehen das sowas in coburg stattfindet. vielleicht besuche ich mal wieder die alte"
            " heimat. gut zu wissen ;)",
        ),
        ("Genau für Mich!!!!", "Genau für Mich!"),
    ]
    cleaner = Utilities()

    for raw, cleaned in cases:
        self.assertEqual(cleaner.clean_multiple_punctuations(raw), cleaned)
def test_parse_uri_2():
    """Bucket, folder and file name are checked for a URI with a nested folder."""
    from utilities.utilities import Utilities

    parsed = Utilities().parse_uri('gs://foo/bar/baz/test.flac')
    # parse_uri yields five values; the scheme and path slots are not checked here.
    _scheme, bucket, _path, folder, file_name = parsed

    assert bucket == 'foo'
    assert folder == 'bar/baz'
    assert file_name == 'test.flac'
def __init__(self, browser='chrome'):
    """Create the Selenium driver for the requested browser.

    Args:
        browser: Browser name, compared case-insensitively. ``'chrome'``
            starts Chrome with a chromedriver binary whose location is
            derived from this file's real path via
            ``Utilities.move_up_directory``; ``'firefox'`` starts Firefox.
            For any other value ``driver_provider`` is left unset.
    """
    # The backslash is escaped explicitly: '\c' is not a recognised escape
    # sequence, so the unescaped spelling triggers an invalid-escape warning
    # on current Python versions. The resulting string is unchanged.
    if sys.platform == "win32":
        chrome = '\\chromedriver.exe'
    else:
        chrome = '/chromedriver'

    utilities = Utilities()
    current_file_path = os.path.realpath(__file__)
    new_path = utilities.move_up_directory(current_file_path, 1)

    browser_name = browser.lower()
    if browser_name == 'chrome':
        self.driver_provider = webdriver.Chrome(new_path + chrome)
    elif browser_name == 'firefox':
        self.driver_provider = webdriver.Firefox()
def read_ref(self, uri, txt_file):
    """Download a reference transcript from Cloud Storage and normalise it.

    Args:
        uri: Storage URI; ``self._parse_uri`` splits it into a bucket name
            and a folder.
        txt_file: Name of the text file, looked up inside that folder (or
            at the bucket root when the folder is empty).

    Returns:
        The file content decoded as latin-1, with newline characters
        removed, lower-cased and passed through ``Utilities.strip_puc``.

    Raises:
        FileNotFoundError: If the bucket holds no object at the computed path.
    """
    from google.cloud import storage

    logger = logging.getLogger(__name__)
    client = storage.Client()
    bucket_name, folder = self._parse_uri(uri)
    path = f"{folder}/{txt_file}" if folder else f"{txt_file}"

    blob = client.bucket(bucket_name).get_blob(path)
    if blob is None:
        # get_blob returns None for a missing object; fail with a message
        # that names the object instead of an AttributeError on None.
        raise FileNotFoundError(
            f"Reference file not found: gs://{bucket_name}/{path}"
        )

    text = blob.download_as_string().decode('latin-1')
    text = text.replace('\n', '').lower()
    text = Utilities().strip_puc(text=text)
    logger.debug(f'REF STRIPPED: {text}')
    return text
def test_create_unique_root_2():
    """The unique root is expected to spell out every non-default setting applied below."""
    from utilities.utilities import Utilities
    from model.configuration import Configuration
    from model.nlp import NLPModel

    helper = Utilities()
    speech_config = Configuration()
    nlp_options = NLPModel()

    # Recognition settings.
    speech_config.set_model('video')
    speech_config.set_use_enhanced(False)
    speech_config.set_language_code('fr_FR')
    speech_config.set_alternative_language_codes(['en-US', 'ru-RU'])
    speech_config.set_speech_context('hi', 5)

    # Text post-processing settings.
    nlp_options.set_remove_stop_words(True)
    nlp_options.set_apply_stemming(False)
    nlp_options.set_expand_contractions(True)
    nlp_options.set_n2w(True)

    unique_root = helper.create_unique_root('12345', speech_config, nlp_options)

    assert unique_root == (
        '12345_video_fr_FR_alts_applied_speech_adaptation_applied_boost_5'
        '_stop_words_removed_contractions_expanded_numbers_converted_2_words'
    )
def test_get_extension_flac():
    """'woooooo.flac' is expected to yield the extension 'flac' without the dot."""
    from utilities.utilities import Utilities

    extension = Utilities()._get_extension('woooooo.flac')
    assert extension == 'flac'
import dash_table
import base64
import pandas as pd
import numpy as np
from file_handle import File_Handle
from SIR_model import SIR
from SIR_predict import SirPredict
from utilities.utilities import Utilities
import matplotlib.pyplot as plt

#%%
# Instances of the project helpers used below and later in the script.
handle = File_Handle()
sirmodel = SIR()
sirpredict = SirPredict()
utl = Utilities()
# NOTE(review): `go` is not imported in the visible part of the file;
# presumably plotly.graph_objects, imported further up. Confirm.
fig = go.Figure()

#%%
# Downloading data: both downloads go through the File_Handle instance.
handle.download_censo_file()
# The return value of the COVID download is kept in file_status; it is not
# used in the visible part of the script.
file_status = handle.download_covid_file()

#%%
# Loading data
# Census sheet 'Mpios'; the column header is taken from row index 8.
censo_df = pd.read_excel('data/ProyeccionMunicipios2005_2020.xls', sheet_name = 'Mpios',header=8)
# Rewrite two municipality names in the MPIO column; all other values are
# kept as they are. Presumably this aligns them with the spelling used in
# the COVID case file. TODO confirm.
censo_df['MPIO'] = np.where(censo_df['MPIO'] == 'Bogotá, D.C.', 'Bogotá D.C.', censo_df['MPIO'])
censo_df['MPIO'] = np.where(censo_df['MPIO'] == 'Cartagena', 'Cartagena de Indias', censo_df['MPIO'])
# Positive COVID-19 case records, read from the local CSV file.
data_org = pd.read_csv('data/Casos_positivos_de_COVID-19_en_Colombia.csv')
enc = args.encoding sample_rate_hertz = args.sample_rate_hertz language_codes = args.langs phrase_file_path = args.phrase_file boosts = [int(i) for i in args.boosts] if not no_zeros_for_boost: boosts.append(0) alternative_language_codes = args.alternative_languages encoding = args.encoding random_queue = args.random_queue use_fake_hyp = args.fake_hyp # init utilities utilities = Utilities() # # Audit phrase file # phrases = list() if phrase_file_path: phrases = io_handler.read_file(phrase_file_path) if phrases: if no_zeros_for_boost: speech_context_runs = [True] else: speech_context_runs = [False, True] logger.debug(f'PHRASES: {phrases}') else:
def get_hypothesis(self, uri, configuration):
    """Asynchronously transcribe the audio at ``uri`` and return normalised text.

    Args:
        uri: Audio location, sent to the API as ``{"uri": uri}``.
        configuration: Object whose getters supply every recognition setting
            placed in the request config below.

    Returns:
        The first alternative of each result, each prefixed with a space,
        concatenated, passed through ``Utilities.strip_puc`` and lower-cased.

    Raises:
        TimeoutError: If the operation is still not done at the 30000th poll.
        Any exception raised by ``long_running_recognize`` or
        ``operation.result`` propagates unchanged.
    """
    import time

    sleep_time = 5
    max_polls = 30000

    client = speech.SpeechClient()
    config = {
        "model": configuration.get_model(),
        "use_enhanced": configuration.get_use_enhanced(),
        "encoding": configuration.get_encoding(),
        "sample_rate_hertz": configuration.get_sample_rate_hertz(),
        "language_code": configuration.get_language_code(),
        "alternative_language_codes": configuration.get_alternative_language_codes(),
        "audio_channel_count": configuration.get_audio_channel_count(),
        "enable_separate_recognition_per_channel": configuration.get_enable_separate_recognition_per_channel(),
        "enable_speaker_diarization": configuration.get_enableSpeakerDiarization(),
        "diarization_speaker_count": configuration.get_diarizationSpeakerCount(),
        "enable_automatic_punctuation": configuration.get_enableAutomaticPunctuation(),
        "speech_contexts": configuration.get_speech_context(),
    }
    audio = {"uri": uri}

    # Errors from the request (e.g. an invalid argument) propagate as-is;
    # catching them only to re-raise added nothing.
    operation = client.long_running_recognize(config=config, audio=audio)

    polls = 0
    while not operation.done():
        print(
            f"{operation.metadata.progress_percent}% complete - updates every {sleep_time} seconds"
        )
        polls += 1
        # Give up at the last allowed poll, before sleeping again.
        if polls >= max_polls:
            raise TimeoutError("Time out processing audio")
        time.sleep(sleep_time)
    print(
        f"{operation.metadata.progress_percent}% complete - updates every {sleep_time} seconds"
    )

    response = operation.result(timeout=1200)

    # First alternative is the most probable result.
    transcript = "".join(
        " " + result.alternatives[0].transcript for result in response.results
    )
    if not transcript:
        logger.debug('No transcript returned')

    utilities = Utilities()
    return utilities.strip_puc(text=transcript).lower()
def test_clean_test(self):
    """A trailing run of dots is expected to be reduced to one dot by clean_text."""
    raw = 'dass auto ist traumhaft........'
    cleaned = Utilities().clean_text(raw)
    self.assertEqual('dass auto ist traumhaft.', cleaned)
def test_multiple_dots(self):
    """Dot runs collapse to one dot; text with only single dots is expected back unchanged."""
    cleaner = Utilities()

    collapsed = cleaner.clean_multiple_dots('dass auto ist traumhaft........')
    self.assertEqual('dass auto ist traumhaft.', collapsed)

    single_dots_only = "vom 13. bis 16. mai findet in Coburg das 34. BMW Veteranentreffen statt, Infos auch unter"
    self.assertEqual(single_dots_only, cleaner.clean_multiple_dots(single_dots_only))
def test_clean_beginning_punct(self):
    """Leading dots are expected to be removed from the start of the text."""
    stripped = Utilities().clean_dots_beginning_of_text("...Sonnebrille und wech")
    self.assertEqual("Sonnebrille und wech", stripped)
def test_clean_multiple_whitespaces(self):
    """The sample text, as written here, is expected to come back unchanged."""
    sample = "UUU Nice Lets Play Rock n Roll :))"
    cleaned = Utilities().clean_multiple_whitespaces(sample)
    self.assertEqual(cleaned, "UUU Nice Lets Play Rock n Roll :))")
def test_clean_smileys(self):
    """A smiley with a doubled closing bracket is expected to keep a single bracket."""
    cases = [
        (":o))", ":o)"),
        ("UUU Nice Lets Play Rock n Roll :))", "UUU Nice Lets Play Rock n Roll :)"),
    ]
    cleaner = Utilities()

    for raw, cleaned in cases:
        self.assertEqual(cleaner.clean_smileys(raw), cleaned)