def punctuate(text):
    p = Punctuator(
        "C:\\Users\\lm44\\Documents\\Code\\Python\\Sumit Backend\\functions\\INTERSPEECH-T-BRNN.pcl"
    )
    punctuated = p.punctuate(text)
    return punctuated
def punctuate_text(text):
    print("Performing punctuation ...\n")
    p = Punctuator('models/punctuator1.pcl')
    new_text = p.punctuate(text)
    # strip every punctuation mark except the full stop and collapse repeated whitespace
    new_text = re.sub(r"[?:;,]", "", new_text)
    new_text = re.sub(r"\s\s+", " ", new_text)
    print("Punctuated text is:\n")
    print(new_text)
    print("\n\n")
    return new_text
def correct(begin_of_path, text, language="English"):
    # text is currently a raw string
    words = text.split(" ")
    correct_words = []
    spell = SpellChecker()
    for word in words:
        correct_words.append(spell.correction(word))
    separator = ' '
    correct_text = separator.join(correct_words)
    path_to_model = os.path.join(begin_of_path, "data", "Demo-Europarl-EN.pcl")
    p = Punctuator(path_to_model)
    correct_text_with_punct = p.punctuate(correct_text)
    return correct_text_with_punct
def punctuates(doo=True):
    if doo:
        with open("transcription.txt", "r") as f:
            text = f.read()
        from punctuator import Punctuator
        p = Punctuator('hel.pcl')
        punctuated = p.punctuate(text)
        print("Punctuating Done")
        return punctuated
    else:
        with open("transcription.txt", "r") as f:
            text = f.read()
        return text
def addPunctuation(text_file):
    # load the pre-trained model
    p = Punctuator('model.pcl')
    # read unstructured text from the file
    fp = open(text_file, "r")
    text = fp.read()
    # punctuate the read text
    sentences = p.punctuate(text)
    fp.close()
    # write punctuated text into the file
    otp_file = open("notes.txt", "w")
    otp_file.write(sentences)
    otp_file.close()
def punctuate():
    global filename
    global PCL

    with open('****************', 'r') as t:
        source = t.read()

    # Punctuate
    timestamp = datetime.datetime.strptime(time.ctime(), "%a %b %d %H:%M:%S %Y")
    print(f'{timestamp} | Punctuating chunk')
    p = Punctuator('****************')
    punctuated = p.punctuate(source)

    timestamp = datetime.datetime.strptime(time.ctime(), "%a %b %d %H:%M:%S %Y")
    print(f'{timestamp} | Saving your file')
    # reopen the file for writing; the original handle was opened read-only
    with open('****************', 'w') as t:
        t.write(punctuated)

    timestamp = datetime.datetime.strptime(time.ctime(), "%a %b %d %H:%M:%S %Y")
    print(f'{timestamp} | Punctuation complete')
def Transcription(videoFile):
    try:
        videoToWav(videoFile)
        r = sr.Recognizer()
        audio_clip = sr.AudioFile("extracted.wav")
        with audio_clip as source:
            audio_file = r.record(source)
        print("Please wait ...")
        resultTemp = r.recognize_google(audio_file, language="en-EN")
        punctuator = Punctuator('en')
        resultTemp = punctuator.punct([resultTemp], batch_size=32)
        empty = " "
        resultFinal3 = empty.join(resultTemp)
        print("Speech to text conversion successful.")
        return resultFinal3
    except Exception as e:
        print("Attempt failed -- ", e)
def test(request):
    video_id = "5v1B1R3lEO8"
    srt = YouTubeTranscriptApi.get_transcript(video_id)
    alltext = ""
    for item in srt:
        alltext = alltext + item['text'] + " "
    print(os.path.join(settings.BASE_DIR))
    file_ = os.path.join(settings.BASE_DIR, 'model.pcl')
    p = Punctuator(file_)
    punctuated = p.punctuate(alltext[:5000])
    specialtag = punctuated.split('.')
    # newpara = ""
    # for item in specialtag:
    #     newpara = newpara + item + ".<br>"
    # specialtag = alltext
    return render(request, 'test.html', {
        'foo': specialtag,
    })
def fix_text(text_list, is_saved):
    """
    Cleans, punctuates, neural coreferences, and sentencizes the transcript.
    :param is_saved: True if a version of the fixed text is already saved in a file
    :param text_list: A list of strings; an 'unclean' transcript
    :return: A list of tokenized sentences (every sentence is a Doc object)
    """
    file_name = 'fixed.txt'
    if is_saved:
        with open(file_name, 'r') as fixed:
            fixed_text_list = fixed.readlines()
        fixed_text_list = [text.replace('\n', '') for text in fixed_text_list]
        fixed_text_list = [nlp(sentence) for sentence in fixed_text_list]
        return fixed_text_list
    else:
        fixed_text = ' '.join(text_list)  # convert the list into one string
        fixed_text = fixed_text.replace('  ', ' ')  # remove double spaces
        print('adding punctuation; please wait a few minutes...')
        punctuator = Punctuator('Demo-Europarl-EN.pcl')
        fixed_text = punctuator.punctuate(fixed_text)
        print('removing interjections; please wait a few more minutes...')
        fixed_text_doc = remove_tokens_by_pos(nlp(fixed_text), 'INTJ')
        print('performing neural coreferencing; please wait for several more minutes...')
        neuralcoref.add_to_pipe(nlp)
        fixed_text_doc = fixed_text_doc._.coref_resolved
        print('splitting the text into sentences; please keep waiting...')
        fixed_text_list = re.split('\\.|\\?|!', fixed_text_doc)
        with open(file_name, 'w') as fixed:
            for sentence in fixed_text_list:
                fixed.write(sentence + "\n")
        fixed_text_list = [nlp(sentence) for sentence in fixed_text_list]
        return fixed_text_list
def get_punctuator_model(self) -> Punctuator:
    """Returns a punctuator model.

    It will reuse the same punctuator model when calling this method
    multiple times, i.e. a singleton.

    Returns:
        Punctuator: the punctuator model
    """
    if self._punctuator_model is None:
        print("Loading punctuator model..")
        model = str(Path("/app/chappy/punctuator/INTERSPEECH-T-BRNN.pcl").resolve())
        self._punctuator_model = Punctuator(model)  # use pretrained model
        print("Completed loading punctuator model.\n")
    return self._punctuator_model
def summarize(vid_id):
    subs = YouTubeTranscriptApi.get_transcript(vid_id)
    sentences = [i['text'] for i in subs]
    text = ' '.join(sentences)
    p = Punctuator('INTERSPEECH-T-BRNN.pcl')
    text = p.punctuate(text)

    # build a word-frequency table, ignoring stopwords
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text)
    freqTable = dict()
    for word in words:
        word = word.lower()
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    # score each sentence by the frequencies of the words it contains
    sentences = sent_tokenize(text)
    sentenceValue = dict()
    for sentence in sentences:
        for word, freq in freqTable.items():
            if word in sentence.lower():
                if sentence in sentenceValue:
                    sentenceValue[sentence] += freq
                else:
                    sentenceValue[sentence] = freq

    sumValues = 0
    for sentence in sentenceValue:
        sumValues += sentenceValue[sentence]
    average = int(sumValues / len(sentenceValue))

    # keep sentences that score well above the average
    summary = ''
    for sentence in sentences:
        if (sentence in sentenceValue) and (sentenceValue[sentence] > (1.2 * average)):
            summary += " " + sentence
    return summary
def main():
    words, word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(EMBEDDING_FILE)
    punctuator = Punctuator(word_to_index, None)
    punctuator.load_model(MODEL_FILE)
    punctuator.load_weights(WEIGHTS_FILE)
    examples = [
        "good morning chairman who I saw and members of the committee it's my pleasure to be here today I'm Elizabeth Ackles director of the office of rate payer advocates and I appreciate the chance to present on oris key activities from 2017 I have a short presentation and I'm going to move through it really quickly because you've had a long morning already and be happy to answer any questions that you have"
    ]
    for example in examples:
        words = example.split()
        x = punctuator.create_live_data(words)
        for s in x:
            prediction = punctuator.predict(s)
            result = punctuator.add_punctuation(prediction, words)
            print(result)
import json

from .word_analyzer import WordAnalyzer
from .script_analyzer import ScriptAnalyzer
from punctuator import Punctuator

target_video_id = "TLnUJzueBOQ"
cbc_kid = 'SuSTBXGiOsw'
comedy_central = 'fKSiol1uczc'
bbc_news = 'hFAROEKiHl8'

WA = WordAnalyzer()
SA = ScriptAnalyzer()

# Place model files at '~/.punctuator'.
# Download URL: https://drive.google.com/drive/folders/0B7BsN5f2F1fZQnFsbzJ3TWxxMms
# Model list: Demo-Europarl-EN.pcl, INTERSPEECH-T-BRNN-pre.pcl, INTERSPEECH-T-BRNN.pcl
P = Punctuator('Demo-Europarl-EN.pcl')


def analyzeAll(videoId):
    sa_result = json.loads(SA.analyzeScript(videoId))
    print('script analyze ok')
    punc_script = P.punctuate(sa_result['script'])  # script with punctuation marks restored
    print('punctuator ok')
    wa_result = json.loads(WA.analyzeText(punc_script))
    print('word analyze ok')
    analyze_result = {}
    analyze_result['videoId'] = sa_result['videoId']
    analyze_result['script'] = punc_script
    analyze_result['totalWords'] = wa_result['Total_words']
    analyze_result['totalUniqueWords'] = wa_result['Total_unique_words']
def main(): """Train a model using lines of text contained in a file and evaluates the model. """ #read golve vecs words, word_to_index, index_to_word, word_to_vec_map = read_glove_vecs( EMBEDDING_FILE) #create word embedding matrix embedding_matrix = create_emb_matrix(word_to_index, word_to_vec_map) print('shape of embedding_matrix:', embedding_matrix.shape) #load trainig text from a file utterances = load_text_data(TEXT_FILE) print(utterances[0]) #create an instance of Punctutor and create training data punctuator = Punctuator(word_to_index, None) X, Y = punctuator.create_training_data(utterances, False) #if a model already exists, load the model if os.path.isfile(MODEL_FILE): punctuator.load_model(MODEL_FILE) else: model = BidirectionalGruWithGru.create_model( input_shape=(X.shape[1], ), embedding_matrix=embedding_matrix, vocab_len=len(word_to_index), n_d1=128, n_d2=128, n_c=len(punctuator.labels)) print(model.summary()) punctuator.__model__ = model #if the model has been already trained, use the pre-trained weights if os.path.isfile(WEIGHTS_FILE): punctuator.load_weights(WEIGHTS_FILE) #shuffle the training data shuffle(X, Y) denom_Y = Y.swapaxes(0, 1).sum((0, 1)) print('Summary of Y:', denom_Y) print('shape of X:', X.shape) print(X[0:10]) print('shape of Y:', Y.shape) print(Y[0:10]) #define optimizer and compile the model opt = Adam(lr=0.007, beta_1=0.9, beta_2=0.999, decay=0.01) punctuator.compile(opt, loss='categorical_crossentropy', metrics=['accuracy']) #split the training data into training set, test set, and dev set t_size = int(X.shape[0] * 0.9) train_X, train_Y = X[:t_size], Y[:t_size] test_X, test_Y = X[t_size:-DEV_SIZE], Y[t_size:-DEV_SIZE] dev_X, dev_Y = X[-DEV_SIZE:], Y[-DEV_SIZE:] print(train_Y.swapaxes(0, 1).sum((0, 1))) print(test_Y.swapaxes(0, 1).sum((0, 1))) #train the model punctuator.fit([train_X], train_Y, batch_size=BATCH, epochs=EPOCH) punctuator.save_model(MODEL_FILE) punctuator.save_weights(WEIGHTS_FILE) #evaluate the model on the dev set (or the test set) for i, example in enumerate(dev_X): prediction = punctuator.predict(example) punctuator.check_result(prediction, dev_Y[i]) #manually evaluate the model on an example examples = [ "good morning chairman who I saw and members of the committee it's my pleasure to be here today I'm Elizabeth Ackles director of the office of rate payer advocates and I appreciate the chance to present on oris key activities from 2017 I have a short presentation and I'm going to move through it really quickly because you've had a long morning already and be happy to answer any questions that you have" ] for example in examples: words = example.split() x = punctuator.create_live_data(words) print x for s in x: print s prediction = punctuator.predict(s) result = punctuator.add_punctuation(prediction, words) print(result)
import math
import datetime

import numpy as np
import config

app = Flask(__name__)
app.config.from_object(config.ProductionConfig)
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)
jwt = JWTManager(app)

punctuate_model_name = 'PT_Punctuator.pcl'
punctuate_model_directory = './punctuate_model/'
punctuate_model_path = punctuate_model_directory + punctuate_model_name
app.config['punctuate_model_path'] = punctuate_model_path
punctuator_model = Punctuator(app.config['punctuate_model_path'])

classifier_model_name = 'saved_model/my_model'
classifier_model_directory = './classifier_model/'
classifier_model_path = classifier_model_directory + classifier_model_name
app.config['classifier_model_path'] = classifier_model_path
classifier_model = tf.keras.models.load_model(app.config['classifier_model_path'])

vocab_path = classifier_model_directory + 'vocab.txt'
tokenizer = FullTokenizer(vocab_file=vocab_path)


def punctuateTextFile(file_name):
    with open(file_name, "r") as file:
        text_to_punctuate = file.read()
    text_to_punctuate = text_to_punctuate.lower()
from punctuator import Punctuator

p = Punctuator('Models/model.pcl')


def auto_punctuation(text):
    return p.punctuate(text)
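A minimal usage sketch for the wrapper above, assuming Models/model.pcl is present relative to the working directory; the sample sentence and the __main__ guard are illustrative assumptions, not part of the original module.

if __name__ == "__main__":
    # hypothetical example input; the punctuated output will vary with the model used
    sample = "hello everyone thanks for joining today we will look at automatic punctuation"
    print(auto_punctuation(sample))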
def handlePunctuation(text):
    # Punctuator library
    # https://drive.google.com/drive/folders/0B7BsN5f2F1fZQnFsbzJ3TWxxMms
    p = Punctuator('Demo-Europarl-EN.pcl')
    return p.punctuate(text)
from transformers import pipeline
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import numpy as np
from punctuator import Punctuator
import jamspell

# load pre-trained model and tokenizer
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

punctuator = Punctuator("../models/INTERSPEECH-T-BRNN2.pcl")
corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel("../models/spellchecker_en.bin")

# load any audio file of your choice and split it into ~10 s chunks
speech, rate = librosa.load("../10mintest.mp3", sr=16000)
length = librosa.get_duration(speech, sr=16000)
n_chunks = int(np.ceil(length / 10))
chunks = np.array_split(speech, n_chunks)


def transcriptor(chunks):
    string = ""
    for i in chunks:
        input_values = tokenizer(i, return_tensors='pt').input_values
        # store logits (non-normalized predictions)
        logits = model(input_values).logits
        # store predicted ids
        predicted_ids = torch.argmax(logits, dim=-1)
        # decode the audio to generate text
def main(fileName):
    fileName, fileExt = fileName.split('.')
    print(fileName, fileExt)
    # os.system(f"ffmpeg -i C:\\Users\\Chinmay\\CSE\\pyCode\\Unysis\\UNA-Unisys-Natural-Assistant-main\\UNA-Unisys-Natural-Assistant-main\\{fileName}.{fileExt} -ab 160k -ac 2 -ar 44100 -vn C:\\Users\\Chinmay\\CSE\\pyCode\\Unysis\\UNA-Unisys-Natural-Assistant-main\\UNA-Unisys-Natural-Assistant-main\\{fileName}.wav")
    # ipFile = ffmpeg.input(fileName + fileExt)
    # opFile = ffmpeg.output(ipFile, fileName + ".wav")

    # extract the audio track from the video
    clip = AudioFileClip(f"{fileName}.{fileExt}")
    clip.write_audiofile(f"{fileName}.wav", codec='pcm_s16le')

    f = sf.SoundFile(f'{fileName}.wav')
    audio_dur = len(f) / f.samplerate

    r = sr.Recognizer()
    text = ""
    rec_dur = 25
    with sr.AudioFile(f'{fileName}.wav') as source:
        # transcribe the audio in 25-second chunks
        for x in range(0, int(audio_dur / rec_dur)):
            audio = r.record(source, duration=rec_dur)
            try:
                new_txt = r.recognize_google(audio)
                text = text + new_txt
            except:
                pass
        # transcribe the remaining tail of the audio
        audio = r.record(source, duration=(audio_dur - rec_dur * int(audio_dur / rec_dur)))
        try:
            new_txt = r.recognize_google(audio)
            text = text + new_txt
        except:
            pass
    print("Done")

    # restore punctuation and fix grammar
    p = Punctuator('Demo-Europarl-EN.pcl')
    text = p.punctuate(text)
    tool = language_tool_python.LanguageTool('en-US')
    matches = tool.check(text)
    print(len(matches))
    for lab in range(len(matches)):
        print(lab)
        print(matches[lab].ruleId, matches[lab].replacements)
    text_new = tool.correct(text)
    print(text_new)

    nltk.download('punkt')
    nltk.download('stopwords')
    preprocessor = TextPreProcessor(NLTKTokenizer(), NLTKCleaner())
    similarity_algorithm = BM25Plus()
    ranker = TextRank()
    ir = ClassicalIR()

    # Text Summarization
    model = Summarizer(preprocessor, similarity_algorithm, ranker, ir)
    summarised_content = model.summarise(text_new, reduction_ratio=0.80, preserve_order=True)
    print("\n --- Summarized Text ---\n")
    print(construct_sentences_from_ranking(summarised_content))
    with open(f"{fileName}.txt", "w+") as file:
        file.write(construct_sentences_from_ranking(summarised_content))

    # Text Keyword Extraction
    preprocessor = TextPreProcessor(NLTKTokenizer(), NLTKCleaner(skip_stemming=True))
    keyword_extractor = KeywordExtractor(preprocessor, ClassicalIR())
    keywords = keyword_extractor.extract_keywords(text, count=10, raw=False)
    print("\n --- Keywords ---\n")
    print(keywords)
from punctuator import Punctuator
import os

p = Punctuator('/home/erviewre/h3podcastbot/.punctuator/Demo-Europarl-EN.pcl')

directory = os.fsencode("/home/erviewre/h3podcastbot/raw_scripts")
file_count = len(os.listdir(directory))
progress = 1
for file in os.listdir(directory):
    print(str(progress) + "/" + str(file_count))
    progress += 1
    filename = os.fsdecode(file)
    with open('/home/erviewre/h3podcastbot/raw_scripts/' + filename, 'r') as open_file:
        data = open_file.read().replace('\n', ' ')
    with open('/home/erviewre/h3podcastbot/punctuated_scripts/' + filename, 'w') as punctuated_file:
        punctuated_file.write(p.punctuate(data))
def punctuate_text(text):
    p = Punctuator('models/INTERSPEECH-T-BRNN.pcl')
    print(p.punctuate(text))
import os
import subprocess
import time
import logging
import uuid

from speech_recognizer import SpeechRecognizer
from punctuator import Punctuator
from number_utils.text2numbers import TextToNumbers

speech_recognizer = SpeechRecognizer()
punctuator = Punctuator(model_path="data/punctuator")
text2numbers = TextToNumbers()


class FileHandler:
    @staticmethod
    def get_recognized_text(blob):
        try:
            filename = str(uuid.uuid4())
            os.makedirs('./records', exist_ok=True)
            new_record_path = os.path.join('./records', filename + '.webm')
            blob.save(new_record_path)
            new_filename = filename + '.wav'
            converted_record_path = FileHandler.convert_to_wav(new_record_path, new_filename)
            response_models_result = FileHandler.get_models_result(converted_record_path)
            return 0, new_filename, response_models_result
        except Exception as e:
            logging.exception(e)
            return 1, None, str(e)
import json
import pprint
import string

from nltk.tokenize import sent_tokenize
import nltk
from punctuator import Punctuator

punc = Punctuator('model.pcl')
nltk.download('punkt')

file = 'captions.txt'


def parse_line(line):
    parts = line.split(" ")
    if len(parts) > 1:
        id = parts[0]
        val = " ".join(parts[1:])
        return id, val


with open(file, 'r') as fd:
    while True:
        line = fd.readline()
        if not line:
            break
        id, valstr = parse_line(line)
        val = json.loads(valstr)
        text = val.get("text")
        text = " ".join(text)
def punctuate_conversation(conversation, loc):
    # loc is the path to a punctuator .pcl model file
    p = Punctuator(loc)
    punctuated_converse = p.punctuate(conversation)
    return punctuated_converse
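A short, hypothetical call to punctuate_conversation; the model path and the conversation string below are assumptions for illustration, not values from the original code.

# hypothetical example: `loc` points at a locally downloaded punctuator .pcl model
conversation = "hi how are you i am fine thanks what did you think of the meeting"
print(punctuate_conversation(conversation, "Demo-Europarl-EN.pcl"))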
def form_valid(self, form):
    self.object = form.save(commit=False)
    video_id = self.object.body.split('?v=')[1].split("&")[0]
    self.object.vid_id = video_id
    print('Does he have it?')
    print(self.object.vid_id)

    if Post.objects.filter(vid_id=self.object.vid_id).exists():
        yo = Post.objects.filter(vid_id=self.object.vid_id)[:1]
        print(yo[0].pk)
        print('REDIRECT!')
        return HttpResponseRedirect(
            reverse('article-detail', kwargs={
                'pk': str(yo[0].pk),
                "yt": video_id
            }))
        # return redirect('article-detail', post.pk post.vid_id+str(yo[0].pk)+'/'+str(video_id))

    data = scrape_url('http://youtube.com/watch?v=' + video_id)
    print(data.title)
    print(data.poster)

    srt = YouTubeTranscriptApi.get_transcript(video_id)
    totalduration = 0
    alltext = ""
    for item in srt:
        go = item['start']
        if go - totalduration > 30:
            alltext = alltext + time.strftime('%H:%M:%S', time.gmtime(item['start'])) + item['text'] + " "
            totalduration = item['start']
        else:
            alltext = alltext + item['text'] + " "

    r = re.findall(
        '(?:[0123456789]\d|2[0123456789]):(?:[0123456789]\d):(?:[0123456789]\d)',
        alltext)
    for item in r:
        # print(item)
        alltext = alltext.replace(
            item,
            "<br><a class='ytlink' href='#' type='button' onclick='seek(" +
            str(get_sec(item)) + ")'>" + item + "</a> </br>")
    print(alltext)

    file_ = os.path.join(settings.BASE_DIR, 'model.pcl')
    p = Punctuator(file_)
    punctuated = p.punctuate(alltext)

    self.object.title = data.title
    self.object.title_tag = data.poster
    # totaltext = punctuated.split(".")
    # finaltext = ""
    # for item in totaltext:
    #     finaltext = finaltext + item + "." + "<br><br>"

    iframe = '''<iframe id="player" type="text/html" width="560" height="315" src="http://www.youtube.com/embed/PjDw3azfZWI?enablejsapi=1" frameborder="0"></iframe>'''
    embed = '''<iframe id="player" type="text/html" width="560" height="315" src="http://www.youtube.com/embed/''' + video_id + '''?enablejsapi=1" frameborder="0"></iframe>'''
    self.object.body = embed + "<br>" + alltext
    self.object.save()
    return HttpResponseRedirect(
        reverse('article-detail', kwargs={
            'pk': self.object.id,
            "yt": video_id
        }))
parser.add_argument('-c', '--captions-path', type=str, required=True,
                    help='path to filtered captions')
parser.add_argument('-p', '--punctuator-model', type=str, required=True,
                    help='path to punctuator .pcl model')
parser.add_argument('-l', '--labelled-data', type=str, required=True,
                    help='path to labelled data json file')
parser.add_argument('-f', '--root-features', type=str, required=True,
                    help='directory with all the video features')
parser.add_argument('-s', '--save-path', type=str, required=True,
                    help='json file to save training data to')
args = parser.parse_args()

captions_path = args.captions_path
save_path = args.save_path
punc = Punctuator(args.punctuator_model)
captions = json.load(open(captions_path, 'r'))
labelled_data = json.load(open(args.labelled_data, 'r'))
vid_ids = os.listdir(args.root_features)

start = 0
if os.path.exists(save_path):
    train_data = json.load(open(save_path))
    print('starting from vid id', len(train_data))
    start = len(train_data)
else:
    train_data = {}


def timestamp_to_idx(time):
    return int(0.5 + time / 1.5)
from punctuator import Punctuator
import sys

p = Punctuator('Demo-Europarl-EN.pcl')

textfile = sys.argv[1]
with open(textfile, 'r') as file:
    data = file.read().replace('\n', ' ')
data = data.lower()
print(p.punctuate(data))
from punctuator import Punctuator

# compare the output of two pre-trained models on the same transcript
p = Punctuator('Demo-Europarl-EN.pcl')
output_file = open('output.txt', 'w')
output_file.write('Demo-Europarl-EN.pcl\n\n')
output_file.write(
    p.punctuate(
        'Uh Now last video we went through the first five steps AGADAP So you just kind of shown here of our uh relative combat power that weve chosen to do a penetration umh how weve laid out our forces arrayed them and uh set our phases and selected our leadership Okay so now we get our battle book page were gonna do our COA statement and sketch and we can drive on with our sketch Alright so heres the page out of the battle book and uh we need to do both our sketch and our statement Now for this Im gonna go ahead and and just start with uh the sketch Now uh we know that were gonna start out in our assembly area Alright so we can go ahead and draw that up here at the top of the page and I wanna alot leave uh quite a bit of room down here to show whats gonna happen for actions on the objective Remember the COA sketch isnt to scale uh Were gonna need uh some kind of minor departure We know thats gonna happen Alright and were gonna leave from the uh assembly area and were gonna move to an ORP right So we gotta put our ORP on here uh and since thats gonna be a movement theres gonna be an axis to get us there Alright and uh thinking through this theres probably some phase line here Alright because were gonna spend phase one in the assembly area phase two moving to the ORP and then phase three is all gonna happen in the ORP so theres probably a phase line here because were gonna act differently at that point'
    ))
# output_file.write(p.punctuate('this is a test sentence for part 1'))

p3 = Punctuator('INTERSPEECH-T-BRNN.pcl')
output_file.write('\n\nINTERSPEECH-T-BRNN.pcl\n\n')
output_file.write(
    p3.punctuate(
        'Uh Now last video we went through the first five steps AGADAP So you just kind of shown here of our uh relative combat power that weve chosen to do a penetration umh how weve laid out our forces arrayed them and uh set our phases and selected our leadership Okay so now we get our battle book page were gonna do our COA statement and sketch and we can drive on with our sketch Alright so heres the page out of the battle book and uh we need to do both our sketch and our statement Now for this Im gonna go ahead and and just start with uh the sketch Now uh we know that were gonna start out in our assembly area Alright so we can go ahead and draw that up here at the top of the page and I wanna alot leave uh quite a bit of room down here to show whats gonna happen for actions on the objective Remember the COA sketch isnt to scale uh Were gonna need uh some kind of minor departure We know thats gonna happen Alright and were gonna leave from the uh assembly area and were gonna move to an ORP right So we gotta put our ORP on here uh and since thats gonna be a movement theres gonna be an axis to get us there Alright and uh thinking through this theres probably some phase line here Alright because were gonna spend phase one in the assembly area phase two moving to the ORP and then phase three is all gonna happen in the ORP so theres probably a phase line here because were gonna act differently at that point'
    ))
# output_file.write(p3.punctuate('this is a test sentence for part 3'))

output_file.close()
import os
import sys
import json
from pathlib import Path

import config
# init() below is assumed to come from colorama
from colorama import init

init(autoreset=True)

if config.settings.punct_correction_tool == "fastpunct":
    from fastpunct import FastPunct
    fastpunct = FastPunct("en")
elif config.settings.punct_correction_tool == "punctuator":
    from punctuator import Punctuator
    model_file = os.path.join(str(Path.home()), ".punctuator", "Demo-Europarl-EN.pcl")
    punctuator_runner = Punctuator(model_file)


def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


def _transformTime(srt_time, start_utc_time=0):
    """[summary]

    :return: [description]
    :rtype: [type]
    """
    minutes = srt_time.minutes
    seconds = srt_time.seconds + 60 * minutes
    milliseconds = srt_time.milliseconds + seconds * 1000 + start_utc_time
def main(): """Train a model using lines of text contained in a file and evaluates the model. """ #read golve vecs #words, word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(EMBEDDING_FILE) #create word embedding matrix #embedding_matrix = create_emb_matrix(word_to_index, word_to_vec_map) embedding_matrix = None #print('shape of embedding_matrix:', embedding_matrix.shape) #load trainig text from a file utterances = load_text_data(TEXT_FILE) punctuator = Punctuator(None, None) X, Y = punctuator.create_training_data(utterances[:3], False) print(X.shape) print(X.shape[1]) print(Y.shape) #if a model already exists, load the model if os.path.isfile(MODEL_FILE) and False: punctuator.load_model(MODEL_FILE) else: model = BidirectionalGruWithGru.create_model(input_shape=( X.shape[1], X.shape[2], ), embedding_matrix=None, vocab_len=0, n_d1=128, n_d2=128, n_c=len( punctuator.labels)) print(model.summary()) punctuator.__model__ = model #if the model has been already trained, use the pre-trained weights if os.path.isfile(WEIGHTS_FILE): punctuator.load_weights(WEIGHTS_FILE) for i in range(100): shuffle(utterances) print(utterances[0]) #create an instance of Punctutor and create training data X, Y = punctuator.create_training_data(utterances[:300000], False) #shuffle the training data shuffle(X, Y) denom_Y = Y.swapaxes(0, 1).sum((0, 1)) print('Summary of Y:', denom_Y) print('shape of X:', X.shape) print(X[0:10]) print('shape of Y:', Y.shape) print(Y[0:10]) #define optimizer and compile the model opt = Adam(lr=0.007, beta_1=0.9, beta_2=0.999, decay=0.01) punctuator.compile(opt, loss='categorical_crossentropy', metrics=['accuracy']) #split the training data into training set, test set, and dev set t_size = int(X.shape[0] * 0.9) train_X, train_Y = X[:t_size], Y[:t_size] test_X, test_Y = X[t_size:-DEV_SIZE], Y[t_size:-DEV_SIZE] dev_X, dev_Y = X[-DEV_SIZE:], Y[-DEV_SIZE:] print(train_Y.swapaxes(0, 1).sum((0, 1))) print(test_Y.swapaxes(0, 1).sum((0, 1))) #train the model punctuator.fit([train_X], train_Y, batch_size=BATCH, epochs=EPOCH) punctuator.save_model(MODEL_FILE) punctuator.save_weights(WEIGHTS_FILE) #evaluate the model on the dev set (or the test set) for i, example in enumerate(dev_X): prediction = punctuator.predict(example) punctuator.check_result(prediction, dev_Y[i]) #manually evaluate the model on an example examples = [ "good morning chairman who I saw and members of the committee it's my pleasure to be here today I'm Elizabeth Ackles director of the office of rate payer advocates and I appreciate the chance to present on oris key activities from 2017 I have a short presentation and I'm going to move through it really quickly because you've had a long morning already and be happy to answer any questions that you have", "this was a measure that first was introduced back in 1979 known as the International bill of rights for women it is the first and only international instrument that comprehensively addresses women's rights within political cultural economic social and family life", "I'm Elizabeth Neumann from the San Francisco Department on the status of women Sita is not just about naming equal rights for women and girls it provides a framework to identify and address inequality", "we have monitored the demographics of commissioners and board members in San Francisco to assess the equality of political opportunities and after a decade of reports women are now half of appointees but white men are still over-represented and Asian and Latina men and women are underrepresented", "when 
the city and county faced a 300 million dollar budget deficit in 2003 a gender analysis of budget cuts by city departments identified the disproportionate effect on women and particularly women of color in the proposed layoffs and reduction of services" ] for example in examples: words = example.split() x = punctuator.create_live_data(words) print x for s in x: print s prediction = punctuator.predict(s) result = punctuator.add_punctuation(prediction, words) print(result)