def transcript():
    """Flask view: segment a transcript file and render operational stats.

    POST: reads the file named by the ``transcript_path`` form field,
    splits it into sentences with DeepSegment, counts how many sentences
    ``operational()`` classifies as operational, and renders the
    ``showgraph.html`` pie-chart template with the operational vs
    non-operational counts.  Any other method redirects to the index page.
    """
    if request.method != "POST":
        return redirect(url_for("index"))

    transcript_path = request.form.get('transcript_path')
    if not transcript_path:
        # Missing/empty form field previously crashed open(None) with a
        # TypeError; treat it like any other non-POST visit.
        return redirect(url_for("index"))

    # SECURITY NOTE(review): transcript_path comes straight from the client
    # and is opened unvalidated — this permits path traversal.  Restrict it
    # to a known upload directory before deploying.
    with open(transcript_path) as file:
        text = file.read()

    segmenter = DeepSegment('deepsegment_eng_v1/config.json')
    sentences = segmenter.segment(text)
    operational_sentences = operational(sentences)

    length_chat_file = len(sentences)
    length_operational = len(operational_sentences)
    length_non_operational = length_chat_file - length_operational

    # Call function to create a pie chart showing Operational vs
    # Non-Operational Problems:
    # draw_figure(length_operational, length_non_operational)
    return render_template("showgraph.html",
                           length_operational=length_operational,
                           length_non_operational=length_non_operational)
def main():
    """Sentence-segment and punctuation-correct the input file's first line.

    Reads one line from ``args.input``, splits it into sentences with
    DeepSegment, runs each sentence through DeepCorrect, and writes every
    corrected sequence on its own line to ``args.output``.

    NOTE(review): only the *first* line of the input is processed
    (``readline``) — confirm whether the whole file was intended.
    """
    args = parse_args()

    with open(args.input, mode='r') as src:
        first_line = src.readline()

    segmenter = DeepSegment('en')
    corrector = DeepCorrect(args.params_path, args.checkpoint_path)

    with open(args.output, mode='w') as dst:
        for sentence in segmenter.segment(first_line):
            corrected = corrector.correct(sentence)
            dst.write(corrected[0]['sequence'] + '\n')
def processing(id):
    """Segment and punctuation-correct a Paragraph's ``original_text``.

    Loads the Paragraph with the given id, splits its text into sentences,
    punctuation-corrects each one, joins them back into a single paragraph,
    stores the result in ``processed_text``, clears the ``processing`` flag,
    and saves the object.

    The heavy DeepSegment/DeepCorrect models are built once and cached as
    attributes on this function so repeated calls reuse them.
    """
    paragraph_object = Paragraph.objects.get(id=id)

    # BUG FIX(review): the original cached the models on the builtin
    # ``globals`` function, which cannot hold attributes, and its ``and``
    # condition only initialised when *both* were missing (crashing when
    # exactly one existed).  Cache on this function instead, and rebuild
    # if either model is absent.  (If ``globals`` was a project module,
    # confirm no other code reads ``globals.corrector`` before merging.)
    if not hasattr(processing, 'segmenter') or not hasattr(processing, 'corrector'):
        processing.segmenter = DeepSegment('en')
        processing.corrector = DeepCorrect('deep_punc/deeppunct_params_en',
                                           'deep_punc/deeppunct_checkpoint_wikipedia')
    segmenter = processing.segmenter
    corrector = processing.corrector

    list_of_sentences = segmenter.segment(paragraph_object.original_text)
    # Correct each sentence and rejoin with single spaces.
    corrected = [corrector.correct(sentence)[0]['sequence']
                 for sentence in list_of_sentences]
    paragraph = ' '.join(corrected)

    # Strip stray backslashes introduced by the correction step.
    paragraph = paragraph.replace("\\", "")
    paragraph_object.processed_text = paragraph
    paragraph_object.processing = False
    paragraph_object.save()
def predict(self, sample_text, word_length, segment, verbose):
    """Generate ``word_length`` words continuing the *sample_text* seed.

    Each step tokenizes the running text, pads it to the model's input
    length, predicts the next word index, and appends the matching
    vocabulary word.  If ``segment`` is True the finished text is split
    into sentences with DeepSegment and that list is returned; otherwise
    the raw generated text is printed and returned.  The generated text is
    also stored on ``self.sample_text``.
    """
    for _step in range(word_length):
        # Encode the current seed and pad it to the fixed model input size.
        encoded = self.tokenizer.texts_to_sequences([sample_text])[0]
        padded = pad_sequences([encoded],
                               maxlen=self.maxSequenceLen - 1,
                               padding=self.padding_method)
        predicted = self.model.predict_classes(padded, verbose=verbose)

        # Fall back to a bare space when the predicted index has no
        # vocabulary entry.
        next_word = " "
        for word, index in self.tokenizer.word_index.items():
            if index == predicted:
                next_word = word
                break
        sample_text = sample_text + " " + next_word

    self.sample_text = sample_text
    if segment == True:
        # Return the generated text split into sentences.
        segmenter = DeepSegment('en')
        sample_text = segmenter.segment(self.sample_text)
    else:
        print(sample_text)
        sample_text = self.sample_text
    return sample_text
# NOTE(review): the leading triple quote pairs with a string delimiter outside
# this chunk, so the code below may actually sit inside a (disabled) string
# literal in the full file — confirm before editing.
# Script entry: takes a YouTube video id from argv, fetches its transcript via
# get_transcript(), sentence-segments it with ``segmenter`` (defined elsewhere
# in the file), then prints normalized NRC emotion counts and an arousal score
# computed by normalized_emotion_counts().
""" if __name__ == "__main__": video_id = sys.argv[1] vader_flag = 1 #print(get_transcript()) #get_comments_clean_and_organise() print("TRANSCRIPT" + "\n") transcript_li = get_transcript(video_id) segmented_text_li = segmenter.segment(" ".join(transcript_li)) print(segmented_text_li) str_text = ". ".join( segmented_text_li ) #converting into a single string to easily pass to watson tone analyzer #print(str_text) print("NRC") sent_by_sent_transcript_li = segmented_text_li normalized_counts_transcript, Arousal = normalized_emotion_counts( sent_by_sent_transcript_li, vader_flag) print("Normalized emotion counts of transcript: \n") print(normalized_counts_transcript) print("Arousal Score: " + str(Arousal))
# Sentence-splitting experiments on German text.  Only the NNSplit call at the
# start is live; the DeepSegment and HuggingFace-tokenizer sections are both
# disabled behind ``if False:`` blocks (DeepSegment has no German model, per
# the original banner comment).
# NOTE(review): ``data`` is only (re)read from data/start.txt inside a disabled
# block, so the live NNSplit call relies on ``data`` being defined earlier in
# the file — confirm against the full source.
splitter = NNSplit("de") res = splitter.split([data]) # ============================================================================= # More advanced: Deepsegment: Does not support German # ============================================================================= if False: from deepsegment import DeepSegment # The default language is 'en' segmenter = DeepSegment('de') with open('data/start.txt', 'r') as myfile: data = myfile.read() segmenter.segment('I am Batman i live in gotham') # ============================================================================= # Huggingface tokenizer # ============================================================================= if False: from tokenizers.implementations import ByteLevelBPETokenizer from tokenizers.processors import BertProcessing from pathlib import Path tokenizer = ByteLevelBPETokenizer( "data/german_old.json", "data/german_old.txt", )
# NOTE(review): chunk boundary — the string list below is split mid-element
# ('introduced an effective ' / 'buyback scheme') across the two original
# lines, and the leading ``)`` / triple quotes pair with tokens outside this
# view; reproduced byte-for-byte.
# Demo data: ``li_B`` is a word-wrapped (unpunctuated) video transcript that
# is joined and re-segmented into sentences via ``segmenter`` (defined
# elsewhere), then joined with ". " into ``str_text_B``.  The trailing
# statements set up three example sentences for a ParallelDots demo.
) """ text = 'Team, I know that times are tough! Product '\ 'sales have been disappointing for the past three '\ 'quarters. We have a competitive product, but we '\ 'need to do a better job of selling it!' """ li_B= ['since 1990 the number of gun deaths', 'worldwide has reached six point five', 'million three quarters of gun deaths', 'occur in just 15 countries Latin America', 'is home to some of the worlds most', 'violent countries by murder rate El', 'Salvador Venezuela and Guatemala are the', 'top three countries for deaths caused by', 'guns per population these Latin American', 'countries are marred by corruption', 'organized crime and a dysfunctional', 'criminal justice system that further', 'fuels the problem the availability of', 'guns in the United States is another', 'concern for these countries an estimated', '200,000 guns a year that were first sold', 'in the United States are smuggled over', 'the southern border and used in violent', 'crimes in Latin America and the', 'Caribbean in the United States the', 'constitutional right to bear arms has', 'led to looser regulations and easier', 'access to firearms this contributes to', 'the 30,000 men women and children who', 'were killed with guns each year mass', 'shootings attract their headlines but in', 'fact these make up only 0.2% of gun', 'deaths 60% of gun related deaths are in', 'fact suicide', "America's suicide rate increased by 25", 'percent between 1999 and 2015 of nearly', '45,000 taking their own lives in 2015', 'alone half of these suicides were', "carried out with guns though guns aren't", 'the most common method of suicide they', 'are the most lethal other wealthy', 'countries have far lower rates of gun', 'violence in Japan if you want to own a', 'gun you must pass a written exam and a', 'shooting range test alongside a series', 'of mental health drug in criminal record', 'tests', 'it has virtually eradicated gun crime', 'after a mass shooting in 1996 Australia', 'introduced an effective 
buyback scheme', 'of firearms in the 20 years following', 'the bag there was an accelerated decline', 'in total gun deaths but in America the', 'House of Representatives has not voted', 'on a single measure to prevent gun', 'violence and in some states such as', 'Texas where students at public colleges', 'can now carry concealed handguns the law', 'has actually loosened easy access to', 'firearms will continue to be the main', 'driver of Americas gun debt'] text_B= segmenter.segment(" ".join(li_B) ) str_text_B= ". ".join(text_B) #print(str_text_B) str_text_C= "The journalist is twisting the facts and reporting fake news. The boy is twisting the rope on the swing. The deal was an unfortunate twist in events." print("PARALLELDOTS") a= "The journalist is twisting the facts and reporting fake news." b= "The boy is twisting the rope on the swing." c= "The deal was an unfortunate twist in events." print(a)
# NOTE(review): this unit arrived with its formatting collapsed; the statements
# are reproduced byte-for-byte below.  Rough structure, as far as the tokens
# show:
#   1. fn(test): runs TextRazor over ``test``, collects lemmas of predicate
#      words and of trigger words ("image", "picture", "photo", "show", "see",
#      "display"), appends the keyword string to keyword.txt, and sentence-
#      segments it with DeepSegment.
#   2. A Bing Image Search v7 loop: queries the API for the extracted term,
#      downloads up to MAX_RESULTS images to disk, skipping entries that raise
#      any exception in EXCEPTIONS or that cv2.imread cannot decode.
#   3. python-pptx slide generation: adds a gradient title slide when the deck
#      is empty, then either a bulleted text slide (no image keyword found) or
#      a picture+text slide using the downloaded image, saving testppt3.pptx.
# SECURITY NOTE(review): TextRazor and Bing API keys are hard-coded below —
# move them to environment variables and revoke the committed values.
# NOTE(review): cannot tell from the collapsed tokens where fn(test) ends and
# module-level script code begins — confirm against the original file before
# any refactor.
def fn(test): from deepsegment import DeepSegment segmenter=DeepSegment('en') import textrazor textrazor.api_key = "043e170ef41a6d297a508581225bd493943f3a9f831345fb71f86d64" client = textrazor.TextRazor(extractors=["words", "relations"]) #client.set_do_cleanup_HTML(True) response = client.analyze(test) l=[] for property in response.properties(): for word in property.predicate_words: l.append(word.lemma) if word.lemma == "sound": for property_word in property.property_words: for phrase in property_word.noun_phrases: print (phrase) break l=[] flag=False for sentence in response.sentences(): print(sentence.words) for word in sentence.words: if word.lemma=="image" or word.lemma=="picture" or word.lemma=="photo" or word.lemma=="show" or word.lemma=="see" or word.lemma=="display": k=word.lemma flag=True l.append(word.lemma) astring="" for i in l: astring+=i+" " f=open("keyword.txt",'a') f.write(astring+"\n") f.close() alist=segmenter.segment(astring) print(alist) if(flag): s=l.index(k) m=l[s:] t="" st="" for i in m: t+=i+" " else: t="No image found" st="" for j in l: st+=j+" " text1=st text2=t print(t) response1=client.analyze(t) for noun in response1.noun_phrases(): print(noun.words) for word in noun.words: print(word.lemma) from requests import exceptions import argparse import requests import cv2 import os import time starttime=time.time(); # set your Microsoft Cognitive Services API key along with (1) the # maximum number of results for a given search and (2) the group size # for results (maximum of 50 per request) API_KEY = "948886a19a794c428c53fcfa2aa0325b" MAX_RESULTS = 1 GROUP_SIZE = 1 # set the endpoint API URL URL = "https://api.cognitive.microsoft.com/bing/v7.0/images/search" # when attempting to download images from the web both the Python # programming language and the requests library have a number of # exceptions that can be thrown so let's build a list of them now # so we can filter on them EXCEPTIONS = set([IOError, FileNotFoundError, 
exceptions.RequestException, exceptions.HTTPError, exceptions.ConnectionError, exceptions.Timeout]) # store the search term in a convenience variable then set the # headers and search parameters term = t headers = {"Ocp-Apim-Subscription-Key" : API_KEY} params = {"q": term, "offset": 0, "count": GROUP_SIZE} # make the search print("[INFO] searching Bing API for '{}'".format(term)) search = requests.get(URL, headers=headers, params=params) search.raise_for_status() # grab the results from the search, including the total number of # estimated results returned by the Bing API results = search.json() estNumResults = min(results["totalEstimatedMatches"], MAX_RESULTS) print("[INFO] {} total results for '{}'".format(estNumResults, term)) # initialize the total number of images downloaded thus far total = 0 for offset in range(0, estNumResults, GROUP_SIZE): # update the search parameters using the current offset, then # make the request to fetch the results print("[INFO] making request for group {}-{} of {}...".format( offset, offset + GROUP_SIZE, estNumResults)) params["offset"] = offset search = requests.get(URL, headers=headers, params=params) search.raise_for_status() results = search.json() print("[INFO] saving images for group {}-{} of {}...".format( offset, offset + GROUP_SIZE, estNumResults)) # loop over the results for v in results["value"]: # try to download the image try: # make a request to download the image print("[INFO] fetching: {}".format(v["contentUrl"])) r = requests.get(v["contentUrl"], timeout=30) # build the path to the output image ext = v["contentUrl"][v["contentUrl"].rfind("."):] p = os.path.sep.join([r"C:\Users\HP\Desktop\Projects\VIT Hack\SlideEZ-test", "{}{}".format( str(total).zfill(8), ext)]) print("The answer is") print(p) # write the image to disk f = open(p, "wb") f.write(r.content) f.close() # catch any errors that would not unable us to download the # image except Exception as e: # check to see if our exception is in our list of # 
exceptions to check for if type(e) in EXCEPTIONS: print("[INFO] skipping: {}".format(v["contentUrl"])) continue # try to load the image from disk image = cv2.imread(p) # if the image is `None` then we could not properly load the # image from disk (so it should be ignored) if image is None: print("[INFO] deleting: {}".format(p)) os.remove(p) continue # update the counter total += 1 endtime=time.time()-starttime print("Total time taken to search for the query is") print(endtime) from pptx import Presentation from pptx.util import Inches, Pt from pptx.enum.text import PP_ALIGN from PIL import Image from pptx.dml.color import RGBColor from pptx.enum.dml import MSO_THEME_COLOR presentation = "testppt3.pptx" prs = Presentation(presentation) if len(prs.slides)==0: title_slide_layout = prs.slide_layouts[0] slide = prs.slides.add_slide(title_slide_layout) background=slide.background fill=background.fill fill.gradient() fill.gradient_angle=40 gradient_stops=fill.gradient_stops gradient_stop=gradient_stops[0] color=gradient_stop.color color.theme_color=MSO_THEME_COLOR.LIGHT_1 title = slide.shapes.title subtitle = slide.placeholders[1] title.text = "Test" subtitle.text = "test" prs.save(presentation) if not flag: text_slide_layout = prs.slide_layouts[1] slide = prs.slides.add_slide(text_slide_layout) background=slide.background fill=background.fill fill.gradient() fill.gradient_angle=40 gradient_stops=fill.gradient_stops gradient_stop=gradient_stops[0] color=gradient_stop.color color.theme_color=MSO_THEME_COLOR.LIGHT_1 title = slide.shapes.title blist=[] for i in range(0,len(alist)): blist+=alist[i].split(" ") mx=0 slide_t="" for j in blist: if(len(j)>=mx): mx=len(j) slide_t=j.title() title.text= slide_t content = slide.shapes.placeholders[1] tf = content.text_frame for i in alist: para=tf.add_paragraph() para.text=i para.level=1 prs.save(presentation) else: image_slide_layout = prs.slide_layouts[8] slide = prs.slides.add_slide(image_slide_layout) background=slide.background 
fill=background.fill fill.gradient() fill.gradient_angle=40 gradient_stops=fill.gradient_stops gradient_stop=gradient_stops[0] color=gradient_stop.color color.theme_color=MSO_THEME_COLOR.LIGHT_1 #title = slide.shapes.title #title.text="Sub2" content = slide.shapes.placeholders[1] im=Image.open(p) width,height= im.size content.height= height content.width= width content.insert_picture(p) content = slide.shapes.placeholders[0] tf = content.text_frame for i in alist: para=tf.add_paragraph() para.text=i para.level=1 para.alignment=PP_ALIGN.CENTER #left = Inches(6) #top = Inches(3) #height = Inches(2) #pic = slide.shapes.add_picture(p, left, top, height=height) prs.save(presentation)
def segmentsent(self, text):
    """Split *text* into sentences using the English DeepSegment model."""
    return DeepSegment('en').segment(text)
# Demo: sentence-segment an unpunctuated lecture transcript.
from deepsegment import DeepSegment

# Load the English segmentation model once.
seg = DeepSegment('en')
# ``lt`` receives the list of sentences DeepSegment detects in the transcript.
lt = seg.segment(
    'today i have to talk about IC 741 it has 14 pins pin number 2 is missing in the slide please note '
)
"""Micro-benchmark DeepSegment segmentation throughput at several batch sizes."""
from json import dump
from time import time

from deepsegment import DeepSegment

model = DeepSegment("en")

example = [
    "I was hungry i ordered a pizza and i went to the movies which movie did you go to i watched dark knight rises oh how was it it was a good movie yeah thought so"
]

# Warmup: the first calls carry model/graph initialisation cost, so run a few
# segmentations before timing anything.
for _ in range(3):
    print("Expected result:", model.segment(example, batch_size=1))
# Expected result is [['I was hungry', 'i ordered a pizza and i went to the movies', 'which movie did you go to', 'i watched dark knight rises', 'oh how was it', 'it was a good movie', 'yeah thought so']]

in_data = list(example * 8192)

for batch_size in [1, 32, 128]:
    # Persist each batch's input for reproducibility.  The original passed an
    # anonymous open() handle straight to dump(), leaking the file descriptor;
    # the context manager guarantees the file is flushed and closed.
    with open(f"{batch_size}.json", "w") as fp:
        dump({'data': in_data[:batch_size]}, fp)

    start = time()
    results = model.segment(in_data, batch_size)
    end = time()
    print(
        f"\nBatch Size:{batch_size} Total Time:{end - start} per {len(in_data)} examples."
    )
# NOTE(review): this chunk starts mid-list — the opening bracket of the string
# list (``li_B``) lies before the visible lines; reproduced byte-for-byte.
# The word-wrapped transcript lines are joined, re-segmented into sentences by
# ``segmenter`` (defined elsewhere), joined with ". ", and sent to the IBM
# Watson ``tone_analyzer``; the resulting tone analysis is printed both raw
# and JSON-pretty-printed.
'it has virtually eradicated gun crime', 'after a mass shooting in 1996 Australia', 'introduced an effective buyback scheme', 'of firearms in the 20 years following', 'the bag there was an accelerated decline', 'in total gun deaths but in America the', 'House of Representatives has not voted', 'on a single measure to prevent gun', 'violence and in some states such as', 'Texas where students at public colleges', 'can now carry concealed handguns the law', 'has actually loosened easy access to', 'firearms will continue to be the main', "driver of America's gun debt" ] segmented_li = segmenter.segment(" ".join(li_B)) text_B = ". ".join(segmented_li) tone_analysis_B = tone_analyzer.tone( { 'text': text_B }, content_type='application/json').get_result() print("B: \n") print(tone_analysis_B) print("\n") print(json.dumps(tone_analysis_B, indent=2)) #NRC Emotions