def updateCharacterList(self):
    """Update character list based on Charnetto output.

    Runs the Charnetto pipeline (entity extraction, tag unification,
    parent concatenation) on the content of the input segments and stores
    the results in ``self.char_df``, ``self.char_list``,
    ``self.characters`` and ``self.cachedCaracters``.

    Does nothing when ``self.model`` or ``self.inputSeg`` is falsy.
    """
    # Sanity checks...
    if not self.model or not self.inputSeg:
        return

    # Init UI...
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=4)

    try:
        # Get input strings...
        strings = [segment.get_content() for segment in self.inputSeg]
        progressBar.advance()

        # Extract character tokens...
        # TODO deal with \n in names
        self.char_df = charnetto.extract_spacy_df(strings, self.nlp)
        progressBar.advance()

        # Unify spaCy tags to match those of flair...
        self.char_df = charnetto.unify_tags(self.char_df)
        progressBar.advance()

        # Collapse characters whose name is the prefix of another.
        self.char_list = charnetto.concatenate_parents(
            self.char_df, min_occ=1
        )

        # Build char list.
        self.characters = [", ".join(char) for char in self.char_list]
        progressBar.advance()
    finally:
        # Reset UI even when the pipeline raises, so that the widget is
        # never left permanently disabled.
        progressBar.finish()
        self.controlArea.setDisabled(False)

    # Cache character list for resetting if needed.
    self.cachedCaracters = self.characters[:]
def sendData(self):
    """Compute result of widget processing and send to output.

    Builds one output segment per row of ``self.char_df`` tagged "PER",
    annotated with the row's "alias" under the key "id", and sends the
    resulting segmentation on the "Character segmentation" channel.

    When no model is loaded or there is no input, the outputs are reset
    through ``self.sendNoneToOutputs()`` and nothing else happens.
    """
    # Check that there's a model...
    if not self.model:
        self.noLanguageModelWarning()
        self.sendNoneToOutputs()
        return

    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input.", "warning")
        self.sendNoneToOutputs()
        return

    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )

    # Disable control area and initialize progress bar...
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.char_df))

    # Get start and end pos of concatenated input segments (the offsets
    # leave room for one separator character between segments)...
    startPositions = [0]
    endPositions = list()
    numSegments = len(self.inputSeg)
    for idx in range(1, numSegments):
        prevSegLen = len(self.inputSeg[idx-1].get_content())
        startPositions.append(startPositions[-1] + prevSegLen + 1)
        endPositions.append(startPositions[-1] - 1)
    endPositions.append(
        startPositions[-1] + len(self.inputSeg[-1].get_content()) + 1
    )

    # Get or update character aliases...
    find_pairs = sys.modules['charnetto.find_pairs']
    characters = [entry.split(", ") for entry in self.characters]
    find_pairs.map_names(self.char_df, characters)

    # Initializations...
    charSegments = list()
    currentSegmentIdx = 0

    # For each character token in Charnetto's output...
    for _, charToken in self.char_df.iterrows():

        # The progress bar is sized to the number of rows, so tick once
        # per row, including the rows skipped below.
        progressBar.advance()

        # Skip non-PER named entities.
        if charToken["tag"] != "PER":
            continue

        # Get index of containing segment...
        while charToken["end_pos"] > endPositions[currentSegmentIdx]:
            currentSegmentIdx += 1

        # Create segment for char with its actual coordinates...
        strIndex = self.inputSeg[currentSegmentIdx].str_index
        offset = startPositions[currentSegmentIdx]
        start = charToken["start_pos"] - offset
        end = charToken["end_pos"] - offset
        annotations = {"id": charToken["alias"]}
        charSegments.append(Segment(strIndex, start, end, annotations))

    # Send output...
    outputSegmentation = Segmentation(charSegments, label=self.captionTitle)
    self.send("Character segmentation", outputSegmentation, self)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output." % len(outputSegmentation)
    message = pluralize(message, len(outputSegmentation))
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()
def searchFunction(self):
    """Search the Genius website for ``self.newQuery``.

    Queries ``int(self.nbr_results) // 10`` result pages through
    ``self.url_request``, stores the hits in ``self.searchResults`` (a dict
    keyed by a running 1-based id, each value holding 'artist',
    'artist_id', 'path' and 'title') and refreshes ``self.titleLabels``
    with one "title - artist" string per hit.

    With an empty query, only a warning is displayed in the info box.
    """
    query_string = self.newQuery

    if query_string == "":
        self.infoBox.setText("You didn't search anything", "warning")
        return

    result_list = {}
    result_id = 0

    # Integer division: in Python 3, "/" yields a float, which is not a
    # valid iteration count for the progress bar. The loop below visits
    # the same pages either way (presumably 10 hits per page -- confirm
    # against the Genius API).
    page_max = int(self.nbr_results) // 10

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=page_max)

    try:
        for page in range(1, page_max + 1):
            values = {'q': query_string, 'page': page}
            data = urllib.parse.urlencode(values)
            query_url = 'http://api.genius.com/search?' + data
            json_obj = self.url_request(query_url)
            body = json_obj["response"]["hits"]

            # Each result is stored in a dictionary with its title,
            # artist's name, artist's ID and URL path
            for result in body:
                result_id += 1
                hit = result["result"]
                result_list[result_id] = {
                    'artist': hit["primary_artist"]["name"],
                    'artist_id': hit["primary_artist"]["id"],
                    'path': hit["path"],
                    'title': hit["title"],
                }

            # 1 tick on the progress bar of the widget
            progressBar.advance()

        # Store the results list
        self.searchResults = result_list

        # Reset and clear the visible widget list
        del self.titleLabels[:]

        # Update the results list with the search results
        # in order to display them
        for entry in self.searchResults.values():
            self.titleLabels.append(entry["title"] + " - " + entry["artist"])

        # Self-assignment kept from the original code; presumably it
        # triggers the GUI refresh of the bound list box -- confirm.
        self.titleLabels = self.titleLabels

        self.clearButton.setDisabled(False)
        self.addButton.setDisabled(self.selectedTitles == list())
    finally:
        # Clear progress bar and re-enable the UI, even when a request
        # failed, so that the widget is never left disabled.
        progressBar.finish()
        self.controlArea.setDisabled(False)
def sendData(self):
    """Compute result of widget processing and send to output.

    Downloads the lyrics of every song in ``self.myBasket`` through
    ``self.html_to_text``, wraps each in an ``Input``, annotates the
    resulting segments with a copy of the corresponding basket entry and
    sends the segmentation on the "Lyrics importation" channel.

    With an empty basket only a warning is shown. If any download fails,
    an error is shown and nothing is sent.
    """
    # Skip if title list is empty:
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some songs first",
            "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    # Attempt to connect to Genius and retrieve lyrics...
    song_content = list()
    annotations = list()
    try:
        for song in self.myBasket:
            # Each song is a dict holding (at least) the 'path' of its
            # page; the whole dict later becomes the annotations.
            page_url = "http://genius.com" + song['path']
            lyrics = self.html_to_text(page_url)
            song_content.append(lyrics)
            annotations.append(song.copy())
            # 1 tick on the progress bar of the widget
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    # (Exception rather than a bare except, so that KeyboardInterrupt and
    # SystemExit are not swallowed.)
    except Exception:
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from Genius website.",
                             "error")
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store downloaded lyrics strings in input objects...
    for song in song_content:
        newInput = Input(song, self.captionTitle)
        self.createdInputs.append(newInput)

    # If there's only one song, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Lyrics importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    Counts the contents of the input segments, learns signatures, stems
    and suffixes with ``lxa5crab``, parses every input segment and sends
    a copy of the input segmentation annotated with "stem", "suffix" and
    "signature" on the "Morphologically analyzed data" channel. The
    intermediate results are stored in ``self.morphology``.

    ``None`` is sent when there is no input or when signature learning
    raises ``ValueError`` (whose message is then shown as a warning).
    """
    # Clear morphology...
    self.morphology = dict()

    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        self.send("Morphologically analyzed data", None, self)
        self.updateGUI()
        return

    # Perform morphological analysis...

    # Initialize progress bar.
    self.infoBox.setText(
        u"Processing, please wait (word count)...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=100)

    # Word count...
    wordCounts = collections.Counter(
        segment.get_content() for segment in self.inputSeg
    )
    self.morphology["wordCounts"] = wordCounts
    self.infoBox.setText(
        u"Processing, please wait (signature extraction)...",
        "warning",
    )
    progressBar.advance(5)   # 5 ticks on the progress bar...

    # Learn signatures...
    try:
        # NOTE: this sets a module-level parameter of lxa5crab.
        lxa5crab.crab_nebula.MIN_STEM_LEN = self.minStemLen
        signatures, stems, suffixes = lxa5crab.find_signatures(wordCounts)
        self.morphology["signatures"] = signatures
        self.morphology["stems"] = stems
        self.morphology["suffixes"] = suffixes
    except ValueError as e:
        self.infoBox.setText(str(e), "warning")
        self.send("Morphologically analyzed data", None, self)
        self.controlArea.setDisabled(False)
        progressBar.finish()   # Clear progress bar.
        self.morphology = dict()
        self.updateGUI()
        return
    self.infoBox.setText(
        u"Processing, please wait (word parsing)...",
        "warning",
    )
    progressBar.advance(80)

    # Parse words...
    parser = lxa5crab.build_parser(wordCounts, signatures, stems, suffixes)
    self.morphology["parser"] = parser
    newSegments = list()
    num_analyzed_words = 0
    for segment in self.inputSeg:
        # Only the first parse of each word is used.
        bestParse = parser[segment.get_content()][0]
        newSegment = segment.deepcopy()
        if bestParse.signature:
            num_analyzed_words += 1
        newSegment.annotations.update(
            {
                "stem": bestParse.stem,
                "suffix": bestParse.suffix
                          if len(bestParse.suffix) else "NULL",
                "signature": bestParse.signature,
            }
        )
        newSegments.append(newSegment)
    self.send(
        "Morphologically analyzed data",
        Segmentation(newSegments, self.captionTitle),
        self,
    )
    self.updateGUI()
    progressBar.advance(15)

    # Set status to OK and report data size (guarding against an empty
    # input segmentation, which would otherwise divide by zero)...
    numSegments = len(self.inputSeg)
    if numSegments:
        percentAnalyzed = num_analyzed_words / numSegments * 100
    else:
        percentAnalyzed = 0.0
    message = "%i segment@p sent to output (%.2f%% analyzed)." % (
        numSegments, percentAnalyzed)
    message = pluralize(message, numSegments)
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()
def summarize(self, cv, content):
    """Build an extractive summary of ``content``.

    Each sentence found by ``self.nlp`` is scored as the sum of the
    relative frequencies of its known words divided by its number of
    tokens; the best-scoring sentences are kept, in document order.

    Args:
        cv: not used by this method (the vectorizer actually used is
            ``self.cv``); kept so that existing callers keep working.
        content: the text to summarize.

    Returns:
        A pair of ``Input`` objects: the plain-text summary, and an HTML
        rendering of the whole text in which the summary sentences are
        highlighted.

    Raises:
        ValueError: if ``self.method`` is not one of the two known
            selection methods.
    """
    progressBar = ProgressBar(self, iterations=3)

    doc = self.nlp(content)
    sentences = list(doc.sents)

    corpus = [sent.text.lower() for sent in sentences]
    cv_fit = self.cv.fit_transform(corpus)

    # Count unique words and how many times they appear.
    # get_feature_names() was removed from recent scikit-learn versions
    # in favour of get_feature_names_out(); support both.
    get_names = getattr(self.cv, "get_feature_names_out", None)
    if get_names is None:
        get_names = self.cv.get_feature_names
    word_list = get_names()
    count_list = cv_fit.toarray().sum(axis=0)
    word_frequency = dict(zip(word_list, count_list))

    # Get relative frequency of words with respect to the most frequent
    higher_frequency = max(word_frequency.values())
    word_frequency = {
        word: freq / higher_frequency
        for word, freq in word_frequency.items()
    }

    progressBar.advance()

    # Score sentences: sum of the frequencies of known words, normalized
    # by the number of tokens. Sentences without any known word get no
    # score at all.
    sentence_rank = {}
    for sent in sentences:
        tokens = [word.text.lower() for word in sent]
        scores = [word_frequency[t] for t in tokens if t in word_frequency]
        if scores:
            sentence_rank[sent] = sum(scores) / len(tokens)

    progressBar.advance()

    # This is where we can choose how many sentences we want to keep for
    # the summary, depending on the chosen method: sentences or %
    if self.method == "Number of sentences":
        numKept = self.numSents
    elif self.method == "Percentage of text lenght":
        numKept = int(round(self.percentage * len(sentence_rank) / 100))
    else:
        progressBar.finish()
        raise ValueError(
            "Unknown summarization method: %r" % (self.method,)
        )

    # Select by rank rather than by score value, so that sentences with
    # equal scores cannot make the summary longer than requested (the
    # sort is stable: ties are resolved in document order).
    ranked = sorted(sentence_rank, key=sentence_rank.get, reverse=True)
    selected = set(ranked[:numKept])
    summary = [sent for sent in sentences if sent in selected]

    progressBar.advance()

    # Summary contains spaCy spans that must be converted to strings
    # and joined in a single string
    resume = " ".join(str(sent) for sent in summary)

    # Create HTML resume
    html_parts = list()
    for sent in sentences:
        text = str(sent)
        if sent in selected:
            text = "<b style='color:blue'>" + text + "</b>"
        html_parts.append(text)

    html_resume = "<!DOCTYPE html>\n<html>\n<body>\n" + " ".join(
        html_parts) + "\n</body>\n</html>"

    progressBar.finish()

    # Create output segmentation from summary
    return Input(resume), Input(html_resume)
def sendData(self):
    """Compute result of widget processing and send to output.

    For every corpus URL in ``self.importedCorpora``, the zipped corpus
    is read from the local cache folder or, failing that, downloaded (and
    cached on a best-effort basis). Each zipped file becomes an ``Input``
    annotated with its path, the corpus/lang/pid attributes of its CHAT
    element and, when exactly one target child is listed, data about that
    child. The file segmentation is sent on "Files"; utterance and word
    segmentations are built and sent on "Utterances" and "Words" when
    ``self.outputUtterances`` / ``self.outputWords`` are set (``None`` is
    sent otherwise).
    """
    if not self.importedCorpora:
        self.infoBox.setText("Please add a corpus to the selection.",
                             "warning")
        self.send("Files", None, self)
        self.send("Utterances", None, self)
        self.send("Words", None, self)
        return

    # Clear created Inputs and initialize progress bar...
    self.clearCreatedInputs()
    numberOfSteps = 2 if self.outputUtterances else 1
    numberOfSteps += 2 if self.outputWords else 0
    self.infoBox.setText(
        "(1/%i) Retrieving data, please wait..." % numberOfSteps,
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.importedCorpora))

    annotations = list()

    # Folder containing this source file (root of the cache folder).
    basepath = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))

    # Iterate over corpora...
    for importedCorpus in self.importedCorpora:

        corpus = importedCorpus.split("/")[-1]

        corpusFilepath = os.path.normpath(
            os.path.join(
                basepath,
                self.__class__.cachedFoldername,
                importedCorpus[len(self.__class__.baseUrl):],
            ))

        # Try to retrieve corpus from cache...
        try:
            myZip = zipfile.ZipFile(corpusFilepath)

        # Else (file missing, unreadable or not a valid zip) try to
        # download (and cache) requested zip file...
        except (IOError, zipfile.BadZipFile):
            try:
                response = requests.get(importedCorpus)
                myZip = zipfile.ZipFile(io.BytesIO(response.content))

                # Caching is best-effort: failures are ignored.
                try:
                    os.makedirs(
                        os.path.dirname(corpusFilepath), exist_ok=True)
                    with open(corpusFilepath, "wb") as outputFile:
                        outputFile.write(response.content)
                except OSError:
                    pass

            # If an error occurs (e.g. connection error)...
            except Exception:

                # Set Info box and widget to "error" state.
                self.infoBox.setText(
                    "Couldn't download corpus %s from CHILDES website." %
                    corpus, "error")

                # Reset output channel.
                self.send("Files", None, self)
                self.send("Utterances", None, self)
                self.send("Words", None, self)
                progressBar.finish()
                self.controlArea.setDisabled(False)
                return

        # Create Input for each zipped file and store annotations (the
        # archive is closed once all its files have been read)...
        with myZip:
            for zippedFile in myZip.infolist():
                file_content = myZip.read(zippedFile).decode('utf-8')

                # If word segmentation is requested...
                if self.outputWords:
                    # Implement replacements.
                    file_content = re.sub(
                        r"<w.+?(<replacement.+</replacement>).*?</w>",
                        r"\1",
                        file_content,
                    )
                    # Prepend pre-clitics.
                    file_content = re.sub(
                        r"(<mor .+?)(<mor-pre>.+</mor-pre>)",
                        r"\2\1",
                        file_content,
                    )
                    # Move <gra> into <mw>.
                    file_content = re.sub(
                        r"(</mw>)(<gra.+?/>)",
                        r"\2\1",
                        file_content,
                    )

                newInput = Input(file_content, self.captionTitle + "_files")
                self.createdInputs.append(newInput)
                chatSeg = Segmenter.import_xml(newInput, "CHAT")
                fileAnnotations = dict()
                annotations.append(fileAnnotations)
                fileAnnotations["file_path"] = zippedFile.filename
                for key in ["Corpus", "Lang", "PID"]:
                    try:
                        fileAnnotations[key.lower()] = \
                            chatSeg[0].annotations[key]
                    except KeyError:
                        pass
                participantListSeg = Segmenter.import_xml(
                    newInput, "Participants")
                recodedInput, _ = Segmenter.recode(
                    participantListSeg,
                    [(re.compile("/>"), "> </participant>")])
                participantSeg = Segmenter.import_xml(recodedInput,
                                                      "participant")
                targetChildData = list()
                for participant in participantSeg:
                    if participant.annotations["role"] != "Target_Child":
                        continue
                    childData = dict()
                    targetChildData.append(childData)
                    if "age" in participant.annotations:
                        childData["target_child_age"] = \
                            participant.annotations["age"]
                        age_parse = re.search(
                            r"(\d+)Y(\d+)M(\d+)D",
                            participant.annotations["age"],
                        )
                        if age_parse:
                            childData["target_child_years"] = \
                                age_parse.group(1)
                            # Cumulated months, then cumulated days
                            # (counting 30 days per month).
                            months = int(age_parse.group(2)) \
                                + 12 * int(age_parse.group(1))
                            childData["target_child_months"] = \
                                '%02d' % months
                            days = int(age_parse.group(3)) \
                                + 30 * months
                            childData["target_child_days"] = \
                                '%02d' % days
                    if "id" in participant.annotations:
                        childData["target_child_id"] = \
                            participant.annotations["id"]
                    if "sex" in participant.annotations:
                        childData["target_child_sex"] = \
                            participant.annotations["sex"]

                # Child data are only used when unambiguous.
                if len(targetChildData) == 1:
                    fileAnnotations.update(targetChildData[0])

        progressBar.advance()

    # If there's only one file, the widget's output is the created Input...
    if len(self.createdInputs) == 1:
        self.fileSegmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.fileSegmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle + "_files",
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.fileSegmentation):
        segment.annotations.update(annotations[idx])
        self.fileSegmentation[idx] = segment

    # Terminate progress bar...
    progressBar.finish()

    message = "%i file@p" % len(self.fileSegmentation)
    message = pluralize(message, len(self.fileSegmentation))
    self.send("Files", self.fileSegmentation, self)

    # Build utterance segmentation if needed...
    if self.outputUtterances:
        self.infoBox.setText(
            "(2/%i) Building utterance segmentation, please wait..."
            % numberOfSteps,
            "warning",
        )
        progressBar = ProgressBar(self,
                                  iterations=len(self.fileSegmentation))
        self.utteranceSegmentation = Segmenter.import_xml(
            self.fileSegmentation,
            "u",
            progress_callback=progressBar.advance,
            label=self.captionTitle + "_utterances",
        )
        progressBar.finish()
        message += " and " if not self.outputWords else ", "
        message += "%i utterance@p" % len(self.utteranceSegmentation)
        message = pluralize(message, len(self.utteranceSegmentation))
        self.send("Utterances", self.utteranceSegmentation, self)
    else:
        self.send("Utterances", None, self)

    # Build word segmentation if needed...
    if self.outputWords:
        self.infoBox.setText(
            "(%i/%i) Building word segmentation, please wait..."
            % (2 + (1 if self.outputUtterances else 0), numberOfSteps),
            "warning",
        )
        # Only rely on the utterance segmentation when it has just been
        # built; otherwise a stale one left over from a previous run
        # could be picked up.
        if self.outputUtterances:
            baseSegmentation = self.utteranceSegmentation
        else:
            baseSegmentation = self.fileSegmentation
        progressBar = ProgressBar(self,
                                  iterations=2 * len(baseSegmentation))
        wordSegmentation = Segmenter.import_xml(
            baseSegmentation,
            "w",
            progress_callback=progressBar.advance,
        )
        mwSegmentation = Segmenter.import_xml(
            baseSegmentation,
            "mw",
            progress_callback=progressBar.advance,
        )

        # Analyze words to extract annotations...
        self.infoBox.setText(
            "(%i/%i) Extracting word annotations, please wait..."
            % (3 + (1 if self.outputUtterances else 0), numberOfSteps),
            "warning",
        )
        progressBar.finish()
        progressBar = ProgressBar(self, iterations=len(wordSegmentation))
        wordSegments = list()
        for word in wordSegmentation:
            mws = word.get_contained_segments(mwSegmentation)
            if mws:
                # One output segment per <mw> contained in the word.
                for mw in mws:
                    wordSegment = word.deepcopy()
                    wordSegment.annotations.update(
                        self.extractWordAnnotations(mw))
                    wordSegments.append(wordSegment)
            else:
                wordSegments.append(word)
            progressBar.advance()

        self.wordSegmentation = Segmentation(
            wordSegments,
            label=self.captionTitle + "_words",
        )

        message += " and %i word@p" % len(self.wordSegmentation)
        message = pluralize(message, len(self.wordSegmentation))
        self.send("Words", self.wordSegmentation, self)
    else:
        self.send("Words", None, self)

    # Set status to OK and report data size...
    message += " sent to output."
    message = pluralize(message, len(self.fileSegmentation))
    self.infoBox.setText(message)
    progressBar.finish()

    self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    Runs ``self.nlp`` on the content of every input segment and creates
    one output segment per token, carrying the annotations of the input
    segment plus every attribute listed in ``RELEVANT_KEYS`` whose value
    is not ``None``. The result is sent on the "Linguistically analyzed
    data" channel (``None`` is sent when there is no input).
    """
    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        self.send("Linguistically analyzed data", None, self)
        return

    # Initialize progress bar.
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.inputSeg))

    tokenizedSegments = list()

    # Process each input segment...
    for segment in self.inputSeg:

        # Input segment attributes...
        inputContent = segment.get_content()
        inputAnnotations = segment.annotations
        inputString = segment.str_index
        inputStart = segment.start or 0

        # NLP analysis...
        doc = self.nlp(inputContent)

        # Process each token in input segment...
        for token in doc:
            tokenAnnotations = inputAnnotations.copy()
            for key in RELEVANT_KEYS:
                value = getattr(token, key)
                if value is not None:
                    tokenAnnotations[key] = value
            # Token offsets are relative to the segment's content, hence
            # the shift by the segment's own start.
            tokenStart = inputStart + token.idx
            tokenizedSegments.append(
                Segment(
                    str_index=inputString,
                    start=tokenStart,
                    end=tokenStart + len(token),
                    annotations=tokenAnnotations,
                ))

        progressBar.advance()

    outputSeg = Segmentation(tokenizedSegments, self.captionTitle)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output." % len(outputSeg)
    message = pluralize(message, len(outputSeg))
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Send data to output...
    self.send("Linguistically analyzed data", outputSeg, self)

    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Send data from website springfieldspringfield.

    Downloads the script of every movie in ``self.myBasket`` (using the
    link stored in ``self.path_storage``), wraps each in an ``Input``,
    annotates the resulting segments with "Movie Title" and "Year of
    release" and sends the segmentation on the "Movie Scripts
    importation" channel.

    With an empty basket, a warning is shown and ``None`` is sent. If any
    download fails, an error is shown and nothing is sent.
    """
    # Skip if title list is empty:
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some movies first",
            "warning")
        self.segmentation = None
        self.send("Movie Scripts importation", self.segmentation, self)
        # Nothing to process: stop here.
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    annotations = list()
    script_list = list()
    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    # This part of code is what fetches the actual script
    try:
        for movie in self.myBasket:
            # Each movie that is in the corpus is split into title and
            # year (rsplit makes sure to only split last occurrence)
            # which will become annotations
            future_annotation = movie.rsplit('(', 1)
            movie_title = future_annotation[0]
            # Drop the last character (expected to be the closing
            # parenthesis).
            movie_year = future_annotation[-1][:-1]
            annotations.append({
                "Movie Title": movie_title,
                "Year of release": movie_year,
            })

            # link_end and page_url are the two variables that will have
            # to be changed in case scripts need to be taken from
            # elsewhere
            link_end = self.path_storage[movie]
            page_url = "https://www.springfieldspringfield.co.uk/movie_script.php?movie=" + link_end
            with urllib.request.urlopen(page_url) as page:
                soup = BeautifulSoup(page, 'html.parser')

            # This is what grabs the movie script
            script = soup.find("div", {"class": "movie_script"})
            script_list.append(script.text)

            # 1 tick on the progress bar of the widget
            progressBar.advance()

    # Exception rather than a bare except, so that KeyboardInterrupt and
    # SystemExit are not swallowed.
    except Exception:
        self.infoBox.setText(
            "Couldn't download data from SpringfieldSpringfield website.",
            "error")
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store downloaded script strings in input objects...
    for script in script_list:
        newInput = Input(script, self.captionTitle)
        self.createdInputs.append(newInput)

    # If there's only one script, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Movie Scripts importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def send_data(self):
    """Compute result of widget processing and send to output.

    Applies the topic modelling method selected in ``self.method``
    (Latent Dirichlet allocation, Latent semantic indexing or
    Correspondence analysis) to ``self.inputTable`` with
    ``self.numTopics`` topics. A term-topic table and a document-topic
    table are built and sent, both as Textable tables and as Orange
    tables, and ``self.listEntries`` is filled with one description per
    topic.

    Without input, ``None`` is sent on all four channels.

    Raises:
        ValueError: if ``self.method`` is none of the three methods.
    """
    # Check that there's a table in input...
    if self.inputTable is None:
        self.infoBox.setText(
            "Widget needs input.",
            "warning"
        )
        self.send("Term-topic Textable table", None)
        self.send("Document-topic Textable table", None)
        self.send("Term-topic Orange table", None)
        self.send("Document-topic Orange table", None)
        self.listEntries = list()
        return

    # Initialize progress bar.
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(
        self,
        iterations=1    # TODO
    )

    # Convert input table to gensim dictionary.
    dictionary, corpus = pivot_crosstab_to_gensim(self.inputTable)

    # Apply topic modelling...

    # Case 1: LDA...
    if self.method == "Latent Dirichlet allocation":

        model = models.LdaModel(
            corpus,
            id2word=dictionary,
            num_topics=self.numTopics,
        )

        # Create segment-topic PivotCrosstab table.
        values = dict()
        terms = list()
        for topic in range(self.numTopics):
            topic_terms = model.get_topic_terms(
                topic,
                len(self.inputTable.col_ids),
            )
            for term, score in topic_terms:
                values[(dictionary[term], topic)] = score
            terms.append(
                [
                    dictionary[t]
                    for t, _ in topic_terms[:MAX_NUM_DISPLAYED_TERMS]
                ]
            )
        segmentTopicTable = PivotCrosstab(
            row_ids=self.inputTable.col_ids[:],
            col_ids=list(range(self.numTopics)),
            values=values,
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id='__unit__',
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
        )

        # Fill listbox...
        newListEntries = list()
        for topicNum in range(self.numTopics):
            displayedTerms = ", ".join(terms[topicNum])
            if len(self.inputTable.col_ids) > MAX_NUM_DISPLAYED_TERMS:
                displayedTerms += ", ..."
            listEntry = "%i. %s" % (
                topicNum+1,
                displayedTerms,
            )
            newListEntries.append(listEntry)
        self.listEntries = newListEntries

        # Create context-topic PivotCrosstab table...
        corpus_lda = model[corpus]
        values = dict()
        for row_idx, row in enumerate(self.inputTable.row_ids):
            lda_doc = corpus_lda[row_idx]
            for topic, score in lda_doc:
                values[(row, topic)] = score
        contextTopicTable = PivotCrosstab(
            row_ids=self.inputTable.row_ids[:],
            col_ids=list(range(self.numTopics)),
            values=values,
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id='__context__',
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
            missing=0,
        )

    # Case 2: LSI...
    elif self.method == "Latent semantic indexing":

        model = models.LsiModel(
            corpus,
            id2word=dictionary,
            num_topics=self.numTopics,
        )

        # Create segment-topic PivotCrosstab table.
        segmentTopicTable = PivotCrosstab.from_numpy(
            row_ids=self.inputTable.col_ids[:],
            col_ids=list(range(self.numTopics)),
            np_array=model.projection.u,
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id='__unit__',
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
        )

        # Fill listbox...
        colIds = np.array(self.inputTable.col_ids)
        newListEntries = list()

        # Subtask: compute total inertia, i.e. sum of eigenvalues of
        # doc-term matrix multiplied by its transposed. That sum is the
        # trace of the product, i.e. the sum of the squared entries of
        # the doc-term matrix, so no eigendecomposition is needed (and
        # the result is always real)...
        rect_matrix = self.inputTable.to_numpy()
        total_inertia = np.sum(np.square(rect_matrix))

        for topicNum in range(self.numTopics):
            # Proportion of inertia is SQUARE of singular value divided by
            # total inertia, because n-th singular value = square root of
            # n-th eigenvalue (cf. compute total inertia above)...
            propInertia = model.projection.s[topicNum] ** 2 / total_inertia
            scores = model.projection.u[:, topicNum]
            sortedTerms = colIds[scores.argsort()[::-1]]
            if len(colIds) > MAX_NUM_DISPLAYED_TERMS:
                # Show both ends of the ranking.
                displayedTerms = ", ".join(
                    sortedTerms[:MAX_NUM_DISPLAYED_TERMS//2]
                )
                displayedTerms += ", ..., "
                displayedTerms += ", ".join(
                    sortedTerms[-MAX_NUM_DISPLAYED_TERMS//2:]
                )
            else:
                displayedTerms = ", ".join(sortedTerms)
            listEntry = "%i. (%.2f%%) %s" % (
                topicNum+1,
                propInertia*100,
                displayedTerms,
            )
            newListEntries.append(listEntry)
        self.listEntries = newListEntries

        # Create context-topic PivotCrosstab table...
        contextTopicMatrix = corpus2dense(
            model[corpus],
            len(model.projection.s)
        ).T / model.projection.s
        values = dict()
        for row_idx, row in enumerate(contextTopicMatrix):
            for topic, val in enumerate(row):
                values[(self.inputTable.row_ids[row_idx], topic)] = val
        contextTopicTable = PivotCrosstab(
            row_ids=self.inputTable.row_ids[:],
            col_ids=list(range(self.numTopics)),
            values=values,
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id='__context__',
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
            missing=0,
        )

    # Case 3: Correspondence analysis...
    elif self.method == "Correspondence analysis":

        ca = correspondence(self.inputTable.to_numpy())

        # Create segment-topic PivotCrosstab table.
        segmentTopicTable = PivotCrosstab.from_numpy(
            row_ids=self.inputTable.col_ids[:],
            col_ids=list(range(self.numTopics)),
            np_array=ca.col_factors[:, range(self.numTopics)],
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id='__unit__',
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
        )

        # Fill listbox...
        colIds = np.array(self.inputTable.col_ids)
        newListEntries = list()
        inertiaOfAxis = ca.inertia_of_axis()
        total_inertia = sum(inertiaOfAxis)
        for topicNum in range(self.numTopics):
            propInertia = inertiaOfAxis[topicNum] / total_inertia
            scores = np.array(ca.col_factors[:, topicNum])
            sortedTerms = colIds[scores.argsort()[::-1]]
            if len(colIds) > MAX_NUM_DISPLAYED_TERMS:
                # Show both ends of the ranking.
                displayedTerms = ", ".join(
                    sortedTerms[:MAX_NUM_DISPLAYED_TERMS//2]
                )
                displayedTerms += ", ..., "
                displayedTerms += ", ".join(
                    sortedTerms[-MAX_NUM_DISPLAYED_TERMS//2:]
                )
            else:
                displayedTerms = ", ".join(sortedTerms)
            listEntry = "%i. (%.2f%%) %s" % (
                topicNum+1,
                propInertia*100,
                displayedTerms,
            )
            newListEntries.append(listEntry)
        self.listEntries = newListEntries

        # Create context-topic PivotCrosstab table (rows are contexts,
        # hence the same '__context__' header as in the other two cases).
        contextTopicTable = PivotCrosstab.from_numpy(
            row_ids=self.inputTable.row_ids[:],
            col_ids=list(range(self.numTopics)),
            np_array=ca.row_factors[:, range(self.numTopics)],
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id='__context__',
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
        )

    # Unknown method: restore the UI and fail explicitly rather than with
    # an unbound variable further down.
    else:
        progressBar.finish()
        self.controlArea.setDisabled(False)
        raise ValueError(
            "Unknown topic modelling method: %r" % (self.method,)
        )

    # Set status to OK and report...
    self.infoBox.setText("Tables correctly sent to output.")

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Send tables...
    self.send("Term-topic Textable table", segmentTopicTable)
    self.send("Document-topic Textable table", contextTopicTable)
    self.send(
        "Term-topic Orange table",
        segmentTopicTable.to_orange_table(),
    )
    self.send(
        "Document-topic Orange table",
        contextTopicTable.to_orange_table(),
    )

    self.sendButton.resetSettingsChangedFlag()
def updateCharacterList(self):
    """Update character list based on Charnet output.

    Loads the model first when ``self.mustLoad`` is set, then runs the
    Charnet pipeline on the space-joined content of the input segments
    and stores the results in ``self.char_df``, ``self.char_list`` and
    ``self.characters``.

    Nothing is computed when there is no input segmentation.
    """
    if self.mustLoad:
        self.loadModel()

    # Without input there is nothing to extract characters from.
    if self.inputSeg is None:
        return

    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=4)

    try:
        string = " ".join(
            segment.get_content() for segment in self.inputSeg
        )
        progressBar.advance()

        self.char_df = charnet.extract_spacy_df(string, self.nlp)  # TODO progress
        progressBar.advance()

        self.char_df = charnet.unify_tags(self.char_df)
        progressBar.advance()

        self.char_list = charnet.concatenate_parents(self.char_df, min_occ=1)
        self.characters = [", ".join(char) for char in self.char_list]
        progressBar.advance()
    finally:
        # Reset UI even when the pipeline raises, so that the widget is
        # never left permanently disabled.
        progressBar.finish()
        self.controlArea.setDisabled(False)
class Treetagger(OWTextableBaseWidget):
    """Orange widget for POS-tagging and lemmatization with Treetagger"""

    name = "Treetagger"
    description = "POS-tagging and lemmatization with Treetagger"
    icon = "icons/treetagger.svg"
    priority = 2003

    inputs = [("Segmentation", Segmentation, "inputData")]
    outputs = [("Tagged data", Segmentation)]

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )

    language = settings.Setting(0)
    replaceUnknown = settings.Setting(False)
    outputFormat = settings.Setting("segment into words")

    want_main_area = False

    # File (in the user data dir) where the Treetagger base dir is saved.
    configFilePath = os.path.normpath(
        appdirs.user_data_dir("textable", "langtech") + "/treetagger_path"
    )

    def __init__(self, *args, **kwargs):
        """Initialize a Treetagger widget and build its control area."""
        super().__init__(*args, **kwargs)

        # Other attributes...
        self.segmentation = None
        self.createdInputs = list()
        self.noLanguageParameterWarning = (
            "Please make sure that at least one language parameter "
            "file is installed in your Treetagger 'lib' directory, "
            "then click 'Reload language parameter files'."
        )
        self.noTreetaggerPathWarning = (
            "Please click 'Locate Treetagger' below and select the "
            "base directory of a valid Treetagger distribution."
        )
        # Prefer an automatically located install, else the saved path.
        self.TreetaggerPath = (
            treetaggerwrapper.locate_treetagger() or
            self.lookupSavedTreetaggerPath()
        )

        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute=u"infoBox",
            sendIfPreCallback=self.updateGUI
        )

        gui.separator(self.controlArea, height=3)

        self.optionsBox = gui.widgetBox(
            self.controlArea,
            u"Options",
        )

        self.languageCombobox = gui.comboBox(
            widget=self.optionsBox,
            master=self,
            value="language",
            items=list(),
            sendSelectedValue=True,
            orientation=u"horizontal",
            label="Input language:",
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"Select the language of the input text."
            ),
        )
        self.languageCombobox.setMinimumWidth(120)

        gui.separator(self.optionsBox, height=3)

        gui.comboBox(
            widget=self.optionsBox,
            master=self,
            value="outputFormat",
            items=[
                "segment into words",
                "add XML tags",
            ],
            sendSelectedValue=True,
            orientation=u"horizontal",
            label="Output format:",
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"Select the format of the output:\n\n"
                u"Segment into words: each word is in a separate segment,\n"
                u"with lemma and POS-tag as annotations.\n\n"
                u"Add XML tags: output segments correspond to input segments\n"
                u"and each word is tagged in XML as a 'w' element with\n"
                u"lemma and POS-tag as attributes."
            ),
        )

        gui.separator(self.optionsBox, height=3)

        gui.checkBox(
            widget=self.optionsBox,
            master=self,
            value="replaceUnknown",
            label="Output token in place of [unknown] lemmas",
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"For out-of-vocabulary words, the word form is used as the\n"
                u"lemma (in place of Treetagger's default 'unknown' code)."
            ),
        )

        gui.rubber(self.controlArea)

        self.sendButton.draw()
        self.infoBox.draw()

        self.locateTreetaggerBox = gui.widgetBox(
            self.controlArea,
            addSpace=False,
        )

        gui.separator(self.locateTreetaggerBox, height=3)

        self.treetaggerButton = gui.button(
            widget=self.locateTreetaggerBox,
            master=self,
            label="Locate Treetagger",
            callback=self.validateTreetagger,
            tooltip=(
                u"Click to select the location of the Treetagger base\n"
                u"directory (containing the 'lib' and 'bin' subdirectories)."
            ),
        )

        self.sendButton.sendIf()

        self.adjustSizeWithTimer()

    def inputData(self, inputData):
        """Process incoming data."""
        self.segmentation = inputData
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def sendData(self):
        """Tag the input segmentation with Treetagger and send the result.

        Sends None on "Tagged data" when the Treetagger path, the language
        parameter files or the input are missing, or when the tagged text
        cannot be re-segmented as XML.
        """
        # Clear created Inputs...
        self.clearCreatedInputs()

        if not self.TreetaggerPath:
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.getAvailableLanguages():
            self.infoBox.setText(self.noLanguageParameterWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.segmentation:
            self.infoBox.setText(u"Widget needs input", "warning")
            self.send("Tagged data", None)
            return

        # Initialize progress bar.
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        self.progressBar = ProgressBar(self, iterations=5)

        try:
            # Create a copy of input seg, storing annotations in temp attr...
            copy_of_input_seg = Segmentation()
            copy_of_input_seg.label = self.segmentation.label
            for segment in self.segmentation:
                attr = " ".join(
                    ["%s='%s'" % item for item in segment.annotations.items()]
                )
                segment.annotations["tt_ax"] = attr
                copy_of_input_seg.append(segment)
            self.progressBar.advance()

            # Dump segmentation in unique string to avoid multiple calls
            # to TT...
            concatenated_text = copy_of_input_seg.to_string(
                formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
                display_all=True,
            )
            self.progressBar.advance()

            # Tag the segmentation contents...
            tagopt = '-token -lemma -sgml -quiet'
            if self.replaceUnknown:
                tagopt += " -no-unknown"
            tagger = treetaggerwrapper.TreeTagger(
                TAGLANG=pycountry.languages.get(name=self.language).alpha_2,
                TAGOPT=tagopt,
                TAGDIR=self.TreetaggerPath,
            )
            tagged_lines = tagger.tag_text(
                concatenated_text,
                notagurl=True,
                notagemail=True,
                notagip=True,
                notagdns=True,
            )
            tagged_input = Input("\n".join(tagged_lines))
            self.createdInputs.append(tagged_input)

            # Re-segment to match the original segmentation structure.
            tagged_segmentation = Segmenter.import_xml(tagged_input, "ax_tt")
            self.progressBar.advance()

            # Replace <unknown> with [unknown], wrap each Treetagger output
            # line in a 'w' element, then escape lemmas that are a bare
            # double quote: the wrapping step would otherwise produce the
            # malformed attribute lemma=""".
            xml_segmentation, _ = Segmenter.recode(
                tagged_segmentation,
                substitutions=[
                    (re.compile(r"<unknown>"), "[unknown]"),
                    (
                        re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                        '<w lemma="&3" pos-tag="&2">&1</w>'
                    ),
                    (re.compile(r'"""'), '"&quot;"'),
                    (re.compile(r'^\n|\n$'), ''),
                ],
            )

            # Segment into individual tokens if XML output option is
            # disabled...
            if self.outputFormat == "add XML tags":
                output_segmentation = xml_segmentation
            else:
                try:
                    output_segmentation = Segmenter.import_xml(
                        xml_segmentation,
                        "w"
                    )
                except ValueError:
                    self.infoBox.setText(
                        "Please check that either the input contains well-formed "
                        "XML, or it doesn't contain instances of '<' and '\x3e'",
                        "error"
                    )
                    self.send("Tagged data", None)
                    return
        finally:
            # Restore the UI on every exit path, including unexpected errors.
            self.progressBar.finish()
            self.controlArea.setDisabled(False)

        output_segmentation.label = self.captionTitle
        message = u'%i segment@p sent to output.' % len(output_segmentation)
        message = pluralize(message, len(output_segmentation))
        self.infoBox.setText(message)
        self.send('Tagged data', output_segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def updateGUI(self):
        """Update GUI state"""
        if self.TreetaggerPath:
            self.optionsBox.setDisabled(False)
            self.locateTreetaggerBox.setVisible(False)
            self.languageCombobox.clear()
            languages = self.getAvailableLanguages()
            if not languages:
                self.infoBox.setText(
                    self.noLanguageParameterWarning,
                    "warning"
                )
                self.optionsBox.setDisabled(True)
                self.locateTreetaggerBox.setVisible(True)
                self.treetaggerButton.setText(
                    "Reload language parameter files"
                )
            else:
                self.language = self.language or languages[0]
        else:
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.optionsBox.setDisabled(True)
            self.locateTreetaggerBox.setVisible(True)
        self.adjustSizeWithTimer()

    def getAvailableLanguages(self):
        """Return the names of the languages usable with this Treetagger.

        A language is kept when a TreeTagger can be instantiated for its
        code and pycountry knows its name. Each kept language is also
        appended to the language combobox.
        """
        languages = list()
        for lang_code in sorted(treetaggerwrapper.g_langsupport):
            if lang_code.startswith("__"):
                continue
            try:
                treetaggerwrapper.TreeTagger(
                    TAGLANG=lang_code,
                    TAGDIR=self.TreetaggerPath,
                )
                language = pycountry.languages.get(alpha_2=lang_code).name
            except Exception:
                # Best effort: a language that can't be loaded or named is
                # simply not offered.
                continue
            self.languageCombobox.addItem(language)
            languages.append(language)
        return languages

    def lookupSavedTreetaggerPath(self):
        """Look for a saved Treetagger base dir path in app data

        Returns the saved path if it is still valid, else None. A saved
        path that is no longer valid is deleted from app data.
        """
        configFilePath = self.__class__.configFilePath
        if not os.path.exists(configFilePath):
            return None
        try:
            with open(configFilePath, "r") as inputFile:
                TreetaggerSavedPath = inputFile.read()
            if self.checkTreetaggerPath(TreetaggerSavedPath):
                return TreetaggerSavedPath
            os.remove(configFilePath)
        except IOError:
            pass
        return None

    def validateTreetagger(self):
        """Respond to user actions needed to validate Treetagger path"""
        # If the Treetagger path is known, make sure there are language
        # files...
        if self.TreetaggerPath:
            if self.getAvailableLanguages():
                self.sendButton.settingsChanged()
                self.updateGUI()
            else:
                QMessageBox.warning(
                    None,
                    'Textable',
                    'Language parameter files not found.',
                    QMessageBox.Ok
                )
            return

        # Else if the path is not known...

        # First try to locate it automatically...
        TreetaggerPath = treetaggerwrapper.locate_treetagger()

        # If it fails, let the user locate it manually...
        if not (TreetaggerPath and self.checkTreetaggerPath(TreetaggerPath)):
            # Never keep (and save) an automatically located path that
            # failed validation.
            TreetaggerPath = None

            selectedDir = str(
                QFileDialog.getExistingDirectory(
                    self,
                    u"Please locate Treetagger base directory"
                )
            )

            # If user selected a dir... (test the raw dialog result:
            # normpath("") is ".", which would hide a cancelled dialog)
            if selectedDir:
                TreetaggerManualPath = os.path.normpath(selectedDir)

                # Check if selected dir contains Treetagger binary...
                if self.checkTreetaggerPath(TreetaggerManualPath):
                    TreetaggerPath = TreetaggerManualPath
                else:
                    QMessageBox.warning(
                        None,
                        'Textable',
                        'Not a valid Treetagger base directory.',
                        QMessageBox.Ok
                    )

        # If a valid path was found somehow, save config to app data...
        if TreetaggerPath:
            configFilePath = self.__class__.configFilePath
            try:
                os.makedirs(os.path.dirname(configFilePath), exist_ok=True)
                with open(configFilePath, "w") as outputFile:
                    outputFile.write(TreetaggerPath)
            except IOError:
                # Saving is best effort; the path is still used this session.
                pass
            self.TreetaggerPath = TreetaggerPath

            self.sendButton.settingsChanged()

    def checkTreetaggerPath(self, path):
        """Check if path is a valid Treetagger base dir"""
        return os.path.exists(
            os.path.normpath(
                path + "/bin/tree-tagger" +
                (".exe" if os.name == "nt" else "")
            )
        )

    def clearCreatedInputs(self):
        """Release the string data of every Input created by this widget."""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    def onDeleteWidget(self):
        """Free memory when widget is deleted (overriden method)"""
        self.clearCreatedInputs()

    def setCaption(self, title):
        """Set the caption; flag settings as changed if the title changed."""
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
def sendData(self):
    """Tag the input segmentation with Treetagger and send the result.

    Sends None on "Tagged data" when the Treetagger path, the language
    parameter files or the input are missing, or when the tagged text
    cannot be re-segmented as XML.
    """
    # Clear created Inputs...
    self.clearCreatedInputs()

    if not self.TreetaggerPath:
        self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
        self.send("Tagged data", None)
        return
    elif not self.getAvailableLanguages():
        self.infoBox.setText(self.noLanguageParameterWarning, "warning")
        self.send("Tagged data", None)
        return
    elif not self.segmentation:
        self.infoBox.setText(u"Widget needs input", "warning")
        self.send("Tagged data", None)
        return

    # Initialize progress bar.
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    self.progressBar = ProgressBar(self, iterations=5)

    try:
        # Create a copy of input seg, storing annotations in temp attr...
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.segmentation.label
        for segment in self.segmentation:
            attr = " ".join(
                ["%s='%s'" % item for item in segment.annotations.items()]
            )
            segment.annotations["tt_ax"] = attr
            copy_of_input_seg.append(segment)
        self.progressBar.advance()

        # Dump segmentation in unique string to avoid multiple calls to TT...
        concatenated_text = copy_of_input_seg.to_string(
            formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
            display_all=True,
        )
        self.progressBar.advance()

        # Tag the segmentation contents...
        tagopt = '-token -lemma -sgml -quiet'
        if self.replaceUnknown:
            tagopt += " -no-unknown"
        tagger = treetaggerwrapper.TreeTagger(
            TAGLANG=pycountry.languages.get(name=self.language).alpha_2,
            TAGOPT=tagopt,
            TAGDIR=self.TreetaggerPath,
        )
        tagged_lines = tagger.tag_text(
            concatenated_text,
            notagurl=True,
            notagemail=True,
            notagip=True,
            notagdns=True,
        )
        tagged_input = Input("\n".join(tagged_lines))
        self.createdInputs.append(tagged_input)

        # Re-segment to match the original segmentation structure.
        tagged_segmentation = Segmenter.import_xml(tagged_input, "ax_tt")
        self.progressBar.advance()

        # Replace <unknown> with [unknown], wrap each Treetagger output
        # line in a 'w' element, then escape lemmas that are a bare double
        # quote: the wrapping step would otherwise produce the malformed
        # attribute lemma=""".
        xml_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"<unknown>"), "[unknown]"),
                (
                    re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                    '<w lemma="&3" pos-tag="&2">&1</w>'
                ),
                (re.compile(r'"""'), '"&quot;"'),
                (re.compile(r'^\n|\n$'), ''),
            ],
        )

        # Segment into individual tokens if XML output option is disabled...
        if self.outputFormat == "add XML tags":
            output_segmentation = xml_segmentation
        else:
            try:
                output_segmentation = Segmenter.import_xml(
                    xml_segmentation,
                    "w"
                )
            except ValueError:
                self.infoBox.setText(
                    "Please check that either the input contains well-formed "
                    "XML, or it doesn't contain instances of '<' and '\x3e'",
                    "error"
                )
                self.send("Tagged data", None)
                return
    finally:
        # Restore the UI on every exit path, including unexpected errors.
        self.progressBar.finish()
        self.controlArea.setDisabled(False)

    output_segmentation.label = self.captionTitle
    message = u'%i segment@p sent to output.' % len(output_segmentation)
    message = pluralize(message, len(output_segmentation))
    self.infoBox.setText(message)
    self.send('Tagged data', output_segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def searchMovies(self):
    """Search the IMDb movie database for titles matching the query.

    Reads the query from ``self.newQuery`` and keeps at most
    ``self.nbr_results`` results. The results are stored in
    ``self.searchResults`` as a dict keyed by consecutive integers
    starting at 1; each value holds the result under 'name', its
    'id', and its 'year' when IMDb provides one. ``self.titleLabels``
    is rebuilt to display them.
    """
    query_string = self.newQuery
    if query_string == "":
        self.infoBox.setText("Please enter a movie title", "warning")
        return

    counter_max = int(self.nbr_results)
    result_list = {}

    self.controlArea.setDisabled(True)

    # Initialize progress bar
    progressBar = ProgressBar(self, iterations=counter_max)

    try:
        ia = imdb.IMDb()

        # searching the movie
        search = ia.search_movie(query_string)

        # Each result is stored in a dictionnary with its title
        # and year of publication if it is specified
        for result_id, result in enumerate(search, start=1):
            if result_id > counter_max:
                break
            # Ids stay consecutive and every entry keeps its movie id,
            # whether or not a year is available.
            entry = {'name': result, 'id': result.movieID}
            try:
                entry['year'] = result['year']
            except KeyError:
                pass
            result_list[result_id] = entry

            # 1 tick on the progress bar of the widget
            progressBar.advance()

        # Stored the results list in the "result_list" variable
        self.searchResults = result_list

        # Reset and clear the visible widget list
        del self.titleLabels[:]

        # Update the results list with the search results
        # in order to display them
        for entry in result_list.values():
            if 'year' in entry:
                result_string = f'{entry["name"]} - {entry["year"]}'
            else:
                result_string = f'{entry["name"]}'
            self.titleLabels.append(result_string)

        # Self-assignment is kept from the original code.
        # NOTE(review): presumably this notifies the GUI of the change.
        self.titleLabels = self.titleLabels
        self.clearButton.setDisabled(False)
        self.addButton.setDisabled(False)
    finally:
        # Clear progress bar and unlock the UI even if the search failed.
        progressBar.finish()
        self.controlArea.setDisabled(False)
def refreshDatabaseCache(self):
    """Refresh the database cache

    Re-scrapes the CHILDES website into ``self.database`` and pickles it
    next to this module. If the folder of previously saved files exists,
    the user is first asked whether to keep it; it is removed after
    scraping when the answer is "No".
    """
    cacheFoldername = self.__class__.cachedFoldername

    # Ask about previously saved files only if there is such a folder.
    dialog = None
    response = None
    if os.path.exists(cacheFoldername):
        dialog = AnyQt.QtGui.QMessageBox()
        response = dialog.question(
            self,
            "CHILDES",
            "Keep previously saved files?",
            dialog.Yes | dialog.No
        )

    self.infoBox.setText(
        "Connecting to CHILDES website, please wait...",
        "warning",
    )
    progressBar = ProgressBar(self, iterations=1)
    self.controlArea.setDisabled(True)

    try:
        # Scrape website...
        self.database = dict()
        self.importedCorpora = list()
        try:
            self.recursivelyScrapeUrl(
                self.__class__.baseUrl,
                self.database,
            )

            # Dump cache to file...
            path = os.path.dirname(
                os.path.abspath(inspect.getfile(inspect.currentframe()))
            )
            try:
                cachePath = os.path.join(path, self.__class__.cacheFilename)
                with open(cachePath, "wb") as cacheFile:
                    pickle.dump(self.database, cacheFile)
            except IOError:
                self.infoBox.setText(
                    "Couldn't save database to disk.",
                    "warning",
                )
            self.sendButton.settingsChanged()
        except requests.exceptions.ConnectionError:
            self.infoBox.setText(
                "Error while attempting to scrape the CHILDES website.",
                "error",
            )
            self.send("Files", None, self)
            self.send("Utterances", None, self)

        # Remove saved files if required...
        if dialog is not None and response == dialog.No:
            shutil.rmtree(cacheFoldername)

        progressBar.advance()

        self.currentFolder = self.__class__.baseUrl
        self.updateDisplayedFolders()
        self.updateSelection()
    finally:
        # Unlock the UI whatever happened above.
        progressBar.finish()
        self.controlArea.setDisabled(False)
def sendData(self):
    """Compute result of widget processing and send to output

    Downloads the IMDb reviews of every movie in ``self.myBasket``,
    creates one Input per review and sends them on the 'Segmentation'
    channel. Each output segment is annotated with a copy of the review
    dict it was built from.
    """
    # Skip if title list is empty:
    if self.myBasket == list():
        self.infoBox.setText(
            "Your corpus is empty, please add some movies first",
            "warning"
        )
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    # Attempt to connect to IMDb and retrieve reviews...
    list_review = list()
    annotations = list()
    try:
        ia = imdb.IMDb()
        for item in self.myBasket:
            list_review.append(ia.get_movie_reviews(item['id']))
            # 1 tick on the progress bar of the widget
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from imdb", "error")
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store movie critics strings in input objects...
    for movie in list_review:
        # A movie without reviews has no (or an empty) 'reviews' entry.
        reviews_data = (movie.get('data') or {}).get('reviews') or []
        for review in reviews_data:
            newInput = Input(review.get('content'))
            self.createdInputs.append(newInput)
            annotations.append(review.copy())

    # Nothing to output: none of the selected movies has a review.
    if not self.createdInputs:
        self.infoBox.setText(
            "The movie has no associated reviews",
            "warning"
        )
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # If there's only one item, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = f"{len(self.segmentation)} segment@p sent to output "
    message = pluralize(message, len(self.segmentation))
    numChars = 0
    for segment in self.segmentation:
        segmentLength = len(Segmentation.get_data(segment.str_index))
        numChars += segmentLength
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send('Segmentation', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    Runs the spaCy pipeline on every input segment and sends the token
    segmentation, plus the noun chunk, named entity and sentence
    segmentations when the corresponding options are enabled.
    """
    # Check that there's a model...
    if not self.model:
        self.infoBox.setText(
            "Please download a language model first.",
            "warning",
        )
        self.tabs.setCurrentIndex(1)
        return

    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input.", "warning")
        for channel in [c.name for c in self.outputs]:
            self.send(channel, None, self)
        return

    # Check max length...
    inputLength = sum(len(s.get_content()) for s in self.inputSeg)
    maxNumChar = None       # None means "no limit set by user".
    if self.maxLen != "no limit":
        maxNumChar = int(self.maxLen.split()[0]) * 1000000
        if inputLength > maxNumChar:
            self.infoBox.setText(
                "Input exceeds max number of characters set by user.",
                "warning",
            )
            for channel in [c.name for c in self.outputs]:
                self.send(channel, None, self)
            return

    # Load components if needed...
    disabled, enabled = self.getComponentStatus()
    if self.mustLoad or not (
        self.nlp and set(enabled) <= set(self.loadedComponents)
    ):
        self.loadModel()

    # Without a user limit, only ever raise the model's own limit so that
    # the whole input fits.
    if maxNumChar is None:
        maxNumChar = max(inputLength, self.nlp.max_length)
    self.nlp.max_length = maxNumChar

    # Initialize progress bar.
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.inputSeg))

    tokenSegments = list()
    entitySegments = list()
    chunkSegments = list()
    sentenceSegments = list()

    try:
        # Only loaded components can be disabled; compute the list once.
        loaded = set(self.loadedComponents)
        disabled, _ = self.getComponentStatus()
        disabled = [c for c in disabled if c in loaded]

        # Process each input segment...
        for segment in self.inputSeg:

            # NLP analysis...
            with self.nlp.disable_pipes(*disabled):
                doc = self.nlp(segment.get_content())

            # Get token segments...
            tokenSegments.extend(spacyItemsToSegments(doc, segment))

            # Get named entity segments...
            if self.segmentEntities:
                entitySegments.extend(
                    spacyItemsToSegments(doc.ents, segment)
                )

            # Get noun chunk segments...
            if self.segmentChunks:
                chunkSegments.extend(
                    spacyItemsToSegments(doc.noun_chunks, segment),
                )

            # Get sentences segments...
            if self.segmentSentences:
                sentenceSegments.extend(
                    spacyItemsToSegments(doc.sents, segment),
                )

            progressBar.advance()

        # Build segmentations and send them to output...
        tokenSeg = Segmentation(tokenSegments, self.captionTitle + "_tokens")
        self.send("Tokenized text", tokenSeg, self)
        if self.segmentChunks:
            chunkSeg = Segmentation(
                chunkSegments,
                self.captionTitle + "_chunks",
            )
            self.send("Noun chunks", chunkSeg, self)
        if self.segmentEntities:
            entitySeg = Segmentation(
                entitySegments,
                self.captionTitle + "_entities",
            )
            self.send("Named entities", entitySeg, self)
        if self.segmentSentences:
            sentenceSeg = Segmentation(
                sentenceSegments,
                self.captionTitle + "_sentences",
            )
            self.send("Sentences", sentenceSeg, self)

        # Set status to OK and report data size...
        message = "%i token@p" % len(tokenSeg)
        message = pluralize(message, len(tokenSeg))
        if self.segmentChunks:
            message += ", %i chunk@p" % len(chunkSeg)
            message = pluralize(message, len(chunkSeg))
        if self.segmentEntities:
            message += ", %i " % len(entitySeg)
            message += "entity" if len(entitySeg) == 1 else "entities"
        if self.segmentSentences:
            message += ", %i sentence@p" % len(sentenceSeg)
            message = pluralize(message, len(sentenceSeg))
        message += " sent to output."
        # Turn the last comma of the enumeration into " and".
        last_comma_idx = message.rfind(",")
        if last_comma_idx > -1:
            message = message[:last_comma_idx] + " and" + \
                message[last_comma_idx+1:]
        self.infoBox.setText(message)
    finally:
        # Clear progress bar and unlock the UI, even on error.
        progressBar.finish()
        self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output

    Downloads the XML-TEI file of every selected title, wraps each one in
    an Input annotated with a copy of the title's annotations, and sends
    the result on the "XML-TEI data" channel. None is sent when nothing
    is selected or when a download fails.
    """
    # Skip if title list is empty:
    if self.titleLabels == list():
        return

    # Check that something has been selected...
    if len(self.selectedTitles) == 0:
        self.infoBox.setText("Please select one or more titles.", "warning")
        self.send("XML-TEI data", None, self)
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    # Initialize progress bar.
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.selectedTitles))

    # Attempt to connect to ECP and retrieve plays...
    xml_contents = list()
    annotations = list()
    try:
        for title in self.selectedTitles:
            source_annotations = \
                self.filteredTitleSeg[title].annotations.copy()
            doc_url = self.document_base_url + source_annotations["url"]
            # The XML file lives in a folder named after the page:
            # ".../name.shtml" -> ".../name/name.xml".
            url = re.sub(r"/([^/]+)\.shtml", r"/\1/\1.xml", doc_url)
            with urllib.request.urlopen(url) as response:
                xml_contents.append(response.read().decode('utf-8'))
            annotations.append(source_annotations)
            progressBar.advance()   # 1 tick on the progress bar...

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:

        # Set Info box and widget to "error" state.
        self.infoBox.setText(
            "Couldn't download data from ECP website.",
            "error"
        )

        # Reset output channel.
        self.send("XML-TEI data", None, self)
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store downloaded XML in input objects...
    for xml_content in xml_contents:
        newInput = Input(xml_content, self.captionTitle)
        self.createdInputs.append(newInput)

    # If there's only one play, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Store imported URLs as setting.
    # NOTE(review): only the first selected title is recorded here;
    # confirm this is intended when several titles are selected.
    self.importedURLs = [
        self.filteredTitleSeg[self.selectedTitles[0]].annotations["url"]
    ]

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = 0
    for segment in self.segmentation:
        segmentLength = len(Segmentation.get_data(segment.str_index))
        numChars += segmentLength
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Send token...
    self.send("XML-TEI data", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Load files, create and send segmentation

    Each file is read as raw text, as a PDF or through OCR depending on
    its detected type. One Input is created per file and the result is
    sent on the 'Text data' channel. None is sent when no file is
    selected, when the auto-numbering key is missing, or when a file
    can't be read.
    """
    # Check that there's something on input...
    if (
        (self.displayAdvancedSettings and not self.files) or
        not (self.file or self.displayAdvancedSettings)
    ):
        self.infoBox.setText(u'Please select input file.', 'warning')
        self.send('Text data', None, self)
        return

    # Check that autoNumberKey is not empty (if necessary)...
    if (
        self.displayAdvancedSettings and self.autoNumber
        and not self.autoNumberKey
    ):
        self.infoBox.setText(
            u'Please enter an annotation key for auto-numbering.',
            'warning'
        )
        self.send('Text data', None, self)
        return

    # Clear created Inputs...
    self.clearCreatedInputs()

    fileContents = list()
    annotations = list()
    counter = 1

    if self.displayAdvancedSettings:
        myFiles = self.files
    else:
        myFiles = [[
            self.file,
            self.encoding,
            u'', u'', u'',
            self.ocrLanguages,
            self.ocrForce,
        ]]

    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(myFiles))

    # Open and process each file successively...
    for myFile in myFiles:
        filePath = myFile[0]
        # Drop the part of the encoding label starting at " (".
        encoding = re.sub(r"[ ]\(.+", "", myFile[1])
        annotation_key = myFile[2]
        annotation_value = myFile[3]
        ocr_force = myFile[6]                       # SuperTextFiles
        myFiletype = filetype.guess(filePath)       # SuperTextFiles

        # Try to open the file...
        self.error()
        # Start SuperTextFiles
        try:
            if myFiletype is None:
                fileContent = self.extract_raw_text(filePath, encoding)
            elif myFiletype.extension == "pdf":
                if (
                    ocr_force is not True
                    and self.is_textual_pdf_file(filePath) is True
                ):
                    fileContent = self.extract_text_from_pdf(filePath)
                else:
                    fileContent = self.get_pdf_content(filePath)
            elif myFiletype.extension in IMG_FILETYPES:
                fileContent = self.ocrize(filePath)
            else:
                # Neither text, PDF nor image: report it like any other
                # unreadable file instead of using unset or stale content.
                raise IOError(
                    "Unsupported file type: %s" % myFiletype.extension
                )
        # End SuperTextFiles
        except IOError:
            progressBar.finish()
            if len(myFiles) > 1:
                message = u"Couldn't open file '%s'." % filePath
            else:
                message = u"Couldn't open file."
            self.infoBox.setText(message, 'error')
            self.send('Text data', None, self)
            self.controlArea.setDisabled(False)
            return

        # Remove utf-8 BOM if necessary...
        if encoding == u'utf-8':
            fileContent = fileContent.lstrip(
                codecs.BOM_UTF8.decode('utf-8')
            )

        # Normalize text (canonical decomposition then composition)...
        fileContent = normalize('NFC', fileContent)

        fileContents.append(fileContent)

        # Annotations...
        annotation = dict()
        if self.displayAdvancedSettings:
            if annotation_key and annotation_value:
                annotation[annotation_key] = annotation_value
            if self.importFilenames and self.importFilenamesKey:
                filename = os.path.basename(filePath)
                annotation[self.importFilenamesKey] = filename
            if self.autoNumber and self.autoNumberKey:
                annotation[self.autoNumberKey] = counter
                counter += 1
        annotations.append(annotation)
        progressBar.advance()

    # Create an LTTL.Input for each file...
    if len(fileContents) == 1:
        label = self.captionTitle
    else:
        label = None
    for fileContent, annotation in zip(fileContents, annotations):
        myInput = Input(fileContent, label)
        segment = myInput[0]
        segment.annotations.update(annotation)
        myInput[0] = segment
        self.createdInputs.append(myInput)

    # If there's only one file, the widget's output is the created Input.
    if len(fileContents) == 1:
        self.segmentation = self.createdInputs[0]
    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            segmentations=self.createdInputs,
            label=self.captionTitle,
            copy_annotations=True,
            import_labels_as=None,
            sort=False,
            auto_number_as=None,
            merge_duplicates=False,
            progress_callback=None,
        )

    message = u'%i segment@p sent to output ' % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = 0
    for segment in self.segmentation:
        segmentLength = len(Segmentation.get_data(segment.str_index))
        numChars += segmentLength
    message += u'(%i character@p).' % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.send('Text data', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output

    Downloads every book in ``self.myBasket`` from Gutenberg, creates one
    Input per book annotated with its title, author and language, and
    sends the result on the "Gutenberg importation" channel.
    """
    # Skip if title list is empty:
    if self.myBasket == list():
        self.infoBox.setText(
            "Your corpus is empty, please add some books first",
            "warning"
        )
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(
        self,
        iterations=len(self.myBasket),
    )

    text_content = list()
    annotations = list()

    try:
        # Retrieve selected texts from gutenberg
        for text in self.myBasket:
            gutenberg_id = text[2]

            # Get the text with Gutenbergpy
            raw_text = gutenbergpy.textget.get_text_by_id(gutenberg_id)
            gutenberg_text = gutenbergpy.textget.strip_headers(
                raw_text
            ).decode("utf-8")

            text_content.append(gutenberg_text)
            # populate the annotation list (title, author, language)
            annotations.append([text[0], text[1], text[3]])
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception as exc:
        # Set Info box and widget to "error" state.
        self.infoBox.setText(
            "Couldn't download data from Gutenberg",
            "error"
        )
        # Don't leave the progress bar running on failure.
        progressBar.finish()
        self.controlArea.setDisabled(False)
        print(exc)
        return

    # Store downloaded text strings in input objects...
    for text in text_content:
        newInput = Input(text, self.captionTitle)
        self.createdInputs.append(newInput)

    # If there's only one text, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation.
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments with book metadata
    for idx, segment in enumerate(self.segmentation):
        title, author, language = annotations[idx]
        segment.annotations.update({
            "title": title,
            "author": author,
            "language": language,
        })
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = 0
    for segment in self.segmentation:
        segmentLength = len(Segmentation.get_data(segment.str_index))
        numChars += segmentLength
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Gutenberg importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def get_large_audio_transcription(
    self,
    path,
    language,
    set_silence_len=500,
    set_silence_threshold=14,
):
    """Transcribe a large audio file by splitting it on silences.

    The audio is cut into chunks with ``split_on_silence``; each chunk is
    exported as a WAV file in a temporary directory and passed to
    ``Recognizer.recognize_google``. Chunks that raise
    ``sr.UnknownValueError`` are reported on stdout and skipped.

    Args:
        path: Path of the audio file. If ``self.detect_format`` reports
            "mp3", the file is first converted with ``self.to_wav``.
        language: Accepted for interface compatibility but not read here;
            the recognition language is looked up from ``self.language``
            in ``AudioFile.dict_languages``.
            NOTE(review): confirm whether callers expect this argument
            to be honoured.
        set_silence_len: Value passed as ``min_silence_len`` (minimum
            silence length used as a split point).
        set_silence_threshold: Subtracted from ``sound.dBFS`` to obtain
            the ``silence_thresh`` passed to ``split_on_silence``.

    Returns:
        ``None`` when ``self.file`` contains neither "wav" nor "mp3".
        Otherwise, a list with one string per recognized chunk if
        ``self.selected_seg`` is true, else a single string made of the
        concatenated chunk texts.
    """
    # Only WAV and MP3 sources are handled. The two checks must be joined
    # with `and`: with `or`, any name lacking either substring (i.e. every
    # ordinary file name) was rejected and nothing was ever transcribed.
    if 'wav' not in self.file and 'mp3' not in self.file:
        return

    # Create a temporary folder to handle the chunks; it is deleted when
    # the `with` block exits.
    with tempfile.TemporaryDirectory() as tempDict:
        # Initialize the recognizer.
        r = sr.Recognizer()

        # Check type of the audio file and change it to wav if mp3.
        audio_type = self.detect_format(path)
        if audio_type == "mp3":
            path = self.to_wav(path, tempDict)

        # Open the audio file using pydub.
        sound = AudioSegment.from_wav(path)

        # Split the sound wherever a silence of at least `set_silence_len`
        # (500 by default) is found.
        chunks = split_on_silence(
            sound,
            # Experiment with this value for your target audio file.
            min_silence_len=set_silence_len,
            # Adjust this per requirement.
            silence_thresh=sound.dBFS - set_silence_threshold,
            # Amount of silence kept around each chunk, adjustable as well.
            keep_silence=500,
        )

        # One formatted string per successfully recognized chunk.
        transcripts = list()

        # Initiate progress bar (one step per chunk).
        progressBar = ProgressBar(self, iterations=len(chunks))
        try:
            # Process each chunk.
            for i, audio_chunk in enumerate(chunks, start=1):
                # Export audio chunk and save it in the temporary folder.
                chunk_filename = os.path.join(tempDict, f"chunk{i}.wav")
                audio_chunk.export(chunk_filename, format="wav")

                # Load the chunk for recognition.
                with sr.AudioFile(chunk_filename) as source:
                    audio_listened = r.record(source)

                # Try converting it to text.
                try:
                    # Get the value of the chosen language in the
                    # dictionary.
                    text = r.recognize_google(
                        audio_listened,
                        language=AudioFile.dict_languages[self.language],
                    )
                except sr.UnknownValueError as e:
                    print("Error : ", str(e))
                else:
                    sentence = f"{text.capitalize()}. "
                    print(chunk_filename, " : ", sentence)
                    transcripts.append(sentence)

                self.infoBox.setText(
                    u"Processing, please wait...",
                    "warning",
                )
                progressBar.advance()
        finally:
            # Close the progress bar even if a chunk fails with an error
            # other than UnknownValueError.
            progressBar.finish()

    # Return one entry per chunk, or the whole transcription as one string.
    if self.selected_seg:
        return transcripts
    return "".join(transcripts)