def downloadModels(self):
    """Respond to Download button (Model manager tab).

    Asks the user for confirmation, downloads every selected language
    model, removes the downloaded entries from the list of downloadable
    models and finally invites the user to restart Orange.
    """
    # Ask for confirmation...
    num_models = len(self.selectedModels)
    message = (
        "You are about to download %i language model@p. "
        "This may take up to several minutes depending on your "
        "internet connection. Do you want to proceed?"
    ) % num_models
    buttonReply = QMessageBox.question(
        self,
        "Textable",
        pluralize(message, num_models),
        QMessageBox.Ok | QMessageBox.Cancel,
    )
    if buttonReply == QMessageBox.Cancel:
        return

    # Download models...
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=num_models)
    try:
        # Labels are deleted by index, so the highest index must go first
        # to keep the remaining indices valid whatever the selection order.
        for model_idx in sorted(self.selectedModels, reverse=True):
            model = self.downloadableModelLabels[model_idx]
            download_spacy_model(AVAILABLE_MODELS[model])
            del self.downloadableModelLabels[model_idx]
            progressBar.advance()
    finally:
        # Update GUI, also when a download failed: labels that were already
        # removed shifted the indices, so the selection is reset anyway.
        self.downloadableModelLabels = self.downloadableModelLabels
        self.selectedModels = list()
        progressBar.finish()
        self.controlArea.setDisabled(False)

    message = (
        "Downloaded %i language model@p, please restart "
        "Orange for changes to take effect."
    ) % num_models
    QMessageBox.information(
        None,
        "Textable",
        pluralize(message, num_models),
        QMessageBox.Ok,
    )
def create_content_segment(self, post, includeImage=False):
    """Queue the content and annotations of a post.

    The text of the post (or the placeholder "[image]" when the text is
    empty) is appended to ``self.listeTempPosts`` and its annotations to
    ``self.listeTempAnnot``. Placeholder-only posts are skipped when
    ``includeImage`` is False.
    """
    progressBar = ProgressBar(self, iterations=1)
    self.infoBox.setText("Processing post content, please wait.", "warning")

    # Identity annotations (a post is recorded as its own parent).
    annotations = {
        "Title": post.title,
        "Id": post.id,
        "Parent": post.id,
        "Author": post.author,
        "Score": post.score,
        "Parent_type": "0",
    }

    # Time annotations: raw timestamp plus a readable UTC date.
    posted_unix = post.created_utc
    posted_at = datetime.utcfromtimestamp(int(posted_unix))
    annotations["Posted_Unix"] = posted_unix
    annotations["Posted_at"] = posted_at.strftime('%Y-%m-%d %H:%M:%S')

    body = post.selftext
    content = "[image]" if body == "" else body

    # Skip image-only posts unless the user asked for them.
    if includeImage == False and content == "[image]":
        progressBar.finish()
        return

    progressBar.advance()
    self.listeTempPosts.append(content)
    self.listeTempAnnot.append(annotations)
    progressBar.finish()
def get_all_titles(self):
    """Download the title -> link mapping of all scripts of the website.

    ``php_query_string`` and ``http_query_string`` are the variables that
    will need to be changed if a different database is used or if the
    current database's structure undergoes changes.

    Every index page of every initial is fetched until a page without
    script links is met. If anything goes wrong an error is displayed in
    the info box and the titles collected so far are kept.

    Returns:
        ``self.title_to_href``, updated in place.
    """
    php_query_string = '/movie_script.php?movie='
    http_query_string = 'https://www.springfieldspringfield.co.uk/' + \
        'movie_scripts.php?order='
    # One index per initial ('0' plus every letter from A to Z).
    alphabet = list('0ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # script_link_class may need to be changed if another database is used
    # or if the current database undergoes change.
    script_link_class = re.compile("^script-list-item")

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=len(alphabet))
    self.controlArea.setDisabled(True)

    try:
        for lettre in alphabet:
            # 1 tick on the progress bar of the widget
            progressBar.advance()

            # Walk through the pages of this initial until an empty one.
            page_num = 1
            while True:
                page_url = http_query_string + '%s&page=%i' % (
                    lettre,
                    page_num,
                )
                with urllib.request.urlopen(page_url) as page:
                    soup = BeautifulSoup(page, 'html.parser')

                script_links = soup.findAll(
                    'a', attrs={'class': script_link_class}
                )
                if not script_links:
                    break

                # Only the part of the link after php_query_string is kept.
                links = {
                    link.text: link.get('href')[len(php_query_string):]
                    for link in script_links
                }
                self.title_to_href.update(links)
                page_num += 1
    except Exception:
        # Best effort: report the failure and keep what was collected.
        self.infoBox.setText(
            "Couldn't download data from springfieldspringfield website.",
            "error"
        )
    finally:
        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

    return self.title_to_href
def sendData(self):
    """Compute result of widget processing and send to output.

    Builds a segmentation from ``self.csvSeg`` and sends it on the
    "CSV Segmentation" channel, reporting in the info box how many
    segments were sent and how many were ignored for lack of content.
    ``None`` is sent when the widget has no input.
    """
    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        del self.headerList[:]
        self.headerList = self.headerList
        self.send("CSV Segmentation", None, self)
        return

    # Initialize progress bar (one tick per segment to output).
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.csvSeg))

    try:
        for _ in self.csvSeg:
            progressBar.advance()

        # Set status to OK and report data size...
        outputSeg = Segmentation(self.csvSeg, label=self.captionTitle)
        message = "%i segment@p sent to output." % len(outputSeg)
        message = pluralize(message, len(outputSeg))

        # Mention segments that had no content and have been ignored.
        num_ignored = len(self.contentIsNone)
        if num_ignored:
            message += " (ignored %i segment%s with no content)" % (
                num_ignored,
                "" if num_ignored == 1 else "s",
            )
        self.infoBox.setText(message)
    finally:
        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

    # Send data to output...
    self.send("CSV Segmentation", outputSeg, self)
    self.sendButton.resetSettingsChangedFlag()
def searchFunction(self):
    """Search the known titles for the user's query and display the hits.

    The visible title list is always cleared first. With a non-empty
    query, the matches returned by ``process.extractBests`` are stored in
    ``self.searchResults`` and listed in the GUI; otherwise a warning
    asks the user to type a query.
    """
    self.controlArea.setDisabled(True)
    try:
        # Search from the springfieldspringfield.co.uk
        query_string = self.newQuery

        # Reset and clear the visible widget list
        del self.titleLabels[:]
        self.titleLabels = self.titleLabels
        del self.movie_titles[:]
        self.movie_titles = self.movie_titles

        if query_string == "":
            self.infoBox.setText(
                "Please, enter a query in a search bar",
                "warning"
            )
            return

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=1)
        try:
            self.searchResults = process.extractBests(
                query_string,
                self.title_to_href,
                limit=100000,
                score_cutoff=80
            )
        finally:
            progressBar.finish()

        progressBar = ProgressBar(
            self, iterations=len(self.searchResults)
        )
        try:
            # NOTE(review): each hit is presumably (matched value, score,
            # dictionary key), i.e. (link, score, title) -- confirm against
            # the fuzzy matching library in use.
            for link, _score, title in self.searchResults:
                self.titleLabels.append(title)
                self.movie_titles.append(title)
                self.path_storage[title] = link
                # 1 tick on the progress bar of the widget
                progressBar.advance()

            self.titleLabels = self.titleLabels
            self.clearButton.setDisabled(False)
        finally:
            # Clear progress bar.
            progressBar.finish()

        if self.searchResults:
            self.infoBox.setText("Search complete")
        elif self.searchResults == []:
            self.infoBox.setText("No result please try again", 'warning')
    finally:
        # Never leave the widget disabled, even if the search failed.
        self.controlArea.setDisabled(False)
def sendData(self):
    """Compute result of widget processing and send to output.

    Each input segment is analyzed with ``self.nlp`` and turned into a
    new input whose content has one "token<TAB>part-of-speech" line per
    token. The concatenation of these inputs is sent on the
    "Linguistically analyzed data" channel (``None`` without input).
    """
    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        self.send("Linguistically analyzed data", None, self)
        return

    # Initialize progress bar.
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.inputSeg))

    try:
        # Basic NLP analysis for dev purposes...
        analyzedSegments = list()
        for segment in self.inputSeg:
            doc = self.nlp(segment.get_content())
            analyzedString = "".join(
                "%s\t%s\n" % (token.text, token.pos_) for token in doc
            )
            analyzedSegments.append(Input(analyzedString))
            progressBar.advance()

        outputSeg = LTTL.Segmenter.concatenate(
            analyzedSegments,
            import_labels_as=None,
            label=self.captionTitle,
        )

        # Set status to OK and report data size...
        message = "%i segment@p sent to output." % len(outputSeg)
        message = pluralize(message, len(outputSeg))
        self.infoBox.setText(message)
    finally:
        # Clear progress bar and re-enable the GUI even after a failure.
        progressBar.finish()
        self.controlArea.setDisabled(False)

    # Send data to output...
    self.send("Linguistically analyzed data", outputSeg, self)
    self.sendButton.resetSettingsChangedFlag()
def loadModel(self):
    """(Re-)load language model if needed.

    Loads the spaCy model into ``self.nlp`` and clears ``self.mustLoad``.
    The GUI is disabled while loading and always re-enabled afterwards,
    even if loading fails (the exception is still propagated).
    """
    # Initialize progress bar.
    self.infoBox.setText(
        u"Loading language model, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=1)
    try:
        # NOTE(review): the model name is hard-coded rather than derived
        # from self.model -- confirm this is still intended.
        self.nlp = spacy.load("en_core_web_sm")
        self.mustLoad = False
        progressBar.advance()
    finally:
        progressBar.finish()
        self.controlArea.setDisabled(False)
def loadModel(self):
    """(Re-)load language model if needed.

    Loads the model selected in ``self.model`` with the components
    reported as disabled by ``getComponentStatus()`` switched off, records
    the enabled components in ``self.loadedComponents`` and clears
    ``self.mustLoad``. The GUI is disabled while loading and always
    re-enabled afterwards, even if loading fails (the exception is still
    propagated).
    """
    # Initialize progress bar.
    self.infoBox.setText(
        u"Loading language model, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=1)
    try:
        disabled, enabled = self.getComponentStatus()
        self.nlp = spacy.load(
            AVAILABLE_MODELS[self.model],
            disable=disabled,
        )
        self.loadedComponents = enabled
        self.updateReloadNeededLabels()
        # Only flag the model as loaded once everything above succeeded.
        self.mustLoad = False
        progressBar.advance()
    finally:
        progressBar.finish()
        self.controlArea.setDisabled(False)
def sendData(self):
    """Download the selected corpus archive and report on the import.

    ``None`` is sent on the "XML data" channel when no corpus is selected
    or when the archive cannot be downloaded or opened.

    NOTE(review): on success the archive members are only listed on
    standard output; nothing is sent on "XML data" yet -- confirm whether
    this is work in progress.
    """
    if not self.importedCorpus:
        self.infoBox.setText("Please select a corpus to import.", "warning")
        self.send("XML data", None, self)
        return

    try:
        response = requests.get(self.importedCorpus)
        # Do not try to unzip an HTTP error page.
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content)) as myZip:
            members = myZip.infolist()
            # One tick per archive member.
            progressBar = ProgressBar(self, iterations=len(members))
            try:
                for member in members:
                    print(member.filename, len(myZip.read(member)))
                    progressBar.advance()
            finally:
                progressBar.finish()
    except (requests.RequestException, zipfile.BadZipFile):
        self.infoBox.setText(
            "Couldn't download the selected corpus.",
            "error",
        )
        self.send("XML data", None, self)
        return

    corpus = self.displayedFolderLabels[self.selectedItems[0]]
    self.importedCorpusLabel.setText(
        "Corpus %s correctly imported." % corpus
    )
    self.infoBox.setText("All good!")
    self.sendButton.resetSettingsChangedFlag()
def loadModel(self):
    """(Re-)load language model if needed.

    Loads the model selected in ``self.model`` into ``self.nlp``, then
    refreshes the character list when there is an input and flags the
    settings as changed. The GUI is always re-enabled, even if loading
    fails (the exception is still propagated).
    """
    # Display warning, disable UI and initialize progress bar...
    self.infoBox.setText(
        u"Loading language model, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=1)

    # Load model and reset UI.
    try:
        self.nlp = spacy.load(spacy_widget.AVAILABLE_MODELS[self.model])
        progressBar.advance()
    finally:
        progressBar.finish()
        self.controlArea.setDisabled(False)

    # Update char list if there's an input...
    if self.inputSeg:
        self.updateCharacterList()

    self.sendButton.settingsChanged()
def languageChanged(self):
    """Load the appropriate model according to user choice.

    The loader method matching ``self.language`` is called and its result
    stored in ``self.cv``. A language without a loader leaves ``self.cv``
    untouched. The GUI is always re-enabled, even if loading fails (the
    exception is still propagated).
    """
    # Name of the loader method for each supported language; the method
    # is looked up on the widget only when it is actually needed.
    loader_names = {
        "French": "loadModelFR",
        "English": "loadModelEN",
        "Portuguese": "loadModelPT",
        "Dutch": "loadModelLN",
        "German": "loadModelDE",
        "Greek": "loadModelEL",
        "Italian": "loadModelIT",
        "Lithuanian": "loadModelLT",
        "Norwegian": "loadModelNB",
        "Spanish": "loadModelES",
    }

    self.infoBox.setText(
        u"Loading model, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=1)
    try:
        loader_name = loader_names.get(self.language)
        if loader_name is not None:
            self.cv = getattr(self, loader_name)()
        progressBar.advance()
    finally:
        progressBar.finish()
        self.controlArea.setDisabled(False)

    self.sendButton.settingsChanged()
def create_comments_segments(self, post):
    """Queue the body and annotations of every comment of a post.

    For each comment, its body is appended to ``self.listeTempPosts`` and
    a dictionary of annotations to ``self.listeTempAnnot``.
    """
    # Expand the comment tree, then flatten it into a list.
    post.comments.replace_more(limit=None)
    all_comments = post.comments.list()

    progressBar = ProgressBar(self, iterations=len(all_comments))
    self.infoBox.setText(
        "Processing comments, this can take a while, please wait.",
        "warning"
    )

    for comment in all_comments:
        annotations = {
            "Title": post.title,
            "Id": comment.id,
            "Author": comment.author,
            "Score": comment.score,
        }

        # Time annotations: raw timestamp plus a readable UTC date.
        posted_unix = comment.created_utc
        posted_at = datetime.utcfromtimestamp(int(posted_unix))
        annotations["Posted_Unix"] = posted_unix
        annotations["Posted_at"] = posted_at.strftime('%Y-%m-%d %H:%M:%S')

        # parent_id is split on "_": the second part is stored as the
        # parent, the second character of the first part as its type.
        parent_parts = comment.parent_id.split("_")
        annotations["Parent"] = parent_parts[1]
        annotations["Parent_type"] = parent_parts[0][1]

        self.listeTempPosts.append(comment.body)
        self.listeTempAnnot.append(annotations)
        progressBar.advance()

    progressBar.finish()
def updateCharacterList(self):
    """Update character list based on Charnet output.

    The contents of all input segments are joined with spaces and passed
    through the charnet pipeline; the resulting groups of names are
    stored in ``self.char_list`` and, comma-joined, in
    ``self.characters``. The GUI is always re-enabled, even if a step
    fails (the exception is still propagated).
    """
    if self.mustLoad:
        self.loadModel()

    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=4)
    try:
        # Get input string...
        string = " ".join(
            segment.get_content() for segment in self.inputSeg
        )
        progressBar.advance()

        # Extract character tokens...
        self.char_df = charnet.extract_spacy_df(string, self.nlp)  # TODO progress
        progressBar.advance()

        self.char_df = charnet.unify_tags(self.char_df)
        progressBar.advance()

        # Build the character list shown in the GUI.
        self.char_list = charnet.concatenate_parents(
            self.char_df, min_occ=1
        )
        self.characters = [", ".join(char) for char in self.char_list]
        progressBar.advance()
    finally:
        progressBar.finish()
        self.controlArea.setDisabled(False)
def updateCharacterList(self):
    """Update character list based on Charnetto output.

    Does nothing without a model or an input. Otherwise the contents of
    the input segments are passed through the charnetto pipeline; the
    resulting groups of names are stored in ``self.char_list`` and,
    comma-joined, in ``self.characters`` (a copy is cached in
    ``self.cachedCaracters``). The GUI is always re-enabled, even if a
    step fails (the exception is still propagated).
    """
    # Sanity checks...
    if not self.model or not self.inputSeg:
        return

    # Init UI...
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=4)
    try:
        # Get input strings...
        strings = [segment.get_content() for segment in self.inputSeg]
        progressBar.advance()

        # Extract character tokens...
        # TODO deal with \n in names
        self.char_df = charnetto.extract_spacy_df(strings, self.nlp)
        progressBar.advance()

        # Unify spaCy tags to match those of flair...
        self.char_df = charnetto.unify_tags(self.char_df)
        progressBar.advance()

        # Collapse characters whose name is the prefix of another.
        self.char_list = charnetto.concatenate_parents(
            self.char_df, min_occ=1
        )

        # Build char list.
        self.characters = [", ".join(char) for char in self.char_list]
        progressBar.advance()
    finally:
        # Reset UI.
        progressBar.finish()
        self.controlArea.setDisabled(False)

    # Cache character list for resetting if needed.
    self.cachedCaracters = self.characters[:]
def sendData(self):
    """Send data from website springfieldspringfield.

    Downloads the script of every movie in ``self.myBasket``, stores each
    script in a new input annotated with the title and year of the movie,
    and sends the resulting segmentation on the "Movie transcripts"
    channel. ``None`` is sent when the basket is empty; when a download
    fails an error is displayed and nothing is sent.
    """
    # Skip if title list is empty:
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some movies first",
            "warning"
        )
        self.segmentation = None
        self.send("Movie transcripts", self.segmentation, self)
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    annotations = list()
    script_list = list()
    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    try:
        # This part of code is what fetches the actual script
        try:
            for movie in self.myBasket:
                # Each movie that is in the corpus is split into title
                # and year (rsplit makes sure to only split last
                # occurence) which will become annotations; the last
                # character of the year part (closing bracket) is dropped.
                future_annotation = movie.rsplit('(', 1)
                annotations.append(
                    {
                        "Movie Title": future_annotation[0],
                        "Year of release": future_annotation[-1][:-1],
                    }
                )

                # link_end and page_url are the two variables that will
                # have to be changed in case scripts need to be taken
                # from elsewhere
                link_end = self.path_storage[movie]
                page_url = "https://www.springfieldspringfield.co.uk/" + \
                    "movie_script.php?movie=" + link_end
                with urllib.request.urlopen(page_url) as page:
                    soup = BeautifulSoup(page, 'html.parser')

                # This is what grabs the movie script
                script = soup.find("div", {"class": "movie_script"})
                script_list.append(script.text)

                # 1 tick on the progress bar of the widget
                progressBar.advance()
        except Exception:
            self.infoBox.setText(
                "Couldn't download data from SpringfieldSpringfield website.",
                "error"
            )
            return

        # Store downloaded script strings in input objects...
        for script in script_list:
            newInput = Input(script, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one play, the widget's output is the created
        # Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment
    finally:
        # Clear progress bar (also on the error path).
        progressBar.finish()
        self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Movie transcripts", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    Downloads the reviews and the details of every movie in
    ``self.myBasket``, stores each review in a new input annotated with
    the movie it belongs to and its year, and sends the resulting
    segmentation on the "Segmentation" channel. When a download fails an
    error is displayed and nothing is sent.
    """
    # Skip if title list is empty:
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some movies first",
            "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()
    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    try:
        # Connect to imdb and add elements in lists
        list_review = list()
        list_annotation = list()
        try:
            for item in self.myBasket:
                list_review.append(self.ia.get_movie_reviews(item['id']))
                list_annotation.append(self.ia.get_movie(item['id']))
                # 1 tick on the progress bar of the widget
                progressBar.advance()
        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText(
                "Couldn't download data from imdb", "error"
            )
            return

        # Store movie critics strings in input objects, keeping exactly
        # one annotation dict per review so that annotations stay aligned
        # with the created inputs whatever the number of reviews.
        annotations = list()
        for movie, details in zip(list_review, list_annotation):
            movie_annotations = {"title": details, "year": details["year"]}
            reviews_data = movie.get('data', {}).get('reviews') or []
            for review in reviews_data:
                self.createdInputs.append(Input(review.get('content')))
                annotations.append(movie_annotations)

        # If there's only one item, the widget's output is the created
        # Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment
    finally:
        # Clear progress bar (also on the error path).
        progressBar.finish()
        self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = f"{len(self.segmentation)} segment@p sent to output"
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += " (%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send('Segmentation', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def searchMovies(self):
    """Search from imdb movie database.

    Depending on ``self.type_results`` the query is used as a movie title
    or as the name of an actor/actress. Only results with a year are
    kept; they are optionally shuffled or sorted alphabetically
    (``self.filter_results``) and at most ``self.nbr_results`` of them
    are stored in ``self.searchResults`` (keys 1..n) and displayed.
    """
    query_string = self.newQuery
    if query_string == "":
        self.infoBox.setText(
            "Please type something in the search bar", "warning"
        )
        return

    counter_max = int(self.nbr_results)
    self.controlArea.setDisabled(True)

    # Initialize progress bar
    progressBar = ProgressBar(self, iterations=counter_max)

    try:
        search = list()
        if self.type_results == 'Title':
            # searching the movie
            search = self.ia.search_movie(query_string)
        elif self.type_results == 'Actor':
            people = self.ia.search_person(query_string)
            filmography = dict()
            # No match at all is handled like an invalid name below.
            if people:
                first_search = self.ia.get_person_filmography(
                    people[0].personID
                )
                filmography = first_search['data']['filmography']

            # Checks if the user input is a valid actor/actress
            if 'actor' in filmography:
                search = filmography['actor']
            elif 'actress' in filmography:
                search = filmography['actress']
            else:
                self.infoBox.setText(
                    "Please enter a valid actor or actress name",
                    "warning")

        # Checks if the movie has a year associated and stores it in a list
        filtered_results = [film for film in search if 'year' in film]

        if self.filter_results == 'Random':
            random.shuffle(filtered_results)
        elif self.filter_results == 'Alphabetical':
            # Sort the results themselves: no extra download per movie,
            # and movies sharing a title are all kept.
            filtered_results.sort(key=lambda film: str(film).lower())

        # Each result is stored in a dictionnary with its title
        # and year of publication if it is specified
        result_list = dict()
        for result in filtered_results:
            if len(result_list) >= counter_max:
                break
            try:
                entry = {
                    'name': result,
                    'year': result['year'],
                    'id': result.movieID,
                }
            except KeyError:
                continue
            result_list[len(result_list) + 1] = entry
            # 1 tick on the progress bar of the widget
            progressBar.advance()

        # Stored the results list in the "result_list" variable
        self.searchResults = result_list

        # Reset and clear the visible widget list
        del self.titleLabels[:]

        # Update the results list with the search results
        # in order to display them
        for idx in self.searchResults:
            result_string = f'{self.searchResults[idx]["name"]} - {self.searchResults[idx]["year"]}'
            self.titleLabels.append(result_string)

        self.titleLabels = self.titleLabels
        self.clearButton.setDisabled(False)
        self.addButton.setDisabled(self.selectedTitles == list())
    finally:
        # Clear progress bar and re-enable the GUI even after a failure.
        progressBar.finish()
        self.controlArea.setDisabled(False)
def sendData(self):
    """Compute result of widget processing and send to output.

    Every input segment is analyzed with the loaded spaCy pipeline and
    the tokens (plus, if requested, named entities, noun chunks and
    sentences) are sent as segmentations on their respective channels.
    ``None`` is sent on every channel when there is no input or when the
    input exceeds the maximum length set by the user.
    """
    # Check that there's a model...
    if not self.model:
        self.infoBox.setText(
            "Please download a language model first.",
            "warning",
        )
        self.tabs.setCurrentIndex(1)
        return

    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input.", "warning")
        for channel in [c.name for c in self.outputs]:
            self.send(channel, None, self)
        return

    # Check max length...
    inputLength = sum(len(s.get_content()) for s in self.inputSeg)
    if self.maxLen != "no limit":
        maxNumChar = int(self.maxLen.split()[0]) * 1000000
        if inputLength > maxNumChar:
            self.infoBox.setText(
                "Input exceeds max number of characters set by user.",
                "warning",
            )
            for channel in [c.name for c in self.outputs]:
                self.send(channel, None, self)
            return
    else:
        # Decided below, once the model is guaranteed to be loaded.
        maxNumChar = None

    # Load components if needed...
    disabled, enabled = self.getComponentStatus()
    if self.mustLoad or not (
        self.nlp and set(enabled) <= set(self.loadedComponents)
    ):
        self.loadModel()

    # Without a user limit, only raise the model's limit when the input
    # does not fit in it.
    if maxNumChar is None:
        maxNumChar = max(inputLength, self.nlp.max_length)
    self.nlp.max_length = maxNumChar

    # Initialize progress bar.
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.inputSeg))

    try:
        tokenSegments = list()
        entitySegments = list()
        chunkSegments = list()
        sentenceSegments = list()

        # Components that are loaded but not wanted are switched off
        # during the analysis (same for every segment).
        disabled, _ = self.getComponentStatus()
        loaded = set(self.loadedComponents)
        disabled = [c for c in disabled if c in loaded]

        # Process each input segment...
        for segment in self.inputSeg:

            # NLP analysis...
            with self.nlp.disable_pipes(*disabled):
                doc = self.nlp(segment.get_content())

            # Get token segments...
            tokenSegments.extend(spacyItemsToSegments(doc, segment))

            # Get named entity segments...
            if self.segmentEntities:
                entitySegments.extend(
                    spacyItemsToSegments(doc.ents, segment),
                )

            # Get noun chunk segments...
            if self.segmentChunks:
                chunkSegments.extend(
                    spacyItemsToSegments(doc.noun_chunks, segment),
                )

            # Get sentences segments...
            if self.segmentSentences:
                sentenceSegments.extend(
                    spacyItemsToSegments(doc.sents, segment),
                )

            progressBar.advance()

        # Build segmentations and send them to output...
        tokenSeg = Segmentation(tokenSegments, self.captionTitle + "_tokens")
        self.send("Tokenized text", tokenSeg, self)
        if self.segmentChunks:
            chunkSeg = Segmentation(
                chunkSegments,
                self.captionTitle + "_chunks",
            )
            self.send("Noun chunks", chunkSeg, self)
        if self.segmentEntities:
            entitySeg = Segmentation(
                entitySegments,
                self.captionTitle + "_entities",
            )
            self.send("Named entities", entitySeg, self)
        if self.segmentSentences:
            sentenceSeg = Segmentation(
                sentenceSegments,
                self.captionTitle + "_sentences",
            )
            self.send("Sentences", sentenceSeg, self)

        # Set status to OK and report data size...
        message = "%i token@p" % len(tokenSeg)
        message = pluralize(message, len(tokenSeg))
        if self.segmentChunks:
            message += ", %i chunk@p" % len(chunkSeg)
            message = pluralize(message, len(chunkSeg))
        if self.segmentEntities:
            message += ", %i " % len(entitySeg)
            message += "entity" if len(entitySeg) == 1 else "entities"
        if self.segmentSentences:
            message += ", %i sentence@p" % len(sentenceSeg)
            message = pluralize(message, len(sentenceSeg))
        message += " sent to output."
        # Replace the last comma of the enumeration with "and".
        last_comma_idx = message.rfind(",")
        if last_comma_idx > -1:
            message = message[:last_comma_idx] + " and" + \
                message[last_comma_idx+1:]
        self.infoBox.setText(message)
    finally:
        # Clear progress bar and re-enable the GUI even after a failure.
        progressBar.finish()
        self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()
def treat_input(self):
    """Parse the input segments as CSV and build one segment per row.

    For every input segment the CSV dialect is sniffed, the column names
    are read (or generated when no header is detected), and every data
    row yields a new segment in ``self.csvSeg`` covering the value of the
    content column, with the other non-empty values as annotations. Rows
    whose content is empty are recorded in ``self.contentIsNone`` instead.
    The list of headers shown in the GUI (``self.headerList``) is rebuilt
    on the way.
    """
    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        del self.headerList[:]
        self.headerList = self.headerList
        return

    # Initialize progress bar.
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.inputSeg))

    # clear lists
    del self.csvSeg[:]
    del self.contentIsNone[:]

    # Process each input segment...
    for segment in self.inputSeg:

        # Input segment attributes...
        inputContent = segment.get_content()
        # Double quotes are removed unless deleteQuotes is False.
        if not self.deleteQuotes == False :
            inputContent = inputContent.replace('"',"")
        inputAnnotations = segment.annotations
        inputStrIdx = segment.str_index
        # NOTE(review): inputStart and inputEnd are computed but not used
        # below; positions are counted from the start of inputContent.
        inputStart = segment.start or 0
        inputEnd = segment.end or len(inputContent)
        #Call data processing
        # The dialect is sniffed on the first line only; quoting is then
        # switched off on the sniffed dialect.
        csv_stream = io.StringIO(inputContent)
        dialect = sniffer.sniff(csv_stream.readline())
        dialect.quoting=csv.QUOTE_NONE
        csv_stream.seek(0)
        my_reader = csv.reader(csv_stream, dialect)
        # Offset (in inputContent) of the content of the row being read.
        position = 0
        # Process each seg in inputContent
        # NOTE(review): this iterates over the characters of inputContent
        # and segAnnotations is re-created for every row below, so the
        # loop looks redundant -- confirm before removing.
        for seg in inputContent:
            segAnnotations = inputAnnotations.copy()
        # This will launch if sniffer detects a header in the content.
        if sniffer.has_header(inputContent) == True:
            # go back to the start otherwise we're going to start from the
            # second row
            csv_stream.seek(0)
            # the header row is defined here.
            if self.isRenamed == False :
                self.dict_keys = next(my_reader)
                for key in self.dict_keys:
                    # this is position of first content
                    # TODO : separator length (if not 1)
                    position += (len(key) + 1)
            else :
                # Headers were renamed: the header row is only read to
                # skip it and to account for its length.
                input_keys = next(my_reader)
                for key in input_keys:
                    # this is position of first content
                    # TODO : separator length (if not 1)
                    position += (len(key) + 1)

        # This will launch if sniffer does not detect a header
        # in the content.
        if sniffer.has_header(inputContent) == False:
            # go back to the start otherwise we're going to start from the
            # second row. we do this here even though we don't really care
            # about the first row simply because in general we consider the
            # first row to not have any missing values
            csv_stream.seek(0)
            first_row = next(my_reader)
            n_cols = len(first_row)
            if self.isRenamed == False :
                # Columns are named "1", "2", ... when there is no header.
                self.dict_keys = list()
                for item in range(1, n_cols+1):
                    self.dict_keys.append(str(item))
            # Rewind so that the first row is read again as a data row.
            csv_stream.seek(0)

        # clear the list before appending
        del self.headerList[:]

        for key in self.dict_keys:
            # appends the headers to the gui list
            if self.dict_keys.index(key) == self.content_column:
                self.headerList.append(str(key)+"(*content)")
                self.headerList = self.headerList
            else :
                self.headerList.append(str(key))
                self.headerList = self.headerList

        # idx starts at 2 (row numbering used in contentIsNone).
        for idx, row in enumerate(my_reader, start=2):
            # Get old annotations in new dictionary
            oldAnnotations = inputAnnotations.copy()
            segAnnotations = dict()
            # initiate next row starting position
            next_position = position
            for key in oldAnnotations.keys():
                segAnnotations[key] = oldAnnotations[key]

            # This is the main part where we transform our data into
            # annotations.
            for key in self.dict_keys:
                # segAnnotations["length"] = position
                # segAnnotations["row"] = str(row)

                # if column is content (first column (0) by default)
                if self.dict_keys.index(key) == self.content_column:
                    # put value as content
                    content = row[self.dict_keys.index(key)]
                # else we put value in annotation
                else:
                    # only if value is not None
                    if len(row[self.dict_keys.index(key)]) != 0 :
                        segAnnotations[key] = row[self.dict_keys.index(key)]
                # implement position and next_position depending on
                # content column
                # Columns before the content column shift both the start
                # of the content and the start of the next row; the "+ 1"
                # stands for a one-character separator.
                if self.dict_keys.index(key) < self.content_column:
                    position += len(row[self.dict_keys.index(key)]) + 1
                    next_position += len(row[self.dict_keys.index(key)]) + 1
                if self.dict_keys.index(key) >= self.content_column:
                    next_position += len(row[self.dict_keys.index(key)]) + 1

            if len(content) != 0:
                self.csvSeg.append(
                    Segment(
                        str_index = inputStrIdx,
                        start = position,
                        end = position + len(content),
                        annotations = segAnnotations
                        )
                    )
            else :
                # if no content, add idx of the row and do not append
                # TODO : something with contentIsNone
                self.contentIsNone.append(idx)

            # initiate new row starting position
            position = next_position

        progressBar.advance()

    unSeg = len(self.csvSeg)
    # Set status to OK and report segment analyzed...
    message = "%i segment@p analyzed." % unSeg
    message = pluralize(message, unSeg)
    message += " (Ignored %i segment@p with no content)" % \
        len(self.contentIsNone)
    message = pluralize(message, len(self.contentIsNone))
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)
    self.sendButton.resetSettingsChangedFlag()
    self.sendButton.sendIf()
def sendData(self):
    """Compute result of widget processing and send to output.

    Counts the contents of the input segments, learns signatures, stems and
    suffixes with lxa5crab, parses every input segment and sends a copy of
    the input segmentation annotated with "stem", "suffix" and "signature".
    Intermediate results are stored in ``self.morphology``.
    """
    outputChannel = "Morphologically analyzed data"

    # Clear morphology...
    self.morphology = dict()

    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        self.send(outputChannel, None, self)
        self.updateGUI()
        return

    # Perform morphological analysis...

    # Initialize progress bar.
    self.infoBox.setText(
        u"Processing, please wait (word count)...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=100)

    # Word count...
    wordCounts = collections.Counter(
        segment.get_content() for segment in self.inputSeg
    )
    self.morphology["wordCounts"] = wordCounts
    self.infoBox.setText(
        u"Processing, please wait (signature extraction)...",
        "warning",
    )
    progressBar.advance(5)  # 5 ticks on the progress bar...

    # Learn signatures...
    try:
        lxa5crab.crab_nebula.MIN_STEM_LEN = self.minStemLen
        signatures, stems, suffixes = lxa5crab.find_signatures(wordCounts)
        self.morphology["signatures"] = signatures
        self.morphology["stems"] = stems
        self.morphology["suffixes"] = suffixes
    except ValueError as e:
        self.infoBox.setText(str(e), "warning")
        self.send(outputChannel, None, self)
        self.controlArea.setDisabled(False)
        progressBar.finish()  # Clear progress bar.
        self.morphology = dict()
        self.updateGUI()
        return
    self.infoBox.setText(
        u"Processing, please wait (word parsing)...",
        "warning",
    )
    progressBar.advance(80)

    # Parse words...
    parser = lxa5crab.build_parser(wordCounts, signatures, stems, suffixes)
    self.morphology["parser"] = parser
    newSegments = list()
    num_analyzed_words = 0
    for segment in self.inputSeg:
        firstParse = parser[segment.get_content()][0]
        newSegment = segment.deepcopy()
        if firstParse.signature:
            num_analyzed_words += 1
        # NOTE(review): the original indentation was lost; annotations are
        # assumed to be added to every segment, analyzed or not -- confirm.
        newSegment.annotations.update(
            {
                "stem": firstParse.stem,
                "suffix": firstParse.suffix
                if len(firstParse.suffix) else "NULL",
                "signature": firstParse.signature,
            }
        )
        newSegments.append(newSegment)
    self.send(
        outputChannel,
        Segmentation(newSegments, self.captionTitle),
        self,
    )
    self.updateGUI()
    progressBar.advance(15)

    # Set status to OK and report data size...
    numSegments = len(self.inputSeg)
    # An empty (but not None) segmentation must not raise ZeroDivisionError.
    percentAnalyzed = (
        num_analyzed_words / numSegments * 100 if numSegments else 0.0
    )
    message = "%i segment@p sent to output (%.2f%% analyzed)." % (
        numSegments, percentAnalyzed)
    message = pluralize(message, numSegments)
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)
    self.sendButton.resetSettingsChangedFlag()
class Treetagger(OWTextableBaseWidget):
    """Orange widget for POS-tagging and lemmatization with Treetagger"""

    name = "Treetagger"
    description = "POS-tagging and lemmatization with Treetagger"
    icon = "icons/treetagger.svg"
    priority = 2003

    inputs = [("Segmentation", Segmentation, "inputData")]
    outputs = [("Tagged data", Segmentation)]

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    language = settings.Setting(0)
    replaceUnknown = settings.Setting(False)
    outputFormat = settings.Setting("segment into words")

    want_main_area = False

    # File (in the user data dir) where the Treetagger base dir is saved.
    configFilePath = os.path.normpath(
        appdirs.user_data_dir("textable", "langtech") + "/treetagger_path")

    def __init__(self, *args, **kwargs):
        """Initialize a Treetagger widget"""
        super().__init__(*args, **kwargs)

        # Other attributes...
        self.segmentation = None
        self.createdInputs = list()
        self.noLanguageParameterWarning = (
            "Please make sure that at least one language parameter "
            "file is installed in your Treetagger 'lib' directory, "
            "then click 'Reload language parameter files'.")
        self.noTreetaggerPathWarning = (
            "Please click 'Locate Treetagger' below and select the "
            "base directory of a valid Treetagger distribution.")
        # Auto-detected location first, then the path saved in app data.
        self.TreetaggerPath = (treetaggerwrapper.locate_treetagger()
                               or self.lookupSavedTreetaggerPath())

        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute=u"infoBox",
            sendIfPreCallback=self.updateGUI,
        )

        gui.separator(self.controlArea, height=3)

        self.optionsBox = gui.widgetBox(
            self.controlArea,
            u"Options",
        )

        self.languageCombobox = gui.comboBox(
            widget=self.optionsBox,
            master=self,
            value="language",
            items=list(),
            sendSelectedValue=True,
            orientation=u"horizontal",
            label="Input language:",
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Select the language of the input text."),
        )
        self.languageCombobox.setMinimumWidth(120)

        gui.separator(self.optionsBox, height=3)

        gui.comboBox(
            widget=self.optionsBox,
            master=self,
            value="outputFormat",
            items=[
                "segment into words",
                "add XML tags",
            ],
            sendSelectedValue=True,
            orientation=u"horizontal",
            label="Output format:",
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"Select the format of the output:\n\n"
                u"Segment into words: each word is in a separate segment,\n"
                u"with lemma and POS-tag as annotations.\n\n"
                u"Add XML tags: output segments correspond to input segments\n"
                u"and each word is tagged in XML as a 'w' element with\n"
                u"lemma and POS-tag as attributes."),
        )

        gui.separator(self.optionsBox, height=3)

        gui.checkBox(
            widget=self.optionsBox,
            master=self,
            value="replaceUnknown",
            label="Output token in place of [unknown] lemmas",
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"For out-of-vocabulary words, the word form is used as the\n"
                u"lemma (in place of Treetagger's default 'unknown' code)."),
        )

        gui.rubber(self.controlArea)

        self.sendButton.draw()
        self.infoBox.draw()

        self.locateTreetaggerBox = gui.widgetBox(
            self.controlArea,
            addSpace=False,
        )

        gui.separator(self.locateTreetaggerBox, height=3)

        self.treetaggerButton = gui.button(
            widget=self.locateTreetaggerBox,
            master=self,
            label="Locate Treetagger",
            callback=self.validateTreetagger,
            tooltip=(
                u"Click to select the location of the Treetagger base\n"
                u"directory (containing the 'lib' and 'bin' subdirectories)."),
        )

        self.sendButton.sendIf()

        self.adjustSizeWithTimer()

    def inputData(self, inputData):
        """Process incoming data."""
        self.segmentation = inputData
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def sendData(self):
        """Tag the input segmentation with Treetagger and send the result.

        The input segments are dumped into a single string (each wrapped in
        an <ax_tt> element carrying its annotations) so that Treetagger is
        called only once; the tagged text is then re-segmented and each
        output line is wrapped in a <w> element with lemma and pos-tag
        attributes. Depending on ``outputFormat`` the output is either that
        XML-tagged segmentation or one segment per <w> element.
        """
        # Clear created Inputs...
        self.clearCreatedInputs()

        if not self.TreetaggerPath:
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.getAvailableLanguages():
            self.infoBox.setText(self.noLanguageParameterWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.segmentation:
            self.infoBox.setText(u"Widget needs input", "warning")
            self.send("Tagged data", None)
            return

        # Initialize progress bar.
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        self.progressBar = ProgressBar(self, iterations=5)

        # Create a copy of input seg, storing annotations in temp attr...
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.segmentation.label
        for segment in self.segmentation:
            # Serialize annotations as XML attributes; combining marks are
            # stripped from the keys (NFD decomposition, category 'Mn').
            attr = " ".join([
                "%s=%s" % (
                    ''.join(c for c in unicodedata.normalize('NFD', item[0])
                            if unicodedata.category(c) != 'Mn'),
                    quoteattr(str(item[1])),
                ) for item in segment.annotations.items()
            ])
            segment.annotations["tt_ax"] = attr
            copy_of_input_seg.append(segment)

        self.progressBar.advance()

        # Dump segmentation in unique string to avoid multiple calls to TT...
        concatenated_text = copy_of_input_seg.to_string(
            formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
            display_all=True,
        )

        self.progressBar.advance()

        # Tag the segmentation contents...
        tagopt = '-token -lemma -sgml -quiet'
        if self.replaceUnknown:
            tagopt += " -no-unknown"
        tagger = treetaggerwrapper.TreeTagger(
            TAGLANG=pycountry.languages.get(name=self.language).alpha_2,
            TAGOPT=tagopt,
            TAGDIR=self.TreetaggerPath,
        )
        tagged_lines = tagger.tag_text(
            concatenated_text,
            notagurl=True,
            notagemail=True,
            notagip=True,
            notagdns=True,
        )
        tagged_input = Input("\n".join(tagged_lines))
        self.createdInputs.append(tagged_input)

        # Replace <unknown> with [unknown], apply the quote substitution,
        # then re-segment to match the original segmentation structure.
        # NOTE(review): as written, the second substitution replaces three
        # double quotes with the same three double quotes (a no-op);
        # presumably the replacement once contained an escaped quote
        # entity -- confirm against the upstream source.
        tagged_segmentation, _ = Segmenter.recode(
            tagged_input,
            substitutions=[
                (re.compile(r"<unknown>"), "[unknown]"),
                (re.compile(r'"""'), '"""'),
            ],
        )
        tagged_segmentation = Segmenter.import_xml(tagged_segmentation,
                                                   "ax_tt")

        self.progressBar.advance()

        # Place each output line of Treetagger in an xml tag with annotations..
        xml_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                 '<w lemma="&3" pos-tag="&2">&1</w>'),
                (re.compile(r'^\n|\n$'), ''),
            ],
        )

        # Segment into individual tokens if XML output option is disabled...
        if self.outputFormat == "add XML tags":
            output_segmentation = xml_segmentation
        else:
            try:
                output_segmentation = Segmenter.import_xml(
                    xml_segmentation, "w")
            except ValueError:
                self.infoBox.setText(
                    "Please check that either the input contains well-formed "
                    "XML, or it doesn't contain instances of '<' and '\x3e'",
                    "error")
                self.send("Tagged data", None)
                self.progressBar.finish()
                self.controlArea.setDisabled(False)
                return

        self.progressBar.finish()
        self.controlArea.setDisabled(False)

        output_segmentation.label = self.captionTitle
        message = u'%i segment@p sent to output.' % len(output_segmentation)
        message = pluralize(message, len(output_segmentation))
        self.infoBox.setText(message)
        self.send('Tagged data', output_segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def updateGUI(self):
        """Update GUI state"""
        if self.TreetaggerPath:
            self.optionsBox.setDisabled(False)
            self.locateTreetaggerBox.setVisible(False)
            # getAvailableLanguages() refills the combobox as a side effect.
            self.languageCombobox.clear()
            languages = self.getAvailableLanguages()
            if not languages:
                self.infoBox.setText(self.noLanguageParameterWarning,
                                     "warning")
                self.optionsBox.setDisabled(True)
                self.locateTreetaggerBox.setVisible(True)
                self.treetaggerButton.setText(
                    "Reload language parameter files")
            else:
                self.language = self.language or languages[0]
        else:
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.optionsBox.setDisabled(True)
            self.locateTreetaggerBox.setVisible(True)
        self.adjustSizeWithTimer()

    def getAvailableLanguages(self):
        """Return the names of languages usable with the current path.

        Each language code known to treetaggerwrapper is probed by trying
        to instantiate a tagger for it; every language that works is also
        appended to the language combobox.
        """
        languages = list()
        for lang_code in sorted(treetaggerwrapper.g_langsupport):
            if lang_code.startswith("__"):
                continue
            try:
                treetaggerwrapper.TreeTagger(
                    TAGLANG=lang_code,
                    TAGDIR=self.TreetaggerPath,
                )
                language = pycountry.languages.get(alpha_2=lang_code).name
                self.languageCombobox.addItem(language)
                languages.append(language)
            except Exception:
                # Best-effort probing: a language that cannot be loaded (or
                # named) is simply not offered. Unlike a bare except, this
                # lets KeyboardInterrupt/SystemExit propagate.
                continue
        return languages

    def lookupSavedTreetaggerPath(self):
        """Look for a saved Treetagger base dir path in app data

        Returns the saved path if it is still valid; otherwise the stale
        config file is removed and None is returned.
        """
        configFilePath = self.__class__.configFilePath
        if os.path.exists(configFilePath):
            try:
                with open(configFilePath, "r") as inputFile:
                    TreetaggerSavedPath = inputFile.read()
                if self.checkTreetaggerPath(TreetaggerSavedPath):
                    return TreetaggerSavedPath
                os.remove(configFilePath)
                return None
            except IOError:
                pass
        return None

    def validateTreetagger(self):
        """Respond to user actions needed to validate Treetagger path"""

        # If the Treetagger path is known, make sure there are language files...
        if self.TreetaggerPath:
            if self.getAvailableLanguages():
                self.sendButton.settingsChanged()
                self.updateGUI()
            else:
                QMessageBox.warning(None, 'Textable',
                                    'Language parameter files not found.',
                                    QMessageBox.Ok)
            return

        # Else if the path is not known...

        # First try to locate it automatically...
        TreetaggerPath = treetaggerwrapper.locate_treetagger()

        # If it fails, let the user locate it manually...
        if not (TreetaggerPath and self.checkTreetaggerPath(TreetaggerPath)):
            selectedDir = str(
                QFileDialog.getExistingDirectory(
                    self, u"Please locate Treetagger base directory"))

            # If user selected a dir... (test the raw dialog result: an
            # empty selection would be turned into "." by normpath and
            # wrongly treated as a selected directory.)
            if selectedDir:
                TreetaggerManualPath = os.path.normpath(selectedDir)

                # Check if selected dir contains Treetagger binary...
                if self.checkTreetaggerPath(TreetaggerManualPath):
                    TreetaggerPath = TreetaggerManualPath
                else:
                    QMessageBox.warning(
                        None, 'Textable',
                        'Not a valid Treetagger base directory.',
                        QMessageBox.Ok)

        # If a valid path was found somehow, save config to app data...
        if TreetaggerPath:
            configFilePath = self.__class__.configFilePath
            try:
                # Creates the parent directories as needed.
                os.makedirs(os.path.dirname(configFilePath), exist_ok=True)
                with open(configFilePath, "w") as outputFile:
                    outputFile.write(TreetaggerPath)
            except IOError:
                # Saving the path is a convenience only; carry on without it.
                pass
            self.TreetaggerPath = TreetaggerPath
            self.sendButton.settingsChanged()

    def checkTreetaggerPath(self, path):
        """Check if path is a valid Treetagger base dir"""
        return os.path.exists(
            os.path.normpath(path + "/bin/tree-tagger" +
                             (".exe" if os.name == "nt" else "")))

    def clearCreatedInputs(self):
        """Release the string data of every Input created by this widget"""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    def onDeleteWidget(self):
        """Free memory when widget is deleted (overriden method)"""
        self.clearCreatedInputs()

    def setCaption(self, title):
        """Set the caption and flag settings as changed if it differs"""
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
def searchMovies(self):
    """Search from imdb movie database

    Stores at most ``self.nbr_results`` results in ``self.searchResults``
    (a dict keyed by consecutive integers starting at 1) and refreshes
    ``self.titleLabels`` with one "title - year" (or just "title") label
    per result.
    """
    query_string = self.newQuery
    if query_string == "":
        self.infoBox.setText("Please enter a movie title", "warning")
        return

    counter_max = int(self.nbr_results)
    result_list = {}

    self.controlArea.setDisabled(True)

    # Initialize progress bar
    progressBar = ProgressBar(self, iterations=counter_max)

    try:
        ia = imdb.IMDb()

        # searching the movie
        search = ia.search_movie(query_string)

        # Each result is stored in a dictionnary with its title
        # and year of publication if it is specified
        for result_id, result in enumerate(search, start=1):
            if result_id > counter_max:
                break
            # Keys stay consecutive and the movie id is kept even when
            # the year is missing.
            entry = {'name': result, 'id': result.movieID}
            try:
                entry['year'] = result['year']
            except KeyError:
                pass
            result_list[result_id] = entry

            # 1 tick on the progress bar of the widget
            progressBar.advance()

        # Stored the results list in the "result_list" variable
        self.searchResults = result_list

        # Reset and clear the visible widget list
        del self.titleLabels[:]

        # Update the results list with the search results
        # in order to display them
        for entry in self.searchResults.values():
            if 'year' in entry:
                result_string = f'{entry["name"]} - {entry["year"]}'
            else:
                result_string = f'{entry["name"]}'
            self.titleLabels.append(result_string)

        # Self-assignment kept from the original (presumably to notify
        # the GUI of the in-place change -- confirm).
        self.titleLabels = self.titleLabels
        self.clearButton.setDisabled(False)
        self.addButton.setDisabled(False)
    finally:
        # Clear progress bar and re-enable the GUI even if the search fails.
        progressBar.finish()
        self.controlArea.setDisabled(False)
def sendData(self):
    """Compute result of widget processing and send to output

    Downloads every text in ``self.myBasket`` from Gutenberg, wraps each
    one in an Input, concatenates them if there are several, annotates
    the segments with their title and sends the result.
    """
    # Skip if title list is empty:
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some books first", "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(
        self,
        iterations=len(self.myBasket),
    )

    text_content = list()
    annotations = list()

    # get the Gutenberg cache
    cache = GutenbergCache.get_cache()
    try:
        # TODO: Retrieve selected texts from gutenberg
        for text in self.myBasket:

            # Get the id of the text
            # NOTE(review): the query is built by string formatting;
            # text[2] presumably comes from the local cache, not from user
            # input -- confirm, or use a parameterized query if supported.
            query_id = cache.native_query(
                sql_query=
                "select gutenbergbookid from books where id == {selected_id}"
                .format(selected_id=text[2]))
            gutenberg_id = list(query_id)

            # Get the text with Gutenbergpy
            # NOTE(review): gutenbergpy may return bytes here -- confirm
            # whether the text must be decoded before building an Input.
            gutenberg_text = gutenbergpy.textget.strip_headers(
                gutenbergpy.textget.get_text_by_id(gutenberg_id[0][0]))
            text_content.append(gutenberg_text)

            annotations.append(text[1])
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from Gutenberg",
                             "error")
        # Clear the progress bar on the error path too.
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # TODO: send gutenberg texts as output
    # Store downloaded text strings in input objects...
    for text in text_content:
        newInput = Input(text, self.captionTitle)
        self.createdInputs.append(newInput)

    # If there's only one text, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # TODO: annotate with book metadata
    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update({"title": annotations[idx]})
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Gutenberg importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def searchFunction(self):
    """Search from website Genius

    Queries the Genius search API page by page (one page per 10 requested
    results), stores the hits in ``self.searchResults`` (a dict keyed by
    consecutive integers starting at 1) and refreshes ``self.titleLabels``
    with one "title - artist" label per hit.
    """
    query_string = self.newQuery
    if query_string == "":
        self.infoBox.setText("You didn't search anything", "warning")
        return

    result_list = {}
    result_id = 0
    # Floor division keeps the page count an int; the set of pages visited
    # is the same as with the former float bound.
    page_max = int(self.nbr_results) // 10

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=page_max)

    try:
        for page in range(1, page_max + 1):
            values = {'q': query_string, 'page': page}
            data = urllib.parse.urlencode(values)
            query_url = 'http://api.genius.com/search?' + data
            json_obj = self.url_request(query_url)
            body = json_obj["response"]["hits"]

            # Each result is stored in a dictionnary with its title,
            # artist's name, artist's ID and URL path
            for result in body:
                result_id += 1
                hit = result["result"]
                result_list[result_id] = {
                    'artist': hit["primary_artist"]["name"],
                    'artist_id': hit["primary_artist"]["id"],
                    'path': hit["path"],
                    'title': hit["title"],
                }

            # 1 tick on the progress bar of the widget
            progressBar.advance()

        # Stored the results list in the "result_list" variable
        self.searchResults = result_list

        # Reset and clear the visible widget list
        del self.titleLabels[:]

        # Update the results list with the search results
        # in order to display them
        for entry in self.searchResults.values():
            self.titleLabels.append(entry["title"] + " - " + entry["artist"])

        # Self-assignment kept from the original (presumably to notify
        # the GUI of the in-place change -- confirm).
        self.titleLabels = self.titleLabels
        self.clearButton.setDisabled(False)
        self.addButton.setDisabled(self.selectedTitles == list())
    finally:
        # Clear progress bar and re-enable the GUI even if a request fails.
        progressBar.finish()
        self.controlArea.setDisabled(False)
def sendData(self):
    """Compute result of widget processing and send to output

    Downloads the lyrics of every song in ``self.myBasket`` from Genius,
    wraps each one in an Input, concatenates them if there are several,
    annotates the segments with the song metadata and sends the result.
    """
    # Skip if title list is empty:
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some songs first", "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    # Attempt to connect to Genius and retrieve lyrics...
    song_content = list()
    annotations = list()
    try:
        for song in self.myBasket:
            # Each song is indexed by 'path' and its copy is used as the
            # annotation dict of the corresponding segment.
            page_url = "http://genius.com" + song['path']
            lyrics = self.html_to_text(page_url)
            song_content.append(lyrics)
            annotations.append(song.copy())
            # 1 tick on the progress bar of the widget
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from Genius website.",
                             "error")
        # Clear the progress bar on the error path too.
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store downloaded lyrics strings in input objects...
    for song in song_content:
        newInput = Input(song, self.captionTitle)
        self.createdInputs.append(newInput)

    # If there's only one song, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Lyrics importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output

    Downloads the XML of every selected title from theatre-classique,
    wraps each document in an Input, concatenates them if there are
    several, annotates the segments with the title metadata and sends the
    result on the "XML-TEI data" channel.
    """
    # Skip if title list is empty:
    if not self.titleLabels:
        return

    # Check that something has been selected...
    if len(self.selectedTitles) == 0:
        self.infoBox.setText("Please select one or more titles.", "warning")
        self.send("XML-TEI data", None, self)
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    # Initialize progress bar.
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.selectedTitles))

    # Attempt to connect to Theatre-classique and retrieve plays...
    xml_contents = list()
    annotations = list()
    try:
        for title in self.selectedTitles:
            titleAnnotations = self.filteredTitleSeg[title].annotations
            # The context manager closes the HTTP response in all cases.
            with urllib.request.urlopen(
                    self.document_base_url +
                    titleAnnotations["url"]) as response:
                xml_contents.append(response.read().decode('utf-8'))
            annotations.append(titleAnnotations.copy())
            progressBar.advance()  # 1 tick on the progress bar...

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:
        # Set Info box and widget to "error" state.
        self.infoBox.setText(
            "Couldn't download data from theatre-classique website.",
            "error")

        # Reset output channel.
        self.send("XML-TEI data", None, self)
        # Clear the progress bar on the error path too.
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store downloaded XML in input objects...
    for xml_content in xml_contents:
        newInput = Input(xml_content, self.captionTitle)
        self.createdInputs.append(newInput)

    # If there's only one play, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Store imported URLs as setting.
    # NOTE(review): only the URL of the first selected title is stored --
    # confirm this is intended when several titles are selected.
    self.importedURLs = [
        self.filteredTitleSeg[self.selectedTitles[0]].annotations["url"]
    ]

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Send token...
    self.send("XML-TEI data", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output

    Downloads every text in ``self.myBasket`` from Gutenberg, wraps each
    one in an Input, concatenates them if there are several, annotates
    the segments with title, author and language, and sends the result.
    """
    # Skip if title list is empty:
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some books first", "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(
        self,
        iterations=len(self.myBasket),
    )

    text_content = list()
    annotations = list()

    try:
        # Retrieve selected texts from gutenberg
        for text in self.myBasket:

            gutenberg_id = text[2]

            # Get the text with Gutenbergpy
            gutenberg_text = gutenbergpy.textget.strip_headers(
                gutenbergpy.textget.get_text_by_id(gutenberg_id)).decode(
                    "utf-8")

            text_content.append(gutenberg_text)
            # populate the annotation list (title, author, language)
            annotations.append([text[0], text[1], text[3]])
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception as exc:
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from Gutenberg",
                             "error")
        # Clear the progress bar on the error path too.
        progressBar.finish()
        self.controlArea.setDisabled(False)
        print(exc)
        return

    # Store downloaded text strings in input objects...
    for text in text_content:
        newInput = Input(text, self.captionTitle)
        self.createdInputs.append(newInput)

    # If there's only one text, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation.
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments with book metadata
    for idx, segment in enumerate(self.segmentation):
        title, author, language = annotations[idx]
        segment.annotations.update({
            "title": title,
            "author": author,
            "language": language,
        })
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Gutenberg importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output

    Runs ``self.nlp`` on the content of every input segment and sends a
    segmentation with one segment per token; each token segment inherits
    the annotations of its input segment, plus every RELEVANT_KEYS
    attribute of the token whose value is not None.
    """
    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        self.send("Linguistically analyzed data", None, self)
        return

    # Initialize progress bar.
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.inputSeg))

    tokenizedSegments = list()

    # Process each input segment...
    for segment in self.inputSeg:

        # Input segment attributes...
        inputContent = segment.get_content()
        inputAnnotations = segment.annotations
        inputString = segment.str_index
        inputStart = segment.start or 0

        # NLP analysis...
        doc = self.nlp(inputContent)

        # Process each token in input segment...
        for token in doc:
            tokenAnnotations = inputAnnotations.copy()
            for key in RELEVANT_KEYS:
                value = getattr(token, key)
                if value is not None:
                    tokenAnnotations[key] = value
            # token.idx is relative to the segment content, so shift it by
            # the segment's own start to address the underlying string.
            tokenStart = inputStart + token.idx
            tokenizedSegments.append(
                Segment(
                    str_index=inputString,
                    start=tokenStart,
                    end=tokenStart + len(token),
                    annotations=tokenAnnotations,
                ))

        progressBar.advance()

    outputSeg = Segmentation(tokenizedSegments, self.captionTitle)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output." % len(outputSeg)
    message = pluralize(message, len(outputSeg))
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Send data to output...
    self.send("Linguistically analyzed data", outputSeg, self)

    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Load files, create and send segmentation

    Each selected file is read as raw text, as a PDF (text extraction or
    OCR) or as an image (OCR) depending on its detected type, normalized
    to NFC and wrapped in an Input; several Inputs are concatenated. Any
    file that cannot be read aborts the whole operation with an error.
    """

    # Check that there's something on input...
    if (
        (self.displayAdvancedSettings and not self.files) or
        not (self.file or self.displayAdvancedSettings)
    ):
        self.infoBox.setText(u'Please select input file.', 'warning')
        self.send('Text data', None, self)
        return

    # Check that autoNumberKey is not empty (if necessary)...
    if (
        self.displayAdvancedSettings and self.autoNumber
        and not self.autoNumberKey
    ):
        self.infoBox.setText(
            u'Please enter an annotation key for auto-numbering.',
            'warning')
        self.send('Text data', None, self)
        return

    # Clear created Inputs...
    self.clearCreatedInputs()

    fileContents = list()
    annotations = list()
    counter = 1

    if self.displayAdvancedSettings:
        myFiles = self.files
    else:
        myFiles = [[self.file, self.encoding, "", "", "", "eng", False]]

    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(myFiles))

    def abort(message):
        """Report *message* as an error and restore the widget state."""
        progressBar.finish()
        self.infoBox.setText(message, 'error')
        self.send('Text data', None, self)
        self.controlArea.setDisabled(False)

    # Open and process each file successively...
    for myFile in myFiles:
        filePath = myFile[0]
        encoding = re.sub(r"[ ]\(.+", "", myFile[1])
        annotation_key = myFile[2]
        annotation_value = myFile[3]
        # myFile[4] (PDF password) is not used by this method.
        ocr_languages = myFile[5]  # SuperTextFiles
        ocr_force = myFile[6]  # SuperTextFiles

        # Try to open the file...
        self.error()
        # Stays None when the detected type is neither text, pdf nor image
        # (previously the name was unbound or stale in that case).
        fileContent = None
        # Start SuperTextFiles
        try:
            # Type detection reads the file, so it belongs in the try.
            myFiletype = filetype.guess(filePath)  # SuperTextFiles
            if myFiletype is None:
                fileContent = self.extract_raw_text(filePath, encoding)

            elif myFiletype.extension == "pdf":
                if (
                    ocr_force is not True
                    and self.is_textual_pdf_file(filePath) is True
                ):
                    fileContent = self.extract_text_from_pdf(filePath)
                else:
                    fileContent = self.get_pdf_content(
                        filePath,
                        ocr_languages,
                    )

            elif myFiletype.extension in IMG_FILETYPES:
                fileContent = self.ocrize(filePath, ocr_languages)
        # End SuperTextFiles
        except IOError as e:
            if "tesseract" in str(e):
                QMessageBox.warning(None, 'Textable', str(e), QMessageBox.Ok)
            if len(myFiles) > 1:
                abort(u"Couldn't open file '%s'." % filePath)
            else:
                abort(u"Couldn't open file.")
            return

        # -1 is the failure value tested by the original code; None means
        # no reader handled this file type.
        if fileContent is None or fileContent == -1:
            abort(u"Couldn't open file.")
            return

        # Remove utf-8 BOM if necessary...
        if encoding == u'utf-8':
            fileContent = fileContent.lstrip(
                codecs.BOM_UTF8.decode('utf-8'))

        # Normalize text (canonical decomposition then composition)...
        fileContent = normalize('NFC', fileContent)

        fileContents.append(fileContent)

        # Annotations...
        annotation = dict()
        if self.displayAdvancedSettings:
            if annotation_key and annotation_value:
                annotation[annotation_key] = annotation_value
            if self.importFilenames and self.importFilenamesKey:
                filename = os.path.basename(filePath)
                annotation[self.importFilenamesKey] = filename
            if self.autoNumber and self.autoNumberKey:
                annotation[self.autoNumberKey] = counter
                counter += 1
        annotations.append(annotation)
        progressBar.advance()

    # Create an LTTL.Input for each file...
    label = self.captionTitle if len(fileContents) == 1 else None
    for content, annotation in zip(fileContents, annotations):
        myInput = Input(content, label)
        segment = myInput[0]
        segment.annotations.update(annotation)
        myInput[0] = segment
        self.createdInputs.append(myInput)

    # If there's only one file, the widget's output is the created Input.
    if len(fileContents) == 1:
        self.segmentation = self.createdInputs[0]
    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            segmentations=self.createdInputs,
            label=self.captionTitle,
            copy_annotations=True,
            import_labels_as=None,
            sort=False,
            auto_number_as=None,
            merge_duplicates=False,
            progress_callback=None,
        )

    message = u'%i segment@p sent to output ' % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += u'(%i character@p).' % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.send('Text data', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    Creates one segment for every row of ``self.char_df`` tagged "PER",
    translating the row's ``start_pos``/``end_pos`` (offsets in the
    concatenation of all input segments) into offsets relative to the
    input segment that contains it. Each created segment is annotated
    with the row's ``alias`` under the key "id". The resulting
    segmentation is sent on the "Character segmentation" channel.

    Returns None. If no model is available or no input is connected,
    a warning is shown and None is sent to the outputs instead.
    """
    # Check that there's a model...
    if not self.model:
        self.noLanguageModelWarning()
        self.sendNoneToOutputs()
        return

    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input.", "warning")
        self.sendNoneToOutputs()
        return

    # Tell the user that processing has started...
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )

    # Disable control area and initialize progress bar...
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.char_df))

    # The try/finally guarantees that the GUI is re-enabled and the
    # progress bar cleared even if processing raises (e.g. a token whose
    # end position lies beyond the last segment).
    try:
        # Get start and end pos of concatenated input segments (segments
        # are assumed to be separated by one character, hence the "+ 1")...
        startPositions = [0]
        endPositions = list()
        numSegments = len(self.inputSeg)
        for idx in range(1, numSegments):
            prevSegLen = len(self.inputSeg[idx-1].get_content())
            startPositions.append(startPositions[-1] + prevSegLen + 1)
            endPositions.append(startPositions[-1] - 1)
        # NOTE(review): the last bound has one extra position of slack
        # compared with the previous ones; kept as is, confirm intent.
        endPositions.append(
            startPositions[-1] + len(self.inputSeg[-1].get_content()) + 1
        )

        # Get or update character aliases...
        find_pairs = sys.modules['charnetto.find_pairs']
        characters = [entry.split(", ") for entry in self.characters]
        find_pairs.map_names(self.char_df, characters)

        # Initializations...
        charSegments = list()
        currentSegmentIdx = 0

        # For each character token in Charnetto's output...
        # NOTE(review): currentSegmentIdx only ever increases, so rows
        # are presumably ordered by position -- confirm.
        for _, charToken in self.char_df.iterrows():

            # The progress bar is sized on the total number of rows, so
            # every row counts, including the ones skipped below.
            progressBar.advance()

            # Skip non-PER named entities.
            if charToken["tag"] != "PER":
                continue

            # Get index of containing segment...
            while charToken["end_pos"] > endPositions[currentSegmentIdx]:
                currentSegmentIdx += 1

            # Create segment for char with its actual coordinates...
            strIndex = self.inputSeg[currentSegmentIdx].str_index
            offset = startPositions[currentSegmentIdx]
            start = charToken["start_pos"] - offset
            end = charToken["end_pos"] - offset
            annotations = {"id": charToken["alias"]}
            charSegments.append(Segment(strIndex, start, end, annotations))

        # Send output...
        outputSegmentation = Segmentation(
            charSegments,
            label=self.captionTitle,
        )
        self.send("Character segmentation", outputSegmentation, self)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output." % len(outputSegmentation)
        message = pluralize(message, len(outputSegmentation))
        self.infoBox.setText(message)
    finally:
        # Clear progress bar and give control back to the user.
        progressBar.finish()
        self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()