def test_concatenate_progress(self):
    """Check that concatenate reports progress through its callback.

    The callback increments ``self.count``; after the call the counter is
    expected to equal ``len(self.letter_seg1)``.
    """
    def bump_counter():
        """Record one progress notification."""
        self.count += 1

    inputs = [self.letter_seg1]
    Segmenter.concatenate(inputs, progress_callback=bump_counter)

    self.assertEqual(
        self.count,
        len(self.letter_seg1),
        msg="concatenate doesn't track progress!",
    )
def test_concatenate_progress(self):
    """Verify that concatenate invokes its progress callback.

    ``self.count`` is incremented by the callback and compared with the
    length of ``self.letter_seg1`` once concatenation is done.
    """
    def count_tick():
        """Increment the test's progress counter."""
        self.count += 1

    Segmenter.concatenate(
        [self.letter_seg1],
        progress_callback=count_tick,
    )

    failure_message = "concatenate doesn't track progress!"
    self.assertEqual(self.count, len(self.letter_seg1), msg=failure_message)
def send_data(self):
    """Build inputs from the fetched data, annotate them and send them.

    One ``Input`` is created per text found in ``self.queryList``. A single
    input is used as is; otherwise the inputs are concatenated. Each
    resulting segment is updated with the annotation dict found at the same
    position in the flattened ``self.annotList``. The segmentation is sent
    on the "Segmentation" channel, or ``None`` if it holds no segment.
    """
    self.controlArea.setDisabled(True)
    self.clearCreatedInputs()

    # One Input per text, across every query of the data list.
    self.createdInputs.extend(
        Input(text) for query in self.queryList for text in query
    )

    # A lone input is the output itself; anything else is concatenated.
    if len(self.createdInputs) == 1:
        segmentation = self.createdInputs[0]
    else:
        segmentation = Segmenter.concatenate(
            self.createdInputs,
            import_labels_as=None,
        )

    # Flatten the nested annotation lists and apply them by position.
    flat_annotations = [
        entry for group in self.annotList for entry in group
    ]
    for position, segment in enumerate(segmentation):
        segment.annotations.update(flat_annotations[position])
        segmentation[position] = segment

    # Total number of characters over all segments.
    num_chars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in segmentation
    )

    segment_count = len(segmentation)

    # Nothing to send: warn the user and clear the output channel.
    if segment_count == 0:
        self.infoBox.setText(
            "There are {} segments to send to output. Please fill the query basket and click 'send' again"
            .format(segment_count),
            "warning")
        self.sendButton.resetSettingsChangedFlag()
        self.controlArea.setDisabled(False)
        self.send("Segmentation", None)
        return

    # Report the data size, then send the segments.
    self.infoBox.setText(
        "{} segments sent to output ({} characters)".format(
            segment_count,
            num_chars,
        ))
    self.send("Segmentation", segmentation)
    self.controlArea.setDisabled(False)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Transcribe the selected audio file and send the text to output.

    Sends ``None`` on the 'Text data' channel (with a warning in the info
    box) when no file is selected, when the speech recognizer raises
    ``UnknownValueError``, or when no transcription could be produced.
    Otherwise one ``Input`` is created per transcription chunk (if
    ``self.selected_seg`` is set) or a single ``Input`` for the whole
    transcription, and their concatenation is sent.
    """
    if not self.file:
        self.infoBox.setText(u"Please select input file.", "warning")
        self.send('Text data', None, self)
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    # Get transcription
    try:
        transcription = self.get_large_audio_transcription(
            self.file,
            language=self.language,
            set_silence_len=self.selected_dur,
            set_silence_threshold=self.selected_vol)
    except speech_recognition.UnknownValueError:
        self.infoBox.setText(
            u"You seem to have overuseed the built-in API key, refer to the documentation for further informations.",
            "warning")
        self.send('Text data', None, self)
        return

    # Checks if there is a transcription
    if transcription is None:
        self.infoBox.setText(u"You must use mp3 or wav audio files.",
                             "warning")
        self.send('Text data', None, self)
        return

    # Use the file name (last path component) as label for the inputs.
    # The previous code passed the *list* returned by re.findall, built
    # from a pattern whose "[mp3|wav]" part is a character class rather
    # than an alternation, so the label was not a plain file name string.
    file_label = re.split(r"[/\\]", self.file)[-1]

    if self.selected_seg:
        for chunk in transcription:
            self.createdInputs.append(Input(chunk, label=file_label))
    else:
        self.createdInputs.append(Input(transcription, label=file_label))

    # Concatenates the segmentations in the output segmentation
    self.segmentation = Segmenter.concatenate(
        segmentations=self.createdInputs,
        label=self.captionTitle,
        copy_annotations=False,
        import_labels_as="")

    # Sending segments length
    message = " Succesfully transcripted ! % i segment@p sent to output" % len(
        self.segmentation)
    message = pluralize(message, len(self.segmentation))

    # Send token...
    self.send("Text data", self.segmentation, self)
    self.infoBox.setText(message)
    self.sendButton.resetSettingsChangedFlag()
def test_concatenate_merge_segments(self):
    """Check that concatenate merges the segments of all its inputs.

    Concatenating ``letter_seg2`` and ``letter_seg1`` is expected to give
    segments whose contents are 'a' to 'e', in that order.
    """
    sources = [self.letter_seg2, self.letter_seg1]
    merged = Segmenter.concatenate(sources)
    observed = [segment.get_content() for segment in merged]
    self.assertEqual(
        observed,
        ['a', 'b', 'c', 'd', 'e'],
        msg="concatenate doesn't merge input segments!",
    )
def test_concatenate_copy_annotations_false(self):
    """Check that concatenate omits annotations with copy_annotations=False.

    Neither key 'b' on the segment at index 1 nor key 'a' on the segment at
    index 3 may be present in the result.
    """
    result = Segmenter.concatenate(
        [self.letter_seg2, self.letter_seg1],
        copy_annotations=False,
    )
    annotation_leaked = (
        'b' in result[1].annotations
        or 'a' in result[3].annotations
    )
    self.assertFalse(
        annotation_leaked,
        msg="concatenate doesn't skip copying annotations!",
    )
def test_concatenate_autonumber(self):
    """Check that concatenate numbers the output segments.

    With ``auto_number_as='num'`` the 'num' annotations of the result are
    expected to be 1 through 5.
    """
    numbered = Segmenter.concatenate(
        [self.letter_seg2, self.letter_seg1],
        auto_number_as='num',
    )
    observed = [segment.annotations['num'] for segment in numbered]
    expected = list(range(1, 6))
    self.assertEqual(
        observed,
        expected,
        msg="concatenate doesn't autonumber input segments!",
    )
def test_concatenate_merge_duplicates(self):
    """Check that concatenate merges duplicate segments on request.

    Concatenating ``letter_seg2`` with ``single_letter_seg`` under
    ``merge_duplicates=True`` is expected to leave contents 'c', 'd', 'e'.
    """
    sources = [self.letter_seg2, self.single_letter_seg]
    deduplicated = Segmenter.concatenate(sources, merge_duplicates=True)
    observed = [segment.get_content() for segment in deduplicated]
    self.assertEqual(
        observed,
        ['c', 'd', 'e'],
        msg="concatenate doesn't merge duplicates!",
    )
def test_concatenate_solve_conflicts_merge_duplicates(self):
    """Check annotation conflict resolution when duplicates are merged.

    After merging, the segment at index 1 is expected to carry the value
    '1' for annotation key 'b'.
    """
    sources = [self.letter_seg2, self.single_letter_seg]
    merged = Segmenter.concatenate(sources, merge_duplicates=True)
    resolved_value = merged[1].annotations['b']
    self.assertEqual(
        resolved_value,
        '1',
        msg="concatenate doesn't solve conflicts when merging duplicates!",
    )
def test_concatenate_merge_segments(self):
    """Verify that concatenate combines the segments of its inputs.

    The contents of the concatenation of ``letter_seg2`` and
    ``letter_seg1`` are compared with the list 'a' to 'e'.
    """
    combined = Segmenter.concatenate([self.letter_seg2, self.letter_seg1])
    contents = [item.get_content() for item in combined]
    expected_contents = ['a', 'b', 'c', 'd', 'e']
    self.assertEqual(
        contents,
        expected_contents,
        msg="concatenate doesn't merge input segments!",
    )
def test_concatenate_copy_annotations(self):
    """Check that concatenate copies annotations with copy_annotations=True.

    Annotation 'b' of the segment at index 3 and annotation 'a' of the
    segment at index 0 are expected to be '2' and '1' respectively.
    """
    result = Segmenter.concatenate(
        [self.letter_seg2, self.letter_seg1],
        copy_annotations=True,
    )
    observed = [
        result[3].annotations['b'],
        result[0].annotations['a'],
    ]
    self.assertEqual(
        observed,
        ['2', '1'],
        msg="concatenate doesn't copy annotations!",
    )
def test_concatenate_solve_conflicts_merge_duplicates(self):
    """Verify how annotation conflicts are settled when merging duplicates.

    The 'b' annotation of the segment at index 1 must equal '1' after
    concatenating with ``merge_duplicates=True``.
    """
    result = Segmenter.concatenate(
        [self.letter_seg2, self.single_letter_seg],
        merge_duplicates=True,
    )
    failure_message = (
        "concatenate doesn't solve conflicts when merging duplicates!"
    )
    self.assertEqual(result[1].annotations['b'], '1', msg=failure_message)
def test_concatenate_merge_duplicates(self):
    """Verify that duplicate segments are merged by concatenate.

    The contents of the result built from ``letter_seg2`` and
    ``single_letter_seg`` are compared with 'c', 'd', 'e'.
    """
    result = Segmenter.concatenate(
        [self.letter_seg2, self.single_letter_seg],
        merge_duplicates=True,
    )
    contents = [item.get_content() for item in result]
    expected_contents = ['c', 'd', 'e']
    self.assertEqual(
        contents,
        expected_contents,
        msg="concatenate doesn't merge duplicates!",
    )
def test_concatenate_autonumber(self):
    """Verify that concatenate auto-numbers segments under a given key.

    The 'num' annotation values of the result are compared with
    ``list(range(1, 6))``.
    """
    sources = [self.letter_seg2, self.letter_seg1]
    result = Segmenter.concatenate(sources, auto_number_as='num')
    numbers = [item.annotations['num'] for item in result]
    self.assertEqual(
        numbers,
        list(range(1, 6)),
        msg="concatenate doesn't autonumber input segments!",
    )
def test_concatenate_copy_annotations_false(self):
    """Verify that annotations are not copied when copy_annotations=False.

    The test fails if 'b' is an annotation key of the segment at index 1
    or 'a' is an annotation key of the segment at index 3.
    """
    sources = [self.letter_seg2, self.letter_seg1]
    result = Segmenter.concatenate(sources, copy_annotations=False)
    has_copied_annotation = (
        'b' in result[1].annotations or 'a' in result[3].annotations
    )
    self.assertFalse(
        has_copied_annotation,
        msg="concatenate doesn't skip copying annotations!",
    )
def huntTheLexic(self):
    """Label the input segments according to the selected lexical fields.

    For every selected topic whose word list is non-empty, the segments of
    ``self.inputSeg`` matching that list are selected and labelled with the
    topic name. ``self.outputSeg`` is then set to the concatenation (with
    duplicates merged) of a copy of the input and these selections, the
    labels being imported under ``self.labelName`` (or "Topic" if empty).
    """
    # Topics chosen by the user (none when no title is available).
    chosen_topics = []
    if self.titleLabels:
        chosen_topics = [
            list(self.titleLabels)[idx] for idx in self.selectedFields
        ]

    # Word lists of the chosen topics only.
    topic_words = {
        topic: words
        for topic, words in defaultDict.items()
        if topic in chosen_topics
    }

    # One labelled selection per topic having at least one usable word.
    labelled_selections = []
    if self.inputSeg is not None:
        for topic, words in topic_words.items():
            usable_words = [word for word in words if word]
            if not usable_words:
                continue
            selection = Segmenter.select(
                self.inputSeg,
                self.listToRegex(usable_words),
                label=topic,
            )
            labelled_selections.append(selection[0])

    # Annotation key under which the topic labels are imported.
    labelNameVar = "Topic" if self.labelName == "" else self.labelName

    # Output: a copy of the input merged with the labelled selections.
    unlabelled_copy = Segmenter.bypass(self.inputSeg, label="__None__")
    self.outputSeg = Segmenter.concatenate(
        [unlabelled_copy] + labelled_selections,
        merge_duplicates=True,
        label=self.captionTitle,
        import_labels_as=labelNameVar,
    )
def test_concatenate_copy_annotations(self):
    """Verify that concatenate carries annotations over to its output.

    Checks the 'b' annotation of the segment at index 3 and the 'a'
    annotation of the segment at index 0 against '2' and '1'.
    """
    sources = [self.letter_seg2, self.letter_seg1]
    result = Segmenter.concatenate(sources, copy_annotations=True)
    copied_values = [
        result[3].annotations['b'],
        result[0].annotations['a'],
    ]
    expected_values = ['2', '1']
    self.assertEqual(
        copied_values,
        expected_values,
        msg="concatenate doesn't copy annotations!",
    )
def sendData(self):
    """Download the reviews of the movies in the basket and send them.

    For every item of ``self.myBasket`` the reviews and the movie record
    are fetched through ``self.ia``. One ``Input`` is created per review
    and each resulting segment is annotated with the "title" and "year"
    of the movie the review belongs to. The segmentation is sent on the
    'Segmentation' channel.

    Returns early (without sending) when the basket is empty or when the
    download fails.
    """
    # Skip if title list is empty:
    if self.myBasket == list():
        self.infoBox.setText(
            "Your corpus is empty, please add some movies first",
            "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()
    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    # Connect to imdb and collect reviews and movie records in lockstep.
    list_review = list()
    list_annotation = list()
    try:
        for item in self.myBasket:
            movie = self.ia.get_movie_reviews(item['id'])
            movie_annotations = self.ia.get_movie(item['id'])
            list_review.append(movie)
            list_annotation.append(movie_annotations)
            # 1 tick on the progress bar of the widget
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:
        # Set Info box and widget to "error" state; the progress bar is
        # finished too so the widget is not left in a processing state.
        self.infoBox.setText("Couldn't download data from imdb", "error")
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store review strings in Input objects and, for each review, the
    # annotations of its movie. Pairing them here keeps annotations and
    # segments aligned whatever the number of reviews per movie (the
    # previous code assumed exactly 25 reviews for every movie).
    annotations = list()
    for movie, item in zip(list_review, list_annotation):
        reviews_data = (movie.get('data') or {}).get('reviews') or []
        # NOTE(review): "title" holds the movie record itself, as in the
        # original code -- confirm whether item["title"] was intended.
        movie_annotation = {"title": item, "year": item["year"]}
        for review in reviews_data:
            self.createdInputs.append(Input(review.get('content')))
            annotations.append(movie_annotation)

    # If there's only one item, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = f"{len(self.segmentation)} segment@p sent to output"
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += " (%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send('Segmentation', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Fetch texts from the selected service and send them to output.

    Depending on ``self.service`` the inputs come from ``get_tweets``,
    ``get_wiki_article`` or ``get_bing_entries``. Returns ``False`` (after
    sending ``None`` on u'Text data') when the Twitter request fails or
    when no input was created; otherwise sends the concatenated inputs.
    """
    # Clear created Inputs
    self.clearCreatedInputs()

    service = self.service
    if service == u'Twitter':
        access_pair = (
            self.twitterLicenseKeysAccessToken,
            self.twitterLicenseKeysAccessTokenSecret,
        )
        license_keys = (
            self.twitterLicenseKeysConsumerKey,
            self.twitterLicenseKeysConsumerSecret,
            access_pair,
        )
        try:
            self.createdInputs = self.get_tweets(
                self.word_to_search,
                self.nb_tweet,
                self.include_RT,
                self.useTwitterLicenseKey,
                license_keys,
            )
        except (HTTP401Authentication, HTTP400BadRequest):
            self.infoBox.setText(
                u'Please enter valid Twitter api keys.',
                u'error',
            )
            self.send(u'Text data', None, self)
            return False
        except SearchEngineLimitError:
            self.infoBox.setText(
                u'Twitter search limit has been exceeded.',
                u'error',
            )
            self.send(u'Text data', None, self)
            return False
    elif service == u'Wikipedia':
        self.createdInputs = self.get_wiki_article(
            self.word_to_search,
            self.wiki_section,
            self.wiki_type_of_text,
        )
    elif service == u'Bing':
        self.createdInputs = self.get_bing_entries(
            self.word_to_search,
            self.nb_bing_entry,
        )

    # Nothing retrieved: warn and clear the output channel.
    if len(self.createdInputs) == 0:
        self.infoBox.setText(
            u'Please try to change query or settings.',
            u'warning',
        )
        self.send(u'Text data', None, self)
        return False

    # Initialize progress bar
    tick_count = 50
    progressBar = OWGUI.ProgressBar(self, iterations=tick_count)

    output_segmentation = Segmenter.concatenate(
        self.createdInputs,
        self.captionTitle,
        import_labels_as=None,
    )

    # Report data size (segments and characters).
    segment_count = len(output_segmentation)
    message = pluralize(
        u'%i segment@p sent to output ' % segment_count,
        segment_count,
    )
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in output_segmentation
    )
    message = pluralize(message + u'(%i character@p).' % numChars, numChars)
    self.infoBox.setText(message)

    # The bar is advanced in one go once the work is done.
    for _ in xrange(tick_count):
        progressBar.advance()

    # Clear progress bar.
    progressBar.finish()

    self.send('Text data', output_segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Download the selected plays and send them to output.

    Does nothing when ``self.titleLabels`` is empty. Sends ``None`` on the
    "XML-TEI data" channel when no title is selected or when a download
    fails. Otherwise one ``Input`` is created per downloaded XML document,
    each resulting segment is annotated with a copy of the annotations of
    the corresponding title, and the segmentation is sent.
    """
    # Skip if title list is empty:
    if self.titleLabels == list():
        return

    # Check that something has been selected...
    if len(self.selectedTitles) == 0:
        self.infoBox.setText("Please select one or more titles.", "warning")
        self.send("XML-TEI data", None, self)
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    # Initialize progress bar.
    progressBar = gui.ProgressBar(self, iterations=len(self.selectedTitles))

    # Attempt to connect to ECP and retrieve plays...
    xml_contents = list()
    annotations = list()
    try:
        for title in self.selectedTitles:
            title_annotations = self.filteredTitleSeg[title].annotations
            doc_url = self.document_base_url + title_annotations["url"]
            # Turn ".../name.shtml" into ".../name/name.xml".
            url = re.sub(r"/([^/]+)\.shtml", r"/\1/\1.xml", doc_url)
            # The context manager closes the connection even on error.
            with urllib.request.urlopen(url) as response:
                xml_contents.append(response.read().decode('utf-8'))
            annotations.append(title_annotations.copy())
            progressBar.advance()   # 1 tick on the progress bar...

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from ECP website.",
                             "error")
        # Do not leave the progress bar running on failure.
        progressBar.finish()
        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return

    # Store downloaded XML in input objects...
    for xml_content in xml_contents:
        self.createdInputs.append(Input(xml_content, self.captionTitle))

    # If there's only one play, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Store imported URLs as setting (only the first selected title's URL).
    self.importedURLs = [
        self.filteredTitleSeg[self.selectedTitles[0]].annotations["url"]
    ]

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    # Clear progress bar (once; it was previously finished twice).
    progressBar.finish()

    # Send token...
    self.send("XML-TEI data", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Create one Input per file content and send the segmentation.

    Sends ``None`` on 'Text data' (with a warning) when no input folder is
    selected, or when auto-numbering is requested without an annotation
    key. Otherwise each entry of ``self.fileContents`` becomes an ``Input``
    whose first segment receives the annotations built from the files
    listed in ``self.folders``; a single input is sent as is, several are
    concatenated.
    """
    # Check that there's something on input...
    if self.displayAdvancedSettings:
        input_missing = not self.folders
    else:
        input_missing = not self.rootFolderPath
    if input_missing:
        self.infoBox.setText(u'Please select input folder.', 'warning')
        self.send('Text data', None, self)
        return

    # Check that autoNumberKey is not empty (if necessary)...
    # NOTE(review): the key is only validated here; it is not applied to
    # the output (concatenate is called with auto_number_as=None).
    if (
        self.displayAdvancedSettings
        and self.autoNumber
        and not self.autoNumberKey
    ):
        self.infoBox.setText(
            u'Please enter an annotation key for auto-numbering.',
            'warning'
        )
        self.send('Text data', None, self)
        return

    # Clear created Inputs...
    self.clearCreatedInputs()

    # One progress step per folder in advanced mode, a single one otherwise.
    # NOTE(review): the bar is created and finished but never advanced.
    step_count = len(self.folders) if self.displayAdvancedSettings else 1
    progressBar = gui.ProgressBar(self, iterations=step_count)

    fileContents = self.fileContents

    # Annotation key (widget setting) -> field of the file record.
    key_sources = (
        (self.importFileNameKey, 'fileName'),
        (self.importFolderNameKey, 'folderName'),
        (self.FolderDepth1Key, 'depth1'),
        (self.FolderDepth2Key, 'depth2'),
        (self.FolderDepthLvl, 'depthLvl'),
    )

    # Annotations: one dict per file, over all folders.
    # NOTE(review): self.folders is used here in both modes, as in the
    # original code.
    annotations = list()
    for myFolder in self.folders:
        for myFile in myFolder['fileList']:
            annotations.append({
                key: myFile[field] for key, field in key_sources if key
            })

    # Create an LTTL.Input for each file content...
    single_file = len(fileContents) == 1
    label = self.captionTitle if single_file else None
    for index, content in enumerate(fileContents):
        myInput = Input(content, label)
        segment = myInput[0]
        segment.annotations.update(annotations[index])
        myInput[0] = segment
        self.createdInputs.append(myInput)

    # If there's only one file, the widget's output is the created Input.
    if single_file:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            segmentations=self.createdInputs,
            label=self.captionTitle,
            copy_annotations=True,
            import_labels_as=None,
            sort=False,
            auto_number_as=None,
            merge_duplicates=False,
            progress_callback=None,
        )

    # Report data size (segments and characters).
    segment_count = len(self.segmentation)
    message = pluralize(
        u'%i segment@p sent to output ' % segment_count,
        segment_count,
    )
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message = pluralize(message + u'(%i character@p).' % numChars, numChars)
    self.infoBox.setText(message)
    progressBar.finish()

    self.send('Text data', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Retrieve the texts in the basket from Gutenberg and send them.

    For every item of ``self.myBasket`` the Gutenberg book id is looked up
    in the cache, the text is fetched and its headers are stripped. One
    ``Input`` is created per text and each resulting segment receives a
    "title" annotation taken from the basket item. The segmentation is
    sent on the "Gutenberg importation" channel.

    Returns early (without sending) when the basket is empty or when
    retrieval fails.
    """
    # Skip if title list is empty:
    if self.myBasket == list():
        self.infoBox.setText(
            "Your corpus is empty, please add some books first",
            "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()
    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(
        self,
        iterations=len(self.myBasket),
    )

    text_content = list()
    annotations = list()

    # get the Gutenberg cache
    cache = GutenbergCache.get_cache()
    try:
        for text in self.myBasket:
            # Get the id of the text.
            # NOTE(review): the id is interpolated into the SQL string;
            # confirm text[2] always comes from the cache itself and
            # never from free user input.
            query_id = cache.native_query(
                sql_query=
                "select gutenbergbookid from books where id == {selected_id}"
                .format(selected_id=text[2]))
            gutenberg_id = list(query_id)

            # Get the text with Gutenbergpy
            gutenberg_text = gutenbergpy.textget.strip_headers(
                gutenbergpy.textget.get_text_by_id(gutenberg_id[0][0]))
            text_content.append(gutenberg_text)
            annotations.append(text[1])
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:
        # Set Info box and widget to "error" state; the progress bar is
        # finished too so the widget is not left in a processing state.
        self.infoBox.setText("Couldn't download data from Gutenberg",
                             "error")
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store downloaded texts in input objects...
    for text in text_content:
        self.createdInputs.append(Input(text, self.captionTitle))

    # If there's only one text, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update({"title": annotations[idx]})
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Gutenberg importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    Each corpus in ``self.importedCorpora`` is opened as a zip archive,
    from the cache folder next to this module when possible, otherwise by
    downloading it (and trying to cache it). Every archived file becomes
    an ``Input``; the resulting file segmentation is annotated and sent on
    the "Files" channel. Depending on ``self.outputUtterances`` and
    ``self.outputWords``, utterance and word segmentations are built and
    sent on "Utterances" and "Words"; ``None`` is sent on the channels
    that are not requested.

    Returns early, after sending ``None`` on "Files" and "Utterances",
    when no corpus is selected or when a download fails.
    """
    if not self.importedCorpora:
        self.infoBox.setText("Please add a corpus to the selection.",
                             "warning")
        self.send("Files", None, self)
        self.send("Utterances", None, self)
        return

    # Clear created Inputs and initialize progress bar...
    self.clearCreatedInputs()
    # Steps: retrieval (1), utterances (+1 if requested), words (+2 if
    # requested: segmentation, then annotation extraction).
    numberOfSteps = 2 if self.outputUtterances else 1
    numberOfSteps += 2 if self.outputWords else 0
    self.infoBox.setText(
        "(1/%i) Retrieving data, please wait..." % numberOfSteps,
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.importedCorpora))

    # One annotation dict per created Input, in creation order.
    annotations = list()

    # Iterate over corpora...
    for importedCorpus in self.importedCorpora:

        # Last URL component, used in the error message below.
        corpus = importedCorpus.split("/")[-1]

        # Try to retrieve corpus from cache...
        try:
            basepath = os.path.dirname(
                os.path.abspath(inspect.getfile(inspect.currentframe())))
            corpusFilepath = os.path.normpath(
                os.path.join(
                    basepath,
                    self.__class__.cachedFoldername,
                    importedCorpus[len(self.__class__.baseUrl):],
                ))
            myZip = zipfile.ZipFile(corpusFilepath)

        except IOError:

            # Else try to download (and cache) requested zip file...
            try:
                response = requests.get(importedCorpus)
                myZip = zipfile.ZipFile(io.BytesIO(response.content))
                corpusFolderpath = os.path.dirname(corpusFilepath)
                # Caching is best effort: failures to create the folder
                # or to write the file are ignored.
                try:
                    os.makedirs(corpusFolderpath)
                except OSError:
                    pass
                try:
                    outputFile = open(corpusFilepath, "wb")
                    outputFile.write(response.content)
                    outputFile.close()
                except IOError:
                    pass

            # If an error occurs (e.g. connection error)...
            # NOTE(review): bare except also catches KeyboardInterrupt
            # and SystemExit -- consider narrowing to Exception.
            except:

                # Set Info box and widget to "error" state.
                self.infoBox.setText(
                    "Couldn't download corpus %s from CHILDES website."
                    % corpus, "error")

                # Reset output channel.
                self.send("Files", None, self)
                self.send("Utterances", None, self)
                progressBar.finish()
                self.controlArea.setDisabled(False)
                return

        # Create Input for each zipped file and store annotations...
        # NOTE(review): myZip is never closed explicitly.
        for file in myZip.infolist():
            file_content = myZip.read(file).decode('utf-8')

            # If word segmentation is requested...
            if self.outputWords:
                # Implement replacements.
                file_content = re.sub(
                    r"<w.+?(<replacement.+</replacement>).*?</w>",
                    r"\1",
                    file_content,
                )
                # Prepend pre-clitics.
                file_content, n = re.subn(
                    r"(<mor .+?)(<mor-pre>.+</mor-pre>)",
                    r"\2\1",
                    file_content,
                )
                # Move <gra> into <mw>.
                file_content, n = re.subn(
                    r"(</mw>)(<gra.+?/>)",
                    r"\2\1",
                    file_content,
                )

            newInput = Input(file_content, self.captionTitle + "_files")
            self.createdInputs.append(newInput)

            # File-level annotations: path inside the archive plus the
            # Corpus/Lang/PID annotations of the CHAT element, if present.
            chatSeg = Segmenter.import_xml(newInput, "CHAT")
            annotations.append(dict())
            annotations[-1]["file_path"] = file.filename
            for key in ["Corpus", "Lang", "PID"]:
                try:
                    annotations[-1][key.lower()] = \
                        chatSeg[0].annotations[key]
                except KeyError:
                    pass

            # Rewrite "/>" as "> </participant>" inside the Participants
            # element before importing the participant elements.
            participantListSeg = Segmenter.import_xml(
                newInput, "Participants")
            recodedInput, _ = Segmenter.recode(
                participantListSeg,
                [(re.compile("/>"), "> </participant>")])
            participantSeg = Segmenter.import_xml(recodedInput,
                                                  "participant")

            # Collect data about participants whose role is Target_Child.
            targetChildData = list()
            for participant in participantSeg:
                if participant.annotations["role"] != "Target_Child":
                    continue
                targetChildData.append(dict())
                if "age" in participant.annotations:
                    targetChildData[-1]["target_child_age"] = \
                        participant.annotations["age"]
                    # Parse ages containing "<years>Y<months>M<days>D".
                    age_parse = re.search(
                        r"(\d+)Y(\d+)M(\d+)D",
                        participant.annotations["age"],
                    )
                    if age_parse:
                        targetChildData[-1]["target_child_years"] = \
                            age_parse.group(1)
                        # Cumulative age in months (12 per year)...
                        months = int(age_parse.group(2)) \
                            + 12 * int(age_parse.group(1))
                        targetChildData[-1]["target_child_months"] = \
                            '%02d' % months
                        # ...and in days (30 per month).
                        days = int(age_parse.group(3)) \
                            + 30 * months
                        targetChildData[-1]["target_child_days"] = \
                            '%02d' % days
                if "id" in participant.annotations:
                    targetChildData[-1]["target_child_id"] = \
                        participant.annotations["id"]
                if "sex" in participant.annotations:
                    targetChildData[-1]["target_child_sex"] = \
                        participant.annotations["sex"]

            # Target child data is added only when there is exactly one.
            if len(targetChildData) == 1:
                annotations[-1].update(targetChildData[0])

        # One tick per corpus.
        progressBar.advance()

    # If there's only one file, the widget's output is the created Input...
    if len(self.createdInputs) == 1:
        self.fileSegmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.fileSegmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle + "_files",
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.fileSegmentation):
        segment.annotations.update(annotations[idx])
        self.fileSegmentation[idx] = segment

    # Terminate progress bar...
    progressBar.finish()

    # The status message is extended as each output is built.
    message = "%i file@p" % len(self.fileSegmentation)
    message = pluralize(message, len(self.fileSegmentation))
    self.send("Files", self.fileSegmentation, self)

    # Build utterance segmentation if needed...
    if self.outputUtterances:
        self.infoBox.setText(
            "(2/%i) Building utterance segmentation, please wait..." \
                % numberOfSteps,
            "warning",
        )
        progressBar = ProgressBar(self,
                                  iterations=len(self.fileSegmentation))
        self.utteranceSegmentation = Segmenter.import_xml(
            self.fileSegmentation,
            "u",
            progress_callback=progressBar.advance,
            label=self.captionTitle + "_utterances",
        )
        progressBar.finish()
        message += " and " if not self.outputWords else ", "
        message += "%i utterance@p" % len(self.utteranceSegmentation)
        message = pluralize(message, len(self.utteranceSegmentation))
        self.send("Utterances", self.utteranceSegmentation, self)
    else:
        self.send("Utterances", None, self)

    # Build word segmentation if needed...
    if self.outputWords:
        self.infoBox.setText(
            "(%i/%i) Building word segmentation, please wait..." \
                % (2 + (1 if self.outputUtterances else 0), numberOfSteps),
            "warning",
        )
        # Use the utterance segmentation when the attribute exists,
        # else fall back to the file segmentation.
        # NOTE(review): the attribute may be left over from an earlier
        # run even when self.outputUtterances is now False -- confirm it
        # is reset elsewhere. The bare except hides any other error.
        try:
            baseSegmentation = self.utteranceSegmentation
        except:
            baseSegmentation = self.fileSegmentation
        # Two import passes ("w" and "mw") over the base segmentation.
        progressBar = ProgressBar(self,
                                  iterations=2 * len(baseSegmentation))
        wordSegmentation = Segmenter.import_xml(
            baseSegmentation,
            "w",
            progress_callback=progressBar.advance,
        )
        mwSegmentation = Segmenter.import_xml(
            baseSegmentation,
            "mw",
            progress_callback=progressBar.advance,
        )

        # Analyze words to extract annotations...
        self.infoBox.setText(
            "(%i/%i) Extracting word annotations, please wait..." \
                % (3 + (1 if self.outputUtterances else 0), numberOfSteps),
            "warning",
        )
        progressBar.finish()
        progressBar = ProgressBar(self, iterations=len(wordSegmentation))
        wordSegments = list()
        for word in wordSegmentation:
            mws = word.get_contained_segments(mwSegmentation)
            if mws:
                # One annotated copy of the word per contained <mw>.
                for mw in mws:
                    wordSegment = word.deepcopy()
                    wordSegment.annotations.update(
                        self.extractWordAnnotations(mw))
                    wordSegments.append(wordSegment)
            else:
                # No <mw> inside: keep the word unchanged.
                wordSegments.append(word)
            progressBar.advance()

        self.wordSegmentation = Segmentation(
            wordSegments,
            label=self.captionTitle + "_words",
        )

        message += " and %i word@p" % len(self.wordSegmentation)
        message = pluralize(message, len(self.wordSegmentation))
        self.send("Words", self.wordSegmentation, self)
    else:
        self.send("Words", None, self)

    # Set status to OK and report data size...
    message += " sent to output."
    message = pluralize(message, len(self.fileSegmentation))
    self.infoBox.setText(message)
    # Finishes whichever progress bar was created last.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Create one Input per file of the selected folders and send them.

    Sends ``None`` on 'Text data' (with a warning) when no input folder is
    selected, or when auto-numbering is requested without an annotation
    key. Otherwise every file record of the folders yields an ``Input``
    built from its 'fileContent', whose first segment is annotated with
    the file's name, depth level, path, encoding and ``depth_*`` fields.
    A single input is sent as is; several are concatenated.
    """
    # Check that there's something on input...
    if self.displayAdvancedSettings:
        input_missing = not self.folders
    else:
        input_missing = not self.rootFolderPath
    if input_missing:
        self.infoBox.setText(u'Please select input folder.', 'warning')
        self.send('Text data', None, self)
        return

    # Check that autoNumberKey is not empty (if necessary)...
    # NOTE(review): the key is only validated here; it is not applied to
    # the output (concatenate is called with auto_number_as=None).
    if (
        self.displayAdvancedSettings
        and self.autoNumber
        and not self.autoNumberKey
    ):
        self.infoBox.setText(
            u'Please enter an annotation key for auto-numbering.',
            'warning')
        self.send('Text data', None, self)
        return

    # Clear created Inputs...
    self.clearCreatedInputs()

    if self.displayAdvancedSettings:
        myFolders = self.folders
    else:
        myFolders = [self.folder]

    # Annotations and contents, one entry per file over all folders.
    annotations = list()
    allFileListContent = list()
    for myFolder in myFolders:
        for myFile in myFolder['fileList']:
            annotation = {
                'file name': myFile['fileName'],
                'file depth level': myFile['depthLvl'],
                'file path': myFile['absoluteFilePath'],
            }
            # The concatenation raises TypeError when the encoding is
            # not a string; the annotation is then "unknown".
            try:
                encoding_info = myFile['encoding'] + ", " + str(
                    myFile['encodingConfidence'])
            except TypeError:
                encoding_info = "unknown"
            annotation['file encoding, confidence'] = encoding_info

            # Copy every "depth_*" field of the file record.
            depth_keys = [
                key for key in myFile.keys() if key.startswith('depth_')
            ]
            for depth_key in depth_keys:
                annotation[depth_key] = myFile[depth_key]

            annotations.append(annotation)
            allFileListContent.append(myFile['fileContent'])

    # Create an LTTL.Input for each file...
    single_file = len(allFileListContent) == 1
    label = self.captionTitle if single_file else None
    for index, content in enumerate(allFileListContent):
        myInput = Input(content, label)
        segment = myInput[0]
        segment.annotations.update(annotations[index])
        myInput[0] = segment
        self.createdInputs.append(myInput)

    # If there's only one file, the widget's output is the created Input.
    if single_file:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            segmentations=self.createdInputs,
            label=self.captionTitle,
            copy_annotations=True,
            import_labels_as=None,
            sort=False,
            auto_number_as=None,
            merge_duplicates=False,
            progress_callback=None,
        )

    # Report data size (segments and characters).
    segment_count = len(self.segmentation)
    message = pluralize(
        u'%i segment@p sent to output ' % segment_count,
        segment_count,
    )
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message = pluralize(message + u'(%i character@p).' % numChars, numChars)
    self.infoBox.setText(message)

    self.send('Text data', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Download the selected plays and send them as a segmentation.

    Nothing happens when the title list is empty. When no title is selected
    or when a download fails, a message is displayed and ``None`` is sent on
    the 'Text data' channel. Otherwise one Input is created per downloaded
    play; a single play is sent as is, several plays are concatenated. Each
    output segment is annotated with a copy of the annotations of its title.
    """
    # Skip if title list is empty:
    if self.titleLabels == list():
        return

    # Check that something has been selected...
    if len(self.selectedTitles) == 0:
        self.infoBox.setText(
            u'Please select one or more titles.',
            'warning'
        )
        self.send(u'Text data', None, self)
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    # Initialize progress bar.
    progressBar = OWGUI.ProgressBar(
        self,
        iterations=len(self.selectedTitles)
    )

    # Attempt to connect to Theatre-classique and retrieve plays...
    xml_contents = list()
    annotations = list()
    try:
        for title in self.selectedTitles:
            title_annotations = self.filteredTitleSeg[title].annotations
            response = urllib2.urlopen(
                self.document_base_url + title_annotations[u'url']
            )
            try:
                xml_contents.append(unicode(response.read(), u'utf8'))
            finally:
                # Release the connection even if reading/decoding fails.
                response.close()
            annotations.append(title_annotations.copy())
            progressBar.advance()   # 1 tick on the progress bar...

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:
        # Do not leave the progress bar running after a failure.
        progressBar.finish()

        # Set Info box and widget to 'error' state.
        self.infoBox.setText(
            u"Couldn't download data from theatre-classique website.",
            'error'
        )

        # Reset output channel.
        self.send(u'Text data', None, self)
        return

    # Store downloaded XML in input objects...
    for xml_content in xml_contents:
        self.createdInputs.append(Input(xml_content, self.captionTitle))

    # If there's only one play, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Store imported URLs as setting (only the first selected title's URL
    # is recorded here).
    self.importedURLs = [
        self.filteredTitleSeg[self.selectedTitles[0]].annotations[u'url']
    ]

    # Set status to OK and report data size...
    message = u'%i segment@p sent to output ' % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += u'(%i character@p).' % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()

    # Send token...
    self.send(u'Text data', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
self.sendButton.sendIf() if __name__ == '__main__': import sys, re from PyQt4.QtGui import QApplication import LTTL.Segmenter as Segmenter from LTTL.Input import Input appl = QApplication(sys.argv) ow = OWTextableVariety() seg1 = Input(u'aabccc', 'text1') seg2 = Input(u'abci', 'text2') seg3 = Segmenter.concatenate( [seg1, seg2], import_labels_as='string', label='corpus' ) seg4 = Segmenter.tokenize( seg3, regexes=[(re.compile(r'\w+'), u'tokenize',)], ) seg5 = Segmenter.tokenize( seg4, regexes=[(re.compile(r'[ai]'), u'tokenize',)], label='V' ) seg6 = Segmenter.tokenize( seg4, regexes=[(re.compile(r'[bc]'), u'tokenize',)], label='C'
def sendData(self):
    """Load files, create and send segmentation.

    Each selected file is read (optionally with automatic encoding
    detection), stripped of a leading UTF-8 BOM, NFC-normalized and wrapped
    in an LTTL.Input whose segment receives the configured annotations.
    A single file is sent as is; several files are sent as a concatenation.
    On any error a message is displayed and ``None`` is sent on 'Text data'.
    """
    # Check that there's something on input...
    if (
        (self.displayAdvancedSettings and not self.files)
        or not (self.file or self.displayAdvancedSettings)
    ):
        self.infoBox.setText(u'Please select input file.', 'warning')
        self.send('Text data', None, self)
        return

    # Check that autoNumberKey is not empty (if necessary)...
    if (
        self.displayAdvancedSettings
        and self.autoNumber
        and not self.autoNumberKey
    ):
        self.infoBox.setText(
            u'Please enter an annotation key for auto-numbering.',
            'warning'
        )
        self.send('Text data', None, self)
        return

    # Clear created Inputs...
    self.clearCreatedInputs()

    fileContents = list()
    annotations = list()
    counter = 1

    if self.displayAdvancedSettings:
        myFiles = self.files
    else:
        myFiles = [[self.file, self.encoding, u'', u'']]

    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(myFiles))

    def abort(message):
        """Stop processing: show *message* as an error and reset output."""
        progressBar.finish()
        self.infoBox.setText(message, 'error')
        self.send('Text data', None, self)
        self.controlArea.setDisabled(False)

    # Open and process each file successively...
    for myFile in myFiles:
        filePath = myFile[0]
        encoding = re.sub(r"[ ]\(.+", "", myFile[1])
        annotation_key = myFile[2]
        annotation_value = myFile[3]

        # Try to open the file...
        self.error()
        try:
            if encoding == "(auto-detect)":
                detector = UniversalDetector()
                with open(filePath, 'rb') as fh:
                    for line in fh:
                        detector.feed(line)
                        if detector.done:
                            break
                detector.close()
                encoding = detector.result['encoding']
            # Plain 'r' rather than 'rU': the 'U' flag had no effect in
            # Python 3 text mode and is rejected since Python 3.11.
            with open(filePath, mode='r', encoding=encoding) as fh:
                try:
                    # NOTE(review): a line break falling exactly at the end
                    # of a chunk is dropped by splitlines()/join, so two
                    # lines may be glued at a chunk boundary -- confirm
                    # whether this is intended.
                    fileContent = "".join(
                        '\n'.join(chunk.splitlines())
                        for chunk in iter(
                            lambda: fh.read(CHUNK_LENGTH), ""
                        )
                    )
                except UnicodeError:
                    if len(myFiles) > 1:
                        message = u"Please select another encoding "    \
                                + u"for file %s." % filePath
                    else:
                        message = u"Please select another encoding."
                    abort(message)
                    return
        except IOError:
            if len(myFiles) > 1:
                message = u"Couldn't open file '%s'." % filePath
            else:
                message = u"Couldn't open file."
            abort(message)
            return

        # Remove utf-8 BOM if necessary...
        if encoding == u'utf-8':
            fileContent = fileContent.lstrip(
                codecs.BOM_UTF8.decode('utf-8')
            )

        # Normalize text (canonical decomposition then composition)...
        fileContent = normalize('NFC', fileContent)

        fileContents.append(fileContent)

        # Annotations...
        annotation = dict()
        if self.displayAdvancedSettings:
            if annotation_key and annotation_value:
                annotation[annotation_key] = annotation_value
            if self.importFilenames and self.importFilenamesKey:
                filename = os.path.basename(filePath)
                annotation[self.importFilenamesKey] = filename
            if self.autoNumber and self.autoNumberKey:
                annotation[self.autoNumberKey] = counter
                counter += 1
        annotations.append(annotation)
        progressBar.advance()

    # Create an LTTL.Input for each file...
    single_file = len(fileContents) == 1
    label = self.captionTitle if single_file else None
    for fileContent, annotation in zip(fileContents, annotations):
        myInput = Input(fileContent, label)
        segment = myInput[0]
        segment.annotations.update(annotation)
        myInput[0] = segment
        self.createdInputs.append(myInput)

    # If there's only one file, the widget's output is the created Input.
    if single_file:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            segmentations=self.createdInputs,
            label=self.captionTitle,
            copy_annotations=True,
            import_labels_as=None,
            sort=False,
            auto_number_as=None,
            merge_duplicates=False,
            progress_callback=None,
        )

    message = u'%i segment@p sent to output ' % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += u'(%i character@p).' % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.send('Text data', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
label='V' ) seg4 = Segmenter.tokenize( seg2, [(re.compile(r'[hlwrdc]'), u'tokenize')], label='C' ) seg5 = Segmenter.tokenize( seg2, [(re.compile(r' '), u'tokenize')], label='S' ) seg6 = Segmenter.concatenate( [seg3, seg4, seg5], import_labels_as='category', label='chars', sort=True, merge_duplicates=True, ) seg7 = Segmenter.tokenize( seg6, [(re.compile(r'l'), u'tokenize')], label='pivot' ) ow.inputData(seg2, 1) ow.inputData(seg6, 2) ow.inputData(seg7, 3) ow.show() appl.exec_() ow.saveSettings()
def sendData(self):
    """Send data from website springfieldspringfield.

    For every movie of the basket, the script page is downloaded and its
    ``movie_script`` div is turned into an Input. A single script is sent as
    is, several scripts are concatenated. Each output segment is annotated
    with the "Movie Title" and "Year of release" parsed from the basket
    entry. With an empty basket, a warning is shown and ``None`` is sent;
    when a download fails, an error is shown and nothing is sent.
    """
    # Skip if title list is empty:
    if self.myBasket == list():
        self.infoBox.setText(
            "Your corpus is empty, please add some movies first",
            "warning"
        )
        self.segmentation = None
        self.send("Movie transcripts", self.segmentation, self)
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    annotations = list()
    script_list = list()
    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    # This part of code is what fetches the actual script
    try:
        for movie in self.myBasket:
            # Each movie that is in the corpus is split into title and year
            # (rsplit makes sure to only split last occurence) which will
            # become annotations
            title_and_year = movie.rsplit('(', 1)
            annotations.append({
                "Movie Title": title_and_year[0],
                # Drop the last character (presumably the closing
                # parenthesis -- verify against how the basket is filled).
                "Year of release": title_and_year[-1][:-1],
            })

            # link_end and page_url are the two variables that will have to
            # be changed in case scripts need to be taken from elsewhere
            link_end = self.path_storage[movie]
            page_url = "https://www.springfieldspringfield.co.uk/" +   \
                "movie_script.php?movie=" + link_end

            # The response is closed as soon as the page has been parsed.
            with urllib.request.urlopen(page_url) as page:
                soup = BeautifulSoup(page, 'html.parser')

            # This is what grabs the movie script
            script = soup.find("div", {"class": "movie_script"})
            script_list.append(script.text)

            # 1 tick on the progress bar of the widget
            progressBar.advance()

    except Exception:
        # Do not leave the progress bar running after a failure.
        progressBar.finish()
        self.infoBox.setText(
            "Couldn't download data from SpringfieldSpringfield website.",
            "error"
        )
        self.controlArea.setDisabled(False)
        return

    # Store downloaded script strings in input objects...
    for script in script_list:
        self.createdInputs.append(Input(script, self.captionTitle))

    # If there's only one play, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Movie transcripts", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Fetch URL content, create and send segmentation.

    Each URL is downloaded (``http://`` is prepended when the address does
    not start with "http"), decoded with the selected or detected encoding,
    stripped of a leading UTF-8 BOM, NFC-normalized and wrapped in an
    LTTL.Input whose segment receives the configured annotations. A single
    URL is sent as is; several URLs are sent as a concatenation. On any
    error a message is displayed and ``None`` is sent on 'Text data'.
    """
    # Check that there's something on input...
    if (
        (self.displayAdvancedSettings and not self.URLs)
        or not (self.URL or self.displayAdvancedSettings)
    ):
        self.infoBox.setText(u'Please select source URL.', 'warning')
        self.send('Text data', None, self)
        return

    # Check that autoNumberKey is not empty (if necessary)...
    if (
        self.displayAdvancedSettings
        and self.autoNumber
        and not self.autoNumberKey
    ):
        self.infoBox.setText(
            u'Please enter an annotation key for auto-numbering.',
            'warning'
        )
        self.send('Text data', None, self)
        return

    # Clear created Inputs...
    self.clearCreatedInputs()

    URLContents = list()
    annotations = list()
    counter = 1

    if self.displayAdvancedSettings:
        myURLs = self.URLs
    else:
        myURLs = [[self.URL, self.encoding, u'', u'']]

    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(myURLs))

    def abort(message):
        """Stop processing: show *message* as an error and reset output."""
        progressBar.finish()
        self.infoBox.setText(message, 'error')
        self.send('Text data', None, self)
        self.controlArea.setDisabled(False)

    # Process each URL successively...
    for myURL in myURLs:
        URL = myURL[0]
        if not URL.startswith("http"):
            URL = "http://" + URL
        encoding = re.sub(r"[ ]\(.+", "", myURL[1])
        annotation_key = myURL[2]
        annotation_value = myURL[3]

        # Try to fetch URL content...
        self.error()
        URLContent = ""
        try:
            # The handle is closed even when read() raises.
            with urlopen(URL) as URLHandle:
                URLContent = URLHandle.read()
        except http.client.IncompleteRead as e:
            # Keep whatever could be downloaded.
            URLContent = e.partial
        except IOError:
            if len(myURLs) > 1:
                message = u"Couldn't retrieve %s." % URL
            else:
                message = u"Couldn't retrieve URL."
            abort(message)
            return

        try:
            if encoding == "(auto-detect)":
                encoding = chardet.detect(URLContent)['encoding']
            URLContent = URLContent.decode(encoding)
        # TypeError: the detected encoding is None (detection failed);
        # LookupError: the encoding name is unknown to Python.
        except (UnicodeError, LookupError, TypeError):
            if len(myURLs) > 1:
                message = u"Please select another encoding "    \
                        + u"for URL %s." % URL
            else:
                message = u"Please select another encoding."
            abort(message)
            return

        # Replace newlines with '\n'...
        URLContent = '\n'.join(URLContent.splitlines())

        # Remove utf-8 BOM if necessary...
        if encoding == u'utf-8':
            URLContent = URLContent.lstrip(codecs.BOM_UTF8.decode('utf-8'))

        # Normalize text (canonical decomposition then composition)...
        URLContent = normalize('NFC', URLContent)

        URLContents.append(URLContent)

        # Annotations...
        annotation = dict()
        if self.displayAdvancedSettings:
            if annotation_key and annotation_value:
                annotation[annotation_key] = annotation_value
            if self.importURLs and self.importURLsKey:
                annotation[self.importURLsKey] = URL
            if self.autoNumber and self.autoNumberKey:
                annotation[self.autoNumberKey] = counter
                counter += 1
        annotations.append(annotation)
        progressBar.advance()

    # Create an LTTL.Input for each URL...
    single_url = len(URLContents) == 1
    label = self.captionTitle if single_url else None
    for URLContent, annotation in zip(URLContents, annotations):
        myInput = Input(URLContent, label)
        segment = myInput[0]
        segment.annotations.update(annotation)
        myInput[0] = segment
        self.createdInputs.append(myInput)

    # If there's only one URL, the widget's output is the created Input.
    if single_url:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            segmentations=self.createdInputs,
            label=self.captionTitle,
            copy_annotations=True,
            import_labels_as=None,
            sort=False,
            auto_number_as=None,
            merge_duplicates=False,
            progress_callback=None,
        )

    message = u'%i segment@p sent to output ' % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += u'(%i character@p).' % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.send('Text data', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
self.contextAnnotationKey = self.contextAnnotationKey def handleNewSignals(self): """Overridden: called after multiple signals have been added""" self.openContext(self.uuid, self.segmentations) self.updateGUI() self.sendButton.sendIf() if __name__ == '__main__': import sys from PyQt4.QtGui import QApplication import LTTL.Segmenter as Segmenter from LTTL.Input import Input appl = QApplication(sys.argv) ow = OWTextableCount() seg1 = Input(u'hello world', label=u'text1') seg2 = Input(u'cruel world', label=u'text2') seg3 = Segmenter.concatenate([seg1, seg2], label=u'corpus') seg4 = Segmenter.tokenize(seg3, [(r'\w+(?u)', u'tokenize', { 'type': 'mot' })], label=u'words') ow.inputData(seg3, 1) ow.inputData(seg4, 2) ow.show() appl.exec_() ow.saveSettings()
def sendData(self):
    """Validate the settings, concatenate the inputs and send the result.

    A warning is displayed and ``None`` is sent on 'Merged data' when there
    is no input, or when label import / auto-numbering is enabled without
    the corresponding annotation key.
    """
    # Nothing to merge without input...
    if not self.texts:
        self.infoBox.setText(u'Widget needs input.', 'warning')
        self.send('Merged data', None, self)
        return

    # TODO: remove message 'No label was provided.' from docs

    # Collect the segmentations stored in self.texts and count segments...
    segmentations = []
    num_segments = 0
    for text in self.texts:
        segmentations.append(text[1])
        num_segments += len(text[1])

    # An annotation key is required to import labels...
    labelKey = None
    if self.importLabels:
        labelKey = self.labelKey
        if not labelKey:
            self.infoBox.setText(
                u'Please enter an annotation key for imported labels.',
                'warning'
            )
            self.send('Merged data', None, self)
            return

    # ...and another one to auto-number segments.
    autoNumberKey = None
    if self.autoNumber:
        autoNumberKey = self.autoNumberKey
        if not autoNumberKey:
            self.infoBox.setText(
                u'Please enter an annotation key for auto-numbering.',
                'warning'
            )
            self.send('Merged data', None, self)
            return

    # Lock the interface and set up the progress bar...
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=num_segments)

    # Merge everything, ticking the progress bar through the callback.
    concatenation = Segmenter.concatenate(
        segmentations,
        label=self.captionTitle,
        copy_annotations=self.copyAnnotations,
        import_labels_as=labelKey,
        sort=True,       # TODO: document
        auto_number_as=autoNumberKey,
        merge_duplicates=self.mergeDuplicates,
        progress_callback=progressBar.advance,
    )
    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Report and send.
    num_output = len(concatenation)
    self.infoBox.setText(
        pluralize(u'%i segment@p sent to output.' % num_output, num_output)
    )

    self.send('Merged data', concatenation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Download the books of the basket and send them as a segmentation.

    Every basket entry is read as a sequence whose items 0, 1 and 3 become
    the "title", "author" and "language" annotations and whose item 2 is the
    Gutenberg id used for the download. A single text is sent as is, several
    texts are concatenated. With an empty basket a warning is shown; when a
    download fails an error is shown; in both cases nothing is sent.
    """
    # Skip if title list is empty:
    if self.myBasket == list():
        self.infoBox.setText(
            "Your corpus is empty, please add some books first",
            "warning"
        )
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(
        self,
        iterations=len(self.myBasket),
    )

    text_content = list()
    annotations = list()

    try:
        # Retrieve selected texts from gutenberg
        for text in self.myBasket:
            gutenberg_id = text[2]

            # Get the text with Gutenbergpy
            gutenberg_text = gutenbergpy.textget.strip_headers(
                gutenbergpy.textget.get_text_by_id(gutenberg_id)
            ).decode("utf-8")

            text_content.append(gutenberg_text)
            # populate the annotation list
            annotations.append([text[0], text[1], text[3]])
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception as exc:
        # Do not leave the progress bar running after a failure.
        progressBar.finish()
        # Set Info box and widget to "error" state.
        self.infoBox.setText(
            "Couldn't download data from Gutenberg",
            "error"
        )
        self.controlArea.setDisabled(False)
        print(exc)
        return

    # Store downloaded text strings in input objects...
    for text in text_content:
        self.createdInputs.append(Input(text, self.captionTitle))

    # If there's only one text, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation.
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments with book metadata
    for idx, segment in enumerate(self.segmentation):
        title, author, language = annotations[idx]
        segment.annotations.update({
            "title": title,
            "author": author,
            "language": language,
        })
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Gutenberg importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Download the lyrics of the basket and send them as a segmentation.

    For every song of the basket, the lyrics are fetched from the page at
    "http://genius.com" + ``song['path']`` through ``self.html_to_text``.
    A single song is sent as is, several songs are concatenated. Each output
    segment is annotated with a copy of its basket entry. With an empty
    basket a warning is shown; when a download fails an error is shown; in
    both cases nothing is sent.
    """
    # Skip if title list is empty:
    if self.myBasket == list():
        self.infoBox.setText(
            "Your corpus is empty, please add some songs first",
            "warning"
        )
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(
        self,
        iterations=len(self.myBasket)
    )

    # Attempt to connect to Genius and retrieve lyrics...
    song_content = list()
    annotations = list()
    try:
        for song in self.myBasket:
            # song is a dict {'idx1':{'title':'song1'...},
            # 'idx2':{'title':'song2'...}}
            page_url = "http://genius.com" + song['path']
            lyrics = self.html_to_text(page_url)
            song_content.append(lyrics)
            annotations.append(song.copy())
            # 1 tick on the progress bar of the widget
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:
        # Do not leave the progress bar running after a failure.
        progressBar.finish()
        # Set Info box and widget to "error" state.
        self.infoBox.setText(
            "Couldn't download data from Genius website.",
            "error"
        )
        self.controlArea.setDisabled(False)
        return

    # Store downloaded lyrics strings in input objects...
    for song in song_content:
        self.createdInputs.append(Input(song, self.captionTitle))

    # If there's only one song, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Lyrics importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Load files (raw text, PDF or image), create and send segmentation.

    The content of each file is extracted according to the type guessed by
    ``filetype``: raw text when the type is not recognized as a known
    format, text extraction or OCR for PDF files, OCR for image files. The
    content is then stripped of a leading UTF-8 BOM, NFC-normalized and
    wrapped in an LTTL.Input whose segment receives the configured
    annotations. A single file is sent as is; several files are sent as a
    concatenation. On any error a message is displayed and ``None`` is sent
    on 'Text data'.
    """
    # Check that there's something on input...
    if (
        (self.displayAdvancedSettings and not self.files)
        or not (self.file or self.displayAdvancedSettings)
    ):
        self.infoBox.setText(u'Please select input file.', 'warning')
        self.send('Text data', None, self)
        return

    # Check that autoNumberKey is not empty (if necessary)...
    if (
        self.displayAdvancedSettings
        and self.autoNumber
        and not self.autoNumberKey
    ):
        self.infoBox.setText(
            u'Please enter an annotation key for auto-numbering.',
            'warning'
        )
        self.send('Text data', None, self)
        return

    # Clear created Inputs...
    self.clearCreatedInputs()

    fileContents = list()
    annotations = list()
    counter = 1

    if self.displayAdvancedSettings:
        myFiles = self.files
    else:
        myFiles = [[self.file, self.encoding, "", "", "", "eng", False]]

    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(myFiles))

    def abort(message):
        """Stop processing: show *message* as an error and reset output."""
        progressBar.finish()
        self.infoBox.setText(message, 'error')
        self.send('Text data', None, self)
        self.controlArea.setDisabled(False)

    # Open and process each file successively...
    for myFile in myFiles:
        filePath = myFile[0]
        encoding = re.sub(r"[ ]\(.+", "", myFile[1])
        annotation_key = myFile[2]
        annotation_value = myFile[3]
        # myFile[4] holds the PDF password, which is not used here.
        ocr_languages = myFile[5]
        ocr_force = myFile[6]

        # Try to open the file...
        self.error()

        # -1 marks "no content": it is reset for every file so that an
        # unsupported file type can neither reuse the previous file's
        # content nor hit an unbound variable.
        fileContent = -1
        try:
            # Guessing the type reads the file, hence inside the try
            # (presumably raises IOError on unreadable paths -- TODO
            # confirm against the filetype package).
            myFiletype = filetype.guess(filePath)

            if myFiletype is None:
                fileContent = self.extract_raw_text(filePath, encoding)

            elif myFiletype.extension == "pdf":
                if (
                    ocr_force is not True
                    and self.is_textual_pdf_file(filePath) is True
                ):
                    fileContent = self.extract_text_from_pdf(filePath)
                else:
                    fileContent = self.get_pdf_content(
                        filePath,
                        ocr_languages,
                    )

            elif myFiletype.extension in IMG_FILETYPES:
                fileContent = self.ocrize(filePath, ocr_languages)

        except IOError as e:
            if "tesseract" in str(e):
                QMessageBox.warning(
                    None,
                    'Textable',
                    str(e),
                    QMessageBox.Ok
                )
            if len(myFiles) > 1:
                message = u"Couldn't open file '%s'." % filePath
            else:
                message = u"Couldn't open file."
            abort(message)
            return

        if fileContent == -1:
            abort(u"Couldn't open file.")
            return

        # Remove utf-8 BOM if necessary...
        if encoding == u'utf-8':
            fileContent = fileContent.lstrip(
                codecs.BOM_UTF8.decode('utf-8')
            )

        # Normalize text (canonical decomposition then composition)...
        fileContent = normalize('NFC', fileContent)

        fileContents.append(fileContent)

        # Annotations...
        annotation = dict()
        if self.displayAdvancedSettings:
            if annotation_key and annotation_value:
                annotation[annotation_key] = annotation_value
            if self.importFilenames and self.importFilenamesKey:
                filename = os.path.basename(filePath)
                annotation[self.importFilenamesKey] = filename
            if self.autoNumber and self.autoNumberKey:
                annotation[self.autoNumberKey] = counter
                counter += 1
        annotations.append(annotation)
        progressBar.advance()

    # Create an LTTL.Input for each file...
    single_file = len(fileContents) == 1
    label = self.captionTitle if single_file else None
    for fileContent, annotation in zip(fileContents, annotations):
        myInput = Input(fileContent, label)
        segment = myInput[0]
        segment.annotations.update(annotation)
        myInput[0] = segment
        self.createdInputs.append(myInput)

    # If there's only one file, the widget's output is the created Input.
    if single_file:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            segmentations=self.createdInputs,
            label=self.captionTitle,
            copy_annotations=True,
            import_labels_as=None,
            sort=False,
            auto_number_as=None,
            merge_duplicates=False,
            progress_callback=None,
        )

    message = u'%i segment@p sent to output ' % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += u'(%i character@p).' % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.send('Text data', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Query the selected service and send the concatenated results.

    Returns ``False`` (after displaying a message and sending ``None`` on
    'Text data') when the Twitter keys are rejected, when the Twitter search
    limit is exceeded, or when the query yields no input; returns ``None``
    once the segmentation has been sent.
    """
    # Start from a clean slate.
    self.clearCreatedInputs()

    service = self.service
    if service == u'Twitter':
        license_keys = (
            self.twitterLicenseKeysConsumerKey,
            self.twitterLicenseKeysConsumerSecret,
            (
                self.twitterLicenseKeysAccessToken,
                self.twitterLicenseKeysAccessTokenSecret,
            ),
        )
        try:
            self.createdInputs = self.get_tweets(
                self.word_to_search,
                self.nb_tweet,
                self.include_RT,
                self.useTwitterLicenseKey,
                license_keys,
            )
        except (HTTP401Authentication, HTTP400BadRequest):
            self.infoBox.setText(
                u'Please enter valid Twitter api keys.',
                u'error',
            )
            self.send(u'Text data', None, self)
            return False
        except SearchEngineLimitError:
            self.infoBox.setText(
                u'Twitter search limit has been exceeded.',
                u'error',
            )
            self.send(u'Text data', None, self)
            return False
    elif service == u'Wikipedia':
        self.createdInputs = self.get_wiki_article(
            self.word_to_search,
            self.wiki_section,
            self.wiki_type_of_text,
        )
    elif service == u'Bing':
        self.createdInputs = self.get_bing_entries(
            self.word_to_search,
            self.nb_bing_entry,
        )

    # Nothing retrieved: ask the user to adjust the query.
    if len(self.createdInputs) == 0:
        self.infoBox.setText(
            u'Please try to change query or settings.',
            u'warning',
        )
        self.send(u'Text data', None, self)
        return False

    # Set up a 50-step progress bar.
    num_ticks = 50
    progressBar = OWGUI.ProgressBar(self, iterations=num_ticks)

    output_segmentation = Segmenter.concatenate(
        self.createdInputs,
        self.captionTitle,
        import_labels_as=None,
    )

    # Report the number of segments and characters.
    num_segments = len(output_segmentation)
    message = pluralize(
        u'%i segment@p sent to output ' % num_segments,
        num_segments,
    )
    num_chars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in output_segmentation
    )
    message = pluralize(
        message + u'(%i character@p).' % num_chars,
        num_chars,
    )
    self.infoBox.setText(message)

    # Run the bar through all of its steps, then clear it.
    for _ in xrange(num_ticks):
        progressBar.advance()
    progressBar.finish()

    self.send('Text data', output_segmentation, self)
    self.sendButton.resetSettingsChangedFlag()