def sendData(self):
    """Tag the input segmentation with TreeTagger and send the result.

    ``None`` is sent on the "Tagged data" channel (with a message in the
    info box) when the TreeTagger path is not set, when no language is
    available, when there is no input, or when the tagged text cannot be
    re-imported as XML. Otherwise the input segments are concatenated,
    tagged in a single TreeTagger call and sent either with ``<w>`` XML
    tags added or re-segmented on those tags, depending on
    ``self.outputFormat``.
    """
    # Clear created Inputs...
    self.clearCreatedInputs()

    # Nothing can be done without a TreeTagger path, a language and an input.
    if not self.TreetaggerPath:
        self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
        self.send("Tagged data", None)
        return
    if not self.getAvailableLanguages():
        self.infoBox.setText(self.noLanguageParameterWarning, "warning")
        self.send("Tagged data", None)
        return
    if not self.segmentation:
        self.infoBox.setText(u"Widget needs input", "warning")
        self.send("Tagged data", None)
        return

    # Initialize progress bar.
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    self.progressBar = ProgressBar(self, iterations=5)

    # Whatever happens while processing (tagger failure, unexpected input...),
    # the progress bar must be closed and the controls re-enabled; otherwise
    # the widget would stay locked after an error.
    try:
        # Create a copy of input seg, storing annotations in temp attr...
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.segmentation.label
        for segment in self.segmentation:
            # Serialize annotations as XML attributes. Keys are NFD-normalized
            # and stripped of characters of category Mn; values are quoted.
            attr = " ".join(
                "%s=%s" % (
                    "".join(
                        char
                        for char in unicodedata.normalize("NFD", key)
                        if unicodedata.category(char) != "Mn"
                    ),
                    quoteattr(str(value)),
                )
                for key, value in segment.annotations.items()
            )
            # NOTE(review): the temporary annotation is written on the segment
            # obtained from the input segmentation itself -- confirm that this
            # does not leak "tt_ax" into upstream data.
            segment.annotations["tt_ax"] = attr
            copy_of_input_seg.append(segment)
        self.progressBar.advance()

        # Dump segmentation in unique string to avoid multiple calls to TT...
        concatenated_text = copy_of_input_seg.to_string(
            formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
            display_all=True,
        )
        self.progressBar.advance()

        # Tag the segmentation contents...
        tagopt = '-token -lemma -sgml -quiet'
        if self.replaceUnknown:
            tagopt += " -no-unknown"
        tagger = treetaggerwrapper.TreeTagger(
            TAGLANG=pycountry.languages.get(name=self.language).alpha_2,
            TAGOPT=tagopt,
            TAGDIR=self.TreetaggerPath,
        )
        tagged_lines = tagger.tag_text(
            concatenated_text,
            notagurl=True,
            notagemail=True,
            notagip=True,
            notagdns=True,
        )
        tagged_input = Input("\n".join(tagged_lines))
        self.createdInputs.append(tagged_input)

        # Replace <unknown> with [unknown], then re-segment to match the
        # original segmentation structure.
        tagged_segmentation, _ = Segmenter.recode(
            tagged_input,
            substitutions=[
                (re.compile(r"<unknown>"), "[unknown]"),
                # NOTE(review): as written, pattern and replacement are the
                # same three-quote string -- confirm the intended replacement.
                (re.compile(r'"""'), '"""'),
            ],
        )
        tagged_segmentation = Segmenter.import_xml(
            tagged_segmentation, "ax_tt")
        self.progressBar.advance()

        # Place each output line of Treetagger in an xml tag with
        # annotations, and trim a leading/trailing newline.
        xml_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                 '<w lemma="&3" pos-tag="&2">&1</w>'),
                (re.compile(r'^\n|\n$'), ''),
            ],
        )

        # Segment into individual tokens if XML output option is disabled...
        if self.outputFormat == "add XML tags":
            output_segmentation = xml_segmentation
        else:
            try:
                output_segmentation = Segmenter.import_xml(
                    xml_segmentation, "w")
            except ValueError:
                self.infoBox.setText(
                    "Please check that either the input contains well-formed "
                    "XML, or it doesn't contain instances of '<' and '\x3e'",
                    "error")
                self.send("Tagged data", None)
                return
    finally:
        self.progressBar.finish()
        self.controlArea.setDisabled(False)

    # Report data size and send the result.
    output_segmentation.label = self.captionTitle
    message = u'%i segment@p sent to output.' % len(output_segmentation)
    message = pluralize(message, len(output_segmentation))
    self.infoBox.setText(message)
    self.send('Tagged data', output_segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
class TestSegmentation(unittest.TestCase):
    """Unit tests for the LTTL Segmentation class"""

    def setUp(self):
        """Create the segmentations and expected output shared by the tests"""
        self.entire_text_seg = Input('ab cde')
        self.str_index = self.entire_text_seg[0].str_index
        index = self.str_index

        # Two words of the input, the first one carrying annotations.
        annotated_word = Segment(
            str_index=index,
            start=0,
            end=2,
            annotations={'a': '1', 'bc': '20'},
        )
        plain_word = Segment(str_index=index, start=3, end=6)
        self.word_seg = Segmentation([annotated_word, plain_word])

        # Two segments whose ranges (3-5 and 4-6) overlap.
        overlapping = [
            Segment(str_index=index, start=start, end=end)
            for start, end in ((3, 5), (4, 6))
        ]
        self.overlapping_seg = Segmentation(overlapping)

        # Expected default rendering of word_seg.
        default_template = (
            'segment number 1\n'
            '\tcontent:\t"ab"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t0\n'
            '\tend:\t2\n'
            '\tannotations:\n'
            '\t\ta 1\n'
            '\t\tbc 20\n'
            'segment number 2\n'
            '\tcontent:\t"cde"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t3\n'
            '\tend:\t6'
        )
        self.base_output_string = default_template % (index, index)

        # Number of progress callbacks received so far.
        self.count = 0

    def tearDown(self):
        """Nothing to clean up after a test"""

    def test_creator(self):
        """Is the object built by the creator a Segmentation?"""
        created = Segmentation()
        self.assertIsInstance(
            created,
            Segmentation,
            msg="creator doesn't return Segmentation object!",
        )

    def test_to_string_default_format(self):
        """Is the default to_string() rendering the expected one?"""
        rendered = self.word_seg.to_string()
        self.assertEqual(
            rendered,
            self.base_output_string,
            msg="to_string() doesn't format segmentation correctly by default!",
        )

    def test_to_string_header(self):
        """Is the header placed before the rendered segments?"""
        expected = 'HEADER' + self.base_output_string
        rendered = self.word_seg.to_string(header='HEADER')
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't format header correctly!",
        )

    def test_to_string_footer(self):
        """Is the footer placed after the rendered segments?"""
        expected = self.base_output_string + 'FOOTER'
        rendered = self.word_seg.to_string(footer='FOOTER')
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't format footer correctly!",
        )

    def test_to_string_humanize_addresses(self):
        """Are str_index and start shifted by one when humanized?"""
        rendered = self.word_seg.to_string(humanize_addresses=True)
        replacements = (
            ('t:\t3', 't:\t4'),
            ('t:\t0', 't:\t1'),
            ('x:\t%i' % self.str_index, 'x:\t%i' % (self.str_index + 1)),
        )
        expected = self.base_output_string
        for raw, humanized in replacements:
            expected = expected.replace(raw, humanized)
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't humanize addresses!",
        )

    def test_to_string_interpolate_builtin_variables(self):
        """Are the builtin variables interpolated in the format?"""
        template = (
            '%(__num__)s,%(__content__)s,'
            '%(__str_index__)s,%(__start__)s,%(__end__)s,'
            '%(__str_index_raw__)s,%(__start_raw__)s,%(__end_raw__)s'
        )
        expected = '1,ab,%i,0,2,%i,0,2\n2,cde,%i,3,6,%i,3,6' % (
            (self.str_index,) * 4
        )
        rendered = self.word_seg.to_string(formatting=template)
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't interpolate builtin variables!",
        )

    def test_to_string_interpolate_annotations(self):
        """Are annotation values interpolated in the format?"""
        rendered = self.word_seg.to_string(formatting='%(a)s')
        self.assertEqual(
            rendered,
            '1\n__none__',
            msg="to_string() doesn't interpolate annotations!",
        )

    def test_to_string_progress(self):
        """Is the progress callback called once per segment?"""

        def count_call():
            """Record one call of the progress callback"""
            self.count += 1

        self.word_seg.to_string(progress_callback=count_call)
        expected_calls = len(self.word_seg)
        self.assertEqual(
            self.count,
            expected_calls,
            msg="to_string doesn't track progress!",
        )

    def test_get_annotation_keys(self):
        """Are the existing annotation keys returned?"""
        expected_keys = sorted(['a', 'bc'])
        found_keys = sorted(self.word_seg.get_annotation_keys())
        self.assertEqual(
            found_keys,
            expected_keys,
            msg="get_annotation_keys() doesn't return existing annotations!",
        )

    def test_is_non_overlapping(self):
        """Is a segmentation without overlap reported as non-overlapping?"""
        verdict = self.word_seg.is_non_overlapping()
        self.assertTrue(
            verdict,
            msg="is_non_overlapping() doesn't recognize absence of overlap!",
        )

    def test_is_overlapping(self):
        """Is a segmentation with overlap reported as overlapping?"""
        verdict = self.overlapping_seg.is_non_overlapping()
        self.assertFalse(
            verdict,
            msg="is_non_overlapping() doesn't recognize presence of overlap!",
        )
class TestSegmentation(unittest.TestCase):
    """Unit tests for the LTTL Segmentation class"""

    def setUp(self):
        """Create the segmentations and expected output shared by the tests"""
        self.entire_text_seg = Input('ab cde')
        self.str_index = self.entire_text_seg[0].str_index
        index = self.str_index

        # Two words of the input, the first one carrying annotations.
        word_annotations = {'a': '1', 'bc': '20'}
        words = [
            Segment(
                str_index=index,
                start=0,
                end=2,
                annotations=word_annotations,
            ),
            Segment(str_index=index, start=3, end=6),
        ]
        self.word_seg = Segmentation(words)

        # Two segments whose ranges (3-5 and 4-6) overlap.
        first_overlap = Segment(str_index=index, start=3, end=5)
        second_overlap = Segment(str_index=index, start=4, end=6)
        self.overlapping_seg = Segmentation([first_overlap, second_overlap])

        # Expected default rendering of word_seg.
        default_template = (
            'segment number 1\n'
            '\tcontent:\t"ab"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t0\n'
            '\tend:\t2\n'
            '\tannotations:\n'
            '\t\ta 1\n'
            '\t\tbc 20\n'
            'segment number 2\n'
            '\tcontent:\t"cde"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t3\n'
            '\tend:\t6'
        )
        self.base_output_string = default_template % (index, index)

        # Number of progress callbacks received so far.
        self.count = 0

    def tearDown(self):
        """Nothing to clean up after a test"""

    def test_creator(self):
        """Is the object built by the creator a Segmentation?"""
        created = Segmentation()
        self.assertIsInstance(
            created,
            Segmentation,
            msg="creator doesn't return Segmentation object!",
        )

    def test_to_string_default_format(self):
        """Is the default to_string() rendering the expected one?"""
        rendered = self.word_seg.to_string()
        self.assertEqual(
            rendered,
            self.base_output_string,
            msg="to_string() doesn't format segmentation correctly by default!",
        )

    def test_to_string_delimiter(self):
        """Does the segment delimiter appear in the rendering?"""
        rendered = self.word_seg.to_string(segment_delimiter='DELIMITER')
        self.assertIn(
            'DELIMITER',
            rendered,
            msg="to_string() doesn't format segment delimiter correctly!",
        )

    def test_to_string_header(self):
        """Is the header placed before the rendered segments?"""
        expected = 'HEADER' + self.base_output_string
        rendered = self.word_seg.to_string(header='HEADER')
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't format header correctly!",
        )

    def test_to_string_footer(self):
        """Is the footer placed after the rendered segments?"""
        expected = self.base_output_string + 'FOOTER'
        rendered = self.word_seg.to_string(footer='FOOTER')
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't format footer correctly!",
        )

    def test_to_string_humanize_addresses(self):
        """Are str_index and start shifted by one when humanized?"""
        rendered = self.word_seg.to_string(humanize_addresses=True)
        replacements = (
            ('t:\t3', 't:\t4'),
            ('t:\t0', 't:\t1'),
            ('x:\t%i' % self.str_index, 'x:\t%i' % (self.str_index + 1)),
        )
        expected = self.base_output_string
        for raw, humanized in replacements:
            expected = expected.replace(raw, humanized)
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't humanize addresses!",
        )

    def test_to_string_interpolate_builtin_variables(self):
        """Are the builtin variables interpolated in the format?"""
        template = (
            '%(__num__)s,%(__content__)s,'
            '%(__str_index__)s,%(__start__)s,%(__end__)s,'
            '%(__str_index_raw__)s,%(__start_raw__)s,%(__end_raw__)s'
        )
        expected = '1,ab,%i,0,2,%i,0,2\n2,cde,%i,3,6,%i,3,6' % (
            (self.str_index,) * 4
        )
        rendered = self.word_seg.to_string(formatting=template)
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't interpolate builtin variables!",
        )

    def test_to_string_interpolate_annotations(self):
        """Are annotation values interpolated in the format?"""
        rendered = self.word_seg.to_string(formatting='%(a)s')
        self.assertEqual(
            rendered,
            '1\n__none__',
            msg="to_string() doesn't interpolate annotations!",
        )

    def test_to_string_progress(self):
        """Is the progress callback called once per segment?"""

        def count_call():
            """Record one call of the progress callback"""
            self.count += 1

        self.word_seg.to_string(progress_callback=count_call)
        expected_calls = len(self.word_seg)
        self.assertEqual(
            self.count,
            expected_calls,
            msg="to_string doesn't track progress!",
        )

    def test_get_annotation_keys(self):
        """Are the existing annotation keys returned?"""
        expected_keys = sorted(['a', 'bc'])
        found_keys = sorted(self.word_seg.get_annotation_keys())
        self.assertEqual(
            found_keys,
            expected_keys,
            msg="get_annotation_keys() doesn't return existing annotations!",
        )

    def test_is_non_overlapping(self):
        """Is a segmentation without overlap reported as non-overlapping?"""
        verdict = self.word_seg.is_non_overlapping()
        self.assertTrue(
            verdict,
            msg="is_non_overlapping() doesn't recognize absence of overlap!",
        )

    def test_is_overlapping(self):
        """Is a segmentation with overlap reported as overlapping?"""
        verdict = self.overlapping_seg.is_non_overlapping()
        self.assertFalse(
            verdict,
            msg="is_non_overlapping() doesn't recognize presence of overlap!",
        )
def sendData(self):
    """Tag the input with TreeTagger and send the result as "Text data".

    ``None`` is sent when the TreeTagger link was not found or when there
    is no input. Otherwise the input segments are concatenated, tagged via
    ``self.tag`` and each tagger output line is wrapped in a ``<w>``
    element; the output is either that XML-annotated segmentation or its
    re-segmentation on ``<w>``, depending on ``self.activer_xml``. The
    TreeTagger link is also written to ``treetagger_link.txt``.
    """
    # The link to TreeTagger was not found.
    if self.NoLink:
        self.infoBox.setText(u"Sorry, TreeTagger's link not found.", "error")
        self.send("Text data", None)

    # Important: if input data is None, propagate this value to output...
    elif not self.inputData:
        self.infoBox.setText(u"Widget needs input", "warning")
        self.send("Text data", None)

    else:
        # Tell the user that something is going on...
        self.infoBox.setText(u"TreeTagger is running...", "warning")

        # Initialize progress bar.
        self.progressBar = gui.ProgressBar(self, iterations=5)

        # Close the progress bar even if tagging or saving the link fails.
        try:
            # Copy the segmentation, storing annotations in a temp attr...
            copy_of_input_seg = Segmentation()
            copy_of_input_seg.label = self.inputData.label
            for segment in self.inputData:
                attr = " ".join(
                    ["%s='%s'" % item for item in segment.annotations.items()])
                segment.annotations["tt_xb"] = attr
                copy_of_input_seg.append(segment)
            self.progressBar.advance()

            # Dump everything in one string so the tagger is called once.
            concatenated_text = copy_of_input_seg.to_string(
                formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
                display_all=True,
            )
            self.progressBar.advance()

            tagged_text = self.tag(concatenated_text)
            tagged_input = Input(tagged_text)
            tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")
            self.progressBar.advance()

            # The recoding is the same whether or not XML output is enabled.
            xml_segmentation, _ = Segmenter.recode(
                tagged_segmentation,
                substitutions=[
                    (re.compile(r"<unknown>"), "[unknown]"),
                    (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                     "<w lemma='&3' type='&2'>&1</w>"),
                    # NOTE(review): as written, pattern and replacement are
                    # the same three-quote string -- confirm the intent.
                    (re.compile(r'"""'), '"""'),
                ],
            )

            # The comparison with True is kept as is: the type stored in
            # activer_xml is not visible here, so plain truthiness could
            # select a different branch.
            if self.activer_xml == True:
                final_segmentation = xml_segmentation
            else:
                final_segmentation = Segmenter.import_xml(
                    xml_segmentation, "w")

            self.infoBox.dataSent("")

            # Save the TreeTagger link...
            if self.system == "nt":
                link_path = "treetagger_link.txt"
            else:
                link_path = os.path.normpath(
                    "/Users/" + self.user + "/treetagger_link.txt")
            # The context manager closes the file even if writing fails.
            with open(link_path, "w") as link_file:
                link_file.write(self.treetagger_link)
        finally:
            # Clear progress bar.
            self.progressBar.finish()

        # Send the segmentation.
        self.send("Text data", final_segmentation, self)
        self.compteur += 1
        self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    Each input segment is analyzed with ``self.nlp`` and one output segment
    is created per token, positioned relative to the input segment and
    annotated with the input annotations plus every attribute of the token
    listed in ``RELEVANT_KEYS`` whose value is not ``None``. ``None`` is
    sent when there is no input.
    """
    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        self.send("Linguistically analyzed data", None, self)
        return

    # Initialize progress bar.
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.inputSeg))

    # If the analysis fails, the progress bar must still be closed and the
    # controls re-enabled; otherwise the widget would stay locked.
    try:
        tokenizedSegments = list()

        # Process each input segment...
        for segment in self.inputSeg:

            # Input segment attributes...
            inputContent = segment.get_content()
            inputAnnotations = segment.annotations
            inputString = segment.str_index
            inputStart = segment.start or 0

            # NLP analysis...
            doc = self.nlp(inputContent)

            # Process each token in input segment...
            for token in doc:
                tokenAnnotations = inputAnnotations.copy()
                for key in RELEVANT_KEYS:
                    value = getattr(token, key)
                    if value is not None:
                        tokenAnnotations[key] = value
                # token.idx is added to the start of the input segment to
                # obtain the position of the token.
                tokenStart = inputStart + token.idx
                tokenizedSegments.append(
                    Segment(
                        str_index=inputString,
                        start=tokenStart,
                        end=tokenStart + len(token),
                        annotations=tokenAnnotations,
                    )
                )
            progressBar.advance()

        outputSeg = Segmentation(tokenizedSegments, self.captionTitle)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output." % len(outputSeg)
        message = pluralize(message, len(outputSeg))
        self.infoBox.setText(message)

        # NOTE(review): this dumps the whole output on stdout and looks like
        # leftover debugging output -- confirm whether it can be removed.
        print(outputSeg.to_string())
    finally:
        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

    # Send data to output...
    self.send("Linguistically analyzed data", outputSeg, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    For every row of ``self.char_df`` tagged "PER", a segment is created in
    the input segment that contains it, annotated with the row's alias under
    the key "id". The resulting segmentation is sent on the
    "Character segmentation" channel. When there is no model or no input,
    ``self.sendNoneToOutputs()`` is called instead.
    """
    # Check that there's a model...
    if not self.model:
        self.noLanguageModelWarning()
        self.sendNoneToOutputs()
        return

    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input.", "warning")
        self.sendNoneToOutputs()
        return

    # Disable control area and initialize progress bar...
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.char_df))

    # If anything fails below, the progress bar must still be closed and the
    # controls re-enabled; otherwise the widget would stay locked.
    try:
        # Get start and end pos of concatenated input segments. Each segment
        # starts one position after the end of the previous content.
        startPositions = [0]
        endPositions = list()
        numSegments = len(self.inputSeg)
        for idx in range(1, numSegments):
            prevSegLen = len(self.inputSeg[idx-1].get_content())
            startPositions.append(startPositions[-1] + prevSegLen + 1)
            endPositions.append(startPositions[-1] - 1)
        endPositions.append(
            startPositions[-1] + len(self.inputSeg[-1].get_content()) + 1
        )

        # Get or update character aliases...
        find_pairs = sys.modules['charnetto.find_pairs']
        characters = [entry.split(", ") for entry in self.characters]
        find_pairs.map_names(self.char_df, characters)

        # Initializations...
        charSegments = list()
        currentSegmentIdx = 0

        # For each character token in Charnetto's output...
        for _, charToken in self.char_df.iterrows():

            # The bar was sized with the number of rows, so it advances for
            # every row, including the ones skipped below.
            progressBar.advance()

            # Skip non-PER named entities.
            if charToken["tag"] != "PER":
                continue

            # Get index of containing segment...
            while charToken["end_pos"] > endPositions[currentSegmentIdx]:
                currentSegmentIdx += 1

            # Create segment for char with its actual coordinates...
            strIndex = self.inputSeg[currentSegmentIdx].str_index
            offset = startPositions[currentSegmentIdx]
            start = charToken["start_pos"] - offset
            end = charToken["end_pos"] - offset
            annotations = {"id": charToken["alias"]}
            charSegments.append(Segment(strIndex, start, end, annotations))

        # Send output...
        outputSegmentation = Segmentation(
            charSegments, label=self.captionTitle
        )
        self.send("Character segmentation", outputSegmentation, self)

        # NOTE(review): this dumps the whole output on stdout and looks like
        # leftover debugging output -- confirm whether it can be removed.
        print(outputSegmentation.to_string())

        # Set status to OK and report data size...
        message = "%i segment@p sent to output." % len(outputSegmentation)
        message = pluralize(message, len(outputSegmentation))
        self.infoBox.setText(message)
    finally:
        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Tag the input with TreeTagger and send the result as 'Text data'.

    ``None`` is sent when the TreeTagger link was not found or when there
    is no input. Otherwise the input segments are concatenated, tagged via
    ``self.tag`` and each tagger output line is wrapped in a ``<w>``
    element; the output is either that XML-annotated segmentation or its
    re-segmentation on ``<w>``, depending on ``self.activer_xml``. The
    TreeTagger link is also written to ``treetagger_link.txt``.
    """
    # The link to TreeTagger was not found.
    if self.NoLink:
        self.infoBox.setText(
            u"Sorry, TreeTagger's link not found.",
            "error"
        )
        self.send('Text data', None)

    # Important: if input data is None, propagate this value to output...
    elif not self.inputData:
        self.infoBox.setText(
            u"Widget needs input",
            "warning"
        )
        self.send('Text data', None)

    else:
        # Tell the user that something is going on...
        self.infoBox.setText(
            u'TreeTagger is running...',
            "warning"
        )

        # Initialize progress bar.
        self.progressBar = OWGUI.ProgressBar(self, iterations=5)

        # Close the progress bar even if tagging or saving the link fails.
        try:
            # Copy the segmentation, storing annotations in a temp attr...
            copy_of_input_seg = Segmentation()
            copy_of_input_seg.label = self.inputData.label
            for segment in self.inputData:
                attr = " ".join(
                    ["%s='%s'" % item for item in segment.annotations.items()]
                )
                segment.annotations["tt_xb"] = attr
                copy_of_input_seg.append(segment)
            self.progressBar.advance()

            # Dump everything in one string so the tagger is called once.
            concatenated_text = copy_of_input_seg.to_string(
                formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
                display_all=True,
            )
            self.progressBar.advance()

            tagged_text = self.tag(concatenated_text)
            tagged_input = Input(tagged_text)
            tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")
            self.progressBar.advance()

            # The recoding is the same whether or not XML output is enabled.
            xml_segmentation = Segmenter.recode(
                tagged_segmentation,
                substitutions=[
                    (re.compile(r"<unknown>"), '[unknown]'),
                    (re.compile(r"(.+)\t(.+)\t(.+)"),
                     '<w lemma="&3" type="&2">&1</w>'),
                    # NOTE(review): as written, pattern and replacement are
                    # the same three-quote string -- confirm the intent.
                    (re.compile(r'"""'), '"""'),
                ],
            )

            # The comparison with True is kept as is: the type stored in
            # activer_xml is not visible here, so plain truthiness could
            # select a different branch.
            if self.activer_xml == True:
                final_segmentation = xml_segmentation
            else:
                final_segmentation = Segmenter.import_xml(
                    xml_segmentation, "w"
                )

            self.infoBox.dataSent('')

            # Save the TreeTagger link...
            if self.system == "nt":
                link_path = "treetagger_link.txt"
            else:
                link_path = os.path.normpath(
                    "/Users/" + self.user + "/treetagger_link.txt"
                )
            # The context manager closes the file even if writing fails.
            with open(link_path, 'w') as link_file:
                link_file.write(self.treetagger_link)
        finally:
            # Clear progress bar.
            self.progressBar.finish()

        # Send the segmentation.
        self.send('Text data', final_segmentation, self)
        self.compteur += 1
        self.sendButton.resetSettingsChangedFlag()