def setUp(self):
    """Prepare the input text, two segmentations and a reference string."""
    self.entire_text_seg = Input('ab cde')
    text_index = self.entire_text_seg[0].str_index
    self.str_index = text_index

    # The two words of 'ab cde'; only the first one carries annotations.
    annotated_word = Segment(
        str_index=text_index,
        start=0,
        end=2,
        annotations={
            'a': '1',
            'bc': '20'
        },
    )
    plain_word = Segment(str_index=text_index, start=3, end=6)
    self.word_seg = Segmentation([annotated_word, plain_word])

    # Two segments that share position 4 of the text.
    self.overlapping_seg = Segmentation([
        Segment(str_index=text_index, start=first, end=last)
        for first, last in ((3, 5), (4, 6))
    ])

    # Reference description of the two word segments; the two %i slots
    # receive the string index of the input text.
    template = ''.join((
        'segment number 1\n',
        '\tcontent:\t"ab"\n',
        '\tstr_index:\t%i\n',
        '\tstart:\t0\n',
        '\tend:\t2\n',
        '\tannotations:\n',
        '\t\ta 1\n',
        '\t\tbc 20\n',
        'segment number 2\n',
        '\tcontent:\t"cde"\n',
        '\tstr_index:\t%i\n',
        '\tstart:\t3\n',
        '\tend:\t6',
    ))
    self.base_output_string = template % (text_index, text_index)

    self.count = 0
def sendData(self):
    """Summarize the input and send the plain and HTML segmentations.

    Depending on ``self.typeSeg``, either every input segment is summarized
    on its own or the contents of all segments are joined with spaces and
    summarized once. Each call to ``self.summarize`` yields a plain and an
    HTML result; one output segment per result is built from the string
    index of its first element.
    """
    # Nothing can be computed without a language model.
    if not self.model:
        self.noLanguageModelWarning()
        return

    # Without input, clear both output channels.
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input.", "warning")
        for channel in ('Summary', 'HTML_Summary'):
            self.send(channel, None, self)
        return

    # Tell the user we are busy and lock the controls meanwhile.
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)

    # Work out which texts have to be summarized.
    mode = self.typeSeg
    if mode == "Summarize each segments individually":
        # Lazy, so that reading and summarizing stay interleaved.
        texts = (seg.get_content() for seg in self.inputSeg)
    elif mode == "Summarize all segments as one":
        texts = (" ".join([seg.get_content() for seg in self.inputSeg]),)
    else:
        texts = ()

    plain_segments = []
    html_segments = []
    for text in texts:
        resume, html_resume = self.summarize(self.cv, text)
        plain_segments.append(Segment(str_index=resume[0].str_index))
        html_segments.append(Segment(str_index=html_resume[0].str_index))

    # Wrap the segments and publish them on both channels.
    self.outputSeg = Segmentation(plain_segments, self.captionTitle)
    self.html_outputSeg = Segmentation(html_segments, self.captionTitle)
    self.send("Summary", self.outputSeg, self)
    self.send('HTML_Summary', self.html_outputSeg, self)

    # Report how many segments were sent.
    sent_count = len(self.outputSeg)
    report = pluralize("%i segment@p sent to output " % sent_count, sent_count)
    self.infoBox.setText(report)

    self.sendButton.resetSettingsChangedFlag()
    self.controlArea.setDisabled(False)
def test_get_content_missing_end(self):
    """Check that get_content() returns the whole text when end is None."""
    text_index = self.entire_text_seg[0].str_index
    open_ended = Segment(str_index=text_index, start=0, end=None)
    self.assertEqual(
        open_ended.get_content(),
        'ab cde',
        msg="get_content() doesn't work with end=None!",
    )
def test_get_content_missing_start_and_end(self):
    """Check get_content() when neither start nor end is given."""
    text_index = self.entire_text_seg[0].str_index
    unbounded = Segment(str_index=text_index, start=None, end=None)
    self.assertEqual(
        unbounded.get_content(),
        'ab cde',
        msg="get_content() doesn't work with start=None and end=None!",
    )
def setUp(self):
    """Create two inputs plus word- and character-level segmentations."""
    self.entire_text_seg = Input('ab cde')
    self.other_entire_text_seg = Input('d')
    text_index = self.entire_text_seg[0].str_index

    # First word of 'ab cde', with a single annotation.
    first_word = Segment(
        str_index=text_index,
        start=0,
        end=2,
        annotations={'a': 1},
    )
    self.first_word_seg = Segmentation([first_word])

    # Last word of 'ab cde', without annotations.
    last_word = Segment(str_index=text_index, start=3, end=6)
    self.last_word_seg = Segmentation([last_word])

    # One segment per character position 0..5.
    self.char_seg = Segmentation([
        Segment(str_index=text_index, start=offset, end=offset + 1)
        for offset in range(6)
    ])
def spacyItemsToSegments(items, parentSegment):
    """Convert spaCy items (tokens or spans) to Textable segments.

    Every item yields one segment that refers to the same string as
    ``parentSegment``; its positions are shifted by the parent's start
    offset (0 when the parent has no start). The segment's annotations are
    a copy of the parent's annotations, updated with each attribute listed
    in RELEVANT_KEYS that the item has and whose value is neither None nor
    the empty string.

    Args:
        items: iterable of spaCy tokens or spans.
        parentSegment: segment providing ``str_index``, ``annotations`` and
            ``start`` for the new segments.

    Returns:
        list of Segment objects, one per item, in iteration order.
    """
    parentStrIndex = parentSegment.str_index
    parentAnnotations = parentSegment.annotations
    parentStart = parentSegment.start or 0
    segments = list()
    for item in items:
        annotations = parentAnnotations.copy()
        for key in RELEVANT_KEYS:
            # Single lookup; a missing attribute is treated like None.
            value = getattr(item, key, None)
            if value is None:
                continue
            # Empty strings are compared by value (not identity), and only
            # for str values so that no foreign comparison operator runs.
            if isinstance(value, str) and value == "":
                continue
            annotations[key] = value
        # Tokens expose idx/len(); anything else is expected to expose
        # start_char/end_char.
        if str(type(item)).endswith("Token'>"):
            startPos = parentStart + item.idx
            endPos = startPos + len(item)
        else:
            startPos = parentStart + item.start_char
            endPos = parentStart + item.end_char
        segments.append(
            Segment(
                str_index=parentStrIndex,
                start=startPos,
                end=endPos,
                annotations=annotations,
            )
        )
    return segments
def test_creator_no_annotations(self):
    """Check that annotations=None results in an empty annotation dict."""
    bare_segment = Segment(0, annotations=None)
    self.assertEqual(
        bare_segment.annotations,
        {},
        msg="creator doesn't init param annotations to {} by default!",
    )
def test_creator(self):
    """Check that calling Segment() produces a Segment instance."""
    created = Segment(1)  # 1 stands in for a string address
    self.assertIsInstance(
        created,
        Segment,
        msg="creator doesn't return Segment object!",
    )
def treat_input(self):
    """Parse every input segment as CSV and build one segment per row.

    For each input segment the content is read with ``csv.reader`` using a
    dialect guessed from its first line. Column names come from the header
    row when one is detected, otherwise they are "1".."n" (unless
    ``self.isRenamed`` is set, in which case ``self.dict_keys`` is left as
    it is). For every following row, the value of the column at index
    ``self.content_column`` becomes the content of a new segment appended
    to ``self.csvSeg``; the other non-empty values become annotations.
    Rows whose content value is empty are recorded by row number in
    ``self.contentIsNone``. ``self.headerList`` is rebuilt, the info box is
    updated, and ``self.sendButton.sendIf()`` is called at the end.
    """
    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        del self.headerList[:]
        # Self-assignment: presumably notifies the GUI control bound to
        # this attribute -- TODO confirm.
        self.headerList = self.headerList
        return

    # Initialize progress bar.
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.inputSeg))

    # Clear the result lists in place.
    del self.csvSeg[:]
    del self.contentIsNone[:]

    # Process each input segment...
    for segment in self.inputSeg:

        # Input segment attributes...
        inputContent = segment.get_content()
        if not self.deleteQuotes == False :
            # Drop every double quote from the content before parsing.
            inputContent = inputContent.replace('"',"")
        inputAnnotations = segment.annotations
        inputStrIdx = segment.str_index
        # NOTE(review): inputStart and inputEnd are not used below; the
        # positions computed later are relative to inputContent only --
        # confirm whether input segments can start past offset 0.
        inputStart = segment.start or 0
        inputEnd = segment.end or len(inputContent)

        # Set up the CSV reader; the dialect is guessed from the first line.
        # `sniffer` is not defined in this function (presumably a
        # csv.Sniffer instance provided by the module -- TODO confirm).
        csv_stream = io.StringIO(inputContent)
        dialect = sniffer.sniff(csv_stream.readline())
        dialect.quoting=csv.QUOTE_NONE
        csv_stream.seek(0)
        my_reader = csv.reader(csv_stream, dialect)
        # Running character offset into inputContent.
        position = 0

        # NOTE(review): this loop walks inputContent one character at a
        # time and only rebinds segAnnotations, which is assigned again
        # for every row below -- looks redundant; confirm before removing.
        for seg in inputContent:
            segAnnotations = inputAnnotations.copy()

        # This will launch if sniffer detects a header in the content.
        if sniffer.has_header(inputContent) == True:
            # go back to the start otherwise we're going to start from the
            # second row
            csv_stream.seek(0)
            # the header row is defined here.
            if self.isRenamed == False :
                self.dict_keys = next(my_reader)
                for key in self.dict_keys:
                    # this is position of first content
                    # TODO : separator length (if not 1)
                    position += (len(key) + 1)
            else :
                # Column names were renamed: keep self.dict_keys and only
                # consume the header row to advance the offset.
                input_keys = next(my_reader)
                for key in input_keys:
                    # this is position of first content
                    # TODO : separator length (if not 1)
                    position += (len(key) + 1)

        # This will launch if sniffer does not detect a header
        # in the content.
        # NOTE(review): has_header() is evaluated a second time here.
        if sniffer.has_header(inputContent) == False:
            # go back to the start otherwise we're going to start from the
            # second row. we do this here even though we don't really care
            # about the first row simply because in general we consider the
            # first row to not have any missing values
            csv_stream.seek(0)
            first_row = next(my_reader)
            n_cols = len(first_row)
            if self.isRenamed == False :
                # Name the columns "1".."n".
                self.dict_keys = list()
                for item in range(1, n_cols+1):
                    self.dict_keys.append(str(item))
            # Rewind so that the first row is processed as data below.
            csv_stream.seek(0)

        # clear the list before appending
        del self.headerList[:]

        for key in self.dict_keys:
            # appends the headers to the gui list
            if self.dict_keys.index(key) == self.content_column:
                self.headerList.append(str(key)+"(*content)")
                self.headerList = self.headerList
            else :
                self.headerList.append(str(key))
                self.headerList = self.headerList

        # Row numbers start at 2.
        for idx, row in enumerate(my_reader, start=2):
            # Get old annotations in new dictionary
            oldAnnotations = inputAnnotations.copy()
            segAnnotations = dict()
            # initiate next row starting position
            next_position = position
            for key in oldAnnotations.keys():
                segAnnotations[key] = oldAnnotations[key]

            # This is the main part where we transform our data into
            # annotations.
            for key in self.dict_keys:
                # segAnnotations["length"] = position
                # segAnnotations["row"] = str(row)

                # if column is content (first column (0) by default)
                if self.dict_keys.index(key) == self.content_column:
                    # put value as content
                    content = row[self.dict_keys.index(key)]
                # else we put value in annotation
                else:
                    # only if value is not None
                    if len(row[self.dict_keys.index(key)]) != 0 :
                        segAnnotations[key] = row[self.dict_keys.index(key)]
                # implement position and next_position depending on
                # content column: `position` only skips the fields before
                # the content column, `next_position` skips every field of
                # the row (each field counted as its length + 1).
                if self.dict_keys.index(key) < self.content_column:
                    position += len(row[self.dict_keys.index(key)]) + 1
                    next_position += len(row[self.dict_keys.index(key)]) + 1
                if self.dict_keys.index(key) >= self.content_column:
                    next_position += len(row[self.dict_keys.index(key)]) + 1

            if len(content) != 0:
                self.csvSeg.append(
                    Segment(
                        str_index = inputStrIdx,
                        start = position,
                        end = position + len(content),
                        annotations = segAnnotations
                    )
                )
            else :
                # if no content, add idx of the row and do not append
                # TODO : something with contentIsNone
                self.contentIsNone.append(idx)

            # initiate new row starting position
            position = next_position

        progressBar.advance()

    unSeg = len(self.csvSeg)

    # Set status to OK and report segment analyzed...
    message = "%i segment@p analyzed." % unSeg
    message = pluralize(message, unSeg)
    message += " (Ignored %i segment@p with no content)" % \
        len(self.contentIsNone)
    message = pluralize(message, len(self.contentIsNone))
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()
    self.sendButton.sendIf()
def setUp(self):
    """Create the inputs and segmentations shared by the tests."""

    def spans(index, *specs):
        """Build a Segmentation from (start, end[, annotations]) specs."""
        built = []
        for spec in specs:
            if len(spec) > 2:
                piece = Segment(
                    str_index=index,
                    start=spec[0],
                    end=spec[1],
                    annotations=spec[2],
                )
            else:
                piece = Segment(str_index=index, start=spec[0], end=spec[1])
            built.append(piece)
        return Segmentation(built)

    # Segmentations over 'ab cde'.
    self.entire_text_seg = Input('ab cde')
    main_index = self.entire_text_seg[0].str_index
    self.word_seg = spans(main_index, (0, 2, {'a': '1'}), (3, 6))
    self.char_seg = spans(
        main_index,
        *[(pos, pos + 1) for pos in range(6)]
    )
    self.letter_seg1 = spans(main_index, (0, 1, {'a': '1'}), (1, 2))
    self.letter_seg2 = spans(
        main_index,
        (3, 4),
        (4, 5, {'b': '2'}),
        (5, 6),
    )
    self.letter_seg = spans(
        main_index,
        (0, 1, {'a': '1'}),
        (1, 2),
        (3, 4),
        (4, 5, {'b': '2'}),
        (5, 6),
    )
    self.single_letter_seg = spans(main_index, (4, 5, {'b': '1'}))
    self.duplicate_seg = spans(main_index, (0, 1), (0, 1))
    self.overlapping_seg = spans(main_index, (3, 5), (4, 6))

    # One annotated segment per character of 'abbccc'.
    self.other_entire_text_seg = Input('abbccc')
    other_index = self.other_entire_text_seg[0].str_index
    self.other_letter_seg = spans(
        other_index,
        *[
            (pos, pos + 1, {'a': value})
            for pos, value in enumerate('111223')
        ]
    )

    # One segment per character of 'bd1', partially annotated.
    self.third_entire_text_seg = Input('bd1')
    third_index = self.third_entire_text_seg[0].str_index
    self.third_letter_seg = spans(
        third_index,
        (0, 1),
        (1, 2, {'a': '2'}),
        (2, 3, {'a': 'b'}),
    )

    # The two words of a text containing upper case and a non-ASCII char.
    self.fourth_entire_text_seg = Input('AB cd\xe9')
    fourth_index = self.fourth_entire_text_seg[0].str_index
    self.second_word_seg = spans(fourth_index, (0, 2), (3, 6))

    # XML inputs.
    self.xml_seg = Input('<a attr="1"><a attr="2/3/">c<a/>d</a></a>')
    self.wrong_xml_seg = Input('<a><a>test</a>')
    self.wrong_xml_seg2 = Input('<a>test</a></a>')

    # XML markup spread over two separate inputs.
    self.part_xml_seg = Input('<a>1<a>2<a>3</a>4')
    first_part_index = self.part_xml_seg[0].str_index
    self.part_xml_seg2 = Input('</a>5</a>')
    second_part_index = self.part_xml_seg2[0].str_index
    self.broken_xml_seg = Segmentation([
        Segment(str_index=first_part_index, annotations={'a': '1'}),
        Segment(str_index=second_part_index),
    ])

    self.count = 0
def sendData(self):
    """Compute result of widget processing and send to output.

    Every input segment is analyzed with ``self.nlp``; each resulting token
    becomes one output segment that refers to the same string as the input
    segment, with positions shifted by the input segment's start offset.
    A token's annotations are a copy of the input segment's annotations,
    updated with every attribute listed in RELEVANT_KEYS whose value on the
    token is not None (attributes the token lacks are skipped). The
    resulting segmentation is sent on "Linguistically analyzed data"; None
    is sent when there is no input.
    """
    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        self.send("Linguistically analyzed data", None, self)
        return

    # Initialize progress bar (one step per input segment).
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.inputSeg))

    tokenizedSegments = list()

    # Process each input segment...
    for segment in self.inputSeg:

        # Input segment attributes...
        inputContent = segment.get_content()
        inputAnnotations = segment.annotations
        inputString = segment.str_index
        inputStart = segment.start or 0

        # NLP analysis...
        doc = self.nlp(inputContent)

        # Process each token in input segment...
        for token in doc:
            tokenAnnotations = inputAnnotations.copy()
            for key in RELEVANT_KEYS:
                # Single lookup; a missing attribute is treated like None.
                value = getattr(token, key, None)
                if value is not None:
                    tokenAnnotations[key] = value
            # token.idx is relative to the analyzed content, so shift it
            # by the input segment's own start.
            tokenStart = inputStart + token.idx
            tokenizedSegments.append(
                Segment(
                    str_index=inputString,
                    start=tokenStart,
                    end=tokenStart + len(token),
                    annotations=tokenAnnotations,
                )
            )

        progressBar.advance()

    outputSeg = Segmentation(tokenizedSegments, self.captionTitle)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output." % len(outputSeg)
    message = pluralize(message, len(outputSeg))
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Send data to output...
    self.send("Linguistically analyzed data", outputSeg, self)

    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Convert input(s) and send output.

    Two independent conversions are performed, depending on which inputs
    are present:

    * corpus -> segmentation: every row with non-empty text becomes a new
      Input wrapped in one segment; the row's domain attributes and metas
      (except the text feature itself) whose string value is not "?" become
      annotations. The result is sent on 'Textable segmentation'.
    * segmentation -> corpus: every annotation key becomes a discrete
      attribute, or a string meta when the number of distinct values
      exceeds ``self.maxNumCategories`` and ``self.limitNumCategories`` is
      set; the segment content is stored in the "textable_text" meta. The
      corpus is sent on 'Text Mining corpus' when Text Mining is installed.

    None is sent on a channel whenever no value can be produced for it.
    """
    if not (self.segmentation or self.corpus):
        self.infoBox.setText(u'Widget needs input.', 'warning')
        self.send('Textable segmentation', None, self)
        self.send('Text Mining corpus', None)
        return

    msg_seg = msg_corpus = ""

    # One progress step per corpus row and per input segment.
    num_iterations = 0
    if self.corpus:
        num_iterations += len(self.corpus)
    if self.segmentation:
        num_iterations += len(self.segmentation)
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=num_iterations)

    # Convert corpus to segmentation...
    if self.corpus:
        self.clearCreatedInputs()
        new_segments = list()
        text_feature = self.corpus.text_features[self.segmentContent]
        for row in self.corpus:
            content = row[text_feature].value
            if content == "":
                # Skipped rows still count as a progress step.
                progressBar.advance()
                continue
            new_input = Input(content)
            new_segment_annotations = dict()
            for attr in self.corpus.domain:
                attr_str = str(row[attr])
                # "?" values are left out of the annotations.
                if attr_str != "?":
                    new_segment_annotations[str(attr)] = attr_str
            for meta_attr in self.corpus.domain.metas:
                meta_attr_str = str(row[meta_attr])
                if (meta_attr != text_feature and
                        meta_attr_str != "?"):
                    new_segment_annotations[str(meta_attr)] = meta_attr_str
            new_segments.append(
                Segment(
                    new_input[0].str_index,
                    new_input[0].start,
                    new_input[0].end,
                    new_segment_annotations,
                )
            )
            # Keep a reference to the Input created for this row.
            self.createdInputs.append(new_input)
            progressBar.advance()
        new_segmentation = Segmentation(new_segments, self.captionTitle)
        msg_seg = u'%i segment@p' % len(new_segmentation)
        msg_seg = pluralize(msg_seg, len(new_segmentation))
        self.send('Textable segmentation', new_segmentation, self)
    else:
        self.send('Textable segmentation', None, self)

    # Convert segmentation to corpus...
    corpus = None
    if self.segmentation:
        metas = list()
        attributes = list()
        meta_keys = list()
        attribute_keys = list()
        for key in self.segmentation.get_annotation_keys():
            # Collect the distinct values taken by this annotation key.
            possible_values = set()
            for segment in self.segmentation:
                try:
                    possible_values.add(str(segment.annotations[key]))
                except KeyError:
                    pass
            if (self.limitNumCategories and
                    len(possible_values) > self.maxNumCategories):
                # Too many categories: store as free text.
                metas.append(StringVariable(key))
                meta_keys.append(key)
            else:
                attributes.append(
                    DiscreteVariable(key, values=list(possible_values))
                )
                attribute_keys.append(key)
        metas.append(StringVariable("textable_text"))
        domain = Domain(attributes, [], metas)
        rows = list()
        for segment in self.segmentation:
            # Column order: attributes, then metas, then the text itself.
            row = [
                str(segment.annotations.get(annotation_key, None))
                for annotation_key in attribute_keys
            ]
            row.extend([
                str(segment.annotations.get(annotation_key, None))
                for annotation_key in meta_keys
            ])
            row.append(segment.get_content())
            rows.append(row)
            progressBar.advance()
        # A corpus can only be built when Text Mining is available.
        if textMiningIsInstalled:
            table = Table(domain, rows)
            corpus = Corpus(
                domain,
                X=table.X,
                metas=table.metas,
                text_features=[metas[-1]],
            )
            msg_corpus = u'%i document@p' % len(self.segmentation)
            msg_corpus = pluralize(msg_corpus, len(self.segmentation))
    self.send('Text Mining corpus', corpus)

    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Report what was sent, e.g. "N segments and M documents sent to output."
    if msg_seg or msg_corpus:
        message = msg_seg
        if msg_seg and msg_corpus:
            message += " and "
        message += msg_corpus
        message += " sent to output."
        self.infoBox.setText(message)

    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    The rows of ``self.char_df`` tagged "PER" are turned into segments.
    Their "start_pos"/"end_pos" values are treated as offsets into the
    input segments' contents laid end to end with one separator character
    between consecutive segments; each offset is mapped back to the
    containing input segment and made relative to that segment's content.
    Every output segment is annotated with the row's "alias" under the key
    "id". Rows are expected in increasing position order, since the index
    of the containing segment only moves forward.
    """
    # Check that there's a model...
    if not self.model:
        self.noLanguageModelWarning()
        self.sendNoneToOutputs()
        return

    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input.", "warning")
        self.sendNoneToOutputs()
        return

    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )

    # Disable control area and initialize progress bar (one step per row
    # of char_df)...
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.char_df))

    # Get start and end pos of concatenated input segments...
    startPositions = [0]
    endPositions = list()
    numSegments = len(self.inputSeg)
    for idx in range(1, numSegments):
        prevSegLen = len(self.inputSeg[idx-1].get_content())
        # +1 accounts for the separator between consecutive segments.
        startPositions.append(startPositions[-1] + prevSegLen + 1)
        endPositions.append(startPositions[-1] - 1)
    endPositions.append(
        startPositions[-1] + len(self.inputSeg[-1].get_content()) + 1
    )

    # Get or update character aliases...
    find_pairs = sys.modules['charnetto.find_pairs']
    characters = [entry.split(", ") for entry in self.characters]
    find_pairs.map_names(self.char_df, characters)

    # Initializations...
    charSegments = list()
    currentSegmentIdx = 0

    # For each character token in Charnetto's output...
    for _, charToken in self.char_df.iterrows():

        # Skip non-PER named entities (they still count as a step).
        if charToken["tag"] != "PER":
            progressBar.advance()
            continue

        # Get index of containing segment...
        while charToken["end_pos"] > endPositions[currentSegmentIdx]:
            currentSegmentIdx += 1

        # Create segment for char with its actual coordinates...
        strIndex = self.inputSeg[currentSegmentIdx].str_index
        start = charToken["start_pos"]-startPositions[currentSegmentIdx]
        end = charToken["end_pos"]-startPositions[currentSegmentIdx]
        annotations = {"id": charToken["alias"]}
        charSegments.append(Segment(strIndex, start, end, annotations))

        progressBar.advance()

    # Send output...
    outputSegmentation = Segmentation(charSegments,
                                      label=self.captionTitle)
    self.send("Character segmentation", outputSegmentation, self)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output." % len(outputSegmentation)
    message = pluralize(message, len(outputSegmentation))
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()