def _make_token(self, start, end):
    '''
    Instantiates a Token from self._input_string[start:end]
    '''
    ## all thrift strings must be encoded first
    tok_string = self._input_string[start:end].encode('utf-8')
    if only_whitespace.match(tok_string):
        ## drop any tokens with only whitespace
        return None
    tok = Token()
    tok.token = tok_string
    tok.token_num = self.tok_num
    if 'BYTES' in self.config['offset_types']:
        tok.offsets[OffsetType.BYTES] = Offset(
            type=OffsetType.BYTES,
            ## byte position of the token start: bytes consumed before this
            ## string plus the UTF-8 length of everything before the token
            first=self.byte_idx + len(self._input_string[:start].encode('utf-8')),
            length=len(tok_string),
            ## pre-ternary idiom: tok_string if offset_debugging else None
            value=self.config['offset_debugging'] and tok_string or None,
        )
    if 'LINES' in self.config['offset_types']:
        tok.offsets[OffsetType.LINES] = Offset(
            type=OffsetType.LINES,
            first=self.line_idx,
            length=1,
            value=self.config['offset_debugging'] and tok_string or None,
        )
    self.tok_num += 1
    ## keep track of position within a sentence
    tok.sentence_pos = self.sent_pos
    self.sent_pos += 1
    return tok
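## _make_token above relies on a module-level whitespace regex and on two
## config keys; a minimal sketch of those assumptions follows (the exact
## pattern and values here are illustrative, not the original module's):
import re

only_whitespace = re.compile(r'^\s*$')

config = {
    ## which Offset entries to emit on each Token
    'offset_types': ['BYTES', 'LINES'],
    ## when True, copy tok_string into Offset.value to ease debugging
    'offset_debugging': False,
}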
def test_entity_type():
    tok_per = Token(entity_type=EntityType.PER)
    tok_foo = Token(entity_type=EntityType.CUSTOM_TYPE, custom_entity_type='foo')
    assert get_entity_type(tok_per) == 'PER'
    assert get_entity_type(tok_foo) == 'foo'
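## test_entity_type implies the contract of get_entity_type; this is a
## minimal sketch under the assumption that EntityType is a Thrift enum
## (Thrift-generated enums expose a _VALUES_TO_NAMES map), not necessarily
## the library's own implementation:
def get_entity_type(tok):
    ## custom types carry their name in a separate string field
    if tok.entity_type == EntityType.CUSTOM_TYPE:
        return tok.custom_entity_type
    ## otherwise map the enum value back to its symbolic name, e.g. 'PER'
    return EntityType._VALUES_TO_NAMES[tok.entity_type]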
def make_sentences(self, stream_item):
    'assemble Sentence and Token objects'
    self.make_label_index(stream_item)
    sentences = []
    token_num = 0
    new_mention_id = 0
    for sent_start, sent_end, sent_str in self._sentences(
            stream_item.body.clean_visible):
        assert isinstance(sent_str, unicode)
        sent = Sentence()
        sentence_pos = 0
        for start, end in self.word_tokenizer.span_tokenize(sent_str):
            try:
                token_str = sent_str[start:end].encode('utf8')
            except Exception, exc:
                logger.critical("died on sent_str[%d:%d].encode('utf8')",
                                start, end, exc_info=True)
                sys.exit('failed to cope with %r in %r'
                         % (sent_str[start:end], sent_str))
            tok = Token(
                token_num=token_num,
                token=token_str,
                sentence_pos=sentence_pos,
            )
            tok.offsets[OffsetType.BYTES] = Offset(
                type=OffsetType.BYTES,
                first=sent_start + start,
                length=end - start,
            )
            ## whitespace tokenizer will never get a token
            ## boundary in the middle of an 'author' label
            try:
                #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                label = self.label_index.find_le(sent_start + start)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                if off.first + off.length > sent_start + start:
                    logger.info('overlapping label: %r',
                                label.target.target_id)
                    ## overlaps
                    streamcorpus.add_annotation(tok, label)
                    assert label.annotator.annotator_id in tok.labels
                    logger.info('adding label to tok: %r has %r',
                                tok.token, label.target.target_id)
                    if label in self.label_to_mention_id:
                        mention_id = self.label_to_mention_id[label]
                    else:
                        mention_id = new_mention_id
                        new_mention_id += 1
                        self.label_to_mention_id[label] = mention_id
                    tok.mention_id = mention_id
            token_num += 1
            sentence_pos += 1
            sent.tokens.append(tok)
        sentences.append(sent)
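## make_sentences assumes self.label_index is a sorted index over labels
## keyed by byte offset, where find_le(n) returns the label with the
## greatest start offset <= n and raises ValueError when none exists (the
## debug line above also peeks at its _keys attribute); this is a minimal
## sketch of such an index, not the pipeline's own class:
import bisect

class SortedLabelIndex(object):
    def __init__(self, labels):
        ## order labels by the first byte of their BYTES offset
        pairs = sorted((label.offsets[OffsetType.BYTES].first, label)
                       for label in labels)
        self._keys = [key for key, _ in pairs]
        self._labels = [label for _, label in pairs]

    def find_le(self, n):
        ## rightmost label whose start offset is <= n
        i = bisect.bisect_right(self._keys, n)
        if i == 0:
            raise ValueError('no label starts at or before %d' % n)
        return self._labels[i - 1]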
def test_multi_token_match():
    si = make_stream_item(0, '')
    tagger_id = 'test_tagger'
    annotator_id = 'test_anno'
    target_id = 'test_target'
    si.body.sentences[tagger_id] = [
        Sentence(tokens=[
            Token(token='This'),
            Token(token='-LRB-big-RRB- dog'),
            Token(token='Jake'),
            Token(token='has'),
            Token(token='no'),
            ## NB: \U0001F601 (not \u1F601) is the escape for the four-byte
            ## emoji; \u escapes take exactly four hex digits
            Token(token=u'\U0001F601'.encode('utf8')),
            Token(token='...'),
            Token(token='Teeth'),
        ])
    ]
    rating = Rating(
        annotator=Annotator(annotator_id=annotator_id),
        target=Target(target_id=target_id),
        mentions=['Big dog! Jake... ', u'\U0001F601 Teeth'.encode('utf8')],
    )
    add_annotation(si, rating)
    aligner_data = dict(
        tagger_id=tagger_id,
        annotator_id=annotator_id,
    )
    multi_token_match(si, aligner_data)
    assert si.body.sentences[tagger_id][0].tokens[1].labels
    assert si.body.sentences[tagger_id][0].tokens[2].labels
    assert si.body.sentences[tagger_id][0].tokens[-3].labels
    assert si.body.sentences[tagger_id][0].tokens[-2].labels
    assert si.body.sentences[tagger_id][0].tokens[-1].labels
assert si.version == Versions.v0_3_0, \
    'new streamcorpus collections should be built using the latest version'

## clean_visible is byte identical to clean_html, except all the
## tags are converted to whitespace, so offsets match
#input_html = si.body.clean_html = text.encode('utf8')
clean_visible = si.body.clean_visible.decode('utf8')

## run the text through a tagger
#tagger_output = my_tagger( clean_visible )

## to illustrate, here, we construct a single sentence of tokens
## with all the fields populated
first_sentence = Sentence()
first_sentence.tokens = [
    Token(
        token='The',
    ),
    Token(
        token='cat',
    ),
    Token(
        token='jumped',
    ),
    Token(
        token='over',
    ),
    Token(
        token='the',
    ),
    Token(
        token='car',
    ),
]
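## a hedged continuation of the example: attach the sentence to the
## StreamItem under a tagger_id, mirroring the pattern used in
## test_multi_token_match ('my_tagger' is an illustrative name)
si.body.sentences['my_tagger'] = [first_sentence]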