def test_document(self):
    """Test basic use of Tipster's Document class; detailed tests TODO."""
    # We use a naive text sample here and a primitive PunktSentenceTokenizer
    # to split the sentences; a more advanced sentence splitter would be
    # needed in practice to handle abbreviations.
    sample_txt = 'This is a sample text. It has two sentences.'
    true_toks = 'This is a sample text . It has two sentences .'.split()

    # add sentence annotations
    doc = Document(sample_txt)
    senttokenizer = PunktSentenceTokenizer()
    sents = senttokenizer.tokenize(sample_txt)
    sent_spans = senttokenizer.span_tokenize(sample_txt)
    for sent, sent_span in zip(sents, sent_spans):
        doc.annotate(sent_span[0], sent_span[1], 'sentence')

    # add token annotations
    treebanktokenizer = TreebankWordTokenizer()
    # use the sentence annotations to retrieve each sentence
    for sent in doc.get_annotations_by_type('sentence'):
        toks = treebanktokenizer.tokenize(doc.get_text_by_ann(sent))
        spans = treebanktokenizer.span_tokenize(doc.get_text_by_ann(sent))
        sent_base = sent.span.start
        for tok, tok_span in zip(toks, spans):
            # token spans are sentence-relative; shift into document coordinates
            span = Span(tok_span[0] + sent_base, tok_span[1] + sent_base)
            doc.annotate(span.start, span.end, 'token')

    # check that all tokens are correct
    for tok, true_tok in zip(doc.get_annotations_by_type('token'), true_toks):
        self.assertEqual(doc.get_text_by_ann(tok), true_tok)
import re
from typing import List

from nltk.tokenize import PunktSentenceTokenizer


def sentence_tokenize(text: str, include_spans: bool = False) -> List:
    """
    Args:
        text (str): a unicode string
        include_spans (bool): whether to return Span objects instead of strings

    Returns:
        list: tokenized sentences. If include_spans is True, each item is a
        Span object carrying both the sentence text and its start_index in
        the original string; otherwise each item is the sentence text only.
    """
    lines = re.split('\n', text)
    sentences = []
    tokenizer = PunktSentenceTokenizer()
    total_offset = 0
    for line in lines:
        line_start = total_offset
        for sentence_start, sentence_end in tokenizer.span_tokenize(line):
            sentence = line[sentence_start:sentence_end]
            if include_spans:
                # Span is a project-specific container with `text` and
                # `start_index` fields.
                sentences.append(
                    Span(text=sentence, start_index=line_start + sentence_start))
            else:
                sentences.append(sentence)
            total_offset = line_start + sentence_end
        # account for the newline character removed by the split above
        total_offset += 1
    return sentences
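# Usage sketch (assumption: sentence_tokenize above is importable as written;
# with include_spans=False the project-specific Span class is never exercised):
demo = "This is one sentence. Here is another.\nA new line forces a split."
for s in sentence_tokenize(demo):
    print(s)
# expected: three sentences printed, one per line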
class Parse(Service):

    def __init__(self, langs=[]):
        Service.__init__(self, 'parse', 'nltk', [])
        self.punktSentenceTokenizer = PunktSentenceTokenizer()
        self.treebankWordTokenizer = TreebankWordTokenizer()

    def run(self, request, response):
        if request['lang'] == 'en':
            text = request['text']
            debug = request.get('debug', False)
            result = []
            for sent_s, sent_e in self.punktSentenceTokenizer.span_tokenize(text):
                tokens = []
                sentence = text[sent_s:sent_e]
                for token_s, token_e in self.treebankWordTokenizer.span_tokenize(sentence):
                    # token offsets are sentence-relative; shift them back
                    # into document coordinates
                    item = {'start': token_s + sent_s, 'end': token_e + sent_s}
                    if debug:
                        item['text'] = sentence[token_s:token_e]
                    tokens.append(item)
                result.append(tokens)
            return result
        else:
            raise MissingLanguage(request['lang'])

    def describe(self):
        result = super().describe()
        result['langs'] = ['en']
        result['models'] = {'en': {'pretrained': True}}
        return result
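# Shape of the value returned by Parse.run for lang='en' (derived from the
# loops above): one list per sentence, each holding token span dicts, e.g.
#   [[{'start': 0, 'end': 4}, {'start': 5, 'end': 7}, ...],   # sentence 1
#    [{'start': 20, 'end': 24}, ...]]                         # sentence 2
# With debug=True each dict additionally carries a 'text' key with the token
# string sliced from the sentence.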
class SimpleTokenizer(BaseTokenizer):

    def __init__(self, doc, tok_anno_name='token', sent_anno_name='sentence'):
        self.doc = doc
        self.tok_anno_name = tok_anno_name
        super(SimpleTokenizer, self).__init__(self.doc, self.tok_anno_name)
        self.sent_anno_name = sent_anno_name
        self.sentTokenizer = PunktSentenceTokenizer()
        self.treebankTokenizer = TreebankWordTokenizer()

    def tokenize(self, start_pos=0, text=None):
        if text is None:
            # if no text is given, assume it spans to the end of the document
            text = self.doc.get_text_by_span(
                Span(start_pos, self.doc.get_length()))

        # add sentence annotations
        sents = self.sentTokenizer.tokenize(text)
        sent_spans = self.sentTokenizer.span_tokenize(text)
        for sent, sent_span in zip(sents, sent_spans):
            self.doc.annotate(start_pos + sent_span[0],
                              start_pos + sent_span[1],
                              self.sent_anno_name)

        # use the sentence annotations to retrieve each sentence
        for sent in self.doc.get_annotations_by_type(self.sent_anno_name):
            toks = self.treebankTokenizer.tokenize(
                self.doc.get_text_by_ann(sent))
            spans = self.treebankTokenizer.span_tokenize(
                self.doc.get_text_by_ann(sent))
            sent_base = sent.span.start
            for tok, tok_span in zip(toks, spans):
                span = Span(tok_span[0] + sent_base, tok_span[1] + sent_base)
                self.doc.annotate(span.start, span.end, self.tok_anno_name)
def __get_setences_boundaries(self):
    """
    Tokenize the text into sentences and return the character boundaries
    of each sentence.

    :return: list of (start, end) tuples, one per sentence
    """
    tokenizer = PunktSentenceTokenizer()
    sentences = list(tokenizer.span_tokenize(self.text))
    return sentences
from nltk.tokenize import PunktSentenceTokenizer


class DefaultSentenceTokenizer(object):

    def __init__(self):
        self.tokenizer = PunktSentenceTokenizer()

    def tokenize_sents(self, text):
        """Return (start, end) character spans, one per sentence."""
        return self.tokenizer.span_tokenize(text)
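# Usage sketch for DefaultSentenceTokenizer: span_tokenize yields (start, end)
# character offsets, so each sentence can be sliced back out of the original text.
tok = DefaultSentenceTokenizer()
text = "First sentence here. Second sentence follows."
for start, end in tok.tokenize_sents(text):
    print(start, end, repr(text[start:end]))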
from nltk.tokenize import PunktSentenceTokenizer


def nltkSentenceSplit(document):
    """Split a document into sentences, keyed by their start offset."""
    sentenceTokenizer = PunktSentenceTokenizer()
    sentences = sentenceTokenizer.span_tokenize(document)
    sentenceDict = dict()
    for beginSpan, endSpan in sentences:
        sentenceDict[beginSpan] = document[beginSpan:endSpan]
    return sentenceDict
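# Usage sketch for nltkSentenceSplit: the returned dict maps each sentence's
# start offset in the document to the sentence text.
doc = "Offsets are preserved. Each key is a character position."
for offset, sentence in sorted(nltkSentenceSplit(doc).items()):
    print(offset, sentence)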
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer


class SpanTokenizer:
    """A custom, more detailed tokenizer built on the NLTK toolkit."""

    def __init__(self):
        self.tokenizer_big = PunktSentenceTokenizer()
        self.tokenizer_small = TreebankWordTokenizer()

    def tokenize(self, text):
        result = []
        sentences_span = self.tokenizer_big.span_tokenize(text)
        for start, end in sentences_span:
            sentence = text[start:end]
            tokens_span = self.tokenizer_small.span_tokenize(sentence)
            for token_start, token_end in tokens_span:
                # token spans are sentence-relative; shift them back into
                # document coordinates
                result.append([start + token_start, start + token_end])
        return result
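# Usage sketch for SpanTokenizer: each [start, end] pair indexes directly into
# the original string, even across multiple sentences.
tokenizer = SpanTokenizer()
text = "Spans are absolute. Slicing recovers each token."
for start, end in tokenizer.tokenize(text):
    print(start, end, text[start:end])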
def main():
    conn = get_connection(UNICODE=True)
    curr = conn.cursor()
    tokenizer = TreebankWordTokenizer()

    while True:
        curr.execute("""SELECT id, text, language FROM documents
                        WHERE --guid='tw:122144569302323201'
                        EXISTS (
                            SELECT 1 FROM instances
                            WHERE item_id=documents.id AND begintoken IS NULL)
                        LIMIT 1""")
        data = curr.fetchone()
        if data is None:
            print "sleep"
            timer.sleep_minute(30)
            continue

        id, text, lang = data
        print "id", id
        curr.execute("""SELECT * FROM instances
                        WHERE item_id = %s AND begintoken IS NULL""", (id,))
        # throw away `confidence`
        instances = [list(x)[:-1] for x in curr]
        if not len(instances):
            continue

        # reset sid/begintoken/endtoken before recomputing them
        instance_ = []
        for ins in instances:
            ins[-1] = None
            ins[-2] = None
            ins[-3] = None
            instance_.append(ins)
        instances = instance_

        sent_tok = PunktSentenceTokenizer()
        for sid, sentidx in enumerate(sent_tok.span_tokenize(text)):
            sentence = text[sentidx[0]:sentidx[1]]
            for pos, indexes in enumerate(
                    WhitespaceTokenizer().span_tokenize(sentence)):
                # TODO: word indexes are only relative to the sentence,
                # but `instances` stores offsets from the start of the document!
                indexes = list(indexes)
                indexes[0] = sentidx[0] + indexes[0]
                indexes[1] = sentidx[0] + indexes[1]
                word = text[indexes[0]:indexes[1]]
                for i, instance in enumerate(instances):
                    id, entity_id, item_id, exact, offset, length, sid_, begin, end = instance
                    if sid_ is None:
                        if begin is None:
                            if offset >= indexes[0] and offset <= indexes[1]:
                                instances[i][-2] = begin = pos
                                instances[i][-3] = sid_ = sid
                    if sid_ == sid:
                        if end is None and begin is not None:
                            off = offset + length
                            if off <= indexes[1] and off >= indexes[0]:
                                instances[i][-1] = pos
                            if off == indexes[0]:
                                instances[i][-1] = pos - 1

        for instance in instances:
            print instance
            id, entity_id, item_id, exact, offset, length, sid, begin, end = instance
            if end is None:
                if not " " in exact:
                    end = begin
                else:
                    end = -1
            curr.execute("""UPDATE instances SET sid=%s, begintoken=%s, endtoken=%s
                            WHERE id=%s""", (sid, begin, end, id))