from nltk.tokenize import sent_tokenize
from polyglot.base import Sequence
from polyglot.tokenize import SentenceTokenizer


def sentSegment(par, lang):
    """Split a paragraph into sentences.

    Tries NLTK's sent_tokenize first, then falls back to polyglot's
    SentenceTokenizer; returns None if both fail. `lang_map` (defined
    elsewhere) is expected to map NLTK language names to locale codes.
    """
    try:
        sents = sent_tokenize(par, lang)
    except Exception:
        try:
            par_seq = Sequence(par)
            st = SentenceTokenizer(locale=lang_map[lang])
            sents = [sent for sent in st.transform(par_seq)]
        except Exception:
            return None
    return sents
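# A minimal usage sketch for sentSegment(). It assumes NLTK's punkt data is
# installed; `lang_map` below is a hypothetical stand-in for the real mapping
# from NLTK language names to polyglot locale codes.
lang_map = {'english': 'en', 'german': 'de'}  # assumed values, for illustration

paragraph = u"Dr. Smith arrived this morning. He left an hour later."
sents = sentSegment(paragraph, 'english')
print(sents)  # a list of sentence strings, or None if both tokenizers fail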
def segment(args):
    lang = args.lang
    w_tokenizer = WordTokenizer(locale=lang)
    s_tokenizer = SentenceTokenizer(locale=lang)
    if args.only_sent:
        # One sentence per output line.
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty():
                _print(u'\n'.join(s_tokenizer.transform(seq)))
    elif args.only_word:
        # Space-separated word tokens for each input line.
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty():
                _print(u' '.join(w_tokenizer.transform(seq)))
    else:
        # Both: split the word sequence at sentence boundaries and print
        # each sentence as one line of space-separated tokens.
        for l in args.input:
            seq = Sequence(l)
            sents = s_tokenizer.transform(seq)
            words = w_tokenizer.transform(seq)
            for tokenized_sent in words.split(sents):
                if not tokenized_sent.empty():
                    _print(u' '.join(tokenized_sent.tokens()))
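# segment() only assumes an `args` object exposing .lang, .only_sent,
# .only_word, and .input (an iterable of lines). A minimal argparse wiring
# that satisfies that contract could look like this sketch; the flag names
# are assumptions for illustration, not an actual CLI.
import argparse

parser = argparse.ArgumentParser(description="Sentence/word segmentation.")
parser.add_argument("--lang", default="en", help="locale code for the tokenizers")
mode = parser.add_mutually_exclusive_group()
mode.add_argument("--only-sent", dest="only_sent", action="store_true",
                  help="print one sentence per line")
mode.add_argument("--only-word", dest="only_word", action="store_true",
                  help="print space-separated word tokens per line")
parser.add_argument("input", type=argparse.FileType("r"),
                    help="file whose lines are segmented")

if __name__ == "__main__":
    segment(parser.parse_args())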
def _create_sentence_objects(self):
    '''Returns a list of Sentence objects from the raw text.'''
    sentence_objects = []
    sent_tokenizer = SentenceTokenizer(locale=self.language.code)
    seq = Sequence(self.raw)
    seq = sent_tokenizer.transform(seq)
    for start_index, end_index in zip(seq.idx[:-1], seq.idx[1:]):
        # Sentences share the same models as their parent blob
        sent = seq.text[start_index:end_index].strip()
        if not sent:
            continue
        s = Sentence(sent, start_index=start_index, end_index=end_index)
        s.detected_languages = self.detected_languages
        sentence_objects.append(s)
    return sentence_objects
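# The zip(seq.idx[:-1], seq.idx[1:]) idiom above pairs consecutive character
# offsets into (start, end) slices. A self-contained toy version of the same
# pattern, with hand-picked offsets standing in for seq.idx:
text = "First piece. Second piece here."
idx = [0, 13, 31]  # boundary offsets: start of text, end of each sentence
for start, end in zip(idx[:-1], idx[1:]):
    print(repr(text[start:end].strip()))
# 'First piece.'
# 'Second piece here.'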
class Text(BaseBlob):
    """A piece of raw text, lazily split into :class:`Sentence <Sentence>` objects."""

    def __init__(self, text, lang_code=None, word_tokenizer=None,
                 sentiment_weighting=None, sent_tokenizer=None):
        super(Text, self).__init__(text, lang_code, word_tokenizer,
                                   sentiment_weighting)
        if sent_tokenizer is not None:
            self.__sent_tokenizer = sent_tokenizer
        else:
            self.__sent_tokenizer = SentenceTokenizer(locale=self.language.code)

    def __str__(self):
        if len(self.raw) > 1000:
            return u"{}...{}".format(self.raw[:500], self.raw[-500:])
        else:
            return self.raw

    @property
    def sentences(self):
        """Return list of :class:`Sentence <Sentence>` objects."""
        return self._create_sentence_objects()

    @property
    def raw_sentences(self):
        """List of strings, the raw sentences in the blob."""
        return [sentence.raw for sentence in self.sentences]

    @property
    def serialized(self):
        """Returns a list of each sentence's dict representation."""
        return [sentence.dict for sentence in self.sentences]

    def to_json(self, *args, **kwargs):
        '''Return a json representation (str) of this blob.
        Takes the same arguments as json.dumps.

        .. versionadded:: 0.5.1
        '''
        return json.dumps(self.serialized, *args, **kwargs)

    @property
    def json(self):
        '''The json representation of this blob.

        .. versionchanged:: 0.5.1
            Made ``json`` a property instead of a method to restore backwards
            compatibility that was broken after version 0.4.0.
        '''
        return self.to_json()

    def _create_sentence_objects(self):
        '''Returns a list of Sentence objects from the raw text.'''
        sentence_objects = []
        seq = self.__sent_tokenizer.transform(Sequence(self.raw))
        for start_index, end_index in zip(seq.idx[:-1], seq.idx[1:]):
            # Sentences share the same models as their parent blob
            sent = seq.text[start_index:end_index].strip()
            if sent:
                s = Sentence(sent, start_index=start_index, end_index=end_index,
                             lang_code=self.language.code,
                             word_tokenizer=self.word_tokenizer,
                             sentiment_weighting=self.sentiment_weighting)
                s.detected_languages = self.detected_languages
                sentence_objects.append(s)
        return sentence_objects
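# Illustrative use of the Text class above. It assumes BaseBlob, Sentence,
# and the tokenizer models for the detected language are available, and that
# Sentence keeps the start_index/end_index it is given (as polyglot's does).
text = Text(u"Beautiful is better than ugly. Explicit is better than implicit.")
for s in text.sentences:
    print(s.start_index, s.end_index, s.raw)
print(text.raw_sentences)  # the same sentences as plain strings
print(text.json)           # JSON array of per-sentence dict representations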