Example no. 1
from nltk.tokenize import sent_tokenize
from polyglot.base import Sequence
from polyglot.tokenize import SentenceTokenizer


def sentSegment(par, lang):
    try:
        # Primary path: NLTK's Punkt sentence tokenizer for the given language.
        sents = sent_tokenize(par, lang)
    except Exception:
        try:
            # Fallback: polyglot's sentence tokenizer. `lang_map` is assumed to
            # be defined elsewhere, mapping language names to polyglot locales.
            par_seq = Sequence(par)
            st = SentenceTokenizer(locale=lang_map[lang])
            sents = [sent for sent in st.transform(par_seq)]
        except Exception:
            return None
    return sents
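A minimal usage sketch for sentSegment (the input text is hypothetical; it assumes NLTK's punkt data is installed and that lang_map, which the example does not show, maps the language argument to a polyglot locale):

paragraph = "Dr. Smith went to Washington. He arrived on Monday."
sentences = sentSegment(paragraph, "english")
if sentences is not None:
    for s in sentences:
        print(s)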
Example no. 2
def _create_sentence_objects(self):
    '''Returns a list of Sentence objects from the raw text.'''
    sentence_objects = []
    sent_tokenizer = SentenceTokenizer(locale=self.language.code)
    seq = Sequence(self.raw)
    seq = sent_tokenizer.transform(seq)
    # seq.idx holds the character offsets of sentence boundaries, so each
    # consecutive pair delimits one sentence in seq.text.
    for start_index, end_index in zip(seq.idx[:-1], seq.idx[1:]):
        # Sentences share the same models as their parent blob
        sent = seq.text[start_index:end_index].strip()
        if not sent:
            continue
        s = Sentence(sent, start_index=start_index, end_index=end_index)
        s.detected_languages = self.detected_languages
        sentence_objects.append(s)
    return sentence_objects
Example no. 3
 def _create_sentence_objects(self):
   '''Returns a list of Sentence objects from the raw text.
   '''
   sentence_objects = []
   sent_tokenizer = SentenceTokenizer(locale=self.language.code)
   seq = Sequence(self.raw)
   seq = sent_tokenizer.transform(seq)
   for start_index, end_index in zip(seq.idx[:-1], seq.idx[1:]):
     # Sentences share the same models as their parent blob
     sent = seq.text[start_index: end_index].strip()
     if not sent: continue
     s = Sentence(sent, start_index=start_index, end_index=end_index)
     s.detected_languages = self.detected_languages
     sentence_objects.append(s)
   return sentence_objects
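Both _create_sentence_objects examples above rely on the same boundary trick: after transform, seq.idx holds cumulative character offsets, and zipping the list against itself shifted by one yields (start, end) pairs. A standalone sketch of that pattern, assuming polyglot's Sequence and SentenceTokenizer behave as in the snippets here:

from polyglot.base import Sequence
from polyglot.tokenize import SentenceTokenizer

seq = SentenceTokenizer(locale="en").transform(Sequence("One. Two. Three."))
for start, end in zip(seq.idx[:-1], seq.idx[1:]):
    # Each consecutive offset pair delimits one sentence in seq.text.
    print(seq.text[start:end].strip())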
Example no. 4
  def __init__(self, text, lang_code=None, word_tokenizer=None,
               sentiment_weighting=None, sent_tokenizer=None):
    super(Text, self).__init__(text, lang_code,
                               word_tokenizer, sentiment_weighting)

    if sent_tokenizer is not None:
        self.__sent_tokenizer = sent_tokenizer
    else:
        self.__sent_tokenizer = SentenceTokenizer(locale=self.language.code)
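A hedged usage sketch for this constructor: passing an explicit tokenizer overrides the locale-based default. The parameter names come from the snippet above, the enclosing Text class is shown in full in Example no. 7, and the sample text is hypothetical:

from polyglot.tokenize import SentenceTokenizer

custom_tok = SentenceTokenizer(locale="en")
text = Text("First sentence. Second sentence.", lang_code="en",
            sent_tokenizer=custom_tok)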
Example no. 5
from polyglot.base import Sequence
from polyglot.tokenize import SentenceTokenizer, WordTokenizer


def segment(args):
    # `args` is the argparse namespace of the surrounding CLI module;
    # `_print` is a console-safe print helper assumed to be defined there.
    lang = args.lang
    w_tokenizer = WordTokenizer(locale=lang)
    s_tokenizer = SentenceTokenizer(locale=lang)

    if args.only_sent:
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty(): _print(u'\n'.join(s_tokenizer.transform(seq)))

    elif args.only_word:
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty(): _print(u' '.join(w_tokenizer.transform(seq)))

    else:
        for l in args.input:
            seq = Sequence(l)
            sents = s_tokenizer.transform(seq)
            words = w_tokenizer.transform(seq)
            for tokenized_sent in words.split(sents):
                if not tokenized_sent.empty():
                    _print(u' '.join(tokenized_sent.tokens()))
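A sketch of how segment might be driven outside its CLI, using a plain namespace in place of the argparse result. The attribute names are taken from the function body; _print is assumed to behave like print:

from argparse import Namespace

lines = ["First sentence. Second one here.", "Another line of input."]
args = Namespace(lang="en", input=lines, only_sent=True, only_word=False)
segment(args)  # prints one sentence per line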
Example no. 6
def segment(args):
  lang  = args.lang
  w_tokenizer = WordTokenizer(locale=lang)
  s_tokenizer = SentenceTokenizer(locale=lang)

  if args.only_sent:
    for l in args.input:
      seq = Sequence(l)
      if not seq.empty(): _print(s_tokenizer.transform(seq))

  elif args.only_word:
    for l in args.input:
      seq = Sequence(l)
      if not seq.empty(): _print(w_tokenizer.transform(seq))

  else:
    for l in args.input:
      seq = Sequence(l)
      sents = s_tokenizer.transform(seq)
      words = w_tokenizer.transform(seq)
      for tokenized_sent in words.split(sents):
        if not tokenized_sent.empty():
          _print(u' '.join(tokenized_sent.tokens()))
Example no. 7
import json

from polyglot.base import Sequence
from polyglot.tokenize import SentenceTokenizer


class Text(BaseBlob):
  """.
  """

  def __init__(self, text, lang_code=None, word_tokenizer=None,
               sentiment_weighting=None, sent_tokenizer=None):
    super(Text, self).__init__(text, lang_code,
                               word_tokenizer, sentiment_weighting)

    if sent_tokenizer is not None:
        self.__sent_tokenizer = sent_tokenizer
    else:
        self.__sent_tokenizer = SentenceTokenizer(locale=self.language.code)

  def __str__(self):
    if len(self.raw) > 1000:
      return u"{}...{}".format(self.raw[:500], self.raw[-500:])
    else:
      return self.raw

  @property
  def sentences(self):
    """Return list of :class:`Sentence <Sentence>` objects."""
    return self._create_sentence_objects()

  @property
  def raw_sentences(self):
    """List of strings, the raw sentences in the blob."""
    return [sentence.raw for sentence in self.sentences]

  @property
  def serialized(self):
    """Returns a list of each sentence's dict representation."""
    return [sentence.dict for sentence in self.sentences]

  def to_json(self, *args, **kwargs):
    '''Return a json representation (str) of this blob.
    Takes the same arguments as json.dumps.
    .. versionadded:: 0.5.1
    '''
    return json.dumps(self.serialized, *args, **kwargs)

  @property
  def json(self):
    '''The json representation of this blob.
    .. versionchanged:: 0.5.1
        Made ``json`` a property instead of a method to restore backwards
        compatibility that was broken after version 0.4.0.
    '''
    return self.to_json()

  def _create_sentence_objects(self):
    '''Returns a list of Sentence objects from the raw text.
    '''
    sentence_objects = []
    seq = self.__sent_tokenizer.transform(Sequence(self.raw))
    for start_index, end_index in zip(seq.idx[:-1], seq.idx[1:]):
      # Sentences share the same models as their parent blob
      sent = seq.text[start_index: end_index].strip()

      if sent:
        s = Sentence(sent, start_index=start_index, end_index=end_index,
                     lang_code=self.language.code,
                     word_tokenizer=self.word_tokenizer,
                     sentiment_weighting=self.sentiment_weighting)

        s.detected_languages = self.detected_languages

        sentence_objects.append(s)

    return sentence_objects
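A short usage sketch for the Text class, assuming BaseBlob, Sentence, and their dependencies are importable from the same module; the sample text and lang_code are hypothetical:

text = Text("Beautiful is better than ugly. Explicit is better than implicit.",
            lang_code="en")
print(text.raw_sentences)      # raw sentence strings
print(text.to_json(indent=2))  # list of per-sentence dicts, serialized as JSON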