def tokenize_sentence(self, sentence):
    """
    Split ``sentence`` into individual sentences with the Punkt tokenizer.

    The tokenizer for this instance's configured language is loaded lazily
    on first use and cached on ``self.sentence_tokenizer``. If no Punkt
    model exists for that language, English sentence-splitting rules are
    used as a fallback.
    """
    if self.sentence_tokenizer is None:
        pickle_template = 'tokenizers/punkt/{language}.pickle'
        try:
            self.sentence_tokenizer = load_data(pickle_template.format(
                language=self.language.ENGLISH_NAME.lower()
            ))
        except LookupError:
            # No Punkt data shipped for this language; default to English.
            self.sentence_tokenizer = load_data(pickle_template.format(
                language=languages.ENG.ENGLISH_NAME.lower()
            ))
    return self.sentence_tokenizer.tokenize(sentence)
def get_sentence_detector(self):
    """
    Return the English Punkt sentence detector, loading it on first access
    and caching it on the instance thereafter.
    """
    detector = self.sentence_detector
    if detector is None:
        detector = load_data('tokenizers/punkt/english.pickle')
        self.sentence_detector = detector
    return detector
def get_sentence_tokenizer(self):
    """
    Lazily load and return the Punkt sentence tokenizer for this
    instance's configured language, caching it after the first call.
    """
    if self.sentence_tokenizer is None:
        language_name = self.language.ENGLISH_NAME.lower()
        self.sentence_tokenizer = load_data(
            'tokenizers/punkt/{language}.pickle'.format(language=language_name)
        )
    return self.sentence_tokenizer
def get_bigram_pair_string(self, text):
    """
    Convert ``text`` into a space-joined string of POS/hypernym bigrams.

    Each bigram pairs the previous word's part-of-speech tag with the
    current word's hypernym (lower-cased) when the current word is a
    non-stopword longer than one character; otherwise the literal word is
    used. If any hypernym ("high quality") bigrams were produced, only
    those are kept.

    Note the example output uses the hypernym of "swamp":

    For example:
    What a beautiful swamp
    becomes:
    DT:beautiful JJ:wetland
    """
    WORD_INDEX = 0
    POS_INDEX = 1

    pos_tags = []
    sentence_detector = load_data('tokenizers/punkt/english.pickle')

    for sentence in sentence_detector.tokenize(text.strip()):

        # Remove trailing punctuation, unless the sentence is only
        # punctuation (keep it rather than emit an empty sentence).
        if sentence and sentence[-1] in string.punctuation:
            sentence_with_punctuation_removed = sentence[:-1]
            if sentence_with_punctuation_removed:
                sentence = sentence_with_punctuation_removed

        words = sentence.split()

        pos_tags.extend(pos_tag(words))

    # Hypernyms are looked up once for the full tag sequence; indexed in
    # lockstep with pos_tags below.
    hypernyms = self.get_hypernyms(pos_tags)

    high_quality_bigrams = []
    all_bigrams = []

    word_count = len(pos_tags)

    # A single word cannot form a bigram; fall back to the bare word.
    if word_count == 1:
        all_bigrams.append(pos_tags[0][WORD_INDEX].lower())

    # Hoisted out of the loop: the stopword collection is invariant for
    # the duration of this call.
    stopwords = self.get_stopwords()

    for index in range(1, word_count):
        word = pos_tags[index][WORD_INDEX].lower()
        previous_word_pos = pos_tags[index - 1][POS_INDEX]
        if word not in stopwords and len(word) > 1:
            bigram = previous_word_pos + ':' + hypernyms[index].lower()
            high_quality_bigrams.append(bigram)
            all_bigrams.append(bigram)
        else:
            bigram = previous_word_pos + ':' + word
            all_bigrams.append(bigram)

    # Prefer the hypernym-based bigrams when any exist.
    if high_quality_bigrams:
        all_bigrams = high_quality_bigrams

    return ' '.join(all_bigrams)