Example #1
0
    def tokenize_sentence(self, sentence):
        """
        Split the given text into sentences with the punkt tokenizer.

        The tokenizer for this instance's language is loaded lazily on
        first use; when no punkt data exists for that language, English
        sentence-splitting rules are used instead.
        """
        if self.sentence_tokenizer is None:
            path_template = 'tokenizers/punkt/{language}.pickle'
            try:
                tokenizer_path = path_template.format(
                    language=self.language.ENGLISH_NAME.lower()
                )
                self.sentence_tokenizer = load_data(tokenizer_path)
            except LookupError:
                # Unsupported language: fall back to English rules
                fallback_path = path_template.format(
                    language=languages.ENG.ENGLISH_NAME.lower()
                )
                self.sentence_tokenizer = load_data(fallback_path)

        return self.sentence_tokenizer.tokenize(sentence)
Example #2
0
    def get_sentence_detector(self):
        """
        Lazily load and return the English punkt sentence detector.
        """
        # Return the cached detector when it has already been loaded
        if self.sentence_detector is not None:
            return self.sentence_detector

        self.sentence_detector = load_data('tokenizers/punkt/english.pickle')
        return self.sentence_detector
Example #3
0
    def get_sentence_tokenizer(self):
        """
        Lazily load and return the punkt sentence tokenizer for this
        instance's language.
        """
        # Return the cached tokenizer when it has already been loaded
        if self.sentence_tokenizer is not None:
            return self.sentence_tokenizer

        language_name = self.language.ENGLISH_NAME.lower()
        self.sentence_tokenizer = load_data(
            'tokenizers/punkt/{language}.pickle'.format(language=language_name)
        )
        return self.sentence_tokenizer
Example #4
0
    def get_bigram_pair_string(self, text):
        """
        Convert text into a space-separated string of ``POS:word`` bigrams.

        Each bigram joins the part-of-speech tag of the previous word with
        the current word's hypernym; stopwords and single-character words
        keep the word itself instead of a hypernym.

        For example:
        What a beautiful swamp

        becomes:

        DT:beautiful JJ:wetland

        (``wetland`` being the hypernym of ``swamp``.)

        :param text: The input text to process.
        :returns: A space-separated string of bigram pairs; when any
            non-stopword bigrams exist, only those are returned.
        """
        WORD_INDEX = 0
        POS_INDEX = 1

        pos_tags = []

        sentence_detector = load_data('tokenizers/punkt/english.pickle')

        for sentence in sentence_detector.tokenize(text.strip()):

            # Remove trailing punctuation unless it is the whole sentence
            if sentence and sentence[-1] in string.punctuation:
                sentence_with_punctuation_removed = sentence[:-1]

                if sentence_with_punctuation_removed:
                    sentence = sentence_with_punctuation_removed

            words = sentence.split()

            pos_tags.extend(pos_tag(words))

        hypernyms = self.get_hypernyms(pos_tags)

        high_quality_bigrams = []
        all_bigrams = []

        word_count = len(pos_tags)

        # A single word cannot form a bigram; fall back to the word itself
        if word_count == 1:
            all_bigrams.append(pos_tags[0][WORD_INDEX].lower())

        # Hoisted out of the loop: get_stopwords() was previously called
        # once per word, repeating the same loop-invariant work.
        stop_words = self.get_stopwords()

        for index in range(1, word_count):
            word = pos_tags[index][WORD_INDEX].lower()
            previous_word_pos = pos_tags[index - 1][POS_INDEX]
            if word not in stop_words and len(word) > 1:
                bigram = previous_word_pos + ':' + hypernyms[index].lower()
                high_quality_bigrams.append(bigram)
                all_bigrams.append(bigram)
            else:
                bigram = previous_word_pos + ':' + word
                all_bigrams.append(bigram)

        # Prefer the informative (non-stopword) bigrams when any exist
        if high_quality_bigrams:
            all_bigrams = high_quality_bigrams

        return ' '.join(all_bigrams)