def text2sents(text, lemmatize=False, stemmer=None):
    """
    converts a text into a list of sentences consisted of normalized words
    :param text: list of string to process
    :param lemmatize: if true, words will be lemmatized, otherwise -- stemmed
    :param stemmer: stemmer to be used, if None, PortedStemmer is used. Only applyed if lemmatize==False
    :return: list of lists of words
    """
    sents = sent_tokenize(text)

    tokenizer = RegexpTokenizer(r'\w+')

    if lemmatize:
        normalizer = WordNetLemmatizer()
        tagger = PerceptronTagger()
    elif stemmer is None:
        normalizer = PorterStemmer()
    else:
        normalizer = stemmer

    sents_normalized = []

    for sent in sents:
        sent_tokenized = tokenizer.tokenize(sent)
        if lemmatize:
            sent_tagged = tagger.tag(sent_tokenized)
            sent_normalized = [normalizer.lemmatize(w[0], get_wordnet_pos(w[1])) for w in sent_tagged]
        else:
            sent_normalized = [normalizer.stem(w) for w in sent_tokenized]

        sents_normalized.append(sent_normalized)
    return sents_normalized
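The snippet above assumes the usual NLTK imports (sent_tokenize, RegexpTokenizer, WordNetLemmatizer, PorterStemmer, PerceptronTagger) and a get_wordnet_pos helper that is not shown. A minimal sketch of such a helper, using the standard Penn Treebank to WordNet tag mapping, might look like this:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # map a Penn Treebank tag (as produced by PerceptronTagger) to the
    # WordNet POS constant expected by WordNetLemmatizer.lemmatize()
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN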
def get_tagger():
    """
    Returns a PerceptronTagger instance.
    """
    return PerceptronTagger()
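A brief usage sketch of the returned tagger on a pre-tokenized sentence (the output shown is indicative):

tagger = get_tagger()
print(tagger.tag(['This', 'is', 'a', 'sentence']))
# e.g. [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sentence', 'NN')]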
Example #3
    def pos_tag_reviews(records):
        print('%s: tag reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        tagger = PerceptronTagger()

        for record in records:
            tagged_words =\
                nlp_utils.tag_words(record[Constants.TEXT_FIELD], tagger)
            record[Constants.POS_TAGS_FIELD] = tagged_words
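Here nlp_utils and Constants are project-specific modules that are not shown. As a rough, hypothetical sketch (not the project's actual code), tag_words presumably tokenizes the review text and runs the shared tagger over it:

from nltk.tokenize import word_tokenize

def tag_words(text, tagger):
    # hypothetical stand-in for nlp_utils.tag_words: tokenize, then POS-tag
    return tagger.tag(word_tokenize(text))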
Example #4
def clean_text(text,
               stopwords,
               remove_stopwords=True,
               pos_filtering=False,
               stemming=True,
               lower_case=True):
    if lower_case:
        # convert to lower case
        text = text.lower()
    # strip extra white space
    text = re.sub(' +', ' ', text)
    # strip leading and trailing white space
    text = text.strip()
    # tokenize (split based on whitespace)
    tokens = text.split(' ')

    # remove punctuation
    tokens = [t for t in tokens if t not in string.punctuation]

    if pos_filtering:
        tagger = PerceptronTagger()
        # apply POS-tagging
        tagged_tokens = tagger.tag(tokens)
        # retain only nouns, adjectives, and verbs
        tokens = [
            item[0] for item in tagged_tokens if item[1] in [
                'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR', 'VB', 'VBD',
                'VBG', 'VBN', 'VBP', 'VBZ'
            ]
        ]
    if remove_stopwords:
        # remove stopwords
        tokens = [token for token in tokens if token.lower() not in stopwords]
    if stemming:
        stemmer = nltk.stem.PorterStemmer()
        # apply Porter's stemmer
        tokens_stemmed = list()
        for token in tokens:
            tokens_stemmed.append(stemmer.stem(token))
        tokens = tokens_stemmed

    return tokens
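A usage sketch of clean_text, assuming NLTK's English stopword list is available locally:

from nltk.corpus import stopwords as nltk_stopwords

tokens = clean_text('The quick brown foxes are jumping over the lazy dog',
                    stopwords=set(nltk_stopwords.words('english')),
                    pos_filtering=True)
print(tokens)  # roughly ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']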
Example #5
class Syntax(Characteristic):
    """
    Accounts for the syntactic aspects of the source text.
        Word-for-word:  Parses and stores part-of-speech (POS) tags.
        Entire text:    Enumerates all configurations of clause and sentence found in
                        the text.
    """
    POS_TAG = PerceptronTagger()

    def __init__(self, tokens):
        Characteristic.__init__(self)
        self.tokens = tokens

    def pos_tag(self):
        pass

    def find_clauses(self):
        pass

    def find_sentences(self):
        pass
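The methods above are stubs; a minimal sketch of how pos_tag could use the shared class-level tagger, assuming the constructor stores its tokens argument as self.tokens, might be:

    def pos_tag(self):
        # assumption: self.tokens is a list of word strings
        # tag the stored tokens with the class-level PerceptronTagger
        self.pos_tags = Syntax.POS_TAG.tag(self.tokens)
        return self.pos_tags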
Example #6
if domain == 'meeting':
    path_to_stopwords = path_to_root + 'resources/stopwords/meeting/stopwords.' + language + '.dat'
    path_to_filler_words = path_to_root + 'resources/stopwords/meeting/filler_words.' + language + '.txt'
    stopwords = utils.load_stopwords(path_to_stopwords)
    filler_words = utils.load_filler_words(path_to_filler_words)

    if dataset_id == 'ami':
        ids = meeting_lists.ami_development_set + meeting_lists.ami_test_set
    elif dataset_id == 'icsi':
        ids = meeting_lists.icsi_development_set + meeting_lists.icsi_test_set

if language == 'en':
    path_to_word2vec_keys = path_to_root + 'resources/word2vec_keys.txt'
# tokenizer = DictionaryTokenizer(path_to_word2vec_keys) # highly time-consuming
# tokenizer = TweetTokenizer()
tagger = PerceptronTagger()

# ######################
# ### CORPUS LOADING ###
# ######################
corpus = {}
for id in ids:
    if domain == 'meeting':
        if dataset_id == 'ami' or dataset_id == 'icsi':
            if source == 'asr':
                path = path_to_root + 'data/meeting/' + dataset_id + '/' + id + '.da-asr'
            elif source == 'manual':
                path = path_to_root + 'data/meeting/' + dataset_id + '/' + id + '.da'
            # filler words will be removed during corpus loading
            corpus[id] = utils.read_ami_icsi(path, filler_words)
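Later steps of this script presumably run the tagger built above over the loaded corpus; the structure returned by utils.read_ami_icsi is not shown here, so the following is only a hedged sketch assuming each corpus entry can be treated as a list of token lists:

# hypothetical follow-up step (not part of the original script)
tagged_corpus = {}
for meeting_id, utterances in corpus.items():
    # assumption: each utterance is a list of token strings
    tagged_corpus[meeting_id] = [tagger.tag(utterance) for utterance in utterances]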