Example #1
File: c03_re.py Project: AkiraKane/Python
import re
import nltk

def extractWordPieces():

    word = 'supercalifragilisticexpialidocious'
    re.findall(r'[aeiou]', word)
    len(re.findall(r'[aeiou]', word))

    wsj = sorted(set(nltk.corpus.treebank.words()))
    fd = nltk.FreqDist(vs for word in wsj
            for vs in re.findall(r'[aeiou]{2,}', word))
    fd.items()

    regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
    def compress(word):
        pieces = re.findall(regexp, word)
        return ''.join(pieces)

    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))


    rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
    cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
    cfd = nltk.ConditionalFreqDist(cvs)
    cfd.tabulate()


    cv_word_pairs = [(cv, w) for w in rotokas_words
            for cv in re.findall(r'[ptksvr][aeiou]', w)]
    cv_index = nltk.Index(cv_word_pairs)
    cv_index['su']
    cv_index['po']
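
For intuition, here is the compress() logic above as a minimal standalone sketch (no corpus needed); the outputs in the comments are worked out by hand from the regex, not captured from a run:

import re

regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

def compress(word):
    # keep leading vowel runs, trailing vowel runs, and every consonant;
    # word-internal vowels match none of the three branches and are dropped
    return ''.join(re.findall(regexp, word))

print(compress('Universal'))    # Unvrsl
print(compress('declaration'))  # dclrtn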
Example #2
def Analysis(corpus):
    raw = open(corpus).read()  # universal newlines are the default in Python 3; "rU" is obsolete

    words, sents, text = Tokenize(raw)

    random_sentence = randomSentence(sents)

    fdist, cfd, top_words = FreqDists(words)

    print(nltk.tokenwrap(Przm(w) for w in words[:20]))
    print(fdist)
    cfd.tabulate()
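
Tokenize, randomSentence, FreqDists and Przm come from the scraped project and are not shown. A plausible minimal sketch of the first three, assuming they behave as their names and call sites suggest (these definitions are guesses, not the project's code; Przm is omitted because its behaviour cannot be inferred from the name):

import random
import nltk

def Tokenize(raw):
    # hypothetical: word tokens, sentences, and an nltk.Text wrapper
    words = nltk.word_tokenize(raw)
    sents = nltk.sent_tokenize(raw)
    return words, sents, nltk.Text(words)

def randomSentence(sents):
    # hypothetical: pick one sentence at random
    return random.choice(sents)

def FreqDists(words):
    # hypothetical: overall frequencies, a word-length conditional
    # distribution, and the most common words
    fdist = nltk.FreqDist(w.lower() for w in words)
    cfd = nltk.ConditionalFreqDist((len(w), w.lower()) for w in words)
    return fdist, cfd, fdist.most_common(10)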
Example #3
def regex_compress():
    def compress(word):
        pieces = re.findall(regexp, word)
        return ''.join(pieces)

    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
    return nltk.tokenwrap(compress(w) for w in english_udhr[:75])
Example #4
    def generate(self, length=100):
        """Generate `length` tokens from a randomly parameterised n-gram model."""
        # Tokenize a randomly chosen entry from self.__words
        self.tokens = nltk.word_tokenize(
            self.__words[randint(1, len(self.__words)) - 1])

        # NOTE: nltk.NgramModel exists only in NLTK 2.x; it was removed in NLTK 3
        def estimator(fdist, bins):
            return nltk.LidstoneProbDist(fdist, self.__random.random())
        #estimator = lambda fdist, bins: nltk.LidstoneProbDist(fdist, 0.2)
        self._trigram_model = nltk.NgramModel(self.__random.randint(3, 15),
                                              self, estimator)
        #self._trigram_model = nltk.NgramModel(3, self, estimator)
        text = self._trigram_model.generate(length)
        return nltk.tokenwrap(text)
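
Since nltk.NgramModel is gone in NLTK 3, a rough equivalent under the current nltk.lm API might look like the sketch below; the Lidstone gamma of 0.2 mirrors the commented-out estimator, and the corpus argument is a stand-in (an approximation, not a drop-in replacement):

import nltk
from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline

def generate_text(sentences, n=3, length=100):
    # sentences: a list of token lists, e.g. nltk.corpus.gutenberg.sents(...)
    train, vocab = padded_everygram_pipeline(n, sentences)
    lm = Lidstone(0.2, n)
    lm.fit(train, vocab)
    return nltk.tokenwrap(lm.generate(length, random_seed=42))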
Example #5
nacute = '\u0144'   # as in the NLTK book: LATIN SMALL LETTER N WITH ACUTE
nacute_utf = nacute.encode('utf8')
print(repr(nacute_utf))

# Regular expressions
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
[w for w in wordlist if re.search('ed$', w)]
[w for w in wordlist if re.search('^..j..t..$', w)]
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]

regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))
# Stemming

def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
# Non-greedy
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
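
The greedy vs. non-greedy difference is easiest to see on concrete outputs; the results in the comments can be verified by hand:

import re

re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')
# [('processe', 's')]   greedy .* keeps as much as it can
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')
# [('process', 'es')]   non-greedy .*? yields to the suffix group as early as possible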
Example #6
def compress_vowels():
    # keep initial vowel sequences, final vowel sequences, and consonants;
    # everything else (word-internal vowels) is removed
    regex = r"^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]"
    english_udhr = nltk.corpus.udhr.words("English-Latin1")
    print(nltk.tokenwrap([compress(regex, w) for w in english_udhr[:75]]))
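
Unlike the one-argument versions above, this project's compress takes the pattern as a parameter. The helper is not included in the snippet; a hypothetical reconstruction consistent with the call site:

import re

def compress(regex, word):
    # hypothetical: join whatever the supplied pattern keeps
    return ''.join(re.findall(regex, word))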
Example #7
# Locate the start of the Project Gutenberg footer in a previously loaded string
raw.rfind('End of Project')

# regular expression
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
[w for w in wordlist if re.search('ed$', w) and len(w) == 3]
[w for w in wordlist if re.search('ed$', w) and len(w) == 4]
[w for w in wordlist if re.search('^..j..t..$', w)]
[w for w in wordlist if re.search('^[abdc][efgh][ijkl]$', w)]

regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:100]))

# suffix, stemming
re.findall(r'^.*(ing|ly|ed|ious|ive|es|s|ment)$', 'processing')
re.findall(r'^.*(?:ing|ly|ed|ious|ive|es|s|ment)$', 'processing')
re.findall(r'^(.*)(ing|ly|ed|ious|ive|es|s|ment)$', 'processes')
re.findall(r'^(.*?)(ing|ly|ed|ious|ive|es|s|ment)?$', 'processes')

def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
stem('processing')

# searching tokenized text
moby = nltk.Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a>(<.*>)<man>")
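
In Text.findall, angle brackets delimit tokens: each <...> matches exactly one word, and the parenthesised group is what gets printed. A tiny self-contained sketch (the output in the comment is what I would expect, not captured from a run):

import nltk

text = nltk.Text('a monied man and a nervous man'.split())
text.findall(r"<a> (<.*>) <man>")   # expected: monied; nervous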
Example #8
File: chapter3.py Project: hbdhj/python
def testCompress():
    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))
Example #9
def lossy_compression(self):
    pattern = '^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
    print(nltk.tokenwrap(self.compress(pattern, w) for w in self.wordlist))
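
This method assumes an enclosing class that supplies self.compress and self.wordlist; a minimal hypothetical wrapper, just enough for the method to run:

import re
import nltk

class Compressor:
    # hypothetical context: none of this comes from the scraped project
    def __init__(self, words):
        self.wordlist = words

    def compress(self, pattern, word):
        return ''.join(re.findall(pattern, word))

    def lossy_compression(self):
        pattern = '^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
        print(nltk.tokenwrap(self.compress(pattern, w) for w in self.wordlist))

Usage might be: Compressor(nltk.corpus.udhr.words('English-Latin1')[:75]).lossy_compression()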