Example #1
import os
import sys
from time import time

from nltk.corpus import machado, stopwords
from nltk.stem import RSLPStemmer
from nltk.stem.snowball import PortugueseStemmer

# Index is a project-local inverted-index class that the page does not show;
# a minimal sketch of the interface this code assumes follows the example.


def is_stopword(word):
    """Return True if the word is an English or Russian stop word."""
    english_stopwords = stopwords.words('english')
    russian_stopwords = stopwords.words('russian')
    return word in english_stopwords or word in russian_stopwords
def create_indexes():
    """Index the machado corpus six ways: {no stemmer, Snowball, RSLP} x {keep, drop stop words}."""
    stopwords_pt = stopwords.words('portuguese')
    snowball_stemmer = PortugueseStemmer()
    rslp_stemmer = RSLPStemmer()
    # The 'with-stopwords' variants pass an empty filter list, i.e. stop words are kept.
    indexes = {'no-stemmer-with-stopwords': Index(stemmer=None, stopwords=[]),
               'no-stemmer-without-stopwords': Index(stemmer=None, stopwords=stopwords_pt),
               'snowball-with-stopwords': Index(stemmer=snowball_stemmer, stopwords=[]),
               'snowball-without-stopwords': Index(stemmer=snowball_stemmer, stopwords=stopwords_pt),
               'rslp-with-stopwords': Index(stemmer=rslp_stemmer, stopwords=[]),
               'rslp-without-stopwords': Index(stemmer=rslp_stemmer, stopwords=stopwords_pt),}
    for index_name, index in indexes.items():
        index.name = index_name
    filenames = machado.fileids()
    index_count = len(indexes)
    total_iterations = len(filenames) * index_count
    counter = 1
    for filename in filenames:
        contents = machado.raw(filename)
        for index_name, index in indexes.items():
            info = '[{:05d}/{:05d}] Adding document "{}" to index "{}" ... '\
                    .format(counter, total_iterations, filename, index_name)
            sys.stdout.write(info)
            start = time()
            index.add_document(filename, contents)
            end = time()
            sys.stdout.write('OK ({:09.5f}s)\n'.format(end - start))
            counter += 1

    if not os.path.exists('data'):
        os.mkdir('data')
    counter = 1
    for index_name, index in indexes.items():
        info = '[{:02d}/{:02d}] Dumping index "{}" ... '.format(counter,
                index_count, index_name)
        sys.stdout.write(info)
        start = time()
        index.dump('data/{}.pickle'.format(index_name))
        end = time()
        sys.stdout.write('OK ({:09.5f}s)\n'.format(end - start))
        counter += 1
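# The example above depends on an `Index` class that the page does not show.
# Everything below is a hypothetical minimal sketch of the interface the code
# assumes: tokenize a document, optionally drop stop words, optionally stem,
# keep a term -> document postings map, and pickle it to disk.
import pickle
from collections import defaultdict

from nltk.tokenize import word_tokenize


class Index:
    def __init__(self, stemmer=None, stopwords=None):
        self.stemmer = stemmer
        self.stopwords = set(stopwords or [])
        self.terms = defaultdict(set)  # term -> names of documents containing it
        self.name = None

    def add_document(self, name, contents):
        for token in word_tokenize(contents.lower(), language='portuguese'):
            if token in self.stopwords:
                continue
            if self.stemmer is not None:
                token = self.stemmer.stem(token)
            self.terms[token].add(name)

    def dump(self, path):
        with open(path, 'wb') as fp:
            pickle.dump(dict(self.terms), fp)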
Example #3
import string

from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

wl = WordNetLemmatizer()
sw = stopwords.words("english")
vocab = {}


def get_tag(treebank_tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS constant."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # Default to noun for tags WordNet does not distinguish.
        return wordnet.NOUN


def get_lem(text):
    """Lemmatize a list of (word, treebank_tag) pairs into a single string."""
    return " ".join(wl.lemmatize(word.lower(), get_tag(tag)) for word, tag in text)


def preprocess(text):
    # The source cuts the example off here; a plausible completion, assuming the
    # intent is tokenize -> drop stop words and punctuation -> POS-tag -> lemmatize:
    tokens = [t for t in word_tokenize(text)
              if t.lower() not in sw and t not in string.punctuation]
    return get_lem(pos_tag(tokens))
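# A quick smoke test of the pipeline above; the sentence is arbitrary and the
# exact lemmas depend on the tagger's choices.
if __name__ == "__main__":
    print(preprocess("The striped bats were hanging on their feet."))
    # roughly: "striped bat hang foot"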
Example #4
"""
Idea I: Let us count the stop words.
Idea II: Let us use word bigrams.
"""

# Idea I
from nltk.tokenize import wordpunct_tokenize
sentence = "Zalando SE is a European e-commerce company based in Berlin, Germany. The company follows a platform approach, offering Fashion and Lifestyle products to customers in 17 European markets. Zalando was founded in Germany in 2008. Swedish company Kinnevik is the largest owner with 32%."
tokens = wordpunct_tokenize(sentence)
print(tokens)

# Explore stop word corpus
from nltk.corpus import stopwords
print(stopwords.readme().replace("\n", " "))

# German stop words
print(stopwords.raw("german").replace("\n", " "))

# How many stop words are there for English and German combined?
print(len(stopwords.words(["english", "german"])))

# Classify language by counting stop words
language_ratios = {}
test_tokens = tokens  # reuse the tokens of the sample sentence above
test_words = [word.lower() for word in test_tokens]
test_words_set = set(test_words)

for language in stopwords.fileids():
    # For some languages it would be wise to tokenize the stop words at punctuation as well.
    stopwords_set = set(stopwords.words(language))
    common_elements = test_words_set.intersection(stopwords_set)
    language_ratios[language] = len(common_elements)
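# To finish Idea I, pick the language whose stop-word list overlaps the text the
# most; for the Zalando sentence above this should come out as English.
most_rated_language = max(language_ratios, key=language_ratios.get)
print(most_rated_language, language_ratios[most_rated_language])

# Idea II (sketch): word bigrams would give richer features than single stop
# words; NLTK exposes them directly.
from nltk import bigrams
print(list(bigrams(tokens))[:5])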