Example #1
import os
import sys
from time import time

from nltk.corpus import machado, stopwords
from nltk.stem import RSLPStemmer
from nltk.stem.snowball import PortugueseStemmer

# Index is a project-local inverted-index class that the page does not show;
# a minimal sketch of the interface this code assumes follows the example.


def is_stopword(word):
    """Return True if the word is an English or Russian stop word."""
    english_stopwords = stopwords.words('english')
    russian_stopwords = stopwords.words('russian')
    return word in english_stopwords or word in russian_stopwords
def create_indexes():
    """Index the machado corpus six ways: {no stemmer, Snowball, RSLP} x {keep, drop stop words}."""
    stopwords_pt = stopwords.words('portuguese')
    snowball_stemmer = PortugueseStemmer()
    rslp_stemmer = RSLPStemmer()
    # The 'with-stopwords' variants pass an empty filter list, i.e. stop words are kept.
    indexes = {'no-stemmer-with-stopwords': Index(stemmer=None, stopwords=[]),
               'no-stemmer-without-stopwords': Index(stemmer=None, stopwords=stopwords_pt),
               'snowball-with-stopwords': Index(stemmer=snowball_stemmer, stopwords=[]),
               'snowball-without-stopwords': Index(stemmer=snowball_stemmer, stopwords=stopwords_pt),
               'rslp-with-stopwords': Index(stemmer=rslp_stemmer, stopwords=[]),
               'rslp-without-stopwords': Index(stemmer=rslp_stemmer, stopwords=stopwords_pt),}
    for index_name, index in indexes.items():
        index.name = index_name
    filenames = machado.fileids()
    index_count = len(indexes)
    total_iterations = len(filenames) * index_count
    counter = 1
    for filename in filenames:
        contents = machado.raw(filename)
        for index_name, index in indexes.items():
            info = '[{:05d}/{:05d}] Adding document "{}" to index "{}" ... '\
                    .format(counter, total_iterations, filename, index_name)
            sys.stdout.write(info)
            start = time()
            index.add_document(filename, contents)
            end = time()
            sys.stdout.write('OK ({:09.5f}s)\n'.format(end - start))
            counter += 1

    if not os.path.exists('data'):
        os.mkdir('data')
    counter = 1
    for index_name, index in indexes.items():
        info = '[{:02d}/{:02d}] Dumping index "{}" ... '.format(counter,
                index_count, index_name)
        sys.stdout.write(info)
        start = time()
        index.dump('data/{}.pickle'.format(index_name))
        end = time()
        sys.stdout.write('OK ({:09.5f}s)\n'.format(end - start))
        counter += 1
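# The example above depends on an `Index` class that the page does not show.
# Everything below is a hypothetical minimal sketch of the interface the code
# assumes: tokenize a document, optionally drop stop words, optionally stem,
# keep a term -> document postings map, and pickle it to disk.
import pickle
from collections import defaultdict

from nltk.tokenize import word_tokenize


class Index:
    def __init__(self, stemmer=None, stopwords=None):
        self.stemmer = stemmer
        self.stopwords = set(stopwords or [])
        self.terms = defaultdict(set)  # term -> names of documents containing it
        self.name = None

    def add_document(self, name, contents):
        for token in word_tokenize(contents.lower(), language='portuguese'):
            if token in self.stopwords:
                continue
            if self.stemmer is not None:
                token = self.stemmer.stem(token)
            self.terms[token].add(name)

    def dump(self, path):
        with open(path, 'wb') as fp:
            pickle.dump(dict(self.terms), fp)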
Example #3
import string

from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

wl = WordNetLemmatizer()
sw = stopwords.words("english")
vocab = {}


def get_tag(treebank_tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS constant."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # Default to noun for tags WordNet does not distinguish.
        return wordnet.NOUN


def get_lem(text):
    """Lemmatize a list of (word, treebank_tag) pairs into a single string."""
    return " ".join(wl.lemmatize(word.lower(), get_tag(tag)) for word, tag in text)


def preprocess(text):
    # The source cuts the example off here; a plausible completion, assuming the
    # intent is tokenize -> drop stop words and punctuation -> POS-tag -> lemmatize:
    tokens = [t for t in word_tokenize(text)
              if t.lower() not in sw and t not in string.punctuation]
    return get_lem(pos_tag(tokens))
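# A quick smoke test of the pipeline above; the sentence is arbitrary and the
# exact lemmas depend on the tagger's choices.
if __name__ == "__main__":
    print(preprocess("The striped bats were hanging on their feet."))
    # roughly: "striped bat hang foot"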
Example #4
"""
Idea I: Let us count the stop words.
Idea II: Let us use word bigrams.
"""

# Idea I
from nltk.tokenize import wordpunct_tokenize
sentence = "Zalando SE is a European e-commerce company based in Berlin, Germany. The company follows a platform approach, offering Fashion and Lifestyle products to customers in 17 European markets. Zalando was founded in Germany in 2008. Swedish company Kinnevik is the largest owner with 32%."
tokens = wordpunct_tokenize(sentence)
print(tokens)

# Explore stop word corpus
from nltk.corpus import stopwords
print(stopwords.readme().replace("\n", " "))

# German stop words
print(stopwords.raw("german").replace("\n", " "))

# How many stop words are there for English and German combined?
print(len(stopwords.words(["english", "german"])))

# Classify language by counting stop words
language_ratios = {}
test_tokens = tokens  # reuse the tokens of the sample sentence above
test_words = [word.lower() for word in test_tokens]
test_words_set = set(test_words)

for language in stopwords.fileids():
    # For some languages it would be wise to tokenize the stop words at punctuation as well.
    stopwords_set = set(stopwords.words(language))
    common_elements = test_words_set.intersection(stopwords_set)
    language_ratios[language] = len(common_elements)
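# To finish Idea I, pick the language whose stop-word list overlaps the text the
# most; for the Zalando sentence above this should come out as English.
most_rated_language = max(language_ratios, key=language_ratios.get)
print(most_rated_language, language_ratios[most_rated_language])

# Idea II (sketch): word bigrams would give richer features than single stop
# words; NLTK exposes them directly.
from nltk import bigrams
print(list(bigrams(tokens))[:5])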