def training_downloads():
    # NLTK:
    info("Downloading (if necessary) NLTK resources:")
    download('punkt')
    download('stopwords')

    # Glove:
    info('Downloading Glove Embeddings:')
    if not os.path.exists(VEC_DIR):
        os.makedirs(VEC_DIR)
    download_and_extract(GLOVE_EMBEDDINGS_URL, VEC_DIR)

    # Squad:
    info('Downloading Squad:')
    if not os.path.exists(SQUAD_SOURCE_DIR):
        os.makedirs(SQUAD_SOURCE_DIR)
    download_file(SQUAD_SERVER + '/train-v1.1.json', SQUAD_SOURCE_DIR)
    download_file(SQUAD_SERVER + '/dev-v1.1.json', SQUAD_SOURCE_DIR)

    # TriviaQA:
    info('Downloading TriviaQA:')
    if not os.path.exists(TRIVIA_QA):
        os.makedirs(TRIVIA_QA)
    download_and_extract(TRIVIAQA_SERVER + 'triviaqa-rc.tar.gz', TRIVIA_QA)

    # LM:
    info('Downloading LM:')
    if not os.path.exists(LM_DIR):
        os.makedirs(LM_DIR)
    download_and_extract(LM_URL, LM_DIR)

def clean_sw():
    try:
        sw = stopwords.words('english')
    except LookupError:
        downloader.download('stopwords')
        sw = stopwords.words('english')
    return set([english_stemmer(w) for w in sw])

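# Hedged usage sketch (added, not from the original source): clean_sw() relies on
# module-level names not shown above (stopwords, downloader and english_stemmer).
# For illustration only, english_stemmer is assumed to behave like the stem method
# of NLTK's SnowballStemmer.
if __name__ == '__main__':
    from nltk.stem.snowball import SnowballStemmer
    english_stemmer = SnowballStemmer('english').stem  # assumption: stand-in for the real stemmer
    stemmed_stopwords = clean_sw()
    sample = ['running', 'the', 'tests', 'quickly']
    print([w for w in sample if english_stemmer(w) not in stemmed_stopwords])
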
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt
    download('punkt')

    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader,
                               r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words
    # replaced by the token "<unknown>"
    sentences = [['start0'] +
                 [word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
                  for word in sentence] +
                 ['end0']
                 for sentence in english.sents()]

    # Create train and test datasets
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)

    # Corrupt each test sentence: replace one word with a random vocabulary word and
    # flip one letter of another randomly chosen word (start/end tokens excluded)
    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = (sentence[word][0:letter] +
                          random.choice(lower_case_letters) +
                          sentence[word][letter + 1:])

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)
    print('Corrected: {}'.format(corrected))
    print('Original: {}'.format(test[25]))

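# Hedged sketch (added, not from the original source): ngrams_sentences() is not shown
# in this snippet. A plausible implementation, assumed here, maps each (already padded)
# sentence to its list of n-grams using nltk.util.ngrams.
def ngrams_sentences_sketch(sentences, n):
    from nltk.util import ngrams
    return [list(ngrams(sentence, n)) for sentence in sentences]
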
def generate_documents(data_file):
    # Read data from csv export file
    data = pandas.read_csv(data_file, sep='\t', header=None,
                           names=[TEXT_IDENTIFIER_COLUMN, TEXT_COLUMN],
                           skiprows=[0])

    download('punkt', download_dir="nltk_data")
    download('stopwords', download_dir="nltk_data")

    if os.path.exists(DOCUMENTS_DIRECTORY):
        shutil.rmtree(DOCUMENTS_DIRECTORY)
    os.makedirs(DOCUMENTS_DIRECTORY)

    data = data.apply(tokenize, axis=1)
    data = data.apply(remove_stopwords, axis=1)
    data = data.apply(stem, axis=1)
    data.apply(save_to_document, axis=1)

def get_query_likelihood_score(documents_directory, query_text):
    download('punkt', download_dir="nltk_data")
    download('stopwords', download_dir="nltk_data")

    query_document = generate_query_document(query_text)
    if len(query_document) == 0:
        print("Query not precise enough. Please refine your query")
        return

    collection_bag_of_words = load_collection_bag_of_words(documents_directory)
    document_bags_of_words = load_document_bags_of_words(documents_directory)
    scores = calculate_query_likelihood(query_document, collection_bag_of_words,
                                        document_bags_of_words)
    for document_name, score in scores.items():
        print(document_name + "\t" + str(score))

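# Hedged sketch (added, not from the original source): calculate_query_likelihood()
# is not shown above. A standard query-likelihood scorer with Jelinek-Mercer smoothing,
# used here purely for illustration, scores each document d by the sum over query terms
# t of log((1 - lam) * P(t | d) + lam * P(t | collection)). The bag-of-words arguments
# are assumed to be {term: count} dictionaries.
import math


def calculate_query_likelihood_sketch(query_terms, collection_bow, document_bows, lam=0.5):
    collection_total = float(sum(collection_bow.values()))
    scores = {}
    for name, bow in document_bows.items():
        doc_total = float(sum(bow.values()))
        score = 0.0
        for term in query_terms:
            p_doc = bow.get(term, 0) / doc_total if doc_total else 0.0
            p_col = collection_bow.get(term, 0) / collection_total if collection_total else 0.0
            smoothed = (1 - lam) * p_doc + lam * p_col
            score += math.log(smoothed) if smoothed > 0 else float('-inf')
        scores[name] = score
    return scores
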
def get_stopwords():
    """
    Check whether the NLTK stopwords have already been downloaded and, if they
    have not, download them. This function is needed for both word embedding
    and topic modeling, and is just generally useful.
    """
    from nltk.downloader import download
    from nltk.corpus import stopwords

    try:
        return stopwords.words("english")
    except LookupError:
        print("NLTK needs to download the stopwords. This will take a while.")
        download("stopwords")
        print("NLTK has finished downloading the stopwords.")
        return stopwords.words("english")

def handle(self):
    """Index the corpus documents."""
    download('stopwords')
    indexdb = IndexDB()
    self.connection = indexdb.handler()
    data_dir = '/Users/pablocc/harvard_data/'
    counter = 0

    for filename in os.listdir(data_dir):
        # Skip directories and hidden files
        if os.path.isdir(data_dir + filename) or filename[0] == '.':
            continue
        with open(data_dir + filename, 'rb') as fh:
            reader = MARCReader(fh)
            for record in reader:
                document = self.prepare_record(record)
                counter += 1
                print("%s - processing document %s." % (counter, document['id']))
                self.index_document(document)

def downloadNLTKData():
    """Download the NLTK resources used by collective.classification."""
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')

def downloadNLTKTokenizerData():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')

#!/usr/bin/python
from nltk.corpus import stopwords
from nltk.downloader import download

download('all', halt_on_error=False)
sw = stopwords.words("english")
count = len(sw)
# print(sw)

def downloadNLTKAlpinoCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Alpino corpus")
    download('alpino')

def downloadNLTKEurParlRaw():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Sample European Parliament Proceedings "
                "Parallel Corpus")
    download('europarl_raw')

def downloadNLTKPenTreeBank():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')

def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt
    download('punkt')

    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader,
                               r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words
    # replaced by the token "<unknown>"
    sentences = [[word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
                  for word in sentence]
                 for sentence in english.sents()]

    # Create train and test datasets
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t) for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist, vocabulary_length)

    # Log-probability of each test sentence under the bigram model, grouped by length
    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = sum(cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence)
        bigram_length_probabilities[len(sentence)].append(logprob)

    # Longest bigram test sentence (not used further below)
    x = 0
    s = None
    for sentence in bigrams_test:
        if len(sentence) > x:
            x = len(sentence)
            s = sentence

    # Log-probability of each test sentence under the trigram model, grouped by length
    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = sum(cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(probs) / float(len(probs))
        for length, probs in bigram_length_probabilities.items()}
    average_trigram_length_probabilities = {
        length: sum(probs) / float(len(probs))
        for length, probs in trigram_length_probabilities.items()}

    # Random sentences of matching lengths, used as a baseline
    random_sentences = [[words[random.randint(0, len(words) - 1)].lower() for i in range(key)]
                        for key in bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = sum(cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = sum(cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()), color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()), color='blue')
    random_bigram = plt.scatter(list(random_bigram_length_probabilities.values()),
                                list(random_bigram_length_probabilities.keys()), color='green')
    random_trigram = plt.scatter(list(random_trigram_length_probabilities.values()),
                                 list(random_trigram_length_probabilities.keys()), color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(ymin=0)
    # plt.show()
    plt.savefig('logprob')

    # Generate text from the seed word "this" with both models
    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_bigram, seed, 'bigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the bigram model produced this text of length 30: {}'.format(seed))

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_trigram, seed, 'trigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the trigram model produced this text of length 30: {}'.format(seed))

    # Cross-entropy and perplexity on the test set
    test_bigrams = []
    for sentence in bigrams_test:
        test_bigrams += sentence
    bigram_entropy, bigram_perplexity = centropy_perplexity(cpd_bigram, test_bigrams)
    print('Cross-entropy of the bigram model is {}. The corresponding perplexity is {}'.format(
        bigram_entropy, bigram_perplexity))

    test_trigrams = []
    for sentence in trigrams_test:
        test_trigrams += sentence
    trigram_entropy, trigram_perplexity = centropy_perplexity(cpd_trigram, test_trigrams)
    print('Cross-entropy of the trigram model is {}. The corresponding perplexity is {}'.format(
        trigram_entropy, trigram_perplexity))

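# Hedged sketch (added, not from the original source): centropy_perplexity() is not
# shown above. Under the usual definitions, cross-entropy is the negative average
# log2-probability the model assigns to the observed n-grams, and perplexity is 2
# raised to that cross-entropy; NLTK's logprob() already returns log base 2.
def centropy_perplexity_sketch(cpd, ngrams_list):
    logprobs = [cpd[ngram[:-1]].logprob(ngram[-1]) for ngram in ngrams_list]
    cross_entropy = -sum(logprobs) / len(logprobs)
    perplexity = 2 ** cross_entropy
    return cross_entropy, perplexity
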
def _post_setup():
    from nltk.downloader import download
    download('punkt')

import pickle
import re
import time

from SAR_utils import *
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import downloader

downloader.download('stopwords')


def compareT(a, b):
    (d1, p1) = a
    (d2, p2) = b
    if d1 == d2:
        return p1 - p2
    else:
        return d1 - d2


# Obtain the posting list of a word.
# If an index i is not specified, it will match ":" patterns (e.g. "headline:word")
# and apply stemming.
def getPList(word, i=None, stemming=False):
    if i is not None:
        return i.get(word, [])
    if ":" in word:
        [where, word] = word.split(":")
        if where == "headline" or where == "h":
            i = titleIndex
        elif where == "date" or where == "d":  # (snippet truncated in the source)

def downloadNLTKBrownCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')

def main():
    if not os.path.exists(NLTK_DIR):
        os.makedirs(NLTK_DIR)
    download('reuters', download_dir=NLTK_DIR)

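# Usage note (added, not from the original source): NLTK only searches the directories
# listed in nltk.data.path, so a corpus downloaded to a custom NLTK_DIR has to be added
# to that search path before it can be loaded. A minimal sketch, assuming NLTK_DIR is
# the same directory used in main() above:
def load_reuters_from_custom_dir():
    import nltk
    from nltk.corpus import reuters
    nltk.data.path.append(NLTK_DIR)
    return reuters.categories()
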
def downloadNLTKConll2000Corpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's conll2000 corpus")
    download('conll2000')

# -*- coding: utf-8 -*-
import sys
import unicodedata
import operator

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from nltk import downloader
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

downloader.download("stopwords")
# Print arrays in full (recent NumPy rejects np.nan as a threshold)
np.set_printoptions(threshold=sys.maxsize)
stopwords_list = set(stopwords.words("spanish"))
stemmer = SnowballStemmer("spanish")


def tratamiento1(documentos):
    # Basic data cleanup: strip accents and lowercase every word
    new_documentos = []
    for d in range(len(documentos)):
        unaccented_text = ''.join(
            c for c in unicodedata.normalize('NFD', documentos[d])
            if unicodedata.category(c) != 'Mn')
        lower_words = [str.lower(word) for word in unaccented_text.split(" ")]
        new_documentos.append(" ".join(lower_words))
    return new_documentos


def tratamiento2(documentos):  # (snippet truncated in the source)

import config
from nltk import downloader

# Async, can't be run in the main process :/

# For the WordNet stemmer
downloader.download(info_or_id='wordnet', download_dir=config.NLTK_DATA_DIR)
# For the Snowball and Porter stemmers
# downloader.download(info_or_id='punkt', download_dir=config.NLTK_DATA_DIR)
# Stop words <- used in Snowball
downloader.download(info_or_id='stopwords', download_dir=config.NLTK_DATA_DIR)

def download_corpus():
    downloader = nltk.downloader.Downloader(download_dir=NLTK_DIR)
    downloader.download('wordnet', download_dir=NLTK_DIR)

from nltk import downloader

# Called with no arguments, this opens NLTK's interactive downloader.
downloader.download()

from nltk import downloader

if __name__ == "__main__":
    for ii in ["punkt", "stopwords", "wordnet"]:
        downloader.download(ii)

#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from nltk.downloader import download
from logging import info

from cape_document_qa.download_and_extract import download_and_extract
from cape_document_qa.cape_document_qa_settings import MODEL_FOLDER, MODEL_URL, MODELS_FOLDER, MODEL_MB_SIZE, \
    GLOVE_EMBEDDINGS_URL, DOWNLOAD_ALL_GLOVE_EMBEDDINGS

glove_filepath = os.path.join(MODEL_FOLDER, 'glove.840B.300d.txt')

if not os.path.isfile(os.path.join(MODEL_FOLDER, 'model.pkl')) or \
        not os.path.isfile(glove_filepath) or \
        (
            DOWNLOAD_ALL_GLOVE_EMBEDDINGS and
            # less than 2 GB -> we only have the top X embeddings
            os.path.getsize(glove_filepath) / 1e6 < 2e3
        ):
    # Downloading NLTK dependencies
    info("Downloading (if necessary) NLTK resources:")
    download('punkt')
    download('stopwords')

    info('Downloading default model with top X Glove embeddings:')
    download_and_extract(MODEL_URL, MODELS_FOLDER, total_mb_size=MODEL_MB_SIZE)

    if DOWNLOAD_ALL_GLOVE_EMBEDDINGS:
        info('Downloading complete Glove Embeddings:')
        download_and_extract(GLOVE_EMBEDDINGS_URL, MODEL_FOLDER)

from config import *
from textblob import TextBlob
from nltk import downloader
import tweepy


class MyStreamListener(tweepy.StreamListener):

    def on_status(self, status):
        print('A TWEET!')
        print(status.text)
        print('AND THE SENTIMENT PER SENTENCE IS:')
        blob = TextBlob(status.text)
        for sentence in blob.sentences:
            print(sentence.sentiment.polarity)


auth = tweepy.OAuthHandler(consumerkey, consumerkeysecret)
auth.set_access_token(accesstoken, accesstokensecret)

downloader.download('punkt')

myStreamListener = MyStreamListener()
stream = tweepy.Stream(auth, myStreamListener)
stream.filter(track=['coca cola'], languages=['en'])