def training_downloads():
    # NLTK:
    info("Downloading (if necessary) NLTK resources:")
    download('punkt')
    download('stopwords')

    # Glove:
    info('Downloading Glove Embeddings:')
    if not os.path.exists(VEC_DIR):
        os.makedirs(VEC_DIR)
    download_and_extract(GLOVE_EMBEDDINGS_URL, VEC_DIR)

    # Squad:
    info('Downloading Squad:')
    if not os.path.exists(SQUAD_SOURCE_DIR):
        os.makedirs(SQUAD_SOURCE_DIR)
    download_file(SQUAD_SERVER + '/train-v1.1.json', SQUAD_SOURCE_DIR)
    download_file(SQUAD_SERVER + '/dev-v1.1.json', SQUAD_SOURCE_DIR)

    # TriviaQA:
    info('Downloading TriviaQA:')
    if not os.path.exists(TRIVIA_QA):
        os.makedirs(TRIVIA_QA)
    download_and_extract(TRIVIAQA_SERVER + 'triviaqa-rc.tar.gz', TRIVIA_QA)

    # LM:
    info('Downloading LM:')
    if not os.path.exists(LM_DIR):
        os.makedirs(LM_DIR)
    download_and_extract(LM_URL, LM_DIR)

def clean_sw():
    try:
        sw = stopwords.words('english')
    except LookupError:
        downloader.download('stopwords')
        sw = stopwords.words('english')
    return set([english_stemmer(w) for w in sw])

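# Hedged usage sketch (added, not from the original source): clean_sw() relies on
# module-level names not shown above (stopwords, downloader and english_stemmer).
# For illustration only, english_stemmer is assumed to behave like the stem method
# of NLTK's SnowballStemmer.
if __name__ == '__main__':
    from nltk.stem.snowball import SnowballStemmer
    english_stemmer = SnowballStemmer('english').stem  # assumption: stand-in for the real stemmer
    stemmed_stopwords = clean_sw()
    sample = ['running', 'the', 'tests', 'quickly']
    print([w for w in sample if english_stemmer(w) not in stemmed_stopwords])
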
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt
    download('punkt')

    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader,
                               r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words
    # replaced by the token "<unknown>"
    sentences = [['start0'] +
                 [word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
                  for word in sentence] +
                 ['end0']
                 for sentence in english.sents()]

    # Create train and test datasets
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)

    # Corrupt each test sentence: replace one word with a random vocabulary word and
    # flip one letter of another randomly chosen word (start/end tokens excluded)
    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = (sentence[word][0:letter] +
                          random.choice(lower_case_letters) +
                          sentence[word][letter + 1:])

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)
    print('Corrected: {}'.format(corrected))
    print('Original: {}'.format(test[25]))

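# Hedged sketch (added, not from the original source): ngrams_sentences() is not shown
# in this snippet. A plausible implementation, assumed here, maps each (already padded)
# sentence to its list of n-grams using nltk.util.ngrams.
def ngrams_sentences_sketch(sentences, n):
    from nltk.util import ngrams
    return [list(ngrams(sentence, n)) for sentence in sentences]
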
def generate_documents(data_file):
    # Read data from csv export file
    data = pandas.read_csv(data_file, sep='\t', header=None,
                           names=[TEXT_IDENTIFIER_COLUMN, TEXT_COLUMN],
                           skiprows=[0])

    download('punkt', download_dir="nltk_data")
    download('stopwords', download_dir="nltk_data")

    if os.path.exists(DOCUMENTS_DIRECTORY):
        shutil.rmtree(DOCUMENTS_DIRECTORY)
    os.makedirs(DOCUMENTS_DIRECTORY)

    data = data.apply(tokenize, axis=1)
    data = data.apply(remove_stopwords, axis=1)
    data = data.apply(stem, axis=1)
    data.apply(save_to_document, axis=1)

def get_query_likelihood_score(documents_directory, query_text):
    download('punkt', download_dir="nltk_data")
    download('stopwords', download_dir="nltk_data")

    query_document = generate_query_document(query_text)
    if len(query_document) == 0:
        print("Query not precise enough. Please refine your query")
        return

    collection_bag_of_words = load_collection_bag_of_words(documents_directory)
    document_bags_of_words = load_document_bags_of_words(documents_directory)
    scores = calculate_query_likelihood(query_document, collection_bag_of_words,
                                        document_bags_of_words)
    for document_name, score in scores.items():
        print(document_name + "\t" + str(score))

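# Hedged sketch (added, not from the original source): calculate_query_likelihood()
# is not shown above. A standard query-likelihood scorer with Jelinek-Mercer smoothing,
# used here purely for illustration, scores each document d by the sum over query terms
# t of log((1 - lam) * P(t | d) + lam * P(t | collection)). The bag-of-words arguments
# are assumed to be {term: count} dictionaries.
import math


def calculate_query_likelihood_sketch(query_terms, collection_bow, document_bows, lam=0.5):
    collection_total = float(sum(collection_bow.values()))
    scores = {}
    for name, bow in document_bows.items():
        doc_total = float(sum(bow.values()))
        score = 0.0
        for term in query_terms:
            p_doc = bow.get(term, 0) / doc_total if doc_total else 0.0
            p_col = collection_bow.get(term, 0) / collection_total if collection_total else 0.0
            smoothed = (1 - lam) * p_doc + lam * p_col
            score += math.log(smoothed) if smoothed > 0 else float('-inf')
        scores[name] = score
    return scores
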
def get_stopwords():
    """
    Check whether the NLTK stopwords have already been downloaded and, if they
    have not, download them. This function is needed for both word embedding
    and topic modeling, and is just generally useful.
    """
    from nltk.downloader import download
    from nltk.corpus import stopwords

    try:
        return stopwords.words("english")
    except LookupError:
        print("NLTK needs to download the stopwords. This will take a while.")
        download("stopwords")
        print("NLTK has finished downloading the stopwords.")
        return stopwords.words("english")

def handle(self):
    """Index the corpus documents."""
    download('stopwords')
    indexdb = IndexDB()
    self.connection = indexdb.handler()
    data_dir = '/Users/pablocc/harvard_data/'
    counter = 0

    for filename in os.listdir(data_dir):
        # Skip directories and hidden files
        if os.path.isdir(data_dir + filename) or filename[0] == '.':
            continue
        with open(data_dir + filename, 'rb') as fh:
            reader = MARCReader(fh)
            for record in reader:
                document = self.prepare_record(record)
                counter += 1
                print("%s - processing document %s." % (counter, document['id']))
                self.index_document(document)

def downloadNLTKData():
    """Download the NLTK resources used by collective.classification."""
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')

def downloadNLTKTokenizerData():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')

#!/usr/bin/python
from nltk.corpus import stopwords
from nltk.downloader import download

download('all', halt_on_error=False)
sw = stopwords.words("english")
count = len(sw)
# print(sw)

def downloadNLTKAlpinoCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Alpino corpus")
    download('alpino')

def downloadNLTKEurParlRaw():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Sample European Parliament Proceedings "
                "Parallel Corpus")
    download('europarl_raw')

def downloadNLTKPenTreeBank():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')

def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt
    download('punkt')

    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader,
                               r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words
    # replaced by the token "<unknown>"
    sentences = [[word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
                  for word in sentence]
                 for sentence in english.sents()]

    # Create train and test datasets
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t) for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist, vocabulary_length)

    # Log-probability of each test sentence under the bigram model, grouped by length
    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = sum(cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence)
        bigram_length_probabilities[len(sentence)].append(logprob)

    # Longest bigram test sentence (not used further below)
    x = 0
    s = None
    for sentence in bigrams_test:
        if len(sentence) > x:
            x = len(sentence)
            s = sentence

    # Log-probability of each test sentence under the trigram model, grouped by length
    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = sum(cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(probs) / float(len(probs))
        for length, probs in bigram_length_probabilities.items()}
    average_trigram_length_probabilities = {
        length: sum(probs) / float(len(probs))
        for length, probs in trigram_length_probabilities.items()}

    # Random sentences of matching lengths, used as a baseline
    random_sentences = [[words[random.randint(0, len(words) - 1)].lower() for i in range(key)]
                        for key in bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = sum(cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = sum(cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()), color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()), color='blue')
    random_bigram = plt.scatter(list(random_bigram_length_probabilities.values()),
                                list(random_bigram_length_probabilities.keys()), color='green')
    random_trigram = plt.scatter(list(random_trigram_length_probabilities.values()),
                                 list(random_trigram_length_probabilities.keys()), color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(ymin=0)
    # plt.show()
    plt.savefig('logprob')

    # Generate text from the seed word "this" with both models
    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_bigram, seed, 'bigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the bigram model produced this text of length 30: {}'.format(seed))

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_trigram, seed, 'trigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the trigram model produced this text of length 30: {}'.format(seed))

    # Cross-entropy and perplexity on the test set
    test_bigrams = []
    for sentence in bigrams_test:
        test_bigrams += sentence
    bigram_entropy, bigram_perplexity = centropy_perplexity(cpd_bigram, test_bigrams)
    print('Cross-entropy of the bigram model is {}. The corresponding perplexity is {}'.format(
        bigram_entropy, bigram_perplexity))

    test_trigrams = []
    for sentence in trigrams_test:
        test_trigrams += sentence
    trigram_entropy, trigram_perplexity = centropy_perplexity(cpd_trigram, test_trigrams)
    print('Cross-entropy of the trigram model is {}. The corresponding perplexity is {}'.format(
        trigram_entropy, trigram_perplexity))

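# Hedged sketch (added, not from the original source): centropy_perplexity() is not
# shown above. Under the usual definitions, cross-entropy is the negative average
# log2-probability the model assigns to the observed n-grams, and perplexity is 2
# raised to that cross-entropy; NLTK's logprob() already returns log base 2.
def centropy_perplexity_sketch(cpd, ngrams_list):
    logprobs = [cpd[ngram[:-1]].logprob(ngram[-1]) for ngram in ngrams_list]
    cross_entropy = -sum(logprobs) / len(logprobs)
    perplexity = 2 ** cross_entropy
    return cross_entropy, perplexity
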
def _post_setup():
    from nltk.downloader import download
    download('punkt')

import pickle
import re
import time

from SAR_utils import *
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import downloader

downloader.download('stopwords')


def compareT(a, b):
    (d1, p1) = a
    (d2, p2) = b
    if d1 == d2:
        return p1 - p2
    else:
        return d1 - d2


# Obtain the posting list of a word.
# If an index i is not specified, it will match ":" patterns (e.g. "headline:word")
# and apply stemming.
def getPList(word, i=None, stemming=False):
    if i is not None:
        return i.get(word, [])
    if ":" in word:
        [where, word] = word.split(":")
        if where == "headline" or where == "h":
            i = titleIndex
        elif where == "date" or where == "d":  # (snippet truncated in the source)

def downloadNLTKBrownCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')

def main():
    if not os.path.exists(NLTK_DIR):
        os.makedirs(NLTK_DIR)
    download('reuters', download_dir=NLTK_DIR)

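# Usage note (added, not from the original source): NLTK only searches the directories
# listed in nltk.data.path, so a corpus downloaded to a custom NLTK_DIR has to be added
# to that search path before it can be loaded. A minimal sketch, assuming NLTK_DIR is
# the same directory used in main() above:
def load_reuters_from_custom_dir():
    import nltk
    from nltk.corpus import reuters
    nltk.data.path.append(NLTK_DIR)
    return reuters.categories()
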
def downloadNLTKConll2000Corpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's conll2000 corpus")
    download('conll2000')

# -*- coding: utf-8 -*-
import sys
import unicodedata
import operator

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from nltk import downloader
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

downloader.download("stopwords")
# Print arrays in full (recent NumPy rejects np.nan as a threshold)
np.set_printoptions(threshold=sys.maxsize)
stopwords_list = set(stopwords.words("spanish"))
stemmer = SnowballStemmer("spanish")


def tratamiento1(documentos):
    # Basic data cleanup: strip accents and lowercase every word
    new_documentos = []
    for d in range(len(documentos)):
        unaccented_text = ''.join(
            c for c in unicodedata.normalize('NFD', documentos[d])
            if unicodedata.category(c) != 'Mn')
        lower_words = [str.lower(word) for word in unaccented_text.split(" ")]
        new_documentos.append(" ".join(lower_words))
    return new_documentos


def tratamiento2(documentos):  # (snippet truncated in the source)

import config
from nltk import downloader

# Async, can't be run in the main process :/

# For the WordNet stemmer
downloader.download(info_or_id='wordnet', download_dir=config.NLTK_DATA_DIR)
# For the Snowball and Porter stemmers
# downloader.download(info_or_id='punkt', download_dir=config.NLTK_DATA_DIR)
# Stop words <- used in Snowball
downloader.download(info_or_id='stopwords', download_dir=config.NLTK_DATA_DIR)

def download_corpus():
    downloader = nltk.downloader.Downloader(download_dir=NLTK_DIR)
    downloader.download('wordnet', download_dir=NLTK_DIR)

from nltk import downloader

# Called with no arguments, this opens NLTK's interactive downloader.
downloader.download()

from nltk import downloader

if __name__ == "__main__":
    for ii in ["punkt", "stopwords", "wordnet"]:
        downloader.download(ii)

#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from nltk.downloader import download
from logging import info

from cape_document_qa.download_and_extract import download_and_extract
from cape_document_qa.cape_document_qa_settings import MODEL_FOLDER, MODEL_URL, MODELS_FOLDER, MODEL_MB_SIZE, \
    GLOVE_EMBEDDINGS_URL, DOWNLOAD_ALL_GLOVE_EMBEDDINGS

glove_filepath = os.path.join(MODEL_FOLDER, 'glove.840B.300d.txt')

if not os.path.isfile(os.path.join(MODEL_FOLDER, 'model.pkl')) or \
        not os.path.isfile(glove_filepath) or \
        (
            DOWNLOAD_ALL_GLOVE_EMBEDDINGS and
            # less than 2 GB -> we only have the top X embeddings
            os.path.getsize(glove_filepath) / 1e6 < 2e3
        ):
    # Downloading NLTK dependencies
    info("Downloading (if necessary) NLTK resources:")
    download('punkt')
    download('stopwords')

    info('Downloading default model with top X Glove embeddings:')
    download_and_extract(MODEL_URL, MODELS_FOLDER, total_mb_size=MODEL_MB_SIZE)

    if DOWNLOAD_ALL_GLOVE_EMBEDDINGS:
        info('Downloading complete Glove Embeddings:')
        download_and_extract(GLOVE_EMBEDDINGS_URL, MODEL_FOLDER)

from config import *
from textblob import TextBlob
from nltk import downloader
import tweepy


class MyStreamListener(tweepy.StreamListener):

    def on_status(self, status):
        print('A TWEET!')
        print(status.text)
        print('AND THE SENTIMENT PER SENTENCE IS:')
        blob = TextBlob(status.text)
        for sentence in blob.sentences:
            print(sentence.sentiment.polarity)


auth = tweepy.OAuthHandler(consumerkey, consumerkeysecret)
auth.set_access_token(accesstoken, accesstokensecret)

downloader.download('punkt')

myStreamListener = MyStreamListener()
stream = tweepy.Stream(auth, myStreamListener)
stream.filter(track=['coca cola'], languages=['en'])