def test_austen():
    from nltk.data import load
    from nltk.corpus import gutenberg as g
    stok = load('tokenizers/punkt/english.pickle')
    train = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-emma.txt'))]
    test1 = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-sense.txt'))]
    test2 = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-persuasion.txt'))]

    model1 = AdditiveSmoothing(n=2)
    model1.generate_model(train)
    print 'cross entropy additive smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model1, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model1, test2)

    model2 = KnesserNey(n=2)
    model2.generate_model(train)
    print 'cross entropy knesser-ney smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model2, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model2, test2)

    model3 = SimpleGoodTuring(n=2)
    model3.generate_model(train)
    print 'cross entropy simple good-turing smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model3, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model3, test2)

    model4 = KatzSmoothing(n=2)
    model4.generate_model(train)
    print 'cross entropy katz smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model4, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model4, test2)
def exercise_gutenberg():
    # Print the list of files in the Gutenberg corpus
    print gutenberg.fileids()
    # Pick one text: Jane Austen's "Emma"
    emma = gutenberg.words("austen-emma.txt")
    # Look at the length of the book
    print len(emma)
    # Load the text
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)
        # Total number of characters in the file
        num_chars = len(chars_list)
        # Total number of words in the file
        num_words = len(words_list)
        # Total number of sentences in the file
        num_sents = len(sents_list)
        # Number of distinct words in the file
        num_vocab = len(set([w.lower() for w in words_list]))
        # Print average word length, average sentence length,
        # average occurrences per word, and the file name
        print num_chars / num_words, num_words / num_sents, num_words / num_vocab, file_id
def gutenberg():
    from nltk.corpus import gutenberg
    file_ids = get_fileids(gutenberg)

    # average characters in a word: raw/words
    # average words in a sentence: words/sents
    # lexical diversity: num_words/num_vocab
    for fileid in file_ids:
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars / num_words), int(num_words / num_sents), int(num_words / num_vocab), fileid

    emma = gutenberg.words('austen-emma.txt')
    emma_len = len(emma)
    # print 'percentage', percentage(text1.count('monstrous'), len(text1))

    macbeth_sents = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_longest_len = max([len(s) for s in macbeth_sents])
    macbeth_longest_sent = [
        s for s in macbeth_sents if len(s) == macbeth_longest_len
    ]

    return render_template('gutenberg.html',
                           file_ids=file_ids,
                           emma=emma,
                           emma_len=emma_len,
                           macbeth_longest_sent=macbeth_longest_sent)
def get_text_chars(file):
    _text = ''
    for txt in file:
        if 'shakespeare' in txt:
            _text += gutenberg.raw(txt).lower()
    _chars = sorted(list(set(_text)))
    return _chars, _text
def test():
    from nltk.corpus import gutenberg
    emma = gutenberg.raw('austen-emma.txt')
    print len(emma)
    ex = createexercise(emma, pos='v', last_index=False, fast=True)
    print len(ex)
def exercise_gutenberg():
    # Print the list of files in the Gutenberg corpus
    print(gutenberg.fileids())
    # Pick one text: Jane Austen's "Emma"
    emma = gutenberg.words("austen-emma.txt")
    # Look at the length of the book
    print(len(emma))
    # Load the text
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)
        # Total number of characters in the file
        num_chars = len(chars_list)
        # Total number of words in the file
        num_words = len(words_list)
        # Total number of sentences in the file
        num_sents = len(sents_list)
        # Number of distinct words in the file
        num_vocab = len(set([w.lower() for w in words_list]))
        # Print average word length, average sentence length,
        # average occurrences per word, and the file name
        print(num_chars / num_words, num_words / num_sents,
              num_words / num_vocab, file_id)
def load_moby_dick_analysis():
    tokens = get_moby_dick_tokens()
    text = gutenberg.raw('melville-moby_dick.txt')
    try:
        moby_dick_doc = Document(
            url='gutenberg',
            name='moby dick',
            text=text,
            month='Jan',
            year='1851'
        )
        odm_session.flush()
    except DuplicateKeyError:
        moby_dick_doc = Document.query.get(name='moby dick')

    for sum_threshold in sum_thresholds:
        log.info("Trying analysis for threshold = %s" % sum_threshold)
        analysis = get_optimal_window_size(tokens, window_sizes, 20,
                                           sum_threshold=sum_threshold)[1]
        anal_dict = analysis.encode()
        window_size = anal_dict['window_size']
        log.debug("Best result = %s" % window_size)
        InformationValueResult(
            window_size=window_size,
            threshold=sum_threshold,
            document=moby_dick_doc,
            iv_words=anal_dict['top_words'],
            max_iv=anal_dict['max_iv'],
            sum_iv=anal_dict['sum_iv']
        )
        odm_session.flush()
def Asst2(text):
    raw_txt = gutenberg.raw(text)
    # delete all whitespace in the text
    split_txt = re.sub('(\n)+', '', raw_txt)
    split_txt = re.sub(' ', '', split_txt)
    # keep only letters
    split_txt = "".join(re.findall("[a-zA-Z]+", split_txt))
    # lower-case all letters
    split_txt = split_txt.lower()
    # count all the letters
    counter = Counter(split_txt)
    # compute the frequency of each letter and put it into a Counter called prob_counter
    prob_counter = probability(counter)
    # turn prob_counter into an ordered list
    prob_counter_sorted = prob_counter.most_common()
    # make a bar plot of the frequency of each letter
    letter = []
    frequency = []
    letter, frequency = zip(*prob_counter_sorted)
    indices = np.arange(len(prob_counter_sorted))
    plt.bar(indices, frequency, color='b')
    plt.xticks(indices, letter, rotation='horizontal')
    plt.tight_layout()
    plt.show()
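# The probability() helper called above is defined elsewhere in the assignment;
# the sketch below is NOT the original, only a plausible minimal version for
# context: it normalizes raw letter counts into relative frequencies.
from collections import Counter

def probability(counter):
    # Hypothetical sketch: divide each count by the total so the values sum to 1.
    total = float(sum(counter.values()))
    return Counter({letter: count / total for letter, count in counter.items()})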
def convert_to_json_split(filename):
    try:
        input_txt = gut.raw(filename).split('\n')
        input_txt = [line for line in input_txt if line != ""]
        output_txt = input_txt[1:]
        raw_data = {'Input': input_txt[:-1], 'Output': output_txt}
        df = pd.DataFrame(raw_data, columns=['Input', 'Output'])
        train, test = train_test_split(df, test_size=0.25)
        valid, test = train_test_split(test, test_size=0.4)
        train.to_json(os.path.join(TRAIN_PATH, 'train-{}.json'.format(filename)),
                      orient='records', lines=True)
        test.to_json(os.path.join(TEST_PATH, 'test-{}.json'.format(filename)),
                     orient='records', lines=True)
        valid.to_json(os.path.join(VALIDATION_PATH, 'validation-{}.json'.format(filename)),
                      orient='records', lines=True)
        print("Processed {}".format(filename))
        return df
    except Exception as e:
        print('Error {} occurred'.format(e))
        print('Failed to process {}'.format(filename))
def gutenFreqListNoStop():
    # Obtain the list of words
    gutenberg_words = gutenberg.raw().split(' ')
    englishstop = stopwords.words('english')
    filtered_gutenberg_words = [
        w for w in gutenberg_words if not w in englishstop
    ]
    num_gutenberg_words = len(filtered_gutenberg_words)
    print "We have " + str(num_gutenberg_words) + " gutenberg filtered words"

    counter = 0
    gutenberg_frequ = defaultdict(int)
    sleep(2)
    for word in filtered_gutenberg_words:
        counter += 1
        gutenberg_frequ[word] += 1
        if counter % 1000 == 0:
            print "Progress : " + str((counter / float(num_gutenberg_words)) * 100) + " %"

    gutenberg_frequ = sorted(gutenberg_frequ.values(), reverse=True)
    gutenberg_rank = np.array(xrange(1, len(gutenberg_frequ) + 1))
    c, alpha = powerLaw(gutenberg_frequ, gutenberg_rank)
    plotPowerLaws(
        gutenberg_rank,
        gutenberg_frequ,
        [c, c],
        [-1, -alpha],
        title="Relation between word rank and frequency for gutenberg, no stop words",
        xlabel="Word Rank",
        ylabel="Word Frequency")
    return 0
def get_austen_emma_sample():
    nlp = en_core_web_sm.load()
    emma = gutenberg.raw('austen-emma.txt')
    parsed_emma = nlp(emma)
    seed(181520)
    sample_size = 100
    my_sample = random.sample(list(parsed_emma.sents), sample_size)
    sample = []
    for sent in my_sample:
        sent = re.sub("\s+", " ", sent.text)
        sample.append(sent)

    entities = []
    type_entity = []
    sentences = []
    for sent in sample:
        parsed_sentence = nlp(sent)
        for ent in parsed_sentence.ents:
            if ent.text not in entities:
                entities.append(ent.text)
                sentences.append(sent)
                type_entity.append(ent.label_)

    Entities = pd.DataFrame({
        'Sentence': sentences,
        'Entity': entities,
        'Entity_type': type_entity
    })
    return Entities
def demo():
    """
    LOAD DATA (load the data set).
    """
    # Sense and Sensibility by Jane Austen 1811
    text = gutenberg.raw('austen-sense.txt')

    print "Manual Cleaning : \n", cleaning(text)
    print "\nNLTK: Cleaning & Stemming : \n", cleaning_and_stemming(text)
def extractCorpus(self):
    # .raw() returns the raw text in a string format
    raw_text = gutenberg.raw(self.pos_ex_fn)
    # print(raw_text[:500])

    # remove text inside []
    text = re.sub("^\[.*\]", " ", raw_text)
    # print("text after removing brackets ....")
    # print(text[:200])

    # remove VOLUME and CHAPTER numbers
    text = re.sub("\sVOLUME\s[A-Z]", " ", text)
    # print("removing volume....")
    # print(text[:500])
    text = re.sub("\sCHAPTER\s[A-Z]", " ", text)

    text = re.sub(r"--", " ", text)
    text = re.sub(r'\"', " ", text)
    # text = re.sub(r'[\"|\?\"|\.\"]', " ", text)
    text = re.sub(r'(?<=[MmSDsdr]){2}\.\s', ' ', text)
    text = re.sub(r'(?<=[MmSDsdr]){3}\.\s', ' ', text)
    text = re.sub(r'_.*_', ' ', text)
    # collapse multiple spaces
    text = re.sub(r"\s+", " ", text)

    sents = re.split(r'\.|\?', text)
    # sents = text.lower().split(".")
    # print("sentences generated : ")
    # print(sents[1:10])
    return sents
def Main():
    db = Database()
    index = InvertedIndex(db)
    brown_list = brown.fileids()
    gutenberg_list = gutenberg.fileids()
    # document1 = {
    #     'id': '1',
    #     'text': 'The big sharks of Belgium drink beer.'
    # }
    # document2 = {
    #     'id': '2',
    #     'text': 'Belgium has great beer. They drink beer all the time.'
    # }
    i = 0
    for item in brown_list:
        documentTemp = {'id': str(i), 'text': brown.raw(item)}
        index.index_document(documentTemp)
        i += 1  # advance the id so each file gets a distinct document id
    for item in gutenberg_list:
        documentTemp = {'id': str(i), 'text': gutenberg.raw(item)}
        index.index_document(documentTemp)
        i += 1

    while True:
        search_term = input("Enter term(s) to search: ")
        result = index.lookup_query(search_term.lower())
        for term in result.keys():
            for appearance in result[term]:
                # Belgium: { docId: 1, frequency: 1}
                document = db.get(appearance.docId)
                print(highlight_term(appearance.docId, term, document['text']))
            print("-----------------------------")
def getNgramFreqDict(n, retrain=False):
    if not retrain:
        try:
            with open('data/%dgram_freq.json' % n) as fin:
                print('Trained frequency for n=%d found; Reading data...' % n)
                ngram_freq = json.load(fin)
                return ngram_freq
        except FileNotFoundError:
            pass

    print('Training frequency for n=%d...' % n)
    # using whole gutenberg corpus
    corpus = gutenberg.raw()
    corpus = re.sub('[^a-z. ]', ' ', corpus.lower())
    corpus = ' '.join(corpus.split())
    corpus_ngram = ngrams(corpus, n)

    ngram_freq = {}
    for gram in corpus_ngram:
        key = ''.join(gram)
        if key in ngram_freq:
            ngram_freq[key] += 1
        else:
            ngram_freq[key] = 1

    sum_count = sum([tup[1] for tup in ngram_freq.items()])
    for k in ngram_freq.keys():
        ngram_freq[k] = ngram_freq[k] / sum_count

    with open('data/%dgram_freq.json' % n, 'w') as fout:
        json.dump(ngram_freq, fout)
    return ngram_freq
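# Brief usage sketch for the function above, assuming the NLTK Gutenberg corpus
# is downloaded and a data/ directory exists: build (or reload) character-trigram
# frequencies and look up the relative frequency of one trigram.
trigram_freq = getNgramFreqDict(3)
# keys are n-character strings; values are relative frequencies that sum to 1
print(trigram_freq.get('the', 0.0))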
def getMLE(word):
    text = gutenberg.raw()
    words = getWords(text)
    unigramFreq = getFreqUnigram(words)
    bigramFreq = getFreqBigram(getBigram(words))

    tempDict = {}
    for key in bigramFreq:
        if key[0] == word:
            tempDict[key] = bigramFreq[key]

    mle = {}
    if bool(tempDict):
        sortedList = sorted(tempDict, key=tempDict.get, reverse=True)
        # guard against words with fewer than three observed continuations
        for i in range(0, min(3, len(sortedList))):
            count = tempDict[sortedList[i]]
            prob = count / float(unigramFreq[word])
            mle[sortedList[i][1]] = prob
    return mle
def extract_word_vectors(corpus):
    # Read in text
    text = gutenberg.raw(corpus)[:10000]

    # Extract one word and the following one
    tokenizer = Tokenizer()
    # Extracts sequences of text
    tokenizer.fit_on_texts([text])
    # Convert sequences of text to sequences of ints
    int_enc = tokenizer.texts_to_sequences([text])[0]
    # Store vocabulary length for embedding layer (+ 1 to encode longest word)
    vocab_len = len(tokenizer.word_index) + 1

    # Create word-word sequences
    sequences = list()
    for i in range(1, len(int_enc)):
        tmp = int_enc[i - 1:i + 1]
        sequences.append(tmp)

    # Split into first and second element of sequence
    sequences = array(sequences)
    X, y = sequences[:, :-1], sequences[:, -1]
    # Use Keras to_categorical() function to one-hot encode the output / second word
    y = to_categorical(y, num_classes=vocab_len)
    return [X, y, vocab_len, tokenizer]
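# Short usage sketch for the function above, assuming Keras, NumPy, and the NLTK
# Gutenberg corpus are available: call the helper on 'austen-emma.txt' and inspect
# the shapes of the resulting bigram training arrays.
X, y, vocab_len, tokenizer = extract_word_vectors('austen-emma.txt')
print(X.shape, y.shape, vocab_len)  # X: first word of each bigram, y: one-hot next word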
def getNgramFreqTree(n, retrain=False):
    filename = 'data/%dgram_tree.pickle' % n
    if not retrain:
        try:
            with open(filename, 'rb') as fin:
                print('Trained frequency for n=%d found; Reading data...' % n)
                ngramtree = pickle.load(fin)
                return ngramtree
        except FileNotFoundError:
            pass

    print('Training frequency tree for n=%d...' % n)
    ngramtree = NgramTree()
    corpus = gutenberg.raw()
    corpus = re.sub('[^a-z. ]', ' ', corpus.lower())
    corpus = ' '.join(corpus.split())
    corpus_ngram = ngrams(corpus, n)
    for gram in corpus_ngram:
        ngramtree.addGram(gram)
    ngramtree.addUp()
    ngramtree.normalize()

    with open(filename, 'wb') as fout:
        pickle.dump(ngramtree, fout)
    return ngramtree
def load_text(filename):
    if filename is None or filename == '':
        text = gutenberg.raw(fileids='carroll-alice.txt')
    else:
        # open for reading, not writing ('w' would truncate the file)
        with open(filename, 'r') as f:
            text = f.read()
    return text
def load_sents():
    global sents
    default_st = nltk.sent_tokenize
    alice = gutenberg.raw(fileids='carroll-alice.txt')
    mobyd = gutenberg.raw(fileids='melville-moby_dick.txt')
    shak1 = gutenberg.raw(fileids='shakespeare-hamlet.txt')
    shak2 = gutenberg.raw(fileids='shakespeare-macbeth.txt')
    bbkjv = gutenberg.raw(fileids='bible-kjv.txt')
    alice_sentences = default_st(text=alice)
    mobyd_sentences = default_st(text=mobyd)
    shak1_sentences = default_st(text=shak1)
    shak2_sentences = default_st(text=shak2)
    bbkjv_sentences = default_st(text=bbkjv)
    sents = (alice_sentences + mobyd_sentences + shak1_sentences +
             shak2_sentences + bbkjv_sentences)
def gutenberg():
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        num_words = len(gutenberg.words(t))
        num_sents = len(gutenberg.sents(t))
        num_vocab = len(set([w.lower() for w in gutenberg.words(t)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), t
def clear_libs(self, MainWindow):
    self.textBrowser.clear()
    comboText = self.comboBox.currentText()
    for i in text.textDict:
        if comboText == i:
            rawText = gb.raw(text.textDict[i])
            self.textBrowser.append(rawText)
def handle(self, *args, **options):
    for fileid in gutenberg.fileids():
        out_dir = CORPUS_DIR + os.sep + fileid.replace(".txt", "")
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        f = open(out_dir + os.sep + "sentences.txt", 'w')
        f.write(gutenberg.raw(fileid))
        f.close()
def get_gutenberg_statistics():
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(round(num_chars / num_words), round(num_words / num_sents),
              round(num_words / num_vocab), fileid)
def show_text(self, MainWindow):
    text.libd_storage = []
    self.textBrowser.clear()
    comboText = self.comboBox.currentText()
    for i in text.textDict:
        if comboText == i:
            rawText = gb.raw(text.textDict[i])
            self.textBrowser.append(rawText)
def data_builder(file_id):
    d = gutenberg.raw(fileids=file_id)
    d_sentences = default_st(text=d)
    d_tuples = [nltk.pos_tag(default_wt(sentence)) for sentence in d_sentences]
    d_words = [[word[0] for word in sentence] for sentence in d_tuples]
    d_tags = [[word[1] for word in sentence] for sentence in d_tuples]
    d_len = len(d_sentences)
    return d_sentences, d_words, d_tags, d_len
def gutenberg_file_info():
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
def demo():
    """
    LOAD DATA (load the data set).
    """
    # Sense and Sensibility by Jane Austen 1811
    text = gutenberg.raw('austen-sense.txt')
    sentences = sent_tokenize(text[:1000])
    modal = word_2_vec_with_gensim(sentences)
    print "Modal : ", modal
def get_training_text():
    text = ""
    nltk.download('gutenberg')
    for file_id in gutenberg.fileids():
        text += gutenberg.raw(file_id)
    return text
def getSentences(self):
    if self.category == "novel":
        sentences = gutenberg.raw(gutenberg.fileids()[0])
        sentences = sentences.split('\n')
    elif self.category == "news":
        sentences = brown.sents(categories='news')
    return sentences
def structure():
    raw = gutenberg.raw("burgess-busterbrown.txt")
    raw[1:20]
    words = gutenberg.words("burgess-busterbrown.txt")
    words[1:20]
    sents = gutenberg.sents("burgess-busterbrown.txt")
    sents[1:20]
def similarity_gutenberg():
    for x in range(2, 6):
        a = []
        b = 0
        c = 0
        d = 1
        for fid in gutenberg.fileids():
            a.append([])
            for ffid in gutenberg.fileids():
                a[b].append(Jaccard(n_window(gutenberg.raw(fid), x),
                                    n_window(gutenberg.raw(ffid), x)))
            b += 1
        for i in range(len(a)):
            for j in range(len(a)):
                c += a[i][j] / (len(a) * len(a))
                d = min(d, a[i][j])
        print("Mean: " + str(c))
        print("Minimum: " + str(d))
def generate_tokens(titles):
    corpus = []
    for title in titles:
        novel: str = gutenberg.raw(title)
        novel = novel.strip()
        novel = novel.lower()
        novel = re.sub('\W+', ' ', novel)
        words = novel.split(' ')
        corpus.extend(words)
    return corpus
def fun02():
    """fun02"""
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        # average word length, average sentence length
        print int(num_chars/num_words), int(num_words/num_sents),
        # number of times each vocabulary item appears in the text
        print int(num_words/num_vocab), fileid
def page57():
    """Statistics from the Gutenberg corpora"""
    from nltk.corpus import gutenberg
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars / num_words), int(num_words / num_sents),
        print int(num_words / num_vocab), fileid
def for_print():
    '''
    Print three statistics for each text.
    :return:
    '''
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
def preprocessing_text_file(input_file):
    train_text = gutenberg.raw(input_file)
    sample_text = gutenberg.raw(input_file)

    #### understand input type and content
    # print(train_text)
    # print("=====================")
    # print(type(train_text))
    #### unicode text

    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)

    #### show chunked result
    # print("\n\n".join(tokenized))
    # print("=================")
    # print(type(tokenized))
    #### list of sentences (separated by ".")
    return tokenized
def solve_p2_greedy(file):
    lines = [l.lower().split("|")[1:-1] for l in open(file)]
    slices = slice(lines)
    n = 3
    corpus = NgramLetterCorpus(n)
    for fileid in gutenberg.fileids()[:3]:
        corpus.update(gutenberg.raw(fileid))
    slices = unshred3(slices, corpus)
    print "FINAL: "
    for l in linearize(slices):
        print "".join(l)
def test_moby_dick_window(self):
    # just make sure the windows together cover every token exactly once
    window_sizes = xrange(100, 6000, 100)
    text = gutenberg.raw('melville-moby_dick.txt')
    tokens = tokenize(text, only_alphanum=True, clean_punctuation=True)
    total_number_of_tokens = len(tokens)
    for window_size in window_sizes:
        count = 0
        number_of_windows = int(math.ceil(total_number_of_tokens / window_size))
        for current_window in range(0, number_of_windows + 1):
            word_window = Window(tokens, window_size, current_window)
            for word in word_window:
                count += 1
        self.assertEquals(count, total_number_of_tokens)
def benchmark_sbd():
    ps = []
    rs = []
    f1s = []
    c = 0
    for fileid in gutenberg.fileids():
        c += 1
        copy_sents_gold = gutenberg.sents(fileid)
        sents_gold = [s for s in copy_sents_gold]
        for sent_i in range(len(sents_gold)):
            new_sent = [w for w in sents_gold[sent_i] if w.isalpha()]
            sents_gold[sent_i] = new_sent

        text = gutenberg.raw(fileid)
        sents_obtained = split_text(text)
        copy_sents_obtained = sents_obtained.copy()
        for sent_i in range(len(sents_obtained)):
            new_sent = [w.group() for w in re.finditer(r'\w+', sents_obtained[sent_i])
                        if w.group().isalpha()]
            sents_obtained[sent_i] = new_sent

        c_common = 0
        for sent in sents_obtained:
            if sent in sents_gold:
                c_common += 1

        p, r, f1 = get_prf(c_common, len(sents_obtained), len(sents_gold))
        print('\n\n', fileid)
        print('Precision: {:0.2f}, Recall: {:0.2f}, F1: {:0.2f}'.format(p, r, f1))
        ps.append(p)
        rs.append(r)
        f1s.append(f1)

    print('\n\nPrecision stats: {:0.3f} +- {:0.4f}'.format(np.mean(ps), np.std(ps)))
    print('Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(rs), np.std(rs)))
    print('F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(f1s), np.std(f1s)))
    print(len(f1s))

    good_ps = [p for p in ps if p >= 0.8]
    good_rs = [r for r in rs if r >= 0.8]
    good_f1s = [f1 for f1 in f1s if f1 >= 0.8]
    print('\n Good precision stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_ps), np.std(good_ps)))
    print('Good Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_rs), np.std(good_rs)))
    print('Good F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_f1s), np.std(good_f1s)))
    print(len(good_f1s))
def access():
    monty[0]
    monty[3]
    monty[5]
    monty[-1]
    sent = 'colorless green ideas sleep furiously'
    for char in sent:
        print char,

    from nltk.corpus import gutenberg
    raw = gutenberg.raw('melville-moby_dick.txt')
    fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
    fdist.keys()
def load_hamlet():
    """
    Loads the contents of the play Hamlet into a string.

    Returns
    -------
    str
        The one big, raw, unprocessed string.

    Example
    -------
    >>> document = load_hamlet()
    >>> document[:80]
    '[The Tragedie of Hamlet by William Shakespeare 1599]\n\n\nActus Primus. Scoena Prim'
    """
    return gutenberg.raw("shakespeare-hamlet.txt")
def mean_len():
    a = []
    d = 1
    for fid in gutenberg.fileids():
        b = 0
        c = 0
        st = gutenberg.raw(fid)
        stl = re.split("\n|\.|\!|\?", st)
        stw = re.split("\n|\.|\!|\?| |,| - ", st)
        for el in stl:
            b += len(el) * (1.0) / len(stl)
        for el in stw:
            c += len(el) * (1.0) / len(stw)
        print(fid)
        print("Mean sentence length: " + str(b))
        print("Mean word length: " + str(c))
def get_moby_dick_document():
    moby_dick = gutenberg.raw('melville-moby_dick.txt')
    document = Document(
        url='melville-moby_dick.txt',
        name='Moby dick',
        text=moby_dick,
        month='Oct',
        year=1851
    )

    # document uses a tokenizer function to create tokens; since we need to enforce
    # only_alphanum and clean_punct we need a wrapper
    def tokenizer_wrapper(raw_text):
        return map(str.lower,
                   map(str, tokenize(raw_text, only_alphanum=True, clean_punctuation=True)))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
def gutenberg():
    emma = nltk.corpus.gutenberg.words('austen-emma.txt')
    print len(emma)
    print gutenberg.fileids()
    emma = gutenberg.words('austen-emma.txt')

    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_sentences[1037]
    longest_len = max([len(s) for s in macbeth_sentences])
    [s for s in macbeth_sentences if len(s) == longest_len]

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
def sentenceTokenization():
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    mySentenceTokenizer = nltk.sent_tokenize

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    sample_text = 'We will discuss briefly about the basic syntax, structure and design philosophies. There is a defined hierarchical syntax for Python code which you should remember when writing code! Python is a really powerful programming language!'
    sentences_sample = mySentenceTokenizer(text = sample_text)
    print( '\nTotal number of sentences in sample_text: ' + str(len(sentences_sample)) )
    print( '\nSample sentences:' )
    print( sentences_sample )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    alice = gutenberg.raw(fileids = 'carroll-alice.txt')
    print( "\n### len(alice), total number of characters: " + str(len(alice)) )
    print( "\n### First 1000 characters of carroll-alice.txt:\n" )
    print( alice[0:1000] )

    sentences_alice = mySentenceTokenizer(text = alice)
    print( '\nTotal number of sentences in Alice: ' + str(len(sentences_alice)) )
    print( '\nFirst 5 sentences in Alice:' )
    for temp_sentence in sentences_alice[0:5]:
        print( "\n### ~~~~~~~~~~ ###\n" + temp_sentence )
    print( "\n### ~~~~~~~~~~ ###" )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    text_german = europarl_raw.german.raw(fileids = "ep-00-01-17.de")
    print( "\n### len(German text), total number of characters: " + str(len(text_german)) )
    print( "\n### First 1000 characters of ep-00-01-17.de (German text):\n" )
    print( text_german[0:1000] )

    sentences_german = mySentenceTokenizer(text = text_german, language = "german")
    print( '\nTotal number of sentences in German text: ' + str(len(sentences_german)) )
    print( '\nFirst 5 sentences in German text:' )
    for temp_sentence in sentences_german[0:5]:
        print( "\n### ~~~~~~~~~~ ###\n" + temp_sentence )
    print( "\n### ~~~~~~~~~~ ###" )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
def create_random_statements(count=50):
    """
    This function scans the ``nltk`` Project Gutenberg dataset, extracts
    random sentences containing some form of "it is" and tags them with a
    random tag.

    NB: This thing can take a while.
    """
    created_count = 0
    tags = Tag.objects.order_by("?")
    gutenberg_files = gutenberg.fileids()
    random.shuffle(gutenberg_files)
    for file_name in gutenberg_files:
        exists, not_exists = extract.from_text(gutenberg.raw(file_name))
        for sentence in [_linebreak.sub(" ", s) for s in exists]:
            if created_count == count:
                break
            statement = Statement(text=sentence, tag=random.choice(tags))
            try:
                statement.save()
                created_count += 1
                transaction.commit()
            except IntegrityError:
                transaction.rollback()
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

sample_text = gutenberg.raw('bible-kjv.txt')
tok = sent_tokenize(sample_text)
print(tok[5:15])
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg
import json
import csv

print "* Loading corpus"
#raw = gutenberg.raw('melville-moby_dick.txt')
#raw = gutenberg.raw('bible-kjv.txt')
raw = gutenberg.raw('blake-poems.txt')

print "* Tokenizing"
tokens = nltk.word_tokenize(raw)

print "* Tagging parts of speech"
# Save this to strip articles later
parts_of_speech = nltk.pos_tag(tokens)

print "* Converting POS list into a dict for lookup"
# TODO -- fix this. this is going to f**k up on homonyms
parts_of_speech = dict(parts_of_speech)

# You can ban other parts of speech by adding their tags to this list.
# You can find out what the part-of-speech tags mean by using code like
# this:
# >>> print nltk.help.upenn_tagset('DT')
# DT: determiner
#     all an another any both del each either every half la many much nary
#     neither no some such that the them these this those
banned_parts_of_speech = [
    'DT',
sorted([w for w in set(text1) if w.endswith('ableness')])
[w.upper() for w in text1]
for word in ['Call', 'me', 'Ishmael', '.']:
    print word

# Access the corpora
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")

from nltk.corpus import gutenberg
gutenberg.fileids()
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid

# Sentence segmentation
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
longest_len = max([len(s) for s in macbeth_sentences])

# Web chat corpus
from nltk.corpus import webtext
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

from nltk.corpus import brown
brown.categories()
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize
from pprint import pprint as pp

sample = gutenberg.raw('bible-kjv.txt')
sentences = sent_tokenize(sample)
pp(sentences[0:10])
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk import sent_tokenize, word_tokenize
from operator import itemgetter

text = gutenberg.raw('chesterton-thursday.txt')
nltk_sents = sent_tokenize(text)  # contains the list of sentences detected from the tool
nltk_words = word_tokenize(text)

tokens = nltk_words
uni_freq = FreqDist(tokens)
x = FreqDist()
y = FreqDist()
Bigram_count = 0
for line in nltk_sents:
    w = word_tokenize(line)
    for window in ngrams(w, 3, pad_right=True):
        p = window[0]
        if p is None:
            continue
        for p1 in window[1:]:
            if p1 is not None:
                Bigram_count = Bigram_count + 1
                x[p, p1] = x[p, p1] + 1
                y[p] = y[p] + 1
                y[p1] = y[p1] + 1

ct = 0
coll = []
for k, v in x.items():
import nltk
import math
from nltk.corpus import gutenberg
from pattern.en import *

text = gutenberg.raw('austen-emma.txt')
#pprint(parse(text, chunks=False, tags=False).split())
pattern_words = parse(text, chunks=False, tags=False).split()
pattern_sent = tokenize(text)
#print pattern_words

tokens = pattern_words
l = []
for token in tokens:
    for i in token:
        for j in i:
            l.append(j.lower())
tokens = l
tokens = [token.lower() for token in tokens if len(token) > 1]
#dictn = list(set(tokens))
r = ' '.join(tokens)
dictn = list(set(tokens))

uni_tokens = ngrams(r, n=1)
bi_tokens = ngrams(r, n=2)
tri_tokens = ngrams(r, n=3)
uni_fdist = nltk.FreqDist(uni_tokens)
uni_freq = 0
#!/usr/bin/python
"""Just a testing program for the NLTK library.

NLTK is an NLP library for Python. Some kick-ass library this is. :)

Pre-requisites: the NLTK library installed, plus the additional data downloaded
using its download command. You can use the "Natural Language Processing with
Python" book from O'Reilly for further details.

This program prints some statistics for the corpus (a large compiled
collection of text files).
"""
import nltk
from nltk.corpus import gutenberg

for fid in gutenberg.fileids():
    nchars = len(gutenberg.raw(fid))
    nwords = len(gutenberg.words(fid))
    nsents = len(gutenberg.sents(fid))
    nvocab = len(set(w.lower() for w in gutenberg.words(fid)))
    print "%s %s %s %s" % (str(int(nchars / nwords)), str(int(nwords / nsents)),
                           str(int(nwords / nvocab)), fid)
import nltk
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
len(emma)
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance('surprize')

# another way to do this
from nltk.corpus import gutenberg
gutenberg.fileids()
emma = gutenberg.words('austen-emma.txt')

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    # avg word & sentence length and the diversity of words
    print (int(num_chars/num_words), int(num_words/num_sents))

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences  # load sentences of Macbeth
macbeth_sentences[1037]
longest_len = max([len(s) for s in macbeth_sentences])
[s for s in macbeth_sentences if len(s) == longest_len]  # find longest sentence

from nltk.corpus import webtext
for fileid in webtext.fileids():
    print (fileid, webtext.raw(fileid)[:65], '...')
# Lemmatizing
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cats"))             # cat
print(lemmatizer.lemmatize("cacti"))            # cactus
print(lemmatizer.lemmatize("geese"))            # goose
print(lemmatizer.lemmatize("python"))           # python
print(lemmatizer.lemmatize("better", pos="a"))  # good
print(lemmatizer.lemmatize("run", 'v'))         # run

# Importing any file from nltk.data
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize
sample = gutenberg.raw("bible-kjv.txt")
tok = sent_tokenize(sample)

# Using WordNet to get synonyms, meanings, examples and antonyms of words
from nltk.corpus import wordnet
syns = wordnet.synsets("program")
print(syns)                        # will give all the synsets
print(syns[0].lemmas()[0].name())  # will give the first synonym
print(syns[0].definition())        # will give the dictionary meaning of the synonym
print(syns[0].examples())          # will give some example sentences using that synonym

synonyms = []
antonyms = []