def conditional_freq_distrubution():
    """Plot, per chat file, how many posts begin with 'sexy' or 'guy'.

    Conditions are the matched target words; samples are the first 10
    characters of each file id (the date/age-group prefix).
    Assumes module-level `nltk` and `nps` (nps_chat corpus) -- defined
    elsewhere in this file.
    """
    targets = ['sexy', 'guy']
    cfd = nltk.ConditionalFreqDist(
        (word, fileid[:10])
        for fileid in nps.fileids()
        for post in nps.words(fileid)
        for word in targets
        if post.lower().startswith(word)
    )
    cfd.plot()
def fun4():
    """Download the NPS Chat corpus, print one sample post and every file id.

    Fixed: the original used Python 2 `print` statements while the rest of
    this file uses the print() function; converted for Python 3.
    """
    from nltk.corpus import nps_chat
    # No-op if already downloaded; assumes `nltk` imported at module level.
    nltk.download('nps_chat')
    chatroom = nps_chat.posts('10-19-20s_706posts.xml')
    print(chatroom[123])
    for fileid in nps_chat.fileids():
        print(fileid)
def calculate_confidence_index():
    """Plot per-file counts of posts tagged 'ynQuestion' in the NPS Chat corpus.

    Conditions are the dialogue-act class; samples are the first 10 characters
    of the file id. Assumes module-level `nltk` and `nps` (nps_chat).
    """
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:10])
        for fileid in nps.fileids()
        for post in nps.xml_posts(fileid)
        for target in ['ynQuestion']
        if post.get('class') == 'ynQuestion'
    )
    cfd.plot()
    # Fixed: the implicitly-concatenated literals were missing a space
    # ("...functionof flagCount..."); dead commented-out C-style pseudo-code
    # (flagCount/timeElapsed branch) removed.
    print("Printing confidence index as a function "
          "of flagCount and timeElapsed")
def calculate_flags():
    """Tabulate, per chat file, how often posts start with any flagged token.

    Fixes over the original:
    - the CFD condition was the *whole token list* (`tokens`), which is
      unhashable, and the match used startswith(str(tokens)), which can
      essentially never succeed; each individual token is now its own
      condition and match target
    - removed unused local `flagNumber`
    - guard against an empty ConditionalFreqDist, which made
      cfd.tabulate() raise "max() arg is an empty sequence"

    NOTE(review): `flagList` must be a module-level string of
    whitespace-separated flag words -- confirm against the caller.
    """
    tokens = nltk.word_tokenize(flagList)
    cfd = nltk.ConditionalFreqDist(
        (flag, fileid[:10])
        for fileid in nps.fileids()
        for post in nps.words(fileid)
        for flag in tokens
        if post.lower().startswith(flag)
    )
    print("printing flagList " + str(tokens))
    print("cfd values: " + str(cfd.keys()))
    if cfd:  # tabulate() crashes on an empty distribution
        cfd.tabulate(cumulative=True)
def webtext():
    """Render webtext.html with web-text/chat file ids and the pirates script.

    The word/sentence/diversity figures are computed (as in the original)
    but not passed to the template.
    """
    from nltk.corpus import webtext as webtext
    from nltk.corpus import nps_chat

    file_ids = list(webtext.fileids())
    chat_file_ids = list(nps_chat.fileids())

    pirates = webtext.raw('pirates.txt')
    pirates_words = len(webtext.words('pirates.txt'))
    pirates_sents = len(webtext.sents('pirates.txt'))
    uniqs = len({w.lower() for w in webtext.words('pirates.txt')})
    lexical_diversity = lexical_div(uniqs, pirates_words)

    return render_template('webtext.html',
                           file_ids=file_ids,
                           chat_file_ids=chat_file_ids,
                           pirates=pirates)
import nltk
from nltk.corpus import nps_chat

nps_chat.fileids()
chatroom1 = nps_chat.fileids()[1]
chatroom1 = nps_chat.posts(chatroom1)

# Flatten every post (a token list) into one space-separated string.
chatwords = ' '.join(' '.join(post) for post in chatroom1)

# Tokenization.
chat_token = nltk.word_tokenize(chatwords)
print(chat_token)

# Lowercase; the alpha-only filter stays disabled, as in the original.
lower_chat = [w.lower() for w in chat_token]
#alpha_chat = [w for w in lower_chat if w.isalpha()]

# Stop-word removal (computed but not used downstream).
stopwords = nltk.corpus.stopwords.words('english')
stopped_chat = [w for w in lower_chat if w not in stopwords]

# Frequency table.
from nltk import FreqDist
fdist = FreqDist(lower_chat)
print("Top 50 words in NPS-chat corpus [1]:")
for pair in fdist.most_common(50):
    print(pair)

# Bigram frequency.
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(lower_chat)
scored = finder.score_ngrams(bigram_measures.raw_freq)
print("Top 50 biagram in NPS-chat corpus [1]:")
return precedents.most_common(num_precedents) def show_most_common_context(freq_diff_counter, first_category, second_category): print print "The following words are used more by " + first_category + " than by " + second_category for item in freq_diff_counter: word = item[0] count = item[1] first_precedents = most_common_precedents(word, first_category) second_precedents = most_common_precedents(word, second_category) print print first_category + " uses the word '" + word + "' " + str(count) + " more than " + second_category print first_category + " uses the word with the following words most " + str(first_precedents) print second_category + " uses the word with the following words most " + str(second_precedents) fileids_list = nps_chat.fileids() fid = {"20s":[], "30s":[], "40s":[],"adu":[],"tee":[]} for f_id in fileids_list: tag = f_id[6:9] fid[tag].append(f_id) young_and_old = {"young": fid["tee"]+fid["20s"] , "old": fid["30s"]+fid["40s"]+fid["adu"]} (young_word_freq, young_words) = word_freq("young", True) (old_word_freq, old_words) = word_freq("old", True) old_more_than_young = freq_diff(old_word_freq, young_word_freq) show_most_common_context(old_more_than_young, 'old', 'young')
import nltk
import numpy as np

# nltk.download('nps_chat')
from nltk import bigrams
from nltk.corpus import webtext

# Raw text of the first webtext file.
fx = webtext.raw(webtext.fileids()[0])

from nltk.corpus import nps_chat

# Posts of the first chat session; report how many there are.
chat = nps_chat.posts(nps_chat.fileids()[0])
print(len(chat))

# Strip carriage returns, then split into lines and each line into
# space-separated tokens (repeated spaces yield empty-string tokens).
fx = fx.replace("\r", "")
fxline = fx.splitlines()
fxcorpus = []
for line in fxline:
    fxcorpus.append(line.split(" "))


def generate_co_occurrence_matrix(corpus):
    # NOTE(review): this function is truncated in the visible chunk -- the
    # matrix initialisation and return are defined past this view.
    vocab = set(corpus)
    vocab = list(vocab)
    # Map each vocabulary word to its matrix index.
    vocab_index = {word: i for i, word in enumerate(vocab)}

    # Create bigrams from all words in corpus
    bi_grams = list(bigrams(corpus))

    # Frequency distribution of bigrams ((word1, word2), num_occurrences)
    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))

    # Initialise co-occurrence matrix
    # co_occurrence_matrix[current][previous]
def exercise_nps_chat():
    """Print every chat-room file name in the NPS Chat corpus.

    Each name encodes date, age group and post count, e.g.
    10-19-20s_706posts.xml = 706 posts collected Oct 19 in the 20s room.
    """
    for fid in nps_chat.fileids():
        print(fid)
print "The following words are used more by " + first_category + " than by " + second_category for item in freq_diff_counter: word = item[0] count = item[1] first_precedents = most_common_precedents(word, first_category) second_precedents = most_common_precedents(word, second_category) print print first_category + " uses the word '" + word + "' " + str( count) + " more than " + second_category print first_category + " uses the word with the following words most " + str( first_precedents) print second_category + " uses the word with the following words most " + str( second_precedents) fileids_list = nps_chat.fileids() fid = {"20s": [], "30s": [], "40s": [], "adu": [], "tee": []} for f_id in fileids_list: tag = f_id[6:9] fid[tag].append(f_id) young_and_old = { "young": fid["tee"] + fid["20s"], "old": fid["30s"] + fid["40s"] + fid["adu"] } (young_word_freq, young_words) = word_freq("young", True) (old_word_freq, old_words) = word_freq("old", True) old_more_than_young = freq_diff(old_word_freq, young_word_freq)
#!/usr/bin/python3 # coding: utf-8 from nltk.corpus import nps_chat ################################################################## ## 简单查看 print(type( nps_chat)) # <class 'nltk.corpus.reader.nps_chat.NPSChatCorpusReader'> print(len(nps_chat.fileids())) # 15 print( nps_chat.fileids() ) # ['10-19-20s_706posts.xml', '10-19-30s_705posts.xml', '10-19-40s_686posts.xml', '10-19-adults_706posts.xml', '10-24-40s_706posts.xml', '10-26-teens_706posts.xml', '11-06-adults_706posts.xml', '11-08-20s_705posts.xml', '11-08-40s_706posts.xml', '11-08-adults_705posts.xml', '11-08-teens_706posts.xml', '11-09-20s_706posts.xml', '11-09-40s_706posts.xml', '11-09-adults_706posts.xml', '11-09-teens_706posts.xml'] print(len(nps_chat.words('10-19-20s_706posts.xml'))) # 2829 print( nps_chat.words('10-19-20s_706posts.xml')[:10] ) # ['now', 'im', 'left', 'with', 'this', 'gay', 'name', ':P', 'PART', 'hey'] ################################################################## ## posts() chatroom = nps_chat.posts('10-19-20s_706posts.xml') print( chatroom[123] ) # ['i', 'do', "n't", 'want', 'hot', 'pics', 'of', 'a', 'female', ',', 'I', 'can', 'look', 'in', 'a', 'mirror', '.']
# 문서를 word 단위로 읽어온다. word = webtext.words('firefox.txt') print(word) print("word 개수 = ", len(word)) # 문서를 문장 단위로 읽어온다. sentence = webtext.sents('firefox.txt') for i in range(5): print(sentence[i]) print("문장 개수 = ", len(sentence)) # NPS Chat 데이터 # http://faculty.nps.edu/cmartell/npschat.htm from nltk.corpus import nps_chat nltk.download('nps_chat') # Chat 코퍼스의 파일 ID를 조회한다. textId = nps_chat.fileids() print(textId) # 특정 Chat session의 텍스트 문서를 조회한다. text = nps_chat.raw('10-19-20s_706posts.xml') print(text[:2000]) print("문자 개수 = ", len(text)) # XML의 post 데이터를 읽는다 chatroom = nps_chat.posts('10-19-20s_706posts.xml') for chat in chatroom[:20]: print(chat)
def exercise_nps_chat():
    """Print every chat-room file name in the NPS Chat corpus.

    Each name encodes date, age group and post count, e.g.
    10-19-20s_706posts.xml = 706 posts collected Oct 19 in the 20s room.

    Fixed: Python 2 `print file_id` statement -> print() function, matching
    the other (Python 3) version of this function in the file.
    """
    for file_id in nps_chat.fileids():
        print(file_id)
import pprint

# Python 2 script: walks several bundled corpora and prints basic stats.

# --- Gutenberg: file ids plus raw/word/sentence counts ---
print "****** gutenberg"
from nltk.corpus import gutenberg
print gutenberg.fileids()
print "raw: ", len(gutenberg.raw())
print "words: ", len(gutenberg.words())
print "sents: ", len(gutenberg.sents())

# --- Webtext ---
print "****** webtext"
from nltk.corpus import webtext
print len(webtext.raw('firefox.txt'))

# --- NPS chat: posts of one session ---
print "****** nps_chat"
from nltk.corpus import nps_chat
print nps_chat.fileids()
cr=nps_chat.posts('10-19-20s_706posts.xml')
print cr

# --- Brown, plus quick looks at reuters/inaugural/udhr ---
print "****** brown"
from nltk.corpus import brown
nt=brown.words(categories='news')
print nt
from nltk.corpus import reuters
from nltk.corpus import inaugural
# NOTE(review): bare `nltk` is never imported in this view -- the lines
# below assume a module-level `import nltk` elsewhere.
print [w for w in nltk.corpus.udhr.fileids() if 'heb' in w.lower()]
print nltk.corpus.brown.readme()
print nltk.corpus.brown.words()[1:10]
def __init__(self):
    """Initialize the source: cache all nps_chat file names and raw texts."""
    self.number_id = 29
    self.source_id = "nps_chat"
    # One title per corpus file, and the raw XML text for each.
    self.titles = list(nps_chat.fileids())
    self.data = [nps_chat.raw(title) for title in self.titles]
print(' '.join(longest_sent[0])) print(longest_sent) # 1.2. 网络文本 和 聊天文本 from nltk.corpus import webtext for fileid in webtext.fileids(): print(fileid, webtext.raw(fileid)[:65], '...') from nltk.corpus import nps_chat chatroom = nps_chat.posts('10-19-20s_706posts.xml') print(chatroom) print(' '.join(chatroom[123])) for fileid in nps_chat.fileids(): print(fileid, ' '.join(nps_chat.posts(fileid)[123])) # 1.3. Brown(布朗)语料库:用于研究文体之间的系统性差异(又叫文体学研究) from nltk.corpus import brown print(brown.categories()) brown_news_words = brown.words(categories='news') print(brown_news_words) brown_cg22_words = brown.words(fileids='cg22') brown_sents = brown.sents(categories=['news', 'editorial', 'reviews']) print(brown_sents) fdist = nltk.FreqDist([w.lower() for w in brown_news_words]) modals = ['can', 'could', 'may', 'might', 'must', 'will'] for m in modals:
10-19-40s_686posts.xml 10-19-adults_706posts.xml 10-24-40s_706posts.xml 10-26-teens_706posts.xml 11-06-adults_706posts.xml 11-08-20s_705posts.xml 11-08-40s_706posts.xml 11-08-adults_705posts.xml 11-08-teens_706posts.xml 11-09-20s_706posts.xml 11-09-40s_706posts.xml 11-09-adults_706posts.xml 11-09-teens_706posts.xml ''' # putting all tagged posts from the nps_chat corpus into one list nps_chat_tagged = list() for fileid in nps_chat.fileids(): print fileid for post in nps_chat.tagged_posts(fileid): nps_chat_tagged.append(post) print str(len(nps_chat_tagged)) print nps_chat_tagged[0] # tags can be retrieved in the same way as the Brown corpus
from nltk.corpus import nps_chat as nps

# NPS_CHAT can be found in: https://catalog.ldc.upenn.edu/LDC2010T05
# but is a charged service - buaa buaa buaa
caminho = 'C:\\Users\\theone\\Documents\\FATEC\\PROJETO TG1\\PJ_FINAL\\'

# Export each chat session to a plain-text file, one post per line,
# skipping system/meta posts (ACTION/JOIN/PART).
# Fixed: open(..., 'a') + truncate() truncated at the current file position
# and the handle was not closed on error; 'w' in a `with` block empties the
# file on open and always closes it. Manual counter replaced by enumerate.
for i, fid in enumerate(nps.fileids()):
    out_name = 'arqNPS_CHAT-' + str(i) + '--' + fid[:-4] + '.txt'
    print('CREATING FILE: ' + out_name + '\n')
    with open(caminho + out_name, 'w') as arq:
        for post in nps.posts(fid):
            line = ' '.join(post).rstrip()
            if 'ACTION' in line or 'JOIN' in line or 'PART' in line:
                continue
            arq.write(line + '\n')
#---------------------------------
##CHAPTER 2:Accessing Text Corpora
import nltk

#print(nltk.corpus.gutenberg.fileids())  # file names bundled with gutenberg

# Load Emma and wrap it as nltk.Text so the book-style helpers work on it.
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
#print(len(emma))
emma = nltk.Text(emma)
print(emma.concordance('surprise'))
print(' '.join(emma[20:50]))  # LIST to STRING - comes out as text

# A sampler of other bundled corpora: print each one's file ids.
from nltk.corpus import webtext   # less formal text
print(webtext.fileids())
from nltk.corpus import nps_chat  # predators
print(nps_chat.fileids())
from nltk.corpus import brown     # brown uni various texts
print(brown.fileids())
from nltk.corpus import reuters
print(reuters.fileids())
from nltk.corpus import inaugural
print(inaugural.fileids())
# page 72 for a variety of corpus functionality commands

##SPACY SECTION - DataCamp course code collection, starting with 'Feature Engineering for NLP'
import spacy
# Importing modules with datasets within nltk.corpus
from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown
from nltk.corpus import reuters

# File ids available in each corpus.
print('Printing the file IDs for each module...\n')
for label, corpus in (('gutenberg', gutenberg),
                      ('webtext', webtext),
                      ('nps_chat', nps_chat),
                      ('brown', brown),
                      ('reuters', reuters)):
    print(label + ':\n', corpus.fileids())

# Categories per corpus (gutenberg, webtext and nps_chat define none).
print('Printing the categories for each module, if available...\n')
print('brown:\n', brown.categories())
print('reuters:\n', reuters.categories())

# Full raw text of one sample file (TXT corpora expose .raw()).
print('Accessing the sample files...')
print('gutenberg:\n', gutenberg.raw("austen-emma.txt"))

# Sentence segmentation for text corpora; posts for the chat corpus.
print('Getting a list of sentences...')
print('List of sentences from austen-emma.txt:\n', gutenberg.sents("austen-emma.txt"))
print('List of sentences from a chat:\n', nps_chat.posts("10-19-20s_706posts.xml"))
def make_sessions():
    """Yield one parsed session object per file in the nps_chat corpus."""
    for fid in nps_chat.fileids():  # @UndefinedVariable
        yield nps_data.fromxml(nps.get_session(fid), fid)