"""Report KNB corpus statistics for words read from stdin.

Reads one whitespace-separated line from stdin and prints a JSON object
containing the corpus vocabulary size, the total token count, and the
KNB-corpus frequency of each input word.
"""
import json

from nltk import FreqDist
from nltk.corpus import knbc

# Frequency distribution over every token of the KNB (Kyoto/NAIST blog) corpus.
dist = FreqDist(knbc.words())
# Total number of tokens; FreqDist.N() is the sum of all sample counts and
# replaces the original manual loop over dist.items().
total_count = dist.N()

if __name__ == "__main__":
    line = input()
    out = {"corpus_size": len(dist), "total_count": total_count}  # |C|
    # NOTE: split(" ") (not split()) is kept deliberately — consecutive
    # spaces produce empty-string lookups, matching the original behavior.
    for word in line.split(" "):
        out[word] = dist[word]  # FreqDist returns 0 for unseen words
    print(json.dumps(out))
# out.write(line + ',' + tagged + '\n') if __name__ == '__main__': with open('word_list.csv', newline='', encoding='utf-8') as csvfile: reader = csv.reader(csvfile) for row in reader: kansai = row[0].replace('〜', '') standard = row[1].replace('〜', '').split('・') if not kansai in dictionary: dictionary[kansai] = standard total = 0 prev = 'BOS' for word in knbc.words(): bg = bigram(prev, word) if bigram_freq_given(bg) == 0: frequency_bigram[bg] = 1 else: frequency_bigram[bg] = frequency_bigram[bg] + 1 if word_freq(word) == 0: frequency_word[word] = 1 else: frequency_word[word] = frequency_word[word] + 1 prev = word # total += 1 # print(word + ', ' + str(frequency_word[word])) prev = 'BOS' for word in jeita.words(): bg = bigram(prev, word)
# JPop Band Name and Hits Generator
# aka. chickenberry
# Jonisha McKiddy | Julie Evans

### import / corpus ###
import re
import nltk
from nltk.corpus import knbc
from nltk.corpus import PlaintextCorpusReader

jc = knbc.words()            # KNB Japanese corpus tokens
c = nltk.corpus.words.words()  # English word list

# portal=r"C:\Users\JuJuBee Marie\Google Drive\linguistics\comp ling\chickenberry"
# Raw string: with the r-prefix a single backslash is literal, so the doubled
# '\\' of the original (which produced a double backslash in the path) and the
# invalid '\s' escape are both avoided.
corpus_root = r"C:\Users\ses71_000\Desktop\programming"
pc = PlaintextCorpusReader(corpus_root, 'portal 12text.txt')
# cfd_pc=nltk.ConditionalFreqDist(nltk.bigrams(pc))
# cpd_pc=nltk.ConditionalProbDist(cfd_pc, nltk.MLEProbDist)

### dictionary ###
# Raw string fixes the invalid '\s', '\D', '\e' escape sequences of the
# original literal while producing the same path.
f = open(r'C:\Users\ses71_000\Desktop\edict.csv', encoding='utf-8')
# def searchable(w):
#     [word for line in f for word in line.split()]
#     print(w, word)
# srch=[word for word in f for word in line.split()]
read = f.readlines()
char = {}
for line in read:
    # BUG FIX: str.replace returns a new string; the original call
    # `str.replace(l, ',,,…', '')` discarded its result, so the run of
    # commas was never actually removed.
    l = line.replace(',,,,,,,,,,,,,,,,,,,,', '')
    # Split the EDICT line on '|' separators. The original pattern
    # r'\||\|' was a redundant alternation of the same branch.
    p = re.split(r'\|', l)
"""Frequency exploration of the JEITA and KNB Japanese corpora."""
import nltk
from nltk.corpus import jeita
from nltk.corpus import knbc

# Wrap each corpus's token stream in an nltk.Text so the analysis helpers
# (FreqDist construction, Text.similar) are available.
jfull_t = nltk.Text(jeita.words())
kfull_t = nltk.Text(knbc.words())

# Build both frequency distributions up front.
fdist_jfull = nltk.FreqDist(jfull_t)
fdist_kfull = nltk.FreqDist(kfull_t)

# Report the 50 most frequent vocabulary items of each corpus.
print(fdist_jfull.most_common(50))
print(fdist_kfull.most_common(50))

# Words that occur in the same contexts (identical neighbours) as '人'.
# Text.similar prints its own output and returns None, so each line below
# also prints "None" — identical to the original behaviour.
print(jfull_t.similar("人"))
print(kfull_t.similar("人"))
# Vocabulary queries over the single-document text `jsingle_t` and its
# frequency distribution `fdist_j`, both defined earlier in this file.
print(
    "words in a0010.chasen with 3 or more characters, that appear 3 or more times:"
)
print(sorted(w for w in set(jsingle_t) if len(w) >= 3 and fdist_j[w] >= 3))
# Morphological/orthographic filters on the same vocabulary.
print("words in a0010.chasen ending in しい:")
print(sorted(w for w in set(jsingle_t) if w.endswith('しい')))
print("words in a0010.chasen starting with 見:")
print(sorted(w for w in set(jsingle_t) if w.startswith('見')))
print("words in a0010.chasen which contain 山:")
print(sorted(w for w in set(jsingle_t) if '山' in w))
print("words in a0010.chasen which contain 上 or 下:")
print(sorted(w for w in set(jsingle_t) if '上' in w or '下' in w))
# create NLTK texts for each corpus in full
jfull_t = nltk.Text(jeita.words())
kfull_t = nltk.Text(knbc.words())
# the frequency distribution tells us the frequency of each vocabulary item in the text
fdist_jfull = nltk.FreqDist(jfull_t)
print("50 most common words in the full JEITA corpus:")
print(fdist_jfull.most_common(50))
fdist_kfull = nltk.FreqDist(kfull_t)
print("50 most common words in the full KNB corpus:")
print(fdist_kfull.most_common(50))
# a collocation is a sequence of words that occur together unusually often
print(
    "collocations (words that occur together unusually often) in the JEITA corpus:"
)
print(jfull_t.collocations())
# NOTE(review): the chunk ends mid-statement; the argument of this call
# continues past the end of the visible source.
print(