def get_char_counts(openfile):
    # Count every character in the open file, keeping only letters and whitespace.
    char_counts = sorted(
        collections.Counter(c for l in openfile for c in l).items())
    counts = dict()
    for char, count in char_counts:
        if char.isalpha() or char.isspace():
            counts[char] = float(count)
    counts[' '] = brown.raw().count(' ')
    corpus_size = float(len(brown.raw()))
    return corpus_size, counts
Example #2
def process_brown():    
    tokenizer = RegexpTokenizer(r'\w+')
    brown_toks = tokenizer.tokenize(brown.raw()[:50000])
    brown_toks = list(set(brown_toks))
    brown_toks = map(lambda x: x.lower(), brown_toks)
 
    return brown_toks
Example #3
def brownFreqListNoStop():
    # Obtain the list of words
    brown_words = brown.raw().split(' ')
    englishstop = stopwords.words('english')
    filtered_words = [w for w in brown_words if not w in englishstop]

    num_filtered_words = len(filtered_words)
    print "We have " + str(num_filtered_words) + " brown filtered words"
    counter = 0

    brown_frequ = defaultdict(int)
    sleep(2)
    for word in filtered_words:
        counter += 1
        brown_frequ[word] += 1
        if counter % 1000 == 0:
            print "Progress : " + str(
                (counter / float(num_filtered_words)) * 100) + " %"

    brown_frequ = sorted(brown_frequ.values(), reverse=True)
    brown_rank = np.array(xrange(1, len(brown_frequ) + 1))

    c, alpha = powerLaw(brown_frequ, brown_rank)
    print "According to Zipf's law %.2f should be close to 1." % alpha
    plotPowerLaws(
        brown_rank,
        brown_frequ, [c, c], [-1, -alpha],
        title=
        "Relation between word rank and frequency for brown, no stop words",
        xlabel="Word Rank",
        ylabel="Word Frequency")

    return 0
Example #4
def brownFreq():
    # Obtain the list of words
    brown_words = brown.raw().split(' ')

    num_brown_words = len(brown_words)
    print "We have " + str(num_brown_words) + " brown words"
    counter = 0

    brown_frequ = defaultdict(int)
    sleep(2)
    for word in brown_words:
        counter += 1
        brown_frequ[word] += 1
        if counter % 1000 == 0:
            print "Progress : " + str(
                (counter / float(num_brown_words)) * 100) + " %"

    brown_frequ = sorted(brown_frequ.values(), reverse=True)
    brown_rank = np.array(xrange(1, len(brown_frequ) + 1))

    c, alpha = powerLaw(brown_frequ, brown_rank)
    plotPowerLaws(brown_rank,
                  brown_frequ, [c, c], [-1, -alpha],
                  title="Relation between word rank and frequency for brown",
                  xlabel="Word Rank",
                  ylabel="Word Frequency")

    return 0
Example #5
def Main():
    db = Database()
    index = InvertedIndex(db)
    brown_list = brown.fileids()
    gutenberg_list = gutenberg.fileids()
    # document1 = {
    #     'id': '1',
    #     'text': 'The big sharks of Belgium drink beer.'
    # }
    # document2 = {
    #     'id': '2',
    #     'text': 'Belgium has great beer. They drink beer all the time.'
    # }
    i = 0
    for item in brown_list:
        documentTemp = {'id': str(i), 'text': brown.raw(item)}
        index.index_document(documentTemp)
        i += 1  # give each document a distinct id

    for item in gutenberg_list:
        documentTemp = {'id': str(i), 'text': gutenberg.raw(item)}
        index.index_document(documentTemp)
        i += 1

    while True:
        search_term = input("Enter term(s) to search: ")
        result = index.lookup_query(search_term.lower())
        for term in result.keys():
            for appearance in result[term]:
                # Belgium: { docId: 1, frequency: 1}
                document = db.get(appearance.docId)
                print(highlight_term(appearance.docId, term, document['text']))
            print("-----------------------------")
Example #6
def select_genres(n):
    '''
    Selects genres with more than n files. Returns raw data and the genre of each file
    in the selected genres as two 1d numpy arrays.
    
    Parameters
    ----------
    n: An integer.
    
    Returns
    -------
    A tuple of (raw, genres)
    raw: A 1d numpy array.
    genres: A 1d numpy array.
    '''
    genres = []
    raw = []
    #Creates arrays of the genres and raw data for genres with more than n files
    for file in brown.fileids():

        for k in brown.categories(file):

            if len(brown.fileids(k)) > n:
                genres.append(k)
                raw.append(brown.raw(file))

    return raw, genres
Example #7
File: xor.py  Project: amlweems/zngnfnab
def _init():
    global total
    global char_counts
    if total == 256:
        from nltk.corpus import brown
        for char in brown.raw():
            char_counts[ord(char)] += 1
        total = float(sum(char_counts))
Example #8
 def __init__(self):
     self.words = list(word.lower() for word in brown.words())
     #self.words= brown.words()
     #self.text=nltk.Text(word.lower() for word in nltk.corpus.brown.words())
     self.tagged_words = brown.tagged_words()
     self.tagfreq = {}
     self.raw = brown.raw()
     """
Example #9
File: 3-29.py  Project: jbbe/lang
def ari(cat):
    """Accept text as list of words"""
    num_chars = len(brown.raw(categories=cat))
    num_words = len(brown.words(categories=cat))
    num_sents = len(brown.sents(categories=cat))

    avg_word_len = num_chars / num_words
    avg_sent_len = num_words / num_sents

    return avg_word_len * 4.71 + avg_sent_len * 0.5 - 21.43
Example #10
 def learn(self, listofsentences=[], n=126733):
     self.bf = BloomFilter(1090177, 4)
     i = 0
     for sent in brown.sents():  # brown.raw() would iterate characters, not sentences
         if i >= n:
             break
         for word in sent:
             self.bf.Insert(word.lower())
             i += 1
             
     self.bf.PrintStats()
Example #11
def ch03_29_reading_difficulty():
  sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
  from nltk.corpus import brown
  for category in brown.categories():
    raw = brown.raw(categories=category)
    words = len(brown.words(categories=category))
    sentences = len(sent_tokenizer.tokenize(raw))
    letters_per_word = (len(raw) - words) / float(words)  # raw length minus one space per word
    words_per_sentence = words / float(sentences)
    reading_level = (4.71 * letters_per_word) + (0.5 * words_per_sentence) - 21.43
    print category, reading_level
Example #12
def readBrownDataset():
    nltk.download("brown")
    documents = brown.fileids()
    docs = []
    for doc in documents:
        if len(brown.categories(doc)) == 1:
            d = brown.raw(doc).replace("\n", " ")
            d = re.sub(
                r"/[A-Za-z0-9_-]+ ", " ", d
            )  #The/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn") #.replace("/at","").replace("/nn-tl","").replace("/nn-hp","").replace("/np-hl","").replace("/nn","").replace("/vbd","").replace("/in","").replace("/jj","").replace("/hvz","").replace("/cs","").replace("/nps","").replace("/nr","").replace("/np-tl","").replace("/md","").replace("/np","").replace("/cd-hl","").replace("/vbn","").replace("/np-tl","").replace("/dti","").replace("--/--","")
            docs.append(d)
    return docs
Example #13
def readability(input):
	letters = brown.raw(categories=input)
	words = brown.words(categories=input)
	sentences = brown.sents(categories=input)
	
	letters_per_word = len(letters) / len(words)
	words_per_sentence = len(words) / len(sentences)
	
	ari_score = (4.71 * float(letters_per_word)) + (0.5 * float(words_per_sentence)) - 21.43
	print("Letters per word: %s" % letters_per_word)
	print("Words per sentence: %s" % words_per_sentence)
	print("ARI Score: %s" % ari_score)
Example #14
def ch03_29_reading_difficulty():
    sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
    from nltk.corpus import brown
    for category in brown.categories():
        raw = brown.raw(categories=category)
        words = len(brown.words(categories=category))
        sentences = len(sent_tokenizer.tokenize(raw))
        letters_per_word = (len(raw) - words) / float(words)  # raw length minus one space per word
        words_per_sentence = words / float(sentences)
        reading_level = (4.71 * letters_per_word) + (0.5 * words_per_sentence) - 21.43
        print category, reading_level
Example #15
def demo(text=None):
    from nltk.corpus import brown
    from matplotlib import pylab
    tt = TextTilingTokenizer(demo_mode=True)
    if text is None: text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
def get_brown_data(useN=100):
    try:
        fileids = brown.fileids()
    except LookupError:
        import nltk
        nltk.download('brown')
        fileids = brown.fileids()
        
    fileids = fileids[:useN]
    texts = [brown.raw(fid) for fid in fileids]
    
    fileids = [os.path.splitext(fid)[0] for fid in fileids]
    
    return texts, fileids
def Automated_Readability_Index29(section):
    char_count = 0
    sent = len(brown.sents(categories=section))
    words = len(brown.words(categories=section))
    raw_text = brown.raw(categories=section)

    for ch in raw_text:
        if ch.isalpha():
            char_count = char_count + 1

    uw = char_count / float(words)
    us = words / float(sent)
    ARI = (4.71 * uw) + (0.5 * us) - 21.43
    return ARI
Example #18
def demo(text=None):
    from nltk.corpus import brown
    import pylab
    tt=TextTilingTokenizer(demo_mode=True)
    if text is None: text=brown.raw()[:10000]
    s,ss,d,b=tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)),b)
    pylab.legend()
    pylab.show()
def exercise29():
	'''
	Readability measures are used to score the reading difficulty of a text, for the purposes of selecting texts of appropriate difficulty for language learners.
	Let us define avgW to be the average number of letters per word, and avgSen to be the average number of words per sentence, in a given text.
	The Automated Readability Index (ARI) of the text is defined to be: 4.71 avgWord + 0.5 avgSen - 21.43.
	Compute the ARI score for various sections of the Brown Corpus, including section f (popular lore) and j (learned).
	Make use of the fact that nltk.corpus.brown.words() produces a sequence of words, while nltk.corpus.brown.sents() produces a sequence of sentences
	'''

	for category in brown.categories():
		chars = brown.raw(categories=category)
		words = brown.words(categories=category)
		sentences = brown.sents(categories=category)
		
		avgW = len(chars)/len(words) #average number of letters per word
		avgS = len(words)/len(sentences) #average number of words per sentence
		
		print category, "Avg Words", avgW, "Avg Sentences", avgS
		print("ARI", (4.71 * avgW ) + ( 0.5 * avgS ) - 21.43)
Example #20
def word_frequencies(contents):
    toktok = ToktokTokenizer()
    string_corpus = brown.raw()

    # Frequencies for each file
    list = []
    for file in contents.keys():
        print("Tokenising", file)
        tokenised = [
            toktok.tokenize(sent) for sent in sent_tokenize(string_corpus)
        ]
        fdist = Counter(chain(*tokenised))
        list.append(fdist)

    # Combine keys into one set, eliminating duplicates
    print("Making frequency distribution of all words that we care about.")
    keys = []
    for sublist in list:
        keys += sublist
    keys = set(keys)

    # Build combined frequency dict
    # Tuple of identifiers for connectives and other common words
    unwanted = ('at', 'to', 'in', 'ma', 'bez', 'ppss', 'pp$', 'dt', 'bedz',
                'hv', 'cc', 'cs', 'hvd', 'wdt', '*', 'bed', 'ber', 'be', 'np$',
                'ppo', 'pps', 'abn', 'cd', 'md', 'ben', 'ben', 'wps', 'vbd',
                'jj', 'rb', 'do', 'ql', 'dts', 'rp', 'in-tl', 'ex', 'i', 'dti',
                'dod', 'wrb', 'hvz', 'nn$')
    # This is far from the best way to do this, but I couldn't find the documentation for these identifiers (they are Brown POS tags; see the tagged-words sketch after this function)
    frequencies = {}
    for key in keys:
        total = 0
        if (key[0] not in string.punctuation) and (
                key.split('/')[-1]
                not in unwanted):  # Gets rid of unwanted tokens
            for sublist in list:
                if key in sublist.keys():
                    total += sublist[key]
            frequencies[key.split('/')[0].lower()] = total
    print("Total words (that we care about): " + str(len(frequencies.keys())))

    return frequencies
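# The hand-written 'unwanted' tuple above filters Brown part-of-speech tags by splitting each
# token on '/'. A possible alternative (an illustrative sketch, not the original author's code)
# is to use brown.tagged_words(), where NLTK already separates each word from its tag, so
# function words can be dropped by tag prefix instead of string surgery. The prefix list below
# is an assumption; adjust it to taste.
import string
from collections import Counter

from nltk.corpus import brown

# Tag prefixes for determiners, pronouns, conjunctions, prepositions, modals and
# forms of "be"/"have"/"do" (assumed list).
SKIP_TAG_PREFIXES = ('AT', 'TO', 'IN', 'PP', 'DT', 'BE', 'HV', 'CC', 'CS',
                     'WDT', 'WPS', 'WRB', 'MD', 'CD', 'ABN', 'QL', 'RP', 'EX', 'DO')

def tagged_word_frequencies():
    counts = Counter()
    for word, tag in brown.tagged_words():
        if word[0] in string.punctuation:
            continue
        if tag.upper().startswith(SKIP_TAG_PREFIXES):
            continue
        counts[word.lower()] += 1
    return counts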
def get_features(liste):
    features_file = {}
    for fileid in liste:
        features_file[fileid] = {}  # initialize this file's feature dict
        # Use our my_tools library to add word-length statistics
        words = brown.words(fileid)
        stats_mots = mt.get_stats_longueur(words)
        for feature, valeur in stats_mots.items():
            features_file[fileid][feature] = valeur
        # Then sentence statistics
        stats_phrases = mt.get_types_phrases(brown.raw(fileid))
        for feature, valeur in stats_phrases.items():
            features_file[fileid][feature] = valeur
        adverbes = mt.get_effectif_adverbes(words)
        for feature, valeur in adverbes.items():
            features_file[fileid][feature] = valeur
        # ... the input varies but the output is a dict {"feature_name": value, ...}
    print("-> Extracted features:",
          list(features_file[fileid].keys())[:20], "...")
    return features_file
def demo(text=None):
    '''
    use the boundary together with the pseudo sentences to evaluate the quality of segmentation.
    :param text:
    :return:
    '''
    from nltk.corpus import brown
    from matplotlib import pylab
    tt = TextTilingTokenizer(w=40, k=20, demo_mode=True)
    with open('flypaper_short.txt', 'r') as file:
        text = file.read()
    if text is None: text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    print(b)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
Example #23
def demo(text=None):
    from nltk.corpus import brown
    import pylab
    tt = TextTilingTokenizer(demo_mode=True)
    if text is None: text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
    """s = tt.tokenize(text)
    FILE = open("tiled","w")
    FILE.writelines(s)
    FILE.close()"""


# if __name__ == '__main__':
#     content = open('toTile', 'r').read()
#     demo(content)
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import gutenberg, webtext, nps_chat, brown, reuters, inaugural
import pandas as pd

# Pick out the first of these texts — Emma by Jane Austen — and give it a short name, gutenberg_raw
gutenberg_raw = gutenberg.raw("austen-emma.txt")

# Pick out the words from webtext corpus and give it a short name, webtext_words
webtext_words = webtext.words()
print(webtext_words)

# Pick out the text from nps_chat corpus and name it as nps_chat_raw
nps_chat_raw = nps_chat.raw()

# Pick out the text from brown corpus and name it as brown_raw
brown_raw = brown.raw()
print(brown_raw)

# Pick out the text from reuters corpus and name it as reuters_words
reuters_words = reuters.words()
print(reuters_words)

# Pick out the words from the inaugural corpus and name it as inaugural_words
inaugural_words = inaugural.words()
print(inaugural_words)

# Creating a variable for tokenizing words
tokenizer = RegexpTokenizer(r'\w+')

# Tokenizing the words in gutenberg corpus and assigning it to a variable named tokens
tokens = tokenizer.tokenize(gutenberg_raw)
Example #25
File: 03.py  Project: kouheiszk/nltk
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import brown
from nltk.corpus import webtext

brown.raw(fileids=["cm02"])
webtext.raw("firefox.txt")
Example #26
def main(argv):

  try:
    opts, args = getopt.getopt(argv,"o:v:c:",["ifile=","ofile="])
  except getopt.GetoptError:
    print ' [-o <datafile>] [-v <vocabfile>]'
    sys.exit(2)

  outputfile = "trainging.dat"
  vocabfile = "vocab.txt"
  corpus = "20newsgroups"

  for opt, arg in opts:
    if opt == '-o':
      outputfile = arg
    elif opt == '-v':
      vocabfile = arg
    elif opt == '-c':
      corpus = arg    

  tokenizer = RegexpTokenizer(r'[a-z]+')
  id_dict = {}
  nterms = 0;
  wordlist = []
  data = []

  if corpus == "20newsgroups":
     #categories = []
     #categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
     categories = ['talk.politics.guns','soc.religion.christian','sci.electronics','rec.sport.baseball','comp.graphics']

     if len(categories) == 0:   data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42).data
     else: data = fetch_20newsgroups(subset='train',categories=categories, shuffle=True, random_state=42).data
  elif corpus == "brown":
    for fileid in brown.fileids():
      data.append(brown.raw(fileid))
  elif corpus == "reuters":
    for fileid in reuters.fileids():
      data.append(reuters.raw(fileid))
  else:
    for dirname, dirnames, filenames in os.walk(corpus):
      for filename in filenames:
        inpfile = os.path.join(dirname,filename)
        with io.open(inpfile, "r", errors='ignore') as fp:
          data.append(fp.read())
          fp.close()

  stemmer = SnowballStemmer("english")

  with io.open(outputfile, "wb") as output:
    for i in range(len(data)):
        lines = data[i].split('\n')

        fd = {}
        termsdoc=0
        for line in lines:
          if line.startswith("From:") or line.startswith("Subject:") or line.startswith("Reply-To:") or line.startswith("Organization:") or line.startswith("Lines:") or line.lower().startswith("Nntp-Posting-Host:") or line.startswith("X-Newsreader:") or line.startswith("Distribution:") or line.startswith("Keywords:") or line.startswith("Article-I.D.:") or line.startswith("Supersedes:") or line.startswith("Expires:") or line.startswith("NNTP-Posting-Host:") or line.startswith("Summary:") or line.startswith("Originator:") : continue;
          line = line.lower()
          splits = tokenizer.tokenize(line)
          filtered_words = [word for word in splits if word not in stopwords.words('english')]
          filtered_words = [word for word in filtered_words if len(word) > 2]
          filtered_words = [word for word in filtered_words if word not in ["edu","com","subject","writes","mil", "subject"]]

          for word in filtered_words:

            try:
              id = id_dict[word]
            except KeyError:
              id_dict[word] = nterms
              id = nterms
              nterms = nterms+1
              wordlist.append(word)

            try:
              fd[id] = fd[id]+1
            except KeyError:
              fd[id] = 1
              termsdoc = termsdoc+1


        outline = str(termsdoc)         
        for idterm in fd:
          outline = outline+" "+str(idterm)+":"+str(fd[idterm])

        output.write(outline+"\n")
  output.close()
 
  output = open(vocabfile,"w")
  for val in wordlist:
    output.write(str(val)+"\n")
  output.close()
Example #27
    "Word rank inversely proportional to word frequency (Gutenberg)",
    "Word rank", "Word frequency")

for w in gutenWordsFiltered:
    gutenFilteredFreq[w] += 1
gutenFilteredFreq = sorted(gutenFilteredFreq.values(), reverse=True)
#   Filtered word ranks
gutenFilteredRank = numpy.array(xrange(1, len(gutenFilteredFreq) + 1))
c, a = powerLaw(gutenFilteredFreq, gutenFilteredRank)
plotPowerLaws(
    gutenFilteredRank, gutenFilteredFreq, [c, c], [-1, -a],
    "Word rank inversely proportional to word frequency (Gutenberg without stopwords)",
    "Word rank", "Word frequency")

#   Brown corpus
brownWords = brown.raw().split(" ")
#   Without stopwords
brownWordsFiltered = [w for w in brownWords if not w in stopWords]
#   Frequencies
brownFreq = defaultdict(int)
brownFilteredFreq = defaultdict(int)
for w in brownWords:
    brownFreq[w] += 1
brownFreq = sorted(brownFreq.values(), reverse=True)
#   Word ranks
brownRank = numpy.array(xrange(1, len(brownFreq) + 1))
c, a = powerLaw(brownFreq, brownRank)
plotPowerLaws(brownRank, brownFreq, [c, c], [-1, -a],
              "Word rank inversely proportional to word frequency (Brown)",
              "Word rank", "Word frequency")
"""

#! python
import nltk

from nltk.corpus import treebank
from nltk.corpus import brown
from nltk.corpus import nps_chat
from nltk.corpus import conll2000

import string
from sklearn.feature_extraction.text import TfidfVectorizer


#corpora 
brown = brown.raw()
nps_chat = nps_chat.raw()
conll2000 = conll2000.raw()
treebank = treebank.raw()

default = treebank
operational = brown

stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)


def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

Example #29
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import brown
import jieba

with open('flypaper_short.txt', 'r') as file:
    comments = file.read()

#segment the text with jieba
seg_list = jieba.cut(comments)
tokenized_comments = (" ".join(seg_list))
print(tokenized_comments)

#ttt = nltk.tokenize.TextTilingTokenizer(demo_mode=True)
ttt = nltk.tokenize.TextTilingTokenizer()
text = brown.raw()[:10000]
#print(text)
#gap_scores, smooth_scores, depth_scores, segment_boundaries = ttt.tokenize(tokenized_comments)
result = ttt.tokenize(tokenized_comments)
print(result)
'''
print(gap_scores)
print(smooth_scores)
print(depth_scores)
print(segment_boundaries)
'''
Example #30
import re
import string

from nltk.corpus import brown
from nltk.util import ngrams

from lib import constants
from lib import huffman_tree


# HuffmanTree object.
ht = huffman_tree.HuffmanTree()

# A dict with a bag of tags for each word.
words = {}
words_huffman_encoded = {}  # A dict with a bag of tags for each huffman encoded word.

# All raw text of brown corpus.
txt = brown.raw()

# Raw sentences from brown corpus.
tagged_sentences = txt.split('./.')

# Get all tags for all words
for tagged_sentence in tagged_sentences:
    tagged_sentence = tagged_sentence.strip()
    if re.match(r'[0-9 ]+$', tagged_sentence):
        continue
    for word in tagged_sentence.split():
        word = word.strip()
        word = word.split('/')
        if len(word) < 2:
            continue
        word[0] = str(word[0]).translate(string.maketrans("",""),
Example #31
# Module 3: Corpus
# Corpus structure challenge

from nltk.corpus import brown

# print(brown.fileids())

fileid = 'cl08'

# text = brown.words(fileid)
# print(text)

print(" Num of chars :", len(brown.raw(fileid)))
print(" Num of words :", len(brown.words(fileid)))
print(" Num of sentences :", len(brown.sents(fileid)))

print(" Categories:", brown.categories(fileid))
Example #32
# Jonathan Monreal

import re, nltk
from nltk.corpus import brown

raw = brown.raw(categories = 'humor')
tokens = re.findall(r'\s(wh[\w]+)', raw)

for word in tokens:
    print word
Example #33
len(brown.fileids()) # 500 sources, each file is a source.


# In[ ]:


print(brown.fileids()[:100]) # First 100 sources.


# You can access the raw files with:

# In[ ]:


print(brown.raw('cb01').strip()[:1000]) # First 1000 characters.


# <br>
# You will see that **each word comes with a slash and a tag**, and unlike normal text, **punctuation is separated from the word that comes before it**, e.g.
# 
# > The/at General/jj-tl Assembly/nn-tl ,/, which/wdt adjourns/vbz today/nr ,/, has/hvz performed/vbn in/in an/at atmosphere/nn of/in crisis/nn and/cc struggle/nn from/in the/at day/nn it/pps convened/vbd ./.
# 
# <br>
# And we also see that **each sentence is separated by a newline**:
# 
# > There/ex followed/vbd the/at historic/jj appropriations/nns and/cc budget/nn fight/nn ,/, in/in which/wdt the/at General/jj-tl Assembly/nn-tl decided/vbd to/to tackle/vb executive/nn powers/nns ./.
# > 
# > The/at final/jj decision/nn went/vbd to/in the/at executive/nn but/cc a/at way/nn has/hvz been/ben opened/vbn for/in strengthening/vbg budgeting/vbg procedures/nns and/cc to/to provide/vb legislators/nns information/nn they/ppss need/vb ./.
# 
# <br>
Example #34
gutenberg.raw(fileid)
#Words : 
gutenberg.words(fileid)
#Sentence : 
gutenberg.sents(fileid)
from nltk.tokenize import sent_tokenize
tok = sent_tokenize(text)

for x in range(5):
    print(tok[x])
    
from nltk.corpus import brown
brown.categories()
text = brown.raw(categories='news')


import nltk
nltk.download('reuters')
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
fileid = 'test/16399'
text = reuters.raw(fileid)
text1=reuters.raw(categories='zinc')
reuters.categories(fileid)


import nltk
nltk.download('movie_reviews')
  #  ngrams_stats_bi_rev = pickle.load('ngrams_stats_bi_rev.pkl')
else: #initialise
    ngrams_stats_tri={}
    ngrams_stats_bi={}
    ngrams_stats_bi_rev={}
    ngrams_stats_tri_rev={}
'''
#class

ngrams_stats_tri = {}
ngrams_stats_bi = {}
ngrams_stats_bi_rev = {}
ngrams_stats_tri_rev = {}
vocab = Counter()
#choose sample
sample1 = brown.raw()
sample2 = gutenberg.raw()
sample3 = inaugural.raw()
sample5 = nltk.corpus.state_union.raw()
sample4 = genesis.raw('english-web.txt')
sample = sample1 + sample2 + sample3 + sample4 + sample5
vocab, ngrams_stats_tri, ngrams_stats_bi, ngrams_stats_tri_rev, ngrams_stats_bi_rev = mainTrain(
    vocab, sample, ngrams_stats_tri, ngrams_stats_bi, ngrams_stats_tri_rev,
    ngrams_stats_bi_rev)
'''
with open('ngrams_stats_tri.pkl', 'w') as hfile:
    pickle.dump(ngrams_stats_tri, hfile)
with open('ngrams_stats_bi.pkl', 'w') as hfile:
    pickle.dump(ngrams_stats_bi, hfile)
with open('ngrams_stats_tri_rev.pkl', 'w') as hfile:
    pickle.dump(ngrams_stats_tri_rev, hfile)
Example #36
#%%
from nltk.corpus import reuters

# %%
articles = [" ".join(reuters.words(f)) for f in reuters.fileids()]
with open('reuters.txt', 'w') as f:
    for article in articles:
        f.write(article)
        f.write('\n\n')

# %%
fileids = reuters.fileids()
with open('reuters.txt', 'w') as f:
    for file_id in reuters.fileids():
        f.write(reuters.raw(file_id))
        f.write('\n\n')

# %%
from nltk.corpus import brown
print(brown.raw(categories='learned'))
# %%
Example #37
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg
from nltk.corpus import brown

text = gutenberg.raw()
text_tokens = nltk.word_tokenize(text)
frecList_gutenberg = FreqDist(text_tokens)
text_brown = brown.raw()
text_brown_tokens = nltk.word_tokenize(text_brown)
frecList_brown = FreqDist(text_brown_tokens)
Example #38
import nltk

from nltk import *
with open("dracula.txt") as f:
	tokens = nltk.word_tokenize(f.read())

text = nltk.Text(tokens)

alpha_text = [word for word in text if word.isalpha() and len(word) > 5 and 
word[0].isupper() and word[1:].islower()]
print(FreqDist(alpha_text).most_common(5))

from nltk.corpus import reuters, brown
print(brown.categories())
fileid = brown.fileids(brown.categories()[-1])
raw = brown.raw(fileid)
print(raw[:50])

cfd = nltk.ConditionalFreqDist(
	(genre, word)
	for genre in brown.categories()
	for word in brown.words(categories=genre))


Example #39
File: ch2.py  Project: haidang92/csc577
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    avg_word_len = round(num_chars / num_words)
    avg_sent_len = round(num_words / num_sents)
    lexical_diversity = round(num_words / num_vocab)
    print(fileid, "  |  ", num_chars, "  |  ", num_words, "  |  ", num_sents,
          "  |  ", num_vocab, "  |  ", avg_word_len, "  |  ", avg_sent_len,
          "  |  ", lexical_diversity)

for fileid in webtext.fileids():
    print(fileid)

brown.categories()
brown.raw("cr09")

#stylistics - systematic differences between genres
# by use of modal verbs - [can could may might must will]
news_text = brown.words(categories='news')
hobbies_text = brown.words(categories='hobbies')
news_text_fdist = nltk.FreqDist(w.lower() for w in news_text)
hobbies_text_fdist = nltk.FreqDist(w.lower() for w in hobbies_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m, ":", news_text_fdist[m], "  |  ", hobbies_text_fdist[m])

event_words = ["who", "what", "when", "where", "why"]
for m in event_words:
    print(m, ":", news_text_fdist[m], "  |  ", hobbies_text_fdist[m])
Example #40
from nltk.corpus import gutenberg, abc, reuters, brown, movie_reviews
from topia.termextract import extract
extractor = extract.TermExtractor()

with open('./corpus/all3.txt', 'r') as f:
	with open('./data/terms.txt', 'w') as o:
		o.write("Term\tOccurences\tStrength\n")
		for term in extractor(f.read()+gutenberg.raw()+abc.raw()+reuters.raw()+brown.raw()+movie_reviews.raw()):
			o.write("\t".join(map(str, term)) + "\n")
import collections
import re

from nltk.corpus import inaugural, reuters, brown, gutenberg

from itertools import product as iter_product

def words(text):
    return re.findall('[a-z]+', text.lower())


def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

NWORDS = train(words(inaugural.raw() + reuters.raw() + brown.raw() + gutenberg.raw()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes    = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
    replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts    = [a + c + b     for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)


def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)