def get_char_counts(openfile):
    # Count every character in the open file, keeping only letters and whitespace.
    char_counts = sorted(
        collections.Counter(c for l in openfile for c in l).items())
    counts = dict()
    for char, count in char_counts:
        if char.isalpha() or char.isspace():
            counts[char] = float(count)
    counts[' '] = brown.raw().count(' ')
    corpus_size = float(len(brown.raw()))
    return corpus_size, counts
Example #2
def process_brown():    
    tokenizer = RegexpTokenizer(r'\w+')
    brown_toks = tokenizer.tokenize(brown.raw()[:50000])
    brown_toks = list(set(brown_toks))
    brown_toks = map(lambda x: x.lower(), brown_toks)
 
    return brown_toks
Example #3
def brownFreqListNoStop():
    # Obtain the list of words
    brown_words = brown.raw().split(' ')
    englishstop = stopwords.words('english')
    filtered_words = [w for w in brown_words if not w in englishstop]

    num_filtered_words = len(filtered_words)
    print "We have " + str(num_filtered_words) + " brown filtered words"
    counter = 0

    brown_frequ = defaultdict(int)
    sleep(2)
    for word in filtered_words:
        counter += 1
        brown_frequ[word] += 1
        if counter % 1000 == 0:
            print "Progress : " + str(
                (counter / float(num_filtered_words)) * 100) + " %"

    brown_frequ = sorted(brown_frequ.values(), reverse=True)
    brown_rank = np.array(xrange(1, len(brown_frequ) + 1))

    c, alpha = powerLaw(brown_frequ, brown_rank)
    print "According to Zipf's law %.2f should be close to 1." % alpha
    plotPowerLaws(
        brown_rank,
        brown_frequ, [c, c], [-1, -alpha],
        title=
        "Relation between word rank and frequency for brown, no stop words",
        xlabel="Word Rank",
        ylabel="Word Frequency")

    return 0
Example #4
def brownFreq():
    # Obtain the list of words
    brown_words = brown.raw().split(' ')

    num_brown_words = len(brown_words)
    print "We have " + str(num_brown_words) + " brown words"
    counter = 0

    brown_frequ = defaultdict(int)
    sleep(2)
    for word in brown_words:
        counter += 1
        brown_frequ[word] += 1
        if counter % 1000 == 0:
            print "Progress : " + str(
                (counter / float(num_brown_words)) * 100) + " %"

    brown_frequ = sorted(brown_frequ.values(), reverse=True)
    brown_rank = np.array(xrange(1, len(brown_frequ) + 1))

    c, alpha = powerLaw(brown_frequ, brown_rank)
    plotPowerLaws(brown_rank,
                  brown_frequ, [c, c], [-1, -alpha],
                  title="Relation between word rank and frequency for brown",
                  xlabel="Word Rank",
                  ylabel="Word Frequency")

    return 0
Example #5
def Main():
    db = Database()
    index = InvertedIndex(db)
    brown_list = brown.fileids()
    gutenberg_list = gutenberg.fileids()
    # document1 = {
    #     'id': '1',
    #     'text': 'The big sharks of Belgium drink beer.'
    # }
    # document2 = {
    #     'id': '2',
    #     'text': 'Belgium has great beer. They drink beer all the time.'
    # }
    i = 0
    for item in brown_list:
        documentTemp = {'id': str(i), 'text': brown.raw(item)}
        index.index_document(documentTemp)
        i += 1  # give each document a distinct id

    for item in gutenberg_list:
        documentTemp = {'id': str(i), 'text': gutenberg.raw(item)}
        index.index_document(documentTemp)
        i += 1

    while True:
        search_term = input("Enter term(s) to search: ")
        result = index.lookup_query(search_term.lower())
        for term in result.keys():
            for appearance in result[term]:
                # Belgium: { docId: 1, frequency: 1}
                document = db.get(appearance.docId)
                print(highlight_term(appearance.docId, term, document['text']))
            print("-----------------------------")
Example #6
def select_genres(n):
    '''
    Selects genres with more than n files. Returns raw data and the genre of each file
    in the selected genres as two 1d numpy arrays.
    
    Parameters
    ----------
    n: An integer.
    
    Returns
    -------
    A tuple of (raw, genres)
    raw: A 1d numpy array.
    genres: A 1d numpy array.
    '''
    genres = []
    raw = []
    #Creates arrays of the genres and raw data for genres with more than n files
    for file in brown.fileids():

        for k in brown.categories(file):

            if len(brown.fileids(k)) > n:
                genres.append(k)
                raw.append(brown.raw(file))

    return raw, genres
Example #7
File: xor.py  Project: amlweems/zngnfnab
def _init():
    global total
    global char_counts
    if total == 256:
        from nltk.corpus import brown
        for char in brown.raw():
            char_counts[ord(char)] += 1
        total = float(sum(char_counts))
Example #8
 def __init__(self):
     self.words = list(word.lower() for word in brown.words())
     #self.words= brown.words()
     #self.text=nltk.Text(word.lower() for word in nltk.corpus.brown.words())
     self.tagged_words = brown.tagged_words()
     self.tagfreq = {}
     self.raw = brown.raw()
     """
Example #9
File: 3-29.py  Project: jbbe/lang
def ari(cat):
    """Accept text as list of words"""
    num_chars = len(brown.raw(categories=cat))
    num_words = len(brown.words(categories=cat))
    num_sents = len(brown.sents(categories=cat))

    avg_word_len = num_chars / num_words
    avg_sent_len = num_words / num_sents

    return avg_word_len * 4.71 + avg_sent_len * 0.5 - 21.43
Example #10
 def learn(self, listofsentences=[], n=126733):
     self.bf = BloomFilter(1090177, 4)
     i = 0
     for sent in brown.sents():  # brown.raw() would iterate characters, not sentences
         if i >= n:
             break
         for word in sent:
             self.bf.Insert(word.lower())
             i += 1
             
     self.bf.PrintStats()
Example #11
def ch03_29_reading_difficulty():
  sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
  from nltk.corpus import brown
  for category in brown.categories():
    raw = brown.raw(categories=category)
    words = len(brown.words(categories=category))
    sentences = len(sent_tokenizer.tokenize(raw))
    letters_per_word = (len(raw) - words) / float(words)  # raw length minus one space per word
    words_per_sentence = words / float(sentences)
    reading_level = (4.71 * letters_per_word) + (0.5 * words_per_sentence) - 21.43
    print category, reading_level
Example #12
def readBrownDataset():
    nltk.download("brown")
    documents = brown.fileids()
    docs = []
    for doc in documents:
        if len(brown.categories(doc)) == 1:
            d = brown.raw(doc).replace("\n", " ")
            d = re.sub(
                r"/[A-Za-z0-9_-]+ ", " ", d
            )  #The/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn") #.replace("/at","").replace("/nn-tl","").replace("/nn-hp","").replace("/np-hl","").replace("/nn","").replace("/vbd","").replace("/in","").replace("/jj","").replace("/hvz","").replace("/cs","").replace("/nps","").replace("/nr","").replace("/np-tl","").replace("/md","").replace("/np","").replace("/cd-hl","").replace("/vbn","").replace("/np-tl","").replace("/dti","").replace("--/--","")
            docs.append(d)
    return docs
Example #13
def readability(input):
	letters = brown.raw(categories=input)
	words = brown.words(categories=input)
	sentences = brown.sents(categories=input)
	
	letters_per_word = len(letters) / len(words)
	words_per_sentence = len(words) / len(sentences)
	
	ari_score = (4.71 * float(letters_per_word)) + (0.5 * float(words_per_sentence)) - 21.43
	print("Letters per word: %s" % letters_per_word)
	print("Words per sentence: %s" % words_per_sentence)
	print("ARI Score: %s" % ari_score)
Example #14
def ch03_29_reading_difficulty():
    sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
    from nltk.corpus import brown
    for category in brown.categories():
        raw = brown.raw(categories=category)
        words = len(brown.words(categories=category))
        sentences = len(sent_tokenizer.tokenize(raw))
        letters_per_word = (len(raw) - words) / float(words)  # raw length minus one space per word
        words_per_sentence = words / float(sentences)
        reading_level = (4.71 * letters_per_word) + (0.5 * words_per_sentence) - 21.43
        print category, reading_level
Example #15
def demo(text=None):
    from nltk.corpus import brown
    from matplotlib import pylab
    tt = TextTilingTokenizer(demo_mode=True)
    if text is None: text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
def get_brown_data(useN=100):
    try:
        fileids = brown.fileids()
    except LookupError:
        import nltk
        nltk.download('brown')
        fileids = brown.fileids()
        
    fileids = fileids[:useN]
    texts = [brown.raw(fid) for fid in fileids]
    
    fileids = [os.path.splitext(fid)[0] for fid in fileids]
    
    return texts, fileids
def Automated_Readability_Index29(section):
    char_count = 0
    sent = len(brown.sents(categories=section))
    words = len(brown.words(categories=section))
    raw_text = brown.raw(categories=section)

    for ch in raw_text:
        if ch.isalpha():
            char_count = char_count + 1

    uw = char_count / float(words)
    us = words / float(sent)
    ARI = (4.71 * uw) + (0.5 * us) - 21.43
    return ARI
Example #18
def demo(text=None):
    from nltk.corpus import brown
    import pylab
    tt=TextTilingTokenizer(demo_mode=True)
    if text is None: text=brown.raw()[:10000]
    s,ss,d,b=tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)),b)
    pylab.legend()
    pylab.show()
def exercise29():
	'''
	Readability measures are used to score the reading difficulty of a text, for the purposes of selecting texts of appropriate difficulty for language learners.
	Let us define avgW to be the average number of letters per word, and avgSen to be the average number of words per sentence, in a given text.
	The Automated Readability Index (ARI) of the text is defined to be: 4.71 avgWord + 0.5 avgSen - 21.43.
	Compute the ARI score for various sections of the Brown Corpus, including section f (popular lore) and j (learned).
	Make use of the fact that nltk.corpus.brown.words() produces a sequence of words, while nltk.corpus.brown.sents() produces a sequence of sentences
	'''

	for category in brown.categories():
		chars = brown.raw(categories=category)
		words = brown.words(categories=category)
		sentences = brown.sents(categories=category)
		
		avgW = len(chars)/len(words) #average number of letters per word
		avgS = len(words)/len(sentences) #average number of words per sentence
		
		print category, "Avg Words", avgW, "Avg Sentences", avgS
		print("ARI", (4.71 * avgW ) + ( 0.5 * avgS ) - 21.43)
Example #20
def word_frequencies(contents):
    toktok = ToktokTokenizer()
    string_corpus = brown.raw()

    # Frequencies for each file
    list = []
    for file in contents.keys():
        print("Tokenising", file)
        tokenised = [
            toktok.tokenize(sent) for sent in sent_tokenize(string_corpus)
        ]
        fdist = Counter(chain(*tokenised))
        list.append(fdist)

    # Combine keys into one set, eliminating duplicates
    print("Making frequency distribution of all words that we care about.")
    keys = []
    for sublist in list:
        keys += sublist
    keys = set(keys)

    # Build combined frequency dict
    # Tuple of identifiers for connectives and other common words
    unwanted = ('at', 'to', 'in', 'ma', 'bez', 'ppss', 'pp$', 'dt', 'bedz',
                'hv', 'cc', 'cs', 'hvd', 'wdt', '*', 'bed', 'ber', 'be', 'np$',
                'ppo', 'pps', 'abn', 'cd', 'md', 'ben', 'ben', 'wps', 'vbd',
                'jj', 'rb', 'do', 'ql', 'dts', 'rp', 'in-tl', 'ex', 'i', 'dti',
                'dod', 'wrb', 'hvz', 'nn$')
    # This is far from the best way to do this, but I couldn't find the documentation for these identifiers (they are Brown POS tags; see the tagged-words sketch after this function)
    frequencies = {}
    for key in keys:
        total = 0
        if (key[0] not in string.punctuation) and (
                key.split('/')[-1]
                not in unwanted):  # Gets rid of unwanted tokens
            for sublist in list:
                if key in sublist.keys():
                    total += sublist[key]
            frequencies[key.split('/')[0].lower()] = total
    print("Total words (that we care about): " + str(len(frequencies.keys())))

    return frequencies
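# The hand-written 'unwanted' tuple above filters Brown part-of-speech tags by splitting each
# token on '/'. A possible alternative (an illustrative sketch, not the original author's code)
# is to use brown.tagged_words(), where NLTK already separates each word from its tag, so
# function words can be dropped by tag prefix instead of string surgery. The prefix list below
# is an assumption; adjust it to taste.
import string
from collections import Counter

from nltk.corpus import brown

# Tag prefixes for determiners, pronouns, conjunctions, prepositions, modals and
# forms of "be"/"have"/"do" (assumed list).
SKIP_TAG_PREFIXES = ('AT', 'TO', 'IN', 'PP', 'DT', 'BE', 'HV', 'CC', 'CS',
                     'WDT', 'WPS', 'WRB', 'MD', 'CD', 'ABN', 'QL', 'RP', 'EX', 'DO')

def tagged_word_frequencies():
    counts = Counter()
    for word, tag in brown.tagged_words():
        if word[0] in string.punctuation:
            continue
        if tag.upper().startswith(SKIP_TAG_PREFIXES):
            continue
        counts[word.lower()] += 1
    return counts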
def get_features(liste):
    features_file = {}
    for fileid in liste:
        features_file[fileid] = {}  # initialize this file's feature dict
        # Use our my_tools library to add word-length statistics
        words = brown.words(fileid)
        stats_mots = mt.get_stats_longueur(words)
        for feature, valeur in stats_mots.items():
            features_file[fileid][feature] = valeur
        # Then sentence statistics
        stats_phrases = mt.get_types_phrases(brown.raw(fileid))
        for feature, valeur in stats_phrases.items():
            features_file[fileid][feature] = valeur
        adverbes = mt.get_effectif_adverbes(words)
        for feature, valeur in adverbes.items():
            features_file[fileid][feature] = valeur
        # ... the input varies but the output is a dict {"feature_name": value, ...}
    print("-> Extracted features:",
          list(features_file[fileid].keys())[:20], "...")
    return features_file
def demo(text=None):
    '''
    use the boundary together with the pseudo sentences to evaluate the quality of segmentation.
    :param text:
    :return:
    '''
    from nltk.corpus import brown
    from matplotlib import pylab
    tt = TextTilingTokenizer(w=40, k=20, demo_mode=True)
    with open('flypaper_short.txt', 'r') as file:
        text = file.read()
    if text is None: text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    print(b)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
Example #23
def demo(text=None):
    from nltk.corpus import brown
    import pylab
    tt = TextTilingTokenizer(demo_mode=True)
    if text is None: text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
    """s = tt.tokenize(text)
    FILE = open("tiled","w")
    FILE.writelines(s)
    FILE.close()"""


# if __name__ == '__main__':
#     content = open('toTile', 'r').read()
#     demo(content)
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import gutenberg, webtext, nps_chat, brown, reuters, inaugural
import pandas as pd

# Pick out the first of these texts — Emma by Jane Austen — and give it a short name, gutenberg_raw
gutenberg_raw = gutenberg.raw("austen-emma.txt")

# Pick out the words from webtext corpus and give it a short name, webtext_words
webtext_words = webtext.words()
print(webtext_words)

# Pick out the text from nps_chat corpus and name it as nps_chat_raw
nps_chat_raw = nps_chat.raw()

# Pick out the text from brown corpus and name it as brown_raw
brown_raw = brown.raw()
print(brown_raw)

# Pick out the text from reuters corpus and name it as reuters_words
reuters_words = reuters.words()
print(reuters_words)

# Pick out the words from the inaugural corpus and name it as inaugural_words
inaugural_words = inaugural.words()
print(inaugural_words)

# Creating a variable for tokenizing words
tokenizer = RegexpTokenizer(r'\w+')

# Tokenizing the words in gutenberg corpus and assigning it to a variable named tokens
tokens = tokenizer.tokenize(gutenberg_raw)
Example #25
File: 03.py  Project: kouheiszk/nltk
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import brown
from nltk.corpus import webtext

brown.raw(fileids=["cm02"])
webtext.raw("firefox.txt")
Example #26
def main(argv):

  try:
    opts, args = getopt.getopt(argv,"o:v:c:",["ifile=","ofile="])
  except getopt.GetoptError:
    print ' [-o <datafile>] [-v <vocabfile>]'
    sys.exit(2)

  outputfile = "trainging.dat"
  vocabfile = "vocab.txt"
  corpus = "20newsgroups"

  for opt, arg in opts:
    if opt == '-o':
      outputfile = arg
    elif opt == '-v':
      vocabfile = arg
    elif opt == '-c':
      corpus = arg    

  tokenizer = RegexpTokenizer(r'[a-z]+')
  id_dict = {}
  nterms = 0;
  wordlist = []
  data = []

  if corpus == "20newsgroups":
     #categories = []
     #categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
     categories = ['talk.politics.guns','soc.religion.christian','sci.electronics','rec.sport.baseball','comp.graphics']

     if len(categories) == 0:   data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42).data
     else: data = fetch_20newsgroups(subset='train',categories=categories, shuffle=True, random_state=42).data
  elif corpus == "brown":
    for fileid in brown.fileids():
      data.append(brown.raw(fileid))
  elif corpus == "reuters":
    for fileid in reuters.fileids():
      data.append(reuters.raw(fileid))
  else:
    for dirname, dirnames, filenames in os.walk(corpus):
      for filename in filenames:
        inpfile = os.path.join(dirname,filename)
        with io.open(inpfile, "r", errors='ignore') as fp:
          data.append(fp.read())
          fp.close()

  stemmer = SnowballStemmer("english")

  with io.open(outputfile, "wb") as output:
    for i in range(len(data)):
        lines = data[i].split('\n')

        fd = {}
        termsdoc=0
        for line in lines:
          if line.startswith("From:") or line.startswith("Subject:") or line.startswith("Reply-To:") or line.startswith("Organization:") or line.startswith("Lines:") or line.lower().startswith("Nntp-Posting-Host:") or line.startswith("X-Newsreader:") or line.startswith("Distribution:") or line.startswith("Keywords:") or line.startswith("Article-I.D.:") or line.startswith("Supersedes:") or line.startswith("Expires:") or line.startswith("NNTP-Posting-Host:") or line.startswith("Summary:") or line.startswith("Originator:") : continue;
          line = line.lower()
          splits = tokenizer.tokenize(line)
          filtered_words = [word for word in splits if word not in stopwords.words('english')]
          filtered_words = [word for word in filtered_words if len(word) > 2]
          filtered_words = [word for word in filtered_words if word not in ["edu","com","subject","writes","mil", "subject"]]

          for word in filtered_words:

            try:
              id = id_dict[word]
            except KeyError:
              id_dict[word] = nterms
              id = nterms
              nterms = nterms+1
              wordlist.append(word)

            try:
              fd[id] = fd[id]+1
            except KeyError:
              fd[id] = 1
              termsdoc = termsdoc+1


        outline = str(termsdoc)         
        for idterm in fd:
          outline = outline+" "+str(idterm)+":"+str(fd[idterm])

        output.write(outline+"\n")
  output.close()
 
  output = open(vocabfile,"w")
  for val in wordlist:
    output.write(str(val)+"\n")
  output.close()
Example #27
    "Word rank inversely proportional to word frequency (Gutenberg)",
    "Word rank", "Word frequency")

for w in gutenWordsFiltered:
    gutenFilteredFreq[w] += 1
gutenFilteredFreq = sorted(gutenFilteredFreq.values(), reverse=True)
#   Filtered word ranks
gutenFilteredRank = numpy.array(xrange(1, len(gutenFilteredFreq) + 1))
c, a = powerLaw(gutenFilteredFreq, gutenFilteredRank)
plotPowerLaws(
    gutenFilteredRank, gutenFilteredFreq, [c, c], [-1, -a],
    "Word rank inversely proportional to word frequency (Gutenberg without stopwords)",
    "Word rank", "Word frequency")

#   Brown corpus
brownWords = brown.raw().split(" ")
#   Without stopwords
brownWordsFiltered = [w for w in brownWords if not w in stopWords]
#   Frequencies
brownFreq = defaultdict(int)
brownFilteredFreq = defaultdict(int)
for w in brownWords:
    brownFreq[w] += 1
brownFreq = sorted(brownFreq.values(), reverse=True)
#   Word ranks
brownRank = numpy.array(xrange(1, len(brownFreq) + 1))
c, a = powerLaw(brownFreq, brownRank)
plotPowerLaws(brownRank, brownFreq, [c, c], [-1, -a],
              "Word rank inversely proportional to word frequency (Brown)",
              "Word rank", "Word frequency")
"""

#! python
import nltk

from nltk.corpus import treebank
from nltk.corpus import brown
from nltk.corpus import nps_chat
from nltk.corpus import conll2000

import string
from sklearn.feature_extraction.text import TfidfVectorizer


#corpora 
brown = brown.raw()
nps_chat = nps_chat.raw()
conll2000 = conll2000.raw()
treebank = treebank.raw()

default = treebank
operational = brown

stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)


def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

Example #29
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import brown
import jieba

with open('flypaper_short.txt', 'r') as file:
    comments = file.read()

#segment the text with jieba
seg_list = jieba.cut(comments)
tokenized_comments = (" ".join(seg_list))
print(tokenized_comments)

#ttt = nltk.tokenize.TextTilingTokenizer(demo_mode=True)
ttt = nltk.tokenize.TextTilingTokenizer()
text = brown.raw()[:10000]
#print(text)
#gap_scores, smooth_scores, depth_scores, segment_boundaries = ttt.tokenize(tokenized_comments)
result = ttt.tokenize(tokenized_comments)
print(result)
'''
print(gap_scores)
print(smooth_scores)
print(depth_scores)
print(segment_boundaries)
'''
Example #30
import re
import string

from nltk.corpus import brown
from nltk.util import ngrams

from lib import constants
from lib import huffman_tree


# HuffmanTree object.
ht = huffman_tree.HuffmanTree()

# A dict with a bag of tags for each word.
words = {}
words_huffman_encoded = {}  # A dict with a bag of tags for each huffman encoded word.

# All raw text of brown corpus.
txt = brown.raw()

# Raw sentences from brown corpus.
tagged_sentences = txt.split('./.')

# Get all tags for all words
for tagged_sentence in tagged_sentences:
    tagged_sentence = tagged_sentence.strip()
    if re.match(r'[0-9 ]+$', tagged_sentence):
        continue
    for word in tagged_sentence.split():
        word = word.strip()
        word = word.split('/')
        if len(word) < 2:
            continue
        word[0] = str(word[0]).translate(string.maketrans("",""),
Example #31
# Module 3: Corpus
# Corpus structure challenge

from nltk.corpus import brown

# print(brown.fileids())

fileid = 'cl08'

# text = brown.words(fileid)
# print(text)

print(" Num of chars :", len(brown.raw(fileid)))
print(" Num of words :", len(brown.words(fileid)))
print(" Num of sentences :", len(brown.sents(fileid)))

print(" Categories:", brown.categories(fileid))
Example #32
# Jonathan Monreal

import re, nltk
from nltk.corpus import brown

raw = brown.raw(categories = 'humor')
tokens = re.findall(r'\s(wh[\w]+)', raw)

for word in tokens:
    print word
Example #33
len(brown.fileids()) # 500 sources, each file is a source.


# In[ ]:


print(brown.fileids()[:100]) # First 100 sources.


# You can access the raw files with:

# In[ ]:


print(brown.raw('cb01').strip()[:1000]) # First 1000 characters.


# <br>
# You will see that **each word comes with a slash and a tag**, and unlike normal text, **punctuation is separated from the word that comes before it**, e.g.
# 
# > The/at General/jj-tl Assembly/nn-tl ,/, which/wdt adjourns/vbz today/nr ,/, has/hvz performed/vbn in/in an/at atmosphere/nn of/in crisis/nn and/cc struggle/nn from/in the/at day/nn it/pps convened/vbd ./.
# 
# <br>
# And we also see that **each sentence is separated by a newline**:
# 
# > There/ex followed/vbd the/at historic/jj appropriations/nns and/cc budget/nn fight/nn ,/, in/in which/wdt the/at General/jj-tl Assembly/nn-tl decided/vbd to/to tackle/vb executive/nn powers/nns ./.
# > 
# > The/at final/jj decision/nn went/vbd to/in the/at executive/nn but/cc a/at way/nn has/hvz been/ben opened/vbn for/in strengthening/vbg budgeting/vbg procedures/nns and/cc to/to provide/vb legislators/nns information/nn they/ppss need/vb ./.
# 
# <br>
Example #34
gutenberg.raw(fileid)
#Words : 
gutenberg.words(fileid)
#Sentence : 
gutenberg.sents(fileid)
from nltk.tokenize import sent_tokenize
tok = sent_tokenize(text)

for x in range(5):
    print(tok[x])
    
from nltk.corpus import brown
brown.categories()
text = brown.raw(categories='news')


import nltk
nltk.download('reuters')
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
fileid = 'test/16399'
text = reuters.raw(fileid)
text1=reuters.raw(categories='zinc')
reuters.categories(fileid)


import nltk
nltk.download('movie_reviews')
  #  ngrams_stats_bi_rev = pickle.load('ngrams_stats_bi_rev.pkl')
else: #initialise
    ngrams_stats_tri={}
    ngrams_stats_bi={}
    ngrams_stats_bi_rev={}
    ngrams_stats_tri_rev={}
'''
#class

ngrams_stats_tri = {}
ngrams_stats_bi = {}
ngrams_stats_bi_rev = {}
ngrams_stats_tri_rev = {}
vocab = Counter()
#choose sample
sample1 = brown.raw()
sample2 = gutenberg.raw()
sample3 = inaugural.raw()
sample5 = nltk.corpus.state_union.raw()
sample4 = genesis.raw('english-web.txt')
sample = sample1 + sample2 + sample3 + sample4 + sample5
vocab, ngrams_stats_tri, ngrams_stats_bi, ngrams_stats_tri_rev, ngrams_stats_bi_rev = mainTrain(
    vocab, sample, ngrams_stats_tri, ngrams_stats_bi, ngrams_stats_tri_rev,
    ngrams_stats_bi_rev)
'''
with open('ngrams_stats_tri.pkl', 'w') as hfile:
    pickle.dump(ngrams_stats_tri, hfile)
with open('ngrams_stats_bi.pkl', 'w') as hfile:
    pickle.dump(ngrams_stats_bi, hfile)
with open('ngrams_stats_tri_rev.pkl', 'w') as hfile:
    pickle.dump(ngrams_stats_tri_rev, hfile)
Example #36
#%%
from nltk.corpus import reuters

# %%
articles = [" ".join(reuters.words(f)) for f in reuters.fileids()]
with open('reuters.txt', 'w') as f:
    for article in articles:
        f.write(article)
        f.write('\n\n')

# %%
fileids = reuters.fileids()
with open('reuters.txt', 'w') as f:
    for file_id in reuters.fileids():
        f.write(reuters.raw(file_id))
        f.write('\n\n')

# %%
from nltk.corpus import brown
print(brown.raw(categories='learned'))
# %%
Example #37
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg
from nltk.corpus import brown

text = gutenberg.raw()
text_tokens = nltk.word_tokenize(text)
frecList_gutenberg = FreqDist(text_tokens)
text_brown = brown.raw()
text_brown_tokens = nltk.word_tokenize(text_brown)
frecList_brown = FreqDist(text_brown_tokens)
Example #38
import nltk

from nltk import *
with open("dracula.txt") as f:
	tokens = nltk.word_tokenize(f.read())

text = nltk.Text(tokens)

alpha_text = [word for word in text if word.isalpha() and len(word) > 5 and 
word[0].isupper() and word[1:].islower()]
print(FreqDist(alpha_text).most_common(5))

from nltk.corpus import reuters, brown
print(brown.categories())
fileid = brown.fileids(brown.categories()[-1])
raw = brown.raw(fileid)
print(raw[:50])

cfd = nltk.ConditionalFreqDist(
	(genre, word)
	for genre in brown.categories()
	for word in brown.words(categories=genre))


Example #39
File: ch2.py  Project: haidang92/csc577
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    avg_word_len = round(num_chars / num_words)
    avg_sent_len = round(num_words / num_sents)
    lexical_diversity = round(num_words / num_vocab)
    print(fileid, "  |  ", num_chars, "  |  ", num_words, "  |  ", num_sents,
          "  |  ", num_vocab, "  |  ", avg_word_len, "  |  ", avg_sent_len,
          "  |  ", lexical_diversity)

for fileid in webtext.fileids():
    print(fileid)

brown.categories()
brown.raw("cr09")

#stylistics - systematic differences between genres
# by use of modal verbs - [can could may might must will]
news_text = brown.words(categories='news')
hobbies_text = brown.words(categories='hobbies')
news_text_fdist = nltk.FreqDist(w.lower() for w in news_text)
hobbies_text_fdist = nltk.FreqDist(w.lower() for w in hobbies_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m, ":", news_text_fdist[m], "  |  ", hobbies_text_fdist[m])

event_words = ["who", "what", "when", "where", "why"]
for m in event_words:
    print(m, ":", news_text_fdist[m], "  |  ", hobbies_text_fdist[m])
Example #40
from nltk.corpus import gutenberg, abc, reuters, brown, movie_reviews
from topia.termextract import extract
extractor = extract.TermExtractor()

with open('./corpus/all3.txt', 'r') as f:
	with open('./data/terms.txt', 'w') as o:
		o.write("Term\tOccurences\tStrength\n")
		for term in extractor(f.read()+gutenberg.raw()+abc.raw()+reuters.raw()+brown.raw()+movie_reviews.raw()):
			o.write("\t".join(map(str, term)) + "\n")
import collections
import re

from nltk.corpus import inaugural, reuters, brown, gutenberg

from itertools import product as iter_product

def words(text):
    return re.findall('[a-z]+', text.lower())


def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

NWORDS = train(words(inaugural.raw() + reuters.raw() + brown.raw() + gutenberg.raw()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes    = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
    replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts    = [a + c + b     for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)


def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)