Example #1
def exercise_gutenberg():
    # Print the list of Project Gutenberg file IDs
    print gutenberg.fileids()

    # Pick one text: Jane Austen's Emma
    emma = gutenberg.words("austen-emma.txt")

    # Check the length of the book
    print len(emma)

    # Load the text into an nltk.Text object
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)

        # 统计文件的总字符数
        num_chars = len(chars_list)
        # 统计文件的总单词数
        num_words = len(words_list)
        # 统计文件的总句子数
        num_sents = len(sents_list)
        # 统计文件的非重复单词数
        num_vocab = len(set([w.lower() for w in words_list]))
        # Print the average word length, average sentence length, average occurrences per word, and the file name
        print num_chars / num_words, num_words / num_sents, num_words / num_vocab, file_id
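
The snippet above is Python 2 (print statements, integer division). A minimal Python 3 sketch of the same per-file statistics, assuming the Gutenberg corpus data has already been downloaded:

from nltk.corpus import gutenberg

for file_id in gutenberg.fileids():
    num_chars = len(gutenberg.raw(file_id))
    num_words = len(gutenberg.words(file_id))
    num_sents = len(gutenberg.sents(file_id))
    num_vocab = len(set(w.lower() for w in gutenberg.words(file_id)))
    # average word length, average sentence length, lexical diversity, file name
    print(num_chars // num_words, num_words // num_sents, num_words // num_vocab, file_id)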
Example #2
def get_book_sents(word_list):
    """Searches Jane Austen's 'Emma' for the words in the word list.
    The sentences are modified to highlight the found words by changing them to uppercase.
    Then the sentence number (in order from the book) is appended to the front
    of the sentence string.
    Returns a list of strings (sentence # + \s + sentence string).
    """
    book = 'austen-emma.txt'
    book_sents = gutenberg.sents(book)
    sent_nums = set()
    sents_to_return = []
    s_count = 0
    for s in book_sents:
        s_count += 1
        s_str = " ".join(s)
        for w in word_list:
            if ' '+w+' ' in s_str.lower():
                if s_count not in sent_nums:
                    sent_nums.add(s_count)
                    s_str = s_str.replace(' '+w+' ', ' '+w.upper()+' ')
                    s_str = s_str.replace(' '+w.title()+' ', ' '+w.upper()+' ')
                    sents_to_return.append(str(s_count)+' '+s_str)
                else:
                    s_str = s_str.replace(' '+w+' ', ' '+w.upper()+' ')
                    s_str = s_str.replace(' '+w.title()+' ', ' '+w.upper()+' ')
                    sents_to_return[-1] = str(s_count)+' '+s_str
    return sents_to_return
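
A possible usage sketch (the word choices are illustrative, not from the original project; the function only needs gutenberg from nltk.corpus):

hits = get_book_sents(['surprize', 'dread'])
for line in hits[:3]:
    print(line)  # "<sentence number> <sentence with matched words in uppercase>"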
Example #3
def main(num_couplets, num_syllables, rhyme_depth):
  for text in TEXTS:
    for sentence in gutenberg.sents(text):
      addSentence(sentence, rhyme_depth)

  for couplet_number in range(0, num_couplets):
    # Get a randomly selected couplet
    attempts = 0
    while True:
      couplet = getCouplet(num_syllables)
      if couplet is not None: break
      # Prevent an infinite loop if parameters are off
      attempts += 1
      if attempts == 1000: return
    couplet = [ pretty(line) for line in couplet ]

    # A little hack for adjusting punctuation and capitalization
    couplet[0] = couplet[0][0].upper() + couplet[0][1:]
    if couplet[0][-1] == '.' or couplet[0][-1] == ',':
      couplet[0] = couplet[0][:-1] + ','
      char = couplet[1][0].lower() if couplet[1][:2] != 'I ' else 'I'
      couplet[1] = char + couplet[1][1:]
    else:
      couplet[1] = couplet[1][0].upper() + couplet[1][1:]

    # Dump to stdout
    print couplet[0]
    print couplet[1]
Example #4
File: main.py  Project: kwdhd/nlp
def gutenberg():
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        num_words = len(gutenberg.words(t))
        num_sents = len(gutenberg.sents(t))
        num_vocab = len(set([w.lower() for w in gutenberg.words(t)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), t
Example #5
def tagged_sentences( book ):
	'''
	Generator yielding one POS-tagged sentence at a time, filtering out tokens
	tagged ':' or '-NONE-', which are anomalies in the word stream.
	'''

	for sentence in gutenberg.sents( book ):
		yield filter( lambda x: x[1] not in [':', '-NONE-', ], nltk.pos_tag( sentence ) )
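
A usage sketch for the generator above (the POS tagger model must be available, e.g. via nltk.download('averaged_perceptron_tagger'); list() is only there so the filtered result also displays under Python 3):

for i, tagged in enumerate(tagged_sentences('austen-emma.txt')):
    print(list(tagged))
    if i == 2:
        break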
def plot_sentiment_flow(title):
    sents = gutenberg.sents(title)
    positive_flow = [partial_sentiment(x) for x in sents]
    negative_flow = [partial_sentiment(x, positive = False) for x in sents]
    plt.plot(range(len(sents)), positive_flow, label = 'Positive')
    plt.plot(range(len(sents)), negative_flow, label = 'Negative')
    plt.ylabel('Sentiment Score')
    plt.xlabel(title)
    plt.show()
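
plot_sentiment_flow assumes matplotlib.pyplot as plt and a partial_sentiment helper that is not shown here. A purely illustrative stand-in (the word sets below are invented, not taken from the original project) might look like:

POSITIVE_WORDS = {'good', 'happy', 'joy', 'love', 'pleasant'}
NEGATIVE_WORDS = {'bad', 'sad', 'fear', 'hate', 'terrible'}

def partial_sentiment(sentence, positive=True):
    # crude score: number of distinct sentiment words in the tokenized sentence
    lexicon = POSITIVE_WORDS if positive else NEGATIVE_WORDS
    return len({w.lower() for w in sentence} & lexicon)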
Example #7
def gutenberg():

    emma = nltk.corpus.gutenberg.words('austen-emma.txt')
    print len(emma)

    print gutenberg.fileids()
    emma = gutenberg.words('austen-emma.txt')

    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_sentences[1037]
    longest_len = max([len(s) for s in macbeth_sentences])
    [s for s in macbeth_sentences if len(s) == longest_len]

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
Example #8
def structure():

    raw = gutenberg.raw("burgess-busterbrown.txt")
    raw[1:20]

    words = gutenberg.words("burgess-busterbrown.txt")
    words[1:20]

    sents = gutenberg.sents("burgess-busterbrown.txt")
    sents[1:20]
Example #9
def page59():
    """Prints the longest sentence from Macbeth"""
    from nltk.corpus import gutenberg

    macbeth_sentences = gutenberg.sents("shakespeare-macbeth.txt")
    print "macbeth_sentences=", macbeth_sentences
    print "macbeth_sentences[1037]=", macbeth_sentences[1037]
    longest_len = max([len(s) for s in macbeth_sentences])
    print "longest sentence=",
    print [s for s in macbeth_sentences if len(s) == longest_len]
Example #10
def create_model_from_NLTK():
    filepath = "nltkcorpus.txt"
    if isfile(filepath):
        return create_model(filepath= filepath, save=False)
    else:
        from nltk.corpus import reuters, brown, gutenberg
        sents = reuters.sents() + brown.sents()
        for gsents in [gutenberg.sents(fid) for fid in gutenberg.fileids()]:
            sents += gsents

        return create_model(sentences=sents, savename=filepath)
Example #11
def page57():
    """Statistics from the Gutenberg corpora"""
    from nltk.corpus import gutenberg

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars / num_words), int(num_words / num_sents),
        print int(num_words / num_vocab), fileid
Example #12
File: toturial.py  Project: Paul-Lin/misc
def for_print():
    '''
    Display three statistics for each text.
    :return:
    '''
    for fileid in gutenberg.fileids():
        num_chars=len(gutenberg.raw(fileid))
        num_words=len(gutenberg.words(fileid))
        num_sents=len(gutenberg.sents(fileid))
        num_vocab=len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words),int(num_words/num_sents),int(num_words/num_vocab),fileid
Example #13
File: ch02.py  Project: gree2/hobby
def fun02():
    """fun02"""
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        # average word length, average sentence length
        print int(num_chars/num_words), int(num_words/num_sents),
        # number of times each vocabulary item appears in the text
        print int(num_words/num_vocab), fileid
Example #14
File: LanguageModel.py  Project: slee17/NLP
 def train(self):
     self.vocabulary=set()
     
     this_bigrams=[]
     self.unigrams = FreqDist([])
     
     for fileid in gutenberg.fileids():
         for sentence in gutenberg.sents(fileid):
             words=["<s>",] + [x.lower() for x in sentence if wordRE.search(x)] + ["</s>",]
             this_bigrams += bigrams(words)
             self.vocabulary.update(words)
             self.unigrams.update(words)
     self.bigrams=ConditionalFreqDist(this_bigrams)
     self.V = len(self.vocabulary)
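
A small sketch of how the counts built in train() might be queried afterwards; this helper is not part of the original class, it just shows an add-one smoothed bigram probability over model.bigrams and model.V:

def add_one_bigram_prob(model, w1, w2):
    # model is a trained instance; bigrams is a ConditionalFreqDist, V the vocabulary size
    return float(model.bigrams[w1][w2] + 1) / (model.bigrams[w1].N() + model.V)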
Example #15
    def benchmark_sbd():
        ps = []
        rs = []
        f1s = []
        c = 0
        for fileid in gutenberg.fileids():
            c += 1
            copy_sents_gold = gutenberg.sents(fileid)
            sents_gold = [s for s in copy_sents_gold]
            for sent_i in range(len(sents_gold)):
                new_sent = [w for w in sents_gold[sent_i] if w.isalpha()]
                sents_gold[sent_i] = new_sent
            text = gutenberg.raw(fileid)
            sents_obtained = split_text(text)
            copy_sents_obtained = sents_obtained.copy()
            for sent_i in range(len(sents_obtained)):
                new_sent = [w.group()
                            for w in re.finditer(r'\w+', sents_obtained[sent_i])
                            if w.group().isalpha()]
                sents_obtained[sent_i] = new_sent
            c_common = 0
            for sent in sents_obtained:
                if sent in  sents_gold:
                    c_common += 1
            p, r, f1 = get_prf(c_common, len(sents_obtained), len(sents_gold))
            print('\n\n', fileid)
            print('Precision: {:0.2f}, Recall: {:0.2f}, F1: {:0.2f}'.format(p, r, f1))
            ps.append(p)
            rs.append(r)
            f1s.append(f1)

        print('\n\nPrecision stats: {:0.3f} +- {:0.4f}'.format(np.mean(ps),
                                                           np.std(ps)))
        print('Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(rs),
                                                        np.std(rs)))
        print('F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(f1s),
                                                    np.std(f1s)))
        print(len(f1s))

        good_ps = [p for p in ps if p >= 0.8]
        good_rs = [r for r in rs if r >= 0.8]
        good_f1s = [f1 for f1 in f1s if f1 >= 0.8]
        print('\n Good precision stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_ps),
                                                           np.std(good_ps)))
        print('Good Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_rs),
                                                        np.std(good_rs)))
        print('Good F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_f1s),
                                                    np.std(good_f1s)))
        print(len(good_f1s))
Example #16
File: rnn_lm.py  Project: yelu/blog
    def tokenize_data(self, n = -1):
        # download dependent nltk resources if you haven't.
        # nltk.download('punkt')

        # Read the data and append SENTENCE_START and SENTENCE_END tokens
        print "Reading sentences from gutenberg corpus ..."
        from nltk.corpus import gutenberg
        tokenized_sentences = []
        for s in gutenberg.sents('austen-emma.txt'):
            tokenized_sentences.append([self.sentence_start_token] + s[1:-1] + [self.sentence_end_token])
        print "Parsed %d sentences." % (len(tokenized_sentences))

        if n > 0:
            tokenized_sentences = tokenized_sentences[:n]

        # count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        print "Found %d unique words tokens." % len(word_freq.items())

        self.vocabulary_size = int(len(word_freq.items()) * 0.95)

        # get the most common words, treat others words as unknown.
        vocab = word_freq.most_common(self.vocabulary_size - 1)
        print "Using vocabulary size %d." % self.vocabulary_size
        print "The least frequent word is '%s' and appeared %d times." % \
              (vocab[-1][0], vocab[-1][1])
        self.index_to_word = [x[0] for x in vocab]
        self.index_to_word.append(self.unknown_token)
        self.word_to_index = dict([(w,i) for i,w in enumerate(self.index_to_word)])

        # replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in self.word_to_index
                                      else self.unknown_token for w in sent]

        # create training data
        x_train = np.asarray([[self.word_to_index[w] for w in sent[:-1]]
                             for sent in tokenized_sentences])
        y_train = np.asarray([[self.word_to_index[w] for w in sent[1:]]
                             for sent in tokenized_sentences])

        print ""
        print "Example sentence: '%s'" % tokenized_sentences[0]
        print "By word indexes: '%s'" % \
              [self.word_to_index[w] for w in tokenized_sentences[0]]

        return (x_train, y_train)
Example #17
	def get_gutenberg_data(self):
		count = {}
		self.len_list = []
		#my_fileids = ['austen-sense.txt', 'austen-emma.txt', 'austen-persuasion.txt']
		#my_fileids = ['chesterton-ball.txt', 'chesterton-ball.txt', 'chesterton-thursday.txt']
		my_fileids =['shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt']
		for fileids in my_fileids:
			for sent in gutenberg.sents(fileids):
				l = len(sent)
				#if l < 3:
					#continue
				self.len_list.append(l)
				if l in count:
					count[l] += 1
				else:
					count[l] = 1
		total = len(self.len_list)
		for i in range(100):
			if i in count.keys():
				self.probs.append(count[i]/(total+0.0))
			else:
				self.probs.append(0)
Example #18
def test_train_selection(
        choice):  # For Dataset combination selection S1,S2,S3,S4

    total_sent1 = list(brown.sents())
    total_sent2 = list(gutenberg.sents())
    # Start and end of sentence tagging
    for sent in total_sent1:
        sent.insert(0, "<s>")
        sent.insert(len(sent), "</s>")

    for sent2 in total_sent2:
        sent2.insert(0, "<s>")
        sent2.insert(len(sent2), "</s>")

    train1, test1 = train_test_split(total_sent1,
                                     test_size=0.1,
                                     random_state=4)
    train2, test2 = train_test_split(total_sent2,
                                     test_size=0.1,
                                     random_state=4)
    #optimal discount values are being passed which had been calculated through held out data
    if choice == 1:  # D1-train , D1- test
        LM_model_kneser(train1, test1, 5500, 0.8)
        LM_model_katz(train1, test1, 5500, 0.75)
        LM_model_trigram(train1, test1, 5500)

    elif choice == 2:  # D2-train , D2- test
        LM_model_kneser(train2, test2, 5500, 0.8)
        LM_model_katz(train2, test2, 5500, 0.75)
        LM_model_trigram(train2, test2, 5500)
    elif choice == 3:  # D1 + D2 train , D1- test
        LM_model_kneser(train1 + total_sent2, test1, 5500, 0.75)
        LM_model_katz(train1 + total_sent2, test1, 5500, 0.6)
        LM_model_trigram(train1 + total_sent2, test1, 5500)
    else:  # D1 + D2 train , D2- test
        LM_model_kneser(total_sent1 + train2, test2, 5500, 0.8)
        LM_model_katz(total_sent1 + train2, test2, 5500, 0.6)
        LM_model_trigram(total_sent1 + train2, test2, 5500)
Example #19
def text_cleaning():
    brown_data = []
    brown_data = brown.sents()
    gutenberg_data = gutenberg.sents()
    punctuations = [',', '.', ':', ';', '?', '"', '!', '--', '(', ')', '``']
    punctuations.append("''")
    for sentence in brown_data:
        sentence.insert(0, '<s>')
        sentence.insert(0, '<s>')
        sentence.append('</s>')
        # filter instead of calling remove() while iterating, which skips items
        sentence = [w for w in sentence if w not in punctuations]
        brown_text.append(sentence)

    for sentence in gutenberg_data:
        sentence.insert(0, '<s>')
        sentence.insert(0, '<s>')
        sentence.append('</s>')
        sentence = [w for w in sentence if w not in punctuations]
        gutenberg_text.append(sentence)
def q1():
	# 1. Print the number of word tokens
	# YOUR CODE
	from nltk.corpus import gutenberg as gb
	#if you want to print all file ids in gutenberg archive
	#print(gb.fileids())
	file_id = 'austen-sense.txt'
	word_list = gb.words(file_id)
	print(len(word_list))

	# 2. Print the number of word types
	# YOUR CODE
	print(len( set( [ w.lower() for w in word_list ]) ))
	
	# 3. Print all tokens in the first sentence
	# YOUR CODE
	sent_list = gb.sents(file_id)
	print(' '.join(sent_list[0]))

	# if you want to tokenize a string
	raw = 'i have a book.'
	from nltk import word_tokenize as wt 
	word_list = wt(raw)
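	# note: word_tokenize splits off the trailing period, so word_list here
	# is ['i', 'have', 'a', 'book', '.']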
Example #21
def getFreq(n):
    freq = {}
    length2 = {}

    for category in brown.categories():
        sentences = brown.sents(categories=category)
        length2[category] = len(sentences)
        for sentence in sentences[:int(length2[category] * 0.9)]:
            text = " <s> " + ' '.join(
                re.compile(r'\w+').findall(
                    ' '.join(sentence))).lower() + " </s> "
            model = myNGrams(text, n)
            for x in model:
                line = ' '.join(x)
                count = len(re.findall(" " + line + " ", text))
                if (x not in freq) and (count != 0):
                    freq[x] = 0
                if count != 0:
                    freq[x] += count
    length = {}
    for category in gutenberg.fileids():
        sentences = gutenberg.sents(category)
        length[category] = len(sentences)
        for sentence in sentences[:int(length[category] * 0.9)]:
            text = " <s> " + ' '.join(
                re.compile(r'\w+').findall(
                    ' '.join(sentence))).lower() + " </s> "
            model = myNGrams(text, n)
            for x in model:
                line = ' '.join(x)
                count = len(re.findall(" " + line + " ", text))
                if (x not in freq) and (count != 0):
                    freq[x] = 0
                if count != 0:
                    freq[x] += count

    return [freq, length]
Example #22
def write_sentence():
    """Step 2: Choose a work, identify the author, and choose a sentence."""
    work = gutenberg.fileids()[np.random.randint(len(gutenberg.fileids()))]
    author = re.findall('(\w+)-', work)[0].title()

    sentences = gutenberg.sents(work)

    rndm_sentence = sentences[np.random.randint(len(sentences))]

    tagged_rndm_sentence = pos_tag(rndm_sentence)
    """Step 3: Replace every word in the sentence with another word that can have the same POS."""

    new_sentence = [
        tup[0]
        if tup[1] in ['DT', 'NNP', '.', ','] or tup[1] not in tagged_words_dict
        else tagged_words_dict[tup[1]][np.random.randint(
            len(tagged_words_dict[tup[1]]))] for tup in tagged_rndm_sentence
    ]

    new_detokenized_sentence = str(
        TreebankWordDetokenizer().detokenize(new_sentence))
    new_detokenized_sentence = new_detokenized_sentence[0].upper(
    ) + new_detokenized_sentence[1:]
    if new_detokenized_sentence[-1].isalnum():
        new_detokenized_sentence = new_detokenized_sentence + '.'

    #print(f"{author}:", tagged_rndm_sentence)
    #print(f"{author}:", new_detokenized_sentence)
    #for tag_s, new_s in zip(tagged_rndm_sentence, new_detokenized_sentence.split()):
    #   print(tag_s, new_s)

    if len(new_sentence) <= 3:
        return write_sentence()
    if len(author) + len(new_detokenized_sentence) > 278:
        return write_sentence()
    else:
        return (author, re.sub('[\)]', '', new_detokenized_sentence))
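
write_sentence relies on a prebuilt tagged_words_dict that maps each POS tag to candidate words; it is not shown in this snippet. One hedged way such a mapping could be built (the corpus choice and the 5000-token slice are arbitrary):

from collections import defaultdict
from nltk import pos_tag
from nltk.corpus import gutenberg

tagged_words_dict = defaultdict(list)
for word, tag in pos_tag(list(gutenberg.words('austen-emma.txt'))[:5000]):
    if word.isalpha():
        tagged_words_dict[tag].append(word)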
Example #23
def get_poem():
    """
    This function should extract hexametric sentences from Gutenberg texts, but it doesn't.
    Either hexametric sentences are too rare, or the absence of basic function words from CMUdict results in problems
    with the matching of the whole sentence.
    """
    outtext = []
    for corpus in gutenberg.fileids():
        text = gutenberg.sents(corpus)
        for sentence in text:
            transcription = ""
            discard = False
            for word in sentence:
                if word.lower() in words:
                    transcription += words[word.lower()]
                elif re.match(one_syllable, word.lower()):
                    # consider this word a "small", unstressed word
                    transcription += "A0A"
                else:
                    discard = True
            if re.match(verse, transcription) and not discard:
                print(sentence, transcription)
                outtext.append(" ".join(sentence))
    return "\n".join(outtext)
Example #25
from nltk.corpus import gutenberg

alice = gutenberg.sents('carroll-alice.txt')


def count_word():
    result = {}
    for sentence in alice:
        for word in sentence:
            normalized_word = word.lower()
            if normalized_word.isalpha():
                result[normalized_word] = result.setdefault(
                    normalized_word, 0) + 1
    return result


def count_first_word():
    result = {}
    for sentence in alice:
        first_word = sentence[0]
        first_word = first_word.lower()
        if first_word.isalpha():
            result[first_word] = result.setdefault(first_word, 0) + 1
    return result


def count():
    wordf = count_word()
    firstf = count_first_word()

    words = sorted(wordf.keys())
Example #26
File: toturial.py  Project: Paul-Lin/misc
def print_longest():
    macbeth_sentences=gutenberg.sents('shakespeare-macbeth.txt')
    # print macbeth_sentences
    # print macbeth_sentences[1037]
    longest_len=max([len(s) for s in macbeth_sentences])
    print [s for s in macbeth_sentences if len(s)==longest_len]
def makePowerMatrix():
	matrix = [["uniquecount", "sentence length", "avg word length", "digit prop", "capital prop",
				"quotation", "question", "exclamation", "noun", "adj", "adv", "verb", "foreign", 
				"preposition", "pronoun", "interjection","childW", "historyW","religionW","scienceW",
				"FILE", "CLASS"]]
	rowLength = len(matrix)
	i = 0
	j = 0
	for path in parse.paths: #parse.paths
		for file in glob.glob(path):
			row = [0.0] * 22

			wholetext = parsetotext(file)
			textlist = parse.parse(file)
			textlistNL = parse.parse(file, False) # not lower case
			# textliststem = stemparse.parse(file)
			doclen = len(wholetext)
			wordcount = len(textlist)

			question = wholetext.count("?")
			exclamation = wholetext.count("!")
			quotations = (wholetext.count("'") + wholetext.count('"'))
			uniquecount = len(list(set(textlist)))

			num_words = len(gutenberg.words(file))
			num_sents = len(gutenberg.sents(file))

			if wordcount != 0:
				row[0] = uniquecount/(wordcount*1.0)
				lenmap = map(len, textlist)
				row[2] = sum(lenmap)/(wordcount*1.0)
			if num_sents != 0:
				row[1] = round(num_words/num_sents)

			if doclen != 0:
				row[3] = sum(c.isdigit() for c in wholetext)/(doclen*1.0)
				a = [x.isupper() for x in [y[0] for y in textlistNL]]
				row[4] = sum(a)/(doclen*1.0)

			row[5] = quotations / (wordcount * 1.0)
			row[6] = question / (wordcount * 1.0)
			row[7] = exclamation / (wordcount * 1.0)

			wholetext = unicode(wholetext, errors='replace')
			text = nltk.word_tokenize(wholetext)
			a = nltk.pos_tag(text)
			tag_fd = nltk.FreqDist(tag for (word, tag) in a)
			a = tag_fd.most_common()
			count = sum([y for (x, y) in a])
			if count != 0:
				row[8] = sum([y for (x, y) in a if x in ["NN", "NNP", "NNPS", "NNS"]])/(count*1.0) #Noun NN NNP NNPS NNS
				row[9] = sum([y for (x, y) in a if x in ["JJ", "JJR", "JJS"]])/(count*1.0) #Adj JJ JJR JJS
				row[10] = sum([y for (x, y) in a if x in ["RB", "RBR", "RBS", "WRB"]])/(count*1.0) #Adv RB RBR RBS WRB
				row[11] = sum([y for (x, y) in a if x in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]])/(count*1.0) #Verb VB VBD VBG VBN VBP VBZ
				row[12] = sum([y for (x, y) in a if x in ["FW"]])/(count*1.0) #foreign
				row[13] = sum([y for (x, y) in a if x in ["IN"]])/(count*1.0) #prepo
				row[14] = sum([y for (x, y) in a if x in ["PRP", "PRP$", "WP", "WP$"]])/(count*1.0) #pronoun
				row[15] = sum([y for (x, y) in a if x in ["UH"]])/(count*1.0) #interjection

			if wordcount != 0:
			 	row[16] = len([y for y in textlist if y in childW])/(wordcount*1.0)
			 	row[17] = len([y for y in textlist if y in historyW])/(wordcount*1.0)
			 	row[18] = len([y for y in textlist if y in religionW])/(wordcount*1.0)
			 	row[19] = len([y for y in textlist if y in scienceW])/(wordcount*1.0)
			 	# row[20] = len([y for y in textliststem if y in childWS])/(wordcount*1.0)
			 	# row[21] = len([y for y in textliststem if y in historyWS])/(wordcount*1.0)
			 	# row[22] = len([y for y in textliststem if y in religionWS])/(wordcount*1.0)
			 	# row[23] = len([y for y in textliststem if y in scienceWS])/(wordcount*1.0)

			row[-1] = j # This number assigns class
			row[-2] = re.search('[0-9]+\.txt', file).group() # Extracts file name (Ex: "123.txt")
			matrix += [row]
			i += 1
			print(path[-20:] + " on iteration " + str(i))
		j += 1
	return matrix
Example #28
from nltk.corpus import gutenberg
gutenberg.fileids()
#print( gutenberg.fileids() )
emma = gutenberg.words('austen-emma.txt')
print(len(emma))

'''
This program displays three statistics for each text: average word length, average sentence 
length, and the number of times each vocabulary item appears in the text on average 
(our lexical diversity score).
'''

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print (int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab),fileid )

#Returns a List of sentences
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')

print( macbeth_sentences )
print( macbeth_sentences[1037] )

# Return the length of the longest sentence
longest_len = max([len(s) for s in macbeth_sentences])

# Save the longest sentences
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]
Example #29
from nltk.corpus import gutenberg
from nltk.probability import (FreqDist, ConditionalProbDist,
                              ConditionalFreqDist, LidstoneProbDist)
from nltk.util import ngrams
from nltk.model.api import ModelI
from nltk.model.ngram import NgramModel
from random import randint
from nltk.tokenize import word_tokenize
import numpy as np

est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

noOfFiles = 3
fileids = ['bryant-stories.txt', 'carroll-alice.txt', 'shakespeare-hamlet.txt']
Authors = ['Bryant', 'Carroll', 'Shakespeare']
lenFirstSent = [
    len(gutenberg.sents(fileids[i])[0]) - 1 for i in range(noOfFiles)
]
C = [gutenberg.words(fileids[i])[lenFirstSent[i]:] for i in range(noOfFiles)]
lenC = [len(C[i]) for i in range(noOfFiles)]

unigram = [NgramModel(1, C[i], estimator=est) for i in range(noOfFiles)]
bigram = [
    NgramModel(2, C[i], True, True, estimator=est) for i in range(noOfFiles)
]
trigram = [
    NgramModel(3, C[i], True, True, estimator=est) for i in range(noOfFiles)
]


def generateText(model, train):
    pos = []
Example #30
 def _nltk_prep_gutenberg(self, gutenbergF: str):
     try:
         gutenberg.sents()
     except LookupError as _le:
         nltk.download('gutenberg')
     return gutenberg
Example #31
for word in ['Call', 'me', 'Ishmael', '.']:
    print word

# Access the Gutenberg corpus
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")
from nltk.corpus import gutenberg
gutenberg.fileids()

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid

# Sentence segmentation
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
longest_len = max([len(s) for s in macbeth_sentences])
# Web text and chat corpora
from nltk.corpus import webtext
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]
from nltk.corpus import brown
brown.categories()
brown.sents(categories=['news', 'editorial', 'reviews'])
news_text = brown.words(categories='news')
Example #32
import nltk
from nltk.corpus import gutenberg
from nltk.corpus import brown

print(gutenberg.fileids())

print(len(gutenberg.raw('austen-emma.txt')))

macbeth = gutenberg.sents('shakespeare-macbeth.txt')
print(macbeth[1:5])

print(brown.categories())

print(brown.words(categories='news'))

news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
print(fdist)

cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
                               for word in brown.words(categories=genre))
print(cfd)

genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance']
modals = ['can', 'could', 'may', 'might', 'must', 'will']

cfd.tabulate(conditions=genres, samples=modals)
Example #33
from nltk.corpus import gutenberg
from nltk.util import ngrams
from kneser_ney import KneserNeyLM

gut_ngrams = (ngram for sent in gutenberg.sents() for ngram in ngrams(
    sent, 3, pad_left=True, pad_right=True, pad_symbol='<s>'))
lm = KneserNeyLM(3, gut_ngrams, end_pad_symbol='<s>')
print(lm.score_sent(('This', 'is', 'a', 'sample', 'sentence', '.')))
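
kneser_ney here is a project-local module, not part of NLTK. Note also that recent NLTK releases expose left_pad_symbol/right_pad_symbol on nltk.util.ngrams instead of a single pad_symbol; under that signature the generator would look roughly like:

from nltk.corpus import gutenberg
from nltk.util import ngrams

gut_ngrams = (
    ngram
    for sent in gutenberg.sents()
    for ngram in ngrams(sent, 3, pad_left=True, pad_right=True,
                        left_pad_symbol='<s>', right_pad_symbol='<s>')
)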
Example #34
import nltk
from nltk.corpus import gutenberg
print nltk.corpus.gutenberg.fileids()
print gutenberg.fileids()
print gutenberg.words('austen-sense.txt')

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(round(num_chars/num_words), round(num_words/num_sents), round(num_words/num_vocab), fileid)
Example #35
File: graph.py  Project: spacenut/cs585
    def __init__(self, iterable):
        assert len(iterable) == 2
        tuple.__init__(self, iterable)
        self.label = iterable[0]
        self.child = iterable[1]

    def __new__(cls, *args, **kwargs):
        assert len(args) == 1
        assert len(args[0]) == 2
        return tuple.__new__(cls, args[0])


if __name__ == '__main__':
    import nltk
    from nltk.corpus import gutenberg
    import parser

    def get_deps(s):
        return parser.cp.parse_trees(s, transform=parser.to_deps)


    G = FreqGraph()
    alice_sents = gutenberg.sents(fileids='carroll-alice.txt')
    for sent in alice_sents:
        dep = get_deps(' '.join(sent))
        try:
            term = logic.Term(dep.next())
            G.clique(term)
        except:
            pass
    #G.ingest(get_deps('Barack is president'))
Example #36
train_set = []
dev_set = []
test_set = []
for c in brown.categories():
    temp = brown.fileids(c)
    temp_length = len(temp)
    train_set += temp[:int(np.ceil(0.6 * temp_length))]
    dev_set += temp[int(np.ceil(0.6 *
                                temp_length)):int(np.ceil(0.8 * temp_length))]
    test_set += temp[int(np.ceil(0.8 * temp_length)):]
    #test_set += temp[-1:]

# In[3]:

brown_sent_train = brown.sents(train_set) + gb.sents(gb_train_set)
brown_words_train = brown.words(train_set) + gb.words(gb_train_set)
brown_words_train = list(
    filter(
        lambda a: a not in
        ("``", "''", "--", ".", ",", "!", ";", "(", ")", "?", ":"),
        brown_words_train))
brown_words_train = [x.lower() for x in brown_words_train]
brown_words_train += ['<s>', '</s>'] * len(brown_sent_train)
brown_unigram_dict_train1 = FreqDist(brown_words_train)
brown_unigram_dict_train = copy.deepcopy(brown_unigram_dict_train1)
c = 0
for (k, v) in brown_unigram_dict_train1.items():
    if (v <= 3):
        c += 1
        brown_unigram_dict_train['<unk>'] = brown_unigram_dict_train.pop(k)
Example #37
        text = remove_stopwords(text)
        normalized_corpus.append(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)

    return normalized_corpus

# Normalize the data
from nltk.corpus import gutenberg
from string import punctuation
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

bible = gutenberg.sents('bible-kjv.txt')
remove_terms = punctuation + '0123456789'

norm_bible = [[word.lower() for word in sent
               if word not in remove_terms] for sent in bible]

norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]

norm_bible = filter(None, normalize_corpus(norm_bible))
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]
print('Total lines:', len(bible))
print('\nSample line:', bible[10])
print('\nProcessed line:', norm_bible[10])

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
Example #38
from nltk.corpus import gutenberg

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
# -*- coding: utf-8 -*-
"""

"""

from nltk.corpus import gutenberg
from normalization import normalize_corpus
import nltk
from operator import itemgetter

alice = gutenberg.sents(fileids='carroll-alice.txt')
alice = [' '.join(ts) for ts in alice]
norm_alice = filter(None, normalize_corpus(alice, lemmatize=False))

# print first line
print norm_alice[0]


def flatten_corpus(corpus):
    return ' '.join([document.strip() for document in corpus])


def compute_ngrams(sequence, n):
    return zip(*[sequence[index:] for index in range(n)])
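
compute_ngrams just zips shifted copies of the token sequence; wrapping the result in list() shows the n-grams under both Python 2 and Python 3:

print(list(compute_ngrams(['the', 'brown', 'fox'], 2)))
# [('the', 'brown'), ('brown', 'fox')]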


def get_top_ngrams(corpus, ngram_val=1, limit=5):

    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)
Example #40
pickle_in = open("gutenbergBrownUnigrams.pickle", "rb")
Unigrams = pickle.load(pickle_in)

pickle_in = open("gutenbergBrownBigrams.pickle", "rb")
Bigrams = pickle.load(pickle_in)

pickle_in = open("gutenbergBrownTrigrams.pickle", "rb")
Trigrams = pickle.load(pickle_in)

print("Pickle in time:" + str(time.time() - start))

# # TEST MODULE - outputs perplexity # #
start = time.time()
totPerplexity = 0
iterCount = 0
for sent in gutenberg.sents()[78842:]:
    probOfSent = 0
    sent.insert(0, "<s>")
    sent.append("<\s>")
    prev_word2 = sent[0]
    word = sent[1]
    try:
        probOfSent += Bigrams[prev_word2 + " " + word]
    except KeyError:
        probOfSent += -6
    prev_word1 = sent[0]
    prev_word2 = sent[1]
    for word in sent[2:]:
        try:
            trigram = prev_word1 + " " + prev_word2 + " " + word
            if trigram in Trigrams:
Example #41
import nltk
from nltk.corpus import gutenberg

fileids = gutenberg.fileids()
# print 'fileids: ', fileids

emma = gutenberg.words('austen-emma.txt')

# average characters in a word: raw/words
# average word in a sentence: words/sents
# lexical diversity - num_words/num_vocab

# for fileid in fileids:
# 	num_chars = len(gutenberg.raw(fileid))
# 	num_words = len(gutenberg.words(fileid))
# 	num_sents = len(gutenberg.sents(fileid))
# 	num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
# 	print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid

macbeth_sents = gutenberg.sents('shakespeare-macbeth.txt')

longest_len = max([len(s) for s in macbeth_sents])
longest_sent = [s for s in macbeth_sents if len(s) == longest_len]

print 'longest_sent: ', longest_sent
Example #42
def pmi_with_cython(input_corpus):
    logging.debug(msg='With cython is True')
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus, method='pmi', n_jobs=-1, use_cython=True)
    elapsed_time = time.time() - start
    print(("elapsed_time with cython:{} [sec]".format(elapsed_time)))


from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import genesis
from nltk.corpus import abc

abs_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

input_corpus = {
    'abs': list(abs_corpus),
    'genesis': list(genesis_corpus),
    'web': list(web_corpus),
    'gutenberg': list(gutenberg_corpus)
}

pmi_with_cython(input_corpus)
pmi_with_parallel(input_corpus)
#pmi_with_threading(input_corpus)
Example #43
def analyze(book_name):
    # This function analyzes the 'book_name' file and prints out its characteristics using nltk package

    # Extracting characters, words and sentences respectively below
    characters = g.raw(book_name)
    words = g.words(book_name)
    sentences = g.sents(book_name)

    max_length_word = words[0]
    max_length_sentence = sentences[0]
    max_length_sentence_word_count = len(max_length_sentence)
    vocabulary = list()
    stem_families = dict()
    stemmer = PorterStemmer()

    for word in words:
        # Checking for the longest word
        if len(word) > len(max_length_word):
            max_length_word = word
        stemmed_word = stemmer.stem(word)
        # Creating a vocabulary of stemmed words and a dictionary for stem families
        if stemmed_word not in vocabulary and stemmed_word.isalpha():
            vocabulary.append(stemmed_word)
            stem_families[stemmed_word] = list()
            stem_families[stemmed_word].append(word.lower())
        elif stemmed_word in vocabulary and word.lower(
        ) not in stem_families[stemmed_word]:
            stem_families[stemmed_word].append(word.lower())

    for sentence in sentences:
        # Checking for the longest sentence
        if len(sentence) > len(max_length_sentence):
            max_length_sentence = sentence
            max_length_sentence_word_count = len(max_length_sentence)
    # Converting that largest sentence from a list of words to a cumulative string sentence
    max_length_sentence_string = " "
    max_length_sentence_string = max_length_sentence_string.join(
        max_length_sentence)

    max_stem_family = list(list(stem_families.items())[0])
    for key, value in stem_families.items():
        # Checking for the largest stem family
        if len(value) > len(max_stem_family[1]):
            max_stem_family[0] = key
            max_stem_family[1] = list(value)

    # Printing the characteristics as requested
    print("Analysis of '%s'" % book_name)
    print("# chars =", len(characters))
    print("# words =", len(words))
    print("# sentences =", len(sentences))
    print("Longest word = '%s'" % max_length_word)
    print("Longest sentence = '%s' (%d words)" %
          (max_length_sentence_string, max_length_sentence_word_count))
    print("Vocab size =", len(vocabulary))
    print("Largest stem family '%s' : {" % max_stem_family[0], end=" ")
    for i in range(len(max_stem_family[1])):
        if i != 0:
            print(",", end=" ")
        print("'%s'" % max_stem_family[1][i], end=" ")
    print("}")
Example #44
import nltk
import numpy as np
from nltk.corpus import gutenberg
import pickle


def save_object(obj, filename):
    # open in binary mode: pickle writes bytes
    with open(filename, "wb") as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)


sents = gutenberg.sents("blake-poems.txt")

table = []
for i in range(20):
    table.append([])
for s in sents[1:]:
    # TODO: map '?', '!' and ',' to '.' and don't count '.' toward the sentence length
    if len(s) > 2 and len(s) < 20:
        tags = nltk.pos_tag(s)
        simpleTags = [(word, nltk.map_tag("en-ptb", "universal", tag)) for word, tag in tags]
        tagsOnly = [t[1] for t in simpleTags]
        # this is to filter out headlines
        if tagsOnly[len(tagsOnly) - 1] == ".":
            wordCount = len(tagsOnly) - tagsOnly.count(".")
            table[wordCount].append(tagsOnly)

save_object(table, "grammar")
Example #45
import nltk 
from spacy.en import English
from nltk.corpus import gutenberg

sentences = []
for fileid in gutenberg.fileids():
    txt = gutenberg.sents(fileid)
    for sent_array in txt:
        sentences.append(' '.join(sent_array))
    break

doc = ' '.join(sentences)
nlp = English()
doc_processed = nlp(doc)
for sent in doc_processed.sents:
    for token in sent:
        if token.is_alpha:
            print(token.orth_ + "," + token.tag_ + "," + token.head.lemma_)
Example #46
#!/usr/bin/python

"""Just a testing program for NLTK library. It is a NLP library for Python. Some kick-ass library this is. :)
	Pre-Requisites: NLTK Library installed, And Download additional data for the library using it's command.
	You can use "Natural Language Processing with Python" book from O'Reilley Publications for further details.
	This program prints some statistics for the Corpus(a large compiled collection of text files). """

import nltk
from nltk.corpus import gutenberg

for fid in gutenberg.fileids():
	nchars = len(gutenberg.raw(fid))
	nwords = len(gutenberg.words(fid))
	nsents = len(gutenberg.sents(fid))
	nvocab = len(set(w.lower() for w in gutenberg.words(fid)))
	print "%s %s %s %s" % (int(nchars/nwords), int(nwords/nsents), int(nwords/nvocab), fid)
Example #47
File: bm25.py  Project: matulma4/esc
            if a in result:
                result[a] += freq
            else:
                result[a] = freq
    result = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
    return result


if __name__ == "__main__":
    # dataset = load_data('mycorpus.txt')
    # edited_data = edit_data(dataset)
    # dataset = gutenberg.sents('carroll-alice.txt')
    # dataset = gutenberg.sents('milton-paradise.txt')
    # dataset = gutenberg.sents('bible-kjv.txt')
    nltk.download('gutenberg')
    dataset = gutenberg.sents(gutenberg.fileids())
    #for fileid in gutenberg.fileids():
    #    dataset += gutenberg.sents(fileid)
    edited_data = edit_data(dataset)
    avg = 0
    for doc in edited_data:
        avg += len(doc)
    avg = avg / len(edited_data)
    # print('Number of documents: ' + str(len(edited_data)))
    # print('Average length of document: '+ str(avg))
    dictionary = make_dict(edited_data)
    corpus = [dictionary.doc2bow(text) for text in edited_data]
    # print('20 most common words of corpus:')
    freq_data = get_frequency(corpus, dictionary)
    f = 0
    for doc, freq in freq_data:
Example #48
from __future__ import print_function

from nltk.corpus import gutenberg

if __name__ == '__main__':
    # Print all Gutenberg corpus documents
    print('Gutenberg corpus files:')
    print(gutenberg.fileids())

    # Print a raw corpus
    print(gutenberg.raw('milton-paradise.txt'))

    # Print 2 sentences from a corpus
    print(gutenberg.sents('milton-paradise.txt')[0:2])

    # Print 20 words from a corpus
    print(gutenberg.words('milton-paradise.txt')[0:20])
Example #49
from nltk.corpus import gutenberg
from gensim.models import word2vec
from string import punctuation

bible_kjv_words = gutenberg.words('bible-kjv.txt')
bible_kjv_sents = gutenberg.sents('bible-kjv.txt')

discard_punctuation_and_lowercased_sents = [[word.lower() for word in sent if word not in punctuation] for sent in bible_kjv_sents]

bible_kjv_word2vec_model = word2vec.Word2Vec(discard_punctuation_and_lowercased_sents, min_count=5, size=200)
bible_kjv_word2vec_model.save("bible_word2vec_gensim")
bible_kjv_word2vec_model.wv.save_word2vec_format("bible_word2vec_org", "bible_word2vec_vocabulary")

print(bible_kjv_word2vec_model.most_similar(["god"]))
print(bible_kjv_word2vec_model.most_similar(["apple"]))
Example #50
from nltk.corpus import gutenberg
from nltk.text import Text

from chatbot.markovgenerator import markovmodeller

if __name__ == "__main__":
    my_essay = True
    if my_essay:
        txt = ""
        with open('res/ans.txt') as text:
            for line in text:
                txt += line

        model = markovmodeller.build_markov_model_from_string(txt)

    else:
        list_sentences = Text(gutenberg.sents('austen-sense.txt'))
        model = markovmodeller.build_markov_model_from_list_of_sentences(
            list_sentences)

    while (True):
        walk_string = markovmodeller.get_walk(model)
        print(walk_string)
        x = input("--- type X to stop")
        if x == "X":
            break
    for i in range(len(words_te)-2):
            x=(words_te[i],words_te[i+1],words_te[i+2])
            if(Interpolated_Kneser_Ney_dict.get(x,"empty")=="empty"):
                if((x[0],x[1]) not in bgcounter):
                    Interpolated_Kneser_Ney_dict[x]=findPKn_bigram((x[1],x[2]))

                else:
                    Interpolated_Kneser_Ney_dict[x]=findPKn_trigram(x,discount_final)

            perp=perp*((1/Interpolated_Kneser_Ney_dict[x])**(1/N))
    return perp


# In[30]:

text_gutenberg=list(gutenberg.sents())
text_brown=list(brown.sents())
text_gutenberg_size=len(text_gutenberg)
text_gutenberg_size=len(text_gutenberg)
text_brown_size=len(text_brown)
for i in range(text_gutenberg_size):
    text_gutenberg[i].insert(0,"<s>")
    text_gutenberg[i].insert(len(text_gutenberg[i]),'<e>')
    text_gutenberg[i].insert(len(text_gutenberg[i]),'<e>')
for i in range(text_brown_size):
    text_brown[i].insert(0,"<s>")
    text_brown[i].insert(len(text_brown[i]),'<e>')
    text_brown[i].insert(len(text_brown[i]),'<e>')
text_gutenberg_tr,text_gutenberg_te=train_test_split(text_gutenberg,test_size=.20,random_state=4)
text_brown_tr,text_brown_te=train_test_split(text_brown,test_size=.20,random_state=4)
Example #52
from nltk.corpus import gutenberg
from nltk.util import ngrams
from kneser_ney import KneserNeyLM

gut_ngrams = (
    ngram for sent in gutenberg.sents() for ngram in ngrams(sent, 3,
    pad_left=True, pad_right=True, pad_symbol='<s>'))
lm = KneserNeyLM(3, gut_ngrams, end_pad_symbol='<s>')
print(lm.score_sent(('This', 'is', 'a', 'sample', 'sentence', '.')))
Example #53
        return ngram[-1]
        
    def highest_order_probs(self):
        return self.lm[0]
    
    def generate_sentence(self, min_length=4):
        """
        Generate a sentence using the probabilities in the language model
        min_length: int, the minimum number of words in the sentence.
        """
        sent = []
        probs = self.highest_order_probs()
        while len(sent) < min_length + self.highest_order:
            sent = [self.start_pad_symbol]*(self.highest_order-1)
            sent.append(self.generate_next_word(sent,probs))
            while sent[-1] != self.end_pad_symbol:
                sent.append(self.generate_next_word(sent,probs))
        sent = " ".join(sent[(self.highest_order-1):-1])
        return sent
            
        
    
    
## how to test it
from nltk.corpus import gutenberg
from nltk.util import ngrams
from ModifiedKneserNeyLM import ModifiedKneserNeyLM

gut_ngrams = (ngram for sent in gutenberg.sents() for ngram in ngrams(sent, 3, pad_left=True, pad_right=True, pad_symbol='<s>'))
lm = ModifiedKneserNeyLM(3, gut_ngrams, end_pad_symbol='<s>')
print(lm.score_sent(('This','is','a','sample','sentence','.')))
Example #54
    if (len(text) != 10):
        return False, s
    else:
        return True, s


if __name__ == '__main__':
    time.clock()

    print()

    brown_corpus = list(brown.sents(brown.fileids()))
    for i in range(len(brown_corpus)):
        brown_corpus[i] = list(map(lambda x: x.lower(), brown_corpus[i]))
    gutenberg_corpus = list(gutenberg.sents(gutenberg.fileids()))
    for i in range(len(gutenberg_corpus)):
        gutenberg_corpus[i] = list(
            map(lambda x: x.lower(), gutenberg_corpus[i]))
    combined_corpus = brown_corpus + gutenberg_corpus

    unigram_list, bigram_list = training(combined_corpus)
    i = 0
    while (i < 1):
        bool, s = generate_trigram_token(bigram_list)
        if (bool):
            i += 1
            print(s)

    print()
    print('Total time taken', str(time.clock()))
Example #55
# Module 5: Word Embedding
# Gutenberg Word2Vec
# Author: Dr. Alfred

from gensim.models import Word2Vec
from nltk.corpus import gutenberg

embedding = Word2Vec(gutenberg.sents(), min_count=1, window=5, size=32)

print(embedding['man'])
print(embedding.most_similar('man', topn=5))
print(embedding.most_similar('woman', topn=5))

print(embedding.most_similar(positive=['woman', 'king'], negative=['man']))
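
The calls above follow the pre-4.0 gensim API (size=, model['word'], model.most_similar). A rough equivalent under gensim 4.x, treated as a sketch, would be:

from gensim.models import Word2Vec
from nltk.corpus import gutenberg

embedding = Word2Vec(gutenberg.sents(), min_count=1, window=5, vector_size=32)

print(embedding.wv['man'])
print(embedding.wv.most_similar('man', topn=5))
print(embedding.wv.most_similar(positive=['woman', 'king'], negative=['man']))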
Example #56
File: ch02.py  Project: gree2/hobby
def fun03():
    """fun03"""
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    print macbeth_sentences[1037]
    longest_len = max([len(s) for s in macbeth_sentences])
    print [s for s in macbeth_sentences if len(s) == longest_len]
Example #57
 for word in wordlist:
     try:
         for thing in wikipedia.search(word):
             #print("SEARCH TERM: "+thing)
             #print(wikipedia.page(thing))
             #print(wikipedia.page(thing).content)
             pages += wikipedia.page(thing).content
     except wikipedia.DisambiguationError as e:
         s = random.choice(e.options)
         get_wiki(s)
         pass
     except:
         pass
 b = brown.sents()
 sents = tokenizer.tokenize(pages)
 sense = gutenberg.sents('austen-sense.txt')
 emma = gutenberg.sents('austen-emma.txt')
 persuasion = gutenberg.sents('austen-persuasion.txt')
 bible = genesis.sents('english-kjv.txt')
 blake = gutenberg.sents('blake-poems.txt')
 bryant = gutenberg.sents('bryant-stories.txt')
 burgess = gutenberg.sents('burgess-busterbrown.txt')
 carroll = gutenberg.sents('carroll-alice.txt')
 ch_ball = gutenberg.sents('chesterton-ball.txt')
 ch_brown = gutenberg.sents('chesterton-brown.txt')
 ch_thurs = gutenberg.sents('chesterton-thursday.txt')
 edge = gutenberg.sents('edgeworth-parents.txt')
 mel = gutenberg.sents('melville-moby_dick.txt')
 mil = gutenberg.sents('milton-paradise.txt')
 caesar = gutenberg.sents('shakespeare-caesar.txt')
 hamlet = gutenberg.sents('shakespeare-hamlet.txt')
Example #58
File: NLP.py  Project: Toma-L/NLP
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
len(emma)
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance('surprize')

#another way to do this

from nltk.corpus import gutenberg
gutenberg.fileids()
emma = gutenberg.words('austen-emma.txt')

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print (int(num_chars/num_words), int(num_words/num_sents)) # avg word length and avg sentence length

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences #load sentences of Macbeth
macbeth_sentences[1037]
longest_len = max([len(s) for s in macbeth_sentences])
[s for s in macbeth_sentences if len(s) == longest_len] #find longest sentence

from nltk.corpus import webtext
for fileid in webtext.fileids():
    print (fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
Example #59
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# #library
# nltk.download('gutenberg')

# #puctionation and tokenizer
# nltk.download('punkt')

from nltk.corpus import gutenberg

print(gutenberg.fileids())

gberg_sents = gutenberg.sents(fileids=[
    'bible-kjv.txt', 'austen-emma.txt', 'austen-persuasion.txt',
    'austen-sense.txt', 'carroll-alice.txt'
])

# SENTENCE COUNT
print(
    len(
        gutenberg.sents(fileids=[
            'bible-kjv.txt', 'austen-emma.txt', 'austen-persuasion.txt',
            'austen-sense.txt', 'carroll-alice.txt'
        ])))

model = Word2Vec(sentences=gberg_sents,
                 size=64,
                 sg=1,
                 window=10,
                 min_count=5,
Example #60
from nltk.corpus import gutenberg;
from pynput.keyboard import Key, Listener;
import random;

def on_release(key):
    print(key);
    if key == 'q':
        exit();
    elif key == Key.enter:
        print('next');

fileCount = len(gutenberg.fileids());
fileName = gutenberg.fileids();

fileName =  gutenberg.sents(random.choice(fileName));

print(' '.join(fileName[0]),"\n");

while 1:
    sentence = random.randrange(0,len(fileName));
    sentence = ' '.join(fileName[sentence]);
    print(sentence);

    with Listener(
            on_release=on_release) as listener:
        listener.join()