Example #1
import cPickle as pickle
import random
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist
import nltk

print "... loading text"
text_train = list(nltk.corpus.gutenberg.words('austen-emma.txt'))
print len(set(text_train))
text_test = list(nltk.corpus.gutenberg.words('austen-sense.txt'))

#with open('./../datasets/t5_train') as f:
#    text_train =(' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_train)
#    text_train = (' . '.join(text_train)).split(' ')
#    
#with open('./../datasets/t5_test') as f:
#    text_test =(' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_test)
#    text_test = (' . '.join(text_test)).split(' ')

print "... training model"
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 
lm = NgramModel(3, text_train, estimator=estimator)

print "... results"
print lm.generate(50, ['dog'])
print lm.perplexity(text_test)
print lm.entropy(text_test)
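
Every example in this listing builds its estimator as LidstoneProbDist(fdist, 0.2). For reference, here is a minimal pure-Python sketch of what Lidstone smoothing and the reported perplexity compute for a plain unigram model; the function and variable names are illustrative, not part of NLTK:

import math
from collections import Counter

def lidstone_prob(word, counts, gamma=0.2, bins=None):
    # P(word) = (count + gamma) / (N + gamma * bins); bins ~ vocabulary size
    if bins is None:
        bins = len(counts)
    total = sum(counts.values())
    return (counts[word] + gamma) / float(total + gamma * bins)

def unigram_perplexity(test_tokens, counts, gamma=0.2, bins=None):
    # entropy = average -log2 P(w); perplexity = 2 ** entropy
    entropy = sum(-math.log(lidstone_prob(w, counts, gamma, bins), 2)
                  for w in test_tokens) / len(test_tokens)
    return 2 ** entropy

counts = Counter("the cat sat on the mat".split())
# bins is padded by one so unseen test words still get non-zero mass
print(unigram_perplexity("the dog sat".split(), counts, bins=len(counts) + 1))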
Example #2
class Ngrammodel:
  def __init__(self):
    self.sentences = []
    self.ngramModel = None
  
  def loadSentences(self, corpus):
    print 'In Load Sentences'
    lines = UnicodeHelper.readlinesSingleColumn(corpus)
    print 'lines loaded'
    self.sentences = [tuple(line.split()) for line in lines]
  
  def trainNgramModel(self, n):
    self.ngramModel = NgramModel(n, self.sentences)
  
  def sanityCheck(self):
    print 'here'
    for sentence in self.sentences:
      print sentence
  
  def logprob(self, word, context):
    return self.ngramModel.logprob(word, context)
  
  def prob(self, word, context):
    return self.ngramModel.prob(word, context)
  
  def testCompletion(self, context, word):
    print "Prob:", self.ngramModel.prob(word, context)
    print "Log Prob:", self.ngramModel.logprob(word, context)
Example #3
def demo_generate(text):
  print "len of tokens=", len(text)
  while True:
    N = raw_input("Select a number N for the N-gram model (2, 3, or 4 only):")
    N = int(N)
    if N in [2, 3, 4]: break
  if N == 2:
    bi = nltk.bigrams(text)
    cfd = nltk.ConditionalFreqDist(bi)
  else:
    from nltk.model import NgramModel
    from nltk.probability import LidstoneProbDist, WittenBellProbDist
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(N, text, estimator)
#    for w in lm.generate(20, context=('I')): print w,
  while 1:
    inp = raw_input('Enter a Chinese word such as "目前"(type 0 to exit):'); 
    print "inp='"+ inp + "'"
    if inp == '0': break
    inp = inp.decode('big5')
    if N == 2:
      generate_model(cfd, inp)
    else:
      for w in lm.generate(20, context=(inp,)): print w,
      print "\n"
      for w in lm.generate(20, context=(inp,)): print w,
    print "\n"
Example #4
def main(argv):
    AllWords = []

    OutFile = argv[1]
    GenQty = int(argv[2])
    print "Will try to generate " + str(GenQty) + " sentences!"

    sentLen = dict()
    lineQty = 0
    for line in sys.stdin:
        line = re.sub(r"</?s>", "", line)
        line = line.rstrip("\n")
        elems = re.split("\s+", line)
        AllWords.extend(elems)
        lineQty = lineQty + 1
        slen = len(elems)
        if not slen in sentLen:
            sentLen[slen] = 0
        sentLen[slen] = sentLen[slen] + 1

    print (sentLen)

    print str(len(AllWords)) + "\n"

    Estim = lambda fdist, bins: GoodTuringProbDist(fdist)
    # Estim = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

    N = 3
    print "Words are read, now let's compute the " + str(N) + "-gram model.\n"

    model = NgramModel(N, AllWords, estimator=Estim)

    print "" + str(N) + "-gram model is computed.\n"

    outf = open(OutFile, "w")

    for i in range(1, GenQty + 1):
        RandSum = random.randint(1, lineQty)
        sum = 0
        RandLen = -1
        for k in sentLen.keys():
            sum = sum + sentLen[k]
            if sum >= RandSum:
                RandLen = k
                break

        if RandLen == -1:
            print ("Internal error! Cannot select len for sent: " + str(i))
            sys.exit(1)

        text_words = model.generate(RandLen)

        # Concatenate all words generated in a string separating them by a space.
        text = " ".join([word for word in text_words])
        # Sometimes we end up with more than one space; collapse any repeated whitespace.
        text = re.sub(r"\s+", " ", "<s> " + text + " </s>")

        outf.write(text + "\n")
Example #5
    def generate(self, length=100):
        """
        Print random text, generated using a trigram language model.

        :param length: The length of text to generate (default=100)
        :type length: int
        :seealso: NgramModel
        """
        if '_trigram_model' not in self.__dict__:
            print("Building ngram index...")
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._trigram_model = NgramModel(3, self, estimator=estimator)
        text = self._trigram_model.generate(length)
        print(tokenwrap(text))
Example #6
    def calcWordProb(self):

        word_seq = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz']

        text = "They become more expensive already. Mine is like 25. So horrible and they did less things than I did last time."
        text = nltk.word_tokenize(text.translate(None, ',.'))

        print text
        #estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        #lm = NgramModel(2, word_seq, estimator)

        est = lambda freqdist, bins: LidstoneProbDist(freqdist, 0.2, bins)
        model = NgramModel(3, text, True, True, est, 21)

        print model.prob("more", text)
Example #7
    def calcWordProb(self):

        word_seq = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz']

        text = "They become more expensive already. Mine is like 25. So horrible and they did less things than I did last time."
        text = nltk.word_tokenize(text.translate(None, ',.'))

        print text
        #estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        #lm = NgramModel(2, word_seq, estimator)

        est = lambda freqdist, bins: LidstoneProbDist(freqdist, 0.2, bins)
        model = NgramModel(3, text, True, True, est, 21)

        print model.prob("more", text)
Example #8
def generateNgramModel(corpusPath, corpusName):
    corpusdir = 'corpora/'  # Directory of corpus.
    generatedCorpus = PlaintextCorpusReader(corpusPath, corpusName)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    ngrammodel = NgramModel(2, generatedCorpus.sents(), True, False,
                            estimator)  # uses bigrams because they worked better here
    return ngrammodel
Example #9
def hammertime(corpus, ngramss=0, numGen=100):
	
	tokens = list(word_tokenize(corpus))
	print tokens[0:900]
	# estimator for smoothing the N-gram model
	estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

	if ngramss <= 0:
		ngramss = random.randint(2, 4)

	model = NgramModel(ngramss, tokens, False, False, estimator)

	# Apply the language model to generate numGen words in sequence
	text_words = model.generate(numGen)

	# Concatenate all words generated in a string separating them by a space.
	text = ' '.join([word for word in text_words])

	return text
Example #10
def train_model(fdist, listObj, n):
    """ 
        @n - size of ngram
        @fdist - frequency distribution
        @listObj - ngram data list
        
        """

    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(n, fdist, estimator=estimator)
    return lm
Example #11
    def __init__(self):
        cess_sents = cess.tagged_sents()
        self.uni_tag = ut(cess_sents)

        self.model = NgramModel(3, brown.words())

        self.translation = []
        self.dictionary = collections.defaultdict(lambda: 0)
        dictionaryFile = open("../corpus/Dictionary.txt", 'r')
        for translation in dictionaryFile:
            spanish, english = translation.split(" - ")
            spanish = spanish.decode('utf-8')
            self.dictionary[spanish] = collections.defaultdict(lambda: [])
            english = english.rstrip(';\n').split('; ')
            for pos in english:
                pos = pos.split(': ')
                self.dictionary[spanish][pos[0]] = pos[1].split(', ')

        self.sentences = []
        sentencesFile = open("../corpus/TestSet.txt", 'r')
        for sentence in sentencesFile:
            self.sentences.append(sentence.rstrip('\n'))
Example #12
    def generate(self, length=100, context=()):
        """
        Return random text, generated using a trigram language model.

        :param length: The length of text to generate (default=100)
        :type length: int
        :seealso: NgramModel
        """
        if '_trigram_model' not in self.__dict__:
            print "Building ngram index..."
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._trigram_model = NgramModel(3, self, estimator=estimator)
        text = self._trigram_model.generate(length, context=context)
        return tokenwrap(text)
Example #13
	def __init__(self):
		cess_sents = cess.tagged_sents()
		self.uni_tag = ut(cess_sents)

		self.model = NgramModel(3, brown.words())

		self.translation = []
		self.dictionary = collections.defaultdict(lambda: 0)
		dictionaryFile = open("../corpus/Dictionary.txt", 'r')
		for translation in dictionaryFile:
			spanish, english = translation.split(" - ")
			spanish = spanish.decode('utf-8')
			self.dictionary[spanish] = collections.defaultdict(lambda: [])
			english = english.rstrip(';\n').split('; ')
			for pos in english:
				pos = pos.split(': ')
				self.dictionary[spanish][pos[0]] = pos[1].split(', ')

		self.sentences = []
		sentencesFile = open("../corpus/TestSet.txt", 'r')
		for sentence in sentencesFile:
			self.sentences.append(sentence.rstrip('\n'))
Example #14
  def train(self, corpus):
    """Trains a language using a trigram model with stupid backoff
    to a bigram model with stupid backoff to a unigram model
    with plus one smoothing"""
    for sentence in corpus.corpus:
      for i in xrange(0, len(sentence.data)):
        token = sentence.data[i].word
        self.unigramCounts[token] += 1
        self.total += 1
        if i + 1 < len(sentence.data): 
            next = sentence.data[i+1].word
            self.bigramCounts[(token, next)] += 1
        if i + 2 < len(sentence.data):
            third = sentence.data[i+2].word
            self.trigramCounts[(token, next, third)] += 1

    train_tokens = brown.words()
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    self.trilm = NgramModel(3, train_tokens, True, False, estimator)
    self.bilm = NgramModel(2, train_tokens, True, False, estimator)
    self.unilm = NgramModel(1, train_tokens, True, False, estimator)
Example #15
class CorpusText(nltk.Text):
    def concordance(self, word, width=79, lines=25):
        """
        Return a string concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.
        :seealso: ``ConcordanceIndex``

        (nltk default is to print concordance)
        """
        if '_concordance_index' not in self.__dict__:
            print "Building concordance index..."
            self._concordance_index = CorpusConcordanceIndex(self.tokens,
                                                       key=lambda s:s.lower())

#        return self._concordance_index.get_concordance_as_str(word, width, lines)
        return self._concordance_index.get_concordance_as_matrix(word, width, lines)

    def similar(self, word, num=20):
        """
        Returns as a string similar words
        """
        if '_word_context_index' not in self.__dict__:
            print 'Building word-context index...'
            self._word_context_index = nltk.ContextIndex(self.tokens,
                                                    filter=lambda x:x.isalpha(),
                                                    key=lambda s:s.lower())

#        words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = fd.keys()[:num]
            return tokenwrap(words)
        else:
            print "No matches"

    def generate(self, length=100, context=()):
        """
        Return random text, generated using a trigram language model.

        :param length: The length of text to generate (default=100)
        :type length: int
        :seealso: NgramModel
        """
        if '_trigram_model' not in self.__dict__:
            print "Building ngram index..."
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._trigram_model = NgramModel(3, self, estimator=estimator)
        text = self._trigram_model.generate(length, context=context)
        return tokenwrap(text)


    def get_adjacent_tokens(self, word, window=5, lines=25):
        ### todo: should this go here??? look into fixing nltk.ContentIndex
## to-do: figure out what to do about capitalization        
##        assert word == word.lower()

        result = []        

        indices = [ i for i, w in enumerate(self.tokens) if w.lower()==word]
        if indices:
            lines = min(lines, len(indices))
            print "Displaying %s of %s matches:" % (lines, len(indices))

            for i in indices:
                if lines <= 0:
                    break

                ind_a = max(0, i-window)
                ind_b = min(len(self.tokens), i+window)
                
                adjacent = self.tokens[ind_a:ind_b]
                result.append(adjacent)
                lines -= 1
        else:
            print "No matches"

        return result
Example #16
class LanguageModel:

  STUPID_K = 0.4

  def __init__(self, corpus):
    """Initialize your data structures in the constructor."""
    self.trigramCounts = collections.defaultdict(lambda:0)
    self.bigramCounts = collections.defaultdict(lambda: 0)
    self.unigramCounts = collections.defaultdict(lambda: 0)
    self.total = 0
    self.trilm = None
    self.bilm = None
    self.unilm = None
    self.train(corpus)

  def train(self, corpus):
    """Trains a language using a trigram model with stupid backoff
    to a bigram model with stupid backoff to a unigram model
    with plus one smoothing"""
    for sentence in corpus.corpus:
      for i in xrange(0, len(sentence.data)):
        token = sentence.data[i].word
        self.unigramCounts[token] += 1
        self.total += 1
        if i + 1 < len(sentence.data): 
            next = sentence.data[i+1].word
            self.bigramCounts[(token, next)] += 1
        if i + 2 < len(sentence.data):
            third = sentence.data[i+2].word
            self.trigramCounts[(token, next, third)] += 1

    train_tokens = brown.words()
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    self.trilm = NgramModel(3, train_tokens, True, False, estimator)
    self.bilm = NgramModel(2, train_tokens, True, False, estimator)
    self.unilm = NgramModel(1, train_tokens, True, False, estimator)


  def score(self, sentence):
    score = 0.0
    for i in xrange(2, len(sentence)):
        token = sentence[i]
        prev = sentence[i-1]
        first = sentence[i-2]
        tricount = self.trigramCounts[(first, prev, token)]
        #begin with trigram model
        if tricount > 0:
            score += self.trilm.prob(token, [prev, first])
            # score += math.log(tricount)
            # score -= math.log(self.bigramCounts[(first, prev)])
            score -= self.bilm.prob(first,[prev])
            # continue
        #back off to bigram model
        biCount = self.bigramCounts[(prev, token)]
        if biCount > 0: 
            # score += math.log(biCount)
            score += self.bilm.prob(token, [prev])
            score += math.log(self.STUPID_K)
            # score -= math.log(self.unigramCounts[prev])
            score -= self.unilm.prob(prev, [])
            # continue  
        #back off to unigram model with +1 smoothing
        # count = self.unigramCounts[token]
        score += math.log(2 * self.STUPID_K) 
        score += self.unilm.prob(token, [])
        # score += math.log(count + 1.0)
        # score -= math.log(self.total + len(self.unigramCounts))

    return score


  def n_most_likely(self, sentences, n):
    """Given a list of string sentences, returns the n most likely"""
    #m = (float("-inf"),"")
    scores = []
    for s in sentences:
        prob = self.score(s)
        scores.append((s, prob))
    scores = sorted(scores, key=itemgetter(1,0), reverse=True)
    sents = []
    for tup in scores[:n]:
        sents.append(tup[0])
    return sents
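
The docstring above describes stupid backoff (trigram, then bigram discounted by 0.4, then a +1-smoothed unigram), while score() adds raw NgramModel probabilities to log-space constants. For reference, here is a minimal count-based sketch of stupid-backoff log-scoring as the docstring describes it; the function names, argument layout, and the k*k discount on the final fallback are illustrative choices, not the original implementation:

import math

def stupid_backoff_logprob(first, prev, token, tri, bi, uni, total, k=0.4):
    # trigram estimate when the trigram was observed
    if tri.get((first, prev, token), 0) > 0:
        return math.log(tri[(first, prev, token)]) - math.log(bi[(first, prev)])
    # back off to the bigram estimate, discounted by k
    if bi.get((prev, token), 0) > 0:
        return math.log(k) + math.log(bi[(prev, token)]) - math.log(uni[prev])
    # final fallback: +1-smoothed unigram, discounted once more
    return (math.log(k * k)
            + math.log(uni.get(token, 0) + 1.0)
            - math.log(total + len(uni)))

def sentence_logprob(sentence, tri, bi, uni, total, k=0.4):
    return sum(stupid_backoff_logprob(sentence[i - 2], sentence[i - 1], sentence[i],
                                      tri, bi, uni, total, k)
               for i in range(2, len(sentence)))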
Example #17
				if line not in useful:
					useful.append(line)
	print "\ntotal useful sents: " + str(len(useful))

		#train trigram model
	corpus_tokens = []
	print "Adding brown"
	for word in brown.words():
		word = word.lower()
		corpus_tokens.append(word)
	print "Adding gutenberg"
	for word in gutenberg.words():
		word = word.lower()
		corpus_tokens.append(word)
	print "Training Trigram Model"
	lm = NgramModel(3,corpus_tokens,True,False,lambda f,b:LidstoneProbDist(f,0.01,f.B()+1))

	tweet_entropies = []
	count = 1
	for sent in useful:
		sent = sent.split()
		percentage = 100*count/len(useful)
		print "\rChecking entropy : " + str(count) + " of " + str(len(useful)) + "        " + str(percentage) + "%",
		entropy = lm.entropy(sent)
		tweet_entropies.append((" ".join(sent), entropy))
		count += 1
	tweet_entropies.sort(key=lambda x: x[1])
	threshold = int(len(tweet_entropies) * 0.8)
	list_of_tweets = tweet_entropies[:threshold]

	print "\n",
Example #18
#m = NgramModel(1, [str(i) for i in [1,2,3,4,5]])
#print m.prob('1', [])

# Tokens contains the unique words from the Brown corpus
tokens = set(brown.words())
words = [];

for word in tokens:
    words.extend([char for char in word.lower()]);
    words.extend(['\t']);

#tokens = list(genesis.words('english-kjv.txt'))
#tokens.extend(list(reuters.words(categories = 'trade')))

# estimator for smoothing the N-gram model
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.02)

# N-gram language model with 3-grams
#model = NgramModel(3, tokens, estimator)
model = NgramModel(3, words, estimator)

# Apply the language model to generate 50 words in sequence
text_words = model.generate(50)

# Concatenate all words generated in a string separating them by a space.
text = ' '.join([word for word in text_words])

# print the text
print text

print model.prob('e', ['a', 't'])
Example #19

# Tokens contains the words from the Simple Wikipedia plain-text dump
tokens = tokenize_file("simple_wikipedia_plaintext.txt")
#tokens = brown.words(categories='news')
#print tokens[1:100]
#tokens = list(genesis.words('english-kjv.txt'))
#tokens.extend(list(reuters.words(categories = 'trade')))
#tokens.extend(list(brown.words(categories='news')))
#tokens.extend(list(reuters.words(categories = 'earn')))

# estimator for smoothing the N-gram model
est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

# N-gram language model with 3-grams
model = NgramModel(N, tokens, pad_left=True, pad_right=True, estimator=est)
#model = NgramModel(N, tokens, estimator=est)

# Apply the language model to generate 50 words in sequence
#text_words = model.generate(50)

# Concatenate all words generated in a string separating them by a space.
#text = ' '.join([word for word in text_words])

# print the text
#print text

sentence = "This is a sample sentence."
print sentence
print "p:", sentence_probability(sentence, model)
print "p_m:", sentence_probability_modified(sentence, model)
Example #20
class Text(object):
    """
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

    """
    # This defeats lazy loading, but makes things faster.  This
    # *shouldn't* be necessary because the corpus view *should* be
    # doing intelligent caching, but without this it's running slow.
    # Look into whether the caching is working correctly.
    _COPY_TOKENS = True

    def __init__(self, tokens, name=None):
        """
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        """
        if self._COPY_TOKENS:
            tokens = list(tokens)
        self.tokens = tokens

        if name:
            self.name = name
        elif ']' in tokens[:20]:
            end = tokens[:20].index(']')
            self.name = " ".join(text_type(tok) for tok in tokens[1:end])
        else:
            self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."

    #////////////////////////////////////////////////////////////
    # Support item & slice access
    #////////////////////////////////////////////////////////////

    def __getitem__(self, i):
        if isinstance(i, slice):
            return self.tokens[i.start:i.stop]
        else:
            return self.tokens[i]

    def __len__(self):
        return len(self.tokens)

    #////////////////////////////////////////////////////////////
    # Interactive console methods
    #////////////////////////////////////////////////////////////

    def concordance(self, word, width=79, lines=25):
        """
        Print a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.
        :seealso: ``ConcordanceIndex``
        """
        if '_concordance_index' not in self.__dict__:
            print("Building index...")
            self._concordance_index = ConcordanceIndex(self.tokens,
                                                       key=lambda s:s.lower())

        self._concordance_index.print_concordance(word, width, lines)

    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            print("Building collocations list")
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))

    def count(self, word):
        """
        Count the number of times this word appears in the text.
        """
        return self.tokens.count(word)

    def index(self, word):
        """
        Find the index of the first occurrence of the word in the text.
        """
        return self.tokens.index(word)

    def readability(self, method):
        # code from nltk_contrib.readability
        raise NotImplementedError

    def generate(self, length=100):
        """
        Print random text, generated using a trigram language model.

        :param length: The length of text to generate (default=100)
        :type length: int
        :seealso: NgramModel
        """
        if '_trigram_model' not in self.__dict__:
            print("Building ngram index...")
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._trigram_model = NgramModel(3, self, estimator=estimator)
        text = self._trigram_model.generate(length)
        print(tokenwrap(text))

    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if '_word_context_index' not in self.__dict__:
            print('Building word-context index...')
            self._word_context_index = ContextIndex(self.tokens,
                                                    filter=lambda x:x.isalpha(),
                                                    key=lambda s:s.lower())

#        words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = islice(fd.keys(), num)
            print(tokenwrap(words))
        else:
            print("No matches")


    def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if '_word_context_index' not in self.__dict__:
            print('Building word-context index...')
            self._word_context_index = ContextIndex(self.tokens,
                                                    key=lambda s:s.lower())

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = islice(fd.keys(), num)
                print(tokenwrap(w1+"_"+w2 for w1,w2 in ranked_contexts))

        except ValueError as e:
            print(e)

    def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type word: str
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot
        dispersion_plot(self, words)

    def plot(self, *args):
        """
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        """
        self.vocab().plot(*args)

    def vocab(self):
        """
        :seealso: nltk.prob.FreqDist
        """
        if "_vocab" not in self.__dict__:
            print("Building vocabulary index...")
            self._vocab = FreqDist(self)
        return self._vocab

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """

        if "_token_searcher" not in self.__dict__:
            self._token_searcher = TokenSearcher(self)

        hits = self._token_searcher.findall(regexp)
        hits = [' '.join(h) for h in hits]
        print(tokenwrap(hits, "; "))

    #////////////////////////////////////////////////////////////
    # Helper Methods
    #////////////////////////////////////////////////////////////

    _CONTEXT_RE = re.compile('\w+|[\.\!\?]')
    def _context(self, tokens, i):
        """
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        """
        # Left context
        j = i-1
        while j>=0 and not self._CONTEXT_RE.match(tokens[j]):
            j -= 1
        left = (tokens[j] if j != 0 else '*START*')

        # Right context
        j = i+1
        while j<len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
            j += 1
        right = (tokens[j] if j != len(tokens) else '*END*')

        return (left, right)

    #////////////////////////////////////////////////////////////
    # String Display
    #////////////////////////////////////////////////////////////

    def __str__(self):
        return '<Text: %s>' % self.name

    def __repr__(self):
        return '<Text: %s>' % self.name
Example #21
 def trainNgramModel(self, n):
   self.ngramModel = NgramModel(n, self.sentences)
Example #22
# Tokens contains the words for Genesis and Reuters Trade
#tokens = list(genesis.words('english-kjv.txt'))
#tokens.extend(list(reuters.words(categories = 'trade')))

# estimator for smoothing the N-gram model
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

sent = "abraham lincoln be bear feb 12 1809"
tokens = sent.split()
splitNgrams = list(ingrams(list(sent), 3))

tokens = ["".join(x) for x in splitNgrams]

# N-gram language model with 3-grams
# Without an estimator, it assumes Good-Turing.
model = NgramModel(3, tokens, estimator)
print "Model: " + str(model)

sent2 = "abe lincoln was born in 1809"

splitNgrams2 = list(ingrams(list(sent2), 3))
tokens2 = ["".join(x) for x in splitNgrams2]


print "Word: " + tokens2[-1]
context = " ".join(tokens2[:-1])
print "Context: " + context


print model.prob(tokens2[-1], [sent2])
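
Here the model is trained on overlapping three-character strings: ingrams(list(sent), 3) slides a window over the characters and each window is joined back into a short string. A plain sliding-window equivalent of that tokenization (illustrative, not the nltk.util helper itself):

def char_trigrams(s, n=3):
    # overlapping character windows, e.g. "abraham" -> ['abr', 'bra', 'rah', 'aha', 'ham']
    return [s[i:i + n] for i in range(len(s) - n + 1)]

print(char_trigrams("abraham lincoln be bear feb 12 1809")[:5])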
Example #23
class Text(object):
    """
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

    """
    # This defeats lazy loading, but makes things faster.  This
    # *shouldn't* be necessary because the corpus view *should* be
    # doing intelligent caching, but without this it's running slow.
    # Look into whether the caching is working correctly.
    _COPY_TOKENS = True

    def __init__(self, tokens, name=None):
        """
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        """
        if self._COPY_TOKENS:
            tokens = list(tokens)
        self.tokens = tokens

        if name:
            self.name = name
        elif ']' in tokens[:20]:
            end = tokens[:20].index(']')
            self.name = " ".join(text_type(tok) for tok in tokens[1:end])
        else:
            self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."

    #////////////////////////////////////////////////////////////
    # Support item & slice access
    #////////////////////////////////////////////////////////////

    def __getitem__(self, i):
        if isinstance(i, slice):
            return self.tokens[i.start:i.stop]
        else:
            return self.tokens[i]

    def __len__(self):
        return len(self.tokens)

    #////////////////////////////////////////////////////////////
    # Interactive console methods
    #////////////////////////////////////////////////////////////

    def concordance(self, word, width=79, lines=25):
        """
        Print a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.
        :seealso: ``ConcordanceIndex``
        """
        if '_concordance_index' not in self.__dict__:
            print("Building index...")
            self._concordance_index = ConcordanceIndex(self.tokens,
                                                       key=lambda s: s.lower())

        self._concordance_index.print_concordance(word, width, lines)

    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num
                and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            print("Building collocations list")
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(
                self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(
                lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio,
                                              num)
        colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))

    def count(self, word):
        """
        Count the number of times this word appears in the text.
        """
        return self.tokens.count(word)

    def index(self, word):
        """
        Find the index of the first occurrence of the word in the text.
        """
        return self.tokens.index(word)

    def readability(self, method):
        # code from nltk_contrib.readability
        raise NotImplementedError

    def generate(self, length=100):
        """
        Print random text, generated using a trigram language model.

        :param length: The length of text to generate (default=100)
        :type length: int
        :seealso: NgramModel
        """
        if '_trigram_model' not in self.__dict__:
            print("Building ngram index...")
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._trigram_model = NgramModel(3, self, estimator=estimator)
        text = self._trigram_model.generate(length)
        print(tokenwrap(text))

    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if '_word_context_index' not in self.__dict__:
            print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens,
                filter=lambda x: x.isalpha(),
                key=lambda s: s.lower())

#        words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = islice(fd.keys(), num)
            print(tokenwrap(words))
        else:
            print("No matches")

    def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if '_word_context_index' not in self.__dict__:
            print('Building word-context index...')
            self._word_context_index = ContextIndex(self.tokens,
                                                    key=lambda s: s.lower())

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = islice(fd.keys(), num)
                print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

        except ValueError as e:
            print(e)

    def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type word: str
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot
        dispersion_plot(self, words)

    def plot(self, *args):
        """
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        """
        self.vocab().plot(*args)

    def vocab(self):
        """
        :seealso: nltk.prob.FreqDist
        """
        if "_vocab" not in self.__dict__:
            print("Building vocabulary index...")
            self._vocab = FreqDist(self)
        return self._vocab

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """

        if "_token_searcher" not in self.__dict__:
            self._token_searcher = TokenSearcher(self)

        hits = self._token_searcher.findall(regexp)
        hits = [' '.join(h) for h in hits]
        print(tokenwrap(hits, "; "))

    #////////////////////////////////////////////////////////////
    # Helper Methods
    #////////////////////////////////////////////////////////////

    _CONTEXT_RE = re.compile('\w+|[\.\!\?]')

    def _context(self, tokens, i):
        """
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        """
        # Left context
        j = i - 1
        while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
            j -= 1
        left = (tokens[j] if j != 0 else '*START*')

        # Right context
        j = i + 1
        while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
            j += 1
        right = (tokens[j] if j != len(tokens) else '*END*')

        return (left, right)

    #////////////////////////////////////////////////////////////
    # String Display
    #////////////////////////////////////////////////////////////

    def __str__(self):
        return '<Text: %s>' % self.name

    def __repr__(self):
        return '<Text: %s>' % self.name
Example #24
def process_plaintext(dir_path):
    reader = CategorizedPlaintextCorpusReader(dir_path,
                    r'.*\.txt', cat_pattern=r'.+_.+_(.*)\.txt')
    facilitator_files = reader.fileids(categories='facilitator')
    participant_files = reader.fileids(categories='participant')
    print facilitator_files, participant_files

    #print reader.categories()
    #print len(reader.words())
    #print len(reader.sents())

    fac_words = [word for word in reader.words(facilitator_files)]
    par_words = [word for word in reader.words(participant_files)]

    fac_words = edit_tokens(fac_words)
    par_words = edit_tokens(par_words)

    speakers = (
        [(word, 'facilitator') for word in reader.words(facilitator_files)] +
        [(word, 'participant') for word in reader.words(participant_files)]
    )

    features = get_features(speakers)

    size = int(len(features) * 0.3)
    nb_train = features[size:]
    nb_test = features[:size]

    classifier = nltk.NaiveBayesClassifier.train(nb_train)
    print "Classifier labels:", classifier.labels()
    print classifier.show_most_informative_features()
    print "Clasify test:", nltk.classify.accuracy(classifier, nb_test)
    #print classifier.classify(get_features(["Yolo", "bag", "sp"], False))
    
    #random.shuffle(speakers)
    three_quarters = int(len(speakers) * 0.75)
    train = speakers[:three_quarters]
    test = speakers[three_quarters:]

    est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist)
    un_lm = NgramModel(1, train, estimator=est)
    bi_lm = NgramModel(2, train, estimator=est)
    tr_lm = NgramModel(3, train, estimator=est)
    qu_lm = NgramModel(4, train, estimator=est)
    pe_lm = NgramModel(5, train, estimator=est)
    print un_lm
    print bi_lm
    print tr_lm
    print qu_lm
    print pe_lm
    print "1 gram Perplexity:", un_lm.perplexity(test)
    print "2 gram Perplexity:", bi_lm.perplexity(test)
    print "3 gram Perplexity:", tr_lm.perplexity(test)
    print "4 gram Perplexity:", qu_lm.perplexity(test)
    print "5 gram Perplexity:", pe_lm.perplexity(test)

    print bi_lm.generate(10, ["uh", "sp"])

    fd_fac = nltk.FreqDist(fac_words)
    vocab_fac = fd_fac.keys()

    fd_par = nltk.FreqDist(par_words)
    vocab_par = fd_par.keys()

    print "Fac Vocab: " , len(vocab_fac)
    print "Fac Tokens: " , len(fac_words)
    print vocab_fac[:20]
    print "Par Vocab: " , len(vocab_par)
    print "Par Tokens: " , len(par_words)
    print vocab_par[:20]
    fd_par.plot(50)
Example #25
class MachineTranslation:
	PUNCTUATION = [',', '.', '(', ')', '?']
	ENG_ADJECTIVE = ['JJ', 'JJR', 'JJS']
	ENG_NOUN = ['NN', 'NNS', 'NNP', 'NNPS']
	ENG_VERB = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

	ESP_ADJECTIVE = ['a', 'q', 'o', '0', 'c', 's', 'f', 'p', 'n']
	ESP_NOUN = ['n']
	ESP_VERB = ['vm', 'vs']
	ESP_VERB_PAST = ['vmii', 'vmis', 'vsii', 'vsis']

	NUMBER_PAT = "\d+"
	OPEN_QUESTION_MARK = '\xc2\xbf'
	
	def __init__(self):
		cess_sents = cess.tagged_sents()
		self.uni_tag = ut(cess_sents)

		self.model = NgramModel(3, brown.words())

		self.translation = []
		self.dictionary = collections.defaultdict(lambda: 0)
		dictionaryFile = open("../corpus/Dictionary.txt", 'r')
		for translation in dictionaryFile:
			spanish, english = translation.split(" - ")
			spanish = spanish.decode('utf-8')
			self.dictionary[spanish] = collections.defaultdict(lambda: [])
			english = english.rstrip(';\n').split('; ')
			for pos in english:
				pos = pos.split(': ')
				self.dictionary[spanish][pos[0]] = pos[1].split(', ')

		self.sentences = []
		sentencesFile = open("../corpus/TestSet.txt", 'r')
		for sentence in sentencesFile:
			self.sentences.append(sentence.rstrip('\n'))

	def translate(self):
		for sentence in self.sentences:

			sentenceTranslation = []
			questionSwapped = sentence
			if sentence.startswith(self.OPEN_QUESTION_MARK):
				questionSwapped = self.questionSwap(sentence)
			negationSwapped = self.negationSwap(questionSwapped)
			tokens = nltk.word_tokenize(negationSwapped)

			pos = self.uni_tag.tag(tokens)
			for word in pos:
				candidate = word[0].decode('utf-8').lower()
				# print candidate
				if candidate in self.PUNCTUATION or re.search(self.NUMBER_PAT, candidate):
					wordTranslation = candidate
				elif (word[1] and any(word[1].startswith(adj) for adj in self.ESP_ADJECTIVE) and 
					'adjective' in self.dictionary[candidate]):
					wordTranslation = self.dictionary[candidate]['adjective'][0]
				elif (word[1] and any(word[1].startswith(noun) for noun in self.ESP_NOUN) and
					'noun' in self.dictionary[candidate]):
					wordTranslation = self.dictionary[candidate]['noun'][0]
					if word[1][1] == 'p': # proper noun
						wordTranslation = wordTranslation.capitalize()
				elif (word[1] and any(word[1].startswith(verb) for verb in self.ESP_VERB) and
					'verb' in self.dictionary[candidate]):
					#wordTranslation = self.verbConjugation(candidate, word)
					wordTranslation = self.pluralADJ(candidate)
				else:
					wordTranslation = self.pluralADJ(candidate)
				sentenceTranslation.append(wordTranslation)

			directTranslation = " ".join(map(str, sentenceTranslation))
			adjNounSwapped = self.adjNounSwap(directTranslation)
			lm = self.ngram(adjNounSwapped)
			nounSwapped = self.nounSwap(lm)
			pronounAdded = self.addPronoun(nounSwapped)
			possessives = self.possessive(pronounAdded)
			removedDeterminers = self.removeDeterminers(possessives)
			capAndNum = self.capitalizationAndNumbers(removedDeterminers)
			removeExtraSpace = re.sub(r' \'s', '\'s', capAndNum)
			removeExtraSpace = re.sub(r' ,', ',', removeExtraSpace)
			if removeExtraSpace[-2:] == " .":
				removeExtraSpace = removeExtraSpace[:-2] + "."
			elif removeExtraSpace[-2:] == " ?":
				removeExtraSpace = removeExtraSpace[:-2] + "?"
			self.translation.append(removeExtraSpace)

	# if question is a yes or no question, swap the order of first two words
	def questionSwap(self, sentence):
		sentence = sentence.lstrip(self.OPEN_QUESTION_MARK)
		#tokens = nltk.word_tokenize(sentence)
		#pos = self.uni_tag.tag(tokens)
		#return " ".join(map(str, tokens))
		return sentence

	# reverse the order of negation words and their objects
	def negationSwap(self, sentence):
		tokens = nltk.word_tokenize(sentence)
		pos = self.uni_tag.tag(tokens)

		firstWord = pos[0]
		for i, word in enumerate(pos[1:]):
			if firstWord[0].lower() == "no" and word[1] is not None and (word[1].startswith('vs') or word[1].startswith('vm')):
				tokens[i] = tokens[i+1]
				tokens[i+1] = "not"
			firstWord = word

		firstWord = pos[0]
		secondWord = pos[1]
		for i, word in enumerate(pos[2:]):
			if firstWord[0].lower() == "no" and secondWord[1] is not None and secondWord[1].startswith('pp'):
				if word[1] is not None and (word[1].startswith('vs') or word[1].startswith('vm')):
					temp = tokens[i]
					tokens[i] = tokens[i+1]
					tokens[i+1] = "do " + temp
			firstWord = secondWord
			secondWord = word

		return " ".join(map(str, tokens))

	# switch position of possessive words to use apostrophe notation
	def possessive(self, sentence):
		tokens = nltk.word_tokenize(sentence)
		pos = nltk.pos_tag(tokens)

		removeOf = []

		firstWord = pos[0]
		secondWord = pos[1]
		for i, word in enumerate(pos[2:]):
			if firstWord[1] in self.ENG_NOUN and secondWord[0]=='of' and word[1] in ['NNP', 'NNPS']:
				temp = tokens[i]
				tokens[i] = tokens[i+2] + "'s"
				tokens[i+2] = temp
				removeOf.append(i+1)
			firstWord = secondWord
			secondWord = word

		if len(removeOf) != 0:
			for i in reversed(removeOf):
				tokens.pop(i)

		return " ".join(map(str, tokens))

	# fixes the "number of telephone" to "telephone number" example
	def nounSwap(self, sentence):
		tokens = nltk.word_tokenize(sentence)
		pos = nltk.pos_tag(tokens)

		removeOf = []

		firstWord = pos[0]
		secondWord = pos[1]
		for i, word in enumerate(pos[2:]):
			if firstWord[1] in ['NN', 'NNS'] and secondWord[0]=='of' and word[1] in ['NN', 'NNS']:
				temp = tokens[i]
				tokens[i] = tokens[i+2]
				tokens[i+2] = temp
				removeOf.append(i+1)
			firstWord = secondWord
			secondWord = word

		if len(removeOf) != 0:
			for i in reversed(removeOf):
				tokens.pop(i)

		return " ".join(map(str, tokens))

	def ngram(self, sentence):
		words = ['your', 'its', 'his', 'her', 'their']
		highestProb = 0
		highestSentence = sentence
		for word in words:
			candidateSentence = re.sub('your', word, sentence)
			prob = self.model.prob(word, [candidateSentence])
			if prob > highestProb:
				highestProb = prob
				highestSentence = candidateSentence
		return highestSentence

	# reverses order of adjacent adjectives and nouns
	def adjNounSwap(self, sentence):
		tokens = nltk.word_tokenize(sentence)
		pos = nltk.pos_tag(tokens)

		firstWord = pos[0]
		for i, word in enumerate(pos[1:]):
			if firstWord[1] in self.ENG_NOUN and word[1] in self.ENG_ADJECTIVE:
				temp = tokens[i]
				tokens[i] = tokens[i+1]
				tokens[i+1] = temp
			firstWord = word

		return " ".join(map(str, tokens))

	def addPronoun(self, sentence):
		tokens = nltk.word_tokenize(sentence)
		pos = nltk.pos_tag(tokens)

		firstWord = pos[0]
		for i, word in enumerate(pos[1:]):
			if firstWord[1] not in self.ENG_NOUN and firstWord[0] != 'have' and firstWord[1] not in ['DT', 'TO', 'WP', 'RB', 'PRP', 'VBZ', '.', ','] and (word[1] in self.ENG_VERB or word[0]=='have'):
				if firstWord[1] == 'VBP' or (wordnet.synsets(word[0]) and not word[0].endswith('s')):
					tokens[i+1] = "they " + tokens[i+1]
				else:
					tokens[i+1] = "it " + tokens[i+1]
			firstWord = word

		if pos[0][1] in self.ENG_VERB or pos[0][0]=='have':
			if pos[0][1] == 'VBP' or (wordnet.synsets(word[0]) and not word[0].endswith('s')):
				tokens[0] = "They " + tokens[0]
			else:
				tokens[0] = "It " + tokens[0]

		return " ".join(map(str, tokens))

	def pluralADJ(self, token):
		translation = self.dictionary[token]['default'][0]
		pos = self.uni_tag.tag(nltk.word_tokenize(token))
		if pos[0][1] is not None and pos[0][1].startswith('a') and 'p' in pos[0][1]:
			if translation.endswith('s'):
				if wordnet.synsets(translation[:-1]):
					translation = translation[:-1]
		return translation

	def removeDeterminers(self, sentence):
		tokens = nltk.word_tokenize(sentence)
		pos = nltk.pos_tag(tokens)

		removeOf = []

		firstWord = pos[0]
		for i, word in enumerate(pos[1:]):
			if firstWord[1] in ['DT'] and word[1] in ['NNP', 'NNPS', 'NNS']:
				removeOf.append(i)
			firstWord = word

		if len(removeOf) != 0:
			for i in reversed(removeOf):
				tokens.pop(i)

		return " ".join(map(str, tokens))

	def capitalizationAndNumbers(self, sentence):
		tokens = nltk.word_tokenize(sentence)
		tokens[0] = tokens[0].capitalize()

		pos = nltk.pos_tag(tokens)

		days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
		months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']

		for i, word in enumerate(pos):
			if word[1] in ['NNP', 'NNPS']:
				tokens[i] = tokens[i].capitalize()

			if word[1] in ['CD']:
				tokens[i] = re.sub(r'\.', ',', tokens[i])

			if word[0] in days or word[0] in months:
				tokens[i] = tokens[i].capitalize()

		newTokens = []
		for i, token in enumerate(tokens):
			if token.lower() == 'a' and i+1<len(tokens):
				if any(tokens[i+1].startswith(vp) for vp in ['a', 'e', 'i', 'o', 'u']):
					if i==0:
						newTokens.append('An')
					else:
						newTokens.append('an')
				else:
					newTokens.append(token)
			else:
				newTokens.append(token)

		return " ".join(map(str, newTokens))

	def verbConjugation(self, candidate, word):
		wordTranslation = en.verb.present(self.dictionary[candidate]['verb'][0], person=word[1][4])
		
		if word[1][2] == 'p':
			wordTranslation = en.verb.present_participle(self.dictionary[candidate]['verb'][0])

		if word[1][3] == 's':
			wordTranslation = en.verb.past(self.dictionary[candidate]['verb'][0], person=word[1][4])

			if word[1][2] == 'p':
				wordTranslation = en.verb.past_participle(self.dictionary[candidate]['verb'][0])

		return wordTranslation
Example #26
import nltk

print("... build")
brown = nltk.corpus.brown
corpus = [word.lower() for word in brown.words()]

# Train on 95% of the corpus and test on the rest
spl = 95*len(corpus)/100
train = corpus[:spl]
test = corpus[spl:]

# Remove rare words from the corpus
fdist = nltk.FreqDist(w for w in train)
vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))

train = map(lambda x: x if x in vocabulary else "*unknown*", train)
test = map(lambda x: x if x in vocabulary else "*unknown*", test)

print("... train")


from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
lm = NgramModel(5, train, estimator=estimator)

print("len(corpus) = %s, len(vocabulary) = %s, len(train) = %s, len(test) = %s" % (len(corpus), len(vocabulary), len(train), len(test)))
print("perplexity(test) =", lm.perplexity(test))
Example #27
def process_plaintext(dir_path):
    reader = CategorizedPlaintextCorpusReader(dir_path,
                                              r'.*\.txt',
                                              cat_pattern=r'.+_.+_(.*)\.txt')
    facilitator_files = reader.fileids(categories='facilitator')
    participant_files = reader.fileids(categories='participant')
    print facilitator_files, participant_files

    #print reader.categories()
    #print len(reader.words())
    #print len(reader.sents())

    fac_words = [word for word in reader.words(facilitator_files)]
    par_words = [word for word in reader.words(participant_files)]

    fac_words = edit_tokens(fac_words)
    par_words = edit_tokens(par_words)

    speakers = ([(word, 'facilitator')
                 for word in reader.words(facilitator_files)] +
                [(word, 'participant')
                 for word in reader.words(participant_files)])

    features = get_features(speakers)

    size = int(len(features) * 0.3)
    nb_train = features[size:]
    nb_test = features[:size]

    classifier = nltk.NaiveBayesClassifier.train(nb_train)
    print "Classifier labels:", classifier.labels()
    print classifier.show_most_informative_features()
    print "Clasify test:", nltk.classify.accuracy(classifier, nb_test)
    #print classifier.classify(get_features(["Yolo", "bag", "sp"], False))

    #random.shuffle(speakers)
    three_quarters = int(len(speakers) * 0.75)
    train = speakers[:three_quarters]
    test = speakers[three_quarters:]

    est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist)
    un_lm = NgramModel(1, train, estimator=est)
    bi_lm = NgramModel(2, train, estimator=est)
    tr_lm = NgramModel(3, train, estimator=est)
    qu_lm = NgramModel(4, train, estimator=est)
    pe_lm = NgramModel(5, train, estimator=est)
    print un_lm
    print bi_lm
    print tr_lm
    print qu_lm
    print pe_lm
    print "1 gram Perplexity:", un_lm.perplexity(test)
    print "2 gram Perplexity:", bi_lm.perplexity(test)
    print "3 gram Perplexity:", tr_lm.perplexity(test)
    print "4 gram Perplexity:", qu_lm.perplexity(test)
    print "5 gram Perplexity:", pe_lm.perplexity(test)

    print bi_lm.generate(10, ["uh", "sp"])

    fd_fac = nltk.FreqDist(fac_words)
    vocab_fac = fd_fac.keys()

    fd_par = nltk.FreqDist(par_words)
    vocab_par = fd_par.keys()

    print "Fac Vocab: ", len(vocab_fac)
    print "Fac Tokens: ", len(fac_words)
    print vocab_fac[:20]
    print "Par Vocab: ", len(vocab_par)
    print "Par Tokens: ", len(par_words)
    print vocab_par[:20]
    fd_par.plot(50)
Example #28
0
spans = [span for span in training_spans]
training_offsets = [span[0] for span in spans]
train = []
for s in spans:
    train.append(training_raw[s[0]:s[1]])

testing_spans = WhitespaceTokenizer().span_tokenize(testing_raw)
spans = [span for span in testing_spans]
testing_offsets = [span[0] for span in spans]
test = []
for s in spans:
    test.append(testing_raw[s[0]:s[1]])

estimator = lambda fdist, bins: LidstoneProbDist(fdist, args.estimator_probability)
lm = NgramModel(args.num_grams, train, estimator=estimator)
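
# The loop below grows a window test[t0:t1] one token at a time and scores it
# with the language model; once the window's perplexity exceeds the cutoff, the
# best candidate collected so far is printed as a "sentence" and the window restarts.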

t0 = 0
t1 = 1
current_best = ''
while t1 < len(test):
    perplexity = lm.perplexity(test[t0:t1])
    if perplexity > args.cutoff_max_perplexity:
        if (len(current_best) > 1):
            print current_best + '.'
            current_best = ''
        t0 = t1 + 1
        t1 = t0 + 1
    else:
        t1 += 1
        if t1 - t0 > args.min_sentence_length and perplexity < args.output_max_perplexity:
Example #29
0
#!/usr/bin/env python
# -*- coding: utf-8

from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import LineTokenizer
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist
import pickle

corpus_root = './data'
fileids = 'data_title'
example = ["Python", "is"]

corpus = PlaintextCorpusReader(corpus_root,
    fileids,
    sent_tokenizer=LineTokenizer(),
    encoding='utf-8')

est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
lm = NgramModel(3, corpus.words(), estimator=est)

sent = lambda n, example: ' '.join(lm.generate(n, example))

print "Let's make a sentence!!!"
print "give a seed : Python is ..."
print "sentence :"
print sent(5, example)

Example #30
0
#!/usr/bin/env python
import nltk
from nltk import bigrams
from nltk import trigrams
from nltk.probability import LidstoneProbDist  
from nltk.model import NgramModel    

with open('./austen/persuasion.txt', 'r') as training_file:
    raw = training_file.read() 
tokens = nltk.word_tokenize(raw)

with open('./austen/sense_and_sensibility.txt', 'r') as test_file:
    test = test_file.read()
test_list = nltk.word_tokenize(test)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
model = NgramModel(3, tokens, True, False, estimator)
tri = model.entropy(test_list)
print "tri-gram: " + str(tri)

model = NgramModel(2, tokens, True, False, estimator)
bi = model.entropy(test_list)
print "bi-gram: " + str(bi)

Example #31
0
    # Read file
    f = io.open('/veu4/usuaris30/speech00/corpus/train/spanishlit_ninc_v' +
                version + '_nlm/' + categ + '.txt',
                encoding='utf8')
    g = f.read().lower()
    # Obtain tokenized words
    train = nltk.word_tokenize(g)
    print "e"
    # Remove rare words from the corpus
    # fdist = nltk.FreqDist(w for w in train)
    # vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))
    # train1 = map(lambda x: x if x in vocabulary else "*unknown*", train)

    # Obtain the Language Model using WittenBellProbDist to smooth unseen events
    estimator = lambda fdist, bins: WittenBellProbDist(fdist, 10)
    lm[categ] = NgramModel(N, train, estimator=estimator)

    print "> Obtain language model of", categ, "... Done!"
print "> Obtain all language models... Done!"

# Load dictionary with: {category:tests}
n_categ = []
test_corpus = dict()
for categ in all_categ:
    files = os.listdir(
        '/veu4/usuaris30/speech00/corpus/testc/spanishlit_ninc_v' + version +
        '_nlm/' + categ)
    n_categ.append(len(files))
    tests = []
    for fi in files:
        f = io.open('/veu4/usuaris30/speech00/corpus/testc/spanishlit_ninc_v' +
Example #32
0
class MachineTranslation:
    PUNCTUATION = [',', '.', '(', ')', '?']
    ENG_ADJECTIVE = ['JJ', 'JJR', 'JJS']
    ENG_NOUN = ['NN', 'NNS', 'NNP', 'NNPS']
    ENG_VERB = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    ESP_ADJECTIVE = ['a', 'q', 'o', '0', 'c', 's', 'f', 'p', 'n']
    ESP_NOUN = ['n']
    ESP_VERB = ['vm', 'vs']
    ESP_VERB_PAST = ['vmii', 'vmis', 'vsii', 'vsis']

    NUMBER_PAT = "\d+"
    OPEN_QUESTION_MARK = '\xc2\xbf'

    def __init__(self):
        cess_sents = cess.tagged_sents()
        self.uni_tag = ut(cess_sents)

        self.model = NgramModel(3, brown.words())

        self.translation = []
        self.dictionary = collections.defaultdict(lambda: 0)
        dictionaryFile = open("../corpus/Dictionary.txt", 'r')
        for translation in dictionaryFile:
            spanish, english = translation.split(" - ")
            spanish = spanish.decode('utf-8')
            self.dictionary[spanish] = collections.defaultdict(lambda: [])
            english = english.rstrip(';\n').split('; ')
            for pos in english:
                pos = pos.split(': ')
                self.dictionary[spanish][pos[0]] = pos[1].split(', ')

        self.sentences = []
        sentencesFile = open("../corpus/TestSet.txt", 'r')
        for sentence in sentencesFile:
            self.sentences.append(sentence.rstrip('\n'))

    def translate(self):
        for sentence in self.sentences:

            sentenceTranslation = []
            questionSwapped = sentence
            if sentence.startswith(self.OPEN_QUESTION_MARK):
                questionSwapped = self.questionSwap(sentence)
            negationSwapped = self.negationSwap(questionSwapped)
            tokens = nltk.word_tokenize(negationSwapped)

            pos = self.uni_tag.tag(tokens)
            for word in pos:
                candidate = word[0].decode('utf-8').lower()
                # print candidate
                if candidate in self.PUNCTUATION or re.search(
                        self.NUMBER_PAT, candidate):
                    wordTranslation = candidate
                elif (word[1] and any(word[1].startswith(adj)
                                      for adj in self.ESP_ADJECTIVE)
                      and 'adjective' in self.dictionary[candidate]):
                    wordTranslation = self.dictionary[candidate]['adjective'][
                        0]
                elif (word[1] and any(word[1].startswith(noun)
                                      for noun in self.ESP_NOUN)
                      and 'noun' in self.dictionary[candidate]):
                    wordTranslation = self.dictionary[candidate]['noun'][0]
                    if word[1][1] == 'p':  # proper noun
                        wordTranslation = wordTranslation.capitalize()
                elif (word[1] and any(word[1].startswith(verb)
                                      for verb in self.ESP_VERB)
                      and 'verb' in self.dictionary[candidate]):
                    #wordTranslation = self.verbConjugation(candidate, word)
                    wordTranslation = self.pluralADJ(candidate)
                else:
                    wordTranslation = self.pluralADJ(candidate)
                sentenceTranslation.append(wordTranslation)

            directTranslation = " ".join(map(str, sentenceTranslation))
            adjNounSwapped = self.adjNounSwap(directTranslation)
            lm = self.ngram(adjNounSwapped)
            nounSwapped = self.nounSwap(lm)
            pronounAdded = self.addPronoun(nounSwapped)
            possessives = self.possessive(pronounAdded)
            removedDeterminers = self.removeDeterminers(possessives)
            capAndNum = self.capitalizationAndNumbers(removedDeterminers)
            removeExtraSpace = re.sub(r' \'s', '\'s', capAndNum)
            removeExtraSpace = re.sub(r' ,', ',', removeExtraSpace)
            if removeExtraSpace[-2:] == " .":
                removeExtraSpace = removeExtraSpace[:-2] + "."
            elif removeExtraSpace[-2:] == " ?":
                removeExtraSpace = removeExtraSpace[:-2] + "?"
            self.translation.append(removeExtraSpace)

    # if question is a yes or no question, swap the order of first two words
    def questionSwap(self, sentence):
        sentence = sentence.lstrip(self.OPEN_QUESTION_MARK)
        #tokens = nltk.word_tokenize(sentence)
        #pos = self.uni_tag.tag(tokens)
        #return " ".join(map(str, tokens))
        return sentence

    # reverse the order of negation words and their objects
    def negationSwap(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = self.uni_tag.tag(tokens)

        firstWord = pos[0]
        for i, word in enumerate(pos[1:]):
            if firstWord[0].lower() == "no" and word[1] is not None and (
                    word[1].startswith('vs') or word[1].startswith('vm')):
                tokens[i] = tokens[i + 1]
                tokens[i + 1] = "not"
            firstWord = word

        firstWord = pos[0]
        secondWord = pos[1]
        for i, word in enumerate(pos[2:]):
            if firstWord[0].lower() == "no" and secondWord[
                    1] is not None and secondWord[1].startswith('pp'):
                if word[1] is not None and (word[1].startswith('vs')
                                            or word[1].startswith('vm')):
                    temp = tokens[i]
                    tokens[i] = tokens[i + 1]
                    tokens[i + 1] = "do " + temp
            firstWord = secondWord
            secondWord = word

        return " ".join(map(str, tokens))

    # switch position of possessive words to use apostrophe notation
    def possessive(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        removeOf = []

        firstWord = pos[0]
        secondWord = pos[1]
        for i, word in enumerate(pos[2:]):
            if firstWord[1] in self.ENG_NOUN and secondWord[
                    0] == 'of' and word[1] in ['NNP', 'NNPS']:
                temp = tokens[i]
                tokens[i] = tokens[i + 2] + "'s"
                tokens[i + 2] = temp
                removeOf.append(i + 1)
            firstWord = secondWord
            secondWord = word

        if len(removeOf) != 0:
            for i in reversed(removeOf):
                tokens.pop(i)

        return " ".join(map(str, tokens))

    # fixes the "number of telephone" to "telephone number" example
    def nounSwap(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        removeOf = []

        firstWord = pos[0]
        secondWord = pos[1]
        for i, word in enumerate(pos[2:]):
            if firstWord[1] in [
                    'NN', 'NNS'
            ] and secondWord[0] == 'of' and word[1] in ['NN', 'NNS']:
                temp = tokens[i]
                tokens[i] = tokens[i + 2]
                tokens[i + 2] = temp
                removeOf.append(i + 1)
            firstWord = secondWord
            secondWord = word

        if len(removeOf) != 0:
            for i in reversed(removeOf):
                tokens.pop(i)

        return " ".join(map(str, tokens))

    def ngram(self, sentence):
        words = ['your', 'its', 'his', 'her', 'their']
        highestProb = 0
        highestSentence = sentence
        for word in words:
            candidateSentence = re.sub('your', word, sentence)
            prob = self.model.prob(word, [candidateSentence])
            if prob > highestProb:
                highestProb = prob
                highestSentence = candidateSentence
        return highestSentence

    # reverses order of adjacent adjectives and nouns
    def adjNounSwap(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        firstWord = pos[0]
        for i, word in enumerate(pos[1:]):
            if firstWord[1] in self.ENG_NOUN and word[1] in self.ENG_ADJECTIVE:
                temp = tokens[i]
                tokens[i] = tokens[i + 1]
                tokens[i + 1] = temp
            firstWord = word

        return " ".join(map(str, tokens))

    def addPronoun(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        firstWord = pos[0]
        for i, word in enumerate(pos[1:]):
            if firstWord[1] not in self.ENG_NOUN and firstWord[
                    0] != 'have' and firstWord[1] not in [
                        'DT', 'TO', 'WP', 'RB', 'PRP', 'VBZ', '.', ','
                    ] and (word[1] in self.ENG_VERB or word[0] == 'have'):
                if firstWord[1] == 'VBP' or (wordnet.synsets(word[0])
                                             and not word[0].endswith('s')):
                    tokens[i + 1] = "they " + tokens[i + 1]
                else:
                    tokens[i + 1] = "it " + tokens[i + 1]
            firstWord = word

        if pos[0][1] in self.ENG_VERB or pos[0][0] == 'have':
            if pos[0][1] == 'VBP' or (wordnet.synsets(word[0])
                                      and not word[0].endswith('s')):
                tokens[0] = "They " + tokens[0]
            else:
                tokens[0] = "It " + tokens[0]

        return " ".join(map(str, tokens))

    def pluralADJ(self, token):
        translation = self.dictionary[token]['default'][0]
        pos = self.uni_tag.tag(nltk.word_tokenize(token))
        if pos[0][1] is not None and pos[0][1].startswith(
                'a') and 'p' in pos[0][1]:
            if translation.endswith('s'):
                if wordnet.synsets(translation[:-1]):
                    translation = translation[:-1]
        return translation

    def removeDeterminers(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        removeOf = []

        firstWord = pos[0]
        for i, word in enumerate(pos[1:]):
            if firstWord[1] in ['DT'] and word[1] in ['NNP', 'NNPS', 'NNS']:
                removeOf.append(i)
            firstWord = word

        if len(removeOf) != 0:
            for i in reversed(removeOf):
                tokens.pop(i)

        return " ".join(map(str, tokens))

    def capitalizationAndNumbers(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        tokens[0] = tokens[0].capitalize()

        pos = nltk.pos_tag(tokens)

        days = [
            'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
            'sunday'
        ]
        months = [
            'january', 'february', 'march', 'april', 'may', 'june', 'july',
            'august', 'september', 'october', 'november', 'december'
        ]

        for i, word in enumerate(pos):
            if word[1] in ['NNP', 'NNPS']:
                tokens[i] = tokens[i].capitalize()

            if word[1] in ['CD']:
                tokens[i] = re.sub(r'\.', ',', tokens[i])

            if word[0] in days or word[0] in months:
                tokens[i] = tokens[i].capitalize()

        newTokens = []
        for i, token in enumerate(tokens):
            if token.lower() == 'a' and i + 1 < len(tokens):
                if any(tokens[i + 1].startswith(vp)
                       for vp in ['a', 'e', 'i', 'o', 'u']):
                    if i == 0:
                        newTokens.append('An')
                    else:
                        newTokens.append('an')
                else:
                    newTokens.append(token)
            else:
                newTokens.append(token)

        return " ".join(map(str, newTokens))

    def verbConjugation(self, candidate, word):
        wordTranslation = en.verb.present(
            self.dictionary[candidate]['verb'][0], person=word[1][4])

        if word[1][2] == 'p':
            wordTranslation = en.verb.present_participle(
                self.dictionary[candidate]['verb'][0])

        if word[1][3] == 's':
            wordTranslation = en.verb.past(
                self.dictionary[candidate]['verb'][0], person=word[1][4])

            if word[1][2] == 'p':
                wordTranslation = en.verb.past_participle(
                    self.dictionary[candidate]['verb'][0])

        return wordTranslation
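
A minimal usage sketch for the class above (assuming the dictionary, test-set and corpus files referenced in __init__ exist at the given relative paths):

if __name__ == '__main__':
    mt = MachineTranslation()   # loads the dictionary, the test sentences and the Brown trigram model
    mt.translate()              # fills mt.translation with one English sentence per Spanish input
    for line in mt.translation:
        print line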
Example #33
0
import sys

from nltk.corpus import brown
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel
from nltk.tokenize import word_tokenize, wordpunct_tokenize  # Tokenizer

if __name__ == "__main__":

    # add language
    tTwit = list(brown.words())
    #    tTwit.extend(list(cess_cat.words()))

    # estimator for smoothing the N-gram model
    estimator1 = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

    tokens1 = list(brown.words())
    # N-gram language model with 3-grams
    model = NgramModel(3, tokens1, estimator=estimator1)

    twit = sys.argv[1]
    # posVars = sys.argv[2]
    # pos = sys.argv[3]

    posVars = list()
    tmpVars = list()
    for i in range(2, len(sys.argv)):
        posVars.append(sys.argv[i])
    #        print sys.argv[i]

    tTwit = word_tokenize(twit)
    # tokens2 = word_tokenize(posVars)
    #    print 'twit ' + ' '.join(tTwit)
    print "posVars " + " ".join(posVars)
Example #34
0
# Import the corpus and functions used from nltk library  
from nltk.corpus import brown
from nltk.corpus import genesis
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel
  
# Tokens contains the words for Genesis and Reuters Trade  
#tokens = list(genesis.words('english-kjv.txt'))

#tokens.extend(list(reuters.words(categories = 'trade')))

# estimator for smoothing the N-gram model
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

# N-gram language model with 3-grams
#model = NgramModel(3, tokens, estimator)
model = NgramModel(3, brown.words(categories='news'), estimator)
#model = NgramModel(3, tokens)

# Apply the language model to generate 50 words in sequence
text_words = model.generate(50)

# Concatenate all words generated in a string separating them by a space.
text = ' '.join([word for word in text_words])

# print the text
print text

print model.prob('repayments', ['international', 'debt'])
Example #35
0
	for item in sent:
		item=item.lower()
		for entry in list(item):
			char_list.append(entry)
		char_list.append(' ')

myCorpus=char_list
# Remove rare words from the corpus
fdist = nltk.FreqDist(w for w in myCorpus)
vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))

myCorpus = map(lambda x: x if x in vocabulary else "*unknown*", myCorpus)


estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 
lm_useful = NgramModel(NVAL, myCorpus, estimator=estimator)

print "Useful reviews model complete"

myCorpusReader = nltk.corpus.reader.PlaintextCorpusReader(TRAIN_DATA_PATH,NOTUSEFUL_REVIEWS_FILE)
myCorpus = [word.lower() for word in myCorpusReader.words()]
for sent in myCorpusReader.sents():
	for item in sent:
		item=item.lower()
		for entry in list(item):
			char_list.append(entry)
		char_list.append(' ')

myCorpus=char_list
# Remove rare words from the corpus
fdist = nltk.FreqDist(w for w in myCorpus)
Example #36
0
def demo():
    from nltk.corpus import treebank 
    #from nltk.probability import LidstoneProbDist
    #from nltk.probability import WittenBellProbDist
    from nltk.probability import SimpleGoodTuringProbDist
    from nltk.model import NgramModel
    estimator = lambda fdist, bins: SimpleGoodTuringProbDist(fdist, len(fdist)+1) 
    #estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 
    #estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2) 
    
    tag_corpus = []
    for (word,tag) in treebank.tagged_words():
        tag_corpus.append(tag)
    lm = NgramModel(2, tag_corpus, estimator) 
    print lm 
    lm1 = NgramModel(1, tag_corpus, estimator) 
    print lm1 
    print tag_corpus[:20]

    sent = "NN"
    print lm1.entropy(sent) 
    
    sent = "DT "
    print lm1.entropy(sent) 

    sent = "VBZ"
    print lm1.entropy(sent) 
    
    sent = "JJ"
    print lm1.entropy(sent) 
    
    sent = "RB"
    print lm1.entropy(sent) 
    
    sent = "DT NN"
    print lm.entropy(sent) 
Example #37
0
def trainModel():
    totalwords = abc.words() #+ genesis.words() + gutenberg.words() + webtext.words()
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    BigramModel = NgramModel(2, totalwords, estimator=estimator)
    UnigramModel = NgramModel(1, totalwords, estimator=estimator)
    return (UnigramModel, BigramModel)
Example #38
0
from nltk.util import ngrams
from nltk.corpus import reuters  
from nltk.corpus import genesis  
from nltk.probability import LaplaceProbDist  
from nltk.model import NgramModel
import nltk

sentence = 'She covered a Bob Dylan song for Amnesty International.'

## http://www.inf.ed.ac.uk/teaching/courses/icl/nltk/probability.pdf
## http://www.nltk.org/book/ch02.html

n = 2
bigrams = ngrams(sentence.split(), n)

print bigrams

## Append starting points and ending points

#for grams in sixgrams:
#    print grams
    
estimator = lambda fdist, bins: LaplaceProbDist(fdist, len(sentence.split())+1)

model = NgramModel(2,sentence.split(),estimator=estimator)  

print model.generate(1, ("her","take"))
print 
print model.entropy(["she","covered"])
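
A small follow-up sketch with the same model: compare the probability of a word that actually follows "covered" in the training sentence with one that does not (the unseen pair only gets mass through the Laplace smoothing and backoff):

print model.prob("a", ["covered"])     # "covered a" occurs in the training sentence
print model.prob("song", ["covered"])  # "covered song" does not occur as a bigram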
Example #39
0
'''
Created on 7 Nov 2009

@author: joh
'''

from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist

text = 'hi how are you do you like fudge you like cookies'

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
model = NgramModel(3, text.split(), estimator=estimator)



print model.prob('you', ('how','are'))

print model.prob('you', ('how','do'))
Example #40
0
training_spans = WhitespaceTokenizer().span_tokenize(training_raw)
spans = [span for span in training_spans]
training_offsets = [span[0] for span in spans]
train = []
for s in spans:
	train.append(training_raw[s[0]:s[1]])

testing_spans = WhitespaceTokenizer().span_tokenize(testing_raw)
spans = [span for span in testing_spans]
testing_offsets = [span[0] for span in spans]
test = []
for s in spans:
	test.append(testing_raw[s[0]:s[1]])

estimator = lambda fdist, bins: LidstoneProbDist(fdist, args.estimator_probability) 
lm = NgramModel(args.num_grams, train, estimator=estimator)

t0 = 0
t1 = 1
current_best=''
while t1 < len(test):
	perplexity = lm.perplexity(test[t0:t1])
	if perplexity > args.cutoff_max_perplexity:
		if (len(current_best)>1):
			print current_best+'.'
			current_best=''
		t0 = t1 + 1
		t1 = t0 + 1
	else:
		t1 += 1
		if t1-t0 > args.min_sentence_length and perplexity < args.output_max_perplexity:
Example #41
0
			if parsed.word_type == "stem":
				stemmer = Stemmer.Stemmer('russian')
				words += stemmer.stemWords([inp])
			elif parsed.word_type == "surface_all":
				words += nltk.word_tokenize(inp)
			elif parsed.word_type == "surface_no_pm" or parsed.word_type[:7] == "suffix_":
				inp = inp.translate(None, string.punctuation)
				words += nltk.word_tokenize(inp)
			else:
				words += nltk.word_tokenize(inp)
			

if parsed.word_type[:7] == "suffix_":
	l = int(parsed.word_type.split("_")[1])
	words = [x[-l:] for x in words]

if parsed.unknown_word_freq:
	unknown_words = []
	# print "Removing unknown words"
	fq = FreqDist(words)
	for w, count in fq.iteritems():
		if count < parsed.unknown_word_freq:
			unknown_words.append(w)

	words[:] = [x if x not in unknown_words else "<UNK>" for x in words]

lm = NgramModel(n, words, estimator=estimator)
outf = open(output, "wb")
dill.dump(lm, outf, protocol=2)
outf.close()
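
A minimal round-trip sketch (assuming the same output path as above): dill can serialize objects, such as the estimator lambda, that the plain pickle module cannot, which is presumably why it is used here; the model is restored the same way:

with open(output, "rb") as inf:
    lm_loaded = dill.load(inf)
print lm_loaded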
Example #42
0
#!/usr/bin/env python
import nltk
from nltk import bigrams
from nltk import trigrams
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel

with open('./austen/persuasion.txt', 'r') as training_file:
    raw = training_file.read()
tokens = nltk.word_tokenize(raw)

with open('./austen/sense_and_sensibility.txt', 'r') as test_file:
    test = test_file.read()
test_list = nltk.word_tokenize(test)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
model = NgramModel(3, tokens, True, False, estimator)
tri = model.entropy(test_list)
print "tri-gram: " + str(tri)

model = NgramModel(2, tokens, True, False, estimator)
bi = model.entropy(test_list)
print "bi-gram: " + str(bi)
Example #43
0
    forbidden_words = forbidden_words_doc.readline()
    # list of prepositions and other function words
    prep_doc = codecs.open(scriptdir + u'prepos_list.txt',
                           u'r',
                           encoding='utf-8')
    prepositions = []
    for line in prep_doc:
        if len(line) > 0: prepositions.append(strip_string(line))

    # Represent the input text as a list of tokens
    all_tokens = tokenize(text)

    # build the language model
    ngrams = 10
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.1)
    model = NgramModel(ngrams, all_tokens, estimator=estimator)
    print 'Language model built.'

    # randomly generate the first word
    random_word = generate_first_word(all_tokens)
    meisterwerk = [random_word]

    # generate 4 lines; syllable counts per line: 9/8/9/8
    first_line = generate_line(meisterwerk, 9)
    for word in first_line:
        meisterwerk.append(word)
    print '1st line generated.'

    second_line = generate_line(meisterwerk, 8)
    for word in second_line:
        meisterwerk.append(word)
Example #44
0
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import LidstoneProbDist, WittenBellProbDist
from nltk.model import NgramModel
from nltk.tokenize import sent_tokenize, word_tokenize


corpusdir = 'corpora/' # Directory of corpus.
SickCorpus = PlaintextCorpusReader(corpusdir, 'sick_tweets.txt')
HealthyCorpus = PlaintextCorpusReader(corpusdir, 'healthy_tweets.txt')

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

sick_model_1 = NgramModel(1, SickCorpus.sents(), True, False, estimator)
sick_model_2 = NgramModel(2, SickCorpus.sents(), True, False, estimator)

healthy_model_1 = NgramModel(1, HealthyCorpus.sents(), True, False, estimator)
healthy_model_2 = NgramModel(2, HealthyCorpus.sents(), True, False, estimator)

tweet = "Remember when we were all diagnosed with Bieber fever ? Lol"

print "sick_model_1 is: " + str(sick_model_1.perplexity(word_tokenize(tweet)))
print "sick_model_2 is: " + str(sick_model_2.perplexity(word_tokenize(tweet)))
print "healthy_model_1 is: " + str(healthy_model_1.perplexity(word_tokenize(tweet)))
print "healthy_model_2 is: " + str(healthy_model_2.perplexity(word_tokenize(tweet)))
Example #45
0
tamil1_alpha = []
tamil1_alpha_all = []
tamil2_alpha = []
tamil2_alpha_all = []


for line in tamil1f.readlines()[1:]:
    tamil1_alpha.append(["St"]+line.split(",")[-1].replace("\n","").replace("[","").replace("]","").replace("\r","").split("|")+["En"])
    tamil1_alpha_all += ["St"]+line.split(",")[-1].replace("\n","").replace("[","").replace("]","").replace("\r","").split("|")+["En"]

for line in tamil2f.readlines()[1:]:
    tamil2_alpha.append(["St"]+line.split(",")[-1].replace("\n","").replace("[","").replace("]","").replace("\r","").split("|")+["En"])
    tamil2_alpha_all += ["St"]+line.split(",")[-1].replace("\n","").replace("[","").replace("]","").replace("\r","").split("|")+["En"]

        
s_bg1 = nltk.bigrams(tamil1_alpha_all)
s_bg2 = nltk.bigrams(tamil2_alpha_all)

fdist1 = nltk.FreqDist(s_bg1)
fdist2 = nltk.FreqDist(s_bg2)

estimator1 = lambda fdist, bins: LaplaceProbDist(fdist, len(tamil1_alpha_all)+1)
estimator2 = lambda fdist, bins: LaplaceProbDist(fdist, len(tamil2_alpha_all)+1)

model1 = NgramModel(3,tamil1_alpha_all,estimator=estimator1)  
model2 = NgramModel(3,tamil2_alpha_all,estimator=estimator2)

print model1.entropy(tamil1_alpha[0])
print model1.perplexity(tamil1_alpha[0])
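
A small follow-up sketch: the two datasets can also be cross-scored by evaluating a sequence from one set under the model trained on the other:

print model2.entropy(tamil1_alpha[0])
print model2.perplexity(tamil1_alpha[0])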