예제 #1
0
 def language_model(self, level):
     """Build and return an n-gram language model of order ``level``.

     Trains ``nltk.NgramModel`` on this object's ``self.words`` with a
     Good-Turing estimator and logs completion.
     """
     # Alternative estimators kept for reference:
     #   lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
     #   lambda fdist, bins: MLEProbDist(fdist)
     estimator = lambda fdist, bins: GoodTuringProbDist(fdist)
     model = nltk.NgramModel(level, self.words, estimator)
     # Lazy %-style args: the message is only formatted if INFO is enabled.
     logging.info("Ngram model of length %d is calculated for %s.",
                  level, self.name)
     return model
예제 #2
0
def preprocess():
    """Tokenize each pundit's source file and build its 1- to 3-gram models.

    Populates the module-level ``nltk_models`` dict (one sub-dict per
    pundit, keyed by n-gram order) and sets the module-level
    ``successfulInit`` flag once at least one pundit has been processed.
    """
    # Original assigned ``successfulInit`` as a dead local by mistake;
    # declare it global so the flag is actually visible to callers.
    global successfulInit
    for pundit in pundit_files:
        # parse each input file, and convert it into NLTK tokens and text
        # ``with`` guarantees the handle is closed even if tokenizing fails
        # (the original leaked it on exception).
        with open(pundit_files[pundit], 'r') as in_file:
            tokens = nltk.word_tokenize(in_file.read().decode('utf-8', 'ignore'))
        text = nltk.Text(tokens)
        nltk_models[pundit] = {}
        # only support up to 3-grams
        for gram in range(1, 4):
            nltk_models[pundit][gram] = nltk.NgramModel(gram, text)
        successfulInit = True
예제 #3
0
def preprocess_short():
    for pundit in pundit_files:
        # parse each input file, and convert it into NLTK tokens and text
        in_file = open(pundit_files[pundit], 'r')
        tokens = nltk.word_tokenize(in_file.read().decode('utf-8', 'ignore'))
        text = nltk.Text(tokens)
        print 'tokenized tokens'
        nltk_models[pundit] = {}
        for gram in range(1, 3):
            nltk_models[pundit][gram] = nltk.NgramModel(gram, text)
            print 'made ngram model'
        in_file.close()
        successfulInit = True
예제 #4
0
def _get_ngram_model(bigrams):
    """Return a cached NgramModel: bigram when *bigrams* is truthy, else trigram.

    Returns ``None`` when nothing is cached and there are no samples to
    train on.
    """
    # NLTK produces a LOT of warnings - don't mess with my error log
    warnings.simplefilter("ignore")
    model = cache.get('ngram_model')
    if model is not None:
        return model
    samples = Sample.get_all()
    if not samples:
        return model  # still None: no training data available
    # Join every sample into a single corpus string and tokenize it.
    corpus = ' '.join(unicode(s) for s in samples)
    tokens = nltk.tokenize.WordPunctTokenizer().tokenize(corpus)
    order = 2 if bigrams else 3
    model = nltk.NgramModel(order, tokens)
    cache.set('ngram_model', model, timeout=app.config['CACHE_MINUTES'] * 60)
    return model
예제 #5
0
    def generate(self, length=100):
        """Generate *length* tokens of random text from a fresh n-gram model.

        Re-tokenizes a randomly chosen entry of ``self.__words``, trains
        an NgramModel of random order 3-15 with a randomly smoothed
        Lidstone estimator, and returns the generated text wrapped for
        display.
        """
        # Change tokens: pick one random entry of __words and re-tokenize it.
        # NOTE(review): uses the module-level randint here but
        # self.__random.randint below - presumably intentional; confirm.
        self.tokens = nltk.word_tokenize(
            self.__words[randint(1, len(self.__words)) - 1])

        # Lidstone smoothing with a random gamma in [0, 1).
        estimator = lambda fdist, bins: nltk.LidstoneProbDist(
            fdist, self.__random.random())
        #estimator = lambda fdist, bins: nltk.LidstoneProbDist(fdist, 0.2)
        # Trains on ``self`` itself - assumes this class is iterable over
        # its tokens (TODO confirm).
        self._trigram_model = nltk.NgramModel(self.__random.randint(3, 15),
                                              self, estimator)
        #self._trigram_model = nltk.NgramModel(3, self, estimator)
        text = self._trigram_model.generate(length)
        return nltk.tokenwrap(text)
예제 #6
0
def language_model_for(nick):
    """Return a language model for the specified nick, or None if there's not
    much data for that user."""
    lines = readfromlogs.lines_said_by(nick)
    print("got lines")

    # One merged-token sentence per logged line: tokenize, keep only
    # words/contractions, then merge split contractions back together.
    sentences = [
        merge_contractions(list(filter(word_or_contraction,
                                       nltk.word_tokenize(line))))
        for line in lines
    ]

    # Too little data to train anything useful.
    if len(sentences) < 10:
        return None

    print("got sentences")
    model = nltk.NgramModel(3, sentences)
    print("got lm")
    return model
예제 #7
0
    # NOTE(review): excerpt starts mid-function; ``pron``, ``lsv``,
    # ``word_list_u``, ``rhyming_words``, ``word``, ``banned_end_words``
    # and ``strip_numbers`` are defined outside this view.
    # Presumably the pronunciation slice from the last stressed vowel on
    # - TODO confirm.
    rhyme_part = pron[lsv:]
    lrp = len(rhyme_part) * -1
    for (x, y) in word_list_u:
        ps = strip_numbers(y)
        # Same rhyme tail, but not an identical ending one phoneme earlier
        # (filters out words that merely repeat the same sound).
        if ps[lrp:] == rhyme_part and ps[lrp - 1:] != pron[lsv - 1:]:
            rhyming_words.append(x)
        else:
            pass
    # Drop the word itself and any banned line-ending words.
    rw = [i for i in rhyming_words if not i == word]
    rw2 = [j for j in rw if not j in banned_end_words]
    return rw2


print "building content model..."
# Lidstone-smoothed (gamma=0.2) 5-gram model over the word list ``vw``
# (presumably built earlier in this file - TODO confirm).
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
content_model = nltk.NgramModel(5, vw, estimator=estimator)


def generate():
    """Generate five lines of verse (10/10/9/9/10 words) from content_model.

    Each line is seeded with the previous line's words so consecutive
    lines flow together; the first line is seeded with a random 10-word
    window of ``vw``.

    NOTE(review): ``line_5`` is computed but never used or returned -
    the example appears truncated here.
    """
    # Random 10-word window of the source text as the seed context.
    sw1 = random.randint(0, len(vw) - 10)
    sw2 = sw1 + 10
    starting_words = vw[sw1:sw2]
    # generate() output presumably includes the seed context, hence the
    # tail slices below - confirm against the NgramModel API.
    line_1 = content_model.generate(10, starting_words)
    line_1 = line_1[-10:]
    line_2 = content_model.generate(10, line_1)
    line_2 = line_2[-10:]
    line_3 = content_model.generate(9, line_2)
    line_3 = line_3[-9:]
    line_4 = content_model.generate(9, line_3)
    line_4 = line_4[-9:]
    line_5 = content_model.generate(10, line_4)
예제 #8
0
    # NOTE(review): interior of a text-preprocessing helper (its ``def``
    # is outside this excerpt). Decodes the raw line, strips the leading
    # speaker prefix, segments with jieba, and appends the space-joined
    # segmentation to ``sentCor``.
    rawtext = unicode(rawtext, 'utf-8')
    rawtext = re.sub(ur"^陳雲:", '', rawtext.rstrip("\n"))
    seg_list = jieba.cut(rawtext)
    sentCor.append(" ".join(seg_list))


# Corpus of space-separated jieba segmentations, one entry per input line.
sentCor = []
for i in inputtext:
    if i != "\n":  # skip blank lines
        processText(i, sentCor)

content_text = ' '.join(sentCor)
# Tokens are either word runs or runs of punctuation.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')

tokenized_content = tokenizer.tokenize(content_text)
# 4-gram model over the whole segmented corpus.
content_model = nltk.NgramModel(4, tokenized_content)


def chinwanBot(content_model):
    starting_words = content_model.generate(100)[-2:]
    randomsentence = content_model.generate(70, starting_words)
    for i in range(0, len(randomsentence)):
        if re.match(ur'[A-Za-z]+$', randomsentence[i]):
            randomsentence[i] = randomsentence[i] + " "
    puncIndex = [
        i for i, x in enumerate(randomsentence)
        if any(thing in x for thing in [u'。', u'!', u'?'])
    ]
    startingIndex = min(puncIndex) + 1
    endingIndex = max(puncIndex) + 1
    return "".join(randomsentence[startingIndex:endingIndex])
예제 #9
0
File: nGram.py  Project: afgiel/mt
 def __init__(self, n):
     """Eagerly train an order-*n* language model on the Brown corpus.

     The trained model is stored on ``self.model``; training can take a
     while, hence the progress prints.
     """
     self.n = n  # n-gram order
     print "Training ngram language model..."
     self.model = nltk.NgramModel(n, brown.words())
     print "--Training complete--"
예제 #10
0
def createLM(observation, n):
    """Build and return an order-*n* NgramModel over *observation*.

    Uses maximum-likelihood estimation (no smoothing).
    """
    model = nltk.NgramModel(n, observation, estimator=nltk.MLEProbDist)
    return model
예제 #11
0
# NOTE(review): the line below is the tail of a conditional expression
# whose start is outside this excerpt.
    if not options.sample else options.sample.split(' ')
WORDS = 500 if not options.words else int(options.words)  # output length
NGRAM = 3 if not options.bigrams else 2  # trigram by default, bigram on request

samples = []
if options.sample:
    # Fetch each sample URL and strip the HTML down to plain text.
    for url in SAMPLE_URLS:
        sample = unicode(
            BeautifulSoup(urlopen(url),
                          convertEntities=BeautifulSoup.HTML_ENTITIES))
        samples.append(nltk.clean_html(sample))
elif options.input:
    samples = [open(options.input).read().decode('utf8')]

# Train the model on punctuation-aware word tokens of all samples.
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenized = tokenizer.tokenize(' '.join(samples))
warnings.simplefilter("ignore")  # NLTK's NgramModel is noisy
model = nltk.NgramModel(NGRAM, tokenized)

# Warm up with 100 tokens, then seed the real run with the last two.
starts = model.generate(100)[-2:]
generated = model.generate(WORDS, starts)
# Tidy spacing around commas/periods left by the naive join.
out = ' '.join(generated).encode('utf8').replace(' , ',
                                                 ', ').replace(' . ', '. ')
out = '%s%s...' % (out[0].upper(), out[1:])

if options.output:
    f = open(options.output, 'a+')
    f.write(out)
    f.close()
else:
    print out
예제 #12
0
                       # NOTE(review): tail of an ``add_argument`` call
                       # whose start is outside this excerpt.
                       type=int)

init_args.add_argument('-o', '--outfile',
                       help='specify the output of the file,'
                       'must be either .txt (plain text) or .csv'
                       '(markov table), if non specified,'
                       'will print plain text to stdio',
                       type=argparse.FileType('a'),
                       default=sys.stdout)


if __name__ == '__main__':
    args = init_args.parse_args()
    gram_size = args.gramsize
    in_file = args.infile
    out_file = args.outfile
    text = None
    if in_file.name.endswith('.txt'):
        tokens = nltk.word_tokenize(in_file.read())
        text = nltk.Text(tokens)
        in_file.close()

    # NOTE(review): if the input is not .txt, ``text`` stays None and the
    # model construction below will fail - consider erroring explicitly.
    # generate ngrams text
    model = nltk.NgramModel(gram_size, text)
    # Warm up with 100 tokens, then seed the real run with the last two.
    starting_words = model.generate(100)[-2:]
    model_words = model.generate(50, starting_words)
    speech = ' '.join([word for word in model_words])
    print(speech)
    out_file.write(speech)
    out_file.close()
예제 #13
0
import nltk

from nltk.book import *

# NOTE(review): bare expressions throughout suggest this is a REPL/demo
# transcript; their values are discarded when run as a script.

# Strip punctuation tokens out of nltk.book's ``text2``.
punctuation = ['.', ':', ',', '?', '!', '-', '--', ';']
text2Clean = [w for w in text2 if w not in punctuation]

text2Clean[1:10]  # peek at a few cleaned tokens

nltk.bigrams(text2Clean[1:10])

nltk.trigrams(text2Clean[1:10])

# Uni-, bi- and trigram models over the cleaned text; each generates
# 50 words of random text in the corpus style.
unigramModel = nltk.NgramModel(1, text2Clean)

len(set(text2Clean))  # vocabulary size

unigramModel.generate(num_words=50)

bigramModel = nltk.NgramModel(2, text2Clean)

bigramModel.generate(num_words=50)

trigramModel = nltk.NgramModel(3, text2Clean)

trigramModel.generate(num_words=50)

# POS
nltk.corpus.brown.tagged_words()[1:10]

myText = 'This is my text, and I will now try to tag it'
예제 #14
0
        # NOTE(review): excerpt starts mid ``if``; the matching condition
        # is outside this view.
        tokens = nltk.word_tokenize(new_result.lower())
    else:
        tokens = nltk.wordpunct_tokenize(new_result.lower())

    # Lidstone smoothing with gamma=0.2.
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

    # Write generated text to a uniquely named scratch file.
    try:
        fn = str(uuid.uuid1()) + ".txt"
        f = open(fn, "w")
    except Exception as e:
        print "Cannot open file %s for writing! Aborting..." % fn
        raise

    for i in range(args.tries):
        # NOTE(review): a fresh model is trained on every try although
        # ``tokens`` never changes - looks redundant; confirm intent.
        content_model = nltk.NgramModel(args.ngram,
                                        tokens,
                                        estimator=estimator)
        # Warm up, then seed the real generation with the last two tokens.
        starting_words = content_model.generate(args.words * 10)[-2:]
        try:
            print "Generating %d-Gram, starting from words:\t%s" % (
                args.ngram, " ".join(
                    word.encode(syscodepage, "ignore")
                    for word in starting_words))
        except:
            pass
        content = content_model.generate(args.words, starting_words)
        try:
            f.write(" ".join(word.encode("utf8") for word in content) + "\n")
        except Exception as e:
            f.close()
            print "Cannot write to file %s! Exception:\t%s" % (fn, e)
예제 #15
0
#!/usr/bin/env python2 -W ignore::UserWarning

import sys
import nltk
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings("ignore")  # NLTK's NgramModel is noisy

# Python 2 hack: allow implicit utf-8 encoding everywhere.
reload(sys)
sys.setdefaultencoding('utf8')

with open('stackoverflow_content', 'r') as f:
    contents = f.read()
    tokens = word_tokenize(contents)
    # Train 3- through 6-gram models and sample 20 tokens from each,
    # seeded with the bigram "I like".
    for i in range(3, 7):
        content_model = nltk.NgramModel(i, tokens)

        content = content_model.generate(20, ["I", "like"])
        print("\nSentence generated for " + str(i) + "-Gram Model:")
        print(' '.join(content))