def language_model(self, level):
    """Build an n-gram language model of order `level` over self.words.

    Uses a Good-Turing-smoothed estimator for the conditional
    distributions.

    :param level: n-gram order (1 = unigram, 2 = bigram, ...)
    :returns: an nltk.NgramModel trained on self.words
    """
    # NgramModel always calls the estimator as estimator(fdist, bins);
    # Good-Turing ignores `bins` but the parameter must be accepted.
    def estimator(fdist, bins):
        return GoodTuringProbDist(fdist)

    model = nltk.NgramModel(level, self.words, estimator)
    logging.info("Ngram model of length %d is calculated for %s." % (level, self.name))
    return model
def preprocess():
    """Tokenize each pundit corpus file and train 1- to 3-gram models.

    Populates the module-level nltk_models dict as
    nltk_models[pundit][n] = NgramModel of order n, then sets the
    module-level successfulInit flag.
    """
    # NOTE(review): the original assigned successfulInit as a function
    # local, which is invisible to callers; the flag appears meant to be
    # module-level -- confirm against the rest of the module.
    global successfulInit
    for pundit in pundit_files:
        # parse each input file, and convert it into NLTK tokens and text.
        # `with` guarantees the file is closed even if tokenization raises.
        with open(pundit_files[pundit], 'r') as in_file:
            tokens = nltk.word_tokenize(in_file.read().decode('utf-8', 'ignore'))
        text = nltk.Text(tokens)
        nltk_models[pundit] = {}
        # only support up to 3-grams
        for gram in range(1, 4):
            nltk_models[pundit][gram] = nltk.NgramModel(gram, text)
    successfulInit = True
def preprocess_short(): for pundit in pundit_files: # parse each input file, and convert it into NLTK tokens and text in_file = open(pundit_files[pundit], 'r') tokens = nltk.word_tokenize(in_file.read().decode('utf-8', 'ignore')) text = nltk.Text(tokens) print 'tokenized tokens' nltk_models[pundit] = {} for gram in range(1, 3): nltk_models[pundit][gram] = nltk.NgramModel(gram, text) print 'made ngram model' in_file.close() successfulInit = True
def _get_ngram_model(bigrams):
    """Return the cached n-gram model, building it from all samples if absent.

    Trains a bigram model when `bigrams` is truthy, otherwise a trigram
    model. May return None when the cache is cold and no samples exist yet.
    """
    # NLTK produces a LOT of warnings - keep them out of the error log.
    warnings.simplefilter("ignore")
    model = cache.get('ngram_model')
    if model is not None:
        return model
    samples = Sample.get_all()
    if not samples:
        # Cache miss and nothing to train on: propagate None.
        return model
    corpus = ' '.join(unicode(s) for s in samples)
    tokens = nltk.tokenize.WordPunctTokenizer().tokenize(corpus)
    order = 2 if bigrams else 3
    model = nltk.NgramModel(order, tokens)
    cache.set('ngram_model', model, timeout=app.config['CACHE_MINUTES'] * 60)
    return model
def generate(self, length=100):
    """Generate `length` words from a randomly parameterised n-gram model.

    Re-tokenizes a randomly chosen entry of self.__words, trains an
    NgramModel of random order 3..15 with a randomly-smoothed Lidstone
    estimator over this object, and returns the generated words wrapped
    for display.

    :param length: number of words to generate (default 100)
    :returns: generated text as a single wrapped string
    """
    # Change tokens: re-seed the token stream from one random source text.
    # NOTE(review): uses the module-level randint here but self.__random
    # everywhere else -- presumably both should share one RNG; confirm.
    self.tokens = nltk.word_tokenize(
        self.__words[randint(1, len(self.__words)) - 1])
    # Lidstone smoothing with a random gamma in [0, 1).
    estimator = lambda fdist, bins: nltk.LidstoneProbDist(
        fdist, self.__random.random())
    # Random model order so each call produces a different texture.
    self._trigram_model = nltk.NgramModel(self.__random.randint(3, 15),
                                          self, estimator)
    text = self._trigram_model.generate(length)
    return nltk.tokenwrap(text)
def language_model_for(nick):
    """Return a trigram language model for the specified nick.

    Gives back None when fewer than 10 usable lines exist for that user.
    """
    lines = readfromlogs.lines_said_by(nick)
    print("got lines")
    # One cleaned sentence per logged line: tokenize, keep words and
    # contraction pieces, then glue the contraction pieces back together.
    sentences = [
        merge_contractions(list(filter(word_or_contraction,
                                       nltk.word_tokenize(line))))
        for line in lines
    ]
    if len(sentences) < 10:
        return None
    print("got sentences")
    model = nltk.NgramModel(3, sentences)
    print("got lm")
    return model
# --- tail of a rhyme-lookup helper; its `def` lies above this excerpt ---
# rhyme_part: phoneme tail of `pron` from index lsv onward (presumably the
# last stressed vowel -- TODO confirm against the enclosing function).
rhyme_part = pron[lsv:]
lrp = len(rhyme_part) * -1
for (x, y) in word_list_u:
    ps = strip_numbers(y)
    # Keep words whose pronunciation ends with the same rhyme tail but is
    # not identical from one phoneme earlier (avoids trivial self-rhymes).
    if ps[lrp:] == rhyme_part and ps[lrp - 1:] != pron[lsv - 1:]:
        rhyming_words.append(x)
    else:
        pass
# Drop the query word itself, then any banned line-ending words.
rw = [i for i in rhyming_words if not i == word]
rw2 = [j for j in rw if not j in banned_end_words]
return rw2


print "building content model..."
# Lidstone-smoothed (gamma = 0.2) 5-gram model over the word list `vw`.
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
content_model = nltk.NgramModel(5, vw, estimator=estimator)


def generate():
    # Seed with a random 10-word window from the corpus, then grow the
    # poem line by line, each line conditioned on the previous one.
    # (The function may continue beyond this excerpt.)
    sw1 = random.randint(0, len(vw) - 10)
    sw2 = sw1 + 10
    starting_words = vw[sw1:sw2]
    line_1 = content_model.generate(10, starting_words)
    line_1 = line_1[-10:]
    line_2 = content_model.generate(10, line_1)
    line_2 = line_2[-10:]
    line_3 = content_model.generate(9, line_2)
    line_3 = line_3[-9:]
    line_4 = content_model.generate(9, line_3)
    line_4 = line_4[-9:]
    line_5 = content_model.generate(10, line_4)
# --- tail of processText(rawtext, sentCor); its `def` lies above this excerpt ---
rawtext = unicode(rawtext, 'utf-8')
# Strip the leading speaker tag, then segment the Chinese text with jieba.
rawtext = re.sub(ur"^陳雲:", '', rawtext.rstrip("\n"))
seg_list = jieba.cut(rawtext)
sentCor.append(" ".join(seg_list))


sentCor = []
for i in inputtext:
    if i != "\n":
        processText(i, sentCor)
content_text = ' '.join(sentCor)
# Tokens are either word runs or runs of punctuation.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')
tokenized_content = tokenizer.tokenize(content_text)
content_model = nltk.NgramModel(4, tokenized_content)


def chinwanBot(content_model):
    # Warm up the model, keep the last two generated words as a seed,
    # then generate a 70-token candidate passage.
    starting_words = content_model.generate(100)[-2:]
    randomsentence = content_model.generate(70, starting_words)
    # Re-insert a space after Latin-script tokens, which the final
    # "".join would otherwise run together.
    for i in range(0, len(randomsentence)):
        if re.match(ur'[A-Za-z]+$', randomsentence[i]):
            randomsentence[i] = randomsentence[i] + " "
    # Trim to whole sentences: start just after the first terminal
    # punctuation mark and end just after the last one.
    # (Raises ValueError on min()/max() if no such mark was generated.)
    puncIndex = [
        i for i, x in enumerate(randomsentence)
        if any(thing in x for thing in [u'。', u'!', u'?'])
    ]
    startingIndex = min(puncIndex) + 1
    endingIndex = max(puncIndex) + 1
    return "".join(randomsentence[startingIndex:endingIndex])
def __init__(self, n): self.n = n print "Training ngram language model..." self.model = nltk.NgramModel(n, brown.words()) print "--Training complete--"
def createLM(observation, n):
    """Return an order-`n` NgramModel trained on `observation`.

    Probabilities are maximum-likelihood (unsmoothed) estimates.
    """
    mle_estimator = nltk.MLEProbDist
    model = nltk.NgramModel(n, observation, estimator=mle_estimator)
    return model
# --- excerpt begins mid-statement: the line below is the tail of a
# conditional expression (presumably "SAMPLE_URLS = <default> if not
# options.sample else ...") whose left-hand side lies above this excerpt ---
if not options.sample else options.sample.split(' ')
WORDS = 500 if not options.words else int(options.words)
# Trigram by default; bigram when the --bigrams option was given.
NGRAM = 3 if not options.bigrams else 2
samples = []
if options.sample:
    # Fetch each sample URL and strip the HTML down to plain text.
    for url in SAMPLE_URLS:
        sample = unicode(
            BeautifulSoup(urlopen(url),
                          convertEntities=BeautifulSoup.HTML_ENTITIES))
        samples.append(nltk.clean_html(sample))
elif options.input:
    samples = [open(options.input).read().decode('utf8')]
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenized = tokenizer.tokenize(' '.join(samples))
# NLTK emits many warnings during training; silence them.
warnings.simplefilter("ignore")
model = nltk.NgramModel(NGRAM, tokenized)
# Warm up the model, seed generation with the last two warm-up words.
starts = model.generate(100)[-2:]
generated = model.generate(WORDS, starts)
# Tidy spacing around punctuation, capitalise, and add a trailing ellipsis.
out = ' '.join(generated).encode('utf8').replace(' , ', ', ').replace(' . ', '. ')
out = '%s%s...' % (out[0].upper(), out[1:])
if options.output:
    f = open(options.output, 'a+')
    f.write(out)
    f.close()
else:
    print out
# --- excerpt begins mid-call: the first line below is the trailing keyword
# argument of an init_args.add_argument(...) call whose opening lies above
# this excerpt ---
                       type=int)
init_args.add_argument('-o', '--outfile',
                       help='specify the output of the file,'
                       'must be either .txt (plain text) or .csv'
                       '(markov table), if non specified,'
                       'will print plain text to stdio',
                       type=argparse.FileType('a'),
                       default=sys.stdout)

if __name__ == '__main__':
    args = init_args.parse_args()
    gram_size = args.gramsize
    in_file = args.infile
    out_file = args.outfile
    text = None
    # Only .txt input is tokenized here; any other extension leaves `text`
    # as None, and the NgramModel call below would then fail -- presumably
    # a .csv branch is handled elsewhere or missing; confirm.
    if in_file.name.endswith('.txt'):
        tokens = nltk.word_tokenize(in_file.read())
        text = nltk.Text(tokens)
        in_file.close()
    # generate ngrams text
    model = nltk.NgramModel(gram_size, text)
    # Warm up the model and seed real generation with its last two words.
    starting_words = model.generate(100)[-2:]
    model_words = model.generate(50, starting_words)
    speech = ' '.join([word for word in model_words])
    print(speech)
    out_file.write(speech)
    out_file.close()
import nltk
from nltk.book import *

# Demo/REPL transcript: strip punctuation tokens from nltk.book's text2,
# inspect its bigrams/trigrams, and sample text from unigram, bigram and
# trigram models. Bare expressions are REPL-style inspections (their
# values are only visible interactively).
punctuation = ['.', ':', ',', '?', '!', '-', '--', ';']
text2Clean = [w for w in text2 if w not in punctuation]
text2Clean[1:10]
nltk.bigrams(text2Clean[1:10])
nltk.trigrams(text2Clean[1:10])
unigramModel = nltk.NgramModel(1, text2Clean)
len(set(text2Clean))
unigramModel.generate(num_words=50)
bigramModel = nltk.NgramModel(2, text2Clean)
bigramModel.generate(num_words=50)
trigramModel = nltk.NgramModel(3, text2Clean)
trigramModel.generate(num_words=50)

# POS
nltk.corpus.brown.tagged_words()[1:10]
myText = 'This is my text, and I will now try to tag it'
# --- excerpt begins mid-if: the condition selecting between the two
# tokenizers lies above this excerpt ---
    tokens = nltk.word_tokenize(new_result.lower())
else:
    tokens = nltk.wordpunct_tokenize(new_result.lower())
# Lidstone smoothing with gamma = 0.2.
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
try:
    # One uniquely named output file per run.
    fn = str(uuid.uuid1()) + ".txt"
    f = open(fn, "w")
except Exception as e:
    print "Cannot open file %s for writing! Aborting..." % fn
    raise
for i in range(args.tries):
    content_model = nltk.NgramModel(args.ngram, tokens, estimator=estimator)
    # Warm up generously (10x the target length), then seed the real
    # generation with the last two warm-up words.
    starting_words = content_model.generate(args.words * 10)[-2:]
    try:
        # Best-effort console echo; may fail on console-codepage encoding.
        print "Generating %d-Gram, starting from words:\t%s" % (
            args.ngram, " ".join(
                word.encode(syscodepage, "ignore")
                for word in starting_words))
    except:
        pass
    content = content_model.generate(args.words, starting_words)
    try:
        f.write(" ".join(word.encode("utf8") for word in content) + "\n")
    except Exception as e:
        f.close()
        print "Cannot write to file %s! Exception:\t%s" % (fn, e)
#!/usr/bin/env python2 -W ignore::UserWarning
import sys
import nltk
from nltk.tokenize import word_tokenize
import warnings

# NLTK training is noisy; drop all warnings up front.
warnings.filterwarnings("ignore")
# Python 2 hack so implicit str<->unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Tokenize the scraped Stack Overflow corpus once, then print one sample
# sentence each from 3- through 6-gram models, all seeded with "I like".
with open('stackoverflow_content', 'r') as corpus_file:
    corpus_text = corpus_file.read()
corpus_tokens = word_tokenize(corpus_text)

for order in range(3, 7):
    model = nltk.NgramModel(order, corpus_tokens)
    generated = model.generate(20, ["I", "like"])
    print("\nSentence generated for " + str(order) + "-Gram Model:")
    print(' '.join(generated))