def raw_words(self, length=100):
    """Generates a list of words using an NLTK NgramModel."""
    # Build the bigram model lazily on first call (NLTK 2.x API).
    if not hasattr(self, '_ngram_model'):
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._ngram_model = NgramModel(2, self.model, estimator=estimator)
    # Seed generation with a random corpus word; drop the seed from the output.
    return self._ngram_model.generate(length, [random.choice(self.words)])[1:]
from nltk.model import NgramModel
from nltk.probability import WittenBellProbDist


def generateContentFromTokens(text_length, ngram_length, token_list):
    # Witten-Bell smoothing so unseen n-grams get non-zero probability.
    estimator = lambda fdist, bins: WittenBellProbDist(fdist, len(fdist) + 1)
    source_ngrams = NgramModel(ngram_length, token_list, estimator)
    # Run the model once to pick a two-word seed, then generate the real text.
    seed_words = source_ngrams.generate(text_length)[-2:]
    generated_text = source_ngrams.generate(text_length, seed_words)
    return ' '.join(generated_text)
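# Hedged usage sketch for generateContentFromTokens, not part of the original
# code: 'corpus.txt' is a placeholder filename, and an NLTK 2.x install is
# assumed (NgramModel was removed in NLTK 3.0).
import nltk

with open('corpus.txt') as f:
    tokens = nltk.word_tokenize(f.read())

print(generateContentFromTokens(text_length=50, ngram_length=3, token_list=tokens))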
from nltk.model import NgramModel


def dos2(words):
    # Bigram model. There must be a way to avoid precomputing this?
    # This is not working yet, as I haven't succeeded in getting NLTK's
    # smoothing implementations to work.
    # Also, a better way would be to systematically try all possibilities
    # and store the ones with a probability above a threshold
    # (there must be an algorithm for doing this efficiently).
    model = NgramModel(2, words)
    lengths = [len(word) for word in words]
    # Iterate over the possible number of morphemes.
    for n in range(2, max(lengths) + 1):
        # Sample as many words as there are words with this many morphemes.
        for m in range(lengths.count(n)):
            yield model.generate(n)
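# Hedged sketch of the systematic enumeration the comment above wishes for,
# not the author's method: depth-first enumeration of all length-n sequences
# starting from `seed` whose joint bigram probability stays above `threshold`.
# Assumes an NLTK 2.x NgramModel exposing .prob(word, context); pruning is
# sound because extending a sequence can only lower its probability.
def enumerate_sequences(model, vocab, n, threshold, seed):
    def extend(prefix, p):
        if len(prefix) == n:
            yield prefix, p
            return
        for word in vocab:
            q = p * model.prob(word, [prefix[-1]])
            if q >= threshold:
                for result in extend(prefix + (word,), q):
                    yield result
    return extend((seed,), 1.0)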
def __init__(self, dataset, capitalize=False):
    self.capitalize = capitalize
    tweets = dataset.split("\n")
    words = []
    for tweet in tweets:
        # Skip retweets and anything mentioning another user.
        if "@" in tweet or tweet.startswith("RT"):
            continue
        words += [
            word for word in tweet.split()
            if word[0] not in ["@", "#", ":", "(", ")", "2"]
            and "http://" not in word
            and "https://" not in word
        ]
    self.words = words
    self.model = nltk.Text(words)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    self._ngram_model = NgramModel(2, self.model, estimator=estimator)
from nltk.model import NgramModel
from nltk.probability import WittenBellProbDist


def generate(body_tokens):
    """Return a 5-line string following the Cinquain syllable pattern."""
    # Stores the pattern rule for the Cinquain; this could be parameterized
    # to handle other formats.
    syl_per_line = [5, 7, 5, 7, 7]
    line_syl_counts = [0] * len(syl_per_line)
    lines = [""] * len(syl_per_line)
    # Much like the generator code for random article text.
    estimator = lambda fdist, bins: WittenBellProbDist(fdist, len(fdist) + 1)
    source = NgramModel(min(syl_per_line), body_tokens, estimator)
    seed_words = source.generate(100)[-2:]
    generated_text = source.generate(sum(syl_per_line) * 2, seed_words)
    for i, target in enumerate(syl_per_line):
        # Consume words until the line hits its syllable target exactly,
        # or the next word would overshoot it.
        while generated_text:
            word = generated_text[0]
            s = syll_count(word)
            if s + line_syl_counts[i] < target:
                line_syl_counts[i] += s
                lines[i] += word + " "
                generated_text.pop(0)
            elif s + line_syl_counts[i] == target:
                line_syl_counts[i] += s
                lines[i] += word + " "
                generated_text.pop(0)
                break
            else:
                # This word would overshoot the target; discard it and move on.
                generated_text.pop(0)
                break
    # Second pass: if a line came up short, look for a leftover word whose
    # syllable count fills the gap exactly.
    for i in range(len(lines)):
        if line_syl_counts[i] < syl_per_line[i]:
            target = syl_per_line[i] - line_syl_counts[i]
            for word in generated_text:
                if syll_count(word) == target:
                    # Fixed: the original appended to a loop-local string,
                    # so the line was never actually updated.
                    lines[i] += word
                    break
    return "\n".join(lines)
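# Hedged sketch of the syll_count helper the function above assumes; it is
# not defined anywhere in this collection. This version uses CMUdict with a
# vowel-group fallback for out-of-vocabulary words, which may differ from
# the original helper.
from nltk.corpus import cmudict

_pron = cmudict.dict()  # requires: nltk.download('cmudict')

def syll_count(word):
    word = word.lower().strip(".,!?;:\"'")
    if word in _pron:
        # Vowel phonemes carry a stress digit (e.g. 'AH0'), so counting
        # digit-terminated phonemes counts syllables.
        return sum(1 for ph in _pron[word][0] if ph[-1].isdigit())
    # Fallback heuristic: count runs of vowel letters.
    count, prev_vowel = 0, False
    for ch in word:
        is_vowel = ch in "aeiouy"
        if is_vowel and not prev_vowel:
            count += 1
        prev_vowel = is_vowel
    return max(count, 1)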
def __init__(self, dataset, capitalize=False):
    self.capitalize = capitalize
    tweets = dataset.split("\n")
    words = []
    for tweet in tweets:
        # Skip retweets and anything mentioning another user.
        if "@" in tweet or tweet.startswith("RT"):
            continue
        words += [word for word in tweet.split()
                  if word[0] not in ["@", "#"]
                  and "http://" not in word
                  and "https://" not in word]
    self.words = words
    self.model = nltk.Text(words)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    self._ngram_model = NgramModel(2, self.model, estimator=estimator)
from nltk.model import NgramModel


def language_ngrams_tags(n, training):
    """Builds one n-gram model per language over POS-tag sequences."""
    language_ngrams = {}
    languages = {}
    for language in LANGUAGES:
        languages[language] = []
    for comment, language in training:
        # Flatten the comment's statements into a single tag sequence.
        tags_of_a_comment = [
            tag for statement in comment for word, tag in statement
        ]
        languages[language].extend(tags_of_a_comment)
    for language in LANGUAGES:
        # LANGUAGES and _estimator are assumed to be module-level globals.
        language_ngrams[language] = NgramModel(n, languages[language], _estimator)
    return language_ngrams
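# Hedged sketch of how the per-language models returned above might be used
# for classification, not part of the original code: score a new tag sequence
# against each model and pick the language whose model assigns the lowest
# cross-entropy. Assumes the NLTK 2.x NgramModel.entropy(text) method.
def classify_language(language_ngrams, tag_sequence):
    return min(language_ngrams,
               key=lambda lang: language_ngrams[lang].entropy(tag_sequence))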
import random
import re

import nltk
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist


class Generator:

    def __init__(self, dataset, capitalize=False):
        self.capitalize = capitalize
        tweets = dataset.split("\n")
        words = []
        for tweet in tweets:
            # Skip retweets and anything mentioning another user.
            if "@" in tweet or tweet.startswith("RT"):
                continue
            words += [word for word in tweet.split()
                      if word[0] not in ["@", "#"]
                      and "http://" not in word
                      and "https://" not in word]
        self.words = words
        self.model = nltk.Text(words)
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._ngram_model = NgramModel(2, self.model, estimator=estimator)

    def raw_words(self, length=100):
        """Generates a list of words using an NLTK NgramModel."""
        return self._ngram_model.generate(length, [random.choice(self.words)])[1:]

    def smart_trim(self, genwords):
        """Trims to tweet size while attempting to respect sentence boundaries."""
        new_words = genwords[:]
        # Cleverly trim to tweet size: pop words until the text is short
        # enough and ends in sentence-final punctuation.
        stoppers = r'[.?!]'
        while True:
            short_enough = sum(len(word) + 1 for word in new_words) < 140
            if short_enough and re.search(stoppers, new_words[-1]):
                break
            if len(new_words) <= 1:
                # No sentence boundary found; fall back to the full text.
                new_words = genwords[:]
                break
            new_words.pop()
        # Proper sentence markings: capitalize the first word and any word
        # that follows sentence-final punctuation.
        for i, word in enumerate(new_words):
            if i == 0 or re.search(stoppers, new_words[i - 1][-1]):
                new_words[i] = word.capitalize()
        return new_words

    def tweetworthy(self):
        """Generate some tweetable text."""
        genwords = self.raw_words()
        if self.capitalize:
            genwords = self.smart_trim(genwords)
        while len(genwords) > 1 and sum(len(word) + 1 for word in genwords) > 140:
            genwords.pop()
        if self.capitalize:
            genwords[-1] += random.choice(['.', '!', '?'])
        product = " ".join(genwords)
        if len(product) > 140:
            product = product[:140]
        # Remove mismatched enclosures.
        for pair in [['(', ')'], ['{', '}'], ['[', ']']]:
            if product.count(pair[0]) != product.count(pair[1]):
                product = product.replace(pair[0], '').replace(pair[1], '')
        for enc in ['"', '*']:
            if product.count(enc) % 2 != 0:
                product = product.replace(enc, '')
        return product
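# Hedged usage sketch for the Generator class above, not part of the original
# code: 'tweets.txt' is a placeholder for a file with one tweet per line, and
# an NLTK 2.x install is assumed.
with open('tweets.txt') as f:
    gen = Generator(f.read(), capitalize=True)

print(gen.tweetworthy())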
def raw_words(self, length=100):
    """Generates a list of words using an NLTK NgramModel."""
    # Build the bigram model lazily on first call (NLTK 2.x API).
    if not hasattr(self, '_ngram_model'):
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._ngram_model = NgramModel(2, self.model, estimator)
    # Seed generation with a random corpus word; drop the seed from the output.
    return self._ngram_model.generate(length, [random.choice(self.words)])[1:]
import os
import random
import re

import nltk
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist


class Generator:

    def __init__(self):
        # TOPDIR is assumed to be defined at module level.
        dataset = open(os.path.join(TOPDIR, 'dataset.txt')).read()
        # Keep only words made of plain printable characters.
        words = [
            word for word in dataset.split()
            if re.match(r'[a-zA-Z0-9 \.,?:\'"!_\(\)]+', word)
        ]
        self.words = words
        self.model = nltk.Text(words)

    def raw_words(self, length=100):
        """Generates a list of words using an NLTK NgramModel."""
        # Build the bigram model lazily on first call.
        if not hasattr(self, '_ngram_model'):
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._ngram_model = NgramModel(2, self.model, estimator=estimator)
        return self._ngram_model.generate(length, [random.choice(self.words)])[1:]

    def smart_trim(self, genwords):
        """Trims to tweet size while attempting to respect sentence boundaries."""
        new_words = genwords[:]
        # Cleverly trim to tweet size: pop words until the text is short
        # enough and ends in sentence-final punctuation.
        stoppers = r'[.?!]'
        while True:
            short_enough = sum(len(word) + 1 for word in new_words) < 140
            if short_enough and re.search(stoppers, new_words[-1]):
                break
            if len(new_words) <= 1:
                # No sentence boundary found; fall back to the full text.
                new_words = genwords[:]
                break
            new_words.pop()
        # Proper sentence markings: capitalize the first word and any word
        # that follows sentence-final punctuation.
        for i, word in enumerate(new_words):
            if i == 0 or re.search(stoppers, new_words[i - 1][-1]):
                new_words[i] = word.capitalize()
        return new_words

    def tweetworthy(self):
        """Generate some tweetable text."""
        genwords = self.raw_words()
        genwords = self.smart_trim(genwords)
        while len(genwords) > 1 and sum(len(word) + 1 for word in genwords) > 140:
            genwords.pop()
        # genwords[-1] += random.choice(['.', '!', '?'])
        product = " ".join(genwords)
        if len(product) > 140:
            product = product[:140]
        # Remove mismatched enclosures.
        for pair in [['(', ')'], ['{', '}'], ['[', ']']]:
            if product.count(pair[0]) != product.count(pair[1]):
                product = product.replace(pair[0], '').replace(pair[1], '')
        for enc in ['"', '*']:
            if product.count(enc) % 2 != 0:
                product = product.replace(enc, '')
        return product
class Generator:

    def __init__(self, dataset, capitalize=False):
        self.capitalize = capitalize
        tweets = dataset.split("\n")
        words = []
        for tweet in tweets:
            # Skip retweets and anything mentioning another user.
            if "@" in tweet or tweet.startswith("RT"):
                continue
            words += [
                word for word in tweet.split()
                if word[0] not in ["@", "#", ":", "(", ")", "2"]
                and "http://" not in word
                and "https://" not in word
            ]
        self.words = words
        self.model = nltk.Text(words)
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._ngram_model = NgramModel(2, self.model, estimator=estimator)

    def raw_words(self, length=100):
        """Generates a list of words using an NLTK NgramModel."""
        return self._ngram_model.generate(length, [random.choice(self.words)])[1:]

    def smart_trim(self, genwords):
        """Trims to tweet size while attempting to respect sentence boundaries."""
        new_words = genwords[:]
        # Cleverly trim to tweet size: pop words until the text is short
        # enough and ends in sentence-final punctuation.
        stoppers = r'[.?!]'
        while True:
            short_enough = sum(len(word) + 1 for word in new_words) < 140
            if short_enough and re.search(stoppers, new_words[-1]):
                break
            if len(new_words) <= 1:
                # No sentence boundary found; fall back to the full text.
                new_words = genwords[:]
                break
            new_words.pop()
        # Proper sentence markings: capitalize the first word and any word
        # that follows sentence-final punctuation.
        for i, word in enumerate(new_words):
            if i == 0 or re.search(stoppers, new_words[i - 1][-1]):
                new_words[i] = word.capitalize()
        return new_words

    def tweetworthy(self):
        """Generate some tweetable text."""
        genwords = self.raw_words()
        if self.capitalize:
            genwords = self.smart_trim(genwords)
        while len(genwords) > 1 and sum(len(word) + 1 for word in genwords) > 140:
            genwords.pop()
        if self.capitalize:
            genwords[-1] += random.choice(['.', '!', '?'])
        product = " ".join(genwords)
        if len(product) > 140:
            product = product[:140]
        # Remove mismatched enclosures.
        for pair in [['(', ')'], ['{', '}'], ['[', ']']]:
            if product.count(pair[0]) != product.count(pair[1]):
                product = product.replace(pair[0], '').replace(pair[1], '')
        for enc in ['"', '*']:
            if product.count(enc) % 2 != 0:
                product = product.replace(enc, '')
        return product
from nltk.model import NgramModel


def markov(n, tokenized_content):
    # Train an unsmoothed n-gram model and generate 50 words seeded with the
    # word 'Emma'. NgramModel expects the seed context as a list of words;
    # a bare string would be treated as a sequence of characters.
    content_model = NgramModel(n, tokenized_content)
    generated_content = content_model.generate(50, ['Emma'])
    return ' '.join(generated_content)
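# Note: NgramModel was removed from NLTK in the 3.0 release, so none of the
# snippets above run on a modern install. A minimal sketch of the closest
# modern equivalent using nltk.lm (NLTK >= 3.4); the seed word and sentence
# format here are illustrative assumptions, not the original author's setup.
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline


def markov_modern(n, tokenized_sentences):
    # tokenized_sentences: a list of token lists, one per sentence.
    train_data, vocab = padded_everygram_pipeline(n, tokenized_sentences)
    model = MLE(n)
    model.fit(train_data, vocab)
    # generate() takes the number of words and an optional text seed.
    return ' '.join(model.generate(50, text_seed=['Emma']))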