from collections import Counter

# Assumed external helpers, inferred from usage below (not defined here):
#   sf -- project string/syllable utilities (is_valid_url, is_twitter_user,
#         is_hashtag, nsyllables, nsyl)
#   d  -- Jaccard-distance provider (e.g. "import distance as d")
#   h  -- module providing the Haiku container class


def writehaiku(trend, tweets):
    # Print preamble
    # print "Poet0: "

    # Create list of words in tweets
    allWords = []
    for tweet in tweets:
        allWords.extend(tweet.text.split())

    invalidWords = []
    for word in allWords:
        # remove URLs, twitter users and hashtags
        if sf.is_valid_url(word) or sf.is_twitter_user(word) or sf.is_hashtag(word):
            invalidWords.append(word)
        # remove non-alpha words
        if not word.isalpha():
            invalidWords.append(word)
        # remove words shorter than four characters or longer than 25
        if len(word) < 4 or len(word) > 25:
            invalidWords.append(word)
    filteredWords = [word for word in allWords if word not in invalidWords]
    # logging.debug("Filtered wordlist is now: ")
    # logging.debug(filteredWords)

    # Get the list of unique words with their counts
    uniqueWords = Counter(filteredWords)
    # Get the most common words
    topWords = uniqueWords.most_common(5)

    # For the top filtered words, collect phrases of 2 to 6 words containing them
    phrases = []
    for n in range(2, 7):
        for word in topWords:
            idx = n - 1
            try:
                while filteredWords[idx:len(filteredWords) - n].index(word[0]) >= 0:
                    idx = filteredWords[idx:len(filteredWords) - n].index(word[0]) + idx
                    # collect every n-word window that contains this occurrence
                    for i in range(0, n):
                        phrases.append(" ".join(filteredWords[(idx - i):(idx - i + n)]))
                    idx += 1
            except ValueError:
                # no further occurrences of this word in the remaining slice
                idx = 0

    uniquePhrases = Counter(phrases)
    topPhrases = uniquePhrases.most_common(200)

    # Compute the syllable length for each phrase
    listPhrases = [list(phrase) for phrase in topPhrases]
    for phrase in listPhrases:
        phrase.append(sf.nsyllables(phrase[0]))

    # Use Jaccard distance to choose top-tweeted phrases that are not similar to one another
    Phrase1 = ""
    Phrase2 = ""
    Phrase3 = ""

    # Populate the 7-syllable line first with the top-tweeted phrase of seven syllables
    for phrase in listPhrases:
        if phrase[2] == 7 and Phrase1 == "":
            Phrase2 = phrase[0]
            break

    # Pick the most popular 5-syllable phrase whose Jaccard distance from the
    # 2nd line is at least 0.4 (i.e. with low similarity to it)
    if Phrase2 != "":
        for phrase in listPhrases:
            if phrase[2] == 5 and d.jaccard(Phrase2, phrase[0]) >= 0.4:
                if Phrase1 == "":
                    Phrase1 = phrase[0]

    # Pick the most popular 5-syllable phrase sufficiently dissimilar to both
    # the 1st and the 2nd line (Jaccard distance of at least 0.6 from each)
    if Phrase2 != "" and Phrase1 != "":
        for phrase in listPhrases:
            if phrase[2] == 5 and d.jaccard(Phrase2, phrase[0]) >= 0.6 and d.jaccard(Phrase1, phrase[0]) >= 0.6:
                if Phrase3 == "":
                    Phrase3 = phrase[0]

    myHaiku = h.Haiku()
    # Construct the haiku
    if Phrase1 != "" and Phrase2 != "" and Phrase3 != "":
        myHaiku.length = len(Phrase1) + len(Phrase2) + len(Phrase3)
        myHaiku.text = [Phrase1, Phrase2, Phrase3]
    # return [[Phrase1, Phrase2, Phrase3], len(Phrase1) + len(Phrase2) + len(Phrase3)]
    return myHaiku
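# The d.jaccard(a, b) calls above are assumed to return a Jaccard *distance*
# (0.0 for identical inputs, 1.0 for inputs with nothing in common), as
# provided e.g. by the third-party "distance" package. A minimal, purely
# illustrative stand-in over the phrases' word sets might look like this
# (hypothetical helper, not part of the original code):
def jaccard_distance(phrase_a, phrase_b):
    """Return 1 - |intersection| / |union| over the two phrases' word sets."""
    set_a, set_b = set(phrase_a.split()), set(phrase_b.split())
    if not set_a and not set_b:
        return 0.0
    return 1.0 - len(set_a & set_b) / float(len(set_a | set_b))

# Example: jaccard_distance("hello world", "hello there") == 1 - 1/3 = 0.67,
# which would clear the >= 0.4 dissimilarity threshold used above.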
def writehaiku(trend, tweets):
    # Print preamble
    # print "Poet0: "

    # Create list of words in tweets
    allWords = []
    for tweet in tweets:
        allWords.extend(tweet.text.split())

    invalidWords = []
    for word in allWords:
        # remove URLs, twitter users and hashtags
        if sf.is_valid_url(word) or sf.is_twitter_user(word) or sf.is_hashtag(word):
            invalidWords.append(word)
        # remove non-alpha words
        if not word.isalpha():
            invalidWords.append(word)
        # remove words shorter than four characters or longer than 25
        if len(word) < 4 or len(word) > 25:
            invalidWords.append(word)
    filteredWords = [word for word in allWords if word not in invalidWords]
    # logging.debug("Filtered wordlist is now: ")
    # logging.debug(filteredWords)

    # Get the list of unique words with their counts
    uniqueWords = Counter(filteredWords)
    # Get the most common words
    topWords = uniqueWords.most_common(5)

    # For the top filtered words, collect phrases of 2 to 6 words containing them
    phrases = []
    for n in range(2, 7):
        for word in topWords:
            idx = n - 1
            try:
                while filteredWords[idx:len(filteredWords) - n].index(word[0]) >= 0:
                    idx = filteredWords[idx:len(filteredWords) - n].index(word[0]) + idx
                    # collect every n-word window that contains this occurrence
                    for i in range(0, n):
                        phrases.append(" ".join(filteredWords[(idx - i):(idx - i + n)]))
                    idx += 1
            except ValueError:
                # no further occurrences of this word in the remaining slice
                idx = 0

    uniquePhrases = Counter(phrases)
    topPhrases = uniquePhrases.most_common(200)

    # Compute the syllable length for each phrase
    listPhrases = [list(phrase) for phrase in topPhrases]
    for phrase in listPhrases:
        phrase.append(sf.nsyllables(phrase[0]))

    # Select the two most commonly tweeted 5-syllable phrases and the most
    # commonly tweeted 7-syllable phrase
    Phrase1 = ""
    Phrase2 = ""
    Phrase3 = ""
    for phrase in listPhrases:
        if phrase[2] == 5:
            if Phrase1 == "":
                Phrase1 = phrase[0]
            else:
                if Phrase3 == "":
                    Phrase3 = phrase[0]
                    # stop early only once the 7-syllable line has also been found
                    if Phrase2 != "":
                        break
        if phrase[2] == 7:
            if Phrase2 == "":
                Phrase2 = phrase[0]

    myHaiku = h.Haiku()
    # Construct the haiku
    if Phrase1 != "" and Phrase2 != "" and Phrase3 != "":
        myHaiku.length = len(Phrase1) + len(Phrase2) + len(Phrase3)
        myHaiku.text = [Phrase1, Phrase2, Phrase3]
    # return [[Phrase1, Phrase2, Phrase3], len(Phrase1) + len(Phrase2) + len(Phrase3)]
    return myHaiku
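# h.Haiku() above is assumed to be a plain container with .text and .length
# attributes; nothing more is visible from this code. A minimal sketch of
# such a class (hypothetical, inferred only from how it is used here):
class Haiku(object):
    def __init__(self):
        self.text = []    # the three lines of the poem
        self.length = 0   # combined character length of the three lines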
import nltk  # for part-of-speech tagging; the "sf" helpers noted above are assumed here too


def writehaiku(trend, tweets):
    # Create list of words in tweets
    allWords = []
    for tweet in tweets:
        allWords.extend(tweet.text.split())

    invalidWords = []
    for word in allWords:
        # remove URLs, twitter users and hashtags
        if sf.is_valid_url(word) or sf.is_twitter_user(word) or sf.is_hashtag(word):
            invalidWords.append(word)
        # remove non-alpha words
        if not word.isalpha():
            invalidWords.append(word)
        # remove words shorter than four characters or longer than 25
        if len(word) < 4 or len(word) > 25:
            invalidWords.append(word)
    filteredWords = [word for word in allWords if word not in invalidWords]

    # Part-of-speech tag the remaining words
    tagged = nltk.pos_tag(filteredWords)

    # Line 1: "I am a(n) <two-syllable noun>."  (five syllables)
    line1 = ""
    for word in tagged:
        if line1 != "":
            continue
        if sf.nsyl(word[0]) == 2 and word[1] == 'NN':
            # use "an" before words starting with a vowel (or "y")
            if word[0][0].lower() in ['a', 'e', 'i', 'o', 'u', 'y']:
                line1 = "I am an " + word[0] + "."
            else:
                line1 = "I am a " + word[0] + "."

    # Line 2: "and <three-syllable proper noun> is my <one-syllable noun>."  (seven syllables)
    word2 = ""
    for word in tagged:
        if word2 != "":
            continue
        if sf.nsyl(word[0]) == 3 and word[1] == 'NNP':
            word2 = word[0]
    word3 = ""
    for word in tagged:
        if word3 != "":
            continue
        if sf.nsyl(word[0]) == 1 and word[1] == 'NN':
            word3 = word[0]
    line2 = "and {} is my {}.".format(word2, word3)

    # Line 3: "I <one-syllable verb> with the night!"  (five syllables)
    line3 = ''
    for word in tagged:
        if line3 != "":
            continue
        if sf.nsyl(word[0]) == 1 and word[1] == 'VB':
            line3 = "I {} with the night!".format(word[0])

    out = line1 + '\n' + line2 + '\n' + line3
    return out
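# sf.nsyl(word) above is assumed to return the word's syllable count. One
# common approximation (an assumption, not necessarily what sf does) uses the
# CMU Pronouncing Dictionary bundled with NLTK: count the stress-marked
# phonemes, i.e. those ending in a digit, of the first listed pronunciation.
from nltk.corpus import cmudict

_pronunciations = cmudict.dict()  # requires a one-time nltk.download('cmudict')

def count_syllables(word):
    """Hypothetical syllable counter; returns None for out-of-vocabulary words."""
    phoneme_lists = _pronunciations.get(word.lower())
    if not phoneme_lists:
        return None
    return sum(1 for phoneme in phoneme_lists[0] if phoneme[-1].isdigit())

# Example: count_syllables("poetry") == 3, count_syllables("night") == 1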