def generateSentencesAndCheckErrors(infile, K): model, tweets = getModelAndTweetsFromFile(infile, K) res = [] for i in range(100): random_tweet = np.random.choice(tweets) tweet_start = random_tweet.split(" ")[:K-1] num_words_wanted = len(random_tweet.split(" ")) - (K-1) generated_sentence = generateSentenceFromStartingKmer(tweet_start, K, num_words_wanted, model) updated_tweet, updated_generated_sentence = match_sentence_lengths(random_tweet, generated_sentence) result = compareSentences(updated_tweet, updated_generated_sentence) if result == None: continue res.append(result) sys.stdout.flush() print res print "Average:", sum(res)/float(len(res)) return res
def generateSentencesAndCheckErrors(infile, K): model, tweets = getModelAndTweetsFromFile(infile, K) res = [] for i in range(100): random_tweet = np.random.choice(tweets) tweet_start = random_tweet.split(" ")[:K - 1] num_words_wanted = len(random_tweet.split(" ")) - (K - 1) generated_sentence = generateSentenceFromStartingKmer( tweet_start, K, num_words_wanted, model) updated_tweet, updated_generated_sentence = match_sentence_lengths( random_tweet, generated_sentence) #print "Updates", updated_tweet, updated_generated_sentence result = compareSentences(updated_tweet, updated_generated_sentence) if result == None: continue res.append(result) sys.stdout.flush() print res print "Average:", sum(res) / float(len(res)) return res
def generateTweetsFromFile(infile, K): #initialize array of tweets model, tweets = getModelAndTweetsFromFile(infile, K) starting_kmers = getStartingKMers(K-1, tweets) # make original tweet i = 0 tries = 0 while i < 100 and tries < 1000: tries += 1 #randomly pick a starting KMer sentence = starting_kmers[np.random.choice(range(len(starting_kmers)))] generated_tweet = generateSentenceFromStartingKmer(sentence,K, 20, model) # If the tweet was one of the originals, get another one. if checkIfSentenceInDatabase(generated_tweet, tweets): continue print generated_tweet i += 1 return
def generateTweetsFromFile(infile, K): #initialize array of tweets model, tweets = getModelAndTweetsFromFile(infile, K) # tweets = [] # fpath = os.path.join("../data/", infile) # #populate tweets array with contents of tweets file # with open(fpath, 'r') as tweetsfile: # csvreader = csv.reader(tweetsfile, delimiter=',') # for line in csvreader: # tweets.append(line[-1]) # #build Markov model # kmers = getAllKMers(K, tweets) # model = buildModel(kmers) starting_kmers = getStartingKMers(K - 1, tweets) #begin loop to generate n novel tweets # i = 0 tries = 0 while i < 100 and tries < 1000: tries += 1 #randomly pick a starting KMer sentence = starting_kmers[np.random.choice(range(len(starting_kmers)))] generated_tweet = generateSentenceFromStartingKmer( sentence, K, 20, model) # If the tweet was one of the originals, get another one. if checkIfSentenceInDatabase(generated_tweet, tweets): continue print generated_tweet i += 1 #print generateSentenceFromStartingKmer(sentence,K, 20, model) #print tmp #print generated_sentence #print checkIfSentenceInDatabase(sentence, tweets) # #generate rest of sentence using Markov chain # for j in range(20): # #select most recently-generated KMer # current_phrase = tuple(sentence[-K + 1:]) # #get occurrence frequencies for most recent KMer # frequencies = model[current_phrase] # #generate the next word # next_word = getNextWord(frequencies) # #if no next word, end the sentence # if next_word == None: # break # #append the next word to the sentence # sentence.append(next_word) # #flatten the sentence array into a string and print # print " ".join(sentence) return