process_batch(word_batch, curr_dict) new_stems = len(curr_dict) - start_dict_size spent_seconds = time.time() - start if new_stems >= 0: print("finished batch {} in {} seconds".format(batch_no, spent_seconds)) print("added {} entries to stem map".format(new_stems)) save_dict(curr_dict) else: log_error('stem dictionary is corrupted') raise RuntimeError if spent_seconds < INTER_BATCH_SLEEP_SECONDS: time.sleep(INTER_BATCH_SLEEP_SECONDS - spent_seconds) batch_no += 1 if test and batch_no >= n_loops: break return curr_dict if __name__ == "__main__": lyrics, unique_words = get_lyrics() # print(len(lyrics)) # print(lyrics[0]) # print("*"*10) # print(unique_words[0]) # print("...") # print(unique_words[-1]) stem_dict = build_stem_dict() get_counts(lyrics, stem_dict)
def init():
    """Call ``get_lyrics()`` once and discard whatever it returns.

    Returns:
        None.
    """
    get_lyrics()
return [(c, cnt / s) for c, cnt in counter.iteritems()] outlm = {hist: normalize(chars) for hist, chars in lm.iteritems()} return outlm def generate_text(lm, order, nletters=1000): history = random.choice(lm.items())[0] out = [] for i in xrange(nletters): c = generate_letter(lm, history, order) history = history[-order:] + c out.append(c) return "".join(out) def generate_letter(lm, history, order): history = history[-order:] dist = lm[history] x = random.random() for c, v in dist: x = x - v if x <= 0: return c artist = raw_input('Enter Artist: ') order = int(raw_input('Enter character-level n-gram order: ')) (lyrics, lines, bow, line_endings) = scrape_lyrics.get_lyrics(artist) lm = train_char_lm(lyrics, order) print generate_text(lm, order)