import os

from pymarkovchain import MarkovChain


def main():
    # genre, lines, genre_dict, and fill_song_pd() are defined elsewhere
    # in this script (a hypothetical sketch follows below)
    df = fill_song_pd()
    lyrics = ""  # going to be one huge string
    db_name = './markov/' + genre
    mc = MarkovChain(db_name)
    # create a new Markov dataset if it doesn't exist yet
    if not os.path.isfile(db_name):
        print("creating new data set based on the " + str(genre) + " genre...")
        for index, row in df.iterrows():
            if row['genre'] == genre_dict[genre]:
                lyrics += row["lyrics"] + " "
        mc.generateDatabase(lyrics)
        mc.dumpdb()
    for i in range(int(lines) + 1):
        print(mc.generateString())
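# The snippet above references names defined elsewhere in its script
# (genre, lines, genre_dict, fill_song_pd). A minimal sketch of that
# scaffolding, assuming the lyrics live in a CSV; every name and the
# 'lyrics.csv' path below are hypothetical.
import pandas as pd

genre = 'rock'                 # hypothetical: chosen genre, e.g. from a CLI flag
lines = '5'                    # hypothetical: number of lines to generate
genre_dict = {'rock': 'Rock'}  # hypothetical: maps genre names to dataset labels

def fill_song_pd():
    # Hypothetical loader; assumes a CSV with 'genre' and 'lyrics' columns.
    return pd.read_csv('lyrics.csv')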
import os

from pymarkovchain import MarkovChain


def generate_database():
    """
    Generates the database that the Markov Chain will use to make its
    word-by-word predictions. It will attempt to create this file in the
    same directory as where the script is currently located.
    """
    currpath = os.path.dirname(__file__)
    path_to_data = os.path.join(currpath, 'in.txt')
    chain = MarkovChain()
    with open(path_to_data) as f:
        chain.generateDatabase(f.read())
    chain.dumpdb()
    print(chain.generateString())
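# Because dumpdb() persists the chain, a later run can skip retraining by
# constructing MarkovChain with the saved database path, as the other
# snippets in this section do. A minimal round-trip sketch; the
# './markov_db' path is an assumption.
from pymarkovchain import MarkovChain

chain = MarkovChain('./markov_db')  # assumed path
chain.generateDatabase('the quick brown fox. the lazy dog sleeps.')
chain.dumpdb()

# A later construction reloads the dumped database instead of retraining.
chain = MarkovChain('./markov_db')
print(chain.generateString())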
import os
import sys

from pymarkovchain import MarkovChain


def main():
    args = parser.parse_args()  # parser is the module-level argparse parser
    dirname = os.path.split(__file__)[0]
    filename = os.path.join(dirname, "phil.txt")
    title_filename = os.path.join(dirname, "phil_titles.txt")
    dbname1 = "database.pkl"
    dbname2 = "database_title.pkl"
    new_db = not os.path.exists(dbname1)
    body_maker = MarkovChain(dbname1)
    title_maker = MarkovChain(dbname2)
    if new_db:
        title_maker.generateDatabase(open(title_filename).read())
        title_maker.dumpdb()
        body_maker.generateDatabase(open(filename).read())
        body_maker.dumpdb()
    name = title_maker.generateString()
    body = ' '.join([body_maker.generateString() + '.' for i in range(3)])
    if args.repo:
        if args.token:
            token = args.token
        else:
            token_filename = os.path.join(dirname, "token.txt")
            if not os.path.exists(token_filename):
                sys.stderr.write("Please either specify --token=XXX on the "
                                 "command line or put a GitHub API token in token.txt\n")
                sys.stderr.write("You can generate a token here: "
                                 "https://github.com/settings/tokens\n")
                sys.exit(1)
            token = open(token_filename).read().strip()
        import github
        gh = github.Github(token)
        user = gh.get_user()
        repo = user.get_repo(args.repo)
        issue = repo.create_issue(title=name, body=body)
        print(issue.html_url)
    else:
        print()
        print(name)
        print("-" * len(name))
        print(body)
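# main() reads a module-level parser; a plausible argparse definition,
# assuming only the two options the code actually consumes (--repo and
# --token).
import argparse

parser = argparse.ArgumentParser(
    description="Generate a philosophical title and body, optionally "
                "filing them as an issue on a GitHub repo.")
parser.add_argument('--repo', help="GitHub repo to file the issue on")
parser.add_argument('--token', help="GitHub API token (falls back to token.txt)")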
# This fragment begins mid-statement; the database name is a hash of the
# artist name (hashlib.md5 is an assumption for the truncated call).
db_name_hashed = hashlib.md5(
    artist_name.lower().encode('utf-8')).hexdigest()
mc = MarkovChain(db_name_hashed)
# Check whether the database already exists; if so, use the cache instead
# of making another API call
if not os.path.isfile(db_name_hashed):
    print("No data cached. Please be patient while we search the lyrics of %s."
          % artist_name)
    # Accumulate all lyrics into one giant string
    lyrics = ''
    # Parse each lyric from this artist.
    # [http://api.wikia.com/wiki/LyricWiki_API]
    # (params for this first request is defined earlier in the script)
    artist = requests.get(API_URI, params=params).json()
    for album in artist['albums']:
        for song in album['songs']:
            params = {'artist': artist_name, 'song': song}
            print("Parsing \"{}\" from Wikia.".format(song))
            response = requests.get(API_URI, params=params).json()["lyrics"]
            lyrics += response.replace('[...]', '') + ' '
    # Generate the database
    mc.generateDatabase(lyrics)
    mc.dumpdb()
# Print the requested number of phrases
for i in range(int(number_of_phrases)):
    print(mc.generateString())
if not os.path.isfile(DB_FILE):
    # Handle common user errors
    if not os.path.isfile(SOURCE_FILE):
        if os.path.isfile(DB_FILE + '.7z'):
            sys.exit("NOTICE: Please extract the archive containing the Markov database before use.")
        sys.exit("NOTICE: You can't regenerate the Markov database without the source text.")
    # Moving this in here avoids an annoying warning message if either of the
    # above two sys.exit() calls would be triggered
    mc = MarkovChain(DB_FILE)
    # Generate the database
    with open(SOURCE_FILE, 'r') as f:
        mc.generateDatabase(f.read(), sentenceSep='[.!?"\n]', n=2)
    mc.dumpdb()
else:
    mc = MarkovChain(DB_FILE)


def generate_string(max_length):
    # Generate the string
    # We could be a bit smarter about this, but it works fairly well
    gen_string = ''
    short_counter = 0
    while len(gen_string) < max_length:
        new_str = mc.generateString().strip()
        new_str = re.sub(r' , ?', ', ', new_str)
        # Too short or too long to be meaningful
        if len(new_str) < 4 or len(new_str) > 100:
            continue
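# The generateDatabase() call above passes PyMarkovChain's two tuning knobs:
# sentenceSep, a regex character class used to split the sample into
# sentences, and n, the n-gram window size. A small sketch of the tradeoff,
# assuming the same corpus file; larger n tracks the source phrasing more
# closely, smaller n rambles more. The paths here are assumptions.
from pymarkovchain import MarkovChain

with open('source.txt') as f:  # assumed corpus file
    sample = f.read()

loose = MarkovChain('./db_n1')
loose.generateDatabase(sample, sentenceSep='[.!?"\n]', n=1)

tight = MarkovChain('./db_n2')
tight.generateDatabase(sample, sentenceSep='[.!?"\n]', n=2)

print(loose.generateString())  # freer, less coherent output
print(tight.generateString())  # closer to the source phrasing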
if "//" in t: continue if "cw: " in t: continue # Prune short tags if ARGS.prune and len(t) <= 3: continue # Tags which are just numbers should not be in the corpus try: int(t.strip()) continue except ValueError: pass if ARGS.nohash: CORPUS += t + " " else: CORPUS += '#' + t + " " CORPUS += "\n" if ARGS.debug: print(CORPUS) exit(1) print("Generating database...") BOT = MarkovChain(TARGET_FILE) BOT.generateDatabase(CORPUS) print("Dumping database to {}".format(TARGET_FILE)) BOT.dumpdb()