import dimod
from dwave.system.samplers import DWaveSampler
from dwave.system.composites import EmbeddingComposite
from board24 import Board
import morris
import pygame
import numpy as np
import sys

game = morris.GameState()

num_spots = 24
max_checkers = 18

our = np.zeros(num_spots, dtype=int)
enemy = np.zeros(num_spots, dtype=int)
previous_enemy = np.zeros(num_spots, dtype=int)

# QUBO weighting constants
h_const = 100
j_const = 1
constraint_const = 3
mill_constant = 0.1
anti_mill_constant = 0.5

# sample positions for testing:
#our = np.array([0,1,1,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0])
#enemy = np.array([0,0,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0,1,0,1,0,0,1,0])
#previous_enemy = enemy

b = Board()
b.board_array = our + 2 * enemy
# FOR BOARD MARKERS:
# OURS IS 1
# ENEMY IS 2
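# --- Illustration only: a minimal sketch of how constants like h_const and
# --- constraint_const could be assembled into a dimod BinaryQuadraticModel and
# --- sampled. The helper names (build_move_bqm, sample_move) and the bias
# --- scheme are assumptions for illustration, not this project's actual QUBO.
def build_move_bqm(our, enemy, h_const, constraint_const):
    n = len(our)
    linear = {}
    quadratic = {}
    for i in range(n):
        # favour empty spots; heavily penalise spots that are already occupied
        if our[i] == 0 and enemy[i] == 0:
            linear[i] = -h_const
        else:
            linear[i] = constraint_const * h_const
    for i in range(n):
        for j in range(i + 1, n):
            # pairwise penalty so that ideally exactly one spot is chosen
            quadratic[(i, j)] = 2 * h_const
    return dimod.BinaryQuadraticModel(linear, quadratic, 0.0, dimod.BINARY)


def sample_move(bqm, use_qpu=False):
    if use_qpu:
        # requires configured D-Wave API access
        sampler = EmbeddingComposite(DWaveSampler())
    else:
        # classical fallback for local testing
        sampler = dimod.SimulatedAnnealingSampler()
    return sampler.sample(bqm, num_reads=100)
# --- end illustration ---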
import random
from collections import deque

import tensorflow as tf
# legacy TF Serving exporter used below (tf.contrib, TF 1.x only)
from tensorflow.contrib.session_bundle import exporter

# DQN hyperparameters. NOTE: placeholder values for illustration only; the
# original values are not given in this excerpt.
GAMMA = 0.99            # discount factor for future rewards
OBSERVE = 1000          # timesteps to observe before training
EXPLORE = 200000        # timesteps over which epsilon is annealed
INITIAL_EPSILON = 0.1   # starting value of epsilon
FINAL_EPSILON = 0.0001  # final value of epsilon
REPLAY_MEMORY = 50000   # number of previous transitions to remember
BATCH = 32              # minibatch size


def trainNetwork(s, readout, sess):
    # define the cost function
    a = tf.placeholder("float", [None, 24])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with the emulator
    game_state = morris.GameState()

    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing
    do_nothing = np.zeros(24)
    s_t, r_0, terminal = game_state.frame_step(do_nothing)

    # saving and loading networks
    saver = tf.train.Saver()

    # merge all summaries and set up the writer
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter('morris/train', sess.graph)
    tf.global_variables_initializer().run()

    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    epsilon = INITIAL_EPSILON
    t = 0
    try:
        while "pigs" != "fly":
            # choose an action epsilon-greedily
            a_t = readout.eval(feed_dict={s: [s_t]})[0]
            if random.random() <= epsilon or t <= OBSERVE:
                a_t = np.random.rand(24)

            # scale down epsilon
            if epsilon > FINAL_EPSILON and t > OBSERVE:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

            # run the selected action and observe the next state and reward
            s_t1, r_t, terminal = game_state.frame_step(a_t)

            # store the transition in replay memory D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

            # only train once done observing
            if t > OBSERVE:
                # sample a minibatch to train on
                minibatch = random.sample(D, BATCH)

                # get the batch variables
                s_j_batch = [d[0] for d in minibatch]
                a_batch = [d[1] for d in minibatch]
                r_batch = [d[2] for d in minibatch]
                s_j1_batch = [d[3] for d in minibatch]

                y_batch = []
                readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
                for i in range(0, len(minibatch)):
                    # if terminal, the target equals the reward alone
                    if minibatch[i][4]:
                        y_batch.append(r_batch[i])
                    else:
                        # Bellman target: r + GAMMA * max_a' Q(s', a')
                        y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

                # perform gradient step
                '''
                train_step.run(feed_dict={
                    y: y_batch,
                    a: a_batch,
                    s: s_j_batch})
                '''
                if t % 100 == 0:
                    # record execution stats
                    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    summary, _ = sess.run([merged, train_step],
                                          feed_dict={y: y_batch, a: a_batch, s: s_j_batch},
                                          options=run_options,
                                          run_metadata=run_metadata)
                    train_writer.add_run_metadata(run_metadata, 'step%03d' % t)
                    train_writer.add_summary(summary, t)
                    print('Adding run metadata for', t)
                else:
                    # record a summary
                    summary, _ = sess.run([merged, train_step],
                                          feed_dict={y: y_batch, a: a_batch, s: s_j_batch})
                    train_writer.add_summary(summary, t)

            # update the old values
            s_t = s_t1
            t += 1

            # save progress every 1000 iterations (into the directory restored from above)
            if t % 1000 == 0:
                saver.save(sess, 'saved_networks/morris-dqn', global_step=t)

            # print info
            state = ""
            if t <= OBSERVE:
                state = "observe"
            elif t > OBSERVE and t <= OBSERVE + EXPLORE:
                state = "explore"
            else:
                state = "train"
            print("TIMESTEP", t, "/ STATE", state,
                  "/ EPSILON", epsilon, "/ REWARD", r_t)
    except KeyboardInterrupt:
        train_writer.close()
        export_path = 'morris/out'
        print('Exporting trained model to', export_path)
        saver = tf.train.Saver(sharded=True)
        model_exporter = exporter.Exporter(saver)
        model_exporter.init(
            sess.graph.as_graph_def(),
            named_graph_signatures={
                'inputs': exporter.generic_signature({'board': s}),
                'outputs': exporter.generic_signature({'values': readout})
            })
        model_exporter.export(export_path, tf.constant(1), sess)
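# --- Illustration only: one way trainNetwork might be driven. The single
# --- dense layer below is an assumed stand-in for the project's real network.
def main():
    sess = tf.InteractiveSession()
    s = tf.placeholder("float", [None, 24])                       # board input
    W = tf.Variable(tf.truncated_normal([24, 24], stddev=0.01))
    b_out = tf.Variable(tf.constant(0.01, shape=[24]))
    readout = tf.matmul(s, W) + b_out                             # Q-value per board spot
    tf.summary.histogram('readout', readout)                      # so merge_all() is not empty
    trainNetwork(s, readout, sess)


if __name__ == "__main__":
    main()
# --- end illustration ---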