Example #1
def make_mujoco_env(env_id, seed):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed + 10000 * rank)
    env = gym.make(env_id)
    logger.configure()
    env = Monitor(env, os.path.join(logger.get_dir(), str(rank)))
    env.seed(seed)
    return env
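A minimal usage sketch, assuming OpenAI Baselines, mpi4py, and a MuJoCo-enabled Gym are installed and that the names referenced above (os, gym, MPI, logger, Monitor, set_global_seeds) are imported; the environment id and the old 4-tuple step API are only illustrative:

env = make_mujoco_env("HalfCheetah-v2", seed=0)
ob = env.reset()
for _ in range(100):
    # random actions just to exercise the monitored env
    ob, reward, done, info = env.step(env.action_space.sample())
    if done:
        ob = env.reset()
env.close()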
Example #2
 def _thunk():
     env = make_atari(env_id)
     env.seed(seed + rank)
     env = Monitor(
         env,
         logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
     return wrap_deepmind(env, **wrapper_kwargs)
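The _thunk closure above is the usual Baselines pattern for building one environment factory per worker. A hedged sketch of how such thunks are typically consumed follows; make_atari_vec_env is an illustrative name, and the sketch assumes make_atari, wrap_deepmind, Monitor, logger, and os are imported as in the snippet and that Baselines' SubprocVecEnv is available.

from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_atari_vec_env(env_id, num_env, seed, wrapper_kwargs=None):
    wrapper_kwargs = wrapper_kwargs or {}

    def make_env(rank):  # one factory per worker process
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)
        return _thunk

    # each thunk is called inside its own subprocess
    return SubprocVecEnv([make_env(i) for i in range(num_env)])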
Example #3
def main():

    statsd_client = StatsClient(statsd_host, statsd_port, prefix="wifi.parse.data")
    statsd_client.gauge("WA_SOURCE_FJ_1001.success", 0)
    statsd_client.gauge("WA_SOURCE_FJ_1001.failed", 0)
    statsd_client.gauge("WA_BASIC_FJ_1003.success", 0)
    statsd_client.gauge("WA_BASIC_FJ_1003.failed", 0)
    statsd_client.gauge("file.failed", 0)
    entries = os.listdir(config["monitor_path"])  # list every file and directory under the monitored folder
    for i in entries:
        com_path = os.path.join(config["monitor_path"], i)
        Monitor(stastd=statsd_client, zipinfo="True").operate_change(com_path)
    event_handler = Monitor(stastd=statsd_client)
    observer = Observer()
    observer.schedule(event_handler, path=config["monitor_path"], recursive=True)  # watch subdirectories recursively
    observer.start()
    observer.join()
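Note that observer.join() blocks until the observer thread stops, which only happens once observer.stop() is called. A hedged variant of the shutdown handling, using watchdog's standard Observer API (config and event_handler are the objects from the snippet above):

import time
from watchdog.observers import Observer

observer = Observer()
observer.schedule(event_handler, path=config["monitor_path"], recursive=True)
observer.start()
try:
    while True:          # keep the main thread alive while watching
        time.sleep(1)
except KeyboardInterrupt:
    observer.stop()      # lets join() return cleanly
observer.join()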
Example #4
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for Gym's goal-based (MuJoCo) robotics tasks.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    env = Monitor(env,
                  logger.get_dir()
                  and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success', ))
    env.seed(seed)
    return env
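The info_keywords argument above tells the Baselines Monitor to copy those keys from the final info dict of each episode into its episode log, next to reward and length. A minimal sketch, assuming a Gym robotics environment is installed; the environment id is only illustrative:

env = make_robotics_env("FetchReach-v1", seed=0)
ob = env.reset()
done = False
while not done:
    ob, reward, done, info = env.step(env.action_space.sample())
# the episode record written by Monitor now includes the 'is_success' flag
env.close()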
Example #5
    def test(self):

        mon = Gdk.Display.get_monitor(Gdk.Display.get_default(), 0)
        monitor = Monitor.from_monitor(mon)

        hwinfo = HWinfo()

        hwinfo_data = hwinfo.dmi_load()
        manufacturer = hwinfo_data['sys_vendor'].lower()
        model = hwinfo_data['product_name'].lower()

        expected_width = self.expected_width
        expected_height = self.expected_height

        if monitor.width != expected_width or monitor.height != expected_height:
            self.fail(
                "Internal display did not match expected resolution, expected: "
                "{0}x{1} got: {2}x{3}".format(expected_width, expected_height,
                                              monitor.width, monitor.height))
Example #6
def main(_):
    # create visualizer
    #visualizer = TensorboardVisualizer()
    monitor = Monitor(FLAGS)
    #log_dir = monitor.log_dir
    #visualizer.initialize(log_dir, None)
    saved_mean_reward = None
    # openAI logger
    L.configure(monitor.log_dir, format_strs=['stdout', 'csv'])

    # initialize env
    atari_env = AtariEnv(monitor)
    #screen_shot_subgoal(atari_env)

    # we should probably follow deepmind style env
    # stack 4 frames and scale float
    env = wrapper.wrap_deepmind(atari_env, frame_stack=True, scale=True)

    # get default tf_session
    sess = U.get_session()

    # create q networks for controller
    controller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    controller_network = Q_network(env.observation_space, env.action_space.n, controller_optimizer, scope='controller')
    controller = Controller(controller_network, env.action_space.n)

    # create q networks for meta-controller
    num_goals = env.unwrapped.goals_space.n
    metacontroller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    metacontroller_network = Q_network(env.observation_space, num_goals, metacontroller_optimizer, scope='metacontroller')
    metacontroller = MetaController(metacontroller_network, num_goals)
    # Create the schedule for exploration starting from 1.
    exploration2 = LinearSchedule(schedule_timesteps=int(EXPLORATION_FRACTION * monitor.num_timesteps),
                                 initial_p=1.0,
                                 final_p=EXPLORATION_FINAL_EPS)
    # initialize experience replay
    controller_replay_buffer = ReplayBuffer(D1_MEMORY_SIZE)
    metacontroller_replay_buffer = ReplayBuffer(D2_MEMORY_SIZE)
    
    # initialize critic
    critic = Critic(env.unwrapped)

    total_extrinsic_reward = []
    # for success rate
    total_goal_reached = np.zeros(num_goals, dtype=np.int32) 
    total_goal_sampled = np.zeros(num_goals, dtype=np.int32)
    total_goal_epsilon = np.ones(num_goals, dtype=np.float32)
    ep = 0
    total_step = 0
    init_ob = env.reset()

    U.initialize()
    # initialize target network in both controller and meta
    sess.run(metacontroller.network.update_target_op)
    sess.run(controller.network.update_target_op)

    # load checkpoint if present
    model_path = tf.train.latest_checkpoint(monitor.ckpt_dir)
    model_saved = False
    model_file = os.path.join(monitor.ckpt_dir, 'model')
    if model_path is not None:
        U.load_variables(model_file)
        L.log('loaded model from %s' % model_file)
        model_saved = True

    while ep < MAX_EPISODE:  # iterate over episodes
        # init environment game play variables
        
        init_ob = env.reset()
        observation = np.reshape(init_ob['observation'], (1, )+init_ob['observation'].shape)
        desired_goal = metacontroller.sample_act(sess, observation, update_eps=1.0)[0]
        env.unwrapped.desired_goal = desired_goal
        total_goal_sampled[desired_goal] += 1

        # given predicted goal, we encode this goal bounding mask to the observation np array
        ob_with_g = env.unwrapped._add_goal_mask(init_ob['observation'], desired_goal)

        # NOTE: the commented-out code below verifies the goal mask was added correctly
        # for i in range(ob_with_g.shape[-1]):
        #     ob = ob_with_g[:,:,i]
        #     image = Image.fromarray(ob)
        #     image = image.convert('RGB')
        #     image.save('test_%i.png' % i)

        done = False
        reached_goal = False

        while not done:
            extrinsic_rewards = 0
            s0 = init_ob['observation']

            while not (done or reached_goal):
                update_eps1_with_respect_to_g = get_epsilon(total_goal_epsilon, total_goal_reached, total_goal_sampled, desired_goal, total_step, EXPLORATION_WARM_UP)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape)
                primitive_action_t = controller.sample_act(sess, ob_with_g_reshaped, update_eps=update_eps1_with_respect_to_g)[0]
                # obtain extrinsic reward from environment
                ob_tp1, extrinsic_reward_t, done_t, info = env.step(primitive_action_t)
                reached_goal = env.unwrapped.reached_goal(desired_goal)
                ob_with_g_tp1 = env.unwrapped._add_goal_mask(ob_tp1['observation'], desired_goal)
                
                intrinsic_reward_t = critic.criticize(desired_goal, reached_goal, primitive_action_t, done_t)
                controller_replay_buffer.add(ob_with_g, primitive_action_t, intrinsic_reward_t, ob_with_g_tp1, done_t)
                
                # sample from replay_buffer1 to train controller
                obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t = controller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                weights, batch_idxes = np.ones_like(intrinsic_rewards_t), None
                # get q estimate for tp1 as 'supervised'
                ob_with_g_tp1_reshaped = np.reshape(ob_with_g_tp1, (1, )+ob_with_g.shape)
                q_tp1 = controller.get_q(sess, ob_with_g_tp1_reshaped)[0]
                td_error = controller.train(sess, obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t, weights, q_tp1)
                # joint training: after warm-up, also sample from replay_buffer2 to train the meta-controller
                if total_step >= WARMUP_STEPS:
                    L.log('joint training has started ----- step %d' % total_step)
                    # sample from replay_buffer2 to train meta-controller
                    init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t = metacontroller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                    weights, batch_idxes = np.ones_like(extrinsic_rewards_t), None
                    # get q estimate for tp1 as 'supervised'
                    obs_terminate_in_g_reshaped = np.reshape(obs_terminate_in_g, (1, )+obs_terminate_in_g.shape)
                    q_tp1 = metacontroller.get_q(sess, obs_terminate_in_g_reshaped)[0]
                    td_error = metacontroller.train(sess, init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t, weights, q_tp1)

                if total_step % UPDATE_TARGET_NETWORK_FREQ == 0:
                    #L.log('UPDATE BOTH CONTROLLER Q NETWORKS ----- step %d', step)
                    sess.run(controller.network.update_target_op)
                    # it's fine; we aren't really training the meta DQN until after the warm-up steps.
                    sess.run(metacontroller.network.update_target_op)

                extrinsic_rewards += extrinsic_reward_t
                ob_with_g = ob_with_g_tp1
                done = done_t
                total_step += 1
            # we are done / reached_goal
            # store transitions of init_ob, goal, all the extrinsic rewards, current ob in D2
            # print("ep %d : step %d, goal extrinsic total %d" % (ep, step, extrinsic_rewards))
            # clean observation without goal encoded
            metacontroller_replay_buffer.add(init_ob['observation'], desired_goal, extrinsic_rewards, ob_tp1['observation'], done)

            # if the episode is not over, we must have reached the desired goal
            if not done:
                #print("ep %d : goal %d reached, not yet done, extrinsic %d" % (ep, desired_goal, extrinsic_rewards))
                exploration_ep = 1.0
                total_goal_reached[env.unwrapped.achieved_goal] += 1
                if total_step >= WARMUP_STEPS:
                    t = total_step - WARMUP_STEPS
                    exploration_ep = exploration2.value(t)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape)
                
                while env.unwrapped.achieved_goal == desired_goal:
                    desired_goal = metacontroller.sample_act(sess, ob_with_g_reshaped, update_eps=exploration_ep)[0]

                env.unwrapped.desired_goal = desired_goal
                total_goal_sampled[desired_goal] += 1
                L.log('ep %d : achieved goal was %d ----- new goal --- %d' % (ep, env.unwrapped.achieved_goal, desired_goal))

                # start again
                reached_goal = False
        
        # finish an episode
        total_extrinsic_reward.append(extrinsic_rewards)
        ep += 1

        mean_100ep_reward = round(np.mean(total_extrinsic_reward[-101:-1]), 1)
        if ep % monitor.print_freq == 0 :
            L.record_tabular("steps", total_step)
            L.record_tabular("episodes", ep)
            L.record_tabular("mean 100 episode reward", mean_100ep_reward)
            L.dump_tabular()

        if total_step % monitor.ckpt_freq == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".format(
                    saved_mean_reward, mean_100ep_reward))
                U.save_variables(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward
    
    # restore the best saved model, if any
    if model_saved:
        L.log('restoring model with mean reward: %.1f' % saved_mean_reward)
        U.load_variables(model_file)
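As an aside, exploration2 above is Baselines' LinearSchedule: value(t) interpolates linearly from initial_p to final_p over schedule_timesteps and then stays at final_p. A small illustration with arbitrary numbers:

from baselines.common.schedules import LinearSchedule

sched = LinearSchedule(schedule_timesteps=1000, initial_p=1.0, final_p=0.02)
print(sched.value(0))     # 1.0
print(sched.value(500))   # ~0.51, halfway between 1.0 and 0.02
print(sched.value(5000))  # 0.02, clamped once the schedule is exhausted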
Example #7
def train(env, agent, args):
    monitor = Monitor(train=True, spec="-{}".format(args.method))
    monitor.init_log(args.log, "roi_m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
    env.reset()

    S = set()

    corWs = queue.Queue()

    # add two extreme points
    corWs.put(FloatTensor([1.0, 0.0]))
    corWs.put(FloatTensor([0.0, 1.0]))

    # outer_loop!
    for _ in range(args.ws):

        print(colored("size of corWs: {}".format(corWs.qsize()), "green"))

        if corWs.qsize() == 0:
            corWs.put(FloatTensor([1.0, 0.0]))
            corWs.put(FloatTensor([0.0, 1.0]))

        corner_w = corWs.get_nowait()
        while not is_corner(corner_w, S) and corWs.qsize() > 0:
            corner_w = corWs.get_nowait()
            print(colored("{} left....".format(corWs.qsize()), "green"))
        if not is_corner(corner_w, S):
            print(colored("no more corner w...", "green"))
            print(colored("Final S contains", "green"))
            for s in S:
                print(colored(s, "green"))
            break
        print(colored("solve for w: {}".format(corner_w), "green"))

        for num_eps in range(int(args.episode_num / args.ws)):
            terminal = False
            env.reset()
            loss = 0
            cnt = 0
            tot_reward = 0

            tot_reward_mo = 0

            probe = None
            if args.env_name == "dst":
                probe = corner_w
            elif args.env_name in ['ft', 'ft5', 'ft7']:
                probe = FloatTensor([0.8, 0.2, 0.0, 0.0, 0.0, 0.0])

            while not terminal:
                state = env.observe()
                action = agent.act(state, corner_w)
                agent.w_kept = corner_w
                next_state, reward, terminal = env.step(action)
                if args.log:
                    monitor.add_log(state, action, reward, terminal, agent.w_kept)
                agent.memorize(state, action, next_state, reward, terminal, roi=True)
                loss += agent.learn(corner_w)
                if cnt > 100:
                    terminal = True
                    agent.reset()
                tot_reward = tot_reward + (probe.cpu().numpy().dot(reward)) * np.power(args.gamma, cnt)

                tot_reward_mo = tot_reward_mo + reward * np.power(args.gamma, cnt)

                cnt = cnt + 1

            _, q = agent.predict(probe)

            if args.env_name == "dst":
                act_1 = q[0, 3]
                act_2 = q[0, 1]
            elif args.env_name in ['ft', 'ft5', 'ft7']:
                act_1 = q[0, 1]
                act_2 = q[0, 0]

            if args.method == "crl-naive":
                act_1 = act_1.data.cpu()
                act_2 = act_2.data.cpu()
            elif args.method == "crl-envelope":
                act_1 = probe.dot(act_1.data)
                act_2 = probe.dot(act_2.data)
            elif args.method == "crl-energy":
                act_1 = probe.dot(act_1.data)
                act_2 = probe.dot(act_2.data)
            print("end of eps %d with total reward (1) %0.2f (%0.2f, %0.2f), the Q is %0.2f | %0.2f; loss: %0.4f" % (
                num_eps,
                tot_reward,
                tot_reward_mo[0],
                tot_reward_mo[1],
                act_1,
                act_2,
                # q__max,
                loss / cnt))
            monitor.update(num_eps,
                           tot_reward,
                           act_1,
                           act_2,
                           #    q__max,
                           loss / cnt)


        # agent.is_train=False
        terminal = False
        env.reset()
        cnt = 0
        tot_reward_mo = 0
        while not terminal:
            state = env.observe()
            action = agent.act(state, corner_w)
            agent.w_kept = corner_w
            next_state, reward, terminal = env.step(action)
            if cnt > 100:
                terminal = True
                agent.reset()
            tot_reward_mo = tot_reward_mo + reward * np.power(args.gamma, cnt)
            cnt = cnt + 1
        agent.is_train=True

        S, corWs = update_ccs(S, corWs, tot_reward_mo)

        print(colored("----------------\n", "red"))
        print(colored("Current S contains", "red"))
        for s in S:
            print(colored(s, "red"))
        print(colored("----------------\n", "red"))

    # if num_eps+1 % 100 == 0:
    # 	agent.save(args.save, args.model+args.name+"_tmp_{}".format(number))
    agent.save(args.save, "roi_m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
Example #8
    def train(self):
        '''
        call this function when the episode ends
        '''
        self.episodecount += 1
        if self.monitor is None:
            self.monitor = Monitor("-" + self.algorithm)

        if not self.is_training:
            logger.info("Not in training mode")
            return
        else:
            logger.info("Update naive morl policy parameters.")

        logger.info("Episode Num so far: %s" % (self.episodecount))

        if len(self.trans_mem) > self.batch_size * 10:

            self.update_count += 1

            minibatch = self.sample(self.trans_mem, self.priority_mem,
                                    self.batch_size)
            batchify = lambda x: list(x) * self.weight_num
            state_batch = batchify(map(lambda x: x.s, minibatch))
            action_batch = batchify(map(lambda x: LongTensor([x.a]),
                                        minibatch))
            reward_batch = batchify(map(lambda x: x.r.unsqueeze(0), minibatch))
            next_state_batch = batchify(map(lambda x: x.s_, minibatch))
            terminal_batch = batchify(map(lambda x: x.d, minibatch))
            mask_batch = batchify(map(lambda x: x.ms.unsqueeze(0), minibatch))
            next_mask_batch = batchify(
                map(lambda x: x.ms_.unsqueeze(0), minibatch))

            w_batch = np.random.randn(self.weight_num, self.model_.reward_size)
            w_batch = np.abs(w_batch) / \
                      np.linalg.norm(w_batch, ord=1, axis=1, keepdims=True)
            w_batch = torch.from_numpy(w_batch.repeat(
                self.batch_size, axis=0)).type(FloatTensor)

            if self.algorithm == 'naive':
                __, Q = self.model_(Variable(torch.cat(state_batch, dim=0)),
                                    Variable(w_batch),
                                    Variable(torch.cat(mask_batch, dim=0)))
                # detach since we don't want gradients to propagate
                # HQ, _    = self.model_(Variable(torch.cat(next_state_batch, dim=0), volatile=True),
                #                     Variable(w_batch, volatile=True))
                _, DQ = self.model(
                    Variable(torch.cat(next_state_batch, dim=0),
                             requires_grad=False),
                    Variable(w_batch, requires_grad=False),
                    Variable(torch.cat(next_mask_batch, dim=0),
                             requires_grad=False))
                _, act = self.model_(
                    Variable(torch.cat(next_state_batch, dim=0),
                             requires_grad=False),
                    Variable(w_batch, requires_grad=False),
                    Variable(torch.cat(next_mask_batch, dim=0),
                             requires_grad=False))[1].max(1)
                HQ = DQ.gather(1, act.unsqueeze(dim=1)).squeeze()

                w_reward_batch = torch.bmm(
                    w_batch.unsqueeze(1),
                    torch.cat(reward_batch, dim=0).unsqueeze(2)).squeeze()

                nontmlmask = self.nontmlinds(terminal_batch)
                with torch.no_grad():
                    Tau_Q = Variable(
                        torch.zeros(self.batch_size *
                                    self.weight_num).type(FloatTensor))
                    Tau_Q[nontmlmask] = self.gamma * HQ[nontmlmask]
                    Tau_Q += Variable(w_reward_batch)

                actions = Variable(torch.cat(action_batch, dim=0))

                # Compute Huber loss
                loss = F.smooth_l1_loss(Q.gather(1, actions.unsqueeze(dim=1)),
                                        Tau_Q.unsqueeze(dim=1))

            elif self.algorithm == 'envelope':
                action_size = self.model_.action_size
                reward_size = self.model_.reward_size
                __, Q = self.model_(Variable(torch.cat(state_batch, dim=0)),
                                    Variable(w_batch),
                                    w_num=self.weight_num,
                                    execmask=Variable(
                                        torch.cat(mask_batch, dim=0)))

                # detach since we don't want gradients to propagate
                # HQ, _    = self.model_(Variable(torch.cat(next_state_batch, dim=0), volatile=True),
                #                     Variable(w_batch, volatile=True), w_num=self.weight_num)
                _, DQ = self.model(Variable(torch.cat(next_state_batch, dim=0),
                                            requires_grad=False),
                                   Variable(w_batch, requires_grad=False),
                                   execmask=Variable(torch.cat(next_mask_batch,
                                                               dim=0),
                                                     requires_grad=False))
                w_ext = w_batch.unsqueeze(2).repeat(1, action_size, 1)
                w_ext = w_ext.view(-1, self.model.reward_size)
                _, tmpQ = self.model_(Variable(torch.cat(next_state_batch,
                                                         dim=0),
                                               requires_grad=False),
                                      Variable(w_batch, requires_grad=False),
                                      execmask=Variable(torch.cat(
                                          next_mask_batch, dim=0),
                                                        requires_grad=False))

                tmpQ = tmpQ.view(-1, reward_size)
                # print(torch.bmm(w_ext.unsqueeze(1),
                #               tmpQ.data.unsqueeze(2)).view(-1, action_size))
                act = torch.bmm(
                    Variable(w_ext.unsqueeze(1), requires_grad=False),
                    tmpQ.unsqueeze(2)).view(-1, action_size).max(1)[1]

                HQ = DQ.gather(
                    1,
                    act.view(-1, 1, 1).expand(DQ.size(0), 1,
                                              DQ.size(2))).squeeze()

                nontmlmask = self.nontmlinds(terminal_batch)
                with torch.no_grad():
                    Tau_Q = Variable(
                        torch.zeros(self.batch_size * self.weight_num,
                                    reward_size).type(FloatTensor))
                    Tau_Q[nontmlmask] = self.gamma * HQ[nontmlmask]
                    # Tau_Q.volatile = False
                    Tau_Q += Variable(torch.cat(reward_batch, dim=0))

                actions = Variable(torch.cat(action_batch, dim=0))

                Q = Q.gather(
                    1,
                    actions.view(-1, 1,
                                 1).expand(Q.size(0), 1,
                                           Q.size(2))).view(-1, reward_size)
                Tau_Q = Tau_Q.view(-1, reward_size)

                wQ = torch.bmm(Variable(w_batch.unsqueeze(1)),
                               Q.unsqueeze(2)).squeeze()

                wTQ = torch.bmm(Variable(w_batch.unsqueeze(1)),
                                Tau_Q.unsqueeze(2)).squeeze()

                # loss = F.mse_loss(Q.view(-1), Tau_Q.view(-1))
                # print self.beta
                loss = self.beta * F.mse_loss(wQ.view(-1), wTQ.view(-1))
                loss += (1 - self.beta) * F.mse_loss(Q.view(-1),
                                                     Tau_Q.view(-1))

            self.optimizer.zero_grad()
            loss.backward()
            for param in self.model_.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()

            if self.update_count % self.update_freq == 0:
                self.model.load_state_dict(self.model_.state_dict())

            self.monitor.update(self.episodecount, loss=loss.data)

        self.savePolicyInc()  # self.out_policy_file)
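The w_batch construction in the training step above draws random preference vectors and L1-normalizes them, so every row is a nonnegative weight vector summing to 1, then repeats each row once per sampled transition. A small stand-alone illustration with arbitrary sizes:

import numpy as np

weight_num, reward_size, batch_size = 4, 2, 3
w = np.random.randn(weight_num, reward_size)
w = np.abs(w) / np.linalg.norm(w, ord=1, axis=1, keepdims=True)
print(w.sum(axis=1))              # each row sums to 1.0
w = w.repeat(batch_size, axis=0)  # same as w_batch.repeat(self.batch_size, axis=0)
print(w.shape)                    # (weight_num * batch_size, reward_size)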
Example #9
class MORLPolicy(Policy.Policy):
    '''Derived from :class:`Policy`
    '''
    def __init__(self,
                 in_policy_file,
                 out_policy_file,
                 domainString='CamRestaurants',
                 is_training=False,
                 action_names=None):
        super(MORLPolicy, self).__init__(domainString, is_training)

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []

        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.prev_state_check = None

        # parameter settings
        if 0:  # cfg.has_option('morlpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper
            self.n_in = cfg.getint('morlpolicy', 'n_in')
        else:
            self.n_in = self.get_n_in(domainString)

        self.n_rew = 1
        if cfg.has_option('morlpolicy', 'n_rew'):
            self.n_rew = cfg.getint('morlpolicy', 'n_rew')

        self.lr = 0.001
        if cfg.has_option('morlpolicy', 'learning_rate'):
            self.lr = cfg.getfloat('morlpolicy', 'learning_rate')

        self.epsilon = 0.5
        if cfg.has_option('morlpolicy', 'epsilon'):
            self.epsilon = cfg.getfloat('morlpolicy', 'epsilon')

        self.epsilon_decay = True
        if cfg.has_option('morlpolicy', 'epsilon_decay'):
            self.epsilon_decay = cfg.getboolean('morlpolicy', 'epsilon_decay')

        self.randomseed = 1234
        if cfg.has_option('GENERAL', 'seed'):
            self.randomseed = cfg.getint('GENERAL', 'seed')

        self.gamma = 1.0
        if cfg.has_option('morlpolicy', 'gamma'):
            self.gamma = cfg.getfloat('morlpolicy', 'gamma')

        self.weight_num = 32
        if cfg.has_option('morlpolicy', 'weight_num'):
            self.weight_num = cfg.getint('morlpolicy', 'weight_num')

        self.episode_num = 1000
        if cfg.has_option('morlpolicy', 'episode_num'):
            self.episode_num = cfg.getfloat('morlpolicy', 'episode_num')

        self.optimizer = "Adam"
        if cfg.has_option('morlpolicy', 'optimizer'):
            self.optimizer = cfg.get('morlpolicy', 'optimizer')

        self.save_step = 100
        if cfg.has_option('policy', 'save_step'):
            self.save_step = cfg.getint('policy', 'save_step')

        self.update_freq = 50
        if cfg.has_option('morlpolicy', 'update_freq'):
            self.update_freq = cfg.getint('morlpolicy', 'update_freq')

        self.policyfeatures = []
        if cfg.has_option('morlpolicy', 'features'):
            logger.info('Features: ' + str(cfg.get('morlpolicy', 'features')))
            self.policyfeatures = json.loads(cfg.get('morlpolicy', 'features'))

        self.algorithm = 'naive'
        if cfg.has_option('morlpolicy', 'algorithm'):
            self.algorithm = cfg.get('morlpolicy', 'algorithm')
            logger.info('Learning algorithm: ' + self.algorithm)

        self.batch_size = 32
        if cfg.has_option('morlpolicy', 'batch_size'):
            self.batch_size = cfg.getint('morlpolicy', 'batch_size')

        self.mem_size = 1000
        if cfg.has_option('morlpolicy', 'mem_size'):
            self.mem_size = cfg.getint('morlpolicy', 'mem_size')

        self.training_freq = 1
        if cfg.has_option('morlpolicy', 'training_freq'):
            self.training_freq = cfg.getint('morlpolicy', 'training_freq')

        # set beta for envelope algorithm
        self.beta = 0.1
        if cfg.has_option('morlpolicy', 'beta'):
            self.beta = cfg.getfloat('morlpolicy', 'beta')
        self.beta_init = self.beta
        self.beta_uplim = 1.00
        self.tau = 1000.
        self.beta_expbase = float(
            np.power(self.tau * (self.beta_uplim - self.beta),
                     1. / (self.episode_num + 1)))
        self.beta_delta = self.beta_expbase / self.tau
        self.beta -= self.beta_delta

        # using homotopy method for optimization
        self.homotopy = False
        if cfg.has_option('morlpolicy', 'homotopy'):
            self.homotopy = cfg.getboolean('morlpolicy', 'homotopy')

        self.epsilon_delta = (self.epsilon - 0.05) / self.episode_num

        self.episodecount = 0

        # construct the models
        self.state_dim = self.n_in
        self.summaryaction = SummaryAction.SummaryAction(domainString)
        if action_names is None:
            self.action_names = self.summaryaction.action_names
        else:
            self.action_names = action_names
        self.action_dim = len(self.action_names)
        self.stats = [0 for _ in range(self.action_dim)]
        self.reward_dim = self.n_rew

        model = None
        if self.algorithm == 'naive':
            model = naive.NaiveLinearCQN(self.state_dim, self.action_dim,
                                         self.reward_dim)
        elif self.algorithm == 'envelope':
            model = envelope.EnvelopeLinearCQN(self.state_dim, self.action_dim,
                                               self.reward_dim)

        self.model_ = model
        self.model = copy.deepcopy(model)

        # initialize memory
        self.trans_mem = deque()
        self.trans = namedtuple('trans',
                                ['s', 'a', 's_', 'r', 'd', 'ms', 'ms_'])
        self.priority_mem = deque()
        self.mem_last_state = None
        self.mem_last_action = None
        self.mem_last_mask = None
        self.mem_cur_state = None
        self.mem_cur_action = None
        self.mem_cur_mask = None

        if self.optimizer == 'Adam':
            self.optimizer = optim.Adam(self.model_.parameters(), lr=self.lr)
        elif self.optimizer == 'RMSprop':
            self.optimizer = optim.RMSprop(self.model_.parameters(),
                                           lr=self.lr)

        try:
            self.loadPolicy(self.in_policy_file)
        except Exception:
            logger.info("No previous model found...")

        self.w_kept = None
        self.update_count = 0
        if self.is_training:
            self.model_.train()
        if use_cuda:
            self.model.cuda()
            self.model_.cuda()

        self.monitor = None

    def get_n_in(self, domain_string):
        if domain_string == 'CamRestaurants':
            return 268
        elif domain_string == 'CamHotels':
            return 111
        elif domain_string == 'SFRestaurants':
            return 636
        elif domain_string == 'SFHotels':
            return 438
        elif domain_string == 'Laptops6':
            return 268  # ic340: this is wrong
        elif domain_string == 'Laptops11':
            return 257
        elif domain_string == 'TV':
            return 188
        else:
            print('DOMAIN {} SIZE NOT SPECIFIED, PLEASE DEFINE n_in'.format(
                domain_string))

    def act_on(self, state, preference=None):
        if self.lastSystemAction is None and self.startwithhello:
            systemAct, nextaIdex = 'hello()', -1
        else:
            systemAct, nextaIdex = self.nextAction(state, preference)
        self.lastSystemAction = systemAct
        self.summaryAct = nextaIdex
        self.prevbelief = state

        systemAct = DiaAct.DiaAct(systemAct)
        return systemAct

    def record(self,
               reward,
               domainInControl=None,
               weight=None,
               state=None,
               action=None):
        if domainInControl is None:
            domainInControl = self.domainString
        if self.actToBeRecorded is None:
            self.actToBeRecorded = self.summaryAct

        if state is None:
            state = self.prevbelief
        if action is None:
            action = self.actToBeRecorded
        cState, cAction = self.convertStateAction(state, action)

        execMask = self.summaryaction.getExecutableMask(state, cAction)
        execMask = torch.Tensor(execMask).type(FloatTensor)

        # # normalising total return to -1~1
        # reward /= 20.0

        self.mem_last_state = self.mem_cur_state
        self.mem_last_action = self.mem_cur_action
        self.mem_last_mask = self.mem_cur_mask
        self.mem_cur_state = np.vstack(
            [np.expand_dims(x, 0) for x in [cState]])
        # self.mem_cur_action = np.eye(self.action_dim, self.action_dim)[[cAction]]
        self.mem_cur_action = cAction
        self.mem_cur_mask = execMask

        state = self.mem_last_state
        action = self.mem_last_action
        next_state = self.mem_cur_state
        terminal = False

        if state is not None and action is not None:
            self.trans_mem.append(
                self.trans(
                    torch.from_numpy(state).type(FloatTensor),  # state
                    action,  # action
                    torch.from_numpy(next_state).type(
                        FloatTensor),  # next state
                    torch.from_numpy(reward).type(FloatTensor),  # reward
                    terminal,  # terminal
                    self.mem_last_mask,  # action mask
                    self.mem_cur_mask))  # next action mask

            # randomly produce a preference for calculating priority
            # preference = self.w_kept
            preference = torch.randn(self.model_.reward_size)
            preference = (torch.abs(preference) /
                          torch.norm(preference, p=1)).type(FloatTensor)

            state = torch.from_numpy(state).type(FloatTensor)

            _, q = self.model_(Variable(state, requires_grad=False),
                               Variable(preference.unsqueeze(0),
                                        requires_grad=False),
                               execmask=Variable(
                                   self.mem_last_mask.unsqueeze(0),
                                   requires_grad=False))

            q = q[0, action].data

            if self.algorithm == 'naive':
                wr = preference.dot(torch.from_numpy(reward).type(FloatTensor))
                if not terminal:
                    next_state = torch.from_numpy(next_state).type(FloatTensor)
                    hq, _ = self.model_(Variable(next_state,
                                                 requires_grad=False),
                                        Variable(preference.unsqueeze(0),
                                                 requires_grad=False),
                                        execmask=Variable(
                                            self.mem_cur_mask.unsqueeze(0),
                                            requires_grad=False))
                    hq = hq.data[0]
                    p = abs(wr + self.gamma * hq - q)
                else:
                    self.w_kept = None
                    # if self.epsilon_decay:
                    #     self.epsilon -= self.epsilon_delta
                    p = abs(wr - q)
            elif self.algorithm == 'envelope':
                wq = preference.dot(q)
                wr = preference.dot(torch.from_numpy(reward).type(FloatTensor))
                if not terminal:
                    next_state = torch.from_numpy(next_state).type(FloatTensor)
                    hq, _ = self.model_(Variable(next_state,
                                                 requires_grad=False),
                                        Variable(preference.unsqueeze(0),
                                                 requires_grad=False),
                                        execmask=Variable(
                                            self.mem_cur_mask.unsqueeze(0),
                                            requires_grad=False))
                    hq = hq.data[0]
                    whq = preference.dot(hq)
                    p = abs(wr + self.gamma * whq - wq)
                else:
                    self.w_kept = None
                    # if self.epsilon_decay:
                    #     self.epsilon -= self.epsilon_delta
                    # if self.homotopy:
                    #     self.beta += self.beta_delta
                    #     self.beta_delta = (self.beta - self.beta_init) * self.beta_expbase + self.beta_init - self.beta
                    p = abs(wr - wq)
            p += 1e-5

            self.priority_mem.append(p)
            if len(self.trans_mem) > self.mem_size:
                self.trans_mem.popleft()
                self.priority_mem.popleft()

        self.actToBeRecorded = None

    def finalizeRecord(self, reward, domainInControl=None):
        if domainInControl is None:
            domainInControl = self.domainString
        if self.episodes[domainInControl] is None:
            logger.warning(
                "record attempted to be finalized for domain where nothing has been recorded before"
            )
            return

        # # normalising total return to -1~1
        # reward /= 20.0

        terminal_state, terminal_action = self.convertStateAction(
            TerminalState(), TerminalAction())

        # # normalising total return to -1~1
        # reward /= 20.0

        self.mem_last_state = self.mem_cur_state
        self.mem_last_action = self.mem_cur_action
        self.mem_last_mask = self.mem_cur_mask
        self.mem_cur_state = np.vstack(
            [np.expand_dims(x, 0) for x in [terminal_state]])
        self.mem_cur_action = None
        self.mem_cur_mask = torch.zeros(self.action_dim).type(FloatTensor)

        state = self.mem_last_state
        action = self.mem_last_action
        next_state = self.mem_cur_state
        terminal = True

        if state is not None:
            self.trans_mem.append(
                self.trans(
                    torch.from_numpy(state).type(FloatTensor),  # state
                    action,  # action
                    torch.from_numpy(next_state).type(
                        FloatTensor),  # next state
                    torch.from_numpy(reward).type(FloatTensor),  # reward
                    terminal,  # terminal
                    self.mem_last_mask,  # action mask
                    self.mem_cur_mask))  # next action mask

            # randomly produce a preference for calculating priority
            # preference = self.w_kept
            preference = torch.randn(self.model_.reward_size)
            preference = (torch.abs(preference) /
                          torch.norm(preference, p=1)).type(FloatTensor)

            state = torch.from_numpy(state).type(FloatTensor)

            _, q = self.model_(
                Variable(state, requires_grad=False),
                Variable(preference.unsqueeze(0), requires_grad=False))

            q = q.data[0, action]

            if self.algorithm == 'naive':
                wr = preference.dot(torch.from_numpy(reward).type(FloatTensor))
                if not terminal:
                    next_state = torch.from_numpy(next_state).type(FloatTensor)
                    hq, _ = self.model_(
                        Variable(next_state, requires_grad=False),
                        Variable(preference.unsqueeze(0), requires_grad=False))
                    hq = hq.data[0]
                    p = abs(wr + self.gamma * hq - q)
                else:
                    self.w_kept = None
                    # if self.epsilon_decay:
                    #     self.epsilon -= self.epsilon_delta
                    p = abs(wr - q)
            elif self.algorithm == 'envelope':
                wq = preference.dot(q)
                wr = preference.dot(torch.from_numpy(reward).type(FloatTensor))
                if not terminal:
                    next_state = torch.from_numpy(next_state).type(FloatTensor)
                    hq, _ = self.model_(
                        Variable(next_state, requires_grad=False),
                        Variable(preference.unsqueeze(0), requires_grad=False))
                    hq = hq.data[0]
                    whq = preference.dot(hq)
                    p = abs(wr + self.gamma * whq - wq)
                else:
                    self.w_kept = None
                    # if self.epsilon_decay:
                    #     self.epsilon -= self.epsilon_delta
                    # if self.homotopy:
                    #     self.beta += self.beta_delta
                    #     self.beta_delta = (self.beta - self.beta_init) * self.beta_expbase + self.beta_init - self.beta
                    p = abs(wr - wq)

            p += 1e-5

            self.priority_mem.append(p)
            if len(self.trans_mem) > self.mem_size:
                self.trans_mem.popleft()
                self.priority_mem.popleft()

    def convertStateAction(self, state, action):
        '''
        nnType = 'dnn'
        #nnType = 'rnn'
        # expand one dimension to match the batch size of 1 at axis 0
        if nnType == 'rnn':
            belief = np.expand_dims(belief,axis=0)
        '''
        if isinstance(state, TerminalState):
            if self.domainUtil.domainString == 'CamRestaurants':
                return [0] * 268, action
            elif self.domainUtil.domainString == 'CamHotels':
                return [0] * 111, action
            elif self.domainUtil.domainString == 'SFRestaurants':
                return [0] * 633, action
            elif self.domainUtil.domainString == 'SFHotels':
                return [0] * 438, action
            elif self.domainUtil.domainString == 'Laptops11':
                return [0] * 257, action
            elif self.domainUtil.domainString == 'TV':
                return [0] * 188, action
        else:
            flat_belief = flatten_belief(state, self.domainUtil)
            self.prev_state_check = flat_belief

            return flat_belief, action

    def convertDIPStateAction(self, state, action):
        '''

        '''
        if isinstance(state, TerminalState):
            return [0] * 89, action

        else:
            dip_state = DIP_state(state.domainStates[state.currentdomain],
                                  self.domainString)
            action_name = self.actions.action_names[action]
            act_slot = 'general'
            for slot in dip_state.slots:
                if slot in action_name:
                    act_slot = slot
            flat_belief = dip_state.get_beliefStateVec(act_slot)
            self.prev_state_check = flat_belief

            return flat_belief, action

    def nextAction(self, beliefstate, preference=None):
        '''
        select next action

        :param beliefstate:
        :param preference:
        :returns: (int) next summary action
        '''
        beliefVec = flatten_belief(beliefstate, self.domainUtil)
        execMask = self.summaryaction.getExecutableMask(
            beliefstate, self.lastSystemAction)
        execMask = torch.Tensor(execMask).type(FloatTensor)

        if preference is None:
            if self.w_kept is None:
                self.w_kept = torch.randn(self.model_.reward_size)
                self.w_kept = (torch.abs(self.w_kept) /
                               torch.norm(self.w_kept, p=1)).type(FloatTensor)
            preference = self.w_kept

        if self.is_training and (len(self.trans_mem) < self.batch_size * 10
                                 or torch.rand(1)[0] < self.epsilon):
            admissible = [i for i, x in enumerate(execMask) if x == 0.0]
            random.shuffle(admissible)
            nextaIdex = admissible[0]
        else:
            state = np.reshape(beliefVec, (1, len(beliefVec)))
            state = torch.from_numpy(state).type(FloatTensor)
            if self.algorithm == 'naive':
                _, Q = self.model_(
                    Variable(state, requires_grad=False),
                    Variable(preference.unsqueeze(0), requires_grad=False),
                    Variable(execMask.unsqueeze(0), requires_grad=False))
                nextaIdex = np.argmax(Q.detach().cpu().numpy())
            elif self.algorithm == 'envelope':
                _, Q = self.model_(Variable(state, requires_grad=False),
                                   Variable(preference.unsqueeze(0),
                                            requires_grad=False),
                                   execmask=Variable(execMask.unsqueeze(0),
                                                     requires_grad=False))
                Q = Q.view(-1, self.model_.reward_size)
                Q = torch.mv(Q.data, preference)
                action = Q.max(0)[1].cpu().numpy()
                nextaIdex = int(action)

        self.stats[nextaIdex] += 1
        summaryAct = self.action_names[nextaIdex]
        beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
        masterAct = self.summaryaction.Convert(beliefstate, summaryAct,
                                               self.lastSystemAction)

        return masterAct, nextaIdex

    def sample(self, pop, pri, k):
        pri = np.array(pri).astype(float)
        inds = np.random.choice(range(len(pop)),
                                k,
                                replace=False,
                                p=pri / pri.sum())
        return [pop[i] for i in inds]

    def actmsk(self, num_dim, index):
        mask = ByteTensor(num_dim).zero_()
        mask[index] = 1
        return mask.unsqueeze(0)

    def nontmlinds(self, terminal_batch):
        mask = ByteTensor(terminal_batch)
        inds = torch.arange(0, len(terminal_batch)).type(LongTensor)
        inds = inds[mask.eq(0)]
        return inds

    def train(self):
        '''
        call this function when the episode ends
        '''
        self.episodecount += 1
        if self.monitor is None:
            self.monitor = Monitor("-" + self.algorithm)

        if not self.is_training:
            logger.info("Not in training mode")
            return
        else:
            logger.info("Update naive morl policy parameters.")

        logger.info("Episode Num so far: %s" % (self.episodecount))

        if len(self.trans_mem) > self.batch_size * 10:

            self.update_count += 1

            minibatch = self.sample(self.trans_mem, self.priority_mem,
                                    self.batch_size)
            batchify = lambda x: list(x) * self.weight_num
            state_batch = batchify(map(lambda x: x.s, minibatch))
            action_batch = batchify(map(lambda x: LongTensor([x.a]),
                                        minibatch))
            reward_batch = batchify(map(lambda x: x.r.unsqueeze(0), minibatch))
            next_state_batch = batchify(map(lambda x: x.s_, minibatch))
            terminal_batch = batchify(map(lambda x: x.d, minibatch))
            mask_batch = batchify(map(lambda x: x.ms.unsqueeze(0), minibatch))
            next_mask_batch = batchify(
                map(lambda x: x.ms_.unsqueeze(0), minibatch))

            w_batch = np.random.randn(self.weight_num, self.model_.reward_size)
            w_batch = np.abs(w_batch) / \
                      np.linalg.norm(w_batch, ord=1, axis=1, keepdims=True)
            w_batch = torch.from_numpy(w_batch.repeat(
                self.batch_size, axis=0)).type(FloatTensor)

            if self.algorithm == 'naive':
                __, Q = self.model_(Variable(torch.cat(state_batch, dim=0)),
                                    Variable(w_batch),
                                    Variable(torch.cat(mask_batch, dim=0)))
                # detach since we don't want gradients to propagate
                # HQ, _    = self.model_(Variable(torch.cat(next_state_batch, dim=0), volatile=True),
                #                     Variable(w_batch, volatile=True))
                _, DQ = self.model(
                    Variable(torch.cat(next_state_batch, dim=0),
                             requires_grad=False),
                    Variable(w_batch, requires_grad=False),
                    Variable(torch.cat(next_mask_batch, dim=0),
                             requires_grad=False))
                _, act = self.model_(
                    Variable(torch.cat(next_state_batch, dim=0),
                             requires_grad=False),
                    Variable(w_batch, requires_grad=False),
                    Variable(torch.cat(next_mask_batch, dim=0),
                             requires_grad=False))[1].max(1)
                HQ = DQ.gather(1, act.unsqueeze(dim=1)).squeeze()

                w_reward_batch = torch.bmm(
                    w_batch.unsqueeze(1),
                    torch.cat(reward_batch, dim=0).unsqueeze(2)).squeeze()

                nontmlmask = self.nontmlinds(terminal_batch)
                with torch.no_grad():
                    Tau_Q = Variable(
                        torch.zeros(self.batch_size *
                                    self.weight_num).type(FloatTensor))
                    Tau_Q[nontmlmask] = self.gamma * HQ[nontmlmask]
                    Tau_Q += Variable(w_reward_batch)

                actions = Variable(torch.cat(action_batch, dim=0))

                # Compute Huber loss
                loss = F.smooth_l1_loss(Q.gather(1, actions.unsqueeze(dim=1)),
                                        Tau_Q.unsqueeze(dim=1))

            elif self.algorithm == 'envelope':
                action_size = self.model_.action_size
                reward_size = self.model_.reward_size
                __, Q = self.model_(Variable(torch.cat(state_batch, dim=0)),
                                    Variable(w_batch),
                                    w_num=self.weight_num,
                                    execmask=Variable(
                                        torch.cat(mask_batch, dim=0)))

                # detach since we don't want gradients to propagate
                # HQ, _    = self.model_(Variable(torch.cat(next_state_batch, dim=0), volatile=True),
                #                     Variable(w_batch, volatile=True), w_num=self.weight_num)
                _, DQ = self.model(Variable(torch.cat(next_state_batch, dim=0),
                                            requires_grad=False),
                                   Variable(w_batch, requires_grad=False),
                                   execmask=Variable(torch.cat(next_mask_batch,
                                                               dim=0),
                                                     requires_grad=False))
                w_ext = w_batch.unsqueeze(2).repeat(1, action_size, 1)
                w_ext = w_ext.view(-1, self.model.reward_size)
                _, tmpQ = self.model_(Variable(torch.cat(next_state_batch,
                                                         dim=0),
                                               requires_grad=False),
                                      Variable(w_batch, requires_grad=False),
                                      execmask=Variable(torch.cat(
                                          next_mask_batch, dim=0),
                                                        requires_grad=False))

                tmpQ = tmpQ.view(-1, reward_size)
                # print(torch.bmm(w_ext.unsqueeze(1),
                #               tmpQ.data.unsqueeze(2)).view(-1, action_size))
                act = torch.bmm(
                    Variable(w_ext.unsqueeze(1), requires_grad=False),
                    tmpQ.unsqueeze(2)).view(-1, action_size).max(1)[1]

                HQ = DQ.gather(
                    1,
                    act.view(-1, 1, 1).expand(DQ.size(0), 1,
                                              DQ.size(2))).squeeze()

                nontmlmask = self.nontmlinds(terminal_batch)
                with torch.no_grad():
                    Tau_Q = Variable(
                        torch.zeros(self.batch_size * self.weight_num,
                                    reward_size).type(FloatTensor))
                    Tau_Q[nontmlmask] = self.gamma * HQ[nontmlmask]
                    # Tau_Q.volatile = False
                    Tau_Q += Variable(torch.cat(reward_batch, dim=0))

                actions = Variable(torch.cat(action_batch, dim=0))

                Q = Q.gather(
                    1,
                    actions.view(-1, 1,
                                 1).expand(Q.size(0), 1,
                                           Q.size(2))).view(-1, reward_size)
                Tau_Q = Tau_Q.view(-1, reward_size)

                wQ = torch.bmm(Variable(w_batch.unsqueeze(1)),
                               Q.unsqueeze(2)).squeeze()

                wTQ = torch.bmm(Variable(w_batch.unsqueeze(1)),
                                Tau_Q.unsqueeze(2)).squeeze()

                # loss = F.mse_loss(Q.view(-1), Tau_Q.view(-1))
                # print self.beta
                loss = self.beta * F.mse_loss(wQ.view(-1), wTQ.view(-1))
                loss += (1 - self.beta) * F.mse_loss(Q.view(-1),
                                                     Tau_Q.view(-1))

            self.optimizer.zero_grad()
            loss.backward()
            for param in self.model_.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()

            if self.update_count % self.update_freq == 0:
                self.model.load_state_dict(self.model_.state_dict())

            self.monitor.update(self.episodecount, loss=loss.data)

        self.savePolicyInc()  # self.out_policy_file)

    def savePolicy(self, FORCE_SAVE=False):
        """
        Not used here: the agent would call this after every episode,
        and we only want to save the policy periodically.
        """
        pass

    def savePolicyInc(self, FORCE_SAVE=False):
        """
        save model and replay buffer
        """
        if self.episodecount % self.save_step == 0:
            torch.save(
                self.model, "{}.{}.pkl".format(self.out_policy_file,
                                               self.algorithm))

    def loadPolicy(self, filename):
        """
        load model and replay buffer
        """
        # load models
        self.model_ = torch.load("{}.{}.pkl".format(filename, self.algorithm))
        self.model = copy.deepcopy(self.model_)

    def restart(self):
        self.summaryAct = None
        self.lastSystemAction = None
        self.prevbelief = None
        self.actToBeRecorded = None
        self.w_kept = None
        if self.epsilon_decay:
            self.epsilon -= self.epsilon_delta
        if self.homotopy:
            self.beta += self.beta_delta
            self.beta_delta = (
                self.beta - self.beta_init
            ) * self.beta_expbase + self.beta_init - self.beta
Example #10
    def fit(self,
            x,
            y_true,
            x_test,
            y_test,
            loss,
            epochs,
            batch_size,
            learning_rate=1e-3,
            momentum=0.9,
            weight_decay=0.0002,
            zeta=0.3,
            dropoutrate=0.,
            testing=True,
            save_filename="",
            monitor=False):
        """
        :param x: (array) Containing parameters
        :param y_true: (array) Containing one hot encoded labels.
        :return (array) A 2D array of metrics (epochs, 4).
        """
        if not x.shape[0] == y_true.shape[0]:
            raise ValueError("Length of x and y arrays don't match")

        self.monitor = Monitor(
            save_filename=save_filename) if monitor else None

        # Initiate the loss object with the final activation function
        self.loss = loss()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.zeta = zeta
        self.dropout_rate = dropoutrate
        self.save_filename = save_filename
        self.input_layer_connections.append(self.get_core_input_connections())
        np.savez_compressed(self.save_filename + "_input_connections.npz",
                            inputLayerConnections=self.input_layer_connections)

        maximum_accuracy = 0
        metrics = np.zeros((epochs, 4))

        for i in range(epochs):
            # Shuffle the data
            seed = np.arange(x.shape[0])
            np.random.shuffle(seed)
            x_ = x[seed]
            y_ = y_true[seed]

            if self.monitor:
                self.monitor.start_monitor()

            # training
            t1 = datetime.datetime.now()

            for j in range(x.shape[0] // batch_size):
                k = j * batch_size
                l = (j + 1) * batch_size
                z, a, masks = self._feed_forward(x_[k:l], True)

                self._back_prop(z, a, masks, y_[k:l])

            t2 = datetime.datetime.now()

            if self.monitor:
                self.monitor.stop_monitor()

            print("\nSET-MLP Epoch ", i)
            print("Training time: ", t2 - t1)

            # test model performance on the test data at each epoch
            # this part is useful to understand model performance and can be commented for production settings
            if testing:
                t3 = datetime.datetime.now()
                accuracy_test, activations_test = self.predict(x_test, y_test)
                accuracy_train, activations_train = self.predict(x, y_true)

                t4 = datetime.datetime.now()
                maximum_accuracy = max(maximum_accuracy, accuracy_test)
                loss_test = self.loss.loss(y_test, activations_test)
                loss_train = self.loss.loss(y_true, activations_train)
                metrics[i, 0] = loss_train
                metrics[i, 1] = loss_test
                metrics[i, 2] = accuracy_train
                metrics[i, 3] = accuracy_test

                print(f"Testing time: {t4 - t3}\n; Loss test: {loss_test}; \n"
                      f"Accuracy test: {accuracy_test}; \n"
                      f"Maximum accuracy val: {maximum_accuracy}")

            t5 = datetime.datetime.now()
            if i < epochs - 1:  # do not change connectivity pattern after the last epoch
                # self.weights_evolution_I()  # more didactic implementation, but slow
                # weights_evolution_II has the same behaviour, but is much faster.
                self.weights_evolution_II()
            t6 = datetime.datetime.now()
            print("Weights evolution time ", t6 - t5)

            # save performance metrics values in a file
            if self.save_filename != "":
                np.savetxt(self.save_filename + ".txt", metrics)

            if self.save_filename != "" and self.monitor:
                with open(self.save_filename + "_monitor.json", 'w') as file:
                    file.write(
                        json.dumps(self.monitor.get_stats(),
                                   indent=4,
                                   sort_keys=True,
                                   default=str))

        return metrics
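
# Usage sketch (hypothetical): a minimal call of the fit() method above.
# The class name SET_MLP and the Relu / Softmax / CrossEntropy objects are
# assumptions; only the fit() signature comes from the example itself.
#
#   model = SET_MLP((x_train.shape[1], 1000, 1000, y_train.shape[1]),
#                   (Relu, Relu, Softmax))
#   metrics = model.fit(x_train, y_train, x_test, y_test,
#                       loss=CrossEntropy, epochs=10, batch_size=128,
#                       learning_rate=1e-3, zeta=0.3, dropoutrate=0.2,
#                       save_filename="results/set_mlp", monitor=False)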
Exemplo n.º 11
0
def main(args):
    # process config
    c = Configs(args.config)
    ROOT = os.environ['TENSOROFLOW']
    model_directory = '%s/examples/model/multi_layer_nmt' % ROOT
    model_path = '%s/model' % model_directory
    dictionary_path = {
        'source': '%s/source_dictionary.pickle' % model_directory,
        'source_reverse':
        '%s/source_reverse_dictionary.pickle' % model_directory,
        'target': '%s/target_dictionary.pickle' % model_directory,
        'target_reverse':
        '%s/target_reverse_dictionary.pickle' % model_directory
    }
    PAD = c.const['PAD']
    EOS = c.const['EOS']
    train_step = c.option['train_step']
    max_time = c.option['max_time']
    batch_size = c.option['batch_size']
    vocabulary_size = c.option['vocabulary_size']
    input_embedding_size = c.option['embedding_size']
    hidden_units = c.option['hidden_units']
    layers = c.option['layers']
    source_train_data_path = c.data['source_train_data']
    target_train_data_path = c.data['target_train_data']
    source_valid_data_path = c.data['source_valid_data']
    target_valid_data_path = c.data['target_valid_data']
    source_test_data_path = c.data['source_test_data']
    target_test_data_path = c.data['target_test_data']

    # read data
    if args.mode == 'train':
        source_dictionary, source_reverse_dictionary = build_dictionary(
            read_words(source_train_data_path), vocabulary_size)
        source_train_datas = [
            sentence_to_onehot(lines, source_dictionary)
            for lines in read_data(source_train_data_path)
        ]
        target_dictionary, target_reverse_dictionary = build_dictionary(
            read_words(target_train_data_path), vocabulary_size)
        target_train_datas = [
            sentence_to_onehot(lines, target_dictionary)
            for lines in read_data(target_train_data_path)
        ]

        source_valid_datas = [
            sentence_to_onehot(lines, source_dictionary)
            for lines in read_data(source_valid_data_path)
        ]
        target_valid_datas = [
            sentence_to_onehot(lines, target_dictionary)
            for lines in read_data(target_valid_data_path)
        ]

        if args.debug:
            source_train_datas = source_train_datas[:1000]
            target_train_datas = target_train_datas[:1000]
    else:
        with open(dictionary_path['source'], 'rb') as f1, \
             open(dictionary_path['source_reverse'], 'rb') as f2, \
             open(dictionary_path['target'], 'rb') as f3, \
             open(dictionary_path['target_reverse'], 'rb') as f4:
            source_dictionary = pickle.load(f1)
            source_reverse_dictionary = pickle.load(f2)
            target_dictionary = pickle.load(f3)
            target_reverse_dictionary = pickle.load(f4)

    source_test_datas = [
        sentence_to_onehot(lines, source_dictionary)
        for lines in read_data(source_test_data_path)
    ]
    target_test_datas = [
        sentence_to_onehot(lines, target_dictionary)
        for lines in read_data(target_test_data_path)
    ]

    # placeholder
    encoder_inputs = tf.placeholder(shape=(None, None),
                                    dtype=tf.int32,
                                    name='encoder_inputs')
    decoder_inputs = tf.placeholder(shape=(None, None),
                                    dtype=tf.int32,
                                    name='decoder_inputs')
    decoder_labels = tf.placeholder(shape=(None, None),
                                    dtype=tf.int32,
                                    name='decoder_labels')
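    # Note: with time_major=True in the dynamic_rnn calls below, these
    # placeholders are fed as [max_time, batch_size] matrices of token ids
    # (and become [max_time, batch_size, embedding_size] after the lookup).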

    # embed
    embeddings = tf.Variable(tf.random_uniform(
        [vocabulary_size, input_embedding_size], -1.0, 1.0),
                             dtype=tf.float32,
                             name='embeddings')
    encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings,
                                                     encoder_inputs)
    decoder_inputs_embedded = tf.nn.embedding_lookup(embeddings,
                                                     decoder_inputs)
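    # The encoder (source) and decoder (target) share this single embedding
    # matrix; with separate vocabularies one would normally use two tables.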

    # encoder
    encoder_units = hidden_units
    encoder_layers = [
        tf.contrib.rnn.LSTMCell(size) for size in [encoder_units] * layers
    ]
    encoder_cell = tf.contrib.rnn.MultiRNNCell(encoder_layers)
    encoder_output, encoder_final_state = tf.nn.dynamic_rnn(
        encoder_cell,
        encoder_inputs_embedded,
        dtype=tf.float32,
        time_major=True)
    del encoder_output

    # decoder
    decoder_units = encoder_units
    decoder_layers = [
        tf.contrib.rnn.LSTMCell(size) for size in [decoder_units] * layers
    ]
    decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_layers)
    decoder_output, decoder_final_state = tf.nn.dynamic_rnn(
        decoder_cell,
        decoder_inputs_embedded,
        initial_state=encoder_final_state,
        scope="plain_decoder",
        dtype=tf.float32,
        time_major=True)

    decoder_logits = tf.contrib.layers.linear(decoder_output, vocabulary_size)
    decoder_prediction = tf.argmax(
        decoder_logits, 2)  # max_time: axis=0, batch: axis=1, vocab: axis=2

    # optimizer
    stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(decoder_labels,
                          depth=vocabulary_size,
                          dtype=tf.float32),
        logits=decoder_logits,
    )

    loss = tf.reduce_mean(stepwise_cross_entropy)
    train_op = tf.train.AdamOptimizer().minimize(loss)

    saver = tf.train.Saver()
    minibatch_idx = {'train': 0, 'valid': 0, 'test': 0}
    with tf.Session() as sess:
        if args.mode == 'train':
            # train
            global_max_step = train_step * (
                len(source_train_datas) // batch_size + 1)
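            # Evaluate the validation loss roughly 100 times over the whole
            # run (every step when the run is shorter than 100 steps).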
            loss_freq = global_max_step // 100 if global_max_step > 100 else 1
            loss_log = []
            batch_loss_log = []
            loss_suffix = ''
            es = EarlyStopper(max_size=5, edge_threshold=0.1)
            m = Monitor(global_max_step)
            sess.run(tf.global_variables_initializer())
            global_step = 0
            stop_flag = False
            for batch in range(train_step):
                if stop_flag:
                    break
                current_batch_loss_log = []
                while True:  # minibatch process
                    m.monitor(global_step, loss_suffix)
                    source_train_batch, _ = batchnize(source_train_datas,
                                                      batch_size,
                                                      minibatch_idx['train'])
                    target_train_batch, minibatch_idx['train'] = batchnize(
                        target_train_datas, batch_size, minibatch_idx['train'])
                    batch_data = seq2seq(source_train_batch,
                                         target_train_batch, max_time,
                                         vocabulary_size)
                    feed_dict = {
                        encoder_inputs: batch_data['encoder_inputs'],
                        decoder_inputs: batch_data['decoder_inputs'],
                        decoder_labels: batch_data['decoder_labels']
                    }
                    sess.run(fetches=[train_op, loss], feed_dict=feed_dict)
                    if global_step % loss_freq == 0:
                        source_valid_batch, _ = batchnize(
                            source_valid_datas, batch_size,
                            minibatch_idx['valid'])
                        target_valid_batch, minibatch_idx['valid'] = batchnize(
                            target_valid_datas, batch_size,
                            minibatch_idx['valid'])
                        batch_data = seq2seq(source_valid_batch,
                                             target_valid_batch, max_time,
                                             vocabulary_size)
                        feed_dict = {
                            encoder_inputs: batch_data['encoder_inputs'],
                            decoder_inputs: batch_data['decoder_inputs'],
                            decoder_labels: batch_data['decoder_labels']
                        }
                        loss_val = sess.run(fetches=loss, feed_dict=feed_dict)
                        loss_log.append(loss_val)
                        current_batch_loss_log.append(loss_val)
                        loss_suffix = 'loss: %f' % loss_val
                        es_status = es(loss_val)
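                        # Only allow early stopping once the second half of
                        # training has been reached.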
                        if batch > train_step // 2 and es_status:
                            print('early stopping at step: %d' % global_step)
                            stop_flag = True
                            break
                    global_step += 1
                    if minibatch_idx['train'] == 0:
                        batch_loss = np.mean(current_batch_loss_log)
                        batch_loss_log.append(batch_loss)
                        print('Batch: {}/{}, batch loss: {}'.format(
                            batch + 1, train_step, batch_loss))
                        break

            # save tf.graph and variables
            saver.save(sess, model_path)
            print('save at %s' % model_path)

            # save plot of loss
            plt.plot(np.arange(len(loss_log)) * loss_freq, loss_log)
            plt.savefig('%s_global_loss.png' % model_path)
            plt.figure()
            plt.plot(np.arange(len(batch_loss_log)), batch_loss_log)
            plt.savefig('%s_batch_loss.png' % model_path)

            # save dictionary
            with open(dictionary_path['source'], 'wb') as f1, \
                 open(dictionary_path['source_reverse'], 'wb') as f2, \
                 open(dictionary_path['target'], 'wb') as f3, \
                 open(dictionary_path['target_reverse'], 'wb') as f4:
                pickle.dump(source_dictionary, f1)
                pickle.dump(source_reverse_dictionary, f2)
                pickle.dump(target_dictionary, f3)
                pickle.dump(target_reverse_dictionary, f4)

        elif args.mode == 'eval':
            saver.restore(sess, model_path)
            print('load from %s' % model_path)

        else:
            raise ValueError("args.mode should be 'train' or 'eval'")

        # evaluate
        loss_val = []
        input_vectors = None
        predict_vectors = None
        for i in range(len(source_test_datas) // batch_size + 1):
            source_test_batch, _ = batchnize(source_test_datas, batch_size,
                                             minibatch_idx['test'])
            target_test_batch, minibatch_idx['test'] = batchnize(
                target_test_datas, batch_size, minibatch_idx['test'])
            batch_data = seq2seq(source_test_batch, target_test_batch,
                                 max_time, vocabulary_size)
            feed_dict = {
                encoder_inputs: batch_data['encoder_inputs'],
                decoder_inputs: batch_data['decoder_inputs'],
                decoder_labels: batch_data['decoder_labels']
            }
            pred = sess.run(fetches=decoder_prediction, feed_dict=feed_dict)
            if predict_vectors is None:
                predict_vectors = pred.T
            else:
                predict_vectors = np.vstack((predict_vectors, pred.T))
            input_ = batch_data['encoder_inputs']
            if input_vectors is None:
                input_vectors = input_.T
            else:
                input_vectors = np.vstack((input_vectors, input_.T))
            loss_val.append(sess.run(fetches=loss, feed_dict=feed_dict))

        input_sentences = ''
        predict_sentences = ''
        for i, (input_vector, predict_vector) in enumerate(
                zip(input_vectors[:len(source_test_datas)],
                    predict_vectors[:len(target_test_datas)])):
            input_sentences += ' '.join([
                source_reverse_dictionary[vector] for vector in input_vector
                if not vector == PAD
            ])
            predict_sentences += ' '.join([
                target_reverse_dictionary[vector] for vector in predict_vector
                if not vector == PAD
            ])
            if i < len(source_test_datas) - 1:
                input_sentences += '\n'
                predict_sentences += '\n'

        evaluate_input_path = '%s.evaluate_input' % model_path
        evaluate_predict_path = '%s.evaluate_predict' % model_path
        with open(evaluate_input_path, 'w') as f1, \
             open(evaluate_predict_path, 'w') as f2:
            f1.write(input_sentences)
            f2.write(predict_sentences)

        print('input sequences at {}'.format(evaluate_input_path))
        print('predict sequences at {}'.format(evaluate_predict_path))
        print('mean of loss: %f' % np.mean(loss_val))

    print('finish.')
Exemplo n.º 12
0
def main(args):
    # process config
    c = Configs(args.config)
    ROOT = os.environ['TENSOROFLOW']
    model_path = '%s/examples/model/basic_nmt/model' % ROOT
    PAD = c.const['PAD']
    EOS = c.const['EOS']
    train_step = c.option['train_step']
    max_time = c.option['max_time']
    batch_size = c.option['batch_size']
    vocabulary_size = c.option['vocabulary_size']
    input_embedding_size = c.option['embedding_size']
    hidden_units = c.option['hidden_units']
    source_train_data_path = c.data['source_train_data']
    target_train_data_path = c.data['target_train_data']
    source_valid_data_path = c.data['source_valid_data']
    target_valid_data_path = c.data['target_valid_data']
    source_test_data_path = c.data['source_test_data']
    target_test_data_path = c.data['target_test_data']

    # read data
    source_dictionary, source_reverse_dictionary = build_dictionary(
        read_words(source_train_data_path), vocabulary_size)
    source_train_datas = [
        sentence_to_onehot(lines, source_dictionary)
        for lines in read_data(source_train_data_path)
    ]
    target_dictionary, target_reverse_dictionary = build_dictionary(
        read_words(target_train_data_path), vocabulary_size)
    target_train_datas = [
        sentence_to_onehot(lines, target_dictionary)
        for lines in read_data(target_train_data_path)
    ]

    source_valid_datas = [
        sentence_to_onehot(lines, source_dictionary)
        for lines in read_data(source_valid_data_path)
    ]
    target_valid_datas = [
        sentence_to_onehot(lines, target_dictionary)
        for lines in read_data(target_valid_data_path)
    ]
    source_test_datas = [
        sentence_to_onehot(lines, source_dictionary)
        for lines in read_data(source_test_data_path)
    ]
    target_test_datas = [
        sentence_to_onehot(lines, target_dictionary)
        for lines in read_data(target_test_data_path)
    ]

    # placeholder
    encoder_inputs = tf.placeholder(shape=(None, None),
                                    dtype=tf.int32,
                                    name='encoder_inputs')
    decoder_inputs = tf.placeholder(shape=(None, None),
                                    dtype=tf.int32,
                                    name='decoder_inputs')
    decoder_labels = tf.placeholder(shape=(None, None),
                                    dtype=tf.int32,
                                    name='decoder_labels')

    # embed
    embeddings = tf.Variable(tf.random_uniform(
        [vocabulary_size, input_embedding_size], -1.0, 1.0),
                             dtype=tf.float32,
                             name='embeddings')
    encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings,
                                                     encoder_inputs)
    decoder_inputs_embedded = tf.nn.embedding_lookup(embeddings,
                                                     decoder_inputs)

    # encoder
    encoder_units = hidden_units
    encoder_cell = tf.contrib.rnn.LSTMCell(encoder_units)
    _, encoder_final_state = tf.nn.dynamic_rnn(encoder_cell,
                                               encoder_inputs_embedded,
                                               dtype=tf.float32,
                                               time_major=True)

    # decoder
    decoder_units = encoder_units
    decoder_cell = tf.contrib.rnn.LSTMCell(decoder_units)
    decoder_output, decoder_final_state = tf.nn.dynamic_rnn(
        decoder_cell,
        decoder_inputs_embedded,
        initial_state=encoder_final_state,
        scope="plain_decoder",
        dtype=tf.float32,
        time_major=True)

    decoder_logits = tf.contrib.layers.linear(decoder_output, vocabulary_size)
    decoder_prediction = tf.argmax(
        decoder_logits, 2)  # max_time: axis=0, batch: axis=1, vocab: axis=2

    # optimizer
    stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(decoder_labels,
                          depth=vocabulary_size,
                          dtype=tf.float32),
        logits=decoder_logits,
    )

    loss = tf.reduce_mean(stepwise_cross_entropy)
    train_op = tf.train.AdamOptimizer().minimize(loss)

    saver = tf.train.Saver()
    batch_idx = {'train': 0, 'valid': 0, 'test': 0}
    with tf.Session() as sess:
        if args.mode == 'train':
            # train
            loss_freq = train_step // 100
            loss_log = []
            loss_suffix = ''
            es = EarlyStopper(max_size=5, edge_threshold=0.1)
            m = Monitor(train_step)
            sess.run(tf.global_variables_initializer())
            for i in range(train_step):
                m.monitor(i, loss_suffix)
                source_train_batch, _ = batchnize(source_train_datas,
                                                  batch_size,
                                                  batch_idx['train'])
                target_train_batch, batch_idx['train'] = batchnize(
                    target_train_datas, batch_size, batch_idx['train'])
                batch_data = seq2seq(source_train_batch, target_train_batch,
                                     max_time, vocabulary_size)
                feed_dict = {
                    encoder_inputs: batch_data['encoder_inputs'],
                    decoder_inputs: batch_data['decoder_inputs'],
                    decoder_labels: batch_data['decoder_labels']
                }
                sess.run(fetches=[train_op, loss], feed_dict=feed_dict)
                if i % loss_freq == 0:
                    source_valid_batch, _ = batchnize(source_valid_datas,
                                                      batch_size,
                                                      batch_idx['valid'])
                    target_valid_batch, batch_idx['valid'] = batchnize(
                        target_valid_datas, batch_size, batch_idx['valid'])
                    batch_data = seq2seq(source_valid_batch,
                                         target_valid_batch, max_time,
                                         vocabulary_size)
                    feed_dict = {
                        encoder_inputs: batch_data['encoder_inputs'],
                        decoder_inputs: batch_data['decoder_inputs'],
                        decoder_labels: batch_data['decoder_labels']
                    }
                    loss_val = sess.run(fetches=loss, feed_dict=feed_dict)
                    loss_log.append(loss_val)
                    loss_suffix = 'loss: %f' % loss_val
                    es_status = es(loss_val)
                    if i > train_step // 2 and es_status:
                        print('early stopping at step: %d' % i)
                        break
            saver.save(sess, model_path)
            print('save at %s' % model_path)
            plt.plot(np.arange(len(loss_log)) * loss_freq, loss_log)
            plt.savefig('%s_loss.png' % model_path)
        elif args.mode == 'eval':
            saver.restore(sess, model_path)
            print('load from %s' % model_path)
        else:
            raise ValueError("args.mode should be 'train' or 'eval'")

        # evaluate
        loss_val = []
        input_vectors = None
        predict_vectors = None
        for i in range(len(source_test_datas) // batch_size + 1):
            source_test_batch, _ = batchnize(source_test_datas, batch_size,
                                             batch_idx['test'])
            target_test_batch, batch_idx['test'] = batchnize(
                target_test_datas, batch_size, batch_idx['test'])
            batch_data = seq2seq(source_test_batch, target_test_batch,
                                 max_time, vocabulary_size)
            feed_dict = {
                encoder_inputs: batch_data['encoder_inputs'],
                decoder_inputs: batch_data['decoder_inputs'],
                decoder_labels: batch_data['decoder_labels']
            }
            pred = sess.run(fetches=decoder_prediction, feed_dict=feed_dict)
            if predict_vectors is None:
                predict_vectors = pred.T
            else:
                predict_vectors = np.vstack((predict_vectors, pred.T))
            input_ = batch_data['encoder_inputs']
            if input_vectors is None:
                input_vectors = input_.T
            else:
                input_vectors = np.vstack((input_vectors, input_.T))
            loss_val.append(sess.run(fetches=loss, feed_dict=feed_dict))

        input_sentences = ''
        predict_sentences = ''
        for i, (input_vector, predict_vector) in enumerate(
                zip(input_vectors[:len(source_test_datas)],
                    predict_vectors[:len(target_test_datas)])):
            input_sentences += ' '.join([
                source_reverse_dictionary[vector] for vector in input_vector
                if not vector == PAD
            ])
            predict_sentences += ' '.join([
                target_reverse_dictionary[vector] for vector in predict_vector
                if not vector == PAD
            ])
            if i < len(source_test_datas) - 1:
                input_sentences += '\n'
                predict_sentences += '\n'

        evaluate_input_path = '%s.evaluate_input' % model_path
        evaluate_predict_path = '%s.evaluate_predict' % model_path
        with open(evaluate_input_path, 'w') as f1, \
             open(evaluate_predict_path, 'w') as f2:
            f1.write(input_sentences)
            f2.write(predict_sentences)

        print('input sequences at {}'.format(evaluate_input_path))
        print('predict sequences at {}'.format(evaluate_predict_path))
        print('mean of loss: %f' % np.mean(loss_val))

    print('finish.')
Exemplo n.º 13
0
def main(args):
    tf.reset_default_graph()

    # process config
    c = Configs(args.config)
    ROOT = os.environ['TENSOROFLOW']
    model_path = '%s/examples/model/multi_layer_seq2seq/model' % ROOT
    PAD = c.const['PAD']
    EOS = c.const['EOS']
    train_step = c.option['train_step']
    max_time = c.option['max_time']
    batch_size = c.option['batch_size']
    vocabulary_size = c.option['vocabulary_size']
    input_embedding_size = c.option['embedding_size']
    hidden_units = c.option['hidden_units']
    layers = c.option['layers']
    datas = []

    # placeholder
    encoder_inputs = tf.placeholder(shape=(None, None),
                                    dtype=tf.int32,
                                    name='encoder_inputs')
    decoder_inputs = tf.placeholder(shape=(None, None),
                                    dtype=tf.int32,
                                    name='decoder_inputs')
    decoder_labels = tf.placeholder(shape=(None, None),
                                    dtype=tf.int32,
                                    name='decoder_labels')

    # embed
    embeddings = tf.Variable(tf.random_uniform(
        [vocabulary_size, input_embedding_size], -1.0, 1.0),
                             dtype=tf.float32,
                             name='embeddings')
    encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings,
                                                     encoder_inputs)
    decoder_inputs_embedded = tf.nn.embedding_lookup(embeddings,
                                                     decoder_inputs)

    # encoder
    encoder_units = hidden_units
    encoder_layers = [
        tf.contrib.rnn.LSTMCell(size) for size in [encoder_units] * layers
    ]
    encoder_cell = tf.contrib.rnn.MultiRNNCell(encoder_layers)
    encoder_output, encoder_final_state = tf.nn.dynamic_rnn(
        encoder_cell,
        encoder_inputs_embedded,
        dtype=tf.float32,
        time_major=True)
    del encoder_output

    # decoder
    decoder_units = encoder_units
    decoder_layers = [
        tf.contrib.rnn.LSTMCell(size) for size in [decoder_units] * layers
    ]
    decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_layers)
    decoder_output, decoder_final_state = tf.nn.dynamic_rnn(
        decoder_cell,
        decoder_inputs_embedded,
        initial_state=encoder_final_state,
        scope="plain_decoder",
        dtype=tf.float32,
        time_major=True)

    decoder_logits = tf.contrib.layers.linear(decoder_output, vocabulary_size)
    decoder_prediction = tf.argmax(
        decoder_logits, 2)  # max_time: axis=0, batch: axis=1, vocab: axis=2

    # optimizer
    stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(decoder_labels,
                          depth=vocabulary_size,
                          dtype=tf.float32),
        logits=decoder_logits,
    )

    loss = tf.reduce_mean(stepwise_cross_entropy)
    train_op = tf.train.AdamOptimizer().minimize(loss)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        if args.mode == 'train':
            # train
            loss_freq = train_step // 100
            loss_log = []
            loss_suffix = ''
            es = EarlyStopper(max_size=5, edge_threshold=0.1)
            m = Monitor(train_step)
            sess.run(tf.global_variables_initializer())
            for i in range(train_step):
                m.monitor(i, loss_suffix)
                batch_data = through(datas, max_time, batch_size,
                                     vocabulary_size)
                feed_dict = {
                    encoder_inputs: batch_data['encoder_inputs'],
                    decoder_inputs: batch_data['decoder_inputs'],
                    decoder_labels: batch_data['decoder_labels']
                }
                sess.run(fetches=[train_op, loss], feed_dict=feed_dict)
                if i % loss_freq == 0:
                    batch_data = through(datas, max_time, batch_size,
                                         vocabulary_size)
                    feed_dict = {
                        encoder_inputs: batch_data['encoder_inputs'],
                        decoder_inputs: batch_data['decoder_inputs'],
                        decoder_labels: batch_data['decoder_labels']
                    }
                    loss_val = sess.run(fetches=loss, feed_dict=feed_dict)
                    loss_log.append(loss_val)
                    loss_suffix = 'loss: %f' % loss_val
                    es_status = es(loss_val)
                    if i > train_step // 2 and es_status:
                        print('early stopping at step: %d' % i)
                        break
            saver.save(sess, model_path)
            print('save at %s' % model_path)
            plt.plot(np.arange(len(loss_log)) * loss_freq, loss_log)
            plt.savefig('%s_loss.png' % model_path)
        elif args.mode == 'eval':
            saver.restore(sess, model_path)
            print('load from %s' % model_path)
        else:
            raise ValueError("args.mode should be 'train' or 'eval'")

        # evaluate
        batch_data = through(datas, max_time, batch_size, vocabulary_size)
        feed_dict = {
            encoder_inputs: batch_data['encoder_inputs'],
            decoder_inputs: batch_data['decoder_inputs'],
            decoder_labels: batch_data['decoder_labels']
        }
        pred = sess.run(fetches=decoder_prediction, feed_dict=feed_dict)
        input_ = batch_data['encoder_inputs']
        loss_val = sess.run(fetches=loss, feed_dict=feed_dict)

        print('input sequences...\n{}'.format(input_))
        print('predict sequences...\n{}'.format(pred))
        print('loss: %f' % loss_val)

    print('finish.')
Exemplo n.º 14
0
def run(**kwargs):
    '''
    Setup TF, gym environment, etc.
    '''

    logdir = kwargs['logdir']
    seed = kwargs['seed']
    headless = kwargs['headless']

    if headless:
        import matplotlib

    ################################################################
    # SEEDS
    ################################################################
    tf.set_random_seed(seed * 20)
    np.random.seed(seed * 20)

    ################################################################
    # SETUP GYM + RL ALGO
    ################################################################
    env = gym.make('snake-v0')  # Make the gym environment

    ################################################################
    # TF BOILERPLATE
    ################################################################

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)

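    # Convert an RGB frame to grayscale using the ITU-R BT.601 luma weights.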
    def rgb2gray(rgb):
        return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])

    with tf.Session() as sess:
        network = DQN(
            sess,
            create_basic([64, 64, 256], transpose=True),
            [(env.world.number_of_snakes) * 2 + 1, env.world.screen_width,
             env.world.screen_height],
            None,
            n_actions=4,
            batch_size=None,
            gamma=.99,
            update_freq=None,
            ddqn=True,  # double dqn
            buffer_size=None,
            clip_grad=None,
            batches_per_epoch=None,
            is_sparse=False,
            use_priority=False)

        monitor = Monitor(os.path.join(logdir, 'test_gifs'))
        # summary_writer = tf.summary.FileWriter(logdir)

        ## Load model from where you left off
        ## Does not play nice w/ plots in tensorboard at the moment
        ## TODO: FIX
        saver = tf.train.Saver(max_to_keep=2)

        if True:
            try:
                print('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(
                    os.path.join(os.getcwd(), logdir))
                saver.restore(sess, ckpt.model_checkpoint_path)
                iteration_offset = int(
                    ckpt.model_checkpoint_path.split('-')[-1].split('.')[0])
            except Exception:
                print('Failed to load. Starting from scratch')
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                iteration_offset = 0
        else:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            iteration_offset = 0

        ################################################################
        # Fill Buffer
        ################################################################

        tic = time.time()
        total_timesteps = 0

        for iteration in range(5):
            obs = env.reset()
            # obs = env.render('rgb_array', headless = headless).astype(float)
            # obs /= obs.max()
            # obs = rgb2gray(obs)

            done_n = np.array([False] * env.n_actors)
            steps = 0
            viewer = None
            while not done_n.all():

                if True:
                    if (not viewer) and (not headless):
                        from gym.envs.classic_control import rendering
                        viewer = rendering.SimpleImageViewer()

                    rgb = env.render('rgb_array', headless=headless)
                    scaler = 10
                    rgb = repeat_upsample(rgb, scaler, scaler)

                    if not headless:

                        viewer.imshow(rgb)
                        time.sleep(.01)

                    monitor.add(rgb, iteration, iteration)

                last_obs = np.array([[x.A for x in obs]])
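                # The second argument of greedy_select is the exploration
                # epsilon (cf. the epsilon-greedy call in a later example);
                # 0 here means a fully greedy rollout.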
                acts = network.greedy_select(
                    last_obs, 0)  #network.greedy_select([[last_obs]], 0)
                acts = [str(x) for x in acts]

                # Next step
                obs, reward_n, done_n = env.step(acts[-1])
                # obs = env.render('rgb_array', headless = headless).astype(float)
                # obs /= obs.max()
                # obs = rgb2gray(obs)

                steps += 1
                if steps > 300:
                    break

            monitor.make_gifs(iteration, fps=12)
            pdb.set_trace()
Exemplo n.º 15
0
def run(**kwargs):
    '''
    Setup TF, gym environment, etc.
    '''

    iterations = kwargs['iterations']
    discount = kwargs['discount']
    batch_size = kwargs['batch_size']
    num_batches = kwargs['num_batches']
    max_seq_length = kwargs['max_seq_length']
    learning_rate = kwargs['learning_rate']
    animate = kwargs['animate']
    logdir = kwargs['logdir']
    seed = kwargs['seed']
    games_played_per_epoch = kwargs['games_played_per_epoch']
    load_model = False
    mcts_iterations = kwargs['mcts_iterations']
    batches_per_epoch = kwargs['batches_per_epoch']
    headless = kwargs['headless']
    update_freq = kwargs['update_freq']
    buffer_size = kwargs['buffer_size']

    if headless:
        import matplotlib

    ################################################################
    # SEEDS
    ################################################################
    tf.set_random_seed(seed)
    np.random.seed(seed)

    
    ################################################################
    # SETUP GYM + RL ALGO
    ################################################################
    env = gym.make('snake-v0') # Make the gym environment
    maximum_number_of_steps = max_seq_length #or env.max_episode_steps # Maximum length for episodes
   

    ################################################################
    # TF BOILERPLATE
    ################################################################

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 
    sess = tf.Session(config=tf_config)

    summary_writers = []
    for idx in np.arange(env.n_actors):
        summary_writers.append(tf.summary.FileWriter(os.path.join(logdir,'tensorboard','snake_%s' % idx) ))

    summary_writers.append(tf.summary.FileWriter(os.path.join(logdir,'tensorboard','training_stats') ))    

    def rgb2gray(rgb):
        return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])

    with tf.Session() as sess:
        network = DQN( 
                     sess,
                     create_basic([16,16,64], transpose=True),
                     [1,env.world.screen_width,env.world.screen_height], 
                     summary_writers[-1],
                     n_actions=4, 
                     batch_size=batch_size,
                     gamma=.99,
                     update_freq=update_freq,
                     ddqn=True, # double dqn
                     buffer_size = buffer_size,
                     clip_grad = None,
                     batches_per_epoch = batches_per_epoch,
                     is_sparse = False
                     )

        monitor = Monitor(os.path.join(logdir,'gifs'))
        epsilon_schedule = LinearSchedule(iterations*9/10, 1.0, 0.01)
        learning_rate_schedule = PiecewiseSchedule([(0,1e-3),(20000,5e-4),(50000,1e-4)], outside_value=1e-4)

        saver = tf.train.Saver(max_to_keep=2)
        # summary_writer = tf.summary.FileWriter(logdir) 

        ## Load model from where you left off
        ## Does not play nice w/ plots in tensorboard at the moment
        ## TODO: FIX
        if load_model == True:
            try:
                print ('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(logdir)
                saver.restore(sess,ckpt.model_checkpoint_path)
                iteration_offset = int(ckpt.model_checkpoint_path.split('-')[-1].split('.')[0])
            except Exception:
                print ('Failed to load. Starting from scratch')
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                iteration_offset = 0   
        else:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            iteration_offset = 0

        summary_writers[0].add_graph(sess.graph)

        ################################################################
        # Fill Buffer
        ################################################################

        tic = time.time()
        total_timesteps = 0

        while not network.buffer.full(N=buffer_size/2):
            network.buffer.games_played += 1
            print('Game number: %s. Buffer_size: %s' %
                  (network.buffer.games_played, network.buffer.buffer_size))
            _ = env.reset()
            obs = env.render('rgb_array', headless = headless).astype(float)
            obs /= obs.max()
            obs = rgb2gray(obs)

            done_n = np.array([False]*env.n_actors)
            steps = 0
            while not done_n.all():
                last_obs = obs
                acts = network.greedy_select([[last_obs]], 1.) 
                acts = [str(x) for x in acts]
      
                # Next step
                _, reward_n, done_n = env.step(acts[-1])
                obs = env.render('rgb_array', headless = headless).astype(float)
                obs /= obs.max()
                obs = rgb2gray(obs)

                steps += 1

                network.store(np.array([[last_obs]]), # state
                                  np.array(acts), # action
                                  np.array(reward_n), #rewards
                                  np.array([[obs]]), #new state
                                  np.array(done_n) #done
                                  )

                if steps > maximum_number_of_steps:
                    done_n[:] = True

        print('Filled Buffer')

        ################################################################
        # Train Loop
        ################################################################
        network.buffer.soft_reset()
        total_number_of_steps_in_iteration = 0

        for iteration in range(iteration_offset, iteration_offset + iterations):
            print('{0} Iteration {1} {0}'.format('*'*10, iteration))
            timesteps_in_iteration = 0

            if (iteration % update_freq == 0):
                saver.save(sess, os.path.join(logdir, 'model-' + str(iteration) + '.cptk'))
                print("Saved Model. Timestep count: %s" % iteration)

            total_reward = np.array([0]*env.n_actors)

            while True:
                network.buffer.games_played += 1
                if (((network.buffer.games_played) % 10) == 0):
                    print('Epoch: %s. Game number: %s' %
                          (iteration, network.buffer.games_played))
                _ = env.reset()
                rgb = obs = env.render('rgb_array', headless = headless).astype(float)
                obs /= obs.max()
                obs = rgb2gray(obs)

                animate_episode = (iteration % (update_freq) == 0) and animate

                done_n = np.array([False]*env.n_actors)
                steps = 0
                
                # Runs policy, collects observations and rewards
                viewer = None

                while not done_n.all():

                    if animate_episode:
                        if (not viewer) and (not headless):
                            from gym.envs.classic_control import rendering
                            viewer = rendering.SimpleImageViewer()

                        rgb = env.render('rgb_array', headless = headless)
                        scaler = 10
                        rgb=repeat_upsample(rgb,scaler,scaler)

                        if not headless:
                            
                            viewer.imshow(rgb)
                            time.sleep(.01)

                        monitor.add(rgb, iteration, network.buffer.games_played)

                    
                    # ob = get_data(np.array(raw_observations)[-2:])
                    last_obs = obs

                    # Control the exploration
                    acts = network.greedy_select([[last_obs]], epsilon_schedule.value(network.epoch)) # epsilon greedy

                    acts = [str(x) for x in acts]
          
                    # Next step
                    _, reward_n, done_n = env.step(acts[-1])
                    obs = env.render('rgb_array', headless = headless).astype(float)
                    obs /= obs.max()
                    obs = rgb2gray(obs)

                    total_reward += np.array(reward_n)

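                    # Take one gradient (training) step every 4 environment steps.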
                    if total_number_of_steps_in_iteration % 4 == 0:
                        network.train_step(learning_rate_schedule)
                    
                    total_number_of_steps_in_iteration += 1
                    steps += 1

                    network.store(np.array([[last_obs]]), # state
                                  np.array(acts), # action
                                  np.array(reward_n), #rewards
                                  np.array([[obs]]), #new state
                                  np.array(done_n) #done
                                  )

                    # terminate the collection of data if the controller shows stability
                    # for a long time. This is a good thing.
                    if steps > maximum_number_of_steps:
                        done_n[:] = True

                if viewer:
                    viewer.close()

                if network.buffer.games_played >= 1:
                    break

            monitor.make_gifs(iteration)
            
            
            for count, writer in enumerate(summary_writers):
                if count < (len(summary_writers) - 1):
                    summary = tf.Summary()
                    summary.value.add(tag='Average Reward', simple_value=(total_reward[count]))
                    summary.value.add(tag='Steps Taken', simple_value=(steps))
                    writer.add_summary(summary, iteration)
                writer.flush()
Exemplo n.º 16
0
class Dense_MLP:
    def __init__(self, dimensions, activations):
        """
        :param dimensions: (tpl/ list) Dimensions of the neural net. (input, hidden layer, output)
        :param activations: (tpl/ list) Activations functions.

        Example of a network with three hidden layers:
        - 3312 input features
        - 3000 hidden neurons
        - 3000 hidden neurons
        - 3000 hidden neurons
        - 5 output classes


        layers -->    [1,        2,     3,     4,     5]
        ----------------------------------------

        dimensions =  (3312,     3000,  3000,  3000,  5)
        activations = (          Relu,  Relu,  Relu,  Sigmoid)
        """
        self.n_layers = len(dimensions)
        self.loss = None
        self.learning_rate = None
        self.momentum = None
        self.weight_decay = None
        self.dropout_rate = 0.  # dropout rate
        self.dimensions = dimensions

        self.save_filename = ""
        self.monitor = None

        # Weights and biases are initiated by index. For a one hidden layer net you will have a w[1] and w[2]
        self.w = {}
        self.b = {}
        self.pdw = {}
        self.pdd = {}

        # Activations are also initiated by index. For the example we will have activations[2] and activations[3]
        self.activations = {}
        for i in range(len(dimensions) - 1):
            # He uniform initialization
            limit = np.sqrt(6. / float(dimensions[i]))
            self.w[i + 1] = np.random.uniform(
                -limit, limit, (dimensions[i], dimensions[i + 1]))
            self.b[i + 1] = np.zeros(dimensions[i + 1])
            self.activations[i + 2] = activations[i]

    def _feed_forward(self, x, drop=False):
        """
        Execute a forward feed through the network.
        :param x: (array) Batch of input data vectors.
        :return: (tpl) Node outputs and activations per layer. The numbering of the output is equivalent to the layer numbers.
        """
        # w(x) + b
        z = {}

        # activations: f(z)
        # The first layer has no activation function applied; its "activation"
        # is simply the input x.
        a = {1: x}
        masks = {}

        for i in range(1, self.n_layers):
            z[i + 1] = a[i] @ self.w[i] + self.b[i]
            a[i + 1] = self.activations[i + 1].activation(z[i + 1])
            if drop:
                if i < self.n_layers - 1:
                    # apply dropout
                    a[i + 1], keep_mask = dropout(a[i + 1], self.dropout_rate)
                    masks[i + 1] = keep_mask

        return z, a, masks

    def _back_prop(self, z, a, masks, y_true):
        """
        The input dicts keys represent the layers of the net.

        a = { 1: x,
              2: f(w1(x) + b1)
              3: f(w2(a2) + b2)
              4: f(w3(a3) + b3)
              5: f(w4(a4) + b4)
              }

        :param z: (dict) w(x) + b
        :param a: (dict) f(z)
        :param y_true: (array) One hot encoded truth vector.
        :return:
        """
        keep_prob = 1.
        if self.dropout_rate > 0:
            keep_prob = np.float32(1. - self.dropout_rate)

        # Determine partial derivative and delta for the output layer.
        # delta output layer
        delta = self.loss.delta(y_true, a[self.n_layers])
        dw = np.dot(a[self.n_layers - 1].T, delta)

        update_params = {self.n_layers - 1: (dw, np.mean(delta, axis=0))}

        # In case of three layer net will iterate over i = 2 and i = 1
        # Determine partial derivative and delta for the rest of the layers.
        # Each iteration requires the delta from the previous layer, propagating backwards.
        for i in reversed(range(2, self.n_layers)):
            # dropout for the backpropagation step
            if keep_prob != 1:
                delta = (delta @ self.w[i].transpose()
                         ) * self.activations[i].prime(z[i])
                delta = delta * masks[i]
                delta /= keep_prob
            else:
                delta = (delta @ self.w[i].transpose()
                         ) * self.activations[i].prime(z[i])

            dw = np.dot(a[i - 1].T, delta)

            update_params[i - 1] = (dw, np.mean(delta, axis=0))
        for k, v in update_params.items():
            self._update_w_b(k, v[0], v[1])

    def _update_w_b(self, index, dw, delta):
        """
        Update weights and biases.

        :param index: (int) Number of the layer
        :param dw: (array) Partial derivatives
        :param delta: (array) Delta error.
        """

        # perform the update with momentum
        if index not in self.pdw:
            self.pdw[index] = -self.learning_rate * dw
            self.pdd[index] = -self.learning_rate * delta
        else:
            self.pdw[index] = self.momentum * self.pdw[
                index] - self.learning_rate * dw
            self.pdd[index] = self.momentum * self.pdd[
                index] - self.learning_rate * delta

        self.w[index] += self.pdw[index] - self.weight_decay * self.w[index]
        self.b[index] += self.pdd[index] - self.weight_decay * self.b[index]

    def fit(self,
            x,
            y_true,
            x_test,
            y_test,
            loss,
            epochs,
            batch_size,
            learning_rate=1e-3,
            momentum=0.9,
            weight_decay=0.0002,
            dropoutrate=0.,
            testing=True,
            save_filename="",
            monitor=False):
        """
        :param x: (array) Containing parameters
        :param y_true: (array) Containing one hot encoded labels.
        :return (array) A 2D array of metrics (epochs, 3).
        """
        if not x.shape[0] == y_true.shape[0]:
            raise ValueError("Length of x and y arrays don't match")

        self.monitor = Monitor(
            save_filename=save_filename) if monitor else None

        # Initiate the loss object with the final activation function
        self.loss = loss()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.dropout_rate = dropoutrate
        self.save_filename = save_filename

        maximum_accuracy = 0

        metrics = np.zeros((epochs, 4))

        for i in range(epochs):
            # Shuffle the data
            seed = np.arange(x.shape[0])
            np.random.shuffle(seed)
            x_ = x[seed]
            y_ = y_true[seed]

            if self.monitor:
                self.monitor.start_monitor()

            # training
            t1 = datetime.datetime.now()

            for j in range(x.shape[0] // batch_size):
                k = j * batch_size
                l = (j + 1) * batch_size
                z, a, masks = self._feed_forward(x_[k:l], True)

                self._back_prop(z, a, masks, y_[k:l])

            t2 = datetime.datetime.now()

            if self.monitor:
                self.monitor.stop_monitor()

            print("\nDense-MLP Epoch ", i)
            print("Training time: ", t2 - t1)

            # test model performance on the test data at each epoch
            # this part is useful to understand model performance and can be commented for production settings
            if testing:
                t3 = datetime.datetime.now()
                accuracy_test, activations_test = self.predict(
                    x_test, y_test, batch_size)
                accuracy_train, activations_train = self.predict(
                    x, y_true, batch_size)
                t4 = datetime.datetime.now()
                maximum_accuracy = max(maximum_accuracy, accuracy_test)
                loss_test = self.loss.loss(y_test, activations_test)
                loss_train = self.loss.loss(y_true, activations_train)
                metrics[i, 0] = loss_train
                metrics[i, 1] = loss_test
                metrics[i, 2] = accuracy_train
                metrics[i, 3] = accuracy_test

                print(f"Testing time: {t4 - t3}\n; Loss test: {loss_test}; \n"
                      f"Accuracy test: {accuracy_test}; \n"
                      f"Maximum accuracy val: {maximum_accuracy}")

            # save performance metrics values in a file
            if save_filename != "":
                np.savetxt(save_filename + ".txt", metrics)

            if self.save_filename != "" and self.monitor:
                with open(self.save_filename + "_monitor.json", 'w') as file:
                    file.write(
                        json.dumps(self.monitor.get_stats(),
                                   indent=4,
                                   sort_keys=True,
                                   default=str))

        return metrics

    def predict(self, x_test, y_test, batch_size=100):
        """
        :param x_test: (array) Test input
        :param y_test: (array) Correct test output
        :param batch_size:
        :return: (flt) Classification accuracy
        :return: (array) A 2D array of shape (n_cases, n_classes).
        """
        activations = np.zeros((y_test.shape[0], y_test.shape[1]))
        for j in range(x_test.shape[0] // batch_size):
            k = j * batch_size
            l = (j + 1) * batch_size
            _, a_test, _ = self._feed_forward(x_test[k:l], drop=False)
            activations[k:l] = a_test[self.n_layers]
        accuracy = compute_accuracy(activations, y_test)
        return accuracy, activations
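
# Standalone sketch (hypothetical, not from the example above): illustrates on
# toy arrays the momentum + weight-decay update rule that _update_w_b applies
# per layer:
#     v <- momentum * v - learning_rate * dW
#     W <- W + v - weight_decay * W
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(4, 3))        # toy weight matrix
v = np.zeros_like(W)               # momentum buffer (self.pdw in the class)
lr, mom, wd = 1e-2, 0.9, 2e-4

for step in range(5):
    dW = rng.normal(size=W.shape)  # stand-in for a real gradient
    v = mom * v - lr * dW
    W += v - wd * W
    print(step, round(float(np.linalg.norm(W)), 4))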
Exemplo n.º 17
0
 def make_env():
     env = make_mujoco_env(args.env, args.seed)
     # env = gym.make(env_id)
     env = Monitor(env, logger.get_dir(), allow_early_resets=True)
     return env
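 # Usage sketch (assumption, not from the original snippet): make_env is
 # typically handed to a vectorized-environment constructor, e.g. with
 # OpenAI baselines:
 #
 #     from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
 #     venv = DummyVecEnv([make_env])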
Exemplo n.º 18
0
def run(**kwargs):
    '''
    Setup TF, gym environment, etc.
    '''

    iterations = kwargs['iterations']
    discount = kwargs['discount']
    batch_size = kwargs['batch_size']
    num_batches = kwargs['num_batches']
    max_seq_length = kwargs['max_seq_length']
    learning_rate = kwargs['learning_rate']
    animate = kwargs['animate']
    logdir = kwargs['logdir']
    seed = kwargs['seed']
    games_played_per_epoch = kwargs['games_played_per_epoch']
    load_model = False
    mcts_iterations = kwargs['mcts_iterations']
    batches_per_epoch = kwargs['batches_per_epoch']
    headless = kwargs['headless']
    update_freq = kwargs['update_freq']
    buffer_size = kwargs['buffer_size']
    use_priority = kwargs['use_priority']
    policy_batch_size = kwargs['policy_batch_size']
    reservoir_buffer_size = kwargs['reservoir_buffer_size']

    if headless:
        import matplotlib

    ################################################################
    # SEEDS
    ################################################################
    tf.set_random_seed(seed)
    np.random.seed(seed)

    ################################################################
    # SETUP GYM + RL ALGO
    ################################################################
    env = gym.make('snake-v1')  # Make the gym environment
    maximum_number_of_steps = max_seq_length  #or env.max_episode_steps # Maximum length for episodes

    ################################################################
    # TF BOILERPLATE
    ################################################################

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)

    summary_writers = []
    for idx in np.arange(env.n_actors):
        summary_writers.append(
            tf.summary.FileWriter(
                os.path.join(logdir, 'tensorboard', 'snake_%s' % idx)))

    summary_writers.append(
        tf.summary.FileWriter(
            os.path.join(logdir, 'tensorboard', 'training_stats')))

    with tf.Session() as sess:

        networks = []

        for i in range(env.n_actors):
            networks.append(
                SelfPlay(
                    sess,
                    create_basic([64, 64, 256], transpose=True),
                    [(env.n_actors) * 2 + 1, env.world.screen_width,
                     env.world.screen_height],
                    summary_writers[-1],
                    n_actions=4,
                    batch_size=batch_size,
                    gamma=.99,
                    update_freq=update_freq,
                    ddqn=True,  # double dqn
                    buffer_size=buffer_size,
                    clip_grad=None,
                    batches_per_epoch=batches_per_epoch,
                    is_sparse=True,
                    use_priority=use_priority,
                    _id=i,
                    policy_batch_size=policy_batch_size,
                    reservoir_buffer_size=reservoir_buffer_size))

        monitor = Monitor(os.path.join(logdir, 'gifs'))
        epsilon_schedule = PiecewiseSchedule(
            [(0, .2), (50000, .05), (75000, .01)],
            outside_value=.01)  #LinearSchedule(iterations*60/100, 1., 0.001)
        eta_schedule = PiecewiseSchedule(
            [(0, .8), (60000, .4)],
            outside_value=.4)  #LinearSchedule(iterations*60/100, 0.2, 0.1)
        if use_priority:
            beta_schedule = LinearSchedule(iterations, 0.4, 1.)
        learning_rate_schedule = PiecewiseSchedule([(0, 1e-3), (30000, 5e-4),
                                                    (60000, 1e-4)],
                                                   outside_value=1e-4)
        policy_learning_rate_schedule = PiecewiseSchedule([(0, 1e-3),
                                                           (4000, 5e-4),
                                                           (20000, 1e-4)],
                                                          outside_value=1e-4)

        saver = tf.train.Saver(max_to_keep=2)
        # summary_writer = tf.summary.FileWriter(logdir)

        ## Load model from where you left off
        ## Does not play nice w/ plots in tensorboard at the moment
        ## TODO: FIX
        if load_model == True:
            try:
                print('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(logdir)
                saver.restore(sess, ckpt.model_checkpoint_path)
                iteration_offset = int(
                    ckpt.model_checkpoint_path.split('-')[-1].split('.')[0])
            except:
                print('Failed to load. Starting from scratch')
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                iteration_offset = 0
        else:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            iteration_offset = 0

        summary_writers[0].add_graph(sess.graph)

        ################################################################
        # Train Loop
        ################################################################

        tic = time.time()
        total_timesteps = 0

        while not all([
                network.buffer.full(N=int(buffer_size / 2.))
                for network in networks
        ]):
            networks[0].buffer.games_played += 1
            print('Game number: %s. Buffer_sizes: %s' % (
                networks[0].buffer.games_played,
                [network.buffer.buffer_size for network in networks]))
            obs = env.reset()

            done_n = np.array([False] * env.n_actors)
            steps = 0
            length_alive = np.array([0] * env.n_actors)
            viewer = None
            while not done_n.all():

                length_alive[env.world.idxs_of_alive_snakes] += 1
                last_obs = obs

                acts = []
                for i, network in enumerate(networks):
                    act = network.greedy_select(
                        np.array([[x.A for x in get_data(last_obs, i)]]), 1.)
                    acts += [str(act[0])]

                # Next step
                obs, reward_n, done_n = env.step(acts)
                steps += 1

                for i in env.world.idxs_of_alive_snakes:
                    priority = networks[i].get_error(
                        np.array(get_data(last_obs, i)), np.array(acts[i]),
                        np.array(reward_n[i]), np.array(get_data(obs, i)),
                        np.array(done_n[i]))

                    networks[i].store(
                        np.array(get_data(last_obs, i)),  # state
                        np.array(acts[i]),  # action
                        np.array(reward_n[i]),  #rewards
                        np.array(get_data(obs, i)),  #new state
                        np.array(done_n[i]),  #done
                        priority=priority)

                    # networks[i].store_reservoir(np.array(get_data(last_obs, i)), # state
                    #                                     np.array(int(acts[i])))

                # terminate the collection of data if the controller shows stability
                # for a long time. This is a good thing.
                if steps > maximum_number_of_steps:
                    done_n[:] = True

        print('Filled Buffer')

        to_learn = np.array([0] * env.n_actors)
        frames_seen = np.array([0] * env.n_actors)

        for iteration in range(iteration_offset,
                               iteration_offset + iterations + 1):
            print('{0} Iteration {1} {0}'.format('*' * 10, iteration))
            networks[0].buffer.soft_reset()
            timesteps_in_iteration = 0

            if (iteration % update_freq == 0):
                saver.save(
                    sess,
                    os.path.join(logdir, 'model-' + str(iteration) + '.cptk'))
                print "Saved Model. Timestep count: %s" % iteration

            total_number_of_steps_in_iteration = 0

            total_reward = np.array([0] * env.n_actors)

            while True:
                networks[0].buffer.games_played += 1
                if (((networks[0].buffer.games_played) % 10) == 0):
                    print('Epoch: %s. Game number: %s' % (
                        iteration, networks[0].buffer.games_played))
                obs = env.reset()

                # raw_observations = []
                # raw_observations.append(np.array(obs))

                animate_episode = ((networks[0].buffer.games_played - 1)
                                   == 0) and (iteration % update_freq
                                              == 0) and animate

                done_n = np.array([False] * env.n_actors)
                steps = 0

                # Runs policy, collects observations and rewards
                viewer = None

                length_alive = np.array([0] * env.n_actors)
                game_time = time.time()
                action_times = []
                learn_times = []

                select_from_average = np.array([True] * env.n_actors)

                for idx in range(select_from_average.shape[0]):
                    r = np.random.uniform()
                    eta = eta_schedule.value(iteration)
                    if (eta > 0) and (r <= eta):
                        select_from_average[idx] = False  # Sample from greedy

                while not done_n.all():

                    if animate_episode:
                        if (not viewer) and (not headless):
                            from gym.envs.classic_control import rendering
                            viewer = rendering.SimpleImageViewer()

                        rgb = env.render('rgb_array', headless=headless)
                        scaler = 10
                        rgb = repeat_upsample(rgb, scaler, scaler)

                        if not headless:

                            viewer.imshow(rgb)
                            time.sleep(.01)

                        monitor.add(rgb, iteration,
                                    networks[0].buffer.games_played)

                    length_alive[env.world.idxs_of_alive_snakes] += 1
                    to_learn[env.world.idxs_of_alive_snakes] += 1
                    # ob = get_data(np.array(raw_observations)[-2:])
                    last_obs = obs

                    # Control the exploration
                    acts = []
                    action_time = time.time()
                    for i, network in enumerate(networks):
                        if env.world.snakes[i].alive:
                            act = network.select_from_policy(
                                np.array([[x.A
                                           for x in get_data(last_obs, i)]]),
                                epsilon_schedule.value(iteration),
                                select_from_average[i])
                            acts += [str(act[0])]
                        else:
                            acts += [str(0)]

                    action_times.append(time.time() - action_time)
                    # Next step
                    obs, reward_n, done_n = env.step(acts)

                    total_reward += np.array(reward_n)

                    total_number_of_steps_in_iteration += 1
                    steps += 1

                    for i in env.world.idxs_of_alive_snakes:
                        priority = networks[i].get_error(
                            np.array(get_data(last_obs, i)), np.array(acts[i]),
                            np.array(reward_n[i]), np.array(get_data(obs, i)),
                            np.array(done_n[i]))

                        networks[i].store(
                            np.array(get_data(last_obs, i)),  # state
                            np.array(acts[i]),  # action
                            np.array(reward_n[i]),  #rewards
                            np.array(get_data(obs, i)),  #new state
                            np.array(done_n[i]),  #done
                            priority=priority)
                        if not select_from_average[i]:
                            networks[i].store_reservoir(
                                np.array(get_data(last_obs, i)),  # state
                                np.array(int(acts[i])))

                    # max: to cover all new steps added to buffer, min: to not overdo too much
                    learn_time = time.time()
                    for network_id in [
                            x for x in range(len(to_learn))
                            if to_learn[x] >= max(
                                networks[x].batch_size,
                                networks[x].avg_policy_batch_size)
                    ]:
                        to_learn[network_id] = 0
                        network = networks[network_id]
                        for _ in range(5):
                            frames_seen[network_id] += networks[
                                network_id].batch_size
                            if use_priority:
                                network.train_step(learning_rate_schedule,
                                                   beta_schedule)
                            else:
                                network.train_step(learning_rate_schedule)

                        for _ in range(5):
                            if network.reservoir.buffer_size > 0:
                                network.avg_policy_train_step(
                                    policy_learning_rate_schedule)

                    learn_times.append(time.time() - learn_time)
                    # terminate the collection of data if the controller shows stability
                    # for a long time. This is a good thing.
                    if steps > maximum_number_of_steps:
                        done_n[:] = True

                if viewer:
                    viewer.close()

                if networks[0].buffer.games_played >= 1:
                    break

            game_time = time.time() - game_time
            monitor.make_gifs(iteration)

            for count, writer in enumerate(summary_writers[:-1]):
                summary = tf.Summary()
                summary.value.add(tag='Average Reward',
                                  simple_value=(total_reward[count]))
                summary.value.add(tag='Steps Taken',
                                  simple_value=(length_alive[count]))
                summary.value.add(tag='Frames Seen',
                                  simple_value=frames_seen[count])
                writer.add_summary(summary, iteration)
                writer.flush()

            summary = tf.Summary()
            summary.value.add(tag='Time Elapsed/Game', simple_value=game_time)
            summary.value.add(tag='Time Elapsed/Total Actions',
                              simple_value=np.sum(action_times))
            summary.value.add(tag='Time Elapsed/Mean Actions',
                              simple_value=np.mean(action_times))
            summary.value.add(tag='Time Elapsed/Max Actions',
                              simple_value=np.max(action_times))
            summary.value.add(tag='Time Elapsed/Min Actions',
                              simple_value=np.min(action_times))
            summary.value.add(tag='Time Elapsed/Total Learn',
                              simple_value=np.sum(learn_times))
            summary.value.add(tag='Time Elapsed/Mean Learn',
                              simple_value=np.mean(learn_times))
            summary.value.add(tag='Time Elapsed/Max Learn',
                              simple_value=np.max(learn_times))
            summary.value.add(tag='Time Elapsed/Min Learn',
                              simple_value=np.min(learn_times))
            summary_writers[-1].add_summary(summary, iteration)
            summary_writers[-1].flush()

            print(game_time, sum(action_times), sum(learn_times))
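
# A small self-contained sketch (assumption: the baselines-style PiecewiseSchedule
# linearly interpolates between its (step, value) endpoints and returns
# outside_value beyond them) of the annealing behaviour that the epsilon / eta /
# learning-rate schedules above rely on.
def piecewise_value(step, endpoints, outside_value):
    """Linear interpolation between consecutive (step, value) endpoints."""
    for (s0, v0), (s1, v1) in zip(endpoints[:-1], endpoints[1:]):
        if s0 <= step < s1:
            alpha = (step - s0) / float(s1 - s0)
            return v0 + alpha * (v1 - v0)
    return outside_value

# e.g. the epsilon schedule above, halfway to its first breakpoint:
# piecewise_value(25000, [(0, .2), (50000, .05), (75000, .01)], outside_value=.01) -> 0.125
Exemplo n.º 19
0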
class SET_MLP:
    def __init__(self, dimensions, activations, epsilon=20):
        """
        :param dimensions: (tpl/ list) Dimensions of the neural net. (input, hidden layer, output)
        :param activations: (tpl/ list) Activations functions.

        Example of a network with three hidden layers:
        - 3312 input features
        - 3000 hidden neurons
        - 3000 hidden neurons
        - 3000 hidden neurons
        - 5 output classes


        layers -->    [1,        2,     3,     4,     5]
        ----------------------------------------

        dimensions =  (3312,     3000,  3000,  3000,  5)
        activations = (          Relu,  Relu,  Relu,  Sigmoid)
        """
        self.n_layers = len(dimensions)
        self.loss = None
        self.dropout_rate = 0.  # dropout rate
        self.learning_rate = None
        self.momentum = None
        self.weight_decay = None
        self.epsilon = epsilon  # control the sparsity level as discussed in the paper
        self.zeta = None  # the fraction of the weights removed
        self.dimensions = dimensions

        self.save_filename = ""
        self.input_layer_connections = []
        self.monitor = None

        # Weights and biases are initiated by index. For a one hidden layer net you will have a w[1] and w[2]
        self.w = {}
        self.b = {}
        self.pdw = {}
        self.pdd = {}

        # Activations are also initiated by index. For the example we will have activations[2] and activations[3]
        self.activations = {}
        for i in range(len(dimensions) - 1):
            self.w[i + 1] = create_sparse_weights(
                self.epsilon, dimensions[i],
                dimensions[i + 1])  # create sparse weight matrices
            self.b[i + 1] = np.zeros(dimensions[i + 1], dtype='float32')
            self.activations[i + 2] = activations[i]

    def _feed_forward(self, x, drop=False):
        """
        Execute a forward feed through the network.
        :param x: (array) Batch of input data vectors.
        :return: (tpl) Node outputs and activations per layer. The numbering of the output is equivalent to the layer numbers.
        """
        # w(x) + b
        z = {}

        # activations: f(z)
        a = {
            1: x
        }  # No activation is applied to the first layer; a[1] is simply the raw input x.
        masks = {}

        for i in range(1, self.n_layers):
            z[i + 1] = a[i] @ self.w[i] + self.b[i]
            a[i + 1] = self.activations[i + 1].activation(z[i + 1])
            if drop:
                if i < self.n_layers - 1:
                    # apply dropout
                    a[i + 1], keep_mask = dropout(a[i + 1], self.dropout_rate)
                    masks[i + 1] = keep_mask

        return z, a, masks
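
    # A minimal sketch (assumption: the dropout() helper used above, defined
    # elsewhere in this file, implements "inverted dropout" and returns both the
    # rescaled activations and the binary keep mask so that _back_prop can reuse
    # the same mask); illustrative only, not the original helper.
    @staticmethod
    def _dropout_sketch(x, rate):
        keep_prob = 1.0 - rate
        keep_mask = (np.random.rand(*x.shape) < keep_prob).astype(x.dtype)
        return (x * keep_mask) / keep_prob, keep_mask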

    def _back_prop(self, z, a, masks, y_true):
        """
        The input dicts keys represent the layers of the net.

        a = { 1: x,
              2: f(w1(x) + b1)
              3: f(w2(a2) + b2)
              4: f(w3(a3) + b3)
              5: f(w4(a4) + b4)
              }

        :param z: (dict) w(x) + b
        :param a: (dict) f(z)
        :param y_true: (array) One hot encoded truth vector.
        :return:
        """
        keep_prob = 1.
        if self.dropout_rate > 0:
            keep_prob = np.float32(1. - self.dropout_rate)

        # Determine partial derivative and delta for the output layer.
        # delta output layer
        delta = self.loss.delta(y_true, a[self.n_layers])
        dw = coo_matrix(self.w[self.n_layers - 1], dtype='float32')
        # compute backpropagation updates
        backpropagation_updates_numpy(a[self.n_layers - 1], delta, dw.row,
                                      dw.col, dw.data)

        update_params = {
            self.n_layers - 1: (dw.tocsr(), np.mean(delta, axis=0))
        }

        # In case of three layer net will iterate over i = 2 and i = 1
        # Determine partial derivative and delta for the rest of the layers.
        # Each iteration requires the delta from the previous layer, propagating backwards.
        for i in reversed(range(2, self.n_layers)):
            # dropout for the backpropagation step
            if keep_prob != 1:
                delta = (delta @ self.w[i].transpose()
                         ) * self.activations[i].prime(z[i])
                delta = delta * masks[i]
                delta /= keep_prob
            else:
                delta = (delta @ self.w[i].transpose()
                         ) * self.activations[i].prime(z[i])

            dw = coo_matrix(self.w[i - 1], dtype='float32')

            # compute backpropagation updates
            backpropagation_updates_numpy(a[i - 1], delta, dw.row, dw.col,
                                          dw.data)

            update_params[i - 1] = (dw.tocsr(), np.mean(delta, axis=0))
        for k, v in update_params.items():
            self._update_w_b(k, v[0], v[1])
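
    # A dense-NumPy sketch (assumption: backpropagation_updates_numpy, defined
    # elsewhere in this file, fills `out` in place with the batch-mean gradient
    # restricted to the positions (rows[k], cols[k]) that already exist in the
    # sparse weight matrix); illustrative only.
    @staticmethod
    def _sparse_grad_sketch(a_prev, delta, rows, cols, out):
        for k in range(rows.shape[0]):
            # out[k] <- mean over the batch of a_prev[:, rows[k]] * delta[:, cols[k]]
            out[k] = (a_prev[:, rows[k]] * delta[:, cols[k]]).mean()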

    def _update_w_b(self, index, dw, delta):
        """
        Update weights and biases.

        :param index: (int) Number of the layer
        :param dw: (array) Partial derivatives
        :param delta: (array) Delta error.
        """

        # perform the update with momentum
        if index not in self.pdw:
            self.pdw[index] = -self.learning_rate * dw
            self.pdd[index] = -self.learning_rate * delta
        else:
            self.pdw[index] = self.momentum * self.pdw[
                index] - self.learning_rate * dw
            self.pdd[index] = self.momentum * self.pdd[
                index] - self.learning_rate * delta

        self.w[index] += self.pdw[index] - self.weight_decay * self.w[index]
        self.b[index] += self.pdd[index] - self.weight_decay * self.b[index]
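
    # The update above is classical momentum with an extra weight-decay term
    # applied directly to the parameters:
    #   pdw_t = momentum * pdw_{t-1} - learning_rate * dw
    #   w_t   = w_{t-1} + pdw_t - weight_decay * w_{t-1}
    # (and the analogous recurrence for the biases via pdd and delta).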

    def fit(self,
            x,
            y_true,
            x_test,
            y_test,
            loss,
            epochs,
            batch_size,
            learning_rate=1e-3,
            momentum=0.9,
            weight_decay=0.0002,
            zeta=0.3,
            dropoutrate=0.,
            testing=True,
            save_filename="",
            monitor=False):
        """
        :param x: (array) Containing parameters
        :param y_true: (array) Containing one hot encoded labels.
        :return (array) A 2D array of metrics (epochs, 3).
        """
        if not x.shape[0] == y_true.shape[0]:
            raise ValueError("Length of x and y arrays don't match")

        self.monitor = Monitor(
            save_filename=save_filename) if monitor else None

        # Initiate the loss object with the final activation function
        self.loss = loss()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.zeta = zeta
        self.dropout_rate = dropoutrate
        self.save_filename = save_filename
        self.input_layer_connections.append(self.get_core_input_connections())
        np.savez_compressed(self.save_filename + "_input_connections.npz",
                            inputLayerConnections=self.input_layer_connections)

        maximum_accuracy = 0
        metrics = np.zeros((epochs, 4))

        for i in range(epochs):
            # Shuffle the data
            seed = np.arange(x.shape[0])
            np.random.shuffle(seed)
            x_ = x[seed]
            y_ = y_true[seed]

            if self.monitor:
                self.monitor.start_monitor()

            # training
            t1 = datetime.datetime.now()

            for j in range(x.shape[0] // batch_size):
                k = j * batch_size
                l = (j + 1) * batch_size
                z, a, masks = self._feed_forward(x_[k:l], True)

                self._back_prop(z, a, masks, y_[k:l])

            t2 = datetime.datetime.now()

            if self.monitor:
                self.monitor.stop_monitor()

            print("\nSET-MLP Epoch ", i)
            print("Training time: ", t2 - t1)

            # test model performance on the test data at each epoch
            # this part is useful to understand model performance and can be commented for production settings
            if testing:
                t3 = datetime.datetime.now()
                accuracy_test, activations_test = self.predict(x_test, y_test)
                accuracy_train, activations_train = self.predict(x, y_true)

                t4 = datetime.datetime.now()
                maximum_accuracy = max(maximum_accuracy, accuracy_test)
                loss_test = self.loss.loss(y_test, activations_test)
                loss_train = self.loss.loss(y_true, activations_train)
                metrics[i, 0] = loss_train
                metrics[i, 1] = loss_test
                metrics[i, 2] = accuracy_train
                metrics[i, 3] = accuracy_test

                print(f"Testing time: {t4 - t3}\n; Loss test: {loss_test}; \n"
                      f"Accuracy test: {accuracy_test}; \n"
                      f"Maximum accuracy val: {maximum_accuracy}")

            t5 = datetime.datetime.now()
            if i < epochs - 1:  # do not change connectivity pattern after the last epoch
                # self.weights_evolution_I()  # this implementation is more didactic, but slow.
                self.weights_evolution_II()  # same behaviour as weights_evolution_I, but much faster.
            t6 = datetime.datetime.now()
            print("Weights evolution time ", t6 - t5)

            # save performance metrics values in a file
            if self.save_filename != "":
                np.savetxt(self.save_filename + ".txt", metrics)

            if self.save_filename != "" and self.monitor:
                with open(self.save_filename + "_monitor.json", 'w') as file:
                    file.write(
                        json.dumps(self.monitor.get_stats(),
                                   indent=4,
                                   sort_keys=True,
                                   default=str))

        return metrics

    def get_core_input_connections(self):
        values = np.sort(self.w[1].data)
        first_zero_pos = find_first_pos(values, 0)
        last_zero_pos = find_last_pos(values, 0)

        largest_negative = values[int((1 - self.zeta) * first_zero_pos)]
        smallest_positive = values[int(
            min(values.shape[0] - 1, last_zero_pos + self.zeta *
                (values.shape[0] - last_zero_pos)))]

        wlil = self.w[1].tolil()
        wdok = dok_matrix((self.dimensions[0], self.dimensions[1]),
                          dtype="float32")

        # remove the weights closest to zero
        keep_connections = 0
        for ik, (row, data) in enumerate(zip(wlil.rows, wlil.data)):
            for jk, val in zip(row, data):
                if (val < largest_negative) or (val > smallest_positive):
                    wdok[ik, jk] = val
                    keep_connections += 1
        return wdok.tocsr().getnnz(axis=1)
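
    # Illustration of the zeta-based pruning used in get_core_input_connections
    # above and in the weights_evolution methods below: with a layer's weights
    # sorted, largest_negative sits roughly (1 - zeta) of the way through the
    # negative block and smallest_positive roughly zeta of the way into the
    # positive block, so approximately the zeta fraction of connections with the
    # smallest magnitudes falls between the two thresholds and is removed; the
    # same number of new connections is then added back at random free positions.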

    def weights_evolution_I(self):
        # this represents the core of the SET procedure. It removes the weights closest to zero in each layer and adds new random weights
        for i in range(1, self.n_layers - 1):

            values = np.sort(self.w[i].data)
            first_zero_pos = find_first_pos(values, 0)
            last_zero_pos = find_last_pos(values, 0)

            largest_negative = values[int((1 - self.zeta) * first_zero_pos)]
            smallest_positive = values[int(
                min(
                    values.shape[0] - 1, last_zero_pos + self.zeta *
                    (values.shape[0] - last_zero_pos)))]

            wlil = self.w[i].tolil()
            pdwlil = self.pdw[i].tolil()
            wdok = dok_matrix((self.dimensions[i - 1], self.dimensions[i]),
                              dtype="float32")
            pdwdok = dok_matrix((self.dimensions[i - 1], self.dimensions[i]),
                                dtype="float32")

            # remove the weights closest to zero
            keep_connections = 0
            for ik, (row, data) in enumerate(zip(wlil.rows, wlil.data)):
                for jk, val in zip(row, data):
                    if (val < largest_negative) or (val > smallest_positive):
                        wdok[ik, jk] = val
                        pdwdok[ik, jk] = pdwlil[ik, jk]
                        keep_connections += 1
            limit = np.sqrt(6. / float(self.dimensions[i]))

            # add new random connections
            for kk in range(self.w[i].data.shape[0] - keep_connections):
                ik = np.random.randint(0, self.dimensions[i - 1])
                jk = np.random.randint(0, self.dimensions[i])
                while (wdok[ik, jk] != 0):
                    ik = np.random.randint(0, self.dimensions[i - 1])
                    jk = np.random.randint(0, self.dimensions[i])
                wdok[ik, jk] = np.random.uniform(-limit, limit)
                pdwdok[ik, jk] = 0

            self.pdw[i] = pdwdok.tocsr()
            self.w[i] = wdok.tocsr()

    def weights_evolution_II(self):
        # this represents the core of the SET procedure. It removes the weights closest to zero in each layer and adds new random weights
        # improved running time using numpy routines - Amarsagar Reddy Ramapuram Matavalam ([email protected])
        for i in range(1, self.n_layers - 1):
            # uncomment line below to stop evolution of dense weights more than 80% non-zeros
            # if self.w[i].count_nonzero() / (self.w[i].get_shape()[0]*self.w[i].get_shape()[1]) < 0.8:
            t_ev_1 = datetime.datetime.now()
            # converting to COO form - Added by Amar
            wcoo = self.w[i].tocoo()
            vals_w = wcoo.data
            rows_w = wcoo.row
            cols_w = wcoo.col

            pdcoo = self.pdw[i].tocoo()
            vals_pd = pdcoo.data
            rows_pd = pdcoo.row
            cols_pd = pdcoo.col
            # print("Number of non zeros in W and PD matrix before evolution in layer",i,[np.size(valsW), np.size(valsPD)])
            values = np.sort(self.w[i].data)
            first_zero_pos = find_first_pos(values, 0)
            last_zero_pos = find_last_pos(values, 0)

            largest_negative = values[int((1 - self.zeta) * first_zero_pos)]
            smallest_positive = values[int(
                min(
                    values.shape[0] - 1, last_zero_pos + self.zeta *
                    (values.shape[0] - last_zero_pos)))]

            #remove the weights (W) closest to zero and modify PD as well
            vals_w_new = vals_w[(vals_w > smallest_positive) |
                                (vals_w < largest_negative)]
            rows_w_new = rows_w[(vals_w > smallest_positive) |
                                (vals_w < largest_negative)]
            cols_w_new = cols_w[(vals_w > smallest_positive) |
                                (vals_w < largest_negative)]

            new_w_row_col_index = np.stack((rows_w_new, cols_w_new), axis=-1)
            old_pd_row_col_index = np.stack((rows_pd, cols_pd), axis=-1)

            new_pd_row_col_index_flag = array_intersect(
                old_pd_row_col_index,
                new_w_row_col_index)  # careful about order

            vals_pd_new = vals_pd[new_pd_row_col_index_flag]
            rows_pd_new = rows_pd[new_pd_row_col_index_flag]
            cols_pd_new = cols_pd[new_pd_row_col_index_flag]

            self.pdw[i] = coo_matrix(
                (vals_pd_new, (rows_pd_new, cols_pd_new)),
                (self.dimensions[i - 1], self.dimensions[i])).tocsr()

            if i == 1:
                self.input_layer_connections.append(
                    coo_matrix(
                        (vals_w_new, (rows_w_new, cols_w_new)),
                        (self.dimensions[i - 1], self.dimensions[i])).getnnz(
                            axis=1))
                np.savez_compressed(
                    self.save_filename + "_input_connections.npz",
                    inputLayerConnections=self.input_layer_connections)

            # add new random connections
            keep_connections = np.size(rows_w_new)
            length_random = vals_w.shape[0] - keep_connections
            limit = np.sqrt(6. / float(self.dimensions[i - 1]))
            random_vals = np.random.uniform(-limit, limit, length_random)
            zero_vals = 0 * random_vals  # explicit zeros

            # keep resampling until none of the new (ik, jk) positions collide with an existing connection (the wdok[ik, jk] != 0 check in weights_evolution_I)
            while length_random > 0:
                ik = np.random.randint(0,
                                       self.dimensions[i - 1],
                                       size=length_random,
                                       dtype='int32')
                jk = np.random.randint(0,
                                       self.dimensions[i],
                                       size=length_random,
                                       dtype='int32')

                random_w_row_col_index = np.stack((ik, jk), axis=-1)
                random_w_row_col_index = np.unique(
                    random_w_row_col_index,
                    axis=0)  # removing duplicates in new rows&cols
                oldW_row_col_index = np.stack((rows_w_new, cols_w_new),
                                              axis=-1)

                unique_flag = ~array_intersect(
                    random_w_row_col_index,
                    oldW_row_col_index)  # careful about order & tilda

                ik_new = random_w_row_col_index[unique_flag][:, 0]
                jk_new = random_w_row_col_index[unique_flag][:, 1]
                # be careful - row size and col size needs to be verified
                rows_w_new = np.append(rows_w_new, ik_new)
                cols_w_new = np.append(cols_w_new, jk_new)

                length_random = vals_w.shape[0] - np.size(
                    rows_w_new)  # this will constantly reduce lengthRandom

            # adding all the values along with corresponding row and column indices - Added by Amar
            vals_w_new = np.append(
                vals_w_new,
                random_vals)  # be careful - we can add to an existing link ?
            # vals_pd_new = np.append(vals_pd_new, zero_vals) # be careful - adding explicit zeros - any reason??
            if vals_w_new.shape[0] != rows_w_new.shape[0]:
                print("Warning: mismatch between weight values and index arrays after adding random connections")
            self.w[i] = coo_matrix(
                (vals_w_new, (rows_w_new, cols_w_new)),
                (self.dimensions[i - 1], self.dimensions[i])).tocsr()

            # print("Number of non zeros in W and PD matrix after evolution in layer",i,[(self.w[i].data.shape[0]), (self.pdw[i].data.shape[0])])

            t_ev_2 = datetime.datetime.now()
            print("Weights evolution time for layer", i, "is", t_ev_2 - t_ev_1)

    def predict(self, x_test, y_test, batch_size=100):
        """
        :param x_test: (array) Test input
        :param y_test: (array) Correct test output
        :param batch_size: (int) Size of the prediction batches.
        :return: (flt) Classification accuracy
        :return: (array) A 2D array of shape (n_cases, n_classes).
        """
        activations = np.zeros((y_test.shape[0], y_test.shape[1]))
        for j in range(x_test.shape[0] // batch_size):
            k = j * batch_size
            l = (j + 1) * batch_size
            _, a_test, _ = self._feed_forward(x_test[k:l], drop=False)
            activations[k:l] = a_test[self.n_layers]
        accuracy = compute_accuracy(activations, y_test)
        return accuracy, activations
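
# A minimal usage sketch of the SET_MLP class above (illustrative only: the
# activation classes, the loss class and the data loading live elsewhere in this
# repository, so the names Relu, Sigmoid, CrossEntropy, x_train, ... are
# assumptions here, not part of this file):
#
#   set_mlp = SET_MLP(dimensions=(3312, 3000, 3000, 3000, 5),
#                     activations=(Relu, Relu, Relu, Sigmoid),
#                     epsilon=20)
#   metrics = set_mlp.fit(x_train, y_train, x_test, y_test, loss=CrossEntropy,
#                         epochs=100, batch_size=128, learning_rate=1e-3,
#                         zeta=0.3, dropoutrate=0.2, testing=True,
#                         save_filename="results/set_mlp", monitor=False)
#   accuracy, _ = set_mlp.predict(x_test, y_test)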
Exemplo n.º 20
0
def train(env, agent, args):
    monitor = Monitor(train=True, spec="-{}".format(args.method))
    monitor.init_log(
        args.log, "m.{}_e.{}_n.{}".format(args.model, args.env_name,
                                          args.name))
    env.reset()
    initial_state = env.observe()
    for num_eps in range(args.episode_num):
        terminal = False
        env.reset()
        loss = 0
        cnt = 0
        act1 = 0
        act2 = 0
        tot_reward = 0
        tot_reward_nc = 0
        tot_reward_dist = 0
        mask = None
        next_mask = None
        probe = None
        if args.env_name == "dst":
            probe = FloatTensor([0.8, 0.2])
        elif args.env_name == "crp":
            probe = FloatTensor([0.5, 0.5])
        elif args.env_name in ['ft', 'ft5', 'ft7']:
            probe = FloatTensor([0.8, 0.2, 0.0, 0.0, 0.0, 0.0])

        while not terminal:
            t_now = time.time()
            state = env.observe()
            t_obs = time.time() - t_now
            t_now = time.time()
            if args.env_name == "crp":
                mask = env.env.get_action_out_mask()
            action = agent.act(state, mask=mask)
            t_policy = time.time() - t_now
            t_now = time.time()
            next_state, reward, terminal = env.step(action, step=0.5)
            t_step = time.time() - t_now
            if args.env_name == "crp":
                next_mask = env.env.get_action_out_mask()
            if args.log:
                monitor.add_log(state, action, reward, terminal, agent.w_kept)
            t_now = time.time()
            agent.memorize(state, action, next_state, reward, terminal, mask,
                           next_mask)
            t_mem = time.time() - t_now
            t_now = time.time()
            loss += agent.learn()
            t_learn = time.time() - t_now
            if terminal:
                # terminal = True
                t_now = time.time()
                agent.reset()
                t_reset = time.time() - t_now
            tot_reward = tot_reward + (probe.cpu().numpy().dot(reward))
            act1 += reward[0]
            act2 += reward[1]
            tot_reward_nc = tot_reward_nc + 1 - reward[0]
            tot_reward_dist = tot_reward_dist + env.env.get_distortion(
                absolute=True, tollerance=0) / 10
            cnt = cnt + 1

        # _, q = agent.predict(probe, initial_state=initial_state)

        # if args.env_name == "dst":
        #     act_1 = q[0, 3]
        #     act_2 = q[0, 1]
        if args.env_name == "crp":
            act_1 = act1
            act_2 = act2
        # elif args.env_name in ['ft', 'ft5', 'ft7']:
        # act_1 = q[0, 1]
        # act_2 = q[0, 0]

        # if args.method == "crl-naive":
        #     act_1 = act_1.data.cpu()
        #     act_2 = act_2.data.cpu()
        # elif args.method == "crl-envelope":
        #     act_1 = probe.dot(act_1.data)
        #     act_2 = probe.dot(act_2.data)
        # elif args.method == "crl-energy":
        #     act_1 = probe.dot(act_1.data)
        #     act_2 = probe.dot(act_2.data)
        print(
            "end of eps %d with total reward (1) %0.2f, the Q is %0.2f | %0.2f; loss: %0.4f;  total_nc: %0.2f; total_dist: %0.2f;beta : %0.2f;eps : %0.2f;"
            % (
                num_eps,
                tot_reward,
                act_1,
                act_2,
                # q__max,
                loss / cnt,
                tot_reward_nc,
                tot_reward_dist,
                agent.beta,
                agent.epsilon))
        # print("t_obs : %0.2f;t_policy : %0.2f;t_step : %0.2f;t_mem : %0.2f;t_learn : %0.2f;t_reset : %0.2f" % (
        #     t_obs,
        #     t_policy,
        #     t_step,
        #     t_mem,
        #     t_learn,
        #     t_reset,))

        monitor.update(
            num_eps,
            tot_reward,
            act_1,
            act_2,
            #    q__max,
            loss / cnt)
        if (num_eps) % 10 == 0:
            agent.save(
                args.save, "m.{}_e.{}_n.{}".format(args.model, args.env_name,
                                                   args.name))
            agent.save(
                args.save,
                "m.{}_e.{}_n.{}.ep{}".format(args.model, args.env_name,
                                             args.name, num_eps // 100))
Exemplo n.º 21
0
def train(env, agent, args):
    monitor = Monitor(train=True, spec="-{}".format(args.method))
    monitor.init_log(
        args.log, "m.{}_e.{}_n.{}".format(args.model, args.env_name,
                                          args.name))
    env.reset()
    for num_eps in range(args.episode_num):
        terminal = False
        env.reset()
        loss = 0
        cnt = 0
        tot_reward = 0

        probe = None
        if args.env_name == "dst":
            probe = FloatTensor([0.8, 0.2])
        elif args.env_name in ['ft', 'ft5', 'ft7']:
            probe = FloatTensor([0.8, 0.2, 0.0, 0.0, 0.0, 0.0])

        while not terminal:
            state = env.observe()
            action = agent.act(state)
            next_state, reward, terminal = env.step(action)
            if args.log:
                monitor.add_log(state, action, reward, terminal, agent.w_kept)
            agent.memorize(state, action, next_state, reward, terminal)
            loss += agent.learn()
            if cnt > 100:
                terminal = True
                agent.reset()
            tot_reward = tot_reward + (
                probe.cpu().numpy().dot(reward)) * np.power(args.gamma, cnt)
            cnt = cnt + 1

        _, q = agent.predict(probe)

        if args.env_name == "dst":
            act_1 = q[0, 3]
            act_2 = q[0, 1]
        elif args.env_name in ['ft', 'ft5', 'ft7']:
            act_1 = q[0, 1]
            act_2 = q[0, 0]

        if args.method == "crl-naive":
            act_1 = act_1.data.cpu()
            act_2 = act_2.data.cpu()
        elif args.method == "crl-envelope":
            act_1 = probe.dot(act_1.data)
            act_2 = probe.dot(act_2.data)
        elif args.method == "crl-energy":
            act_1 = probe.dot(act_1.data)
            act_2 = probe.dot(act_2.data)
        print(
            "end of eps %d with total reward (1) %0.2f, the Q is %0.2f | %0.2f; loss: %0.4f"
            % (
                num_eps,
                tot_reward,
                act_1,
                act_2,
                # q__max,
                loss / cnt))
        monitor.update(
            num_eps,
            tot_reward,
            act_1,
            act_2,
            #    q__max,
            loss / cnt)
    if (num_eps + 1) % 500 == 0:
        agent.save(
            args.save, "m.{}_e.{}_n.{}".format(args.model, args.env_name,
                                               args.name))
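
# A small self-contained sketch (illustrative; it mirrors how the loop above
# accumulates `tot_reward`) of scalarizing a multi-objective reward with a fixed
# preference ("probe") vector w and a discount factor gamma:
# G = sum_t gamma**t * dot(w, r_t).
def scalarized_return(reward_vectors, probe, gamma):
    return sum((gamma ** t) * sum(w * r for w, r in zip(probe, rv))
               for t, rv in enumerate(reward_vectors))

# e.g. with the "dst" probe above: 0.8*1.0 + 0.99 * 0.2*1.0 = 0.998
# scalarized_return([[1.0, 0.0], [0.0, 1.0]], probe=[0.8, 0.2], gamma=0.99)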
Exemplo n.º 22
0
def main(args):
    # process config
    c = Configs(args.config)
    ROOT = os.environ['TENSOROFLOW']
    output = c.option.get('output', 'examples/model/buf')
    model_directory = '%s/%s' % (ROOT, output)
    model_path = '%s/model' % model_directory
    dictionary_path = {
        'source': '%s/source_dictionary.pickle' % model_directory,
        'source_reverse':
        '%s/source_reverse_dictionary.pickle' % model_directory,
        'target': '%s/target_dictionary.pickle' % model_directory,
        'target_reverse':
        '%s/target_reverse_dictionary.pickle' % model_directory
    }
    PAD = c.const['PAD']
    BOS = c.const['BOS']
    EOS = c.const['EOS']
    train_step = c.option['train_step']
    max_time = c.option['max_time']
    batch_size = c.option['batch_size']
    vocabulary_size = c.option['vocabulary_size']
    input_embedding_size = c.option['embedding_size']
    hidden_units = c.option['hidden_units']
    layers = c.option['layers']
    source_train_data_path = c.data['source_train_data']
    target_train_data_path = c.data['target_train_data']
    source_valid_data_path = c.data['source_valid_data']
    target_valid_data_path = c.data['target_valid_data']
    source_test_data_path = c.data['source_test_data']
    target_test_data_path = c.data['target_test_data']

    # initialize output directory
    if pathlib.Path(model_directory).exists():
        print('Warning: model %s already exists.' % model_directory)
        print('Old model will be overwritten.')
        while True:
            print('Do you want to continue? [yes|no]')
            command = input('> ')
            if command == 'yes':
                shutil.rmtree(model_directory)
                break
            elif command == 'no':
                sys.exit()
            else:
                print('You can only input "yes" or "no".')

    print('Make new model: %s' % model_directory)
    pathlib.Path(model_directory).mkdir()

    # read data
    if args.mode == 'train':
        source_dictionary, source_reverse_dictionary = build_dictionary(
            read_words(source_train_data_path), vocabulary_size)
        source_train_datas = [
            sentence_to_onehot(lines, source_dictionary)
            for lines in read_data(source_train_data_path)
        ]
        target_dictionary, target_reverse_dictionary = build_dictionary(
            read_words(target_train_data_path), vocabulary_size)
        target_train_datas = [
            sentence_to_onehot(lines, target_dictionary)
            for lines in read_data(target_train_data_path)
        ]

        source_valid_datas = [
            sentence_to_onehot(lines, source_dictionary)
            for lines in read_data(source_valid_data_path)
        ]
        target_valid_datas = [
            sentence_to_onehot(lines, target_dictionary)
            for lines in read_data(target_valid_data_path)
        ]

        if args.debug:
            source_train_datas = source_train_datas[:1000]
            target_train_datas = target_train_datas[:1000]
    else:
        with open(dictionary_path['source'], 'rb') as f1, \
             open(dictionary_path['source_reverse'], 'rb') as f2, \
             open(dictionary_path['target'], 'rb') as f3, \
             open(dictionary_path['target_reverse'], 'rb') as f4:
            source_dictionary = pickle.load(f1)
            source_reverse_dictionary = pickle.load(f2)
            target_dictionary = pickle.load(f3)
            target_reverse_dictionary = pickle.load(f4)

    source_test_datas = [
        sentence_to_onehot(lines, source_dictionary)
        for lines in read_data(source_test_data_path)
    ]
    target_test_datas = [
        sentence_to_onehot(lines, target_dictionary)
        for lines in read_data(target_test_data_path)
    ]

    # placeholder
    encoder_inputs = tf.placeholder(shape=(None, None),
                                    dtype=tf.int32,
                                    name='encoder_inputs')
    decoder_inputs = tf.placeholder(shape=(None, None),
                                    dtype=tf.int32,
                                    name='decoder_inputs')
    decoder_labels = tf.placeholder(shape=(None, None),
                                    dtype=tf.int32,
                                    name='decoder_labels')

    # embed
    embeddings = tf.Variable(tf.random_uniform(
        [vocabulary_size, input_embedding_size], -1.0, 1.0),
                             dtype=tf.float32,
                             name='embeddings')
    encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings,
                                                     encoder_inputs)
    decoder_inputs_embedded = tf.nn.embedding_lookup(embeddings,
                                                     decoder_inputs)

    # encoder with bidirection
    encoder_units = hidden_units
    encoder_layers_fw = [
        tf.contrib.rnn.LSTMCell(size) for size in [encoder_units] * layers
    ]
    encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(encoder_layers_fw)
    encoder_layers_bw = [
        tf.contrib.rnn.LSTMCell(size) for size in [encoder_units] * layers
    ]
    encoder_cell_bw = tf.contrib.rnn.MultiRNNCell(encoder_layers_bw)
    (encoder_output_fw,
     encoder_output_bw), encoder_state = tf.nn.bidirectional_dynamic_rnn(
         encoder_cell_fw,
         encoder_cell_bw,
         encoder_inputs_embedded,
         dtype=tf.float32,
         time_major=True)
    encoder_outputs = tf.concat((encoder_output_fw, encoder_output_bw), 2)
    encoder_state = tuple(
        tf.contrib.rnn.LSTMStateTuple(
            tf.concat((encoder_state[0][layer].c,
                       encoder_state[1][layer].c), 1),
            tf.concat((encoder_state[0][layer].h,
                       encoder_state[1][layer].h), 1))
        for layer in range(layers))

    # decoder with attention
    decoder_units = encoder_units * 2
    attention_units = decoder_units
    decoder_layers = [
        tf.contrib.rnn.LSTMCell(size) for size in [decoder_units] * layers
    ]
    cell = tf.contrib.rnn.MultiRNNCell(decoder_layers)

    sequence_length = tf.cast([max_time] * batch_size, dtype=tf.int32)
    beam_width = 1
    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
        encoder_outputs, multiplier=beam_width)
    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
        encoder_state, multiplier=beam_width)
    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
        sequence_length, multiplier=beam_width)
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        num_units=attention_units,
        memory=tiled_encoder_outputs,
        memory_sequence_length=tiled_sequence_length)
    attention_cell = tf.contrib.seq2seq.AttentionWrapper(
        cell, attention_mechanism, attention_layer_size=256)
    decoder_initial_state = attention_cell.zero_state(dtype=tf.float32,
                                                      batch_size=batch_size *
                                                      beam_width)
    decoder_initial_state = decoder_initial_state.clone(
        cell_state=tiled_encoder_final_state)

    if args.mode == 'train':
        helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=decoder_inputs_embedded,
            sequence_length=tf.cast([max_time] * batch_size, dtype=tf.int32),
            time_major=True)
    elif args.mode == 'eval':
        """
    helper = tf.contrib.seq2seq.TrainingHelper(
      inputs=decoder_inputs_embedded,
      sequence_length=tf.cast([max_time] * batch_size, dtype=tf.int32),
      time_major=True)
    """
        helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            embedding=embeddings,
            start_tokens=tf.tile([BOS], [batch_size]),
            end_token=EOS)

    decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=attention_cell,
        helper=helper,
        initial_state=decoder_initial_state)
    decoder_outputs = tf.contrib.seq2seq.dynamic_decode(
        decoder=decoder,
        output_time_major=True,
        impute_finished=False,
        maximum_iterations=max_time)

    decoder_logits = tf.contrib.layers.linear(decoder_outputs[0][0],
                                              vocabulary_size)
    decoder_prediction = tf.argmax(
        decoder_logits, 2)  # max_time: axis=0, batch: axis=1, vocab: axis=2

    # optimizer
    stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(decoder_labels,
                          depth=vocabulary_size,
                          dtype=tf.float32),
        logits=decoder_logits,
    )

    loss = tf.reduce_mean(stepwise_cross_entropy)
    regularizer = 0.0 * tf.nn.l2_loss(decoder_outputs[0][0])
    train_op = tf.train.AdamOptimizer().minimize(loss + regularizer)

    saver = tf.train.Saver()
    minibatch_idx = {'train': 0, 'valid': 0, 'test': 0}
    with tf.Session() as sess:
        if args.mode == 'train':
            # train
            global_max_step = train_step * (
                len(source_train_datas) // batch_size + 1)
            loss_freq = global_max_step // 100 if global_max_step > 100 else 1
            loss_log = []
            batch_loss_log = []
            loss_suffix = ''
            es = EarlyStopper(max_size=5, edge_threshold=0.1)
            m = Monitor(global_max_step)
            log = Logger('%s/log' % model_directory)
            sess.run(tf.global_variables_initializer())
            global_step = 0
            stop_flag = False
            for batch in range(train_step):
                if stop_flag:
                    break
                current_batch_loss_log = []
                while True:  # minibatch process
                    m.monitor(global_step, loss_suffix)
                    source_train_batch, _ = batchnize(source_train_datas,
                                                      batch_size,
                                                      minibatch_idx['train'])
                    target_train_batch, minibatch_idx['train'] = batchnize(
                        target_train_datas, batch_size, minibatch_idx['train'])
                    batch_data = seq2seq(source_train_batch,
                                         target_train_batch,
                                         max_time,
                                         vocabulary_size,
                                         reverse=True)
                    feed_dict = {
                        encoder_inputs: batch_data['encoder_inputs'],
                        decoder_inputs: batch_data['decoder_inputs'],
                        decoder_labels: batch_data['decoder_labels']
                    }
                    sess.run(fetches=[train_op, loss], feed_dict=feed_dict)

                    if global_step % loss_freq == 0:
                        source_valid_batch, _ = batchnize(
                            source_valid_datas, batch_size,
                            minibatch_idx['valid'])
                        target_valid_batch, minibatch_idx['valid'] = batchnize(
                            target_valid_datas, batch_size,
                            minibatch_idx['valid'])
                        batch_data = seq2seq(source_valid_batch,
                                             target_valid_batch,
                                             max_time,
                                             vocabulary_size,
                                             reverse=True)
                        feed_dict = {
                            encoder_inputs: batch_data['encoder_inputs'],
                            decoder_inputs: batch_data['decoder_inputs'],
                            decoder_labels: batch_data['decoder_labels']
                        }
                        loss_val = sess.run(fetches=loss, feed_dict=feed_dict)
                        loss_log.append(loss_val)
                        current_batch_loss_log.append(loss_val)
                        loss_suffix = 'loss: %f' % loss_val
                    global_step += 1
                    if minibatch_idx['train'] == 0:
                        batch_loss = np.mean(current_batch_loss_log)
                        batch_loss_log.append(batch_loss)
                        loss_msg = 'Batch: {}/{}, batch loss: {}'.format(
                            batch + 1, train_step, batch_loss)
                        print(loss_msg)
                        log(loss_msg)
                        es_status = es(batch_loss)
                        if batch > train_step // 2 and es_status:
                            print('early stopping at step: %d' % global_step)
                            stop_flag = True
                        break

            # save tf.graph and variables
            saver.save(sess, model_path)
            print('save at %s' % model_path)

            # save plot of loss
            plt.plot(np.arange(len(loss_log)) * loss_freq, loss_log)
            plt.savefig('%s_global_loss.png' % model_path)
            plt.figure()
            plt.plot(np.arange(len(batch_loss_log)), batch_loss_log)
            plt.savefig('%s_batch_loss.png' % model_path)

            # save dictionary
            with open(dictionary_path['source'], 'wb') as f1, \
                 open(dictionary_path['source_reverse'], 'wb') as f2, \
                 open(dictionary_path['target'], 'wb') as f3, \
                 open(dictionary_path['target_reverse'], 'wb') as f4:
                pickle.dump(source_dictionary, f1)
                pickle.dump(source_reverse_dictionary, f2)
                pickle.dump(target_dictionary, f3)
                pickle.dump(target_reverse_dictionary, f4)

        elif args.mode == 'eval':
            saver.restore(sess, model_path)
            print('load from %s' % model_path)

        else:
            raise ValueError('args.mode should be "train" or "eval"')

        # evaluate
        loss_val = []
        input_vectors = None
        predict_vectors = None
        for i in range(len(source_test_datas) // batch_size + 1):
            source_test_batch, _ = batchnize(source_test_datas, batch_size,
                                             minibatch_idx['test'])
            target_test_batch, minibatch_idx['test'] = batchnize(
                target_test_datas, batch_size, minibatch_idx['test'])
            batch_data = seq2seq(source_test_batch,
                                 target_test_batch,
                                 max_time,
                                 vocabulary_size,
                                 reverse=True)
            feed_dict = {
                encoder_inputs: batch_data['encoder_inputs'],
                decoder_inputs: batch_data['decoder_inputs'],
                decoder_labels: batch_data['decoder_labels']
            }
            pred = sess.run(fetches=decoder_prediction, feed_dict=feed_dict)
            if predict_vectors is None:
                predict_vectors = pred.T
            else:
                predict_vectors = np.vstack((predict_vectors, pred.T))
            input_ = batch_data['encoder_inputs']
            if input_vectors is None:
                input_vectors = input_.T
            else:
                input_vectors = np.vstack((input_vectors, input_.T))
            loss_val.append(sess.run(fetches=loss, feed_dict=feed_dict))

        input_sentences = ''
        predict_sentences = ''
        ignore_token = EOS
        for i, (input_vector, predict_vector) in enumerate(
                zip(input_vectors[:len(source_test_datas)],
                    predict_vectors[:len(target_test_datas)])):
            input_sentences += ' '.join([
                source_reverse_dictionary[vector] for vector in input_vector
                if not vector == ignore_token
            ])
            predict_sentences += ' '.join([
                target_reverse_dictionary[vector] for vector in predict_vector
                if not vector == ignore_token
            ])
            if i < len(source_test_datas) - 1:
                input_sentences += '\n'
                predict_sentences += '\n'

        evaluate_input_path = '%s.evaluate_input' % model_path
        evaluate_predict_path = '%s.evaluate_predict' % model_path
        with open(evaluate_input_path, 'w') as f1, \
             open(evaluate_predict_path, 'w') as f2:
            f1.write(input_sentences)
            f2.write(predict_sentences)

        print('input sequences at {}'.format(evaluate_input_path))
        print('predict sequences at {}'.format(evaluate_predict_path))
        print('mean of loss: %f' % np.mean(loss_val))

    print('finish.')
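
# A minimal sketch (assumption: batchnize() is defined elsewhere in this
# repository; from its call sites above it appears to return a batch_size-sized
# slice of the data together with the next start index, wrapping to 0 once a
# full pass has been made) of a compatible helper; names are illustrative.
def batchnize_sketch(data, batch_size, start_idx):
    """Return (batch, next_start_idx), cycling through `data`."""
    end = start_idx + batch_size
    batch = data[start_idx:end]
    if end >= len(data):
        # wrap around: top the batch up from the beginning and signal a new epoch
        batch = batch + data[:end - len(data)]
        next_idx = 0
    else:
        next_idx = end
    return batch, next_idx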