# In[3]:

Encoder_max_len = 60
Decoder_max_len = 30
min_count = 3

# In[4]:

train_path = ["data/{}/train.csv".format(x) for x in ["length"]]
test_path = ["data/{}/test.csv".format(x) for x in ["length"]]

# In[5]:

print("### Loading Train Data ###")
data_agent = data_manager(train_path, train=True)

# In[6]:

print("### Loading Test Data ###")
test_agent = data_manager(test_path, train=False)

# ## Preprocessing and Padding

# In[7]:

idx_in_sen, idx_out_sen, mask_in, mask_out, length_in, idx2word, word2idx, remain_idx = transform_orig(
    [data_agent.orig_data, data_agent.out_sen],
    min_count=min_count,
    max_len=[Encoder_max_len, Decoder_max_len],
    path="Attn_ver1/tmp/tokenizer.pkl")
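# For reference, a minimal sketch of what transform_orig is assumed to do:
# keep only words seen at least min_count times, map sentences to indices,
# then truncate/pad to max_len with a 0/1 mask over real tokens. The names
# below (build_vocab, sen_to_idx, the <PAD>/<UNK> conventions) are
# illustrative assumptions, not the repo's actual helpers.

from collections import Counter

def build_vocab(sentences, min_count):
    # Count word frequencies and drop rare words below min_count.
    counts = Counter(w for sen in sentences for w in sen)
    idx2word = ["<PAD>", "<UNK>"] + [w for w, c in counts.items() if c >= min_count]
    word2idx = {w: i for i, w in enumerate(idx2word)}
    return idx2word, word2idx

def sen_to_idx(sen, word2idx, max_len):
    # Map words to indices (unknowns -> <UNK>), truncate, then right-pad with <PAD>.
    idx = [word2idx.get(w, word2idx["<UNK>"]) for w in sen][:max_len]
    mask = [1.0] * len(idx) + [0.0] * (max_len - len(idx))
    return idx + [word2idx["<PAD>"]] * (max_len - len(idx)), mask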
# ## Loading data

# In[3]:

input_file_name = sys.argv[1]
output_file_name = sys.argv[2]
try:
    model_index = sys.argv[3]
except IndexError:
    model_index = None

print("### Loading Test Data ###")
test_agent = data_manager(input_file_name, train=False)

# ## Preprocessing and Padding

# pickle.dump({"orig_word": [idx2word, word2idx]},
#             open(os.path.join(tmp_path, "tokenizer.pkl"), "wb"))
idx2word, word2idx = pickle.load(open(os.path.join(tmp_path, "tokenizer.pkl"), "rb"))["orig_word"]


##################################################################################################
#######################################  Building Model  ########################################
##################################################################################################

def Encoder(inputs, dim, name, init_state=None, t_len=20, reuse=False, stack_flag=False):
    cell = tf.contrib.rnn.LSTMCell(dim, name=name, reuse=reuse)
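# The Encoder body is truncated above. For reference, a minimal standalone
# sketch of how such an encoder typically continues, assuming the cell is
# unrolled with tf.nn.dynamic_rnn (TF 1.x API); this is an illustrative
# guess, not the repo's actual implementation.

import tensorflow as tf

def encoder_sketch(inputs, dim, name, init_state=None, reuse=False):
    cell = tf.contrib.rnn.LSTMCell(dim, name=name, reuse=reuse)
    # Unroll the cell over the time axis of `inputs` ([batch, time, feature]);
    # returns the per-step hidden states and the final LSTM state tuple.
    outputs, final_state = tf.nn.dynamic_rnn(
        cell, inputs, initial_state=init_state, dtype=tf.float32)
    return outputs, final_state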
exp_folder = "ALL_ver0"
model_path = "model_para"
tmp_path = "tmp"
log_path = "log"

exp_folder = exist_or_mkdir("./", exp_folder)
model_path = exist_or_mkdir(exp_folder, model_path)
tmp_path = exist_or_mkdir(exp_folder, tmp_path)
log_path = exist_or_mkdir(exp_folder, log_path)

max_len = 20
min_count = 3

print("\n### Loading Train Data ###")
data_agent = data_manager("data/all/train.csv", train=True)

print("\n### Loading Test Data ###")
test_agent = data_manager("data/all/test.csv", train=False)

## Preprocessing and Padding

print("\n### Preprocessing and Padding ###")
start_t = time.time()
idx_in_sen, idx_out_sen, mask_in, mask_out, idx2word, word2idx, remain_idx = transform_word(
    [data_agent.in_sen, data_agent.out_sen], min_count=min_count, max_len=max_len)
idx_gramma, idx2gramma, gramma2idx = transform_gramma(data_agent.gramma, remain_idx, max_len=max_len)
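# exist_or_mkdir is defined elsewhere in this repo; a minimal sketch of its
# assumed behavior (join parent and child, create the directory if it is
# missing, and return the joined path) for readers following along:

import os

def exist_or_mkdir_sketch(parent, child):
    path = os.path.join(parent, child)
    os.makedirs(path, exist_ok=True)  # no-op when the directory already exists
    return path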