def _generate_inter_files(self):
    print("loading question_df...")
    question_df = DataSet.load_all_questions()
    corpus = question_df[self.data_space]

    # build the item -> index mapping; index 0 is reserved for the pad token
    print("generating item2idx...")
    sen_list = corpus.values.tolist()
    self.item2idx = {_PAD_: 0}
    for sen in sen_list:
        for word in sen.split():
            if word not in self.item2idx:
                self.item2idx[word] = len(self.item2idx)

    print("generating idx2item...")
    self.idx2item = {v: k for k, v in self.item2idx.items()}

    print("loading %s data..." % self.train_test)
    if self.train_test == "train":
        self.data_set = DataSet.load_train()
    else:
        self.data_set = DataSet.load_test()

    if self.data_space == "words":
        q1 = self.data_set["word_len_q1"]
        q2 = self.data_set["word_len_q2"]
    else:
        q1 = self.data_set["char_len_q1"]
        q2 = self.data_set["char_len_q2"]

    # group question pairs into length buckets so each pair is padded only
    # to its bucket's bound rather than to the global maximum length
    print("bucketing...")
    q_pair = list(zip(q1, q2))
    bucket = GreedyBucket()
    fit_res = bucket.fit(q_pair)
    self.buckets, self.bounds = bucket.get_split_results(fit_res, self.bucket_num)

    print("generating id vectors...")
    data_set_id_vectors = []
    for ind in range(self.data_set.shape[0]):
        cur_row = self.data_set.iloc[ind]
        cur_q1 = cur_row["q1"]
        cur_q1_items = question_df.loc[cur_q1][self.data_space].split()
        cur_q1_inds = [self.item2idx[x] for x in cur_q1_items]
        cur_q2 = cur_row["q2"]
        cur_q2_items = question_df.loc[cur_q2][self.data_space].split()
        cur_q2_inds = [self.item2idx[x] for x in cur_q2_items]
        # pad both questions of the pair to this row's bucket bound
        cur_bound = self.bounds[ind]
        q1_pad_len = cur_bound - len(cur_q1_inds)
        q2_pad_len = cur_bound - len(cur_q2_inds)
        if self.pad_prefix:
            cur_q1_padded = [0] * q1_pad_len + cur_q1_inds
            cur_q2_padded = [0] * q2_pad_len + cur_q2_inds
        else:
            cur_q1_padded = cur_q1_inds + [0] * q1_pad_len
            cur_q2_padded = cur_q2_inds + [0] * q2_pad_len
        data_set_id_vectors.append(cur_q1_padded + cur_q2_padded)
    # rows in different buckets have different lengths, so force an object
    # array to allow ragged rows while keeping fancy indexing below
    data_set_id_vectors = np.array(data_set_id_vectors, dtype=object)

    print("generating bucket_idx_vectors...")
    self.bucket_idx_vectors = {}
    for b, id_list in self.buckets.items():
        tmp = {}
        if self.train_test == "train":
            tmp["label"] = self.data_set["label"].iloc[id_list].values
        # rows within one bucket share the same bound, so this stacks to 2-D
        tmp["data"] = np.array(data_set_id_vectors[id_list].tolist())
        self.bucket_idx_vectors[b] = tmp
    print("finish generating inter files.")

    print("begin caching...")
    all_cached = {
        "item2idx": self.item2idx,
        "idx2item": self.idx2item,
        "buckets": self.buckets,
        "bounds": self.bounds,
        "bucket_idx_vectors": self.bucket_idx_vectors,
    }
    os.makedirs("./temp", exist_ok=True)
    with open(self._temp_file, "wb") as f:
        pickle.dump(all_cached, f)
    print("finish caching")
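# Illustrative only: a minimal sketch (not part of the original class) of how
# the cached bucket_idx_vectors built above could be consumed to yield padded
# mini-batches. `gen` is any object on which _generate_inter_files has run;
# `batch_size` here is a hypothetical parameter, not an attribute the source defines.
def _iter_bucket_batches(gen, batch_size=128):
    for b, blob in gen.bucket_idx_vectors.items():
        data = blob["data"]  # shape: (pairs_in_bucket, 2 * bucket_bound)
        for start in range(0, data.shape[0], batch_size):
            batch = data[start:start + batch_size]
            if gen.train_test == "train":
                yield batch, blob["label"][start:start + batch_size]
            else:
                yield batch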
def __init__(self, data_df, space, bucket_num, batch_size, is_prefix_pad, is_shuffle, is_test):
    assert space in ["words", "chars"]
    self.data_df = data_df
    self.space = space
    self.bucket_num = bucket_num
    self.batch_size = batch_size
    self.is_prefix_pad = is_prefix_pad
    self.is_shuffle = is_shuffle
    self.is_test = is_test

    if os.path.exists(self._temp_file):
        print("detected cached intermediate files...loading...")
        # class-level cache: hit the disk only once per process
        if DataGenerator.item2idx is None:
            with open(self._temp_file, "rb") as f:
                all_cached = pickle.load(f)
            DataGenerator.item2idx = all_cached["item2idx"]
            DataGenerator.idx2item = all_cached["idx2item"]
            DataGenerator.item_embed = all_cached["item_embed"]
            DataGenerator.q2idvec = all_cached["q2idvec"]
        print("finish")
    else:
        print("generating intermediate files...")
        DataGenerator.item2idx = {}
        DataGenerator.idx2item = {}
        DataGenerator.item_embed = {}
        DataGenerator.q2idvec = {}
        spaces = ["words", "chars"]
        question_df = DataSet.load_all_questions()
        all_qids = DataSet.load_all_unique_ids_train_test()
        for space in spaces:
            print("for", space)
            corpus = question_df[space]
            # item <-> index mapping tables
            w2i, i2w = self._get_item2id_id2item(corpus)
            DataGenerator.item2idx[space] = w2i
            DataGenerator.idx2item[space] = i2w

            # prepend an all-zero row for the pad token, then reorder the
            # embedding matrix so that row i is the embedding of item i2w[i]
            term_embed = DataSet.load_term_embed(space)
            embed_size = term_embed.shape[1]
            pad_embed = np.zeros((1, embed_size))
            all_embedding = np.vstack([pad_embed, term_embed])
            all_index = [_PAD_] + term_embed.index.values.tolist()
            all_embedding_df = pd.DataFrame(data=all_embedding, index=all_index)
            sorted_items = [i2w[i] for i in range(len(i2w))]
            DataGenerator.item_embed[space] = all_embedding_df.loc[sorted_items].values

            # map each question id to its vector of item indices
            tmp_q2idvec = {}
            for qid in all_qids:
                items = question_df.loc[qid][space].split()
                tmp_q2idvec[qid] = np.array([w2i[w] for w in items])
            DataGenerator.q2idvec[space] = tmp_q2idvec
        print("finish generating inter files.")

        print("begin caching...")
        all_cached = {
            "item2idx": DataGenerator.item2idx,
            "idx2item": DataGenerator.idx2item,
            "item_embed": DataGenerator.item_embed,
            "q2idvec": DataGenerator.q2idvec,
        }
        os.makedirs("./temp", exist_ok=True)
        with open(self._temp_file, "wb") as f:
            pickle.dump(all_cached, f)
        print("finish caching")
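# Illustrative only: a minimal sketch of constructing the generator, assuming
# DataSet.load_train() returns the pair DataFrame expected by data_df. The
# argument values below are placeholders, not the project's actual configuration.
if __name__ == "__main__":
    train_df = DataSet.load_train()
    gen = DataGenerator(
        data_df=train_df,
        space="words",        # or "chars"
        bucket_num=10,        # number of length buckets (placeholder)
        batch_size=128,
        is_prefix_pad=True,   # pad on the left rather than the right
        is_shuffle=True,
        is_test=False,
    )
    # the first construction builds and caches item2idx / idx2item /
    # item_embed / q2idvec under ./temp; later runs load from that cache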