def start_transaction(self, request):
    """Starts writing to the database."""
    if self._hotels is None:
        raise Exception("Hotel list is not set")
    else:
        self._chat = _User(request)
        self._operation = _Operations(self._sql.conn, self._chat)
        self._operation.insert(self._hotels, request)
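# --- Usage sketch (illustrative, not from the original code) ---
# `HotelBot`, `hotels`, and `request` below are assumptions; only start_transaction,
# _User, and _Operations appear in the method above. The hotel list must be set
# before the call, otherwise start_transaction raises.
#
#   bot = HotelBot(...)
#   bot._hotels = hotels               # however the class normally populates its hotel list
#   bot.start_transaction(request)     # wraps the request in _User and inserts via _Operations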
def f_create_data(self, args):
    self.m_min_occ = args.min_occ
    self.m_max_line = 1e8

    self.m_data_dir = args.data_dir
    self.m_data_name = args.data_name
    self.m_raw_data_file = args.data_file
    self.m_raw_data_path = os.path.join(self.m_data_dir, self.m_raw_data_file)

    self.m_vocab_file = self.m_data_name + "_vocab.json"
    ### to save new generated data
    self.m_data_file = "tokenized_" + self.m_data_name + "_pro.pickle"

    data = pd.read_pickle(self.m_raw_data_path)
    train_df = data["train"]
    valid_df = data["valid"]

    tokenizer = TweetTokenizer(preserve_case=False)

    train_reviews = train_df.review
    train_item_ids = train_df.itemid
    train_user_ids = train_df.userid

    valid_reviews = valid_df.review
    valid_item_ids = valid_df.itemid
    valid_user_ids = valid_df.userid

    vocab_obj = _Vocab()
    self._create_vocab(vocab_obj, train_reviews)

    # i = 0
    review_corpus = defaultdict(dict)
    item_corpus = defaultdict(dict)
    user_corpus = defaultdict(dict)
    user2uid = defaultdict()

    stop_word_ids = [vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in stopwords.words('english')]
    punc_ids = [vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in string.punctuation]

    print("loading train reviews")
    ss_time = datetime.datetime.now()

    non_informative_words = stop_word_ids + punc_ids
    # non_informative_words = stopwords.words()+string.punctuation
    print("non informative words num", len(non_informative_words))

    # print_index = 0
    for index, review in enumerate(train_reviews):
        if index > self.m_max_line:
            break

        item_id = train_item_ids.iloc[index]
        user_id = train_user_ids.iloc[index]

        words = tokenizer.tokenize(review)
        word_ids = [vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in words]

        # term frequencies, excluding stop words and punctuation
        word_tf_map = Counter(word_ids)
        new_word_tf_map = {}
        for word in word_tf_map:
            if word in non_informative_words:
                continue
            new_word_tf_map[word] = word_tf_map[word]

        # skip reviews with too few informative words
        informative_word_num = sum(new_word_tf_map.values())
        if informative_word_num < 5:
            continue

        review_id = len(review_corpus['train'])
        review_obj = _Review()
        review_obj.f_set_review(review_id, word_ids, new_word_tf_map, informative_word_num)
        # print_index += 1

        review_corpus["train"][review_id] = review_obj

        if user_id not in user_corpus:
            user_obj = _User()
            user_obj.f_set_user_id(user_id)
            user_corpus[user_id] = user_obj
            user2uid[user_id] = len(user2uid)

        uid = user2uid[user_id]
        user_obj = user_corpus[user_id]
        user_obj.f_add_review_id(review_id)

        if item_id not in item_corpus:
            item_obj = _Item()
            item_corpus[item_id] = item_obj
            item_obj.f_set_item_id(item_id)

        review_obj.f_set_user_item(uid, item_id)

        item_obj = item_corpus[item_id]
        item_obj.f_add_review_id(review_obj, review_id)

    e_time = datetime.datetime.now()
    print("load training duration", e_time - ss_time)
    print("load train review num", len(review_corpus["train"]))

    s_time = datetime.datetime.now()

    user_num = len(user_corpus)
    vocab_obj.f_set_user(user2uid)
    # vocab_obj.f_set_user_size(user_num)

    save_item_corpus = {}

    print("item num", len(item_corpus))
    # print_index = 0
    # print_review_index = 0
    # compute each item's language model and the RRe of its training reviews
    for item_id in item_corpus:
        item_obj = item_corpus[item_id]
        # s_time = datetime.datetime.now()
        item_obj.f_get_item_lm()

        for review_id in item_obj.m_review_id_list:
            review_obj = review_corpus["train"][review_id]
            item_obj.f_get_RRe(review_obj)

        if item_id not in save_item_corpus:
            save_item_corpus[item_id] = item_obj.m_avg_review_words

    print("loading valid reviews")
    for index, review in enumerate(valid_reviews):
        if index > self.m_max_line:
            break

        item_id = valid_item_ids.iloc[index]
        user_id = valid_user_ids.iloc[index]

        # keep only users and items already seen during training
        if user_id not in user2uid:
            continue

        if item_id not in save_item_corpus:
            continue

        words = tokenizer.tokenize(review)
        word_ids = [vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in words]

        word_tf_map = Counter(word_ids)
        new_word_tf_map = {}
        for word in word_tf_map:
            if word in non_informative_words:
                continue
            new_word_tf_map[word] = word_tf_map[word]

        informative_word_num = sum(new_word_tf_map.values())
        if informative_word_num < 5:
            continue

        review_id = len(review_corpus["valid"])
        review_obj = _Review()
        review_obj.f_set_review(review_id, word_ids, new_word_tf_map, informative_word_num)

        review_corpus["valid"][review_id] = review_obj

        uid = user2uid[user_id]
        review_obj.f_set_user_item(uid, item_id)

        item_obj = item_corpus[item_id]
        # print(len(item_corpus))
        item_obj.f_get_RRe(review_obj)

    print("load validate review num", len(review_corpus["valid"]))

    save_data = {"item": save_item_corpus, "review": review_corpus, "user": user_num}

    print("save data to ", self.m_data_file)
    data_pickle_file = os.path.join(self.m_data_dir, self.m_data_file)
    with open(data_pickle_file, "wb") as f:
        pickle.dump(save_data, f)

    vocab = dict(w2i=vocab_obj.m_w2i, i2w=vocab_obj.m_i2w, user2uid=vocab_obj.m_user2uid)
    with io.open(os.path.join(self.m_data_dir, self.m_vocab_file), 'wb') as vocab_file:
        data = json.dumps(vocab, ensure_ascii=False)
        vocab_file.write(data.encode('utf8', 'replace'))
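# --- Illustrative reload sketch (not part of the original code) ---
# Shows how the artifacts written above could be read back. The directory and the
# data_name value ("example") are assumptions; the key layout ("item", "review",
# "user", and w2i/i2w/user2uid) mirrors the save_data and vocab dicts built in
# f_create_data. Unpickling requires the module defining _Review to be importable.
#
#   import os, json, pickle
#
#   data_dir = "./data"          # assumed args.data_dir
#   data_name = "example"        # assumed args.data_name
#
#   with open(os.path.join(data_dir, "tokenized_" + data_name + "_pro.pickle"), "rb") as f:
#       saved = pickle.load(f)
#
#   item_avg_words = saved["item"]     # item_id -> m_avg_review_words
#   review_corpus = saved["review"]    # {"train": {rid: _Review}, "valid": {rid: _Review}}
#   user_num = saved["user"]           # number of distinct training users
#
#   with open(os.path.join(data_dir, data_name + "_vocab.json"), "rb") as f:
#       vocab = json.loads(f.read().decode("utf8"))
#   w2i, i2w, user2uid = vocab["w2i"], vocab["i2w"], vocab["user2uid"]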
def f_create_data(self, args):
    self.m_min_occ = args.min_occ
    self.m_max_line = 1e8

    self.m_data_dir = args.data_dir
    self.m_data_name = args.data_name
    self.m_raw_data_file = args.data_file
    self.m_raw_data_path = os.path.join(self.m_data_dir, self.m_raw_data_file)
    self.m_output_file = args.output_file

    # self.m_vocab_file = self.m_data_name+".vocab.json"
    self.m_vocab_file = "vocab.json"
    ### to save new generated data
    self.m_data_file = "tokenized_" + self.m_output_file
    # self.m_data_file = "tokenized_"+self.m_data_name+"_"+self.m_output_file
    # self.m_data_file = "tokenized_"+self.m_data_name+"_pro_v2.pickle"

    data = pd.read_pickle(self.m_raw_data_path)
    train_df = data["train"]
    valid_df = data["valid"]

    tokenizer = TweetTokenizer(preserve_case=False)

    train_reviews = train_df.review
    train_item_ids = train_df.itemid
    train_user_ids = train_df.userid

    valid_reviews = valid_df.review
    valid_item_ids = valid_df.itemid
    valid_user_ids = valid_df.userid

    vocab_obj = _Vocab()
    self.f_create_vocab(vocab_obj, train_reviews)

    # i = 0
    review_corpus = defaultdict(dict)
    item_corpus = defaultdict(dict)
    user_corpus = defaultdict(dict)
    global_user2uid = defaultdict()
    global_item2iid = defaultdict()

    stop_word_ids = [vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in stopwords.words('english')]
    punc_ids = [vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in string.punctuation]

    print("loading train reviews")
    ss_time = datetime.datetime.now()

    non_informative_words = stop_word_ids + punc_ids
    # non_informative_words = stopwords.words()+string.punctuation
    print("non informative words num", len(non_informative_words))

    ### load user words
    self.m_user_word_file = os.path.join(self.m_data_dir, args.user_word_file)
    self.f_load_user_word_score(vocab_obj, self.m_user_word_file)

    ### load item words
    self.m_item_word_file = os.path.join(self.m_data_dir, args.item_word_file)
    self.f_load_item_word_score(vocab_obj, self.m_item_word_file)

    print("user word", len(self.m_user_word_score_map))
    print("item word", len(self.m_item_word_score_map))

    for index, review in enumerate(train_reviews):
        if index > self.m_max_line:
            break

        item_id = train_item_ids.iloc[index]
        user_id = train_user_ids.iloc[index]

        words = tokenizer.tokenize(review)
        word_ids = [vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in words]

        # user-, item-, and local-level perturbed versions of the review
        new_review_user, new_review_item, new_review_local = self.f_get_perturb(vocab_obj, item_id, words, word_ids)

        review_id = len(review_corpus['train'])
        review_obj = _Review()
        review_obj.f_set_review(review_id, word_ids)
        # review_obj.f_set_review(review_id, word_ids, new_word_tf_map, informative_word_num)
        review_obj.f_set_pertub_review(new_review_user, new_review_item, new_review_local)

        review_corpus["train"][review_id] = review_obj

        if user_id not in user_corpus:
            user_obj = _User()
            user_obj.f_set_user_id(user_id)
            user_corpus[user_id] = user_obj
            global_user2uid[user_id] = len(global_user2uid)

        uid = global_user2uid[user_id]
        user_obj = user_corpus[user_id]
        user_obj.f_add_review_id(review_id)

        if item_id not in item_corpus:
            item_obj = _Item()
            item_corpus[item_id] = item_obj
            item_obj.f_set_item_id(item_id)
            global_item2iid[item_id] = len(global_item2iid)

        iid = global_item2iid[item_id]
        item_obj = item_corpus[item_id]
        item_obj.f_add_review_id(review_obj, review_id)

        review_obj.f_set_user_item(uid, iid)

    e_time = datetime.datetime.now()
    print("load training duration", e_time - ss_time)
    print("load train review num", len(review_corpus["train"]))

    s_time = datetime.datetime.now()

    user_num = len(user_corpus)
    vocab_obj.f_set_user(global_user2uid)

    print("item num", len(item_corpus))

    print("loading valid reviews")
    for index, review in enumerate(valid_reviews):
        if index > self.m_max_line:
            break

        item_id = valid_item_ids.iloc[index]
        user_id = valid_user_ids.iloc[index]

        # keep only users and items already seen during training
        if user_id not in global_user2uid:
            continue

        if item_id not in item_corpus:
            continue

        words = tokenizer.tokenize(review)
        word_ids = [vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in words]

        new_review_user, new_review_item, new_review_local = self.f_get_perturb(vocab_obj, item_id, words, word_ids)

        review_id = len(review_corpus["valid"])
        review_obj = _Review()
        review_obj.f_set_review(review_id, word_ids)
        # review_obj.f_set_review(review_id, word_ids, new_word_tf_map, informative_word_num)
        review_obj.f_set_pertub_review(new_review_user, new_review_item, new_review_local)

        review_corpus["valid"][review_id] = review_obj

        uid = global_user2uid[user_id]
        iid = global_item2iid[item_id]
        review_obj.f_set_user_item(uid, iid)

        item_obj = item_corpus[item_id]
        # print(len(item_corpus))
        # item_obj.f_get_RRe(review_obj)
        # item_obj.f_get_ARe(review_obj)

    print("load validate review num", len(review_corpus["valid"]))

    save_data = {"item": global_item2iid, "review": review_corpus, "user": global_user2uid}

    print("save data to ", self.m_data_file)
    data_pickle_file = os.path.join(self.m_data_dir, self.m_data_file)
    with open(data_pickle_file, "wb") as f:
        pickle.dump(save_data, f)

    vocab = dict(w2i=vocab_obj.m_w2i, i2w=vocab_obj.m_i2w, user2uid=vocab_obj.m_user2uid)
    with io.open(os.path.join(self.m_data_dir, self.m_vocab_file), 'wb') as vocab_file:
        data = json.dumps(vocab, ensure_ascii=False)
        vocab_file.write(data.encode('utf8', 'replace'))
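# --- Illustrative invocation sketch (not part of the original code) ---
# Shows how this f_create_data variant might be called. The attribute names on
# `args` mirror the fields read above; the owning class name (`DataCreator`) and
# all file names/paths are assumptions.
#
#   from argparse import Namespace
#
#   args = Namespace(
#       min_occ=5,                               # assumed minimum word-occurrence threshold
#       data_dir="./data",
#       data_name="example",
#       data_file="raw_reviews.pickle",          # pandas pickle holding "train"/"valid" DataFrames
#       output_file="example.pickle",            # written as "tokenized_example.pickle"
#       user_word_file="user_word_score.json",   # consumed by f_load_user_word_score
#       item_word_file="item_word_score.json",   # consumed by f_load_item_word_score
#   )
#
#   creator = DataCreator()                      # hypothetical class that defines f_create_data
#   creator.f_create_data(args)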