def transfer_texts_2_tfRecord(self, start_date, end_date, raw_feature_file_path, tf_record_file_path, column_names, label_name, need_feature_cols, negative_ratio=None, var_length_cols=None, col_preprocess_func=None):
    """Convert the per-day text files under raw_feature_file_path into
    tfRecord files under tf_record_file_path, for every date between
    start_date and end_date (inclusive).

    See transfer_single_text_2_tfRecord for parameter details.
    """
    raw_base = os.path.basename(raw_feature_file_path)
    tf_base = os.path.basename(tf_record_file_path)
    # Per-day file name templates, e.g. <folder>/<folder>_<date>[.tfrecord].
    raw_template = os.path.join(raw_feature_file_path, raw_base + "_%s")
    tf_template = os.path.join(tf_record_file_path, tf_base + "_%s.tfrecord")
    data_info_csv_path = os.path.join(tf_record_file_path, "data_info.csv")
    if not os.path.isdir(tf_record_file_path):
        os.makedirs(tf_record_file_path)
    for date in DateUtil.get_every_date(start_date, end_date):
        print(date)
        self.transfer_single_text_2_tfRecord(
            raw_template % date,
            tf_template % date,
            data_info_csv_path,
            column_names,
            label_name,
            need_feature_cols,
            negative_ratio,
            var_length_cols,
            col_preprocess_func,
        )
def load_wechat_2_dict(start_date, end_date):
    """Load the aggregated wechat segment files for every date in
    [start_date, end_date] into an ordered nested dict.

    Returns:
        OrderedDict mapping date -> OrderedDict mapping opp_id -> the
        JSON-decoded chat list for that opportunity. Malformed lines
        (wrong column count) and lines whose chat payload is not valid
        JSON are logged and skipped.
    """
    date_ls = DateUtil.get_every_date(start_date, end_date)
    dict_wechat = OrderedDict()
    # NOTE(review): wechat_segment_data_dir is a module-level path; assumed to
    # hold one "aggregated_wechat_segment_data_<date>" file per day — confirm.
    wechat_segment_data = os.path.join(wechat_segment_data_dir, "aggregated_wechat_segment_data_%s")
    log.info(date_ls)
    for tmp_date in date_ls:
        log.info(tmp_date)
        dict_wechat[tmp_date] = OrderedDict()
        with codecs.open(wechat_segment_data % tmp_date, "r", "utf-8") as fin:
            for line in fin:
                arr = line.strip().split("\t")
                if len(arr) != 2:
                    continue
                opp_id = arr[0].strip()
                chat_ls = arr[1].strip()
                try:
                    chat_ls = json.loads(chat_ls)
                # Narrowed from `Exception`: only JSON decoding is expected to
                # fail here (json.JSONDecodeError subclasses ValueError).
                except ValueError as e:
                    log.info(e)
                    continue
                dict_wechat[tmp_date][opp_id] = chat_ls
    return dict_wechat
def load_multi_day_order(start_date, end_date):
    """Merge the per-day order dicts over [start_date, end_date] into one dict.

    Dates are processed in descending order, so when an order appears on
    several days the entry from the earliest day (smallest order time)
    overwrites the later ones.
    """
    merged = {}
    for day in sorted(DateUtil.get_every_date(start_date, end_date), reverse=True):
        log.info(day)
        merged.update(load_one_day_order(day))
    return merged
def get_hist_wechat_segment(start_date, end_date):
    """Run the per-day wechat segment extraction for every date in
    [start_date, end_date], printing how long each day took."""
    for day in DateUtil.get_every_date(start_date, end_date):
        t0 = time.time()
        get_one_day_wechat_segment(day)
        elapsed = time.time() - t0
        print("{0} wechat segment cost time: {1}".format(day, elapsed))
def get_hist_wechat_tf_feature(start_date, end_date):
    """Extract wechat tf features day by day over [start_date, end_date],
    maintaining a rolling window of the past PAST_DAYS_LIMIT days of wechat
    segment data in memory (loaded once, then slid forward per day)."""
    date_ls = DateUtil.get_every_date(start_date, end_date)
    log.info("initial past n day wechat segment data...")
    # Seed the window: [start_date - PAST_DAYS_LIMIT, start_date].
    s_date = (datetime.strptime(start_date, "%Y%m%d") - timedelta(days=PAST_DAYS_LIMIT)).strftime("%Y%m%d")
    t_date = start_date
    log.info("load past days wechat segment data...")
    dict_wechat = load_wechat_2_dict(s_date, t_date)
    log.info("extract tf feature...")
    for tmp_date in date_ls:
        log.info("extract %s tf feature..." % tmp_date)
        start_time = time.time()
        get_one_day_wechat_tf_feature(tmp_date, dict_wechat)
        log.info("extract {0} wechat tf feature cost time:{1}".format(tmp_date, time.time() - start_time))
        # Slide the window: drop the day falling out of range, add the next day.
        del_date = (datetime.strptime(tmp_date, "%Y%m%d") - timedelta(days=PAST_DAYS_LIMIT)).strftime("%Y%m%d")
        add_date = (datetime.strptime(tmp_date, "%Y%m%d") + timedelta(days=1)).strftime("%Y%m%d")
        # String comparison works because both are zero-padded "%Y%m%d" dates.
        if add_date <= end_date:
            log.info("update past days wechat segment data [del %s, add %s]..." % (del_date, add_date))
            dict_wechat = update_wechat_dict(dict_wechat, del_date, add_date)
        log.info("=======" * 3)
def load_hist_wechat_record_dict(date):
    """Aggregate the past HISTORY_WECHAT_RECORD_DELTA_DAY days of wechat
    full-sentence records (days strictly before `date`) per opportunity id.

    Returns:
        defaultdict mapping opp_id -> {"stat_info": [student_chat_num,
        teacher_chat_num, all_chat_num] summed over the window,
        "chat_content": chat text concatenated in ascending date order}.
        Lines with a column count other than 5 are skipped.
    """
    # NOTE(review): the list factory never fires inside this function (entries
    # are pre-initialized below); kept because callers may rely on defaultdict
    # lookup behavior of the returned value — confirm before changing to {}.
    wechat_dict = defaultdict(list)
    start_date = DateUtil.get_relative_delta_time_str(
        date, day=-HISTORY_WECHAT_RECORD_DELTA_DAY)
    end_date = DateUtil.get_relative_delta_time_str(date, -1)
    # Ascending order so later days' chats are appended after earlier ones.
    date_ls = sorted(DateUtil.get_every_date(start_date, end_date))
    wechat_full_sentence_data_dir = os.path.join(PROJECT_DATA_DIR, "raw_data",
                                                 "wechat_full_sentence_data")
    wechat_full_sentence_data_file = os.path.join(
        wechat_full_sentence_data_dir, "wechat_full_sentence_data_%s")
    # Bug fix: the original loop variable was named `date`, shadowing (and
    # clobbering) the `date` parameter; renamed to `hist_date`.
    for hist_date in date_ls:
        log.info(hist_date)
        wechat_full_sentence_data = wechat_full_sentence_data_file % hist_date
        with codecs.open(wechat_full_sentence_data, 'r', 'utf-8') as fin:
            for line in fin:
                arr = line.strip().split("\t")
                if len(arr) != 5:
                    continue
                opp_id, student_chat_num, teacher_chat_num, all_chat_num, chat_content = arr
                student_chat_num, teacher_chat_num, all_chat_num = int(
                    student_chat_num), int(teacher_chat_num), int(all_chat_num)
                if opp_id not in wechat_dict:
                    wechat_dict[opp_id] = {
                        "stat_info": [0, 0, 0],
                        "chat_content": ""
                    }
                wechat_dict[opp_id]["chat_content"] = wechat_dict[opp_id][
                    "chat_content"] + chat_content
                wechat_dict[opp_id]["stat_info"] = [
                    x + y for x, y in
                    zip([student_chat_num, teacher_chat_num, all_chat_num],
                        wechat_dict[opp_id]["stat_info"])
                ]
    return wechat_dict
def gen_bench_mark_multi_day(start_date, end_date):
    """Build benchmark text files grouped by an opportunity's total student
    chat count for every date in [start_date, end_date], then convert each
    group's file to tfRecord format.

    Only chat counts present in the module-level `chat_num_ls` are kept;
    groups that end up empty produce no file.
    """
    bench_mark_text_file_tmp = os.path.join(bench_mark_text_file_path, "total_chat_num_%s",
                                            "total_chat_num_%s_%s")
    # Hoisted loop-invariant: this schema list was rebuilt for every input
    # line of every file in the original.
    column_names = ["label", "opp_id", "acc_id", "create_time",
                    "today_student_chat_num", "today_teacher_chat_num",
                    "today_total_chat_num", "hist_student_chat_num",
                    "hist_teacher_chat_num", "hist_total_chat_num",
                    "chat_content"]
    for date in DateUtil.get_every_date(start_date, end_date):
        print(date)
        # Group lines by total (today + historical) student chat count.
        source_file_path = os.path.join(raw_feature_path, "wechat_basic_feature_%s" % date)
        chat_group_dict = {x: [] for x in chat_num_ls}
        with codecs.open(source_file_path, 'r', 'utf-8') as fin:
            for line in fin:
                arr = line.strip().split("\t")
                tmp_dict = {key: value for key, value in zip(column_names, arr)}
                total_student_chat_num = int(tmp_dict["today_student_chat_num"]) + int(
                    tmp_dict["hist_student_chat_num"])
                if total_student_chat_num in chat_group_dict:
                    chat_group_dict[total_student_chat_num].append(line.strip())
        for chat_num, line_ls in chat_group_dict.items():
            if not line_ls:
                continue
            bench_mark_text_file = bench_mark_text_file_tmp % (chat_num, chat_num, date)
            tmp_text_folder_path = os.path.dirname(bench_mark_text_file)
            tmp_tf_record_folder_path = tmp_text_folder_path.replace("feature_file", "tf_record")
            if not os.path.isdir(tmp_text_folder_path):
                os.makedirs(tmp_text_folder_path)
            with codecs.open(bench_mark_text_file, "w", "utf-8") as fout:
                for line in line_ls:
                    fout.write(line + "\n")
            # Convert the benchmark text file just written into tfRecord format.
            tfRecorder.transfer_texts_2_tfRecord_default(date, date, tmp_text_folder_path,
                                                         tmp_tf_record_folder_path)
def get_hist_wechat_full_sentence(start_date, end_date):
    """Run the per-day wechat full-sentence extraction for every date in
    [start_date, end_date]."""
    date_ls = DateUtil.get_every_date(start_date, end_date)
    for tmp_date in date_ls:
        # Fixed copy-pasted log message: this function extracts full
        # sentences, not basic features.
        log.info("extract %s full sentence..." % tmp_date)
        get_one_day_wechat_full_sentence(tmp_date)