def testSpawnThreads(self):
    """Spawn tasks for a */2-minute job and check the spawned task counts."""
    # Register one job that fires on every even minute, then build the index.
    self.jobmanager.loadJob(Job(1, "*/2 * * * *", "echo haha", 5))
    self.jobmanager.indexJobs()
    self.assertEqual(1, len(self.jobmanager.hour_index[15]))

    # From 15:30:00 up to 16:00:00 the */2 pattern matches 15 minute slots.
    spawned = self.jobmanager.spwanTasks(
        DateUtil.datetime("2014-02-18 15:30:00"),
        DateUtil.datetime("2014-02-18 16:00:00"))
    self.assertEqual(15, len(spawned))

    # Starting mid-minute (15:30:23) drops the 15:30 slot, leaving 14.
    spawned = self.jobmanager.spwanTasks(
        DateUtil.datetime("2014-02-18 15:30:23"),
        DateUtil.datetime("2014-02-18 16:00:00"))
    self.assertEqual(14, len(spawned))
def testSpawnThreads2(self):
    """Spawn tasks for an every-minute job over one hour (60 tasks)."""
    self.jobmanager.loadJob(Job(1, "* * * * *", "echo haha", 5))
    self.jobmanager.indexJobs()
    threads = self.jobmanager.spwanTasks(
        DateUtil.datetime("2014-02-19 15:00:00"),
        DateUtil.datetime("2014-02-19 16:00:00"))
    self.assertEqual(60, len(threads))
    # Fixed: was a Python-2-only `print` statement; the call form parses
    # under both Python 2 and Python 3 for a single argument.
    print(threads[0].args[0].get_exc_time())
def transfer_texts_2_tfRecord(self, start_date, end_date, raw_feature_file_path,
                              tf_record_file_path, column_names, label_name,
                              need_feature_cols, negative_ratio=None,
                              var_length_cols=None, col_preprocess_func=None):
    """Convert the per-day text files in the given date range to tfRecord files.

    See transfer_single_text_2_tfRecord for the meaning of the parameters.
    """
    # Daily file names are "<folder_basename>_<date>" inside each directory.
    text_folder = os.path.basename(raw_feature_file_path)
    record_folder = os.path.basename(tf_record_file_path)
    text_template = os.path.join(raw_feature_file_path, text_folder + "_%s")
    record_template = os.path.join(tf_record_file_path,
                                   record_folder + "_%s.tfrecord")
    data_info_csv_path = os.path.join(tf_record_file_path, "data_info.csv")

    if not os.path.isdir(tf_record_file_path):
        os.makedirs(tf_record_file_path)

    for date in DateUtil.get_every_date(start_date, end_date):
        print(date)
        self.transfer_single_text_2_tfRecord(
            text_template % date, record_template % date, data_info_csv_path,
            column_names, label_name, need_feature_cols, negative_ratio,
            var_length_cols, col_preprocess_func)
def load_wechat_2_dict(start_date, end_date):
    """Load per-day segmented wechat chats into {date: {opp_id: chat_list}}."""
    dict_wechat = OrderedDict()
    # wechat_segment_data_dir = os.path.join(home_dir, "project_data/xgb", "raw_data/aggregated_wechat_segment_data")
    data_template = os.path.join(wechat_segment_data_dir,
                                 "aggregated_wechat_segment_data_%s")
    dates = DateUtil.get_every_date(start_date, end_date)
    log.info(dates)
    for cur_date in dates:
        log.info(cur_date)
        day_dict = OrderedDict()
        dict_wechat[cur_date] = day_dict
        with codecs.open(data_template % cur_date, "r", "utf-8") as fin:
            for line in fin:
                fields = line.strip().split("\t")
                # Expect exactly: opp_id <TAB> json chat list.
                if len(fields) != 2:
                    continue
                opp_id = fields[0].strip()
                raw_chats = fields[1].strip()
                try:
                    parsed_chats = json.loads(raw_chats)
                except Exception as e:
                    log.info(e)
                    continue
                day_dict[opp_id] = parsed_chats
    return dict_wechat
def load_multi_day_order(start_date, end_date):
    """Merge the per-day order dicts over [start_date, end_date].

    Dates are walked newest-first so an opp appearing on several days keeps
    the value from the earliest day (the smallest order time wins).
    """
    merged = {}
    for cur_date in sorted(DateUtil.get_every_date(start_date, end_date),
                           reverse=True):
        log.info(cur_date)
        merged.update(load_one_day_order(cur_date))
    return merged
def get_hist_wechat_segment(start_date, end_date):
    """Run the per-day wechat segmentation for every date in the range, timing each day."""
    for cur_date in DateUtil.get_every_date(start_date, end_date):
        begin = time.time()
        get_one_day_wechat_segment(cur_date)
        print("{0} wechat segment cost time: {1}".format(cur_date,
                                                         time.time() - begin))
def testNextScheduleSectionOverLongBatch(self):
    '''
    Test get next schedule Section
    Test invalid over long interval
    '''
    self.engine._fillclock(11, 4)  # Get clock list [3, 7, 11, 15, 19, 23]
    # Fixed: the original wrapped the call in `try/except Exception: raise`,
    # which is a no-op -- the test silently passed even when the over-long
    # interval raised nothing. Assert the expected failure explicitly.
    self.assertRaises(Exception,
                      self.engine._getNextScheduleSection,
                      DateUtil.datetime('2014-02-14 15:06:34'), 23)
def update_employee(self, emp: Employee):
    """Update *emp* in the database when it differs from the stored row.

    Only updates when the date of birth validates; prints a status message
    for every outcome and never propagates an exception to the caller.
    """
    try:
        emp_result = self.fetch_employee_by_id(emp.emp_id)
        try:
            # Fixed: the success branch's print was commented out, leaving an
            # empty `if` body (a SyntaxError). Use `==` rather than calling
            # __eq__ directly, and make the no-change branch an explicit pass.
            if emp == emp_result:
                pass  # nothing changed; original treated this as silent success
            else:
                date_util = DateUtil()
                if date_util.check_date_of_birth(emp.dob):
                    self.db.execute_dynamic_query(
                        "update employee set fname = ?, lname =?, dob = ? , dept_id = ? where emp_id=?",
                        emp.fname, emp.lname, emp.dob, emp.dept.dept_id,
                        emp.emp_id)
                    self.db.connection.commit()
                    print("Successfully Updated Employee- " + emp.emp_id + "\n")
                else:
                    print("Sorry!! Unable to update Employee- " + emp.emp_id + "\n")
        except AttributeError:
            # Map the missing/invalid attribute to a user-facing message.
            detail = str(sys.exc_info()[1])
            if "fname" in detail:
                print("First Name cannot be null")
            elif "lname" in detail:
                print("Last Name cannot be null")
            elif "dob" in detail:
                print("Date of birth is not in correct format")
        except ValueError:
            print(sys.exc_info()[1])
    except Exception:  # narrowed from a bare except: keep Ctrl-C working
        print("Unable to update employee Id - " + str(emp.emp_id) +
              ". Check employee Id.\n")
        print(sys.exc_info())
        print("\n")
def load_hist_wechat_record_dict(date):
    """Aggregate the last HISTORY_WECHAT_RECORD_DELTA_DAY days of wechat chat
    (up to the day before *date*) into one dict keyed by opp_id.

    Each value holds:
      "stat_info":    [student_chat_num, teacher_chat_num, all_chat_num]
                      summed across the window
      "chat_content": the chat text concatenated in chronological order
    """
    # Fixed: was defaultdict(list), which was misleading -- values are dicts
    # and every access is guarded by an explicit membership check, so the
    # list default was never used. A plain dict has identical behavior here.
    wechat_dict = {}
    start_date = DateUtil.get_relative_delta_time_str(
        date, day=-HISTORY_WECHAT_RECORD_DELTA_DAY)
    end_date = DateUtil.get_relative_delta_time_str(date, -1)
    # Ascending order so chats from later days are appended after earlier ones.
    date_ls = sorted(DateUtil.get_every_date(start_date, end_date))
    wechat_full_sentence_data_dir = os.path.join(PROJECT_DATA_DIR, "raw_data",
                                                 "wechat_full_sentence_data")
    wechat_full_sentence_data_file = os.path.join(
        wechat_full_sentence_data_dir, "wechat_full_sentence_data_%s")
    # Fixed: the loop variable no longer shadows the `date` parameter.
    for cur_date in date_ls:
        log.info(cur_date)
        wechat_full_sentence_data = wechat_full_sentence_data_file % cur_date
        with codecs.open(wechat_full_sentence_data, 'r', 'utf-8') as fin:
            for line in fin:
                arr = line.strip().split("\t")
                if len(arr) != 5:
                    continue
                opp_id, student_chat_num, teacher_chat_num, all_chat_num, chat_content = arr
                counts = [int(student_chat_num), int(teacher_chat_num),
                          int(all_chat_num)]
                if opp_id not in wechat_dict:
                    wechat_dict[opp_id] = {
                        "stat_info": [0, 0, 0],
                        "chat_content": ""
                    }
                entry = wechat_dict[opp_id]
                entry["chat_content"] = entry["chat_content"] + chat_content
                entry["stat_info"] = [
                    x + y for x, y in zip(counts, entry["stat_info"])
                ]
    return wechat_dict
def testMatchTimePattern(self):
    """Each cron field type matches (or not) against 2014-02-17 20:28:35."""
    moment = DateUtil.datetime("2014-02-17 20:28:35")
    cases = [
        ("* * * * *", True),      # wildcard everywhere
        ("28 * * * *", True),     # exact minute
        ("* 20 * * *", True),     # exact hour
        ("* * 17 * *", True),     # exact day of month
        ("* * * 2 *", True),      # exact month
        ("* * * * 1", True),      # exact weekday
        ("28 20 17 2 1", True),   # every field exact
        ("*/2 * * * *", True),    # step divides 28
        ("*/3 * * * *", False),   # step does not divide 28
    ]
    for pattern, expected in cases:
        self.assertEqual(
            expected,
            TimeMatcher.matchTimePattern(TimePattern(pattern), moment))
def get_hist_wechat_tf_feature(start_date, end_date):
    """Extract per-day wechat TF features for the range, maintaining a sliding
    PAST_DAYS_LIMIT-day window of segmented chat data."""
    fmt = "%Y%m%d"
    log.info("initial past n day wechat segment data...")
    # Seed the window with the PAST_DAYS_LIMIT days preceding start_date.
    window_start = (datetime.strptime(start_date, fmt)
                    - timedelta(days=PAST_DAYS_LIMIT)).strftime(fmt)
    log.info("load past days wechat segment data...")
    dict_wechat = load_wechat_2_dict(window_start, start_date)

    log.info("extract tf feature...")
    for cur_date in DateUtil.get_every_date(start_date, end_date):
        log.info("extract %s tf feature..." % cur_date)
        begin = time.time()
        get_one_day_wechat_tf_feature(cur_date, dict_wechat)
        log.info("extract {0} wechat tf feature cost time:{1}".format(
            cur_date, time.time() - begin))
        # Slide the window: drop the oldest day, pull in tomorrow's data.
        cur_dt = datetime.strptime(cur_date, fmt)
        del_date = (cur_dt - timedelta(days=PAST_DAYS_LIMIT)).strftime(fmt)
        add_date = (cur_dt + timedelta(days=1)).strftime(fmt)
        if add_date <= end_date:
            log.info("update past days wechat segment data [del %s, add %s]..."
                     % (del_date, add_date))
            dict_wechat = update_wechat_dict(dict_wechat, del_date, add_date)
        log.info("=======" * 3)
def gen_bench_mark_multi_day(start_date, end_date):
    """For each day in the range, group the wechat basic-feature lines by total
    student chat count, write each group to a benchmark text file, and convert
    it to a tfRecord file."""
    bench_mark_text_file_tmp = os.path.join(
        bench_mark_text_file_path, "total_chat_num_%s", "total_chat_num_%s_%s")
    # Fixed: this 11-element list (and the dict zipped from it) was rebuilt
    # inside the per-line loop although it is loop-invariant; hoist it and
    # pre-compute the two column positions actually used.
    column_names = ["label", "opp_id", "acc_id", "create_time",
                    "today_student_chat_num", "today_teacher_chat_num",
                    "today_total_chat_num", "hist_student_chat_num",
                    "hist_teacher_chat_num", "hist_total_chat_num",
                    "chat_content"]
    idx_today = column_names.index("today_student_chat_num")
    idx_hist = column_names.index("hist_student_chat_num")
    for date in DateUtil.get_every_date(start_date, end_date):
        print(date)
        # Group the day's lines by total student chat number.
        source_file_path = os.path.join(raw_feature_path,
                                        "wechat_basic_feature_%s" % date)
        chat_group_dict = {x: [] for x in chat_num_ls}
        with codecs.open(source_file_path, 'r', 'utf-8') as fin:
            for line in fin:
                arr = line.strip().split("\t")
                total_student_chat_num = int(arr[idx_today]) + int(arr[idx_hist])
                # Lines outside the configured chat_num_ls buckets are dropped.
                if total_student_chat_num in chat_group_dict:
                    chat_group_dict[total_student_chat_num].append(line.strip())
        for chat_num, line_ls in chat_group_dict.items():
            if not line_ls:
                continue
            bench_mark_text_file = bench_mark_text_file_tmp % (chat_num,
                                                               chat_num, date)
            tmp_text_folder_path = os.path.dirname(bench_mark_text_file)
            tmp_tf_record_folder_path = tmp_text_folder_path.replace(
                "feature_file", "tf_record")
            if not os.path.isdir(tmp_text_folder_path):
                os.makedirs(tmp_text_folder_path)
            with codecs.open(bench_mark_text_file, "w", "utf-8") as fout:
                for line in line_ls:
                    fout.write(line + "\n")
            # Convert the benchmark text file just written to a tfRecord file.
            tfRecorder.transfer_texts_2_tfRecord_default(
                date, date, tmp_text_folder_path, tmp_tf_record_folder_path)
LOG_PATH = os.path.join(PROJECT_DATA_DIR, "log")


def get_logger(path):
    """Return the project logger, writing to LOG_PATH/path and to the console.

    logging.getLogger(PROJECT_NAME) returns the same object on every call,
    so handlers are attached only once; the original re-attached a file and
    a console handler per call (it is called twice in this module), which
    duplicated every log line.
    """
    logger = logging.getLogger(PROJECT_NAME)  # name of the project's top-level dir
    if not logger.handlers:
        fmt = '[%(asctime)s] - %(filename)s:%(lineno)s - %(name)s - %(message)s'
        formatter = logging.Formatter(fmt)
        log_file = os.path.join(LOG_PATH, path)
        # Rotating file handler: 1 MB per file, 5 backups kept.
        handler = logging.handlers.RotatingFileHandler(
            log_file, maxBytes=1024 * 1024, backupCount=5)
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
        consoleHandle = logging.StreamHandler()
        consoleHandle.setFormatter(formatter)
        logger.addHandler(consoleHandle)
    return logger


G_LOG = get_logger("log_%s" % DateUtil.get_relative_delta_time_str())

if __name__ == "__main__":
    logger = get_logger("log_20170523")
    logger.info('first info message')
    logger.debug('first debug message')
    logger.debug('-----------')
def get_hist_wechat_full_sentence(start_date, end_date):
    """Extract the full-sentence wechat data for every date in the range."""
    for cur_date in DateUtil.get_every_date(start_date, end_date):
        # Fixed log text: the original said "basic feature" (copy-pasted from
        # the basic-feature job) although this step extracts full sentences.
        log.info("extract %s full sentence..." % cur_date)
        get_one_day_wechat_full_sentence(cur_date)
def get_one_day_wechat_basic_feature(date):
    """Generate one day's wechat basic-feature samples.

    Reads aggregated_wechat_data_<date>, samples one student utterance per
    opportunity as the sample point, labels it against future orders, prepends
    historical chat stats/content, and writes one tab-separated line per sample
    to wechat_basic_feature_<date>:
        label, opp_id, account, create_time,
        today stats (3 cols), hist stats (3 cols), cleaned chat text
    """
    log.info("get %s wechat basic feature..." % date)
    wechat_basic_feature_dir = os.path.join(PROJECT_DATA_DIR, "feature_file",
                                            "wechat_basic_feature")
    aggregated_wechat_data_dir = os.path.join(PROJECT_DATA_DIR, "raw_data",
                                              "aggregated_wechat_data")
    wechat_basic_feature_data = os.path.join(
        wechat_basic_feature_dir, "wechat_basic_feature_%s" % date)
    aggregated_wechat_data = os.path.join(
        aggregated_wechat_data_dir, "aggregated_wechat_data_%s" % date)
    log.info("prepare hist wechat chat dict...")
    hist_wechat_chat_dict = load_hist_wechat_record_dict(date)
    log.info("prepare hist and future order dict...")
    # Orders placed before `date` (used to skip already-converted opps) and
    # orders placed on/after `date` (used to label samples).
    hist_order_dict = load_multi_day_order(
        DateUtil.get_relative_delta_time_str(date, day=-HISTORY_ORDER_DELTA_DAY),
        DateUtil.get_relative_delta_time_str(date, day=-1))
    future_order_dict = load_multi_day_order(
        date, DateUtil.get_relative_delta_time_str(date, day=FUTURE_ORDER_DELTA_DAY))
    log.info("start 2 gen wechat basic feature...")
    with codecs.open(aggregated_wechat_data, "r", "utf-8") as fin, \
            codecs.open(wechat_basic_feature_data, "w", "utf-8") as fout:
        for line in fin:
            arr = line.strip().split("\t")
            # Expect exactly: opp_id <TAB> json chat list.
            if len(arr) != 2:
                continue
            opp_id = arr[0].strip()
            chat_ls = arr[1].strip()
            try:
                # NOTE(review): the `encoding` kwarg of json.loads was removed
                # in Python 3.9 -- confirm the target interpreter version.
                chat_ls = json.loads(chat_ls, encoding="utf-8")
            except Exception as e:
                log.info(e)
                continue
            # Skip opps that already ordered in the history window.
            if opp_id in hist_order_dict:
                continue
            # Collect indices of student utterances (send_type == "1").
            student_chat_idx = []
            for idx, chat_dict in enumerate(chat_ls):
                send_type = chat_dict["send_type"]
                if send_type == "1":
                    student_chat_idx.append(idx)
            if not student_chat_idx:
                continue
            # Randomly pick one student utterance as the sample point.
            sample_idx = np.random.choice(student_chat_idx, 1)[0]
            sample_chat = chat_ls[sample_idx]
            create_time = sample_chat["create_time"]
            account = sample_chat["account"]
            order_time = future_order_dict.get(opp_id, None)  # ordered soon after?
            # judge_label returns "-1" for samples that must be discarded.
            label = judge_label(order_time, create_time)
            if label == "-1":
                continue
            # Features come from everything up to and including the sample point.
            sample_chat_ls = chat_ls[:sample_idx + 1]
            cleared_chat_sentence = clear_sentence(sample_chat_ls)
            chat_stat_ls = stat_sentence(sample_chat_ls)
            hist_chat_stat_ls = [0, 0, 0]
            hist_wechat_chat = hist_wechat_chat_dict.get(opp_id, None)  # any history chat?
            if hist_wechat_chat:  # prepend historical chat stats and content
                hist_chat_stat_ls = hist_wechat_chat["stat_info"]
                cleared_chat_sentence = hist_wechat_chat[
                    "chat_content"] + cleared_chat_sentence
            today_stat_str = "\t".join(map(str, chat_stat_ls))
            hist_stat_str = "\t".join(map(str, hist_chat_stat_ls))
            result = "\t".join([
                label, opp_id, account, create_time, today_stat_str,
                hist_stat_str, cleared_chat_sentence
            ])
            fout.write(result + "\n")
            # Alternative (disabled): no sampling -- every student utterance
            # triggers one sample.
            # for idx, chat_dict in enumerate(chat_ls):
            #     send_type = chat_dict["send_type"]
            #     create_time = chat_dict["create_time"]
            #     accout = chat_dict["account"]
            #
            #     if send_type == "0":  # teacher utterances are not sample points
            #         continue
            #
            #     label = judge_label(order_time, create_time)
            #     if label == "-1":
            #         continue
            #
            #     cleared_chat_sentence = clear_sentence(chat_ls[:idx + 1])
            #     chat_stat_ls = stat_sentence(chat_ls[:idx + 1])
            #     hist_chat_stat_ls = [0, 0, 0]
            #
            #     if hist_wechat_chat:  # prepend historical chat info
            #         hist_chat_stat_ls = hist_wechat_chat["stat_info"]
            #         cleared_chat_sentence = hist_wechat_chat["chat_content"] + cleared_chat_sentence
            #
            #     today_stat_str = "\t".join(map(str, chat_stat_ls))
            #     hist_stat_str = "\t".join(map(str, hist_chat_stat_ls))
            #     result = "\t".join(
            #         [label, opp_id, accout, create_time, today_stat_str, hist_stat_str, cleared_chat_sentence])
            #     fout.write(result + "\n")
    log.info("finished, write feature to file : %s" % wechat_basic_feature_data)