def combine(self, path_save):
    """Merge the RTB precise-UA records into the atlas library and write the
    union of both libraries to *path_save* (one brand^model^type^keywords row
    per entry, deduplicated on disk afterwards).

    Conflict rule for a keyword already present in the atlas: when the brands
    differ the RTB record replaces the atlas entry; when the brands match the
    existing atlas entry is kept.
    """
    self.load_atlas()
    utils.delete_existed_file(path_save)
    for raw in utils.read_txt(setting.UA2DEVICE_PRECISE_DATE):
        fields = utils.format_device_ua(raw).split("^")
        record = {
            "brand": fields[0],
            "model": fields[1],
            "type": fields[2],
            "keywords": fields[3],
        }
        for keyword in fields[3].split("|"):
            if keyword not in self.atlas:
                self.rtb_ua[keyword] = record
            elif fields[0] != self.atlas[keyword]["brand"]:
                # Same brand: the atlas library wins; different brand: the RTB
                # library wins and overwrites the atlas entry.
                self.atlas[keyword] = record
    dataset = []
    for library in (self.atlas, self.rtb_ua):
        for entry in library.values():
            dataset.append("^".join([entry["brand"], entry["model"], entry["type"], entry["keywords"]]))
    utils.save_to_datas_file(dataset, path_save)
    utils.deduplication_data(path_save)
def primary_hardware_type_to_precise(path, path_precise, path_save, brand="Vivo"):
    """Extract the primaryHardwareType string as the model and emit precise rows.

    Converts raw rows from *path* plus already-precise rows from *path_precise*
    into the brand^model^type^keyword format and writes them to *path_save*
    (deduplicated afterwards).

    Input row example : Vivo^V5 (1601)^V5^Android^Mobile Phone^2016^0^vivo vivo x20plus a
    Output row example: koobee^koobee A3^2^koobee A3
    """
    utils.delete_existed_file(path_save)
    rows = []
    for raw in utils.read_txt(path):
        cols = raw.split("^")
        # Type code: "2" for mobile phones, "3" for everything else (tablets etc.).
        device_type = "2" if cols[4] == "Mobile Phone" else "3"
        model = clean_space(cols[2].title().strip().replace("-", " "))
        rows.append("^".join([brand, model, device_type, cols[7].upper().strip()]))
    # Fan the already-precise rows out to one output row per keyword.
    for raw in utils.read_txt(path_precise):
        cols = raw.split("^")
        for keyword in cols[3].split("|"):
            rows.append("^".join([brand, cols[1].title(), cols[2], keyword]))
    utils.save_to_datas_file(rows, path_save)
    utils.deduplication_data(path_save)
def check_ua_tv():
    """Run the TV UA identifier over a UA log and report per-brand hit counts.

    Writes one line per recognised brand (brand, hits, hit ratio) ordered by
    hit count, followed by a summary line with the total line count, the
    number of recognised lines and the overall recognition rate, to
    count_result.txt.
    """
    # /sas/tk_history/box_ua_0327.log zzy19
    path_input = "/sas/tk_history/box_ua_0327.log"
    identify_ua = identify.UaIdentify()
    identify_ua.load_data()
    count_brand = {}
    count = 0
    for line in utils.read_txt(path_input):
        count += 1
        data_result = identify_ua.identify_tv(line)
        if data_result is None:
            continue
        brand = data_result["brand"]
        # dict.get replaces the manual in/else counter bookkeeping.
        count_brand[brand] = count_brand.get(brand, 0) + 1
    # BUG FIX: guard an empty input file — the original divided by `count`
    # unconditionally and raised ZeroDivisionError on an empty log.
    if count == 0:
        utils.save_to_datas_file(["总数:0" + "\t" + "识别数:0" + "\t" + "识别率:0"], "count_result.txt")
        return
    ranked = sorted(count_brand.items(), key=lambda e: e[1], reverse=True)
    data_set = []
    sum_brand = 0
    for brand, hits in ranked:
        sum_brand += hits
        data_set.append(brand + "\t" + str(hits) + "\t" + str(hits / count))
    data_set.append("总数:" + str(count) + "\t" + "识别数:" + str(sum_brand) + "\t" + "识别率:" + str(sum_brand / count))
    utils.save_to_datas_file(data_set, "count_result.txt")
def dropout_repeat_keywords(path, path_save_data, path_save_repeat):
    """Split the device-UA file into resolvable rows and duplicate-keyword rows.

    Keywords that appear more than once in *path* are passed through
    ua_fix.fix_repeat_data; rows it resolves go to *path_save_data*, rows it
    cannot resolve go to *path_save_repeat*. Unique keywords are kept as-is.
    Example input row: Zte^yuanhang 4^2^ZTE BA610C|ZTE BA610T|ZTE BLADE A610C
    """
    utils.delete_existed_file(path_save_data)
    utils.delete_existed_file(path_save_repeat)
    keywords_repeat = utils.get_repeat_keywords(path)
    kept = []
    dropped = []
    seen_keywords = []
    for raw in utils.read_txt(path):
        fields = utils.format_device_ua(raw).split("^")
        prefix = fields[0] + "^" + fields[1] + "^" + fields[2] + "^"
        for keyword in fields[3].upper().split("|"):
            if keyword not in keywords_repeat:
                kept.append(prefix + keyword)
                continue
            resolved, seen_keywords = ua_fix.fix_repeat_data(fields, seen_keywords)
            target = kept if resolved else dropped
            target.append(prefix + keyword)
    utils.save_to_datas_file(kept, path_save_data)
    utils.deduplication_data(path_save_data)
    utils.save_to_datas_file(dropped, path_save_repeat)
    utils.deduplication_data(path_save_repeat)
def format_ua_device_pattern(path=setting.UA2DEVICE_PATTERN, path_save=setting.UA2DEVICE_PATTERN_DATE):
    """Normalise every row of the ua2device pattern file and write the result
    to *path_save*, deduplicating the output file afterwards."""
    utils.delete_existed_file(path_save)
    formatted = [utils.format_device_ua(row) for row in utils.read_txt(path)]
    utils.save_to_datas_file(formatted, path_save)
    utils.deduplication_data(path_save)
def get_info_form_ua(path, path_save, keyword):
    """Extract rows from the existing library whose brand (first ^-field)
    equals *keyword*, case-insensitively, and write them to *path_save*."""
    utils.delete_existed_file(path_save)
    wanted = keyword.upper()
    matched = [row for row in utils.read_txt(path) if row.split("^")[0].upper() == wanted]
    utils.save_to_datas_file(matched, path_save)
def format_precise(path, path_save, path_brand_save):
    """
    Input : koobee^koobee A3^2^koobee A3
    Output: koobee^koobee A3^2^koobee A3|XXXX  (keywords concatenated after filtering)
    Also writes the set of distinct brands to *path_brand_save*.
    :return:
    """
    utils.delete_existed_file(path_save)
    data_set = []
    # models_length : length brand model type keywords
    models_length = {}
    for line in utils.read_txt(path):
        model_dict = {}
        lines = line.split("^")
        model = lines[1]
        keyword = lines[3]
        # Space-stripped model is the dedup key, so "AA B" and "AAB" collide.
        model_l = utils.delete_space(model)
        # Filter "AA B" vs "AAB" style duplicate model spellings.
        if model_l in models_length:
            old_model_len = models_length[model_l]["length"]
            # print(models_length[model_l])
            # NOTE(review): stored "length" is len(delete_space(model)) but is
            # compared against len(model) here — looks inconsistent; confirm
            # which length the comparison was meant to use.
            if old_model_len >= len(model):
                models_length[model_l]["model"] = model
                if not utils.whether_repeat(models_length[model_l]["keywords"], keyword):
                    models_length[model_l]["keywords"] = models_length[model_l]["keywords"] + "|" + keyword
                else:
                    # NOTE(review): this branch discards all previously
                    # accumulated keywords for the model — confirm intent.
                    models_length[model_l]["keywords"] = keyword
        else:
            # First sighting of this space-stripped model: record it whole.
            model_dict["model"] = model
            model_dict["type"] = lines[2]
            model_dict["brand"] = lines[0]
            model_dict["length"] = len(model_l)
            model_dict["keywords"] = keyword
            models_length[model_l] = model_dict
    brand_set = set()
    for k, v in models_length.items():
        # print(v)
        brand_set.add(v["brand"])
        data_set.append(v["brand"] + "^" + v["model"] + "^" + v["type"] + "^" + v["keywords"])
    utils.save_to_datas_file(data_set, path_save)
    utils.save_to_datas_file(list(brand_set), path_brand_save)
    utils.deduplication_data(path_save)
def get_not_match_ua_tv():
    """Collect UA lines the TV identifier fails to recognise and append them to
    not_in_list_ua_3.txt, flushing in batches of 500 lines."""
    # /sas/tk_history/box_ua_0327.log zzy19
    path_input = "/sas/tk_history/box_ua_0327.log"
    identify_ua = identify.UaIdentify()
    identify_ua.load_data()
    pending = []
    total = 0
    for ua_line in utils.read_txt(path_input):
        total += 1
        if identify_ua.identify_tv(ua_line) is not None:
            continue
        pending.append(ua_line)
        if len(pending) >= 500:
            print(total)
            utils.save_to_datas_file(pending, "not_in_list_ua_3.txt")
            pending = []
    # Flush whatever remains from the last partial batch.
    utils.save_to_datas_file(pending, "not_in_list_ua_3.txt")
def combine(self, path_save, path_update):
    """Rewrite brand/model fields of the combined precise-UA file using the
    old-library name mappings.

    Writes the rewritten rows to *path_save* and a "new<=old" log of every
    substitution to *path_update*; both files are deduplicated afterwards.
    When a model is not found directly, a second lookup is attempted with the
    leading brand name stripped off the model string.
    """
    self.load_old(setting.UA2DEVICE_PRECISE)
    self.load_old(setting.UA2DEVICE_PATTERN)
    utils.delete_existed_file(path_save)
    utils.delete_existed_file(path_update)
    dataset = []
    dataset_update_list = []
    # setting.UA2DEVICE_PRECISE_DATE is a copy of device_ua_combine.txt.
    for line in utils.read_txt(setting.UA2DEVICE_PRECISE_DATE):
        lines = line.split("^")
        brand = utils.clean_space(lines[0]).lower()
        model = utils.clean_space(lines[1]).lower()
        if brand in self.brand_old:
            dataset_update_list.append("brand:" + lines[0] + "<=" + self.brand_old[brand])
            lines[0] = self.brand_old[brand]
        if model in self.model_old:
            dataset_update_list.append("model:" + lines[1] + "<=" + self.model_old[model])
            lines[1] = self.model_old[model]
        elif model.startswith(brand):
            # BUG FIX: the original used model.replace(brand, ""), which removes
            # EVERY occurrence of the brand inside the model string (corrupting
            # models that repeat the brand, e.g. "mi mix mi 2"). Slice off only
            # the verified leading prefix instead.
            model = model[len(brand):].strip()
            if model in self.model_old:
                print("model:" + lines[1] + "<=" + self.model_old[model])
                dataset_update_list.append("model:" + lines[1] + "<=" + self.model_old[model])
                lines[1] = self.model_old[model]
        dataset.append("^".join(lines))
    utils.save_to_datas_file(dataset, path_save)
    utils.deduplication_data(path_save)
    utils.save_to_datas_file(dataset_update_list, path_update)
    utils.deduplication_data(path_update)
def primary_hardware_type_to_precise(path, path_save):
    """Extract the primaryHardwareType string as the model.

    Normalises raw rows from *path* via ua_fix.fix_ua and writes the results
    to *path_save* (deduplicated afterwards). Rows ua_fix.fix_ua rejects
    (returns None for) are skipped.

    Input row example : Vivo^Vivo^V5 (1601)^V5^Android^Mobile Phone^2016^0^vivo vivo x20plus a
    Output row example: koobee^koobee A3^2^koobee A3

    Cleanup: the original carried a large commented-out block of superseded
    type-mapping logic (the pre-ua_fix implementation); it has been removed —
    that history lives in version control, not in the source.
    """
    utils.delete_existed_file(path_save)
    data_set = []
    for line in get_raw_data(path):
        data = ua_fix.fix_ua(line)
        if data is None:
            continue
        data_set.append(data)
    utils.save_to_datas_file(data_set, path_save)
    utils.deduplication_data(path_save)
def combine_precise_format(path, path_save, brand="Vivo"):
    """Collapse precise device rows into one brand^model^type^keywords row per
    model, merging the keyword lists of duplicate models.

    Input row : Vivo^V5 (1601)^V5^Android^Mobile Phone^2016^0^vivo vivo x20plus a
    Output row: koobee^koobee A3^2^koobee A3
    """
    utils.delete_existed_file(path_save)
    keywords_by_model = {}
    first_spelling = {}
    no_prefix_brands = ["huawei", "xiaomi", "samsung", "gionee"]
    for row in utils.read_txt(path):
        cols = row.split("^")
        model = cols[1].lower()
        # For most brands, make sure the model string carries the brand prefix.
        if brand not in no_prefix_brands:
            if brand.lower() in model:
                model = brand + " " + model.replace(brand.lower(), "").strip()
            else:
                model = brand.lower() + " " + model
        collapsed = delete_space(model.replace("-", " "))
        if collapsed not in first_spelling:
            # First sighting: remember this spelling and its length.
            first_spelling[collapsed] = str(len(model)) + "^" + model
        else:
            stored = first_spelling[collapsed]
            # Reuse the previously seen spelling when it is the longer one.
            if int(stored.split("^")[0]) > len(model):
                model = stored.split("^")[-1]
        # Merge key is model + type so different device types stay separate.
        merge_key = clean_space(model).strip().title() + "^" + cols[2]
        upper_keywords = cols[3].upper()
        if merge_key not in keywords_by_model:
            keywords_by_model[merge_key] = upper_keywords
        elif not whether_repeat(keywords_by_model[merge_key], upper_keywords):
            keywords_by_model[merge_key] = keywords_by_model[merge_key] + "|" + upper_keywords
    output = [brand.title() + "^" + key + "^" + value for key, value in keywords_by_model.items()]
    utils.save_to_datas_file(output, path_save)
    utils.deduplication_data(path_save)
def check_tong_not_match_ua_tv_():
    """Run the model identifier over several UA logs, dump unmatched lines to a
    per-log "*_output.log" file, and write per-type counts plus an overall
    recognition rate to count_result.txt.
    """
    data_path = "/home/galen/ua_keywords/"
    log_name_list = [
        "baidu_model_0328.log",
        "miaozhen_model_0328.log",
        "youku_model_0328.log"
    ]
    identify_ua = identify.UaIdentify()
    identify_ua.load_data()
    data_set = []
    count = 0
    count_type = {}
    path_result = data_path + "count_result.txt"
    utils.delete_existed_file(path_result)
    for name in log_name_list:
        path_input = data_path + name
        ua_not_in_path = data_path + name.replace(".log", "_output.log")
        utils.delete_existed_file(ua_not_in_path)
        print(path_input)
        for line in utils.read_txt(path_input):
            count += 1
            data_result = identify_ua.detect_model(clean_character(line))
            if data_result is None:
                data_set.append(line)
            else:
                device_type = data_result["type"]
                count_type[device_type] = count_type.get(device_type, 0) + 1
            if len(data_set) >= 500:
                utils.save_to_datas_file(data_set, ua_not_in_path)
                data_set = []
            if count % 100000 == 0:
                print(count)
        utils.save_to_datas_file(data_set, ua_not_in_path)
        # BUG FIX: reset the batch after the per-file flush; the original kept
        # the remainder, so unmatched lines leaked into the next log's output.
        data_set = []
    sum_type = 0
    data_set_count = []
    tv_unkown_sum = count
    for k, v in count_type.items():
        sum_type += v
        # Type "4" (TV) hits are excluded from the adjusted total.
        if k != "4":
            tv_unkown_sum = tv_unkown_sum - v
        # Guard against empty input so the ratio cannot divide by zero.
        data_set_count.append(k + "\t" + str(v) + "\t" + str(v / count if count else 0))
    # BUG FIX: the original wrote str() (empty string) for the recognition
    # rate. Rate is computed against tv_unkown_sum, the total reported on the
    # same line — presumably the intended denominator; confirm with the author.
    rate = sum_type / tv_unkown_sum if tv_unkown_sum else 0
    data_set_count.append("总数:" + str(tv_unkown_sum) + "\t" + "识别数:" + str(sum_type) + "\t" + "识别率:" + str(rate))
    utils.save_to_datas_file(data_set_count, path_result)