示例#1
0
def primary_hardware_type_to_precise(path,
                                     path_precise,
                                     path_save,
                                     brand="Vivo"):
    """
    Build precise-format rows from raw device rows plus an existing precise file.

    Raw row read from ``path`` ("^"-separated), e.g.
        Vivo^V5 (1601)^V5^Android^Mobile Phone^2016^0^vivo vivo x20plus a
    Row written to ``path_save``, e.g.
        koobee^koobee A3^2^koobee A3

    For raw rows, field 2 becomes the model, field 7 the keyword, and field 4
    (primaryHardwareType) selects the type code: "2" when it equals
    "Mobile Phone", "3" for any other value.  Rows from ``path_precise`` have
    their "|"-joined keywords expanded into one output row per keyword.

    :param path: raw device file
    :param path_precise: existing precise file (brand^model^type^kw1|kw2|...)
    :param path_save: output file; passed to utils.delete_existed_file first
        and to utils.deduplication_data after writing
    :param brand: value written as the first field of every output row
    :return: None
    """
    utils.delete_existed_file(path_save)
    records = []
    for raw in utils.read_txt(path):
        fields = raw.split("^")
        device_type = "2" if fields[4] == "Mobile Phone" else "3"
        model = clean_space(fields[2].title().strip().replace("-", " "))
        keyword = fields[7].upper().strip()
        records.append("^".join((brand, model, device_type, keyword)))
    for raw in utils.read_txt(path_precise):
        fields = raw.split("^")
        # Everything up to the keyword is shared by all rows of this entry.
        head = brand + "^" + fields[1].title() + "^" + fields[2] + "^"
        records.extend(head + single for single in fields[3].split("|"))
    utils.save_to_datas_file(records, path_save)
    utils.deduplication_data(path_save)
示例#2
0
 def combine(self, path_save):
     """
     Merge the rows of setting.UA2DEVICE_PRECISE_DATE into the atlas table and save the result.

     Each keyword of a row is looked up in ``self.atlas``:
     * unknown keyword -> the row is stored in ``self.rtb_ua``;
     * known keyword, same brand -> the atlas entry is kept;
     * known keyword, different brand -> the atlas entry is replaced by the row.

     Afterwards every entry of ``self.atlas`` and then ``self.rtb_ua`` is
     written to ``path_save`` as brand^model^type^keywords and the file is
     passed to utils.deduplication_data.

     :param path_save: output file
     :return: None
     """
     self.load_atlas()
     utils.delete_existed_file(path_save)
     for raw in utils.read_txt(setting.UA2DEVICE_PRECISE_DATE):
         fields = utils.format_device_ua(raw).split("^")
         # One dict per row, shared by all of the row's keywords.
         record = {
             "brand": fields[0],
             "model": fields[1],
             "type": fields[2],
             "keywords": fields[3],
         }
         for keyword in record["keywords"].split("|"):
             if keyword not in self.atlas:
                 self.rtb_ua[keyword] = record
             elif record["brand"] != self.atlas[keyword]["brand"]:
                 # Brands disagree: the incoming (rtb) row wins over atlas.
                 self.atlas[keyword] = record
     rows = [
         entry["brand"] + "^" + entry["model"] + "^" + entry["type"] + "^" +
         entry["keywords"]
         for table in (self.atlas, self.rtb_ua)
         for entry in table.values()
     ]
     utils.save_to_datas_file(rows, path_save)
     utils.deduplication_data(path_save)
示例#3
0
def check_ua_tv(path_input="/sas/tk_history/box_ua_0327.log",
                path_save="count_result.txt"):
    """
    Count, per brand, how many user-agent lines identify.UaIdentify.identify_tv recognises.

    One "brand<TAB>hits<TAB>hits/total" row is produced per brand, ordered by
    descending hit count, followed by a summary row with the total number of
    lines, the number of recognised lines and the recognition rate.  An empty
    input yields only the summary row with a rate of 0.0 instead of raising
    ZeroDivisionError.

    :param path_input: user-agent log to read, one UA per line
    :param path_save: file the rows are handed to utils.save_to_datas_file with
    :return: None
    """
    identify_ua = identify.UaIdentify()
    identify_ua.load_data()
    count_brand = {}
    count = 0
    for line in utils.read_txt(path_input):
        count += 1
        data_result = identify_ua.identify_tv(line)
        if data_result is None:
            continue
        brand = data_result["brand"]
        count_brand[brand] = count_brand.get(brand, 0) + 1
    ranked = sorted(count_brand.items(), key=lambda e: e[1], reverse=True)
    data_set = []
    sum_brand = 0
    for brand, hits in ranked:
        sum_brand += hits
        # count > 0 here: a brand can only be counted after a line was read.
        data_set.append(brand + "\t" + str(hits) + "\t" + str(hits / count))
    # Guard the summary ratio: count is 0 when the input file is empty.
    rate = sum_brand / count if count else 0.0
    data_set.append("总数:" + str(count) + "\t" + "识别数:" + str(sum_brand) +
                    "\t" + "识别率:" + str(rate))
    utils.save_to_datas_file(data_set, path_save)
示例#4
0
def dropout_repeat_keywords(path, path_save_data, path_save_repeat):
    """
    Split a device file into one-keyword-per-row output, separating rejected repeated keywords.

    Input row example:
        Zte^yuanhang 4^2^ZTE BA610C|ZTE BA610T|ZTE BLADE A610C

    Keywords not reported by utils.get_repeat_keywords go straight to
    ``path_save_data``.  For a reported keyword, ua_fix.fix_repeat_data is
    consulted: a truthy first return value keeps the row in
    ``path_save_data``, a falsy one sends it to ``path_save_repeat``.

    :param path: source device file
    :param path_save_data: output file for kept rows
    :param path_save_repeat: output file for rejected repeated rows
    :return: None
    """
    for target in (path_save_data, path_save_repeat):
        utils.delete_existed_file(target)
    repeated = utils.get_repeat_keywords(path)
    kept = []
    rejected = []
    seen_keywords = []
    for raw in utils.read_txt(path):
        fields = utils.format_device_ua(raw).split("^")
        for keyword in fields[3].upper().split("|"):
            bucket = kept
            if keyword in repeated:
                accepted, seen_keywords = ua_fix.fix_repeat_data(
                    fields, seen_keywords)
                if not accepted:
                    bucket = rejected
            # Built after the helper call so the row is read in the same
            # state as before the restyle.
            bucket.append(fields[0] + "^" + fields[1] + "^" + fields[2] +
                          "^" + keyword)
    utils.save_to_datas_file(kept, path_save_data)
    utils.deduplication_data(path_save_data)
    utils.save_to_datas_file(rejected, path_save_repeat)
    utils.deduplication_data(path_save_repeat)
示例#5
0
def get_raw_data(path_data):
    """
    Reduce the identifierUa field of each raw row to the text before its build marker.

    Raw row layout ("^"-separated, 9 fields):
        brand^vendor^model^marketingName^osName^primaryHardwareType^yearReleased^isTablet^identifierUa
    Example of the trailing fields:
        Vivo^Y11i T^Y11^Android^Mobile Phone^2013^0^vivo Y11i T Build/KTU84P

    Rows for which UaFix.filter_main(brand, identifierUa) is truthy are
    dropped.  For the rest, field 8 is replaced by its lower-cased text in
    front of "build/" (or, when that marker is absent, "miui/").  Rows with
    neither marker, or with nothing in front of the marker, are dropped.
    All other fields are kept unchanged.

    :param path_data: raw data file
    :return: list of "^"-joined rows
    """
    data_set = []
    uf = ua_fix.UaFix()
    for raw in utils.read_txt(path_data):
        fields = raw.split("^")
        if uf.filter_main(fields[0], fields[8]):
            continue
        ua_lower = fields[8].lower()
        # "build/" takes precedence; "miui/" is only considered without it.
        for marker in ("build/", "miui/"):
            if marker not in ua_lower:
                continue
            prefix = ua_lower.split(marker)[0].strip()
            if prefix:
                fields[8] = prefix
                data_set.append("^".join(fields))
            break
    return data_set
示例#6
0
def format_ua_device_pattern(path=setting.UA2DEVICE_PATTERN, path_save=setting.UA2DEVICE_PATTERN_DATE):
    """
    Normalise every row of the ua2device pattern file with utils.format_device_ua.

    :param path: pattern file to read
    :param path_save: output file; cleared first via utils.delete_existed_file
        and passed to utils.deduplication_data after writing
    :return: None
    """
    utils.delete_existed_file(path_save)
    formatted = [utils.format_device_ua(raw) for raw in utils.read_txt(path)]
    utils.save_to_datas_file(formatted, path_save)
    utils.deduplication_data(path_save)
 def load_old(self, path):
     """
     Remember the first-seen spelling of every brand and model in ``path``.

     Row example: Zte^yuanhang 4^2^ZTE BA610C|ZTE BA610T|ZTE BLADE A610C

     ``self.brand_old`` / ``self.model_old`` map the lower-cased result of
     utils.clean_space to the spelling found in the file; existing entries
     are never overwritten.

     :param path: device file to read
     :return: None
     """
     for raw in utils.read_txt(path):
         fields = raw.split("^")
         brand_key = utils.clean_space(fields[0]).lower()
         model_key = utils.clean_space(fields[1]).lower()
         # setdefault keeps whichever spelling was registered first.
         self.brand_old.setdefault(brand_key, fields[0])
         self.model_old.setdefault(model_key, fields[1])
示例#8
0
def get_info_form_ua(path, path_save, keyword):
    """
    Copy the rows whose first "^"-separated field equals ``keyword`` (case-insensitive).

    :param path: existing device library file
    :param path_save: output file; cleared first via utils.delete_existed_file
    :param keyword: value to match against the first field
    :return: None
    """
    utils.delete_existed_file(path_save)
    matching = [
        raw for raw in utils.read_txt(path)
        if raw.split("^")[0].upper() == keyword.upper()
    ]
    utils.save_to_datas_file(matching, path_save)
示例#9
0
 def load_atlas(self):
     """
     Index the rows of setting.device_ua_output by keyword in ``self.atlas``.

     Row example: Zte^yuanhang 4^2^ZTE BA610C|ZTE BA610T|ZTE BLADE A610C

     Each keyword maps to a dict with the keys "brand", "model", "type" and
     "keywords" (the row's full "|"-joined keyword string).  The first row
     seen for a keyword wins.

     :return: None
     """
     for raw in utils.read_txt(setting.device_ua_output):
         fields = utils.format_device_ua(raw).split("^")
         record = {
             "brand": fields[0],
             "model": fields[1],
             "type": fields[2],
             "keywords": fields[3],
         }
         for keyword in fields[3].split("|"):
             # All keywords of a row share the same record object.
             self.atlas.setdefault(keyword, record)
 def data_to_dict(self, data_dic, name):
     """
     Index the rows of the resource file ``name`` by keyword into ``data_dic``.

     Row example: Zte^yuanhang 4^2^ZTE BA610C|ZTE BA610T|ZTE BLADE A610C

     Each keyword maps to a dict with the keys "brand", "model", "type" and
     "keywords" (the row's full "|"-joined keyword string).  Keywords already
     present in ``data_dic`` are left untouched.

     :param data_dic: mapping to fill in place
     :param name: file name appended to ``self.RESOURCE_PATH``
     :return: None
     """
     source = self.RESOURCE_PATH + name
     for raw in utils.read_txt(source):
         fields = utils.format_device_ua(raw).split("^")
         record = {
             "brand": fields[0],
             "model": fields[1],
             "type": fields[2],
             "keywords": fields[3],
         }
         for keyword in fields[3].split("|"):
             # All keywords of a row share the same record object.
             data_dic.setdefault(keyword, record)
示例#11
0
def check_tong_not_match_ua_tv_():
    """
    Run model detection over the three 0328 logs and record what was not recognised.

    For every log under /home/galen/ua_keywords/ the lines for which
    identify.UaIdentify.detect_model returns None are written (in batches of
    500) to "<log>_output.log", and a block of per-type statistics is
    appended to count_result.txt.

    ``count`` and ``count_type`` are deliberately left cumulative across
    logs, as in the original code, so each statistics block covers every log
    read so far.  The unrecognised-line buffer, in contrast, is per log: it
    is reset for each file so leftovers of one log are not written again into
    the next log's output file.

    :return: None
    """
    data_path = "/home/galen/ua_keywords/"
    log_name_list = [
        "baidu_model_0328.log", "miaozhen_model_0328.log",
        "youku_model_0328.log"
    ]
    identify_ua = identify.UaIdentify()
    identify_ua.load_data()
    count = 0
    count_type = {}
    path_result = data_path + "count_result.txt"
    utils.delete_existed_file(path_result)
    for name in log_name_list:
        path_input = data_path + name
        ua_not_in_path = data_path + name.replace(".log", "_output.log")
        utils.delete_existed_file(ua_not_in_path)
        print(path_input)
        # Fresh buffer per log; previously the tail of one log leaked into
        # the next log's output.
        data_set = []
        for line in utils.read_txt(path_input):
            count += 1
            data_result = identify_ua.detect_model(clean_character(line))
            if data_result is None:
                data_set.append(line)
                if len(data_set) >= 500:
                    utils.save_to_datas_file(data_set, ua_not_in_path)
                    data_set = []
            else:
                device_type = data_result["type"]
                count_type[device_type] = count_type.get(device_type, 0) + 1
            if count % 100000 == 0:
                print(count)
        utils.save_to_datas_file(data_set, ua_not_in_path)
        sum_type = 0
        data_set_count = []
        # Total minus every recognised line whose type is not "4".
        tv_unkown_sum = count
        for k, v in count_type.items():
            sum_type += v
            if k != "4":
                tv_unkown_sum = tv_unkown_sum - v
            data_set_count.append(k + "\t" + str(v) + "\t" + str(v / count))
        # NOTE(review): the original emitted an empty rate (`str()`).  The
        # rate is now recognised lines / all lines read, matching the
        # per-type ratios above -- confirm this is the intended denominator.
        rate = sum_type / count if count else 0.0
        data_set_count.append("总数:" + str(tv_unkown_sum) + "\t" + "识别数:" +
                              str(sum_type) + "\t" + "识别率:" + str(rate))
        utils.save_to_datas_file(data_set_count, path_result)
示例#12
0
def format_precise(path, path_save, path_brand_save):
    """
    Merge precise rows that differ only by spacing in the model name and join their keywords.

    Input row:  koobee^koobee A3^2^koobee A3
    Output row: koobee^koobee A3^2^koobee A3|XXXX

    Rows are grouped by utils.delete_space(model), so "AA B" and "AAB" fall
    into the same group.  The brand and type of the first row of a group are
    kept.  A later keyword is appended with "|" unless utils.whether_repeat
    reports it as already present, in which case the accumulated keywords are
    left unchanged (the previous code replaced the whole list with that
    single keyword, dropping the others).

    :param path: precise file to read
    :param path_save: merged output; cleared first, de-duplicated afterwards
    :param path_brand_save: file receiving the distinct brands
    :return: None
    """
    utils.delete_existed_file(path_save)
    # key -> {"brand", "model", "type", "length", "keywords"}
    models_length = {}
    for line in utils.read_txt(path):
        lines = line.split("^")
        model = lines[1]
        keyword = lines[3]
        model_l = utils.delete_space(model)
        entry = models_length.get(model_l)
        if entry is None:
            models_length[model_l] = {
                "model": model,
                "type": lines[2],
                "brand": lines[0],
                "length": len(model_l),
                "keywords": keyword,
            }
            continue
        # "length" is the length of the grouping key, so the displayed model
        # is only replaced by a spelling that is no longer than the key.
        if entry["length"] >= len(model):
            entry["model"] = model
        if not utils.whether_repeat(entry["keywords"], keyword):
            entry["keywords"] = entry["keywords"] + "|" + keyword
    brand_set = set()
    data_set = []
    for v in models_length.values():
        brand_set.add(v["brand"])
        data_set.append(v["brand"] + "^" + v["model"] + "^" + v["type"] + "^" + v["keywords"])
    utils.save_to_datas_file(data_set, path_save)
    utils.save_to_datas_file(list(brand_set), path_brand_save)
    utils.deduplication_data(path_save)
示例#13
0
def get_not_match_ua_tv():
    """
    Collect the user-agent lines that identify.UaIdentify.identify_tv cannot recognise.

    Reads /sas/tk_history/box_ua_0327.log and hands unrecognised lines to
    utils.save_to_datas_file for "not_in_list_ua_3.txt" in batches of 500,
    printing the number of lines read so far at each batch; whatever remains
    is saved at the end.

    :return: None
    """
    path_input = "/sas/tk_history/box_ua_0327.log"
    identify_ua = identify.UaIdentify()
    identify_ua.load_data()
    pending = []
    for seen, raw in enumerate(utils.read_txt(path_input), start=1):
        if identify_ua.identify_tv(raw) is not None:
            continue
        pending.append(raw)
        # The buffer only grows here, so this is the only place it can
        # reach the batch size.
        if len(pending) >= 500:
            print(seen)
            utils.save_to_datas_file(pending, "not_in_list_ua_3.txt")
            pending = []
    utils.save_to_datas_file(pending, "not_in_list_ua_3.txt")
 def combine(self, path_save, path_update):
     """
     Rewrite brands and models of setting.UA2DEVICE_PRECISE_DATE to their previously used spellings.

     The old spellings are loaded from setting.UA2DEVICE_PRECISE and
     setting.UA2DEVICE_PATTERN via ``self.load_old``.  For each row the brand
     and the model are looked up by their lower-cased utils.clean_space form.
     When the model is unknown but starts with the brand, the leading brand
     is stripped and the remainder is looked up instead.  Only the leading
     occurrence is removed; the previous ``str.replace`` call also removed
     the brand text from the middle of the model name.

     :param path_save: output file with the rewritten rows
     :param path_update: output file listing each applied "new<=old" mapping
     :return: None
     """
     self.load_old(setting.UA2DEVICE_PRECISE)
     self.load_old(setting.UA2DEVICE_PATTERN)
     utils.delete_existed_file(path_save)
     utils.delete_existed_file(path_update)
     dataset = []
     dataset_update_list = []
     # setting.UA2DEVICE_PRECISE_DATE is a copy of device_ua_combine.txt
     for line in utils.read_txt(setting.UA2DEVICE_PRECISE_DATE):
         lines = line.split("^")
         brand = utils.clean_space(lines[0]).lower()
         model = utils.clean_space(lines[1]).lower()
         if brand in self.brand_old:
             dataset_update_list.append("brand:" + lines[0] + "<=" +
                                        self.brand_old[brand])
             lines[0] = self.brand_old[brand]
         brand_stripped = False
         if model not in self.model_old and model.startswith(brand):
             # Drop only the brand prefix, never inner occurrences.
             model = model[len(brand):].strip()
             brand_stripped = True
         if model in self.model_old:
             update_note = "model:" + lines[1] + "<=" + self.model_old[model]
             if brand_stripped:
                 # Matches found only after stripping the brand are echoed
                 # to stdout, as before.
                 print(update_note)
             dataset_update_list.append(update_note)
             lines[1] = self.model_old[model]
         dataset.append("^".join(lines))
     utils.save_to_datas_file(dataset, path_save)
     utils.deduplication_data(path_save)
     utils.save_to_datas_file(dataset_update_list, path_update)
     utils.deduplication_data(path_update)
示例#15
0
def extract_band(brand, keyword, path_data, path_raw, path_build,
                 path_not_build):
    """
    Split the raw rows of one brand into three files according to their identifierUa.

    Raw row layout ("^"-separated, 8 fields):
        vendor^model^marketingName^osName^primaryHardwareType^yearReleased^isTablet^identifierUa
    Example:
        Vivo^Y11i T^Y11^Android^Mobile Phone^2013^0^vivo Y11i T Build/KTU84P

    * identifierUa containing "(" or ")": the row is written unchanged to
      ``path_raw`` if it also contains ``keyword`` (case-insensitive),
      otherwise it is dropped.
    * identifierUa containing "build/" (or else "miui/"): the field is
      replaced by its lower-cased text in front of that marker and the row
      goes to ``path_build``; rows with nothing in front are dropped.
    * anything else goes unchanged to ``path_not_build``.

    The split now uses the same marker that was tested for ("build/",
    "miui/"); splitting on the bare words "build"/"miui" cut the text at an
    earlier occurrence of the word inside the device name.

    :param brand: vendor to keep (case-insensitive match on field 0)
    :param keyword: text required in parenthesised identifierUa values
    :param path_data: raw data file
    :param path_raw: output for parenthesised identifierUa rows
    :param path_build: output for rows reduced to their pre-marker text
    :param path_not_build: output for rows without a marker
    :return: None
    """
    for target in (path_raw, path_build, path_not_build):
        utils.delete_existed_file(target)
    brand_lower = brand.lower()
    is_xiaomi = brand_lower == "xiaomi"
    for line in utils.read_txt(path_data):
        # e.g. XIAOCAISHEN XIAOMANYAO XING_FOURTEEN_V3 XINDAN XINGMI
        if is_xiaomi and filter_xiaomi(line):
            continue
        lines = line.split("^")
        if brand_lower != lines[0].lower():
            continue
        ua = lines[7]
        if "(" in ua or ")" in ua:
            if keyword.lower() in ua.lower():
                utils.save_to_data_file(line + "\n", path_raw)
            continue
        ua_lower = ua.lower()
        # "build/" takes precedence; "miui/" is only considered without it.
        for marker in ("build/", "miui/"):
            if marker not in ua_lower:
                continue
            build = ua_lower.split(marker)[0].strip()
            if build:
                lines[7] = build
                utils.save_to_data_file("^".join(lines) + "\n", path_build)
            break
        else:
            utils.save_to_data_file(line + "\n", path_not_build)
    utils.deduplication_data(path_build)
示例#16
0
def combine_precise_format(path, path_save, brand="Vivo"):
    """
    Group precise rows by normalised model name and join their keywords.

    Input row (fields 1, 2 and 3 are read as model, type and keywords), e.g.
        koobee^koobee A3^2^koobee A3
    Output row:
        <Brand>^<Model>^<type>^KW1|KW2|...

    Unless the brand is one of huawei / xiaomi / samsung / gionee, the model
    is rewritten to start with the brand.  The brand is compared and inserted
    in lower case throughout, so "Huawei" and "huawei" are treated alike and
    the spacing-insensitive grouping key does not depend on which branch
    added the brand.  The final model text is title-cased.

    :param path: precise file to read
    :param path_save: output file; cleared first, de-duplicated afterwards
    :param brand: brand written (title-cased) as the first output field
    :return: None
    """
    utils.delete_existed_file(path_save)
    brand_lower = brand.lower()
    add_brand_prefix = brand_lower not in ("huawei", "xiaomi", "samsung",
                                           "gionee")
    # "<Model>^<type>" -> "|"-joined keywords
    models = {}
    # space/hyphen-insensitive model key -> (length, spelling) first seen
    models_length = {}
    for line in utils.read_txt(path):
        lines = line.split("^")
        model = lines[1].lower()
        if add_brand_prefix:
            if brand_lower in model:
                model = brand_lower + " " + model.replace(brand_lower, "").strip()
            else:
                model = brand_lower + " " + model
        model_l = delete_space(model.replace("-", " "))
        if model_l in models_length:
            old_model_len, old_model = models_length[model_l]
            # Keep the first-seen spelling when it is the longer one.
            if old_model_len > len(model):
                model = old_model
        else:
            models_length[model_l] = (len(model), model)
        # Grouping key is model + type.
        model = clean_space(model).strip().title() + "^" + lines[2]
        keywords = lines[3].upper()
        if model in models:
            if not whether_repeat(models[model], keywords):
                models[model] = models[model] + "|" + keywords
        else:
            models[model] = keywords
    data_set = [brand.title() + "^" + k + "^" + v for k, v in models.items()]
    utils.save_to_datas_file(data_set, path_save)
    utils.deduplication_data(path_save)