Пример #1
0
    def extra_any_json(rawData: str,
                       ruler: str,
                       cap=None,
                       list_path: list = None) -> list:
        """Extract one dict per element of a decoded JSON collection.

        :param rawData: raw text containing the JSON document.
        :param ruler: rule string; ``ExtraHtml.ruler_killer`` expands it into
            entries whose ``[0]`` is the output key and whose ``[1]`` is the
            lookup rule handed to ``ExtraJSON.ruler_finder``.
        :param cap: optional pre-processing hint.  A list/tuple supplies the
            two markers passed to ``StringHelper.extra_a_to_b_x`` before
            decoding; a string is forwarded to ``ExtraJSON.dic_dip_extra``
            after decoding.
        :param list_path: optional path applied with
            ``StringHelper.dic_looper``; only honoured when ``cap`` is None.
        :return: list of dicts, one per element processed without error
            (an element that raises is reported via ``print`` and skipped).
            An empty list is returned when the decoded payload is None.
        :raises TypeError: if ``cap`` is not None, list, tuple or str.
        """
        rulers = ExtraHtml.ruler_killer(ruler)

        if cap is None:
            decoded = demjson.decode(rawData)
            if list_path is not None:
                decoded = StringHelper.dic_looper(decoded, list_path)
        elif isinstance(cap, (list, tuple)):
            # Tuples are accepted as well as lists: both carry the two markers.
            rawData = StringHelper.extra_a_to_b_x(rawData, cap[0], cap[1])
            decoded = demjson.decode(rawData)
        elif isinstance(cap, str):
            decoded = ExtraJSON.dic_dip_extra(demjson.decode(rawData), cap)
        else:
            raise TypeError(
                'cap must be None, a list/tuple of two markers, or a str; got '
                + type(cap).__name__)

        elementlist = []
        if decoded is None:
            # Nothing to iterate over; avoid "NoneType is not iterable".
            return elementlist

        for item in decoded:  # loop over the decoded JSON elements
            try:
                record = dict()
                for rul in rulers:  # loop over the ruler entries
                    res = ExtraJSON.ruler_finder(item, rul[1])
                    if res is not None:
                        record[rul[0]] = res
                elementlist.append(record)
            except Exception as e:
                # Best effort: a failing element is reported and skipped.
                print(e)

        return elementlist
Пример #2
0
    def looper_html(self, result, raw, exists, ruler, extra3_tup, extra4_tup):
        """Append newly seen list entries from an HTML page to ``result``.

        :param result: list that receives the extracted dicts (mutated in place).
        :param raw: page source handed to ``ExtraHtml.any_list_finder_ex``.
        :param exists: collection of already known links; ``.count()`` is used
            for the membership check.
        :param ruler: rule passed to ``ExtraHtml.tag_list_to_ruler_list_ex``.
        :param extra3_tup: text parsed by ``Sh.str_to_dictup`` into the
            parent-container descriptor.
        :param extra4_tup: text parsed by ``Sh.str_to_dictup`` into the
            list-item descriptor.
        :return: None.  Iteration stops at the first entry that has no
            ``'link'`` key or whose link is already in ``exists``.
        """
        parent_parsed = Sh.str_to_dictup(extra3_tup)
        parent_container = (parent_parsed[0], parent_parsed[1])

        list_parsed = Sh.str_to_dictup(extra4_tup)
        list_tup = (list_parsed[0], list_parsed[1])

        # Fetch the list and extract each entry into a dict.
        entries = ExtraHtml.any_list_finder_ex(raw, parent_container, list_tup)

        for entry in entries:
            if len(entry) < 1:
                continue

            record = ExtraHtml.tag_list_to_ruler_list_ex(entry, ruler)

            # Duplicate check used during routine crawling.
            try:
                link = record['link']
            except Exception:
                # No usable link: stop, but let KeyboardInterrupt/SystemExit
                # propagate instead of swallowing them.
                break

            if exists.count(link) < 1:
                result.append(record)
            else:
                break
Пример #3
0
    def get_txt_page_encode(raw:str) -> str:
        """Return the encoding name found next to ``charset`` in ``raw``.

        The value is obtained with ``StringHelper.extra_equ_value(raw,
        'charset', '"')`` and then passed through
        ``StringHelper.cutfrom(..., '/')``.

        :param raw: page source text.
        :return: the extracted value, or ``''`` when extraction fails.
        :raises HttpConnectionFailedException: re-raised unchanged.
        """
        try:
            encode = StringHelper.extra_equ_value(raw, 'charset', '"')
            return StringHelper.cutfrom(encode, '/')
        except HttpConnectionFailedException:
            # Bare raise keeps the original traceback.
            raise
        except Exception:
            # Best effort: any other failure means "encoding unknown".
            return ''
Пример #4
0
    def get_page_encode(self, url):
        """Fetch ``url`` and return the encoding name found next to ``charset``.

        The page is fetched with ``self.req.get(url)``; the value is obtained
        with ``StringHelper.extra_equ_value(raw, 'charset', '"')`` and then
        passed through ``StringHelper.cutfrom(..., '/')``.

        :param url: address of the page to inspect.
        :return: the extracted value, or ``''`` when extraction fails.
        :raises HttpConnectionFailedException: re-raised unchanged.
        """
        try:
            raw = self.req.get(url)
            encode = StringHelper.extra_equ_value(raw, 'charset', '"')
            return StringHelper.cutfrom(encode, '/')
        except HttpConnectionFailedException:
            # Bare raise keeps the original traceback.
            raise
        except Exception:
            # Best effort: any other failure means "encoding unknown".
            return ''
Пример #5
0
    def ruler_finder_multi_at(tag, ruler2):
        """Collect the list entries found inside ``tag`` for an ``a@b`` rule.

        ``ruler2`` is split once on ``'@'``: the part after ``'@'`` describes
        the parent container and the part before it describes the list items
        (both parsed with ``Sh.str_to_dictup``).

        :param tag: tag object; ``tag.name`` and ``tag.attrs`` are inspected.
        :param ruler2: rule text of the form ``<list part>@<parent part>``.
        :return: a list of extracted entries when ``tag`` matches the parent
            descriptor, otherwise None (also None when any error is raised
            inside the matching step).
        """
        result = None
        pair = ruler2.split('@', 1)

        # Part after '@' -> parent container descriptor.
        tup = Sh.str_to_dictup(pair[1])
        _tag = tup[0]
        dic = tup[1]
        parent_container = (_tag, dic)

        # Part before '@' -> list item descriptor.
        tup = Sh.str_to_dictup(pair[0])
        _tag = tup[0]
        dic = tup[1]
        list_tup = (_tag, dic)

        p_tag = parent_container[0]
        p_dic = parent_container[1]
        s_ruler = ''

        # Only the first key of the parent attribute dict is used to build
        # the "<tag> <key>=<value>" text.
        for key in p_dic:
            s_ruler = p_tag + ' ' + key + '=' + p_dic[key]
            break
        ruler_pair = ('', s_ruler)

        # NOTE(review): when p_dic is empty, s_ruler stays '' and the [1]
        # index below raises IndexError outside the try block.
        ruler_pair_part_of = ruler_pair[1].split(' ', 2)
        identify = ruler_pair_part_of[1].split('=', 1)
        if tag.name != ruler_pair_part_of[0]:
            return result

        try:
            if tag.attrs[identify[0]] == identify[1]:
                result = []
                raw = str(tag)

                # Fetch the list and extract each entry into a dict.
                list = ExtraHtml.any_list_finder_ex(raw, parent_container, list_tup)

                for item in list:
                    if len(item) < 1:
                        continue

                    # NOTE(review): `ruler` is neither a parameter nor a local
                    # here; unless a module-level `ruler` exists this raises
                    # NameError, which the bare except below turns into None.
                    dic_list = ExtraHtml.tag_list_to_ruler_list_ex(item, ruler)

                    result.append(dic_list)

                return result
        except:
            # Any failure (missing attribute, helper error, ...) yields None.
            result = None

        return result
Пример #6
0
    def finder_sohu(self, raw):
        """Extract the play count from a ``...=<count>;`` style response.

        The text after the first ``'='`` is passed to ``s.cut_tail(..., ';')``.

        :param raw: response text.
        :return: ``{'playCount': count}`` on success, None when the text
            cannot be parsed.
        """
        try:
            count = s.cut_tail(raw.split('=')[1], ';')
        except Exception:
            # Best effort: unparsable input yields None, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return None

        return {'playCount': count}
Пример #7
0
    def extra_any_json_dic(rawData, ruler, cap=None):
        """Extract a single dict from a decoded JSON document.

        :param rawData: raw text containing the JSON document.
        :param ruler: rule string; ``ExtraHtml.ruler_killer`` expands it into
            entries whose ``[0]`` is the output key and whose ``[1]`` is the
            lookup rule handed to ``ExtraJSON.ruler_finder``.
        :param cap: optional pre-processing hint.  A list/tuple supplies the
            two markers passed to ``StringHelper.extra_a_to_b_x`` before
            decoding; a string is forwarded to ``ExtraJSON.dic_dip_extra``
            after decoding.  For any other non-None value nothing is decoded
            and the rules are evaluated against None.
        :return: dict of the values found; rules that yield None are omitted.
            An error while evaluating the rules is printed and the values
            collected so far are returned.
        """
        rulers = ExtraHtml.ruler_killer(ruler)

        decoded = None
        if cap is None:
            decoded = demjson.decode(rawData)
        elif isinstance(cap, (list, tuple)):
            # Tuples are accepted as well as lists: both carry the two markers.
            rawData = StringHelper.extra_a_to_b_x(rawData, cap[0], cap[1])
            decoded = demjson.decode(rawData)
        elif isinstance(cap, str):
            decoded = ExtraJSON.dic_dip_extra(demjson.decode(rawData), cap)

        elementlist = {}
        try:
            for rul in rulers:
                res = ExtraJSON.ruler_finder(decoded, rul[1])
                if res is not None:
                    elementlist[rul[0]] = res
        except Exception as e:
            # Best effort: report and return what was collected so far.
            print(e)

        return elementlist
Пример #8
0
    def extra_any_json_ex(rawData, ruler, cap=None):
        """Build one flat dict from the top-level entries of a JSON object.

        :param rawData: raw text containing the JSON document.
        :param ruler: rule string expanded by ``ExtraHtml.ruler_killer``;
            each entry's ``[0]`` is the output key and ``[1]`` the lookup rule.
        :param cap: optional pair of markers passed to
            ``StringHelper.extra_a_to_b_x`` to cut ``rawData`` before decoding.
        :return: dict of the values found.  For a nested dict value the rule is
            resolved with ``ExtraJSON.ruler_finder``; otherwise the value is
            taken when the top-level key equals the rule.
        """
        if cap is not None:
            rawData = StringHelper.extra_a_to_b_x(rawData, cap[0], cap[1])

        rulers = ExtraHtml.ruler_killer(ruler)
        decoded = demjson.decode(rawData)

        collected = {}
        for key in decoded:
            try:
                for rul in rulers:
                    found = None
                    if isinstance(decoded[key], dict):
                        found = ExtraJSON.ruler_finder(decoded[key], rul[1])
                    elif key == rul[1]:
                        found = decoded[key]

                    if found is None:
                        continue
                    collected[rul[0]] = found
            except Exception as e:
                # A failing key is reported and the remaining keys still run.
                print(e)

        return collected
Пример #9
0
    def go_json(rawData, ruler, cap=None):
        """Extract fields from a single JSON record.

        The decoded document is flattened by ``ExtraJSON.dic_to_list`` and
        every flattened entry is checked against each rule; later matches for
        the same output key overwrite earlier ones.

        :param rawData: raw text containing the JSON document.
        :param ruler: rule string expanded by ``ExtraHtml.ruler_killer_ex``;
            each entry's ``[0]`` is the output key and ``[1]`` the text to
            look for.
        :param cap: optional pair of markers passed to
            ``StringHelper.extra_a_to_b_x`` to cut ``rawData`` before decoding.
        :return: dict mapping output keys to the entry with
            ``rule + ' '`` removed by ``StringHelper.cut_head``.
        """
        extracted = dict()

        if cap is not None:
            rawData = StringHelper.extra_a_to_b_x(rawData, cap[0], cap[1])

        rulers = ExtraHtml.ruler_killer_ex(ruler)
        decoded = demjson.decode(rawData)

        flattened = []
        ExtraJSON.dic_to_list(decoded, flattened)

        for entry in flattened:
            for rule in rulers:
                needle = rule[1]
                if entry.count(needle) > 0:
                    extracted[rule[0]] = StringHelper.cut_head(entry, needle + ' ')

        return extracted
Пример #10
0
    def looper_html_ex(self, result, raw, ruler, extra3_tup, extra4_tup):
        """Append every non-empty list entry of an HTML page to ``result``.

        :param result: list that receives the extracted dicts (mutated in place).
        :param raw: page source handed to ``ExtraHtml.any_list_finder_ex``.
        :param ruler: rule passed to ``ExtraHtml.tag_list_to_ruler_list_ex``.
        :param extra3_tup: text parsed by ``Sh.str_to_dictup`` into the
            parent-container descriptor.
        :param extra4_tup: text parsed by ``Sh.str_to_dictup`` into the
            list-item descriptor.
        :return: None.
        """
        parent_parsed = Sh.str_to_dictup(extra3_tup)
        parent_container = (parent_parsed[0], parent_parsed[1])

        list_parsed = Sh.str_to_dictup(extra4_tup)
        list_tup = (list_parsed[0], list_parsed[1])

        # Fetch the list and extract each entry into a dict.
        entries = ExtraHtml.any_list_finder_ex(raw, parent_container, list_tup)

        for entry in entries:
            if len(entry) >= 1:
                result.append(ExtraHtml.tag_list_to_ruler_list_ex(entry, ruler))
Пример #11
0
    def looper_js(self, result:list, raw:str, exists:list, ruler:str, extra3_tup:str=None, list_json_path:str=None, identifier_key:str=None):
        """Append newly seen JSON entries to ``result``.

        :param result: list that receives the extracted dicts (mutated in place).
        :param raw: raw text handed to ``ExtraJSON.extra_any_json``.
        :param exists: already known identifiers; ``.count()`` is used for the
            membership check.
        :param ruler: rule string forwarded to ``ExtraJSON.extra_any_json``.
        :param extra3_tup: optional text converted with ``Sh.str_to_tup`` and
            forwarded as ``cap`` (e.g. ``('data_callback(', ')')``).
        :param list_json_path: optional ``'->'`` separated path, split with
            ``Sh.separator`` and forwarded as ``list_path``; only used when
            ``extra3_tup`` is None.
        :param identifier_key: key used for the duplicate check; defaults to
            ``'link'``.
        :return: None.  Iteration stops at the first already known entry.
        """
        key = 'link' if identifier_key is None else identifier_key

        cap = None
        path = None
        if extra3_tup is not None:
            cap = Sh.str_to_tup(extra3_tup)
        elif list_json_path is not None:
            path = Sh.separator(list_json_path, '->')

        entries = ExtraJSON.extra_any_json(raw, ruler, cap=cap, list_path=path)

        if len(entries) > 0:
            for entry in entries:
                # Duplicate check used during routine crawling.
                if exists.count(entry[key]) >= 1:
                    break
                result.append(entry)
Пример #12
0
    def find_constant(self):
        """Locate the URL part, and the number inside it, that differs
        between ``self.second`` and ``self.third``.

        Sets ``part_second``/``part_third`` (results of
        ``StringHelper.url_divider``), ``out_index`` (first index at which the
        parts differ), ``constant_sec``, ``list_sec``/``list_thi`` (results of
        ``self.find_num_ex``) and, for each differing pair of those lists,
        ``variable_sec`` and ``to_right`` (the last differing pair wins).
        """
        self.part_second = StringHelper.url_divider(self.second)
        self.part_third = StringHelper.url_divider(self.third)

        for idx in range(len(self.part_second)):
            differs = self.part_second[idx] != self.part_third[idx]
            if differs:
                self.out_index = idx
                break

        self.constant_sec = self.part_second[self.out_index]
        third_piece = self.part_third[self.out_index]

        self.list_sec = self.find_num_ex(self.constant_sec)
        self.list_thi = self.find_num_ex(third_piece)

        for idx in range(len(self.list_sec)):
            if self.list_sec[idx] == self.list_thi[idx]:
                continue
            self.variable_sec = self.list_sec[idx]
            # Increasing from second to third means the sequence runs "right".
            self.to_right = True if self.list_sec[idx] < self.list_thi[idx] else False
Пример #13
0
 def finder_need_tag(tag, need_tag=False, next_sibling=False, next_sibling_text=False, get_text=False, ruler_pair:()=None):
     """Return the piece of ``tag`` selected by the first truthy flag.

     Flags are checked in this order: ``need_tag``, ``next_sibling_text``,
     ``next_sibling``, ``get_text``; with no flag set ``tag.text`` is returned.

     :param tag: tag object to read from.
     :param need_tag: return ``tag`` itself.
     :param next_sibling: return ``(key, str(tag.next_sibling))`` where ``key``
         is ``Sh.cut_tail(ruler_pair[0], '-')``; returns None when
         ``ruler_pair`` is None.
     :param next_sibling_text: return ``tag.next_sibling.text``.
     :param get_text: return ``tag.get_text()``.
     :param ruler_pair: ``(key, rule)`` pair, tuple or list.  A list still gets
         its first item replaced by the trimmed key, as before; a tuple is
         left untouched instead of raising TypeError on item assignment.
     :return: the selected value (see above).
     """
     if need_tag:
         return tag

     if next_sibling_text:
         return tag.next_sibling.text

     if next_sibling:
         if ruler_pair is None:
             return None
         key = Sh.cut_tail(ruler_pair[0], '-')
         if isinstance(ruler_pair, list):
             # Keep the historical in-place update for list callers.
             ruler_pair[0] = key
         return (key, str(tag.next_sibling))

     if get_text:
         return tag.get_text()

     return tag.text
Пример #14
0
    def check_shutdown_status():
        """Report whether a shutdown has been requested through the flag file.

        Reads the file named by ``Configs().system_shutdown_flag_filename``
        and interprets its trimmed content as an integer.  When the value is
        1 the flag is reset via ``somebody_help.reset_shutdown_status()``.

        :return: True when the flag was 1; False otherwise, including when
            reading or parsing the file fails (an error is logged then).
        """
        try:
            file_name = Configs().system_shutdown_flag_filename
            status = int(StringHelper.trim(FileHelper.read(file_name)))

            if status != 1:
                return False

            somebody_help.reset_shutdown_status()
            return True
        except Exception:
            # Best effort: a failure here is treated as "no shutdown requested",
            # while KeyboardInterrupt/SystemExit still propagate.
            LogGo.error("system_shutdown_flag_file unavailable!")
            return False
Пример #15
0
    def container_process(self, raw:str, tribleStar:str) -> str:
        """
        Trim the front and back of ``raw``.
        :param raw: text to trim
        :param tribleStar: ``<head>***<tail>`` marker pair; when None or empty
            ``raw`` is returned unchanged
        :return: result of ``Sh.strip_head_tail(raw, head, tail)``
        """
        if tribleStar is None or tribleStar == '':
            return raw

        markers = tribleStar.split('***')
        return Sh.strip_head_tail(raw, markers[0], markers[1])
Пример #16
0
    def looper_js(self, result, raw, exists, ruler, captup=None):
        """Append every entry extracted from a JSON payload to ``result``.

        :param result: list that receives the extracted dicts (mutated in place).
        :param raw: raw text handed to ``ExtraJSON.extra_any_json``.
        :param exists: unused; the duplicate check against it is disabled.
        :param ruler: rule string forwarded to ``ExtraJSON.extra_any_json``.
        :param captup: optional ``cap`` value; when it contains exactly two
            spaces it is first converted with ``Sh.str_to_tup``.
        :return: None.
        """
        cap = captup
        if captup is not None and captup.count(' ') == 2:
            cap = Sh.str_to_tup(captup)

        entries = ExtraJSON.extra_any_json(raw, ruler, cap=cap)

        if len(entries) > 0:
            # The duplicate check (exists.count(item['link']) < 1) is switched
            # off, so every entry is appended.
            for entry in entries:
                result.append(entry)
Пример #17
0
    def purify_a(tag):
        """Rewrite ``data_ue_src``/``href`` on every ``<a>`` beneath ``tag``.

        For an ``a`` tag the ``data_ue_src`` attribute is passed to
        ``StringHelper.extra_a_to_b(..., 'sn=', '&')``; the first three
        characters and the last character of the result are dropped and the
        remainder is written to both ``data_ue_src`` and ``href``.  The
        function then recurses into every child that is a ``Tag``.

        :param tag: tag object; modified in place.
        :return: None.
        """
        if tag.name == 'a':
            try:
                data_ue_src = tag.attrs['data_ue_src']
                sn = StringHelper.extra_a_to_b(data_ue_src, 'sn=', '&')
                # presumably strips the 'sn=' prefix and the trailing '&'
                # -- TODO confirm against extra_a_to_b
                sn = sn[3:len(sn) - 1]

                tag.attrs['data_ue_src'] = sn
                tag.attrs['href'] = sn
            except Exception:
                # Best effort: anchors without a usable data_ue_src are left
                # as they are; KeyboardInterrupt/SystemExit still propagate.
                pass

        for _tag in tag.contents:
            if isinstance(_tag, Tag):
                ExtraJSON.purify_a(_tag)
Пример #18
0
    def generate_url(self,index):
        """
        Build the URL for page ``index`` from the pattern found earlier.

        The number in ``self.variable_sec[0]`` is shifted by ``index - 2``
        (added when ``self.to_right`` is true, subtracted otherwise), written
        into ``self.part_third[self.out_index]`` via ``StringHelper.exchange``
        and the parts are joined with ``'/'``.
        :param index: page index; values up to and including 3 yield None
        :return: the generated URL, or None when ``index <= 3``
        """
        if index <= 3:
            return None

        base = int(self.variable_sec[0])
        step = index - 2
        ref = base + step if self.to_right else base - step

        replacement = StringHelper.exchange(self.constant_sec, self.variable_sec[0], str(ref), self.variable_sec[1])
        self.part_third[self.out_index] = replacement

        return "/".join(self.part_third)
Пример #19
0
    def ruler_finder_condition_content(ruler:str, extract=False) -> bool:
        """
        Tell whether ``ruler`` has the ``[text]`` form, or extract that text.

        A ruler qualifies when it contains exactly one ``'['`` and one ``']'``
        and starts with ``'['``.
        :param ruler: rule text to inspect
        :param extract: when true, return the result of
            ``Sh.extra_a_to_b_x(ruler, '[', ']')`` instead of a boolean
        :return: True/False when ``extract`` is false; the extracted value
            when ``extract`` is true
        :raises Exception: when ``extract`` is true and the ruler does not
            qualify (or extraction failed)
        """
        type1 = '['
        type2 = ']'

        result = False

        try:
            if ruler.count(type1) == 1 and ruler.count(type2) == 1 and ruler.startswith(type1):
                if extract:
                    result = Sh.extra_a_to_b_x(ruler, type1, type2)
                else:
                    result = True
        except Exception:
            # Non-string rulers or helper failures count as "no match";
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            pass

        if result is False and extract:
            raise Exception("未找到指定 字符")

        return result
Пример #20
0
    def scan(self, target, order):
        """Fetch statistics for ``target`` from the site selected by its type code.

        The type code comes from ``self.td(target)`` and selects the URL
        template, the optional ``cap`` markers and the extraction ruler:
        ``'i'`` iqiyi, ``'l'`` letv, ``'t'`` qq, ``'m'`` mgtv, ``'y'`` youku,
        ``'s'`` sohu, ``'c'`` cntv (names taken from the ``*_base`` attributes).

        :param target: object providing ``extra0`` (and ``extra1`` for ``'l'``).
        :param order: forwarded to ``self.build_base_dic``.
        :return: ``result[0]`` after the optional ``build_base_dic`` step.
        """
        result = []

        type = self.td(target)
        url = target.extra0  #'http://ent.people.com.cn/GB/81374/index1.html'

        cap = None
        ruler = None

        # NOTE(review): this branch calls self.td(target) a second time
        # instead of reusing `type`.
        if self.td(target) == 'i':
            cap = ['var tvInfoJs=', '']
            url = self.iqiyi_base.format(url)
            ruler = 'keywords:contentKeyword;latestOrder:latestOrder;name:name;playCount:playCount;score:score;videoCount:videoCount'

        elif type == 'l':
            ruler = 'score:plist_score;comments:pcommon_count;bullets:pdm_count;like:up;hate:down;playCount:plist_play_count'
            url = self.letv_base.format(url, target.extra1)

        elif type == 't':
            cap = ["tlux.dispatch('$cover',", ");"]
            ruler = 'score:score->score;playCount:view_all_count;videoCount:episode_all;latestOrder:episode_updatedd'
            url = self.qq_base.format(url)

        elif type == 'm':
            url = self.mgtv_base.format(url)
            cap = ['"data":', ',"msg"']
            ruler = 'playCount:all;like:like;hate:unlike'

        elif type == 'y':
            ruler = 'playCount:li [总播放数];comments:li [评论];like:li [顶];score:span class=star-num'
            # Values that are not already URLs are expanded through the
            # youku templates; the prefix is only added when the value does
            # not start with 'id'.
            if not s.is_url(url):
                if not url.startswith('id'):
                    url = self.youku_prefix.format(url)
                url = self.youku_base.format(url)

        elif type == 's':
            url = self.sohu_base.format(url)

        elif type == 'c':
            url = self.cntv_base.format(url)
            ruler = 'playCount:^label [播放次数]'

        try:
            encode = ExtraHtml.get_page_encode(url)

            # 'y' and 'c' are parsed as HTML; everything else is fetched as
            # text and parsed as JSON (or by finder_sohu for 's').
            if type == 'y' or type == 'c':
                result = self.looper_html(url, ruler, encode, target)
            else:
                raw = RequestHelper.get(url, encode=encode)

                if type == 's':
                    result = self.finder_sohu(raw)
                else:
                    result = self.looper_js(raw, ruler, cap)
        except AttributeError as e:
            # AttributeError is ignored silently; result keeps its last value.
            pass
        except Exception as e:
            import traceback
            msg = traceback.format_exc()
            print(msg)
            LogGo.warning(repr(e))

        if len(result) > 0:
            result = self.build_base_dic(target, result, order)

        # NOTE(review): when result is still the empty list this raises
        # IndexError; confirm that callers expect that.
        return result[0]
Пример #21
0
    def extract_dic_list_from_page(self, result, url, parent_attr, list_attr, ruler, exists:list=None, encode:str=None):
        """
        Fetch a page and append the dicts extracted from its list to ``result``.
        :param result: result list (mutated in place)
        :param url: address fetched with ``self.req.get``
        :param parent_attr: parent_container attribute text, parsed by ``Sh.str_to_dictup``
        :param list_attr: list attribute text, parsed by ``Sh.str_to_dictup``
        :param ruler: dict extraction rule
        :param exists: optional known links; when given, iteration stops at the
            first entry whose ``'link'`` is already present
        :param encode: forwarded to ``self.req.get`` as ``encode``
        """
        raw = self.req.get(url, encode=encode)

        parent_parsed = Sh.str_to_dictup(parent_attr)
        parent_container = (parent_parsed[0], parent_parsed[1])

        list_parsed = Sh.str_to_dictup(list_attr)
        list_tup = (list_parsed[0], list_parsed[1])

        # Fetch the list and extract each entry into a dict.
        entries = ExtraHtml.any_list_finder_ex(raw, parent_container, list_tup)

        for entry in entries:
            if len(entry) < 1:
                continue

            record = ExtraHtml.tag_list_to_ruler_list_ex(entry, ruler)

            # Duplicate check while crawling.
            if exists is not None and exists.count(record['link']) >= 1:
                break
            result.append(record)





# Backup 1204 (commented-out earlier version of ruler_finder_ex)
# @staticmethod
#     def ruler_finder_ex(tag, ruler_pair, need_tag=False):
#         result = None
#         ruler2 = ruler_pair[1]
#
#         #在某个父tag 下取得所有相同类型的子tag
#         # 这个好像是个废掉的功能
#         if ExtraHtml.ruler_finder_condition(ruler2) == 'at_loop':
#             re = ExtraHtml.ruler_finder_multi_at(tag, ruler2)
#
#             if re is not None:
#                 result = (ruler_pair[0], re)
#                 return result
#         #依据父属性做查找
#         elif ruler_pair[1].count(':') > 0 and ruler_pair[1].count(':') > ruler_pair[1].count('http:'):
#             status = False
#             _pair = ruler_pair[1].split(':', 1)
#             name = _pair[0]#可能 是子标签
#             name_part = []
#
#             if name.count(' ') > 0:#可能 有属性
#                 name_part = name.split(' ', 1)#可能 属性
#                 name = name_part[0]#可能 标签
#
#             if tag.name != name:#可能 不是此标签,跳过
#                 return None
#
#             parent_pair = _pair[1].split(' ', 1)#可能 父标签部分
#             parent_name = parent_pair[0]#可能 父标签名
#
#             if tag.parent == None or not isinstance(tag.parent, Tag) or tag.parent.name != parent_name:#可能 被遍历的tag没有父元素 或者 父元素 不是 tag 或者 父元素名不对 跳过
#                 return None
#
#             if len(parent_pair) == 1:#可能 没有添加属性 那么 父元素的要求已达到
#                 status = True
#
#             parent_attr = parent_pair[1].split('=', 1)#父标签的属性
#
#             if len(parent_attr) == 1 or (len(parent_attr) > 1 and parent_attr[1] == None or parent_attr[1] == ''):#可能* 如果有父标签的属性 或者 等号后为空
#                 try:
#                     attr = tag.parent.attrs[parent_attr[0]]
#                     status = True
#                 except:
#                     pass
#
#             if len(parent_attr) > 1 and parent_attr[1] != None or parent_attr[1] != '' :
#                 try:
#                     if ' '.join(tag.parent.attrs[parent_attr[0]]) == parent_attr[1]:
#                         status = True
#                 except:
#                     pass
#
#             if status:
#                 if len(name_part) > 1 and name_part[1] != None and name_part != '':
#                     return ExtraHtml.ruler_finder_ex(tag,(ruler_pair[0], _pair[0]))
#                 else:
#                     result = (ruler_pair[0], tag.text)
#
#         #依据自身属性
#         elif ruler_pair[1].count(' ') > 0 or ruler_pair[1].count('=') == 2:
#             if ruler_pair[1].count('=') == 2:
#                 ruler_pair_part_of = ruler_pair[1].split(' ', 2)
#                 identify = ruler_pair_part_of[2].split('=', 1)
#
#                 if tag.name != ruler_pair_part_of[0]:
#                     return None
#
#                 try:
#                     if tag.attrs[identify[0]] == identify[1]:
#                         return ExtraHtml.ruler_finder_ex(tag, (ruler_pair[0], ruler_pair_part_of[0] + ' ' + ruler_pair_part_of[1]))
#                 except:
#                     pass
#             else:
#                 _pair = ruler_pair[1].split(' ', 1)
#                 _name = _pair[0]
#
#                 next_flag = False
#
#                 if _name.count('^') == 1:
#                     next_flag = True
#                     _name = Sh.cut_head(_name, '^')
#
#                 if tag.name != _name:
#                     return None
#
#                 # 关键字查询
#                 keyword = Sh.extra_a_to_b_x(_pair[1], '[', ']')
#                 if _pair[1].count('[') == 1 and _pair[1].count(']') == 1:
#                     if keyword in tag.text and len(tag.contents) <= 1:
#                         if next_flag:
#                             return (ruler_pair[0], tag.next_sibling.text)
#                         else:
#                             return (ruler_pair[0], tag.text)
#
#                 equ_pair = _pair[1].split('=',1)
#
#                 _attr = equ_pair[0]
#
#                 val = StringHelper.extra_a_to_b(_attr, '(', ')')
#                 _attr = StringHelper.delete_piece(_attr, val)
#
#                 for item in equ_pair:
#                     if item == '':
#                         equ_pair.remove(item)
#
#                 if len(equ_pair) > 1:
#                     value = equ_pair[1]
#
#                     try:
#                         att = tag.attrs[_attr]
#                         _value = ''
#                         if isinstance(att, str):
#                             _value = att
#                         elif isinstance(att, list):
#                             _value = " ".join(tag.attrs[_attr])
#                         if _value == value:
#                             result = (ruler_pair[0], tag.text)
#                             # return (ruler_pair[0], tag.text)
#                     except:
#                         pass
#                 else:
#                     try:
#                         val = val[1:len(val) - 1]
#
#                         ed = tag.attrs[_attr]
#                         op = val
#                         # com = op + ed
#                         com = RulerExtra.canwecom(op, ed)
#
#                         # return (ruler_pair[0], com)
#                         result = (ruler_pair[0], com)
#                     except:
#                         return None
#                 # return (ruler_pair[0], tag.attrs[_attr])
#
#         # 不通过属性 直接查找
#         else:
#             if tag.name == ruler_pair[1]:
#                 if tag.text is not None and tag.text != '':
#                     # return (ruler_pair[0], tag.text)
#                     result = (ruler_pair[0], tag.text)
#
#         # 星判断
#         if result != None and len(result) > 1 and result[0].count('*') > 0:
#             key = ''
#             value = ''
#             value_list = []
#             if ruler_pair[0].count('*') > 0:
#                 if len(tag.contents) > 1:#多重判断
#                     for con in tag:
#                         if isinstance(con, Tag):
#                             #提取p标签
#                             if ruler_pair[0].count('***') == 1:
#                                 if con.name == 'p':
#                                     # value += con.get_text()
#                                     # value += ExtraHtml.ruler_finder_recursion_dig_for_p(con)
#                                     ExtraHtml.ruler_finder_recursion_dig_for_p(con, value_list)
#                             # 删除所有标签
#                             elif ruler_pair[0].count('**') == 1:
#                                 value += con.get_text()
#                             # 保留标签
#                             elif ruler_pair[0].count('*') == 1:
#                                 value += str(con)
#
#                 else:#单tag 判断
#                     if ruler_pair[0].count('**') == 1:
#                         value += tag.get_text()
#                     elif ruler_pair[0].count('*') == 1:
#                         value += str(tag)
#
#                 #善后
#                 if ruler_pair[0].count('***') == 1:
#                     # value = "".join(value.split())
#                     value = "".join(value_list)
#                     value = "".join(value.split())
#                     key = StringHelper.cutfrom(ruler_pair[0], '***')
#                 elif ruler_pair[0].count('**') == 1:
#                     value = "".join(value.split())
#                     key = StringHelper.cutfrom(ruler_pair[0], '**')
#                 elif ruler_pair[0].count('*') == 1:
#                     key = StringHelper.cutfrom(ruler_pair[0], '*')
#
#             elif ruler_pair[0].count('^') > 0:
#                 value = tag.text
#                 value = "".join(value.split())
#                 key = StringHelper.cutfrom(ruler_pair[0], '^')
#             else:
#                 key = ruler_pair[0]
#                 value = tag.text
#
#             if value == '':
#                 print('info: in ruler_finder_ex star filter -> ' + result[0] + ' got empty result!')
#                 value = result[1]
#
#             return (key, value)
#         elif result == None:
#             return None
#         else:
#             return result
Пример #22
0
    def ruler_finder_ex(tag, ruler_pair, need_tag=False):
        """
        Core function for extracting elements from an HTML page.

        ``ruler_pair[1]`` is matched against ``tag`` through one of several
        rule forms (content match, ``at_loop``, parent-attribute match,
        own-attribute match, plain tag name).  When the key in
        ``ruler_pair[0]`` contains ``*`` markers the matched value is
        post-processed before being returned.
        :param tag: the soup tag element being examined
        :param ruler_pair: element lookup rule; index 0 is the result key, index 1 the rule text
        :param need_tag: if true, the matched tag element is returned instead of its text
            (forwarded to ``ExtraHtml.finder_need_tag``)
        :return: a ``(key, value)`` tuple in most branches (the next-sibling
            content branch returns whatever ``ExtraHtml.finder_need_tag``
            yields), or None when nothing matched
        """

        result = None
        ruler2 = ruler_pair[1]

        next_sibling = False

        # Decide whether the next sibling has to be fetched
        if ExtraHtml.ruler_finder_condition_next_sibling(ruler_pair[0]):
            next_sibling = True

        # Match by the content between the tags: for <span>你好</span> the ruler is [你好]
        if ExtraHtml.ruler_finder_condition_content(ruler2):
            try:
                if tag.get_text() == ExtraHtml.ruler_finder_condition_content(ruler2, True):
                    if next_sibling:
                        result = ExtraHtml.finder_need_tag(tag, need_tag, next_sibling=True, ruler_pair=ruler_pair)
                    else:
                        result = (ruler_pair[0], ExtraHtml.finder_need_tag(tag, need_tag))

                    return result
            except:
                # Any error in the content match is ignored; falls through with result None.
                pass
        # Get all child tags of the same type under a given parent tag
        # This seems to be an abandoned feature
        elif ExtraHtml.ruler_finder_condition(ruler2) == 'at_loop':
            re = ExtraHtml.ruler_finder_multi_at(tag, ruler2)

            if re is not None:
                result = (ruler_pair[0], re)
                return result
        # Look up based on the parent's attributes
        elif ruler_pair[1].count(':') > 0 and ruler_pair[1].count(':') > ruler_pair[1].count('http:'):
            status = False
            _pair = ruler_pair[1].split(':', 1)
            name = _pair[0]# possibly the child tag
            name_part = []

            if name.count(' ') > 0:# possibly has attributes
                name_part = name.split(' ', 1)# possibly the attribute part
                name = name_part[0]# possibly the tag name

            if tag.name != name:# possibly not this tag, skip
                return None

            # Possibly the parent tag part
            parent_pair = _pair[1].split(' ', 1)
            # Possibly the parent tag name
            parent_name = parent_pair[0]

            # The examined tag has no parent, or the parent is not a Tag, or the parent name differs: skip
            if tag.parent == None or not isinstance(tag.parent, Tag) or tag.parent.name != parent_name:
                return None

            # Possibly no attribute was given, so the parent requirement is already met
            if len(parent_pair) == 1:
                status = True

            # Attribute of the parent tag
            # NOTE(review): when len(parent_pair) == 1 the [1] index below
            # raises IndexError even though status was just set to True.
            parent_attr = parent_pair[1].split('=', 1)

            # Possibly: the parent attribute is present, or nothing follows the equals sign
            if len(parent_attr) == 1 or (len(parent_attr) > 1 and parent_attr[1] == None or parent_attr[1] == ''):
                try:
                    attr = tag.parent.attrs[parent_attr[0]]
                    status = True
                except:
                    pass

            # NOTE(review): `and` binds tighter than `or`, so with
            # len(parent_attr) == 1 the right-hand parent_attr[1] is still
            # evaluated and raises IndexError.
            if len(parent_attr) > 1 and parent_attr[1] != None or parent_attr[1] != '' :
                try:
                    if ' '.join(tag.parent.attrs[parent_attr[0]]) == parent_attr[1]:
                        status = True
                except:
                    pass

            if status:
                # NOTE(review): name_part is a list, so `name_part != ''` is always true.
                if len(name_part) > 1 and name_part[1] != None and name_part != '':
                    return ExtraHtml.ruler_finder_ex(tag,(ruler_pair[0], _pair[0]), need_tag)
                else:
                    # result = (ruler_pair[0], tag.text)
                    result = (ruler_pair[0], ExtraHtml.finder_need_tag(tag, need_tag))

        # Look up based on the tag's own attributes
        elif ruler_pair[1].count(' ') > 0 or ruler_pair[1].count('=') == 2:
            if ruler_pair[1].count('=') == 2:
                # Two '=' signs: the third space-separated part is an extra
                # attribute condition; when it holds, recurse on the first two parts.
                ruler_pair_part_of = ruler_pair[1].split(' ', 2)
                identify = ruler_pair_part_of[2].split('=', 1)

                if tag.name != ruler_pair_part_of[0]:
                    return None

                try:
                    if tag.attrs[identify[0]] == identify[1]:
                        return ExtraHtml.ruler_finder_ex(tag, (ruler_pair[0], ruler_pair_part_of[0] + ' ' + ruler_pair_part_of[1]), need_tag)
                except:
                    pass
            else:
                _pair = ruler_pair[1].split(' ', 1)
                _name = _pair[0]

                next_flag = False

                # A single '^' in the tag name sets next_flag and is removed from the name.
                if _name.count('^') == 1:
                    next_flag = True
                    _name = Sh.cut_head(_name, '^')

                if tag.name != _name:
                    return None

                # Keyword lookup
                keyword = Sh.extra_a_to_b_x(_pair[1], '[', ']')
                if _pair[1].count('[') == 1 and _pair[1].count(']') == 1:
                    if keyword in tag.text and len(tag.contents) <= 1:
                        if next_flag:
                            # return (ruler_pair[0], tag.next_sibling.text)
                            # NOTE(review): True is passed as the third positional
                            # argument; verify which flag of finder_need_tag that is.
                            return (ruler_pair[0], ExtraHtml.finder_need_tag(tag, need_tag, True))
                        else:
                            # return (ruler_pair[0], tag.text)
                            return (ruler_pair[0], ExtraHtml.finder_need_tag(tag, need_tag))

                equ_pair = _pair[1].split('=',1)

                _attr = equ_pair[0]

                val = StringHelper.extra_a_to_b(_attr, '(', ')')
                _attr = StringHelper.delete_piece(_attr, val)

                # NOTE(review): removes items from equ_pair while iterating over it.
                for item in equ_pair:
                    if item == '':
                        equ_pair.remove(item)

                if len(equ_pair) > 1:
                    # "attr=value" form: the attribute must equal the value.
                    value = equ_pair[1]

                    try:
                        att = tag.attrs[_attr]
                        _value = ''
                        if isinstance(att, str):
                            _value = att
                        elif isinstance(att, list):
                            _value = " ".join(tag.attrs[_attr])
                        if _value == value:
                            # result = (ruler_pair[0], tag.text)
                            result = (ruler_pair[0], ExtraHtml.finder_need_tag(tag, need_tag))
                    except:
                        pass
                else:
                    # "attr" form (optionally with a "(...)" part): the attribute
                    # value itself is returned, combined via RulerExtra.canwecom.
                    try:
                        val = val[1:len(val) - 1]

                        ed = tag.attrs[_attr]
                        op = val
                        # com = op + ed
                        com = RulerExtra.canwecom(op, ed)

                        result = (ruler_pair[0], com)
                    except:
                        return None
                # return (ruler_pair[0], tag.attrs[_attr])

        # Direct lookup by tag name, without attributes
        else:
            if tag.name == ruler_pair[1]:
                if tag.text is not None and tag.text != '':
                    # result = (ruler_pair[0], tag.text)
                    result = (ruler_pair[0], ExtraHtml.finder_need_tag(tag, need_tag))

        # Asterisk handling
        if result != None and len(result) > 1 and result[0].count('*') > 0:
            key = ''
            value = ''
            value_list = []
            if ruler_pair[0].count('*') > 0:
                if len(tag.contents) > 1:# multiple children
                    for con in tag:
                        if isinstance(con, Tag):
                            # Extract <p> tags
                            if ruler_pair[0].count('***') == 1:
                                if con.name == 'p':
                                    # value += con.get_text()
                                    # value += ExtraHtml.ruler_finder_recursion_dig_for_p(con)
                                    ExtraHtml.ruler_finder_recursion_dig_for_p(con, value_list)
                            # Strip all tags
                            elif ruler_pair[0].count('**') == 1:
                                value += con.get_text()
                            # Keep the tags
                            elif ruler_pair[0].count('*') == 1:
                                value += str(con)

                else:# single tag
                    if ruler_pair[0].count('**') == 1:
                        value += tag.get_text()
                    elif ruler_pair[0].count('*') == 1:
                        value += str(tag)

                # Finalise: remove whitespace and strip the marker from the key
                if ruler_pair[0].count('***') == 1:
                    # value = "".join(value.split())
                    value = "".join(value_list)
                    value = "".join(value.split())
                    key = StringHelper.cutfrom(ruler_pair[0], '***')
                elif ruler_pair[0].count('**') == 1:
                    value = "".join(value.split())
                    key = StringHelper.cutfrom(ruler_pair[0], '**')
                elif ruler_pair[0].count('*') == 1:
                    key = StringHelper.cutfrom(ruler_pair[0], '*')

            # NOTE(review): every result that reaches this point carries
            # ruler_pair[0] as its key, so the two branches below look
            # unreachable -- confirm.
            elif ruler_pair[0].count('^') > 0:
                value = tag.text
                value = "".join(value.split())
                key = StringHelper.cutfrom(ruler_pair[0], '^')
            else:
                key = ruler_pair[0]
                value = tag.text

            if value == '':
                # Fall back to the value found before the asterisk handling.
                print('info: in ruler_finder_ex star filter -> ' + result[0] + ' got empty result!')
                value = result[1]

            return (key, value)
        elif result == None:
            return None
        else:
            return result