def search_longest_mats(self, line):
        '''
        Find the single longest dictionary word occurring in ``line``.

        The line is scanned left to right. At each position
        ``self.longest_prefix`` is asked for the longest known word that
        starts there. A candidate replaces the current best only when it is
        strictly longer, and the scan then jumps past it; otherwise the scan
        advances by one character.

        Returns a ``KVObj`` carrying the word, its start/end offsets in
        ``line`` and the fraction of the line it covers (group, type_name and
        vpat are None, rank is -1). Returns an empty list when nothing matched.
        '''
        best_word, best_start = '', 0
        cursor, total = 0, len(line)

        while cursor < total:
            candidate = self.longest_prefix(line[cursor:])
            # Not an improvement on the current best: step one character.
            if candidate == '' or len(candidate) <= len(best_word):
                cursor += 1
                continue
            best_word, best_start = candidate, cursor
            cursor += len(candidate)

        if not best_word:
            return []

        return KVObj(group=None,
                     rank=-1,
                     type_name=None,
                     vpat=None,
                     word=best_word,
                     start_pos=best_start,
                     end_pos=best_start + len(best_word),
                     ratio=1.0 * len(best_word) / total)
    def search_all_mats(self, line):
        '''
        Collect every dictionary word found in ``line``, left to right.

        At each position ``self.longest_prefix`` supplies the longest known
        word starting there. A hit is recorded and the scan jumps past it, so
        hits never overlap; on a miss the scan advances by one character.

        Returns a list of ``KVObj`` (possibly empty), one per hit, each
        carrying the word, its start/end offsets and the fraction of the line
        it covers (group, type_name and vpat are None, rank is -1).
        '''
        total = len(line)

        # Phase 1: locate the hits as (word, start, end) triples.
        found = []
        cursor = 0
        while cursor < total:
            token = self.longest_prefix(line[cursor:])
            if token == '':
                cursor += 1
            else:
                stop = cursor + len(token)
                found.append((token, cursor, stop))
                cursor = stop

        # Phase 2: wrap each hit in a KVObj.
        return [
            KVObj(group=None,
                  rank=-1,
                  type_name=None,
                  vpat=None,
                  word=token,
                  start_pos=begin,
                  end_pos=stop,
                  ratio=1.0 * len(token) / total)
            for token, begin, stop in found
        ]
예제 #3
0
    def _make_vpats(self, contain_vpats, vpats, dict_pats, long=True):
        '''
        用于模板中字典的获取
        '''
        kvpats, kvdicts, kv_contain = [], [], []

        # 解析包含不包含
        group, rank, type_name = '', '', ''
        for contain_pat in contain_vpats:
            contain_pat = contain_pat.split('=')
            kname = contain_pat[0]
            pats = contain_pat[1].split(',')

            info = kname.split('_')
            group = info[0].replace('$', '')
            rank = int(info[-1].replace('rank', ''))

            type_name = 're_expr'

            for pat in pats:
                norm_pat, wrong_pat = self._match_not_match(pat)
                if norm_pat and wrong_pat:
                    contain_pat = KVObj(group=group, rank=rank, type_name=type_name, vpat=(norm_pat, wrong_pat))
                    kv_contain.append(contain_pat)

        # 解析正则表达式
        vpats_num = len(vpats)
        group, rank, type_name = '', '', ''
        for i, vpat in enumerate(vpats):
            vpat = vpat.split('=')
            kname = vpat[0]
            pats = '='.join(vpat[1:]).split(',')  # yes_rank1

            info = kname.split('_')
            if info[0].replace('$', '') != group:
                if group:
                    kv_pat = KVObj(group=group, rank=rank, type_name=type_name, vpat=inner_pats)
                    kvpats.append(kv_pat)
                group = info[0].replace('$', '')
                rank = int(info[-1].replace('rank', ''))
                type_name = 'regu_expr'
                inner_pats = pats
            else:
                inner_pats.extend(pats)
            if i == vpats_num - 1:
                kv_pat = KVObj(group=group, rank=rank, type_name=type_name, vpat=inner_pats)
                kvpats.append(kv_pat)

        # 解析关键词词典
        group, rank, type_name = '', '', ''
        for vdict in dict_pats:
            vdict = vdict.split('=')
            kname = vdict[0]
            words = vdict[1].replace('[', '').replace(']', '')
            words = words.split(',')

            info = kname.split('_')
            group = info[0].replace('$', '')
            rank = int(info[-1].replace('rank', ''))
            type_name = 'dict'
            kv_dict = KVObj(group=group, rank=rank, type_name=type_name, vpat=words)
            kvdicts.append(kv_dict)
        kvdicts = self._combine_pats(kvdicts)
        return kv_contain, kvpats, kvdicts
예제 #4
0
    def parse(self, line, keywordVersion='', long=True):
        '''
        Return the best match found in ``line``, or None.

        ``self.parsed_pats[keywordVersion]`` is indexed with 0 when ``long``
        is true and 1 otherwise; the selected value is iterated as a sequence
        of rank groups. Each rank group holds, in order: the contain /
        not-contain patterns, the regular-expression patterns and the
        dictionary (trie) patterns. Rank groups are tried in order and the
        first one that yields a regex or dictionary match decides the result:

        * only a regex match or only a dictionary match: that match is returned;
        * both: the one with the larger ``ratio`` (matched length divided by
          line length) is returned, the dictionary match winning ties.

        :param line: text to search.
        :param keywordVersion: key into ``self.parsed_pats``. When empty no
            pattern set can be selected and None is returned.
        :param long: selects index 0 (true) or 1 (false) of the pattern set.
        :return: a match object with group, rank, type, word, offsets and
            ratio, or None when nothing matched.

        NOTE(review): the contain / not-contain step builds ``match_mat`` but
        the value is never returned or otherwise used - confirm whether it
        was meant to take part in the result.
        '''
        if not keywordVersion:
            # Without a version there is nothing to look up.
            return None
        sorted_group = self.parsed_pats[keywordVersion][0 if long else 1]

        def covered_ratio(start, end):
            # Guard against an empty line, which a pattern can still match.
            return 1.0 * (end - start) / len(line) if line else 0.0

        match_mat, re_mat, dict_mat = None, None, None

        for rank_group in sorted_group:
            match_group = rank_group[0]  # contain / not-contain patterns
            re_group = rank_group[1]     # regular-expression patterns
            dict_group = rank_group[2]   # trie (dictionary) patterns

            # Step 1: contain / not-contain. A hit needs the first pattern of
            # the pair to match and the second one not to.
            for mat in match_group:
                for pat in mat.vpat:
                    res_match = pat[0].search(line)
                    res_not_match = pat[1].search(line)
                    if res_match and not res_not_match:
                        start_pos, end_pos = res_match.span()
                        match_mat = KVObj(group=mat.group, rank=mat.rank, type_name=mat.type,
                                          word=res_match.group(),
                                          start_pos=start_pos, end_pos=end_pos,
                                          ratio=covered_ratio(start_pos, end_pos))
                        break
                if match_mat:
                    break

            # Step 2: regular expressions; the first pattern that matches wins.
            for mat in re_group:
                for pat in mat.vpat:
                    compiled_pat, origin_pat = pat[0], pat[1]
                    res_mat = compiled_pat.search(line)
                    if res_mat:
                        start_pos, end_pos = res_mat.span()
                        re_mat = KVObj(group=mat.group, rank=mat.rank, type_name=mat.type,
                                       word=origin_pat,
                                       start_pos=start_pos, end_pos=end_pos,
                                       ratio=covered_ratio(start_pos, end_pos))
                        break
                if re_mat:
                    break

            # Step 3: dictionaries; the first trie that produces a hit wins and
            # the hit is tagged with that pattern's group, rank and type.
            for mat in dict_group:
                dict_mat = mat.vpat.search_longest_mats(line)
                if dict_mat:
                    dict_mat.group = mat.group
                    dict_mat.rank = mat.rank
                    dict_mat.type = mat.type
                    break

            # Decide as soon as this rank group produced anything.
            if re_mat and dict_mat:
                return re_mat if re_mat.ratio > dict_mat.ratio else dict_mat
            if re_mat:
                return re_mat
            if dict_mat:
                return dict_mat
        return None