Exemplo n.º 1
0
    def search_outline_in_pages(self, pattern, page_range=None, size='fontname', verbose=False, show_matched=False) -> list:
        '''
        Scan title-like text page by page and group the pages that match.

        pattern      -- handed to search_pattern_from_txt for each title-like text
        page_range   -- optional iterable of indices into pdf.pages; when falsy
                        (None, empty) every page of the document is scanned
        size         -- forwarded to get_title_liked_txt
        verbose      -- print every hit as it is found
        show_matched -- also return the texts that matched

        Returns a list of tuples, one per group that consecutive_int_list
        builds from the matching page indices (page.page_number - 1).
        With show_matched the result is (that list, matched texts) instead.
        '''
        print('search by page!')
        hit_indices = set()
        hit_texts = []
        with _by_pdfplumber(self.pdf_obj) as pdf:
            # Resolve the requested indices to page objects, or take them all.
            selected = [pdf.pages[i] for i in page_range] if page_range else pdf.pages

            for page in selected:
                index = page.page_number - 1

                try:
                    candidates = get_title_liked_txt(page, size=size)
                except KeyError:
                    # Pages for which title extraction raises KeyError are skipped.
                    logging.warning('Non textual page')
                    continue

                for candidate in candidates:
                    if not search_pattern_from_txt(candidate, pattern):
                        continue
                    hit_indices.add(index)
                    hit_texts.append(candidate)
                    if verbose:
                        print(f'with pattern: found {candidate} on p.{index}!')

            grouped = [tuple(run) for run in consecutive_int_list(unique(hit_indices))]
            return (grouped, hit_texts) if show_matched else grouped
Exemplo n.º 2
0
def _get_page_by_page_title_search(pdfplumber_obj, keywords_pattern=None, verbose=False) -> list:
    '''
    Look through the title-like text of every page for keywords_pattern.

    pdfplumber_obj   -- object exposing a `pages` sequence
    keywords_pattern -- pattern handed to search_pattern_from_txt; when None a
                        default is used that requires "auditor" and "report"
                        and rejects "internal"
    verbose          -- print progress and every hit

    Returns a list of tuples, one per group that consecutive_int_list builds
    from the enumerate() indices of the matching pages.
    '''
    if verbose:
        print('searching by page!')

    pattern = (
        r'^(?!.*internal)(?=.*report).*auditor.*$'
        if keywords_pattern is None
        else keywords_pattern
    )

    hits = []
    for index, page in enumerate(pdfplumber_obj.pages):
        if verbose:
            print(f'searching p.{index}')
        try:
            candidates = get_title_liked_txt(page)
        except KeyError:
            # Pages for which title extraction raises KeyError are skipped.
            logging.warning('Non textual page')
            continue
        for candidate in candidates:
            if not search_pattern_from_txt(candidate, pattern):
                continue
            hits.append(index)
            if verbose:
                # NOTE(review): the message has no space before "on" - kept as-is.
                print(f'with pattern: found {candidate}on p.{index}!')

    # A page can be appended more than once; unique() collapses the repeats.
    return [tuple(run) for run in consecutive_int_list(unique(hits))]
Exemplo n.º 3
0
def _get_page_by_outline(toc, title_pattern, to_page=True) -> list:
    '''
    Collect the pages of every outline entry whose title matches title_pattern.

    toc           -- mapping of outline title -> page range; items [0] and [1]
                     of the range are used as inclusive start and end
    title_pattern -- regular expression, searched case-insensitively
    to_page       -- currently unused

    Returns a list of tuples, one per group that consecutive_int_list builds
    from the de-duplicated page numbers.
    '''
    expanded = []
    for heading, bounds in toc.items():
        if re.search(title_pattern, heading, flags=re.IGNORECASE):
            # Expand the inclusive (start, end) pair into every page it covers.
            expanded.append(list(range(bounds[0], bounds[1] + 1)))

    page_numbers = flatten(expanded)
    return [tuple(run) for run in consecutive_int_list(unique(page_numbers))]
Exemplo n.º 4
0
 def search_outline_in_toc(self, pattern) -> list:
     '''
     Collect the pages of every self.toc entry whose title matches pattern.

     pattern -- regular expression, searched case-insensitively against each
                outline title

     Each toc value is unpacked as an inclusive (from_page, to_page) pair.
     Returns a list of tuples, one per group that consecutive_int_list builds
     from the de-duplicated page numbers.
     '''
     print('search by toc!')
     expanded = []

     for heading, bounds in self.toc.items():
         if not re.search(pattern, heading, flags=re.IGNORECASE):
             continue
         first, last = bounds
         expanded.append(list(range(first, last + 1)))

     page_numbers = flatten(expanded)
     return [tuple(run) for run in consecutive_int_list(unique(page_numbers))]
Exemplo n.º 5
0
    def search_outline(self, regex: Pattern, scope=None) -> list:
        '''
        Find the longest group of pages whose feature text matches regex.

        regex -- pattern matched case-insensitively against the `text` column
                 of each page's df_feature_text
        scope -- optional iterable of pages to restrict the search to; when
                 falsy, every page of self.pb_pdf is searched

        Returns a one-element list holding an Outline that spans the lowest to
        the highest page number of the longest group returned by
        consecutive_int_list (presumably runs of consecutive page numbers -
        confirm against that helper), or an empty list when nothing matches.
        '''
        pdf = self.pb_pdf
        candidates = scope or pdf.pages

        matched_page_nums = set()

        for raw_page in candidates:
            page = Page.create(raw_page)
            if not page or page.df_feature_text.empty:
                continue
            self.logger.debug(f'searching page {page.page_number}...')
            # na=False: for missing text str.contains otherwise yields NaN (truthy,
            # so the page would count as a match) or pd.NA (raises when coerced
            # to bool). Missing entries are treated as "no match" instead.
            hits = page.df_feature_text.text.str.contains(regex, flags=re.IGNORECASE, na=False)
            if hits.any():
                matched_page_nums.add(page.page_number)

        # Keep only the longest group; on ties max() returns the first one.
        longest_run = max(consecutive_int_list(sorted(matched_page_nums)), key=len, default=None)

        if longest_run is None:
            return []

        page_range = min(longest_run), max(longest_run)
        # Use a separate name so the `scope` argument is not overwritten.
        scope_label = 'Local' if scope else 'Global'
        return [Outline(f'{scope_label} search pattern: {regex}', page_range, self.pb_pdf)]