def _get_page_by_page_title_search(pdfplumber_obj, keywords_pattern=None, verbose=False) -> list:
    '''
    Scan every page for title-like text that matches ``keywords_pattern``.

    Args:
        pdfplumber_obj: object exposing a ``pages`` iterable; each page is
            handed to ``get_title_liked_txt``.
        keywords_pattern: regex passed to ``search_pattern_from_txt``. When
            ``None``, a default is used that requires both "report" and
            "auditor" and rejects text containing "internal".
        verbose: when true, print progress and every match to stdout.

    Returns:
        A list of tuples, one per group returned by ``consecutive_int_list``
        for the de-duplicated matching page indices (indices come from
        ``enumerate``, so they are 0-based).
    '''
    if verbose:
        print('searching by page!')
    if keywords_pattern is None:
        keywords_pattern = r'^(?!.*internal)(?=.*report).*auditor.*$'

    pages = []
    for p, page in enumerate(pdfplumber_obj.pages):
        if verbose:
            print(f'searching p.{p}')
        try:
            title_alike_txts = get_title_liked_txt(page)
        except KeyError:
            # A KeyError from the extractor is treated as "no text on this page".
            logging.warning('Non textual page')
            continue
        for txt in title_alike_txts:
            if search_pattern_from_txt(txt, keywords_pattern):
                # A page may be appended more than once; unique() below
                # collapses the duplicates.
                pages.append(p)
                if verbose:
                    print(f'with pattern: found {txt} on p.{p}!')

    return [tuple(li) for li in consecutive_int_list(unique(pages))]
def search_outline_in_pages(self, pattern, page_range=None, size='fontname', verbose=False, show_matched=False) -> list:
    '''
    Find the pages whose title-like text matches ``pattern``.

    Args:
        pattern: regex handed to ``search_pattern_from_txt``.
        page_range: optional iterable of indices into ``pdf.pages``; when
            falsy, every page of the document is searched.
        size: forwarded to ``get_title_liked_txt`` as its ``size`` keyword.
        verbose: when true, print each match to stdout.
        show_matched: when true, also return the matched texts.

    Returns:
        A list of tuples built from ``consecutive_int_list`` over the
        de-duplicated matching page numbers. When ``show_matched`` is true,
        a ``(pages, matched_texts)`` pair is returned instead.
    '''
    print('search by page!')
    hit_pages = set()
    hit_texts = []

    with _by_pdfplumber(self.pdf_obj) as pdf:
        if page_range:
            candidates = [pdf.pages[idx] for idx in page_range]
        else:
            candidates = pdf.pages

        for page in candidates:
            # page_number minus one; presumably converts a 1-based page
            # number into a 0-based index -- verify against the PDF library.
            page_idx = page.page_number - 1
            try:
                title_texts = get_title_liked_txt(page, size=size)
            except KeyError:
                logging.warning('Non textual page')
                continue

            for text in title_texts:
                if not search_pattern_from_txt(text, pattern):
                    continue
                hit_pages.add(page_idx)
                hit_texts.append(text)
                if verbose:
                    print(f'with pattern: found {text} on p.{page_idx}!')

    grouped = [tuple(group) for group in consecutive_int_list(unique(hit_pages))]
    return (grouped, hit_texts) if show_matched else grouped
def _get_page_by_outline(toc, title_pattern, to_page=True) -> list:
    '''
    Collect the page ranges of every outline entry matching ``title_pattern``.

    Args:
        toc: mapping of outline title -> page range; only the first two
            items of each range are read, as inclusive start and end.
        title_pattern: regex searched case-insensitively in each title.
        to_page: not used by the current implementation.

    Returns:
        A list of tuples built from ``consecutive_int_list`` over the
        de-duplicated page numbers of all matching entries.
    '''
    expanded_ranges = []
    for outline, page_range in toc.items():
        if re.search(title_pattern, outline, flags=re.IGNORECASE):
            first, last = page_range[0], page_range[1]
            # +1 because the stored end page is inclusive
            expanded_ranges.append(list(range(first, last + 1)))

    all_pages = flatten(expanded_ranges)
    return [tuple(group) for group in consecutive_int_list(unique(all_pages))]
def is_unusual(num):
    '''
    Check whether the digit permutations of a prime ``num`` are "unusual".

    Returns True when ``num`` is prime and the distinct 4-digit primes
    obtainable by permuting its digits number at least three and satisfy
    ``contains_arithmetic_sequence(values, 3)``; otherwise returns False.
    '''
    if not is_prime(num):
        return False

    digits = list(str(num))
    permuted_values = (int(''.join(perm)) for perm in permutations(digits))
    # The length test discards permutations whose int form is not 4 digits
    # long (e.g. those that started with '0').
    candidates = [
        value for value in permuted_values
        if is_prime(value) and len(str(value)) == 4
    ]

    candidates = unique(candidates)
    candidates.sort()

    has_enough = len(candidates) >= 3
    return has_enough and bool(contains_arithmetic_sequence(candidates, 3))
def search_outline_in_toc(self, pattern) -> list:
    '''
    Look up ``pattern`` in the table of contents held in ``self.toc``.

    Every outline title is searched case-insensitively; the inclusive
    ``(from_page, to_page)`` range of each matching title is expanded
    into individual page numbers.

    Returns:
        A list of tuples built from ``consecutive_int_list`` over the
        de-duplicated page numbers of all matching entries.
    '''
    print('search by toc!')

    matched_ranges = []
    for title, bounds in self.toc.items():
        if not re.search(pattern, title, flags=re.IGNORECASE):
            continue
        # Unpack only after a match so malformed ranges on non-matching
        # titles are never touched.
        start, stop = bounds
        matched_ranges.append(list(range(start, stop + 1)))

    all_pages = flatten(matched_ranges)
    return [tuple(group) for group in consecutive_int_list(unique(all_pages))]
def prime_factors(n):
    '''
    Return the prime factors of ``n`` with duplicates removed.

    Computed as ``unique`` applied to ``prime_factors_non_unique(n)``;
    the ordering and container type are whatever ``unique`` returns.
    '''
    factors_with_repeats = prime_factors_non_unique(n)
    return unique(factors_with_repeats)
# Look up the rows of table `app` for the store at (slat, slong); when the
# query result equals [], fall back to create_new_store(). Then collect the
# value at index 4 of every row and de-duplicate it.
#
# NOTE(review): the query is assembled by string concatenation from `slat`
# and `slong`. If either value can come from user input this is open to SQL
# injection -- prefer the driver's parameter placeholders. The placeholder
# style depends on the DB-API driver in use, which is not visible here.
com_str = "SELECT * FROM app WHERE store_lat='"+slat+"' AND store_long='"+slong+"';"
cur.execute(com_str)
#cur.execute("SELECT * FROM app")
rows = cur.fetchall()
# NOTE(review): this only triggers when fetchall() returns an empty *list*;
# confirm the driver does not return an empty tuple instead.
if rows==[]:
    rows = create_new_store(cur,name,slat,slong)
area = []
for r in rows:
    # index 4 is presumably the area column -- verify against the table schema
    area.append(r[4])
    print(r)
# unique() is unpacked into two values here; `ind` is presumably the matching
# indices -- confirm against the helper's definition.
area,ind = unique(area)
#print(area)
#print(ind)
con.commit()
cur.close()
con.close()
# <article class="media content-section">
# <div class="media-body">
# <p class="article-content">{{ r }}</p>
# </div>
# </article>
# cur.execute("insert into app (store_id,store_name,area,item,if_there) values ('42.350903-71.114086', 'Target', 'bathroom', 'towel', '0'); \