def item_name_str(url):
    try:
        # urlopen accepts either a string URL or a Request object as its parameter
        web_page = urq.urlopen(url)
        content = bs(web_page, 'html.parser')
        if url.find('amazon.com') != -1:  # may respond with HTTP Error 503: Service Unavailable
            item_name_str_L = content.find(id="productTitle").get_text().strip()
            # probe the price element; if it is missing this raises and is caught below
            content.find(id=["priceblock_ourprice", "priceblock_dealprice",
                             "priceblock_saleprice"]).get_text()
        elif url.find('bhphotovideo.com') != -1:
            item_name_str_L = content.find(class_="title_3bJZzlB3PKkE_8ajs9mroe").get_text().strip()
            content.find(class_=["price_1DPoToKrLP8uWvruGqgtaY"]).get_text()
        elif url.find('bestbuy.com') != -1:  # Operational Time
            item_name_str_L = content.find(class_="heading-5 v-fw-regular").get_text().strip()
            content.find(class_=["priceView-hero-price priceView-customer-price"]).get_text()
        elif url.find('apple.com') != -1:  # may respond with HTTP Error 403: Forbidden
            item_name_str_L = content.find(class_="as-productdecision-header").get_text().strip()
            item_name_str_L = item_name_str_L.replace("Buy", "")
            content.find(class_=["as-price-currentprice", "current_price"]).get_text()
        return item_name_str_L
    except Exception as e:
        err_h.error_handler(e)
        return -1
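
# A minimal usage sketch, assuming `urq` is urllib.request, `bs` is
# bs4.BeautifulSoup, and `err_h` is this project's error-handler module.
# The product URL below is hypothetical; a real product page is required.
if __name__ == '__main__':
    name = item_name_str('https://www.bestbuy.com/site/example-product/0000000.p')
    if name == -1:
        print('lookup failed; see error handler output')
    else:
        print('tracking item:', name)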
def do_it():
    'do it'
    try:
        html = render_html()
        create_html(html)
    except:
        error_handler()
def dates_before_current(indi_list, fam_list):
    line_num = []
    id_list = []
    current = datetime.now()
    for people in indi_list:
        birt = datetime.strptime(people['BIRT'], '%Y-%m-%d')
        try:
            deat = datetime.strptime(people['DEAT'], '%Y-%m-%d')
        except ValueError:
            deat = datetime.now() - timedelta(1)
        if birt >= current or deat >= current:
            line_num.append(error_handler(people['INDI']))
            id_list.append(people['INDI'])
    for families in fam_list:
        marr = datetime.strptime(families['MARR'], '%Y-%m-%d')
        try:
            div = datetime.strptime(families['DIV'], '%Y-%m-%d')
        except ValueError:
            div = datetime.now() - timedelta(1)
        if marr >= current or div >= current:
            line_num.append(error_handler(families['FAM']))
            id_list.append(families['FAM'])
    if line_num:
        print('ERROR: INDIVIDUAL: US01: lines_num:', sorted(set(line_num)),
              ': indi_id:', sorted(set(id_list)),
              ': Dates must be before current date!')
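
# A sketch of the record shape dates_before_current expects, with made-up ids
# and dates; it assumes error_handler maps a record id to a source line number.
# A non-date string such as 'NONE' falls through the ValueError branch above.
sample_indis = [{'INDI': 'I1', 'BIRT': '2999-01-01', 'DEAT': 'NONE'}]
sample_fams = [{'FAM': 'F1', 'MARR': '1990-06-15', 'DIV': 'NONE'}]
dates_before_current(sample_indis, sample_fams)  # flags I1: birth is in the future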
def process_user(user, utas, database, smtpserver):
    'process one user'
    try:
        uta = utas[user['send_index']]
        mail_address = user['mail_address']
        subject = create_subject(uta)
        body = create_body(user['direction'], uta, user['column'], user['row'])
        mail = create_message(body, subject, mail_address)
        if send_mail(mail, mail_address, smtpserver):
            increment_send_index(database, mail_address)
    except:
        error_handler()
def call_error_handler(self, kblang):
    error_handler_input = {}
    error_handler_input['cTopics'] = kblang['cTopics']
    error_handler_input['intent'] = kblang['dialogAct'][-1]
    error_handler_input['concept'] = kblang['concept']
    error_handler_output = error_handler.error_handler(error_handler_input)
    unCertain = False
    for i in error_handler_output:
        if i['inContext'] == 'Uncertain':
            kblang['pTopic'] = i['label']
            dialogAct = kblang['dialogAct']
            dialogAct.append('CheckQuestion')
            kblang['dialogAct'] = dialogAct
            concept = False
            with open('./dict.errorhandler', 'r') as f:
                reader = csv.reader(f)
                for line in reader:
                    if line[0] == kblang['property']:
                        concept = line[1]
                        prop = line[2]
                        break
            if concept:
                kblang['concept'] = concept
                kblang['property'] = prop
            unCertain = True
            break
    return kblang, unCertain
def do_it():
    'do it'
    try:
        database = database_open()
        day, hour = get_day(), int(sys.argv[1])
        users = get_user_by_date(database, day, hour)
        if not users:
            database_close(database)
            return
        utas = get_uta_by_indices(database,
                                  [user['send_index'] for user in users])
        smtpserver = get_smtpserver()
        for user in users:
            process_user(user, utas, database, smtpserver)
        database_close(database)
        smtpserver.close()
    except:
        error_handler()
def correct_gender_role(indi_list, fam_list):
    line_num = []
    id_list = []
    for families in fam_list:
        husb_id = families['HUSB']
        wife_id = families['WIFE']
        for people in indi_list:
            if people['INDI'] == husb_id and people['SEX'] != 'M':
                line_num.append(error_handler(families['HUSB']))
                id_list.append(husb_id)
            if people['INDI'] == wife_id and people['SEX'] != 'F':
                line_num.append(error_handler(families['WIFE']))
                id_list.append(wife_id)
    if line_num:
        print('ERROR: INDIVIDUAL: US21: lines_num:', sorted(set(line_num)),
              ': indi_id:', sorted(set(id_list)),
              ': Gender and role are not correct!')
def process_1(url, user_price, mail_recipient_input):
    save_url_str = url
    save_item_name_str = item_name_str(url)
    if save_item_name_str == -1:
        return -1
    save_user_price_int = user_price_int(user_price)
    if save_user_price_int == -1:
        return -1
    save_recipients_str = mail_recipient_input
    if save_recipients_str.find('@') == -1:
        err_h.error_handler('recipients_err')
        return -1
    conn = sql.connect('tracking_item.db')
    c = conn.cursor()
    try:
        c.execute("""CREATE TABLE tracking_item (
                         url_str text,
                         item_name_str text,
                         user_price_int integer,
                         recipients_str text
                     )""")
    except sql.OperationalError:
        # table already exists
        pass
    c.execute(
        """INSERT INTO tracking_item VALUES
           (:url_str, :item_name_str, :user_price_int, :recipients_str)""",
        {
            'url_str': save_url_str,
            'item_name_str': save_item_name_str,
            'user_price_int': save_user_price_int,
            'recipients_str': save_recipients_str
        })
    conn.commit()
    conn.close()
    return 1
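
# A hypothetical end-to-end call, assuming `sql` is sqlite3 and the scraping
# helpers above are importable; the URL and address are placeholders. It reads
# the row back to confirm the insert landed.
if process_1('https://www.bestbuy.com/site/example/0000000.p', '250',
             'someone@example.com') == 1:
    conn = sql.connect('tracking_item.db')
    for row in conn.execute('SELECT * FROM tracking_item'):
        print(row)
    conn.close()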
def less_than_150(indi_list):
    indi_id = []
    line_num = []
    for people in indi_list:
        if people['AGE'] >= 150:
            line_num.append(error_handler(people['INDI']))
            indi_id.append(people['INDI'])
    if line_num:
        print('ERROR: INDIVIDUAL: US07: lines_num:', sorted(set(line_num)),
              ': indi_id:', sorted(set(indi_id)),
              ': Age must be less than 150 years old')
        return 'BUG'
def marr_before_div(fam_list):
    line_num = []
    fam_id = []
    for families in fam_list:
        if families['DIV'] != 'NONE':
            marr = datetime.strptime(families['MARR'], '%Y-%m-%d')
            div = datetime.strptime(families['DIV'], '%Y-%m-%d')
            if marr >= div:
                line_num.append(error_handler(families['FAM']))
                fam_id.append(families['FAM'])
    if line_num:
        print('ERROR: FAMILY: US04: lines_num:', sorted(set(line_num)),
              ': fam_id:', sorted(set(fam_id)),
              ': Marriage date must be before divorce date!')
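
# A quick check with a made-up family whose divorce precedes the marriage,
# again assuming error_handler returns the record's source line number.
marr_before_div([{'FAM': 'F2', 'MARR': '2005-03-01', 'DIV': '2003-03-01'}])
# prints the US04 error for F2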
def locateChild(self, ctx, segments):
    ctx.remember(fourohfour.fourohfour(), inevow.ICanHandleNotFound)
    ctx.remember(error_handler.error_handler(), inevow.ICanHandleException)
    request = inevow.IRequest(ctx)
    request.setHeader('server', "AZTK - %s" % socket.gethostname())
    re_FEEDS = re.compile(r"^\/[a-z][_a-z0-9]{3,}\/feeds.*$")
    re_AVATAR = re.compile(r"^\/[a-z][_a-z0-9]{3,}\/avatar.*$")
    re_IMAGE = re.compile(
        r"^\/[a-z][_a-z0-9]{3,}\/img\/(?:(?:\d+(?:x\d+)?(?:x\d)?)|(?:original))\/([a-z0-9]{32})(?:-[a-f0-9]{5})?\.jpg$"
    )
    if segments[0] == "css":
        return static_css(), segments[1:]
    elif segments[0] == "image":
        return static_image(), segments[1:]
    elif segments[0] == "download":
        return static_download(), segments[1:]
    elif segments[0] == "js":
        return static_js(), segments[1:]
    elif segments[0] == "RPC2":
        return self.zapi_handler, []
    elif segments[0] == "browser_check":
        return browser_check.browser_check(), segments[1:]
    elif segments[0] in static_files.files.keys():
        return static_files(), segments
    elif re_AVATAR.match(request.uri):
        return main_homepage.main_homepage(), segments
    elif re_FEEDS.match(request.uri):
        return main_homepage.main_homepage(), segments
    elif segments[0] == "qoop":
        return main_homepage.main_homepage(), segments
    else:
        if request.getCookie('browser_checked') or re_IMAGE.match(request.uri):
            # everything passed, show page
            return main_homepage.main_homepage(), segments
        else:
            # no client side browser_checked cookie found
            # set a cookie server side to remember the segments of the original request.
            # redirect them (hit this page again) for segment == browser_check above to handle.
            # request.addCookie("requested_page", '/'.join(segments), None, self.app.servers.httpserver._cfg_site_domain, "/")
            # return redirectTo("/browser_check/", request), []
            return browser_check.browser_check(), segments[1:]
def birt_before_marr(indi_list, fam_list):
    indi_id = []
    line_num = []
    for people in indi_list:
        if people['SPOUSE'] != 'NONE':
            birt = datetime.strptime(people['BIRT'], '%Y-%m-%d')
            for spouses in people['SPOUSE']:
                # strip non-digits from the family pointer (e.g. 'F3' -> 3) to index fam_list
                find_fam_index = int(re.sub(r'\D', '', spouses)) - 1
                marr = datetime.strptime(fam_list[find_fam_index]['MARR'],
                                         '%Y-%m-%d')
                if marr <= birt:
                    line_num.append(error_handler(people['INDI']))
                    indi_id.append(people['INDI'])
    if line_num:
        print('ERROR: INDIVIDUAL: US02: lines_num:', sorted(set(line_num)),
              ': indi_id:', sorted(set(indi_id)),
              ': Birth date must be before marriage date')
        return 'BUG'
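
# A made-up pair of records where the marriage precedes the birth; this assumes
# fam_list is ordered so that family 'F1' sits at index 0, matching the
# digit-stripping index computation above.
birt_before_marr(
    [{'INDI': 'I1', 'BIRT': '1990-05-01', 'SPOUSE': ['F1']}],
    [{'FAM': 'F1', 'MARR': '1980-01-01'}])  # flags I1 under US02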
def user_price_int(user_price):
    try:
        return int(user_price)
    except Exception as e:
        err_h.error_handler(e)
        return -1
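
# Sanity checks, assuming `err_h` is importable: non-numeric input is routed
# to the error handler and signalled with -1.
assert user_price_int('250') == 250
assert user_price_int('not a number') == -1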
def everytime_all_board(URL, end_date, db):
    main_url = URL['url']
    board_search_url = "https://everytime.kr/community/search?keyword="
    board_search_word = ['게시판', '갤러리']
    board_list = []
    # connect the driver and log in
    try:
        driver = chromedriver()
        driver = everytime.login(driver)
    except Exception as e:
        error_handler(e, URL, main_url, db)
        return
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "a.article")))
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    # scrape the dynamic boards listed in the Everytime top menu
    board_group_list = bs.find("div", {"id": "submenu"}).findAll('div', {"class": "group"})
    for board_group in board_group_list:
        try:
            board_li_list = board_group.find("ul").findAll("li")
            for board_li in board_li_list:
                board_li_dic = {}
                board_li_dic['tag'] = board_li.find("a").text
                if board_li.find("a").text.strip() == "더 보기":  # skip the "see more" entry
                    continue
                else:
                    board_li_dic['url'] = main_url + board_li.find("a")['href']
                if board_li_dic['tag'].find("찾기") != -1:  # skip "search" boards
                    continue
                board_list.append(board_li_dic)
        except:
            continue
    # scrape additional dynamic boards through keyword search
    for search_word in board_search_word:
        try:
            board_search_url_done = board_search_url + search_word
            driver.get(board_search_url_done)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "a.result")))
            html = driver.page_source
            bs = BeautifulSoup(html, 'html.parser')
            board_a_list = bs.find("div", {"class": "searchresults"}).findAll('a')
            for board_a in board_a_list:
                board_li_dic = {}
                board_li_dic['tag'] = board_a.find("h3").text
                board_li_dic['url'] = main_url + board_a.get('href')
                board_list.append(board_li_dic)
        except:
            continue
    # loop over the collected dynamic boards
    for board in board_list:
        page = 1
        page_flag = 0
        board_url = board['url']
        page_url = Change_page(board_url, page)  # post-list url for the current page
        print("\nTarget : ", URL['info'], " :: ", board['tag'])
        continue_handler(URL['info'] + " :: " + board['tag'], URL, page_url)
        # page loop
        while True:
            # restart the driver every 50 pages to keep the session fresh
            if page_flag == 50:
                page_flag = 0
                driver.quit()
                time.sleep(3)
                driver = chromedriver()
                driver = everytime.login(driver)
            try:
                print("page_url :::: ", page_url)  # current url
                print("Page : ", page)  # current page number
                post_urls = Parsing_list_url(main_url, page_url, driver, db)
                # retry once to work around Everytime's habit of returning empty lists
                if len(post_urls) == 0:
                    time.sleep(2)
                    post_urls = Parsing_list_url(main_url, page_url, driver, db)
                post_data_prepare = []
                # post loop
                for post_url in post_urls:
                    get_post_data = Parsing_post_data(driver, post_url, URL, board['tag'], db)
                    if get_post_data == "error":
                        break
                    title = get_post_data[1]
                    date = get_post_data[2]
                    print(date, "::::", title)  # date and title of the post just crawled
                    # skip posts older than end_date, keep newer ones
                    if str(date) <= end_date:
                        continue
                    else:
                        post_data_prepare.append(get_post_data[0])
                add_cnt = db_manager(URL, post_data_prepare, db)
                print("add_OK : ", add_cnt)  # number of posts saved to the DB
                # if nothing new was added, stop; otherwise go to the next page
                if add_cnt == 0:
                    page_flag = 0
                    break
                else:
                    page_flag += 1
                    page += 1
                    page_url = Change_page(board_url, page)
            except Exception as e:
                error_handler(e, URL, page_url, db)
                driver.quit()
                time.sleep(3)
                driver = chromedriver()
                driver = everytime.login(driver)
                break
    # release the driver
    driver.quit()
def Crawling(URL, db):
    driver = None
    info_name = URL['info'].split('_')
    crawling_name = info_name[0]  # selects which board crawler module to use
    page = 1
    main_url = URL['url']  # board url; used when changing pages
    page_url = eval(crawling_name + '.Change_page(main_url, page)')  # post-list url for the current page
    end_date = date_cut(URL['info'])  # cutoff date
    if crawling_name in ["sj34"]:  # dynamic-board exception
        sj34.everytime_all_board(URL, end_date, db)
        return
    if crawling_name in ["sj20"]:  # excluded board
        return
    # print the info of the board being crawled
    print("Target : ", URL['info'])
    continue_handler(URL['info'], URL, page_url)
    # decide whether this board should be crawled at all
    if is_crawling(db, URL['info']) == False:
        return
    while True:
        if crawling_name in ["sj23", "sj26", "sj27", "sj28", "sj30", "sj44"]:
            lastly_post = get_lastly_post(URL, db)
        try:
            print("\npage_url :::: ", page_url)  # current url
            print("Page : ", page)  # current page number
            # build driver_page
            if crawling_name in ['sj10']:
                driver_page = URLparser_EUCKR(page_url)
            elif crawling_name in ['sj12']:
                driver_page = URLparser_UTF8(page_url)
            else:
                driver_page = URLparser(page_url)
            # boards crawled with Selenium
            if crawling_name in ["sj23", "sj26", "sj27", "sj28", "sj29", "sj38", "sj44"]:
                data = eval(crawling_name + '.Parsing_list_url(URL, page_url)')
                driver = data[0]
                post_urls = data[1]
            elif crawling_name in ["sj30"]:  # special case for the Sejong Univ. station board
                data = eval(crawling_name + '.Parsing_list_url(URL, page_url, lastly_post, db, driver)')
                driver = data[0]
                post_urls = data[1]
            # boards crawled with Requests
            else:
                if URL['login'] == 1:  # boards that require login
                    post_urls = eval(crawling_name + '.Parsing_list_url(URL, page_url)')
                else:  # boards without login
                    if driver_page is None:  # stop on connect failure
                        error_handler("driver_none", URL, page_url, db)
                        break
                    else:
                        # parser choice
                        if crawling_name in ['sj10']:
                            bs_page = BeautifulSoup(driver_page, 'lxml')
                        else:
                            bs_page = BeautifulSoup(driver_page, 'html.parser')
                        post_urls = eval(crawling_name + '.Parsing_list_url(URL, bs_page)')
            # get_post_data format: [post-info dictionary, title, date]
            # date format is "0000-00-00 00:00:00"
            post_data_prepare = []
            for post_url in post_urls:
                # Selenium boards
                if crawling_name in ['sj29', 'sj30']:  # standard board layout
                    get_post_data = eval(crawling_name + '.Parsing_post_data(driver, post_url, URL)')
                elif crawling_name in ['sj23', 'sj26', 'sj27', 'sj28', 'sj44']:  # non-standard layout
                    data = eval(crawling_name + '.Parsing_post_data(driver, post_url, URL, lastly_post)')
                    post_data_prepare = data[0]
                    lastly_post = data[1]
                    if lastly_post is not None:
                        push_lastly_post(URL, lastly_post, db)
                # Requests boards
                else:
                    # build driver_post
                    if crawling_name in ["sj21", "sj4", "sj5", "sj8", "sj16"]:  # driver_post not needed
                        pass
                    elif crawling_name in ['sj10', 'sj33']:
                        driver_post = URLparser_EUCKR(post_url)
                    elif crawling_name in ['sj12']:
                        driver_post = URLparser_UTF8(post_url)
                    else:
                        driver_post = URLparser(post_url)
                    # Wikipedia-style structure
                    if crawling_name in ['sj21']:
                        get_post_data = eval(crawling_name + '.Parsing_post_data(post_url, URL)')
                    # non-standard board structure
                    elif crawling_name in ["sj4", "sj5", "sj8", "sj16"]:
                        post_data_prepare = eval(crawling_name + '.Parsing_post_data(post_url, URL)')
                        break
                    # standard board structure
                    else:
                        if driver_post is None:  # stop on connect failure
                            error_handler("driver_none", URL, page_url, db)
                            break
                        else:
                            # parser choice
                            if crawling_name in ['sj10']:
                                bs_post = BeautifulSoup(driver_post, 'lxml')
                            elif crawling_name in ['sj12']:
                                bs_post = driver_post
                            else:
                                bs_post = BeautifulSoup(driver_post, 'html.parser')
                            get_post_data = eval(crawling_name + '.Parsing_post_data(bs_post, post_url, URL)')
                # boards whose post_data_prepare is already complete
                if crawling_name in ["sj4", "sj5", "sj8", "sj16", "sj23", "sj26", "sj27", "sj28", "sj44"]:
                    pass
                # boards whose post_data_prepare still needs filling
                else:
                    if get_post_data == None:  # invalid post data
                        continue
                    title = get_post_data[1]
                    date = get_post_data[2]
                    print(date, "::::", title)  # date and title of the post just crawled
                    # skip posts older than end_date, keep newer ones
                    if str(date) <= end_date:
                        continue
                    else:
                        post_data_prepare.append(get_post_data[0])
            add_cnt = db_manager(URL, post_data_prepare, db)
            print("add_OK : ", add_cnt)  # number of posts saved to the DB
            # quit the driver (when Selenium was used)
            if crawling_name in ["sj23", "sj26", "sj27", "sj28", "sj29", "sj30", "sj38", "sj44"]:
                driver.quit()
            # if no posts were added to the DB, stop; otherwise go to the next page
            if add_cnt == 0:
                break
            else:
                page += 1
                page_url = eval(crawling_name + '.Change_page(main_url, page)')
        # Error handler: if crawling fails, log the error and stop crawling this board.
        except Exception as e:
            error_handler(e, URL, page_url, db)
            break
for URL in URLS:  # each element of the list is one URL
    if not (URL['info'] in INFO_LIST):
        print('URL parsing Skip! : ' + str(URL["url"]))
        print('-----------------------------------------------------------------------------------------------------------------\n')
        continue
    try:
        print('URL parsing Start! : ' + str(URL["url"]))
        Crawling(URL, db)
        print('-----------------------------------------------------------------------------------------------------------------\n')
    except Exception as e:
        error_handler(e, URL, URL["url"], db)
        continue

print(":::: Posts in Boards Count ::::")
posts_cnt(db)  # print the post count for every board

print("\n\nCrawling End!\n\n")

# program end time
end_time = datetime.now()
try:
    log_write(start_time, end_time, db, BEFORE_DATA)
except Exception as e:
    error_logging(e, '', '', db)

# refresh the crawler manager
import pickle
import time
import sys
from datetime import date

from Adafruit_BNO055 import BNO055
from Packet import Packet
import HID
import profiler
import error_handler

com = Packet()
profile = profiler.profiler()
enable_profile = True  # set to True to enable profiling features
errors = error_handler.error_handler()  # used to store error messages

###############################################################################
# set up the gamepad
###############################################################################
hid_enable = True
hid = object
try:
    hid = HID.Gamepad(-100, 100)
except:
    error = ("ERROR: Failed to start gamepad interface, continuing anyways. "
             "Program operation will be severely limited. "
             "Check that all tether cables are securely connected and that "
             "the gamepad is plugged in to the tether box.")
    errors.add(error)
def Parsing_post_data(driver, post_url, URL, board_tag, db):
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])
    try:
        driver.get(post_url)
    except:
        try:
            time.sleep(3)
            driver.get(post_url)
        except:
            return "error"
    try:
        # once time.large appears, assume the Ajax load has finished
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "time.large")))
    except:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "time.large")))
        except Exception as e:
            error_handler(e, URL, post_url, db)
            return "error"
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    if bs.find("h2", {"class": "large"}) != None:
        title = bs.find("h2", {"class": "large"}).get_text(" ", strip=True)
    else:
        title = "0"
    author = "0"
    date = bs.find("time").text.strip()
    date = everytime_time(date)
    post = bs.find("p", {'class': "large"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip all whitespace noise from the post body
    if bs.find("figure", {"class": "attach"}) is not None:
        try:
            # take the first image of the post
            img = bs.find("figure", {"class": "attach"}).find("img")['src']
            if 1000 <= len(img):
                img = 5
            else:
                # decide whether img is an internal or an external link
                if img.startswith("http://") or img.startswith("https://"):
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        except:
            img = 5
    else:
        img = 5
    if img != 5:
        if img_size(img):
            pass
        else:
            img = 5
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    board_tag = re.compile('[^ ㄱ-ㅣ가-힣|a-z]+').sub('', board_tag.lower())
    for remove_tag in SJ34_DELETE_TAGS:
        board_tag = board_tag.replace(remove_tag, "")
    tag_done.append(board_tag)
    post_data['img'] = img
    post_data['url'] = post_url
    post_data['info'] = URL['info'].split("_")[1] + "_" + board_tag
    if post_data["title"] == "0":
        post_data["title"] = post_data["post"][:30] + "..."
    return_data.append(post_data)
    return_data.append(post_data['title'])
    return_data.append(date)
    return return_data
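
# The image-link normalization above, isolated as a standalone sketch for
# clarity. `normalize_img_src` is a hypothetical helper, not part of the
# crawler; `domain` stands in for Domain_check(URL['url']).
def normalize_img_src(src, domain):
    if src.startswith("http://") or src.startswith("https://"):
        return src                # already an absolute (external) link
    if src.startswith("//"):
        return "http:" + src      # protocol-relative link
    return domain + src           # site-relative path

assert normalize_img_src("//cdn.example.com/a.jpg", "https://everytime.kr") == "http://cdn.example.com/a.jpg"
assert normalize_img_src("/img/a.jpg", "https://everytime.kr") == "https://everytime.kr/img/a.jpg"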