def __enter__(self):
    '''Needed in with-as statements

    :return: This instance
    '''
    if not os.path.exists(self.directory):
        try:
            logging.debug('Directory %s does not exist, creating one' % self.directory)
            os.makedirs(self.directory)
        except IOError as err:
            logging.error('Could not create directory %s; reason: %s' % (self.directory, err))
            raise ValueError
    try:
        logging.debug('Binding the socket to %s:%d' % self.addr_port)
        self.socket.bind(self.addr_port)
    except socket.error as err:
        logging.error('Can\'t bind the socket to %s:%d; reason: %s' % (self.addr_port + (err, )))
        raise ValueError
    logging.debug('Socket bound to %s:%d' % self.socket.getsockname())
    self.db = db_manager(os.path.join(self.directory, 'db.json'))
    return self
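# Hedged usage sketch: the __enter__ above is what makes its owning class usable in a
# "with ... as" block. The Datastore name, constructor signature, and __exit__ body
# below are illustrative assumptions, not part of the original code; they only show
# where that __enter__ would sit and how a caller would use it.
import socket

class Datastore:
    def __init__(self, directory, addr_port):
        self.directory = directory
        self.addr_port = addr_port
        self.socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

    # ... the __enter__ shown above would go here ...

    def __exit__(self, exc_type, exc_value, traceback):
        # undo what __enter__ set up: release the UDP socket
        self.socket.close()
        return False  # do not suppress exceptions raised inside the with-block

# Typical call site (directory and port are placeholders):
# with Datastore('/tmp/store', ('127.0.0.1', 5000)) as store:
#     ...  # store.db is the db_manager backed by /tmp/store/db.json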
def __init__(_self):
    """hook up database, framer, and bind UDP"""
    _self.db = db_manager()
    settings = settings_manager()
    _self.my_uuid = settings.get("uuid")
    _self.my_last_seq = _self.db.get_seq_from_uuid(_self.my_uuid)
    _self.handlers = {
        dbframe.framer.typeDbUpsert: _self.handle_frame_upsert,
        dbframe.framer.typeDbDelete: _self.handle_frame_delete,
        dbframe.framer.typeNetHello: _self.handle_frame_hello,
        dbframe.framer.typeNetReqClientList: _self.handle_frame_req_client_list,
        dbframe.framer.typeNetClientList: _self.handle_frame_client_list,
        dbframe.framer.typeNetReqClientUpdates: _self.handle_frame_req_client_updates,
    }
    _self.framer = dbframe.framer()
    # Broadcast to everybody.
    # It should be possible to compute the local broadcast address programmatically,
    # but most routers seem to drop those packets, so this should work fine.
    _self.UDP_IP = "255.255.255.255"
    _self.UDP_PORT = 32250
    _self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    _self.sock.bind((_self.UDP_IP, _self.UDP_PORT))
    if hasattr(socket, 'SO_BROADCAST'):
        # add broadcast abilities
        _self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
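# Hedged companion sketch for the broadcast setup above: sending a datagram to
# 255.255.255.255 only succeeds after SO_BROADCAST has been enabled, which is why
# __init__ sets that option. The payload below is a placeholder; the real frames
# come from dbframe.framer, whose wire format is not shown here.
import socket

def broadcast_datagram(payload, port=32250):
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    # required before sendto() may target the limited-broadcast address
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
    try:
        sock.sendto(payload, ("255.255.255.255", port))
    finally:
        sock.close()

# broadcast_datagram(b"hello")  # every peer bound to UDP 32250 on the LAN receives this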
def setup_db_manager(self):
    """
    Set up the db manager based on a dictionary of options supplied by get_configuration
    """
    db = db_manager()
    if self.config.db_file:
        db.init_sb_db(self.config.db_file)
    else:
        db.init_sb_db('spiderbro.db')
    if self.config.mysql:
        db.xbmc_init_mysql(self.config.host, self.config.user, self.config.pwd, self.config.schema)
    else:
        db.xbmc_init_sqlite(self.config.xbmc_sqlite_db)
    if self.config.clear_cache and self.config.show:
        self.logger.info("Clearing db cache for show %s" % (self.config.show))
        db.clear_cache(self.config.show)
    if self.config.high_quality and self.config.show:
        self.logger.info("Changing quality to high for show %s" % (self.config.show))
        db.set_quality(self.config.show, 1)
    if self.config.low_quality and self.config.show:
        self.logger.info("Changing quality to low for show %s" % (self.config.show))
        db.set_quality(self.config.show, 0)
    if self.config.force_id and self.config.show:
        self.logger.info("Forcing new id %s for show %s" % (self.config.force_id, self.config.show))
        db.update_series_id(self.config.show, self.config.force_id)
    return db
def __init__(self, parent, title):
    super(Example, self).__init__(parent, title=title, size=(790, 350))
    self.db = db_manager()
    self.filter = self.db.filter()
    self.InitUI()
    self.Centre()
    self.Show()
def db_connect(self, subreddit):
    path, img_path = get_settings(subreddit)
    self.db = db_manager(path)
    self.db.table = subreddit
    self.conn = self.db.create_connect()
    self.db.count_row(self.conn)
    return img_path
def __init__(self, parent, title):
    super(Example, self).__init__(parent, title=title, size=(790, 200))
    self.db = db_manager()
    s = settings_manager()
    self.uuid = s.get("uuid")
    self.InitUI()
    self.Centre()
    self.Show()
def __init__(self, parent, title):
    super(Example, self).__init__(parent, title=title, size=(790, 300))
    self.db = db_manager()
    settings = settings_manager()
    self.uuid = settings.get("uuid")
    self.InitUI()
    self.Centre()
    self.Show()
def setup_db_manager(self):
    """
    Set up the db manager based on a dictionary of options supplied by get_configuration
    """
    db = db_manager()
    if self.config.db_file:
        db.init_sb_db(self.config.db_file)
    else:
        db.init_sb_db('spiderbro.db')
    if self.config.mysql:
        db.xbmc_init_mysql(self.config.host, self.config.user, self.config.pwd, self.config.schema)
    else:
        db.xbmc_init_sqlite(self.config.xbmc_sqlite_db)
    if self.config.clear_cache and self.config.show:
        self.logger.info("Clearing db cache for show %s" % (self.config.show))
        db.clear_cache(self.config.show)
    if self.config.high_quality and self.config.show:
        self.logger.info("Changing quality to high for show %s" % (self.config.show))
        db.set_quality(self.config.show, 1)
    if self.config.low_quality and self.config.show:
        self.logger.info("Changing quality to low for show %s" % (self.config.show))
        db.set_quality(self.config.show, 0)
    if self.config.force_id and self.config.show:
        self.logger.info("Forcing new id %s for show %s" % (self.config.force_id, self.config.show))
        db.update_series_id(self.config.show, self.config.force_id)
    return db
    'https://script.googleusercontent.com/macros/echo?user_content_key=m6tk5dBDTsx5CUYHpHzKjS-GG4sZTTqmlYpLs174ECmpZi8gyB70GF_GmKO6AT0ASwbHiJOhTH8Q47_CNmtGHt5jWKjJF5H7OJmA1Yb3SEsKFZqtv3DaNYcMrmhZHmUMWojr9NvTBuBLhyHCd5hHa3djn4kcaeuceAVPTcb_IKOQizEdNNom8Sk6pZs7_CDNMUWiDq7n5DCWZiujjnbT-IrTlwrp2DSWLgANXR6ofjDf5WHNTOrgsudrZcxZzA24N6hwApyDVcMAPWBcdI_E2UMraeaYJFFKNiV4qZl1oW8CU6cvb-wj4e0BQ2CJeOqTEMNlqVLIbS8G8d9eiTjLOMGD6mYBOprp&lib=MkTrT-GFjciLZr9a0QLCCFly6XnJGsUf7': 'ecology_twitter'
    #'https://news.yandex.ru/incident.rss':'accidents'
    #'https://news.google.com/news/feeds?q=%D0%B4%D0%BE%D0%BD%D0%BE%D1%80%D1%81%D0%BA%D0%B0%D1%8F+%D0%BA%D1%80%D0%BE%D0%B2%D1%8C&output=rss':'blood'
    #'https://news.google.com/news/feeds?q=%D0%BF%D1%80%D0%BE%D1%82%D0%B5%D1%81%D1%82%D1%8B+%D0%B4%D0%B0%D0%BB%D1%8C%D0%BD%D0%BE%D0%B1%D0%BE%D0%B9%D1%89%D0%B8%D0%BA%D0%B8&output=rss':'antiplaton',
    #'https://news.google.com/news/feeds?q=%D0%BF%D0%BB%D0%B0%D1%82%D0%BE%D0%BD+%D0%B4%D0%B0%D0%BB%D1%8C%D0%BD%D0%BE%D0%B1%D0%BE%D0%B9%D1%89%D0%B8%D0%BA%D0%B8&output=rss':'antiplaton',
    #'https://news.google.com/news/feeds?q=%D0%B0%D0%BD%D1%82%D0%B8%D0%BF%D0%BB%D0%B0%D1%82%D0%BE%D0%BD&output=rss':'antiplaton',
    #'https://script.googleusercontent.com/macros/echo?user_content_key=TZsyA5Fq2q6HGLuDWq23ixHb8GfsSZj1_JsqVpggKTsVY7Cuzu-12LGxQfuqCj3KFI2QQt-XcV8Zm62l0aqJS1a1OoCyjxEXm5_BxDlH2jW0nuo2oDemN9CCS2h10ox_1xSncGQajx_ryfhECjZEnNrJjcn5tMnJozsIHLgb4WDmzEc3V7pY_qbmxUTSbtzxx0nSp4K8Rg-TeO0TF_iSt4Tt-flz1lUkWh6nmMXDfIYlXmOAssj6TcGD6mYBOprp&lib=Msp86L4y290o5xM5axajKI1y6XnJGsUf7':'antiplaton_tw'
    #'https://news.google.com/news/feeds?q=%D0%B2%D1%80%D0%B5%D0%BC%D0%B5%D0%BD%D0%BD%D1%8B%D0%B5%20%D0%BB%D0%B0%D0%B3%D0%B5%D1%80%D1%8F%20%D0%B1%D0%B5%D0%B6%D0%B5%D0%BD%D1%86%D0%B5%D0%B2&output=rss':'ukraine'
    #'http://blogs.yandex.ru/search.rss?text=%D1%83%D0%BA%D1%80%D0%B0%D0%B8%D0%BD%D1%81%D0%BA%D0%B8%D0%B5+%D0%B1%D0%B5%D0%B6%D0%B5%D0%BD%D1%86%D1%8B':'ukraine'
    #,'http://news.google.com/news?gl=us&pz=1&ned=us&hl=en&q=%D0%B7%D0%B5%D0%BC%D0%BB%D0%B5%D1%82%D1%80%D1%8F%D1%81%D0%B5%D0%BD%D0%B8%D0%B5&output=rss':'earthquakes'
    #,'http://news.google.com/news?hl=en&gl=us&q=%D0%BD%D0%B0%D0%B2%D0%BE%D0%B4%D0%BD%D0%B5%D0%BD%D0%B8%D1%8F&um=1&ie=UTF-8&output=rss':'floods'
})

#try:
db = db_manager()
db.connect()
print "connect"

for url in news_sources.keys():
    try:
        fires_lenta = feedparser.parse(url)
        for entry in fires_lenta.entries:
            str_date = time.strftime('%d/%m/%Y %H:%M:%S', entry.updated_parsed)
            print str_date, entry.guid
            if hasattr(entry, 'georss_point'):
                coords = entry.georss_point.split(' ')
                entry.geo_long = coords[0]
                entry.geo_lat = coords[1]
#!/usr/bin/python
# -*- coding: utf-8 -*-

from db_manager import db_manager
from settings_manager import settings_manager
from dbframe import framer
from localtimeutil import local8601

db = db_manager()
s = settings_manager()
uuid = s.get("uuid")

db.insert_local_contact(uuid, local8601(), "KD0LIX", "KC5YTI", "80m", "testmode", "1A", "KS")
def Crawling(URL, db):
    driver = None
    info_name = URL['info'].split('_')
    crawling_name = info_name[0]  # select which board crawler to use
    page = 1
    main_url = URL['url']  # board url, used when switching pages
    page_url = eval(crawling_name + '.Change_page(main_url, page)')  # post-list url for the current page
    end_date = date_cut(URL['info'])  # extract end_date

    if crawling_name in ["sj34"]:  # dynamic boards are handled separately
        sj34.everytime_all_board(URL, end_date, db)
        return
    if crawling_name in ["sj20"]:  # excluded boards
        return

    # print the info of the board currently being crawled
    print("Target : ", URL['info'])
    continue_handler(URL['info'], URL, page_url)

    # decide whether this board should be crawled at all
    if is_crawling(db, URL['info']) == False:
        return

    while True:
        if crawling_name in ["sj23", "sj26", "sj27", "sj28", "sj30", "sj44"]:
            lastly_post = get_lastly_post(URL, db)
        try:
            print("\npage_url :::: ", page_url)  # current url
            print("Page : ", page)               # current page number

            # build driver_page ---------------------------
            if crawling_name in ['sj10']:
                driver_page = URLparser_EUCKR(page_url)
            elif crawling_name in ['sj12']:
                driver_page = URLparser_UTF8(page_url)
            else:
                driver_page = URLparser(page_url)
            # ---------------------------------------------

            # boards crawled with Selenium -----------------------------------------------------------
            if crawling_name in ["sj23", "sj26", "sj27", "sj28", "sj29", "sj38", "sj44"]:
                data = eval(crawling_name + '.Parsing_list_url(URL, page_url)')
                driver = data[0]
                post_urls = data[1]
            elif crawling_name in ["sj30"]:  # special case: Sejong Station board
                data = eval(crawling_name + '.Parsing_list_url(URL, page_url, lastly_post, db, driver)')
                driver = data[0]
                post_urls = data[1]
            # boards crawled with Requests ------------------------------------------------------------
            else:
                # boards that require a login
                if URL['login'] == 1:
                    post_urls = eval(crawling_name + '.Parsing_list_url(URL, page_url)')
                # boards that do not require a login
                else:
                    if driver_page is None:  # break on Connect Failed
                        error_handler("driver_none", URL, page_url, db)
                        break
                    else:
                        # choose the parser
                        if crawling_name in ['sj10']:
                            bs_page = BeautifulSoup(driver_page, 'lxml')
                        else:
                            bs_page = BeautifulSoup(driver_page, 'html.parser')
                        post_urls = eval(crawling_name + '.Parsing_list_url(URL, bs_page)')
            # -----------------------------------------------------------------------------------------

            # get_post_data format: [post-info dictionary, title, date]
            # date format is "0000-00-00 00:00:00"
            post_data_prepare = []
            for post_url in post_urls:
                # Selenium boards ----------------------------------------------------------------------
                if crawling_name in ['sj29', 'sj30']:  # boards with the standard layout
                    get_post_data = eval(crawling_name + '.Parsing_post_data(driver, post_url, URL)')
                elif crawling_name in ['sj23', 'sj26', 'sj27', 'sj28', 'sj44']:  # non-standard layout
                    data = eval(crawling_name + '.Parsing_post_data(driver, post_url, URL, lastly_post)')
                    post_data_prepare = data[0]
                    lastly_post = data[1]
                    if lastly_post is None:
                        pass
                    else:
                        push_lastly_post(URL, lastly_post, db)
                # Requests boards ----------------------------------------------------------------------
                else:
                    # build driver_post
                    if crawling_name in ["sj21", "sj4", "sj5", "sj8", "sj16"]:  # driver_post not needed
                        pass
                    elif crawling_name in ['sj10', 'sj33']:
                        driver_post = URLparser_EUCKR(post_url)
                    elif crawling_name in ['sj12']:
                        driver_post = URLparser_UTF8(post_url)
                    else:
                        driver_post = URLparser(post_url)

                    # Wikipedia-style structure
                    if crawling_name in ['sj21']:
                        get_post_data = eval(crawling_name + '.Parsing_post_data(post_url, URL)')
                    # non-standard board structure
                    elif crawling_name in ["sj4", "sj5", "sj8", "sj16"]:
                        post_data_prepare = eval(crawling_name + '.Parsing_post_data(post_url, URL)')
                        break
                    # standard board structure
                    else:
                        if driver_post is None:  # break on Connect Failed
                            error_handler("driver_none", URL, page_url, db)
                            break
                        else:
                            # choose the parser
                            if crawling_name in ['sj10']:
                                bs_post = BeautifulSoup(driver_post, 'lxml')
                            elif crawling_name in ['sj12']:
                                bs_post = driver_post
                            else:
                                bs_post = BeautifulSoup(driver_post, 'html.parser')
                            get_post_data = eval(crawling_name + '.Parsing_post_data(bs_post, post_url, URL)')

                # post_data_prepare is already complete for these boards
                if crawling_name in ["sj4", "sj5", "sj8", "sj16", "sj23", "sj26", "sj27", "sj28", "sj44"]:
                    pass
                # post_data_prepare is not complete yet
                else:
                    if get_post_data == None:  # skip malformed post data
                        continue
                    title = get_post_data[1]
                    date = get_post_data[2]
                    print(date, "::::", title)  # date and title of the post just crawled
                    # skip posts older than end_date, append newer ones
                    if str(date) <= end_date:
                        continue
                    else:
                        post_data_prepare.append(get_post_data[0])

            add_cnt = db_manager(URL, post_data_prepare, db)
            print("add_OK : ", add_cnt)  # number of posts stored in the DB

            # quit the driver if Selenium was used
            if crawling_name in ["sj23", "sj26", "sj27", "sj28", "sj29", "sj30", "sj38", "sj44"]:
                driver.quit()

            # break when no new posts were added, otherwise move to the next page
            if add_cnt == 0:
                break
            else:
                page += 1
                page_url = eval(crawling_name + '.Change_page(main_url, page)')

        # Error handler: if crawling fails, log the error and stop crawling
        except Exception as e:
            error_handler(e, URL, page_url, db)
            break
def handle_frame_delete(_self, frame):
    print "delete"
    db = db_manager()
    _self.db.insert_frames([frame])
def __init__(self):
    self.parser = get_rest_args_parser()
    self.dbmanager = db_manager()
    return
def everytime_all_board(URL, end_date, db):
    main_url = URL['url']
    board_search_url = "https://everytime.kr/community/search?keyword="
    board_search_word = ['게시판', '갤러리']  # search keywords: "board", "gallery"
    board_list = []

    # connect the driver
    try:
        driver = chromedriver()
        driver = everytime.login(driver)
    except Exception as e:
        error_handler(e, URL, main_url, db)
        return

    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "a.article")))
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')

    # scrape the dynamic boards from the Everytime top menu ====================================================
    board_group_list = bs.find("div", {"id": "submenu"}).findAll('div', {"class": "group"})
    for board_group in board_group_list:
        try:
            board_li_list = board_group.find("ul").findAll("li")
            for board_li in board_li_list:
                board_li_dic = {}
                board_li_dic['tag'] = board_li.find("a").text
                if board_li.find("a").text.strip() == "더 보기":  # skip the "See more" entry
                    continue
                else:
                    board_li_dic['url'] = main_url + board_li.find("a")['href']
                if (board_li_dic['tag'].find("찾기") != -1):  # skip "find" entries
                    continue
                board_list.append(board_li_dic)
        except:
            continue

    # scrape additional dynamic boards via keyword search
    for search_word in board_search_word:
        try:
            board_search_url_done = board_search_url + search_word
            driver.get(board_search_url_done)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "a.result")))
            html = driver.page_source
            bs = BeautifulSoup(html, 'html.parser')
            board_a_list = bs.find("div", {"class": "searchresults"}).findAll('a')
            for board_a in board_a_list:
                board_li_dic = {}
                board_li_dic['tag'] = board_a.find("h3").text
                board_li_dic['url'] = main_url + board_a.get('href')
                board_list.append(board_li_dic)
        except:
            continue
    # ===========================================================================================================

    # loop over the dynamic boards
    for board in board_list:
        page = 1
        page_flag = 0
        board_url = board['url']
        page_url = Change_page(board_url, page)  # post-list url for the current page
        print("\nTarget : ", URL['info'], " :: ", board['tag'])
        continue_handler(URL['info'] + " :: " + board['tag'], URL, page_url)

        # page loop
        while True:
            if page_flag == 50:
                page_flag = 0
                driver.quit()
                time.sleep(3)
                driver = chromedriver()
                driver = everytime.login(driver)
            try:
                print("page_url :::: ", page_url)  # current url
                print("Page : ", page)             # current page number
                post_urls = Parsing_list_url(main_url, page_url, driver, db)
                # retry once to work around Everytime's chronic flakiness
                if len(post_urls) == 0:
                    time.sleep(2)
                    post_urls = Parsing_list_url(main_url, page_url, driver, db)

                post_data_prepare = []
                # post loop
                for post_url in post_urls:
                    get_post_data = Parsing_post_data(driver, post_url, URL, board['tag'], db)
                    if get_post_data == "error":
                        break
                    title = get_post_data[1]
                    date = get_post_data[2]
                    print(date, "::::", title)  # date and title of the post just crawled
                    # skip posts older than end_date, append newer ones
                    if str(date) <= end_date:
                        continue
                    else:
                        post_data_prepare.append(get_post_data[0])

                add_cnt = db_manager(URL, post_data_prepare, db)
                print("add_OK : ", add_cnt)  # number of posts stored in the DB

                # break when no new posts were added, otherwise move to the next page
                if add_cnt == 0:
                    page_flag = 0
                    break
                else:
                    page_flag += 1
                    page += 1
                    page_url = Change_page(board_url, page)
            except Exception as e:
                error_handler(e, URL, page_url, db)
                driver.quit()
                time.sleep(3)
                driver = chromedriver()
                driver = everytime.login(driver)
                break

    # release the driver
    driver.quit()
def load_db_params(self):
    dbm = db_manager.db_manager()
    pprint(dir(dbm))
    dbm.setup_db(db_name='swapmeetdb',
                 db_user='******',
                 db_pass='******',
                 db_host='mysql.haoliu.net')
    return dbm