def content_parse(domain, url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")
    bs0bj = bs0bj.find("div", {"class": "view-wrap"})\
                 .find("article", {"itemprop": "articleBody"})

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("h1", {"itemprop": "headline"})
    db_record.update({"title": obj.get_text().strip()})

    if bs0bj.find("span", {"class": "hidden-xs"}) is not None:
        obj = bs0bj.find("span", {"class": "hidden-xs"})
        if obj.get_text().strip() != "":
            db_record.update({"class": obj.get_text().strip()})

    obj = bs0bj.find("span", {"itemprop": "datePublished"})
    date = obj.attrs["content"].split("KST")[0] + " " + obj.attrs["content"].split("KST")[1]
    db_record.update({"date": date})

    try:
        obj = bs0bj.find("div", {"itemprop": "description"})
        db_record.update({"post": post_wash(str(obj.get_text().strip()))})
    except:
        db_record.update({"post": 1})

    return db_record

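# Hedged usage sketch, not part of the original source: assuming URLparser returns a
# urllib-style response and post_wash normalizes whitespace, content_parse above is
# expected to yield a flat dict for the DB layer. The helper below is an illustrative
# assumption only.
def debug_print_record(domain, url):  # hypothetical debugging helper
    record = content_parse(domain, url)
    # expected keys: "url", "title", optional "class", "date", "post" (1 on parse failure)
    for key in ("url", "title", "class", "date", "post"):
        if key in record:
            print(key, ":", record[key])
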
def content_parse(url):
    html = URLparser(url)
    try:
        bs0bj = BeautifulSoup(html.read(), "lxml")
    except:
        print("connect error")
        html = URLparser(url)  # re-fetch before retrying; the first response was already consumed
        bs0bj = BeautifulSoup(html.read(), "lxml")
    bs0bj = bs0bj.find("div", {"class": "view_content_wrap"})

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("h3", {"class": "title ub-word"}).find("span", {"class": "title_subject"}).get_text().strip()
    db_record.update({"title": obj})

    # writer block is located but not stored in the record
    obj = bs0bj.find("div", {"class": "gall_writer ub-writer"}).find("div", {"class": "fl"})

    obj = bs0bj.find("span", {"class": "gall_date"}).attrs['title']
    obj = obj.strip()
    db_record.update({"date": obj})

    try:
        obj = bs0bj.find("div", {"class": "gallview_contents"}).find("div", {"style": "overflow:hidden;"})
        obj = obj.get_text().strip()
        db_record.update({"post": post_wash(obj)})
    except:
        db_record.update({"post": 1})

    return db_record

def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser").find("td", {"class": "text12graylight"})

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("td", {"class": "title12"}).get_text().strip()
    db_record.update({"title": obj})

    obj = bs0bj.find("td", {"class": "text11darkgray"}).get_text().strip()
    obj = obj.replace(".", "-")
    db_record.update({"date": obj})

    try:
        obj = bs0bj.find("td", {"class": "text12graylight", "align": "left", "valign": "top"}).get_text().strip()
        db_record.update({"post": post_wash(obj)})
    except:
        db_record.update({"post": 1})

    return db_record

def parsing(driver, URL, is_first):
    if is_first == False:
        latest_datetime = db_manage("get_recent", URL['info'])
    recent_date = None
    page = 1

    while True:
        print('this page is\t| ' + URL['info'] + ' |\t' + str(page - 1))
        bs0bj = BeautifulSoup(driver.read(), "html.parser")
        bs0bj = bs0bj.find("ul", {"class": "list-body"})

        # first crawl or renewal crawl
        if is_first == True:
            db_docs = list_parse(bs0bj, URL, page)
        else:
            db_docs = list_parse(bs0bj, URL, page, latest_datetime)

        # update the most recent date
        if page == 1 and len(db_docs) >= 1:
            recent_date = get_recent_date(URL, db_docs)

        if len(db_docs) == 0:
            print("addOK : 0")
            break
        else:
            addok = db_manage("add", URL['info'], db_docs)
            print("addOK : " + str(addok))
            if addok == 0:
                break
            page += 1
            driver = URLparser(URL['url'] + "&page=" + str(page))

    # if the most recent date was updated, write it back to the DB as well
    if recent_date is not None:
        db_manage("renewal_date", URL['info'], recent_date, is_first=is_first)
        recent_date = None

def content_parse(domain, url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("table", {"class": "bbs-view-info"})
    obj2 = obj.find("tr").find("td")
    db_record.update({"title": obj2.get_text().strip()})

    obj2 = obj.find("tr").findNext("tr").find("td")
    db_record.update({"date": obj2.get_text().strip()})

    obj = bs0bj.find("table", {"class": "bbs-view"})
    db_record.update({"post": post_wash(str(obj.get_text().strip()))})

    return db_record

def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("div", {"class": "read_header"}).h1
    db_record.update({"title": obj.get_text().strip()})

    obj = bs0bj.find("p", {"class": "time"}).get_text().strip()
    obj = obj.replace(".", "-")
    db_record.update({"date": obj})

    obj = bs0bj.find("div", {"class": "read_body"}).get_text().strip()
    db_record.update({"post": post_wash(obj)})

    return db_record

def parsing(driver, URL, is_first):
    if is_first == False:
        latest_datetime = db_manage("get_recent", URL['info'])
    recent_date = None
    page = 1

    while True:
        print('this page is\t| ' + URL['info'] + ' |\t' + str(page))
        bs0bj = BeautifulSoup(driver.read(), "html.parser")

        # first crawl: parse without a cut-off date
        if is_first == True:
            db_docs = list_parse(driver, bs0bj, URL, page)
        # renewal mode: use the most recent post info fetched from the DB
        else:
            db_docs = list_parse(driver, bs0bj, URL, page, latest_datetime)

        # if the first page was parsed and returned posts,
        # keep that page's most recent date as a dictionary
        if page == 1 and len(db_docs) >= 1:
            recent_date = get_recent_date(URL, db_docs)

        if len(db_docs) == 0:
            print("addOK : 0")
            break
        else:
            addok = db_manage("add", URL['info'], db_docs)
            print("addOK : " + str(addok))
            if addok == 0:
                break
            page += 1
            driver = URLparser(URL['url'] + "&page=" + str(page))

    # if the most recent date was updated, write it back to the DB as well
    if recent_date is not None:
        db_manage("renewal_date", URL['info'], recent_date, is_first=is_first)
        recent_date = None

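# Hedged usage sketch, not part of the original source: parsing() above is assumed to
# be driven with a URL dict carrying at least 'info' and 'url' keys plus a response
# object from URLparser. The wrapper and the example dict below are illustrative only.
def run_board_once(board, is_first=True):  # hypothetical wrapper
    driver = URLparser(board['url'])       # response for the first list page
    parsing(driver, board, is_first)       # is_first=False enables the latest_datetime cut-off

# run_board_once({'info': 'PK_pknu', 'url': 'http://example.ac.kr/board?mode=list'})
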
def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("td", {"class": "list_loop_left"})
    db_record.update({"title": obj.get_text().strip()})

    obj = obj.findNext("td", {"class": "list_loop_left"}).get_text().strip()
    obj = obj.replace(".", "-").split("(")[1].split(" ")[0]
    db_record.update({"date": obj})

    obj = bs0bj.find("td", {"class": "view_content"}).get_text().strip()
    db_record.update({"post": post_wash(obj)})

    return db_record

def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("td", {"class": "boardSub"})
    db_record.update({"title": obj.get_text().strip()})

    obj = obj.findNext("td").findNext("td").get_text().strip()
    obj = obj.replace(".", "-")
    db_record.update({"date": obj})

    obj = bs0bj.find("td", {"class": "contens"}).get_text().strip()
    db_record.update({"post": post_wash(obj)})

    return db_record

def content_parse(domain, url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find(text="제목")
    db_record.update({"title": obj.findNext('td').get_text().strip()})

    obj = bs0bj.find(text="작성일")
    db_record.update({"date": obj.findNext('td').get_text().strip()})

    try:
        obj = bs0bj.find("div", {'class': "bbs-body"})
        db_record.update({"post": post_wash(str(obj.get_text().strip()))})
    except:
        db_record.update({"post": 1})

    return db_record

def Parsing_post_data(post_url, URL):
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    title = post_url.split("$$")[1]
    post_url = post_url.split("$$")[0]
    driver_post = URLparser(post_url)
    bs = BeautifulSoup(driver_post, 'html.parser')

    title = "세종대백과 :: " + title
    author = "0"
    date = "2019-01-01 12:00:00"
    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))

    post = bs.find("div", {"class": "page group"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip all extra whitespace from the post
    post = post.split("//<![CDATA")[0]

    if bs.find("div", {"class": "page group"}).find("img") is None:
        img = 0
    else:
        try:
            img = bs.find("div", {"class": "page group"}).find("img")['src']  # take the first image in the post
            if 1000 <= len(img):
                img = 0
            else:
                # decide whether img is an internal or external link
                if img.startswith("http://") or img.startswith("https://"):
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        except:
            img = 0
    if img != 0:
        if img_size(img):
            pass
        else:
            img = 0

    tag_done = tag.tagging(URL, title)

    # post_data format: {'title': , 'author': , 'date': , 'post': , 'tag': [], 'img': 1, 'view': 0}
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # tag string of the form "tag1/tag2/tag3/tag4/.../"
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data

def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("td", {"class": "title"})
    db_record.update({"title": obj.get_text().strip()})

    obj = obj.findNext("td").findNext("td")
    db_record.update({"date": obj.get_text().strip()})

    try:
        obj = bs0bj.find("td", {"class": "tdc"}).get_text().strip()
        db_record.update({"post": post_wash(obj)})
    except:
        db_record.update({"post": 1})

    return db_record

def Crawling(target, URL, is_first):
    select = URL['info'].split('_')[1]
    try:
        driver = URLparser(URL['url'])
    except Exception as e:
        print("Connect Error")
        return
    if driver is None:
        return

    if target == 'PK_univ':
        print('-------------------------------------')
        print('Selected <' + URL['info'] + '>')
        print('-------------------------------------')
        if select == 'main':
            PK_main.parsing(driver, URL, is_first)
        elif select == 'ce':
            PK_ce.parsing(driver, URL, is_first)
        elif select == 'pknu':
            PK_pknu.parsing(driver, URL, is_first)
        elif select == 'today':
            PK_today.parsing(driver, URL, is_first)
        elif select == 'pknulec' and is_first == True:
            PK_pknu_lecture.parsing(driver, URL, is_first)
        elif select == 'pknulogin':
            PK_pknulogin.parsing(driver, URL, is_first)
        elif select == 'dorm':
            PK_dorm.parsing(driver, URL, is_first)
        elif select == 'start':
            PK_start.parsing(driver, URL, is_first)
        elif select == 'dcinside':
            PK_dcinside.parsing(driver, URL, is_first)
        elif select == 'coop':
            PK_coop.parsing(driver, URL, is_first)
        elif select == 'sh':
            PK_sh.parsing(driver, URL, is_first)
        elif select in duemlist:
            PK_duem.parsing(driver, URL, is_first)
        elif select in eelist:
            PK_ee.parsing(driver, URL, is_first)
        elif select == 'eelogin':
            PK_eelogin.parsing(driver, URL, is_first)
        elif select in aquacullist:
            PK_aquacul.parsing(driver, URL, is_first)
        elif select in phlist:
            PK_physics.parsing(driver, URL, is_first)
        elif select == 'chem':
            PK_chem.parsing(driver, URL, is_first)
        elif select == 'dba':
            PK_dba.parsing(driver, URL, is_first)
        elif select == 'english':
            PK_english.parsing(driver, URL, is_first)

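# Alternative sketch, not the author's code: the select -> module mapping used by the
# if/elif chain in Crawling() above could also be written as a lookup table. Module
# names mirror the imports the original relies on; list-based selects (duemlist,
# eelist, aquacullist, phlist) and the 'pknulec' is_first condition stay outside the
# table, as in the original.
PARSER_BY_SELECT = {
    'main': PK_main, 'ce': PK_ce, 'pknu': PK_pknu, 'today': PK_today,
    'pknulogin': PK_pknulogin, 'dorm': PK_dorm, 'start': PK_start,
    'dcinside': PK_dcinside, 'coop': PK_coop, 'sh': PK_sh,
    'eelogin': PK_eelogin, 'chem': PK_chem, 'dba': PK_dba, 'english': PK_english,
}

def dispatch_simple_select(select, driver, URL, is_first):  # hypothetical helper
    module = PARSER_BY_SELECT.get(select)
    if module is not None:
        module.parsing(driver, URL, is_first)
        return True
    return False
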
def content_parse(url):
    db_record = {}
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")
    bs0bj = bs0bj.find("div", {"id": "board_view"})
    db_record.update({"url": url})

    obj = bs0bj.find("h3").get_text().strip()
    db_record.update({"title": obj})

    obj = bs0bj.find("p", {"class": "writer"}).find("strong").get_text().strip()
    db_record.update({"date": obj})

    try:
        obj = bs0bj.find("div", {"class": "board_stance"}).get_text().strip()
        db_record.update({"post": post_wash(obj)})
    except:
        db_record.update({"post": 1})

    return db_record

def content_parse(domain, url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("tr", {"class": "head"}).find("td", {"class": "first txt-l"})
    db_record.update({"title": obj.get_text().strip()})

    obj = obj.find_next("td").find_next("td")
    db_record.update({"date": obj.get_text().strip()})

    try:
        obj = bs0bj.find("tr", {"class": "head"}).find_next("tr")
        db_record.update({"post": post_wash(str(obj.get_text().strip()))})
    except:
        db_record.update({"post": 1})

    return db_record

def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("h3", {"class": "title"}).get_text().strip()
    db_record.update({"title": obj})

    obj = bs0bj.find("span", {"class": "date"}).get_text().strip()
    obj = obj.split('.')[0] + "-" + obj.split('.')[1] + "-" + obj.split('.')[2]
    db_record.update({"date": obj})

    try:
        obj = bs0bj.find("div", {"class": "boardReadBody"}).get_text().strip()
        db_record.update({"post": post_wash(obj)})
    except:
        db_record.update({"post": 1})

    return db_record

def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser").find("article", {"id": "bo_v"})

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("h1", {"id": "bo_v_title"}).get_text().strip()
    db_record.update({"title": obj})

    obj = bs0bj.find("section", {"id": "bo_v_info"}).find("strong").find_next("strong")
    obj = "20" + obj.get_text().strip()
    db_record.update({"date": obj})

    try:
        obj = bs0bj.find("div", {"id": "bo_v_con"}).get_text().strip()
        db_record.update({"post": post_wash(obj)})
    except:
        db_record.update({"post": 1})

    return db_record

def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")

    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("span", {"class": "view_subj_core"})
    obj = obj.get_text().strip()
    db_record.update({"title": obj})

    obj = bs0bj.find("span", {"class": "view_subj_date"})
    obj = obj.get_text().strip()
    db_record.update({"date": obj})

    try:
        obj = bs0bj.find("div", {"class": "view_txt_container"})
        obj = obj.get_text().strip()
        db_record.update({"post": post_wash(str(obj))})
    except:
        db_record.update({"post": 1})

    return db_record

def Parsing_post_data(post_url, URL):
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    title = post_url.split("$$")[1]
    post_url = post_url.split("$$")[0]
    driver_post = URLparser(post_url)
    bs = BeautifulSoup(driver_post, 'html.parser')

    title = "세종대백과 :: " + title
    author = "0"
    date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    post = bs.find("div", {"class": "page group"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip all extra whitespace from the post
    post = post.split("//<![CDATA")[0]

    if bs.find("div", {"class": "page group"}).find("img") is None:
        img = 0
    else:
        try:
            img = bs.find("div", {"class": "page group"}).find("img")['src']  # take the first image in the post
            if 1000 <= len(img):
                img = 0
            else:
                # decide whether img is an internal or external link
                if img.startswith("http://") or img.startswith("https://"):
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        except:
            img = 0
    if img != 0:
        if img_size(img):
            pass
        else:
            img = 0

    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data

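# Hedged refactoring sketch, not part of the original source: the image handling
# repeated in the Parsing_post_data functions above (and in the news parsers below)
# could be factored into one helper. img_size is the project helper the original code
# already calls; 'missing' is whichever sentinel the caller uses (0, 1 or 7).
def normalize_img(src, domain, missing):
    if src is None or 1000 <= len(src):
        return missing
    if src.startswith("http://") or src.startswith("https://"):
        img = src              # already an absolute (external) link
    elif src.startswith("//"):
        img = "http:" + src    # protocol-relative link
    else:
        img = domain + src     # site-relative link
    return img if img_size(img) else missing
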
def content_parse(domain, url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")

    db_record = {}
    db_record.update({"url": url})

    bs0bj = bs0bj.find("table", {"class": "board_view"})

    obj = bs0bj.find("thead").get_text().strip()
    db_record.update({"title": obj})

    obj = bs0bj.find("tbody").find("tr").find("td").find_next("td").find_next("td")
    obj = obj.get_text().strip().split(" ")[2]
    db_record.update({"date": obj})

    try:
        obj = bs0bj.find("tbody").find("td", {"class": "tdc"})
        obj = obj.get_text().strip()
        db_record.update({"post": post_wash(str(obj))})
    except:
        db_record.update({"post": 1})

    return db_record

def parsing(driver, URL, is_first):
    target = URL['info'].split('_')[1]
    global start_datetime
    start_datetime = startdate_dict[target]

    if is_first == False:
        latest_datetime = db_manage("get_recent", URL['info'])
    recent_date = None
    page = 1
    print("start_date:" + start_datetime)

    while True:
        print('this page is\t| ' + URL['info'] + ' |\t' + str(page))
        bs0bj = BeautifulSoup(driver.read(), "html.parser")
        bs0bj = bs0bj.find("div", {"id": "board_box"}).find("ul", {"id": "board_list"})

        # first crawl: parse without a cut-off date
        if is_first == True:
            db_docs = list_parse(bs0bj, URL, page)
        # renewal mode: use the most recent post info fetched from the DB
        else:
            db_docs = list_parse(bs0bj, URL, page, latest_datetime)

        # if the first page was parsed and returned posts,
        # keep that page's most recent date as a dictionary
        if page == 1 and len(db_docs) >= 1:
            recent_date = get_recent_date(URL, db_docs)

        if len(db_docs) == 0:
            print("addOK : 0")
            break
        else:
            addok = db_manage("add", URL['info'], db_docs)
            print("addOK : " + str(addok))
            if addok == 0:
                break
            page += 1
            driver = URLparser(URL['url'] + "&pageIndex=" + str(page))

    # if the most recent date was updated, write it back to the DB as well
    if recent_date is not None:
        db_manage("renewal_date", URL['info'], recent_date, is_first=is_first)
        recent_date = None

def Parsing_post_data(bs, URL):
    post_data_prepare = []
    end_date = date_cut(URL['info'])
    posts = bs.findAll("div", {"class": "item article"})

    for post in posts:
        post_infoes = post.findAll("a")  # bundle of cells for one post
        post_data = {}
        try:
            title = post_infoes[0].get_text(" ", strip=True)
            author = post.find("strong").text.strip()
            if author.find("관리자") != -1:
                author = "0"
            date = post.find("span", {"class": "date"})
            date = str(date).split(">")[1]
            date = str(date).split("<")[0]
            date = date + " 00:00:00"
        except:
            title = post_infoes[0].get_text(" ", strip=True)
            try:
                author = post.find("strong").text.strip()
            except:
                author = "0"
            if author.find("관리자") != -1:
                author = "0"
            date = post.find("span", {"class": "date"})
            date = str(date).split(">")[1]
            date = str(date).split("<")[0]
            date = date + " 00:00:00"
        try:
            date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
        except:
            date = datetime.datetime.now().strftime("%Y-%m-%d")
            date = date + " 00:00:00"

        try:
            phrase = post_infoes[1].get_text(" ", strip=True)
        except:
            phrase = "0"
        phrase = post_wash(phrase)

        url = post.find("a")["href"]
        # visit the news url to fetch its image
        domain = Domain_check(url)  # domain of the news url
        driver_page = URLparser(url)
        bs_page = BeautifulSoup(driver_page, 'html.parser')
        try:
            img = bs_page.find("head").find("meta", {"property": "og:image"})['content']
        except:
            try:
                if bs_page.find("body").find("img") is None:
                    img = 1
                else:
                    img = bs_page.find("body").find("img")['src']
                    if 1000 <= len(img):
                        img = 1
                    else:
                        # decide whether img is an internal or external link
                        if img.startswith("http://") or img.startswith("https://"):
                            pass
                        elif img.startswith("//"):
                            img = "http:" + img
                        else:
                            img = domain + img
            except:
                img = 1
        if img != 1:
            if img_size(img):
                pass
            else:
                img = 1

        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = date
        post_data['post'] = phrase.lower()
        post_data['img'] = img
        post_data['url'] = url
        print(date, "::::", title)

        # if the post is older than end_date, skip it; otherwise keep it
        if str(date) <= end_date:
            continue
        else:
            post_data_prepare.append(post_data)

    return post_data_prepare

def Parsing_post_data(bs, post_url, URL):
    List = []
    posts = bs.find("ul", {"class": "type01"}).find_all("li")
    for post in posts:
        if post.has_attr('id') == True:
            List.append(post)

    return_data = []
    for item in List:
        post_data = {}
        title = item.find("dt").get_text(" ", strip=True)
        author = ''
        date = ''
        domain = ''

        date_parsed = item.find("dd", {"class": "txt_inline"}).get_text(" ", strip=True).split(" ")
        list_size = len(date_parsed)
        if list_size <= 4:  # ['출처','n시간','전','보내기'], ['출처','2000-00-00','보내기']
            date = change_date_form(date_parsed[1])
        elif list_size == 5:  # ['출처','n시간','전','네이버뉴스','보내기'], ['출처','위치','n시간','전','보내기']
            if '네이버뉴스' in date_parsed:
                date = change_date_form(date_parsed[1])
            else:
                date = change_date_form(date_parsed[2])
        elif list_size == 6:  # ['출처','위치','시간','전','네이버뉴스','보내기']
            date = change_date_form(date_parsed[2])
        else:  # ['출처','언론사','선정','n시간','전','보내기']
            date = change_date_form(date_parsed[3])
        print("::::", date)

        post = item.find("dd", {"class": "txt_inline"}).find_next('dd').get_text(" ", strip=True)
        post = post_wash(post)  # strip all extra whitespace from the post

        list_url = item.find("dt").find("a")['href']
        driver_post = URLparser(list_url)
        bs_post = BeautifulSoup(driver_post, 'html.parser')

        if bs_post.find("meta", {"property": "og:image"}) is None:
            img = 7
        else:
            try:
                img = bs_post.find("meta", {"property": "og:image"}).get("content")  # take the post's first image
                if 1000 <= len(img):
                    img = 7
                else:
                    # decide whether img is an internal or external link
                    if img.startswith("http://") or img.startswith("https://"):
                        pass
                    elif img.startswith("//"):
                        img = "http:" + img
                    else:
                        img = domain + img
            except:
                img = 7
        if img != 7:
            if img_size(img):
                pass
            else:
                img = 7

        post_data['title'] = title.upper()
        post_data['author'] = author
        post_data['date'] = date
        post_data['post'] = post.lower()
        post_data['img'] = img
        post_data['url'] = list_url
        print(date, "::::", title)  # print the date and title of the post just crawled

        return_data.append(post_data)

    return return_data

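# Hedged sketch, not part of the original source: in the Naver "txt_inline" cell parsed
# above, the position of the date token depends only on the token count, so the branch
# logic can be condensed into one helper. change_date_form is the project helper the
# original code already calls; the token-count mapping mirrors the branches above.
def pick_date_token(date_parsed):
    n = len(date_parsed)
    if n <= 4:
        idx = 1
    elif n == 5:
        idx = 1 if '네이버뉴스' in date_parsed else 2
    elif n == 6:
        idx = 2
    else:
        idx = 3
    return change_date_form(date_parsed[idx])
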
def Crawling(URL, db):
    driver = None
    info_name = URL['info'].split('_')
    crawling_name = info_name[0]                                      # which board crawler to use
    page = 1
    main_url = URL['url']                                             # board url, used when changing pages
    page_url = eval(crawling_name + '.Change_page(main_url, page)')   # url of the current list page
    end_date = date_cut(URL['info'])                                  # cut-off date

    if crawling_name in ["sj34"]:    # dynamic board, handled separately
        sj34.everytime_all_board(URL, end_date, db)
        return
    if crawling_name in ["sj20"]:    # excluded board
        return

    # print the info of the board being crawled
    print("Target : ", URL['info'])
    continue_handler(URL['info'], URL, page_url)

    # decide whether this board should be crawled at all
    if is_crawling(db, URL['info']) == False:
        return

    while True:
        if crawling_name in ["sj23", "sj26", "sj27", "sj28", "sj30", "sj44"]:
            lastly_post = get_lastly_post(URL, db)
        try:
            print("\npage_url :::: ", page_url)   # current url
            print("Page : ", page)                # current page number

            # build driver_page ----------------------------------------------------
            if crawling_name in ['sj10']:
                driver_page = URLparser_EUCKR(page_url)
            elif crawling_name in ['sj12']:
                driver_page = URLparser_UTF8(page_url)
            else:
                driver_page = URLparser(page_url)

            # boards crawled with Selenium -------------------------------------------
            if crawling_name in ["sj23", "sj26", "sj27", "sj28", "sj29", "sj38", "sj44"]:
                data = eval(crawling_name + '.Parsing_list_url(URL, page_url)')
                driver = data[0]
                post_urls = data[1]
            elif crawling_name in ["sj30"]:   # Sejongdae Station: special case
                data = eval(crawling_name + '.Parsing_list_url(URL, page_url, lastly_post, db, driver)')
                driver = data[0]
                post_urls = data[1]
            # boards crawled with Requests -------------------------------------------
            else:
                # boards that require a login
                if URL['login'] == 1:
                    post_urls = eval(crawling_name + '.Parsing_list_url(URL, page_url)')
                # boards that do not require a login
                else:
                    if driver_page is None:   # connect failed: break
                        error_handler("driver_none", URL, page_url, db)
                        break
                    else:
                        # choose the parser
                        if crawling_name in ['sj10']:
                            bs_page = BeautifulSoup(driver_page, 'lxml')
                        else:
                            bs_page = BeautifulSoup(driver_page, 'html.parser')
                        post_urls = eval(crawling_name + '.Parsing_list_url(URL, bs_page)')

            # get_post_data format: [post info dictionary, title, date]
            # date format is "0000-00-00 00:00:00"
            post_data_prepare = []
            for post_url in post_urls:
                # Selenium boards ------------------------------------------------------
                if crawling_name in ['sj29', 'sj30']:   # standard board layout
                    get_post_data = eval(crawling_name + '.Parsing_post_data(driver, post_url, URL)')
                elif crawling_name in ['sj23', 'sj26', 'sj27', 'sj28', 'sj44']:   # non-standard layout
                    data = eval(crawling_name + '.Parsing_post_data(driver, post_url, URL, lastly_post)')
                    post_data_prepare = data[0]
                    lastly_post = data[1]
                    if lastly_post is None:
                        pass
                    else:
                        push_lastly_post(URL, lastly_post, db)
                # Requests boards ------------------------------------------------------
                else:
                    # build driver_post
                    if crawling_name in ["sj21", "sj4", "sj5", "sj8", "sj16"]:   # driver_post not needed
                        pass
                    elif crawling_name in ['sj10', 'sj33']:
                        driver_post = URLparser_EUCKR(post_url)
                    elif crawling_name in ['sj12']:
                        driver_post = URLparser_UTF8(post_url)
                    else:
                        driver_post = URLparser(post_url)

                    # Wikipedia-style structure
                    if crawling_name in ['sj21']:
                        get_post_data = eval(crawling_name + '.Parsing_post_data(post_url, URL)')
                    # non-standard board structure
                    elif crawling_name in ["sj4", "sj5", "sj8", "sj16"]:
                        post_data_prepare = eval(crawling_name + '.Parsing_post_data(post_url, URL)')
                        break
                    # standard board structure
                    else:
                        if driver_post is None:   # connect failed: break
                            error_handler("driver_none", URL, page_url, db)
                            break
                        else:
                            # choose the parser
                            if crawling_name in ['sj10']:
                                bs_post = BeautifulSoup(driver_post, 'lxml')
                            elif crawling_name in ['sj12']:
                                bs_post = driver_post
                            else:
                                bs_post = BeautifulSoup(driver_post, 'html.parser')
                            get_post_data = eval(crawling_name + '.Parsing_post_data(bs_post, post_url, URL)')

                # post_data_prepare is already complete for these boards
                if crawling_name in ["sj4", "sj5", "sj8", "sj16", "sj23", "sj26", "sj27", "sj28", "sj44"]:
                    pass
                # otherwise build it post by post
                else:
                    if get_post_data is None:   # invalid post data
                        continue
                    title = get_post_data[1]
                    date = get_post_data[2]
                    print(date, "::::", title)   # date and title of the post just crawled
                    # if the post is older than end_date, skip it; otherwise keep it
                    if str(date) <= end_date:
                        continue
                    else:
                        post_data_prepare.append(get_post_data[0])

            add_cnt = db_manager(URL, post_data_prepare, db)
            print("add_OK : ", add_cnt)   # number of posts stored in the DB

            # quit the driver if Selenium was used
            if crawling_name in ["sj23", "sj26", "sj27", "sj28", "sj29", "sj30", "sj38", "sj44"]:
                driver.quit()

            # if no posts were added to the DB, stop; otherwise move to the next page
            if add_cnt == 0:
                break
            else:
                page += 1
                page_url = eval(crawling_name + '.Change_page(main_url, page)')

        # Error handler: if crawling fails, log the error and stop crawling this board.
        except Exception as e:
            error_handler(e, URL, page_url, db)
            break

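# Hedged alternative sketch, not the author's code: Crawling() above resolves the
# per-board module by building code strings for eval(); the same lookup can be done
# with importlib/getattr, avoiding string evaluation. It assumes the sjNN parser
# modules are importable by name, as the original imports imply.
import importlib

def call_board(crawling_name, func_name, *args):  # hypothetical helper
    module = importlib.import_module(crawling_name)   # e.g. the "sj10" parser module
    return getattr(module, func_name)(*args)

# e.g. page_url = call_board(crawling_name, "Change_page", main_url, page)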