def Parsing_post_data(driver, post_url, URL):
    """Fetch a post page with Selenium and parse it into [post_data, title, date].

    The whole parse is attempted once; on any failure the page is fetched and
    parsed a second time. NOTE(review): the retry path extracts title/post with
    .text.strip() instead of get_text(" ", strip=True), so whitespace in the
    result can differ between the two paths — confirm that is intentional.
    """
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])
    try:
        driver.get(post_url)
        # No reliable element to wait on, so sleep 0.5s; raise toward ~3s on a
        # slow network/machine. (translated from Korean)
        time.sleep( 0.5 )
        html = driver.page_source
        bs = BeautifulSoup(html, 'html.parser')
        title = bs.find("li", { "class": "vi_subject vi_title" }).get_text(" ", strip=True)
        author = bs.find("span", {"id": "regname"}).text.strip()
        date = bs.find("span", {"id": "regdate"}).text.strip()
        date = date + " 00:00:00"  # page shows a date only; pad a midnight time
        date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
        post = bs.find("li", {"id": "contents"}).get_text(" ", strip=True)
        post = post_wash(post)  # strip all redundant whitespace from post
        tag_done = tag.tagging(URL, title)
        img = 1  # this board is parsed without thumbnails; 1 is the default flag
    except:
        # Retry once from scratch with the same selectors.
        driver.get(post_url)
        time.sleep( 0.5 )
        html = driver.page_source
        bs = BeautifulSoup(html, 'html.parser')
        title = bs.find("li", {"class": "vi_subject vi_title"}).text.strip()
        author = bs.find("span", {"id": "regname"}).text.strip()
        date = bs.find("span", {"id": "regdate"}).text.strip()
        date = date + " 00:00:00"
        date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
        post = bs.find("li", {"id": "contents"}).text.strip()
        post = post_wash(post)
        tag_done = tag.tagging(URL, title)
        img = 1
    # post_data = {'title':, 'author':, 'date':, 'post':, 'tag':[], 'fav_cnt':0, 'view':0} shape
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # tag string shaped like "tag1/tag2/tag3/.../"
    post_data['img'] = img
    post_data['url'] = post_url
    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
def Parsing_post_data(bs, post_url, URL):
    """Parse a post page into [post_data, title, date]."""
    cells = bs.findAll("span", {"class": "boardTd"})
    title = cells[0].get_text(" ", strip=True)
    author = cells[1].text.strip()
    if "관리자" in author:  # mask administrator authors
        author = "0"
    raw_date = cells[2].text.strip()
    date = str(datetime.datetime.strptime(raw_date, "%Y/%m/%d %H:%M:%S"))
    body = bs.find("div", {"class": "xed"}).get_text(" ", strip=True)
    body = post_wash(body)  # strip all redundant whitespace
    post_data = {
        'title': title.upper(),
        'author': author.upper(),
        'date': date,
        'post': body.lower(),
        'tag': tag.tagging(URL, title),  # "tag1/tag2/.../" style tag string
        'img': 1,  # Sejong-related post, so a fixed image flag of 1
        'url': post_url,
    }
    return [post_data, title, date]
def Parsing_post_data(bs, post_url, URL):
    """Parse an Indeed job page into [post_data, title, date]."""
    domain = Domain_check(URL['url'])  # kept for parity with sibling parsers
    print(post_url)
    title = bs.find("h3", {"class": "jobsearch-JobInfoHeader-title"}).get_text(" ", strip=True)
    try:
        author = bs.find("div", {"class": 'icl-u-lg-mr--sm icl-u-xs-mr--xs'}).text.strip()
    except:
        author = "Indeed"  # no company block — attribute to the site itself
    # `now` is a module-level timestamp string; round-trip it through strptime.
    date = str(datetime.datetime.strptime(now, "%Y-%m-%d %H:%M:%S"))
    body = bs.find("div", {"id": "jobDescriptionText"}).get_text(" ", strip=True)
    body = post_wash(body)
    post_data = {
        'title': title.upper(),
        'author': author.upper(),
        'date': date,
        'post': body.lower(),
        'tag': tag.tagging(URL, title),  # "tag1/tag2/.../" style tag string
        'img': 1,
        'url': post_url,
    }
    return [post_data, title, date]
def list_parse(bs0bj, URL, page, latest_datetime=None):
    """Walk every anchor on the list page and collect parsed records.

    First-run mode (latest_datetime is None) keeps posts newer than the
    configured start date; renewal mode keeps posts newer than the last
    crawled one and with a different title.
    """
    start_datetime = startdate_dict[URL['info'].split('_')[1]]
    parts = URL['url'].split('/')
    domain = parts[0] + '//' + parts[2] + '/' + parts[3] + '/' + parts[4] + '/'
    db_docs = []
    for anchor in bs0bj.findAll("a"):
        try:
            href = anchor.attrs['href']
        except Exception:
            # An anchor without href ends the listing.
            return db_docs
        record = {}
        record.update(content_parse(domain + href))
        record.update(tagging(URL, record['title']))
        print(record['date'])
        if latest_datetime is None:
            if record['date'] >= start_datetime:
                db_docs.append(record)
        elif record['date'] >= latest_datetime['recent_date'] and \
                record['title'] != latest_datetime['title']:
            db_docs.append(record)
    return db_docs
def Parsing_post_data(bs, post_url, URL):
    """Parse a post page into [post_data, title, date]."""
    title = bs.find("header", {"class": "header b-b bg-light h2"}).find("span").get_text(" ", strip = True)
    author = bs.find("div", {"class": "col-xs-10 lbb"}).text.strip()
    if "관리자" in author:  # mask administrator authors
        author = "0"
    raw_date = bs.find("span", {"name": "Edate"}).text + " 00:00:00"
    date = str(datetime.datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S"))
    body = bs.find("section", {"class": "wrapper-lg"}).get_text(" ", strip = True)
    body = post_wash(body)  # strip all redundant whitespace
    post_data = {
        'title': title.upper(),
        'author': author.upper(),
        'date': date,
        'post': body.lower(),
        'tag': tag.tagging(URL, title),  # "tag1/tag2/.../" style tag string
        'img': 1,
        'url': post_url,
    }
    return [post_data, title, date]
def Parsing_post_data(bs, post_url, URL):
    """Parse a post page into [post_data, title, date], resolving a thumbnail URL."""
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])
    title = bs.find("div", { "class": "top_area ngeb" }).find("a").get_text(" ", strip=True)
    if bs.find("div", {"class": "btm_area clear"}).find("a") is None:
        author = "0"
    else:
        author = bs.find("div", { "class": "btm_area clear" }).find("a").text.strip()
    if author.find("관리자") != -1:
        author = "0"  # mask administrator authors
    date = bs.find("div", {"class": "top_area ngeb"}).find("span").text.strip()
    date = date + ":00"  # page shows "Y.m.d H:M"; append seconds
    date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
    post = bs.find("div", {"class": "rd_body clear"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip all redundant whitespace from post
    if bs.find("div", {"class": "rd_body clear"}).find("img") is None:
        img = 1  # 1 is this parser's "no image" value
    else:
        try:
            # Take the post's first image. (translated from Korean)
            img = bs.find("div", { "class": "rd_body clear" }).find("img")['src']
            if 1000 <= len(img):
                img = 1  # absurdly long src (e.g. inline data) — treat as no image
            else:
                # Decide whether src is an absolute or site-relative link. (translated)
                if img.startswith("http://") or img.startswith(
                        "https://"):
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        except:
            img = 1
    if img != 1:
        # img_size() presumably validates the image is fetchable/big enough —
        # TODO confirm; failing images are dropped back to 1.
        if img_size(img):
            pass
        else:
            img = 1
    tag_done = tag.tagging(URL, title)
    # post_data = {'title':, 'author':, 'date':, 'post':, 'tag':[], img:1, 'view':0} shape
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # tag string shaped like "tag1/tag2/tag3/.../"
    post_data['img'] = img
    post_data['url'] = post_url
    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
def list_parse(bs0bj, URL, page):
    """Parse every list row into a lightweight record (no content fetch)."""
    today = get_today()
    db_docs = []
    domain = URL['url'].split('/')[0] + '//' + URL['url'].split('/')[2]
    for post in bs0bj.findAll("li"):
        subject = post.find("div", {"class": "wr-subject"})
        # Title = text of the second <span> plus the anchor text.
        title = subject.find("span").find_next("span").get_text().strip()
        for span in subject('span'):
            span.extract()  # remove all spans so the anchor text is clean
        title += " " + subject.find("a").get_text().strip()
        record = {
            "url": subject.find("a").attrs["href"],
            "title": title,
            "post": 0,
            "date": today,
        }
        record.update(tagging(URL, record['title']))
        print(record['title'])
        db_docs.append(record)
    return db_docs
def Parsing_post_data(bs, post_url, URL):
    """Best-effort parse of a post page; returns [post_data, title, date] or None."""
    try:
        time.sleep(2)  # throttle to avoid overloading the server
        domain = Domain_check(URL['url'])
        header = bs.find("div", {"class": "sumTit"}).find("h3")
        author = header.find("span").text.strip()
        # The h3 text contains the author span; cut it out of the title.
        title = header.get_text(" ", strip = True).replace(author, "").strip()
        if "관리자" in author:  # mask administrator authors
            author = "0"
        raw_date = bs.find("dl", {"class": "date"}).findAll("dd")[1].find("span").text.strip() + " 00:00:00"
        date = str(datetime.datetime.strptime(raw_date, "%Y.%m.%d %H:%M:%S"))
        body = bs.find("div", {"class": "tbRow clear"}).get_text(" ", strip = True)
        body = post_wash(body)  # strip all redundant whitespace
        post_data = {
            'title': title.upper(),
            'author': author.upper(),
            'date': date,
            'post': body.lower(),
            'tag': tag.tagging(URL, title),  # "tag1/tag2/.../" style tag string
            'img': 1,
            'url': post_url,
        }
        return [post_data, title, date]
    except:
        # Any parsing failure skips this post.
        return None
def list_parse(driver, bs0bj, URL, page, latest_datetime=None):
    """Collect list rows whose uid is hidden in the anchor's onclick handler."""
    start_datetime = startdate_dict[URL['info'].split('_')[1]]
    parts = URL['url'].split('/')
    domain = parts[0] + '//' + parts[2] + '/' + parts[3] + '/' + parts[4] + '?mode=view&uid='
    db_docs = []
    for cell in bs0bj.findAll("td", {"class": "list_loop_left"}):
        uid = cell.find("a").attrs['onclick'].split("'")[1]
        record = {}
        record.update(content_parse(domain + uid))
        record.update(tagging(URL, record['title']))
        print(record['date'])
        if latest_datetime is None:
            # First run: keep posts newer than the configured start date.
            if record['date'] >= start_datetime:
                db_docs.append(record)
        elif record['date'] >= latest_datetime['recent_date'] and \
                record['title'] != latest_datetime['title']:
            # Renewal run: keep posts newer than the last crawl.
            db_docs.append(record)
    return db_docs
def Parsing_post_data(bs, post_url, URL):
    """Parse a post page into [post_data, title, date]; dates use `now` (module global)."""
    domain = Domain_check(URL['url'])  # kept for parity with sibling parsers
    title = bs.find("div", {"class": "view_subject"}).find("h5").get_text(" ", strip=True)
    author = bs.find("ul", {"class": "data"}).find("li").text.strip()
    date = str(datetime.datetime.strptime(now, "%Y-%m-%d %H:%M:%S"))
    body = bs.find("div", {"class": "view_contents"}).get_text(" ", strip=True)
    body = post_wash(body)  # strip all redundant whitespace
    post_data = {
        'title': title.upper(),
        'author': author.upper(),
        'date': date,
        'post': body.lower(),
        'tag': tag.tagging(URL, title),  # "tag1/tag2/.../" style tag string
        'img': 1,
        'url': post_url,
    }
    return [post_data, title, date]
def list_parse(bs0bj, URL, page, latest_datetime=None):
    """Parse fixed-height table rows into records; no per-post fetch."""
    start_datetime = startdate_dict[URL['info'].split('_')[1]]
    parts = URL['url'].split('/')
    domain = parts[0] + '//' + parts[2] + '/' + parts[3] + '/' + parts[4] + '/'
    db_docs = []
    for row in bs0bj.findAll("td", {"height": "29"}):
        link = row.find("a", {"class": "text12graylightlink"})
        record = {
            "url": domain + link.attrs['href'],
            "title": link.get_text().strip(),
        }
        # Dates appear as "Y.m.d"; normalize separators to dashes.
        record["date"] = row.find("td", {"width": "70"}).get_text().strip().replace(".", "-")
        record["post"] = 0
        record.update(tagging(URL, record['title']))
        print(record['date'])
        if latest_datetime is None:
            if record['date'] >= start_datetime:
                db_docs.append(record)
        elif record['date'] >= latest_datetime['recent_date'] and \
                record['title'] != latest_datetime['title']:
            db_docs.append(record)
    return db_docs
def list_parse(bs0bj, URL, page, lastet_datetime=None):
    """Parse list rows until a previously-seen title is reached.

    NOTE: the parameter name "lastet_datetime" is a typo for "latest_datetime"
    but is part of the public interface, so it is kept unchanged.
    """
    today = get_today()
    domain = URL['url'].split('/')[0] + '//' + URL['url'].split('/')[2]
    db_docs = []
    for post in bs0bj.findAll("li"):
        subject = post.find("div", {"class": "wr-subject"})
        title = " " + subject.find("a").get_text().strip()
        if title.split(" ")[1] == '[알림]':
            continue  # skip notice rows
        print(title)
        record = {
            "url": subject.find("a").attrs["href"],
            "title": title,
            "post": 0,
            "date": today,
        }
        record.update(tagging(URL, record['title']))
        print(record['title'])
        if lastet_datetime is None:
            # First run: take everything.
            db_docs.append(record)
        elif record['title'] != lastet_datetime['title']:
            # Renewal run: take rows until the last-seen title appears.
            db_docs.append(record)
        else:
            break
    return db_docs
def list_parse(driver, bs0bj, URL, page, latest_datetime = None):
    """Collect rows from the two "text" tables; URLs live in onclick handlers."""
    start_datetime = startdate_dict[URL['info'].split('_')[1]]
    tables = bs0bj.findAll("table", {"class": "text"})
    rows = tables[0].findAll("tr") + tables[1].findAll("tr")
    parts = URL['url'].split('/')
    domain = parts[0] + '//' + parts[2] + '/' + parts[3] + '/'
    db_docs = []
    for row in rows:
        try:
            url = domain + row.attrs['onclick'].split("'")[1]
        except:
            continue  # header/filler rows carry no onclick
        record = {}
        record.update(content_parse(url))
        record.update(tagging(URL, record['title']))
        print(record['date'])
        if latest_datetime is None:
            if record['date'] >= start_datetime:
                db_docs.append(record)
        elif record['date'] >= latest_datetime['recent_date'] and \
                record['title'] != latest_datetime['title']:
            db_docs.append(record)
    return db_docs
def list_parse(driver, bs0bj, URL, page, latest_datetime=None):
    """Collect every linked table row; filter by start/renewal conditions."""
    start_datetime = startdate_dict[URL['info'].split('_')[1]]
    db_docs = []
    for row in bs0bj.findAll("tr"):
        try:
            url = row.find("a").attrs['href']
        except:
            continue  # rows without a link are skipped
        record = {}
        record.update(content_parse(url))
        record.update(tagging(URL, record['title']))
        print(record['date'])
        if latest_datetime is None:
            if record['date'] >= start_datetime:
                db_docs.append(record)
        elif record['date'] >= latest_datetime['recent_date'] and \
                record['title'] != latest_datetime['title']:
            db_docs.append(record)
    return db_docs
def Parsing_post_data(bs, post_url, URL):
    """Parse a table-layout post page into [post_data, title, date]."""
    domain = Domain_check(URL['url'])  # kept for parity with sibling parsers
    cells = bs.find("tbody").find("tr").findAll("td")
    title = cells[1].get_text(" ", strip=True)
    author = "0"  # this board exposes no author
    raw_date = cells[3].text.strip() + " 00:00:00"  # date only; pad midnight
    date = str(datetime.datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S"))
    body = bs.find("td", {"class": "board_content"}).get_text(" ", strip=True)
    body = post_wash(body)  # strip all redundant whitespace
    post_data = {
        'title': title.upper(),
        'author': author.upper(),
        'date': date,
        'post': body.lower(),
        'tag': tag.tagging(URL, title),  # "tag1/tag2/.../" style tag string
        'img': 1,
        'url': post_url,
    }
    return [post_data, title, date]
def Parsing_post_data(driver, post_url, URL):
    """Fetch a post with Selenium (waiting on the protect table) and parse it.

    Returns [post_data, title, date]; sleeps 2s before returning as a throttle.
    """
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])
    driver.get(post_url)
    # qna boards render the content table with a class, other boards with an id.
    if URL['info'].split("_")[2] == 'qna':
        WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.protectTable")))
    else:
        WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.CSS_SELECTOR, "table#protectTable")))
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    title = bs.find("div", {"class": "subject"}).find("span", {"class": "b"}).text.strip()
    author = bs.find("div", {"class": "article_writer"}).find("a").text.strip()
    date = bs.find("div", {"class": "article_writer"}).find("span", {"class": "p11 ls0"}).text.strip()
    date = date + ":00"  # page shows "Y.m.d. H:M"; append seconds
    # NOTE(review): sibling parsers call datetime.datetime.strptime; this one
    # calls datetime.strptime — verify this module does
    # `from datetime import datetime`, otherwise this line raises AttributeError.
    date = str(datetime.strptime(date, "%Y.%m.%d. %H:%M:%S"))
    post = bs.find("div", {"id": "user_contents"}).text.strip()
    post = post_wash(post)  # strip all redundant whitespace from post
    # This board always prefixes the body with boilerplate code, so cut it off.
    # (Original comment says 112 chars, but the slice drops 67 — TODO confirm.)
    post = post[67:].strip()
    tag_done = tag.tagging(URL, title)
    if bs.find("div", {"id": "user_contents"}).find("img") is None:
        img = 3  # 3 is this parser's "no image" value
    else:
        # Take the post's first image. (translated from Korean)
        img = bs.find("div", {"id": "user_contents"}).find("img")['src']
        if 1000 <= len(img):
            img = 3  # absurdly long src — treat as no image
        else:
            # Decide whether src is an absolute or site-relative link. (translated)
            if img.startswith("http://") or img.startswith("https://"):
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
    if img != 3:
        # img_size() presumably validates the image — TODO confirm.
        if img_size(img):
            pass
        else:
            img = 3
    # post_data = {'title':, 'author':, 'date':, 'post':, 'tag':[], 'fav_cnt':0, 'view':0} shape
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # tag string shaped like "tag1/tag2/tag3/.../"
    post_data['img'] = img
    post_data['url'] = post_url
    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    time.sleep(2)  # throttle before returning
    return return_data
def Parsing_post_data(bs, post_url, URL):
    """Parse a post page into [post_data, title, date]; normalizes the stored URL."""
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])
    title = bs.find("td", {"class": "subject-value"}).get_text(" ", strip=True)
    author = bs.find("td", {"class": "writer"}).text.strip()
    if author.find("관리자") != -1:
        author = "0"  # mask administrator authors
    date = bs.find("td", {"class": "date"}).text
    # The founded board shows a date only; pad a noon time so strptime succeeds.
    if URL['info'] == "sj1_main_founded":
        date = date + " 12:00:00"
    date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
    post = bs.find("tbody").find("div").get_text(" ", strip=True)
    post = post_wash(post)  # strip all redundant whitespace from post
    tag_done = tag.tagging(URL, title)
    if bs.find("tbody").find("tr").find("img"):
        img = bs.find("tbody").find("tr").find("img")["src"]  # first image of the post
        if 1000 <= len(img):
            img = 1  # absurdly long src — treat as no image
        else:
            # Absolute, data-URI, protocol-relative or site-relative src. (translated)
            if img.startswith("http://") or img.startswith(
                    "https://") or img.startswith(
                    "data:"):
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
    else:
        img = 1  # 1 is this parser's "no image" value
    if img != 1:
        # img_size() presumably validates the image — TODO confirm.
        if img_size(img):
            pass
        else:
            img = 1
    # Drop the volatile viewNum query value so the stored URL stays stable.
    # NOTE(review): raises IndexError if no '&' follows the viewNum value.
    post_url_a = post_url.split("&viewNum=")[0]
    post_url_b = post_url.split("&viewNum=")[1]
    while post_url_b[0] != '&':
        post_url_b = post_url_b[1:]
    post_url = post_url_a + post_url_b
    # post_data = {'title':, 'author':, 'date':, 'post':, 'tag':[], img:1, 'view':0} shape
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # tag string shaped like "tag1/tag2/tag3/.../"
    post_data['img'] = img
    post_data['url'] = post_url
    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
def Parsing_post_data(post_url, URL):
    """Parse a wiki page; post_url arrives encoded as "<real-url>$$<title>"."""
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])
    title = post_url.split("$$")[1]    # title piggybacked after "$$"
    post_url = post_url.split("$$")[0]  # real URL before "$$"
    driver_post = URLparser(post_url)
    bs = BeautifulSoup(driver_post, 'html.parser')
    title = "세종대백과 :: " + title
    author = "0"  # wiki pages have no author
    date = "2019-01-01 12:00:00"  # static pages get a fixed date
    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    post = bs.find("div", {"class": "page group"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip all redundant whitespace from post
    post = post.split("//<![CDATA")[0]  # drop inline-script residue at the tail
    if bs.find("div", {"class": "page group"}).find("img") is None:
        img = 0  # 0 is this parser's "no image" value
    else:
        try:
            # Take the page's first image. (translated from Korean)
            img = bs.find("div", { "class": "page group" }).find("img")['src']
            if 1000 <= len(img):
                img = 0  # absurdly long src — treat as no image
            else:
                # Decide whether src is absolute or site-relative. (translated)
                if img.startswith("http://") or img.startswith(
                        "https://"):
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        except:
            img = 0
    if img != 0:
        # img_size() presumably validates the image — TODO confirm.
        if img_size(img):
            pass
        else:
            img = 0
    tag_done = tag.tagging(URL, title)
    # post_data = {'title':, 'author':, 'date':, 'post':, 'tag':[], img:1, 'view':0} shape
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # tag string shaped like "tag1/tag2/tag3/.../"
    post_data['img'] = img
    post_data['url'] = post_url
    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
def Parsing_post_data(bs, post_url, URL):
    """Best-effort parse of a post page; returns [post_data, title, date] or None."""
    try:
        return_data = []
        post_data = {}
        domain = Domain_check(URL['url'])
        title = bs.find("span", { "class": "col_blue" }).get_text(" ", strip=True)
        author = "0"  # this board exposes no author
        date = bs.find("dl", { "class": "explainInfoBx" }).find("dd").text.strip()
        date = date + " 00:00:00"  # date only on the page; pad midnight
        date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
        post = bs.find("p", {"class": "tx"}).get_text(" ", strip=True)
        post = post_wash(post)  # strip all redundant whitespace from post
        tag_done = tag.tagging(URL, title)
        if bs.find("div", {"class": "img"}).find("img") is None:
            img = 1  # 1 is this parser's "no image" value
        else:
            # Take the post's first image. (translated from Korean)
            img = bs.find("div", { "class": "img" }).find("img")['src']
            if 1000 <= len(img):
                img = 1  # absurdly long src — treat as no image
            else:
                # Decide whether src is absolute or site-relative. (translated)
                if img.startswith("http://") or img.startswith(
                        "https://"):
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        if img != 1:
            # img_size() presumably validates the image — TODO confirm.
            if img_size(img):
                pass
            else:
                img = 1
        # post_data = {'title':, 'author':, 'date':, 'post':, 'tag':[], 'fav_cnt':0, 'view':0} shape
        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = date
        post_data['post'] = post.lower()
        post_data[
            'tag'] = tag_done  # tag string shaped like "tag1/tag2/tag3/.../"
        post_data['img'] = img
        post_data['url'] = post_url
        return_data.append(post_data)
        return_data.append(title)
        return_data.append(date)
        return return_data
    except:
        # Best-effort: any failure (missing element, bad date) skips the post.
        return None
def Parsing_post_data(bs, post_url, URL):
    """Parse a post page into [post_data, title, date].

    Bug fix: the original dereferenced ``bs.find("meta", ...)['content']``
    before checking for None, so a page without an og:image meta tag raised
    TypeError; the tag itself is now checked first.
    """
    now = datetime.datetime.now().strftime("%Y-%m-%d")
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])
    title = bs.find("div", {"class": "title"}).find("h4").get_text(" ", strip = True)
    author = "0"  # this board exposes no author
    dates = bs.find("div", {"data-role": "input"}).findAll("time")
    if len(dates) < 3:
        # Fewer than three <time> entries: fall back to today at midnight.
        date = now
        date = date + " 00:00:00"
    else:
        # Third entry looks like "Y.m.d(weekday) H:M"; rebuild as "Y.m.d H:M:00".
        date = dates[2].text.strip()
        date1 = date.split("(")[0].strip()
        date2 = date.split(")")[1].strip()
        date = date1 + " " + date2 + ":00"
        date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
    post = bs.find("div", {"class": "abstract"}).find("div", {"class": "text"}).get_text(" ", strip = True)
    post = post_wash(post)  # strip all redundant whitespace from post
    og_image = bs.find("meta", {"property": "og:image"})
    if og_image is None or og_image.get('content') is None:
        img = 1  # 1 is this parser's "no image" value
    else:
        try:
            img = og_image['content']
            if 1000 <= len(img):
                img = 1  # absurdly long src — treat as no image
            else:
                # Decide whether src is absolute or site-relative.
                if img.startswith("http://") or img.startswith("https://"):
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        except:
            img = 1
    if img != 1:
        # img_size() validates the candidate image; failures fall back to 1.
        if img_size(img):
            pass
        else:
            img = 1
    tag_done = tag.tagging(URL, title)
    # post_data = {'title':, 'author':, 'date':, 'post':, 'tag':[], img:1, 'view':0} shape
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # tag string shaped like "tag1/tag2/tag3/.../"
    post_data['img'] = img
    post_data['url'] = post_url
    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
def Parsing_post_data(bs, post_url, URL):
    """Parse a gallery-style post page into [post_data, title, date]."""
    time.sleep(1)  # throttle requests
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])
    title = bs.find("span", {
        "class": "title_subject"
    }).get_text(" ", strip=True)
    author = "0"  # this board is stored without an author
    # The title attribute carries the full "Y-m-d H:M:S" timestamp.
    date = bs.find("span", {"class": "gall_date"})['title']
    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    post = bs.find("div", {
        "class": "writing_view_box"
    }).get_text(" ", strip=True)
    post = post_wash(post)  # strip all redundant whitespace from post
    if bs.find("div", {"class": "writing_view_box"}).find("img") is None:
        img = 0  # 0 is this parser's "no image" value
    else:
        try:
            # Take the post's first image. (translated from Korean)
            img = bs.find("div", {
                "class": "writing_view_box"
            }).find("img")['src']
            if 1000 <= len(img):
                img = 0  # absurdly long src — treat as no image
            else:
                # Decide whether src is absolute or site-relative. (translated)
                if img.startswith("http://") or img.startswith(
                        "https://"):
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        except:
            img = 0
    if img != 0:
        # img_size() presumably validates the image — TODO confirm.
        if img_size(img):
            pass
        else:
            img = 0
    tag_done = tag.tagging(URL, title)
    # post_data = {'title':, 'author':, 'date':, 'post':, 'tag':[], img:1, 'view':0} shape
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # tag string shaped like "tag1/tag2/tag3/.../"
    post_data['img'] = img
    post_data['url'] = post_url
    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
def Parsing_post_data(post_url, URL):
    """Log in to udream, scrape the list table, and return post_data dicts.

    Bug fix: the original printed and compared an undefined name ``date``
    (NameError on the first row). The cut-off comparison is meant to use the
    row's end date (``end_data``, "<end date> 00:00:00"), matching the
    original comment "skip posts older than end_date".
    """
    post_data_prepare = []
    end_date = date_cut_dict['sj4']  # cut-off date for this source
    # udream requires an authenticated session.
    s = udream.login()
    page = s.get(post_url).text
    bs = BeautifulSoup(page, "html.parser")
    posts = bs.find("tbody").findAll("tr")  # one <tr> per program row
    for post in posts:
        # Row cells: [title, author, period pieces..., end date, link]
        post_infoes = post.findAll("td")
        post_data = {}
        title = post_infoes[0].get_text(" ", strip=True)
        author = post_infoes[0].find("div").text
        if author.find("관리자") != -1:
            author = "0"  # mask administrator authors
        end_data = post_infoes[4].text + " 00:00:00"
        post = post_infoes[1].get_text(
            " ", strip=True) + post_infoes[2].get_text(
                " ", strip=True) + post_infoes[3].get_text(
                    " ", strip=True) + "~" + post_infoes[4].get_text(
                        " ", strip=True)
        post = post_wash(post)  # strip all redundant whitespace
        tag_done = tag.tagging(URL, title)
        post = post[:200]  # cap stored body length
        img = 1
        url = post_infoes[5].find("a")["href"]
        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = str(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        post_data['end_data'] = datetime.datetime.strptime(
            end_data, "%Y-%m-%d %H:%M:%S")
        post_data['post'] = post.upper()
        post_data[
            'tag'] = tag_done  # tag string shaped like "tag1/tag2/tag3/.../"
        post_data['img'] = img
        post_data['url'] = url
        print(end_data, "::::", title)
        # Skip programs whose end date is on or before the cut-off.
        if end_data <= end_date:
            continue
        else:
            post_data_prepare.append(post_data)
    s.close()
    return post_data_prepare
def Parsing_post_data(bs, post_url, URL):
    """Parse a program-detail page into [post_data, title, date]."""
    domain = URL['url'].split('/')[0] + '//' + URL['url'].split('/')[2]
    title = bs.find("span", {"class": "on"}).get_text(" ", strip=True)
    info_rows = bs.find("table", {"class": "basic-table input-table"}).findAll("tr")
    author = info_rows[1].find("td").text.strip()
    if "관리자" in author:  # mask administrator authors
        author = "0"
    # Row 3 holds "start ~ end"; keep the end part and pad a midnight time.
    raw_date = info_rows[3].find("td").text.strip()[:23].split('~')[1].strip() + " 00:00:00"
    date = str(datetime.datetime.strptime(raw_date, "%Y.%m.%d %H:%M:%S"))
    body = bs.find("ul", {"class": "summary-info"}).get_text(" ", strip=True)
    body = post_wash(body)  # strip all redundant whitespace
    tag_done = tag.tagging(URL, title)
    poster = bs.find("div", {"class": "poster"}).find("img")
    if poster is None:
        img = 1  # no poster image
    else:
        img = poster['src']  # first image of the post
        if 1000 <= len(img):
            img = 1  # absurdly long src — treat as no image
        elif img.startswith("http://") or img.startswith("https://"):
            pass  # already absolute
        elif img.startswith("//"):
            img = "http:" + img
        else:
            img = domain + img
    if img != 1 and not img_size(img):
        img = 1  # image failed validation
    post_data = {
        'title': title.upper(),
        'author': author.upper(),
        'date': date,
        'post': body.lower(),
        'tag': tag_done,  # "tag1/tag2/.../" style tag string
        'img': img,
        'url': post_url,
    }
    return [post_data, title, date]
def list_parse(bs0bj, URL, page, lastet_datetime=None):
    """Collect list rows via content_parse; filter by start/renewal conditions.

    NOTE: the parameter name "lastet_datetime" is a typo for "latest_datetime"
    but is part of the public interface, so it is kept unchanged.
    """
    start_datetime = startdate_dict[URL['info'].split('_')[1]]
    domain = URL['url'].split('/')[0] + '//' + URL['url'].split('/')[2]
    db_docs = []
    for post in bs0bj.findAll("li"):
        # Pinned notice rows only appear on page 1 — skip them.
        if post.find("span", {"class": "wr-icon wr-notice"}) is not None:
            continue
        try:
            href = post.find("div", {"class": "wr-subject"}).find("a").attrs["href"]
        except Exception:
            return db_docs  # a row without a link ends the listing
        record = {}
        record.update(content_parse(domain, href))
        # Include the lecture/class name in the tagging input when present.
        if "class" in record:
            record.update(tagging(URL, record['title'] + record['class']))
        else:
            record.update(tagging(URL, record['title']))
        print(record['date'])
        if lastet_datetime is None:
            if record['date'] >= start_datetime:
                db_docs.append(record)
        elif record['date'] >= lastet_datetime['recent_date'] and \
                record['title'] != lastet_datetime['title']:
            db_docs.append(record)
    return db_docs
def Parsing_post_data(bs, post_url, URL):
    """Parse a post page into [post_data, title, date].

    Bug fixes:
    - ``datetime.datetime.strftime(date, ...)`` on a str always raised, so the
      date silently fell back to now(); it is now parsed with ``strptime``
      (now() remains the fallback for unparseable dates).
    - ``domain`` was never defined, so site-relative image srcs raised
      NameError inside the bare except and always forced img = 1; it is now
      defined via Domain_check like the sibling parsers.
    """
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])
    title = bs.find("div", {
        "class": "prop article bt1"
    }).find("div", {
        "class": "subject"
    }).get_text(" ", strip=True)
    date = bs.find("span", {"class": "date"}).text
    date = date + " 00:00:00"  # page shows a date only; pad a midnight time
    try:
        date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    except:
        # Unparseable date — fall back to the crawl time.
        date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    post = bs.find("div", {"class": "phrase"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip all redundant whitespace from post
    try:
        # Take the post's first image.
        img = bs.find("div", {"class": "phrase"}).find("img")['src']
        if 1000 <= len(img):
            img = 1  # absurdly long src — treat as no image
        else:
            # Decide whether src is absolute or site-relative.
            if img.startswith("http://") or img.startswith("https://"):
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
        if img != 1:
            # img_size() validates the candidate image; failures fall back to 1.
            if img_size(img):
                pass
            else:
                img = 1
    except:
        img = 1
    tag_done = tag.tagging(URL, title)
    # post_data = {'title':, 'author':, 'date':, 'post':, 'tag':[], img:1, 'view':0} shape
    post_data['title'] = title.upper()
    post_data['author'] = "0"  # this board exposes no author
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # tag string shaped like "tag1/tag2/tag3/.../"
    post_data['img'] = img
    post_data['url'] = post_url
    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
def Parsing_post_data(bs, post_url, URL):
    """Parse a post page into [post_data, title, date]."""
    domain = Domain_check(URL['url'])
    title = bs.find("div", {"class": "col-lg-9 title"}).find("span").get_text(" ", strip=True)
    author = bs.find("span", {"name": "WRITENAME"}).text.strip()
    if "관리자" in author:  # mask administrator authors
        author = "0"
    raw_date = bs.find("span", {"name": "wdate"}).text + " 00:00:00"
    date = str(datetime.datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S"))
    body = bs.find("div", {"class": "form-group"}).get_text(" ", strip=True)
    body = post_wash(body)  # strip all redundant whitespace
    tag_done = tag.tagging(URL, title)
    # Image URL if an attachment icon exists, otherwise the flag 1.
    icon = bs.find("img", {"align": "absmiddle"})
    if icon is None:
        img = 1
    else:
        img = domain + icon['src']
        if 1000 <= len(img):
            img = 1  # absurdly long src — treat as no image
        elif img.startswith("http://") or img.startswith("https://"):
            pass  # already absolute
        elif img.startswith("//"):
            img = "http:" + img
        else:
            img = domain + img
    if img != 1 and not img_size(img):
        img = 1  # image failed validation
    post_data = {
        'title': title.upper(),
        'author': author.upper(),
        'date': date,
        'post': body.lower(),
        'tag': tag_done,  # "tag1/tag2/.../" style tag string
        'img': img,
        'url': post_url,
    }
    return [post_data, title, date]
def Parsing_post_data(bs, post_url, URL):
    """Parse a nested-table post page into [post_data, title, date]."""
    domain = Domain_check(URL['url'])
    tables = bs.find("div", {"align": "center"}).findAll("table")
    header_cells = tables[3].findAll("td")  # 4th table holds date + title
    title = header_cells[1].get_text(" ", strip=True)
    author = "0"  # this board exposes no author
    date = str(datetime.datetime.strptime(header_cells[0].text.strip(), "%Y-%m-%d %H:%M:%S"))
    content = bs.find("td", {"class": "sf_contents"})
    body = post_wash(content.get_text(" ", strip=True))  # strip whitespace
    first_img = content.find("img")
    if first_img is None:
        img = 1  # no image in the post
    else:
        img = first_img['src']  # first image of the post
        if 1000 <= len(img):
            img = 1  # absurdly long src — treat as no image
        elif img.startswith("http://") or img.startswith("https://"):
            pass  # already absolute
        elif img.startswith("//"):
            img = "http:" + img
        else:
            img = domain + img
    if img != 1 and not img_size(img):
        img = 1  # image failed validation
    tag_done = tag.tagging(URL, title)
    post_data = {
        'title': title.upper(),
        'author': author.upper(),
        'date': date,
        'post': body.lower(),
        'tag': tag_done,  # "tag1/tag2/.../" style tag string
        'img': img,
        'url': post_url,
    }
    return [post_data, title, date]
def Parsing_post_data(bs, post_url, URL):
    """Parse a contest-detail page into [post_data, title, date].

    bs       -- BeautifulSoup document of the contest page
    post_url -- canonical URL of the post (stored in post_data['url'])
    URL      -- crawler config dict; URL['url'] yields the site domain

    NOTE: this board's "no image" sentinel is 7 (default contest poster),
    unlike the other parsers which use 1.
    """
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    title = bs.find("div", {"class": "body contest-detail"}).find("span", {"class": "title"}).get_text(" ", strip=True)
    author = bs.find("div", {"class": "contest-overview"}).find("tbody").find("tr").text.strip()
    if author.find("관리자") != -1:
        author = "0"  # mask administrator accounts
    date = bs.find("th", text="접수기간").parent.find("td").text.strip()
    # Assumes the cell reads "<start> ~ <end>"; [13:] keeps the end date — TODO confirm
    date = date[13:] + " 00:00:00"
    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    post = bs.find("div", {"class": "info-cont"}).get_text(" ", strip=True)
    post = post_wash(post)  # remove all whitespace noise from the post body
    tag_done = tag.tagging(URL, title)

    poster = bs.find("img", {"id": "poster"})  # hoisted: was queried twice
    if poster is None:
        img = 7
    else:
        try:
            img = poster['src']  # contest poster image
            if 1000 <= len(img):  # absurdly long src: treat as no usable image
                img = 7
            else:
                # Decide whether src is absolute, protocol-relative or site-relative.
                if img.startswith("http://") or img.startswith("https://"):
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        except Exception:  # was a bare `except:`; narrowed so Ctrl-C/SystemExit propagate
            img = 7
    if img != 7:
        if img_size(img):
            pass
        else:
            img = 7  # image rejected by img_size

    # post_data = {'title':, 'author':, 'date':, 'post':, 'tag':[], img:1, 'view':0}
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # slash-joined tag string: "tag1/tag2/.../"
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
def Parsing_post_data(bs, post_url, URL):
    """Parse a single post page into [post_data, title, date].

    NOTE(review): the incoming `bs` is immediately discarded — the page is
    re-fetched from post_url with its trailing date fragment removed.
    """
    domain = Domain_check(URL['url'])

    date_suffix = post_url[-8:]                 # trailing "YY-MM-DD" fragment of the URL
    page = URLparser_UTF8(post_url.replace(date_suffix, ""))
    bs = BeautifulSoup(page, 'html.parser')

    title = bs.find("div", {"id": "contents"}).find("div", {"class": "vi_subj"}).get_text(" ", strip=True)
    author = "0"  # this board exposes no author; use the masked placeholder
    date = str(datetime.datetime.strptime("20" + date_suffix + " 00:00:00", "%Y-%m-%d %H:%M:%S"))

    content = bs.find("div", {"class": "vi_cont"})
    post = post_wash(content.get_text(" ", strip=True))  # strip whitespace noise

    # First image of the post, or the sentinel 1 when there is none.
    first_img = content.find("img")
    if first_img is None:
        img = 1
    else:
        img = first_img['src']
        if len(img) >= 1000:  # absurdly long src: treat as no usable image
            img = 1
        elif img.startswith("http://") or img.startswith("https://"):
            pass  # already an absolute external link
        elif img.startswith("//"):
            img = "http:" + img
        else:
            img = domain + img  # site-relative path
    if img != 1 and not img_size(img):
        img = 1  # image rejected by img_size

    tag_done = tag.tagging(URL, title)

    # post_data = {'title':, 'author':, 'date':, 'post':, 'tag':[], img:1, 'view':0}
    post_data = {
        'title': title.upper(),
        'author': author,  # NOTE: not upper-cased on this board (matches original)
        'date': date,
        'post': post.lower(),
        'tag': tag_done,  # slash-joined tag string: "tag1/tag2/.../"
        'img': img,
        'url': post_url,
    }
    return [post_data, title, date]
def Parsing_post_data(post_url, URL):
    """Parse the uDream board list page into a list of post_data dicts,
    keeping only posts strictly newer than the board's cut-off date.

    post_url -- list-page URL to fetch through an authenticated session
    URL      -- crawler config dict; URL['post_url'] is the detail-link prefix
    """
    post_data_prepare = []
    end_date = date_cut_dict['sj5']  # cut-off: drop posts at or before this date

    session = udream.login()  # this board requires an authenticated session
    soup = BeautifulSoup(session.get(post_url).text, "html.parser")

    for row in soup.find("tbody").findAll("tr"):
        cells = row.findAll("td")

        title = cells[0].get_text(" ", strip=True)
        author = row.find("div").text.strip()
        if author.find("관리자") != -1:
            author = "0"  # mask administrator accounts
        date = str(datetime.datetime.strptime(cells[3].text + " 00:00:00", "%Y-%m-%d %H:%M:%S"))
        phrase = post_wash(cells[1].text + cells[2].get_text(" ", strip=True))
        tag_done = tag.tagging(URL, title)

        # Detail-page id: 2nd quoted attribute value in the stringified <a> tag.
        # NOTE(review): depends on attribute order — fragile, kept as-is.
        url_num = str(cells[4].find("a")).split('"')[3]
        url = URL['post_url'] + url_num

        post_data = {
            'title': title.upper(),
            'author': author.upper(),
            'date': date,
            'post': phrase.lower(),
            'tag': tag_done,  # slash-joined tag string: "tag1/tag2/.../"
            'img': 1,
            'url': url,
        }
        print(date, "::::", title)

        # Older than (or equal to) the cut-off: skip; newer: keep.
        if str(date) <= end_date:
            continue
        post_data_prepare.append(post_data)

    session.close()
    return post_data_prepare