def Parsing_post_data(bs, post_url, URL): return_data = [] post_data = {} domain = Domain_check(URL['url']) title = bs.find("div", {"class": "body contest-detail"}).find("span", {"class": "title"}).get_text(" ", strip = True) author = bs.find("div", {"class": "contest-overview"}).find("tbody").find("tr").text.strip() if author.find("관리자") != -1: author = "0" date = bs.find("th", text="접수기간").parent.find("td").text.strip() date = date[13:] + " 00:00:00" date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")) post = bs.find("div", {"class": "info-cont"}).get_text(" ", strip = True) post = post_wash(post) #post 의 공백을 전부 제거하기 위함 tag_done = tag.tagging(URL, title) if bs.find("img", {"id": "poster"}) is None: img = 7 else: try: img = bs.find("img", {"id": "poster"})['src'] #게시글의 첫번째 이미지를 가져옴. if 1000 <= len(img): img = 7 else: if img.startswith("http://") or img.startswith("https://"): # img가 내부링크인지 외부 링크인지 판단. pass elif img.startswith("//"): img = "http:" + img else: img = domain + img except: img = 7 if img != 7: if img_size(img): pass else: img = 7 #post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[], img:1, 'view':0} 같은 형식 post_data['title'] = title.upper() post_data['author'] = author.upper() post_data['date'] = date post_data['post'] = post.lower() post_data['tag'] = tag_done # 태그1/태그2/태그3/태그4/.../ 같은 형식의 태그string이 들어간다. post_data['img'] = img post_data['url'] = post_url return_data.append(post_data) return_data.append(title) return_data.append(date) return return_data
def Parsing_post_data(bs, post_url, URL): return_data = [] post_data = {} title = bs.find("div", { "class": "prop article bt1" }).find("div", { "class": "subject" }).get_text(" ", strip=True) date = bs.find("span", {"class": "date"}).text date = date + " 00:00:00" try: date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")) except: date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") post = bs.find("div", {"class": "phrase"}).get_text(" ", strip=True) post = post_wash(post) #post 의 공백을 전부 제거하기 위함 try: img = bs.find("div", { "class": "phrase" }).find("img")['src'] #게시글의 첫번째 이미지를 가져옴. if 1000 <= len(img): img = 1 else: if img.startswith("http://") or img.startswith( "https://"): # img가 내부링크인지 외부 링크인지 판단. pass elif img.startswith("//"): img = "http:" + img else: img = domain + img if img != 1: if img_size(img): pass else: img = 1 except: img = 1 post_data['title'] = title.upper() post_data['author'] = "0" post_data['date'] = date post_data['post'] = post.lower() post_data['img'] = img post_data['url'] = post_url return_data.append(post_data) return_data.append(title) return_data.append(date) return return_data
def Parsing_post_data(bs, post_url, URL): return_data = [] post_data = {} domain = Domain_check(URL['url']) title = bs.find("strong", {"class": "tit"}).get_text(" ", strip=True) author = "0" date = bs.find("span", {"class": "each"}).text.strip()[6:] date = date + " 00:00:00" date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S")) post = bs.find("div", { "class": "board_view_con" }).get_text(" ", strip=True) post = post_wash(post) tag_done = tag.tagging(URL, title) if bs.find("div", {"class": "board_view_con"}).find("img") is None: img = 1 else: img = bs.find("div", { "class": "board_view_con" }).find("img")['src'] #게시글의 첫번째 이미지를 가져옴. if 1000 <= len(img): img = 1 else: if img.startswith("http://") or img.startswith( "https://"): # img가 내부링크인지 외부 링크인지 판단. pass elif img.startswith("//"): img = "http:" + img else: img = domain + img if img != 1: if img_size(img): pass else: img = 1 #post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[],'fav_cnt':0,'view':0} 같은 형식 post_data['title'] = title.upper() post_data['author'] = author.upper() post_data['date'] = date post_data['post'] = post.lower() post_data['tag'] = tag_done # 태그1/태그2/태그3/태그4/.../ 같은 형식의 태그string이 들어간다. post_data['img'] = img post_data['url'] = post_url return_data.append(post_data) return_data.append(title) return_data.append(date) return return_data
def Parsing_post_data(bs, post_url, URL):
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])
    date = post_url[-8:]
    url = post_url.replace(date, "")
    driver = URLparser_UTF8(url)
    bs = BeautifulSoup(driver, 'html.parser')
    title = bs.find("div", {"id": "contents"}).find("div", {"class": "vi_subj"}).get_text(" ", strip=True)
    author = "0"
    date = "20" + date + " 00:00:00"
    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    post = bs.find("div", {"class": "vi_cont"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip all extra whitespace from the post body
    if bs.find("div", {"class": "vi_cont"}).find("img") is None:
        img = 1
    else:
        img = bs.find("div", {"class": "vi_cont"}).find("img")['src']  # first image of the post
        if 1000 <= len(img):
            img = 1
        else:
            # decide whether img is an absolute, protocol-relative, or site-relative link
            if img.startswith("http://") or img.startswith("https://"):
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
    if img != 1:
        if img_size(img):
            pass
        else:
            img = 1
    tag_done = tag.tagging(URL, title)
    # post_data takes the form {'title': , 'author': , 'date': , 'post': , 'tag': [], 'img': 1, 'view': 0}
    post_data['title'] = title.upper()
    post_data['author'] = author
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # a tag string of the form tag1/tag2/tag3/tag4/.../
    post_data['img'] = img
    post_data['url'] = post_url
    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data

def Parsing_post_data(bs, post_url, URL): return_data = [] post_data = {} domain = Domain_check(URL['url']) title = bs.find("thead").find("td", { "class": "subject-value" }).get_text(" ", strip=True) author = bs.find("thead").find("td", {"class": "writer"}).text.strip() if author.find("관리자") != -1: author = "0" date = bs.find("thead").find("td", {"class": "date"}).text.strip() date = date + " 00:00:00" date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S")) post = bs.find("tbody").find("td", { "class": "content" }).get_text(" ", strip=True) post = post_wash(post) #post 의 공백을 전부 제거하기 위함 if bs.find("tbody").find("img") is None: img = 1 else: img = bs.find("tbody").find("img")['src'] #게시글의 첫번째 이미지를 가져옴. if 1000 <= len(img): img = 1 else: if img.startswith("http://") or img.startswith( "https://"): # img가 내부링크인지 외부 링크인지 판단. pass elif img.startswith("//"): img = "http:" + img else: img = domain + img if img != 1: if img_size(img): pass else: img = 1 post_data['title'] = title.upper() post_data['author'] = author.upper() post_data['date'] = date post_data['post'] = post.lower() post_data['img'] = img post_data['url'] = post_url return_data.append(post_data) return_data.append(title) return_data.append(date) return return_data
def Parsing_post_data(post_url, URL):
    post_data_prepare = []
    end_date = date_cut_dict['sj4']  # cutoff date for this board
    # log in to udream before requesting the page
    s = udream.login()
    page = s.get(post_url).text
    bs = BeautifulSoup(page, "html.parser")
    posts = bs.find("tbody").findAll("tr")  # one <tr> per post
    for post in posts:
        # each row has the layout [title, author, post1, post2, date]
        post_infoes = post.findAll("td")  # the <td> cells of the row
        post_data = {}
        title = post_infoes[0].get_text(" ", strip=True)
        author = post_infoes[0].find("div").text
        if author.find("관리자") != -1:
            author = "0"
        date = post_infoes[4].text + " 00:00:00"
        date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
        post = post_infoes[1].get_text(" ", strip=True) + post_infoes[2].get_text(" ", strip=True) + post_infoes[3].get_text(" ", strip=True) + "~" + post_infoes[4].get_text(" ", strip=True)
        post = post_wash(post)
        tag_done = tag.tagging(URL, title)
        post = post[:200]
        img = 1
        url = post_infoes[5].find("a")["href"]
        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        db_date = post_infoes[3].text + " 00:00:00"
        post_data['date'] = str(datetime.datetime.strptime(db_date, "%Y-%m-%d %H:%M:%S"))
        post_data['post'] = post.upper()
        post_data['tag'] = tag_done  # a tag string of the form tag1/tag2/tag3/tag4/.../
        post_data['img'] = img
        post_data['url'] = url
        print(date, "::::", title)
        # skip posts older than end_date; append newer ones
        if str(date) <= end_date:
            continue
        else:
            post_data_prepare.append(post_data)
    s.close()
    return post_data_prepare

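# The end_date cutoff above relies on plain string comparison: because both
# date and end_date are "%Y-%m-%d %H:%M:%S" strings, lexicographic order
# matches chronological order. A small illustration (the dates below are
# made-up example values, not from the crawler):
end_date_example = "2020-03-01 00:00:00"
assert "2020-02-28 00:00:00" <= end_date_example        # older post -> skipped
assert not ("2020-03-02 00:00:00" <= end_date_example)  # newer post -> appended
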
def Parsing_post_data(bs, post_url, URL): return_data = [] post_data = {} domain = Domain_check(URL['url']) tables = bs.find("div", {"align": "center"}).findAll("table") title_table = tables[3] tds = title_table.findAll("td") title = tds[1].get_text(" ", strip=True) author = "0" date = tds[0].text.strip() date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")) post = bs.find("td", {"class": "sf_contents"}).get_text(" ", strip=True) post = post_wash(post) #post 의 공백을 전부 제거하기 위함 if bs.find("td", {"class": "sf_contents"}).find("img") is None: img = 1 else: img = bs.find("td", { "class": "sf_contents" }).find("img")['src'] #게시글의 첫번째 이미지를 가져옴. if 1000 <= len(img): img = 1 else: if img.startswith("http://") or img.startswith( "https://"): # img가 내부링크인지 외부 링크인지 판단. pass elif img.startswith("//"): img = "http:" + img else: img = domain + img if img != 1: if img_size(img): pass else: img = 1 post_data['title'] = title.upper() post_data['author'] = author.upper() post_data['date'] = date post_data['post'] = post.lower() post_data['img'] = img post_data['url'] = post_url return_data.append(post_data) return_data.append(title) return_data.append(date) return return_data
def Parsing_post_data(bs, post_url, URL): return_data = [] post_data = {} domain = Domain_check(URL['url']) title = bs.find("div", { "id": "ModuleBoardView" }).find("div", { "class": "title" }).find("h5").get_text(" ", strip=True) author = bs.find("span", {"rel": "author"}).text.strip() date = bs.find("li", {"class": "date"}).find("time").text.strip() date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")) post = bs.find("div", {"class": "content"}).get_text(" ", strip=True) post = post_wash(post) #post 의 공백을 전부 제거하기 위함 if bs.find("article", {"data-role": "post"}).find("img")['src'] is None: img = 1 else: try: img = bs.find("article", {"data-role": "post"}).find("img")['src'] if 1000 <= len(img): img = 1 else: if img.startswith("http://") or img.startswith( "https://"): # img가 내부링크인지 외부 링크인지 판단. pass elif img.startswith("//"): img = "http:" + img else: img = domain + img except: img = 1 if img != 1: if img_size(img): pass else: img = 1 post_data['title'] = title.upper() post_data['author'] = author.upper() post_data['date'] = date post_data['post'] = post.lower() post_data['img'] = img post_data['url'] = post_url return_data.append(post_data) return_data.append(title) return_data.append(date) return return_data
def Parsing_post_data(bs, post_url, URL): return_data = [] post_data = {} domain = Domain_check(URL['url']) title = bs.find("td", {"class": "subject-value"}).get_text(" ", strip=True) author = bs.find("td", {"class": "writer"}).text.strip() if author.find("관리자") != -1: author = "0" date = bs.find("td", {"class": "date"}).text date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S")) post = bs.find("tbody").find("div").get_text(" ", strip=True) post = post_wash(post) #post 의 공백을 전부 제거하기 위함 tag_done = tag.tagging(URL, title) if bs.find("tbody").find("tr").find("img"): img = bs.find("tbody").find("tr").find("img")["src"] if 1000 <= len(img): img = 1 else: if img.startswith("http://") or img.startswith( "https://") or img.startswith( "data:"): # img가 내부링크인지 외부 링크인지 판단. pass elif img.startswith("//"): img = "http:" + img else: img = domain + img else: img = 1 if img != 1: if img_size(img): pass else: img = 1 #post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[], img:1, 'view':0} 같은 형식 post_data['title'] = title.upper() post_data['author'] = author.upper() post_data['date'] = date post_data['post'] = post.lower() post_data['tag'] = tag_done # 태그1/태그2/태그3/태그4/.../ 같은 형식의 태그string이 들어간다. post_data['img'] = img post_data['url'] = post_url return_data.append(post_data) return_data.append(title) return_data.append(date) return return_data
def content_parse(domain, url): html = URLparser(url) bs0bj = BeautifulSoup(html.read(), "html.parser") db_record = {} db_record.update({"url":url}) obj = bs0bj.find("table",{"class":"bbs-view-info"}) obj2 = obj.find("tr").find("td") db_record.update({"title":obj2.get_text().strip()}) obj2 = obj.find("tr").findNext("tr").find("td") db_record.update({"date":obj2.get_text().strip()}) obj = bs0bj.find("table",{"class":"bbs-view"}) db_record.update({"post":post_wash(str(obj.get_text().strip()))}) return db_record
def content_parse(url): html = URLparser(url) bs0bj = BeautifulSoup(html.read(), "html.parser") db_record = {} db_record.update({"url": url}) obj = bs0bj.find("div", {"class": "read_header"}).h1 db_record.update({"title": obj.get_text().strip()}) obj = bs0bj.find("p", {"class": "time"}).get_text().strip() obj = obj.replace(".", "-") db_record.update({"date": obj}) obj = bs0bj.find("div", {"class": "read_body"}).get_text().strip() db_record.update({"post": post_wash(obj)}) return db_record
def Parsing_post_data(bs, post_url, URL): return_data = [] post_data = {} domain = Domain_check(URL['url']) title = bs.find("div", { "class": "col-lg-9 title" }).find("span").get_text(" ", strip=True) author = bs.find("span", {"name": "WRITENAME"}).text.strip() if author.find("관리자") != -1: author = "0" date = bs.find("span", {"name": "wdate"}).text date = date + " 00:00:00" date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")) post = bs.find("div", {"class": "form-group"}).get_text(" ", strip=True) post = post_wash(post) #post 의 공백을 전부 제거하기 위함 #이미지가 있으면 이미지 url 을 넣고, 없으면 1을 넣어준다. if bs.find("img", {"align": "absmiddle"}) is None: img = 1 else: img = domain + bs.find("img", {"align": "absmiddle"})['src'] if 1000 <= len(img): img = 1 else: if img.startswith("http://") or img.startswith( "https://"): # img가 내부링크인지 외부 링크인지 판단. pass elif img.startswith("//"): img = "http:" + img else: img = domain + img if img != 1: if img_size(img): pass else: img = 1 post_data['title'] = title.upper() post_data['author'] = author.upper() post_data['date'] = date post_data['post'] = post.lower() post_data['img'] = img post_data['url'] = post_url return_data.append(post_data) return_data.append(title) return_data.append(date) return return_data
def content_parse(url): html = URLparser(url) bs0bj = BeautifulSoup(html.read(), "html.parser") db_record = {} db_record.update({"url": url}) obj = bs0bj.find("td", {"class": "list_loop_left"}) db_record.update({"title": obj.get_text().strip()}) obj = obj.findNext("td", {"class": "list_loop_left"}).get_text().strip() obj = obj.replace(".", "-").split("(")[1].split(" ")[0] db_record.update({"date": obj}) obj = bs0bj.find("td", {"class": "view_content"}).get_text().strip() db_record.update({"post": post_wash(obj)}) return db_record
def Parsing_post_data(bs, post_url, URL):
    time.sleep(2)  # throttle requests to avoid overloading the server
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])
    title = bs.find("div", {"class": "infoBx"}).find("h3").get_text(" ", strip=True)
    author = bs.find("p", {"class": "infoTx"}).find("span", {"class": "cate"}).text.strip()
    if author.find("관리자") != -1:
        author = "0"
    date = bs.find("p", {"class": "infoTx"}).find("span", {"class": "date"}).text.strip()
    date = date + ":00"
    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    post = bs.find("div", {"id": "view_text"}).get_text(" ", strip=True)
    post = post_wash(post)
    if bs.find("div", {"id": "view_text"}).find("img") is None:
        img = 1
    else:
        img = bs.find("div", {"id": "view_text"}).find("img")['src']  # first image of the post
        if 1000 <= len(img):
            img = 1
        else:
            # decide whether img is an absolute, protocol-relative, or site-relative link
            if img.startswith("http://") or img.startswith("https://"):
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
    if img != 1:
        if img_size(img):
            pass
        else:
            img = 1
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['img'] = img
    post_data['url'] = post_url
    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data

def Parsing_post_data(bs, post_url, URL): try: return_data = [] post_data = {} domain = Domain_check(URL['url']) title = bs.find("span", {"class": "col_blue"}).get_text(" ", strip = True) author = "0" date = bs.find("dl", {"class": "explainInfoBx"}).find("dd").text.strip() date = date + " 00:00:00" date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S")) post = bs.find("p", {"class": "tx"}).get_text(" ", strip = True) post = post_wash(post) if bs.find("div", {"class": "img"}).find("img") is None: img = 1 else: img = bs.find("div", {"class": "img"}).find("img")['src'] #게시글의 첫번째 이미지를 가져옴. if 1000 <= len(img): img = 1 else: if img.startswith("http://") or img.startswith("https://"): # img가 내부링크인지 외부 링크인지 판단. pass elif img.startswith("//"): img = "http:" + img else: img = domain + img if img != 1: if img_size(img): pass else: img = 1 post_data['title'] = title.upper() post_data['author'] = author.upper() post_data['date'] = date post_data['post'] = post.lower() post_data['img'] = img post_data['url'] = post_url return_data.append(post_data) return_data.append(title) return_data.append(date) return return_data except: return None
def content_parse(domain, url): html = URLparser(url) bs0bj = BeautifulSoup(html.read(), "html.parser") db_record = {} db_record.update({"url": url}) obj = bs0bj.find(text="제목") db_record.update({"title": obj.findNext('td').get_text().strip()}) obj = bs0bj.find(text="작성일") db_record.update({"date": obj.findNext('td').get_text().strip()}) try: obj = bs0bj.find("div", {'class': "bbs-body"}) db_record.update({"post": post_wash(str(obj.get_text().strip()))}) except: db_record.update({"post": 1}) return db_record
def content_parse(url): html = URLparser(url) bs0bj = BeautifulSoup(html.read(), "html.parser") db_record = {} db_record.update({"url":url}) obj = bs0bj.find("td",{"class":"boardSub"}) db_record.update({"title":obj.get_text().strip()}) obj = obj.findNext("td").findNext("td").get_text().strip() obj = obj.replace(".","-") db_record.update({"date":obj}) obj = bs0bj.find("td",{"class":"contens"}).get_text().strip() db_record.update({"post":post_wash(obj)}) return db_record
def Parsing_post_data(post_url, URL):
    post_data_prepare = []
    end_date = date_cut_dict['sj5']  # cutoff date for this board
    # log in to udream before requesting the page
    s = udream.login()
    page = s.get(post_url).text
    bs = BeautifulSoup(page, "html.parser")
    posts = bs.find("tbody").findAll("tr")  # one <tr> per post
    for post in posts:
        post_infoes = post.findAll("td")  # the <td> cells of the row
        post_data = {}
        title = post_infoes[0].get_text(" ", strip=True)
        author = post.find("div").text.strip()
        if author.find("관리자") != -1:
            author = "0"
        date = post_infoes[3].text + " 00:00:00"
        date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
        phrase = post_infoes[1].text + post_infoes[2].get_text(" ", strip=True)
        phrase = post_wash(phrase)
        img = 1
        url_num = str(post_infoes[4].find("a")).split('"')[3]
        url = URL['post_url'] + url_num
        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = date
        post_data['post'] = phrase.lower()
        post_data['img'] = img
        post_data['url'] = url
        print(date, "::::", title)
        # skip posts older than end_date; append newer ones
        if str(date) <= end_date:
            continue
        else:
            post_data_prepare.append(post_data)
    s.close()
    return post_data_prepare

def Parsing_post_data(bs, post_url, URL):
    return_data = []
    post_data = {}
    domain = URL['url'].split('/')[0] + '//' + URL['url'].split('/')[2]
    title = bs.find("span", {"class": "on"}).get_text(" ", strip=True)
    author = bs.find("table", {"class": "basic-table input-table"}).findAll("tr")[1].find("td").text.strip()
    if author.find("관리자") != -1:
        author = "0"
    date = bs.find("table", {"class": "basic-table input-table"}).findAll("tr")[3].find("td").text.strip()[:23].split('~')[1].strip()
    date = date + " 00:00:00"
    date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
    post = bs.find("ul", {"class": "summary-info"}).get_text(" ", strip=True)
    post = post_wash(post)
    if bs.find("div", {"class": "poster"}).find("img") is None:
        img = 1
    else:
        img = bs.find("div", {"class": "poster"}).find("img")['src']  # first image of the post
        if 1000 <= len(img):
            img = 1
        else:
            # decide whether img is an absolute, protocol-relative, or site-relative link
            if img.startswith("http://") or img.startswith("https://"):
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
    if img != 1:
        if img_size(img):
            pass
        else:
            img = 1
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['img'] = img
    post_data['url'] = post_url
    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data

def content_parse(url): html = URLparser(url) bs0bj = BeautifulSoup(html.read(), "html.parser") db_record = {} db_record.update({"url": url}) obj = bs0bj.find("td", {"class": "title"}) db_record.update({"title": obj.get_text().strip()}) obj = obj.findNext("td").findNext("td") db_record.update({"date": obj.get_text().strip()}) try: obj = bs0bj.find("td", {"class": "tdc"}).get_text().strip() db_record.update({"post": post_wash(obj)}) except: db_record.update({"post": 1}) return db_record
def content_parse(url): html = URLparser(url) bs0bj = BeautifulSoup(html.read(), "html.parser").find("article",{"id":"bo_v"}) db_record = {} db_record.update({"url":url}) obj = bs0bj.find("h1",{"id":"bo_v_title"}).get_text().strip() db_record.update({"title":obj}) obj = bs0bj.find("section",{"id":"bo_v_info"}).find("strong").find_next("strong") obj = "20" + obj.get_text().strip() db_record.update({"date":obj}) try: obj = bs0bj.find("div",{"id":"bo_v_con"}).get_text().strip() db_record.update({"post":post_wash(obj)}) except: db_record.update({"post":1}) return db_record
def content_parse(url): db_record = {} html = URLparser(url) bs0bj = BeautifulSoup(html.read(), "html.parser") bs0bj = bs0bj.find("div",{"id":"board_view"}) db_record.update({"url":url}) obj = bs0bj.find("h3").get_text().strip() db_record.update({"title":obj}) obj = bs0bj.find("p",{"class":"writer"}).find("strong").get_text().strip() db_record.update({"date":obj}) try: obj = bs0bj.find("div",{"class":"board_stance"}).get_text().strip() db_record.update({"post":post_wash(obj)}) except: db_record.update({"post":1}) return db_record
def content_parse(domain, url): html = URLparser(url) bs0bj = BeautifulSoup(html.read(), "html.parser") db_record = {} db_record.update({"url": url}) obj = bs0bj.find("tr", { "class": "head" }).find("td", {"class": "first txt-l"}) db_record.update({"title": obj.get_text().strip()}) obj = obj.find_next("td").find_next("td") db_record.update({"date": obj.get_text().strip()}) try: obj = bs0bj.find("tr", {"class": "head"}).find_next("tr") db_record.update({"post": post_wash(str(obj.get_text().strip()))}) except: db_record.update({"post": 1}) return db_record
def content_parse(url): html = URLparser(url) bs0bj = BeautifulSoup(html.read(), "html.parser") db_record = {} db_record.update({"url": url}) obj = bs0bj.find("h3", {"class": "title"}).get_text().strip() db_record.update({"title": obj}) obj = bs0bj.find("span", {"class": "date"}).get_text().strip() obj = obj.split('.')[0] + "-" + obj.split('.')[1] + "-" + obj.split('.')[2] db_record.update({"date": obj}) try: obj = bs0bj.find("div", {"class": "boardReadBody"}).get_text().strip() db_record.update({"post": post_wash(obj)}) except: db_record.update({"post": 1}) return db_record
def Parsing_post_data(bs, post_url, URL): return_data = [] post_data = {} domain = Domain_check(URL['url']) title = bs.find("div", { "class": "tbl_container" }).find("th").get_text(" ", strip=True) author = bs.find("div", { "class": "tbl_container" }).findAll("tr")[1].findAll("td")[1].text.strip() if author.find("관리자") != -1: author = "0" date = bs.find("div", { "class": "tbl_container" }).findAll("tr")[1].findAll("td")[3].text.strip() date = date.replace(" 오전", "") date = date.replace(" 오후", "") if len(date.split(":")) == 2: date = date + ":00" date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")) post = bs.find("div", { "class": "tbl_container" }).findAll("tr")[2].get_text(" ", strip=True) post = post_wash(post) tag_done = tag.tagging(URL, title) img = 1 #post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[],'fav_cnt':0,'view':0} 같은 형식 post_data['title'] = title.upper() post_data['author'] = author.upper() post_data['date'] = date post_data['post'] = post.lower() post_data['tag'] = tag_done # 태그1/태그2/태그3/태그4/.../ 같은 형식의 태그string이 들어간다. post_data['img'] = img post_data['url'] = post_url return_data.append(post_data) return_data.append(title) return_data.append(date) return return_data
def content_parse(url): html = URLparser(url) bs0bj = BeautifulSoup(html.read(), "html.parser") db_record = {} db_record.update({"url": url}) obj = bs0bj.find("span", {"class": "view_subj_core"}) obj = obj.get_text().strip() db_record.update({"title": obj}) obj = bs0bj.find("span", {"class": "view_subj_date"}) obj = obj.get_text().strip() db_record.update({"date": obj}) try: obj = bs0bj.find("div", {"class": "view_txt_container"}) obj = obj.get_text().strip() db_record.update({"post": post_wash(str(obj))}) except: db_record.update({"post": 1}) return db_record
def Parsing_post_data(bs, post_url, URL):
    try:
        time.sleep(2)  # throttle requests to avoid overloading the server
        return_data = []
        post_data = {}
        domain = Domain_check(URL['url'])
        author = bs.find("div", {"class": "sumTit"}).find("h3").find("span").text.strip()
        title = bs.find("div", {"class": "sumTit"}).find("h3").get_text(" ", strip=True).replace(author, "").strip()
        if author.find("관리자") != -1:
            author = "0"
        date = bs.find("dl", {"class": "date"}).findAll("dd")[1].find("span").text.strip()
        date = date + " 00:00:00"
        date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
        post = bs.find("div", {"class": "tbRow clear"}).get_text(" ", strip=True)
        post = post_wash(post)
        img = 1
        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = date
        post_data['post'] = post.lower()
        post_data['img'] = img
        post_data['url'] = post_url
        return_data.append(post_data)
        return_data.append(title)
        return_data.append(date)
        return return_data
    except:
        return None

def Parsing_post_data(bs, post_url, URL): return_data = [] post_data = {} domain = Domain_check(URL['url']) title = bs.find("div", {"class": "view_subject"}).find("h5").get_text(" ", strip = True) author = bs.find("ul", {"class": "data"}).find("li").text.strip() date = now date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")) post = bs.find("div", {"class": "view_contents"}).get_text(" ", strip = True) post = post_wash(post) img = 1 post_data['title'] = title.upper() post_data['author'] = author.upper() post_data['date'] = date post_data['post'] = post.lower() post_data['img'] = img post_data['url'] = post_url return_data.append(post_data) return_data.append(title) return_data.append(date) return return_data
def content_parse(domain, url): html = URLparser(url) bs0bj = BeautifulSoup(html.read(), "html.parser") db_record = {} db_record.update({"url": url}) bs0bj = bs0bj.find("table", {"class": "board_view"}) obj = bs0bj.find("thead").get_text().strip() db_record.update({"title": obj}) obj = bs0bj.find("tbody").find("tr").find("td").find_next("td").find_next( "td") obj = obj.get_text().strip().split(" ")[2] db_record.update({"date": obj}) try: obj = bs0bj.find("tbody").find("td", {"class": "tdc"}) obj = obj.get_text().strip() db_record.update({"post": post_wash(str(obj))}) except: db_record.update({"post": 1}) return db_record
def Parsing_post_data(bs, post_url, URL): try: return_data = [] post_data = {} domain = Domain_check(URL['url']) title = bs.find("span", { "class": "txt_jobfair" }).get_text(" ", strip=True) author = bs.find("span", {"class": "tit_company_name"}).text.strip() date = bs.find("p", {"class": 'info'}).find("span").text.strip() date = date + " 00:00:00" date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S")) post = "" posts = bs.findAll("dl", {"class": "qna_list"}) for posts_one in posts: post += posts_one.text.get_text(" ", strip=True) post = post_wash(post) tag_done = tag.tagging(URL, title) img = 1 #post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[],'fav_cnt':0,'view':0} 같은 형식 post_data['title'] = title.upper() post_data['author'] = author.upper() post_data['date'] = date post_data['post'] = post.lower() post_data[ 'tag'] = tag_done # 태그1/태그2/태그3/태그4/.../ 같은 형식의 태그string이 들어간다. post_data['img'] = img post_data['url'] = post_url return_data.append(post_data) return_data.append(title) return_data.append(date) return return_data except: return None