Exemplo n.º 1
0
def Parsing_post_data(driver, post_url, URL):
    """Fetch a post page with Selenium and parse it into crawler records.

    Tries a whitespace-normalising extraction first (``get_text``); if any
    element lookup fails, reloads the page and falls back to plain
    ``.text`` extraction (the original duplicated the whole body in the
    except branch — the shared tail is now deduplicated, and the bare
    ``except:`` is narrowed to ``Exception``).

    Returns ``[post_data dict, title, date]``.
    """
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    try:
        driver.get(post_url)
        # No reliable element to wait for, so sleep 0.5s; adjust up to ~3s
        # depending on network/machine speed.
        time.sleep(0.5)
        bs = BeautifulSoup(driver.page_source, 'html.parser')

        title = bs.find("li", {
            "class": "vi_subject vi_title"
        }).get_text(" ", strip=True)
        author = bs.find("span", {"id": "regname"}).text.strip()
        date = bs.find("span", {"id": "regdate"}).text.strip()
        date = str(datetime.datetime.strptime(date + " 00:00:00",
                                              "%Y-%m-%d %H:%M:%S"))
        post = bs.find("li", {"id": "contents"}).get_text(" ", strip=True)
    except Exception:
        # Retry once with plain-text extraction on any parse failure.
        driver.get(post_url)
        time.sleep(0.5)
        bs = BeautifulSoup(driver.page_source, 'html.parser')

        title = bs.find("li", {"class": "vi_subject vi_title"}).text.strip()
        author = bs.find("span", {"id": "regname"}).text.strip()
        date = bs.find("span", {"id": "regdate"}).text.strip()
        date = str(datetime.datetime.strptime(date + " 00:00:00",
                                              "%Y-%m-%d %H:%M:%S"))
        post = bs.find("li", {"id": "contents"}).text.strip()

    post = post_wash(post)  # strip redundant whitespace from the body text
    tag_done = tag.tagging(URL, title)
    img = 1

    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # slash-separated tag string, e.g. "tag1/tag2/"
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Exemplo n.º 2
0
def Parsing_post_data(bs, post_url, URL):
    """Parse a boardTd-layout post into [post_data, title, date]."""
    cells = bs.findAll("span", {"class": "boardTd"})

    title = cells[0].get_text(" ", strip=True)
    author = cells[1].text.strip()
    if "관리자" in author:
        author = "0"
    raw_date = cells[2].text.strip()
    date = str(datetime.datetime.strptime(raw_date, "%Y/%m/%d %H:%M:%S"))
    body = post_wash(bs.find("div", {"class": "xed"}).get_text(" ", strip=True))

    post_data = {
        'title': title.upper(),
        'author': author.upper(),
        'date': date,
        'post': body.lower(),
        'tag': tag.tagging(URL, title),  # slash-separated tag string
        'img': 1,  # Sejong-related post, so always 1
        'url': post_url,
    }

    return [post_data, title, date]
Exemplo n.º 3
0
def Parsing_post_data(bs, post_url, URL):
	"""Parse an Indeed job posting into [post_data, title, date]."""
	domain = Domain_check(URL['url'])

	print(post_url)
	title = bs.find("h3", {"class": "jobsearch-JobInfoHeader-title"}).get_text(" ", strip = True)
	try:
		author = bs.find("div", {"class": 'icl-u-lg-mr--sm icl-u-xs-mr--xs'}).text.strip()
	except:
		author = "Indeed"
	# `now` is a module-level timestamp string; the crawl time stands in for the post date.
	date = str(datetime.datetime.strptime(now, "%Y-%m-%d %H:%M:%S"))
	body = post_wash(bs.find("div", {"id": "jobDescriptionText"}).get_text(" ", strip = True))

	post_data = {
		'title': title.upper(),
		'author': author.upper(),
		'date': date,
		'post': body.lower(),
		'tag': tag.tagging(URL, title),  # slash-separated tag string
		'img': 1,
		'url': post_url,
	}

	return [post_data, title, date]
Exemplo n.º 4
0
def list_parse(bs0bj, URL, page, latest_datetime=None):
    """Collect post records from a board listing page.

    First-run mode (latest_datetime is None): keep posts dated on/after the
    configured start date.  Renewal mode: keep posts newer than the last
    crawl, skipping the post whose title matches the last-seen one.

    Returns a list of record dicts built by content_parse() + tagging().
    """
    target = URL['info'].split('_')[1]
    start_datetime = startdate_dict[target]
    db_docs = []
    post_list = bs0bj.findAll("a")
    # Rebuild the board's base URL from the first five path segments.
    domain = URL['url'].split('/')[0] + '//' + URL['url'].split('/')[2] + '/'\
     + URL['url'].split('/')[3] + '/' + URL['url'].split('/')[4] + '/'

    for post in post_list:
        db_record = {}
        try:
            obj = post.attrs['href']
        except Exception as e:
            # An <a> without href ends the useful part of the listing.
            return db_docs

        db_record.update(content_parse(domain + obj))
        # Generate tags from the title.
        db_record.update(tagging(URL, db_record['title']))

        print(db_record['date'])
        # First crawl: keep posts on/after the start date.
        if db_record['date'] >= start_datetime  and \
          latest_datetime == None:
            db_docs.append(db_record)
        # Renewal crawl: keep posts newer than the last crawled one.
        elif latest_datetime != None and \
          db_record['date'] >= latest_datetime['recent_date'] and \
           db_record['title'] != latest_datetime['title']:
            db_docs.append(db_record)
        else:
            continue
    return db_docs
Exemplo n.º 5
0
def Parsing_post_data(bs, post_url, URL):
	"""Parse a header/wrapper-lg layout post into [post_data, title, date]."""
	title = bs.find("header", {"class": "header b-b bg-light h2"}).find("span").get_text(" ", strip = True)
	author = bs.find("div", {"class": "col-xs-10 lbb"}).text.strip()
	if "관리자" in author:
		author = "0"
	raw_date = bs.find("span", {"name": "Edate"}).text + " 00:00:00"
	date = str(datetime.datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S"))
	body = post_wash(bs.find("section", {"class": "wrapper-lg"}).get_text(" ", strip = True))

	post_data = {
		'title': title.upper(),
		'author': author.upper(),
		'date': date,
		'post': body.lower(),
		'tag': tag.tagging(URL, title),  # slash-separated tag string
		'img': 1,
		'url': post_url,
	}

	return [post_data, title, date]
Exemplo n.º 6
0
def Parsing_post_data(bs, post_url, URL):
    """Parse a top_area/rd_body-layout post into [post_data, title, date].

    Also resolves the post's first image URL (relative -> absolute) and
    falls back to the placeholder value 1 when no usable image exists.
    """
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    title = bs.find("div", {
        "class": "top_area ngeb"
    }).find("a").get_text(" ", strip=True)
    if bs.find("div", {"class": "btm_area clear"}).find("a") is None:
        author = "0"
    else:
        author = bs.find("div", {
            "class": "btm_area clear"
        }).find("a").text.strip()
        if author.find("관리자") != -1:
            # Normalise admin-authored posts to the sentinel author "0".
            author = "0"
    date = bs.find("div", {"class": "top_area ngeb"}).find("span").text.strip()
    date = date + ":00"
    date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
    post = bs.find("div", {"class": "rd_body clear"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip redundant whitespace from the body text
    if bs.find("div", {"class": "rd_body clear"}).find("img") is None:
        img = 1
    else:
        try:
            img = bs.find("div", {
                "class": "rd_body clear"
            }).find("img")['src']  # take the post's first image
            if 1000 <= len(img):
                # Extremely long src (e.g. inline data) — treat as no image.
                img = 1
            else:
                if img.startswith("http://") or img.startswith(
                        "https://"):  # decide whether src is internal or external
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        except:
            img = 1
    if img != 1:
        # Drop images that fail the size check.
        if img_size(img):
            pass
        else:
            img = 1
    tag_done = tag.tagging(URL, title)

    # post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[], img:1, 'view':0} shape
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # slash-separated tag string, e.g. "tag1/tag2/tag3/"
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Exemplo n.º 7
0
def list_parse(bs0bj, URL, page):
	"""Parse every <li> on the listing page into lightweight records.

	The post body is not fetched here (post=0) and the crawl date (today)
	stands in for the post date.  Returns the list of record dicts.
	"""
	today = get_today()
	db_docs = []
	post_list = bs0bj.findAll("li")
	# scheme + host of the board (not used further in this function)
	domain = URL['url'].split('/')[0] + '//' + URL['url'].split('/')[2]

	# Parse and collect each post entry.
	for post in post_list:
		db_record = {}

		title = ""
		obj = post.find("div",{"class":"wr-subject"})
		# The second <span> carries a label that is prepended to the title.
		title = obj.find("span").find_next("span").get_text().strip()
		# Remove every <span> child so the <a> text below is the bare title.
		[s.extract() for s in obj('span')]
		title += " " + obj.find("a").get_text().strip()

		db_record.update({"url":obj.find("a").attrs["href"]})
		db_record.update({"title":title})
		db_record.update({"post":0})
		db_record.update({"date":today})
		db_record.update(tagging(URL, db_record['title']))

		print(db_record['title'])
		db_docs.append(db_record)

	return db_docs
Exemplo n.º 8
0
def Parsing_post_data(bs, post_url, URL):
	"""Parse a sumTit-layout post into [post_data, title, date].

	Sleeps 2s first to throttle requests.  Returns None when any element
	fails to parse (the bare ``except:`` is narrowed to ``Exception`` so
	KeyboardInterrupt/SystemExit still propagate).
	"""
	try:
		time.sleep(2)	# throttle to avoid overloading the server
		return_data = []
		post_data = {}
		domain = Domain_check(URL['url'])

		author = bs.find("div", {"class": "sumTit"}).find("h3").find("span").text.strip()
		# The <h3> text is "title + author"; strip the author to get the bare title.
		title = bs.find("div", {"class": "sumTit"}).find("h3").get_text(" ", strip = True).replace(author, "").strip()
		if author.find("관리자") != -1:
			author = "0"	# normalise admin posts to the sentinel author "0"
		date = bs.find("dl", {"class": "date"}).findAll("dd")[1].find("span").text.strip()
		date = date + " 00:00:00"
		date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
		post = bs.find("div", {"class": "tbRow clear"}).get_text(" ", strip = True)
		post = post_wash(post)	# strip redundant whitespace from the body text
		tag_done = tag.tagging(URL, title)
		img = 1

		post_data['title'] = title.upper()
		post_data['author'] = author.upper()
		post_data['date'] = date
		post_data['post'] = post.lower()
		post_data['tag'] = tag_done 	# slash-separated tag string, e.g. "tag1/tag2/tag3/"
		post_data['img'] = img
		post_data['url'] = post_url

		return_data.append(post_data)
		return_data.append(title)
		return_data.append(date)
		return return_data
	except Exception:
		return None
Exemplo n.º 9
0
def list_parse(driver, bs0bj, URL, page, latest_datetime=None):
    """Collect post records from a list page whose links open via onclick.

    First-run mode (latest_datetime is None): keep posts on/after the
    configured start date.  Renewal mode: keep posts newer than the last
    crawl, skipping the one whose title matches the last-seen post.
    """
    target = URL['info'].split('_')[1]
    start_datetime = startdate_dict[target]
    db_docs = []
    post_list = bs0bj.findAll("td", {"class": "list_loop_left"})
    # Rebuild the view-page base URL; the post uid is appended per post.
    domain = URL['url'].split('/')[0] + '//' + URL['url'].split('/')[2] + '/' + URL['url'].split('/')[3] + '/'\
    + URL['url'].split('/')[4] + '?mode=view&uid='

    for post in post_list:
        db_record = {}

        # The uid is the first quoted argument of the link's onclick handler.
        obj = post.find("a").attrs['onclick'].split("'")[1]

        db_record.update(content_parse(domain + obj))
        db_record.update(tagging(URL, db_record['title']))

        print(db_record['date'])
        # First crawl: keep posts on/after the start date.
        if db_record['date'] >= start_datetime  and \
          latest_datetime == None:
            db_docs.append(db_record)
        # Renewal crawl: keep posts newer than the last crawled one.
        elif latest_datetime != None and \
          db_record['date'] >= latest_datetime['recent_date'] and \
           db_record['title'] != latest_datetime['title']:
            db_docs.append(db_record)
        else:
            continue

    return db_docs
Exemplo n.º 10
0
def Parsing_post_data(bs, post_url, URL):
    """Parse a view_subject/view_contents page into [post_data, title, date]."""
    domain = Domain_check(URL['url'])

    title = bs.find("div", {"class": "view_subject"}).find("h5").get_text(" ", strip=True)
    author = bs.find("ul", {"class": "data"}).find("li").text.strip()
    # `now` is a module-level timestamp string; the crawl time stands in for the post date.
    date = str(datetime.datetime.strptime(now, "%Y-%m-%d %H:%M:%S"))
    body = post_wash(bs.find("div", {"class": "view_contents"}).get_text(" ", strip=True))

    post_data = {
        'title': title.upper(),
        'author': author.upper(),
        'date': date,
        'post': body.lower(),
        'tag': tag.tagging(URL, title),  # slash-separated tag string
        'img': 1,
        'url': post_url,
    }

    return [post_data, title, date]
Exemplo n.º 11
0
def list_parse(bs0bj, URL, page, latest_datetime=None):
    """Collect post records from a <td height="29"> table listing.

    First-run mode (latest_datetime is None): keep posts on/after the
    configured start date.  Renewal mode: keep posts newer than the last
    crawl, skipping the one whose title matches the last-seen post.
    """
    target = URL['info'].split('_')[1]
    start_datetime = startdate_dict[target]
    db_docs = []
    post_list = bs0bj.findAll("td", {"height": "29"})
    # Rebuild the board's base URL from the first five path segments.
    domain = URL['url'].split('/')[0] + '//' + URL['url'].split('/')[2] + '/'\
     + URL['url'].split('/')[3] + '/' + URL['url'].split('/')[4] + '/'

    for post in post_list:
        db_record = {}

        obj = post.find("a", {"class": "text12graylightlink"})
        db_record.update({"url": domain + obj.attrs['href']})
        db_record.update({"title": obj.get_text().strip()})

        # The date cell uses dots as separators; normalise them to dashes.
        obj = post.find("td", {"width": "70"}).get_text().strip()
        obj = obj.replace(".", "-")
        db_record.update({"date": obj})
        db_record.update({"post": 0})
        db_record.update(tagging(URL, db_record['title']))

        print(db_record['date'])
        # First crawl: keep posts on/after the start date.
        if db_record['date'] >= start_datetime  and \
          latest_datetime == None:
            db_docs.append(db_record)
        # Renewal crawl: keep posts newer than the last crawled one.
        elif latest_datetime != None and \
          db_record['date'] >= latest_datetime['recent_date'] and \
           db_record['title'] != latest_datetime['title']:
            db_docs.append(db_record)
        else:
            continue
    return db_docs
Exemplo n.º 12
0
def list_parse(bs0bj, URL, page, lastet_datetime=None):
    """Collect records from every <li>, skipping '[알림]' notice posts.

    NOTE(review): the parameter is spelled "lastet_datetime" (sic, for
    "latest"); renaming it would break keyword callers, so it is kept.

    First-run mode (lastet_datetime is None) takes every post; renewal
    mode stops at the first post whose title matches the last crawl.
    """
    today = get_today()
    db_docs = []
    post_list = bs0bj.findAll("li")
    # scheme + host of the board (not used further in this function)
    domain = URL['url'].split('/')[0] + '//' + URL['url'].split('/')[2]

    # Parse and collect each post entry.
    for post in post_list:
        db_record = {}

        title = ""
        obj = post.find("div", {"class": "wr-subject"})
        title += " " + obj.find("a").get_text().strip()
        if title.split(" ")[1] == '[알림]':
            # Skip board notices.
            continue
        print(title)
        db_record.update({"url": obj.find("a").attrs["href"]})
        db_record.update({"title": title})
        db_record.update({"post": 0})
        db_record.update({"date": today})
        db_record.update(tagging(URL, db_record['title']))

        print(db_record['title'])

        # First crawl: take every post.
        if lastet_datetime == None:
            db_docs.append(db_record)
        # Renewal crawl: keep posts until the previously-seen title appears.
        elif lastet_datetime != None and\
            db_record['title'] != lastet_datetime['title']:
            db_docs.append(db_record)
        else:
            break

    return db_docs
Exemplo n.º 13
0
def list_parse(driver, bs0bj, URL, page, latest_datetime = None):
	"""Collect post records from two stacked <table class="text"> listings.

	First-run mode (latest_datetime is None): keep posts on/after the
	configured start date.  Renewal mode: keep posts newer than the last
	crawl, skipping the one whose title matches the last-seen post.
	"""
	target = URL['info'].split('_')[1]
	start_datetime = startdate_dict[target]
	db_docs = []
	post_list = bs0bj.findAll("table",{"class":"text"})
	# Rows come from both tables on the page.
	post_list = post_list[0].findAll("tr") + post_list[1].findAll("tr")
	domain = URL['url'].split('/')[0] + '//' + URL['url'].split('/')[2] + '/' + URL['url'].split('/')[3] + '/'
	
	for post in post_list:
		db_record = {}
		try:
			# The post path is the first quoted argument of the row's onclick.
			url = domain + post.attrs['onclick'].split("'")[1]
		except:
			continue

		db_record.update(content_parse(url))
		db_record.update(tagging(URL, db_record['title']))

		print(db_record['date'])
		# First crawl: keep posts on/after the start date.
		if db_record['date'] >= start_datetime  and \
				latest_datetime == None:
			db_docs.append(db_record)
		# Renewal crawl: keep posts newer than the last crawled one.
		elif latest_datetime != None and \
				db_record['date'] >= latest_datetime['recent_date'] and \
					db_record['title'] != latest_datetime['title']:
			db_docs.append(db_record)		
		else:
			continue

	return db_docs
Exemplo n.º 14
0
def list_parse(driver, bs0bj, URL, page, latest_datetime=None):
    """Walk the board's <tr> rows and collect parsed post records.

    First-run mode (latest_datetime is None): keep posts on/after the
    configured start date.  Renewal mode: keep posts newer than the last
    crawl, excluding the post whose title matches the last-seen one.
    """
    start_datetime = startdate_dict[URL['info'].split('_')[1]]
    db_docs = []

    for row in bs0bj.findAll("tr"):
        try:
            link = row.find("a").attrs['href']
        except:
            continue  # row without a link (e.g. a header row)

        db_record = {}
        db_record.update(content_parse(link))
        db_record.update(tagging(URL, db_record['title']))

        print(db_record['date'])
        if latest_datetime is None:
            # First crawl: keep posts on/after the start date.
            if db_record['date'] >= start_datetime:
                db_docs.append(db_record)
        elif db_record['date'] >= latest_datetime['recent_date'] \
                and db_record['title'] != latest_datetime['title']:
            # Renewal crawl: keep posts newer than the last crawled one.
            db_docs.append(db_record)

    return db_docs
Exemplo n.º 15
0
def Parsing_post_data(bs, post_url, URL):
    """Parse a table-layout post (board_content) into [post_data, title, date]."""
    domain = Domain_check(URL['url'])

    header_cells = bs.find("tbody").find("tr").findAll("td")
    title = header_cells[1].get_text(" ", strip=True)
    author = "0"  # this board exposes no author
    raw_date = header_cells[3].text.strip() + " 00:00:00"
    date = str(datetime.datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S"))
    body = post_wash(bs.find("td", {"class": "board_content"}).get_text(" ", strip=True))

    post_data = {
        'title': title.upper(),
        'author': author.upper(),
        'date': date,
        'post': body.lower(),
        'tag': tag.tagging(URL, title),  # slash-separated tag string
        'img': 1,
        'url': post_url,
    }

    return [post_data, title, date]
Exemplo n.º 16
0
def Parsing_post_data(driver, post_url, URL):
	"""Load a post with Selenium, wait for the content table, and parse it.

	Returns [post_data dict, title, date]; sleeps 2s at the end to
	throttle requests.  The image placeholder value on this board is 3.
	"""
	return_data = []
	post_data = {}
	domain = Domain_check(URL['url'])

	driver.get(post_url)

	# The QnA board renders the content table with a class, others with an id.
	if URL['info'].split("_")[2] == 'qna':
		WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.protectTable")))
	else:
		WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.CSS_SELECTOR, "table#protectTable")))
	html = driver.page_source
	bs = BeautifulSoup(html, 'html.parser')
	
	title = bs.find("div", {"class": "subject"}).find("span", {"class": "b"}).text.strip()
	author = bs.find("div", {"class": "article_writer"}).find("a").text.strip()
	date = bs.find("div", {"class": "article_writer"}).find("span", {"class": "p11 ls0"}).text.strip()
	date = date + ":00"
	date = str(datetime.strptime(date, "%Y.%m.%d. %H:%M:%S"))
	post = bs.find("div", {"id": "user_contents"}).text.strip()
	post = post_wash(post)		# strip redundant whitespace from the body
	# This board always scrapes a boilerplate code prefix with the body; drop it.
	# NOTE(review): the original comment claims a 112-char prefix but the code
	# strips 67 — confirm the correct offset.
	post = post[67:].strip()	# keep body readable within the 3000-char limit
	tag_done = tag.tagging(URL, title)
	if bs.find("div", {"id": "user_contents"}).find("img") is None:
		img = 3
	else:
		img = bs.find("div", {"id": "user_contents"}).find("img")['src']		# take the post's first image
		if 1000 <= len(img):
			# Extremely long src (e.g. inline data) — treat as no image.
			img = 3
		else:
			if img.startswith("http://") or img.startswith("https://"):		# decide whether src is internal or external
				pass
			elif img.startswith("//"):
				img = "http:" + img
			else:
				img = domain + img
	if img != 3:
		# Drop images that fail the size check.
		if img_size(img):
			pass
		else:
			img = 3		

	# post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[],'fav_cnt':0,'view':0} shape
	post_data['title'] = title.upper()
	post_data['author'] = author.upper()
	post_data['date'] = date
	post_data['post'] = post.lower()
	post_data['tag'] = tag_done 	# slash-separated tag string, e.g. "tag1/tag2/tag3/"
	post_data['img'] = img
	post_data['url'] = post_url

	return_data.append(post_data)
	return_data.append(title)
	return_data.append(date)
	time.sleep(2)	# throttle to avoid overloading the server
	return return_data
Exemplo n.º 17
0
def Parsing_post_data(bs, post_url, URL):
    """Parse a subject-value/writer table post into [post_data, title, date].

    Also strips the volatile "viewNum" query parameter from post_url so
    the stored URL is stable across crawls.
    """
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    title = bs.find("td", {"class": "subject-value"}).get_text(" ", strip=True)
    author = bs.find("td", {"class": "writer"}).text.strip()
    if author.find("관리자") != -1:
        author = "0"  # normalise admin posts to the sentinel author "0"
    date = bs.find("td", {"class": "date"}).text
    if URL['info'] == "sj1_main_founded":
        # This board's date has no time part; noon is used as a placeholder.
        date = date + " 12:00:00"
    date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
    post = bs.find("tbody").find("div").get_text(" ", strip=True)
    post = post_wash(post)  # strip redundant whitespace from the body text
    tag_done = tag.tagging(URL, title)
    if bs.find("tbody").find("tr").find("img"):
        img = bs.find("tbody").find("tr").find("img")["src"]
        if 1000 <= len(img):
            # Extremely long src (e.g. inline data) — treat as no image.
            img = 1
        else:
            if img.startswith("http://") or img.startswith(
                    "https://") or img.startswith(
                        "data:"):  # absolute, protocol-relative, or data URI
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
    else:
        img = 1
    if img != 1:
        # Drop images that fail the size check.
        if img_size(img):
            pass
        else:
            img = 1

    # Remove the "&viewNum=<n>" query parameter.
    # NOTE(review): the while loop assumes another '&' parameter follows
    # viewNum; if it is the last parameter this loop raises IndexError.
    post_url_a = post_url.split("&viewNum=")[0]
    post_url_b = post_url.split("&viewNum=")[1]
    while post_url_b[0] != '&':
        post_url_b = post_url_b[1:]
    post_url = post_url_a + post_url_b

    # post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[], img:1, 'view':0} shape
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # slash-separated tag string, e.g. "tag1/tag2/tag3/"
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Exemplo n.º 18
0
def Parsing_post_data(post_url, URL):
    """Parse a Sejong encyclopedia page into [post_data, title, date].

    post_url arrives as "<url>$$<title>"; the two parts are split apart
    here.  The date is a fixed placeholder for this source.
    """
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])
    title = post_url.split("$$")[1]
    post_url = post_url.split("$$")[0]

    driver_post = URLparser(post_url)
    bs = BeautifulSoup(driver_post, 'html.parser')

    title = "세종대백과 :: " + title
    author = "0"
    date = "2019-01-01 12:00:00"  # fixed placeholder date for this source
    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    post = bs.find("div", {"class": "page group"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip redundant whitespace from the body text
    post = post.split("//<![CDATA")[0]  # drop inlined script content
    if bs.find("div", {"class": "page group"}).find("img") is None:
        img = 0
    else:
        try:
            img = bs.find("div", {
                "class": "page group"
            }).find("img")['src']  # take the post's first image
            if 1000 <= len(img):
                # Extremely long src (e.g. inline data) — treat as no image.
                img = 0
            else:
                if img.startswith("http://") or img.startswith(
                        "https://"):  # decide whether src is internal or external
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        except:
            img = 0
    if img != 0:
        # Drop images that fail the size check.
        if img_size(img):
            pass
        else:
            img = 0
    tag_done = tag.tagging(URL, title)

    # post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[], img:1, 'view':0} shape
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # slash-separated tag string, e.g. "tag1/tag2/tag3/"
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Exemplo n.º 19
0
def Parsing_post_data(bs, post_url, URL):
    """Parse an exhibition-style post into [post_data, title, date].

    Resolves the post image to an absolute URL when present.  Returns
    None if any part of the page fails to parse (the bare ``except:`` is
    narrowed to ``Exception`` so KeyboardInterrupt/SystemExit propagate).
    """
    try:
        return_data = []
        post_data = {}
        domain = Domain_check(URL['url'])

        title = bs.find("span", {
            "class": "col_blue"
        }).get_text(" ", strip=True)
        author = "0"  # this board exposes no author
        date = bs.find("dl", {
            "class": "explainInfoBx"
        }).find("dd").text.strip()
        date = date + " 00:00:00"
        date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
        post = bs.find("p", {"class": "tx"}).get_text(" ", strip=True)
        post = post_wash(post)  # strip redundant whitespace from the body text
        tag_done = tag.tagging(URL, title)
        if bs.find("div", {"class": "img"}).find("img") is None:
            img = 1
        else:
            img = bs.find("div", {
                "class": "img"
            }).find("img")['src']  # take the post's first image
            if 1000 <= len(img):
                # Extremely long src (e.g. inline data) — treat as no image.
                img = 1
            else:
                if img.startswith("http://") or img.startswith(
                        "https://"):  # decide whether src is internal or external
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        if img != 1:
            # Drop images that fail the size check.
            if img_size(img):
                pass
            else:
                img = 1

        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = date
        post_data['post'] = post.lower()
        post_data['tag'] = tag_done  # slash-separated tag string
        post_data['img'] = img
        post_data['url'] = post_url

        return_data.append(post_data)
        return_data.append(title)
        return_data.append(date)
        return return_data
    except Exception:
        return None
Exemplo n.º 20
0
def Parsing_post_data(bs, post_url, URL):
	"""Parse an event-style post (title/abstract layout) into [post_data, title, date].

	The third <time> element holds the event date; when fewer than three
	are present, today's date at midnight is used instead.
	"""
	now = datetime.datetime.now().strftime("%Y-%m-%d")
	return_data = []
	post_data = {}
	domain = Domain_check(URL['url'])

	title = bs.find("div", {"class": "title"}).find("h4").get_text(" ", strip = True)
	author = "0"
	dates = bs.find("div", {"data-role": "input"}).findAll("time")
	if len(dates) < 3:
		# No event date available; fall back to today at midnight.
		date = now
		date = date + " 00:00:00"
	else:
		# The date text contains a parenthesised segment (presumably the
		# weekday) between date and time; it is cut out here.
		date = dates[2].text.strip()
		date1 = date.split("(")[0].strip()
		date2 = date.split(")")[1].strip()
		date = date1 + " " + date2 + ":00"
		date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
	post = bs.find("div", {"class": "abstract"}).find("div", {"class": "text"}).get_text(" ", strip = True)
	post = post_wash(post)		# strip redundant whitespace from the body
	if bs.find("meta", {"property": "og:image"})['content'] is None:
		img = 1
	else:
		try:
			img = bs.find("meta", {"property": "og:image"})['content']
			if 1000 <= len(img):
				# Extremely long src (e.g. inline data) — treat as no image.
				img = 1
			else:
				if img.startswith("http://") or img.startswith("https://"):		# decide whether src is internal or external
					pass
				elif img.startswith("//"):
					img = "http:" + img
				else:
					img = domain + img
		except:
			img = 1
	if img != 1:
		# Drop images that fail the size check.
		if img_size(img):
			pass
		else:
			img = 1
	tag_done = tag.tagging(URL, title)

	# post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[], img:1, 'view':0} shape
	post_data['title'] = title.upper()
	post_data['author'] = author.upper()
	post_data['date'] = date
	post_data['post'] = post.lower()
	post_data['tag'] = tag_done 	# slash-separated tag string, e.g. "tag1/tag2/tag3/"
	post_data['img'] = img
	post_data['url'] = post_url

	return_data.append(post_data)
	return_data.append(title)
	return_data.append(date)
	return return_data
Exemplo n.º 21
0
def Parsing_post_data(bs, post_url, URL):
    """Parse a gallery-style post (gall_date / writing_view_box layout).

    Sleeps 1s first to throttle requests; returns [post_data, title, date].
    The image placeholder value on this board is 0.
    """
    time.sleep(1)  # throttle to avoid overloading the server
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    title = bs.find("span", {
        "class": "title_subject"
    }).get_text(" ", strip=True)
    author = "0"  # this parser does not extract the author
    date = bs.find("span", {"class": "gall_date"})['title']
    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    post = bs.find("div", {
        "class": "writing_view_box"
    }).get_text(" ", strip=True)
    post = post_wash(post)  # strip redundant whitespace from the body text
    if bs.find("div", {"class": "writing_view_box"}).find("img") is None:
        img = 0
    else:
        try:
            img = bs.find("div", {
                "class": "writing_view_box"
            }).find("img")['src']  # take the post's first image
            if 1000 <= len(img):
                # Extremely long src (e.g. inline data) — treat as no image.
                img = 0
            else:
                if img.startswith("http://") or img.startswith(
                        "https://"):  # decide whether src is internal or external
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        except:
            img = 0
    if img != 0:
        # Drop images that fail the size check.
        if img_size(img):
            pass
        else:
            img = 0
    tag_done = tag.tagging(URL, title)

    # post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[], img:1, 'view':0} shape
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # slash-separated tag string, e.g. "tag1/tag2/tag3/"
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Exemplo n.º 22
0
def Parsing_post_data(post_url, URL):
    """Scrape the uDream job-board listing and return new post records.

    Logs into uDream, reads every <tr> in the table, and keeps only the
    posts whose application deadline is after the 'sj4' cutoff date.

    Bug fixed: the original compared an undefined name ``date`` against
    the cutoff (NameError on the first row); the post's parsed deadline
    is now used for the comparison and the log line.
    """
    post_data_prepare = []
    end_date = date_cut_dict['sj4']  # cutoff date string

    # Log in to uDream (returns a requests session).
    s = udream.login()

    page = s.get(post_url).text
    bs = BeautifulSoup(page, "html.parser")

    posts = bs.find("tbody").findAll("tr")  # one <tr> per posting
    for post in posts:
        # Cells: [title+author, post1, post2, post3, deadline, link]
        post_infoes = post.findAll("td")

        post_data = {}
        title = post_infoes[0].get_text(" ", strip=True)
        author = post_infoes[0].find("div").text
        if author.find("관리자") != -1:
            author = "0"  # normalise admin posts to the sentinel author "0"
        end_data = post_infoes[4].text + " 00:00:00"
        deadline = datetime.datetime.strptime(end_data, "%Y-%m-%d %H:%M:%S")
        post = post_infoes[1].get_text(
            " ", strip=True) + post_infoes[2].get_text(
                " ", strip=True) + post_infoes[3].get_text(
                    " ", strip=True) + "~" + post_infoes[4].get_text(
                        " ", strip=True)
        post = post_wash(post)
        tag_done = tag.tagging(URL, title)
        post = post[:200]  # cap the stored body at 200 characters
        img = 1
        url = post_infoes[5].find("a")["href"]

        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = str(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        post_data['end_data'] = deadline
        post_data['post'] = post.upper()
        post_data[
            'tag'] = tag_done  # slash-separated tag string, e.g. "tag1/tag2/"
        post_data['img'] = img
        post_data['url'] = url

        print(deadline, "::::", title)

        # Skip posts whose deadline is on/before the cutoff; keep the rest.
        if str(deadline) <= end_date:
            continue
        else:
            post_data_prepare.append(post_data)
    s.close()

    return post_data_prepare
Exemplo n.º 23
0
def Parsing_post_data(bs, post_url, URL):
    """Parse one already-fetched post page into ``[post_data, title, date]``.

    bs       -- BeautifulSoup of the post page
    post_url -- canonical URL of the post (stored in the record)
    URL      -- crawl-target descriptor dict; URL['url'] supplies the domain
    """
    # Rebuild "scheme//host" from the configured board URL.
    url_parts = URL['url'].split('/')
    domain = url_parts[0] + '//' + url_parts[2]

    title = bs.find("span", {"class": "on"}).get_text(" ", strip=True)

    info_rows = bs.find("table", {
        "class": "basic-table input-table"
    }).findAll("tr")

    author = info_rows[1].find("td").text.strip()
    if author.find("관리자") != -1:  # anonymize administrator authors
        author = "0"

    # The 4th row holds "start ~ end"; keep only the end date part.
    raw_date = info_rows[3].find("td").text.strip()[:23].split('~')[1].strip()
    date = str(
        datetime.datetime.strptime(raw_date + " 00:00:00",
                                   "%Y.%m.%d %H:%M:%S"))

    post = post_wash(
        bs.find("ul", {"class": "summary-info"}).get_text(" ", strip=True))
    tag_done = tag.tagging(URL, title)

    poster_img = bs.find("div", {"class": "poster"}).find("img")
    if poster_img is None:
        img = 1
    else:
        img = poster_img['src']  # first image of the post
        if len(img) >= 1000:
            img = 1  # implausibly long src: treat as no image
        elif img.startswith("//"):
            img = "http:" + img  # protocol-relative link
        elif not (img.startswith("http://") or img.startswith("https://")):
            img = domain + img  # relative path -> absolute
    if img != 1 and not img_size(img):
        img = 1  # image too small / unreachable: drop it

    post_data = {}
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # slash-separated tag string ("tag1/tag2/...")
    post_data['img'] = img
    post_data['url'] = post_url

    return [post_data, title, date]
Exemplo n.º 24
0
def list_parse(bs0bj, URL, page, lastet_datetime=None):
    """Parse a board list page and collect records that pass the date filter.

    bs0bj           -- BeautifulSoup of the list page
    URL             -- crawl-target descriptor dict
    page            -- page number (unused here; kept for the caller's interface)
    lastet_datetime -- None for a first crawl, or a dict with the most recent
                       crawled post's 'recent_date' and 'title' for renewal.
    Returns a list of record dicts.
    """
    target = URL['info'].split('_')[1]
    start_datetime = startdate_dict[target]
    db_docs = []
    url_parts = URL['url'].split('/')
    domain = url_parts[0] + '//' + url_parts[2]

    for item in bs0bj.findAll("li"):
        # Skip pinned notice posts that only appear on page 1.
        if item.find("span", {"class": "wr-icon wr-notice"}) is not None:
            continue
        db_record = {}
        try:
            href = item.find("div", {
                "class": "wr-subject"
            }).find("a").attrs["href"]
        except Exception:
            # Row without a subject link: stop and return what we have so far.
            return db_docs

        db_record.update(content_parse(domain, href))
        if "class" in db_record:
            db_record.update(
                tagging(URL, db_record['title'] + db_record['class']))
        else:
            db_record.update(tagging(URL, db_record['title']))

        print(db_record['date'])
        if lastet_datetime is None:
            # First crawl: keep anything newer than the configured start time.
            if db_record['date'] >= start_datetime:
                db_docs.append(db_record)
        elif (db_record['date'] >= lastet_datetime['recent_date']
              and db_record['title'] != lastet_datetime['title']):
            # Renewal crawl: keep posts newer than the last one we saw.
            db_docs.append(db_record)

    return db_docs
Exemplo n.º 25
0
def Parsing_post_data(bs, post_url, URL):
    """Parse one already-fetched post page into ``[post_data, title, date]``.

    bs       -- BeautifulSoup of the post page
    post_url -- canonical URL of the post (stored in the record)
    URL      -- crawl-target descriptor dict; URL['url'] supplies the domain
    """
    return_data = []
    post_data = {}
    # fixed: 'domain' was undefined below (NameError inside the try, silently
    # swallowed, so images were always dropped); derive it like sibling parsers.
    domain = Domain_check(URL['url'])

    title = bs.find("div", {
        "class": "prop article bt1"
    }).find("div", {
        "class": "subject"
    }).get_text(" ", strip=True)
    date = bs.find("span", {"class": "date"}).text
    date = date + " 00:00:00"
    try:
        # fixed: strftime() on a str always raised TypeError, so the page date
        # was never used; strptime parses it (same normalized form as siblings).
        date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    except:
        # Unparseable page date: fall back to the crawl time.
        date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    post = bs.find("div", {"class": "phrase"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip redundant whitespace
    try:
        img = bs.find("div", {
            "class": "phrase"
        }).find("img")['src']  # first image of the post
        if 1000 <= len(img):
            img = 1  # implausibly long src: treat as no image
        else:
            if img.startswith("http://") or img.startswith(
                    "https://"):  # already an absolute link
                pass
            elif img.startswith("//"):
                img = "http:" + img  # protocol-relative link
            else:
                img = domain + img  # relative path -> absolute
        if img != 1:
            if img_size(img):
                pass
            else:
                img = 1  # image too small / unreachable: drop it
    except:
        img = 1  # no image tag at all
    tag_done = tag.tagging(URL, title)

    #post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[], img:1, 'view':0} format
    post_data['title'] = title.upper()
    post_data['author'] = "0"
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # slash-separated tag string ("tag1/tag2/...")
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Exemplo n.º 26
0
def Parsing_post_data(bs, post_url, URL):
    """Parse one already-fetched post page into ``[post_data, title, date]``.

    bs       -- BeautifulSoup of the post page
    post_url -- canonical URL of the post (stored in the record)
    URL      -- crawl-target descriptor dict; URL['url'] supplies the domain
    """
    domain = Domain_check(URL['url'])

    title = bs.find("div", {
        "class": "col-lg-9 title"
    }).find("span").get_text(" ", strip=True)

    author = bs.find("span", {"name": "WRITENAME"}).text.strip()
    if author.find("관리자") != -1:  # anonymize administrator authors
        author = "0"

    raw_date = bs.find("span", {"name": "wdate"}).text
    date = str(
        datetime.datetime.strptime(raw_date + " 00:00:00",
                                   "%Y-%m-%d %H:%M:%S"))

    post = post_wash(
        bs.find("div", {"class": "form-group"}).get_text(" ", strip=True))
    tag_done = tag.tagging(URL, title)

    # Use the attached image URL if present; sentinel 1 means "no image".
    img_tag = bs.find("img", {"align": "absmiddle"})
    if img_tag is None:
        img = 1
    else:
        img = domain + img_tag['src']
        if len(img) >= 1000:
            img = 1  # implausibly long src: treat as no image
        elif img.startswith("//"):
            img = "http:" + img  # protocol-relative link
        elif not (img.startswith("http://") or img.startswith("https://")):
            img = domain + img  # still relative -> prefix the domain again
    if img != 1 and not img_size(img):
        img = 1  # image too small / unreachable: drop it

    post_data = {}
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # slash-separated tag string ("tag1/tag2/...")
    post_data['img'] = img
    post_data['url'] = post_url

    return [post_data, title, date]
Exemplo n.º 27
0
def Parsing_post_data(bs, post_url, URL):
	"""Parse one already-fetched post page into ``[post_data, title, date]``.

	bs       -- BeautifulSoup of the post page
	post_url -- canonical URL of the post (stored in the record)
	URL      -- crawl-target descriptor dict; URL['url'] supplies the domain
	"""
	return_data = []
	post_data = {}
	domain = Domain_check(URL['url'])


	# Layout-table markup: the header lives in the 4th table of the centered div.
	tables = bs.find("div", {"align": "center"}).findAll("table")
	title_table = tables[3]


	tds = title_table.findAll("td")

	title = tds[1].get_text(" ", strip = True)
	author = "0"
	date = tds[0].text.strip()
	# NOTE(review): unlike sibling parsers no " 00:00:00" is appended here —
	# presumably this page supplies a full timestamp; confirm against the site.
	date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
	post = bs.find("td", {"class": "sf_contents"}).get_text(" ", strip = True)
	post = post_wash(post)		#strip redundant whitespace from the post body
	if bs.find("td", {"class": "sf_contents"}).find("img") is None:
		img = 1
	else:
		img = bs.find("td", {"class": "sf_contents"}).find("img")['src']		#take the post's first image
		if 1000 <= len(img):
			img = 1
		else:
			if img.startswith("http://") or img.startswith("https://"):		# decide whether the img link is internal or external
				pass
			elif img.startswith("//"):
				img = "http:" + img
			else:
				img = domain + img
	if img != 1:
		if img_size(img):
			pass
		else:
			img = 1
	tag_done = tag.tagging(URL, title)

	#post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[], img:1, 'view':0} format
	post_data['title'] = title.upper()
	post_data['author'] = author.upper()
	post_data['date'] = date
	post_data['post'] = post.lower()
	post_data['tag'] = tag_done 	# slash-separated tag string ("tag1/tag2/.../")
	post_data['img'] = img
	post_data['url'] = post_url

	return_data.append(post_data)
	return_data.append(title)
	return_data.append(date)
	return return_data
Exemplo n.º 28
0
def Parsing_post_data(bs, post_url, URL):
	"""Parse a contest-detail page into ``[post_data, title, date]``.

	Note: this parser uses 7 (not 1) as its "no image" sentinel value.
	"""
	domain = Domain_check(URL['url'])

	title = bs.find("div", {"class": "body contest-detail"}).find("span", {"class": "title"}).get_text(" ", strip = True)

	author = bs.find("div", {"class": "contest-overview"}).find("tbody").find("tr").text.strip()
	if author.find("관리자") != -1:	# anonymize administrator authors
		author = "0"

	# The "접수기간" row holds "start ~ end"; characters from index 13 on are the end date.
	raw_date = bs.find("th", text="접수기간").parent.find("td").text.strip()
	date = str(datetime.datetime.strptime(raw_date[13:] + " 00:00:00", "%Y-%m-%d %H:%M:%S"))

	post = post_wash(bs.find("div", {"class": "info-cont"}).get_text(" ", strip = True))
	tag_done = tag.tagging(URL, title)

	poster = bs.find("img", {"id": "poster"})
	if poster is None:
		img = 7
	else:
		try:
			img = poster['src']		# first image of the post
			if len(img) >= 1000:
				img = 7		# implausibly long src: treat as no image
			elif img.startswith("//"):
				img = "http:" + img		# protocol-relative link
			elif not (img.startswith("http://") or img.startswith("https://")):
				img = domain + img		# relative path -> absolute
		except:
			img = 7
	if img != 7 and not img_size(img):
		img = 7		# image too small / unreachable: drop it

	post_data = {}
	post_data['title'] = title.upper()
	post_data['author'] = author.upper()
	post_data['date'] = date
	post_data['post'] = post.lower()
	post_data['tag'] = tag_done		# slash-separated tag string ("tag1/tag2/.../")
	post_data['img'] = img
	post_data['url'] = post_url

	return [post_data, title, date]
Exemplo n.º 29
0
def Parsing_post_data(bs, post_url, URL):
	"""Parse one post into ``[post_data, title, date]``.

	The last 8 characters of *post_url* encode the post date ("YY-MM-DD");
	they are stripped off to get the real page URL, which is re-fetched
	here (the *bs* argument is deliberately replaced by the fresh page).
	"""
	return_data = []
	post_data = {}
	domain = Domain_check(URL['url'])

	# assumes post_url ends with "YY-MM-DD" — TODO confirm against the list parser
	date = post_url[-8:]
	url = post_url.replace(date, "")
	driver = URLparser_UTF8(url)
	bs = BeautifulSoup(driver, 'html.parser')

	title = bs.find("div", {"id": "contents"}).find("div", {"class": "vi_subj"}).get_text(" ", strip = True)
	author = "0"
	date = "20" + date + " 00:00:00"
	date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
	post = bs.find("div", {"class": "vi_cont"}).get_text(" ", strip = True)
	post = post_wash(post)		#strip redundant whitespace from the post body
	if bs.find("div", {"class": "vi_cont"}).find("img") is None:
		img = 1
	else:
		img = bs.find("div", {"class": "vi_cont"}).find("img")['src']		#take the post's first image
		if 1000 <= len(img):
			img = 1
		else:
			if img.startswith("http://") or img.startswith("https://"):		# decide whether the img link is internal or external
				pass
			elif img.startswith("//"):
				img = "http:" + img
			else:
				img = domain + img
	if img != 1:
		if img_size(img):
			pass
		else:
			img = 1
	tag_done = tag.tagging(URL, title)

	#post_data = {'title': ,'author': ,'date': ,'post': ,'tag':[], img:1, 'view':0} format
	post_data['title'] = title.upper()
	post_data['author'] = author
	post_data['date'] = date
	post_data['post'] = post.lower()
	post_data['tag'] = tag_done 	# slash-separated tag string ("tag1/tag2/.../")
	post_data['img'] = img
	post_data['url'] = post_url

	return_data.append(post_data)
	return_data.append(title)
	return_data.append(date)
	return return_data
Exemplo n.º 30
0
def Parsing_post_data(post_url, URL):
    """Crawl the 'sj5' udream board list page and return post records.

    Logs into udream, parses every <tr> row of the listing table, and keeps
    only posts dated strictly after the configured cut-off date.

    post_url -- list page URL to fetch
    URL      -- crawl-target descriptor dict; URL['post_url'] is the detail
                URL prefix, and the dict is passed through to tagging.
    Returns a list of post_data dicts.
    """
    collected = []
    cutoff = date_cut_dict['sj5']  # crawl cut-off date string

    # Log in to udream and reuse the authenticated session for the fetch.
    session = udream.login()

    soup = BeautifulSoup(session.get(post_url).text, "html.parser")

    for row in soup.find("tbody").findAll("tr"):  # one <tr> per post
        cells = row.findAll("td")

        title = cells[0].get_text(" ", strip=True)
        author = row.find("div").text.strip()
        if author.find("관리자") != -1:  # anonymize administrator authors
            author = "0"
        date = str(
            datetime.datetime.strptime(cells[3].text + " 00:00:00",
                                       "%Y-%m-%d %H:%M:%S"))
        phrase = post_wash(cells[1].text + cells[2].get_text(" ", strip=True))
        tag_done = tag.tagging(URL, title)
        # The detail link's href is the 4th double-quoted token of the <a> tag.
        url_num = str(cells[4].find("a")).split('"')[3]
        url = URL['post_url'] + url_num

        post_data = {}
        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = date
        post_data['post'] = phrase.lower()
        post_data['tag'] = tag_done  # slash-separated tag string ("tag1/tag2/...")
        post_data['img'] = 1
        post_data['url'] = url

        print(date, "::::", title)

        # Keep only posts strictly newer than the cut-off date.
        if str(date) > cutoff:
            collected.append(post_data)
    session.close()

    return collected