Example #1
import datetime

from bs4 import BeautifulSoup

import tag     # project tagging module, assumed importable
import udream  # project login helper, assumed importable
# post_wash() and date_cut_dict are further project helpers assumed in scope.


def Parsing_post_data(post_url, URL):
    post_data_prepare = []
    end_date = date_cut_dict['sj4']  # cut-off date for this board

    # log in to udream; returns an authenticated requests Session
    s = udream.login()

    page = s.get(post_url).text
    bs = BeautifulSoup(page, "html.parser")

    posts = bs.find("tbody").findAll("tr")  # one <tr> per post
    for post in posts:
        # cells come in the order [title, author, post1, post2, date]
        post_infoes = post.findAll("td")  # the row's <td> cells

        post_data = {}
        title = post_infoes[0].get_text(" ", strip=True)
        author = post_infoes[0].find("div").text
        if author.find("관리자") != -1:  # mask the "administrator" account
            author = "0"
        end_data = post_infoes[4].text + " 00:00:00"
        phrase = (post_infoes[1].get_text(" ", strip=True)
                  + post_infoes[2].get_text(" ", strip=True)
                  + post_infoes[3].get_text(" ", strip=True)
                  + "~" + post_infoes[4].get_text(" ", strip=True))
        phrase = post_wash(phrase)  # normalize whitespace / strip noise
        tag_done = tag.tagging(URL, title)
        phrase = phrase[:200]  # cap the stored body at 200 characters
        img = 1
        url = post_infoes[5].find("a")["href"]

        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = str(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        post_data['end_data'] = datetime.datetime.strptime(
            end_data, "%Y-%m-%d %H:%M:%S")
        post_data['post'] = phrase.upper()
        # tag_done is a "/"-joined tag string, e.g. "tag1/tag2/tag3/tag4/.../"
        post_data['tag'] = tag_done
        post_data['img'] = img
        post_data['url'] = url

        print(end_data, "::::", title)

        # skip posts dated on or before end_date; keep newer ones
        if end_data <= end_date:
            continue
        post_data_prepare.append(post_data)
    s.close()

    return post_data_prepare
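
All four examples lean on the same project helpers that are not shown here: udream.login(), post_wash(), tag.tagging(), date_cut_dict, and (in Example #4) Domain_check(). To exercise a snippet in isolation, drop its udream/tag import lines and use stand-ins along these lines; every body below is an assumption for testing, not the project's real implementation.

import re
import types

import requests

# Per-board cut-off dates; keys match the examples, values are placeholders.
date_cut_dict = {'sj4': '2020-01-01 00:00:00', 'sj5': '2020-01-01 00:00:00'}

# udream.login() is assumed to return an authenticated requests.Session;
# this stub hands back an anonymous one.
udream = types.SimpleNamespace(login=requests.Session)

# tag.tagging(URL, title) is assumed to return a "/"-joined tag string.
tag = types.SimpleNamespace(tagging=lambda URL, title: "tag1/tag2/")

def post_wash(phrase):
    # Collapse whitespace runs; the real helper may do more cleanup.
    return re.sub(r"\s+", " ", phrase).strip()

def Domain_check(url):
    # Keep only the scheme and host, e.g. "https://example.com".
    return "/".join(url.split("/")[:3])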
Example #2
import datetime

from bs4 import BeautifulSoup

import tag     # project tagging module, assumed importable
import udream  # project login helper, assumed importable
# post_wash() and date_cut_dict are further project helpers assumed in scope.


def Parsing_post_data(post_url, URL):
    post_data_prepare = []
    end_date = date_cut_dict['sj5']  # cut-off date for this board

    # log in to udream; returns an authenticated requests Session
    s = udream.login()

    page = s.get(post_url).text
    bs = BeautifulSoup(page, "html.parser")

    posts = bs.find("tbody").findAll("tr")  # one <tr> per post
    for post in posts:
        post_infoes = post.findAll("td")  # the row's <td> cells

        post_data = {}
        title = post_infoes[0].get_text(" ", strip=True)
        author = post.find("div").text.strip()
        if author.find("관리자") != -1:  # mask the "administrator" account
            author = "0"
        date = post_infoes[3].text + " 00:00:00"
        date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
        phrase = post_infoes[1].text + post_infoes[2].get_text(" ", strip=True)
        phrase = post_wash(phrase)
        tag_done = tag.tagging(URL, title)
        img = 1
        # grab the second quoted attribute value (the post number) from the
        # serialized <a> tag
        url_num = str(post_infoes[4].find("a")).split('"')[3]
        url = URL['post_url'] + url_num

        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = date
        post_data['post'] = phrase.lower()
        # tag_done is a "/"-joined tag string, e.g. "tag1/tag2/tag3/tag4/.../"
        post_data['tag'] = tag_done
        post_data['img'] = img
        post_data['url'] = url

        print(date, "::::", title)

        # skip posts dated on or before end_date; keep newer ones
        if date <= end_date:
            continue
        post_data_prepare.append(post_data)
    s.close()

    return post_data_prepare
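
Deriving url_num from str(...).split('"')[3] in Example #2 only holds while the <a> tag keeps exactly that attribute order; if the markup gains or loses an attribute, the index shifts silently. A sturdier sketch reads the attribute by name instead. The onclick pattern mirrors Examples #3 and #4 and is an assumption about this board's markup:

def extract_post_num(cell):
    # Pull the post number out of the cell's <a> tag by attribute access
    # rather than by its position in the serialized tag.
    a_tag = cell.find("a")
    if a_tag is None:
        return None
    onclick = a_tag.get("onclick", "")
    parts = onclick.split("'")
    return parts[1] if len(parts) > 1 else None

# usage inside the loop above (hypothetical):
# url = URL['post_url'] + extract_post_num(post_infoes[4])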
Example #3
from bs4 import BeautifulSoup

import udream  # project login helper, assumed importable


def Parsing_list_url(URL, page_url):
    List = []

    # log in to udream; returns an authenticated requests Session
    s = udream.login()

    page = s.get(page_url).text
    bs = BeautifulSoup(page, "lxml")  # html.parser fails on this page, so use lxml

    # collect the URL of every post in the list
    posts = bs.findAll("tr", {"onmouseover": "hctrOn(this)"})
    for post in posts:
        num = post.find("a")["onclick"]

        # the post number is the first single-quoted argument of the onclick handler
        post_num = num.split("'")[1]
        post_page = URL['post_url'] + post_num
        List.append(post_page)
    s.close()

    return List
Example #4
from bs4 import BeautifulSoup

import udream  # project login helper, assumed importable
# Domain_check() is a further project helper assumed in scope.


def Parsing_list_url(URL, page_url):
    List = []
    domain = Domain_check(URL['url'])  # computed but not used below

    # log in to udream; returns an authenticated requests Session
    s = udream.login()

    page = s.get(page_url).text
    bs = BeautifulSoup(page, "html.parser")

    # collect the URL of every post in the list
    posts = bs.find("table", {
        "class": "table b-t b-light"
    }).find("tbody").findAll("tr")
    for post in posts:
        num = post.find("div")["onclick"]

        # the post number is the first single-quoted argument of the onclick handler
        post_num = num.split("'")[1]
        post_page = URL['post_url'] + post_num
        List.append(post_page)
    s.close()

    return List
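
Taken together, the two function families form one crawl: Parsing_list_url collects the post URLs for a board page, and Parsing_post_data turns each fetched page into records. A minimal driver under those assumptions, with placeholder URL values, might look like:

# Hypothetical driver wiring the two parsers together; both URL values
# and page_url are placeholders for the real board endpoints.
URL = {
    'url': 'https://udream.example.com/board/',
    'post_url': 'https://udream.example.com/board/view?no=',
}
page_url = URL['url'] + '?page=1'

for post_url in Parsing_list_url(URL, page_url):
    for record in Parsing_post_data(post_url, URL):
        print(record['date'], "::::", record['title'], record['url'])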