Example No. 1
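All seven examples share the same module-level dependencies. They also call project helpers (utils.sess, utils.print_log, mod_char, mod_title, mod_reply, mod_user_id, mod_date, mod_text, touch_article) that are defined elsewhere in each repository and are not shown here. A minimal sketch of the imports the snippets appear to assume:

import random
import re
import time
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup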
def get_article(url, subject, tears=15, verbose=False):
    base_url = 'http://www.ddanzi.com/index.php?m=1&document_srl='

    # Fetch the HTML from the site
    s = utils.sess('http://www.ddanzi.com/')
    resp = s.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    table = soup.select('table > tbody')[1]
    articles_total = table.select('tr')
    # Remove pinned notice posts; keep every row that is not flagged as a notice
    articles = []
    for article in articles_total:
        if 'notice' in (article.get('class') or []):
            continue
        articles.append(article)
    utils.print_log(verbose, "articles cnt", len(articles))
    # Return an empty DataFrame when there are no articles
    if len(articles) == 0:
        s.close()
        return pd.DataFrame()

    # Elements to extract from each article
    a_list = []
    # print(len(articles))
    for a in articles:
        title = mod_char(a.select_one('td.title > a').text, 'for_title')
        # print(title)
        article_id = re.search(r'(\d{9})', a.find('a')['href']).group()
        # print(article_link)
        article_link = base_url + article_id
        try:  # a NoneType error is raised when the post has no replies
            reply_num = mod_reply(a.find('span', {'class': 'talk'}).text)
        except AttributeError:
            reply_num = '0'
        # Open the article page to get its date
        cont = BeautifulSoup(s.get(article_link).text, 'html.parser')
        # For the 'touching' subject, skip to the next article unless it qualifies
        if subject == 'touching':
            yn = touch_article(cont, tears)
            if not yn:
                continue
        temp = cont.find('div', {'class': 'right'})
        user_id = mod_char(temp.find('a').text)
        view_num = cont.find('span', {'class': 'read'}).text
        date = cont.find('p', {'class': 'time'}).text
        if len(date) < 9:  # a time-only value means the post is from today
            date = str(datetime.now()).split(' ')[0] + ' ' + date
        else:
            date = date.replace('.', '-')

        content = ''
        # Build a row of the extracted fields
        row = [
            title, date, article_id, user_id, article_link, content,
            reply_num, view_num
        ]
        a_list.append(row)
        utils.print_log(verbose, "article line 1", row)
        time.sleep(random.randint(2, 7) / 3)

    if len(a_list) == 0:  # return an empty DF when no article qualified ('touching' subject)
        s.close()
        return pd.DataFrame()

    s.close()  # close Session
    # Build the result DataFrame
    result = pd.DataFrame(a_list)
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    # Exclude articles posted by '펌쟁이'
    result = result[result['member_id'] != '펌쟁이']
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'])
    result = result.set_index('article_id')

    return result
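Each variant has the same signature and returns a DataFrame with identical columns, so usage is the same for every site. A minimal sketch; the board-list URL below is a hypothetical placeholder, not a verified endpoint:

# Hypothetical call; replace the URL with a real board-list page.
df = get_article('http://www.ddanzi.com/index.php?mid=free&page=1',
                 subject='touching', tears=15, verbose=True)
if not df.empty:
    print(df[['title', 'member_id', 'reply_num', 'view_num']].head())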
Example No. 2
def get_article(url, subject, tears=15, verbose=False):
    base_url = 'http://starboard.kr/todayhumor'
    search_url = 'http://starboard.kr/conn/board/search'
    # Fetch the HTML
    s = utils.sess('http://starboard.kr/')
    payload = {'search_text': url}
    resp = s.post(search_url, data=payload)
    soup = BeautifulSoup(resp.text, 'lxml')
    articles = soup.find_all('div', attrs={'class': 'ItemContent Discussion'})
    utils.print_log(verbose, "articles cnt", len(articles))
    # Return early when there are no articles
    if len(articles) == 0:
        s.close()
        return pd.DataFrame()

    a_list = []
    for a in articles:
        title = mod_title(
            a.find('div', attrs={
                'class': 'Title'
            }).find('a').text)
        utils.print_log(verbose, "1 article title", title)
        try:
            article_link = a.find('div', attrs={
                'class': 'Title'
            }).find('a')['href']
            utils.print_log(verbose, "2 article link", article_link)
        except (AttributeError, TypeError):
            s.close()
            return pd.DataFrame()
        # print(article_link)
        article_id = re.search(r's_no=(\d+)', article_link).group(1)
        utils.print_log(verbose, "3 article id", article_id)
        date = a.find('time')['datetime']
        utils.print_log(verbose, "4 article date", date)
        content = ''
        try:
            reply_num = re.search(r'\[([0-9]+)\]', title).group(1)
        except:
            reply_num = '0'
        utils.print_log(verbose, "5 article reply cnt", reply_num)
        view_num = a.select_one(
            'div.Meta.Meta-Discussion > span.MItem.MCount.ViewCount').text
        utils.print_log(verbose, "6 article view cnt", view_num)
        user_id = ''
        # For the 'touching' subject, skip to the next article unless it qualifies
        if subject == 'touching':
            yn = touch_article(s, article_link, tears)
            if not yn:
                continue
        # Making the list
        row = [
            title, date, article_id, user_id, article_link, content,
            reply_num, view_num
        ]
        a_list.append(row)
        utils.print_log(verbose, "article line 1", row)
        time.sleep(random.randint(17, 41))

    if len(a_list) == 0:  # return an empty DF when no article qualified ('touching' subject)
        s.close()
        return pd.DataFrame()

    s.close()
    result = pd.DataFrame(a_list)
    # munging of the dataframe
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'])
    result = result.set_index('article_id')

    return result
Example No. 3
def get_article(url, subject, tears=15, verbose=False):
    base_url = 'https://dvdprime.com/g2/bbs/board.php?bo_table=comm&wr_id='
    # Fetch the HTML
    s = utils.sess('https://dvdprime.com/')
    resp = s.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    # Extracting articles from the html
    articles = soup.find_all('a', {'class': 'list_subject_a'})
    utils.print_log(verbose, "articles cnt", len(articles))
    if len(articles) == 0:
        s.close()
        return pd.DataFrame()

    # Extracting elements from articles
    a_list = []
    for a in articles:
        try:
            title = a.find('span', {'class': 'list_subject_span_pc'}).text
        except AttributeError:
            continue
        if title.find('알림]') != -1:
            # Skip notice posts
            continue
        # print(title)
        article_id = re.search(r'(\d{8})', a['href']).group()
        try:
            reply_num = a.select_one(
                'span > span > span > span.list_comment_num_a > em').text
        except AttributeError:
            reply_num = '0'
        article_link = base_url + article_id
        # Fetch the article page
        content = BeautifulSoup(s.get(article_link).text, 'html.parser')
        # For the 'touching' subject, skip to the next article unless it qualifies
        if subject == 'touching':
            yn = touch_article(content, tears)
            if not yn:
                continue
        user_id = mod_user_id(content.find('span', {'class': 'member'}).text)
        view_num = re.search(r'[0-9]+',
                             content.find('div', {
                                 'id': 'view_hit'
                             }).text).group()
        # Gathering the content of each article
        date = mod_date(content.find('div', {'id': 'view_datetime'}).text)
        content = ''
        # Making the list
        row = [
            title, date, article_id, user_id, article_link, content,
            reply_num, view_num
        ]
        a_list.append(row)
        utils.print_log(verbose, "article line 1", row)
        time.sleep(random.randint(2, 7) / 3)

    if len(a_list) == 0:  # return an empty DF when no article qualified ('touching' subject)
        s.close()
        return pd.DataFrame()

    s.close()
    result = pd.DataFrame(a_list)
    # munging of the dataframe
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'])

    return result
Example No. 4
def get_article(url, subject, tears=15, verbose=False):
    base_url = 'http://m.ruliweb.com/community/board/300148/read/'
    # Fetch the HTML
    s = utils.sess(base_url)
    resp = s.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    # Extracting articles from the html
    articles = soup.select('tr.table_body')
    articles = [x for x in articles if len(x.get('class')) < 2]  # exclude notice/best rows
    utils.print_log(verbose, "articles cnt", len(articles))
    # Return early when the board shows the dynamic '결과없음' (no results) row
    if len(articles) == 0 or (articles[0].find('strong') is not None
                              and articles[0].find('strong').text == '결과없음'):
        s.close()
        return pd.DataFrame()

    # Extracting elements from articles
    a_list = []
    for a in articles:
        title = a.find_all('a', {'class': 'subject_link'})[0].text.strip()
        user_id = mod_user_id(a.find_all('span', {'class': 'writer'})[0].text)
        article_link = a.find_all('a',
                                  {'class': 'subject_link'})[0].get('href')
        # print(article_link)
        article_id = re.search(r'(\d{8})', article_link).group()
        article_link = base_url + article_id  # rebuild the link so the search term is dropped
        reply_num = mod_reply(a)
        view_num = re.search(r'[0-9]+',
                             a.find('span', {
                                 'class': 'hit'
                             }).text).group()
        # Gathering the content of each article
        resp = s.get(article_link)
        con = BeautifulSoup(resp.text, 'html.parser')
        # For the 'touching' subject, skip to the next article unless it qualifies
        if subject == 'touching':
            yn = touch_article(con, tears)
            if not yn:
                continue
        date = mod_date(con.find('span', {'class': 'regdate'}).text)
        content = ''
        # Making the list
        row = [
            title, date, article_id, user_id, article_link, content,
            reply_num, view_num
        ]
        a_list.append(row)
        utils.print_log(verbose, "article line 1", row)
        time.sleep(random.randint(2, 7) / 3)

    if len(a_list) == 0:  # return an empty DF when no article qualified ('touching' subject)
        s.close()
        return pd.DataFrame()

    s.close()
    result = pd.DataFrame(a_list)
    # munging of the dataframe
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'])
    # Drop Ruliweb spam (flooding) posts
    result = result[~result['title'].str.contains('주식아')
                    & ~result['title'].str.contains('주식이')]
    result = result.set_index('article_id')

    return result
Example No. 5
def get_article(url, subject, tears=15, verbose=False):
    base_url = 'http://starboard.kr/slr'
    search_url = 'http://starboard.kr/conn/board/search'
    # Fetch the HTML
    s = utils.sess('http://starboard.kr/')
    base = s.get(base_url)
    # base.encoding = 'euc-kr'  # revise the encoding if needed
    # Extracting articles from the html
    payload = {'search_text': url}
    resp = s.post(search_url, data=payload)
    soup = BeautifulSoup(resp.text, 'lxml')
    articles = soup.find_all('div', attrs={'class': 'ItemContent Discussion'})
    utils.print_log(verbose, "articles cnt", len(articles))
    if len(articles) == 0:
        s.close()
        return pd.DataFrame()

    # print(articles)
    s.close()  # close the starboard session
    s = utils.sess('http://www.slrclub.com/')
    a_list = []
    for a in articles:
        title = mod_title(
            a.find('div', attrs={
                'class': 'Title'
            }).find('a').text)
        utils.print_log(verbose, "1 article title", title)
        try:
            article_link = a.find('div', attrs={
                'class': 'Title'
            }).find('a')['href']
            utils.print_log(verbose, "2 article link", article_link)
        except (AttributeError, TypeError):
            continue
        # print(article_link)
        article_id = re.search(r'(\d{8})', article_link).group()
        utils.print_log(verbose, "3 article id", article_id)
        date = a.find('time')['datetime']
        utils.print_log(verbose, "4 article date", date)
        try:
            reply_num = re.search(r'\[([0-9]+)\]', title).group(1)
        except AttributeError:
            reply_num = '0'
        utils.print_log(verbose, "5 article reply cnt", reply_num)
        view_num = a.select_one(
            'div.Meta.Meta-Discussion > span.MItem.MCount.ViewCount').text
        utils.print_log(verbose, "6 article view cnt", view_num)
        # Get a content of the article
        con = BeautifulSoup(s.get(article_link).text, 'html.parser')
        # Skip articles that no longer exist
        if con.find('p', {'class': 'err_msg'}) is not None:
            continue
        # For the 'touching' subject, skip to the next article unless it qualifies
        if subject == 'touching':
            yn = touch_article(con, tears)
            if not yn:
                continue
        # The author span may be missing on some pages
        member_id = con.find('span', attrs={'class': 'lop'})
        member_id = '' if member_id is None else member_id.text

        content = ''
        # Making the list
        row = [
            title, date, article_id, member_id, article_link, content,
            reply_num, view_num
        ]
        a_list.append(row)
        utils.print_log(verbose, "article line 1", row)
        time.sleep(random.randint(2, 7) / 3)

    if len(a_list) == 0:  # return an empty DF when no article qualified ('touching' subject)
        base.close()
        s.close()
        return pd.DataFrame()

    # Close the sessions
    base.close()
    s.close()

    result = pd.DataFrame(a_list)
    # munging of the dataframe
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'])
    result = result[result['member_id'] != '']
    result = result.set_index('article_id')

    return result
Example No. 6
def get_article(url, subject, tears=15, verbose=False):
    base_url = 'https://www.clien.net/service/board/park/'
    # Fetch the HTML
    s = utils.sess('https://www.clien.net')
    resp = s.get(url)
    # Extracting articles from the html
    soup = BeautifulSoup(resp.text, 'lxml')
    articles = soup.select('div.list_item')
    utils.print_log(verbose, "articles cnt", len(articles))
    if len(articles) == 0:
        s.close()
        return pd.DataFrame()
    # Extracting elements from the html document
    a_list = []
    for a in articles:
        try:
            title = mod_text(a.select_one('span.list_subject > a').text)
        except AttributeError:
            continue
        date_time = a.select_one('span.timestamp').text
        # print("datetime", date_time)
        article_link = a.select_one('span.list_subject > a')['href']
        article_id = re.search(r'[0-9]{8}', article_link).group(0)
        article_link = base_url + article_id
        try:
            con = BeautifulSoup(s.get(article_link).text, 'lxml')
            # For the 'touching' subject, skip to the next article unless it qualifies
            if subject == 'touching':
                yn = touch_article(con, tears)
                if not yn:
                    continue
            # The nickname may be rendered as an image; fall back to its alt text
            if con.select_one('span.nickname > img') is None:
                member_id = mod_text(con.select_one('span.nickname').text)
            else:
                member_id = con.select_one('span.nickname > img')['alt']
            content = ''
            reply_num = con.select_one('div#comment-point strong').text
            view_num = con.select_one('span.view_count > strong').text
        except Exception as e:
            print(e)
            continue
        row = [
            title, date_time, article_id, member_id, article_link, content,
            reply_num, view_num
        ]
        a_list.append(row)
        utils.print_log(verbose, "article line 1", row)
        time.sleep(random.randint(2, 7) / 3)
    if len(a_list) == 0:  # return an empty DF when no article qualified ('touching' subject)
        s.close()
        return pd.DataFrame()
    s.close()
    result = pd.DataFrame(a_list)
    # Name the DataFrame columns
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'].tolist())
    result = result.set_index('article_id')
    return result
Example No. 7
def get_article(url, subject, tears=15, verbose=False):
    base_url = 'http://www.etoland.co.kr/plugin/mobile/board.php?bo_table=etoboard01&wr_id='
    # Fetch the HTML
    s = utils.sess('http://www.etoland.co.kr/')
    resp = s.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    articles = soup.find_all('li', {'class': 'subject'})
    utils.print_log(verbose, "articles cnt", len(articles))
    # Return early when there are no articles
    if len(articles) == 0:
        s.close()
        return pd.DataFrame()

    a_list = []
    for a in articles:
        title = mod_title(a.find('div').text)
        try:
            user_id = mod_user_id(a.find('span', {'class': 'name'}).text)
        except AttributeError:
            continue
        article_id = re.search(r'wr_id=\d+',
                               a.find('a')['href']).group().replace(
                                   'wr_id=', '')
        article_link = base_url + article_id
        reply_num = mod_reply(a.find('div').text)
        view_num = re.search(
            r'[0-9]+',
            a.find_all('span', {'class': 'datetime'})[1].text).group()
        # Gathering the content of each article
        con = s.get(article_link)
        temp = BeautifulSoup(con.text, 'html.parser')
        if subject == 'touching':
            yn = touch_article(temp, tears)
            if not yn:
                continue
        try:
            '''
            content = temp.cssselect('td.mw_basic_view_content')[0].\
            cssselect('div[id="view_content"]')[0].text_content()
            '''
            content = ''
            date = mod_date(temp.find('span', {'class': 'write_date'}).text)
            # mod_date returning the raw text unchanged means it failed to parse
            if temp.find('span', {'class': 'write_date'}).text == date:
                continue
        except AttributeError:
            continue
        # Making the list
        row = [
            title, date, article_id, user_id, article_link, content,
            reply_num, view_num
        ]
        a_list.append(row)
        utils.print_log(verbose, "article line 1", row)
        time.sleep(random.randint(2, 7) / 3)
    if len(a_list) == 0:  # return an empty DF when no article qualified ('touching' subject)
        s.close()
        return pd.DataFrame()

    s.close()
    result = pd.DataFrame(a_list)

    # munging of the dataframe
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'])
    result = result.set_index('article_id')

    return result