def scrape(self):
    """Scrape member IDs from a slice of Joonggonara board categories.

    Reads category link ids from joonggonara_menulinks.txt, walks each
    board page by page (up to 1000 pages), and stores one NaverID row per
    article into the 'naver' DB via StoreID.  Returns None; side effects
    are DB writes, driver navigation, and progress prints.
    """
    self.sqldb = StoreID('naver')
    cafe_url = 'http://cafe.naver.com/joonggonara'
    # 'with' guarantees the link file is closed even if scraping raises
    # (the original leaked the handle: open() without close()).
    with open('joonggonara_menulinks.txt', 'r') as f:
        linkids = [line.strip() for line in f.readlines()]
    linkids = linkids[275:300]  # resume point: menuLink452 article_id 279021807
    for linkid in linkids:
        # Skip categories that cannot be opened/configured.
        if self.goto_cafe_board(cafe_url, linkid) is False:
            continue
        pageno = 1
        flag = True
        while flag and pageno <= 1000:
            time.sleep(1)  # throttle: be polite to the server
            try:
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.ID, 'main-area')))
            except TimeoutException:
                print("cannot find ArticleList area at page %d" % pageno)
                break
            soup = BS(self.driver.page_source, "html.parser")
            trs = soup.find(id="main-area").find(
                'form', {'name': "ArticleList"}).find_all('tr', {'align': "center"})
            for tr in trs:
                article_id = tr.find_all('td')[0].find('span').text
                title = tr.find('td', class_="board-list").find(
                    'span', class_="aaa").find('a').text
                # The author popup handler looks like fn(e,'id',..,'nick',..);
                # split on commas and strip quotes to recover id/nickname.
                onclick_str = tr.find('td', class_="p-nick").find('a')['onclick']
                splits = onclick_str.split(',')
                naver_id = splits[1].strip().strip("'")
                cafe_nickname = splits[3].strip().strip("'")
                rawdate = tr.find_all('td', class_="view-count")[0].text
                date = self.to_date(rawdate)
                # The EUC-KR page was decoded as latin-1 by the driver;
                # round-trip to recover the Korean text.
                title = title.encode('latin-1', 'ignore').decode('euc-kr', 'ignore')
                cafe_nickname = cafe_nickname.encode(
                    'latin-1', 'ignore').decode('euc-kr', 'ignore')
                naverID = NaverID(naver_id, cafe_nickname, None, None,
                                  article_id, title, date, None)
                self.nid = naverID
                print(naverID)
                self.sqldb.store_naverid(naverID)
                self.sqldb.commit()
            pageno += 1
            flag = self.goto_page(pageno)
        self.driver.switch_to.default_content()
        print("finish scraping category %s" % linkid)
    self.sqldb.close()
def scrape_age_gender(self):
    """Scrape one demographic board of the 'naworl' cafe, tagging each
    stored NaverID row with the hard-coded age/gender bucket.

    Stores rows into a 'naver<gender><age>' DB via StoreID.  Returns
    None; side effects are DB writes, driver navigation, and prints.
    """
    age = '30s'
    gender = ''
    self.sqldb = StoreID('naver%s%s' % (gender, age))
    cafe_url = 'http://cafe.naver.com/naworl'
    # One board per demographic bucket:
    # menuLink602, menuLink597, menuLink600, menuLink599, menuLink598
    linkid = 'menuLink598'
    # Bail out if the board cannot be opened -- the original ignored the
    # failure and scraped whatever page happened to be loaded.
    if self.goto_cafe_board(cafe_url, linkid) is False:
        self.sqldb.close()
        return
    pageno = 1  # bump manually to resume a partial run
    if pageno > 1:
        self.goto_page_by_url(pageno)
    flag = True
    while flag and pageno <= 1000:
        time.sleep(1)  # throttle: be polite to the server
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, 'main-area')))
        except TimeoutException:
            print("cannot find ArticleList area at page %d" % pageno)
            break
        soup = BS(self.driver.page_source, "html.parser")
        trs = soup.find(id="main-area").find(
            'form', {'name': "ArticleList"}).find_all('tr', {'align': "center"})
        for tr in trs:
            article_id = tr.find_all('td')[0].find('span').text
            title = tr.find('td', class_="board-list").find(
                'span', class_="aaa").find('a').text
            # The author popup handler looks like fn(e,'id',..,'nick',..);
            # split on commas and strip quotes to recover id/nickname.
            onclick_str = tr.find('td', class_="p-nick").find('a')['onclick']
            splits = onclick_str.split(',')
            naver_id = splits[1].strip().strip("'")
            cafe_nickname = splits[3].strip().strip("'")
            rawdate = tr.find_all('td', class_="view-count")[0].text
            date = self.to_date(rawdate)
            # The EUC-KR page was decoded as latin-1 by the driver;
            # round-trip to recover the Korean text.
            title = title.encode('latin-1', 'ignore').decode('euc-kr', 'ignore')
            cafe_nickname = cafe_nickname.encode(
                'latin-1', 'ignore').decode('euc-kr', 'ignore')
            naverID = NaverID(naver_id, cafe_nickname, gender, age,
                              article_id, title, date, None)
            self.nid = naverID
            print(naverID)
            self.sqldb.store_naverid(naverID)
            self.sqldb.commit()
        pageno += 1
        flag = self.goto_page(pageno)
    self.driver.switch_to.default_content()
    self.sqldb.close()
class NIDScraper(object):
    """PhantomJS-driven scraper that logs into Naver and harvests cafe
    member IDs (naver id, nickname, article id/title/date) from board
    listing pages, persisting them through a StoreID DB wrapper.

    Typical use: s = NIDScraper(); s.scrape(); s.close()
    """

    def __init__(self):
        # Logs in immediately so boards that require membership resolve.
        self.driver = webdriver.PhantomJS()
        self.login()

    def close(self):
        """Quit the browser and drop the reference."""
        self.driver.quit()
        self.driver = None

    def login(self):
        """Submit the Naver login form; recreates the driver if closed."""
        if self.driver is None:
            self.driver = webdriver.PhantomJS()
        self.driver.get('http://nid.naver.com/nidlogin.login')
        print(self.driver.title)
        time.sleep(1)  # give the form time to render
        username = self.driver.find_element_by_id('id')
        password = self.driver.find_element_by_id('pw')
        # NOTE(review): credentials are hard-coded in source -- move them
        # to a config file or environment variables.
        username.send_keys('pheewie')
        password.send_keys('dummy1')
        btn = self.driver.find_element_by_css_selector(
            '#frmNIDLogin > fieldset > span > input')
        btn.click()

    def goto_cafe_board(self, naver_cafe_url, linkid):
        """Open a cafe, click the category link `linkid`, switch into the
        'cafe_main' iframe and select the largest list-page size.

        Returns True on success, False when the category link is not
        clickable or the page-size control is missing.
        """
        self.driver.get(naver_cafe_url)
        print('cafe name: ' + self.driver.title)
        try:
            category = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.ID, linkid)))
            print('category name: ' + category.text)
            category.click()
        except TimeoutException:
            print('category link %s is not clickable' % linkid)
            return False
        # Article list lives inside an iframe; all later selectors assume it.
        self.driver.switch_to.frame('cafe_main')
        print('frame url: ' + self.driver.current_url)
        try:
            sizelistview = self.driver.find_element_by_css_selector(
                '#main-area .sublink')
            sizelistview.click()
            sizelist = self.driver.find_elements_by_css_selector(
                '#listSizeLayer li')
            # Last entry of the size layer is the biggest page size.
            sizebig = sizelist[-1].find_element_by_css_selector('a')
            print(sizebig.get_attribute('onclick').split(';')[0])
            sizebig.click()
        except NoSuchElementException:
            # typo fixed: "categery" -> "category"
            print('cannot adjust the number of lists in category %s' % linkid)
            return False
        return True

    def _scrape_board(self, gender, age, start_page=1):
        """Walk the currently-open board page by page (up to page 1000),
        storing one NaverID row per article via self.sqldb.

        Shared by scrape() and scrape_age_gender(), which previously
        duplicated this loop verbatim.  `gender`/`age` are passed through
        into each NaverID row (None when unknown).
        """
        if start_page > 1:
            self.goto_page_by_url(start_page)
        pageno = start_page
        flag = True
        while flag and pageno <= 1000:
            time.sleep(1)  # throttle: be polite to the server
            try:
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.ID, 'main-area')))
            except TimeoutException:
                print("cannot find ArticleList area at page %d" % pageno)
                break
            soup = BS(self.driver.page_source, "html.parser")
            trs = soup.find(id="main-area").find(
                'form', {'name': "ArticleList"}).find_all('tr', {'align': "center"})
            for tr in trs:
                article_id = tr.find_all('td')[0].find('span').text
                title = tr.find('td', class_="board-list").find(
                    'span', class_="aaa").find('a').text
                # Author popup handler looks like fn(e,'id',..,'nick',..);
                # split on commas and strip quotes to recover id/nickname.
                onclick_str = tr.find('td', class_="p-nick").find('a')['onclick']
                splits = onclick_str.split(',')
                naver_id = splits[1].strip().strip("'")
                cafe_nickname = splits[3].strip().strip("'")
                rawdate = tr.find_all('td', class_="view-count")[0].text
                date = self.to_date(rawdate)
                # The EUC-KR page was decoded as latin-1 by the driver;
                # round-trip to recover the Korean text.
                title = title.encode('latin-1', 'ignore').decode('euc-kr', 'ignore')
                cafe_nickname = cafe_nickname.encode(
                    'latin-1', 'ignore').decode('euc-kr', 'ignore')
                naverID = NaverID(naver_id, cafe_nickname, gender, age,
                                  article_id, title, date, None)
                self.nid = naverID
                print(naverID)
                self.sqldb.store_naverid(naverID)
                self.sqldb.commit()
            pageno += 1
            flag = self.goto_page(pageno)
        self.driver.switch_to.default_content()

    def scrape_age_gender(self):
        """Scrape one demographic board of the 'naworl' cafe, tagging
        rows with the hard-coded age/gender bucket."""
        age = '30s'
        gender = ''
        self.sqldb = StoreID('naver%s%s' % (gender, age))
        cafe_url = 'http://cafe.naver.com/naworl'
        # One board per demographic bucket:
        # menuLink602, menuLink597, menuLink600, menuLink599, menuLink598
        linkid = 'menuLink598'
        # Bail out if the board cannot be opened -- previously the failure
        # was ignored and whatever page was loaded got scraped.
        if self.goto_cafe_board(cafe_url, linkid) is False:
            self.sqldb.close()
            return
        self._scrape_board(gender, age, start_page=1)
        self.sqldb.close()

    def scrape(self):
        """Scrape member IDs from a slice of Joonggonara board categories
        listed in joonggonara_menulinks.txt into the 'naver' DB."""
        self.sqldb = StoreID('naver')
        cafe_url = 'http://cafe.naver.com/joonggonara'
        # 'with' guarantees the link file is closed (previously leaked).
        with open('joonggonara_menulinks.txt', 'r') as f:
            linkids = [line.strip() for line in f.readlines()]
        linkids = linkids[275:300]  # resume point: menuLink452 article_id 279021807
        for linkid in linkids:
            # Skip categories that cannot be opened/configured.
            if self.goto_cafe_board(cafe_url, linkid) is False:
                continue
            self._scrape_board(None, None)
            print("finish scraping category %s" % linkid)
        self.sqldb.close()

    def goto_page(self, pageno):
        """Click pagination link for `pageno` (or the 'next block' arrow
        when the number is not visible).  Returns False when no further
        page exists, True otherwise."""
        try:
            navi = self.driver.find_element_by_css_selector('.Nnavi')
            next_page_elem = navi.find_element_by_xpath(
                '//a[text()="%d"]' % pageno)
        except NoSuchElementException:
            try:
                # Page number not in the current block: click the '>' arrow.
                next_page_elem = self.driver.find_element_by_css_selector(
                    '.pgR > a:nth-child(1)')
            except NoSuchElementException:
                print('no page %d with last url: %s'
                      % (pageno, self.driver.current_url))
                return False
        try:
            next_page_elem.click()
        except Exception:
            # Best-effort click; navigation issues surface on the next
            # page load.  (Was a bare except -- narrowed to Exception.)
            pass
        return True

    def goto_page_by_url(self, pageno):
        """Jump straight to `pageno` by rewriting the search.page query
        argument of the currently-highlighted pagination link, then
        re-enter the 'cafe_main' iframe."""
        url = self.driver.find_element_by_css_selector(
            '#main-area .Nnavi .on a').get_attribute('href')
        jump_url = url.replace(url.split('&')[-1], "search.page=%d" % pageno)
        self.driver.get(jump_url)
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.ID, 'cafe_main')))
        self.driver.switch_to.frame('cafe_main')
        print('frame url: ' + self.driver.current_url)

    def to_date(self, rawdate):
        """Parse a board date cell into a datetime.date.

        'YYYY.MM.DD.' -> that date; 'HH:MM' (posted today) -> today;
        anything else -> None; an unparsable dotted date falls back to
        today.
        """
        date = datetime.today().date()
        try:
            if '.' in rawdate:
                date = datetime.strptime(rawdate.strip(), '%Y.%m.%d.').date()
            elif ':' in rawdate:
                pass  # time-only cell means the article was posted today
            else:
                date = None
        except ValueError:
            pass  # malformed dotted date: keep today's date
        return date