Example #1
File: crawler.py Project: heevery/webid
	def scrape(self):
		self.sqldb = StoreID('naver')
		cafe_url = 'http://cafe.naver.com/joonggonara'

		# read the board menu ids, one per line
		with open('joonggonara_menulinks.txt', 'r') as f:
			linkids = [line.strip() for line in f]
		linkids = linkids[275:300] #menuLink452 article_id 279021807

		for linkid in linkids:
			accessed = self.goto_cafe_board(cafe_url, linkid)
			if not accessed:
				continue
			pageno = 1
			flag = True
			while flag and pageno <= 1000:
				time.sleep(1)
				try:
					WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'main-area')))
				except TimeoutException:
					print "cannot find ArticleList area at page %d" % pageno
					break
				soup = BS(self.driver.page_source, "html.parser")
				trs = soup.find(id="main-area").find('form', {'name': "ArticleList"}).find_all('tr', {'align': "center"})

				for tr in trs:
					article_id = tr.find_all('td')[0].find('span').text
					title = tr.find('td', class_="board-list").find('span', class_="aaa").find('a').text
					# the author's id and nickname are quoted arguments inside the onclick handler
					onclick_str = tr.find('td', class_="p-nick").find('a')['onclick']
					splits = onclick_str.split(',')
					naver_id = splits[1].strip().strip("'")
					cafe_nickname = splits[3].strip().strip("'")
					rawdate = tr.find_all('td', class_="view-count")[0].text
					date = self.to_date(rawdate)

					# recover EUC-KR text that PhantomJS mis-decoded as latin-1
					title = title.encode('latin-1', 'ignore').decode('euc-kr', 'ignore')
					cafe_nickname = cafe_nickname.encode('latin-1', 'ignore').decode('euc-kr', 'ignore')
					
					naverID = NaverID(naver_id, cafe_nickname, None, None, article_id, title, date, None)
					self.nid = naverID
					print naverID
					self.sqldb.store_naverid(naverID)
					
				self.sqldb.commit()
				pageno += 1
				flag = self.goto_page(pageno)

			self.driver.switch_to.default_content()
			print "finish scraping category %s" % linkid

		self.sqldb.close()
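
These snippets are methods lifted from the NIDScraper class shown in Example #3, so the module-level imports sit off-screen. A minimal sketch of what the names resolve to, inferred from usage (StoreID and NaverID are project-local, so their import paths are assumptions):

import time
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from bs4 import BeautifulSoup as BS

from storeid import StoreID   # project-local; exact module path is a guess
from naverid import NaverID   # project-local; exact module path is a guess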
Example #2
File: crawler.py Project: heevery/webid
	def scrape_age_gender(self):
		# demographic labels for this board; they tag both the stored rows and the store name
		age = '30s'
		gender = ''
		self.sqldb = StoreID('naver%s%s' % (gender, age))
		
		cafe_url = 'http://cafe.naver.com/naworl'
		linkid = 'menuLink598' #menuLink602, menuLink597, menuLink600, menuLink599, menuLink598
		self.goto_cafe_board(cafe_url, linkid)

		pageno = 1 # raise this to resume a partially scraped board
		if pageno > 1:
			self.goto_page_by_url(pageno)
		flag = True
		while flag and pageno <= 1000:
			time.sleep(1)
			try:
				WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'main-area')))
			except TimeoutException:
				print "cannot find ArticleList area at page %d" % pageno
				break
			soup = BS(self.driver.page_source, "html.parser")
			trs = soup.find(id="main-area").find('form', {'name': "ArticleList"}).find_all('tr', {'align': "center"})

			for tr in trs:
				article_id = tr.find_all('td')[0].find('span').text
				title = tr.find('td', class_="board-list").find('span', class_="aaa").find('a').text
				# the author's id and nickname are quoted arguments inside the onclick handler
				onclick_str = tr.find('td', class_="p-nick").find('a')['onclick']
				splits = onclick_str.split(',')
				naver_id = splits[1].strip().strip("'")
				cafe_nickname = splits[3].strip().strip("'")
				rawdate = tr.find_all('td', class_="view-count")[0].text
				date = self.to_date(rawdate)

				# recover EUC-KR text that PhantomJS mis-decoded as latin-1
				title = title.encode('latin-1', 'ignore').decode('euc-kr', 'ignore')
				cafe_nickname = cafe_nickname.encode('latin-1', 'ignore').decode('euc-kr', 'ignore')
				
				naverID = NaverID(naver_id, cafe_nickname, gender, age, article_id, title, date, None)
				self.nid = naverID
				print naverID
				self.sqldb.store_naverid(naverID)
				
			self.sqldb.commit()
			pageno += 1
			flag = self.goto_page(pageno)

		self.driver.switch_to.default_content()
		self.sqldb.close()
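
The id extraction above leans on the position of quoted arguments inside the member link's onclick attribute: splits[1] is the Naver id and splits[3] the cafe nickname. A hedged illustration with a made-up onclick value of that shape (the real cafe markup's function name and remaining arguments may differ):

# hypothetical onclick string, shaped the way the indexing implies
onclick_str = "ui(event, 'pheewie', 1, 'somenick', 'me', false, false)"
splits = onclick_str.split(',')
print splits[1].strip().strip("'")  # pheewie
print splits[3].strip().strip("'")  # somenick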
Example #3
File: crawler.py Project: heevery/webid
class NIDScraper(object):
	def __init__(self):
		self.driver = webdriver.PhantomJS() # headless browser; needs phantomjs on the PATH
		self.login()
	
	def close(self):
		self.driver.quit()
		self.driver = None

	def login(self):
		if self.driver is None:
			self.driver = webdriver.PhantomJS()
		self.driver.get('http://nid.naver.com/nidlogin.login')
		
		print self.driver.title
		time.sleep(1)
		
		username = self.driver.find_element_by_id('id')
		password = self.driver.find_element_by_id('pw')
		username.send_keys('pheewie') # hard-coded login; substitute your own account
		password.send_keys('dummy1')
		
		btn = self.driver.find_element_by_css_selector('#frmNIDLogin > fieldset > span > input')
		btn.click()
	
	def goto_cafe_board(self, naver_cafe_url, linkid):
		self.driver.get(naver_cafe_url)
		print 'cafe name: ' + self.driver.title

		try:
			category = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID,linkid)))
			print 'category name: ' + category.text
			category.click()
		except TimeoutException:
			print 'category link %s is not clickable' % linkid
			return False 

		self.driver.switch_to.frame('cafe_main')
		print 'frame url: ' + self.driver.current_url
		
		try:
			# switch the board to its largest page size so fewer pages need visiting
			sizelistview = self.driver.find_element_by_css_selector('#main-area .sublink')
			sizelistview.click()

			sizelist = self.driver.find_elements_by_css_selector('#listSizeLayer li')
			sizebig = sizelist[-1].find_element_by_css_selector('a')
			print sizebig.get_attribute('onclick').split(';')[0]
			sizebig.click()
		except NoSuchElementException:
			print 'cannot adjust the article list size in category %s' % linkid
			return False
		return True

	def scrape_age_gender(self):
		# demographic labels for this board; they tag both the stored rows and the store name
		age = '30s'
		gender = ''
		self.sqldb = StoreID('naver%s%s' % (gender, age))
		
		cafe_url = 'http://cafe.naver.com/naworl'
		linkid = 'menuLink598' #menuLink602, menuLink597, menuLink600, menuLink599, menuLink598
		self.goto_cafe_board(cafe_url, linkid)

		pageno = 1 # raise this to resume a partially scraped board
		if pageno > 1:
			self.goto_page_by_url(pageno)
		flag = True
		while flag and pageno <= 1000:
			time.sleep(1)
			try:
				WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'main-area')))
			except TimeoutException:
				print "cannot find ArticleList area at page %d" % pageno
				break
			soup = BS(self.driver.page_source, "html.parser")
			trs = soup.find(id="main-area").find('form', {'name': "ArticleList"}).find_all('tr', {'align': "center"})

			for tr in trs:
				article_id = tr.find_all('td')[0].find('span').text
				title = tr.find('td', class_="board-list").find('span', class_="aaa").find('a').text
				# the author's id and nickname are quoted arguments inside the onclick handler
				onclick_str = tr.find('td', class_="p-nick").find('a')['onclick']
				splits = onclick_str.split(',')
				naver_id = splits[1].strip().strip("'")
				cafe_nickname = splits[3].strip().strip("'")
				rawdate = tr.find_all('td', class_="view-count")[0].text
				date = self.to_date(rawdate)

				# recover EUC-KR text that PhantomJS mis-decoded as latin-1
				title = title.encode('latin-1', 'ignore').decode('euc-kr', 'ignore')
				cafe_nickname = cafe_nickname.encode('latin-1', 'ignore').decode('euc-kr', 'ignore')
				
				naverID = NaverID(naver_id, cafe_nickname, gender, age, article_id, title, date, None)
				self.nid = naverID
				print naverID
				self.sqldb.store_naverid(naverID)
				
			self.sqldb.commit()
			pageno += 1
			flag = self.goto_page(pageno)

		self.driver.switch_to.default_content()
		self.sqldb.close()
	
	def scrape(self):
		self.sqldb = StoreID('naver')
		cafe_url = 'http://cafe.naver.com/joonggonara'

		# read the board menu ids, one per line
		with open('joonggonara_menulinks.txt', 'r') as f:
			linkids = [line.strip() for line in f]
		linkids = linkids[275:300] #menuLink452 article_id 279021807

		for linkid in linkids:
			accessed = self.goto_cafe_board(cafe_url, linkid)
			if not accessed:
				continue
			pageno = 1
			flag = True
			while flag and pageno <= 1000:
				time.sleep(1)
				try:
					WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'main-area')))
				except TimeoutException:
					print "cannot find ArticleList area at page %d" % pageno
					break
				soup = BS(self.driver.page_source, "html.parser")
				trs = soup.find(id="main-area").find('form', {'name': "ArticleList"}).find_all('tr', {'align': "center"})

				for tr in trs:
					article_id = tr.find_all('td')[0].find('span').text
					title = tr.find('td', class_="board-list").find('span', class_="aaa").find('a').text
					# the author's id and nickname are quoted arguments inside the onclick handler
					onclick_str = tr.find('td', class_="p-nick").find('a')['onclick']
					splits = onclick_str.split(',')
					naver_id = splits[1].strip().strip("'")
					cafe_nickname = splits[3].strip().strip("'")
					rawdate = tr.find_all('td', class_="view-count")[0].text
					date = self.to_date(rawdate)

					# recover EUC-KR text that PhantomJS mis-decoded as latin-1
					title = title.encode('latin-1', 'ignore').decode('euc-kr', 'ignore')
					cafe_nickname = cafe_nickname.encode('latin-1', 'ignore').decode('euc-kr', 'ignore')
					
					naverID = NaverID(naver_id, cafe_nickname, None, None, article_id, title, date, None)
					self.nid = naverID
					print naverID
					self.sqldb.store_naverid(naverID)
					
				self.sqldb.commit()
				pageno += 1
				flag = self.goto_page(pageno)

			self.driver.switch_to.default_content()
			print "finish scraping category %s" % linkid

		self.sqldb.close()

	def goto_page(self, pageno):
		try:
			navi = self.driver.find_element_by_css_selector('.Nnavi')
			next_page_elem = navi.find_element_by_xpath('//a[text()="%d"]' % pageno)
		except NoSuchElementException:
			try:
				# no direct link to that page; fall back to the next-block arrow
				next_page_elem = self.driver.find_element_by_css_selector('.pgR > a:nth-child(1)')
			except NoSuchElementException:
				print 'no page %d with last url: %s' % (pageno, self.driver.current_url)
				return False
		try:
			next_page_elem.click()
		except WebDriverException:
			pass # a failed click is tolerated; the caller's wait will flag a dead page
		return True
	
	def goto_page_by_url(self, pageno):
		# rewrite the search.page parameter of the current-page link and load it directly
		url = self.driver.find_element_by_css_selector('#main-area .Nnavi .on a').get_attribute('href')
		jump_url = url.replace(url.split('&')[-1], "search.page=%d" % pageno)
		self.driver.get(jump_url)

		WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'cafe_main')))
		self.driver.switch_to.frame('cafe_main')
		print 'frame url: ' + self.driver.current_url
	
	def to_date(self, rawdate):
		# board rows show either a dotted date ('2016.01.23.') or a bare time for today's posts
		date = datetime.today().date()
		try:
			if '.' in rawdate:
				date = datetime.strptime(rawdate.strip(), '%Y.%m.%d.').date()
			elif ':' in rawdate:
				pass # a time such as '13:05' means the article was posted today
			else:
				date = None
		except ValueError:
			pass
		return date
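
Putting the class to work is just construction (which starts PhantomJS and logs in) followed by one of the scrape methods. A minimal driver script, assuming the class lives in crawler.py as the file name suggests:

from crawler import NIDScraper

scraper = NIDScraper() # logs in on construction
try:
	scraper.scrape() # or scraper.scrape_age_gender()
finally:
	scraper.close() # always shut PhantomJS down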