def scrape_names(self, names):
    """Scrape and store the statistics of every name in ``names``.

    Loads the name-analysis page, switches into its ``IFRAMENAMERESULT``
    frame, and for each name stores and commits the ``NameStat`` returned by
    ``self.scrape_name_stat``. Names that yield ``None`` or raise
    ``TimeoutException`` are reported on stdout and skipped.

    Args:
        names: iterable of names, each passed to ``self.scrape_name_stat``.

    Side effects:
        Rebinds ``self.sqldb`` to a new ``StoreNameStat('namestat')``, which
        is closed again before returning.
    """
    self.driver.get('http://www.erumy.com/nameAnalyze/eDefault.aspx')
    # Parenthesised single-argument print: same output under the Python 2
    # print statement used by the rest of this file.
    print(self.driver.title)
    WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located((By.ID, 'IFRAMENAMERESULT')))
    self.driver.switch_to.frame('IFRAMENAMERESULT')
    self.sqldb = StoreNameStat('namestat')
    try:
        for name in names:
            try:
                namestat = self.scrape_name_stat(name)
                if namestat is None:
                    print("no info for %s" % name)
                    continue
                print(namestat)
                # Commit per name so finished rows survive a later failure.
                self.sqldb.store(namestat)
                self.sqldb.commit()
            except TimeoutException:
                print("%s is not scraped" % name)
                continue
    finally:
        # Only TimeoutException is handled above; any other error used to
        # skip this cleanup and leave the store open and the frame selected.
        self.sqldb.close()
        self.driver.switch_to.default_content()
class NameStatScraper(object):
    """Scrape name statistics from erumy.com through a PhantomJS webdriver.

    ``scrape`` reads candidate characters from the local ``webid.db`` SQLite
    file, builds doubled-character names from them and stores one
    ``NameStat`` per name through ``StoreNameStat``.

    Every ``print`` is written as a parenthesised single-argument call, which
    produces the same output under the Python 2 print statement.
    """

    def __init__(self):
        """Start the PhantomJS webdriver used for all page loads."""
        self.driver = webdriver.PhantomJS()

    def close(self):
        """Quit the webdriver and drop the reference to it.

        Safe to call repeatedly: a second call finds ``self.driver`` already
        ``None`` and does nothing.
        """
        if self.driver is not None:
            self.driver.quit()
        self.driver = None

    def reopen(self):
        """Discard the current webdriver (if any) and start a fresh one."""
        if self.driver is not None:
            try:
                self.driver.close()
            except URLError:
                print("the window does not respond")
            self.driver.quit()
        # NOTE(review): presumably gives the old PhantomJS process time to
        # exit before a new one is spawned -- confirm the delay is needed.
        time.sleep(5)
        self.driver = webdriver.PhantomJS()
        print("reopen new webdriver")

    def scrape(self):
        """Scrape statistics for every doubled character listed in webid.db.

        Selects ``korchar`` from ``namechar`` where the summed ranks are
        positive, turns each character ``c`` into the name ``c + c`` and
        scrapes the names in batches of 1000, replacing the webdriver after
        each batch.
        """
        # sqlite3's connection context manager only commits/rolls back; it
        # never closes the connection, so close it explicitly.
        conn = sqlite3.connect("webid.db")
        try:
            cursor = conn.cursor()
            cursor.execute('select korchar from namechar where male_rank + female_rank > 0')
            tuples = cursor.fetchall()
        finally:
            conn.close()

        namechars = [tup[0].encode('utf-8') for tup in tuples]
        names = [char + char for char in namechars]

        batch_size = 1000
        num_batch = int(math.ceil(float(len(names)) / batch_size))
        print("%d %d" % (len(names), num_batch))
        for start_idx in range(0, len(names), batch_size):
            self.scrape_names(names[start_idx:start_idx + batch_size])
            self.reopen()

    def scrape_names(self, names):
        """Scrape and store the statistics of every name in ``names``.

        Loads the name-analysis page, switches into its ``IFRAMENAMERESULT``
        frame, and for each name stores and commits the ``NameStat`` returned
        by ``scrape_name_stat``. Names that yield ``None`` or raise
        ``TimeoutException`` are reported on stdout and skipped.

        Args:
            names: iterable of names, each passed to ``scrape_name_stat``.

        Side effects:
            Rebinds ``self.sqldb`` to a new ``StoreNameStat('namestat')``,
            which is closed again before returning.
        """
        self.driver.get('http://www.erumy.com/nameAnalyze/eDefault.aspx')
        print(self.driver.title)
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.ID, 'IFRAMENAMERESULT')))
        self.driver.switch_to.frame('IFRAMENAMERESULT')
        self.sqldb = StoreNameStat('namestat')
        try:
            for name in names:
                try:
                    namestat = self.scrape_name_stat(name)
                    if namestat is None:
                        print("no info for %s" % name)
                        continue
                    print(namestat)
                    # Commit per name so finished rows survive a later failure.
                    self.sqldb.store(namestat)
                    self.sqldb.commit()
                except TimeoutException:
                    print("%s is not scraped" % name)
                    continue
        finally:
            # Only TimeoutException is handled above; any other error used to
            # skip this cleanup and leave the store open.
            self.sqldb.close()
            self.driver.switch_to.default_content()

    def scrape_name_stat(self, name):
        """Search for ``name`` and parse the resulting statistics.

        Args:
            name: UTF-8 encoded given name; the fixed surname '김' is
                prepended before searching.

        Returns:
            A ``NameStat`` built from the result page, or ``None`` when the
            reported gender is neither '남' nor '여'.

        Raises:
            TimeoutException: if the search box or the ``name_info`` element
                does not appear within 10 seconds.
        """
        self._search_name(name)
        return self._parse_name_stat(name, self.driver.page_source)

    def _search_name(self, name):
        """Submit the search form for ``name`` and wait for ``name_info``."""
        searchbox_id = 'ctl00_ContentPlaceHolder1_idSearchBox'
        searchbox = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.ID, searchbox_id)))
        fullname = ''.join(['김', name])
        # Pass the values as script arguments instead of splicing them into a
        # quoted JavaScript literal, so a quote or backslash in the name
        # cannot break (or inject into) the script.
        self.driver.execute_script(
            "document.getElementById(arguments[0]).value = arguments[1];",
            searchbox_id, fullname.decode('utf-8'))
        searchbox.submit()
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.ID, 'name_info')))

    def _parse_name_stat(self, name, page_source):
        """Extract a ``NameStat`` for ``name`` from the result page HTML."""
        soup = BS(page_source, 'html.parser')
        name_info = soup.find(id="name_info")
        # The statistics live in the rows of the second table of name_info.
        namestat_trs = name_info.find_all('table')[1].find('tbody').find_all('tr')
        preference = namestat_trs[0].find('td').text.encode('utf-8').strip()
        gender_likely = namestat_trs[2].find('td').text.encode('utf-8').strip()
        birth_info = namestat_trs[3].find('td').text.encode('utf-8').strip()

        gender_likely_lines = gender_likely.split('\n')
        gender_line = gender_likely_lines[2].strip()
        freq_line = gender_likely_lines[3].strip()
        prob = re.findall(r'(?<=\s)[0-9.]*(?=%)', gender_line)[0]
        gender = re.findall(r'(?<=\s)[가-힣]*(?=성적인)', gender_line)[0]
        m_freq, f_freq = re.findall(r'(?<=\s)[0-9,]*(?=번)', freq_line)

        # f_p is the reported percentage when the gender is '여' and its
        # complement when the gender is '남'.
        if gender == '남':
            f_p = 1 - float(prob) / 100
        elif gender == '여':
            f_p = float(prob) / 100
        else:
            return None

        m_freq = int(m_freq.replace(',', ''))
        f_freq = int(f_freq.replace(',', ''))
        # Accept thousands separators in the rank: the replace(',', '') below
        # already expects them, but the old digit-only pattern could never
        # match such a value and silently fell back to -1.
        rank = safe_list_get(re.findall(r'(?<=\s)[0-9,]*(?=번째)', preference), 0, '-1')
        rank = int(rank.replace(',', ''))
        proportion = float(safe_list_get(re.findall(r'(?<=\()[0-9.]*(?=%)', preference), 0, 0))
        freq_year = int(re.findall(r'(?<=\s)[0-9]*(?=년도에)', birth_info)[0])
        freq_region = re.findall(r'(?<=\s)[가-힣]*(?=\s출생인)', birth_info)[0]
        freq_sur = re.findall(r'(?<=\s)[가-힣]*(?=씨입니다)', preference)[0]

        return NameStat(name, rank, proportion, f_p, m_freq, f_freq,
                        freq_year, freq_region, freq_sur)