def crawling_kyochon():
    """Crawl every Kyochon domestic store listing and save it as CSV.

    Iterates sido1 region codes 1..17 and, for each, increasing sido2
    sub-region codes until the crawler returns None (no more pages).
    Collected rows are (name, address, sido, gungu); names in sido/gungu
    are normalized through sido_dict/gungu_dict, de-duplicated by store
    name, and written to RESULT_DIRECTORY/kyochon_table.csv.
    """
    result = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?txtsearch=&sido1=%d&sido2=%d' % (
                sido1, sido2)
            html = crawler.crawling(url=url)
            if html is None:
                # End of sub-regions for this sido1.
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            if tag_ul is None:
                # Guard: page without the expected store list (layout change
                # or empty listing) — skip instead of raising AttributeError.
                continue
            for tag_a in tag_ul.findAll('a', href=True):
                name = tag_a.find('dt').get_text()
                # Address text may carry trailing '\r...' noise — keep the
                # part before the first carriage return.
                address = tag_a.find('dd').get_text().strip().split('\r')[0]
                sidogu = address.split()[:2]
                result.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    # De-duplicate by store name, then rebuild a clean 0..n-1 index named
    # 'index' — same outcome as the old
    # reset_index(drop=True).reset_index().set_index('index') round-trip.
    table = table.drop_duplicates(subset='name', keep='first').reset_index(drop=True)
    table.index.name = 'index'

    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding="utf-8", mode='w', index=True)
def store_kyochon(data):
    """Normalize region names in *data* and write kyochon_table.csv.

    *data* is an iterable of (name, address, sido, gungu) rows; sido and
    gungu values are mapped through sido_dict/gungu_dict (unknown values
    pass through unchanged) before the table is saved.
    """
    frame = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])
    frame['sido'] = frame['sido'].apply(lambda region: sido_dict.get(region, region))
    frame['gungu'] = frame['gungu'].apply(lambda district: gungu_dict.get(district, district))
    frame.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY))
def store_nene(data):
    """Normalize region names in *data* and write nene_table.csv.

    *data* is an iterable of (name, address, sido, gungu) rows; sido and
    gungu values are mapped through sido_dict/gungu_dict (unknown values
    pass through unchanged) before the table is saved.
    """
    frame = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])
    frame['sido'] = frame['sido'].apply(lambda region: sido_dict.get(region, region))
    frame['gungu'] = frame['gungu'].apply(lambda district: gungu_dict.get(district, district))
    frame.to_csv('{0}/nene_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def store_nene(data):
    """Normalize region names, de-duplicate by store name, and save nene_table.csv.

    *data* is an iterable of (name, address, sido, gungu) rows. sido/gungu
    values are mapped through sido_dict/gungu_dict (unknown values pass
    through unchanged); duplicate store names keep only the first row.

    NOTE(review): this redefines store_nene and shadows an earlier
    definition in the file — confirm only one is meant to survive.
    """
    table = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    # De-duplicate by store name, then rebuild a clean 0..n-1 index named
    # 'index' — same outcome as the old
    # reset_index(drop=True).reset_index().set_index('index') round-trip.
    table = table.drop_duplicates(subset='name', keep='first').reset_index(drop=True)
    table.index.name = 'index'

    table.to_csv('{0}/nene_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawl_goobne():
    """Crawl all Goobne store pages via Selenium and save goobne_table.csv.

    Pages are loaded by executing the site's own store.getList(page)
    JavaScript; paging stops when the first row of the store table has no
    class attribute (the site's end-of-results marker row). Results are
    de-duplicated by store name, region names are normalized through
    sido_dict/gungu_dict, and the table is written to
    RESULT_DIRECTORY/goobne_table.csv.
    """
    url = 'http://www.goobne.co.kr/store/search_store.jsp'
    # NOTE(review): hard-coded Windows chromedriver path — consider making
    # this configurable / reading it from the environment.
    wd = webdriver.Chrome('D:/Python/webdriver/chromedriver.exe')
    result = []
    try:
        wd.get(url)  # reuse url instead of repeating the literal
        time.sleep(5)  # let the initial page and its scripts load
        for page in count(start=1):
            script = 'store.getList(%d)' % page
            wd.execute_script(script)
            print('%s : success for script execution (%s)' % (datetime.now(), script))
            time.sleep(5)  # wait for the AJAX-driven table refresh

            bs = BeautifulSoup(wd.page_source, 'html.parser')
            tag_tbody = bs.find('tbody', attrs={'id': 'store_list'})
            tags_tr = tag_tbody.findAll('tr')

            if tags_tr[0].get('class') is None:
                # Class-less first row marks "no results" — past the last page.
                break

            for tag_tr in tags_tr:
                strings = list(tag_tr.strings)
                name = strings[1]
                # The address column shifts by one when strings[3] is non-empty.
                address = strings[5] if strings[3] == '' else strings[6]
                sidogu = address.split()[:2]
                result.append((name, address) + tuple(sidogu))
    finally:
        # Always release the browser, even if crawling raises — the
        # original leaked the Chrome process on any exception.
        wd.quit()

    # store
    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])
    # 중복 제거 (de-duplicate by store name)
    table = table.\
        drop_duplicates(subset='name', keep='first').\
        reset_index(drop=True)
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/goobne_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_pelicana():
    """Crawl Pelicana store-search pages until an empty page and save CSV.

    Pages are fetched in order until a page's result table has zero rows.
    Rows are (name, address, sido, gungu); sido/gungu are normalized
    through sido_dict/gungu_dict and written to pelicana_table.csv.
    (Commented-out debug prints from the original were removed.)
    """
    results = []
    # NOTE(review): this local assignment shadows the module-level
    # RESULT_DIRECTORY used by the other crawlers — confirm the override
    # to '__result__' is intentional.
    RESULT_DIRECTORY = '__result__'
    for page in count(start=1):
        url = 'http://pelicana.co.kr/store/stroe_search.html?gu=&si=&page={0}'.format(
            page)
        html = cw.crawling(url=url)

        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        print(page, ":", len(tags_tr), sep=':')
        # 끝 검출 (end detection): an empty table means past the last page.
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY))
def crawling_pelicana():
    """Crawl Pelicana store pages until an empty page, then save the CSV.

    Rows are (name, address, sido, gungu); duplicates (by name) keep the
    first occurrence, region names are normalized via sido_dict/gungu_dict,
    and the table is written to RESULT_DIRECTORY/pelicana_table.csv.
    """
    rows = []
    for page_no in count(start=1):
        page_url = 'http://www.pelicana.co.kr/store/stroe_search.html?page=%d' % page_no
        soup = BeautifulSoup(crawler.crawling(page_url), 'html.parser')
        body = soup.find('table', attrs={'class': 'table mt20'}).find('tbody')
        row_tags = body.findAll('tr')
        if not row_tags:
            # Empty result table: we are past the last page.
            break
        for row_tag in row_tags:
            texts = list(row_tag.strings)
            store_name, store_address = texts[1], texts[3]
            rows.append((store_name, store_address) + tuple(store_address.split()[:2]))

    table = pd.DataFrame(rows, columns=['name', 'address', 'sido', 'gungu'])
    # 중복 제거 (drop duplicate store names, keeping the first)
    table = table.drop_duplicates(subset='name', keep='first').reset_index(drop=True)
    table['sido'] = table['sido'].apply(lambda region: sido_dict.get(region, region))
    table['gungu'] = table['gungu'].apply(lambda district: gungu_dict.get(district, district))
    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawl_kyochon():
    """Crawl Kyochon domestic stores across all region codes and save CSV.

    For each sido1 code 1..17, sido2 sub-codes are tried in order until the
    crawler returns None. Store anchors missing the expected dt/dd markup
    end that page's scan. Region names are normalized through
    sido_dict/gungu_dict and the table is written to kyochon_table.csv.
    """
    records = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            page_url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                sido1, sido2)
            page_html = crawler.crawling(url=page_url)
            if page_html is None:
                break

            soup = BeautifulSoup(page_html, 'html.parser')
            store_list = soup.find('ul', attrs={'class': 'list'})
            for anchor in store_list.findAll('a'):
                dt_tag = anchor.find('dt')
                dd_tag = anchor.find('dd')
                # An anchor without the store markup ends this page's scan.
                if dt_tag is None or dd_tag is None:
                    break
                store_name = dt_tag.get_text()
                store_address = dd_tag.get_text().strip().split('\r')[0]
                records.append((store_name, store_address) + tuple(store_address.split()[:2]))

    table = pd.DataFrame(records, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table['sido'].apply(lambda region: sido_dict.get(region, region))
    table['gungu'] = table['gungu'].apply(lambda district: gungu_dict.get(district, district))
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)