import pandas as pd
import numpy as np
from time import sleep
from itertools import cycle, islice
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException


# Naver mobile place-list crawler: saves address / review count / link for each result.
# Named distinctly from the Kakao crawler below so neither definition shadows the other.
def crawl_data_naver():
    df_rows = []
    # search = input("Enter a search term: ")
    search = "스타벅스dt"
    driver = webdriver.Chrome()
    base_url = 'https://m.naver.com/'
    driver.get(base_url)
    sleep(1.5)

    # Open the placeholder search box, then type into the real input field.
    search_window = driver.find_element_by_xpath('//*[@id="MM_SEARCH_FAKE"]')
    search_window.click()
    sleep(1)
    real_search_window = driver.find_element_by_xpath('//*[@id="query"]')
    real_search_window.send_keys(search)
    real_search_window.send_keys(Keys.ENTER)
    sleep(1.5)

    # Open the full place list and scroll it to load more results.
    search_more = driver.find_element_by_xpath(
        '//*[@id="place-main-section-root"]/div/div[4]/div/a')
    search_more.send_keys(Keys.ENTER)
    sleep(1.5)
    SCROLL_TIME = 100
    click_nolink_for_scrollDown(driver, SCROLL_TIME)

    try:
        # loc_names = driver.find_elements_by_xpath(
        #     '//*[@id="_list_scroll_container"]/div/div/div[1]/ul/li/div[1]/a/div[1]/div/span')
        addresses = driver.find_elements_by_xpath(
            '//*[@id="_list_scroll_container"]/div/div/div[1]/ul/li/div[1]/a/div[2]/span[1]')
        num_reviews = driver.find_elements_by_xpath(
            '//*[@id="_list_scroll_container"]/div/div/div[1]/ul/li/div[1]/a/div[3]/span')
        links = driver.find_elements_by_xpath(
            '//*[@id="_list_scroll_container"]/div/div/div[1]/ul/li/div[1]/a')
        for add, num, link in zip(addresses, num_reviews, links):
            try:
                row = {
                    # 'loc_name': loc.text,
                    'address': add.text,
                    'num_review': num.text,
                    'link': link.get_attribute('href')
                }
                df_rows.append(row)
                sleep(0.1)
            except Exception as e:
                print(e)
    except NoSuchElementException as e:
        print(f'{e}')
    except StaleElementReferenceException as e:
        print(f'{e}')

    dataframe = pd.DataFrame(data=df_rows)
    save_path = save_dataframe(search, dataframe)
    print("save complete")
    return save_path
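# The crawlers in this module call two project helpers, save_dataframe and
# click_nolink_for_scrollDown, whose definitions are not shown in this section. The
# sketch below is only an assumption of their behavior (a tab-separated CSV written
# under ./data/ and PAGE_DOWN-based scrolling); the real implementations may differ.
def save_dataframe(name, dataframe, out_dir='./data'):
    # Assumed helper: persist a crawl result as TSV and return its file path.
    import os
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, f'crawled_{name}.csv')
    dataframe.to_csv(path, sep='\t')
    return path


def click_nolink_for_scrollDown(driver, scroll_time):
    # Assumed helper: send PAGE_DOWN to the page body `scroll_time` times so lazily
    # loaded results appear.
    body = driver.find_element_by_css_selector('body')
    for _ in range(scroll_time):
        body.send_keys(Keys.PAGE_DOWN)
        sleep(0.2)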
# Kakao Map place-list crawler: saves name / address / score for every search result.
def crawl_data():
    df_rows = []
    search = input("Enter a search term: ")
    driver = webdriver.Chrome()
    base_url = 'https://map.kakao.com'
    driver.get(base_url)
    sleep(1)

    search_window = driver.find_element_by_xpath('//*[@id="search.keyword.query"]')
    search_window.send_keys(search)
    sleep(1)
    search_window.send_keys(Keys.RETURN)
    sleep(1.5)

    # Expand the place list and start from page 1.
    search_more = driver.find_element_by_xpath('//*[@id="info.search.place.more"]')
    search_more.send_keys(Keys.ENTER)
    sleep(1)
    driver.find_element_by_xpath('//*[@id="info.search.page.no1"]').send_keys(Keys.ENTER)
    sleep(1)
    total_row_nums = int(
        driver.find_element_by_xpath('//*[@id="info.search.place.cnt"]').text)

    # Walk pages 2-5, then "next", repeatedly until every result row is collected.
    for page in cycle(['no2', 'no3', 'no4', 'no5', 'next']):
        try:
            mac_names = driver.find_elements_by_xpath(
                '//*[@id="info.search.place.list"]/li/div[3]/strong/a[2]')
            addresses = driver.find_elements_by_xpath(
                '//*[@id="info.search.place.list"]/li/div[5]/div[2]/p[1]')
            addresses2 = driver.find_elements_by_xpath(
                '//*[@id="info.search.place.list"]/li/div[5]/div[2]/p[2]')
            scores = driver.find_elements_by_xpath(
                '//*[@id="info.search.place.list"]/li/div[4]/span[1]/em')
            for m, a1, a2, s in zip(mac_names, addresses, addresses2, scores):
                try:
                    row = {
                        'mac_name': m.text,
                        'address': a1.text,
                        'address2': a2.text,
                        'score': s.text
                    }
                    df_rows.append(row)
                    sleep(0.1)
                except Exception as e:
                    print(e)
        except NoSuchElementException as e:
            print(f'{e}')
            continue
        except StaleElementReferenceException as e:
            print(f'{e}')
            continue

        try:
            next_page = driver.find_element_by_xpath(f'//*[@id="info.search.page.{page}"]')
            if total_row_nums <= len(df_rows):
                break
            elif next_page.is_enabled():
                next_page.send_keys(Keys.ENTER)
                sleep(1)
            else:
                break
        except Exception as e:
            print('next page error, break out!')
            break

    dataframe = pd.DataFrame(data=df_rows)
    save_path = save_dataframe(search, dataframe)
    print("save complete")
    return save_path
# Kakao place review crawler: reads a previously crawled place list (TSV with a
# 'link' column) and collects the reviews, scores, and dates for each place.
def crawl_review_data_from_list(crawled_data, basis_column='loc_name', sep='\t'):
    linkdata = pd.read_csv(crawled_data, sep=sep)
    df_rows = linkdata.drop_duplicates([basis_column], ignore_index=True)
    if 'Unnamed: 0' in linkdata.columns:
        df_rows = df_rows.drop('Unnamed: 0', axis=1)

    driver = webdriver.Chrome()
    # Recover the search term from the saved file name.
    search = crawled_data.split('_')[-1].split('.')[0]
    final_rows = []
    save_period = 2

    for idx, row in df_rows.iterrows():
        if idx < 41:  # resume point from a previous, partially completed run
            continue
        driver.get(row['link'])
        sleep(1)
        try:
            loc_info = driver.find_element_by_class_name('inner_place').text.split('\n')
            row['loc_name'] = loc_info[0]
            print(row['loc_name'])
        except:
            row['loc_name'] = search
            print(row['loc_name'])
        sleep(1)

        row['reviews'] = {}
        row['scores'] = {}
        row['dates'] = {}
        count = 0
        try:
            # The review header reads "전체 N" ("total N"); take the number after "전체".
            num_reviews = int(
                driver.find_element_by_class_name('evaluation_sorting').text.split('전체')[1])
            row['num_review'] = num_reviews
        except:
            print('num_reviews error... maybe no review yet')
            continue

        linkpages = []
        end_flag = False
        while not end_flag:
            try:
                num_linkpages = max(len(driver.find_elements_by_class_name('link_page')), 1)
            except:
                break
            for i in range(num_linkpages):
                try:
                    linkpages = driver.find_elements_by_class_name('link_page')
                except Exception as e:
                    print(e, f'{search} review appending failed...')
                reviews = driver.find_elements_by_class_name('txt_comment')
                sleep(0.1)
                scores = driver.find_elements_by_class_name('star_info')
                sleep(0.1)
                dates = driver.find_elements_by_class_name('time_write')
                sleep(0.1)
                for review, score, date in zip(reviews, scores, dates):
                    try:
                        row['reviews'][count] = review.text
                    except Exception as e:
                        print(f'{search} {e} review error')
                        row['reviews'][count] = ''
                    try:
                        row['scores'][count] = score.text
                    except Exception as e:
                        print(f'{search} {e} score error')
                        row['scores'][count] = ''
                    try:
                        row['dates'][count] = date.text[:-1]
                    except Exception as e:
                        print(f'{search} {e} date error')
                        row['dates'][count] = ''
                    count += 1
                    sleep(0.3)
                sleep(0.1)
                # Stop when this is the last page of a short pager, otherwise move on.
                if (len(linkpages) < 5 and (i == len(linkpages) - 1)) or len(linkpages) == 0:
                    end_flag = True
                    break
                try:
                    linkpages[i + 1].send_keys(Keys.ENTER)
                    sleep(1)
                except:
                    pass
                if i == 4:
                    # After the fifth page, jump to the next block of pages if it exists.
                    try:
                        driver.find_element_by_class_name('btn_next').click()
                        end_flag = False
                        sleep(1)
                        break
                    except:
                        end_flag = True
                        break

        print(f'{search} crawl complete, reviews : {len(row["reviews"])}, '
              f'scores : {len(row["scores"])}, dates : {len(row["dates"])}')
        final_rows.append(row)

        # Periodically checkpoint the results to disk.
        if idx % save_period == 0:
            dataframe = pd.DataFrame(data=final_rows)
            save_path = save_dataframe(search + 'kakao_review', dataframe)
            print(f"{idx}... {save_path} saved")

    dataframe = pd.DataFrame(data=final_rows)
    save_path = save_dataframe(search + 'kakao_review', dataframe)
    print(f"{save_path} saved")
    return save_path
# Kakao Map crawler that collects the place list (including links) and then visits
# each place page to pull its reviews in one pass.
def crawl_review_data():
    df_rows = []
    search = input("Enter a search term: ")
    driver = webdriver.Chrome()
    base_url = 'https://map.kakao.com'
    driver.get(base_url)
    sleep(1)

    search_window = driver.find_element_by_xpath('//*[@id="search.keyword.query"]')
    search_window.send_keys(search)
    sleep(1)
    search_window.send_keys(Keys.RETURN)
    sleep(1.5)

    search_more = driver.find_element_by_xpath('//*[@id="info.search.place.more"]')
    search_more.send_keys(Keys.ENTER)
    sleep(1)
    driver.find_element_by_xpath('//*[@id="info.search.page.no1"]').send_keys(Keys.ENTER)
    sleep(1)
    total_row_nums = int(
        driver.find_element_by_xpath('//*[@id="info.search.place.cnt"]').text)

    # Collect name / address / score / link for every result page.
    for page in cycle(['no2', 'no3', 'no4', 'no5', 'next']):
        try:
            mac_names = driver.find_elements_by_xpath(
                '//*[@id="info.search.place.list"]/li/div[3]/strong/a[2]')
            addresses = driver.find_elements_by_xpath(
                '//*[@id="info.search.place.list"]/li/div[5]/div[2]/p[1]')
            addresses2 = driver.find_elements_by_xpath(
                '//*[@id="info.search.place.list"]/li/div[5]/div[2]/p[2]')
            scores = driver.find_elements_by_xpath(
                '//*[@id="info.search.place.list"]/li/div[4]/span[1]/em')
            links = driver.find_elements_by_xpath(
                '//*[@id="info.search.place.list"]/li/div[4]/a')
            for m, a1, a2, s, l in zip(mac_names, addresses, addresses2, scores, links):
                try:
                    row = {
                        'mac_name': m.text,
                        'address': a1.text,
                        'address2': a2.text,
                        'score': s.text,
                        'link': l.get_attribute('href')
                    }
                    df_rows.append(row)
                    sleep(0.1)
                except Exception as e:
                    print(e)
        except NoSuchElementException as e:
            print(f'{e}')
            continue
        except StaleElementReferenceException as e:
            print(f'{e}')
            continue

        try:
            next_page = driver.find_element_by_xpath(f'//*[@id="info.search.page.{page}"]')
            if total_row_nums <= len(df_rows):
                break
            elif next_page.is_enabled():
                next_page.send_keys(Keys.ENTER)
                sleep(1)
            else:
                break
        except Exception as e:
            print('next page error, break out!')
            break

    dataframe = pd.DataFrame(data=df_rows)
    save_path = save_dataframe(search, dataframe)
    print(f"{save_path} saved")

    # Visit each place page and page through its reviews (five per page).
    for row in df_rows:
        row['num_review'] = 0
        row['reviews'] = []
        row['scores'] = []
        row['dates'] = []
        driver.get(row['link'])
        sleep(1)
        try:
            num_reviews = driver.find_element_by_xpath(
                '//*[@id="mArticle"]/div[5]/div[2]/a/span[1]').text
            row['num_review'] = num_reviews
        except:
            print('num_reviews error... maybe no review yet')
            continue

        num_pages, num_last = divmod(int(num_reviews), 5)
        if (num_last == 0) and (num_pages > 0):
            num_pages -= 1
        num_pages += 1

        for idx, page in enumerate(cycle([2, 3, 4, 5])):
            try:
                review_elems = driver.find_elements_by_xpath(
                    '//*[@id="mArticle"]/div[5]/div[4]/ul/li/div[2]/p/span')
                review_scores = driver.find_elements_by_xpath(
                    '//*[@id="mArticle"]/div[5]/div[4]/ul/li/div[1]/div/em')
                review_dates = driver.find_elements_by_xpath(
                    '//*[@id="mArticle"]/div[5]/div[4]/ul/li/div[2]/div/span[3]')
                for elem, score, date in zip(review_elems, review_scores, review_dates):
                    row['reviews'].append(elem.text)
                    row['scores'].append(score.text)
                    row['dates'].append(date.text)
                sleep(0.5)
                if page > num_pages:
                    break
            except Exception as e:
                print(e, 'review appending failed...')
                continue
            try:
                next_page = driver.find_element_by_xpath(
                    f'//*[@id="mArticle"]/div[5]/div[4]/div/a[{page - 1}]')
                next_page.send_keys(Keys.RETURN)
                sleep(0.5)
            except Exception as e:
                print(e, 'next page failed.')
                continue
            if idx + 2 > num_pages:
                break

    dataframe = pd.DataFrame(data=df_rows)
    save_path = save_dataframe(search + '_review', dataframe)
    print(f"{save_path} saved")
    return save_path
# Naver Place receipt-review crawler: reads a previously crawled place list (TSV with
# a 'link' column) and collects receipt reviews, scores, dates, and visit counts.
# The class names used here ('_3XamX', '_3iTUo', 'WoYOw', ...) are Naver's generated
# class names and may change between site builds.
def review_crawl(crawled_data, basis_column='loc_name', sep='\t'):
    linkdata = pd.read_csv(crawled_data, sep=sep)
    df_rows = linkdata.drop_duplicates([basis_column], ignore_index=True)
    if 'Unnamed: 0' in linkdata.columns:
        df_rows = df_rows.drop('Unnamed: 0', axis=1)

    driver = webdriver.Chrome()
    # Recover the search term from the saved file name.
    search = crawled_data.split('_')[-1].split('.')[0]
    final_rows = []
    save_period = 10

    for idx, row in df_rows.iterrows():
        driver.get(row['link'])
        sleep(1)
        row['loc_name'] = driver.find_element_by_class_name('_3XamX').text
        print(row['loc_name'])
        sleep(1)

        # Click the receipt-review tab until the URL actually switches to it.
        while 'receipt' not in driver.current_url:
            try:
                receipt_review = driver.find_element_by_xpath(
                    '//*[@id="app-root"]/div/div[2]/div[3]/div/div/div/a[3]/span')
                receipt_review.click()
            except Exception as e:
                print(f'{e} ')
                driver.get(row['link'])
                sleep(1)
            sleep(1)

        try:
            row['receipt_num'] = int(
                driver.find_element_by_class_name('place_section_count').text)
            print(f'receipt_num : {row["receipt_num"]}')
        except Exception as e:
            print('receipt_num error', e)
        sleep(2)

        row['reviews'] = []
        row['scores'] = []
        row['dates'] = []
        # Keep clicking "more" until the button disappears or goes stale.
        try:
            more_receipt = driver.find_element_by_class_name('_3iTUo')
            while more_receipt.is_enabled():
                more_receipt.click()
                sleep(0.4)
        except StaleElementReferenceException as e:
            print(f"{e}, it's ok .. go to next link")
        except NoSuchElementException as e:
            print(f"{e}, maybe no review... it's ok .. go to next link")

        try:
            review_elems = driver.find_elements_by_class_name('WoYOw')
            review_scores = driver.find_elements_by_class_name('_3qIdi')
            review_infos = driver.find_elements_by_class_name('_2wZjV')
            # The info elements repeat in groups of three; the second is the date and
            # the third is the visit count.
            review_dates = [x for i, x in enumerate(review_infos) if i % 3 == 1]
            visit_nums = [x for i, x in enumerate(review_infos) if i % 3 == 2]
            print(f'receipt_elem_num : {len(review_elems)}, {len(review_scores)}, '
                  f'{len(review_dates)}')
            row['reviews'] = {i: elem.text for i, elem in enumerate(review_elems)}
            row['scores'] = {i: float(score.text) for i, score in enumerate(review_scores)}
            row['dates'] = {i: date.text for i, date in enumerate(review_dates)}
            row['visit_num'] = {i: num.text for i, num in enumerate(visit_nums)}
            print(f"{row['reviews']} \n {row['scores']}\n {row['dates']}\n {row['visit_num']}")
            sleep(0.5)
        except Exception as e:
            print(f'{e} ??')

        final_rows.append(row)
        # Periodically checkpoint the results to disk.
        if idx % save_period == 0:
            dataframe = pd.DataFrame(data=final_rows)
            save_path = save_dataframe(search + '_review', dataframe)
            print(f"{idx}... {save_path} saved")

    dataframe = pd.DataFrame(data=final_rows)
    save_path = save_dataframe(search + '_review', dataframe)
    print(f"{save_path} saved")
    return save_path
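# crawl_review_data_from_list and review_crawl above store each place's reviews,
# scores, and dates as dict columns inside a single row. The helper below is a
# hypothetical post-processing sketch (not part of the original pipeline) showing one
# way to explode those saved dict columns into one row per review.
def explode_reviews(review_tsv_path):
    import ast
    df = pd.read_csv(review_tsv_path, sep='\t')
    long_rows = []
    for _, place in df.iterrows():
        if not isinstance(place.get('reviews'), str):
            continue  # row with no crawled reviews
        reviews = ast.literal_eval(place['reviews'])  # dicts are saved as their string repr
        scores = ast.literal_eval(place['scores'])
        dates = ast.literal_eval(place['dates'])
        for key, text in reviews.items():
            long_rows.append({
                'loc_name': place.get('loc_name', ''),
                'review': text,
                'score': scores.get(key, ''),
                'date': dates.get(key, ''),
            })
    return pd.DataFrame(long_rows)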
# DiningCode review crawler: searches each place from a prepared pilot list and pulls
# its review texts, dates, and star scores.
def crawl_diningcode():
    df_rows = []
    # search = input("Enter a search term: ")
    place_list = pd.read_csv(
        '/Users/dhkim/PycharmProjects/RealTastySpot/data/pilot_lists2.csv')
    print(place_list.head())
    search_list = place_list.name.values
    driver = webdriver.Chrome()
    base_url = 'https://www.diningcode.com/'
    driver.get(base_url)
    sleep(1)

    for idx, search in enumerate(search_list):
        if idx < 35:  # resume point from a previous, partially completed run
            continue
        # Narrow the search by appending the district name "영등포" (Yeongdeungpo).
        search_window = driver.find_element_by_xpath('//*[@id="txt_keyword"]')
        search_window.send_keys(search + " 영등포")
        sleep(1)
        search_window.send_keys(Keys.RETURN)
        sleep(3)
        try:
            click_link = driver.find_element_by_xpath('//*[@id="div_rn"]/ul/li/a')
            click_link.send_keys(Keys.ENTER)
        except:
            print(f'there is no place {search}')
            continue
        sleep(2)
        # Close the current tab and switch to the remaining window with the place page.
        driver.close()
        sleep(1)
        driver.switch_to.window(driver.window_handles[0])

        # Keep clicking "more reviews" until it disappears, scrolling as we go.
        while True:
            try:
                more_review = driver.find_element_by_id('div_more_review')
                more_review.click()
                sleep(0.5)
                click_nolink_for_scrollDown(driver, 3)
                sleep(0.2)
            except:
                break

        # scores = driver.find_elements_by_class_name('point-detail')
        dates = driver.find_elements_by_class_name('star-date')
        reviews = driver.find_elements_by_class_name('review_contents')
        # The star rating is rendered as a CSS width percentage on the <i> element
        # inside each 'star-date' block; convert it to a 0-5 score.
        scores = [
            float(x.find_element_by_class_name('star').find_element_by_tag_name(
                'i').get_attribute('style').split(' ')[1].replace('%;', '')) / 100 * 5
            for x in dates
        ]
        for score, date, review in zip(scores, dates, reviews):
            try:
                row = {
                    'name': search,
                    'review': review.text,
                    'date': date.text,
                    'score': score
                }
                df_rows.append(row)
                sleep(0.1)
            except Exception as e:
                print(e)

    dataframe = pd.DataFrame(df_rows)
    save_path = save_dataframe('crawl_diningcode' + '_review', dataframe)
    print(f"{save_path} saved")
    return save_path
# DiningCode favorites crawler: for each place in the pilot list, opens the list of
# users who bookmarked it and records who liked it and when.
def crawl_diningcode_favor():
    df_rows = []
    # search = input("Enter a search term: ")
    place_list = pd.read_csv(
        '/Users/dhkim/PycharmProjects/RealTastySpot/data/pilot_lists2.csv')
    print(place_list.head())
    search_list = place_list.name.values
    driver = webdriver.Chrome()
    base_url = 'https://www.diningcode.com/'
    driver.get(base_url)
    sleep(1)

    for idx, search in enumerate(search_list):
        if idx < 15:  # resume point from a previous, partially completed run
            continue
        # Narrow the search by appending the district name "영등포" (Yeongdeungpo).
        search_window = driver.find_element_by_xpath('//*[@id="txt_keyword"]')
        search_window.send_keys(search + " 영등포")
        sleep(1)
        search_window.send_keys(Keys.RETURN)
        sleep(3)
        try:
            click_link = driver.find_element_by_xpath('//*[@id="div_rn"]/ul/li/a')
            click_link.send_keys(Keys.ENTER)
        except:
            print(f'there is no place {search}')
            continue
        sleep(2)
        # Close the current tab and switch to the remaining window with the place page.
        driver.close()
        sleep(1)
        driver.switch_to.window(driver.window_handles[0])

        try:
            # Open the list of users who favorited this place from the profile section.
            driver.find_element_by_id('div_profile').find_elements_by_class_name(
                's-list')[1].find_element_by_class_name(
                    'favor').find_element_by_tag_name('a').click()
            sleep(1)
            counts = driver.find_element_by_id('lbl_favorites_count').text
            sleep(0.1)
            body = driver.find_element_by_css_selector('body')
            body.click()
            # Page down enough times to load the whole favorites list.
            for i in range(np.clip(int(counts) // 3, 1, int(counts) // 2)):
                sleep(0.3)
                body.send_keys(Keys.PAGE_DOWN)
            favor_str = driver.find_element_by_id('ul_favorites_list').text
            favor_list = favor_str.split('\n')
            # The list alternates user name and like date.
            people = islice(favor_list, 0, len(favor_list) - 1, 2)
            dates = islice(favor_list, 1, len(favor_list) - 1, 2)
            for person, date in zip(people, dates):
                if '년' not in date:  # dates in the current year omit the year ("년")
                    date = '2020년 ' + date
                rows = {'name': search, 'person': person, 'like_date': date}
                df_rows.append(rows)
        except Exception as e:
            driver.get(base_url)
            sleep(1)
            print(e)

    dataframe = pd.DataFrame(df_rows)
    save_path = save_dataframe('crawl_diningcode_favor' + '_review', dataframe)
    print(f"{save_path} saved")
    return save_path
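# A typical invocation, sketched under the assumption that this module is run
# directly: crawl a place list first, then feed the saved file into the matching
# review crawler. basis_column must name a column that actually exists in the saved
# list (the Naver list above stores 'link' but not 'loc_name').
if __name__ == '__main__':
    list_path = crawl_data_naver()
    review_path = review_crawl(list_path, basis_column='link')
    print(f'review data saved to {review_path}')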