def proxy_test(proxy_list, test_url='http://tabelog.com/tokyo/A1307/A130701/13001058/'):
    """Probe each proxy in *proxy_list* and record the usable ones.

    A proxy is considered usable when it returns a page body and that body
    does not contain the tabelog access-restriction notice.  Usable proxies
    are appended to 'useful_proxy.csv'.

    proxy_list -- iterable of proxy URLs to test
    test_url   -- page fetched through each proxy (defaults to the tabelog
                  page the original code hard-coded, so callers are unaffected)
    """
    for proxy in proxy_list:
        buf = get_url_source_by_proxy(test_url, proxy, 'UTF-8')
        # Keep only proxies that fetched the page and were not blocked.
        if buf is not None and buf.find('アクセスが制限されています') == -1:
            crawling_module.write_text_to_file_by_utf8('useful_proxy.csv', proxy, 0)
def write_data_to_file_by_url(data_page_url):
    """Fetch one detail page through a fixed proxy and append its data to 'result.csv'.

    Extracts the name found between '<dt>名前</dt>' and '</dd>', strips the
    '<dd>' tag and all whitespace, then writes a quoted CSV row of
    (name, url).  Progress is logged to 'logger.csv'.
    """
    buf = crawling_module.get_url_source_by_proxy(
        data_page_url, 'http://202.106.16.36:3128', 'UTF-8')
    crawling_module.write_text_to_file_by_utf8(
        'logger.csv', data_page_url + 'にアクセス完了しました。')
    # Bug fix: the original called .replace() on the result unconditionally,
    # which raises AttributeError when the fetch fails and returns None.
    if buf is None:
        return
    custom_name = crawling_module.get_word_between(buf, '<dt>名前</dt>', '</dd>')
    custom_name = custom_name.replace('<dd>', '')
    custom_name = "".join(custom_name.split())
    result_str = '"' + custom_name + '","' + data_page_url + '"'
    crawling_module.write_text_to_file_by_utf8('result.csv', result_str)
def write_data_page_by_page(total_page_url):
    """Crawl one bengo4.com listing page and process every detail URL on it.

    Fetches the listing page through a fixed proxy, pulls the anchor
    fragments that follow each '<div class="photo">', rebuilds the absolute
    detail-page URLs, and hands each one to write_data_to_file_by_url.
    Always returns True; when no URLs are found, a notice is logged instead.
    """
    page_source = crawling_module.get_url_source_by_proxy(
        total_page_url, 'http://202.106.16.36:3128', 'UTF-8')
    crawling_module.write_text_to_file_by_utf8(
        'logger.csv', total_page_url + 'にアクセス完了しました。')

    anchors = crawling_module.get_word_between_list(
        page_source, '<div class="photo">', '">')
    if anchors is None:
        # Nothing crawlable on this page — record that and bail out.
        crawling_module.write_text_to_file_by_utf8(
            'logger.csv', 'クローリングできるURLをみつかりませんでした')
        return True

    for fragment in anchors:
        # Collapse all whitespace first so the anchor prefix matches exactly.
        compact = ''.join(fragment.split())
        path = compact.replace('<atarget="_blank"href="', '')
        write_data_to_file_by_url('http://www.bengo4.com/' + path)
    return True
def athome_data_get(self):
    """Drive Chrome through the athome.co.jp search flow and dump the results.

    Opens the search page, triggers the site's own JS search and pagination
    functions, then writes the rendered body text to 'body.csv' and the full
    HTML to 'html.csv'.  Uses self.chrome_driver_path to locate chromedriver.
    """
    driver = webdriver.Chrome(self.chrome_driver_path)
    # Bug fix: the original never reached driver.quit() if any call below
    # raised, leaking the browser process.  try/finally guarantees cleanup.
    try:
        driver.maximize_window()
        driver.get('http://www.athome.co.jp/est_top/me_20/1_12_13')
        # Wait a while for the page load to finish.
        time.sleep(5)
        driver.execute_script('javascript:Dialog.searchList(13101)')
        time.sleep(10)
        # Move to page 10 of the search results.
        driver.execute_script('javascript:List.pageList(10)')
        time.sleep(10)
        # Visible text only, from the <body> tag.
        body_text = driver.find_element_by_tag_name('body').text
        # Page source after all JavaScript has run.
        html_text = driver.find_element_by_xpath('/html').get_attribute('outerHTML')
        # Look up the element by id; the binding was unused in the original,
        # so only the lookup (and its NoSuchElementException on failure) is kept.
        driver.find_element_by_id('MES06')
        # Write the captured content to files.
        crawling_module.write_text_to_file_by_utf8('body.csv', body_text)
        crawling_module.write_text_to_file_by_utf8('html.csv', html_text)
    finally:
        # Always release the browser when finished.
        driver.quit()
def _fetch_failed(buf):
    # A fetch counts as failed when nothing came back, access was blocked,
    # or neither shop-name markup variant is present in the page.
    return (buf is None
            or buf.find('アクセスが制限されています') != -1
            or (buf.find('<p class="mname">') == -1
                and buf.find('<span class="display-name">') == -1))


def _tidy(text, remove=()):
    # Strip the given markup fragments, then collapse all whitespace.
    for tag in remove:
        text = text.replace(tag, '')
    return "".join(text.split())


def write_data_to_file_by_url(data_page_url, PROXY):
    """Scrape one tabelog.com restaurant page and append its data as a CSV row.

    data_page_url -- URL of the restaurant detail page
    PROXY         -- proxy URL used for the first fetch attempt

    Retries through a fixed proxy and finally without a proxy when the page
    looks blocked or the expected markup is missing.  The extracted row is
    appended to '<province_code>.csv'; progress messages go to 'logger.csv'.
    If the page explicitly says it cannot be found on the FIRST fetch,
    nothing is written (matching the original control flow).
    """
    buf = crawling_module.get_url_source_by_proxy(data_page_url, PROXY, 'utf-8')
    crawling_module.write_text_to_file_by_utf8(
        'logger.csv', data_page_url + 'にアクセス完了しました。', 1)
    if buf is None or buf.find('お探しのページが見つかりません。') == -1:
        # First retry: a fixed fallback proxy.
        if _fetch_failed(buf):
            crawling_module.write_text_to_file_by_utf8(
                'logger.csv', data_page_url + 'を別プロキシで再取得しています', 1)
            buf = crawling_module.get_url_source_by_proxy(
                data_page_url, "http://120.198.243.86:80", 'utf-8')
            # Last resort: fetch directly, without any proxy.
            if _fetch_failed(buf):
                crawling_module.write_text_to_file_by_utf8(
                    'logger.csv', data_page_url + 'を本機IPで再取得しています', 1)
                buf = crawling_module.get_url_source(data_page_url, 'utf-8')
        if buf is not None and buf.find('お探しのページが見つかりません。') == -1:
            # Shop name (two markup variants; fall back to the second when
            # the first yields an empty string).
            custom_name = _tidy(
                crawling_module.get_word_between(
                    buf, '<p class="mname"><strong>', '</strong>'),
                ('</strong>', '</p>'))
            if custom_name == '':
                custom_name = _tidy(
                    crawling_module.get_word_between(
                        buf, '<span class="display-name">', '</span>'),
                    ('</strong>', '</p>'))
            # Review count.
            comment = _tidy(
                crawling_module.get_word_between(
                    buf, '<em class="num" property="v:count">', '</em>'),
                ('<dd>',))
            # Kana reading: the parenthesised part of the name block.
            furikana = _tidy(
                crawling_module.get_word_between(
                    crawling_module.get_word_between(
                        buf, '<p class="mname"><strong>', '</p>'),
                    '(', ')'),
                ('<dd>',))
            # CENA online-booking availability (detected via a widget script).
            if buf.find('_side_calendar_widget.js?1422849891') == -1:
                net_booking = 'ネット予約不可'
            else:
                net_booking = 'ネット予約可'
            # Shop status: closed / listing pending / open.
            if buf.find('このお店は現在閉店しております') != -1:
                shop_status = '閉店'
            elif buf.find('rst-status-badge-large rst-st-pending') == -1:
                shop_status = '営業中'
            else:
                shop_status = '掲載保留'
            # Membership status.
            if buf.find('このレストランは食べログ店舗会員に登録しているため、ユーザの皆様は編集することができません。') == -1:
                membership = '非会員'
            elif buf.find('<div class="listing">') != -1:
                membership = '無料会員'
            else:
                membership = '有料会員'
            # Genre(s).
            genre = _tidy(
                crawling_module.get_word_between_to_total_string(
                    buf, '<span property="v:category">', '</span>'),
                ('<dd>',))
            # Phone / IP phone (blank for closed shops; if only the v:tel
            # number exists it is promoted to the main phone field).
            tel = ''
            ipp = ''
            if shop_status != '閉店':
                tel = _tidy(
                    crawling_module.get_word_between(
                        buf, '<p class="ppc-sub">', '</strong>'),
                    ('<strong>',))
                ipp = _tidy(
                    crawling_module.get_word_between(
                        buf, '<strong property="v:tel">', '</strong>'),
                    ('<strong>',))
                if tel == '':
                    tel = ipp
                    ipp = ''
            # Nearest station / access.
            station = _tidy(
                crawling_module.get_word_between(buf, '<th>交通手段</th>', '</td>'),
                ('<td>', '<p>', '</p>'))
            # Overall / lunch / dinner scores.
            total_score = _tidy(
                crawling_module.get_word_between(
                    buf,
                    '<strong class="score" rel="v:rating"><span property="v:average">',
                    '</span>'),
                ('<dd>',))
            day_score = _tidy(
                crawling_module.get_word_between(
                    buf, '<span class="lunch">昼の点数:</span><em>', '</em>'),
                ('<dd>',))
            night_score = _tidy(
                crawling_module.get_word_between(
                    buf, '<span class="dinner">夜の点数:</span><em>', '</em>'),
                ('<dd>',))
            # Ordinary (phone) reservation availability.
            if buf.find('予約可') != -1:
                booking = '予約可'
            elif buf.find('予約不可') != -1:
                booking = '予約不可'
            else:
                booking = ''
            # Address: strip the RDFa spans and the area-link anchors.
            address = crawling_module.get_word_between(buf, '<p rel="v:addr">', '</p>')
            for tag in ('<span property="v:region">',
                        '<span property="v:locality">',
                        '<span property="v:street-address">',
                        '</span>', '</a>'):
                address = address.replace(tag, '')
            for pattern in (r'<a href="/\w*/" class="listlink">',
                            r'<a href="/\w*/\w*/" class="listlink">',
                            r'<a href="/\w*/\w*/\w*/" class="listlink">',
                            r'<a href="/\w*/\w*/\w*/\w*/" class="listlink">'):
                address = crawling_module.replace_str_by_regex_count(
                    address, pattern, '')
            # Latitude/longitude, taken from the static-map URL parameters.
            geoCode = _tidy(
                crawling_module.get_word_between(buf, 'center=', '&markers='),
                ('<dd>',))
            # Opening hours.
            open_time = _tidy(
                crawling_module.get_word_between(buf, '<th>営業時間</th>', '</td>'),
                ('<td>', '<p>', '</p>'))
            # Average budget, lunch and dinner.
            day_cost = _tidy(
                crawling_module.get_word_between(
                    buf, '[昼]</span><span class="price">', '</span>'),
                ('<dd>',))
            night_cost = _tidy(
                crawling_module.get_word_between(
                    buf, '[夜]</span><span class="price">', '</span>'),
                ('<dd>',))
            # Seat count.
            seats = _tidy(
                crawling_module.get_word_between(buf, '<th>席数</th>', '</td>'),
                ('</strong>', '<strong>', '<td>', '<p>', '</p>'))
            # Coupon availability.
            if buf.find('<strong>お得なクーポン</strong>') == -1:
                coupon = 'クーポン情報なし'
            else:
                coupon = 'クーポン情報あり'
            # Regular holidays.
            holiday = _tidy(
                crawling_module.get_word_between(buf, '<th>定休日</th>', '</td>'),
                ('<dd>', '<td>', '<p>', '</p>'))
            # Accepted credit cards.
            credit_card = _tidy(
                crawling_module.get_word_between(buf, '<th>カード</th>', '</td>'),
                ('<strong>', '<td>', '<p>', '</p>'))
            # Page-view counters: total, last week, week before last.
            PV_total = _tidy(
                crawling_module.get_word_between(buf, 'アクセス数 <em>', '</em>'),
                ('<dd>', '<td>', '<p>', '</p>'))
            PV_last_week = _tidy(
                crawling_module.get_word_between(
                    buf, '先週のアクセス数:</span><em>', '</em>'),
                ('<dd>', '<td>', '<p>', '</p>'))
            PV_last_week_before = _tidy(
                crawling_module.get_word_between(
                    buf, '先々週のアクセス数:</span><em>', '</em>'),
                ('<dd>', '<td>', '<p>', '</p>'))
            # Premium coupon availability.
            if buf.find('<span class="pcoupon-item-lead">') == -1:
                pre_coupon = 'プレーミアムクーポン情報なし'
            else:
                pre_coupon = 'プレーミアムクーポン情報あり'
            # Prefecture/area code: the leading digits of the page id left
            # after stripping the fixed URL prefix.
            tempstr = data_page_url.replace(
                'http://tabelog.com/tokyo/A1307/A130701/', '')
            if len(tempstr) == 7:
                province_code = tempstr[:1]
            else:
                province_code = tempstr[:2]
            # Official-information badge.
            if buf.find('<a class="official-badge">公式情報あり</a>') == -1:
                official_news = '公式情報なし'
            else:
                official_news = '公式情報あり'
            # Assemble the quoted CSV row in the original column order.
            fields = (custom_name, comment, furikana, net_booking, shop_status,
                      membership, genre, tel, ipp, station,
                      total_score, day_score, night_score, booking,
                      address, geoCode, open_time, day_cost,
                      night_cost, seats, coupon, holiday,
                      credit_card, data_page_url, PV_total, PV_last_week,
                      PV_last_week_before, pre_coupon, province_code, official_news)
            result_str = '"' + '","'.join(fields) + '"'
            crawling_module.write_text_to_file_by_utf8(province_code + '.csv', result_str)
        else:
            crawling_module.write_text_to_file_by_utf8(
                'logger.csv', data_page_url + 'が見つかりませんでした。', 1)
'http://61.7.147.83:8080', 'http://61.7.213.58:3128', 'http://66.35.68.145:3127', 'http://66.35.68.145:7808', 'http://66.35.68.145:8089', 'http://69.164.213.244:3128', 'http://69.197.148.18:3127', 'http://69.197.148.18:7808', 'http://69.197.148.18:8089', 'http://78.189.30.243:8080', 'http://78.24.221.193:3128', 'http://79.142.57.118:3128', 'http://81.163.88.65:8080', 'http://83.55.33.54:8080', 'http://86.107.110.73:3127', 'http://86.107.110.73:7808', 'http://87.120.58.129:8080', 'http://88.132.82.236:8088', 'http://88.150.136.179:3129', 'http://88.150.136.180:3129', 'http://89.249.207.65:3128', 'http://91.215.108.131:3130', 'http://94.23.23.60:80' ] PROXY = proxy_change(proxy_list) crawling_module.write_text_to_file_by_utf8('logger.csv', '現在使用してるプロキシは' + PROXY, 1) ACCESS_COUNT = 0 start_list = [ 1000000, 2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000, 10000000, 11000000, 12000000, 13000000, 14000000, 15000000, 16000000, 17000000, 18000000, 19000000, 20000000, 21000000, 22000000, 23000000, 24000000, 25000000, 26000000, 27000000, 28000000, 29000000, 30000000, 31000000, 32000000, 33000000, 34000000, 35000000, 36000000, 37000000, 38000000, 39000000, 40000000, 41000000, 42000000, 43000000, 44000000, 45000000, 46000000, 47000000 ] ended_list = [ 1014267, 2008782, 3008230, 4016220, 5006963, 6007865, 7012233, 8016785, 9014790, 10015240, 11038450, 12036105, 13171150, 14057830, 15015390, 16007268, 17009425, 18006588, 19008345, 20018724, 21014785, 22027990,
ie = PAM30.PAMIE() #URLをアクセス ie.navigate('http://www.athome.co.jp/est_top/me_20/1_12_13') #ロード完了までしばらく待つ time.sleep(5) #下記コードがIEオブジェクトを駆使してJS関数を実行 ie.executeJavaScript('javascript:Dialog.searchList(13101)') time.sleep(10) #検索結果に10番目ページに遷移 ie.executeJavaScript('javascript:List.pageList(10)') time.sleep(10) #JS実行完了のページソースを取得 buf = ie.outerHTML() #別コードセットからUTF-8コードに変更 #html = buf.decode('shift_jis').encode('utf-8') # ......data analysis...... #ここは取得したページソースを処理、処理データをファイルに書き込み #取得した内容をファイルに書き込み crawling_module.write_text_to_file_by_utf8('buf.csv', buf) #必ず使用完了のIEオブジェクトを解放 ie.quit() #---------------------------------------------------------------------------------------------