def proxy_test(proxy_list, test_url='http://tabelog.com/tokyo/A1307/A130701/13001058/'):
    """Probe each proxy in *proxy_list* and record the usable ones.

    A proxy is considered usable when it returns a page body and that body
    does not contain the tabelog access-restriction notice.  Usable proxies
    are appended to 'useful_proxy.csv'.

    proxy_list -- iterable of proxy URLs to test
    test_url   -- page fetched through each proxy (defaults to the tabelog
                  page the original code hard-coded, so callers are unaffected)
    """
    for proxy in proxy_list:
        buf = get_url_source_by_proxy(test_url, proxy, 'UTF-8')
        # Keep only proxies that fetched the page and were not blocked.
        if buf is not None and buf.find('アクセスが制限されています') == -1:
            crawling_module.write_text_to_file_by_utf8('useful_proxy.csv', proxy, 0)
def write_data_to_file_by_url(data_page_url):
    """Fetch one detail page through a fixed proxy and append its data to 'result.csv'.

    Extracts the name found between '<dt>名前</dt>' and '</dd>', strips the
    '<dd>' tag and all whitespace, then writes a quoted CSV row of
    (name, url).  Progress is logged to 'logger.csv'.
    """
    buf = crawling_module.get_url_source_by_proxy(
        data_page_url, 'http://202.106.16.36:3128', 'UTF-8')
    crawling_module.write_text_to_file_by_utf8(
        'logger.csv', data_page_url + 'にアクセス完了しました。')
    # Bug fix: the original called .replace() on the result unconditionally,
    # which raises AttributeError when the fetch fails and returns None.
    if buf is None:
        return
    custom_name = crawling_module.get_word_between(buf, '<dt>名前</dt>', '</dd>')
    custom_name = custom_name.replace('<dd>', '')
    custom_name = "".join(custom_name.split())
    result_str = '"' + custom_name + '","' + data_page_url + '"'
    crawling_module.write_text_to_file_by_utf8('result.csv', result_str)
def write_data_page_by_page(total_page_url):
    """Crawl one bengo4.com listing page and process every detail URL on it.

    Fetches the listing page through a fixed proxy, pulls the anchor
    fragments that follow each '<div class="photo">', rebuilds the absolute
    detail-page URLs, and hands each one to write_data_to_file_by_url.
    Always returns True; when no URLs are found, a notice is logged instead.
    """
    page_source = crawling_module.get_url_source_by_proxy(
        total_page_url, 'http://202.106.16.36:3128', 'UTF-8')
    crawling_module.write_text_to_file_by_utf8(
        'logger.csv', total_page_url + 'にアクセス完了しました。')

    anchors = crawling_module.get_word_between_list(
        page_source, '<div class="photo">', '">')
    if anchors is None:
        # Nothing crawlable on this page — record that and bail out.
        crawling_module.write_text_to_file_by_utf8(
            'logger.csv', 'クローリングできるURLをみつかりませんでした')
        return True

    for fragment in anchors:
        # Collapse all whitespace first so the anchor prefix matches exactly.
        compact = ''.join(fragment.split())
        path = compact.replace('<atarget="_blank"href="', '')
        write_data_to_file_by_url('http://www.bengo4.com/' + path)
    return True
def athome_data_get(self):
    """Drive Chrome through the athome.co.jp search flow and dump the results.

    Opens the search page, triggers the site's own JS search and pagination
    functions, then writes the rendered body text to 'body.csv' and the full
    HTML to 'html.csv'.  Uses self.chrome_driver_path to locate chromedriver.
    """
    driver = webdriver.Chrome(self.chrome_driver_path)
    # Bug fix: the original never reached driver.quit() if any call below
    # raised, leaking the browser process.  try/finally guarantees cleanup.
    try:
        driver.maximize_window()
        driver.get('http://www.athome.co.jp/est_top/me_20/1_12_13')
        # Wait a while for the page load to finish.
        time.sleep(5)
        driver.execute_script('javascript:Dialog.searchList(13101)')
        time.sleep(10)
        # Move to page 10 of the search results.
        driver.execute_script('javascript:List.pageList(10)')
        time.sleep(10)
        # Visible text only, from the <body> tag.
        body_text = driver.find_element_by_tag_name('body').text
        # Page source after all JavaScript has run.
        html_text = driver.find_element_by_xpath('/html').get_attribute('outerHTML')
        # Look up the element by id; the binding was unused in the original,
        # so only the lookup (and its NoSuchElementException on failure) is kept.
        driver.find_element_by_id('MES06')
        # Write the captured content to files.
        crawling_module.write_text_to_file_by_utf8('body.csv', body_text)
        crawling_module.write_text_to_file_by_utf8('html.csv', html_text)
    finally:
        # Always release the browser when finished.
        driver.quit()
def _fetch_failed(buf):
    # A fetch counts as failed when nothing came back, access was blocked,
    # or neither shop-name markup variant is present in the page.
    return (buf is None
            or buf.find('アクセスが制限されています') != -1
            or (buf.find('<p class="mname">') == -1
                and buf.find('<span class="display-name">') == -1))


def _tidy(text, remove=()):
    # Strip the given markup fragments, then collapse all whitespace.
    for tag in remove:
        text = text.replace(tag, '')
    return "".join(text.split())


def write_data_to_file_by_url(data_page_url, PROXY):
    """Scrape one tabelog.com restaurant page and append its data as a CSV row.

    data_page_url -- URL of the restaurant detail page
    PROXY         -- proxy URL used for the first fetch attempt

    Retries through a fixed proxy and finally without a proxy when the page
    looks blocked or the expected markup is missing.  The extracted row is
    appended to '<province_code>.csv'; progress messages go to 'logger.csv'.
    If the page explicitly says it cannot be found on the FIRST fetch,
    nothing is written (matching the original control flow).
    """
    buf = crawling_module.get_url_source_by_proxy(data_page_url, PROXY, 'utf-8')
    crawling_module.write_text_to_file_by_utf8(
        'logger.csv', data_page_url + 'にアクセス完了しました。', 1)
    if buf is None or buf.find('お探しのページが見つかりません。') == -1:
        # First retry: a fixed fallback proxy.
        if _fetch_failed(buf):
            crawling_module.write_text_to_file_by_utf8(
                'logger.csv', data_page_url + 'を別プロキシで再取得しています', 1)
            buf = crawling_module.get_url_source_by_proxy(
                data_page_url, "http://120.198.243.86:80", 'utf-8')
            # Last resort: fetch directly, without any proxy.
            if _fetch_failed(buf):
                crawling_module.write_text_to_file_by_utf8(
                    'logger.csv', data_page_url + 'を本機IPで再取得しています', 1)
                buf = crawling_module.get_url_source(data_page_url, 'utf-8')
        if buf is not None and buf.find('お探しのページが見つかりません。') == -1:
            # Shop name (two markup variants; fall back to the second when
            # the first yields an empty string).
            custom_name = _tidy(
                crawling_module.get_word_between(
                    buf, '<p class="mname"><strong>', '</strong>'),
                ('</strong>', '</p>'))
            if custom_name == '':
                custom_name = _tidy(
                    crawling_module.get_word_between(
                        buf, '<span class="display-name">', '</span>'),
                    ('</strong>', '</p>'))
            # Review count.
            comment = _tidy(
                crawling_module.get_word_between(
                    buf, '<em class="num" property="v:count">', '</em>'),
                ('<dd>',))
            # Kana reading: the parenthesised part of the name block.
            furikana = _tidy(
                crawling_module.get_word_between(
                    crawling_module.get_word_between(
                        buf, '<p class="mname"><strong>', '</p>'),
                    '(', ')'),
                ('<dd>',))
            # CENA online-booking availability (detected via a widget script).
            if buf.find('_side_calendar_widget.js?1422849891') == -1:
                net_booking = 'ネット予約不可'
            else:
                net_booking = 'ネット予約可'
            # Shop status: closed / listing pending / open.
            if buf.find('このお店は現在閉店しております') != -1:
                shop_status = '閉店'
            elif buf.find('rst-status-badge-large rst-st-pending') == -1:
                shop_status = '営業中'
            else:
                shop_status = '掲載保留'
            # Membership status.
            if buf.find('このレストランは食べログ店舗会員に登録しているため、ユーザの皆様は編集することができません。') == -1:
                membership = '非会員'
            elif buf.find('<div class="listing">') != -1:
                membership = '無料会員'
            else:
                membership = '有料会員'
            # Genre(s).
            genre = _tidy(
                crawling_module.get_word_between_to_total_string(
                    buf, '<span property="v:category">', '</span>'),
                ('<dd>',))
            # Phone / IP phone (blank for closed shops; if only the v:tel
            # number exists it is promoted to the main phone field).
            tel = ''
            ipp = ''
            if shop_status != '閉店':
                tel = _tidy(
                    crawling_module.get_word_between(
                        buf, '<p class="ppc-sub">', '</strong>'),
                    ('<strong>',))
                ipp = _tidy(
                    crawling_module.get_word_between(
                        buf, '<strong property="v:tel">', '</strong>'),
                    ('<strong>',))
                if tel == '':
                    tel = ipp
                    ipp = ''
            # Nearest station / access.
            station = _tidy(
                crawling_module.get_word_between(buf, '<th>交通手段</th>', '</td>'),
                ('<td>', '<p>', '</p>'))
            # Overall / lunch / dinner scores.
            total_score = _tidy(
                crawling_module.get_word_between(
                    buf,
                    '<strong class="score" rel="v:rating"><span property="v:average">',
                    '</span>'),
                ('<dd>',))
            day_score = _tidy(
                crawling_module.get_word_between(
                    buf, '<span class="lunch">昼の点数:</span><em>', '</em>'),
                ('<dd>',))
            night_score = _tidy(
                crawling_module.get_word_between(
                    buf, '<span class="dinner">夜の点数:</span><em>', '</em>'),
                ('<dd>',))
            # Ordinary (phone) reservation availability.
            if buf.find('予約可') != -1:
                booking = '予約可'
            elif buf.find('予約不可') != -1:
                booking = '予約不可'
            else:
                booking = ''
            # Address: strip the RDFa spans and the area-link anchors.
            address = crawling_module.get_word_between(buf, '<p rel="v:addr">', '</p>')
            for tag in ('<span property="v:region">',
                        '<span property="v:locality">',
                        '<span property="v:street-address">',
                        '</span>', '</a>'):
                address = address.replace(tag, '')
            for pattern in (r'<a href="/\w*/" class="listlink">',
                            r'<a href="/\w*/\w*/" class="listlink">',
                            r'<a href="/\w*/\w*/\w*/" class="listlink">',
                            r'<a href="/\w*/\w*/\w*/\w*/" class="listlink">'):
                address = crawling_module.replace_str_by_regex_count(
                    address, pattern, '')
            # Latitude/longitude, taken from the static-map URL parameters.
            geoCode = _tidy(
                crawling_module.get_word_between(buf, 'center=', '&markers='),
                ('<dd>',))
            # Opening hours.
            open_time = _tidy(
                crawling_module.get_word_between(buf, '<th>営業時間</th>', '</td>'),
                ('<td>', '<p>', '</p>'))
            # Average budget, lunch and dinner.
            day_cost = _tidy(
                crawling_module.get_word_between(
                    buf, '[昼]</span><span class="price">', '</span>'),
                ('<dd>',))
            night_cost = _tidy(
                crawling_module.get_word_between(
                    buf, '[夜]</span><span class="price">', '</span>'),
                ('<dd>',))
            # Seat count.
            seats = _tidy(
                crawling_module.get_word_between(buf, '<th>席数</th>', '</td>'),
                ('</strong>', '<strong>', '<td>', '<p>', '</p>'))
            # Coupon availability.
            if buf.find('<strong>お得なクーポン</strong>') == -1:
                coupon = 'クーポン情報なし'
            else:
                coupon = 'クーポン情報あり'
            # Regular holidays.
            holiday = _tidy(
                crawling_module.get_word_between(buf, '<th>定休日</th>', '</td>'),
                ('<dd>', '<td>', '<p>', '</p>'))
            # Accepted credit cards.
            credit_card = _tidy(
                crawling_module.get_word_between(buf, '<th>カード</th>', '</td>'),
                ('<strong>', '<td>', '<p>', '</p>'))
            # Page-view counters: total, last week, week before last.
            PV_total = _tidy(
                crawling_module.get_word_between(buf, 'アクセス数 <em>', '</em>'),
                ('<dd>', '<td>', '<p>', '</p>'))
            PV_last_week = _tidy(
                crawling_module.get_word_between(
                    buf, '先週のアクセス数:</span><em>', '</em>'),
                ('<dd>', '<td>', '<p>', '</p>'))
            PV_last_week_before = _tidy(
                crawling_module.get_word_between(
                    buf, '先々週のアクセス数:</span><em>', '</em>'),
                ('<dd>', '<td>', '<p>', '</p>'))
            # Premium coupon availability.
            if buf.find('<span class="pcoupon-item-lead">') == -1:
                pre_coupon = 'プレーミアムクーポン情報なし'
            else:
                pre_coupon = 'プレーミアムクーポン情報あり'
            # Prefecture/area code: the leading digits of the page id left
            # after stripping the fixed URL prefix.
            tempstr = data_page_url.replace(
                'http://tabelog.com/tokyo/A1307/A130701/', '')
            if len(tempstr) == 7:
                province_code = tempstr[:1]
            else:
                province_code = tempstr[:2]
            # Official-information badge.
            if buf.find('<a class="official-badge">公式情報あり</a>') == -1:
                official_news = '公式情報なし'
            else:
                official_news = '公式情報あり'
            # Assemble the quoted CSV row in the original column order.
            fields = (custom_name, comment, furikana, net_booking, shop_status,
                      membership, genre, tel, ipp, station,
                      total_score, day_score, night_score, booking,
                      address, geoCode, open_time, day_cost,
                      night_cost, seats, coupon, holiday,
                      credit_card, data_page_url, PV_total, PV_last_week,
                      PV_last_week_before, pre_coupon, province_code, official_news)
            result_str = '"' + '","'.join(fields) + '"'
            crawling_module.write_text_to_file_by_utf8(province_code + '.csv', result_str)
        else:
            crawling_module.write_text_to_file_by_utf8(
                'logger.csv', data_page_url + 'が見つかりませんでした。', 1)
'http://61.7.147.83:8080', 'http://61.7.213.58:3128', 'http://66.35.68.145:3127', 'http://66.35.68.145:7808', 'http://66.35.68.145:8089', 'http://69.164.213.244:3128', 'http://69.197.148.18:3127', 'http://69.197.148.18:7808', 'http://69.197.148.18:8089', 'http://78.189.30.243:8080', 'http://78.24.221.193:3128', 'http://79.142.57.118:3128', 'http://81.163.88.65:8080', 'http://83.55.33.54:8080', 'http://86.107.110.73:3127', 'http://86.107.110.73:7808', 'http://87.120.58.129:8080', 'http://88.132.82.236:8088', 'http://88.150.136.179:3129', 'http://88.150.136.180:3129', 'http://89.249.207.65:3128', 'http://91.215.108.131:3130', 'http://94.23.23.60:80' ] PROXY = proxy_change(proxy_list) crawling_module.write_text_to_file_by_utf8('logger.csv', '現在使用してるプロキシは' + PROXY, 1) ACCESS_COUNT = 0 start_list = [ 1000000, 2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000, 10000000, 11000000, 12000000, 13000000, 14000000, 15000000, 16000000, 17000000, 18000000, 19000000, 20000000, 21000000, 22000000, 23000000, 24000000, 25000000, 26000000, 27000000, 28000000, 29000000, 30000000, 31000000, 32000000, 33000000, 34000000, 35000000, 36000000, 37000000, 38000000, 39000000, 40000000, 41000000, 42000000, 43000000, 44000000, 45000000, 46000000, 47000000 ] ended_list = [ 1014267, 2008782, 3008230, 4016220, 5006963, 6007865, 7012233, 8016785, 9014790, 10015240, 11038450, 12036105, 13171150, 14057830, 15015390, 16007268, 17009425, 18006588, 19008345, 20018724, 21014785, 22027990,
ie = PAM30.PAMIE() #URLをアクセス ie.navigate('http://www.athome.co.jp/est_top/me_20/1_12_13') #ロード完了までしばらく待つ time.sleep(5) #下記コードがIEオブジェクトを駆使してJS関数を実行 ie.executeJavaScript('javascript:Dialog.searchList(13101)') time.sleep(10) #検索結果に10番目ページに遷移 ie.executeJavaScript('javascript:List.pageList(10)') time.sleep(10) #JS実行完了のページソースを取得 buf = ie.outerHTML() #別コードセットからUTF-8コードに変更 #html = buf.decode('shift_jis').encode('utf-8') # ......data analysis...... #ここは取得したページソースを処理、処理データをファイルに書き込み #取得した内容をファイルに書き込み crawling_module.write_text_to_file_by_utf8('buf.csv', buf) #必ず使用完了のIEオブジェクトを解放 ie.quit() #---------------------------------------------------------------------------------------------