def crawling_pelicana():
    results = []

    # Unlike range, count() with no end runs forever, so we break explicitly below.
    for page in count(start=1):  # range(1, 6):
        url = f'http://pelicana.co.kr/store/stroe_search.html?page={page}&branch_name=&gu=&si='
        print(url)

        html = collection.crawling(url)
        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # An empty page means we have crawled past the last one.
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            # print(strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            # print(sidogu)
            results.append((name, address) + tuple(sidogu))

    # store
    for t in results:
        print(t)

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    print(table['sido'], table['gungu'])

    table['sido'] = table.sido.apply(lambda v: collection.sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: collection.gungu_dict.get(v, v))

    # table = table.reset_index().set_index('index')
    table.to_csv('{0}/table-pelicana.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_pelicana():
    results = []

    # collection
    for page in count(start=1):  # crawl page 1, 2, ... until an empty page appears
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?branch_name=&gu=&si=&page=%d' % page

        html = collection.crawling(url)
        bs = BeautifulSoup(html, 'html.parser')

        # Narrow down the tags step by step
        tag_table = bs.find('table', attrs={'class': 'table mt20'})  # the table
        tag_tbody = tag_table.find('tbody')                          # the table's body
        tags_tr = tag_tbody.findAll('tr')                            # the rows

        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))

    # store
    for t in results:
        print(t)

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: collection.sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: collection.gungu_dict.get(v, v))

    table = table.reset_index().set_index('index')
    table.to_csv('{0}/table_pelicana.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_pelicana():
    # collection
    results = []

    for page in range(1, 118):
        url = f'http://pelicana.co.kr/store/stroe_search.html?page={page}&branch_name=&gu=&si='

        html = collection.crawling(url)
        soup = BeautifulSoup(html, 'html.parser')

        tag_table = soup.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.find_all('tr')

        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: collection.sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: collection.gungu_dict.get(v, v))

    # table = table.reset_index(drop=True).set_index('no')
    table.to_csv('{0}/table-pelicana.csv'.format(RESULT_DIRECTORY),
                 encoding='UTF-8', mode='w')
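All three variants normalize the parsed 시/도 and 군/구 tokens with the dict.get(v, v) idiom, which maps known names through collection.sido_dict / collection.gungu_dict and leaves unmapped values unchanged. A minimal sketch of that pattern, with hypothetical mapping entries:

import pandas as pd

# Hypothetical mapping entries; the real ones live in collection.sido_dict.
sido_dict = {'서울': '서울특별시', '경기': '경기도'}

table = pd.DataFrame([('서울', '강남구'), ('부산', '해운대구')],
                     columns=['sido', 'gungu'])

# dict.get(v, v): return the mapped name if v is a key, otherwise v itself.
table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
print(table.sido.tolist())  # ['서울특별시', '부산'] - unknown '부산' passes through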
import collection
import analyze
import visualization

if __name__ == '__main__':
    items = [{
        'pagename': 'jtbcnews',
        'since': '2018-05-01',
        'until': '2018-05-31'
    }, {
        'pagename': 'chosun',
        'since': '2018-05-01',
        'until': '2018-05-31'
    }]

    # collection
    for item in items:
        result_file = collection.crawling(**item)
        item['resultfile'] = result_file

    # analyze
    for item in items:
        print(item['resultfile'])

    # visualization
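Each item dict is unpacked into keyword arguments via **item. A toy stand-in for collection.crawling (hypothetical body, real return value is a result-file path) makes the equivalence explicit:

item = {'pagename': 'jtbcnews', 'since': '2018-05-01', 'until': '2018-05-31'}

def crawling(pagename, since, until):
    # Toy stand-in for collection.crawling, just to show the call shape.
    return '%s_%s_%s.json' % (pagename, since, until)

# crawling(**item) unpacks the dict into keyword arguments, i.e. the same as
# crawling(pagename='jtbcnews', since='2018-05-01', until='2018-05-31')
print(crawling(**item))  # jtbcnews_2018-05-01_2018-05-31.json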
def store_nene(data):
    table = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: collection.sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: collection.gungu_dict.get(v, v))

    table = table.reset_index().set_index('index')
    table.to_csv('{0}/table_nene.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)


if __name__ == '__main__':
    # pelicana collection
    crawling_pelicana()

    # nene collection
    collection.crawling(
        # The Korean query values are URL-encoded automatically:
        # http://nenechicken.com/subpage/where_list.asp?target_step2=전체&proc_type=step1&target_step1=전체
        # http://nenechicken.com/subpage/where_list.asp?target_step2=%EC%A0%84%EC%B2%B4&proc_type=step1&target_step1=%EC%A0%84%EC%B2%B4
        # The encoded parts are substituted via %s. Why? For readability.
        url='http://nenechicken.com/subpage/where_list.asp?target_step2=%s&proc_type=step1&target_step1=%s'
            % (urllib.parse.quote('전체'), urllib.parse.quote('전체')),
        proc=proc_nene,
        store=store_nene)
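urllib.parse.quote performs exactly the percent-encoding shown in the comments above; a quick check with the standard library:

import urllib.parse

# '전체' ("all") percent-encodes to the byte sequence seen in the URL above.
encoded = urllib.parse.quote('전체')
print(encoded)                        # %EC%A0%84%EC%B2%B4
print(urllib.parse.unquote(encoded))  # 전체 - it round-trips back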
if __name__ == '__main__':
    items = [{
        "pagename": "jtbcnews",
        "since": '2017-01-01',
        "until": '2017-10-17'
    }, {
        "pagename": "chosun",
        "since": '2017-01-01',
        "until": '2017-10-17'
    }]

    # HTTP Error 500: Internal Server Error -> presumably means there is no data
    # collection
    for item in items:
        result_file = collection.crawling(**item, fetch=True)
        item['result_file'] = result_file

    # analysis
    for item in items:
        # print(item['result_file'])
        data = analyze.json_to_str(item['result_file'], 'message')
        item['count'] = analyze.count_word_freq(data)

    # visualization
    for item in items:
        count = item['count']
        count_top50 = dict(count.most_common(50))
        file_name = '%s_%s_%s' % (item['pagename'], item['since'], item['until'])
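analyze.count_word_freq evidently returns a collections.Counter, since most_common(50) is called on it; most_common(n) yields at most n (word, frequency) pairs in descending frequency order:

from collections import Counter

# Frequencies as a tokenizer might produce them from the crawled messages.
count = Counter(['뉴스', '정치', '뉴스', '경제', '뉴스', '정치'])

count_top50 = dict(count.most_common(50))  # at most 50 pairs, most frequent first
print(count_top50)  # {'뉴스': 3, '정치': 2, '경제': 1}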
until = '2017-10-01'

if __name__ == '__main__':
    items = [{
        'pagename': 'jtbcnews',
        'since': '2017-10-01',
        'until': '2017-10-17'
    }, {
        'pagename': 'chosun',
        'since': '2017-10-01',
        'until': '2017-10-17'
    }]

    # collection
    for item in items:
        resultfile = collection.crawling(**item, fetch=False)
        item['resultfile'] = resultfile

    # analysis
    for item in items:
        data = analyze.json_to_str(item['resultfile'], 'message')
        item['count'] = analyze.count_wordfreq(data)

    # visualization
    for item in items:
        count = item['count']
        count_t50 = dict(count.most_common(50))
        filename = '%s_%s_%s' % (item['pagename'], item['since'], item['until'])

        visualize.wordcloud(count_t50, filename)
        visualize.graph_bar(values=list(count_t50.values()),
                            # the original snippet was truncated here; the
                            # remaining arguments below are assumed, not sourced
                            labels=list(count_t50.keys()),
                            title=filename)
import urllib.parse

import collection


def proc_nene(xml):
    pass


def store_nene(data):
    pass


if __name__ == '__main__':
    # nene collection
    collection.crawling(
        url='http://nenechicken.com/subpage/where_list.asp?target_step2=%s&proc_type=step1&target_step1=%s'
            % (urllib.parse.quote('전체'), urllib.parse.quote('전체')),
        proc=proc_nene,
        store=store_nene)
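proc_nene is still a stub here. A minimal sketch of what the filled-in callback might look like, assuming collection.crawling hands the fetched body to proc and forwards its return value to store, and assuming each store sits in an <item> element with <aname1>/<addr> children (element names are guesses, not confirmed against the live where_list.asp response):

from bs4 import BeautifulSoup

def proc_nene(xml):
    # Sketch only: the <item>/<aname1>/<addr> element names are assumptions
    # and must be checked against the actual response.
    results = []
    soup = BeautifulSoup(xml, 'html.parser')
    for tag in soup.find_all('item'):
        name = tag.find('aname1').get_text(strip=True)
        address = tag.find('addr').get_text(strip=True)
        sidogu = address.split()[:2]  # leading 시/도 and 군/구 tokens
        results.append((name, address) + tuple(sidogu))
    return results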