Python Spider примеры использования

Язык программирования: Python

Пространство имен/Пакет: writer

Класс/Тип: Spider

Примеров на hotexamples.com: 3

Python Spider - 3 примера найдено. Это лучшие примеры Python кода для writer.Spider, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

insert(1)

Пример #1

Показать файл

Файл: yeeyan.py Проект: immissile/42qu_github_mirror

    def parse_page(self,filepath):
        with open(filepath) as f:
            page = f.read()

            title = txt_wrap_by('<title>译言网 | ', '</ti', page)
            tags_wrapper = txt_wrap_by('wumiiTags = "', '"', page)
            tags = tags_wrapper.split(',')
            author = txt_wrap_by('<h2 id="user_info"', '/a', page)
            author = txt_wrap_by('">','<',author)
            rating = txt_wrap_by('已有<span class="number">', '</span', page)
            content_wrapper = txt_wrap_by('id="conBox">','<div class="article_content">',page)
            url = txt_wrap_by('wumiiPermaLink = "','"',page)
            if content_wrapper:
                content,pic_list = htm2txt(content_wrapper)
            else:
                return 

            content = str(content)

            reply_wrapper_list = txt_wrap_by_all('class="comment_content">', '</ul', page)
            reply_list = []
            for reply_wrapper in reply_wrapper_list:
                reply_list.append(txt_wrap_by('<p>', '</p', reply_wrapper))

            Spider.insert(title, tags, content, author, rating ,url, reply_list, pic_list)

Пример #2

Показать файл

Файл: zhihu_explorer.py Проект: xqk/42qu_github_mirror

def main():
    cookies = ((
        '*****@*****.**',
        '_xsrf=7ed86e897bae4b9e8cf3e660efed7baf; q_c0=MTk2OTAzfGdmWDM5Q2pZNVpaUW9UTzA=|1326267926|eedfe70f85add0db0ecda1e73200cac9b085ecc6; __utma=155987696.1247389772.1322703824.1326190947.1326266591.29; __utmb=155987696.34.10.1326266591; __utmc=155987696; __utmz=155987696.1325768571.27.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=155987696.Logged%20In'
    ), )

    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset':
        'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Language':
        'en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Host':
        'www.zhihu.com',
        'Referer:http':
        '//www.zhihu.com/',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    }
    count = 0
    headers['cookie'] = cookies[0][1]
    explore_page = fetch('http://www.zhihu.com/explore', headers=headers)

    entry_list = txt_wrap_by_all('<div class="xxn">', '</div', explore_page)
    reting_raw = txt_wrap_by("['explore_list',", ');', explore_page)
    data = loads(reting_raw)
    author_list = [[i[3][1][0].encode('utf-8'), i[3][2].encode('utf-8')]
                   for i in data]
    rating_list = [i[3][3] for i in data]

    label_list = txt_wrap_by_all('"padding:3px 0 0" class="xm">', '</div',
                                 explore_page)
    result_label = [txt_wrap_by_all('">', '</a', i) for i in label_list]

    url_list = txt_wrap_by_all('<h2', '</h2>', explore_page)
    id_list = [txt_wrap_by('question/', '/answer', i) for i in url_list]
    title_list = [
        txt_wrap_by('">', '<', txt_wrap_by('href="', '/a>', i))
        for i in url_list
    ]

    url_list = txt_wrap_by_all('<h2', '</h2>', explore_page)
    id_list = [txt_wrap_by('question/', '/answer', i) for i in url_list]
    url_list = ['http://www.zhihu.com/question/%s' % id for id in id_list]

    entry_list = zip(title_list, rating_list, result_label, author_list,
                     url_list, entry_list)

    for entry in entry_list:
        content, pic_list = htm2txt(entry[5])
        Spider.insert(entry[0], entry[2], content, entry[3][0], entry[1],
                      entry[4], [], pic_list)

Пример #3

Показать файл

Файл: zhihu_explorer.py Проект: immissile/42qu_github_mirror

def main():
    cookies = (
        (
            "*****@*****.**",
            "_xsrf=7ed86e897bae4b9e8cf3e660efed7baf; q_c0=MTk2OTAzfGdmWDM5Q2pZNVpaUW9UTzA=|1326267926|eedfe70f85add0db0ecda1e73200cac9b085ecc6; __utma=155987696.1247389772.1322703824.1326190947.1326266591.29; __utmb=155987696.34.10.1326266591; __utmc=155987696; __utmz=155987696.1325768571.27.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=155987696.Logged%20In",
        ),
    )

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "Accept-Language": "en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.zhihu.com",
        "Referer:http": "//www.zhihu.com/",
        "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11",
    }
    count = 0
    headers["cookie"] = cookies[0][1]
    explore_page = fetch("http://www.zhihu.com/explore", headers=headers)

    entry_list = txt_wrap_by_all('<div class="xxn">', "</div", explore_page)
    reting_raw = txt_wrap_by("['explore_list',", ");", explore_page)
    data = loads(reting_raw)
    author_list = [[i[3][1][0].encode("utf-8"), i[3][2].encode("utf-8")] for i in data]
    rating_list = [i[3][3] for i in data]

    label_list = txt_wrap_by_all('"padding:3px 0 0" class="xm">', "</div", explore_page)
    result_label = [txt_wrap_by_all('">', "</a", i) for i in label_list]

    url_list = txt_wrap_by_all("<h2", "</h2>", explore_page)
    id_list = [txt_wrap_by("question/", "/answer", i) for i in url_list]
    title_list = [txt_wrap_by('">', "<", txt_wrap_by('href="', "/a>", i)) for i in url_list]

    url_list = txt_wrap_by_all("<h2", "</h2>", explore_page)
    id_list = [txt_wrap_by("question/", "/answer", i) for i in url_list]
    url_list = ["http://www.zhihu.com/question/%s" % id for id in id_list]

    entry_list = zip(title_list, rating_list, result_label, author_list, url_list, entry_list)

    for entry in entry_list:
        content, pic_list = htm2txt(entry[5])
        Spider.insert(entry[0], entry[2], content, entry[3][0], entry[1], entry[4], [], pic_list)