Exemplo n.º 1
0
def main():
    """Walk the zizhuauto.com listing pages and hand matching records to ``conns``.

    Starting from the hard-coded first listing page, every detail link
    returned by ``get_url`` is checked with ``redis_get``.  Links that are not
    known yet are fetched and parsed with ``get_content``; a record whose
    ``brand`` contains '北京' or '北汽' is passed to ``conns`` (as a
    one-element list), printed, and its URL is passed to ``set_url``.

    Processing of a listing page stops at the first link ``redis_get`` already
    knows.  Pagination continues until ``get_next_url`` returns a falsy value.
    """
    page_url = "http://www.zizhuauto.com/index-htm-caid-822/page-1.html"
    while page_url:
        html = get_parse(page_url)
        # A separate name for the detail link keeps the pagination variable
        # from being clobbered by the inner loop.
        for detail_url in get_url(html):
            if redis_get(detail_url) is not None:
                # First already-seen link: skip the remainder of this page.
                log.info('此链接已抓去过:%s' % detail_url)
                break
            data = get_content(get_parse(detail_url))
            if not data:
                log.info('此链接没有数据:%s' % detail_url)
                continue
            if '北京' in data['brand'] or '北汽' in data['brand']:
                records = [data]
                conns(records)
                print(records)
                # Only matching records get their URL registered.
                set_url(detail_url)
        page_url = get_next_url(html)
        log.info('next url:%s' % page_url)
Exemplo n.º 2
0
def main():
    """Crawl the qctsw.com complaint search listings for a fixed set of start URLs.

    For each start URL the listing is followed page by page via
    ``get_next_url``.  On every listing page, detail links unknown to
    ``redis_get`` are fetched; the parsed results are collected and passed to
    ``conns`` in one batch, and the corresponding URLs are passed to
    ``set_url``.

    A listing is abandoned when
    * the listing page cannot be fetched (``get_parse`` returns a falsy value),
    * more than three pages yielded nothing new, or
    * there is no next page (``get_next_url`` returns a falsy value).
    """
    start_urls = [
        "http://www.qctsw.com/tousu/tsSearch/252_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/8_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/12_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/254_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/175_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/255_0_0_0_0_0,0,0,0,0,0_0.html",
    ]
    for page_url in start_urls:
        empty_pages = 0  # listing pages that produced no new data
        # Looping on the URL itself ends the listing when there is no next
        # page; the previous `while 1: if url:` form spun forever in that case.
        while page_url:
            html = get_parse(page_url)
            if not html:
                log.info('url不能访问:%s' % page_url)
                break

            detail_urls = get_url(html)
            log.info(detail_urls)
            data_list = []
            url_list = []
            for detail_url in detail_urls:
                if redis_get(detail_url) is not None:
                    log.info('此链接已抓去过')
                    continue
                detail_html = get_parse(detail_url)
                if detail_html:
                    data_list.append(get_content(detail_html))
                    url_list.append(detail_url)

            if data_list and url_list:
                conns(data_list)
                set_url(url_list)
                print(data_list)
            else:
                empty_pages += 1
                if empty_pages > 3:
                    break
            page_url = get_next_url(html)
Exemplo n.º 3
0
def main():
    """Crawl the qiche365.org.cn complaint list starting at the hard-coded page.

    Each detail link on a listing page that ``redis_get`` does not know yet is
    fetched and parsed with ``get_content``.  When at least one parsed item
    has a ``brand`` containing '北京' or '北汽', the whole result is printed
    and passed to ``conns``, and the link is passed to ``set_url``.

    Processing of a listing page stops at the first link ``redis_get`` already
    knows.  Pagination continues until ``get_next_url`` returns a falsy value.
    """
    page_url = "http://www.qiche365.org.cn/index.php?m=all&c=complain&a=clist&page=1459"
    while page_url:
        html = get_parse(page_url)
        for detail_url in get_url(html):
            if redis_get(detail_url) is not None:
                # First already-seen link: skip the remainder of this page.
                log.info('该url已抓去过:%s' % detail_url)
                break
            result = get_content(get_parse(detail_url))
            # Submit the result once per link; the previous per-item loop sent
            # the same full result again for every matching item.
            if result and any(
                '北京' in item['brand'] or '北汽' in item['brand']
                for item in result
            ):
                print('result', result)
                conns(result)
                set_url(detail_url)
        page_url = get_next_url(html)
    log.info('抓取完毕')
Exemplo n.º 4
0
def main():
    """Crawl qichemen.com complaints for every brand id in ``config.QCM_ARGS``.

    For each brand id the listing is requested page by page through
    ``get_post_url`` (page index starting at 0).  Detail links unknown to
    ``redis_get`` are fetched and parsed; the results of one page are passed
    to ``conns`` in one batch and their URLs to ``set_url``.

    Paging for a brand ends when ``get_post_url`` returns a falsy value or
    ``get_url`` yields no links.  Within a page, processing stops at the first
    link ``redis_get`` already knows or when ``get_content`` returns ``None``.
    """
    url = "https://www.qichemen.com/complain.html"
    for brid in config.QCM_ARGS:
        pstart = 0
        log.info('开始爬取%d' % brid)
        while True:
            log.info('第%d页' % (pstart + 1))
            html = get_post_url(url, pstart, brid)
            if not html:
                break

            urls = get_url(html)
            if not urls:  # also covers None, not only an empty list
                print('爬取完毕')
                break

            data_list = []
            url_list = []
            for my_url in urls:
                if redis_get(my_url) is not None:
                    log.info('此链接已抓取过')
                    break
                try:
                    data = get_content(get_parse(my_url))
                except Exception:
                    # Best effort: a page that fails to fetch or parse still
                    # yields an empty record so the rest of the page is kept.
                    # `Exception` (not a bare except) lets Ctrl-C through.
                    log.info('抓取失败:%s' % my_url)
                    data = {}
                if data is None:
                    log.info('%d抓取完毕' % brid)
                    break
                data_list.append(data)
                url_list.append(my_url)

            if data_list:
                conns(data_list)
                set_url(url_list)
                print(data_list)
            pstart += 1
Exemplo n.º 5
0
def main():
    """Crawl every listing in ``config.CZW_URLS`` page by page.

    Each listing page is fetched with ``get_parse`` and handed to
    ``get_content``, which returns a ``(data_list, url_list)`` pair.  When
    both are non-empty they are passed to ``conns`` and ``set_url``
    respectively and printed.

    A listing is abandoned after more than three pages yielded nothing, or
    when ``get_next_url`` returns a falsy value.
    """
    for page_url in config.CZW_URLS:
        empty_pages = 0  # pages of this listing that produced no data
        while page_url:
            print(page_url)
            html = get_parse(page_url)
            data_list, url_list = get_content(html)
            if data_list and url_list:
                conns(data_list)
                set_url(url_list)
                print(data_list)
                print(url_list)
            else:
                empty_pages += 1
                if empty_pages > 3:
                    break
            page_url = get_next_url(html)
        # Announce completion after the listing is done; it used to be
        # printed before the crawl of each listing started.
        print('一类抓取完毕')