示例#1
0
def detail_new(selector):
    """Extract the detail fields selected by the ``*_MFW_NEW`` XPaths.

    Each XPath from ``settings`` is evaluated against ``selector`` and the
    raw result is passed through ``xpath_handler``.

    Args:
        selector: Parsed document exposing an ``xpath()`` method.

    Returns:
        A 5-tuple ``(address, intro, comment, contact, grade)``.
    """
    # The order below is the order of the returned tuple.
    field_paths = (
        settings.ADDRESS_MFW_NEW,
        settings.INTRO_MFW_NEW,
        settings.COMMENT_MFW_NEW,
        settings.CONTACT_MFW_NEW,
        settings.GRADE_MFW_NEW,
    )
    values = [xpath_handler(selector.xpath(path)) for path in field_paths]
    return tuple(values)
示例#2
0
def xc_scene(url):
    """Scrape one XC listing page and store every entry found on it.

    The page at ``url`` is fetched and parsed, then each node matched by
    ``settings.NODES_XC`` becomes one row: name, address, grade, comment
    count and detail link come from the node itself; intro, website and
    contact come from the entry's detail page via ``get_detail``.  Each row
    is written with ``MY_DB.insert(settings.QUERY_XC, ...)``.

    Args:
        url: Address of the listing page to crawl.

    Returns:
        None.  A warning line is printed when the page yields no nodes.
    """
    rsp = requests.get(url=url, headers=HEADER).text
    selector = etree.HTML(rsp)
    nodes = selector.xpath(settings.NODES_XC)
    if not nodes:
        print("***warning url:%s" % url)
        return

    for node in nodes:
        name = xpath_handler(node.xpath(settings.NAME_XC))
        address = xpath_handler(node.xpath(settings.ADDRESS_XC))
        grade = xpath_handler(node.xpath(settings.SCORE_XC))

        # Only the first run of digits in the comment text is kept.
        count_match = re.search(
            r'\d+', xpath_handler(node.xpath(settings.COMMENT_XC)))
        # Encode the matched text here: the fallback is the integer 0, which
        # has no encode() and must not be encoded later.
        comment = count_match.group().encode('utf-8') if count_match else 0

        # Kept under its own name so the ``url`` argument is not overwritten.
        detail_url = settings.DOMAIN_XC + xpath_handler(
            node.xpath(settings.DETAIL_XC))
        intro, website, contact = get_detail(detail_url)

        params = dict(
            name=name.encode('utf-8'),
            address=address.encode('utf-8'),
            grade=float(grade.encode('utf-8')) if grade else 0,
            comment=comment,
            url=detail_url.encode('utf-8'),
            intro=intro.encode('utf-8'),
            website=website.encode('utf-8'),
            contact=contact.encode('utf-8'),
        )
        print(name)
        MY_DB.insert(settings.QUERY_XC, params)
示例#3
0
def get_detail(url):
    """Fetch an XC detail page and extract its introduction, website and phone.

    Args:
        url: Address of the detail page.

    Returns:
        A 3-tuple ``(introduce, website, phone)``, each being whatever
        ``xpath_handler`` returns for the corresponding XPath result.
    """
    rsp = requests.get(url=url, headers=HEADER).text
    selector = etree.HTML(rsp)
    introduce = xpath_handler(selector.xpath(settings.INTRODUCE_XC))
    website = xpath_handler(selector.xpath(settings.WEBSITE_XC))
    # Keep only the candidates that contain at least one digit.  A list
    # comprehension always yields a list, whatever ``filter`` would return.
    phone_candidates = [
        text for text in selector.xpath(settings.PHONE_XC)
        if re.search(r'\d', text)
    ]
    phone = xpath_handler(phone_candidates)
    return introduce, website, phone
示例#4
0
文件: crawl.py 项目: w1024k/wbb
def get_page(params):
    """Return the page number read from the comment listing, as an ``int``.

    Posts ``params`` to ``settings.COMMENT_URL``, evaluates
    ``settings.PAGE_PATH`` on the parsed response and converts the value
    chosen by ``xpath_handler`` (with ``default_val=1``) to an integer.

    Args:
        params: Request parameters forwarded to ``requests.post``.

    Returns:
        The integer value of the handled XPath result.
    """
    response = requests.post(
        url=settings.COMMENT_URL,
        params=params,
        headers=settings.HEADER,
    )
    document = etree.HTML(response.text)
    page_nodes = document.xpath(settings.PAGE_PATH)
    page_value = xpath_handler(page_nodes, default_val=1)
    return int(page_value)
示例#5
0
文件: crawl.py 项目: w1024k/wbb
def spider(params, level, page_num):
    """Fetch one page of comments and insert each comment into the database.

    Args:
        params: Mapping of request parameters for ``settings.COMMENT_URL``.
            It is left untouched; the page number is added to a copy.
        level: Value stored unchanged in every record's ``level`` field.
        page_num: Page to request, sent as the ``pagenow`` parameter.

    Returns:
        None.  One ``MY_DB.insert(settings.SQL_QUERY, record)`` call is made
        per node matched by ``settings.ROOT_PATH``.
    """
    # Copy instead of writing into the caller's dict, so reusing the same
    # ``params`` for other pages or calls never sees a stale ``pagenow``.
    query = dict(params, pagenow=page_num)
    rsp = requests.post(url=settings.COMMENT_URL,
                        headers=settings.HEADER,
                        params=query).text
    selector = etree.HTML(rsp)
    for node in selector.xpath(settings.ROOT_PATH):
        record = dict(
            nick=xpath_handler(node.xpath(settings.NICK_PATH)),
            date=xpath_handler(node.xpath(settings.DATE_PATH)),
            comment=xpath_handler(node.xpath(settings.COMMENT_PATH)),
            level=level,
        )
        MY_DB.insert(settings.SQL_QUERY, record)
示例#6
0
def detail_old(selector):
    """Extract the detail fields selected by the ``*_MFW`` XPaths.

    Args:
        selector: Parsed document exposing an ``xpath()`` method.

    Returns:
        A 7-tuple ``(address, intro, comment, open, time, contact, website)``.
        ``comment`` is the first run of digits found in the comment text, or
        the integer ``0`` when the text contains none.
    """
    address = xpath_handler(selector.xpath(settings.ADDRESS_MFW))
    intro = xpath_handler(selector.xpath(settings.INTRO_MFW))

    comment_text = xpath_handler(selector.xpath(settings.COMMENT_MFW))
    count_match = re.search(r'\d+', comment_text)
    comment = count_match.group() if count_match else 0

    # Named so that the ``open`` builtin and the ``time`` module name are not
    # shadowed; the positions in the returned tuple are unchanged.
    open_time = xpath_handler(selector.xpath(settings.OPEN_MFW))
    visit_time = xpath_handler(selector.xpath(settings.TIME_MFW))
    contact = xpath_handler(selector.xpath(settings.CONTACT_MFW))
    website = xpath_handler(selector.xpath(settings.WEBSITE_MFW))
    return address, intro, comment, open_time, visit_time, contact, website
示例#7
0
def get_detail(url):
    """Fetch a detail page and store it using whichever selector set matches.

    ``settings.NAME_MFW`` is tried first: if it yields a name, the page is
    parsed with ``detail_old`` and inserted with ``settings.QUERY_MFW``.
    Otherwise ``settings.NAME_MFW_NEW`` is tried, using ``detail_new`` and
    ``settings.QUERY_MFW_NEW``.  When neither yields a name nothing is
    stored.

    The printed ``111`` / ``222`` / ``333`` markers identify which of the
    three outcomes occurred.

    Args:
        url: Address of the detail page; also stored in the inserted row.

    Returns:
        None.
    """
    rsp = requests.get(url=url, headers=HEADER).text
    selector = etree.HTML(rsp)

    name = xpath_handler(selector.xpath(settings.NAME_MFW))
    if name:
        print('111 %s' % (name,))
        # Local names avoid shadowing the ``open`` builtin and ``time``.
        (address, intro, comment, open_time, visit_time,
         contact, website) = detail_old(selector)
        print('%s %s %s %s %s %s %s' % (
            address, intro, comment, open_time, visit_time, contact, website))
        # The keys stay ``open`` / ``time``: they are the parameter names
        # passed to the insert query.
        params = dict(
            name=name,
            address=address,
            intro=intro,
            comment=comment,
            open=open_time,
            time=visit_time,
            contact=contact,
            website=website,
            url=url,
        )
        MY_DB.insert(settings.QUERY_MFW, params)
        return

    name = xpath_handler(selector.xpath(settings.NAME_MFW_NEW))
    if not name:
        print(333)
        return

    print('222 %s' % (name,))
    address, intro, comment, contact, grade = detail_new(selector)
    print('%s %s %s %s %s' % (address, intro, comment, contact, grade))
    params = dict(
        name=name,
        address=address,
        intro=intro,
        comment=comment,
        contact=contact,
        grade=grade,
        url=url,
    )
    MY_DB.insert(settings.QUERY_MFW_NEW, params)
示例#8
0
def get_detail(url):
    """Scrape one QNE detail page, insert it, and record the URL in Redis.

    The page is fetched through the proxy returned by ``get_proxy()``.  If no
    name can be extracted the function returns without storing anything.
    Otherwise the extracted fields are printed, inserted with
    ``MY_DB.insert(settings.QUERY_QNE, ...)`` and ``REDIS_CLIENT.set(url, 0)``
    is called.

    Field handling:
        * address / contact: first and second entries of the
          ``ADDRESS_PHONE_QNE`` result, stripped; ``''`` when missing.
        * lon / lat: the first two comma-separated parts of the coordinate
          text; both ``None`` when the text is empty or has no comma.
        * grade: converted to ``float``; ``0`` when empty or not numeric.
        * comment: first run of digits in the comment text, else ``0``.
        * time: the part after the first ``:`` of the duration text; the
          text is kept unchanged when it contains no ``:``.

    Args:
        url: Address of the detail page; also stored in the inserted row.

    Returns:
        None.
    """
    rsp = requests.get(url=url, headers=HEADER, proxies=get_proxy()).text

    print('get_detail_end...')

    selector = etree.HTML(rsp)
    name = xpath_handler(selector.xpath(settings.NAME_QNE))
    if not name:
        # Nothing is stored for a page without a name, so skip the parsing.
        return

    address_phone = selector.xpath(settings.ADDRESS_PHONE_QNE)
    address = address_phone[0].strip() if address_phone else ''
    contact = address_phone[1].strip() if len(address_phone) > 1 else ''

    lon = lat = None
    coord = xpath_handler(selector.xpath(settings.COORD_QNE))
    if coord:
        coord_parts = coord.split(',')
        # A value without a comma cannot supply both coordinates.
        if len(coord_parts) > 1:
            lon, lat = coord_parts[0], coord_parts[1]

    raw_grade = xpath_handler(selector.xpath(settings.GRADE_QNE))
    try:
        grade = float(raw_grade) if raw_grade else 0
    except (TypeError, ValueError):
        # Non-numeric grade text is stored as 0.
        grade = 0

    comment = xpath_handler(selector.xpath(settings.COMMENT_QNE), 0)
    if comment:
        count_match = re.search(r'\d+', comment)
        comment = count_match.group() if count_match else 0

    open_time = xpath_handler(selector.xpath(settings.OPEN_QNE))

    time_advise = xpath_handler(selector.xpath(settings.TIME_QNE))
    if time_advise:
        pieces = time_advise.split(u':')
        # Without a separator there is no second piece to take.
        if len(pieces) > 1:
            time_advise = pieces[1]

    website = xpath_handler(selector.xpath(settings.WEBSITE_QNE))
    intro = xpath_handler(selector.xpath(settings.INTRO_QNE))

    # Debug dump, one value per line, in the original order.
    for value in (name, address, contact, lon, lat, grade, comment,
                  open_time, time_advise, website, intro):
        print(value)

    params = dict(
        name=name,
        address=address,
        grade=grade,
        comment=comment,
        url=url,
        intro=intro,
        website=website,
        contact=contact,
        lon=lon,
        lat=lat,
        open=open_time,
        time=time_advise,
    )
    MY_DB.insert(settings.QUERY_QNE, params)
    REDIS_CLIENT.set(url, 0)