示例#1
0
def do_fetch():
    """Crawl the Fang.com city index and sync each city into ``inf_city``.

    The index page is downloaded to ``data/temp/index.html`` and read as GBK;
    every ``<a>`` inside the ``#c02`` element is treated as one city link.
    For each link:

    * ``fc.get_city_code(href)`` must not return ``False`` and the city page
      must download successfully, otherwise the link is skipped.
    * The city page ``<title>`` must contain the link text (the city name),
      otherwise a warning is logged and the link is skipped.
    * If an active row (``status<>-1``) already exists for the city code, its
      ``city_name`` / ``url`` columns are updated when they differ, and a
      timestamped line is prepended to ``status_record``.
    * Otherwise a new row is inserted.
    """
    index_path = "data/temp/index.html"
    city_page_path = "data/temp/temp.html"
    stamp_format = '%Y-%m-%d %H:%M:%S'

    fc.fetch_page("http://www.fang.com/SoufunFamily.htm", index_path)
    index_doc = fc.read_page(index_path, "GBK")
    city_table_html = str(index_doc('#c02').html())
    # Strip HTML comments, including ones that span several lines, before
    # re-parsing (r'<!--[^>]*-->' would only cover single-line comments).
    city_table_html = re.sub(r'<!--[^>]((?:.|\n)*?)-->', '', city_table_html)
    city_links = pq(city_table_html)('a')

    for link in city_links.items():
        city_name = link.text()
        href = str(link.attr('href'))
        city_code = fc.get_city_code(href)
        if city_code is False:
            continue
        if not fc.fetch_page(href, city_page_path):
            continue

        page_title = fc.read_page(city_page_path, "GBK")('title').text()
        if city_name not in page_title:
            logging.warning(
                f"city_name:{city_name},city_code:{city_code},webtitle != city_name"
            )
            continue

        existing = conn.get_one(
            f"select * from inf_city where city_code = '{city_code}'"
            f" and website = '房天下' and status<>-1")

        if existing is None:
            # Unknown city: create a fresh row.
            conn.mysql(
                f"insert into inf_city (city_code, city_name, website, webtitle, url, insert_time)"
                f" values ('{city_code}','{city_name}', '房天下', '{page_title}',"
                f" '{href}', '{time.strftime(stamp_format)}') ")
            continue

        # Known city: columns 0 / 2 / 5 / 9 are used as id, city_name, url
        # and status_record respectively.
        row_id = existing[0]
        record = existing[9]
        tracked = (
            ("city_name", existing[2], city_name),
            ("url", existing[5], href),
        )
        for column, old_value, new_value in tracked:
            if new_value != old_value:
                change = f"{column} [{old_value}->{new_value}]"
                # The newest change goes on top of the accumulated history.
                record = f"{time.strftime(stamp_format)}:{change}\n{record}"
                conn.mysql(
                    f"update inf_city set {column}='{new_value}'"
                    f",update_time='{time.strftime(stamp_format)}'"
                    f",status_record='{record}' where id = '{row_id}'")
                logging.warning(f"ID:{row_id}:{change}")
示例#2
0
def do_fetch_housing(url, city_code, city_name):
    """Download every paginated community-listing page of one city.

    Args:
        url: Base URL of the city's community listing.
        city_code: City code; used in the ``inf_city`` updates and in the
            output paths ``data/city/<city_code>/index/<city_code>_<n>.html``.
        city_name: City name expected to appear in the page ``<title>``.

    Returns:
        ``True`` when all listing pages were fetched (``inf_city.status`` is
        then set to 1). ``False`` when the landing page cannot be fetched,
        its title does not contain ``city_name`` (status set to 8), it has no
        pager element (status set to 9), the page count cannot be parsed, or
        any listing page fails to download.
    """
    # Fetch the landing page first and sanity-check that it is the right city.
    if not fc.fetch_page(url, 'data/temp/temp.html'):
        return False
    page = fc.read_page("data/temp/temp.html", "GBK")

    page_title = page('title').text()
    if city_name not in page_title:
        logging.warning(
            f"city_name:{city_name},city_code:{city_code},webtitle != city_name"
        )
        # Restrict the update to this website's row, like the other status
        # updates below; city codes may be shared with rows of other sites.
        conn.mysql(
            f"update inf_city set status=8 where city_code='{city_code}' and website='房天下'"
        )
        return False

    # Read the maximum page number from the pager; a missing pager means the
    # site lists no communities for this city.
    pager = page('#houselist_B14_01')
    pager_html = pager.html()
    if pager_html is None or pager_html == 'None':
        logging.warning(f'没有找到小区信息: {url}')
        conn.mysql(
            f"update inf_city set status = 9 where city_code = '{city_code}' and website='房天下'"
        )
        return False

    page_split = pager('.txt').text()
    try:
        # The pager text is expected to look like "共N页"; strip the wrapper.
        page_split_max = int(page_split.strip('共页'))
    except ValueError:
        # Unexpected pager text: report failure like every other error path
        # instead of aborting the whole crawl with an exception.
        logging.warning(f'无法解析小区分页数: {url}, text={page_split!r}')
        return False

    # Fetch each listing page in turn; stop at the first failure.
    for num in range(1, page_split_max + 1):
        split_url = url + '__0_0_0_0_' + str(num) + '_0_0_0/'
        file_path = f'data/city/{city_code}/index/{city_code}_{num}.html'
        if not fc.fetch_page(split_url, file_path):
            return False

    conn.mysql(
        f"update inf_city set status = 1 where city_code = '{city_code}' and website='房天下'"
    )
    return True
示例#3
0
def get_detail_url(city_code, community_code):
    """Find the detail-page link inside a saved community page.

    The page is read from
    ``data/city/<city_code>/community/<community_code>.html``, first as GBK
    and, if that raises, as GB2312.

    Returns:
        The ``href`` of the detail link when one of the two known layouts
        matches; the string ``"no_detail"`` when neither matches but the
        listing link text is "查看全部房源"; ``False`` when the page cannot be
        read with either encoding or the layout is not recognised.
    """
    html_path = f'data/city/{city_code}/community/{community_code}.html'

    page = None
    read_error = None
    for encoding in ("GBK", "GB2312"):
        try:
            page = fc.read_page(html_path, encoding)
        except Exception as err:
            # Remember the failure; the name bound by ``except`` is cleared
            # when the handler exits.
            read_error = err
            continue
        break
    else:
        # Both encodings failed; report the most recent error.
        logging.warning(f"读取小区页面编码异常,message={read_error}")
        return False

    # First known layout.
    detail_url = page('#xqwxqy_C01_17')('div>span>a').attr('href')
    if detail_url is not None:
        return detail_url

    # Second known layout.
    detail_url = page.find("div[class='floatr']")('a').attr('href')
    if detail_url is not None:
        return detail_url

    if page('#esf_fangyuanlist')('div>span>a').text() == "查看全部房源":
        return "no_detail"

    logging.warning(f"检测到新的小区页面,小区ID={community_code}")
    return False
示例#4
0
def do_main():
    """Run the matching parser over every community page with ``status=1``.

    Rows are read from ``inf_community``; columns 1 and 2 are used as the
    city code and community code that locate the saved page at
    ``data/city/<city_code>/community/<community_code>.html``. Pages that
    ``fc.read_page`` reports as ``False`` are skipped. A warning is logged
    only when no parser handled the page.
    """
    community_list = conn.get_all("select * from inf_community where status=1")
    for community in community_list:
        city_code = community[1]
        community_code = community[2]

        page_path = f'data/city/{city_code}/community/{community_code}.html'
        page = fc.read_page(page_path)
        if page is False:
            continue

        # NOTE(review): both layout checks below are the same ``page == ''``
        # comparison, which looks like a placeholder for real page-type
        # detection — confirm and replace with the actual conditions.
        parsed = False
        if page == '':
            parser_one_main(page)
            parsed = True
        if page == '':
            parser_two_main(page)
            parsed = True

        if not parsed:
            # Only report pages that no parser accepted.
            logging.warning("未识别页面,无法选择解析器.")
示例#5
0
def do_parser():
    """Parse the saved listing pages of every ready city into ``inf_community``.

    Cities are taken from ``inf_city`` rows with ``status = 1`` and
    ``website='房天下'``. For each city, every non-directory entry under
    ``data/city/<city_code>/index`` is read as GBK, and each
    ``div[class='list rel']`` inside ``.houseList`` is treated as one
    community entry.

    For each entry the latest active row with the same community code is
    looked up:

    * same url, fetched in the current month -> the row is updated in place;
    * same url, fetched in another month -> a new row is inserted;
    * different url -> ``'T'`` is appended to the code and the lookup is
      repeated (at most 100 times);
    * no row -> a new row is inserted.

    After all files of a city are processed its ``inf_city.status`` is set
    to 2.

    NOTE(review): all SQL here is built by interpolating scraped text into
    the statement; if ``conn`` supports parameterized queries, prefer them.
    """
    all_city_code = conn.get_all("select city_code from inf_city where status = 1 and website='房天下'")
    for City_code in all_city_code:
        city_code = City_code[0]
        # Collect every saved listing page under this city's index directory.
        files = fc.get_all_files(f'data/city/{city_code}/index')

        for file in files:
            file_path = f'data/city/{city_code}/index/{file}'
            # The directory check uses a path anchored two levels above this
            # file, while the read below uses the relative path.
            file_full_path = Path(__file__).parent.parent.joinpath(f'data/city/{city_code}/index/{file}')
            if not os.path.isdir(file_full_path):
                logging.info(f"读取文件:{file}")
                page_main = fc.read_page(file_path, "GBK")
                # Extract the main listing container.
                page_main = page_main('.houseList').html()
                if page_main is None or page_main == "":
                    logging.warning(f"{file}页面没有主体部分")
                    continue
                # Extract each entry and write it to the database.

                # Abnormal pages can occur (e.g. sh_76, whose main section is
                # empty); the site's pages change often, so content may differ.
                try:
                    page_main = pq(page_main)
                except Exception as e:
                    logging.warning(f"读取主体部分失败, message={e}")
                    continue
                page_main = page_main.find("div[class='list rel']")

                for community in page_main.items():
                    community_img_url = community('dl>dt>a>img').attr('src')
                    community_url = community('dl>dd>p>a').attr('href')
                    # The community code is derived from the first link's href
                    # with the same helper used for city codes.
                    community_code = fc.get_city_code(community('dl>dd>p>a').eq(0).attr('href'))
                    if community_code is False:
                        continue
                    community_name = community('dl>dd>p>a').eq(0).text()
                    community_type = community('dl>dd>p>span').eq(0).text()
                    community_level = community('dl>dd>p>span').eq(1).html()
                    if community_level is not None:
                        # Each '<i/>' counts 1 and each '<i class="half"/>' counts 0.5.
                        community_level = community_level.count('<i/>') + 0.5 * community_level.count('<i'
                                                                                                      ' class="half"/>')
                    else:
                        # Interpolated unquoted into the SQL below, so this
                        # becomes a SQL NULL.
                        community_level = 'Null'
                    community_location = fc.remove_html(community('dl>dd>p').eq(1).html(), 1)
                    community_location = community_location.lstrip(' ')
                    community_location = community_location.rstrip(' ')
                    community_selling = fc.remove_html(community('dl>dd>ul>li').eq(0).html())
                    community_selling = community_selling.replace('套在售', '')
                    community_selling = community_selling.replace('|', '')
                    community_leasing = fc.remove_html(community('dl>dd>ul>li').eq(1).html())
                    community_leasing = community_leasing.replace('套在租', '')
                    community_leasing = community_leasing.replace('|', '')
                    community_build_time = community('dl>dd>ul>li').eq(2).html()
                    community_price = fc.remove_html(community.find("p[class='priceAverage']").html())
                    # outfor == 1 means the entry was handled by an UPDATE and
                    # the INSERT below must be skipped.
                    outfor = 0
                    # Bounded retry: each pass may append 'T' to the code and
                    # look up again, at most 100 times.
                    for x in range(100):
                        temp = conn.get_one(f"select * from inf_community where community_code = '{community_code}'"
                                            f" and website = '房天下' and status<>-1 order by fetch_time desc limit 1")
                        # Handle duplicates.
                        if temp is not None:
                            # Column 6 is used as the fetch time; only its
                            # year-month is compared.
                            fetch_time = temp[6]
                            fetch_time = fetch_time.strftime('%Y-%m')
                            this_moon = time.strftime('%Y-%m')
                            if temp[5] == community_url:
                                # Same code and same url means it is the same
                                # record, so only the fetch month is checked:
                                # if it is the current month, run the update
                                # and continue the outer loop; otherwise leave
                                # this loop and run the insert.
                                # TODO: add an exact-match check — if the data
                                # is identical, skip the update to reduce
                                # warning log output; if city_code changed,
                                # raise an alert and do not update the data.
                                if this_moon == fetch_time:
                                    record = f"{time.strftime('%Y-%m-%d %H:%M:%S')}:ID:{temp[0]}[{temp[3]}" \
                                             f"->{community_name}]\n{temp[9]}"
                                    sql = f"update inf_community set city_code='{city_code}'," \
                                          f"community_name='{community_name}'," \
                                          f"url='{community_url}',update_time='{time.strftime('%Y-%m-%d %H:%M:%S')}'," \
                                          f"status=0,status_record='{record}',img_url='{community_img_url}'," \
                                          f"type='{community_type}',level={community_level}," \
                                          f"location='{community_location}',selling={community_selling}," \
                                          f"leasing={community_leasing},build_time='{community_build_time}'," \
                                          f"price='{community_price}',status=0" \
                                          f" where community_code='{community_code}'" \
                                          f" and website='房天下' and status<>-1"
                                    # NOTE(review): `status=0` is assigned twice
                                    # above, and the WHERE clause matches every
                                    # active row with this code, not only
                                    # temp[0] — confirm this is intended.
                                    conn.mysql(sql)
                                    logging.warning(f"ID:{temp[0]}:community_name [{temp[3]}->{community_name}]")
                                    outfor = 1
                                break
                            # Same code but different url: append 'T' to the
                            # code and loop again to look up the new code.
                            community_code = str(community_code) + 'T'
                            continue
                        # No duplicate: leave the loop and insert directly.
                        break

                    if outfor == 1:
                        continue

                    sql = f"insert into inf_community" \
                          f" (city_code,community_code,community_name,website,url,fetch_time,img_url,type,level" \
                          f",location,selling,leasing,build_time,price)" \
                          f" values" \
                          f" ('{city_code}','{community_code}','{community_name}','房天下','{community_url}'" \
                          f",'{time.strftime('%Y-%m-%d %H:%M:%S')}','{community_img_url}','{community_type}'," \
                          f"{community_level},'{community_location}',{community_selling}," \
                          f"{community_leasing},'{community_build_time}','{community_price}')"
                    conn.mysql(sql)
        # Mark the city as parsed once all of its index files were processed.
        conn.mysql(f"update inf_city set status = 2 where city_code = '{city_code}' and website='房天下'")