Example #1
def analysisPage(info, dataDir):
    """
    Collect photo links from the list page and store them in MySQL and MongoDB.
    """
    mid = info['id']
    soup = info["soup"]

    mongodatas = []

    page_type = 3

    pic_a_tags = soup.select(".main .list_left .list .list_lb dl span a")
    for pic_a_tag in pic_a_tags:
        data = {}
        img_tag = pic_a_tag.find("img")
        img_src = img_tag.attrs['src']
        title = clearString(img_tag.attrs['alt']).strip()
        data['t8t_lid'] = info['lid']
        data['t8t_cid'] = info['cid']
        data['t8t_type'] = info['photo_type']
        data['page_type'] = page_type
        data['tag'] = title.strip().replace("'", r"\'")
        data['url'] = img_src.strip()
        mongodatas.append(data)

    # Store into MySQL
    conn = torndb.Connection(host="192.168.1.119",
                             database="data_transfer",
                             user="******",
                             password=passwd)
    for dataobj in mongodatas:
        sqlstring = "select id from %s where `lid`=%s and `ptype`=%s and `url`='%s'" % \
            (lp_photo_link, info['lid'], info['photo_type'], dataobj['url'])
        rows = conn.query(sqlstring)
        if len(rows):
            sqlstring = "update %s set `name`='%s', `url`='%s' where `id`=%s" % \
                        (lp_photo_link, dataobj['tag'], dataobj['url'], rows[0]['id'])
            conn.execute_lastrowid(sqlstring)
            t8t_id = rows[0]['id']
        else:
            sqlstring = "insert into %s(`lid`, `cid`, `ptype`, `name`, `url`, `page_type`) values(%s, %s, %s, '%s', '%s', %s)" % \
                        (lp_photo_link, info['lid'], info['cid'], info['photo_type'], dataobj['tag'], dataobj['url'], page_type)
            t8t_id = conn.execute_lastrowid(sqlstring)

        # Map the MySQL row id into the MongoDB document
        dataobj['t8t_id'] = t8t_id
    conn.close()

    # Store into MongoDB
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_photo_links")
    for dataobj in mongodatas:
        clearData = {'t8t_id': dataobj['t8t_id']}
        collection.remove(clearData)
        collection.save(dataobj)
    client.close()
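
The queries above are assembled with % string formatting and escape only the single quotes in data['tag'], so a crafted url value could still break the SQL. torndb forwards extra positional arguments through MySQLdb's own escaping, so the same select/update/insert flow can bind its values instead. A minimal sketch, reusing the lp_photo_link and passwd globals the example already assumes (save_photo_link is a hypothetical helper name):

def save_photo_link(conn, info, dataobj, page_type):
    # Hypothetical helper wrapping the per-row upsert above with bound
    # parameters. The table name cannot be bound, so the trusted
    # module-level constant lp_photo_link is concatenated in.
    rows = conn.query(
        "select id from " + lp_photo_link +
        " where `lid`=%s and `ptype`=%s and `url`=%s",
        info['lid'], info['photo_type'], dataobj['url'])
    if rows:
        conn.execute(
            "update " + lp_photo_link +
            " set `name`=%s, `url`=%s where `id`=%s",
            dataobj['tag'], dataobj['url'], rows[0]['id'])
        return rows[0]['id']
    return conn.execute_lastrowid(
        "insert into " + lp_photo_link +
        "(`lid`, `cid`, `ptype`, `name`, `url`, `page_type`) "
        "values(%s, %s, %s, %s, %s, %s)",
        info['lid'], info['cid'], info['photo_type'],
        dataobj['tag'], dataobj['url'], page_type)

With bound parameters the manual .replace("'", r"\'") on the tag also becomes unnecessary.
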
Example #2
def jdugePage(info, dataDir=None):
    """
    Determine whether the current page is the specified page.
    """
    mid = info['id']
    soup = info["soup"]
    result = False
    loginfo = u", [.main .list_left .list] not found"

    list_tag = soup.select(".main .list_left .list")
    if list_tag:
        list_tag = clearString(list_tag[0].text)
        list_tag = list_tag.strip()

        if info['encoding'] == 'windows-1252':
            list_tag = list_tag.encode('windows-1252').decode('gbk')

        if u"您搜索的内容不存在或因涉及敏感词汇而不能正常显示,请重新搜索其它关键词" in list_tag:
            result = True
            loginfo = u"内容不存在或因涉及敏感词汇而不能正常显示."

    return (result, loginfo)
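
For orientation, a minimal driver for jdugePage might look like the sketch below; the HTML fragment and the id are made up, only the 'id', 'soup' and 'encoding' keys of info are actually read, and "html.parser" stands in for whatever parser the crawler really uses:

from bs4 import BeautifulSoup

html = u"<div class='main'><div class='list_left'><div class='list'>...</div></div></div>"
info = {
    'id': 42,           # hypothetical record id
    'soup': BeautifulSoup(html, "html.parser"),
    'encoding': 'gbk',  # anything except 'windows-1252' skips the re-decode
}
matched, loginfo = jdugePage(info)
print matched, loginfo
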
Example #3
def analysisPage(info, dataDir):
    """
    Collect photo links from the album page and store them in MySQL and MongoDB.
    """
    mid = info['id']
    soup = info["soup"]

    mongodatas = []

    page_type = 2

    pic_ul_tag = soup.select(".pho_main_right  .pho_main_right_synr ul")
    pic_li_tags = pic_ul_tag[0].findAll("li")
    for pic_li_tag in pic_li_tags:
        data = {}
        data['t8t_lid'] = info['lid']
        data['t8t_cid'] = info['cid']
        data['t8t_type'] = info['photo_type']
        data['page_type'] = page_type
        img = pic_li_tag.find("img")
        p_tags = pic_li_tag.findAll("p")
        p_tag1, p_tag2 = p_tags
        url = img.attrs['src']
        title = clearString(p_tag1.text)
        data['tag'] = title.strip().replace("'", r"\'")
        data['url'] = url.strip()
        """
        unique = u"%s%s" % (title, data['url'])
        unique = unique.encode('utf8')        
        m = md5()
        m.update(unique)
        unique = m.hexdigest()
        data['t8t_name'] = unique
        """
        mongodatas.append(data)

    # Store into MySQL
    conn = torndb.Connection(host="192.168.1.119",
                             database="data_transfer",
                             user="******",
                             password=passwd)
    for dataobj in mongodatas:
        sqlstring = "select id from %s where `lid`=%s and `ptype`=%s and `url`='%s'" % \
            (lp_photo_link, info['lid'], info['photo_type'], dataobj['url'])
        rows = conn.query(sqlstring)
        if len(rows):
            sqlstring = "update %s set `name`='%s', `url`='%s' where `id`=%s" % \
                        (lp_photo_link, dataobj['tag'], dataobj['url'], rows[0]['id'])
            conn.execute_lastrowid(sqlstring)
            t8t_id = rows[0]['id']
        else:
            sqlstring = "insert into %s(`lid`, `cid`, `ptype`, `name`, `url`, `page_type`) values(%s, %s, %s, '%s', '%s', %s)" % \
                        (lp_photo_link, info['lid'], info['cid'], info['photo_type'], dataobj['tag'], dataobj['url'], page_type)
            t8t_id = conn.execute_lastrowid(sqlstring)

        dataobj['t8t_id'] = t8t_id
    conn.close()

    # Store into MongoDB
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_photo_links")
    for dataobj in mongodatas:
        clearData = {'t8t_id': dataobj['t8t_id']}
        collection.remove(clearData)
        collection.save(dataobj)
    client.close()
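
The remove()/save() pair that closes Examples #1 and #3 is the legacy PyMongo 2.x API; both methods were removed in PyMongo 3/4. On a current driver the delete-then-write collapses into a single upsert. A sketch, assuming the same t8t_id key:

client = pymongo.MongoClient(host="192.168.1.83", port=27017)
collection = client.loupan.get_collection("lp_photo_links")
for dataobj in mongodatas:
    # replace_one with upsert=True swaps in the document keyed by t8t_id,
    # inserting it if it does not exist yet -- one round trip per document.
    collection.replace_one({'t8t_id': dataobj['t8t_id']}, dataobj, upsert=True)
client.close()
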
Example #4
def analysisPage(info, dataDir):
    """
    Collect detail-page information and store it in MongoDB.
    """
    mid = info['id']
    soup = info["soup"]

    clean_square_brackets = re.compile(r"\[\s*\S+\s*\]")

    # ------------------------------------------------
    detail_page_info = {'method': 1, 'lid': info['id'], 'url': info['url']}
    # SouFun (fang.com) link path
    # Basic information
    basic_info = {}
    tbody = soup.select("div.mainl div.besic_inform table tbody")
    basic_info_tags = tbody[0].findAll("tr")
    for basic_info_tag in basic_info_tags:
        strong = basic_info_tag.find("strong")
        if strong:
            key = clearString(strong.text)
            text = clearString(basic_info_tag.text)
            text = clean_square_brackets.sub("", text)
            value = text.replace(key, "")
            # strip either an ASCII or a fullwidth colon
            if value.startswith(u":") or value.startswith(u"："):
                value = value[1:]
            key = key.replace(u" ", "")
            if key.endswith(u":") or key.endswith(u"："):
                key = key[:-1]
            if key.endswith(u"房价"):
                key = u"房价"
            basic_info[key] = value.strip()
    detail_page_info["basicdetails"] = basic_info

    # Project amenities
    xq_xmpt_anchor = soup.select("#xq_xmpt_anchor")
    if xq_xmpt_anchor:
        xq_xmpt_anchor = xq_xmpt_anchor[0]
        lineheight = xq_xmpt_anchor.findNextSibling(class_="lineheight")
        text = clearString(lineheight.text)
        detail_page_info["projectSupporting"] = text

    # Traffic conditions
    xq_jtzk_anchor = soup.select("#xq_jtzk_anchor")
    if xq_jtzk_anchor:
        xq_jtzk_anchor = xq_jtzk_anchor[0]
        lineheight = xq_jtzk_anchor.findNextSibling(class_="lineheight")
        text = clearString(lineheight.text)
        detail_page_info["trafic"] = text

    # Building materials and decoration
    xq_jczx_anchor = soup.select("#xq_jczx_anchor")
    if xq_jczx_anchor:
        xq_jczx_anchor = xq_jczx_anchor[0]
        lineheight = xq_jczx_anchor.findNextSibling(class_="lineheight")
        text = clearString(lineheight.text)
        detail_page_info["buildingDecoration"] = text

    # Floor information
    xq_lczk_anchor = soup.select("#xq_lczk_anchor")
    if xq_lczk_anchor:
        xq_lczk_anchor = xq_lczk_anchor[0]
        lineheight = xq_lczk_anchor.findNextSibling(class_="lineheight")
        text = clearString(lineheight.text)
        detail_page_info["floor"] = text

    # Parking information
    xq_cwxx_anchor = soup.select("#xq_cwxx_anchor")
    if xq_cwxx_anchor:
        xq_cwxx_anchor = xq_cwxx_anchor[0]
        lineheight = xq_cwxx_anchor.findNextSibling(class_="lineheight")
        text = clearString(lineheight.text)
        detail_page_info["parkingInformation"] = text

    # Related information
    relativeInfo = {}
    xq_xgxx_anchor = soup.select("#xq_xgxx_anchor")
    if xq_xgxx_anchor:
        xq_xgxx_anchor = xq_xgxx_anchor[0]
        lineheight = xq_xgxx_anchor.findNextSibling(class_="lineheight")
        strings = []
        for stripped in lineheight.children:
            if isinstance(stripped, element.Tag):
                string = clearString(stripped.text).strip().replace(
                    u" ", "").replace(u"　", "").replace(u"：", u":")
            else:
                string = clearString(stripped).strip()
            strings.append(string)
        all_string = " ".join(strings)
        all_string = re.sub(r"\[\S+\]", "", all_string)
        items = re.findall(ur"((\S+):\s*(\S+)?)", all_string)
        for item in items:
            key = item[1].strip()
            value = item[2].strip()
            if key not in relativeInfo:
                relativeInfo[key] = value
            else:
                relativeInfo[key] = "%s| %s" % (relativeInfo[key], value)
    detail_page_info["relativeInfo"] = relativeInfo

    # Store in MongoDB
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_details")
    clearData = {"lid": info['id']}
    collection.remove(clearData)
    collection.save(detail_page_info)
    client.close()
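
The relativeInfo block leans on re.findall with ((\S+):\s*(\S+)?) to split the joined anchor text into key/value pairs. A standalone check of that pattern on a made-up input:

# -*- coding: utf-8 -*-
import re

# Hypothetical sample of the joined text after colon normalization.
all_string = u"物业费: 2.5元/月/平方米 绿化率: 35%"
for whole, key, value in re.findall(ur"((\S+):\s*(\S+)?)", all_string):
    print key, "->", value
# prints:
#   物业费 -> 2.5元/月/平方米
#   绿化率 -> 35%
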
Example #5
def analysisPage(info, dataDir):
    """
    Collect the photo-album summary (types and counts) and queue per-page crawl rows in MySQL.
    """
    mid = info['id']
    url = info['photo_url']
    soup = info["soup"]

    page_type = 1
    checkcpl = re.compile(r"(\W+)\s*(\d+)")

    # Album types and photo counts
    photolist = {}
    soufang_tags = soup.select("div.main_rt300 .xc_xmdl dl dd")
    for soufang_tag in soufang_tags:
        text = clearString(soufang_tag.text)
        allfind = checkcpl.findall(text)
        if allfind:
            ptype = ""
            ptext, ptotal = allfind[0]
            if info['encoding'] == 'windows-1252':
                ptext = ptext.encode('windows-1252').decode('gbk')
            if u"户型图" in ptext:
                ptype = 900
            elif u"交通图" in ptext:
                ptype = 901
            elif u"外景图" in ptext:
                ptype = 902
            elif u"实景图" in ptext:
                ptype = 903
            elif u"效果图" in ptext:
                ptype = 904
            elif u"样板间" in ptext:
                ptype = 905
            elif u"项目现场" in ptext:
                ptype = 906
            elif u"配套图" in ptext or u"周边" in ptext:
                ptype = 907
            
            ptotal = int(ptotal)
            if ptype:
                photolist[ptype] = ptotal

    if 1000 in photolist and photolist[1000] == 0:
        # An empty album page with no photos
        return None
    
    sql_lists = []
    for ptype, ptotal in photolist.iteritems():
        urldomain = urlparse(url).netloc
        rid = info['rid']
        for i in range(1, int(ceil(int(ptotal)/6.0) + 1)):
            nextpage = i
            sqlstring = "replace into lp_photo_summary(`lid`, `cid`, `url`, `rid`, `photo_type`, `total`, `npage`, `page_type`) " \
                        "values(%s, %s, '%s', %s, %s, %s, %s, %s)" % \
                        (info['id'], info['cid'], urldomain, info['rid'], ptype, ptotal, nextpage, page_type)
            sql_lists.append(sqlstring)
            
    conn = torndb.Connection(host="192.168.1.119", database="data_transfer", user="******", password=passwd)
    for sqlstring in sql_lists:
        conn.execute(sqlstring)
    conn.close()
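
The inner loop's range(1, int(ceil(int(ptotal)/6.0) + 1)) enumerates album pages at six photos per page. A quick check of the arithmetic:

from math import ceil

for ptotal in (1, 6, 7, 13):
    npages = int(ceil(ptotal / 6.0))
    print ptotal, range(1, npages + 1)
# prints:
#   1 [1]
#   6 [1]
#   7 [1, 2]
#   13 [1, 2, 3]
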
Example #6
def analysisPage(info, dataDir):
    """
    Collect detail-page information and store it in MongoDB.
    """
    mid = info['id']
    soup = info["soup"]

    # ------------------------------------------------
    detail_page_info = {
        'method': 2,
        'lid': info['id'],
        'url': info['url']
    }
    # SouFun (fang.com) link path
    basic_info_tags = {}
    leftinfo = soup.select("div.maininfo .leftinfo")
    yihang_tags = leftinfo[0].findAll(class_="yihang")
    lbox_tags = leftinfo[0].findAll(class_="lbox")
    for yihang_tag, lbox_tag in zip(yihang_tags, lbox_tags):
        title = yihang_tag.find('h3').text
        if title in (u"基本信息", u"配套设施", u"交通状况", u"周边信息"):
            basic_info_tags[title] = lbox_tag
    
    # Basic information
    basic_info = {}
    if u"基本信息" in basic_info_tags:
        basic_tag = basic_info_tags[u"基本信息"]
        dds = basic_tag.findAll("dd")
        for dd in dds:
            strong = dd.find("strong")
            if strong:
                key = clearString(strong.text)
                value = clearString(dd.text)
                value = value.replace(key, "")
                if value.startswith(u":"):
                    value = value[1:]
                key = key.replace(u" ", "")
                if key.endswith(u":"):
                    key = key[:-1]
                basic_info[key] = value
        detail_page_info["basicdetails"] = basic_info
    
    # Supporting facilities
    relativeInfo = {}
    if u"配套设施" in basic_info_tags:
        relative_tag = basic_info_tags[u"配套设施"]
        dds = relative_tag.findAll("dd")
        for dd in dds:
            strong = dd.find("strong")
            if strong:
                key = clearString(strong.text)
                value = clearString(dd.text)
                value = value.replace(key, "")
                key = key.replace(u":", "").replace(u" ", "")
                relativeInfo[key] = value
        detail_page_info["relativeInfo"] = relativeInfo
    
    # Surrounding-area information (peripheralInformation)
    if u"周边信息" in basic_info_tags:
        peripheral_information = {}
        peripheral_info_tag = basic_info_tags[u"周边信息"]  
        dts = peripheral_info_tag.findAll("dt")
        for dt in dts:
            string = clearString(dt.text)
            if u":" in string:
                key, value = string.split(u":", 1)
            elif u":" in string:
                key, value = string.split(u":", 1)
            key = key.replace(u" ", "").strip()
            value = value.strip()
            peripheral_information[key] = value
        detail_page_info['peripheralInformation'] = peripheral_information
    
    
    # Traffic conditions
    if u"交通状况" in basic_info_tags:
        trafic_tag = basic_info_tags[u"交通状况"]
        text = clearString(trafic_tag.text)
        detail_page_info["trafic"] = text

    # Store in MongoDB
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_details")
    clearData = {"lid": info['id']}
    collection.remove(clearData)
    collection.save(detail_page_info)
    client.close()
Example #7
def analysisPage(info, dataDir):
    """
    Collect homepage information, store it in MongoDB, and update the detail/photo-album URLs in MySQL.
    """
    mid = info['id']
    soup = info["soup"]

    # ------------------------------------------------
    first_page_info = {'method': 5, 'lid': info['id'], 'url': info['url']}
    # SouFun breadcrumb path, e.g.:
    # 搜房网 > 上海新房 > 奉贤楼盘 > 朗诗未来街区
    lp_path_tag = soup.select(".lpbl .lpblbox .title .gray6")[0]
    lp_path = clearString(lp_path_tag.text)
    lp_path = lp_path.replace(u"查看地图>>", u"")
    first_page_info['linkPath'] = lp_path
    # Community (residential complex) name
    lp_name_tag = soup.select(".lpbl .lpblbox .title .biaoti")[0]
    lp_name = clearString(lp_name_tag.text)
    first_page_info['title'] = lp_name
    # Community alias: none on this page
    # Tags: none on this page
    # Homepage detail information
    first_details = []
    all_xiangqing_tag = []
    for selector in (".lpbl .lpblbox .xiangqing", ".lpbl .lpblbox1 .xiangqing"):
        xiangqing_tags = soup.select(selector)[0]
        all_xiangqing_tag.extend(xiangqing_tags.findAll("dd"))
        all_xiangqing_tag.extend(xiangqing_tags.findAll("dt"))
    for xiangqing_tag in all_xiangqing_tag:
        text = clearString(xiangqing_tag.text)
        first_details.append(text)
    first_page_info['firstDetails'] = first_details

    # Map iframe URL
    iframe_map_tag = soup.select("#map iframe")[0]
    iframe_map_link = iframe_map_tag.attrs['src']
    first_page_info['iframeMap'] = iframe_map_link

    # ------------------------------------------------
    # Links to the detail page and the photo-album page
    lpxq_link = ""
    lpxc_link = ""
    lp_link_tags = soup.select(".snav_sq ul a")
    for lp_link_tag in lp_link_tags:
        text = lp_link_tag.text.strip()
        if text == u"楼盘详情":
            lpxq_link = lp_link_tag.attrs['href']
        elif text == u"楼盘相册":
            lpxc_link = lp_link_tag.attrs['href']

    # Store in MongoDB
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_homepage")
    collection.save(first_page_info)
    client.close()

    # Update the homepage's detail and photo-album links
    sqlstring = "UPDATE %s SET detail_url='%s', photo_url='%s' WHERE `id`=%s" % (
        lp_links, lpxq_link, lpxc_link, mid)
    conn = torndb.Connection(host="192.168.1.119",
                             database="data_transfer",
                             user="******",
                             password=passwd)
    conn.execute(sqlstring)
    conn.close()
Example #8
def analysisPage(info, dataDir):
    """
    Collect detail-page information and store it in MongoDB.
    """
    mid = info['id']
    soup = info["soup"]

    # ------------------------------------------------
    detail_page_info = {
        'method': 2,
        'lid': info['id'],
        'url': info['url']
    }
    # SouFun (fang.com) link path
    basic_info_tags = {}
    lpblbox1s = soup.select("div.wrap .lpbl .lpblbox1")
    for lpblbox1 in lpblbox1s:
        lpblbox_children = []
        for lpblbox_tag in lpblbox1.children:
            if isinstance(lpblbox_tag, element.Tag):
                lpblbox_children.append(lpblbox_tag)
        title_tag = lpblbox_children[0].find(class_="name")
        title = clearString(title_tag.text)
        basic_tag = lpblbox_children[1]
        if title == u"基本信息":
            basic_info_tags[title] = basic_tag
        elif title == u"配套设施":
            basic_info_tags[title] = basic_tag
        elif title == u"交通状况":
            basic_info_tags[title] = basic_tag
        elif title == u"周边信息":
            basic_info_tags[title] = basic_tag
    
    # Basic information
    basic_info = {}
    if u"基本信息" in basic_info_tags:
        basic_tag = basic_info_tags[u"基本信息"]
        dds = basic_tag.findAll("dd")
        for dd in dds:
            dd_text = clearString(dd.text)
            if u":" in dd_text:
                key, value = dd_text.split(u":", 1)
            elif u":" in dd_text:
                key, value = dd_text.split(u":", 1)
            else:
                continue
            
            key = key.strip()
            value = value.strip()
            basic_info[key] = value
        detail_page_info["basicdetails"] = basic_info
    
    # Supporting facilities (none)
    
    # Surrounding-area information (peripheralInformation)
    if u"周边信息" in basic_info_tags:
        peripheral_information = {}
        peripheral_info_tag = basic_info_tags[u"周边信息"]
        peripheral_information = clearString(peripheral_info_tag.text)
        detail_page_info['peripheralInformation'] = peripheral_information
    
    
    # Traffic conditions
    if u"交通状况" in basic_info_tags:
        trafic_tag = basic_info_tags[u"交通状况"]
        text = clearString(trafic_tag.text)
        detail_page_info["trafic"] = text

    # Store in MongoDB
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_details")
    clearData = {"lid": info['id']}
    collection.remove(clearData)
    collection.save(detail_page_info)
    client.close()
Example #9
def analysisPage(info, dataDir):
    """
    Collect the photo-album summary (types, counts, and URLs) and queue per-page crawl rows in MySQL.
    """
    mid = info['id']
    soup = info["soup"]

    page_type = 2
    totalcpl = re.compile(r"\((\d+)\)")

    # ------------------------------------------------
    photolist = {}
    mt15s = soup.select(".wrap .pho_main_right .pho_main_right_bt")
    for mt15 in mt15s:
        a_tags = mt15.findAll("a")
        if len(a_tags) >= 2:
            ptype_tag, pmore_tag = a_tags[:2]
            purl = pmore_tag.attrs['href']
            ptype_text = clearString(ptype_tag.text)
            if info['encoding'] == 'windows-1252':
                ptype_text = ptype_text.encode('windows-1252').decode('gbk')
            ptotal = totalcpl.findall(ptype_text)
            if ptotal:
                ptype = ""
                ptotal = int(ptotal[0])
                if u"户型图" in ptype_text:
                    ptype = 900
                elif u"交通图" in ptype_text:
                    ptype = 901
                elif u"外景图" in ptype_text:
                    ptype = 902
                elif u"实景图" in ptype_text:
                    ptype = 903
                elif u"效果图" in ptype_text:
                    ptype = 904
                elif u"样板间" in ptype_text:
                    ptype = 905
                elif u"项目现场" in ptype_text:
                    ptype = 906
                elif u"配套图" in ptype_text or u"周边" in ptype_text:
                    ptype = 907
                elif u"全部图片" in ptype_text:
                    ptype = 1000
                if ptype:
                    photolist[ptype] = (ptotal, purl)

    if 1000 in photolist and photolist[1000][0] == 0:
        # An empty album page with no photos
        return None

    # Build one REPLACE row per expected album page
    sql_lists = []
    for ptype, pvalue in photolist.iteritems():
        ptotal, purl = pvalue
        # str.find() returns -1 (truthy) when absent, so the original
        # "if purl.find(...)" test was almost always true; the intent was
        # presumably a containment check: caseFor4S albums use 10 per page.
        if "caseFor4S" in purl:
            radix = 10.0
        else:
            radix = 16.0
        for i in range(1, int(ceil(int(ptotal) / radix) + 1)):
            sqlstring = "replace into lp_photo_summary(`lid`, `cid`, `url`, `rid`, `photo_type`, `total`, `npage`, `page_type`) " \
                        "values(%s, %s, '%s', %s, %s, %s, %s, %s)" % \
                        (info['id'], info['cid'], purl, info['rid'], ptype, ptotal, i, page_type)
            sql_lists.append(sqlstring)

    conn = torndb.Connection(host="192.168.1.119",
                             database="data_transfer",
                             user="******",
                             password=passwd)
    for sqlstring in sql_lists:
        conn.execute(sqlstring)
    conn.close()