# -*- coding: utf-8 -*-
# Shared imports for the page handlers below. clearString, passwd,
# lp_photo_link and lp_links are module-level helpers/settings defined
# elsewhere in the repo.
import re
from math import ceil
from urlparse import urlparse

import pymongo
import torndb
from bs4 import element


def analysisPage(info, dataDir):
    """ Collect information. """
    mid = info['id']
    soup = info["soup"]
    mongodatas = []
    page_type = 3

    pic_a_tags = soup.select(".main .list_left .list .list_lb dl span a")
    for pic_a_tag in pic_a_tags:
        data = {}
        img_tag = pic_a_tag.find("img")
        img_src = img_tag.attrs['src']
        title = clearString(img_tag.attrs['alt']).strip()
        data['t8t_lid'] = info['lid']
        data['t8t_cid'] = info['cid']
        data['t8t_type'] = info['photo_type']
        data['page_type'] = page_type
        data['tag'] = title.strip().replace("'", r"\'")
        data['url'] = img_src.strip()
        mongodatas.append(data)

    # Store into MySQL
    conn = torndb.Connection(host="192.168.1.119", database="data_transfer",
                             user="******", password=passwd)
    for dataobj in mongodatas:
        sqlstring = "select id from %s where `lid`=%s and `ptype`=%s and `url`='%s'" % \
            (lp_photo_link, info['lid'], info['photo_type'], dataobj['url'])
        rows = conn.query(sqlstring)
        if len(rows):
            sqlstring = "update %s set `name`='%s', `url`='%s' where `id`=%s" % \
                (lp_photo_link, dataobj['tag'], dataobj['url'], rows[0]['id'])
            conn.execute(sqlstring)
            t8t_id = rows[0]['id']
        else:
            sqlstring = "insert into %s(`lid`, `cid`, `ptype`, `name`, `url`, `page_type`) " \
                "values(%s, %s, %s, '%s', '%s', %s)" % \
                (lp_photo_link, info['lid'], info['cid'], info['photo_type'],
                 dataobj['tag'], dataobj['url'], page_type)
            t8t_id = conn.execute_lastrowid(sqlstring)
        # Map the MongoDB document to its MySQL row id
        dataobj['t8t_id'] = t8t_id
    conn.close()

    # Store into MongoDB
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_photo_links")
    for dataobj in mongodatas:
        clearData = {'t8t_id': dataobj['t8t_id']}
        collection.remove(clearData)
        collection.save(dataobj)
    client.close()
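
# The upsert above builds SQL by string interpolation and only escapes single
# quotes by hand, so an unusual character in a scraped title or URL can still
# break the statement. A minimal sketch of the same upsert using torndb's
# parameter substitution (parameters are escaped by the underlying MySQLdb
# driver); the table name must still be interpolated separately because
# placeholders cannot name a table. Illustrative alternative, not the code
# path used above.
def upsert_photo_link(conn, table, lid, cid, ptype, name, url, page_type):
    rows = conn.query(
        "select id from " + table + " where `lid`=%s and `ptype`=%s and `url`=%s",
        lid, ptype, url)
    if rows:
        conn.execute(
            "update " + table + " set `name`=%s, `url`=%s where `id`=%s",
            name, url, rows[0]['id'])
        return rows[0]['id']
    return conn.execute_lastrowid(
        "insert into " + table + "(`lid`, `cid`, `ptype`, `name`, `url`, `page_type`) "
        "values(%s, %s, %s, %s, %s, %s)",
        lid, cid, ptype, name, url, page_type)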
def jdugePage(info, dataDir=None):
    """ Determine whether the current page is the expected page. """
    mid = info['id']
    soup = info["soup"]
    result = False
    loginfo = u", [.main .list_left .list] not found"
    list_tag = soup.select(".main .list_left .list")
    if list_tag:
        list_tag = clearString(list_tag[0].text)
        list_tag = list_tag.strip()
        if info['encoding'] == 'windows-1252':
            # The response was mis-decoded as windows-1252; round-trip it
            # back to bytes and decode as GBK.
            list_tag = list_tag.encode('windows-1252').decode('gbk')
        if u"您搜索的内容不存在或因涉及敏感词汇而不能正常显示,请重新搜索其它关键词" in list_tag:
            result = True
            loginfo = u"Content does not exist or cannot be displayed because it contains sensitive keywords."
    return (result, loginfo)
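
# A minimal usage sketch for jdugePage. The `info` dict fields mirror the ones
# the handlers in this module read (`id`, `soup`, `encoding`); the HTML snippet
# is a fabricated stand-in for a real SouFang response, and the sketch assumes
# clearString leaves the notice text intact.
def _jduge_page_demo():
    from bs4 import BeautifulSoup
    html = (u'<div class="main"><div class="list_left"><div class="list">'
            u'您搜索的内容不存在或因涉及敏感词汇而不能正常显示,请重新搜索其它关键词'
            u'</div></div></div>')
    info = {
        'id': 1,
        'soup': BeautifulSoup(html, "html.parser"),
        'encoding': 'utf-8',
    }
    result, loginfo = jdugePage(info)
    return result, loginfo  # (True, u"Content does not exist ...")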
def analysisPage(info, dataDir):
    """ Collect information. """
    mid = info['id']
    soup = info["soup"]
    mongodatas = []
    page_type = 2

    pic_ul_tag = soup.select(".pho_main_right .pho_main_right_synr ul")
    pic_li_tags = pic_ul_tag[0].findAll("li")
    for pic_li_tag in pic_li_tags:
        data = {}
        data['t8t_lid'] = info['lid']
        data['t8t_cid'] = info['cid']
        data['t8t_type'] = info['photo_type']
        data['page_type'] = page_type
        img = pic_li_tag.find("img")
        p_tags = pic_li_tag.findAll("p")
        p_tag1, p_tag2 = p_tags
        url = img.attrs['src']
        title = clearString(p_tag1.text)
        data['tag'] = title.strip().replace("'", r"\'")
        data['url'] = url.strip()
        mongodatas.append(data)

    # Store into MySQL
    conn = torndb.Connection(host="192.168.1.119", database="data_transfer",
                             user="******", password=passwd)
    for dataobj in mongodatas:
        sqlstring = "select id from %s where `lid`=%s and `ptype`=%s and `url`='%s'" % \
            (lp_photo_link, info['lid'], info['photo_type'], dataobj['url'])
        rows = conn.query(sqlstring)
        if len(rows):
            sqlstring = "update %s set `name`='%s', `url`='%s' where `id`=%s" % \
                (lp_photo_link, dataobj['tag'], dataobj['url'], rows[0]['id'])
            conn.execute(sqlstring)
            t8t_id = rows[0]['id']
        else:
            sqlstring = "insert into %s(`lid`, `cid`, `ptype`, `name`, `url`, `page_type`) " \
                "values(%s, %s, %s, '%s', '%s', %s)" % \
                (lp_photo_link, info['lid'], info['cid'], info['photo_type'],
                 dataobj['tag'], dataobj['url'], page_type)
            t8t_id = conn.execute_lastrowid(sqlstring)
        # Map the MongoDB document to its MySQL row id
        dataobj['t8t_id'] = t8t_id
    conn.close()

    # Store into MongoDB
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_photo_links")
    for dataobj in mongodatas:
        clearData = {'t8t_id': dataobj['t8t_id']}
        collection.remove(clearData)
        collection.save(dataobj)
    client.close()
def analysisPage(info, dataDir):
    """ Collect information. """
    mid = info['id']
    soup = info["soup"]
    clean_square_brackets = re.compile("\[\s*\S+\s*\]")
    # ------------------------------------------------
    detail_page_info = {'method': 1, 'lid': info['id'], 'url': info['url']}  # SouFang page URL

    # Basic information
    basic_info = {}
    tbody = soup.select("div.mainl div.besic_inform table tbody")
    basic_info_tags = tbody[0].findAll("tr")
    for basic_info_tag in basic_info_tags:
        strong = basic_info_tag.find("strong")
        if strong:
            key = clearString(strong.text)
            text = clearString(basic_info_tag.text)
            text = clean_square_brackets.sub("", text)
            value = text.replace(key, "")
            if value.startswith(u":") or value.startswith(u"："):
                value = value[1:]
            key = key.replace(u" ", "")
            if key.endswith(u":") or key.endswith(u"："):
                key = key[:-1]
            if key.endswith(u"房价"):
                # Normalize any "...房价" variant to the plain house-price key
                key = u"房价"
            basic_info[key] = value.strip()
    detail_page_info["basicdetails"] = basic_info

    # Each named section sits next to an anchor element; its text lives in
    # the following sibling with class "lineheight".
    anchor_sections = [
        ("#xq_xmpt_anchor", "projectSupporting"),   # 项目配套 (project supporting)
        ("#xq_jtzk_anchor", "trafic"),              # 交通状况 (traffic)
        ("#xq_jczx_anchor", "buildingDecoration"),  # 建材装修 (building decoration)
        ("#xq_lczk_anchor", "floor"),               # 楼层状况 (floors)
        ("#xq_cwxx_anchor", "parkingInformation"),  # 车位信息 (parking)
    ]
    for selector, field in anchor_sections:
        anchor = soup.select(selector)
        if anchor:
            lineheight = anchor[0].findNextSibling(class_="lineheight")
            detail_page_info[field] = clearString(lineheight.text)

    # Related information
    relativeInfo = {}
    xq_xgxx_anchor = soup.select("#xq_xgxx_anchor")
    if xq_xgxx_anchor:
        lineheight = xq_xgxx_anchor[0].findNextSibling(class_="lineheight")
        strings = []
        for stripped in lineheight.children:
            if isinstance(stripped, element.Tag):
                string = clearString(stripped.text).strip().replace(
                    u" ", u"").replace(u"　", u"").replace(u"：", u":")
            else:
                string = clearString(stripped).strip()
            strings.append(string)
        all_string = " ".join(strings)
        all_string = re.sub("\[\S+\]", "", all_string)
        items = re.findall(ur"((\S+):\s*(\S+)?)", all_string)
        for item in items:
            key = item[1].strip()
            value = item[2].strip()
            if key not in relativeInfo:
                relativeInfo[key] = value
            else:
                # Repeated key: append with a "|" separator
                relativeInfo[key] = "%s| %s" % (relativeInfo[key], value)
    detail_page_info["relativeInfo"] = relativeInfo

    # Store into MongoDB
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_details")
    clearData = {"lid": info['id']}
    collection.remove(clearData)
    collection.save(detail_page_info)
    client.close()
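
# A worked sketch of the relativeInfo extraction above: once full-width colons
# are normalized to ":", key/value pairs fall out of ((\S+):\s*(\S+)?), with an
# empty third group when a key has no value. The sample string is fabricated;
# real pages carry the same "key: value key: value" shape.
def _relative_info_demo():
    sample = u"物业公司: 某某物业 开发商: 某某地产 物业费:"
    items = re.findall(ur"((\S+):\s*(\S+)?)", sample)
    # items == [(u'物业公司: 某某物业', u'物业公司', u'某某物业'),
    #           (u'开发商: 某某地产', u'开发商', u'某某地产'),
    #           (u'物业费:', u'物业费', u'')]
    return dict((key.strip(), value.strip()) for _, key, value in items)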
def analysisPage(info, dataDir):
    """ Collect information. """
    mid = info['id']
    url = info['photo_url']
    soup = info["soup"]
    page_type = 1
    checkcpl = re.compile("(\W+)\s*(\d+)")

    # Album categories and photo counts
    photolist = {}
    soufang_tags = soup.select("div.main_rt300 .xc_xmdl dl dd")
    for soufang_tag in soufang_tags:
        text = clearString(soufang_tag.text)
        allfind = checkcpl.findall(text)
        if allfind:
            ptype = ""
            ptext, ptotal = allfind[0]
            if info['encoding'] == 'windows-1252':
                # Mis-decoded response: round-trip back to bytes, decode as GBK
                ptext = ptext.encode('windows-1252').decode('gbk')
            if u"户型图" in ptext:
                ptype = 900
            elif u"交通图" in ptext:
                ptype = 901
            elif u"外景图" in ptext:
                ptype = 902
            elif u"实景图" in ptext:
                ptype = 903
            elif u"效果图" in ptext:
                ptype = 904
            elif u"样板间" in ptext:
                ptype = 905
            elif u"项目现场" in ptext:
                ptype = 906
            elif u"配套图" in ptext or u"周边" in ptext:
                ptype = 907
            ptotal = int(ptotal)
            if ptype:
                photolist[ptype] = ptotal

    if 1000 in photolist and photolist[1000] == 0:
        # Empty page with no photos
        return None

    sql_lists = []
    urldomain = urlparse(url).netloc
    for ptype, ptotal in photolist.iteritems():
        # Six photos per album page
        for nextpage in range(1, int(ceil(ptotal / 6.0)) + 1):
            sqlstring = "replace into lp_photo_summary(`lid`, `cid`, `url`, `rid`, `photo_type`, `total`, `npage`, `page_type`) " \
                "values(%s, %s, '%s', %s, %s, %s, %s, %s)" % \
                (info['id'], info['cid'], urldomain, info['rid'], ptype,
                 ptotal, nextpage, page_type)
            sql_lists.append(sqlstring)

    conn = torndb.Connection(host="192.168.1.119", database="data_transfer",
                             user="******", password=passwd)
    for sqlstring in sql_lists:
        conn.execute(sqlstring)
    conn.close()
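
# The keyword-to-photo-type dispatch above reappears in the page_type=2
# handler below; a sketch of the same mapping as an ordered lookup table
# (ordered because matching is by substring, and u"配套图"/u"周边" share a
# code). The numeric codes are the ones the handlers already use.
PHOTO_TYPE_KEYWORDS = [
    (u"户型图", 900),     # floor plans
    (u"交通图", 901),     # traffic maps
    (u"外景图", 902),     # exterior photos
    (u"实景图", 903),     # real-scene photos
    (u"效果图", 904),     # renderings
    (u"样板间", 905),     # show flats
    (u"项目现场", 906),   # construction site
    (u"配套图", 907),     # supporting facilities
    (u"周边", 907),       # surroundings
    (u"全部图片", 1000),  # all photos (page_type=2 pages only)
]


def photo_type_for(text):
    """Return the numeric photo type for an album title, or "" if unknown."""
    for keyword, code in PHOTO_TYPE_KEYWORDS:
        if keyword in text:
            return code
    return ""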
def analysisPage(info, dataDir):
    """ Collect information. """
    mid = info['id']
    soup = info["soup"]
    # ------------------------------------------------
    detail_page_info = {
        'method': 2,
        'lid': info['id'],
        'url': info['url']  # SouFang page URL
    }

    # Pair each section title (.yihang) with its content block (.lbox)
    basic_info_tags = {}
    leftinfo = soup.select("div.maininfo .leftinfo")
    yihang_tags = leftinfo[0].findAll(class_="yihang")
    lbox_tags = leftinfo[0].findAll(class_="lbox")
    for yihang_tag, lbox_tag in zip(yihang_tags, lbox_tags):
        title = yihang_tag.find('h3').text
        if title in (u"基本信息", u"配套设施", u"交通状况", u"周边信息"):
            basic_info_tags[title] = lbox_tag

    # Basic information
    basic_info = {}
    if u"基本信息" in basic_info_tags:
        basic_tag = basic_info_tags[u"基本信息"]
        for dd in basic_tag.findAll("dd"):
            strong = dd.find("strong")
            if strong:
                key = clearString(strong.text)
                value = clearString(dd.text)
                value = value.replace(key, "")
                if value.startswith(u":") or value.startswith(u"："):
                    value = value[1:]
                key = key.replace(u" ", "")
                if key.endswith(u":") or key.endswith(u"："):
                    key = key[:-1]
                basic_info[key] = value
    detail_page_info["basicdetails"] = basic_info

    # Supporting facilities
    relativeInfo = {}
    if u"配套设施" in basic_info_tags:
        relative_tag = basic_info_tags[u"配套设施"]
        for dd in relative_tag.findAll("dd"):
            strong = dd.find("strong")
            if strong:
                key = clearString(strong.text)
                value = clearString(dd.text)
                value = value.replace(key, "")
                key = key.replace(u"：", "").replace(u":", "").replace(u" ", "")
                relativeInfo[key] = value
    detail_page_info["relativeInfo"] = relativeInfo

    # Surrounding information (peripheralInformation)
    if u"周边信息" in basic_info_tags:
        peripheral_information = {}
        peripheral_info_tag = basic_info_tags[u"周边信息"]
        for dt in peripheral_info_tag.findAll("dt"):
            string = clearString(dt.text)
            if u"：" in string:
                key, value = string.split(u"：", 1)
            elif u":" in string:
                key, value = string.split(u":", 1)
            else:
                continue  # no colon separator; skip this entry
            peripheral_information[key.replace(u" ", "").strip()] = value.strip()
        detail_page_info['peripheralInformation'] = peripheral_information

    # Traffic conditions
    if u"交通状况" in basic_info_tags:
        trafic_tag = basic_info_tags[u"交通状况"]
        detail_page_info["trafic"] = clearString(trafic_tag.text)

    # Store into MongoDB
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_details")
    clearData = {"lid": info['id']}
    collection.remove(clearData)
    collection.save(detail_page_info)
    client.close()
def analysisPage(info, dataDir):
    """ Collect information. """
    mid = info['id']
    soup = info["soup"]
    # ------------------------------------------------
    first_page_info = {'method': 5, 'lid': info['id'], 'url': info['url']}  # SouFang page URL

    # Breadcrumb path, e.g. "搜房网> 上海新房> 奉贤楼盘> 朗诗未来街区"
    lp_path_tag = soup.select(".lpbl .lpblbox .title .gray6")[0]
    lp_path = clearString(lp_path_tag.text)
    lp_path = lp_path.replace(u"查看地图>>", u"")
    first_page_info['linkPath'] = lp_path

    # Community name
    lp_name_tag = soup.select(".lpbl .lpblbox .title .biaoti")[0]
    lp_name = clearString(lp_name_tag.text)
    first_page_info['title'] = lp_name

    # Community alias: not present on this page
    # Tags: not present on this page

    # Homepage detail entries
    first_details = []
    all_xiangqing_tag = []
    xiangqing_tags = soup.select(".lpbl .lpblbox .xiangqing")[0]
    all_xiangqing_tag.extend(xiangqing_tags.findAll("dd"))
    all_xiangqing_tag.extend(xiangqing_tags.findAll("dt"))
    xiangqing1_tags = soup.select(".lpbl .lpblbox1 .xiangqing")[0]
    all_xiangqing_tag.extend(xiangqing1_tags.findAll("dd"))
    all_xiangqing_tag.extend(xiangqing1_tags.findAll("dt"))
    for xiangqing_tag in all_xiangqing_tag:
        text = clearString(xiangqing_tag.text)
        first_details.append(text)
    first_page_info['firstDetails'] = first_details

    # Map iframe link
    iframe_map_tag = soup.select("#map iframe")[0]
    first_page_info['iframeMap'] = iframe_map_tag.attrs['src']

    # ------------------------------------------------
    # Links to the detail page and the photo album page
    lpxq_link = ""
    lpxc_link = ""
    lp_link_tags = soup.select(".snav_sq ul a")
    for lp_link_tag in lp_link_tags:
        text = lp_link_tag.text.strip()
        if text == u"楼盘详情":    # development details
            lpxq_link = lp_link_tag.attrs['href']
        elif text == u"楼盘相册":  # development photo album
            lpxc_link = lp_link_tag.attrs['href']

    # Store into MongoDB
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_homepage")
    collection.save(first_page_info)
    client.close()

    # Update the link row with the detail and album URLs
    sqlstring = "UPDATE %s SET detail_url='%s', photo_url='%s' WHERE `id`=%s" % (
        lp_links, lpxq_link, lpxc_link, mid)
    conn = torndb.Connection(host="192.168.1.119", database="data_transfer",
                             user="******", password=passwd)
    conn.execute(sqlstring)
    conn.close()
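
# Same caveat as the photo-link SQL earlier: the UPDATE above interpolates the
# scraped hrefs directly into the statement. A sketch using torndb parameter
# substitution instead (table name still interpolated, since placeholders
# cannot name a table):
def update_homepage_links(conn, table, mid, detail_url, photo_url):
    conn.execute(
        "UPDATE " + table + " SET detail_url=%s, photo_url=%s WHERE `id`=%s",
        detail_url, photo_url, mid)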
def analysisPage(info, dataDir):
    """ Collect information. """
    mid = info['id']
    soup = info["soup"]
    # ------------------------------------------------
    detail_page_info = {
        'method': 2,
        'lid': info['id'],
        'url': info['url']  # SouFang page URL
    }

    # Pair each section title (.name) with its content block
    basic_info_tags = {}
    lpblbox1s = soup.select("div.wrap .lpbl .lpblbox1")
    for lpblbox1 in lpblbox1s:
        lpblbox_children = [child for child in lpblbox1.children
                            if isinstance(child, element.Tag)]
        title_tag = lpblbox_children[0].find(class_="name")
        title = clearString(title_tag.text)
        if title in (u"基本信息", u"配套设施", u"交通状况", u"周边信息"):
            basic_info_tags[title] = lpblbox_children[1]

    # Basic information
    basic_info = {}
    if u"基本信息" in basic_info_tags:
        basic_tag = basic_info_tags[u"基本信息"]
        for dd in basic_tag.findAll("dd"):
            dd_text = clearString(dd.text)
            if u"：" in dd_text:
                key, value = dd_text.split(u"：", 1)
            elif u":" in dd_text:
                key, value = dd_text.split(u":", 1)
            else:
                continue  # no colon separator; skip this entry
            basic_info[key.strip()] = value.strip()
    detail_page_info["basicdetails"] = basic_info

    # Supporting facilities: not present on this page

    # Surrounding information (peripheralInformation)
    if u"周边信息" in basic_info_tags:
        peripheral_info_tag = basic_info_tags[u"周边信息"]
        detail_page_info['peripheralInformation'] = clearString(peripheral_info_tag.text)

    # Traffic conditions
    if u"交通状况" in basic_info_tags:
        trafic_tag = basic_info_tags[u"交通状况"]
        detail_page_info["trafic"] = clearString(trafic_tag.text)

    # Store into MongoDB
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_details")
    clearData = {"lid": info['id']}
    collection.remove(clearData)
    collection.save(detail_page_info)
    client.close()
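
# Collection.remove and Collection.save were deprecated in pymongo 3.x and
# removed in 4.x. If this module is ever run against a newer driver, the
# delete-then-save pattern above maps onto replace_one with upsert=True; a
# minimal sketch:
def store_detail(detail_page_info):
    client = pymongo.MongoClient(host="192.168.1.83", port=27017)
    collection = client.loupan.get_collection("lp_details")
    collection.replace_one({"lid": detail_page_info['lid']},
                           detail_page_info, upsert=True)
    client.close()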
def analysisPage(info, dataDir):
    """ Collect information. """
    mid = info['id']
    soup = info["soup"]
    page_type = 2
    totalcpl = re.compile("\((\d+)\)")
    # ------------------------------------------------
    photolist = {}
    mt15s = soup.select(".wrap .pho_main_right .pho_main_right_bt")
    for mt15 in mt15s:
        a_tags = mt15.findAll("a")
        if len(a_tags) >= 2:
            ptype_tag, pmore_tag = a_tags[:2]
            purl = pmore_tag.attrs['href']
            ptype_text = clearString(ptype_tag.text)
            if info['encoding'] == 'windows-1252':
                # Mis-decoded response: round-trip back to bytes, decode as GBK
                ptype_text = ptype_text.encode('windows-1252').decode('gbk')
            ptotal = totalcpl.findall(ptype_text)
            if ptotal:
                ptype = ""
                ptotal = int(ptotal[0])
                if u"户型图" in ptype_text:
                    ptype = 900
                elif u"交通图" in ptype_text:
                    ptype = 901
                elif u"外景图" in ptype_text:
                    ptype = 902
                elif u"实景图" in ptype_text:
                    ptype = 903
                elif u"效果图" in ptype_text:
                    ptype = 904
                elif u"样板间" in ptype_text:
                    ptype = 905
                elif u"项目现场" in ptype_text:
                    ptype = 906
                elif u"配套图" in ptype_text or u"周边" in ptype_text:
                    ptype = 907
                elif u"全部图片" in ptype_text:
                    ptype = 1000
                if ptype:
                    photolist[ptype] = (ptotal, purl)

    if 1000 in photolist and photolist[1000][0] == 0:
        # Empty page with no photos
        return None

    sql_lists = []
    for ptype, pvalue in photolist.iteritems():
        ptotal, purl = pvalue
        # caseFor4S albums paginate 10 photos per page, the rest 16
        if "caseFor4S" in purl:
            radix = 10.0
        else:
            radix = 16.0
        for i in range(1, int(ceil(ptotal / radix)) + 1):
            sqlstring = "replace into lp_photo_summary(`lid`, `cid`, `url`, `rid`, `photo_type`, `total`, `npage`, `page_type`) " \
                "values(%s, %s, '%s', %s, %s, %s, %s, %s)" % \
                (info['id'], info['cid'], purl, info['rid'], ptype, ptotal,
                 i, page_type)
            sql_lists.append(sqlstring)

    conn = torndb.Connection(host="192.168.1.119", database="data_transfer",
                             user="******", password=passwd)
    for sqlstring in sql_lists:
        conn.execute(sqlstring)
    conn.close()
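
# The page counts above are ceil(total / per_page); a small helper covering
# the three page sizes used in this module (6 for page_type=1 albums, 10 for
# caseFor4S albums, 16 otherwise).
def album_page_count(total, per_page):
    """Number of album pages needed to list `total` photos."""
    return int(ceil(total / float(per_page)))

# e.g. album_page_count(33, 16) == 3, album_page_count(33, 6) == 6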