def parse_5i5j(self, response):
    """Yield one ``AgentItem`` per agent card on a 5i5j listing page.

    Every child ``div`` of the ``list-con-box`` container is loaded as one
    agent.  ``dist_name`` and ``subdist_name`` use document-wide XPaths, so
    all agents scraped from the same page receive the same values.

    :param response: listing-page response to parse.
    :returns: generator of loaded items.
    """
    self.logger.info("process 5i5j url")
    strip = MapCompose(str.strip)
    to_int = MapCompose(int)

    for card in response.xpath('//div[@class="list-con-box"]/div'):
        loader = ItemLoader(item=AgentItem(), selector=card)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath("name", './/h3/text()')
        # Document-wide lookups: not relative to the current card.
        loader.add_xpath("dist_name",
                         '//li[@class="new_di_tab_cur"]//text()',
                         strip, Join())
        loader.add_xpath("subdist_name",
                         '//dd//a[@class="cur"]//text()',
                         strip, Join())
        loader.add_xpath("address", './/p[@class="iconsleft"]//text()', Join())
        # The regex runs before the processors, so int() only ever sees
        # digit runs.  Raw strings keep the backslash-d escape valid.
        loader.add_xpath("telephone",
                         './/div[@class="contacty"]/span/text()',
                         to_int, re=r'\d+')
        loader.add_xpath("recent_activation",
                         './/p[@class="eye-icons"]',
                         to_int, re=r'(\d+)次')
        # Both counters are parsed out of the same paragraph text.
        loader.add_xpath("history_amount",
                         './/p[@class="iconsleft1"]/text()',
                         to_int, re=r'买卖(\d+)')
        loader.add_xpath("rent_house_amount",
                         './/p[@class="iconsleft1"]/text()',
                         to_int, re=r'租赁(\d+)')

        # NOTE(review): some sibling parsers call self._load_ids /
        # self._load_keephouse instead -- confirm both helper pairs exist.
        # ids
        self._loads_ids(loader, response)
        # housekeeping
        self._loads_housekeeping(loader, response)
        yield loader.load_item()
def parse_lianjia(self, response):
    """Yield one ``AgentItem`` per ``li`` of the lianjia ``agent-lst`` list.

    All fields are read with XPaths relative to the current list entry.

    :param response: listing-page response to parse.
    :returns: generator of loaded items.
    """
    self.logger.info("process lianjia url")
    strip = MapCompose(str.strip)
    to_int = MapCompose(int)
    plate = './/div[@class="main-plate"]'

    for entry in response.xpath('//ul[@class="agent-lst"]/li'):
        loader = ItemLoader(item=AgentItem(), selector=entry)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath("name", './/div[@class="agent-name"]//h2/text()')
        # First and second links of the "main-plate" block.
        loader.add_xpath("dist_name", plate + '//a[1]/text()', strip)
        loader.add_xpath("subdist_name", plate + '//a[2]/text()', strip)
        # Telephone is kept as the extracted text (no int conversion here).
        loader.add_xpath("telephone", './/p[@class="mobile_p"]/text()')
        loader.add_xpath("history_amount",
                         './/span[@class="LOGCLICKEVTID"]/text()',
                         to_int, re=r"\d+")
        loader.add_xpath("recent_activation",
                         './/div[@class="achievement"]/span/text()',
                         to_int, re=r"(\d+)套")

        # NOTE(review): some sibling parsers call self._loads_ids /
        # self._loads_housekeeping instead -- confirm both helper pairs exist.
        # ids
        self._load_ids(loader, response)
        # housekeeping
        self._load_keephouse(loader, response)
        yield loader.load_item()
def parse_centanet(self, response):
    """Yield one ``AgentItem`` per broker ``li`` on a centanet listing page.

    Name and telephone are both pulled by regex from the ``zvalue``
    attribute of the same ``b`` element.  ``dist_name`` and ``subdist_name``
    come from the first and second ``span.curr`` in the whole document, so
    they are identical for every broker on the page.

    :param response: listing-page response to parse.
    :returns: generator of loaded items.
    """
    self.logger.info("process centanet url")
    zvalue = './/p[@class="phone"]/b/@zvalue'
    outstanding = './/div[@class="outstanding"]'

    for broker in response.xpath('//ul[@class="broker_list broker_listSZ"]/li'):
        loader = ItemLoader(item=AgentItem(), selector=broker)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath("name", zvalue, re=r"cnName:'(\w+)'")
        # Document-wide lookups: not relative to the current broker.
        loader.add_xpath("dist_name", '(//span[@class="curr"])[1]/text()')
        loader.add_xpath("subdist_name", '(//span[@class="curr"])[2]/text()')
        loader.add_xpath("company", './/h2//@title')
        loader.add_xpath("address", './/p[@class="xi"]//@title', Join('-'))
        loader.add_xpath("telephone", zvalue, re=r"mobile:'(\w+)'")
        # The two amounts stay as the matched digit strings.
        loader.add_xpath("second_house_amount",
                         outstanding + '//p[1]/a/text()', re=r"\d+")
        loader.add_xpath("rent_house_amount",
                         outstanding + '//p[2]/a/text()', re=r"\d+")

        # NOTE(review): some sibling parsers call self._load_ids /
        # self._load_keephouse instead -- confirm both helper pairs exist.
        # ids
        self._loads_ids(loader, response)
        # housekeeping
        self._loads_housekeeping(loader, response)
        yield loader.load_item()
def parse_anjuke(self, response):
    """Yield one ``AgentItem`` per ``jjr-itemmod`` block on an anjuke page.

    ``dist_name`` and ``subdist_name`` are the first and second
    ``selected-item`` links found in the whole document, so they are shared
    by every agent on the page.

    :param response: listing-page response to parse.
    :returns: generator of loaded items.
    """
    self.logger.info("process anjuke url")
    selected = '(//span[@class="elems-l"]//a[@class="selected-item"])'
    description = './/p[@class="jjr-desc"]'

    for block in response.xpath('//div[@class="jjr-itemmod"]'):
        loader = ItemLoader(item=AgentItem(), selector=block)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath("name", ".//h3/a/text()", Join())
        # First link of the description is the company, second the address.
        loader.add_xpath("company", description + '/a[1]/text()')
        loader.add_xpath("address", description + '/a[2]/text()')
        # The regex keeps digit runs only; each run is converted to int.
        loader.add_xpath("telephone", './/div[@class="jjr-side"]/text()',
                         MapCompose(int), re=r"\d+")
        # Document-wide lookups: not relative to the current block.
        loader.add_xpath("dist_name", selected + '[1]//text()')
        loader.add_xpath("subdist_name", selected + '[2]//text()')

        # NOTE(review): some sibling parsers call self._loads_ids /
        # self._loads_housekeeping instead -- confirm both helper pairs exist.
        # ids
        self._load_ids(loader, response)
        # housekeeping
        self._load_keephouse(loader, response)
        yield loader.load_item()
def parse_ganji(self, response):
    """Yield one ``AgentItem`` per ``f-list-item`` block on a ganji page.

    ``dist_name`` and ``subdist_name`` use document-wide XPaths and are
    therefore shared by every agent on the page; all other fields are read
    relative to the current block.

    :param response: listing-page response to parse.
    :returns: generator of loaded items.
    """
    self.logger.info("process ganji url")

    for block in response.xpath('//div[@class="f-list-item"]'):
        loader = ItemLoader(item=AgentItem(), selector=block)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath("name", './/a[@class="broker-name"]/text()')
        loader.add_xpath("address",
                         './/span[@class="bi-text broker-xiaoqu"]//text()',
                         MapCompose(str.strip), Join())
        # No regex here: int() tolerates surrounding whitespace but raises
        # ValueError on any other non-digit text.
        loader.add_xpath("telephone", './/p[@class="tel"]/text()',
                         MapCompose(int))
        # Document-wide lookups: not relative to the current block.
        loader.add_xpath(
            "dist_name",
            '//ul[@class="f-clear"]/li[@class="item current"]//text()')
        loader.add_xpath("subdist_name",
                         '//a[@class="subway-item current"]//text()')
        # Must be relative (".//"): an absolute "//" path searches the whole
        # document, so TakeFirst would hand every agent the first company
        # that appears on the page.
        loader.add_xpath("company",
                         './/span[@class="bi-text broker-company"]/text()')

        # NOTE(review): some sibling parsers call self._loads_ids /
        # self._loads_housekeeping instead -- confirm both helper pairs exist.
        # ids
        self._load_ids(loader, response)
        # housekeeping
        self._load_keephouse(loader, response)
        yield loader.load_item()
def parse_fang(self, response):
    """Yield one ``AgentItem`` per ``li`` carrying a ``link`` attribute.

    ``dist_name`` and ``subdist_name`` are the first and second
    ``a.orange`` elements of the whole document and are shared by every
    agent on the page; all other fields are read relative to the row.

    :param response: listing-page response to parse.
    :returns: generator of loaded items.
    """
    self.logger.info("process fang url")

    for row in response.xpath('//li[@link]'):
        loader = ItemLoader(item=AgentItem(), selector=row)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath("name", './/div[@class="ttop"]//a//text()')
        # The regex keeps digit runs only; each run is converted to int.
        loader.add_xpath("telephone", './/div[@class="fl"]/p[1]/text()',
                         MapCompose(int), re=r"\d+")
        # Must be relative to the row: the former '//li[@link]//p[...]' path
        # re-selected every row in the document, so Join(',') glued the
        # companies of all agents on the page into each single item.
        loader.add_xpath("company",
                         './/p[@class="f14 liaxni"]/span[2]/text()',
                         Join(','), re=r"\w+")
        # Document-wide lookups: not relative to the current row.
        loader.add_xpath("dist_name", '(//a[@class="orange"])[1]//text()')
        loader.add_xpath("subdist_name", '(//a[@class="orange"])[2]//text()')
        # No text() step: the regex is applied to the extracted element.
        loader.add_xpath("second_house_amount", './/b[@class="ml03"]',
                         re=r"(\d+)套")

        # NOTE(review): some sibling parsers call self._loads_ids /
        # self._loads_housekeeping instead -- confirm both helper pairs exist.
        # ids
        self._load_ids(loader, response)
        # housekeeping
        self._load_keephouse(loader, response)
        yield loader.load_item()
def parse_qfang(self, response):
    """Yield one ``AgentItem`` per ``li`` under ``#find_broker_lists``.

    Every field is read with an XPath relative to the current list entry.

    :param response: listing-page response to parse.
    :returns: generator of loaded items.
    """
    self.logger.info("process qfang url")
    info = './/span[@class="con fl"]'

    for entry in response.xpath('//div[@id="find_broker_lists"]//li'):
        loader = ItemLoader(item=AgentItem(), selector=entry)
        loader.default_output_processor = TakeFirst()

        # Must be relative (".//"): an absolute "//" path searches the whole
        # document, so TakeFirst would give every agent the first name that
        # appears on the page.
        loader.add_xpath("name", './/p[@class="name fl"]//a/text()')
        loader.add_xpath("dist_name", info + '/b[1]/text()')
        loader.add_xpath("subdist_name", info + '/b[2]/text()')
        # The regex keeps digit runs only; each run is converted to int.
        loader.add_xpath("telephone",
                         './/div[@class="broker-tel fr"]/p/text()',
                         MapCompose(int), re=r"\d+")
        loader.add_xpath("history_amount", info + '/em/text()')

        # NOTE(review): some sibling parsers call self._load_ids /
        # self._load_keephouse instead -- confirm both helper pairs exist.
        # ids
        self._loads_ids(loader, response)
        # housekeeping
        self._loads_housekeeping(loader, response)
        yield loader.load_item()
def parse_item(self, response):
    """Yield an ``AgentItem`` and then a ``PropertyItem`` for a detail page.

    Both items are built from the same response.  When the agent item ends
    up without a ``subdist_name`` the raw page is written to
    ``failed_html/html_<id>.html`` for later inspection; the item is still
    yielded.  ``<id>`` is the ``id`` query parameter of the response URL, or
    ``unknown`` when the URL has none.  A failure to write that debug file
    is logged and does not stop the items from being yielded.

    Reads ``self.city_name``, ``self.dist_name``, ``self.category_name``,
    ``self.station_name``, ``self.settings`` and ``self.name``.

    :param response: detail-page response to parse.
    :returns: generator yielding the agent item, then the property item.
    """
    # Removes every whitespace character from the phone text.
    squash = MapCompose(lambda text: "".join(text.split()))
    subdist_xpath = '(//div[@class="xx_xq_l200"])[2]/text()'
    subdist_re = r'区域:(?:昆山)?(\w+)'

    # agency table
    agent = ItemLoader(item=AgentItem(), response=response)
    agent.default_output_processor = TakeFirst()
    agent.add_xpath("name", '//div[@class="sthys3"]/text()', re=r":(\w+)")
    agent.add_xpath("telephone", '//div[@class="sttelct2 sttelct"]/text()',
                    squash)
    # Make sure the key exists on the item even if the XPath matches nothing.
    agent.item.setdefault("company", None)
    agent.add_xpath("company", '//li[@class="st14 stb starial"]//text()')
    agent.add_xpath("address", '//div[@class="xflilist"]/div[3]//text()',
                    re=r':(\w+)')
    agent.add_xpath("register_date", '//div[@class="jbfx"]/text()',
                    re=r'登记日期:([\d/]+)')
    agent.add_value("city_name", self.city_name)
    agent.add_value("dist_name", self.dist_name)
    agent.add_value("category_name", self.category_name)
    agent.add_value("station_name", self.station_name)
    agent.add_xpath("subdist_name", subdist_xpath, re=subdist_re)

    # housekeeping
    agent.add_value("source", response.url)
    agent.add_value("project", self.settings.get("BOT_NAME"))
    agent.add_value("spider", self.name)
    agent.add_value("server", socket.gethostname())
    # Naive datetime expressed in UTC.
    agent.add_value("dt", datetime.datetime.utcnow())

    item = agent.load_item()
    if not item.get("subdist_name"):
        self.logger.critical(
            "subdsitrict name is not scrape, save response as a file")
        # parse_qs returns no entry when the parameter is absent; fall back
        # to a fixed name instead of failing on None[0].
        ids = parse_qs(urlparse(response.url).query).get("id")
        page_id = ids[0] if ids else "unknown"
        try:
            # The context manager closes the file even if write() raises.
            with open("failed_html/html_%s.html" % page_id, 'w',
                      encoding='utf8') as dump:
                dump.write(response.text)
        except OSError:
            # The dump is only a debugging aid; losing it must not cost us
            # the scraped items.
            self.logger.exception(
                "could not save failed response for %s", response.url)
    yield item

    # properties table
    prop = ItemLoader(item=PropertyItem(), response=response)
    prop.default_output_processor = TakeFirst()
    prop.add_xpath('title', '//div[@class="xxview_title"]/text()')
    prop.add_value("url", response.url)
    prop.add_xpath(
        "price",
        '//div[@class="xx_xq_l200"]/span[@class="st22 '
        'sthuangs stb starial"]/text()')
    prop.add_xpath("address", '//div[@class="wydzleft"]/text()',
                   MapCompose(str.strip), re=r'物业地址:([^\x01-\x1f]+)')
    prop.add_xpath("agent_name", '//div[@class="sthys3"]/text()',
                   re=r":(\w+)")
    prop.item.setdefault("agent_company", None)
    # Same selector as the agent's "company" field above.
    prop.add_xpath("agent_company", '//li[@class="st14 stb starial"]//text()')
    prop.add_xpath('agent_phone', '//div[@class="sttelct2 sttelct"]/text()',
                   squash)
    prop.add_xpath("recent_activation", '//div[@class="fyfbtime"]/text()',
                   re=r'查看人次:(\d+)')
    prop.add_value("city_name", self.city_name)
    prop.add_value("dist_name", self.dist_name)
    prop.add_value('station_name', self.station_name)
    prop.add_value("category_name", self.category_name)
    prop.add_xpath("subdist_name", subdist_xpath, re=subdist_re)

    # housekeeping
    # Uses the request URL here, unlike the agent item, which records
    # response.url.
    prop.add_value("source", response.request.url)
    prop.add_value("project", self.settings.get("BOT_NAME"))
    prop.add_value("spider", self.name)
    prop.add_value("server", socket.gethostname())
    prop.add_value("dt", datetime.datetime.utcnow())
    yield prop.load_item()