Exemplo n.º 1
0
    def parse_5i5j(self, response):
        """Yield one loaded ``AgentItem`` per agent block on the page.

        Iterates over the child ``div`` elements of ``div.list-con-box`` and
        loads each one through its own ``ItemLoader`` whose default output
        processor is ``TakeFirst``.

        Args:
            response: The response to scrape. Its ``xpath`` method selects
                the agent blocks, and it is forwarded unchanged to the
                ``_loads_ids`` / ``_loads_housekeeping`` helpers.

        Yields:
            The item returned by ``load_item()`` for each agent block.
        """
        self.logger.info("process 5i5j  url")
        for block in response.xpath('//div[@class="list-con-box"]/div'):
            loader = ItemLoader(item=AgentItem(), selector=block)
            loader.default_output_processor = TakeFirst()
            loader.add_xpath("name", './/h3/text()')
            # These two XPaths are absolute (no leading "."), so they are
            # evaluated against the whole document, not the current block;
            # every item from this response gets the same values.
            loader.add_xpath("dist_name",
                             '//li[@class="new_di_tab_cur"]//text()',
                             MapCompose(str.strip), Join())
            loader.add_xpath("subdist_name",
                             '//dd//a[@class="cur"]//text()',
                             MapCompose(str.strip), Join())
            loader.add_xpath("address",
                             './/p[@class="iconsleft"]//text()', Join())
            # Regex patterns are raw strings so that ``\d`` is not parsed as
            # an (invalid) string escape sequence.
            loader.add_xpath("telephone",
                             './/div[@class="contacty"]/span/text()',
                             MapCompose(int), re=r'\d+')
            loader.add_xpath("recent_activation",
                             './/p[@class="eye-icons"]',
                             MapCompose(int), re=r'(\d+)次')
            # Both counters are read from the same paragraph; the regex
            # prefix picks out which number belongs to which field.
            loader.add_xpath("history_amount",
                             './/p[@class="iconsleft1"]/text()',
                             MapCompose(int), re=r'买卖(\d+)')
            loader.add_xpath("rent_house_amount",
                             './/p[@class="iconsleft1"]/text()',
                             MapCompose(int), re=r'租赁(\d+)')

            # ids (helper defined elsewhere in the class)
            self._loads_ids(loader, response)
            # housekeeping (helper defined elsewhere in the class)
            self._loads_housekeeping(loader, response)

            yield loader.load_item()
Exemplo n.º 2
0
    def parse_lianjia(self, response):
        """Yield one loaded ``AgentItem`` per ``li`` of ``ul.agent-lst``.

        Each list entry gets its own ``ItemLoader`` with ``TakeFirst`` as the
        default output processor. All field XPaths are relative to the entry.

        Args:
            response: The response to scrape; also forwarded to the
                ``_load_ids`` / ``_load_keephouse`` helpers.

        Yields:
            The item returned by ``load_item()`` for each list entry.
        """
        self.logger.info("process lianjia url")
        for entry in response.xpath('//ul[@class="agent-lst"]/li'):
            loader = ItemLoader(item=AgentItem(), selector=entry)
            loader.default_output_processor = TakeFirst()

            loader.add_xpath("name", './/div[@class="agent-name"]//h2/text()')
            # The first and second links of the "main-plate" block supply
            # the district and sub-district fields respectively.
            loader.add_xpath(
                "dist_name",
                './/div[@class="main-plate"]//a[1]/text()',
                MapCompose(str.strip),
            )
            loader.add_xpath(
                "subdist_name",
                './/div[@class="main-plate"]//a[2]/text()',
                MapCompose(str.strip),
            )
            loader.add_xpath("telephone", './/p[@class="mobile_p"]/text()')
            loader.add_xpath(
                "history_amount",
                './/span[@class="LOGCLICKEVTID"]/text()',
                MapCompose(int),
                re=r"\d+",
            )
            loader.add_xpath(
                "recent_activation",
                './/div[@class="achievement"]/span/text()',
                MapCompose(int),
                re=r"(\d+)套",
            )

            # ids (helper defined elsewhere in the class)
            self._load_ids(loader, response)
            # housekeeping (helper defined elsewhere in the class)
            self._load_keephouse(loader, response)

            yield loader.load_item()
Exemplo n.º 3
0
    def parse_centanet(self, response):
        """Yield one loaded ``AgentItem`` per ``li`` of the broker list.

        Name and telephone are both pulled by regex out of the same
        ``@zvalue`` attribute. District and sub-district use absolute XPaths,
        so they are evaluated against the whole document and are the same for
        every item from this response.

        Args:
            response: The response to scrape; also forwarded to the
                ``_loads_ids`` / ``_loads_housekeeping`` helpers.

        Yields:
            The item returned by ``load_item()`` for each broker entry.
        """
        self.logger.info("process centanet url")
        # Attribute that carries both the ``cnName:'...'`` and
        # ``mobile:'...'`` fragments matched below.
        zvalue_xpath = './/p[@class="phone"]/b/@zvalue'

        for broker in response.xpath(
                '//ul[@class="broker_list broker_listSZ"]/li'):
            loader = ItemLoader(item=AgentItem(), selector=broker)
            loader.default_output_processor = TakeFirst()

            loader.add_xpath("name", zvalue_xpath, re=r"cnName:'(\w+)'")
            loader.add_xpath("dist_name",
                             '(//span[@class="curr"])[1]/text()')
            loader.add_xpath("subdist_name",
                             '(//span[@class="curr"])[2]/text()')
            loader.add_xpath("company", './/h2//@title')
            loader.add_xpath("address", './/p[@class="xi"]//@title',
                             Join('-'))
            loader.add_xpath("telephone", zvalue_xpath,
                             re=r"mobile:'(\w+)'")
            loader.add_xpath("second_house_amount",
                             './/div[@class="outstanding"]//p[1]/a/text()',
                             re=r"\d+")
            loader.add_xpath("rent_house_amount",
                             './/div[@class="outstanding"]//p[2]/a/text()',
                             re=r"\d+")

            # ids (helper defined elsewhere in the class)
            self._loads_ids(loader, response)
            # housekeeping (helper defined elsewhere in the class)
            self._loads_housekeeping(loader, response)

            yield loader.load_item()
Exemplo n.º 4
0
    def parse_anjuke(self, response):
        """Yield one loaded ``AgentItem`` per ``div.jjr-itemmod`` element.

        Per-agent fields use XPaths relative to the agent element. District
        and sub-district use absolute XPaths (the first and second
        ``selected-item`` links), so they are evaluated against the whole
        document and are the same for every item from this response.

        Args:
            response: The response to scrape; also forwarded to the
                ``_load_ids`` / ``_load_keephouse`` helpers.

        Yields:
            The item returned by ``load_item()`` for each agent element.
        """
        self.logger.info("process anjuke url")
        for agent in response.xpath('//div[@class="jjr-itemmod"]'):
            loader = ItemLoader(item=AgentItem(), selector=agent)
            loader.default_output_processor = TakeFirst()

            loader.add_xpath("name", ".//h3/a/text()", Join())
            loader.add_xpath("company",
                             './/p[@class="jjr-desc"]/a[1]/text()')
            loader.add_xpath("address",
                             './/p[@class="jjr-desc"]/a[2]/text()')
            loader.add_xpath("telephone",
                             './/div[@class="jjr-side"]/text()',
                             MapCompose(int), re=r"\d+")
            loader.add_xpath(
                "dist_name",
                '(//span[@class="elems-l"]//a[@class="selected-item"])[1]//text()',
            )
            loader.add_xpath(
                "subdist_name",
                '(//span[@class="elems-l"]//a[@class="selected-item"])[2]//text()',
            )

            # ids (helper defined elsewhere in the class)
            self._load_ids(loader, response)
            # housekeeping (helper defined elsewhere in the class)
            self._load_keephouse(loader, response)

            yield loader.load_item()
Exemplo n.º 5
0
    def parse_ganji(self, response):
        """Yield one loaded ``AgentItem`` per ``div.f-list-item`` element.

        Name, address, telephone and company are read relative to the agent
        element. District and sub-district use absolute XPaths, so they are
        evaluated against the whole document and are the same for every item
        from this response.

        Args:
            response: The response to scrape; also forwarded to the
                ``_load_ids`` / ``_load_keephouse`` helpers.

        Yields:
            The item returned by ``load_item()`` for each agent element.
        """
        self.logger.info("process ganji url")
        for agent in response.xpath('//div[@class="f-list-item"]'):
            loader = ItemLoader(item=AgentItem(), selector=agent)
            loader.default_output_processor = TakeFirst()
            loader.add_xpath("name", './/a[@class="broker-name"]/text()')
            loader.add_xpath(
                "address",
                './/span[@class="bi-text broker-xiaoqu"]//text()',
                MapCompose(str.strip), Join())
            loader.add_xpath("telephone", './/p[@class="tel"]/text()',
                             MapCompose(int))
            loader.add_xpath(
                "dist_name",
                '//ul[@class="f-clear"]/li[@class="item current"]//text()')
            loader.add_xpath("subdist_name",
                             '//a[@class="subway-item current"]//text()')
            # Relative XPath: the absolute form matched every company span in
            # the document, so TakeFirst gave each agent the first company on
            # the page. NOTE(review): assumes the company span lives inside
            # each f-list-item, like the sibling "broker-xiaoqu" span.
            loader.add_xpath(
                "company",
                './/span[@class="bi-text broker-company"]/text()')

            # ids (helper defined elsewhere in the class)
            self._load_ids(loader, response)
            # housekeeping (helper defined elsewhere in the class)
            self._load_keephouse(loader, response)

            yield loader.load_item()
Exemplo n.º 6
0
    def parse_fang(self, response):
        """Yield one loaded ``AgentItem`` per ``li`` carrying a ``link`` attribute.

        Per-agent fields are read relative to the current ``li``. District
        and sub-district use absolute XPaths (first and second ``a.orange``),
        so they are evaluated against the whole document and are the same for
        every item from this response.

        Args:
            response: The response to scrape; also forwarded to the
                ``_load_ids`` / ``_load_keephouse`` helpers.

        Yields:
            The item returned by ``load_item()`` for each agent entry.
        """
        self.logger.info("process fang url")
        for agent in response.xpath('//li[@link]'):
            loader = ItemLoader(item=AgentItem(), selector=agent)
            loader.default_output_processor = TakeFirst()
            loader.add_xpath("name", './/div[@class="ttop"]//a//text()')
            loader.add_xpath("telephone",
                             './/div[@class="fl"]/p[1]/text()',
                             MapCompose(int),
                             re=r"\d+")
            # Relative to the current <li>: the previous absolute XPath
            # re-selected every //li[@link] in the document, so Join(',')
            # merged the companies of all agents into each item.
            loader.add_xpath("company",
                             './/p[@class="f14 liaxni"]/span[2]/text()',
                             Join(','),
                             re=r"\w+")
            loader.add_xpath("dist_name",
                             '(//a[@class="orange"])[1]//text()')
            loader.add_xpath("subdist_name",
                             '(//a[@class="orange"])[2]//text()')
            # No text() step: the regex is applied to what the <b> node
            # itself extracts to.
            loader.add_xpath("second_house_amount",
                             './/b[@class="ml03"]',
                             re=r"(\d+)套")

            # ids (helper defined elsewhere in the class)
            self._load_ids(loader, response)
            # housekeeping (helper defined elsewhere in the class)
            self._load_keephouse(loader, response)

            yield loader.load_item()
Exemplo n.º 7
0
    def parse_qfang(self, response):
        """Yield one loaded ``AgentItem`` per ``li`` under ``#find_broker_lists``.

        Every field is read with an XPath relative to the current ``li`` and
        reduced to its first extracted value by ``TakeFirst``.

        Args:
            response: The response to scrape; also forwarded to the
                ``_loads_ids`` / ``_loads_housekeeping`` helpers.

        Yields:
            The item returned by ``load_item()`` for each broker entry.
        """
        self.logger.info("process qfang url")
        for broker in response.xpath('//div[@id="find_broker_lists"]//li'):
            loader = ItemLoader(item=AgentItem(), selector=broker)
            loader.default_output_processor = TakeFirst()
            # Relative XPath: the absolute form matched every name on the
            # page, so TakeFirst gave each item the first broker's name.
            loader.add_xpath("name", './/p[@class="name fl"]//a/text()')
            loader.add_xpath("dist_name",
                             './/span[@class="con fl"]/b[1]/text()')
            loader.add_xpath("subdist_name",
                             './/span[@class="con fl"]/b[2]/text()')
            loader.add_xpath("telephone",
                             './/div[@class="broker-tel fr"]/p/text()',
                             MapCompose(int), re=r"\d+")
            loader.add_xpath("history_amount",
                             './/span[@class="con fl"]/em/text()')

            # ids (helper defined elsewhere in the class)
            self._loads_ids(loader, response)
            # housekeeping (helper defined elsewhere in the class)
            self._loads_housekeeping(loader, response)

            yield loader.load_item()
Exemplo n.º 8
0
    def parse_item(self, response):
        """Yield an ``AgentItem`` and then a ``PropertyItem`` for one page.

        Both items are loaded from the same response with ``TakeFirst`` as
        the default output processor. If the agent item ends up without a
        ``subdist_name``, the response body is written to
        ``failed_html/html_<id>.html`` for later inspection; the item is
        still yielded.

        Args:
            response: The response to scrape. ``response.url``,
                ``response.text`` and ``response.request.url`` are read.

        Yields:
            The loaded ``AgentItem``, followed by the loaded ``PropertyItem``.
        """
        # agency table
        agent = ItemLoader(item=AgentItem(), response=response)
        agent.default_output_processor = TakeFirst()
        agent.add_xpath("name", '//div[@class="sthys3"]/text()', re=r":(\w+)")
        # Remove all whitespace from the telephone text.
        agent.add_xpath("telephone", '//div[@class="sttelct2 sttelct"]/text()',
                        MapCompose(lambda x: "".join(x.split())))
        # Make sure the key exists even when the XPath matches nothing.
        agent.item.setdefault("company", None)
        agent.add_xpath("company", '//li[@class="st14 stb starial"]//text()')
        agent.add_xpath("address",
                        '//div[@class="xflilist"]/div[3]//text()',
                        re=r':(\w+)')
        agent.add_xpath("register_date",
                        '//div[@class="jbfx"]/text()',
                        re=r'登记日期:([\d/]+)')

        agent.add_value("city_name", self.city_name)
        agent.add_value("dist_name", self.dist_name)
        agent.add_value("category_name", self.category_name)
        agent.add_value("station_name", self.station_name)
        agent.add_xpath("subdist_name",
                        '(//div[@class="xx_xq_l200"])[2]/text()',
                        re='区域:(?:昆山)?(\\w+)')

        # housekeeping
        agent.add_value("source", response.url)
        agent.add_value("project", self.settings.get("BOT_NAME"))
        agent.add_value("spider", self.name)
        agent.add_value("server", socket.gethostname())
        agent.add_value("dt", datetime.datetime.utcnow())
        item = agent.load_item()

        if not item.get("subdist_name"):
            self.logger.critical(
                "subdsitrict name is not scrape, save response as a file")
            # A URL without an ``id`` query parameter would otherwise raise
            # TypeError on ``None[0]``; fall back to a fixed file name.
            ids = parse_qs(urlparse(response.url).query).get("id")
            page_id = ids[0] if ids else "unknown"
            # The dump is only a debugging aid: close the file reliably and
            # do not let an I/O failure prevent the items from being yielded.
            try:
                with open("failed_html/html_%s.html" % page_id,
                          'w',
                          encoding='utf8') as dump:
                    dump.write(response.text)
            except OSError:
                self.logger.exception(
                    "could not save failed response for %s", response.url)

        yield item

        # properties table
        prop = ItemLoader(item=PropertyItem(), response=response)
        prop.default_output_processor = TakeFirst()
        prop.add_xpath('title', '//div[@class="xxview_title"]/text()')
        prop.add_value("url", response.url)
        prop.add_xpath(
            "price", '//div[@class="xx_xq_l200"]/span[@class="st22 '
            'sthuangs stb starial"]/text()')
        prop.add_xpath("address",
                       '//div[@class="wydzleft"]/text()',
                       MapCompose(lambda x: x.strip()),
                       re=r'物业地址:([^\x01-\x1f]+)')
        prop.add_xpath("agent_name",
                       '//div[@class="sthys3"]/text()',
                       re=r":(\w+)")
        # Make sure the key exists even when the XPath matches nothing.
        prop.item.setdefault("agent_company", None)
        prop.add_xpath("agent_company",
                       '//li[@class="st14 stb starial"]//text()')
        prop.add_xpath('agent_phone', '//div[@class="sttelct2 sttelct"]/text()',
                       MapCompose(lambda x: "".join(x.split())))
        prop.add_xpath("recent_activation",
                       '//div[@class="fyfbtime"]/text()',
                       re='查看人次:(\\d+)')

        prop.add_value("city_name", self.city_name)
        prop.add_value("dist_name", self.dist_name)
        prop.add_value('station_name', self.station_name)
        prop.add_value("category_name", self.category_name)
        prop.add_xpath("subdist_name",
                       '(//div[@class="xx_xq_l200"])[2]/text()',
                       re='区域:(?:昆山)?(\\w+)')

        # housekeeping
        prop.add_value("source", response.request.url)
        prop.add_value("project", self.settings.get("BOT_NAME"))
        prop.add_value("spider", self.name)
        prop.add_value("server", socket.gethostname())
        prop.add_value("dt", datetime.datetime.utcnow())
        yield prop.load_item()