def parse_item_page(self, response):
    """Populate an ``Apartment`` item from one listing page and yield it.

    The item is filled through ``ApartmentLoader``: the page URL and the
    current UTC timestamp are added as plain values, everything else is
    pulled from ``response`` with XPath expressions.

    :param response: response object for the listing page; it is handed to
        ``ApartmentLoader`` and its ``url`` is stored on the item.
    :return: generator yielding the single loaded item.
    """
    loader = ApartmentLoader(Apartment(), response)
    loader.add_value('url', response.url)

    # Values read from fixed places in the markup.  'street' is queried
    # with two different XPaths -- presumably to cover two page layouts
    # (TODO confirm).  'city' is not populated by this method.
    markup_xpaths = (
        ('description', '//meta[@name="description"]/@content'),
        ('street', '//i[contains(@class,"irri-map")]/following-sibling::span/text()'),
        ('street', '//i[contains(@class,"icon_spot")]/following-sibling::div/text()'),
        ('price', '//div[contains(@class,"productPagePrice")]/text()'),
    )
    for field, xpath in markup_xpaths:
        loader.add_xpath(field, xpath)

    loader.add_value('updated', datetime.utcnow().isoformat())

    # Posting date, again looked up with two alternative XPaths.
    post_date_xpaths = (
        '//div[@class="advertHeader"]/div[@class="createDate"]/text()',
        '//div[@class="productPage_headerColumn"]/div[@class="productPage__createDate"]/text()',
    )
    for xpath in post_date_xpaths:
        loader.add_xpath('postDate', xpath)

    # Labelled properties: self.extract_property_string(label) supplies the
    # XPath expression used for each label.  The item field names
    # (including the spelling 'rennovation') are used exactly as given.
    labelled_properties = (
        ('m2', u"Общая площадь:"),                # total area
        ('kitchenm2', u"Площадь кухни:"),         # kitchen area
        ('restm2', u"Жилая площадь:"),            # living area
        ('floor', u"Этаж:"),                      # floor
        ('totfloors', u"Этажей в здании:"),       # floors in the building
        ('rooms', u"Комнат в квартире:"),         # rooms in the apartment
        ('district', u"Район города:"),           # city district
        ('rennovation', u"Ремонт:"),              # renovation
        ('builtDate', u"Год постройки:"),         # year built
        ('water', u"Система водоснабжения:"),     # water supply system
        ('heating', u"Система отопления:"),       # heating system
        ('wc', u"Санузел:"),                      # bathroom / WC
        ('walls', u"Материал стен:"),             # wall material
        ('ceilings', u"Высота потолков:"),        # ceiling height
        ('balcony', u"Балкон"),                   # balcony
        ('security', u"Охрана"),                  # security
    )
    for field, label in labelled_properties:
        loader.add_xpath(field, self.extract_property_string(label))

    yield loader.load_item()
def parse_item_page(self, response):
    """Populate an ``Apartment`` item from one listing page and yield it.

    Room count, area and floor numbers are cut out of the page title with
    regular expressions; price, city, address, description and posting
    date come from dedicated elements.  The page URL and the current UTC
    timestamp are added as plain values.

    The description text is written to the spider's logger at DEBUG level
    (it used to be printed to stdout).

    :param response: response object for the listing page; it is handed to
        ``ApartmentLoader`` and queried with XPath.
    :return: generator yielding the single loaded item.
    """
    title_xpath = '//h1[@itemprop="name" and @class="h1"]/text()'
    address_xpath = '//span[@itemprop="streetAddress"]/text()'
    description_xpath = '//div[@class="description description-text"]/descendant::*/text()'

    loader = ApartmentLoader(Apartment(), response)
    loader.add_value('url', response.url)

    # Sample titles the regular expressions below are written for:
    #   1-к квартира, 42 м², 12/18 эт.
    #   Студия, 28 м², 2/5 эт.
    #   > 9-к квартира, 336 м², 23/23 эт
    loader.add_xpath('rooms', title_xpath, re=u'(Студия)')
    loader.add_xpath('rooms', title_xpath, re=u'(?:(\d+)-к квартира)')
    loader.add_xpath('m2', title_xpath, re=u',\s+(\d*\.\d+|\d+)\s+м²,')
    loader.add_xpath('floor', title_xpath, re=u'\s+(\d+)/\d+\s+эт')
    loader.add_xpath('totfloors', title_xpath, re=u'\s+\d+/(\d+)\s+эт')

    loader.add_xpath('price', '//span[@itemprop="price"]/text()')
    loader.add_xpath('city', '//meta[@itemprop="addressLocality"]/@content')
    # Both fields receive the same address text; presumably the loader's
    # processors split district from street -- TODO confirm.
    loader.add_xpath('district', address_xpath)
    loader.add_xpath('street', address_xpath)

    # Diagnostic only: goes through the spider logger so it can be
    # filtered by level instead of always being written to stdout.
    description = ' '.join(response.xpath(description_xpath).extract())
    self.logger.debug('description %s', description)
    loader.add_xpath('description', description_xpath)

    loader.add_value('updated', datetime.utcnow().isoformat())

    # NOTE(review): a div whose class is exactly "item-subtitle" matches
    # both expressions, so its text is added twice.  Kept as-is because
    # the effect depends on the loader's processors -- confirm.
    loader.add_xpath('postDate', '//div[@class="item-subtitle"]/text()')
    loader.add_xpath('postDate', '//div[contains(@class,"item-subtitle")]/text()')

    yield loader.load_item()
def parse_item_page(self, response):
    """Populate an ``Apartment`` item from one listing page and yield it.

    The room count is derived from the listing heading, the price is
    parsed and rescaled in Python, and the remaining fields are extracted
    with XPath plus regular expressions.  The page URL and the current UTC
    timestamp are added as plain values.

    :param response: response object for the listing page; it is handed to
        ``ApartmentLoader`` and queried with XPath.
    :return: generator yielding the single loaded item.
    :raises ValueError: if the heading is missing, if it contains none of
        the known listing-type fragments, or if the price text cannot be
        parsed as a number.
    """
    # Ordered (lower-case heading fragment, room count) pairs; the first
    # fragment found in the heading decides the room count.
    rooms_by_fragment = (
        (u'гостинка', 0),
        (u'комната', 0),
        (u'однокомн', 1),
        (u'двухком', 2),
        (u'трехкомн', 3),
        (u'четырехко', 4),
        (u'пятикомнатна', 5),
    )

    area_xpath = u'//div[@id="list_sale"]/div[@class="card_block"]/p[contains(.,"Площадь")]/text()'
    floor_xpath = u'//div[@id="list_sale"]/div[@class="card_block"]/p[contains(text(),"Этаж")]/text()'
    location_xpath = u'//div[@id="list_sale"]/div[@class="card_block" and contains(.,"Местонахождение")]/text()'
    params_xpath = u'//div[@id="list_sale"]/div[@class="card_block" and contains(.,"Параметры")]/descendant-or-self::*/text()'

    loader = ApartmentLoader(Apartment(), response)
    loader.add_value('url', response.url)

    headings = response.xpath('//div[@id="list_sale"]/div[@class="fav"]/following-sibling::h1/text()').extract()
    if not headings:
        raise ValueError('listing heading not found on %s' % response.url)
    heading = headings[0]
    heading_lower = heading.lower()
    rooms = next(
        (count for fragment, count in rooms_by_fragment
         if fragment in heading_lower),
        None,
    )
    if rooms is None:
        # Raise explicitly: an ``assert`` disappears under ``python -O``
        # and would leave ``rooms`` undefined.
        self.logger.error('ERROR type %s', heading)
        raise ValueError('unrecognised listing type %r' % heading)
    loader.add_value('rooms', rooms)

    # Area is given either as a single total or as three slash-separated
    # numbers; the three-number form feeds m2, restm2 and kitchenm2 from
    # its first, second and third number respectively.
    loader.add_xpath('m2', u'//div[@id="list_sale"]/div[@class="card_block"]/p[contains(.,"Общая площадь")]/text()', re=u'Общая площадь: (\d*\.\d+|\d+) м')
    loader.add_xpath('m2', area_xpath, re=u'Площадь: (\d*\.\d+|\d+)/(?:\d*\.\d+|\d+)/(?:\d*\.\d+|\d+) м')
    loader.add_xpath('kitchenm2', area_xpath, re=u'Площадь: (?:\d*\.\d+|\d+)/(?:\d*\.\d+|\d+)/(\d*\.\d+|\d+) м')
    loader.add_xpath('restm2', area_xpath, re=u'Площадь: (?:\d*\.\d+|\d+)/(\d*\.\d+|\d+)/(?:\d*\.\d+|\d+) м')

    loader.add_xpath('floor', floor_xpath, re=u'Этаж: (\d+)/\d+')
    loader.add_xpath('totfloors', floor_xpath, re=u'Этаж: \d+/(\d+)')

    loader.add_xpath('city', location_xpath, re=u'Населенный пункт:\s+([\w-]+)')
    loader.add_xpath('district', location_xpath, re=u'Район:\s+(\w+)')
    loader.add_xpath('street', location_xpath, re=u'Адрес:\s+(.*)')

    loader.add_xpath('rennovation', params_xpath, re=u'Состояние помещения:\s+(.*)')
    loader.add_xpath('walls', params_xpath, re=u'Тип дома:\s+(.*)')
    loader.add_xpath('balcony', params_xpath, re=u'Балкон:\s+(.*)')
    loader.add_xpath('wc', params_xpath, re=u'Санузел:\s+(.*)')

    loader.add_xpath('description', u'//div[@id="list_sale"]/div[@class="card_block" and contains(.,"Дополнительная")]/descendant-or-self::*/text()')
    loader.add_value('updated', datetime.utcnow().isoformat())
    loader.add_xpath('postDate', u'//div[@id="list_sale"]//div[@class="card_date" and contains(.,"добавлено")]/descendant-or-self::*/text()', re=u'добавлено\s+(.*)')

    # Price: join the text nodes, turn the decimal comma into a dot and
    # multiply by 1000 (presumably the page quotes thousands of roubles --
    # TODO confirm).  Lazy %-style logging keeps non-ASCII text from
    # raising while the message is built; the message texts are unchanged.
    price_text = ''.join(response.xpath(u'//div[@id="list_sale"]//div[@class="card_price" and contains(.,"руб")]/text()').extract())
    self.logger.debug("1. price %s", price_text)
    self.logger.debug("2. price %s", price_text)
    price_text = price_text.replace(',', '.')
    self.logger.debug("3. price %s", price_text)
    # Round before converting: plain int() truncates, so a product such
    # as 1.001 * 1000 == 1000.9999999999999 would come out as 1000.
    price = str(int(round(float(price_text) * 1000)))
    self.logger.debug("4. price %s", price)
    loader.add_value('price', price)

    yield loader.load_item()