def desc_parser(self):
    """Build a one-element description list from the selected node.

    The text of the ``b`` child, when present, is lower-cased, capitalised
    and placed in front of the node's own text fragments (combined by
    ``join_strings`` with ``', '``).  Both parts are then combined by
    ``join_strings`` with a single space.
    """
    # NOTE(review): the original body was collapsed onto one line; only the
    # heading append is assumed to be conditional -- confirm.
    parts = []
    bold_fragments = self.sel.xpath('b/text()').extract()
    if bold_fragments:
        heading = join_strings(bold_fragments)
        parts.append(heading.lower().capitalize())
    own_fragments = self.sel.xpath('text()').extract()
    parts.append(join_strings(own_fragments, ', '))
    return [join_strings(parts, ' ')]
def prices(self):
    """Return the price data for this item, computing it on first use.

    The result is a dict with two single-element lists: ``'price'`` (the
    parsed price followed by its unit, or an empty string when no price
    was parsed) and ``'price_digit'`` (the value ``self.digit_price``
    returns for the same price and unit).  The dict is stored in
    ``self._prices`` and reused for as long as that attribute is truthy.
    """
    if self._prices:
        return self._prices

    amount = join_strings(self.price_parser())
    unit = join_strings(self.mesure_parser())
    if amount:
        display = '%s %s' % (amount, unit)
    else:
        display = ''
    numeric = self.digit_price(amount, unit)
    self._prices = {'price': [display], 'price_digit': [numeric]}
    return self._prices
def prices(self):
    """Return the price data for this item, computing it on first use.

    The result is a dict with two single-element lists: ``'price'`` (the
    parsed price followed by its unit, or an empty string when no price
    was parsed) and ``'price_digit'`` (the value ``self.digit_price``
    returns for the same price and unit).  The dict is stored in
    ``self._prices`` and reused for as long as that attribute is truthy.
    """
    if self._prices:
        return self._prices

    amount = join_strings(self.price_parser())
    unit = join_strings(self.mesure_parser())
    if amount:
        display = '%s %s' % (amount, unit)
    else:
        display = ''
    numeric = self.digit_price(amount, unit)
    self._prices = {'price': [display], 'price_digit': [numeric]}
    return self._prices
def process_section(self, section):
    """Describe the breadcrumb text held by ``section``.

    The node text is split on the arrow character and every piece is
    stripped.  Returns a dict with ``'sale'`` (True when one of the pieces
    equals the sale marker) and ``'section'`` (the pieces combined by
    ``join_strings`` with a space).
    """
    # NOTE(review): the original body was collapsed onto one line; the
    # 'section' assignment is assumed to be unconditional -- confirm.
    sale_marker = u'Продаю'
    raw_text = join_strings(section.xpath("text()").extract())
    crumbs = [piece.strip() for piece in raw_text.split(u"→")]
    return {
        'sale': sale_marker in crumbs,
        'section': join_strings(crumbs, ' '),
    }
def process_section(self, section):
    """Describe the breadcrumb text held by ``section``.

    The node text is split on the arrow character and every piece is
    stripped.  Returns a dict with ``'sale'`` (True when one of the pieces
    equals the sale marker) and ``'section'`` (the pieces combined by
    ``join_strings`` with a space).
    """
    # NOTE(review): the original body was collapsed onto one line; the
    # 'section' assignment is assumed to be unconditional -- confirm.
    sale_marker = u'Продаю'
    raw_text = join_strings(section.xpath("text()").extract())
    crumbs = [piece.strip() for piece in raw_text.split(u"→")]
    return {
        'sale': sale_marker in crumbs,
        'section': join_strings(crumbs, ' '),
    }
def process_detail_page(self, sel, response):
    """Build a ``RealtyItem`` from an advert detail page.

    ``sel`` is queried with XPath for the seller type, address, contact
    name, phone, description, price and price unit; ``response`` supplies
    the URL stored under ``'link'``.  Estate type, region, locality and
    room count are delegated to the parser methods on ``self``.

    Returns the populated item.
    """
    def fragments(path):
        # Extracted results of one XPath query against the page selector.
        return sel.xpath(path).extract()

    saler_type = join_strings(
        fragments('//div[@class="c_face"]/span[@class="lgrey"]/text()'))
    address = join_strings(
        fragments('//*[@id="list_sale"]/div[@class="card_block"][1]/text()'),
        ', ')
    name = join_strings(fragments('//div[@class="c_face"]/text()'))
    # The phone text goes through join_strings twice, as in the original.
    phone_str = join_strings(
        join_strings(fragments('//div[@class="c_phone"]/text()')))
    description = join_strings(
        fragments('//*[@id="list_sale"]/div[@class="card_block"]/text()'),
        '\n')
    price = join_strings(fragments('//div[@class="card_price"]/text()'))
    mesure = join_strings(
        fragments('//div[@class="card_price"]/span/text()'))

    price_label = '%s %s' % (price, mesure)

    item = RealtyItem()
    item['do_not_process'] = not self.check_saler_type(saler_type)
    item['phone'] = self.filter_phone(phone_str)
    item['name'] = [name]
    item['desc'] = [description]
    item['price'] = [price_label]
    item['price_digit'] = [self.get_digit_price(price, mesure)]
    item['link'] = [response.url]
    item['estate_type_id'] = self.estate_type_parser(sel)
    item['region_id'] = self.region_parser(address)
    item['locality_id'] = self.locality_parser(item['region_id'], address)
    # 'microdistrict', 'street' and 'estate_number' are not populated here.
    item['room_count'] = self.room_count_parser(self.page_title(sel))
    return item
def commerce_parser(self): COMMERCE_CAT_ID = 6 full_txt = join_strings(self.sel.xpath('//div[@class="media-body"]/p[@class="text_justify"]/text()').extract()) txt = join_strings(full_txt.split()[:3], ' ') print txt key = 'commerce_mapper_smart' from django.core.cache import cache mapper = cache.get(key) if not mapper: types = EstateType.objects.filter(estate_type_category_id=COMMERCE_CAT_ID) mapper = {} for t in types: mapper[ur'%s\s' % t.name] = t.id mapper[ur'%s\s' % t.name_accs] = t.id cache.set(key, mapper, 3600) return self.re_mapper(mapper, txt) or self.ZDANIE
def desc_parser(self):
    """Return the description as a three-element list.

    The elements are the value of ``self.title()``, a newline, and the text
    nodes under the ``itemprop="description"`` div combined by
    ``join_strings`` with ``', '``.
    """
    heading = self.title()
    body_fragments = self.sel.xpath(
        '//div[@itemprop="description"]//text()').extract()
    return [heading, '\n', join_strings(body_fragments, ', ')]
def filter_phone(self):
    """Return the advert's phone numbers as a list of digit-only strings.

    The value from ``self.phone_parser()`` is combined by ``join_strings``
    with a comma and then split on commas, pipes and newlines.  In every
    fragment ``+7`` is rewritten to ``8`` before all non-digit characters
    are removed.

    Fragments that contain no digits (for example those produced by a
    doubled or trailing separator) are dropped rather than returned as
    empty strings.

    Returns ``None`` when the parser yields nothing or when no fragment
    contains a digit.
    """
    raw = self.phone_parser()
    if not raw:
        return None

    joined = join_strings(raw, ',')
    numbers = []
    for fragment in re.split(r'[,|\n]', joined):
        digits = re.sub(r'\D', '', fragment.strip().replace('+7', '8'))
        if digits:
            numbers.append(digits)
    # Keep the "no phone" contract: callers already receive None for that.
    return numbers or None
def filter_phone(self):
    """Return the advert's phone numbers as a list of digit-only strings.

    The value from ``self.phone_parser()`` is combined by ``join_strings``
    with a comma and then split on commas, pipes and newlines.  In every
    fragment ``+7`` is rewritten to ``8`` before all non-digit characters
    are removed.

    Fragments that contain no digits (for example those produced by a
    doubled or trailing separator) are dropped rather than returned as
    empty strings.

    Returns ``None`` when the parser yields nothing or when no fragment
    contains a digit.
    """
    raw = self.phone_parser()
    if not raw:
        return None

    joined = join_strings(raw, ',')
    numbers = []
    for fragment in re.split(r'[,|\n]', joined):
        digits = re.sub(r'\D', '', fragment.strip().replace('+7', '8'))
        if digits:
            numbers.append(digits)
    # Keep the "no phone" contract: callers already receive None for that.
    return numbers or None
def process_detail_page(self, sel, response):
    """Build a ``RealtyItem`` from an advert detail page.

    ``sel`` is queried with XPath for the seller type, address, contact
    name, phone, description, price and price unit; ``response`` supplies
    the URL stored under ``'link'``.  Estate type, region, locality and
    room count are delegated to the parser methods on ``self``.

    Returns the populated item.
    """
    def fragments(path):
        # Extracted results of one XPath query against the page selector.
        return sel.xpath(path).extract()

    saler_type = join_strings(
        fragments('//div[@class="c_face"]/span[@class="lgrey"]/text()'))
    address = join_strings(
        fragments('//*[@id="list_sale"]/div[@class="card_block"][1]/text()'),
        ', ')
    name = join_strings(fragments('//div[@class="c_face"]/text()'))
    # The phone text goes through join_strings twice, as in the original.
    phone_str = join_strings(
        join_strings(fragments('//div[@class="c_phone"]/text()')))
    description = join_strings(
        fragments('//*[@id="list_sale"]/div[@class="card_block"]/text()'),
        '\n')
    price = join_strings(fragments('//div[@class="card_price"]/text()'))
    mesure = join_strings(
        fragments('//div[@class="card_price"]/span/text()'))

    price_label = '%s %s' % (price, mesure)

    item = RealtyItem()
    item['do_not_process'] = not self.check_saler_type(saler_type)
    item['phone'] = self.filter_phone(phone_str)
    item['name'] = [name]
    item['desc'] = [description]
    item['price'] = [price_label]
    item['price_digit'] = [self.get_digit_price(price, mesure)]
    item['link'] = [response.url]
    item['estate_type_id'] = self.estate_type_parser(sel)
    item['region_id'] = self.region_parser(address)
    item['locality_id'] = self.locality_parser(item['region_id'], address)
    # 'microdistrict', 'street' and 'estate_number' are not populated here.
    item['room_count'] = self.room_count_parser(self.page_title(sel))
    return item
def desc_parser(self):
    """Return the description as a three-element list.

    The elements are the value of ``self.title()``, a newline, and the text
    nodes under the ``media-body`` div combined by ``join_strings`` with
    ``', '``.
    """
    heading = self.title()
    body_fragments = self.sel.xpath(
        '//div[@class="media-body"]//text()').extract()
    return [heading, '\n', join_strings(body_fragments, ', ')]
def room_count(self):
    """Return the digits of the parsed room count.

    The value from ``self.room_count_parser()`` is passed through
    ``join_strings`` and stripped of every non-digit character.  Returns
    ``None`` when the parser yields a falsy value.
    """
    parsed = self.room_count_parser()
    if not parsed:
        return None
    return re.sub(r'\D', '', join_strings(parsed))
def title_parser(self):
    """Compose a title from the bold heading and the start of the node text.

    The heading comes from the ``b`` child; it is followed by a space and
    the first three whitespace-separated words of the node's own text.
    """
    heading = join_strings(self.sel.xpath('b/text()').extract())
    body = join_strings(self.sel.xpath('text()').extract(), ' ')
    leading_words = body.split()[:3]
    return u'%s %s' % (heading, join_strings(leading_words, ' '))
ur'офис': 35, ur'склад': 53, # 'kommercheskaya-nedvizhimost' : 93, } result = self.re_mapper(mapper, page_title) if callable(result): return result(sel) return result or ZDANIE def stead_parser(self, sel): mapper = { ur'индивидуальное жилищное строительство' : 15, ur'сельскохозяйственного назначения': 42, ur'коммерческое строительство': 20, } txt = join_strings(sel.xpath('//*[@id="list_sale"]/div[@class="card_block"]').extract()) print "stead parser %s" % txt return self.re_mapper(mapper, txt) def re_mapper(self, mapper, txt): for key, value in mapper.iteritems(): matches = re.search(key, txt, re.I | re.U) if matches: return value def get_digit_price(self, price, mesure): mesures = {u'тыс. руб.':1000} if price: if mesure in mesures: price_digit = float(price) price_digit = int(price_digit * mesures[mesure])
def title(self):
    """Return the title, computing it on first use.

    The value of ``self.title_parser()`` is passed through ``join_strings``
    and stored in ``self._title``; it is reused for as long as that
    attribute is truthy.
    """
    if self._title:
        return self._title
    self._title = join_strings(self.title_parser())
    return self._title
def room_count(self):
    """Return the digits of the parsed room count.

    The value from ``self.room_count_parser()`` is passed through
    ``join_strings`` and stripped of every non-digit character.  Returns
    ``None`` when the parser yields a falsy value.
    """
    parsed = self.room_count_parser()
    if not parsed:
        return None
    return re.sub(r'\D', '', join_strings(parsed))
def page_title(self, sel):
    """Return the text of the ``h1`` inside the ``list_sale`` element.

    The extracted text nodes are passed through ``join_strings``.
    """
    heading_fragments = sel.xpath('//*[@id="list_sale"]/h1/text()').extract()
    return join_strings(heading_fragments)
def link(self):
    """Return a one-element list with an identifier used in place of a link.

    The identifier is the MD5 hex digest of the node's own text (passed
    through ``join_strings`` and encoded as UTF-8).
    """
    import hashlib

    node_text = join_strings(self.sel.xpath('text()').extract())
    digest = hashlib.md5(node_text.encode('utf-8'))
    return [digest.hexdigest()]
def desc_parser(self):
    """Return the description as a three-element list.

    The elements are the value of ``self.title()``, a newline, and the text
    nodes under the ``itemprop="description"`` div combined by
    ``join_strings`` with ``', '``.
    """
    heading = self.title()
    body_fragments = self.sel.xpath(
        '//div[@itemprop="description"]//text()').extract()
    return [heading, '\n', join_strings(body_fragments, ', ')]
def page_title(self, sel):
    """Return the text of the ``h1`` inside the ``list_sale`` element.

    The extracted text nodes are passed through ``join_strings``.
    """
    heading_fragments = sel.xpath('//*[@id="list_sale"]/h1/text()').extract()
    return join_strings(heading_fragments)
def title_parser(self):
    """Compose a title from the bold heading and the start of the node text.

    The heading comes from the ``b`` child; it is followed by a space and
    the first three whitespace-separated words of the node's own text.
    """
    heading = join_strings(self.sel.xpath('b/text()').extract())
    body = join_strings(self.sel.xpath('text()').extract(), ' ')
    leading_words = body.split()[:3]
    return u'%s %s' % (heading, join_strings(leading_words, ' '))
def link(self):
    """Return a one-element list with an identifier used in place of a link.

    The identifier is the MD5 hex digest of the node's own text (passed
    through ``join_strings`` and encoded as UTF-8).
    """
    import hashlib

    node_text = join_strings(self.sel.xpath('text()').extract())
    digest = hashlib.md5(node_text.encode('utf-8'))
    return [digest.hexdigest()]
ur'склад': 53, # 'kommercheskaya-nedvizhimost' : 93, } result = self.re_mapper(mapper, page_title) if callable(result): return result(sel) return result or ZDANIE def stead_parser(self, sel): mapper = { ur'индивидуальное жилищное строительство': 15, ur'сельскохозяйственного назначения': 42, ur'коммерческое строительство': 20, } txt = join_strings( sel.xpath( '//*[@id="list_sale"]/div[@class="card_block"]').extract()) print "stead parser %s" % txt return self.re_mapper(mapper, txt) def re_mapper(self, mapper, txt): for key, value in mapper.iteritems(): matches = re.search(key, txt, re.I | re.U) if matches: return value def get_digit_price(self, price, mesure): mesures = {u'тыс. руб.': 1000} if price: if mesure in mesures: price_digit = float(price)
def locality_parser(self): return join_strings(self.sel.re(ur'(?:п\.|ст\.|x\.|г\.)\s(\D+?)\,')) or u'Темрюк'