def parse_card(self, card, type_of_house, house_id, link, response, page_index=0):
    """Turn one listing card into an item dict and yield it.

    The CSS selectors come from ``self.parsing_params``. The address text is
    prefixed with ``CITY_CHOICES[0][1]`` and handed to ``get_cord``; its three
    results are used as the x coordinate, the y coordinate and the final
    address. The image URL is resolved against ``response``.

    ``page_index`` is accepted but not used here. The generator yields exactly
    one dict and then finishes with ``''`` as its return value.
    """
    city = 0

    def first_match(param_key):
        # First match of the selector configured under ``param_key``.
        return card.css(self.parsing_params[param_key]).get()

    raw_address = CITY_CHOICES[city][1] + first_match('address_selector')
    x_cord, y_cord, address = get_cord(raw_address)
    title_image = response.urljoin(first_match('title_image_selector'))
    title = first_match('title_selector')
    price = correct_price(first_match('price_selector'))

    item = {
        'mode': 0,
        'house_id': house_id,
        'img': title_image,
        'title': title,
        'link': link,
        'price': price,
        'address': address,
        'host': self.allowed_domains[0],
        'city': city,
        'cords': [x_cord, y_cord],
        'house_type': type_of_house,
    }
    print(item)
    yield item
    return ''
def parse_card(self, card, type_of_house, house_id, link, page_index=0):
    """Build an item dict from one listing card and yield it.

    Selectors are read from ``self.parsing_params``. ``offer_type`` is 0 for
    ``0 <= page_index < 5`` and 1 otherwise. The address is the name in
    ``CITY_CHOICES[0][1]`` followed by the card's address text and, when the
    card has a geo block, the geo text; the result is passed through
    ``get_cord`` to obtain the coordinates and the final address.

    The generator yields exactly one dict and finishes with ``''`` as its
    return value.
    """
    city = 0
    offer_type = 0 if 0 <= page_index < 5 else 1
    params = self.parsing_params

    # The geo text is optional. The container can match while the text
    # selector does not, in which case ``.get()`` returns None; fall back to
    # an empty string so the concatenation below cannot fail on it.
    geo = ''
    if card.css(params['geo_selector']):
        geo = card.css(params['geo_data_selector']).get() or ''

    address = CITY_CHOICES[city][1] + card.css(params['address_selector']).get() + geo
    x_cord, y_cord, address = get_cord(address)

    title_image = card.css(params['title_image_selector']).get()
    title = card.css(params['title_selector']).get()
    price = correct_price(card.css(params['price_selector']).get())

    item = {
        'mode': 0,
        'offer_type': offer_type,
        'house_id': house_id,
        'img': title_image,
        'title': title,
        'link': link,
        'price': price,
        'address': address,
        'host': self.allowed_domains[0],
        'city': city,
        'cords': [x_cord, y_cord],
        'house_type': type_of_house,
    }
    yield item
    return ''
def parse_card(self, card, type_of_house, house_id, link, page_index=0):
    """Build an item dict from one listing card and yield it.

    ``self.parsing_params`` holds two variants (index 0 and 1) of the price
    and title selectors. Variant 0 is used when its price selector matches
    the card, otherwise variant 1. If the chosen title selector yields
    nothing, a fixed fallback selector is tried. ``offer_type`` is 0 for
    ``page_index < 5`` and 1 otherwise.

    The generator yields exactly one dict and finishes with ``''`` as its
    return value.
    """
    fallback_title_selector = '.c6e8ba5398--single_title--22TGT::text'
    city = 0
    geo = ''
    offer_type = 0 if page_index < 5 else 1

    def text(selector):
        return card.css(selector).get()

    def title_for(variant):
        # Configured selector first, fixed fallback when it yields nothing.
        return (text(self.parsing_params['title_selector'][variant])
                or text(fallback_title_selector))

    def address_text():
        return text(self.parsing_params['address_selector']) + geo

    if text(self.parsing_params['price_selector'][0]):
        variant = 0
        address = address_text()
        title = title_for(variant)
    else:
        variant = 1
        title = title_for(variant)
        address = address_text()
        print('hu2')
    price = correct_price(text(self.parsing_params['price_selector'][variant]))

    x_cord, y_cord, address = get_cord(address)
    title_image = text(self.parsing_params['title_image_selector'])

    item = {
        'mode': 0,
        'offer_type': offer_type,
        'house_id': house_id,
        'img': title_image,
        'title': title,
        'link': link,
        'price': price,
        'address': address,
        'host': self.allowed_domains[0],
        'city': city,
        'cords': [x_cord, y_cord],
        'house_type': type_of_house,
    }
    print(item)
    yield item
    return ''
def parse_card(self, item, page_index, response):
    """Parse one snippet card; yield an item dict when ``check_db`` accepts it.

    The house id is derived from the card's first snippet link via
    ``self.get_house_id``. The address is ``CITY_CHOICES[0][1]`` plus the
    card's address string, a space and the optional geo reference text, and is
    resolved through ``get_cord``.

    The generator's return value is ``(True, link)`` after yielding an item
    and ``(False, link)`` when ``self.check_db`` rejected the house id.
    """
    data = ''
    title_selector = 'h3.snippet-title >a > span::text'

    has_geo = item.css('span.item-address-georeferences').get()
    geo = (item.css('span.item-address-georeferences-item__content::text').get()
           if has_geo else '')

    h_id = self.get_house_id(item.css('a.snippet-link::attr(href)').get())

    # Both page ranges currently resolve to the same city index.
    city = 0 if 0 <= page_index < 3 else 0

    full_address = (CITY_CHOICES[city][1]
                    + item.css('span.item-address__string::text').get()
                    + ' ' + geo)
    y_cord, x_cord, full_address = get_cord(address=full_address)

    link = response.urljoin(
        item.css('h3.snippet-title > a.snippet-link::attr(href)').get())
    print(item.css(title_selector).get())

    if not self.check_db(h_id):
        return False, link

    yield {
        'mode': 0,
        'house_id': h_id,
        'img': item.css('img.large-picture-img::attr(src)').get(),
        'title': item.css(title_selector).get(),
        'link': link,
        'price': int(correct_price(item.css('span.snippet-price::text').get())),
        'address': full_address,
        'data': data,
        'time_created': item.css('div.snippet-date-info::text').get(),
        'host': self.allowed_domains[0],
        'city': city,
        'cords': [x_cord, y_cord],
    }
    return True, link
def parse_card(self, card, response, page_index):
    """Build an item dict from one long-item card and yield it.

    ``offer_type`` is 0 for ``0 <= page_index < 5`` and 1 otherwise. The house
    id is taken from the last ``-``-separated chunk of the card link, passed
    through ``correct_house_id`` and also stored on ``self.house_id_global``.
    The price keeps digits only. The address is prefixed with
    ``CITY_CHOICES[0][1]`` when it does not already contain it, then resolved
    through ``get_cord``.

    A missing price or address node is treated as an empty string, so one
    incomplete card does not abort the whole callback.

    The generator yields exactly one dict and finishes with ``''`` as its
    return value.
    """
    city = 0
    offer_type = 0 if 0 <= page_index < 5 else 1

    title = card.css('span.long-item-card__title___16K7W::text').get()
    link = response.urljoin(card.css('::attr(href)').get())

    # Computed once and shared with the instance attribute.
    house_id = correct_house_id(link.split('-')[-1])
    self.house_id_global = house_id

    raw_price = card.css('span.long-item-card__price___3A6JF::text').get() or ''
    price = re.sub(r'[^0-9]', '', raw_price)

    city_name = CITY_CHOICES[city][1]
    address = card.css('span.long-item-card__address___PVI5p::text').get() or ''
    if city_name not in address:
        address = f'{city_name} {address}'
    x_cord, y_cord, address = get_cord(address)

    title_image = card.css('img.card-photo__image___31CHC::attr(src)').get()

    # NOTE(review): the offsets 3 and 4 presumably compensate for how
    # ``urls_pool`` is laid out relative to ``self.types`` - confirm.
    if 0 <= page_index < 5:
        type_offset = 0
    elif page_index == 7:
        type_offset = 3
    else:
        type_offset = 4
    type_ = self.types[self.urls_pool.index(response.url) - type_offset]

    yield {
        'mode': 0,
        'offer_type': offer_type,
        'type': type_,
        'house_id': house_id,
        'link': link,
        'title': title,
        'price': price,
        'address': address,
        'img': title_image,
        'time_created': '',
        'data': '',
        'host': self.allowed_domains[0],
        'city': city,
        'cords': [x_cord, y_cord],
    }
    return ''
def parse_card_info(self, card, city, house_id, page_index, response):
    """Build an item dict from one ``p-instance`` card and yield it.

    ``city`` is reset to 0 for ``0 <= page_index <= 3``. The address passed to
    ``get_cord`` is the card title with everything up to the first comma
    dropped (the remaining comma-separated pieces are joined without a
    separator). The listing type depends on whether the response contains a
    ``.p-instance__param_color_green`` element, unless the response URL is one
    of two fixed category URLs. The price keeps only the ASCII characters of
    the price text and is converted to ``int``.

    The generator yields one dict and finishes with the card's href as its
    return value.
    """
    type_by_url = {
        'https://tumn.realty.mail.ru/sale/country/?types%5B0%5D=12&types%5B1%5D=13&types%5B2%5D=14': 'Коттеджи',
        'https://tumn.realty.mail.ru/sale/country-plot/': 'Участки',
    }

    if 0 <= page_index <= 3:
        city = 0

    title_text = card.css('a.p-instance__title::text').get()
    address_tail = ''.join(title_text.split(',')[1:])
    y_cord, x_cord, address = get_cord(address_tail)

    if response.css('.p-instance__param_color_green'):
        default_type = 'Новостройки'
    else:
        default_type = 'Вторичка'
    _type = type_by_url.get(response.url, default_type)

    price_text = card.css('span.p-instance__title::text').get()
    ascii_price = ''.join(ch for ch in price_text if ord(ch) < 128)

    yield {
        'mode': 0,
        'house_id': int(house_id),
        'link': card.css('a.p-instance__title::attr(href)').get(),
        'title': title_text,
        'price': int(ascii_price),
        'address': address,
        'img': card.css('img.photo__pic::attr(src)').get(),
        'time_created': card.css('.p-instance__param.js-ago::attr(datetime)').get(),
        'data': '',
        'host': self.allowed_domains[0],
        'city': city,
        'cords': [x_cord, y_cord],
        'type': _type,
    }
    return card.css('a.p-instance__title::attr(href)').get()
def parse_card(self, card, response, page_index):
    """Yield an item dict for a card whose house id passes ``check_db``.

    The house id is produced by ``correct_house_id`` from the absolute card
    link. When ``check_db`` rejects it, a message is printed and nothing is
    yielded. Otherwise the address is prefixed with ``CITY_CHOICES[0][1]``
    when it does not already contain it, resolved through ``get_cord``, and
    the price is reduced to its digits.
    """
    city = 0
    link = response.urljoin(card.css('.SerpItemLink::attr(href)').get())
    house_id = correct_house_id(link)

    if not check_db(house_id):
        print('This house is already exist')
        return

    print('NEW HOUSE!')
    if 0 <= page_index < 3:
        city = 0

    city_name = CITY_CHOICES[city][1]
    address = card.css('.OffersSerpItem__address::text').get()
    if address.find(city_name) < 0:
        address = city_name + ' ' + address
    x_cord, y_cord, address = get_cord(address)

    price = re.sub('[^0-9]', '', card.css('.Price > span::text').get())
    title = card.css('h3.OffersSerpItem__title::text').get()
    print(x_cord, y_cord, address)

    item = {
        'mode': 0,
        'house_id': house_id,
        'link': link,
        'title': title,
        'price': price,
        'address': address,
        'img': card.css('.offer-list-preview__item > img::attr(src)').get(),
        'time_created': '',
        'data': '',
        'host': self.allowed_domains[0],
        'city': city,
        'cords': [x_cord, y_cord],
    }
    yield item
def parse(self, response):
    """Parse a listing page, then move on to the next URL in ``urls_pool``.

    Only the first ``max_cards`` cards of the page are processed (one, as
    before). For each card ``parse_info`` supplies the basic fields; when
    ``check_db`` accepts the house id, an item dict is yielded followed by a
    request for the card's detail page, handled by
    ``self.parse_info_of_card``. Finally, unless this page is the last entry
    of ``self.urls_pool``, a request for the next entry is yielded.
    """
    max_cards = 1
    # Every page range currently maps to the same city index.
    city = 0

    cards = response.css('._93444fe79c--card--_yguQ')
    page_index = self.urls_pool.index(response.url)
    print('Processing...')

    # Slice once instead of looking up each card's position in the list.
    for card in cards[:max_cards]:
        (address, house_id, link, price,
         time_created, title, title_image) = parse_info(card)

        if not check_db(house_id):
            print('This row is already exist')
            continue

        print('Database check completed')
        print(f'City index is: {city}')
        y_cord, x_cord, address = get_cord(address)
        print(f"House cords is: {x_cord}, {y_cord}")

        yield {
            'mode': 0,
            'title': title,
            'link': link,
            'house_id': house_id,
            'price': price,
            'img': title_image,
            'address': address,
            'data': '',
            'time_created': time_created,
            'host': self.allowed_domains[0],
            'city': city,
            'type': '',
            'cords': [x_cord, y_cord],
        }
        yield scrapy.Request(url=link, callback=self.parse_info_of_card)

    if page_index < len(self.urls_pool) - 1:
        yield scrapy.Request(url=self.urls_pool[page_index + 1],
                             callback=self.parse)
def parse(self, response):
    """Parse a listing page, then move on to the next URL in ``urls_pool``.

    Cards containing a ``.yandex_adfox_action`` element are ignored. For each
    of the first five remaining cards an item dict is yielded, followed by a
    request for the card's detail page handled by ``self.parse_info``. The
    listing type is ``self.types`` at this page's position in
    ``self.urls_pool``. Finally, unless this page is the last entry of
    ``self.urls_pool``, a request for the next entry is yielded.
    """
    max_cards = 5
    city = 0

    page_index = self.urls_pool.index(response.url)
    type_ = self.types[page_index]

    # Build a filtered list instead of popping from the list being iterated:
    # removing during iteration skips the element after each removed one, so
    # two adjacent ad cards were never both dropped.
    listings = [
        card for card in response.css('.media.clearfix.object')
        if not card.css('.yandex_adfox_action')
    ]

    for card in listings[:max_cards]:
        url = response.urljoin(card.css('.header_adv_short::attr(href)').get())
        house_id = card.css('::attr(element_id)').get()
        title_image = card.css('.object__gallery').css('img::attr(src)').get()
        title = card.css('.header_adv_short::text').get()
        price = card.css('span.formatRub::text').get()

        address = CITY_CHOICES[city][1] + " " + ' '.join(
            card.css('.text-location > a::text').getall())
        x_cord, y_cord, address = get_cord(address)

        yield {
            'mode': 0,
            'house_id': int(house_id),
            'link': url,
            'title': title,
            'price': price,
            'address': address,
            'img': title_image,
            'time_created': '',
            'data': '',
            'host': self.allowed_domains[0],
            'city': city,
            'cords': [x_cord, y_cord],
            'type': type_,
        }
        yield scrapy.Request(url=url, callback=self.parse_info)

    if page_index < len(self.urls_pool) - 1:
        yield scrapy.Request(self.urls_pool[page_index + 1],
                             callback=self.parse)
def parse_card(self, card, type_of_house, house_id, link, page_index=0):
    """Build an item dict from one listing card and yield it.

    Selectors are read from ``self.parsing_params``. ``offer_type`` is 0 for
    ``0 <= page_index < 3`` and 1 otherwise. All address fragments of the card
    are joined with spaces and resolved through ``get_cord``. The image value
    is the part of the selector result before the first comma, or ``''`` when
    the selector matches nothing. The title is ``type_of_house`` followed by
    the resolved address.

    The generator yields exactly one dict and finishes with ``''`` as its
    return value.
    """
    city = 0
    offer_type = 0 if 0 <= page_index < 3 else 1
    params = self.parsing_params

    address = ' '.join(card.css(params['address_selector']).getall())
    x_cord, y_cord, address = get_cord(address)

    # Explicit missing-value check instead of a bare ``except:``, which would
    # also hide unrelated errors such as a misconfigured selector key.
    raw_image = card.css(params['title_image_selector']).get()
    if raw_image:
        title_image = raw_image.split(',')[0]
    else:
        title_image = ''
        print(card)

    title = type_of_house + " " + address

    raw_price = card.css(params['price_selector']).get()
    print(raw_price)
    price = correct_price(raw_price)

    item = {
        'mode': 0,
        'offer_type': offer_type,
        'house_id': house_id,
        'img': title_image,
        'title': title,
        'link': link,
        'price': price,
        'address': address,
        'host': self.allowed_domains[0],
        'city': city,
        'cords': [x_cord, y_cord],
        'house_type': type_of_house,
    }
    yield item
    return ''
def parse_info_data(self, response):
    """Parse a listing detail page and yield one ``mode: 1`` dict.

    The house id is the last path segment of the URL, cut at the first ``_``
    and with ``.html`` removed. A non-empty address is prefixed with
    ``'Тюмень, '`` and resolved through ``get_cord``; otherwise the address is
    ``''`` and both coordinates are ``None``. The total or land area is read
    from the ``div > span`` label containing ``'Общая площадь:'``. The listing
    type is derived from the URL path. Fields this parser does not extract
    (rooms, floors, builder, ...) keep a single-space placeholder.
    """
    type_of_participation = official_builder = name_of_build = decoration = \
        floor = floor_count = house_type = num_of_rooms = total_area = \
        living_area = kitchen_area = deadline = land_area = ' '

    h = response.url.replace(' ', '').split("/")[-1].split('_')[0].replace(
        '.html', '')
    address = response.css('p.adress > span::text').get()
    user_id = response.css('.user-item-container::attr(data-user-id)').get()

    if address:
        address = 'Тюмень, ' + address
        y_cords, x_cord, address = get_cord(address)
    else:
        x_cord = y_cords = None
        address = ''

    for label in response.css('div > span'):
        caption = label.css('::text').get()
        # ``in`` also accepts a caption that starts with the marker, which a
        # ``find(...) > 0`` test would silently skip.
        if not isinstance(caption, str) or 'Общая площадь:' not in caption:
            continue
        value = label.css('strong::text').get()
        if not isinstance(value, str):
            # Label without a <strong> value: nothing to read.
            continue
        # NOTE(review): the two patterns look swapped - the land branch strips
        # 'кв.м.' and the other strips 'сот.', so the unit suffix survives in
        # the stored value. Kept as-is because consumers may rely on the
        # current format; confirm before changing.
        if 'сот.' in value:
            land_area = re.sub(r'кв.м.| | .', '', value)
        else:
            total_area = re.sub(r'сот.| | .', '', value)

    # A positional pass over the <strong> texts used to sit here, but it
    # compared the bound method ``str.index`` with integers and therefore
    # never executed, so ``num_of_rooms`` stays at its placeholder.

    if 'doma-kottedzhi-dachi' in response.url:
        type_ = 'Коттеджи'
    elif 'kvartiry-i-komnaty' in response.url:
        type_ = 'Вторичка'
    else:
        type_ = 'Участки'

    data = response.css('p.px18::text').get()
    phone = ''
    print('Parsing images...')
    images = list(response.css('img.zoomable::attr(src)').getall())

    yield {
        'mode': 1,
        'user_id': user_id,
        'house_id': h,
        'type_of_participation': type_of_participation,
        'official_builder': official_builder,
        'name_of_build': name_of_build,
        'decoration': decoration,
        'floor': floor,
        'floor_count': floor_count,
        'house_type': house_type,
        'num_of_rooms': num_of_rooms,
        'total_area': total_area,
        'living_area': living_area,
        'kitchen_area': kitchen_area,
        'land_area': land_area,
        'deadline': deadline,
        'phone': phone,
        'images': images,
        'data': data,
        'address': address,
        'cords': [x_cord, y_cords],
        'type': type_,
    }
def correct(self, result, category, offer_type):
    """Normalize one raw listing record into the flat dict used downstream.

    Args:
        result: Mapping with the raw record. Always read: ``address``,
            ``city``, ``price``, ``title``, ``description``, ``source``,
            ``phone``, ``url``, ``images`` (items with ``imgurl``) and
            ``id`` (used when ``avitoid`` is absent). ``cords`` with
            ``lat``/``lng`` is used when present, otherwise ``get_cord`` is
            called with the full address. Category-specific ``param_*`` keys
            supply rooms, floors and areas.
        category: 2, 4, 5 or 7 select which ``param_*`` keys are read; any
            other value leaves the category-specific fields at their defaults.
        offer_type: 0 selects the first key set of a category, anything else
            the second one. It is copied into the output unchanged.

    Returns:
        A dict with the normalized fields. Optional values that are missing
        or not convertible keep their defaults (``''`` or ``0``).

    Raises:
        KeyError: when an always-read key is missing, or, for category 2,
            when the floor / floor-count keys (and for ``offer_type == 0``
            also the total area) are missing.
    """
    recoverable = (KeyError, TypeError, ValueError)

    def optional(key, default):
        # Raw value of an optional field, or ``default`` when it is absent.
        try:
            return result[key]
        except recoverable:
            return default

    def optional_float(key):
        # Float value of an optional field, or 0 when absent / not a number.
        try:
            return float(result[key])
        except recoverable:
            return 0

    num_of_rooms = type_ = house_type = ''
    floor = floor_count = total_area = kitchen_area = living_area = land_area = 0
    first_variant = offer_type == 0

    if category == 2:
        if first_variant:
            num_of_rooms = self.correct_num_of_rooms(result, 'param_1945')
            type_ = self.correct_type(result, 'param_1957')
            house_type = optional('param_2009', house_type)
            # These three are mandatory in this branch: a missing key
            # propagates to the caller.
            floor = result['param_2113']
            floor_count = result['param_2213']
            total_area = float(result['param_2313'])
            kitchen_area = optional_float('param_2314')
            living_area = optional_float('param_12722')
        else:
            num_of_rooms = self.correct_num_of_rooms(result, 'param_2019')
            type_ = 'Вторичка'
            house_type = optional('param_2078', house_type)
            floor = result['param_2315']
            floor_count = result['param_2415']
            total_area = optional_float('param_2515')
            kitchen_area = optional_float('param_12723')
            living_area = optional_float('param_12724')
    elif category == 4:
        type_ = 'Коттеджи'
        if first_variant:
            keys = ('param_3837', 'param_3843', 'param_4014', 'param_4015')
        else:
            keys = ('param_4016', 'param_4022', 'param_4193', 'param_4194')
        floor_count = optional(keys[0], floor_count)
        house_type = optional(keys[1], house_type)
        total_area = optional_float(keys[2])
        land_area = optional_float(keys[3])
    elif category == 5:
        type_ = 'Участки'
        land_area = optional_float('param_4616' if first_variant else 'param_4194')
    elif category == 7:
        type_ = 'Коммерческаянедвижимость'
        if first_variant:
            keys = ('param_12869', 'param_4920', 'param_12868')
        else:
            keys = ('param_12881', 'param_4922', 'param_12880')
        floor_count = optional(keys[0], floor_count)
        total_area = optional_float(keys[1])
        floor = optional(keys[2], floor)

    address = result['city'] + ' ' + result['address']
    price = result['price']

    try:
        cords = [result['cords']['lat'], result['cords']['lng']]
    except (KeyError, TypeError):
        # No usable coordinates in the record: resolve them from the address.
        # The first two results of get_cord are stored in swapped order.
        second, first, _ = get_cord(address)
        cords = [first, second]

    title = result['title']
    description = result['description']
    host = result['source']
    phone = result['phone']
    link = result['url']

    images = [img['imgurl'] for img in result['images']]
    # TODO: save the photos so that marks on them (presumably watermarks)
    # can be cropped out.
    title_image = images[0] if images else ''

    # Force the leading digit to 7; an empty phone is left untouched instead
    # of failing on the index lookup.
    if phone and phone[0] != '7':
        phone = '7' + phone[1:]

    try:
        house_id = result['avitoid']
    except KeyError:
        house_id = result['id']

    return {
        'num_of_rooms': num_of_rooms,
        'type': type_,
        'house_type': house_type,
        'floor': floor,
        'floor_count': floor_count,
        'total_area': total_area,
        'kitchen_area': kitchen_area,
        'living_area': living_area,
        'address': address,
        'land_area': land_area,
        'images': images,
        'price': price,
        'cords': cords,
        'title': title,
        'description': description,
        'host': host,
        'phone': phone,
        'title_image': title_image,
        'link': link,
        'offer_type': offer_type,
        'house_id': house_id,
    }
def parse_card_info(self, card, city, house_id, page_index, response):
    """Build an item dict from one list card and yield it.

    The listing type is ``'Новостройки'`` when the card contains a
    ``.living-list-card-newbuilding`` element and ``'Вторичка'`` otherwise.
    ``city`` is reset to 0 for ``0 <= page_index <= 3``.

    For URLs containing ``zemlya`` the address is the card's district text and
    the price comes from the land price ``title`` attribute. For all other
    URLs the address is built from the second and third comma-separated parts
    of the card title plus whichever of the inner-block / city-with-estate
    texts are present; it is ``''`` when the title has fewer than three parts
    or neither extra text exists. The address is then resolved through
    ``get_cord``. The price keeps digits only; a missing price node gives
    ``''``.

    The generator yields one dict and finishes with the absolute card link as
    its return value.
    """
    type_ = 'Новостройки' if card.css('.living-list-card-newbuilding') else 'Вторичка'
    if 0 <= page_index <= 3:
        city = 0

    link = response.urljoin(card.css('a.link::attr(href)').get())
    title = card.css('a.link > span::text').get()
    is_land = 'zemlya' in response.url

    if is_land:
        address = card.css('.land-list-card__district::text').get()
        price_class = '.land-list-card__price::attr(title)'
    else:
        price_class = '.living-list-card-price__item::text'
        title_parts = title.split(',') if title else []
        # Either extra text is enough; keep whichever ones exist. Requiring
        # both would empty the address whenever only one is present.
        extras = [
            text for text in (
                card.css('.living-list-card__inner-block::text').get(),
                card.css('span.living-list-card-city-with-estate__item::text').get(),
            ) if text
        ]
        if len(title_parts) >= 3 and extras:
            address = ' '.join([title_parts[1], title_parts[2], *extras])
        else:
            address = ''

    x_cord, y_cord, address = get_cord(address)

    yield {
        'mode': 0,
        'type': type_,
        'house_id': house_id,
        'link': link,
        'title': title,
        'price': re.sub(r'[^0-9]', '', card.css(price_class).get() or ''),
        'address': address,
        'img': card.css('.offer-list-preview__item > img::attr(src)').get(),
        'time_created': '',
        'data': '',
        'host': 'tumen.n1.ru',
        'city': city,
        'cords': [x_cord, y_cord],
    }
    return link