示例#1
0
 def parse_card(self, card, type_of_house, house_id, link, response,
                page_index=0):
     """Build, print and yield the summary item for one listing card.

     Args:
         card: selector for the card; queried with the CSS selectors stored
             in ``self.parsing_params``.
         type_of_house: value copied to the item's ``house_type`` field.
         house_id: value copied to the item's ``house_id`` field.
         link: value copied to the item's ``link`` field.
         response: page response, used to absolutise the image URL.
         page_index: accepted for signature compatibility; not read here.

     Yields:
         A single dict describing the card.

     Returns:
         ``''`` as the generator's return value.
     """
     selectors = self.parsing_params
     city = 0

     # The city name is prepended before the address is passed to get_cord.
     city_name = CITY_CHOICES[city][1]
     street = card.css(selectors['address_selector']).get()
     x_cord, y_cord, address = get_cord(city_name + street)

     image_src = card.css(selectors['title_image_selector']).get()
     title_image = response.urljoin(image_src)
     title = card.css(selectors['title_selector']).get()
     raw_price = card.css(selectors['price_selector']).get()
     price = correct_price(raw_price)

     item = {
         'mode': 0,
         'house_id': house_id,
         'img': title_image,
         'title': title,
         'link': link,
         'price': price,
         'address': address,
         'host': self.allowed_domains[0],
         'city': city,
         'cords': [x_cord, y_cord],
         'house_type': type_of_house,
     }
     print(item)
     yield item
     return ''
示例#2
0
 def parse_card(self, card, type_of_house, house_id, link, page_index=0):
     """Build and yield the summary item for one listing card.

     Args:
         card: selector for the card; queried with the CSS selectors stored
             in ``self.parsing_params``.
         type_of_house: value copied to the item's ``house_type`` field.
         house_id: value copied to the item's ``house_id`` field.
         link: value copied to the item's ``link`` field.
         page_index: index of the listing page; 0..4 gives ``offer_type``
             0, anything else gives 1.

     Yields:
         A single dict describing the card.

     Returns:
         ``''`` as the generator's return value.
     """
     selectors = self.parsing_params
     city = 0
     offer_type = 0 if 0 <= page_index < 5 else 1

     # Optional geo reference text, appended to the address when present.
     geo = ''
     if card.css(selectors['geo_selector']):
         geo = card.css(selectors['geo_data_selector']).get()

     street = card.css(selectors['address_selector']).get()
     x_cord, y_cord, address = get_cord(
         CITY_CHOICES[city][1] + street + geo)

     title_image = card.css(selectors['title_image_selector']).get()
     title = card.css(selectors['title_selector']).get()
     raw_price = card.css(selectors['price_selector']).get()
     price = correct_price(raw_price)

     item = {
         'mode': 0,
         'offer_type': offer_type,
         'house_id': house_id,
         'img': title_image,
         'title': title,
         'link': link,
         'price': price,
         'address': address,
         'host': self.allowed_domains[0],
         'city': city,
         'cords': [x_cord, y_cord],
         'house_type': type_of_house,
     }
     yield item
     return ''
示例#3
0
    def parse_card(self, card, type_of_house, house_id, link, page_index=0):
        """Build and yield the summary item for one listing card.

        ``self.parsing_params`` holds two alternative price/title selectors.
        The first pair is used when the first price selector matches
        something; otherwise the second pair is used. In both cases a
        hard-coded "single title" selector is the fallback for the title.

        Args:
            card: selector for the card being parsed.
            type_of_house: value copied to the item's ``house_type`` field.
            house_id: value copied to the item's ``house_id`` field.
            link: value copied to the item's ``link`` field.
            page_index: index of the listing page; values below 5 give
                ``offer_type`` 0, anything else gives 1.

        Yields:
            A single dict describing the card.

        Returns:
            ``''`` as the generator's return value (kept for callers that
            may consume it through ``yield from``).
        """
        selectors = self.parsing_params
        fallback_title_selector = '.c6e8ba5398--single_title--22TGT::text'
        city = 0
        offer_type = 0 if page_index < 5 else 1

        # Decide which selector pair applies to this card.
        raw_price = card.css(selectors['price_selector'][0]).get()
        variant = 0 if raw_price else 1
        if variant == 1:
            raw_price = card.css(selectors['price_selector'][1]).get()

        title = (card.css(selectors['title_selector'][variant]).get()
                 or card.css(fallback_title_selector).get())
        # Default to '' so a card without an address node does not crash
        # before geocoding.
        address = card.css(selectors['address_selector']).get(default='')
        price = correct_price(raw_price)

        x_cord, y_cord, address = get_cord(address)
        title_image = card.css(selectors['title_image_selector']).get()
        item = {
            'mode': 0,
            'offer_type': offer_type,
            'house_id': house_id,
            'img': title_image,
            'title': title,
            'link': link,
            'price': price,
            'address': address,
            'host': self.allowed_domains[0],
            'city': city,
            'cords': [x_cord, y_cord],
            'house_type': type_of_house,
        }
        yield item
        return ''
示例#4
0
    def parse_card(self, item, page_index, response):
        """Parse one listing card and yield it when ``check_db`` accepts it.

        Args:
            item: selector for the listing card.
            page_index: index of the listing page (does not influence the
                result: the city index is 0 for every value).
            response: page response, used to absolutise the listing link.

        Yields:
            A dict describing the card, only when ``self.check_db(h_id)``
            is truthy.

        Returns:
            ``(True, link)`` after yielding, ``(False, link)`` otherwise
            (generator return value).
        """
        data = ''

        # Optional geo reference text.
        geo = ''
        if item.css('span.item-address-georeferences').get():
            geo = item.css(
                'span.item-address-georeferences-item__content::text').get()

        url = item.css('a.snippet-link::attr(href)').get()
        h_id = self.get_house_id(url)
        city = 0

        street = item.css('span.item-address__string::text').get()
        full_address = CITY_CHOICES[city][1] + street + ' ' + geo
        y_cord, x_cord, full_address = get_cord(address=full_address)

        relative_link = item.css(
            'h3.snippet-title > a.snippet-link::attr(href)').get()
        link = response.urljoin(relative_link)
        print(item.css('h3.snippet-title >a > span::text').get())

        if not self.check_db(h_id):
            return False, link

        yield {
            'mode': 0,
            'house_id': h_id,
            'img': item.css('img.large-picture-img::attr(src)').get(),
            'title': item.css('h3.snippet-title >a > span::text').get(),
            'link': link,
            'price': int(
                correct_price(item.css('span.snippet-price::text').get())),
            'address': full_address,
            'data': data,
            'time_created': item.css('div.snippet-date-info::text').get(),
            'host': self.allowed_domains[0],
            'city': city,
            'cords': [x_cord, y_cord],
        }
        return True, link
示例#5
0
 def parse_card(self, card, response, page_index):
     """Build and yield the summary item for one listing card.

     Also stores the parsed house id on ``self.house_id_global``.

     Args:
         card: selector for the card being parsed.
         response: page response; used to absolutise the link and to find
             the page's position in ``self.urls_pool``.
         page_index: index of the listing page. 0..4 gives ``offer_type``
             0, anything else gives 1; it also selects the offset used to
             pick the entry from ``self.types``.

     Yields:
         A single dict describing the card.

     Returns:
         ``''`` as the generator's return value (kept for callers that
         may consume it through ``yield from``).
     """
     city = 0
     in_first_group = 0 <= page_index < 5
     offer_type = 0 if in_first_group else 1

     title = card.css('span.long-item-card__title___16K7W::text').get()
     link = response.urljoin(card.css('::attr(href)').get())
     house_id = correct_house_id(link.split('-')[-1])
     self.house_id_global = house_id

     # default='' keeps a card without a price/address node from crashing.
     price = re.sub(
         r'[^0-9]', '',
         card.css('span.long-item-card__price___3A6JF::text').get(default=''))
     address = card.css(
         'span.long-item-card__address___PVI5p::text').get(default='')
     city_name = CITY_CHOICES[city][1]
     if city_name not in address:
         address = city_name + ' ' + address
     x_cord, y_cord, address = get_cord(address)

     title_image = card.css(
         'img.card-photo__image___31CHC::attr(src)').get()

     # self.types is indexed by the URL position minus a page-dependent
     # offset (0, 3 or 4).
     url_position = self.urls_pool.index(response.url)
     if in_first_group:
         type_ = self.types[url_position]
     elif page_index == 7:
         type_ = self.types[url_position - 3]
     else:
         type_ = self.types[url_position - 4]

     yield {
         'mode': 0,
         'offer_type': offer_type,
         'type': type_,
         'house_id': house_id,
         "link": link,
         "title": title,
         "price": price,
         'address': address,
         "img": title_image,
         'time_created': '',
         'data': '',
         'host': self.allowed_domains[0],
         'city': city,
         'cords': [x_cord, y_cord],
     }
     return ''
示例#6
0
 def parse_card_info(self, card, city, house_id, page_index, response):
     """Build and yield the summary item for one listing card.

     Args:
         card: selector for the card being parsed.
         city: city index; replaced by 0 when ``page_index`` is 0..3.
         house_id: listing id; converted with ``int`` for the item.
         page_index: index of the listing page.
         response: page response; its URL and a green-parameter marker
             decide the item's ``type``.

     Yields:
         A single dict describing the card.

     Returns:
         The card's title link (generator return value).
     """
     if 0 <= page_index <= 3:
         city = 0

     title = card.css('a.p-instance__title::text').get()
     # Everything after the first comma of the title is passed to get_cord
     # as the address.
     address_part = ''.join(title.split(',')[1:])
     y_cord, x_cord, address = get_cord(address_part)

     # Two specific listing URLs override the marker-based type.
     type_by_url = {
         'https://tumn.realty.mail.ru/sale/country/?types%5B0%5D=12&types%5B1%5D=13&types%5B2%5D=14': 'Коттеджи',
         'https://tumn.realty.mail.ru/sale/country-plot/': 'Участки',
     }
     if response.css('.p-instance__param_color_green'):
         marker_type = 'Новостройки'
     else:
         marker_type = 'Вторичка'
     _type = type_by_url.get(response.url, marker_type)

     link = card.css('a.p-instance__title::attr(href)').get()
     yield {
         'mode': 0,
         'house_id': int(house_id),
         "link": link,
         "title": title,
         # Keep only ASCII characters of the price text before int().
         "price": int("".join(
             ch for ch in card.css('span.p-instance__title::text').get()
             if ord(ch) < 128)),
         'address': address,
         "img": card.css('img.photo__pic::attr(src)').get(),
         'time_created':
             card.css('.p-instance__param.js-ago::attr(datetime)').get(),
         'data': '',
         'host': self.allowed_domains[0],
         'city': city,
         'cords': [x_cord, y_cord],
         'type': _type,
     }
     return link
示例#7
0
 def parse_card(self, card, response, page_index):
     """Yield the summary item for a card when ``check_db`` accepts its id.

     Args:
         card: selector for the card being parsed.
         response: page response, used to absolutise the listing link.
         page_index: index of the listing page (does not influence the
             result: the city index is 0 for every value).

     Yields:
         A single dict describing the card, only when
         ``check_db(house_id)`` is truthy; otherwise nothing is yielded
         and a message is printed.
     """
     city = 0
     link = response.urljoin(card.css('.SerpItemLink::attr(href)').get())
     house_id = correct_house_id(link)

     if not check_db(house_id):
         print('This house is already exist')
         return

     print('NEW HOUSE!')
     address = card.css('.OffersSerpItem__address::text').get()
     city_name = CITY_CHOICES[city][1]
     # Prefix the city name only when the address does not contain it.
     if address.find(city_name) < 0:
         address = city_name + ' ' + address
     x_cord, y_cord, address = get_cord(address)

     price = re.sub('[^0-9]', '', card.css('.Price > span::text').get())
     title = card.css('h3.OffersSerpItem__title::text').get()
     print(x_cord, y_cord, address)

     yield {
         'mode': 0,
         'house_id': house_id,
         "link": link,
         "title": title,
         "price": price,
         'address': address,
         "img": card.css('.offer-list-preview__item > img::attr(src)').get(),
         'time_created': '',
         'data': '',
         'host': self.allowed_domains[0],
         'city': city,
         'cords': [x_cord, y_cord],
     }
示例#8
0
    def parse(self, response):
        """Parse one listing page and schedule the next URL in the pool.

        Only the first card on the page is processed. When ``check_db``
        accepts its id, the card's summary item and a request for its
        detail page are yielded. After the cards, a request for the next
        URL in ``self.urls_pool`` is yielded. This happens even when the
        page has no cards, so an empty page does not stop the crawl.

        Args:
            response: the listing page response; its URL must be present
                in ``self.urls_pool``.

        Yields:
            Item dicts and ``scrapy.Request`` objects.

        Raises:
            ValueError: if ``response.url`` is not in ``self.urls_pool``.
        """
        cards = response.css('._93444fe79c--card--_yguQ')
        page_index = self.urls_pool.index(response.url)
        city = 0
        print('Processing...')

        # Slice instead of cards.index(card) < 1: same "first card only"
        # limit without an O(n) lookup per card.
        for card in cards[:1]:
            (address, house_id, link, price, time_created, title,
             title_image) = parse_info(card)
            if not check_db(house_id):
                print('This row is already exist')
                continue

            print('Database check completed')
            print(f'City index is: {city}')
            y_cord, x_cord, address = get_cord(address)
            print(f"House cords is: {x_cord}, {y_cord}")
            yield {
                'mode': 0,
                'title': title,
                "link": link,
                "house_id": house_id,
                "price": price,
                "img": title_image,
                'address': address,
                'data': '',
                'time_created': time_created,
                'host': self.allowed_domains[0],
                'city': city,
                'type': '',
                'cords': [x_cord, y_cord],
            }
            yield scrapy.Request(url=link,
                                 callback=self.parse_info_of_card)

        # Pagination lives outside the card loop so it runs exactly once
        # per page regardless of how many cards were found.
        if page_index < len(self.urls_pool) - 1:
            yield scrapy.Request(url=self.urls_pool[page_index + 1],
                                 callback=self.parse)
示例#9
0
 def parse(self, response):
     """Parse one listing page and schedule the next URL in the pool.

     Cards containing a ``.yandex_adfox_action`` element are dropped. For
     each of the first five remaining cards, a summary item and a request
     for the card's detail page are yielded. Finally a request for the
     next URL in ``self.urls_pool`` is yielded, if there is one.

     Args:
         response: the listing page response; its URL must be present in
             ``self.urls_pool``.

     Yields:
         Item dicts and ``scrapy.Request`` objects.

     Raises:
         ValueError: if ``response.url`` is not in ``self.urls_pool``.
     """
     page_index = self.urls_pool.index(response.url)
     type_ = self.types[page_index]
     city = 0

     # Build a filtered list rather than popping from the list being
     # iterated, which skipped the card following every removed one.
     listings = [
         card for card in response.css('.media.clearfix.object')
         if not card.css('.yandex_adfox_action')
     ]

     for card in listings[:5]:
         url = response.urljoin(
             card.css('.header_adv_short::attr(href)').get())
         house_id = card.css('::attr(element_id)').get()
         title_image = card.css('.object__gallery').css(
             'img::attr(src)').get()
         title = card.css('.header_adv_short::text').get()
         price = card.css('span.formatRub::text').get()
         address = CITY_CHOICES[city][1] + " " + ' '.join(
             card.css('.text-location > a::text').getall())
         x_cord, y_cord, address = get_cord(address)
         yield {
             'mode': 0,
             'house_id': int(house_id),
             "link": url,
             "title": title,
             "price": price,
             'address': address,
             "img": title_image,
             'time_created': '',
             'data': '',
             'host': self.allowed_domains[0],
             'city': city,
             'cords': [x_cord, y_cord],
             'type': type_,
         }
         yield scrapy.Request(url=url, callback=self.parse_info)

     if page_index < len(self.urls_pool) - 1:
         yield scrapy.Request(self.urls_pool[page_index + 1],
                              callback=self.parse)
示例#10
0
 def parse_card(self, card, type_of_house, house_id, link, page_index=0):
     """Build and yield the summary item for one listing card.

     The item's title is composed from ``type_of_house`` and the address
     returned by ``get_cord``.

     Args:
         card: selector for the card; queried with the CSS selectors stored
             in ``self.parsing_params``.
         type_of_house: copied to ``house_type`` and used in the title.
         house_id: value copied to the item's ``house_id`` field.
         link: value copied to the item's ``link`` field.
         page_index: index of the listing page; 0..2 gives ``offer_type``
             0, anything else gives 1.

     Yields:
         A single dict describing the card.

     Returns:
         ``''`` as the generator's return value (kept for callers that
         may consume it through ``yield from``).
     """
     selectors = self.parsing_params
     city = 0
     offer_type = 0 if 0 <= page_index < 3 else 1

     address = ' '.join(card.css(selectors['address_selector']).getall())
     x_cord, y_cord, address = get_cord(address)

     # The image value is a comma-separated list; only its first entry is
     # used. A missing node yields '' instead of relying on a bare except.
     image_value = card.css(selectors['title_image_selector']).get()
     title_image = image_value.split(',')[0] if image_value else ''

     title = type_of_house + " " + address
     price = correct_price(card.css(selectors['price_selector']).get())

     item = {
         'mode': 0,
         'offer_type': offer_type,
         'house_id': house_id,
         'img': title_image,
         'title': title,
         'link': link,
         'price': price,
         'address': address,
         'host': self.allowed_domains[0],
         'city': city,
         'cords': [x_cord, y_cord],
         'house_type': type_of_house,
     }
     yield item
     return ''
示例#11
0
    def parse_info_data(self, response):
        """Parse a listing detail page and yield its detail item.

        Most descriptive fields are emitted as the placeholder ``' '``
        because this page parser only extracts the id, user id, address,
        coordinates, area, description, images and listing type.

        Args:
            response: the detail page response.

        Yields:
            A single dict with ``'mode': 1`` describing the listing.
        """
        type_of_participation = official_builder = name_of_build = decoration = floor = floor_count = house_type = \
            num_of_rooms = total_area = living_area = kitchen_area = deadline = land_area = ' '

        # The id is the last URL path segment up to the first underscore,
        # without the '.html' suffix.
        house_id = response.url.replace('  ', '').split("/")[-1].split(
            '_')[0].replace('.html', '')
        address = response.css('p.adress > span::text').get()
        user_id = response.css(
            '.user-item-container::attr(data-user-id)').get()

        if address:
            address = 'Тюмень, ' + address
            y_cords, x_cord, address = get_cord(address)
        else:
            x_cord = y_cords = None
            address = ''

        # Strips both area units and whitespace. Previously each branch
        # stripped the *other* branch's unit, so the unit stayed in the value.
        area_noise = re.compile(r'кв\.м\.|сот\.|\s')
        for label in response.css('div > span'):
            label_text = label.css('::text').get()
            # `in` also matches a label at offset 0, which `find(...) > 0`
            # silently skipped.
            if not isinstance(label_text, str) or 'Общая площадь:' not in label_text:
                continue
            value = label.css('strong::text').get()
            if value is None:
                continue
            if 'сот.' in value:
                land_area = area_noise.sub('', value)
            else:
                total_area = area_noise.sub('', value)

        # NOTE(review): num_of_rooms is never populated. A positional loop
        # over `div > span > strong::text` used to compare `param.index`
        # (the bound str.index method) with 2 and 3, which is never true,
        # so it had no effect and is omitted. If positional parsing is
        # wanted, reintroduce it with enumerate() and verify the page layout.

        if response.url.find('doma-kottedzhi-dachi') >= 0:
            type_ = 'Коттеджи'
        elif response.url.find('kvartiry-i-komnaty') >= 0:
            type_ = 'Вторичка'
        else:
            type_ = 'Участки'

        data = response.css('p.px18::text').get()
        phone = ''
        images = response.css('img.zoomable::attr(src)').getall()

        yield {
            'mode': 1,
            'user_id': user_id,
            'house_id': house_id,
            'type_of_participation': type_of_participation,
            'official_builder': official_builder,
            'name_of_build': name_of_build,
            'decoration': decoration,
            "floor": floor,
            "floor_count": floor_count,
            "house_type": house_type,
            "num_of_rooms": num_of_rooms,
            "total_area": total_area,
            "living_area": living_area,
            "kitchen_area": kitchen_area,
            'land_area': land_area,
            "deadline": deadline,
            'phone': phone,
            'images': images,
            'data': data,
            'address': address,
            'cords': [x_cord, y_cords],
            'type': type_,
        }
示例#12
0
    def correct(self, result, category, offer_type):
        """Normalise one raw listing record into a flat dict.

        Args:
            result: mapping with the raw listing fields. ``address``,
                ``city``, ``price``, ``title``, ``description``, ``source``,
                ``phone``, ``url`` and ``images`` are always read; which
                ``param_*`` keys are read depends on ``category`` and
                ``offer_type``.
            category: numeric category code. 2, 4, 5 and 7 have dedicated
                handling; any other value leaves the category-specific
                fields at their defaults (``''`` or ``0``).
            offer_type: 0 selects the first set of ``param_*`` keys of the
                category, any other value the second set. It is copied to
                the output unchanged.

        Returns:
            A dict with the normalised listing fields.

        Raises:
            KeyError: if an always-read field, or one of the mandatory
                category-2 parameters (floor, floor count and, for
                ``offer_type`` 0, total area), is missing.
        """

        def optional(key, default, cast=None):
            """Return result[key] (converted by cast) or default if unusable."""
            try:
                value = result[key]
                return value if cast is None else cast(value)
            except (KeyError, TypeError, ValueError):
                return default

        num_of_rooms = type_ = house_type = ''
        floor = floor_count = total_area = kitchen_area = living_area = land_area = 0
        first_set = offer_type == 0

        if category == 2:
            if first_set:
                num_of_rooms = self.correct_num_of_rooms(result, 'param_1945')
                type_ = self.correct_type(result, 'param_1957')
                house_type = optional('param_2009', '')
                floor = result['param_2113']
                floor_count = result['param_2213']
                total_area = float(result['param_2313'])
                kitchen_area = optional('param_2314', 0, float)
                living_area = optional('param_12722', 0, float)
            else:
                num_of_rooms = self.correct_num_of_rooms(result, 'param_2019')
                type_ = 'Вторичка'
                house_type = optional('param_2078', '')
                floor = result['param_2315']
                floor_count = result['param_2415']
                total_area = optional('param_2515', 0, float)
                kitchen_area = optional('param_12723', 0, float)
                living_area = optional('param_12724', 0, float)
        elif category == 4:
            type_ = 'Коттеджи'
            if first_set:
                keys = ('param_3837', 'param_3843', 'param_4014', 'param_4015')
            else:
                keys = ('param_4016', 'param_4022', 'param_4193', 'param_4194')
            floor_count = optional(keys[0], 0)
            house_type = optional(keys[1], '')
            total_area = optional(keys[2], 0, float)
            land_area = optional(keys[3], 0, float)
        elif category == 5:
            type_ = 'Участки'
            land_area = optional(
                'param_4616' if first_set else 'param_4194', 0, float)
        elif category == 7:
            type_ = 'Коммерческаянедвижимость'
            if first_set:
                keys = ('param_12869', 'param_4920', 'param_12868')
            else:
                keys = ('param_12881', 'param_4922', 'param_12880')
            floor_count = optional(keys[0], 0)
            total_area = optional(keys[1], 0, float)
            floor = optional(keys[2], 0)

        address = result['city'] + ' ' + result['address']
        price = result['price']

        # Prefer coordinates shipped with the record; otherwise geocode the
        # address. get_cord's first two values are stored in swapped order.
        try:
            cords = [result['cords']['lat'], result['cords']['lng']]
        except (KeyError, TypeError):
            first, second, _ = get_cord(address)
            cords = [second, first]

        title = result['title']
        description = result['description']
        host = result['source']
        link = result['url']

        # TODO: implement saving the photos so that the marks on them can be
        # cropped out.
        images = [img['imgurl'] for img in result["images"]]
        title_image = images[0] if images else ''

        # Force the leading digit to '7'; an empty or missing phone is left
        # untouched instead of raising on phone[0].
        phone = result['phone']
        if phone and phone[0] != '7':
            phone = '7' + phone[1:]

        try:
            house_id = result['avitoid']
        except KeyError:
            house_id = result['id']

        return {
            'num_of_rooms': num_of_rooms,
            'type': type_,
            'house_type': house_type,
            'floor': floor,
            'floor_count': floor_count,
            'total_area': total_area,
            'kitchen_area': kitchen_area,
            "living_area": living_area,
            'address': address,
            'land_area': land_area,
            'images': images,
            'price': price,
            'cords': cords,
            'title': title,
            'description': description,
            "host": host,
            'phone': phone,
            'title_image': title_image,
            'link': link,
            'offer_type': offer_type,
            'house_id': house_id,
        }
示例#13
0
文件: n1.py 项目: SilentLords/DomProd
    def parse_card_info(self, card, city, house_id, page_index, response):
        """Build and yield the summary item for one listing card.

        Land listings (response URL containing ``'zemlya'``) take their
        address and price from the land-specific selectors. All other
        cards compose the address from the card title and location blocks.
        The item is yielded for both kinds of card.

        Args:
            card: selector for the card being parsed.
            city: city index; replaced by 0 when ``page_index`` is 0..3.
            house_id: value copied to the item's ``house_id`` field.
            page_index: index of the listing page.
            response: page response; used to absolutise the link and to
                detect land listings.

        Yields:
            A single dict describing the card.

        Returns:
            The absolute listing link (generator return value).
        """
        if card.css('.living-list-card-newbuilding'):
            type_ = 'Новостройки'
        else:
            type_ = 'Вторичка'
        if 0 <= page_index <= 3:
            city = 0

        link = response.urljoin(card.css('a.link::attr(href)').get())
        title = card.css('a.link > span::text').get()

        if 'zemlya' in response.url:
            price_selector = '.land-list-card__price::attr(title)'
            address = card.css(
                '.land-list-card__district::text').get(default='')
        else:
            price_selector = '.living-list-card-price__item::text'
            inner_block = card.css(
                '.living-list-card__inner-block::text').get()
            estate = card.css(
                'span.living-list-card-city-with-estate__item::text').get()
            # The address needs the 2nd and 3rd comma-separated title parts
            # plus both location blocks; if any piece is missing, fall back
            # to an empty address.
            try:
                parts = title.split(',')
                if inner_block or estate:
                    address = ' '.join(
                        (parts[1], parts[2], inner_block, estate))
                else:
                    address = ''
            except (AttributeError, IndexError, TypeError):
                address = ''

        # Geocoding and the yield run for every card. They used to be
        # nested in the non-land branch, so land cards were never emitted.
        x_cord, y_cord, address = get_cord(address)
        yield {
            'mode': 0,
            'type': type_,
            'house_id': house_id,
            "link": link,
            "title": title,
            "price": re.sub(r'[^0-9]', '',
                            card.css(price_selector).get(default='')),
            'address': address,
            "img":
                card.css('.offer-list-preview__item > img::attr(src)').get(),
            'time_created': '',
            'data': '',
            'host': 'tumen.n1.ru',
            'city': city,
            'cords': [x_cord, y_cord],
        }

        return link