Пример #1
0
def parse_review(review: bs4.element.Tag) -> dict:
    """
    Collect the fields of one review into a flat dictionary.

    INPUT:
    review: HTML segment that contains all relevant review information

    OUTPUT:
    info: dictionary with 'rating' (only when a "div.rating-10 span"
          element exists), 'headline', 'country' and 'body', plus one
          entry per table row keyed by the second CSS class of the row's
          first cell
    """
    info = {}

    rating_tag = review.select_one("div.rating-10 span")
    if rating_tag:
        info['rating'] = int(rating_tag.text)

    info['headline'] = review.select_one("h2.text_header").text

    # Turn ')' into '(' so a single split isolates the bracketed part;
    # index 1 is the text that follows the first bracket.
    sub_header = review.select_one('h3.text_sub_header').text
    pieces = sub_header.replace(')', '(').split('(')
    # The fallback is the literal string 'None', not the None object.
    info['country'] = pieces[1] if len(pieces) > 1 else 'None'

    info['body'] = review.select_one("div.text_content").text.strip()

    for row in review.select('tr'):
        value_cell = row.select('td')[1]
        if value_cell.attrs['class'][0] != 'review-rating-stars':
            # Plain row: the second cell's text is the value.
            info[row.td.attrs['class'][1]] = value_cell.text
            continue

        # Star row: every filled star overwrites the entry, so the value
        # of the last filled star is the one that is kept.
        for star in row.select('span'):
            try:
                if star.attrs['class'] == ['star', 'fill']:
                    info[row.td.attrs['class'][1]] = int(star.text)
            except KeyError:
                # An element without a class attribute is skipped.
                continue

    return info
Пример #2
0
def parse_event(event: bs4.element.Tag):
    """Extract the information for a single event from its list element.

    :param event: element holding the title link, the optional group link,
        a ``<time>`` element and the venue spans
    :return: ``Event`` built from the extracted values; ``community`` and
        ``owner`` are both the group link text, or ``None`` without a group
    """
    url = event.select_one('.events-list-item-title h3 a').get('href')
    community = event.select_one('.events-list-item-group a')
    community = community.text if community else None

    # NOTE: no thumbnail is extracted here. The earlier check read a
    # ``thumbnail`` variable that was never assigned, which made every
    # call fail with UnboundLocalError before ``Event`` was built.

    # Only one <time> element is read, so start and end share its value.
    # Example of the attribute: 2019-10-12T13:00:00+09:00
    start = datetime.strptime(
        event.select_one('time').get('datetime'), '%Y-%m-%dT%H:%M:%S%z')

    return Event(
        # The id is the last run of digits that follows a slash in the URL.
        id=int(re.match(r'.+/(\d+)/?', url)[1]),
        title=event.select_one('.events-list-item-title h3 a span').text,
        url=url,
        dt_start=start,
        dt_end=start,
        community=community,
        owner=community,
        # The venue is spread over several spans; concatenate their text.
        place=''.join(
            span.text
            for span in event.select('.events-list-item-venue > span')))
Пример #3
0
 def parse_item(item: bs4.element.Tag):
     """
     Build a SongItem from one song element.

     The id comes from the element's ``mid`` attribute; whitespace runs in
     the singer text are collapsed to single spaces.

     :return SongItem(name, singer, id)
     """
     song_id = item.attrs['mid']
     title = item.select_one('.song-name-text').text.strip()
     artist = item.select_one('.song-singer').text
     # An empty singer text is passed through unchanged.
     artist = re.sub(r'\s+', ' ', artist).strip() if artist else artist
     return SongItem(title, artist, song_id)
Пример #4
0
    def __init__(self, item: bs4.element.Tag):
        """Initialise the entry from its menu element.

        The name is the text of ``span.txt``. When the element has
        ``li.listItem a`` links, a ``sub_industry`` is appended for each of
        them; otherwise the element's own link is resolved into
        ``self.link`` and ``self.id``.
        """
        self.name = item.select_one('span.txt').text

        children = item.select('li.listItem a')
        if children:
            for child in children:
                self.append(sub_industry(child))
        else:
            # Leaf entry: no sub-menu, so use the entry's own link.
            self.link, self.id = solve_link(item.select_one('a')['href'])
Пример #5
0
def tag2gift(tag: bs4.element.Tag):
    """
    Extract info from a tag and return the Gift object built from it.

    Name, description and price are read from the cells matched by
    ``td:first-child``, ``td:nth-child(2)`` and ``td:nth-child(3)``.

    :param tag bs4.element.Tag: html tag
    """
    def cell_text(selector: str) -> str:
        # Text of the matched cell with surrounding whitespace removed.
        return tag.select_one(selector).get_text().strip()

    name = cell_text("td:first-child")
    description = cell_text("td:nth-child(2)")
    raw_price = cell_text("td:nth-child(3)")
    # The first character is dropped (presumably a currency symbol; confirm)
    # and thousands separators are removed before the conversion.
    amount = float(raw_price[1:].replace(',', ''))

    return Gift(name, description, amount)
Пример #6
0
    def parse_block(self, block: bs4.element.Tag):
        """Extract URL, brand and goods name from one block and store them.

        On success a ``ParseResult`` is appended to ``self.result`` and the
        values are logged at debug level. When a required element or the
        ``href`` attribute is missing, an error is logged and the block is
        skipped.

        :param block: element searched for the ``a.ref_goods_n_p`` link and
            the ``div.dtlist-inner-brand-name`` name container
        :return: None
        """
        url_block = block.select_one('a.ref_goods_n_p')
        if not url_block:
            logger.error('no url_block')
            return

        url = url_block.get('href')
        if not url:
            logger.error('no href')
            return

        name_block = block.select_one('div.dtlist-inner-brand-name')
        if not name_block:
            logger.error(f'no name_block on {url}')
            return

        brand_tag = name_block.select_one('strong.brand-name')
        # Test the brand element itself; re-testing name_block here would let
        # a missing brand through and fail on ``.text`` below.
        if not brand_tag:
            logger.error(f'no brand_name on {url}')
            return

        # The brand text looks like "Wrangler /": drop the slash and padding.
        brand_name = brand_tag.text.replace("/", "").strip()

        goods_tag = name_block.select_one('span.goods-name')
        if not goods_tag:
            logger.error(f'no goods_name on {url}')
            return

        goods_name = goods_tag.text.strip()

        self.result.append(
            ParseResult(
                url=url,
                brand_name=brand_name,
                # NOTE(review): keyword is spelled with a double underscore;
                # confirm it matches the field name declared on ParseResult.
                goods__name=goods_name,
            ))

        logger.debug('%s, %s, %s', url, brand_name, goods_name)
        logger.debug('-' * 100)
0
    def parse_block(self, item: bs4.element.Tag):
        """Parse one listing element into a ``Block``.

        :param item: element containing the ``a.snippet-link`` link, the
            ``span.price`` price and the date element
        :return: ``Block`` with url, title, price, currency and date; price
            and currency are ``None`` when the price text does not split
            into exactly two non-empty lines
        """
        # Select the block with the link and the title
        link = item.select_one('a.snippet-link')
        href = link.get('href')
        url = 'https://www.avito.ru' + href if href else None
        title = link.string.strip()

        # Select the block with the price
        price_text = item.select_one('span.price').get_text('\n')
        stripped_lines = (line.strip() for line in price_text.split('\n'))
        price_parts = [part for part in stripped_lines if part]
        if len(price_parts) == 2:
            price, currency = price_parts
            price = int(price.replace(" ", ""))
        else:
            price = currency = None
            logger.error(f"Что-то пошло не так при поиске цены: %s, %s",
                         price_parts, url)

        # Select the block with the listing's publication date
        date_block = item.select_one('div.item-date div.js-item-date.c-2')
        absolute_date = date_block.get('data-absolute-date')
        date = self.parse_date(item=absolute_date) if absolute_date else None

        logger.info(f'%s, %s, %s, %s, %s', url, title, price, currency, date)

        return Block(
            url=url,
            title=title,
            price=price,
            currency=currency,
            date=date,
        )
Пример #8
0
def parse_event(event: bs4.element.Tag):
    """Extract the information for a single event from its element.

    :param event: element holding the title link, series title, thumbnail,
        start/end markers, amount, owner image and place
    :return: ``Event`` built from the extracted values; ``thumbnail`` is
        ``None`` when its URL contains ``/no_image_``
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S%z'

    def read_time(selector):
        # The timestamp is stored in the element's ``title`` attribute.
        raw = event.select_one(selector).get('title')
        return datetime.strptime(raw, timestamp_format)

    url = event.select_one('.event_title a').get('href')

    series = event.select_one('.series_title')
    community = series.text if series else None

    thumbnail = event.select_one('.event_thumbnail img').get('src')
    if re.search(r'/no_image_', thumbnail):
        thumbnail = None

    # The id is the last run of digits that follows a slash in the URL.
    event_id = int(re.match(r'.+/(\d+)/?', url)[1])
    title = event.select_one('.event_title a').text
    dt_start = read_time('.dtstart .value-title')
    dt_end = read_time('.dtend .value-title')
    amount = event.select_one('.amount').text
    owner = event.select_one('.event_owner img').get('title')
    place = event.select_one('.event_place').text.strip()

    return Event(id=event_id,
                 title=title,
                 url=url,
                 dt_start=dt_start,
                 dt_end=dt_end,
                 amount=amount,
                 thumbnail=thumbnail,
                 community=community,
                 owner=owner,
                 place=place)
Пример #9
0
 def from_tag(cls, tag: bs4.element.Tag):
     """Build an instance from one listing tag.

     Rent, deposit and gratuity are parsed with the unit "万円" and the
     administration fee with "円". The floor range comes from the first
     string of the third ``td``, and the id from the digits that follow
     ``jnc_`` in the detail link.
     """
     def span_text(css_class):
         # Text of the first span whose class attribute matches css_class.
         return tag.find("span", class_=css_class).text

     rent = span_text("cassetteitem_price cassetteitem_price--rent")
     admin_fee = span_text("cassetteitem_price cassetteitem_price--administration")
     deposit = span_text("cassetteitem_price cassetteitem_price--deposit")
     gratuity = span_text("cassetteitem_price cassetteitem_price--gratuity")
     layout = span_text("cassetteitem_madori")
     area = span_text("cassetteitem_menseki")

     floor_text, *_ = tag.find_all("td")[2].stripped_strings
     min_floor, max_floor = parse_floor_range(floor_text)

     detail_href = tag.select_one("td.ui-text--midium.ui-text--bold a")["href"]
     url = f"{SUUMO_URL}{detail_href}"
     jnc_id = re.search(r"jnc_([0-9]*)/", detail_href).group(1)

     newarrival_tag = tag.find(class_="cassetteitem_other-checkbox--newarrival")
     new_arrival = newarrival_tag is not None

     return cls(
         parse_money(rent, unit="万円"),
         parse_money(admin_fee, unit="円"),
         parse_money(deposit, unit="万円"),
         parse_money(gratuity, unit="万円"),
         layout,
         parse_area(area),
         min_floor,
         max_floor,
         url,
         jnc_id,
         new_arrival,
     )
Пример #10
0
    def parse_block(self, item: bs4.element.Tag):
        """Parse one listing element, store it as a ``Product`` and return it.

        An existing ``Product`` with the same url is updated; otherwise a
        new one is created. Both are linked to ``self.task``.

        :param item: element containing the ``a.snippet-link`` link, the
            ``span.price`` price and the date element
        :return: ``Block`` with url, title, price, currency and date; price
            and currency are ``None`` when the price text does not split
            into exactly two non-empty lines
        :raises CommandError: when the link, price or date element cannot be
            found, or when the link has no title text
        """
        # Select the block with the link and the title
        url_block = item.select_one('a.snippet-link')
        if not url_block:
            raise CommandError('bad "url_block" css')

        href = url_block.get('href')
        url = 'https://www.avito.ru' + href if href else None

        # ``.string`` is None when the tag does not resolve to a single
        # string; fall back to '' so the guard below reports the missing
        # title instead of ``.strip()`` failing with AttributeError.
        title = (url_block.string or '').strip()
        if not title:
            raise CommandError(f'no title for item: {url_block}')

        # Select the block with the price
        price_block = item.select_one('span.price')
        if not price_block:
            raise CommandError('bad "price_block" css')

        price_lines = price_block.get_text('\n').split('\n')
        price_parts = [part for part in map(str.strip, price_lines) if part]
        if len(price_parts) == 2:
            price, currency = price_parts
            price = int(price.replace(" ", ""))
        else:
            price, currency = None, None
            logger.error("Что-то пошло не так при поиске цены: %s, %s",
                         price_parts, url)

        # Select the block with the listing's publication date
        date = None
        date_block = item.select_one('div.item-date div.js-item-date.c-2')
        if not date_block:
            raise CommandError('bad "date_block" css')

        absolute_date = date_block.get('data-absolute-date')
        if absolute_date:
            date = self.parse_date(item=absolute_date)

        block = Block(
            url=url,
            title=title,
            price=price,
            currency=currency,
            date=date,
        )
        logger.info(block)

        # Update the record if the object already exists in the database
        try:
            p = Product.objects.get(url=url)
            p.task = self.task
            p.title = title
            p.price = price
            p.currency = currency
            p.public_date = date
            p.save()
        except Product.DoesNotExist:
            # No record yet for this url: create one.
            Product(
                url=url,
                task=self.task,
                title=title,
                price=price,
                currency=currency,
                public_date=date,
            ).save()
            logger.info(f"Except in product url: {url}")

        return block