示例#1
0
文件: parser.py 项目: DrSpez/avitoApt
def processItemPage(url):
    itemInfo = {}

    html = tb.getHTML(url)
    soup = BeautifulSoup(html, 'html.parser')
    desc_soup = soup(attrs={'id': 'desc_text'})

    if len(desc_soup) == 0:
        return {'description': 'None'}

    assert len(desc_soup) == 1

    itemInfo['description'] = desc_soup[0].text

    print itemInfo['description']

    return itemInfo
示例#2
0
文件: parser.py 项目: DrSpez/avitoApt
def processListPage(url):

    html = tb.getHTML(url)

    soup = BeautifulSoup(html, 'html.parser')
    flats = soup(attrs={'class': 'description'})

    PARSED = []
    for flat in flats:
        flat_info = {}

        #
        # --  Link and metro  --
        #
        flat_url = 'https://www.avito.ru' + flat.a['href']
        flat_info['url'] = flat_url

        metroClassNames = ['i-metro i-metro-msk-{}'.format(i)
                           for i in range(1, 20)]

        for className in metroClassNames:
            metroInfo = flat.find_all(class_=className)
            if len(metroInfo) == 0:
                continue
            elif len(metroInfo) == 1:
                print '!!!!!! ', className
                metro_line = metroInfo[0]['title']
                metro_station = metroInfo[0].nextSibling
                break
            else:
                raise Exception('Metro info unknown layout')

        flat_info['metro_line'] = metro_line
        flat_info['metro_station'] = metro_station

        print u'Station: {}  ({} line)'.format(metro_station, metro_line)

        #
        # --  Price and comission  --
        #

        about = flat.find_all(class_='about')
        if len(about) == 1:
            about = about[0]
        else:
            raise Exception('Unknown About layout')

        comission_info = about.find_all(class_='about__commission ')

        if len(comission_info) == 0:
            no_comission = None
            continue
        else:
            no_comission = comission_info[0].text == 'без комиссии'.decode('utf-8')

        flat_info['no_add_fee'] = no_comission

        currencies = flat.find_all(class_='popup-prices popup-prices__wrapper clearfix')[0]['data-prices']
        price = json.loads(currencies)[0]['currencies']['RUB']
        flat_info['price'] = int(price)

        print 'Price:', price
        print 'No additional fee:', no_comission

        #
        # --  Distance and post date  --
        #

        distPostdate = flat.find_all(class_='c-2')
        if len(distPostdate) == 2:
            distance, post_date = distPostdate
            post_date = post_date.text
        elif len(distPostdate) == 1:
            distance = None
            units = None
            post_date = distPostdate[0].text
        else:
            print distPostdate
            raise Exception('Unknown distance | post_date container layout')

        if distance is not None:
            distance, units = distance.text.split()

            numbers = re.compile('\d+(?:\.\d+)?')
            distance = float(numbers.findall(distance)[0])

            kilometers = 'км'.decode('utf-8')
            meters = 'м'.decode('utf-8')

            # Convert meters to kilometers:
            if units == meters:
                units = kilometers
                distance = 0.001 * distance

        flat_info['distance'] = distance
        flat_info['posted_at'] = post_date

        print 'Distance:', distance, units
        print 'Posted at:', post_date
        print ''

        #
        # --  Get text descripton  --
        #

        flat_info['description'] = processItemPage(flat_url)['description']

        PARSED.append(flat_info)

    print 'Found {} flats'.format(len(flats))

    for i, F in enumerate(PARSED):
        print '[{}] flat'.format(i+1)
        for x in F.items():
            print u"    {}  -->  {}".format(x[0], x[1])

    return PARSED