def processItemPage(url):
    """Fetch a single item page and extract its text description.

    The page is downloaded with ``tb.getHTML(url)`` and parsed with
    BeautifulSoup's ``html.parser``.  The description is the text of the
    element whose ``id`` attribute is ``desc_text``; it is also printed.

    Args:
        url: Address of the item page.

    Returns:
        A dict with a single key ``'description'``.  When the page has no
        ``desc_text`` element the value is the *string* ``'None'`` (not the
        ``None`` object).

    Raises:
        AssertionError: If more than one ``desc_text`` element is found.
    """
    html = tb.getHTML(url)
    soup = BeautifulSoup(html, 'html.parser')
    desc_soup = soup.find_all(attrs={'id': 'desc_text'})

    if not desc_soup:
        return {'description': 'None'}

    # Raised explicitly (rather than via ``assert``) so the layout check is
    # still enforced when Python runs with -O, which strips assert statements.
    if len(desc_soup) != 1:
        raise AssertionError(
            'Expected exactly one desc_text element, found {}'.format(
                len(desc_soup)))

    description = desc_soup[0].text
    # Single-argument print(...) behaves the same as the print statement.
    print(description)
    return {'description': description}
def _find_metro(flat, metro_class_names):
    """Return ``(metro_line, metro_station)`` for one listing tag.

    Each candidate class name is searched in turn.  The first one that
    matches exactly one element wins: its ``title`` attribute is the line and
    its next sibling node is the station.  ``(None, None)`` is returned when
    no class name matches, so every listing gets its own values instead of
    inheriting stale ones from a previous listing.

    Raises:
        Exception: If a class name matches more than one element.
    """
    for class_name in metro_class_names:
        metro_info = flat.find_all(class_=class_name)
        if not metro_info:
            continue
        if len(metro_info) > 1:
            raise Exception('Metro info unknown layout')
        # Debug trace of the matched class name (two spaces after the marker,
        # as produced by the original two-argument print statement).
        print(u'!!!!!!  {}'.format(class_name))
        return metro_info[0]['title'], metro_info[0].next_sibling
    return None, None


def _parse_distance(distance_tag, number_re):
    """Return ``(distance, units)`` parsed from the distance tag's text.

    The text must split on whitespace into exactly two tokens: a value and a
    unit.  The first number found in the value token becomes a float.  Values
    given in meters are converted to kilometers; any other unit is returned
    unchanged.
    """
    value_text, units = distance_tag.text.split()
    distance = float(number_re.findall(value_text)[0])
    if units == u'м':
        # Convert meters to kilometers.
        units = u'км'
        distance = 0.001 * distance
    return distance, units


def processListPage(url):
    """Parse a listing page into a list of per-flat info dicts.

    The page is downloaded with ``tb.getHTML(url)`` and parsed with
    BeautifulSoup's ``html.parser``.  Every element with class
    ``description`` is treated as one flat.  For each flat the item page is
    also fetched through ``processItemPage`` to obtain the text description.
    Progress and the final result are printed to stdout.

    Args:
        url: Address of the listing page.

    Returns:
        A list of dicts with the keys ``url``, ``metro_line``,
        ``metro_station``, ``no_add_fee``, ``price``, ``distance``,
        ``posted_at`` and ``description``.  ``metro_line`` / ``metro_station``
        are ``None`` when no metro element is found; ``distance`` is ``None``
        when the listing has only a post-date cell.  Flats without a
        commission element are skipped and do not appear in the result.

    Raises:
        Exception: On an unexpected metro, "about" or distance/post-date
            layout.
    """
    html = tb.getHTML(url)
    soup = BeautifulSoup(html, 'html.parser')
    flats = soup.find_all(attrs={'class': 'description'})

    # Loop invariants, built once instead of once per flat.
    metro_class_names = ['i-metro i-metro-msk-{}'.format(i)
                         for i in range(1, 20)]
    number_re = re.compile(r'\d+(?:\.\d+)?')
    no_fee_label = u'без комиссии'

    parsed = []
    for flat in flats:
        flat_info = {}

        #
        # -- Link and metro --
        #
        flat_url = 'https://www.avito.ru' + flat.a['href']
        flat_info['url'] = flat_url

        metro_line, metro_station = _find_metro(flat, metro_class_names)
        flat_info['metro_line'] = metro_line
        flat_info['metro_station'] = metro_station
        print(u'Station: {} ({} line)'.format(metro_station, metro_line))

        #
        # -- Price and commission --
        #
        about = flat.find_all(class_='about')
        if len(about) != 1:
            raise Exception('Unknown About layout')
        about = about[0]

        comission_info = about.find_all(class_='about__commission ')
        if not comission_info:
            # Flats without a commission element are skipped entirely.
            # NOTE(review): the original assigned no_comission = None right
            # before this `continue`, which suggests the skip may be
            # unintended -- confirm before changing.
            continue
        no_comission = comission_info[0].text == no_fee_label
        flat_info['no_add_fee'] = no_comission

        currencies = flat.find_all(
            class_='popup-prices popup-prices__wrapper clearfix'
        )[0]['data-prices']
        price = json.loads(currencies)[0]['currencies']['RUB']
        flat_info['price'] = int(price)

        print(u'Price: {}'.format(price))
        print(u'No additional fee: {}'.format(no_comission))

        #
        # -- Distance and post date --
        #
        dist_postdate = flat.find_all(class_='c-2')
        if len(dist_postdate) == 2:
            distance_tag, post_date_tag = dist_postdate
            post_date = post_date_tag.text
            distance, units = _parse_distance(distance_tag, number_re)
        elif len(dist_postdate) == 1:
            distance = None
            units = None
            post_date = dist_postdate[0].text
        else:
            print(dist_postdate)
            raise Exception('Unknown distance | post_date container layout')

        flat_info['distance'] = distance
        flat_info['posted_at'] = post_date

        # Unicode format strings: units and post_date may be non-ASCII text.
        print(u'Distance: {} {}'.format(distance, units))
        print(u'Posted at: {}'.format(post_date))
        print('')

        #
        # -- Get text description --
        #
        flat_info['description'] = processItemPage(flat_url)['description']

        parsed.append(flat_info)

    # Counts every flat element on the page, including skipped ones.
    print('Found {} flats'.format(len(flats)))

    for number, info in enumerate(parsed, 1):
        print('[{}] flat'.format(number))
        for key, value in info.items():
            print(u" {} --> {}".format(key, value))

    return parsed