def parse_exchange_page(page):
    """Parse an exchange page and create a Public for every unknown club.

    The page is parsed with lxml and every ``div`` whose class contains
    ``"row main-row-w"`` is treated as one listing row.  For each row the
    club id is taken from the ``href`` of the first link inside the row's
    first child; if ``Public.get(club_id=...)`` finds nothing, a new
    ``Public`` is created from the name, price, size and coverage scraped
    from that row.

    NOTE(review): another ``parse_exchange_page`` is defined later in this
    file; if both live in the same module the later one shadows this one.

    Args:
        page: HTML of the exchange page, as accepted by
            ``lxml.html.document_fromstring``.

    Returns:
        A ``(last_size, were_new)`` tuple: ``last_size`` is ``size`` of the
        Public belonging to the last processed row (``0`` when the page has
        no rows), ``were_new`` is ``True`` if at least one Public was
        created.

    Raises:
        Exception: whatever ``Public(...)`` raises is logged and re-raised.
    """
    logging.debug('Parsing exchange page')
    data = lxml.html.document_fromstring(page)
    rows = data.xpath('//div[contains(@class, "row main-row-w")]')

    # Raw strings keep the regex escapes literal (a plain '\d' is an invalid
    # string escape); the patterns themselves are unchanged.
    club_id_re = re.compile(r'club(\d+)*')
    non_digit_re = re.compile(r'[^0-9]')

    def digits_to_int(element):
        """Return the element's text as an int, dropping every non-digit."""
        return int(non_digit_re.sub('', element.text_content()))

    last_size = 0
    were_new = False
    with db_session:
        for row in rows:
            club = row.getchildren()[0].xpath('.//a')[0]
            club_id = club_id_re.search(club.attrib['href']).group(1)
            public = Public.get(club_id=club_id)
            if public is None:
                name = club.text_content().strip() or 'Noname'
                price = digits_to_int(
                    row.xpath('.//span[contains(@class, "js_placement_price")]')[0])
                # Exactly two "num" spans are expected: size, then coverage.
                size, coverage = (
                    digits_to_int(span)
                    for span in row.xpath('.//span[@class="num"]')
                )
                try:
                    public = Public(club_id=club_id, name=name,
                                    size=size, coverage=coverage,
                                    price=price)
                except Exception:
                    logging.error('club_id: {}, name: {}, size: {}, price: {}'.
                                  format(club_id, name, size, price))
                    # Bare raise re-raises the active exception unchanged.
                    raise
                were_new = True
            # NOTE(review): nesting reconstructed from a collapsed source line;
            # last_size is assumed to be updated for every row, new or not.
            last_size = public.size
    return last_size, were_new
def parse_exchange_page(page):
    """Parse an exchange page and create a Public for every unknown club.

    The page is parsed with lxml and every ``a`` element with class
    ``exchange_ad_post_stats`` is treated as the anchor of one listing.
    The club id is read from that link's ``onclick`` attribute; if
    ``Public.get(club_id=...)`` finds nothing, the remaining fields are
    collected by walking the sibling elements that follow the link's
    parent, and a new ``Public`` is created.

    NOTE(review): an earlier ``parse_exchange_page`` is defined in this
    file; if both live in the same module this definition shadows it.

    Args:
        page: HTML of the exchange page, as accepted by
            ``lxml.html.document_fromstring``.

    Returns:
        A ``(last_size, were_new)`` tuple: ``last_size`` is ``size`` of the
        Public belonging to the last processed listing (``0`` when none are
        found), ``were_new`` is ``True`` if at least one Public was created.

    Raises:
        Exception: whatever ``Public(...)`` raises is logged and re-raised.
    """
    logging.debug('Parsing exchange page')
    data = lxml.html.document_fromstring(page)
    public_names = data.xpath('//a[@class="exchange_ad_post_stats"]')

    # Raw string keeps the regex escape literal (a plain '\d' is an invalid
    # string escape); the pattern itself is unchanged.
    club_id_re = re.compile(r'stats-(\d+)*')

    def text2int(text):
        """Return ``text`` as an int with spaces removed, or 0 if not numeric."""
        try:
            return int(text.replace(' ', ''))
        except (AttributeError, ValueError):
            # Best-effort parse: a missing or non-numeric value counts as 0.
            return 0

    last_size = 0
    were_new = False
    with db_session:
        for public_name in public_names:
            club_id = club_id_re.search(public_name.attrib['onclick']).group(1)
            public = Public.get(club_id=club_id)
            if public is None:
                # Element right after the stats link's parent: its href gives
                # the public id and its text the name.
                cur_path = public_name.getparent().getnext()
                public_id = cur_path.attrib['href'].lstrip('/')
                name = cur_path.text or 'Noname'

                # Two siblings further: text is used as the category.
                cur_path = cur_path.getnext().getnext()
                category = cur_path.text

                # The siblings after the category's parent each hold one
                # number in a <b> child: size, coverage pair, price.
                cur_path = cur_path.getparent().getnext()
                size = text2int(cur_path.xpath('b')[0].text_content())

                cur_path = cur_path.getnext()
                coverage2 = cur_path.xpath('b')[0].text_content()
                # Expected to be exactly two '/'-separated numbers.
                coverage, coverage_day = map(text2int, coverage2.split('/'))

                cur_path = cur_path.getnext()
                price = text2int(cur_path.xpath('b')[0].text_content())

                try:
                    public = Public(club_id=club_id, public_id=public_id,
                                    name=name, category=category, size=size,
                                    coverage=coverage,
                                    coverage_day=coverage_day, price=price)
                except Exception:
                    logging.error('public_id: {0}, name: {1}, size: {2}, price: {3}'.
                                  format(public_id, name, size, price))
                    # Bare raise re-raises the active exception unchanged.
                    raise
                were_new = True
            # NOTE(review): nesting reconstructed from a collapsed source line;
            # last_size is assumed to be updated for every listing, new or not.
            last_size = public.size
    return last_size, were_new