def load_agenda(engine, wp, session):
    """Scrape the agenda page of one session and load each item's speeches.

    Fetches ``WEBTV_BASE % (session, wp)`` and walks every ``tr`` of the
    table inside ``div[@class="meetingTable"]``.  A single record dict is
    built up across rows:

    * when the first cell of a row has text, the session fields
      (``session_name``, ``session_url``, ``session_date``) are refreshed
      and stay in effect for the rows that follow;
    * when the first cell holds an ``a`` element, the item fields
      (``item_id``, ``item_key``, ``item_label``, ``item_description``)
      are filled in and ``load_speeches`` is called with a copy of the
      record.

    :param engine: passed through unchanged to ``load_speeches``.
    :param wp: stored as ``data['wp']`` and used to build the URL.
    :param session: stored as ``data['session']`` and used to build the URL.
    :returns: ``False`` if no document could be fetched or the meeting
        table is missing, ``True`` once all rows have been processed.
    :raises ValueError: if a non-empty first cell does not end in a
        space-separated date of the form ``%d.%m.%Y``.
    """
    url = WEBTV_BASE % (session, wp)
    doc = _html(url, timeout=4.0)
    if doc is None:
        return False
    table = doc.find('//div[@class="meetingTable"]/table')
    if table is None:
        return False

    data = {'wp': wp, 'session': session}
    rows = table.findall('.//tr')
    for i, row in enumerate(rows):
        tds = row.findall('td')
        if not tds:
            # A row without <td> cells has nothing to read; indexing
            # tds[0] below would raise IndexError.
            continue

        session_name = tds[0].xpath('string()').strip()
        if session_name:
            data['session_name'] = session_name
            # The date is the last space-separated token of the name.
            _, date = session_name.rsplit(' ', 1)
            data['session_url'] = url
            data['session_date'] = datetime.strptime(
                date, "%d.%m.%Y").isoformat()

        anchor = tds[0].find('a')
        if anchor is None:
            continue

        data['item_id'] = anchor.get('name')
        # partition() instead of split('\n', 1): a cell without a newline
        # yields an empty label rather than an unpacking error.
        key, _, label = tds[1].xpath('string()').strip().partition('\n')
        data['item_key'] = key.strip().replace('TOP:', '').strip()
        data['item_label'] = label.strip()

        # The description is looked up in the row after the item row.
        # Tolerate an item on the last row or a row without the span, and
        # reset the field explicitly so the previous item's description
        # is never carried over in the shared record.
        text = None
        if i + 1 < len(rows):
            text = rows[i + 1].find('.//span[@class="hiddenTopText"]')
        if text is None:
            data['item_description'] = ''
        else:
            data['item_description'] = text.xpath('string()').strip()

        # Hand over a copy: load_speeches() mutates the dict it receives.
        load_speeches(engine, data.copy())
    return True
def load_speeches(engine, data):
    """Fetch the speech list of one agenda item and upsert one row per speech.

    The page at ``WEBTV_SPEECHES % (wp, session, item_id)`` is read as a
    flat list of ``tr`` elements processed in groups of four: the text of
    the first row of a group is the speaker, and the ``href`` of the first
    link in the third row provides the speech id (the part after the last
    ``=``).

    :param engine: passed to ``sl.get_table`` and ``sl.upsert``.
    :param data: record dict providing ``wp``, ``session`` and ``item_id``.
        It is mutated in place: ``speaker`` and ``speech_id`` are
        overwritten for every speech before the record is upserted into
        the ``webtv`` table, keyed on ``speech_id``.
    :returns: ``None``.
    """
    url = WEBTV_SPEECHES % (data['wp'], data['session'], data['item_id'])
    doc = _html(url)
    if doc is None:
        # load_agenda() treats a None result from _html() as "no page";
        # do the same here instead of failing on doc.findall().
        return

    rows = doc.findall('//tr')
    table = sl.get_table(engine, 'webtv')
    for i in range(0, len(rows), 4):
        if i + 2 >= len(rows):
            # Incomplete trailing group: the row holding the link is absent.
            break

        data['speaker'] = rows[i].xpath('string()').strip()
        if isinstance(data['speaker'], str):
            # NOTE(review): round-trips the text through Latin-1 bytes and
            # decodes them as UTF-8 -- presumably an encoding repair; the
            # effect depends on the Python version, so it is left as is.
            data['speaker'] = data['speaker'].encode('latin-1').decode('utf-8')

        link = rows[i + 2].find('.//a')
        href = link.get('href') if link is not None else None
        if href is None:
            # Without a link target there is no id to key the upsert on.
            continue
        data['speech_id'] = href.split('=')[-1]

        sl.upsert(engine, table, data, ['speech_id'])
        pprint(data)
def load_dip_index():
    """Yield an absolute URL for each ``linkIntern`` anchor of the index page.

    The page at ``EXTRAKT_INDEX`` is fetched (120 second timeout) when the
    generator is first advanced.  Every ``a`` element whose ``class``
    attribute equals ``linkIntern`` contributes its ``href``, resolved
    against ``EXTRAKT_INDEX``.
    """
    index_doc = _html(EXTRAKT_INDEX, timeout=120.0)
    anchors = index_doc.findall("//a[@class='linkIntern']")
    for anchor in anchors:
        href = anchor.get('href')
        yield urljoin(EXTRAKT_INDEX, href)
def load_index(engine, incremental=True):
    """Call ``load_vote`` for every link in the index page's link list.

    Fetches ``INDEX``, collects all ``a`` elements below a ``ul`` whose
    ``class`` attribute equals ``standardLinkliste``, resolves each
    ``href`` against ``INDEX`` and passes the resulting URL to
    ``load_vote`` together with ``engine`` and the ``incremental`` flag.
    """
    index_doc = _html(INDEX)
    anchors = index_doc.findall('//ul[@class="standardLinkliste"]//a')
    for anchor in anchors:
        href = anchor.get('href')
        vote_url = urlparse.urljoin(INDEX, href)
        load_vote(vote_url, engine, incremental=incremental)