def func(url):
    # Worker for a single stage URL: runs the stage handler against a
    # private engine, then disposes the engine and closes the DB session.
    # NOTE(review): ``proc``, ``indexer`` and ``force`` are free variables —
    # this only makes sense as a closure inside a stage runner (cf. the
    # ``_stage`` functions in this file); confirm the enclosing scope.
    from offenesparlament.core import etl_engine, db
    engine = etl_engine()
    proc['handler'](engine, indexer, url,
                    force=force)
    # Dispose the per-call engine so pooled connections are released.
    engine.dispose()
    db.session.close()
def index():
    """Render the backend start page listing all known plenary sessions."""
    db = etl_engine()
    webtv = sl.get_table(db, 'webtv')
    distinct_sessions = sl.distinct(db, webtv, 'wp', 'session', 'session_name')
    return render_template('backend/index.html',
                           sessions=sorted(distinct_sessions, reverse=True))
def load():
    """ Load and index staging DB into production """
    db_engine = etl_engine()
    # Import order is kept as-is: module import side effects happen between
    # the load and index phases, exactly as before.
    from offenesparlament.load import load as loader
    loader.load(db_engine)
    loader.aggregate()
    from offenesparlament.load import index as index_stage
    index_stage.index()
def speechmatcher_alignment_post(wp, session):
    """Persist a manually submitted alignment row, then return the
    refreshed alignment as JSON (delegates to the GET handler)."""
    db = etl_engine()
    alignments = sl.get_table(db, 'alignments')
    record = dict(request.form.items())
    # ``sequence`` arrives as a form string; the table key needs an int.
    record['sequence'] = int(record['sequence'])
    record.update({'wp': wp, 'session': session})
    sl.upsert(db, alignments, record, ['wp', 'session', 'sequence'])
    return speechmatcher_alignment_get(wp, session)
def extract_base():
    """ Run the extract stage """
    db = etl_engine()
    from offenesparlament.extract.xml import ausschuss
    ausschuss.load_index(db)
    # The news module is still imported (for its import-time effects) but
    # its index load is disabled, as in the original.
    from offenesparlament.extract.xml import news
    #news.load_index(db)
    from offenesparlament.extract.xml import mdb
    mdb.load_index(db)
def speechmatcher(wp, session):
    """Render the speech-matching UI for one plenary session: matched
    transcript speeches alongside the web-TV agenda items."""
    db = etl_engine()
    matched_speeches = sl.find(db, sl.get_table(db, 'speech'),
                               order_by='sequence', wahlperiode=wp,
                               sitzung=session, matched=True)
    agenda_items = list(sl.find(db, sl.get_table(db, 'webtv'),
                                wp=wp, session=session))
    return render_template('backend/speechmatcher.html',
                           speeches=matched_speeches, agenda=agenda_items,
                           wp=wp, session=session)
def update(force=False, threaded=False, preload=False):
    """ Update the entire database.

    Runs every ETL stage in order; the indexer is flushed even when a
    stage fails.

    :param force: re-process items even if unchanged.
    :param threaded: run stage processing with worker threads.
    :param preload: when True, disables NOMENKLATURA preloading
        (the config flag is set to ``not preload``, as before).
    """
    app.config["NOMENKLATURA_PRELOAD"] = not preload
    engine = etl_engine()
    indexer = get_indexer()
    try:
        for stage in [GREMIUM, PERSON, ABSTIMMUNG, ABLAUF, TRANSCRIPT]:
            # BUG FIX: the loop previously passed the undefined name
            # ``proc`` instead of the loop variable ``stage``, which would
            # raise NameError (and never use the stage list at all).
            # NOTE(review): ``_stage`` elsewhere calls
            # ``process(indexer, proc, ...)`` without an engine — confirm
            # the intended ``process`` signature matches this call.
            process(engine, indexer, stage, force=force, threaded=threaded)
    finally:
        indexer.flush()
def _stage(proc, url=None, force=False, threaded=False, preload=True):
    # Run a single ETL stage: over the whole index when ``url`` is None,
    # otherwise against just the given URL via the stage's handler.
    # The indexer is flushed in all cases.
    # NOTE(review): an identical definition (differing only in quote style)
    # appears again later in this file and shadows this one — confirm and
    # remove one of the two.
    app.config['NOMENKLATURA_PRELOAD'] = preload
    indexer = get_indexer()
    try:
        if url is None:
            process(indexer, proc, force=force, threaded=threaded)
        else:
            engine = etl_engine()
            proc['handler'](engine, indexer, url, force=force)
    finally:
        indexer.flush()
def _stage(proc, url=None, force=False, threaded=False, preload=True):
    """Run one ETL stage — the full index when ``url`` is None, otherwise
    only the given URL through the stage handler; always flush the indexer."""
    app.config["NOMENKLATURA_PRELOAD"] = preload
    indexer = get_indexer()
    try:
        if url is not None:
            proc["handler"](etl_engine(), indexer, url, force=force)
        else:
            process(indexer, proc, force=force, threaded=threaded)
    finally:
        indexer.flush()
def speechmatcher_alignment_get(wp, session):
    """Return the current transcript/agenda alignment for a session as
    JSON, keyed by sequence number, with a per-entry ``matched`` flag."""
    db = etl_engine()
    score, alignment = get_alignment(db, wp, session)
    by_sequence = {}
    for entry in alignment:
        seq = entry.pop('sequence')
        # An entry counts as matched when both fingerprints agree.
        entry['matched'] = entry['transcript_fp'] == entry['agenda_fp']
        del entry['transcript_fp']
        by_sequence[seq] = entry
    return jsonify({'score': score, 'alignment': by_sequence})
def transform():
    """ Transform and clean up content """
    # NOTE: the sequence below is order-dependent — each step reads tables
    # written by earlier steps. generate_person_long_names() is re-run
    # after the steps that add or merge person records, presumably to keep
    # the long-name table current — confirm before reordering anything.
    engine = etl_engine()
    from offenesparlament.transform import persons
    persons.generate_person_long_names(engine)
    from offenesparlament.transform import positions
    positions.extend_positions(engine)
    from offenesparlament.transform import namematch
    namematch.match_persons(engine)
    from offenesparlament.transform import abstimmungen
    abstimmungen.extend_abstimmungen(engine)
    # Refresh long names after vote extension / person matching.
    persons.generate_person_long_names(engine)
    from offenesparlament.transform import speechparser
    speechparser.load_transcripts(engine)
    from offenesparlament.transform import webtv
    webtv.merge_speeches(engine)
    from offenesparlament.transform import awatch
    awatch.load_profiles(engine)
    from offenesparlament.transform import speechmatch
    persons.generate_person_long_names(engine)
    speechmatch.extend_speeches(engine)
    persons.generate_person_long_names(engine)
    from offenesparlament.transform import drs
    drs.merge_speeches(engine)
def cache_abstimmungen(engine):
    # Build a nested lookup {date: {subject: set(drucksachen)}} over all
    # distinct (subject, date) pairs in the 'abstimmung' table.
    Abstimmung = sl.get_table(engine, 'abstimmung')
    data = defaultdict(dict)
    for e in sl.distinct(engine, Abstimmung, 'subject', 'date'):
        data[e['date']][e['subject']] = set(drucksachen(e['subject']))
    # Return a plain dict so callers don't get defaultdict auto-vivification.
    return dict(data.items())


def extend_beschluesse(engine):
    # Re-connect 'beschluss' rows to votes by matching the Fundstelle date
    # and any overlap between the referenced document sets.
    log.info("Re-connecting beschluesse ...")
    abstimmungen = cache_abstimmungen(engine)
    #pprint(abstimmungen)
    Beschluss = sl.get_table(engine, 'beschluss')
    for data in sl.find(engine, Beschluss):
        # Fundstelle starts with a German-format date ("DD.MM.YYYY ...").
        date = data['fundstelle'].split(' ')[0]
        data['date'] = datetime.strptime(date, '%d.%m.%Y').isoformat()
        if not data['dokument_text']:
            continue
        if data['date'] in abstimmungen:
            abst = abstimmungen[data['date']]
            doks = set(data['dokument_text'].split(', '))
            for subject, adoks in abst.items():
                # Any shared document id counts as a match.
                if len(doks & adoks):
                    print "MATCH", data['date'], doks, adoks


if __name__ == '__main__':
    engine = etl_engine()
    print "DESTINATION", engine
    extend_beschluesse(engine)
def download_docs():
    """ Download all PDFs from DIP. """
    db = etl_engine()
    from offenesparlament.extract import documents
    documents.load_documents(db)
def extract_votes():
    """ Run the extract stage """
    db = etl_engine()
    from offenesparlament.extract import abstimmungen
    abstimmungen.load_index(db)
# NOTE(review): the lines below up to ``return True`` are the tail of a
# function (presumably load_transcript) whose opening lines — defining
# ``engine``, ``sio``, ``seq``, ``Speech``, ``url``, ``session``, ``wp`` —
# fall outside this chunk; indentation here reflects that function body.
    parser = SpeechParser(engine, sio)
    for contrib in parser:
        # Skip contributions that contain only whitespace.
        if not len(contrib['text'].strip()):
            continue
        contrib['sitzung'] = session
        contrib['sequence'] = seq
        contrib['wahlperiode'] = wp
        contrib['source_url'] = url
        contrib['matched'] = True
        sl.upsert(engine, Speech, contrib,
                  unique=['sequence', 'sitzung', 'wahlperiode'])
        seq += 1
    if parser.missing_recon:
        # Mark the whole session as unmatched when reconciliation data
        # was missing during parsing.
        sl.upsert(engine, Speech, {
            'matched': False,
            'sitzung': session,
            'wahlperiode': wp
            }, unique=['sitzung', 'wahlperiode'])
    return True


def load_transcripts(engine, incremental=True):
    # Load transcripts for Wahlperiode 17, counting sessions upward from 33;
    # stop only after session 180 once a load reports failure.
    for i in count(33):
        if not load_transcript(engine, 17, i, incremental=incremental) and i > 180:
            break


if __name__ == '__main__':
    engine = etl_engine()
    print "DESTINATION", engine
    load_transcripts(engine)
def longextract():
    """ Run the extract stage, including long-running tasks """
    db = etl_engine()
    from offenesparlament.extract import wahlkreise
    wahlkreise.load_wahlkreise(db)
def extract_media():
    """ Run the extract stage """
    db = etl_engine()
    from offenesparlament.extract import webtv
    webtv.load_sessions(db)
def devtf():
    """ Transform and clean up content (dev bits) """
    db = etl_engine()
    from offenesparlament.transform import drs
    drs.merge(db)
def extract_docs():
    """ Run the extract stage """
    db = etl_engine()
    from offenesparlament.extract import dip
    dip.load_dip(db)