def make_fingerprint(engine, person):
    """Compute a person's fingerprint and store it in the database.

    The long name built by ``make_long_name`` is passed through
    ``resolve_person``.  If resolving fails, the unresolved long name is
    used as the fingerprint instead.  The fingerprint is upserted into the
    ``person`` table (together with its URL slug) and into the ``rolle``
    table, both keyed on ``mdb_id``, and finally written back to
    ``person['fingerprint']``.

    A ``BadReference`` raised outside the resolve step is logged and
    nothing further is written.

    :param engine: database handle passed through to the ``sl`` helpers.
    :param person: mapping describing the person; ``person['mdb_id']`` is
        read and ``person['fingerprint']`` is set.
    """
    try:
        long_name = make_long_name(person)
        try:
            long_name = resolve_person(long_name)
            log.info(" -> %s", long_name.strip())
        except Exception:
            # Best effort: fall back to the unresolved long name.  Catching
            # Exception (instead of a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            log.error("Resolve did not work")

        Person = sl.get_table(engine, 'person')
        sl.upsert(engine, Person, {
            'fingerprint': long_name,
            'slug': url_slug(long_name),
            'mdb_id': person['mdb_id'],
        }, unique=['mdb_id'])

        Rolle = sl.get_table(engine, 'rolle')
        sl.upsert(engine, Rolle, {
            'mdb_id': person['mdb_id'],
            'fingerprint': long_name,
        }, unique=['mdb_id'])

        person['fingerprint'] = long_name
    except BadReference:
        log.error("Bad Reference %s", person)
def match_beitrag(engine, beitrag, url):
    """Resolve the person named in ``beitrag`` to a fingerprint.

    The long name from ``make_long_name`` is resolved with
    ``resolve_person``.  When no row with that fingerprint exists in the
    ``person`` table yet, ``make_person`` is called to create one.

    :param engine: database handle passed through to the ``sl`` helpers.
    :param beitrag: contribution record handed to ``make_long_name`` and
        ``make_person``.
    :param url: source URL forwarded to ``make_person``.
    :returns: the resolved fingerprint, or ``None`` if ``BadReference``
        was raised.
    """
    long_name = make_long_name(beitrag)
    # ASCII-safe rendering used for both log messages.
    printable = long_name.encode('ascii', 'replace')
    log.info("Matching: %s", printable)
    try:
        fingerprint = resolve_person(long_name)
        person_table = sl.get_table(engine, 'person')
        known = sl.find_one(engine, person_table, fingerprint=fingerprint)
        if known is None:
            make_person(engine, beitrag, fingerprint, url)
        return fingerprint
    except BadReference:
        log.info("Beitrag person is unknown: %s", printable)
    return None
def resolve_stimmen(engine, source_url):
    """Attach fingerprints to the votes recorded for ``source_url``.

    Every row of the ``abstimmung`` table with the given ``source_url``
    has its ``person`` value resolved via ``resolve_person``.  The outcome
    is upserted back keyed on ``person``: ``fingerprint`` holds the result
    (``None`` on ``BadReference``) and ``matched`` tells whether resolving
    succeeded.

    :param engine: database handle passed through to the ``sl`` helpers.
    :param source_url: value the ``source_url`` column is filtered on.
    """
    votes = sl.get_table(engine, 'abstimmung')
    for row in sl.find(engine, votes, source_url=source_url):
        # Stays None when the name cannot be resolved.
        fingerprint = None
        try:
            fingerprint = resolve_person(row['person'])
        except BadReference:
            log.info("No match for: %s", row['person'])

        record = {
            'person': row.get('person'),
            'matched': fingerprint is not None,
            'fingerprint': fingerprint,
        }
        sl.upsert(engine, votes, record, unique=['person'])
def resolve_stimmen(engine, source_url):
    """Resolve the voters of one source document to fingerprints.

    Rows of the ``abstimmung`` table matching ``source_url`` are looked up
    one by one with ``resolve_person``; each result is written back with
    an upsert that is unique on ``person``.

    :param engine: database handle passed through to the ``sl`` helpers.
    :param source_url: value the ``source_url`` column is filtered on.
    """
    abstimmung = sl.get_table(engine, 'abstimmung')
    rows = sl.find(engine, abstimmung, source_url=source_url)
    for vote in rows:
        try:
            resolved = resolve_person(vote['person'])
        except BadReference:
            # Unknown voter: store the row as unmatched.
            resolved = None
            log.info("No match for: %s", vote['person'])
        matched = resolved is not None
        sl.upsert(
            engine,
            abstimmung,
            {
                'person': vote.get('person'),
                'matched': matched,
                'fingerprint': resolved,
            },
            unique=['person'],
        )
def load_profiles(engine):
    """Load profile URLs from the feed at ``FEED_URL`` into ``person``.

    For every ``PROFIL`` element a lookup name is built from its
    ``VORNAME``, ``NACHNAME`` and ``PARTEI`` texts (the party is mapped
    through ``PARTEI_MAPPING`` when it has an entry there).  The name is
    resolved with ``resolve_person`` and the profile's ``url`` attribute
    is upserted as ``awatch_url``, keyed on the fingerprint.

    Profiles that lack one of the three name parts, or whose name cannot
    be resolved (``BadReference``), are skipped.

    :param engine: database handle passed through to the ``sl`` helpers.
    """
    doc = etree.parse(FEED_URL)
    Person = sl.get_table(engine, 'person')
    for profile in doc.findall('//PROFIL'):
        vorname = profile.findtext('.//VORNAME')
        nachname = profile.findtext('.//NACHNAME')
        if vorname is None or nachname is None:
            # findtext() returns None for a missing element; concatenating
            # it would raise TypeError and abort the whole import.
            continue

        partei = profile.findtext('.//PARTEI')
        partei = PARTEI_MAPPING.get(partei, partei)
        if partei is None:
            continue

        name = vorname + ' ' + nachname + ' ' + partei
        try:
            fp = resolve_person(name)
        except BadReference:
            continue
        sl.upsert(engine, Person, {
            'awatch_url': profile.get('url'),
            'fingerprint': fp,
        }, unique=['fingerprint'])
def parse_pois(self, group):
    """Split a group of interjections and resolve their speakers.

    ``group`` is split on ``' - '``.  A piece of the form
    ``"<speaker>: <text>"`` has its speaker resolved with
    ``resolve_person`` after the prefix ``'Gegenruf des Abg. '`` is
    removed; a piece without ``': '`` has no speaker.

    ``self.missing_recon`` is set to ``True`` when ``BadReference`` is
    raised; ``InvalidReference`` is ignored.

    :param group: text holding one or more interjections.
    :returns: generator of ``(speaker_name, fingerprint, text)`` tuples;
        ``speaker_name`` and ``fingerprint`` are ``None`` when absent or
        unresolved.
    """
    for poi in group.split(' - '):
        speaker_name, separator, remainder = poi.partition(': ')
        if not separator:
            # No "speaker: text" structure; the whole piece is the text.
            yield (None, None, poi)
            continue

        fingerprint = None
        lookup_name = speaker_name.replace('Gegenruf des Abg. ', '')
        try:
            fingerprint = resolve_person(lookup_name)
        except InvalidReference:
            pass
        except BadReference:
            self.missing_recon = True
        yield (speaker_name, fingerprint, remainder)
def parse_pois(self, group):
    """Yield ``(speaker_name, fingerprint, text)`` per interjection.

    The interjections in ``group`` are separated by ``' - '``.  When a
    piece contains ``': '``, the part before it is the speaker name and
    the rest is the text; the name (minus ``'Gegenruf des Abg. '``) is
    passed to ``resolve_person``.  ``BadReference`` sets
    ``self.missing_recon``; ``InvalidReference`` is silently ignored.  In
    both cases the fingerprint stays ``None``.

    :param group: text holding one or more interjections.
    """
    for chunk in group.split(' - '):
        pieces = chunk.split(': ', 1)
        if len(pieces) < 2:
            # Plain remark without a named speaker.
            yield (None, None, chunk)
            continue

        speaker_name, text = pieces
        try:
            fingerprint = resolve_person(
                speaker_name.replace('Gegenruf des Abg. ', ''))
        except InvalidReference:
            fingerprint = None
        except BadReference:
            fingerprint = None
            self.missing_recon = True
        yield (speaker_name, fingerprint, text)
def make_person(engine, beitrag, fp, source_url):
    """Create or update a ``person`` row from a contribution record.

    ``fp`` is resolved with ``resolve_person`` and a row is upserted into
    the ``person`` table, unique on ``fingerprint``.  ``vorname`` and
    ``nachname`` must be present in ``beitrag``; ``ort``, ``ressort``,
    ``land`` and ``fraktion`` are optional.

    :param engine: database handle passed through to the ``sl`` helpers.
    :param beitrag: mapping the person's attributes are read from.
    :param fp: name or fingerprint to resolve.
    :param source_url: stored in the row's ``source_url`` column.
    :returns: the resolved fingerprint, or ``fp`` unchanged when
        ``resolve_person`` raised ``BadReference``.
    """
    fingerprint = fp
    try:
        fingerprint = resolve_person(fp)
        record = {
            'fingerprint': fingerprint,
            'slug': url_slug(fingerprint),
            'source_url': source_url,
            'vorname': beitrag['vorname'],
            'nachname': beitrag['nachname'],
        }
        # Optional attributes default to None when missing.
        for key in ('ort', 'ressort', 'land', 'fraktion'):
            record[key] = beitrag.get(key)
        person_table = sl.get_table(engine, 'person')
        sl.upsert(engine, person_table, record, unique=['fingerprint'])
    except BadReference:
        pass
    return fingerprint
def speakers_webtv(engine, wp, session):
    """Resolve the distinct speakers of one web TV session.

    Each distinct non-``None`` ``speaker`` value of the ``webtv`` table
    for the given ``wp`` and ``session`` is normalised with
    ``speaker_name_transform`` and resolved with ``resolve_person``.  The
    result is upserted keyed on ``speaker``:

    * resolved: ``fingerprint`` set, ``matched`` true;
    * ``InvalidReference``: ``fingerprint`` ``None``, ``matched`` true;
    * ``BadReference``: ``fingerprint`` ``None``, ``matched`` false.

    :param engine: database handle passed through to the ``sl`` helpers.
    :param wp: value the ``wp`` column is filtered on.
    :param session: value the ``session`` column is filtered on.
    """
    webtv = sl.get_table(engine, 'webtv')
    rows = sl.distinct(engine, webtv, 'speaker', wp=wp, session=session)
    for row in rows:
        raw_name = row['speaker']
        if raw_name is None:
            continue

        lookup_name = speaker_name_transform(raw_name)
        fingerprint, matched = None, True
        try:
            fingerprint = resolve_person(lookup_name)
        except InvalidReference:
            pass
        except BadReference:
            matched = False

        record = {
            'fingerprint': fingerprint,
            'matched': matched,
            'speaker': raw_name,
        }
        sl.upsert(engine, webtv, record, unique=['speaker'])
def load_profiles(engine):
    """Import profile URLs from the XML feed at ``FEED_URL``.

    Each ``PROFIL`` element contributes a name made of its ``VORNAME``,
    ``NACHNAME`` and ``PARTEI`` texts, with the party translated through
    ``PARTEI_MAPPING`` where an entry exists.  The name is resolved with
    ``resolve_person``; on success the element's ``url`` attribute is
    upserted into the ``person`` table as ``awatch_url``, unique on
    ``fingerprint``.

    A profile is skipped when any of the three name parts is missing or
    when ``resolve_person`` raises ``BadReference``.

    :param engine: database handle passed through to the ``sl`` helpers.
    """
    doc = etree.parse(FEED_URL)
    Person = sl.get_table(engine, 'person')
    for profile in doc.findall('//PROFIL'):
        first_name = profile.findtext('.//VORNAME')
        if first_name is None:
            continue

        # findtext() yields None for an absent element.  Guard the other
        # name parts too, so one incomplete profile cannot raise TypeError
        # and stop the import.
        last_name = profile.findtext('.//NACHNAME')
        if last_name is None:
            continue
        partei = profile.findtext('.//PARTEI')
        party_name = PARTEI_MAPPING.get(partei, partei)
        if party_name is None:
            continue

        name = ' '.join((first_name, last_name, party_name))
        try:
            fp = resolve_person(name)
        except BadReference:
            continue
        sl.upsert(engine, Person, {
            'awatch_url': profile.get('url'),
            'fingerprint': fp,
        }, unique=['fingerprint'])
def make_person(engine, beitrag, fp, source_url):
    """Upsert a ``person`` row for the person behind ``beitrag``.

    The fingerprint is obtained by resolving ``fp`` with
    ``resolve_person``.  The stored row carries the fingerprint, its URL
    slug, ``source_url`` and the name and affiliation fields read from
    ``beitrag``.  If ``BadReference`` is raised, no row is written.

    :param engine: database handle passed through to the ``sl`` helpers.
    :param beitrag: mapping with ``vorname`` and ``nachname`` (required)
        and ``ort``, ``ressort``, ``land``, ``fraktion`` (optional).
    :param fp: name or fingerprint to resolve.
    :param source_url: stored in the row's ``source_url`` column.
    :returns: the resolved fingerprint, or the ``fp`` argument as given
        when resolving raised ``BadReference``.
    """
    try:
        fp = resolve_person(fp)
        slug = url_slug(fp)
        row = dict(
            fingerprint=fp,
            slug=slug,
            source_url=source_url,
            vorname=beitrag['vorname'],
            nachname=beitrag['nachname'],
            ort=beitrag.get('ort'),
            ressort=beitrag.get('ressort'),
            land=beitrag.get('land'),
            fraktion=beitrag.get('fraktion'),
        )
        table = sl.get_table(engine, 'person')
        sl.upsert(engine, table, row, unique=['fingerprint'])
    except BadReference:
        # Unresolvable person: fall through and return fp as it stands.
        pass
    return fp
def __iter__(self):
    """Iterate over the transcript in ``self.fh`` as contribution dicts.

    Lines are decoded as latin-1 and dashes are normalised.  Everything
    before the first line matching ``BEGIN_MARK`` is skipped and iteration
    ends at the first line matching ``END_MARK``.  Blank lines are dropped.

    A line matching ``SPEAKER_MARK`` (and neither ``TOP_MARK`` nor any of
    ``SPEAKER_STOPWORDS``) flushes the text collected so far and, if the
    name resolves, starts a new speaker.  A line matching ``POI_MARK``
    whose content does not start with ``'siehe'`` flushes the collected
    text and yields one ``'poi'`` item per interjection from
    ``parse_pois``.  Every other line is appended to the current text.

    Yielded dicts have the keys ``speaker``, ``type`` (``'chair'``,
    ``'speech'`` or ``'poi'``), ``fingerprint`` and ``text``.
    ``self.missing_recon`` is set when a speaker raises ``BadReference``.
    """
    self.in_session = False
    speaker = None
    fingerprint = None
    # One-element list so the nested function can change the flag.
    is_chair = [False]
    buffered = []

    def flush(reset_chair=True):
        """Return the collected text as an item and empty the buffer."""
        kind = 'chair' if is_chair[0] else 'speech'
        item = {
            'speaker': speaker,
            'type': kind,
            'fingerprint': fingerprint,
            'text': "\n\n".join(buffered).strip()
        }
        if reset_chair:
            is_chair[0] = False
        del buffered[:]
        return item

    for line in self.fh:
        line = line.decode('latin-1')
        line = line.replace(u'\u2014', '-').replace(u'\x96', '-')

        if not self.in_session:
            # Still in the preamble; only the begin marker changes that.
            if BEGIN_MARK.match(line):
                self.in_session = True
            continue

        if END_MARK.match(line):
            return

        if not line.strip():
            continue

        is_top = bool(TOP_MARK.match(line))
        lowered = line.lower()
        has_stopword = any(sw.lower() in lowered for sw in SPEAKER_STOPWORDS)

        speaker_match = SPEAKER_MARK.match(line)
        if not (speaker_match is None or is_top or has_stopword):
            if speaker is not None:
                yield flush()
            candidate = speaker_match.group(1)
            role = line.strip().split(' ')[0]
            try:
                fingerprint = resolve_person(candidate)
            except InvalidReference:
                pass
            except BadReference:
                self.missing_recon = True
            else:
                speaker = candidate
                is_chair[0] = role in CHAIRS
                continue
            # Unresolved: fall through and treat the line as ordinary text.

        poi_match = POI_MARK.match(line)
        if poi_match is not None:
            content = poi_match.group(1)
            if not content.lower().strip().startswith('siehe'):
                yield flush(reset_chair=False)
                for poi_speaker, poi_fp, poi_text in self.parse_pois(content):
                    yield {
                        'speaker': poi_speaker,
                        'type': 'poi',
                        'fingerprint': poi_fp,
                        'text': poi_text
                    }
                continue

        buffered.append(line)

    yield flush()
def __iter__(self):
    """Iterate over the transcript in ``self.fh`` as contribution dicts.

    Byte lines are decoded as latin-1; dashes are normalised.  Everything
    before the first line matching ``BEGIN_MARK`` is skipped and iteration
    ends at the first line matching ``END_MARK``.  ``WRITING_BEGIN`` and
    ``WRITING_END`` switch the ``in_writing`` flag on and off.  Blank
    lines are dropped.

    A line matching ``SPEAKER_MARK`` (and neither ``TOP_MARK`` nor any of
    ``SPEAKER_STOPWORDS``) flushes the text collected so far and, if the
    name resolves, starts a new speaker.  A line matching ``POI_MARK``
    whose content does not start with ``'siehe'`` flushes the collected
    text, clears ``in_writing`` and yields one ``'poi'`` item per
    interjection from ``parse_pois``.  Every other line is appended to
    the current text.

    Yielded dicts have the keys ``speaker``, ``in_writing``, ``type``
    (``'chair'``, ``'speech'`` or ``'poi'``), ``fingerprint`` and
    ``text``.  ``self.missing_recon`` is set when a speaker raises
    ``BadReference``.
    """
    self.in_session = False
    speaker = None
    fingerprint = None
    in_writing = False
    # One-element list so the nested function can change the flag.
    chair_ = [False]
    text = []

    def emit(reset_chair=True):
        """Return the collected text as an item and empty the buffer."""
        data = {
            'speaker': speaker,
            'in_writing': in_writing,
            'type': 'chair' if chair_[0] else 'speech',
            'fingerprint': fingerprint,
            'text': "\n\n".join(text).strip()
        }
        if reset_chair:
            chair_[0] = False
        # Clear in place; the list is shared with the enclosing generator.
        del text[:]
        return data

    for line in self.fh.readlines():
        # Decode byte strings only.  An explicit type check replaces the
        # former bare except, which hid every error raised here.
        if isinstance(line, bytes):
            line = line.decode('latin-1')
        line = line.replace(u'\u2014', '-')
        line = line.replace(u'\x96', '-')
        # Variant with non-breaking spaces normalised, used for the
        # end, writing and agenda-item markers.
        rline = line.replace(u'\xa0', ' ').strip()

        if not self.in_session:
            if BEGIN_MARK.match(line):
                self.in_session = True
            continue

        if END_MARK.match(rline):
            return

        if WRITING_BEGIN.match(rline):
            in_writing = True

        if WRITING_END.match(rline):
            in_writing = False

        if not line.strip():
            continue

        is_top = bool(TOP_MARK.match(rline))
        lowered = line.lower()
        has_stopword = any(sw.lower() in lowered for sw in SPEAKER_STOPWORDS)

        m = SPEAKER_MARK.match(line)
        if m is not None and not is_top and not has_stopword:
            if speaker is not None:
                yield emit()
            _speaker = m.group(1)
            role = line.strip().split(' ')[0]
            try:
                fingerprint = resolve_person(_speaker)
                speaker = _speaker
                chair_[0] = role in CHAIRS
                continue
            except InvalidReference:
                # Not a person reference; treat the line as ordinary text.
                pass
            except BadReference:
                self.missing_recon = True

        m = POI_MARK.match(line)
        if m is not None:
            if not m.group(1).lower().strip().startswith('siehe'):
                yield emit(reset_chair=False)
                in_writing = False
                for _speaker, _fingerprint, _text in self.parse_pois(m.group(1)):
                    yield {
                        'speaker': _speaker,
                        'in_writing': False,
                        'type': 'poi',
                        'fingerprint': _fingerprint,
                        'text': _text
                    }
                continue

        text.append(line)

    yield emit()