Пример #1
0
def update_mdbs_from_crawler(file: Path):
    """Create or update MDB entries from crawler person records.

    The person records are read from the JSON file ``file`` when one is
    given; otherwise they are fetched from the ``person`` collection of the
    remote crawler database.

    :param file: path to a JSON dump of the crawler's person records, or a
        falsy value to query the crawler database instead.
    :raises ConnectionError: if the records cannot be loaded from either
        source. The original error is chained as ``__cause__``.
    """
    try:
        if file:
            # Close the handle deterministically and decode as UTF-8, the
            # standard JSON encoding, instead of the platform default.
            with open(file.absolute(), "r", encoding="utf-8") as fh:
                persons = json.load(fh)
        else:
            persons = database.get_crawler_db()["person"].find({})
    except Exception as err:
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate; chaining keeps the real reason visible.
        raise ConnectionError(
            "Can't connect to remote crawler db. If you're developing locally you must specify a equivalent "
            "json with --file as fallback.") from err

    for p in persons:
        memberships = []
        for timeframe in p["fraktionen"]:
            # A membership without 'austrittsDatum' has no leaving date.
            austrittsdatum = None
            if 'austrittsDatum' in timeframe:
                austrittsdatum = get_safe_datetime(timeframe['austrittsDatum'])

            eintrittsdatum = get_safe_datetime(timeframe['eintrittsDatum'])
            memberships.append((
                eintrittsdatum,
                austrittsdatum,
                Faction.from_mdb_description(timeframe["beschreibung"])))

        # will auto create MDB if not yet existent
        MDB.find_and_add_in_storage(p['vorname'], p['nachname'], memberships, p['_id'],
                                    get_safe_datetime(p['geburtsdatum']), p['geburtsort'], p['titel'], p['beruf'],
                                    initial=True, created_by="init")
def retrieve_paragraph_keymap(add_debug_obj: bool = False):
    """Build a lookup from unambiguous surnames to their MDB objects.

    Surnames that occur more than once among the known MDBs are dropped,
    because there is currently no way to tell which person a bare surname
    refers to.

    :param add_debug_obj: accepted for interface compatibility; not used by
        this function.
    :return: dict keyed by surname.
    """
    person_keymap = {}
    ambiguous_surnames = set()

    # fetch person list from mdb database
    known_mdbs = MDB.find_known_mdbs()
    if len(known_mdbs) > 0:
        for entry in known_mdbs:
            surname = entry['surname']
            # TODO disambiguation improvement
            # For now MDB references are looked up by surname only, since
            # nothing here can contextualise role or forename references well
            # enough to tell who has been addressed. Surnames that appear
            # multiple times in the database are discarded for the same
            # reason.
            if surname in person_keymap:
                # second occurrence: remove it and remember it as ambiguous
                del person_keymap[surname]
                ambiguous_surnames.add(surname)
            elif surname not in ambiguous_surnames:
                person_keymap[surname] = surname

    # Replace the placeholder values with MDB objects. Only existing keys are
    # reassigned, so the dict keeps its size while being iterated.
    for surname in person_keymap:
        matches = database.find_many("mdb", {"surname": surname})
        if len(matches) <= 1:
            person_keymap[surname] = MDB(**matches[0])

    return person_keymap
Пример #3
0
def _collect_import_files(paths):
    """Expand ``paths`` into a flat list of files.

    Plain files are kept as they are, directories contribute their direct
    child files (no recursion), and anything else is skipped.
    """
    collected = []
    for path in paths:
        if path.is_file():
            collected.append(path)
        elif path.is_dir():
            collected.extend(child for child in path.iterdir() if child.is_file())
    return collected


def manual_import(args):
    """Import transcript files, extract interactions and store the result.

    Reads the following attributes of ``args``:

    * ``files``: paths of transcript files or directories containing them.
      ``.json`` files are read with ``read_transcripts_json_file``, all
      others with ``read_transcript_xml_file``.
    * ``dry_run``: if set, MDB storage is switched to "runtime" mode, nothing
      is written to the database, and the converted transcripts plus the
      runtime MDB storage are written as JSON files next to each input file.
    * ``add_debug_objects``: forwarded to ``extract_communication_model``.
    * ``notify``: if set, the sentiment analysis group is notified for each
      extracted transcript.
    """
    if args.dry_run:
        MDB.set_storage_mode("runtime")

    for file in _collect_import_files(args.files):
        logger.info("reading \"{}\" now...".format(file.as_posix()))
        transcripts = []

        if file.suffix.lower() == ".json":
            logger.info("reading json based transcript file now...")
            file_content = read_transcripts_json_file(file)
        else:
            logger.info("reading xml based transcript file now...")
            # the xml reader yields a single (metadata, candidates) pair
            file_content = [read_transcript_xml_file(file)]

        logger.info("extracting communication model now...")
        for metadata, inter_candidates in file_content:

            transcript = Transcript.from_interactions(
                metadata=metadata,
                interactions=extract_communication_model(
                    candidates=inter_candidates,
                    add_debug_objects=args.add_debug_objects))

            # insert into DB
            if not args.dry_run:
                transcript_dict = transcript.dict(exclude_none=True, exclude_unset=True)
                logger.info(f"writing transcript with '{len(transcript_dict['interactions'])}' interactions into db.")
                database.update_one("session", {"session_id": transcript.session_no}, transcript_dict)

            transcripts.append(transcript)

            # notify sentiment group
            if args.notify and transcript:
                utils.notify_sentiment_analysis_group([str(transcript.session_no)])

        cm = CommunicationModel(transcripts=transcripts)

        if args.dry_run:
            out_file: Path = file.with_suffix(".converted.json")
            logger.info("writing transcripts into {}.".format(out_file.absolute().as_posix()))
            with open(out_file, "w", encoding="utf-8") as o:
                o.write(cm.json(exclude_none=True, indent=4, ensure_ascii=False))
            # the MDB dump is rewritten after every processed file
            with open(out_file.parent / "mdb.json", "w", encoding="utf-8") as o:
                safe_json_dump(MDB._mdb_runtime_storage, o)
Пример #4
0
def _build_candidate(comment: str) -> InteractionCandidate:
    """Wrap ``comment`` in an InteractionCandidate spoken by a dummy MDB."""
    dummy_speaker = MDB.find_and_add_in_storage(
        forename="Likey",
        surname="McUnittest",
        memberships=[(datetime.min, None, Faction.NONE)])
    return InteractionCandidate(
        speaker=dummy_speaker,
        paragraph="Unittest",
        comment=comment)
Пример #5
0
    def test_extract_funny_sample_4(self):
        """Amusement of a single named delegate is attributed to that delegate."""
        raw_comment = "(Heiterkeit des Abg. Manfred Grund [CDU/CSU])"

        interactions = extract_communication_model(
            [_build_candidate(raw_comment)])
        first = interactions[0]

        expected_sender = MDB.find_and_add_in_storage(
            forename="Manfred",
            surname="Grund",
            memberships=[(datetime.min, None, Faction.CDU_AND_CSU)])

        self.assertEqual(first.sender, expected_sender)
        self.assertEqual(first.message,
                         'Heiterkeit des Abg. Manfred Grund [CDU/CSU]')
def _get_candidates(topic_points: List[Dict], speaker_map: Dict[str, MDB]) -> List[InteractionCandidate]:
    """Turn the speeches of the given topic points into interaction candidates.

    Only topic points whose ``ablaufTyp`` is "sitzungsbeginn" or
    "tagesordnungspunkt" (case-insensitive) are considered. Within a topic
    point, each paragraph is emitted either together with the comment that
    follows it or, when another paragraph follows, with ``comment=None``.

    :param topic_points: topic point dicts, each with an optional ``reden``
        list of speech dicts.
    :param speaker_map: speakers keyed by ``rednerId``. Speakers missing from
        the map are looked up in the ``mdb`` collection by ``mdb_number`` and
        cached in this dict, so the argument is mutated.
    :return: the extracted candidates in document order.
    """
    relevant_types = ("sitzungsbeginn", "tagesordnungspunkt")
    candidates = []
    not_in_speaker_list = []

    for tp in topic_points:
        if tp["ablaufTyp"].lower() not in relevant_types:
            continue

        # NOTE(review): the open paragraph is tracked per topic point, not per
        # speech. A paragraph still open at the end of one speech looks like
        # it is attributed to the speaker of the next speech, and the last
        # open paragraph of a topic point is never emitted — confirm whether
        # that is intended.
        last_paragraph = None
        for sp in tp.get("reden", []):
            # not every speech object carries content
            if "redeInhalt" not in sp:
                continue

            speaker_id = sp["rednerId"]
            speaker = speaker_map.get(speaker_id)

            if not speaker:
                # try to get speaker through mdb_number from DB
                record = database.find_one('mdb', {'mdb_number': speaker_id})
                if not record:
                    if speaker_id not in not_in_speaker_list:
                        not_in_speaker_list.append(speaker_id)
                    continue
                speaker = MDB(**record)
                speaker_map[speaker_id] = speaker

            for sp_part in sp["redeInhalt"]:
                part_type = sp_part["typ"].lower()
                if last_paragraph is not None and part_type == "paragraf":
                    # a new paragraph closes the previous one without comment
                    candidates.append(InteractionCandidate(
                        speaker=speaker,
                        paragraph=utils.cleanup_str(last_paragraph),
                        comment=None))
                    last_paragraph = sp_part["text"]
                elif part_type == "kommentar":
                    if last_paragraph and speaker:
                        candidates.append(InteractionCandidate(
                            speaker=speaker,
                            paragraph=utils.cleanup_str(last_paragraph),
                            comment=utils.cleanup_str(sp_part["text"])))
                    # a comment always consumes the open paragraph
                    last_paragraph = None
                else:
                    last_paragraph = sp_part["text"]

    # Only warn when something is actually missing; an unconditional warning
    # with an empty list is just noise in the log.
    if not_in_speaker_list:
        logger.warning(f"Following speakers were not in the speaker list: {not_in_speaker_list}")
    return candidates
Пример #7
0
    def test_extract_sample3(self):
        """A named heckle followed by an anonymous faction call gives two interactions."""
        raw_comment = "(Carsten Schneider [Erfurt] [SPD]: Was für ein Blödsinn! – Zuruf vom BÜNDNIS90/DIE GRÜNEN: Vielleicht mal lesen! Lesen bildet!)"

        interactions = extract_communication_model(
            [_build_candidate(raw_comment)])
        heckle = interactions[0]
        faction_call = interactions[1]

        expected_heckler = MDB.find_and_add_in_storage(
            forename="Carsten",
            surname="Schneider",
            memberships=[(datetime.min, None, Faction.SPD)])

        self.assertEqual(heckle.sender, expected_heckler)
        self.assertEqual(heckle.message, 'Was für ein Blödsinn!')
        self.assertEqual(faction_call.sender, Faction.DIE_GRÜNEN)
        self.assertEqual(faction_call.message,
                         'Vielleicht mal lesen! Lesen bildet!')
Пример #8
0
    def test_extract_sample2(self):
        """Applause from five factions plus a delegate's call gives six interactions."""
        raw_comment = "(Beifall bei der CDU/CSU, der SPD, der FDP, der LINKEN und dem BÜNDNIS 90/DIE GRÜNEN – Zuruf des Abg. Armin-Paulus Hampel [AfD])"

        result = extract_communication_model([_build_candidate(raw_comment)])
        # fetch all six up front so a too-short result fails before any assert
        interactions = [result[idx] for idx in range(6)]

        applause = 'Beifall bei der CDU/CSU, der SPD, der FDP, der LINKEN und dem BÜNDNIS 90/DIE GRÜNEN'
        applauding_factions = (
            Faction.CDU_AND_CSU,
            Faction.SPD,
            Faction.FDP,
            Faction.DIE_LINKE,
            Faction.DIE_GRÜNEN,
        )
        # every applauding faction gets its own interaction with the full text
        for interaction, faction in zip(interactions, applauding_factions):
            self.assertEqual(interaction.sender, faction)
            self.assertEqual(interaction.message, applause)

        caller = interactions[5]
        expected_caller = MDB.find_and_add_in_storage(
            forename="Armin-Paulus",
            surname="Hampel",
            memberships=[(datetime.min, None, Faction.AFD)])
        self.assertEqual(caller.sender, expected_caller)
        self.assertEqual(caller.message,
                         'Zuruf des Abg. Armin-Paulus Hampel [AfD]')
def _convert_speaker(speaker_map: Dict[str, Dict]):
    def _fix_factions(factions) -> List[Tuple[datetime, datetime, Faction]]:
        fixed_factions = list()
        for f in factions:
            austrittsdatum = None
            if 'austrittsDatum' in f:
                austrittsdatum = f["austrittsDatum"]
                if not isinstance(f["austrittsDatum"], datetime):
                    austrittsdatum = datetime.fromisoformat(f["austrittsDatum"])

            eintrittsdatum = f["eintrittsDatum"]
            if not isinstance(f["eintrittsDatum"], datetime):
                eintrittsdatum = datetime.fromisoformat(f["eintrittsDatum"])

            fixed_factions.append((
                eintrittsdatum,
                austrittsdatum,
                Faction.from_mdb_description(f["beschreibung"]).value))
        return fixed_factions

    conv_map = dict()
    for k, v in speaker_map.items():

        birthday = v.get("geburtsdatum")
        if isinstance(v, str):
            birthday = datetime.fromisoformat(birthday)

        conv_map[v["_id"]] = MDB.find_and_add_in_storage(
            mdb_number=v["_id"],
            forename=utils.cleanup_str(v["vorname"]),
            surname=utils.cleanup_str(v["nachname"]),
            memberships=_fix_factions(v.get("fraktionen", list())),
            birthday=birthday,
            birthplace=utils.cleanup_str(v.get("geburtsort")),
            title=utils.cleanup_str(v.get("title")),
            job_title=utils.cleanup_str(v.get("beruf", "")),
            created_by="jsonParse")

    return conv_map
Пример #10
0
    def test_extract_funny_sample_3(self):
        """Joint amusement/applause of a faction and a delegate yields two interactions."""
        raw_comment = "(Heiterkeit und Beifall bei der CDU/CSU sowie des Abg. Jens Beeck [FDP])"

        interactions = extract_communication_model(
            [_build_candidate(raw_comment)])
        from_faction = interactions[0]
        from_delegate = interactions[1]

        # both interactions carry the complete, unsplit message
        full_message = 'Heiterkeit und Beifall bei der CDU/CSU sowie des Abg. Jens Beeck [FDP]'

        self.assertEqual(from_faction.sender, Faction.CDU_AND_CSU)
        self.assertEqual(from_faction.message, full_message)

        expected_delegate = MDB.find_and_add_in_storage(
            forename="Jens",
            surname="Beeck",
            memberships=[(datetime.min, None, Faction.FDP)])
        self.assertEqual(from_delegate.sender, expected_delegate)
        self.assertEqual(from_delegate.message, full_message)
Пример #11
0
    def test_extract_funny_sample2(self):
        """Applause of two factions plus a named heckle yields three interactions."""
        raw_comment = "(Beifall bei der SPD sowie bei Abgeordneten der LINKEN – Matthias W. Birkwald [DIE LINKE]: Ich mich auch!)"

        result = extract_communication_model([_build_candidate(raw_comment)])
        spd_applause = result[0]
        linke_applause = result[1]
        heckle = result[2]

        applause = 'Beifall bei der SPD sowie bei Abgeordneten der LINKEN'

        self.assertEqual(spd_applause.sender, Faction.SPD)
        self.assertEqual(spd_applause.message, applause)
        self.assertEqual(linke_applause.sender, Faction.DIE_LINKE)
        self.assertEqual(linke_applause.message, applause)

        expected_heckler = MDB.find_and_add_in_storage(
            forename="Matthias W.",
            surname="Birkwald",
            memberships=[(datetime.min, None, Faction.DIE_LINKE)])
        self.assertEqual(heckle.sender, expected_heckler)
        self.assertEqual(heckle.message, 'Ich mich auch!')
Пример #12
0
    def test_extract_sample1(self):
        """Applause of four factions plus a named heckle yields five interactions."""
        raw_comment = "(Beifall bei der FDP sowie bei Abgeordneten der CDU/CSU, der SPD und des BÜNDNISSES 90/DIE GRÜNEN – Dr. Eberhardt Alexander Gauland [AfD]: Ha, ha, ha!)"

        result = extract_communication_model([_build_candidate(raw_comment)])
        # fetch all five up front so a too-short result fails before any assert
        interactions = [result[idx] for idx in range(5)]

        applause = (
            'Beifall bei der FDP sowie bei Abgeordneten der CDU/CSU, der SPD und '
            'des BÜNDNISSES 90/DIE GRÜNEN')
        applauding_factions = (
            Faction.FDP,
            Faction.CDU_AND_CSU,
            Faction.SPD,
            Faction.DIE_GRÜNEN,
        )
        # every applauding faction gets its own interaction with the full text
        for interaction, faction in zip(interactions, applauding_factions):
            self.assertEqual(interaction.sender, faction)
            self.assertEqual(interaction.message, applause)

        heckle = interactions[4]
        expected_heckler = MDB.find_and_add_in_storage(
            forename="Eberhardt Alexander",
            surname="Gauland",
            memberships=[(datetime.min, None, Faction.AFD)])
        self.assertEqual(heckle.sender, expected_heckler)
        self.assertEqual(heckle.message, 'Ha, ha, ha!')
def _build_mdb(person_str, add_debug_obj):
    """Build an MDB (or MalformedMDB) from a textual person reference.

    The text is expected to look like ``"<name> [<metadata>] [<metadata>]"``.
    The text in front of each bracket group is collected as the name, and the
    bracket contents are searched for a faction.

    :param person_str: raw person reference taken from a comment.
    :param add_debug_obj: if truthy, attach debug info about the source text
        to the created MDB.
    :return: the MDB from ``MDB.find_and_add_in_storage``, or a
        ``MalformedMDB`` if the extracted name looks implausible.
    :raises AssertionError: if one bracket group contains more than one
        faction.
    """
    # Workaround for the optional "Abg." matching group not working in the
    # regex: drop everything up to and including the "Abg." token. Remove
    # this once the optional group works.
    abg_marker = "Abg."
    cut_idx = person_str.find(abg_marker)
    if cut_idx >= 0:
        space_idx = person_str.find(" ", cut_idx)
        if space_idx < 0:
            # No space after "Abg.": cut right behind the marker. Slicing
            # with -1 would keep only the last character.
            space_idx = cut_idx + len(abg_marker)
        person_str = person_str[space_idx:].strip()

    # treat round brackets like square ones
    person_str = person_str.replace("(", "[")
    person_str = person_str.replace(")", "]")

    num_opening_brackets = person_str.count("[")
    num_closing_brackets = person_str.count("]")

    if num_opening_brackets != num_closing_brackets:
        logger.warning(
            "the received person_str \"{}\" contains not the same amount of "
            "opening brackets as closing brackets. this might become a "
            "problem shortly after this...".format(person_str))

    if num_opening_brackets > num_closing_brackets:
        person_str = person_str.lstrip("[")

    # Split into the text in front of each bracket group (name parts) and the
    # bracket contents (metadata parts). Text after the last bracket group is
    # ignored.
    work_str = person_str
    person_parts = list()
    metadata_parts = list()
    while "[" in work_str:
        start_idx = work_str.find("[")
        close_idx = work_str.find("]", start_idx)

        person_parts.append(work_str[:start_idx].strip())
        if close_idx < 0:
            # Unbalanced "[": without a closing bracket the remainder would
            # never shrink and the loop would spin forever. Treat the rest
            # as metadata and stop.
            metadata_parts.append(work_str[start_idx:].strip().strip("[]"))
            break

        end_idx = close_idx + 1
        metadata_parts.append(work_str[start_idx:end_idx].strip().strip("[]"))
        work_str = work_str[end_idx:]

    full_name = " ".join(person_parts)

    # the first metadata part that names a faction wins
    faction = ""
    for mp in metadata_parts:
        found_factions = Faction.in_text(mp)
        if found_factions:
            if len(found_factions) != 1:
                logger.info(f"Found factions != 1: {found_factions}")
                # explicit raise so the check also holds under `python -O`
                raise AssertionError(
                    f"expected exactly one faction, found {found_factions}")
            faction = found_factions[0]
            break

    membership = list()
    if faction:
        membership = [(datetime.min, None, faction)]

    # repair hyphenated names that were split around the hyphen
    full_name = full_name.replace("- ", "-")
    full_name = full_name.replace(" -", "-")

    role, title, forename, surname = split_name_str(full_name)

    # detection of malformed extractions (will later remove interactions with malformed MDB)
    # check that forename and surname are filled and have more than one char in them
    malformed = not forename or len(forename) <= 1
    malformed = malformed or not surname or len(surname) <= 1
    # check if forename starts with a small char
    malformed = malformed or forename[0].islower()
    if not malformed:
        extended_keywords = keywords.copy()
        extended_keywords.update([
            "am", "um", "ne", "wo", "Wo", ".", "-", "der", "die", "das", "des",
            "von", "an", "h", "h."
        ])
        # Malformed if any keyword equals a token of the full name or the
        # lower-cased forename/surname. The name is split once, not once per
        # keyword.
        suspicious = set(full_name.split(" "))
        suspicious.add(forename.lower())
        suspicious.add(surname.lower())
        malformed = not extended_keywords.isdisjoint(suspicious)

    if malformed:
        return MalformedMDB(person_str, forename, surname, membership)

    debug_info = None
    if add_debug_obj:
        debug_info = {
            "constructed_from_text": True,
            "creation_person_str": person_str
        }

    return MDB.find_and_add_in_storage(forename=forename,
                                       surname=surname,
                                       memberships=membership,
                                       job_title=role,
                                       debug_info=debug_info,
                                       created_by="_buildMdb")
Пример #14
0
import unittest
from datetime import datetime

from cme.domain import InteractionCandidate, MDB, Faction
from cme.extraction import extract_communication_model

# Switch MDB storage to "runtime" mode at import time, before any test below
# creates an MDB (presumably an in-memory store, so the tests do not need a
# database — TODO confirm).
MDB.set_storage_mode("runtime")


def _build_candidate(comment: str) -> InteractionCandidate:
    """Wrap ``comment`` in an InteractionCandidate spoken by a dummy MDB."""
    dummy_speaker = MDB.find_and_add_in_storage(
        forename="Likey",
        surname="McUnittest",
        memberships=[(datetime.min, None, Faction.NONE)])
    return InteractionCandidate(
        speaker=dummy_speaker,
        paragraph="Unittest",
        comment=comment)


class TestExtraction(unittest.TestCase):
    def test_extract_sample1(self):
        comment = "(Beifall bei der FDP sowie bei Abgeordneten der CDU/CSU, der SPD und des BÜNDNISSES 90/DIE GRÜNEN – Dr. Eberhardt Alexander Gauland [AfD]: Ha, ha, ha!)"

        cm = extract_communication_model([_build_candidate(comment)])
        interaction_0 = cm[0]
        interaction_1 = cm[1]
        interaction_2 = cm[2]
        interaction_3 = cm[3]
        interaction_4 = cm[4]
        self.assertEqual(interaction_0.sender, Faction.FDP)
        self.assertEqual(
            interaction_0.message,
Пример #15
0
    def _extract(
            block_el: bs4e.Tag,
            curr_speaker: MDB = None,
            curr_paragraph: str = None) \
            -> List[InteractionCandidate]:
        """Collect interaction candidates from the children of ``block_el``.

        The children are processed in document order while tracking the
        current speaker and the currently open paragraph. A paragraph is
        emitted together with the comment that follows it, or with
        ``comment=None`` when another paragraph follows or the block ends.

        :param block_el: element whose direct children are scanned; ``rede``
            children are processed recursively.
        :param curr_speaker: speaker to start with. Inside this function it
            is either an MDB or a dict of keyword arguments for
            ``MDB.find_and_add_in_storage``, despite the MDB annotation.
        :param curr_paragraph: paragraph text that is still open on entry.
        :return: the candidates found in this block, including those of
            nested ``rede`` elements.
        """

        pms = list()
        for el in block_el:
            # there are random line breaks in the file which BeautifulSoup
            # makes accessible but we don't need
            if isinstance(el, bs4e.NavigableString):
                continue
            elif el.name == "name" or (el.name == "p" and el.get("klasse") == "N"):
                # a bare name line: speaker without faction information
                role, title, first_name, last_name = split_name_str(cleanup_str(el.getText().rstrip(":")))
                curr_speaker = {
                    "forename": cleanup_str(first_name),
                    "surname": cleanup_str(last_name),
                    "memberships": [(datetime.min, None, Faction.NONE)],
                    "job_title": role,
                    "title": title
                }
            elif el.name == "rede":
                # The nested call gets the current state but cannot update the
                # local curr_speaker / curr_paragraph of this frame.
                # NOTE(review): a paragraph that is open here is also handled
                # inside the nested call, so it looks like it can be emitted
                # twice — confirm.
                pms += _extract(el, curr_speaker, curr_paragraph)
            elif el.name == "p":
                category = el.get("klasse")

                if category == "redner":
                    # workaround for the situation in which the fraktion tags in
                    # the xml somehow contain a direct speech formatted like this "SPD: ja."
                    faction_txt = _safe_get_text(el.redner, "fraktion")
                    if ":" in faction_txt:
                        faction_txt = faction_txt.split(":")[0].strip()

                    # TODO: Proper name and integrate into find_in_storage
                    curr_speaker = {
                        "mdb_number": el.redner.get("id"),
                        "forename": _safe_get_text(el.redner, "vorname"),
                        "surname": _safe_get_text(el.redner, "nachname"),
                        "memberships": [(datetime.min, None, Faction.from_name(faction_txt))],
                        "job_title": _safe_get_text(el.redner, "rolle_lang")}

                elif category in ["J", "J_1", "O", "Z"]:
                    # these categories are treated as speech paragraphs
                    new_para_str = cleanup_str(el.getText())
                    if curr_paragraph is not None:
                        if not curr_speaker:
                            # without a speaker the old paragraph is dropped
                            # silently (the warning below is disabled)
                            # logger.warning(
                            #    "found a new paragraph but couldn't finish "
                            #    "the old one as there has been no speaker so "
                            #    "far! dropping the old one (\"{}\") now...".format(curr_paragraph))
                            curr_paragraph = new_para_str
                            continue

                        # resolve a speaker dict into an MDB on demand
                        speaker = curr_speaker if isinstance(curr_speaker, MDB) \
                            else MDB.find_and_add_in_storage(**curr_speaker, created_by="manualXmlParser")

                        # close the previous paragraph without a comment
                        pms.append(InteractionCandidate(
                            speaker=speaker,
                            paragraph=curr_paragraph,
                            comment=None))
                    curr_paragraph = new_para_str
                else:
                    logger.debug("Ignoring unhandled category \"{}\" of tag "
                                 "p.".format(category))
            elif el.name == "kommentar":
                if not curr_speaker:
                    if logging_is_needed(el.getText()):
                        logger.warning(
                            "found a comment but there has been no speaker so far"
                            "! skipping it (\"{}\") until we find a speaker...".format(
                                cleanup_str(el.getText())))
                    continue

                if not curr_paragraph:
                    logger.warning(
                        "found a comment but there has been no paragraph so far"
                        "! skipping it (\"{}\") until we find a paragraph...".format(
                            cleanup_str(el.getText())))
                    continue

                # resolve a speaker dict into an MDB on demand
                speaker = curr_speaker if isinstance(curr_speaker, MDB) \
                    else MDB.find_and_add_in_storage(**curr_speaker, created_by="manualXmlParser")

                # the comment closes the open paragraph
                pms.append(InteractionCandidate(
                    speaker=speaker,
                    paragraph=curr_paragraph,
                    comment=cleanup_str(el.getText())))
                curr_paragraph = None

        # finish still open curr_paragraph
        if curr_paragraph is not None:
            if not curr_speaker:
                logger.warning(
                    "found a open paragraph but there has been no speaker so far"
                    "! skipping it (\"{}\"), but this should be investigated as it "
                    "means no speaker in the whole block has been found".format(
                        cleanup_str(curr_paragraph)))
                return pms

            # resolve a speaker dict into an MDB on demand
            speaker = curr_speaker if isinstance(curr_speaker, MDB) \
                else MDB.find_and_add_in_storage(**curr_speaker, created_by="manualXmlParser")

            pms.append(InteractionCandidate(
                speaker=speaker,
                paragraph=curr_paragraph,
                comment=None))

        return pms