Пример #1
0
def retrieve():
    """Fetch the book list page and dump one ``Product`` per listed book.

    The list page is loaded through ``retrieve_data`` and at most the first
    100 rows marked with the schema.org ``Book`` item type are followed.
    Each book page is parsed for title, description, buy link, genres and
    cover image. Any exception raised while parsing a book page is printed
    and that book is skipped.
    """
    listing_url = "{}{}".format(BASE_URL, BOOK_LIST)
    listing_html = retrieve_data("goodreads.top-books.html", listing_url)
    listing = bs(listing_html, "html.parser")
    book_rows = listing.find_all("tr", {"itemtype": "http://schema.org/Book"})

    for book_row in book_rows[:100]:
        href = book_row.find("div", {"data-resource-type": "Book"}).a["href"]
        book_url = "{}{}".format(BASE_URL, href)
        cache_name = "{}.{}.html".format("goodreads", link_to_fname(href))

        print("Fetching {}...".format(book_url))
        book_html = retrieve_data(cache_name, book_url)

        try:
            page = bs(book_html, "html.parser")
            title = clean_whitespace(
                page.find("h1", {"id": "bookTitle"}).get_text())
            # The last <span> inside div#description is the one that is used.
            description = clean_whitespace(
                page.select("div#description span")[-1].get_text())
            buy_href = page.find("a", {"id": "buyButton"})["href"]
            genres = [
                clean_whitespace(node.get_text())
                for node in page.select(".left .bookPageGenreLink")
            ]
            cover = page.find("img", {"id": "coverImage"})["src"]
            if not cover.startswith("http"):
                # Relative image paths are resolved against the site root.
                cover = "{}{}".format(BASE_URL, cover)
            Product(title, "{}{}".format(BASE_URL, buy_href), cover, "books",
                    genres, description).dump()
        except Exception as e:
            # Best effort: report the failure and carry on with the next book.
            print("ERROR:", e)
        print("")
Пример #2
0
def normalize_namen(groups):
    """Split each group's title lines into German, French and Italian titles.

    ``groups`` is an iterable of ``(titles, members, sekretariat)`` tuples.
    ``titles[0]`` is always taken as the start of the German title. Every
    further title line is assigned by ``guess_language``; a line whose
    language is not one of ``de``/``fr``/``it`` is treated as a continuation
    of the most recently recognised language.

    Returns a list of
    ``(title_de, title_fr, title_it, members, sekretariat)`` tuples with the
    three titles passed through ``clean_whitespace``.
    """
    new_groups = []
    for titles, members, sekretariat in groups:
        parts = {'de': titles[0], 'fr': "", 'it': ""}

        # The rumantsch group is the only group that is actually named in
        # four separate languages. Hack around it by dropping the last title.
        if "rumantscha" in titles[0] and len(titles) == 4:
            titles = titles[:-1]

        last_language = 'de'
        for title in titles[1:]:
            title_removed_misleading_words = title.replace('Italianità', '')
            language = guess_language(title_removed_misleading_words,
                                      ['de', 'fr', 'it'])

            # Override wrong language guessing
            if title == 'Pro Balticum':
                language = 'fr'

            if language in parts:
                # NOTE(review): recognised lines are appended without a
                # separator, exactly as before; confirm that is intended.
                parts[language] += title.strip()
                # Remember the language so that a following unrecognised
                # line continues this title. Previously this was never
                # updated, so continuations always landed in the German
                # title.
                last_language = language
            else:
                parts[last_language] += " " + title.strip()

        new_groups.append(
            (clean_whitespace(parts['de']), clean_whitespace(parts['fr']),
             clean_whitespace(parts['it']), members, sekretariat))
    return new_groups
Пример #3
0
def normalize_namen(groups):
    """Pick the German, French and Italian title of each group by position.

    ``groups`` is an iterable of 7-tuples ``(titles, members, sekretariat,
    konstituierung, zweck, art_der_aktivitaeten, mitgliederliste)``.
    ``titles[0]`` is the German title, ``titles[1]`` the French one and
    ``titles[2]`` the Italian one; missing entries become ``None``. A warning
    is printed when ``guess_language`` disagrees with the positional
    assumption (a guess of ``'UNKNOWN'`` is accepted).

    Returns a list of 9-tuples with the three titles passed through
    ``clean_whitespace`` followed by the remaining six fields unchanged.
    """
    new_groups = []
    for (titles, members, sekretariat, konstituierung, zweck,
         art_der_aktivitaeten, mitgliederliste) in groups:
        title_de = titles[0]
        title_fr = titles[1] if len(titles) > 1 else None
        title_it = titles[2] if len(titles) > 2 else None

        # Same check for both languages; the language is guessed once per
        # title and reused for the warning text.
        for label, title, expected in (('title_fr', title_fr, 'fr'),
                                       ('title_it', title_it, 'it')):
            if not title:
                continue
            guessed = guess_language(title, ['de', 'fr', 'it'])
            if guessed not in (expected, 'UNKNOWN'):
                print("Warning: {} '{}' guess lanuage is guessed '{}'\n".
                      format(label, title, guessed))

        # NOTE(review): title_fr / title_it may be None here; this relies on
        # clean_whitespace accepting None - confirm against its definition.
        new_groups.append(
            (clean_whitespace(title_de), clean_whitespace(title_fr),
             clean_whitespace(title_it), members, sekretariat, konstituierung,
             zweck, art_der_aktivitaeten, mitgliederliste))
    return new_groups
Пример #4
0
def retrieve_products_for_interest(interest):
    """Fetch the gift listing for ``interest`` and dump one ``Product`` each.

    The listing URL is built from ``BASE_URL``, ``LIST_URL``, the interest
    name and ``QUERY_STR``. At most the first 100 product links found on the
    listing are followed. Title, description, image and price are read from
    each product page and tags are derived from the description with
    ``get_tags``. Any exception raised while extracting a product is printed
    and that product is skipped.
    """
    listing_url = "{}{}/{}-gifts{}".format(BASE_URL, LIST_URL, interest,
                                           QUERY_STR)
    listing_html = retrieve_data("uncommon-goods.{}.html".format(interest),
                                 listing_url)
    listing = bs(listing_html, "html.parser")
    hrefs = [anchor["href"] for anchor in listing.select("article.product a")]

    for href in hrefs[:100]:
        product_url = "{}{}".format(BASE_URL, href)
        cache_name = "{}.{}.html".format("uncommon-goods",
                                         link_to_fname(href))

        print("Fetching {}...".format(product_url))
        page = bs(retrieve_data(cache_name, product_url), "html.parser")

        try:
            title = clean_whitespace(
                page.find("h1", {"itemprop": "name"}).get_text())
            description = clean_whitespace(
                page.select_one(".theStoryCopy p").get_text())
            image = page.select_one("a#mainImage img")["src"]
            if not image.startswith("http"):
                # Relative image paths are resolved against the site root.
                image = "{}{}".format(BASE_URL, image)
            price_text = page.find("span", {"itemprop": "price"}).get_text()
            price = float(clean_whitespace(price_text))
            tags = get_tags(description)
            Product(title,
                    "{}{}".format(BASE_URL, href),
                    image,
                    interest,
                    tags,
                    description,
                    price=price).dump()
        except Exception as e:
            # Best effort: report the failure and carry on with the next one.
            print("ERROR:", e)
        print("")
Пример #5
0
def retrieve():
    """Fetch the film list page and dump one ``Product`` per listed film.

    At most the first 100 title links of the list table are followed. Title,
    plot summary, poster image, watch link and genres are read from each film
    page. Any exception raised while extracting a film is printed and that
    film is skipped.
    """
    listing_url = "{}{}".format(BASE_URL, FILM_LIST)
    listing_html = retrieve_data("imdb.top-films.html", listing_url)
    listing = bs(listing_html, "html.parser")
    hrefs = [
        anchor["href"]
        for anchor in listing.select("tbody.lister-list tr .titleColumn a")
    ]

    for href in hrefs[:100]:
        film_url = "{}{}".format(BASE_URL, href)
        cache_name = "{}.{}.html".format("imdb", link_to_fname(href))

        print("Fetching {}...".format(film_url))
        page = bs(retrieve_data(cache_name, film_url), "html.parser")

        try:
            title = clean_whitespace(
                page.select_one(".title_wrapper h1").get_text())
            summary_node = page.select_one(".plot_summary .summary_text")
            description = clean_whitespace(summary_node.get_text())
            poster = page.select_one(".poster a img")["src"]
            if not poster.startswith("http"):
                # Relative image paths are resolved against the site root.
                poster = "{}{}".format(BASE_URL, poster)
            watch_href = page.select_one(
                ".winner-option.watch-option")["data-href"]
            genres = [
                clean_whitespace(node.get_text()) for node in page.select(
                    ".title_wrapper .subtext a[href^=\"/genre\"]")
            ]
            Product(title,
                    "{}{}".format(BASE_URL, watch_href),
                    poster,
                    "films",
                    genres,
                    description).dump()
        except Exception as e:
            # Best effort: report the failure and carry on with the next film.
            print("ERROR:", e)
        print("")
Пример #6
0
 def __init__(self,
              name,
              link,
              image,
              primary_tag,
              secondary_tags,
              description,
              price=None):
     """Store the product fields and normalise the secondary tags.

     Each entry of ``secondary_tags`` is passed through ``clean_whitespace``
     and lower-cased; duplicates are removed by collecting the results in a
     set, so the order of ``self.secondary_tags`` is not the input order.
     ``price`` is optional and defaults to ``None``.
     """
     super(Product, self).__init__()
     normalised_tags = {
         clean_whitespace(raw_tag).lower() for raw_tag in secondary_tags
     }
     self.name = name
     self.link = link
     self.image = image
     self.primary_tag = primary_tag
     self.secondary_tags = list(normalised_tags)
     self.description = description
     self.price = price
Пример #7
0
def handle_homepage_and_sekretariat(group, name_de, name_fr, name_it,
                                    organisation_id, summary, conn,
                                    batch_time):
    """Derive address, homepage and alias of a group and print SQL for them.

    The lines in ``group["sekretariat"]`` are scanned for street lines,
    address additions, a postal code with town, and upper-case aliases. A
    homepage is taken from the longest URL in the text or, failing that,
    from the first e-mail host that is not on the exclusion list.

    When ``organisation_id`` is falsy an INSERT statement for a new group is
    printed. Otherwise the sekretariat, address, homepage and alias stored in
    the database are compared with the derived values and an UPDATE statement
    is printed for each one that differs. ``summary`` is notified of every
    addition or change. Nothing is returned.
    """
    sekretariat = "\n".join(group["sekretariat"])
    sekretariat_line = '; '.join(sekretariat.splitlines())

    # Break the text into candidate address lines on newlines, ';' and ','.
    split_text = sekretariat
    for separator in ('; ', ';', ', ', ','):
        split_text = split_text.replace(separator, '\n')

    street_lines, zusatz_lines, alias_list = [], [], []
    adresse_plz = adresse_ort = None
    # Scanning stops at the first line that holds an e-mail address or a
    # postal code; lines after that are not looked at.
    for line_raw in split_text.splitlines():
        line = clean_whitespace(line_raw)
        if re.search(r'@\w+\.\w+', line):
            # The e-mail address has been reached
            break

        m_str = re.search(
            r'(strasse|gasse|weg|rain|graben|gebäude\b|park|platz|zentrum|av\.|chemin|rue|quai|route|via|Technopôle|Bollwerk)|^\d{1,3} [a-z]+',
            line, re.IGNORECASE)
        m_zusatz = re.search(
            r'(Postfach|Case postale|Casella postale|Botschaft|c/o|p\.a\.|Schweiz|Suisse|Swiss|Svizzera|Schweizer|Schweizerischer|Schweizerische|Herr|Frau|Monsieur|Madame|Dr\. | AG|\.ag| SA|GmbH|Sàrl|Ltd|Public|Swiss|Pro|relazioni|Repubblica|Cancelleria|Lia|Koalition|Forum|International|Institut|\bHaus\b|Stiftung|Verein|verband|vereinigung|forum|Gesellschaft|Association|Fédération|Sekretariat|sekretär|Geschäft|Vereinigung|Collaborateur|Bewegung|Minister|Direktor|präsident|Assistent|Délégation|Comité|national|Mesdames|Messieurs|industrie|Inclusion|organisation|Partner|Center|Netzwerk|[^.]com|Vauroux|furrerhugi|Burson|konferenz|bewegung|\.iur|rat|Leiter|Kommunikation)',
            line, re.IGNORECASE)
        m_ort = re.search(r'(\d{4,5}) ([\w. ]+)', line, re.IGNORECASE)
        m_alias = re.findall(r'\b([A-Z]{3,5})\b', line)

        if m_str:
            street_lines.append(line)
        if m_zusatz:
            zusatz_lines.append(line)
        if m_ort:
            adresse_plz, adresse_ort = m_ort.group(1, 2)
            break
        # The listed abbreviations are never recorded as aliases.
        alias_list.extend(
            candidate for candidate in m_alias
            if candidate not in ('CVP', 'EVP', 'FDP', 'SVP', 'GLP'))

    adresse_str = "; ".join(street_lines) if street_lines else None

    if zusatz_lines:
        adresse_zusatz = "; ".join(zusatz_lines)
        if len(adresse_zusatz) > 150:
            print("ERROR 'adresse_zusatz' TOO LONG: " +
                  str(len(adresse_zusatz)))
            # `line` is the last line the scan above looked at.
            print("Line: " + line)
            print("Adresse_zusatz: " + adresse_zusatz)
    else:
        adresse_zusatz = None

    alias = "; ".join(alias_list) if alias_list else None

    adresse = (adresse_str, adresse_zusatz, adresse_plz, adresse_ort)

    url_candidates = re.findall(WEB_URL_REGEX, sekretariat)
    email_host = re.findall(r"@([a-zA-Z.\-_]+)", sekretariat)

    homepage = None
    if url_candidates:
        # Several URLs may be present; the longest one is used.
        homepage = max(url_candidates, key=len)
        if not re.match('^https?://', homepage):
            homepage = 'http://' + homepage
    else:
        # Fall back to the first e-mail host that is not excluded below.
        for host in email_host:
            if host not in ('parl.ch', 'bluewin.ch', 'gmail.com', 'yahoo.com',
                            'yahoo.de', 'yahoo.fr', 'gmx.ch', 'gmx.net',
                            'gmx.de', 'swissonline.ch', 'hotmail.com',
                            'bluemail.ch', 'outlook.com'):
                homepage = ("http://" + host).upper()
                break

    if not organisation_id:
        print("\n-- Neue parlamentarische Gruppe: '{}'".format(name_de))
        print(
            sql_statement_generator.insert_parlamentarische_gruppe(
                name_de, name_fr, name_it, sekretariat, adresse_str,
                adresse_zusatz, adresse_plz, adresse_ort, homepage, alias,
                batch_time))
        summary.organisation_added()
        return

    # --- existing group: compare each stored value with the derived one ---
    db_sekretariat = db.get_organisation_sekretariat(conn, organisation_id)
    db_sekretariat_line = ('; '.join(db_sekretariat.splitlines())
                           if db_sekretariat else '')

    if db_sekretariat_line != sekretariat_line:
        if db_sekretariat:
            summary.sekretariat_changed()
            print('-- Sekretariat alt: ' + db_sekretariat_line)
            print('-- Sekretariat neu: ' + sekretariat_line)
            print("-- Sekretariat der Gruppe '{}' geändert".format(name_de))
        else:
            summary.sekretariat_added()
            print(
                "-- Sekretariat der Gruppe '{}' hinzugefügt".format(name_de))
        print(
            sql_statement_generator.update_sekretariat_organisation(
                organisation_id, sekretariat, batch_time))

    db_adresse = db.get_organisation_adresse(conn, organisation_id)

    if db_adresse != adresse:
        if db_adresse:
            summary.adresse_changed()
            print("-- Adresse der Gruppe '{}' geändert von {} zu {}".format(
                name_de, db_adresse, adresse))
        else:
            summary.adresse_added()
            print("-- Adresse der Gruppe {} hinzugefügt".format(name_de))
        print('-- Sekretariat: ' + sekretariat.replace('\n', '; '))
        print(
            sql_statement_generator.update_adresse_organisation(
                organisation_id, adresse_str, adresse_zusatz, adresse_plz,
                adresse_ort, batch_time))

    db_homepage = db.get_organisation_homepage(conn, organisation_id)

    if db_homepage != homepage:
        if db_homepage:
            summary.website_changed()
            print("-- Website der Gruppe '{}' geändert von '{}' zu '{}'".
                  format(name_de, '\\n'.join(db_homepage.splitlines()),
                         homepage))
        else:
            summary.website_added()
            print("-- Website der Gruppe '{}' hinzugefügt: '{}'".format(
                name_de, homepage))
        print(
            sql_statement_generator.update_homepage_organisation(
                organisation_id, homepage, batch_time))

    db_alias = db.get_organisation_alias(conn, organisation_id)

    if db_alias != alias:
        if db_alias:
            summary.alias_changed()
            print("-- Alias der Gruppe '{}' geändert von '{}' zu '{}'".format(
                name_de, '\\n'.join(db_alias.splitlines()), alias))
        else:
            summary.alias_added()
            print("-- Alias der Gruppe '{}' hinzugefügt: '{}'".format(
                name_de, alias))
        print(
            sql_statement_generator.update_alias_organisation(
                organisation_id, alias, batch_time))
Пример #8
0
def handle_homepage_and_sekretariat(group, name_de, name_fr, name_it, organisation_id, summary, conn, batch_time):
    """Print the SQL needed to bring one group's contact data up to date.

    Address parts (street, addition, postal code, town), aliases and a
    homepage are extracted from ``group["sekretariat"]``. For a falsy
    ``organisation_id`` an INSERT for a new group is printed; otherwise the
    stored sekretariat, address, homepage and alias are read through ``db``
    and an UPDATE is printed for each value that differs. ``summary``
    receives one notification per addition or change. Returns ``None``.
    """
    party_abbreviations = ('CVP', 'EVP', 'FDP', 'SVP', 'GLP')
    excluded_mail_hosts = (
        'parl.ch', 'bluewin.ch', 'gmail.com', 'yahoo.com', 'yahoo.de',
        'yahoo.fr', 'gmx.ch', 'gmx.net', 'gmx.de', 'swissonline.ch',
        'hotmail.com', 'bluemail.ch', 'outlook.com')

    sekretariat = "\n".join(group["sekretariat"])
    sekretariat_line = '; '.join(sekretariat.splitlines())

    # Treat ';' and ',' like line breaks when looking for address parts.
    flattened = sekretariat.replace('; ', '\n').replace(';', '\n')
    flattened = flattened.replace(', ', '\n').replace(',', '\n')
    sekretariat_list = flattened.splitlines()

    adresse_str_list = []
    adresse_zusatz_list = []
    alias_list = []
    adresse_plz = None
    adresse_ort = None
    # The scan ends at the first line containing an e-mail address or a
    # postal code followed by a town.
    for line_raw in sekretariat_list:
        line = clean_whitespace(line_raw)
        if re.search(r'@\w+\.\w+', line):
            # The e-mail address has been reached
            break

        is_street = re.search(r'(strasse|gasse|weg|rain|graben|gebäude\b|park|platz|zentrum|av\.|chemin|rue|quai|route|via|Technopôle|Bollwerk)|^\d{1,3} [a-z]+', line, re.IGNORECASE)
        is_zusatz = re.search(r'(Postfach|Case postale|Casella postale|Botschaft|c/o|p\.a\.|Schweiz|Suisse|Swiss|Svizzera|Schweizer|Schweizerischer|Schweizerische|Herr|Frau|Monsieur|Madame|Dr\. | AG|\.ag| SA|GmbH|Sàrl|Ltd|Public|Swiss|Pro|relazioni|Repubblica|Cancelleria|Lia|Koalition|Forum|International|Institut|\bHaus\b|Stiftung|Verein|verband|vereinigung|forum|Gesellschaft|Association|Fédération|Sekretariat|sekretär|Geschäft|Vereinigung|Collaborateur|Bewegung|Minister|Direktor|präsident|Assistent|Délégation|Comité|national|Mesdames|Messieurs|industrie|Inclusion|organisation|Partner|Center|Netzwerk|[^.]com|Vauroux|furrerhugi|Burson|konferenz|bewegung|\.iur|rat|Leiter|Kommunikation)', line, re.IGNORECASE)
        plz_ort = re.search(r'(\d{4,5}) ([\w. ]+)', line, re.IGNORECASE)
        upper_words = re.findall(r'\b([A-Z]{3,5})\b', line)

        if is_street:
            adresse_str_list.append(line)
        if is_zusatz:
            adresse_zusatz_list.append(line)
        if plz_ort:
            adresse_plz = plz_ort.group(1)
            adresse_ort = plz_ort.group(2)
            break
        for word in upper_words:
            if word not in party_abbreviations:
                alias_list.append(word)

    adresse_str = "; ".join(adresse_str_list) if adresse_str_list else None

    adresse_zusatz = None
    if adresse_zusatz_list:
        adresse_zusatz = "; ".join(adresse_zusatz_list)
        if len(adresse_zusatz) > 150:
            print("ERROR 'adresse_zusatz' TOO LONG: " + str(len(adresse_zusatz)))
            # `line` is the last line the scan above looked at.
            print("Line: " + line)
            print("Adresse_zusatz: " + adresse_zusatz)

    alias = "; ".join(alias_list) if alias_list else None

    adresse = (adresse_str, adresse_zusatz, adresse_plz, adresse_ort)

    found_urls = re.findall(WEB_URL_REGEX, sekretariat)
    email_host = re.findall(r"@([a-zA-Z.\-_]+)", sekretariat)

    if found_urls:
        # Prefer the longest URL found in the text.
        homepage = max(found_urls, key=len)
        if not re.match('^https?://', homepage):
            homepage = 'http://' + homepage
    else:
        # Otherwise use the first e-mail host that is not excluded.
        homepage = None
        for host in email_host:
            if host not in excluded_mail_hosts:
                homepage = ("http://" + host).upper()
                break

    if not organisation_id:
        print("\n-- Neue parlamentarische Gruppe: '{}'".format(name_de))
        insert_sql = sql_statement_generator.insert_parlamentarische_gruppe(
            name_de, name_fr, name_it, sekretariat, adresse_str,
            adresse_zusatz, adresse_plz, adresse_ort, homepage, alias,
            batch_time)
        print(insert_sql)
        summary.organisation_added()
    else:
        db_sekretariat = db.get_organisation_sekretariat(conn, organisation_id)

        db_sekretariat_line = ''
        if db_sekretariat:
            db_sekretariat_line = '; '.join(db_sekretariat.splitlines())

        if db_sekretariat_line != sekretariat_line:
            if not db_sekretariat:
                summary.sekretariat_added()
                print("-- Sekretariat der Gruppe '{}' hinzugefügt".format(name_de))
            else:
                summary.sekretariat_changed()
                print('-- Sekretariat alt: ' + db_sekretariat_line)
                print('-- Sekretariat neu: ' + sekretariat_line)
                print("-- Sekretariat der Gruppe '{}' geändert".format(name_de))
            print(sql_statement_generator.update_sekretariat_organisation(
                organisation_id, sekretariat, batch_time))

        db_adresse = db.get_organisation_adresse(conn, organisation_id)

        if db_adresse != adresse:
            if not db_adresse:
                summary.adresse_added()
                print("-- Adresse der Gruppe {} hinzugefügt".format(name_de))
            else:
                summary.adresse_changed()
                print("-- Adresse der Gruppe '{}' geändert von {} zu {}".format(name_de, db_adresse, adresse))
            print('-- Sekretariat: ' + sekretariat.replace('\n', '; '))
            print(sql_statement_generator.update_adresse_organisation(
                organisation_id, adresse_str, adresse_zusatz, adresse_plz,
                adresse_ort, batch_time))

        db_homepage = db.get_organisation_homepage(conn, organisation_id)

        if db_homepage != homepage:
            if not db_homepage:
                summary.website_added()
                print("-- Website der Gruppe '{}' hinzugefügt: '{}'".format(name_de, homepage))
            else:
                summary.website_changed()
                print("-- Website der Gruppe '{}' geändert von '{}' zu '{}'".format(name_de, '\\n'.join(db_homepage.splitlines()), homepage))
            print(sql_statement_generator.update_homepage_organisation(
                organisation_id, homepage, batch_time))

        db_alias = db.get_organisation_alias(conn, organisation_id)

        if db_alias != alias:
            if not db_alias:
                summary.alias_added()
                print("-- Alias der Gruppe '{}' hinzugefügt: '{}'".format(name_de, alias))
            else:
                summary.alias_changed()
                print("-- Alias der Gruppe '{}' geändert von '{}' zu '{}'".format(name_de, '\\n'.join(db_alias.splitlines()), alias))
            print(sql_statement_generator.update_alias_organisation(
                organisation_id, alias, batch_time))
Пример #9
0
def read_groups(filename):
    """Parse the CSV export of the group document into one tuple per group.

    Every non-empty CSV row is joined into one line and passed through
    ``clean_str`` and ``clean_whitespace``. The lines are then walked with a
    small state machine (``reading_mode``) that sorts them into titles,
    presidents, sekretariat, constitution date, purpose, activities and
    member list. A new group starts on the first line after a page marker
    ("Seite x / y") unless that line continues the previous group.

    Args:
        filename: path of the UTF-8 encoded CSV file to read.

    Returns:
        A list of ``(titles, presidents, sekretariat, konstituierung, zweck,
        art_der_aktivitaeten, mitglieder)`` tuples. ``presidents`` holds
        ``(name, title)`` pairs and ``mitglieder`` holds ``(line, None)``
        pairs.
    """
    groups = []

    is_new_page = True
    page = 1
    reading_mode = None
    titles = []
    presidents = []
    sekretariat = []
    konstituierung = None
    zweck = []
    art_der_aktivitaeten = []
    mitglieder = []

    # csv.reader is lazy, so the lines are materialised while the file is
    # still open; the handle is closed as soon as the block is left.
    with open(filename, encoding="utf-8") as csv_file:
        lines = [
            clean_whitespace(clean_str(' '.join(row)))
            for row in csv.reader(csv_file)
            if ''.join(row).strip() != ''
        ]

    for i, line in enumerate(lines):

        match_page = re.search(r'Seite\s*(\d+)\s*/\s*\d+', line)
        if line == '' or line.startswith('Fortsetzung:') or line.lower() in [
                'folgt', 'vakant'
        ]:
            continue
        elif match_page:
            is_new_page = True
            new_page = match_page.group(1)
            # NOTE(review): this compares an int with a str, so it can never
            # fail. It is kept unchanged because the intended check is not
            # clear from this function - confirm and replace.
            assert page + 1 != new_page, "Page numbers not succeding, current={}, new={}".format(
                page, new_page)
            page = int(new_page)
            continue

        # Look-ahead line; empty for the very last line. The earlier bound
        # (`i < len(lines) - 2`) left this stale for the last two lines and
        # unbound for inputs with fewer than three lines.
        next_line = lines[i + 1] if i + 1 < len(lines) else ''

        if is_new_page:
            is_new_page = False
            if line.startswith(('Mitgliederliste', 'Konstituierung',
                                'Art der ', 'Zweck:')):
                # not a new group on the new page
                continue
            elif line.startswith(MEMBER_LINE_START) or next_line.startswith(
                    MEMBER_LINE_START):
                # continue normally as it is a group member
                pass
            else:
                # save the group collected so far
                if titles:
                    if not presidents:
                        print("-- WARN: no presidents for group '{}'".format(
                            titles[0]))
                    groups.append(
                        (titles, presidents, sekretariat, konstituierung,
                         zweck, art_der_aktivitaeten, mitglieder))
                reading_mode = ReadingMode.TITLE
                titles = []
                presidents = []
                sekretariat = []
                konstituierung = None
                zweck = []
                art_der_aktivitaeten = []
                mitglieder = []

        # reading_mode checks must be in reverse order as in the document
        if line.startswith('Mitgliederliste'):
            reading_mode = ReadingMode.MITGLIEDERLISTE

        elif line.startswith('Konstituierung:'):
            reading_mode = None
            text = line.replace('Konstituierung:', '').replace(
                'Le',
                '').replace(', cf courrier de création en 3 langues ci-joint',
                            '').replace('in Bern', '').strip()
            if text and text not in ['--', '-']:
                konstituierung = text

        elif line.startswith(
                'Art der '
        ) or reading_mode == ReadingMode.ART_DER_AKTIVITAETEN:
            reading_mode = ReadingMode.ART_DER_AKTIVITAETEN
            text = replace_bullets(
                re.sub(r'(Art der|geplanten|Aktivitäten:)', '', line))
            if text:
                art_der_aktivitaeten.append(text)

        elif line.startswith('Zweck:') or reading_mode == ReadingMode.ZWECK:
            reading_mode = ReadingMode.ZWECK
            text = replace_bullets(
                line.replace('Zweck:', '').replace('--', ''))
            if text:
                zweck.append(text)

        elif is_sekretariat(line) or reading_mode == ReadingMode.SEKRETARIAT:
            reading_mode = ReadingMode.SEKRETARIAT
            sekretariat += extract_sekretariat(line)

        # avoid reading on the second line when "Co-" and the president
        # title are split over two lines (PG Mehrsprachigkeit CH)
        elif is_president(
                line) and reading_mode != ReadingMode.PRESIDENTS_SKIP_NEXT:
            if line.startswith('Co- '):
                reading_mode = ReadingMode.PRESIDENTS_SKIP_NEXT
                president_title = extract_president_title('Co-' + next_line)
            else:
                reading_mode = ReadingMode.PRESIDENTS
                president_title = extract_president_title(line)
            for president in extract_presidents(line):
                presidents.append((fix_parlamentarian_name_typos(president),
                                   president_title))

        elif reading_mode in (ReadingMode.PRESIDENTS,
                              ReadingMode.PRESIDENTS_SKIP_NEXT):
            # continuation line: reuses the title found on an earlier line
            reading_mode = ReadingMode.PRESIDENTS
            for president in extract_presidents(line):
                presidents.append((fix_parlamentarian_name_typos(president),
                                   president_title))

        elif reading_mode == ReadingMode.MITGLIEDERLISTE and line.startswith(
                MEMBER_LINE_START):
            mitglieder.append((fix_parlamentarian_name_typos(line).replace(
                'S ', 'SR ').replace('N ', 'NR '), None))

        elif reading_mode == ReadingMode.TITLE:
            titles.append(line)

    # save last group
    # NOTE(review): unlike the in-loop save above, a final group without
    # presidents is dropped here without a warning - confirm this is wanted.
    if titles and presidents:
        groups.append((titles, presidents, sekretariat, konstituierung, zweck,
                       art_der_aktivitaeten, mitglieder))

    # print counts for sanity check
    # NOTE(review): len(gruppe) is the length of the 7-field tuple, so the
    # second number is 7 * len(groups), not a member count. Left unchanged
    # because it is unclear which field should be counted.
    print("\n{} parlamentarische Gruppen\n"
          "{} total members of parlamentarische Gruppen".format(
              len(groups), sum(len(gruppe) for gruppe in groups)))

    return groups