def retrieve():
    """Scrape the Goodreads top-books list and dump each book as a Product.

    Fetches the list page, then the detail page of (at most) the first 100
    books, extracts title/description/buy-link/genres/cover image and calls
    Product(...).dump() for each. Pages with missing elements are skipped
    with a printed error (best-effort scraping).
    """
    list_link = "{}{}".format(BASE_URL, BOOK_LIST)
    # retrieve_data presumably caches the download under the given filename
    # -- TODO confirm against its definition.
    html = retrieve_data("goodreads.top-books.html", list_link)
    soup = bs(html, "html.parser")
    rows = soup.find_all("tr", {"itemtype": "http://schema.org/Book"})
    for row in rows[:100]:  # only the first 100 books
        link = row.find("div", {"data-resource-type": "Book"}).a["href"]
        book_link = "{}{}".format(BASE_URL, link)
        fname = "{}.{}.html".format("goodreads", link_to_fname(link))
        print("Fetching {}...".format(book_link))
        html = retrieve_data(fname, book_link)
        try:
            soup = bs(html, "html.parser")
            title = soup.find("h1", {"id": "bookTitle"}).get_text()
            title = clean_whitespace(title)
            # The last <span> under div#description holds the full text
            # (earlier spans are the truncated preview).
            description = soup.select("div#description span")[-1].get_text()
            description = clean_whitespace(description)
            # NOTE(review): 'link' is re-bound here to the buy-button URL;
            # the Product link below is built from this value, not from the
            # detail-page link -- confirm that is intended.
            link = soup.find("a", {"id": "buyButton"})["href"]
            genres = soup.select(".left .bookPageGenreLink")
            genres = [clean_whitespace(genre.get_text()) for genre in genres]
            image = soup.find("img", {"id": "coverImage"})["src"]
            if not image.startswith("http"):
                # Relative image URL: prefix with the site base.
                image = "{}{}".format(BASE_URL, image)
            product = Product(title, "{}{}".format(BASE_URL, link), image,
                              "books", genres, description)
            product.dump()
        except Exception as e:
            # Best-effort: a malformed page must not abort the whole run.
            print("ERROR:", e)
        print("")
def normalize_namen(groups):
    """Merge the raw title lines of each group into one German, one French
    and one Italian title.

    The first title line is taken as German; each further line is assigned
    to a language via guess_language(). When the guess is inconclusive the
    line is appended to the language tracked in last_language.

    :param groups: iterable of (titles, members, sekretariat) tuples, where
        titles is the list of raw title lines.
    :return: list of (title_de, title_fr, title_it, members, sekretariat).
    """
    new_groups = []
    for titles, members, sekretariat in groups:
        title_de = titles[0]
        title_fr = ""
        title_it = ""
        # The rumantsch group is the only group that is actually named in
        # four separate languages. Hack around it.
        if "rumantscha" in titles[0] and len(titles) == 4:
            titles = titles[:-1]
        # NOTE(review): last_language is never reassigned inside the loop,
        # so the inconclusive-guess fallback below always appends to the
        # German title -- confirm whether 'last_language = language' is
        # missing at the end of the loop body.
        last_language = 'de'
        for title in titles[1:]:
            # 'Italianità' biases the guesser towards Italian, so strip it
            # before guessing the line's language.
            title_removed_misleading_words = title.replace('Italianità', '')
            language = guess_language(title_removed_misleading_words,
                                      ['de', 'fr', 'it'])
            # Override wrong language guessing
            if title == 'Pro Balticum':
                language = 'fr'
            # NOTE(review): the matched-language branches append without a
            # separating space, unlike the fallback branches below --
            # confirm whether a joining space is intended here as well.
            if language == "de":
                title_de += title.strip()
            elif language == "fr":
                title_fr += title.strip()
            elif language == "it":
                title_it += title.strip()
            else:
                # Inconclusive guess: treat as a continuation line.
                if last_language == "de":
                    title_de += " " + title.strip()
                elif last_language == "fr":
                    title_fr += " " + title.strip()
                elif last_language == "it":
                    title_it += " " + title.strip()
        new_groups.append(
            (clean_whitespace(title_de), clean_whitespace(title_fr),
             clean_whitespace(title_it), members, sekretariat))
    return new_groups
def normalize_namen(groups):
    """Normalize group titles into (de, fr, it) and sanity-check languages.

    Each input tuple is (titles, members, sekretariat, konstituierung,
    zweck, art_der_aktivitaeten, mitgliederliste) where ``titles`` holds up
    to three entries in the order German, French, Italian. A warning is
    printed when the guessed language of a French/Italian title disagrees
    with its expected slot.

    :return: list of tuples with the three cleaned titles followed by the
        remaining fields unchanged.
    """
    new_groups = []
    for (titles, members, sekretariat, konstituierung, zweck,
         art_der_aktivitaeten, mitgliederliste) in groups:
        title_de = titles[0]
        title_fr = titles[1] if len(titles) > 1 else None
        title_it = titles[2] if len(titles) > 2 else None
        # Hoisted: guess each title's language once instead of twice.
        if title_fr:
            guessed = guess_language(title_fr, ['de', 'fr', 'it'])
            if guessed not in ['fr', 'UNKNOWN']:
                # Fixed typo in the warning message ("lanuage").
                print("Warning: title_fr '{}' guess language is guessed "
                      "'{}'\n".format(title_fr, guessed))
        if title_it:
            guessed = guess_language(title_it, ['de', 'fr', 'it'])
            if guessed not in ['it', 'UNKNOWN']:
                print("Warning: title_it '{}' guess language is guessed "
                      "'{}'\n".format(title_it, guessed))
        # NOTE(review): title_fr/title_it may be None here; assumes
        # clean_whitespace tolerates None -- confirm.
        new_groups.append(
            (clean_whitespace(title_de), clean_whitespace(title_fr),
             clean_whitespace(title_it), members, sekretariat,
             konstituierung, zweck, art_der_aktivitaeten, mitgliederliste))
    return new_groups
def retrieve_products_for_interest(interest):
    """Scrape the Uncommon Goods gift list for one interest category and
    dump each product as a Product.

    Fetches the category list page, then the detail page of (at most) the
    first 100 products; extracts title, story text, main image, price and
    derived tags, and calls Product(...).dump(). Pages with missing
    elements are skipped with a printed error (best-effort scraping).

    :param interest: category slug used both in the list URL and as the
        product's primary tag.
    """
    list_url = "{}{}/{}-gifts{}".format(BASE_URL, LIST_URL, interest,
                                        QUERY_STR)
    html = retrieve_data("uncommon-goods.{}.html".format(interest), list_url)
    soup = bs(html, "html.parser")
    prod_links = [link["href"] for link in soup.select("article.product a")]
    for link in prod_links[:100]:  # only the first 100 products
        prod_link = "{}{}".format(BASE_URL, link)
        fname = "{}.{}.html".format("uncommon-goods", link_to_fname(link))
        print("Fetching {}...".format(prod_link))
        html = retrieve_data(fname, prod_link)
        soup = bs(html, "html.parser")
        try:
            title = soup.find("h1", {"itemprop": "name"}).get_text()
            title = clean_whitespace(title)
            description = soup.select_one(".theStoryCopy p").get_text()
            description = clean_whitespace(description)
            image = soup.select_one("a#mainImage img")["src"]
            if not image.startswith("http"):
                # Relative image URL: prefix with the site base.
                image = "{}{}".format(BASE_URL, image)
            # NOTE(review): assumes the price text is a bare number after
            # whitespace cleanup (no currency symbol) -- confirm.
            price = soup.find("span", {"itemprop": "price"}).get_text()
            price = float(clean_whitespace(price))
            # Secondary tags are derived from the description text.
            tags = get_tags(description)
            product = Product(title, "{}{}".format(BASE_URL, link), image,
                              interest, tags, description, price=price)
            product.dump()
        except Exception as e:
            # Best-effort: a malformed page must not abort the whole run.
            print("ERROR:", e)
        print("")
def retrieve():
    """Scrape the IMDb top-films list and dump each film as a Product.

    Fetches the list page, then the detail page of (at most) the first 100
    films, extracts title/plot summary/poster/watch-link/genres and calls
    Product(...).dump() for each. Pages with missing elements are skipped
    with a printed error (best-effort scraping).
    """
    list_link = "{}{}".format(BASE_URL, FILM_LIST)
    html = retrieve_data("imdb.top-films.html", list_link)
    soup = bs(html, "html.parser")
    film_links = soup.select("tbody.lister-list tr .titleColumn a")
    film_links = [link["href"] for link in film_links]
    for link in film_links[:100]:  # only the first 100 films
        film_link = "{}{}".format(BASE_URL, link)
        fname = "{}.{}.html".format("imdb", link_to_fname(link))
        print("Fetching {}...".format(film_link))
        html = retrieve_data(fname, film_link)
        soup = bs(html, "html.parser")
        try:
            title = soup.select_one(".title_wrapper h1").get_text()
            title = clean_whitespace(title)
            # 'description' is first the tag, then re-bound to its cleaned
            # text.
            description = soup.select_one(".plot_summary .summary_text")
            description = clean_whitespace(description.get_text())
            image = soup.select_one(".poster a img")["src"]
            if not image.startswith("http"):
                # Relative image URL: prefix with the site base.
                image = "{}{}".format(BASE_URL, image)
            # NOTE(review): 'link' is re-bound here to the watch-option URL;
            # the Product link below is built from this value, not from the
            # detail-page link -- confirm that is intended.
            link = soup.select_one(".winner-option.watch-option")["data-href"]
            genres = soup.select(".title_wrapper .subtext a[href^=\"/genre\"]")
            genres = [clean_whitespace(genre.get_text()) for genre in genres]
            product = Product(title, "{}{}".format(BASE_URL, link), image,
                              "films", genres, description)
            product.dump()
        except Exception as e:
            # Best-effort: a malformed page must not abort the whole run.
            print("ERROR:", e)
        print("")
def __init__(self, name, link, image, primary_tag, secondary_tags,
             description, price=None):
    """Create a product record.

    Secondary tags are whitespace-cleaned, lowercased and deduplicated;
    price is optional and defaults to None.
    """
    super(Product, self).__init__()
    self.name = name
    self.link = link
    self.image = image
    self.primary_tag = primary_tag
    # Normalise each tag and drop duplicates via a set comprehension.
    self.secondary_tags = list(
        {clean_whitespace(tag).lower() for tag in secondary_tags})
    self.description = description
    self.price = price
def handle_homepage_and_sekretariat(group, name_de, name_fr, name_it,
                                    organisation_id, summary, conn,
                                    batch_time):
    """Derive address, homepage and alias from a group's sekretariat block
    and print SQL to insert a new organisation or update a changed one.

    When organisation_id is falsy a new parlamentarische Gruppe is inserted;
    otherwise the stored sekretariat/adresse/homepage/alias are compared
    against the freshly-derived values and update statements are printed for
    every difference, with the summary counters updated accordingly.
    All SQL goes to stdout via print, nothing is executed directly here.
    """
    sekretariat = "\n".join(group["sekretariat"])
    sekretariat_line = '; '.join(sekretariat.splitlines())
    # Split the sekretariat into address-like fragments: semicolons and
    # commas also act as line separators.
    sekretariat_list = "\n".join(group["sekretariat"]).replace(
        '; ', '\n').replace(';', '\n').replace(', ', '\n').replace(
        ',', '\n').splitlines()
    (adresse_str_list, adresse_zusatz_list, adresse_plz, adresse_ort,
     alias_list) = ([], [], None, None, [])
    # The order is important. The last match wins. Convenient, addresses
    # are built like that.
    for line_raw in sekretariat_list:
        line = clean_whitespace(line_raw)
        if re.search(r'@\w+\.\w+', line):
            # We already reached the email address; the address block is
            # over.
            break
        # Heuristic: street-like line (street/road keywords in de/fr/it, or
        # a leading house number).
        m_str = re.search(
            r'(strasse|gasse|weg|rain|graben|gebäude\b|park|platz|zentrum|av\.|chemin|rue|quai|route|via|Technopôle|Bollwerk)|^\d{1,3} [a-z]+',
            line, re.IGNORECASE)
        # Heuristic: address-supplement line (P.O. box, c/o, organisation
        # and role keywords, legal forms, ...).
        m_zusatz = re.search(
            r'(Postfach|Case postale|Casella postale|Botschaft|c/o|p\.a\.|Schweiz|Suisse|Swiss|Svizzera|Schweizer|Schweizerischer|Schweizerische|Herr|Frau|Monsieur|Madame|Dr\. | AG|\.ag| SA|GmbH|Sàrl|Ltd|Public|Swiss|Pro|relazioni|Repubblica|Cancelleria|Lia|Koalition|Forum|International|Institut|\bHaus\b|Stiftung|Verein|verband|vereinigung|forum|Gesellschaft|Association|Fédération|Sekretariat|sekretär|Geschäft|Vereinigung|Collaborateur|Bewegung|Minister|Direktor|präsident|Assistent|Délégation|Comité|national|Mesdames|Messieurs|industrie|Inclusion|organisation|Partner|Center|Netzwerk|[^.]com|Vauroux|furrerhugi|Burson|konferenz|bewegung|\.iur|rat|Leiter|Kommunikation)',
            line, re.IGNORECASE)
        # Heuristic: postal code (4-5 digits) followed by the town name.
        m_ort = re.search(r'(\d{4,5}) ([\w. ]+)', line, re.IGNORECASE)
        # Heuristic: 3-5 uppercase letters are treated as an acronym/alias.
        m_alias = re.findall(r'\b([A-Z]{3,5})\b', line)
        if m_str:
            adresse_str_list.append(line)
        if m_zusatz:
            adresse_zusatz_list.append(line)
        if m_ort:
            adresse_plz = m_ort.group(1)
            adresse_ort = m_ort.group(2)
            # PLZ/Ort is the last address component; stop scanning.
            break
        if m_alias:
            for alias in m_alias:
                # Party acronyms are not organisation aliases.
                if alias not in ('CVP', 'EVP', 'FDP', 'SVP', 'GLP'):
                    alias_list.append(alias)
    if adresse_str_list:
        adresse_str = "; ".join(adresse_str_list)
    else:
        adresse_str = None
    if adresse_zusatz_list:
        adresse_zusatz = "; ".join(adresse_zusatz_list)
        if len(adresse_zusatz) > 150:
            # 'line' here is the last line processed by the loop above.
            print("ERROR 'adresse_zusatz' TOO LONG: " +
                  str(len(adresse_zusatz)))
            print("Line: " + line)
            print("Adresse_zusatz: " + adresse_zusatz)
    else:
        adresse_zusatz = None
    if alias_list:
        alias = "; ".join(alias_list)
    else:
        alias = None
    adresse = (adresse_str, adresse_zusatz, adresse_plz, adresse_ort)
    homepage = re.findall(WEB_URL_REGEX, sekretariat)
    email_host = re.findall(r"@([a-zA-Z.\-_]+)", sekretariat)
    if homepage is not None and len(homepage) > 0:
        # Several URL matches possible; take the longest one.
        homepage = max(homepage, key=len)
        if not re.match('^https?://', homepage):
            homepage = 'http://' + homepage
    elif email_host:
        # No URL found: fall back to the domain of a non-generic e-mail
        # address.
        homepage = None
        for host in email_host:
            if host not in ('parl.ch', 'bluewin.ch', 'gmail.com',
                            'yahoo.com', 'yahoo.de', 'yahoo.fr', 'gmx.ch',
                            'gmx.net', 'gmx.de', 'swissonline.ch',
                            'hotmail.com', 'bluemail.ch', 'outlook.com'):
                # NOTE(review): .upper() produces an all-caps URL -- confirm
                # this is intended.
                homepage = ("http://" + host).upper()
                # print("-- Benutze Domain von E-Mail-Adresse: " + homepage)
                break
    else:
        homepage = None
    sekretariat_line = '; '.join(sekretariat.splitlines())
    if not organisation_id:
        # New organisation: print the full insert statement.
        print("\n-- Neue parlamentarische Gruppe: '{}'".format(name_de))
        print(
            sql_statement_generator.insert_parlamentarische_gruppe(
                name_de, name_fr, name_it, sekretariat, adresse_str,
                adresse_zusatz, adresse_plz, adresse_ort, homepage, alias,
                batch_time))
        summary.organisation_added()
        # SQL-side variable referring to the row just inserted.
        organisation_id = '@last_parlamentarische_gruppe'
    else:
        # Existing organisation: diff each derived field against the DB.
        db_sekretariat = db.get_organisation_sekretariat(conn,
                                                        organisation_id)
        if db_sekretariat:
            db_sekretariat_line = '; '.join(db_sekretariat.splitlines())
        else:
            db_sekretariat_line = ''
        if db_sekretariat_line != sekretariat_line:
            if db_sekretariat:
                summary.sekretariat_changed()
                print('-- Sekretariat alt: ' + db_sekretariat_line)
                print('-- Sekretariat neu: ' + sekretariat_line)
                print("-- Sekretariat der Gruppe '{}' geändert".format(
                    name_de))
            else:
                # Same code as in new organisation
                summary.sekretariat_added()
                print("-- Sekretariat der Gruppe '{}' hinzugefügt".format(
                    name_de))
            print(
                sql_statement_generator.update_sekretariat_organisation(
                    organisation_id, sekretariat, batch_time))
        db_adresse = db.get_organisation_adresse(conn, organisation_id)
        if db_adresse != adresse:
            if db_adresse:
                summary.adresse_changed()
                print(
                    "-- Adresse der Gruppe '{}' geändert von {} zu {}".format(
                        name_de, db_adresse, adresse))
            else:
                # Same code as in new organisation
                summary.adresse_added()
                print("-- Adresse der Gruppe {} hinzugefügt".format(name_de))
                print('-- Sekretariat: ' + sekretariat.replace('\n', '; '))
            print(
                sql_statement_generator.update_adresse_organisation(
                    organisation_id, adresse_str, adresse_zusatz,
                    adresse_plz, adresse_ort, batch_time))
        db_homepage = db.get_organisation_homepage(conn, organisation_id)
        if db_homepage != homepage:
            if db_homepage:
                summary.website_changed()
                print("-- Website der Gruppe '{}' geändert von '{}' zu '{}'".
                      format(name_de, '\\n'.join(db_homepage.splitlines()),
                             homepage))
            else:
                # Same code as in new organisation
                summary.website_added()
                print("-- Website der Gruppe '{}' hinzugefügt: '{}'".format(
                    name_de, homepage))
            print(
                sql_statement_generator.update_homepage_organisation(
                    organisation_id, homepage, batch_time))
        db_alias = db.get_organisation_alias(conn, organisation_id)
        if db_alias != alias:
            if db_alias:
                summary.alias_changed()
                print("-- Alias der Gruppe '{}' geändert von '{}' zu '{}'".
                      format(name_de, '\\n'.join(db_alias.splitlines()),
                             alias))
            else:
                # Same code as in new organisation
                summary.alias_added()
                print("-- Alias der Gruppe '{}' hinzugefügt: '{}'".format(
                    name_de, alias))
            print(
                sql_statement_generator.update_alias_organisation(
                    organisation_id, alias, batch_time))
def handle_homepage_and_sekretariat(group, name_de, name_fr, name_it,
                                    organisation_id, summary, conn,
                                    batch_time):
    """Derive address, homepage and alias from a group's sekretariat block
    and print SQL to insert a new organisation or update a changed one.

    NOTE(review): this is a near-duplicate of another
    handle_homepage_and_sekretariat in this source -- consider unifying.

    When organisation_id is falsy a new parlamentarische Gruppe is inserted;
    otherwise stored sekretariat/adresse/homepage/alias are compared against
    the freshly-derived values and update statements are printed for every
    difference, with summary counters updated accordingly. All SQL goes to
    stdout via print, nothing is executed directly here.
    """
    sekretariat = "\n".join(group["sekretariat"])
    sekretariat_line = '; '.join(sekretariat.splitlines())
    # Split the sekretariat into address-like fragments: semicolons and
    # commas also act as line separators.
    sekretariat_list = "\n".join(group["sekretariat"]).replace(
        '; ', '\n').replace(';', '\n').replace(', ', '\n').replace(
        ',', '\n').splitlines()
    (adresse_str_list, adresse_zusatz_list, adresse_plz, adresse_ort,
     alias_list) = ([], [], None, None, [])
    # The order is important. The last match wins. Convenient, addresses
    # are built like that.
    for line_raw in sekretariat_list:
        line = clean_whitespace(line_raw)
        if re.search(r'@\w+\.\w+', line):
            # We already reached the email address; the address block is
            # over.
            break
        # Heuristic: street-like line (street/road keywords in de/fr/it, or
        # a leading house number).
        m_str = re.search(
            r'(strasse|gasse|weg|rain|graben|gebäude\b|park|platz|zentrum|av\.|chemin|rue|quai|route|via|Technopôle|Bollwerk)|^\d{1,3} [a-z]+',
            line, re.IGNORECASE)
        # Heuristic: address-supplement line (P.O. box, c/o, organisation
        # and role keywords, legal forms, ...).
        m_zusatz = re.search(
            r'(Postfach|Case postale|Casella postale|Botschaft|c/o|p\.a\.|Schweiz|Suisse|Swiss|Svizzera|Schweizer|Schweizerischer|Schweizerische|Herr|Frau|Monsieur|Madame|Dr\. | AG|\.ag| SA|GmbH|Sàrl|Ltd|Public|Swiss|Pro|relazioni|Repubblica|Cancelleria|Lia|Koalition|Forum|International|Institut|\bHaus\b|Stiftung|Verein|verband|vereinigung|forum|Gesellschaft|Association|Fédération|Sekretariat|sekretär|Geschäft|Vereinigung|Collaborateur|Bewegung|Minister|Direktor|präsident|Assistent|Délégation|Comité|national|Mesdames|Messieurs|industrie|Inclusion|organisation|Partner|Center|Netzwerk|[^.]com|Vauroux|furrerhugi|Burson|konferenz|bewegung|\.iur|rat|Leiter|Kommunikation)',
            line, re.IGNORECASE)
        # Heuristic: postal code (4-5 digits) followed by the town name.
        m_ort = re.search(r'(\d{4,5}) ([\w. ]+)', line, re.IGNORECASE)
        # Heuristic: 3-5 uppercase letters are treated as an acronym/alias.
        m_alias = re.findall(r'\b([A-Z]{3,5})\b', line)
        if m_str:
            adresse_str_list.append(line)
        if m_zusatz:
            adresse_zusatz_list.append(line)
        if m_ort:
            adresse_plz = m_ort.group(1)
            adresse_ort = m_ort.group(2)
            # PLZ/Ort is the last address component; stop scanning.
            break
        if m_alias:
            for alias in m_alias:
                # Party acronyms are not organisation aliases.
                if alias not in ('CVP', 'EVP', 'FDP', 'SVP', 'GLP'):
                    alias_list.append(alias)
    if adresse_str_list:
        adresse_str = "; ".join(adresse_str_list)
    else:
        adresse_str = None
    if adresse_zusatz_list:
        adresse_zusatz = "; ".join(adresse_zusatz_list)
        if len(adresse_zusatz) > 150:
            # 'line' here is the last line processed by the loop above.
            print("ERROR 'adresse_zusatz' TOO LONG: " +
                  str(len(adresse_zusatz)))
            print("Line: " + line)
            print("Adresse_zusatz: " + adresse_zusatz)
    else:
        adresse_zusatz = None
    if alias_list:
        alias = "; ".join(alias_list)
    else:
        alias = None
    adresse = (adresse_str, adresse_zusatz, adresse_plz, adresse_ort)
    homepage = re.findall(WEB_URL_REGEX, sekretariat)
    email_host = re.findall(r"@([a-zA-Z.\-_]+)", sekretariat)
    if homepage is not None and len(homepage) > 0:
        # Several URL matches possible; take the longest one.
        homepage = max(homepage, key=len)
        if not re.match('^https?://', homepage):
            homepage = 'http://' + homepage
    elif email_host:
        # No URL found: fall back to the domain of a non-generic e-mail
        # address.
        homepage = None
        for host in email_host:
            if host not in ('parl.ch', 'bluewin.ch', 'gmail.com',
                            'yahoo.com', 'yahoo.de', 'yahoo.fr', 'gmx.ch',
                            'gmx.net', 'gmx.de', 'swissonline.ch',
                            'hotmail.com', 'bluemail.ch', 'outlook.com'):
                # NOTE(review): .upper() produces an all-caps URL -- confirm
                # this is intended.
                homepage = ("http://" + host).upper()
                # print("-- Benutze Domain von E-Mail-Adresse: " + homepage)
                break
    else:
        homepage = None
    sekretariat_line = '; '.join(sekretariat.splitlines())
    if not organisation_id:
        # New organisation: print the full insert statement.
        print("\n-- Neue parlamentarische Gruppe: '{}'".format(name_de))
        print(sql_statement_generator.insert_parlamentarische_gruppe(
            name_de, name_fr, name_it, sekretariat, adresse_str,
            adresse_zusatz, adresse_plz, adresse_ort, homepage, alias,
            batch_time))
        summary.organisation_added()
        # SQL-side variable referring to the row just inserted.
        organisation_id = '@last_parlamentarische_gruppe'
    else:
        # Existing organisation: diff each derived field against the DB.
        db_sekretariat = db.get_organisation_sekretariat(conn,
                                                        organisation_id)
        if db_sekretariat:
            db_sekretariat_line = '; '.join(db_sekretariat.splitlines())
        else:
            db_sekretariat_line = ''
        if db_sekretariat_line != sekretariat_line:
            if db_sekretariat:
                summary.sekretariat_changed()
                print('-- Sekretariat alt: ' + db_sekretariat_line)
                print('-- Sekretariat neu: ' + sekretariat_line)
                print("-- Sekretariat der Gruppe '{}' geändert".format(
                    name_de))
            else:
                # Same code as in new organisation
                summary.sekretariat_added()
                print("-- Sekretariat der Gruppe '{}' hinzugefügt".format(
                    name_de))
            print(sql_statement_generator.update_sekretariat_organisation(
                organisation_id, sekretariat, batch_time))
        db_adresse = db.get_organisation_adresse(conn, organisation_id)
        if db_adresse != adresse:
            if db_adresse:
                summary.adresse_changed()
                print("-- Adresse der Gruppe '{}' geändert von {} zu {}".
                      format(name_de, db_adresse, adresse))
            else:
                # Same code as in new organisation
                summary.adresse_added()
                print("-- Adresse der Gruppe {} hinzugefügt".format(name_de))
                print('-- Sekretariat: ' + sekretariat.replace('\n', '; '))
            print(sql_statement_generator.update_adresse_organisation(
                organisation_id, adresse_str, adresse_zusatz, adresse_plz,
                adresse_ort, batch_time))
        db_homepage = db.get_organisation_homepage(conn, organisation_id)
        if db_homepage != homepage:
            if db_homepage:
                summary.website_changed()
                print("-- Website der Gruppe '{}' geändert von '{}' zu '{}'".
                      format(name_de, '\\n'.join(db_homepage.splitlines()),
                             homepage))
            else:
                # Same code as in new organisation
                summary.website_added()
                print("-- Website der Gruppe '{}' hinzugefügt: '{}'"
                      .format(name_de, homepage))
            print(sql_statement_generator.update_homepage_organisation(
                organisation_id, homepage, batch_time))
        db_alias = db.get_organisation_alias(conn, organisation_id)
        if db_alias != alias:
            if db_alias:
                summary.alias_changed()
                print("-- Alias der Gruppe '{}' geändert von '{}' zu '{}'".
                      format(name_de, '\\n'.join(db_alias.splitlines()),
                             alias))
            else:
                # Same code as in new organisation
                summary.alias_added()
                print("-- Alias der Gruppe '{}' hinzugefügt: '{}'"
                      .format(name_de, alias))
            print(sql_statement_generator.update_alias_organisation(
                organisation_id, alias, batch_time))
def read_groups(filename):
    """Parse the parliamentary-groups CSV export into a list of group
    tuples.

    The file is a line-oriented dump paginated with 'Seite N / M' markers.
    A small state machine (reading_mode) assigns each line to titles,
    presidents, sekretariat, konstituierung, zweck, art_der_aktivitaeten or
    the member list.

    :param filename: path of the UTF-8 CSV export.
    :return: list of (titles, presidents, sekretariat, konstituierung,
        zweck, art_der_aktivitaeten, mitglieder) tuples.
    :raises AssertionError: when page numbers do not increase by one.
    """
    groups = []
    # Fixed: close the file deterministically instead of leaking the handle.
    with open(filename, encoding="utf-8") as csv_file:
        lines = [
            clean_whitespace(clean_str(' '.join(row)))
            for row in csv.reader(csv_file) if ''.join(row).strip() != ''
        ]
    is_new_page = True
    page = 1
    reading_mode = None
    titles = []
    presidents = []
    sekretariat = []
    konstituierung = None
    zweck = []
    art_der_aktivitaeten = []
    mitglieder = []
    president_titles = set()
    for i, line in enumerate(lines):
        match_page = re.search(r'Seite\s*(\d+)\s*/\s*\d+', line)
        if line == '' or line.startswith('Fortsetzung:') or line.lower() in [
                'folgt', 'vakant'
        ]:
            continue
        elif match_page:
            is_new_page = True
            # Fixed: the original compared an int against the matched
            # *string* (always unequal), so the assertion could never fire;
            # it also used the inverted operator relative to its message.
            new_page = int(match_page.group(1))
            assert new_page == page + 1, \
                "Page numbers not succeeding, current={}, new={}".format(
                    page, new_page)
            page = new_page
            continue
        # Fixed off-by-one: the last valid lookahead index is len - 2
        # (i + 1 == len - 1), so the bound must be len(lines) - 1.
        if i < len(lines) - 1:
            next_line = lines[i + 1]
        if is_new_page:
            is_new_page = False
            if line.startswith('Mitgliederliste') or line.startswith(
                    'Konstituierung') or line.startswith(
                        'Art der ') or line.startswith('Zweck:'):
                # not a new group on the new page
                continue
            elif line.startswith(MEMBER_LINE_START) or next_line.startswith(
                    MEMBER_LINE_START):
                # continue normally as it is a group member
                pass
            else:
                # A new group starts: save the previous one first.
                if titles:
                    if not presidents:
                        print("-- WARN: no presidents for group '{}'".format(
                            titles[0]))
                    groups.append(
                        (titles, presidents, sekretariat, konstituierung,
                         zweck, art_der_aktivitaeten, mitglieder))
                reading_mode = ReadingMode.TITLE
                titles = []
                presidents = []
                sekretariat = []
                konstituierung = None
                zweck = []
                art_der_aktivitaeten = []
                mitglieder = []
        # reading_mode checks must be in reverse order as in the document
        if line.startswith('Mitgliederliste'):
            reading_mode = ReadingMode.MITGLIEDERLISTE
        elif line.startswith('Konstituierung:'):
            reading_mode = None
            # Renamed the local from 'str' to avoid shadowing the builtin.
            value = line.replace('Konstituierung:', '').replace(
                'Le', '').replace(
                ', cf courrier de création en 3 langues ci-joint',
                '').replace('in Bern', '').strip()
            if value and value not in ['--', '-']:
                konstituierung = value
        elif line.startswith(
                'Art der ') or reading_mode == ReadingMode.ART_DER_AKTIVITAETEN:
            reading_mode = ReadingMode.ART_DER_AKTIVITAETEN
            value = replace_bullets(
                re.sub(r'(Art der|geplanten|Aktivitäten:)', '', line))
            if value:
                art_der_aktivitaeten.append(value)
        elif line.startswith('Zweck:') or reading_mode == ReadingMode.ZWECK:
            reading_mode = ReadingMode.ZWECK
            text = replace_bullets(
                line.replace('Zweck:', '').replace('--', ''))
            if text:
                zweck.append(text)
        elif is_sekretariat(line) or reading_mode == ReadingMode.SEKRETARIAT:
            reading_mode = ReadingMode.SEKRETARIAT
            sekretariat += extract_sekretariat(line)
        # avoid reading on second line, case separate Co-, second line
        # PräsidentInnen (PG Mehrsprachigkeit CH)
        elif is_president(
                line) and reading_mode != ReadingMode.PRESIDENTS_SKIP_NEXT:
            if line.startswith('Co- '):
                # The title continues on the next line ('Co-' split).
                reading_mode = ReadingMode.PRESIDENTS_SKIP_NEXT
                president_title = extract_president_title('Co-' + next_line)
            else:
                reading_mode = ReadingMode.PRESIDENTS
                president_title = extract_president_title(line)
            president_titles.add(president_title)
            for president in extract_presidents(line):
                presidents.append((fix_parlamentarian_name_typos(president),
                                   president_title))
        elif (reading_mode == ReadingMode.PRESIDENTS
              or reading_mode == ReadingMode.PRESIDENTS_SKIP_NEXT):
            reading_mode = ReadingMode.PRESIDENTS
            for president in extract_presidents(line):
                presidents.append((fix_parlamentarian_name_typos(president),
                                   president_title))
        elif reading_mode == ReadingMode.MITGLIEDERLISTE and line.startswith(
                MEMBER_LINE_START):
            # Expand the council abbreviations (S -> SR, N -> NR).
            mitglieder.append((fix_parlamentarian_name_typos(line).replace(
                'S ', 'SR ').replace('N ', 'NR '), None))
        elif reading_mode == ReadingMode.TITLE:
            titles.append(line)
    # save last page
    if titles and presidents:
        groups.append((titles, presidents, sekretariat, konstituierung,
                       zweck, art_der_aktivitaeten, mitglieder))
    # print counts for sanity check
    # Fixed: the original summed len(gruppe), i.e. the tuple arity (always
    # 7), instead of the size of each group's member list.
    print("\n{} parlamentarische Gruppen\n"
          "{} total members of parlamentarische Gruppen".format(
              len(groups), sum(len(gruppe[6]) for gruppe in groups)))
    return groups