def parse_html(html):
    """Parse the parking overview page into the lot-status dictionary.

    Args:
        html: Markup of the page; it is handed to BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated`` (the value returned by
        ``convert_date``) and ``lots``, a list with one dict per table row
        holding ``name``, ``free``, ``total``, ``coords``, ``state``,
        ``id`` and ``forecast``.

    Raises:
        IndexError: If no ``p > strong`` element exists, or a row lacks a
            link or a second cell.
    """
    soup = BeautifulSoup(html, "html.parser")
    date_format = "Stand: %d.%m.%Y - %H:%M:%S"

    # last update time (UTC)
    # Try the last <strong> inside a <p> first and fall back to the one
    # before it when the text does not match the expected format.
    strong_tags = soup.select('p > strong')
    try:
        update_time = convert_date(strong_tags[-1].text, date_format)
    except ValueError:
        update_time = convert_date(strong_tags[-2].text, date_format)

    data = {"last_updated": update_time, "lots": []}

    # every div.listing holds one table of lots
    for listing in soup.find_all("div", {"class": "listing"}):
        # 'tr + tr' only matches rows preceded by another row, which
        # leaves out the first row of each table
        for row in listing.select('tr + tr'):
            lot_name = row.select('a')[0].text
            # first cell that directly follows another cell; evaluated
            # once and reused for both the count and the colour check
            free_cell = row.select('td + td')[0]

            try:
                lot_free = int(free_cell.text)
            except ValueError:
                # non-numeric cell text is reported as zero free spaces
                lot_free = 0

            # The cell markup mentions "green" for open lots. The former
            # try/except ValueError around this check could never fire,
            # so its "nodata" branch was dead code and has been removed.
            lot_state = "open" if "green" in str(free_cell) else "closed"

            lot = geodata.lot(lot_name)
            data["lots"].append({
                "name": lot_name,
                "free": lot_free,
                "total": lot.total,
                "coords": lot.coords,
                "state": lot_state,
                "id": lot.id,
                "forecast": False
            })
    return data
def parse_html(html):
    """Parse the parking overview page into the lot-status dictionary.

    Args:
        html: Markup of the page; it is handed to BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated`` (the value returned by
        ``convert_date``) and ``lots``, a list with one dict per table row
        holding ``name``, ``free``, ``total``, ``coords``, ``state``,
        ``id`` and ``forecast``.

    Raises:
        IndexError: If no ``p > strong`` element exists, or a row lacks a
            link or a second cell.
    """
    soup = BeautifulSoup(html, "html.parser")
    date_format = "Stand: %d.%m.%Y - %H:%M:%S"

    # last update time (UTC)
    # Try the last <strong> inside a <p> first and fall back to the one
    # before it when the text does not match the expected format.
    strong_tags = soup.select('p > strong')
    try:
        update_time = convert_date(strong_tags[-1].text, date_format)
    except ValueError:
        update_time = convert_date(strong_tags[-2].text, date_format)

    data = {"last_updated": update_time, "lots": []}

    # every div.listing holds one table of lots
    for listing in soup.find_all("div", {"class": "listing"}):
        # 'tr + tr' only matches rows preceded by another row, which
        # leaves out the first row of each table
        for row in listing.select('tr + tr'):
            lot_name = row.select('a')[0].text
            # first cell that directly follows another cell; evaluated
            # once and reused for both the count and the colour check
            free_cell = row.select('td + td')[0]

            try:
                lot_free = int(free_cell.text)
            except ValueError:
                # non-numeric cell text is reported as zero free spaces
                lot_free = 0

            # The cell markup mentions "green" for open lots. The former
            # try/except ValueError around this check could never fire,
            # so its "nodata" branch was dead code and has been removed.
            lot_state = "open" if "green" in str(free_cell) else "closed"

            lot = geodata.lot(lot_name)
            data["lots"].append({
                "name": lot_name,
                "free": lot_free,
                "total": lot.total,
                "coords": lot.coords,
                "state": lot_state,
                "id": lot.id,
                "forecast": False
            })
    return data
def parse_html(html):
    """Build the lot-status dictionary from a page made of table rows.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated``, ``data_source`` and ``lots``; each
        lot dict carries ``name``, ``free``, ``total``, ``type``,
        ``address``, ``coords``, ``state``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")

    # the text of the first <p> carries the timestamp in parentheses
    result = {
        "last_updated": convert_date(soup.p.string, "(%d.%m.%Y, %H.%M Uhr)"),
        "data_source": data_source,
        "lots": []
    }

    # every <tr> is one lot: first cell = name, second cell = free count
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        lot_name = cells[0].text
        # static details for this lot; None when the name is not in
        # data_map, in which case the subscripts below raise TypeError
        static_info = data_map.get(lot_name)
        result["lots"].append({
            "name": lot_name,
            "free": int(cells[1].text),
            "total": static_info["total"],
            "type": static_info["type"],
            "address": static_info["address"],
            "coords": geodata.coords(lot_name),
            "state": "nodata",
            "id": generate_id(__file__, lot_name),
            "forecast": False
        })
    return result
def parse_html(html):
    """Build the lot-status dictionary from a page made of table rows.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``free``, ``total``, ``lot_type``, ``address``,
        ``coords``, ``state``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")
    result = {
        "last_updated": convert_date(soup.p.string, "(%d.%m.%Y, %H.%M Uhr)"),
        "lots": []
    }

    for row in soup.find_all("tr"):
        cells = row.find_all("td")

        # a row whose class attribute contains "strike" is reported closed
        if "strike" in row.attrs.get("class", ()):
            state = "closed"
        else:
            state = "open"

        # static lot details are looked up by the text of the first cell
        lot = geodata.lot(cells[0].text)
        result["lots"].append({
            "name": lot.name,
            "free": int(cells[1].text),
            "total": lot.total,
            "lot_type": lot.type,
            "address": lot.address,
            "coords": lot.coords,
            "state": state,
            "id": lot.id,
            "forecast": False
        })
    return result
def parse_html(html):
    """Build the lot-status dictionary from the ``div#parkingList`` table.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``lot_type``, ``free``, ``total``, ``state``,
        ``coords``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")
    rows = soup.select("div#parkingList table")[0].find_all("tr")
    refreshed = soup.find(id="lastRefresh").text.strip()

    result = {
        "last_updated": convert_date(refreshed, "%d.%m.%Y %H:%M Uhr"),
        "lots": []
    }

    # the first and the last row of the table are not lot rows
    for row in rows[1:-1]:
        cells = row.find_all("td")
        raw_name = cells[0].text.strip()
        # process_name yields a pair: index 0 = lot type, index 1 = name
        type_and_name = process_name(raw_name)
        lot = geodata.lot(raw_name)
        result["lots"].append({
            "name": type_and_name[1].strip("\n"),
            "lot_type": type_and_name[0],
            "free": int(cells[1].text),
            "total": lot.total,
            # unknown state texts map to the empty string
            "state": state_map.get(cells[2].text, ""),
            "coords": lot.coords,
            "id": lot.id,
            "forecast": False
        })
    return result
def parse_html(html):
    """Build the lot-status dictionary for the six Bonn lots.

    The free counts are read from the ``td.stell`` cells; the n-th cell is
    matched with entry ``n`` of ``lot_map``.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated``, ``data_source`` and ``lots``; each
        lot dict carries ``name``, ``coords``, ``free``, ``address``,
        ``total``, ``state``, ``id`` and ``forecast``.

    Raises:
        AssertionError: If the page does not contain exactly six
            ``td.stell`` cells.
    """
    soup = BeautifulSoup(html, "html.parser")

    free_cells = soup.find_all("td", {"class": "stell"})
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if len(free_cells) != 6:
        raise AssertionError(
            "Expect to find 6 lots in Bonn, got: %d" % len(free_cells))

    # named so that it does not shadow the ``time`` module
    updated_text = soup.find("td", {"class": "stand"}).text.strip()

    lots = []
    for idx, free in enumerate(free_cells):
        lot = lot_map.get(idx)
        lots.append({
            "name": lot.name,
            "coords": geodata.coords(lot.name),
            "free": int(free.text),
            "address": lot.address,
            "total": lot.total,
            "state": "nodata",
            "id": generate_id(__file__, lot.name),
            "forecast": False
        })

    return {
        "last_updated": convert_date(updated_text, "%d.%m.%y %H:%M:%S"),
        "data_source": data_source,
        "lots": lots
    }
def parse_html(text_content):
    """Build the lot-status dictionary from a plain-text feed.

    The text is split on blank lines (``\\r\\n\\r\\n``). The first chunk is
    the timestamp; every following chunk describes one lot with a
    ``key=value`` line for the name followed by one for the free count.

    Args:
        text_content: The raw text of the feed.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``.

    Raises:
        IndexError: If a non-blank chunk lacks the second line or an ``=``.
        ValueError: If the free count is not an integer.
    """
    chunks = text_content.split("\r\n\r\n")
    data = {
        "last_updated": convert_date(chunks[0], "%d-%m-%Y %H:%M:%S "),
        "lots": []
    }

    for chunk in chunks[1:]:
        # A trailing blank line in the feed produces an empty chunk, which
        # used to fail with IndexError; such chunks carry no lot.
        if not chunk.strip():
            continue

        lines = chunk.split("\r\n")
        name = lines[0].split("=")[1]
        free = int(lines[1].split("=")[1])

        lot = geodata.lot(name)
        data["lots"].append({
            "name": name,
            "free": free,
            "total": lot.total,
            "address": lot.address,
            "coords": lot.coords,
            # the feed is not consulted for an open/closed state here
            "state": "unknown",
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False,
        })
    return data
def parse_html(html):
    """Build the lot-status dictionary for the six Bonn lots.

    The free counts are read from the ``td.stell`` cells; the n-th cell is
    matched with entry ``n`` of ``lot_map``.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated``, ``data_source`` and ``lots``; each
        lot dict carries ``name``, ``coords``, ``free``, ``address``,
        ``total``, ``state``, ``id`` and ``forecast``.

    Raises:
        AssertionError: If the page does not contain exactly six
            ``td.stell`` cells.
    """
    soup = BeautifulSoup(html, "html.parser")

    free_cells = soup.find_all("td", {"class": "stell"})
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if len(free_cells) != 6:
        raise AssertionError(
            "Expect to find 6 lots in Bonn, got: %d" % len(free_cells))

    # named so that it does not shadow the ``time`` module
    updated_text = soup.find("td", {"class": "stand"}).text.strip()

    lots = []
    for idx, free in enumerate(free_cells):
        lot = lot_map.get(idx)
        lots.append({
            "name": lot.name,
            "coords": geodata.coords(lot.name),
            "free": int(free.text),
            "address": lot.address,
            "total": lot.total,
            "state": "nodata",
            "id": generate_id(__file__, lot.name),
            "forecast": False
        })

    return {
        "last_updated": convert_date(updated_text, "%d.%m.%y %H:%M:%S"),
        "data_source": data_source,
        "lots": lots
    }
def parse_html(html):
    """Build the lot-status dictionary from the ``cellpadding=5`` table.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated``, ``data_source`` and ``lots``; each
        lot dict carries ``name``, ``type``, ``free``, ``total``,
        ``state``, ``coords``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")
    rows = soup.select("table[cellpadding=5]")[0].find_all("tr")

    # the last table row holds the timestamp text
    result = {
        "last_updated": convert_date(rows[-1].text.strip(),
                                     "%d.%m.%Y %H:%M Uhr"),
        "data_source": data_source,
        "lots": []
    }

    # skip the first row and the timestamp row at the end
    for row in rows[1:-1]:
        cells = row.find_all("td")
        raw_name = cells[0].text
        # process_name yields a pair: index 0 = lot type, index 1 = name
        type_and_name = process_name(raw_name)
        lot_name = type_and_name[1]
        result["lots"].append({
            "name": lot_name,
            "type": type_and_name[0],
            "free": int(cells[1].text),
            # capacities are keyed by the unprocessed cell text
            "total": total_number_map.get(raw_name, 0),
            "state": state_map.get(cells[2].text, ""),
            "coords": geodata.coords(lot_name),
            "id": generate_id(__file__, lot_name),
            "forecast": False
        })
    return result
def parse_html(html):
    """Build the lot-status dictionary from the per-region tables.

    Every ``<table>`` with a non-empty ``summary`` attribute is one region;
    its rows without a ``<th>`` are lots.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``lots``, ``data_source`` and ``last_updated``; each
        lot dict carries ``coords``, ``name``, ``total``, ``free``,
        ``state``, ``id``, ``lot_type``, ``address``, ``forecast`` and
        ``region``.
    """
    soup = BeautifulSoup(html, "html.parser")
    result = {
        "lots": [],
        "data_source": data_source,
        "last_updated": convert_date(soup.find(id="P1_LAST_UPDATE").text,
                                     "%d.%m.%Y %H:%M:%S")
    }

    # any of these classes on the row's first <div> means "open"
    open_colours = ("green", "yellow", "red")

    for table in soup.find_all("table"):
        region = table["summary"]
        if region == "":
            continue

        for row in table.find_all("tr"):
            # header rows contain a <th>
            if row.find("th") is not None:
                continue

            classes = row.find("div")["class"]
            if any(colour in classes for colour in open_colours):
                state = "open"
            elif "park-closed" in classes:
                state = "closed"
            else:
                state = "nodata"

            lot_name = row.find("td", {"headers": "BEZEICHNUNG"}).text

            try:
                free = int(row.find("td", {"headers": "FREI"}).text)
            except ValueError:
                free = 0

            try:
                total = int(row.find("td", {"headers": "KAPAZITAET"}).text)
            except ValueError:
                # capacity cell is not numeric: fall back to known data
                total = get_most_lots_from_known_data("Dresden", lot_name)

            lot_id = generate_id(__file__, lot_name)
            # a forecast is advertised when a CSV named after the id exists
            has_forecast = os.path.isfile("forecast_data/" + lot_id + ".csv")

            result["lots"].append({
                "coords": geodata.coords(lot_name),
                "name": lot_name,
                "total": total,
                "free": free,
                "state": state,
                "id": lot_id,
                "lot_type": type_map.get(lot_name, ""),
                "address": address_map.get(lot_name, ""),
                "forecast": has_forecast,
                "region": region
            })
    return result
def parse_website_app(html):
    """Build the lot-status dictionary from the per-region tables.

    Every ``<table>`` with a non-empty ``summary`` attribute is one region;
    the region named "Busparkplätze" is left out. Rows without a ``<th>``
    are lots.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``lots`` and ``last_updated``; each lot dict carries
        ``coords``, ``name``, ``total``, ``free``, ``state``, ``id``,
        ``lot_type``, ``address``, ``forecast`` and ``region``.
    """
    soup = BeautifulSoup(html, "html.parser")
    stamp_text = soup.find(id="P1_LAST_UPDATE").text
    result = {
        "lots": [],
        "last_updated": convert_date(stamp_text, "%d.%m.%Y %H:%M:%S")
    }

    for table in soup.find_all("table"):
        region = table["summary"]
        if region == "" or region == "Busparkplätze":
            continue

        for row in table.find_all("tr"):
            # header rows contain a <th>
            if row.find("th") is not None:
                continue

            # the classes of the row's first <div> encode the state
            classes = row.find("div")["class"]
            if any(c in classes for c in ("green", "yellow", "red")):
                state = "open"
            elif "park-closed" in classes:
                state = "closed"
            else:
                state = "nodata"

            lot_name = row.find("td", {"headers": "BEZEICHNUNG"}).text

            try:
                free = int(row.find("td", {"headers": "FREI"}).text)
            except ValueError:
                free = 0

            try:
                total = int(row.find("td", {"headers": "KAPAZITAET"}).text)
            except ValueError:
                # capacity cell is not numeric: fall back to known data
                total = get_most_lots_from_known_data("Dresden", lot_name)

            lot = geodata.lot(lot_name)
            # a forecast is advertised when a CSV named after the id exists
            has_forecast = os.path.isfile("forecast_data/" + lot.id + ".csv")

            result["lots"].append({
                "coords": lot.coords,
                "name": lot_name,
                "total": total,
                "free": free,
                "state": state,
                "id": lot.id,
                "lot_type": lot.type,
                "address": lot.address,
                "forecast": has_forecast,
                "region": region
            })
    return result
def parse_html(html):
    """Build the lot-status dictionary from the ``div.listing`` tables.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated``, ``data_source`` and ``lots``; each
        lot dict carries ``name``, ``free``, ``total``, ``coords``,
        ``state``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")
    stamp_format = "Stand: %d.%m.%Y - %H:%M:%S"

    # last update time (UTC): try the last <strong> inside a <p>, then the
    # one before it when the text does not match the format
    try:
        stamp = convert_date(soup.select('p > strong')[-1].text, stamp_format)
    except ValueError:
        stamp = convert_date(soup.select('p > strong')[-2].text, stamp_format)

    result = {
        "last_updated": stamp,
        "data_source": data_source,
        "lots": []
    }

    for listing in soup.find_all("div", {"class": "listing"}):
        # 'tr + tr' skips the first row of each table
        for row in listing.select('tr + tr'):
            lot_name = row.select('a')[0].text
            try:
                free_cell = row.select('td + td')[0]
                lot_free = int(free_cell.text)
                if "green" in str(free_cell):
                    lot_state = "open"
                else:
                    lot_state = "closed"
            except ValueError:
                # a non-numeric count means there is no usable data
                lot_free, lot_state = 0, "nodata"

            result["lots"].append({
                "name": lot_name,
                "free": lot_free,
                "total": total_number_map.get(lot_name, 0),
                "coords": geodata.coords(lot_name),
                "state": lot_state,
                "id": generate_id(__file__, lot_name),
                "forecast": False
            })
    return result
def parse_html(html):
    """Build the lot-status dictionary from the per-region tables.

    Every ``<table>`` with a non-empty ``summary`` attribute is one region;
    its rows without a ``<th>`` are lots.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``lots`` and ``last_updated``; each lot dict carries
        ``coords``, ``name``, ``total``, ``free``, ``state``, ``id``,
        ``lot_type``, ``address``, ``forecast`` and ``region``.
    """
    soup = BeautifulSoup(html, "html.parser")
    stamp_text = soup.find(id="P1_LAST_UPDATE").text
    result = {
        "lots": [],
        "last_updated": convert_date(stamp_text, "%d.%m.%Y %H:%M:%S")
    }

    for table in soup.find_all("table"):
        region = table["summary"]
        if region == "":
            continue

        for row in table.find_all("tr"):
            # header rows contain a <th>
            if row.find("th") is not None:
                continue

            # the classes of the row's first <div> encode the state
            classes = row.find("div")["class"]
            if any(c in classes for c in ("green", "yellow", "red")):
                state = "open"
            elif "park-closed" in classes:
                state = "closed"
            else:
                state = "nodata"

            lot_name = row.find("td", {"headers": "BEZEICHNUNG"}).text

            try:
                free = int(row.find("td", {"headers": "FREI"}).text)
            except ValueError:
                free = 0

            try:
                total = int(row.find("td", {"headers": "KAPAZITAET"}).text)
            except ValueError:
                # capacity cell is not numeric: fall back to known data
                total = get_most_lots_from_known_data("Dresden", lot_name)

            lot = geodata.lot(lot_name)
            # a forecast is advertised when a CSV named after the id exists
            has_forecast = os.path.isfile("forecast_data/" + lot.id + ".csv")

            result["lots"].append({
                "coords": lot.coords,
                "name": lot_name,
                "total": total,
                "free": free,
                "state": state,
                "id": lot.id,
                "lot_type": lot.type,
                "address": lot.address,
                "forecast": has_forecast,
                "region": region
            })
    return result
def parse_html(html):
    """Build the lot-status dictionary from the table rows of the page.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")

    # The timestamp is cut out of the stringified body: skip the marker
    # plus one more character (22 + 1 = 23) and take the next 16
    # characters, the width of "dd.mm.yyyy HH:MM".
    body_text = str(soup.select("body"))
    start = body_text.find("Letzte Aktualisierung:") + 23
    last_updated = body_text[start:start + 16]

    data = {
        "last_updated": convert_date(last_updated, "%d.%m.%Y %H:%M"),
        "lots": []
    }

    for tr in soup.find_all("tr"):
        # rows without any <td> are not lot rows
        if tr.td is None:
            continue
        td = tr.find_all('td')
        parking_name = td[0].string

        # work-around for the umlaut problem: normalise the two names
        # that contain umlauts by matching their ASCII prefix
        if 'Heiligengeist-' in parking_name:
            parking_name = 'Heiligengeist-Höfe'
        elif 'Schlossh' in parking_name:
            parking_name = 'Schlosshöfe'

        lot = geodata.lot(parking_name)

        parking_state = 'open'
        parking_free = 0
        try:
            if 'Geschlossen' in td[3].text:
                parking_state = 'closed'
            else:
                parking_free = int(td[1].text)
        except (IndexError, ValueError):
            # missing cells or a non-numeric count. This replaces a bare
            # ``except:`` which also swallowed KeyboardInterrupt/SystemExit.
            parking_state = 'nodata'

        data["lots"].append({
            "name": parking_name,
            "free": parking_free,
            "total": lot.total,
            "address": lot.address,
            "coords": lot.coords,
            "state": parking_state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False
        })
    return data
def parse_html(html):
    """Build the lot-status dictionary from a table grouped by region.

    Header rows (containing ``<th>``) set the region for the rows that
    follow. Lot rows have either two cells (name, state text) or four
    cells (name, total, free, ...); rows of any other width are ignored.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated``, ``data_source`` and ``lots``; each
        lot dict carries ``name``, ``type``, ``total``, ``free``,
        ``region``, ``state``, ``coords``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")
    result = {
        "last_updated": convert_date(soup.find("tr").find("strong").text,
                                     "Stand: %d.%m.%Y, %H:%M Uhr"),
        "data_source": data_source,
        "lots": []
    }

    region_header = ""
    # the first row holds the timestamp and is not a lot row
    for row in soup.find_all("tr")[1:]:
        if len(row.find_all("th")) > 0:
            # header row: remember the region for the following lots
            region_header = row.find("th", {"class": "head1"}).text
            continue

        # the "Gesamt" row is skipped
        if row.find("td").text == "Gesamt":
            continue

        cells = row.find_all("td")
        column_count = len(cells)
        if column_count not in (2, 4):
            continue

        # process_name yields a pair: index 0 = lot type, index 1 = name
        type_and_name = process_name(cells[0].text)
        lot_name = type_and_name[1]
        lot_type = type_and_name[0]

        if column_count == 2:
            # no counts on the page: use known capacity and a mapped state
            total = get_most_lots_from_known_data("Lübeck", lot_name)
            free = 0
            state = process_state_map.get(cells[1].text, "")
        else:
            total = int(cells[1].text)
            free = int(cells[2].text)
            state = "open"

        result["lots"].append({
            "name": lot_name,
            "type": lot_type,
            "total": total,
            "free": free,
            "region": region_header,
            "state": state,
            "coords": geodata.coords(lot_name),
            "id": generate_id(__file__, lot_name),
            "forecast": False
        })
    return result
def parse_html(html):
    """Fetch the Heidelberg parking JSON and build the lot-status dict.

    The ``html`` argument is not used: the function issues its own HTTP
    request against the JSON API and parses that response instead.

    Args:
        html: Ignored.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    # NOTE(review): the API key is embedded in the URL; consider moving it
    # into configuration.
    url_hd = "http://parken.heidelberg.de/api-v1/parking-location?api_key=H5WaIyR4lgn6wzo7rJf8u4ubecgpX0Q8"
    header_hd = {
        'Accept': 'application/json; charset=utf-8',
        'User-Agent': 'ParkAPI v%s - Info: %s' % (env.SERVER_VERSION,
                                                   env.SOURCE_REPOSITORY)
    }
    req = urllib.request.Request(url=url_hd, headers=header_hd)
    # the context manager closes the connection once the body is read
    with urllib.request.urlopen(req) as response:
        payload = response.read()
    data_json = json.loads(payload.decode('utf-8'))

    data = {
        # drop everything from the "+" on (the UTC offset) and the one
        # character in front of it before parsing the timestamp
        "last_updated": convert_date(
            data_json['data']['updated'].split("+")[0][:-1],
            '%a, %d %b %Y %H:%M:%S'),
        "lots": []
    }

    for parking_lot in data_json['data']['parkinglocations']:
        # please keep the name in the geojson file in the same form as
        # built here (including spaces)
        parking_name = 'P' + str(parking_lot['uid']) + ' ' + parking_lot['name']
        lot = geodata.lot(parking_name)

        parking_state = 'open'
        parking_free = 0
        try:
            update = parking_lot['parkingupdate']
            if update['status'] == 'closed':
                parking_state = 'closed'
            else:
                parking_free = int(update['total']) - int(update['current'])
        except (KeyError, TypeError, ValueError):
            # update block missing, empty or not numeric. This replaces a
            # bare ``except:`` which also swallowed KeyboardInterrupt.
            parking_state = 'nodata'

        data["lots"].append({
            "name": parking_name,
            "free": parking_free,
            "total": lot.total,
            "address": lot.address,
            "coords": lot.coords,
            "state": parking_state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False,
        })
    return data
def parse_html(html):
    """Build the lot-status dictionary from a JSON feature list.

    Features are matched to the entries of ``geodata.lots`` through the
    ``identifier`` stored in each entry's JSON ``aux`` field. Features that
    cannot be matched or parsed are silently skipped.

    Args:
        html: JSON text with a top-level ``features`` list.

    Returns:
        A dict with ``lots`` and ``last_updated`` (the greatest converted
        feature timestamp); each lot dict carries ``coords``, ``name``,
        ``total``, ``free``, ``state``, ``id``, ``lot_type``, ``address``,
        ``forecast`` and ``region``.

    Raises:
        IndexError: If no feature yielded a timestamp.
    """
    payload = json.loads(html)
    result = {"lots": [], "last_updated": None}

    # index the known lots by the identifier used in the feed
    by_identifier = {}
    for key in geodata.lots:
        entry = geodata.lots[key]
        aux = json.loads(entry.aux)
        by_identifier[aux["identifier"]] = {
            "lot": entry,
            "open": aux["open"]
        }

    timestamps = []
    for feature in payload["features"]:
        try:
            known = by_identifier[feature["attributes"]["IDENTIFIER"]]
            if known["open"]:
                state = "open"
            elif feature["attributes"]["KAPAZITAET"] == -1:
                state = "nodata"
            else:
                state = "unknown"

            lot = known["lot"]
            result["lots"].append({
                "coords": lot.coords,
                "name": lot.name,
                "total": int(lot.total),
                "free": int(feature["attributes"]["KAPAZITAET"]),
                "state": state,
                "id": lot.id,
                "lot_type": lot.type,
                "address": lot.address,
                "forecast": False,
                "region": ""
            })
            timestamps.append(
                convert_date(feature["attributes"]["TIMESTAMP"],
                             "%Y-%m-%d %H:%M:%S"))
        except (KeyError, ValueError):
            # unknown identifier or unparsable value: ignore this feature
            continue

    # the newest timestamp sorts first in descending order
    result["last_updated"] = sorted(timestamps, reverse=True)[0]
    return result
def parse_html(html):
    """Build the lot-status dictionary from the ``carparkContent`` rows.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")

    data = {
        # example text: "Datum: 22.07.2019 - Uhrzeit: 16:57"
        "last_updated": convert_date(
            soup.find('div', class_='col-sm-12').text,
            'Datum: %d.%m.%Y - Uhrzeit: %H:%M'),
        "lots": []
    }

    for one_parking_lot in soup.find_all('div', class_='row carparkContent'):
        location = one_parking_lot.find(
            'div', class_='carparkLocation col-sm-9')
        # the name is the link text when there is a link, otherwise the
        # stripped text of the location div itself
        link = location.find('a')
        if link is not None:
            parking_name = link.text
        else:
            parking_name = location.text.strip()

        lot = geodata.lot(parking_name)

        parking_free = 0
        parking_state = 'open'
        try:
            # example text: "Freie Parkplätze: 195", which splits into
            # ['Freie', 'Parkplätze:', '195']
            free_words = one_parking_lot.find(
                'div', class_='col-sm-5').text.split()
            parking_free = int(free_words[2])
        except (AttributeError, IndexError, ValueError):
            # div missing, too few words, or a non-numeric count. This
            # replaces a bare ``except:`` which also swallowed
            # KeyboardInterrupt/SystemExit.
            parking_state = 'nodata'

        data["lots"].append({
            "name": parking_name,
            "free": parking_free,
            "total": lot.total,
            "address": lot.address,
            "coords": lot.coords,
            "state": parking_state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False
        })
    return data
def parse_html(html):
    """Build the lot-status dictionary from a table grouped by region.

    Header rows (containing ``<th>``) set the region for the rows that
    follow. Lot rows have either two cells (name, state text) or four
    cells (name, total, free, ...).

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``lot_type``, ``total``, ``free``, ``region``,
        ``state``, ``coords``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")
    date_field = soup.find("tr").find("strong").text
    data = {
        "last_updated": convert_date(date_field, "Stand: %d.%m.%Y, %H:%M Uhr"),
        "lots": []
    }

    region_header = ""
    # the first row holds the timestamp and is not a lot row
    for row in soup.find_all("tr")[1:]:
        if row.find_all("th"):
            # header row: remember the region for the following lots
            region_header = row.find("th", {"class": "head1"}).text
            continue

        # the "Gesamt" row is skipped
        if row.find("td").text == "Gesamt":
            continue

        raw_lot_data = row.find_all("td")
        # process_name yields a pair: index 0 = lot type, index 1 = name
        type_and_name = process_name(raw_lot_data[0].text)

        if len(raw_lot_data) == 2:
            # no counts on the page: use known capacity and a mapped state
            total = get_most_lots_from_known_data("Lübeck", type_and_name[1])
            free = 0
            state = process_state_map.get(raw_lot_data[1].text, "")
        elif len(raw_lot_data) == 4:
            total = int(raw_lot_data[1].text)
            free = int(raw_lot_data[2].text)
            state = "open"
        else:
            # Unexpected row layout. Falling through here used to publish
            # the previous row's total/free/state for this lot (or raise
            # NameError on the very first row), so the row is skipped.
            continue

        lot = geodata.lot(type_and_name[1])
        data["lots"].append({
            "name": lot.name,
            "lot_type": type_and_name[0],
            "total": total,
            "free": free,
            "region": region_header,
            "state": state,
            "coords": lot.coords,
            "id": lot.id,
            "forecast": False
        })
    return data
def parse_html(html):
    """Build the lot-status dictionary from the ``div.well`` blocks.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``. When the page has no
        ``div.container-fluid`` the list of lots is empty and
        ``last_updated`` is ``utc_now()``.
    """
    soup = BeautifulSoup(html, "html.parser")
    data = {"last_updated": '', "lots": []}

    # <div class="container-fluid"> wraps both timestamp and lots
    parking_data = soup.find('div', class_='container-fluid')
    try:
        # example text: "Letzte Aktualisierung: 04.07.2019 11:03:00"
        data["last_updated"] = convert_date(
            parking_data.find('h5').text,
            'Letzte Aktualisierung: %d.%m.%Y %H:%M:%S')
    except (AttributeError, ValueError):
        # container or <h5> missing, or unparsable text. The original
        # comment notes the service was unavailable during a test run.
        data["last_updated"] = utc_now()

    if parking_data is None:
        # Without the container there is nothing to iterate; continuing
        # used to raise AttributeError right after the fallback above.
        return data

    for one_parking_lot in parking_data.find_all('div', class_='well'):
        parking_name = one_parking_lot.find('b').text.strip()
        lot = geodata.lot(parking_name)

        parking_free = 0
        parking_status = 'open'
        try:
            # the free count is the <b> inside the second progress bar
            bars = one_parking_lot.find_all('div', role='progressbar')
            parking_free = int(bars[1].find('b').text.strip())
        except (AttributeError, IndexError, ValueError):
            # bar or <b> missing, or a non-numeric count. This replaces a
            # bare ``except:`` which also swallowed KeyboardInterrupt.
            parking_status = 'nodata'

        data["lots"].append({
            "name": parking_name,
            "free": parking_free,
            "total": lot.total,
            "address": lot.address,
            "coords": lot.coords,
            "state": parking_status,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False
        })
    return data
def parse_html(html):
    """Build the lot-status dictionary from the ``tabellenformat`` table.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")

    data = {
        # example markup: <b>Stand: 13.08.2019 16:40:00 Uhr</b>
        "last_updated": convert_date(soup.find('b').text,
                                     'Stand: %d.%m.%Y %H:%M:%S Uhr'),
        "lots": []
    }

    rows = soup.find('table', class_='tabellenformat').find_all('tr')
    # the first row is the table header
    for one_entry in rows[1:]:
        cells = one_entry.find_all('td')
        parking_name = cells[0].text
        lot = geodata.lot(parking_name)

        parking_free = 0
        parking_total = 0
        try:
            # cell 1 = total, cell 3 = free, cell 5 = state text
            parking_total = int(cells[1].text)
            status_text = cells[5].text
            if status_text == 'Offen':
                parking_status = 'open'
                parking_free = int(cells[3].text)
            elif status_text == 'Geschlossen':
                parking_status = 'closed'
            else:
                parking_status = 'nodata'
        except (IndexError, ValueError):
            # missing cells or non-numeric counts. This replaces a bare
            # ``except:`` which also swallowed KeyboardInterrupt.
            parking_status = 'nodata'

        data["lots"].append({
            "name": parking_name,
            "free": parking_free,
            "total": parking_total,
            "address": lot.address,
            "coords": lot.coords,
            "state": parking_status,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False
        })
    return data
def parse_website(html):
    """Build the lot-status dictionary from the per-region tables.

    Only tables with a ``<thead>`` are considered; the region name is read
    from the second header cell and the region "Busparkplätze" is left
    out.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``lots`` and ``last_updated``; each lot dict carries
        ``coords``, ``name``, ``total``, ``free``, ``state``, ``id``,
        ``lot_type``, ``address``, ``forecast`` and ``region``.
    """
    soup = BeautifulSoup(html, "html.parser")

    # the timestamp is the <div> following the matching <h3>; when several
    # headings match, the last one wins
    for heading in soup.find_all("h3"):
        if heading.text != "Letzte Aktualisierung":
            continue
        last_updated = convert_date(heading.find_next_sibling("div").text,
                                    "%d.%m.%Y %H:%M:%S")

    result = {"lots": [], "last_updated": last_updated}

    for table in soup.find_all("table"):
        thead = table.find("thead")
        if not thead:
            continue

        region = thead.find("tr").find_all("th")[1].find("div").text
        if region == "Busparkplätze":
            continue

        for row in table.find("tbody").find_all("tr"):
            cells = row.find_all("td")
            name = row.find("a").text
            lot = geodata.lot(name)

            # the numbers sit in the second <div> of cells 2 and 3
            try:
                total = int(cells[2].find_all("div")[1].text)
            except ValueError:
                total = get_most_lots_from_known_data("Dresden", name)

            try:
                free = int(cells[3].find_all("div")[1].text)
            except ValueError:
                free, valid_free = 0, False
            else:
                valid_free = True

            # the classes of the first cell encode the state
            marker = cells[0]["class"]
            if "park-closed" in marker:
                state = "closed"
            elif "blue" in marker and not valid_free:
                state = "nodata"
            else:
                state = "open"

            result["lots"].append({
                "coords": lot.coords,
                "name": name,
                "total": total,
                "free": free,
                "state": state,
                "id": lot.id,
                "lot_type": lot.type,
                "address": lot.address,
                "forecast": os.path.isfile("forecast_data/" + lot.id + ".csv"),
                "region": region
            })
    return result
def parse_html(html):
    """Build the lot-status dictionary from a table grouped by region.

    Header rows (containing ``<th>``) set the region for the rows that
    follow. Lot rows have either two cells (name, state text) or four
    cells (name, total, free, ...).

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``lot_type``, ``total``, ``free``, ``region``,
        ``state``, ``coords``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")
    date_field = soup.find("tr").find("strong").text
    data = {
        "last_updated": convert_date(date_field, "Stand: %d.%m.%Y, %H:%M Uhr"),
        "lots": []
    }

    region_header = ""
    # the first row holds the timestamp and is not a lot row
    for row in soup.find_all("tr")[1:]:
        if row.find_all("th"):
            # header row: remember the region for the following lots
            region_header = row.find("th", {"class": "head1"}).text
            continue

        # the "Gesamt" row is skipped
        if row.find("td").text == "Gesamt":
            continue

        raw_lot_data = row.find_all("td")
        # process_name yields a pair: index 0 = lot type, index 1 = name
        type_and_name = process_name(raw_lot_data[0].text)

        if len(raw_lot_data) == 2:
            # no counts on the page: use known capacity and a mapped state
            total = get_most_lots_from_known_data("Lübeck", type_and_name[1])
            free = 0
            state = process_state_map.get(raw_lot_data[1].text, "")
        elif len(raw_lot_data) == 4:
            total = int(raw_lot_data[1].text)
            free = int(raw_lot_data[2].text)
            state = "open"
        else:
            # Unexpected row layout. Falling through here used to publish
            # the previous row's total/free/state for this lot (or raise
            # NameError on the very first row), so the row is skipped.
            continue

        lot = geodata.lot(type_and_name[1])
        data["lots"].append({
            "name": lot.name,
            "lot_type": type_and_name[0],
            "total": total,
            "free": free,
            "region": region_header,
            "state": state,
            "coords": lot.coords,
            "id": lot.id,
            "forecast": False
        })
    return data
def parse_html(source_json):
    """Build the lot-status dictionary from a JSON feature collection.

    Args:
        source_json: JSON text with a top-level ``features`` list whose
            items carry a ``properties`` object.

    Returns:
        A dict with ``lots`` and ``last_updated``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``. ``last_updated`` is derived
        from the greatest ``obs_ts`` value (compared as text).
    """
    features = json.loads(source_json)['features']

    # the newest observation timestamp seen so far, compared as a string
    newest = ""
    result = {"lots": []}

    for feature in features:
        props = feature['properties']
        lot_name = props['park_name']
        lot_free = int(props['obs_free'])
        lot_total = int(props['obs_max'])

        # keep only the part before the first "." of the timestamp
        observed = props['obs_ts'].split('.')[0]
        newest = max(newest, observed)

        # the state may only be open, closed or nodata: "1" and "0" are
        # mapped, everything else counts as nodata
        raw_state = props['obs_state']
        if raw_state == "1":
            state = "open"
        else:
            state = "closed" if raw_state == "0" else "nodata"

        lot = geodata.lot(lot_name)
        result["lots"].append({
            "name": lot.name,
            "free": lot_free,
            "total": lot_total,
            "address": lot.address,
            "coords": lot.coords,
            "state": state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False,
        })

    result['last_updated'] = convert_date(newest, "%Y-%m-%d %H:%M:%S")
    return result
def parse_html(html):
    """Build the lot-status dictionary from ``div#parkhausliste-ct``.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")
    data = {"last_updated": '', "lots": []}

    # look for: <div id="parkhausliste-ct">
    container = soup.find('div', id='parkhausliste-ct')

    # example: <p ...>zuletzt aktualisiert am 19.06.2019, 15:27 Uhr</p>
    data['last_updated'] = convert_date(
        container.find('p').text,
        'zuletzt aktualisiert am %d.%m.%Y, %H:%M Uhr')

    # all <div>s below the first inner <div>, in document order
    entry_divs = container.find('div').find_all('div')

    # The divs are consumed three at a time: offset +1 holds the name and
    # offset +2 the free count; a group is only read when complete.
    for start in range(0, len(entry_divs) - 2, 3):
        parking_name = entry_divs[start + 1].text.strip()
        lot = geodata.lot(parking_name)

        parking_free = 0
        parking_state = 'open'
        try:
            parking_free = int(entry_divs[start + 2].text)
        except ValueError:
            # non-numeric count. This replaces a bare ``except:`` which
            # also swallowed KeyboardInterrupt/SystemExit.
            parking_state = 'nodata'

        data["lots"].append({
            "name": parking_name,
            "free": parking_free,
            "total": lot.total,
            "address": lot.address,
            "coords": lot.coords,
            "state": parking_state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False
        })
    return data
def parse_html(html):
    """Build the lot-status dictionary from the parking-location JSON.

    Args:
        html: JSON text with ``data.updated`` and
            ``data.parkinglocations``.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``.
    """
    data_json = json.loads(html)

    data = {
        # drop everything from the "+" on (the UTC offset) and the one
        # character in front of it before parsing the timestamp
        "last_updated": convert_date(
            data_json['data']['updated'].split("+")[0][:-1],
            '%a, %d %b %Y %H:%M:%S'),
        "lots": []
    }

    for parking_lot in data_json['data']['parkinglocations']:
        # please keep the name in the geojson file in the same form as
        # built here (including spaces)
        parking_name = 'P' + str(parking_lot['uid']) + ' ' + parking_lot['name']
        lot = geodata.lot(parking_name)

        parking_state = 'open'
        parking_free = 0
        try:
            update = parking_lot['parkingupdate']
            if update['status'] == 'closed':
                parking_state = 'closed'
            else:
                parking_free = int(update['total']) - int(update['current'])
        except (KeyError, TypeError, ValueError):
            # update block missing, empty or not numeric. This replaces a
            # bare ``except:`` which also swallowed KeyboardInterrupt.
            parking_state = 'nodata'

        data["lots"].append({
            "name": parking_name,
            "free": parking_free,
            "total": lot.total,
            "address": lot.address,
            "coords": lot.coords,
            "state": parking_state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False,
        })
    return data
def parse_html(source_json):
    """Build the lot-status dictionary from a JSON feature collection.

    Args:
        source_json: JSON text with a top-level ``features`` list whose
            items carry a ``properties`` object.

    Returns:
        A dict with ``lots`` and ``last_updated``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``. ``last_updated`` is derived
        from the greatest ``obs_ts`` value (compared as text).
    """
    features = json.loads(source_json)['features']

    # the newest observation timestamp seen so far, compared as a string
    newest = ""
    result = {"lots": []}

    for feature in features:
        props = feature['properties']
        lot_name = props['park_name']
        lot_free = int(props['obs_free'])
        lot_total = int(props['obs_max'])

        if newest < props['obs_ts']:
            newest = props['obs_ts']

        # the state may only be open, closed or nodata: "1" and "0" are
        # mapped, everything else counts as nodata
        raw_state = props['obs_state']
        if raw_state == "1":
            state = "open"
        else:
            state = "closed" if raw_state == "0" else "nodata"

        lot = geodata.lot(lot_name)
        result["lots"].append({
            "name": lot.name,
            "free": lot_free,
            "total": lot_total,
            "address": lot.address,
            "coords": lot.coords,
            "state": state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False,
        })

    result['last_updated'] = convert_date(newest, "%Y-%m-%d %H:%M:%S")
    return result
def parse_html(text_content):
    """Build the lot-status dictionary from a JSON object of parkings.

    More data about the available parking spaces can be found at
    http://odensedataplatform.dk/dataset/parkering

    Args:
        text_content: JSON text whose top-level values each describe one
            parking (``idName``, ``name``, ``maxCount``, ``freeCount``).

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``.
    """
    payload = json.loads(text_content)

    # the service doesn't publish when it was last updated, so the current
    # local time is used instead
    now_text = datetime.now().strftime("%d.%m.%Y %H:%M")
    result = {
        "last_updated": convert_date(now_text, "%d.%m.%Y %H:%M"),
        "lots": []
    }

    for parking in payload.values():
        lot_code = parking["idName"]
        name = parking["name"]
        reported_total = parking["maxCount"]
        free = parking["freeCount"]

        lot = geodata.lot(lot_code)

        # guard against erroneous totals in the feed (one parking reports
        # 9999): never exceed the total known from geodata
        total = lot.total if lot.total < reported_total else reported_total

        result["lots"].append({
            "name": name,
            "free": free,
            "total": total,
            "address": None,
            "coords": lot.coords,
            # the feed is not consulted for an open/closed state here
            "state": "unknown",
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False,
        })
    return result
def parse_html(text_content):
    """Build the lot-status dictionary from a not-quite-JSON payload.

    The source text uses single quotes and unquoted keys, so it is
    repaired before being handed to ``json.loads``.

    Args:
        text_content: The raw text containing a ``parkPlacesAreaMarkers``
            list.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``.

    Raises:
        KeyError: If a record's ``IsOpen`` value is neither 0 nor 1.
    """
    # the original JSON is invalid, let's fix it: turn single quotes into
    # double quotes, then wrap bare alphabetic keys (a word followed by an
    # optional whitespace character and ":") in double quotes
    bare_key = re.compile(r'([^"]|\s)([a-zA-Z]+)\s?(:)')
    text_content = text_content.replace("'", "\"")
    text_content = bare_key.sub(r'\1"\2"\3', text_content)
    data_as_json = json.loads(text_content)

    # the source doesn't publish the update time, so we assume present
    last_updated = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
    data = {
        "last_updated": convert_date(last_updated, "%Y/%m/%d %H:%M:%S"),
        "lots": []
    }

    state_mappings = {1: "open", 0: "closed"}

    for record in data_as_json["parkPlacesAreaMarkers"]:
        lot_name = record["Name"]
        free = int(record["FreeCount"])
        total = int(record["MaxCount"])

        # records without a positive MaxCount are ignored (presumably lots
        # for which counting does not work)
        if total <= 0:
            continue

        state = state_mappings[int(record["IsOpen"])]
        # coordinates come from geodata; the record's own Latitude and
        # Longitude were read into unused locals before and are no longer
        # touched
        lot = geodata.lot(lot_name)
        data["lots"].append({
            "name": lot_name,
            "free": free,
            "total": total,
            "address": lot.address,
            "coords": lot.coords,
            "state": state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False,
        })
    return data
def parse_html(html):
    """Build the lot-status dictionary from the ``parkmoeglichkeit`` blocks.

    Args:
        html: Markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        A dict with ``last_updated`` and ``lots``; each lot dict carries
        ``name``, ``free``, ``total``, ``address``, ``coords``, ``state``,
        ``lot_type``, ``id`` and ``forecast``.
    """
    soup = BeautifulSoup(html, "html.parser")

    # example: <p class="updateinfo">zuletzt aktualisiert: 28.05.2019 15.30 Uhr</p>
    updated = soup.find("p", class_="updateinfo")
    data = {
        "last_updated": convert_date(
            updated.text, 'zuletzt aktualisiert: %d.%m.%Y %H.%M Uhr'),
        "lots": []
    }

    for one_lot in soup.find_all("div", class_="accordeon parkmoeglichkeit"):
        parking_name = one_lot.find("h3").text
        lot = geodata.lot(parking_name)

        parking_state = 'open'
        parking_free = 0
        try:
            # a lot without a "belegung" div stays open with 0 free spaces
            occupancy = one_lot.find("div", class_="belegung")
            if occupancy is not None:
                parking_free = int(occupancy.find("strong").text)
        except (AttributeError, ValueError):
            # <strong> missing or a non-numeric count. This replaces a bare
            # ``except:`` which also swallowed KeyboardInterrupt.
            parking_state = 'nodata'

        data["lots"].append({
            "name": lot.name,
            "free": parking_free,
            "total": lot.total,
            "address": lot.address,
            "coords": lot.coords,
            "state": parking_state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False,
        })
    return data
def parse_html(html):
    """Parse the parking overview page into lot data.

    The update time is taken from the first ``<h2>`` and each lot from a
    ``<dl>`` element (name in ``<dt>``, free count in ``<dd><strong>``).

    Returns a dict with ``last_updated`` and a ``lots`` list. A lot whose free
    count is missing or not a number is reported as ``"nodata"`` with 0 free.
    """
    soup = BeautifulSoup(html, "html.parser")

    last_updated = soup.find('h2').text

    data = {
        # expected text looks like: Stand: 07.06.2019 15:46 Uhr
        "last_updated": convert_date(last_updated, "Stand: %d.%m.%Y %H:%M Uhr"),
        "lots": []
    }

    # find all entries
    for parking_lot in soup.find_all('dl'):
        parking_name = parking_lot.find('dt').text
        lot = geodata.lot(parking_name)

        try:
            parking_free = int(parking_lot.find('dd').find('strong').text)
            parking_state = 'open'
        except (AttributeError, ValueError):
            # <dd>/<strong> missing, or the text is not a number
            parking_state = 'nodata'
            parking_free = 0

        data["lots"].append({
            "name": parking_name,
            "free": parking_free,
            "total": lot.total,
            "address": lot.address,
            "coords": lot.coords,
            "state": parking_state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False,
        })

    return data
def parse_html(text_content):
    """Parse the almost-JSON payload of the data source into lot data.

    The text is first repaired into valid JSON: single quotes become double
    quotes and bare keys are wrapped in quotes by a regex.

    Returns a dict with ``last_updated`` and a ``lots`` list. Records with a
    non-positive ``MaxCount`` are left out, and an ``IsOpen`` value that is
    neither 0 nor 1 yields state ``"nodata"``.

    Raises ``KeyError`` when an expected field is absent and propagates the
    ``json.loads`` error when the repaired text still is not valid JSON.
    """
    # the original JSON is invalid, let's fix it
    p = re.compile(r'([^"]|\s)([a-zA-Z]+)\s?(:)')
    text_content = text_content.replace("'", "\"")
    text_content = re.sub(p, r'\1"\2"\3', text_content)
    data_as_json = json.loads(text_content)

    # the source doesn't publish the update time, so we assume present
    last_updated = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
    data = {
        "last_updated": convert_date(last_updated, "%Y/%m/%d %H:%M:%S"),
        "lots": []
    }

    state_mappings = {1: "open", 0: "closed"}

    for record in data_as_json["parkPlacesAreaMarkers"]:
        lot_name = record["Name"]
        free = int(record["FreeCount"])
        total = int(record["MaxCount"])
        # lots for which counting does not work have no positive capacity;
        # ignore them
        if total <= 0:
            continue

        # fall back to "nodata" for any state code the mapping doesn't know
        state = state_mappings.get(int(record["IsOpen"]), "nodata")
        lot = geodata.lot(lot_name)
        data["lots"].append({
            "name": lot_name,
            "free": free,
            "total": total,
            "address": lot.address,
            "coords": lot.coords,
            "state": state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False,
        })

    return data
def parse_html(html):
    """Parse the sample page layout into lot data.

    Reads the update time from ``<p id="last_updated">`` and one lot per
    ``<tr>``, taking each field from the ``<td>`` with the matching class.

    Returns a dict with ``last_updated``, ``data_source`` and ``lots``.
    ``free``, ``total`` and ``state`` are passed through as the raw cell text.

    Raises ``AttributeError`` if a row lacks one of the expected cells.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(html, "html.parser")

    # last_updated is the date when the data on the page was last updated
    last_updated = soup.select("p#last_updated")[0].text

    data = {
        # convert_date turns this date into the correct string format
        "last_updated": convert_date(last_updated, "%d.%m.%Y %H:%M Uhr"),
        "data_source": data_source,
        "lots": []
    }

    for tr in soup.find_all("tr"):
        lot_name = tr.find("td", {"class": "lot_name"}).text
        lot_free = tr.find("td", {"class": "lot_free"}).text
        lot_total = tr.find("td", {"class": "lot_total"}).text
        lot_address = tr.find("td", {"class": "lot_address"}).text
        lot_type = tr.find("td", {"class": "lot_type"}).text

        # the state may only be open, closed or nodata; should the page list
        # other states, map them onto these three
        state = tr.find("td", {"class": "lot_state"}).text

        data["lots"].append({
            "name": lot_name,
            "free": lot_free,
            "total": lot_total,
            "address": lot_address,
            "coords": geodata.coords(lot_name),
            "state": state,
            "type": lot_type,
            # generate_id takes this file path and the lot's name
            "id": generate_id(__file__, lot_name),
            "forecast": False,
        })

    return data
def parse_html(html):
    """Parse the sample page layout into lot data.

    The update time comes from ``<p id="last_updated">``; every ``<tr>``
    contributes one lot, with name, free count, total and state read from the
    ``<td>`` cells of the matching class. Free count, total and state are
    passed through as raw cell text.

    Returns a dict with ``last_updated`` and ``lots``.
    """
    soup = BeautifulSoup(html, "html.parser")

    def cell_text(row, css_class):
        # text of the first <td> in this row carrying the given class
        return row.find("td", {"class": css_class}).text

    updated_text = soup.select("p#last_updated")[0].text
    data = {
        "last_updated": convert_date(updated_text, "%d.%m.%Y %H:%M Uhr"),
        "lots": [],
    }

    for row in soup.find_all("tr"):
        lot_name = cell_text(row, "lot_name")
        lot_free = cell_text(row, "lot_free")
        lot_total = cell_text(row, "lot_total")
        # the state may only be open, closed or nodata; other values the
        # page lists have to be mapped onto these three
        state = cell_text(row, "lot_state")

        lot = geodata.lot(lot_name)
        entry = {
            "name": lot.name,
            "free": lot_free,
            "total": lot_total,
            "address": lot.address,
            "coords": lot.coords,
            "state": state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False,
        }
        data["lots"].append(entry)

    return data
def parse_html(html):
    """Extract the free-space counts embedded in the page's script text.

    Every ``Ob(... von ...);`` snippet found in ``html`` describes one lot.
    The n-th snippet is paired with ``lot_map[n]``, and its free count is
    the capacity minus the occupied count parsed from that same snippet.

    Returns a dict with ``last_updated`` (the current time, since the page
    publishes none) and ``lots``. The state is always ``"nodata"``.

    Raises ``AssertionError`` unless exactly 8 snippets are found, and
    ``AttributeError`` if a snippet lacks the expected numbers.
    """
    m = re.findall(r'Ob(.*von.*);', html)
    assert len(m) == 8, \
        "Expect to find 8 lots in Erfurt, got: %d" % len(m)

    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    lots = []
    for idx, elem in enumerate(m):
        # Take the counts from *this* snippet. Previously every lot used the
        # name left over from the last parsed snippet, so all lots reported
        # the same free count.
        belegt = re.search(r'[\-0-9]+\)', elem).group(0)[0:-1]
        capacity = re.search(r'von.*,', elem).group(0)[5:-1]

        lot = geodata.lot(lot_map[idx])
        lots.append({
            "name": lot.name,
            "coords": lot.coords,
            "free": int(capacity) - int(belegt),
            "address": lot.address,
            "total": lot.total,
            "state": "nodata",
            "id": lot.id,
            "forecast": False
        })

    return {
        "last_updated": convert_date(now, "%Y-%m-%d %H:%M:%S"),
        "lots": lots
    }
def parse_html(text_content):
    """Parse the JSON record list into lot data.

    Garage codes listed in ``map_json_names`` become one lot each. Codes
    listed in ``cummulatives`` are levels of one building and are summed into
    a single lot. Records with any other code are skipped.

    Returns a dict with ``last_updated`` (the ``date`` of the first record)
    and ``lots``; the merged lots come after the individual ones.
    """
    data_as_json = json.loads(text_content)
    records = data_as_json["result"]["records"]

    # last_updated is the date when the data was last updated
    last_updated = records[0]["date"]

    data = {
        "last_updated": convert_date(last_updated, "%Y/%m/%d %H:%M:%S"),
        "lots": []
    }

    # The page at https://www.odaa.dk/dataset/parkeringshuse-i-aarhus describes how the counts are made
    # NOTE: "SKOLEBAKKEN" is deliberately not mapped.
    map_json_names = {
        "NORREPORT": "Nørreport",
        "SCANDCENTER": "Scandinavian Center",
        "BRUUNS": "Bruuns Galleri",
        "MAGASIN": "Magasin",
        "KALKVAERKSVEJ": "Kalkværksvej",
        "SALLING": "Salling",
        "Navitas": "Navitas",
        "NewBusgadehuset": "Busgadehuset"
    }

    cummulatives = {
        "Urban Level 1": "Dokk1",
        "Urban Level 2+3": "Dokk1"
    }

    cumulative_lots = {}

    for record in records:
        lot_code = record["garageCode"]
        if lot_code not in map_json_names and lot_code not in cummulatives:
            continue

        total = int(record["totalSpaces"])
        free = max(total - int(record["vehicleCount"]), 0)

        if lot_code in map_json_names:
            lot_name = map_json_names[lot_code]
            lot = geodata.lot(lot_name)
            data["lots"].append({
                "name": lot_name,
                "free": free,
                "total": total,
                "address": lot.address,
                "coords": lot.coords,
                "state": "unknown",
                "lot_type": lot.type,
                "id": lot.id,
                "forecast": False,
            })
            continue

        # lot_code is one level of a merged lot (this branch was unreachable
        # before because its condition was inverted)
        lot_name = cummulatives[lot_code]
        merged = cumulative_lots.get(lot_name)
        if merged is None:
            # look up the geodata of the merged lot itself rather than
            # reusing whatever lot the previous record happened to leave
            lot = geodata.lot(lot_name)
            cumulative_lots[lot_name] = {
                "name": lot_name,
                "free": free,
                "total": total,
                "address": lot.address,
                "coords": lot.coords,
                "state": "unknown",
                "lot_type": lot.type,
                "id": lot.id,
                "forecast": False,
            }
        else:
            merged["free"] += free
            merged["total"] += total

    # append the merged lot dicts, not the dictionary keys
    data["lots"].extend(cumulative_lots.values())

    return data
def parse_html(html):
    """Parse the table-based parking page into lot data.

    The update time is read from the first ``<span>`` (text of the form
    ``Stand: 10.04.2019 15:09``). The lots are read from the third
    ``<table>``, whose cells are walked in groups of five per lot.

    Returns a dict with ``last_updated`` and ``lots``. A lot whose cell
    contains ``geschlossen`` is ``"closed"``; a cell that cannot be parsed
    as ``free / total`` yields ``"nodata"``.
    """
    soup = BeautifulSoup(html, "html.parser")

    # stand[0] is e.g. <span ...> Stand: 10.04.2019 15:09 </span>;
    # splitting its text gives ['Stand:', '10.04.2019', '15:09']
    stand = soup.select('span')
    stand_parts = stand[0].text.strip().split()
    last_updated = stand_parts[1] + " " + stand_parts[2]

    data = {
        "last_updated": convert_date(last_updated, "%d.%m.%Y %H:%M"),
        "lots": []
    }

    # everything is in table objects:
    # table[0] is a big table around everything
    # table[1] contains some headers
    # table[2] contains column headers and one row for each parking lot
    table = soup.select('table')
    td = table[2].find_all('td')

    # each lot occupies five cells:
    # td[i] holds an image, td[i+1] the name, and td[i+2] either the text
    # 'geschlossen' or the values in the form xxx / xxx
    for i in range(0, len(td) - 4, 5):
        parking_name = td[i + 1].text.strip()
        # work-around for the sz-problem: Coulinstraße
        if 'Coulinstr' in parking_name:
            parking_name = 'Coulinstraße'

        lot = geodata.lot(parking_name)

        parking_state = 'open'
        parking_free = 0
        parking_total = 0
        occupancy = td[i + 2].text
        if 'geschlossen' in occupancy:
            parking_state = 'closed'
        else:
            try:
                values = occupancy.split()
                parking_free = int(values[0])
                parking_total = int(values[2])
            except (ValueError, IndexError):
                # cell is not in the expected "free / total" form
                parking_state = 'nodata'

        data["lots"].append({
            "name": parking_name,
            "free": parking_free,
            "total": parking_total,
            "address": lot.address,
            "coords": lot.coords,
            "state": parking_state,
            "lot_type": lot.type,
            "id": lot.id,
            "forecast": False,
        })

    return data
def parse_html(html):
    """Parse the parking table of the page into lot data.

    The update time is cut out of the body text right after the marker
    ``Letzte Aktualisierung:``. Every table row that has a ``<td>`` is one
    lot: name in the first cell, free count in the second, state in the
    fourth. Totals and addresses come from a hard-coded table because the
    page does not publish them.

    Returns a dict with ``last_updated`` and ``lots``.

    Raises ``KeyError`` for a lot name missing from the hard-coded table.
    """
    soup = BeautifulSoup(html, "html.parser")

    # cut the 16 characters of the timestamp out of the stringified body
    body_text = str(soup.select("body"))
    start = body_text.find("Letzte Aktualisierung:") + 23
    stamp = body_text[start:start + 16] + ' Uhr'

    data = {
        "last_updated": convert_date(stamp, "%d.%m.%Y %H:%M Uhr"),
        "lots": [],
    }

    status_map = {"Offen": "open", "Geschlossen": "closed"}

    # Oldenburg does not publish the totals on its website, so the values
    # are taken from a PDF from 2011:
    # http://www.oldenburg.de/fileadmin/oldenburg/Benutzer/PDF/41/414/Parkplatz_Uebersicht2.pdf
    # and http://gis4oldenburg.oldenburg.de/?es=C12S77
    lots_map = {
        "Waffenplatz": [650, "Waffenplatz 3"],
        "City": [440, "Staulinie 10"],
        "Galeria Kaufhof": [326, "Ritterstraße"],
        "Pferdemarkt": [401, "Pferdemarkt 13"],
        # CCO 1 & 2 are only known together (420 in total);
        # the split below is an estimate
        "CCO Parkdeck 1": [190, "Heiligengeiststraße 4"],
        "CCO Parkdeck 2": [230, "Heiligengeiststraße 4"],
        "Hbf/ZOB": [358, "Karlstraße"],
        "Theaterwall": [125, "Theaterwall 4"],
        "Theatergarage": [107, "Roonstraße"],
        "Heiligengeist-Höfe": [275, "Georgstraße"],
        "Schlosshöfe": [430, "Mühlenstraße"],
    }

    for row in soup.find_all("tr"):
        # rows without any <td> are skipped
        if row.td is None:
            continue

        cells = row.find_all('td')
        lot_name = cells[0].b.string
        lot_free = int(cells[1].b.text)

        # an unknown lot name raises KeyError here
        lot_total, lot_address = lots_map[lot_name]

        # only open, closed or nodata are allowed as state
        state = status_map.get(cells[3].text, "nodata")

        lot = geodata.lot(lot_name)
        data["lots"].append({
            "id": lot.id,
            "name": lot.name,
            "free": lot_free,
            "state": state,
            "total": lot_total,
            "address": lot_address,
            "coords": lot.coords,
            "forecast": False
        })

    return data
def parse_html(html):
    """Parse the parking table of the page into lot data.

    The timestamp is the 16 characters following ``Letzte Aktualisierung:``
    in the stringified body. Each table row with a ``<td>`` describes a lot:
    first cell name, second cell free count, fourth cell state. Capacity and
    address are looked up in a hard-coded table.

    Returns a dict with ``last_updated`` and ``lots``.

    Raises ``KeyError`` when a lot name is not in the hard-coded table.
    """
    soup = BeautifulSoup(html, "html.parser")

    page_body = str(soup.select("body"))
    offset = page_body.find("Letzte Aktualisierung:") + 23
    updated_text = page_body[offset:offset + 16] + ' Uhr'

    data = {
        "last_updated": convert_date(updated_text, "%d.%m.%Y %H:%M Uhr"),
        "lots": [],
    }

    status_map = {
        "Offen": "open",
        "Geschlossen": "closed",
    }

    # Oldenburg does not publish the totals on its website, so the values
    # come from a PDF from 2011:
    # http://www.oldenburg.de/fileadmin/oldenburg/Benutzer/PDF/41/414/Parkplatz_Uebersicht2.pdf
    # and http://gis4oldenburg.oldenburg.de/?es=C12S77
    lots_map = {
        "Waffenplatz": [650, "Waffenplatz 3"],
        "City": [440, "Staulinie 10"],
        "Galeria Kaufhof": [326, "Ritterstraße"],
        "Pferdemarkt": [401, "Pferdemarkt 13"],
        # only the combined capacity of CCO 1 & 2 (420) is known;
        # the individual numbers are an estimate
        "CCO Parkdeck 1": [190, "Heiligengeiststraße 4"],
        "CCO Parkdeck 2": [230, "Heiligengeiststraße 4"],
        "Hbf/ZOB": [358, "Karlstraße"],
        "Theaterwall": [125, "Theaterwall 4"],
        "Theatergarage": [107, "Roonstraße"],
        "Heiligengeist-Höfe": [275, "Georgstraße"],
        "Schlosshöfe": [430, "Mühlenstraße"],
    }

    data_rows = (row for row in soup.find_all("tr") if row.td is not None)
    for row in data_rows:
        columns = row.find_all('td')
        lot_name = columns[0].b.string
        lot_free = int(columns[1].b.text)

        # raises KeyError for a name that is missing from lots_map
        known = lots_map[lot_name]
        lot_total = known[0]
        lot_address = known[1]

        # state must be one of open, closed or nodata
        state = status_map.get(columns[3].text, "nodata")

        lot = geodata.lot(lot_name)
        data["lots"].append({
            "id": lot.id,
            "name": lot.name,
            "free": lot_free,
            "state": state,
            "total": lot_total,
            "address": lot_address,
            "coords": lot.coords,
            "forecast": False
        })

    return data
def parse_html(text_content):
    """Parse the JSON record list into lot data.

    A garage code found in ``map_json_names`` produces one lot. The codes in
    ``cummulatives`` are levels of the same building and are added up into
    one lot. All other codes are ignored.

    Returns a dict with ``last_updated`` (the ``date`` of the first record)
    and ``lots``; merged lots are appended after the individual ones.
    """
    data_as_json = json.loads(text_content)
    records = data_as_json["result"]["records"]

    # last_updated is the date when the data was last updated
    last_updated = records[0]["date"]

    data = {
        "last_updated": convert_date(last_updated, "%Y/%m/%d %H:%M:%S"),
        "lots": []
    }

    # The page at https://www.odaa.dk/dataset/parkeringshuse-i-aarhus describes how the counts are made
    # NOTE: "SKOLEBAKKEN" is deliberately not mapped.
    map_json_names = {
        "NORREPORT": "Nørreport",
        "SCANDCENTER": "Scandinavian Center",
        "BRUUNS": "Bruuns Galleri",
        "MAGASIN": "Magasin",
        "KALKVAERKSVEJ": "Kalkværksvej",
        "SALLING": "Salling",
        "Navitas": "Navitas",
        "NewBusgadehuset": "Busgadehuset"
    }

    cummulatives = {
        "Urban Level 1": "Dokk1",
        "Urban Level 2+3": "Dokk1"
    }

    cumulative_lots = {}

    for record in records:
        lot_code = record["garageCode"]
        is_single = lot_code in map_json_names
        if not is_single and lot_code not in cummulatives:
            continue

        total = int(record["totalSpaces"])
        free = max(total - int(record["vehicleCount"]), 0)

        if is_single:
            lot_name = map_json_names[lot_code]
            lot = geodata.lot(lot_name)
            data["lots"].append({
                "name": lot_name,
                "free": free,
                "total": total,
                "address": lot.address,
                "coords": lot.coords,
                "state": "unknown",
                "lot_type": lot.type,
                "id": lot.id,
                "forecast": False,
            })
        else:
            # one level of a merged lot; the old condition was inverted and
            # made this branch unreachable
            lot_name = cummulatives[lot_code]
            if lot_name in cumulative_lots:
                cumulative_lots[lot_name]["free"] += free
                cumulative_lots[lot_name]["total"] += total
            else:
                # fetch the merged lot's own geodata instead of relying on a
                # `lot` left over from an earlier record
                lot = geodata.lot(lot_name)
                cumulative_lots[lot_name] = {
                    "name": lot_name,
                    "free": free,
                    "total": total,
                    "address": lot.address,
                    "coords": lot.coords,
                    "state": "unknown",
                    "lot_type": lot.type,
                    "id": lot.id,
                    "forecast": False,
                }

    # add the merged lot dicts themselves (iterating the dict yields keys)
    for merged_lot in cumulative_lots.values():
        data["lots"].append(merged_lot)

    return data
def parse_html(html):
    """Parse either the private API payload or the public website.

    When ``geodata.private_data`` is truthy, ``html`` is treated as a JSON
    list of datasets; otherwise it is parsed as the HTML of the website.

    Returns a dict with ``lots`` and ``last_updated``. Each lot also carries
    a ``forecast`` flag that is true when ``forecast_data/<id>.csv`` exists.
    """
    if not geodata.private_data:
        # use website
        soup = BeautifulSoup(html, "html.parser")

        updated_text = soup.find(id="P1_LAST_UPDATE").text
        data = {
            "lots": [],
            "last_updated": convert_date(updated_text, "%d.%m.%Y %H:%M:%S")
        }

        for table in soup.find_all("table"):
            # tables with an empty summary are not processed
            if table["summary"] == "":
                continue
            region = table["summary"]
            if region == "Busparkplätze":
                continue

            for lot_row in table.find_all("tr"):
                # header rows carry no lot
                if lot_row.find("th") is not None:
                    continue

                css_classes = lot_row.find("div")["class"]
                if any(colour in css_classes for colour in ("green", "yellow", "red")):
                    state = "open"
                elif "park-closed" in css_classes:
                    state = "closed"
                else:
                    state = "nodata"

                lot_name = lot_row.find("td", {"headers": "BEZEICHNUNG"}).text

                try:
                    free_cell = lot_row.find("td", {"headers": "FREI"})
                    free = int(free_cell.text)
                except ValueError:
                    free = 0

                try:
                    total_cell = lot_row.find("td", {"headers": "KAPAZITAET"})
                    total = int(total_cell.text)
                except ValueError:
                    # fall back to the known data when the page has no number
                    total = get_most_lots_from_known_data("Dresden", lot_name)

                lot = geodata.lot(lot_name)
                forecast = os.path.isfile("forecast_data/" + lot.id + ".csv")

                data["lots"].append({
                    "coords": lot.coords,
                    "name": lot_name,
                    "total": total,
                    "free": free,
                    "state": state,
                    "id": lot.id,
                    "lot_type": lot.type,
                    "address": lot.address,
                    "forecast": forecast,
                    "region": region
                })

        return data

    # private API
    api_data = json.loads(html)

    # drop fractional seconds, read the stamp as local time and
    # re-express it via gmtime
    stamp = api_data[0]["timestamp"].split(".")[0]
    local_struct = time.strptime(stamp, "%Y-%m-%dT%H:%M:%S")
    utc_struct = time.gmtime(time.mktime(local_struct))

    data = {
        "lots": [],
        "last_updated": time.strftime("%Y-%m-%dT%H:%M:%S", utc_struct)
    }

    # the dataset's status number minus one indexes into this list
    status = ['open', 'closed', 'unknown']

    # known lots keyed by their aux value, which the datasets carry as 'id'
    id_lots = {geodata.lots[key].aux: geodata.lots[key] for key in geodata.lots}

    for dataset in api_data:
        try:
            lot = id_lots[dataset['id']]
            forecast = os.path.isfile("forecast_data/" + lot.id + ".csv")
            entry = {
                "coords": lot.coords,
                "name": lot.name,
                "total": lot.total,
                "free": max(lot.total - dataset["belegung"], 0),
                "state": status[dataset["status"] - 1],
                "id": lot.id,
                "lot_type": lot.type,
                "address": lot.address,
                "forecast": forecast,
                "region": ""
            }
        except KeyError:
            # datasets that raise KeyError (e.g. unknown id) are skipped
            continue
        data["lots"].append(entry)

    return data
def parse_html(html):
    """Parse the nested-table parking page into lot data.

    The update time is read from the ``<td width="233">`` cell (text of the
    form ``Stand vom 05.06.2019, 14:40:20``). The lots sit several table
    levels deep; each row of a non-empty innermost table is one lot with the
    name in the second cell and the free count in the third.

    Returns a dict with ``last_updated`` and ``lots``. A row whose free count
    is missing or not a number is reported as ``"nodata"`` with 0 free.
    """
    soup = BeautifulSoup(html, "html.parser")

    # looking for: <td width="233">
    date_time_text = soup.find('td', width='233').text.strip()

    data = {
        # 'Stand vom 05.06.2019, 14:40:20'
        "last_updated": convert_date(date_time_text, 'Stand vom %d.%m.%Y, %H:%M:%S'),
        "lots": []
    }

    # everything is in table objects,
    # so we have to go down several levels of tables
    html_level0 = soup.find('table')
    html_level1 = html_level0.find_all('table')
    html_level2 = html_level1[1].find_all('table')
    html_level3 = html_level2[0].find_all('table')
    html_level4 = html_level3[2].find_all('table')

    # html_level4 holds the data tables:
    # [0]: header
    # [1]: empty
    # all following: empty or parking lots
    for html_parkhaus in html_level4[2:]:
        if html_parkhaus.text.strip() == '':
            continue  # table is empty

        for html_parkhaus_row in html_parkhaus.find_all('tr'):
            # one row: one parking lot
            html_parkhaus_data = html_parkhaus_row.find_all('td')

            # collapse any run of whitespace in the name to a single space
            parking_name = ' '.join(html_parkhaus_data[1].text.split())
            lot = geodata.lot(parking_name)

            parking_state = 'open'
            parking_free = 0
            try:
                parking_free = int(html_parkhaus_data[2].text)
            except (ValueError, IndexError):
                # third cell missing or not a number
                parking_state = 'nodata'

            data["lots"].append({
                "name": parking_name,
                "free": parking_free,
                "total": lot.total,
                "address": lot.address,
                "coords": lot.coords,
                "state": parking_state,
                "lot_type": lot.type,
                "id": lot.id,
                "forecast": False,
            })

    return data