def get_climate_data(place):
    """Collect monthly climate figures for *place* from its "Weather box".

    The page source is fetched with ``get_page_source(place)``.  The
    "Weather box" template is looked for on the page itself and, when it is
    not found there, in a dedicated ``{{... weatherbox}}`` template that the
    page references.

    Returns a dict with:

    * ``"page_error"`` -- True when the page source could not be fetched; the
      dict is returned early in that case with empty row lists.
    * ``"title"`` -- the title returned by ``get_page_source``.
    * one list per name in ``ROWS`` -- the parsed values, appended in the
      order the template parameters are iterated.  ``None`` entries mark
      cells the page flags as "no data".
    * ``"location"`` -- only when the template has a ``location`` field.
    * ``"observer"`` -- only when the percent-sunshine fallback was needed;
      holds whatever ``astrodata.process_location`` returned (``False`` when
      the location could not be resolved).

    Raises ``ValueError`` if a monthly cell holds text that is neither a
    number nor one of the recognised "no data" / "trace" markers.
    """

    def find_separate_weatherbox_template(data):
        """Return the "Weather box" text of a city-specific template, or ""."""
        if data is False:
            return ""

        # {{cityname weatherbox}} seems to be the usual template name.
        # Just look for any template ending with weatherbox.
        # New York City includes its weatherbox through a reference
        # to {{New York City weatherbox/cached}}, where the /cached
        # template contains rendered HTML tables.  We want to look at
        # "Template:New York City weatherbox" instead, so every accepted
        # suffix is cut off right after the word "weatherbox".
        marker = "weatherbox"
        index2 = max(data.find(marker + "}}"),
                     data.find(marker + "/cached}}"),
                     data.find(marker + "|collapsed=Y}}"))
        if index2 > -1:
            # there is a separate template - get it and process it
            index1 = data.rfind("{{", 0, index2)
            # rfind() gives -1 when no opening braces precede the marker;
            # slicing with that would produce a bogus template name.
            if index1 > -1:
                template_name = ("Template:"
                                 + data[index1 + 2:index2 + len(marker)])
                _title, data = get_page_source(template_name)
                if data is not False:
                    return find_template(data, "Weather box")

        # if we didn't find the template, or couldn't get it, fall back
        return ""

    def parse(text):
        """Convert one weatherbox cell to a number; None means "no data"."""
        # The original ran the identical U+2212 replacement twice; the
        # second pass now covers the HTML entity spelling, which float()
        # cannot read either.
        text = text.strip().replace("−", "-").replace("&minus;", "-")
        if text in ("", "-"):
            # "-" is used on some pages to indicate a no data condition;
            # an empty cell carries no data either
            return None
        if text.lower() == "trace":
            # used on some pages to indicate essentially 0
            return 0
        return float(text)

    def month_number(month):
        """Convert a month abbreviation from MONTHS to its 1-based number."""
        return MONTHS.index(month) + 1

    def daily_to_monthly(daily, month):
        """Scale a per-day figure to a whole month (None stays None)."""
        if daily is None:
            return None
        # use a non-leap year since monthly numbers are presumably given
        # for non-leap Februarys
        days = calendar.monthrange(2013, month_number(month))[1]
        return daily * days

    result = {"page_error": False}
    for row_name in ROWS:
        result[row_name] = []

    result["title"], data = get_page_source(place)
    if data is False:
        # indicates a problem getting data - signal it so output
        # can be formatted accordingly
        result["page_error"] = True
        return result

    weatherbox = find_template(data, "Weather box")
    weatherbox_info = parse_infobox(weatherbox)

    if len(weatherbox_info) == 0:
        # weatherbox not found directly on page
        # see if there's a dedicated city weather template we can look at
        weatherbox = find_separate_weatherbox_template(data).strip()
        weatherbox_info = parse_infobox(weatherbox)

    # (month, percent) pairs from "percentsun" cells.  They are only turned
    # into hours after the loop, once we know whether real sun-hour data
    # exists anywhere in the template (regardless of parameter order).
    percentsun = []

    for key in weatherbox_info:
        value = weatherbox_info[key]

        # try to parse out location data - usually specifies a neighbourhood,
        # weather station, year range info, etc
        if key == "location":
            # trim off wikilink markers, the most common
            # wiki syntax in this field
            result["location"] = value.replace("[", "").replace("]", "")
            continue

        month = key[:3]
        if month not in MONTHS:
            continue

        category = key[3:].strip()  # take out the month to get data category
        value = parse(value)        # parse value as number (or None)

        # last token of category name is sometimes the unit
        # (C, F, mm, inch, etc); a bare month key has no tokens at all
        tokens = category.split()
        unit = tokens[-1] if tokens else ""

        if category in result:
            # straightforward putting the data in
            result[category].append(value)

        elif unit in UNIT_CONVERSIONS:
            # try to convert units to known ones
            for target_unit in UNIT_CONVERSIONS[unit]:
                # try to find a category we collect that we know how to
                # convert into; swap only the trailing unit token so text
                # earlier in the category name is never touched
                converted_category = (category[:len(category) - len(unit)]
                                      + target_unit)
                if converted_category in result:
                    if value is None:
                        # keep the month slot, nothing to convert
                        converted = None
                    else:
                        converted = UNIT_CONVERSIONS[unit][target_unit](value)
                    result[converted_category].append(converted)
                    break

        elif category == "d sun":
            # special handling for daily sun hours
            result["sun"].append(daily_to_monthly(value, month))

        elif category == "percentsun":
            percentsun.append((month, value))

    # Use percentsun only if we haven't found any other sun data: a specific
    # hour count is assumed to be more precise than "% sunshine".
    if percentsun and not result["sun"]:
        if "observer" not in result:
            # will try to get lat,lng from wikipedia page if location
            # is not recognized by pyephem directly
            result["observer"] = astrodata.process_location(result["title"])

        if result["observer"] is not False:
            for month, percent in percentsun:
                if percent is None:
                    # keep the month slot for a "no data" cell
                    result["sun"].append(None)
                    continue
                daylight = astrodata.month_daylight(result["observer"],
                                                    month_number(month))
                hours = (daylight.total_seconds() / 3600.0) * (percent / 100.0)
                result["sun"].append(round(hours, 1))

    return result
import calendar
import time
import datetime

import ephem

import astrodata
import climate


if __name__ == '__main__':
    # Benchmark script: for every city, time two ways of computing monthly
    # daylight hours and print how far apart the results are.  The third
    # argument of astrodata.month_daylight presumably selects the exact
    # (True) or rough (False) calculation, going by the timing labels below
    # -- confirm against astrodata.
    #
    # Every print uses a single parenthesised argument so the script emits
    # the same text under Python 2 and also compiles under Python 3.
    cities = climate.get_cities()

    for city in cities:
        # Fresh lists per city: the report below reads indices 0..11, so
        # accumulating across cities would keep showing the first city.
        data = {'r': [], 'e': []}

        time1 = time.time()
        for i in range(12):
            data['e'].append(
                astrodata.month_daylight(city, i+1, True).total_seconds() / 3600)
        print("exact: " + str(time.time() - time1))

        time2 = time.time()
        for i in range(12):
            data['r'].append(
                astrodata.month_daylight(city, i+1, False).total_seconds() / 3600)
        print("rough: " + str(time.time() - time2))

        for i in range(12):
            exact = data['e'][i]
            rough = data['r'][i]
            if exact:
                percent = str(round(100 * (rough - exact) / exact, 2)) + '%'
            else:
                # a month with zero exact daylight hours has no meaningful
                # relative error; avoid dividing by zero
                percent = 'n/a'
            # Joining with single spaces reproduces the separators the
            # Python 2 trailing-comma print statements used to insert.
            print(' '.join([
                str(i+1) + ' ',
                str(round(exact, 2)) + '-' + str(round(rough, 2)),
                ' = ' + str(round(exact - rough, 3)),
                '\t: ' + percent,
            ]))

        # TODO: graph the differences vs actual day lengths to see where
        # i'm undershooting and try to understand why?