class WeatherLogProcessor(object):
    """Read a TSV log, look up tomorrow's forecast max per row, and histogram it.

    Each log row carries an IP address. The address is resolved to a
    latitude/longitude through the geoip database, the coordinates are
    rounded to ``geo_precision``, and tomorrow's forecast maximum is fetched
    from the cache or, on a miss, from the weather API. The collected maxima
    can then be written out as a histogram TSV.

    The instance owns an open geoip database handle; call ``close()`` when
    done, or use the instance as a context manager.
    """

    # Zero-based column of the IP address within each log row.
    IP_ADDRESS_COLUMN = 23

    def __init__(self, config_file, flush_cache=False, verbose=False):
        """Load the JSON configuration and set up the collaborators.

        Args:
            config_file: Path to a JSON file providing
                ``log_processor.geo_precision``, ``geoip_db.file``,
                ``weather_api.api_key``, ``weather_api.units``,
                ``cache.host``, ``cache.port`` and ``cache.key_expiry_secs``.
            flush_cache: When true, flush the weather cache after connecting.
            verbose: When true, print whether each forecast came from the
                cache or from an API call.
        """
        self.max_forecasts = []
        self.lines_processed = 0
        self.delimiter = '\t'

        with open(config_file) as conf_file:
            config = json.load(conf_file)

        self.error_report = ErrorReport()
        self.time_utils = TimeUtils()
        self.verbose = verbose
        self.geo_precision = config['log_processor']['geo_precision']
        self.geoip_db = open_database(config['geoip_db']['file'])
        try:
            self.weather_api = OpenWeatherMap(
                api_key=config['weather_api']['api_key'],
                units=config['weather_api']['units'])
            self.weather_cache = WeatherCache(
                host=config['cache']['host'],
                port=config['cache']['port'],
                redis_key_expiry_secs=config['cache']['key_expiry_secs'])
            if flush_cache:
                self.weather_cache.flush()
        except Exception:
            # Do not leak the already-opened geoip database when the rest of
            # the setup fails; the caller never receives an instance to close.
            self.geoip_db.close()
            raise

    def __enter__(self):
        """Return the processor itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the geoip database; exceptions are never suppressed."""
        self.close()
        return False

    def _round_to(self, val):
        """Return ``val`` rounded to the nearest multiple of ``geo_precision``.

        Halves are rounded away from zero: 0.5 is added (or subtracted for
        negative values) before truncating with ``int``.
        """
        correction = 0.5 if val >= 0 else -0.5
        return int(val / self.geo_precision + correction) * self.geo_precision

    def get_tomorrows_max(self, latitude, longitude):
        """Return tomorrow's forecast maximum for the given coordinates.

        The cache is consulted first; on a miss the weather API is queried
        and the result is stored in the cache.

        Args:
            latitude: Latitude to look up.
            longitude: Longitude to look up.

        Returns:
            The forecast maximum, or whatever the weather API returned when
            the cache had no entry (possibly ``None``).
        """
        start_of_tomorrow = self.time_utils.get_start_of_tomorrow_utc()
        end_of_tomorrow = self.time_utils.get_end_of_tomorrow_utc()

        temperature = self.weather_cache.get_forecast_max(
            latitude=latitude,
            longitude=longitude,
            forecast_date=start_of_tomorrow)
        # Compare against None rather than truthiness: a cached maximum of 0
        # is a valid hit and must not trigger another API call.
        # NOTE(review): assumes WeatherCache reports a miss as None - confirm.
        if temperature is not None:
            if self.verbose:
                print("retrieved from cache")
            return temperature

        temperature = self.weather_api.get_geo_max_temperature(
            latitude=latitude,
            longitude=longitude,
            start=start_of_tomorrow,
            end=end_of_tomorrow)
        if self.verbose:
            print("retrieved from api call")
        # Only cache real values; a missing forecast should be retried later.
        if temperature is not None:
            self.weather_cache.set_forecast_max(
                latitude=latitude,
                longitude=longitude,
                forecast_date=start_of_tomorrow,
                temperature=temperature)
        return temperature

    def _forecast_for_row(self, row):
        """Return tomorrow's forecast maximum for one parsed log row.

        Raises:
            Exception: When the row has no IP column, the IP address is not
                in the geoip database, or no forecast could be obtained.
        """
        ip_address = row[self.IP_ADDRESS_COLUMN]
        geoip_result = self.geoip_db.get(ip_address)
        if geoip_result is None:
            raise Exception("No entry found in geodb")

        location = geoip_result['location']
        latitude = self._round_to(val=location['latitude'])
        longitude = self._round_to(val=location['longitude'])
        temperature = self.get_tomorrows_max(latitude=latitude,
                                             longitude=longitude)
        if temperature is None:
            # A None entry would break sorting and the histogram arithmetic.
            raise Exception("No forecast returned for location")
        return temperature

    def process_tsv(self, input_file):
        """Collect tomorrow's forecast maximum for every row of ``input_file``.

        Rows that fail for any reason are recorded in the error report with
        their line number and skipped, so one bad row does not abort the run.
        On success ``max_forecasts`` holds the collected values in ascending
        order and ``lines_processed`` holds the number of rows read.

        Args:
            input_file: Path to the delimiter-separated log file.

        Raises:
            Exception: When not a single forecast could be collected.
        """
        line_number = 0
        with open(input_file) as tsvfile:
            reader = csv.reader(tsvfile, delimiter=self.delimiter)
            for line_number, row in enumerate(reader, start=1):
                try:
                    self.max_forecasts.append(self._forecast_for_row(row))
                except Exception as e:
                    # Deliberately broad: every per-row failure is reported
                    # through the error report instead of stopping the run.
                    self.error_report.add_error(error_str=str(e),
                                                line_number=line_number)

        # Record the row count before a possible raise so the error report
        # still shows how many lines were read.
        self.lines_processed = line_number
        if not self.max_forecasts:
            raise Exception("No forecasts returned")
        self.max_forecasts = sorted(self.max_forecasts)

    def create_histogram_tsv(self, number_of_buckets, output_file):
        """Write a histogram of the collected forecast maxima to ``output_file``.

        The range from the smallest to the largest forecast is split into
        equal-width buckets. A value equal to a bucket's (rounded) upper bound
        is counted in that bucket. Every bucket is written, including empty
        ones, as ``bucketMin``, ``bucketMax`` (both rounded to two decimals)
        and ``count``. When all forecasts are identical a single row is
        written.

        Args:
            number_of_buckets: Requested bucket count; reduced to the number
                of forecasts when fewer forecasts are available.
            output_file: Path of the TSV file to create.

        Raises:
            ValueError: When there are no forecasts or ``number_of_buckets``
                is smaller than 1.
        """
        if not self.max_forecasts:
            raise ValueError("No forecasts available to build a histogram")
        # Sort a local copy so the result does not depend on process_tsv
        # having sorted the list beforehand.
        forecasts = sorted(self.max_forecasts)
        number_of_buckets = min(int(number_of_buckets), len(forecasts))
        if number_of_buckets < 1:
            raise ValueError("number_of_buckets must be at least 1")

        lowest = forecasts[0]
        highest = forecasts[-1]
        if highest == lowest:
            # Zero-width range: all values fall into one degenerate bucket.
            rows = [[round(lowest, 2), round(highest, 2), len(forecasts)]]
        else:
            bucket_size = (highest - lowest) / number_of_buckets
            # Derive each edge from the minimum instead of adding the width
            # repeatedly, which would accumulate floating point error.
            edges = [round(lowest + index * bucket_size, 2)
                     for index in range(number_of_buckets + 1)]
            counts = [0] * number_of_buckets
            last_bucket = number_of_buckets - 1
            bucket = 0
            for forecast in forecasts:
                # Skip ahead over empty buckets; the last bucket absorbs the
                # maximum even if rounding put its edge slightly below it.
                while bucket < last_bucket and forecast > edges[bucket + 1]:
                    bucket += 1
                counts[bucket] += 1
            rows = [[edges[index], edges[index + 1], counts[index]]
                    for index in range(number_of_buckets)]

        with open(output_file, 'w') as histogram_file:
            filewriter = csv.writer(histogram_file,
                                    delimiter=self.delimiter,
                                    quoting=csv.QUOTE_MINIMAL,
                                    quotechar='|')
            filewriter.writerow(['bucketMin', 'bucketMax', 'count'])
            filewriter.writerows(rows)

    def print_error_report(self):
        """Print the collected errors grouped by type, or a no-errors notice."""
        errors = self.error_report.get_errors()
        if not errors:
            print("No errors found.")
            return

        print("ERROR REPORT:")
        for key, error in errors.items():
            print("ERROR TYPE: {}".format(key))
            print("count: {} out of {} lines processed".format(error['count'], self.lines_processed))
            print("error happened on lines: {}".format(error['lines']))
            print("")

    def close(self):
        """Close the geoip database handle opened by the constructor."""
        self.geoip_db.close()