def submit(environ, start_response):
    """WSGI endpoint accepting filter hit statistics as a JSON POST body.

    The payload is written to a log file first and only then aggregated
    into the database, so a database failure never loses data.  On
    success (including a failed-but-logged aggregation) the response is
    204 No Content so that clients do not re-transmit.
    """
    setupStderr(environ["wsgi.errors"])
    config = get_config()

    # Check that this is a POST request
    if environ["REQUEST_METHOD"] != "POST":
        return common.show_error("Unsupported request method", start_response)

    # Parse the submitted JSON; KeyError covers a missing CONTENT_LENGTH,
    # IOError a truncated read and ValueError malformed JSON.
    try:
        data = json.loads(environ["wsgi.input"].read(
            int(environ["CONTENT_LENGTH"])))
    except (KeyError, IOError, ValueError):
        return common.show_error("Error while parsing JSON data.",
                                 start_response)

    # Make sure the submitted data was contained within an object at least
    if not isinstance(data, dict):
        return common.show_error(
            "Error, data must be contained within an object.", start_response)

    # Log the data to a file
    log_dir = config.get("filterhitstats", "log_dir")
    try:
        log_file = log_filterhits(data, log_dir,
                                  environ.get("QUERY_STRING", ""))
    except (OSError, IOError):
        traceback.print_exc()
        return common.show_error("Failed to write data to log file!",
                                 start_response, "500 Logging error")

    # Update the geometrical_mean aggregations in the database
    interval = config.get("filterhitstats", "interval")
    try:
        db_connection = db.connect()
        try:
            db.write(db_connection, geometrical_mean.update(interval, data))
        finally:
            db_connection.close()
    except Exception:
        # Was a bare "except:"; narrowed so SystemExit/KeyboardInterrupt
        # are no longer swallowed.  Updating the aggregations failed for
        # whatever reason; log the details but continue to return 204 to
        # the client to avoid the re-transmission of data.
        processing_error_log = os.path.join(
            config.get("filterhitstats", "log_dir"), "processing-errors.log")
        with open(processing_error_log, "a+") as f:
            message = "Problem processing data file %s:\n%s" % (
                log_file, traceback.format_exc())
            print >> f, "[%s] %s" % (
                datetime.now().strftime("%d/%b/%Y:%H:%M:%S %z"), message)

    # Send back a 204 No Content
    start_response("204 No Content", [])
    return []
def submit(environ, start_response):
    """WSGI endpoint accepting filter hit statistics as a JSON POST body.

    Data is logged to disk before the database aggregations are updated,
    so nothing is lost if the database step fails.  Responds 204 No
    Content on success so clients do not re-send the payload.
    """
    setupStderr(environ["wsgi.errors"])
    config = get_config()

    # Check that this is a POST request
    if environ["REQUEST_METHOD"] != "POST":
        return common.show_error("Unsupported request method", start_response)

    # Parse the submitted JSON
    try:
        data = json.loads(environ["wsgi.input"].read(
            int(environ["CONTENT_LENGTH"])))
    except (KeyError, IOError, ValueError):
        return common.show_error("Error while parsing JSON data.",
                                 start_response)

    # Make sure the submitted data was contained within an object at least
    if not isinstance(data, dict):
        return common.show_error(
            "Error, data must be contained within an object.", start_response)

    # Log the data to a file
    log_dir = config.get("filterhitstats", "log_dir")
    try:
        log_file = log_filterhits(data, log_dir,
                                  environ.get("QUERY_STRING", ""))
    except (OSError, IOError):
        traceback.print_exc()
        return common.show_error("Failed to write data to log file!",
                                 start_response, "500 Logging error")

    # Update the geometrical_mean aggregations in the database
    interval = config.get("filterhitstats", "interval")
    try:
        db_connection = db.connect()
        try:
            db.write(db_connection, geometrical_mean.update(interval, data))
        finally:
            db_connection.close()
    except Exception:
        # Narrowed from a bare "except:" which also caught SystemExit and
        # KeyboardInterrupt.  Updating the aggregations in the database
        # failed for whatever reason; log the details but continue to
        # return 204 to the client to avoid the re-transmission of data.
        processing_error_log = os.path.join(
            config.get("filterhitstats", "log_dir"), "processing-errors.log")
        with open(processing_error_log, "a+") as f:
            message = "Problem processing data file %s:\n%s" % (
                log_file, traceback.format_exc())
            print >> f, "[%s] %s" % (
                datetime.now().strftime("%d/%b/%Y:%H:%M:%S %z"), message)

    # Send back a 204 No Content
    start_response("204 No Content", [])
    return []
def test_query_and_write(self): insert_sql = """INSERT INTO `filters` (filter, sha1) VALUES (%s, UNHEX(SHA1(filter)))""" select_sql = "SELECT filter FROM filters ORDER BY filter ASC" # Table should be empty to start with self.assertEqual(db.query(self.db, select_sql), ()) # Write some data and query it back db.write(self.db, ((insert_sql, "something"),)) self.assertEqual(db.query(self.db, select_sql), ((u"something",),)) # Write an array of SQL strings db.write(self.db, ((insert_sql, "a"), (insert_sql, "b"), (insert_sql, "c"))) self.assertEqual(db.query(self.db, select_sql), ((u"a",), (u"b",), (u"c",), (u"something",))) # Write a sequence of SQL but roll back when a problem arrises with self.assertRaises(MySQLdb.ProgrammingError): db.write(self.db, ((insert_sql, "f"), (insert_sql, "g"), (insert_sql, "h"), ("GFDGks",))) self.assertEqual(db.query(self.db, select_sql), ((u"a",), (u"b",), (u"c",), (u"something",)))
def _clear_database(self): db.write(self._db, (("DELETE FROM frequencies", ), ("DELETE FROM filters", )))
sys.exit("Could not read log file %s" % log_file) return data if __name__ == "__main__": if not len(sys.argv) == 2: print "Usage: python -m sitescripts.filterhits.bin.reprocess_logs /path/to/logs" sys.exit(1) interval = get_config().get("filterhitstats", "interval") def read_update(f): return geometrical_mean.update(interval, read_data(f)) if sys.argv[1].endswith(".log"): sql = read_update(sys.argv[1]) else: sql = itertools.chain.from_iterable( itertools.imap(read_update, log_files(sys.argv[1]))) db_connection = db.connect() try: db.write(db_connection, sql) except: logging.error("Failed to process file %s, all changes rolled back." % _last_log_file) raise finally: db_connection.close()
def test_calculations(self): interval = 86400 # Tables should be empty to start with self.assertEqual(db.query(self.db, "SELECT * FROM filters"), ()) self.assertEqual(db.query(self.db, "SELECT * FROM frequencies"), ()) # First batch db.write(self.db, geometrical_mean.update(interval, test_data[0])) self.assertEqual(db.query(self.db, "SELECT * FROM filters"), (("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"##.top-box-right-ad"),)) self.assertEqual( db.query(self.db, "SELECT * FROM frequencies"), (("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"acxiom-online.com", 6L, datetime.utcfromtimestamp(1414817340948 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"demdex.net", 36L, datetime.utcfromtimestamp(1414838712373 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"doubleclick.net", 26L, datetime.utcfromtimestamp(1414823430333 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"google.com", 50L, datetime.utcfromtimestamp(1414849084678 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"yahoo.com", 14L, datetime.utcfromtimestamp(1414859271125 / 1000)))) # Second batch db.write(self.db, geometrical_mean.update(interval, test_data[1])) self.assertEqual(db.query(self.db, "SELECT * FROM filters"), (("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"##.top-box-right-ad"),)) self.assertEqual( db.query(self.db, "SELECT * FROM frequencies"), (("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"acxiom-online.com", 6L, datetime.utcfromtimestamp(1414817340948 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"adsymptotic.com", 49L, datetime.utcfromtimestamp(1414953943015 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"amazon.com", 2L, datetime.utcfromtimestamp(1414913563746 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"demdex.net", 36L, datetime.utcfromtimestamp(1414838712373 / 1000)), 
("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"doubleclick.net", 26L, datetime.utcfromtimestamp(1414823430333 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"google.com", self.geometrical(interval, 21, 1414953920364, 50, 1414849084678), datetime.utcfromtimestamp(1414953920364 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"live.com", 34L, datetime.utcfromtimestamp(1414916268769 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"yahoo.com", self.geometrical(interval, 27, 1414917270343, 14, 1414859271125), datetime.utcfromtimestamp(1414917270343 / 1000)))) # Third batch db.write(self.db, geometrical_mean.update(interval, test_data[2])) self.assertEqual(db.query(self.db, "SELECT * FROM filters"), (("22de8d2ba8429eb170a0ece6ea7a426f7b22e574".decode("hex"), u"stevedeace.com##.topAddHolder"), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"##.top-box-right-ad"))) self.assertEqual( db.query(self.db, "SELECT * FROM frequencies"), (("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"acxiom-online.com", 6L, datetime.utcfromtimestamp(1414817340948 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"adsymptotic.com", self.geometrical(interval, 15, 1414994112862, 49, 1414953943015), datetime.utcfromtimestamp(1414994112862 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"amazon.com", 2L, datetime.utcfromtimestamp(1414913563746 / 1000)), ("22de8d2ba8429eb170a0ece6ea7a426f7b22e574".decode("hex"), u"amazonaws.com", 18L, datetime.utcfromtimestamp(1414977342966 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"demdex.net", 36L, datetime.utcfromtimestamp(1414838712373 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"doubleclick.net", 26L, datetime.utcfromtimestamp(1414823430333 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"google.com", self.geometrical(interval, 14, 
1415008533089, self.geometrical(interval, 21, 1414953920364, 50, 1414849084678), 1414953920364), datetime.utcfromtimestamp(1415008533089 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"live.com", 34L, datetime.utcfromtimestamp(1414916268769 / 1000)), ("22de8d2ba8429eb170a0ece6ea7a426f7b22e574".decode("hex"), u"mathtag.com", 14L, datetime.utcfromtimestamp(1415032601175 / 1000)), ("8c5ea548436c61f05536e205a29ada6204f603b0".decode("hex"), u"yahoo.com", self.geometrical(interval, 43, 1415045194098, self.geometrical(interval, 27, 1414917270343, 14, 1414859271125), 1414917270343), datetime.utcfromtimestamp(1415045194098 / 1000))))
_last_log_file = log_file except IOError: sys.exit("Could not read log file %s" % log_file) return data if __name__ == "__main__": if not len(sys.argv) == 2: print "Usage: python -m sitescripts.filterhits.bin.reprocess_logs /path/to/logs" sys.exit(1) interval = get_config().get("filterhitstats", "interval") def read_update(f): return geometrical_mean.update(interval, read_data(f)) if sys.argv[1].endswith(".log"): sql = read_update(sys.argv[1]) else: sql = itertools.chain.from_iterable(itertools.imap(read_update, log_files(sys.argv[1]))) db_connection = db.connect() try: db.write(db_connection, sql) except: logging.error("Failed to process file %s, all changes rolled back." % _last_log_file) raise finally: db_connection.close()
def _clear_database(self): db.write(self._db, (("DELETE FROM frequencies",), ("DELETE FROM filters",)))