# Imports reconstructed for this module; the model and project-local import
# paths are assumptions based on the names used below.
import datetime
import sys
from optparse import make_option

import apachelog
import pygeoip
from django.core.management.base import LabelCommand, CommandError
from IPy import IP
from uasparser import UASparser, UASException

# Assumed location of the stats models used below.
from opms.stats.models import (LogFile, Rdns, UserAgent, Referer, FileRequest,
                               Server, ApacheLogEntry, OS, UA)


class Command(LabelCommand):
    args = 'filename'
    help = 'Imports the contents of the specified logfile into the database.'
    option_list = LabelCommand.option_list + (
        make_option('--startline', action='store', dest='start_at_line', default=1,
                    help='Optional start line to allow resumption of large log files. Default is 1.'),
        make_option('--cache-size', action='store', dest='cache_size', default=100,
                    help='Number of records to prefetch in the ApacheLogEntry lookup. Default is 100.'),
        make_option('--log-service', action='store', dest='log_service', default='mpoau',
                    help='What service has produced this log? Used to determine the apache format expression. Default is "mpoau".'),
        make_option('--single-import', action='store', dest='single_import', default=False,
                    help='Speeds up import rate by disabling support for parallel imports.'),
    )

    def __init__(self):
        # Single GeoIP object for referencing. NB: hardcoded development paths.
        self.geoip = pygeoip.GeoIP('/home/carl/Projects/opms_master/OPMS/data/geoip/GeoIP.dat', pygeoip.MMAP_CACHE)
        # Single UASparser instance for referencing
        self.uasp = UASparser(cache_dir="/home/carl/Projects/opms_master/OPMS/opms/stats/ua_data/")
        self.uasp_format = ""
        # datetime value for any rdns timeout problems
        self.rdns_timeout = 0
        # Toggle debug statements on/off
        self.debug = False
        # Record basic information about the import process for reporting
        self.import_stats = {}
        # Error logging file and string cache
        self.error_log = ""
        self.error_cache = ""
        # Cache objects to hold subtables in memory
        self.cache_user_agent = list(UserAgent.objects.all())
        self.cache_rdns = list(Rdns.objects.all())
        self.cache_referer = list(Referer.objects.all())
        self.cache_file_request = list(FileRequest.objects.all())
        self.cache_server = list(Server.objects.all())
        # Log entry cache only prefetches a set number of records from the current timestamp
        self.cache_log_entry = []
        self.cache_log_entry_size = 100
        # Option flag to enable or disable the parallel import safety checks
        self.single_import = False
        super(Command, self).__init__()

    def handle_label(self, filename, **options):
        print "Import started at " + str(datetime.datetime.utcnow()) + "\n"

        # Some basic checking
        if filename.endswith('.gz'):
            raise CommandError("This file is still compressed. Uncompress and try again.\n\n")

        # Create an error log per import file
        self._errorlog_start(filename + '_import-error.log')

        # Test the log_service option is valid. Use the same list as LogFile.SERVICE_NAME_CHOICES
        log_service = str(options.get('log_service', 'mpoau'))
        # Add the remaining services here when we start testing with that data
        if log_service == 'mpoau':
            # Assume mpoau logfiles. An illustrative line this expression should match
            # (values are made up):
            # 2011-12-09T14:30:00+0000 media.podcasts.ox.ac.uk 203.0.113.10:80 203.0.113.99 - - "GET /robots.txt HTTP/1.0" 200 312 "-" "Mozilla/5.0"
            self.uasp_format = r'%Y-%m-%dT%H:%M:%S%z %v %A:%p %h %l %u \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
        else:
            raise CommandError("This service name is unknown: " + log_service + ".\n\n")

        # Reset statistics
        self.import_stats['filename'] = filename
        self.import_stats['line_counter'] = 0
        self.import_stats['line_count'] = 0
        self.import_stats['duplicatecount'] = 0
        self.import_stats['import_starttime'] = datetime.datetime.utcnow()
        self.import_stats['import_startline'] = int(options.get('start_at_line', 1))
        self.cache_log_entry_size = int(options.get('cache_size', 100))
        if options.get('single_import', False) != False:
            print "Processing file with parallel import safeties disabled."
            self.single_import = True

        # This only needs setting/getting the once per call of this function
        logfile_obj = self._logfile(filename, log_service)

        # Send the file off to be parsed
        self._parsefile(logfile_obj)

        # Final stats output at end of file
        try:
            self.import_stats['import_duration'] = float((datetime.datetime.utcnow() - self.import_stats.get('import_starttime')).seconds)
            self.import_stats['import_rate'] = float(self.import_stats.get('line_counter') - self.import_stats.get('import_startline')) /\
                                               self.import_stats['import_duration']
        except ZeroDivisionError:
            self.import_stats['import_rate'] = 0

        # print "\nImport finished at " + str(datetime.datetime.utcnow()) +\
        #       "\nLines parsed: " + str(self.import_stats.get('line_counter')) +\
        #       "\nDuplicates: " + str(self.import_stats.get('duplicatecount')) +\
        #       "\nImported at " + str(self.import_stats.get('import_rate'))[0:6] + " lines/sec\n"
        print """
        Import finished at %s
        %s Lines parsed over %s seconds
        Giving a rate of %s lines/sec
        """ % (datetime.datetime.utcnow(),
               self.import_stats.get('line_counter'),
               self.import_stats.get('import_duration'),
               str(self.import_stats.get('import_rate'))[0:6])

        # Write the error cache to disk
        self._error_log_save()
        self._errorlog_stop()
        return None

    def _logfile(self, filename, log_service):
        "Get or create a LogFile record for the given filename"
        # Simple hack for this method initially...
        logfile = {}
        logfile['service_name'] = log_service
        try:
            logfile['file_name'] = filename[filename.rindex('/')+1:]
            logfile['file_path'] = filename[:filename.rindex('/')+1]
        except ValueError:
            # Likely the path doesn't feature any directories... so improvise
            logfile['file_name'] = filename
            logfile['file_path'] = "./"
        logfile['last_updated'] = datetime.datetime.utcnow()

        obj, created = LogFile.objects.get_or_create(
            service_name = logfile.get('service_name'),
            file_name = logfile.get('file_name'),
            file_path = logfile.get('file_path'),
            defaults = logfile)
        # If this isn't the first time, and the datetime is significantly different from last access, update the time
        if not created and (logfile.get('last_updated') - obj.last_updated).days > 0:
            obj.last_updated = logfile.get('last_updated')
            obj.save()
        return obj

    def _parsefile(self, logfile_obj):
        # Create a parser for this file
        parser = apachelog.parser(self.uasp_format)
        filename = logfile_obj.file_path + logfile_obj.file_name

        # Attempt to determine the number of lines in the log
        log = open(filename)
        for line in log:
            self.import_stats['line_count'] += 1
        print str(self.import_stats.get('line_count')) + " lines to parse. Beginning at line " +\
              str(self.import_stats.get('import_startline')) + "\n"
        log.close()

        log = open(filename)
        previous_line = ""
        for line in log:
            # Update stats
            self.import_stats['line_counter'] += 1
            if self.import_stats.get('line_counter') < self.import_stats.get('import_startline'):
                # Skip through to the specified line number
                previous_line = line
                continue

            # # Test for duplicate log entries immediately preceding
            # if line == previous_line:
            #     self._errorlog("##### DUPLICATE LINE DETECTED ##### \n" +\
            #                    "Line # :" + str(self.import_stats.get('line_counter')) + "\n" +\
            #                    "Line : " + str(line) + "\n")
            #     self.import_stats['duplicatecount'] += 1
            # else:
            #     # Parse and store the line
            # Removing the Duplicate line detection as I can't rule out the 0.5-1% of duplicates as being invalid yet: CM 9-12-11
            self._parseline(parser, line, logfile_obj)

            # Print progress report every 500 lines.
            if (self.import_stats.get('line_counter') % 500) == 0:
                # Calculate the average rate of import for the whole process
                try:
                    self.import_stats['import_rate'] = \
                        float(self.import_stats.get('line_counter') - self.import_stats.get('import_startline')) /\
                        float((datetime.datetime.utcnow() - self.import_stats.get('import_starttime')).seconds)
                except ZeroDivisionError:
                    self.import_stats['import_rate'] = 1
                # Calculate how long till finished
                try:
                    efs = int(float(self.import_stats.get('line_count') - self.import_stats.get('line_counter')) /
                              float(self.import_stats.get('import_rate')))
                except ZeroDivisionError:
                    efs = 1
                efhr = efs // (60*60)
                efs = efs % (60*60)
                efmin = efs // 60
                efsec = efs % 60
                efstring = str(efhr) + "h " + str(efmin) + "m " + str(efsec) + "s."
                # Output the status
                print str(datetime.datetime.utcnow()) + ": " +\
                      str((float(self.import_stats.get('line_counter')) / float(self.import_stats.get('line_count')))*100)[0:5] +\
                      "% completed. " +\
                      "Parsed " + str(self.import_stats.get('line_counter')) + " lines. " +\
                      "Duplicates: " + str(self.import_stats.get('duplicatecount')) + ". " +\
                      "Rate: " + str(self.import_stats.get('import_rate'))[0:6] + " lines/sec. " +\
                      "Est. finish in " + efstring
                # Write the error cache to disk
                self._error_log_save()

            # Update duplicate line string for next pass
            previous_line = line
        return None

    def _parseline(self, parser_obj, line, logfile_obj):
        # Parse the raw line into a dictionary of data
        data = parser_obj.parse(line)
        # self._debug('============================')
        # self._debug('Data: ' + str(data))

        # Validate the data - count the number of elements (the format expression defines exactly 11)
        if len(data) != 11:
            self._errorlog("#### WRONG NUMBER OF ITEMS IN THIS ENTRY. Line: " + str(self.import_stats.get('line_counter')) + "\n" +
                           "Data:" + str(data) + "\n")
            return

        # Status code validation
        status_code = self._status_code_validation(int(data.get('%>s')))
        if status_code == 0:
            self._errorlog("#### STATUS CODE 0 PROBLEM WITH THIS ENTRY. Line: " + str(self.import_stats.get('line_counter')) + "\n" +
                           "Data:" + str(data) + "\n")
            return

        # Get or create the foreign key elements: LogFile, Rdns, FileRequest, Referer, UserAgent
        remote_rdns = self._ip_to_domainname(data.get('%h'))
        file_request = self._file_request(data.get('%r'))
        if file_request is None:
            self._errorlog("#### INVALID REQUEST STRING IN THIS ENTRY. Line: " + str(self.import_stats.get('line_counter')) + "\n" +
                           "Data:" + str(data) + "\n")
            return
        referer = self._referer(data.get('%{Referer}i'), status_code)
        user_agent = self._user_agent(data.get('%{User-Agent}i'))

        # Pull apart the date time string
        date_string, time_string = data.get('%Y-%m-%dT%H:%M:%S%z').split('T')
        date_yyyy, date_mm, date_dd = date_string.split('-')
        time_hh, time_mm, time_ss = time_string.split(':')

        # Pull apart the server and port
        server_ip, server_port = data.get('%A:%p').split(':')
        server = self._server(data.get('%v'), server_ip, server_port)

        # Size of response validation. Can be '-' when status is not 200
        size_of_response = data.get('%b')
        if size_of_response.isdigit():
            size_of_response = int(size_of_response)
        else:
            size_of_response = 0

        # Build the log entry dictionary
        log_entry = {
            'logfile': logfile_obj,
            'time_of_request': datetime.datetime(
                int(date_yyyy), int(date_mm), int(date_dd),
                int(time_hh), int(time_mm), int(time_ss[0:2])  # Cut off the +0000
            ),
            'server': server,
            'remote_logname': str(data.get('%l'))[:200],
            'remote_user': str(data.get('%u'))[:200],
            'remote_rdns': remote_rdns,
            'status_code': status_code,
            'size_of_response': size_of_response,
            'file_request': file_request,
            'referer': referer,
            'user_agent': user_agent,
        }
        # self._debug('log_entry=' + str(log_entry))

        # Create if there isn't already a duplicate record in place
        # obj, created = self._get_or_create_log_entry(
        #     time_of_request = log_entry.get('time_of_request'),
        #     server = log_entry.get('server'),
        #     remote_rdns = log_entry.get('remote_rdns'),
        #     size_of_response = log_entry.get('size_of_response'),
        #     status_code = log_entry.get('status_code'),
        #     file_request = log_entry.get('file_request'),
        #     defaults = log_entry)
        obj, created = self._get_or_create_log_entry(defaults = log_entry)
        # Analyse obj.file_request.argument_string & obj.referer.full_string as part of another process
        # self._debug('============================')
        return None

    # def _get_or_create_log_entry(self, time_of_request, server, remote_rdns, size_of_response,
    #                              status_code, file_request, defaults = {}):
    def _get_or_create_log_entry(self, defaults = {}):
        obj = ApacheLogEntry()
        # Set this manually, longhand, because the for key,value loop causes errors
        obj.logfile = defaults.get('logfile')
        obj.time_of_request = defaults.get('time_of_request')
        obj.server = defaults.get('server')
        obj.remote_logname = defaults.get('remote_logname')
        obj.remote_user = defaults.get('remote_user')
        obj.remote_rdns = defaults.get('remote_rdns')
        obj.status_code = defaults.get('status_code')
        obj.size_of_response = defaults.get('size_of_response')
        obj.file_request = defaults.get('file_request')
        obj.referer = defaults.get('referer')
        obj.user_agent = defaults.get('user_agent')
        # Removing the need for a memory cache as we don't acknowledge duplicates and trust
        # the log file is exactly as we want: CM 9-12-11
        obj.save()

        # # Trusting that items in the import log appear in chronological order
        # if len(self.cache_log_entry) == 0 or len(self.cache_log_entry) > (self.cache_log_entry_size*2):
        #     time_limit = time_of_request + datetime.timedelta(minutes=10)
        #     self.cache_log_entry = list(ApacheLogEntry.objects.filter(
        #         time_of_request__gte=time_of_request, time_of_request__lte=time_limit,
        #         server=server).order_by('time_of_request')[:self.cache_log_entry_size])
        #
        # # Attempt to locate in memory cache
        # for item in self.cache_log_entry:
        #     if item.time_of_request == obj.time_of_request and item.server == obj.server and \
        #        item.remote_rdns == obj.remote_rdns and item.size_of_response == obj.size_of_response and \
        #        item.status_code == obj.status_code and item.file_request == obj.file_request:
        #         self._errorlog("##### DUPLICATE RECORD AT INSERTION DETECTED ##### \n" +\
        #                        "Database row id: " + str(item.id) + "\n" +\
        #                        "DB: " + str(item) + "\n" +\
        #                        "Logfile line number:" + str(self.import_stats.get('line_counter')) + "\n")
        #         self.import_stats['duplicatecount'] = self.import_stats.get('duplicatecount') + 1
        #         return item, False
        #
        # # Shortcut to escape if this is purely a single process import
        # if self.single_import:
        #     obj.save()
        #     self.cache_log_entry.append(obj)
        #     return obj, True
        #
        # # Couldn't find it in the list, check the database in case another process has added it
        # try:
        #     obj = ApacheLogEntry.objects.get(time_of_request = obj.time_of_request, server = obj.server,
        #                                      remote_rdns = obj.remote_rdns, size_of_response = obj.size_of_response,
        #                                      status_code = obj.status_code, file_request = obj.file_request)
        # except ApacheLogEntry.DoesNotExist:
        #     obj.save()
        # except ApacheLogEntry.MultipleObjectsReturned:
        #     self._errorlog("Funky shit just happened(!). MultipleObjectsReturned: " + str(obj) + "\n")
        # self.cache_log_entry.append(obj)

        return obj, True

    def _status_code_validation(self, status_code):
        "Check the supplied status code value against known good codes"
        for item in ApacheLogEntry.STATUS_CODE_CHOICES:
            if status_code == item[0]:
                return status_code
        return 0

    def _ip_to_domainname(self, ipaddress):
        "Returns the domain name for a given IP where known"
        # self._debug('_ip_to_domainname('+str(ipaddress)+') called')
        # Validate IP address
        # try:
        adr = IP(ipaddress)  # PUT ERROR HANDLING IN HERE!

        rdns = Rdns()
        rdns.ip_address = adr.strNormal(0)
        rdns.resolved_name = 'Unknown'
        rdns.last_updated = datetime.datetime.utcnow()

        # Attempt to locate in memory cache
        for item in self.cache_rdns:
            if item.ip_address == rdns.ip_address:
                return item

        # Couldn't find it in the list, check the database in case another process has added it
        try:
            if self.single_import:
                # Shortcut if parallel import disabled
                raise Rdns.DoesNotExist
            rdns = Rdns.objects.get(ip_address = rdns.ip_address)
        except Rdns.DoesNotExist:
            # Go get the location for this address
            rdns.country_code = self.geoip.country_code_by_addr(rdns.ip_address)
            rdns.country_name = self.geoip.country_name_by_addr(rdns.ip_address)
            rdns.save()
        self.cache_rdns.insert(0, rdns)
        return rdns

    def _file_request(self, request_string):
        "Get or create a FileRequest object for a given request string"
        # self._debug('_file_request(' + request_string + ')')
        # Example request strings:
        #   GET /philfac/lockelectures/locke_album_cover.jpg HTTP/1.1
        #   GET / HTTP/1.0
        #   GET /robots.txt HTTP/1.0
        #   GET /astro/introduction/astronomy_intro-medium-audio.mp3?CAMEFROM=podcastsGET HTTP/1.1
        #   \xc5^) <--- Example bad data in log from 2011
        fr = FileRequest()

        # Crude splitting... first on spaces, then on file/querystring
        ts = request_string.split()
        if len(ts) != 3:
            # Something is wrong with this request string. Exit and stop processing this record
            return None
        fs = ts[1].split('?')

        # Validate method: either GET or POST or HEAD, otherwise treat as corrupted
        fr.method = ts[0]
        if fr.method != "GET" and fr.method != "POST" and fr.method != "HEAD":
            return None
        fr.uri_string = str(fs[0])
        fr.protocol = str(ts[2])[:20]

        # Querystring is optional, so test for it first.
        if len(fs) == 2:
            fr.argument_string = fs[1]
        else:
            fr.argument_string = ""

        # Attempt to locate in memory cache
        for item in self.cache_file_request:
            if item.method == fr.method and item.uri_string == fr.uri_string and \
               item.argument_string == fr.argument_string and item.protocol == fr.protocol:
                return item

        # Couldn't find it in the list, check the database in case another process has added it
        try:
            if self.single_import:
                # Shortcut if parallel import disabled
                raise FileRequest.DoesNotExist
            fr = FileRequest.objects.get(method = fr.method, uri_string = fr.uri_string,
                                         argument_string = fr.argument_string, protocol = fr.protocol)
        except FileRequest.DoesNotExist:
            fr.save()
        self.cache_file_request.insert(0, fr)
        return fr

    def _referer(self, referer_string, status_code):
        "Get or create a Referer record for the given string"
        ref = Referer()
        ref.full_string = ""
        if status_code in (200, 206, 304):
            ref.full_string = referer_string

        # Attempt to locate in memory cache
        for item in self.cache_referer:
            if item.full_string == ref.full_string:
                return item

        # Couldn't find it in the list, check the database in case another process has added it
        try:
            if self.single_import:
                # Shortcut if parallel import disabled
                raise Referer.DoesNotExist
            ref = Referer.objects.get(full_string=ref.full_string)
        except Referer.DoesNotExist:
            ref.save()
        self.cache_referer.insert(0, ref)
        return ref

    def _user_agent(self, agent_string):
        "Get or create a UserAgent record for the given string"
        user_agent = UserAgent()
        # Store the full string for later analysis
        user_agent.full_string = agent_string
        # Create some defaults that we'll likely overwrite. OS and UA can be null, so ignore.
        user_agent.type = ""

        # Attempt to locate in memory cache
        for item in self.cache_user_agent:
            if item.full_string == user_agent.full_string:
                return item

        # Couldn't find it in the list, check the database in case another process has added it
        try:
            if self.single_import:
                # Shortcut if parallel import disabled
                raise UserAgent.DoesNotExist
            user_agent = UserAgent.objects.get(full_string=user_agent.full_string)
        except UserAgent.DoesNotExist:
            # Parse the string to extract the easy bits
            try:
                uas_dict = self.uasp.parse(user_agent.full_string)
                # Set the type string
                user_agent.type = uas_dict.get('typ')[:50]
                # Deal with the OS record
                os = {}
                os['company'] = uas_dict.get('os_company')[:200]
                os['family'] = uas_dict.get('os_family')[:100]
                os['name'] = uas_dict.get('os_name')[:200]
                # Now get or create an OS record
                user_agent.os, created = OS.objects.get_or_create(
                    company = os.get('company'),
                    family = os.get('family'),
                    name = os.get('name'),
                    defaults = os)
                if created:
                    user_agent.os.save()
                # Deal with the UA record
                ua = {}
                ua['company'] = uas_dict.get('ua_company')[:200]
                ua['family'] = uas_dict.get('ua_family')[:100]
                ua['name'] = uas_dict.get('ua_name')[:200]
                # Now get or create a UA record
                user_agent.ua, created = UA.objects.get_or_create(
                    company = ua.get('company'),
                    family = ua.get('family'),
                    name = ua.get('name'),
                    defaults = ua)
                if created:
                    user_agent.ua.save()
            except UASException:
                self._errorlog('_user_agent() parsing FAILED. agent_string=' + str(agent_string) + "\n")
            # Not there, so write to database
            user_agent.save()
        # Update the cache
        self.cache_user_agent.insert(0, user_agent)
        return user_agent

    def _server(self, server_name, server_ip, server_port):
        "Store the server information"
        server = Server()
        server.name = server_name[:200]
        server.ip_address = server_ip
        server.port = int(server_port)

        # Attempt to locate in memory cache
        for item in self.cache_server:
            if item.name == server.name and item.ip_address == server.ip_address and \
               item.port == server.port:
                return item

        # Couldn't find it in the list, check the database in case another process has added it
        try:
            if self.single_import:
                # Shortcut if parallel import disabled
                raise Server.DoesNotExist
            server = Server.objects.get(name=server.name, ip_address=server.ip_address, port=server.port)
        except Server.DoesNotExist:
            # Not there, so write to database and to cache
            server.save()
            self.cache_server.insert(0, server)
        return server

    # DEBUG AND INTERNAL HELP METHODS ==============================================================

    def _debug(self, error_str):
        "Basic optional debug function. Print the string if enabled"
        if self.debug:
            print 'DEBUG:' + str(error_str) + '\n'
        return None

    def _errorlog(self, error_str):
        "Write errors to the string cache, flushed to disk by _error_log_save()"
        # sys.stderr.write('ERROR:' + str(error_str) + '\n')
        # self.error_log.write('ERROR:' + str(error_str) + '\n')
        self.error_cache += 'ERROR:' + str(error_str) + '\n'
        return None

    def _errorlog_start(self, path_to_file):
        try:
            self.error_log = open(path_to_file, 'a')
        except IOError:
            sys.stderr.write("WARNING: Could not open existing error file. New file being created")
            self.error_log = open(path_to_file, 'w')
        self.error_log.write("Log started at " + str(datetime.datetime.utcnow()) + "\n")
        print "Writing errors to: " + path_to_file
        return None

    def _error_log_save(self):
        "Flush the error string cache to the log file"
        self.error_log.write(self.error_cache)
        self.error_cache = ""
        return None

    def _errorlog_stop(self):
        self.error_log.write("Log ended at " + str(datetime.datetime.utcnow()) + "\n")
        self.error_log.close()
        return None
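
# A hedged usage sketch: the command name "import_apachelogs" is an assumption,
# since Django derives the real name from this module's filename (not shown
# here); the options match option_list above.
#
#   python manage.py import_apachelogs /var/log/apache2/access.log \
#       --startline=1 --cache-size=100 --log-service=mpoau --single-import=True
#
# Note that handle_label() refuses .gz files, so decompress logs before importing.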
# ============================================================================
# Second management command: imports the Apple raw iTunes U logfiles. This
# Command class lives in its own management command module; the imports below
# are reconstructed, and the project-local paths (opms.settings, opms.debug)
# are assumptions based on the names used in the code.
# ============================================================================
import csv
import datetime
import os
from datetime import timedelta

import pygeoip
import pytz
from django.core.management.base import LabelCommand
from IPy import IP
from uasparser import UASparser, UASException

from opms import debug                    # Assumed project logging helper
from opms.settings import PROJECT_ROOT    # Assumed settings constant
from opms.stats.models import (LogFile, Rdns, UserAgent, OS, UA,
                               AppleRawLogEntry, AppleRawLogDailySummary)


class Command(LabelCommand):
    args = '<path/to/logfiles/>'
    help = 'Imports the contents of the Apple Raw logfiles into the database.'
    # option_list = LabelCommand.option_list + (
    #     make_option('--startline', action='store', dest='start_at_line', default=1,
    #                 help='Optional start line to allow resumption of large log files. Default is 1.'),
    #     make_option('--single-import', action='store', dest='single_import', default=True,
    #                 help='Speeds up import rate by disabling support for parallel imports.'),
    # )

    def __init__(self):
        # Single GeoIP object for referencing
        self.geoip = pygeoip.GeoIP(os.path.join(PROJECT_ROOT, "stats/geoip_data/GeoIP.dat"), pygeoip.MMAP_CACHE)
        # Single UASparser instance for referencing
        self.uasp = UASparser(cache_dir=os.path.join(PROJECT_ROOT, "stats/ua_data/"))
        # datetime value for any rdns timeout problems
        self.rdns_timeout = 0
        # Record basic information about the import process for reporting
        self.import_stats = dict()
        # Record summary counts of action types parsed through this file
        self.summary = dict()
        # Cache objects to hold subtables in memory
        self.cache_user_agent = list(UserAgent.objects.all())
        self.cache_rdns = list(Rdns.objects.all())
        # # Option flag to enable or disable the parallel import safety checks
        # self.single_import = True
        super(Command, self).__init__()

    def _list_files(self, path):
        file_list = []
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.lower()[-4:] == ".txt":
                    file_list.append(os.path.abspath(os.path.join(root, file)))
        return file_list

    def handle_label(self, path, **options):
        verbosity = int(options.get('verbosity', 0))
        if verbosity > 1:
            debug.DEBUG = True

        # # Some basic checking
        # if not filename.endswith('.txt'):
        #     raise CommandError("This is not a text (.txt) log file.\n\n")

        # Scan directory for files, compare them to names in the existing LogFile list.
        # Import the first X new files.
        found_files_list = self._list_files(path)
        found_files_list.sort()  # Trust the naming conventions to put a sortable date on them
        import_file_limit = 21
        if len(found_files_list) < import_file_limit:
            import_file_limit = len(found_files_list)
        # Have this only work on the one file when debug is switched on
        if debug.DEBUG:
            import_file_limit = 1
        print "{0} files have been found. Importing up to {1} of them now.".format(
            len(found_files_list), import_file_limit)

        for filename in found_files_list:
            if import_file_limit > 0:
                # Reset statistics
                self.import_stats['filename'] = filename
                self.import_stats['line_counter'] = 0
                self.import_stats['line_count'] = 0
                self.import_stats['import_starttime'] = datetime.datetime.now(pytz.utc)
                self.import_stats['import_startline'] = int(options.get('start_at_line', 1))

                # This only needs setting/getting the once per call of this function
                logfile_obj, created = self._logfile(filename, 'itu-raw')
                if not created:
                    err_string = "This file has already been imported: ({0})".format(filename)
                    debug.onscreen(err_string)
                    continue
                import_file_limit -= 1

                print "Import of [{0}] started at {1:%Y-%m-%d %H:%M:%S}\n".format(
                    filename, datetime.datetime.now(pytz.utc))

                # Create an error log per import file
                debug.errorlog_start(filename + '_import-error.log')

                # Send the file off to be parsed
                self._parsefile(logfile_obj)

                # Final stats output at end of file
                try:
                    self.import_stats['import_duration'] = float((datetime.datetime.now(pytz.utc) -
                                                                  self.import_stats.get('import_starttime')).seconds)
                    self.import_stats['import_rate'] = float(self.import_stats.get('line_counter') -
                                                             self.import_stats.get('import_startline')) /\
                                                       self.import_stats['import_duration']
                except ZeroDivisionError:
                    self.import_stats['import_rate'] = 0

                # Write the error cache to disk
                debug.errorlog_stop()

                print """
                Import finished at {0:%Y-%m-%d %H:%M:%S}
                {1:d} Lines parsed over {2:.1f} seconds
                Giving a rate of {3:.3f} lines/sec
                """.format(datetime.datetime.now(pytz.utc),
                           self.import_stats.get('line_counter'),
                           self.import_stats.get('import_duration'),
                           self.import_stats.get('import_rate'))
        return None

    def _logfile(self, filename, log_service):
        """Get or create a LogFile record for the given filename"""
        # Simple hack for this method initially...
        logfile = dict()
        logfile['service_name'] = log_service
        try:
            logfile['file_name'] = filename[filename.rindex('/')+1:]
            logfile['file_path'] = filename[:filename.rindex('/')+1]
        except ValueError:
            # Likely the path doesn't feature any directories... so improvise
            logfile['file_name'] = filename
            logfile['file_path'] = "./"
        logfile['last_updated'] = datetime.datetime.now(pytz.utc)

        obj, created = LogFile.objects.get_or_create(
            service_name = logfile.get('service_name'),
            file_name = logfile.get('file_name'),
            # file_path = logfile.get('file_path'),  # Filenames are unique for the raw logs, so skip this
            defaults = logfile)
        # If this isn't the first time, and the datetime is significantly different from last access, update the time
        if not created and (logfile.get('last_updated') - obj.last_updated).days > 0:
            obj.last_updated = logfile.get('last_updated')
            obj.save()
        return obj, created

    def _parsefile(self, logfile_obj):
        filename = logfile_obj.file_path + logfile_obj.file_name
        datareader = csv.reader(open(filename), dialect='excel-tab')
        tsvdata = []
        # The data structure changes over time, and new columns are added (and perhaps removed),
        # so map each file to a dictionary keyed on the header row
        column_headers = []
        for i, row in enumerate(datareader):
            if i == 0:
                for col, title in enumerate(row):
                    column_headers.append(title.lower())
            else:
                row_dict = dict()
                for col, title in enumerate(column_headers):
                    row_dict[title] = row[col]
                tsvdata.append(row_dict)
                self.import_stats['line_counter'] = i
        self.import_stats['line_count'] = len(tsvdata)

        # Reset line counter for parsing scan
        self.import_stats['line_counter'] = self.import_stats.get('import_startline')
        for i in range(self.import_stats.get('line_counter')-1, len(tsvdata)):
            self._parseline(tsvdata[i], logfile_obj)
            self.import_stats['line_counter'] = i

            # Print progress report every 500 lines.
            if (self.import_stats.get('line_counter') % 500) == 0:
                # Calculate the average rate of import for the whole process
                try:
                    self.import_stats['import_rate'] = \
                        float(self.import_stats.get('line_counter') - self.import_stats.get('import_startline')) /\
                        float((datetime.datetime.now(pytz.utc) - self.import_stats.get('import_starttime')).seconds)
                except ZeroDivisionError:
                    self.import_stats['import_rate'] = 1
                # Calculate how long till finished
                try:
                    efs = int(float(self.import_stats.get('line_count') - self.import_stats.get('line_counter')) /
                              float(self.import_stats.get('import_rate')))
                except ZeroDivisionError:
                    efs = 1
                efhr = efs // (60*60)
                efs = efs % (60*60)
                efmin = efs // 60
                efsec = efs % 60
                efstring = "{0:d}h {1:d}m {2:d}s".format(efhr, efmin, efsec)
                # Output the status
                output = "{0:%Y-%m-%d %H:%M:%S} : {1:.1%} completed. Parsed {2:d} lines. Rate: {3:.2f} lines/sec. " \
                         "Estimated finish in {4}".format(
                             datetime.datetime.now(pytz.utc),
                             float(self.import_stats.get('line_counter')) / float(self.import_stats.get('line_count')),
                             self.import_stats.get('line_counter'),
                             self.import_stats.get('import_rate'),
                             efstring)
                debug.errorlog(output, display=True)
                # Write the error cache to disk
                debug.errorlog_save()

        # Create a summary record for this day's data
        debug.errorlog("Daily summary: " + str(self.summary), display=True)
        # print "Daily summary: \n" + str(self.summary)
        ds, created = AppleRawLogDailySummary.objects.get_or_create(
            date = self.summary.get("date", None),
            defaults = self.summary)
        ds.save()

        # Reset the values after each file!
        self.summary = dict()
        return None

    def _parseline(self, entrydict, logfile_obj):
        # # Build the log entry dictionary
        # arle = {
        #     "logfile": logfile_obj,
        #     "artist_id": long(entrydict.get("artist_id")),
        #     "itunes_id": long(entrydict.get("itunes_id")),
        #     "action_type": self._action_type_validation(entrydict.get("action_type")),
        #     "title": entrydict.get("title", "Unknown"),
        #     "url": entrydict.get("url", ""),
        #     "episode_id": long(entrydict.get("episode_id", 0)),
        #     "episode_title": entrydict.get("episode_title", None),
        #     "episode_type": entrydict.get("episode_type", None),
        #     "storefront": int(entrydict.get("storefront", 0)),
        #     "user_agent": self._user_agent(entrydict.get("useragent", "")),
        #     "ipaddress": self._ip_to_domainname(entrydict.get("ip_address", None)),
        #     "timestamp": self._parse_timestamp(entrydict.get("timestamp")),
        #     "user_id": entrydict.get("user_id", "")
        # }
        debug.onscreen("_parseline():entrydict=" + str(entrydict))

        # Build the log entry record
        arle = AppleRawLogEntry()
        arle.logfile = logfile_obj
        try:
            arle.artist_id = long(entrydict.get("artist_id"))
        except (ValueError, TypeError):
            arle.artist_id = -1
        try:
            arle.itunes_id = long(entrydict.get("itunes_id"))
        except (ValueError, TypeError):
            arle.itunes_id = -1
        arle.action_type = self._action_type_validation(entrydict.get("action_type"))
        arle.title = entrydict.get("title", "Unknown")
        arle.url = entrydict.get("url", "")
        try:
            arle.episode_id = long(entrydict.get("episode_id"))
        except (ValueError, TypeError):
            arle.episode_id = None
        arle.episode_title = entrydict.get("episode_title", None)
        arle.episode_type = entrydict.get("episode_type", None)
        arle.storefront = self._storefront(entrydict.get("storefront", "0"))
        arle.user_agent = self._user_agent(entrydict.get("useragent", ""))
        arle.ipaddress = self._ip_to_domainname(entrydict.get("ip_address", None))
        arle.timestamp = self._parse_timestamp(entrydict.get("timestamp"))
        arle.user_id = entrydict.get("user_id", "")
        arle.save(force_insert=True)

        # Add to the daily summary dictionary
        if self.summary.get("date", None) is None:
            self.summary["date"] = "{0:%Y-%m-%d}".format(arle.timestamp)
        if arle.action_type == "AutoDownload":
            self.summary["auto_download"] = int(self.summary.get("auto_download", 0) + 1)
        elif arle.action_type == "Browse":
            self.summary["browse"] = int(self.summary.get("browse", 0) + 1)
        elif arle.action_type == "Download":
            self.summary["download"] = int(self.summary.get("download", 0) + 1)
        elif arle.action_type == "DownloadAll":
            self.summary["download_all"] = int(self.summary.get("download_all", 0) + 1)
        elif arle.action_type == "Stream":
            self.summary["stream"] = int(self.summary.get("stream", 0) + 1)
        elif arle.action_type == "Subscribe":
            self.summary["subscribe"] = int(self.summary.get("subscribe", 0) + 1)
        elif arle.action_type == "SubscriptionEnclosure":
            self.summary["subscription_enclosure"] = int(self.summary.get("subscription_enclosure", 0) + 1)
        else:
            self.summary["unknown"] = int(self.summary.get("unknown", 0) + 1)
        return None

    def _action_type_validation(self, action_string):
        """Analyse the supplied action type string, and see if it's one we know about already.
        If it isn't, then don't panic, and store it anyway. We just need to update the choices
        list based on what we see in the error logs.
        """
        for item in AppleRawLogEntry.ACTION_TYPE_CHOICES:
            if action_string == item[0]:
                return action_string
        debug.errorlog("#### PROBLEM WITH THIS ENTRY. Unknown Action Type ({0}) Line: {1:d}\n".format(
            action_string, self.import_stats.get('line_counter')))
        return action_string

    def _ip_to_domainname(self, ipaddress):
        """Returns the domain name for a given IP where known"""
        # debug.onscreen('_ip_to_domainname('+str(ipaddress)+') called')
        # These are partial ipaddresses of the format nnn.nnn.x.x, so replace the x with 0 as a guess.
        if ipaddress and len(ipaddress) > 8:  # i.e. not None
            adr = IP(ipaddress.replace('x', '0'))
            rdns = Rdns()
            rdns.ip_address = adr.strNormal(0)
            rdns.resolved_name = 'Partial'
            rdns.last_updated = datetime.datetime.now(pytz.utc)

            # Attempt to locate in memory cache
            for item in self.cache_rdns:
                if item.ip_address == rdns.ip_address:
                    return item

            # Couldn't find it in the list
            rdns.country_code = self.geoip.country_code_by_addr(rdns.ip_address)
            rdns.country_name = self.geoip.country_name_by_addr(rdns.ip_address)
            rdns.save()
            self.cache_rdns.insert(0, rdns)
            return rdns
        else:
            return None

    def _storefront(self, initial_value):
        if initial_value.isdigit():
            return int(initial_value)
        else:
            return 0

    def _user_agent(self, agent_string):
        "Get or create a UserAgent record for the given string"
        user_agent = UserAgent()
        # Store the full string for later analysis
        user_agent.full_string = agent_string
        # Create some defaults that we'll likely overwrite. OS and UA can be null, so ignore.
        user_agent.type = ""

        # Attempt to locate in memory cache
        for item in self.cache_user_agent:
            if item.full_string == user_agent.full_string:
                return item

        # Parse the string to extract the easy bits
        try:
            uas_dict = self.uasp.parse(user_agent.full_string)
            # Set the type string
            user_agent.type = uas_dict.get('typ')[:50]
            # Deal with the OS record (NB: this local dict shadows the os module)
            os = {}
            os['company'] = uas_dict.get('os_company')[:200]
            os['family'] = uas_dict.get('os_family')[:100]
            os['name'] = uas_dict.get('os_name')[:200]
            # Now get or create an OS record
            user_agent.os, created = OS.objects.get_or_create(
                company = os.get('company'),
                family = os.get('family'),
                name = os.get('name'),
                defaults = os)
            if created:
                user_agent.os.save()
            # Deal with the UA record
            ua = {}
            ua['company'] = uas_dict.get('ua_company')[:200]
            ua['family'] = uas_dict.get('ua_family')[:100]
            ua['name'] = uas_dict.get('ua_name')[:200]
            # Now get or create a UA record
            user_agent.ua, created = UA.objects.get_or_create(
                company = ua.get('company'),
                family = ua.get('family'),
                name = ua.get('name'),
                defaults = ua)
            if created:
                user_agent.ua.save()
        except UASException:
            debug.errorlog('_user_agent() parsing FAILED. agent_string=' + str(agent_string) + "\n")

        # Not there, so write to database
        user_agent.save()
        # Update the cache
        self.cache_user_agent.insert(0, user_agent)
        return user_agent

    def _parse_timestamp(self, initialstring):
        """Adjust timestamp supplied to GMT and return a datetime object"""
        input_format = "%Y-%m-%d %H:%M:%S"
        base_time = datetime.datetime.strptime(initialstring[:-9], input_format)
        try:
            offset = int(initialstring[-5:])
            delta = timedelta(hours = offset / 100)
            ts = base_time - delta
        except ValueError:
            # No parseable UTC offset on the end of the string; take the time as given
            ts = base_time
        # dt = datetime.datetime.fromtimestamp(time.mktime(ts))
        return ts.replace(tzinfo = pytz.utc)
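
# A hedged usage sketch: the command name "import_applerawlogs" is an
# assumption, since Django derives the real name from this module's filename
# (not shown here). handle_label() walks the given directory for .txt files
# and imports up to 21 not-yet-seen files per run:
#
#   python manage.py import_applerawlogs /path/to/logfiles/ --verbosity=2
#
# --verbosity is Django's standard option; a value above 1 switches on
# debug.DEBUG, which also limits the run to a single file.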