Example #1
    def __init__(self):
        # Single GeoIP object for referencing
        self.geoip = pygeoip.GeoIP('/home/carl/Projects/opms_master/OPMS/data/geoip/GeoIP.dat',pygeoip.MMAP_CACHE)
        # Single UASparser instance for referencing
        self.uasp = UASparser(cache_dir="/home/carl/Projects/opms_master/OPMS/opms/stats/ua_data/")
        self.uasp_format = ""
        # datetime value for any rdns timeout problems
        self.rdns_timeout = 0
        # Toggle debug statements on/off
        self.debug = False
        # Record basic information about the import process for reporting
        self.import_stats = {}
        # Error logging file and string cache
        self.error_log = ""
        self.error_cache = ""
        # Cache objects to hold subtables in memory
        self.cache_user_agent = list(UserAgent.objects.all())
        self.cache_rdns = list(Rdns.objects.all())
        self.cache_referer = list(Referer.objects.all())
        self.cache_file_request = list(FileRequest.objects.all())
        self.cache_server = list(Server.objects.all())
        # Log entry cache only prefetches a set number of records from the current timestamp
        self.cache_log_entry = []
        self.cache_log_entry_size = 100
        # Option flag to enable or disable the parallel import safety checks
        self.single_import = False

        super(Command, self).__init__()
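
For orientation, the two lookup helpers created in this constructor are the ones the later examples call into. Below is a minimal, self-contained sketch of those calls; the file paths, the sample IP and the sample user-agent string are placeholders of my own, and the UASparser import line is an assumption since the snippets never show their imports.

import pygeoip
from uasparser import UASparser, UASException   # assumed import path; not shown in the snippets

# Placeholder paths: point these at your own GeoIP.dat and UAS cache directory
geoip = pygeoip.GeoIP('/path/to/GeoIP.dat', pygeoip.MMAP_CACHE)
uasp = UASparser(cache_dir='/path/to/ua_data/')

# Country lookup, as _ip_to_domainname() does further down
print geoip.country_code_by_addr('8.8.8.8')    # e.g. 'US'
print geoip.country_name_by_addr('8.8.8.8')    # e.g. 'United States'

# User-agent parsing, as _user_agent() does; the result is a dict with keys such as
# 'typ', 'os_company', 'os_family', 'os_name', 'ua_company', 'ua_family', 'ua_name'
try:
    uas_dict = uasp.parse('Mozilla/5.0 (Windows NT 6.1; rv:9.0) Gecko/20100101 Firefox/9.0')
    print uas_dict.get('typ'), uas_dict.get('os_family'), uas_dict.get('ua_family')
except UASException:
    print 'user-agent string could not be parsed'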
Example #2
    def __init__(self):
        # Single GeoIP object for referencing
        self.geoip = pygeoip.GeoIP(os.path.join(PROJECT_ROOT, "stats/geoip_data/GeoIP.dat"),pygeoip.MMAP_CACHE)
        # Single UASparser instance for referencing
        self.uasp = UASparser(cache_dir=os.path.join(PROJECT_ROOT, "stats/ua_data/"))
        # datetime value for any rdns timeout problems
        self.rdns_timeout = 0
        # Record basic information about the import process for reporting
        self.import_stats = dict()
        # Record summary counts of action types parsed through this file
        self.summary = dict()
        # Cache objects to hold subtables in memory 
        self.cache_user_agent = list(UserAgent.objects.all())
        self.cache_rdns = list(Rdns.objects.all())
#        # Option flag to enable or disable the parallel import safety checks
#        self.single_import = True
        
        super(Command, self).__init__()
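
The cache_* lists built in these constructors back a lookup pattern that every helper in Example #3 below repeats: scan the in-memory list first, fall back to the database only on a miss (unless --single-import disables the parallel-import check), then push the result onto the front of the list because log values tend to repeat in bursts. A distilled sketch of that pattern follows, modelled directly on the _referer() helper of Example #3; the method name lookup_referer is mine, and it assumes the Referer model and the cache_referer / single_import attributes defined there.

    def lookup_referer(self, full_string):
        "Sketch of the memory-cache-then-database pattern used by _referer(), _user_agent() and _server()"
        candidate = Referer()
        candidate.full_string = full_string

        # 1. Check the in-memory cache first to avoid a database round trip
        for item in self.cache_referer:
            if item.full_string == candidate.full_string:
                return item

        # 2. On a miss, check the database in case a parallel import has already added the row
        try:
            if self.single_import:
                # Parallel-import safety disabled: skip the extra query and just save
                raise Referer.DoesNotExist
            candidate = Referer.objects.get(full_string=candidate.full_string)
        except Referer.DoesNotExist:
            candidate.save()

        # 3. Newest entries go to the front of the cache, since repeats cluster together
        self.cache_referer.insert(0, candidate)
        return candidate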
Example #3
class Command(LabelCommand):
    args = 'filename'
    help = 'Imports the contents of the specified logfile into the database.'
    option_list = LabelCommand.option_list + (
        make_option('--startline', action='store', dest='start_at_line',
            default=1, help='Optional start line to allow resumption of large log files. Default is 1.'),
        make_option('--cache-size', action='store', dest='cache_size',
            default=100, help='Number of records to prefetch in the ApacheLogEntry lookup. Default is 100.'),
        make_option('--log-service', action='store', dest='log_service',
            default='mpoau', help='What service has produced this log? Used to determine the apache format expression. Default is "mpoau".'),
        make_option('--single-import', action='store', dest='single_import',
            default=False, help='Speeds up import rate by disabling support for parallel imports.'),
    )
    
    def __init__(self):
        # Single GeoIP object for referencing
        self.geoip = pygeoip.GeoIP('/home/carl/Projects/opms_master/OPMS/data/geoip/GeoIP.dat',pygeoip.MMAP_CACHE)
        # Single UASparser instance for referencing
        self.uasp = UASparser(cache_dir="/home/carl/Projects/opms_master/OPMS/opms/stats/ua_data/")
        self.uasp_format = ""
        # datetime value for any rdns timeout problems
        self.rdns_timeout = 0
        # Toggle debug statements on/off
        self.debug = False
        # Record basic information about the import process for reporting
        self.import_stats = {}
        # Error logging file and string cache
        self.error_log = ""
        self.error_cache = ""
        # Cache objects to hold subtables in memory 
        self.cache_user_agent = list(UserAgent.objects.all())
        self.cache_rdns = list(Rdns.objects.all())
        self.cache_referer = list(Referer.objects.all())
        self.cache_file_request = list(FileRequest.objects.all())
        self.cache_server = list(Server.objects.all())
        # Log entry cache only prefetches a set number of records from the current timestamp
        self.cache_log_entry = []
        self.cache_log_entry_size = 100
        # Option flag to enable or disable the parallel import safety checks
        self.single_import = False
        
        
        super(Command, self).__init__()


    def handle_label(self, filename, **options):
        print "Import started at " + str(datetime.datetime.utcnow()) + "\n"

        # Some basic checking
        if filename.endswith('.gz'):
            raise CommandError("This file is still compressed. Uncompress and try again.\n\n")

        # Create an error log per import file
        self._errorlog_start(filename + '_import-error.log')
     
        # Test that the log_service option is valid. Use the same list as LogFile.SERVICE_NAME_CHOICES
        log_service = str(options.get('log_service', 'mpoau'))
        # Add the remaining services here when we start testing with that data
        if log_service == 'mpoau':
            # Assume mpoau logfiles
            self.uasp_format = r'%Y-%m-%dT%H:%M:%S%z %v %A:%p %h %l %u \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
        else:
            raise CommandError("This service name is unknown: " + log_service + ".\n\n")
        
        # Reset statistics
        self.import_stats['filename'] = filename
        self.import_stats['line_counter'] = 0
        self.import_stats['line_count'] = 0
        self.import_stats['duplicatecount'] = 0
        self.import_stats['import_starttime'] = datetime.datetime.utcnow()
        self.import_stats['import_startline'] = int(options.get('start_at_line', 1))
        
        self.cache_log_entry_size = int(options.get('cache_size', 100))
        if options.get('single_import', False) != False:
            print "Processing file with parallel import safties disabled."
            self.single_import = True
        
        
        # This only needs setting/getting once per call of this function
        logfile_obj = self._logfile(filename, log_service)
        
        # Send the file off to be parsed
        self._parsefile(logfile_obj)

        # Final stats output at end of file
        try:
            self.import_stats['import_duration'] = float((datetime.datetime.utcnow() - self.import_stats.get('import_starttime')).seconds)
            self.import_stats['import_rate'] = float(self.import_stats.get('line_counter')-self.import_stats.get('import_startline')) /\
                                                    self.import_stats['import_duration']
        except ZeroDivisionError:
            self.import_stats['import_rate'] = 0
        
#        print "\nImport finished at " + str(datetime.datetime.utcnow()) +\
#            "\nLines parsed: " + str(self.import_stats.get('line_counter')) +\
#            "\nDuplicates: " + str(self.import_stats.get('duplicatecount')) +\
#            "\nImported at " + str(self.import_stats.get('import_rate'))[0:6] + " lines/sec\n"
        print """
            Import finished at %s
            %s Lines parsed over %s seconds
            Giving a rate of %s lines/sec
            """ % (datetime.datetime.utcnow(),
                   self.import_stats.get('line_counter'),
                   self.import_stats.get('import_duration'),
                   str(self.import_stats.get('import_rate'))[0:6]
            )
        
        # Write the error cache to disk
        self._error_log_save()
        self._errorlog_stop()
            
        return None



    def _logfile(self, filename, log_service):
        "Get or create a LogFile record for the given filename"
        
        # Simple hack for this method initially...
        logfile = {}
        logfile['service_name'] = log_service
        try:
            logfile['file_name'] = filename[filename.rindex('/')+1:]
            logfile['file_path'] = filename[:filename.rindex('/')+1]
        except ValueError:
            # Likely path doesn't feature any directories... so improvise
            logfile['file_name'] = filename
            logfile['file_path'] = "./"
            
        logfile['last_updated'] = datetime.datetime.utcnow()
        
        obj, created = LogFile.objects.get_or_create(
            service_name = logfile.get('service_name'),
            file_name = logfile.get('file_name'),
            file_path = logfile.get('file_path'),
            defaults = logfile)
        
        # If this isn't the first time, and the datetime is significantly different from last access, update the time
        if not created and (logfile.get('last_updated') - obj.last_updated).days > 0:
            obj.last_updated = logfile.get('last_updated')
        
        obj.save()

        return obj



    def _parsefile(self, logfile_obj):
        # Create a parser for this file
        parser = apachelog.parser(self.uasp_format)
        filename = logfile_obj.file_path + logfile_obj.file_name
        
        # Attempt to determine the number of lines in the log
        log = open(filename)
        for line in log:
            self.import_stats['line_count'] += 1
        print str(self.import_stats.get('line_count')) + " lines to parse. Beginning at line " + str(self.import_stats.get('import_startline')) + "\n"
        log.close()

        log = open(filename)
        
        previous_line = ""
        for line in log:
            # Update stats
            self.import_stats['line_counter'] += 1
            if self.import_stats.get('line_counter') < self.import_stats.get('import_startline'):
                # Skip through to the specified line number
                previous_line = line
                continue
            
#            # Test for duplicate log entries immediately preceding
#            if line == previous_line:
#                self._errorlog("##### DUPLICATE LINE DETECTED ##### \n" +\
#                    "Line # :" + str(self.import_stats.get('line_counter')) + "\n" +\
#                    "Line   : " + str(line) + "\n")
#                self.import_stats['duplicatecount'] += 1
#            else:
#                # Parse and store the line
#           Removing the Duplicate line detection as I can't rule out the 0.5-1% of duplicates as being invalid yet: CM 9-12-11
            self._parseline(parser, line, logfile_obj)

            # Print progress report every 500 lines.
            if (self.import_stats.get('line_counter') % 500) == 0:
                # Calculate the average rate of import for the whole process
                try: 
                    self.import_stats['import_rate'] = \
                    float(self.import_stats.get('line_counter') - self.import_stats.get('import_startline')) /\
                    float((datetime.datetime.utcnow() - self.import_stats.get('import_starttime')).seconds)
                except ZeroDivisionError:
                    self.import_stats['import_rate'] = 1
                # Calculate how long till finished
                try: 
                    efs = int(
                        float(self.import_stats.get('line_count') - self.import_stats.get('line_counter')) /\
                        float(self.import_stats.get('import_rate'))
                    )
                except ZeroDivisionError:
                    efs = 1
                efhr = efs // (60*60)
                efs = efs % (60*60)
                efmin = efs // 60
                efsec = efs % 60
                efstring = str(efhr) + "h " + str(efmin) + "m " + str(efsec) + "s."
                
                # Output the status
                print str(datetime.datetime.utcnow()) + ": " +\
                    str((float(self.import_stats.get('line_counter')) / float(self.import_stats.get('line_count')))*100)[0:5] + "% completed. " +\
                    "Parsed " + str(self.import_stats.get('line_counter')) + " lines. " +\
                    "Duplicates: " + str(self.import_stats.get('duplicatecount')) + ". " +\
                    "Rate: " + str(self.import_stats.get('import_rate'))[0:6] + " lines/sec. " +\
                    "Est. finish in " + efstring
                    
                # Write the error cache to disk
                self._error_log_save()

            # Update duplicate line string for next pass
            previous_line = line
            
        return None



    def _parseline(self, parser_obj, line, logfile_obj):
        # Parse the raw line into a dictionary of data
        data = parser_obj.parse(line)
        
        #self._debug('============================')
        #self._debug('Data: ' + str(data))

        # Validate the data - Count the number of elements
        if len(data) != 11:
            self._errorlog("#### WRONG NUMBER OF ITEMS IN THIS ENTRY. Line: " + str(self.import_stats.get('line_counter')) + "\n"\
            + "Data: " + str(data) + "\n")
            return
        
        # Status code validation
        status_code = self._status_code_validation(int(data.get('%>s')))
        if status_code == 0:
            self._errorlog("#### STATUS CODE 0 PROBLEM WITH THIS ENTRY. Line: " + str(self.import_stats.get('line_counter')) + "\n"\
            + "Data:" + str(data) + "\n")
            return
        
        # Get or create the foreign key elements, Logfile, Rdns, FileRequest, Referer, UserAgent
        remote_rdns = self._ip_to_domainname(data.get('%h'))
        
        file_request = self._file_request(data.get('%r'))
        if file_request is None:
            self._errorlog("#### INVALID REQUEST STRING IN THIS ENTRY. Line: " + str(self.import_stats.get('line_counter')) + "\n"\
            + "Data:" + str(data) + "\n")
            return
        
        referer = self._referer(data.get('%{Referer}i'), status_code)
        
        user_agent = self._user_agent(data.get('%{User-Agent}i'))
                        
        # Pull apart the date time string
        date_string, time_string = data.get('%Y-%m-%dT%H:%M:%S%z').split('T')
        date_yyyy, date_mm, date_dd = date_string.split('-')
        time_hh, time_mm, time_ss = time_string.split(':')
        
        # Pull apart the server and port
        server_ip, server_port = data.get('%A:%p').split(':')
        server = self._server(data.get('%v'), server_ip, server_port)
        
        # Size of response validation. Can be '-' when status not 200
        size_of_response = data.get('%b')
        if size_of_response.isdigit():
            size_of_response = int(size_of_response)
        else:
            size_of_response = 0
        
        # Build the log entry dictionary
        log_entry = {
            'logfile': logfile_obj,
            'time_of_request': datetime.datetime(
                int(date_yyyy), 
                int(date_mm), 
                int(date_dd), 
                int(time_hh), 
                int(time_mm), 
                int(time_ss[0:2]) # Cut off the +0000
                ),
            'server': server,
            'remote_logname': str(data.get('%l'))[:200],
            'remote_user': str(data.get('%u'))[:200],
            'remote_rdns': remote_rdns,
            'status_code': status_code,
            'size_of_response': size_of_response,
            'file_request': file_request,
            'referer': referer,
            'user_agent': user_agent,
        }
        
        # self._debug('log_entry=' + str(log_entry))
        
        # Create if there isn't already a duplicate record in place
#        obj, created = self._get_or_create_log_entry(
#            time_of_request = log_entry.get('time_of_request'),
#            server = log_entry.get('server'),
#            remote_rdns = log_entry.get('remote_rdns'),
#            size_of_response = log_entry.get('size_of_response'),
#            status_code = log_entry.get('status_code'),
#            file_request = log_entry.get('file_request'),
#            defaults = log_entry)
        obj, created = self._get_or_create_log_entry(defaults = log_entry)

        # Analyse obj.file_request.argument_string & obj.referer.full_string as part of another process

        # self._debug('============================')
        return None



#    def _get_or_create_log_entry(self, time_of_request, server, remote_rdns, size_of_response,
#                                 status_code, file_request, defaults = {}):
    def _get_or_create_log_entry(self, defaults = {}):
        obj = ApacheLogEntry()
        # Set this manually, longhand because the for key,value loop causes errors
        obj.logfile = defaults.get('logfile')
        obj.time_of_request = defaults.get('time_of_request')
        obj.server = defaults.get('server')
        obj.remote_logname = defaults.get('remote_logname')
        obj.remote_user = defaults.get('remote_user')
        obj.remote_rdns = defaults.get('remote_rdns')
        obj.status_code = defaults.get('status_code')
        obj.size_of_response = defaults.get('size_of_response')
        obj.file_request = defaults.get('file_request')
        obj.referer = defaults.get('referer')
        obj.user_agent = defaults.get('user_agent')

# Removing the need for a memory cache as we don't acknowledge duplicates and trust the log file is exactly as we want
# CM 9-12-11
        obj.save()

#        # Trusting that items in the import log appear in chronological order
#        if len(self.cache_log_entry) == 0 or len(self.cache_log_entry) > (self.cache_log_entry_size*2):
#            time_limit = time_of_request+datetime.timedelta(minutes=10)
#            self.cache_log_entry = list(ApacheLogEntry.objects.filter(\
#                time_of_request__gte=time_of_request, time_of_request__lte=time_limit, \
#                server=server).order_by('time_of_request')[:self.cache_log_entry_size])

#        # Attempt to locate in memory cache
#        for item in self.cache_log_entry:
#            if item.time_of_request == obj.time_of_request and item.server == obj.server and \
#                item.remote_rdns == obj.remote_rdns and item.size_of_response == obj.size_of_response and \
#                item.status_code == obj.status_code and item.file_request == obj.file_request:
#
#                self._errorlog("##### DUPLICATE RECORD AT INSERTION DETECTED ##### \n" +\
#                    "Database row id: " + str(item.id) + "\n" +\
#                    "DB: " + str(item) + "\n" +\
#                    "Logfile line number:" + str(self.import_stats.get('line_counter')) + "\n")
#                self.import_stats['duplicatecount'] = self.import_stats.get('duplicatecount') + 1
#
#                return item, False
        
#        # Shortcut to escape if this is purely a single process import
#        if self.single_import:
#            obj.save()
#            self.cache_log_entry.append(obj)
#            return obj, True
#
#        # Couldn't find it in the list, check the database incase another process has added it
#        try:
#            obj = ApacheLogEntry.objects.get(time_of_request = obj.time_of_request, server = obj.server,
#                remote_rdns = obj.remote_rdns, size_of_response = obj.size_of_response,
#                status_code = obj.status_code, file_request = obj.file_request)
#        except ApacheLogEntry.DoesNotExist:
#            obj.save()
#        except ApacheLogEntry.MultipleObjectsReturned:
#            self._errorlog("Funky shit just happened(!). MultipleObjectsReturned: " + str(obj) + "\n")
            
#        self.cache_log_entry.append(obj)
        return obj, True



    def _status_code_validation(self,status_code):
        "Check the supplied status code value against known good codes"
        for item in ApacheLogEntry.STATUS_CODE_CHOICES:
            if status_code == item[0]:
                return status_code
        return 0




    def _ip_to_domainname(self, ipaddress):
        "Returns the domain name for a given IP where known"
        # self._debug('_ip_to_domainname('+str(ipaddress)+') called')
        # validate IP address
        # try: 
        adr = IP(ipaddress)
        # PUT ERROR HANDLING IN HERE!
        
        rdns = Rdns()
        rdns.ip_address = adr.strNormal(0)
        rdns.resolved_name = 'Unknown'
        rdns.last_updated = datetime.datetime.utcnow()
        
        # Attempt to locate in memory cache
        for item in self.cache_rdns:
            if item.ip_address == rdns.ip_address:
                return item
        
        # Couldn't find it in the list, check the database in case another process has added it
        try:
            if self.single_import:
                # Shortcut if parallel import disabled
                raise Rdns.DoesNotExist
                
            rdns = Rdns.objects.get(ip_address = rdns.ip_address)
        except Rdns.DoesNotExist:
            # Go get the location for this address
            rdns.country_code = self.geoip.country_code_by_addr(rdns.ip_address)
            rdns.country_name = self.geoip.country_name_by_addr(rdns.ip_address)
            rdns.save()
            
        self.cache_rdns.insert(0,rdns)
        
        return rdns
        
        



    def _file_request(self, request_string):
        "Get or create a FileRequest object for a given request string"
        # self._debug('_file_request(' + request_string + ')')
        
        # Example request strings
        # GET /philfac/lockelectures/locke_album_cover.jpg HTTP/1.1
        # GET / HTTP/1.0
        # GET /robots.txt HTTP/1.0
        # GET /astro/introduction/astronomy_intro-medium-audio.mp3?CAMEFROM=podcastsGET HTTP/1.1
        # \xc5^)         <--- Example bad data in log from 2011
        
        fr = FileRequest()
        
        # Crude splitting... first on spaces, then on file/querystring
        ts = request_string.split()
        if len(ts) != 3:
            # Something is wrong with this request string. Exit and stop processing this record
            return None
        
        fs = ts[1].split('?')
        
        # Validate method: Either GET or POST or HEAD or corrupted
        fr.method = ts[0]
        if fr.method != "GET" and fr.method != "POST" and fr.method != "HEAD":
            return None
        
        fr.uri_string = str(fs[0])
        fr.protocol = str(ts[2])[:20]
        
        # Querystring is optional, so test for it first.
        if len(fs)==2:
            fr.argument_string = fs[1]
        else:
            fr.argument_string = ""
        
        # Attempt to locate in memory cache
        for item in self.cache_file_request:
            if item.method == fr.method and item.uri_string == fr.uri_string and \
                item.argument_string == fr.argument_string and item.protocol == fr.protocol:
                return item
        
        # Couldn't find it in the list, check the database in case another process has added it
        try:
            if self.single_import:
                # Shortcut if parallel import disabled
                raise FileRequest.DoesNotExist
                
            fr = FileRequest.objects.get(method = fr.method, uri_string = fr.uri_string,
                argument_string = fr.argument_string, protocol = fr.protocol)
        except FileRequest.DoesNotExist:
            fr.save()
            
        self.cache_file_request.insert(0,fr)
        
        return fr





    def _referer(self, referer_string, status_code):
        "Get or create a Referer record for the given string"
        ref = Referer()
        ref.full_string = ""
        
        if status_code in (200,206,304):
            ref.full_string = referer_string
        
        # Attempt to locate in memory cache
        for item in self.cache_referer:
            if item.full_string == ref.full_string:
                return item
                
        # Couldn't find it in the list, check the database in case another process has added it
        try:
            if self.single_import:
                # Shortcut if parallel import disabled
                raise Referer.DoesNotExist
                
            ref = Referer.objects.get(full_string=ref.full_string)
        except Referer.DoesNotExist:
            ref.save()
        
        self.cache_referer.insert(0,ref)
        
        return ref
        
        



    def _user_agent(self, agent_string):
        "Get or create a UserAgent record for the given string"
        user_agent = UserAgent()
        
        # Store the full string for later analysis
        user_agent.full_string = agent_string
        
        # Create some defaults that we'll likely overwrite. OS and UA can be null, so ignore.
        user_agent.type = ""            
            
        # Attempt to locate in memory cache
        for item in self.cache_user_agent:
            if item.full_string == user_agent.full_string:
                return item
                
        # Couldn't find it in the list, check the database in case another process has added it
        try:
            if self.single_import:
                # Shortcut if parallel import disabled
                raise UserAgent.DoesNotExist
            
            user_agent = UserAgent.objects.get(full_string=user_agent.full_string)
        except UserAgent.DoesNotExist:
            # Parse the string to extract the easy bits
            try:
                uas_dict = self.uasp.parse(user_agent.full_string)
    
                #Set the type string
                user_agent.type = uas_dict.get('typ')[:50]
                
                # Deal with the OS record
                os = {}
                os['company'] = uas_dict.get('os_company')[:200]
                os['family'] = uas_dict.get('os_family')[:100]
                os['name'] = uas_dict.get('os_name')[:200]
                
                # Now get or create an OS record
                user_agent.os, created = OS.objects.get_or_create(
                    company = os.get('company'), 
                    family = os.get('family'), 
                    name = os.get('name'), 
                    defaults = os)
                if created:
                    user_agent.os.save()
                    
                
                # Deal with the UA record
                ua = {}
                ua['company'] = uas_dict.get('ua_company')[:200]
                ua['family'] = uas_dict.get('ua_family')[:100]
                ua['name'] = uas_dict.get('ua_name')[:200]
                
                # Now get or create an UA record
                user_agent.ua, created = UA.objects.get_or_create(
                    company = ua.get('company'), 
                    family = ua.get('family'), 
                    name = ua.get('name'), 
                    defaults = ua)
                if created:
                    user_agent.ua.save()
                    
            except UASException:
                self._errorlog('_user_agent() parsing FAILED. agent_string=' + str(agent_string) + "\n")
            
            #Not there, so write to database
            user_agent.save()
        
        # Update the cache
        self.cache_user_agent.insert(0,user_agent)
        
        return user_agent




    def _server(self, server_name, server_ip, server_port):
        "Store the server information"
        server = Server()
        server.name = server_name[:200]
        server.ip_address = server_ip
        server.port = int(server_port)

        # Attempt to locate in memory cache
        for item in self.cache_server:
            if item.name == server.name and item.ip_address == server.ip_address and \
                item.port == server.port:
                return item

        # Couldn't find it in the list, check the database in case another process has added it
        try:
            if self.single_import:
                # Shortcut if parallel import disabled
                raise Server.DoesNotExist
            
            server = Server.objects.get(name=server.name, ip_address=server.ip_address, port=server.port)
        except Server.DoesNotExist:
            #Not there, so write to database and to cache
            server.save()
        
        self.cache_server.insert(0,server)

        return server



    # DEBUG AND INTERNAL HELP METHODS ==============================================================

    def _debug(self,error_str):
        "Basic optional debug function. Print the string if enabled"
        if self.debug:
            print 'DEBUG:' + str(error_str) + '\n'
        return None
            

    def _errorlog(self,error_str):
        "Write errors to a log file"
        # sys.stderr.write('ERROR:' + str(error_str) + '\n')
        #self.error_log.write('ERROR:' + str(error_str) + '\n')
        self.error_cache += 'ERROR:' + str(error_str) + '\n'
        return None


    def _errorlog_start(self, path_to_file):
        try:
            self.error_log = open(path_to_file,'a') 
        except IOError:
            sys.stderr.write("WARNING: Could not open existing error file. New file being created")
            self.error_log = open(path_to_file,'w')
        
        self.error_log.write("Log started at " + str(datetime.datetime.utcnow()) + "\n")
        print "Writing errors to: " + path_to_file
        return None

    def _error_log_save(self):
        "Write errors to a log file"
        self.error_log.write(self.error_cache)
        self.error_cache = ""
        return None


    def _errorlog_stop(self):
        self.error_log.write("Log ended at " + str(datetime.datetime.utcnow()) + "\n")
        self.error_log.close()
        return None
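
For reference, a minimal sketch of the parsing step Example #3 builds on: apachelog.parser() compiled with the 'mpoau' format expression returns, for each line, a dictionary keyed by the format directives themselves, which is exactly how _parseline() reads it. The log line below is invented for illustration and is not taken from real data.

import apachelog

# Same format expression that handle_label() selects for the 'mpoau' service
fmt = r'%Y-%m-%dT%H:%M:%S%z %v %A:%p %h %l %u \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
parser = apachelog.parser(fmt)

# Invented sample line, shaped to match the eleven fields of the format above
line = ('2011-12-09T14:30:00+0000 media.example.ac.uk 192.0.2.1:80 198.51.100.7 - - '
        '"GET /robots.txt HTTP/1.0" 200 512 "-" "Mozilla/5.0"')

data = parser.parse(line)          # raises an error if the line does not match the format
print len(data)                    # 11, which is what _parseline() validates against
print data.get('%h')               # remote host: '198.51.100.7'
print data.get('%>s')              # status code: '200'
print data.get('%r')               # request string: 'GET /robots.txt HTTP/1.0'
print data.get('%{User-Agent}i')   # user-agent string: 'Mozilla/5.0'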
Example #4
class Command(LabelCommand):
    args = '<path/to/logfiles/>'
    help = 'Imports the contents of the Apple Raw logfiles into the database.'
#    option_list = LabelCommand.option_list + (
#        make_option('--startline', action='store', dest='start_at_line',
#            default=1, help='Optional start line to allow resumption of large log files. Default is 1.'),
#        make_option('--single-import', action='store', dest='single_import',
#            default=True, help='Speeds up import rate by disabling support for parallel imports.'),
#    )
    
    def __init__(self):
        # Single GeoIP object for referencing
        self.geoip = pygeoip.GeoIP(os.path.join(PROJECT_ROOT, "stats/geoip_data/GeoIP.dat"),pygeoip.MMAP_CACHE)
        # Single UASparser instance for referencing
        self.uasp = UASparser(cache_dir=os.path.join(PROJECT_ROOT, "stats/ua_data/"))
        # datetime value for any rdns timeout problems
        self.rdns_timeout = 0
        # Record basic information about the import process for reporting
        self.import_stats = dict()
        # Record summary counts of action types parsed through this file
        self.summary = dict()
        # Cache objects to hold subtables in memory 
        self.cache_user_agent = list(UserAgent.objects.all())
        self.cache_rdns = list(Rdns.objects.all())
#        # Option flag to enable or disable the parallel import safety checks
#        self.single_import = True
        
        super(Command, self).__init__()

    def _list_files(self, path):
        file_list = []
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.lower().endswith(".txt"):
                    file_list.append(os.path.abspath(os.path.join(root, file)))
        return file_list

    def handle_label(self, path, **options):
        verbosity = int(options.get('verbosity', 0))
        if verbosity > 1:
            debug.DEBUG = True

#        # Some basic checking
#        if not filename.endswith('.txt'):
#           raise CommandError("This is not a text (.txt) log file.\n\n")

        # Scan directory for files, compare them to names in the existing LogFile list. Import the first X new files.
        found_files_list = self._list_files(path)
        found_files_list.sort() # Trust the naming conventions to put a sortable date on them
        import_file_limit = 21
        if len(found_files_list) < import_file_limit:
            import_file_limit = len(found_files_list)
        # Have this only work on the one file when debug is switched on
        if debug.DEBUG:
            import_file_limit = 1
        print "{0} files have been found. Importing up to {1} of them now.".format(
            len(found_files_list),
            import_file_limit
        )
        for filename in found_files_list:
            if import_file_limit > 0:
                # Reset statistics
                self.import_stats['filename'] = filename
                self.import_stats['line_counter'] = 0
                self.import_stats['line_count'] = 0
                self.import_stats['import_starttime'] = datetime.datetime.now(pytz.utc)
                self.import_stats['import_startline'] = int(options.get('start_at_line', 1))

                # This only needs setting/getting once per call of this function
                logfile_obj, created = self._logfile(filename, 'itu-raw')
                if not created:
                    err_string = "This file has already been imported: ({0})".format(filename)
                    debug.onscreen(err_string)
                    continue

                import_file_limit -= 1

                print "Import of [{0}] started at {1:%Y-%m-%d %H:%M:%S}\n".format(
                    filename,
                    datetime.datetime.now(pytz.utc)
                )

                # Create an error log per import file
                debug.errorlog_start(filename + '_import-error.log')

                # Send the file off to be parsed
                self._parsefile(logfile_obj)

                # Final stats output at end of file
                try:
                    self.import_stats['import_duration'] = float((datetime.datetime.now(pytz.utc) - self.import_stats.get('import_starttime')).seconds)
                    self.import_stats['import_rate'] = float(self.import_stats.get('line_counter')-self.import_stats.get('import_startline')) /\
                                                            self.import_stats['import_duration']
                except ZeroDivisionError:
                    self.import_stats['import_rate'] = 0

                # Write the error cache to disk
                debug.errorlog_stop()

                print """
                    Import finished at {0:%Y-%m-%d %H:%M:%S}
                    {1:d} Lines parsed over {2:.1f} seconds
                    Giving a rate of {3:.3f} lines/sec
                    """.format(
                        datetime.datetime.now(pytz.utc),
                        self.import_stats.get('line_counter'),
                        self.import_stats.get('import_duration'),
                        self.import_stats.get('import_rate')
                    )

        return None

    def _logfile(self, filename, log_service):
        """Get or create a LogFile record for the given filename"""
        
        # Simple hack for this method initially...
        logfile = dict()
        logfile['service_name'] = log_service
        try:
            logfile['file_name'] = filename[filename.rindex('/')+1:]
            logfile['file_path'] = filename[:filename.rindex('/')+1]
        except ValueError:
            # Likely path doesn't feature any directories... so improvise
            logfile['file_name'] = filename
            logfile['file_path'] = "./"
            
        logfile['last_updated'] = datetime.datetime.now(pytz.utc)
        
        obj, created = LogFile.objects.get_or_create(
            service_name = logfile.get('service_name'),
            file_name = logfile.get('file_name'),
#            file_path = logfile.get('file_path'),  # Filenames are unique for the raw logs, so skip this
            defaults = logfile)
        
        # If this isn't the first time, and the datetime is significantly different from last access, update the time
        if not created and (logfile.get('last_updated') - obj.last_updated).days > 0:
            obj.last_updated = logfile.get('last_updated')
        
        obj.save()

        return obj, created

    def _parsefile(self, logfile_obj):
        filename = logfile_obj.file_path + logfile_obj.file_name

        datareader = csv.reader(open(filename), dialect='excel-tab')
        tsvdata = []
        # The data structure changes over time, and new columns are added (and perhaps removed) so map each file to a dictionary
        column_headers = []
        for i,row in enumerate(datareader):
            if i==0:
                for col, title in enumerate(row):
                    column_headers.append(title.lower())
            else:
                row_dict = dict()
                for col, title in enumerate(column_headers):
                    row_dict[title] = row[col]
                tsvdata.append(row_dict)
            self.import_stats['line_counter'] = i
        self.import_stats['line_count'] = len(tsvdata)
        # Reset line counter for parsing scan
        self.import_stats['line_counter'] = self.import_stats.get('import_startline')

        for i in range(self.import_stats.get('line_counter')-1,len(tsvdata)):
            self._parseline(tsvdata[i], logfile_obj)
            self.import_stats['line_counter'] = i

            # Print progress report every 500 lines.
            if (self.import_stats.get('line_counter') % 500) == 0:
                # Calculate the average rate of import for the whole process
                try:
                    self.import_stats['import_rate'] = \
                    float(self.import_stats.get('line_counter') - self.import_stats.get('import_startline')) /\
                    float((datetime.datetime.now(pytz.utc) - self.import_stats.get('import_starttime')).seconds)
                except ZeroDivisionError:
                    self.import_stats['import_rate'] = 1
                # Calculate how long till finished
                try:
                    efs = int(
                        float(self.import_stats.get('line_count') - self.import_stats.get('line_counter')) /\
                        float(self.import_stats.get('import_rate'))
                    )
                except ZeroDivisionError:
                    efs = 1
                efhr = efs // (60*60)
                efs = efs % (60*60)
                efmin = efs // 60
                efsec = efs % 60
                efstring = "{0:d}h {1:d}m {2:d}s".format(efhr,efmin,efsec)

                # Output the status
                output = "{0:%Y-%m-%d %H:%M:%S} : {1:.1%} completed. Parsed {2:d} lines. Rate: {3:.2f} lines/sec. " \
                    "Estimated finish in {4}".format(
                    datetime.datetime.now(pytz.utc),
                    float(self.import_stats.get('line_counter')) / float(self.import_stats.get('line_count')),
                    self.import_stats.get('line_counter'),
                    self.import_stats.get('import_rate'),
                    efstring
                )
                debug.errorlog(output, display=True)

                # Write the error cache to disk
                debug.errorlog_save()

        # Create a summary record for this day's data
        debug.errorlog("Daily summary: " + str(self.summary), display=True)
#        print "Daily summary: \n" + str(self.summary)
        ds, created = AppleRawLogDailySummary.objects.get_or_create(
            date = self.summary.get("date", None),
            defaults = self.summary
        )
        ds.save()
        # Reset the values after each file!
        self.summary = dict()
        return None

    def _parseline(self, entrydict, logfile_obj):
#        # Build the log entry dictionary
#        arle = {
#            "logfile" : logfile_obj,
#            "artist_id" : long(entrydict.get("artist_id")),
#            "itunes_id" : long(entrydict.get("itunes_id")),
#            "action_type" : self._action_type_validation(entrydict.get("action_type")),
#            "title" : entrydict.get("title","Unknown"),
#            "url" : entrydict.get("url",""),
#            "episode_id" : long(entrydict.get("episode_id",0)),
#            "episode_title" : entrydict.get("episode_title",None),
#            "episode_type" : entrydict.get("episode_type",None),
#            "storefront" : int(entrydict.get("storefront",0)),
#            "user_agent" : self._user_agent(entrydict.get("useragent","")),
#            "ipaddress" : self._ip_to_domainname(entrydict.get("ip_address",None)),
#            "timestamp" : self._parse_timestamp(entrydict.get("timestamp")),
#            "user_id" : entrydict.get("user_id","")
#        }
        debug.onscreen("_parseline():entrydict=" + str(entrydict))

        # Build the log entry dictionary
        arle = AppleRawLogEntry()
        arle.logfile = logfile_obj
        try:
            arle.artist_id = long(entrydict.get("artist_id"))
        except (ValueError, TypeError):
            arle.artist_id = -1
        try:
            arle.itunes_id = long(entrydict.get("itunes_id"))
        except (ValueError, TypeError):
            arle.itunes_id = -1
        arle.action_type = self._action_type_validation(entrydict.get("action_type"))
        arle.title = entrydict.get("title","Unknown")
        arle.url = entrydict.get("url","")
        try:
            arle.episode_id = long(entrydict.get("episode_id"))
        except (ValueError, TypeError):
            arle.episode_id = None
        arle.episode_title = entrydict.get("episode_title",None)
        arle.episode_type = entrydict.get("episode_type",None)
        arle.storefront = self._storefront(entrydict.get("storefront","0"))
        arle.user_agent = self._user_agent(entrydict.get("useragent",""))
        arle.ipaddress = self._ip_to_domainname(entrydict.get("ip_address",None))
        arle.timestamp = self._parse_timestamp(entrydict.get("timestamp"))
        arle.user_id = entrydict.get("user_id","")
        arle.save(force_insert=True)

        # Add to the daily summary dictionary
        if self.summary.get("date", None) is None:
            self.summary["date"] = "{0:%Y-%m-%d}".format(arle.timestamp)
        if arle.action_type == "AutoDownload":
            self.summary["auto_download"] = int(self.summary.get("auto_download",0) + 1)
        elif arle.action_type == "Browse":
            self.summary["browse"] = int(self.summary.get("browse",0) + 1)
        elif arle.action_type == "Download":
            self.summary["download"] = int(self.summary.get("download",0) + 1)
        elif arle.action_type == "DownloadAll":
            self.summary["download_all"] = int(self.summary.get("download_all",0) + 1)
        elif arle.action_type == "Stream":
            self.summary["stream"] = int(self.summary.get("stream",0) + 1)
        elif arle.action_type == "Subscribe":
            self.summary["subscribe"] = int(self.summary.get("subscribe",0) + 1)
        elif arle.action_type == "SubscriptionEnclosure":
            self.summary["subscription_enclosure"] = int(self.summary.get("subscription_enclosure",0) + 1)
        else:
            self.summary["unknown"] = int(self.summary.get("unknown",0) + 1)

        return None

    def _action_type_validation(self, action_string):
        """Analyse the supplied action type string, and see if it's one we know about already. If it isn't, then
        don't panic, and store it anyway. We just need to update the choices list based on what we see in the error
        logs """
        for item in AppleRawLogEntry.ACTION_TYPE_CHOICES:
            if action_string == item[0]:
                return action_string
        debug.errorlog(
            "#### PROBLEM WITH THIS ENTRY. Unknown Action Type ({0}) Line: {1:d}\n".format(
                action_string,
                self.import_stats.get('line_counter')
            )
        )
        return action_string

    def _ip_to_domainname(self, ipaddress):
        """Returns the domain name for a given IP where known"""
        # debug.onscreen('_ip_to_domainname('+str(ipaddress)+') called')
        # These are partial IP addresses of the form nnn.nnn.x.x, so replace the x with 0 as a guess.
        if ipaddress and len(ipaddress)>8: # i.e. not None
            adr = IP(ipaddress.replace('x','0'))

            rdns = Rdns()
            rdns.ip_address = adr.strNormal(0)
            rdns.resolved_name = 'Partial'
            rdns.last_updated = datetime.datetime.now(pytz.utc)

            # Attempt to locate in memory cache
            for item in self.cache_rdns:
                if item.ip_address == rdns.ip_address:
                    return item

            # Couldn't find it in the list
            rdns.country_code = self.geoip.country_code_by_addr(rdns.ip_address)
            rdns.country_name = self.geoip.country_name_by_addr(rdns.ip_address)
            rdns.save()

            self.cache_rdns.insert(0,rdns)
        
            return rdns
        else:
            return None

    def _storefront(self,initial_value):
        if initial_value.isdigit():
            return int(initial_value)
        else:
            return 0

    def _user_agent(self, agent_string):
        "Get or create a UserAgent record for the given string"
        user_agent = UserAgent()
        
        # Store the full string for later analysis
        user_agent.full_string = agent_string
        
        # Create some defaults that we'll likely overwrite. OS and UA can be null, so ignore.
        user_agent.type = ""            
            
        # Attempt to locate in memory cache
        for item in self.cache_user_agent:
            if item.full_string == user_agent.full_string:
                return item
                
        # Parse the string to extract the easy bits
        try:
            uas_dict = self.uasp.parse(user_agent.full_string)

            #Set the type string
            user_agent.type = uas_dict.get('typ')[:50]

            # Deal with the OS record
            os = {}
            os['company'] = uas_dict.get('os_company')[:200]
            os['family'] = uas_dict.get('os_family')[:100]
            os['name'] = uas_dict.get('os_name')[:200]

            # Now get or create an OS record
            user_agent.os, created = OS.objects.get_or_create(
                company = os.get('company'),
                family = os.get('family'),
                name = os.get('name'),
                defaults = os)
            if created:
                user_agent.os.save()

            # Deal with the UA record
            ua = {}
            ua['company'] = uas_dict.get('ua_company')[:200]
            ua['family'] = uas_dict.get('ua_family')[:100]
            ua['name'] = uas_dict.get('ua_name')[:200]

            # Now get or create an UA record
            user_agent.ua, created = UA.objects.get_or_create(
                company = ua.get('company'),
                family = ua.get('family'),
                name = ua.get('name'),
                defaults = ua)
            if created:
                user_agent.ua.save()

        except UASException:
            debug.errorlog('_user_agent() parsing FAILED. agent_string=' + str(agent_string) + "\n")

        # Not in the cache, so write to the database
        user_agent.save()
        
        # Update the cache
        self.cache_user_agent.insert(0,user_agent)
        
        return user_agent

    def _parse_timestamp(self,initialstring):
        """Adjust timestamp supplied to GMT and returns a datetime object"""
        input_format = "%Y-%m-%d %H:%M:%S"
        base_time = datetime.datetime.strptime(initialstring[:-9],input_format)
        try:
            offset = int(initialstring[-5:])
            delta = timedelta(hours = offset / 100)
            ts = base_time - delta
        except (ValueError, TypeError):
            ts = base_time
#        dt = datetime.datetime.fromtimestamp(time.mktime(ts))
        return ts.replace(tzinfo = pytz.utc)
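
As a closing note on _parse_timestamp(): the raw offset is applied as whole hours only, and subtracting a negative offset moves the timestamp forward to UTC. A small worked sketch of just that arithmetic, with invented sample values:

import datetime

# A local timestamp recorded at UTC-8, i.e. an offset string of "-0800"
base_time = datetime.datetime(2012, 1, 15, 8, 30, 0)
offset = int("-0800")                               # -800
delta = datetime.timedelta(hours=offset / 100)      # -8 hours (minutes in the offset are ignored)
ts = base_time - delta                              # subtracting -8h adds 8 hours
print ts                                            # 2012-01-15 16:30:00, the UTC equivalent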