def execute(module_name): """ Given a list of files with firefox logs, process all of them. Process the files containing the events. Return True if no error is detected. [('name', 'visit_url'), ('datetime', dtime), ('user', anonymize(user)), ('application', 'firefox'), ('invocation', URL)] """ global filter_function # Get the window date to process events (from_date, until_date) = rules_common.window_dates(module_name) # Get the files to process, lines and mark lines (files, total_lines, mark_lines) = \ rules_common.files_to_process(module_name) # Loop over all the files total_counter = 0 for file_annotation in files: # Get the file name and (if it exists, the date of the last event) filename = file_annotation[0] last_event = datetime.datetime.min if len(file_annotation[1]) != 0: last_event = datetime.datetime.strptime(file_annotation[1][0], '%Y-%m-%d %H:%M:%S') new_last_event = last_event # Get the user id from the path to the file name user_id = filename.split('/')[-2] anon_user_id = anonymize.find_or_encode_string(user_id) data_in = codecs.open(filename, 'r', 'utf-8', errors = 'replace') line_number = 0 for line in data_in: line_number += 1 total_counter += 1 if total_counter % mark_lines == 0: print >> sys.stderr, '+', sys.stderr.flush() # See if the user id appears in the command, if so, anonymize if re.search(user_id, line): line = re.sub(user_id, anon_user_id, line) # Chop the command line to find out if it is one of the special # commands: gcc, valgrind, gdb, kate, kdevelop. If so, skip the # processing because it is done in other specific function. fields = line[:-1].split() # If something weird happened and there are no fields, ignoredumpt # the line if len(fields) != 3: print >> sys.stderr, 'WARNING: In file', filename print >> sys.stderr, 'Ignoring:', line continue dtime = datetime.datetime.strptime(' '.join(fields[0:2]).strip(), '%Y-%m-%d %H:%M:%S') if dtime <= last_event: # Event is older than what has been recorded in the # detect_new_files. skip continue if dtime < from_date or dtime > until_date: # Ignore event because it is outside the given window continue # If there is a filter function and returns None, skip this event if filter_function != None and filter_function(fields) == None: continue # Record the event with the highest datetime. if dtime > new_last_event: new_last_event = dtime event = ('visit_url', dtime, anon_user_id, [('application', 'firefox'), ('invocation', fields[2])]) try: event_output.out(event) except Exception, e: print 'Exception while processing', filename, ':', line_number print str(e) sys.exit(1) data_in.close() detect_new_files.update(None, module_name + '//' + filename, [new_last_event])
            # If there is a filter function and it returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime
            if dtime > new_last_event:
                new_last_event = dtime

            event = ("gdb", dtime, anon_user_id,
                     [("program", "gdb"),
                      ("command", command),
                      ("session_cmds", '"' + ";".join(session_cmds) + '"'),
                      ("session_end", session_end)])

            try:
                event_output.out(event)
            except Exception, e:
                print "Exception while processing", filename, ":", line_number
                print str(e)
                sys.exit(1)

        data_in.close()

        detect_new_files.update(None, module_name + "//" + filename,
                                [new_last_event])

    print >> sys.stderr
def execute(module_name): """ Given a list of files with Apache logs, process all of them that contain the word mark word and produce the following events: [('name', 'embedded_question_correct'), ('datetime', dtime), ('user', anonymize(user)), ('application', browser?), ('url', URL), ('ip', IP), ('block_id', id)] [('name', 'embedded_question_incorrect'), ('datetime', dtime), ('user', anonymize(user)), ('application', browser?), ('url', URL), ('ip', IP), ('block_id', id)] [('name', 'embedded_question_blank'), ('datetime', dtime), ('user', anonymize(user)), ('application', browser?), ('url', URL), ('ip', IP), ('block_id', id)] [('name', 'embedded_question_show'), ('datetime', dtime), ('user', anonymize(user)), ('application', browser?), ('url', URL), ('ip', IP), ('block_id', id)] """ global clf_re global filter_function global remap_pairs # Get the window date to process events (from_date, until_date) = rules_common.window_dates(module_name) # Get the files to process, lines and mark lines (files, total_lines, mark_lines) = \ rules_common.files_to_process(module_name) # Get the remap_pairs evaluated from the options remap_pairs = eval('[' + rule_manager.get_property(None, module_name, 'remap_pairs') + \ ']') remap_pairs = [(re.compile(x), y) for (x, y) in remap_pairs] # Fetch the word to detect an embeddedq to use it later mark_word = rule_manager.get_property(None, module_name, 'mark_word') # Loop over all the given args total_counter = 0 for file_annotation in files: # Get the file name and (if it exists, the date of the last event) filename = file_annotation[0] last_event = datetime.datetime.min if len(file_annotation[1]) != 0: last_event = datetime.datetime.strptime(file_annotation[1][0], '%Y-%m-%d %H:%M:%S') new_last_event = last_event data_in = codecs.open(filename, 'r', encoding = 'utf8', errors = 'replace') old = '' counter = 0 for line in data_in: counter += 1 total_counter += 1 if total_counter % mark_lines == 0: print >> sys.stderr, '+', sys.stderr.flush() line = line[:-1] fields = clf_re.match(line).groups() if fields[2] == '': raise ValueError('Empty string' + line) # Translate date time of the event dtime = datetime.datetime.strptime(fields[3].strip()[:-6], '%d/%b/%Y:%H:%M:%S') if dtime <= last_event: # Event is older than what has been recorded in the # detect_new_files. skip continue if dtime < from_date or dtime > until_date: # Ignore event because it is outside the given window continue # Split the url to match and see if it has the mark word (method, url, protocol) = fields[4].split() # Only 404 that have the mark word substring are accepted if fields[5] != '404' or url.find(mark_word) == -1: continue # If there is a filter function and returns None, skip this event if filter_function != None and filter_function(fields) == None: continue # Record the event with the highest datetime. if dtime > new_last_event: new_last_event = dtime # At this point we have an event of an embedded question. event_pairs = process_log_line(url, mark_word) for (event_suffix, question_id) in event_pairs: event = ('embedded_question_' + event_suffix, dtime, anonymize.find_or_encode_string(fields[2]), [('application', 'unknown'), ('url', url), ('ip', fields[0]), ('question_id', question_id)]) event_output.out(event) data_in.close() detect_new_files.update(None, module_name + '//' + filename, [new_last_event]) print >> sys.stderr
def execute(module_name): """ Given a list of files with svn apache logs, process all of them. The logs are produced with the following apache configuration commands: CustomLog [destionation log file] "%t %u %{SVN-REPOS-NAME}e %{SVN-ACTION}e" env=SVN-ACTION Sample: [03/Mar/2012:11:43:55 +0100] abel asteams-en update /Teams/Team_09 r3960 send-copyfrom-args For each line in the file, the following event structure is produced [('name', 'svn_' + svn_action), # Svn action is update, diff, etc. ('datetime', dtime), ('user', anonymize(user)), ('repository', repository name), ('directory', directory) (optional), ('revision', r??? (optional)), ('comment', (max 256 chars)) # Only if commit and repository given] """ global filter_function # Get the level of debug debug = int(rule_manager.get_property(None, module_name, "debug")) # Get the window date to process events (from_date, until_date) = rules_common.window_dates(module_name) # Get the files to process, lines and mark lines (files, total_lines, mark_lines) = rules_common.files_to_process(module_name) # Get the flag to see if the commits need to be processed process_commits = rule_manager.get_property(None, module_name, "process_commits") == "" # Loop over all the given args total_counter = 0 for file_annotation in files: # Get the file name and (if it exists, the date of the last event) filename = file_annotation[0] last_event = datetime.datetime.min if len(file_annotation[1]) != 0: last_event = datetime.datetime.strptime(file_annotation[1][0], "%Y-%m-%d %H:%M:%S") new_last_event = last_event data_in = codecs.open(filename, "r", encoding="utf8", errors="replace") old = "" counter = 0 for line in data_in: # Advance counters and print progress character if needed counter += 1 total_counter += 1 if total_counter % mark_lines == 0: print >>sys.stderr, "+", sys.stderr.flush() # Chop line into fields line = line[:-1] fields = line.split() if len(fields) < 3: raise ValueError("Erroneous log line:" + line) # Get the event type to quickly detect if we need to skip it event_type = fields[4] if (not process_commits) and event_type == "commit": continue # Translate date time of the event and check if within process # interval dtime = datetime.datetime.strptime(fields[0][1:], "%d/%b/%Y:%H:%M:%S") if dtime <= last_event: # Event is older than what has been recorded in the # detect_new_files. skip continue if dtime < from_date or dtime > until_date: # Ignore event because it is outside the given window continue # If there is a filter function and returns None, skip this event if filter_function != None and filter_function(fields) == None: continue # Record the event with the highest datetime. 
if dtime > new_last_event: new_last_event = dtime # Create the first three pairs of the event event = ( "svn_" + event_type, dtime, anonymize.find_or_encode_string(fields[2]), [("repository", fields[3])], ) # Structure of the different events # # checkout-or-export /path r62 depth=infinity # commit harry r100 # diff /path r15:20 depth=infinity ignore-ancestry # get-dir /trunk r17 text # get-file /path r20 props # get-file-revs /path r12:15 include-merged-revisions # get-mergeinfo (/path1 /path2) # lock /path steal # log (/path1,/path2) r20:90 discover-changed-paths revprops=() # replay /path r19 # change-rev-prop r50 propertyname # rev-proplist r34 # status /path r62 depth=infinity # switch /pathA /pathB@50 depth=infinity # unlock /path break # update /path r17 send-copyfrom-args if event_type == "checkout-or-export": event[3].append(("revision", fields[6])) event[3].append(("location", fields[5])) if event_type == "commit": event[3].append(("revision", fields[5])) # Fetch the log message if svn_client is not None if svn_client != None: pass elif event_type == "diff": event[3].append(("location", fields[5] + " " + fields[6])) elif event_type == "get-dir" or event_type == "get-file" or event_type == "update": event[3].append(("revision", fields[6])) event[3].append(("location", fields[5])) elif event_type == "get-file-revs": event[3].append(("revision", "r" + fields[6].split(":")[1])) event[3].append(("location", fields[5])) elif event_type == "lock" or event_type == "unlock": event[3].append(("location", fields[5])) elif event_type == "log": event[3].append(("location", fields[5])) event_output.out(event) data_in.close() detect_new_files.update(None, module_name + "//" + filename, [new_last_event]) print >>sys.stderr
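
# A short worked example (not part of the module) showing how the sample line
# from the docstring above is chopped and mapped onto an event. The anonymized
# user id is left symbolic because it depends on the anonymize module state.
sample_line = ('[03/Mar/2012:11:43:55 +0100] abel asteams-en '
               'update /Teams/Team_09 r3960 send-copyfrom-args')
sample_fields = sample_line.split()
# sample_fields[0][1:] -> '03/Mar/2012:11:43:55'  (event timestamp)
# sample_fields[2]     -> 'abel'                  (user, anonymized)
# sample_fields[3]     -> 'asteams-en'            (repository)
# sample_fields[4]     -> 'update'                (event type)
# sample_fields[5:7]   -> ['/Teams/Team_09', 'r3960']
#
# The resulting event is roughly:
#   ('svn_update', datetime(2012, 3, 3, 11, 43, 55), <anon user>,
#    [('repository', 'asteams-en'), ('revision', 'r3960'),
#     ('location', '/Teams/Team_09')])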
def execute(module_name): """ Given a list of files with bash logs, process all of them. [('name', 'bashcmd'), ('datetime', datetime), ('user', anonymize(user_id)), ('program', program), ('command', command)] """ global filter_function # Get the window date to process events (from_date, until_date) = rules_common.window_dates(module_name) # Get the files to process, lines and mark lines (files, total_lines, mark_lines) = \ rules_common.files_to_process(module_name) # Commands that even though the appear as bash, they require special # processing thus, they are processed somewhere else. skip_commands = set(['gcc', 'valgrind', 'gdb', 'kate', 'kdevelop', '/usr/bin/gcc', '/usr/bin/valgrind', '/usr/bin/gdb', '/usr/bin/kate', '/usr/bin/kdevelop']) # Loop over all the files total_counter = 0 for file_annotation in files: # Get the file name and (if it exists, the date of the last event) filename = file_annotation[0] last_event = datetime.datetime.min if len(file_annotation[1]) != 0: last_event = datetime.datetime.strptime(file_annotation[1][0], '%Y-%m-%d %H:%M:%S') new_last_event = last_event # Get the user id from the path to the file name user_id = filename.split('/')[-2] anon_user_id = anonymize.find_or_encode_string(user_id) data_in = codecs.open(filename, 'r', 'utf-8', errors = 'replace') line_number = 0 for line in data_in: line_number += 1 total_counter += 1 if total_counter % mark_lines == 0: print >> sys.stderr, '+', sys.stderr.flush() # Detect and skip empty lines, MS-DOS empty lines, # only if re.match('^[ ]*\n$', line) or re.match('^\r\n$', line) or \ re.match('^#[ ]*\n$', line): continue # Detect timestamp if re.match('^#[0-9]+', line): milliseconds = float(line.split('#')[1]) stamp = datetime.datetime.fromtimestamp(milliseconds) continue if stamp <= last_event: # Event is older than what has been recorded in the # detect_new_files. skip continue if stamp < from_date or stamp > until_date: # Ignore event because it is outside the given window continue # See if the user id appears in the command, if so, anonymize if re.search(user_id, line): line = re.sub(user_id, anon_user_id, line) # Chop the command line to find out if it is one of the special # commands: gcc, valgrind, gdb, kate, kdevelop. If so, skip the # processing because it is done in other specific function. fields = line.split() # If something weird happened and there are no fields, ignoredumpt # the line if len(fields) == 0: continue # Process the command line # Skip certain commands if os.path.basename(fields[0]) in skip_commands: continue # If there is a filter function and returns None, skip this event if filter_function != None and filter_function(fields) == None: continue # Record the event with the highest datetime. if stamp > new_last_event: new_last_event = stamp event = ('bashcmd', stamp, anon_user_id, [('program', fields[0]), ('command', line[:-1])]) try: event_output.out(event) except Exception, e: print 'Exception while processing', filename, ':', line_number print str(e) sys.exit(1) data_in.close() detect_new_files.update(None, module_name + '//' + filename, [new_last_event])
def process_csv_file(module_name, filename, mark_lines, total_counter,
                     last_event, from_date, until_date, datetime_fmt):
    """
    Receives the following parameters:

    - module_name: to record the modification of the file
    - filename: file to process in CSV format
    - mark_lines: the number of lines to process before printing out a mark
    - total_counter: total number of lines processed so far
    - last_event: the last event processed by this function in this file
    - from_date, until_date: the date limits to process events
    - datetime_fmt: format used to parse the date/time of the events

    Returns total_counter updated with the processed lines.

    Operations:

    - Open the file
    - Loop over each line
      - Print a mark if needed
      - Split the line into fields
      - Check if the date/time of the event is allowed
      - Store the new_last_event
      - Dump the event
    - Close the file
    - Update the info in detect_new_files
    """

    global filter_function

    new_last_event = last_event

    data_in = codecs.open(filename, 'r', encoding = 'utf8', errors = 'replace')

    old = ''
    counter = 0
    for line in data_in:
        counter += 1
        total_counter += 1

        if total_counter % mark_lines == 0:
            print >> sys.stderr, '+',
            sys.stderr.flush()

        line = line[:-1]

        # Detect and accumulate lines ending in \x0D so the character is
        # removed
        if line[-1] == '\x0D':
            old = old + line[:-1]
            continue

        # If there is something in old, dump it
        if old != '':
            old = ''

        # Check the number of fields and skip lines without 6 fields
        fields = line.split('\t')
        if len(fields) != 6:
            continue

        # Dump the event and remember the last one
        new_last_event = check_data_and_dump_event(fields, datetime_fmt,
                                                   last_event, new_last_event,
                                                   from_date, until_date)

    data_in.close()

    detect_new_files.update(None, module_name + '//' + filename,
                            [new_last_event])

    return total_counter
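
# A minimal sketch (with made-up field values) of the kind of line this
# function accepts: six tab-separated fields per event. Lines with any other
# number of fields are silently skipped before reaching
# check_data_and_dump_event.
sample_csv_line = '\t'.join(['2012-03-03 11:43:55', 'user01', 'submit',
                             'exercise_1', 'attempt', 'ok'])
sample_csv_fields = sample_csv_line.split('\t')
assert len(sample_csv_fields) == 6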
def execute(module_name): """ Given a list of files with Apache logs, process all of them. [('name', 'visit_url'), ('datetime', dtime), ('user', anonymize(user)), ('application', browser?), ('url', URL), ('ip', IP)] """ global clf_re global filter_function # Get the window date to process events (from_date, until_date) = rules_common.window_dates(module_name) # Get the files to process, lines and mark lines (files, total_lines, mark_lines) = \ rules_common.files_to_process(module_name) # Loop over all the given args total_counter = 0 for file_annotation in files: # Get the file name and (if it exists, the date of the last event) filename = file_annotation[0] last_event = datetime.datetime.min if len(file_annotation[1]) != 0: last_event = datetime.datetime.strptime(file_annotation[1][0], '%Y-%m-%d %H:%M:%S') new_last_event = last_event data_in = codecs.open(filename, 'r', encoding = 'utf8', errors = 'replace') old = '' counter = 0 for line in data_in: counter += 1 total_counter += 1 if total_counter % mark_lines == 0: print >> sys.stderr, '+', sys.stderr.flush() line = line[:-1] fields = clf_re.match(line).groups() if fields[2] == '': raise ValueError('Empty string' + line) # Translate date time of the event dtime = datetime.datetime.strptime(fields[3].strip()[:-6], '%d/%b/%Y:%H:%M:%S') if dtime <= last_event: # Event is older than what has been recorded in the # detect_new_files. skip continue if dtime < from_date or dtime > until_date: # Ignore event because it is outside the given window continue # If there is a filter function and returns None, skip this event if filter_function != None and filter_function(fields) == None: continue # Record the event with the highest datetime. if dtime > new_last_event: new_last_event = dtime (method, url, protocol) = fields[4].split() event = ('visit_url', dtime, anonymize.find_or_encode_string(fields[2]), [('application', 'unknown'), ('url', url), ('ip', fields[0])]) event_output.out(event) data_in.close() detect_new_files.update(None, module_name + '//' + filename, [new_last_event]) print >> sys.stderr