def initialize(module_name): """ Initialize the output process for events. """ global debug global output_format global exclude_users # Get the level of debug debug = int(rule_manager.get_property(None, module_name, "debug")) # Fetch format output_format = rule_manager.get_property(None, module_name, "format") # Create set of users to exclude exclude_users = map( lambda x: anonymize.find_or_encode_string(x), set(rule_manager.get_property(None, module_prefix, "exclude_users").split(",")), ) # Make sure we initialize the anonymize features common to all methods anonymize.initialize() if output_format == "CSV": init_csv(module_name) elif output_format == "mongo": init_mongo_db(module_name) else: init_csv(module_name)
def find_or_encode_string(value, synonyms=None):
    """
    Given a string, obtains its sha256 digest with the password stored in the
    module. The map anonymize_map is updated. The strings included in the
    synonyms are also added to the map with the same key.
    """

    global anonymize_map
    global module_prefix

    # Remove leading and trailing whitespace
    value = value.strip()

    # If anonymize is disabled, terminate
    if anonymize_map == None:
        return value

    # See if the value is already in the map
    digest = find_string(value)
    if digest != None:
        # Hit, return
        return digest

    passwd = rule_manager.get_property(None, module_prefix, 'passwd')
    min_length = int(rule_manager.get_property(None, module_prefix,
                                               'min_length'))

    # String not present. Encode, store and return
    digest = hashlib.sha256((value + passwd).encode('utf-8')).hexdigest()
    while anonymize_map.get(digest[0:min_length]):
        min_length += 1

    # Decide the final key
    digest = digest[0:min_length]

    # Look up the given value in LDAP and add to the collection of synonyms
    other_ids = set([value])
    ldap_dict = ldap_lookup.get(value)
    if ldap_dict != None:
        other_ids = other_ids.union(set(map(lambda x: x[0].decode('utf-8'),
                                            ldap_dict.values())))
    if synonyms != None:
        other_ids = other_ids.union(set(synonyms))

    # other_ids now has all possible synonyms for the given value. See if any
    # of them is already in the table.
    old_digest = next((anonymize_map.get(x) for x in other_ids
                       if anonymize_map.get(x) != None), None)

    # If one synonym was found, take that as the digest.
    if old_digest != None:
        digest = old_digest

    # Propagate the digest to the rest of synonyms
    for other_id in other_ids:
        anonymize_map[other_id] = digest

    return digest
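
# Minimal, self-contained sketch of the hashing scheme used by
# find_or_encode_string(): a salted sha256 digest truncated to min_length
# characters, extending the prefix on collision. The _demo_* names and the
# sample values are illustrative only.
import hashlib

def _demo_encode(value, passwd, existing, min_length=8):
    digest = hashlib.sha256((value + passwd).encode('utf-8')).hexdigest()
    # Grow the prefix until it no longer clashes with a key already in use
    while digest[0:min_length] in existing:
        min_length += 1
    return digest[0:min_length]

# The same (value, passwd) pair always yields the same key:
#   _demo_encode('john.doe', 'secret', set()) ==
#   _demo_encode('john.doe', 'secret', set())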
def init_csv(module_name):
    """
    Initialize the dump procedure in CSV format.
    """

    global config_params
    global output_file
    # global csv_hash
    global print_ordinal

    # Reset the set of hashes
    # csv_hash = set([])

    # Set the output_file
    output_file_name = rule_manager.get_property(None, module_name,
                                                 "output_file")
    if output_file_name == "":
        output_file = sys.stdout
    else:
        output_file = codecs.open(output_file_name, "w", encoding="utf-8")

    # Create the header to print as first line
    header = ["datetime", "type", "user", "application", "invocation",
              "aux1", "aux2"]

    # See if the first column should include the ordinal
    print_ordinal = rule_manager.get_property(None, module_name,
                                              "print_ordinal") == "yes"
    if print_ordinal:
        header.insert(0, "n")

    # Print the first line of the CSV with the column names
    print >> output_file, ",".join(header)
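
# Small sketch of the header line written by init_csv(). The helper name is
# illustrative; it simply rebuilds the header the same way as the function
# above.
def _demo_csv_header(print_ordinal):
    header = ["datetime", "type", "user", "application", "invocation",
              "aux1", "aux2"]
    if print_ordinal:
        header.insert(0, "n")
    return ",".join(header)

# _demo_csv_header(True)
#   -> 'n,datetime,type,user,application,invocation,aux1,aux2'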
def get(lookup_name):
    """
    Looks up the given name in the directory and returns the synonyms.
    """

    global module_prefix
    global ldap_obj

    # If the object has not been initialized, terminate
    if ldap_obj == None:
        return None

    base = rule_manager.get_property(None, module_prefix, 'base')
    attr_list = rule_manager.get_property(None, module_prefix,
                                          'fields').split()

    expr = reduce(lambda x, y: '(|(' + y + '=' + lookup_name + ')' + x + ')',
                  attr_list, '')

    try:
        l = ldap_obj.search_s(base, ldap.SCOPE_SUBTREE, expr,
                              map(lambda x: str(x), attr_list))
    except:
        # Something went wrong, punt.
        return None

    # If empty result or more than one record, ignore
    if l == [] or len(l) != 1:
        return None

    # Return the dictionary with the attributes of the first element
    return l[0][1]
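
# Self-contained sketch of how the reduce() call above nests the LDAP OR
# filter. The attribute names are illustrative; the real list comes from the
# 'fields' option.
from functools import reduce   # reduce is a builtin in Python 2; import kept for clarity

def _demo_ldap_filter(lookup_name, attr_list):
    return reduce(lambda x, y: '(|(' + y + '=' + lookup_name + ')' + x + ')',
                  attr_list, '')

# _demo_ldap_filter('jdoe', ['uid', 'cn'])
#   -> '(|(cn=jdoe)(|(uid=jdoe)))'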
def init_mongo_db(module_name):
    """
    Initialize the mongo db process.
    """

    mongodb.connect(
        rule_manager.get_property(None, module_name, "db_host"),
        rule_manager.get_property(None, module_name, "db_user"),
        rule_manager.get_property(None, module_name, "db_passwd"),
        rule_manager.get_property(None, module_name, "db_name"),
    )
def main():
    """
    Read a configuration file and perform the different event updates. A list
    of the modules to execute can be given.

    script configfile [module module ...]

    Example:

    script update_events.cfg moodle_log apache_log
    """

    global config_defaults

    #######################################################################
    #
    # OPTIONS
    #
    #######################################################################
    args = sys.argv[1:]

    # Check that there are additional arguments
    if len(args) < 1:
        print >> sys.stderr, 'Script needs at least one parameter'
        sys.exit(1)

    if not os.path.exists(args[0]):
        print >> sys.stderr, 'File', args[0], 'not found.'
        sys.exit(1)

    # Initial options included in the global dictionary at the top of the
    # module.
    rule_manager.options = rule_manager.initial_config(config_defaults)

    # Traverse the modules and load the default values
    load_defaults(rule_manager.options)

    # Load the rules in the given configuration file
    rules = rule_manager.load_config_file(None, args[0], {})[1]

    # Initialize the file modification cache mechanism
    detect_new_files.initialize(
        rule_manager.get_property(None, 'anonymize',
                                  'file_modification_cache'),
        True)

    # Traverse the sections and run first the "initialize" function and then
    # the "execute" function.
    for module_name in rules:
        module_prefix = module_name.split('.')[0]
        getattr(sys.modules[module_prefix], 'initialize')(module_name)

    for module_name in rules:
        module_prefix = module_name.split('.')[0]
        print >> sys.stderr, '### Execute', module_name
        getattr(sys.modules[module_prefix], 'execute')(module_name)

    return
def files_to_process(module_name):
    """
    Given a module name, obtains from the global rule manager the value of the
    "files" variable, computes the total number of lines, and those that need
    to be processed to print a tick in stdout.

    Returns: ([list of files], total_lines, mark_lines)
    """

    # Expand wildcards in file names
    files = sum([glob.glob(x) for x in
                 rule_manager.get_property(None, module_name,
                                           'files').split()],
                [])

    # Fetch value to see if the cache for modified files is enabled
    file_modification_cache = \
        rule_manager.get_property(None, module_name,
                                  'file_modification_cache')

    # If modified files cache enabled, filter out those that were not modified
    if file_modification_cache != '':
        new_files = []
        for x in files:
            file_annotation = detect_new_files.needs_processing(
                None, module_name + '//' + x)
            if file_annotation == None:
                print >> sys.stderr, 'File', x, 'not modified. Skipping'
            else:
                new_files.append((x, file_annotation[1:]))
        files = new_files
    else:
        files = [(x, ['1970-01-01 00:00:00']) for x in files]

    # Count the total number of lines in the files
    total_lines = sum(map(lambda x: file_len(x[0]), files))
    mark_lines = total_lines / 40 + 1

    return (files, total_lines, mark_lines)
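
# Self-contained sketch of the wildcard expansion and progress-mark
# computation performed above, with the modification cache disabled. The
# helper name is illustrative and the line counting is inlined here instead
# of using the module's file_len() helper.
import glob

def _demo_files_to_process(pattern_option):
    # Expand shell wildcards, annotate each file with the epoch timestamp
    files = [(name, ['1970-01-01 00:00:00'])
             for pattern in pattern_option.split()
             for name in glob.glob(pattern)]
    total_lines = 0
    for (name, _) in files:
        with open(name) as data_in:
            total_lines += sum(1 for _ in data_in)
    mark_lines = total_lines / 40 + 1   # one '+' tick roughly every 2.5%
    return (files, total_lines, mark_lines)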
def initialize_filter(module_name):
    """
    Gets two options from the given dictionary, the filter file and the filter
    function. Imports the file and if a function with name "initialize_"
    followed by filter_function is found, it is executed. Modifies the
    dictionary so that filter_function points to the function instead of the
    name.
    """

    filter_file = rule_manager.get_property(None, module_name, 'filter_file')

    function = None
    if filter_file != '':
        filter_function = rule_manager.get_property(None, module_name,
                                                    'filter_function')

        (head, tail) = os.path.split(filter_file)

        # Add the source directory to the path to fetch python modules
        sys.path.insert(0, head)

        try:
            module = __import__(tail, fromlist=[])
        except ImportError, e:
            print >> sys.stderr, 'Unable to import file', tail
            print str(e)
            sys.exit(1)

        # If the file of the import is not what is expected, notify and
        # terminate.
        if not module.__file__.startswith(head):
            print >> sys.stderr, 'Collision when importing', filter_file
            sys.exit(1)

        # Fetch the initialization function, and if found, execute it
        function = None
        try:
            function = getattr(sys.modules[tail],
                               'initialize_' + filter_function)
        except AttributeError, e:
            pass

        if function != None:
            function(module_name)

        # Fetch the filter function itself so the caller can store it
        function = getattr(sys.modules[tail], filter_function, None)

    return function
def window_dates(module_name):
    """
    Given a module name, it obtains from the global rule_manager object the
    value of the variables 'from_date' and 'until_date'. Translates them to
    datetime.datetime objects and returns the pair (from_date, until_date) as
    result.
    """

    # Translate the date from text to datetime
    from_date = rule_manager.get_property(None, module_name, 'from_date')
    if from_date == '':
        from_date = datetime.datetime.min
    else:
        from_date = datetime.datetime.strptime(from_date,
                                               '%Y/%m/%d %H:%M:%S')

    until_date = rule_manager.get_property(None, module_name, 'until_date')
    if until_date == '':
        until_date = datetime.datetime.max
    else:
        until_date = datetime.datetime.strptime(until_date,
                                                '%Y/%m/%d %H:%M:%S')

    return (from_date, until_date)
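
# Quick sketch of the option format expected by window_dates(): dates are
# written as '%Y/%m/%d %H:%M:%S' and an empty option leaves that side of the
# window open. The helper name and sample values are illustrative only.
import datetime

def _demo_window(from_option, until_option):
    fmt = '%Y/%m/%d %H:%M:%S'
    from_date = (datetime.datetime.min if from_option == ''
                 else datetime.datetime.strptime(from_option, fmt))
    until_date = (datetime.datetime.max if until_option == ''
                  else datetime.datetime.strptime(until_option, fmt))
    return (from_date, until_date)

# _demo_window('2012/03/01 00:00:00', '') keeps everything from March 2012 on.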
def initialize(module_name):
    """
    Initialization function. Must be here always.
    """

    global filter_function
    global debug

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    filter_function = process_filters.initialize_filter(module_name)

    return
def initialize(module_name=None):
    """
    Read an anonymize map from a file. Lines are comma separated pairs of
    name, sha256 key.
    """

    global anonymize_map
    global module_prefix
    global debug

    if module_name == None:
        module_name = module_prefix

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    # Get values from config
    map_file = rule_manager.get_property(None, module_name, 'file')
    passwd = rule_manager.get_property(None, module_name, 'passwd')
    min_length = int(rule_manager.get_property(None, module_name,
                                               'min_length'))

    # Load the content in the dictionary
    load_data(map_file)
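
# Hypothetical example of the map file read by initialize()/load_data(): each
# line is a comma separated pair of original name and its sha256-based key,
# as described in the docstring above. The names and keys below are invented.
_SAMPLE_ANONYMIZE_MAP = """\
jdoe,9f86d081884c7d65
jane.roe,60303ae22b998861
"""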
def initialize(module_name):
    """
    Initialize the ldap_obj.
    """

    global ldap_obj
    global debug

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    uri = rule_manager.get_property(None, module_name, 'uri')
    if uri == '':
        # Nothing to do
        return

    try:
        ldap_obj = ldap.initialize(uri)
    except:
        print >> sys.stderr, 'LDAP exception when initializing'
        sys.exit(1)

    print >> sys.stderr, 'LDAP object initialized successfully'
def flush():
    """
    Make sure all the transactions have been executed.
    """

    global module_prefix

    # Fetch format
    output_format = rule_manager.get_property(None, module_prefix, "format")

    if output_format == "CSV":
        flush_csv(event_list)
    elif output_format == "mongo":
        flush_mongo_db(event_list)
    else:
        flush_csv(event_list)
def initialize(module_name):
    """
    Initialization function. Must be here always.
    """

    global svn_client
    global filter_function
    global debug

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    filter_function = process_filters.initialize_filter(module_name)

    svn_client = pysvn.Client()
    svn_client.exception_style = 1

    return
def execute(module_name):
    """
    Given a list of directories with vm logs, process all of them.
    """

    global svn_client

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    repository = rule_manager.get_property(None, module_name, 'repository')
    if repository == '':
        # No data available, no initialization done
        return

    repository_root = \
        svn_client.info2(repository,
                         depth=pysvn.depth.empty)[0][1]['repos_root_URL']

    # Fetch all the files in the given repository
    dir_info = svn_client.list(repository, depth=pysvn.depth.immediates)

    # Select only those that are directories and match the given expression
    dir_info = [x[0]['repos_path'][1:] for x in dir_info
                if x[0]['kind'] == pysvn.node_kind.dir]
    source_dirs = fnmatch.filter(dir_info,
                                 rule_manager.get_property(None, module_name,
                                                           'files'))

    dst_dir = rule_manager.get_property(None, module_name, 'dst_dir')
    if dst_dir == '':
        print >> sys.stderr, 'VM_Logs: dst_dir is empty.'
        sys.exit(1)

    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    # Loop over all the directories
    for directory_name in source_dirs:

        # Calculate the dst full name
        (head, dst_tail) = os.path.split(directory_name)
        dst_full_name = os.path.join(dst_dir, dst_tail)

        # Fetch all the files in the given repository
        file_info = svn_client.list(os.path.join(repository_root,
                                                 directory_name, '.pladata'),
                                    depth=pysvn.depth.immediates)

        # Select only those that are files and match the *.tgz pattern
        file_info = [x[0]['repos_path'][1:] for x in file_info
                     if x[0]['kind'] == pysvn.node_kind.file]
        data_files = [x for x in file_info
                      if re.search('[0-9]+_[0-9]+\.tgz$', x)]

        if debug != 0:
            print >> sys.stderr, ' Dir', dst_tail, ':', len(data_files), \
                'files'

        # Loop over all the data files
        for data_file in data_files:

            # Separate file name from dir name
            (head_dir, file_name) = os.path.split(data_file)

            # Obtain the author that did the commit
            data_info = svn_client.info2(os.path.join(repository_root,
                                                      data_file),
                                         depth=pysvn.depth.empty)
            author_id = data_info[0][1]['last_changed_author']

            # Create the path to the author dir and additional dirs if needed
            dst_author_dir = os.path.join(dst_full_name, author_id)
            if not os.path.exists(dst_author_dir):
                os.makedirs(dst_author_dir)
            dst_file = os.path.join(dst_author_dir, file_name)

            done_author_dir = os.path.join(dst_full_name, author_id, 'tgzs')
            if not os.path.exists(done_author_dir):
                os.makedirs(done_author_dir)

            # If the file has already been unpacked, skip it
            if os.path.exists(os.path.join(done_author_dir, file_name)):
                continue

            # Get a copy of the *.tgz from the repository with the export
            # command
            try:
                svn_client.export(os.path.join(repository_root, data_file),
                                  dst_file,
                                  recurse=False)
            except Exception, e:
                print >> sys.stderr, 'Error while exporting', data_file
                print >> sys.stderr, str(e)

            # Expand the data in the tar
            if unpack_tgz_file(dst_file, done_author_dir):
                print >> sys.stderr, 'Error while unpacking', data_file
                continue

            if debug != 0:
                print >> sys.stderr, ' ', dst_tail, 'expanded.'
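
# Quick illustration of the file name pattern accepted above: only names
# ending in <digits>_<digits>.tgz are exported and unpacked. The sample
# names are made up.
#
#   re.search('[0-9]+_[0-9]+\.tgz$', 'vm_20120301_1433.tgz')  -> match
#   re.search('[0-9]+_[0-9]+\.tgz$', 'notes.tgz')             -> None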
def execute(module_name):
    """
    Given a list of files with Apache logs, process all of them that contain
    the mark word and produce the following events:

    [('name', 'embedded_question_correct'), ('datetime', dtime),
     ('user', anonymize(user)), ('application', browser?), ('url', URL),
     ('ip', IP), ('block_id', id)]

    [('name', 'embedded_question_incorrect'), ('datetime', dtime),
     ('user', anonymize(user)), ('application', browser?), ('url', URL),
     ('ip', IP), ('block_id', id)]

    [('name', 'embedded_question_blank'), ('datetime', dtime),
     ('user', anonymize(user)), ('application', browser?), ('url', URL),
     ('ip', IP), ('block_id', id)]

    [('name', 'embedded_question_show'), ('datetime', dtime),
     ('user', anonymize(user)), ('application', browser?), ('url', URL),
     ('ip', IP), ('block_id', id)]
    """

    global clf_re
    global filter_function
    global remap_pairs

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Get the remap_pairs evaluated from the options
    remap_pairs = eval('[' + rule_manager.get_property(None, module_name,
                                                       'remap_pairs') + ']')
    remap_pairs = [(re.compile(x), y) for (x, y) in remap_pairs]

    # Fetch the word used to detect an embedded question to use it later
    mark_word = rule_manager.get_property(None, module_name, 'mark_word')

    # Loop over all the given args
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists) the date of the last event
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0],
                                                    '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        data_in = codecs.open(filename, 'r', encoding='utf8',
                              errors='replace')
        old = ''
        counter = 0
        for line in data_in:
            counter += 1
            total_counter += 1

            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            line = line[:-1]
            fields = clf_re.match(line).groups()

            if fields[2] == '':
                raise ValueError('Empty string' + line)

            # Translate date time of the event
            dtime = datetime.datetime.strptime(fields[3].strip()[:-6],
                                               '%d/%b/%Y:%H:%M:%S')

            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue

            # Split the url to match and see if it has the mark word
            (method, url, protocol) = fields[4].split()

            # Only 404 that have the mark word substring are accepted
            if fields[5] != '404' or url.find(mark_word) == -1:
                continue

            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            # At this point we have an event of an embedded question.
            event_pairs = process_log_line(url, mark_word)
            for (event_suffix, question_id) in event_pairs:
                event = ('embedded_question_' + event_suffix,
                         dtime,
                         anonymize.find_or_encode_string(fields[2]),
                         [('application', 'unknown'),
                          ('url', url),
                          ('ip', fields[0]),
                          ('question_id', question_id)])
                event_output.out(event)

        data_in.close()
        detect_new_files.update(None, module_name + '//' + filename,
                                [new_last_event])

    print >> sys.stderr
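
# clf_re is defined elsewhere in this module. For reference, a typical Apache
# common/combined log format expression compatible with the field indexes used
# above (0 = client IP, 2 = user, 3 = timestamp, 4 = request, 5 = status)
# would look roughly like the sketch below; the actual expression in the
# module may differ.
_demo_clf_re = re.compile(
    r'(\S+) (\S+) (\S+) \[([^\]]+)\] "([^"]*)" (\S+) (\S+)')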
def execute(module_name):
    """
    Given a list of files with gcc logs, process all of them. Process the
    files containing the events. Return True if no error is detected.

    The events have the form:

    -BEGIN 2011-10-14 11:09:29 2011-10-14 11:15:06 /usr/bin/gcc prueba
    r where q
    -END

    [('name', 'compiler'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', 'gcc'),
     ('invocation', command),
     ('messages', message extract)]
    """

    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    message_lines = int(rule_manager.get_property(None, module_name,
                                                  'message_lines'))

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Loop over all the files
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists) the date of the last event
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0],
                                                    '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # Get the user id from the path to the file name
        user_id = filename.split('/')[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        data_in = codecs.open(filename, 'r', 'utf-8', errors='replace')

        line_number = 0
        messages = []
        begin_fields = []
        for line in data_in:
            line_number += 1
            total_counter += 1

            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            # Skip the empty lines
            if line == '\n':
                continue

            # See if the user id appears in the command, if so, anonymize
            if re.search(user_id, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the command line
            fields = line[:-1].split()

            # Beginning of log. Catch command invocation and dates
            if re.match('^\-BEGIN .+$', line):
                begin_fields = line.split()
                try:
                    dtime = datetime.datetime.strptime(
                        ' '.join(begin_fields[4:6]), '%Y-%m-%d %H:%M:%S')
                except ValueError, e:
                    print >> sys.stderr, 'WARNING: In file', filename
                    print >> sys.stderr, 'Ignoring:', line
                command = ' '.join(begin_fields[6:])
                messages = []
                continue

            # If not the end of an event, concatenate the line and keep
            # looping
            if not re.match('^\-END$', line):
                if len(messages) < message_lines:
                    messages.append(line[:-1])
                continue

            # At this point we have the complete information about the event
            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            # If out of time window, ignore
            if dtime < from_date or dtime > until_date:
                continue

            # If there is a filter function and returns None, skip this event
            if filter_function != None and \
                    filter_function(begin_fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            event = ('gcc',
                     dtime,
                     anon_user_id,
                     [('program', 'gcc'),
                      ('command', command),
                      ('messages', '"' + '|||'.join(messages) + '"')])

            try:
                event_output.out(event)
            except Exception, e:
                print 'Exception while processing', filename, ':', line_number
                print str(e)
                sys.exit(1)

        data_in.close()
        detect_new_files.update(None, module_name + '//' + filename,
                                [new_last_event])

    print >> sys.stderr
def execute(module_name):
    """
    Given a list of files with svn apache logs, process all of them. The logs
    are produced with the following apache configuration commands:

    CustomLog [destination log file]
              "%t %u %{SVN-REPOS-NAME}e %{SVN-ACTION}e" env=SVN-ACTION

    Sample:

    [03/Mar/2012:11:43:55 +0100] abel asteams-en update /Teams/Team_09 r3960 send-copyfrom-args

    For each line in the file, the following event structure is produced:

    [('name', 'svn_' + svn_action),  # Svn action is update, diff, etc.
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('repository', repository name),
     ('directory', directory)  (optional),
     ('revision', r???)  (optional),
     ('comment', (max 256 chars))]  # Only if commit and repository given
    """

    global filter_function

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, "debug"))

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Get the flag to see if the commits need to be processed
    process_commits = rule_manager.get_property(None, module_name,
                                                "process_commits") == ""

    # Loop over all the given args
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists) the date of the last event
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0],
                                                    "%Y-%m-%d %H:%M:%S")
        new_last_event = last_event

        data_in = codecs.open(filename, "r", encoding="utf8",
                              errors="replace")
        old = ""
        counter = 0
        for line in data_in:
            # Advance counters and print progress character if needed
            counter += 1
            total_counter += 1
            if total_counter % mark_lines == 0:
                print >> sys.stderr, "+",
                sys.stderr.flush()

            # Chop line into fields
            line = line[:-1]
            fields = line.split()
            if len(fields) < 3:
                raise ValueError("Erroneous log line:" + line)

            # Get the event type to quickly detect if we need to skip it
            event_type = fields[4]
            if (not process_commits) and event_type == "commit":
                continue

            # Translate date time of the event and check if within process
            # interval
            dtime = datetime.datetime.strptime(fields[0][1:],
                                               "%d/%b/%Y:%H:%M:%S")
            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue

            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            # Create the first three pairs of the event
            event = (
                "svn_" + event_type,
                dtime,
                anonymize.find_or_encode_string(fields[2]),
                [("repository", fields[3])],
            )

            # Structure of the different events
            #
            # checkout-or-export /path r62 depth=infinity
            # commit harry r100
            # diff /path r15:20 depth=infinity ignore-ancestry
            # get-dir /trunk r17 text
            # get-file /path r20 props
            # get-file-revs /path r12:15 include-merged-revisions
            # get-mergeinfo (/path1 /path2)
            # lock /path steal
            # log (/path1,/path2) r20:90 discover-changed-paths revprops=()
            # replay /path r19
            # change-rev-prop r50 propertyname
            # rev-proplist r34
            # status /path r62 depth=infinity
            # switch /pathA /pathB@50 depth=infinity
            # unlock /path break
            # update /path r17 send-copyfrom-args
            if event_type == "checkout-or-export":
                event[3].append(("revision", fields[6]))
                event[3].append(("location", fields[5]))
            elif event_type == "commit":
                event[3].append(("revision", fields[5]))
                # Fetch the log message if svn_client is not None
                if svn_client != None:
                    pass
            elif event_type == "diff":
                event[3].append(("location", fields[5] + " " + fields[6]))
            elif event_type == "get-dir" or event_type == "get-file" \
                    or event_type == "update":
                event[3].append(("revision", fields[6]))
                event[3].append(("location", fields[5]))
            elif event_type == "get-file-revs":
                event[3].append(("revision", "r" + fields[6].split(":")[1]))
                event[3].append(("location", fields[5]))
            elif event_type == "lock" or event_type == "unlock":
                event[3].append(("location", fields[5]))
            elif event_type == "log":
                event[3].append(("location", fields[5]))

            event_output.out(event)

        data_in.close()
        detect_new_files.update(None, module_name + "//" + filename,
                                [new_last_event])

    print >> sys.stderr
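
# Worked example using the sample line shown in the docstring above; the
# field indexes mirror the loop body:
#
#   [03/Mar/2012:11:43:55 +0100] abel asteams-en update /Teams/Team_09 r3960 send-copyfrom-args
#
#   fields[0][1:] -> '03/Mar/2012:11:43:55'  parsed with '%d/%b/%Y:%H:%M:%S'
#   fields[2]     -> 'abel'                  (user, anonymized before output)
#   fields[3]     -> 'asteams-en'            (repository)
#   fields[4]     -> 'update'                (event type -> 'svn_update')
#   fields[5]     -> '/Teams/Team_09'        (location)
#   fields[6]     -> 'r3960'                 (revision)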
def execute(module_name):
    """
    Given a list of files with Moodle logs, process all of them. Some lines
    contain spurious 0d in the middle. They are removed.

    [('name', 'lms_' + eventtype),
     ('datetime', datetime),
     ('user', anonymize(user)),
     ('application', 'moodle'),
     ('community', Community ID),
     ('ip', IP),
     ('resource', fields[5])]
    """

    global remap_pairs

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Get the type of file to process
    event_file_type = rule_manager.get_property(None, module_name,
                                                'event_file_type')
    event_file_type = event_file_type.lower().strip()
    if event_file_type != 'csv' and event_file_type != 'html':
        print >> sys.stderr, 'Incorrect value for option event_file_type'
        print >> sys.stderr, 'Only "csv" or "html" allowed'
        sys.exit(2)

    # Get the remap_pairs evaluated from the options
    remap_pairs = eval('[' + rule_manager.get_property(None, module_name,
                                                       'remap_pairs') + ']')
    remap_pairs = [(re.compile(x), y) for (x, y) in remap_pairs]

    datetime_fmt = rule_manager.get_property(None, module_name,
                                             'datetime_format')

    print >> sys.stderr, 'Processing', len(files), 'files'

    # Loop over all the given args
    total_counter = 0
    for file_annotation in sorted(files):
        # Get the file name and (if it exists) the date of the last event
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0],
                                                    '%Y-%m-%d %H:%M:%S')

        if event_file_type == 'csv':
            total_counter = process_csv_file(module_name, filename,
                                             mark_lines, total_counter,
                                             last_event, from_date,
                                             until_date, datetime_fmt)
        else:
            total_counter = process_html_file(module_name, filename,
                                              mark_lines, total_counter,
                                              last_event, from_date,
                                              until_date, datetime_fmt)

    print >> sys.stderr
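
# Hypothetical value for the remap_pairs option (the concrete patterns below
# are invented): the option text is evaluated as a Python list of
# (regular expression, replacement) pairs and the first element of each pair
# is compiled, exactly as in the eval() call above.
#
#   remap_pairs = ('course/view.php\?id=[0-9]+', 'course_view'),
#                 ('mod/forum/.*', 'forum')
#
#   eval('[' + option + ']')
#     -> [('course/view.php\?id=[0-9]+', 'course_view'),
#         ('mod/forum/.*', 'forum')]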
def execute(module_name):
    """
    Process the files contained in the given repository.

    [('name', 'svn_commit'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('program', 'svn'),
     ('repository', repository name),
     ('comment', (max 256 chars))]
    """

    global svn_client
    global filter_function
    global svn_special_event_comment
    global svn_special_event_names

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    repository = rule_manager.get_property(None, module_name, 'repository')
    if repository == '':
        # No data available, no initialization done
        return

    repository_root = \
        svn_client.info2(repository,
                         depth=pysvn.depth.empty)[0][1]['repos_root_URL']

    repository_name = rule_manager.get_property(None, module_name,
                                                'repository_name')

    # Fetch all the files in the given repository
    dir_info = svn_client.list(repository, depth=pysvn.depth.immediates)

    # Select only those that are directories and match the given expression
    dir_info = [x[0]['repos_path'][1:] for x in dir_info
                if x[0]['kind'] == pysvn.node_kind.dir]
    source_dirs = fnmatch.filter(dir_info,
                                 rule_manager.get_property(None, module_name,
                                                           'files'))

    # Dump the dirs being processed
    if debug != 0:
        print >> sys.stderr, repository_root, ':', len(source_dirs),
        print >> sys.stderr, 'svndirs being processed.'

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Set the date/times to ask for the logs
    if from_date != None:
        seconds = calendar.timegm(from_date.utctimetuple())
        revision_start = pysvn.Revision(pysvn.opt_revision_kind.date, seconds)
    else:
        revision_start = pysvn.Revision(pysvn.opt_revision_kind.head)

    if until_date != None:
        seconds = calendar.timegm(until_date.utctimetuple())
        revision_end = pysvn.Revision(pysvn.opt_revision_kind.date, seconds)
    else:
        revision_end = pysvn.Revision(pysvn.opt_revision_kind.number, 0)

    msg_size = int(rule_manager.get_property(None, module_name, 'msg_length'))

    # Loop over the directories and collect all the logs
    all_logs = []
    for directory_name in source_dirs:
        # Slurp all the logs in the server
        all_logs.extend(svn_client.log(os.path.join(repository_root,
                                                    directory_name),
                                       revision_start=revision_start,
                                       revision_end=revision_end))

    # Dump the number of logs being processed
    if debug != 0:
        print >> sys.stderr, len(all_logs), 'logs being processed.'

    # Loop over all the log elements
    total_counter = 0
    mark_lines = len(all_logs) / 40 + 1
    for log_data in all_logs:

        # Count the logs to print the mark string on the screen
        total_counter += 1
        if total_counter % mark_lines == 0:
            print >> sys.stderr, '+',
            sys.stderr.flush()

        # Fetch the three important fields: author, date/time and msg
        anon_user_id = anonymize.find_or_encode_string(log_data['author'])
        dtime = datetime.datetime.fromtimestamp(log_data['date'])
        msg = unicode(log_data['message'], 'utf-8')

        # Truncate the message to msg_size characters. The truncation is done
        # on the decoded unicode string so the cut cannot fall in the middle
        # of a utf-8 character.
        msg = msg[:msg_size]

        if dtime < from_date or dtime > until_date:
            # Ignore event because it is outside the given window
            continue

        # If there is a filter function and returns None, skip this event
        if filter_function != None and \
                filter_function([log_data['author'], log_data['date'],
                                 log_data['message']]) == None:
            continue

        try:
            special_idx = svn_special_event_comment.index(msg)
            event_name = svn_special_event_names[special_idx]
        except ValueError, e:
            event_name = 'svn_commit'

        event = (event_name,
                 dtime,
                 anon_user_id,
                 [('program', 'svn'),
                  ('repository', repository_name),
                  ('comment', msg)])

        try:
            event_output.out(event)
        except Exception, e:
            print 'Exception while processing', module_name
            print str(e)
            sys.exit(1)
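
# Self-contained illustration of why the commit message is decoded to unicode
# before truncation: slicing the raw byte string could cut a multi-byte UTF-8
# character in half, while slicing the unicode string cannot. The helper name
# and sample text are illustrative only.
def _demo_safe_truncate(raw_utf8_bytes, msg_size):
    return unicode(raw_utf8_bytes, 'utf-8')[:msg_size]

# _demo_safe_truncate('a\xc3\xb1o nuevo', 2) -> u'a\xf1'  (keeps the full 'ñ')
# 'a\xc3\xb1o nuevo'[:2]                     -> 'a\xc3'   (broken character)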