def process_log(log, config, source):
    '''
    Transform one raw data entry (log) into a preliminary observation.

    Parameters:
        log: one raw data entry (string) from the given source.
        config: parsed configuration dict (expects 'SOURCES', 'STRUCTURED',
                'FEATURES', 'Keys', 'All' entries).
        source: name of the data source the entry belongs to.

    Returns:
        (tag, obs) where obs is the faac.Observation built from the record and
        tag is either the "%Y%m%d%H%M" time string alone, or a tuple whose
        first element is that time string followed by the first value of each
        configured key variable present in the record.
    '''
    record = faac.Record(log, config['SOURCES'][source]['CONFIG']['VARIABLES'],
                         config['STRUCTURED'][source], config['All'])
    obs = faac.Observation.fromRecord(record, config['FEATURES'][source])

    # The normalized time tag is needed in both branches below: compute it once
    # (the original duplicated this call in each branch).
    timetag = normalize_timestamps(record.variables['timestamp'][0],
                                   config, source).strftime("%Y%m%d%H%M")

    if config['Keys']:
        tag = [timetag]
        for key in config['Keys']:
            if len(record.variables[key]) > 0:
                # Careful!, only works (intentionally) for the first instance
                # of a variable in a record.
                tag.append(str(record.variables[key][0]))
        # A lone time tag stays a plain string; with keys it becomes a tuple.
        tag = tuple(tag) if len(tag) > 1 else tag[0]
    else:
        tag = timetag

    return tag, obs
def process_log(log, config, source, instances):
    '''
    Parse one data entry and accumulate per-variable feature-value counters.

    Parameters:
        log: one raw data entry (string) from the given source.
        config: parsed configuration dict.
        source: name of the data source the entry belongs to.
        instances: running counters dict; 'count' holds the number of logs
                   processed so far, and every other key is a variable name
                   mapped to {str(feature_value): occurrences}.

    Returns:
        The same instances dict, updated in place with this entry's values.
    '''
    timearg = config['TIMEARG'][source]  # name of variable which contains timestamp
    record = faac.Record(log,
                         config['SOURCES'][source]['CONFIG']['VARIABLES'],
                         config['STRUCTURED'][source],
                         config['SOURCES'][source]['CONFIG']['timestamp_format'],
                         config['All'])

    instances['count'] += 1
    for variable, features in record.variables.items():
        if variable == timearg:
            continue  # the timestamp variable is not counted
        # setdefault/get replace the original nested if/else with identical
        # behavior: create the per-variable dict on first sight, then bump
        # each feature-value counter (missing keys start at 0).
        counters = instances.setdefault(variable, dict())
        for feature in features:
            key = str(feature)
            counters[key] = counters.get(key, 0) + 1

    return instances
def unstr_deparsing(config, sourcepath, deparsInput, source, formated_timestamps):
    '''
    Deparsing process for unstructured text based data sources like a log file.

    Two-pass algorithm over every file in sourcepath:
      1. Split each file into log entries on RECORD_SEPARATOR, and for each
         entry whose timestamp falls in formated_timestamps (or always, when
         that list is empty) count how many of the requested features match,
         recording the entry's first/last line numbers per feature count.
      2. Re-read the files and either write the entries whose feature count
         exceeds the computed threshold to per-count output files, or (in
         debug mode) report each entry interactively.

    Returns (count_unstructured, count_tot): entries extracted vs. entries seen.
    NOTE(review): relies on module-level `debugmode`, `faac`, `gzip`,
    `linecache`, `getUnstructuredTime` — defined elsewhere in this file/module.
    '''
    threshold = config['threshold']
    OUTDIR = config['OUTDIR']
    depars_features = deparsInput['features']
    timearg = config['TIMEARG'][source]  # name of the variable which contains timestamp

    selection = []  # indices of features in config file matching depars_features
    for i in range(len(config['FEATURES'][source])):
        if config['FEATURES'][source][i]['name'] in depars_features:
            selection.append(i)
    FEATURES_sel = []  # all feature fields for features in depars_features
    for i in selection:
        FEATURES_sel.append(config['FEATURES'][source][i])

    VARIABLES = {}  # all variables from config file, keyed by name
    for variable in config['SOURCES'][source]['CONFIG']['VARIABLES']:
        try:
            VARIABLES[variable['name']] = variable
        except:
            print ("Configuration file error: missing variables")
            exit(1)

    count_unstructured = 0  # entries actually extracted
    count_tot = 0           # total entries seen across all files

    # while count_source < lines[source]*0.01 and (not features_needed <= 0) :
    feat_appear = {}  # per file: list of matched-feature counts, one per kept entry
    indices = {}      # per file, per feature count: [first_line, last_line] pairs
    for file in sourcepath:
        feat_appear[file] = []
        indices[file] = {}
        for nfeatures in range(len(depars_features), 0, -1):
            indices[file][nfeatures] = []  # dict of dicts for each number of features

        if file.endswith('.gz'):
            input_file = gzip.open(file, 'r')
        else:
            input_file = open(file, 'r')

        if debugmode:
            faac.debugProgram('fcdeparser.load_message', [file])

        # First read to generate list of number of appearances
        line = input_file.readline()
        nline = 0
        log_indices = []  # [first_line, last_line] of the entry being accumulated
        if line:
            log = ""
            log_indices.append(nline + 1)
            while line:
                nline += 1
                log += line
                # A complete entry has been accumulated once the separator appears.
                if len(log.split(config['RECORD_SEPARATOR'][source])) > 1:
                    count_tot += 1
                    log_indices.append(nline)
                    logExtract = log.split(config['RECORD_SEPARATOR'][source])[0]
                    # For each log, extract timestamp with regular expresions and check if in formated_timestamps
                    try:
                        t = getUnstructuredTime(logExtract, VARIABLES[timearg]['where'], config['TSFORMAT'][source])
                        if str(t).strip() in formated_timestamps or not formated_timestamps:
                            # Check if features appear in the log in order to write in the file later
                            record = faac.Record(logExtract, config['SOURCES'][source]['CONFIG']['VARIABLES'], config['STRUCTURED'][source], config['TSFORMAT'][source], config['All'])
                            obs = faac.Observation.fromRecord(record, FEATURES_sel)
                            feature_count = sum([obs.data[i].value for i in range(len(obs.data))])
                            feat_appear[file].append(feature_count)
                            indices[file][feature_count].append(log_indices)
                    except:
                        # Best-effort: entries whose timestamp/record cannot be
                        # parsed are simply skipped.
                        pass
                    log = ""
                    log_indices = [nline + 1]  # reset log_indices adding first line of next log
                    for n in logExtract.split(config['RECORD_SEPARATOR'][source])[1::]:
                        log += n  # if next log in the same line, add remaining part
                        log_indices = [nline]  # in this case, next log first index is actual line
                line = input_file.readline()

            # Deal with the last log, not processed during while loop.
            log += line
            log_indices.append(nline)
            try:
                t = getUnstructuredTime(log, VARIABLES[timearg]['where'], config['TSFORMAT'][source])
                if str(t) in formated_timestamps or not formated_timestamps:
                    # NOTE(review): the record is built from logExtract (the
                    # previous entry), while the timestamp comes from log —
                    # confirm this is intended.
                    record = faac.Record(logExtract, config['SOURCES'][source]['CONFIG']['VARIABLES'], config['STRUCTURED'][source], config['TSFORMAT'][source], config['All'])
                    obs = faac.Observation.fromRecord(record, FEATURES_sel)
                    feature_count = sum([obs.data[i].value for i in range(len(obs.data))])
                    feat_appear[file].append(feature_count)
                    indices[file][feature_count].append(log_indices)
            except:
                pass

        input_file.close()

        # Print number of matched logs for each features number (feature selection criteria)
        matched_logs = faac.debugProgram('fcdeparser.unstr_deparsing.feat_appear', [feat_appear[file], depars_features])

    # Obtain number of features needed to extract the log with the given threshold:
    # lower features_threshold until the cumulative entry count reaches threshold.
    features_threshold = len(depars_features)
    count = 0
    while features_threshold > 0:
        # if no threshold, extract all logs with >0 matched features
        if not threshold or (threshold and count < int(threshold)):
            nfeatures = features_threshold
            for file in feat_appear:
                count += feat_appear[file].count(int(nfeatures))
            features_threshold -= 1
        else:
            break

    if debugmode:
        opmode = faac.debugProgram('fcdeparser.unstr_deparsing.user_input', [config['threshold'], features_threshold, matched_logs])
    else:
        if threshold:
            print("Considering the feature counters and a threshold of %d log entries, we will extract logs with >=%d matched features" % (config['threshold'], features_threshold + 1))
        else:
            print("As no threshold is defined, we will extract all logs with >=1 matched features")
        print("Note that the output will be generated in different files according to their number of features")

    # Re-read desired lines
    for file in sourcepath:
        if file.endswith('.gz'):
            input_file = gzip.open(file, 'r')
        else:
            input_file = open(file, 'r')

        if not debugmode:
            # Normal mode: dump the stored line ranges into one output file per
            # feature count above the threshold.
            for nfeatures in range(len(depars_features), features_threshold, -1):
                if indices[file][nfeatures]:
                    output_file = open(OUTDIR + "output_%s_%sfeat" % (source, nfeatures), 'w')
                    for line_indices in indices[file][nfeatures]:
                        log = ""
                        for index in range(line_indices[0], 1 + line_indices[1]):
                            log += linecache.getline(file, index)
                        logExtract = log.split(config['RECORD_SEPARATOR'][source])[0]
                        if log.split(config['RECORD_SEPARATOR'][source])[1]:
                            logExtract = log.split(config['RECORD_SEPARATOR'][source])[1]  # if characters after the separator, take them instead
                        output_file.write(logExtract + config['RECORD_SEPARATOR'][source])
                        count_unstructured += 1
                    output_file.close()
        else:
            # Debug mode: walk the file again, reporting each entry according
            # to the operation mode chosen by the user (opmode).
            index = 0           # position among all parsed entries
            index_deparsed = 0  # position among entries that passed the timestamp filter
            line = input_file.readline()
            if line:
                log = ""
                while line:
                    log += line
                    if len(log.split(config['RECORD_SEPARATOR'][source])) > 1:
                        logExtract = log.split(config['RECORD_SEPARATOR'][source])[0]
                        try:
                            t = getUnstructuredTime(logExtract, VARIABLES[timearg]['where'], config['TSFORMAT'][source])
                            if str(t).strip() in formated_timestamps or not formated_timestamps:
                                if feat_appear[file][index_deparsed] > features_threshold and opmode in {1, 2}:
                                    faac.debugProgram('fcdeparser.unstr_deparsing.deparsed_log', [index + 1, logExtract, feat_appear[file][index_deparsed], opmode])
                                elif opmode in {1}:
                                    faac.debugProgram('fcdeparser.unstr_deparsing.unmatched_criteria1', [index + 1, logExtract, feat_appear[file][index_deparsed]])
                                index_deparsed += 1
                            elif opmode in {1}:
                                faac.debugProgram('fcdeparser.unstr_deparsing.unmatched_criteria2', [index + 1, logExtract])
                            index += 1
                        except SystemExit:
                            exit(1)
                        except:
                            pass
                        log = ""
                        for n in logExtract.split(config['RECORD_SEPARATOR'][source])[1::]:
                            log += n
                    line = input_file.readline()
        input_file.close()

    return (count_unstructured, count_tot)
def stru_deparsing(config, sourcepath, deparsInput, source, formated_timestamps):
    '''
    Deparsing process for structured data sources like csv.

    Two-pass algorithm over every file in sourcepath:
      1. For each line whose timestamp falls in formated_timestamps (or
         always, when that list is empty) count how many of the requested
         features match, keeping one counter per line so that indices align
         with line numbers.
      2. Re-read the files and either write matching lines to per-count
         output files, or (in debug mode) report each line interactively.

    Returns (count_structured, count_tot): lines extracted vs. lines seen.
    NOTE(review): relies on module-level `debugmode`, `faac`, `gzip`,
    `linecache`, `getStructuredTime`, `search_features_str`.
    '''
    threshold = config['threshold']
    OUTDIR = config['OUTDIR']
    depars_features = deparsInput['features']  # features in deparsing_input file

    # Store features and variables from config. file in dictionaries FEATURES, VARIABLES
    FEATURES = {}
    VARIABLES = {}
    for feature in config['SOURCES'][source]['CONFIG']['FEATURES']:
        try:
            FEATURES[feature['name']] = feature
        except:
            print ("Configuration file error: missing features")
            exit(1)
    for variable in config['SOURCES'][source]['CONFIG']['VARIABLES']:
        try:
            VARIABLES[variable['name']] = variable
        except:
            print ("Configuration file error: missing variables")
            exit(1)

    selection = []  # indices of features in config file matching depars_features
    for i in range(len(config['FEATURES'][source])):
        if config['FEATURES'][source][i]['name'] in depars_features:
            selection.append(i)
    FEATURES_sel = []  # all feature fields for features in depars_features
    for i in selection:
        FEATURES_sel.append(config['FEATURES'][source][i])

    timestamp_pos = VARIABLES[config['TIMEARG'][source]]['where']  # position (column) of timestamp field

    count_structured = 0  # structured logs found during deparsing process
    count_tot = 0         # total logs
    feat_appear = {}        # per file: matched-feature count for every line
    feat_appear_names = {}  # per file: names of matched features for every line
    for file in sourcepath:
        feat_appear[file] = []
        feat_appear_names[file] = []
        if file.endswith('.gz'):
            input_file = gzip.open(file, 'r')
        else:
            input_file = open(file, 'r')

        if debugmode:
            faac.debugProgram('fcdeparser.load_message', [file])

        line = input_file.readline()
        # First read to generate a list with the number of depars_features present in each line
        nline = 0
        while line:
            nline += 1
            try:
                t = getStructuredTime(line, timestamp_pos, config['TSFORMAT'][source])  # timestamp in that line
                # extract amount of features that appear in the line if its timestamp is included in formated_timestamps
                if t.strip() in formated_timestamps or not formated_timestamps:
                    record = faac.Record(line, config['SOURCES'][source]['CONFIG']['VARIABLES'], config['STRUCTURED'][source], config['TSFORMAT'][source], config['All'])
                    obs = faac.Observation.fromRecord(record, FEATURES_sel)  # to make default features counter work properly, use config['FEATURES'][source] instead of FEATURES_sel (but execution will be significantly slower)
                    feature_count, matched_features = search_features_str(obs, VARIABLES)
                    feat_appear[file].append(feature_count)
                    feat_appear_names[file].append(matched_features)
                else:
                    # it is necessary to fill with zeros so that indices match the lines later
                    feat_appear[file].append(0)
                    feat_appear_names[file].append([])
            except Exception as error:
                # Unparsable lines also get a zero so indices stay aligned.
                print ('\033[33m' + "Error finding features in line %d: %s" % (nline, error) + '\033[m')
                feat_appear[file].append(0)
                feat_appear_names[file].append([])
            line = input_file.readline()
        input_file.close()
        count_tot += nline  # add nlines of this source to total lines counter

        # Print number of matched logs for each features number (feature selection criteria)
        matched_lines = faac.debugProgram('fcdeparser.stru_deparsing.feat_appear', [feat_appear[file], depars_features, nline])

    # Obtain number of features needed to extract the log with the given threshold:
    # lower features_threshold until the cumulative line count reaches threshold,
    # recording the line indices for every visited feature count.
    features_threshold = len(depars_features)
    indices = {}
    for file in sourcepath:
        indices[file] = {}  # new dict for every data file
    count = 0
    while features_threshold > 0:
        # if no threshold, extract all logs with >0 matched features
        if not threshold or (threshold and count < int(threshold)):
            nfeatures = features_threshold
            for file in feat_appear:
                count += feat_appear[file].count(int(nfeatures))
                indices[file][nfeatures] = [i for i, val in enumerate(feat_appear[file]) if val == nfeatures]
            features_threshold -= 1
        else:
            break

    if debugmode:
        opmode = faac.debugProgram('fcdeparser.stru_deparsing.user_input', [config['threshold'], features_threshold, matched_lines])
    else:
        if threshold:
            print("Considering the feature counters and a threshold of %d log entries, we will extract logs with >=%d matched features" % (config['threshold'], features_threshold + 1))
        else:
            print("As no threshold is defined, we will extract all logs with >=1 matched features")
        print("Note that the output will be generated in different files according to their number of features")

    # Re-read the file extracting the raw data
    for file in sourcepath:
        if file.endswith('.gz'):
            input_file = gzip.open(file, 'r')
        else:
            input_file = open(file, 'r')

        if not debugmode:
            # Normal mode: write each recorded line to one output file per
            # feature count.
            for nfeatures in indices[file]:
                if indices[file][nfeatures]:
                    output_file = open(OUTDIR + "output_%s_%sfeat" % (source, nfeatures), 'w')
                    for line_index in indices[file][nfeatures]:
                        line = linecache.getline(file, line_index + 1)  # index starting by 1 with linecache function
                        output_file.write(line + "\n")
                        count_structured += 1
                    output_file.close()
        else:
            # Debug mode: report each line according to the operation mode
            # chosen by the user (opmode).
            for position, line in enumerate(input_file):
                nfeatures = feat_appear[file][position]
                features_names = feat_appear_names[file][position]
                if (nfeatures > features_threshold and position in indices[file][nfeatures]) and opmode in {1, 2}:
                    faac.debugProgram('fcdeparser.stru_deparsing.deparsed_log', [position + 1, line, nfeatures, features_names, opmode])
                elif opmode in {1}:
                    faac.debugProgram('fcdeparser.stru_deparsing.unmatched_criteria', [position + 1, line, nfeatures, features_names])
        input_file.close()

    return (count_structured, count_tot)
def unstr_deparsing(config, threshold, sourcepath, deparsInput, source, formated_timestamps):
    '''
    Deparsing process for unstructured text based data sources like a log file.

    Two passes over every file in sourcepath:
      1. Count, for each log entry whose timestamp is in formated_timestamps,
         how many of the requested features it matches.
      2. Re-read the files and write entries matching more than the computed
         number of features to OUTDIR/output_<source>.

    Returns (count_unstructured, count_tot): entries extracted vs. entries seen.
    '''
    OUTDIR = config['OUTDIR']
    features = deparsInput['features']

    # Features defined in the config file that were requested for deparsing.
    selection = []
    for i in range(len(config['FEATURES'][source])):
        if config['FEATURES'][source][i]['name'] in features:
            selection.append(i)
    FEATURES_sel = [config['FEATURES'][source][i] for i in selection]

    VARIABLES = {}
    for variable in config['SOURCES'][source]['CONFIG']['VARIABLES']:
        try:
            VARIABLES[variable['name']] = variable
        except Exception:
            # Fixed typos in the original message ("Cofiguration ... vriables").
            print("Configuration file error: missing variables")
            exit(1)

    count_unstructured = 0
    count_tot = 0
    print(OUTDIR + "output_" + source)
    output_file = open(OUTDIR + "output_" + source, 'w')

    # First read: count matched features per entry, per file.
    feat_appear = {}
    for file in sourcepath:
        feat_appear[file] = []
        if file.endswith('.gz'):
            input_file = gzip.open(file, 'r')
        else:
            input_file = open(file, 'r')

        line = input_file.readline()
        if line:
            log = ""
            while line:
                log += line
                if len(log.split(config['SEPARATOR'][source])) > 1:
                    logExtract = log.split(config['SEPARATOR'][source])[0]
                    # For each log, extract timestamp with regular expresions and
                    # check if it is in the input timestamps.
                    try:
                        t = getUnstructuredTime(logExtract, VARIABLES['timestamp']['where'],
                                                config['SOURCES'][source]['CONFIG']['timestamp_format'])
                        if str(t).strip() in formated_timestamps:
                            # Check if features appear in the log to write in the file.
                            record = faac.Record(logExtract,
                                                 config['SOURCES'][source]['CONFIG']['VARIABLES'],
                                                 config['STRUCTURED'][source], config['All'])
                            obs = faac.Observation.fromRecord(record, FEATURES_sel)
                            feat_appear[file].append(sum([obs.data[i].value for i in range(len(obs.data))]))
                    except Exception:
                        pass  # best-effort: unparsable entries are skipped
                    log = ""
                    for n in logExtract.split(config['SEPARATOR'][source])[1::]:
                        log += n
                line = input_file.readline()

            # Deal with the last log, not processed during while loop.
            log += line
            try:
                t = getUnstructuredTime(log, VARIABLES['timestamp']['where'],
                                        config['SOURCES'][source]['CONFIG']['timestamp_format'])
                # BUG FIX: the original tested `str(t) in timestamps`, but no
                # name `timestamps` exists in this scope (a NameError silently
                # hidden by the bare except, so the last log was never counted).
                # The intended collection is formated_timestamps.
                if str(t) in formated_timestamps:
                    # NOTE(review): record built from logExtract (previous
                    # entry) while the timestamp comes from log — kept as the
                    # original did; confirm intent.
                    record = faac.Record(logExtract,
                                         config['SOURCES'][source]['CONFIG']['VARIABLES'],
                                         config['STRUCTURED'][source], config['All'])
                    obs = faac.Observation.fromRecord(record, FEATURES_sel)
                    feat_appear[file].append(sum([obs.data[i].value for i in range(len(obs.data))]))
            except Exception:
                pass

        input_file.close()

    # Obtain number of features needed to extract the log.
    features_needed = len(features)
    count = 0
    while count < int(threshold) and (not features_needed <= 1):
        for file in feat_appear:
            count += feat_appear[file].count(int(features_needed))
        print("There are " + str(count) + " unstructured logs with more than " +
              str(features_needed) + " matching features...")
        features_needed -= 1

    # Second read: write out entries matching more than features_needed features.
    for file in sourcepath:
        index = 0
        if file.endswith('.gz'):
            input_file = gzip.open(file, 'r')
        else:
            input_file = open(file, 'r')
        input_file.seek(0)

        line = input_file.readline()
        if line:
            # BUG FIX: the original initialized `log = "" + line` and then
            # executed `log += line` again in the first loop iteration,
            # duplicating the first line of every file. Start empty, exactly
            # as the first pass does.
            log = ""
            while line:
                log += line
                if len(log.split(config['SEPARATOR'][source])) > 1:
                    count_tot += 1
                    logExtract = log.split(config['SEPARATOR'][source])[0]
                    # For each log, extract timestamp with regular expresions and
                    # check if it is in the input timestamps.
                    try:
                        t = getUnstructuredTime(logExtract, VARIABLES['timestamp']['where'],
                                                config['SOURCES'][source]['CONFIG']['timestamp_format'])
                        if str(t).strip() in formated_timestamps:
                            # index tracks the first-pass feat_appear list, which
                            # only has entries for timestamp-matching logs.
                            if feat_appear[file][index] > features_needed:
                                output_file.write(logExtract + config['SEPARATOR'][source])
                                count_unstructured += 1
                            index += 1
                    except Exception:
                        pass
                    log = ""
                    for n in logExtract.split(config['SEPARATOR'][source])[1::]:
                        log += n
                line = input_file.readline()
        input_file.close()
    output_file.close()

    return (count_unstructured, count_tot)
def process_log(log, config, source):
    '''
    Transform one raw data entry (log) into a preliminary observation.

    Empty logs, logs whose timestamp falls outside the configured sampling
    window ('start'/'end' in config['Time']), and logs that fail tag building
    are ignored and yield (None, None).

    Returns:
        (tag, obs): tag is the "%Y%m%d%H%M" time string (optionally extended
        into a tuple with the first value of each configured key variable);
        obs is the faac.Observation built from the record.
    '''
    ignore_log = 0  # flag to skip processing this log
    if not log or not log.strip():
        ignore_log = 1  # do not process empty logs or containing only spaces
        print('\033[31m' + "The entry log is empty and will not be processed\n" + '\033[m')

    if not ignore_log:
        record = faac.Record(log, config['SOURCES'][source]['CONFIG']['VARIABLES'],
                             config['STRUCTURED'][source], config['TSFORMAT'][source],
                             config['All'])
        if debugmode:
            faac.debugProgram('fcparser.process_log.record', [record])

        obs = faac.Observation.fromRecord(record, config['FEATURES'][source])
        if debugmode:
            faac.debugProgram('fcparser.process_log.observation', [obs])

        timearg = config['TIMEARG'][source]  # name of variable which contains timestamp
        log_timestamp = record.variables[timearg][0].value

        # Check if log_timestamp will be considered according to time sampling parameters.
        if 'start' in config['Time'] and log_timestamp < config['Time']['start']:
            ignore_log = 1
        if 'end' in config['Time'] and log_timestamp > config['Time']['end']:
            ignore_log = 1

    if not ignore_log:
        window = config['Time']['window']
        try:
            # The normalized time tag is needed in both branches: compute once.
            timetag = normalize_timestamps(log_timestamp, window).strftime("%Y%m%d%H%M")
            if config['Keys']:
                tag = [timetag]
                for key in config['Keys']:
                    if len(record.variables[key]) > 0:
                        # Careful!, only works (intentionally) for the first
                        # instance of a variable in a record.
                        tag.append(str(record.variables[key][0]))
                tag = tuple(tag) if len(tag) > 1 else tag[0]
            else:
                tag = timetag
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are no longer swallowed; malformed entries are still skipped.
            tag, obs = None, None
            if debugmode:
                print('\033[31m' + "This entry log would be ignored due to errors" + '\033[m')
    else:
        tag, obs = None, None

    return tag, obs