Example No. 1
def process_log(log, config, source):
    '''
    Take one data entry as input and transform it into a preliminary observation.
    '''
    record = faac.Record(log, config['SOURCES'][source]['CONFIG']['VARIABLES'],
                         config['STRUCTURED'][source], config['All'])
    obs = faac.Observation.fromRecord(record, config['FEATURES'][source])

    if config['Keys']:
        tag = list()
        tag2 = normalize_timestamps(record.variables['timestamp'][0], config,
                                    source)
        tag.append(tag2.strftime("%Y%m%d%H%M"))
        for i in range(len(config['Keys'])):
            if len(record.variables[config['Keys'][i]]) > 0:
                tag.append(
                    str(record.variables[config['Keys'][i]][0])
                )  # Careful! This intentionally uses only the first instance of a variable in a record
        if len(tag) > 1:
            tag = tuple(tag)
        else:
            tag = tag[0]
    else:
        tag2 = normalize_timestamps(record.variables['timestamp'][0], config,
                                    source)
        tag = tag2.strftime("%Y%m%d%H%M")

    return tag, obs
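
The helper normalize_timestamps is not shown on this page. Below is a minimal sketch, assuming it floors the record's timestamp (a datetime) to the start of its sampling window, with the window size in minutes read from config['Time']['window']; the real faac/FCParser helper may differ in details.

def normalize_timestamps(timestamp, config, source):
    # Sketch only: floor a datetime to the start of its sampling window.
    # Assumes config['Time']['window'] is a number of minutes dividing 60.
    window = config['Time']['window']
    minute = (timestamp.minute // window) * window
    return timestamp.replace(minute=minute, second=0, microsecond=0)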
Example No. 2
def process_log(log, config, source, instances):
    '''
    Take one data entry as input and transform it into a preliminary observation.
    '''
    timearg = config['TIMEARG'][
        source]  # name of variable which contains timestamp
    record = faac.Record(
        log, config['SOURCES'][source]['CONFIG']['VARIABLES'],
        config['STRUCTURED'][source],
        config['SOURCES'][source]['CONFIG']['timestamp_format'], config['All'])

    instances['count'] += 1

    for variable, features in record.variables.items():
        if variable != timearg:
            counts = instances.setdefault(variable, dict())
            for feature in features:
                counts[str(feature)] = counts.get(str(feature), 0) + 1

    return instances
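
A hypothetical driver loop for this variant, assuming config and source have already been loaded by the surrounding parser and that each line holds one record (the file name is illustrative):

instances = {'count': 0}
with open('example.log') as input_file:
    for log in input_file:
        instances = process_log(log, config, source, instances)
print("%d records processed" % instances['count'])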
Example No. 3
def unstr_deparsing(config, sourcepath, deparsInput, source, formated_timestamps):
    '''
    Deparsing process for unstructured, text-based data sources such as a log file.
    '''
    threshold = config['threshold']
    OUTDIR = config['OUTDIR']
    depars_features = deparsInput['features']
    timearg = config['TIMEARG'][source]  # name of the variable containing the timestamp

    selection = []  # indices of features in config file matching depars_features
    for i in range(len(config['FEATURES'][source])):
        if config['FEATURES'][source][i]['name'] in depars_features:
            selection.append(i)

    FEATURES_sel = []   # all feature fields for features in depars_features
    for i in selection:
        FEATURES_sel.append(config['FEATURES'][source][i])

    VARIABLES = {}  # all variables from config file

    for variable in config['SOURCES'][source]['CONFIG']['VARIABLES']:
        try:
            VARIABLES[variable['name']] = variable
        except KeyError:
            print("Configuration file error: missing variables")
            exit(1)
    
    count_unstructured = 0
    count_tot = 0

    # while count_source < lines[source]*0.01 and (not features_needed <= 0) : 
    feat_appear = {}
    indices = {}
    for file in sourcepath:
        feat_appear[file] = []   
        indices[file] = {}
        for nfeatures in range(len(depars_features),0,-1):
            indices[file][nfeatures] = []   # one list of line-index ranges per feature count

        if file.endswith('.gz'):
            input_file = gzip.open(file,'r')
        else:
            input_file = open(file,'r')
            
        if debugmode:
            faac.debugProgram('fcdeparser.load_message', [file])


        # First read to generate list of number of appearances
        line = input_file.readline()
        nline=0
        log_indices = []

        if line:
            log = "" 
            log_indices.append(nline+1)
            while line:
                nline+=1
                log += line 
    
                if len(log.split(config['RECORD_SEPARATOR'][source])) > 1:
                    count_tot+=1
                    log_indices.append(nline)
                    logExtract = log.split(config['RECORD_SEPARATOR'][source])[0]
                    
                    # For each log, extract the timestamp with regular expressions and check whether it is in formated_timestamps
                    try:
                        t = getUnstructuredTime(logExtract, VARIABLES[timearg]['where'], config['TSFORMAT'][source])                    
                        if str(t).strip() in formated_timestamps or not formated_timestamps:    
                            # Check if features appear in the log in order to write in the file later
                            record = faac.Record(logExtract,config['SOURCES'][source]['CONFIG']['VARIABLES'], config['STRUCTURED'][source], config['TSFORMAT'][source], config['All'])
                            obs = faac.Observation.fromRecord(record, FEATURES_sel)
                            feature_count = sum( [obs.data[i].value for i in range(len(obs.data))] )
                            feat_appear[file].append(feature_count)
                            indices[file][feature_count].append(log_indices)
                    except Exception:
                        pass  # logs without a parsable timestamp are skipped

                    # keep whatever follows the separator: it belongs to the next log
                    remainder = log.split(config['RECORD_SEPARATOR'][source])[1:]
                    log = ""
                    log_indices = [nline+1] # reset log_indices, adding the first line of the next log
                    for n in remainder:
                        log += n    # if the next log starts on the same line, keep the remaining part
                        log_indices = [nline]   # in that case, the next log starts on the current line
                
                line = input_file.readline()

            # Deal with the last log, which is not processed inside the while loop.
            log += line
            log_indices.append(nline)
            try:
                t = getUnstructuredTime(log, VARIABLES[timearg]['where'], config['TSFORMAT'][source])
                if str(t).strip() in formated_timestamps or not formated_timestamps:
                    record = faac.Record(log, config['SOURCES'][source]['CONFIG']['VARIABLES'], config['STRUCTURED'][source], config['TSFORMAT'][source], config['All'])
                    obs = faac.Observation.fromRecord(record, FEATURES_sel)
                    feature_count = sum([obs.data[i].value for i in range(len(obs.data))])
                    feat_appear[file].append(feature_count)
                    indices[file][feature_count].append(log_indices)

            except Exception:
                pass

        input_file.close()
        
        # Print number of matched logs for each features number (feature selection criteria)
        matched_logs = faac.debugProgram('fcdeparser.unstr_deparsing.feat_appear', [feat_appear[file], depars_features])

        
    # Obtain number of features needed to extract the log with the given threshold
    features_threshold = len(depars_features)
    count = 0
    while features_threshold>0:     # if no threshold, extract all logs with >0 matched features
        if not threshold or (threshold and count < int(threshold)): 
            nfeatures = features_threshold
            for file in feat_appear:
                count += feat_appear[file].count(int(nfeatures))
            features_threshold -= 1
        else:
            break
    
        
    if debugmode:
        opmode = faac.debugProgram('fcdeparser.unstr_deparsing.user_input', [config['threshold'], features_threshold, matched_logs])
    else:
        if threshold:
            print("Considering the feature counters and a threshold of %d log entries, we will extract logs with >=%d matched features" %(config['threshold'],features_threshold+1))
        else:
            print("As no threshold is defined, we will extract all logs with >=1 matched features")
        print("Note that the output will be generated in different files according to their number of features")
        
        
    # Re-read the desired lines
    for file in sourcepath:
    
        if file.endswith('.gz'):
            input_file = gzip.open(file,'r')
        else:
            input_file = open(file,'r')  
            
        
        if not debugmode:
            for nfeatures in range(len(depars_features),features_threshold,-1):
                if indices[file][nfeatures]:
                    output_file = open(OUTDIR + "output_%s_%sfeat" %(source,nfeatures),'w')
                    for line_indices in indices[file][nfeatures]:
                        log=""
                        for index in range(line_indices[0], 1+line_indices[1]):
                            log+=linecache.getline(file, index)
                        
                        parts = log.split(config['RECORD_SEPARATOR'][source])
                        logExtract = parts[0]
                        if len(parts) > 1 and parts[1]:
                            logExtract = parts[1] # if there are characters after the separator, take them instead
                        output_file.write(logExtract + config['RECORD_SEPARATOR'][source])
                        count_unstructured += 1
                    output_file.close()
                
        else:
            index = 0
            index_deparsed = 0
            line = input_file.readline()
                
            if line:
                log = ""     
                while line:
                    log += line 
                    if len(log.split(config['RECORD_SEPARATOR'][source])) > 1:
                        logExtract = log.split(config['RECORD_SEPARATOR'][source])[0]
                        try:
                            t = getUnstructuredTime(logExtract, VARIABLES[timearg]['where'], config['TSFORMAT'][source])     
                            if str(t).strip() in formated_timestamps or not formated_timestamps:
                                if feat_appear[file][index_deparsed] > features_threshold and opmode in {1,2}:    
                                    faac.debugProgram('fcdeparser.unstr_deparsing.deparsed_log', [index+1, logExtract, feat_appear[file][index_deparsed], opmode])
                                elif opmode in {1}:
                                    faac.debugProgram('fcdeparser.unstr_deparsing.unmatched_criteria1', [index+1, logExtract, feat_appear[file][index_deparsed]])
                                index_deparsed+=1
                            elif opmode in {1}:
                                faac.debugProgram('fcdeparser.unstr_deparsing.unmatched_criteria2', [index+1, logExtract])
                            index += 1
                        except SystemExit:
                            exit(1)
                        except Exception:
                            pass

                        log = ""
                        for n in logExtract.split(config['RECORD_SEPARATOR'][source])[1::]:
                            log += n
                    line = input_file.readline()

        
        input_file.close()
    
    return (count_unstructured, count_tot)
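
getUnstructuredTime is defined elsewhere in the deparser. A minimal sketch follows, assuming the variable's 'where' field is a regular expression that locates the timestamp in the raw log and that the helper returns a string comparable against formated_timestamps; the real implementation may differ.

import re
from datetime import datetime

def getUnstructuredTime(log, where, tsformat):
    # Sketch only: 'where' is assumed to be a regex matching the raw timestamp.
    match = re.search(where, log)
    if match is None:
        return None
    date = datetime.strptime(match.group(0), tsformat)
    # Normalize to minute resolution before the membership tests above.
    return date.strftime('%Y-%m-%d %H:%M:00')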
Example No. 4
def stru_deparsing(config, sourcepath, deparsInput, source, formated_timestamps):
    '''
    Deparsing process for structured data sources such as CSV files.
    '''
    threshold = config['threshold']
    OUTDIR = config['OUTDIR']
    depars_features = deparsInput['features']  # features in deparsing_input file

    # Store features and variables from config. file in dictionaries FEATURES, VARIABLES
    FEATURES = {}
    VARIABLES = {}
    for feature in config['SOURCES'][source]['CONFIG']['FEATURES']:
        try:
            FEATURES[feature['name']] = feature
        except KeyError:
            print("Configuration file error: missing features")
            exit(1)

    for variable in config['SOURCES'][source]['CONFIG']['VARIABLES']:
        try:
            VARIABLES[variable['name']] = variable
        except KeyError:
            print("Configuration file error: missing variables")
            exit(1)
            
    selection = []  # indices of features in config file matching depars_features
    for i in range(len(config['FEATURES'][source])):
        if config['FEATURES'][source][i]['name'] in depars_features:
            selection.append(i)

    FEATURES_sel = []   # all feature fields for features in depars_features
    for i in selection:
        FEATURES_sel.append(config['FEATURES'][source][i])
            
    timestamp_pos = VARIABLES[config['TIMEARG'][source]]['where']   # position (column) of timestamp field


    count_structured = 0    # structured logs found during deparsing process
    count_tot = 0           # total logs
    feat_appear = {}
    feat_appear_names = {}
    
    for file in sourcepath:
        feat_appear[file] = []
        feat_appear_names[file] = []
        
        if file.endswith('.gz'):
            input_file = gzip.open(file,'r')
        else:
            input_file = open(file,'r')

        if debugmode:
            faac.debugProgram('fcdeparser.load_message', [file])

        line = input_file.readline()
        # First read to generate a list with the number of depars_features present in each line
        nline=0   
        while line:
            nline+=1  
            try:
                t = getStructuredTime(line, timestamp_pos, config['TSFORMAT'][source])  # timestamp in that line

                # count how many features appear in the line if its timestamp is included in formated_timestamps
                if t.strip() in formated_timestamps or not formated_timestamps:
                    record = faac.Record(line,config['SOURCES'][source]['CONFIG']['VARIABLES'], config['STRUCTURED'][source], config['TSFORMAT'][source], config['All'])
                    obs = faac.Observation.fromRecord(record, FEATURES_sel)         # to make default features counter work properly, use config['FEATURES'][source] instead of FEATURES_sel (but execution will be significantly slower)
                    feature_count, matched_features = search_features_str(obs, VARIABLES)
                    feat_appear[file].append(feature_count)
                    feat_appear_names[file].append(matched_features)
                    
                else:
                    # it is necessary to fill with zeros so that indices match the lines later
                    feat_appear[file].append(0) 
                    feat_appear_names[file].append([])
                    
            except Exception as error:
                print('\033[33m' + "Error finding features in line %d: %s" % (nline, error) + '\033[m')
                feat_appear[file].append(0)
                feat_appear_names[file].append([])
                    
            line = input_file.readline()
        input_file.close()
        count_tot+=nline    # add nlines of this source to total lines counter
        
        # Print number of matched logs for each features number (feature selection criteria)
        matched_lines = faac.debugProgram('fcdeparser.stru_deparsing.feat_appear', [feat_appear[file], depars_features, nline])
    
    
    # Obtain number of features needed to extract the log with the given threshold
    features_threshold = len(depars_features)
    indices = {}
    for file in sourcepath:
        indices[file] = {}   # a new dict for every data file
    count = 0
    
    while features_threshold>0:     # if no threshold, extract all logs with >0 matched features
        if not threshold or (threshold and count < int(threshold)): 
            nfeatures = features_threshold
            for file in feat_appear:
                count += feat_appear[file].count(int(nfeatures))
                indices[file][nfeatures] = [i for i, val in enumerate(feat_appear[file]) if val==nfeatures]
            features_threshold -= 1
        else:
            break
    
    
    if debugmode:
        opmode = faac.debugProgram('fcdeparser.stru_deparsing.user_input', [config['threshold'], features_threshold, matched_lines])
    else:
        if threshold:
            print("Considering the feature counters and a threshold of %d log entries, we will extract logs with >=%d matched features" %(config['threshold'],features_threshold+1))
        else:
            print("As no threshold is defined, we will extract all logs with >=1 matched features")
        print("Note that the output will be generated in different files according to their number of features")

        
    # Re-read the file extracting the raw data
    for file in sourcepath:
        
        if file.endswith('.gz'):
            input_file = gzip.open(file,'r')
        else:
            input_file = open(file,'r')
        
        
        if not debugmode:
            for nfeatures in indices[file]:
                if indices[file][nfeatures]:
                    output_file = open(OUTDIR + "output_%s_%sfeat" %(source,nfeatures),'w')
                    for line_index in indices[file][nfeatures]:
                        line = linecache.getline(file, line_index+1) # linecache indices start at 1
                        output_file.write(line)  # getline already keeps the trailing newline
                        count_structured += 1
                    output_file.close()
                
        else:
            for position, line in enumerate(input_file):
                nfeatures = feat_appear[file][position]
                features_names = feat_appear_names[file][position]
                if (nfeatures>features_threshold and position in indices[file][nfeatures]) and opmode in {1,2}: 
                    faac.debugProgram('fcdeparser.stru_deparsing.deparsed_log', [position+1, line, nfeatures, features_names, opmode])
                elif opmode in {1}:
                    faac.debugProgram('fcdeparser.stru_deparsing.unmatched_criteria', [position+1, line, nfeatures, features_names])
                
                
        input_file.close()

    return (count_structured, count_tot)
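
getStructuredTime plays the same role for structured sources. A minimal sketch is shown below, assuming comma-separated fields and that pos is the timestamp column (timestamp_pos above); the actual helper may parse differently.

from datetime import datetime

def getStructuredTime(line, pos, tsformat):
    # Sketch only: pick the timestamp column from a CSV line and
    # normalize it to minute resolution for the membership tests above.
    value = line.split(',')[pos].strip()
    date = datetime.strptime(value, tsformat)
    return date.strftime('%Y-%m-%d %H:%M:00')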
Example No. 5
def unstr_deparsing(config, threshold, sourcepath, deparsInput, source,
                    formated_timestamps):
    '''
    Deparsing process for unstructured, text-based data sources such as a log file.
    '''
    OUTDIR = config['OUTDIR']
    features = deparsInput['features']

    selection = []
    for i in range(len(config['FEATURES'][source])):
        if config['FEATURES'][source][i]['name'] in features:
            selection.append(i)

    FEATURES_sel = []
    for i in selection:
        FEATURES_sel.append(config['FEATURES'][source][i])

    VARIABLES = {}

    for variable in config['SOURCES'][source]['CONFIG']['VARIABLES']:
        try:
            VARIABLES[variable['name']] = variable
        except KeyError:
            print("Configuration file error: missing variables")
            exit(1)

    count_unstructured = 0
    count_tot = 0
    print(OUTDIR + "output_" + source)
    output_file = open(OUTDIR + "output_" + source, 'w')

    # while count_source < lines[source]*0.01 and (not features_needed <= 0) :
    feat_appear = {}
    for file in sourcepath:
        feat_appear[file] = []

        if file.endswith('.gz'):
            input_file = gzip.open(file, 'r')
        else:
            input_file = open(file, 'r')

        line = input_file.readline()

        # First read to generate list of number of appearances
        if line:
            log = ""
            while line:
                log += line

                if len(log.split(config['SEPARATOR'][source])) > 1:
                    logExtract = log.split(config['SEPARATOR'][source])[0]

                    # For each log, extract the timestamp with regular expressions
                    # and check whether it is in the input timestamps
                    try:

                        t = getUnstructuredTime(
                            logExtract, VARIABLES['timestamp']['where'],
                            config['SOURCES'][source]['CONFIG']
                            ['timestamp_format'])
                        if str(t).strip() in formated_timestamps:
                            # Check if features appear in the log to write in the file.
                            record = faac.Record(
                                logExtract, config['SOURCES'][source]['CONFIG']
                                ['VARIABLES'], config['STRUCTURED'][source],
                                config['All'])
                            obs = faac.Observation.fromRecord(
                                record, FEATURES_sel)
                            feat_appear[file].append(
                                sum([
                                    obs.data[i].value
                                    for i in range(len(obs.data))
                                ]))
                    except Exception:
                        pass

                    # keep whatever follows the separator: it belongs to the next log
                    remainder = log.split(config['SEPARATOR'][source])[1:]
                    log = ""
                    for n in remainder:
                        log += n
                line = input_file.readline()

            # Deal with the last log, not processed during while loop.
            log += line
            try:
                t = getUnstructuredTime(
                    log, VARIABLES['timestamp']['where'],
                    config['SOURCES'][source]['CONFIG']['timestamp_format'])
                if str(t).strip() in formated_timestamps:
                    record = faac.Record(
                        log,
                        config['SOURCES'][source]['CONFIG']['VARIABLES'],
                        config['STRUCTURED'][source], config['All'])
                    obs = faac.Observation.fromRecord(record, FEATURES_sel)
                    feat_appear[file].append(
                        sum([obs.data[i].value for i in range(len(obs.data))]))
            except Exception:
                pass

        input_file.close()

    # Obtain the number of features needed to extract the log
    features_needed = len(features)
    count = 0
    while count < int(threshold) and features_needed > 1:
        for file in feat_appear:
            count += feat_appear[file].count(int(features_needed))

        print("There are " + str(count) +
              " unstructured logs with more than " + str(features_needed) +
              " matching features...")
        features_needed -= 1

    # Re-read the file
    for file in sourcepath:
        index = 0

        if file.endswith('.gz'):
            input_file = gzip.open(file, 'r')
        else:
            input_file = open(file, 'r')

        input_file.seek(0)
        line = input_file.readline()

        if line:
            log = "" + line
            while line:
                log += line
                if len(log.split(config['SEPARATOR'][source])) > 1:
                    count_tot += 1
                    logExtract = log.split(config['SEPARATOR'][source])[0]

                    # For each log, extract the timestamp with regular expressions
                    # and check whether it is in the input timestamps
                    try:
                        t = getUnstructuredTime(
                            logExtract, VARIABLES['timestamp']['where'],
                            config['SOURCES'][source]['CONFIG']
                            ['timestamp_format'])
                        if str(t).strip() in formated_timestamps:
                            # Check if features appear in the log to write in the file.
                            if feat_appear[file][index] > features_needed:
                                output_file.write(logExtract +
                                                  config['SEPARATOR'][source])
                                count_unstructured += 1
                            index += 1
                    except Exception:
                        pass

                    # keep whatever follows the separator for the next log
                    remainder = log.split(config['SEPARATOR'][source])[1:]
                    log = ""
                    for n in remainder:
                        log += n
                line = input_file.readline()

        input_file.close()
    output_file.close()
    return (count_unstructured, count_tot)
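
An illustrative call for this older variant, assuming the surrounding tool has already built config, deparsInput and the list of sampled timestamps (all names come from the signature above):

count_unstr, count_tot = unstr_deparsing(config, threshold, sourcepath,
                                         deparsInput, source,
                                         formated_timestamps)
print("%d of %d unstructured logs extracted" % (count_unstr, count_tot))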
Example No. 6
def process_log(log, config, source):
    '''
    Take one data entry as input and transform it into a preliminary observation.
    '''

    ignore_log = 0  # flag to skip processing this log
    if not log or not log.strip():
        ignore_log = 1  # do not process empty logs or containing only spaces
        print('\033[31m' +
              "The entry log is empty and will not be processed\n" + '\033[m')

    if not ignore_log:
        record = faac.Record(log,
                             config['SOURCES'][source]['CONFIG']['VARIABLES'],
                             config['STRUCTURED'][source],
                             config['TSFORMAT'][source], config['All'])
        if debugmode:
            faac.debugProgram('fcparser.process_log.record', [record])

        obs = faac.Observation.fromRecord(record, config['FEATURES'][source])
        if debugmode:
            faac.debugProgram('fcparser.process_log.observation', [obs])

        timearg = config['TIMEARG'][
            source]  # name of variable which contains timestamp
        log_timestamp = record.variables[timearg][0].value

        # Check if log_timestamp will be considered according to time sampling parameters
        if 'start' in config['Time']:
            if log_timestamp < config['Time']['start']:
                ignore_log = 1
        if 'end' in config['Time']:
            if log_timestamp > config['Time']['end']:
                ignore_log = 1

    if not ignore_log:
        window = config['Time']['window']
        try:
            if config['Keys']:
                tag = list()
                tag2 = normalize_timestamps(log_timestamp, window)
                tag.append(tag2.strftime("%Y%m%d%H%M"))
                for i in range(len(config['Keys'])):
                    if len(record.variables[config['Keys'][i]]) > 0:
                        tag.append(
                            str(record.variables[config['Keys'][i]][0])
                        )  # Careful! This intentionally uses only the first instance of a variable in a record
                if len(tag) > 1:
                    tag = tuple(tag)
                else:
                    tag = tag[0]
            else:
                tag2 = normalize_timestamps(log_timestamp, window)
                tag = tag2.strftime("%Y%m%d%H%M")

        except Exception:
            # e.g. a malformed timestamp or a missing key variable in the record
            tag, obs = None, None
            if debugmode:
                print('\033[31m' +
                      "This entry log will be ignored due to errors" +
                      '\033[m')

    else:
        tag, obs = None, None

    return tag, obs
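
A hypothetical aggregation step that groups the observations returned by this variant under their (time window, key) tag; the file name and grouping policy are illustrative only:

observations = {}
with open('example.log') as input_file:
    for log in input_file:
        tag, obs = process_log(log, config, source)
        if tag is not None:
            observations.setdefault(tag, []).append(obs)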