def read_dpi_logs(domain_patterns, golden_config_dict, analyzer_conf_dict, mdreq, device_map_name):
    headers_list = []
    global cmds_list
    global header_cols
    global comp
    global fields_old
    global TOTAL_COLS
    num_lines = 0
    filename = None
    try:
        fire_pbr_for_cacheable_url = analyzer_conf_dict["fire_pbr_for_cacheable_url"].strip().lower()
    except KeyError:
        fire_pbr_for_cacheable_url = "no"
    MAX_FILTER_RULES_NODE = "/nkn/device_map/config/" + device_map_name + "/filter_config/max_rules"
    try:
        max_filter_rules = mdreq.query(MAX_FILTER_RULES_NODE)
    except KeyError:
        dpiloganalyzer_log.error("The filter_config/max_rules node is not available. Defaulting to max-rules of 10000")
        max_filter_rules = 10000
    # Look for files starting with the dpilog_ keyword and with a number as the extension.
    # Sort the dpi log files by timestamp so the latest files are processed first.
    dpi_logfiles = sorted(glob.glob("dpilog_*.[0-9]*"), key=lambda x: os.path.getmtime(x), reverse=True)
    for filename in dpi_logfiles:
        with open(filename, "a+") as fp:
            # Check whether the file has already been processed by looking for the signature at the end
            fp.seek(-9, os.SEEK_END)
            if fp.read().strip() == header.SIGNATURE:
                continue
            dpiloganalyzer_log.info("Reading the file %s" % filename)
            # New file, so go back to the beginning
            fp.seek(0, 0)
            line = fp.readline()
            # Loop until we hit the '#Fields' line in the log file;
            # readline() returns '' at end of file
            while line and "#Fields" not in line:
                line = fp.readline()
            # If no '#Fields' line is present in the log file then skip that file
            if not line:
                dpiloganalyzer_log.info("The '#Fields' line is not present in the dpilog. Skipping the file")
                continue
            fields_new = line
            # Regenerate the header-to-column mapping only if the format has changed between files
            if fields_new != fields_old:
                fields_old = fields_new
                # If the dpi log file does not have all the mandatory http headers then skip that file
                http_headers = line[8:].strip()
                for item in http_headers.split():
                    # Strip the surrounding quotes from quoted header names
                    if item[0] == '"' or item[0] == "'":
                        headers_list.append(item[1:-1])
                    else:
                        headers_list.append(item)
                if not isMandatoryHttpHeadersPresent(headers_list):
                    dpiloganalyzer_log.info("One or more of the mandatory http headers is missing in the dpilog")
                    continue
                # Generate the http header to column mapping
                header_cols = gen_utils.generate_headers_column_mapping(headers_list)
                TOTAL_COLS = len(headers_list)
                del headers_list[:]
            num_lines = 0
            for line in fp:
                cols = comp.findall(line)
                # If the number of columns in the line doesn't match TOTAL_COLS, skip it
                if len(cols) != TOTAL_COLS:
                    continue
                # If the Host header is not present in the line, skip it
                host = cols[header_cols["cs_Host_"]]
                if host == "-":
                    continue
                # If the host header matches the domain pattern and the uri matches the uri pattern
                # of the golden config then the pbr should be fired
                result = is_golden_config_match(golden_config_dict, cols, line)
                num_lines += 1
                if result or (fire_pbr_for_cacheable_url == "yes"
                              and isCacheable(line, cols, domain_patterns, num_lines)):
                    # Get the dest-ip from the log
                    destip = cols[header_cols["s_ip"]]
                    # Queue a command only for dest-ips we have not seen before;
                    # the dict membership check is faster than scanning a list
                    if destip not in dest_ips:
                        dest_ips[destip] = 1
                        set_str = "set policy-options prefix-list redirect-to-proxy %s" % destip
                        cmds_list.append(set_str)
                    else:
                        dest_ips[destip] += 1
            # Sign the file so it is skipped on the next pass
            fp.write(header.SIGNATURE)
            try:
                if len(cmds_list) >= int(max_filter_rules):
                    sendPbrs(cmds_list)
            except ValueError:
                dpiloganalyzer_log.info("An integer value has to be specified for max_filter_rules in analyzer.conf. Defaulting to 10000")
                max_filter_rules = 10000
                if len(cmds_list) >= int(max_filter_rules):
                    sendPbrs(cmds_list)
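# --- Example: the end-of-file signature check, as a minimal standalone sketch ---
# read_dpi_logs() marks a processed file by appending header.SIGNATURE and, on the
# next pass, seeking near the end to look for it. The sketch below shows the same
# idea in isolation. The marker value is a hypothetical stand-in for
# header.SIGNATURE, not the real constant. Opening in binary mode sidesteps the
# Python 3 restriction on non-zero end-relative seeks in text mode and also
# handles files shorter than the marker.

import os

_SIGNATURE = "#PROCESSED"  # hypothetical marker; stands in for header.SIGNATURE

def is_already_processed(path, signature=_SIGNATURE):
    """Return True if the file ends with the given signature marker."""
    with open(path, "rb") as f:
        f.seek(0, os.SEEK_END)
        # A file shorter than the marker cannot have been signed yet
        if f.tell() < len(signature):
            return False
        f.seek(-len(signature), os.SEEK_END)
        return f.read().decode("ascii", "replace") == signature

def mark_processed(path, signature=_SIGNATURE):
    """Append the signature marker so later passes skip this file."""
    if not is_already_processed(path, signature):
        with open(path, "a") as f:
            f.write(signature)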
def parse_mfc_accesslog(generic_namespace_name, analyzer_conf_dict, mfc_accesslog_path):
    namespace_dict = dict()
    filename = None
    global fields_old
    global num_files_read
    global header_cols
    global TOTAL_COLS
    # Sort the accesslog files by timestamp so the latest files are read first
    accesslog_files = sorted(glob.glob('*access.log*.gz'), key=lambda x: os.path.getmtime(x), reverse=True)
    # Loop through all the accesslog files
    for filename in accesslog_files:
        # Skip accesslog files that have already been processed
        if isFileAlreadyProcessessed(filename):
            continue
        mfcloganalyzer_log.info("Reading the file %s" % filename)
        fp = gen_utils.zlib_file()
        start_time = time.time()
        fp.open(filename)
        line = fp.readline()
        # Loop until we hit the '#Fields' line in the log file;
        # readline() returns '' at end of file
        while line and '#Fields' not in line:
            line = fp.readline()
        # If no '#Fields' line is present in the log file then skip that file
        if not line:
            mfcloganalyzer_log.info("The '#Fields' line is not present in the accesslog. Skipping the file")
            continue
        fields_new = line
        # Regenerate the header-to-column mapping only if the format has changed between files
        if fields_new != fields_old:
            fields_old = fields_new
            # If the accesslog file does not have all the mandatory http headers then skip that file
            http_headers = line[8:].strip()
            headers_list = http_headers.split()
            if not isMandatoryHttpHeadersPresent(headers_list):
                mfcloganalyzer_log.info("One of the mandatory http headers namespace or server-ip is missing in the accesslog")
                continue
            header_cols = gen_utils.generate_headers_column_mapping(headers_list)
            TOTAL_COLS = len(headers_list)
        num_files_read += 1
        num_lines = 0
        # Now read the rest of the mfc accesslog file line by line
        while True:
            line = fp.readline()
            # An empty string means end of file
            if not line:
                break
            line = line.strip()
            # The line is commented, just skip it
            if len(line) > 0 and line[0] == '#':
                continue
            # Split the line into words on whitespace; whitespace inside
            # single or double quotes is considered part of one word
            lst = comp.findall(line)
            # Skip the tunneled data
            if len(lst) == 0 or 'Tunnel' in lst[0]:
                continue
            # If the number of columns logged doesn't match the number of http headers, skip the line.
            # Log only the first mismatch so the log files don't fill up.
            if len(lst) != TOTAL_COLS:
                num_lines += 1
                if num_lines == 1:
                    mfcloganalyzer_log.info("%s", line)
                    mfcloganalyzer_log.info("# of hdrs in 'Fields' doesn't match # of hdrs generated; hdrs with whitespace need to be quoted")
                continue
            namespace = lst[header_cols['x_namespace']]
            dest_ip = lst[header_cols['s_ip']]
            # If the namespace is not the generic namespace, store the namespace name
            # as the key and the list of unique server-ips as the value
            if namespace != generic_namespace_name and namespace != '-':
                if namespace in namespace_dict:
                    server_ip = namespace_dict[namespace]
                    # Append the dest ip only if it is not already in the list
                    if dest_ip not in server_ip:
                        server_ip.append(dest_ip)
                else:
                    # New namespace, so create a fresh list of dest ips
                    namespace_dict[namespace] = [dest_ip]
        fp.close()
        if str(num_files_read) == analyzer_conf_dict['no_of_accesslog_files_read']:
            num_files_read = 0
            send_cmd_on_threshold_limit(namespace_dict)
        # Persist the checksum dictionary entries
        with open('log_analyzer_checksum_dict.pickle', 'wb') as f:
            pickle.dump(checksum_dict, f)
        #mfcloganalyzer_log.info("%s seconds", (time.time() - start_time))
    return 0
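# --- Example: the tokenizer behind comp.findall(), as a minimal sketch ---
# Both parsers split a log line into columns with a precompiled regex held in the
# global 'comp', treating anything inside single or double quotes as one token.
# The exact pattern is not defined in this file; the pattern below is an
# assumption that reproduces the behaviour the comments describe (note that the
# surrounding quotes are kept in each match, which is why read_dpi_logs() strips
# them from quoted header names with item[1:-1]).

import re

comp_sketch = re.compile(r'''"[^"]*"|'[^']*'|\S+''')

sample = '10.0.0.1 example_ns "GET /index.html HTTP/1.1" 200'
print(comp_sketch.findall(sample))
# ['10.0.0.1', 'example_ns', '"GET /index.html HTTP/1.1"', '200']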