def confution_matrix_and_metrics(path):
    lines = fm.readAllLine(path)
    count = 0
    fp = tp = fn = tn = 0
    for line in lines:
        if count != 0:
            text = line.split('|')
            #state = fm.clear_text(text[0], '"', '')
            current_label = fm.clear_text(text[1], '"', '')
            result_label = fm.clear_text(text[2], '"', '')
            if current_label == 'Botnet':
                tp += 1.0 if result_label == current_label else 0
                fn += 1.0 if result_label != current_label else 0
            if current_label == 'Normal':
                fp += 1.0 if result_label != current_label else 0
                tn += 1.0 if result_label == current_label else 0
        count += 1
    total = tp + tn + fp + fn
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1 = (2 * precision * recall) / (precision + recall)
    accuracy = (tp + tn) / total
    print 'tp: ' + str(tp) + '----' + 'fp: ' + str(fp)
    print 'tn: ' + str(tn) + '----' + 'fn: ' + str(fn)
    print '-------------------------------------------'
    print 'Total: ' + str(total)
    print 'Precision: ' + str(precision)
    print 'Recall: ' + str(recall)
    print 'F1 Score: ' + str(F1)
    print 'Accuracy: ' + str(accuracy)

def build_json_with_length_fixed_using_c13_full(path):
    lines = fm.readAllLine(path, 'dataset_Cx')
    fileName = 'dataset_Cx.json'
    fileResult = open(path + os.sep + fileName, 'w')
    fileResult.write('[')
    count = 0  # counter used to skip the header line of the data set
    count_selected_element = 0
    id_list = []
    clusters = get_clusters()
    for line in lines:
        if count != 0 and len(line) > 1:
            text = line.split(' ')
            description_before_clear = fm.clear_text(text[7])
            id = fm.clear_text(text[0])  #[1:-1]
            if len(description_before_clear) > 20 and id not in id_list:
                #id = fm.clear_text(text[0])[1:-1]
                id_list.append(id)
                #id_connection = text[1] + '-' + text[2] + '-' + text[3] + '-' + text[4]
                id_value = text[1] + '-' + text[2] + '-' + text[3] + '-' + text[4]  #fm.clear_text(text[2])
                title = text[6]  #fm.clear_text(text[1], "-")
                # build the document from all sub-words of length 5, 10 and 15
                document = ''
                for word in all_word(description_before_clear, 5):
                    document += word + ' '
                for word in all_word(description_before_clear, 10):
                    document += word + ' '
                for word in all_word(description_before_clear, 15):
                    document += word + ' '
                description = document
                title = create_label(title)  #'Unlabelled' if count_selected_element % 4 == 0 else create_label(title)
                data_json(fileResult, id, title, description, id_value, clusters[count_selected_element])
                count_selected_element += 1
        count += 1
    fileResult.write(']')
    fileResult.close()

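# The all_word() helper used above is defined elsewhere in the project. From the way
# it is called here (collecting every sub-word of a fixed length from a state string),
# a minimal sketch could look like the following; this is an assumption, not the
# original implementation.
def all_word_sketch(text, word_len):
    # Slide a window of size word_len over the text and return every sub-word.
    return [text[i:i + word_len] for i in range(0, len(text) - word_len + 1)]
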
def build_json(path, word_len):
    lines = fm.readAllLine(path, 'dataset_Cx')
    fileName = 'dataset_Cx.json'
    fileResult = open(path + os.sep + fileName, 'w')
    fileResult.write('[')
    count = 0  # counter used to skip the header line of the data set
    count_selected_element = 0
    for line in lines:
        if count != 0 and len(line) > 1:
            text = line.split('|')
            description_before_clear = fm.clear_text(text[3])
            if len(description_before_clear) > 20:
                id = fm.clear_text(text[0])[1:-1]
                id_value = fm.clear_text(text[2])
                title = fm.clear_text(text[1], "-")
                # build the document from all sub-words of the requested length
                document = ''
                for word in all_word(description_before_clear, int(word_len)):
                    document += word + ' '
                description = document
                title = 'Unlabelled' if count_selected_element % 4 == 0 else create_label(title)
                data_json(fileResult, id, title, description, id_value)
                count_selected_element += 1
        count += 1
    fileResult.write(']')
    fileResult.close()

def ranking_similar_words(path, len_of_words):
    lines = file.readAllLine(path, 'ranking')
    fileName = 'ranking'
    #fileResult = open(path + os.sep + fileName, 'w')
    list_all_words = dict()
    result = dict()
    aux_dict = dict()
    count = 0
    for line in lines:
        text = line.split('|')
        if count != 0 and len(text) == 4:
            connection = line.split('|')
            text = connection[3]
            status = file.clear_text(text)
            note = file.clear_text(connection[0])
            if len(status) > len_of_words:
                #making word vector for connections
                list_all_words[note] = data_manager.all_word_list(status, len_of_words)
        count += 1
    for k in list_all_words.keys():
        build_word_ranking(result, list_all_words[k], k, aux_dict)
    # printing result
    d_view = [(v, k) for k, v in aux_dict.iteritems()]
    d_view.sort(reverse=True)
    for v, k in d_view:
        count = result[k][0]
        if count > 1:
            meta_data = ''
            dictMetaData = result[k][1]
            for k2 in dictMetaData.keys():
                meta_data += k2 + '(' + str(dictMetaData[k2]) + '),'
            print k + ' |=> ' + str(count) + ' - ' + meta_data

def secuence_learning_trainer(path):
    lines = fm.readAllLine(path)
    count = 0  # counter used to skip the header line
    graph = create_all_two_posible_subsecuence()
    for line in lines:
        if count != 0:
            text = line.split('|')
            state = fm.clear_text(text[1], '"', '')
            label = fm.clear_text(text[2], '"', '')
            for i in range(0, len(state) - 1):
                tuple = state[i] + state[i + 1]
                if tuple in graph:
                    graph[tuple] += 1 if label == 'Botnet' else -1
        count += 1
    return graph

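# create_all_two_posible_subsecuence() is defined elsewhere; from its use above it
# must return a dict keyed by every two-character transition, initialised to 0.
# A minimal sketch, assuming the alphabet is the character set written as the header
# in build_graph() further below (that alphabet choice is an assumption):
def create_all_two_posible_subsecuence_sketch():
    alphabet = 'abcdefghiABCDEFGHIrstuvwxyzRSTUVWXYZ123456789.-+*0'
    return {a + b: 0 for a in alphabet for b in alphabet}
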
def join_label_form_ips_and_port(path):
    lines = fm.readAllLine(path)  #, 'result')
    fileName = 'capture20110817.binetflow-result.labels'
    fileResult = open(path + os.sep + fileName, 'w')
    id_connection_hash = {}
    id_connection_labels_hash = {}
    count = 0
    count2 = 0
    for line in lines:
        count += 1
        text = line.split(',')
        id_connection = text[1] + ',' + text[2] + ',' + text[3] + ',' + text[0]
        # 1 if normal and -1 if botnet; this way, for a given id_connection the
        # accumulated value in the hash stays positive when the majority is normal.
        num_label = find_label(text[4])
        id_connection_hash[id_connection] = id_connection_hash[id_connection] + num_label if id_connection in id_connection_hash else num_label
        label = fm.clear_text(text[4], 'flow=')
        id_connection_labels_hash[id_connection] = id_connection_labels_hash[id_connection] + label + ' ' if id_connection in id_connection_labels_hash else label + ' '
    for connection in id_connection_hash.keys():
        count2 += 1
        label = find_normal_label(id_connection_labels_hash[connection]) if id_connection_hash[connection] > 0 else find_botnet_label(id_connection_labels_hash[connection])
        fileResult.write(connection + ',' + label + '\n')
        if id_connection_hash[connection] != 1 and id_connection_hash[connection] != -1:
            print connection
    fileResult.close()
    print 'done: ' + str(count) + ' ' + str(count2)

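# find_label() is implemented elsewhere; the comment above only fixes its contract
# (+1 for a normal flow, -1 for a botnet flow). A minimal sketch under that
# assumption, using the label text of the binetflow line:
def find_label_sketch(label_text):
    # Negative vote for botnet flows, positive vote otherwise.
    return -1 if 'Botnet' in label_text else 1
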
def build_graph(path):
    lines = fm.readAllLine(path, 'graph_result')
    fileName = 'graph_result.txt'
    fileResult = open(path + os.sep + fileName, 'w')
    fileResult.write('a,b,c,d,e,f,g,h,i,A,B,C,D,E,F,G,H,I,r,s,t,u,v,w,x,y,z,R,S,T,U,V,W,X,Y,Z,1,2,3,4,5,6,7,8,9,.,-,+,*,0' + '\n')
    count = 0  # counter used to skip the header line of the data set
    count_selected_element = 0
    matrix = list_of_characters()
    for line in lines:
        if count != 0 and len(line) > 1:
            text = line.split(' ')
            label = text[6]
            character_sequence = fm.clear_text(text[7])
            fill_matrix(label, character_sequence, matrix)
        count += 1
    for key in matrix.keys():
        fileResult.write(str(key) + ', ')
        for i in range(0, len(matrix[key])):
            fileResult.write(str(matrix[key][i]) + ', ')
        fileResult.write('\n')
    return 0

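# list_of_characters() and fill_matrix() live elsewhere in the project. Given that
# the file written above has one column per character of the state alphabet and one
# count row per matrix key, a plausible reading is a square character-transition
# count matrix. The sketches below are assumptions about those helpers, not the
# original code (the label argument is ignored here).
def list_of_characters_sketch():
    alphabet = 'abcdefghiABCDEFGHIrstuvwxyzRSTUVWXYZ123456789.-+*0'
    return {c: [0] * len(alphabet) for c in alphabet}

def fill_matrix_sketch(label, character_sequence, matrix):
    alphabet = 'abcdefghiABCDEFGHIrstuvwxyzRSTUVWXYZ123456789.-+*0'
    for i in range(0, len(character_sequence) - 1):
        a, b = character_sequence[i], character_sequence[i + 1]
        if a in matrix and b in alphabet:
            matrix[a][alphabet.index(b)] += 1
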
def secuence_learning_testing(path, graph):
    lines = fm.readAllLine(path)
    fileName = 'labeling_result.txt'
    fileResult = open(path + os.sep + fileName, 'w')
    fileResult.write("State|CurrentLabel|LabelResult\n")
    count = 0  # counter used to skip the header line
    z = 0
    flag = ''
    fp = tp = fn = tn = 0
    for line in lines:
        if count != 0:
            text = line.split('|')
            state = fm.clear_text(text[1], '"', '')
            label = fm.clear_text(text[2], '"', '')
            flag = ''
            for i in range(0, len(state) - 1):
                tuple = state[i] + state[i + 1]
                z += graph[tuple] if tuple in graph else 0
                if tuple in graph and graph[tuple] > 0:
                    flag = 'bot'
            result = z  #sigmoid(z)
            if flag == '':
                fileResult.write(state + '|' + label + '|' + 'Normal\n')
                tn += 1.0 if 'Normal' == label else 0
                fn += 1.0 if 'Normal' != label else 0
            else:
                fileResult.write(state + '|' + label + '|' + 'Botnet\n')
                tp += 1.0 if 'Botnet' == label else 0
                fp += 1.0 if 'Botnet' != label else 0
        count += 1
    fileResult.close()
    total = tp + tn + fp + fn
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1 = (2 * precision * recall) / (precision + recall)
    accuracy = (tp + tn) / total
    print 'tp: ' + str(tp) + '----' + 'fp: ' + str(fp)
    print 'tn: ' + str(tn) + '----' + 'fn: ' + str(fn)
    print '-------------------------------------------'
    print 'Total: ' + str(total)
    print 'Precision: ' + str(precision)
    print 'Recall: ' + str(recall)
    print 'F1 Score: ' + str(F1)
    print 'Accuracy: ' + str(accuracy)
    print 'testing done'

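# End-to-end usage sketch for the sequence-learning functions above. The paths are
# hypothetical, and it assumes fm.readAllLine() can read the labeling_result.txt
# file that secuence_learning_testing() writes into its path:
def run_sequence_learning_sketch(train_path, test_path):
    graph = secuence_learning_trainer(train_path)   # learn +/- weights per transition
    secuence_learning_testing(test_path, graph)      # writes labeling_result.txt and prints metrics
    confution_matrix_and_metrics(test_path + os.sep + 'labeling_result.txt')
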
def document_vectors(path, len_of_words, value_document_vector):
    lines = file.readAllLine(path, 'ranking')
    list_all_words = dict()
    count = 0
    for line in lines:
        text = line.split('|')
        if count != 0 and len(text) == 4:
            connection = line.split('|')
            note = file.clear_text(connection[0])
            label = file.clear_text(connection[1])
            model = file.clear_text(connection[2])
            state = file.clear_text(connection[3])
            if len(state) > len_of_words:
                #making word vector for connections
                list_all_words[note] = data_manager.all_word_list(state, len_of_words)
                value_document_vector[note] = note + ' | ' + label + ' | ' + model + '\n'
        count += 1
    return list_all_words

def filter_data(path, word_len):
    lines = fm.readAllLine(path, 'dataset_Cx')
    fileName = 'filter_data.txt'
    fileResult = open(path + os.sep + fileName, 'w')
    fileResult.write(' Note | Label | Model Id | State |' + '\n')
    count = 0  # counter used to skip the header line of the data set
    count_selected_element = 0
    for line in lines:
        text = line.split('|')
        if count != 0 and len(line) > 1 and len(text) > 3:
            #text = line.split('|')
            description_before_clear = fm.clear_text(text[3])
            if len(description_before_clear) > 20:
                #print line
                id = fm.clear_text(text[0])[1:-1]
                id_value = fm.clear_text(text[2])
                title = fm.clear_text(text[1], "-")
                fileResult.write(id + ' | ' + title + ' | ' + id_value + '\n')
        count += 1
    fileResult.write('\n')
    fileResult.close()

def build_json_with_length_fixed(path):
    lines = fm.readAllLine(path, 'dataset_ctu13')
    fileName = 'dataset_ctu13.json'
    fileResult = open(path + os.sep + fileName, 'w')
    fileResult.write('[')
    count = 0  # counter used to skip the header line of the data set
    count_selected_element = 0
    id_list = []
    clusters = get_clusters()
    for line in lines:
        if count != 0 and len(line) > 1:
            text = line.split('|')
            description_before_clear = fm.clear_text(text[3], '"')
            id = fm.clear_text(text[0])
            if len(description_before_clear) > 0 and id not in id_list:
                #id = fm.clear_text(text[0])[1:-1]
                id_list.append(id)
                id_connection = fm.clear_text(text[2], '"')
                title = fm.clear_text(text[1], '"')
                cluster_connection = text[4]
                bot_prob = fm.clear_text(text[5])
                #all_label = remove_end_number_label(fm.clear_text(text[1]))
                # build the document from all sub-words of length 5, 10 and 15
                document = ''
                for word in all_word(description_before_clear, 5):
                    document += word + ' '
                for word in all_word(description_before_clear, 10):
                    document += word + ' '
                for word in all_word(description_before_clear, 15):
                    document += word + ' '
                if len(description_before_clear) < 5:
                    document = description_before_clear
                description = document
                #title = 'Unlabelled' if count_selected_element % 4 == 0 else create_label(title)
                data_json(fileResult, id, title, description_before_clear, id_connection, cluster_connection, bot_prob)
                #data_json(fileResult, id, title, description_before_clear, id_value, "0", all_label)
                count_selected_element += 1
        count += 1
    fileResult.write(']')
    fileResult.close()

def ranking_similar_words(path):
    lines = file.readAllLine(path, 'ranking')
    fileName = 'ranking'
    #fileResult = open(path + os.sep + fileName, 'w')
    list_all_words = list()
    ranking = dict()
    count = 0
    for line in lines:
        if count != 0 and len(line) > 1:
            status = file.clear_text(line.split('|')[3])
            if len(status) > 4:
                list_all_words += data_manager.all_word_list(status, 5)
        count += 1
    ranking = build_word_ranking(list_all_words)
    for kword in ranking.keys():
        result_line = kword + ' -> ' + str(ranking[kword]) + '\n'
        print result_line

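# build_word_ranking() is defined elsewhere and is used with two different arities in
# this file. For the single-argument call above, the printed output (word -> count)
# suggests a simple frequency count; the sketch below is an assumption about that
# helper, not the original implementation.
def build_word_ranking_sketch(list_all_words):
    ranking = dict()
    for word in list_all_words:
        ranking[word] = ranking.get(word, 0) + 1
    return ranking
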
def setting_label(lines, lines2, result_name=''):
    #lines = fm.readAllLine(path)
    fileName = 'result-' + result_name + '.pcap.tsv'
    path_result = '/home/jorge/Data/result/'  #'/media/jorge/0622F24F22F2436B/Phd/Data\ Sets/nuevos\ datasets/ctu-13-models/labelled/'
    fileResult = open(path_result + os.sep + fileName, 'w')
    id_connection_label_hash = {}
    fileResult.write('ModelId\tLabelName\tLabel\tState\n')
    count = 0
    for line in lines:
        text = line.split(',')
        if count != 0 and len(text) > 14 and text[14].find('Background') == -1:
            proto = text[2]
            ip_o = text[3]
            ip_d = text[6]
            port = text[7]
            label = text[14]
            id_connection = ip_o + '-' + ip_d + '-' + port + '-' + proto  #clean_id_connection(text[0])
            id_connection_label_hash[id_connection] = fm.clear_text(label, '\n')
            if len(text) != 15:
                print count
        count += 1
    #lines2 = fm.readAllLine('/home/jorge/Data/aux/')
    count = 0  # skip the first line
    for line in lines2:
        text = line.split('\t')
        if count != 0 and len(text) >= 3:
            #text = line.split('-')
            new_id_connection = text[0]  #+ '-' + text[1] + '-' + text[2] + '-' + text[3]
            if new_id_connection in id_connection_label_hash:
                current_label = id_connection_label_hash[new_id_connection]
                informal_label = get_informal_label(current_label)
                #text = line.split('\t')
                new_line = new_id_connection + '\t' + current_label + '\t' + informal_label + '\t' + text[1] + '\n'
                fileResult.write(new_line)
                #if not current_label.find('Background'):
                #    fileResult.write(new_line)
        count += 1
    fileResult.close()
    print 'finish file ' + result_name + ' label: ' + str(len(id_connection_label_hash.keys()))

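# Usage sketch for setting_label(): 'lines' comes from a labelled binetflow capture
# (comma separated, label in column 14) and 'lines2' from the tab-separated state
# file whose first column is the ip_o-ip_d-port-proto connection id. The directories
# and the capture name below are hypothetical.
def label_one_capture_sketch():
    lines = fm.readAllLine('/home/jorge/Data/binetflow/')
    lines2 = fm.readAllLine('/home/jorge/Data/states/')
    setting_label(lines, lines2, 'capture20110815')
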
def rebuild(path):
    lines = fm.readAllLine(path, 'result')
    fileName = 'result.js'
    fileResult = open(path + os.sep + fileName, 'w')
    fileResult.write("var consoleLog = [];\n")
    for line in lines:
        if len(line) > 0:
            clear_line = fm.clear_text(line)
            if clear_line[:11] != 'console.log':
                fileResult.write(line)
            else:
                text_id = clear_line[13:len(clear_line) - 3]
                if text_id[:16] == '_PROFILING_INFO_':
                    new_line = "if(consoleLog.indexOf('" + text_id + "') == -1){\n" + "consoleLog.push('" + text_id + "');\n" + line + "\n}\n"
                    fileResult.write(new_line)
                else:
                    fileResult.write(line)
    fileResult.close()