def confusion_matrix_and_metrics(path):
    lines = fm.readAllLine(path)
    count = 0
    fp = tp = fn = tn = 0
    for line in lines:
        if count != 0:
            text = line.split('|')
            #state = fm.clear_text(text[0], '"', '')
            current_label = fm.clear_text(text[1], '"', '')
            result_label = fm.clear_text(text[2], '"', '')
            if current_label == 'Botnet':
                tp += 1.0 if result_label == current_label else 0
                fn += 1.0 if result_label != current_label else 0
            if current_label == 'Normal':
                fp += 1.0 if result_label != current_label else 0
                tn += 1.0 if result_label == current_label else 0
        count += 1

    total = tp + tn + fp + fn
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1 = (2 * precision * recall) / (precision + recall)
    accuracy = (tp + tn) / total

    print 'tp: ' + str(tp) + '----' + 'fp: ' + str(fp)
    print 'tn: ' + str(tn) + '----' + 'fn: ' + str(fn)
    print '-------------------------------------------'
    print 'Total: ' + str(total)
    print 'Precision: ' + str(precision)
    print 'Recall: ' + str(recall)
    print 'F1 Score: ' + str(F1)
    print 'Accuracy: ' + str(accuracy)
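# The snippets in this file lean on a small file-manager module 'fm' that is
# not shown. A minimal hypothetical sketch of the two helpers they call, with
# signatures inferred from the call sites (the name-filter behaviour of
# readAllLine is an assumption):
import os

def readAllLine(path, name_filter=None):
    # Return every line of the file at 'path'; when a name filter is given and
    # 'path' is a directory, pick the first file whose name contains the filter.
    if name_filter is not None and os.path.isdir(path):
        for candidate in os.listdir(path):
            if name_filter in candidate:
                path = os.path.join(path, candidate)
                break
    with open(path) as handle:
        return handle.readlines()

def clear_text(text, token=None, replacement=''):
    # Strip surrounding whitespace and optionally replace a token such as '"'
    # or 'flow='.
    cleaned = text.strip()
    if token is not None:
        cleaned = cleaned.replace(token, replacement)
    return cleaned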
Example #2
def build_json_with_length_fixed_using_c13_full(path):
    lines = fm.readAllLine(path, 'dataset_Cx')
    fileName = 'dataset_Cx.json'
    fileResult = open(path + os.sep + fileName, 'w')
    fileResult.write('[')
    count = 0  # counter used to skip the header line of the data set
    count_selected_element = 0
    id_list = []
    clusters = get_clusters()
    for line in lines:
        if count != 0 and len(line) > 1:
            text = line.split(' ')
            description_before_clear = fm.clear_text(text[7])
            id = fm.clear_text(text[0])#[1:-1]
            if len(description_before_clear) > 20 and id not in id_list:
                #id = fm.clear_text(text[0])[1:-1]
                id_list.append(id)
                #id_connection = text[1] + '-' + text[2] + '-' + text[3] + '-' + text[4]
                id_value = text[1] + '-' + text[2] + '-' + text[3] + '-' + text[4] #fm.clear_text(text[2])
                title = text[6]#fm.clear_text(text[1], "-")
                # build the document out of fixed-length words
                document = ''
                for word in all_word(description_before_clear, 5):
                    document += word + ' '
                for word in all_word(description_before_clear, 10):
                    document += word + ' '
                for word in all_word(description_before_clear, 15):
                    document += word + ' '
                description = document
                title = create_label(title)#'Unlabelled' if count_selected_element % 4 == 0 else create_label(title)
                data_json(fileResult, id, title, description, id_value, clusters[count_selected_element])
                count_selected_element += 1
        count += 1
    fileResult.write(']')
    fileResult.close()
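# Hypothetical sketch of the all_word helper used above; the calls with lengths
# 5, 10 and 15 suggest it yields every consecutive fixed-length "word" of the
# state string (the exact splitting rule is an assumption).
def all_word(state, word_len):
    for i in range(0, len(state) - word_len + 1):
        yield state[i:i + word_len]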
Example #3
def build_json(path,word_len):
    lines = fm.readAllLine(path, 'dataset_Cx')
    fileName = 'dataset_Cx.json'
    fileResult = open(path+os.sep+fileName, 'w')
    fileResult.write('[')
    count = 0  # counter used to skip the header line of the data set
    count_selected_element = 0
    for line in lines:
        if count != 0 and len(line) > 1:
            text = line.split('|')
            description_before_clear = fm.clear_text(text[3])
            if len(description_before_clear) > 20:
                id = fm.clear_text(text[0])[1:-1]
                id_value = fm.clear_text(text[2])
                title = fm.clear_text(text[1], "-")
                # build the document out of fixed-length words
                document = ''
                for word in all_word(description_before_clear, int(word_len)):
                    document += word + ' '
                description = document
                title = 'Unlabelled' if count_selected_element % 4 == 0 else create_label(title)
                data_json(fileResult, id, title, description, id_value)
                count_selected_element += 1
        count += 1
    fileResult.write(']')
    fileResult.close()
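# Hypothetical sketch of the data_json writer used by the build_json_*
# functions; the field names are assumptions, and the optional arguments cover
# the six- and seven-argument calls elsewhere in this file. Note the callers
# write '[' ... ']' around the records, so the trailing comma after the last
# record would need trimming for strict JSON.
import json

def data_json(fileResult, id, title, description, id_value,
              cluster=None, bot_prob=None):
    record = {'id': id, 'label': title, 'text': description,
              'connection': id_value}
    if cluster is not None:
        record['cluster'] = cluster
    if bot_prob is not None:
        record['bot_prob'] = bot_prob
    fileResult.write(json.dumps(record) + ',')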
Example #4
def ranking_similar_words(path,len_of_words):
    lines = file.readAllLine(path, 'ranking')
    fileName = 'ranking'
    #fileResult = open(path+os.sep+fileName, 'w')
    list_all_words = dict()
    result = dict()
    aux_dict = dict()
    count = 0
    for line in lines:
        text = line.split('|')
        if count != 0 and len(text) == 4:
            status = file.clear_text(text[3])
            note = file.clear_text(text[0])
            if len(status) > len_of_words:
                # build the word vector for this connection
                list_all_words[note] = data_manager.all_word_list(status, len_of_words)
        count += 1
    for k in list_all_words.keys():
        build_word_ranking(result, list_all_words[k], k, aux_dict)

    # printing result
    d_view = [(v,k) for k,v in aux_dict.iteritems()]
    d_view.sort(reverse=True)
    for v,k in d_view:
        count = result[k][0]
        if count > 1:
            meta_data = ''
            dictMetaData = result[k][1]
            for k2 in dictMetaData.keys():
                meta_data += k2 + '(' + str(dictMetaData[k2])+'),'
            print k + ' |=> ' + str(count) + ' - ' + meta_data
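# Hypothetical sketch of the four-argument build_word_ranking used above,
# inferred from how the results are printed: result[word][0] is the total
# occurrence count, result[word][1] maps each connection note to its own
# count, and aux_dict mirrors the totals for sorting.
def build_word_ranking(result, words, note, aux_dict):
    for word in words:
        if word not in result:
            result[word] = [0, {}]
        result[word][0] += 1
        result[word][1][note] = result[word][1].get(note, 0) + 1
        aux_dict[word] = result[word][0]
    return result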
def sequence_learning_trainer(path):
    lines = fm.readAllLine(path)
    count = 0  # counter used to skip the header line
    graph = create_all_two_posible_subsecuence()
    for line in lines:
        if count != 0:
            text = line.split('|')
            state = fm.clear_text(text[1], '"', '')
            label = fm.clear_text(text[2], '"', '')
            for i in range(0, len(state) - 1):
                pair = state[i] + state[i + 1]  # two-character subsequence
                if pair in graph:
                    graph[pair] += 1 if label == 'Botnet' else -1
        count += 1
    return graph
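# Hypothetical sketch of create_all_two_posible_subsecuence, assuming the state
# alphabet is the one listed in build_graph's CSV header below: one zero-weight
# entry per ordered pair of state characters.
STATE_ALPHABET = 'abcdefghiABCDEFGHIrstuvwxyzRSTUVWXYZ123456789.-+*0'

def create_all_two_posible_subsecuence():
    graph = {}
    for first in STATE_ALPHABET:
        for second in STATE_ALPHABET:
            graph[first + second] = 0
    return graph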
def join_label_form_ips_and_port(path):
    lines = fm.readAllLine(path)  #, 'result')
    fileName = 'capture20110817.binetflow-result.labels'
    fileResult = open(path + os.sep + fileName, 'w')
    id_connection_hash = {}
    id_connection_labels_hash = {}
    count = 0
    count2 = 0

    for line in lines:
        count += 1
        text = line.split(',')
        id_connection = text[1] + ',' + text[2] + ',' + text[3] + ',' + text[0]
        # find_label returns 1 for a normal flow and -1 for a botnet flow, so the
        # sum stays positive when most flows for this id_connection are normal.
        num_label = find_label(text[4])
        id_connection_hash[id_connection] = id_connection_hash.get(id_connection, 0) + num_label
        label = fm.clear_text(text[4], 'flow=')
        id_connection_labels_hash[id_connection] = id_connection_labels_hash.get(id_connection, '') + label + ' '

    for connection in id_connection_hash.keys():
        count2 += 1
        if id_connection_hash[connection] > 0:
            label = find_normal_label(id_connection_labels_hash[connection])
        else:
            label = find_botnet_label(id_connection_labels_hash[connection])
        fileResult.write(connection + ',' + label + '\n')
        if id_connection_hash[connection] != 1 and id_connection_hash[connection] != -1:
            print connection

    fileResult.close()
    print 'done: ' + str(count) + ' ' + str(count2)
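# Hypothetical sketches of the label helpers used above, based on the comment
# that find_label returns 1 for normal and -1 for botnet flows; the substring
# test on 'Botnet' is an assumption.
def find_label(raw_label):
    return -1 if 'Botnet' in raw_label else 1

def find_normal_label(labels):
    # Pick the first non-botnet label from the accumulated, space-separated list.
    for label in labels.split():
        if 'Botnet' not in label:
            return label
    return labels.strip()

def find_botnet_label(labels):
    for label in labels.split():
        if 'Botnet' in label:
            return label
    return labels.strip()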
Example #7
def build_graph(path):
    lines = fm.readAllLine(path, 'graph_result')
    fileName = 'graph_result.txt'
    fileResult = open(path + os.sep + fileName, 'w')
    fileResult.write(
        'a,b,c,d,e,f,g,h,i,A,B,C,D,E,F,G,H,I,r,s,t,u,v,w,x,y,z,R,S,T,U,V,W,X,Y,Z,1,2,3,4,5,6,7,8,9,.,-,+,*,0'
        + '\n')
    count = 0  # counter used to skip the header line of the data set
    count_selected_element = 0
    matrix = list_of_characters()
    for line in lines:
        if count != 0 and len(line) > 1:
            text = line.split(' ')
            label = text[6]
            character_sequence = fm.clear_text(text[7])
            fill_matrix(label, character_sequence, matrix)
        count += 1

    for key in matrix.keys():
        fileResult.write(str(key) + ', ')
        for i in range(0, len(matrix[key])):
            fileResult.write(str(matrix[key][i]) + ', ')
        fileResult.write('\n')

    return 0
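# Hypothetical sketches of the two helpers used by build_graph; neither is
# shown in the source. The output is a character-by-character matrix whose
# columns are the characters in the CSV header above, so the assumption is
# that fill_matrix counts two-character transitions, signed by the label as in
# sequence_learning_trainer. STATE_ALPHABET is the same constant as in the
# sketch after sequence_learning_trainer, repeated so this stands alone.
STATE_ALPHABET = 'abcdefghiABCDEFGHIrstuvwxyzRSTUVWXYZ123456789.-+*0'

def list_of_characters():
    # One row of zero counts per character, one column per character.
    return {c: [0] * len(STATE_ALPHABET) for c in STATE_ALPHABET}

def fill_matrix(label, character_sequence, matrix):
    delta = 1 if 'Botnet' in label else -1
    for i in range(0, len(character_sequence) - 1):
        row, col = character_sequence[i], character_sequence[i + 1]
        if row in matrix and col in STATE_ALPHABET:
            matrix[row][STATE_ALPHABET.index(col)] += delta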
def sequence_learning_testing(path, graph):
    lines = fm.readAllLine(path)
    fileName = 'labeling_result.txt'
    fileResult = open(path + os.sep + fileName, 'w')
    fileResult.write("State|CurrentLabel|LabelResult\n")
    count = 0  # counter used to skip the header line
    flag = ''
    fp = tp = fn = tn = 0
    for line in lines:
        if count != 0:
            text = line.split('|')
            state = fm.clear_text(text[1], '"', '')
            label = fm.clear_text(text[2], '"', '')
            flag = ''
            z = 0  # per-connection score, reset for every line
            for i in range(0, len(state) - 1):
                pair = state[i] + state[i + 1]
                z += graph[pair] if pair in graph else 0
                if pair in graph and graph[pair] > 0:
                    flag = 'bot'
            result = z  #sigmoid(z)
            if flag == '':
                fileResult.write(state + '|' + label + '|' + 'Normal\n')
                tn += 1.0 if 'Normal' == label else 0
                fn += 1.0 if 'Normal' != label else 0
            else:
                fileResult.write(state + '|' + label + '|' + 'Botnet\n')
                tp += 1.0 if 'Botnet' == label else 0
                fp += 1.0 if 'Botnet' != label else 0
        count += 1

    total = tp + tn + fp + fn
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1 = (2 * precision * recall) / (precision + recall)
    accuracy = (tp + tn) / total

    print 'tp: ' + str(tp) + '----' + 'fp: ' + str(fp)
    print 'tn: ' + str(tn) + '----' + 'fn: ' + str(fn)
    print '-------------------------------------------'
    print 'Total: ' + str(total)
    print 'Precision: ' + str(precision)
    print 'Recall: ' + str(recall)
    print 'F1 Score: ' + str(F1)
    print 'Accuracy: ' + str(accuracy)
    print 'testing done'
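# Hypothetical end-to-end usage of the two functions above; the capture
# directories are placeholders.
graph = sequence_learning_trainer('/path/to/training-capture')
sequence_learning_testing('/path/to/testing-capture', graph)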
def document_vectors(path, len_of_words, value_document_vector):
    lines = file.readAllLine(path, 'ranking')
    list_all_words = dict()
    count = 0
    for line in lines:
        text = line.split('|')
        if count != 0 and len(text) == 4:
            note = file.clear_text(text[0])
            label = file.clear_text(text[1])
            model = file.clear_text(text[2])
            state = file.clear_text(text[3])
            if len(state) > len_of_words:
                # build the word vector for this connection
                list_all_words[note] = data_manager.all_word_list(state, len_of_words)
                value_document_vector[note] = note + ' | ' + label + ' | ' + model + '\n'
        count += 1
    return list_all_words
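# Hypothetical sketch of data_manager.all_word_list, assumed to be the list
# counterpart of the all_word generator sketched earlier: every consecutive
# substring of length word_len.
def all_word_list(state, word_len):
    return [state[i:i + word_len] for i in range(0, len(state) - word_len + 1)]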
Example #10
def filter_data(path, word_len):
    lines = fm.readAllLine(path, 'dataset_Cx')
    fileName = 'filter_data.txt'
    fileResult = open(path + os.sep + fileName, 'w')
    fileResult.write(' Note | Label | Model Id | State |' + '\n')
    count = 0  # counter used to skip the header line of the data set
    count_selected_element = 0
    for line in lines:
        text = line.split('|')
        if count != 0 and len(line) > 1 and len(text) > 3:
            #text = line.split('|')
            description_before_clear = fm.clear_text(text[3])
            if len(description_before_clear) > 20:
                #print line
                id = fm.clear_text(text[0])[1:-1]
                id_value = fm.clear_text(text[2])
                title = fm.clear_text(text[1], "-")
                fileResult.write(id + ' | ' + title + ' | ' + id_value + '\n')
        count += 1
    fileResult.write('\n')
    fileResult.close()
Example #11
def build_json_with_length_fixed(path):
    lines = fm.readAllLine(path, 'dataset_ctu13')
    fileName = 'dataset_ctu13.json'
    fileResult = open(path+os.sep+fileName, 'w')
    fileResult.write('[')
    count = 0  # counter used to skip the header line of the data set
    count_selected_element = 0
    id_list = []
    clusters = get_clusters()
    for line in lines:
        if count != 0 and len(line) > 1:
            text = line.split('|')
            description_before_clear = fm.clear_text(text[3],'"')
            id = fm.clear_text(text[0])
            if len(description_before_clear) > 0 and id not in id_list:
                #id = fm.clear_text(text[0])[1:-1]
                id_list.append(id)
                id_connection = fm.clear_text(text[2],'"')
                title = fm.clear_text(text[1],'"')
                cluster_connection = text[4]
                bot_prob = fm.clear_text(text[5])
                #all_label = remove_end_number_label(fm.clear_text(text[1]))
                # build the document out of fixed-length words
                document = ''
                for word in all_word(description_before_clear, 5):
                    document += word + ' '
                for word in all_word(description_before_clear, 10):
                    document += word + ' '
                for word in all_word(description_before_clear, 15):
                    document += word + ' '
                if len(description_before_clear) < 5:
                    document = description_before_clear
                description = document
                #title = 'Unlabelled' if count_selected_element % 4 == 0 else create_label(title)
                data_json(fileResult, id, title, description_before_clear, id_connection,cluster_connection,bot_prob)
                #data_json(fileResult, id, title, description_before_clear, id_value,"0",all_label)
                count_selected_element += 1
        count += 1
    fileResult.write(']')
    fileResult.close()
Example #12
def ranking_similar_words(path):
    lines = file.readAllLine(path, 'ranking')
    fileName = 'ranking'
    #fileResult = open(path+os.sep+fileName, 'w')
    list_all_words = list()
    ranking = dict()
    count = 0
    for line in lines:
        if count != 0 and len(line) > 1:
            status = file.clear_text(line.split('|')[3])
            if len(status) > 4:
                list_all_words += data_manager.all_word_list(status, 5)
        count += 1

    ranking = build_word_ranking(list_all_words)
    for kword in ranking.keys():
        result_line = kword + ' -> ' + str(ranking[kword]) + '\n'
        print result_line
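# Hypothetical sketch of the single-argument build_word_ranking used here;
# the usage suggests a plain word -> occurrence-count dictionary.
from collections import Counter

def build_word_ranking(list_all_words):
    return dict(Counter(list_all_words))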
def setting_label(lines, lines2, result_name=''):
    #lines = fm.readAllLine(path)
    fileName = 'result-' + result_name + '.pcap.tsv'
    path_result = '/home/jorge/Data/result/'  #'/media/jorge/0622F24F22F2436B/Phd/Data\ Sets/nuevos\ datasets/ctu-13-models/labelled/'
    fileResult = open(path_result + os.sep + fileName, 'w')
    id_connection_label_hash = {}
    fileResult.write('ModelId\tLabelName\tLabel\tState\n')
    count = 0
    for line in lines:
        text = line.split(',')
        if count != 0 and len(text) > 14 and text[14].find('Background') == -1:
            proto = text[2]
            ip_o = text[3]
            ip_d = text[6]
            port = text[7]
            label = text[14]
            id_connection = ip_o + '-' + ip_d + '-' + port + '-' + proto  #clean_id_connection(text[0])
            id_connection_label_hash[id_connection] = fm.clear_text(label, '\n')
        if len(text) != 15:
            print count
        count += 1

    #lines2 = fm.readAllLine('/home/jorge/Data/aux/')
    count = 0  # skip the first (header) line
    for line in lines2:
        text = line.split('\t')
        if count != 0 and len(text) >= 3:
            #text = line.split('-')
            new_id_connection = text[0]  #+ '-' + text[1] + '-' + text[2] + '-' + text[3]
            if new_id_connection in id_connection_label_hash:
                current_label = id_connection_label_hash[new_id_connection]
                informal_label = get_informal_label(current_label)
                #text = line.split('\t')
                new_line = new_id_connection + '\t' + current_label + '\t' + informal_label + '\t' + text[1] + '\n'
                fileResult.write(new_line)
                #if not current_label.find('Background'):
                #    fileResult.write(new_line)
        count += 1

    fileResult.close()
    print 'finish file ' + result_name + ' label: ' + str(len(id_connection_label_hash.keys()))
Example #14
def rebuild(path):
    lines = fm.readAllLine(path, 'result')
    fileName = 'result.js'
    fileResult = open(path+os.sep+fileName, 'w')
    fileResult.write("var consoleLog = [];\n")
    for line in lines:
        if len(line) > 0:
            clear_line = fm.clear_text(line)
            if clear_line[:11] != 'console.log':
                fileResult.write(line)
            else:
                # extract the string literal inside console.log('...')
                text_id = clear_line[13:len(clear_line)-3]
                if text_id[:16] == '_PROFILING_INFO_':
                    # wrap the call so each profiling id is logged at most once
                    new_line = "if(consoleLog.indexOf('" + text_id + "') == -1){\n" + "consoleLog.push('" + text_id + "');\n" + line + "\n}\n"
                    fileResult.write(new_line)
                else:
                    fileResult.write(line)

    fileResult.close()
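# Illustration of the transformation rebuild applies; the profiling id below
# is made up. An input line such as
#     console.log('_PROFILING_INFO_enter_parser');
# is rewritten as a guarded block so each profiling message is logged only once:
#     if(consoleLog.indexOf('_PROFILING_INFO_enter_parser') == -1){
#     consoleLog.push('_PROFILING_INFO_enter_parser');
#     console.log('_PROFILING_INFO_enter_parser');
#     }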