예제 #1
0
    def save_feature(self, file_name, save_file_name):
        """Write per-opcode occurrence counts for four optimization levels to a CSV file.

        For each of the levels O0..O3 the opcode sequence stored under the name
        ``file_name + '_MinGW_' + level`` is fetched through ``DatabaseHandler``.
        The CSV written to ``self.csv_save_dir_name + os.sep + save_file_name``
        has this layout:

        * header row: ``file_name, O0, O1, O2, O3``
        * one row per opcode returned by ``extract_opcode_variety()``: the
          opcode followed by how often it occurs in each of the four sequences
        * a final ``Sum`` row with the length of each sequence

        Args:
            file_name: Base name of the analysed file; the ``_MinGW_O<n>``
                suffixes are appended to it for the database lookups.
            save_file_name: Name of the CSV file created inside
                ``self.csv_save_dir_name``.

        Returns:
            None. The result is the file written to disk.
        """
        # Local imports keep this method self-contained; the module-level
        # import block is not visible from here.
        import sys
        from collections import Counter

        optimization_levels = ['O0', 'O1', 'O2', 'O3']
        db_handler = DatabaseHandler()

        opcode_variety = db_handler.extract_opcode_variety()
        opcode_sequences = [
            db_handler.extract_opcode_sequence(file_name=file_name + '_MinGW_' + level)
            for level in optimization_levels
        ]

        # Tally every sequence once instead of calling list.count() per opcode,
        # which rescanned each full sequence for every opcode in the variety.
        # NOTE(review): assumes opcodes are hashable (strings) - confirm against
        # DatabaseHandler.extract_opcode_sequence.
        opcode_counts = [Counter(sequence) for sequence in opcode_sequences]

        csv_path = self.csv_save_dir_name + os.sep + save_file_name
        # The csv module wants a binary file on Python 2 but a text file opened
        # with newline='' on Python 3; 'wb' on Python 3 makes writerow() fail.
        if sys.version_info[0] >= 3:
            csv_file = open(csv_path, 'w', newline='')
        else:
            csv_file = open(csv_path, 'wb')

        with csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([file_name] + optimization_levels)
            for opcode in opcode_variety:
                # A Counter yields 0 for opcodes absent from a sequence,
                # matching what list.count() returned.
                writer.writerow([opcode] + [counts[opcode] for counts in opcode_counts])
            writer.writerow(['Sum'] + [len(sequence) for sequence in opcode_sequences])
예제 #2
0
 def extract_feature_vector(self, file_id, extraction_method):
     """Extract the feature vector of one file using the requested method.

     Args:
         file_id: Identifier handed unchanged to the ``DatabaseHandler``
             ``extract_*_sequence`` queries.
         extraction_method: One of ``'bag-of-opcodes'``, ``'2-gram'``,
             ``'3-gram'`` or ``'proposed'``.

     Returns:
         The value produced by ``self.extract_bag_of_opcodes`` (for
         ``'bag-of-opcodes'``) or ``self.extract_ngram`` with n = 2 or 3
         (for ``'2-gram'`` / ``'3-gram'``).

     Raises:
         NotImplementedError: For ``'proposed'``, whose feature vector is
             not constructed yet.
         SystemExit: With status 1 when ``extraction_method`` is not one of
             the names above; an error message is written to stderr first.
     """
     db_handler = DatabaseHandler()

     # Load the opcode variety on first use only.
     if self.opcode_variety_ is None:
         self.set_opcode_variety_from_database()

     opcode_sequence = db_handler.extract_opcode_sequence(file_id)
     if extraction_method == 'bag-of-opcodes':
         feature_vector = self.extract_bag_of_opcodes(opcode_sequence)
     elif extraction_method == '2-gram':
         feature_vector = self.extract_ngram(opcode_sequence, 2)
     elif extraction_method == '3-gram':
         feature_vector = self.extract_ngram(opcode_sequence, 3)
     elif extraction_method == 'proposed':
         # Inputs for the pending 'proposed' feature vector; kept so the
         # remaining work is only the combination step.
         subroutine_sequence = db_handler.extract_subroutine_sequence(file_id)
         average_subroutine_length = self.extract_average_subroutine_length(subroutine_sequence)
         location_sequence = db_handler.extract_location_sequence(file_id)
         average_basicblock_length = self.extract_average_basicblock_length(location_sequence)
         # TODO: construct feature_vector here.
         # Until then fail explicitly: falling through to the return below
         # would raise an opaque UnboundLocalError because feature_vector
         # is never assigned in this branch.
         raise NotImplementedError(
             'extraction method "proposed" does not build a feature vector yet')
     else:
         sys.stderr.write('Error: no extraction method "' + extraction_method + '" found.\n')
         # A bare sys.exit() reports success (status 0); signal the failure.
         sys.exit(1)

     return feature_vector
예제 #3
0
        opcode_sequence = db_handler.extract_opcode_sequence(file_id)
        if extraction_method == 'bag-of-opcodes':
            feature_vector = self.extract_bag_of_opcodes(opcode_sequence)
        elif extraction_method == '2-gram':
            feature_vector = self.extract_ngram(opcode_sequence, 2)
        elif extraction_method == '3-gram':
            feature_vector = self.extract_ngram(opcode_sequence, 3)
        elif extraction_method == 'proposed':
            subroutine_sequence = db_handler.extract_subroutine_sequence(file_id)
            average_subroutine_length = self.extract_average_subroutine_length(subroutine_sequence)
            location_sequence = db_handler.extract_location_sequence(file_id)
            average_basicblock_length = self.extract_average_basicblock_length(location_sequence)
            # construct feature_vector here
        else:
            sys.stderr.write('Error: no extraction method "' + extraction_method + '" found.')
            sys.exit()
        
        return feature_vector

if __name__ == '__main__':
    # Exploratory script: print the opcode trigrams of one stored file and
    # plot which opcode pairs most often follow a `cmp`.
    db_handler = DatabaseHandler()
    # NOTE(review): 500 looks like a hard-coded file id - confirm.
    opcode_sequence = db_handler.extract_opcode_sequence(500)
    # Bigram variant of the same exploration, kept for reference:
    # bigrams = nltk.bigrams(opcode_sequence)
    # fd = nltk.FreqDist(bigrams)
    # cfd = nltk.ConditionalFreqDist(bigrams)
    # cfd[u'cmp'].plot(50)

    # Materialise the trigrams once: on NLTK versions where nltk.trigrams
    # returns a one-shot iterator, printing it would leave nothing for the
    # frequency distribution built below.
    trigrams = list(nltk.trigrams(opcode_sequence))
    print(trigrams)
    # fd = nltk.FreqDist(trigrams)
    # ConditionalFreqDist consumes (condition, sample) pairs, so raw 3-tuples
    # cannot be unpacked by it. Condition on the first opcode and count the
    # two opcodes that follow it.
    cfd = nltk.ConditionalFreqDist(
        (first, (second, third)) for first, second, third in trigrams
    )
    cfd[u'cmp'].plot(50)