class SAFE: def __init__(self, model): self.converter = InstructionsConverter("data/i2v/word2id.json") self.normalizer = FunctionNormalizer(max_instruction=150) self.embedder = SAFEEmbedder(model) self.embedder.loadmodel() self.embedder.get_tensor() def embedd_function(self, filename, address): analyzer = RadareFunctionAnalyzer(filename, use_symbol=False, depth=0) functions = analyzer.analyze() instructions_list = None for function in functions: if functions[function]['address'] == address: instructions_list = functions[function][ 'filtered_instructions'] break if instructions_list is None: print("Function not found") return None converted_instructions = self.converter.convert_to_ids( instructions_list) instructions, length = self.normalizer.normalize_functions( [converted_instructions]) embedding = self.embedder.embedd(instructions, length) return embedding
class FunctionsEmbedder: def __init__(self, model, batch_size, max_instruction): self.batch_size = batch_size self.normalizer = FunctionNormalizer(max_instruction) self.safe = SAFEEmbedder(model) self.safe.loadmodel() self.safe.get_tensor() def compute_embeddings(self, functions): functions, lenghts = self.normalizer.normalize_functions(functions) embeddings = self.safe.embedd(functions, lenghts) return embeddings @staticmethod def create_table(db_name, table_name): conn = sqlite3.connect(db_name) c = conn.cursor() c.execute( "CREATE TABLE IF NOT EXISTS {} (id INTEGER PRIMARY KEY, {} TEXT)". format(table_name, table_name)) conn.commit() conn.close() def compute_and_save_embeddings_from_db(self, db_name, table_name): FunctionsEmbedder.create_table(db_name, table_name) conn = sqlite3.connect(db_name) cur = conn.cursor() q = cur.execute( "SELECT id FROM functions WHERE id not in (SELECT id from {})". format(table_name)) ids = q.fetchall() for i in tqdm(range(0, len(ids), self.batch_size)): functions = [] batch_ids = ids[i:i + self.batch_size] for my_id in batch_ids: q = cur.execute( "SELECT instructions_list FROM filtered_functions where id=?", my_id) functions.append(json.loads(q.fetchone()[0])) embeddings = self.compute_embeddings(functions) for l, id in enumerate(batch_ids): cur.execute("INSERT INTO {} VALUES (?,?)".format(table_name), (id[0], np.array2string(embeddings[l]))) conn.commit()