示例#1
0
import numpy as np
import math
#https://towardsdatascience.com/different-techniques-to-represent-words-as-vectors-word-embeddings-3e4b9ab7ceb4
""" as mentioned in the above article, every instruction is a sentence in this study. 
I am going to use the whole instruction without addresses (i.e. dropping the addresses in instructions, such as 0x840000).
e.g.
['push', 'ebx'] -> "push ebx" will be used as a sentence, while
['push', '0x402058'] -> "push 0x402058" won't be used as a sentence; "push addr" will be used instead.

All addresses matching the pattern r'(0x[0-9a-fA-F]+)(?:)?' will be replaced with addr.
DWORD PTR, QWORD PTR and BYTE PTR will be replaced with APTR.

"""
# Path of the input file handed to parse() below.
file = './a.exe'
# Syntax name forwarded to parse(); presumably selects Intel-style assembly
# output (the sample instructions below look Intel-style) — confirm against parse().
syntax = "intel"
# parse() is not defined in this part of the file; its five return values are
# unpacked here. NOTE(review): the meaning of the third argument (None) is not
# visible from here — confirm against parse().
shellcode, code, opcodes, operands, instructions = parse(file, syntax, None)
"""
Sample Data
--------------------instructions--------------------

[
'push ebp','push ebx','push edi','push esi','sub esp addr','mov eax APTR [esp+addr]','mov ecx APTR ds:addr','mov esi APTR [esp+addr]','mov ebx APTR [esp+addr]','cmp eax addr','mov APTR [esp+addr] ecx','je addr','mov edi APTR [esp+addr]','cmp eax addr','je addr','test eax eax','jne addr','mov esi esp','push addr','push esi','push edi','call addr','add esp addr','push edi','call addr','add esp addr','test eax eax','je addr','push APTR [esp+addr]','push esi','push addr','jmp addr','mov ebp APTR [esp+addr]','push esi','push addr','call addr','add esp addr','mov esi eax','push esi','call addr','add esp addr','mov edi eax','cmp edi addr','jg addr','push addr','push addr','push addr','call addr','add esp addr','push addr','push ebp','call addr','add esp addr','xor ecx ecx','cmp APTR [esp+addr] addr','setne cl','cmp eax addr','cmovne ecx eax','test ecx ecx','jne addr','push edi','push esi','push addr','push ebx','call addr','add esp addr','mov APTR [esi+edi*1-addr] addr','jmp addr','mov ebp esp','push addr','push ebp','push edi','call addr','add esp addr','push esi','push ebp','push addr','call addr','add esp addr','jmp addr','push esi','push addr','call addr','add esp addr','mov esi eax','test esi esi','je addr','push esi','push ebx','call addr','add esp addr','push esi','call addr','add esp addr','mov ecx APTR [esp+addr]','call addr','add esp addr','pop esi','pop edi','pop ebx','pop ebp','ret','push ebp','push ebx','push edi','push esi','push eax','mov edi APTR [esp+addr]','xor ebp ebp','test edi edi','jle addr','mov ecx APTR [esp+addr]','mov eax edi','sub eax ebp','lea esi [ecx+ebp*1]','push eax','push addr','push esi','call addr','add esp addr',
....................

"""
"""
Step 1: Identify the unique words in the complete text data. In our case, the words are the unique opcodes and the unique operands, listed below:
--------------------unique_opcodes--------------------

'push','sub','mov','cmp','je','test','jne','call','add','jmp','jg','xor','setne','cmovne','pop','ret','jle','lea','inc','jl','int3','dec','js','or','movsx','jae','ja','sbb','shl','jge','setg','movzx','bswap','cmovle','shr','not','and','sete','jns','neg','cvtsi2sd','divsd','movsd','fstp','mulsd','cvttsd2si','sar','cmovg','cmovl','jb','cmove','movaps','movups','seta','cmovb','imul','cmovs','jbe','rep','bt','div','cmovbe','mul','cmovge','cmovns','setb','cmova','adc','cdq','idiv','movd','pshufd','movdqu','punpckldq','punpcklqdq','cmovae','pxor','pinsrw','punpcklbw','punpcklwd','pslld','por','setle','shld','setae','setbe','movq','rol','paddd','movdqa','psrld','shufps','andps','pshuflw','psrlq','pand','pshufhw','packuswb','pandn','xorps','seto','lahf','sahf','ror','psrad','psrlw','punpckhbw','punpckhwd','punpckhdq','shrd','setns','setl','cwde','setge','pcmpeqd','cvtss2sd','addsd','sets','packssdw','paddw','pcmpeqb','pmovmskb','bnd','xchg','int','data16','pushf','fnclex','lock','movs','cpuid','xgetbv','fwait','cs','nop','popf','vfrczpd','palignr','std','cld','in','stos','movlhps','bsf','bsr','leave','pcmpistri','repnz','pinsrb','psrldq','jno','arpl','les','cmps','scas','lds','jecxz','(bad)','cbw','hlt','cmc','fldz','clc','icebp','bts','fld','vxorps','vpcmpeqb','vpmovmskb','vzeroupper','vpcmpeqw','pcmpeqw','rcr','fnstsw','stmxcsr','ldmxcsr','fstcw','fldcw','stc','fst','fnstcw','fnstenv','fldenv','fistp','fstsw','fld1','fdivp','fldpi','fldlg2','fxch','fyl2x','fild','fsubrp','movapd','andpd','psubd','psllq','ucomisd','jnp','cmpnlepd','movlpd','unpcklpd','pextrw','orpd','addpd','subpd','mulpd','unpckhpd','cmpeqsd','xorpd','fldl2e','fmulp','faddp','fscale','fchs','fabs','fldln2','ftst','fcompp','frndint','fsub','f2xm1','fcomp','fmul','fsave','frstor','fxam','xlat','fadd','fucom','jp','fucompp','fcom',
--------------------unique_opcodes_length--------------------
示例#2
0
# Command-line driver: run parse() (defined elsewhere) on every entry of the
# source directory and pickle the returned `instructions` value into the
# destination directory as "<entry name>.asm".
#
# Options:
#   -d / --destination  directory that receives the pickled output files
#   -s / --source       directory whose entries are parsed
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--destination')
parser.add_argument('-s', '--source')
args = parser.parse_args()

# Both directories are mandatory. Exit with a non-zero status when one is
# missing so that calling shells/scripts can detect the failure (status 0
# would signal success).
if args.destination:
    destination = args.destination
else:
    print('need a destination directory parameter')
    sys.exit(1)
if args.source:
    source = args.source
else:
    print('need a source directory parameter')
    sys.exit(1)

# The syntax name passed to parse() is the same for every file.
syntax = "intel"

for file in os.listdir(source):
    # Best-effort batch: a failure on one entry is reported and the loop
    # moves on to the next entry instead of aborting the whole run.
    try:
        print('file process started > {}'.format(file))
        shellcode, code, opcodes, operands, instructions = parse(os.path.join(source, file), syntax, None)

        sentences = instructions

        # Join directory and file name as separate components so the output
        # lands inside `destination` whether or not it ends with a separator.
        with open(os.path.join(destination, file + '.asm'), 'wb') as sentences_file:
            pickle.dump(sentences, sentences_file)
        print('file processed > {}'.format(file))
    except Exception as e:
        print('file couldnt be processed > {} {} '.format(file, str(e)))