def filter_txs_by_block(self, days):
    """Keep only transactions mined within `days` days of each contract's creation.

    Rebuilds self.tr_dico from self.tr_dico_all, keeping a tx when its
    blockNumber is less than NUM_BLOCK_PER_DAY * days blocks past the
    contract's creation block.

    Layout: tr_dico[p=0, np=1][contract][nml=0, int=1] = list of tx dicts,
    each with a 'blockNumber' key.
    """
    NUM_BLOCK_PER_DAY = 6 * 60 * 24  # assumes ~10 s per block — TODO confirm
    max_offset = NUM_BLOCK_PER_DAY * days
    tr_dico = [[[[], []] for _ in range(len(self.op[0]))],
               [[[], []] for _ in range(len(self.op[1]))]]
    count_txs = 0
    for np_index in range(2):
        for contract_index in range(len(self.tr_dico_all[np_index])):
            # Hoisted out of the per-tx loop: both lookups are invariant
            # for a given contract (the original re-read them per tx).
            contract_addr = self.op[np_index][contract_index]
            creation_block = self.contract_creation_block[contract_addr]
            for nml_index in range(2):
                data = [
                    tx for tx in
                    self.tr_dico_all[np_index][contract_index][nml_index]
                    if int(tx['blockNumber']) - creation_block < max_offset
                ]
                count_txs += len(data)
                tr_dico[np_index][contract_index][nml_index] = data
    self.tr_dico = tr_dico
    tl.compute_time(self.cur_time)
    print(f"days={days}, num_txs={count_txs}")
def create_pandas_dataframe(self, ft, ft_names):
    """Build a DataFrame from the feature matrix `ft`.

    Column names get an '@NUMERIC' suffix (ARFF convention); the first
    column carries the class label and a 'size_info@NUMERIC' column is
    appended from self.size_info.
    """
    print("Creating pandas dataframe...")
    header = ['{}@NUMERIC'.format(name) for name in ft_names]
    header[0] = "ponzi@{ponzi,non_ponzi}"
    frame = pd.DataFrame(data=ft, columns=header)
    frame['size_info@NUMERIC'] = self.size_info
    tl.compute_time(self.cur_time)
    return frame
def load_op(self):
    """Load contract address lists, the opcode vocabulary, bytecode file
    sizes and the precomputed opcode-frequency table.

    self.op[p=0, np=1] lists contract addresses taken from the .json file
    stems in the op-count directories.
    """
    print("Loading op, opcodes, op_freq, size_info...")

    def addresses(dir_key):
        return [fname.split('.json')[0]
                for fname in os.listdir(self.paths[dir_key])
                if fname.endswith('.json')]

    self.op = [addresses('database_op'), addresses('database_op_np')]
    self.opcodes = [
        'SWAP8', 'DUP11', 'DUP14', 'SWAP10', 'DUP15', 'LOG2', 'INVALID',
        'SWAP9', 'SWAP5', 'SWAP12', 'SWAP16', 'DUP9', 'LOG1', 'DUP12',
        'SWAP11', 'SWAP2', 'MSTORE8', 'SWAP14', 'DUP13', 'POP', 'DUP1',
        'DUP8', 'DUP7', 'DUP3', 'DUP4', 'MSTORE', 'SWAP3', 'CODECOPY',
        'JUMP', 'DUP5', 'SWAP13', 'STOP', 'CALLDATACOPY', 'SWAP7', 'SWAP1',
        'SWAP6', 'RETURN', 'DUP6', 'SWAP4', 'REVERT', 'DUP2',
        'SELFDESTRUCT', 'DUP10', 'DUP16', 'JUMPI', 'SSTORE', 'PUSH',
        'LOG3', 'LOG4', 'Missing', 'SWAP15'
    ]
    # Bytecode file sizes, ponzi first then non-ponzi (same order as self.op).
    for subdir, addrs in (('bytecode/', self.op[0]),
                          ('bytecode_np/', self.op[1])):
        for addr in addrs:
            self.size_info.append(
                os.path.getsize(self.paths['db'] + subdir + addr + '.json'))
    with open(self.paths['db'] + 'op_freq.json', 'rb') as f:
        self.op_freq = json.loads(f.read())
    self.cur_time = tl.compute_time(self.cur_time)
def gen_tr_dico(self):
    """Build tr_dico from the four serialized transaction files and save it.

    Layout: tr_dico[p=0, np=1][contract][nml=0, int=1] = list of tx dicts.
    Each input file alternates lines: contract hash, then a Python-literal
    list of transactions.
    """
    tr_dico = [[[0, 0] for _ in range(len(self.op[0]))],
               [[0, 0] for _ in range(len(self.op[1]))]]
    file_keys = ['database_nml', 'database_int',
                 'database_nml_np', 'database_int_np']
    op_indices = [0, 0, 1, 1]
    nml_int_indices = [0, 1, 0, 1]
    # Precompute address -> position: the original used `in` and
    # list.index() on a list, an O(n) scan per input line.
    addr_pos = [{addr: pos for pos, addr in enumerate(self.op[0])},
                {addr: pos for pos, addr in enumerate(self.op[1])}]
    for i in range(4):
        tr_index = op_indices[i]
        pos_of = addr_pos[tr_index]
        nml_int_index = nml_int_indices[i]
        print("loading " + file_keys[i])
        count = 0
        with open(self.paths[file_keys[i]]) as f:
            while True:
                count += 1
                if count % 100 == 0:
                    print(count)
                contract_hash = f.readline().strip('\n')
                list_line = f.readline()
                if not contract_hash:
                    break  # EOF: readline() returned ''
                pos = pos_of.get(contract_hash)
                if pos is None:
                    continue  # contract not in the current op list
                tr_dico[tr_index][pos][nml_int_index] = \
                    ast.literal_eval(list_line.strip('\n'))
    self.cur_time = tl.compute_time(self.cur_time)
    self.save_tr_dico(tr_dico)
def gen_op_freq(self):
    """Compute per-contract opcode frequency vectors and serialize them.

    op_freq[p=0, np=1][addr] = list of relative frequencies aligned with
    self.opcodes, written to <db>/op_freq.json. Input files are
    '<addr>.csv' with 'OPCODE,count' lines.
    """
    print("EtherDataToFreqAndTrDisc: generating op_freq.json")
    op_freq = [{}, {}]
    # Hoisted: self.opcodes.index() is an O(n) scan per input line.
    opcode_pos = {code: i for i, code in enumerate(self.opcodes)}
    n_opcodes = len(self.opcodes)
    for i in range(2):
        db_path = (self.paths['database_op'] if i == 0
                   else self.paths['database_op_np'])
        for addr in self.op[i]:
            with open(db_path + addr + '.csv', 'r') as f:
                raw = f.readlines()
            res = [0] * n_opcodes
            tot = 0
            if len(raw) > 1:
                for line in raw:
                    fields = line.strip('\n').split(',')
                    count = int(fields[1])
                    tot += count
                    res[opcode_pos[fields[0]]] += count
            # Guard: an all-zero count file would have divided by zero.
            tot = tot if tot > 0 else 1
            op_freq[i][addr] = [x / tot for x in res]
    print(f"{len(op_freq[0])}, {len(op_freq[1])}")
    self.cur_time = tl.compute_time(self.cur_time)
    with open(self.paths['db'] + 'op_freq.json', 'w') as outfile:
        outfile.write(json.dumps(op_freq))
    print('op_freq serialized')
def define_path(self):
    """Register all dataset file-system paths in self.paths and start the timer."""
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for elapsed-time measurement.
    self.cur_time = time.perf_counter()
    print("EtherDataToFreqAndTrDisc: define variables...")
    self.paths['db'] = '../dataset/'
    self.paths['database_nml'] = self.paths['db'] + 'sm_database/normal.json'
    self.paths['database_int'] = self.paths['db'] + 'sm_database/internal.json'
    self.paths['database_op'] = self.paths['db'] + 'ponzi/op_count/'
    self.paths['database_nml_np'] = self.paths['db'] + 'sm_database/normal_np.json'
    self.paths['database_int_np'] = self.paths['db'] + 'sm_database/internal_np.json'
    self.paths['database_op_np'] = self.paths['db'] + 'non_ponzi/op_count/'
    self.paths['opcode'] = self.paths['db'] + 'ponzi/opcode/'
    self.paths['opcode_np'] = self.paths['db'] + 'non_ponzi/opcode/'
    self.cur_time = tl.compute_time(self.cur_time)
def load_op(self):
    """Load contract addresses, opcode vocabulary, bytecode sizes and
    opcode frequencies.

    op[p=0, np=1][index] = contract_address (taken from .csv file stems).
    """
    print("Loading op, opcodes, op_freq, size_info...")

    def stems(directory):
        return [fname.split('.csv')[0]
                for fname in os.listdir(directory)
                if fname.endswith('.csv')]

    self.op = [stems(self.paths['database_op']),
               stems(self.paths['database_op_np'])]
    self.opcodes = OPCODES
    for prefix, addrs in (('ponzi/bcode/', self.op[0]),
                          ('non_ponzi/bcode/', self.op[1])):
        for addr in addrs:
            self.size_info.append(
                os.path.getsize(self.paths['db'] + prefix + addr + '.json'))
    with open(self.paths['db'] + 'op_freq.json', 'rb') as f:
        self.op_freq = json.loads(f.read())
    self.cur_time = tl.compute_time(self.cur_time)
def define_path(self):
    """Register all dataset file-system paths in self.paths."""
    print("Feature: define variable and load data")
    db = '../dataset/'
    self.paths['db'] = db
    # Key -> path relative to the dataset root.
    relative = {
        'database_nml': 'sm_database/normal/',
        'database_int': 'sm_database/internal/',
        'database_op': 'ponzi/op_count/',
        'database_nml_np': 'sm_database/normal_np/',
        'database_int_np': 'sm_database/internal_np/',
        'database_op_np': 'non_ponzi/op_count/',
        'opcode': 'ponzi_official_opcode/',
        'opcode_np': 'non_ponzi_official_opcode/',
    }
    for key, rel in relative.items():
        self.paths[key] = db + rel
    self.cur_time = tl.compute_time(self.cur_time)
def dump_arff(self):
    """Serialize self.df and self.df_out as ARFF model files keyed by self.J."""
    print("Dumping into arff files ...")
    suffix = str(self.J) + '.arff'
    with open(self.paths['db'] + 'models/PONZI_' + suffix, 'w') as f:
        a2p.dump(self.df, f)
    with open(self.paths['db'] + 'models/PONZI_out_' + suffix, 'w') as f:
        a2p.dump(self.df_out, f)
    self.cur_time = tl.compute_time(self.cur_time)
def create_pandas_dataframe(self):
    """Assemble self.ft into self.df and drop non-ponzi outliers.

    Columns are the basic feature names plus the opcode names, each with
    an '@NUMERIC' suffix (ARFF convention); the first column is the class
    label and 'size_info@NUMERIC' is appended from self.size_info.
    """
    print("Creating pandas dataframe...")
    header = ['{}@NUMERIC'.format(name)
              for name in self.ft_names + self.opcodes]
    header[0] = "ponzi@{ponzi,non_ponzi}"
    frame = pd.DataFrame(data=self.ft, columns=header)
    frame['size_info@NUMERIC'] = self.size_info
    self.cur_time = tl.compute_time(self.cur_time)
    self.df = frame
    self.get_rid_of_outliers(header)
def load_op(self):
    """Load contract addresses (excluding reverted contracts), opcode
    vocabulary, bytecode sizes and opcode frequencies.

    op[p=0, np=1][index] = contract_address, sorted; contracts flagged in
    self.revert are skipped.
    """
    print("Loading op, opcodes, op_freq, size_info...")

    def usable(directory):
        kept = []
        for fname in os.listdir(directory):
            if not fname.endswith('.csv'):
                continue
            addr = fname.split('.csv')[0]
            if not self.revert[addr]:
                kept.append(addr)
        return sorted(kept)

    self.op = [usable(self.paths['database_op']),
               usable(self.paths['database_op_np'])]
    self.opcodes = OPCODES
    for prefix, addrs in (('ponzi/bcode/', self.op[0]),
                          ('non_ponzi/bcode/', self.op[1])):
        for addr in addrs:
            self.size_info.append(
                os.path.getsize(self.paths['db'] + prefix + addr + '.json'))
    self.load_op_freq()
    tl.compute_time(self.cur_time)
def get_rid_of_outliers(self, columns):
    """Drop non-ponzi rows whose z-score exceeds 3 on any basic feature.

    Operates on self.df: the z-score is computed over the basic-feature
    columns only (label and opcode columns excluded); surviving non-ponzi
    rows plus all ponzi rows form self.df_out.

    :param columns: full column list; columns[0] is the class label and
        columns[len(self.ft_names):] are the opcode columns.
    """
    print("Getting rid of outliers for the non ponzi instances")
    out_index = 3  # z-score threshold
    non_ponzi = self.df.drop(self.df[self.df[columns[0]] == 'ponzi'].index)
    feature_part = non_ponzi.drop(
        labels=[columns[0]] + columns[len(self.ft_names):], axis=1)
    keep = (np.abs(
        stats.zscore(np.asarray(feature_part, dtype='float64')))
            < out_index).all(axis=1)
    df_out = non_ponzi[keep]
    ponzi_rows = self.df.drop(
        self.df[self.df[columns[0]] == 'non_ponzi'].index)
    # DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the documented replacement.
    self.df_out = pd.concat([df_out, ponzi_rows])
    tl.compute_time(self.cur_time)
def load_txs_data_one_directory(self, path, np_index, nml_index):
    """Load every '<addr>_<k>.json' transaction chunk for each contract.

    Chunks are read for k = 0, 1, ... until a file is missing; reverted
    contracts (self.revert) get an empty list. Results are stored in
    self.tr_dico_all[np_index][i][nml_index].

    self.op[p=0, np=1][index] = contract_address.
    """
    for idx, contract_addr in enumerate(self.op[np_index]):
        txs = []
        if not self.revert[contract_addr]:
            chunk = 0
            while True:
                try:
                    with open(f"{path}{contract_addr}_{chunk}.json") as f:
                        txs += json.loads(f.read())
                except FileNotFoundError:
                    break  # no more chunks for this contract
                chunk += 1
        self.tr_dico_all[np_index][idx][nml_index] = txs
    tl.compute_time(self.cur_time)
def dump_arff(self, days):
    """Write self.df to '<arff>/PONZI_<J>_day_<ddddd>.arff' and free tr_dico.

    :param days: observation window in days; zero-padded to 5 digits in
        the file name so files sort lexicographically.
    """
    print("Dumping into arff files ...")
    # f-string zero padding replaces the original manual if/elif ladder.
    day = f"{days:05d}"
    with open(self.paths['arff'] + f'PONZI_{str(self.J)}_day_{day}.arff',
              'w') as f:
        a2p.dump(self.df, f)
    tl.compute_time(self.cur_time)
    # Release the (large) transaction dictionary once dumped.
    self.tr_dico = {}
def load_op(self):
    """Load contract addresses, opcode vocabulary, bytecode sizes and
    opcode frequencies, then print per-opcode averages per class.

    op[p=0, np=1][index] = contract_address (sorted .csv file stems).
    """
    print("Loading op, opcodes, op_freq, size_info...")
    self.op = [
        sorted([
            fname.split('.csv')[0]
            for fname in os.listdir(self.paths['database_op'])
            if fname.endswith('.csv')
        ]),
        sorted([
            fname.split('.csv')[0]
            for fname in os.listdir(self.paths['database_op_np'])
            if fname.endswith('.csv')
        ])
    ]
    self.opcodes = OPCODES
    # Bytecode file sizes, ponzi first then non-ponzi (same order as self.op).
    for i in self.op[0]:
        self.size_info.append(
            os.path.getsize(self.paths['db'] + 'ponzi/bcode/' + i + '.json'))
    for i in self.op[1]:
        self.size_info.append(
            os.path.getsize(self.paths['db'] + 'non_ponzi/bcode/' + i +
                            '.json'))
    with open(
            self.paths['db'] + 'op_freq_list.json',
            'rb',
    ) as f:
        self.op_freq = json.loads(f.read())
    # NOTE(review): load_op_freq() is called right after self.op_freq was
    # just loaded from op_freq_list.json — presumably it overwrites or
    # reshapes it; verify which of the two loads is authoritative.
    self.load_op_freq()
    # Statistics requested by Prof EJ: dimensions and per-opcode averages.
    print(len(self.op_freq))
    print(len(self.op_freq[0]))
    print(len(self.op_freq[0][0]))
    print(len(self.opcodes))
    for tr_index in range(2):
        print(f"avg of {'Ponzi' if tr_index == 0 else 'Non-Ponzi'}")
        nums = [[] for i in range(len(self.opcodes))
                ]  # one list of frequencies per opcode
        for contract in self.op_freq[tr_index]:
            for i in range(len(self.opcodes)):
                nums[i].append(float(contract[i]))
        for i in range(len(self.opcodes)):
            print(f'{self.opcodes[i]}: {sum(nums[i]) / len(nums[i])}')
    # End statistics.
    self.cur_time = tl.compute_time(self.cur_time)
def load_tr_dico(self):
    """Load the transaction dictionary from its serialized JSON shards.

    Ponzi data is one file; non-ponzi data is sharded into files of 500
    contracts each ('tr_dico_nonponzi<k>.json').
    """
    tr_dico = [[], []]
    with open(self.paths['db'] + 'tr_dico_ponzi.json', 'rb') as f:
        tr_dico[0] = json.loads(f.read())
    n_shards = len(self.op[1]) // 500 + 1
    for shard in range(n_shards):
        with open(self.paths['db'] + 'tr_dico_nonponzi' + str(shard) +
                  '.json', 'rb') as f:
            tr_dico[1] += json.loads(f.read())
        print("Reading tr_dico: " + str(len(tr_dico[1])))
    self.tr_dico = tr_dico
    self.cur_time = tl.compute_time(self.cur_time)
def define_path(self):
    """Register paths to the Marion_files sm_database inputs in self.paths."""
    print("Feature: define variable and load data")
    base = '../Marion_files/sm_database/'
    self.paths['db'] = base
    self.paths['database_nml'] = base + 'normal.json'
    self.paths['database_int'] = base + 'internal.json'
    # Same as opcode/raw_opcodes/ in the original feature.py
    self.paths['database_op'] = base + 'opcode/opcodes_count/'
    self.paths['database_nml_np'] = base + 'normal_np.json'
    self.paths['database_int_np'] = base + 'internal_np.json'
    self.paths['database_op_np'] = base + 'opcode_np/opcode_count/bytecode_np/'
    self.cur_time = tl.compute_time(self.cur_time)
def load_tr_dico(self):
    """Load tr_dico from its serialized JSON shards.

    Layout: tr_dico[p=0, np=1][contract][nml=0, int=1] = list of tx dicts,
    each with a 'blockNumber' key. Non-ponzi data is split into shards of
    500 contracts ('tr_dico_nonponzi<k>.json').
    """
    with open(self.paths['db'] + 'tr_dico_ponzi.json', 'rb') as f:
        ponzi_txs = json.loads(f.read())
    non_ponzi_txs = []
    shard = 0
    while shard <= len(self.op[1]) // 500:
        shard_file = self.paths['db'] + 'tr_dico_nonponzi' + str(shard) + '.json'
        with open(shard_file, 'rb') as f:
            non_ponzi_txs += json.loads(f.read())
        print("Reading tr_dico: " + str(len(non_ponzi_txs)))
        shard += 1
    self.tr_dico = [ponzi_txs, non_ponzi_txs]
    self.cur_time = tl.compute_time(self.cur_time)
def gen_op_freq_origin(self):
    """Recompute opcode frequency vectors from the original opcode dumps
    and serialize them to op_freq.json.

    op_freq[p=0, np=1] is a list of per-contract vectors (ordered like
    self.op[...]), each aligned with self.opcodes.

    NOTE(review): the parsing below assumes a fixed-width line format —
    count at char 3, mnemonic at chars 5:-1. opcode[3] is a SINGLE
    character, so multi-digit counts lose all but one digit (the original
    author's inline comment raises the same concern). Verify the dump
    format before trusting these frequencies.
    """
    op_freq = [[], []]
    # ponzi instances
    for add in self.op[0]:
        with open(self.paths['database_op'] + add + '.json', 'r') as f:
            raw = f.readlines()
            res = [0 for i in range(len(self.opcodes))]
            if len(raw) > 1:
                tot = 0
                for opcode in raw:
                    # count = number % 10 instead of number?
                    count = float(opcode[3])
                    tot += count
                    # '=' (not '+=') — a repeated opcode overwrites, doesn't
                    # accumulate; presumably each opcode appears once per file.
                    res[self.opcodes.index(opcode[5:-1])] = count
            else:
                tot = 1
            res = [x / tot for x in res]
            op_freq[0].append(res)
            print(res)
    # non ponzi instances
    for add in self.op[1]:
        with open(self.paths['database_op_np'] + add + '.json', 'r') as f:
            raw = f.readlines()
            res = [0 for i in range(len(self.opcodes))]
            if len(raw) > 1:
                tot = 0
                for opcode in raw:
                    # count = number % 10 instead of number?
                    count = float(opcode[3])
                    tot += count
                    res[self.opcodes.index(opcode[5:-1])] = count
            else:
                tot = 1
            res = [x / tot for x in res]
            op_freq[1].append(res)
            print(res)
    t2 = tl.compute_time(self.cur_time)
    with open(self.paths['db'] + 'op_freq.json', 'w') as outfile:
        outfile.write(json.dumps(op_freq))
    print('op_freq serialized')
def cal_value_time_in_out(self):
    """Compute basic value/time features for every contract within self.J days.

    For each contract, transactions (normal + internal) within J days of
    the first normal tx are split into incoming/outgoing values and
    timestamps, passed to tl.basic_features, and concatenated with the
    contract's opcode-frequency vector. Rows are collected into self.ft.
    """
    ft = []
    len_op = [len(self.op[0]), len(self.op[1])]
    for tr_index in range(2):
        # BUG FIX: the original lacked parentheses around the conditional
        # expression, so '+' bound first and it printed
        # 'computing features forponzi' or just 'non ponzi'. Parenthesized
        # to match the other cal_* versions in this file.
        print('computing features for ' +
              ('ponzi' if tr_index == 0 else 'non ponzi'))
        for i in range(len_op[tr_index]):
            val_in, val_out, time_in, time_out = [], [], [], []
            # First normal tx is taken as the creation tx — assumes the
            # normal-tx list is sorted ascending by timeStamp.
            birth = float(self.tr_dico[tr_index][i][0][0]['timeStamp'])
            for tx in self.tr_dico[tr_index][i][0] + self.tr_dico[
                    tr_index][i][1]:
                timestamp = float(tx['timeStamp'])
                if (timestamp - birth) / (60 * 60 * 24) <= self.J:
                    # Outgoing when the contract itself (or '') is the sender.
                    if tx['from'] == '' or tx['from'] == self.op[tr_index][i]:
                        val_out.append(float(tx['value']))
                        time_out.append(timestamp)
                    else:
                        val_in.append(float(tx['value']))
                        time_in.append(timestamp)
            res = tl.basic_features(
                'ponzi' if tr_index == 0 else 'non_ponzi',
                np.asarray(val_in), np.asarray(val_out),
                np.asarray(time_in), np.asarray(time_out))
            ft.append(
                np.concatenate((res,
                                np.asarray(self.op_freq[tr_index][i],
                                           dtype='float32'))))
    self.cur_time = tl.compute_time(self.cur_time)
    self.ft = ft
def cal_advanced_features(self):
    """Compute basic + opcode-frequency features for every contract.

    For each contract, the normal and internal transactions within the
    first self.J days after creation are aggregated (values, timestamps,
    paying addresses) via the cal_value_time_in_out / cal_addr_in_out
    helpers, turned into a feature row by tl.basic_features, and
    concatenated with the contract's opcode-frequency vector.

    Populates self.ft (full rows), self.ft_opcodes (label + opcode
    frequencies) and self.ft_basic (basic features only), and prints the
    average nbr_tx_in / lifetime per class.
    """
    ft = []
    ft_opcodes = []
    ft_basic = []
    len_op = [len(self.op[0]), len(self.op[1])]
    nbrs = [[], []]   # res[1] per contract, per class — nbr_tx_in per author's note
    lifes = [[], []]  # res[16] per contract, per class — lifetime per author's note
    for tr_index in range(2):
        print('computing features for ' +
              ('ponzi' if tr_index == 0 else 'non ponzi'))
        for i in range(len_op[tr_index]):  # for each contract
            val_in = []
            val_out = []
            time_in = []
            time_out = []
            pay_in = 0
            pay_out = 0
            addr_in = set()
            addr_out = set()
            # First normal tx is taken as the creation tx — assumes the
            # normal-tx list is sorted ascending by timeStamp.
            birth = float(self.tr_dico[tr_index][i][0][0]['timeStamp'])
            for tx in self.tr_dico[tr_index][i][0] + self.tr_dico[
                    tr_index][i][1]:  # for each tx of that contract
                contract_hash = self.op[tr_index][i]
                timestamp = float(tx['timeStamp'])
                if (timestamp - birth) / (60 * 60 * 24) <= self.J:
                    # Both helpers mutate the accumulator lists/sets in place;
                    # cal_addr_in_out also returns updated pay counters.
                    self.cal_value_time_in_out({
                        'tx': tx,
                        'contract_hash': contract_hash,
                        'val_in': val_in,
                        'val_out': val_out,
                        'time_in': time_in,
                        'time_out': time_out,
                        'timestamp': timestamp
                    })
                    (pay_in, pay_out) = self.cal_addr_in_out({
                        'tx': tx,
                        'contract_hash': contract_hash,
                        'pay_in': pay_in,
                        'pay_out': pay_out,
                        'addr_in': addr_in,
                        'addr_out': addr_out
                    })
            num_overlap_addr = len(addr_in.intersection(addr_out))
            res = tl.basic_features({
                'ponzi': 'ponzi' if tr_index == 0 else 'non_ponzi',
                'val_in': np.asarray(val_in),
                'val_out': np.asarray(val_out),
                'time_in': np.asarray(time_in),
                'time_out': np.asarray(time_out),
                'pay_in': pay_in,
                'pay_out': pay_out,
                'num_overlap_addr': num_overlap_addr
            })
            # Feature indices per the original author's note:
            # 1 nbr_tx_in, 12 gini in, 13 gini out, 15 gini timeout, 16 lifetime.
            nbrs[tr_index].append(float(res[1]))
            lifes[tr_index].append(float(res[16]))
            ft.append(
                np.concatenate((res,
                                np.asarray(self.op_freq[tr_index][i],
                                           dtype='float64'))))
            ft_opcodes.append(
                np.concatenate((np.asarray([
                    'ponzi' if tr_index == 0 else 'non_ponzi'
                ]), np.asarray(self.op_freq[tr_index][i], dtype='float64'))))
            ft_basic.append(res)
    self.cur_time = tl.compute_time(self.cur_time)
    self.ft = ft
    self.ft_opcodes = ft_opcodes
    self.ft_basic = ft_basic
    print('nbrs:')
    print(
        f'P={sum(nbrs[0]) / len(nbrs[0])}, NP={sum(nbrs[1]) / len(nbrs[1])}'
    )
    print('lifes:')
    print(
        f'P={sum(lifes[0]) / len(lifes[0])}, NP={sum(lifes[1]) / len(lifes[1])}'
    )
def cal_advanced_features(self):
    """Compute basic + opcode-frequency features for every contract, and
    report per-class avg/stdev for selected features.

    Same pipeline as the other cal_advanced_features variants (helpers
    mutate accumulators in place; tl.basic_features builds the row), plus
    top_30_avg_stdev accumulators whose avg/stdev are printed at the end.
    """
    ft = []
    ft_opcodes = []
    ft_basic = []
    len_op = [len(self.op[0]), len(self.op[1])]
    nbrs = [[], []]
    lifes = [[], []]
    # Per-class accumulators for the avg/stdev report printed at the end.
    top_30_avg_stdev = [{
        'size_info': [],
        'nbr_tx_in': [],
        'lifetime': [],
        'num_paid_in_addr': [],
        'gini_in': [],
        'overlap_in_out_addr': [],
        'gini_time_out': [],
        'avg_time_btw_tx': []
    }, {
        'size_info': [],
        'nbr_tx_in': [],
        'lifetime': [],
        'num_paid_in_addr': [],
        'gini_in': [],
        'overlap_in_out_addr': [],
        'gini_time_out': [],
        'avg_time_btw_tx': []
    }]
    for tr_index in range(2):
        print('computing features for ' +
              ('ponzi' if tr_index == 0 else 'non ponzi'))
        for i in range(len_op[tr_index]):  # for each contract
            val_in = []
            val_out = []
            time_in = []
            time_out = []
            pay_in = 0
            pay_out = 0
            addr_in = set()
            addr_out = set()
            # The timeStamp of all nml TXs is sorted asc, so the first
            # normal TX is the TX that creates the contract. (Per Charles;
            # the author checked that all nml TX timeStamps are ascending.)
            birth = float(self.tr_dico[tr_index][i][0][0]['timeStamp'])
            for tx in self.tr_dico[tr_index][i][0] + self.tr_dico[
                    tr_index][i][1]:  # for each tx of that contract
                contract_hash = self.op[tr_index][i]
                timestamp = float(tx['timeStamp'])
                if (timestamp - birth) / (60 * 60 * 24) <= self.J:
                    # Both helpers mutate the accumulator lists/sets in place;
                    # cal_addr_in_out also returns updated pay counters.
                    self.cal_value_time_in_out({
                        'tx': tx,
                        'contract_hash': contract_hash,
                        'val_in': val_in,
                        'val_out': val_out,
                        'time_in': time_in,
                        'time_out': time_out,
                        'timestamp': timestamp
                    })
                    (pay_in, pay_out) = self.cal_addr_in_out({
                        'tx': tx,
                        'contract_hash': contract_hash,
                        'pay_in': pay_in,
                        'pay_out': pay_out,
                        'addr_in': addr_in,
                        'addr_out': addr_out
                    })
            num_overlap_addr = len(addr_in.intersection(addr_out))
            res = tl.basic_features({
                'ponzi': 'ponzi' if tr_index == 0 else 'non_ponzi',
                'val_in': np.asarray(val_in),
                'val_out': np.asarray(val_out),
                'time_in': np.asarray(time_in),
                'time_out': np.asarray(time_out),
                'pay_in': pay_in,
                'pay_out': pay_out,
                'num_overlap_addr': num_overlap_addr
            })
            # Feature indices per the original author's note:
            # 1 nbr_tx_in, 12 gini in, 13 gini out, 15 gini timeout, 16 lifetime.
            nbrs[tr_index].append(float(res[1]))
            lifes[tr_index].append(float(res[16]))
            top_30_avg_stdev[tr_index]['nbr_tx_in'].append(res[1])
            top_30_avg_stdev[tr_index]['lifetime'].append(res[16])
            top_30_avg_stdev[tr_index]['num_paid_in_addr'].append(res[5])
            top_30_avg_stdev[tr_index]['gini_in'].append(res[12])
            top_30_avg_stdev[tr_index]['overlap_in_out_addr'].append(
                res[7])
            top_30_avg_stdev[tr_index]['gini_time_out'].append(res[15])
            top_30_avg_stdev[tr_index]['avg_time_btw_tx'].append(res[14])
            ft.append(
                np.concatenate((res,
                                np.asarray(self.op_freq[tr_index][i],
                                           dtype='float64'))))
            ft_opcodes.append(
                np.concatenate((np.asarray([
                    'ponzi' if tr_index == 0 else 'non_ponzi'
                ]), np.asarray(self.op_freq[tr_index][i], dtype='float64'))))
            ft_basic.append(res)
    tl.compute_time(self.cur_time)
    self.ft = ft
    self.ft_opcodes = ft_opcodes
    self.ft_basic = ft_basic
    self.top_30_avg_stdev = top_30_avg_stdev
    print("nbrs==========================")
    print(
        f"P={sum(nbrs[0]) / len(nbrs[0])}, NP={sum(nbrs[1]) / len(nbrs[1])}"
    )
    print("lifes==========================")
    print(
        f"P={sum(lifes[0]) / len(lifes[0])}, NP={sum(lifes[1]) / len(lifes[1])}"
    )
    # Bytecode file sizes are added to the stats AFTER the report header
    # prints above, just before the avg/stdev loop below.
    for i in self.op[0]:
        top_30_avg_stdev[0]['size_info'].append(
            os.path.getsize(self.paths['db'] + 'ponzi/bcode/' + i +
                            '.json'))
    for i in self.op[1]:
        top_30_avg_stdev[1]['size_info'].append(
            os.path.getsize(self.paths['db'] + 'non_ponzi/bcode/' + i +
                            '.json'))
    for tr_index in range(2):
        print(f'tr_index={tr_index}')
        for name, data in top_30_avg_stdev[tr_index].items():
            print(f"name={name}, len={len(data)}")
            avg = sum([float(each) for each in data]) / len(data)
            print(
                f"\tname={name}, avg={avg}, stdev={statistics.stdev(data)}"
            )
def cal_advanced_features(self):
    """Compute basic + opcode-frequency features for every contract.

    Transactions within the first self.J days after creation feed the
    cal_value_time_in_out / cal_addr_in_out helpers (which mutate the
    accumulators in place); tl.basic_features builds the feature row,
    which is concatenated with the opcode-frequency vector. Populates
    self.ft, self.ft_opcodes and self.ft_basic.
    """
    ft = []
    ft_opcodes = []
    ft_basic = []
    class_sizes = [len(self.op[0]), len(self.op[1])]
    for tr_index in range(2):
        label = 'ponzi' if tr_index == 0 else 'non_ponzi'
        print('computing features for ' +
              ('ponzi' if tr_index == 0 else 'non ponzi'))
        for i in range(class_sizes[tr_index]):
            contract_hash = self.op[tr_index][i]
            val_in, val_out, time_in, time_out = [], [], [], []
            pay_in = 0
            pay_out = 0
            addr_in = set()
            addr_out = set()
            # First normal tx is the creation tx (timestamps sorted asc).
            birth = float(self.tr_dico[tr_index][i][0][0]['timeStamp'])
            all_txs = (self.tr_dico[tr_index][i][0] +
                       self.tr_dico[tr_index][i][1])
            for tx in all_txs:
                timestamp = float(tx['timeStamp'])
                if (timestamp - birth) / (60 * 60 * 24) > self.J:
                    continue  # outside the observation window
                self.cal_value_time_in_out({
                    'tx': tx,
                    'contract_hash': contract_hash,
                    'val_in': val_in,
                    'val_out': val_out,
                    'time_in': time_in,
                    'time_out': time_out,
                    'timestamp': timestamp
                })
                (pay_in, pay_out) = self.cal_addr_in_out({
                    'tx': tx,
                    'contract_hash': contract_hash,
                    'pay_in': pay_in,
                    'pay_out': pay_out,
                    'addr_in': addr_in,
                    'addr_out': addr_out
                })
            res = tl.basic_features({
                'ponzi': label,
                'val_in': np.asarray(val_in),
                'val_out': np.asarray(val_out),
                'time_in': np.asarray(time_in),
                'time_out': np.asarray(time_out),
                'pay_in': pay_in,
                'pay_out': pay_out,
                'num_overlap_addr': len(addr_in.intersection(addr_out))
            })
            freq_vec = np.asarray(self.op_freq[tr_index][i], dtype='float64')
            ft.append(np.concatenate((res, freq_vec)))
            ft_opcodes.append(
                np.concatenate((np.asarray([label]), freq_vec)))
            ft_basic.append(res)
    self.cur_time = tl.compute_time(self.cur_time)
    self.ft = ft
    self.ft_opcodes = ft_opcodes
    self.ft_basic = ft_basic