示例#1
0
 def filter_txs_by_block(self, days):
     """Keep only transactions mined within `days` days of each contract's creation.

     Populates self.tr_dico with the same
     [p=0, np=1][contract][nml=0, int=1] -> list-of-tx-dicts structure as
     self.tr_dico_all, filtered by block distance from the creation block.
     """
     # ~1 block every 10 seconds on Ethereum -> 6 * 60 * 24 blocks per day.
     NUM_BLOCK_PER_DAY = 6 * 60 * 24
     max_block_delta = NUM_BLOCK_PER_DAY * days
     tr_dico = [[[[], []] for _ in range(len(self.op[0]))],
                [[[], []] for _ in range(len(self.op[1]))]]
     count_txs = 0
     for np_index in range(2):
         for contract_index in range(len(self.tr_dico_all[np_index])):
             # The creation block is invariant per contract: resolve it once
             # here instead of re-resolving it for every transaction.
             contract_addr = self.op[np_index][contract_index]
             creation_block = self.contract_creation_block[contract_addr]
             for nml_index in range(2):
                 data = []
                 for tx_dict in self.tr_dico_all[np_index][contract_index][nml_index]:
                     if int(tx_dict['blockNumber']) - creation_block < max_block_delta:
                         data.append(tx_dict)
                         count_txs += 1
                 tr_dico[np_index][contract_index][nml_index] = data
     self.tr_dico = tr_dico
     tl.compute_time(self.cur_time)
     print(f"days={days}, num_txs={count_txs}")
示例#2
0
 def create_pandas_dataframe(self, ft, ft_names):
     """Build the labelled feature DataFrame used for ARFF export.

     `ft` is the feature matrix; `ft_names` the per-column names. The first
     column is the class attribute, every other column is numeric.
     """
     print("Creating pandas dataframe...")
     columns = [f"{name}@NUMERIC" for name in ft_names]
     columns[0] = "ponzi@{ponzi,non_ponzi}"
     frame = pd.DataFrame(data=ft, columns=columns)
     # Bytecode file size is appended as an extra numeric feature.
     frame['size_info@NUMERIC'] = self.size_info
     tl.compute_time(self.cur_time)
     return frame
示例#3
0
 def load_op(self):
     """Load contract address lists, the opcode vocabulary, bytecode sizes
     and the serialized opcode-frequency table."""
     print("Loading op, opcodes, op_freq, size_info...")

     def json_stems(path_key):
         # Contract addresses are the '.json' file stems in the directory.
         return [fname.split('.json')[0]
                 for fname in os.listdir(self.paths[path_key])
                 if fname.endswith('.json')]

     self.op = [json_stems('database_op'), json_stems('database_op_np')]
     # Fixed opcode vocabulary; order matters, it defines feature indices.
     self.opcodes = [
         'SWAP8', 'DUP11', 'DUP14', 'SWAP10', 'DUP15', 'LOG2', 'INVALID',
         'SWAP9', 'SWAP5', 'SWAP12', 'SWAP16', 'DUP9', 'LOG1', 'DUP12',
         'SWAP11', 'SWAP2', 'MSTORE8', 'SWAP14', 'DUP13', 'POP', 'DUP1',
         'DUP8', 'DUP7', 'DUP3', 'DUP4', 'MSTORE', 'SWAP3', 'CODECOPY',
         'JUMP', 'DUP5', 'SWAP13', 'STOP', 'CALLDATACOPY', 'SWAP7', 'SWAP1',
         'SWAP6', 'RETURN', 'DUP6', 'SWAP4', 'REVERT', 'DUP2',
         'SELFDESTRUCT', 'DUP10', 'DUP16', 'JUMPI', 'SSTORE', 'PUSH',
         'LOG3', 'LOG4', 'Missing', 'SWAP15'
     ]
     # Bytecode file sizes, ponzi contracts first, then non-ponzi.
     for addr in self.op[0]:
         self.size_info.append(
             os.path.getsize(self.paths['db'] + 'bytecode/' + addr + '.json'))
     for addr in self.op[1]:
         self.size_info.append(
             os.path.getsize(self.paths['db'] + 'bytecode_np/' + addr + '.json'))
     with open(self.paths['db'] + 'op_freq.json', 'rb') as f:
         self.op_freq = json.loads(f.read())
     self.cur_time = tl.compute_time(self.cur_time)
示例#4
0
 def gen_tr_dico(self):
     """Assemble tr_dico from the four raw transaction dump files.

     tr_dico[p=0, np=1][contract][nml=0, int=1] = list of tx dicts, ordered
     to match self.op[]. Contracts with no dump entry keep the 0 placeholder.
     The result is handed to self.save_tr_dico().
     """
     tr_dico = [[[0, 0] for _ in range(len(self.op[0]))],
                [[0, 0] for _ in range(len(self.op[1]))]]
     file_paths = ['database_nml', 'database_int', 'database_nml_np', 'database_int_np']
     op_indices = [0, 0, 1, 1]
     nml_int_indices = [0, 1, 0, 1]
     for i in range(4):
         tr_index = op_indices[i]
         cur_op = self.op[tr_index]
         # PERF: the original did `contract_hash not in cur_op` and
         # `cur_op.index(contract_hash)` per line — two O(n) list scans.
         # One dict built per file makes both lookups O(1).
         addr_to_pos = {addr: pos for pos, addr in enumerate(cur_op)}
         nml_int_index = nml_int_indices[i]
         print("loading " + file_paths[i])
         count = 0
         with open(self.paths[file_paths[i]]) as f:
             while True:
                 count += 1
                 if count % 100 == 0:
                     print(count)
                 # Dump format: alternating lines — contract hash, then the
                 # Python-literal list of its transaction dicts.
                 contract_hash = f.readline().strip('\n')
                 list_line = f.readline()
                 if not contract_hash:
                     break
                 pos = addr_to_pos.get(contract_hash)
                 if pos is None:
                     continue
                 tr_dico[tr_index][pos][nml_int_index] = ast.literal_eval(list_line.strip('\n'))
     self.cur_time = tl.compute_time(self.cur_time)
     self.save_tr_dico(tr_dico)
示例#5
0
    def gen_op_freq(self):
        """Compute per-contract opcode frequency vectors and write op_freq.json.

        op_freq[p=0, np=1][address] = list of normalized opcode counts,
        ordered like self.opcodes.
        """
        print("EtherDataToFreqAndTrDisc: generating op_freq.json")
        op_freq = [{}, {}]
        for i in range(2):
            db_path = self.paths['database_op'] if i == 0 else self.paths['database_op_np']
            for addr in self.op[i]:
                with open(db_path + addr + '.csv', 'r') as f:
                    raw = f.readlines()
                    res = [0 for _ in range(len(self.opcodes))]
                    # BUG FIX: `tot` was only assigned inside the
                    # len(raw) > 1 branch, so if the very first file had
                    # <= 1 lines, `tot = tot if ...` raised NameError.
                    # Initialize it per file instead.
                    tot = 0
                    if len(raw) > 1:
                        for line in raw:
                            # Each line is "<OPCODE>,<count>".
                            code, count_str = line.strip('\n').split(',')[:2]
                            count = int(count_str)
                            tot += count
                            res[self.opcodes.index(code)] += count
                    else:
                        # Empty/one-line file: avoid dividing by zero.
                        tot = 1
                    res = [x / tot for x in res]
                    op_freq[i][addr] = res

        print(f"{len(op_freq[0])}, {len(op_freq[1])}")
        self.cur_time = tl.compute_time(self.cur_time)
        with open(self.paths['db'] + 'op_freq.json', 'w') as outfile:
            outfile.write(json.dumps(op_freq))
            print('op_freq serialized')
示例#6
0
    def define_path(self):
        """Start the elapsed-time counter and register all dataset paths."""
        # BUG FIX: time.clock() was deprecated in 3.3 and removed in Python
        # 3.8; time.perf_counter() is the documented replacement for
        # elapsed-time measurement.
        self.cur_time = time.perf_counter()
        print("EtherDataToFreqAndTrDisc: define variables...")
        self.paths['db'] = '../dataset/'

        # Raw transaction dumps and opcode-count directory (ponzi).
        self.paths[
            'database_nml'] = self.paths['db'] + 'sm_database/normal.json'
        self.paths[
            'database_int'] = self.paths['db'] + 'sm_database/internal.json'
        self.paths['database_op'] = self.paths['db'] + 'ponzi/op_count/'

        # Same layout for the non-ponzi class.
        self.paths['database_nml_np'] = self.paths[
            'db'] + 'sm_database/normal_np.json'
        self.paths['database_int_np'] = self.paths[
            'db'] + 'sm_database/internal_np.json'
        self.paths['database_op_np'] = self.paths['db'] + 'non_ponzi/op_count/'

        self.paths['opcode'] = self.paths['db'] + 'ponzi/opcode/'
        self.paths['opcode_np'] = self.paths['db'] + 'non_ponzi/opcode/'

        # # For original data Marion_files
        # self.paths['db'] = '../dataset/sm_database/'
        # self.paths['database_op'] = self.paths['db'] + 'opcode/opcodes_count/'
        # self.paths['database_op_np'] = self.paths['db'] + 'opcode_np/opcode_count/bytecode_np/'

        self.cur_time = tl.compute_time(self.cur_time)
示例#7
0
 def load_op(self):
     """Load contract addresses, the opcode vocabulary, bytecode sizes and
     the serialized opcode-frequency table.

     op[p=0, np=1][index] = contract_address
     """
     print("Loading op, opcodes, op_freq, size_info...")

     def csv_stems(path_key):
         # Addresses are the '.csv' file stems of the op-count directory.
         return [name.split('.csv')[0]
                 for name in os.listdir(self.paths[path_key])
                 if name.endswith('.csv')]

     self.op = [csv_stems('database_op'), csv_stems('database_op_np')]
     self.opcodes = OPCODES
     # Bytecode sizes: ponzi contracts first, then non-ponzi, matching the
     # row order of the feature matrix.
     for addr in self.op[0]:
         self.size_info.append(os.path.getsize(
             self.paths['db'] + 'ponzi/bcode/' + addr + '.json'))
     for addr in self.op[1]:
         self.size_info.append(os.path.getsize(
             self.paths['db'] + 'non_ponzi/bcode/' + addr + '.json'))
     with open(self.paths['db'] + 'op_freq.json', 'rb') as f:
         self.op_freq = json.loads(f.read())
     self.cur_time = tl.compute_time(self.cur_time)
示例#8
0
    def define_path(self):
        """Register all dataset paths used by the feature pipeline."""
        print("Feature: define variable and load data")
        db = '../dataset/'
        self.paths['db'] = db

        self.paths.update({
            # Per-contract transaction directories and opcode counts (ponzi).
            'database_nml': db + 'sm_database/normal/',
            'database_int': db + 'sm_database/internal/',
            'database_op': db + 'ponzi/op_count/',
            # Same layout for the non-ponzi class.
            'database_nml_np': db + 'sm_database/normal_np/',
            'database_int_np': db + 'sm_database/internal_np/',
            'database_op_np': db + 'non_ponzi/op_count/',
            # Official opcode dumps.
            'opcode': db + 'ponzi_official_opcode/',
            'opcode_np': db + 'non_ponzi_official_opcode/',
        })

        self.cur_time = tl.compute_time(self.cur_time)
示例#9
0
 def dump_arff(self):
     """Serialize self.df and self.df_out into the two ARFF model files."""
     print("Dumping into arff files ...")
     targets = [
         (self.df, 'models/PONZI_' + str(self.J) + '.arff'),
         (self.df_out, 'models/PONZI_out_' + str(self.J) + '.arff'),
     ]
     for frame, rel_path in targets:
         with open(self.paths['db'] + rel_path, 'w') as f:
             a2p.dump(frame, f)
     self.cur_time = tl.compute_time(self.cur_time)
示例#10
0
 def create_pandas_dataframe(self):
     """Build the labelled feature DataFrame and strip non-ponzi outliers."""
     print("Creating pandas dataframe...")
     labels = self.ft_names + self.opcodes
     columns = [f"{label}@NUMERIC" for label in labels]
     # The leading column is the class attribute, not a numeric feature.
     columns[0] = "ponzi@{ponzi,non_ponzi}"
     frame = pd.DataFrame(data=self.ft, columns=columns)
     # Bytecode file size is appended as an extra numeric feature.
     frame['size_info@NUMERIC'] = self.size_info
     self.cur_time = tl.compute_time(self.cur_time)
     self.df = frame
     self.get_rid_of_outliers(columns)
示例#11
0
 def load_op(self):
     """Load contract addresses (skipping reverted contracts), the opcode
     vocabulary, bytecode sizes and opcode frequencies.

     op[p=0, np=1][index] = contract_address
     """
     print("Loading op, opcodes, op_freq, size_info...")

     def usable_addrs(path_key):
         # '.csv' stems whose contract did not revert, sorted so the
         # ordering is stable and shared with op_freq.
         stems = (name.split('.csv')[0]
                  for name in os.listdir(self.paths[path_key])
                  if name.endswith('.csv'))
         return sorted(stem for stem in stems if not self.revert[stem])

     self.op = [usable_addrs('database_op'), usable_addrs('database_op_np')]
     self.opcodes = OPCODES
     # Bytecode sizes: ponzi contracts first, then non-ponzi.
     for addr in self.op[0]:
         self.size_info.append(os.path.getsize(
             self.paths['db'] + 'ponzi/bcode/' + addr + '.json'))
     for addr in self.op[1]:
         self.size_info.append(os.path.getsize(
             self.paths['db'] + 'non_ponzi/bcode/' + addr + '.json'))
     self.load_op_freq()
     tl.compute_time(self.cur_time)
示例#12
0
    def get_rid_of_outliers(self, columns):
        """Drop non-ponzi rows whose basic features are z-score outliers.

        Every ponzi row is kept; a non-ponzi row survives only if all of its
        basic-feature z-scores are below `out_index` in absolute value.
        The filtered frame is stored in self.df_out.
        """
        print("Getting rid of outliers for the non ponzi instances")
        out_index = 3
        non_ponzi = self.df.drop(self.df[self.df[columns[0]] == 'ponzi'].index)
        # Only the basic features participate in the z-score: drop the class
        # column and every opcode-frequency column (past len(self.ft_names)).
        basic = non_ponzi.drop(
            labels=[columns[0]] + columns[len(self.ft_names):], axis=1)
        keep = (np.abs(stats.zscore(np.asarray(basic, dtype='float64')))
                < out_index).all(axis=1)
        # BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed
        # in 2.0; pd.concat is the supported equivalent.
        self.df_out = pd.concat([
            non_ponzi[keep],
            self.df.drop(self.df[self.df[columns[0]] == 'non_ponzi'].index)
        ])
        tl.compute_time(self.cur_time)
示例#13
0
 def load_txs_data_one_directory(self, path, np_index, nml_index):
     """Load every paged tx-dump file per contract into self.tr_dico_all.

     Pages are named {addr}_{k}.json for k = 0, 1, ...; loading stops at the
     first missing page. Reverted contracts get an empty list.
     """
     # self.op[p=0, np=1][index] = contract_address
     for idx, contract_addr in enumerate(self.op[np_index]):
         data = []
         if not self.revert[contract_addr]:
             file_index = 0
             while True:
                 try:
                     with open(f"{path}{contract_addr}_{file_index}.json") as f:
                         data += json.loads(f.read())
                 except FileNotFoundError:
                     # No more pages for this contract.
                     break
                 file_index += 1
         self.tr_dico_all[np_index][idx][nml_index] = data
     tl.compute_time(self.cur_time)
示例#14
0
 def dump_arff(self, days):
     """Write self.df as an ARFF file tagged with J and the zero-padded day.

     Clears self.tr_dico afterwards to release the (large) transaction data.
     """
     print("Dumping into arff files ...")
     # The original five-branch if/elif ladder was hand-rolled fixed-width
     # zero padding; the format spec does it directly (assumes days >= 0).
     day = f"{days:05d}"
     with open(self.paths['arff'] + f'PONZI_{str(self.J)}_day_{day}.arff',
               'w') as f:
         a2p.dump(self.df, f)
     tl.compute_time(self.cur_time)
     # Drop the transaction dictionary once it has been serialized.
     self.tr_dico = {}
示例#15
0
 def load_op(self):
     """Load contract address lists, the opcode vocabulary, bytecode sizes
     and opcode frequencies, then print per-opcode dataset statistics.

     op[p=0, np=1][index] = contract_address
     """
     print("Loading op, opcodes, op_freq, size_info...")
     # Sorted '.csv' stems so the ordering is stable across runs.
     self.op = [
         sorted([
             fname.split('.csv')[0]
             for fname in os.listdir(self.paths['database_op'])
             if fname.endswith('.csv')
         ]),
         sorted([
             fname.split('.csv')[0]
             for fname in os.listdir(self.paths['database_op_np'])
             if fname.endswith('.csv')
         ])
     ]
     self.opcodes = OPCODES
     # Bytecode file sizes (one feature per contract): ponzi first, then
     # non-ponzi, matching the row order of the feature matrix.
     for i in self.op[0]:
         self.size_info.append(
             os.path.getsize(self.paths['db'] + 'ponzi/bcode/' + i +
                             '.json'))
     for i in self.op[1]:
         self.size_info.append(
             os.path.getsize(self.paths['db'] + 'non_ponzi/bcode/' + i +
                             '.json'))
     with open(
             self.paths['db'] + 'op_freq_list.json',
             'rb',
     ) as f:
         self.op_freq = json.loads(f.read())
     # NOTE(review): load_op_freq() is called right after op_freq was read
     # from op_freq_list.json above; if it also assigns self.op_freq, the
     # file read is redundant — confirm against load_op_freq's definition.
     self.load_op_freq()
     # Statistics requested by Prof EJ: dataset dimensions and the average
     # frequency of every opcode per class.
     print(len(self.op_freq))
     print(len(self.op_freq[0]))
     print(len(self.op_freq[0][0]))
     print(len(self.opcodes))
     for tr_index in range(2):
         print(f"avg of {'Ponzi' if tr_index == 0 else 'Non-Ponzi'}")
         nums = [[] for i in range(len(self.opcodes))
                 ]  # 50 == len(original OPCODE)
         for contract in self.op_freq[tr_index]:
             for i in range(len(self.opcodes)):
                 nums[i].append(float(contract[i]))
         for i in range(len(self.opcodes)):
             print(f'{self.opcodes[i]}: {sum(nums[i]) / len(nums[i])}')
     self.cur_time = tl.compute_time(self.cur_time)
示例#16
0
    def load_tr_dico(self):
        """Read the serialized transaction dictionaries back into memory."""
        tr_dico = [[], []]
        with open(self.paths['db'] + 'tr_dico_ponzi.json', 'rb') as f:
            tr_dico[0] = json.loads(f.read())

        # Non-ponzi data is sharded: one file per 500 contracts.
        with open(self.paths['db'] + 'tr_dico_nonponzi0.json', 'rb') as f:
            tr_dico[1] = json.loads(f.read())
            print("Reading tr_dico: " + str(len(tr_dico[1])))
        num_shards = len(self.op[1]) // 500 + 1
        for shard in range(1, num_shards):
            shard_path = (self.paths['db'] + 'tr_dico_nonponzi' + str(shard)
                          + '.json')
            with open(shard_path, 'rb') as f:
                tr_dico[1] += json.loads(f.read())
                print("Reading tr_dico: " + str(len(tr_dico[1])))
        self.tr_dico = tr_dico
        self.cur_time = tl.compute_time(self.cur_time)
示例#17
0
    def define_path(self):
        """Register paths into the original Marion_files dataset layout."""
        print("Feature: define variable and load data")
        db = '../Marion_files/sm_database/'
        self.paths['db'] = db

        self.paths['database_nml'] = db + 'normal.json'
        self.paths['database_int'] = db + 'internal.json'
        # Same as opcode/raw_opcodes/ in origin feature.py
        self.paths['database_op'] = db + 'opcode/opcodes_count/'

        self.paths['database_nml_np'] = db + 'normal_np.json'
        self.paths['database_int_np'] = db + 'internal_np.json'
        self.paths['database_op_np'] = db + 'opcode_np/opcode_count/bytecode_np/'

        self.cur_time = tl.compute_time(self.cur_time)
示例#18
0
    def load_tr_dico(self):
        """Load tr_dico back from its serialized JSON shards.

        tr_dico[p=0, np=1][# of Contracts][nml=0, int=1] = list of tx dicts.
        """
        def read_json(name):
            # Shards were serialized with json.dumps; read them back raw.
            with open(self.paths['db'] + name, 'rb') as f:
                return json.loads(f.read())

        tr_dico = [read_json('tr_dico_ponzi.json'), []]
        tr_dico[1] = read_json('tr_dico_nonponzi0.json')
        print("Reading tr_dico: " + str(len(tr_dico[1])))
        # Remaining non-ponzi shards: one file per 500 contracts.
        for i in range(1, len(self.op[1]) // 500 + 1):
            tr_dico[1] += read_json('tr_dico_nonponzi' + str(i) + '.json')
            print("Reading tr_dico: " + str(len(tr_dico[1])))
        self.tr_dico = tr_dico
        self.cur_time = tl.compute_time(self.cur_time)
示例#19
0
    def gen_op_freq_origin(self):
        """Compute opcode frequencies from the original opcode-count dumps
        and serialize them to op_freq.json.

        op_freq[p=0, np=1] is a list of per-contract frequency vectors,
        ordered like self.op. The ponzi and non-ponzi loops were exact
        duplicates; they now share one helper.
        """
        def freq_for(db_key, addr):
            # One normalized per-opcode frequency vector for one contract.
            with open(self.paths[db_key] + addr + '.json', 'r') as f:
                raw = f.readlines()
            res = [0 for _ in range(len(self.opcodes))]
            if len(raw) > 1:
                tot = 0
                for line in raw:
                    # NOTE(review): line[3] is a single character, so any
                    # count >= 10 loses digits, and '=' (not '+=') overwrites
                    # repeated opcodes. Kept as-is to preserve the original
                    # numbers; confirm the dump format before changing.
                    count = float(line[3])
                    tot += count
                    res[self.opcodes.index(line[5:-1])] = count
            else:
                tot = 1
            return [x / tot for x in res]

        op_freq = [[], []]
        for tr_index, db_key in ((0, 'database_op'), (1, 'database_op_np')):
            for add in self.op[tr_index]:
                res = freq_for(db_key, add)
                op_freq[tr_index].append(res)
                print(res)

        tl.compute_time(self.cur_time)

        with open(self.paths['db'] + 'op_freq.json', 'w') as outfile:
            outfile.write(json.dumps(op_freq))
            print('op_freq serialized')
示例#20
0
 def cal_value_time_in_out(self):
     """Compute basic + opcode-frequency features for every contract.

     For each contract, transactions within the first self.J days of its
     life are split into incoming/outgoing value and timestamp series and
     fed to tl.basic_features; results accumulate into self.ft.
     """
     ft = []
     len_op = [len(self.op[0]), len(self.op[1])]
     for tr_index in range(2):
         # BUG FIX: '+' binds tighter than the conditional expression, so
         # the original printed 'computing features forponzi' for class 0
         # and just 'non ponzi' for class 1. Parenthesize the conditional.
         print('computing features for ' +
               ('ponzi' if tr_index == 0 else 'non ponzi'))
         for i in range(len_op[tr_index]):
             val_in = []
             val_out = []
             time_in = []
             time_out = []
             # First normal tx = contract creation (timestamps are asc).
             birth = float(self.tr_dico[tr_index][i][0][0]['timeStamp'])
             for tx in self.tr_dico[tr_index][i][0] + self.tr_dico[
                     tr_index][i][1]:
                 timestamp = float(tx['timeStamp'])
                 if (timestamp - birth) / (60 * 60 * 24) <= self.J:
                     if tx['from'] == '' or tx['from'] == self.op[tr_index][
                             i]:
                         # Sent by the contract itself: outgoing.
                         val_out.append(float(tx['value']))
                         time_out.append(timestamp)
                     else:
                         val_in.append(float(tx['value']))
                         time_in.append(timestamp)
             res = tl.basic_features(
                 'ponzi' if tr_index == 0 else 'non_ponzi',
                 np.asarray(val_in), np.asarray(val_out),
                 np.asarray(time_in), np.asarray(time_out))
             ft.append(
                 np.concatenate((res,
                                 np.asarray(self.op_freq[tr_index][i],
                                            dtype='float32'))))
         self.cur_time = tl.compute_time(self.cur_time)
     self.ft = ft
示例#21
0
    def cal_advanced_features(self):
        """Compute basic + opcode features for every contract.

        Fills three parallel row lists:
          self.ft         -- basic features followed by opcode frequencies,
          self.ft_opcodes -- class label followed by opcode frequencies,
          self.ft_basic   -- basic features only.
        Prints per-class averages of nbr_tx_in and lifetime at the end.
        """
        ft = []
        ft_opcodes = []
        ft_basic = []
        len_op = [len(self.op[0]), len(self.op[1])]
        # Summary accumulators: res[1] = nbr_tx_in, res[16] = lifetime
        # (see the index legend comment below).
        nbrs = [[], []]
        lifes = [[], []]
        for tr_index in range(2):
            print('computing features for ' +
                  ('ponzi' if tr_index == 0 else 'non ponzi'))
            for i in range(len_op[tr_index]):
                # for each contract
                val_in = []
                val_out = []
                time_in = []
                time_out = []
                pay_in = 0
                pay_out = 0
                addr_in = set()
                addr_out = set()

                # Birth = timestamp of the first normal transaction.
                birth = float(self.tr_dico[tr_index][i][0][0]['timeStamp'])
                for tx in self.tr_dico[tr_index][i][0] + self.tr_dico[
                        tr_index][i][1]:
                    # for each tx of that contract
                    contract_hash = self.op[tr_index][i]
                    timestamp = float(tx['timeStamp'])
                    # Value/time series only consider the first J days; the
                    # helper mutates the val_*/time_* lists in place.
                    if (timestamp - birth) / (60 * 60 * 24) <= self.J:
                        self.cal_value_time_in_out({
                            'tx': tx,
                            'contract_hash': contract_hash,
                            'val_in': val_in,
                            'val_out': val_out,
                            'time_in': time_in,
                            'time_out': time_out,
                            'timestamp': timestamp
                        })
                    # Address/payment counters are updated for every tx,
                    # regardless of the J-day window.
                    (pay_in, pay_out) = self.cal_addr_in_out({
                        'tx':
                        tx,
                        'contract_hash':
                        contract_hash,
                        'pay_in':
                        pay_in,
                        'pay_out':
                        pay_out,
                        'addr_in':
                        addr_in,
                        'addr_out':
                        addr_out
                    })
                num_overlap_addr = len(addr_in.intersection(addr_out))
                res = tl.basic_features({
                    'ponzi': 'ponzi' if tr_index == 0 else 'non_ponzi',
                    'val_in': np.asarray(val_in),
                    'val_out': np.asarray(val_out),
                    'time_in': np.asarray(time_in),
                    'time_out': np.asarray(time_out),
                    'pay_in': pay_in,
                    'pay_out': pay_out,
                    'num_overlap_addr': num_overlap_addr
                })
                # Index legend into res:
                # gini: 12 in 13 out of 15 timeout
                # 1 nbr_tx_in, 16 lifetime
                # CALLDATACOPY, CODECOPY, SWAP3, SSTORE, DUP6, SWAP6, REVERT, SSTORE

                nbrs[tr_index].append(float(res[1]))
                lifes[tr_index].append(float(res[16]))
                ft.append(
                    np.concatenate((res,
                                    np.asarray(self.op_freq[tr_index][i],
                                               dtype='float64'))))
                ft_opcodes.append(
                    np.concatenate((np.asarray([
                        'ponzi' if tr_index == 0 else 'non_ponzi'
                    ]), np.asarray(self.op_freq[tr_index][i],
                                   dtype='float64'))))
                ft_basic.append(res)
            self.cur_time = tl.compute_time(self.cur_time)
        self.ft = ft
        self.ft_opcodes = ft_opcodes
        self.ft_basic = ft_basic
        print('nbrs:')
        print(
            f'P={sum(nbrs[0]) / len(nbrs[0])}, NP={sum(nbrs[1]) / len(nbrs[1])}'
        )
        print('lifes:')
        print(
            f'P={sum(lifes[0]) / len(lifes[0])},  NP={sum(lifes[1]) / len(lifes[1])}'
        )
示例#22
0
    def cal_advanced_features(self):
        """Compute basic + opcode features for every contract and collect
        avg/stdev statistics of selected discriminative features.

        Fills the parallel row lists self.ft / self.ft_opcodes /
        self.ft_basic and stores the per-class statistic lists in
        self.top_30_avg_stdev, whose avg/stdev are printed at the end.
        """
        ft = []
        ft_opcodes = []
        ft_basic = []
        len_op = [len(self.op[0]), len(self.op[1])]
        # Summary accumulators: res[1] = nbr_tx_in, res[16] = lifetime
        # (see the index legend comment below).
        nbrs = [[], []]
        lifes = [[], []]
        # One dict per class; each key collects that feature's value for
        # every contract so avg/stdev can be reported after the loop.
        top_30_avg_stdev = [{
            'size_info': [],
            'nbr_tx_in': [],
            'lifetime': [],
            'num_paid_in_addr': [],
            'gini_in': [],
            'overlap_in_out_addr': [],
            'gini_time_out': [],
            'avg_time_btw_tx': []
        }, {
            'size_info': [],
            'nbr_tx_in': [],
            'lifetime': [],
            'num_paid_in_addr': [],
            'gini_in': [],
            'overlap_in_out_addr': [],
            'gini_time_out': [],
            'avg_time_btw_tx': []
        }]
        for tr_index in range(2):
            print('computing features for ' +
                  ('ponzi' if tr_index == 0 else 'non ponzi'))
            for i in range(len_op[tr_index]):
                # for each contract
                val_in = []
                val_out = []
                time_in = []
                time_out = []
                pay_in = 0
                pay_out = 0
                addr_in = set()
                addr_out = set()
                # The timeStamp of all nml TXs is sorted asc, so the first normal TX is the TX creates the contract.
                # That's from Charles. I checked the timeStamps of all nml TXs, and they were asc sorted.
                birth = float(self.tr_dico[tr_index][i][0][0]['timeStamp'])
                for tx in self.tr_dico[tr_index][i][0] + self.tr_dico[
                        tr_index][i][1]:
                    # for each tx of that contract
                    contract_hash = self.op[tr_index][i]
                    timestamp = float(tx['timeStamp'])
                    # Value/time series only consider the first J days; the
                    # helper mutates the val_*/time_* lists in place.
                    if (timestamp - birth) / (60 * 60 * 24) <= self.J:
                        self.cal_value_time_in_out({
                            'tx': tx,
                            'contract_hash': contract_hash,
                            'val_in': val_in,
                            'val_out': val_out,
                            'time_in': time_in,
                            'time_out': time_out,
                            'timestamp': timestamp
                        })
                    # Address/payment counters are updated for every tx,
                    # regardless of the J-day window.
                    (pay_in, pay_out) = self.cal_addr_in_out({
                        'tx':
                        tx,
                        'contract_hash':
                        contract_hash,
                        'pay_in':
                        pay_in,
                        'pay_out':
                        pay_out,
                        'addr_in':
                        addr_in,
                        'addr_out':
                        addr_out
                    })
                num_overlap_addr = len(addr_in.intersection(addr_out))
                res = tl.basic_features({
                    'ponzi': 'ponzi' if tr_index == 0 else 'non_ponzi',
                    'val_in': np.asarray(val_in),
                    'val_out': np.asarray(val_out),
                    'time_in': np.asarray(time_in),
                    'time_out': np.asarray(time_out),
                    'pay_in': pay_in,
                    'pay_out': pay_out,
                    'num_overlap_addr': num_overlap_addr
                })
                # Index legend into res:
                # gini: 12 in 13 out 15 timeout
                # 1 nbr_tx_in, 16 lifetime
                # CALLDATACOPY, CODECOPY, SWAP3, SSTORE, DUP6, SWAP6, REVERT, SSTORE
                nbrs[tr_index].append(float(res[1]))
                lifes[tr_index].append(float(res[16]))

                top_30_avg_stdev[tr_index]['nbr_tx_in'].append(res[1])
                top_30_avg_stdev[tr_index]['lifetime'].append(res[16])
                top_30_avg_stdev[tr_index]['num_paid_in_addr'].append(res[5])
                top_30_avg_stdev[tr_index]['gini_in'].append(res[12])
                top_30_avg_stdev[tr_index]['overlap_in_out_addr'].append(
                    res[7])
                top_30_avg_stdev[tr_index]['gini_time_out'].append(res[15])
                top_30_avg_stdev[tr_index]['avg_time_btw_tx'].append(res[14])

                ft.append(
                    np.concatenate((res,
                                    np.asarray(self.op_freq[tr_index][i],
                                               dtype='float64'))))
                ft_opcodes.append(
                    np.concatenate((np.asarray([
                        'ponzi' if tr_index == 0 else 'non_ponzi'
                    ]), np.asarray(self.op_freq[tr_index][i],
                                   dtype='float64'))))
                ft_basic.append(res)
            tl.compute_time(self.cur_time)
        self.ft = ft
        self.ft_opcodes = ft_opcodes
        self.ft_basic = ft_basic
        self.top_30_avg_stdev = top_30_avg_stdev
        print("nbrs==========================")
        print(
            f"P={sum(nbrs[0]) / len(nbrs[0])}, NP={sum(nbrs[1]) / len(nbrs[1])}"
        )
        print("lifes==========================")
        print(
            f"P={sum(lifes[0]) / len(lifes[0])}, NP={sum(lifes[1]) / len(lifes[1])}"
        )

        # size_info is filled here (not in the loop) from bytecode file sizes.
        for i in self.op[0]:
            top_30_avg_stdev[0]['size_info'].append(
                os.path.getsize(self.paths['db'] + 'ponzi/bcode/' + i +
                                '.json'))
        for i in self.op[1]:
            top_30_avg_stdev[1]['size_info'].append(
                os.path.getsize(self.paths['db'] + 'non_ponzi/bcode/' + i +
                                '.json'))
        for tr_index in range(2):
            print(f'tr_index={tr_index}')
            for name, data in top_30_avg_stdev[tr_index].items():
                print(f"name={name}, len={len(data)}")
                # NOTE(review): avg coerces each entry with float(), but
                # statistics.stdev(data) receives the raw entries — if res
                # values are strings this raises; confirm tl.basic_features'
                # return dtype.
                avg = sum([float(each) for each in data]) / len(data)
                print(
                    f"\tname={name}, avg={avg}, stdev={statistics.stdev(data)}"
                )
示例#23
0
 def cal_advanced_features(self):
     """Compute basic + opcode features for every contract.

     Fills three parallel row lists:
       self.ft         -- basic features followed by opcode frequencies,
       self.ft_opcodes -- class label followed by opcode frequencies,
       self.ft_basic   -- basic features only.
     """
     ft = []
     ft_opcodes = []
     ft_basic = []
     len_op = [len(self.op[0]), len(self.op[1])]
     for tr_index in range(2):
         print('computing features for ' +
               ('ponzi' if tr_index == 0 else 'non ponzi'))
         for i in range(len_op[tr_index]):
             # for each contract
             val_in = []
             val_out = []
             time_in = []
             time_out = []
             pay_in = 0
             pay_out = 0
             addr_in = set()
             addr_out = set()
             # Birth = timestamp of the first normal transaction.
             birth = float(self.tr_dico[tr_index][i][0][0]['timeStamp'])
             for tx in self.tr_dico[tr_index][i][0] + self.tr_dico[
                     tr_index][i][1]:
                 # for each tx of that contract
                 contract_hash = self.op[tr_index][i]
                 timestamp = float(tx['timeStamp'])
                 # Value/time series only consider the first J days; the
                 # helper mutates the val_*/time_* lists in place.
                 if (timestamp - birth) / (60 * 60 * 24) <= self.J:
                     self.cal_value_time_in_out({
                         'tx': tx,
                         'contract_hash': contract_hash,
                         'val_in': val_in,
                         'val_out': val_out,
                         'time_in': time_in,
                         'time_out': time_out,
                         'timestamp': timestamp
                     })
                 # Address/payment counters are updated for every tx,
                 # regardless of the J-day window.
                 (pay_in, pay_out) = self.cal_addr_in_out({
                     'tx':
                     tx,
                     'contract_hash':
                     contract_hash,
                     'pay_in':
                     pay_in,
                     'pay_out':
                     pay_out,
                     'addr_in':
                     addr_in,
                     'addr_out':
                     addr_out
                 })
             num_overlap_addr = len(addr_in.intersection(addr_out))
             res = tl.basic_features({
                 'ponzi': 'ponzi' if tr_index == 0 else 'non_ponzi',
                 'val_in': np.asarray(val_in),
                 'val_out': np.asarray(val_out),
                 'time_in': np.asarray(time_in),
                 'time_out': np.asarray(time_out),
                 'pay_in': pay_in,
                 'pay_out': pay_out,
                 'num_overlap_addr': num_overlap_addr
             })
             ft.append(
                 np.concatenate((res,
                                 np.asarray(self.op_freq[tr_index][i],
                                            dtype='float64'))))
             ft_opcodes.append(
                 np.concatenate((np.asarray([
                     'ponzi' if tr_index == 0 else 'non_ponzi'
                 ]), np.asarray(self.op_freq[tr_index][i],
                                dtype='float64'))))
             ft_basic.append(res)
         self.cur_time = tl.compute_time(self.cur_time)
     self.ft = ft
     self.ft_opcodes = ft_opcodes
     self.ft_basic = ft_basic