def process_data():
    """Convert the string data to indices, then save them."""
    with codecs.open(cg.DATA_PATH, 'r', 'utf-8') as file:
        data = file.read().split('\n')
        if len(data[-1]) == 0:
            _error('The last line, which is empty, has been removed.')
            data = data[:-1]

    questions = []
    answers = []
    for line in data:
        line_split = line.split('=')
        que, ans = line_split[0], line_split[1]
        questions.append(que)
        answers.append(ans)
    assert len(questions) == len(answers), \
        _error('The number of questions: {} does not equal the number of answers: {}'.format(
            len(questions), len(answers)))

    que_idx = [str_to_idx(que) for que in questions]
    ans_idx = [str_to_idx(ans) for ans in answers]

    Path('processed_data/').mkdir(exist_ok=True)
    with codecs.open('processed_data/questions.bin', 'wb') as file:
        pickle.dump(que_idx, file)
    with codecs.open('processed_data/answers.bin', 'wb') as file:
        pickle.dump(ans_idx, file)

    _info('Converted questions and answers have been saved into the `processed_data` directory.')
def find_moduledata():
    pModuleData = 0

    if common.check_is_stripped():
        log._info("binary is not stripped!")
        for addr, name in idautils.Names():
            if name == "runtime.firstmoduledata":
                pModuleData = addr
                break
    else:
        log._info("binary is stripped..")
        log._info("Now find the moduledata by using brute force searching")
        GO1_16_MAGIC = 0xFFFFFFFA  # go 1.16 pclntab magic
        text_section = common.get_segment_addr_by_name(name=".text")
        rdata_section = common.get_segment_addr_by_name(name=".rdata")
        data_section = common.get_segment_addr_by_name(name=".data")

        sections = [(".text", text_section), (".rdata", rdata_section), (".data", data_section)]
        for sec_name, section_addr in sections:
            cur_addr = section_addr
            next_section_addr = common.get_next_segment_addr(addr=cur_addr)
            pModuleData = find_module_data_bruteforce(start_addr=section_addr,
                                                      break_addr=next_section_addr,
                                                      magic=GO1_16_MAGIC)
            if pModuleData != 0:
                log._info("ModuleData structure located at [%s] - @0x%x" % (sec_name, pModuleData))
                break

    if pModuleData == 0:
        log._error("Cannot find the ModuleData structure in the current binary...")

    return pModuleData
def parse_pclntable(module_data):
    pPcHeader = module_data.pPcHeader
    pc_header = parse_pc_header(pMem=pPcHeader)
    ptrSize = pc_header.ptrSize
    numberOfFuncs = pc_header.nFunc
    log._info("Number of Functions : %d" % numberOfFuncs)

    pclntable_start = module_data.pPclnTable
    cur_addr = pclntable_start
    for idx in range(numberOfFuncs):
        cur_addr = pclntable_start + (2 * ptrSize) * idx
        func_rva = common.mem_read_integer(addr=cur_addr, read_size=ptrSize)
        _func_structure_offset = common.mem_read_integer(addr=cur_addr + ptrSize, read_size=ptrSize)
        _func_addr = pclntable_start + _func_structure_offset

        if not idc.GetFunctionName(func_rva):
            log._info("Unk Func @0x%x" % func_rva)
            idc.MakeUnkn(func_rva, idc.DOUNK_EXPAND)
            idaapi.autoWait()
            idc.MakeCode(func_rva)
            idaapi.autoWait()
            if idc.MakeFunction(func_rva):
                idaapi.autoWait()
                log._info("Create Func @0x%x" % func_rva)

        _func = parse__func(pMem=_func_addr)
        # args = _func.args
        # func_id = _func.args
        func_name_addr = module_data.pFuncNameTable + _func.nameoff
        func_name = idc.GetString(func_name_addr)
        if func_name:
            clean_func_name = utils.clean_function_name(func_name)
            log._info("@0x%x Name : [%s]" % (func_rva, func_name))
            idc.MakeComm(func_rva, "@0x%x entry" % func_rva)
            idaapi.autoWait()

            if idc.MakeStr(func_name_addr, func_name_addr + len(func_name) + 1):
                idaapi.autoWait()
            else:
                log._error("@0x%x Name : [%s] Failed..." % (func_rva, func_name))

            _func_addr = idaapi.get_func(func_rva)
            if _func_addr is not None:
                if idc.MakeNameEx(_func_addr.startEA, func_name, flags=idaapi.SN_FORCE):
                    idaapi.autoWait()
                    log._info("@0x%x Name : [%s]" % (func_rva, func_name))
                else:
                    log._error("@0x%x Name : [%s] Failed..." % (func_rva, func_name))
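# ---------------------------------------------------------------------------
# Usage sketch (assumption, not from the original script): a typical driver
# runs inside IDA and chains the two helpers above. `parse_module_data` is a
# hypothetical name for whatever parses the raw moduledata structure into the
# object that parse_pclntable expects.
# pModuleData = find_moduledata()
# if pModuleData != 0:
#     module_data = parse_module_data(pModuleData)
#     parse_pclntable(module_data)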
def __init__(self, config, is_training, input_text, input_image, scope=None):
    """Constructor for the EANN model.

    Args:
        config: Config object, the hyperparameter set.
        is_training: Boolean, whether in training mode or not.
        input_text: tf.int32 Tensor, [batch_size, seq_length].
        input_image: tf.float32 Tensor, [batch_size, h, w, c].
    """
    # config
    config = copy.deepcopy(config)

    # textCNN config
    self.vocab_size = config.vocab_size
    self.embedding_size = config.embedding_size
    self.window_size = config.window_size
    self.pool_size = config.pool_size
    self.filter_number_text = config.filter_number_text
    self.seq_length = config.max_length

    # VGG-19
    try:
        self.vgg = tf.keras.applications.VGG19(input_shape=(224, 224, 3),
                                               include_top=False,
                                               weights='imagenet')
        _info('Successfully loaded the pre-trained VGG-19 weights.')
    except Exception:
        _error('Please download the VGG-19 weights from:\n{}\nthen put the file '
               'into ~/.keras/models'.format(_cg.VGG_19_Weights_Download_URL))
        raise
    self.vgg.trainable = False  # do not train the pre-trained VGG parameters

    # global config
    self.hidden_size = config.hidden_size
    self.num_classes = config.num_classes
    self.num_domains = config.num_domains

    # basic config
    self.initializer_range = config.initializer_range
    self.dropout = config.dropout
    if not is_training:
        self.dropout = 0.0

    # build the graph
    self.label_output, self.domain_output, self.batch_size = self.build(input_text, input_image)
def _embedding_positional(self, pos_type, embedded_input, embedding_size,
                          dropout_prob, name=None, max_position_embedding=100):
    """Add positional embeddings to the original embeddings.

    Args:
        pos_type: the positional type to use, either 'normal' or 'trigonometrical'.
        embedded_input: original embeddings, [batch_size, seq_length, embedding_size].
        embedding_size: embedding size.
        dropout_prob: dropout probability, refers to the 'rate' parameter in tf.nn.dropout().
        max_position_embedding: for the 'normal' type, the model learns a new positional
            matrix, so a maximum sequence length has to be set.

    Returns:
        output: identical type and shape to the embedded input.
    """
    assert_op = tf.assert_less_equal(self.input_length, max_position_embedding)
    self.pos_type = pos_type
    with tf.control_dependencies([assert_op]):
        # select sin & cos or normal (learned) positional embedding
        if pos_type == 'normal':
            positional_embeddings = tf.get_variable(
                name='positional_embeddings',
                shape=[max_position_embedding, embedding_size],
                dtype=tf.float32)
            # slice the positional embeddings according to the actual length
            ac_pos_embed = tf.slice(positional_embeddings, [0, 0], [self.input_length, -1])
            embedded_input += ac_pos_embed
        elif pos_type == 'trigonometrical':
            self.positional_embeddings = tf.placeholder(
                dtype=tf.float32,
                shape=[None, None],
                name='positional_embeddings')
            positional_embeddings = tf.convert_to_tensor(self.positional_embeddings)
            embedded_input += positional_embeddings
        else:
            _error('unknown positional type <{}>'.format(pos_type), head='ERROR')
            raise ValueError

        output = self._layer_norm_and_dropout(embedded_input, dropout_prob, name)
    return output
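# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): one common way to
# build the matrix fed into the 'trigonometrical' branch above is the sin/cos
# encoding from "Attention Is All You Need". The helper below is a hypothetical
# NumPy implementation; the name `sinusoidal_position_encoding` and the feed
# example are assumptions about how the placeholder is used.
import numpy as np

def sinusoidal_position_encoding(seq_length, embedding_size):
    """Return a [seq_length, embedding_size] float32 matrix of sin/cos encodings."""
    positions = np.arange(seq_length)[:, np.newaxis]      # [seq_length, 1]
    dims = np.arange(embedding_size)[np.newaxis, :]       # [1, embedding_size]
    angle_rates = 1.0 / np.power(10000.0, (2 * (dims // 2)) / np.float32(embedding_size))
    angles = positions * angle_rates                      # [seq_length, embedding_size]
    encoding = np.zeros_like(angles, dtype=np.float32)
    encoding[:, 0::2] = np.sin(angles[:, 0::2])           # even dimensions: sin
    encoding[:, 1::2] = np.cos(angles[:, 1::2])           # odd dimensions: cos
    return encoding

# Example feed (assuming a session `sess` and the placeholder defined above):
# sess.run(output, feed_dict={model.positional_embeddings:
#                             sinusoidal_position_encoding(seq_length, embedding_size)})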
def de_func_inner():
    data_path = Path(__file__).absolute().parent / 'processed_data'
    files = list(data_path.rglob('*.bin'))

    if need_exit:
        if len(files) != 2:
            _error('No data exists.')
            raise FileNotFoundError
    else:
        if len(files) > 0:
            _error('The data exists.')
            raise FileExistsError

    if func.__name__ == 'process_data':
        func()
    else:
        for data in func():
            yield data
def train_generator():
    """Yield batches of (features, labels) for training."""
    # load the data
    with codecs.open(pos_data_path, 'rb') as file_p, \
         codecs.open(neg_data_path, 'rb') as file_n:
        pos_data = pickle.load(file_p)
        neg_data = pickle.load(file_n)
    assert len(pos_data) == len(neg_data), _error('Data distribution uneven.', head='ERROR')

    # shuffle the data
    train_data = pos_data + neg_data
    random.shuffle(train_data)

    # create batches
    for (start, end) in provide_batch_idx(len(train_data), batch_size):
        data_batch = train_data[start:end]
        sentences = [data[1] for data in data_batch]
        labels = [data[0] for data in data_batch]

        sentences_idx = list(map(process_line, sentences))
        sentences_idx_padded = padding_data(sentences_idx)
        input_mask = list(map(make_mask, sentences_idx_padded))

        features = {'input_data': sentences_idx_padded, 'input_mask': input_mask}
        yield (features, labels)
def select_initializer(itype=None, seed=None, init_weight=0.01):
    if itype.upper() == 'UNIFORM':
        return tf.random_uniform_initializer(-init_weight, init_weight, seed=seed)
    elif itype.upper() == 'GLOROT_N':
        return tf.contrib.keras.initializers.glorot_normal(seed=seed)
    elif itype.upper() == 'GLOROT_U':
        return tf.contrib.keras.initializers.glorot_uniform(seed=seed)
    elif itype.upper() == 'RANDOM':
        return tf.random_normal_initializer(mean=0.0, stddev=init_weight, seed=seed, dtype=tf.float32)
    else:
        _error('Unsupported initializer <{}>'.format(itype), head='ERROR')
        raise ValueError
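# ---------------------------------------------------------------------------
# Usage sketch (assumption): the returned object is a TF 1.x initializer and
# can be handed to tf.get_variable; the variable name and shape are examples.
# initializer = select_initializer(itype='glorot_u', seed=42)
# weights = tf.get_variable('weights', shape=[128, 64], initializer=initializer)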
def create_or_load(model, ckpt_path, session, force=False):
    """Create a new model or load from an existing checkpoint."""
    dir_path = '/'.join(ckpt_path.split('/')[:-1])
    latest_ckpt = tf.train.latest_checkpoint(dir_path)
    if latest_ckpt and not force:
        try:
            model.saver.restore(session, latest_ckpt)
        except Exception as e:
            _error(e, head='ERROR')
            raise e
        _info('successfully loaded model from <{}>'.format(latest_ckpt), head='INFO')
    else:
        session.run(tf.global_variables_initializer())
        session.run(tf.local_variables_initializer())
        session.run(tf.tables_initializer())
        _info('successfully created a new model', head='INFO')

    global_step = model.global_step.eval(session=session)
    return model, global_step
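# ---------------------------------------------------------------------------
# Usage sketch (assumption): `model` is expected to expose `saver` (a
# tf.train.Saver) and `global_step`; the checkpoint prefix is hypothetical.
# with tf.Session() as sess:
#     model, global_step = create_or_load(model, 'models/chatbot.ckpt', sess)
#     _info('training resumes from step {}'.format(global_step))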
def parse_func_pointer():
    renamed = 0

    for segea in idautils.Segments():
        for addr in idautils.Functions(segea, idc.SegEnd(segea)):
            # for addr in idautils.Functions(text_seg.startEA, text_seg.endEA):
            name = idc.GetFunctionName(addr)

            # Look at data xrefs to the function - find the pointer that is located in .rodata
            data_ref = idaapi.get_first_dref_to(addr)
            while data_ref != idc.BADADDR:
                if 'rodata' in idc.get_segm_name(data_ref):
                    # Only rename things that are currently listed as an offset; e.g. off_9120B0
                    if 'off_' in idc.GetTrueName(data_ref):
                        if idc.MakeNameEx(data_ref, ('%s_ptr' % name), flags=idaapi.SN_FORCE):
                            idaapi.autoWait()
                            renamed += 1
                        else:
                            log._error('Failed to name pointer @ 0x%02x for %s' % (data_ref, name))
                data_ref = idaapi.get_next_dref_to(addr, data_ref)
def save_to_binary(data, save_path, replace=False):
    """Convert the data to a binary file and save it.

    Args:
        data: object, the original data.
        save_path: str, the absolute path to save the data.
        replace: boolean, whether to replace the file when it already exists.
    """
    # change the str path to a PosixPath
    save_path = Path(save_path)

    # check whether the file already exists
    if save_path.is_file():
        if not replace:
            _error('{} already exists.'.format(save_path), head='ERROR')
            raise FileExistsError
        else:
            _info('{} already exists, replaced.'.format(save_path))

    with codecs.open(save_path, 'wb') as file:
        pickle.dump(data, file)
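# ---------------------------------------------------------------------------
# Usage sketch (assumption): the path below is hypothetical.
# save_to_binary([[1, 2, 3], [4, 5]], '/tmp/processed_data/example.bin', replace=True)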
def train_generator():
    """generator to yield data."""
    # load the data
    with codecs.open(pos_data_path, 'rb') as file_p, \
         codecs.open(neg_data_path, 'rb') as file_n:
        pos_data = pickle.load(file_p)
        neg_data = pickle.load(file_n)
    assert len(pos_data) == len(neg_data), _error('Data distribution uneven.', head='ERROR')

    # shuffle the data
    train_data = pos_data + neg_data
    random.shuffle(train_data)

    # create batches
    for (start, end) in provide_batch_idx(len(train_data), batch_size):
        data_batch = train_data[start:end]
        yield extract_features(data_batch, False)
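# ---------------------------------------------------------------------------
# Usage sketch (assumption): `pos_data_path`, `neg_data_path` and `batch_size`
# are module-level globals in the original file; the generator can be consumed
# directly in a training loop.
# for features, labels in train_generator():
#     print(len(features['input_data']), len(labels))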
def __setattr__(self, name, value):
    if hasattr(self, name):
        wrapped_setatrr(self, name, value)
    else:
        _error('Adding new attribute {} is forbidden'.format(name))
        raise AttributeError
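# ---------------------------------------------------------------------------
# Context sketch (assumption, not from the original code): this guard is
# usually paired with the original setter kept as `wrapped_setatrr`, so that
# existing attributes stay writable while new ones are rejected. A minimal,
# self-contained version of the pattern; the class name and attribute are
# hypothetical, and `_error` is assumed to be the module's logger.
wrapped_setatrr = object.__setattr__

class FrozenConfig:
    def __init__(self, hidden_size):
        # bypass the guard while the object is being initialized
        wrapped_setatrr(self, 'hidden_size', hidden_size)

    def __setattr__(self, name, value):
        if hasattr(self, name):
            wrapped_setatrr(self, name, value)  # existing attribute: allow update
        else:
            _error('Adding new attribute {} is forbidden'.format(name))
            raise AttributeError

# config = FrozenConfig(hidden_size=128)
# config.hidden_size = 256   # allowed
# config.num_layers = 4      # raises AttributeError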
PATTERN = r'^(loss = )\d{1,5}\.\d{1,10}'
PATTERN_2 = r'(ppl = )\d{1,5}\.\d{1,10}'


def extract(log_path, save_path, save_path_2):
    with codecs.open(log_path, 'r', 'utf-8') as file, \
         codecs.open(save_path, 'w', 'utf-8') as file_2, \
         codecs.open(save_path_2, 'w', 'utf-8') as file_3:
        for line in file:
            if re.search(PATTERN, line):
                match = re.search(PATTERN, line).group()
                loss = match.split(' ')[2]
                file_2.write('sup_avg:' + loss + '\n')
                file_2.flush()
            if re.search(PATTERN_2, line):
                match = re.search(PATTERN_2, line).group()
                ppl = match.split(' ')[2]
                file_3.write('sup_avg:' + ppl + '\n')
                file_3.flush()

    _info('The loss records have been saved to {}.'.format(save_path))


if __name__ == '__main__':
    if len(sys.argv) < 4:
        _error('Please specify the log path and the two save paths.')
        raise ValueError
    else:
        log_path = sys.argv[1]
        save_path = sys.argv[2]
        save_path_2 = sys.argv[3]
        extract(log_path, save_path, save_path_2)
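# ---------------------------------------------------------------------------
# Usage sketch (assumption): the script and file names below are hypothetical.
#   python extract_loss.py train.log loss_record.txt ppl_record.txt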
def train_generator():
    """Generator that pads the data within each batch."""
    # load the data
    with codecs.open('processed_data/questions.bin', 'rb') as file:
        questions = pickle.load(file)
    with codecs.open('processed_data/answers.bin', 'rb') as file:
        answers = pickle.load(file)
    assert len(questions) == len(answers), \
        _error('The number of questions: {} does not equal the number of answers: {}'.format(
            len(questions), len(answers)))

    # randomly shuffle the data
    questions_answers = list(zip(questions, answers))
    random.shuffle(questions_answers)
    questions, answers = zip(*questions_answers)
    questions = list(questions)
    answers = list(answers)

    que_batch = []
    input_ans_batch = []
    output_ans_batch = []
    seq_length_decoder_input_data = []
    batch_num = len(questions) // batch_size

    for idx, que in enumerate(questions):
        if len(que_batch) < batch_size:
            # question
            que_batch.append(que)
            # answer
            inp_ans = copy.deepcopy(answers[idx])
            out_ans = copy.deepcopy(answers[idx])
            inp_ans.insert(0, sos_id)
            out_ans.append(eos_id)
            input_ans_batch.append(inp_ans)
            output_ans_batch.append(out_ans)
            seq_length_decoder_input_data.append(len(inp_ans))

            # check whether the batch is full
            if len(que_batch) == batch_size:
                que_batch_padded, inp_ans_batch_padded, out_ans_batch_padded, mask = padding_data(
                    que_batch, input_ans_batch, output_ans_batch)
                features = {'input_x': que_batch_padded,
                            'input_mask': mask,
                            'input_y': inp_ans_batch_padded,
                            'seq_length': seq_length_decoder_input_data}
                yield (features, out_ans_batch_padded)

                que_batch = []
                input_ans_batch = []
                output_ans_batch = []
                seq_length_decoder_input_data = []

        # the remaining data cannot fill a whole batch,
        # so augment it with randomly chosen questions
        if idx > (batch_num * batch_size - 1) and len(questions) % batch_size != 0:
            que_batch = copy.deepcopy(questions[idx:])
            input_ans_batch = copy.deepcopy(answers[idx:])
            output_ans_batch = copy.deepcopy(answers[idx:])

            for _ in range(batch_size - len(que_batch)):
                aug_que = random.choice(questions)
                aug_que_idx = questions.index(aug_que)
                que_batch.append(aug_que)
                input_ans_batch.append(copy.deepcopy(answers[aug_que_idx]))
                output_ans_batch.append(copy.deepcopy(answers[aug_que_idx]))

            # add sos / eos once for every answer in the final batch
            for i, _ in enumerate(input_ans_batch):
                input_ans_batch[i].insert(0, sos_id)
            for i, _ in enumerate(output_ans_batch):
                output_ans_batch[i].append(eos_id)
            seq_length_decoder_input_data = [len(ans) for ans in input_ans_batch]

            assert len(que_batch) == len(input_ans_batch) == len(output_ans_batch) == batch_size

            que_batch_padded, inp_ans_batch_padded, out_ans_batch_padded, mask = padding_data(
                que_batch, input_ans_batch, output_ans_batch)
            features = {'input_x': que_batch_padded,
                        'input_mask': mask,
                        'input_y': inp_ans_batch_padded,
                        'seq_length': seq_length_decoder_input_data}
            yield (features, out_ans_batch_padded)
            break
def no_mask(data):
    def select_useful_sentiment(scores):
        """Return either a positive tag or a negative tag for each word."""
        pos_score, neg_score = scores[0], scores[1]
        return 1 if pos_score > neg_score else 0

    # clean data
    data = data.replace('<br />', ' ')
    # split into sentences
    sentences_set = sent_tokenize(data)

    data_final = []
    word_polarity_labels = []
    mask_indices = []
    preb_sentence_length = 0  # this is used for shifting mask_indices

    for sentence in sentences_set:
        # tokenize (stemming disabled)
        sentence_tokenized = word_tokenize(sentence)
        # sentence_stem = [ps.stem(v) for v in sentence_tokenized]
        sentence_stem = [v for v in sentence_tokenized]
        # POS tag
        sentence_tagged = nltk.pos_tag(sentence_stem)
        # get the necessary sentiment for each word
        sentence_sentiment = [get_sentiment(v, p) for (v, p) in sentence_tagged]

        # keep the words that have a sentiment
        selected_inputs_initial_step = [
            sentence_tokenized[i] for i, item in enumerate(sentence_sentiment) if len(item) > 0]
        selected_sentiment_initial_step = [
            item for item in sentence_sentiment if len(item) > 0]
        assert len(selected_inputs_initial_step) == len(selected_sentiment_initial_step), \
            _error('Length mismatch.')

        # select the words that have a positive or negative score, then keep the bigger score
        selected_inputs = [
            selected_inputs_initial_step[i]
            for i, item in enumerate(selected_sentiment_initial_step) if item[2] != 1.0]
        selected_sentiment_mid_step = [
            item for item in selected_sentiment_initial_step if item[2] != 1.0]
        selected_sentiment = list(map(select_useful_sentiment, selected_sentiment_mid_step))

        # save the indices so that [SEP] and [PAD] will not be considered when calculating the loss
        mask_indices.extend([preb_sentence_length + i for i in range(len(selected_inputs))])
        assert len(selected_inputs) == len(selected_sentiment), \
            _error('The lengths of inputs and sentiment mismatch.')

        if len(selected_inputs) == 0:
            continue

        data_temp = []
        for vocab in selected_inputs:
            if vocab in vocab_idx:
                data_temp.append(vocab_idx[vocab])
            else:
                data_temp.append(vocab_idx['[UNK]'])
        data_temp.append(vocab_idx['[SEP]'])

        # increase the length here because of the [SEP]:
        # the mask indices could be [0, 1, 2, 4, 5], where 3 refers to [SEP];
        # the gathered sequence output should be [0, 1, 2, 4, 5];
        # the labels are [X, X, X, X, X]
        preb_sentence_length += len(data_temp)
        data_final.extend(data_temp)
        word_polarity_labels.extend(selected_sentiment)

    data_final.insert(0, vocab_idx['[CLS]'])
    return (data_final, word_polarity_labels, mask_indices)
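# ---------------------------------------------------------------------------
# Usage sketch (assumption): requires the NLTK 'punkt' tokenizer and POS
# tagger data, plus the module-level `vocab_idx` and `get_sentiment` helpers
# from the original file.
# ids, polarity_labels, mask_idx = no_mask('The movie was great.<br />I loved every minute of it.')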