Example #1
def process_data():
    """covert the string data to idx, and save."""
    with codecs.open(cg.DATA_PATH, 'r', 'utf-8') as file:
        data = file.read().split('\n')

    if len(data[-1]) == 0:
        _error('The last line is empty and has been removed.')
        data = data[:-1]

    questions = []
    answers = []
    for line in data:
        line_split = line.split('=')
        que, ans = line_split[0], line_split[1]
        questions.append(que)
        answers.append(ans)
    assert len(questions) == len(answers),\
        _error('The number of questions: {} does not equal the number of answers: {}'.format(len(questions), len(answers)))

    que_idx = [str_to_idx(que) for que in questions]
    ans_idx = [str_to_idx(ans) for ans in answers]

    Path('processed_data/').mkdir(exist_ok=True)
    with codecs.open('processed_data/questions.bin', 'wb') as file:
        pickle.dump(que_idx, file)
    with codecs.open('processed_data/answers.bin', 'wb') as file:
        pickle.dump(ans_idx, file)

    _info(
        'Converted questions and answers have been saved into the `processed_data` directory.'
    )
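str_to_idx is the repository's own helper and is not shown in this snippet; a plausible character-level sketch (the vocabulary dict and the '[UNK]' convention are assumptions made purely for illustration) could look like:

def str_to_idx(text, vocab):
    """Hypothetical sketch only -- the real str_to_idx and its vocabulary live elsewhere in the repo."""
    unk_id = vocab.get('[UNK]', 0)
    return [vocab.get(ch, unk_id) for ch in text]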
Example #2
def find_moduledata():
    pModuleData = 0
    if not common.check_is_stripped():
        log._info("binary is not stripped!")
        for addr, name in idautils.Names():
            if name == "runtime.firstmoduledata":
                pModuleData = addr
                break
    else:
        log._info("binary is stripped..")
        log._info("Now find the moduledata by using brute force searching")
        GO1_16_MAGIC = 0xFFFFFFFA  # <-- go 1.16 magic
        text_section = common.get_segment_addr_by_name(name=".text")
        rdata_section = common.get_segment_addr_by_name(name=".rdata")
        data_section = common.get_segment_addr_by_name(name=".data")

        sections = [(".text", text_section), (".rdata", rdata_section),
                    (".data", data_section)]

        for sec_name, section_addr in sections:
            cur_addr = section_addr
            next_section_addr = common.get_next_segment_addr(addr=cur_addr)
            pModuleData = find_module_data_bruteforce(
                start_addr=section_addr,
                break_addr=next_section_addr,
                magic=GO1_16_MAGIC)
            if pModuleData != 0:
                log._info("ModuleData Structure locate at [%s] - @0x%x" %
                          (sec_name, pModuleData))
                break

        if pModuleData == 0:
            log._error("Cannot find ModuleData Structre in current binary...")

    return pModuleData
Example #3
def parse_pclntable(module_data):
    pPcHeader = module_data.pPcHeader
    pc_header = parse_pc_header(pMem=pPcHeader)
    ptrSize = pc_header.ptrSize
    numberOfFuncs = pc_header.nFunc

    log._info("Number of Functions : %d" % numberOfFuncs)

    pclntable_start = module_data.pPclnTable
    cur_addr = pclntable_start
    for idx in range(numberOfFuncs):
        cur_addr = pclntable_start + (2 * ptrSize) * idx
        func_rva = common.mem_read_integer(addr=cur_addr, read_size=ptrSize)
        _func_structure_offset = common.mem_read_integer(addr=cur_addr +
                                                         ptrSize,
                                                         read_size=ptrSize)
        _func_addr = pclntable_start + _func_structure_offset

        if not idc.GetFunctionName(func_rva):
            log._info("Unk Func @0x%x" % func_rva)
            idc.MakeUnkn(func_rva, idc.DOUNK_EXPAND)
            idaapi.autoWait()
            idc.MakeCode(func_rva)
            idaapi.autoWait()
            if idc.MakeFunction(func_rva):
                idaapi.autoWait()
                log._info("Create Func @0x%x" % func_rva)

        _func = parse__func(pMem=_func_addr)
        #args=_func.args
        #func_id=_func.args

        func_name_addr = module_data.pFuncNameTable + _func.nameoff
        func_name = idc.GetString(func_name_addr)
        if func_name:
            clean_func_name = utils.clean_function_name(func_name)
            log._info("@0x%x Name : [%s]" % (func_rva, func_name))
            idc.MakeComm(func_rva, "@0x%x entry" % func_rva)
            idaapi.autoWait()

            if idc.MakeStr(func_name_addr,
                           func_name_addr + len(func_name) + 1):
                idaapi.autoWait()
            else:
                log._error("@0x%x Name : [%s] Failed..." %
                           (func_rva, func_name))

        ida_func = idaapi.get_func(func_rva)
        if ida_func is not None:
            if idc.MakeNameEx(ida_func.startEA,
                              func_name,
                              flags=idaapi.SN_FORCE):
                idaapi.autoWait()
                log._info("@0x%x Name : [%s]" % (func_rva, func_name))
            else:
                log._error("@0x%x Name : [%s] Failed..." %
                           (func_rva, func_name))
Example #4
    def __init__(self,
                 config,
                 is_training,
                 input_text,
                 input_image,
                 scope=None):
        """"Constructor for EANN Model.
      
      Args:
        config: Config Object, hyparameters set.
        is_training: Boolean, whether train or not.
        input_text: tf.int32 Tensor, [batch_size, seq_length].
        input_image: tf.float32 Tensor, [batch_size, h, w, c].
      """
        # config
        config = copy.deepcopy(config)

        # textCNN config
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.window_size = config.window_size
        self.pool_size = config.pool_size
        self.filter_number_text = config.filter_number_text
        self.seq_length = config.max_length

        # VGG_19
        try:
            self.vgg = tf.keras.applications.VGG19(input_shape=(224, 224, 3),
                                                   include_top=False,
                                                   weights='imagenet')
            _info('Successfully load the pre-trained VGG-19 weights.')
        except Exception:
            _error(
                'Please download the VGG_19 weights from:\n{}\nthen put the file '
                'into ~/.keras/models'.format(_cg.VGG_19_Weights_Download_URL))
            raise

        self.vgg.trainable = False  # do not train the vgg pretrained parameters

        # global config
        self.hidden_size = config.hidden_size
        self.num_classes = config.num_classes
        self.num_domains = config.num_domains

        # basic config
        self.initializer_range = config.initializer_range
        self.dropout = config.dropout
        if not is_training:
            self.dropout = 0.0

        # Build the Graph
        self.label_output, self.domain_output, self.batch_size = self.build(
            input_text, input_image)
Example #5
    def _embedding_positional(self,
                              pos_type,
                              embedded_input,
                              embedding_size,
                              dropout_prob,
                              name=None,
                              max_position_embedding=100):
        """add positional embeddings to the original embeddings.

        Args:
            pos_type: the positional type to use, either 'normal' or 'trigonometrical'.
            embedded_input: original embeddings, [batch_size, seq_length, embedding_size].
            embedding_size: embedding size.
            dropout_prob: dropout probability, refer to the 'rate' parameter in tf.nn.dropout()
            max_position_embedding: for the 'normal' type, the model learns a new positional matrix,
                so set a maximum sequence length.
        
        Returns:
            output: identical type and shape to the embedded input.
        """
        assert_op = tf.assert_less_equal(self.input_length,
                                         max_position_embedding)
        self.pos_type = pos_type
        with tf.control_dependencies([assert_op]):
            # select sin & cos or normal positional embedding
            if pos_type == 'normal':
                positional_embeddings = tf.get_variable(
                    name='positional_embeddings',
                    shape=[max_position_embedding, embedding_size],
                    dtype=tf.float32)
                # slice the positional embeddings according to the actual length
                ac_pos_embed = tf.slice(positional_embeddings, [0, 0],
                                        [self.input_length, -1])
                embedded_input += ac_pos_embed
            elif pos_type == 'trigonometrical':
                self.positional_embeddings = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, None],
                    name='positional_embeddings')
                positional_embeddings = tf.convert_to_tensor(
                    self.positional_embeddings)
                embedded_input += positional_embeddings
            else:
                _error('unknown positional type <{}>'.format(pos_type),
                       head='ERROR')
                raise ValueError

        output = self._layer_norm_and_dropout(embedded_input, dropout_prob,
                                              name)
        return output
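The 'trigonometrical' branch expects the sinusoidal matrix to be computed outside the graph and fed through the positional_embeddings placeholder. Below is a minimal NumPy sketch of the standard sin/cos formulation; the helper name and the feed_dict convention are assumptions, not taken from this code.

import numpy as np

def sinusoidal_position_matrix(seq_length, embedding_size):
    """Standard Transformer sin/cos encodings, shape [seq_length, embedding_size]."""
    positions = np.arange(seq_length)[:, np.newaxis]   # [seq_length, 1]
    dims = np.arange(embedding_size)[np.newaxis, :]    # [1, embedding_size]
    angle_rates = 1.0 / np.power(10000.0, (2 * (dims // 2)) / np.float32(embedding_size))
    angles = positions * angle_rates
    angles[:, 0::2] = np.sin(angles[:, 0::2])          # even dimensions use sin
    angles[:, 1::2] = np.cos(angles[:, 1::2])          # odd dimensions use cos
    return angles.astype(np.float32)

# e.g. feed_dict={model.positional_embeddings: sinusoidal_position_matrix(seq_len, emb_size)}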
Example #6
def de_func_inner():
    data_path = Path(__file__).absolute().parent / 'processed_data'
    files = list(data_path.rglob('*.bin'))
    if need_exit:
        if len(files) != 2:
            _error('No data exists.')
            raise FileNotFoundError
    else:
        if len(files) > 0:
            _error('The data exists.')
            raise FileExistsError
    if func.__name__ == 'process_data':
        func()
    else:
        for data in func():
            yield data
def train_generator():
  """make train, test data."""
  # load the data
  with codecs.open(pos_data_path, 'rb') as file_p,\
       codecs.open(neg_data_path, 'rb') as file_n:
      pos_data = pickle.load(file_p)
      neg_data = pickle.load(file_n)
  assert len(pos_data) == len(neg_data), _error('Data distribution uneven.', head='ERROR')

  # shuffle the data
  train_data = pos_data + neg_data
  random.shuffle(train_data)
  
  # create batch
  for (start, end) in provide_batch_idx(len(train_data), batch_size):
    data_batch = train_data[start: end]
    sentences = [data[1] for data in data_batch]
    labels = [data[0] for data in data_batch]

    sentences_idx = list(map(process_line, sentences))
    sentences_idx_padded = padding_data(sentences_idx)
    input_mask = list(map(make_mask, sentences_idx_padded))

    features = {'input_data': sentences_idx_padded,
                'input_mask': input_mask}
    yield(features, labels)
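If these batches need to go through tf.data rather than being fed directly, tf.data.Dataset.from_generator is one option. A minimal TF 1.x-style sketch, assuming the padded ids, masks and labels are all int32 (the dtypes are assumptions, not taken from the snippet):

import tensorflow as tf  # TF 1.x, matching the tf.contrib usage elsewhere in these examples

dataset = tf.data.Dataset.from_generator(
    train_generator,
    output_types=({'input_data': tf.int32, 'input_mask': tf.int32}, tf.int32))
iterator = dataset.make_one_shot_iterator()
features, labels = iterator.get_next()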
Example #8
def select_initializer(itype=None, seed=None, init_weight=0.01):
    if itype.upper() == 'UNIFORM':
        return tf.random_uniform_initializer(-init_weight,
                                             init_weight,
                                             seed=seed)
    elif itype.upper() == 'GLOROT_N':
        return tf.contrib.keras.initializers.glorot_normal(seed=seed)
    elif itype.upper() == 'GLOROT_U':
        return tf.contrib.keras.initializers.glorot_uniform(seed=seed)
    elif itype.upper() == 'RANDOM':
        return tf.random_normal_initializer(mean=0.0,
                                            stddev=init_weight,
                                            seed=seed,
                                            dtype=tf.float32)
    else:
        _error('Unsupported initializer <{}>.'.format(itype), head='ERROR')
        raise ValueError
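A minimal usage sketch under TF 1.x, which the tf.contrib calls above imply; vocab_size and embedding_size are illustrative values, not taken from the snippet:

import tensorflow as tf  # TF 1.x

vocab_size, embedding_size = 8000, 128   # illustrative values only
init = select_initializer(itype='uniform', seed=42, init_weight=0.05)
with tf.variable_scope('embeddings'):
    embedding_table = tf.get_variable(
        'embedding_table', shape=[vocab_size, embedding_size], initializer=init)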
Example #9
def create_or_load(model, ckpt_path, session, force=False):
    """create a new model or load from the existing one"""
    dir_path = '/'.join(ckpt_path.split('/')[:-1])
    latest_ckpt = tf.train.latest_checkpoint(dir_path)

    if latest_ckpt and not force:
        try:
            model.saver.restore(session, latest_ckpt)
        except Exception as e:
            _error(e, head='ERROR')
            raise e
        _info('successfully loaded model from <{}>'.format(latest_ckpt),
              head='INFO')
    else:
        session.run(tf.global_variables_initializer())
        session.run(tf.local_variables_initializer())
        session.run(tf.tables_initializer())
        _info('successfully created a new model', head='INFO')
    global_step = model.global_step.eval(session=session)
    return model, global_step
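A typical TF 1.x call site might look like the sketch below; build_model and the checkpoint path are hypothetical stand-ins, and the model is assumed to expose saver and global_step attributes as the function requires:

import tensorflow as tf  # TF 1.x

ckpt_path = 'ckpt/model.ckpt'    # hypothetical checkpoint prefix
with tf.Session() as sess:
    model = build_model()        # hypothetical builder; must provide .saver and .global_step
    model, global_step = create_or_load(model, ckpt_path, sess)
    print('training resumes from global step {}'.format(global_step))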
Example #10
def parse_func_pointer():
    renamed = 0
    for segea in idautils.Segments():
        for addr in idautils.Functions(segea, idc.SegEnd(segea)):
        #for addr in idautils.Functions(text_seg.startEA, text_seg.endEA):
            name = idc.GetFunctionName(addr)

            # Look at data xrefs to the function - find the pointer that is located in .rodata
            data_ref = idaapi.get_first_dref_to(addr)
            while data_ref != idc.BADADDR:
                if 'rodata' in idc.get_segm_name(data_ref):
                    # Only rename things that are currently listed as an offset; eg. off_9120B0
                    if 'off_' in idc.GetTrueName(data_ref):
                        if idc.MakeNameEx(data_ref, ('%s_ptr' % name), flags=idaapi.SN_FORCE):
                            idaapi.autoWait()
                            renamed += 1
                        else:
                            log._error('Failed to name pointer @ 0x%02x for %s' % (data_ref, name))

                data_ref = idaapi.get_next_dref_to(addr, data_ref)
def save_to_binary(data, save_path, replace=False):
  """convert the data to binary file and save.
  
  Args:
    data: object, the original file.
    save_path: str, the absolute path to save the data.
    replace: boolean, Whether to replace the file when the file exits.
  """
  # convert the str path to a pathlib.Path
  save_path = Path(save_path)

  # check whether the file exists or not
  if save_path.is_file():
    if not replace:
      _error('{} already exists.'.format(save_path), head='ERROR')
      raise FileExistsError
    else:
      _info('{} already exists, replaced.'.format(save_path))
  
  with codecs.open(save_path, 'wb') as file:
    pickle.dump(data, file)
def train_generator():
    """generator to yield data."""
    # load the data
    with codecs.open(pos_data_path, 'rb') as file_p,\
         codecs.open(neg_data_path, 'rb') as file_n:
        pos_data = pickle.load(file_p)
        neg_data = pickle.load(file_n)
    assert len(pos_data) == len(neg_data), _error('Data distribution uneven.',
                                                  head='ERROR')

    # shuffle the data
    train_data = pos_data + neg_data
    random.shuffle(train_data)

    # create batch
    for (start, end) in provide_batch_idx(len(train_data), batch_size):
        data_batch = train_data[start:end]
        yield extract_features(data_batch, False)
Example #13
def __setattr__(self, name, value):
    if hasattr(self, name):
        wrapped_setatrr(self, name, value)
    else:
        _error('Adding new attribute {} is forbidden'.format(name))
        raise AttributeError
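This __setattr__ is the core of the usual "freeze attributes" pattern; a hedged, self-contained sketch of how it is typically wired up follows. forbid_new_attributes is a hypothetical decorator name, and wrapped_setatrr stands for the class's original __setattr__ as in the snippet above; attributes must already exist (for example as class-level defaults) before they can be assigned.

def forbid_new_attributes(cls):
    # capture the original __setattr__ so existing attributes can still be updated
    wrapped_setatrr = cls.__setattr__

    def __setattr__(self, name, value):
        if hasattr(self, name):
            wrapped_setatrr(self, name, value)
        else:
            raise AttributeError('Adding new attribute {} is forbidden'.format(name))

    cls.__setattr__ = __setattr__
    return cls

@forbid_new_attributes
class Config:
    vocab_size = 100     # pre-declared attributes can be re-assigned
    hidden_size = 256

config = Config()
config.hidden_size = 512   # fine
# config.new_field = 1     # would raise AttributeError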
Example #14
PATTERN = r'^(loss = )\d{1,5}\.\d{1,10}'
PATTERN_2 = r'(ppl = )\d{1,5}\.\d{1,10}'


def extract(log_path, save_path, save_path_2):
    with codecs.open(log_path, 'r', 'utf-8') as file, \
         codecs.open(save_path, 'w', 'utf-8') as file_2, \
         codecs.open(save_path_2, 'w', 'utf-8') as file_3:
        for line in file:
            if re.search(PATTERN, line):
                match = re.search(PATTERN, line).group()
                loss = match.split(' ')[2]
                file_2.write('sup_avg:' + loss + '\n')
                file_2.flush()
            if re.search(PATTERN_2, line):
                match = re.search(PATTERN_2, line).group()
                ppl = match.split(' ')[2]
                file_3.write('sup_avg:' + ppl + '\n')
                file_3.flush()
    _info('The loss and ppl records have been saved to {} and {}.'.format(save_path, save_path_2))


if __name__ == '__main__':
    if len(sys.argv) < 4:
        _error('Please specify the log path and the two save paths.')
        raise ValueError
    else:
        log_path = sys.argv[1]
        save_path = sys.argv[2]
        save_path_2 = sys.argv[3]
        extract(log_path, save_path, save_path_2)
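To make the two patterns concrete, here is what each one extracts from a made-up log line (the line itself is hypothetical):

import re

line = 'loss = 2.3451 ppl = 10.4376'  # hypothetical log line
print(re.search(PATTERN, line).group().split(' ')[2])    # -> 2.3451
print(re.search(PATTERN_2, line).group().split(' ')[2])  # -> 10.4376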
Example #15
def train_generator():
    """this could achieve padding among each batch."""
    # load the data
    with codecs.open('processed_data/questions.bin', 'rb') as file:
        questions = pickle.load(file)
    with codecs.open('processed_data/answers.bin', 'rb') as file:
        answers = pickle.load(file)
    assert len(questions) == len(answers),\
      _error('The number of questions: {} does not equal the number of answers: {}'.format(len(questions), len(answers)))

    # random shuffle the data
    questions_answers = list(zip(questions, answers))
    random.shuffle(questions_answers)
    questions, answers = zip(*questions_answers)
    questions = list(questions)
    answers = list(answers)

    que_batch = []
    input_ans_batch = []
    output_ans_batch = []
    seq_length_decoder_input_data = []
    batch_num = len(questions) // batch_size
    for idx, que in enumerate(questions):
        if len(que_batch) < batch_size:
            # que
            que_batch.append(que)

            # ans
            inp_ans = copy.deepcopy(answers[idx])
            out_ans = copy.deepcopy(answers[idx])
            inp_ans.insert(0, sos_id)
            out_ans.append(eos_id)
            input_ans_batch.append(inp_ans)
            output_ans_batch.append(out_ans)
            seq_length_decoder_input_data.append(len(inp_ans))

            # check whether a batch is full
            if len(que_batch) == batch_size:
                que_batch_padded, inp_ans_batch_padded, out_ans_batch_padded, mask = padding_data(
                    que_batch, input_ans_batch, output_ans_batch)
                features = {
                    'input_x': que_batch_padded,
                    'input_mask': mask,
                    'input_y': inp_ans_batch_padded,
                    'seq_length': seq_length_decoder_input_data
                }
                yield (features, out_ans_batch_padded)
                que_batch = []
                input_ans_batch = []
                output_ans_batch = []
                seq_length_decoder_input_data = []

            if idx > (batch_num * batch_size -
                      1) and len(questions) % batch_size != 0:
                que_batch = copy.deepcopy(questions[idx:])
                input_ans_batch = copy.deepcopy(answers[idx:])
                output_ans_batch = copy.deepcopy(answers[idx:])
                for _ in range(batch_size - len(que_batch)):
                    aug_que = random.choice(questions)
                    aug_que_idx = questions.index(aug_que)
                    que_batch.append(aug_que)

                    # sos/eos are added to every element below, so do not
                    # insert them here as well (that would double-pad the
                    # augmented answers)
                    inp_ans = copy.deepcopy(answers[aug_que_idx])
                    out_ans = copy.deepcopy(answers[aug_que_idx])
                    input_ans_batch.append(inp_ans)
                    output_ans_batch.append(out_ans)

                for idx, _ in enumerate(input_ans_batch):
                    input_ans_batch[idx].insert(0, sos_id)
                for idx, _ in enumerate(output_ans_batch):
                    output_ans_batch[idx].append(eos_id)
                seq_length_decoder_input_data = [
                    len(ans) for ans in input_ans_batch
                ]

                assert len(que_batch) == len(input_ans_batch) == len(
                    output_ans_batch) == batch_size

                que_batch_padded, inp_ans_batch_padded, out_ans_batch_padded, mask = padding_data(
                    que_batch, input_ans_batch, output_ans_batch)
                features = {
                    'input_x': que_batch_padded,
                    'input_mask': mask,
                    'input_y': inp_ans_batch_padded,
                    'seq_length': seq_length_decoder_input_data
                }
                yield (features, out_ans_batch_padded)
                break
def no_mask(data):
    def select_useful_sentiment(scores):
        """return either positive tag or negative tag for each word."""
        pos_score, neg_score = scores[0], scores[1]
        return 1 if pos_score > neg_score else 0

    # clean data
    data = data.replace('<br />', ' ')
    # split sentence
    sentences_set = sent_tokenize(data)

    data_final = []
    word_polarity_labels = []
    mask_indices = []
    preb_sentence_length = 0  # this is used for shifting mask_indices
    for sentence in sentences_set:
        # tokenize and stemming
        sentence_tokenized = word_tokenize(sentence)
        # disable stemming
        # sentence_stem = [ps.stem(v) for v in sentence_tokenized]
        sentence_stem = [v for v in sentence_tokenized]

        # pos tag
        sentence_tagged = nltk.pos_tag(sentence_stem)

        # get necessary sentiment for each word
        sentence_sentiment = [
            get_sentiment(v, p) for (v, p) in sentence_tagged
        ]

        # keep the words have sentiment
        selected_inputs_initial_step = [
            sentence_tokenized[i] for i, item in enumerate(sentence_sentiment)
            if len(item) > 0
        ]
        selected_sentiment_initial_step = [
            item for item in sentence_sentiment if len(item) > 0
        ]
        assert len(selected_inputs_initial_step) == len(
            selected_sentiment_initial_step), _error('Length not match.')

        # selected the words which have positive score or negative score, then keep the bigger score
        selected_inputs = [
            selected_inputs_initial_step[i]
            for i, item in enumerate(selected_sentiment_initial_step)
            if item[2] != 1.0
        ]
        selected_sentiment_mid_step = [
            item for item in selected_sentiment_initial_step if item[2] != 1.0
        ]
        selected_sentiment = list(
            map(select_useful_sentiment, selected_sentiment_mid_step))

        # save the indices so that when calculating loss, [SEP], [PAD] will not be considered
        mask_indices.extend(
            [preb_sentence_length + i for i in range(len(selected_inputs))])
        assert len(selected_inputs) == len(selected_sentiment), _error(
            'The lengths of inputs and sentiment mismatch.')
        if len(selected_inputs) == 0:
            continue

        data_temp = []
        for vocab in selected_inputs:
            if vocab in vocab_idx:
                data_temp.append(vocab_idx[vocab])
            else:
                data_temp.append(vocab_idx['[UNK]'])

        data_temp.append(vocab_idx['[SEP]'])
        # increase the length here because of the [SEP]
        # the mask indices could be [0, 1, 2, 4, 5], where 3 refers to [SEP]
        # the gathered sequence output should be [0, 1, 2, 4, 5]
        # the labels are [X, X, X, X, X]
        preb_sentence_length += len(data_temp)
        data_final.extend(data_temp)
        word_polarity_labels.extend(selected_sentiment)

    data_final.insert(0, vocab_idx['[CLS]'])

    return (data_final, word_polarity_labels, mask_indices)