class TextClassify:
    """Classify raw messages by clustering frequent-word features."""

    def __init__(self, messages):
        self.messages = messages
        self.cVerTer = Converter()
        self.freWords = {}          # word -> count (converted to frequency by cntPro)
        self.mger = WordsMerger()
        self.analyzer = base_analyzer()
        self.features = None        # cached feature matrix from FeGenerator

    def merGeWords(self, freWords):
        """Split each '_'-delimited word list and merge it via WordsMerger."""
        comBineFreWords = []
        for words in freWords:
            words = [word.split('_') for word in words]
            comBineFreWords.append(self.mger.mergeWords(words))
        return comBineFreWords

    def cntWord(self, wSize, TK, wLen):
        """Count occurrences of newly seen frequent words across all messages."""
        freWords, _ = get_messages(self.messages, wSize, TK, wLen)
        wSetv = set()
        for freWordList in freWords:
            for word in freWordList:
                if word not in self.freWords:
                    wSetv.add(word)
        textList = self.cVerTer.ConvertRawToLengthTexts(self.messages, '_')
        for word in wSetv:
            self.freWords[word] = textList.count(word)
        return self.freWords

    def cntPro(self, words=None):
        """Convert word counts to frequencies (defaults to self.freWords)."""
        # Idiom fix: identity comparison with None uses 'is', not '=='.
        if words is None:
            self.freWords = self.analyzer.convert_num_to_frequent(self.freWords)
        else:
            self.freWords = self.analyzer.convert_num_to_frequent(words)
        return self.freWords

    def FeGenerator(self):
        """Build and cache per-message feature vectors from the frequent words."""
        # Idiom fix: list() replaces the no-op comprehension over .items().
        tulWords = list(self.freWords.items())
        featureGene = WordsFeatureGene(tulWords)
        textMsgs = [
            self.cVerTer.ConvertRawToLengthText(message, delimeter='_')
            for message in self.messages
        ]
        mFeatures = featureGene.cvtMsgs(textMsgs)
        self.features = mFeatures
        return mFeatures

    def clsMessages(self, wSize, TK, wLen, Kcls):
        """Cluster the messages into Kcls groups with K-means."""
        self.cntWord(wSize, TK, wLen)
        self.cntPro()
        wFeatures = self.FeGenerator()
        clser = KmeansClasser()
        return clser.clsMessages(self.messages, wFeatures, Kcls)

    def clsByDbscan(self, wSize, TK, wLen, mindis, minpt):
        """Cluster the messages with DBSCAN (distance mindis, min points minpt)."""
        self.cntWord(wSize, TK, wLen)
        self.cntPro()
        wFeatures = self.FeGenerator()
        clser = DbScanClasser()
        return clser.clsMessages(self.messages, wFeatures, mindis, minpt)
def __init__(self, messages):
    """Keep the raw message list and build the classification collaborators."""
    self.messages = messages
    self.features = None
    self.freWords = {}
    self.cVerTer = Converter()
    self.mger = WordsMerger()
    self.analyzer = base_analyzer()
def __init__(self, messages=None):
    """Store the messages and the thresholds used by the field-type inferences."""
    self.MaxLen = 40
    self.messages = messages
    self.cverter = Converter()
    # Decision thresholds for length / constant / sequence-id inference.
    self.lengthThreshold = 0.8
    self.constThreshold = 0.98
    self.idThreshold = 0.7
def __init__(self, messages=None):
    """Wire up the inference, conversion, and tuning helpers for the messages."""
    self.messages = messages
    self.wordTypeInfer = WholeFieldTypeInfer(self.messages)
    self.icsSymTree = IcsSymbolToTree()
    self.dataTuning = DataTuning()
    self.msgSplt = MsgSpliter()
    self.cvter = Converter()
    self.wcvter = word_convert()
def raw_to_log(file_path, r_way, protocol):
    """Read raw messages from file_path and log each one as 'index:text'.

    :param file_path: path of the raw capture to read.
    :param r_way: read mode forwarded to read_datas.
    :param protocol: protocol name, used as the log directory suffix.
    """
    datas = read_datas(file_path, r_way)
    datas = get_puredatas(datas)
    converter = Converter()
    logger_raw = get_logger(log_path + '/' + protocol, 'raw_message_logger')
    # Bug fix: the original initialized i = 0 but never incremented it, so
    # every message was logged with index 0. enumerate() yields the real
    # index. Also removed the unused raw_datas local.
    for i, data in enumerate(datas):
        logger_raw.error(str(i) + ':' + converter.convert_raw_to_text(data))
def inferLen(self, datas, lenDatas):
    """Return 1 if datas correlates with lenDatas like a length field, else 0."""
    bigCorr = base_analyzer.pearson(Converter.bytesToBigInt(datas), lenDatas)
    littleCorr = base_analyzer.pearson(Converter.bytesToLittleInt(datas), lenDatas)
    # A strong correlation under either byte order marks a length field.
    if bigCorr > self.lengthThreshold or littleCorr > self.lengthThreshold:
        return 1
    return 0
def raw_to_redis(file_path, r_way):
    """Convert the raw messages in file_path to text and cache them in redis.

    The file path itself is used as the redis key.
    """
    datas = get_puredatas(read_datas(file_path, r_way))
    converter = Converter()
    raw_datas = [converter.convert_raw_to_text(data) for data in datas]
    phrase_redis = redis_deal()
    phrase_redis.insert_to_redis(file_path, raw_datas)
def __init__(self):
    """Set up the split, measure, and storage collaborators on top of the parent."""
    super().__init__()
    self.redis_dealer = redis_deal()
    self.msgSpliter = MsgSpliter()
    self.converter = Converter()
    self.cvt = Converter()
    self.splt = splitter()
    self.desiner = Desiner()
    self.msAb = MeasureAb()
    self.dataTuning = DataTuning()
def __init__(self):
    """Instantiate the parsing, tuning, and measurement collaborators."""
    self.msgLogic = MegSplitLogic()
    self.anlzer = base_analyzer()
    self.cvt = Converter()
    self.rds = redis_deal()
    self.mtool = MessageSplitMeasure()
    # Protocol-specific helpers.
    self.modbus = ModBusDataTuning()
    self.md = modbus()
    self.ftp = FTPDataTuning()
    self.ftpPaser = FTPParser()
    self.cmPaser = ComPaser()
def getDelimiter(datas):
    """Infer a delimiter string from the top-ranked n-gram words of the messages."""
    cvt = Converter()
    texts = [cvt.convert_raw_to_text(data) for data in datas]
    ngrams = []
    for text in texts:
        ngrams.extend(get_ngram_words([text], (1, 2), 10))
    # Keep the ten most frequent words, filter them, and decode the byte values.
    top_words = analyzer.get_topk(ngrams)[0:10]
    filtered = filterWords(top_words)
    return ''.join(chr(int(tok)) for tok in filtered.split(' '))
def inferLen(self, Los=None, datas=None):
    """Return 1 if the field at Los (or the given datas) tracks message length.

    :param Los: byte locations inside self.messages; when given, datas is
        re-extracted from self.messages at those locations.
    :param datas: candidate field bytes, one entry per message.
    """
    # Idiom fix: identity comparison with None uses 'is not', not '!='.
    if Los is not None:
        datas = self.cverter.getDatasByLocs(self.messages, Los)
    lens = [len(data) for data in datas]
    datasLenBig = Converter.bytesToBigInt(datas)
    datasLittle = Converter.bytesToLittleInt(datas)
    personBig = base_analyzer.pearson(datasLenBig, lens)
    personLittle = base_analyzer.pearson(datasLittle, lens)
    # A strong correlation under either byte order marks a length field.
    if personBig > self.lengthThreshold or personLittle > self.lengthThreshold:
        return 1
    else:
        return 0
def inferSeriesId(self, datas):
    """Return 1 if datas increases like a per-message sequence id, else 0."""
    # Idiom fix: build the index list directly instead of an enumerate loop.
    ids = list(range(len(datas)))
    datasBigInt = Converter.bytesToBigInt(datas)
    # NOTE(review): the sibling overload uses Converter.bytesToLittleInt;
    # verify byteToLittle is not a typo for it.
    datasLittle = Converter.byteToLittle(datas)
    tRate = max(base_analyzer.pearson(ids, datasBigInt),
                base_analyzer.pearson(ids, datasLittle))
    if tRate > self.idThreshold:
        return 1
    else:
        return 0
def inferSeriesId(self, Los=None, datas=None):
    """Return 1 if the field at Los/datas increases like a sequence id, else 0.

    :param Los: byte locations inside self.messages; when given, datas is
        re-extracted from self.messages at those locations.
    :param datas: candidate field bytes, one entry per message.
    """
    # Idiom fix: identity comparison with None uses 'is not', not '!='.
    if Los is not None:
        datas = self.cverter.getDatasByLocs(self.messages, Los)
    # Idiom fix: build the index list directly instead of an enumerate loop.
    ids = list(range(len(datas)))
    datasBigInt = Converter.bytesToBigInt(datas)
    datasLittle = Converter.bytesToLittleInt(datas)
    tRate = max(base_analyzer.pearson(ids, datasBigInt),
                base_analyzer.pearson(ids, datasLittle))
    if tRate > self.idThreshold:
        return 1
    else:
        return 0
def split_by_frequent(self, messages):
    """Split messages by frequent-word voting, computing the words on a cache miss.

    :param messages: raw messages to split.
    :return: ordered border list produced by Converter().ConvertListToOrder.
    """
    prefix = ve_strategy().GetWordsKeys('FrequentWords')
    # Removed an unused local ('entry_words = None') left over from the
    # sibling split_by_entry method.
    if self.redis_read.is_exist_key(prefix):
        frequent_words = self.redis_read.read_from_redis(prefix)
    else:
        # Derive the frequent words from the raw words and cache the result.
        raw_keys = ve_strategy().GetWordsKeys('RawWords')
        raw_words = self.redis_read.read_from_redis(raw_keys)
        frequent_words = Converter().ConvertRawToNormalFrequent(
            raw_words, self.parameters['height'] + 1)
        self.redis_read.insert_to_redis(prefix, frequent_words)
    frequent_voter = frequence_voter(frequent_words)
    PrimBorders = frequent_voter.vote_for_messages(messages, self.parameters['height'])
    FinalBorders = Desiner().VoteMultiM(PrimBorders,
                                        self.parameters['diff_measure'],
                                        self.parameters['decision_type'],
                                        self.parameters['Threshold_T'],
                                        self.parameters['Threshod_R'])
    return Converter().ConvertListToOrder(FinalBorders)
def inferConst(self, datas):
    """Return 1 if a single value dominates datas above constThreshold, else 0."""
    wordDic = Converter.convert_raw_to_count(datas)
    # max() replaces sorting the whole dict just to read the largest count.
    topCount = max(wordDic.values())
    if topCount / len(datas) > self.constThreshold:
        return 1
    else:
        return 0
def find_constone(self, datas):
    """Return 1 when the rarest value's count exceeds constThreshold, else 0."""
    counts = Converter.convert_raw_to_count(datas)
    ordered = sorted(counts.items(), key=lambda kv: kv[1])
    # NOTE(review): this compares the *smallest* count (an integer) against
    # constThreshold; confirm that is intended rather than the largest count
    # or a frequency ratio as in inferConst.
    smallest = ordered[0][1]
    return 1 if smallest > self.constThreshold else 0
def upDataByType(self):
    """Refresh self.value from the node's data according to self.word_type."""
    datas = self.getNodeDataV()
    if self.word_type == 'C':
        # Constant field: a single representative value suffices.
        self.value.append(datas[0])
    elif self.word_type == 'F':
        # Function-code field: record each distinct value once.
        counted = Converter().convert_raw_to_count(datas)
        self.value.extend(counted.keys())
def find_constfunc(self, datas):
    """
    get the feature of the function code
    :param datas: List of bytes
    :return: entry and distinct num of datas
    """
    t_l = Converter.convert_raw_to_count(datas)
    # Idiom fix: list() replaces the identity comprehension over the values.
    t_en = base_analyzer.get_entry(list(t_l.values()))
    return t_en, len(t_l)
def getFuncScore(self, Los=None, datas=None):
    """Return (entropy, distinct-count) of the field values at Los/datas.

    :param Los: byte locations inside self.messages; when given, datas is
        re-extracted from self.messages at those locations.
    :param datas: candidate field bytes, one entry per message.
    """
    # Idiom fix: identity comparison with None uses 'is not', not '!='.
    if Los is not None:
        datas = self.cverter.getDatasByLocs(self.messages, Los)
    datasDic = Converter.convert_raw_to_count(datas)
    # sum() replaces the manual accumulation loop.
    sumValue = sum(datasDic.values())
    probs = [count / sumValue for count in datasDic.values()]
    datasEntry = base_analyzer.get_entry(probs)
    return datasEntry, len(datasDic)
def split_by_words_type(self, datas, T_max_range):
    """Tag each byte offset as constant (type 0) or variable (type 4), then merge.

    :return: (merged fields, list of candidate border offsets)
    """
    w_infer = word_infer()
    w_merger = base_merger()
    w_convert = Converter()
    b_analyzer = base_analyzer()
    fields_set = []
    for offset in range(T_max_range):
        lo_datas = get_data_bylo(datas, offset)
        frequencies = b_analyzer.convert_num_to_frequent(
            w_convert.convert_raw_to_count(lo_datas))
        is_const = w_infer.is_const_word(frequencies, 0.95)
        field_type = 0 if is_const else 4
        fields_set.append(loc_field((offset, offset), field_type))
    words_f = w_merger.merge_words(fields_set)
    candidate_borders = [w.loc[0] for w in words_f]
    return words_f, candidate_borders
def inferFunc(self, datas):
    """Return 1 if datas has fewer than FuncT distinct values (function code)."""
    datasDic = Converter.convert_raw_to_count(datas)
    # sum() replaces the manual accumulation loop.
    sumValue = sum(datasDic.values())
    probs = [count / sumValue for count in datasDic.values()]
    # NOTE(review): the entropy is computed but never used in the decision
    # below (kept for parity with getFuncScore) — confirm whether it can go.
    datasEntry = base_analyzer.get_entry(probs)
    if len(datasDic) < self.FuncT:
        return 1
    else:
        return 0
class FieldHunter:
    """Locate delimiter candidates in raw messages via n-gram frequency ranking."""

    def __init__(self):
        self.analyer = base_analyzer()
        self.convert = Converter()
        self.ranker = ranker()

    def itemJudge(self, item):
        """Return True if the decimal byte value in `item` is an ASCII digit or letter."""
        value = int(item)
        # ASCII ranges: '0'-'9', 'A'-'Z', 'a'-'z'.
        if 48 <= value <= 57:
            return True
        if 65 <= value <= 90:
            return True
        if 97 <= value <= 122:
            return True
        return False

    def isNumOrAlpha(self, sequence):
        """Return True if any space-separated byte value in `sequence` is alphanumeric."""
        # any() replaces the original flag-and-break loop.
        return any(self.itemJudge(item) for item in sequence.split(' '))

    def findDelimiter(self, messages):
        """Return up to 100 non-alphanumeric n-gram candidates, most frequent first."""
        messages = [self.convert.convert_raw_to_text(data) for data in messages]
        wordsNgram = self.convert.ConvertRawToSimDic(messages, (1, 2))
        wordsNgram = self.ranker.rank_dic(wordsNgram, reverse=True)
        # Removed a dead first loop that computed an unused `delimiter`
        # variable, the commented-out print, and the leftover debug prints.
        candidates = [word[0] for word in wordsNgram
                      if not self.isNumOrAlpha(word[0])]
        return candidates[0:100]
def inferConst(self, Los=None, datas=None):
    """Return 1 if a single value dominates the field at Los/datas, else 0.

    :param Los: byte locations inside self.messages; when given, datas is
        re-extracted from self.messages at those locations.
    :param datas: candidate field bytes, one entry per message.
    """
    # Idiom fix: identity comparison with None uses 'is not', not '!='.
    if Los is not None:
        datas = self.cverter.getDatasByLocs(self.messages, Los)
    wordDic = Converter.convert_raw_to_count(datas)
    # max() replaces sorting the whole dict just to read the largest count;
    # the commented-out debug prints were removed.
    topCount = max(wordDic.values())
    if topCount / len(datas) > self.constThreshold:
        return 1
    else:
        return 0
def getBoundaries(self, configParas, gveConfigParas, messages):
    """Vote boundaries from frequency and entropy evidence, then merge both lists."""
    freGVotes, entryGVotes = self.getGVotes(configParas, messages)
    desiner = Desiner()
    voteParas = {
        'diff_measure': gveConfigParas['diffMeasure'],
        'vWay': gveConfigParas['vWayFre'],
        'T': gveConfigParas['T'],
        'r': gveConfigParas['r'],
    }
    freBoundaries = desiner.VoteSingleByDicParas(voteParas, freGVotes)
    # Reuse the same parameter dict with the entropy voting strategy.
    voteParas['vWay'] = gveConfigParas['vWayEntry']
    entryBoundaries = desiner.VoteSingleByDicParas(voteParas, entryGVotes)
    return Converter().MergeLists(freBoundaries, entryBoundaries)
def inferLenAccau(self, Los=None, datas=None):
    """Return 1 if the field at Los predicts the remaining message length.

    Accuracy is the share of messages whose decoded field value is within
    one of (message length - last field offset), under either byte order.
    """
    # Idiom fix: identity comparison with None uses 'is not', not '!='.
    if Los is not None:
        datas = self.cverter.getDatasByLocs(self.messages, Los)
    # NOTE(review): Los is dereferenced below even when it is None, so the
    # datas-only call path would raise — confirm callers always pass Los.
    lens = []
    for msg in self.messages:
        if len(msg) > Los[-1]:
            lens.append(len(msg) - Los[-1])
        else:
            lens.append(-1)
    datasLenBig = Converter.bytesToBigInt(datas)
    datasLittle = Converter.bytesToLittleInt(datas)
    # sum(generator) replaces the manual counting loops.
    acc_big = sum(1 for value, length in zip(datasLenBig, lens)
                  if abs(value - length) <= 1)
    acc_small = sum(1 for value, length in zip(datasLittle, lens)
                    if abs(value - length) <= 1)
    if ((acc_small / len(datas)) > self.lengthThreshold
            or (acc_big / len(datas)) > self.lengthThreshold):
        return 1
    else:
        return 0
def ConvertRawWordsToOrder(self, rawwords, nrange, ordertype="abs"):
    """Rank raw words per length and map each word to an order index.

    :param rawwords: raw word/count data to convert.
    :param nrange: lengths 1..nrange-2 are processed; length 4 seeds the order.
    :param ordertype: 'abs' uses ConvertWordToNumOrder, otherwise
        word_convert.convert_word_order.
    :return: result of self.convert_order_to_raw on the per-length orders.
    """
    word_ranker = ranker()
    # Fixed a local named `Converter` that shadowed the module-level class of
    # the same name; also removed the unused base_analyzer instance and the
    # unused start_time = time.time() call.
    wcvt = word_convert()
    num_words = wcvt.splitwords_bylen(rawwords, nrange)
    for len_word in num_words:
        num_words[len_word] = word_ranker.rank_tulple(num_words[len_word],
                                                      reverse=True)
    # Length-4 words define the prime ordering all other lengths map onto.
    PrimeWords = [word[0] for word in num_words[4]]
    PrimeOrders = {word: i for i, word in enumerate(PrimeWords)}
    OrderWords = {4: PrimeOrders}
    for i in range(1, nrange - 1):
        if ordertype == 'abs':
            OrderWords[i] = self.ConvertWordToNumOrder(
                [word[0] for word in num_words[i]], PrimeWords, rawwords)
        else:
            OrderWords[i] = wcvt.convert_word_order(
                [word[0] for word in num_words[i]], PrimeWords)
    return self.convert_order_to_raw(OrderWords)
def SplitByOrder(self, messages):
    """Split messages by order-word voting, computing OrderWords on a cache miss."""
    key = ve_strategy().GetWordsKeys('OrderWords')
    if self.redis_read.is_exist_key(key):
        OrderWords = self.redis_read.read_from_redis(key)
    else:
        # Derive the order words from the raw words and cache the result.
        raw_keys = ve_strategy().GetWordsKeys('RawWords')
        raw_words = self.redis_read.read_from_redis(raw_keys)
        OrderWords = word_convert().ConvertRawWordsToOrder(
            raw_words, self.parameters['height'] + 1)
        self.redis_read.insert_to_redis(key, OrderWords)
    voter = OrderVoter(OrderWords)
    primaryBorders = voter.vote_for_messages(messages, self.parameters['height'])
    finalBorders = Desiner().VoteMultiM(
        primaryBorders,
        self.parameters['diff_measure'],
        self.parameters['decision_type'],
        self.parameters['Threshold_T'],
        self.parameters['Threshod_R'],
    )
    return Converter().ConvertListToOrder(finalBorders)
def splitMessage(self, boundary, message, maxRange):
    """Render `message` as hex bytes with '|' inserted after each boundary.

    :param boundary: sorted byte offsets at which to insert separators.
    :param message: raw bytes of one message.
    :param maxRange: cap on the number of hex bytes rendered.
    :return: space-separated hex string with '|' field separators.
    """
    hexDatas = Converter.byteListToHex(message).split(' ')
    splitMsg = ''
    los = 0
    # Removed the unused `startLo` local. A leading 0 boundary is skipped so
    # no separator appears before the first byte.
    if boundary[los] == 0:
        los = los + 1
    for i in range(min(len(hexDatas), maxRange)):
        splitMsg = splitMsg + hexDatas[i] + ' '
        # Consolidated the duplicated append from both branches of the
        # original if/continue: only the separator depends on los.
        if los < len(boundary) and i + 1 == boundary[los]:
            splitMsg = splitMsg + '|'
            los = los + 1
    return splitMsg
def splitMessageByType(self, boundary, message):
    """Split `message` into hex-text fields at the given boundary offsets.

    :param boundary: sorted byte offsets marking field ends.
    :param message: raw bytes of one message.
    :return: list of space-separated hex strings, one per field. Bytes after
        the final boundary are accumulated but never flushed — this mirrors
        the original behavior; confirm whether they should be emitted.
    """
    hexDatas = Converter.byteListToHex(message).split(' ')
    spltMsgs = []
    splitMsg = ''
    los = 0
    # A leading 0 boundary is skipped so no empty first field is produced.
    if boundary[los] == 0:
        los = los + 1
    for i in range(len(hexDatas)):
        splitMsg = splitMsg + hexDatas[i] + ' '
        # Consolidated the duplicated append from both branches of the
        # original if/continue: only the flush depends on los.
        if los < len(boundary) and i + 1 == boundary[los]:
            spltMsgs.append(splitMsg)
            splitMsg = ''
            los = los + 1
    return spltMsgs
def split_by_entry(self, messages):
    """Split messages by entropy-word voting, computing EntryWords on a cache miss."""
    keys = ve_strategy().GetWordsKeys("EntryWords")
    if self.redis_read.is_exist_key(keys):
        entry_words = self.redis_read.read_from_redis(keys)
    else:
        # Derive the entropy words from the raw words and cache the result.
        raw_keys = ve_strategy().GetWordsKeys("RawWords")
        raw_words = self.redis_read.read_from_redis(raw_keys)
        entry_words = word_convert().convert_raw_to_entry(
            raw_words, self.parameters['height'] + 1)
        self.redis_read.insert_to_redis(keys, entry_words)
    voter = Entry_voter(entry_words)
    primaryBorders = voter.vote_for_messages(messages, self.parameters['height'])
    finalBorders = Desiner().VoteMultiM(
        primaryBorders,
        self.parameters['diff_measure'],
        self.parameters['decision_type'],
        self.parameters['Threshold_T'],
        self.parameters['Threshod_R'],
    )
    return Converter().ConvertListToOrder(finalBorders)