def train(self, pos_samples):
    """Train on the pool of suspicious samples: extract candidate
    tokens with a suffix tree, then score each token via Bayes' law."""
    if len(pos_samples) < 2:
        print pos_samples
        print len(pos_samples)
        print 'not enough samples. bug??'
        assert False

    # reinit
    self.token_scores = None
    self.tokens = None
    self.tokentree = None

    self.pos_samples = pos_samples

    # extract tokens common to the sample pool (at least minlen bytes long)
    stree = sutil.STree(pos_samples)
    tokens = stree.common_sub(self.minlen,
                              min(len(pos_samples),
                                  max(self.kmin,
                                      int(self.kfrac * len(pos_samples)))),
                              prune=self.prune)
    stree = None
    self.tokens = tokens.keys()
#    self.tokentree = sutil.STree(tokens.keys())

    token_strings = self.get_all_occurrences(pos_samples)

    self.token_scores = {}

    # calculate scores based on Bayes' law
    for token in token_strings.keys():
        prob_tok_giv_worm = 1.0 * len(token_strings[token]) \
                            / len(pos_samples)
        prob_tok_giv_nworm = sig_gen.est_fpos_rate(token,
                                                   self.training_trace)
        part = prob_tok_giv_nworm \
               / (.5 * prob_tok_giv_worm + .5 * prob_tok_giv_nworm) + 1e-300
        token_score = max(-1 * math.log(part) / math.log(10), 0)
        if token_score > 0:
            self.token_scores[token] = token_score

    # prevents a subtle bug later, when there turn out to be no
    # tokens kept (i.e., they all had score 0)
    self.tokens = self.token_scores.keys()

    self.set_threshold()

    # Signature generation and signature matching code is tightly
    # coupled. TODO: refactor to return a separate signature object
    return [self]
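# A minimal standalone sketch of the per-token scoring step above, with
# a toy false-positive rate standing in for sig_gen.est_fpos_rate().
# The helper name _demo_token_score and the example numbers are
# hypothetical, not part of this module; they only illustrate how a
# token seen in every worm sample but rarely in innocuous traffic earns
# a high score, while an undistinctive token scores 0 and is dropped.
def _demo_token_score(occurrences, n_samples, fpos_rate):
    import math
    prob_tok_giv_worm = 1.0 * occurrences / n_samples   # P(token | worm)
    prob_tok_giv_nworm = fpos_rate                      # P(token | not worm)
    part = prob_tok_giv_nworm \
           / (.5 * prob_tok_giv_worm + .5 * prob_tok_giv_nworm) + 1e-300
    return max(-1 * math.log(part) / math.log(10), 0)

# e.g. a token in all 10 samples with a 0.1% false-positive rate:
#   _demo_token_score(10, 10, 0.001)  ->  ~2.7, so it is kept
# a token as common in innocuous traffic as in the worm pool:
#   _demo_token_score(1, 10, 0.1)     ->  0.0, so it is discarded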
def sig_gen_cb(left, right):
    # Clustering callback: merge the signatures of two clusters and
    # score the result. Note that `self` and `pos_samples` are free
    # variables, bound in the scope of the enclosing method.
    lsig = left['sig']
    rsig = right['sig']

    # tokenize if possible
    if not lsig and not rsig and self.tokenize_pairs:
        lsig = pos_samples[left['samples'][0]]
        rsig = pos_samples[right['samples'][0]]
        (lsig, rsig) = self._tokenize_samples([lsig, rsig])
    else:
        if lsig:
            lsig = lsig.lcs
        else:
            lsig = list(pos_samples[left['samples'][0]])
        if rsig:
            rsig = rsig.lcs
        else:
            rsig = list(pos_samples[right['samples'][0]])

    # find the common subsequence
    lcs = self._find_lcs(lsig, rsig)
    t = self._lcs_to_tuple(lcs)
    sig = TupleSig(lcs, t)
#    print self._lcs_to_regex(sig)

    # calculate a score for the resulting signature
    scores = []
    for token in t:
#        prob = sigprob.regex_prob(token, 1000, stats=self.statsfile)[-1]
        prob = sig_gen.est_fpos_rate(token, self.fpos_training_streams)
        scores.append(-math.log(prob + 1e-300) / math.log(10))

    # using all the token scores overly favors signatures with many
    # tokens. Current fix is to only use the most distinctive tokens
    # to calculate the score.
    if self.max_tokens_in_est:
        scores.sort(lambda x, y: cmp(y, x))
        score = sum(scores[:self.max_tokens_in_est])
    else:
        score = sum(scores)

    return (sig, score)
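# A toy illustration of the top-k scoring policy used in sig_gen_cb.
# The helper name _demo_cluster_score and the numbers are hypothetical;
# they only show why capping at max_tokens_in_est matters: without the
# cap, a signature of many weak tokens outscores a single highly
# distinctive token, and the cap restores the intended ordering.
def _demo_cluster_score(token_scores, max_tokens_in_est):
    scores = list(token_scores)
    if max_tokens_in_est:
        scores.sort(lambda x, y: cmp(y, x))  # descending (py2 idiom)
        return sum(scores[:max_tokens_in_est])
    return sum(scores)

# _demo_cluster_score([0.5] * 10, None) -> 5.0  (ten weak tokens win)
# _demo_cluster_score([4.0],      None) -> 4.0
# _demo_cluster_score([0.5] * 10, 3)    -> 1.5  (cap fixes the ordering)
# _demo_cluster_score([4.0],      3)    -> 4.0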