Example #1
    # Excerpted method; relies on module-level imports of math, sutil,
    # and sig_gen in the original file.
    def train(self, pos_samples):
        # The suffix-tree tokenization below needs at least two samples.
        if len(pos_samples) < 2:
            raise ValueError('not enough samples to train on: %d'
                             % len(pos_samples))

        # Reset state from any previous training run.
        self.token_scores = None
        self.tokens = None
        self.tokentree = None
        self.pos_samples = pos_samples

        # Find substrings common to at least k samples, where k scales
        # with the sample count but never drops below self.kmin.
        stree = sutil.STree(pos_samples)
        k = min(len(pos_samples),
                max(self.kmin, int(self.kfrac * len(pos_samples))))
        tokens = stree.common_sub(self.minlen, k, prune=self.prune)
        stree = None  # allow the suffix tree to be garbage collected
        self.tokens = list(tokens.keys())
#        self.tokentree = sutil.STree(tokens.keys())

        # Map each token to its occurrences across the samples.
        token_strings = self.get_all_occurrences(pos_samples)
        self.token_scores = {}

        # Score each token using Bayes' law: prob_tok_giv_worm is the
        # fraction of worm samples containing the token, and
        # prob_tok_giv_nworm is its estimated rate in benign traffic.
        for token in token_strings:
            prob_tok_giv_worm = (1.0 * len(token_strings[token])
                                 / len(pos_samples))
            prob_tok_giv_nworm = sig_gen.est_fpos_rate(token,
                                                       self.training_trace)
            # Posterior probability that traffic containing the token is
            # benign, assuming equal priors; 1e-300 avoids log(0).
            part = (prob_tok_giv_nworm
                    / (.5 * prob_tok_giv_worm + .5 * prob_tok_giv_nworm)
                    + 1e-300)
            token_score = max(-math.log(part) / math.log(10), 0)
            if token_score > 0:
                self.token_scores[token] = token_score

        # Keep only the scored tokens. This prevents a subtle bug later
        # when no tokens are kept (i.e., they all had score 0).
        self.tokens = list(self.token_scores.keys())

        self.set_threshold()

        # Signature generation and signature matching code is tightly
        # coupled. TODO: refactor to return a separate signature object
        return [self]
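
The Bayes-law scoring loop above reduces to a single formula: a token's score is the negative log10 of the posterior probability that traffic containing it is benign, under equal priors. A minimal standalone sketch of just that computation, using a hypothetical token_score helper with made-up probabilities standing in for sig_gen.est_fpos_rate:

import math

def token_score(prob_tok_giv_worm, prob_tok_giv_nworm):
    # Posterior probability that traffic containing the token is benign,
    # assuming equal priors; 1e-300 guards against log(0).
    part = (prob_tok_giv_nworm
            / (.5 * prob_tok_giv_worm + .5 * prob_tok_giv_nworm)
            + 1e-300)
    return max(-math.log(part) / math.log(10), 0)

# A token present in every worm sample but only 0.1% of benign traffic
# scores highly; a token equally common in both scores 0 and is dropped.
print(token_score(1.0, 0.001))   # ~2.7
print(token_score(1.0, 1.0))     # 0.0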
Example #2
            def sig_gen_cb(left, right):
                lsig = left['sig']
                rsig = right['sig']

                # tokenize if possible
                if not lsig and not rsig and self.tokenize_pairs:
                    lsig = pos_samples[left['samples'][0]]
                    rsig = pos_samples[right['samples'][0]]
                    (lsig, rsig) = self._tokenize_samples([lsig, rsig])
                else:
                    if lsig:
                        lsig = lsig.lcs
                    else:
                        lsig = list(pos_samples[left['samples'][0]])
                    if rsig:
                        rsig = rsig.lcs
                    else:
                        rsig = list(pos_samples[right['samples'][0]])

                # find the common subsequence
                lcs = self._find_lcs(lsig, rsig)
                t = self._lcs_to_tuple(lcs)
                sig = TupleSig(lcs, t)
                # print(self._lcs_to_regex(sig))

                # calculate a score for the resulting signature
                scores = []
                for token in t:
                    # prob = sigprob.regex_prob(token, 1000, stats=self.statsfile)[-1]
                    prob = sig_gen.est_fpos_rate(token,
                                                 self.fpos_training_streams)
                    scores.append(-math.log(prob + 1e-300) / math.log(10))

                # Using all the token scores overly favors signatures
                # with many tokens; the current fix is to use only the
                # most distinctive tokens to calculate the score.
                if self.max_tokens_in_est:
                    scores.sort(reverse=True)
                    score = sum(scores[:self.max_tokens_in_est])
                else:
                    score = sum(scores)
                return (sig, score)
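
The max_tokens_in_est cutoff above is easy to check in isolation. A minimal sketch with made-up per-token false-positive rates and an assumed cutoff of 2, showing how truncation keeps a signature with a few strong tokens from being outscored by one with many weak tokens:

import math

# Hypothetical per-token false-positive rates for one candidate signature.
fpos_rates = [1e-6, 1e-4, 0.2, 0.3, 0.5]
max_tokens_in_est = 2          # assumed configuration value

# Per-token scores, as in the loop above (1e-300 guards against log(0)).
scores = [-math.log(p + 1e-300) / math.log(10) for p in fpos_rates]

# Summing every score would credit the three weak tokens too; keeping
# only the most distinctive ones yields ~10.0 instead of ~11.5.
scores.sort(reverse=True)
print(sum(scores[:max_tokens_in_est]))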