Example #1
    def test_select_ngrams_with_unicode_inputs(self):
        result = list(select_ngrams(
            x for x in [
                ('b', 'ä', 'c'),
                ('ä', 'ä', 'c'),
                ('e', 'ä', 'c'),
                ('b', 'f', 'ä'),
                ('g', 'c', 'd'),
            ]
        ))
        expected = [
            ('b', 'ä', 'c'),
            ('ä', 'ä', 'c'),
            ('e', 'ä', 'c'),
            ('b', 'f', 'ä'),
            ('g', 'c', 'd'),
        ]
        assert result == expected
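The test above feeds select_ngrams a generator of unicode token tuples and expects every ngram back, unchanged and in order. As a rough illustration of the idea behind such a selection (this sketch is hypothetical and is not the actual tokenize.select_ngrams implementation), a content-defined scheme can decide per ngram whether to keep it by hashing only the ngram's own tokens, so unicode strings such as 'ä' need no special handling:

# Hypothetical sketch only, not the real tokenize.select_ngrams: keep an ngram
# when its first or last token has the smallest hash within that ngram.
def select_ngrams_sketch(ngrams, with_pos=False):
    for pos, ngram in enumerate(ngrams):
        token_hashes = [hash(t) for t in ngram]
        if min(token_hashes) in (token_hashes[0], token_hashes[-1]):
            yield (pos, ngram) if with_pos else ngram

# Works identically whether it is fed a list or a generator:
sample = [('b', 'ä', 'c'), ('ä', 'ä', 'c'), ('g', 'c', 'd')]
print(list(select_ngrams_sketch(x for x in sample)))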
Example #2
    def _add_rules(
        self,
        rules,
        _legalese=common_license_words,
        _spdx_tokens=frozenset(),
        _license_tokens=frozenset(),
    ):
        """
        Add a list of Rule objects to the index and constructs optimized and
        immutable index structures.

        `_legalese` is a set of common license-specific words aka. legalese
        `_spdx_tokens` is a set of token strings used in SPDX license identifiers
        ``license_tokens`` is a set of "license" tokens used as start or end of a rule
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # initial dictionary mapping for known legalese tokens
        ########################################################################

        # FIXME: we should start enumerating at 1 below: token ids then become
        # valid "unichr" values, making it easier downstream when used in
        # automatons

        self.dictionary = dictionary = {
            ts: tid for tid, ts in enumerate(sorted(_legalese))
        }

        dictionary_get = dictionary.get

        self.len_legalese = len_legalese = len(dictionary)
        highest_tid = len_legalese - 1

        # Add SPDX key tokens to the dictionary
        # these are always treated as non-legalese. This may seem weird
        # but they are detected in expressions alright and some of their
        # tokens exist as rules too (e.g. GPL)
        ########################################################################
        for sts in sorted(_spdx_tokens):
            stid = dictionary_get(sts)
            if stid is None:
                # we have a never yet seen token, so we assign a new tokenid
                highest_tid += 1
                stid = highest_tid
                dictionary[sts] = stid

        self.rules_by_rid = rules_by_rid = list(rules)
        # ensure that rules are sorted
        rules_by_rid.sort()
        len_rules = len(rules_by_rid)

        # create index data structures
        # OPTIMIZATION: bind frequently used methods to the local scope for
        # index structures
        ########################################################################
        tids_by_rid_append = self.tids_by_rid.append

        false_positive_rids_add = self.false_positive_rids.add
        regular_rids_add = self.regular_rids.add
        approx_matchable_rids_add = self.approx_matchable_rids.add

        # since we only use these for regular rules, these lists may be sparse.
        # their index is the rule rid
        self.high_postings_by_rid = high_postings_by_rid = [None] * len_rules
        self.sets_by_rid = sets_by_rid = [None] * len_rules
        self.msets_by_rid = msets_by_rid = [None] * len_rules

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # create a set of known "license" words used to determine if a rule
        # starts or ends with a "license" word/token
        ########################################################################
        license_tokens = set()
        for t in _license_tokens:
            tid = dictionary_get(t)
            if tid is not None:
                license_tokens.add(tid)

        rules_automaton_add = partial(match_aho.add_sequence,
            automaton=self.rules_automaton, with_duplicates=False)

        if USE_AHO_FRAGMENTS:
            fragments_automaton_add = partial(
                match_aho.add_sequence,
                automaton=self.fragments_automaton,
                with_duplicates=True,
            )

        if USE_RULE_STARTS:
            starts_automaton_add_start = partial(
                match_aho.add_start,
                automaton=self.starts_automaton,
            )

        # OPTIMIZED: bind frequently used objects to local scope
        rid_by_hash = self.rid_by_hash
        match_hash_index_hash = match_hash.index_hash
        match_set_tids_set_counter = match_set.tids_set_counter
        match_set_multiset_counter = match_set.multiset_counter

        len_starts = SMALL_RULE
        min_len_starts = SMALL_RULE * 6

        ngram_len = AHO_FRAGMENTS_NGRAM_LEN

        # Index each rule
        ########################################################################
        for rid, rule in enumerate(rules_by_rid):

            # assign rid
            rule.rid = rid

            rule_token_ids = array('h', [])
            tids_by_rid_append(rule_token_ids)
            rule_token_ids_append = rule_token_ids.append

            rule_tokens = []
            rule_tokens_append = rule_tokens.append

            # A rule is weak if it does not contain at least one legalese word:
            # we consider all rules to be weak until proven otherwise below.
            # "weak" rules can only be matched with an automaton.
            is_weak = True

            for rts in rule.tokens():
                rule_tokens_append(rts)
                rtid = dictionary_get(rts)
                if rtid is None:
                    # we have a never yet seen token, so we assign a new tokenid
                    # note: we could use the length of the dictionary instead
                    highest_tid += 1
                    rtid = highest_tid
                    dictionary[rts] = rtid
                if is_weak and rtid < len_legalese:
                    is_weak = False

                rule_token_ids_append(rtid)

            rule_length = rule.length
            is_tiny = rule_length < TINY_RULE

            # build hashes index and check for duplicate rule texts
            rule_hash = match_hash_index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            ####################
            # populate automaton with the whole rule tokens sequence, for all
            # RULEs, be they "standard"/regular, weak, false positive or small
            ####################
            rules_automaton_add(tids=rule_token_ids, rid=rid)

            if rule.is_false_positive:
                # False positive rules do not participate in the set or sequence
                # matching at all: they are used for exact matching and in post-
                # matching filtering
                false_positive_rids_add(rid)
                continue

            # from now on, we have regular rules
            rid_by_hash[rule_hash] = rid
            regular_rids_add(rid)

            # Does the rule start or end with a "license" word? We track this
            # to help disambiguate some overlapping false positive short rules
            # OPTIMIZED: the last rtid above IS the last token id
            if license_tokens:
                if rtid in license_tokens:
                    rule.ends_with_license = True
                if rule_token_ids[0] in license_tokens:
                    rule.starts_with_license = True

            # populate unknown_automaton that only makes sense for rules that
            # are also sequence matchable.
            ####################
            match_unknown.add_ngrams(
                automaton=self.unknown_automaton,
                tids=rule_token_ids,
                tokens=rule_tokens,
                len_legalese=len_legalese,
                rule_length=rule_length,
            )

            # Some rules cannot be matched as a sequence: they are either
            # "weak" rules or rules that must be matched only as a continuous
            # sequence of tokens. This includes tiny, is_continuous and small
            # is_license_reference or is_license_tag rules. We skip adding
            # these to the data structures used for sequence matching.
            can_match_as_sequence = not (
                is_weak
                or is_tiny
                or rule.is_continuous
                or (rule.is_small
                    and (rule.is_license_reference or rule.is_license_tag))
            )

            if can_match_as_sequence:
                approx_matchable_rids_add(rid)

                ####################
                # update high postings: positions by high tids used to
                # speed up sequence matching
                ####################
                # no postings for rules that cannot be matched as a sequence (too short and weak)
                # TODO: this could be optimized with a group_by
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid < len_legalese:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {tid: array('h', value) for tid, value in postings.items()}
                high_postings_by_rid[rid] = postings

                ####################
                # ... and ngram fragments: compute ngrams and populate an automaton with ngrams
                ####################
                if (USE_AHO_FRAGMENTS
                    and rule.minimum_coverage < 100
                    and rule_length > ngram_len
                ):
                    all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=ngram_len)
                    all_ngrams_with_pos = tokenize.select_ngrams(all_ngrams, with_pos=True)
                    # all_ngrams_with_pos = enumerate(all_ngrams)
                    for pos, ngram in all_ngrams_with_pos:
                        fragments_automaton_add(tids=ngram, rid=rid, start=pos)

                ####################
                # use the start and end of this rule as a break point for query runs
                ####################
                if USE_RULE_STARTS and rule_length > min_len_starts:
                    starts_automaton_add_start(
                        tids=rule_token_ids[:len_starts],
                        rule_identifier=rule.identifier,
                        rule_length=rule_length,
                    )

            ####################
            # build sets and multisets indexes, for all regular rules as we need
            # the thresholds
            ####################
            tids_set, mset = match_set.build_set_and_mset(
                rule_token_ids, _use_bigrams=USE_BIGRAM_MULTISETS)
            sets_by_rid[rid] = tids_set
            msets_by_rid[rid] = mset

            ####################################################################
            ####################################################################
            # FIXME!!!!!!! we should store them: we need them and we recompute
            # them later at match time
            tids_set_high = match_set.high_tids_set_subset(
                tids_set, len_legalese)
            mset_high = match_set.high_multiset_subset(
                mset, len_legalese, _use_bigrams=USE_BIGRAM_MULTISETS)

            # FIXME!!!!!!!
            ####################################################################
            ####################################################################

            ####################
            # update rule thresholds
            ####################
            rule.length_unique = match_set_tids_set_counter(tids_set)
            rule.high_length_unique = match_set_tids_set_counter(tids_set_high)

            rule.high_length = match_set_multiset_counter(mset_high)
            rule.compute_thresholds()

        ########################################################################
        # Finalize index data structures
        ########################################################################
        # Create the tid -> token string lookup structure.
        ########################################################################
        self.tokens_by_tid = tokens_by_tid = [
            ts for ts, _tid in sorted(dictionary.items(), key=itemgetter(1))]
        self.len_tokens = len_tokens = len(tokens_by_tid)

        # some tokens are made entirely of digits and these can create some
        # worst-case behavior when there are long runs of these
        ########################################################################
        self.digit_only_tids = intbitset([
            i for i, s in enumerate(self.tokens_by_tid) if s.isdigit()])

        # Finalize automatons
        ########################################################################
        self.rules_automaton.make_automaton()
        if USE_AHO_FRAGMENTS:
            self.fragments_automaton.make_automaton()
        if USE_RULE_STARTS:
            match_aho.finalize_starts(self.starts_automaton)
        self.unknown_automaton.make_automaton()

        ########################################################################
        # Do some sanity checks
        ########################################################################

        msg = 'Inconsistent structure lengths'
        assert len_tokens == highest_tid + 1 == len(dictionary), msg

        msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
        assert len_tokens <= MAX_TOKENS, msg

        dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
        if dupe_rules:
            dupe_rule_paths = [
                '\n'.join(sorted(
                    ('file://' + rule.text_file)
                    if rule.text_file
                    else ('text: ' + rule.stored_text)
                    for rule in rules
                ))
                for rules in dupe_rules
            ]
            msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
            raise AssertionError(msg)

        self.optimized = True
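A notable difference in this version is the legalese-first token id scheme: the sorted legalese words get the lowest token ids, every later token gets the next free id, and the single comparison rtid < len_legalese is enough to recognize legalese ("high") tokens and to decide whether a rule is weak. A compact sketch of that scheme, using a made-up legalese set rather than the real common_license_words:

# Sketch of the legalese-first token id assignment used above (made-up words).
_legalese = {'license', 'copyright', 'warranty'}

dictionary = {ts: tid for tid, ts in enumerate(sorted(_legalese))}
len_legalese = len(dictionary)

highest_tid = len_legalese - 1
for token in ['the', 'gpl', 'license', 'any']:
    if token not in dictionary:
        highest_tid += 1
        dictionary[token] = highest_tid

# A rule is "weak" when none of its token ids falls below len_legalese.
rule_tokens = ['the', 'gpl', 'any']
is_weak = all(dictionary[t] >= len_legalese for t in rule_tokens)
print(dictionary)
print(is_weak)  # True: the rule contains no legalese token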
Example #3
    def _add_rules(self,
                   rules,
                   _ranked_tokens=global_tokens_by_ranks,
                   _spdx_tokens=None):
        """
        Add a list of Rule objects to the index and construct optimized and
        immutable index structures.

        `_spdx_tokens`, if provided, is a set of token strings from known SPDX
        keys: these receive special treatment.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # this assigns the rule ids implicitly: this is the index in the list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule tokens strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjuncted sets of rids
            if rul.is_false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
            elif rul.is_negative:
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Add SPDX key tokens to the dictionary. Track which are only from SPDX keys
        ########################################################################
        spdx_tokens = None
        if _spdx_tokens:
            spdx_tokens = _spdx_tokens.difference(frequencies_by_token)
            frequencies_by_token.update(_spdx_tokens)

        # Create the tokens lookup structure at once. Note that tokens ids are
        # assigned randomly here at first by unzipping: we get the frequencies
        # and tokens->id at once this way
        ########################################################################
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
        assert len_tokens <= MAX_TOKENS, msg

        # initial dictionary mapping to old/arbitrary token ids
        ########################################################################
        self.dictionary = dictionary = {
            ts: tid
            for tid, ts in enumerate(tokens_by_tid)
        }
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) integer ids
        ########################################################################
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                            for rule_tok in token_strings_by_rid]

        # Get SPDX-only token ids
        ########################################################################
        spdx_token_ids = None
        if spdx_tokens:
            spdx_token_ids = set(dictionary[tok] for tok in spdx_tokens)

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
        renumbered = self.renumber_token_ids(frequencies_by_tid,
                                             _ranked_tokens,
                                             _spdx_token_ids=spdx_token_ids)
        (
            self.len_junk,
            self.dictionary,
            self.tokens_by_tid,
            self.tids_by_rid,
            self.weak_rids,
        ) = renumbered

        len_junk, dictionary, tokens_by_tid, tids_by_rid, weak_rids = renumbered

        #######################################################################
        # build index structures
        #######################################################################
        self.len_good = len_good = len_tokens - len_junk
        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence,
                                         automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence,
                                      automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build hashes index and check for duplicate rule texts
            rule_hash = match_hash.index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            rule_is_weak = rid in weak_rids

            if rule.is_negative:
                negative_automaton_add(tids=rule_token_ids, rid=rid)
            else:
                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by

                # FIXME: we do not want to keep small rules and rules that
                # cannot be sequence-matched in the index

                # no postings for junk-only (weak) rules
                if not rule_is_weak:
                    postings = defaultdict(list)
                    for pos, tid in enumerate(rule_token_ids):
                        if tid >= len_junk:
                            postings[tid].append(pos)
                    # OPTIMIZED: for speed and memory: convert postings to arrays
                    postings = {
                        tid: array('h', value)
                        for tid, value in postings.items()
                    }
                    # OPTIMIZED: for speed, sparsify dict
                    sparsify(postings)
                    self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(
                    rule_token_ids, len_junk, len_good)

                # no set indexes for junk only rules
                if not rule_is_weak:
                    self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                    self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate automaton with the whole rule tokens sequence
                rules_automaton_add(tids=rule_token_ids, rid=rid)
                # ... and ngrams: compute ngrams and populate the automaton with ngrams
                if (USE_AHO_FRAGMENTS and rule.minimum_coverage < 100
                        and len(rule_token_ids) > NGRAM_LEN):
                    all_ngrams = tokenize.ngrams(rule_token_ids,
                                                 ngram_length=NGRAM_LEN)
                    selected_ngrams = tokenize.select_ngrams(all_ngrams,
                                                             with_pos=True)
                    for pos, ngram in selected_ngrams:
                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # FIXME: this may not be updated for a rule that is created at
                # match time such as SPDX rules

                # update rule thresholds
                rule.low_unique = match_set.tids_set_counter(rlow_set)
                rule.high_unique = match_set.tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = match_set.tids_multiset_counter(rlow_mset)
                rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)

        dupe_rules = [
            rules for rules in dupe_rules_by_hash.values() if len(rules) > 1
        ]
        if dupe_rules:
            dupe_rule_paths = [
                '\n'.join(
                    sorted([('file://' + rule.text_file) if rule.text_file else
                            ('text: ' + rule.stored_text) for rule in rules]))
                for rules in dupe_rules
            ]
            msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
            raise AssertionError(msg)

        self.optimized = True
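This version, like the previous one, refuses to index duplicate rule texts: each rule's token id sequence is hashed, rules are grouped by that hash in dupe_rules_by_hash, and any group with more than one member is reported in a single AssertionError at the end. A small sketch of that pattern, with a stand-in digest in place of match_hash.index_hash:

# Sketch of the duplicate-rule check; index_hash_sketch is a hypothetical
# stand-in for match_hash.index_hash (any stable digest of the token ids works).
from collections import defaultdict

def index_hash_sketch(token_ids):
    return hash(tuple(token_ids))

rules = {'rule-a.RULE': [5, 9, 2], 'rule-b.RULE': [5, 9, 2], 'rule-c.RULE': [7, 1]}

dupe_rules_by_hash = defaultdict(list)
for name, token_ids in rules.items():
    dupe_rules_by_hash[index_hash_sketch(token_ids)].append(name)

dupes = [group for group in dupe_rules_by_hash.values() if len(group) > 1]
if dupes:
    print('Duplicate rules:', dupes)  # the indexer raises AssertionError here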
Example #4
    def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
        """
        Add a list of Rule objects to the index and construct optimized and
        immutable index structures.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # this assigns the rule ids implicitly: this is the index in the list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule tokens strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjuncted sets of rids
            rul_len = rul.length
            if rul.false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
                if rul_len > self.largest_false_positive_length:
                    self.largest_false_positive_length = rul_len
            elif rul.negative():
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Create the tokens lookup structure at once. Note that tokens ids are
        # assigned randomly here at first by unzipping: we get the frequencies
        # and tokens->id at once this way
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

        # initial dictionary mapping to old/random token ids
        self.dictionary = dictionary = {
            ts: tid
            for tid, ts in enumerate(tokens_by_tid)
        }
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) random integer ids
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                            for rule_tok in token_strings_by_rid]

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
        renumbered = self.renumber_token_ids(frequencies_by_tid,
                                             _ranked_tokens)
        self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
        len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
        self.len_good = len_good = len_tokens - len_junk

        #######################################################################
        # build index structures
        #######################################################################

        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence,
                                         automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence,
                                      automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build hashes index and check for duplicate rule texts
            rule_hash = index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            if rule.false_positive:
                # FP rules are not used for any matching
                # there is nothing else for these rules
                self.false_positive_rid_by_hash[rule_hash] = rid
            else:
                # negative, small and regular

                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid >= len_junk:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {
                    tid: array('h', value)
                    for tid, value in postings.items()
                }
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = index_token_sets(
                    rule_token_ids, len_junk, len_good)
                self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate automatons...
                if rule.negative():
                    # ... with only the whole rule tokens sequence
                    negative_automaton_add(tids=rule_token_ids, rid=rid)
                else:
                    # ... or with the whole rule tokens sequence
                    rules_automaton_add(tids=rule_token_ids, rid=rid)
                    # ... and ngrams: compute ngrams and populate the automaton with ngrams
                    if (USE_AHO_FRAGMENTS and rule.minimum_coverage < 100
                            and len(rule_token_ids) > NGRAM_LEN):
                        all_ngrams = ngrams(rule_token_ids,
                                            ngram_length=NGRAM_LEN)
                        selected_ngrams = select_ngrams(all_ngrams,
                                                        with_pos=True)
                        for pos, ngram in selected_ngrams:
                            rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # update rule thresholds
                rule.low_unique = tids_set_counter(rlow_set)
                rule.high_unique = tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = tids_multiset_counter(rlow_mset)
                rule.high_length = tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)
        sparsify(self.false_positive_rid_by_hash)

        dupe_rules = [
            rules for rules in dupe_rules_by_hash.values() if len(rules) > 1
        ]
        if dupe_rules:
            dupe_rule_paths = [['file://' + rule.text_file for rule in rules]
                               for rules in dupe_rules]
            msg = (u'Duplicate rules: \n' +
                   u'\n'.join(map(repr, dupe_rule_paths)))
            raise AssertionError(msg)

        self.optimized = True
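The high-postings index built in the loop above is an inverted index scoped to a single rule: for every "high" token id (here, tid >= len_junk) it records the positions where that token occurs, then converts the position lists to array('h', ...) to keep them compact. In isolation, with made-up token ids, the construction looks like this:

# Sketch of the per-rule high postings structure (made-up token ids and boundary).
from array import array
from collections import defaultdict

len_junk = 3  # hypothetical boundary between junk and good token ids
rule_token_ids = [1, 4, 4, 2, 5, 4]

postings = defaultdict(list)
for pos, tid in enumerate(rule_token_ids):
    if tid >= len_junk:
        postings[tid].append(pos)

# convert to compact signed-short arrays, as the indexer does
postings = {tid: array('h', positions) for tid, positions in postings.items()}
print(postings)  # {4: array('h', [1, 2, 5]), 5: array('h', [4])}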
Example #5
    def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
        """
        Add a list of Rule objects to the index and construct optimized and
        immutable index structures.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # this assigns the rule ids implicitly: this is the index in the list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule tokens strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjuncted sets of rids
            if rul.false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
            elif rul.negative:
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Create the tokens lookup structure at once. Note that tokens ids are
        # assigned randomly here at first by unzipping: we get the frequencies
        # and tokens->id at once this way
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

        # initial dictionary mapping to old/random token ids
        self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) random integer ids
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok] for rule_tok in token_strings_by_rid]

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
        renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
        self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
        len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
        self.len_good = len_good = len_tokens - len_junk

        #######################################################################
        # build index structures
        #######################################################################

        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build hashes index and check for duplicate rule texts
            rule_hash = match_hash.index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            if rule.negative:
                negative_automaton_add(tids=rule_token_ids, rid=rid)

            else:
                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid >= len_junk:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {tid: array('h', value) for tid, value in postings.items()}
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(rule_token_ids, len_junk, len_good)
                self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate automaton with the whole rule tokens sequence
                rules_automaton_add(tids=rule_token_ids, rid=rid)
                # ... and ngrams: compute ngrams and populate the automaton with ngrams
                if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
                    all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                    selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
                    for pos, ngram in selected_ngrams:
                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # update rule thresholds
                rule.low_unique = match_set.tids_set_counter(rlow_set)
                rule.high_unique = match_set.tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = match_set.tids_multiset_counter(rlow_mset)
                rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)

        dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
        if dupe_rules:
            dupe_rule_paths = [['file://' + rule.text_file for rule in rules] for rules in dupe_rules]
            msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
            raise AssertionError(msg)

        self.optimized = True
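The thresholds set at the end of the loop summarize each rule as counts over its low (junk) and high (good) token ids: unique token counts from the sets, total occurrence counts from the multisets, plus the sanity check that the two lengths add up to the rule length. A plain-Python sketch of what those counters compute, with made-up token ids in place of match_set.index_token_sets and its counters:

# Sketch of the low/high set and multiset counts (made-up ids and boundary).
from collections import Counter

len_junk = 3  # hypothetical junk/good boundary
rule_token_ids = [1, 4, 4, 2, 5, 4]

low = [tid for tid in rule_token_ids if tid < len_junk]
high = [tid for tid in rule_token_ids if tid >= len_junk]

low_unique, high_unique = len(set(low)), len(set(high))
low_length, high_length = sum(Counter(low).values()), sum(Counter(high).values())

assert len(rule_token_ids) == low_length + high_length
print(low_unique, high_unique, low_length, high_length)  # 2 2 2 4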