def index_token_sets(token_ids, len_junk, len_good):
    """
    Return a 4-tuple of (low tids set, high tids set, low tids multiset, high
    tids multiset) computed from the `token_ids` sequence of token ids.

    A token id smaller than `len_junk` goes to the "low" structures and any
    other non-negative token id goes to the "high" structures. Sets are
    intbitsets built from `len_junk` and `len_good`; multisets are mappings of
    token id -> number of occurrences.
    """
    # this variant uses intbitset to evaluate its performance wrt. bitarray
    low_set = intbitset(len_junk)
    high_set = intbitset(len_good)

    # For multisets, we use a defaultdict rather than a Counter: this is
    # mildly faster than a Counter for sparse sets.
    low_counts = defaultdict(int)
    high_counts = defaultdict(int)

    for token_id in token_ids:
        # this skips unknown token ids that are -1 as well as possible None
        if token_id < 0:
            continue

        # pick the low or high pair of structures, then update both at once
        if token_id < len_junk:
            counts, members = low_counts, low_set
        else:
            counts, members = high_counts, high_set

        counts[token_id] += 1
        members.add(token_id)

    # sparsify for speed
    sparsify(low_counts)
    sparsify(high_counts)

    return low_set, high_set, low_counts, high_counts
def index_token_bitsets(token_ids, len_junk, len_good):
    """
    Return a 4-tuple of (low tids set, high tids set, low tids multiset, high
    tids multiset) computed from the `token_ids` sequence of token ids.

    The two sets are slices of a single bitarray of `len_junk + len_good`
    bits: the first `len_junk` bits are the low set and the remaining bits are
    the high set. Multisets are mappings of token id -> number of occurrences.
    """
    # one bit per token id, all initially unset
    all_bits = bitarray([0] * (len_good + len_junk))

    # For multisets, we use a defaultdict rather than a Counter. This is
    # mildly faster than a Counter for the common case of rather sparse sets.
    low_counts = defaultdict(int)
    high_counts = defaultdict(int)

    for token_id in token_ids:
        # this skips unknown token ids that are -1 as well as possible None
        if token_id < 0:
            continue

        all_bits[token_id] = True

        counts = low_counts if token_id < len_junk else high_counts
        counts[token_id] += 1

    # sparsify for speed
    sparsify(low_counts)
    sparsify(high_counts)

    # split the single bitarray at the junk boundary
    return all_bits[:len_junk], all_bits[len_junk:], low_counts, high_counts
def loads(saved):
    """
    Return a LicenseIndex unpickled from the `saved` pickled string.

    NOTE(review): unpickling can execute arbitrary code: `saved` must come
    from a trusted source only.
    """
    index = cPickle.loads(saved)

    # perform some optimizations on the dictionaries
    sparsify(index.dictionary)

    return index
def load(fn, fast=True):
    """
    Return a LicenseIndex loaded from the `fn` file-like object pickled index.

    Use cPickle if `fast` is True and the plain pickle module otherwise.

    NOTE(review): unpickling can execute arbitrary code: `fn` must come from a
    trusted source only.
    """
    if fast:
        unpickler = cPickle
    else:
        unpickler = pickle

    index = unpickler.load(fn)

    # perform some optimizations on the dictionaries
    sparsify(index.dictionary)

    return index
def loads(saved, fast=True):
    """
    Return a LicenseIndex unpickled from the `saved` pickled string.

    Use cPickle if `fast` is True and the plain pickle module otherwise.

    NOTE(review): unpickling can execute arbitrary code: `saved` must come
    from a trusted source only.
    """
    if fast:
        unpickler = cPickle
    else:
        unpickler = pickle

    index = unpickler.loads(saved)

    # perform some optimizations on the dictionaries
    sparsify(index.dictionary)

    return index
def build_set_and_tids_mset(token_ids):
    """
    Return a tuple of (tids set, tids multiset) given a `token_ids` sequence
    of token ids.

    The multiset is a mapping of token id -> number of occurrences and the
    set is an intbitset built from the multiset keys.
    """
    occurrences = defaultdict(int)

    for token_id in token_ids:
        # token ids that are -1 are already matched: they are not counted
        if token_id != -1:
            occurrences[token_id] += 1

    # OPTIMIZED: sparsify for speed
    sparsify(occurrences)

    unique_tids = intbitset(occurrences.keys())
    return unique_tids, occurrences
def build_set_and_bigrams_mset(token_ids):
    """
    Return a tuple of (tids set, bigrams multiset) given a `token_ids`
    sequence of token ids.

    The multiset is a mapping of bigram (as produced by `ngrams(token_ids, 2)`)
    -> number of occurrences. The set is an intbitset of the token ids found
    in the counted bigrams.
    """
    unique_tids = intbitset()
    bigram_counts = defaultdict(int)

    for pair in ngrams(token_ids, 2):
        # a bigram that contains an already matched token id (-1) is ignored
        # entirely: neither counted nor contributing to the set
        if -1 not in pair:
            bigram_counts[pair] += 1
            unique_tids.update(pair)

    # OPTIMIZED: sparsify for speed
    sparsify(bigram_counts)

    return unique_tids, bigram_counts
def dumps(self, fast=True):
    """
    Return a pickled string of self.

    Use cPickle if `fast` is True and the plain pickle module otherwise. The
    pickled string is post-processed with pickletools.optimize().
    """
    # Original note: here cPickle fails when we load it back. Pickle is slower
    # to write but works when we read with cPickle :|
    if fast:
        pickler = cPickle
    else:
        pickler = pickle

    # the protocol is always the cPickle highest one, for both picklers
    raw_pickle = pickler.dumps(self, protocol=cPickle.HIGHEST_PROTOCOL)

    # NB: this is making the usage of cPickle possible... as a weird
    # workaround. The gain from dumping using cPickle is not as big with this
    # optimize but still much faster than using the plain pickle module.
    # TODO: revisit me after the Python3 port
    import pickletools

    # this rebinds a pickletools module-level mapping to whatever sparsify()
    # returns: it is a process-wide side effect
    pickletools.code2op = sparsify(pickletools.code2op)

    return pickletools.optimize(raw_pickle)
def loads(saved):
    """
    Return a LicenseIndex unpickled from the `saved` pickled string.

    The index dictionary and each of the per-rule postings and start ngrams
    mappings are sparsified after loading.

    NOTE(review): unpickling can execute arbitrary code: `saved` must come
    from a trusted source only.
    """
    index = cPickle.loads(saved)

    # perform some optimizations on dictionaries
    sparsify(index.dictionary)

    for rule_postings in index.postings_by_rid:
        sparsify(rule_postings)

    for rule_start_ngrams in index.start_ngrams_by_rid:
        sparsify(rule_start_ngrams)

    return index
def _add_rules(self, rules, _legalese=common_license_words, _spdx_tokens=frozenset()):
    """
    Add a list of Rule objects to the index and constructs optimized and
    immutable index structures.

    `_legalese` is a set of common license-specific words aka. legalese
    `_spdx_tokens` is a set of token strings used in SPDX license identifiers

    Token ids are assigned as follows: legalese tokens get the lowest ids (in
    sorted order), then SPDX tokens, then any other token in order of first
    appearance in the rules.

    Raise an Exception if the index is already optimized. Raise an
    AssertionError if the final structures are inconsistent, if there are
    more than MAX_TOKENS tokens or if some rules are duplicated.

    NOTE(review): the nesting of this function was reconstructed from a
    flattened source. In particular the postings, fragments and rule starts
    steps are assumed to apply only to rules that are not weak: confirm
    against the reference indentation.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # initial dictionary mapping for known legalese tokens
    ########################################################################

    # FIXME: we should start at 1, and ids are become valid unichr values
    self.dictionary = dictionary = {
        ts: tid for tid, ts in enumerate(sorted(_legalese))}
    dictionary_get = dictionary.get

    self.len_legalese = len_legalese = len(dictionary)
    highest_tid = len_legalese - 1

    # Add SPDX key tokens to the dictionary
    # these are always treated as non-legalese
    ########################################################################
    for sts in _spdx_tokens:
        stid = dictionary_get(sts)
        if stid is None:
            # we have a never yet seen token, so we assign a new tokenid
            highest_tid += 1
            stid = highest_tid
            dictionary[sts] = stid

    # OPTIMIZED
    sparsify(dictionary)

    self.rules_by_rid = rules_by_rid = list(rules)
    len_rules = len(rules_by_rid)

    # create index data structures
    # OPTIMIZATION: bind frequently used methods to the local scope for index structures
    ########################################################################
    tids_by_rid_append = self.tids_by_rid.append

    false_positive_rids_add = self.false_positive_rids.add
    negative_rids_add = self.negative_rids.add
    regular_rids_add = self.regular_rids.add
    approx_matchable_rids_add = self.approx_matchable_rids.add

    # since we only use these for regular rules, these lists may be sparse.
    # their index is the rule rid
    self.high_postings_by_rid = high_postings_by_rid = [None] * len_rules
    self.sets_by_rid = sets_by_rid = [None] * len_rules
    self.msets_by_rid = msets_by_rid = [None] * len_rules

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build partials for methods that populate automatons
    negative_automaton_add = partial(
        match_aho.add_sequence,
        automaton=self.negative_automaton, with_duplicates=False)

    rules_automaton_add = partial(
        match_aho.add_sequence,
        automaton=self.rules_automaton, with_duplicates=False)

    if USE_AHO_FRAGMENTS:
        fragments_automaton_add = partial(
            match_aho.add_sequence,
            automaton=self.fragments_automaton, with_duplicates=True)

    if USE_RULE_STARTS:
        starts_automaton_add_start = partial(
            match_aho.add_start,
            automaton=self.starts_automaton)

    # OPTIMIZED: bind frequently used objects to local scope
    rid_by_hash = self.rid_by_hash
    match_hash_index_hash = match_hash.index_hash
    match_set_tids_set_counter = match_set.tids_set_counter
    match_set_multiset_counter = match_set.multiset_counter

    len_starts = SMALL_RULE
    min_len_starts = SMALL_RULE * 6

    ngram_len = AHO_FRAGMENTS_NGRAM_LEN

    # Index each rule
    ########################################################################
    for rid, rule in enumerate(rules_by_rid):

        # assign rid
        rule.rid = rid

        rule_token_ids = array('h', [])
        tids_by_rid_append(rule_token_ids)

        # A rule is weak if it does not contain at least one legalese word:
        # we consider all rules to be weak until proven otherwise below.
        # "weak" rules can only be matched with an automaton.
        is_weak = True

        for rts in rule.tokens():
            rtid = dictionary_get(rts)
            if rtid is None:
                # we have a never yet seen token, so we assign a new tokenid
                # note: we could use the length of the dictionary instead
                highest_tid += 1
                rtid = highest_tid
                dictionary[rts] = rtid
            if is_weak and rtid < len_legalese:
                is_weak = False

            rule_token_ids.append(rtid)

        # build hashes index and check for duplicates rule texts
        rule_hash = match_hash_index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        # classify rules and build disjuncted sets of rids
        if rule.is_negative:
            # negative rules are matched early and their tokens are only
            # exactly matched. When matched as a whole, their tokens are
            # removed from the token stream
            negative_rids_add(rid)
            negative_automaton_add(tids=rule_token_ids, rid=rid)
            continue

        ####################
        # populate automaton with the whole rule tokens sequence, for all
        # RULEs, be they "standard"/regular, weak, false positive or small
        # (but not negative)
        ####################
        rules_automaton_add(tids=rule_token_ids, rid=rid)

        if rule.is_false_positive:
            # False positive rules do not participate in the set or sequence
            # matching at all: they are used for exact matching and in post-
            # matching filtering
            false_positive_rids_add(rid)
            continue

        # from now on, we have regular rules
        rid_by_hash[rule_hash] = rid
        regular_rids_add(rid)

        # Some rules cannot be matched as a sequence are "weak" rules
        if not is_weak:
            approx_matchable_rids_add(rid)

            ####################
            # update high postings: positions by high tids used to
            # speed up sequence matching
            ####################
            # no postings for rules that cannot be matched as a sequence (too short and weak)
            # TODO: this could be optimized with a group_by
            postings = defaultdict(list)
            for pos, tid in enumerate(rule_token_ids):
                if tid < len_legalese:
                    postings[tid].append(pos)
            # OPTIMIZED: for speed and memory: convert postings to arrays
            postings = {tid: array('h', value) for tid, value in postings.items()}
            # OPTIMIZED: for speed, sparsify dict
            sparsify(postings)
            high_postings_by_rid[rid] = postings

            ####################
            # ... and ngram fragments: compute ngrams and populate an automaton with ngrams
            ####################
            if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and rule.length > ngram_len:
                all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=ngram_len)
                all_ngrams_with_pos = tokenize.select_ngrams(all_ngrams, with_pos=True)
                # all_ngrams_with_pos = enumerate(all_ngrams)
                for pos, ngram in all_ngrams_with_pos:
                    fragments_automaton_add(tids=ngram, rid=rid, start=pos)

            ####################
            # use the start and end of this rule as a break point for query runs
            ####################
            if USE_RULE_STARTS and rule.length > min_len_starts:
                starts_automaton_add_start(
                    tids=rule_token_ids[:len_starts],
                    rule_identifier=rule.identifier,
                    rule_length=rule.length)

        ####################
        # build sets and multisets indexes, for all regular rules as we need
        # the thresholds
        ####################
        tids_set, mset = match_set.build_set_and_mset(
            rule_token_ids, _use_bigrams=USE_BIGRAM_MULTISETS)
        sets_by_rid[rid] = tids_set
        msets_by_rid[rid] = mset

        ####################################################################
        # FIXME!!!!!!! we should store them: we need them and we recompute
        # them later at match time
        tids_set_high = match_set.high_tids_set_subset(
            tids_set, len_legalese)
        mset_high = match_set.high_multiset_subset(
            mset, len_legalese, _use_bigrams=USE_BIGRAM_MULTISETS)
        # FIXME!!!!!!!
        ####################################################################

        ####################
        # update rule thresholds
        ####################
        rule.length_unique = match_set_tids_set_counter(tids_set)
        rule.high_length_unique = match_set_tids_set_counter(tids_set_high)
        rule.high_length = match_set_multiset_counter(mset_high)
        rule.compute_thresholds()

    ########################################################################
    # Finalize index data structures
    ########################################################################

    # Create the tid -> token string lookup structure.
    # This MUST be built before anything that reads the token strings by id:
    # the dictionary is only complete once all the rules have been indexed.
    ########################################################################
    self.tokens_by_tid = tokens_by_tid = [
        ts for ts, _tid in sorted(dictionary.items(), key=itemgetter(1))]
    self.len_tokens = len_tokens = len(tokens_by_tid)

    # some tokens are made entirely of digits and these can create some
    # worst case behavior when there are long runs on these.
    # FIXED: this was previously computed from self.tokens_by_tid before that
    # attribute was rebuilt above, hence from its pre-existing content and
    # not from the tokens indexed by this call.
    ########################################################################
    self.digit_only_tids = intbitset([
        i for i, s in enumerate(tokens_by_tid) if s.isdigit()])

    # Finalize automatons
    ########################################################################
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()
    if USE_AHO_FRAGMENTS:
        self.fragments_automaton.make_automaton()
    if USE_RULE_STARTS:
        match_aho.finalize_starts(self.starts_automaton)

    # OPTIMIZED: sparser dicts for faster lookup
    sparsify(self.rid_by_hash)

    ########################################################################
    # Do some sanity checks
    ########################################################################

    msg = 'Inconsistent structure lengths'
    assert len_tokens == highest_tid + 1 == len(dictionary), msg

    msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
    assert len_tokens <= MAX_TOKENS, msg

    # report all the duplicated rules at once: rules with the same hash have
    # the same sequence of token ids
    dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
    if dupe_rules:
        dupe_rule_paths = [
            '\n'.join(
                sorted([
                    ('file://' + rule.text_file)
                    if rule.text_file
                    else ('text: ' + rule.stored_text)
                    for rule in rules])
                )
            for rules in dupe_rules
        ]
        msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
        raise AssertionError(msg)

    self.optimized = True
def renumber_token_ids(self, frequencies_by_old_tid, _ranked_tokens=global_tokens_by_ranks):
    """
    Return updated index structures with new token ids such that the most
    common tokens (aka. 'junk' or 'low' tokens) have the lowest ids.

    Return a tuple of (len_junk, dictionary, tokens_by_tid, tids_by_rid)
    - len_junk: the number of junk tokens such that all junk token ids are
      smaller than this number.
    - dictionary: mapping of token string->token id
    - tokens_by_tid: reverse mapping of token id->token string
    - tids_by_rid: mapping of rule id-> array of token ids

    The arguments all relate to old, temporary token ids and are :
    - frequencies_by_old_tid: mapping of token id-> occurrences across all rules
    - _ranked_tokens: callable returning a list of common lowercase token
      strings, ranked from most common to least common. Used only for
      testing and default to a global list.

    Junk tokens are computed from a curated list of frequent words such that
    they have the lowest token ids, all smaller than len_junk. The only token
    of a rule made of a single token is never added from that curated list.
    Rules made entirely of junk tokens are only reported when TRACE is on.
    """
    old_dictionary = self.dictionary
    tokens_by_old_tid = self.tokens_by_tid
    old_tids_by_rid = self.tids_by_rid

    # track tokens for rules with a single token: their token is never junk
    # otherwise they can never be detected
    rules_of_one = set(r.rid for r in self.rules_by_rid if r.length == 1)
    never_junk_old_tids = set(
        rule_tokens[0]
        for rid, rule_tokens in enumerate(old_tids_by_rid)
        if rid in rules_of_one)

    # create initial set of junk token ids
    junk_old_tids = set()
    junk_old_tids_add = junk_old_tids.add

    # Treat very common tokens composed only of digits or single chars as junk
    very_common_tids = set(
        old_tid for old_tid, token in enumerate(tokens_by_old_tid)
        if token.isdigit() or len(token) == 1)
    junk_old_tids.update(very_common_tids)

    # TODO: ensure common number as words are treated as very common
    # (one, two, and first, second, etc.)?

    # TODO: add and treat person and place names as always being JUNK

    # Build the candidate junk set as an approximate proportion of total tokens
    len_tokens = len(tokens_by_old_tid)
    junk_max = len_tokens // PROPORTION_OF_JUNK

    # Use a curated list of common tokens sorted by decreasing frequency as
    # the basis to determine junk status.
    old_dictionary_get = old_dictionary.get
    for token in _ranked_tokens():
        # stop when we reach the maximum junk proportion.
        # This uses >= and not ==: the set is pre-seeded above with the very
        # common tokens and may already be larger than junk_max, in which
        # case an equality test would never stop this loop.
        if len(junk_old_tids) >= junk_max:
            break
        old_tid = old_dictionary_get(token)
        if old_tid is not None and old_tid not in never_junk_old_tids:
            junk_old_tids_add(old_tid)

    len_junk = len(junk_old_tids)

    # Assemble our final set of good old token id
    good_old_tids = set(range(len_tokens)) - junk_old_tids
    assert len_tokens == len(junk_old_tids) + len(good_old_tids)

    # Sort the list of old token ids: junk before good, then by decreasing
    # frequencies, then old id.
    # This sort does the renumbering proper of old to new token ids
    def renumbering_key(old_tid):
        return (old_tid in good_old_tids, -frequencies_by_old_tid[old_tid], old_tid)

    new_to_old_tids = sorted(range(len_tokens), key=renumbering_key)

    # keep a mapping from old to new id used for renumbering index structures:
    # new_to_old_tids is a permutation of all the old ids, so this is its
    # inverse permutation
    old_to_new_tids = [None] * len_tokens
    for new_tid, old_tid in enumerate(new_to_old_tids):
        old_to_new_tids[old_tid] = new_tid

    # create the new ids -> tokens string mapping
    tokens_by_new_tid = [tokens_by_old_tid[old_tid] for old_tid in new_to_old_tids]

    # create the new dictionary tokens strings -> new id
    new_dictionary = {token: new_tid for new_tid, token in enumerate(tokens_by_new_tid)}
    sparsify(new_dictionary)

    # mapping of rule_id->new token_ids array
    new_tids_by_rid = [
        array('h', (old_to_new_tids[tid] for tid in old_tids))
        for old_tids in old_tids_by_rid]

    # Now do a few sanity checks...
    # By construction this should always be true
    assert set(tokens_by_new_tid) == set(tokens_by_old_tid)

    fatals = []
    for rid, new_tids in enumerate(new_tids_by_rid):
        # Check that no rule is all junk: this is a fatal indexing error
        if all(t < len_junk for t in new_tids):
            message = (
                'WARNING: Weak rule, made only of frequent junk tokens. Can only be matched exactly:',
                self.rules_by_rid[rid].identifier,
                u' '.join(tokens_by_new_tid[t] for t in new_tids)
            )
            fatals.append(u' '.join(message))

    if TRACE and fatals:
        # raise IndexError(u'\n'.join(fatals))
        print()
        print('############################################')
        # an explicit loop and not map(print, ...): map is lazy on Python 3
        # and would print nothing there
        for fatal in fatals:
            print(fatal)
        print('############################################')
        print()

    # TODO: Check that the junk count choice is correct: for instance using some
    # stats based on standard deviation or markov chains or similar
    # conditional probabilities such that we verify that we CANNOT create a
    # distinctive meaningful license string made entirely from junk tokens

    return len_junk, new_dictionary, tokens_by_new_tid, new_tids_by_rid
def renumber_token_ids(rules_tokens_ids, dictionary, tokens_by_tid, frequencies_by_tid, length=9, with_checks=True):
    """
    Return updated index structures with new token ids such that the most
    common aka. 'junk' tokens have the lowest ids.

    `rules_tokens_ids` is a mapping of rule_id->sequence of token ids
    `dictionary` is a mapping of token string->token id
    `tokens_by_tid` is a mapping of token id->token string
    `frequencies_by_tid` is a mapping of token id->frequency
    `length` is the ngram length used to classify rules as small or regular
    and to run the junk ngrams check.
    `with_checks` enables the sanity checks: they raise an AssertionError.

    These common tokens are based on a curated list of frequent words and
    further refined such that:
     - no rule text sequence of `length` tokens or less is composed entirely
       of these common tokens.
     - only a few rule text sub-sequences of `length` tokens (aka. ngrams)
       are composed entirely of these common tokens.

    The returned structures are a tuple of:
    - old_to_new: mapping of (old token id->new token id)
    - len_junk: the number of junk tokens: all junk token ids are smaller
      than this number.
    - dictionary (token string->token id)
    - tokens_by_tid (token id->token string)
    - frequencies_by_tid (token id->frequency)
    """
    # keep track of very common junk tokens: digits and single letters
    very_common = set()
    very_common_add = very_common.add
    string_lowercase = u'abcdefghijklmnopqrstuvwxyz'
    for tid, token in enumerate(tokens_by_tid):
        # DIGIT TOKENS: Treat tokens composed only of digits as common junk
        # SINGLE ASCII LETTER TOKENS: Treat single ASCII letter tokens as common junk
        # TODO: ensure common numbers as strings are always there (one, two, and first, second, etc.)
        if token.isdigit() or (len(token) == 1 and token in string_lowercase):
            very_common_add(tid)

    # keep track of good, "not junk" tokens
    good = set()
    good_update = good.update

    # Classify rules tokens as smaller or equal to `length` or regular.
    regular_rules = []
    regular_rules_append = regular_rules.append
    small_rules = []
    small_rules_append = small_rules.append

    for rid, rule_toks_ids in enumerate(rules_tokens_ids):
        len_toks = len(rule_toks_ids)
        if len_toks == 1:
            # RULES of ONE TOKEN: their token cannot be junk
            good_update(rule_toks_ids)
        if len_toks <= length:
            small_rules_append((rid, rule_toks_ids))
        else:
            regular_rules_append((rid, rule_toks_ids))

    # Build a candidate junk set of roughly ~ 1/10th the size of of tokens set:
    # we use a curated list of common words as a base. The final length (and
    # also biggest token id) of junk tokens set typically ~ 1200 for about 12K
    # tokens.
    # This uses a floor division to get an int whether true division is
    # enabled or not: a float would rarely if ever compare equal to a count.
    junk_max = abs((len(tokens_by_tid) // 11) - len(very_common))

    junk = set()
    junk_add = junk.add
    dictionary_get = dictionary.get
    junk_count = 0

    for token in global_tokens_by_ranks():
        tid = dictionary_get(token)
        if tid is None:
            continue

        if tid not in very_common and tid not in good:
            junk_add(tid)
            junk_count += 1

        # >= and not ==: the count may already be past a junk_max of zero
        # when it is first tested here
        if junk_count >= junk_max:
            break

    # Assemble our final junk and not junk sets
    final_junk = (very_common | junk) - good
    good = set(range(len(tokens_by_tid))) - final_junk

    if with_checks:
        # Now do a few sanity checks...
        # These are explicit tests raising an AssertionError rather than
        # assert statements, so that they also run with python -O.
        def tokens_str(_tks):
            return u' '.join(tokens_by_tid[_tk] for _tk in _tks)

        # Check that no small rule is made entirely of junk
        for rid, tokens in small_rules:
            if all(jt in final_junk for jt in tokens):
                # this is a serious index issue
                print('!!!License Index FATAL ERROR: small rule: ', rid , 'is all made of junk:', tokens_str(tokens))
                raise AssertionError()

        # Check that not too many ngrams are made entirely of junk
        # we build a set of ngrams for `length` over tokens of rules at equal or
        # bigger than length and check them all
        all_junk_ngrams_count = 0
        for rid, tokens in regular_rules:
            for ngram in ngrams(tokens, length):
                # skip ngrams composed only of common junk as not significant
                if all(nt in very_common for nt in ngram):
                    continue
                # note: we check only against junk, not final_junk
                if all(nt in junk for nt in ngram):
                    all_junk_ngrams_count += 1

        # TODO: test that the junk choice is correct: for instance using some
        # stats based on standard deviation or markov chains or similar
        # conditional probabilities such that we verify that CANNOT create a
        # distinctive meaningful license string made entirely from junk tokens

        # check that we do not have too many ngrams made entirely of junk
        if not all_junk_ngrams_count < (length * 20):
            raise AssertionError()

    # Sort each set of old token IDs by decreasing original frequencies, then
    # by decreasing old id
    def by_frequency_then_id(old_id):
        return frequencies_by_tid[old_id], old_id

    final_junk = sorted(final_junk, key=by_frequency_then_id, reverse=True)
    good = sorted(good, key=by_frequency_then_id, reverse=True)

    # create the new ids -> tokens value mapping
    new_tokens_by_tid = [tokens_by_tid[t] for t in final_junk + good]

    # sanity check: by construction this should always be true
    assert set(new_tokens_by_tid) == set(tokens_by_tid)

    # create new structures based on new ids and a mapping from old to new id
    len_tokens = len(new_tokens_by_tid)
    old_to_new = array('h', [0] * len_tokens)
    new_frequencies_by_tid = [None] * len_tokens
    new_dictionary = {}

    # assign new ids, re build dictionary, frequency
    for new_id, token in enumerate(new_tokens_by_tid):
        old_id = dictionary[token]
        old_to_new[old_id] = new_id

        new_dictionary[token] = new_id

        old_freq = frequencies_by_tid[old_id]
        new_frequencies_by_tid[new_id] = old_freq

    sparsify(new_dictionary)
    return old_to_new, len(final_junk), new_dictionary, new_tokens_by_tid, new_frequencies_by_tid
def _add_rules(self, rules, optimize=True, _ngram_length=NGRAM_LENGTH):
    """
    Add an iterable of Rule objects to the index as an optimized batch
    operation. This replaces any existing indexed rules previously added.

    When `optimize` is True, token ids are renumbered with
    renumber_token_ids() and the index is flagged as optimized. Otherwise
    (for testing only) token ids are kept as-is and the list of per-rule
    token ids is returned.

    Raise an Exception if the index is already optimized.

    NOTE(review): the nesting of this function was reconstructed from a
    flattened source: confirm against the reference indentation.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    rules = list(rules)
    len_rules = len(rules)

    # First pass: collect tokens, count frequencies and find unique tokens
    ######################################################################
    # unique tokens with their frequency, computed at once
    unique_tokens = Counter()

    # all the rule tokens, accumulated at once: the index is the rule id
    tokens_by_rid = []

    regular_rids = set()
    negative_rids = set()

    for rid, rule in enumerate(rules):
        # the rule id is its position in the rules list
        rule.rid = rid

        rids = negative_rids if rule.negative() else regular_rids
        rids.add(rid)

        rule_tokens = list(rule.tokens())
        tokens_by_rid.append(rule_tokens)
        unique_tokens.update(rule_tokens)

    # Create the tokens lookup structure at once.
    # Note that tokens ids are assigned randomly at first by unzipping we
    # get the frequencies and tokens->id at once.
    tokens_by_tid, frequencies_by_tid = izip(*sorted(unique_tokens.most_common()))
    dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
    # for speed
    sparsify(dictionary)

    # replace strings with token ids
    rules_tokens_ids = [
        [dictionary[tok] for tok in rule_tok] for rule_tok in tokens_by_rid]

    len_tokens = len(tokens_by_tid)

    # Second pass: Optimize token ids based on frequencies and common words
    #######################################################################
    if not optimize:
        # for testing only
        len_junk = 0
        # this becomes a noop mapping existing id to themselves
        old_to_new = range(len_tokens)
    else:
        # renumber tokens ids
        renumbered = renumber_token_ids(
            rules_tokens_ids, dictionary, tokens_by_tid, frequencies_by_tid)
        old_to_new, len_junk, dictionary, tokens_by_tid, frequencies_by_tid = renumbered

    # mapping of rule_id->new token_ids array: renumber old token ids to new
    new_rules_tokens_ids = [
        array('h', (old_to_new[tid] for tid in rule_token_ids))
        for rule_token_ids in rules_tokens_ids]

    # Third pass: build index structures
    ####################################
    # lists of bitvectors for high and low tokens, one per rule
    high_bitvectors_by_rid = [0] * len_rules
    low_bitvectors_by_rid = [0] * len_rules
    frequencies_by_rid = [0] * len_rules
    lengths_by_rid = array('h', [0] * len_rules)

    # nested inverted index by rule_id->token_id->[postings array]
    postings_by_rid = [defaultdict(list) for _r in rules]

    # mapping of rule_id -> mapping of starter ngrams -> [(start, end,), ...]
    start_ngrams_by_rid = [defaultdict(list) for _r in rules]

    # all-zeros bitvector with one bit per token, copied for each rule
    bv_template = bitarray([0] * len(tokens_by_tid))

    # build posting lists and other index structures
    for rid, new_rule_token_ids in enumerate(new_rules_tokens_ids):
        rid_postings = postings_by_rid[rid]
        tokens_frequency = Counter()

        # rule bitvector: index is the token id, 1 means token is present, and 0 absent
        tokens_occurrence = bv_template.copy()

        # loop through rules token (new) ids
        for pos, new_tid in enumerate(new_rule_token_ids):
            # append posting
            rid_postings[new_tid].append(pos)
            # count the token and set its bit to one in the bitvector
            # TODO: optimize: slice assignments could be faster?
            tokens_frequency[new_tid] += 1
            tokens_occurrence[new_tid] = 1

        sparsify(rid_postings)

        # split the rule bitvector in a high and a low bitvector at len_junk
        high_bitvectors_by_rid[rid] = tokens_occurrence[len_junk:]
        low_bitvectors_by_rid[rid] = tokens_occurrence[:len_junk]

        frequencies_by_rid[rid] = tokens_frequency
        lengths_by_rid[rid] = len(new_rule_token_ids)

        # collect starters
        rid_starters = start_ngrams_by_rid[rid]
        rule_gaps = rules[rid].gaps
        for starter_ngram, start in index_starters(new_rule_token_ids, rule_gaps, _ngram_length):
            rid_starters[starter_ngram].append(start)
        sparsify(rid_starters)

        # OPTIMIZED: for faster access to index: convert postings to arrays
        postings_by_rid[rid] = {
            key: array('h', value) for key, value in rid_postings.items()}

    # assign back the created index structure to self attributes
    self.postings_by_rid = postings_by_rid
    self.len_junk = len_junk
    self.len_tokens = len_tokens
    self.tokens_by_tid = tokens_by_tid
    self.frequencies_by_tid = frequencies_by_tid
    self.lengths_by_rid = lengths_by_rid
    self.dictionary = dictionary
    self.rules_by_rid = rules
    self.high_bitvectors_by_rid = high_bitvectors_by_rid
    self.low_bitvectors_by_rid = low_bitvectors_by_rid
    self.frequencies_by_rid = frequencies_by_rid
    self.tokens_by_rid = new_rules_tokens_ids
    self.start_ngrams_by_rid = start_ngrams_by_rid
    self.negative_rids = negative_rids
    self.regular_rids = regular_rids

    if not optimize:
        # for testing
        return rules_tokens_ids

    self.optimized = True
def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks, _spdx_tokens=None):
    """
    Add a list of Rule objects to the index and constructs optimized and
    immutable index structures.

    `_ranked_tokens` is passed as-is to renumber_token_ids().
    `_spdx_tokens` if provided is a set of token strings from known SPDX keys:
    these receive a special treatment.

    Raise an Exception if the index is already optimized. Raise an
    AssertionError if there are more than MAX_TOKENS tokens or if some rules
    are duplicated.

    NOTE(review): the nesting of this function was reconstructed from a
    flattened source: confirm against the reference indentation.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # this assigns the rule ids implicitly: this is the index in the list
    self.rules_by_rid = list(rules)

    #######################################################################
    # classify rules, collect tokens and frequencies
    #######################################################################
    # all the rule token strings by rule id. This is used only during indexing
    token_strings_by_rid = []

    # the unique token strings with their global frequency.
    # This is used only during indexing
    frequencies_by_token = Counter()

    for rid, rul in enumerate(self.rules_by_rid):
        rul_tokens = list(rul.tokens())
        token_strings_by_rid.append(rul_tokens)
        frequencies_by_token.update(rul_tokens)

        # assign the rid to the rule object for sanity
        rul.rid = rid

        # classify rules and build disjuncted sets of rids
        if rul.is_false_positive:
            # false positive rules do not participate in the matches at all
            # they are used only in post-matching filtering
            rids = self.false_positive_rids
        elif rul.is_negative:
            # negative rules are matched early and their exactly matched
            # tokens are removed from the token stream
            rids = self.negative_rids
        elif rul.small():
            # small rules are best matched with a specialized approach
            rids = self.small_rids
        else:
            # regular rules are matched using a common approach
            rids = self.regular_rids
        rids.add(rid)

    # Add SPDX key tokens to the dictionary. track which are only from SPDX keys
    ########################################################################
    spdx_tokens = None
    if _spdx_tokens:
        # the SPDX-only tokens must be collected before the counter update
        spdx_tokens = _spdx_tokens.difference(frequencies_by_token)
        frequencies_by_token.update(_spdx_tokens)

    # Create the tokens lookup structure at once. Note that tokens ids are
    # assigned randomly here at first by unzipping: we get the frequencies
    # and tokens->id at once this way
    ########################################################################
    tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
    self.tokens_by_tid = tokens_by_tid
    self.len_tokens = len_tokens = len(tokens_by_tid)

    msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
    assert len_tokens <= MAX_TOKENS, msg

    # initial dictionary mapping to old/arbitrary token ids
    ########################################################################
    self.dictionary = dictionary = {
        ts: tid for tid, ts in enumerate(tokens_by_tid)}
    sparsify(dictionary)

    # replace token strings with arbitrary (and temporary) integer ids
    ########################################################################
    self.tids_by_rid = [
        [dictionary[tok] for tok in rule_tok]
        for rule_tok in token_strings_by_rid]

    # Get SPDX-only token ids
    ########################################################################
    spdx_token_ids = None
    if spdx_tokens:
        spdx_token_ids = set(dictionary[tok] for tok in spdx_tokens)

    #######################################################################
    # renumber token ids based on frequencies and common words
    #######################################################################
    renumbered = self.renumber_token_ids(
        frequencies_by_tid, _ranked_tokens, _spdx_token_ids=spdx_token_ids)

    # the renumbered structures replace the temporary ones, both on self and
    # as locals used below
    len_junk, dictionary, tokens_by_tid, tids_by_rid, weak_rids = renumbered
    self.len_junk = len_junk
    self.dictionary = dictionary
    self.tokens_by_tid = tokens_by_tid
    self.tids_by_rid = tids_by_rid
    self.weak_rids = weak_rids

    #######################################################################
    # build index structures
    #######################################################################
    self.len_good = len_good = len_tokens - len_junk
    len_rules = len(self.rules_by_rid)

    # since we only use these for regular rules, these lists may be sparse
    # their index is the rule rid
    self.high_postings_by_rid = [None] * len_rules
    self.tids_sets_by_rid = [None] * len_rules
    self.tids_msets_by_rid = [None] * len_rules

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build closures for methods that populate automatons
    negative_automaton_add = partial(
        match_aho.add_sequence, automaton=self.negative_automaton)
    rules_automaton_add = partial(
        match_aho.add_sequence, automaton=self.rules_automaton)

    # build by-rule index structures over the token ids seq of each rule
    for rid, rule_token_ids in enumerate(tids_by_rid):
        rule = self.rules_by_rid[rid]

        # build hashes index and check for duplicates rule texts
        rule_hash = match_hash.index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        rule_is_weak = rid in weak_rids

        if rule.is_negative:
            # negative rules only go in their own automaton
            negative_automaton_add(tids=rule_token_ids, rid=rid)
            continue

        # update hashes index
        self.rid_by_hash[rule_hash] = rid

        # update high postings index: positions by high tids
        # TODO: this could be optimized with a group_by
        # FIXME: we do not want to keep small rules and rules that
        # cannot be seq matches in the index

        # no postings for junk only rules
        if not rule_is_weak:
            positions_by_tid = defaultdict(list)
            for pos, tid in enumerate(rule_token_ids):
                if tid >= len_junk:
                    positions_by_tid[tid].append(pos)
            # OPTIMIZED: for speed and memory: convert postings to arrays
            postings = {
                tid: array('h', value)
                for tid, value in positions_by_tid.items()}
            # OPTIMIZED: for speed, sparsify dict
            sparsify(postings)
            self.high_postings_by_rid[rid] = postings

        # build high and low tids sets and multisets
        token_sets = match_set.index_token_sets(rule_token_ids, len_junk, len_good)
        rlow_set, rhigh_set, rlow_mset, rhigh_mset = token_sets

        # no set indexes for junk only rules
        if not rule_is_weak:
            self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
            self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

        # populate automaton with the whole rule tokens sequence
        rules_automaton_add(tids=rule_token_ids, rid=rid)

        # ... and ngrams: compute ngrams and populate the automaton with ngrams
        with_fragments = (
            USE_AHO_FRAGMENTS
            and rule.minimum_coverage < 100
            and len(rule_token_ids) > NGRAM_LEN)
        if with_fragments:
            all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
            selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
            for pos, ngram in selected_ngrams:
                rules_automaton_add(tids=ngram, rid=rid, start=pos)

        # FIXME: this may not be updated for a rule that is created at
        # match time such as SPDX rules

        # update rule thresholds
        rule.low_unique = match_set.tids_set_counter(rlow_set)
        rule.high_unique = match_set.tids_set_counter(rhigh_set)
        rule.length_unique = rule.high_unique + rule.low_unique
        rule.low_length = match_set.tids_multiset_counter(rlow_mset)
        rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
        assert rule.length == rule.low_length + rule.high_length

    # finalize automatons
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()

    # sparser dicts for faster lookup
    sparsify(self.rid_by_hash)

    # report all the duplicated rules at once: one paragraph of sorted rule
    # locations for each group of rules sharing the same hash
    dupe_rule_paths = []
    for dupes in dupe_rules_by_hash.values():
        if len(dupes) > 1:
            locations = sorted([
                ('file://' + rule.text_file)
                if rule.text_file
                else ('text: ' + rule.stored_text)
                for rule in dupes])
            dupe_rule_paths.append('\n'.join(locations))

    if dupe_rule_paths:
        msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
        raise AssertionError(msg)

    self.optimized = True
def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
    """
    Add a list of Rule objects to the index and construct the optimized
    index structures.

    Arguments:
    - rules: an iterable of rule objects. Each rule is assigned a `rid`
      that is its position in this iterable.
    - _ranked_tokens: callable passed through unchanged to
      `renumber_token_ids`.

    Raise an Exception if the index has already been optimized. Raise an
    AssertionError if two or more rules have the same token ids hash,
    i.e. are duplicates: all duplicates are collected and reported at
    once.

    On success, the index is flagged as optimized and any later call
    raises.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # this assigns the rule ids implicitly: this is the index in the list
    self.rules_by_rid = list(rules)

    #######################################################################
    # classify rules, collect tokens and frequencies
    #######################################################################
    # accumulate all rule tokens strings. This is used only during indexing
    token_strings_by_rid = []
    # collect the unique token strings and compute their global frequency
    # This is used only during indexing
    frequencies_by_token = Counter()

    for rid, rul in enumerate(self.rules_by_rid):
        rul_tokens = list(rul.tokens())
        token_strings_by_rid.append(rul_tokens)
        frequencies_by_token.update(rul_tokens)
        # assign the rid to the rule object for sanity
        rul.rid = rid

        # classify rules and build disjoint sets of rids: the first matching
        # branch wins so a rid lands in exactly one of these sets
        rul_len = rul.length
        if rul.false_positive:
            # false positive rules do not participate in the matches at all
            # they are used only in post-matching filtering
            self.false_positive_rids.add(rid)
            # keep track of the length of the longest false positive rule
            if rul_len > self.largest_false_positive_length:
                self.largest_false_positive_length = rul_len
        elif rul.negative():
            # negative rules are matched early and their exactly matched
            # tokens are removed from the token stream
            self.negative_rids.add(rid)
        elif rul.small():
            # small rules are best matched with a specialized approach
            self.small_rids.add(rid)
        else:
            # regular rules are matched using a common approach
            self.regular_rids.add(rid)

    # Create the tokens lookup structure at once. Note that tokens ids are
    # assigned arbitrarily here at first by unzipping: we get the frequencies
    # and tokens->id at once this way
    # NOTE(review): unpacking the unzipped items into two names presumably
    # fails when no token was collected at all (e.g. no rules) -- confirm
    # that callers never index an empty rule set.
    tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
    self.tokens_by_tid = tokens_by_tid
    self.len_tokens = len_tokens = len(tokens_by_tid)
    assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

    # initial dictionary mapping token strings to old/arbitrary token ids
    self.dictionary = dictionary = {
        ts: tid for tid, ts in enumerate(tokens_by_tid)
    }
    sparsify(dictionary)

    # replace token strings with arbitrary (and temporary) integer ids
    self.tids_by_rid = [[dictionary[tok] for tok in rule_tok] for rule_tok in token_strings_by_rid]

    #######################################################################
    # renumber token ids based on frequencies and common words
    #######################################################################
    # renumber_token_ids reads the temporary self.dictionary,
    # self.tokens_by_tid and self.tids_by_rid set above: they are replaced
    # here by their renumbered versions.
    renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
    self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
    # also keep local names for the loop below
    len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
    self.len_good = len_good = len_tokens - len_junk

    #######################################################################
    # build index structures
    #######################################################################
    len_rules = len(self.rules_by_rid)
    # these lists may be sparse: entries stay None for false positive rules
    # that are skipped in the loop below. Their index is the rule rid
    self.high_postings_by_rid = [None for _ in range(len_rules)]
    self.tids_sets_by_rid = [None for _ in range(len_rules)]
    self.tids_msets_by_rid = [None for _ in range(len_rules)]

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build closures for methods that populate automatons
    negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
    rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

    # build by-rule index structures over the token ids seq of each rule
    for rid, rule_token_ids in enumerate(tids_by_rid):
        rule = self.rules_by_rid[rid]

        # build hashes index and check for duplicates rule texts: every
        # rule, including false positives, takes part in the dupes check
        rule_hash = index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        if rule.false_positive:
            # FP rules are not used for any matching
            # there is nothing else for these rules
            self.false_positive_rid_by_hash[rule_hash] = rid
        else:
            # negative, small and regular
            # update hashes index
            self.rid_by_hash[rule_hash] = rid

            # update high postings index: positions by high tids
            # (high tids are the token ids that are >= len_junk)
            # TODO: this could be optimized with a group_by
            postings = defaultdict(list)
            for pos, tid in enumerate(rule_token_ids):
                if tid >= len_junk:
                    postings[tid].append(pos)
            # OPTIMIZED: for speed and memory: convert postings to arrays
            # NOTE(review): the 'h' typecode is a signed short, so this
            # presumably assumes rule positions fit in 16 bits -- confirm.
            postings = {
                tid: array('h', value) for tid, value in postings.items()
            }
            # OPTIMIZED: for speed, sparsify dict
            sparsify(postings)
            self.high_postings_by_rid[rid] = postings

            # build high and low tids sets and multisets
            rlow_set, rhigh_set, rlow_mset, rhigh_mset = index_token_sets(
                rule_token_ids, len_junk, len_good)
            self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
            self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

            # populate automatons...
            if rule.negative():
                # ... with only the whole rule tokens sequence
                negative_automaton_add(tids=rule_token_ids, rid=rid)
            else:
                # ... or with the whole rule tokens sequence
                rules_automaton_add(tids=rule_token_ids, rid=rid)
                # ... and ngrams: compute ngrams and populate the automaton with ngrams
                # This is done only when fragments are enabled, for rules
                # with a minimum coverage under 100 that are longer than
                # an ngram.
                if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(
                        rule_token_ids) > NGRAM_LEN:
                    all_ngrams = ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                    selected_ngrams = select_ngrams(all_ngrams, with_pos=True)
                    for pos, ngram in selected_ngrams:
                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

            # update rule thresholds from the sets and multisets built above
            rule.low_unique = tids_set_counter(rlow_set)
            rule.high_unique = tids_set_counter(rhigh_set)
            rule.length_unique = rule.high_unique + rule.low_unique
            rule.low_length = tids_multiset_counter(rlow_mset)
            rule.high_length = tids_multiset_counter(rhigh_mset)
            # sanity check: low and high token counts must add up to the
            # rule length
            assert rule.length == rule.low_length + rule.high_length

    # finalize automatons
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()

    # sparser dicts for faster lookup
    sparsify(self.rid_by_hash)
    sparsify(self.false_positive_rid_by_hash)

    # any hash shared by more than one rule is a set of duplicated rules
    dupe_rules = [
        rules for rules in dupe_rules_by_hash.values() if len(rules) > 1
    ]
    if dupe_rules:
        # NOTE(review): this concatenation presumably assumes that every
        # duplicated rule has a string `text_file` -- confirm that this
        # cannot be None here.
        dupe_rule_paths = [['file://' + rule.text_file for rule in rules] for rules in dupe_rules]
        msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
        raise AssertionError(msg)

    # from now on the index cannot be updated: see the check at the top
    self.optimized = True
def renumber_token_ids(self, frequencies_by_old_tid, _ranked_tokens=global_tokens_by_ranks):
    """
    Return updated index structures with new token ids such that the most
    common tokens (aka. 'junk' or 'low' tokens) have the lowest ids.

    Return a tuple of (len_junk, dictionary, tokens_by_tid, tids_by_rid)
    - len_junk: the number of junk tokens such that all junk token ids
      are smaller than this number.
    - dictionary: mapping of token string->token id
    - tokens_by_tid: reverse mapping of token id->token string
    - tids_by_rid: mapping of rule id->array of token ids

    The arguments all relate to old, temporary token ids and are:
    - frequencies_by_old_tid: sequence of occurrences across all rules,
      indexed by old token id
    - _ranked_tokens: callable returning a list of common lowercase token
      strings, ranked from most common to least common. Used only for
      testing and default to a global list.

    The old structures are read from self.dictionary, self.tokens_by_tid,
    self.tids_by_rid and self.rules_by_rid: they are not modified here.

    Junk tokens are selected this way:
    - tokens composed only of digits or of a single character are always
      junk
    - then tokens from the ranked list of common tokens are added in rank
      order until the junk set holds at least
      `len(tokens) // PROPORTION_OF_JUNK` tokens. The token of a rule
      made of a single token is never added at this stage.

    Rules that end up composed entirely of junk tokens are only reported
    on stdout when TRACE is enabled.
    """
    old_dictionary = self.dictionary
    tokens_by_old_tid = self.tokens_by_tid
    old_tids_by_rid = self.tids_by_rid

    # track tokens for rules with a single token: their token is never junk
    # otherwise they can never be detected
    rules_of_one = set(r.rid for r in self.rules_by_rid if r.length == 1)
    never_junk_old_tids = set(
        rule_tokens[0]
        for rid, rule_tokens in enumerate(old_tids_by_rid)
        if rid in rules_of_one)

    # Create the initial set of junk token ids: treat very common tokens
    # composed only of digits or single chars as junk. Note that this
    # initial set is not filtered with never_junk_old_tids.
    junk_old_tids = set(
        old_tid
        for old_tid, token in enumerate(tokens_by_old_tid)
        if token.isdigit() or len(token) == 1)
    junk_old_tids_add = junk_old_tids.add

    # TODO: ensure common number as words are treated as very common
    # (one, two, and first, second, etc.)?

    # TODO: add and treat person and place names as always being JUNK

    # Build the candidate junk set as an approximate proportion of total tokens
    len_tokens = len(tokens_by_old_tid)
    junk_max = len_tokens // PROPORTION_OF_JUNK

    # Use a curated list of common tokens sorted by decreasing frequency as
    # the basis to determine junk status.
    old_dictionary_get = old_dictionary.get
    for token in _ranked_tokens():
        # Stop when we reach the maximum junk proportion. This must be
        # `>=` and not `==`: the initial junk set may already be larger
        # than junk_max (typically with a small index) and an equality
        # check would then never stop adding ranked tokens as junk.
        if len(junk_old_tids) >= junk_max:
            break
        old_tid = old_dictionary_get(token)
        if old_tid is not None and old_tid not in never_junk_old_tids:
            junk_old_tids_add(old_tid)

    len_junk = len(junk_old_tids)

    # Assemble our final set of good old token id
    good_old_tids = set(range(len_tokens)) - junk_old_tids
    assert len_tokens == len(junk_old_tids) + len(good_old_tids)

    def renumbering_key(old_tid):
        # junk before good, then by decreasing frequencies, then old id
        return (
            old_tid in good_old_tids,
            -frequencies_by_old_tid[old_tid],
            old_tid,
        )

    # This sort does the renumbering proper of old to new token ids: the
    # position in the sorted list is the new id.
    new_to_old_tids = sorted(range(len_tokens), key=renumbering_key)

    # keep a mapping from old to new id used for renumbering index
    # structures: new_to_old_tids is a permutation of all the old ids, so
    # it is inverted directly.
    old_to_new_tids = [0] * len_tokens
    for new_tid, old_tid in enumerate(new_to_old_tids):
        old_to_new_tids[old_tid] = new_tid

    # create the new ids -> tokens string mapping
    tokens_by_new_tid = [tokens_by_old_tid[old_tid] for old_tid in new_to_old_tids]

    # create the new dictionary of token strings -> new id
    new_dictionary = {
        token: new_tid for new_tid, token in enumerate(tokens_by_new_tid)
    }
    sparsify(new_dictionary)

    # mapping of rule_id->new token_ids array
    new_tids_by_rid = [
        array('h', (old_to_new_tids[tid] for tid in old_tids))
        for old_tids in old_tids_by_rid
    ]

    # Now do a few sanity checks...
    # By construction this should always be true
    assert set(tokens_by_new_tid) == set(tokens_by_old_tid)

    fatals = []
    for rid, new_tids in enumerate(new_tids_by_rid):
        # Check that no rule is all junk: such a rule can only be matched
        # exactly. This is collected here and only reported below.
        if all(t < len_junk for t in new_tids):
            message = (
                'WARNING: Weak rule, made only of frequent junk tokens. Can only be matched exactly:',
                self.rules_by_rid[rid].identifier,
                u' '.join(tokens_by_new_tid[t] for t in new_tids))
            fatals.append(u' '.join(message))

    if TRACE and fatals:
        # raise IndexError(u'\n'.join(fatals))
        print()
        print('############################################')
        # use an explicit loop: a map() called only for its side effects
        # prints nothing where map() is lazy
        for fatal in fatals:
            print(fatal)
        print('############################################')
        print()

    # TODO: Check that the junk count choice is correct: for instance using some
    # stats based on standard deviation or markov chains or similar
    # conditional probabilities such that we verify that we CANNOT create a
    # distinctive meaningful license string made entirely from junk tokens

    return len_junk, new_dictionary, tokens_by_new_tid, new_tids_by_rid
def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
    """
    Add a list of Rule objects to the index and construct the optimized
    index structures.

    Arguments:
    - rules: an iterable of rule objects. Each rule is assigned a `rid`
      that is its position in this iterable.
    - _ranked_tokens: callable passed through unchanged to
      `renumber_token_ids`.

    Raise an Exception if the index has already been optimized. Raise an
    AssertionError if two or more rules have the same token ids hash,
    i.e. are duplicates: all duplicates are collected and reported at
    once.

    On success, the index is flagged as optimized and any later call
    raises.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # this assigns the rule ids implicitly: this is the index in the list
    self.rules_by_rid = list(rules)

    #######################################################################
    # classify rules, collect tokens and frequencies
    #######################################################################
    # accumulate all rule tokens strings. This is used only during indexing
    token_strings_by_rid = []
    # collect the unique token strings and compute their global frequency
    # This is used only during indexing
    frequencies_by_token = Counter()

    for rid, rul in enumerate(self.rules_by_rid):
        rul_tokens = list(rul.tokens())
        token_strings_by_rid.append(rul_tokens)
        frequencies_by_token.update(rul_tokens)
        # assign the rid to the rule object for sanity
        rul.rid = rid

        # classify rules and build disjoint sets of rids: the first matching
        # branch wins so a rid lands in exactly one of these sets
        if rul.false_positive:
            # false positive rules do not participate in the matches at all
            # they are used only in post-matching filtering
            self.false_positive_rids.add(rid)
        elif rul.negative:
            # negative rules are matched early and their exactly matched
            # tokens are removed from the token stream
            self.negative_rids.add(rid)
        elif rul.small():
            # small rules are best matched with a specialized approach
            self.small_rids.add(rid)
        else:
            # regular rules are matched using a common approach
            self.regular_rids.add(rid)

    # Create the tokens lookup structure at once. Note that tokens ids are
    # assigned arbitrarily here at first by unzipping: we get the frequencies
    # and tokens->id at once this way
    # NOTE(review): unpacking the unzipped items into two names presumably
    # fails when no token was collected at all (e.g. no rules) -- confirm
    # that callers never index an empty rule set.
    tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
    self.tokens_by_tid = tokens_by_tid
    self.len_tokens = len_tokens = len(tokens_by_tid)
    assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

    # initial dictionary mapping token strings to old/arbitrary token ids
    self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
    sparsify(dictionary)

    # replace token strings with arbitrary (and temporary) integer ids
    self.tids_by_rid = [[dictionary[tok] for tok in rule_tok] for rule_tok in token_strings_by_rid]

    #######################################################################
    # renumber token ids based on frequencies and common words
    #######################################################################
    # renumber_token_ids is called after the temporary self.dictionary,
    # self.tokens_by_tid and self.tids_by_rid are set above: they are
    # replaced here by their renumbered versions.
    renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
    self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
    # also keep local names for the loop below
    len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
    self.len_good = len_good = len_tokens - len_junk

    #######################################################################
    # build index structures
    #######################################################################
    len_rules = len(self.rules_by_rid)
    # these lists may be sparse: entries stay None for negative rules that
    # only go in the negative automaton below. Their index is the rule rid
    self.high_postings_by_rid = [None for _ in range(len_rules)]
    self.tids_sets_by_rid = [None for _ in range(len_rules)]
    self.tids_msets_by_rid = [None for _ in range(len_rules)]

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build closures for methods that populate automatons
    negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
    rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

    # build by-rule index structures over the token ids seq of each rule
    for rid, rule_token_ids in enumerate(tids_by_rid):
        rule = self.rules_by_rid[rid]

        # build hashes index and check for duplicates rule texts: every
        # rule takes part in the dupes check
        rule_hash = match_hash.index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        if rule.negative:
            # negative rules only go in their own automaton as a whole
            # token ids sequence
            negative_automaton_add(tids=rule_token_ids, rid=rid)
        else:
            # all the other rules: note that this loop only checks the
            # negative flag and no other rule classification

            # update hashes index
            self.rid_by_hash[rule_hash] = rid

            # update high postings index: positions by high tids
            # (high tids are the token ids that are >= len_junk)
            # TODO: this could be optimized with a group_by
            postings = defaultdict(list)
            for pos, tid in enumerate(rule_token_ids):
                if tid >= len_junk:
                    postings[tid].append(pos)
            # OPTIMIZED: for speed and memory: convert postings to arrays
            # NOTE(review): the 'h' typecode is a signed short, so this
            # presumably assumes rule positions fit in 16 bits -- confirm.
            postings = {tid: array('h', value) for tid, value in postings.items()}
            # OPTIMIZED: for speed, sparsify dict
            sparsify(postings)
            self.high_postings_by_rid[rid] = postings

            # build high and low tids sets and multisets
            rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(rule_token_ids, len_junk, len_good)
            self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
            self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

            # populate automaton with the whole rule tokens sequence
            rules_automaton_add(tids=rule_token_ids, rid=rid)
            # ... and ngrams: compute ngrams and populate the automaton with ngrams
            # This is done only when fragments are enabled, for rules with
            # a minimum coverage under 100 that are longer than an ngram.
            if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
                all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
                for pos, ngram in selected_ngrams:
                    rules_automaton_add(tids=ngram, rid=rid, start=pos)

            # update rule thresholds from the sets and multisets built above
            rule.low_unique = match_set.tids_set_counter(rlow_set)
            rule.high_unique = match_set.tids_set_counter(rhigh_set)
            rule.length_unique = rule.high_unique + rule.low_unique
            rule.low_length = match_set.tids_multiset_counter(rlow_mset)
            rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
            # sanity check: low and high token counts must add up to the
            # rule length
            assert rule.length == rule.low_length + rule.high_length

    # finalize automatons
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()

    # sparser dicts for faster lookup
    sparsify(self.rid_by_hash)

    # any hash shared by more than one rule is a set of duplicated rules
    dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
    if dupe_rules:
        # NOTE(review): this concatenation presumably assumes that every
        # duplicated rule has a string `text_file` -- confirm that this
        # cannot be None here.
        dupe_rule_paths = [['file://' + rule.text_file for rule in rules] for rules in dupe_rules]
        msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
        raise AssertionError(msg)

    # from now on the index cannot be updated: see the check at the top
    self.optimized = True