def index_starters(rule_tokens, gaps, _ngram_length=NGRAM_LENGTH):
    """
    Given a sequence of rule tokens and a set of gaps for that rule, return a
    sequence of tuples of (starter ngram, start,) computed from the tokens,
    gaps and ngram length. start is the starting position of the ngram.
    """
    rule_tokens = list(rule_tokens)
    len_tokens = len(rule_tokens)

    if not gaps:
        # no gaps: consider only the first ngram and the whole rule.
        if len_tokens >= _ngram_length:
            yield tuple(rule_tokens[:_ngram_length]), 0
    else:
        # T' starts at -1
        # pos:
        #   0 1 2 T 3 4 5 6 7 T 8 9 L
        # gaps + len:
        #   2 7 10
        # slices:
        #   [0:3] [3:8] [8:11]
        # spans:
        #   [0:2] [3:7] [8:10]
        # recipe:
        #   [T'+1:T+1] [T'+1:T+1] [T'+1:T+1]
        for start, ngram in enumerate(ngrams(rule_tokens, ngram_length=_ngram_length)):
            if start == 0:
                if not any(g in gaps for g in range(0, _ngram_length - 2)):
                    yield ngram, start
            elif start - 1 in gaps and not any(
                p in range(start, start + _ngram_length - 1) for p in gaps
            ):
                yield ngram, start
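# --- Usage sketch (not part of the original module) ---
# A minimal, hedged example of index_starters: it assumes the `ngrams` helper
# exercised in the tests below and uses an illustrative ngram length of 3
# rather than the real NGRAM_LENGTH. Gaps are given as the positions of the
# last token before each template gap.
def _example_index_starters():
    tokens = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

    # no gaps: only the leading ngram of the rule is a starter
    assert list(index_starters(tokens, gaps=set(), _ngram_length=3)) == [
        (('a', 'b', 'c'), 0),
    ]

    # gaps after positions 2 and 7: a starter is emitted at the rule start
    # and right after each gap, whenever a full ngram can be formed there
    assert list(index_starters(tokens, gaps={2, 7}, _ngram_length=3)) == [
        (('a', 'b', 'c'), 0),
        (('d', 'e', 'f'), 3),
    ]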
def write_ngrams(texts, output, _seen=set(), ngram_length=6):
    """
    Write the texts list as ngrams to the output file-like object.
    """
    for text in ['\n'.join(ngs) for ngs in ngrams(texts, ngram_length=ngram_length)]:
        if text in _seen:
            continue
        _seen.add(text)
        output.write(template.format(text))
def test_ngrams_with_None_length_three(self):
    tokens = ['Redistribution', 'and', 'use', None, 'in', 'source', 'and', 'binary', 'are', None]
    result = list(ngrams(tokens, ngram_length=3))
    expected = [
        ('Redistribution', 'and', 'use'),
        ('and', 'use', None),
        ('use', None, 'in'),
        (None, 'in', 'source'),
        ('in', 'source', 'and'),
        ('source', 'and', 'binary'),
        ('and', 'binary', 'are'),
        ('binary', 'are', None),
    ]
    assert result == expected
def add_ngrams(
    automaton,
    tids,
    tokens,
    rule_length,
    len_legalese,
    unknown_ngram_length=UNKNOWN_NGRAM_LENGTH,
):
    """
    Add the `tids` sequence of token ids to an unknown ngram automaton.
    """
    if rule_length >= unknown_ngram_length:
        tids_ngrams = tokenize.ngrams(tids, ngram_length=unknown_ngram_length)
        toks_ngrams = tokenize.ngrams(tokens, ngram_length=unknown_ngram_length)
        for tids_ngram, toks_ngram in zip(tids_ngrams, toks_ngrams):
            if is_good_tokens_ngram(toks_ngram, tids_ngram, len_legalese):
                # note that we do not store positions as values, only the ngram
                # since we do not keep the rule origin of an ngram
                automaton.add_word(tids_ngram)
def test_ngrams_with_None(self):
    tokens = ['Redistribution', 'and', 'use', None, 'in', 'source', 'and', 'binary', 'are', None]
    result = list(ngrams(tokens, ngram_length=4))
    expected = [
        ('Redistribution', 'and', 'use', None),
        ('and', 'use', None, 'in'),
        ('use', None, 'in', 'source'),
        (None, 'in', 'source', 'and'),
        ('in', 'source', 'and', 'binary'),
        ('source', 'and', 'binary', 'are'),
        ('and', 'binary', 'are', None),
    ]
    assert expected == result
def test_ngrams(self):
    tokens = '''
        Redistribution and use in source and binary are permitted.
    '''.split()
    result = list(ngrams(tokens, ngram_length=4))
    expected = [
        ('Redistribution', 'and', 'use', 'in'),
        ('and', 'use', 'in', 'source'),
        ('use', 'in', 'source', 'and'),
        ('in', 'source', 'and', 'binary'),
        ('source', 'and', 'binary', 'are'),
        ('and', 'binary', 'are', 'permitted.'),
    ]
    assert result == expected
def test_ngrams2(self):
    tokens = '''
        Redistribution and use in source and binary are permitted.
    '''.split()
    result = list(ngrams(tokens, ngram_length=4))
    expected = [
        ('Redistribution', 'and', 'use', 'in'),
        ('and', 'use', 'in', 'source'),
        ('use', 'in', 'source', 'and'),
        ('in', 'source', 'and', 'binary'),
        ('source', 'and', 'binary', 'are'),
        ('and', 'binary', 'are', 'permitted.'),
    ]
    assert expected == result
def get_weighted_hash(self):
    """
    Return a weighted array from the word token list.
    """
    result = [0] * HASH_LENGTH
    length = len(self.tokens) - SHINGLE_LENGTH + 1
    shingles = ngrams(self.tokens, SHINGLE_LENGTH)
    if length > 0:
        for shingle in shingles:
            shingle = ''.join(shingle)
            self.process_shingles(shingle, result)
    else:
        self.process_shingles(''.join(self.tokens), result)
    return result
def build_set_and_bigrams_mset(token_ids):
    """
    Return a tuple of (tids set, multiset) given a `token_ids` tids sequence.
    """
    tids_set = intbitset()
    bigrams_mset = defaultdict(int)

    for bigram in ngrams(token_ids, 2):
        # this skips already matched token ids that are -1
        if -1 in bigram:
            continue
        bigrams_mset[bigram] += 1
        tids_set.update(bigram)

    return tids_set, bigrams_mset
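# --- Usage sketch (not part of the original module) ---
# A minimal example of build_set_and_bigrams_mset with illustrative token ids;
# -1 marks already-matched tokens and any bigram touching one is skipped.
def _example_build_set_and_bigrams_mset():
    tids_set, bigrams_mset = build_set_and_bigrams_mset([1, 2, -1, 3, 2])

    # bigrams crossing a -1 are dropped: only (1, 2) and (3, 2) are counted
    assert dict(bigrams_mset) == {(1, 2): 1, (3, 2): 1}
    assert sorted(tids_set) == [1, 2, 3]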
def filter_strings(strs, nglen=4):
    """
    Filter clusters of short strings. If a string and its neighbors in a
    window of `nglen` consecutive strings all have a small length, discard
    that string.
    """
    from licensedcode.tokenize import ngrams

    # FIXME: the ngrams function skips things if we have less than ngram_len strings
    strs = list(strs)
    if len(strs) < nglen:
        for s in strs:
            yield s
    else:
        for ngm in ngrams(strs, nglen):
            junk = (
                all(len(s) <= 5 for s in ngm)
                or sum(len(s) for s in ngm) <= nglen * 5
                or len(set(ngm[0])) / float(len(ngm[0])) < 0.01
            )
            if junk:
                continue
            yield ngm[0]
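# --- Usage sketch (not part of the original module) ---
# A minimal example of filter_strings with illustrative strings: a window of
# nglen short strings is treated as junk, while a window anchored on a longer,
# varied string is kept; note that only the first string of each kept window
# is yielded.
def _example_filter_strings():
    strings = ['this is a long string', 'x', 'y', 'z', 'w']
    assert list(filter_strings(strings, nglen=4)) == ['this is a long string']

    # with fewer strings than the window size, everything is passed through
    assert list(filter_strings(['a', 'b'], nglen=4)) == ['a', 'b']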
def _add_rules(
    self,
    rules,
    _legalese=common_license_words,
    _spdx_tokens=frozenset(),
    _license_tokens=frozenset(),
):
    """
    Add a list of Rule objects to the index and construct optimized and
    immutable index structures.

    `_legalese` is a set of common license-specific words aka. legalese
    `_spdx_tokens` is a set of token strings used in SPDX license identifiers
    `_license_tokens` is a set of "license" tokens used as start or end of a rule
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # initial dictionary mapping for known legalese tokens
    ########################################################################
    # FIXME: we should start enumerating at 1 below: token ids then become
    # valid "unichr" values, making it easier downstream when used in
    # automatons
    self.dictionary = dictionary = {
        ts: tid for tid, ts in enumerate(sorted(_legalese))
    }
    dictionary_get = dictionary.get

    self.len_legalese = len_legalese = len(dictionary)
    highest_tid = len_legalese - 1

    # Add SPDX key tokens to the dictionary
    # these are always treated as non-legalese. This may seem weird
    # but they are detected in expressions alright and some of their
    # tokens exist as rules too (e.g. GPL)
    ########################################################################
    for sts in sorted(_spdx_tokens):
        stid = dictionary_get(sts)
        if stid is None:
            # we have a never yet seen token, so we assign a new tokenid
            highest_tid += 1
            stid = highest_tid
            dictionary[sts] = stid

    self.rules_by_rid = rules_by_rid = list(rules)
    # ensure that rules are sorted
    rules_by_rid.sort()
    len_rules = len(rules_by_rid)

    # create index data structures
    # OPTIMIZATION: bind frequently used methods to the local scope for
    # index structures
    ########################################################################
    tids_by_rid_append = self.tids_by_rid.append

    false_positive_rids_add = self.false_positive_rids.add
    regular_rids_add = self.regular_rids.add
    approx_matchable_rids_add = self.approx_matchable_rids.add

    # since we only use these for regular rules, these lists may be sparse.
    # their index is the rule rid
    self.high_postings_by_rid = high_postings_by_rid = [None] * len_rules
    self.sets_by_rid = sets_by_rid = [None] * len_rules
    self.msets_by_rid = msets_by_rid = [None] * len_rules

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # create a set of known "license" words used to determine if a rule
    # starts or ends with a "license" word/token
    ########################################################################
    license_tokens = set()
    for t in _license_tokens:
        tid = dictionary_get(t)
        if tid is not None:
            license_tokens.add(tid)

    rules_automaton_add = partial(
        match_aho.add_sequence,
        automaton=self.rules_automaton,
        with_duplicates=False,
    )

    if USE_AHO_FRAGMENTS:
        fragments_automaton_add = partial(
            match_aho.add_sequence,
            automaton=self.fragments_automaton,
            with_duplicates=True,
        )

    if USE_RULE_STARTS:
        starts_automaton_add_start = partial(
            match_aho.add_start,
            automaton=self.starts_automaton,
        )

    # OPTIMIZED: bind frequently used objects to local scope
    rid_by_hash = self.rid_by_hash
    match_hash_index_hash = match_hash.index_hash
    match_set_tids_set_counter = match_set.tids_set_counter
    match_set_multiset_counter = match_set.multiset_counter

    len_starts = SMALL_RULE
    min_len_starts = SMALL_RULE * 6

    ngram_len = AHO_FRAGMENTS_NGRAM_LEN

    # Index each rule
    ########################################################################
    for rid, rule in enumerate(rules_by_rid):

        # assign rid
        rule.rid = rid

        rule_token_ids = array('h', [])
        tids_by_rid_append(rule_token_ids)
        rule_token_ids_append = rule_token_ids.append

        rule_tokens = []
        rule_tokens_append = rule_tokens.append

        # A rule is weak if it does not contain at least one legalese word:
        # we consider all rules to be weak until proven otherwise below.
        # "weak" rules can only be matched with an automaton.
        is_weak = True

        for rts in rule.tokens():
            rule_tokens_append(rts)
            rtid = dictionary_get(rts)
            if rtid is None:
                # we have a never yet seen token, so we assign a new tokenid
                # note: we could use the length of the dictionary instead
                highest_tid += 1
                rtid = highest_tid
                dictionary[rts] = rtid
            if is_weak and rtid < len_legalese:
                is_weak = False

            rule_token_ids_append(rtid)

        rule_length = rule.length
        is_tiny = rule_length < TINY_RULE

        # build hashes index and check for duplicate rule texts
        rule_hash = match_hash_index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        ####################
        # populate automaton with the whole rule tokens sequence, for all
        # RULEs, be they "standard"/regular, weak, false positive or small
        ####################
        rules_automaton_add(tids=rule_token_ids, rid=rid)

        if rule.is_false_positive:
            # False positive rules do not participate in the set or sequence
            # matching at all: they are used for exact matching and in post-
            # matching filtering
            false_positive_rids_add(rid)
            continue

        # from now on, we have regular rules
        rid_by_hash[rule_hash] = rid
        regular_rids_add(rid)

        # Does the rule start or end with a "license" word? We track this
        # to help disambiguate some overlapping false positive short rules
        # OPTIMIZED: the last rtid above IS the last token id
        if license_tokens:
            if rtid in license_tokens:
                rule.ends_with_license = True
            if rule_token_ids[0] in license_tokens:
                rule.starts_with_license = True

        ####################
        # populate unknown_automaton that only makes sense for rules that
        # are also sequence matchable.
        ####################
        match_unknown.add_ngrams(
            automaton=self.unknown_automaton,
            tids=rule_token_ids,
            tokens=rule_tokens,
            len_legalese=len_legalese,
            rule_length=rule_length,
        )

        # Some rules that cannot be matched as a sequence are "weak" rules
        # or can require to be matched only as a continuous sequence of
        # tokens. This includes tiny, is_continuous or is_license_reference
        # rules. We skip adding these to the data structures used for
        # sequence matching.
        can_match_as_sequence = not (
            is_weak
            or is_tiny
            or rule.is_continuous
            or (rule.is_small and (rule.is_license_reference or rule.is_license_tag))
        )

        if can_match_as_sequence:
            approx_matchable_rids_add(rid)

            ####################
            # update high postings: positions by high tids used to
            # speed up sequence matching
            ####################
            # no postings for rules that cannot be matched as a sequence (too short and weak)
            # TODO: this could be optimized with a group_by
            postings = defaultdict(list)
            for pos, tid in enumerate(rule_token_ids):
                if tid < len_legalese:
                    postings[tid].append(pos)
            # OPTIMIZED: for speed and memory: convert postings to arrays
            postings = {tid: array('h', value) for tid, value in postings.items()}
            high_postings_by_rid[rid] = postings

            ####################
            # ... and ngram fragments: compute ngrams and populate an automaton with ngrams
            ####################
            if (USE_AHO_FRAGMENTS
                and rule.minimum_coverage < 100
                and rule_length > ngram_len
            ):
                all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=ngram_len)
                all_ngrams_with_pos = tokenize.select_ngrams(all_ngrams, with_pos=True)
                # all_ngrams_with_pos = enumerate(all_ngrams)
                for pos, ngram in all_ngrams_with_pos:
                    fragments_automaton_add(tids=ngram, rid=rid, start=pos)

            ####################
            # use the start and end of this rule as a break point for query runs
            ####################
            if USE_RULE_STARTS and rule_length > min_len_starts:
                starts_automaton_add_start(
                    tids=rule_token_ids[:len_starts],
                    rule_identifier=rule.identifier,
                    rule_length=rule_length,
                )

        ####################
        # build sets and multisets indexes, for all regular rules as we need
        # the thresholds
        ####################
        tids_set, mset = match_set.build_set_and_mset(
            rule_token_ids, _use_bigrams=USE_BIGRAM_MULTISETS)
        sets_by_rid[rid] = tids_set
        msets_by_rid[rid] = mset

        ####################################################################
        ####################################################################
        # FIXME!!!!!!! we should store them: we need them and we recompute
        # them later at match time
        tids_set_high = match_set.high_tids_set_subset(
            tids_set, len_legalese)
        mset_high = match_set.high_multiset_subset(
            mset, len_legalese, _use_bigrams=USE_BIGRAM_MULTISETS)
        # FIXME!!!!!!!
        ####################################################################
        ####################################################################

        ####################
        # update rule thresholds
        ####################
        rule.length_unique = match_set_tids_set_counter(tids_set)
        rule.high_length_unique = match_set_tids_set_counter(tids_set_high)
        rule.high_length = match_set_multiset_counter(mset_high)
        rule.compute_thresholds()

    ########################################################################
    # Finalize index data structures
    ########################################################################

    # Create the tid -> token string lookup structure.
    ########################################################################
    self.tokens_by_tid = tokens_by_tid = [
        ts for ts, _tid in sorted(dictionary.items(), key=itemgetter(1))]
    self.len_tokens = len_tokens = len(tokens_by_tid)

    # some tokens are made entirely of digits and these can create some
    # worst case behavior when there are long runs on these
    ########################################################################
    self.digit_only_tids = intbitset([
        i for i, s in enumerate(self.tokens_by_tid) if s.isdigit()])

    # Finalize automatons
    ########################################################################
    self.rules_automaton.make_automaton()
    if USE_AHO_FRAGMENTS:
        self.fragments_automaton.make_automaton()
    if USE_RULE_STARTS:
        match_aho.finalize_starts(self.starts_automaton)
    self.unknown_automaton.make_automaton()

    ########################################################################
    # Do some sanity checks
    ########################################################################
    msg = 'Inconsistent structure lengths'
    assert len_tokens == highest_tid + 1 == len(dictionary), msg

    msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
    assert len_tokens <= MAX_TOKENS, msg

    dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
    if dupe_rules:
        dupe_rule_paths = [
            '\n'.join(
                sorted([
                    ('file://' + rule.text_file)
                    if rule.text_file
                    else ('text: ' + rule.stored_text)
                    for rule in rules])
            )
            for rules in dupe_rules
        ]
        msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
        raise AssertionError(msg)

    self.optimized = True
def renumber_token_ids(rules_tokens_ids, dictionary, tokens_by_tid, frequencies_by_tid,
                       length=9, with_checks=True):
    """
    Return updated index structures with new token ids such that the most
    common aka. 'junk' tokens have the lowest ids.

    `rules_tokens_ids` is a mapping of rule_id->sequence of token ids

    These common tokens are based on a curated list of frequent words and
    further refined such that:
    - no rule text sequence is composed entirely of these common tokens.
    - no or only a few rule text sub-sequences of `length` tokens (aka.
      ngrams) are composed entirely of these common tokens.

    The returned structures are:
    - old_to_new: mapping of (old token id->new token id)
    - len_junk: the highest id of a junk token
    - dictionary (token string->token id)
    - tokens_by_tid (token id->token string)
    - frequencies_by_tid (token id->frequency)
    """
    # keep track of very common junk tokens: digits and single letters
    very_common = set()
    very_common_add = very_common.add
    string_lowercase = u'abcdefghijklmnopqrstuvwxyz'
    for tid, token in enumerate(tokens_by_tid):
        # DIGIT TOKENS: Treat tokens composed only of digits as common junk
        # SINGLE ASCII LETTER TOKENS: Treat single ASCII letter tokens as common junk
        # TODO: ensure common numbers as strings are always there (one, two, and first, second, etc.)
        if token.isdigit() or (len(token) == 1 and token in string_lowercase):
            very_common_add(tid)

    # keep track of good, "not junk" tokens
    good = set()
    good_update = good.update

    # Classify rules tokens as smaller or equal to `length` or regular.
    regular_rules = []
    regular_rules_append = regular_rules.append
    small_rules = []
    small_rules_append = small_rules.append

    for rid, rule_toks_ids in enumerate(rules_tokens_ids):
        len_toks = len(rule_toks_ids)
        if len_toks == 1:
            # RULES of ONE TOKEN: their token cannot be junk
            good_update(rule_toks_ids)
        if len_toks <= length:
            small_rules_append((rid, rule_toks_ids))
        else:
            regular_rules_append((rid, rule_toks_ids))

    # Build a candidate junk set of roughly ~ 1/10th the size of the tokens set:
    # we use a curated list of common words as a base. The final length (and
    # also biggest token id) of the junk tokens set is typically ~ 1200 for
    # about 12K tokens
    junk_max = abs((len(tokens_by_tid) / 11) - len(very_common))

    junk = set()
    junk_add = junk.add
    dictionary_get = dictionary.get

    junk_count = 0
    for token in global_tokens_by_ranks():
        tid = dictionary_get(token)
        if tid is None:
            continue
        if tid not in very_common and tid not in good:
            junk_add(tid)
            junk_count += 1
        if junk_count == junk_max:
            break

    # Assemble our final junk and not junk sets
    final_junk = (very_common | junk) - good
    good = set(range(len(tokens_by_tid))) - final_junk

    if with_checks:
        # Now do a few sanity checks...
        def tokens_str(_tks):
            return u' '.join(tokens_by_tid[_tk] for _tk in _tks)

        # Check that no small rule is made entirely of junk
        for rid, tokens in small_rules:
            try:
                assert not all([jt in final_junk for jt in tokens])
            except AssertionError:
                # this is a serious index issue
                print('!!!License Index FATAL ERROR: small rule: ', rid,
                      'is all made of junk:', tokens_str(tokens))
                raise

        # Check that not too many ngrams are made entirely of junk
        # we build a set of ngrams for `length` over tokens of rules at equal
        # or bigger than length and check them all
        all_junk_ngrams_count = 0
        for rid, tokens in regular_rules:
            for ngram in ngrams(tokens, length):
                # skip ngrams composed only of common junk as not significant
                if all(nt in very_common for nt in ngram):
                    continue
                try:
                    # note: we check only against junk, not final_junk
                    assert not all(nt in junk for nt in ngram)
                except AssertionError:
                    all_junk_ngrams_count += 1

        # TODO: test that the junk choice is correct: for instance using some
        # stats based on standard deviation or markov chains or similar
        # conditional probabilities such that we verify that we CANNOT create
        # a distinctive meaningful license string made entirely from junk
        # tokens

        # check that we do not have too many ngrams made entirely of junk
        assert all_junk_ngrams_count < (length * 20)

    # Sort each set of old token IDs by decreasing original frequencies
    # FIXME: should use a key function not a schwartzian sort
    decorated = ((frequencies_by_tid[old_id], old_id) for old_id in final_junk)
    final_junk = [t for _f, t in sorted(decorated, reverse=True)]

    # FIXME: should use a key function not a schwartzian sort
    decorated = ((frequencies_by_tid[old_id], old_id) for old_id in good)
    good = [t for _f, t in sorted(decorated, reverse=True)]

    # create the new ids -> tokens value mapping
    new_tokens_by_tid = [tokens_by_tid[t] for t in final_junk + good]

    # sanity check: by construction this should always be true
    assert set(new_tokens_by_tid) == set(tokens_by_tid)

    # create new structures based on new ids and a mapping from old to new id
    len_tokens = len(new_tokens_by_tid)
    old_to_new = array('h', [0] * len_tokens)
    new_frequencies_by_tid = [None] * len_tokens
    new_dictionary = {}

    # assign new ids, rebuild dictionary, frequency
    for new_id, token in enumerate(new_tokens_by_tid):
        old_id = dictionary[token]
        old_to_new[old_id] = new_id
        new_dictionary[token] = new_id
        old_freq = frequencies_by_tid[old_id]
        new_frequencies_by_tid[new_id] = old_freq

    sparsify(new_dictionary)

    return old_to_new, len(final_junk), new_dictionary, new_tokens_by_tid, new_frequencies_by_tid
def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
    """
    Add a list of Rule objects to the index and construct optimized and
    immutable index structures.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # this assigns the rule ids implicitly: this is the index in the list
    self.rules_by_rid = list(rules)

    #######################################################################
    # classify rules, collect tokens and frequencies
    #######################################################################
    # accumulate all rule tokens strings. This is used only during indexing
    token_strings_by_rid = []
    # collect the unique token strings and compute their global frequency
    # This is used only during indexing
    frequencies_by_token = Counter()

    for rid, rul in enumerate(self.rules_by_rid):
        rul_tokens = list(rul.tokens())
        token_strings_by_rid.append(rul_tokens)
        frequencies_by_token.update(rul_tokens)
        # assign the rid to the rule object for sanity
        rul.rid = rid

        # classify rules and build disjuncted sets of rids
        rul_len = rul.length
        if rul.false_positive:
            # false positive rules do not participate in the matches at all
            # they are used only in post-matching filtering
            self.false_positive_rids.add(rid)
            if rul_len > self.largest_false_positive_length:
                self.largest_false_positive_length = rul_len
        elif rul.negative():
            # negative rules are matched early and their exactly matched
            # tokens are removed from the token stream
            self.negative_rids.add(rid)
        elif rul.small():
            # small rules are best matched with a specialized approach
            self.small_rids.add(rid)
        else:
            # regular rules are matched using a common approach
            self.regular_rids.add(rid)

    # Create the tokens lookup structure at once. Note that tokens ids are
    # assigned randomly here at first by unzipping: we get the frequencies
    # and tokens->id at once this way
    tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
    self.tokens_by_tid = tokens_by_tid
    self.len_tokens = len_tokens = len(tokens_by_tid)
    assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

    # initial dictionary mapping to old/random token ids
    self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
    sparsify(dictionary)

    # replace token strings with arbitrary (and temporary) random integer ids
    self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                        for rule_tok in token_strings_by_rid]

    #######################################################################
    # renumber token ids based on frequencies and common words
    #######################################################################
    renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
    self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
    len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
    self.len_good = len_good = len_tokens - len_junk

    #######################################################################
    # build index structures
    #######################################################################
    len_rules = len(self.rules_by_rid)

    # since we only use these for regular rules, these lists may be sparse
    # their index is the rule rid
    self.high_postings_by_rid = [None for _ in range(len_rules)]
    self.tids_sets_by_rid = [None for _ in range(len_rules)]
    self.tids_msets_by_rid = [None for _ in range(len_rules)]

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build closures for methods that populate automatons
    negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
    rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

    # build by-rule index structures over the token ids seq of each rule
    for rid, rule_token_ids in enumerate(tids_by_rid):
        rule = self.rules_by_rid[rid]

        # build hashes index and check for duplicate rule texts
        rule_hash = index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        if rule.false_positive:
            # FP rules are not used for any matching
            # there is nothing else for these rules
            self.false_positive_rid_by_hash[rule_hash] = rid
        else:
            # negative, small and regular

            # update hashes index
            self.rid_by_hash[rule_hash] = rid

            # update high postings index: positions by high tids
            # TODO: this could be optimized with a group_by
            postings = defaultdict(list)
            for pos, tid in enumerate(rule_token_ids):
                if tid >= len_junk:
                    postings[tid].append(pos)
            # OPTIMIZED: for speed and memory: convert postings to arrays
            postings = {tid: array('h', value) for tid, value in postings.items()}
            # OPTIMIZED: for speed, sparsify dict
            sparsify(postings)
            self.high_postings_by_rid[rid] = postings

            # build high and low tids sets and multisets
            rlow_set, rhigh_set, rlow_mset, rhigh_mset = index_token_sets(
                rule_token_ids, len_junk, len_good)
            self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
            self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

            # populate automatons...
            if rule.negative():
                # ... with only the whole rule tokens sequence
                negative_automaton_add(tids=rule_token_ids, rid=rid)
            else:
                # ... or with the whole rule tokens sequence
                rules_automaton_add(tids=rule_token_ids, rid=rid)
                # ... and ngrams: compute ngrams and populate the automaton with ngrams
                if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
                    all_ngrams = ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                    selected_ngrams = select_ngrams(all_ngrams, with_pos=True)
                    for pos, ngram in selected_ngrams:
                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

            # update rule thresholds
            rule.low_unique = tids_set_counter(rlow_set)
            rule.high_unique = tids_set_counter(rhigh_set)
            rule.length_unique = rule.high_unique + rule.low_unique
            rule.low_length = tids_multiset_counter(rlow_mset)
            rule.high_length = tids_multiset_counter(rhigh_mset)
            assert rule.length == rule.low_length + rule.high_length

    # finalize automatons
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()

    # sparser dicts for faster lookup
    sparsify(self.rid_by_hash)
    sparsify(self.false_positive_rid_by_hash)

    dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
    if dupe_rules:
        dupe_rule_paths = [['file://' + rule.text_file for rule in rules] for rules in dupe_rules]
        msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
        raise AssertionError(msg)

    self.optimized = True
def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks, _spdx_tokens=None):
    """
    Add a list of Rule objects to the index and construct optimized and
    immutable index structures.

    `_spdx_tokens` if provided is a set of token strings from known SPDX
    keys: these receive a special treatment.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # this assigns the rule ids implicitly: this is the index in the list
    self.rules_by_rid = list(rules)

    #######################################################################
    # classify rules, collect tokens and frequencies
    #######################################################################
    # accumulate all rule tokens strings. This is used only during indexing
    token_strings_by_rid = []
    # collect the unique token strings and compute their global frequency
    # This is used only during indexing
    frequencies_by_token = Counter()

    for rid, rul in enumerate(self.rules_by_rid):
        rul_tokens = list(rul.tokens())
        token_strings_by_rid.append(rul_tokens)
        frequencies_by_token.update(rul_tokens)
        # assign the rid to the rule object for sanity
        rul.rid = rid

        # classify rules and build disjuncted sets of rids
        if rul.is_false_positive:
            # false positive rules do not participate in the matches at all
            # they are used only in post-matching filtering
            self.false_positive_rids.add(rid)
        elif rul.is_negative:
            # negative rules are matched early and their exactly matched
            # tokens are removed from the token stream
            self.negative_rids.add(rid)
        elif rul.small():
            # small rules are best matched with a specialized approach
            self.small_rids.add(rid)
        else:
            # regular rules are matched using a common approach
            self.regular_rids.add(rid)

    # Add SPDX key tokens to the dictionary. track which are only from SPDX keys
    ########################################################################
    spdx_tokens = None
    if _spdx_tokens:
        spdx_tokens = _spdx_tokens.difference(frequencies_by_token)
        frequencies_by_token.update(_spdx_tokens)

    # Create the tokens lookup structure at once. Note that tokens ids are
    # assigned randomly here at first by unzipping: we get the frequencies
    # and tokens->id at once this way
    ########################################################################
    tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
    self.tokens_by_tid = tokens_by_tid
    self.len_tokens = len_tokens = len(tokens_by_tid)
    msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
    assert len_tokens <= MAX_TOKENS, msg

    # initial dictionary mapping to old/arbitrary token ids
    ########################################################################
    self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
    sparsify(dictionary)

    # replace token strings with arbitrary (and temporary) integer ids
    ########################################################################
    self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                        for rule_tok in token_strings_by_rid]

    # Get SPDX-only token ids
    ########################################################################
    spdx_token_ids = None
    if spdx_tokens:
        spdx_token_ids = set(dictionary[tok] for tok in spdx_tokens)

    #######################################################################
    # renumber token ids based on frequencies and common words
    #######################################################################
    renumbered = self.renumber_token_ids(
        frequencies_by_tid, _ranked_tokens, _spdx_token_ids=spdx_token_ids)
    (
        self.len_junk,
        self.dictionary,
        self.tokens_by_tid,
        self.tids_by_rid,
        self.weak_rids,
    ) = renumbered
    len_junk, dictionary, tokens_by_tid, tids_by_rid, weak_rids = renumbered

    #######################################################################
    # build index structures
    #######################################################################
    self.len_good = len_good = len_tokens - len_junk
    len_rules = len(self.rules_by_rid)

    # since we only use these for regular rules, these lists may be sparse
    # their index is the rule rid
    self.high_postings_by_rid = [None for _ in range(len_rules)]
    self.tids_sets_by_rid = [None for _ in range(len_rules)]
    self.tids_msets_by_rid = [None for _ in range(len_rules)]

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build closures for methods that populate automatons
    negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
    rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

    # build by-rule index structures over the token ids seq of each rule
    for rid, rule_token_ids in enumerate(tids_by_rid):
        rule = self.rules_by_rid[rid]

        # build hashes index and check for duplicate rule texts
        rule_hash = match_hash.index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        rule_is_weak = rid in weak_rids

        if rule.is_negative:
            negative_automaton_add(tids=rule_token_ids, rid=rid)
        else:
            # update hashes index
            self.rid_by_hash[rule_hash] = rid

            # update high postings index: positions by high tids
            # TODO: this could be optimized with a group_by
            # FIXME: we do not want to keep small rules and rules that
            # cannot be seq matches in the index

            # no postings for junk only rules
            if not rule_is_weak:
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid >= len_junk:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {tid: array('h', value) for tid, value in postings.items()}
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                self.high_postings_by_rid[rid] = postings

            # build high and low tids sets and multisets
            rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(
                rule_token_ids, len_junk, len_good)

            # no set indexes for junk only rules
            if not rule_is_weak:
                self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

            # populate automaton with the whole rule tokens sequence
            rules_automaton_add(tids=rule_token_ids, rid=rid)
            # ... and ngrams: compute ngrams and populate the automaton with ngrams
            if (USE_AHO_FRAGMENTS
                and rule.minimum_coverage < 100
                and len(rule_token_ids) > NGRAM_LEN):
                all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
                for pos, ngram in selected_ngrams:
                    rules_automaton_add(tids=ngram, rid=rid, start=pos)

            # FIXME: this may not be updated for a rule that is created at
            # match time such as SPDX rules

            # update rule thresholds
            rule.low_unique = match_set.tids_set_counter(rlow_set)
            rule.high_unique = match_set.tids_set_counter(rhigh_set)
            rule.length_unique = rule.high_unique + rule.low_unique
            rule.low_length = match_set.tids_multiset_counter(rlow_mset)
            rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
            assert rule.length == rule.low_length + rule.high_length

    # finalize automatons
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()

    # sparser dicts for faster lookup
    sparsify(self.rid_by_hash)

    dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
    if dupe_rules:
        dupe_rule_paths = [
            '\n'.join(
                sorted([
                    ('file://' + rule.text_file)
                    if rule.text_file
                    else ('text: ' + rule.stored_text)
                    for rule in rules])
            )
            for rules in dupe_rules
        ]
        msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
        raise AssertionError(msg)

    self.optimized = True
def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
    """
    Add a list of Rule objects to the index and construct optimized and
    immutable index structures.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # this assigns the rule ids implicitly: this is the index in the list
    self.rules_by_rid = list(rules)

    #######################################################################
    # classify rules, collect tokens and frequencies
    #######################################################################
    # accumulate all rule tokens strings. This is used only during indexing
    token_strings_by_rid = []
    # collect the unique token strings and compute their global frequency
    # This is used only during indexing
    frequencies_by_token = Counter()

    for rid, rul in enumerate(self.rules_by_rid):
        rul_tokens = list(rul.tokens())
        token_strings_by_rid.append(rul_tokens)
        frequencies_by_token.update(rul_tokens)
        # assign the rid to the rule object for sanity
        rul.rid = rid

        # classify rules and build disjuncted sets of rids
        if rul.false_positive:
            # false positive rules do not participate in the matches at all
            # they are used only in post-matching filtering
            self.false_positive_rids.add(rid)
        elif rul.negative:
            # negative rules are matched early and their exactly matched
            # tokens are removed from the token stream
            self.negative_rids.add(rid)
        elif rul.small():
            # small rules are best matched with a specialized approach
            self.small_rids.add(rid)
        else:
            # regular rules are matched using a common approach
            self.regular_rids.add(rid)

    # Create the tokens lookup structure at once. Note that tokens ids are
    # assigned randomly here at first by unzipping: we get the frequencies
    # and tokens->id at once this way
    tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
    self.tokens_by_tid = tokens_by_tid
    self.len_tokens = len_tokens = len(tokens_by_tid)
    assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

    # initial dictionary mapping to old/random token ids
    self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
    sparsify(dictionary)

    # replace token strings with arbitrary (and temporary) random integer ids
    self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                        for rule_tok in token_strings_by_rid]

    #######################################################################
    # renumber token ids based on frequencies and common words
    #######################################################################
    renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
    self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
    len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
    self.len_good = len_good = len_tokens - len_junk

    #######################################################################
    # build index structures
    #######################################################################
    len_rules = len(self.rules_by_rid)

    # since we only use these for regular rules, these lists may be sparse
    # their index is the rule rid
    self.high_postings_by_rid = [None for _ in range(len_rules)]
    self.tids_sets_by_rid = [None for _ in range(len_rules)]
    self.tids_msets_by_rid = [None for _ in range(len_rules)]

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build closures for methods that populate automatons
    negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
    rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

    # build by-rule index structures over the token ids seq of each rule
    for rid, rule_token_ids in enumerate(tids_by_rid):
        rule = self.rules_by_rid[rid]

        # build hashes index and check for duplicate rule texts
        rule_hash = match_hash.index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        if rule.negative:
            negative_automaton_add(tids=rule_token_ids, rid=rid)
        else:
            # update hashes index
            self.rid_by_hash[rule_hash] = rid

            # update high postings index: positions by high tids
            # TODO: this could be optimized with a group_by
            postings = defaultdict(list)
            for pos, tid in enumerate(rule_token_ids):
                if tid >= len_junk:
                    postings[tid].append(pos)
            # OPTIMIZED: for speed and memory: convert postings to arrays
            postings = {tid: array('h', value) for tid, value in postings.items()}
            # OPTIMIZED: for speed, sparsify dict
            sparsify(postings)
            self.high_postings_by_rid[rid] = postings

            # build high and low tids sets and multisets
            rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(
                rule_token_ids, len_junk, len_good)
            self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
            self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

            # populate automaton with the whole rule tokens sequence
            rules_automaton_add(tids=rule_token_ids, rid=rid)
            # ... and ngrams: compute ngrams and populate the automaton with ngrams
            if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
                all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
                for pos, ngram in selected_ngrams:
                    rules_automaton_add(tids=ngram, rid=rid, start=pos)

            # update rule thresholds
            rule.low_unique = match_set.tids_set_counter(rlow_set)
            rule.high_unique = match_set.tids_set_counter(rhigh_set)
            rule.length_unique = rule.high_unique + rule.low_unique
            rule.low_length = match_set.tids_multiset_counter(rlow_mset)
            rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
            assert rule.length == rule.low_length + rule.high_length

    # finalize automatons
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()

    # sparser dicts for faster lookup
    sparsify(self.rid_by_hash)

    dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
    if dupe_rules:
        dupe_rule_paths = [['file://' + rule.text_file for rule in rules] for rules in dupe_rules]
        msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
        raise AssertionError(msg)

    self.optimized = True