def test_split_spdx_lid(self):
    """
    Check that split_spdx_lid() splits a line into an (SPDX prefix, license
    expression) pair for recognized SPDX tag variants, and returns a
    (None, original line) pair when the prefix is not recognized.
    """
    test_lines = [
        'SPDX License Identifier : BSD-3-Clause',
        'SPDX-License-Identifier : BSD-3-Clause',
        'spdx-license- identifier : BSD-3-Clause',
        ' SPDX License--Identifier: BSD-3-Clause',
        'SPDX-License-Identifier : BSD-3-Clause',
        'SPDx-Licence-Identifier : BSD-3-Clause',
        'SPD-Licence-Identifier : BSD-3-Clause',
    ]
    expected = [
        ('SPDX License Identifier : ', 'BSD-3-Clause'),
        ('SPDX-License-Identifier : ', 'BSD-3-Clause'),
        ('spdx-license- identifier : ', 'BSD-3-Clause'),
        ('SPDX License--Identifier: ', 'BSD-3-Clause'),
        ('SPDX-License-Identifier : ', 'BSD-3-Clause'),
        ('SPDx-Licence-Identifier : ', 'BSD-3-Clause'),
        # the last one is not a valid SPDX tag: the line is returned unsplit
        (None, 'SPD-Licence-Identifier : BSD-3-Clause'),
    ]
    results = [split_spdx_lid(line) for line in test_lines]
    assert results == expected
def tokens_by_line(self):
    """
    Yield one sequence of token ids for each line in this query. Unknown
    tokens yield a None id.

    SIDE EFFECT: this populates the query `line_by_pos`, `unknowns_by_pos`,
    `unknowns_span`, `shorts_and_digits_pos` and `spdx_lines` attributes.
    """
    from licensedcode.match_spdx_lid import split_spdx_lid

    # bind frequently called functions to local scope for speed in the loops
    tokenizer = query_tokenizer
    line_by_pos_append = self.line_by_pos.append
    self_unknowns_by_pos = self.unknowns_by_pos
    unknowns_pos = set()
    unknowns_pos_add = unknowns_pos.add
    self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
    dic_get = self.idx.dictionary.get

    # note: positions start at zero

    # absolute position in a query, including all known and unknown tokens
    abs_pos = -1

    # absolute position in a query, including only known tokens
    known_pos = -1

    # flag set to True when we have found the first known token globally
    # across all query lines
    started = False

    spdx_lid_token_ids = self.spdx_lid_token_ids

    if TRACE:
        logger_debug('tokens_by_line: query lines')
        for line_num, line in query_lines(self.location, self.query_string):
            logger_debug(' ', line_num, ':', line)

    for line_num, line in query_lines(self.location, self.query_string):
        # keep track of token ids in a line: unknown tokens are None entries
        line_tokens = []
        line_tokens_append = line_tokens.append
        line_first_known_pos = None

        # FIXME: the implicit update of abs_pos is not clear: enumerate()
        # restarts counting from the last abs_pos of the previous line + 1
        for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
            tid = dic_get(token)
            if tid is not None:
                # this is a known token
                known_pos += 1
                started = True
                line_by_pos_append(line_num)
                if len(token) == 1 or token.isdigit():
                    self_shorts_and_digits_pos_add(known_pos)
                if line_first_known_pos is None:
                    line_first_known_pos = known_pos
            else:
                if not started:
                    # If we have not yet started globally, then all tokens
                    # seen so far are unknowns and we keep a count of them
                    # in the magic "-1" position.
                    self_unknowns_by_pos[-1] += 1
                else:
                    # here we have a new unknown token positioned right after
                    # the current known_pos
                    self_unknowns_by_pos[known_pos] += 1
                    unknowns_pos_add(known_pos)

            line_tokens_append(tid)

        # last known token position in the current line
        line_last_known_pos = known_pos

        # ONLY collect as SPDX a line that starts with an SPDX License
        # Identifier. There are cases where this prefix does not start as
        # the first tokens, such as when we have one or two words (such as a
        # comment indicator DNL, REM etc.) that start the line and then
        # an SPDX license identifier: allow up to two leading tokens.
        spdx_start_offset = None
        if line_tokens[:3] in spdx_lid_token_ids:
            spdx_start_offset = 0
        elif line_tokens[1:4] in spdx_lid_token_ids:
            spdx_start_offset = 1
        elif line_tokens[2:5] in spdx_lid_token_ids:
            spdx_start_offset = 2

        if spdx_start_offset is not None:
            # keep the line text and the start/end known pos for SPDX matching
            spdx_prefix, spdx_expression = split_spdx_lid(line)
            spdx_text = ' '.join([spdx_prefix or '', spdx_expression])
            spdx_start_known_pos = line_first_known_pos + spdx_start_offset
            if spdx_start_known_pos <= line_last_known_pos:
                self.spdx_lines.append((spdx_text, spdx_start_known_pos, line_last_known_pos))

        yield line_tokens

    # finally create a Span of positions followed by unknowns, used
    # for intersection with the query span for scoring matches
    self.unknowns_span = Span(unknowns_pos)
def tokens_by_line(
    self,
    location=None,
    query_string=None,
    start_line=1,
):
    """
    Yield multiple sequences of token ids, one for each line in this query.
    Unknown tokens yield a None id; stopword tokens are skipped entirely.

    :param location: an optional file location; defaults to ``self.location``.
    :param query_string: an optional query text; defaults to
        ``self.query_string``.
    :param start_line: line numbers start at ``start_line`` which is 1-based
        by default.

    SIDE EFFECT: This populates the query `line_by_pos`, `unknowns_by_pos`,
    `unknowns_span`, `stopwords_by_pos`, `shorts_and_digits_pos` and
    `spdx_lines` attributes.
    """
    from licensedcode.match_spdx_lid import split_spdx_lid
    from licensedcode.stopwords import STOPWORDS

    location = location or self.location
    query_string = query_string or self.query_string

    # bind frequently called functions to local scope for speed in the loops
    line_by_pos_append = self.line_by_pos.append

    # we use a defaultdict as a convenience at construction time; converted
    # back to a plain dict at the end
    unknowns_by_pos = defaultdict(int)
    unknowns_pos = set()
    unknowns_pos_add = unknowns_pos.add

    # we use a defaultdict as a convenience at construction time; converted
    # back to a plain dict at the end
    stopwords_by_pos = defaultdict(int)
    stopwords_pos = set()
    stopwords_pos_add = stopwords_pos.add

    self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
    dic_get = self.idx.dictionary.get

    # note: positions start at zero

    # absolute position in a query, including only known tokens
    known_pos = -1

    # flag set to True when we have found the first known token globally
    # across all query lines
    started = False

    spdx_lid_token_ids = self.spdx_lid_token_ids

    qlines = query_lines(
        location=location,
        query_string=query_string,
        start_line=start_line,
    )
    if TRACE or TRACE_STOP_AND_UNKNOWN:
        logger_debug('tokens_by_line: query lines:')
        # materialize the generator so it can be both printed and iterated
        qlines = list(qlines)
        for line_num, line in qlines:
            logger_debug(' ', line_num, ':', line)

    for line_num, line in qlines:
        if TRACE_STOP_AND_UNKNOWN:
            logger_debug(f'  line: {line_num}: {line!r}')

        # keep track of token ids in a line: unknown tokens are None entries
        line_tokens = []
        line_tokens_append = line_tokens.append
        line_first_known_pos = None

        for token in query_tokenizer(line):
            tid = dic_get(token)
            is_stopword = token in STOPWORDS
            if TRACE_STOP_AND_UNKNOWN:
                logger_debug(
                    f'    token: {token!r}, tid: {tid}, is_stopword: {is_stopword}'
                )

            if tid is not None and not is_stopword:
                # this is a known token
                known_pos += 1
                started = True
                line_by_pos_append(line_num)
                if len(token) == 1 or token.isdigit():
                    self_shorts_and_digits_pos_add(known_pos)
                if line_first_known_pos is None:
                    line_first_known_pos = known_pos
                if TRACE_STOP_AND_UNKNOWN:
                    logger_debug(f'      KNOWN token: known_pos: {known_pos}')
            else:
                # process STOPWORDS and unknown words
                if is_stopword:
                    if not started:
                        # If we have not yet started globally, then all tokens
                        # seen so far are stopwords and we keep a count of them
                        # in the magic "-1" position.
                        stopwords_by_pos[-1] += 1
                        if TRACE_STOP_AND_UNKNOWN:
                            logger_debug(f'      STOPWORD token: known_pos: -1')
                    else:
                        # here we have a new stopword positioned right after
                        # the current known_pos
                        stopwords_by_pos[known_pos] += 1
                        stopwords_pos_add(known_pos)
                        if TRACE_STOP_AND_UNKNOWN:
                            logger_debug(
                                f'      STOPWORD token: known_pos: {known_pos}'
                            )
                    # we do not track stopword ids, only their positions:
                    # skip appending to line_tokens
                    continue
                else:
                    # this is an UNKNOWN word
                    if not started:
                        # If we have not yet started globally, then all tokens
                        # seen so far are unknowns and we keep a count of them
                        # in the magic "-1" position.
                        unknowns_by_pos[-1] += 1
                        if TRACE_STOP_AND_UNKNOWN:
                            logger_debug(f'      UNKNOWN token: known_pos: -1')
                    else:
                        # here we have a new unknown token positioned right
                        # after the current known_pos
                        unknowns_by_pos[known_pos] += 1
                        unknowns_pos_add(known_pos)
                        if TRACE_STOP_AND_UNKNOWN:
                            logger_debug(
                                f'      UNKNOWN token: known_pos: {known_pos}'
                            )

            line_tokens_append(tid)

        # last known token position in the current line
        line_last_known_pos = known_pos

        # ONLY collect as SPDX a line that starts with an SPDX License
        # Identifier. There are cases where this prefix does not start as
        # the first tokens, such as when we have one or two words (such as a
        # comment indicator DNL, REM etc.) that start the line and then
        # an SPDX license identifier: allow up to two leading tokens.
        spdx_start_offset = None
        if line_tokens[:3] in spdx_lid_token_ids:
            spdx_start_offset = 0
        elif line_tokens[1:4] in spdx_lid_token_ids:
            spdx_start_offset = 1
        elif line_tokens[2:5] in spdx_lid_token_ids:
            spdx_start_offset = 2

        if spdx_start_offset is not None:
            # keep the line text and the start/end known pos for SPDX matching
            spdx_prefix, spdx_expression = split_spdx_lid(line)
            spdx_text = ' '.join([spdx_prefix or '', spdx_expression])
            spdx_start_known_pos = line_first_known_pos + spdx_start_offset
            if spdx_start_known_pos <= line_last_known_pos:
                self.spdx_lines.append(
                    (spdx_text, spdx_start_known_pos, line_last_known_pos))

        yield line_tokens

    # finally update the attributes: create a Span of positions followed by
    # unknowns, used for intersection with the query span to do the scoring
    # of matches correctly.
    # NOTE(review): stopwords_pos is collected above but no corresponding
    # stopwords Span is created here — confirm whether this is intentional.
    self.unknowns_span = Span(unknowns_pos)
    # also convert the defaultdicts back to plain dicts
    self.unknowns_by_pos = dict(unknowns_by_pos)
    self.stopwords_by_pos = dict(stopwords_by_pos)

    if TRACE_STOP_AND_UNKNOWN:
        logger_debug(f'  self.unknowns_span: {self.unknowns_span}')
        logger_debug(f'  self.unknowns_by_pos: {self.unknowns_by_pos}')
        logger_debug(f'  self.stopwords_by_pos: {self.stopwords_by_pos}')