def test_Index_exact_match_ngrams_templates_perfect_minimalist(self):
    """Exact matching against a minimal templated index doc returns the expected positions."""
    indexed = [u'name is joker, {{}} name is joker']
    idx = index.Index(ngram_len=3)
    idx.index_one('tst', text_lines(indexed), template=True)
    queried = [u'Hi my name is joker the joker name is joker yes.']
    #           012345678901234567890123456789012345678901234567
    #                     11111111112222222222333333333344444444
    expected = {
        'tst': [
            (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=33, end=5),
             Token(start=2, start_line=0, start_char=6, end_line=0, end_char=43, end=9)),
        ],
    }
    result = idx.match(text_lines(queried))
    assert {} != result
    for docid, found in result.items():
        assert expected[docid] == found
def test_Index_match_unigrams_perfect(self):
    """Unigram perfect matching finds bsd-new and maps back to the exact matched text."""
    test_docs = self.get_test_docs('index/bsd')
    idx = self.get_test_index(test_docs, ngram_len=1, template=False)
    query_loc = self.get_test_loc('index/queryperfect')
    expected = {
        'bsd-new': [
            (Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=212),
             Token(start=0, start_line=5, start_char=0, end_line=11, end_char=753, end=212)),
        ],
    }
    result = dict(idx.match(text_lines(query_loc), perfect=True))
    self.assertNotEqual({}, result)
    for docid, found in result.items():
        assert expected[docid] == found
    # verify that the matched query position maps back to the expected raw text
    with codecs.open(query_loc, encoding='utf-8') as td:
        raw_lines = td.read().splitlines(True)
    expected_text = u''.join(raw_lines[5:-2])[:-2]
    query_match_pos = result['bsd-new'][0][-1]
    matched_text = u''.join(
        analysis.doc_subset(text_lines(location=query_loc), query_match_pos))
    assert expected_text == matched_text
def test_Index_exact_match_to_indexed_template_with_short_tokens_around_gaps(self):
    """
    Regression test: matching was failing when a gapped token (from a
    template) starts at the beginning of an index doc at a position less
    than the ngram length.
    """
    # setup: tokenize the index doc once.
    # FIX: the original wrapped the already-tokenized lines in a second
    # text_lines() call (text_lines(text_lines(...))), which was redundant.
    idx = index.Index(ngram_len=4)
    index_doc = text_lines(self.get_test_loc('index/templates/idx.txt'))
    idx.index_one('idx', index_doc, template=True)
    # test index
    quad_grams_index = idx._get_index_for_len(4)
    assert 205 == len(quad_grams_index)
    assert u'software without prior written' in quad_grams_index
    # test match
    query_doc = text_lines(self.get_test_loc('index/templates/query.txt'))
    matches = idx.match(query_doc)
    # we expect a single match to the idx doc
    assert 1 == len(matches)
    matched_query_doc_position = matches['idx'][0][1]
    expected = Token(start=0, start_line=0, start_char=0,
                     end_line=39, end_char=34, end=276)
    assert expected == matched_query_doc_position
def test_Index_exact_match_unigrams_perfect(self):
    """Unigram exact matching finds bsd-new and maps back to the exact matched text."""
    test_docs = self.get_test_docs("index/bsd")
    idx = self.get_test_index(test_docs, ngram_len=1, template=False)
    query_loc = self.get_test_loc("index/queryperfect")
    expected = {
        "bsd-new": [
            (Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=212),
             Token(start=0, start_line=5, start_char=0, end_line=11, end_char=753, end=212)),
        ],
    }
    result = idx.match(text_lines(query_loc))
    assert {} != result
    for docid, found in result.items():
        assert expected[docid] == found
    # verify that the matched query position maps back to the expected raw text
    with codecs.open(query_loc, encoding="utf-8") as td:
        raw_lines = td.read().splitlines(True)
    expected_text = u"".join(raw_lines[5:-2])[:-2]
    query_match_pos = result["bsd-new"][0][-1]
    matched_text = u"".join(
        analysis.doc_subset(text_lines(location=query_loc), query_match_pos))
    assert expected_text == matched_text
def test_text_lines_from_list_or_location_yield_same_results(self):
    """text_lines yields identical lines whether given a file path or its lines as a list."""
    test_file = self.get_test_loc('analysis/bsd-new')
    with open(test_file, 'rb') as inf:
        test_strings_list = inf.read().splitlines(True)
    # test when we are passing a location or a list
    from_loc = list(text_lines(location=test_file))
    from_list = list(text_lines(location=test_strings_list))
    assert from_loc == from_list
def match(self, location, minimum_score=100):
    """
    Match the file at location against the index and return a sequence of
    LicenseMatch. If minimum_score is less than 100, also include approximate
    matches.
    """
    # NOTE: the debug prints rely on '%(name)r' % locals(); local variable
    # names here are part of the debug output and must not be renamed.
    if DEBUG:
        print('LicenseIndex.match: location=%(location)r, minimum_score=%(minimum_score)r' % locals())
    qdoc = analysis.text_lines(location)
    if DEBUG:
        # materialize the lazy lines to count them, then restore an iterator
        qdoc = list(qdoc)
        print(' LicenseIndex.match: Query doc has %d lines.' % len(qdoc))
        qdoc = iter(qdoc)
    exact_matches = self.license_index.match(qdoc, minimum_score=minimum_score)
    if DEBUG:
        len_exact_matches = len(exact_matches)
        print(' LicenseIndex.match: exact_matches#: %(len_exact_matches)r' % locals())
    # build one LicenseMatch per (index position, query position) pair
    exact_license_matches = []
    for rule_id, matched_pos in exact_matches.items():
        rule = self.rules_by_id[rule_id]
        for match in matched_pos:
            index_position, query_position = match
            lmatch = LicenseMatch(rule, query_position, index_position, score=100.00)
            exact_license_matches.append(lmatch)
    if DEBUG:
        print(' LicenseIndex.match: unfiltered exact_license_matches: %(exact_license_matches)r' % locals())
    if DEBUG_FILTER:
        print(' in EXACT: LicenseIndex.match: filtered with filter_overlapping_matches')
    # drop overlapping matches, then return in query-position order
    filtered_exact = filter_overlapping_matches(exact_license_matches, discard_negative=True)
    return sorted(filtered_exact, key=lambda x: x.span)
def test_Index_exact_match_ngrams_template_perfect_multi_index_doc_in_index(self):
    """Templated multi-doc index exactly matches bsd-new in the query doc."""
    test_docs = self.get_test_docs('index/bsd_templates')
    idx = self.get_test_index(test_docs, ngram_len=3, template=True)
    query_loc = self.get_test_loc('index/queryperfect_single_template')
    expected = {
        'bsd-new': [
            (Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=210),
             Token(start=4, start_line=5, start_char=0, end_line=11, end_char=753, end=216)),
        ],
    }
    result = idx.match(text_lines(query_loc))
    assert {} != result
    for docid, found in result.items():
        assert expected[docid] == found
def test_Index_exact_match_ngrams_perfect_single_index_doc_in_index_minimal(self):
    """A minimal single-doc index exactly matches bsd-new in the query doc."""
    test_docs = self.get_test_docs('index/mini')
    idx = self.get_test_index(test_docs, ngram_len=3, template=False)
    query_loc = self.get_test_loc('index/queryperfect-mini')
    expected = {
        'bsd-new': [
            (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=94, end=13),
             Token(start=1, start_line=2, start_char=0, end_line=2, end_char=94, end=14)),
        ],
    }
    result = idx.match(text_lines(query_loc))
    assert {} != result
    for docid, found in result.items():
        assert expected[docid] == found
def test_Index_exact_match_ngrams_perfect_minimalist(self):
    """Exact ngram matching of a minimal plain doc returns the expected positions."""
    indexed = [u'name is joker, name is joker']
    #           0    1  2      3    4  5
    idx = index.Index(ngram_len=3)
    idx.index_one('tst', text_lines(indexed), template=False)
    queried = [u'Hi my name is joker, name is joker yes.']
    # match    0  1  |2   3  4      5    6  7|    8
    expected = {
        'tst': [
            (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=28, end=5),
             Token(start=2, start_line=0, start_char=6, end_line=0, end_char=34, end=7)),
        ],
    }
    result = idx.match(queried)
    assert {} != result
    for docid, found in result.items():
        assert expected[docid] == found
def match(self, location, perfect=True):
    """
    Match the file at location against the index and return a sequence of
    LicenseMatch. If perfect is True, only return perfect matches.
    """
    # NOTE: the debug prints rely on '%(name)r' % locals(); local variable
    # names here are part of the debug output and must not be renamed.
    if DEBUG:
        print('LicenseIndex.match: location=%(location)r, perfect=%(perfect)r ' % locals())
    qdoc = analysis.text_lines(location)
    if DEBUG:
        # materialize the lazy lines to count/print them, then restore an iterator
        qdoc = list(qdoc)
        print(' LicenseIndex.match: Query doc has %d lines.' % len(qdoc))
        print(' LicenseIndex.match: Query doc:')
        print(u''.join(qdoc))
        qdoc = iter(qdoc)
    matches = self.license_index.match(qdoc, perfect)
    # build one LicenseMatch per (index position, query position) pair
    license_matches = []
    for rule_id, matched_pos in matches.items():
        rule = self.rules_by_id[rule_id]
        for match in matched_pos:
            index_position, query_position = match
            lmatch = LicenseMatch(rule, query_position, index_position, score=100)
            license_matches.append(lmatch)
    return filter_matches(license_matches)
def test_Index_match_ngrams_templates_perfect_minimalist(self):
    """Perfect matching of a minimal templated doc returns the expected positions."""
    indexed = [u'name is joker, {{}} name is joker']
    idx = index.Index(ngram_len=3)
    idx.index_one('tst', text_lines(indexed), template=True)
    queried = [u'Hi my name is joker the joker name is joker yes.']
    expected = {
        'tst': [
            (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=33, end=5),
             Token(start=2, start_line=0, start_char=6, end_line=0, end_char=43, end=9)),
        ],
    }
    result = dict(idx.match(text_lines(queried), perfect=True))
    self.assertNotEqual({}, result)
    for docid, found in result.items():
        assert expected[docid] == found
def test_get_tokens_count(self):
    """Per-doc token counts and per-length ngram keys match expectations."""
    base = self.get_test_loc('index/tokens_count', copy=True)
    docids = os.listdir(base)
    idx = index.Index(ngram_len=3)
    for docid in docids:
        doc = text_lines(location=os.path.join(base, docid))
        idx.index_one(docid, doc, template=docid.startswith('tmpl'))
    # each per-length index holds the expected ngram keys
    per_len_expectations = [
        (idx.indexes[1],
         set(['all', 'redistribution', 'for', 'is'])),
        (idx.indexes[2],
         set(['is allowed', 'all and', 'redistribution is', 'allowed for'])),
        (idx.indexes[3],
         set(['for all and', 'and any thing', 'is allowed for',
              'all and any', 'redistribution is allowed', 'allowed for all'])),
    ]
    for idxi, expected_keys in per_len_expectations:
        assert expected_keys == set(idxi.keys())
    expected_counts = {
        'plain1': 1, 'plain2': 2, 'plain3': 3, 'plain4': 4, 'plain5': 5,
        'tmpl10': 10, 'tmpl2': 2, 'tmpl3': 3, 'tmpl4': 4, 'tmpl5': 5,
        'tmpl5_2': 5, 'tmpl6': 6, 'tmpl7': 7, 'tmpl8': 8, 'tmpl9': 9,
    }
    result = {docid: idx.get_tokens_count(docid) for docid in docids}
    assert expected_counts == result
def detect_copyrights(location):
    """
    Yield tuples of: (copyrights list, authors list, years list, holders list,
    start line, end line) detected in file at location.
    """
    detector = CopyrightDetector()
    for numbered_lines in candidate_lines(analysis.text_lines(location)):
        detection = detector.detect(numbered_lines)
        copyrights, authors, years, holders, _start, _end = detection
        # only yield when at least one field was actually detected
        if copyrights or authors or years or holders:
            yield detection
def test_Index_exact_match_ngrams_templates_perfect_minimalist(self):
    """Exact matching of a minimal templated doc returns the expected positions."""
    indexed = [u"name is joker, {{}} name is joker"]
    idx = index.Index(ngram_len=3)
    idx.index_one("tst", text_lines(indexed), template=True)
    queried = [u"Hi my name is joker the joker name is joker yes."]
    #           012345678901234567890123456789012345678901234567
    #                     11111111112222222222333333333344444444
    expected = {
        "tst": [
            (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=33, end=5),
             Token(start=2, start_line=0, start_char=6, end_line=0, end_char=43, end=9)),
        ],
    }
    result = idx.match(text_lines(queried))
    assert {} != result
    for docid, found in result.items():
        assert expected[docid] == found
def test_get_tokens_count(self):
    """Per-doc token counts and per-length ngram tuple keys match expectations."""
    base = self.get_test_loc('index/tokens_count', copy=True)
    docids = os.listdir(base)
    idx = index.Index(ngram_len=3)
    for docid in docids:
        doc = text_lines(location=os.path.join(base, docid))
        idx.index_one(docid, doc, template=docid.startswith('tmpl'))
    # each per-length index holds the expected ngram tuples as keys
    per_len_expectations = [
        (idx.indexes[1],
         set([('all',), ('redistribution',), ('for',), ('is',)])),
        (idx.indexes[2],
         set([('is', 'allowed',), ('all', 'and',),
              ('redistribution', 'is',), ('allowed', 'for',)])),
        (idx.indexes[3],
         set([('for', 'all', 'and',), ('and', 'any', 'thing',),
              ('is', 'allowed', 'for',), ('all', 'and', 'any',),
              ('redistribution', 'is', 'allowed',),
              ('allowed', 'for', 'all',)])),
    ]
    for idxi, expected_keys in per_len_expectations:
        assert expected_keys == set(idxi.keys())
    expected_counts = {
        'plain1': 1, 'plain2': 2, 'plain3': 3, 'plain4': 4, 'plain5': 5,
        'tmpl10': 10, 'tmpl2': 2, 'tmpl3': 3, 'tmpl4': 4, 'tmpl5': 5,
        'tmpl5_2': 5, 'tmpl6': 6, 'tmpl7': 7, 'tmpl8': 8, 'tmpl9': 9,
    }
    result = {docid: idx.get_tokens_count(docid) for docid in docids}
    assert expected_counts == result
def text(self):
    """
    Return the rule text: the cached value when set, otherwise loaded from
    the rule text file. Raise an Exception when neither is available.
    """
    # used for test only
    if self._text:
        return self._text
    if self.text_file and exists(self.text_file):
        # IMPORTANT: use the same process as query text loading for symmetry
        return ''.join(text_lines(self.text_file, demarkup=False))
    raise Exception('Inconsistent rule text for:', self.identifier)
def test_Index_exact_match_to_indexed_template_with_short_tokens_around_gaps(self):
    """
    Regression test: matching was failing when a gapped token (from a
    template) starts at the beginning of an index doc at a position less
    than the ngram length.
    """
    # setup: tokenize the index doc once.
    # FIX: the original wrapped the already-tokenized lines in a second
    # text_lines() call (text_lines(text_lines(...))), which was redundant.
    idx = index.Index(ngram_len=4)
    index_doc = text_lines(self.get_test_loc("index/templates/idx.txt"))
    idx.index_one("idx", index_doc, template=True)
    # test index
    quad_grams_index = idx._get_index_for_len(4)
    assert 205 == len(quad_grams_index)
    assert u"software without prior written" in quad_grams_index
    # test match
    query_doc = text_lines(self.get_test_loc("index/templates/query.txt"))
    matches = idx.match(query_doc)
    # we expect a single match to the idx doc
    assert 1 == len(matches)
    matched_query_doc_position = matches["idx"][0][1]
    expected = Token(start=0, start_line=0, start_char=0,
                     end_line=39, end_char=34, end=276)
    assert expected == matched_query_doc_position
def test_get_tokens_count(self):
    """Per-doc token counts and per-length ngram keys match expectations."""
    base = self.get_test_loc("index/tokens_count", copy=True)
    docids = os.listdir(base)
    idx = index.Index(ngram_len=3)
    for docid in docids:
        doc = text_lines(location=os.path.join(base, docid))
        idx.index_one(docid, doc, template=docid.startswith("tmpl"))
    # each per-length index holds the expected ngram keys
    per_len_expectations = [
        (idx.indexes[1],
         set(["all", "redistribution", "for", "is"])),
        (idx.indexes[2],
         set(["is allowed", "all and", "redistribution is", "allowed for"])),
        (idx.indexes[3],
         set(["for all and", "and any thing", "is allowed for",
              "all and any", "redistribution is allowed", "allowed for all"])),
    ]
    for idxi, expected_keys in per_len_expectations:
        assert expected_keys == set(idxi.keys())
    expected_counts = {
        "plain1": 1, "plain2": 2, "plain3": 3, "plain4": 4, "plain5": 5,
        "tmpl10": 10, "tmpl2": 2, "tmpl3": 3, "tmpl4": 4, "tmpl5": 5,
        "tmpl5_2": 5, "tmpl6": 6, "tmpl7": 7, "tmpl8": 8, "tmpl9": 9,
    }
    result = {docid: idx.get_tokens_count(docid) for docid in docids}
    assert expected_counts == result
def test_Index_match_ngrams_perfect_single_index_doc_in_index_minimal(self):
    """A minimal single-doc index perfectly matches bsd-new in the query doc."""
    test_docs = self.get_test_docs('index/mini')
    idx = self.get_test_index(test_docs, ngram_len=3, template=False)
    query_loc = self.get_test_loc('index/queryperfect-mini')
    expected = {
        'bsd-new': [
            (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=94, end=13),
             Token(start=1, start_line=2, start_char=0, end_line=2, end_char=94, end=14)),
        ],
    }
    result = dict(idx.match(text_lines(query_loc), perfect=True))
    assert {} != result
    for docid, found in result.items():
        assert expected[docid] == found
def test_Index_exact_match_return_one_match_with_correct_offsets(self):
    """A single match is returned with correct char offsets on both sides."""
    idx = index.Index(ngram_len=4)
    idx.index_one('tst', text_lines([u'A one. A two. A three.']), template=False)
    queried = [u'some junk. A one. A two. A three.']
    #                     1111111111222222222233
    #           012345678901234567890123456789012
    hits = idx.match(queried)['tst']
    assert 1 == len(hits)
    index_pos, query_pos = hits[0]
    assert 11 == query_pos.start_char
    assert 32 == query_pos.end_char
    assert 0 == index_pos.start_char
    assert 21 == index_pos.end_char
def test_Index_exact_match_return_one_match_with_correct_offsets(self):
    """A single match is returned with correct char offsets on both sides."""
    idx = index.Index(ngram_len=4)
    idx.index_one("tst", text_lines([u"A one. A two. A three."]), template=False)
    queried = [u"some junk. A one. A two. A three."]
    #                     1111111111222222222233
    #           012345678901234567890123456789012
    hits = idx.match(queried)["tst"]
    assert 1 == len(hits)
    index_pos, query_pos = hits[0]
    assert 11 == query_pos.start_char
    assert 32 == query_pos.end_char
    assert 0 == index_pos.start_char
    assert 21 == index_pos.end_char
def test_Index_match_ngrams_template_perfect_multi_index_doc_in_index(self):
    """Templated multi-doc index perfectly matches bsd-new in the query doc."""
    test_docs = self.get_test_docs('index/bsd_templates')
    idx = self.get_test_index(test_docs, ngram_len=3, template=True)
    query_loc = self.get_test_loc('index/queryperfect_single_template')
    expected = {
        'bsd-new': [
            (Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=210),
             Token(start=4, start_line=5, start_char=0, end_line=11, end_char=753, end=216)),
        ],
    }
    result = dict(idx.match(text_lines(query_loc), perfect=True))
    self.assertNotEqual({}, result)
    for docid, found in result.items():
        assert expected[docid] == found
def match(self, location, minimum_score=100):
    """
    Match the file at location against the index and return a sequence of
    LicenseMatch. If minimum_score is less than 100, also include approximate
    matches.
    """
    # NOTE: the debug prints rely on '%(name)r' % locals(); local variable
    # names here are part of the debug output and must not be renamed.
    if DEBUG:
        print(
            'LicenseIndex.match: location=%(location)r, minimum_score=%(minimum_score)r'
            % locals())
    qdoc = analysis.text_lines(location)
    if DEBUG:
        # materialize the lazy lines to count them, then restore an iterator
        qdoc = list(qdoc)
        print(' LicenseIndex.match: Query doc has %d lines.' % len(qdoc))
        qdoc = iter(qdoc)
    exact_matches = self.license_index.match(qdoc, minimum_score=minimum_score)
    if DEBUG:
        len_exact_matches = len(exact_matches)
        print(
            ' LicenseIndex.match: exact_matches#: %(len_exact_matches)r'
            % locals())
    # build one LicenseMatch per (index position, query position) pair
    exact_license_matches = []
    for rule_id, matched_pos in exact_matches.items():
        rule = self.rules_by_id[rule_id]
        for match in matched_pos:
            index_position, query_position = match
            lmatch = LicenseMatch(rule, query_position, index_position, score=100.00)
            exact_license_matches.append(lmatch)
    if DEBUG:
        print(
            ' LicenseIndex.match: unfiltered exact_license_matches: %(exact_license_matches)r'
            % locals())
    if DEBUG_FILTER:
        print(
            ' in EXACT: LicenseIndex.match: filtered with filter_overlapping_matches'
        )
    # drop overlapping matches (keeping negatives here: discard_negative=False),
    # then return in query-position order
    filtered_exact = filter_overlapping_matches(exact_license_matches, discard_negative=False)
    return sorted(filtered_exact, key=lambda x: x.span)
def test_Index_match_simple(self):
    """Matching a simple query doc against the bsd index finds each rule's positions."""
    test_docs = self.get_test_docs('index/bsd')
    idx = self.get_test_index(test_docs, ngram_len=1)
    query_loc = self.get_test_loc('index/querysimple')
    # one shared index-side token for every 'bsd-original*' hit
    orig_idx_tok = Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0)
    orig_query_toks = [
        Token(start=29, start_line=6, start_char=59, end_line=6, end_char=68, end=29),
        Token(start=47, start_line=7, start_char=62, end_line=7, end_char=71, end=47),
        Token(start=103, start_line=10, start_char=33, end_line=10, end_char=42, end=103),
        Token(start=137, start_line=12, start_char=117, end_line=12, end_char=126, end=137),
    ]
    expected = {
        'bsd-new': [
            (Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=212),
             Token(start=0, start_line=4, start_char=0, end_line=12, end_char=607, end=212)),
        ],
        'bsd-no-mod': [
            (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=49, end=7),
             Token(start=0, start_line=4, start_char=0, end_line=4, end_char=49, end=7)),
        ],
        'bsd-original': [(orig_idx_tok, qtok) for qtok in orig_query_toks],
        'bsd-original-uc': [(orig_idx_tok, qtok) for qtok in orig_query_toks],
        'bsd-simplified': [
            (Token(start=0, start_line=0, start_char=3, end_line=7, end_char=73, end=67),
             Token(start=0, start_line=4, start_char=0, end_line=7, end_char=207, end=67)),
        ],
    }
    result = dict(idx.match(text_lines(query_loc), perfect=True))
    for docid, found in result.items():
        assert expected[docid] == found
def test_Index_exact_match_ngrams_template_perfect_multi_index_doc_in_index(self):
    """Templated multi-doc index exactly matches bsd-new in the query doc."""
    test_docs = self.get_test_docs("index/bsd_templates")
    idx = self.get_test_index(test_docs, ngram_len=3, template=True)
    query_loc = self.get_test_loc("index/queryperfect_single_template")
    expected = {
        "bsd-new": [
            (Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=210),
             Token(start=4, start_line=5, start_char=0, end_line=11, end_char=753, end=216)),
        ],
    }
    result = idx.match(text_lines(query_loc))
    assert {} != result
    for docid, found in result.items():
        assert expected[docid] == found
def test_Index_exact_match_ngrams_perfect_single_index_doc_in_index_minimal(self):
    """A minimal single-doc index exactly matches bsd-new in the query doc."""
    test_docs = self.get_test_docs("index/mini")
    idx = self.get_test_index(test_docs, ngram_len=3, template=False)
    query_loc = self.get_test_loc("index/queryperfect-mini")
    expected = {
        "bsd-new": [
            (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=94, end=13),
             Token(start=1, start_line=2, start_char=0, end_line=2, end_char=94, end=14)),
        ],
    }
    result = idx.match(text_lines(query_loc))
    assert {} != result
    for docid, found in result.items():
        assert expected[docid] == found
def find(location, patterns):
    """
    Yield match and matched lines for patterns found in file at location. as a
    tuple of (key, found text, text line). Pattern is list of tuples (key,
    compiled regex).

    Note: the location can be a list of lines for testing convenience.
    """
    # NOTE: the debug prints rely on '%(name)r' % locals(); local variable
    # names here (loc, key, match, line) are part of the debug output.
    if DEBUG:
        loc = pformat(location)
        print('find(location=%(loc)r,\n patterns=%(patterns)r)' % locals())
    for line in analysis.text_lines(location):
        for key, pattern in patterns:
            # findall: every non-overlapping occurrence on this line
            for match in pattern.findall(line):
                if DEBUG:
                    print('find: yielding match: key=%(key)r, '
                          'match=%(match)r,\n line=%(line)r' % locals())
                # NOTE(review): unicode() is Python 2-only — confirm this
                # module still targets py2 before porting.
                yield key, unicode(match), line
def test_Index_exact_match_ngrams_perfect_minimalist(self):
    """Exact ngram matching of a minimal plain doc returns the expected positions."""
    indexed = [u"name is joker, name is joker"]
    #           0    1  2      3    4  5
    idx = index.Index(ngram_len=3)
    idx.index_one("tst", text_lines(indexed), template=False)
    queried = [u"Hi my name is joker, name is joker yes."]
    # match    0  1  |2   3  4      5    6  7|    8
    expected = {
        "tst": [
            (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=28, end=5),
             Token(start=2, start_line=0, start_char=6, end_line=0, end_char=34, end=7)),
        ],
    }
    result = idx.match(queried)
    assert {} != result
    for docid, found in result.items():
        assert expected[docid] == found
def query_lines(location=None, query_string=None, strip=True):
    """
    Return an iterable of text lines given a file at `location` or a
    `query string`. Include empty lines.
    """
    # TODO: OPTIMIZE: tokenizing line by line may be rather slow
    # we could instead get lines and tokens at once in a batch?
    lines = []
    if location:
        lines = text_lines(location, demarkup=False)
    elif query_string:
        # keep the line endings only when not stripping
        lines = query_string.splitlines(not strip)
    for line in lines:
        yield line.strip() if strip else line
def test_Index_exact_match_simple(self):
    """Exact matching a simple query doc against the bsd index finds each rule's positions."""
    test_docs = self.get_test_docs("index/bsd")
    idx = self.get_test_index(test_docs, ngram_len=1)
    query_loc = self.get_test_loc("index/querysimple")
    # one shared index-side token for every 'bsd-original*' hit
    orig_idx_tok = Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0)
    orig_query_toks = [
        Token(start=29, start_line=6, start_char=59, end_line=6, end_char=68, end=29),
        Token(start=47, start_line=7, start_char=62, end_line=7, end_char=71, end=47),
        Token(start=103, start_line=10, start_char=33, end_line=10, end_char=42, end=103),
        Token(start=137, start_line=12, start_char=117, end_line=12, end_char=126, end=137),
    ]
    expected = {
        "bsd-new": [
            (Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=212),
             Token(start=0, start_line=4, start_char=0, end_line=12, end_char=607, end=212)),
        ],
        "bsd-no-mod": [
            (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=49, end=7),
             Token(start=0, start_line=4, start_char=0, end_line=4, end_char=49, end=7)),
        ],
        "bsd-original": [(orig_idx_tok, qtok) for qtok in orig_query_toks],
        "bsd-original-uc": [(orig_idx_tok, qtok) for qtok in orig_query_toks],
        "bsd-simplified": [
            (Token(start=0, start_line=0, start_char=3, end_line=7, end_char=73, end=67),
             Token(start=0, start_line=4, start_char=0, end_line=7, end_char=207, end=67)),
        ],
    }
    result = idx.match(text_lines(query_loc))
    for docid, found in result.items():
        assert expected[docid] == found
def get_test_docs(self, base, subset=None):
    """
    Yield (docid, text lines) pairs for test docs found under base,
    optionally restricted to docids listed in subset.
    """
    base = self.get_test_loc(base, copy=True)
    for docid in os.listdir(base):
        # no subset means take everything
        if not subset or docid in subset:
            yield docid, text_lines(location=os.path.join(base, docid))
def test_some_media_do_yield_text_lines(self):
    """Media files that embed text must yield lines, at least one mentioning 'nexb'."""
    test_dir = self.get_test_loc('media_with_text')
    for test_file in file_iter(test_dir):
        lines = list(text_lines(test_file))
        assert lines, 'Should return text lines:' + test_file
        assert any('nexb' in line for line in lines)
def test_some_media_do_not_yield_text_lines(self):
    """Media files without embedded text must yield no lines at all."""
    test_dir = self.get_test_loc('media_without_text')
    for test_file in file_iter(test_dir):
        lines = list(text_lines(test_file))
        assert [] == lines, 'Should not return text lines:' + test_file
def test_archives_do_not_yield_text_lines(self):
    """Archive files (e.g. a jar) must yield no text lines."""
    test_file = self.get_test_loc('archive/simple.jar')
    assert [] == list(text_lines(test_file))