def make_old_index():
    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
    from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
    from zope.index.text.stopdict import get_stopdict

    l = Lexicon(get_stopdict())
    l.SplitterFunc = MySplitter()
    return TextIndex("read", lexicon=l)

def testThreeElementPipeline(self):
    lexicon = Lexicon(
        Splitter(),
        StopWordPipelineElement({"and": 1}),
        StupidPipelineElement("dogs", "fish"),
        WackyReversePipelineElement("fish"),
    )
    wids = lexicon.sourceToWordIds("cats and dogs")
    wids = lexicon.termToWordIds("hsif")
    self.assertEqual(wids, [2])

def tfIdfBlock(self, data, field):
    '''Creates TF/IDF canopy of a given set of data'''

    class CustomStopWordRemover(object):
        stop_words = self.stop_words[field].copy()

        def process(self, lst):
            return [w for w in lst if w not in self.stop_words]

    index = TextIndex(Lexicon(Splitter(), CustomStopWordRemover()))
    index.index = CosineIndex(index.lexicon)

    # map the index's sequential doc ids back to the original record ids
    index_to_id = {}
    base_tokens = {}

    for i, (record_id, doc) in enumerate(data, 1):
        index_to_id[i] = record_id
        base_tokens[i] = doc
        index.index_doc(i, doc)

    canopies = (tfidf._createCanopies(index, base_tokens, threshold, field)
                for threshold in self.tfidf_fields[field])

    for canopy in canopies:
        key, index_canopy = canopy
        # translate canopy membership from internal doc ids to record ids
        id_canopy = dict((index_to_id[k], index_to_id[v])
                         for k, v in index_canopy.iteritems())
        self.canopies[key] = defaultdict(str, id_canopy)

def _makeOne(self, family=None):
    from zope.index.text.lexicon import Lexicon
    from zope.index.text.lexicon import Splitter

    if family is None:
        family = self._getBTreesFamily()
    lexicon = Lexicon(Splitter())
    return self._getTargetClass()(lexicon, family=family)

def _default_indexes():
    return {
        'title': CatalogFieldIndex(get_title),
        'description': CatalogFieldIndex(get_description),
        'type_name': CatalogFieldIndex(get_type_name),
        'sortable_title': CatalogFieldIndex(get_sortable_title),
        'path': CatalogPathIndex(get_path),
        'searchable_text': CatalogTextIndex(
            get_searchable_text,
            lexicon=Lexicon(Splitter(), CaseNormalizer()),
        ),
        'uid': CatalogFieldIndex(get_uid),
        'tags': CatalogKeywordIndex(get_tags),
        'search_visible': CatalogFieldIndex(get_search_visible),
        'date': CatalogFieldIndex(get_date),
        'modified': CatalogFieldIndex(get_modified),
        'created': CatalogFieldIndex(get_created),
        'wf_state': CatalogFieldIndex(get_wf_state),
        'workflow': CatalogFieldIndex(get_workflow),
    }.items()

def _makeIndexAndParser(self):
    from zope.index.text.lexicon import Lexicon
    from zope.index.text.lexicon import Splitter
    from zope.index.text.queryparser import QueryParser

    lexicon = Lexicon(Splitter())
    parser = QueryParser(lexicon)
    index = FauxIndex()
    return index, parser

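# Hedged usage sketch, not taken from the source: it shows how a QueryParser
# built the same way as in _makeIndexAndParser() turns a query string into a
# parse tree via parseQuery(); the query text is illustrative only.
def _example_parse_query():
    from zope.index.text.lexicon import Lexicon
    from zope.index.text.lexicon import Splitter
    from zope.index.text.queryparser import QueryParser

    parser = QueryParser(Lexicon(Splitter()))
    tree = parser.parseQuery("cats AND dogs")
    # The resulting parse tree can be evaluated against a text index,
    # e.g. tree.executeQuery(index).
    return tree
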
def __init__(self, lexicon=None, index=None):
    """Provisional constructor.

    This creates the lexicon and index if not passed in.
    """
    _explicit_lexicon = True
    if lexicon is None:
        _explicit_lexicon = False
        lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    if index is None:
        index = OkapiIndex(lexicon)
    # prefer an explicitly passed lexicon; otherwise adopt the index's own
    self.lexicon = _explicit_lexicon and lexicon or index.lexicon
    self.index = index

def __init__(self, field, stop_words=[]):
    self.field = field

    splitter = Splitter()
    stop_word_remover = CustomStopWordRemover(stop_words)
    operator_escaper = OperatorEscaper()
    lexicon = Lexicon(splitter, stop_word_remover, operator_escaper)

    self._index = TextIndex(lexicon)
    self._index.index = CosineIndex(self._index.lexicon)

    self._i_to_id = {}
    self._parseTerms = self._index.lexicon.parseTerms

def __init__(self, discriminator, lexicon=None, index=None):
    _lexicon = lexicon
    if lexicon is None:
        _lexicon = Lexicon(
            Splitter(),
            CaseNormalizer(),
            StopWordRemover(),
        )
    if index is None:
        index = OkapiIndex(_lexicon, family=self.family)
    super(TextIndex, self).__init__(discriminator, lexicon, index)
    if lexicon is None:
        # no lexicon was passed in, so adopt the one the index was built with
        self.lexicon = index.lexicon
    self.index = index
    self.clear()

def stopWords(data):
    index = TextIndex(Lexicon(Splitter()))

    for i, (_, doc) in enumerate(data, 1):
        index.index_doc(i, doc)

    # for each word, count how many documents it appears in
    doc_freq = [(len(index.index._wordinfo[wid]), word)
                for word, wid in index.lexicon.items()]

    # most frequent words first, so we can stop at the first word below the threshold
    doc_freq.sort(reverse=True)

    N = float(index.index.documentCount())
    # a word is a stop word if it occurs in more than 5% of documents,
    # but never with fewer than 1000 occurrences
    threshold = int(max(1000, N * 0.05))

    stop_words = set([])

    for frequency, word in doc_freq:
        if frequency > threshold:
            stop_words.add(word)
        else:
            break

    return stop_words

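# Hedged usage sketch, not taken from the source: stopWords() expects an
# iterable of (record_id, document_text) pairs; the ids are ignored and only
# the text is indexed. The sample records below are made up. Because the
# frequency threshold is at least 1000 documents, this tiny sample yields an
# empty set; on real-sized data the most common words are returned.
def _example_stop_words():
    sample_data = [
        ("rec-1", "acme incorporated new york"),
        ("rec-2", "acme limited boston"),
        ("rec-3", "widget works new jersey"),
    ]
    return stopWords(sample_data)
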
def _create_full_text_index(self, language):
    lexicon = Lexicon(
        FullTextIndexProcessor(language, self.stemming)
    )
    return OkapiIndex(lexicon)

def testTermToWordIds(self):
    lexicon = Lexicon(Splitter())
    wids = lexicon.sourceToWordIds("cats and dogs")
    wids = lexicon.termToWordIds("dogs")
    self.assertEqual(wids, [3])

def testOnePipelineElement(self):
    lexicon = Lexicon(Splitter(), StupidPipelineElement("dogs", "fish"))
    wids = lexicon.sourceToWordIds("cats and dogs")
    wids = lexicon.termToWordIds("fish")
    self.assertEqual(wids, [3])

def testMissingTermToWordIds(self):
    lexicon = Lexicon(Splitter())
    wids = lexicon.sourceToWordIds("cats and dogs")
    wids = lexicon.termToWordIds("boxes")
    self.assertEqual(wids, [0])

def setUp(self):
    self.lexicon = Lexicon(Splitter())
    self.index = self.IndexFactory(self.lexicon)

class IndexTest(TestCase):

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)

    def _test_index_document_assertions(self, DOCID=1):
        self.assertEqual(self.index.documentCount(), 1)
        self.assertEqual(self.index.wordCount(), 5)
        self.assertEqual(self.lexicon.wordCount(), 5)
        self.assert_(self.index.has_doc(DOCID))
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 5)
        self.assertEqual(len(self.index._wordinfo), self.index.wordCount())
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_index_document(self, DOCID=1):
        doc = "simple document contains five words"
        self.assert_(not self.index.has_doc(DOCID))
        self.index.index_doc(DOCID, doc)
        self._test_index_document_assertions(DOCID)

    def test_unindex_document_absent_docid(self):
        self.test_index_document(1)
        self.index.unindex_doc(2)
        self._test_index_document_assertions(1)

    def test_clear(self):
        self.test_index_document(1)
        self.index.clear()
        self._test_unindex_document_assertions()

    def _test_unindex_document_assertions(self):
        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)
        self.assertEqual(len(self.index._wordinfo), self.index.wordCount())

    def test_unindex_document(self):
        DOCID = 1
        self.test_index_document(DOCID)
        self.index.unindex_doc(DOCID)
        self._test_unindex_document_assertions()

    def test_index_two_documents(self):
        self.test_index_document()
        doc = "another document just four"
        DOCID = 2
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 2)
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo), self.index.wordCount())
        wids = self.lexicon.termToWordIds("document")
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        for wid, map in self.index._wordinfo.items():
            if wid == document_wid:
                self.assertEqual(len(map), 2)
                self.assert_(map.has_key(1))
                self.assert_(map.has_key(DOCID))
            else:
                self.assertEqual(len(map), 1)

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        DOCID = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo), self.index.wordCount())
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_index_duplicated_words(self, DOCID=1):
        doc = "very simple repeat repeat repeat document test"
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 7)
        self.assertEqual(len(self.index._wordinfo), self.index.wordCount())
        wids = self.lexicon.termToWordIds("repeat")
        self.assertEqual(len(wids), 1)
        repititive_wid = wids[0]
        for wid, map in self.index._wordinfo.items():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("frobnicate")
        self.assertEqual(list(results.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_search_phrase(self):
        self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
        self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
        results = self.index.search_phrase("quick brown fox")
        self.assertEqual(list(results.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, "how now brown cow")
        self.index.index_doc(2, "hough nough browne cough")
        self.index.index_doc(3, "bar brawl")
        results = self.index.search_glob("bro*")
        self.assertEqual(list(results.keys()), [1, 2])
        results = self.index.search_glob("b*")
        self.assertEqual(list(results.keys()), [1, 2, 3])

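# Hedged standalone sketch, not taken from the source: it exercises the same
# zope.index.text API as the tests above, with OkapiIndex chosen here as one
# concrete IndexFactory; the document text is illustrative only.
def _example_okapi_search():
    from zope.index.text.lexicon import Lexicon
    from zope.index.text.lexicon import Splitter
    from zope.index.text.okapiindex import OkapiIndex

    lexicon = Lexicon(Splitter())
    index = OkapiIndex(lexicon)
    index.index_doc(1, "simple document contains five words")
    results = index.search("document")  # maps docid -> relevance score
    return list(results.keys())  # [1]
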
def testSplitterAdaptorNofold(self):
    lexicon = Lexicon(Splitter())
    wids = lexicon.sourceToWordIds("CATS and dogs")
    wids = lexicon.termToWordIds("cats and dogs")
    self.assertEqual(wids, [0, 2, 3])

def _makeLexicon(self, *pipeline):
    from zope.index.text.lexicon import Lexicon
    from zope.index.text.lexicon import Splitter

    if not pipeline:
        pipeline = (Splitter(),)
    return Lexicon(*pipeline)

def _makeLexicon(self):
    from zope.index.text.lexicon import Lexicon
    return Lexicon(*self._makePipeline())