示例#1
0
    def get_dummy_index(self):
        """Return a new ``Index`` pre-filled with a fixed set of registrations.

        Four ``register_Token`` calls are made, in order. The pair
        ``('DaDaDa', 'd01')`` is registered twice.
        """
        # NOTE(review): the repeated ('DaDaDa', 'd01') entry presumably
        # exercises duplicate registration - confirm before removing it.
        registrations = (
            ('DaDaDa', 'd01'),
            ('DaDaDa', 'd01'),
            ('DumDiDum', 'd02'),
            ('DumDiDum', 'd03'),
        )

        fixture = Index()
        for token, target in registrations:
            fixture.register_Token(token, target)
        return fixture
示例#2
0
class Indexer:
	"""
	Build an inverted index from the page objects handed to it.

	Pages are tracked by title: a page whose title has already been
	indexed is skipped when it is seen again.
	"""

	def __init__(self):
		self.__index = Index()
		self.__already_indexed_pages = set()

	def index_pages(self, pages):
		"""
		Index every page contained in ``pages``.

		Raises ``TypeError`` when ``pages`` is not a ``Pages`` instance.
		"""
		if isinstance(pages, Pages):
			for page in pages:
				self.__index_page(page)
			return

		raise TypeError('pages must be an instance of Pages')

	def __index_page(self, page):
		"""
		Register all tokens of one page under the page's title.
		"""
		seen_titles = self.__already_indexed_pages

		# Only pages whose title has not been seen yet are tokenized.
		if page.title not in seen_titles:
			for token in TokenLexer(page).tokens():
				self.__index.register_Token(token, page.title)

			# Record the title only after all tokens were registered.
			seen_titles.add(page.title)

	@property
	def index(self):
		"""
		The inverted index built so far.
		"""
		return self.__index
示例#3
0
    def index_data(self, page_index):
        """Return the combined data for the index page ``page_index``.

        When ``Options().caching`` is set, a result already cached under the
        key ``/index/<page_index>`` is returned directly, and a freshly built
        result is cached under that key before it is returned.

        The posts passed to ``Index().data`` are ``self.posts_files`` when
        ``Options().include_drafts`` is set, otherwise
        ``self.posts_published``.
        """
        cache_key = '/index/{0}'.format(page_index)

        if Options().caching and DataCacher().cached_already(cache_key):
            return DataCacher().get_cached(cache_key)

        shared = self.common_data
        posts = (self.posts_files if Options().include_drafts
                 else self.posts_published)

        # The second element of the returned pair (the introduction content)
        # is not used yet.
        page_data, _introduction = Index().data(page_index, posts)
        merged = self.__combine(shared, page_data)

        if Options().caching:
            DataCacher().cache(cache_key, merged)

        return merged
示例#4
0
class Indexer:
    """Fill an ``Index`` from the document collection of a ``Preprocessor``.

    ``stopwords`` is the container of words that are skipped while indexing;
    membership is checked with ``in``.
    """

    def __init__(self, stopwords):
        self.__index = Index()
        self.__preprocessor = Preprocessor()
        self.__sw = stopwords

    def __process_word(self, word):
        """Return the form of ``word`` that is stored in the index.

        Currently the word is returned unchanged.
        """
        return word

    def load_inverted_index(self):
        """Index every document returned by the preprocessor.

        For each document, every word that is not a stopword is passed
        through ``__process_word`` and recorded with
        ``add_tf_inverted_file`` and ``update_file_vector``. Once all words
        of a document are handled, ``normalize_doc_freq`` is called for that
        document, including documents that contributed no words.
        """
        for doc in self.__preprocessor.load_collection():
            # The name is the same for every word of the document, so it is
            # looked up once per document instead of up to three times per word.
            doc_name = doc.get_name()

            for word in doc.get_content():
                if word in self.__sw:
                    continue
                term = self.__process_word(word)
                self.__index.add_tf_inverted_file(term, doc_name)
                self.__index.update_file_vector(term, doc_name)

            self.__index.normalize_doc_freq(doc_name)

    def get_index(self):
        """Return the index object filled by ``load_inverted_index``."""
        return self.__index
示例#5
0
 def __init__(self, stopwords):
     """Keep ``stopwords`` and create a new ``Index`` and ``Preprocessor``."""
     self.__sw = stopwords
     self.__index = Index()
     self.__preprocessor = Preprocessor()
示例#6
0
	def __init__(self):
		"""Create a new ``Index`` and an empty set of already indexed pages."""
		self.__already_indexed_pages = set()
		self.__index = Index()
示例#7
0
 def index_highlight_posts(self):
     """Return the result of ``__get_data`` for ``'highlight_posts'``.

     The lookup is made against a new ``Index`` instance under the name
     ``'index_highlight_posts'``.
     """
     fetch = self.__get_data
     return fetch('index_highlight_posts', Index(), 'highlight_posts')
示例#8
0
 def index_spotlight_posts(self):
     """Return the result of ``__get_data`` for ``'spotlight_posts'``.

     The lookup is made against a new ``Index`` instance under the name
     ``'index_spotlight_posts'``.
     """
     fetch = self.__get_data
     return fetch('index_spotlight_posts', Index(), 'spotlight_posts')
示例#9
0
 def index_max_posts(self):
     """Return the result of ``__get_data`` for ``'max_posts'``.

     The lookup is made against a new ``Index`` instance under the name
     ``'index_max_posts'``.
     """
     fetch = self.__get_data
     return fetch('index_max_posts', Index(), 'max_posts')
示例#10
0
 def index_footer_menu(self):
     """Return the result of ``__get_data`` for ``'footer_menu'``.

     The lookup is made against a new ``Index`` instance under the name
     ``'index_footer_menu'``.
     """
     fetch = self.__get_data
     return fetch('index_footer_menu', Index(), 'footer_menu')
示例#11
0
 def index_main_menu(self):
     """Return the result of ``__get_data`` for ``'main_menu'``.

     The lookup is made against a new ``Index`` instance under the name
     ``'index_main_menu'``.
     """
     fetch = self.__get_data
     return fetch('index_main_menu', Index(), 'main_menu')