import glob
import json
import multiprocessing
import os
import threading

from collections import defaultdict, OrderedDict
from concurrent import futures

# Trie, prepare_folder, parse_file, ContextualizedURL and OrderedSet are
# assumed to be provided elsewhere in the surrounding package; their exact
# import paths are not shown here.


# Threaded variant: pages are handed in as parsed lxml trees and indexed
# concurrently by a thread pool, so the indices are guarded by a lock.
class SearchIndex(object):
    def __init__(self, scan_dir, output_dir, private_dir):
        self.__scan_dir = scan_dir
        self.__output_dir = output_dir
        self.__private_dir = private_dir

        prepare_folder(self.__search_dir)
        prepare_folder(self.__fragments_dir)

        self.__indices_lock = threading.Lock()
        self.__full_index = defaultdict(list)
        self.__new_index = defaultdict(list)
        self.__trie = Trie()
        self.__filler = futures.ThreadPoolExecutor(
            max_workers=multiprocessing.cpu_count() * 5)

        here = os.path.dirname(__file__)
        with open(os.path.join(here, 'stopwords.txt'), 'r') as _:
            self.__stop_words = set(_.read().split())

        self.__futures = []
        self.__connected_all_projects = False

    def process(self, path, lxml_tree):
        self.__futures.append(
            self.__filler.submit(self.fill, path, lxml_tree))

    def write(self):
        for future in self.__futures:
            # Make sure all the filling is done.
            future.result()
        self.save()

    @property
    def __search_dir(self):
        return os.path.join(self.__output_dir, 'search')

    @property
    def __fragments_dir(self):
        return os.path.join(self.__search_dir, 'hotdoc_fragments')

    def __get_fragments(self, filenames):
        # Collect and delete the on-disk fragments generated for these
        # files, returning the URLs they covered.
        fragments = set()
        for filename in filenames:
            url = os.path.relpath(filename, self.__scan_dir)
            for fragment in glob.glob(
                    os.path.join(self.__fragments_dir, url) + '*'):
                # Strip the 9-character fragment suffix to recover the URL.
                fragments.add(
                    os.path.relpath(fragment, self.__fragments_dir)[:-9])
                os.unlink(fragment)
        return fragments

    def fill(self, filename, lxml_tree):
        for token, section_url, prioritize, context in parse_file(
                self.__scan_dir, lxml_tree, filename, self.__stop_words,
                self.__fragments_dir):
            contextualized_url = ContextualizedURL(section_url, context)
            # Prioritized tokens go to the front of the URL list so they
            # rank first in the search results.
            with self.__indices_lock:
                if not prioritize:
                    self.__full_index[token].append(contextualized_url)
                    self.__new_index[token].append(contextualized_url)
                else:
                    self.__full_index[token].insert(0, contextualized_url)
                    self.__new_index[token].insert(0, contextualized_url)

    def save(self):
        with self.__indices_lock:
            for key, value in sorted(self.__new_index.items()):
                self.__trie.insert(key)

                # Deduplicate the URLs recorded for this token, merging the
                # context values of duplicate entries into sets.
                deduped = OrderedDict()
                for url in value:
                    try:
                        context = deduped[url.url]
                        for key_, val_ in url.context.items():
                            try:
                                context[key_].add(val_)
                            except KeyError:
                                context[key_] = set([val_])
                    except KeyError:
                        deduped[url.url] = \
                            {k: set([v]) for k, v in url.context.items()}

                urls = []
                for url, context in deduped.items():
                    # JSON cannot serialize sets; turn each one back into a
                    # list.
                    for key_, val_ in context.items():
                        context[key_] = list(val_)
                    urls.append({'url': url, 'context': context})

                metadata = {'token': key, 'urls': urls}
                # One JSONP file per token, fetched on demand by the
                # client-side search.
                with open(os.path.join(self.__search_dir, key), 'w') as _:
                    _.write('urls_downloaded_cb(%s);' % json.dumps(metadata))

            self.__trie.to_file(
                os.path.join(self.__private_dir, 'search.trie'),
                os.path.join(self.__output_dir, 'trie_index.js'))
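
# `ContextualizedURL` above only has its `.url` and `.context` attributes
# read by the index, so a minimal stand-in (a sketch under that assumption,
# not the real type, and named to avoid clashing with the package's own
# class) is just a named tuple:
from collections import namedtuple

ContextualizedURLSketch = namedtuple('ContextualizedURLSketch',
                                     ['url', 'context'])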

# Incremental variant: stale pages are re-scanned synchronously and merged
# with the previous index loaded back from disk.
class SearchIndex(object):
    def __init__(self, scan_dir, output_dir, private_dir):
        self.__scan_dir = scan_dir
        self.__output_dir = output_dir
        self.__private_dir = private_dir

        prepare_folder(self.__search_dir)
        prepare_folder(self.__fragments_dir)

        self.__full_index = defaultdict(list)
        self.__new_index = defaultdict(list)
        self.__trie = Trie()

    def scan(self, stale_filenames):
        self.load(stale_filenames)
        self.fill(stale_filenames)
        self.save()

    @property
    def __search_dir(self):
        return os.path.join(self.__output_dir, 'search')

    @property
    def __fragments_dir(self):
        return os.path.join(self.__search_dir, 'hotdoc_fragments')

    def __get_fragments(self, filenames):
        # Collect and delete the on-disk fragments generated for these
        # files, returning the URLs they covered.
        fragments = set()
        for filename in filenames:
            url = os.path.relpath(filename, self.__scan_dir)
            for fragment in glob.glob(
                    os.path.join(self.__fragments_dir, url) + '*'):
                # Strip the 9-character fragment suffix to recover the URL.
                fragments.add(
                    os.path.relpath(fragment, self.__fragments_dir)[:-9])
                os.unlink(fragment)
        return fragments

    def load(self, stale_filenames):
        to_remove = self.__get_fragments(stale_filenames)

        trie_path = os.path.join(self.__private_dir, 'search.trie')
        if os.path.exists(trie_path):
            self.__trie = Trie.from_file(trie_path)

        search_index_path = os.path.join(self.__private_dir, 'search.json')
        if os.path.exists(search_index_path):
            with open(search_index_path, 'r') as _:
                previous_index = json.loads(_.read())

            for token, fragment_urls in list(previous_index.items()):
                new_set = list(OrderedSet(fragment_urls) - to_remove)
                if new_set:
                    self.__full_index[token] = new_set
                else:
                    # No URLs left for this token: drop it from the trie and
                    # remove its fragment file.
                    self.__trie.remove(token)
                    os.unlink(os.path.join(self.__search_dir, token))

    def fill(self, filenames):
        here = os.path.dirname(__file__)
        with open(os.path.join(here, 'stopwords.txt'), 'r') as _:
            stop_words = set(_.read().split())

        for filename in filenames:
            # Stale files may have been removed altogether.
            if not os.path.exists(filename):
                continue
            for token, section_url, prioritize in parse_file(
                    self.__scan_dir, filename, stop_words,
                    self.__fragments_dir):
                if not prioritize:
                    self.__full_index[token].append(section_url)
                    self.__new_index[token].append(section_url)
                else:
                    self.__full_index[token].insert(0, section_url)
                    self.__new_index[token].insert(0, section_url)

    def save(self):
        for key, value in sorted(self.__new_index.items()):
            self.__trie.insert(key)
            metadata = {'token': key, 'urls': list(OrderedSet(value))}
            with open(os.path.join(self.__search_dir, key), 'w') as _:
                _.write("urls_downloaded_cb(")
                _.write(json.dumps(metadata))
                _.write(");")

        self.__trie.to_file(os.path.join(self.__private_dir, 'search.trie'),
                            os.path.join(self.__output_dir, 'trie_index.js'))

        # Persist the merged index so the next incremental run can load it.
        with open(os.path.join(self.__private_dir, 'search.json'), 'w') as _:
            _.write(json.dumps(self.__full_index))
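
# `OrderedSet` is likewise assumed to come from the surrounding package. The
# call sites above only need construction from an iterable, set difference,
# and insertion-ordered iteration; a minimal sketch with just that surface:
class OrderedSetSketch(object):
    def __init__(self, iterable=()):
        # A dict preserves insertion order (Python 3.7+), which gives an
        # ordered membership structure for free.
        self.__items = dict.fromkeys(iterable)

    def __sub__(self, other):
        # Keep every item not in `other`, preserving the original order.
        return OrderedSetSketch(
            item for item in self.__items if item not in other)

    def __iter__(self):
        return iter(self.__items)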

# Refactored incremental variant: the directory paths are computed once in
# __init__ and the scan steps are private implementation details.
class SearchIndex(object):
    def __init__(self, scan_dir, output_dir, private_dir):
        self.__scan_dir = scan_dir
        self.__output_dir = output_dir
        self.__private_dir = private_dir
        self.__search_dir = os.path.join(self.__output_dir, 'search')
        self.__fragments_dir = os.path.join(self.__search_dir,
                                            'hotdoc_fragments')

        prepare_folder(self.__search_dir)
        prepare_folder(self.__fragments_dir)

        self.__full_index = defaultdict(list)
        self.__new_index = defaultdict(list)
        self.__trie = Trie()

    def scan(self, stale_filenames):
        self.__load(stale_filenames)
        self.__fill(stale_filenames)
        self.__save()

    def __get_fragments(self, filenames):
        fragments = set()
        for filename in filenames:
            url = os.path.relpath(filename, self.__scan_dir)
            for fragment in glob.glob(
                    os.path.join(self.__fragments_dir, url) + '*'):
                fragments.add(
                    os.path.relpath(fragment, self.__fragments_dir)[:-9])
                os.unlink(fragment)
        return fragments

    def __load(self, stale_filenames):
        to_remove = self.__get_fragments(stale_filenames)

        trie_path = os.path.join(self.__private_dir, 'search.trie')
        if os.path.exists(trie_path):
            self.__trie = Trie.from_file(trie_path)

        search_index_path = os.path.join(self.__private_dir, 'search.json')
        if os.path.exists(search_index_path):
            with open(search_index_path, 'r') as f:
                previous_index = json.loads(f.read())

            for token, fragment_urls in previous_index.items():
                new_set = list(OrderedSet(fragment_urls) - to_remove)
                if new_set:
                    self.__full_index[token] = new_set
                else:
                    self.__trie.remove(token)
                    os.unlink(os.path.join(self.__search_dir, token))

    def __fill(self, filenames):
        here = os.path.dirname(__file__)
        with open(os.path.join(here, 'stopwords.txt'), 'r') as f:
            stop_words = set(f.read().split())

        for filename in filenames:
            if not os.path.exists(filename):
                continue
            for token, section_url, prioritize in parse_file(
                    self.__scan_dir, filename, stop_words,
                    self.__fragments_dir):
                if not prioritize:
                    self.__full_index[token].append(section_url)
                    self.__new_index[token].append(section_url)
                else:
                    self.__full_index[token].insert(0, section_url)
                    self.__new_index[token].insert(0, section_url)

    def __save(self):
        for key, value in sorted(self.__new_index.items()):
            self.__trie.insert(key)
            metadata = {'token': key, 'urls': list(OrderedSet(value))}
            with open(os.path.join(self.__search_dir, key), 'w') as f:
                f.write("urls_downloaded_cb(")
                f.write(json.dumps(metadata))
                f.write(");")

        self.__trie.to_file(os.path.join(self.__private_dir, 'search.trie'),
                            os.path.join(self.__output_dir, 'trie_index.js'))

        with open(os.path.join(self.__private_dir, 'search.json'), 'w') as f:
            f.write(json.dumps(self.__full_index))
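
# A minimal usage sketch for the incremental variant above, with
# hypothetical directory names and file list: only the stale pages are
# re-parsed, so a rebuild touching a handful of files stays cheap.
if __name__ == '__main__':
    index = SearchIndex('html', 'output', 'private')
    # Hypothetical list of pages that changed since the last build.
    index.scan(['html/changed_page.html'])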