Example #1
    def __init__(self, scan_dir, output_dir, private_dir):
        self.__scan_dir = scan_dir
        self.__output_dir = output_dir
        self.__private_dir = private_dir

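        # Note: __search_dir and __fragments_dir are read-only properties
        # defined further down in the class (see Example #6).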
        prepare_folder(self.__search_dir)
        prepare_folder(self.__fragments_dir)

        self.__full_index = defaultdict(list)
        self.__new_index = defaultdict(list)
        self.__trie = Trie()
Example #2
    def __init__(self, scan_dir, output_dir, private_dir):
        self.__scan_dir = scan_dir
        self.__output_dir = output_dir
        self.__private_dir = private_dir

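        # Note: __search_dir and __fragments_dir are read-only properties
        # defined further down in the class (see Example #7).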
        prepare_folder(self.__search_dir)
        prepare_folder(self.__fragments_dir)

        self.__indices_lock = threading.Lock()
        self.__full_index = defaultdict(list)
        self.__new_index = defaultdict(list)
        self.__trie = Trie()

        self.__filler = futures.ThreadPoolExecutor(
            max_workers=multiprocessing.cpu_count() * 5)
        here = os.path.dirname(__file__)
        with open(os.path.join(here, 'stopwords.txt'), 'r') as _:
            self.__stop_words = set(_.read().split())

        self.__futures = []
        self.__connected_all_projects = False
Example #3
    def __init__(self, scan_dir, output_dir, private_dir):
        self.__scan_dir = scan_dir
        self.__output_dir = output_dir
        self.__private_dir = private_dir
        self.__search_dir = os.path.join(self.__output_dir, 'search')
        self.__fragments_dir = os.path.join(self.__search_dir, 'hotdoc_fragments')

        prepare_folder(self.__search_dir)
        prepare_folder(self.__fragments_dir)

        self.__full_index = defaultdict(list)
        self.__new_index = defaultdict(list)
        self.__trie = Trie()
Example #4
    def load(self, stale_filenames):
        to_remove = self.__get_fragments(stale_filenames)

        trie_path = os.path.join(self.__private_dir, 'search.trie')
        if os.path.exists(trie_path):
            self.__trie = Trie.from_file(trie_path)

        search_index_path = os.path.join(self.__private_dir, 'search.json')
        if os.path.exists(search_index_path):
            with open(search_index_path, 'r') as _:
                previous_index = json.loads(_.read())

            for token, fragment_urls in list(previous_index.items()):
                new_set = list(OrderedSet(fragment_urls) - to_remove)

                if new_set:
                    self.__full_index[token] = new_set
                else:
                    self.__trie.remove(token)
                    os.unlink(os.path.join(self.__search_dir, token))
Example #5
    def __load(self, stale_filenames):
        to_remove = self.__get_fragments(stale_filenames)

        trie_path = os.path.join(self.__private_dir, 'search.trie')
        if os.path.exists(trie_path):
            self.__trie = Trie.from_file(trie_path)

        search_index_path = os.path.join(self.__private_dir, 'search.json')
        if os.path.exists(search_index_path):
            with open(search_index_path, 'r') as f:
                previous_index = json.loads(f.read())

            for token, fragment_urls in previous_index.items():
                new_set = list(OrderedSet(fragment_urls) - to_remove)

                if new_set:
                    self.__full_index[token] = new_set
                else:
                    self.__trie.remove(token)
                    os.unlink(os.path.join(self.__search_dir, token))
Example #6
class SearchIndex(object):
    def __init__(self, scan_dir, output_dir, private_dir):
        self.__scan_dir = scan_dir
        self.__output_dir = output_dir
        self.__private_dir = private_dir

        prepare_folder(self.__search_dir)
        prepare_folder(self.__fragments_dir)

        self.__full_index = defaultdict(list)
        self.__new_index = defaultdict(list)
        self.__trie = Trie()

    def scan(self, stale_filenames):
        self.load(stale_filenames)
        self.fill(stale_filenames)
        self.save()

    @property
    def __search_dir(self):
        return os.path.join(self.__output_dir, 'search')

    @property
    def __fragments_dir(self):
        return os.path.join(self.__search_dir, 'hotdoc_fragments')

    def __get_fragments(self, filenames):
        fragments = set()

        for filename in filenames:
            url = os.path.relpath(filename, self.__scan_dir)
            for fragment in glob.glob(
                    os.path.join(self.__fragments_dir, url) + '*'):
                fragments.add(
                    os.path.relpath(fragment, self.__fragments_dir)[:-9])
                os.unlink(fragment)

        return fragments

    def load(self, stale_filenames):
        to_remove = self.__get_fragments(stale_filenames)

        trie_path = os.path.join(self.__private_dir, 'search.trie')
        if os.path.exists(trie_path):
            self.__trie = Trie.from_file(trie_path)

        search_index_path = os.path.join(self.__private_dir, 'search.json')
        if os.path.exists(search_index_path):
            with open(search_index_path, 'r') as _:
                previous_index = json.loads(_.read())

            for token, fragment_urls in list(previous_index.items()):
                new_set = list(OrderedSet(fragment_urls) - to_remove)

                if new_set:
                    self.__full_index[token] = new_set
                else:
                    self.__trie.remove(token)
                    os.unlink(os.path.join(self.__search_dir, token))

    def fill(self, filenames):
        here = os.path.dirname(__file__)
        with open(os.path.join(here, 'stopwords.txt'), 'r') as _:
            stop_words = set(_.read().split())

        for filename in filenames:
            if not os.path.exists(filename):
                continue

            for token, section_url, prioritize in parse_file(
                    self.__scan_dir, filename, stop_words,
                    self.__fragments_dir):
                if not prioritize:
                    self.__full_index[token].append(section_url)
                    self.__new_index[token].append(section_url)
                else:
                    self.__full_index[token].insert(0, section_url)
                    self.__new_index[token].insert(0, section_url)

    def save(self):
        for key, value in sorted(self.__new_index.items()):
            self.__trie.insert(key)

            metadata = {'token': key, 'urls': list(OrderedSet(value))}

            with open(os.path.join(self.__search_dir, key), 'w') as _:
                _.write("urls_downloaded_cb(")
                _.write(json.dumps(metadata))
                _.write(");")

        self.__trie.to_file(os.path.join(self.__private_dir, 'search.trie'),
                            os.path.join(self.__output_dir, 'trie_index.js'))

        with open(os.path.join(self.__private_dir, 'search.json'), 'w') as _:
            _.write(json.dumps(self.__full_index))
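
A minimal usage sketch for the variant above, assuming the module providing SearchIndex is importable; the directory names and filename are placeholders:

    index = SearchIndex(scan_dir='built_html',
                        output_dir='built_html',
                        private_dir='hotdoc-private')
    # Reload the previous index, re-parse the stale pages, then write the
    # updated trie, per-token JSONP files and search.json back out.
    index.scan(stale_filenames=['built_html/index.html'])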
Example #7
class SearchIndex(object):
    def __init__(self, scan_dir, output_dir, private_dir):
        self.__scan_dir = scan_dir
        self.__output_dir = output_dir
        self.__private_dir = private_dir

        prepare_folder(self.__search_dir)
        prepare_folder(self.__fragments_dir)

        self.__indices_lock = threading.Lock()
        self.__full_index = defaultdict(list)
        self.__new_index = defaultdict(list)
        self.__trie = Trie()

        self.__filler = futures.ThreadPoolExecutor(
            max_workers=multiprocessing.cpu_count() * 5)
        here = os.path.dirname(__file__)
        with open(os.path.join(here, 'stopwords.txt'), 'r') as _:
            self.__stop_words = set(_.read().split())

        self.__futures = []
        self.__connected_all_projects = False

    def process(self, path, lxml_tree):
        self.__futures.append(self.__filler.submit(self.fill, path, lxml_tree))

    def write(self):
        for future in self.__futures:
            # Make sure all the filling is done.
            future.result()
        self.save()

    @property
    def __search_dir(self):
        return os.path.join(self.__output_dir, 'search')

    @property
    def __fragments_dir(self):
        return os.path.join(self.__search_dir, 'hotdoc_fragments')

    def __get_fragments(self, filenames):
        fragments = set()

        for filename in filenames:
            url = os.path.relpath(filename, self.__scan_dir)
            for fragment in glob.glob(
                    os.path.join(self.__fragments_dir, url) + '*'):
                fragments.add(
                    os.path.relpath(fragment, self.__fragments_dir)[:-9])
                os.unlink(fragment)

        return fragments

    def fill(self, filename, lxml_tree):
        for token, section_url, prioritize, context in parse_file(
                self.__scan_dir, lxml_tree, filename, self.__stop_words,
                self.__fragments_dir):

            self.__indices_lock.acquire()
            contextualized_url = ContextualizedURL(section_url, context)
            if not prioritize:
                self.__full_index[token].append(contextualized_url)
                self.__new_index[token].append(contextualized_url)
            else:
                self.__full_index[token].insert(0, contextualized_url)
                self.__new_index[token].insert(0, contextualized_url)
            self.__indices_lock.release()

    def save(self):
        self.__indices_lock.acquire()
        for key, value in sorted(self.__new_index.items()):
            self.__trie.insert(key)

            deduped = OrderedDict()
            for url in value:
                try:
                    context = deduped[url.url]
                    for key_, val_ in url.context.items():
                        try:
                            vset = context[key_]
                            vset.add(val_)
                        except KeyError:
                            context[key_] = set([val_])
                except KeyError:
                    deduped[url.url] = \
                        {k: set([v]) for k, v in url.context.items()}

            urls = []
            for url, context in deduped.items():
                for key_, val_ in context.items():
                    context[key_] = list(val_)
                urls.append({'url': url, 'context': context})

            metadata = {'token': key, 'urls': urls}

            with open(os.path.join(self.__search_dir, key), 'w') as _:
                _.write('urls_downloaded_cb(%s);' % json.dumps(metadata))

        self.__trie.to_file(os.path.join(self.__private_dir, 'search.trie'),
                            os.path.join(self.__output_dir, 'trie_index.js'))

        self.__indices_lock.release()
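
A minimal usage sketch for the threaded variant above, assuming lxml is installed and the page has already been rendered; the paths are placeholders:

    from lxml import etree

    index = SearchIndex(scan_dir='built_html',
                        output_dir='built_html',
                        private_dir='hotdoc-private')
    # process() submits fill() to the thread pool; write() blocks on all
    # pending futures before serializing the trie and per-token files.
    tree = etree.parse('built_html/index.html', etree.HTMLParser())
    index.process('built_html/index.html', tree)
    index.write()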
Example #8
class SearchIndex(object):
    def __init__(self, scan_dir, output_dir, private_dir):
        self.__scan_dir = scan_dir
        self.__output_dir = output_dir
        self.__private_dir = private_dir
        self.__search_dir = os.path.join(self.__output_dir, 'search')
        self.__fragments_dir = os.path.join(self.__search_dir, 'hotdoc_fragments')

        prepare_folder(self.__search_dir)
        prepare_folder(self.__fragments_dir)

        self.__full_index = defaultdict(list)
        self.__new_index = defaultdict(list)
        self.__trie = Trie()

    def scan(self, stale_filenames):
        self.__load(stale_filenames)
        self.__fill(stale_filenames)
        self.__save()

    def __get_fragments(self, filenames):
        fragments = set()

        for filename in filenames:
            url = os.path.relpath(filename, self.__scan_dir)
            for fragment in glob.glob(os.path.join(self.__fragments_dir, url) + '*'):
                fragments.add(os.path.relpath(fragment,
                    self.__fragments_dir)[:-9])
                os.unlink(fragment)

        return fragments

    def __load(self, stale_filenames):
        to_remove = self.__get_fragments(stale_filenames)

        trie_path = os.path.join(self.__private_dir, 'search.trie')
        if os.path.exists(trie_path):
            self.__trie = Trie.from_file(trie_path)

        search_index_path = os.path.join(self.__private_dir, 'search.json')
        if os.path.exists(search_index_path):
            with open(search_index_path, 'r') as f:
                previous_index = json.loads(f.read())

            for token, fragment_urls in previous_index.items():
                new_set = list(OrderedSet(fragment_urls) - to_remove)

                if new_set:
                    self.__full_index[token] = new_set
                else:
                    self.__trie.remove(token)
                    os.unlink(os.path.join(self.__search_dir, token))

    def __fill(self, filenames):
        here = os.path.dirname(__file__)
        with open(os.path.join(here, 'stopwords.txt'), 'r') as f:
            stop_words = set(f.read().split())

        for filename in filenames:
            if not os.path.exists(filename):
                continue

            for token, section_url, prioritize in parse_file(self.__scan_dir,
                    filename, stop_words, self.__fragments_dir):
                if not prioritize:
                    self.__full_index[token].append(section_url)
                    self.__new_index[token].append(section_url)
                else:
                    self.__full_index[token].insert(0, section_url)
                    self.__new_index[token].insert(0, section_url)

    def __save(self):
        for key, value in sorted(self.__new_index.items()):
            self.__trie.insert(key)

            metadata = {'token': key, 'urls': list(OrderedSet(value))}

            with open(os.path.join(self.__search_dir, key), 'w') as f:
                f.write("urls_downloaded_cb(")
                f.write(json.dumps(metadata))
                f.write(");")

        self.__trie.to_file(os.path.join(self.__private_dir, 'search.trie'),
                os.path.join(self.__output_dir, 'trie_index.js'))

        with open(os.path.join(self.__private_dir, 'search.json'), 'w') as f:
            f.write(json.dumps(self.__full_index))
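
In every variant shown here, saving writes one JSONP file per token under the search/ directory (the URL list wrapped in a urls_downloaded_cb(...) call) and serializes the trie to <private_dir>/search.trie plus <output_dir>/trie_index.js; the non-threaded variants additionally dump the full index to <private_dir>/search.json so that the next incremental run can reload it.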