예제 #1
0
    def generate_adjacency_matrix(self, drop_static=False):
        """Build the page-to-page link graph and dump it as JSON.

        Every row returned by ``Page.select()`` becomes a node keyed by its
        ``id``. Each node's value is the list of ids of the pages that its
        ``links`` field (a JSON-encoded collection of URLs) points to.

        Links whose URL does not belong to any retained page are skipped.
        They have no id, so they cannot be represented in the matrix.

        Args:
            drop_static: When true, only pages whose ``content_type`` contains
                ``"text/html"`` are kept as nodes, and links to any other
                page are dropped.

        Returns:
            None. The result is written to the matrix JSON file.
        """
        # Apply the static-content filter once so both passes below agree on
        # which pages are nodes of the graph.
        pages = [
            page
            for page in Page.select()
            if not drop_static or "text/html" in page.content_type
        ]

        # URL -> integer id, used to resolve the link targets of each page.
        ids = {page.url: int(page.id) for page in pages}
        matrix = {page.id: set() for page in pages}

        for page in pages:
            neighbours = matrix[page.id]
            for link in json.loads(page.links):
                target_id = ids.get(link)
                # A link to a page that was never stored (or was filtered out)
                # has no id. Skip it instead of raising KeyError.
                if target_id is not None:
                    neighbours.add(target_id)

        # Sets are not JSON serializable; sorted lists also make the output
        # deterministic between runs.
        serializable = {
            page_id: sorted(neighbours) for page_id, neighbours in matrix.items()
        }

        # NOTE(review): the path uses a Windows separator. On POSIX systems
        # this names a file literally called "data\matrix.json" in the current
        # directory. Kept unchanged for compatibility.
        with open("data\\matrix.json", "w") as w:
            w.write(json.dumps(serializable))
예제 #2
0
    def load(self):
        """Restore state from the stored ``Page`` rows.

        Walks every row returned by ``Page.select()`` and collects the URLs of
        the stored pages together with every link decoded from each row's
        JSON ``links`` field.

        Side effects:
            ``self.pages`` becomes a dict mapping each stored URL to ``None``.
            ``self.queue`` becomes the set of linked URLs that are not stored
            yet.

        Returns:
            The largest page id seen, or 1 when no row has a larger id.
        """
        highest_id = 1
        seen_urls = set()
        discovered = set()

        for record in Page.select():
            highest_id = max(highest_id, record.id)
            seen_urls.add(record.url)
            discovered.update(json.loads(record.links))

        self.pages = dict.fromkeys(seen_urls)
        self.queue = discovered - seen_urls
        return highest_id