# Build a random LRU (Hyphe's stem-based URL notation) and add it as a page
path_size = random.choice(path_sizes)
protocol = 's:http|'
tld = 'h:com|'
host = 'h:%s|' % (random.choice(voc))
path = ''
for p in range(path_size):
    path += 'p:%s|' % (random.choice(voc))
lru = protocol + tld + host + path
# print lru
report = traph.add_page(lru)
webentity_store.data['webentities'].update(report.created_webentities)

print '\n:: Webentities'

print '\nExisting webentities from Store:'
for weid, prefixes in webentity_store.data['webentities'].items():
    print ' - Webentity %s:' % (weid)
    for prefix in prefixes:
        print '\t\t' + prefix

print '\nPrefixes from Traph:'
for node, lru in traph.webentity_prefix_iter():
    print ' - (%s) \t%s' % (node.webentity(), lru)

print '\n:: Pages in "Lorem" webentity'
lorem_weid = traph.get_webentity_by_prefix('s:http|h:com|h:lorem|')
lorem_prefixes = webentity_store.data['webentities'][lorem_weid]
for lru in traph.get_webentity_pages(lorem_weid, lorem_prefixes):
    print ' - %s' % (lru)

traph.close()
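# The LRUs built above use Hyphe's stem notation: 's:' scheme, 'h:' host
# labels in reverse order (TLD first), 'p:' path segments, each ended by '|'.
# As a rough illustration only (this helper is ours, not part of the Traph
# API), such an LRU can be turned back into a URL like this:
def lru_to_url(lru):
    scheme, hosts, path = '', [], []
    for stem in lru.split('|'):
        if stem.startswith('s:'):
            scheme = stem[2:]
        elif stem.startswith('h:'):
            hosts.insert(0, stem[2:])  # host labels are stored TLD-first
        elif stem.startswith('p:'):
            path.append(stem[2:])
    return '%s://%s/%s' % (scheme, '.'.join(hosts), '/'.join(path))

# e.g. lru_to_url('s:http|h:com|h:lorem|p:page|') -> 'http://lorem.com/page'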
print '- %s pages in the Traph' % (len(pages))

print '\n:: Traph: LRU trie'
print traph.lru_trie.representation()

# For each webentity, compare what the store knows (prefixes) with what the
# Traph can resolve (pages, crawled pages, most linked pages)
print '\n:: Breakdown by webentity'
for weid in webentities:
    print '\nWebentity %s' % (weid)

    we_prefixes = webentity_store.data['webentities'][weid]
    print ' - %s prefixes (store)' % (len(we_prefixes))
    for prefix in we_prefixes:
        print ' \t- %s' % (prefix)

    we_pages = traph.get_webentity_pages(weid, we_prefixes)
    print ' - %s pages (traph)' % (len(we_pages))
    for lru in we_pages:
        print ' \t- %s' % (lru)

    we_crawled_pages = traph.get_webentity_crawled_pages(weid, we_prefixes)
    print ' - %s crawled pages (traph)' % (len(we_crawled_pages))
    for lru in we_crawled_pages:
        print ' \t- %s' % (lru)

    we_most_linked_pages = traph.get_webentity_most_linked_pages(
        weid, we_prefixes, 3)
    print ' - %s most linked pages (traph, max 3)' % (
        len(we_most_linked_pages))
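# A possible follow-up (a sketch of ours, not part of the original script):
# gather the same per-webentity figures into a dict instead of printing them,
# which is handy for asserting on them in tests. It uses only the calls
# already exercised above.
def webentity_summary(traph, webentity_store, weid):
    prefixes = webentity_store.data['webentities'][weid]
    return {
        'prefixes': len(prefixes),
        'pages': len(traph.get_webentity_pages(weid, prefixes)),
        'crawled': len(traph.get_webentity_crawled_pages(weid, prefixes))
    }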