import networkx as nx

# NB: `lru` (LRU string helpers), `ms` (MemoryStructure objects) and
# `write_graph_in_format` are assumed to be provided elsewhere in this module.


def write_nodelinks_network_from_MS(nodes_links, filename, fileformat='gexf'):
    """Export the node links returned by the MemoryStructure as a
    directed, weighted graph in the chosen format (gexf by default)."""
    G = nx.DiGraph()
    for link in nodes_links:
        G.add_node(link.targetLRU, label=lru.lru_to_url(link.targetLRU))
        G.add_node(link.sourceLRU, label=lru.lru_to_url(link.sourceLRU))
        G.add_edge(link.sourceLRU, link.targetLRU, weight=link.weight)
    write_graph_in_format(G, filename, fileformat)
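# Usage sketch (illustrative, commented out): `nodes_links` should be an
# iterable of link objects exposing `sourceLRU`, `targetLRU` and `weight`
# attributes, as returned by the MemoryStructure; the filename and format
# below are arbitrary examples.
#
#   write_nodelinks_network_from_MS(nodes_links, "nodelinks_network.gexf", "gexf")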
def write_pages_network_from_mongo(pages, filename, fileformat='gexf'):
    """Export the pages network stored in mongo as a directed graph:
    one node per page, one edge per outgoing link found in 'lrulinks'."""
    G = nx.DiGraph()
    for page in pages:
        if "lrulinks" in page:
            G.add_node(page['lru'], label=page['url'])
            for lrulink in page["lrulinks"]:
                G.add_node(lrulink, label=lru.lru_to_url(lrulink))
                G.add_edge(page['lru'], lrulink)
    write_graph_in_format(G, filename, fileformat)
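# Usage sketch (illustrative, commented out): `pages` should be an iterable of
# mongo documents carrying at least 'lru' and 'url', plus optionally
# 'lrulinks'; the database and collection names below are hypothetical.
#
#   import pymongo
#   pages = pymongo.MongoClient()["crawler"]["pages"].find()
#   write_pages_network_from_mongo(pages, "pages_network.gexf")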
def generate_cache_from_pages_list(pageList, precision_limit=1, precision_exceptions=[], verbose=False):
    """Build from a list of crawled pages an index of unique pages and a
    list of weighted links between the nodes those pages resolve to."""
    if verbose:
        print "### createCache"
    pages = {}
    links = {}
    original_link_number = 0
    nodes = {}
    for page_item in pageList:
        page_item["lru"] = lru.cleanLRU(page_item["lru"])
        is_full_precision = lru.isFullPrecision(page_item["lru"], precision_exceptions)
        lru_head = lru.getLRUHead(page_item["lru"], precision_exceptions)
        is_node = lru.isLRUNode(page_item["lru"], precision_limit, lru_head=lru_head)
        # Resolve the page to its node LRU (the page itself if already a node)
        node_lru = page_item["lru"] if is_node else lru.getLRUNode(page_item["lru"], precision_limit, lru_head=lru_head)
        nodes[node_lru] = 1
        # Create index of crawled pages from queue
        if page_item["lru"] not in pages:
            pages[page_item["lru"]] = ms.PageItem(
                str(page_item["_id"]),
                page_item["url"].encode('utf8'),
                page_item["lru"].encode('utf8'),
                str(page_item["timestamp"]),
                int(page_item["status"]),
                int(page_item["depth"]),
                str(page_item["error"]),
                ['CRAWL'],
                is_full_precision,
                is_node,
                {})
        else:
            if 'CRAWL' not in pages[page_item["lru"]].sourceSet:
                pages[page_item["lru"]].sourceSet.append('CRAWL')
            # Keep the shallowest (non-negative) depth seen for this page
            pages[page_item["lru"]].depth = max(0, min(pages[page_item["lru"]].depth, int(page_item["depth"])))
        # Add to index linked pages and index all links between nodes
        if "lrulinks" in page_item:
            for index, lrulink in enumerate(page_item["lrulinks"]):
                lrulink = lru.cleanLRU(lrulink)
                is_full_precision = lru.isFullPrecision(lrulink, precision_exceptions)
                lru_head = lru.getLRUHead(lrulink, precision_exceptions)
                is_node = lru.isLRUNode(lrulink, precision_limit, lru_head=lru_head)
                target_node = lrulink if is_node else lru.getLRUNode(lrulink, precision_limit, lru_head=lru_head)
                nodes[target_node] = 1
                original_link_number += 1
                # check False {} errorcode
                if lrulink not in pages:
                    pages[lrulink] = ms.PageItem(
                        str(page_item["_id"]) + "_" + str(index),
                        lru.lru_to_url(lrulink).encode('utf8'),
                        lrulink.encode('utf8'),
                        str(page_item["timestamp"]),
                        None,
                        int(page_item["depth"]) + 1,
                        None,
                        ['LINK'],
                        is_full_precision,
                        is_node,
                        {})
                elif 'LINK' not in pages[lrulink].sourceSet:
                    pages[lrulink].sourceSet.append('LINK')
                # Aggregate parallel links between the same pair of nodes
                links[(node_lru, target_node)] = links.get((node_lru, target_node), 0) + 1
    if verbose:
        print "%s unique pages ; %s links ; %s unique links / identified %s nodes" % (len(pages), original_link_number, len(links), len(nodes))
    return (pages, [(source, target, weight) for (source, target), weight in links.iteritems()])
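# Minimal demo (assumptions flagged): the page dict below mimics the fields
# the cache builder reads ('_id', 'url', 'lru', 'timestamp', 'status',
# 'depth', 'error', 'lrulinks'); the LRU strings are illustrative only and
# must follow whatever format the `lru` helpers expect.
if __name__ == '__main__':
    fake_page = {
        "_id": "0",
        "url": "http://example.com/",
        "lru": "s:http|h:com|h:example|p:",
        "timestamp": "0",
        "status": 200,
        "depth": 0,
        "error": "None",
        "lrulinks": ["s:http|h:com|h:example|p:about"],
    }
    pages, links = generate_cache_from_pages_list([fake_page], verbose=True)
    print "%s pages, %s links" % (len(pages), len(links))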