def pick_clusters(self, clusters, nonleaves):
    """Compute per-cluster influence, build the frontier, and also keep any
    nonleaf cluster that contains a frontier cluster."""
    _logger.debug("compute initial cluster errors. %d clusters", len(clusters))
    start = time.time()
    for c in clusters:
        c.error = self.influence_cluster(c)
        c.c_range = list(self.c_range)
        c.inf_func = self.create_inf_func(c)
    self.stats['init_cluster_errors'] = [time.time() - start, 1]

    self.update_status("computing frontier")
    _logger.debug("compute initial frontier")
    frontier, _ = Frontier(self.c_range, 0.001)(clusters)

    ret = list(frontier)
    _logger.debug("get nonleaves containing frontier")
    for nonleaf in nonleaves:
        for c in frontier:
            if nonleaf.contains(c):
                nonleaf.error = self.influence_cluster(nonleaf)
                ret.append(nonleaf)
                break

    self.update_status("expanding frontier (%d rules)" % len(ret))
    _logger.debug("second merger pass")
    return ret
def main():
    from frontier import Frontier
    from options import parse_command_line

    parse_command_line()
    ft = Frontier([
        ('http://localhost/', 1),
    ])
    Master(ft).start()
    IOLoop.instance().start()
def greedy(matrix, start, goal):
    """
    Find the path from start to the goal using the Greedy Best-first Search algorithm
    The algorithm is implemented based on the description on Wikipedia:
    https://en.wikipedia.org/wiki/Best-first_search#Greedy_BFS
    Notice: GBFS is a suboptimal algorithm, so the solution MAY NOT BE OPTIMAL!
    :param matrix: Search space, as a 2D list
    :param start: Start point, as a tuple
    :param goal: Goal point, as a tuple
    :return: The path (if found) from start to goal, or None
    """
    print('Analytics: start node ' + str(start) + ', goal node ' + str(goal))
    # The set of nodes already evaluated
    visited = set()
    partially_expanded = set()

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    # frontier is implemented as a priority queue
    frontier = Frontier()
    frontier.add(start, manhattan_dist(start, goal))

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, came_from will eventually contain the
    # most efficient previous step.
    came_from = {}

    while frontier:
        current, current_distance = frontier.nearest
        if current == goal:
            print('Analytics: ' + str(len(partially_expanded)) +
                  ' expanded nodes out of ' + str(count_nodes(matrix)) +
                  ' nodes, among which ' + str(len(visited)) +
                  ' are fully expanded (all successors evaluated)')
            return reconstruct_path(came_from, current)

        partially_expanded.add(current)
        is_interrupted = False
        for neighbor in expand(current, matrix):
            if neighbor not in visited:
                neighbor_distance = manhattan_dist(neighbor, goal)
                if neighbor not in frontier:
                    # Discover a new node
                    came_from[neighbor] = current
                    frontier.add(neighbor, neighbor_distance)
                if current_distance > neighbor_distance:
                    is_interrupted = True
                    break
        if not is_interrupted:
            frontier.pop_nearest()
            visited.add(current)
    return None
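# The Frontier class itself is not shown in these snippets. Below is a minimal
# sketch of the priority-queue interface the grid searches rely on (add,
# nearest, pop_nearest, `in`, truthiness), built on heapq; this is a
# hypothetical reconstruction, not the original implementation.
import heapq


class Frontier:
    def __init__(self):
        self._heap = []        # (priority, node) pairs, smallest priority on top
        self._members = set()  # mirrors the heap for fast `in` checks

    def add(self, node, priority):
        heapq.heappush(self._heap, (priority, node))
        self._members.add(node)

    @property
    def nearest(self):
        # Peek at the best node without removing it, as greedy() does.
        priority, node = self._heap[0]
        return node, priority

    def pop_nearest(self):
        priority, node = heapq.heappop(self._heap)
        self._members.discard(node)
        return node, priority

    def __contains__(self, node):
        return node in self._members

    def __bool__(self):
        # Lets `while frontier:` terminate once the queue is exhausted.
        return bool(self._heap)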
def __init__(self):
    self.url_waiting_queue = mp.Queue()
    self.url_result_queue = mp.Queue()
    self.config = Config()
    self.frontier = Frontier(self.config)
    self.agents = []
    for id in range(self.config.agentCount):
        self.agents.append(Agent(self.config, id))
    # put the seed URL on the waiting queue
    self.url_waiting_queue.put(self.config.seedURL)
def _a_star_search(self, heuristic_func):
    if self.state == Puzzle.goal_state:
        return [self.state], 0

    # heap of tuple(path_total_cost, path)
    # path_total_cost = path_cost + heuristic
    frontier = Frontier()
    frontier.push((heuristic_func(self.state), [self.state]))
    explored_list = []
    node_gen = 0

    while True:
        if frontier.empty():
            break

        # pop state with min cost from the frontier
        cur_path_total_cost, cur_path = frontier.pop()
        cur_state = cur_path[-1]

        # check lazy delete
        if cur_state in explored_list:
            continue

        # test goal condition
        if cur_state == Puzzle.goal_state:
            return cur_path, node_gen

        # add current state to explored list
        explored_list.append(cur_state)

        # get all neighbours of current state in asc order
        neighbours = Puzzle._get_neighbours(cur_state)
        for neighbour in neighbours:
            if neighbour not in explored_list:
                # new path to go to a neighbour
                path_to_neighbour = cur_path.copy()
                path_to_neighbour.append(neighbour)
                # calc path_total_cost (include heuristic)
                path_to_neighbour_total_cost = (cur_path_total_cost
                                                - heuristic_func(cur_state)
                                                + 1 + heuristic_func(neighbour))
                node_gen += 1
                # whether neighbour is already in the frontier or not,
                # push it and rely on lazy delete
                frontier.push(
                    (path_to_neighbour_total_cost, path_to_neighbour))
    return None, node_gen
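# Here the Frontier is used as a bare min-heap of (path_total_cost, path)
# tuples with lazy deletion: stale entries are pushed freely and skipped on pop
# once their end state has been explored, which avoids a decrease-key
# operation. A minimal sketch of the assumed interface (hypothetical):
import heapq


class Frontier:
    def __init__(self):
        self._heap = []

    def push(self, item):
        # item = (path_total_cost, path); tuples compare by cost first,
        # so heapq keeps the cheapest path on top.
        heapq.heappush(self._heap, item)

    def pop(self):
        return heapq.heappop(self._heap)

    def empty(self):
        return len(self._heap) == 0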
def __init__(self):
    self.seed_urls = None
    self.frontier = Frontier()
    self.canonicalizer = Canonicalizer()
    self.all_links = None
    self.crawled_links = set()
    self.count = 0
    self.all_out_links = {}
    self.redirected_map = {}
    self.robots = {}
    self.robots_delay = {}
    self.robots_timer = {}
    self.time_out = 3
    self.total_count = 40000
def __init__(self, num_threads, seeds, cont_to_crawl):
    self.num_workers = num_threads
    self.dash = Dashboard(num_threads)
    self.workers = []
    self.frontier = Frontier(num_threads, self.dash)
    self.db = Storage()

    # Create the workers
    for i in range(num_threads):
        self.workers.append(
            CrawlerThread(i, 'CrawlerThread' + str(i), self.frontier, self.dash))
    # print("Workers created")

    # Insert the seeds into the to-serve queue, or resume a previous crawl
    if not cont_to_crawl:
        self.frontier.push_to_serve(seeds, 0)
        # print("seeds pushed")
        self.frontier.distribute()
        # print("seeds distributed")
    else:
        self.frontier.load_to_crawl()
def reweightbasegrammar(basegrammar, pseudoCounts, filter_depth=None, size=None):
    frontiers = []
    for datum in islice(
            batchloader('train', batchsize=1, compute_sketches=False,
                        filter_depth=filter_depth), size):
        # TODO
        # class Task(object):
        #     def __init__(self, name, request, examples, features=None, cache=False):
        frontiers.append(
            Frontier([
                FrontierEntry(datum.p,
                              logPrior=basegrammar.logLikelihood(datum.tp, datum.p),
                              logLikelihood=0)
            ], Task('dummyName', datum.tp, [])))
    return basegrammar.insideOutside(frontiers, pseudoCounts, iterations=1)
def main():
    corpus = Corpus(
        output="stack_without_dupes/result-{}.csv".format(CORPUS_SIZE))
    frontier = Frontier(corpus, 10, 8, duplicate_identification=True,
                        verbose=VERBOSE, debug=DEBUG)
    crawler = Crawler(SEEDS, corpus, frontier, corpuse_max_size=CORPUS_SIZE,
                      duplicate_identification=True, verbose=VERBOSE, debug=DEBUG)

    print("Starting at {}".format(datetime.now()))
    crawler.start()
    print("Done at {}".format(datetime.now()))
def _pyccg_meanings_to_ec_frontiers(self, pyccg_meanings):
    """
    Returns:
        pyccg_frontiers: dict from task -> Dreamcoder frontiers.
    """
    pyccg_frontiers = {}
    for task in pyccg_meanings:
        if len(pyccg_meanings[task]) > 0:
            frontier_entries = []
            for (meaning, log_prob) in pyccg_meanings[task]:
                ec_sexpr = self.pyccg_learner.ontology.as_ec_sexpr(meaning)
                if self.ec_ontology_translation_fn:
                    ec_sexpr = self.ec_ontology_translation_fn(
                        ec_sexpr, is_pyccg_to_ec=True)

                # Uses the p=1.0 likelihood (log-likelihood 0.0) for programs
                # that solve the task.
                frontier_entry = FrontierEntry(
                    program=Program.parse(ec_sexpr),
                    logPrior=log_prob,
                    logLikelihood=0.0)
                frontier_entries.append(frontier_entry)

            pyccg_frontiers[task] = Frontier(frontier_entries, task)
    return pyccg_frontiers
def a_star(matrix, start, goal, estimate=manhattan_dist):
    """
    Find the path from start to the goal using the A* search algorithm
    The algorithm is implemented based on the description on Wikipedia:
    https://en.wikipedia.org/wiki/A*_search_algorithm
    Notice: with an admissible heuristic such as the Manhattan distance,
    A* returns an optimal path.
    :param estimate: Heuristic used in the A* search
    :param matrix: Search space, as a 2D list
    :param start: Start point, as a tuple
    :param goal: Goal point, as a tuple
    :return: The path (if found) from start to goal, or None
    """
    print('Analytics: start node ' + str(start) + ', goal node ' + str(goal))
    # The set of nodes already evaluated
    visited = set()

    # For each node, the cost of getting from the start node to that node.
    # The cost of going from start to start is zero.
    g_score = {start: 0}

    # For each node, the total cost of getting from the start node to the goal
    # by passing by that node. That value is partly known, partly heuristic.
    # For the first node, that value is completely heuristic.
    f_score = {start: estimate(start, goal)}

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    # frontier is implemented as a priority queue
    frontier = Frontier()
    frontier.add(start, f_score[start])

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, came_from will eventually contain the
    # most efficient previous step.
    came_from = {}

    while frontier:
        current, current_f_score = frontier.pop_nearest()
        if current == goal:
            print('Analytics: ' + str(len(visited)) + ' expanded nodes, out of ' +
                  str(count_nodes(matrix)) + ' nodes')
            # draw_expanded_nodes(matrix, visited)
            return reconstruct_path(came_from, current)

        visited.add(current)
        for neighbor in expand(current, matrix):
            if neighbor not in visited:
                g_through_current = g_score[current] + 1  # every neighbor has distance 1
                if (neighbor not in frontier
                        or g_through_current < g_score[neighbor]):
                    # Discover a new node or a better path
                    came_from[neighbor] = current
                    g_score[neighbor] = g_through_current
                    f_score[neighbor] = g_score[neighbor] + estimate(neighbor, goal)
                    frontier.add(neighbor, f_score[neighbor])
    return None
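# greedy() and a_star() share a few helpers that are not shown. Plausible
# implementations, reconstructed from how they are called (hypothetical,
# assuming a 4-connected grid):
def manhattan_dist(a, b):
    # Admissible grid heuristic: |dx| + |dy| never overestimates the true cost.
    return abs(a[0] - b[0]) + abs(a[1] - b[1])


def reconstruct_path(came_from, current):
    # Walk the came_from chain back from the goal to the start.
    path = [current]
    while current in came_from:
        current = came_from[current]
        path.append(current)
    path.reverse()
    return path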
import atexit
import logging
import sys

from corpus import Corpus
from crawler import Crawler
from frontier import Frontier

if __name__ == "__main__":
    # Configures basic logging
    logging.basicConfig(
        format='%(asctime)s (%(name)s) %(levelname)s %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=logging.INFO)

    # Instantiates frontier and loads the last state if it exists
    frontier = Frontier()
    frontier.load_frontier()

    # Instantiates corpus object with the given cmd arg
    corpus = Corpus(sys.argv[1])

    # Registers a shutdown hook to save frontier state upon unexpected shutdown
    atexit.register(frontier.save_frontier)

    # Instantiates a crawler object and starts crawling
    crawler = Crawler(frontier, corpus)
    crawler.start_crawling()
    crawler.analytics()
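# load_frontier() and save_frontier() are not shown. A minimal persistence
# sketch, assuming the frontier pickles its pending URLs to a state file (the
# file name and the to_visit attribute are hypothetical):
import os
import pickle


class Frontier:
    STATE_FILE = "frontier.state"

    def __init__(self):
        self.to_visit = []

    def load_frontier(self):
        # Resume from the last saved state if a previous run left one behind.
        if os.path.exists(self.STATE_FILE):
            with open(self.STATE_FILE, "rb") as f:
                self.to_visit = pickle.load(f)

    def save_frontier(self):
        # Registered with atexit above, so an interrupted crawl can resume.
        with open(self.STATE_FILE, "wb") as f:
            pickle.dump(self.to_visit, f)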
def __init__(self, space):
    self.__space = space
    self.__frontier = Frontier()
    self.__goal = None
    self.__origin = None
def a_star_multidots(edges, start: tuple, goals: tuple, estimate=mst_estimator):
    """
    Find a path from start that visits all dots using the A* search algorithm
    The algorithm is implemented based on the description on Wikipedia:
    https://en.wikipedia.org/wiki/A*_search_algorithm
    Notice: with an admissible heuristic such as the MST estimator,
    A* returns an optimal path.
    :param estimate: Heuristic used in the A* search
    :param edges: Search space, as a 2D list
    :param start: Start point, as a tuple
    :param goals: Goal points, as a tuple of all dots
    :return: The path (if found) from start through all dots, or None
    """
    print('Analytics: start node ' + str(start) + ', dots node ' + str(goals))
    goals_to_indices = {g: i for i, g in enumerate(goals, 2)}
    start = init_state(start, goals)
    if start[0:2] in goals:
        start = mark_visited(start[0:2], goals_to_indices, start)

    # The set of nodes already evaluated
    visited = set()

    # For each node, the cost of getting from the start node to that node.
    # The cost of going from start to start is zero.
    g_score = {start: 0}

    # For each node, the total cost of getting from the start node to the dots
    # by passing by that node. That value is partly known, partly heuristic.
    # For the first node, that value is completely heuristic.
    # f_score = {start: naive_estimator(start, dots_visited[start], goals)}
    f_score = {start: estimate(start, goals, edges)}

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    # frontier is implemented as a priority queue
    frontier = Frontier()
    frontier.add(start, f_score[start])

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, came_from will eventually contain the
    # most efficient previous step.
    came_from = {}

    while frontier:
        current, current_f_score = frontier.pop_nearest()
        if current[2:].count(1) == len(current) - 2:
            print('Analytics: ' + str(len(visited)) + ' expanded nodes, out of ' +
                  str(len(edges) * (2 ** (len(current) - 2))) + ' nodes')
            return reconstruct_path(came_from, current)

        visited.add(current)
        for neighbor in expand_multidots(current, edges):
            if neighbor[0:2] in goals:
                neighbor = mark_visited(neighbor[0:2], goals_to_indices, neighbor)
            if neighbor not in visited:
                # Subtract 1 here because the edge_maps contains both start and end for
                # the shortest path between dots
                g_through_current = (g_score[current] +
                                     len(edges[current[0:2]][neighbor[0:2]]) - 1)
                if (neighbor not in frontier
                        or g_through_current < g_score[neighbor]):
                    # Discover a new node or a better path
                    came_from[neighbor] = current
                    g_score[neighbor] = g_through_current
                    f_score[neighbor] = (g_score[neighbor] +
                                         estimate(neighbor, goals, edges))
                    frontier.add(neighbor, f_score[neighbor])
    return None
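# a_star_multidots() packs the position and per-dot visited flags into one
# hashable tuple: (row, col, flag_dot0, flag_dot1, ...), with goals_to_indices
# mapping each dot to its slot (starting at index 2). Sketches of init_state
# and mark_visited consistent with that encoding (hypothetical):
def init_state(position, goals):
    # All dots start unvisited; the goal test checks that every flag is 1.
    return tuple(position) + (0,) * len(goals)


def mark_visited(position, goals_to_indices, state):
    # Flip the flag for the dot at `position`, returning a new hashable state.
    state = list(state)
    state[goals_to_indices[position]] = 1
    return tuple(state)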
        if rp.can_fetch("*", url):
            get_urls(driver, frontier, page_id)
        elif is_html:
            # no robots.txt => parse everything :)
            # Write site to database without
            get_urls(driver, frontier, page_id)

        if not frontier.has_urls():
            print(th_num + " sleep")
            time.sleep(10)

    driver.close()


if __name__ == "__main__":
    frontier = Frontier(seed)
    robots = []
    rp = RobotFileParser()
    sp = SitemapParser()
    db = Database(use_database)
    init_sites()
    print(robots)
    start = time.time()

    # Read thread num argument
    thread_num = 1
    print(sys.argv)
    if len(sys.argv) > 1:
        thread_num = int(sys.argv[1])
def __init__(self, num_of_workers=1, seed=False, seed_path=None):
    self.n_workers = num_of_workers
    self.frontier = Frontier(seed=seed, seed_path=seed_path)
"http://www.dce.harvard.edu", "http://hsdm.harvard.edu", "http://www.fas.harvard.edu", "http://hds.harvard.edu", "http://www.gsd.harvard.edu", "http://www.gse.harvard.edu", "http://www.gsas.harvard.edu", "http://www.seas.harvard.edu", "https://www.hks.harvard.edu", "http://hls.harvard.edu", "http://www.radcliffe.harvard.edu", "http://hms.harvard.edu", "https://www.hsph.harvard.edu" ] frontier = Frontier(SEEDS) crawler = Crawler() FILE = "/Users/Sun/Documents/IR/Data/HW3/pages/page" FRONTIER_BACKUP = "/Users/Sun/Documents/IR/Data/HW3/pages/frontier" # frontier.restore(open(FRONTIER_BACKUP)) crawled = 0 MIN_CRAWL = 35000 purl = None DOMAIN_TIMESTAMP = {} while not frontier.empty() and crawled < MIN_CRAWL: if crawled % 100 == 0: print str(crawled) + " pages crawled " + str(
def __init__(self):
    self.count = 0
    self.last_domain = ''
    self.frontier = Frontier()
    self.store = Store()
from frontier import Frontier
from parser import Parser
from graph import Graph
from pagerank import Ranker
from indexer import Indexer
from scorer import Scorer

frontier = Frontier([
    'http://mysql12.f4.htw-berlin.de/crawl/d01.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d06.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d08.html'
])
parser = Parser()
indexer = Indexer()
web_graph = Graph()

for url in frontier:
    # get outgoing links for the graph and content for tokenization
    body, links_on_page = parser.parse(url)

    # add document to indexer
    indexer.add_document(url, body)

    # build our webgraph
    node = web_graph.get_node(url)
    if node is None:
        node = web_graph.add_node(url)
    for out_link in links_on_page:
        web_graph.add_edge(url, out_link)
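# Graph is used only through get_node, add_node, and add_edge here. A minimal
# adjacency-set sketch matching that usage (hypothetical; the real Graph
# presumably carries the bookkeeping that Ranker and Scorer need):
class Graph:
    def __init__(self):
        self._nodes = {}  # url -> set of outgoing urls

    def get_node(self, url):
        # Returns None for unknown urls, as the crawl loop expects.
        return self._nodes.get(url)

    def add_node(self, url):
        self._nodes[url] = set()
        return self._nodes[url]

    def add_edge(self, source, target):
        # Record the outgoing link, creating endpoints on demand.
        self._nodes.setdefault(source, set()).add(target)
        self._nodes.setdefault(target, set())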