Example #1
def crawl(url):
    try:
        full_url = urlparse(url)
        req = urllib.request.Request(url, headers=header)
        page = urlopen(req)
        soup = BeautifulSoup(page, 'html.parser')
        text = get_text(soup)
        links = get_links(soup)
        allowed_links = prt.parse_and_test_links(get_base_url(full_url), links,
                                                 user_agent)
        checked_links = N.check_duplicate_urls(allowed_links)

        N.check_and_save_page(url, text)

        Frontier.prioritize_urls(checked_links, number_of_priorities)
    except (urllib.error.HTTPError, urllib.error.URLError):
        # Skip pages that cannot be fetched
        pass
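
A hedged sketch of the helpers this snippet assumes; get_text and get_links are not shown in the original, so these are plausible stand-ins built on the BeautifulSoup API:

def get_text(soup):
    # Collapse the parsed page to plain text
    return soup.get_text(separator=' ', strip=True)

def get_links(soup):
    # Collect every href target from the page's anchor tags
    return [a['href'] for a in soup.find_all('a', href=True)]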
Example #2
    def __init__(self, expert):
        self.expert_mode = expert  # Whether the game is played in expert mode
        # Frontier visuals
        (self.frontiers_visual, self.frontiers_p0_rect, self.frontiers_p1_rect,
         self.frontiers_p2_rect) = self.initializeFrontiers()
        self.cards_visual = self.initializeCards()  # Card visuals
        self.frontiers = []  # List of "Frontier" objects
        for i in range(9):  # Initialize the "Frontier" objects
            self.frontiers.append(Frontier(i))
        self.ClanCardDraw = Card_Draw("Clan")  # Initialize the clan card draw pile
        if expert:  # The game is played in expert mode
            self.TacticCardDraw = Card_Draw("Tactic")  # Initialize the tactic card draw pile
            self.nTacticCardPlayed = [0, 0]  # Tactic cards played by each of the two players
            self.DiscardPile = []  # List of discarded cards
        self.hands = [Hand(expert), Hand(expert)]  # Initialize both players' "Hand" objects
        self.hands[0].fillHand(self.ClanCardDraw)  # Fill the first player's hand
        self.hands[1].fillHand(self.ClanCardDraw)  # Fill the second player's hand
        # Positions of the cards in the players' hands
        self.player1_cards_rect, self.player2_cards_rect = self.initializeHandCardsRect()
        # Positions of the played cards
        self.frontiers_sides_cards_rect = self.initializeSideCardsRect()
        self.winner = 0
Example #3
    def DepthFirstSearch(self, initial_state):
        """
        Perform depth-first search of the problem
        """
        # configure search: for DFS, we want the Frontier with the LIFO Stack
        self._frontier = Frontiers.FrontierLIFO()

        # run search
        return self._tree_search(initial_state)
Example #4
    def BreadthFirstSearch(self, initial_state):
        """
        Perform breadth-first search of the problem,
        starting at a given initial state.
        """
        # configure search: for BFS, we want the Frontier with the FIFO Queue
        self._frontier = Frontiers.FrontierFIFO()

        # run search
        return self._tree_search(initial_state)
Example #5
    def DepthFirstSearch(self, initial_state):
        """
        Perform depth-first search of the problem,
        starting at a given initial state.
        :param initial_state: a Problem State
        :return: SearchTerminationRecord
        """
        # configure search: for DFS, we want the Frontier with the LIFO Stack
        self._frontier = Frontiers.FrontierLIFO()

        # run search
        return self._tree_search(initial_state)
Example #6
    def DepthLimitedSearch(self, initial_state, limit):
        """
        Perform depth-limited search of the problem,
        starting at a given initial state.
        """
        # configure search: we want the LIFO Frontier with the depth limit
        self._frontier = Frontiers.FrontierLIFO_DL(limit)

        # run search
        result = self._tree_search(initial_state)

        result.cutoff = self._frontier._cutoff
        return result
Example #7
    def DepthLimitedSearch(self, initial_state, limit):
        """
        Perform depth-limited search of the problem,
        starting at a given initial state.
        :param initial_state: a Problem State
        :param limit: the maximum allowable depth
        :return: SearchTerminationRecord
        """
        # configure search: we want the LIFO Frontier with the depth limit
        self._frontier = Frontiers.FrontierLIFO_DL(limit)

        # run search
        result = self._tree_search(initial_state)

        # another attribute to indicate whether the depth limit truncated the
        # search (cutoff = True), or the search space was shallower than the
        # limit and we searched it all (cutoff = False).
        # Iterative Deepening needs this to know when to stop searching deeper.
        result.cutoff = self._frontier._cutoff
        return result
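
The cutoff flag is what lets Iterative Deepening decide whether deepening further is worthwhile. A minimal sketch (not from the original code) of such a wrapper, assuming SearchTerminationRecord exposes a success attribute:

    def IterativeDeepeningSearch(self, initial_state):
        limit = 0
        while True:
            result = self.DepthLimitedSearch(initial_state, limit)
            # Stop if a goal was found, or the whole space was searched
            # without the limit ever truncating a branch (cutoff = False)
            if result.success or not result.cutoff:
                return result
            limit += 1  # the limit cut the search short, so deepen and retry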
Example #8
def searchAlgorithm(problem, maze):
    depth = 0
    sol = None
    closed = []
    fringe = Frontier.Frontier()

    initial_node = Node.Node()
    initial_node.idState = problem.initial
    initial_node.cost = 0
    initial_node.parent = None
    initial_node.action = None
    initial_node.depth = 0
    initial_node.heuristic = heuristic(problem.initial, problem.objective)
    initial_node.value = calcValue(initial_node, problem.strategy)
    fringe.insertNode(initial_node)

    while True:
        if depth == 1000000:
            print("The algorithm reached the iteration limit.")
            break
        elif fringe.isEmpty():
            print("The frontier is empty.")
            break
        else:
            currentNode = fringe.removeNode()
            if problem.goal(currentNode.idState):
                sol = solution(currentNode)
                break
            if not isIn(currentNode, closed):
                neighbors = succerssorFunction(currentNode, maze.grid)
                fringe.insertList(
                    initNodes(currentNode, neighbors, problem.objective,
                              problem.strategy, maze.grid))
                closed.append(currentNode)
                depth += 1
    writeSolution(sol, problem.strategy, maze)
    drawSolution(sol, fringe.frontier, closed, maze, problem.strategy)
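
The fringe is ordered by node.value, so calcValue alone determines the search strategy. A hypothetical sketch, assuming the fringe pops the smallest value first and assuming strategy names that are not shown in the original:

def calcValue(node, strategy):
    if strategy == "breadth":     # BFS: shallowest node first
        return node.depth
    elif strategy == "uniform":   # uniform cost: cheapest path so far first
        return node.cost
    elif strategy == "greedy":    # greedy best-first: smallest heuristic first
        return node.heuristic
    else:                         # A*: path cost so far plus heuristic
        return node.cost + node.heuristic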
Example #9
def main():
    Frontier.initialize_queues_and_priorities(number_of_crawlers,
                                              number_of_priorities)
    Frontier.prioritize_urls(seed, number_of_priorities)
    Frontier.update_back_queue()
    start(number_of_crawlers)
Example #10
def crawler():
    while True:
        url = Frontier.get_url_from_back_queue()
        if url is None:
            break
        crawl(url)
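
Examples #9 and #10 leave start(number_of_crawlers) undefined; a minimal sketch, assuming each crawler simply runs in its own thread:

import threading

def start(number_of_crawlers):
    threads = [threading.Thread(target=crawler) for _ in range(number_of_crawlers)]
    for t in threads:
        t.start()
    for t in threads:  # wait for all crawlers to drain the frontier
        t.join()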
Example #11
    def cspace_callback(self, cspace_og):
        """
        Brief: Callback for /cspace
        Details:
            - Constructs list of frontiers from C-space
            - Publishes frontier and centroid gridcells
            - Publishes goal position of highest-priority frontier centroid
        Inputs: cspace_og [OccupancyGrid]
        Return: None
        """

        # Start timing
        stopwatch = Stopwatch()

        # Build frontier list from C-space
        frontier_list = []
        m = cspace_og.info.height
        n = cspace_og.info.width
        for i in range(m):
            for j in range(n):
                if og_on_frontier_ij(i, j, cspace_og):

                    # Potential frontier starting point
                    start = (i, j)

                    # Check against existing frontiers
                    new_frontier = True
                    for frontier in frontier_list:
                        if frontier.has_point(start):
                            new_frontier = False
                            break
                    if new_frontier:

                        # Generate new frontier from starting point
                        frontier = Frontier(start)

                        # Expand using wavefront algorithm
                        queue = Queue()
                        queue.put(start)
                        visited = {}
                        while not queue.empty():
                            curr = queue.get()  # dequeue the next cell to expand
                            visited[curr] = True
                            i_c, j_c = curr
                            for i_n in range(i_c - 1, i_c + 2):
                                for j_n in range(j_c - 1, j_c + 2):
                                    neighbor = (i_n, j_n)
                                    if (neighbor not in visited and
                                            og_on_frontier_ij(i_n, j_n, cspace_og)):
                                        visited[neighbor] = True
                                        queue.put(neighbor)
                                        frontier.add_point(neighbor)

                        # Add to list of frontiers
                        frontier_list.append(frontier)

        # Convert frontier list to priority queue
        frontier_points = []
        centroid_points = []
        frontier_queue = PriorityQueue()
        for frontier in frontier_list:

            # Goal position is centroid of frontier
            centroid = frontier.get_centroid()
            c_i = centroid[0]
            c_j = centroid[1]
            c_x, c_y = og_ij_to_xy(c_i, c_j, cspace_og)
            goal = PoseStamped()
            goal.pose.position = Point(c_x, c_y, 0.0)

            # Get path time cost from A*
            rospy.wait_for_service('waypoints')
            get_waypoints = rospy.ServiceProxy('waypoints', CalcWaypoints)
            resp = get_waypoints(goal, Bool(False))

            # If a valid path exists:
            if resp.exception.data == 'none':

                # Add frontier to the priority queue, keyed by path time cost
                priority = resp.time_cost.data
                frontier_queue.put((priority, frontier))

                # Add points to gridcells
                frontier_points += frontier.get_points()
                centroid_points.append(frontier.get_centroid())

        # Check if map is complete
        if frontier_queue.empty():

            # Map complete - publish home position to goal
            if self.nav_state == 'exploring':
                goal = PoseStamped()
                goal.pose.position = Point(self.robot_initial_x,
                                           self.robot_initial_y, 0.0)
                self.pub_goal.publish(goal)
                self.pub_debug.publish('Map complete - returning home')
                self.nav_state = 'returning'
                if hypot(self.robot_initial_x - self.robot_x,
                         self.robot_initial_y - self.robot_y) < 0.2:
                    self.pub_debug.publish('Returned home - ready to race')
                    self.nav_state = 'racing'

        else:

            # Map incomplete - publish top priority frontier to goal
            if self.nav_state != 'racing':
                _, best = frontier_queue.get()  # frontier with the lowest time cost
                i, j = best.get_centroid()
                x, y = og_ij_to_xy(i, j, cspace_og)
                goal = PoseStamped()
                goal.pose.position = Point(x, y, 0.0)
                self.pub_goal.publish(goal)
                self.pub_debug.publish('Frontier goal published')
                self.nav_state = 'exploring'

        # Publish frontier and centroid gridcells
        og_pub_gridcells(frontier_points, 0.002, self.pub_frontiers_gc,
                         cspace_og)
        og_pub_gridcells(centroid_points, 0.004, self.pub_centroids_gc,
                         cspace_og)

        # Finish timing
        dur = stopwatch.stop()
        timing_msg = 'Frontier generation: ' + str(dur) + ' sec'
        self.pub_timing.publish(timing_msg)
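
The callback relies on og_on_frontier_ij to classify cells, which is not shown. A hedged sketch under the usual frontier definition (a known-free cell bordering unknown space), assuming the standard OccupancyGrid encoding (-1 unknown, 0 free):

def og_on_frontier_ij(i, j, og):
    m, n = og.info.height, og.info.width
    if not (0 <= i < m and 0 <= j < n):
        return False
    if og.data[i * n + j] != 0:  # must be known free space
        return False
    for di in (-1, 0, 1):
        for dj in (-1, 0, 1):
            ni, nj = i + di, j + dj
            if 0 <= ni < m and 0 <= nj < n and og.data[ni * n + nj] == -1:
                return True  # borders at least one unexplored cell
    return False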
Example #12
def main():
    Frontier.add_seed(seed)
    Crawler.crawl()
Example #13
    def __init__(self, seedUrls):
        self.__frontier = Frontier(seedUrls)
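
The Frontier interface assumed here (getNode/setNode, with "" signalling exhaustion, as the next example shows) is not included in the original; a hypothetical minimal version:

class Frontier:
    def __init__(self, seedUrls):
        self.__queue = list(seedUrls)
        self.__seen = set(seedUrls)

    def getNode(self):
        # Next URL to crawl, or "" once the frontier is exhausted
        return self.__queue.pop(0) if self.__queue else ""

    def setNode(self, url):
        # Enqueue each discovered URL exactly once
        if url not in self.__seen:
            self.__seen.add(url)
            self.__queue.append(url)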
Example #14
class Crawler:
    '''
    classdocs
    '''
    seedUrls = [""]
    __webGraph = {}
    __siteContents = []
    __frontier = None
    __soup = None

    baseURL = "http://mysql12.f4.htw-berlin.de/crawl/"  # create base url via urlparse

    def __init__(self, seedUrls):
        self.__frontier = Frontier(seedUrls)

    def startCrawling(self):
        node = self.__frontier.getNode()
        while node != "":
            self.__downloadPage(node)
            node = self.__frontier.getNode()
        return (self.__webGraph, self.__siteContents)

    def __downloadPage(self, seedUrl):
        """
        Download the page at the given URL via urllib2 and process it with
        BeautifulSoup; afterwards, text and links are extracted.
        """
        response = urllib2.urlopen(seedUrl)  # TODO catch urllib2.HTTPError, e.g. HTTP Error 404: Not Found
        html = response.read()
        self.__soup = BeautifulSoup(html)  # get the html soup
        self.__getLinksFromPage(seedUrl)
        self.__getTextFromPage(seedUrl)
        response.close()  # best practice: close the response when done

    def __getLinksFromPage(self, seedUrl):
        """
        Extract all URLs from a page processed with BeautifulSoup and add them
        to the webgraph dictionary. Found links are also added to the frontier.
        :param seedUrl: the URL of the page from which the links are extracted
        """
        links = []
        for link in self.__soup.find_all('a'):
            link = self.__validateUrl(seedUrl, link.get('href'))
            links.append(link)
            self.__frontier.setNode(link)
        self.__webGraph.update({seedUrl: links})

    def __getTextFromPage(self, seedUrl):
        """
        Extract all text from a page processed with BeautifulSoup and append it
        to the siteContents list.
        :param seedUrl: the URL of the page from which the text is extracted
        """
        [s.extract() for s in self.__soup('a')]  # remove all <a> link tags
        bodyContent = self.__soup.body.get_text()
        regex = re.compile('[%s]' % re.escape(string.punctuation))  # regex matching any punctuation
        words = regex.sub(" ", bodyContent).split()  # replace punctuation with whitespace, split into words
        self.__siteContents.append((seedUrl, words))

    def __validateUrl(self, seedUrl, url):
        return self.baseURL + url  # TODO extract base url if missing