Example No. 1
def greedy_approx(G):
    """ Return MST of the given undirected graph"""
    vis = set()
    tot_weight = 0
    pq = PQDict()
    path = []
    '''Initialize the priority queue used to find the farthest node once distances from the visited set are calculated'''
    for node in G.nodes():
        pq.additem(node, float("-inf"))

    curr = pq.pop()
    vis.add(curr)
    path.append(curr)
    while len(pq) > 0:
        for s, nod, wt in G.edges(curr, data=True):
            '''Distance calculation'''
            if nod not in vis and -wt['weight'] > pq[nod]:
                pq.updateitem(nod, -wt['weight'])

        if len(pq) > 0:
            ''' Selection Step'''
            top = pq.top()
            vis.add(top)
            curr = pq.pop()
            ''' Insertion Step'''
            loc, cost = minCost(G, path, top)
            '''Insert into the location found by minCost()'''
            path.insert(loc, top)
            tot_weight += cost

    return path, tot_weight
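
The insertion step above calls a minCost(G, path, top) helper that is not part of this excerpt. Judging only from how it is used (it returns an insertion index and the added cost, and path.insert(loc, top) follows), a hypothetical cheapest-insertion version might look like the sketch below; the helper body, and the assumption that G is a complete networkx graph with 'weight' edge attributes, are illustrations rather than the original project's code.

def minCost(G, path, node):
    """Hypothetical helper: cheapest position to insert `node` into `path`.

    Returns (index, added_cost), where inserting `node` at `index` increases
    the tour length the least. Assumes G is complete so every needed edge
    exists; the closing edge of the tour is ignored in this sketch.
    """
    if len(path) == 1:
        # Only one node in the tour so far: append after it.
        return 1, G[path[0]][node]['weight']

    best_loc, best_cost = None, float("inf")
    for i in range(len(path) - 1):
        u, v = path[i], path[i + 1]
        # Cost of replacing edge (u, v) with (u, node) + (node, v).
        cost = (G[u][node]['weight'] + G[node][v]['weight']
                - G[u][v]['weight'])
        if cost < best_cost:
            best_loc, best_cost = i + 1, cost
    return best_loc, best_cost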
Example No. 2
def primMST(G):
    """ Return MST of the given undirected graph"""
    vis = set()
    tot_weight = 0
    pq = PQDict()
    Gprime = nx.Graph()
    ''' Add all nodes to PQDict with infinite distance'''
    for node in G.nodes():
        pq.additem(node, float("inf"))

    curr = pq.pop()  #Select initial node
    vis.add(curr)
    while len(pq) > 0:
        for s, nod, wt in G.edges(curr, data=True):
            if nod not in vis and wt['weight'] < pq[nod]:
                pq.updateitem(nod, wt['weight'])

        if len(pq) > 0:
            top = pq.top()
            source, destination, dist = [
                data for data in sorted(G.edges(top, data=True),
                                        key=lambda edge: edge[2]['weight'])
                if data[1] in vis
            ][0]
            Gprime.add_edge(source, destination, weight=dist['weight'])
            vis.add(top)
            tot_weight += pq[top]
            curr = pq.pop()

    return Gprime, tot_weight
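
A small usage sketch for primMST, assuming G is a networkx graph and PQDict comes from the legacy pqdict package (with additem, updateitem, top, and pop, exactly as used above); the graph and its weights are made up for illustration:

import networkx as nx
from pqdict import PQDict   # legacy pqdict API assumed: additem/updateitem/top/pop

G = nx.Graph()
G.add_edge("a", "b", weight=4)
G.add_edge("a", "c", weight=1)
G.add_edge("b", "c", weight=2)
G.add_edge("b", "d", weight=5)
G.add_edge("c", "d", weight=8)

mst, total = primMST(G)
print(sorted(mst.edges()), total)                          # expected total: 1 + 2 + 5 = 8
print(nx.minimum_spanning_tree(G).size(weight="weight"))   # networkx agrees: 8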
Example No. 5
    def a_star(self, heuristic):
        node = self.tree.create_node(state=State(self.wrigglers), pathCost=0)
        node.heuristic = heuristic(node)

        frontier = PQDict()
        stateFrontier = {}
        explored = {}

        # Sacrifice memory to have a huge speed up being able to instantly check for state in frontier
        stateFrontier[str(node.state)] = node.heuristic
        frontier.additem(node._identifier, node.heuristic)

        while(True):
            if(len(frontier) == 0):
                return None

            nodeID = frontier.popitem()[0]
            node = self.tree.get_node(nodeID)
            nodeStateStr = str(node.state)

            del stateFrontier[nodeStateStr]

            if self.testGoal(node.state):
                return node

            explored[nodeStateStr] = -1  # we don't care what the hash matches
            actions = self.getActions(node.state)
            for action in actions:
                child = self.childNode(node, action)
                child.heuristic = heuristic(child)
                childStr = str(child.state)

                inExplored = False
                inFrontier = False

                if childStr in explored:
                    inExplored = True

                bGreater = False
                if childStr in stateFrontier:
                    # Replace the frontier entry only when this path reaches the
                    # same state with a lower f = g + h (the stored value is greater).
                    if stateFrontier[childStr] > child.heuristic + child.pathCost:
                        bGreater = True
                    inFrontier = True

                if not inExplored and not inFrontier:
                    stateFrontier[childStr] = child.heuristic + child.pathCost
                    frontier.additem(child._identifier, child.heuristic + child.pathCost)
                elif bGreater:
                    bHappened = False
                    for key in frontier:
                        if str(self.tree.get_node(key).state) == childStr:
                            bHappened = True
                            frontier.pop(key)
                            frontier.additem(child._identifier, child.heuristic + child.pathCost)
                            stateFrontier[childStr] = child.heuristic + child.pathCost
                            break
                    assert bHappened
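
The frontier above treats PQDict as a min-priority queue keyed by tree-node identifiers: additem inserts a node with its f = g + h value, popitem removes and returns the (identifier, priority) pair with the smallest f, and pop(key) deletes a specific entry. A minimal sketch of just that pattern, with made-up identifiers and costs (legacy pqdict API assumed):

from pqdict import PQDict   # legacy pqdict API assumed: additem/popitem/pop

frontier = PQDict()
frontier.additem("n1", 7.0)      # priorities are f = g + h; identifiers and values are made up
frontier.additem("n2", 3.5)
frontier.additem("n3", 5.0)

node_id, f = frontier.popitem()  # ("n2", 3.5): the entry with the lowest f comes out first
print(node_id, f)

# Replacing an entry, as the elif branch above does: remove the stale key,
# then re-insert the cheaper child under its own identifier.
frontier.pop("n3")
frontier.additem("n3-better", 4.0)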
    def test_pop(self):
        # pop selected item - return pkey
        pq = PQDict(A=5, B=8, C=1)
        pkey = pq.pop('B')
        self.assertEqual(pkey, 8)
        pq.pop('A')
        pq.pop('C')
        self.assertRaises(KeyError, pq.pop, 'A')
        self.assertRaises(KeyError, pq.pop, 'does_not_exist')
        # no args and empty - throws
        self.assertRaises(KeyError, pq.pop)  # pq is now empty
        # no args - return top dkey
        pq = PQDict(A=5, B=8, C=1)
        self.assertEqual(pq.pop(), 'C')
Example No. 8
def primWeight(G):
    """ Return MST of the given undirected graph"""
    vis = set()
    tot_weight = 0
    pq = PQDict()

    for node in G.nodes():
        pq.additem(node, float("inf"))

    curr = pq.pop()
    vis.add(curr)
    while len(pq) > 0:
        for s, nod, wt in G.edges(curr, data=True):
            if nod not in vis and wt['weight'] < pq[nod]:
                pq.updateitem(nod, wt['weight'])

        if len(pq) > 0:
            top = pq.top()
            vis.add(top)
            tot_weight += pq[top]
            curr = pq.pop()
    return tot_weight
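
primWeight returns only the total MST weight. Under the same assumptions as the primMST sketch above (networkx graph G, legacy pqdict), it can be sanity-checked against networkx directly:

# G is the small weighted graph from the primMST usage sketch above.
assert primWeight(G) == nx.minimum_spanning_tree(G).size(weight="weight")   # both equal 8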
Example No. 9
class Storage:
    """
    Kademlia storage implementation.

    Three responsibilities:
    - Storing data
    - Listing old keys to refresh
    - Keeping a record of data popularity and evicting unpopular data when
      a storage limit is reached.
    """
    implements(kademlia.storage.IStorage)

    max_len = 5000

    def __init__(self, args, ttl=604800, time=time):
        self.args = args
        self.time = time

        # linked
        self.popularity_queue = PQDict()
        self.age_dict = OrderedDict()

        # separate
        self.future_popularity_queue = PQDict()

        self.step = ttl

    def cull(self):
        if len(self.popularity_queue) > self.max_len:
            key = self.popularity_queue.pop()
            if self.args.verbose:
                log_info('Dropping key {} (over count {})'.format(binascii.hexlify(key), self.max_len))
            del self.age_dict[key]
        if len(self.future_popularity_queue) > self.max_len:
            key = self.future_popularity_queue.pop()
            if self.args.verbose:
                log_info('Dropping future key {} (over count {})'.format(binascii.hexlify(key), self.max_len))

    def inc_popularity(self, key):
        current = self.popularity_queue.get(key)
        if current is not None:
            self.popularity_queue[key] = current + self.step
        else:
            current = self.future_popularity_queue.get(key, self.time.time())
            self.future_popularity_queue[key] = current + self.step

    def _tripleIterable(self):
        ikeys = self.age_dict.iterkeys()
        ibirthday = imap(operator.itemgetter(0), self.age_dict.itervalues())
        ivalues = imap(operator.itemgetter(1), self.age_dict.itervalues())
        return izip(ikeys, ibirthday, ivalues)

    # interface methods below
    def __setitem__(self, key, value):
        age, oldvalue = self.age_dict.get(key) or (self.time.time(), None)
        if not validate(self.args, key, value, oldvalue)[0]:
            return
        if oldvalue is not None:
            self.age_dict[key] = (age, value)
        else:
            age = self.future_popularity_queue.pop(key, self.time.time())
            self.age_dict[key] = (self.time.time(), value)
            self.popularity_queue[key] = age
        self.cull()

    def __getitem__(self, key):
        self.inc_popularity(key)
        self.cull()
        return self.age_dict[key][1]

    def get(self, key, default=None):
        self.inc_popularity(key)
        self.cull()
        if key in self.age_dict:
            return self.age_dict[key][1]
        return default

    def iteritemsOlderThan(self, secondsOld):
        minBirthday = self.time.time() - secondsOld
        zipped = self._tripleIterable()
        matches = takewhile(lambda r: minBirthday >= r[1], zipped)
        return imap(operator.itemgetter(0, 2), matches)

    def iteritems(self):
        self.cull()
        return self.age_dict.iteritems()
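
popularity_queue is an ordinary min-priority PQDict, so cull() evicts the entry with the smallest priority, i.e. the least popular key. A minimal, self-contained sketch of that eviction pattern, stripped of the Kademlia-specific pieces (legacy pqdict API assumed):

from pqdict import PQDict   # legacy pqdict API assumed: additem/pop

MAX_LEN = 3                 # illustrative limit (the class above uses max_len = 5000)
popularity = PQDict()
for key, score in [("a", 10), ("b", 2), ("c", 7), ("d", 5)]:
    popularity.additem(key, score)

# pop() removes and returns the key with the smallest priority,
# so the least popular entry ("b") is evicted first.
while len(popularity) > MAX_LEN:
    print("evicting", popularity.pop())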
Example No. 10
class Crawler():
    def __init__(self):

        self.query = input("Enter search query: ")
        self.webpages_limit = input(
            "Set total number of webpages to be crawled: ")
        self.limit = input(
            "Set limits on how many webpages be crawled from single site: ")
        self.priority_queue = PQDict().maxpq()
        self.queue = queue.Queue()
        self.downloader = Downloader()
        self.parser = Parser(self.query)
        self.calculator = Calculator(self.query)
        self.relevance = Relevance()
        self.webpages_crawled = 0
        self.logger = logging.getLogger(__name__)
        self.visited_urls = set()
        self.sites_times = {}

    #fetch top 10 results from google search:
    def __fetch_google_results(self):
        service = build("customsearch", "v1", developerKey=API_KEY)
        res = service.cse().list(q=self.query, cx=SEARCH_ENGINE_ID).execute()
        return res

    #enqueue the 10 google search results
    def enqueue_seeds(self):
        res = self.__fetch_google_results()
        for item in res['items']:
            self.priority_queue.additem(item['link'], 10)
            self.queue.put(item['link'])
            self.logger.debug("Enqueued: " + item['link'])

    # Check whether this URL has been visited before,
    # whether the per-site crawl limit has been reached,
    # and whether it is allowed by the Robots Exclusion Protocol.
    def urlchecker(self, url):
        if url is None:
            return False
        normalized_url = urltools.normalize(url)
        robotparser = urllib.robotparser.RobotFileParser()

        try:
            url_comp = urlparse(normalized_url)
            base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
        except:
            self.logger.error("Cannot parse: " + url)
            return False
        try:
            robotparser.set_url(base_url + "robots.txt")
            robotparser.read()
            if not robotparser.can_fetch("*", normalized_url):
                self.logger.error(url + " is excluded due to protocol")
                return False
        except:
            self.logger.error("Cannot determine robots exclusion protocol: " +
                              url)

        if normalized_url in self.visited_urls:
            self.logger.debug(url + " Has been visited before! ")
            return False
        elif base_url in self.sites_times and self.sites_times[base_url] > int(
                self.limit):
            self.logger.debug(
                url + " : visits to this site have reached the limit")
            return False
        elif 'cgi' in normalized_url:
            return False
        else:
            return True

    #the crawling process
    def crawl(self):
        try:
            harvest_rate_accum = 0
            while self.webpages_crawled < int(self.webpages_limit):
                print(self.webpages_crawled)
                try:
                    url = self.priority_queue.pop()
                except KeyError:
                    print("cannot pop")
                    break
                print(url)
                if self.urlchecker(url):
                    try:
                        content = self.downloader.download(url).decode('utf-8')
                        if content is not None:
                            self.webpages_crawled += 1
                            rel = self.relevance.relevance(content, self.query)
                            harvest_rate_accum += rel
                            self.crawled_log(" Harvest rate: " +
                                             str(harvest_rate_accum /
                                                 self.webpages_crawled))
                    except:
                        print("Failed in downloading")
                        continue
                    normalized_url = urltools.normalize(url)
                    try:
                        url_comp = urlparse(normalized_url)
                        base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
                    except:
                        self.logger.error("Cannot parse: " + url)

                    if base_url in self.sites_times:
                        self.sites_times[base_url] += 1
                    else:
                        self.sites_times[base_url] = 1
                    self.visited_urls.add(normalized_url)

                    if rel < 0.2:
                        continue
                    for link in self.parser.extract_all_links(content):
                        full_link = self.parser.parse_links(url, link)
                        if full_link is not None:
                            link_promise = self.calculator.link_promise(
                                full_link) + rel
                            try:
                                self.priority_queue.additem(
                                    full_link, link_promise)
                            except:
                                pass
        except KeyError:
            print("Queue is empty now")

    def bfs_crawl(self):
        try:
            harvest_rate_accum = 0
            while self.webpages_crawled < int(self.webpages_limit):
                print(self.webpages_crawled)
                try:
                    url = self.queue.get()
                except Exception:
                    print("cannot pop")
                print(url)
                if self.urlchecker(url):
                    try:
                        content = self.downloader.download(url).decode('utf-8')
                        if content is not None:
                            self.webpages_crawled += 1
                            rel = self.relevance.relevance(content, self.query)
                            harvest_rate_accum += rel
                            self.crawled_log(" Harvest rate: " +
                                             str(harvest_rate_accum /
                                                 self.webpages_crawled))
                    except:
                        print("Failed in downloading")
                        continue
                    normalized_url = urltools.normalize(url)
                    try:
                        url_comp = urlparse(normalized_url)
                        base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
                    except:
                        self.logger.error("Cannot parse: " + url)
                    self.visited_urls.add(normalized_url)

                    for link in self.parser.extract_all_links(content):
                        full_link = self.parser.parse_links(url, link)
                        if full_link is not None:
                            try:
                                if base_url not in self.sites_times:
                                    self.sites_times[base_url] = 1
                                elif self.sites_times[base_url] < int(
                                        self.limit):
                                    self.sites_times[base_url] += 1
                                else:
                                    continue
                                self.queue.put(full_link)
                            except:
                                pass
        except KeyError:
            print("Queue is empty now")

    def crawled_log(self, log):
        with open('demo.log', 'a') as file:
            file.write(log + '\n\n')
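
The crawler's frontier is a max-priority queue: additem stores each URL with its promise score and pop returns the highest-scoring URL first. A minimal sketch of that frontier pattern in isolation, reusing the same calls the class makes (PQDict().maxpq(), additem, pop); the URLs and scores are made up:

from pqdict import PQDict   # legacy pqdict API assumed, as used by the crawler above

frontier = PQDict().maxpq()                    # max-priority queue: highest score pops first
frontier.additem("http://example.com/a", 10)   # seed URLs get a fixed score of 10
frontier.additem("http://example.com/b", 3.7)  # hypothetical link-promise scores
frontier.additem("http://example.com/c", 8.2)

while len(frontier) > 0:
    print("crawling", frontier.pop())          # /a first, then /c, then /b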