Example #1
0
File: tdb_sql.py  Project: rajbot/tikical
def get_read_table(tables):
    """Pick one entry from *tables*, weighted by the current load of the
    database host each entry lives on (per AppServiceMonitor.get_db_load).

    tables -- list of table groups; each entry is a sequence whose first
              element is bound to a SQLAlchemy engine (t[0].bind).
    Returns one element of *tables*.
    """
    # shortcut: nothing to balance with a single entry
    if len(tables) == 1:
        return tables[0]

    #'t' is a list of engines itself. since we assume those engines
    #are on the same machine, just take the first one. len(ips) may be
    #< len(tables) if some tables are on the same host.
    ips = dict((t[0].bind.url.host, t) for t in tables)
    ip_loads = AppServiceMonitor.get_db_load(ips.keys())

    total_load = 0
    missing_loads = []    # hosts we have no load data for
    no_connections = []   # hosts nearly out of connections
    have_loads = []       # (ip, max(current, average) load)

    for ip in ips:
        if ip not in ip_loads:
            missing_loads.append(ip)
        else:
            load, avg_load, conns, avg_conns, max_conns = ip_loads[ip]

            #prune high-connection machines
            if conns < .9 * max_conns:
                max_load = max(load, avg_load)
                total_load += max_load
                have_loads.append((ip, max_load))
            else:
                no_connections.append(ip)

    if total_load:
        avg_load = total_load / max(len(have_loads), 1)
        ip_weights = [(ip, 1 - load / total_load) for ip, load in have_loads]
    #total_load is 0 when have_loads is empty (or all loads are zero)
    else:
        avg_load = 1.0
        ip_weights = [(ip, 1.0 / len(have_loads)) for ip, load in have_loads]

    if missing_loads or no_connections:
        #add in the missing load numbers with an average weight
        ip_weights.extend((ip, avg_load) for ip in missing_loads)

        #add in the over-connected machines with a 1% weight
        ip_weights.extend((ip, .01) for ip in no_connections)

    # BUG FIX: always rebalance so the weights sum to 1.  Previously this
    # only happened when hosts were appended above, but the raw weights
    # (1 - load/total) sum to len(have_loads) - 1, not 1, which skewed
    # the roulette draw below toward the first hosts.  'or 1' guards the
    # empty-weights case.
    total_weight = sum(w[1] for w in ip_weights) or 1
    ip_weights = [(ip, weight / total_weight)
                  for ip, weight in ip_weights]

    # roulette-wheel selection over the normalized weights
    r = random.random()
    for ip, load in ip_weights:
        if r < load:
            return ips[ip]
        r = r - load

    #should never happen
    print('yer stupid')
    return random.choice(tables)
Example #2
0
def get_read_table(tables):
    """Pick one entry from *tables*, weighted by the current load of the
    database host each entry lives on (per AppServiceMonitor.get_db_load).

    tables -- list of table groups; each entry is a sequence whose first
              element is bound to a SQLAlchemy engine (t[0].bind).
    Returns one element of *tables*.
    """
    # short-cut for only one element
    if len(tables) == 1:
        return tables[0]

    #'t' is a list of engines itself. since we assume those engines
    #are on the same machine, just take the first one. len(ips) may be
    #< len(tables) if some tables are on the same host.
    ips = dict((t[0].bind.url.host, t) for t in tables)
    ip_loads = AppServiceMonitor.get_db_load(ips.keys())

    total_load = 0
    missing_loads = []    # hosts we have no load data for
    # NOTE: connection-count pruning is deliberately disabled in this
    # revision, so no_connections always stays empty; it is kept so the
    # weighting logic below stays unchanged.
    no_connections = []
    have_loads = []       # (ip, max(current, average) load)

    for ip in ips:
        if ip not in ip_loads:
            missing_loads.append(ip)
        else:
            load, avg_load, conns, avg_conns, max_conns = ip_loads[ip]
            # every host with load data is considered usable
            max_load = max(load, avg_load)
            total_load += max_load
            have_loads.append((ip, max_load))

    if total_load:
        avg_load = total_load / max(len(have_loads), 1)
        ip_weights = [(ip, 1 - load / total_load) for ip, load in have_loads]
    #total_load is 0 when have_loads is empty (or all loads are zero)
    else:
        avg_load = 1.0
        ip_weights = [(ip, 1.0 / len(have_loads)) for ip, load in have_loads]

    if missing_loads or no_connections:
        #add in the missing load numbers with an average weight
        ip_weights.extend((ip, avg_load) for ip in missing_loads)

        #add in the over-connected machines with a 1% weight
        ip_weights.extend((ip, .01) for ip in no_connections)

    #rebalance the weights so they sum to 1 ('or 1' guards empty weights)
    total_weight = sum(w[1] for w in ip_weights) or 1
    ip_weights = [(ip, weight / total_weight) for ip, weight in ip_weights]

    # roulette-wheel selection over the normalized weights
    r = random.random()
    for ip, load in ip_weights:
        if r < load:
            return ips[ip]
        r = r - load

    #should never happen
    print('yer stupid')
    return random.choice(tables)
Example #3
0
    def get_read_table(self, tables):
        from r2.lib.services import AppServiceMonitor
        # short-cut for only one element
        if len(tables) == 1:
            return tables[0]

        if self.dead:
            tables = set(tables)
            dead = set(t for t in tables if t[0].bind in self.dead)
            for t in list(dead):
                # TODO: tune the reconnect code.  We have about 1-2
                # requests per second per app, so this should
                # reconnect every 50-100 seconds.
                if (random.randint(1, 100) == 42
                        and self.test_engine(t[0].bind)):
                    dead.remove(t)
            tables = tables - dead

        #'t' is a list of engines itself. since we assume those engines
        #are on the same machine, just take the first one. len(ips) may be
        #< len(tables) if some tables are on the same host.
        ips = dict((t[0].bind.url.host, t) for t in tables)
        ip_loads = AppServiceMonitor.get_db_load(ips.keys())

        total_load = 0
        missing_loads = []
        no_connections = []
        have_loads = []

        for ip in ips:
            if ip not in ip_loads:
                missing_loads.append(ip)
            else:
                load, avg_load, conns, avg_conns, max_conns = ip_loads[ip]

                # remove high load machines from the pool.
                if load < 100:
                    max_load = max(load, avg_load)
                    total_load += max_load
                    have_loads.append((ip, max_load))
                else:
                    no_connections.append(ip)

        if total_load:
            avg_load = total_load / max(len(have_loads), 1)
            ip_weights = [(ip, 1 - load / total_load)
                          for ip, load in have_loads]
        #if total_load is 0, which happens when have_loads is empty
        else:
            avg_load = 1.0
            ip_weights = [(ip, 1.0 / len(have_loads))
                          for ip, load in have_loads]

        if missing_loads or no_connections:
            #add in the missing load numbers with an average weight
            ip_weights.extend((ip, avg_load) for ip in missing_loads)

            #add in the over-connected machines with a 1% weight
            ip_weights.extend((ip, .01) for ip in no_connections)

        #rebalance the weights
        total_weight = sum(w[1] for w in ip_weights) or 1
        ip_weights = [(ip, weight / total_weight) for ip, weight in ip_weights]

        r = random.random()
        for ip, load in ip_weights:
            if r < load:
                # print "db ip: %s" % str(ips[ip][0].metadata.bind.url.host)
                return ips[ip]
            r = r - load

        #should never happen
        print 'yer stupid'
        return random.choice(list(tables))
Example #4
0
    def get_read_table(self, tables):
        from r2.lib.services import AppServiceMonitor
        # short-cut for only one element
        if len(tables) == 1:
            return tables[0]

        if self.dead:
            tables = set(tables)
            dead = set(t for t in tables if t[0].bind in self.dead)
            for t in list(dead):
                # TODO: tune the reconnect code.  We have about 1-2
                # requests per second per app, so this should
                # reconnect every 50-100 seconds.
                if (random.randint(1,100) == 42 and 
                    self.test_engine(t[0].bind)):
                    dead.remove(t)
            tables = tables - dead

        #'t' is a list of engines itself. since we assume those engines
        #are on the same machine, just take the first one. len(ips) may be
        #< len(tables) if some tables are on the same host.
        ips = dict((t[0].bind.url.host, t) for t in tables)
        ip_loads = AppServiceMonitor.get_db_load(ips.keys())

        total_load = 0
        missing_loads = []
        no_connections = []
        have_loads = []

        for ip in ips:
            if ip not in ip_loads:
                missing_loads.append(ip)
            else:
                load, avg_load, conns, avg_conns, max_conns = ip_loads[ip]

                # remove high load machines from the pool.
                if load < 100:
                    max_load = max(load, avg_load)
                    total_load += max_load
                    have_loads.append((ip, max_load))
                else:
                    no_connections.append(ip)

        if total_load:
            avg_load = total_load / max(len(have_loads), 1)
            ip_weights = [(ip, 1 - load / total_load) for ip, load in have_loads]
        #if total_load is 0, which happens when have_loads is empty
        else:
            avg_load = 1.0
            ip_weights = [(ip, 1.0 / len(have_loads)) for ip, load in have_loads]

        if missing_loads or no_connections:
            #add in the missing load numbers with an average weight
            ip_weights.extend((ip, avg_load) for ip in missing_loads)

            #add in the over-connected machines with a 1% weight
            ip_weights.extend((ip, .01) for ip in no_connections)

        #rebalance the weights
        total_weight = sum(w[1] for w in ip_weights) or 1
        ip_weights = [(ip, weight / total_weight)
                      for ip, weight in ip_weights]

        r = random.random()
        for ip, load in ip_weights:
            if r < load:
                # print "db ip: %s" % str(ips[ip][0].metadata.bind.url.host)
                return ips[ip]
            r = r - load

        #should never happen
        print 'yer stupid'
        return  random.choice(list(tables))
Example #5
0
    def get_read_table(self, tables):
        """Select one of *tables* for reads, weighted by the load of each
        table's database host.

        tables -- iterable of table groups; each entry's first element is
                  bound to a SQLAlchemy engine (t[0].bind).
        Returns one entry of *tables*.
        """
        from r2.lib.services import AppServiceMonitor
        # short-cut for only one element
        if len(tables) == 1:
            return tables[0]

        if self.dead:
            tables = set(tables)
            dead = set(t for t in tables if t[0].bind in self.dead)
            for t in list(dead):
                # random.random() generates a random float in [0, 1).
                # db_dead_reconnect_prob is defined in the ini:
                # 0.01 gives a 1/100 chance of attempting a reconnect,
                # 1.00 attempts it every time.
                rand = random.random()
                # lazy %-args: only formatted if DEBUG logging is enabled
                logger.debug("if %s < %s , we are trying to reconnect...",
                             rand, self.db_dead_reconnect_prob)
                if (rand < self.db_dead_reconnect_prob
                        and self.test_engine(t[0].bind)):
                    dead.remove(t)
            #only apply changes to tables if there are changes to apply
            if dead:
                tables = tables - dead

        #'t' is a list of engines itself. since we assume those engines
        #are on the same machine, just take the first one. len(ips) may be
        #< len(tables) if some tables are on the same host.
        ips = dict((t[0].bind.url.host, t) for t in tables)
        ip_loads = AppServiceMonitor.get_db_load(ips.keys())

        total_load = 0
        missing_loads = []    # hosts without load data
        no_connections = []   # hosts reporting load >= 100
        have_loads = []       # (ip, max(current, average) load)

        for ip in ips:
            if ip not in ip_loads:
                missing_loads.append(ip)
            else:
                load, avg_load, conns, avg_conns, max_conns = ip_loads[ip]

                # remove high load machines from the pool.
                if load < 100:
                    max_load = max(load, avg_load)
                    total_load += max_load
                    have_loads.append((ip, max_load))
                else:
                    no_connections.append(ip)

        if total_load:
            avg_load = total_load / max(len(have_loads), 1)
            ip_weights = [(ip, 1 - load / total_load)
                          for ip, load in have_loads]
        #total_load is 0 when have_loads is empty (or all loads are zero)
        else:
            avg_load = 1.0
            ip_weights = [(ip, 1.0 / len(have_loads))
                          for ip, load in have_loads]

        if missing_loads or no_connections:
            #add in the missing load numbers with an average weight
            ip_weights.extend((ip, avg_load) for ip in missing_loads)

            #add in the over-connected machines with a 1% weight
            ip_weights.extend((ip, .01) for ip in no_connections)

        #rebalance the weights so they sum to 1
        total_weight = sum(w[1] for w in ip_weights) or 1
        ip_weights = [(ip, weight / total_weight)
                      for ip, weight in ip_weights]

        # roulette-wheel selection over the normalized weights
        r = random.random()
        for ip, load in ip_weights:
            if r < load:
                return ips[ip]
            r = r - load

        #should never happen
        logger.error("""I couldn't find any usable PGSQLs anymore. Maybe it is down or maybe I just think it is down.
        Restarting reddit or postgresql may be a good short-term fix for this. Please examine the logs more thorougly
        to attempt to find the time I lose all record of usable connections and fix whatever causes this.""")
        return random.choice(list(tables))
Example #6
0
def Run(srvname, *a, **kw):
    """Start an AppServiceMonitor on *srvname*.

    The optional 'queue_length_max' keyword is routed to the
    AppServiceMonitor constructor; all other arguments are passed
    through to monitor().
    """
    args = {}
    # dict.has_key() was removed in Python 3; 'in' works in both 2 and 3
    if 'queue_length_max' in kw:
        args['queue_length_max'] = kw.pop('queue_length_max')
    AppServiceMonitor(**args).monitor(srvname, *a, **kw)
Example #7
0
def Alert(restart_list=('MEM', 'CPU'),
          alert_recipients=None,
          alert_sender=nerds_email,
          cpu_limit=99,
          mem_limit=8,
          smtpserver=smtp_server,
          test=False):
    """Scan the monitored app servers and email an alert (auto-restarting
    the process where possible) for any service pegged on CPU or memory.

    restart_list     -- which resources may trigger action ('CPU', 'MEM')
    alert_recipients -- email recipients; defaults to [nerds_email]
    cpu_limit        -- CPU percent considered pegged (all sample windows)
    mem_limit        -- memory percent considered pegged
    test             -- if True, print what would happen instead of
                        restarting/mailing
    """
    # avoid the shared-mutable-default-argument pitfall
    if alert_recipients is None:
        alert_recipients = [nerds_email]

    # raw string so the \d escape is explicit
    p = re.compile(r"newreddit(\d+)")
    cache_key = 'already_alerted_'
    for host in AppServiceMonitor(g.monitored_servers):
        for service in host:
            # cpu usage over the 0/5/60/300-second windows
            cpu = [service.cpu(x) for x in (0, 5, 60, 300)]

            output = "\nCPU:   " + ' '.join("%6.2f%%" % x for x in cpu)
            output += "\nMEMORY: %6.2f%%" % service.mem()

            service_name = "%s %s" % (host.host, service.name)

            # is this service pegged?
            mem_pegged = ('MEM' in restart_list and service.mem() > mem_limit)
            need_restart = (('CPU' in restart_list and all(x >= cpu_limit
                                                           for x in cpu))
                            or mem_pegged)

            if need_restart:
                mesg = ("To: " + ', '.join(alert_recipients) + "\nSubject: " +
                        service_name + " needs attention\n\n" + service_name +
                        (" is out of mem: " if mem_pegged else " is pegged:") +
                        output)
                m = p.match(service.name)
                # If we can restart this process, we do it here
                if m:
                    proc_number = str(m.groups()[0])
                    cmd = "/usr/local/bin/push -h " + \
                        host.host + " -r " + proc_number
                    if test:
                        print(
                            "would have restarted the app with command '%s'" %
                            cmd)
                    else:
                        result = os.popen3(cmd)[2].read()
                        # We override the other message to show we restarted it
                        mesg = ("To: [email protected]\n" + "Subject: " +
                                "Process " + proc_number + " on " + host.host +
                                " was automatically restarted " +
                                "due to the following:\n\n" + output + "\n\n" +
                                "Here was the output:\n" + result)
                    # Uncomment this to disable restart messages
                    #mesg = ""
                last_alerted = g.rendercache.get(cache_key + service_name) or 0
                # BUG FIX: was 'mesg is not ""' which tests object identity,
                # not string equality, and only worked by interning accident
                if mesg != "":
                    if test:
                        print("would have sent email\n '%s'" % mesg)
                    # rate-limit: at most one alert per service per 5 minutes
                    elif (time.time() - last_alerted > 300):
                        g.rendercache.set(cache_key + service_name,
                                          time.time())
                        session = smtplib.SMTP(smtpserver)
                        smtpresult = session.sendmail(alert_sender,
                                                      alert_recipients, mesg)
                        session.quit()