Example #1
def loop():
    while True:
        try:
            tick()
        except Exception, e:
            LOG.warning("Exception in main loop: %s" % traceback.format_exc())
        time.sleep(int(config.get_value(KEY_LOOP_INTERVAL_SECS)))
Example #2
def get_ssh_keys():
    keys = re.split(r'\s*,\s*', config.get_value(KEY_SSH_KEYS))
    for i in range(0, len(keys)):
        key = keys[i]
        if key[0] == '$':
            var_name = key[1:]
            key_file = KEY_FILE_NAME_PATTERN % var_name
            if not os.path.isfile(key_file):
                key_value = os.environ.get(var_name)
                if key_value:
                    marker_begin = '-----BEGIN RSA PRIVATE KEY-----'
                    marker_end = '-----END RSA PRIVATE KEY-----'
                    key_value = key_value.replace(marker_begin, '')
                    key_value = key_value.replace(marker_end, '')
                    key_value = key_value.replace(' ', '\n')
                    if marker_begin not in key_value:
                        key_value = ('%s\n' % marker_begin) + key_value
                    if marker_end not in key_value:
                        key_value += ('\n%s' % marker_end)
                    key_value = key_value.replace('\n\n', '\n')
                    save_file(key_file, key_value)
                    run('chmod 600 %s' % key_file)
                else:
                    LOG.warning(
                        'Unable to read SSH key from environment variable: %s'
                        % var_name)
            keys[i] = key_file
    return keys
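The '$'-prefixed branch above rebuilds a PEM private key that was flattened into a single environment-variable line (newlines collapsed to spaces). A minimal self-contained sketch of just that step; the helper name rebuild_pem and the sample value are illustrative assumptions, not part of the source:

def rebuild_pem(one_line_key):
    # Restore a multi-line PEM block from a value whose newlines became spaces.
    begin = '-----BEGIN RSA PRIVATE KEY-----'
    end = '-----END RSA PRIVATE KEY-----'
    body = one_line_key.replace(begin, '').replace(end, '').strip()
    return '%s\n%s\n%s' % (begin, body.replace(' ', '\n'), end)

# e.g. rebuild_pem('-----BEGIN RSA PRIVATE KEY----- MIIEow... -----END RSA PRIVATE KEY-----')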
Example #3
def get_cluster_nodes(cluster_id, role=None):
    emr_client = connect_emr(role=role)
    result = run_func(emr_client.list_instances, ClusterId=cluster_id,
                      InstanceStates=['AWAITING_FULFILLMENT', 'PROVISIONING', 'BOOTSTRAPPING', 'RUNNING'],
                      cache_duration_secs=QUERY_CACHE_TIMEOUT)
    result = json.loads(result)
    result = result['Instances']

    # read domain name config
    custom_dn = config.get_value(constants.KEY_CUSTOM_DOMAIN_NAME, section=SECTION_EMR, resource=cluster_id)

    i = 0
    while i < len(result):
        inst = result[i]
        if inst['Status']['State'] == INSTANCE_STATE_TERMINATED:
            del result[i]
            i -= 1
        else:
            inst['cid'] = inst['Id'] if 'Id' in inst else 'n/a'
            inst['iid'] = inst['Ec2InstanceId'] if 'Ec2InstanceId' in inst else 'n/a'
            inst['gid'] = inst['InstanceGroupId'] if 'InstanceGroupId' in inst else 'n/a'
            inst['ip'] = inst['PrivateIpAddress'] if 'PrivateIpAddress' in inst else 'n/a'
            inst['host'] = inst['PrivateDnsName'] if 'PrivateDnsName' in inst else 'n/a'
            if custom_dn:
                inst['host'] = ip_to_hostname(hostname_to_ip(inst['host']), custom_dn)
            inst['type'] = get_instance_group_type(cluster_id, inst['InstanceGroupId'], role=role)
            inst['state'] = inst['Status']['State']
            inst['market'] = get_instance_group_details(cluster_id, inst['InstanceGroupId'], role=role)['Market']
        i += 1
    return result
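The while loop above removes terminated instances in place (hence the del / i -= 1 bookkeeping) and decorates the remaining entries. The removal step alone is equivalent to the following filter, shown only as a sketch, with the decoration then applied to what is left:

result = [inst for inst in result
          if inst['Status']['State'] != INSTANCE_STATE_TERMINATED]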
Example #4
def get_node_queries(cluster):
    cmd = ('presto-cli --execute \\"SELECT n.http_uri,count(q.node_id) from system.runtime.nodes n ' +
        'left join (select * from system.runtime.queries where state = \'RUNNING\' ) as q ' +
        'on q.node_id = n.node_id group by n.http_uri\\"')

    result = {}
    if cluster.ip == 'localhost':
        # for testing purposes
        return result

    # run ssh command
    out = run_ssh(cmd, cluster.ip, user='******', cache_duration_secs=QUERY_CACHE_TIMEOUT)

    # remove SSH log output line
    out = remove_lines_from_string(out, r'.*Permanently added.*')

    # read config for domain
    custom_dn = config.get_value(constants.KEY_CUSTOM_DOMAIN_NAME, section=SECTION_EMR, resource=cluster.id)
    # assume input is actually domain name (not ip)
    dn = custom_dn if custom_dn else re.match(r'ip-[^\.]+\.(.+)', cluster.ip).group(1)

    for line in out.splitlines():
        ip = re.sub(r'.*http://([0-9\.]+):.*', r'\1', line)
        if ip:
            queries = re.sub(r'.*"([0-9\.]+)"$', r'\1', line)
            host = aws_common.ip_to_hostname(ip, dn)
            try:
                result[host] = int(queries)
            except Exception, e:
                result[host] = 0
    return result
Example #5
def run_ssh(cmd, host, user=None, keys=None, via_hosts=[], cache_duration_secs=0):
    if not keys:
        keys = config.get_value(KEY_SSH_KEYS).split(',')

    user = '******' % user if user else ''

    agent_forward = ''
    forward_addendum = ''
    ssh_configs = ('-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ' +
        '-o PasswordAuthentication=no -o BatchMode=yes -o ConnectTimeout=3')

    if len(via_hosts) > 0:
        agent_forward = '-o ForwardAgent=yes'
        for via_host in list(reversed(via_hosts)):
            forward_addendum = ('ssh %s %s%s ' % (ssh_configs, user, via_host)) + forward_addendum

    ssh_cmd_tmpl = 'ssh ' + ssh_configs + ' ' + agent_forward + ' -i %s %s%s "' + forward_addendum + '%s"'

    for key in keys:
        key = key.strip()
        ssh_cmd = ssh_cmd_tmpl % (key, user, host, cmd)

        if len(via_hosts) > 0:
            run('ssh-add %s 2>&1 > /dev/null' % key)

        try:
            out = run(ssh_cmd, cache_duration_secs)
            return out
        except subprocess.CalledProcessError, e:
            # TODO find a more elegant solution for this.
            if 'Permission denied (publickey)' not in e.output:
                raise e
Example #6
def update_resources(resource_config):
    for resource in resource_config:
        id = resource.id
        enabled = config.get_value('enable_enhanced_monitoring',
                                   section=SECTION_KINESIS,
                                   resource=id)
        if enabled == 'true':
            resource.enhanced_monitoring = [MONITORING_METRICS_ALL]
    return resource_config
Example #7
def select_tasknode_group(tasknodes_groups, cluster_id):
    if len(tasknodes_groups) <= 0:
        raise Exception("Empty list of task node instance groups for scaling: %s" % tasknodes_groups)
    if len(tasknodes_groups) == 1:
        return tasknodes_groups[0]
    preferred = config.get_value(KEY_PREFERRED_UPSCALE_INSTANCE_MARKET, cluster_id)
    for group in tasknodes_groups:
        if group['market'] == preferred:
            return group
    raise Exception("Could not select task node instance group for preferred market '%s': %s" %
            (preferred, tasknodes_groups))
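A small illustration of the selection logic with hypothetical values (the instance-group entries and cluster ID below are made up), assuming the preferred upscale market is configured as 'SPOT':

groups = [{'id': 'ig-1', 'market': 'ON_DEMAND'},
          {'id': 'ig-2', 'market': 'SPOT'}]
# select_tasknode_group(groups, 'j-ABC123') would return the 'SPOT' entry;
# a single-element list is returned directly, and an empty list raises.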
Example #8
def get_state(cluster_id):
    """ Get cluster state
        ---
        operationId: 'getState'
        parameters:
            - name: cluster_id
              in: path
    """
    monitoring_interval_secs = int(config.get_value(KEY_MONITORING_INTERVAL_SECS))
    info = monitoring.collect_info(CLUSTERS[cluster_id], monitoring_interval_secs=monitoring_interval_secs)
    return jsonify(info)
Example #9
def loop():
    while True:
        LOG.info("Running next loop iteration")
        try:
            resource_list = resources.get_resources()

            for resource in resource_list:
                resource.fetch_data()
                scaling_required = resource.needs_scaling()
                if scaling_required:
                    resource.perform_scaling(scaling_required)

        except Exception, e:
            LOG.warning("Exception in main loop: %s" % traceback.format_exc())
        time.sleep(int(config.get_value(KEY_LOOP_INTERVAL_SECS)))
Example #10
def get_emr_costs():
    """ Get summary of cluster costs and cost savings
        ---
        operationId: 'getEmrCosts'
        parameters:
            - name: 'request'
              in: body
    """
    data = json.loads(request.data)
    cluster_id = data['cluster_id']
    num_datapoints = data['num_datapoints'] if 'num_datapoints' in data else 300
    baseline_nodes = (data['baseline_nodes'] if 'baseline_nodes' in data else
        config.get_value(KEY_BASELINE_COMPARISON_NODES, section=SECTION_EMR, resource=cluster_id, default=20))
    baseline_nodes = int(baseline_nodes)
    info = database.history_get(section=SECTION_EMR, resource=cluster_id, limit=num_datapoints)
    common.remove_NaN(info)
    result = aws_pricing.get_cluster_savings(info, baseline_nodes)
    common.remove_NaN(result, delete_values=False, replacement=0)
    return jsonify(results=result, baseline_nodes=baseline_nodes)
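For reference, the request body consumed by the handler above looks like the following (values hypothetical); num_datapoints and baseline_nodes are optional, defaulting to 300 and the configured KEY_BASELINE_COMPARISON_NODES (with a fallback of 20):

data = {
    'cluster_id': 'j-ABC123',   # required
    'num_datapoints': 300,      # optional, default 300
    'baseline_nodes': 20        # optional, default from config (fallback 20)
}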
Example #11
def tick():
    LOG.info("Running next loop iteration")
    monitoring_interval_secs = int(config.get_value(KEY_MONITORING_INTERVAL_SECS))
    for cluster_id, details in CLUSTERS.iteritems():
        cluster_ip = details['ip_public']
        info = None
        try:
            info = monitoring.collect_info(details, monitoring_interval_secs=monitoring_interval_secs)
        except Exception, e:
            LOG.warning("Error getting monitoring info for cluster %s: %s" % (cluster_id, e))
        if info:
            action = 'N/A'
            # Make sure we are only resizing Presto clusters atm
            if details['type'] == 'Presto':
                # Make sure we don't change clusters that are not configured
                if cluster_id in get_autoscaling_clusters():
                    try:
                        nodes_to_terminate = get_nodes_to_terminate(info)
                        print nodes_to_terminate
                        if len(nodes_to_terminate) > 0:
                            for node in nodes_to_terminate:
                                terminate_node(cluster_ip, node['ip'], node['gid'])
                            action = 'DOWNSCALE(-%s)' % len(nodes_to_terminate)
                        else:
                            nodes_to_add = get_nodes_to_add(info)
                            if len(nodes_to_add) > 0:
                                tasknodes_groups = aws_common.get_instance_groups_tasknodes(cluster_id)
                                tasknodes_group = select_tasknode_group(tasknodes_groups, cluster_id)['id']
                                current_num_nodes = len([n for key, n in info['nodes'].iteritems()
                                    if n['gid'] == tasknodes_group])
                                spawn_nodes(cluster_ip, tasknodes_group, current_num_nodes, len(nodes_to_add))
                                action = 'UPSCALE(+%s)' % len(nodes_to_add)
                            else:
                                action = 'NOTHING'
                    except Exception, e:
                        LOG.warning("WARNING: Error downscaling/upscaling cluster %s: %s" %
                            (cluster_id, traceback.format_exc(e)))
                    # clean up and terminate instances whose nodes are already in inactive state
                    aws_common.terminate_inactive_nodes(cluster_ip, info['nodes'])
            # store the state for future reference
            monitoring.history_add(cluster_id, info, action)
Example #12
def get_iam_role_for_cluster(cluster):
    if not isinstance(cluster, basestring):
        cluster = cluster.id
    return config.get_value('role_to_assume', section=SECTION_EMR, resource=cluster)
Example #13
def get_iam_role_for_stream(stream):
    if not isinstance(stream, basestring):
        stream = stream.id
    return config.get_value('role_to_assume',
                            section=SECTION_KINESIS,
                            resource=stream)
Example #14
def get_autoscaling_clusters():
    return re.split(r'\s*,\s*', config.get_value(KEY_AUTOSCALING_CLUSTERS))
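For illustration, the regex split tolerates whitespace around the commas, unlike the plain .split(',') used in run_ssh above (which strips each key afterwards instead); the cluster IDs below are hypothetical:

import re
re.split(r'\s*,\s*', 'j-ABC123, j-DEF456 ,j-GHI789')
# -> ['j-ABC123', 'j-DEF456', 'j-GHI789']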