Example #1
def scale_cluster(token, cluster_id, cluster_delta, status='Pending'):
    """
    Scales an active cluster by cluster_delta (signed int).
    For scaling up finds the cluster settings and last internal ip/port slave
    and "appends" cluster_delta nodes.
    For scaling down it removes the last slave. 
    """
    from reroute_ssh import reroute_ssh_to_slaves
    from run_ansible_playbooks import modify_ansible_hosts_file, ansible_scale_cluster, ansible_manage_cluster
    cluster_to_scale = ClusterInfo.objects.get(id=cluster_id)
    pre_scale_size = cluster_to_scale.cluster_size
    previous_cluster_status = cluster_to_scale.cluster_status
    previous_hadoop_status = cluster_to_scale.hadoop_status
    status_map = {"0":"Destroyed","1":"Active","2":"Pending","3":"Failed"}
    # pre-flight checks. If cluster status is pending or hadoop status formatting abort.
    if (previous_cluster_status == const_cluster_status_pending) or (previous_hadoop_status == const_hadoop_status_format):
        current_task.update_state(state="Skipping")
        return cluster_to_scale.cluster_name
    # pre-flight checks done
    current_task.update_state(state="Started")
    auth = check_credentials(unmask_token(encrypt_key,token))
    current_task.update_state(state="Authenticated")
    endpoints, user_id = endpoints_and_user_id(auth)
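    # Initialize the kamaki clients used below: Cyclades (compute), the Cyclades network client and Plankton (image registry).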
    cyclades = init_cyclades(endpoints['cyclades'], unmask_token(encrypt_key,token))
    netclient = init_cyclades_netclient(endpoints['network'], unmask_token(encrypt_key,token))
    plankton = init_plankton(endpoints['plankton'], unmask_token(encrypt_key,token))
    state = ''
    list_of_new_slaves = []
    cluster_name_suffix_id = '{0}-{1}'.format(cluster_to_scale.cluster_name, cluster_id)
    if cluster_delta < 0: # scale down
        for counter in range(cluster_delta, 0):
            state = "Starting node decommission for %s" % (cluster_to_scale.cluster_name)
            set_cluster_state(token, cluster_id, state)
            try:
                node_fqdn, node_id = find_node_to_remove(cluster_to_scale, cyclades, netclient)
                state = "Decommissioning Node %s from %s" % (node_fqdn,cluster_to_scale.cluster_name)
                set_cluster_state(token, cluster_id, state)
                ansible_hosts = modify_ansible_hosts_file(cluster_name_suffix_id, action='remove_slaves',
                                                         slave_hostname=node_fqdn)
                ansible_scale_cluster(ansible_hosts, action='remove_slaves', slave_hostname=node_fqdn.split('.')[0])
            except Exception as e:
                msg = str(e.args[0])
                set_cluster_state(token, cluster_id, state=msg, status=status_map[previous_cluster_status],
                                  error=msg)
                raise RuntimeError(msg)
            state = "Node %s decommissioned from %s and will be deleted"% (node_fqdn, cluster_to_scale.cluster_name)
            cluster_remove_node(node_fqdn, node_id, token, cluster_id, cluster_to_scale, cyclades,
                                status_map[previous_cluster_status])
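A quick, self-contained illustration (not part of the project code) of the loop bound in the scale-down branch above: because cluster_delta is negative, range(cluster_delta, 0) iterates exactly abs(cluster_delta) times, once per node to decommission.

# Sketch only: shows how many times the decommission loop above runs.
cluster_delta = -3
for counter in range(cluster_delta, 0):
    print(counter)  # prints -3, -2, -1 -> three slaves would be removed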
Example #2
def rollback_scale_cluster(list_of_slaves, cyclades, cluster_to_scale, size, ansible=False):
    """
    Rollback cluster when scale add node fail. More rollback actions when ansible has failed during
    hadoop configurations for the new nodes.
    """
    from run_ansible_playbooks import modify_ansible_hosts_file, ansible_scale_cluster
    cluster_name_suffix_id = '{0}-{1}'.format(cluster_to_scale.cluster_name, cluster_to_scale.id)
    for slave in list_of_slaves:
        cyclades.delete_server(slave['id'])
    if ansible:
        for slave in list_of_slaves:
            modify_ansible_hosts_file(cluster_name_suffix_id, action='remove_slaves', slave_hostname=slave['fqdn'])           
        ansible_hosts = modify_ansible_hosts_file(cluster_name_suffix_id, action='join_slaves')
        ansible_scale_cluster(ansible_hosts, action='rollback_scale_cluster')
    cluster_to_scale.cluster_size = size
    cluster_to_scale.save()
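For orientation, rollback_scale_cluster indexes each slave entry by 'id' and 'fqdn', and example #3 below also reads 'port', 'private_ip' and 'password' from the same records. A minimal sketch of such an entry, with invented values, assuming these are the only keys the snippets rely on:

# Hypothetical slave record of the kind held in list_of_slaves / list_of_new_slaves.
# The keys mirror the dictionary accesses in the examples; the values are made up.
example_slave = {
    'id': 680123,                         # Cyclades server id, used by cyclades.delete_server()
    'fqdn': 'snf-680123.vm.example.org',  # hostname removed from / added to the Ansible hosts file
    'port': 10022,                        # SSH port rerouted through the master node
    'private_ip': '192.168.0.4',          # internal address of the new slave
    'password': 'generated-root-pass',    # root password used for the SSH reroute
}
list_of_slaves = [example_slave]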
Example #3
                reroute_ssh_to_slaves(new_slave['port'], new_slave['private_ip'], master_ip, new_slave['password'],
                                      '', linux_dist)
        except Exception as e:
            msg = '{0}. Scale action failed. Cluster rolled back'.format(str(e.args[0]))
            set_cluster_state(token, cluster_id, msg)
            rollback_scale_cluster(list_of_new_slaves, cyclades, cluster_to_scale, pre_scale_size)
            set_cluster_state(token, cluster_id, state=msg, status=status_map[previous_cluster_status],
                              error=msg)
            raise RuntimeError(msg)
        try:
            ansible_hosts = modify_ansible_hosts_file(cluster_name_suffix_id, list_of_hosts=list_of_new_slaves,
                                                      master_ip=master_ip,
                                                      action='add_slaves')
            state = 'Configuring Hadoop for new nodes of %s ' % cluster_to_scale.cluster_name
            set_cluster_state(token, cluster_id, state)
            ansible_scale_cluster(ansible_hosts, new_slaves_size=len(list_of_new_slaves), orka_image_uuid=image_id,
                                  user_id=user_id)
            modify_ansible_hosts_file(cluster_name_suffix_id, action='join_slaves')
        except Exception as e:
            msg = '{0}. Scale action failed. Cluster rolled back'.format(str(e.args[0]))
            set_cluster_state(token, cluster_id, msg)
            rollback_scale_cluster(list_of_new_slaves, cyclades, cluster_to_scale, pre_scale_size, ansible=True)
            set_cluster_state(token, cluster_id, state=msg, status=status_map[previous_cluster_status],
                              error=msg)
            raise RuntimeError(msg)
        finally:
            subprocess.call('rm -rf /tmp/{0}'.format(user_id), shell=True)
    # Restart hadoop cluster for changes to take effect
    state = "Restarting %s for the changes to take effect" % (cluster_to_scale.cluster_name)
    set_cluster_state(token, cluster_id, state)
    try:
        if REVERSE_HADOOP_STATUS[previous_hadoop_status] == 'stop':
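The fragment stops right where it decides how to bring Hadoop back up after scaling. A hypothetical, self-contained sketch of that dispatch pattern, assuming a REVERSE_HADOOP_STATUS-style mapping from the stored hadoop_status code to an action; the actual codes, mapping and the management call used by the project are not shown in these fragments and may differ:

# Sketch only, with assumed status codes: decide whether Hadoop should be left
# stopped or restarted so the newly added/removed slaves take effect.
REVERSE_HADOOP_STATUS_SKETCH = {"0": "stop", "1": "start", "2": "format"}  # assumed codes

def restart_action(previous_hadoop_status):
    if REVERSE_HADOOP_STATUS_SKETCH[previous_hadoop_status] == 'stop':
        return 'stop'   # Hadoop was not running before the scale; leave it stopped
    return 'start'      # Hadoop was running; restart it so the changes take effect

print(restart_action("1"))  # -> 'start'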