def become_leader(connection):
    """Assume leadership of the collective and recruit the other members.

    Any existing follower is killed (and config following is stopped), any
    pending retry timer is cancelled, and the local address of
    ``connection`` is recorded as the current leader.  An assimilation
    attempt is then spawned for every collective member that is not this
    node and whose address is neither the new leader address, the peer
    address of ``connection``, nor an entry of ``cfm.cfgstreams``.  Finally
    a rebalance is scheduled.

    :param connection: socket-like object whose ``getsockname()`` and
        ``getpeername()`` supply the local and peer addresses.
    """
    global currentleader
    global follower
    global retrythread
    log.log({'info': 'Becoming leader of collective',
             'subsystem': 'collective'})
    if follower:
        follower.kill()
        cfm.stop_following()
        follower = None
    if retrythread:
        retrythread.cancel()
        retrythread = None
    currentleader = connection.getsockname()[0]
    peeraddr = connection.getpeername()[0]
    ourname = get_myname()
    # Addresses that must not receive an assimilation attempt.
    excluded = set(cfm.cfgstreams)
    excluded.update((currentleader, peeraddr))
    for membername in cfm.list_collective():
        address = cfm.get_collective_member(membername)['address']
        if address not in excluded and membername != ourname:
            eventlet.spawn_n(try_assimilate, address)
    schedule_rebalance()
def start_collective():
    """Try the other collective members, in sorted order, as our leader.

    Any existing follower is killed first.  Nothing further happens when
    ``cfm.cfgstreams`` is non-empty and ``cfm.check_quorum()`` succeeds, or
    when ``leader_init`` is active.  Otherwise each member other than this
    node is tried with ``connect_to_leader`` until one call returns a truthy
    value; if none does, another ``start_collective`` run is scheduled
    roughly 30 seconds later and stored in ``retrythread``.
    """
    global follower
    global retrythread
    if follower:
        follower.kill()
        cfm.stop_following()
        follower = None
    try:
        if cfm.cfgstreams:
            # A DegradedCollective from check_quorum() falls through to the
            # connection attempts below; a clean return means we have quorum
            # and are leader, so there is nothing to start.
            cfm.check_quorum()
            return
    except exc.DegradedCollective:
        pass
    if leader_init.active:
        # Do not start trying to connect while we are transmitting data to
        # a follower.
        return
    ourname = get_myname()
    for peer in sorted(cfm.list_collective()):
        if peer == ourname:
            continue
        if cfm.cfgleader is None:
            cfm.stop_following(True)
        candidate = cfm.get_collective_member(peer)['address']
        startmsg = 'Performing startup attempt to {0}'.format(candidate)
        log.log({'info': startmsg, 'subsystem': 'collective'})
        if connect_to_leader(name=ourname, leader=candidate):
            break
    else:
        # The loop finished without a successful connection: retry later.
        retrythread = eventlet.spawn_after(30 + random.random(),
                                           start_collective)
def handle_dispatch(connection, cert, dispatch, peername):
    """Run a request dispatched by a collective peer and stream the results.

    The peer certificate is compared with the fingerprint stored for
    ``peername``; on mismatch the connection is closed without a reply.
    ``dispatch`` must begin with the two-byte msgpack magic prefix, otherwise
    the connection is closed as well.  The remaining payload is unpacked and
    its ``tenant``, ``nodes``, ``inputdata``, ``operation``, ``path`` and
    ``isnoderange`` entries drive the plugin lookup.  Every handler result
    (or the exception raised while producing results) is passed to
    ``_forward_rsp``, after which eight zero bytes are written to
    ``connection``.

    :param connection: socket-like object used for the reply.
    :param cert: peer certificate object accepted by
        ``crypto.dump_certificate``.
    :param dispatch: bytes payload received from the peer.
    :param peername: collective member name the peer claims to be.
    """
    cert = crypto.dump_certificate(crypto.FILETYPE_ASN1, cert)
    if not util.cert_matches(
            cfm.get_collective_member(peername)['fingerprint'], cert):
        connection.close()
        return
    if dispatch[0:2] != b'\x01\x03':  # magic value to indicate msgpack
        # We only support msgpack now
        # The magic should preclude any pickle, as the first byte can never be
        # under 0x20 or so.
        connection.close()
        return
    dispatch = msgpack.unpackb(dispatch[2:], raw=False)
    configmanager = cfm.ConfigManager(dispatch['tenant'])
    nodes = dispatch['nodes']
    inputdata = dispatch['inputdata']
    operation = dispatch['operation']
    pathcomponents = dispatch['path']
    routespec = nested_lookup(noderesources, pathcomponents)
    inputdata = msg.get_input_message(
        pathcomponents, operation, inputdata, nodes, dispatch['isnoderange'],
        configmanager)
    plugroute = routespec.routeinfo
    plugpath = None
    nodesbyhandler = {}
    passvalues = []
    nodeattr = configmanager.get_node_attributes(
        nodes, plugroute['pluginattrs'])
    for node in nodes:
        for attrname in plugroute['pluginattrs']:
            if attrname in nodeattr[node]:
                plugpath = nodeattr[node][attrname]['value']
            elif 'default' in plugroute:
                plugpath = plugroute['default']
        if plugpath:
            try:
                hfunc = getattr(pluginmap[plugpath], operation)
            except KeyError:
                # Unknown plugin name: the BadPlugin error callable becomes
                # the handler for this node.
                nodesbyhandler[BadPlugin(node, plugpath).error] = [node]
                continue
            nodesbyhandler.setdefault(hfunc, []).append(node)
    try:
        for hfunc in nodesbyhandler:
            passvalues.append(hfunc(
                nodes=nodesbyhandler[hfunc], element=pathcomponents,
                configmanager=configmanager, inputdata=inputdata))
        for res in itertools.chain(*passvalues):
            _forward_rsp(connection, res)
    except Exception as res:
        # Forward the failure itself so the peer receives it as a response.
        _forward_rsp(connection, res)
    # Must be a bytes literal: sendall() rejects text strings on Python 3.
    # Presumably the eight zero bytes mark the end of the response stream.
    connection.sendall(b'\x00\x00\x00\x00\x00\x00\x00\x00')
def start_collective():
    """Open connections to all other members and try to follow one of them.

    ``initting`` is set for the duration of the call and ``retrythread`` is
    cleared on entry.  When no follower exists, ``create_connection`` is run
    for every other member's address through a green pool, and
    ``connect_to_leader`` is attempted on each successfully opened
    connection for as long as ``follower`` is still ``None``; connections
    that are not used are closed.  Whatever path is taken, the ``finally``
    clause schedules another run about five seconds later when there is
    still neither a follower nor a pending retry.
    """
    global follower
    global retrythread
    global initting
    initting = True
    retrythread = None
    try:
        cfm.membership_callback = schedule_rebalance
        if follower is not None:
            # Already following something; the finally clause still runs.
            initting = False
            return
        try:
            if cfm.cfgstreams:
                cfm.check_quorum()
                # Do not start if we have quorum and are leader
                return
        except exc.DegradedCollective:
            pass
        if leader_init.active:  # do not start trying to connect if we are
            # xmitting data to a follower
            return
        myname = get_myname()
        connecto = []
        for member in sorted(list(cfm.list_collective())):
            if member == myname:
                continue
            if cfm.cfgleader is None:
                cfm.stop_following(True)
            ldrcandidate = cfm.get_collective_member(member)['address']
            connecto.append(ldrcandidate)
        conpool = greenpool.GreenPool(64)
        connections = conpool.imap(create_connection, connecto)
        for ent in connections:
            # Each entry is unpacked as a pair; the second element is an
            # Exception instance when the connection attempt failed.
            member, remote = ent
            if isinstance(remote, Exception):
                continue
            if follower is None:
                log.log({
                    'info': 'Performing startup attempt to {0}'.format(member),
                    'subsystem': 'collective'
                })
                if not connect_to_leader(
                        name=myname, leader=member, remote=remote):
                    remote.close()
            else:
                # A follower was established meanwhile; drop the spare
                # connection.
                remote.close()
    except Exception as e:
        # NOTE(review): every error is swallowed without logging; recovery
        # relies solely on the retry scheduled in the finally clause.
        pass
    finally:
        if retrythread is None and follower is None:
            retrythread = eventlet.spawn_after(5 + random.random(),
                                               start_collective)
        initting = False
def connect_node(node, configmanager, username=None):
    """Return the console object to use for ``node``.

    When the node's ``collective.manager`` attribute has a truthy value that
    differs from this system's name, a ``ProxyConsole`` built from that
    member's info is returned.  Otherwise the ``ConsoleHandler`` cached in
    ``_handled_consoles`` under ``(node, configmanager.tenant)`` is
    returned, being created first when absent.

    :param node: node name.
    :param configmanager: configuration manager used for the attribute
        lookup and handed to the console object.
    :param username: passed through to ``ProxyConsole`` only.
    """
    attributes = configmanager.get_node_attributes(node, 'collective.manager')
    managerattr = attributes.get(node, {}).get('collective.manager', {})
    manager = managerattr.get('value', None)
    ourname = collective.get_myname()
    if manager and manager != collective.get_myname():
        managerinfo = configmodule.get_collective_member(manager)
        return ProxyConsole(node, managerinfo, ourname, configmanager,
                            username)
    cachekey = (node, configmanager.tenant)
    if cachekey in _handled_consoles:
        return _handled_consoles[cachekey]
    _handled_consoles[cachekey] = ConsoleHandler(node, configmanager)
    return _handled_consoles[cachekey]
def start_proxy_term(connection, cert, request):
    """Serve a proxied console session requested by a collective member.

    The certificate is compared with the fingerprint stored for
    ``request['name']``; on mismatch the connection is closed and nothing
    else happens.  Otherwise a ``ConsoleSession`` for ``request['node']`` is
    created whose data callback is the ``sendall`` of a ``ClientConsole``
    wrapping ``connection``, and control is handed to ``term_interact``.

    :param connection: socket-like object for the requesting peer.
    :param cert: peer certificate object accepted by
        ``crypto.dump_certificate``.
    :param request: mapping read for the keys ``name``, ``tenant``,
        ``node``, ``user`` and ``skipreplay``.
    """
    dercert = crypto.dump_certificate(crypto.FILETYPE_ASN1, cert)
    memberinfo = configmanager.get_collective_member(request['name'])
    trusted = util.cert_matches(memberinfo['fingerprint'], dercert)
    if not trusted:
        connection.close()
        return
    tenantcfg = configmanager.ConfigManager(request['tenant'])
    clientconsole = ClientConsole(connection)
    session = consoleserver.ConsoleSession(
        node=request['node'],
        configmanager=tenantcfg,
        username=request['user'],
        datacallback=clientconsole.sendall,
        skipreplay=request['skipreplay'])
    term_interact(None, None, clientconsole, None, connection, session, None)
def become_leader(connection):
    """Assume leadership and spawn assimilation attempts for other members.

    Any existing follower is killed and the local address of ``connection``
    is recorded as the current leader.  Every collective member other than
    this node, whose address is neither the new leader address nor the peer
    address of ``connection``, gets a ``try_assimilate`` call spawned for
    its address.

    :param connection: socket-like object whose ``getsockname()`` and
        ``getpeername()`` supply the local and peer addresses.
    """
    global currentleader
    global follower
    if follower:
        follower.kill()
        follower = None
    currentleader = connection.getsockname()[0]
    peeraddr = connection.getpeername()[0]
    ourname = get_myname()
    skipped = (currentleader, peeraddr)
    for membername in cfm.list_collective():
        address = cfm.get_collective_member(membername)['address']
        if address not in skipped and membername != ourname:
            eventlet.spawn_n(try_assimilate, address)
def handle_dispatch(connection, cert, dispatch, peername):
    """Run a pickled request dispatched by a collective peer.

    The peer certificate is compared with the fingerprint stored for
    ``peername``; on mismatch the connection is closed without a reply.
    When the payload starts with byte 0x80, its second byte is remembered as
    ``pversion`` and later handed to ``_forward_rsp``; otherwise ``pversion``
    stays 0.  The unpickled request's ``tenant``, ``nodes``, ``inputdata``,
    ``operation`` and ``path`` entries drive the plugin lookup.  Every
    handler result (or the exception raised while producing results) is
    passed to ``_forward_rsp``, after which eight zero bytes are written to
    ``connection``.

    :param connection: socket-like object used for the reply.
    :param cert: peer certificate object accepted by
        ``crypto.dump_certificate``.
    :param dispatch: bytes payload received from the peer.
    :param peername: collective member name the peer claims to be.
    """
    cert = crypto.dump_certificate(crypto.FILETYPE_ASN1, cert)
    if not util.cert_matches(
            cfm.get_collective_member(peername)['fingerprint'], cert):
        connection.close()
        return
    pversion = 0
    header = bytearray(dispatch)
    if header[0] == 0x80:
        pversion = header[1]
    # SECURITY NOTE(review): pickle.loads executes whatever the payload
    # dictates.  The only guard visible here is the certificate check above,
    # so a compromised peer can run arbitrary code; a data-only
    # serialization format would remove that exposure.
    dispatch = pickle.loads(dispatch, **pargs)
    configmanager = cfm.ConfigManager(dispatch['tenant'])
    nodes = dispatch['nodes']
    inputdata = dispatch['inputdata']
    operation = dispatch['operation']
    pathcomponents = dispatch['path']
    routespec = nested_lookup(noderesources, pathcomponents)
    plugroute = routespec.routeinfo
    plugpath = None
    nodesbyhandler = {}
    passvalues = []
    nodeattr = configmanager.get_node_attributes(
        nodes, plugroute['pluginattrs'])
    for node in nodes:
        for attrname in plugroute['pluginattrs']:
            if attrname in nodeattr[node]:
                plugpath = nodeattr[node][attrname]['value']
            elif 'default' in plugroute:
                plugpath = plugroute['default']
        if plugpath:
            try:
                hfunc = getattr(pluginmap[plugpath], operation)
            except KeyError:
                # Unknown plugin name: the BadPlugin error callable becomes
                # the handler for this node.
                nodesbyhandler[BadPlugin(node, plugpath).error] = [node]
                continue
            nodesbyhandler.setdefault(hfunc, []).append(node)
    try:
        for hfunc in nodesbyhandler:
            passvalues.append(hfunc(
                nodes=nodesbyhandler[hfunc], element=pathcomponents,
                configmanager=configmanager, inputdata=inputdata))
        for res in itertools.chain(*passvalues):
            _forward_rsp(connection, res, pversion)
    except Exception as res:
        # Forward the failure itself so the peer receives it as a response.
        _forward_rsp(connection, res, pversion)
    # Must be a bytes literal: sendall() rejects text strings on Python 3.
    # Presumably the eight zero bytes mark the end of the response stream.
    connection.sendall(b'\x00\x00\x00\x00\x00\x00\x00\x00')
def start_collective():
    """Try the other collective members, in sorted order, as our leader.

    Any existing follower is killed first.  Nothing further happens while
    ``leader_init`` is active.  Otherwise each member other than this node
    is tried with ``connect_to_leader`` until one call returns a truthy
    value; if none does, another ``start_collective`` run is scheduled
    roughly 30 seconds later and stored in ``retrythread``.
    """
    global follower
    global retrythread
    if follower:
        follower.kill()
        follower = None
    if leader_init.active:
        # Do not start trying to connect while we are transmitting data to
        # a follower.
        return
    ourname = get_myname()
    for peer in sorted(cfm.list_collective()):
        if peer == ourname:
            continue
        if cfm.cfgleader is None:
            cfm.stop_following(True)
        candidate = cfm.get_collective_member(peer)['address']
        if connect_to_leader(name=ourname, leader=candidate):
            break
    else:
        # The loop finished without a successful connection: retry later.
        retrythread = eventlet.spawn_after(30 + random.random(),
                                           start_collective)
def _assimilate_missing(skipaddr=None):
    """Connect to members we are not streaming to and try to assimilate them.

    Members are skipped when they are this node, or when their address or
    name is an entry of ``cfm.cfgstreams``, the current leader address, or
    ``skipaddr``.  Connections to the remaining addresses are opened through
    a green pool and ``try_assimilate`` is called for each one that opened
    successfully.

    :param skipaddr: optional extra address to leave out.
    :returns: ``True`` when there was nothing to connect to or every
        ``try_assimilate`` call returned a truthy value; ``False`` as soon
        as one of them returns a falsy value.
    """
    ourname = get_myname()
    excluded = set(cfm.cfgstreams)
    # Taken before the leader/skip addresses are added, so this is the
    # number of cfgstreams entries only.
    followercount = len(excluded)
    excluded.add(currentleader)
    if skipaddr is not None:
        excluded.add(skipaddr)
    targets = []
    for membername in cfm.list_collective():
        address = cfm.get_collective_member(membername)['address']
        if (address not in excluded and membername != ourname
                and membername not in excluded):
            targets.append(address)
    if not targets:
        return True
    pool = greenpool.GreenPool(64)
    for address, remote in pool.imap(create_connection, targets):
        if isinstance(remote, Exception):
            # The connection attempt failed; move on to the next member.
            continue
        if not try_assimilate(address, followercount, remote):
            return False
    return True
def handle_connection(connection, cert, request, local=False):
    """Service one collective management request.

    ``request['operation']`` selects the action.  Requests that arrive
    without a certificate are ignored unless ``local`` is true; for those
    local requests the operations ``show``, ``delete``, ``invite`` and
    ``join`` are handled.  The operations ``enroll``, ``assimilate``,
    ``getinfo`` and ``connect`` are handled afterwards and compare ``cert``
    with stored member fingerprints (``enroll`` validates an invitation
    proof instead).

    Replies are written to ``connection`` with ``tlvdata.send``; most error
    paths also close the connection.

    :param connection: socket-like object for the requester.
    :param cert: peer certificate object, or a falsy value when the peer
        presented none.
    :param request: mapping holding ``operation`` plus operation-specific
        keys (``member``, ``name``, ``invitation``, ``server``, ``hmac``,
        ``txcount``).
    :param local: true when the request came from a local client.
    :returns: ``None`` in every case.
    """
    global currentleader
    global retrythread
    operation = request['operation']
    if cert:
        cert = crypto.dump_certificate(crypto.FILETYPE_ASN1, cert)
    else:
        if not local:
            # No certificate and not a local client: ignore the request.
            return
        if operation in ('show', 'delete'):
            if not list(cfm.list_collective()):
                tlvdata.send(
                    connection,
                    {'collective': {'error': 'Collective mode not '
                                             'enabled on this '
                                             'system'}})
                return
            if follower:
                # We are a follower: ask the leader for collective info.
                linfo = cfm.get_collective_member_by_address(currentleader)
                remote = socket.create_connection((currentleader, 13001))
                remote = ssl.wrap_socket(
                    remote, cert_reqs=ssl.CERT_NONE,
                    keyfile='/etc/confluent/privkey.pem',
                    certfile='/etc/confluent/srvcert.pem')
                cert = remote.getpeercert(binary_form=True)
                if not (linfo and util.cert_matches(linfo['fingerprint'],
                                                    cert)):
                    remote.close()
                    tlvdata.send(connection,
                                 {'error': 'Invalid certificate, '
                                           'redo invitation process'})
                    connection.close()
                    return
                tlvdata.recv(remote)  # ignore banner
                tlvdata.recv(remote)  # ignore authpassed: 0
                tlvdata.send(remote,
                             {'collective': {'operation': 'getinfo',
                                             'name': get_myname()}})
                collinfo = tlvdata.recv(remote)
            else:
                collinfo = {}
                populate_collinfo(collinfo)
            try:
                cfm.check_quorum()
                collinfo['quorum'] = True
            except exc.DegradedCollective:
                collinfo['quorum'] = False
            if operation == 'show':
                tlvdata.send(connection, {'collective': collinfo})
            elif operation == 'delete':
                todelete = request['member']
                if (todelete == collinfo['leader'] or
                        todelete in collinfo['active']):
                    tlvdata.send(
                        connection,
                        {'collective': {
                            'error': '{0} is still active, stop the confluent service to remove it'
                            .format(todelete)}})
                    return
                if todelete not in collinfo['offline']:
                    tlvdata.send(
                        connection,
                        {'collective': {
                            'error': '{0} is not a recognized collective member'.
                            format(todelete)}})
                    return
                cfm.del_collective_member(todelete)
                tlvdata.send(
                    connection,
                    {'collective': {
                        'status': 'Successfully deleted {0}'.format(
                            todelete)}})
                connection.close()
            return
        if 'invite' == operation:
            try:
                cfm.check_quorum()
            except exc.DegradedCollective:
                tlvdata.send(
                    connection,
                    {'collective':
                        {'error': 'Collective does not have quorum'}})
                return
            #TODO(jjohnson2): Cannot do the invitation if not the head node, the certificate hand-carrying
            #can't work in such a case.
            name = request['name']
            invitation = invites.create_server_invitation(name)
            tlvdata.send(connection,
                         {'collective': {'invitation': invitation}})
            connection.close()
        if 'join' == operation:
            invitation = request['invitation']
            try:
                invitation = base64.b64decode(invitation)
                name, invitation = invitation.split(b'@', 1)
                name = util.stringify(name)
            except Exception:
                tlvdata.send(
                    connection,
                    {'collective': {'status': 'Invalid token format'}})
                connection.close()
                return
            host = request['server']
            try:
                remote = socket.create_connection((host, 13001))
                # This isn't what it looks like.  We do CERT_NONE to disable
                # openssl verification, but then use the invitation as a
                # shared secret to validate the certs as part of the join
                # operation
                remote = ssl.wrap_socket(
                    remote, cert_reqs=ssl.CERT_NONE,
                    keyfile='/etc/confluent/privkey.pem',
                    certfile='/etc/confluent/srvcert.pem')
            except Exception:
                tlvdata.send(
                    connection,
                    {'collective': {
                        'status': 'Failed to connect to {0}'.format(host)}})
                connection.close()
                return
            mycert = util.get_certificate_from_file(
                '/etc/confluent/srvcert.pem')
            cert = remote.getpeercert(binary_form=True)
            proof = base64.b64encode(
                invites.create_client_proof(invitation, mycert, cert))
            tlvdata.recv(remote)  # ignore banner
            tlvdata.recv(remote)  # ignore authpassed: 0
            tlvdata.send(remote, {'collective': {'operation': 'enroll',
                                                 'name': name,
                                                 'hmac': proof}})
            rsp = tlvdata.recv(remote)
            if 'error' in rsp:
                tlvdata.send(connection,
                             {'collective': {'status': rsp['error']}})
                connection.close()
                return
            proof = rsp['collective']['approval']
            proof = base64.b64decode(proof)
            j = invites.check_server_proof(invitation, mycert, cert, proof)
            if not j:
                remote.close()
                tlvdata.send(connection,
                             {'collective': {'status': 'Bad server token'}})
                connection.close()
                return
            tlvdata.send(connection, {'collective': {'status': 'Success'}})
            connection.close()
            currentleader = rsp['collective']['leader']
            # The context manager closes the file even if the write fails.
            with open('/etc/confluent/cfg/myname', 'w') as f:
                f.write(name)
            log.log({'info': 'Connecting to collective due to join',
                     'subsystem': 'collective'})
            eventlet.spawn_n(connect_to_leader,
                             rsp['collective']['fingerprint'], name)
    if 'enroll' == operation:
        #TODO(jjohnson2): error appropriately when asked to enroll, but the master is elsewhere
        mycert = util.get_certificate_from_file('/etc/confluent/srvcert.pem')
        proof = base64.b64decode(request['hmac'])
        myrsp = invites.check_client_proof(request['name'], mycert,
                                           cert, proof)
        if not myrsp:
            tlvdata.send(connection, {'error': 'Invalid token'})
            connection.close()
            return
        myrsp = base64.b64encode(myrsp)
        fprint = util.get_fingerprint(cert)
        myfprint = util.get_fingerprint(mycert)
        cfm.add_collective_member(get_myname(),
                                  connection.getsockname()[0], myfprint)
        cfm.add_collective_member(request['name'],
                                  connection.getpeername()[0], fprint)
        myleader = get_leader(connection)
        ldrfprint = cfm.get_collective_member_by_address(
            myleader)['fingerprint']
        tlvdata.send(connection,
                     {'collective': {'approval': myrsp,
                                     'fingerprint': ldrfprint,
                                     'leader': get_leader(connection)}})
    if 'assimilate' == operation:
        drone = request['name']
        droneinfo = cfm.get_collective_member(drone)
        if not droneinfo:
            tlvdata.send(connection,
                         {'error': 'Unrecognized leader, '
                                   'redo invitation process'})
            return
        if not util.cert_matches(droneinfo['fingerprint'], cert):
            tlvdata.send(connection,
                         {'error': 'Invalid certificate, '
                                   'redo invitation process'})
            return
        if request['txcount'] < cfm._txcount:
            # The adjacent literals previously joined without a space,
            # yielding "inferiortransaction".
            tlvdata.send(connection,
                         {'error': 'Refusing to be assimilated by inferior '
                                   'transaction count',
                          'txcount': cfm._txcount})
            return
        if connecting.active:
            # don't try to connect while actively already trying to connect
            tlvdata.send(connection, {'status': 0})
            connection.close()
            return
        if (currentleader == connection.getpeername()[0] and
                follower and not follower.dead):
            # if we are happily following this leader already, don't stir
            # the pot
            tlvdata.send(connection, {'status': 0})
            connection.close()
            return
        log.log({'info': 'Connecting in response to assimilation',
                 'subsystem': 'collective'})
        eventlet.spawn_n(connect_to_leader, None, None,
                         leader=connection.getpeername()[0])
        tlvdata.send(connection, {'status': 0})
        connection.close()
    if 'getinfo' == operation:
        drone = request['name']
        droneinfo = cfm.get_collective_member(drone)
        if not (droneinfo and util.cert_matches(droneinfo['fingerprint'],
                                                cert)):
            tlvdata.send(connection,
                         {'error': 'Invalid certificate, '
                                   'redo invitation process'})
            connection.close()
            return
        collinfo = {}
        populate_collinfo(collinfo)
        tlvdata.send(connection, collinfo)
    if 'connect' == operation:
        drone = request['name']
        droneinfo = cfm.get_collective_member(drone)
        if not (droneinfo and util.cert_matches(droneinfo['fingerprint'],
                                                cert)):
            tlvdata.send(connection,
                         {'error': 'Invalid certificate, '
                                   'redo invitation process'})
            connection.close()
            return
        myself = connection.getsockname()[0]
        if connecting.active:
            tlvdata.send(connection, {'error': 'Connecting right now',
                                      'backoff': True})
            connection.close()
            return
        if myself != get_leader(connection):
            tlvdata.send(
                connection,
                {'error': 'Cannot assimilate, our leader is '
                          'in another castle',
                 'leader': currentleader})
            connection.close()
            return
        if request['txcount'] > cfm._txcount:
            retire_as_leader()
            tlvdata.send(connection,
                         {'error': 'Client has higher tranasaction count, '
                                   'should assimilate me, connecting..',
                          'txcount': cfm._txcount})
            # 'collective' must be the string used by every other log call;
            # the bare name raised NameError on this path.
            log.log({'info': 'Connecting to leader due to superior '
                             'transaction count',
                     'subsystem': 'collective'})
            eventlet.spawn_n(connect_to_leader, None, None,
                             connection.getpeername()[0])
            connection.close()
            return
        if retrythread:
            retrythread.cancel()
            retrythread = None
        with leader_init:
            cfm.update_collective_address(request['name'],
                                          connection.getpeername()[0])
            tlvdata.send(connection, cfm._dump_keys(None, False))
            tlvdata.send(connection, cfm._cfgstore['collective'])
            tlvdata.send(connection, {})  # cfm.get_globals())
            cfgdata = cfm.ConfigManager(None)._dump_to_json()
            tlvdata.send(connection, {'txcount': cfm._txcount,
                                      'dbsize': len(cfgdata)})
            connection.sendall(cfgdata)
        #tlvdata.send(connection, {'tenants': 0}) # skip the tenants for now,
        # so far unused anyway
        if not cfm.relay_slaved_requests(drone, connection):
            if not retrythread:  # start a recovery if everyone else seems
                # to have disappeared
                retrythread = eventlet.spawn_after(30 + random.random(),
                                                   start_collective)