def connect_to_collective(cert, member, remote=None): if remote is None: _, remote = create_connection(member) if isinstance(remote, Exception): raise remote if cert: fprint = cert else: collent = cfm.get_collective_member_by_address(member) fprint = collent['fingerprint'] if not util.cert_matches(fprint, remote.getpeercert(binary_form=True)): # probably Janeway up to something raise Exception("Certificate mismatch in the collective") return remote
def connect_to_collective(cert, member): remote = socket.create_connection((member, 13001)) # TLS cert validation is custom and will not pass normal CA vetting # to override completely in the right place requires enormous effort, so just defer until after connect remote = ssl.wrap_socket(remote, cert_reqs=ssl.CERT_NONE, keyfile='/etc/confluent/privkey.pem', certfile='/etc/confluent/srvcert.pem') if cert: fprint = cert else: collent = cfm.get_collective_member_by_address(member) fprint = collent['fingerprint'] if not util.cert_matches(fprint, remote.getpeercert(binary_form=True)): # probably Janeway up to something raise Exception("Certificate mismatch in the collective") return remote
def connect_to_leader(cert=None, name=None, leader=None): global currentleader global follower if leader is None: leader = currentleader log.log({ 'info': 'Attempting connection to leader {0}'.format(leader), 'subsystem': 'collective' }) try: remote = connect_to_collective(cert, leader) except socket.error as e: log.log({ 'error': 'Collective connection attempt to {0} failed: {1}' ''.format(leader, str(e)), 'subsystem': 'collective' }) return False with connecting: with cfm._initlock: banner = tlvdata.recv(remote) # the banner vers = banner.split()[2] if vers != b'v2': raise Exception( 'This instance only supports protocol 2, synchronize versions between collective members' ) tlvdata.recv(remote) # authpassed... 0.. if name is None: name = get_myname() tlvdata.send( remote, { 'collective': { 'operation': 'connect', 'name': name, 'txcount': cfm._txcount } }) keydata = tlvdata.recv(remote) if not keydata: return False if 'error' in keydata: if 'backoff' in keydata: log.log({ 'info': 'Collective initialization in progress on ' '{0}'.format(leader), 'subsystem': 'collective' }) return False if 'leader' in keydata: log.log({ 'info': 'Prospective leader {0} has redirected this ' 'member to {1}'.format(leader, keydata['leader']), 'subsystem': 'collective' }) ldrc = cfm.get_collective_member_by_address( keydata['leader']) if ldrc and ldrc['name'] == name: raise Exception("Redirected to self") return connect_to_leader(name=name, leader=keydata['leader']) if 'txcount' in keydata: log.log({ 'info': 'Prospective leader {0} has inferior ' 'transaction count, becoming leader' ''.format(leader), 'subsystem': 'collective', 'subsystem': 'collective' }) return become_leader(remote) return False follower.kill() cfm.stop_following() follower = None if follower: follower.kill() cfm.stop_following() follower = None log.log({ 'info': 'Following leader {0}'.format(leader), 'subsystem': 'collective' }) colldata = tlvdata.recv(remote) # the protocol transmits global data, but for now we ignore it globaldata = tlvdata.recv(remote) dbi = tlvdata.recv(remote) dbsize = dbi['dbsize'] dbjson = b'' while (len(dbjson) < dbsize): ndata = remote.recv(dbsize - len(dbjson)) if not ndata: try: remote.close() except Exception: pass raise Exception("Error doing initial DB transfer") dbjson += ndata cfm.clear_configuration() try: cfm._restore_keys(keydata, None, sync=False) for c in colldata: cfm._true_add_collective_member(c, colldata[c]['address'], colldata[c]['fingerprint'], sync=False) #for globvar in globaldata: # cfm.set_global(globvar, globaldata[globvar], False) cfm._txcount = dbi.get('txcount', 0) cfm.ConfigManager(tenant=None)._load_from_json(dbjson, sync=False) cfm.commit_clear() except Exception: cfm.stop_following() cfm.rollback_clear() raise currentleader = leader #spawn this as a thread... follower = eventlet.spawn(follow_leader, remote, leader) return True
def handle_connection(connection, cert, request, local=False): global currentleader global retrythread operation = request['operation'] if cert: cert = crypto.dump_certificate(crypto.FILETYPE_ASN1, cert) else: if not local: return if operation in ('show', 'delete'): if not list(cfm.list_collective()): tlvdata.send( connection, { 'collective': { 'error': 'Collective mode not ' 'enabled on this ' 'system' } }) return if follower: linfo = cfm.get_collective_member_by_address(currentleader) remote = socket.create_connection((currentleader, 13001)) remote = ssl.wrap_socket(remote, cert_reqs=ssl.CERT_NONE, keyfile='/etc/confluent/privkey.pem', certfile='/etc/confluent/srvcert.pem') cert = remote.getpeercert(binary_form=True) if not (linfo and util.cert_matches(linfo['fingerprint'], cert)): remote.close() tlvdata.send(connection, { 'error': 'Invalid certificate, ' 'redo invitation process' }) connection.close() return tlvdata.recv(remote) # ignore banner tlvdata.recv(remote) # ignore authpassed: 0 tlvdata.send(remote, { 'collective': { 'operation': 'getinfo', 'name': get_myname() } }) collinfo = tlvdata.recv(remote) else: collinfo = {} populate_collinfo(collinfo) try: cfm.check_quorum() collinfo['quorum'] = True except exc.DegradedCollective: collinfo['quorum'] = False if operation == 'show': tlvdata.send(connection, {'collective': collinfo}) elif operation == 'delete': todelete = request['member'] if (todelete == collinfo['leader'] or todelete in collinfo['active']): tlvdata.send( connection, { 'collective': { 'error': '{0} is still active, stop the confluent service to remove it' .format(todelete) } }) return if todelete not in collinfo['offline']: tlvdata.send( connection, { 'collective': { 'error': '{0} is not a recognized collective member'. format(todelete) } }) return cfm.del_collective_member(todelete) tlvdata.send( connection, { 'collective': { 'status': 'Successfully deleted {0}'.format(todelete) } }) connection.close() return if 'invite' == operation: try: cfm.check_quorum() except exc.DegradedCollective: tlvdata.send(connection, { 'collective': { 'error': 'Collective does not have quorum' } }) return #TODO(jjohnson2): Cannot do the invitation if not the head node, the certificate hand-carrying #can't work in such a case. name = request['name'] invitation = invites.create_server_invitation(name) tlvdata.send(connection, {'collective': { 'invitation': invitation }}) connection.close() if 'join' == operation: invitation = request['invitation'] try: invitation = base64.b64decode(invitation) name, invitation = invitation.split(b'@', 1) name = util.stringify(name) except Exception: tlvdata.send( connection, {'collective': { 'status': 'Invalid token format' }}) connection.close() return host = request['server'] try: remote = socket.create_connection((host, 13001)) # This isn't what it looks like. We do CERT_NONE to disable # openssl verification, but then use the invitation as a # shared secret to validate the certs as part of the join # operation remote = ssl.wrap_socket(remote, cert_reqs=ssl.CERT_NONE, keyfile='/etc/confluent/privkey.pem', certfile='/etc/confluent/srvcert.pem') except Exception: tlvdata.send( connection, { 'collective': { 'status': 'Failed to connect to {0}'.format(host) } }) connection.close() return mycert = util.get_certificate_from_file( '/etc/confluent/srvcert.pem') cert = remote.getpeercert(binary_form=True) proof = base64.b64encode( invites.create_client_proof(invitation, mycert, cert)) tlvdata.recv(remote) # ignore banner tlvdata.recv(remote) # ignore authpassed: 0 tlvdata.send(remote, { 'collective': { 'operation': 'enroll', 'name': name, 'hmac': proof } }) rsp = tlvdata.recv(remote) if 'error' in rsp: tlvdata.send(connection, {'collective': { 'status': rsp['error'] }}) connection.close() return proof = rsp['collective']['approval'] proof = base64.b64decode(proof) j = invites.check_server_proof(invitation, mycert, cert, proof) if not j: remote.close() tlvdata.send(connection, {'collective': { 'status': 'Bad server token' }}) connection.close() return tlvdata.send(connection, {'collective': {'status': 'Success'}}) connection.close() currentleader = rsp['collective']['leader'] f = open('/etc/confluent/cfg/myname', 'w') f.write(name) f.close() log.log({ 'info': 'Connecting to collective due to join', 'subsystem': 'collective' }) eventlet.spawn_n(connect_to_leader, rsp['collective']['fingerprint'], name) if 'enroll' == operation: #TODO(jjohnson2): error appropriately when asked to enroll, but the master is elsewhere mycert = util.get_certificate_from_file('/etc/confluent/srvcert.pem') proof = base64.b64decode(request['hmac']) myrsp = invites.check_client_proof(request['name'], mycert, cert, proof) if not myrsp: tlvdata.send(connection, {'error': 'Invalid token'}) connection.close() return myrsp = base64.b64encode(myrsp) fprint = util.get_fingerprint(cert) myfprint = util.get_fingerprint(mycert) cfm.add_collective_member(get_myname(), connection.getsockname()[0], myfprint) cfm.add_collective_member(request['name'], connection.getpeername()[0], fprint) myleader = get_leader(connection) ldrfprint = cfm.get_collective_member_by_address( myleader)['fingerprint'] tlvdata.send( connection, { 'collective': { 'approval': myrsp, 'fingerprint': ldrfprint, 'leader': get_leader(connection) } }) if 'assimilate' == operation: drone = request['name'] droneinfo = cfm.get_collective_member(drone) if not droneinfo: tlvdata.send( connection, {'error': 'Unrecognized leader, ' 'redo invitation process'}) return if not util.cert_matches(droneinfo['fingerprint'], cert): tlvdata.send( connection, {'error': 'Invalid certificate, ' 'redo invitation process'}) return if request['txcount'] < cfm._txcount: tlvdata.send( connection, { 'error': 'Refusing to be assimilated by inferior' 'transaction count', 'txcount': cfm._txcount, }) return if connecting.active: # don't try to connect while actively already trying to connect tlvdata.send(connection, {'status': 0}) connection.close() return if (currentleader == connection.getpeername()[0] and follower and not follower.dead): # if we are happily following this leader already, don't stir # the pot tlvdata.send(connection, {'status': 0}) connection.close() return log.log({ 'info': 'Connecting in response to assimilation', 'subsystem': 'collective' }) eventlet.spawn_n(connect_to_leader, None, None, leader=connection.getpeername()[0]) tlvdata.send(connection, {'status': 0}) connection.close() if 'getinfo' == operation: drone = request['name'] droneinfo = cfm.get_collective_member(drone) if not (droneinfo and util.cert_matches(droneinfo['fingerprint'], cert)): tlvdata.send( connection, {'error': 'Invalid certificate, ' 'redo invitation process'}) connection.close() return collinfo = {} populate_collinfo(collinfo) tlvdata.send(connection, collinfo) if 'connect' == operation: drone = request['name'] droneinfo = cfm.get_collective_member(drone) if not (droneinfo and util.cert_matches(droneinfo['fingerprint'], cert)): tlvdata.send( connection, {'error': 'Invalid certificate, ' 'redo invitation process'}) connection.close() return myself = connection.getsockname()[0] if connecting.active: tlvdata.send(connection, { 'error': 'Connecting right now', 'backoff': True }) connection.close() return if myself != get_leader(connection): tlvdata.send( connection, { 'error': 'Cannot assimilate, our leader is ' 'in another castle', 'leader': currentleader }) connection.close() return if request['txcount'] > cfm._txcount: retire_as_leader() tlvdata.send( connection, { 'error': 'Client has higher tranasaction count, ' 'should assimilate me, connecting..', 'txcount': cfm._txcount }) log.log({ 'info': 'Connecting to leader due to superior ' 'transaction count', 'subsystem': collective }) eventlet.spawn_n(connect_to_leader, None, None, connection.getpeername()[0]) connection.close() return if retrythread: retrythread.cancel() retrythread = None with leader_init: cfm.update_collective_address(request['name'], connection.getpeername()[0]) tlvdata.send(connection, cfm._dump_keys(None, False)) tlvdata.send(connection, cfm._cfgstore['collective']) tlvdata.send(connection, {}) # cfm.get_globals()) cfgdata = cfm.ConfigManager(None)._dump_to_json() tlvdata.send(connection, { 'txcount': cfm._txcount, 'dbsize': len(cfgdata) }) connection.sendall(cfgdata) #tlvdata.send(connection, {'tenants': 0}) # skip the tenants for now, # so far unused anyway if not cfm.relay_slaved_requests(drone, connection): if not retrythread: # start a recovery if everyone else seems # to have disappeared retrythread = eventlet.spawn_after(30 + random.random(), start_collective)
def connect_to_leader(cert=None, name=None, leader=None): global currentleader global cfginitlock global follower if cfginitlock is None: cfginitlock = threading.RLock() if leader is None: leader = currentleader try: remote = connect_to_collective(cert, leader) except socket.error: return False with connecting: with cfginitlock: tlvdata.recv(remote) # the banner tlvdata.recv(remote) # authpassed... 0.. if name is None: name = get_myname() tlvdata.send( remote, { 'collective': { 'operation': 'connect', 'name': name, 'txcount': cfm._txcount } }) keydata = tlvdata.recv(remote) if not keydata: return False if 'error' in keydata: if 'backoff' in keydata: eventlet.spawn_after(random.random(), connect_to_leader, cert, name, leader) return True if 'leader' in keydata: ldrc = cfm.get_collective_member_by_address( keydata['leader']) if ldrc and ldrc['name'] == name: raise Exception("Redirected to self") return connect_to_leader(name=name, leader=keydata['leader']) if 'txcount' in keydata: return become_leader(remote) print(keydata['error']) return False if follower is not None: follower.kill() cfm.stop_following() follower = None colldata = tlvdata.recv(remote) globaldata = tlvdata.recv(remote) dbi = tlvdata.recv(remote) dbsize = dbi['dbsize'] dbjson = '' while (len(dbjson) < dbsize): ndata = remote.recv(dbsize - len(dbjson)) if not ndata: try: remote.close() except Exception: pass raise Exception("Error doing initial DB transfer") dbjson += ndata cfm.clear_configuration() try: cfm._restore_keys(keydata, None, sync=False) for c in colldata: cfm._true_add_collective_member(c, colldata[c]['address'], colldata[c]['fingerprint'], sync=False) for globvar in globaldata: cfm.set_global(globvar, globaldata[globvar], False) cfm._txcount = dbi.get('txcount', 0) cfm.ConfigManager(tenant=None)._load_from_json(dbjson, sync=False) cfm.commit_clear() except Exception: cfm.stop_following() cfm.rollback_clear() raise currentleader = leader #spawn this as a thread... follower = eventlet.spawn(follow_leader, remote) return True