def partition(hash_columns: list, range_columns: list = None, bound: dict = None, bucket_num=3) -> Partitioning:
    """Build a ``Partitioning`` with hash levels and an optional range partition.

    Args:
        hash_columns: Entries to hash-partition on. ``add_hash_partitions``
            is called once per entry, so every entry gets its own call with
            ``bucket_num`` buckets.
        range_columns: Value handed to ``set_range_partition_columns``
            unchanged (including the default ``None``).
        bound: Optional mapping describing a single range partition. The keys
            read are ``lower_bound``, ``upper_bound``, ``lower_bound_type``
            and ``upper_bound_type``. A missing or falsy bound type falls
            back to ``"inclusive"`` (lower) / ``"exclusive"`` (upper). When
            ``None``, no range partition is added.
        bucket_num: Number of buckets used for every hash-partition call.

    Returns:
        The populated ``Partitioning`` object.
    """
    # Named ``partitioning`` so the local does not shadow this function.
    partitioning = Partitioning()

    for column in hash_columns:
        partitioning.add_hash_partitions(column_names=column, num_buckets=bucket_num)

    partitioning.set_range_partition_columns(range_columns)

    # ``bound`` defaults to None; calling ``.get`` on it used to raise
    # AttributeError, so the range partition is only added when one is given.
    # An empty dict still adds a partition with no explicit bounds.
    if bound is not None:
        partitioning.add_range_partition(
            lower_bound=bound.get("lower_bound"),
            upper_bound=bound.get("upper_bound"),
            lower_bound_type=bound.get("lower_bound_type") or "inclusive",
            upper_bound_type=bound.get("upper_bound_type") or "exclusive",
        )

    return partitioning
def _build_example_schema():
    """Build the example table schema keyed on (lastname, state_prov, key)."""
    # Use the schema_builder to build your table's schema
    builder = kudu.schema_builder()

    # Lastname column
    builder.add_column('lastname').type('string').default(
        'doe').compression('snappy').encoding('plain').nullable(False)

    # State/Province the person lives in
    # Leave all defaults except for the type and nullability
    builder.add_column('state_prov').type('string').nullable(False)
    builder.add_column('key').type(kudu.int64).nullable(False)

    # We prefer using dot notation, so let's add a few more columns
    # using that strategy
    #  - type : We specify the string representation of types
    #  - default: Default value if none specified
    #  - compression: Compression type
    #  - encoding: Encoding strategy
    #  - nullable: Nullability
    #  - block_size: Target block size, overriding server defaults
    builder.add_column('firstname').type('string').default(
        'jane').compression('zlib').encoding('plain').nullable(
        False).block_size(20971520)

    # Use add_column list of parameters to specify properties
    # just as an example instead of dot notation.
    builder.add_column('ts_val', type_=kudu.unixtime_micros,
                       nullable=False, compression='lz4')

    # Set our primary key column(s)
    builder.set_primary_keys(['lastname', 'state_prov', 'key'])

    return builder.build()


def _build_example_partitioning():
    """Build the example partitioning: hash on state_prov, range on lastname."""
    # Define Hash partitioned column by the state/province.
    # It's quite possible the data would then be skewed across partitions,
    # so what we'll do here is add the optional seed parameter to
    # help randomize the mapping of rows to hash buckets.
    partitioning = Partitioning().add_hash_partitions(
        column_names=['state_prov'], num_buckets=3, seed=13)

    # We've hash partitioned according to the state, now let's further
    # range partition our content by lastname. If we wanted to find all
    # the "Smith" families in the state of Oregon, we would very quickly
    # be able to isolate those rows with this type of schema.
    # Set the range partition columns - these columns MUST be part of
    # the primary key columns.
    # The argument is a list of column names; a bare string is not the
    # documented form. NOTE(review): confirm against the client version in use.
    partitioning.set_range_partition_columns(['lastname'])

    # Add range partitions
    partitioning.add_range_partition(['A'], ['E'])
    # By default, lower bound is inclusive while upper is exclusive
    partitioning.add_range_partition(['E'], ['Z'],
                                     upper_bound_type='inclusive')
    return partitioning


def _insert_example_rows(client, tableName):
    """Apply three example inserts to ``tableName`` and flush them manually."""
    # Open a table
    table = client.table(tableName)

    # Create a new session so that we can apply write operations
    session = client.new_session()

    # We have a few flush modes at our disposal, namely:
    # FLUSH_MANUAL, FLUSH_AUTO_SYNC and FLUSH_AUTO_BACKGROUND
    # The default is FLUSH_MANUAL, and we want to flush manually for
    # our examples below. Just providing an example of how to change it
    # if needed.
    session.set_flush_mode(kudu.FLUSH_MANUAL)

    # We can set a timeout value as well in milliseconds. Set ours to
    # 3 seconds.
    session.set_timeout_ms(3000)

    # Insert the same row three times, as the original code did.
    # NOTE(review): all three rows share the primary key
    # ('Smith', 'ON', 1); presumably this is deliberate to exercise the
    # pending-errors path below - confirm, or vary 'key' per row.
    for _ in range(3):
        op = table.new_insert({
            'lastname': 'Smith',
            'state_prov': 'ON',
            'firstname': 'Mike',
            'key': 1,
            'ts_val': datetime.utcnow(),
        })
        session.apply(op)

    try:
        session.flush()
    except kudu.KuduBadStatus:
        # The exception itself is not used; details come from the session.
        (errorResult, overflowed) = session.get_pending_errors()
        print("Insert row failed: {} (more pending errors? {})".format(
            errorResult, overflowed))


def executeCommand(client, command, tableName):
    """Run one example command against ``tableName`` using ``client``.

    Args:
        client: Kudu client object; ``create_table``, ``table`` and
            ``new_session`` are called on it.
        command: ``"create"`` creates the example table; ``"insert"`` applies
            the example inserts. Any other value only prints the banner.
        tableName: Name of the table to create or write to.
    """
    print("Executing Command {} on table {}".format(command, tableName))

    if command == "create":
        # Creating a table requires just a few steps
        #  - Define your schema
        #  - Define your partitioning scheme
        #  - Call the create_table API
        schema = _build_example_schema()
        partitioning = _build_example_partitioning()

        # Create new table passing in the table name, schema, partitioning
        # object and the optional parameter of number of replicas for this
        # table. If none specified, then it'll go by the Kudu server default
        # value for number of replicas.
        client.create_table(tableName, schema, partitioning, 1)
    elif command == "insert":
        _insert_example_rows(client, tableName)