def test_scidb_afl_module(): """Testing all public methods in scidblib.scidb_afl.""" print '*** testing scidblib.scidb_afl...' class TmpArgs: def __init__(self): self.host = '' self.port = '' args = TmpArgs() iquery_cmd = scidb_afl.get_iquery_cmd(args) scidb_afl.execute_it_return_out_err('ls') scidb_afl.afl(iquery_cmd, 'list()') print 'time_afl(..., \'list()\') =', scidb_afl.time_afl( iquery_cmd, 'list()') print 'single_cell_afl(..., \'build(<v:int64>[i=0:0,1,0], 5)\', 1) =', \ scidb_afl.single_cell_afl(iquery_cmd, 'build(<v:int64>[i=0:0,1,0], 5)', 1) print 'single_cell_afl(..., \'apply(build(<v:int64>[i=0:0,1,0], 5), v2, 6)\', 2) =', \ scidb_afl.single_cell_afl(iquery_cmd, 'apply(build(<v:int64>[i=0:0,1,0], 5), v2, 6)', 2) print 'get_num_instances(...) =', scidb_afl.get_num_instances(iquery_cmd) print 'get_array_names(...) =', scidb_afl.get_array_names(iquery_cmd) print
def test_scidb_afl_module(): """Testing all public methods in scidblib.scidb_afl.""" print '*** testing scidblib.scidb_afl...' class TmpArgs: def __init__(self): self.host = '' self.port = '' args = TmpArgs() iquery_cmd = scidb_afl.get_iquery_cmd(args) scidb_afl.execute_it_return_out_err('ls') scidb_afl.afl(iquery_cmd, 'list()') print 'time_afl(..., \'list()\') =', scidb_afl.time_afl(iquery_cmd, 'list()') print 'single_cell_afl(..., \'build(<v:int64>[i=0:0,1,0], 5)\', 1) =', \ scidb_afl.single_cell_afl(iquery_cmd, 'build(<v:int64>[i=0:0,1,0], 5)', 1) print 'single_cell_afl(..., \'apply(build(<v:int64>[i=0:0,1,0], 5), v2, 6)\', 2) =', \ scidb_afl.single_cell_afl(iquery_cmd, 'apply(build(<v:int64>[i=0:0,1,0], 5), v2, 6)', 2) print 'get_num_instances(...) =', scidb_afl.get_num_instances(iquery_cmd) print 'get_array_names(...) =', scidb_afl.get_array_names(iquery_cmd) print
def __init__(self, iquery_cmd, load_array):
    """Run iquery -aq "show(load_array)" and record the array's names.

    Populates self.attrs, self.dims, and self.list (one NameInLoadArray
    per attribute followed by one per dimension).

    @param iquery_cmd the iquery command.
    @param load_array the name of the load array.
    @exception AppError if the show() command does not produce a valid
               schema, e.g. if load_array is not a valid array name in
               the database.
    """
    self.list = []
    schema_text = scidb_afl.single_cell_afl(
        iquery_cmd, 'show(' + load_array + ')', 1)
    # Schema looks like: array_name<attributes>[dimensions]
    matched = re.match(r'^.*\<(.*)\>\s*\[(.*)\]$',
                       schema_text, re.M | re.I)
    if matched is None:
        raise scidblib.AppError(
            'System Error! I failed to parse the schema of the load_array.'
        )
    # Attributes come first in self.list ...
    self.attrs = Attributes(matched.group(1))
    for pos, attr in enumerate(self.attrs.list):
        self.list.append(
            NameInLoadArray(attr.attr_name,
                            is_dim=False,
                            is_int64=(attr.attr_type == 'int64'),
                            local_index=pos))
    # ... then the dimensions (always int64).
    self.dims = Dimensions(matched.group(2))
    for pos, dim in enumerate(self.dims.list):
        self.list.append(
            NameInLoadArray(dim.dim_name,
                            is_dim=True,
                            is_int64=True,
                            local_index=pos))
def __init__(self, iquery_cmd, load_array):
    """Look up load_array's schema via show() and fill in data members.

    @param iquery_cmd the iquery command.
    @param load_array the name of the load array.
    @exception AppError if the show() command does not produce a valid
               schema, e.g. if load_array is not a valid array name in
               the database.
    """
    self.list = []
    shown = scidb_afl.single_cell_afl(iquery_cmd,
                                      'show(' + load_array + ')', 1)
    # Expected form: array_name<attributes>[dimensions]
    schema_match = re.match(r'^.*\<(.*)\>\s*\[(.*)\]$', shown,
                            re.I | re.M)
    if not schema_match:
        raise scidblib.AppError(
            'System Error! I failed to parse the schema of the load_array.')
    str_attrs, str_dims = schema_match.group(1), schema_match.group(2)

    # Record attributes first; an attribute is int64-typed only if its
    # declared type says so.
    self.attrs = Attributes(str_attrs)
    index = 0
    for attr in self.attrs.list:
        self.list.append(NameInLoadArray(attr.attr_name,
                                         is_dim=False,
                                         is_int64=(attr.attr_type == 'int64'),
                                         local_index=index))
        index += 1

    # Then record dimensions, which are always int64.
    self.dims = Dimensions(str_dims)
    index = 0
    for dim in self.dims.list:
        self.list.append(NameInLoadArray(dim.dim_name,
                                         is_dim=True,
                                         is_int64=True,
                                         local_index=index))
        index += 1
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    Works in three tracked steps: (1) per-dimension min/max/distinct-count
    queries against load_array, (2) an overall distinct-count query over the
    concatenated dimension values, (3) arithmetic to pick chunk lengths for
    the dimensions whose chunk_length was given as '?'.

    @param args the result of argparse.ArgumentParser.parse_args().
    @return 0
    @exception AppError if anything goes wrong.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array
    raw_dims_str = args.raw_dims
    # NOTE(review): parse_dimensions is defined elsewhere in this file;
    # presumably returns objects with dim_name/dim_low/dim_high/
    # chunk_length/chunk_overlap fields — confirm against its definition.
    calculated_dims = parse_dimensions(raw_dims_str)
    dbg("Calculated dims:", [x.to_tuple() for x in calculated_dims])
    # Initialize the progress tracker
    progress_tracker = scidb_progress.ProgressTracker(
        sys.stdout, '',
        args.verbose,  # if_print_start
        args.verbose,  # if_print_end
        args.verbose   # if_print_skip
    )
    progress_tracker.register_step(
        'min_max_dc',
        'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step('overall_dc',
                                   'Get overall ApproxDC from load_array.')
    progress_tracker.register_step(
        'calculate', 'Calculate and adjust dimension specification.')
    # S = dims where chunk_length is Specified;
    # N = dims where chunk_length is Not specified.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)
    dbg("S:", S)
    dbg("N:", N)
    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)
    dbg("names...:", names_in_load_array.list)
    # for each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i]
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims:
        index = names_in_load_array.find_index(the_dim.dim_name)
        the_name_in_load_array = names_in_load_array.list[index]
        if the_name_in_load_array.is_dim:
            # Name is a dimension of load_array: the inner aggregate
            # collapses to one cell per distinct coordinate, so the outer
            # count(*) is an exact distinct count.
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(' + load_array +
                   ', count(*), ' + the_dim.dim_name + '), ' + tmp +
                   ', ' + the_dim.dim_name + '), min(' + tmp +
                   '), max(' + tmp + '), count(*))')
        else:
            # Name is an attribute: use approxdc() for an approximate
            # distinct count instead.
            cmd = ('aggregate(' + load_array + ', min(' +
                   the_dim.dim_name + '), max(' + the_dim.dim_name +
                   '), approxdc(' + the_dim.dim_name + '))')
        dbg("Cmd:", cmd)
        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(
            iquery_cmd, cmd, 3)
        dbg("(min,max,dc):", (min_coord, max_coord, distinct_count))
        try:
            # The query returns strings; all three must parse as integers.
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
            if args.verbose:
                print 'For ' + the_dim.dim_name + ', min_coord=' + str(min_coord_int) +\
                    ', max_coord=' + str(max_coord_int) +\
                    ', distinct_count=' + str(distinct_count_int)
        except ValueError:
            raise scidblib.AppError('Error: I cannot proceed because for ' +
                                    the_dim.dim_name + ' in array ' +
                                    load_array +
                                    ', not all of min_coord (=' + min_coord +
                                    '), max_coord (=' + max_coord +
                                    '), and distinct_count (=' +
                                    distinct_count + ') are integers.')
        the_dim.set_min_max_dc(min_coord_int, max_coord_int,
                               distinct_count_int)
    progress_tracker.end_step('min_max_dc')
    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0
    # Generate string_concat_of_dim_values in the form of:
    # string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_values = []
    for i, the_dim in enumerate(calculated_dims):
        string_values.append('string(' + the_dim.dim_name + ')')
    string_concat_of_dim_values = ' + \'|\' + '.join(string_values)
    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(' + load_array + ', ' + tmp + ', ' +
           string_concat_of_dim_values + '), approxdc(' + tmp + '))')
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(
        iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
        # approxdc() may overshoot; clamp to the exact cell count.
        if overall_distinct_count > overall_count:
            overall_distinct_count = overall_count
    except ValueError:
        raise scidblib.AppError(
            'Error: The query to get overall_distinct_count failed to return an integer.'
        )
    if args.verbose:
        print 'overall_distinct_count=' + str(overall_distinct_count)
    progress_tracker.end_step('overall_dc')
    progress_tracker.start_step('calculate')
    # Shortcut: if |N| == 0, we are done.
    if len(N) == 0:
        print scidb_schema.unparse(
            dims=[x.to_tuple() for x in calculated_dims])
        return 0
    # Set num_chunks_from_n: start from the target chunk count implied by
    # args.desired_values_per_chunk, then divide out the chunks already
    # accounted for by dimensions with a specified chunk_length.
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count,
                                                  int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(
            num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1
    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims[i]
        # Even split: the |N|-th root of the remaining chunk count.
        chunk_count = math.pow(num_chunks_from_n, 1.0 / len(N))
        if not args.keep_shape:
            # calculate geomean; skew each dimension's share by how its
            # distinct count compares to the geometric mean.
            product = 1.0
            for k in N:
                product *= calculated_dims[k].distinct_count
            geomean = math.pow(product, 1.0 / len(N))
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count < 1:
            chunk_count = 1.0
        # chunk_count is a float, so '/' here is true division (Python 2).
        the_dim.chunk_length = int(
            math.ceil(
                (the_dim.max_coord - the_dim.min_coord + 1) / chunk_count))
        if chunk_count > 1:
            # Round to a "nice" value; see scidb_math.snap_to_grid.
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length, args.grid_threshold,
                use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')
    # Print result.
    print scidb_schema.unparse(dims=[x.to_tuple() for x in calculated_dims])
    return 0
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    Three tracked steps: (1) per-dimension min/max/distinct-count queries,
    (2) an overall distinct-count query over the concatenated dimension
    values, (3) arithmetic to choose chunk lengths for dimensions whose
    chunk_length was '?'.

    @param args the result of argparse.ArgumentParser.parse_args().
    @return 0
    @exception AppError if anything goes wrong.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array
    raw_dims_str = args.raw_dims
    # NOTE(review): Dimensions is defined elsewhere in this file; its .list
    # presumably holds objects with dim_name/dim_low/dim_high/chunk_length/
    # chunk_overlap fields — confirm against its definition.
    calculated_dims = Dimensions(raw_dims_str)
    # Initialize the progress tracker
    progress_tracker = scidb_progress.ProgressTracker(sys.stdout, '',
                                                      args.verbose,  # if_print_start
                                                      args.verbose,  # if_print_end
                                                      args.verbose   # if_print_skip
                                                      )
    progress_tracker.register_step('min_max_dc',
                                   'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step('overall_dc',
                                   'Get overall ApproxDC from load_array.')
    progress_tracker.register_step('calculate',
                                   'Calculate and adjust dimension specification.')
    # S = dims where chunk_length is Specified;
    # N = dims where chunk_length is Not specified.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims.list):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)
    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)
    # for each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i]
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims.list:
        index = names_in_load_array.find_index(the_dim.dim_name)
        the_name_in_load_array = names_in_load_array.list[index]
        if the_name_in_load_array.is_dim:
            # Name is a dimension of load_array: the inner aggregate yields
            # one cell per distinct coordinate, so count(*) is exact.
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(' + load_array +
                   ', count(*), ' + the_dim.dim_name + '), ' + tmp +
                   ', ' + the_dim.dim_name + '), min(' + tmp +
                   '), max(' + tmp + '), count(*))'
                   )
        else:
            # Name is an attribute: approxdc() gives an approximate count.
            cmd = ('aggregate(' + load_array + ', min(' +
                   the_dim.dim_name + '), max(' + the_dim.dim_name +
                   '), approxdc(' + the_dim.dim_name + '))'
                   )
        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 3)
        try:
            # The query returns strings; all three must parse as integers.
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
            if args.verbose:
                print 'For ' + the_dim.dim_name + ', min_coord=' + str(min_coord_int) +\
                    ', max_coord=' + str(max_coord_int) +\
                    ', distinct_count=' + str(distinct_count_int)
        except ValueError:
            raise scidblib.AppError('Error: I cannot proceed because for ' +
                                    the_dim.dim_name + ' in array ' +
                                    load_array +
                                    ', not all of min_coord (=' + min_coord +
                                    '), max_coord (=' + max_coord +
                                    '), and distinct_count (=' +
                                    distinct_count + ') are integers.')
        the_dim.set_min_max_dc(min_coord_int, max_coord_int, distinct_count_int)
    progress_tracker.end_step('min_max_dc')
    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims.list:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0
    # Generate string_concat_of_dim_values in the form of:
    # string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_values = []
    for i, the_dim in enumerate(calculated_dims.list):
        string_values.append('string(' + the_dim.dim_name + ')')
    string_concat_of_dim_values = ' + \'|\' + '.join(string_values)
    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(' + load_array + ', ' + tmp + ', ' +
           string_concat_of_dim_values + '), approxdc(' + tmp + '))'
           )
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(iquery_cmd,
                                              'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
        # approxdc() may overshoot; clamp to the exact cell count.
        if overall_distinct_count > overall_count:
            overall_distinct_count = overall_count
    except ValueError:
        raise scidblib.AppError('Error: The query to get overall_distinct_count failed to return an integer.')
    if args.verbose:
        print 'overall_distinct_count=' + str(overall_distinct_count)
    progress_tracker.end_step('overall_dc')
    progress_tracker.start_step('calculate')
    # Shortcut: if |N| == 0, we are done.
    if len(N)==0:
        print calculated_dims.__str__()
        return 0
    # Set num_chunks_from_n: the target chunk count implied by
    # args.desired_values_per_chunk, divided down by the chunks already
    # covered by dimensions with a specified chunk_length.
    num_chunks_from_n = scidb_math.ceil_of_division(overall_distinct_count,
                                                    args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims.list[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count,
                                                  int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1
    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims.list[i]
        # Even split: the |N|-th root of the remaining chunk count.
        chunk_count = math.pow(num_chunks_from_n, 1.0/len(N))
        if not args.keep_shape:
            # calculate geomean; skew each dimension's share by how its
            # distinct count compares to the geometric mean.
            product = 1.0
            for k in N:
                product *= calculated_dims.list[k].distinct_count
            geomean = math.pow(product, 1.0/len(N))
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count<1:
            chunk_count = 1.0
        # chunk_count is a float, so '/' here is true division (Python 2).
        the_dim.chunk_length = int(math.ceil(
            (the_dim.max_coord-the_dim.min_coord+1)/chunk_count
        ))
        if chunk_count>1:
            # Round to a "nice" value; see scidb_math.snap_to_grid.
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length,
                args.grid_threshold, use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')
    # Print result.
    print calculated_dims.__str__()
    return 0