def test_scidb_progress_module():
    """Testing all public methods in scidblib.scidb_progress."""
    print '*** testing scidblib.scidb_progress...'
    print 'datetime_as_str =', scidb_progress.datetime_as_str()

    s = 'SciDB Graph500 perf-test: 14.6.7610 (2014-6-14)'
    my_match = re.match(
        r'^.*(' + scidb_progress.VersionAndDate.re_whole + r')', s)
    assert my_match

    mid = scidb_progress.VersionAndDate(my_match.group(1))
    smallest = scidb_progress.VersionAndDate(my_match.group(1))
    smallest.major = 13
    small = scidb_progress.VersionAndDate(my_match.group(1))
    small.revision = 7608
    large = scidb_progress.VersionAndDate(my_match.group(1))
    large.minor = 9

    s = small.__str__()
    assert smallest.earlier_than(small)
    assert small.earlier_than(mid)
    assert mid.earlier_than(large)
    assert smallest.earlier_than(large)
    print 'VersionAndDate passed unit test.'

    pt = scidb_progress.ProgressTracker()
    pt.register_step('one', 'the first step')
    pt.start_step('one')
    pt.end_step('one')
    print
def test_scidb_progress_module():
    """Testing all public methods in scidblib.scidb_progress."""
    print '*** testing scidblib.scidb_progress...'
    print 'datetime_as_str =', scidb_progress.datetime_as_str()

    # A string including a version and date should be able to be parsed.
    s = 'SciDB Graph500 perf-test: 15.12 (2016-1-1)'
    my_match = re.match(r'^.*(' + scidb_progress.VersionAndDate.re_whole + r')', s)
    assert my_match
    assert my_match.group(1) == '15.12 (2016-1-1)'

    # vd0 < vd1 < vd2 < vd3 < vd4 < vd5 = vd6
    # ---------------------------------------
    # vd0 = 14.12 (2015-1-1)
    # vd1 = 15.11 (2015-1-1)
    # vd2 = 15.12 (2015-1-1)
    # vd3 = 15.12 (2016-1-1)
    # vd4 = 15.12 (2016-2-1)
    # vd5 = 15.12 (2016-2-3)
    # vd6 = 15.12 (2016-2-3)
    vd = []
    for i in range(7):
        #vd.append(scidb_progress.VersionAndDate(my_match.group(1)))
        vd.append(scidb_progress.VersionAndDate('15.12 (2016-1-1)'))
    vd[0].major = 14
    vd[0].year = 2015
    vd[1].minor = 11
    vd[1].year = 2015
    vd[2].year = 2015
    vd[4].month = 2
    vd[5].month = 2
    vd[5].day = 3
    vd[6].month = 2
    vd[6].day = 3
    for i in range(5):
        assert vd[i].earlier_than(vd[i+1])
        assert not vd[i+1].earlier_than(vd[i])
    assert not vd[5].earlier_than(vd[6])
    assert not vd[6].earlier_than(vd[5])

    # The __str__() member function should work.
    s = vd[0].__str__()
    print 'VersionAndDate passed unit test.'

    # Testing ProgressTracker.
    pt = scidb_progress.ProgressTracker()
    pt.register_step('one', 'the first step')
    pt.start_step('one')
    pt.end_step('one')
    print
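# The assertions above assume VersionAndDate orders values by major, then minor, then
# the date fields. A minimal sketch of that ordering is shown below; it is only an
# illustration inferred from the test expectations, not the scidblib implementation
# (the real class may also compare a revision field, as in the 14.6.7610 example).
def _earlier_than_sketch(a, b):
    """Return True if version-and-date a sorts strictly before b (illustrative only)."""
    return ((a.major, a.minor, a.year, a.month, a.day) <
            (b.major, b.minor, b.year, b.month, b.day))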
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    @param args  the result of argparse.ArgumentParser.parse_args().
    @return 0
    @exception AppError if anything goes wrong.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array
    raw_dims_str = args.raw_dims
    calculated_dims = parse_dimensions(raw_dims_str)
    dbg("Calculated dims:", [x.to_tuple() for x in calculated_dims])

    # Initialize the progress tracker.
    progress_tracker = scidb_progress.ProgressTracker(
        sys.stdout,
        '',
        args.verbose,  # if_print_start
        args.verbose,  # if_print_end
        args.verbose   # if_print_skip
        )
    progress_tracker.register_step(
        'min_max_dc',
        'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step(
        'overall_dc', 'Get overall ApproxDC from load_array.')
    progress_tracker.register_step(
        'calculate', 'Calculate and adjust dimension specification.')

    # S = dims where chunk_length is Specified;
    # N = dims where chunk_length is Not specified.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)
    dbg("S:", S)
    dbg("N:", N)

    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)
    dbg("names...:", names_in_load_array.list)

    # For each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i].
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims:
        index = names_in_load_array.find_index(the_dim.dim_name)
        the_name_in_load_array = names_in_load_array.list[index]

        if the_name_in_load_array.is_dim:
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(' + load_array + ', count(*), ' +
                   the_dim.dim_name + '), ' + tmp + ', ' + the_dim.dim_name +
                   '), min(' + tmp + '), max(' + tmp + '), count(*))')
        else:
            cmd = ('aggregate(' + load_array +
                   ', min(' + the_dim.dim_name + '), max(' + the_dim.dim_name +
                   '), approxdc(' + the_dim.dim_name + '))')
        dbg("Cmd:", cmd)
        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(
            iquery_cmd, cmd, 3)
        dbg("(min,max,dc):", (min_coord, max_coord, distinct_count))

        try:
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
            if args.verbose:
                print 'For ' + the_dim.dim_name + \
                    ', min_coord=' + str(min_coord_int) + \
                    ', max_coord=' + str(max_coord_int) + \
                    ', distinct_count=' + str(distinct_count_int)
        except ValueError:
            raise scidblib.AppError(
                'Error: I cannot proceed because for ' + the_dim.dim_name +
                ' in array ' + load_array +
                ', not all of min_coord (=' + min_coord +
                '), max_coord (=' + max_coord +
                '), and distinct_count (=' + distinct_count +
                ') are integers.')
        the_dim.set_min_max_dc(min_coord_int, max_coord_int, distinct_count_int)
    progress_tracker.end_step('min_max_dc')

    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0

    # Generate string_concat_of_dim_values in the form of:
    #   string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_values = []
    for i, the_dim in enumerate(calculated_dims):
        string_values.append('string(' + the_dim.dim_name + ')')
    string_concat_of_dim_values = ' + \'|\' + '.join(string_values)

    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(' + load_array + ', ' + tmp + ', ' +
           string_concat_of_dim_values + '), approxdc(' + tmp + '))')
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(
        iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
        if overall_distinct_count > overall_count:
            overall_distinct_count = overall_count
    except ValueError:
        raise scidblib.AppError(
            'Error: The query to get overall_distinct_count failed to return an integer.')
    if args.verbose:
        print 'overall_distinct_count=' + str(overall_distinct_count)
    progress_tracker.end_step('overall_dc')

    progress_tracker.start_step('calculate')

    # Shortcut: if |N| == 0, we are done.
    if len(N) == 0:
        print scidb_schema.unparse(dims=[x.to_tuple() for x in calculated_dims])
        return 0

    # Set num_chunks_from_n.
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims[i]
        chunk_count = scidb_math.ceil_of_division(
            the_dim.distinct_count, int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(
            num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1

    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims[i]
        chunk_count = math.pow(num_chunks_from_n, 1.0 / len(N))
        if not args.keep_shape:
            # Calculate the geometric mean of the distinct counts over N.
            product = 1.0
            for k in N:
                product *= calculated_dims[k].distinct_count
            geomean = math.pow(product, 1.0 / len(N))
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count < 1:
            chunk_count = 1.0
        the_dim.chunk_length = int(math.ceil(
            (the_dim.max_coord - the_dim.min_coord + 1) / chunk_count))
        if chunk_count > 1:
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length,
                args.grid_threshold,
                use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')

    # Print result.
    print scidb_schema.unparse(dims=[x.to_tuple() for x in calculated_dims])
    return 0
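# A minimal worked example of the sizing arithmetic in calculate_chunk_length() above.
# All numbers are assumed for illustration (they are not taken from any real load_array),
# and the helper name _chunk_length_example is hypothetical.
def _chunk_length_example():
    """Sketch of the --keep_shape case with two unspecified dimensions; made-up numbers."""
    overall_distinct_count = 1000 * 1000    # assumed overall ApproxDC
    desired_values_per_chunk = 100 * 1000   # assumed target values per chunk
    len_N = 2                               # pretend two dims have chunk_length == '?'
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, desired_values_per_chunk)     # = 10
    chunk_count = math.pow(num_chunks_from_n, 1.0 / len_N)    # ~3.16 chunks per dim
    min_coord, max_coord = 0, 9999                            # assumed dim range
    # ceil(10000 / 3.162...) == 3163; snap_to_grid would then round this further.
    return int(math.ceil((max_coord - min_coord + 1) / chunk_count))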
def my_test(args, num_chunks, chunk_length, initial_values_per_chunk,
            new_values_per_chunk, type_name):
    """Test appending alternate values to the end of every chunk of an array.

    @param args                      command-line parameters.
    @param num_chunks                how many chunks there are.
    @param chunk_length              the chunk length.
    @param initial_values_per_chunk  the number of initial values per chunk.
    @param new_values_per_chunk      how many values to insert into each chunk.
    @param type_name                 the data type.
    @return 0
    """
    # Set even_value and odd_value.
    even_value = "0"
    odd_value = "1"
    if type_name == "bool":
        even_value = "true"
        odd_value = "false"

    # Initialize the ProgressTracker.
    progress_tracker = scidb_progress.ProgressTracker(
        if_print_start=args.verbose, if_print_end=args.verbose)
    progress_tracker.register_step('initial', 'Load initial values.')
    progress_tracker.register_step('new', 'Insert new values.')

    # Remove the array if it exists.
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    my_remove_arrays(iquery_cmd, tolerate_error=True)

    # Create the array.
    cmd = "create temp array %s <v:%s>[i=0:%d,%d,0]" % (
        array_name, type_name, chunk_length * num_chunks - 1, chunk_length)
    scidb_afl.afl(iquery_cmd, cmd)

    # Load initial values.
    # The algorithm is to create an array that describes the ranges for the initial values,
    # then use cross_between to filter out values from a fully-populated array.
    progress_tracker.start_step('initial')
    cmd = "create temp array %s <low:int64, high:int64>[i=0:%d,%d,0]" % (
        ranges_array_name, num_chunks - 1, num_chunks)
    scidb_afl.afl(iquery_cmd, cmd)
    for c in xrange(num_chunks):
        cmd = ("insert(redimension(apply(build(<adummyattribute:bool>[adummydim=0:0,1,0],true), "
               "i, %d, low, %d, high, %d), %s), %s)" % (
                   c,
                   c * chunk_length,
                   c * chunk_length + initial_values_per_chunk - 1,
                   ranges_array_name,
                   ranges_array_name))
        scidb_afl.afl(iquery_cmd, cmd)
    cmd = ("store(cross_between(build(%s, iif(i%%2=0, %s(%s), %s(%s))), %s), %s)" % (
        array_name, type_name, even_value, type_name, odd_value,
        ranges_array_name, array_name))
    scidb_afl.afl(iquery_cmd, cmd)
    progress_tracker.end_step('initial')

    # Load the additional values.
    progress_tracker.start_step('new')
    if args.verbose:
        print "In each of the %d batches, one value will be appended to each of the %d chunks." % (
            new_values_per_chunk, num_chunks)
        print "Batch\tTime"
    for i in xrange(new_values_per_chunk):
        start_time = datetime.datetime.now()
        for c in xrange(num_chunks):
            index = c * chunk_length + i + initial_values_per_chunk
            value = (type_name + "(" + even_value + ")" if index % 2 == 0
                     else type_name + "(" + odd_value + ")")
            cmd = "op_set_cell_attr_1D(%s, i, %d, v, %s)" % (array_name, index, value)
            scidb_afl.afl(iquery_cmd, cmd)
        if args.verbose:
            seconds = scidb_progress.timedelta_total_seconds(
                datetime.datetime.now() - start_time)
            print "%d\t%f" % (i + 1, seconds)
    progress_tracker.end_step('new')

    # Remove the array.
    my_remove_arrays(iquery_cmd, tolerate_error=False)

    # Return 0.
    return 0
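# A hypothetical driver for my_test(), shown only to illustrate what the parameters mean;
# the real script builds args with argparse and picks its own sizes and types. The chunk
# sizes below are made up, chosen so that initial + appended values fit in each chunk.
def _run_example(args):
    # 4 chunks of length 100, 10 preloaded values per chunk, then 5 appended per chunk,
    # exercised once for int64 and once for bool (so even/odd map to true/false).
    for type_name in ('int64', 'bool'):
        my_test(args,
                num_chunks=4,
                chunk_length=100,
                initial_values_per_chunk=10,
                new_values_per_chunk=5,
                type_name=type_name)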