def test_scidb_progress_module():
    """Smoke-test the public pieces of scidblib.scidb_progress.

    Covers datetime_as_str(), parsing and ordering of VersionAndDate objects,
    and one register/start/end cycle of ProgressTracker.  A failed check
    surfaces as an AssertionError.
    """
    print('*** testing scidblib.scidb_progress...')
    print('datetime_as_str = ' + str(scidb_progress.datetime_as_str()))

    # Pull the "version (date)" tail out of a longer banner string.
    banner = 'SciDB Graph500 perf-test: 14.6.7610 (2014-6-14)'
    pattern = r'^.*(' + scidb_progress.VersionAndDate.re_whole + r')'
    found = re.match(pattern, banner)
    assert found
    version_text = found.group(1)

    def fresh():
        # Every object starts from the same parsed text and is then tweaked.
        return scidb_progress.VersionAndDate(version_text)

    mid = fresh()

    smallest = fresh()
    smallest.major = 13

    small = fresh()
    small.revision = 7608

    large = fresh()
    large.minor = 9

    # Only checking that the string conversion can be invoked.
    small.__str__()

    # Expected ordering: smallest < small < mid < large.
    assert smallest.earlier_than(small)
    assert small.earlier_than(mid)
    assert mid.earlier_than(large)
    assert smallest.earlier_than(large)
    print('VersionAndDate passed unit test.')

    # One full step life cycle on a default-constructed tracker.
    tracker = scidb_progress.ProgressTracker()
    tracker.register_step('one', 'the first step')
    tracker.start_step('one')
    tracker.end_step('one')
    print('')
示例#2
0
def test_scidb_progress_module():
    """Smoke-test the public pieces of scidblib.scidb_progress.

    Covers datetime_as_str(), the VersionAndDate regular expression and
    ordering, and one register/start/end cycle of ProgressTracker.  A failed
    check surfaces as an AssertionError.
    """
    print('*** testing scidblib.scidb_progress...')
    print('datetime_as_str = ' + str(scidb_progress.datetime_as_str()))

    # The version-and-date tail of a longer string must be recognised.
    sample = 'SciDB Graph500 perf-test: 15.12 (2016-1-1)'
    found = re.match(
        r'^.*(' + scidb_progress.VersionAndDate.re_whole + r')', sample)
    assert found
    assert found.group(1) == '15.12 (2016-1-1)'

    # Seven objects, all parsed from '15.12 (2016-1-1)' and then adjusted so
    # that vd[0] < vd[1] < vd[2] < vd[3] < vd[4] < vd[5] == vd[6]:
    #   vd[0] = 14.12 (2015-1-1)
    #   vd[1] = 15.11 (2015-1-1)
    #   vd[2] = 15.12 (2015-1-1)
    #   vd[3] = 15.12 (2016-1-1)
    #   vd[4] = 15.12 (2016-2-1)
    #   vd[5] = 15.12 (2016-2-3)
    #   vd[6] = 15.12 (2016-2-3)
    vd = [scidb_progress.VersionAndDate('15.12 (2016-1-1)') for _ in range(7)]
    adjustments = (
        (0, 'major', 14),
        (0, 'year', 2015),
        (1, 'minor', 11),
        (1, 'year', 2015),
        (2, 'year', 2015),
        (4, 'month', 2),
        (5, 'month', 2),
        (5, 'day', 3),
        (6, 'month', 2),
        (6, 'day', 3),
    )
    for position, field, value in adjustments:
        setattr(vd[position], field, value)

    # Each of the first five neighbouring pairs is strictly ordered.
    for older, newer in zip(vd[:5], vd[1:6]):
        assert older.earlier_than(newer)
        assert not newer.earlier_than(older)

    # The last two are equal, so neither is earlier than the other.
    assert not vd[5].earlier_than(vd[6])
    assert not vd[6].earlier_than(vd[5])

    # Only checking that the string conversion can be invoked.
    vd[0].__str__()

    print('VersionAndDate passed unit test.')

    # One full step life cycle on a default-constructed tracker.
    tracker = scidb_progress.ProgressTracker()
    tracker.register_step('one', 'the first step')
    tracker.start_step('one')
    tracker.end_step('one')
    print('')
示例#3
0
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    For every dimension parsed from args.raw_dims, min_coord, max_coord and a
    distinct count are queried from args.load_array.  Any dim_low, dim_high or
    chunk_overlap given as '?' is then filled in, and every chunk_length given
    as '?' is calculated from the overall distinct count and
    args.desired_values_per_chunk.  The resulting schema is printed.

    @param args  the result of argparse.ArgumentParser.parse_args().
    @return 0
    @exception AppError if anything goes wrong.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array
    raw_dims_str = args.raw_dims

    calculated_dims = parse_dimensions(raw_dims_str)
    dbg("Calculated dims:", [x.to_tuple() for x in calculated_dims])

    # Initialize the progress tracker
    progress_tracker = scidb_progress.ProgressTracker(
        sys.stdout,
        '',
        args.verbose,  # if_print_start
        args.verbose,  # if_print_end
        args.verbose  # if_print_skip
    )
    progress_tracker.register_step(
        'min_max_dc',
        'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step('overall_dc',
                                   'Get overall ApproxDC from load_array.')
    progress_tracker.register_step(
        'calculate', 'Calculate and adjust dimension specification.')

    # S = indexes of dims where chunk_length is Specified;
    # N = indexes of dims where chunk_length is Not specified.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)
    dbg("S:", S)
    dbg("N:", N)

    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)
    dbg("names...:", names_in_load_array.list)

    # for each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i]
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims:
        index = names_in_load_array.find_index(the_dim.dim_name)
        the_name_in_load_array = names_in_load_array.list[index]

        if the_name_in_load_array.is_dim:
            # The name is a dimension of load_array: group by it first, then
            # copy the coordinate into a temporary attribute to aggregate on.
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(' + load_array + ', count(*), ' +
                   the_dim.dim_name + '), ' + tmp + ', ' + the_dim.dim_name +
                   '), min(' + tmp + '), max(' + tmp + '), count(*))')
        else:
            # The name is an attribute of load_array: aggregate it directly.
            cmd = ('aggregate(' + load_array + ', min(' + the_dim.dim_name +
                   '), max(' + the_dim.dim_name + '), approxdc(' +
                   the_dim.dim_name + '))')
        dbg("Cmd:", cmd)
        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(
            iquery_cmd, cmd, 3)
        dbg("(min,max,dc):", (min_coord, max_coord, distinct_count))

        # Only the conversions can raise ValueError, so only they are guarded.
        try:
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
        except ValueError:
            raise scidblib.AppError('Error: I cannot proceed because for ' +
                                    the_dim.dim_name + ' in array ' +
                                    load_array + ', not all of min_coord (=' +
                                    min_coord + '), max_coord (=' + max_coord +
                                    '), and distinct_count (=' +
                                    distinct_count + ') are integers.')
        if args.verbose:
            print('For ' + the_dim.dim_name +
                  ', min_coord=' + str(min_coord_int) +
                  ', max_coord=' + str(max_coord_int) +
                  ', distinct_count=' + str(distinct_count_int))
        the_dim.set_min_max_dc(min_coord_int, max_coord_int,
                               distinct_count_int)
    progress_tracker.end_step('min_max_dc')

    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0

    # Generate string_concat_of_dim_values in the form of:
    # string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_concat_of_dim_values = ' + \'|\' + '.join(
        ['string(' + the_dim.dim_name + ')' for the_dim in calculated_dims])

    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(' + load_array + ', ' + tmp + ', ' +
           string_concat_of_dim_values + '), approxdc(' + tmp + '))')
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(
        iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
    except ValueError:
        raise scidblib.AppError(
            'Error: The query to get overall_distinct_count failed to return an integer.'
        )
    # Cap the approxdc() result at the count(*) of the array.
    if overall_distinct_count > overall_count:
        overall_distinct_count = overall_count
    if args.verbose:
        print('overall_distinct_count=' + str(overall_distinct_count))
    progress_tracker.end_step('overall_dc')

    progress_tracker.start_step('calculate')

    # Shortcut: if |N| == 0, we are done.
    if not N:
        # Close the step that was just started; otherwise this early return
        # would leave 'calculate' open, unlike the normal path below.
        progress_tracker.end_step('calculate')
        print(scidb_schema.unparse(
            dims=[x.to_tuple() for x in calculated_dims]))
        return 0

    # Set num_chunks_from_n: start from the total number of chunks wanted and
    # divide out the chunk counts of the dims whose chunk_length is specified.
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count,
                                                  int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(
            num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1

    # The even split of num_chunks_from_n across the dims in N, and (unless
    # keep_shape is set) the geometric mean of their distinct counts, are the
    # same for every dimension in N, so they are computed once up front.
    even_chunk_count = math.pow(num_chunks_from_n, 1.0 / len(N))
    geomean = None
    if not args.keep_shape:
        product = 1.0
        for k in N:
            product *= calculated_dims[k].distinct_count
        geomean = math.pow(product, 1.0 / len(N))

    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims[i]
        chunk_count = even_chunk_count
        if not args.keep_shape:
            # Scale by this dim's distinct count relative to the geomean.
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count < 1:
            chunk_count = 1.0
        the_dim.chunk_length = int(
            math.ceil(
                (the_dim.max_coord - the_dim.min_coord + 1) / chunk_count))
        if chunk_count > 1:
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length,
                args.grid_threshold,
                use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')

    # Print result.
    print(scidb_schema.unparse(dims=[x.to_tuple() for x in calculated_dims]))
    return 0
示例#4
0
def my_test(args, num_chunks, chunk_length, initial_values_per_chunk,
            new_values_per_chunk, type_name):
    """Append alternating values to the end of every chunk of a temp array.

    The array is first populated with initial_values_per_chunk cells at the
    start of each chunk; then, in new_values_per_chunk batches, one more cell
    is written to every chunk.  Both arrays are removed at the end.

    @param args                      command-line parameters.
    @param num_chunks                how many chunks there are.
    @param chunk_length              the chunk length.
    @param initial_values_per_chunk  the number of initial values per chunk.
    @param new_values_per_chunk      how many values to insert into each chunk.
    @param type_name                 the data type.
    @return 0
    """
    # Literals written at even / odd coordinates.
    if type_name == "bool":
        even_value, odd_value = "true", "false"
    else:
        even_value, odd_value = "0", "1"

    # Set up progress reporting for the two phases.
    progress_tracker = scidb_progress.ProgressTracker(
        if_print_start=args.verbose, if_print_end=args.verbose)
    progress_tracker.register_step('initial', 'Load initial values.')
    progress_tracker.register_step('new', 'Insert new values.')

    # Start from a clean slate; the arrays may not exist yet.
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    my_remove_arrays(iquery_cmd, tolerate_error=True)

    # Create the array under test.
    last_coord = chunk_length * num_chunks - 1
    create_main = "create temp array %s <v:%s>[i=0:%d,%d,0]" % (
        array_name, type_name, last_coord, chunk_length)
    scidb_afl.afl(iquery_cmd, create_main)

    # Phase 1: initial values.
    # A helper array holds one [low, high] range per chunk; cross_between then
    # keeps only those ranges out of a fully-populated build() of the array.
    progress_tracker.start_step('initial')
    create_ranges = "create temp array %s <low:int64, high:int64>[i=0:%d,%d,0]" % (
        ranges_array_name, num_chunks - 1, num_chunks)
    scidb_afl.afl(iquery_cmd, create_ranges)

    insert_range_template = (
        "insert(redimension(apply(build(<adummyattribute:bool>[adummydim=0:0,1,0],true), i, %d, low, %d, high, %d), %s), %s)"
    )
    for chunk_no in xrange(num_chunks):
        low = chunk_no * chunk_length
        high = chunk_no * chunk_length + initial_values_per_chunk - 1
        scidb_afl.afl(
            iquery_cmd,
            insert_range_template % (chunk_no, low, high,
                                     ranges_array_name, ranges_array_name))

    populate = (
        "store(cross_between(build(%s, iif(i%%2=0, %s(%s), %s(%s))), %s), %s)"
        % (array_name, type_name, even_value, type_name, odd_value,
           ranges_array_name, array_name))
    scidb_afl.afl(iquery_cmd, populate)
    progress_tracker.end_step('initial')

    # Phase 2: append one value per chunk in each batch.
    progress_tracker.start_step('new')
    if args.verbose:
        print("In each of the %d batches, one value will be appended to each of the %d chunks." % (
            new_values_per_chunk, num_chunks))
        print("Batch\tTime")
    for batch in xrange(new_values_per_chunk):
        batch_started = datetime.datetime.now()
        for chunk_no in xrange(num_chunks):
            # Next free coordinate in this chunk for this batch.
            index = chunk_no * chunk_length + batch + initial_values_per_chunk
            if index % 2 == 0:
                value = type_name + "(" + even_value + ")"
            else:
                value = type_name + "(" + odd_value + ")"
            set_cell = "op_set_cell_attr_1D(%s, i, %d, v, %s)" % (
                array_name, index, value)
            scidb_afl.afl(iquery_cmd, set_cell)
        if args.verbose:
            elapsed = datetime.datetime.now() - batch_started
            seconds = scidb_progress.timedelta_total_seconds(elapsed)
            print("%d\t%f" % (batch + 1, seconds))
    progress_tracker.end_step('new')

    # Clean up; this time a failure to remove is an error.
    my_remove_arrays(iquery_cmd, tolerate_error=False)

    return 0