Example #1
def analysis(sliceno, params, prepare_res):
    spilldata = {}
    stats = {}
    we_have_spill = False
    if datasets.previous:
        prev_spilldata = blob.load('spilldata',
                                   jobid=datasets.previous,
                                   sliceno=sliceno)
        for source, data in prev_spilldata:
            spilldata[source], stats[source] = process_one(
                sliceno, options, source, prepare_res, data)
            we_have_spill |= not stats[source].virtual_spill
    if datasets.source:
        prev_params = job_params(datasets.previous, default_empty=True)
        for source in datasets.source.chain(
                stop_ds=prev_params.datasets.source):
            spilldata[source], stats[source] = process_one(
                sliceno, options, source, prepare_res)
            we_have_spill |= not stats[source].virtual_spill
    spilldata = [(k, v) for k, v in spilldata.iteritems() if v]
    if we_have_spill:
        spilldata.append((params.jobid, empty_spilldata('SPILL')))
    blob.save(spilldata, 'spilldata', sliceno=sliceno, temp=False)
    blob.save(stats, 'stats', sliceno=sliceno, temp=False)
    return we_have_spill
def analysis(sliceno, prepare_res):
    stats = {}
    prev_spilldata = blob.load('spilldata',
                               jobid=datasets.source,
                               sliceno=sliceno)
    source_params = job_params(datasets.source)
    for source, data in prev_spilldata:
        _, stats[source] = a_dataset_datesplit.process_one(
            sliceno,
            source_params.options,
            source,
            prepare_res,
            data,
            save_discard=True)
    source_params = job_params(datasets.source)
    prev_params = job_params(source_params.datasets.previous,
                             default_empty=True)
    for source in Dataset(source_params.datasets.source).chain(
            stop_ds=prev_params.datasets.source):
        _, stats[source] = a_dataset_datesplit.process_one(
            sliceno,
            source_params.options,
            source,
            prepare_res,
            save_discard=True)
    blob.save(stats, 'stats', sliceno=sliceno, temp=False)
Example #3
def _ds_load(obj):
    n = unicode(obj)
    if n not in _ds_cache:
        # Load and fix up the dataset pickle only on the first request.
        _ds_cache[n] = _v2_columntypefix(
            blob.load(obj._name('pickle'), obj.jobid))
        # Any datasets bundled under the pickle's 'cache' key go into the cache too.
        _ds_cache.update(_ds_cache[n].get('cache', ()))
    return _ds_cache[n]
Example #4
def real_synthesis(params, options, datasets, minmax_index, prepare_res, we_have_spill, save_discard=False):
	stats = DotDict(
		included_lines          = [0] * params.slices,
		discarded_lines         = [0] * params.slices,
		spilled_lines           = [0] * params.slices,
		virtually_spilled_lines = [0] * params.slices,
		split_date              = str(options.split_date) if options.split_date else None,
		discard_before_date     = str(options.discard_before_date) if options.discard_before_date else None,
	)
	minmax_per_slice = [{} for _ in range(params.slices)]
	def update_stats(data):
		for item in data.itervalues():
			stats.included_lines[sliceno] += item.counters[2]
			stats.discarded_lines[sliceno] += item.counters[1]
			if item.virtual_spill:
				stats.virtually_spilled_lines[sliceno] += item.counters[3]
			else:
				stats.spilled_lines[sliceno] += item.counters[3]
			update_minmax(minmax_per_slice[sliceno], item.minmax)
	def update_minmax(dest, src):
		for name, lst0 in src.iteritems():
			lst1 = dest.get(name, lst0)
			mins = map(min, zip(lst0[:3], lst1[:3]))
			maxs = map(max, zip(lst0[3:], lst1[3:]))
			dest[name] = mins + maxs
	for sliceno in range(params.slices):
		update_stats(blob.load('stats', sliceno=sliceno))
	minmax = {}
	for item in minmax_per_slice:
		update_minmax(minmax, item)
	def minmax_select(offset, stringify=False):
		d = {}
		for k, v in minmax.iteritems():
			mn = v[offset]
			mx = v[3 + offset]
			if mn <= mx:
				if stringify and isinstance(mn, (date, time,)):
					d[k] = [str(mn), str(mx)]
				else:
					d[k] = [mn, mx]
		return d
	dw, dw_spill = prepare_res[:2]
	dw.set_minmax(None, minmax_select(minmax_index))
	dw_spill.set_minmax(None, minmax_select(2))
	if save_discard:
		included_lines = stats.discarded_lines
	else:
		included_lines = stats.included_lines
	for sliceno in range(params.slices):
		dw.set_lines(sliceno, included_lines[sliceno])
		dw_spill.set_lines(sliceno, stats.spilled_lines[sliceno])
	if not we_have_spill:
		dw_spill.discard()
	stats.minmax_discarded = minmax_select(0, True)
	stats.minmax           = minmax_select(1, True)
	stats.minmax_spilled   = minmax_select(2, True)
	json_save(stats)
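Each entry in minmax above is a six-element list: positions 0-2 hold the per-bucket minima (discarded, included, spilled, matching the minmax_select(0/1/2) calls at the end) and positions 3-5 the corresponding maxima, which is why minmax_select reads v[offset] and v[3 + offset]. A small worked example of the merge done by update_minmax, with illustrative values (comprehensions are used instead of the original's Python 2 list-returning map so the sketch also runs on Python 3):

lst0 = [1, 5, 2, 9, 8, 7]   # three minima (discarded/included/spilled), then three maxima
lst1 = [3, 4, 6, 5, 9, 6]
mins = [min(a, b) for a, b in zip(lst0[:3], lst1[:3])]   # [1, 4, 2]
maxs = [max(a, b) for a, b in zip(lst0[3:], lst1[3:])]   # [9, 9, 7]
merged = mins + maxs                                     # [1, 4, 2, 9, 9, 7]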
def one_slice(sliceno):
    first = True
    updater = globals()['upd_' + options.flavour]
    for pickle in options.pickles:
        tmp = load(pickle, sliceno=sliceno)
        if first:
            res = tmp
            first = False
        else:
            updater(res, tmp)
    save(res, options.resultname, sliceno=sliceno)
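one_slice resolves its merge function by name from globals(). A minimal sketch of the same dispatch pattern; the upd_sum and upd_max flavours here are hypothetical and not taken from the listing:

def upd_sum(res, tmp):
    # Hypothetical 'sum' flavour: add values per key.
    for k, v in tmp.items():
        res[k] = res.get(k, 0) + v

def upd_max(res, tmp):
    # Hypothetical 'max' flavour: keep the larger value per key.
    for k, v in tmp.items():
        res[k] = max(res.get(k, v), v)

flavour = 'sum'                        # the method reads this from options.flavour
updater = globals()['upd_' + flavour]  # name-based dispatch, as in one_slice
res = {'a': 1}
updater(res, {'a': 2, 'b': 3})         # res is now {'a': 3, 'b': 3}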
Example #6
def analysis(sliceno, prepare_res):
    key_filter, value_filter = prepare_res
    d = blob.load(jobid=jobids.previous,
                  sliceno=sliceno,
                  default=defaultdict(set))
    if options.key_filter:
        # Stay a defaultdict so keys that pass the filter but are new in this source can still be added below.
        d = defaultdict(set, {k: v for k, v in d.iteritems() if k in key_filter})
    iterator = datasets.source.iterate_chain(
        sliceno,
        (
            options.key_column,
            options.value_column,
        ),
        stop_jobid={jobids.previous: 'source'},
    )
    # The filter combinations break out into four loop versions for shorter runtime
    if options.value_filter:
        # Remove anything that's not in the filter
        for k, v in d.items():
            v = v & value_filter
            if v:
                d[k] = v
            else:
                del d[k]
        # This lets us reuse the same str object for the same value (smaller pickles)
        value_filter = {v: v for v in value_filter}
        if options.key_filter:
            for k, v in iterator:
                if k in key_filter and v in value_filter:
                    d[k].add(value_filter[v])
        else:
            for k, v in iterator:
                if v in value_filter:
                    d[k].add(value_filter[v])
    else:
        reuse = {}
        if options.key_filter:
            for k, v in iterator:
                if k in key_filter:
                    d[k].add(reuse.setdefault(v, v))
        else:
            for k, v in iterator:
                d[k].add(reuse.setdefault(v, v))
    blob.save(d, sliceno=sliceno, temp=False)
    blob.save(set(d), 'keyset', sliceno=sliceno, temp=False)
    blob.save(Counter(len(v) for v in d.itervalues()),
              'setsizehist',
              sliceno=sliceno,
              temp=False)
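The reuse dict (and value_filter rebuilt as {v: v}) maps every value back to one canonical object, so equal strings share a single object and are stored only once within a pickle. A minimal, self-contained illustration of that trick:

reuse = {}
a = ''.join(['2021-', '01-01'])   # two equal but distinct str objects
b = ''.join(['2021', '-01-01'])
x = reuse.setdefault(a, a)        # first occurrence is remembered and returned
y = reuse.setdefault(b, b)        # later equal values map back to the remembered object
assert x == y and x is y          # one shared object, so pickling stores it only once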
def synthesis():
    sum = 0
    jobs = datasets.source.chain(length=options.chain_length,
                                 stop_jobid=datasets.stop)
    for src in jobs:
        jid = build('dataset_checksum',
                    options=dict(columns=options.columns, sort=options.sort),
                    datasets=dict(source=src))
        data = blob.load(jobid=jid)
        sum ^= data.sum
    print("Total: %016x" % (sum, ))
    return DotDict(sum=sum,
                   columns=data.columns,
                   sort=options.sort,
                   sources=jobs)
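Combining the per-dataset checksums with ^= makes the total independent of the order in which the chain is walked. A tiny illustration with made-up checksum values:

parts = [0x1234, 0xbeef, 0x00ff]   # illustrative per-dataset checksums
total = 0
for p in parts:
    total ^= p
# XOR is commutative and associative, so any ordering gives the same total.
assert total == 0x00ff ^ 0xbeef ^ 0x1234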
Example #8
def synthesis(params):
    setsizehist = Counter()
    for sliceno in range(params.slices):
        setsizehist.update(blob.load('setsizehist', sliceno=sliceno))
    blob.save(setsizehist, 'setsizehist')
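Counter.update adds counts instead of replacing them, which is what makes this per-slice merge correct. A small illustration with made-up per-slice histograms:

from collections import Counter
setsizehist = Counter()
setsizehist.update(Counter({1: 4, 2: 1}))   # set sizes seen in slice 0
setsizehist.update(Counter({1: 3, 5: 2}))   # set sizes seen in slice 1
assert setsizehist == Counter({1: 7, 2: 1, 5: 2})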
Example #9
def prepare(jobids):
    key_filter = blob.load(options.key_filter, default=set())
    value_filter = blob.load(options.value_filter, default=set())
    return key_filter, value_filter
Example #10
def main(urd):
    resetlocale()

    if False:
        # One BILLION rows
        # This takes about half an hour on a fast machine
        num_rows = int(1e7)
        num_datasets = 100
    else:
        # One MILLION rows
        num_rows = int(1e6)
        num_datasets = 10

    # Create datasets
    print("\x1b[1m(1) Create chain of datasets.\x1b[m")
    jid = None
    for _ in range(num_datasets):
        jid = urd.build('example_perf_gendata',
                        options=dict(num_rows=num_rows),
                        datasets=dict(previous=jid))

    # Export chain of datasets to CSV-file.
    print("\x1b[1m(2) Export dataset chain to CSV file.\x1b[m")
    jid = urd.build('csvexport',
                    datasets=dict(source=jid),
                    options=dict(filename='out.csv.gz', chain_source=True))

    filename = resolve_jobid_filename(jid, 'out.csv.gz')
    print('Exported file stored in \"%s\"' % (filename, ))

    # Import and type previously exported CSV-file.
    print("\x1b[1m(3) Import dataset from CVS file.\x1b[m")
    jid = urd.build('csvimport', options=dict(filename=filename))
    opts = dict(
        column2type={
            'a string': 'ascii',
            'large number': 'number',
            'small number': 'number',
            'small integer': 'int32_10',  # you must specify base for integers
            'gauss number': 'number',
            'gauss float': 'float64',
        }, )
    print("\x1b[1m(4) Type imported dataset.\x1b[m")
    jid = urd.build('dataset_type', datasets=dict(source=jid), options=opts)

    # Sum all values in a column.  Repeat for a set of columns with different types.
    print("\x1b[1m(5) Run some methods on the typed dataset.\x1b[m")
    jid_single = jid
    source = jid_single
    for colname in ('small number', 'small integer', 'large number',
                    'gauss number', 'gauss float'):
        print(colname)
        jid = urd.build('example_perf_sum',
                        datasets=dict(source=source),
                        options=dict(colname=colname),
                        name='sum ' + colname)
        jid = urd.build('example_perf_sum_positive',
                        datasets=dict(source=source),
                        options=dict(colname=colname),
                        name='sum positive ' + colname)

    # Compute histograms of a column
    print('histogram')
    jid = urd.build('example_perf_histogram',
                    datasets=dict(source=source),
                    options=dict(colname='gauss number'),
                    name='histogram_number')
    jid = urd.build('example_perf_histogram',
                    datasets=dict(source=source),
                    options=dict(colname='gauss float'),
                    name='histogram_float')

    # Find string
    print('find string')
    jid = urd.build('example_perf_find_string',
                    datasets=dict(source=source),
                    options=dict(colname='a string', text='ExAx'),
                    name='find_string')
    print(
        "Number of lines containing string \"%s\" is %d." %
        (job_params(jid).options['text'], blob.load(jobid=jid)), )

    # Print resulting profiling information
    from automata_common import profile_jobs
    print()

    def pl(text, time):
        print("%-30s %10.3f %14s" % (
            text,
            time,
            '{0:n}'.format(round(num_rows * num_datasets / time)),
        ))

    print()
    print('-' * 56)
    print("operation                       exec time         rows/s")
    print()
    pl('csvexport', profile_jobs(urd.joblist.find('csvexport')))
    print()
    pl(
        'reimport total',
        profile_jobs(
            urd.joblist.find('csvimport') + urd.joblist.find('dataset_type')))
    pl("   csvimport         ", profile_jobs(urd.joblist.find('csvimport')))
    pl("   type              ", profile_jobs(urd.joblist.find('dataset_type')))
    print()
    print("sum")
    pl("  small number       ",
       profile_jobs(urd.joblist.find('sum small number')))
    pl("  small integer      ",
       profile_jobs(urd.joblist.find('sum small integer')))
    pl("  large number       ",
       profile_jobs(urd.joblist.find('sum large number')))
    pl("  gauss number       ",
       profile_jobs(urd.joblist.find('sum gauss number')))
    pl("  gauss float        ",
       profile_jobs(urd.joblist.find('sum gauss float')))
    print()
    print("sum positive")
    pl("  small number       ",
       profile_jobs(urd.joblist.find('sum positive small number')))
    pl("  small integer      ",
       profile_jobs(urd.joblist.find('sum positive small integer')))
    pl("  large number       ",
       profile_jobs(urd.joblist.find('sum positive large number')))
    pl("  gauss number       ",
       profile_jobs(urd.joblist.find('sum positive gauss number')))
    pl("  gauss float        ",
       profile_jobs(urd.joblist.find('sum positive gauss float')))
    print()
    print("histogram")
    pl("  number             ",
       profile_jobs(urd.joblist.find('histogram_number')))
    pl("  float              ",
       profile_jobs(urd.joblist.find('histogram_float')))
    print()
    pl("find string          ", profile_jobs(urd.joblist.find('find_string')))
    print()
    print("Total test time                %10.3f" %
          (profile_jobs(urd.joblist), ))
    print()
    print('Example size is %s lines.' %
          ('{0:n}'.format(num_datasets * num_rows), ))
    print('Number of slices is %d.' % (urd.info.slices, ))
    print('-' * 56)
Example #11
def ck(jid, method="dataset_checksum", **kw):
    jid = subjobs.build(method, datasets=dict(source=jid), options=kw)
    return blob.load(jobid=jid).sum
def main(urd):

    # Example 1.  Create a chain of datasets containing random data.
    jid_prev = None
    for n in range(5):
        jid_ds = urd.build(
            'example1_create_dataset',
            datasets=dict(previous=jid_prev),
            options=dict(approx_rows=100000, seed=n),
            name='Created_number_%s' % (n, ),
        )
        jid_prev = jid_ds

    # Example 2.  Export the last dataset in the chain to a tab
    #             separated textfile.
    jid_exp = urd.build(
        'csvexport',
        datasets=dict(source=jid_ds),
        options=dict(filename='random.tsv', separator='\t'),
    )
    filename = resolve_jobid_filename(jid_exp, 'random.tsv')
    print('Exported file stored in \"%s\"' % (filename, ))

    # Example 3.  Import the tab separated textfile and type it
    jid_imp = urd.build(
        'csvimport',
        options=dict(filename=filename, separator='\t',
                     labelsonfirstline=True),
    )
    jid_typ = urd.build(
        'dataset_type',
        datasets=dict(source=jid_imp),
        options=dict(column2type=dict(rflt='number', rint='number')),
    )

    # Example 4.  Run a method computing the average of a column, in a
    #             loop, one column at a time.  The column name is an
    #             input parameter.
    for column in Dataset(jid_typ).columns:
        jid_avg = urd.build(
            'example1_calc_average',
            datasets=dict(source=jid_typ),
            options=dict(column=column),
        )
        (s, n) = blob.load(jobid=jid_avg)
        print("Column %s:  sum=%f, length=%d, average=%f" %
              (column, s, n, s / n))

    # Example 5.  Create a new column that is the product of two
    #             existing columns.
    jid_add = urd.build(
        'example1_add_column',
        datasets=dict(source=jid_typ),
    )

    # Example 6.  Export a dataset with named columns in specified
    #             order.
    jid_add_exp = urd.build(
        'csvexport',
        datasets=dict(source=jid_add),
        options=dict(filename='prod.csv', labels=(
            'prod',
            'rflt',
            'rint',
        )),
    )

    print(urd.joblist.pretty)