Example #1
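The first example is the synthesis step. It loads the stats blob saved for each slice, sums the line counters and merges the per-column min/max values, then records line counts and min/max metadata on the two dataset writers (dw for the kept lines, dw_spill for the spilled lines), drops the spill writer if nothing was spilled, and finally saves the combined stats with json_save.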
def real_synthesis(params, options, datasets, minmax_index, prepare_res, we_have_spill, save_discard=False):
	stats = DotDict(
		included_lines          = [0] * params.slices,
		discarded_lines         = [0] * params.slices,
		spilled_lines           = [0] * params.slices,
		virtually_spilled_lines = [0] * params.slices,
		split_date              = str(options.split_date) if options.split_date else None,
		discard_before_date     = str(options.discard_before_date) if options.discard_before_date else None,
	)
	minmax_per_slice = [{} for _ in range(params.slices)]
	def update_stats(data):
		# presumably one stats entry per source dataset processed by this slice (compare process_one below)
		for item in data.itervalues():
			stats.included_lines[sliceno] += item.counters[2]
			stats.discarded_lines[sliceno] += item.counters[1]
			if item.virtual_spill:
				stats.virtually_spilled_lines[sliceno] += item.counters[3]
			else:
				stats.spilled_lines[sliceno] += item.counters[3]
			update_minmax(minmax_per_slice[sliceno], item.minmax)
	def update_minmax(dest, src):
		# each value is a six-element list: three mins followed by three maxes, one pair per line class
		for name, lst0 in src.iteritems():
			lst1 = dest.get(name, lst0)
			mins = map(min, zip(lst0[:3], lst1[:3]))
			maxs = map(max, zip(lst0[3:], lst1[3:]))
			dest[name] = mins + maxs
	for sliceno in range(params.slices):
		update_stats(blob.load('stats', sliceno=sliceno))
	minmax = {}
	for item in minmax_per_slice:
		update_minmax(minmax, item)
	def minmax_select(offset, stringify=False):
		# offset selects the line class: 0 = discarded, 1 = included, 2 = spilled
		d = {}
		for k, v in minmax.iteritems():
			mn = v[offset]
			mx = v[3 + offset]
			if mn <= mx:
				if stringify and isinstance(mn, (date, time,)):
					d[k] = [str(mn), str(mx)]
				else:
					d[k] = [mn, mx]
		return d
	dw, dw_spill = prepare_res[:2]
	dw.set_minmax(None, minmax_select(minmax_index))
	dw_spill.set_minmax(None, minmax_select(2)) # offset 2 = the spilled class
	if save_discard:
		included_lines = stats.discarded_lines
	else:
		included_lines = stats.included_lines
	for sliceno in range(params.slices):
		dw.set_lines(sliceno, included_lines[sliceno])
		dw_spill.set_lines(sliceno, stats.spilled_lines[sliceno])
	if not we_have_spill:
		dw_spill.discard()
	stats.minmax_discarded = minmax_select(0, True)
	stats.minmax           = minmax_select(1, True)
	stats.minmax_spilled   = minmax_select(2, True)
	json_save(stats)
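A side note on the min/max bookkeeping above: each column maps to a six-element list holding three minimums followed by three maximums, one pair per line class (offset 0 = discarded, 1 = included, 2 = spilled, as the last three assignments show). The standalone sketch below (made-up data, written against Python 3 for brevity) mirrors the update_minmax merge:

from datetime import date

# Per-column layout: [min_discarded, min_included, min_spilled,
#                     max_discarded, max_included, max_spilled]
a = {'ts': [date(2016, 1, 1), date(2016, 2, 1), date(2016, 3, 1),
            date(2016, 1, 31), date(2016, 2, 29), date(2016, 3, 31)]}
b = {'ts': [date(2015, 12, 1), date(2016, 2, 15), date(2016, 3, 5),
            date(2016, 1, 15), date(2016, 3, 10), date(2016, 3, 20)]}

def merge(dest, src):
	# Same logic as update_minmax: element-wise min over the first three
	# slots, element-wise max over the last three.
	for name, lst0 in src.items():
		lst1 = dest.get(name, lst0)
		mins = list(map(min, zip(lst0[:3], lst1[:3])))
		maxs = list(map(max, zip(lst0[3:], lst1[3:])))
		dest[name] = mins + maxs

merged = {}
merge(merged, a)
merge(merged, b)
# merged['ts'][1] == date(2016, 2, 1) and merged['ts'][4] == date(2016, 3, 10),
# i.e. the overall min/max of the "included" class that minmax_select(1) returns.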
Example #2
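The second example does the per-slice work. It hands the column files of one source dataset to a cffi backend filter that routes each line to the new dataset, the spill dataset or the discard pile according to the configured date column, then reads back the per-column min/max files and line counters into a stats object of the same shape that real_synthesis above merges.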
def process_one(sliceno, options, source, prepare_res, data=None, save_discard=False):
	# Future improvement: Look at the old minmax to determine if we will get anything from reading this data
	dw, dw_spill, column_names, column_sizes, column_types, minmax_typeidx = prepare_res
	if data:
		assert data.version == 1
		data.seen_before = True
	else:
		data = empty_spilldata()
	d = Dataset(source, data.spill_ds)
	in_files = []
	out_files = []
	offsets = []
	if not save_discard:
		out_files += [ffi.NULL] * len(column_names) # don't save "too old" lines
	minmax_files = []
	minmax_d = {}
	for colname in column_names:
		out_fn = dw.column_filename(colname, sliceno).encode('ascii')
		in_fn = d.column_filename(colname, sliceno).encode('ascii')
		offset = d.columns[colname].offsets[sliceno] if d.columns[colname].offsets else 0
		in_files.append(ffi.new('char []', in_fn))
		out_files.append(ffi.new('char []', out_fn))
		offsets.append(offset)
		minmax_fn = out_fn + '_minmax'
		minmax_files.append(ffi.new('char []', minmax_fn))
		minmax_d[colname] = minmax_fn
	if save_discard:
		out_files += [ffi.NULL] * len(column_names) # don't save "good" lines (save discard instead)
	date_coltype = column_types[options.date_column]
	def date2cfmt(dt):
		if date_coltype == 'datetime':
			date0 = (dt.year << 14) | (dt.month << 10) | (dt.day << 5) | dt.hour
			date1 = (dt.minute << 26) | (dt.second << 20) | dt.microsecond
		elif date_coltype == 'date':
			date0 = (dt.year << 9) | (dt.month << 5) | dt.day
			date1 = 0
		elif date_coltype == 'time':
			date0 = 32277536 | dt.hour # (1970 << 14) | (1 << 10) | (1 << 5): dummy date 1970-01-01 in the datetime layout above
			date1 = (dt.minute << 26) | (dt.second << 20) | dt.microsecond
		else:
			raise Exception('Bad date_coltype type: ' + date_coltype)
		return date0, date1
	# three (date0, date1) pairs: previous process_date, discard_before_date and split_date (the last defaults to "far future")
	dates = [0, 0, 0, 0, 0xffffffff, 0xffffffff]
	stats = DotDict()
	if data.seen_before:
		dates[0:2] = date2cfmt(data.get('process_date', datetime.min))
	if (data.last_time or options.hard_spill) and not save_discard:
		for colname in column_names:
			out_fn = dw_spill.column_filename(colname, sliceno).encode('ascii')
			out_files.append(ffi.new('char []', out_fn))
		stats.virtual_spill = False
	else:
		# We still have to make sure the files exist, or we end up
		# with a broken dataset if only some slices wanted to spill.
		for colname in column_names:
			open(dw_spill.column_filename(colname, sliceno), 'ab').close()
		out_files += [ffi.NULL] * len(column_names)
		stats.virtual_spill = True
	# We are done reading `data` - update it for next iteration
	del data.seen_before
	data.process_date = datetime.min
	if options.discard_before_date:
		if options.split_date:
			assert options.discard_before_date < options.split_date
		dates[2:4] = date2cfmt(options.discard_before_date)
		data.process_date = options.discard_before_date
	if options.split_date:
		dates[4:6] = date2cfmt(options.split_date)
		data.process_date = max(data.process_date, options.split_date)
	counters = ffi.new('uint64_t [4]') # one for each class-enum (indexes 1-3 are read back as discarded/included/spilled in Example #1)
	res = backend.filter(len(in_files), in_files, offsets, out_files, minmax_files, column_sizes, counters, dates, minmax_typeidx, d.lines[sliceno])
	assert not res, "cffi converter returned error on data from " + source
	stats.version = 0
	stats.counters = list(counters)
	stats.minmax = {}
	for colname, fn in minmax_d.iteritems():
		if exists(fn):
			with type2iter[column_types[colname]](fn) as it:
				stats.minmax[colname] = list(it)
			unlink(fn)
	# If there is at most 2% left, spill it next time.
	# Or if there is at most 10% left and we have read it at least 8 times.
	# Or if there is at most 20% left and we have read it at least 16 times.
	# A reasonable balance between re-reading and re-writing, one hopes.
	data.counter += 1
	total_lines = sum(counters)
	data.last_time = (counters[3] <= total_lines / 50 or
		(data.counter >= 8 and counters[3] <= total_lines / 10) or
		(data.counter >= 16 and counters[3] <= total_lines / 5)
	)
	# We will not need to read this dataset again if no lines were spilled,
	# or if the spill was actually written out this time (not just virtual).
	if not counters[3] or not stats.virtual_spill:
		data = None
	return data, stats
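For reference, here is what the date2cfmt packing produces for the 'datetime' column type, as a small standalone check (the example value is made up). The bit layout puts the most significant field in the highest bits, so numeric comparison of the (date0, date1) pair matches chronological order, which is presumably what lets the backend compare dates as plain integers.

from datetime import datetime

dt = datetime(2016, 3, 14, 15, 9, 26, 535897)
# date0: year above bit 14, then month, day and hour.
date0 = (dt.year << 14) | (dt.month << 10) | (dt.day << 5) | dt.hour
# date1: minute, second and microsecond (microseconds need 20 bits).
date1 = (dt.minute << 26) | (dt.second << 20) | dt.microsecond
assert date0 == 33033679
assert date1 == 631778649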