Example #1
    def generate_validate(self, data_type, max_iteration, shuffle=True):
        """Generate mini-batch data for validation.
        """

        batch_size = self.batch_size

        if data_type == 'train':
            indexes = np.array(self.train_indexes)
            x = self.train_x
            y = self.train_y

        elif data_type == 'validate':
            assert len(self.validate_house_list) > 0
            indexes = np.array(self.validate_indexes)
            x = self.validate_x
            y = self.validate_y

        else:
            raise ValueError("Incorrect data_type: {}".format(data_type))

        if shuffle:
            self.validate_random_state.shuffle(indexes)

        iteration = 0
        pointer = 0

        while pointer < len(indexes):

            # Stop once max_iteration batches have been yielded
            if iteration == max_iteration:
                break

            # Get batch indexes
            batch_indexes = indexes[pointer:pointer + batch_size]
            pointer += batch_size

            iteration += 1

            batch_x_indexes_2d = batch_indexes[:, None] + np.arange(
                self.seq_len + self.width - 1)
            batch_y_indexes_2d = batch_indexes[:, None] + np.arange(
                self.seq_len // 2, self.seq_len // 2 + self.width)

            batch_x = x[batch_x_indexes_2d]
            batch_y = y[batch_y_indexes_2d]

            # Normalize input
            batch_x = self.transform(batch_x)
            if self.binary_threshold is not None:
                batch_y = binarize(batch_y, self.binary_threshold)
            else:
                batch_y = self.transform(batch_y)

            yield batch_x, batch_y
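
The `batch_indexes[:, None] + np.arange(...)` expressions above build every window in one vectorized step: broadcasting a column of start indices against a row of offsets yields a 2-D index array, so a single fancy-indexing call slices all windows at once. A minimal sketch of that trick with made-up sizes (`seq_len = 4`, `width = 3` are illustrative, not the project's values):

import numpy as np

x = np.arange(20)                    # stand-in 1-D signal
batch_indexes = np.array([0, 5, 9])  # window start positions
seq_len, width = 4, 3

# (batch, 1) + (seq_len + width - 1,) broadcasts to (batch, seq_len + width - 1)
batch_x_indexes_2d = batch_indexes[:, None] + np.arange(seq_len + width - 1)
print(x[batch_x_indexes_2d].shape)   # (3, 6): one input window per start

# The target window covers only the centre of each input window
batch_y_indexes_2d = batch_indexes[:, None] + np.arange(
    seq_len // 2, seq_len // 2 + width)
print(x[batch_y_indexes_2d])         # (3, 3), offsets 2..4 from each start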
Example #2
    def _generate_balanced(self):
        """Generate mini-batch data for training using balanced data.
        """
        logging.info('----balance generation----')
        batch_size = self.batch_size

        indexes = np.array(self.train_indexes)

        positive_size = int(self.batch_size * self.balance_positive)

        target_values = self.train_y[indexes]
        indexes_on = indexes[target_values >= self.balance_threshold]
        indexes_off = indexes[target_values < self.balance_threshold]

        # Start both cursors past the end so the first pass shuffles each pool
        i_on = len(indexes_on)
        i_off = len(indexes_off)
        while True:

            if i_on + positive_size > len(indexes_on):
                i_on = 0
                self.random_state.shuffle(indexes_on)

            if i_off + batch_size - positive_size > len(indexes_off):
                i_off = 0
                self.random_state.shuffle(indexes_off)

            # Get batch indexes
            batch_indexes = np.concatenate(
                (indexes_on[i_on:i_on + positive_size],
                 indexes_off[i_off:i_off + batch_size - positive_size]),
                axis=0)
            batch_x_indexes_2d = batch_indexes[:, None] + np.arange(
                self.seq_len + self.width - 1)
            batch_y_indexes_2d = batch_indexes[:, None] + np.arange(
                self.seq_len // 2, self.seq_len // 2 + self.width)
            batch_x = self.train_x[batch_x_indexes_2d]
            batch_y = self.train_y[batch_y_indexes_2d]
            # Normalize input
            batch_x = self.transform(batch_x)
            if self.binary_threshold is not None:
                batch_y = binarize(batch_y, self.binary_threshold)
            else:
                batch_y = self.transform(batch_y)

            yield batch_x, batch_y
            i_on += positive_size
            i_off += batch_size - positive_size
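
The generator above guarantees that every batch contains exactly `positive_size` windows whose target meets `balance_threshold`, however rare those are in the raw signal, reshuffling each pool independently when it runs out. A stripped-down sketch of the split-and-mix idea, with a made-up target and threshold:

import numpy as np

rng = np.random.RandomState(0)
train_y = rng.rand(1000)      # stand-in target signal
indexes = np.arange(1000)
balance_threshold = 0.8       # hypothetical threshold; ~20% of samples pass

indexes_on = indexes[train_y[indexes] >= balance_threshold]
indexes_off = indexes[train_y[indexes] < balance_threshold]

batch_size = 64
positive_size = int(batch_size * 0.5)  # i.e. balance_positive = 0.5

# One balanced batch: a fixed quota from each pool
batch = np.concatenate((indexes_on[:positive_size],
                        indexes_off[:batch_size - positive_size]))
print((train_y[indexes] >= balance_threshold).mean())  # ~0.2 in the raw data
print((train_y[batch] >= balance_threshold).mean())    # exactly 0.5 per batch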
Example #3
    def _generate(self):
        """Generate mini-batch data for training.
        """
        logging.info('----no balance generation----')

        batch_size = self.batch_size

        indexes = np.array(self.train_indexes)
        self.random_state.shuffle(indexes)

        iteration = 0
        pointer = 0

        while True:

            # Reset pointer
            if pointer >= len(indexes):
                pointer = 0

                self.random_state.shuffle(indexes)

            # Get batch indexes
            batch_indexes = indexes[pointer:pointer + batch_size]
            pointer += batch_size

            iteration += 1

            batch_x_indexes_2d = batch_indexes[:, None] + np.arange(
                self.seq_len + self.width - 1)
            batch_y_indexes_2d = batch_indexes[:, None] + np.arange(
                self.seq_len // 2, self.seq_len // 2 + self.width)

            batch_x = self.train_x[batch_x_indexes_2d]
            batch_y = self.train_y[batch_y_indexes_2d]

            # Normalize input
            batch_x = self.transform(batch_x)
            if self.binary_threshold is not None:
                batch_y = binarize(batch_y, self.binary_threshold)
            else:
                batch_y = self.transform(batch_y)

            yield batch_x, batch_y
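
Because `_generate` loops forever, the caller decides when an epoch ends; `itertools.islice` is one plausible way to bound it. A self-contained sketch with a stand-in generator (the real one would come from the data class above, and `steps_per_epoch` is a hypothetical name):

from itertools import islice

def endless_batches():
    """Stand-in for _generate(): yields (batch_x, batch_y) forever."""
    i = 0
    while True:
        yield i, -i
        i += 1

steps_per_epoch = 5  # hypothetical epoch length
for batch_x, batch_y in islice(endless_batches(), steps_per_epoch):
    print(batch_x, batch_y)  # a model's train step would consume these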
Example #4
from collections import Counter
from fractions import Fraction
from functools import reduce
from math import log, sqrt

import numpy as np
import sympy as sp

# Project-specific helpers assumed importable from the surrounding package:
# read_data, get_or_default, match_by_column, binarize,
# GaussJordanElimination, augment, CPT


def mainGJ(filename, **kwargs):  # TODO: FIX THE KWARGS !!!
	"""Main execution using Gauss-Jordan elimination."""

	DEBUG = get_or_default(kwargs, 'DEBUG', False)
	data = read_data(filename, ' ')
	c = Counter(data['data'])
	child_name = "C1"
	child_idx = data['header'].index(child_name)
	num_columns = len(data['header'])
	new_counter = match_by_column(c, child_idx)

	binary_data = binarize(new_counter)

	items = sorted(binary_data.items(), key=lambda x: x[1][2], reverse=True)

	def leak_exponent(k):
		#return (-sum(k)+1,)
		return (1,)
		#return ()

	log_base = 2

	A_vect = [k + leak_exponent(k) for k, v in items if v[0] not in (1.0, 0.0)]
	A = np.array(A_vect) * Fraction(1, 1)

	b_vect = [v[0] for k, v in items if v[0] not in (1.0, 0.0)]
	b_vect = [log(1.0 - b, log_base) for b in b_vect]

	b_cnt = [(v[1], v[2]) for k, v in items if v[0] not in (1.0, 0.0)]

	if DEBUG:
		for i in range(A.shape[0]):
			print("b%d" % i, A_vect[i], b_vect[i], b_cnt[i])
	
	b = np.array(sp.symbols('b0:%d' % A.shape[0]))
	subs = dict(zip(b, b_vect))
	subs_cnt = dict(zip(b, b_cnt))
	
	A2, b2 = GaussJordanElimination(A, b)
	b3 = [1.0 - float(log_base**b.evalf(subs=subs)) for b in b2]

	subs_str = tuple((str(k), v) for k, v in subs.items()) + tuple(("r%d" % i, b2[i]) for i in range(len(b2)))
	subs_str = dict(subs_str)

	if DEBUG:
		print(augment([A2, b2, b3]))

	nonzero_i = (i for i in range(A2.shape[0]) if any(j != 0 for j in A2[i]))
	zero_i = (i for i in range(A2.shape[0]) if all(j == 0 for j in A2[i]))
	nonzero_v = list((A2[i], b2[i]) for i in nonzero_i)
	zero_v = list((A2[i], b2[i]) for i in zero_i)

	def product(l):
		return reduce(lambda x, y: x * y, l)

	def _min_fitness(b_val, b_subs_cnt_orig):
		b_subs_cnt = dict((k, v[1]) for k, v in b_subs_cnt_orig.items())
		total = sum(b_subs_cnt.values())
		coeff = [(b.args if b.args else (1, b)) for b in (b_val.args if not type(b_val) == sp.Symbol else [b_val])]
		min_c = min(b_subs_cnt[c[1]] for c in coeff)
		return min_c / float(total)

	def _avg_fitness(b_val, b_subs_cnt_orig):
		b_subs_cnt = dict((k, v[1]) for k, v in b_subs_cnt_orig.items())
		total = sum(b_subs_cnt.values())
		coeff = [(b.args if b.args else (1, b)) for b in (b_val.args if not type(b_val) == sp.Symbol else [b_val])]
		#print coeff
		return sum(b_subs_cnt[s[1]] / float(total) for s in coeff) / float(sum(abs(s) for s, _ in coeff))
		#return sum(abs(s[0])*(b_subs_cnt[s[1]]/float(total)) for s in coeff) / sum(b_subs_cnt[s[1]]/float(total) for s in coeff)
		#return 1

	def _max_count_fitness(b_val, b_subs_cnt_orig):
		b_subs_cnt = dict((k, v[1]) for k, v in b_subs_cnt_orig.items())
		total = sum(b_subs_cnt.values())
		coeff = [(b.args if b.args else (1, b)) for b in (b_val.args if not type(b_val) == sp.Symbol else [b_val])]
		return sum(b_subs_cnt[s[1]] / abs(s[0]) for s in coeff) / float(total)
	
	def _pu(x, n, c):
		n = float(n)
		x = float(x)
		c = float(c)
		sqr = sqrt(((x / n) * (1.0 - x / n)) / n)
		return c * sqr
		#return x/n-Ualph*sqr,x/n+Ualph*sqr

	def _pu_fitness(b_val, b_subs_cnt):
		#total = sum(b_subs_cnt.values())
		coeff = [(b.args if b.args else (1, b)) for b in (b_val.args if not type(b_val) == sp.Symbol else [b_val])]
		#return 1.0 - max(b_subs_cnt[b][0]/float(b_subs_cnt[b][1]) - _pu(b_subs_cnt[b][0], b_subs_cnt[b][1], 1.65)[0] for c, b in coeff)
		#return 1.0 - max(b_subs_cnt[b][0]/float(b_subs_cnt[b][1]) - abs(c)*_pu(b_subs_cnt[b][0], b_subs_cnt[b][1], 1.65) for c, b in coeff)
		return 1.0 - max(abs(c)*_pu(b_subs_cnt[b][0], b_subs_cnt[b][1], 1.65) for c, b in coeff)
		
	#fitness = _min_fitness
	#fitness = _avg_fitness
	fitness = _pu_fitness

	#BELOW: poor fitness!
	#fitness = _max_count_fitness

	solutions = []
	for i in nonzero_v:
		for zv in ([(0, 0)] + zero_v):
			for coeff in [2, 1, -1, -2]:
				expr = (i[1] + coeff*zv[1])
				fit = fitness(expr, subs_cnt)
				#print i[0], " [",coeff,"]", zv[0], "expr:",expr, "value:",float(1.0 - log_base**expr.evalf(subs=subs)), "fitness:", fit
				solutions.append((i[0], 'V' if type(zv[0]) != int else '0', coeff, zv[1], "EXPR:", expr, float(1.0 - log_base ** expr.evalf(subs=subs)), fit))
				if type(zv[0]) == int:
					break

	GJElim_fit_distribution = []
	num_best_solutions = 5
	for i in range(num_columns):
		solutions_filtered = [s for s in sorted(solutions, key=lambda x: x[-1], reverse=True) if s[0][i] == 1][:num_best_solutions]
		GJElim_fit_distribution.append(solutions_filtered[0][-2])
		suma = sum(s[-1]*s[-2] for s in solutions_filtered)
		if DEBUG:
			for s in solutions_filtered:
				print(s)
			print(suma / sum(s[-1] for s in solutions_filtered))
			print("")

	if DEBUG:
		print(augment([A2, b2, b3]))

	GJElim_distribution = []
	for i in range(num_columns):
		for j in range(A2.shape[0]):
			if A2[j][i] == 1:
				GJElim_distribution.append(b3[j])
				break
	GJElim_distribution = [(d if d > 0 else 10e-5) for d in GJElim_distribution]
	GJElim_fit_distribution = [(d if d > 0 else 10e-5) for d in GJElim_fit_distribution]

	outs = []
	labels = []
	for h in data['header']:
		labels.append(["True", "False"])
		#FIXME: data['domain'] does not keep states sorted so states are messed up
		#labels.append(data['domain'][h])
		
	for solution in [GJElim_distribution, GJElim_fit_distribution]:
		leak = solution[-1]
		params = reduce(lambda x, y: x + y, [[a, 0] for a in solution[:-1]]) + [leak]
		parent_dims = [2]*(num_columns-1)
		GJ_CPT = CPT([params, [1.0 - p for p in params]], parent_dims, CPT.TYPE_NOISY_MAX, data['header'], labels)
		outs.append(GJ_CPT)

	return outs
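
The `log(1.0 - b, log_base)` step is what makes Gauss-Jordan elimination applicable here: in a noisy-OR/noisy-MAX style model the child stays off with probability (1 - leak) * prod_i (1 - q_i)^k_i, so the log-complements are linear in the unknown log(1 - q_i) terms plus the trailing leak exponent from `leak_exponent`. A minimal numeric sketch of that linearization with made-up parameters, using numpy's solver in place of the project's `GaussJordanElimination`:

import numpy as np

log_base = 2
q = [0.7, 0.4]   # made-up per-cause probabilities
leak = 0.1       # made-up leak probability

# Rows are parent configurations with a trailing 1 for the leak exponent,
# mirroring A_vect = [k + leak_exponent(k) for ...] above.
A = np.array([[1, 0, 1],
              [0, 1, 1],
              [1, 1, 1]], dtype=float)

# log(1 - p) for each observed configuration is linear: b = A @ unknowns
unknowns = np.log(1.0 - np.array(q + [leak])) / np.log(log_base)
b = A @ unknowns

recovered = np.linalg.solve(A, b)
print(1.0 - log_base ** recovered)   # -> [0.7 0.4 0.1]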
Example #5
    def get_target(self):
        if self.binary_threshold is not None:
            return binarize(self.target, self.binary_threshold)
        return self.target
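
Every example on this page defers to the project's `binarize` helper, whose body is not shown; judging by the `binary_threshold` usage it thresholds the target element-wise. A plausible stand-in (an assumption, not the project's actual implementation):

import numpy as np

def binarize(x, threshold):
    # Hypothetical stand-in: 1.0 where x exceeds the threshold, else 0.0
    return (np.asarray(x) > threshold).astype('float32')

print(binarize([0.2, 0.7, 1.3], threshold=0.5))  # [0. 1. 1.]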