Example #1
def RandomBootstrap(X_pool, y_pool, size, balance, seed=0):
    '''
    Assume the task is binary classification
    '''
    print ('-' * 50)
    print ('Starting bootstrap...')
    print ('Initial training set size = %d' % size)
    start = time()    
    
    random_state = RandomState(seed=seed)
    poolsize = y_pool.shape[0]
    
    pool_set = np.arange(poolsize)
    
    if balance: # select 1/2 * size from each class
        class0_size = size // 2
        class1_size = size - class0_size
        class0_indices = np.nonzero(y_pool == 0)[0]
        class1_indices = np.nonzero(y_pool == 1)[0]
        
        class0_docs = random_state.permutation(class0_indices)[:class0_size]
        class1_docs = random_state.permutation(class1_indices)[:class1_size]
        
        training_set = np.hstack((class0_docs, class1_docs))
        
    else: # otherwise, pick 'size' documents randomly
        training_set = random_state.permutation(pool_set)[:size]
    
    pool_set = np.setdiff1d(pool_set, training_set)
    
    print ('bootstrapping took %0.2fs.' % (time() - start))
    
    return (training_set.tolist(), pool_set.tolist())
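
A minimal usage sketch for the snippet above, with made-up data; it assumes the snippet's own imports are in scope (numpy as np, time.time as time, numpy.random.RandomState).

import numpy as np

y_pool = np.array([0, 0, 0, 0, 1, 1, 1, 1])
X_pool = np.zeros((len(y_pool), 3))          # dummy feature matrix, unused by the bootstrap itself

train_idx, pool_idx = RandomBootstrap(X_pool, y_pool, size=4, balance=True, seed=42)
# balance=True draws size/2 indices from each class; the rest stay in the pool
assert len(train_idx) == 4 and len(pool_idx) == 4
assert set(train_idx).isdisjoint(pool_idx)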
Example #2
    def run_lhi_informed_analysis(self, max_curves=26, center_size=20, index=None):
        if True:
            self.lhi = compute_local_homogeneity_index(self.OR * pi, __main__.__dict__.get('LHI', 2.0))
            f = open(normalize_path('lhi' + str(__main__.__dict__.get('LHI', 2.0)) + '.pickle'), 'wb')
            pickle.dump(self.lhi, f)
            f.close()
        else:
            f = open(normalize_path('lhi' + str(__main__.__dict__.get('LHI', 2.0)) + '.pickle'), 'rb')
            self.lhi = pickle.load(f)

        lhi_center = self.lhi[self.center_r - center_size:self.center_r + center_size,
                              self.center_c - center_size:self.center_c + center_size]
        steps = []

        r = RandomState(1023)

        if not __main__.__dict__.get('uniform', False):
            pinwheels = r.permutation(numpy.nonzero(numpy.ravel(lhi_center) < __main__.__dict__.get('cutoff', 0.3))[0])
            domains = r.permutation(numpy.nonzero(numpy.ravel(lhi_center) > (1 - __main__.__dict__.get('cutoff', 0.3)))[0])

            assert len(pinwheels) > max_curves / 2

            # s = numpy.argsort(numpy.ravel(lhi_center))

            if index is None:
                for i in range(0, max_curves // 2):
                    (x, y) = numpy.unravel_index(pinwheels[i], lhi_center.shape)
                    steps.append((x + self.center_r - center_size, y + self.center_c - center_size))

                    (x, y) = numpy.unravel_index(domains[i], lhi_center.shape)
                    steps.append((x + self.center_r - center_size, y + self.center_c - center_size))
            else:
                if (index % 2) == 0:
                    (x, y) = numpy.unravel_index(pinwheels[int(index / 2)], lhi_center.shape)
                    steps = [(x + self.center_r - center_size, y + self.center_c - center_size)]
                else:
                    (x, y) = numpy.unravel_index(domains[int(index / 2)], lhi_center.shape)
                    steps = [(x + self.center_r - center_size, y + self.center_c - center_size)]
        else:
            bins = []
            for i in range(0, 10):
                a = numpy.ravel(lhi_center) >= i * 0.1
                b = numpy.ravel(lhi_center) < (i + 1) * 0.1
                bins.append(r.permutation(numpy.nonzero(numpy.multiply(a, b))[0]))
            (x, y) = numpy.unravel_index(bins[index % 10][int(index / 10)], lhi_center.shape)
            steps = [(x + self.center_r - center_size, y + self.center_c - center_size)]

            # places = r.permutation(numpy.arange(0, len(numpy.ravel(lhi_center)), 1))
            # (x, y) = numpy.unravel_index(places[index], lhi_center.shape)
            # steps.append((x + self.center_r - center_size, y + self.center_c - center_size))

        self.analyse(steps, ns=__main__.__dict__.get('number_sizes', 10))
Example #3
def get_usps_split(seed,digits=range(num_classes)):
    from numpy.random import RandomState
    rnd = RandomState(seed)

    num_train = 200
    num_test = 500

    X_train = []
    Y_train = []

    X_test = []
    Y_test = []

    for t in digits:
        I = rnd.permutation(num_digits_per_class)

        ## note: num_train + num_test < len(I);
        ## The NCA paper used 200 train and 500 test.
        ## This gives 700 per class. 
        I_train = I[:num_train]
        I_test = I[-num_test:]

        X_t_train = raw[:,I_train,t].T
        X_t_test = raw[:,I_test,t].T

        X_train.extend(X_t_train)
        X_test.extend(X_t_test)
        Y_train.extend([t] * num_train)
        Y_test.extend([t] * num_test)

    assert len(X_train) == len(Y_train) and len(X_test) == len(Y_test)
    
    # note: we only permute the training cases, since we don't do 
    # sgd of any kind on the test cases.

    import numpy as np

    I = rnd.permutation(len(X_train))
    X_train = np.array(X_train)[I]
    Y_train = np.array(Y_train)[I]

    I = rnd.permutation(len(X_test))
    X_test = np.array(X_test)[I]
    Y_test = np.array(Y_test)[I]

    X_train = usps_resizer(X_train,8)
    X_test = usps_resizer(X_test,8)

    X_train /= 255.
    X_test /= 255.

    return (X_train,X_test),(Y_train,Y_test)
Example #4
def get_usps_split(seed, digits=range(num_classes)):
    from numpy.random import RandomState
    rnd = RandomState(seed)

    num_train = 200
    num_test = 500

    X_train = []
    Y_train = []

    X_test = []
    Y_test = []

    for t in digits:
        I = rnd.permutation(num_digits_per_class)

        ## note: num_train + num_test < len(I);
        ## The NCA paper used 200 train and 500 test.
        ## This gives 700 per class.
        I_train = I[:num_train]
        I_test = I[-num_test:]

        X_t_train = raw[:, I_train, t].T
        X_t_test = raw[:, I_test, t].T

        X_train.extend(X_t_train)
        X_test.extend(X_t_test)
        Y_train.extend([t] * num_train)
        Y_test.extend([t] * num_test)

    assert len(X_train) == len(Y_train) and len(X_test) == len(Y_test)

    # note: we only permute the training cases, since we don't do
    # sgd of any kind on the test cases.

    import numpy as np

    I = rnd.permutation(len(X_train))
    X_train = np.array(X_train)[I]
    Y_train = np.array(Y_train)[I]

    I = rnd.permutation(len(X_test))
    X_test = np.array(X_test)[I]
    Y_test = np.array(Y_test)[I]

    X_train = usps_resizer(X_train, 8)
    X_test = usps_resizer(X_test, 8)

    X_train /= 255.
    X_test /= 255.

    return (X_train, X_test), (Y_train, Y_test)
Example #5
def pool_entropy_h(X, y, candidate_mask, train_mask, classifier, n_candidates,
                   pool_n, n_jobs=-1, random_state=None, **kwargs):
    """ Return the candidate that will minimise the expected entropy of the predictions.

        Parameters
        ----------
        X_training_candidates : array
            The feature matrix of the potential training candidates.

        classes : int
            The name of classes.

        pool_n : int
            The size of the sample pool used in estimating the entropy.

        n_jobs : int
            The number of parallel jobs (-1 to use all cores)

        Returns
        -------
        best_candidates : array of int
            The indices of the candidates with the lowest expected entropy.
    """
    
    classes = classifier.classes_ # sorted lexicographically
    n_classes = len(classes)
    n_features = X.shape[1]
    entropy = np.empty(len(candidate_mask))
    entropy[:] = np.inf
    rng = RandomState(random_state)

    # the probabilities used to calculate expected value of pool
    probs = classifier.predict_proba(X[candidate_mask])

    # copy the classifier (avoid modifying the original classifier)
    classifier_plus = clone(classifier)

    # construct the sample pool (used to estimate the entropy)
    unlabelled_indices = np.where(~train_mask)[0]
    pool_indices = rng.permutation(unlabelled_indices)[:pool_n]
    pool_mask = np.zeros(len(candidate_mask), dtype=bool)
    pool_mask[pool_indices] = True

    # let's look at each candidate
    candidate_indices = np.where(candidate_mask)[0]

    results = Parallel(n_jobs=n_jobs)(delayed(_parallel_entropy_estimate)(
        X, y.copy(), train_mask.copy(), pool_mask,
        clone(classifier_plus), classes, n_classes, probs, i, index)
        for i, index in enumerate(candidate_indices))

    indices, expected = zip(*results)
    indices, expected = np.asarray(indices), np.asarray(expected)
    assert not np.isnan(expected).any(), 'Some expected values are undefined.'

    entropy[indices] = expected

    # pick the candidate with the smallest expected entropy
    best_candidates = np.argsort(entropy)[:n_candidates]
    return best_candidates
Example #6
def view_voltages(data,title=None, shuffle=False, shuffle_seed=1, vmin=-70, vmax=35, s_per_step=None):
    """
    Show a complete simulation run in an (M*N)xT trace image.

    Args:
        data: MxNxT array of voltage traces
        title: figure title
        shuffle: If true, shuffle the order of cells in the trace image.
        s_per_step: seconds per step - if given, display a proper time axis
    """
    plt.figure(figsize=fig_size)
    vtraces = data.reshape(-1,data.shape[2])[:]
    if shuffle:
        rng = RandomState(shuffle_seed)
        vtraces = rng.permutation(vtraces)

    if s_per_step is None:
        T = data.shape[-1]
    else:
        T = data.shape[-1] * s_per_step
    plt.imshow(vtraces, cmap='bone', vmin=vmin, vmax=vmax, aspect='auto', interpolation='nearest', extent=[0, T, vtraces.shape[0], 0])
    plt.colorbar()
    if title:
        plt.title(title)
    plt.show()
Example #7
File: core.py Project: jbpoline/permute
def permute_within_groups(x, group, prng=None):
    """
    Permutation of condition within each group.

    Parameters
    ----------
    x : array-like
        A 1-d array indicating treatment.
    group : array-like
        A 1-d array indicating group membership
    prng : RandomState instance or None, optional (default=None)
        If RandomState instance, prng is the pseudorandom number generator;
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`.

    Returns
    -------
    permuted : array-like
        The within group permutation of x.
    """
    permuted = x.copy()
    if prng is None:
        prng = RandomState()

    # (avoid additional flops) -- maybe memoize
    for g in np.unique(group):
        gg = group == g
        permuted[gg] = prng.permutation(permuted[gg])
    return permuted
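
A minimal usage sketch; it assumes numpy as np and numpy.random.RandomState are imported as in the snippet above, and the data is made up.

import numpy as np
from numpy.random import RandomState

x = np.array([1, 2, 3, 4, 5, 6])
group = np.array([0, 0, 0, 1, 1, 1])
prng = RandomState(42)

shuffled = permute_within_groups(x, group, prng)
# values are only rearranged inside their own group, so each group keeps
# the same multiset of values
assert sorted(shuffled[group == 0]) == [1, 2, 3]
assert sorted(shuffled[group == 1]) == [4, 5, 6]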
Example #8
File: core.py Project: jbpoline/permute
def corr(x, y, reps=10**4, prng=None):
    """
    Simulate permutation p-value for Spearman correlation coefficient

    Parameters
    ----------
    x : array-like
    y : array-like
    reps : int
    prng : RandomState instance or None, optional (default=None)
        If RandomState instance, prng is the pseudorandom number generator;
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`.

    Returns
    -------
    tuple
        Returns test statistic, left-sided p-value,
        right-sided p-value, two-sided p-value, simulated distribution
    """
    if prng is None:
        prng = RandomState()
    tst = np.corrcoef(x, y)[0, 1]
    sims = np.array([np.corrcoef(prng.permutation(x), y)[0, 1] for _ in range(reps)])
    left_pv = np.sum(sims <= tst)/reps
    right_pv = np.sum(sims >= tst)/reps
    two_sided_pv = np.sum(np.abs(sims) >= np.abs(tst))/reps
    return tst, left_pv, right_pv, two_sided_pv, sims
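
A hypothetical usage sketch with synthetic, positively correlated data; it assumes numpy as np and RandomState are imported as the snippet requires.

import numpy as np
from numpy.random import RandomState

prng = RandomState(2023)
x = prng.normal(size=50)
y = 0.5 * x + prng.normal(size=50)   # positively correlated with x

tst, left_pv, right_pv, two_sided_pv, sims = corr(x, y, reps=1000, prng=prng)
# a strong positive correlation should give a small right-sided p-value
print(tst, right_pv)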
Example #9
def load_dataset(params, path='datasets'):
    download_dataset(path)

    # training data
    data = [
        np.load(os.path.join(path, 'cifar-10-batches-py',
                             'data_batch_%d' % (i + 1)),
                encoding='latin1') for i in range(5)
    ]
    X_train = np.vstack([d['data'] for d in data])
    y_train = np.hstack([np.asarray(d['labels'], np.int8) for d in data])

    # test data
    data = np.load(os.path.join(path, 'cifar-10-batches-py', 'test_batch'),
                   encoding='latin1')
    X_test = data['data']
    y_test = np.asarray(data['labels'], np.int8)

    # reshape
    X_train = X_train.reshape(-1, 3, 32, 32)
    X_test = X_test.reshape(-1, 3, 32, 32)

    # permute
    rndSeed = RandomState(params.seed)
    permute = rndSeed.permutation(len(y_train))
    X_train = X_train[permute]
    y_train = y_train[permute]
    permute = rndSeed.permutation(len(y_test))
    X_test = X_test[permute]
    y_test = y_test[permute]

    # normalize
    try:
        mean_std = np.load(os.path.join(path, 'cifar-10-mean_std.npz'),
                           encoding='latin1')
        mean = mean_std['mean']
        std = mean_std['std']
    except IOError:
        mean = X_train.mean(axis=(0, 2, 3), keepdims=True).astype(np.float32)
        std = X_train.std(axis=(0, 2, 3), keepdims=True).astype(np.float32)
        np.savez(os.path.join(path, 'cifar-10-mean_std.npz'),
                 mean=mean,
                 std=std)
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    return X_train, y_train, X_test, y_test
Example #10
def shuffle_list(seed, *data):
    from numpy.random import RandomState
    np_rng = RandomState(seed)
    idxs = np_rng.permutation(np.arange(len(data[0])))
    if len(data) == 1:
        return [data[0][idx] for idx in idxs]
    else:
        return [[d[idx] for idx in idxs] for d in data]
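
A minimal usage sketch; it assumes numpy is imported as np, since the snippet uses np.arange.

import numpy as np

xs = ['a', 'b', 'c', 'd']
ys = [10, 20, 30, 40]

shuffled_xs, shuffled_ys = shuffle_list(0, xs, ys)
# both lists are reordered with the same permutation, so pairs stay aligned
assert all(ys[xs.index(x)] == y for x, y in zip(shuffled_xs, shuffled_ys))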
Example #11
    def execute(self):
        """Execute the link."""
        ds = process_manager.service(DataStore)

        # basic checks on contents of the data frame
        assert self.read_key in ds, 'Key "{key}" not in DataStore.'.format(
            key=self.read_key)
        df = ds[self.read_key]
        if not isinstance(df, pd.DataFrame):
            raise Exception('Retrieved object not of type pandas DataFrame.')
        ndf = len(df.index)
        assert ndf > 0, 'dataframe {} is empty.'.format(self.read_key)
        if self.store_key is None:
            if self.column in df.columns:
                raise Exception(
                    'Column name <{}> already used: <{!s}>. Will not overwrite.'
                    .format(self.column, df.columns))
            df[self.column] = 0

        # fix final number of events assigned per random class
        # ... each class gets at least one event
        if self.nevents is not None:
            if len(self.nevents) == self.nclasses - 1:
                self.nevents.append(ndf - sum(n for n in self.nevents))
        else:
            self.nevents = [int(ndf * f) for f in self.fractions]
        for i in range(self.nclasses):
            nsum = sum(n for n in self.nevents[:i + 1])
            ndiff = 0 if nsum - ndf < 0 else nsum - ndf
            self.nevents[i] -= ndiff
            if self.nevents[i] < 0:
                self.nevents[i] = 0
            self.logger.info(
                'Random class <{index:d}> assigned <{n:d}> events.',
                index=i,
                n=self.nevents[i])

        # random reshuffling of dataframe indices
        RNG = RandomState(self._seed)
        permute = RNG.permutation(df.index)

        # apply the random reshuffling, and assign records to the n datasets
        for i in range(self.nclasses):
            ib = sum(n for n in self.nevents[:i])
            ie = sum(n for n in self.nevents[:i + 1])
            if self.store_key is None:
                df.loc[permute[ib:ie], self.column] = i
            else:
                ds[self.store_key[i]] = df.loc[permute[ib:ie]]
                self.logger.info(
                    'Stored output collection <{key}> with <{n:d}> records in datastore.',
                    key=self.store_key[i],
                    n=len(ds[self.store_key[i]].index))

        # increase seed in case of next iteration
        self._seed += 1

        return StatusCode.Success
Example #12
File: opt.py Project: gwyn31/glm
 def _get_shuffled_mini_batches(self, m: int):
     """
     Get mini batches of indices
     :param m: total number of samples
     """
     rng = RandomState()
     shuffled_indices = rng.permutation(range(m))
     for b in range(0, m, self.batch_size):
         yield shuffled_indices[b:b + self.batch_size]
Example #13
def generatePermutation(numbersamples, randomSeedOrState):
    from numpy.random import RandomState

    if isinstance(randomSeedOrState, RandomState):
        randomstate = randomSeedOrState
    else:
        randomstate = RandomState(int(randomSeedOrState % sys.maxint))

    perm = randomstate.permutation(numbersamples)
    return perm
Example #14
def generatePermutation(numbersamples,randomSeedOrState):
    from numpy.random import RandomState

    if isinstance(randomSeedOrState,RandomState):
        randomstate = randomSeedOrState
    else:
        randomstate = RandomState(randomSeedOrState)

    perm = randomstate.permutation(numbersamples)
    return perm
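
A hypothetical usage sketch for the variant above: a plain seed and a pre-built RandomState yield the same permutation when seeded identically.

from numpy.random import RandomState

perm_from_seed = generatePermutation(5, 7)
perm_from_state = generatePermutation(5, RandomState(7))
assert (perm_from_seed == perm_from_state).all()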
Example #15
 def _get_sequence(self):
     if self.sequence is None:
         if self.sampling.order == self.sampling.RANDOM:
             rs = RandomState(seed=self.state['list_seed'])
             self.sequence = rs.permutation(self.state['power'])
         elif self.sampling.order == self.sampling.DIRECT:
             self.sequence = list(range(self.state['power']))
         elif self.sampling.order == self.sampling.REVERSED:
             self.sequence = list(range(self.state['power']))[::-1]
     return self.sequence
Example #16
    def generate_random_permutation_transform(seed, challenge_length, puf_count, atf=False):
        """
        Returns an input transformation that uses k pseudorandomly generated permutations
        :param seed: int
                     Seed for the pseudorandom generation
        :param challenge_length: int
                   Challenge length (must equal LTFArray.n)
        :param puf_count: int
                          Number of permutations to be used (must equal LTFArray.k)
        :param atf: boolean
                    Perform ATF transform after permuting
        :return:  A function: array of int with shape(N,n), int number of PUFs k -> shape(N,k,n)
                  A function that can perform the desired transformation.
        """
        prng = RandomState(seed)
        permutations = [prng.permutation(challenge_length) for _ in range(puf_count)]

        def transform(challenges, k):
            """
            Method as described in generate_concatenated_transform doc string.
            :param challenges: array of int shape(N,n)
                               Array of challenges which should be evaluated by the simulation.
            :param k: int
                     Number of LTFArray PUFs
            :return: A function: array of int with shape(N,n), int number of PUFs k -> shape(N,k,n)
                     A function that can perform the desired transformation.
            """
            (_, n) = challenges.shape
            assert k == puf_count and n == challenge_length, \
                'Permutations Input Transform cannot be used for LTFArrays with size other than defined'

            result = swapaxes(
                array([
                    challenges[:, permutations[i]]
                    for i in range(puf_count)
                ]),
                0,
                1
            )

            if atf:
                # Perform atf transform
                result = transpose(
                    array([
                        prod(result[:, :, i:], 2)
                        for i in range(n)
                    ]),
                    (1, 2, 0)
                )

            return result

        transform.__name__ = 'transform_permutations' + ('_plus_atf_' if atf else '') + '_%x' % seed
        return transform
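
A minimal sketch of the column-reorder step at the core of the transform above, with made-up toy sizes (N=2 challenges of length n=4, k=2 permutations) and a hypothetical seed.

from numpy import array, swapaxes
from numpy.random import RandomState

prng = RandomState(0x5eed)            # hypothetical seed
permutations = [prng.permutation(4) for _ in range(2)]

challenges = array([[1, -1, 1, -1],
                    [1, 1, -1, -1]])
result = swapaxes(array([challenges[:, p] for p in permutations]), 0, 1)
print(result.shape)                   # (2, 2, 4): shape (N, k, n)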
Example #17
File: util.py Project: gaow/FaST-LMM
def generate_permutation(numbersamples, randomSeedOrState):
    from numpy.random import RandomState

    if isinstance(randomSeedOrState, RandomState):
        randomstate = randomSeedOrState
    else:
        randomstate = RandomState(int(randomSeedOrState %
                                      2147483647))  #old maxint

    perm = randomstate.permutation(numbersamples)
    return perm
Example #18
def permute(data, label, params):

    ''' Permute data.
    
    '''
    rndSeed = RandomState(params.seed)
    permute = rndSeed.permutation(data.shape[0])
    data = data[permute]
    label = label[permute]

    return (data, label)
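
A hypothetical usage sketch; 'params' only needs a .seed attribute here, and numpy / RandomState are assumed to be imported as the snippet requires.

import numpy as np
from types import SimpleNamespace

data = np.arange(10).reshape(5, 2)
label = np.arange(5)
params = SimpleNamespace(seed=3)

data_shuf, label_shuf = permute(data, label, params)
# rows of data and entries of label are shuffled with the same permutation
assert (data_shuf[:, 0] // 2 == label_shuf).all()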
Example #19
    def execute(self):
        """ Execute AssignRandomClass """

        ds = ProcessManager().service(DataStore)

        # basic checks on contents of the data frame
        assert self.readKey in list(
            ds.keys()), 'Key %s not in DataStore.' % self.readKey
        df = ds[self.readKey]
        if not isinstance(df, DataFrame):
            raise Exception('Retrieved object not of type pandas DataFrame.')
        ndf = len(df.index)
        assert ndf > 0, 'dataframe %s is empty.' % self.readKey
        if self.column in df.columns:
            raise Exception(
                'Column name <%s> already used: <%s>. Will not overwrite.' %
                (self.column, str(df.columns)))

        # fix final number of events assigned per random class
        # ... each class gets at least one event
        if self.nevents is not None:
            if len(self.nevents) == self.nclasses - 1:
                self.nevents.append(ndf - sum(n for n in self.nevents))
        if self.nevents is None:
            self.nevents = [int(ndf * f) for f in self.fractions]
            pass
        for i in range(self.nclasses):
            nsum = sum(n for n in self.nevents[:i + 1])
            ndiff = 0 if (nsum - ndf < 0) else (nsum - ndf)
            self.nevents[i] -= ndiff
            if self.nevents[i] < 0:
                self.nevents[i] = 0
            pass
        for i, n in enumerate(self.nevents):
            assert n >= 0, 'Random class <%d> assigned nevents <%d> needs to be greater than zero. %s' % \
                                                                                        (i, n, str(self.nevents))
            self.log().info('Random class <%d> assigned n events <%d>.' %
                            (i, n))

        # random reshuffling of dataframe indices
        settings = ProcessManager().service(ConfigObject)
        RNG = RandomState(settings['seed'])
        permute = RNG.permutation(df.index)

        # apply the random reshuffling, and assign records to the n classes
        df[self.column] = 0
        for i in range(self.nclasses):
            ib = sum(n for n in self.nevents[:i])
            ie = sum(n for n in self.nevents[:i + 1])
            df.loc[permute[ib:ie], self.column] = i
            pass

        return StatusCode.Success
Example #20
def view_spikes(data, title=None, shuffle=False, shuffle_seed=1):
    spikesflat = data.reshape(-1, data.shape[2])[:]
    if shuffle:
        rng = RandomState(shuffle_seed)
        spikesflat = rng.permutation(spikesflat)

    idxs, spiketimes = np.nonzero(spikesflat)
    plt.figure(figsize=fig_size)
    plt.scatter(spiketimes, idxs, marker='|', s=50, alpha=0.7, color='k')
    if title:
        plt.title(title)
    plt.show()
Example #21
def data_to_csv(prefix: str, path: str, dataset_name: str, seeds, items_to_use,
                only_items_to_use):
    masks = glob.glob(path + "*.png")
    if only_items_to_use:
        masks = list(
            filter(
                lambda x: any(
                    [True if i in x else False for i in items_to_use]), masks))
    else:
        masks = list(
            filter(
                lambda x: any(
                    [True if i not in x else False for i in items_to_use]),
                masks))
    names = list(map(lambda x: x.split(path)[1], masks))
    items = list(map(lambda x: x.split("_"), names))
    codes = list(
        map(lambda x: os.path.join(prefix, "ISIC_" + x[1] + ".jpg"), items))
    types = list(map(lambda x: "_".join(x[3:]).split(".")[0], items))

    diseases = np.array(sorted(list(set(types))))
    print(diseases)
    print(types)
    print(codes)
    assert len(types) == len(codes)
    assert len(types) == len(masks)
    result_dict = {}
    for path, code, typ in tqdm(list(zip(masks, codes, types))):
        if code in result_dict:
            labels = result_dict[code]
        else:
            labels = np.zeros(len(diseases))
            result_dict[code] = labels
        idx = np.where(diseases == typ)
        labels[idx] = load_mask(path)

    use_format = True if len(seeds) > 1 else False
    for seed in seeds:
        rs = RandomState(seed)
        result = list(result_dict.items())
        result.sort(key=lambda x: x[0])
        result = rs.permutation(result)
        indices = list(map(lambda x: x[0], result))
        result = list(map(lambda x: x[1], result))

        frame = pd.DataFrame(result,
                             index=indices,
                             columns=diseases,
                             dtype='int64')
        if use_format:
            frame.to_csv(dataset_name.format(seed), index_label="images")
        else:
            frame.to_csv(dataset_name, index_label="images")
Example #22
    def randomise_dataframe_rows(self, df):
        """ Randomise ordering of DataFrame.
        Return a NumPy array of shuffled index values using `np.random.permutation`
        Return a new Dataframe containing the shuffled order using `loc[]`
        Seeding the generator reproduces the same results when others share and run the same code.
        """
        if isinstance(df, type(None)):
            return None
        # np.random.seed(0)
        # return df.loc[np.random.permutation(len(df))]

        prng = RandomState(1234567890)
        return df.loc[prng.permutation(len(df))]
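
The method above boils down to one line; here is a minimal stand-alone sketch, assuming a default RangeIndex so the permuted positions are valid `.loc` labels.

import pandas as pd
from numpy.random import RandomState

df = pd.DataFrame({"a": [10, 20, 30, 40]})
prng = RandomState(1234567890)
shuffled = df.loc[prng.permutation(len(df))]
print(shuffled.index.tolist())   # a reproducible reordering of [0, 1, 2, 3]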
Example #23
def data_to_csv(masks_data_path: str, save_to_file_path: str,
                real_image_path: str, fake_image_path: str, masks_path: str,
                seeds, replace_first_percents, extend):
    masks = glob.glob(masks_data_path + "*.png")
    names = list(map(lambda x: x.split(masks_data_path)[1], masks))
    items = list(map(lambda x: x.split("_"), names))
    codes = list(map(lambda x: x[1], items))
    types = list(map(lambda x: "_".join(x[3:]).split(".")[0], items))

    diseases = np.array(sorted(list(set(types))))
    print(diseases)
    print(types)
    print(codes)
    assert len(types) == len(codes)
    assert len(types) == len(masks)
    codes = list(set(codes))
    full = len(codes)
    part = int(full / 100 * replace_first_percents)
    filled = [(c, diseases) for c in codes]
    for seed in seeds:
        rs = RandomState(seed)
        result = list(filled)
        result.sort(key=lambda x: x[0])
        result = rs.permutation(result)
        indexes = list(map(lambda x: x[0], result))
        indexes_a = indexes[:part]
        indexes_b = indexes[:part]
        indexes_c = indexes[part:]

        indexes_a = list(
            map(
                lambda i:
                create_record(i, diseases, fake_image_path, masks_path,
                              "_semantic_synthesized_image"), indexes_a))
        indexes_b = list(
            map(
                lambda i: create_record(i, diseases, real_image_path,
                                        masks_path), indexes_b))
        indexes_c = list(
            map(
                lambda i: create_record(i, diseases, real_image_path,
                                        masks_path), indexes_c))

        if extend:
            result = indexes_a + indexes_b + indexes_c
        else:
            result = indexes_a + indexes_c
        frame = pd.DataFrame(result, columns=['images'] + list(diseases))
        frame.to_csv(save_to_file_path.format(replace_first_percents, seed),
                     index_label="images",
                     index=False)
Example #24
def data_to_csv(real_prefix: str,
                generated_prefix: str,
                path: str,
                dataset_name: str,
                seeds,
                replace_first_percents,
                extend):
    masks = glob.glob(path + "*.png")
    names = list(map(lambda x: x.split(path)[1], masks))
    items = list(map(lambda x: x.split("_"), names))
    # codes = list(map(lambda x: os.path.join(prefix, "ISIC_" + x[1] + ".jpg"), items))
    codes = list(map(lambda x: x[1], items))
    types = list(map(lambda x: "_".join(x[3:]).split(".")[0], items))

    diseases = np.array(sorted(list(set(types))))
    print(diseases)
    print(types)
    print(codes)
    assert len(types) == len(codes)
    assert len(types) == len(masks)
    result_dict = {}
    for path, code, typ in tqdm(list(zip(masks, codes, types))):
        if code in result_dict:
            labels = result_dict[code]
        else:
            labels = np.zeros(len(diseases))
            result_dict[code] = labels
        idx = np.where(diseases == typ)
        labels[idx] = load_mask(path)
    full = len(result_dict)
    part = int(full / 100 * replace_first_percents)
    for seed in seeds:
        rs = RandomState(seed)
        result = list(result_dict.items())
        result.sort(key=lambda x: x[0])
        result = rs.permutation(result)
        indices = list(map(lambda x: x[0], result))
        result = list(map(lambda x: x[1], result))
        if extend:
            indices_a = list(map(lambda x: os.path.join(generated_prefix, "ISIC_" + x + "_semantic_synthesized_image.jpg"), indices[:part]))
            indices_b = list(map(lambda x: os.path.join(real_prefix, "ISIC_" + x + ".jpg"),  indices[:part]))
            indices_c = list(map(lambda x: os.path.join(real_prefix, "ISIC_" + x + ".jpg"), indices[part:]))
            indices = indices_a + indices_b + indices_c
            result = result[:part] + result[:part] + result[part:]
        else:
            indices_a = list(map(lambda x: os.path.join(generated_prefix, "ISIC_" + x + "_semantic_synthesized_image.jpg"), indices[:part]))
            indices_b = list(map(lambda x: os.path.join(real_prefix, "ISIC_" + x + ".jpg"),  indices[part:]))
            indices = indices_a + indices_b

        frame = pd.DataFrame(result, index=indices, columns=diseases, dtype='int64')
        frame.to_csv(dataset_name.format(replace_first_percents, seed), index_label="images")
Example #25
def _subsample(counts, n, replace=False, seed=0):
    """Randomly subsample from a vector of counts.

    Parameters
    ----------
    counts : 1-D array_like
        Vector of counts.
    n : int
        Number of element to subsample (<= the total number of counts).
    replace : bool, optional
        Subsample with or without replacement.
    seed : int, optional
        Random seed.

    Returns
    -------
    subcounts : 1-D ndarray
        Subsampled vector of counts

    Raises
    ------
    ValueError, TypeError
    """

    if n < 0:
        raise ValueError("'n' must be > 0 ")

    counts = np.asarray(counts)

    if counts.ndim != 1:
        raise ValueError("counts must be an 1-D array_like object")

    counts = counts.astype(int, casting='safe')
    counts_sum = counts.sum()

    if n > counts_sum:
        raise ValueError("'n' must be <= the total number of counts")

    prng = RandomState(seed)

    if replace:
        p = counts / counts_sum
        subcounts = prng.multinomial(n, p)
    else:
        nonzero = np.flatnonzero(counts)
        expanded = np.concatenate([np.repeat(i, counts[i]) for i in nonzero])
        permuted = prng.permutation(expanded)[:n]
        subcounts = np.bincount(permuted, minlength=counts.size)

    return subcounts
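
A hypothetical usage sketch with a made-up counts vector; it assumes numpy as np and RandomState are imported as the snippet requires.

import numpy as np

counts = np.array([0, 5, 3, 2])
sub = _subsample(counts, n=4, replace=False, seed=0)
# without replacement the subsample never exceeds the original counts
assert sub.sum() == 4
assert (sub <= counts).all()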
Example #26
def _subsample(counts, n, replace=False, seed=0):
    """Randomly subsample from a vector of counts.

    Parameters
    ----------
    counts : 1-D array_like
        Vector of counts.
    n : int
        Number of element to subsample (<= the total number of counts).
    replace : bool, optional
        Subsample with or without replacement.
    seed : int, optional
        Random seed.

    Returns
    -------
    subcounts : 1-D ndarray
        Subsampled vector of counts

    Raises
    ------
    ValueError, TypeError
    """

    if n < 0:
        raise ValueError("'n' must be > 0 ")

    counts = np.asarray(counts)

    if counts.ndim != 1:
        raise ValueError("counts must be an 1-D array_like object")

    counts = counts.astype(int, casting='safe')
    counts_sum = counts.sum()

    if n > counts_sum:
        raise ValueError("'n' must be <= the total number of counts")

    prng = RandomState(seed)

    if replace:
        p = counts / counts_sum
        subcounts = prng.multinomial(n, p)
    else:
        nonzero = np.flatnonzero(counts)
        expanded = np.concatenate([np.repeat(i, counts[i]) for i in nonzero])
        permuted = prng.permutation(expanded)[:n]
        subcounts = np.bincount(permuted, minlength=counts.size)

    return subcounts
Example #27
def mix_crop(s1, s2, max_nr_samples=40, random_state=None):

    assert len(s1) == len(s2)

    if random_state is not None:
        prng = RandomState(random_state)
        indices_mixed = prng.permutation(np.arange(0, len(s1)))
        s1 = s1[indices_mixed]
        s2 = s2[indices_mixed]

    max_nr_samples = max_nr_samples if max_nr_samples < len(s1) else len(s1)
    s1 = s1[0:max_nr_samples]
    s2 = s2[0:max_nr_samples]
    return s1, s2
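
A hypothetical usage sketch; it assumes numpy as np and RandomState are imported as the snippet requires.

import numpy as np

s1 = np.arange(10)
s2 = np.arange(10) * 100
a, b = mix_crop(s1, s2, max_nr_samples=4, random_state=0)
# both signals are permuted identically before cropping, so pairs stay aligned
assert (b == a * 100).all()
assert len(a) == 4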
Example #28
    def _sample_next_goal_positions(
            self, random_state: RandomState) -> Tuple[np.ndarray, bool]:
        # Set all the goals to the initial rotation first.
        self.mujoco_simulation.set_target_quat(np.array([[1, 0, 0, 0]] * 8))
        self.mujoco_simulation.forward()

        # Set position of blocks
        block_size = self.mujoco_simulation.simulation_params.object_size
        width, height, _ = self.mujoco_simulation.get_placement_area().size

        # Note that block_size and rel_w, rel_h are all half of the block size
        rel_w, rel_h = block_size / width, block_size / height

        # offset for making blocks to be attached to each other
        offset_w, offset_h = rel_w * 2, rel_h * 2

        # Expected configuration
        #       [ ][ ]
        #    [ ][ ][ ][ ]
        #       [ ][ ]
        block_config = random_state.permutation([
            [offset_w, 0],
            [offset_w * 2, 0],
            [0, offset_h],
            [offset_w, offset_h],
            [offset_w * 2, offset_h],
            [offset_w * 3, offset_h],
            [offset_w, offset_h * 2],
            [offset_w * 2, offset_h * 2],
        ])

        # Now randomly place the overall config in the placement area
        config_w, config_h = block_config.max(axis=0)
        margin_w, margin_h = 1.0 - config_w - rel_w, 1.0 - config_h - rel_h

        ori_x, ori_y = random_state.uniform(low=(rel_w, rel_h),
                                            high=(margin_w, margin_h))

        # Randomize the position of the entire block configuration.
        block_config += np.array([[ori_x, ori_y]])

        # Then place the objects as designed.
        return place_targets_with_fixed_position(
            self.mujoco_simulation.get_object_bounding_boxes(),
            self.mujoco_simulation.get_table_dimensions(),
            self.mujoco_simulation.get_placement_area(),
            block_config,
        )
Example #29
    def _iter_fast(self, ds, batch_size, start=None, end=None,
            shuffle=True, seed=None):
        # create random seed
        prng1 = None
        prng2 = _dummy_shuffle
        if shuffle:
            if seed is None:
                seed = get_random_magic_seed()
            prng1 = RandomState(seed)
            prng2 = RandomState(seed)

        batches = create_batch(ds.shape[0], batch_size, start, end, prng1)
        prng2.shuffle(batches)
        for i, j in batches:
            data = ds[i:j]
            yield self._normalizer(data[prng2.permutation(data.shape[0])])
Example #30
    def generate_random_permutation_transform(cls, seed, nn, kk, atf=False):
        """
        Returns an input transformation that uses k pseudorandomly generated permutations
        :param seed: int
                     Seed for the pseudorandom generation
        :param nn: int Challenge length (must equal LTFArray.n)
        :param kk: int Number of permutations to be used (must equal LTFArray.k)
        :param atf: boolean
                    Perform ATF transform after permuting
        :return:  A function: array of int with shape(N,n), int number of PUFs k -> shape(N,k,n)
                  A function that can perform the desired transformation.
        """
        prng = RandomState(seed)
        permutations = [prng.permutation(nn) for _ in range(kk)]

        def transform(challenges, k):
            """
            Method as described in generate_concatenated_transform doc string.
            :param challenges: array of shape(N,n)
                               Array of challenges which should be evaluated by the simulation.
            :param k: int
                     Number of LTFArray PUFs
            :return: A function: array of int with shape(N,n), int number of PUFs k -> shape(N,k,n)
                     A function that can perform the desired transformation.
            """
            (_, n) = challenges.shape
            assert k == kk and n == nn, \
                'Permutations Input Transform cannot be used for LTFArrays with size other than defined'

            sub_challenges = swapaxes(
                array([
                    challenges[:, permutations[i]]
                    for i in range(kk)
                ]),
                0,
                1
            )

            if atf:
                # Perform atf transform
                sub_challenges = cls.att(sub_challenges)

            return sub_challenges

        transform.__name__ = 'transform_permutations' + ('_plus_atf_' if atf else '') + '_%x' % seed
        return transform
Example #31
def create_dataset(opt, mode):
    convert = tnt.transform.compose([
        lambda x: x.astype(np.float32),
        lambda x: x / 255.0,
        # cvtransforms.Normalize([125.3, 123.0, 113.9], [63.0,  62.1,  66.7]),
        lambda x: x.transpose(2, 0, 1).astype(np.float32),
        torch.from_numpy,
    ])

    train_transform = tnt.transform.compose([
        cvtransforms.RandomHorizontalFlip(),
        cvtransforms.Pad(opt.randomcrop_pad, cv2.BORDER_REFLECT),
        cvtransforms.RandomCrop(32),
        convert,
    ])

    ds = getattr(datasets, opt.dataset)('.', train=mode, download=True)
    smode = 'train' if mode else 'test'
    if mode:
        from numpy.random import RandomState
        prng = RandomState(opt.seed)

        assert (opt.sampleSize % 10 == 0)

        random_permute = prng.permutation(np.arange(
            0, 5000))[0:opt.sampleSize // 10]

        labels = np.array(getattr(ds, 'train_labels'))
        data = getattr(ds, 'train_data')

        classes = np.unique(labels)
        inds_all = np.array([], dtype='int32')
        for cl in classes:
            inds = np.where(np.array(labels) == cl)[0][random_permute]
            inds_all = np.r_[inds, inds_all]

        ds = tnt.dataset.TensorDataset([
            data[inds_all, :].transpose(0, 2, 3, 1), labels[inds_all].tolist()
        ])
    else:
        ds = tnt.dataset.TensorDataset([
            getattr(ds, smode + '_data').transpose(0, 2, 3, 1),
            getattr(ds, smode + '_labels')
        ])
    return ds.transform({0: train_transform if mode else convert})
Example #32
    def generate_random_permutation_transform(seed, nn, kk, atf=False):
        """
        Returns an input transformation that uses k pseudorandomly generated permutations
        :param seed: Seed for the pseudorandom generation
        :param nn: challenge length (must equal LTFArray.n)
        :param kk: Number of permutations to be used (must equal LTFArray.k)
        :param atf: Perform ATF transform after permuting
        :return: The desired input transform
        """
        r = RandomState(seed)
        permutations = [r.permutation(nn) for x in range(kk)]

        def transform(cs, k):
            (N, n) = cs.shape
            assert k == kk and n == nn, \
                'Permutations Input Transform cannot be used for LTFArrays with size other than defined'

            result = swapaxes(
                array([
                    cs[:, permutations[i]]
                    for i in range(kk)
                ]),
                0,
                1
            )

            if atf:
                """ Perform atf transform """
                result = transpose(
                    array([
                        prod(result[:, :, i:], 2)
                        for i in range(n)
                    ]),
                    (1, 2, 0)
                )

            return result

        transform.__name__ = 'transform_permutations' + ('_plus_atf_' if atf else '') + '_%x' % seed
        return transform
Example #33
    def _find_fixed_permutations(cls, n, k):
        """
        Finds permutations suitable to use in LTFArray.transform_fixed_permutation.

        Permutations are chosen such that no permutation has a fix point and no
        two permutations share at least one point. (See `permutation_okay` below.)

        Note that the run time of this method increases drastically with k. On an
        Intel i7, n=64, k=10 takes a couple of seconds.

        :return: list of seeds for `RandomState`. Obtain the permutation with
          `RandomState(seed).permutation(n)`.
        """
        def permutation_okay(new_p, ps):
            # 1. check that p has no fix point
            if any([i == new_p[i] for i in range(len(new_p))]):
                return False

            # 2. check that it does not share a point with any old_p in ps:
            if any([
                    any([old_p[i] == new_p[i] for i in range(len(new_p))])
                    for old_p in ps
            ]):
                return False

            return True

        seed = 0xbad
        permutation_seeds = []
        permutations = []

        while len(permutations) < k:
            prng = RandomState(seed)
            p = prng.permutation(n)
            if permutation_okay(p, permutations):
                permutation_seeds.append(seed)
                permutations.append(p)
            seed += 1

        return permutation_seeds
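
A hypothetical sketch of how the returned seeds would be used, as the docstring describes: each permutation is recovered via RandomState(seed).permutation(n). The seeds below are placeholders, not actual output of the method.

from numpy.random import RandomState

n = 8
seeds = [0xbad, 0xbad + 1]            # placeholder seeds for illustration only
perms = [RandomState(s).permutation(n) for s in seeds]
print(perms)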
Example #34
    def score(self, prediction: Prediction, actual: DataTuple) -> float:
        """Take the average of the HSIC score over batches,
        as for larger datasets computing it in one go will kill your machine.
        """
        preds = prediction.hard.to_numpy()[:, np.newaxis]
        s_cols = actual.s.columns
        sens_labels = np.array(actual.s[s_cols].to_numpy())

        batchs_size = 5000

        together = np.hstack((preds, sens_labels)).transpose()

        random = RandomState(seed=888)
        col_idx = random.permutation(together.shape[1])

        together = np.take(together, col_idx, axis=1)

        prediction_shuffled = together[0]
        label_shuffled = together[1]

        num_batches_float = preds.shape[0] / batchs_size
        num_batches: int = int(math.ceil(num_batches_float))

        batches = []

        start = 0

        for _ in range(num_batches):

            end = start + batchs_size

            preds_to_test = prediction_shuffled[start:end]
            labels_to_test = label_shuffled[start:end]

            batches.append(hsic(preds_to_test, labels_to_test, 0.7, 0.5))

            start += batchs_size

        return np.mean(np.array(batches))
Example #35
def main(infile: str, outfile: IO, **splitter_kw):
    """Script entry point."""
    logging.basicConfig(
        format='[%(asctime)s] [%(levelname)s] %(name)s - %(message)s',
        level=logging.INFO)

    start = time.perf_counter()

    with h5py.File(infile, mode="r") as h5in:
        _LOGGER.info("Reading labels from %r...", infile)
        labels = (pd.DataFrame.from_records(np.asarray(
            h5in["labels"])).transform(decode_column))

    _LOGGER.info("Creating splitter with args %s", splitter_kw)
    random_state = RandomState(splitter_kw.pop("seed"))
    splitter = Splitter(**splitter_kw,
                        random_state=random_state)  # type:ignore

    for indices in splitter.split(labels):
        permutation = random_state.permutation(
            len(indices.train) + len(indices.val))

        json.dump(
            {
                "train":
                indices.train.tolist(),
                "val":
                indices.val.tolist(),
                "test":
                indices.test.tolist(),
                "train-val":
                np.concatenate(
                    (indices.train, indices.val))[permutation].tolist()
            },
            outfile,
            indent=None,
            separators=(",", ":"))
        outfile.write("\n")

    _LOGGER.info("Script complete in %.2fs", (time.perf_counter() - start))
Example #36
def shuffled(random: RandomState,
             datasets: Sequence[xr.Dataset]) -> Sequence[xr.Dataset]:
    """
    Shuffles datasets along the sample dimension within chunks if chunking is present.

    Datasets passed will be shuffled identically.

    Args:
        random: initialized random number generator state used for shuffling
        datasets: input data to be shuffled; must contain identical
            dimensionality/coordinates if multiple datasets are given
    """
    chunks_default = (len(datasets[0][SAMPLE_DIM_NAME]), )
    chunks = datasets[0].chunks.get(SAMPLE_DIM_NAME, chunks_default)
    chunk_indices = _get_chunk_indices(chunks)
    shuffled_inds = np.concatenate(
        [random.permutation(indices) for indices in chunk_indices])

    return [
        dataset.isel({SAMPLE_DIM_NAME: shuffled_inds}) for dataset in datasets
    ]
Example #37
File: core.py Project: jbpoline/permute
def two_sample(x, y, reps=10**5, stat='mean', alternative="greater",
               keep_dist=False, interval=False, level=0.95, seed=None):
    """
    One-sided or two-sided, two-sample permutation test for equality of
    two means, with p-value estimated by simulated random sampling with
    reps replications.

    Tests the hypothesis that x and y are a random partition of x,y
    against the alternative that x comes from a population with mean

    (a) greater than that of the population from which y comes,
        if alternative == 'greater'
    (b) less than that of the population from which y comes,
        if alternative == 'less'
    (c) different from that of the population from which y comes,
        if alternative == 'two-sided'

    If ``keep_dist``, also return the simulated distribution of values of the
    test statistic; otherwise, return only the p-value and the test statistic.

    Parameters
    ----------
    x : array-like
        Sample 1
    y : array-like
        Sample 2
    reps : int
        number of repetitions
    stat : {'mean', 't'}
        The test statistic.

        (a) If stat == 'mean', the test statistic is (mean(x) - mean(y))
            (equivalently, sum(x), since those are monotonically related)
        (b) If stat == 't', the test statistic is the two-sample t-statistic--
            but the p-value is still estimated by the randomization,
            approximating the permutation distribution.
            The t-statistic is computed using scipy.stats.ttest_ind
        (c) FIXME: Explanation or example of how to pass in a function,
            instead of a str
    keep_dist : bool
        flag for whether to store and return the array of values
        of the test statistic
    interval : {'upper', 'lower', 'two-sided'}
        The type of confidence interval

        (a) If interval == 'upper', computes an upper confidence bound on the
            true p-value based on the simulations by inverting Binomial tests.
        (b) If interval == 'lower', computes a lower confidence bound on the
            true p-value based on the simulations by inverting Binomial tests.
        (c) If interval == 'two-sided', computes lower and upper confidence
            bounds on the true p-value based on the simulations by inverting
            Binomial tests.
    level : float in (0, 1)
        the confidence limit for the confidence bounds.


    Returns
    -------
    float
        the estimated p-value
    float
        the test statistic
    tuple
        These values are only returned if ``interval`` is specified

        (a) confidence bound on p-value,
            if interval in {'lower','upper'}
        (b) [lower confidence bound, upper confidence bound],
            if interval == 'two-sided'
    """
    prng = RandomState(seed)
    z = np.concatenate([x, y])   # pooled responses
    # FIXME: Type check: we may want to pass in a function for argument 'stat'
    # FIXME: If function, use that. Otherwise, look in the dictionary
    stats = {
        'mean': lambda u: np.mean(u[:len(x)]) - np.mean(u[len(x):]),
        't': lambda u: ttest_ind(
            u[:len(x)], u[len(x):], equal_var=True)[0]
    }
    tst_fun = stats[stat]

    theStat = {
        'greater': tst_fun,
        'less': lambda u: -tst_fun(u),
        'two-sided': lambda u: math.fabs(tst_fun(u))
    }

    tst = theStat[alternative](z)
    if keep_dist:
        dist = []
        for i in range(reps):
            dist.append(theStat[alternative](prng.permutation(z)))
        dist = np.array(dist)
        hits = np.sum(dist >= tst)
        if interval in ["upper", "lower", "two-sided"]:
            return (hits / reps, tst,
                    binom_conf_interval(reps, hits, level, alternative), dist)
        else:
            return hits / reps, tst, dist
    else:
        hits = np.sum([(theStat[alternative](prng.permutation(z)) >= tst)
                       for i in range(reps)])

    if interval in ["upper", "lower", "two-sided"]:
        return (hits/reps, tst,
                binom_conf_interval(reps, hits, level, alternative))
    else:
        return hits/reps, tst
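
A hypothetical usage sketch with made-up samples; it assumes numpy as np, math, scipy.stats.ttest_ind and RandomState are imported as the snippet requires. With the defaults (keep_dist=False, interval=False) the function returns just the p-value and the test statistic.

import numpy as np

x = np.array([3.1, 2.9, 3.4, 3.6, 3.2])
y = np.array([2.5, 2.7, 2.4, 2.8, 2.6])

p, t = two_sample(x, y, reps=2000, stat='mean', alternative='greater', seed=5)
# a small p supports the alternative that x comes from a population with a
# larger mean than y's
print(p, t)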
Example #38
class Relationship(object):
    def __init__(self, seed):
        self.seed = seed
        self.state = RandomState(self.seed)
        self.grouped = {}
        self.ops = self.RelationshipOps(self)

    def add_relations(self, from_ids, to_ids, weights=1):
        """
        Add relations to this Relationships from from_ids, to_ids, weights
        """

        self.grouped = utils.merge_2_dicts(
            self.grouped, Relations.from_tuples(from_ids, to_ids, weights),
            lambda r1, r2: r1.plus(r2))

    def add_grouped_relations(self, from_ids, grouped_ids):
        """
        Add "bulk" relationship, i.e. many "to" sides for each "from" side at
        once.

        :param from_ids: list of "from" sides of the relationships to add
        :param grouped_ids: list of list of "to" sides of the relationships
            to add

        Note: we assume all weights are 1 for this use (for now).
        """

        for one_from, many_tos in zip(from_ids, grouped_ids):
            rels = pd.DataFrame({"from": one_from, "to": many_tos})
            self.add_relations(from_ids=rels["from"], to_ids=rels["to"])

    def remove_relations(self, from_ids, to_ids):
        """
        Removes all relations between those from_ids and to_ids pairs (not combinatory: if each list is
        10 elements, we remove 10 pairs).
        If the same relation was stored several times between two ids, this removes them all
        """

        self.grouped = utils.merge_2_dicts(
            self.grouped, Relations.from_tuples(from_ids, to_ids, weights=0),
            lambda r1, r2: r1.minus(r2))

    def get_relations(self, from_ids=None):
        """
        This returns, as a dataframe, the sub-set of the relationships whose
        "from" is part of specified "from_ids".

        If no from_ids is provided, this just returns all the relations.
        """

        _from_ids = set(self.grouped.keys()) if from_ids is None else from_ids

        def _rel_arrays():
            for gid in set(_from_ids):
                if gid in self.grouped.keys():
                    relations = self.grouped[gid]
                    yield np.array([
                        np.array([gid] * relations.to_ids.shape[0]),
                        relations.to_ids, relations.weights
                    ])

        rel_arrays = list(_rel_arrays())
        if len(rel_arrays) == 0:
            return pd.DataFrame(columns=["from", "to", "weight"])

        else:
            df = pd.DataFrame(np.hstack(rel_arrays).T,
                              columns=["from", "to", "weight"])
            df["weight"] = df["weight"].astype(float)
            return df

    def get_neighbourhood_size(self, from_ids):
        """
        return a series indexed by "from" containing the number of "tos" for
        each requested from.
        """
        def size(from_id):
            if from_id in self.grouped:
                return len(self.grouped[from_id])
            else:
                return 0

        return pd.Series({from_id: size(from_id) for from_id in from_ids})

    def unique_tos(self):
        """
        :return: the set of unique "to" parts throughout all relationships
        """
        return {
            to
            for relations in self.grouped.values() for to in relations.to_ids
        }

    def select_one(self,
                   from_ids=None,
                   named_as="to",
                   remove_selected=False,
                   discard_empty=True,
                   one_to_one=False,
                   overridden_to_weights=None):
        """
        Randomly selects one "to" part for each specified id in from_ids. An
        id can be specified several times in that list, in which case we
        simply do a selection several times. The result is aligned with
        from_ids by index, i.e. the row in the return value that has the same
        pandas index as a row in from_ids is the selection for that row.

        The selection in the resulting dataframe will by default be named
        "to", unless this is overridden by "named_as".

        If remove_selected is True, the selected relations are removed from
        the relationship. This is handy to model stocks or any container of
        things.

        If discard_empty is False, all specified from_ids will be present in
        the result, even if no relation is available for them or if some
        selections were dropped due to the one-to-one config.

        If one_to_one is True, the selection is an injective function,
        i.e each to_ids will at most be picked once.

        overridden_to_weights is an optional dictionary of {"to": weight}
        that can be used to override the default weights contained in this
        Relationship.
        """

        if overridden_to_weights is not None:
            missing_keys = self.unique_tos() - set(
                overridden_to_weights.keys().values)
            assert len(missing_keys) == 0, \
                "overridden_to_weights is missing those 'to' keys: {}".format(
                    missing_keys)

        if from_ids is None:
            _from_ids = pd.Series(list(self.grouped.keys()))
        elif type(from_ids) == list:
            _from_ids = pd.Series(from_ids)
        else:
            _from_ids = from_ids

        def _results():
            # req_index is the technical index of the table built by the Story,
            # => it must be respected in order to correctly join the result of select_one
            for req_index, from_id in zip(_from_ids.index, _from_ids):
                if from_id in self.grouped:
                    idx, picked = self.grouped[from_id].pick_one(
                        self.state, overridden_to_weights)
                    if picked is None:
                        if discard_empty:
                            continue
                        else:
                            yield req_index, from_id, -1, None
                    else:
                        yield req_index, from_id, idx, picked

                elif not discard_empty:
                    yield req_index, from_id, -1, None

        output = list(zip(*_results()))
        if len(output) == 0:
            return pd.DataFrame(columns=["from", named_as])

        request_index, from_id, rel_idx, chosen_tos = output
        output = pd.DataFrame(
            {
                named_as: list(chosen_tos),
                "idx": list(rel_idx),
                "from": from_id
            },
            index=request_index)

        if one_to_one and output.shape[0] > 0:
            # not de-duplicating the blank results
            blank_idx = output[named_as].isna()
            blanks, present = output[blank_idx], output[~blank_idx]

            present = present.loc[self.state.permutation(present.index)]
            present.drop_duplicates(subset=named_as,
                                    keep="first",
                                    inplace=True)

            output = pd.concat([present, blanks])

        if remove_selected:

            # we have to remove all the selected relations of each "from" in
            # one go, since a non one-to-one selection may contain several
            # picks for the same "from"
            g = output[output["idx"] != -1][["from", "idx"]].groupby(by="from")
            for from_id in g.groups:
                group = self.grouped[from_id]
                removed_idx = g.get_group(from_id)["idx"]
                group.remove_inplace(removed_idx)
                if len(group) == 0:
                    del self.grouped[from_id]

        output.drop(["idx"], axis=1, inplace=True)
        return output
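
    # A minimal usage sketch of select_one (an assumption, based on the
    # constructor and add_relations calls visible in load_from further below):
    #
    #   rel = Relationship(1234)
    #   rel.add_relations(from_ids=["a", "a", "b"],
    #                     to_ids=["x", "y", "z"],
    #                     weights=[1., 1., 1.])
    #   picks = rel.select_one(from_ids=["a", "b"], named_as="item",
    #                          remove_selected=True)
    #   # picks has columns ["from", "item"], indexed like from_ids; with
    #   # remove_selected=True the picked relations are depleted (stock-like).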

    def select_all_horizontal(self, from_ids, named_as="to"):
        """
        Return all the "to" sides starting from each "from",
        as a "horizontal" list, i.e. each "from" is on one row and the set of
        all its "to" values is on that same row, in one list.

        Any requested from_id that has no relationship is absent from the
        returned dataframe (=> the corresponding rows are dropped in the result)
        """

        rows = self.get_relations(from_ids)
        groups = rows.set_index("to", drop=True).groupby("from", sort=False)
        df = pd.DataFrame(data=list(groups.groups.items()),
                          columns=["from", named_as])
        df[named_as] = df[named_as].apply(lambda s: [el for el in s])
        return df
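
    # For illustration (hypothetical data): with relations a->x, a->y and b->z,
    # select_all_horizontal(["a", "b"]) returns one row per "from":
    #
    #   from   to
    #   a      [x, y]
    #   b      [z]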

    def select_many(self,
                    from_ids,
                    named_as,
                    quantities,
                    remove_selected=False,
                    discard_empty=True):
        """

        The result is returned in vertical format and indexed by the values of the index of from_ids.
        Since we select several values, we return several rows per index value of from_ids =>
        during the subsequent join by the Operation, the number of produced rows increases.

        """

        req = pd.DataFrame({"from": from_ids, "qties": quantities})
        req["qties"] = req["qties"].astype(np.int)

        # gathers all requests to the same "from" together, keeping track of
        # the "request index" in the original from_ids so we can merge it later
        def gather(df):

            # shuffles that set of requests so that, in case of capping, the
            # same from_id does not get "capped" every time
            df2 = df.loc[self.state.permutation(df.index)]
            return pd.Series({
                "quantities": df2["qties"].tolist(),
                "req_index": df2.index.tolist()
            })

        # the same "from" can be requested several times
        all_reqs = req.groupby("from", sort=False).apply(gather)

        def _all_picks_results():
            for _, row in all_reqs.iterrows():

                from_id = row.name

                if from_id in self.grouped:

                    relations = self.grouped[from_id]
                    quantities = utils.cap_to_total(row["quantities"],
                                                    len(relations))

                    # rel_idx is the index of the picked values within the grouped values (i.e. for one from_id)
                    rel_idx, rel_tos = relations.pick_many(
                        self.state, np.sum(quantities))

                    # prepares the indices of the resulting vertical format, as a sequence
                    # of index interval where to inject the picked values
                    to_idx = np.cumsum(quantities).tolist()
                    from_idx = [0] + to_idx[:-1]
                    idx_intervals = [(lb, ub)
                                     for lb, ub in zip(from_idx, to_idx)]

                    def _one_pick_result():
                        for ((lower_bound, upper_bound),
                             req_index) in zip(idx_intervals,
                                               row["req_index"]):
                            size = upper_bound - lower_bound

                            if size == 0:
                                continue

                            yield [
                                req_index,
                                from_id,
                                rel_tos[lower_bound:upper_bound],
                                rel_idx[lower_bound:upper_bound],
                            ]

                    yield list(_one_pick_result())

        all_picks_results = list(_all_picks_results())

        if len(all_picks_results) > 0:
            output = pd.DataFrame(
                data=functools.reduce(lambda l1, l2: l1 + l2,
                                      all_picks_results),
                columns=["req_idx", "from", named_as, "rel_idx"])

            if remove_selected:

                # remove all the selected relations of each "from" in one go,
                # since the same "from" may have been requested several times
                g = output[output["rel_idx"] != -1][["from", "rel_idx"
                                                     ]].groupby(by="from")
                for from_id in g.groups:
                    group = self.grouped[from_id]
                    removed_idx = g.get_group(from_id)["rel_idx"].values[0]
                    group.remove_inplace(removed_idx)
                    if len(group) == 0:
                        del self.grouped[from_id]

        else:
            output = pd.DataFrame(
                columns=["req_idx", "from", named_as, "rel_idx"])

        output.set_index("req_idx", drop=True, inplace=True)
        output.drop(["rel_idx", "from"], axis=1, inplace=True)

        # "discard_empty" option: return empty result (instead of nothing) for
        # any non existing (i.e. empty) "from" relation
        if not discard_empty and output.shape[0] != len(from_ids):
            missing_index = from_ids.index.difference(output.index)
            missing_values = pd.DataFrame({
                named_as:
                pd.Series([[] * missing_index.shape[0]], index=missing_index)
            })

            output = pd.concat([output, missing_values], copy=False)

        return output
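
    # Sketch of the resulting "vertical" format (hypothetical data): requesting
    # quantities [2, 1] for from_ids ["a", "b"] yields one row per picked "to",
    # i.e. two rows indexed by the request index of "a" and one row for "b".
    # Quantities exceeding the available relations of a "from" are capped by
    # utils.cap_to_total before picking.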

    ######################
    # IO                 #
    ######################

    def save_to(self, file_path):
        """
        Saves the whole relationship, together with the current seed,
        as a CSV file
        """
        logging.info("saving relationship to {}".format(file_path))

        # creating a vertical dataframe to store the inner table
        saved_df = pd.DataFrame(self.get_relations().stack(),
                                columns=["value"])

        # we also want to save the seed => add an index level to separate
        # the relations from the seed in the end result
        saved_df["param"] = "relations"
        saved_df = saved_df.set_index("param", append=True)
        saved_df.index = saved_df.index.reorder_levels([2, 0, 1])

        # then finally add the seed
        saved_df.loc[("seed", 0, 0)] = self.seed
        saved_df.to_csv(file_path)

    @staticmethod
    def load_from(file_path):
        logging.info("loading relationship from {}".format(file_path))

        saved_df = pd.read_csv(file_path, index_col=[0, 1, 2])
        seed = int(saved_df.loc["seed"].values[0][0])

        _all = slice(None)
        relations = saved_df.loc[("relations", _all, _all)].unstack()
        relations.index = relations.index.droplevel(0)
        relations.columns = relations.columns.droplevel(0)

        relationship = Relationship(seed)
        relationship.add_relations(
            from_ids=relations["from"].values,
            to_ids=relations["to"].values,
            weights=relations["weight"].values.astype(float))

        return relationship
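
    # Round-trip sketch (an assumption about the file layout, derived from
    # save_to above): the CSV stores one row per (param, row, column) triple
    # plus a dedicated ("seed", 0, 0) row, so load_from can rebuild both the
    # relations and the seed.
    #
    #   rel.save_to("rel.csv")
    #   rel2 = Relationship.load_from("rel.csv")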

    class RelationshipOps(object):
        def __init__(self, relationship):
            self.relationship = relationship

        class AddNeighbourhoodSize(AddColumns):
            def __init__(self, relationship, from_field, named_as):
                AddColumns.__init__(self)

                self.relationship = relationship
                self.from_field = from_field
                self.named_as = named_as

            def build_output(self, story_data):

                requested_froms = story_data[self.from_field]
                sizes = self.relationship.get_neighbourhood_size(
                    from_ids=requested_froms)

                return pd.DataFrame(
                    {self.named_as: requested_froms.map(sizes).astype(int)})

        def get_neighbourhood_size(self, from_field, named_as):
            return self.AddNeighbourhoodSize(self.relationship, from_field,
                                             named_as)

        class SelectOne(AddColumns):
            """
            """
            def __init__(self, relationship, from_field, named_as, one_to_one,
                         pop, discard_missing, weight):

                # inner join instead of default left to allow dropping rows
                # in case of duplicates and one-to-one
                AddColumns.__init__(self, join_kind="inner")

                self.relationship = relationship
                self.from_field = from_field
                self.named_as = named_as
                self.one_to_one = one_to_one
                self.pop = pop
                self.discard_missing = discard_missing
                self.weight = weight

            def build_output(self, story_data):
                selected = self.relationship.select_one(
                    from_ids=story_data[self.from_field],
                    named_as=self.named_as,
                    remove_selected=self.pop,
                    one_to_one=self.one_to_one,
                    discard_empty=self.discard_missing,
                    overridden_to_weights=self.weight)

                selected.drop("from", axis=1, inplace=True)
                return selected

        def select_one(self,
                       from_field,
                       named_as,
                       one_to_one=False,
                       pop=False,
                       discard_empty=False,
                       weight=None):
            """
            :param from_field: field corresponding to the "from" side of the
                relationship

            :param named_as: field name assigned to the selected "to" side
                of the relationship

            :param one_to_one: boolean indicating that any "to" value will be
                selected at most once

            :param pop: if True, the selected relation is removed

            :param discard_empty: if False, any non-existing "from" in the
                relationship yields a None in the resulting selection. If
                True, that row is removed from the story_data.

            :param weight: weight to use for the "to" side of the
                relationship. Must be a Series whose index are the "to" values.
                Typical usage would be to plug an attribute of the "to"
                population here.

            :return: this operation adds a single column corresponding to a
                random choice from a Relationship
            """
            return self.SelectOne(self.relationship, from_field, named_as,
                                  one_to_one, pop, discard_empty, weight)

        class SelectAll(Operation):
            def __init__(self, relationship, from_field, named_as):
                self.relationship = relationship
                self.from_field = from_field
                self.named_as = named_as

            def transform(self, story_data):

                from_ids = story_data[[self.from_field]].drop_duplicates()
                selected = self.relationship.select_all_horizontal(
                    from_ids=from_ids[self.from_field].values,
                    named_as=self.named_as)

                selected.set_index("from", drop=True, inplace=True)
                return pd.merge(left=story_data,
                                right=selected,
                                left_on=self.from_field,
                                right_index=True)

        def select_all(self, from_field, named_as):
            """
            This simply creates a new story_data field containing all the
            "to" values of the requested from, as a list.
            """
            return self.SelectAll(self.relationship, from_field, named_as)

        class SelectMany(AddColumns):
            """
            """
            def __init__(self, relationship, from_field, named_as,
                         quantity_field, pop, discard_missing):

                # inner join instead of default left to allow dropping rows
                # in case of duplicates and one-to-one
                AddColumns.__init__(self, join_kind="inner")

                self.relationship = relationship
                self.discard_missing = discard_missing
                self.from_field = from_field
                self.named_as = named_as
                self.quantity_field = quantity_field
                self.pop = pop

            def build_output(self, story_data):
                selected = self.relationship.select_many(
                    from_ids=story_data[self.from_field],
                    named_as=self.named_as,
                    quantities=story_data[self.quantity_field],
                    remove_selected=self.pop,
                    discard_empty=self.discard_missing)

                return selected

        def select_many(self,
                        from_field,
                        named_as,
                        quantity_field,
                        pop=False,
                        discard_missing=True):
            return self.SelectMany(self.relationship, from_field, named_as,
                                   quantity_field, pop, discard_missing)

        class Add(SideEffectOnly):
            def __init__(self, relationship, from_field, item_field):
                self.relationship = relationship
                self.from_field = from_field
                self.item_field = item_field

            def side_effect(self, story_data):
                if story_data.shape[0] > 0:
                    self.relationship.add_relations(
                        from_ids=story_data[self.from_field],
                        to_ids=story_data[self.item_field])

        def add(self, from_field, item_field):
            return self.Add(self.relationship, from_field, item_field)

        class AddGrouped(SideEffectOnly):
            def __init__(self, relationship, from_field, grouped_items_field):
                self.relationship = relationship
                self.from_field = from_field
                self.grouped_items_field = grouped_items_field

            def side_effect(self, story_data):
                if story_data.shape[0] > 0:

                    self.relationship.add_grouped_relations(
                        from_ids=story_data[self.from_field],
                        grouped_ids=story_data[self.grouped_items_field])

        def add_grouped(self, from_field, grouped_items_field):
            """
            this is similar to add, except that the "to" field here should
            contain lists of "to" values instead of single ones
            """
            return self.AddGrouped(self.relationship, from_field,
                                   grouped_items_field)

        class Remove(SideEffectOnly):
            def __init__(self, relationship, from_field, item_field):
                self.relationship = relationship
                self.from_field = from_field
                self.item_field = item_field

            def side_effect(self, story_data):
                if story_data.shape[0] > 0:
                    self.relationship.remove(
                        from_ids=story_data[self.from_field],
                        to_ids=story_data[self.item_field])

        def remove(self, from_field, item_field):
            return self.Remove(self.relationship, from_field, item_field)
Example #39
class Configurator:

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def __init__(self, NU_TRNG_DATA, NU_TRNG_LABS, verbose = False, NU_CFG_PATH = None):
        # DATA and LAB MEMBERS
        # Should be your training data and corresponding labels
        self.TRNG_DATA = NU_TRNG_DATA
        self.TRNG_LABS = NU_TRNG_LABS
        # Just in case you want to use a special Config file
        if NU_CFG_PATH is None:
            self.CFG_PATH = DEF_CFG_PATH
        else:
            self.CFG_PATH = NU_CFG_PATH
        # Different Containers for Data
        self.CFG = {'sections': ['STACK', 'LAYER', 'SMAX_TUNE',
                                 'MET_TUNE', 'LOG_TUNE',
                                 'CPP_LIBRARY', 'FILEIO', 'SYSTEM']}
        self.CFG_STACK = []
        # RandomState Object
        self.RANDO = RS()
        # Congfig parser
        self.CP = CFPR(allow_no_value=True)
        self.TLAY = None
        self.CHECKS = {'cfg_loaded'    : False,
                       'lay_obj_initd' : False,
                       'verbose'       : verbose}
        self.SWARM_SIZE = 5
        self.ALL_CHECKS = self.CHECKS.keys()
        self.FLOAT_STEP = .00001
        self.INT_STEP = 1
        self.GIVE_UP_SCALE = np.linspace(-100,100,5000)
        self.PATIENT_LEVEL = 2500
        self.SCORE_STACK = [999999.]
        self.START_TIME = time.time()
        self.STOP_AT_HOUR = 1.0

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def CFGControl(self):
        for TC in [self.isPatient(), not self.isStopTime()]:
            print TC
            yield TC

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def GetPatiencesLevel(self):
        return self.GIVE_UP_SCALE[self.PATIENT_LEVEL]

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def isPatient(self):
        return self.GIVE_UP_SCALE[np.minimum(self.PATIENT_LEVEL,
               len(self.GIVE_UP_SCALE) - 1)] > -50.

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def TimeElapsed(self):
        # elapsed time in hours
        return ((time.time() - self.START_TIME)/60./60.)

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def isStopTime(self):
        return self.STOP_AT_HOUR < self.TimeElapsed()

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def isStackEmpty(self, STACK):
        return len(STACK) == 0

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def PatienceUp(self):
        self.PATIENT_LEVEL+=1

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def PatienceDown(self):
        self.PATIENT_LEVEL-=1

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def BuzzEm(self, VAL, BUZZ):
        if BUZZ in GenInts():
            NU_INT = self.BuzzInt(VAL)
            return NU_INT if NU_INT>0 else VAL
        else:
            NU_FLOAT = self.BuzzFloat(VAL)
            return NU_FLOAT if NU_FLOAT>0 else VAL
            

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def BuzzInt(self, VAL):
        RANGE = [VAL-self.INT_STEP, VAL+self.INT_STEP]
        SEQ = np.arange(RANGE[0], RANGE[1]+1,self.INT_STEP)
        NEW_INT, THROW_AWAY = SEQ[0],SEQ[1:]
        return NEW_INT

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def BuzzFloat(self, VAL):
        RANGE = [VAL-self.FLOAT_STEP, VAL+self.FLOAT_STEP]
        SEQ = np.arange(RANGE[0], RANGE[1]+1,self.FLOAT_STEP)
        NEW_FLOAT, THROW_AWAY = SEQ[0],SEQ[1:]
        return NEW_FLOAT

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def LoadAndConfigure(self, CFG_PATH=None):
        if CFG_PATH is not None:
            self.CFG_PATH = CFG_PATH
        self.LoadConfig()
        self.Configure()

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def Configure(self, CFG_PATH = None):
        self.CFG_STACK.insert(0, self.CFG)
        LAY = Layer(self.CFG['LAYER'])
        while all([C for C in self.CFGControl()]):
            LAY.ClearLayerParams()
            INT_IDX = [[i,j] for i,j in GenInts()]
            FLOAT_IDX =[[i,j] for i,j in GenFloats()]
            ALL_BUZZERS = self.RANDO.permutation([i for i in INT_IDX]+[f for f in  FLOAT_IDX])
            SPLIT_AT = np.minimum(self.SWARM_SIZE, ALL_BUZZERS.size)
            BUZZERS = ALL_BUZZERS[:SPLIT_AT]
            REJECTS = ALL_BUZZERS[SPLIT_AT+1:]
            for [SECT, PARAM] in BUZZERS:
                OLD_VAL = self.CFG[SECT][PARAM]
                self.CFG[SECT][PARAM] = self.BuzzEm(OLD_VAL, [SECT, PARAM])
            for [SECT,PARAM] in REJECTS:
                self.CFG[SECT][PARAM] = self.CFG[SECT][PARAM] 
            print self.CFG.keys()
            LAY.SetNewParams(self.CFG)
            RESULT = self.LogTrain(LAY)            
            if RESULT['fun'][0][-1] <= self.SCORE_STACK[0]:
                self.CFG_STACK.insert(0, self.CFG)
                self.PatienceUp()
            else:
                if self.isStackEmpty(self.CFG_STACK):
                    self.SWARM_SIZE += np.ceil(self.SWARM_SIZE/2.)
                    self.FLOAT_STEP = .00002
                    self.INT_STEP = 2
                    self.PatienceDown()
                else:
                    self.CFG_STACK.pop(0)
                    self.PatienceDown()
    
    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def GetCFG(self):
        return self.CFG
    

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def DeepCopy2CFG(self, NU_CFG):
        self.CFG = {SECT : {PARAM : NU_CFG[SECT][PARAM]
                                for PARAM in GenParams(NU_CFG)
                           } for SECT in GenSects(NU_CFG)}

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def DeepCopyCFG2NU(self):
        return    {SECT : {PARAM : self.CFG[SECT][PARAM]
                                for PARAM in GenParams(self.CFG)
                           } for SECT in GenSects(self.CFG)}

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def PushBehave(self):
        self.BHAVE_STACK.insert(0, {})
        self.BHAVE_STACK[0].update(self.CURR_BEHAVIOR)

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def PushConfig(self, LATEST_CFG):
        self.CFG_STACK.insert(0, {})
        for SECT, PARM in GenSectsAndParams(LATEST_CFG):
            self.CFG_STACK[0][SECT][PARM] = LATEST_CFG[SECT][PARM]
        self.NUM_CFG_STACKED+=1

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def PopConfig(self):
        if self.NUM_CFG_STACKED > 0:
            NU_CFG = {SECT : {PARM : self.CFG_STACK[0][SECT][PARM]
                                for PARM in GenParams(self.CFG_STACK[0][SECT])
                             }    for SECT in GenSects(self.CFG_STACK[0])}
            self.CFG_STACK.pop(0)
            self.SCORE_STACK.pop(0)
            self.NUM_CFG_STACKED -= 1
            return NU_CFG
        else:
            raise IndexError, "None more configurations to pop off stack."

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def LoadConfig(self):
        '''
        DESCRPT:  A monster; thankfully this is the only place the config file
                    is read on behalf of all the other classes
        IN ARGS:  none (reads the config file at self.CFG_PATH)
        NOTES:
        '''
        try:
            self.CP.read(self.CFG_PATH)
        except:
            print  "Config Name"
            raise IOError, 'Config file wasn\'t able to be read'
        GENERIC_DICT = {}
        for SECT in self.CP.sections():
            GENERIC_DICT[SECT] = {}
            for OPTS in self.CP.options(SECT):
                val = self.CP.get(SECT, OPTS)
                if val == 'True':
                    GENERIC_DICT[SECT][OPTS] = True
                elif val == 'False':
                    GENERIC_DICT[SECT][OPTS] = False
                else:
                    try:
                        GENERIC_DICT[SECT][OPTS] = int(val)
                    except:
                        try:
                            GENERIC_DICT[SECT][OPTS] = float(val)
                        except:
                            try:
                                GENERIC_DICT[SECT][OPTS] = val
                            except:
                                GENERIC_DICT[SECT][OPTS] = None
        for key in GenSects(GENERIC_DICT):
            if key not in self.CFG['sections']:
                self.CFG['sections'].append(key)
        for key in self.CFG['sections']:
            self.CFG[key] = GENERIC_DICT[key]
        self.CFG['LAYER']['disp'] = self.CHECKS['verbose']
        self.CHECKS['config_loaded'] = True
        self.CHECKS['lee_wants_rand_off'] = self.CFG['STACK']['lee_wants_rand_off']
        if self.CFG['STACK']['lee_wants_rand_off']:
            CONFIG_SEED = self.CFG['STACK']['rand_seed_32bit']
            self.RANDO.seed(seed=CONFIG_SEED)
        else:
            self.RANDO.seed(seed=np.int32(time.time()))

        self.CHECKS['cfg_loaded'] = True

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def GetChecks(self, CHECK_NAME):
        if self.CHECKS.has_key(CHECK_NAME):
            return self.CHECKS[CHECK_NAME]
        else:
            raise Warning, "Thats not a valid Check"

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def UpdateConfig(self, SECTION=None, OPTION=None):
        '''
        DESCRPT:
                PREVIOUSLY:
                    Class parameters were loaded within the classes themselves
                    and were frustratingly difficult to interact with from outside.
                    Plus they took up a lot of space.
                LATER:
                    I implemented parameter dictionaries for individual classes that
                    would be passed back and forth from StackSAE to an instantiating class.
                    Not bad overall, just tedious.
                NOW:
                    ONE dictionary contains all the parameters and behaves exactly
                    like the old way but with far less tedium.

        NOTE: This function backs up the latest version of the config file before
              writing the updated one. The backup is time-stamped.
        '''
        import shutil
        # This keeps track of the last config file backed up and is a parameter in
        # FILEIO
        LAST_CONFIG_UPDATE_PATH = os.getcwd()
        self.CFG['FILEIO']['last_config_backup'] = LAST_CONFIG_UPDATE_PATH + '_config_bu.ini'
        shutil.copy2(os.getcwd() + '/config.ini', LAST_CONFIG_UPDATE_PATH)
         # The config parser instantiated with SSAE is updated
        if SECTION is not None:
            sect = [SECTION]
        else:
            sect = self.CFG['sections']
        if OPTION is not None:
            opt = OPTION
            self.CP.set(SECTION, opt, str(self.CFG[SECTION][opt]))
        else:
            for s in sect:
                for opt in list(self.CFG[s]):
                    self.CP.set(s, opt, str(self.CFG[s][opt]))
        # new config file is now in the cwd
        with open('config.ini', 'w') as write_config:
            self.CP.write(write_config)

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def ProcessResult(self, RES):
        self.IntakeNuResults()
        for ATT in RES.keys():
            if ATT in self.PREV_RESULT.keys():
                self.CURR_RESULT[ATT] = RES.get(ATT)
        self.TLAY.ClearLayerParams()

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def StopTrain(self):
        self.ChangePhase('stop')
        self.ProcessResult({'phase_name': self.CURR_PHASE})

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def LogTrain(self, LAY):
        IN_SHAPE = self.TRNG_DATA.shape
        NUM_HIDD = self.CFG['STACK']['num_hidden']
        MIN_HIDD = self.CFG['STACK']['min_hidden']
        MAX_LAYER = self.CFG['STACK']['max_layer']
        DEC_HIDD_BY = self.CFG['STACK']['decrement_num_hidden']
        BASE_NOISE = self.CFG['STACK']['base_noise_level']
        OUT_SHAPE = (IN_SHAPE[0], NUM_HIDD)
        [WIN, WOUT, BIN, BOUT, SHAPES] = LAY.CreateLogLayer(IN_SHAPE, NUM_HIDD, self.RANDO)
        THETA = LAY.TrainSparseAE(WIN, WOUT,
                                  BIN, BOUT,
                                  self.TRNG_DATA)
        return {'success': THETA.success,
                'message': THETA.message,
                'fun'    : THETA.fun,
                'nfev'   : THETA.nfev,
                'nit'    : THETA.nit }

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def FlattenData(self, DATA):
        N,M = DATA.shape
        self.TRNG_DATA = DATA.reshape(N*M)

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def SetTrainingData(self, DATA):
        if DATA.ndim > 1:
            self.FlattenData(DATA)
        else:
            self.TRNG_DATA = DATA

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def SetTrainingLabs(self, LABS):
        self.TRNG_LABS = LABS

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def ChangePhase(self, NU_PHASE):
        if NU_PHASE in self.ALLOWED_PHASE:
            self.CURR_PHASE = NU_PHASE
        else:
            raise ValueError, 'Non allowable phase passed'

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def BuildNewLay_CurrParam(self, PHASE = None):
        if PHASE is not None and self.CURR_PHASE != PHASE:
            self.ChangePhase(PHASE)
        self.TEST_LAY = Layer(self.CFG['LAYER'])

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def BuildNewLay_NuParam(self, PHASE = None, **NU_LAY_PARAM):
        if PHASE is not None and self.CURR_PHASE != PHASE:
            self.ChangePhase(PHASE)
        self.TEST_LAY = Layer(MergeNu2Old(self.CFG, NU_LAY_PARAM))

    '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
    def CFGStackIsEmpty(self):
        return len(self.CFG_STACK) == 0
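
# A hedged usage sketch of the Configurator above (DEF_CFG_PATH, Layer and the
# Gen* helpers are defined elsewhere in this module and are assumed here):
#
#   cfg_search = Configurator(TRAIN_X, TRAIN_Y, verbose=True)
#   cfg_search.LoadAndConfigure()   # reads config.ini, then hill-climbs params
#   best_cfg = cfg_search.GetCFG()
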
import numpy as np
from numpy.random import RandomState
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

RNG = RandomState(21)

# Construct an example dataset for binary classification
n_vars = 2
n_events = 10000
signal = RNG.multivariate_normal(
    np.ones(n_vars), np.diag(np.ones(n_vars)), n_events)
background = RNG.multivariate_normal(
    np.ones(n_vars) * -1, np.diag(np.ones(n_vars)), n_events)
X = np.concatenate([signal, background])
y = np.ones(X.shape[0])
w = RNG.randint(1, 10, n_events * 2)
y[signal.shape[0]:] *= -1
permute = RNG.permutation(y.shape[0])
X = X[permute]
y = y[permute]

# Use all dataset for training
X_train, y_train, w_train = X, y, w

# Declare BDT - we are going to use AdaBoost Decision Tree
dt = DecisionTreeClassifier(max_depth=3,
                            min_samples_leaf=int(0.05*len(X_train)))
bdt = AdaBoostClassifier(dt,
                         algorithm='SAMME',
                         n_estimators=800,
                         learning_rate=0.5)

# Train BDT
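# (The original snippet is truncated here; with scikit-learn's API the training
# step would presumably be the following.)
bdt.fit(X_train, y_train, sample_weight=w_train)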
Example #41
class CryoDataset:
    def __init__(self,imgstack,ctfstack):
        self.imgstack = imgstack
        self.ctfstack = ctfstack
        
        assert self.imgstack.get_num_images() == self.ctfstack.get_num_images()
        
        self.N = self.imgstack.get_num_pixels()
        self.pixel_size = self.imgstack.get_pixel_size()

    
    def compute_noise_statistics(self):
        self.mleDC_est = self.estimate_dc()
        self.noise_var = self.imgstack.estimate_noise_variance()
        self.data_var = self.imgstack.compute_variance()
        
        print 'Dataset noise profile'
        print '  Noise: {0:.3g}'.format(n.sqrt(self.noise_var))
        print '  Data: {0:.3g}'.format(n.sqrt(self.data_var))
        assert self.data_var > self.noise_var
        self.signal_var = self.data_var - self.noise_var 
        print '  Signal: {0:.3g}'.format(n.sqrt(self.signal_var))
        print '  Signal-to-Noise Ratio: {0:.1f}% ({1:.1f}dB)'.format(100*self.signal_var/self.noise_var, 10*n.log10(self.signal_var/self.noise_var))

    def normalize_dataset(self):
        self.imgstack.scale_images(1.0/n.sqrt(self.noise_var))
        self.ctfstack.scale_ctfs(1.0/n.sqrt(self.noise_var))
    
        self.data_var = self.data_var/self.noise_var
        self.signal_var = self.signal_var/self.noise_var    
        self.noise_var = 1.0


    def divide_dataset(self,minibatch_size,testset_size,partition,num_partitions,seed):
        self.rand = RandomState(seed)
        
        self.N_D = self.imgstack.get_num_images()
        self.idxs = self.rand.permutation(self.N_D)
        
        print "Dividing dataset of {0} images with minisize of {1}".format(self.N_D,minibatch_size)
        if testset_size != None:
            print "  Test Images: {0}".format(testset_size)
            self.test_idxs = self.idxs[0:testset_size]
            self.train_idxs = self.idxs[testset_size:]
        else:
            self.train_idxs = self.idxs
            self.test_idxs = []
        
        if num_partitions > 1:
            print "  Partition: {0} of {1}".format(partition+1,num_partitions)
            N_D = len(self.train_idxs)
            partSz = N_D/num_partitions
            self.train_idxs = self.train_idxs[partition*partSz:(partition+1)*partSz]

        self.N_D_Test = len(self.test_idxs)
        self.N_D_Train = len(self.train_idxs)
        numBatches = int(n.floor(float(self.N_D_Train)/minibatch_size))
        real_minisize = int(n.floor(float(self.N_D_Train)/numBatches))
        N_Rem = self.N_D_Train - real_minisize*numBatches
        numRegBatches = numBatches - N_Rem
        batchInds = [ (real_minisize*i, real_minisize*(i+1)) \
                      for i in xrange(numRegBatches) ] + \
                    [ (real_minisize*numRegBatches + (real_minisize+1)*i,
                       min(real_minisize*numRegBatches + (real_minisize+1)*(i+1),self.N_D_Train)) \
                      for i in xrange(N_Rem) ]
        self.batch_idxs = n.array(batchInds)
        self.N_batches = self.batch_idxs.shape[0]
        self.batch_order = self.rand.permutation(self.N_batches)

        batch_sizes = self.batch_idxs[:,1] - self.batch_idxs[:,0]

        print "  Train Images: {0}".format(self.N_D_Train)
        print "  Minibatches: {0}".format(self.N_batches)
        print "  Batch Size Range: {0} - {1}".format(batch_sizes.min(),batch_sizes.max())
        
        self.minibatch_size = minibatch_size
        self.testset_size = testset_size
        self.partition = partition
        self.num_partitions = num_partitions

        self.reset_minibatches(True)
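
        # Worked example of the batch layout above (hypothetical numbers):
        # with N_D_Train = 103 and minibatch_size = 10, numBatches = 10,
        # real_minisize = 10, N_Rem = 3, so batchInds holds 7 batches of 10
        # followed by 3 batches of 11, covering all 103 training indices.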

    def get_dc_estimate(self):
        return self.mleDC_est

    def estimate_dc(self,esttype='robust'):
        N = self.N
        
        obs = []
        ctf_dcs = {}
        zeros = n.zeros((1,2))
        for img_i,img in enumerate(self.imgstack):
            ctf_i = self.ctfstack.get_ctf_idx_for_image(img_i)
            if ctf_i not in ctf_dcs:
                ctf_dcs[ctf_i] = self.ctfstack.get_ctf(ctf_i).compute(zeros)
                 
            obs.append(n.mean(img) * n.sqrt(float(N)) / ctf_dcs[ctf_i])
            
        obs = n.array(obs)
        mleDC, mleDC_std = estimate_mean_std(obs, esttype)
        mleDC_est_std = mleDC_std /  n.sqrt(len(obs))
        
        return mleDC, mleDC_std, mleDC_est_std
    
    def set_datasign(self,datasign):
        mleDC, _, mleDC_est_std = self.get_dc_estimate()
        datasign_est = 1 if mleDC > 2*mleDC_est_std else -1 if mleDC < -2*mleDC_est_std else 0
        print "Estimated DC Component: {0:.3g} +/- {1:.3g}".format(mleDC,mleDC_est_std)

        if datasign == 'auto':
            if datasign_est == 0:
                print "  WARNING: estimated DC component has large variance, detected sign could be wrong."
                datasign = n.sign(mleDC)
            else:
                datasign = datasign_est
        else:
            if datasign_est*datasign < 0:
                print "  WARNING: estimated DC component and specified datasign disagree; be sure this is correct!"
            
        if datasign != 1:
            print "  Using negative datasign"
            assert datasign == -1
            self.ctfstack.flip_datasign()
        else:
            print "  Using positive datasign"
            assert datasign == 1

    def reset_minibatches(self,epochReset=True):
        self.curr_batch = None
        self.epoch_frac = 0

        if epochReset:
            self.epoch = 0
            self.data_visits = 0

    def get_testbatch(self):
        miniidx = self.test_idxs
        ret = {'img_idxs':miniidx, 
               'ctf_idxs':self.ctfstack.get_ctf_idx_for_image(miniidx),
               'N_M':len(miniidx), 'test_batch':True}
        
        return ret
    
    def get_next_minibatch(self,shuffle_minibatches):
        if self.curr_batch == None:
            self.curr_batch = 1
            batchInd = 0
            newepoch = False
        else:
            batchInd = self.curr_batch
            self.curr_batch = (self.curr_batch+1)%self.N_batches
            newepoch = batchInd == 0

        if newepoch:
            if shuffle_minibatches:
                self.batch_order = self.rand.permutation(self.N_batches)
            self.epoch = self.epoch + 1
            self.epoch_frac = 0

        batch_id = self.batch_order[batchInd]

        startI = self.batch_idxs[batch_id,0]
        endI = self.batch_idxs[batch_id,1]
        miniidx = self.train_idxs[startI:endI]

        self.data_visits += endI - startI
        self.epoch_frac += float(endI - startI)/self.N_D_Train
      
        ret = {'img_idxs':miniidx, 
               'ctf_idxs':self.ctfstack.get_ctf_idx_for_image(miniidx),
               'N_M':len(miniidx), 'id':batch_id, 'epoch':self.epoch + self.epoch_frac,
               'num_batches': self.N_batches, 'newepoch':newepoch, 'test_batch':False }

        return ret

    def get_epoch(self,frac=False):
        if self.epoch == None: # Data not yet loaded
            return 0

        if frac:
            return self.epoch + self.epoch_frac
        else:
            return self.epoch
def tune_classifier(clf, param_grid, avg_cycles=10, nr_training_samples=50, nr_test_samples=160, combine_scenes=False, filename=""):
    """

    :param clf: classifier exposing set_params / fit / decision_function / threshold
    :param param_grid: dict mapping parameter names to lists of values to evaluate
    :param avg_cycles: number of randomized repetitions to average over
    :param nr_training_samples: nr of class samples used for training per fold
    :param nr_test_samples: nr of samples (class + outliers) in each test batch
    :param combine_scenes: train on scenes 1 and 2 (True) or on scene 1 only (False)
    :param filename: output CSV name; derived from the classifier name if empty
    :return:
    """


    save_csv = True
    randomize = True
    nr_iters = avg_cycles
    objective = 'f1'

    emb1 = load_embeddings("matthias_test.pkl")
    emb2 = load_embeddings("matthias_test2.pkl")
    emb_lfw = load_embeddings("embeddings_lfw.pkl")

    emb1 = clean_duplicates(emb1)
    emb2 = clean_duplicates(emb2)
    emb_lfw = clean_duplicates(emb_lfw)

    # select scenes and outlier class
    class_ds1 = emb1
    class_ds2 = emb2
    outlier_ds = emb_lfw
    clf_name = clf.__class__.__name__

    # calculate folds
    if combine_scenes:
        nr_splits = float(nr_test_samples / (2. * nr_training_samples)) + 1
    else:
        nr_splits = float(nr_test_samples / (4. * nr_training_samples)) + 1
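    # Worked example (hypothetical numbers): with nr_training_samples = 50 and
    # nr_test_samples = 200 in the single-scene case, nr_splits = 200/(4*50) + 1 = 2,
    # i.e. 2-fold cross-validation; non-integer results are rejected just below.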

    if not nr_splits.is_integer():
        print "Invalid number of samples. Producing {} splits.".format(nr_splits)
        min_nr_test = nr_training_samples*2 if combine_scenes else nr_training_samples*4
        print "Adjust nr. training samples. E.g. {}, {}, {}, ...".format(min_nr_test, 2*min_nr_test, 3*min_nr_test)
        return

    nr_splits = int(nr_splits)
    print "Performing {}-fold cross-validation...".format(nr_splits)

    if objective not in {'f1', 'youden'}:
        raise ValueError

    # allocate storage
    iter_precision = []
    iter_recall = []
    iter_f1_scores = []
    iter_params = []
    iter_training_time = []
    iter_prediction_time = []
    iter_youden_indices = []
    iter_auc_scores = []

    prng = RandomState()

    for i in range(0, nr_iters):

        # deterministic shuffle, re-seeded each iteration for reproducibility

        prng = RandomState(i + 1)
        class_ds1_mixed = prng.permutation(class_ds1)
        class_ds2_mixed = prng.permutation(class_ds2)
        outlier_ds_mixed = prng.permutation(outlier_ds)

        # random.seed(i)  # Reset random state
        # random.shuffle(class_ds1)
        # random.shuffle(class_ds2)
        # random.shuffle(outlier_ds)

        kf = KFold(n_splits=nr_splits, shuffle=False)
        param_combinations = get_all_param_variants(param_grid)

        # allocate metrics
        precision_values = []
        recall_values = []
        youden_index = []
        f1_scores = []
        auc_scores = []
        training_time = []
        prediction_time = []

        # mode selection
        if combine_scenes:
            # -------------------- Case B: Train on 1 and 2, test on 1 and 2
            if (nr_training_samples/2+nr_test_samples/4) > len(class_ds1_mixed) or (nr_training_samples/2+nr_test_samples/4) > len(class_ds2_mixed):
                print "Too few samples!"
                return
        else:
            # -------------------- Case A: Train on 1, test on 1 and 2
            if (nr_training_samples+nr_test_samples/4) > len(class_ds1_mixed) or nr_test_samples/4 > len(class_ds2_mixed):
                print "Too few samples!"
                return

        for i_param, clf_params in enumerate(param_combinations):
            # init classifiers
            clf.set_params(**clf_params)

            # build each parameter combination
            precision_scores_config = []
            recall_scores_config = []
            f1_scores_config = []
            auc_scores_config = []
            training_time_config = []
            prediction_time_config = []

            # -------------------- Case A: Train on 1, test on 1 and 2

            if combine_scenes:

                scene1_samples = class_ds1_mixed[0:(nr_training_samples/2+nr_test_samples/4)]
                scene2_samples = class_ds2_mixed[0:(nr_training_samples/2+nr_test_samples/4)]

                # calculate precision and recall in kfold cross validation
                for test_indices, train_indices in kf.split(scene1_samples):

                    training_samples = np.concatenate((scene1_samples[train_indices], scene2_samples[train_indices]))

                    start = current_milli_time()
                    clf.fit(training_samples)
                    training_time_config.append(current_milli_time() - start)

                    # build test set, add scene 2 , add outlier dataset
                    test_with_outliers = np.concatenate((scene1_samples[test_indices], scene2_samples[test_indices], outlier_ds_mixed[0:(nr_test_samples/2)]))
                    # 1/2 class, 1/2 outliers
                    labels = np.concatenate((np.repeat(1, nr_test_samples/2), np.repeat(-1, nr_test_samples/2)))

                    # predict
                    start = current_milli_time()
                    # scores which are thresholded
                    scores = clf.decision_function(test_with_outliers)
                    prediction_time_config.append(current_milli_time() - start)

                    labels_predicted = clf.threshold(scores)

                    if clf_name == 'L2Estimator':
                        # invert probability
                        scores = 3 - scores

                    # validate
                    if len(test_with_outliers) != nr_test_samples:
                        print len(scene1_samples)
                        print len(test_with_outliers)
                        print nr_test_samples
                        print len(test_indices)
                        print "001: Check your code!"
                        return

                    # calculate metrics
                    fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
                    auc_val = auc(fpr, tpr)

                    true_nr_positives = nr_test_samples/2
                    true_nr_negatives = nr_test_samples/2
                    tp = np.count_nonzero(labels_predicted[0:true_nr_positives] == 1)
                    fn = true_nr_positives-tp
                    fp = np.count_nonzero(labels_predicted[true_nr_positives:] == 1)
                    tn = true_nr_negatives-fp
                    fpr = float(fp)/float(fp+tn)

                    recall = float(tp) / float(tp + fn)
                    try:
                        precision = float(tp) / float(tp + fp)
                        f1_score = 2 * float(precision * recall) / float(precision + recall)
                    except ZeroDivisionError:
                        precision = 0
                        f1_score = 0

                    # validate
                    if (tp + fn != nr_test_samples/2) or (fp + tn != nr_test_samples/2):
                        print "002: Check your code!"
                        print "tp: {}, tn: {}    ||   fn: {}, fp: {}, ".format(tp, fn, fp, tn)
                        print "precision: {}     ||   recall: {} ".format(precision, recall)
                        return

                    precision_scores_config.append(precision)
                    recall_scores_config.append(recall)
                    f1_scores_config.append(f1_score)
                    auc_scores_config.append(auc_val)

            else:

                class_samples_s1 = class_ds1_mixed[0:(nr_training_samples+nr_test_samples/4)]

                # calculate precision and recall in kfold cross validation
                for test_indices, train_indices in kf.split(class_samples_s1):

                    start = current_milli_time()
                    clf.fit(class_samples_s1[train_indices])
                    training_time_config.append(current_milli_time() - start)

                    # build test set, add scene 2 , add outlier dataset
                    test_with_outliers = np.concatenate((class_samples_s1[test_indices], class_ds2_mixed[0:nr_test_samples/4], outlier_ds_mixed[0:(nr_test_samples/2)]))
                    # 1/2 class, 1/2 outliers
                    labels = np.concatenate((np.repeat(1, nr_test_samples/2), np.repeat(-1, nr_test_samples/2)))

                    # predict
                    start = current_milli_time()
                    # scores which are thresholded
                    scores = clf.decision_function(test_with_outliers)
                    prediction_time_config.append(current_milli_time() - start)

                    labels_predicted = clf.threshold(scores)

                    if clf_name == 'L2Estimator':
                        # invert probability
                        scores = 3 - scores

                    # validate
                    if len(test_with_outliers) != nr_test_samples:
                        print "001: Check your code!"

                    # calculate metrics
                    fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
                    auc_val = auc(fpr, tpr)
                    true_nr_positives = nr_test_samples/2
                    true_nr_negatives = nr_test_samples/2
                    tp = np.count_nonzero(labels_predicted[0:true_nr_positives] == 1)
                    fn = true_nr_positives-tp
                    fp = np.count_nonzero(labels_predicted[true_nr_positives:] == 1)
                    tn = true_nr_negatives-fp
                    fpr = float(fp)/float(fp+tn)

                    recall = float(tp) / float(tp + fn)
                    try:
                        precision = float(tp) / float(tp + fp)
                        f1_score = 2 * float(precision * recall) / float(precision + recall)
                    except ZeroDivisionError:
                        precision = 0
                        f1_score = 0

                    # validate
                    if (tp + fn != nr_test_samples/2) or (fp + tn != nr_test_samples/2):
                        print "002: Check your code!"
                        print "tp: {}, tn: {}    ||   fn: {}, fp: {}, ".format(tp, fn, fp, tn)
                        print "precision: {}     ||   recall: {} ".format(precision, recall)
                        return

                    precision_scores_config.append(precision)
                    recall_scores_config.append(recall)
                    f1_scores_config.append(f1_score)
                    auc_scores_config.append(auc_val)

            # average precision and recall values
            precision_avg = np.mean(precision_scores_config)
            recall_avg = np.mean(recall_scores_config)
            training_time_avg = np.mean(training_time_config)
            prediction_time_avg = np.mean(prediction_time_config)
            f1_scores_avg = np.mean(f1_scores_config)
            auc_scores_avg = np.mean(auc_scores_config)

            precision_values.append(precision_avg)
            recall_values.append(recall_avg)
            youden_index.append(precision_avg+recall_avg-1)
            training_time.append(training_time_avg)
            prediction_time.append(prediction_time_avg)
            f1_scores.append(f1_scores_avg)
            auc_scores.append(auc_scores_avg)

            # if verbose:
            #     print "______________________________________________________________________\n" \
            #           "Params: {}".format(clf_params)
            #     print "Precision: {}     ||     Recall: {}".format(precision_avg, recall_avg)

        # --------------- END RANDOMIZED EXPERIMENT

        # print list(precision_values)
        # print list(recall_values)

        # --------------- BEST PARAMETERS

        if objective == 'f1':
            best_index = np.argmax(f1_scores)
        elif objective == 'youden':
            best_index = np.argmax(youden_index)

        best_params = param_combinations[best_index]
        print "________________________{}/{}_______________________________".format(i+1, nr_iters)
        print "Best parameters (Youden-Index {:.2f}, F1: {:.2f}): {}".format(np.max(youden_index), np.max(f1_scores), best_params)
        print "Precision: {:.2f}     ||     Recall: {:.2f}".format(precision_values[best_index], recall_values[best_index])
        iter_precision.append(precision_values[best_index])
        iter_recall.append(recall_values[best_index])
        iter_f1_scores.append(f1_scores[best_index])
        iter_youden_indices.append(youden_index[best_index])
        iter_params.append(best_params)
        iter_training_time.append(training_time[best_index])
        iter_prediction_time.append(prediction_time[best_index])
        iter_auc_scores.append(auc_scores[best_index])

    # --------------- END RANDOM SERIES

    print "_______________________________________________________\n\n\n"
    print "                    FINAL EVALUATION:\n"
    print "LEARNER: {}".format(clf_name)
    print "MODE: {}".format('Mixed Scene Training' if combine_scenes else 'Single Scene Training')
    print "K-FOLD VALIDATION: {} folds".format(nr_splits)
    print "-------------------------------------------------------"
    if combine_scenes:
        print "Batch size training: {} ({} S1/{} S2)".format(len(train_indices)*2, len(train_indices), len(train_indices))
    else:
        print "Batch size training: {} ".format(len(train_indices))
    print "Batch size test: {} ({} class, {} outliers)".format(len(labels), len(labels[labels==1]), len(labels[labels==-1]))
    print "_______________________________________________________"
    # print "Batch size: training: {}, prediction: {}".format(nr_training_samples, nr_training_samples * (nr_splits - 1) * 2)
    # print "Batch size training: {} (class)".format(len(class_training_samples))


    print "Precision Avg, std: {:.4f} +- {:.4f}".format(np.mean(iter_precision), 2*np.std(iter_precision))
    print "Recall Avg, std: {:.4f} +- {:.4f}".format(np.mean(iter_recall), 2 * np.std(iter_recall))
    print "Precision: ", ["%0.2f" % i for i in iter_precision]
    print "Recall: ", ["%0.2f" % i for i in iter_recall]
    print "F1 score: ", ["%0.2f" % i for i in iter_f1_scores]
    print "AUC: ", ["%0.2f" % i for i in iter_auc_scores]
    print "Parameters: ", iter_params

    if save_csv:
        # keep only best

        if filename == "":
            filename = clf_name+'_auc_eval.csv'

        with open(filename, 'wb') as csvfile:
            # write configuration of best results over multiple random tests
            writer = csv.writer(csvfile, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

            # settings
            if clf_name == 'OneClassSVM':
                writer.writerow(["LEARNER: {} ({})".format(clf_name, iter_params[0]['kernel'])])
            else:
                writer.writerow(["LEARNER: {}".format(clf_name)])
            writer.writerow(["MODE: {}".format('Mixed Scene Training' if combine_scenes else 'Single Scene Training')])
            writer.writerow(["K-FOLD VALIDATION: {} folds".format(nr_splits)])
            writer.writerow(["RANDOM ITERATIONS: {}".format(nr_iters)])
            writer.writerow(["Batch size: training: {}, test: {}".format(nr_training_samples, nr_test_samples)])
            writer.writerow(["Precision Avg, std: {} +- {}".format(np.mean(iter_precision), 2*np.std(iter_precision))])
            writer.writerow(["Recall Avg, std: {} +- {}".format(np.mean(iter_recall), 2 * np.std(iter_recall))])
            writer.writerow(["F1 score, std: {} +- {}".format(np.mean(iter_f1_scores), 2 * np.std(iter_f1_scores))])
            writer.writerow(["Youden index, std: {} +- {}".format(np.mean(iter_youden_indices), 2 * np.std(iter_youden_indices))])
            writer.writerow("")

            if clf_name == 'L2Estimator' or clf_name == 'ABODEstimator' or clf_name == 'ApproxABODEstimator':
                writer.writerow(["Train", "Test", "Folds", "T Median", "T Mean", "T Std", "P", "P std.", "R", "R std", "F1", "F1 std", "Youdens", "Youdens std", "AUC", "AUC std", "Training Time", "Trainig Time std", "Prediction Time", "Prediction Time std"])
                writer.writerow([
                                 nr_training_samples, nr_test_samples, nr_splits, np.median([tmp['T'] for tmp in iter_params]), np.mean([tmp['T'] for tmp in iter_params]), np.std([tmp['T'] for tmp in iter_params]),
                                 np.mean(iter_precision), np.std(iter_precision), np.mean(iter_recall), np.std(iter_recall),
                                 np.mean(iter_f1_scores), np.std(iter_f1_scores),
                                 np.mean(iter_youden_indices), np.std(iter_youden_indices),
                                 np.mean(iter_auc_scores), np.std(iter_auc_scores),
                                 np.mean(iter_training_time), np.std(iter_training_time),
                                 np.mean(iter_prediction_time), np.std(iter_prediction_time)
                                 ])

            writer.writerow("")
            writer.writerow(["Precision, Recall, Training-Time (ms):"])
            writer.writerow(["%0.6f" % i for i in iter_precision])
            writer.writerow(["%0.6f" % i for i in iter_recall])
            writer.writerow(["%0.6f" % i for i in iter_training_time])

            if clf_name == 'L2Estimator' or clf_name == 'ABODEstimator':
                thresholds = ["%0.4f" % tmp['T'] for tmp in iter_params]
                writer.writerow(thresholds)
            else:
                writer.writerow(iter_params)
            writer.writerow("")
Example #43
    def _iter_slow(self, batch_size=128, start=None, end=None,
                   shuffle=True, seed=None, mode=0):
        # ====== Set random seed ====== #
        all_ds = self._data[:]
        prng1 = None
        prng2 = _dummy_shuffle
        if shuffle:
            if seed is None:
                seed = get_random_magic_seed()
            prng1 = RandomState(seed)
            prng2 = RandomState(seed)

        all_size = [i.shape[0] for i in all_ds]
        n_dataset = len(all_ds)

        # ====== Calculate batch_size ====== #
        if mode == 1: # equal
            s = sum(all_size)
            all_batch_size = [int(round(batch_size * i / s)) for i in all_size]
            for i in xrange(len(all_batch_size)):
                if all_batch_size[i] == 0: all_batch_size[i] += 1
            if sum(all_batch_size) > batch_size: # 0.5% -> round up, too much
                for i in xrange(len(all_batch_size)):
                    if all_batch_size[i] > 1:
                        all_batch_size[i] -= 1
                        break
            all_upsample = [None] * len(all_size)
        elif mode == 2 or mode == 3: # upsampling and downsampling
            maxsize = int(max(all_size)) if mode == 2 else int(min(all_size))
            all_batch_size = [int(batch_size / n_dataset) for i in xrange(n_dataset)]
            for i in xrange(batch_size - sum(all_batch_size)): # not enough
                all_batch_size[i] += 1
            all_upsample = [maxsize for i in xrange(n_dataset)]
        else: # sequential
            all_batch_size = [batch_size]
            all_upsample = [None]
            all_size = [sum(all_size)]
        # ====== Create all block and batches ====== #
        # [ ((idx1, batch1), (idx2, batch2), ...), # batch 1
        #   ((idx1, batch1), (idx2, batch2), ...), # batch 2
        #   ... ]
        all_block_batch = []
        # contain [block_batches1, block_batches2, ...]
        tmp_block_batch = []
        for n, batchsize, upsample in zip(all_size, all_batch_size, all_upsample):
            tmp_block_batch.append(
                create_batch(n, batchsize, start, end, prng1, upsample))
        # ====== Distribute block and batches ====== #
        if mode == 1 or mode == 2 or mode == 3:
            for i in zip_longest(*tmp_block_batch):
                all_block_batch.append([(k, v) for k, v in enumerate(i) if v is not None])
        else:
            all_size = [i.shape[0] for i in all_ds]
            all_idx = []
            for i, j in enumerate(all_size):
                all_idx += [(i, k) for k in xrange(j)] # (ds_idx, index)
            all_idx = [all_idx[i[0]:i[1]] for i in tmp_block_batch[0]]
            # group consecutive (ds_idx, index) pairs into (ds_idx, (start, end)) slices, splitting when the dataset changes
            for i in all_idx:
                tmp = []
                idx = i[0][0] # i[0][0]: ds_index
                start = i[0][1] # i[0][1]: index
                end = start
                for j in i[1:]: # detect change in index
                    if idx != j[0]:
                        tmp.append((idx, (start, end + 1)))
                        idx = j[0]
                        start = j[1]
                    end = j[1]
                tmp.append((idx, (start, end + 1)))
                all_block_batch.append(tmp)
        prng2.shuffle(all_block_batch)
        # uncomment to print the block/batch layout for debugging
        # for _ in all_block_batch:
        #     for i, j in _:
        #         print('ds:', i, '  batch:', j)
        #     print('===== End =====')
        # ====== return iteration ====== #
        for _ in all_block_batch: # each _ is a block
            batches = np.concatenate(
                [all_ds[i][j[0]:j[1]] for i, j in _], axis=0)
            batches = batches[prng2.permutation(batches.shape[0])]
            yield self._normalizer(batches)
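The mode == 1 branch above divides a global batch_size across datasets in proportion to their sizes, then patches the rounding so no dataset is allotted zero and the total does not exceed the request. A standalone sketch of just that allocation step (the function name and example sizes are illustrative, and it keeps trimming until the total fits rather than trimming a single entry):

def proportional_batch_sizes(all_size, batch_size):
    # Split batch_size across datasets proportionally to their row counts.
    total = float(sum(all_size))
    sizes = [int(round(batch_size * n / total)) for n in all_size]
    # Every dataset contributes at least one sample per batch.
    sizes = [max(s, 1) for s in sizes]
    # Rounding may overshoot the requested batch_size; trim entries larger than 1.
    i = 0
    while sum(sizes) > batch_size and i < len(sizes):
        if sizes[i] > 1:
            sizes[i] -= 1
        i += 1
    return sizes

# e.g. three datasets of 1000, 300 and 50 rows sharing a batch of 128
print(proportional_batch_sizes([1000, 300, 50], 128))  # -> [95, 28, 5]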
Example #44
0
def run_experiment(arglist):

    # Get the experiment parameters
    p = tools.Params("gape")
    p.set_by_cmdline(arglist)

    # Sequence categories
    cat_list = [[0, 1, 0, 1], [0, 0, 1, 1], [0, 1, 1, 0]]
    cat_names = ["alternated", "paired", "reflected"]

    # Get this run's schedule in a manner that is consistent
    # within and random between subjects
    if p.train:
        letter = letters[p.run - 1]
        p.sched_id = "train_%s" % letter
        sched_file = "sched/schedule_%s.csv" % p.sched_id

    else:
        state = RandomState(abs(hash(p.subject)))
        choices = list(letters[:p.total_schedules])
        p.sched_id = state.permutation(choices)[p.run - 1]
        sched_file = "sched/schedule_%s.csv" % p.sched_id

    # Read in this run's schedule
    s = read_csv(sched_file)

    # Max the screen brightness
    tools.max_brightness(p.monitor_name)

    # Open up the stimulus window
    calib.monitorFolder = "./calib"
    mon = calib.Monitor(p.monitor_name)
    m = tools.WindowInfo(p, mon)
    win = visual.Window(**m.window_kwargs)

    # Set up the stimulus objects
    fix = visual.PatchStim(win, tex=None, mask="circle",
                           color=p.fix_color, size=p.fix_size)
    a_fix = visual.PatchStim(win, tex=None, mask="circle",
                             color=p.fix_antic_color, size=p.fix_size)
    r_fix = visual.PatchStim(win, tex=None, mask="circle",
                             color=p.fix_resp_color, size=p.fix_size)
    d_fix = visual.PatchStim(win, tex=None, mask="circle",
                             color=p.fix_demo_color, size=p.fix_size)
    c_fix = visual.PatchStim(win, tex=None, mask="circle",
                             color=p.fix_catch_color, size=p.fix_size)
    b_fix = visual.PatchStim(win, tex=None, mask="circle",
                             color=p.fix_break_color, size=p.fix_size)
    halo = visual.PatchStim(win, tex=None, mask=p.demo_halo_mask,
                            opacity=p.demo_halo_opacity,
                            color=p.demo_halo_color,
                            size=p.demo_halo_size)
    grate = visual.PatchStim(win, "sin", p.stim_mask, size=p.stim_size,
                             contrast=p.stim_contrast, sf=p.stim_sf,
                             opacity=p.stim_opacity)
    disk = visual.PatchStim(win, tex=None, mask=p.stim_mask,
                            color=win.color, size=p.stim_disk_ratio)
    stims = [grate, disk, fix]

    # Set up some timing variables
    running_time = 0
    antic_secs = p.tr
    demo_secs = 4 * p.demo_stim_dur + 3 * p.demo_stim_isi + p.tr
    seq_secs = p.tr + 4 * p.stim_dur + 3 * p.stim_isi
    catch_secs = p.tr
    rest_secs = p.rest_trs * p.tr

    # Draw the instructions and wait to go
    instruct = dedent("""
    Watch the sample sequence and say if the target sequences match

    Blue dot: sample sequence
    Red dot: get ready
    Orange dot: relax
    Green dot: say if sequence matched the sample
    Button 1: same    Button 2: different

    Grey dot: quick break


    Experimenter: Press space to prep for scan""")  # TODO
    # Draw the instructions and wait to go
    tools.WaitText(win, instruct, height=.7)(check_keys=["space"])

    # Possibly wait for the scanner
    if p.fmri:
        tools.wait_for_trigger(win, p)

    # Start a data file and write the params to it
    f, fname = tools.start_data_file(p.subject, p.experiment_name,
                                     p.run, train=p.train)
    p.to_text_header(f)

    # Save run params to JSON
    save_name = op.join("./data", op.splitext(fname)[0])
    p.to_json(save_name)

    # Write the datafile header
    header = ["trial", "block",
              "cat_id", "cat_name",
              "event_type",
              "event_sched", "event_time",
              "ori_a", "ori_b",
              "oddball", "odd_item", "odd_orient",
              "iti", "response", "rt", "acc"]
    tools.save_data(f, *header)

    # Start a clock and flush the event buffer
    exp_clock = core.Clock()
    trial_clock = core.Clock()
    event.clearEvents()

    # Main experiment loop
    # --------------------
    try:

        # Dummy scans
        fix.draw()
        win.flip()
        dummy_secs = p.dummy_trs * p.tr
        running_time += dummy_secs
        wait_check_quit(dummy_secs, p.quit_keys)

        for t in s.trial:

            cat_seq = cat_list[s.cat_id[t]]
            block_ori_list = np.array([s.ori_a[t], s.ori_b[t]])[cat_seq]

            # Set up some defaults for variables that aren't always set
            oddball_seq = [0, 0, 0, 0]
            odd_item, odd_ori = -1, -1
            acc, response, resp_rt = -1, -1, -1

            # Possibly rest and then bail out of the rest of the loop
            if s.ev_type[t] == "rest":
                if p.train and not p.fmri:
                    b_fix.draw()
                    win.flip()
                    wait_check_quit(2)
                    before = exp_clock.getTime()
                    msg = "Quick break! Press space to continue."
                    tools.WaitText(win, msg, height=.7)(check_keys=["space"])
                    b_fix.draw()
                    win.flip()
                    wait_check_quit(2)
                    after = exp_clock.getTime()
                    rest_time = after - before
                    running_time += rest_time
                    continue
                else:
                    b_fix.draw()
                    win.flip()
                    wait_check_quit(rest_secs)
                    running_time += rest_secs
                    continue
 
            # Otherwise, we always get an anticipation
            if p.antic_fix_dur <= p.tr:  # possibly problematic
                fix.draw()
                win.flip()
                core.wait(p.tr - p.antic_fix_dur)
            if s.ev_type[t] == "demo":
                stim = d_fix
            else:
                stim = a_fix
            end_time = running_time + p.antic_fix_dur
            tools.precise_wait(win, exp_clock, end_time, stim)
            running_time += antic_secs

            # The event is about to happen so stamp that time
            event_sched = running_time
            event_time = exp_clock.getTime()

            # Demo sequence
            if s.ev_type[t] == "demo":

                for i, ori in enumerate(block_ori_list):
                    # Draw each stim
                    grate.setOri(ori)
                    halo.draw()
                    draw_all(*stims)
                    d_fix.draw()
                    win.flip()
                    core.wait(p.demo_stim_dur)

                    # Short isi fix
                    if i < 3:
                        d_fix.draw()
                        win.flip()
                        core.wait(p.demo_stim_isi)
                    check_quit()

                # Demo always has >1 TR fixation
                fix.draw()
                win.flip()
                wait_check_quit(p.tr)

                # Update timing
                running_time += demo_secs

            # Proper test sequence
            if s.ev_type[t] == "seq":

                # If this is an oddball, figure out where
                if s.oddball[t]:
                    oddball_seq = multinomial(1, [.25] * 4).tolist()
                    odd_item = oddball_seq.index(1)

                # Iterate through each element in the sequence
                for i, ori in enumerate(block_ori_list):

                    # Set the grating attributes
                    if oddball_seq[i]:
                        ori_choices = [o for o in p.stim_orients
                                       if not o == ori]
                        odd_ori = ori_choices[randint(3)]
                        grate.setOri(odd_ori)
                    else:
                        grate.setOri(ori)
                    grate.setPhase(uniform())

                    # Draw the grating set
                    draw_all(*stims)
                    win.flip()
                    core.wait(p.stim_dur)

                    # ISI Fix (on all but last stim)
                    if i < 3:
                        fix.draw()
                        win.flip()
                        core.wait(p.stim_isi)
                    check_quit()

                # Response fixation
                r_fix.draw()
                trial_clock.reset()
                event.clearEvents()
                win.flip()
                acc, response, resp_rt = wait_get_response(p,
                                                           trial_clock,
                                                           s.oddball[t],
                                                           p.resp_dur)

                # Update timing
                running_time += seq_secs

            # Catch trial
            if s.ev_type[t] == "catch":
                c_fix.draw()
                win.flip()
                wait_check_quit(p.tr)
                running_time += catch_secs

            # Save data to the datafile
            data = [t, s.block[t],
                    s.cat_id[t], cat_names[s.cat_id[t]],
                    s.ev_type[t],
                    event_sched, event_time,
                    s.ori_a[t], s.ori_b[t],
                    s.oddball[t],
                    odd_item, odd_ori, s.iti[t],
                    response, resp_rt, acc]
            tools.save_data(f, *data)

            # ITI interval
            # Go by screen refreshes for precise timing
            this_iti = s.iti[t] * p.tr
            end_time = running_time + this_iti
            tools.precise_wait(win, exp_clock, end_time, fix)
            running_time += this_iti

            
    finally:
        # Clean up
        f.close()
        win.close()

    # Good execution, print out some info
    try:
        data_file = op.join("data", fname)
        with open(data_file, "r") as fid:
            lines = fid.readlines()
            n_comments = len([l for l in lines if l.startswith("#")])
        df = read_csv(data_file, skiprows=n_comments, na_values=["-1"])

        info = dict()
        time_error = df.event_sched - df.event_time
        info["run"] = p.run
        info["acc"] = df.acc.mean()
        info["mean_rt"] =  df.rt.mean()
        info["missed_resp"] = (df.response == 0).sum()
        info["time_error_mean"] = abs(time_error).mean()
        info["time_error_max"] = max(time_error)

        print dedent("""Performance summary for run %(run)d:

        Accuracy: %(acc).3f
        Mean RT: %(mean_rt).3f
        Missed responses: %(missed_resp)d

        Mean timing error: %(time_error_mean).4f
        Max timing error: %(time_error_max).4f
        """ % info)

    except Exception as err:
        print "Could not read data file for summary"
        print err
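Example #44 picks each run's schedule by seeding a RandomState with a hash of the subject ID, so the order of schedules is reproducible for a given subject but differs between subjects. A minimal sketch of that seeding pattern in isolation (the subject names, the helper name and the schedule count are made up; the modulo keeps the seed inside RandomState's 32-bit range):

from string import ascii_lowercase
from numpy.random import RandomState

def schedule_for(subject, run, total_schedules=6):
    # Same subject -> same seed -> same permutation on every call.
    # (In Python 3, fix PYTHONHASHSEED so hash() of a string is stable across runs.)
    state = RandomState(abs(hash(subject)) % (2 ** 32))
    choices = list(ascii_lowercase[:total_schedules])
    return state.permutation(choices)[run - 1]

print(schedule_for("subj01", 1))  # deterministic for "subj01" within one session
print(schedule_for("subj02", 1))  # generally a different schedule for another subject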
Example #45
0
    def train(train_set_x,
              train_set_y,
              hyper_parameters,
              symmetric_double_encoder,
              params,
              regularization_methods,
              print_verbose=False,
              top=0,
              validation_set_x=None,
              validation_set_y=None,
              moving_averages=None,
              decay=False,
              reduce_val=0,
              autoencoder_x=False,
              autoencoder_y=False):

        OutputLog().write('Using Decay = {0}'.format(decay))

        # Calculating number of batches
        n_training_batches = int(train_set_x.shape[0] / hyper_parameters.batch_size)
        random_stream = RandomState()

        early_stop_count = 0

        model_updates = [shared(p.get_value() * 0) for p in params]
        model_deltas = [shared(p.get_value() * 0) for p in params]

        eps = 1e-8

        symmetric_double_encoder.set_eval(False)

        last_metric = 0

        correlations = []

        tester = TraceCorrelationTester(validation_set_x, validation_set_y, top, reduce_val)

        learning_rate = hyper_parameters.learning_rate

        # The training phase, for each epoch we train on every batch
        best_loss = 0
        for epoch in numpy.arange(hyper_parameters.epochs):

            OutputLog().write('----------Starting Epoch ({0})-----------'.format(epoch), 'debug')

            print 'Building model'
            model = Trainer._build_model(hyper_parameters,
                                         learning_rate,
                                         symmetric_double_encoder,
                                         params,
                                         regularization_methods,
                                         model_updates,
                                         model_deltas,
                                         moving_averages,
                                         n_training_batches,
                                         hyper_parameters.training_strategy,
                                         0.9,
                                         0.999,
                                         hyper_parameters.rho,
                                         eps,
                                         'L2',
                                         len(symmetric_double_encoder) - 1,
                                         autoencoder_x,
                                         autoencoder_y)

            OutputLog().write('Shuffling dataset', 'debug')
            indices_positive = random_stream.permutation(train_set_x.shape[0])

            loss_forward = 0
            loss_backward = 0

            OutputLog().write('Training {0} batches'.format(n_training_batches), 'debug')
            for index in xrange(n_training_batches):

                start_tick = cv2.getTickCount()

                # need to convert the input into tensor variable
                symmetric_double_encoder.var_x.set_value(
                    train_set_x[indices_positive[index * hyper_parameters.batch_size:
                    (index + 1) * hyper_parameters.batch_size], :], borrow=True)

                symmetric_double_encoder.var_y.set_value(
                    train_set_y[indices_positive[index * hyper_parameters.batch_size:
                    (index + 1) * hyper_parameters.batch_size], :], borrow=True)

                output = model(index + 1)
                loss_backward += output[0]
                loss_forward += output[1]

                if math.isnan(loss_backward) or math.isnan(loss_forward):
                    OutputLog().write('loss equals NAN, exiting')
                    sys.exit(-1)

                tickFrequency = cv2.getTickFrequency()
                current_time = cv2.getTickCount()

                regularizations = [regularization_method for regularization_method in regularization_methods
                                   if regularization_method.weight != 0]

                string_output = ''

                if len(regularizations) > 0:
                    zipped = zip(output[8:8 + len(regularizations)], regularizations)

                    string_output = ' '
                    for regularization_output, regularization_method in zipped:
                        string_output += '{0}: {1} '.format(regularization_method.regularization_type,
                                                            regularization_output)

                OutputLog().write(
                    'batch {0}/{1} ended, time: {2:.3f}, loss_x: {3}, loss_y: {4}, loss_h: '
                    '{7:.2f} var_x: {5} var_y: {6} mean_g: {9} var_g: {10} {8}'.
                        format(index,
                               n_training_batches,
                               ((current_time - start_tick) / tickFrequency),
                               output[0],
                               output[1],
                               output[2],
                               output[3],
                               calculate_reconstruction_error(output[4], output[5]),
                               string_output,
                               numpy.mean(output[6]),
                               numpy.mean(output[7])), 'debug')

            OutputLog().write('Average loss_x: {0} loss_y: {1}'.format(loss_backward / (n_training_batches * 2),
                                                                       loss_forward / (n_training_batches * 2)))

            if print_verbose and validation_set_y is not None and validation_set_x is not None and epoch % hyper_parameters.validation_epoch == 0:

                OutputLog().write('----------epoch (%d)----------' % epoch, 'debug')

                symmetric_double_encoder.set_eval(True)

                correlations, best_correlation, var, x, y, layer_id = tester.test(
                    DoubleEncoderTransformer(symmetric_double_encoder, 0),
                    hyper_parameters)

                symmetric_double_encoder.set_eval(False)

                if math.isnan(var):
                    sys.exit(0)

                current_metric = tester._metrics[hyper_parameters.early_stopping_layer][
                    hyper_parameters.early_stopping_metric][-1]
                if last_metric > current_metric:
                    early_stop_count += 1

                if hyper_parameters.decay_factor > 0:
                    if not hyper_parameters.decay:
                        if last_metric - current_metric > 0.1:
                            OutputLog().write('Decaying learning rate')
                            learning_rate *= hyper_parameters.decay_factor
                    else:
                        if epoch in hyper_parameters.decay:
                            OutputLog().write('Decaying learning rate')
                            learning_rate *= hyper_parameters.decay_factor
                            symmetric_double_encoder.export_encoder(OutputLog().output_path, 'epoch_{0}'.format(epoch))

                last_metric = current_metric

            if early_stop_count == 1 and hyper_parameters.early_stopping:
                tester.saveResults(OutputLog().output_path)
                return

            OutputLog().write('epoch (%d) ,Loss X = %f, Loss Y = %f, learning_rate = %f\n' % (epoch,
                                                                                              loss_backward / n_training_batches,
                                                                                              loss_forward / n_training_batches,
                                                                                              learning_rate), 'debug')

        tester.saveResults(OutputLog().output_path)

        del model
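The inner loop of Example #45 reshuffles the row indices once per epoch with RandomState.permutation and then slices consecutive index ranges to form mini-batches. A stripped-down sketch of that shuffle-and-slice pattern (the data, batch size and seed are placeholders):

import numpy as np
from numpy.random import RandomState

train_set_x = np.arange(20).reshape(10, 2)   # placeholder data: 10 rows, 2 columns
batch_size = 4
n_batches = int(train_set_x.shape[0] / batch_size)  # ragged tail is dropped, as above

rng = RandomState(0)
for epoch in range(2):
    indices = rng.permutation(train_set_x.shape[0])  # new row order each epoch
    for b in range(n_batches):
        batch = train_set_x[indices[b * batch_size:(b + 1) * batch_size], :]
        # ... feed `batch` to the model here ...
        print(epoch, b, batch.shape)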
Example #46
0
def balanced_train_test_split(X, y, test_size=None, train_size=None, bootstrap=False,
                              random_state=None):
    """ Split the data into a balanced training set and test set of some given size.

        For a dataset with an unequal number of samples in each class, one useful procedure
        is to split the data into a training and a test set in such a way that the classes
        are balanced.
        
        Parameters
        ----------
        X : array, shape = [n_samples, n_features]
            Feature matrix.
        
        y : array, shape = [n_samples]
            Target vector.

        test_size : float or int (default=0.3)
            If float, should be between 0.0 and 1.0 and represent the proportion of the dataset
            to include in the test split. If int, represents the absolute number of test samples.
            If None, the value is automatically set to the complement of the train size.
            If train size is also None, test size is set to 0.3.
        
        train_size : float or int (default=1-test_size)
            If float, should be between 0.0 and 1.0 and represent the proportion of the dataset
            to include in the train split. If int, represents the absolute number of train samples.
            If None, the value is automatically set to the complement of the test size.

        bootstrap : bool, optional (default=False)
            If True, sample each class with replacement, so a balanced split can be
            drawn even when the smallest class has fewer samples than requested.

        random_state : int, optional (default=None)
            Pseudo-random number generator state used for random sampling.
        
        Returns
        -------
        X_train : array
            The feature vectors (stored as rows) in the training set.
            
        X_test : array
            The feature vectors (stored as rows) in the test set.
            
        y_train : array
            The target vector in the training set.
            
        y_test : array
            The target vector in the test set.
    """
    
    # initialise the random number generator
    rng = RandomState(random_state)

    # make sure X and y are numpy arrays
    X = np.asarray(X)
    y = np.asarray(y)
    
    # get information about the class distribution
    classes, y_indices = np.unique(y, return_inverse=True)
    n_classes = len(classes)
    cls_count = np.bincount(y_indices)

    # get the training and test size
    train_size, test_size = _get_train_test_size(train_size, test_size, len(y))

    # number of samples in each class that is included in the training and test set
    n_train = np.round(train_size / n_classes).astype(int)
    n_test = np.round(test_size / n_classes).astype(int)
    n_total = n_train + n_test
    
    # make sure we have enough samples to create a balanced split
    min_count = min(cls_count)
    if min_count < (n_train + n_test) and not bootstrap:
        raise ValueError('The smallest class contains {} examples, which is not '
                         'enough to create a balanced split. Choose a smaller size '
                         'or enable bootstrapping.'.format(min_count))
    
    # selected indices are stored here
    train = []
    test = []
    
    # get the desired sample from each class
    for i, cls in enumerate(classes):
        if bootstrap:
            shuffled = rng.choice(cls_count[i], n_total, replace=True)
        else:
            shuffled = rng.permutation(cls_count[i])
        
        cls_i = np.where(y == cls)[0][shuffled]
        train.extend(cls_i[:n_train])
        test.extend(cls_i[n_train:n_total])
        
    train = list(rng.permutation(train))
    test = list(rng.permutation(test))
    
    return X[train], X[test], y[train], y[test]
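The heart of the split is straightforward: permute the indices of each class independently, take the first n_train per class for training and the next n_test for testing, then permute the pooled selections so the classes end up interleaved. A self-contained sketch of that per-class step on toy data (the sizes and seed are arbitrary):

import numpy as np
from numpy.random import RandomState

rng = RandomState(42)

# Toy imbalanced data: 8 samples of class 0, 4 samples of class 1.
y = np.array([0] * 8 + [1] * 4)
X = np.arange(len(y) * 2).reshape(len(y), 2)

n_train, n_test = 3, 1                      # per-class counts
train, test = [], []
for cls in np.unique(y):
    cls_idx = np.where(y == cls)[0]
    picked = cls_idx[rng.permutation(len(cls_idx))]   # shuffle within the class
    train.extend(picked[:n_train])
    test.extend(picked[n_train:n_train + n_test])

train = list(rng.permutation(train))        # interleave the classes
test = list(rng.permutation(test))

X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
print(np.bincount(y_train), np.bincount(y_test))      # -> [3 3] [1 1]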
Example #47
0
def _subsample_nonzero(counts, ns, replace=False, seed=0):
    """Randomly subsample from a vector of counts and returns the number of
    nonzero values for each number of element to subsample specified.

    Parameters
    ----------
    counts : 1-D array_like of integers
        Vector of counts.
    ns : 1-D array_like of integers
        List of subsample sizes (numbers of elements to draw).
    replace : bool, optional
        Subsample with or without replacement.
    seed : int, optional
        Random seed.

    Returns
    -------
    nonzero : 1-D ndarray
        Number of nonzero values for each value of ns.

    Raises
    ------
    ValueError, TypeError
    """

    counts = np.asarray(counts)
    ns = np.asarray(ns)

    if counts.ndim != 1:
        raise ValueError("'counts' must be an 1-D array_like object")

    if (ns < 0).sum() > 0:
        raise ValueError("values in 'ns' must be > 0 ")

    counts = counts.astype(int, casting='safe')
    ns = ns.astype(int, casting='safe')

    counts_sum = counts.sum()

    prng = RandomState(seed)
    nonzero = []

    if replace:
        p = counts / float(counts_sum)  # true division even if counts is an integer array under Python 2
        for n in ns:
            if n > counts_sum:
                nonzero.append(np.nan)
            else:
                subcounts = prng.multinomial(n, p)
                nonzero.append(np.count_nonzero(subcounts))
    else:
        nz = np.flatnonzero(counts)
        expanded = np.concatenate([np.repeat(i, counts[i]) for i in nz])
        permuted = prng.permutation(expanded)
        for n in ns:
            if n > counts_sum:
                nonzero.append(np.nan)
            else:
                subcounts = np.bincount(permuted[:n], minlength=counts.size)
                nonzero.append(np.count_nonzero(subcounts))

    return np.array(nonzero)
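A quick usage sketch, assuming the function above is in scope and called directly for illustration (the counts and subsampling depths are made up): for a vector with 10 observations in total, depths larger than the total come back as nan, while smaller depths report how many features remain observed.

import numpy as np

counts = np.array([5, 3, 2, 0])   # 4 features, 10 observations in total
ns = np.array([2, 5, 10, 20])     # subsampling depths to test

# Without replacement: the counts are expanded, permuted once, and prefixes are taken.
print(_subsample_nonzero(counts, ns, replace=False, seed=0))
# The last entry is nan because 20 > counts.sum(); the others count nonzero features.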