Example #1
File: utils.py  Project: gfmei/python
    def create_batches(self, train_file, batch_size, sequence_length):

        self.x_data = []
        self.y_data = []
        padding_index = self.vocab_size - 1
        for line in open(train_file):
            line = line.decode('utf-8').replace('\n', '')
            text, label = line.strip().split('\t')
            tokens = fool.cut(re.sub(r'\w+', ' L', text))
            seq_ids = [self.token_dictionary.get(token) for token in tokens[0] if token not in self.stop_words and
                       self.token_dictionary.get(token) is not None and not chinese.is_other_all(token)]
            seq_ids = seq_ids[:sequence_length]
            for _ in range(len(seq_ids), sequence_length):
                seq_ids.append(padding_index)

            self.x_data.append(seq_ids)
            self.y_data.append(self.label_dictionary.get(label))

        self.num_batches = int(len(self.x_data) / batch_size)
        self.x_data = self.x_data[:self.num_batches * batch_size]
        self.y_data = self.y_data[:self.num_batches * batch_size]

        self.x_data = np.array(self.x_data, dtype=int)
        self.y_data = np.array(self.y_data, dtype=int)
        self.x_batches = np.split(self.x_data.reshape(batch_size, -1), self.num_batches, 1)
        self.y_batches = np.split(self.y_data.reshape(batch_size, -1), self.num_batches, 1)
        self.pointer = 0
    def gridify_image(image, grid):
        '''
        Extracts a grid of sub-images of a 01c image, as a b01c batch.
        '''
        assert_equal(image.ndim, 3)  # a single image in 01c format.

        grid = numpy.asarray(grid, dtype=int)
        assert_true(numpy.all(grid > 0))

        # Trim off excess rows and columns
        image_shape = numpy.asarray(image.shape[:2])
        trimmed_shape = image_shape - image_shape % grid
        image = image[:trimmed_shape[0], :trimmed_shape[1], :]

        # b01c
        image = image[numpy.newaxis, ...]

        grid_cells = []

        grid_rows = numpy.split(image, grid[0], axis=1)
        for grid_row in grid_rows:
            row_cells = numpy.split(grid_row, grid[1], axis=2)
            grid_cells.extend(row_cells)

        # concatenate grid_cells along batch axis
        return numpy.concatenate(grid_cells, axis=0)
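A minimal standalone sketch of the same split-and-concatenate pattern (the shapes below are arbitrary, chosen only so the result is easy to check):

import numpy as np

# a 6x8 single-channel "image" in 01c layout, split into a 2x4 grid of cells
image = np.arange(6 * 8).reshape(6, 8, 1)
batch = image[np.newaxis, ...]                     # b01c
cells = []
for row_band in np.split(batch, 2, axis=1):        # 2 bands of rows
    cells.extend(np.split(row_band, 4, axis=2))    # 4 cells per band
print(np.concatenate(cells, axis=0).shape)         # (8, 3, 2, 1)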
Example #3
File: match.py  Project: sproutman/wavcmp
 def segments(self):
     if self.a and self.b:
         assert self.a.data_high == self.b.data_high
         assert self.a.rate == self.b.rate
         a = self.a.data_wider()
         b = self.b.data_wider()
         offset = self.offset
         with warnings.catch_warnings():
             # Changing handling of empty arrays not relevant to us
             warnings.simplefilter("ignore", FutureWarning)
             acs = np.split(a, (max(0, offset), len(b)+offset))
             bcs = np.split(b, (max(0, -offset), len(a)-offset))
         for i, (ac, bc) in enumerate(zip(acs, bcs)):
             if i == 1:
                 assert len(ac) == len(bc)
                 yield Segment(
                     ac, bc, self.a.data_high, self.a.rate,
                     min(a.size, b.size)) # matches total in cmp_track
             elif len(ac):
                 assert not len(bc)
                 # careful with np.zeros type
                 yield Segment(ac, ac*0, self.a.data_high, self.a.rate,
                               ac.size, padding="-")
             elif len(bc):
                 yield Segment(bc*0, bc, self.a.data_high, self.a.rate,
                               bc.size, padding="+")
     elif self.a:
         ac = self.a.data_wider()
         yield Segment(ac, ac*0, self.a.data_high, self.a.rate,
                       ac.size, padding="<")
     elif self.b:
         bc = self.b.data_wider()
         yield Segment(bc*0, bc, self.b.data_high, self.b.rate,
                       bc.size, padding=">")
Example #4
File: grid.py  Project: jschwab/zeustools
    def _read_tile(self, filename):

        with open(filename, "r") as tilefile:
            # this is reversed from the Fortran because 'in' is a reserved word in Python
            self.ni, self.nj, self.nk = np.fromfile(tilefile, dtype="int32", 
                                                    count = 3, sep = " ")

            raw_data= np.genfromtxt(tilefile, 
                                    dtype = ("int32", "float64", "float64", "float64", "float64"),
                                    names = ("idx", "a", "b", "vla", "vlb"))

            self.ii, self.ij, self.ik = np.split(raw_data["idx"],
                                                 [self.ni,
                                                  self.ni+self.nj])

            self.x1a, self.x2a, self.x3a = np.split(raw_data["a"],
                                                    [self.ni,
                                                     self.ni+self.nj])

            self.x1b, self.x2b, self.x3b = np.split(raw_data["b"],
                                                    [self.ni,
                                                     self.ni+self.nj])

            self.vl1a, self.vl2a, self.vl3a = np.split(raw_data["vla"],
                                                    [self.ni,
                                                     self.ni+self.nj])

            self.vl1b, self.vl2b, self.vl3b = np.split(raw_data["vlb"],
                                                    [self.ni,
                                                     self.ni+self.nj])


            return
Example #5
def drop_samples(game, prob):
    """Drop samples from a sample game

    Samples are dropped independently with probability prob."""
    sample_map = {}
    for prof, pays in zip(np.split(game.profiles, game.sample_starts[1:]),
                          game.sample_payoffs):
        num_profiles, _, num_samples = pays.shape
        perm = rand.permutation(num_profiles)
        prof = prof[perm]
        pays = pays[perm]
        new_samples, counts = np.unique(
            rand.binomial(num_samples, prob, num_profiles), return_counts=True)
        splits = counts[:-1].cumsum()
        for num, prof_samp, pay_samp in zip(
                new_samples, np.split(prof, splits), np.split(pays, splits)):
            if num == 0:
                continue
            prof, pays = sample_map.setdefault(num, ([], []))
            prof.append(prof_samp)
            pays.append(pay_samp[..., :num])

    if sample_map:
        profiles = np.concatenate(list(itertools.chain.from_iterable(
            x[0] for x in sample_map.values())), 0)
        sample_payoffs = tuple(np.concatenate(x[1]) for x
                               in sample_map.values())
    else:  # No data
        profiles = np.empty((0, game.num_role_strats), dtype=int)
        sample_payoffs = []

    return rsgame.samplegame_copy(game, profiles, sample_payoffs, False)
def split_data(chars, batch_size, num_steps, split_frac=0.9):
    """
    Split character data into training and validation sets, inputs and targets for each set.
    Arguments
    ---------
    chars: character array
    batch_size: Number of examples in each batch
    num_steps: Number of sequence steps to keep in the input and pass to the network
    split_frac: Fraction of batches to keep in the training set
    Returns train_x, train_y, val_x, val_y
    """

    slice_size = batch_size * num_steps
    n_batches = int(len(chars) / slice_size)

    # Drop the last few characters to make only full batches
    x = chars[: n_batches * slice_size]
    y = chars[1: n_batches * slice_size + 1]

    # Split the data into batch_size slices, then stack them into a 2D matrix
    x = np.stack(np.split(x, batch_size))
    y = np.stack(np.split(y, batch_size))

    # Now x and y are arrays with dimensions batch_size x n_batches*num_steps

    # Split into training and validation sets, keep the first split_frac batches for training
    split_idx = int(n_batches * split_frac)
    train_x, train_y = x[:, :split_idx * num_steps], y[:, :split_idx * num_steps]
    val_x, val_y = x[:, split_idx * num_steps:], y[:, split_idx * num_steps:]

    return train_x, train_y, val_x, val_y
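A quick sanity check, assuming the split_data above is in scope (the toy character stream below is made up):

import numpy as np

chars = np.arange(130)                                  # toy "character" stream
train_x, train_y, val_x, val_y = split_data(chars, batch_size=4, num_steps=5)
print(train_x.shape, val_x.shape)                       # (4, 25) (4, 5)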
Example #7
def test_stratified_batches():
    data = np.array([('a', -1), ('b', 0), ('c', 1), ('d', -1), ('e', -1)],
                    dtype=[('x', np.str_, 8), ('y', np.int32)])

    assert list(data['x']) == ['a', 'b', 'c', 'd', 'e']
    assert list(data['y']) == [-1, 0, 1, -1, -1]

    batch_generator = training_batches(data, batch_size=3, n_labeled_per_batch=1)

    first_ten_batches = list(islice(batch_generator, 10))

    labeled_batch_portions = [batch[:1] for batch in first_ten_batches]
    unlabeled_batch_portions = [batch[1:] for batch in first_ten_batches]

    labeled_epochs = np.split(np.concatenate(labeled_batch_portions), 5)
    unlabeled_epochs = np.split(np.concatenate(unlabeled_batch_portions), 4)

    assert ([sorted(items['x'].tolist()) for items in labeled_epochs] ==
            [['b', 'c']] * 5)
    assert ([sorted(items['y'].tolist()) for items in labeled_epochs] ==
            [[0, 1]] * 5)
    assert ([sorted(items['x'].tolist()) for items in unlabeled_epochs] ==
            [['a', 'b', 'c', 'd', 'e']] * 4)
    assert ([sorted(items['y'].tolist()) for items in unlabeled_epochs] ==
            [[-1, -1, -1, -1, -1]] * 4)
Example #8
    def _feed_dict(self, train_batch, is_training=True):

        pred_polys = train_batch['raw_polys'] * np.expand_dims(train_batch['masks'], axis=2)  # (seq,batch,2)
        pred_polys = np.transpose(pred_polys, [1, 0, 2])  # (batch,seq,2)

        pred_mask = np.transpose(train_batch['masks'], [1, 0])  # (batch_size,seq_len)
        cnn_feats = train_batch['cnn_feats']  # (batch_size, 28, 28, 128)

        cells_1 = np.stack([np.split(train_batch['hiddens_list'][-1][0], 2, axis=3)[0]], axis=1)

        cells_2 = np.stack([np.split(train_batch['hiddens_list'][-1][1], 2, axis=3)[0]], axis=1)

        pred_mask_imgs = self.draw_mask(28, 28, pred_polys, pred_mask)

        if is_training:
            raise NotImplementedError()

        r = {
            self._ph.cells_1: cells_1,
            self._ph.cells_2: cells_2,
            self._ph.pred_mask_imgs: pred_mask_imgs,
            self._ph.cnn_feats: cnn_feats,
            self._ph.predicted_mask: pred_mask,
            self._ph.pred_polys: pred_polys,
            self._ph.ious: self._zero_batch
        }

        return r
Example #9
def update_h(sigma2, phi, y, mu, psi):
    """Updates the hidden variables using updated parameters.

    This is an implementation of the equation:
    .. math::
        \\hat{h} = (\\sigma^2 I + \\sum_{n=1}^N \\Phi_n^T A^T A \\Phi_n)^{-1} \\sum_{n=1}^N \\Phi_n^T A^T (y_n - A \\mu_n - b)

    """
    N = y.shape[0]
    K = phi.shape[1]

    A = psi.params[:2, :2]
    b = psi.translation

    partial_0 = 0
    for phi_n in np.split(phi, N, axis=0):
        partial_0 += phi_n.T @ A.T @ A @ phi_n

    partial_1 = sigma2 * np.eye(K) + partial_0

    partial_2 = np.zeros((K, 1))
    for phi_n, y_n, mu_n in zip(np.split(phi, N, axis=0), y, mu.reshape(-1, 2)):
        partial_2 += phi_n.T @ A.T @ (y_n - A @ mu_n - b).reshape(2, -1)

    return np.linalg.inv(partial_1) @ partial_2
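An illustrative call with random inputs; SimpleNamespace is only a stand-in for whatever transform object the original code passes as psi:

import numpy as np
from types import SimpleNamespace

N, K = 3, 4
phi = np.random.rand(2 * N, K)                 # N blocks of shape (2, K)
y = np.random.rand(N, 2)
mu = np.random.rand(N, 2)
psi = SimpleNamespace(params=np.eye(2), translation=np.zeros(2))  # stub transform
print(update_h(0.1, phi, y, mu, psi).shape)    # (K, 1)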
Example #10
def read_dataset(train_size, scale=False, normalize=False):
    logging.info('fetching the dataset')
    #
    d = sklearn.datasets.load_diabetes() # diabetes
    #d = sklearn.datasets.load_boston() # Boston housing prices
    #
    data = d['data'].astype(np.float32)
    target = d['target'].astype(np.float32).reshape(len(d['target']), 1)
    #"Chainerのmnist.pyだと下記ののような書き方になっているが、ミニバッチの数が2以上だと動かない"らしい 
    #target = diabetes['target'].astype(np.float32) 
    # properly, standardization/normalization should be fit on the training data and those parameters applied to the test data
    if normalize and scale:
        raise Exception('both normalize and scale can not be True')
    if normalize:
        data = preprocessing.normalize(data)
        target = preprocessing.normalize(target)
    if scale:
        data = preprocessing.scale(data)
        target = preprocessing.scale(target)
    # split into train and test
    x_train, x_test = np.split(data, [train_size])
    y_train, y_test = np.split(target, [train_size])
    assert len(x_train)==len(y_train)
    assert len(x_test)==len(y_test)
    return  ((x_train, y_train), (x_test, y_test), 
        {"SHAPE_TRAIN_X":x_train.shape,
          "SHAPE_TRAIN_Y":y_train.shape,
          "SHAPE_TEST_X":x_test.shape,
          "SHAPE_TEST_Y":y_test.shape,
          })
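A call might look like the following, assuming numpy, scikit-learn and the module imports above are available (the diabetes set has 442 samples and 10 features):

(train, test, shapes) = read_dataset(train_size=300, scale=True)
(x_train, y_train), (x_test, y_test) = train, test
print(shapes["SHAPE_TRAIN_X"], shapes["SHAPE_TEST_X"])   # (300, 10) (142, 10)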
Example #11
def split_dataset(dataset, N=4000):
    perm = np.random.permutation(len(dataset['target']))
    dataset['data'] = dataset['data'][perm]
    dataset['target'] = dataset['target'][perm]
    x_train, x_test = np.split(dataset['data'],   [N])
    y_train, y_test = np.split(dataset['target'], [N])
    return x_train, y_train, x_test, y_test
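A tiny sanity check with a dict-style dataset (the sizes are illustrative):

import numpy as np

dataset = {'data': np.arange(20).reshape(10, 2), 'target': np.arange(10)}
x_train, y_train, x_test, y_test = split_dataset(dataset, N=7)
print(x_train.shape, x_test.shape)    # (7, 2) (3, 2)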
Example #12
def spiralroll(B, orient=1):
    ''' undo spiral flatten '''
    k = int(np.sqrt(B.size))
    if k**2-B.size != 0:
        print('ERR: unable to form a square 2D array!')
    else:
        C = np.copy(B)
        C = C[::-1]
        if k%2:
            A, C = np.split(C, [1])
            A = A.reshape(1,1)
            start = 2
        else:
            A, C = np.split(C, [4])
            A = A[::-1].reshape(2,2)
            A[-1] = A[-1, ::-1]
            start = 3
        for ix in range(start, k, 2):
            A = np.pad(A, ((1, 1), (1, 1)), mode='constant')
            C1, C2, C3, C4, C = np.split(C, [ix, ix*2, ix*3, ix*4])
            A[1:, 0] = C1
            A[-1, 1:] = C2
            A[-2::-1, -1] = C3
            A[0, -2::-1] = C4
        if orient == 0:
            A = A.T
        return A
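A minimal call, assuming the input really came from a spiral flatten of a square array (the values below are just placeholders):

import numpy as np

B = np.arange(9)        # pretend this is a spiral-flattened 3x3 array
A = spiralroll(B)
print(A.shape)          # (3, 3)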
Example #13
    def get_train_data(self, label_types):
        labeled_images = self.get_labeled_images()
        x_train_all = np.asarray(map(
            lambda labeled_image_file: labeled_image_file.get_image(),
            labeled_images
        ))
        y_train_all = np.asarray(map(
            lambda labeled_image_file: label_to_output(labeled_image_file.get_label(), label_types),
            labeled_images
        ))
        length = len(labeled_images)

        # shuffle the original data randomly
        indexes = np.random.permutation(length)
        x_train_all_rand = x_train_all[indexes]
        y_train_all_rand = y_train_all[indexes]

        # subtract the mean image
        mean = self.get_mean_image()
        if mean is not None:
            x_train_all_rand -= mean
        # normalize (apply to the shuffled array that is actually split below)
        x_train_all_rand /= 255

        # use 1/5 of the data for testing
        data_size = length * 4 // 5
        x_train, x_test = np.split(x_train_all_rand, [data_size])
        y_train, y_test = np.split(y_train_all_rand, [data_size])

        return x_train, x_test, y_train, y_test
Example #14
    def make_batch(self):
        # make datasets
        x_dataset, y_dataset = ps.make_sente_datasets(1,100)
        #print(x_dataset[110])
        #print(y_dataset[110])
        x_dataset = np.asarray(x_dataset)
        y_dataset = np.asarray(y_dataset)

        nb_data = x_dataset.shape[0]

        x_train, x_test = np.split(x_dataset, [int(nb_data * 0.9)])
        y_train, y_test = np.split(y_dataset, [int(nb_data * 0.9)])

        #x_train = x_train.reshape(x_train.shape[0], 1, 15, 9)
        #x_test = x_test.reshape(x_test.shape[0], 1, 15, 9)
        x_train = x_train.reshape(x_train.shape[0], 1, 11, 9)
        x_test = x_test.reshape(x_test.shape[0], 1, 11, 9)

        y_train = np_utils.to_categorical(y_train, nb_classes)
        y_test = np_utils.to_categorical(y_test, nb_classes)
        print("x_train shape:", x_train.shape)
        print(x_train.shape[0], "train samples")
        print(x_test.shape[0], "test samples")

        return x_train, y_train, x_test, y_test
def split_data(X, Y, degree):

    Testing_error = []   # all the testing errors of the 10-fold cross validation
    Training_error = []  # all the training errors of the 10-fold cross validation
    X_sets = np.split(X, 10)
    Y_sets = np.split(Y, 10)

    for i in range(len(X_sets)):
        X_test = np.vstack(X_sets[i])
        Y_test = np.vstack(Y_sets[i])
        if i < len(X_sets) - 1:
            X_train = np.vstack(X_sets[i+1:])
            Y_train = np.vstack(Y_sets[i+1:])
        elif i == len(X_sets) - 1:
            X_train = np.vstack(X_sets[:i])
            Y_train = np.vstack(Y_sets[:i])
        while i > 0:
            tempX = np.vstack(X_sets[i-1])
            X_train = np.append(tempX, X_train)
            tempY = np.vstack(Y_sets[i-1])
            Y_train = np.append(tempY, Y_train)
            i = i - 1
        X_train = np.vstack(X_train)
        Y_train = np.vstack(Y_train)
        Z_train, theta, Z_test = polynomial_withCV(X_train, Y_train, degree, X_test)
        Testing_error.append(mse(Z_test, theta, Y_test))
        Training_error.append(mse(Z_train, theta, Y_train))
    return sum(Testing_error), sum(Training_error)
Example #16
 def reconstruct2(self, nl, l1, l2):
     """
     To reconstruct Python List of lists / numpy arrays. Inverse operation of FLATTEN() above.
     Usage: L_reconstructed = reconstruct2(L_flat,l1,l2)
     Source: http://stackoverflow.com/questions/27982432/flattening-and-unflattening-a-nested-list-of-numpy-arrays
     """
     return np.split(np.split(nl,np.cumsum(l1)),np.cumsum(l2))[:-1]
Example #17
File: yapgen.py  Project: rmari/LF_DEM
def conf2yap(conf_fname, yap_filename):
    print("Yap file : ", yap_filename)
    positions, radii, meta = clff.read_conf_file(conf_fname)
    positions[:, 0] -= float(meta['lx'])/2
    positions[:, 1] -= float(meta['ly'])/2
    positions[:, 2] -= float(meta['lz'])/2

    if 'np_fixed' in meta:
        # for conf with fixed particles
        split_line = len(positions) - int(meta['np_fixed'])
        pos_mobile, pos_fixed = np.split(positions, [split_line])
        rad_mobile, rad_fixed = np.split(radii, [split_line])
        yap_out = pyp.layer_switch(3)
        yap_out = pyp.add_color_switch(yap_out, 3)
        yap_out = np.row_stack((yap_out,
                                particles_yaparray(pos_mobile, rad_mobile)))
        yap_out = pyp.add_layer_switch(yap_out, 4)
        yap_out = pyp.add_color_switch(yap_out, 4)
        yap_out = np.row_stack((yap_out,
                                particles_yaparray(pos_fixed, rad_fixed)))
    else:
        yap_out = pyp.layer_switch(3)
        yap_out = pyp.add_color_switch(yap_out, 3)
        yap_out = np.row_stack((yap_out,
                                particles_yaparray(positions, radii)))

    pyp.savetxt(yap_filename, yap_out)
Example #18
    def create_batches(self,samples):

        sample_size = len(samples)
        self.num_batches = math.ceil(sample_size /self.batch_size)
        new_sample_size = self.num_batches * self.batch_size

        # Create the batch tensor
        # x_lengths = [len(sample) for sample in samples]

        x_lengths = []
        x_seqs = np.ndarray((new_sample_size,self.seq_max_length),dtype=np.int32)
        y_seqs = np.ndarray((new_sample_size,self.seq_max_length),dtype=np.int32)
        self.x_lengths = []
        for i,sample in enumerate(samples):
            # fill with padding to align batchSize samples into one 2D list
            x_lengths.append(len(sample))
            x_seqs[i] = sample + [self.padToken] * (self.seq_max_length - len(sample))

        for i in range(sample_size,new_sample_size):
            copyi = i - sample_size
            x_seqs[i] = x_seqs[copyi]
            x_lengths.append(x_lengths[copyi])

        y_seqs[:,:-1] = x_seqs[:,1:]
        y_seqs[:,-1] = x_seqs[:,0]
        x_len_array = np.array(x_lengths)



        self.x_batches = np.split(x_seqs.reshape(self.batch_size, -1), self.num_batches, 1)
        self.x_len_batches = np.split(x_len_array.reshape(self.batch_size, -1), self.num_batches, 1)
        self.y_batches = np.split(y_seqs.reshape(self.batch_size, -1), self.num_batches, 1)
def train_net(
        net,
        x_train,
        y_train,
        images,
        b_name='\033[30mbaseline_%s\033[0m',
        f_name='\033[30mfollow_%s\033[0m',
        d_name='\033[30mdeformation_%s\033[0m'
):
    defo = False
    d_inputs = []
    c = color_codes()
    n_images = len(images)
    # We try to get the last weights to keep improving the net over and over
    if isinstance(x_train, tuple):
        defo = True
        x_train, defo_train = x_train
        defo_train = np.split(defo_train, len(images), axis=1)
        d_inputs = [(d_name % im, np.squeeze(d_im)) for im, d_im in zip(images, defo_train)]
    print(c['c'] + '[' + strftime("%H:%M:%S") + ']    ' + c['g'] + 'Training' + c['nc'])
    n_channels = x_train.shape[1]
    x_train = np.split(x_train, n_channels, axis=1)
    b_inputs = [(b_name % im, x_im) for im, x_im in zip(images, x_train[:n_images])]
    f_inputs = [(f_name % im, x_im) for im, x_im in zip(images, x_train[n_images:])]
    inputs = dict(b_inputs + f_inputs) if not defo else dict(b_inputs + f_inputs + d_inputs)
    net.fit(inputs, y_train)
Example #20
def blocksort2D(sfield, ofield, db):
    """
    Takes two nx x ny fields and divides them into blocks - the new fields have
    dimensions nx' x ny' where nx' = nx/db, ny' = ny/db
    db is the block width in number of grid cells. 
    the fields are averaged over the block area (db x db points) and then 
    ofield is sorted according to sfield (spatial structure is lost)
    
    the returned value is a dictionary with sfield as the key and ofield as the value 
    
    assumes nx = ny = even integer.
    nx must be a multiple of db
    
    """
    nx = sfield.shape[0]
    ny = sfield.shape[1]
    
    nxblock = nx // db
    nyblock = ny // db
    
    #tave_field = np.mean(field[ti-ntave:ti,:,:])
    #tave_field = np.squeeze(tave_field)
    
    #split up field column-wise, take average row-wise. then split up resulting field row-wise, and take average column-wise.
    blocksfield = np.average(np.split(np.average(np.split(sfield, nxblock, axis=1), axis=-1), nyblock, axis=1), axis=-1)
    
    blockofield = np.average(np.split(np.average(np.split(ofield, nxblock, axis=1), axis=-1), nyblock, axis=1), axis=-1)
    
    blocksfield = blocksfield.flatten()
    blockofield = blockofield.flatten()
    
    d = dict(zip(blocksfield, blockofield))
    od = collections.OrderedDict(sorted(d.items()))
    
    return od
Example #21
def blockave2D(field, db):
    """
    Takes a nx x ny field and divides the field into blocks - the new field has
    dimensions nx' x ny' where nx' = nx/db, ny' = ny/db
    db is the block width in number of grid cells. 
    the field is averaged over the block area (db x db points) 
    
    in the case of 3D field, averaging is only performed in horizontal direction. 
    
    assumes nx = ny = even integer.
    nx must be a multiple of db
    
    
    """
    
    nx = field.shape[0]
    ny = field.shape[1]
    
    nxblock = nx // db
    nyblock = ny // db
    
    #split up field column-wise, take average row-wise. then split up resulting field row-wise, and take average column-wise.
    blockfield = np.average(np.split(np.average(np.split(field, nxblock, axis=1), axis=-1), nyblock, axis=1), axis=-1)
    
    return blockfield
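A quick check, assuming the blockave2D above is in scope; the values are chosen so the block means are easy to verify by hand:

import numpy as np

field = np.arange(16.0).reshape(4, 4)
print(blockave2D(field, db=2))    # [[ 2.5  4.5]
                                  #  [10.5 12.5]]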
Example #22
def gradient_p(X,y,theta,alpha,m,numIterations):

    errors1_x1 = 0
    errors1_x2 = 0

    errors2_x1 = 0
    errors2_x2 = 0

    x1,x2 = np.split(X,2)
    y1,y2 = np.split(y,2)

    for i in range(0,numIterations):
        
        h1 = x1.dot(theta)
        errors1_x1 = (h1 - y1) * x1[:, 0]
        errors1_x2 = (h1 - y1) * x1[:, 1]

        h2 = x2.dot(theta)
        errors2_x1 = (h2 - y2) * x2[:, 0]
        errors2_x2 = (h2 - y2) * x2[:, 1]
    
        theta[0]=theta[0]-(alpha/m)*(errors1_x1.sum()+errors2_x1.sum())
        theta[1]=theta[1]-(alpha/m)*(errors1_x2.sum()+errors2_x2.sum())
        
    return theta
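A rough usage sketch on noiseless synthetic data (the learning rate and iteration count are arbitrary choices; theta should approach the true coefficients):

import numpy as np

X = np.column_stack([np.ones(10), np.arange(10, dtype=float)])
y = 3.0 + 2.0 * X[:, 1]
theta = np.zeros(2)
theta = gradient_p(X, y, theta, alpha=0.01, m=len(y), numIterations=5000)
print(theta)          # close to [3., 2.]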
def split_x(x, split_pos):
    # NOTE: do not support multiple sentence tensors
    # sequence input , non-sequence input, and no non-sequence input
    # sequence input:
    if type(x) is not list:
        x=[x]

    if len(x) == 1:
        # sec1,                 sec2, sec3,...
        # sent1, sent2, sent5
        x01, x02 = tuple(np.split(x[0],[split_pos]))
        cond_list=[x02>=0,x02<0]
        offset = x02[0][0]
        choice_list=[x02-offset, x02 ]
        x02 = np.select(cond_list, choice_list)
        return ([x01],[x02])

    # doc1 doc2 doc3
    # sec1 sec2 ...

    # sec1, sec2, ...
    # sent1, sent2, ...

    x01, x02 = tuple(np.split(x[0], [split_pos]))
    offset = x02[0][0]
    x1, x2 = split_x(x[1:], offset)
    cond_list = [x02 >= 0, x02 < 0]
    choice_list = [x02 - offset, x02]
    x02 = np.select(cond_list, choice_list)
    return ([x01] + x1, [x02]+x2)
def k_fold_cross_validation_sets(X, y, k, shuffle=True):
    if shuffle:
        X, y = shuffle_data(X, y)

    n_samples = len(y)
    left_overs = {}
    n_left_overs = (n_samples % k)
    if n_left_overs != 0:
        left_overs["X"] = X[-n_left_overs:]
        left_overs["y"] = y[-n_left_overs:]
        X = X[:-n_left_overs]
        y = y[:-n_left_overs]

    X_split = np.split(X, k)
    y_split = np.split(y, k)
    sets = []
    for i in range(k):
        X_test, y_test = X_split[i], y_split[i]
        X_train = np.concatenate(X_split[:i] + X_split[i + 1:], axis=0)
        y_train = np.concatenate(y_split[:i] + y_split[i + 1:], axis=0)
        sets.append([X_train, X_test, y_train, y_test])

    # Add left over samples to last set as training samples
    if n_left_overs != 0:
        sets[-1][0] = np.append(sets[-1][0], left_overs["X"], axis=0)
        sets[-1][2] = np.append(sets[-1][2], left_overs["y"], axis=0)

    return np.array(sets)
Example #25
    def to_json(self):
        base = super().to_json()
        base['offsets'] = self.payoff_to_json(self._offset)
        base['coefs'] = self.payoff_to_json(self._coefs)

        lengths = {}
        for role, strats, lens in zip(
                self.role_names, self.strat_names,
                np.split(self._lengths, self.role_starts[1:])):
            lengths[role] = {s: self.payoff_to_json(l)
                             for s, l in zip(strats, lens)}
        base['lengths'] = lengths

        profs = {}
        for role, strats, data in zip(
                self.role_names, self.strat_names,
                np.split(np.split(self._profiles, self._size_starts[1:]),
                         self.role_starts[1:])):
            profs[role] = {strat: [self.profile_to_json(p) for p in dat]
                           for strat, dat in zip(strats, data)}
        base['profiles'] = profs

        alphas = {}
        for role, strats, alphs in zip(
                self.role_names, self.strat_names,
                np.split(np.split(self._alpha, self._size_starts[1:]),
                         self.role_starts[1:])):
            alphas[role] = {s: a.tolist() for s, a in zip(strats, alphs)}
        base['alphas'] = alphas

        base['type'] = 'rbf.1'
        return base
Example #26
def make_predictions(net, data, labels, num_classes):
    data = np.require(data, requirements='C')
    labels = np.require(labels, requirements='C')

    preds = np.zeros((data.shape[1], num_classes), dtype=np.single)
    softmax_idx = net.get_layer_idx('probs', check_type='softmax')

    t0 = time.time()
    net.libmodel.startFeatureWriter(
        [data, labels, preds], softmax_idx)
    net.finish_batch()
    print "Predicted %s cases in %.2f seconds." % (
        labels.shape[1], time.time() - t0)

    if net.multiview_test:
        #  We have to deal with num_samples * num_views
        #  predictions.
        num_views = net.test_data_provider.num_views
        num_samples = labels.shape[1] / num_views
        split_sections = range(
            num_samples, num_samples * num_views, num_samples)
        preds = np.split(preds, split_sections, axis=0)
        labels = np.split(labels, split_sections, axis=1)
        preds = reduce(np.add, preds)
        labels = labels[0]

    return preds, labels
Example #27
    def update_stipples(self, cells):
        """ Updates stipple locations from an image
                cells should be an image of the same size as self.img
                with pixel values representing which Voronoi cell that
                pixel falls into
        """
        indices = np.argsort(cells.flat)
        _, boundaries = np.unique(cells.flat[indices], return_index=True)

        gxs = np.split(self.gx.flat[indices], boundaries)[1:]
        gys = np.split(self.gy.flat[indices], boundaries)[1:]
        gws = np.split(1 - self.img.flat[indices], boundaries)[1:]

        w = self.img.shape[1] / 2.0
        h = self.img.shape[0] / 2.0

        for i, (gx, gy, gw) in enumerate(zip(gxs, gys, gws)):
            weight = np.sum(gw)
            if weight > 0:
                x = np.sum(gx * gw) / weight
                y = np.sum(gy * gw) / weight

                self.stipples[i,:] = [(x - w) / w, (y - h) / h]
            else:
                self.stipples[i,:] = np.random.uniform(-1, 1, size=2)
Example #28
def generate_svm():
    digits, labels = load_digits(DIGITS_FN)

    print('preprocessing...')
    # shuffle digits
    rand = np.random.RandomState(321)
    shuffle = rand.permutation(len(digits))
    digits, labels = digits[shuffle], labels[shuffle]

    digits2 = list(map(deskew, digits))
    samples = preprocess_hog(digits2)

    train_n = int(0.9*len(samples))
    cv2.imshow('test set', mosaic(25, digits[train_n:]))
    digits_train, digits_test = np.split(digits2, [train_n])
    samples_train, samples_test = np.split(samples, [train_n])
    labels_train, labels_test = np.split(labels, [train_n])


    print('training SVM...')
    model = SVM(C=2.67, gamma=5.383)
    model.train(samples_train, labels_train)
    vis = evaluate_model(model, digits_test, samples_test, labels_test)
    print('saving SVM as "digits_svm.dat"...')
    return model

    cv2.waitKey(0)
Example #29
 def train(self, trainfile_name):
   train_X, train_Y, num_classes = self.make_data(trainfile_name)
   accuracies = []
   fscores = []
   if self.cv:
     num_points = train_X.shape[0]
     fol_len = num_points / self.folds
     rem = num_points % self.folds
     X_folds = numpy.split(train_X, self.folds) if rem == 0 else numpy.split(train_X[:-rem], self.folds)
     Y_folds = numpy.split(train_Y, self.folds) if rem == 0 else numpy.split(train_Y[:-rem], self.folds)
     for i in range(self.folds):
       train_folds_X = []
       train_folds_Y = []
       for j in range(self.folds):
         if i != j:
           train_folds_X.append(X_folds[j])
           train_folds_Y.append(Y_folds[j])
       train_fold_X = numpy.concatenate(train_folds_X)
       train_fold_Y = numpy.concatenate(train_folds_Y)
       classifier = self.fit_model(train_fold_X, train_fold_Y, num_classes)
       predictions = self.classify(classifier, X_folds[i])
       accuracy, weighted_fscore, _ = self.evaluate(Y_folds[i], predictions)
       accuracies.append(accuracy)
       fscores.append(weighted_fscore)
     accuracies = numpy.asarray(accuracies)
     fscores = numpy.asarray(fscores)
     print >>sys.stderr, "Accuracies:", accuracies
     print >>sys.stderr, "Average: %0.4f (+/- %0.4f)"%(accuracies.mean(), accuracies.std() * 2)
     print >>sys.stderr, "Fscores:", fscores
     print >>sys.stderr, "Average: %0.4f (+/- %0.4f)"%(fscores.mean(), fscores.std() * 2)
   self.classifier = self.fit_model(train_X, train_Y, num_classes)
   cPickle.dump(self.classifier, open(self.trained_model_name, "wb"))
   #pickle.dump(tagset, open(self.stored_tagset, "wb"))
   print >>sys.stderr, "Done"
Example #30
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint_dir', type=str, default='checkpoints',
                       help='checkpoint directory')
    parser.add_argument('--save_every', type=int, default=1000,
                       help='save frequency')
    args = parser.parse_args()

    # Read the training data
    inputFile = open("data/input.txt","rU")
    trainingData = inputFile.read()

    # Count vocab 
    counter = collections.Counter(trainingData)
    count_pairs = sorted(counter.items(), key=lambda x: -x[1])
    chars, _ = list(zip(*count_pairs))
    vocabSize = len(chars)
    print vocabSize
    vocab = dict(zip(chars, range(len(chars))))
    inputTensor = np.array(map(vocab.get, trainingData))

    numBatches = inputTensor.size / (batchSize * numSteps)

    print numBatches

    inputTensor = inputTensor[:numBatches * batchSize * numSteps]
    inData = inputTensor
    targetData = np.copy(inputTensor)
    targetData[:-1] = inData[1:]
    targetData[-1] = inData[0]
    inDataBatches = np.split(inData.reshape(batchSize, -1), numBatches, 1)
    targetDataBatches = np.split(targetData.reshape(batchSize, -1), numBatches, 1)
    
    lstmTrain(args)
    def get_medium_estimator(self, measurements):
        """
        """

        num_of_mediums = self.args.num_mediums
        cv_index = self.args.use_cross_validation
        time_list = measurements.time_list
        if cv_index >= 0:
            time_list = np.delete(time_list, cv_index)

        assert isinstance(num_of_mediums, int) and num_of_mediums <= len(time_list)

        wavelength = measurements.wavelength
        if not isinstance(wavelength,list):
            wavelength = [wavelength]

        # Define the grid for reconstruction
        grid = albedo_grid = phase_grid = shdom.Grid(bounding_box=measurements.bb,nx=self.args.nx,ny=self.args.ny,nz=self.args.nz)

        if self.args.assume_moving_cloud:
            cloud_velocity = None
        else:
            cloud_velocity = [0,0,0]


        # Find a cloud mask for non-cloudy grid points
        dynamic_carver = shdom.DynamicSpaceCarver(measurements)
        mask_list, dynamic_grid, cloud_velocity = dynamic_carver.carve(grid, agreement=0.70,
                            time_list = measurements.time_list, thresholds=self.args.radiance_threshold,
                            vx_max = 10, vy_max=10, gt_velocity = cloud_velocity)
        mask = mask_list[0]
        show_mask=1
        if show_mask:
            a = mask.data.astype(int)
            shdom.cloud_plot(a)
            print(cloud_velocity)
            print(sum(sum(sum(a))))
        table_path = self.args.mie_base_path.replace('<wavelength>', '{}'.format(shdom.int_round(wavelength[0])))
        self.cloud_generator.add_mie(table_path)
        albedo = self.cloud_generator.get_albedo(wavelength[0], [albedo_grid] * num_of_mediums)
        phase = self.cloud_generator.get_phase(wavelength[0], [phase_grid] * num_of_mediums)

        # cv_index = self.args.use_cross_validation
        # if cv_index >= 0:
        #     # del dynamic_grid[cv_index]
        #     # del mask_list[cv_index]
        #     # del albedo[cv_index]
        #     # del phase[cv_index]
        #     time_list = np.delete(measurements.time_list, cv_index)
        time_list = np.mean(np.split(time_list, num_of_mediums), 1)


        extinction = shdom.DynamicGridDataEstimator(self.cloud_generator.get_extinction(wavelength, [grid] * num_of_mediums),
                                                    min_bound=1e-5,
                                                    max_bound=2e2)
        kw_optical_scatterer = {"extinction": extinction, "albedo": albedo, "phase": phase}
        cloud_estimator = shdom.DynamicScattererEstimator(wavelength=wavelength, time_list=time_list, **kw_optical_scatterer)
        cloud_estimator.set_mask([mask] * num_of_mediums)

        # Create a medium estimator object (optional Rayleigh scattering)
        air = self.air_generator.get_scatterer(wavelength)
        medium_estimator = shdom.DynamicMediumEstimator(cloud_estimator, air, cloud_velocity)

        return medium_estimator
Example #32
def split_array(v1, v2, pieces_number):
    res_v1, res_v2 = np.split(v1, pieces_number), np.split(v2, pieces_number)
    for i in range(len(res_v1)):
        if res_v1[i].size > 0:
            yield (res_v1[i], res_v2[i])
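For example (the piece count is arbitrary):

import numpy as np

v1, v2 = np.arange(10), np.arange(10, 20)
for a, b in split_array(v1, v2, 5):
    print(a, b)        # five pairs of length-2 chunks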
Example #33
def label_ims(ims_batch,
              labels=None,
              inverse_normalize=False,
              normalize=False,
              clip_flow=10,
              display_h=128,
              pad_top=None,
              clip_norm=None,
              padding_size=0,
              padding_color=255,
              border_size=0,
              border_color=0,
              color_space='rgb',
              combine_from_axis=0,
              concat_axis=0,
              interp=cv2.INTER_LINEAR):
    '''
    Displays a batch of matrices as an image.

    :param ims_batch: n_batches x h x w x c array of images.
    :param labels: optional labels. Can be an n_batches length list of tuples, floats or strings
    :param inverse_normalize: boolean to do normalization from [-1, 1] to [0, 255]
    :param normalize: boolean to normalize any [min, max] to [0, 255]
    :param clip_flow: float for the min, max absolute flow magnitude to display
    :param display_h: integer number of pixels for the height of each image to display
    :param pad_top: integer number of pixels to pad each image at the top with (for more readable labels)
    :param color_space: string of either 'rgb' or 'ycbcr' to do color space conversion before displaying
    :param concat_axis: integer axis number to concatenate batch along (default is 0 for rows)

    :return:
    '''

    if isinstance(ims_batch, np.ndarray) and len(
            ims_batch.shape) == 3 and ims_batch.shape[-1] == 3:
        # already an image
        return ims_batch

    # transpose the image until batches are in the 0th axis
    if not combine_from_axis == 0:
        # compute all remaining axes
        all_axes = list(range(len(ims_batch.shape)))
        del all_axes[combine_from_axis]
        ims_batch = np.transpose(ims_batch,
                                 (combine_from_axis, ) + tuple(all_axes))

    batch_size = len(ims_batch)  # works for lists and np arrays
    h = ims_batch[0].shape[0]
    w = ims_batch[0].shape[1]
    if len(ims_batch[0].shape) == 2:
        n_chans = 1
    else:
        n_chans = ims_batch[0].shape[-1]

    if type(labels) == list and len(labels) == 1:  # only label the first image
        labels = labels + [''] * (batch_size - 1)
    elif labels is not None and not type(labels) == list and not type(
            labels) == np.ndarray:
        labels = [labels] * batch_size

    scale_factor = display_h / float(h)

    if pad_top:
        im_h = int(display_h + pad_top)
    else:
        im_h = display_h
    im_w = round(scale_factor * float(w))

    # make sure we have a channels dimension
    if len(ims_batch.shape) < 4:
        ims_batch = np.expand_dims(ims_batch, 3)

    if ims_batch.shape[-1] == 2:  # assume to be x,y flow; map to color im
        X_fullcolor = np.concatenate(
            [ims_batch.copy(),
             np.zeros(ims_batch.shape[:-1] + (1, ))], axis=3)

        if labels is not None:
            labels = [''] * batch_size

        for i in range(batch_size):
            X_fullcolor[i], min_flow, max_flow = flow_to_im(
                ims_batch[i], clip_flow=clip_flow)

            # also include the min and max flow in  the label
            if labels[i] is not None:
                labels[i] = '{},'.format(labels[i])
            else:
                labels[i] = ''

            for c in range(len(min_flow)):
                labels[i] += '({}, {})'.format(round(min_flow[c], 1),
                                               round(max_flow[c], 1))
        ims_batch = X_fullcolor.copy()
    elif ims_batch.shape[-1] > 3:
        # not an image, probably labels

        n_labels = ims_batch.shape[-1]
        cmap = make_cmap_rainbow(n_labels)

        labels_im = classification_utils.onehot_to_labels(
            ims_batch, n_classes=ims_batch.shape[-1])
        labels_im_flat = labels_im.flatten()
        labeled_im_flat = np.tile(labels_im_flat[..., np.newaxis],
                                  (1, 3)).astype(np.float32)

        #for ei in range(batch_size):
        for l in range(n_labels):
            labeled_im_flat[labels_im_flat == l, :] = cmap[l]
        ims_batch = labeled_im_flat.reshape((-1, ) + ims_batch.shape[1:-1] +
                                            (3, ))

    elif inverse_normalize:
        ims_batch = image_utils.inverse_normalize(ims_batch)

    elif normalize:
        flattened_dims = np.prod(ims_batch.shape[1:])

        X_spatially_flat = np.reshape(ims_batch, (batch_size, -1, n_chans))
        X_orig_min = np.min(X_spatially_flat, axis=1)
        X_orig_max = np.max(X_spatially_flat, axis=1)

        # now actually flatten and normalize across channels
        X_flat = np.reshape(ims_batch, (batch_size, -1))
        if clip_norm is None:
            X_flat = X_flat - np.tile(np.min(X_flat, axis=1, keepdims=True),
                                      (1, flattened_dims))
            # avoid dividing by 0
            X_flat = X_flat / np.clip(
                np.tile(np.max(X_flat, axis=1, keepdims=True),
                        (1, flattened_dims)), 1e-5, None)
        else:
            X_flat = X_flat - (-float(clip_norm))
            # avoid dividing by 0
            X_flat = X_flat / (2. * clip_norm)
            #X_flat = X_flat - np.tile(np.min(X_flat, axis=1, keepdims=True), (1, flattened_dims))
            # avoid dividing by 0
            #X_flat = X_flat / np.clip(np.tile(np.max(X_flat, axis=1, keepdims=True), (1, flattened_dims)), 1e-5, None)

        ims_batch = np.reshape(X_flat, ims_batch.shape)
        ims_batch = np.clip(ims_batch.astype(np.float32), 0., 1.)
        for i in range(batch_size):
            if labels is not None and len(labels) > 0:
                if labels[i] is not None:
                    labels[i] = '{},'.format(labels[i])
                else:
                    labels[i] = ''
                # show the min, max of each channel
                for c in range(n_chans):
                    labels[i] += '({:.2f}, {:.2f})'.format(
                        round(X_orig_min[i, c], 2), round(X_orig_max[i, c], 2))
    else:
        ims_batch = np.clip(ims_batch, 0., 1.)

    if color_space == 'ycbcr':
        for i in range(batch_size):
            ims_batch[i] = cv2.cvtColor(ims_batch[i], cv2.COLOR_YCR_CB2BGR)

    if np.max(ims_batch) <= 1.0:
        ims_batch = ims_batch * 255.0

    out_im = []
    for i in range(batch_size):
        # convert grayscale to rgb if needed
        if len(ims_batch[i].shape) == 2:
            curr_im = np.tile(np.expand_dims(ims_batch[i], axis=-1), (1, 1, 3))
        elif ims_batch.shape[-1] == 1:
            curr_im = np.tile(ims_batch[i], (1, 1, 3))
        else:
            curr_im = ims_batch[i]

        # scale to specified display size
        if not scale_factor == 1:
            curr_im = cv2.resize(curr_im,
                                 None,
                                 fx=scale_factor,
                                 fy=scale_factor,
                                 interpolation=interp)

        if pad_top:
            curr_im = np.concatenate([
                np.zeros(
                    (pad_top, curr_im.shape[1], curr_im.shape[2])), curr_im
            ],
                                     axis=0)

        if border_size > 0:
            # add a border all around the image
            curr_im = cv2.copyMakeBorder(curr_im,
                                         border_size,
                                         border_size,
                                         border_size,
                                         border_size,
                                         borderType=cv2.BORDER_CONSTANT,
                                         value=border_color)

        if padding_size > 0 and i < batch_size - 1:
            # include a border between images
            padding_shape = list(curr_im.shape[:3])
            padding_shape[concat_axis] = padding_size

            curr_im = np.concatenate(
                [curr_im, np.ones(padding_shape) * padding_color],
                axis=concat_axis)

        out_im.append(curr_im)

    if display_h > 50:
        font_size = 15
    else:
        font_size = 10

    if concat_axis is not None:
        out_im = np.concatenate(out_im, axis=concat_axis).astype(np.uint8)
    else:
        out_im = np.concatenate(out_im, axis=0).astype(np.uint8)

    max_text_width = int(17 * display_h / 128.)  # empirically determined
    if labels is not None and len(labels) > 0:
        im_pil = Image.fromarray(out_im)
        draw = ImageDraw.Draw(im_pil)

        for i in range(batch_size):
            if len(labels) > i:  # if we have a label for this image
                if type(labels[i]) == tuple or type(labels[i]) == list:
                    # format tuple or list nicely
                    formatted_text = ', '.join([
                        labels[i][j].decode('UTF-8') if type(labels[i][j]) == np.unicode_ \
                            else labels[i][j] if type(labels[i][j]) == str \
                            else str(round(labels[i][j], 2)) if isinstance(labels[i][j], float) \
                            else str(labels[i][j]) for j in range(len(labels[i]))])
                elif type(labels[i]) == float or type(labels[i]) == np.float32:
                    formatted_text = str(round(labels[i],
                                               2))  # round floats to 2 digits
                elif isinstance(labels[i], np.ndarray):
                    # assume that this is a 1D array
                    curr_labels = np.squeeze(labels[i]).astype(np.float32)
                    formatted_text = np.array2string(curr_labels,
                                                     precision=2,
                                                     separator=',')
                    # ', '.join(['{}'.format(
                    #	np.around(labels[i][j], 2)) for j in range(labels[i].size)])
                else:
                    formatted_text = '{}'.format(labels[i])

                if display_h > 30:  # only print label if we have room
                    try:
                        font = ImageFont.truetype('Ubuntu-M.ttf', font_size)
                    except:
                        font = ImageFont.truetype('arial.ttf', font_size)
                    # wrap the text so it fits
                    formatted_text = textwrap.wrap(formatted_text,
                                                   width=max_text_width)

                    for li, line in enumerate(formatted_text):
                        if concat_axis == 0:
                            draw.text((5, i * im_h + 5 + 14 * li),
                                      line,
                                      font=font,
                                      fill=(50, 50, 255))
                        elif concat_axis == 1:
                            draw.text((5 + i * im_w, 5 + 14 * li),
                                      line,
                                      font=font,
                                      fill=(50, 50, 255))

        out_im = np.asarray(im_pil)

    # else:
    #     out_im = [im.astype(np.uint8) for im in out_im]
    #
    #     max_text_width = int(17 * display_h / 128.)  # empirically determined
    #     if labels is not None and len(labels) > 0:
    #         for i, im in enumerate(out_im):
    #             im_pil = Image.fromarray(im)
    #             draw = ImageDraw.Draw(im_pil)
    #
    #
    #             if len(labels) > i:  # if we have a label for this image
    #                 if type(labels[i]) == tuple or type(labels[i]) == list:
    #                     # format tuple or list nicely
    #                     formatted_text = ', '.join([
    #                         labels[i][j].decode('UTF-8') if type(labels[i][j]) == np.unicode_ \
    #                             else labels[i][j] if type(labels[i][j]) == str \
    #                             else str(round(labels[i][j], 2)) if isinstance(labels[i][j], float) \
    #                             else str(labels[i][j]) for j in range(len(labels[i]))])
    #                 elif type(labels[i]) == float or type(labels[i]) == np.float32:
    #                     formatted_text = str(round(labels[i], 2))  # round floats to 2 digits
    #                 elif isinstance(labels[i], np.ndarray):
    #                     # assume that this is a 1D array
    #                     curr_labels = np.squeeze(labels[i]).astype(np.float32)
    #                     formatted_text = np.array2string(curr_labels, precision=2, separator=',')
    #                     # ', '.join(['{}'.format(
    #                     #	np.around(labels[i][j], 2)) for j in range(labels[i].size)])
    #                 else:
    #                     formatted_text = '{}'.format(labels[i])
    #
    #                 if display_h > 30:  # only print label if we have room
    #                     try:
    #                         font = ImageFont.truetype('Ubuntu-M.ttf', font_size)
    #                     except:
    #                         font = ImageFont.truetype('arial.ttf', font_size)
    #                     # wrap the text so it fits
    #                     formatted_text = textwrap.wrap(formatted_text, width=max_text_width)
    #
    #                     for li, line in enumerate(formatted_text):
    #                         draw.text((5, 5 + 14 * li), line, font=font, fill=(50, 50, 255))
    #             im = np.asarray(im_pil)
    if concat_axis is None:
        # un-concat the image. faster this way
        out_im = np.split(out_im, batch_size, axis=combine_from_axis)
    return out_im
Example #34
 def make_random_iter(self):
     splits = np.arange(self.batch_size, len(self.inputs), self.batch_size)
     np.random.seed(42)
     it = np.split(np.random.permutation(range(len(self.inputs))),
                   splits)[:-1]
     return iter(it)
Example #35
def unflatten(y, cuts):
    return np.split(y, cuts) if cuts else y
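For instance (the cut points are arbitrary):

import numpy as np

parts = unflatten(np.arange(10), [3, 7])
print([p.tolist() for p in parts])   # [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9]]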
Example #36
    print __doc__

    digits, labels = load_digits(DIGITS_FN)

    print 'preprocessing...'
    # shuffle digits
    rand = np.random.RandomState(321)
    shuffle = rand.permutation(len(digits))
    digits, labels = digits[shuffle], labels[shuffle]

    digits2 = map(deskew, digits)
    samples = preprocess_hog(digits2)

    train_n = int(0.9*len(samples))
    cv2.imshow('test set', mosaic(25, digits[train_n:]))
    digits_train, digits_test = np.split(digits2, [train_n])
    samples_train, samples_test = np.split(samples, [train_n])
    labels_train, labels_test = np.split(labels, [train_n])


    print 'training KNearest...'
    model = KNearest(k=4)
    model.train(samples_train, labels_train)
    vis = evaluate_model(model, digits_test, samples_test, labels_test)
    cv2.imshow('KNearest test', vis)
    cv2.waitKey(10000)
    print 'training SVM...'
    model = SVM(C=2.67, gamma=5.383)
    model.train(samples_train, labels_train)
    vis = evaluate_model(model, digits_test, samples_test, labels_test)
    cv2.imshow('SVM test', vis)
Example #37
def _unstack(value, num=None, axis=0, name='unstack'):
    del name
    value = np.array(value)
    return list(
        np.squeeze(x, axis=axis) for x in np.split(
            value, value.shape[axis] if num is None else num, axis))
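A quick check of this numpy stand-in (the shape is illustrative):

import numpy as np

rows = _unstack(np.arange(12).reshape(3, 4), axis=0)
print(len(rows), rows[0].shape)      # 3 (4,)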
Example #38
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="/home/abc/shhs/polysomnography/edfs/shhs1",
                        help="File path to the PSG files.")
    parser.add_argument("--ann_dir", type=str, default="/home/abc/shhs/polysomnography/annotations-events-profusion/shhs1",
                        help="File path to the annotation files.")
    parser.add_argument("--output_dir", type=str, default="/home/abc/output_npz/shhs",
                        help="Directory where to save numpy files outputs.")
    parser.add_argument("--select_ch", type=str, default="EEG C4-A1",
                        help="The selected channel")
    args = parser.parse_args()


    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    ids = pd.read_csv("selected_shhs1_files.txt", header=None, names='a')
    ids = ids['a'].values.tolist()

    edf_fnames = [os.path.join(args.data_dir, i + ".edf") for i in ids]
    ann_fnames = [os.path.join(args.ann_dir,  i + "-profusion.xml") for i in ids]

    edf_fnames.sort()
    ann_fnames.sort()

    edf_fnames = np.asarray(edf_fnames)
    ann_fnames = np.asarray(ann_fnames)


    for file_id in range(len(edf_fnames)):
        if os.path.exists(os.path.join(args.output_dir, edf_fnames[file_id].split('/')[-1])[:-4]+".npz"):
            continue
        print(edf_fnames[file_id])

        raw = read_raw_edf(edf_fnames[file_id], preload=True, stim_channel=None, verbose=None)
        sampling_rate = raw.info['sfreq']
        ch_type = args.select_ch.split(" ")[0]
        select_ch = [s for s in raw.info["ch_names"] if ch_type in s][0]

        raw_ch_df = raw.to_data_frame(scaling_time=sampling_rate)[select_ch]
        raw_ch_df = raw_ch_df.to_frame()
        raw_ch_df.set_index(np.arange(len(raw_ch_df)))



    ###################################################
        labels = []
        # Read annotation and its header
        t = ET.parse(ann_fnames[file_id])
        r = t.getroot()
        faulty_File = 0
        for i in range(len(r[4])):
            lbl = int(r[4][i].text)
            if lbl == 4:  # make stages N3, N4 same as N3
                labels.append(3)
            elif lbl == 5:  # Assign label 4 for REM stage
                labels.append(4)
            else:
                labels.append(lbl)
            if lbl > 5:  # some files may contain labels > 5 BUT not the selected ones.
                faulty_File = 1

        if faulty_File == 1:
            print( "============================== Faulty file ==================")
            continue

        labels = np.asarray(labels)

        # Remove movement and unknown stages if any
        raw_ch = raw_ch_df.values
        print(raw_ch.shape)

        # Verify that we can split into 30-s epochs
        if len(raw_ch) % (EPOCH_SEC_SIZE * sampling_rate) != 0:
            raise Exception("Something wrong")
        n_epochs = len(raw_ch) / (EPOCH_SEC_SIZE * sampling_rate)

        # Get epochs and their corresponding labels
        x = np.asarray(np.split(raw_ch, n_epochs)).astype(np.float32)
        y = labels.astype(np.int32)

        print(x.shape)
        print(y.shape)
        assert len(x) == len(y)

        # Select on sleep periods
        w_edge_mins = 30
        nw_idx = np.where(y != 0)[0]
        start_idx = nw_idx[0] - (w_edge_mins * 2)
        end_idx = nw_idx[-1] + (w_edge_mins * 2)
        if start_idx < 0: start_idx = 0
        if end_idx >= len(y): end_idx = len(y) - 1
        select_idx = np.arange(start_idx, end_idx + 1)
        print("Data before selection: {}, {}".format(x.shape, y.shape))
        x = x[select_idx]
        y = y[select_idx]
        print("Data after selection: {}, {}".format(x.shape, y.shape))

        # Saving as numpy files
        filename = os.path.basename(edf_fnames[file_id]).replace(".edf",  ".npz")
        save_dict = {
            "x": x,
            "y": y,
            "fs": sampling_rate
        }
        np.savez(os.path.join(args.output_dir, filename), **save_dict)
        print(" ---------- Done this file ---------")
data = pd.read_csv('full_data.csv', index_col=0)
cols = ['apparentTemperature', 'humidity', 'MWh']
df = data[cols]

df = (df - df.min()) / (df.max() - df.min())  ##Min-Max Normalization
#df = (df - df.mean())/df.std() ##Gaussian normalization

inputs = df
targets = df['MWh']  #Un-normalized targets

#Percentage of samples to use as training data
TRAINING_SAMPLE_RATIO = 0.7
num_training_samples = round(len(inputs) * TRAINING_SAMPLE_RATIO)

#Splits data samples
(training_inputs, test_inputs) = np.split(inputs.values,
                                          [num_training_samples])
(training_targets, test_targets) = np.split(targets.values,
                                            [num_training_samples])

#Splits timestamps for plotting later
(training_t, test_t) = np.split(data['index'].values, [num_training_samples])

#Prepares training data for input to network
training_inputs = Variable(torch.from_numpy(training_inputs).float()).cuda()
training_targets = Variable(torch.from_numpy(training_targets).float()).cuda()
test_inputs = Variable(torch.from_numpy(test_inputs).float()).cuda()
test_targets = Variable(torch.from_numpy(test_targets).float()).cuda()

# -------------------- Instantiate LSTM Network  --------------------- #
# Model Params
input_dim = training_inputs.shape[1]
Example #40
def np_split_squeeze(array, axis):
    axis_len = array.shape[axis]
    return [
        np.squeeze(arr, axis=(axis, ))
        for arr in np.split(array, axis_len, axis=axis)
    ]
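For example:

import numpy as np

slices = np_split_squeeze(np.zeros((2, 3, 4)), axis=1)
print(len(slices), slices[0].shape)  # 3 (2, 4)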
Example #41
    def generate_random_rois(self, image_shape, count, gt_boxes):
        """
            Generates ROI proposals similar to what a region proposal network
            would generate.
        :param image_shape: [Height, Width, Depth]
        :param count: Number of ROIs to generate
        :param gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels.
        :return:
        """
        # placeholder
        rois = np.zeros((count, 4), dtype=np.int32)

        # Generate random ROIs around GT boxes (90% of count)
        rois_per_box = int(0.9 * count / gt_boxes.shape[0])
        for i in range(gt_boxes.shape[0]):
            gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i]
            h = gt_y2 - gt_y1
            w = gt_x2 - gt_x1
            # random boundaries
            r_y1 = max(gt_y1 - h, 0)
            r_y2 = min(gt_y2 + h, image_shape[0])
            r_x1 = max(gt_x1 - w, 0)
            r_x2 = min(gt_x2 + w, image_shape[1])

            # To avoid generating boxes with zero area, we generate double what
            # we need and filter out the extra. If we get fewer valid boxes
            # than we need, we loop and try again.
            while True:
                y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2))
                x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2))
                # Filter out zero area boxes
                threshold = 1
                y1y2 = y1y2[np.abs(y1y2[:, 0] -
                                   y1y2[:, 1]) >= threshold][:rois_per_box]
                x1x2 = x1x2[np.abs(x1x2[:, 0] -
                                   x1x2[:, 1]) >= threshold][:rois_per_box]
                if y1y2.shape[0] == rois_per_box and x1x2.shape[
                        0] == rois_per_box:
                    break

            # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2, then stack
            # into y1, x1, y2, x2 order
            x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
            y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
            box_rois = np.hstack([y1, x1, y2, x2])
            rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois

        # Generate random ROIs anywhere in the image (10% of count)
        remaining_count = count - (rois_per_box * gt_boxes.shape[0])
        # To avoid generating boxes with zero area, we generate double what
        # we need and filter out the extra. If we get fewer valid boxes
        # than we need, we loop and try again.
        while True:
            y1y2 = np.random.randint(0, image_shape[0],
                                     (remaining_count * 2, 2))
            x1x2 = np.random.randint(0, image_shape[1],
                                     (remaining_count * 2, 2))
            # Filter out zero area boxes
            threshold = 1
            y1y2 = y1y2[np.abs(y1y2[:, 0] -
                               y1y2[:, 1]) >= threshold][:remaining_count]
            x1x2 = x1x2[np.abs(x1x2[:, 0] -
                               x1x2[:, 1]) >= threshold][:remaining_count]
            if y1y2.shape[0] == remaining_count and x1x2.shape[
                    0] == remaining_count:
                break

        # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2, then stack
        # into y1, x1, y2, x2 order
        x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
        y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
        global_rois = np.hstack([y1, x1, y2, x2])
        rois[-remaining_count:] = global_rois

        return rois
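The coordinate handling above uses a small pattern worth isolating: sort each (low, high) pair, split the two columns apart, and hstack them back in (y1, x1, y2, x2) order. A stand-alone sketch with random boxes (sizes are arbitrary):

import numpy as np

y1y2 = np.random.randint(0, 100, (5, 2))
x1x2 = np.random.randint(0, 100, (5, 2))
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)  # each column has shape (5, 1)
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
boxes = np.hstack([y1, x1, y2, x2])                  # (5, 4) with y1 <= y2 and x1 <= x2
print(boxes.shape)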
Example #42
0

mat_size = int(sys.argv[1])

# Initialize the 2 random matrices only if this is rank 0
if rank == 0:
    mat_A = np.random.rand(mat_size, mat_size)
    mat_B = np.random.rand(mat_size, mat_size)
    ans = np.matmul(mat_A, mat_B)

    t_start = MPI.Wtime()

    power = np.log2(size) / 2
    i_len = int(2**(np.ceil(power)))
    j_len = int(2**(np.floor(power)))
    send_list_A = np.split(mat_A, i_len, axis=0)
    send_list_B = np.split(mat_B, j_len, axis=1)
    send_list = []
    for i in range(i_len):
        for j in range(j_len):
            send_list.append([send_list_A[i], send_list_B[j]])
else:
    mat_A = None
    mat_B = None
    send_list = None

mats = comm.scatter(send_list, root=0)

mat_C = matrix_mult(mats[0], mats[1])

res_list = comm.gather(mat_C, root=0)
Example #43
0
def get_params_from_mat(matpath):
    """Get parameter from .mat file into parms(dict)"""
    def squeeze(vars_):
        # Matlab save some params with shape (*, 1)
        # However, we don't need the trailing dimension in TensorFlow.
        if isinstance(vars_, (list, tuple)):
            return [np.squeeze(v, 1) for v in vars_]
        else:
            return np.squeeze(vars_, 1)

    netparams = sio.loadmat(matpath)["net"]["params"][0][0]
    params = dict()

    for i in range(netparams.size):
        param = netparams[0][i]
        name = param["name"][0]
        value = param["value"]
        value_size = param["value"].shape[0]

        match = re.match(r"([a-z]+)([0-9]+)([a-z]+)", name, re.I)
        if match:
            items = match.groups()
        elif name == 'adjust_f':
            params['detection/weights'] = squeeze(value)
            continue
        elif name == 'adjust_b':
            params['detection/biases'] = squeeze(value)
            continue
        else:
            raise Exception('unrecognized layer params')

        op, layer, types = items
        layer = int(layer)
        if layer in [1, 3]:
            if op == 'conv':  # convolution
                if types == 'f':
                    params['conv%d/weights' % layer] = value
                elif types == 'b':
                    value = squeeze(value)
                    params['conv%d/biases' % layer] = value
            elif op == 'bn':  # batch normalization
                if types == 'x':
                    m, v = squeeze(np.split(value, 2, 1))
                    params['conv%d/BatchNorm/moving_mean' % layer] = m
                    params['conv%d/BatchNorm/moving_variance' %
                           layer] = np.square(v)
                elif types == 'm':
                    value = squeeze(value)
                    params['conv%d/BatchNorm/gamma' % layer] = value
                elif types == 'b':
                    value = squeeze(value)
                    params['conv%d/BatchNorm/beta' % layer] = value
            else:
                raise Exception
        elif layer in [2, 4]:
            if op == 'conv' and types == 'f':
                b1, b2 = np.split(value, 2, 3)
            else:
                b1, b2 = np.split(value, 2, 0)
            if op == 'conv':
                if types == 'f':
                    params['conv%d/b1/weights' % layer] = b1
                    params['conv%d/b2/weights' % layer] = b2
                elif types == 'b':
                    b1, b2 = squeeze(np.split(value, 2, 0))
                    params['conv%d/b1/biases' % layer] = b1
                    params['conv%d/b2/biases' % layer] = b2
            elif op == 'bn':
                if types == 'x':
                    m1, v1 = squeeze(np.split(b1, 2, 1))
                    m2, v2 = squeeze(np.split(b2, 2, 1))
                    params['conv%d/b1/BatchNorm/moving_mean' % layer] = m1
                    params['conv%d/b2/BatchNorm/moving_mean' % layer] = m2
                    params['conv%d/b1/BatchNorm/moving_variance' %
                           layer] = np.square(v1)
                    params['conv%d/b2/BatchNorm/moving_variance' %
                           layer] = np.square(v2)
                elif types == 'm':
                    params['conv%d/b1/BatchNorm/gamma' % layer] = squeeze(b1)
                    params['conv%d/b2/BatchNorm/gamma' % layer] = squeeze(b2)
                elif types == 'b':
                    params['conv%d/b1/BatchNorm/beta' % layer] = squeeze(b1)
                    params['conv%d/b2/BatchNorm/beta' % layer] = squeeze(b2)
            else:
                raise Exception

        elif layer in [5]:
            if op == 'conv' and types == 'f':
                b1, b2 = np.split(value, 2, 3)
            else:
                b1, b2 = squeeze(np.split(value, 2, 0))
            assert op == 'conv', 'layer5 contains only convolution'
            if types == 'f':
                params['conv%d/b1/weights' % layer] = b1
                params['conv%d/b2/weights' % layer] = b2
            elif types == 'b':
                params['conv%d/b1/biases' % layer] = b1
                params['conv%d/b2/biases' % layer] = b2

    return params
Example #44
0
    def load_train_data(self, positive_file, negative_file):

        # #LOAD NEGATIVE
        # positive file is constant while negative file is changed during train!
        # if os.path.exists(negative_file + '.npy'):
        #     negative_examples = np.load(negative_file + '.npy')
        # else:
        negative_examples = []

        # remove \n
        with open(negative_file, 'r') as f:
            all_negative = f.read()
        with open(negative_file, 'w') as f:
            f.write(all_negative.replace('\n',''))

        with open(negative_file, 'r') as f:
            line = f.read(self.seq_len)
            while len(line) == self.seq_len:
                tokens = [int(self.charmap[char]) for char in line]
                assert len(tokens) == self.seq_len
                negative_examples.append(tokens)

                line = f.read(self.seq_len)

        # np.save(negative_file,np.array(negative_examples))
        negative_examples = np.array(negative_examples)
        num_positive_samples = negative_examples.shape[0]


        #LOAD POSITIVE

        cache_positive = "%s_seqlen%0d.npy"%(positive_file,self.seq_len)

        if os.path.exists(cache_positive):
            positive_examples = np.load(cache_positive)
        else:
            positive_examples = []

            with open(positive_file, 'r') as f:
                line = f.read(self.seq_len)
                while len(line) == self.seq_len:
                    tokens = [int(self.charmap[char]) for char in line]
                    assert len(tokens) == self.seq_len
                    positive_examples.append(tokens)

                    line = f.read(self.seq_len)

            positive_examples = np.array(positive_examples)
            np.save(cache_positive.replace('.npy',''),positive_examples)

        assert positive_examples.shape[1] == self.seq_len

        #choose only num_positive_samples from them
        permut = np.random.permutation(positive_examples.shape[0])[:num_positive_samples]
        positive_examples = positive_examples[permut]

        # CONCAT
        negative_examples = np.array(negative_examples)
        positive_examples = np.array(positive_examples)
        assert negative_examples.shape == positive_examples.shape
        self.sentences = np.concatenate((positive_examples,negative_examples),axis=0)

        # Generate labels
        positive_labels = [[0, 1]] * positive_examples.shape[0]
        negative_labels = [[1, 0]] * negative_examples.shape[0]
        self.labels = np.concatenate([positive_labels, negative_labels], 0)

        # # Shuffle the data
        # print "DISC shuffling data..."
        # shuffle_indices = np.random.permutation(self.sentences.shape[0])
        # self.sentences = self.sentences[shuffle_indices]
        # self.labels = self.labels[shuffle_indices]

        # Split batches
        self.num_batch = int(len(self.labels) / self.batch_size)
        self.sentences = self.sentences[:self.num_batch * self.batch_size]
        self.labels = self.labels[:self.num_batch * self.batch_size]
        self.sentences_batches = np.split(self.sentences, self.num_batch, 0)
        self.labels_batches = np.split(self.labels, self.num_batch, 0)

        self.pointer = 0

        print("done create_batches - [num_batch=%0d]"%self.num_batch)
def train_and_test(train_set,
                   test_set,
                   pipeline,
                   le,
                   srp_dict=None,
                   save_cls=False,
                   out_folder=None,
                   data_aug=True):

    # do flip based data augmentation
    if data_aug:
        if srp_dict is not None:
            train_set = do_data_augmentation(train_set, srp_dict['res'],
                                             srp_dict['nsegs'])

    # check until which column features are stored
    i_max = 1
    for i, col in enumerate(train_set.columns):
        if 'feat' in col:
            i_max = i + 1

    # split the dataframe to get features and append the transformed labels
    data_train = np.split(train_set.to_numpy(), [i_max], axis=1)
    data_train[1] = le.transform(train_set["Class"])

    data_test = np.split(test_set.to_numpy(), [i_max], axis=1)
    data_test[1] = le.transform(test_set["Class"])

    # fit the classifier and predict on the test set
    pipeline.fit(data_train[0], data_train[1])
    test_predicted = pipeline.predict(data_test[0])

    accuracy_score = skl.metrics.accuracy_score(data_test[1], test_predicted)

    # extract confusion matrix and metrics
    conf_mat = skl.metrics.confusion_matrix(data_test[1],
                                            test_predicted,
                                            labels=le.transform(le.classes_))

    if save_cls:
        if out_folder is None:
            save_dir = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                'saved_classifier')
            os.makedirs(save_dir, exist_ok=True)
        else:
            save_dir = os.path.join(out_folder, 'saved_classifier/')
            os.makedirs(save_dir, exist_ok=True)

        print("Saving Classifier to {} ... ".format(save_dir))

        locs_in_train = train_set["Environment"].unique()
        save_string = "_".join(locs_in_train)

        pickle.dump(
            (pipeline),
            open(os.path.join(*[save_dir, save_string + '_classifier.obj']),
                 "wb"))
        test_set = test_set.drop_duplicates(subset=["Recording ID"])
        test_set["ID"].to_csv(
            os.path.join(*[save_dir, save_string + '_test_bags.csv']),
            index=False,
            header=True)

    return accuracy_score, conf_mat
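The two np.split calls in train_and_test cut the feature columns away from the remaining columns at index i_max. The same idea on a toy array standing in for train_set.to_numpy() (shapes are assumptions):

import numpy as np

rows = np.arange(12).reshape(3, 4)            # 3 samples; columns 0-2 are features
features, rest = np.split(rows, [3], axis=1)
print(features.shape, rest.shape)             # (3, 3) (3, 1)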
def main():

    for i in range(2, 12, 2):

        problem = 2
        if problem == 1:
            traindata = np.loadtxt("Data_OneStepAhead/Lazer/train.txt")
            testdata = np.loadtxt("Data_OneStepAhead/Lazer/test.txt")  #
            name = "Lazer"
        if problem == 2:
            traindata = np.loadtxt("Data_OneStepAhead/Sunspot/train.txt")
            testdata = np.loadtxt("Data_OneStepAhead/Sunspot/test.txt")  #
            name = "Sunspot"
        if problem == 3:
            traindata = np.loadtxt("Data_OneStepAhead/Mackey/train.txt")
            testdata = np.loadtxt("Data_OneStepAhead/Mackey/test.txt")  #
            name = "Mackey"
        if problem == 4:
            traindata = np.loadtxt("Data_OneStepAhead/Lorenz/train.txt")
            testdata = np.loadtxt("Data_OneStepAhead/Lorenz/test.txt")  #
            name = "Lorenz"
        if problem == 5:
            traindata = np.loadtxt("Data_OneStepAhead/Rossler/train.txt")
            testdata = np.loadtxt("Data_OneStepAhead/Rossler/test.txt")  #
            name = "Rossler"
        if problem == 6:
            traindata = np.loadtxt("Data_OneStepAhead/Henon/train.txt")
            testdata = np.loadtxt("Data_OneStepAhead/Henon/test.txt")  #
            name = "Henon"
        if problem == 7:
            traindata = np.loadtxt("Data_OneStepAhead/ACFinance/train.txt")
            testdata = np.loadtxt("Data_OneStepAhead/ACFinance/test.txt")  #
            name = "ACFinance"

        ###############################
        #THESE ARE THE HYPERPARAMETERS#
        ###############################

        hidden = 5
        ip = 4  #input
        output = 1
        topology = [ip, hidden, output]

        NumSample = 100000

        ###############################
        #THESE ARE THE HYPERPARAMETERS#
        ###############################
        topology = [ip, hidden, output]

        netw = topology

        print(traindata)

        y_test = testdata[:, netw[0]]
        y_train = traindata[:, netw[0]]

        maxtemp = i

        num_chains = 10
        swap_interval = 100000  # int(swap_ratio * (NumSample/num_chains)): how often neighbouring chains attempt a swap; if it exceeds NumSample, swapping is effectively off
        burn_in = 0.6

        learn_rate = 0.01  # used only when Langevin gradients are enabled; a small value worked well

        use_langevin_gradients = False  # False keeps random-walk proposals; Langevin gradients cost a bit more compute

        problemfolder = '/home/rohit/Desktop/PT/PT_TimeSeriesResults_evalmaxtemp_/'  # change this to your directory for results output - produces large datasets

        problemfolder_db = 'PT_TimeSeriesResults_evalmaxtemp_/'  # save main results

        filename = ""
        run_nb = 0
        while os.path.exists(problemfolder + name + '_%s' % (run_nb)):
            run_nb += 1
        if not os.path.exists(problemfolder + name + '_%s' % (run_nb)):
            os.makedirs(problemfolder + name + '_%s' % (run_nb))
            path = (problemfolder + name + '_%s' % (run_nb))

        filename = ""
        run_nb = 0
        while os.path.exists(problemfolder_db + name + '_%s' % (run_nb)):
            run_nb += 1
        if not os.path.exists(problemfolder_db + name + '_%s' % (run_nb)):
            os.makedirs(problemfolder_db + name + '_%s' % (run_nb))
            path_db = (problemfolder_db + name + '_%s' % (run_nb))

        resultingfile = open(path + '/master_result_file.txt', 'a+')

        resultingfile_db = open(path_db + '/master_result_file.txt', 'a+')

        timer = time.time()

        pt = ParallelTempering(use_langevin_gradients, learn_rate, traindata,
                               testdata, topology, num_chains, maxtemp,
                               NumSample, swap_interval, path)

        directories = [
            path + '/predictions/', path + '/posterior', path + '/results',
            path + '/surrogate', path + '/surrogate/learnsurrogate_data',
            path + '/posterior/pos_w', path + '/posterior/pos_likelihood',
            path + '/posterior/surg_likelihood',
            path + '/posterior/accept_list'
        ]

        for d in directories:
            pt.make_directory((filename) + d)

        pt.initialize_chains(burn_in)

        pos_w, fx_train, fx_test, rmse_train, rmse_test, acc_train, acc_test, likelihood_rep, swap_perc, accept_vec, accept = pt.run_chains(
        )

        list_end = accept_vec.shape[1]
        accept_ratio = accept_vec[:, list_end - 1:list_end] / list_end
        accept_per = np.mean(accept_ratio) * 100

        print(accept_per, ' accept_per')

        timer2 = time.time()

        timetotal = (timer2 - timer) / 60
        print((timetotal), 'min taken')

        #PLOTS
        '''acc_tr = np.mean(acc_train [:])
		acctr_std = np.std(acc_train[:]) 
		acctr_max = np.amax(acc_train[:])

		acc_tes = np.mean(acc_test[:])
		acctest_std = np.std(acc_test[:]) 
		acctes_max = np.amax(acc_test[:])'''

        rmse_tr = np.mean(rmse_train[:])
        rmsetr_std = np.std(rmse_train[:])
        rmsetr_max = np.amin(rmse_train[:])

        rmse_tes = np.mean(rmse_test[:])
        rmsetest_std = np.std(rmse_test[:])
        rmsetes_max = np.amin(rmse_test[:])

        outres = open(path + '/result.txt', "a+")
        np.savetxt(outres, (use_langevin_gradients, learn_rate, rmse_tr,
                            rmsetr_std, rmsetr_max, rmse_tes, rmsetest_std,
                            rmsetes_max, swap_perc, accept_per, timetotal),
                   fmt='%1.5f')
        print(rmse_tr, rmsetr_max, rmse_tes, rmsetes_max)
        np.savetxt(
            resultingfile,
            (NumSample, maxtemp, swap_interval, num_chains, rmse_tr,
             rmsetr_std, rmsetr_max, rmse_tes, rmsetest_std, rmsetes_max),
            fmt='%1.5f')

        outres_db = open(path_db + '/result.txt', "a+")
        np.savetxt(outres_db, (use_langevin_gradients, learn_rate, rmse_tr,
                               rmsetr_std, rmsetr_max, rmse_tes, rmsetest_std,
                               rmsetes_max, swap_perc, accept_per, timetotal),
                   fmt='%1.5f')
        np.savetxt(
            resultingfile_db,
            (NumSample, maxtemp, swap_interval, num_chains, rmse_tr,
             rmsetr_std, rmsetr_max, rmse_tes, rmsetest_std, rmsetes_max),
            fmt='%1.5f')

        x = np.linspace(0, rmse_train.shape[0], num=rmse_train.shape[0])

        plt.plot(x, rmse_train, label='Train')
        plt.plot(x, rmse_test, label='Test')
        plt.legend(loc='upper right')

        plt.title("Plot of RMSE over time")
        plt.savefig(path + '/acc_samples.png')
        plt.clf()

        plt.plot(rmse_train, label='Train')
        plt.plot(rmse_test, label='Test')
        plt.legend(loc='upper right')

        plt.title("Plot of RMSE over time")
        plt.savefig(path_db + '/acc_samples.png')
        plt.clf()
        '''rmse_train =  np.split(rmse_train, num_chains)
		print(rmse_train.T, ' rmse_tr -- ')

		rmse_test = np.asarray(np.split(rmse_test, num_chains))


		plt.plot( rmse_train.T,  label='Test')
		plt.plot( rmse_test.T,   label='Train') 
		plt.legend(loc='upper right')

		plt.title("Accuracy -  sampling  time")
		plt.savefig(path_db+'/rmse_samples.png') 
		plt.clf()'''

        likelihood = likelihood_rep[:, 0]  # just plot proposed likelihood
        likelihood = np.asarray(np.split(likelihood, num_chains))

        #print(likelihood, ' rmse_tr -- ')

        # Plots
        plt.plot(likelihood.T)
        plt.savefig(path + '/likelihood.png')
        plt.clf()

        plt.plot(likelihood.T)
        plt.savefig(path_db + '/likelihood.png')
        plt.clf()

        plt.plot(accept_vec.T)
        plt.savefig(path_db + '/accept.png')
        plt.clf()

        #mpl_fig = plt.figure()
        #ax = mpl_fig.add_subplot(111)

        # ax.boxplot(pos_w)

        # ax.set_xlabel('[W1] [B1] [W2] [B2]')
        # ax.set_ylabel('Posterior')

        # plt.legend(loc='upper right')

        # plt.title("Boxplot of Posterior W (weights and biases)")
        # plt.savefig(path+'/w_pos.png')
        # plt.savefig(path+'/w_pos.svg', format='svg', dpi=600)

        # plt.clf()
        #dir()
        gc.collect()
        outres.close()
        resultingfile.close()
        resultingfile_db.close()
        outres_db.close()
Example #47
0
def agg_by_coords(pts, sigs, aggfunc='mean'):
    df = DataFrame(np.hstack(
        (pts, sigs))).groupby([0, 1]).agg(aggfunc).reset_index().values
    return np.split(df, [2], axis=1)
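A hedged usage sketch for agg_by_coords, assuming the function above is in scope and pandas/NumPy are imported as in the snippet: duplicate (x, y) points are averaged, and the coordinates and signals come back as two separate arrays.

import numpy as np
from pandas import DataFrame

pts = np.array([[0, 0], [0, 0], [1, 2]])
sigs = np.array([[1.0], [3.0], [5.0]])
coords, values = agg_by_coords(pts, sigs)
print(coords)  # rows (0, 0) and (1, 2)
print(values)  # aggregated means: 2.0 and 5.0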
Example #48
0
def consecutive(data, step_size=1):
    """
    Identify groups of consecutive integers, split them into separate arrays.
    """
    return np.split(data, np.where(np.diff(data) != step_size)[0] + 1)
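A usage example for consecutive, which packages a common np.split idiom: cut wherever the gap between neighbours differs from step_size.

import numpy as np

data = np.array([1, 2, 3, 7, 8, 20])
print(consecutive(data))
# [array([1, 2, 3]), array([7, 8]), array([20])]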
Example #49
0
    n_hashes=4,
    ff_chunks=10,
    lsh_dropout=0.1,
    weight_tie=True,
    causal=True,
    use_full_attn=False  # set this to true for comparison with full attention
)

model = TrainingWrapper(model)
model.cuda()

# prepare enwik8 data

with gzip.open('./data/enwik8.gz') as file:
    X = np.frombuffer(file.read(int(95e6)), dtype=np.uint8).copy()  # frombuffer (fromstring is deprecated for binary data)
    trX, vaX = np.split(X, [int(90e6)])
    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)


class TextSamplerDataset(Dataset):
    def __init__(self, data, seq_len):
        super().__init__()
        self.data = data
        self.seq_len = seq_len

    def __getitem__(self, index):
        rand_start = torch.randint(0,
                                   self.data.size(0) - self.seq_len - 1, (1, ))
        full_seq = self.data[rand_start:rand_start + self.seq_len + 1].long()
        return full_seq.cuda()
Example #50
0
    # print 'x = \n', x
    # print 'y = \n', y
    # le = preprocessing.LabelEncoder()
    # le.fit(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
    # print le.classes_
    # y = le.transform(y)
    # print 'Last Version, y = \n', y
    #
    # # Path: float data, comma-delimited; column 4 is parsed separately with the iris_type converter
    data = np.loadtxt(path,
                      dtype=float,
                      delimiter=',',
                      converters={4: iris_type})
    print(data)
    # Columns 0-3 form x; column 4 gives y
    x, y = np.split(data, (4, ), axis=1)

    # For visualization, use only the first two feature columns
    x = x[:, :2]
    #
    # print (x)
    # print (y)
    #
    # x = StandardScaler().fit_transform(x)
    # lr = LogisticRegression()   # Logistic回归模型
    #     lr.fit(x, y.ravel())        # 根据数据[x,y],计算回归参数
    #
    # Equivalent formulation
    lr = Pipeline([('sc', StandardScaler()), ('clf', LogisticRegression())])
    lr.fit(x, y.ravel())
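The key line here is np.split(data, (4,), axis=1), which separates the first four feature columns from the label column. In isolation, with a fake two-row array:

import numpy as np

data = np.array([[5.1, 3.5, 1.4, 0.2, 0.0],
                 [7.0, 3.2, 4.7, 1.4, 1.0]])
x, y = np.split(data, (4,), axis=1)
print(x.shape, y.shape)  # (2, 4) (2, 1)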
Example #51
0
def disc_d_state_input_expm(
    A: np.ndarray,
    B: np.ndarray,
    dA: np.ndarray,
    dB: np.ndarray,
    dt: float = 1.0,
    order_hold: int = 0,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Discretize the state and input matrices, and their derivatives with the matrix exponential

    Args:
        A: State matrix
        B: Input matrix
        dA: Derivative state matrix
        dB: Derivative input matrix
        dt: Sampling time
        order_hold: zero order hold = 0 or first order hold = 1

    Returns:
        6-elements tuple containing
            - Ad: Discrete state matrix
            - B0d: Discrete input matrix (zero order hold)
            - B1d: Discrete input matrix (first order hold)
            - dAd: Derivative discrete state matrix
            - dB0d: Derivative discrete input matrix (zero order hold)
            - dB1d: Derivative discrete input matrix (first order hold)
    """
    nj, nx, nu = dB.shape

    if order_hold == 0:
        F = np.zeros((nx + nu, nx + nu))
        dF = np.zeros((nj, nx + nu, nx + nu))
        dFd = np.zeros((nj, nx + nu, nx + nu))

        F[:nx, :nx] = A
        F[:nx, nx:] = B
        dF[:, :nx, :nx] = dA
        dF[:, :nx, nx:] = dB

        for n in range(nj):
            if dF[n].any() or n == 0:
                Fd, dFd[n] = expm_frechet(F * dt, dF[n] * dt)

        Ad, B0d = np.split(Fd[:nx, :], indices_or_sections=[nx], axis=1)
        dAd, dB0d = np.split(dFd[:, :nx, :], indices_or_sections=[nx], axis=2)
        B1d = np.zeros((nx, nu))
        dB1d = np.zeros((nj, nx, nu))
    else:
        F = np.zeros((nx + 2 * nu, nx + 2 * nu))
        dF = np.zeros((nj, nx + 2 * nu, nx + 2 * nu))
        dFd = np.zeros((nj, nx + 2 * nu, nx + 2 * nu))

        F[:nx, :nx] = A
        F[:nx, nx : nx + nu] = B
        F[nx : nx + nu, nx + nu :] = np.eye(nu)
        dF[:, :nx, :nx] = dA
        dF[:, :nx, nx : nx + nu] = dB

        for n in range(nj):
            if dF[n].any() or n == 0:
                Fd, dFd[n] = expm_frechet(F * dt, dF[n] * dt)

        Ad, B0d, B1d = np.split(Fd[:nx, :], indices_or_sections=[nx, nx + nu], axis=1)
        dAd, dB0d, dB1d = np.split(dFd[:, :nx, :], indices_or_sections=[nx, nx + nu], axis=2)

    return Ad, B0d, B1d, dAd, dB0d, dB1d
Example #52
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    trainer_lib.set_random_seed(FLAGS.random_seed)
    usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

    # Create hparams
    hparams = trainer_lib.create_hparams(FLAGS.hparams_set,
                                         FLAGS.hparams,
                                         data_dir=os.path.expanduser(
                                             FLAGS.data_dir),
                                         problem_name=FLAGS.problem)
    hparams.force_full_predict = True
    hparams.scheduled_sampling_k = -1

    # Params
    num_agents = 1  # TODO(mbz): fix the code for more agents
    num_steps = FLAGS.num_steps
    if hasattr(hparams.problem, "num_actions"):
        num_actions = hparams.problem.num_actions
    else:
        num_actions = None
    frame_shape = hparams.problem.frame_shape
    resized_frame = hparams.preprocess_resize_frames is not None
    if resized_frame:
        frame_shape = hparams.preprocess_resize_frames
        frame_shape += [hparams.problem.num_channels]

    dataset = registry.problem(FLAGS.problem).dataset(
        tf_estimator.ModeKeys.TRAIN,
        shuffle_files=True,
        data_dir=os.path.expanduser(FLAGS.data_dir),
        hparams=hparams)

    dataset = dataset.batch(num_agents, drop_remainder=True)
    data = dataset.make_one_shot_iterator().get_next()
    # Setup input placeholders
    input_size = [num_agents, hparams.video_num_input_frames]
    if num_actions is None:
        placeholders = {
            "inputs": tf.placeholder(tf.float32, input_size + frame_shape)
        }
    else:
        placeholders = {
            "inputs": tf.placeholder(tf.float32, input_size + frame_shape),
            "input_action": tf.placeholder(tf.int64, input_size + [1]),
            "input_reward": tf.placeholder(tf.int64, input_size + [1]),
            "reset_internal_states": tf.placeholder(tf.float32, []),
        }
    # Create model.
    model_cls = registry.model(FLAGS.model)
    model = model_cls(hparams, tf_estimator.ModeKeys.PREDICT)
    prediction_ops = model.infer(placeholders)

    states_q = Queue(maxsize=hparams.video_num_input_frames)
    actions_q = Queue(maxsize=hparams.video_num_input_frames)
    rewards_q = Queue(maxsize=hparams.video_num_input_frames)
    if num_actions is not None:
        all_qs = [states_q, actions_q, rewards_q]
    else:
        all_qs = [states_q]

    writer = common_video.WholeVideoWriter(fps=FLAGS.fps,
                                           output_path=FLAGS.output_gif)

    saver = tf.train.Saver(tf.trainable_variables())
    with tf.train.SingularMonitoredSession() as sess:
        # Load latest checkpoint
        ckpt = tf.train.get_checkpoint_state(
            FLAGS.output_dir).model_checkpoint_path
        saver.restore(sess.raw_session(), ckpt)

        # get init frames from the dataset
        data_np = sess.run(data)

        frames = np.split(data_np["inputs"], hparams.video_num_input_frames, 1)
        for frame in frames:
            frame = np.squeeze(frame, 1)
            states_q.put(frame)
            writer.write(frame[0].astype(np.uint8))

        if num_actions is not None:
            actions = np.split(data_np["input_action"],
                               hparams.video_num_input_frames, 1)
            for action in actions:
                actions_q.put(np.squeeze(action, 1))

            rewards = np.split(data_np["input_reward"],
                               hparams.video_num_input_frames, 1)
            for reward in rewards:
                rewards_q.put(np.squeeze(reward, 1))

        for step in range(num_steps):
            print(">>>>>>> ", step)

            if num_actions is not None:
                random_actions = np.random.randint(num_actions - 1)
                random_actions = np.expand_dims(random_actions, 0)
                random_actions = np.tile(random_actions, (num_agents, 1))

                # Shape inputs and targets
                inputs, input_action, input_reward = (np.stack(list(q.queue),
                                                               axis=1)
                                                      for q in all_qs)
            else:
                assert len(all_qs) == 1
                q = all_qs[0]
                elems = list(q.queue)
                # Need to adjust shapes sometimes.
                for i, e in enumerate(elems):
                    if len(e.shape) < 4:
                        elems[i] = np.expand_dims(e, axis=0)
                inputs = np.stack(elems, axis=1)

            # Predict next frames
            if num_actions is None:
                feed = {placeholders["inputs"]: inputs}
            else:
                feed = {
                    placeholders["inputs"]: inputs,
                    placeholders["input_action"]: input_action,
                    placeholders["input_reward"]: input_reward,
                    placeholders["reset_internal_states"]: float(step == 0),
                }
            predictions = sess.run(prediction_ops, feed_dict=feed)

            if num_actions is None:
                predicted_states = predictions[:, 0]
            else:
                predicted_states = predictions["targets"][:, 0]
                predicted_reward = predictions["target_reward"][:, 0]

            # Update queues
            if num_actions is None:
                new_data = (predicted_states)
            else:
                new_data = (predicted_states, random_actions, predicted_reward)
            for q, d in zip(all_qs, new_data):
                q.get()
                q.put(d.copy())

            writer.write(np.round(predicted_states[0]).astype(np.uint8))

        writer.finish_to_disk()
Example #53
0
def resample(*arrays, **options):
    """Resample arrays or sparse matrices in a consistent way

    The default strategy implements one step of the bootstrapping
    procedure.

    Parameters
    ----------
    *arrays : sequence of indexable data-structures
        Indexable data-structures can be arrays, lists, dataframes or scipy
        sparse matrices with consistent first dimension.

    Other Parameters
    ----------------
    replace : boolean, True by default
        Implements resampling with replacement. If False, this will implement
        (sliced) random permutations.

    n_samples : int, None by default
        Number of samples to generate. If left to None this is
        automatically set to the first dimension of the arrays.
        If replace is False it should not be larger than the length of
        arrays.

    random_state : int, RandomState instance or None, optional (default=None)
        Determines random number generation for shuffling
        the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    stratify : array-like or None (default=None)
        If not None, data is split in a stratified fashion, using this as
        the class labels.

    Returns
    -------
    resampled_arrays : sequence of indexable data-structures
        Sequence of resampled copies of the collections. The original arrays
        are not impacted.

    Examples
    --------
    It is possible to mix sparse and dense arrays in the same run::

      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
      >>> y = np.array([0, 1, 2])

      >>> from scipy.sparse import coo_matrix
      >>> X_sparse = coo_matrix(X)

      >>> from sklearn.utils import resample
      >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
      >>> X
      array([[1., 0.],
             [2., 1.],
             [1., 0.]])

      >>> X_sparse
      <3x2 sparse matrix of type '<... 'numpy.float64'>'
          with 4 stored elements in Compressed Sparse Row format>

      >>> X_sparse.toarray()
      array([[1., 0.],
             [2., 1.],
             [1., 0.]])

      >>> y
      array([0, 1, 0])

      >>> resample(y, n_samples=2, random_state=0)
      array([0, 1])

    Example using stratification::

      >>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1]
      >>> resample(y, n_samples=5, replace=False, stratify=y,
      ...          random_state=0)
      [1, 1, 1, 0, 1]


    See also
    --------
    :func:`sklearn.utils.shuffle`
    """

    random_state = check_random_state(options.pop('random_state', None))
    replace = options.pop('replace', True)
    max_n_samples = options.pop('n_samples', None)
    stratify = options.pop('stratify', None)
    if options:
        raise ValueError("Unexpected kw arguments: %r" % options.keys())

    if len(arrays) == 0:
        return None

    first = arrays[0]
    n_samples = first.shape[0] if hasattr(first, 'shape') else len(first)

    if max_n_samples is None:
        max_n_samples = n_samples
    elif (max_n_samples > n_samples) and (not replace):
        raise ValueError("Cannot sample %d out of arrays with dim %d "
                         "when replace is False" % (max_n_samples,
                                                    n_samples))

    check_consistent_length(*arrays)

    if stratify is None:
        if replace:
            indices = random_state.randint(0, n_samples, size=(max_n_samples,))
        else:
            indices = np.arange(n_samples)
            random_state.shuffle(indices)
            indices = indices[:max_n_samples]
    else:
        # Code adapted from StratifiedShuffleSplit()
        y = check_array(stratify, ensure_2d=False, dtype=None)
        if y.ndim == 2:
            # for multi-label y, map each distinct row to a string repr
            # using join because str(row) uses an ellipsis if len(row) > 1000
            y = np.array([' '.join(row.astype('str')) for row in y])

        classes, y_indices = np.unique(y, return_inverse=True)
        n_classes = classes.shape[0]

        class_counts = np.bincount(y_indices)

        # Find the sorted list of instances for each class:
        # (np.unique above performs a sort, so code is O(n logn) already)
        class_indices = np.split(np.argsort(y_indices, kind='mergesort'),
                                 np.cumsum(class_counts)[:-1])

        n_i = _approximate_mode(class_counts, max_n_samples, random_state)

        indices = []

        for i in range(n_classes):
            indices_i = random_state.choice(class_indices[i], n_i[i],
                                            replace=replace)
            indices.extend(indices_i)

        indices = random_state.permutation(indices)


    # convert sparse matrices to CSR for row-based indexing
    arrays = [a.tocsr() if issparse(a) else a for a in arrays]
    resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
    if len(resampled_arrays) == 1:
        # syntactic sugar for the unit argument case
        return resampled_arrays[0]
    else:
        return resampled_arrays
Example #54
0
def split_image_into_128(img):
    arr = np.array(np.split(img, 16))
    arr = np.array(np.split(arr, 16, -1))
    arr = arr.reshape((-1, 128,128))[..., None]
    return arr
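split_image_into_128 assumes a square image whose sides are 16 * 128 = 2048 pixels; the nested np.split calls tile it into 256 patches of 128 x 128 with a trailing channel axis. A sketch with a blank image:

import numpy as np

img = np.zeros((2048, 2048), dtype=np.uint8)
patches = split_image_into_128(img)
print(patches.shape)  # (256, 128, 128, 1)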
def main(config_path, sigma_in):
    # hyper-parameter
    with open(config_path, 'r') as f:
        cfg = yaml.safe_load(f)

    cfg['MODEL']['ALPHA'] = 0.075
    cfg['DATALOADER']['TIME_LENGTH'] = 200
    cfg['DATALOADER']['SIGNAL_LENGTH'] = 50
    cfg['DATALOADER']['VARIABLE_DELAY'] = 15

    model_name = os.path.splitext(os.path.basename(config_path))[0]

    os.makedirs('../results/', exist_ok=True)
    save_path = f'../results/neural_norm/'
    os.makedirs(save_path, exist_ok=True)

    # print('sigma_neu accuracy')
    # performance: for a single trained model, vary sigma_neu^test from 0 to 0.15 and record the accuracy at each value.
    results_norm = []

    # Load the model
    torch.manual_seed(1)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    cfg['MODEL']['SIGMA_NEU'] = 0
    model = RecurrentNeuralNetwork(n_in=1, n_out=2, n_hid=cfg['MODEL']['SIZE'], device=device,
                                   alpha_time_scale=cfg['MODEL']['ALPHA'], beta_time_scale=cfg['MODEL']['BETA'],
                                   activation=cfg['MODEL']['ACTIVATION'],
                                   sigma_neu=cfg['MODEL']['SIGMA_NEU'],
                                   sigma_syn=cfg['MODEL']['SIGMA_SYN'],
                                   use_bias=cfg['MODEL']['USE_BIAS'],
                                   anti_hebbian=cfg['MODEL']['ANTI_HEBB']).to(device)

    model_path = f'../trained_model/freq_schedule/{model_name}/epoch_{cfg["TRAIN"]["NUM_EPOCH"]}.pth'
    model.load_state_dict(torch.load(model_path, map_location=device))

    model.eval()

    correct = 0
    num_data = 0
    # print('delta correct_rate')
    for delta_idx in range(50):
        while True:
            delta = np.random.rand() * 8 - 4
            if abs(delta) >= 1:
                break
        N = 500
        output_list = np.zeros(N)
        input_signal = romo_signal(delta, N, cfg['DATALOADER']['TIME_LENGTH'],
                                   cfg['DATALOADER']['SIGNAL_LENGTH'], sigma_in, cfg['MODEL']['ALPHA'])
        input_signal_split = np.split(input_signal, 10)
        for i in range(10):
            hidden = torch.zeros(50, model.n_hid)
            hidden = hidden.to(device)
            inputs = torch.from_numpy(input_signal_split[i]).float()
            inputs = inputs.to(device)
            hidden_list, outputs, _, _ = model(inputs, hidden)
            outputs_np = outputs.cpu().detach().numpy()
            output_list[i * 50: (i + 1) * 50] = np.argmax(outputs_np[:, -1], axis=1)
            results_norm.append(np.linalg.norm(hidden_list.cpu().detach().numpy()[:, :, :]))
        num_data += 500
        if delta > 0:
            ans = 1
        else:
            ans = 0
        correct += (output_list == ans).sum()
        if delta_idx % 10 == 0:
            print(f'{np.mean(results_norm):.4f}')

        # print(f'{delta:.3f}', (output_list == ans).sum() / 200)

    # print(cfg['MODEL']['SIGMA_NEU'], correct / num_data)
    print(np.mean(results_norm), np.std(results_norm))

    np.savetxt(os.path.join(save_path, f'{model_name}.txt'), np.array([np.mean(results_norm), np.std(results_norm)]))
Example #56
0
                        visbiasinc = np.zeros([W.shape[0], K])

                        grad = np.zeros(
                            W.shape)  # gradient tracker for learning

                        for epoch in range(1, epochs + 1):
                            # in each epoch, we'll visit all users in a random order
                            visitingOrder = np.array(trStats["u_users"])
                            np.random.shuffle(visitingOrder)

                            # | Extension | Adaptive Learning Rate
                            adaptiveLearningRate = alpha / epoch**2

                            # | Extension | Mini Batch
                            # numBatches = np.ceil(visitingOrder.shape[0]/B)
                            batches = np.split(visitingOrder, B)

                            for batch in batches:

                                prevGrad = grad
                                grad = np.zeros(W.shape)

                                for user in batch:
                                    # get the ratings of that user
                                    ratingsForUser = lib.getRatingsForUser(
                                        user, training)

                                    # build the visible input
                                    v = rbm.getV(ratingsForUser)

                                    # get the weights associated to movies the user has seen
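One caveat for the mini-batch split above: np.split(visitingOrder, B) raises a ValueError unless the number of users divides evenly by B, whereas np.array_split tolerates a remainder. A small sketch of the difference:

import numpy as np

order = np.arange(10)
print([len(b) for b in np.array_split(order, 3)])  # [4, 3, 3]
# np.split(order, 3) would raise: array split does not result in an equal division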
Example #57
0
import numpy as np

data = np.random.sample((8, 8))

a1, a2 = np.split(data, 2, axis=0)
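The same call works along columns by changing the axis, and np.hsplit / np.vsplit are thin wrappers around it. A short follow-up sketch:

import numpy as np

data = np.random.sample((8, 8))
b1, b2 = np.split(data, 2, axis=1)             # column blocks instead of row blocks
print(b1.shape)                                # (8, 4)
print(np.allclose(np.hsplit(data, 2)[0], b1))  # True: hsplit is split along axis=1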
Example #58
0
File: getitem.py Project: Haxine/mars-1
    def tile_with_columns(cls, op):
        in_df = op.inputs[0]
        out_df = op.outputs[0]
        col_names = op.col_names
        if not isinstance(col_names, list):
            column_index = calc_columns_index(col_names, in_df)
            out_chunks = []
            dtype = in_df.dtypes[col_names]
            for i in range(in_df.chunk_shape[0]):
                c = in_df.cix[(i, column_index)]
                op = DataFrameIndex(col_names=col_names)
                out_chunks.append(
                    op.new_chunk([c],
                                 shape=(c.shape[0], ),
                                 index=(i, ),
                                 dtype=dtype,
                                 index_value=c.index_value,
                                 name=col_names))
            new_op = op.copy()
            return new_op.new_seriess(op.inputs,
                                      shape=out_df.shape,
                                      dtype=out_df.dtype,
                                      index_value=out_df.index_value,
                                      name=out_df.name,
                                      nsplits=(in_df.nsplits[0], ),
                                      chunks=out_chunks)
        else:
            # combine columns into one chunk and keep the columns order at the same time.
            # When chunk columns are ['c1', 'c2', 'c3'], ['c4', 'c5'],
            # selected columns are ['c2', 'c3', 'c4', 'c2'], `column_splits` will be
            # [(['c2', 'c3'], 0), ('c4', 1), ('c2', 0)].
            selected_index = [
                calc_columns_index(col, in_df) for col in col_names
            ]
            condition = np.where(np.diff(selected_index))[0] + 1
            column_splits = np.split(col_names, condition)
            column_indexes = np.split(selected_index, condition)

            out_chunks = [[] for _ in range(in_df.chunk_shape[0])]
            column_nsplits = []
            for i, (columns,
                    column_idx) in enumerate(zip(column_splits,
                                                 column_indexes)):
                dtypes = in_df.dtypes[columns]
                column_nsplits.append(len(columns))
                for j in range(in_df.chunk_shape[0]):
                    c = in_df.cix[(j, column_idx[0])]
                    index_op = DataFrameIndex(col_names=list(columns),
                                              object_type=ObjectType.dataframe)
                    out_chunk = index_op.new_chunk(
                        [c],
                        shape=(c.shape[0], len(columns)),
                        index=(j, i),
                        dtypes=dtypes,
                        index_value=c.index_value,
                        columns_value=parse_index(pd.Index(columns),
                                                  store_data=True))
                    out_chunks[j].append(out_chunk)
            out_chunks = [item for l in out_chunks for item in l]
            new_op = op.copy()
            nsplits = (in_df.nsplits[0], tuple(column_nsplits))
            return new_op.new_dataframes(op.inputs,
                                         shape=out_df.shape,
                                         dtypes=out_df.dtypes,
                                         index_value=out_df.index_value,
                                         columns_value=out_df.columns,
                                         chunks=out_chunks,
                                         nsplits=nsplits)
Example #59
0
    def read(self):
        fname_template = osp.join(self.path, "{}_{{}}.txt".format(self.name))
        available = [
            f.split(os.sep)[-1][len(self.name) + 1:-4]  # Remove leading name
            for f in glob.glob(fname_template.format("*"))
        ]

        # Batch index
        node_batch_index = (
            io.load_txt(fname_template.format("graph_indicator")).astype(int) -
            1)
        n_nodes = np.bincount(node_batch_index)
        n_nodes_cum = np.concatenate(([0], np.cumsum(n_nodes)[:-1]))

        # Read edge lists
        edges = io.load_txt(fname_template.format("A"),
                            delimiter=",").astype(int) - 1
        # Remove duplicates and self-loops from edges
        _, mask = np.unique(edges, axis=0, return_index=True)
        mask = mask[edges[mask, 0] != edges[mask, 1]]
        edges = edges[mask]
        # Split edges into separate edge lists
        edge_batch_idx = node_batch_index[edges[:, 0]]
        n_edges = np.bincount(edge_batch_idx)
        n_edges_cum = np.cumsum(n_edges[:-1])
        el_list = np.split(edges - n_nodes_cum[edge_batch_idx, None],
                           n_edges_cum)

        # Node features
        x_list = []
        if "node_attributes" in available:
            x_attr = io.load_txt(fname_template.format("node_attributes"),
                                 delimiter=",")
            if x_attr.ndim == 1:
                x_attr = x_attr[:, None]
            x_list.append(x_attr)
        if "node_labels" in available:
            x_labs = io.load_txt(fname_template.format("node_labels"))
            if x_labs.ndim == 1:
                x_labs = x_labs[:, None]
            x_labs = np.concatenate(
                [_normalize(xl_[:, None], "ohe") for xl_ in x_labs.T], -1)
            x_list.append(x_labs)
        if len(x_list) > 0:
            x_list = np.concatenate(x_list, -1)
            x_list = np.split(x_list, n_nodes_cum[1:])
        else:
            print("WARNING: this dataset doesn't have node attributes."
                  "Consider creating manual features before using it with a "
                  "Loader.")
            x_list = [None] * len(n_nodes)

        # Edge features
        e_list = []
        if "edge_attributes" in available:
            e_attr = io.load_txt(fname_template.format("edge_attributes"))
            if e_attr.ndim == 1:
                e_attr = e_attr[:, None]
            e_attr = e_attr[mask]
            e_list.append(e_attr)
        if "edge_labels" in available:
            e_labs = io.load_txt(fname_template.format("edge_labels"))
            if e_labs.ndim == 1:
                e_labs = e_labs[:, None]
            e_labs = e_labs[mask]
            e_labs = np.concatenate(
                [_normalize(el_[:, None], "ohe") for el_ in e_labs.T], -1)
            e_list.append(e_labs)
        if len(e_list) > 0:
            e_available = True
            e_list = np.concatenate(e_list, -1)
            e_list = np.split(e_list, n_edges_cum)
        else:
            e_available = False
            e_list = [None] * len(n_nodes)

        # Create sparse adjacency matrices and re-sort edge attributes in lexicographic
        # order
        a_e_list = [
            sparse.edge_index_to_matrix(
                edge_index=el,
                edge_weight=np.ones(el.shape[0]),
                edge_features=e,
                shape=(n, n),
            ) for el, e, n in zip(el_list, e_list, n_nodes)
        ]
        if e_available:
            a_list, e_list = list(zip(*a_e_list))
        else:
            a_list = a_e_list

        # Labels
        if "graph_attributes" in available:
            labels = io.load_txt(fname_template.format("graph_attributes"))
        elif "graph_labels" in available:
            labels = io.load_txt(fname_template.format("graph_labels"))
            labels = _normalize(labels[:, None], "ohe")
        else:
            raise ValueError("No labels available for dataset {}".format(
                self.name))

        # Convert to Graph
        print("Successfully loaded {}.".format(self.name))
        return [
            Graph(x=x, a=a, e=e, y=y)
            for x, a, e, y in zip(x_list, a_list, e_list, labels)
        ]
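The reader above relies on a useful idiom: node-level rows for every graph are stored in one big array and recovered per graph by splitting at the cumulative node counts. A compact sketch, assuming three graphs with 2, 3, and 1 nodes:

import numpy as np

n_nodes = np.array([2, 3, 1])
x_all = np.arange(6 * 2).reshape(6, 2)                        # 6 node rows, 2 features each
n_nodes_cum = np.concatenate(([0], np.cumsum(n_nodes)[:-1]))  # [0, 2, 5]
x_list = np.split(x_all, n_nodes_cum[1:])                     # cut at rows 2 and 5
print([x.shape for x in x_list])                              # [(2, 2), (3, 2), (1, 2)]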
Example #60
0
    def split(self,
              dataset,
              seed=None,
              frac_train=.8,
              frac_valid=.1,
              frac_test=.1,
              log_every_n=None):
        """
    Splits compounds into train/validation/test using stratified sampling.

    Parameters
    ----------
    dataset: dc.data.Dataset object
      Dataset.
    seed: int (Optional, Default None)
      Random seed.
    frac_train: float (Optional, Default .8)
      Fraction of dataset put into training data.
    frac_valid: float (Optional, Default .1)
      Fraction of dataset put into validation data.
    frac_test: float (Optional, Default .1)
      Fraction of dataset put into test data.
    log_every_n: int (Optional, Default None)
      Log every n examples (not currently used).

    Returns
    -------
    retval: Tuple
      Tuple containing train indices, valid indices, and test indices    
    """
        # JSG Assert that split fractions can be written as proper fractions over 10.
        # This can be generalized in the future with some common denominator determination.
        # This will work for 80/20 train/test or 80/10/10 train/valid/test (most use cases).
        np.testing.assert_equal(frac_train + frac_valid + frac_test, 1.)
        np.testing.assert_equal(
            10 * frac_train + 10 * frac_valid + 10 * frac_test, 10.)

        if seed is not None:
            np.random.seed(seed)

        y_s = dataset.y[:, self.task_number]
        sortidx = np.argsort(y_s)

        split_cd = 10
        train_cutoff = int(frac_train * split_cd)
        valid_cutoff = int(frac_valid * split_cd) + train_cutoff
        test_cutoff = int(frac_test * split_cd) + valid_cutoff

        train_idx = np.array([])
        valid_idx = np.array([])
        test_idx = np.array([])

        while sortidx.shape[0] >= split_cd:
            sortidx_split, sortidx = np.split(sortidx, [split_cd])
            shuffled = np.random.permutation(range(split_cd))
            train_idx = np.hstack(
                [train_idx, sortidx_split[shuffled[:train_cutoff]]])
            valid_idx = np.hstack([
                valid_idx, sortidx_split[shuffled[train_cutoff:valid_cutoff]]
            ])
            test_idx = np.hstack(
                [test_idx, sortidx_split[shuffled[valid_cutoff:]]])

        # Append remaining examples to train
        if sortidx.shape[0] > 0: train_idx = np.hstack([train_idx, sortidx])

        return (train_idx, valid_idx, test_idx)