Example #1
def write_12mers(mirname, mirseq, outfile):

    site8 = utils.rev_comp(mirseq[1:8]) + 'A'
    all_12mers = generate_12mers(site8)

    if len(all_12mers) != 262144:
        raise (ValueError("all_12mers should be 262144 in length"))

    with tf.python_io.TFRecordWriter(outfile) as tfwriter:
        for siteseq in all_12mers:

            aligned_stype = utils.get_centered_stype(site8, siteseq)
            if aligned_stype == 'no site':
                keep_prob = 0.001
            else:
                keep_prob = 1.0

            feature_dict = {
                'mir': tf_utils._bytes_feature(mirname.encode('utf-8')),
                'mir_1hot':
                tf_utils._float_feature(utils.one_hot_encode(mirseq)),
                'seq_1hot':
                tf_utils._float_feature(utils.one_hot_encode(siteseq)),
                'log_kd': tf_utils._float_feature([-0.0]),
                'keep_prob': tf_utils._float_feature([keep_prob]),
                'stype':
                tf_utils._bytes_feature(aligned_stype.encode('utf-8')),
            }

            example_proto = tf.train.Example(features=tf.train.Features(
                feature=feature_dict))
            example_proto = example_proto.SerializeToString()

            tfwriter.write(example_proto)
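
For reference, records written in this format could be read back with a parser that mirrors feature_dict. The sketch below is not part of the original project; it assumes TF 1.x (matching the tf.python_io writer above) and uses VarLenFeature for the one-hot arrays because their lengths depend on mirseq and siteseq.

import tensorflow as tf  # TF 1.x assumed, to match tf.python_io above

def parse_12mer_record(serialized):
    # mirror of feature_dict in write_12mers (sketch)
    return tf.parse_single_example(
        serialized,
        features={
            'mir': tf.FixedLenFeature([], tf.string),
            'mir_1hot': tf.VarLenFeature(tf.float32),
            'seq_1hot': tf.VarLenFeature(tf.float32),
            'log_kd': tf.FixedLenFeature([1], tf.float32),
            'keep_prob': tf.FixedLenFeature([1], tf.float32),
            'stype': tf.FixedLenFeature([], tf.string),
        })

# dataset = tf.data.TFRecordDataset(outfile).map(parse_12mer_record)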
Example #2
def load_npz(path, input_shape, label_shape, batch_size=16, onehot_encode=True):
    print("Loading data from {} ...".format(path))
    ds = np.load(path)
    x, y = ds['x'], ds['y']
    x[...,2] = x[...,2]/255
    train_data = x[:int(0.8*x.shape[0])]
    test_data = x[int(0.8*x.shape[0]):]
    
    train_labels = y[:int(0.8*y.shape[0])]

    if onehot_encode:
        train_labels = one_hot_encode(train_labels)
    
    test_labels = y[int(0.8*y.shape[0]):]   
    if onehot_encode:
        test_labels = one_hot_encode(test_labels)
    
    def generator(data, labels, batch_size=batch_size):
        ind = 0
        while True:
            yield data[ind:ind+batch_size], labels[ind:ind+batch_size]
            ind += batch_size  # advance to the next batch
            if ind >= data.shape[0]:  # wrap around at the end of the data
                ind = 0
        
    train_dataset = generator(train_data, train_labels, batch_size)
    test_dataset = generator(test_data, test_labels, batch_size)
    print("Dataset Loaded.")
    print("Train dataset input shape: {} label shape: {}".format(train_data.shape, train_labels.shape))
    print("Test dataset input shape: {} label shape: {}".format(test_data.shape, test_labels.shape))
    
    
    return [train_dataset, test_dataset, train_data.shape[0], test_data.shape[0]]
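
A minimal usage sketch of the loader above; the file name and shape arguments here are placeholders, not values from the original project.

# hypothetical call; 'data.npz' and the shapes are placeholders
train_gen, test_gen, n_train, n_test = load_npz('data.npz',
                                                input_shape=(64, 64, 3),
                                                label_shape=(10,),
                                                batch_size=16)
x_batch, y_batch = next(train_gen)  # one (inputs, labels) batch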
Example #3
    def backward(self, x: np.ndarray, labels: np.ndarray, hs: Dict, ps: Dict):
        """
        Makes backward pass through the network.
        Returns the gradients of the loss w.r.t. the network parameters: w_hx, w_hh, w_hy.

        :param x: the array of input characters, where each item is the index of character, the
                  size of array will be the sequence length
        :param labels: the array of target characters, where each item is the index of character,
                       the size of array will be the sequence length
        :param hs: the hidden states of network, (the first output of the self.forward method)
        :param ps: network predictions for given inputs,
                   (the second output of the self.forward method)
        :return: gradients of w_hx, w_hh, w_hy
        """
        inputs_matrix = one_hot_encode(x, self.vocabulary_size)
        labels_matrix = one_hot_encode(labels, self.vocabulary_size)

        dw_hx = np.zeros_like(self.w_hx)
        dw_hh = np.zeros_like(self.w_hh)
        dw_hy = np.zeros_like(self.w_hy)

        for t in reversed(range(len(x))):
            # dl / dy = p - label
            dy_t = ps[t] - labels_matrix[t]

            # dl / dw_hy = (dl / dy) * (dy / dw_hy)
            dw_hy += np.dot(dy_t, hs[t].T)

            # dl / dh = (dl / dy) * (dy / dh) = (p - label) * w_hy
            dh_t = np.dot(self.w_hy.T, dy_t)

            # dl / dz_{k} = (dl / dh_{k}) * (dh_{k} / dz_{k}) = dh_{t} * (dh_{k} / dz_{k})
            dz_k = dh_t * self.f_prime(hs[t])

            # dl / dw_hh = ∑ (dl / dz_{k}) * (dz_{k} / dw_hh) for all k from 1 to t
            # dl / dw_hx = ∑ (dl / dz_{k}) * (dz_{k} / dw_hx) for all k from 1 to t
            for k in reversed(range(t + 1)):
                # (dl / dz_{k}) (dz_{k} / dw_hh) = dz_k * h_{k-1}
                dw_hh += np.dot(dz_k, hs[k - 1].T)

                # (dl / dz_{k}) * (dz_{k} / dw_hx) = dz_k * x_{k}
                dw_hx += np.dot(dz_k, inputs_matrix[k].T)

                # updating dz_k using all previous derivatives (from t down to k)
                # dl / dz_{k-1} = (dl / dz_{k}) * (dz_{k} / dh_{k-1}) * (dh_{k-1} / dz_{k-1})
                dz_k = np.dot(self.w_hh.T, dz_k) * self.f_prime(hs[k - 1])

        # clip to mitigate exploding gradients
        for d_param in (dw_hx, dw_hh, dw_hy):
            np.clip(d_param, -5, 5, out=d_param)

        return dw_hx, dw_hh, dw_hy
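
For context, a plain SGD update built on the gradients returned by backward might look like the sketch below; the exact signature of the class's forward method is an assumption here (the docstring above only says hs and ps are its two outputs).

def sgd_step(rnn, x, labels, lr=1e-2):
    # assumed forward signature; hs and ps are the two outputs named in the docstring
    hs, ps = rnn.forward(x)
    dw_hx, dw_hh, dw_hy = rnn.backward(x, labels, hs, ps)
    # vanilla gradient descent on the three weight matrices
    rnn.w_hx -= lr * dw_hx
    rnn.w_hh -= lr * dw_hh
    rnn.w_hy -= lr * dw_hy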
Example #4
 def fetch_batch(self,
                 args,
                 mode='train',
                 sample_strategy='random',
                 augment=True):
     n_classes, batch_size, seq_length = args.n_classes, args.batch_size, args.seq_length
     if mode == 'train':
         data = self.train_data
     elif mode == 'test':
         data = self.test_data
     classes = [
         np.random.choice(range(len(data)), replace=False, size=n_classes)
         for _ in range(batch_size)
     ]
     if sample_strategy == 'random':  # #(sample) per class may not be equal (sec 7)
         seq = np.random.randint(0, n_classes, [batch_size, seq_length])
     elif sample_strategy == 'uniform':  # #(sample) per class are equal
         seq = np.array([
             np.concatenate([[j] * int(seq_length / n_classes)
                             for j in range(n_classes)])
             for _ in range(batch_size)
         ])
         for i in range(batch_size):
             np.random.shuffle(seq[i, :])
     seq_pic = [[
         self.augment(data[classes[i][j]][np.random.randint(
             0, len(data[classes[i][j]]))],
                      only_resize=not augment) for j in seq[i, :]
     ] for i in range(batch_size)]
     if args.label_type == 'one_hot':
         seq_encoded = one_hot_encode(seq, n_classes)
         seq_encoded_shifted = np.concatenate([
             np.zeros(shape=[batch_size, 1, n_classes]),
             seq_encoded[:, :-1, :]
         ],
                                              axis=1)
     elif args.label_type == 'five_hot':
         label_dict = [[[
             int(j)
             for j in list(baseN(i, 5)) + [0] * (5 - len(baseN(i, 5)))
         ] for i in np.random.choice(
             range(5**5), replace=False, size=n_classes)]
                       for _ in range(batch_size)]
         seq_encoded_ = np.array([[label_dict[b][i] for i in seq[b]]
                                  for b in range(batch_size)])
         seq_encoded = np.reshape(one_hot_encode(seq_encoded_, dim=5),
                                  newshape=[batch_size, seq_length, -1])
         seq_encoded_shifted = np.concatenate(
             [np.zeros(shape=[batch_size, 1, 25]), seq_encoded[:, :-1, :]],
             axis=1)
     return seq_pic, seq_encoded_shifted, seq_encoded
Example #5
def obtain_data(data_dir, namefile, batch_s):
    # Load the training data.
    train_sample = pd.read_csv(os.path.join(data_dir, namefile), header=None, names=None)
    print('Loaded csv')

    train_sample_y = train_sample[train_sample.columns[0:34]]
    train_sample_len = train_sample[train_sample.columns[34]]
    train_sample_X = train_sample[train_sample.columns[34:69]]
    print('Size read from csv -> X: {}, Y: {}, len: {}'.format(train_sample_X.shape, train_sample_y.shape, train_sample_len.shape))

    X_np = train_sample_X.to_numpy(copy=True)
    len_np = train_sample_len.to_numpy(copy=True)
    Y_np = train_sample_y.to_numpy(copy=True)
    print('To numpied')

    dict_size = 34
    seq_len = 35
    batch_size = len(train_sample_X)
    input_seq = one_hot_encode(X_np, dict_size, seq_len, batch_size)
    print('One hot encoded')

    train_torch_x = torch.from_numpy(input_seq).float().squeeze()
    train_torch_len = torch.from_numpy(len_np).float().squeeze().type(torch.long)
    train_torch_y = torch.from_numpy(Y_np).float().squeeze().type(torch.long)
    print('Torched')

    train_sample_ds = torch.utils.data.TensorDataset(train_torch_x, train_torch_y, train_torch_len)
    train_loader = torch.utils.data.DataLoader(train_sample_ds, batch_size=batch_s)
    print('Train loaded')

    return train_loader
Example #6
 def sample_points(self,
                   n_sample=100,
                   temp=1.0,
                   prime_text="^",
                   maxlen=100):
     valid_mols = []
     print("\n\n----- SAMPLING POINTS AT TEMP %.2f -----" % temp)
     for x in range(n_sample):
         smiles = str()  # final SMILES string will be stored in "smiles"
         seed_token = []
         for t in list(prime_text):  # prepare seed token
             smiles += t
             seed_token += [self.token_indices[t]]
         while smiles[-1] != '$' and len(
                 smiles
         ) < maxlen:  # start sampling chars until maxlen or $ is reached
             x_seed = one_hot_encode([seed_token], self.n_chars)
             preds = self.model.predict(x_seed, verbose=0)[0]
             next_char_ind = transform_temp(preds[-1, :], temp)
             next_char = self.indices_token[str(next_char_ind)]
             smiles += next_char
             seed_token += [next_char_ind]
         val, s = is_valid_mol(smiles, True)
         if val:
             print(s)
             valid_mols.append(s)
     return valid_mols
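
The transform_temp helper is not shown here; a common way to implement this kind of temperature sampling is sketched below (an assumption about the helper, not the project's own code).

import numpy as np

def transform_temp(preds, temp):
    # rescale the predicted distribution by temperature, renormalize,
    # and draw a single character index from it
    preds = np.asarray(preds, dtype=np.float64)
    logits = np.log(preds + 1e-12) / temp
    probs = np.exp(logits) / np.sum(np.exp(logits))
    return int(np.random.choice(len(probs), p=probs))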
Example #7
    def __getitem__(self, index):

        if self.mode.lower() == 'train':
            data_path, label_path = self.train_data[index], self.train_labels[
                index]

        elif self.mode.lower() == 'valid':
            data_path, label_path = self.valid_data[index], self.valid_labels[
                index]

        elif self.mode.lower() == 'test':
            data_path, label_path = self.test_data[index], self.test_labels[
                index]

        else:
            raise RuntimeError(
                'Unexpected dataset mode. Supported modes are: train, valid and test'
            )

        image, label = utils.pil_loader(data_path, label_path)

        if self.data_transform is not None:
            image = self.data_transform(image)

        if self.label_transform is not None:
            label = self.label_transform(label)

        # perform one-hot-encoding
        target = utils.one_hot_encode(label)
        target = torch.FloatTensor(target)

        return image, label, target
Example #8
def train_model(model, vocab, train_dl, learning_rate=0.003, num_epochs=5):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    total_step = len(train_dl)
    for epoch in range(num_epochs):
        for i, batch in enumerate(train_dl):
            sequences, lengths, functions = batch['sequence'], batch['length'], batch['function']
            sequences = pad_char_sequences(sequences)
            sequences = one_hot_encode(sequences, vocab).float()
            sequences, lengths, functions = sequences.to(device), lengths.cpu(), functions.to(device)

            output = model(sequences, lengths)
            loss = criterion(output, functions)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (i + 1) % 100 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                      .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
            if (i + 1) % 1000 == 0:
                torch.save(model.state_dict(), models_dir + 'model_{}.ckpt'.format(time.strftime("%Y%m%d-%H%M%S")))
                print("Saved model state to disk")
    torch.save(model.state_dict(), models_dir + 'model_{}.ckpt'.format(time.strftime("%Y%m%d-%H%M%S")))

    return model
Example #9
    def __init__(self,
                 data_type='CT',
                 train_ratio=None,
                 fold_k=None,
                 norm=None,
                 expand_dim=None,
                 seed=233):
        self.seed = seed
        data = np.load(datapaths[data_type])
        self.data = np.concatenate([data["x_train"], data["x_test"]])
        self.labels = np.concatenate([data["y_train"], data["y_test"]])
        self.class_num = np.max(self.labels) + 1
        self.labels = one_hot_encode(self.labels, self.class_num)
        del data

        self.divide_train_test(train_ratio, fold_k)

        if norm is not None:
            self.data = self.data / norm  # [0,1]

        self.data = self.data[:, :, :2]

        self.train_cur_pos, self.test_cur_pos = 0, 0

        self.expand_dim = expand_dim
Example #10
def test_model(model, vocab, test_ds, test_dl) -> None:
    """

    @rtype: NoneType
    """
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for batch in test_dl:
            sequences, lengths, functions = batch['sequence'], batch[
                'length'], batch['function']
            sequences = pad_char_sequences(sequences)
            sequences = one_hot_encode(sequences, vocab)
            sequences, lengths, functions = sequences.to(
                device), lengths.cpu(), functions.to(device)

            output = model(sequences, lengths)

            _, predicted = torch.max(output, 1)
            total += len(functions)
            correct += (predicted == functions).sum().item()

        print(
            'Test Accuracy of the model on the {} test sequences: {} %'.format(
                len(test_ds), 100 * correct / total))
Example #11
        def __init__(self, data, logit, dequantize, rng):

            x = self._dequantize(
                data[0], rng) if dequantize else data[0]  # dequantize pixels
            self.x = self._logit_transform(x) if logit else x  # logit
            self.labels = data[1]  # numeric labels
            self.y = utils.one_hot_encode(self.labels,
                                          10)  # 1-hot encoded labels
            self.N = self.x.shape[0]  # number of datapoints
Example #12
def train_model(X_train, X_test, y_train, y_test):
    # Can be configured in a separate file
    n_inputs = 28 * 28
    n_h1 = 300
    n_h2 = 100
    n_outputs = 10
    n_epochs = 5
    batch_size = 20
    learning_rate = 0.001

    y_train = one_hot_encode(y_train, n_outputs)
    y_test = one_hot_encode(y_test, n_outputs)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

    # Build a simple MLP model
    model = Sequential()
    # first hidden layer
    model.add(Dense(n_h1, activation='relu', input_shape=(n_inputs, )))
    # second hidden layer
    model.add(Dense(n_h2, activation='relu'))
    # output layer
    model.add(Dense(n_outputs, activation='softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(lr=learning_rate),
                  metrics=['accuracy'])

    model.fit(X_train,
              y_train,
              batch_size=batch_size,
              epochs=n_epochs,
              verbose=2,
              validation_data=(X_test, y_test))
    # callbacks=[LogRunMetrics()])

    score = model.evaluate(X_test, y_test, verbose=0)

    # log a single value
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    metrics = {"acc": score[1]}
    return model, metrics
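
The label-style one_hot_encode(y, n_outputs) used above is typically implemented along the lines of the sketch below (the common NumPy pattern, not necessarily this project's helper).

import numpy as np

def one_hot_encode(labels, n_classes):
    # map integer class labels of shape (N,) to a (N, n_classes) one-hot matrix
    return np.eye(n_classes, dtype=np.float32)[np.asarray(labels, dtype=int)]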
Example #13
def pos_neg(motif_name, seq_length, num_seq,
                  min_counts, max_counts, GC_fraction,
                  central_bp=None):

    #positive results test
    positive_set,positive_embedding,positive_positions_arr,positive_motif_name_arr = motif_density(motif_name, seq_length, num_seq,
                      min_counts, max_counts, GC_fraction,
                      central_bp=None)
    #pdb.set_trace()

    random.shuffle(positive_set)
    thresh_positive=int(0.3*(len(positive_set)))
    validation_positive_set = positive_set[0:thresh_positive]
    training_positive_set = positive_set[thresh_positive:]

    negative_set,negative_embedding,negative_positions_arr,negative_motif_name_arr = motif_density(motif_name, seq_length, num_seq,
                      min_counts = 0, max_counts = 0, GC_fraction = .4,
                      central_bp=None)
    random.shuffle(negative_set)
    thresh_negative=int(0.3*(len(negative_set)))
    validation_negative_set = negative_set[0:thresh_negative]
    training_negative_set = negative_set[thresh_negative:]

    validation_set = np.concatenate((validation_negative_set, validation_positive_set), axis=0)
    training_set = np.concatenate((training_negative_set, training_positive_set), axis=0)

    # build labels in the same (negative, positive) order as the concatenated sets above
    positive_labels = np.ones(len(training_positive_set))
    positive_labels = np.reshape(positive_labels, (len(positive_labels), 1))
    negative_labels = np.zeros(len(training_negative_set))
    negative_labels = np.reshape(negative_labels, (len(negative_labels), 1))
    training_labels = np.concatenate((negative_labels, positive_labels), axis=0)

    pos_val_labels = np.ones(len(validation_positive_set))
    pos_val_labels = np.reshape(pos_val_labels, (len(pos_val_labels), 1))
    neg_val_labels = np.zeros(len(validation_negative_set))
    neg_val_labels = np.reshape(neg_val_labels, (len(neg_val_labels), 1))
    validation_labels = np.concatenate((neg_val_labels, pos_val_labels), axis=0)
    #pdb.set_trace()
    training_set = one_hot_encode(np.array(training_set))
    validation_set = one_hot_encode(np.array(validation_set))
    return training_labels,training_set,validation_labels,validation_set,positive_positions_arr,positive_motif_name_arr,negative_positions_arr,negative_motif_name_arr
Example #14
    def generate_xdy(self, indexes):
        """generate sequence input, descriptor input and sequence output for one batch of SMILES"""
        x, d, y = list(), list(), list()
        for idx in indexes:
            s = self.smiles[idx]
            inputs = []
            targets = []
            # split up into windows
            for i in range(0, len(s) - self.window, self.step):
                inputs.append(s[i:i + self.window])
                targets.append(s[(i + 1):(i + self.window + 1)])

            # tokenize windows
            input_token = tokenize_molecules(inputs, self.t2i)
            target_token = tokenize_molecules(targets, self.t2i)

            # one-hot encode tokenized windows
            x.extend(one_hot_encode(input_token, len(self.t2i)).tolist())
            y.extend(one_hot_encode(target_token, len(self.t2i)).tolist())
        return np.array(x), np.array(y)
Example #15
    def __getitem__(self, idx: int):
        warnings.filterwarnings("ignore")
        sample = self.df.iloc[idx, :]
        # wav_name = sample["resampled_filename"]
        wav_name = sample["filename"]
        # wav_name = wav_name.replace("mp3", "wav")
        wav_name = wav_name.replace("mp3", "npy")
        wav_name = wav_name.replace("wav", "npy")
        ebird_code = sample["ebird_code"]
        duration = sample["duration"]
        wav_path = self.datadir / ebird_code / wav_name
        # y, sr = sf.read(self.datadir / ebird_code / wav_name)
        effective_length = self.sample_rate * self.period
        try:
            if duration > self.period:
                offset = int(np.random.rand() * (duration - self.period - 1))
                y = np.load(wav_path)
                y = y[offset * self.sample_rate:(offset + self.period) *
                      self.sample_rate]
                # y, _ = librosa.load(
                #     wav_path,
                #     sr=self.sample_rate,
                #     offset=offset,
                #     duration=self.period,
                #     mono=True,
                # )
            else:
                # y, _ = librosa.load(wav_path, sr=self.sample_rate, mono=True)
                y = np.load(wav_path)
                y = np.tile(
                    y, 15)  # the shortest rec in the train set is 0.39 sec
                y = y[:effective_length]
            # print(y.shape)
            if len(y) != 160000:
                raise ValueError("unexpected clip length: {}".format(len(y)))

            if self.composer:
                y = self.composer(y)
        except Exception:
            print(wav_path)
            print(duration)
            print(len(y))
            raise
        # if image.shape != (3, 224, 547):
        #     print(wav_path, duration, len(y), offset)

        labels = utils.one_hot_encode(ebird_code)
        if self.secondary_label is not None:
            labels = utils.add_secondary_label(labels,
                                               wav_name.replace("npy", "mp3"),
                                               self.secondary_label)
            # print("find secondary_label !!")
            # print(labels)
        return {"image": y, "targets": labels}
Example #16
    def gen():
        while True:
            random_mirseq = utils.generate_random_seq(options.MIRLEN)
            random_target = utils.get_target_no_match(random_mirseq, SEQLEN)
            random_image = np.outer(utils.one_hot_encode(random_mirseq),
                                    utils.one_hot_encode(random_target))

            rbns1_mir = np.random.choice(TRAIN_MIRS_KDS)
            rbns1_mirseq = MIRNA_DATA.loc[rbns1_mir]['guide_seq'][:options.MIRLEN]
            rbns1_target = utils.get_target_no_match(rbns1_mirseq, SEQLEN)
            rbns1_image = np.outer(utils.one_hot_encode(rbns1_mirseq),
                                   utils.one_hot_encode(rbns1_target))

            rbns2_mir = np.random.choice(TRAIN_MIRS_KDS)
            rbns2_target = utils.generate_random_seq(3) + utils.rev_comp(
                MIRNA_DATA.loc[rbns2_mir]['guide_seq'][1:7]) + utils.generate_random_seq(3)
            rbns2_mirseq = utils.get_mir_no_match(rbns2_target, options.MIRLEN)
            rbns2_image = np.outer(utils.one_hot_encode(rbns2_mirseq),
                                   utils.one_hot_encode(rbns2_target))

            yield np.array([
                b'random',
                rbns1_mir.encode('utf-8'),
                rbns2_mir.encode('utf-8')
            ]), np.stack([random_image, rbns1_image, rbns2_image]), np.array(
                [[0.0], [0.0],
                 [0.0]]), np.array([b'no site', b'no site', b'no site'])
Example #17
def predict_probs(model, hidden, character, vocab, device):
    # One-hot encoding our input to fit into the model
    character = np.array([[vocab[c] for c in character]])
    character = one_hot_encode(character, len(vocab))
    character = torch.from_numpy(character)
    character = character.to(device)

    with torch.no_grad():
        out, hidden = model(character, hidden)

    prob = nn.functional.softmax(out[-1], dim=0).data

    return prob, hidden
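
A hedged sketch of how predict_probs is typically driven by a character-sampling loop; the model, the vocab layout, and the None initial hidden state are assumptions, not details from the original project.

import torch

def sample(model, vocab, device, start='h', length=50):
    # assumed: vocab maps char -> index, and the model accepts hidden=None
    idx2char = {i: c for c, i in vocab.items()}
    model.eval()
    hidden = None
    chars = [start]
    for _ in range(length):
        prob, hidden = predict_probs(model, hidden, chars[-1], vocab, device)
        chars.append(idx2char[int(torch.argmax(prob))])
    return ''.join(chars)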
Example #18
def train(model, data_loader, optimizer, epoch):
    """Train CapsuleNet model on training set
    :param model: The CapsuleNet model
    :param data_loader: An interator over the dataset. It combines a dataset and a sampler
    :optimizer: Optimization algorithm
    :epoch: Current epoch
    :return: Loss
    """
    print('===> Training mode')

    last_loss = None

    # Switch to train mode
    model.train()

    if args.cuda:
        # When we wrap a Module in DataParallel for multi-GPUs
        model = model.module

    for batch_idx, (data, target) in enumerate(data_loader):
        target_one_hot = utils.one_hot_encode(
            target, length=args.num_classes)

        data, target = Variable(data), Variable(target_one_hot)

        if args.cuda:
            data = data.cuda()
            target = target.cuda()

        optimizer.zero_grad()
        output = model(data) # output from DigitCaps (out_digit_caps)
        loss = model.loss(data, output, target) # pass in data for image reconstruction
        loss.backward()
        last_loss = loss.data[0]
        optimizer.step()

        if batch_idx % args.log_interval == 0:
            mesg = 'Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch,
                batch_idx * len(data),
                len(data_loader.dataset),
                100. * batch_idx / len(data_loader),
                loss.data[0])

            print(mesg)

        if last_loss < args.loss_threshold:
            # Stop training early
            break

    return last_loss
Example #19
    def compute_eval_loss_pred(self, query_edge_losses, query_node_accs,
                               all_label_in_edge, point_similarities,
                               query_edge_mask, evaluation_mask, num_supports,
                               support_label, query_label):
        """
        compute the query classification loss and query classification accuracy
        :param query_edge_losses: container for losses of queries' edges
        :param query_node_accs: container for classification accuracy of queries
        :param all_label_in_edge: ground truth label in edge form of point graph
        :param point_similarities: prediction edges of point graph
        :param query_edge_mask: mask for queries
        :param evaluation_mask: mask for evaluation (for unsupervised setting)
        :param num_supports: number of samples in support set
        :param support_label: label of support set
        :param query_label: label of query set
        :return: query classification loss
                 query classification accuracy
        """

        point_similarity = point_similarities[-1]
        full_edge_loss = self.edge_loss(1 - point_similarity,
                                        1 - all_label_in_edge)

        pos_query_edge_loss = torch.sum(
            full_edge_loss * query_edge_mask * all_label_in_edge *
            evaluation_mask) / torch.sum(
                query_edge_mask * all_label_in_edge * evaluation_mask)
        neg_query_edge_loss = torch.sum(
            full_edge_loss * query_edge_mask *
            (1 - all_label_in_edge) * evaluation_mask) / torch.sum(
                query_edge_mask * (1 - all_label_in_edge) * evaluation_mask)

        # weighted loss for balancing pos/neg
        query_edge_loss = pos_query_edge_loss + neg_query_edge_loss

        # prediction
        query_node_pred = torch.bmm(
            point_similarity[:, num_supports:, :num_supports],
            one_hot_encode(self.eval_opt['num_ways'], support_label.long(),
                           self.arg.device))

        # test accuracy
        query_node_acc = torch.eq(
            torch.max(query_node_pred, -1)[1],
            query_label.long()).float().mean()

        query_edge_losses += [query_edge_loss.item()]
        query_node_accs += [query_node_acc.item()]

        return query_node_accs, query_edge_losses
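
Here one_hot_encode(num_ways, support_label, device) must produce the (batch, num_supports, num_ways) one-hot tensor consumed by torch.bmm above; a sketch of that flavour is below (an assumption about the helper, not this project's code).

import torch

def one_hot_encode(num_classes, class_idx, device):
    # (B, N) integer labels -> (B, N, num_classes) one-hot float tensor
    one_hot = torch.zeros(*class_idx.size(), num_classes, device=device)
    return one_hot.scatter_(-1, class_idx.unsqueeze(-1), 1.0)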
Example #20
        def __init__(self, x, l, logit, flip, dequantize, rng):

            D = x.shape[1] // 3  # number of pixels
            x = self._dequantize(x, rng) if dequantize else x  # dequantize
            x = self._logit_transform(x) if logit else x  # logit
            x = self._flip_augmentation(x) if flip else x  # flip
            self.x = x  # pixel values
            self.r = self.x[:, :D]  # red component
            self.g = self.x[:, D:2 * D]  # green component
            self.b = self.x[:, 2 * D:]  # blue component
            self.labels = np.hstack([l, l]) if flip else l  # numeric labels
            self.y = utils.one_hot_encode(self.labels,
                                          10)  # 1-hot encoded labels
            self.N = self.x.shape[0]  # number of datapoints
Example #21
 def sample(self, temp=1.0, prime_text="^", maxlen=100):
     generated = str()
     seed_token = []
     for t in list(prime_text):
         generated += t
         seed_token += [self.token_indices[t]]
     while generated[-1] != '$' and len(generated) < maxlen:
         x_seed = one_hot_encode([seed_token], self.n_chars)
         preds = self.model.predict(x_seed, verbose=0)[0]
         next_char_ind = transform_temp(preds[-1, :], temp)
         next_char = self.indices_token[str(next_char_ind)]
         generated += next_char
         seed_token += [next_char_ind]
     return generated
Example #22
def process_data():
    images, labels, label_dict = flower_photos_data.load_flower_datasets()
    assert np.max(images[0]) <= 1, 'The image should be scaled to 0-1'
    images, labels = utils.shuffle_data(images, labels)
    labels_onehot = utils.one_hot_encode(labels)
    train_images, train_labels, valid_images, valid_labels, test_images, test_labels = \
        utils.split_data(images, labels_onehot, train_size=0.8, valid_size=0.1, test_size=0.1)
    os.makedirs('data/flower_npy', exist_ok=True)  # don't fail if the folder already exists
    np.save('data/flower_npy/train_images.npy', train_images)
    np.save('data/flower_npy/train_labels.npy', train_labels)
    np.save('data/flower_npy/valid_images.npy', valid_images)
    np.save('data/flower_npy/valid_labels.npy', valid_labels)
    np.save('data/flower_npy/test_images.npy', test_images)
    np.save('data/flower_npy/test_labels.npy', test_labels)
Example #23
    def __init__(self,
                 path,
                 target,
                 cat_preproc_type='one-hot',
                 columns=None,
                 drop=None,
                 transforms=None):
        '''
        :param path: path to dataframe
        :param target: target column name
        :param cat_preproc_type: type of preprocessing for categorical data: 'no-preproc', 'one-hot', 'binary', 'backward'
        :param columns: list of new column names or None; values of the target and drop args have to be in this list
        :param drop: list of column names to drop, or None
        :param transforms: class Compose or Transform or None
        '''

        self.path = path
        self.data = pd.read_csv(path, index_col=0)
        if 'index' in list(self.data.columns):
            self.data = pd.read_csv(path).drop(columns=['index'])
        if columns is not None:
            self.data.columns = columns
        self.y = self.data[target]
        self.data = self.data.drop(columns=[target])
        if drop is not None:
            self.data = self.data.drop(columns=drop)
        if cat_preproc_type == 'no-preproc':
            self.X = self.data.values
        else:
            self.cat_data = self.data.select_dtypes(include=['object']).copy()

            if cat_preproc_type == 'one-hot':
                self.cat_data = one_hot_encode(self.cat_data)
            elif cat_preproc_type == 'binary':
                self.cat_data = binary_encode(self.cat_data)
            elif cat_preproc_type == 'backward':
                self.cat_data = backward_encode(self.cat_data)
            else:
                raise ValueError(
                    "Categorical preprocessing type is not valid.")

            self.X = self.cat_data.join(
                self.data.select_dtypes(include=['int64', 'float64']))

        self.transforms = None
        if transforms is not None:
            if cat_preproc_type == 'no-preproc':
                print('Transforming is impossible when "no-preproc"')
            else:
                self.transforms = transforms
Example #24
    def _process(self, image, mask):
        # one-hot-encode the mask
        mask = one_hot_encode(mask, self.class_rgb_values).astype('float')

        # apply augmentations
        if self.augmentation:
            sample = self.augmentation(image=image, mask=mask)
            image, mask = sample['image'], sample['mask']

        # apply preprocessing
        if self.preprocessing:
            sample = self.preprocessing(image=image, mask=mask)
            image, mask = sample['image'], sample['mask']

        return image, mask
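
For reference, the RGB-mask flavour of one_hot_encode(mask, class_rgb_values) used here commonly follows the pattern sketched below (an assumption about the helper, not necessarily this project's implementation).

import numpy as np

def one_hot_encode(label, label_values):
    # map an (H, W, 3) RGB mask to an (H, W, n_classes) stack of boolean class maps
    semantic_map = []
    for colour in label_values:
        class_map = np.all(np.equal(label, colour), axis=-1)
        semantic_map.append(class_map)
    return np.stack(semantic_map, axis=-1)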
Example #25
    def gen():
        while True:
            random_mirs, random_images, random_labels, random_stypes = [], [], [], []

            # choose one of the RBNS miRNAs, generate target with no pairing, and assign logkd of 2
            rbns1_mir = np.random.choice(TRAIN_MIRS_KDS)
            random_mirs.append(rbns1_mir.encode('utf-8'))
            rbns1_mirseq = MIRNA_DATA.loc[rbns1_mir]['guide_seq'][:options.MIRLEN]
            rbns1_target = utils.get_target_no_match(rbns1_mirseq, SEQLEN)
            random_images.append(
                np.outer(utils.one_hot_encode(rbns1_mirseq),
                         utils.one_hot_encode(rbns1_target)))
            random_labels.append([2.0])
            random_stypes.append(b'extra')

            # generate miRNA and target with no pairing and assign log kd of 2
            rbns2_mir = np.random.choice(TRAIN_MIRS_KDS)
            random_mirs.append(rbns2_mir.encode('utf-8'))
            rbns2_target = utils.generate_random_seq(3) + utils.rev_comp(
                MIRNA_DATA.loc[rbns2_mir]['guide_seq'][1:7]) + utils.generate_random_seq(3)
            rbns2_mirseq = utils.get_mir_no_match(rbns2_target, options.MIRLEN)
            random_images.append(
                np.outer(utils.one_hot_encode(rbns2_mirseq),
                         utils.one_hot_encode(rbns2_target)))
            random_labels.append([2.0])
            random_stypes.append(b'extra')

            # generate random 8mer pair and assign KD of average 8mer
            random_mirseq = utils.generate_random_seq(options.MIRLEN)
            random_mirs.append(b'random')
            up_flank = utils.generate_random_seq(2)
            down_flank = utils.generate_random_seq(2)
            random_target = up_flank + utils.rev_comp(
                random_mirseq[1:8]) + 'A' + down_flank
            random_images.append(
                np.outer(utils.one_hot_encode(random_mirseq),
                         utils.one_hot_encode(random_target)))
            # new_label = -5.367
            new_label = -5
            flank_vals = {
                'A': -0.34923908,
                'T': -0.24840472,
                'C': 0.12640774,
                'G': 0.47123606
            }
            all_flank = up_flank + down_flank
            for nt, val in flank_vals.items():
                new_label += val * all_flank.count(nt)
            random_labels.append([new_label])
            random_stypes.append(b'extra')

            yield np.array(random_mirs), np.stack(random_images), np.array(
                random_labels), np.array(random_stypes)
Example #26
def predict(body):
    runtime = boto3.client('sagemaker-runtime')

    test_messages = [body]
    one_hot_test_messages = one_hot_encode(test_messages, vocabulary_length)
    encoded_test_messages = vectorize_sequences(one_hot_test_messages,
                                                vocabulary_length)

    response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                       Body=json.dumps(
                                           encoded_test_messages.tolist()),
                                       ContentType='application/json')

    responseBody = response['Body'].read().decode("utf-8")
    responseBody = json.loads(responseBody)
    return responseBody
Example #27
File: cifar10_data.py  Project: zzw95/CNN
def preprocess_and_save(batch_id):
    images, labels = load_cifar10_batch(batch_id)
    images = utils.normalize_data(images)
    labels = utils.one_hot_encode(labels, 10)
    train_images, train_labels, valid_images, valid_labels, test_images, test_labels =\
        utils.split_data(images, labels, train_size=0.8, valid_size=0.1, test_size=0.1)
    batch = {
        'train_images': train_images,
        'train_labels': train_labels,
        'valid_images': valid_images,
        'valid_labels': valid_labels,
        'test_images': test_images,
        'test_labels': test_labels
    }
    batch_path = os.path.join(folder_path, 'preprocess_batch_' + str(batch_id))
    np.save(batch_path, np.asarray(batch))
Example #28
def gen_new_train(param):
    # load data
    X_train, y_train = utils.load_data('./data/train.p')

    # data augmentation
    X_train, y_train = utils.augment_data(X_train, y_train, param)

    # pre-process
    X_train = np.array(
        [utils.pre_process(X_train[i]) for i in range(len(X_train))],
        dtype=np.float32)

    # one hot
    oh_y_train = utils.one_hot_encode(y_train)

    return X_train, y_train, oh_y_train
Example #29
    def forward(self, x: Tensor, update_state: bool) -> Tuple[Tensor, Tensor]:
        """
        The basic forward pass:

        z_{t} = w_hh * h_{t-1} + w_hx * x_{t}
        h_{t} = f(z_{t})
        y_{t} = w_hy * h_{t}
        p_{t} = softmax(y_{t})

        Makes the forward pass through the network.
        :param x: the array of integers, where each item is the index of character, the size of
                  array will be the sequence length
        :param update_state: bool, if True updates current state with last state
        :return: the tuple of states and predicted_probabilities
                 states - tensor of states, size = (sequence length, hidden size)
                 predicted_probabilities - tensor of predicted probabilities for each character in
                                           vocabulary, size = (sequence length, vocabulary size)
        """
        n = len(x)

        # one hot encoding of input
        inputs_matrix = one_hot_encode(x, self.vocabulary_size, self.dtype)

        log_ps = torch.zeros(n, self.vocabulary_size, dtype=self.dtype)
        hs = torch.zeros(n, self.hidden_size, dtype=self.dtype)

        for t in range(len(x)):
            # state at t - 1, dim : (self.hidden_size, 1)
            h_t_1 = self.current_state.clone() if t == 0 else hs[t - 1].clone()

            # state at t, dim : (self.hidden_size, 1)
            h_t = self.f(
                torch.matmul(self.w_hh, h_t_1) +
                torch.matmul(self.w_hx, inputs_matrix[t]))

            # prediction from hidden state at t,
            # log probabilities for next chars,  dim : (self.vocabulary_size, 1)
            p_t = F.log_softmax(torch.matmul(self.w_hy, h_t), dim=0)

            # updating the hidden state and predicted_probabilities keepers
            hs[t], log_ps[t] = h_t, p_t

        if update_state:
            self.current_state = hs[-1].clone()  # updating the current state

        return hs, log_ps
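
A short usage sketch (the instance name and the input indices are assumptions): run the forward pass on an encoded sequence and pick the most likely next character from the last row of log-probabilities.

# `rnn` is assumed to be an instance of the class above
hs, log_ps = rnn.forward([3, 7, 1], update_state=False)  # encode a 3-character sequence
next_char_idx = int(torch.argmax(log_ps[-1]))            # index of the most likely next character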
Example #30
    def __init__(self, train_ratio=None, fold_k=None, norm=False, expand_dim=False, seed=233):
        self.seed = seed
        mnist = np.load("data/mnist/mnist.npz")
        self.data = np.concatenate([mnist["x_train"], mnist["x_test"]])
        self.labels = np.concatenate([mnist["y_train"], mnist["y_test"]])
        self.labels = one_hot_encode(self.labels, 10)
        del mnist
        
        self.divide_train_test(train_ratio, fold_k)

        if norm:
            self.data = self.data / 255. # [0,1]
            # self.data = self.data / 127.5 - 1. # [-1, 1]
        
        self.train_cur_pos, self.test_cur_pos = 0, 0

        self.expand_dim = expand_dim
Example #31
def test(model, data_loader):
    """Evaluate model on validation set
    """
    print('===> Evaluate mode')

    # Switch to evaluate mode
    model.eval()

    if args.cuda:
        # When we wrap a Module in DataParallel for multi-GPUs
        model = model.module

    test_loss = 0
    correct = 0
    for data, target in data_loader:
        target_indices = target
        target_one_hot = utils.one_hot_encode(
            target_indices, length=args.num_classes)

        data, target = Variable(data, volatile=True), Variable(target_one_hot)

        if args.cuda:
            data = data.cuda()
            target = target.cuda()

        output = model(data) # output from DigitCaps (out_digit_caps)

        # sum up batch loss
        test_loss += model.loss(data, output, target, size_average=False).data[0] # pass in data for image reconstruction

        # evaluate
        v_magnitud = torch.sqrt((output**2).sum(dim=2, keepdim=True))
        pred = v_magnitud.data.max(1, keepdim=True)[1].cpu()
        correct += pred.eq(target_indices.view_as(pred)).sum()

    test_loss /= len(data_loader.dataset)

    mesg = 'Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        test_loss,
        correct,
        len(data_loader.dataset),
        100. * correct / len(data_loader.dataset))
    print(mesg)