Example #1
0
 def generate_samples(self, c1, c2, args):
     c1 = torch.LongTensor([c1]).to(self.device)
     c2 = torch.LongTensor([c2]).to(self.device)
     c1 = to_onehot(c1, self.device)
     c2 = to_onehot(c2, self.device)
     # randomly sample from the current class
     random_samples = self.generate_class_samples(c1, args.sample_num)
     img_save(random_samples, args.random_samples_save_path)
     # generate interpolation samples
     interpolation_samples = self.generate_interpolation_samples(c1, c2, args.sample_num)
     img_save(interpolation_samples, args.interpolation_samples_save_path)
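
All of the snippets in this listing assume some project-local `to_onehot` helper is already in scope, and its signature varies from repository to repository (here it is called with a device rather than a class count). As a point of reference only, a minimal PyTorch sketch of such a helper, assuming a 1-D LongTensor of class indices and a known number of classes, could look like this:

import torch

def to_onehot(labels, num_classes, device=None):
    # labels: LongTensor of shape (N,) holding integer class indices
    onehot = torch.zeros(labels.size(0), num_classes, device=device)
    onehot.scatter_(1, labels.unsqueeze(1), 1.0)
    return onehot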
Example #2
0
    def update(self, output, **kwargs):
        y_pred, y = self.output_transform(output)

        dim = 1 if y_pred.dim() > 1 else 0
        _, predicted = torch.max(y_pred, dim=dim)
        predicted = to_onehot(predicted, self.num_classes)
        y = to_onehot(y, self.num_classes)

        correct = torch.eq(predicted, y)
        correct[1 - y.byte()] = 0
        correct = torch.sum(correct, dim=0)

        self.correct += correct
        self.n += torch.sum(y, dim=0)
Example #3
0
 def __getitem__(self, idx):
     question_file = h5py.File(self.question_file, 'r', swmr=True)
     q = question_file['questions'][idx]
     a = question_file['answers'][idx]
     q_t = question_file['question_types'][idx]
     ii = question_file['image_ids'][idx]
     if self.cv_pretrained:
         image = h5py.File(self.image_dir, 'r',
                           swmr=True)['images'][self.idx_dict[ii]]
         image = torch.from_numpy(image).unsqueeze(0)
     else:
         image_file = f'COCO_{self.mode}2014_{str(ii).zfill(12)}.jpg' if 'vqa' in self.dataset else f'CLEVR_{self.mode}_{str(ii).zfill(6)}.png'
         if self.dataset == 'sample':
             image_file = f'CLEVR_new_{str(ii).zfill(6)}.png'
         image = Image.open(os.path.join(self.image_dir,
                                         image_file)).convert('RGB')
         if self.transform:
             image = self.transform(image).unsqueeze(0)
     q = torch.from_numpy(q).to(torch.long)
     if self.text_max:
         if len(q) > self.text_max:
             q = q[:self.text_max]
     a = torch.Tensor(a).to(
         torch.long) if not self.multi_label else to_onehot(a, self.a_size)
     q_t = torch.Tensor([q_t]).to(torch.long)
     return image, q, a, q_t
Example #4
0
def predict(model, song, config, char_to_idx, idx_to_char):
    """
    This function takes in the model and character as arguments and returns the next character prediction and hidden state.
    :param idx_to_char:
    :param char_to_idx:
    :param config: Dict of settings
    :param model: nn.Module
    :param song: String
    :return:
    """
    VOCAB_SIZE = len(char_to_idx.keys())

    encoded_song = encode_songs([song], char_to_idx)[0]
    inputs_onehot = to_onehot(encoded_song, VOCAB_SIZE)

    out = model(inputs_onehot.unsqueeze(1))
    out.squeeze_(1)

    prob = softmax(out[-1] / config["TEMPERATURE"], dim=0).data

    if config["TAKE_MAX_PROBABLE"]:
        char_ind = torch.max(prob, dim=0)[1].item()
    else:
        m = Categorical(prob)
        char_ind = m.sample().item()

    return idx_to_char[char_ind]
Example #5
0
 def forward(self, inputs, state):  # inputs: (batch, seq_len)
     # get the one-hot vector representation
     X = d2l.to_onehot(inputs, self.vocab_size)  # X is a list
     Y, self.state = self.rnn(torch.stack(X), state)
     # The fully connected layer first reshapes Y to (num_steps * batch_size, num_hiddens);
     # its output has shape (num_steps * batch_size, vocab_size)
     output = self.dense(Y.view(-1, Y.shape[-1]))
     return output, self.state
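
Note that `d2l.to_onehot` here returns a Python list with one (batch_size, vocab_size) matrix per time step, which is why the result is passed through `torch.stack` before the RNN. A minimal sketch of such a helper, assuming `F.one_hot` semantics (an assumption for illustration, not the library's actual code):

import torch.nn.functional as F

def to_onehot(X, n_class):
    # X: (batch_size, seq_len) LongTensor -> list of seq_len tensors of shape (batch_size, n_class)
    return [F.one_hot(X[:, i], n_class).float() for i in range(X.shape[1])]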
Example #6
0
    def loss(self, cls_scores, bbox_preds, centernesses, targets):
        featmap_sizes = [score.shape[-2:] for score in cls_scores]
        all_level_points = self.getpoints(featmap_sizes, bbox_preds[0].dtype,
                                          bbox_preds[0].device)
        labels, bbox_targets = self.fcos_target(all_level_points, targets)

        num_imgs = cls_scores[0].shape[0]

        flatten_cls_scores = [
            cls_score.permute(0, 2, 3, 1).reshape(-1, 80)
            for cls_score in cls_scores
        ]
        flatten_bbox_preds = [
            bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
            for bbox_pred in bbox_preds
        ]
        flatten_centerness = [
            centerness.permute(0, 2, 3, 1).reshape(-1)
            for centerness in centernesses
        ]

        flatten_cls_scores = torch.cat(flatten_cls_scores)
        flatten_bbox_preds = torch.cat(flatten_bbox_preds)
        flatten_centerness = torch.cat(flatten_centerness)
        flatten_labels = torch.cat(labels)
        flatten_bbox_targets = torch.cat(bbox_targets)
        flatten_points = torch.cat(
            [points.repeat(num_imgs, 1) for points in all_level_points])

        pos_inds = flatten_labels.nonzero().reshape(-1)
        num_pos = len(pos_inds)
        loss_cls = self.cls_criterion(
            flatten_cls_scores,
            to_onehot(flatten_labels)).sum() / (num_imgs + num_pos)

        pos_bbox_preds = flatten_bbox_preds[pos_inds]
        pos_centerness = flatten_centerness[pos_inds]

        if num_pos > 0:
            pos_bbox_targets = flatten_bbox_targets[pos_inds]
            pos_centerness_targets = self.centerness_target(pos_bbox_targets)
            pos_points = flatten_points[pos_inds]
            pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds)
            pos_decoded_target_preds = distance2bbox(pos_points,
                                                     pos_bbox_targets)

            # centerness weighted iou loss
            loss_bbox = self.box_criterion(pos_decoded_bbox_preds,
                                           pos_decoded_target_preds,
                                           weight=pos_centerness_targets).sum(
                                           ) / pos_centerness_targets.sum()
            loss_centerness = self.centerness_criterion(
                pos_centerness, pos_centerness_targets).mean()
        else:
            loss_bbox = pos_bbox_preds.sum()
            loss_centerness = pos_centerness.sum()
        return loss_cls, loss_bbox, loss_centerness
Example #7
0
 def forward(self, x, c):
     c = to_onehot(c, self.device)
     mu, log_sigma_2 = self.encode(x, c)
     sample = self.reparametrisation(mu, log_sigma_2)
     sample = torch.cat((sample, c), dim=-1)
     sample = self.decode_input(sample).view(sample.shape[0], -1, 2, 2)
     x_predict = self.decoder(sample)
     x_predict = x_predict[:, :, 2:-2, 2:-2]
     return mu, log_sigma_2, x_predict
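
The `reparametrisation` call above presumably implements the usual VAE reparameterization trick, sampling z = mu + sigma * eps with eps ~ N(0, I) and `log_sigma_2` read as log(sigma^2); a minimal sketch under that assumption:

import torch

def reparametrisation(mu, log_sigma_2):
    # sample z = mu + sigma * eps with eps ~ N(0, I); log_sigma_2 is log(sigma^2)
    std = torch.exp(0.5 * log_sigma_2)
    eps = torch.randn_like(std)
    return mu + eps * std

Example #8
0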
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = us.data_iter_random
    else:
        data_iter_fn = us.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):  # number of training epochs, independent of how the data is split into batches
        if not is_random_iter:  # if using consecutive sampling, initialize the hidden state at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)  # split all training data into mini-batches for mini-batch SGD
        for X, Y in data_iter:  # each batch yields a group of lyric-segment index sequences X (each a multi-time-step input to the RNN) and the corresponding next-character label sequences Y
            if is_random_iter:  # if using random sampling, initialize the hidden state before each mini-batch update
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:  # otherwise detach the hidden state from the computation graph
                '''
                When adjacent mini-batches are chained together by passing the hidden state along, the gradient
                computation depends on all of the chained mini-batch sequences, so within one epoch the cost of
                computing gradients grows as the iterations go on. To make the gradient computation depend only
                on the mini-batch read in the current iteration, detach the hidden state from the computation
                graph before reading each mini-batch.
                Rows (lyric segments) stay semantically contiguous from batch to batch, which lets the RNN carry
                the hidden state across batches and strengthens the continuity of what it learns.
                '''
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = us.to_onehot(X, vocab_size)  # convert the characters of each lyric segment in the batch to one-hot vectors
                # outputs is num_steps matrices of shape (batch_size, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                # after concatenation the shape is (num_steps * batch_size, vocab_size)
                outputs = nd.concat(*outputs, dim=0)
                # Y has shape (batch_size, num_steps); transpose and flatten it into a vector of length
                # batch * num_steps so that it lines up row by row with the outputs
                y = Y.T.reshape((-1,))  # Y and X are both transposed and flattened the same way, and since they correspond (lyric sequence vs. next-character labels), the rows still match up for the cross-entropy loss
                # compute the average classification error with cross-entropy: one cross-entropy per one-hot vector (one character), summed and averaged
                l = loss(outputs, y).mean()
            l.backward()
            us.grad_clipping(params, clipping_theta, ctx)  # clip the gradients
            # the cross-entropy over all outputs of the mini-batch has been summed and averaged; apply gradient descent to it
            us.sgd(params, lr, 1)  # the loss has already been averaged, so the gradients need not be averaged again
            l_sum += l.asscalar() * y.size  # total loss over all batches
            n += y.size  # total number of characters over all batches

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))  # average loss per character, exponentiated to give the perplexity
            for prefix in prefixes:  # every pred_period epochs (each a full pass over all batches), print the loss and the predicted lyric segments
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx))
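
`us.grad_clipping` above presumably clips the global gradient norm of all parameters to `clipping_theta`, as in the d2l reference implementation; a sketch under that assumption:

from mxnet import nd

def grad_clipping(params, theta, ctx):
    # rescale all gradients so that their concatenated L2 norm is at most theta
    norm = nd.array([0.0], ctx)
    for param in params:
        norm += (param.grad ** 2).sum()
    norm = norm.sqrt().asscalar()
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm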
Example #9
0
def D_loss(D, G, src, trg, lamb, curriculum):
    src_len = min(curriculum, len(src)-1) + 1
    trg_len = min(curriculum, len(src)-1) + 1
    # with gen
    gen_trg, context = G(src[:src_len], trg[:trg_len])
    d_gen = D(gen_trg, context)
    # with real
    trg = to_onehot(trg, D.vocab_size).type(torch.FloatTensor)[1:trg_len]
    trg = Variable(trg.cuda())
    d_real = D(trg, context)
    # calculate gradient penalty
    penalty = grad_penalty(D, trg.data, gen_trg.data, context.data, lamb)
    loss = d_gen.mean() - d_real.mean() + penalty
    return loss
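
`grad_penalty` above is presumably the standard WGAN-GP gradient penalty on interpolations between the real one-hot targets and the generated sequences; the repository's actual helper may differ, but a generic sketch (modern PyTorch API) looks like:

import torch
from torch.autograd import grad

def grad_penalty(D, real, fake, context, lamb):
    # interpolate between real and generated sequences and penalize gradients whose norm deviates from 1
    alpha = torch.rand(real.size(0), *([1] * (real.dim() - 1)), device=real.device)
    interp = (alpha * real + (1 - alpha) * fake).requires_grad_(True)
    d_interp = D(interp, context)
    grads = grad(outputs=d_interp, inputs=interp,
                 grad_outputs=torch.ones_like(d_interp),
                 create_graph=True, retain_graph=True)[0]
    grad_norm = grads.reshape(grads.size(0), -1).norm(2, dim=1)
    return lamb * ((grad_norm - 1) ** 2).mean()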
Example #10
0
    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        x = cv2.imread(x)
        x = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)
        x = self.transform(image=x)["image"]
        x = np.rollaxis(x, -1, 0)  # H,W,C -> C,H,W

        y = to_onehot(y, num_classes)
        y = label_smoothing(y, self.ls_eps)

        data = {}
        data['x'] = torch.from_numpy(x.astype('float32'))
        data['y'] = torch.from_numpy(y.astype('float32'))
        return data
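
`label_smoothing` above presumably softens the one-hot target in the usual way, moving eps of the probability mass onto the other classes; a minimal numpy sketch under that assumption:

import numpy as np

def label_smoothing(onehot, eps):
    # 1 -> 1 - eps + eps/K, 0 -> eps/K for K classes; eps == 0 leaves the target unchanged
    if eps <= 0:
        return onehot
    return onehot * (1.0 - eps) + eps / onehot.shape[-1]

Example #11
0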
def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx):
    state = init_rnn_state(1, num_hiddens, ctx)  # batch size 1
    output = [char_to_idx[prefix[0]]]  # index of the first character
    for t in range(num_chars + len(prefix) - 1):
        # use the output of the previous time step as the input of the current time step
        X = us.to_onehot(nd.array([output[-1]], ctx=ctx), vocab_size)  # a batch of one single character, i.e. a single row and a single time step; we are predicting the next character rather than training, so the input does not need to be a whole lyric segment
        # compute the output and update the hidden state
        (Y, state) = rnn(X, state, params)  # prediction runs with the trained parameters; X is just one character instead of a lyric segment, and the result is likewise a single predicted character plus a hidden state
        # the input of the next time step is either the next character in prefix or the current best prediction
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])  # while original lyrics remain, predict from them; afterwards use the predicted character to predict the next one
        else:
            output.append(int(Y[0].argmax(axis=1).asscalar()))  # the largest one-hot component gives the index of the best prediction, i.e. one predicted character
    return ''.join([idx_to_char[i] for i in output])  # the output includes the prefix, so only the characters after the prefix are actually predicted, each fed back as input for the next step; predictions made while consuming the prefix are discarded
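Example #12
0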
def negative_log_likelihood(model, encoded_data, criterion, config):
    """
    Average the cross entropy loss over all the chunks
    :param model: nn.Module
    :param encoded_data: List of encoded songs
    :return:
    """
    chunk_loss = 0
    number_of_chunks = 0
    with torch.no_grad():
        model.eval()
        for song in encoded_data:
            model.init_state()
            for seq, target in SlidingWindowLoader(song, window=config["CHUNK_SIZE"]):
                number_of_chunks += 1
                if len(seq) == 0:
                    continue
                inputs_onehot = to_onehot(seq, config["VOCAB_SIZE"])
                output = model(inputs_onehot.unsqueeze(1))  # Turn input into 3D (chunk_length, batch, vocab_size)
                output.squeeze_(1)  # Back to 2D
                chunk_loss += criterion(output, target.long())
    return chunk_loss / number_of_chunks
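
`SlidingWindowLoader` is a project helper that apparently yields (input chunk, next-character target) pairs from one encoded song; its real implementation is not shown here, but a hypothetical sketch consistent with how it is used above and in `fit` below could be:

class SlidingWindowLoader:
    # hypothetical sketch: iterate over an encoded song in fixed-size chunks,
    # pairing each chunk with the same chunk shifted one character ahead
    def __init__(self, song, window):
        self.song = song
        self.window = window

    def __iter__(self):
        for start in range(0, len(self.song) - 1, self.window):
            target = self.song[start + 1:start + self.window + 1]
            seq = self.song[start:start + len(target)]
            yield seq, target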
Example #13
0
    def __getitem__(self, idx):
        # print(idx, len(self.label_idx), self.label_idx[idx], self.questions.shape[0])
        idx = self.label_idx[idx] if self.top_k and not self.multi_label else idx
        ii = self.image_ids[idx]
        if self.dataset == 'vqa2' and self.cv_pretrained:
            ii = self.idx_dict[ii]
        #     else:
        # if self.dataset == 'vqa2':
        #     if self.cv_pretrained:
        #         ii = self.idx_dict[ii]
        #         image = h5py.File(self.image_dir, 'r', swmr=True)['images'][ii]
        #         image = torch.from_numpy(image).unsqueeze(0)
        #     else:
        #         image_file = f'COCO_{self.mode}2014_{str(ii).zfill(12)}.jpg'
        #         image = Image.open(os.path.join(self.image_dir, image_file)).convert('RGB')
        #         if self.transform:
        #             image = self.transform(image).unsqueeze(0)
        # else:
        with h5py.File(self.image_dir, 'r', swmr=True) as f:
            image = f['data'][ii]
        image = torch.from_numpy(image).unsqueeze(0)

        q = self.questions[idx]
        a = self.answers[idx]
        if self.top_k and not self.multi_label:
            a = self.answer_idx[a]
        q_t = self.question_types[idx]
        if self.question_inverse:
            q = q[::-1].copy()
        q = torch.from_numpy(q).to(torch.long)
        if self.text_max:
            if len(q) > self.text_max:
                q = q[:self.text_max]
        a = torch.Tensor([a]).to(torch.long) if not self.multi_label else to_onehot(a, self.total_a_size, self.mask)
        q_t = torch.Tensor([q_t]).to(torch.long)
        return image, q, a, q_t
Example #14
0
        lz_x = zs.Normal('z', mu, logstd, n_samples=1, group_event_ndims=1)
    return encoder, lz_x,


if __name__ == "__main__":

    casia_online = 10
    casia_offline_reverse = 10
    tf.set_random_seed(1234)
    np.random.seed(1234)
    if not args.code:
        if args.dataset == 'hand':
            x_train, x_test, t_train, t_test = dataset.hccr_onehot_hand_64(
                n_y, sample_num)
            t_train,t_test = \
                utils.to_onehot(t_train, n_y), utils.to_onehot(t_test, n_y)
        elif args.dataset == 'standard':
            x_train, x_test, t_train, t_test = dataset.hccr_onehot_standard_64(
                n_y, sample_num)
            t_train, t_test = \
                utils.to_onehot(t_train, n_y), utils.to_onehot(t_test, n_y)
        elif args.dataset == 'casia-online':
            x_train, x_test, t_train, t_test = dataset.hccr_onehot_casia_online_64(
                n_y, sample_num)
            t_train, t_test = \
                utils.to_onehot(t_train, n_y), utils.to_onehot(t_test, n_y)
        elif args.dataset == 'casia-offline':
            x_train, x_test, t_train, t_test = dataset.hccr_onehot_casia_offline_64(
                n_y, sample_num)
            t_train, t_test = \
                utils.to_onehot(t_train, n_y), utils.to_onehot(t_test, n_y)
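
`utils.to_onehot` here operates on what are presumably numpy label arrays rather than torch tensors; a minimal sketch of such a helper (an assumption about the utility module, not its actual code):

import numpy as np

def to_onehot(labels, num_classes):
    # labels: 1-D array of integer class ids -> (N, num_classes) one-hot matrix
    return np.eye(num_classes, dtype=np.float32)[np.asarray(labels, dtype=np.int64)]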
Example #15
0
    def __initializeTrainData(self, frac_positives):
        k = self.window  # for brevity
        self.indelLocations = np.loadtxt(data_dir +
                                         "indelLocations21.txt").astype(int)
        lengthIndels = int(len(self.indelLocations) / 22) * 22
        num_negatives = int(
            int((1. / frac_positives - 1) * lengthIndels) / 22) * 22
        total_length = lengthIndels + num_negatives
        num_negatives_per_chrom = int(num_negatives / 22)
        lengthIndels_per_chrom = int(lengthIndels / 22)
        total_length_per_chrom = lengthIndels_per_chrom + num_negatives_per_chrom
        dataset = np.zeros((total_length, 2 * k + 1, 4))
        coverageDataset = np.zeros((total_length, 2 * k + 1))
        entropyDataset = np.zeros((total_length, 2 * k + 1))
        indices = np.zeros(total_length, dtype=np.uint32)
        nearby_indels = np.zeros(total_length, dtype=np.uint32)
        if self.triclass:
            labeltype = np.uint8
        else:
            labeltype = np.bool
        labels = np.zeros(total_length, dtype=labeltype)
        genome_positions = np.zeros(total_length, dtype=np.uint32)

        for chromosome in range(1, 23):
            self.referenceChr = self.referenceChrFull[str(chromosome)]
            self.refChrLen = len(self.referenceChr)
            ext = ".txt"
            if not self.include_filtered: ext = "_filtered" + ext
            if self.triclass:
                self.insertionLocations = np.loadtxt(
                    data_dir + "indelLocations{}_ins".format(chromosome) +
                    ext).astype(int)
                self.deletionLocations = np.loadtxt(
                    data_dir + "indelLocations{}_del".format(chromosome) +
                    ext).astype(int)
                self.indelLocationsFull = np.concatenate(
                    (self.insertionLocations, self.deletionLocations))
                self.insertionLocations = np.random.choice(
                    self.insertionLocations,
                    size=int(lengthIndels_per_chrom / 2),
                    replace=False)
                self.deletionLocations = np.random.choice(
                    self.deletionLocations,
                    size=lengthIndels_per_chrom -
                    int(lengthIndels_per_chrom / 2),
                    replace=False)
                self.indelLocations = np.concatenate(
                    (self.insertionLocations, self.deletionLocations))
                self.indelLocations = self.indelLocations - self.offset
            else:
                self.indelLocationsFull = np.loadtxt(
                    data_dir + "indelLocations{}".format(chromosome) +
                    ext).astype(int)
                self.indelLocations = np.random.choice(
                    self.indelLocationsFull,
                    size=lengthIndels_per_chrom,
                    replace=False)
                self.indelLocations = self.indelLocations - self.offset
            self.nonzeroLocationsRef = np.where(
                np.any(self.referenceChr != 0, axis=1))[0]
            if self.nearby:
                self.zeroLocationsRef = np.where(
                    np.all(self.referenceChr == 0, axis=1))[0]
                self.setOfZeroLocations = set(self.zeroLocationsRef)
            self.coverage = None
            if self.load_coverage:
                self.coverage = lc.load_coverage(
                    data_dir + "coverage/{}.npy".format(chromosome))
            self.setOfIndelLocations = set(self.indelLocations)
            self.prevChosenRefLocations = set()
            nearby_indels[total_length_per_chrom *
                          (chromosome - 1):total_length_per_chrom *
                          (chromosome - 1) +
                          lengthIndels_per_chrom] = self.indelLocations

            # dataset should have all the indels as well as random negative training samples
            if self.nearby:
                neg_positions = np.random.choice(self.indelLocations,
                                                 size=num_negatives_per_chrom)
                nearby_indels[total_length_per_chrom * (chromosome - 1) +
                              lengthIndels_per_chrom:total_length_per_chrom *
                              chromosome] = neg_positions
                offset = np.multiply(
                    np.random.randint(1,
                                      self.nearby + 1,
                                      size=num_negatives_per_chrom),
                    np.random.choice([-1, 1], size=num_negatives_per_chrom))
                neg_positions = neg_positions + offset  # locations that are offset from indels by some amount
            else:
                neg_positions = np.random.choice(self.nonzeroLocationsRef,
                                                 size=num_negatives_per_chrom)
                self.nearby_indels = neg_positions  # to prevent error if this is undefined
            for i in range(lengthIndels_per_chrom + num_negatives_per_chrom):
                if i < lengthIndels_per_chrom:
                    if not self.triclass:
                        label = 1  # standard binary classification labels
                    elif i < len(self.insertionLocations):
                        label = 1  # insertions will be labeled as 1
                    else:
                        label = 2  # deletions will be labeled as 2
                    pos = self.indelLocations[i]
                else:
                    label = 0
                    pos = neg_positions[i - lengthIndels_per_chrom]
                    if self.nearby:
                        niter = 0
                        while (pos in self.prevChosenRefLocations) or (
                                pos in self.setOfZeroLocations
                        ) or (pos
                              in self.setOfIndelLocations) and niter < 1001:
                            nearby_indels[total_length_per_chrom *
                                          (chromosome - 1) +
                                          i] = np.random.choice(
                                              self.indelLocations)
                            pos = nearby_indels[
                                total_length_per_chrom *
                                (chromosome - 1) + i] + np.random.randint(
                                    1, self.nearby + 1) * np.random.choice(
                                        [-1, 1])
                            niter += 1
                    else:
                        while (pos in self.prevChosenRefLocations) or (
                                pos in self.setOfIndelLocations):
                            pos = np.random.choice(self.nonzeroLocationsRef)
                    self.prevChosenRefLocations.add(pos)
                indices[total_length_per_chrom * (chromosome - 1) + i] = pos
                coverageWindow = np.zeros(2 * k + 1)
                # get k base pairs before and after the position
                window = self.referenceChr[pos - k:pos + k + 1]
                coverageWindow = None
                if self.coverage is not None:
                    coverageWindow = utils.flatten(self.coverage[pos - k:pos +
                                                                 k + 1])
                dataset[total_length_per_chrom * (chromosome - 1) + i] = window
                coverageDataset[total_length_per_chrom * (chromosome - 1) +
                                i] = coverageWindow
                labels[total_length_per_chrom * (chromosome - 1) + i] = label
                genome_positions[total_length_per_chrom * (chromosome - 1) +
                                 i] = pos
        if self.load_entropy:
            entropyDataset[:, k + 1:2 * k + 1] = entropy.entropyVector(dataset)
        rawZipped = zip(list(dataset), list(coverageDataset), list(labels),
                        list(genome_positions), list(indices),
                        list(nearby_indels), list(entropyDataset))
        # Shuffle the list
        np.random.shuffle(rawZipped)
        a, b, c, d, e, f, g = zip(*rawZipped)
        dataset = np.array(a)
        coverageDataset = np.array(b)
        entropyDataset = np.array(g)
        labels = np.array(c, dtype=labeltype)
        genome_positions = np.array(d, dtype=np.uint32)
        self.indices = np.array(e, dtype=np.uint32)
        self.nearby_indels = np.array(f, dtype=np.uint32)
        self.dataset = dataset
        self.coverageDataset = coverageDataset
        self.entropyDataset = entropyDataset
        if self.triclass:
            self.labels = utils.to_onehot(labels, 3)
        else:
            self.labels = np.expand_dims(labels, axis=1)
        self.genome_positions = genome_positions
        self.num_train_examples = int(
            round(total_length * (1 - self.test_frac)))
        self.ordering = list(range(0, self.num_train_examples))
Example #16
0
def _train(epoch: int,
           enc: nn.Module,
           dec: nn.Module,
           disc: nn.Module,
           prior_size: int,
           dl: Iterator,
           vocab: Vocab,
           device: str,
           validate: bool = False) -> Tuple[float, float, float, float]:

    if not validate:
        enc.train()
        dec.train()
        disc.train()
    else:
        enc.eval()
        dec.eval()
        disc.eval()

    epoch_g_loss = 0.0
    epoch_ae_loss = 0.0
    epoch_disc_loss = 0.0

    strs = []
    dec_strs = []

    n_batches = len(dl)

    for batch_idx, batch in enumerate(dl):

        seq = batch.text
        seq = seq[1:]

        label = batch.label
        label = to_onehot(label, 2, device)

        (seq_len, batch_size) = seq.shape

        batch_zeros = torch.zeros((batch_size, 1)).to(device)
        batch_ones = torch.ones((batch_size, 1)).to(device)

        # ======== train/validate Discriminator ========

        if not validate:
            enc.zero_grad()
            disc.zero_grad()

        z = torch.randn((batch_size, prior_size)).to(device)
        z_label = to_onehot(
            torch.randint(0, 2, (batch_size, )).long(), 2, device)

        latent = enc(seq)
        fake_pred = disc(latent, label)
        true_pred = disc(z, z_label)

        fake_loss = F.binary_cross_entropy_with_logits(fake_pred, batch_zeros)
        true_loss = F.binary_cross_entropy_with_logits(true_pred, batch_ones)

        disc_loss = 0.5 * (fake_loss + true_loss)

        if not validate:
            disc_loss.backward()
            disc.optim.step()

        # ======== train/validate Autoencoder ========

        if not validate:
            enc.zero_grad()
            dec.zero_grad()
            disc.zero_grad()

        latent = enc(seq)
        x = torch.zeros(1, batch_size).to(device).long() + vocab.stoi['<sos>']

        h = None

        output = None

        for i in range(seq_len):
            o, h = dec(x, latent, h, label)
            x = seq[i].view(1, -1)
            output = o if output is None else torch.cat((output, o), 0)

        ae_loss = F.nll_loss(output, seq.view(-1))

        fake_pred_z = disc(latent, label)

        enc_loss = F.binary_cross_entropy_with_logits(fake_pred_z, batch_ones)

        g_loss = ae_loss + enc_loss

        if not validate:
            g_loss.backward()
            dec.optim.step()
            enc.optim.step()

        # ----------------------------------------------------

        epoch_g_loss += g_loss.item()
        epoch_ae_loss += ae_loss.item()
        epoch_disc_loss += disc_loss.item()

        _, w_idxs = output.topk(1, dim=1)
        dec_seq = w_idxs.view(seq_len, batch_size)

        strs.extend(seq_to_str(seq.detach(), vocab))
        dec_strs.extend(seq_to_str(dec_seq.detach(), vocab))

    epoch_g_loss /= n_batches
    epoch_ae_loss /= n_batches
    epoch_disc_loss /= n_batches

    bleu = moses_multi_bleu(np.array(dec_strs), np.array(strs))

    mode = 'Valid' if validate else 'Train'

    print(
        "Epoch {:3} {:5}: BLEU: {:.2f}, AE: {:.5f}, G: {:.5f}, D: {:.5f} at {}"
        .format(epoch, mode, bleu, epoch_ae_loss, epoch_g_loss,
                epoch_disc_loss,
                datetime.now().strftime("%H:%M:%S")))

    return epoch_ae_loss, epoch_g_loss, epoch_disc_loss, bleu
Example #17
0
def run_model(model,
              encoder,
              batch,
              target_vocab,
              teach_rate,
              device,
              verbose=False):
    target_vocab_size = target_vocab.size
    eos_id = target_vocab.eos_token_id
    if captioning:
        images, tgt_sents, lengths = batch
        ret = {
            'images': images,
            'tgt_sents': tgt_sents,
        }
    else:
        src_sents, tgt_sents = batch['source_text_ids'], batch[
            'target_text_ids']
        src_sents = torch.tensor(src_sents, dtype=torch.long, device=device)
        tgt_sents = torch.tensor(tgt_sents, dtype=torch.long, device=device)
        ret = {
            'src_sents': src_sents,
            'tgt_sents': tgt_sents,
        }
    batch_size = tgt_sents.shape[0]

    if train_config.enable_cross_entropy:
        ret['ce'] = {}

        if captioning:
            src = encoder(images)
            src = src.detach()
        else:
            src = src_sents

        if train_config.enable_xe:
            logits_xe = model(src, tgt_sents[:, :-1])
            tgt_sents_ = tgt_sents[:, 1:]
            flatten_logits_xe = logits_xe.contiguous().view(
                -1, logits_xe.shape[-1])
            flatten_tgt_sents_ = tgt_sents_.contiguous().view(-1)
            xel = criterion_cross_entropy(flatten_logits_xe,
                                          flatten_tgt_sents_)

            ret['ce']['xe'] = {
                'logits': logits_xe,
                'loss': xel,
            }

        else:
            xel = 0.

        if train_config.enable_pg:
            ret['ce']['pg'] = {}

            def seq_tolist(ids):
                a = ids.tolist()
                try:
                    return a[:a.index(eos_id)]
                except ValueError:
                    return a

            def tolist(ids):
                return list(map(seq_tolist, ids.cpu().numpy()))

            if hasattr(train_config,
                       'sample_baseline') and train_config.sample_baseline:

                def tile_batch(a, m):
                    shape = list(a.size())
                    a = a.unsqueeze(1).repeat(
                        *[m if d == 1 else 1 for d in range(len(shape) + 1)])
                    a = a.contiguous().view(*([-1] + shape[1:]))
                    return a

                def untile_batch(a, m):
                    shape = list(a.size())
                    return a.view(*([-1, m] + shape[1:]))

                src_ = tile_batch(src, train_config.sample_baseline)
                tgt_sents_ = tile_batch(tgt_sents,
                                        train_config.sample_baseline)
                ids_sample, logprobs_sample = model(
                    src_,
                    tgt_sents_[:, :-1],
                    max_decode_length=train_config.max_decode_length,
                    beam=-1)

                seq_sample = tolist(ids_sample)
                seq_target = tolist(tgt_sents_[:, 1:])

                rewards = []
                for seq_s, seq_t in zip(seq_sample, seq_target):
                    rewards.append(sentence_bleu([seq_t], seq_s))
                rewards = torch.tensor(rewards, device=device)

                rewards = untile_batch(rewards, train_config.sample_baseline)
                mean_rewards = rewards.mean(1, keepdim=True)
                rewards = rewards - mean_rewards
                rewards = rewards.view(-1)

                len_sample = torch.tensor(list(map(len, seq_sample)),
                                          device=device)
                mask = torch.le(
                    torch.arange(logprobs_sample.size(1), device=device),
                    len_sample.unsqueeze(1))

                pgl = -(rewards *
                        (mask.float() * logprobs_sample).sum(1)).mean()

            else:
                ids_sample, logprobs_sample = model(
                    src,
                    tgt_sents[:, :-1],
                    max_decode_length=train_config.max_decode_length,
                    beam=-1)
                logits_greedy = model(
                    src,
                    tgt_sents[:, :-1],
                    max_decode_length=train_config.max_decode_length,
                    beam=1)
                logprobs_greedy, ids_greedy = logits_greedy.max(-1)

                seq_sample = tolist(ids_sample)
                seq_greedy = tolist(ids_greedy)
                seq_target = tolist(tgt_sents[:, 1:])

                pgl = []
                for seq_s, seq_g, seq_t, logprob_sample \
                in zip(seq_sample, seq_greedy, seq_target, logprobs_sample):
                    reward = (sentence_bleu([seq_t], seq_s) -
                              sentence_bleu([seq_t], seq_g))
                    pgl.append(reward *
                               -logprob_sample[:len(seq_sample)].sum())
                pgl = torch.stack(pgl).mean()

            ret['ce']['pg']['loss'] = pgl

        else:
            pgl = 0.

        cel = train_config.xe_w * xel + train_config.pg_w * pgl

        ret['ce'].update({
            'loss': cel,
        })

    if train_config.enable_bleu:
        tgt_sents_onehot = to_onehot(tgt_sents,
                                     target_vocab_size,
                                     dtype=torch.float)
        ret['tgt_sents_onehot'] = tgt_sents_onehot

        gamma = train_config.gamma
        if gamma == 0:
            beam = 1
        else:
            beam = 0
        max_decode_length = train_config.max_decode_length
        if max_decode_length is None:
            max_decode_length = tgt_sents.shape[1] - 1
        if random.random() < train_config.fix_teach_gap:
            n = train_config.teach_gap + train_config.teach_cont
            r = random.randrange(n)
            teach_flags = [
                not (i % n < train_config.teach_gap)
                for i in range(r, r + max_decode_length)
            ]
            #logging.info("teach flags: {}".format("".join(str(int(flag)) for flag in teach_flags)))
        else:
            teach_flags = [
                random.random() < teach_rate for i in range(max_decode_length)
            ]
        teach_flags = [True] + teach_flags

        if captioning:
            src = encoder(images)
            src = src.detach()
        else:
            src = src_sents
        logits_mb = model(src,
                          tgt_sents[:, :-1],
                          max_decode_length=train_config.max_decode_length,
                          beam=beam,
                          teach_flags=teach_flags)

        probs = F.softmax(logits_mb, dim=-1)
        probs = torch.cat([tgt_sents_onehot[:, :1], probs], dim=1)

        if hasattr(train_config, "teach_X") and not train_config.teach_X:
            X = probs
        else:
            X = []
            for t in range(probs.shape[1]):
                X.append((tgt_sents_onehot if teach_flags[t] else probs)[:, t])
            X[0] = torch.tensor(X[0], requires_grad=True)
            X = torch.stack(X, dim=1)
        gen_probs, gen_ids = X.max(-1)
        Y = tgt_sents_onehot

        def length_mask(X):
            l = X.shape[1]
            mask = [torch.ones(X.shape[0], device=device)] * 2
            for t in range(l - 1):
                mask.append(mask[-1] * (1 - X[:, t, eos_id]))
            mask = torch.stack(mask, dim=1)
            lenX = torch.sum(mask, dim=1) - 1
            return mask, lenX

        maskY, lenY = length_mask(Y)
        if train_config.soft_length_mask:
            maskX, lenX = length_mask(X)
        else:
            assert X.shape == Y.shape, "X.shape={}, Y.shape={}".format(
                X.shape, Y.shape)
            maskX, lenX = maskY, lenY

        mbl, mbls_ = criterion_bleu(tgt_sents,
                                    X,
                                    lenY,
                                    lenX,
                                    maskY,
                                    maskX,
                                    min_fn=train_config.min_fn,
                                    min_c=train_config.min_c,
                                    enable_prec=train_config.enable_prec,
                                    enable_recall=train_config.enable_recall,
                                    recall_w=train_config.recall_w,
                                    device=device,
                                    verbose=verbose)

        ret['mb'] = {
            'logits': logits_mb,
            'probs': probs,
            'gen_probs': gen_probs,
            'gen_ids': gen_ids,
            'loss': mbl,
            'mbls_': mbls_,
            'X': X,
            'Y': Y,
        }

    bleu_w = train_config.bleu_w
    if bleu_w == 0.:
        loss = cel
    elif bleu_w == 1.:
        loss = mbl
    else:
        loss = (1. - bleu_w) * cel + bleu_w * mbl

    ret['loss'] = loss

    return ret
Example #18
0
    def __initializeTrainData(self, frac_positives):
        k = self.window  # for brevity
        lengthIndels = len(self.indelLocations)
        num_negatives = int((1./frac_positives-1) * lengthIndels)
        total_length = lengthIndels + num_negatives
        dataset = np.zeros((total_length, 2*k + 1, 4))
        coverageDataset = np.zeros((total_length, 2*k + 1))
        entropyDataset = np.zeros((total_length, 2*k + 1))
        recombinationDataset = np.zeros((total_length, 1))
        #recombinationDataset = np.zeros((total_length, 2*k + 1))
        if self.triclass:
            labeltype = np.uint8
        else:
            labeltype = np.bool
        labels = np.zeros(total_length, dtype=labeltype)
        genome_positions = np.zeros(total_length, dtype=np.uint32)
        num_negatives = int((1./frac_positives-1) * lengthIndels)

        # dataset should have all the indels as well as random negative training samples
        if self.nearby:
            neg_positions = np.random.choice(self.indelLocations, size=num_negatives)
            self.nearby_indels = neg_positions
            offset = np.multiply(np.random.randint(1, self.nearby+1, size=num_negatives), np.random.choice([-1, 1], size=num_negatives))
            neg_positions = neg_positions + offset  # locations that are offset from indels by some amount
            self.indices = neg_positions
        else:
            neg_positions = np.random.choice(self.nonzeroLocationsRef, size=num_negatives)
            self.indices = neg_positions
            self.nearby_indels = neg_positions  # to prevent error if this is undefined
        for i in range(lengthIndels + num_negatives):
            if i < lengthIndels:
                if not self.triclass:
                    label = 1  # standard binary classification labels
                elif i < len(self.insertionLocations):
                    label = 1  # insertions will be labeled as 1
                else:
                    label = 2  # deletions will be labeled as 2
                pos = self.indelLocations[i]
            else:
                label = 0
                pos = neg_positions[i - lengthIndels]
                if self.nearby:
                    niter = 0
                    while (pos in self.prevChosenRefLocations) or (pos in self.setOfZeroLocations) or (pos in self.setOfIndelLocations) and niter < 1001:
                        self.nearby_indels[i - lengthIndels] = np.random.choice(self.indelLocations)
                        pos = self.nearby_indels[i - lengthIndels] + np.random.randint(1, self.nearby+1) * np.random.choice([-1, 1])
                        niter += 1
                else:
                    while (pos in self.prevChosenRefLocations) or (pos in self.setOfIndelLocations):
                        pos = np.random.choice(self.nonzeroLocationsRef)
                self.indices[i - lengthIndels] = pos
                self.prevChosenRefLocations.add(pos)
            coverageWindow = np.zeros(2*k + 1)
            # get k base pairs before and after the position
            window = self.referenceChr[pos - k : pos + k + 1]
            coverageWindow = None
            if self.coverage is not None:
                coverageWindow = utils.flatten(self.coverage[pos - k : pos + k + 1])
            recombWindowAverage = None
            if self.recombination is not None:
                recombWindow = np.zeros((2*k + 1, 1))
                recombWindowIndices = np.arange(pos - k, pos + k + 1).reshape((2*k + 1, 1))
                recombInBounds = recombWindowIndices[np.where(recombWindowIndices < len(self.recombination))]
                recombWindow[recombInBounds - (pos - k)] = self.recombination[recombInBounds]
                recombOutOfBounds = recombWindowIndices[np.where(recombWindowIndices >= len(self.recombination))]
                recombWindow[recombOutOfBounds - (pos - k)] = self.recombination[-1]
                recombWindowAverage = np.mean(recombWindow)
                #recombWindowAverage = utils.flatten(recombWindow)
            dataset[i] = window
            coverageDataset[i] = coverageWindow
            recombinationDataset[i] = recombWindowAverage
            labels[i] = label
            genome_positions[i] = pos
        self.indices = np.concatenate((self.indelLocations, self.indices))
        self.nearby_indels = np.concatenate((self.indelLocations, self.nearby_indels))
        if self.load_entropy:
            entropyDataset[:, k+1:2*k+1] = entropy.entropyVector(dataset)
        rawZipped = zip(list(dataset), list(coverageDataset), list(labels), list(genome_positions), list(self.indices), list(self.nearby_indels), list(entropyDataset), list(recombinationDataset))
        # Shuffle the list
        np.random.shuffle(rawZipped)
        a, b, c, d, e, f, g, h = zip(*rawZipped)
        dataset = np.array(a)
        coverageDataset = np.array(b)
        entropyDataset = np.array(g)
        recombinationDataset = np.array(h)
        labels = np.array(c, dtype=labeltype)
        genome_positions = np.array(d, dtype=np.uint32)
        self.indices = np.array(e, dtype=np.uint32)
        self.nearby_indels = np.array(f, dtype=np.uint32)
        self.dataset = dataset
        self.coverageDataset = coverageDataset
        self.entropyDataset = entropyDataset
        self.recombinationDataset = recombinationDataset
        if self.triclass:
            self.labels = utils.to_onehot(labels, 3)
        else:
            self.labels = np.expand_dims(labels, axis=1)
        self.genome_positions = genome_positions
        self.num_train_examples = int(round(total_length * (1-self.test_frac)))
        self.ordering = list(range(0, self.num_train_examples))
Example #19
0
  def __initializeTrainData(self, frac_positives):
    ##
    # for brevity
    k = self.window
    # The window size used to compute sequence complexity
    k_seq_complexity = 20
    # We use chromosomes 2-22, we won't use chromosome 1 until the very end
    num_chrom_used = 21
    ##
    # Number of indels in the entire dataset used to train/test/val
    lengthIndels = 25000*num_chrom_used
    # Number of non-indels in the entire dataset
    num_negatives = int(int((1./frac_positives-1) * lengthIndels)/num_chrom_used)*num_chrom_used
    # Number of locations in the entire dataset
    total_length = lengthIndels + num_negatives
    ##
    # Number of indels in the entire dataset per chromosome
    num_negatives_per_chrom = int(num_negatives/num_chrom_used)
    # Number of non-indels in the entire dataset per chromosome
    lengthIndels_per_chrom = int(lengthIndels/num_chrom_used)
    # Number of locations in the entire dataset per chromosome
    total_length_per_chrom = lengthIndels_per_chrom + num_negatives_per_chrom
    ##
    # one-hot encoded sequences of size 2*k + 1 around each location
    dataset = np.zeros((total_length, 2*k + 1, 4))
    # coverage corresponding to each location in the dataset
    coverageDataset = np.zeros((total_length, 2*k + 1))
    # entropy of expanding windows in the dataset
    entropyDataset = np.zeros((total_length, 2*k + 1))
    # indices on the genome of the locations in the dataset
    indices = np.zeros(total_length, dtype=np.uint32)
    # allele count values for indels, 0 for non-indels
    allele_count = np.zeros(total_length, dtype=np.uint32)
    nearby_indels = np.zeros(total_length, dtype=np.uint32)
    # label is either a bool or an int depending on the number of classes
    if self.triclass:
      labeltype = np.uint8
    else:
      labeltype = np.bool
    # 0 for non-indels 1 (and 2) in case of indels
    labels = np.zeros(total_length, dtype=labeltype)
    # seems to be the same as indices; ToDo: does it need to be there?
    genome_positions = np.zeros(total_length, dtype=np.uint32)
    # the chromosome number corresponding to each location
    chrom_num = np.zeros(total_length, dtype=np.uint32)
    # Test the number of indels in a non-indel window, as well as multiple indel in a single indel window
    num_indel_neg_set = 0
    num_indel_pos_set = 0

    # Load data from chromosomes 2-22
    # populate dataset and related variables per chromosome
    for chromosome in range(2, 23):
      ##
      # Load the chromosome from the full genome
      referenceChr = self.referenceChrFull[str(chromosome)]
      ## Load and process the positive (indels) dataset
      # This is a 4 column data: indel locations, allele count, filter value, insertion (1) or deletion (0)
      indel_data_load = np.load(data_dir + "indelLocationsFiltered" + str(chromosome) + ".npy")
      indel_indices_set = set(np.array(indel_data_load[:, 0], dtype = int))
      indel_data_load = indel_data_load[indel_data_load[:, 0] + k < referenceChr.shape[0]]
      # Remove those that have complexity below the threshold
      indel_sequence_indices = np.arange(2*k_seq_complexity + 1) - k_seq_complexity
      indel_sequence_indices = np.repeat(indel_sequence_indices, indel_data_load.shape[0], axis = 0)
      indel_sequence_indices = np.reshape(indel_sequence_indices, [-1, indel_data_load.shape[0]])
      indel_sequence_indices += np.transpose(np.array(indel_data_load[:, 0], dtype = int))
      indel_sequence_complexity = entropy.entropySequence(referenceChr[indel_sequence_indices.transpose(), :])
      del indel_sequence_indices
      # Filter by sequence complexity and filter value around 20 sized window and complexity threshold
      total_indices = np.arange(indel_data_load.shape[0])
      filtered_indices = np.logical_and(indel_data_load[:, 2] == 1, indel_sequence_complexity >= self.complexity_threshold)
      # Add an additional filter for allele count = 1
      filtered_indices = np.logical_and(indel_data_load[:, 1] == 1, filtered_indices)

      # Sample the indels, taking into consideration the classification problem in hand
      if self.triclass:
        filtered_indices_insert = np.logical_and(indel_data_load.iloc[:, 3] == 1, filtered_indices)
        filtered_indices_insert = total_indices[filtered_indices_insert]
        filtered_indices_delete = np.logical_and(indel_data_load.iloc[:, 3] == 0, filtered_indices)
        filtered_indices_delete = total_indices[filtered_indices_delete]
        insertionLocations = np.random.choice(filtered_indices_insert, size = int(lengthIndels_per_chrom/2), replace = False)
        deletionLocations = np.random.choice(filtered_indices_delete, size = lengthIndels_per_chrom - int(lengthIndels_per_chrom/2), replace = False)
        indel_indices = np.concatenate((insertionLocations, deletionLocations))
        del filtered_indices_insert, filtered_indices_delete, insertionLocations, deletionLocations
      else:
        filtered_indices = total_indices[filtered_indices]
        indel_indices = np.random.choice(filtered_indices, size = lengthIndels_per_chrom, replace = False)
      ##
      indelLocations = np.array(indel_data_load[indel_indices, 0], dtype = int)
      allele_count_val = indel_data_load[indel_indices, 1]
      del indel_data_load, indel_indices, filtered_indices, total_indices
      indelLocations = indelLocations - self.offset

      ## Load the coverage data if needed
      coverage = None
      if self.load_coverage:
        coverage = lc.load_coverage(data_dir + "coverage/{}.npy".format(chromosome))

      ## Create the negative dataset
      rel_size_neg_large = 2
      neg_positions_large = np.load(data_dir + "nonindelLocationsSampled" + str(chromosome) + '.npy')
      neg_positions_large = np.random.choice(neg_positions_large, size = rel_size_neg_large*num_negatives_per_chrom, replace = False)
      # Remove those that have complexity below the threshold
      neg_sequence_indices = np.arange(2*k_seq_complexity + 1) - k_seq_complexity
      neg_sequence_indices = np.repeat(neg_sequence_indices, len(neg_positions_large), axis = 0)
      neg_sequence_indices = np.reshape(neg_sequence_indices, [-1, len(neg_positions_large)])
      neg_sequence_indices += np.transpose(neg_positions_large)
      neg_sequence_complexity = entropy.entropySequence(referenceChr[neg_sequence_indices.transpose(), :])
      neg_positions_large = neg_positions_large[neg_sequence_complexity >= self.complexity_threshold]
      del neg_sequence_indices, neg_sequence_complexity
      ##
      if self.nearby:
        # Create a list of all permissible nearby locations
        nearby_locations = np.arange(-self.nearby, self.nearby + 1)
        nearby_locations = np.repeat(nearby_locations, len(indelLocations), axis = 0)
        nearby_locations = np.reshape(nearby_locations, [-1, len(indelLocations)])
        nearby_locations += np.transpose(indelLocations)
        nearby_locations = np.reshape(nearby_locations, -1)
        # Remove all indel locations and low-complexity non-indel locations from nearby locations
        nearby_locations = np.array((set(nearby_locations) - set(indelLocationsFull)) & set(neg_positions_large))
        if len(nearby_locations) >= num_negatives_per_chrom:
          neg_positions = np.random.choice(nearby_locations, size = num_negatives_per_chrom, replace = False)
        else:
          # Else sample the remaining from the negative positions- this is the best that can be done, try increasing the nearby size
          print "Try increasing nearby or rel_size_neg_large. Not enough nearby-non-indels could be sampled in chromosome {}".format(chromosome)
          num_neg_needed = num_negatives_per_chrom - len(nearby_locations)
          not_nearby = np.random.choice(list((set(neg_positions_large) - set(indelLocationsFull)) - set(nearby_locations)), size = num_neg_needed, replace = False)
          neg_positions = np.concatenate((nearby_locations, not_nearby))
      else:
        neg_positions = np.random.choice(neg_positions_large, size = num_negatives_per_chrom, replace = False)

      for i in range(lengthIndels_per_chrom + num_negatives_per_chrom):
        if i < lengthIndels_per_chrom:
          if not self.triclass:
            label = 1 # standard binary classification labels
          elif i < int(lengthIndels_per_chrom/2):
            label = 1 # insertions will be labeled as 1
          else:
            label = 2 # deletions will be labeled as 2
          pos = indelLocations[i]
          allele_count[total_length_per_chrom*(chromosome - 2) + i] = allele_count_val[i]
          num_indel_pos_set += len(indel_indices_set & set(range(pos - k, pos + k + 1)))
        else:
          label = 0
          pos = neg_positions[i - lengthIndels_per_chrom]
          # Compute the true value of nearby_indels TODO
          #if self.nearby:
          num_indel_neg_set += len(indel_indices_set & set(range(pos - k, pos + k + 1)))
        indices[total_length_per_chrom*(chromosome - 2) + i] = pos
        coverageWindow = np.zeros(2*k + 1)
        # get k base pairs before and after the position
        window = referenceChr[pos - k : pos + k + 1]
        if coverage is not None:
          coverageWindow += np.mean(utils.flatten(coverage[pos - k : pos + k + 1]))#= utils.flatten(coverage[pos - k : pos + k + 1])
        dataset[total_length_per_chrom*(chromosome - 2) + i] = window
        coverageDataset[total_length_per_chrom*(chromosome - 2) + i] = coverageWindow
        labels[total_length_per_chrom*(chromosome - 2) + i] = label
        genome_positions[total_length_per_chrom*(chromosome - 2) + i] = pos
        chrom_num[total_length_per_chrom*(chromosome - 2) + i] = chromosome
    if self.load_entropy:
      entropyDataset[:, k+1:2*k+1] = entropy.entropyVector(dataset)
    ##
    # Randomly choose the validation and test chromosome
    self.val_chrom, self.test_chrom = np.random.choice(range(2, 23), 2, replace=False)
    # Set the number of training examples, and the respective set indices
    total_indices = np.arange(total_length)
    self.num_train_examples = total_length_per_chrom*(num_chrom_used - 2)
    self.train_indices = total_indices[np.logical_and(chrom_num != self.val_chrom, chrom_num != self.test_chrom)]
    self.test_indices = total_indices[chrom_num == self.test_chrom]
    self.val_indices = total_indices[chrom_num == self.val_chrom]
    ##
    # Set the respective variables
    self.dataset = dataset
    self.coverageDataset = coverageDataset
    self.entropyDataset = entropyDataset
    self.indices = indices
    self.allele_count = allele_count
    self.nearby_indels = nearby_indels
    self.genome_positions = genome_positions
    if self.triclass:
      self.labels = utils.to_onehot(labels, 3)
    else:
      self.labels = np.expand_dims(labels, axis=1) # Make labels n by 1 (for convenience)
    del dataset, coverageDataset, entropyDataset, indices, allele_count, nearby_indels, genome_positions, labels
    print(num_indel_pos_set)
    print(float(num_indel_pos_set)/lengthIndels)
    print(num_indel_neg_set)
    print(float(num_indel_neg_set)/num_negatives)
    print(np.mean(np.mean(self.coverageDataset, axis = 1)))
    print(np.mean(np.var(self.coverageDataset, axis = 1)))
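Example #20
0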
def fit(model, train_encoded, val_encoded, config):
    """
    Fit the model's weights and save the training and validation losses on the model
    :param model: nn.Module
    :param train_encoded: Encoded training data
    :param val_encoded: Encoded validation data
    :param config: dict with settings
    :return:
    """
    n_songs_train = len(train_encoded)
    n_songs_val = len(val_encoded)

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=config["LR"], weight_decay=config["WEIGHT_DECAY"])

    for epoch in range(1, config["EPOCHS"] + 1):
        train_loss = 0

        # Enter train mode to activate Dropout and Batch Normalization layers
        model.train()

        # Shuffle songs for each epoch
        random.shuffle(train_encoded)
        for i, song in enumerate(train_encoded):
            # Reset state for each song
            model.init_state()

            song_loss = 0
            n = 0  # Number of chunks made from song
            for seq, target in SlidingWindowLoader(song, window=config["CHUNK_SIZE"]):

                # Chunks is sometimes empty
                if len(seq) == 0:
                    continue
                n += 1

                # One-hot encode chunk tensor
                input_onehot = to_onehot(seq, config["VOCAB_SIZE"])

                optimizer.zero_grad()  # Reset gradient for every forward
                output = model(input_onehot.unsqueeze(1))  # Size = (chunk_length, batch, vocab_size)
                output.squeeze_(1)  # Back to 2D
                chunk_loss = criterion(output, target.long())
                chunk_loss.backward()
                optimizer.step()
                song_loss += chunk_loss.item()
            train_loss += song_loss / n
            if i % 100 == 0:
                print("Song: {}, AvgTrainLoss: {}".format(i, train_loss / (i + 1)))

        # Append average training loss for this epoch
        model.training_losses.append(train_loss / n_songs_train)

        # Generate a song at this epoch
        song = sample(model, "$", config)
        print("{}\n{}\n{}".format("-" * 40, song, "-" * 40))

        # Validation
        with torch.no_grad():
            print("Validating")
            model.eval()  # Turns of Dropout and BatchNormalization
            val_loss = 0

            for song in val_encoded:
                # Reset state
                model.init_state()

                song_loss = 0
                n = 0
                for seq, target in SlidingWindowLoader(song, window=config["CHUNK_SIZE"]):
                    # Chunks is sometimes empty
                    if len(seq) == 0:
                        continue
                    n += 1

                    # One-hot encode chunk tensor
                    input_onehot = to_onehot(seq, config["VOCAB_SIZE"])

                    output = model(input_onehot.unsqueeze(1))  # Size = (chunk_length, batch, vocab_size)
                    output.squeeze_(1)  # Back to 2D
                    song_loss += criterion(output, target.long()).item()
                val_loss += song_loss / n
            model.validation_losses.append(val_loss / n_songs_val)
            print("Epoch {}, Training loss: {}, Validation Loss: {}".format(epoch, model.training_losses[-1],
                                                                            model.validation_losses[-1]))
Example #21
0
def main():
    args = parse_arguments()
    use_cuda = torch.cuda.is_available()

    # visdom for plotting
    vis = Visdom()
    win_g, win_d, win_w = None, None, None
    assert vis.check_connection()

    # load datasets
    print("[!] preparing dataset...")
    TEXT = Field(lower=True, fix_length=args.seq_len,
                 tokenize=list, batch_first=True)
    LABEL = Field(sequential=False)
    train_data, test_data = IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data)
    LABEL.build_vocab(train_data)
    train_iter, test_iter = BucketIterator.splits(
            (train_data, test_data), batch_size=args.batch_size, repeat=True)
    vocab_size = len(TEXT.vocab)
    print("[TRAIN]:%d (dataset:%d)\t[TEST]:%d (dataset:%d)\t[VOCAB]:%d"
          % (len(train_iter), len(train_iter.dataset),
             len(test_iter), len(test_iter.dataset), vocab_size))

    # instantiate models
    G = Generator(dim=512, seq_len=args.seq_len, vocab_size=vocab_size)
    D = Discriminator(dim=512, seq_len=args.seq_len, vocab_size=vocab_size)
    optim_G = optim.Adam(G.parameters(), lr=args.lr, betas=(0.5, 0.9))
    optim_D = optim.Adam(D.parameters(), lr=args.lr, betas=(0.5, 0.9))

    global one, mone
    one = torch.FloatTensor([1])
    mone = one * -1
    if use_cuda:
        G, D = G.cuda(), D.cuda()
        one, mone = one.cuda(), mone.cuda()

    train_iter = iter(train_iter)
    batch_size = args.batch_size
    for b in range(1, args.batchs+1):
        # (1) Update D network
        for p in D.parameters():  # reset requires_grad
            p.requires_grad = True
        for iter_d in range(args.critic_iters):  # CRITIC_ITERS
            batch = next(train_iter)
            text, label = batch.text, batch.label
            text = to_onehot(text, vocab_size)
            if use_cuda:
                text = text.cuda()
            real = Variable(text)
            d_loss, wasserstein = train_discriminator(
                    D, G, optim_D, real, args.lamb, batch_size, use_cuda)
        # (2) Update G network
        for p in D.parameters():
            p.requires_grad = False  # to avoid computation
        g_loss = train_generator(D, G, optim_G, batch_size, use_cuda)

        # plot losses on visdom
        win_d = plot('Discriminator Loss', vis,
                     x=b, y=d_loss.data[0], win=win_d)
        win_g = plot('Generator Loss', vis,
                     x=b, y=g_loss.data[0], win=win_g)
        win_w = plot('Wasserstein Distance', vis,
                     x=b, y=wasserstein.data[0], win=win_w)

        if b % 500 == 0 and b > 1:
            samples = sample(G, TEXT, 1, args.seq_len, vocab_size, use_cuda)
            print("[%d] D:%5.2f G:%5.2f W:%5.2f \nsample:%s \t [%d]" %
                  (b, d_loss.data[0], g_loss.data[0], wasserstein.data[0],
                   samples[0], label.data[0]))
            log_sample("Sample %d" % b, vis, samples)
        if b % 5000 == 0 and b > 1:
            print("[!] saving model")
            if not os.path.isdir(".save"):
                os.makedirs(".save")
            torch.save(G.state_dict(), './.save/wgan_g_%d.pt' % (b))
            torch.save(D.state_dict(), './.save/wgan_d_%d.pt' % (b))
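
In this snippet `to_onehot` expands a `(batch, seq_len)` tensor of token ids into `(batch, seq_len, vocab_size)` so the discriminator sees a continuous representation of the text. If that helper is not available, `torch.nn.functional.one_hot` gives an equivalent result; the sketch below is an assumed drop-in, not the original function.

import torch
import torch.nn.functional as F

def sequence_to_onehot(token_ids, vocab_size):
    """token_ids: (batch, seq_len) integer tensor -> (batch, seq_len, vocab_size) float tensor."""
    return F.one_hot(token_ids.long(), num_classes=vocab_size).float()

batch = torch.randint(0, 100, (4, 16))    # 4 sequences of 16 token ids
real = sequence_to_onehot(batch, 100)
print(real.shape)                         # torch.Size([4, 16, 100])
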
Example #22
0
        mu, logstd = lz_x[:, :n_z], lz_x[:, n_z:]
        lz_x = zs.Normal('z', mu, logstd, n_samples=1, group_event_ndims=1)
    return encoder, lz_x,


if __name__ == "__main__":

    casia_online = 10
    casia_offline_reverse = 10
    tf.set_random_seed(1234)
    np.random.seed(1234)
    if not args.code:
        if args.dataset == 'hand':
            x_train, x_test, t_train, t_test = dataset.hccr_onehot_hand_64(n_y, sample_num)
            t_train, t_test = \
                utils.to_onehot(t_train, n_y), utils.to_onehot(t_test, n_y)
        elif args.dataset == 'standard':
            x_train, x_test, t_train, t_test = dataset.hccr_onehot_standard_64(n_y, sample_num)
            t_train, t_test = \
                utils.to_onehot(t_train, n_y), utils.to_onehot(t_test, n_y)
        elif args.dataset == 'casia-online':
            x_train, x_test, t_train, t_test = dataset.hccr_onehot_casia_online_64(n_y, sample_num)
            t_train, t_test = \
                utils.to_onehot(t_train, n_y), utils.to_onehot(t_test, n_y)
        elif args.dataset == 'casia-offline':
            x_train, x_test, t_train, t_test = dataset.hccr_onehot_casia_offline_64(n_y, sample_num)
            t_train, t_test = \
                utils.to_onehot(t_train, n_y), utils.to_onehot(t_test, n_y)
        else:
            raise ValueError('Only have dataset: hand, standard, casia-online, casia-offline')
    else:
Example #23
0
  def __initializeTrainData(self, frac_positives):
    k = self.window # for brevity
    lengthIndels = len(self.indelLocations) # Total number of indels
    num_negatives = int((1./frac_positives-1) * lengthIndels) # Total number of negative training examples we need, based on the desired fraction of positive examples
    total_length = lengthIndels + num_negatives # Total number of examples [both training and testing!]
    dataset = np.zeros((total_length, 2*k + 1, 4))
    coverageDataset = np.zeros((total_length, 2*k + 1))
    entropyDataset = np.zeros((total_length, 2*k + 1))
    recombinationDataset = np.zeros((total_length, 1))
    #recombinationDataset = np.zeros((total_length, 2*k + 1))
    if self.triclass:
      labeltype = np.uint8 # Three distinct labels in this case
    else:
      labeltype = np.bool_  # plain boolean labels (np.bool is a removed alias in newer NumPy)
    labels = np.zeros(total_length, dtype=labeltype)
    genome_positions = np.zeros(total_length, dtype=np.uint32)

    # dataset should have all the indels as well as random negative training samples
    if self.nearby:
      neg_positions = np.random.choice(self.indelLocations, size=num_negatives) # First choose a random number of examples among known indels
      self.nearby_indels = neg_positions # Store the locations of these selected indels
      offset = np.multiply(np.random.randint(1, self.nearby+1, size=num_negatives), np.random.choice([-1, 1], size=num_negatives)) # Offset by a random nonzero amount <= to self.nearby
      neg_positions = neg_positions + offset # These locations that are offset from indels by some amount are [roughly] our negative examples; but see for loop below
    else:
      neg_positions = np.random.choice(self.nonzeroLocationsRef, size=num_negatives) # Select random nonzero locations from the reference genomes
      self.nearby_indels = neg_positions # to prevent error if this is undefined (value should not be used as it is meaningless in this case)
    self.indices = neg_positions # Locations of the negative training examples
    for i in range(lengthIndels + num_negatives): # Loop over all examples
      if i < lengthIndels: # Positive example
        if not self.triclass:
          label = 1 # standard binary classification labels
        elif i < len(self.insertionLocations):
          label = 1 # insertions will be labeled as 1
        else:
          label = 2 # deletions will be labeled as 2
        pos = self.indelLocations[i]
      else: # Negative example (not an indel)
        label = 0
        pos = neg_positions[i - lengthIndels] # Get corresponding entry of neg_positions, which stores the tentative positions of all negative examples. However, we may need to update this position. We still predefine them and update if needed simply for efficiency's sake.
        if self.nearby: # Position must be near a known indel
          niter = 0  # Bound the number of resampling attempts to avoid an infinite loop (ideally the final position should still be validated even after hitting this limit)
          while ((pos in self.prevChosenRefLocations) or (pos in self.setOfZeroLocations) or (pos in self.setOfIndelLocations)) and niter < 1001:
            # Avoid choosing an already selected position, a zero location (unknown reference base), or an actual indel
            self.nearby_indels[i - lengthIndels] = np.random.choice(self.indelLocations) # Select again using the same procedure, until we get a valid negative example
            pos = self.nearby_indels[i - lengthIndels] + np.random.randint(1, self.nearby+1) * np.random.choice([-1, 1])
            niter += 1
        else: # Position simply just has to not be previously selected, and not a positive (i.e. indel) example
          while (pos in self.prevChosenRefLocations) or (pos in self.setOfIndelLocations):
            pos = np.random.choice(self.nonzeroLocationsRef)
        self.indices[i - lengthIndels] = pos # True position of the negative example
        self.prevChosenRefLocations.add(pos) # Store this position, so we don't reuse it
      # get the k base pairs before and after the position, and the position itself
      window = self.referenceChr[pos - k : pos + k + 1]
      coverageWindow = None # Coverage window corresponding to the input base pairs (loaded only if necessary)
      if self.coverage is not None:
        coverageWindow = utils.flatten(self.coverage[pos - k : pos + k + 1])
      recombWindowAverage = None
      if self.recombination is not None: # Recombination window, if needed
        recombWindow = np.zeros((2*k + 1, 1))
        recombWindowIndices = np.arange(pos - k, pos + k + 1).reshape((2*k + 1, 1))
        recombInBounds = recombWindowIndices[np.where(recombWindowIndices < len(self.recombination))]
        recombWindow[recombInBounds - (pos - k)] = self.recombination[recombInBounds]
        recombOutOfBounds = recombWindowIndices[np.where(recombWindowIndices >= len(self.recombination))]
        recombWindow[recombOutOfBounds - (pos - k)] = self.recombination[-1] 
        recombWindowAverage = np.mean(recombWindow)
        #recombWindowAverage = utils.flatten(recombWindow)
      dataset[i] = window # Store the data for this example in the overall data structure
      coverageDataset[i] = coverageWindow
      recombinationDataset[i] = recombWindowAverage
      labels[i] = label
      genome_positions[i] = pos # This might be the same as self.indices?
    self.indices = np.concatenate((self.indelLocations, self.indices)) # Indices for positive examples are simply in self.indelLocations
    self.nearby_indels = np.concatenate((self.indelLocations, self.nearby_indels)) # "Nearby" indel for a positive example is the indel itself
    if self.load_entropy:
      entropyDataset[:, k+1:2*k+1] = entropy.entropyVector(dataset) # Create the entropy vectors, if needed
    rawZipped = list(zip(list(dataset), list(coverageDataset), list(labels), list(genome_positions), list(self.indices), list(self.nearby_indels), list(entropyDataset), list(recombinationDataset)))  # materialize so it can be shuffled in place (zip is a lazy iterator in Python 3)
    # Shuffle the data
    np.random.shuffle(rawZipped)
    a, b, c, d, e, f, g, h = zip(*rawZipped)
    dataset = np.array(a)
    coverageDataset = np.array(b)
    entropyDataset = np.array(g)
    recombinationDataset = np.array(h)
    labels = np.array(c, dtype=labeltype)
    genome_positions = np.array(d, dtype=np.uint32)
    self.indices = np.array(e, dtype=np.uint32)
    self.nearby_indels = np.array(f, dtype=np.uint32)
    self.dataset = dataset
    self.coverageDataset = coverageDataset
    self.entropyDataset = entropyDataset
    self.recombinationDataset = recombinationDataset
    if self.triclass:
      self.labels = utils.to_onehot(labels, 3)
    else:
      self.labels = np.expand_dims(labels, axis=1) # Make labels n by 1 (for convenience)
    self.genome_positions = genome_positions
    self.num_train_examples = int(round(total_length * (1-self.test_frac))) # Number of examples to use for training (as opposed to testing)
    self.ordering = list(range(0, self.num_train_examples)) # Order in which we go through the training examples (will be changed)
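
For the three-class case the labels are expanded with `utils.to_onehot(labels, 3)`, whose implementation is not shown here. A NumPy sketch of that kind of helper (an assumption for illustration, not the project's own `utils` module) could look like this:

import numpy as np

def to_onehot_np(labels, num_classes):
    """labels: 1-D integer array -> (len(labels), num_classes) one-hot matrix."""
    onehot = np.zeros((len(labels), num_classes), dtype=np.float32)
    onehot[np.arange(len(labels)), labels] = 1.0
    return onehot

# 0 = not an indel, 1 = insertion, 2 = deletion
labels = np.array([0, 2, 1, 1, 0], dtype=np.uint8)
print(to_onehot_np(labels, 3))
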
Example #24
0
text = """
^GG|B2B B2c BGA|B2d c2c d2B|g6 A3|BdB dBA B2d|edf ecA Bdg|gdc AAF |1 dfdd g2ge ||
d2B2 BdcB|A2Bc BdAB|cBc BAFG|A~G2 d2e2|dBc BGF|cAF G2G:|
,2D2B c2GB|A2FA FABc|c2GF A2AB|1 cded dFGA|B2de egec|
A2GG B2cA|DBcd eABA|Bcde dBAB|A2Ac d2d2|cgfe cedc|=cBce fedc|AFEA GGAF|BEFG A2AB|(c2df (ggfa | 
a4d2c|a2g'ggf edBc|c2d2e2A2|
BcdBBc d2d2|d2f2 f2ca|f2faf2d2|c2c2d2d2|f2fd g4f2|2fedc 
"""
print(text[:440])

with torch.no_grad():

    values = []
    actual_letters = []

    for c in text:
        inputs_onehot = to_onehot(torch.Tensor([char_to_idx[c]]),
                                  len(char_to_idx.keys()))
        out = model(inputs_onehot.unsqueeze(1))
        out.squeeze_(1)
        value = model.state[0].view(-1)
        values.append([value[n].item() for n in range(100)])

        # If special character
        if c == '\n':
            c = 'nl'
        elif c == ' ':
            c = 'sp'
        actual_letters.append(c)

    for idx in range(100):
        data = np.reshape([value[idx] for value in values][:440], (20, -1))
        letters = np.reshape(actual_letters[:440], (20, -1))
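
The loop above stops after reshaping each hidden unit's activations into a 20 x 22 character grid; the plotting step is not included in the snippet. A self-contained sketch of one common way to inspect such a grid, using synthetic stand-in data rather than the variables from the truncated loop, is shown below.

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
grid = rng.uniform(-1, 1, size=(20, 22))                         # stand-in for one unit's activations
chars = np.array(list("abcdefghij" * 44))[:440].reshape(20, 22)  # stand-in for the character grid

fig, ax = plt.subplots(figsize=(12, 6))
im = ax.imshow(grid, cmap='RdBu', vmin=-1, vmax=1)               # activation heatmap
for (row, col), _ in np.ndenumerate(grid):
    ax.text(col, row, chars[row, col], ha='center', va='center', fontsize=6)
fig.colorbar(im, ax=ax)
plt.show()
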
Example #25
0
    def skmeans_clustering(self, cluster_iter):
        self.target_ctr = self.source_ctr
        self.max_len = 1000
        centers = None
        self.stop = False

        batch_features = []
        batch_path = []
        for data, labels, path in iter(cluster_iter):
            data, labels = data.cuda(), labels.cuda()
            features, _ = self.base_network(data)
            batch_features += [features]
            batch_path += path
        self.target_features = torch.cat(batch_features, dim=0)
        self.clustered_targets["features"] = self.target_features
        self.clustered_targets["path"] = batch_path
        refs = utils.to_cuda(
            torch.LongTensor(range(self.num_classes)).unsqueeze(1))

        num_samples = self.target_features.size(0)
        num_split = ceil(1.0 * num_samples / self.max_len)

        # while True:
        #     print(self.stop)
        #     self.clustering_stop(centers)
        # Fixed number of clustering iterations (the convergence check above is commented out)
        for _ in range(0, 1000):
            if centers is not None:
                self.target_ctr = centers
            # if self.stop: break

            # Accumulate per-class feature sums and sample counts across the splits
            centers = 0
            count = 0
            start = 0
            ps_label = []
            dis2c = []
            for N in range(num_split):
                cur_len = min(self.max_len, num_samples - start)
                cur_feature = self.target_features.narrow(0, start, cur_len)
                dist2center = utils.cosine_distance(cur_feature,
                                                    self.target_ctr,
                                                    cross=True)
                # dis2c +=[dist2center]
                dis, pseudo_labels = torch.min(dist2center, dim=1)
                dis2c += [dis]
                ps_label += [pseudo_labels]
                # dist2center, pseudo_labels = self.assign_labels(cur_feature)
                labels_onehot = utils.to_onehot(pseudo_labels,
                                                self.num_classes)
                count += torch.sum(labels_onehot, dim=0)
                pseudo_labels = pseudo_labels.unsqueeze(
                    0)  #.expand(self.num_classes, -1)
                mask = (pseudo_labels == refs).unsqueeze(2).type(
                    torch.cuda.FloatTensor)
                reshaped_feature = cur_feature.unsqueeze(0)
                # update centers
                centers += torch.sum(reshaped_feature * mask, dim=1)
                start += cur_len

            self.clustered_targets["ps_label"] = torch.cat(ps_label, dim=0)
            self.clustered_targets["Dis2C"] = torch.cat(dis2c, dim=0)

            # Classes with no assigned samples fall back to the source-domain centers
            mask = (count.unsqueeze(1) > 0).type(torch.cuda.FloatTensor)
            centers = mask * centers + (1 - mask) * self.source_ctr
        del self.clustered_targets["features"]
        del mask, pseudo_labels, labels_onehot, ps_label, dis2c, dis, dist2center, cur_feature, reshaped_feature