Example #1
File: tasks.py Project: jim-bo/dimwit
import numpy as np
from sklearn.decomposition import PCA, FastICA
from sklearn.manifold import TSNE
from bson.objectid import ObjectId


def dim_survey(X, entry_id):

    # convert the input to a numpy array
    X = np.array(X)

    # run the reduction.
    X_pca = PCA(n_components=3).fit_transform(X)
    X_tsne = TSNE(n_components=3).fit_transform(X)
    X_ica = FastICA(n_components=3).fit_transform(X)

    # connect to the db (mongoctx is a project-specific MongoDB context manager).
    with mongoctx() as db:

        # update the entry with the three reductions.
        db['entry'].update(
            {
                '_id': ObjectId(entry_id)
            },
            {
                '$set': {
                    'pca': X_pca.tolist(),
                    'tsne': X_tsne.tolist(),
                    'ica': X_ica.tolist(),
                }
            }
        )
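A minimal invocation sketch (hedged: the data and the entry id below are made up, and mongoctx is assumed to be a project-specific MongoDB context manager):

import numpy as np

# Hypothetical call: 100 samples with 50 features each.
X = np.random.rand(100, 50).tolist()
dim_survey(X, "0123456789abcdef01234567")  # placeholder 24-char ObjectId string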
Example #2
 def execute(self):
     X = self.X
     # reduce X to self.p dimensions; a fixed random_state keeps runs repeatable
     X2 = TSNE(n_components=self.p, random_state=7,
               perplexity=40).fit_transform(X)
     return X2.tolist()
Example #3
def get_2d_projection(articles):
    vectors = get_wordvecs(articles)

    # gensim <= 3.x API: the vocabulary is exposed as vectors.vocab
    vocab = list(vectors.vocab)
    wordvecs = [vectors[word] for word in vocab]

    embedded = TSNE(n_components=2).fit_transform(wordvecs)
    return embedded.tolist(), vocab
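Note that vectors.vocab is the gensim 3.x API; on gensim 4+ the vocabulary is iterated via index_to_key, as Example #7 below does. A sketch of the same projection against the newer API (assuming vectors is a gensim KeyedVectors instance):

import numpy as np
from sklearn.manifold import TSNE

def get_2d_projection_gensim4(vectors):
    # gensim 4.x KeyedVectors expose the vocabulary as index_to_key
    vocab = list(vectors.index_to_key)
    wordvecs = np.array([vectors[word] for word in vocab])
    embedded = TSNE(n_components=2).fit_transform(wordvecs)
    return embedded.tolist(), vocab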
Example #4
def proj_docs(corpus, n_components=2):
    """文档投影"""
    features = matutils.corpus2dense(corpus['tfidfcorpus'],
                                     num_terms=len(
                                         corpus['dictionary'].keys()),
                                     num_docs=len(corpus['doc2bow'])).T
    proj_data = TSNE(n_components=n_components,
                     random_state=0).fit_transform(features)
    proj_data = np.array(proj_data, dtype='float')
    return proj_data.tolist()
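A sketch of the corpus dict this function appears to expect; the key layout ('tfidfcorpus', 'dictionary', 'doc2bow') is inferred from the lookups above, not documented in the source:

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

texts = [doc.split() for doc in raw_documents]  # raw_documents: your own strings
dictionary = Dictionary(texts)
doc2bow = [dictionary.doc2bow(text) for text in texts]
corpus = {
    'dictionary': dictionary,
    'doc2bow': doc2bow,
    'tfidfcorpus': TfidfModel(doc2bow)[doc2bow],
}
# TSNE's default perplexity (30) requires more than 30 documents.
points = proj_docs(corpus, n_components=2)  # one 2-D point per document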
Example #5
 def proj_docs(self, n_components=2):
     """文档投影"""
     features = matutils.corpus2dense(self.__tfidf_corpus,
                                      num_terms=len(
                                          self.__dictionary.keys()),
                                      num_docs=len(self.corpus)).T
     proj_data = TSNE(n_components=n_components,
                      random_state=0).fit_transform(features)
     proj_data = np.array(proj_data, dtype='float')
     return proj_data.tolist()
Example #6
    def evaluate(self, dataset):
        """
        Evaluate the passed network on the given dataset for the Context restoration task.
        ----------
        INPUT
            |---- dataset (torch.utils.data.Dataset) the dataset to use for evaluation. It should output the original image,
            |           and the sample index.
        OUTPUT
            |---- None
        """
        logger = logging.getLogger()
        # make loader
        loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=False,
                                             num_workers=self.num_workers, worker_init_fn=lambda _: np.random.seed())
        # put net on device
        self.net = self.net.to(self.device)

        # Evaluate
        logger.info('Start Evaluating the context restoration model.')
        start_time = time.time()
        idx_repr = [] # placeholder for bottleneck representation
        n_batch = len(loader)
        self.net.eval()
        with torch.no_grad():
            for b, data in enumerate(loader):
                # get data: load in the standard way (no patch-swapped image)
                input, idx = data
                input = input.to(self.device).float()
                idx = idx.to(self.device)
                # get representation
                self.net.return_bottleneck = True
                _, repr = self.net(input)
                # down sample representation for reduced memory impact
                repr = nn.AdaptiveAvgPool2d((4,4))(repr)
                # add ravelled representations to placeholder
                idx_repr += list(zip(idx.cpu().data.tolist(), repr.view(repr.shape[0], -1).cpu().data.tolist()))
                # print_progress
                if self.print_progress:
                    print_progessbar(b, n_batch, Name='\t\tEvaluation Batch', Size=40, erase=True)
            # reset the network attributes
            self.net.return_bottleneck = False

        # compute tSNE for representation
        idx, repr = zip(*idx_repr)
        repr = np.array(repr)
        logger.info('Computing the t-SNE representation.')
        repr_2D = TSNE(n_components=2).fit_transform(repr)
        self.outputs['eval']['repr'] = list(zip(idx, repr_2D.tolist()))
        logger.info('Successfully computed the t-SNE representation.')
        # finish evaluation
        self.outputs['eval']['time'] = time.time() - start_time
        logger.info(f"Finished evaluating on the context restoration task in {timedelta(seconds=int(self.outputs['eval']['time']))}")
Example #7
def get_2d_projection(articles, filter):
    vectors = get_wordvecs(articles)
    vocab = []
    for word in vectors.index_to_key:
        if filter.isValid(word):
            vocab.append(word)

    wordvecs = []
    for word in vocab:
        wordvecs.append(vectors[word])

    #print("start compressing")
    embedded = TSNE(n_components=2).fit_transform(wordvecs)
    #print("end compressing")
    return embedded.tolist(), vocab
Example #8
 def evaluate(self, dataset):
     """
     Evaluate the network on the given dataset for the Contrastive task (get the t-SNE representation of samples). Only applies to the global task.
     ----------
     INPUT
         |---- dataset (torch.utils.data.Dataset) the dataset to use for evaluation. It should output the original image,
         |           and the sample index.
     OUTPUT
         |---- None
     """
     if self.is_global:
         logger = logging.getLogger()
         # initialize Dataloader
         loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers,
                                              worker_init_fn=lambda _: np.random.seed())
         # Evaluate
         logger.info("Start Evaluating the network on global contrastive task.")
         start_time = time.time()
         idx_repr = [] # placeholder for bottleneck representation
         n_batch = len(loader)
         self.net.eval()
         self.net.return_bottleneck = True
         with torch.no_grad():
             for b, data in enumerate(loader):
                 im, idx = data
                 im = im.to(self.device).float()
                 idx = idx.to(self.device)
                 # get representations
                 _, z = self.net(im)
                 # keep representations (bottleneck)
                 idx_repr += list(zip(idx.cpu().data.tolist(), z.squeeze().cpu().data.tolist()))
                 # print_progress
                 if self.print_progress:
                     print_progessbar(b, n_batch, Name='\t\tEvaluation Batch', Size=40, erase=True)
             # reset the network attributes
             self.net.return_bottleneck = False
         # compute tSNE for representation
         idx, repr = zip(*idx_repr)
         repr = np.array(repr)
         logger.info('Computing the t-SNE representation.')
         repr_2D = TSNE(n_components=2).fit_transform(repr)
         self.outputs['eval']['repr'] = list(zip(idx, repr_2D.tolist()))
         logger.info('Successfully computed the t-SNE representation.')
         # finish evaluation
         self.outputs['eval']['time'] = time.time() - start_time
         logger.info(f"Finished evaluating of encoder on the global contrastive task in {timedelta(seconds=int(self.outputs['eval']['time']))}")
     else:
         warnings.warn("Evaluation is only possible with a global contrastive task.")
Example #9
def tsne_embed(embedding_container):
    """
        Function to run TSNE on list of embeddings contained in embedding_c
        ontainer.
        Returns:
            points:
                List of python dicts of format
                points [
                    {
                        x: str(float),
                        y: str(float),
                        z: str(float)
                    },
                    {
                        x: str(float),
                        y: str(float),
                        z: str(float)
                    }
                    {
                        ...
                    } ...
                ]
    """

    embeddings = np.array(embedding_container)
    tsne_output = TSNE(n_components=3).fit_transform(embeddings)

    tsne_output_list = tsne_output.tolist()

    # Makes an array of dictionaries of points
    points = []
    for tsne_point in tsne_output_list:
        point_dict = {
            'x': tsne_point[0],
            'y': tsne_point[1],
            'z': tsne_point[2]
        }

        points.append(point_dict)

    return points
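A minimal usage sketch with made-up embeddings:

import numpy as np

embeddings = np.random.rand(50, 128)  # 50 hypothetical 128-dimensional embeddings
points = tsne_embed(embeddings)
print(points[0])  # a dict with float 'x', 'y', 'z' coordinates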
Example #10
def transGraphToMatrixAndtSNE(path):
    f_r = open(path, 'r')
    f_w = open('trans_query_graph_to_matrix2.txt', 'w+')
    allLine = f_r.readlines()
    seq = 0
    while seq < len(allLine):
        line = allLine[seq].strip().split()
        if line[0] == 't':
            print(str(line[2]))
            seq += 1
            # fresh 836 x 836 adjacency matrix for this graph
            result = [[0 for i in range(836)] for j in range(836)]
            line2 = allLine[seq].strip().split()
            while line2[0] == 'v':
                seq += 1
                line2 = allLine[seq].strip().split()
            while line2[0] == 'e':
                result[int(line2[2])][int(line2[3])] = 1
                result[int(line2[3])][int(line2[2])] = 1
                seq += 1
                if seq < len(allLine):
                    line2 = allLine[seq].strip().split()
                else:
                    break
            matrix = np.array(result)
            matrix_embedded = TSNE(n_components=1).fit_transform(matrix)
            matrix_embedded = matrix_embedded.ravel().tolist()
            # write one line of 836 values per graph
            restr = ''
            for i in range(836):
                restr = restr + '{:.2f}'.format(matrix_embedded[i]) + ' '
            restr += '\n'
            f_w.write(restr)
        else:
            seq += 1  # skip any line that does not start a new graph
    f_w.close()
    f_r.close()
    print("end")
Example #11
    def evaluate(self,
                 net,
                 dataset,
                 print_to_logger=True,
                 return_auc=False,
                 save_tSNE=True):
        """
        Evaluate the network on the provided dataset.
        ----------
        INPUT
            |---- net (nn.Module) The autoencoder to evaluate. It must return two
            |           embeddings (after the convolution and after the MLP) as
            |           well as the reconstruction.
            |---- dataset (torch.utils.data.Dataset) the dataset on which the
            |           network is validated. It must return an image and a mask
            |           of where the loss is to be computed.
            |---- print_to_logger (bool) whether to print info in logger.
            |---- return_auc (bool) whether to return the computed AUC.
            |---- save_tSNE (bool) whether to save the intermediate representation
            |           as a 2D vector using tSNE.
        OUTPUT
            |---- None
        """
        if print_to_logger:
            logger = logging.getLogger()
        # make dataloader
        loader = torch.utils.data.DataLoader(dataset,
                                             batch_size=self.batch_size,
                                             shuffle=True,
                                             num_workers=self.n_job_dataloader)

        # put net on device
        net = net.to(self.device)

        # define loss function
        loss_fn = MaskedMSELoss(reduction='none')

        if print_to_logger:
            logger.info("Start Evaluating AE.")

        idx_label_scores = []
        n_batch = len(loader)

        net.eval()
        with torch.no_grad():
            for b, data in enumerate(loader):
                input, label, mask, semi_label, idx = data
                # put inputs to device
                input = input.to(self.device).float()
                label = label.to(self.device)
                mask = mask.to(self.device)
                semi_label = semi_label.to(self.device)
                idx = idx.to(self.device)

                h, z, rec = net(input)

                # compute score as mean loss over by sample
                rec_loss = loss_fn(rec, input, mask)
                score = torch.mean(rec_loss, dim=tuple(range(1, rec.dim())))

                # append scores : idx label score h z
                idx_label_scores += list(
                    zip(idx.cpu().data.numpy().tolist(),
                        label.cpu().data.numpy().tolist(),
                        score.cpu().data.numpy().tolist(),
                        h.cpu().data.numpy().tolist(),
                        z.cpu().data.numpy().tolist()))

                if self.print_batch_progress:
                    print_progessbar(b,
                                     n_batch,
                                     Name='\t\tEvaluation Batch',
                                     Size=40,
                                     erase=True)

        if save_tSNE:
            if print_to_logger:
                logger.info("Computing the t-SNE representation.")
            # Apply t-SNE transform on embeddings
            index, label, scores, h, z = zip(*idx_label_scores)
            h, z = np.array(h), np.array(z)
            h = TSNE(n_components=2).fit_transform(h)
            z = TSNE(n_components=2).fit_transform(z)
            self.eval_repr = list(
                zip(index, label, scores, h.tolist(), z.tolist()))

            if print_to_logger:
                logger.info("Succesfully computed the t-SNE representation ")

        if return_auc:
            _, label, scores, _, _ = zip(*idx_label_scores)
            auc = roc_auc_score(np.array(label), np.array(scores))
            return auc
Example #12
    def evaluate(self,
                 dataset,
                 net,
                 save_tSNE=False,
                 return_loss=True,
                 print_to_logger=True):
        """

        """
        if print_to_logger:
            logger = logging.getLogger()
        # make dataloader (with drop_last = True to ensure that the loss can be computed)
        loader = torch.utils.data.DataLoader(dataset,
                                             batch_size=self.batch_size,
                                             shuffle=True,
                                             num_workers=self.n_job_dataloader,
                                             drop_last=True)

        # put net on device
        net = net.to(self.device)

        # define loss function, supervised or self-supervised
        if self.supervised_loss:
            loss_fn = SupervisedContrastiveLoss(self.tau,
                                                self.batch_size,
                                                y_list=[1],
                                                device=self.device)
        else:
            loss_fn = NT_Xent_loss(self.tau,
                                   self.batch_size,
                                   device=self.device)

        if print_to_logger:
            logger.info("Start Evaluating SimCLR.")

        net.eval()
        with torch.no_grad():
            sum_loss = 0.0
            idx_h_z = []
            n_batch = len(loader)

            for b, data in enumerate(loader):
                # get input
                input_1, input_2, semi_label, idx = data
                input_1 = input_1.to(self.device).float()
                input_2 = input_2.to(self.device).float()
                semi_label = semi_label.to(self.device)
                idx = idx.to(self.device)
                # forward
                h_1, z_1 = net(input_1)
                h_2, z_2 = net(input_2)
                # normalize
                z_1 = F.normalize(z_1, dim=1)
                z_2 = F.normalize(z_2, dim=1)
                # compute loss
                if self.supervised_loss:
                    y = torch.where(
                        semi_label == -1, torch.ones_like(semi_label),
                        torch.zeros_like(semi_label)
                    )  # generate labels (1 if known abnormal, else it's considered normal)
                    loss = loss_fn(z_1, z_2, y)
                else:
                    loss = loss_fn(z_1, z_2)

                sum_loss += loss.item()
                # save embeddings
                if save_tSNE:
                    idx_h_z += list(
                        zip(idx.cpu().data.numpy().tolist(),
                            h_1.cpu().data.numpy().tolist(),
                            z_1.cpu().data.numpy().tolist()))

                if self.print_batch_progress:
                    print_progessbar(b,
                                     n_batch,
                                     Name='\t\tEvaluation Batch',
                                     Size=40,
                                     erase=True)

        if save_tSNE:
            if print_to_logger:
                logger.info("Computing the t-SNE representation.")
            # Apply t-SNE transform on embeddings
            index, h, z = zip(*idx_h_z)
            h, z = np.array(h), np.array(z)
            h = TSNE(n_components=2).fit_transform(h)
            z = TSNE(n_components=2).fit_transform(z)
            self.eval_repr = list(zip(index, h.tolist(), z.tolist()))

            if print_to_logger:
                logger.info("Succesfully computed the t-SNE representation ")

        if return_loss:
            return sum_loss / n_batch
Example #13
    def evaluate(self, dataset, save_tsne=False, return_scores=False):
        """
        Evaluate the passed network on the given dataset for the Context restoration task.
        ----------
        INPUT
            |---- dataset (torch.utils.data.Dataset) the dataset to use for evaluation. It should output the original image,
            |           and the sample index.
            |---- save_tsne (bool) whether to compute and store in self.outputs the tsne representation of the feature map
            |           after the average pooling layer and before the MLP
            |---- return_scores (bool) whether to return the measured ROC AUC, accuracy, recall, precision and f1-score.
        OUTPUT
            |---- (auc) (float) the ROC AUC on the dataset.
            |---- (acc) (float) the accuracy on the dataset.
            |---- (recall) (float) the recall on the dataset.
            |---- (precision) (float) the precision on the dataset.
            |---- (f1) (float) the f1-score on the dataset.
        """
        logger = logging.getLogger()
        # make loader
        loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            worker_init_fn=lambda _: np.random.seed())
        # put net on device
        self.net = self.net.to(self.device)

        # Evaluate
        start_time = time.time()
        idx_repr = []  # placeholder for bottleneck representation
        idx_label_pred = []  # placeholder for label & prediction
        n_batch = len(loader)
        self.net.eval()
        with torch.no_grad():
            for b, data in enumerate(loader):
                # get data: load in the standard way (no patch-swapped image)
                input, label, idx = data
                input = input.to(self.device).float()
                label = label.to(self.device).float()
                idx = idx.to(self.device)
                if save_tsne:
                    # get representation
                    self.net.return_bottleneck = True
                    pred_score, repr = self.net(input)
                    pred_score = torch.sigmoid(pred_score)
                    pred = torch.where(
                        pred_score > 0.5,
                        torch.ones_like(pred_score, device=self.device),
                        torch.zeros_like(pred_score, device=self.device))
                    # down sample representation for reduced memory impact
                    #repr = nn.AdaptiveAvgPool2d((4,4))(repr)
                    # add ravelled representations to placeholder
                    idx_repr += list(
                        zip(idx.cpu().data.tolist(),
                            repr.view(repr.shape[0], -1).cpu().data.tolist()))
                    idx_label_pred += list(
                        zip(idx.cpu().data.tolist(),
                            label.cpu().data.tolist(),
                            pred.cpu().data.tolist(),
                            pred_score.cpu().data.tolist())
                    )  # pred_score is the per-class sigmoid activation
                else:
                    pred_score = self.net(input)
                    pred_score = torch.sigmoid(pred_score)  # B x N_class
                    pred = torch.where(
                        pred_score > 0.5,
                        torch.ones_like(pred_score, device=self.device),
                        torch.zeros_like(pred_score, device=self.device))
                    idx_label_pred += list(
                        zip(idx.cpu().data.tolist(),
                            label.cpu().data.tolist(),
                            pred.cpu().data.tolist(),
                            pred_score.cpu().data.tolist()))
                # print_progress
                if self.print_progress:
                    print_progessbar(b,
                                     n_batch,
                                     Name='\t\tEvaluation Batch',
                                     Size=50,
                                     erase=True)
            # reset the network attributes
            if save_tsne:
                self.net.return_bottleneck = False

        # compute tSNE for representation
        if save_tsne:
            idx, repr = zip(*idx_repr)
            repr = np.array(repr)
            logger.info('Computing the t-SNE representation.')
            repr_2D = TSNE(n_components=2).fit_transform(repr)
            self.outputs['eval']['repr'] = list(zip(idx, repr_2D.tolist()))
            logger.info('Successfully computed the t-SNE representation.')

        # Compute Accuracy
        _, label, pred, pred_score = zip(*idx_label_pred)
        label, pred, pred_score = np.array(label), np.array(pred), np.array(
            pred_score)
        auc = roc_auc_score(label, pred_score, average=self.score_average)
        acc = accuracy_score(label.ravel(), pred.ravel())
        sub_acc = accuracy_score(label, pred)
        recall = recall_score(label, pred, average=self.score_average)
        precision = precision_score(label, pred, average=self.score_average)
        f1 = f1_score(label, pred, average=self.score_average)
        self.outputs['eval']['auc'] = auc
        self.outputs['eval']['acc'] = acc
        self.outputs['eval']['subset_acc'] = sub_acc
        self.outputs['eval']['recall'] = recall
        self.outputs['eval']['precision'] = precision
        self.outputs['eval']['f1'] = f1
        self.outputs['eval']['pred'] = idx_label_pred

        # finish evaluation
        self.outputs['eval']['time'] = time.time() - start_time

        if return_scores:
            return auc, acc, sub_acc, recall, precision, f1
Example #14
def tsne(comp, perp, lr, init):

    print "N_components (fed to SVD) :", comp
    print "Perplexity (fed to TSNE) :", perp
    print "learning_rate (fed to TSNE) :", lr
    print "init(fed to TSNE) :", init

    X_reduced = TruncatedSVD(n_components=50,
                             random_state=0).fit_transform(vectors)
    X_embedded = TSNE(n_components=comp,
                      perplexity=perp,
                      verbose=2,
                      learning_rate=lr,
                      init=init).fit_transform(X_reduced)

    # width and height of image
    iw = 16
    ih = 9

    fig = plt.figure(figsize=(iw, ih))
    fig.patch.set_facecolor('white')
    ax = plt.axes(frameon=False)
    plt.setp(ax, xticks=(), yticks=())
    plt.subplots_adjust(left=0.0,
                        bottom=0.0,
                        right=1.0,
                        top=0.9,
                        wspace=0.0,
                        hspace=0.0)
    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c='black', marker=".")
    fig.savefig("img/" + datetime.datetime.now().strftime('%Y-%m-%d_%Hh%M') +
                "_TSNE_LYRICS_NO_ANNOTATION" + type_of_run + "_perp" +
                str(perp) + "_comp" + str(comp) + "_lr" + str(lr) + "_" +
                str(init) + "_300dpi_1wh.png",
                transparent=False,
                dpi=300)

    fig = plt.figure(figsize=(iw * 2, ih * 2))
    fig.patch.set_facecolor('white')
    ax = plt.axes(frameon=False)
    plt.setp(ax, xticks=(), yticks=())
    plt.subplots_adjust(left=0.0,
                        bottom=0.0,
                        right=1.0,
                        top=0.9,
                        wspace=0.0,
                        hspace=0.0)
    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c='black', marker=".")
    fig.savefig("img/" + datetime.datetime.now().strftime('%Y-%m-%d_%Hh%M') +
                "_TSNE_LYRICS_NO_ANNOTATION" + type_of_run + "_perp" +
                str(perp) + "_comp" + str(comp) + "_lr" + str(lr) + "_" +
                str(init) + "_300dpi_2wh.png",
                transparent=False,
                dpi=300)

    fig = plt.figure(figsize=(iw * 3, ih * 3))
    fig.patch.set_facecolor('white')
    ax = plt.axes(frameon=False)
    plt.setp(ax, xticks=(), yticks=())
    plt.subplots_adjust(left=0.0,
                        bottom=0.0,
                        right=1.0,
                        top=0.9,
                        wspace=0.0,
                        hspace=0.0)
    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c='black', marker=".")
    fig.savefig("img/" + datetime.datetime.now().strftime('%Y-%m-%d_%Hh%M') +
                "_TSNE_LYRICS_NO_ANNOTATION" + type_of_run + "_perp" +
                str(perp) + "_comp" + str(comp) + "_lr" + str(lr) + "_" +
                str(init) + "_300dpi_3wh.png",
                transparent=False,
                dpi=300)

    X = X_embedded.tolist()
    for idx, x in enumerate(X):
        x.extend([authors[idx], titles[idx], ids[idx]])

    # SAVE to txt file as csv for later import to d3
    txt_fn = datetime.datetime.now().strftime(
        '%Y-%m-%d_%Hh%M') + "_LYRICS_TSNE_" + type_of_run + "_perp" + str(
            perp) + "_comp" + str(comp) + "_lr" + str(lr) + "_" + str(
                init) + ".txt"

    txt_fn_path = "txt/" + txt_fn
    f_txt = open(txt_fn_path, 'w')
    f_txt.write("x,y,author,title,id")
    for x in X:
        f_txt.write("\n")
        for idx, item in enumerate(x):
            if idx == 4:
                f_txt.write("\"%s\"" % item)
            elif type(item) is str:
                f_txt.write("\"%s\"," % item)
            else:
                f_txt.write("%s," % item)

    f_txt.close()
    "\nTXT file created at:", txt_fn_path
Example #15
class QueryData(dict):
    def __init__(self, query, labels=None, vectors=None):
        self.query = query
        # avoid shared mutable default arguments
        self.labels = labels if labels is not None else []
        self.distances = []
        self.vectors = vectors if vectors is not None else []
        self.vocab_size = 0
        self.query_size = 0
        self.parsed_positive = []
        self.parsed_negative = []
        self.dim_embedded = []
        self.embedding = []
        self.cluster_data = []
        self.cluster_centroids = []

    def clear_data(self):
        self.distances.clear()
        self.labels.clear()
        self.vectors.clear()
        self.parsed_positive.clear()
        self.parsed_negative.clear()
        self.dim_embedded.clear()
        self.cluster_data.clear()
        self.cluster_centroids.clear()

    def _closest_node(self, node, nodes):
        nodes = np.asarray(nodes)
        dist_2 = np.sum((nodes - node)**2, axis=1)
        return np.argmin(dist_2)

    def dim_reduce(self):
        self.dim_embedded = TSNE(n_components=2).fit_transform(
            np.array(self.vectors, dtype=np.float64))
        self.embedding = self.dim_embedded.tolist()

    def cluster(self, num_clusters):
        clustering = KMeans(n_clusters=num_clusters).fit(self.dim_embedded)
        cluster_labels = clustering.labels_.tolist()
        centroids = clustering.cluster_centers_.tolist()
        for cluster_id in range(num_clusters):
            closest_node = self._closest_node(centroids[cluster_id],
                                              self.dim_embedded)
            closest_node_word = self.labels[closest_node]
            self.cluster_centroids.append({
                'x': centroids[cluster_id][0],
                'y': centroids[cluster_id][1],
                'label': cluster_labels[cluster_id],
                'word': closest_node_word
            })
        embedding = self.dim_embedded.tolist()
        for item in range(len(cluster_labels)):
            self.cluster_data.append({
                'x': embedding[item][0],
                'y': embedding[item][1],
                'label': cluster_labels[item]
            })

    def to_dict(self):
        result = {
            'query': self.query,
            'labels': self.labels,
            'vocab_size': self.vocab_size,
            'query_size': self.query_size,
            'centroids': self.cluster_centroids,
            'cluster_data': self.cluster_data
        }
        return result
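A sketch of the intended call order, with made-up labels and vectors (real use presumably fills these from a word-vector query):

import numpy as np

qd = QueryData("king - man + woman",
               labels=["word%d" % i for i in range(60)],
               vectors=np.random.rand(60, 100).tolist())
qd.dim_reduce()             # t-SNE to 2-D; fills qd.dim_embedded and qd.embedding
qd.cluster(num_clusters=5)  # KMeans on the 2-D layout; fills qd.cluster_data
print(qd.to_dict()['centroids'][0])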
Example #16
class Exploration():
    def __init__(self, query, labels=None, vectors=None):
        self.query = query
        self.parsed_query = {}
        # avoid shared mutable default arguments
        self.labels = labels if labels is not None else []
        self.vectors = vectors if vectors is not None else []
        self.reduction = []
        self.clusters = []
        self.distances = []
        self.stats = {}

    def reduce(self):
        print('Performing tSNE reduction ' +
              'on {} vectors'.format(len(self.vectors)))
        # scikit-learn's TSNE replaced the previously used bh_sne (which was faster):
        # self.reduction = bh_sne(np.array(self.vectors, dtype=np.float64))
        self.reduction = TSNE(n_components=2, verbose=1).fit_transform(
            np.array(self.vectors, dtype=np.float64))

    def cluster(self, num_clusters=30):
        clustering = KMeans(n_clusters=num_clusters)
        clustering.fit(self.reduction)
        self.clusters = clustering.labels_
        clustermatrix = []
        reduction = self.reduction.tolist()
        for cluster_id in range(num_clusters):
            clustermatrix.append([
                reduction[i] for i in range(len(self.vectors))
                if self.clusters[i] == cluster_id
            ])
        self.cluster_centroids = clustering.cluster_centers_.tolist()
        self.cluster_centroids_closest_nodes = []
        for cluster_id in range(num_clusters):
            nodes_for_cluster = clustermatrix[cluster_id]
            centroid = self.cluster_centroids[cluster_id]
            closest_node_to_centroid = self._closest_node(
                centroid, nodes_for_cluster)
            coords = nodes_for_cluster[closest_node_to_centroid]
            node_id = reduction.index(coords)
            self.cluster_centroids_closest_nodes.append(node_id)

    def serialize(self):
        result = {
            'query': self.query,
            'parsed_query': self.parsed_query,
            'labels': self.labels,
            'stats': self.stats
        }
        if len(self.reduction) > 0:
            result['reduction'] = self.reduction.tolist()
        if len(self.distances) > 0:
            result['distances'] = self.distances
        if len(self.clusters) > 0:
            result['clusters'] = self.clusters.tolist()
            result['cluster_centroids'] = self.cluster_centroids
            closest_nodes = self.cluster_centroids_closest_nodes
            result['cluster_centroids_closest_nodes'] = closest_nodes
        return result

    def _closest_node(self, node, nodes):
        nodes = np.asarray(nodes)
        dist_2 = np.sum((nodes - node)**2, axis=1)
        return np.argmin(dist_2)
Example #17
 def tsne(self, X):
     X2 = TSNE(n_components=2, random_state=0,
               perplexity=40).fit_transform(X)
     return X2.tolist()
Example #18
	plt.scatter(X_embedded[:, 0], X_embedded[:, 1], s=size_array[:],
			c='black', marker=".", alpha=1)

	fig.savefig("img/"+save_PATH+"/TSNE_SONDHEIM_{0:05d}.png".format(iters), transparent=True, figsize=(16.0, 9.0), dpi=320)
	print("IMAGE saved at: img/"+save_PATH+"/TSNE_SONDHEIM_{0:05d}.png".format(iters))

	# for i, a in enumerate(txt_fn):
	# 	ax.annotate(a, (X_embedded[:, 0][i], X_embedded[:, 1][i]))
	# 	#ax.annotate(a+"\n'"+titles[i]+"'", (X_embedded[:, 0][i], X_embedded[:, 1][i]))

	# #plt.show()
	# fig.savefig("img/"+save_PATH+"/"+str(iters)+"_TSNE_SONDHEIM_ANNOTATED.png", transparent=False, figsize=(16.0, 9.0), dpi=320)


	X = X_embedded.tolist()
	for idx,x in enumerate(X):
		x.extend([ids[idx]])

	# SAVE to txt file as csv for later import to d3
	#txt_fn = datetime.datetime.now().strftime('%Y-%m-%d_%Hh%M')+"_SONDHEIM_TSNE_"+str(complexity_INIT)+"_"+str(n_comp)+"_"+str(perp)+"_"+str(lr)+"_"+str(exag)+"_num_iter"+str(num_iter)+".txt"

	txt_fn = "LR{}_EXAG{}_C{}_LAYERS{}_P{}.txt".format(lr,exag,complexity_INIT,n_comp,perp)


	txt_fn_path = txt_save_PATH + txt_fn
	f_txt = open(txt_fn_path, 'w')
	f_txt.write("x,y,id,s")

	inc = 0
	for x in X:
Example #19
    def evaluate(self,
                 dataset,
                 net,
                 save_tSNE=False,
                 return_loss=True,
                 print_to_logger=True):
        """
        Evaluate the Contrastive network on the provided dataset.
        ----------
        INPUT
            |---- net (nn.Module) The Encoder network to validate.
            |---- dataset (torch.utils.data.Dataset) the dataset on which the
            |           network is evaluated.
            |---- print_to_logger (bool) whether to print in the logger.
            |---- save_tSNE (bool) whether to save a 2D t-SNE representation of
            |           the embedded data points.
            |---- return_loss (bool) whether to return the validation loss.
        OUTPUT
            |---- (loss) (float) the validation loss, if required.
        """
        if print_to_logger:
            logger = logging.getLogger()
        # make dataloader (with drop_last = True to ensure that the loss can be computed)
        loader = torch.utils.data.DataLoader(dataset,
                                             batch_size=self.batch_size,
                                             shuffle=True,
                                             num_workers=self.n_job_dataloader,
                                             drop_last=True)

        # put net on device
        net = net.to(self.device)

        # define loss function
        loss_fn = InfoNCE_loss(self.tau, self.batch_size, device=self.device)

        if print_to_logger:
            logger.info("Start Evaluating Contrastive.")

        net.eval()
        with torch.no_grad():
            sum_loss = 0.0
            idx_h_z = []
            n_batch = len(loader)

            for b, data in enumerate(loader):
                # get input
                input_1, input_2, _, idx = data
                input_1 = input_1.to(self.device).float()
                input_2 = input_2.to(self.device).float()
                idx = idx.to(self.device)
                # forward
                h_1, z_1 = net(input_1)
                h_2, z_2 = net(input_2)
                # normalize
                z_1 = F.normalize(z_1, dim=1)
                z_2 = F.normalize(z_2, dim=1)
                # compute loss
                loss = loss_fn(z_1, z_2)
                sum_loss += loss.item()
                # save embeddings
                if save_tSNE:
                    idx_h_z += list(
                        zip(idx.cpu().data.numpy().tolist(),
                            h_1.cpu().data.numpy().tolist(),
                            z_1.cpu().data.numpy().tolist()))

                if self.print_batch_progress:
                    print_progessbar(b,
                                     n_batch,
                                     Name='\t\tEvaluation Batch',
                                     Size=40,
                                     erase=True)

        if save_tSNE:
            if print_to_logger:
                logger.info("Computing the t-SNE representation.")
            # Apply t-SNE transform on embeddings
            index, h, z = zip(*idx_h_z)
            h, z = np.array(h), np.array(z)
            h = TSNE(n_components=2).fit_transform(h)
            z = TSNE(n_components=2).fit_transform(z)
            self.eval_repr = list(zip(index, h.tolist(), z.tolist()))

            if print_to_logger:
                logger.info("Succesfully computed the t-SNE representation ")

        if return_loss:
            return sum_loss / n_batch
Example #20
    def evaluate(self,
                 net,
                 dataset,
                 return_auc=False,
                 print_to_logger=True,
                 save_tSNE=True):
        """
        Evaluate the DSAD network on the provided dataset.
        ----------
        INPUT
            |---- net (nn.Module) The DeepSAD network to validate.
            |---- dataset (torch.utils.data.Dataset) the dataset on which the
            |           network is evaluated.
            |---- return_auc (bool) whether to return the computed AUC or not.
            |---- print_to_logger (bool) whether to print in the logger.
            |---- save_tSNE (bool) whether to save a 2D t-SNE representation of
            |           the embedded data points.
        OUTPUT
            |---- None
        """
        if print_to_logger:
            logger = logging.getLogger()

        # make dataloader
        loader = torch.utils.data.DataLoader(dataset,
                                             batch_size=self.batch_size,
                                             shuffle=True,
                                             num_workers=self.n_job_dataloader)
        # put net on device
        net = net.to(self.device)

        # Evaluating
        if print_to_logger:
            logger.info('Start Evaluating the DMSAD.')
        start_time = time.time()
        idx_label_score = []

        net.eval()
        with torch.no_grad():
            for b, data in enumerate(loader):
                # get data on device
                input, label, mask, semi_label, idx = data
                input = input.to(self.device).float()
                label = label.to(self.device)
                mask = mask.to(self.device)
                semi_label = semi_label.to(self.device)
                idx = idx.to(self.device)

                # mask input
                input = input * mask

                # Embed input and compute anomaly score
                _, embed = net(input)
                # find closest sphere
                dist, sphere_idx = torch.min(torch.norm(self.c.unsqueeze(0) -
                                                        embed.unsqueeze(1),
                                                        p=2,
                                                        dim=2),
                                             dim=1)

                # if self.R is not None:
                #     # anomaly scores positive if dist > R and negative if dist < R
                #     score = dist - torch.stack([self.R[j] for j in sphere_idx], dim=0)
                # else:
                #     # else scores is just the minimal distance to a center
                score = dist

                # append idx, scores, label and embedding
                idx_label_score += list(
                    zip(idx.cpu().data.numpy().tolist(),
                        label.cpu().data.numpy().tolist(),
                        score.cpu().data.numpy().tolist(),
                        sphere_idx.cpu().data.numpy().tolist(),
                        embed.cpu().data.numpy().tolist()))

                if self.print_batch_progress:
                    print_progessbar(b,
                                     len(loader),
                                     Name='\t\t Evaluation Batch',
                                     Size=40,
                                     erase=True)

        # compute AUCs
        index, label, score, sphere_index, embed = zip(*idx_label_score)
        label, score = np.array(label), np.array(score)
        auc = roc_auc_score(label, score)

        if save_tSNE:
            embed = np.array(embed)
            embed = TSNE(n_components=2).fit_transform(embed)
            idx_label_score = list(
                zip(index, label.tolist(), score.tolist(), sphere_index,
                    embed.tolist()))

        self.eval_time = time.time() - start_time
        self.eval_scores = idx_label_score
        self.eval_auc = auc

        if print_to_logger:
            logger.info(f'Evaluation Time : {self.eval_time}')
            logger.info(f'Evaluation AUC : {self.eval_auc:.3%}')
            logger.info('Finished Evaluating the DMSAD.')

        if return_auc:
            return auc
Example #21
vectors = []
artists = []

with open('song_vectors.json') as jsonfile:
    result = json.load(jsonfile)
    for artist, values in result.items():
        if artist in [
                "Eminem", "Nirvana", "Billy Talent", "Ska-P", "Daft Punk",
                "Chick Corea", "Kings Of Leon"
        ]:
            for value in values:
                vector = value["latent"]
                artists.append(artist)
                vectors.append(vector)

print(np.array(vectors).shape)

X_embedded = TSNE(n_components=2).fit_transform(np.array(vectors))

for artist, p in zip(artists, X_embedded.tolist()):
    plt.scatter(p[0], p[1], color=artist_map[artist], s=30)

ax = plt.gca()
ax.set_xticks([])
ax.set_yticks([])
ax.set(xlabel="t-SNE axis 1", ylabel="t-SNE axis 2")
ax.grid(True)

# plt.show()
plt.savefig("latent_vectors.pgf")
Example #22
from sklearn.manifold import TSNE
import numpy as np
import json

with open('mv_data.json') as f:
    j = json.loads(f.read())


def distance(a, b):
    return np.linalg.norm(a.reshape((40, 40))-b.reshape((40, 40)))


data = np.array(list(map(lambda x: np.array(x).flatten(),
                         map(lambda x: x['viewMatrix'], j['papers']))))
embed = TSNE(metric=distance).fit_transform(data)
embed -= embed.min(axis=0)
embed /= embed.max(axis=0)
embed *= 2
embed -= 1

with open('tsne.json', 'w') as f:
    f.write(json.dumps(embed.tolist()))
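scikit-learn's TSNE does accept a callable metric (it is forwarded to the pairwise-distance computation), but the callable above reshapes both arguments on every call. A hedged alternative is to precompute the distance matrix once; the Euclidean norm of the flattened difference equals the Frobenius norm of the reshaped one, so the distances are identical:

from scipy.spatial.distance import pdist, squareform

# Same distances, computed once; init='random' is required with 'precomputed'.
dists = squareform(pdist(data, metric='euclidean'))
embed = TSNE(metric='precomputed', init='random').fit_transform(dists)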
Example #23
File: mtsne.py Project: dxshhh/CV
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import sys

relu = lambda x: max(0.0, x)

ans = []
with open('tsne.in') as f:  # reopen the file for reading
    for line2 in f:
        tmp = line2.rstrip().split()
        for i in range(len(tmp)):
            tmp[i] = relu(float(tmp[i]))
        ans.append(tmp)

x = np.array(ans)
print(x.tolist())

y = TSNE(n_components=2, perplexity=10, learning_rate=50,
         n_iter=10000).fit_transform(x)

with open('draw_data.py', 'w') as f:
    f.write('a=')
    f.write('%s' % y.tolist())
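The script writes the 2-D points into draw_data.py as a module-level list a; a sketch of plotting them (matplotlib assumed):

import matplotlib.pyplot as plt
import draw_data  # the file generated above

xs, ys = zip(*draw_data.a)
plt.scatter(xs, ys, s=5)
plt.show()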
Example #24
class tSNE(Similarity):

    _similarity_type = 'tsne'

    def __init__(self, *args, **kwargs):
        """
        This function might be called locally, in which case we want to return the
        actual similarity calculator instance.  Or it might be run remotely (via celery),
        in which case we want to return the serialized version of the similarity instance.

        Parameters
        ----------
        display_type : string
           String representation of the display; can be 'plot', 'hexbin', or 'mpl'.

        Returns
        -------
        N/A
        """

        # Pull the display type out of kwargs if it is there. If not then we will
        # use 'plot' as the default.
        if 'display_type' in kwargs:
            display_type = kwargs['display_type']
            del kwargs['display_type']
        else:
            display_type = 'plot'

        super().__init__(tSNE._similarity_type, *args, **kwargs)
        log.info('Created {}'.format(self._similarity_type))

        # Each line / element in these should correspond.
        self._Y = None
        self._fingerprints = []
        self._filename_index = []
        self._distance_measure = 'l2'

        # Display types
        self._display_type = display_type
        self._display_types = ['plot', 'hexbin', 'mpl']
        if self._display_type not in self._display_types:
            raise Exception('Display type {} not one of {}'.format(
                self._display_type, self._display_types))

        # Define the distance measures.
        self._distance_measures = {
            'l2':
            lambda Y, point: np.sqrt(np.sum((Y - np.array(point))**2, axis=1)),
            'l1':
            lambda Y, point: np.sum(np.abs(Y - np.array(point)), axis=1),
        }

    @property
    def data(self):
        return self._Y

    @property
    def data_filtered(self):
        return self._Y[self._fingerprint_filter_inds]

    #
    #  Calculation Methods
    #

    def calculate(self, fingerprints):
        """
        Calculate the TSNE based on the fingerprints.

        Parameters
        ----------
        fingerprints : list of Fingerprint instances
           The fingerprints we want to calculate over.

        Returns
        -------
        N/A
        """
        log.info('Going to calculate tSNE from {} fingerprints'.format(
            len(fingerprints)))

        #
        #  Filter the fingerprints, if the filter is set.
        #

        if self._fingerprint_filter is not None:
            fingerprints = self._fingerprint_filter(fingerprints)

        #
        # Calculate the unique labels
        #

        labels = []
        values = {}
        for ii, fp in enumerate(fingerprints):
            log.debug('    fingerprint is {}'.format(fp))

            #
            # Add to unique label list
            #

            labels.extend(
                [pred[1] for pred in fp.predictions if pred[1] not in labels])

            #
            # Store the predictions for next processing
            #

            values[ii] = fp.predictions

            self._fingerprints.append(fp)
        log.info('Unique labels {}'.format(labels))

        #
        # Set up the similarity matrix X based on the predictions
        #

        X = np.zeros((len(fingerprints), len(labels)))
        for ii, fp in enumerate(fingerprints):
            inds = [labels.index(prediction[1]) for prediction in values[ii]]
            X[ii][inds] = [prediction[2] for prediction in values[ii]]

        log.debug('X is {}'.format(X))
        log.debug('Fingerprint list {}'.format(self._fingerprints))

        #
        # Compute the tSNE of the data.
        #

        log.info('Calculating the tSNE...')
        self._Y = TSNE(n_components=2).fit_transform(X)
        log.debug('self._Y is {}'.format(self._Y))
        log.info('Done calculation')

    #
    #  Utility Methods
    #

    def save(self):
        """
        Save function converts the instance to a dict.

        Parameters
        ----------
        None

        Returns
        -------
        dict
            Dictionary representation of this instance.
        """
        log.info('Returning the dictionary of information')
        return {
            'uuid': self._uuid,
            'similarity_type': self._similarity_type,
            'similarity': self._Y.tolist(),
            'fingerprint': [fp.save() for fp in self._fingerprints],
            'parameters': {
                'distance_measure': self._distance_measure
            }
        }

    def load(self, thedict, db=None):
        """
        Reload the internal variables from the dictionary.

        Parameters
        ----------
        thedict : dict
            The first parameter.
        db : str
            database object

        Returns
        -------
        N/A
        """
        log.info(
            'Loading the dictionary of information with database {}'.format(
                db))

        self._uuid = thedict['uuid']
        self._similarity_type = thedict['similarity_type']
        self._Y = np.array(thedict['similarity'])
        self._fingerprints = [
            Fingerprint.factory(x) for x in thedict['fingerprint']
        ]
        self._parameters = thedict['parameters']
        self._distance_measure = self._parameters['distance_measure']

        self._fingerprint_filter_inds = list(range(len(self._fingerprints)))

    #
    #  Display methods
    #

    def set_display_type(self, display_type):
        """
        Set the display type. Currently 'plot', 'hexbin', and 'mpl' are defined.

        Parameters
        ----------
        display_type : string
            Display type: 'plot', 'hexbin', and 'mpl' are defined.

        Returns
        -------
        N/A
        """
        if display_type in self._display_types:
            self._display_type = display_type
        else:
            raise ValueError('Display type {} not in {}'.format(
                display_type, self._display_types))

    def select_distance_measure(self, distance_measure=None):
        """
        Select the distance measure.

        Parameters
        ----------
        distance_measure : string
            Distance measure to use: 'l1' and 'l2' are defined.

        Returns
        -------
        N/A
        """

        if not distance_measure:
            dm_options = self._distance_measures.keys()

            selected = False
            N = 0
            while not selected:
                # Show the fingerprints in order to allow for the person to select one
                print('Select distance measure to use (q to quit):')
                for ii, x in enumerate(dm_options):
                    print('   {}) {}'.format(ii, x))
                    N = ii

                s = input('Select Number > ')

                if s == 'q':
                    return

                try:
                    s = int(s)
                    if 0 <= s <= N:
                        self._distance_measure = list(dm_options)[s]
                        selected = True
                except Exception:
                    pass
        else:
            if distance_measure in self._distance_measures:
                self._distance_measure = distance_measure
            else:
                self._distance_measure = list(self._distance_measures)[0]
                log.error(
                    'ERROR: No definition for {} so using {} instead.'.format(
                        distance_measure, self._distance_measure))

    def display(self, tsne_axis):
        """
        Display the plot into the matplotlib axis in the
        parameter based on the plot type. This just determines
        the plot type and then calls the internal plot function.

        Parameters
        ----------
        tsne_axis : Matplotlib.axes.axis
            The matplotlib axis into which we want to display the plot.

        Returns
        -------
        N/A
        """
        if self._display_type == 'plot':
            self._display_plot(tsne_axis)
        elif self._display_type == 'hexbin':
            return self._display_hexbin(tsne_axis)
        # elif self._display_type == 'mpl':
        #     self._display_mpl(tsne_axis)
        else:
            raise ValueError('Plot type {} is not in the valid list {}'.format(
                self._display_type, self._display_types))

    def _display_plot(self, tsne_axis):
        """
        Display the plot into the matplotlib axis as a regular scatter plot.

        Parameters
        ----------
        tsne_axis : Matplotlib.axes.axis
            The matplotlib axis into which we want to display the plot.

        Returns
        -------
        N/A
        """
        tsne_axis.plot(self._Y[:, 0], self._Y[:, 1])  #, '.')
        tsne_axis.grid('on')
        tsne_axis.set_title('tSNE [{}]'.format(self._distance_measure))

    def _display_hexbin(self, tsne_axis):
        """
        Display the plot into the matplotlib axis as a hexbin.

        Parameters
        ----------
        tsne_axis : Matplotlib.axes.axis
            The matplotlib axis into which we want to display the plot.

        Returns
        -------
        N/A
        """
        output = tsne_axis.hexbin(self._Y[:, 0], self._Y[:, 1], cmap='hot')
        tsne_axis.grid('on')
        tsne_axis.set_title('tSNE [{}]'.format(self._distance_measure))

        # Set the color limits so that it is a little brighter
        limmax = np.percentile(output.get_array(), 99.9)
        output.set_clim((0, limmax))

        return output

    def find_similar(self, point, n=9, allow_overlapping_bounding_boxes=True):
        """
        Find fingerprints that are close to the input point.

        Parameters
        ----------
        point : tuple (int, int)
            A point in the plot.
        n : int
            Number to return.

        allow_overlapping_bounding_boxes: bool
            Whether to allow overlapping bb or not.

        Returns
        -------
        list
            List of dicts that describe the closest fingerprints.
        """
        log.info('')

        if self._fingerprint_filter_inds is None:
            self._fingerprint_filter_inds = list(range(len(
                self._fingerprints)))

        distances = self._distance_measures[self._distance_measure](self._Y,
                                                                    point)

        log.debug('Filtering based, n distances {}  n filter_inds {}'.format(
            len(distances), len(self._fingerprint_filter_inds)))

        inds = []
        for ind in np.argsort(distances):

            # First, make sure this index is one of the filtered ones.
            if ind in self._fingerprint_filter_inds:

                # Next check to see if we allow overlapping bounding boxes
                # If not, make sure this one doesn't overlap with any in the list so far.
                if (allow_overlapping_bounding_boxes or not any([
                        self._fingerprints[ind].cutout.bounding_box.overlap(
                            self._fingerprints[ii].cutout.bounding_box)
                        for ii in inds
                ])):
                    inds.append(ind)
                    if len(inds) == n:
                        break

        # Now we want to look only in the "search_inds" if that is passed in
        return [{
            'tsne_point': self._Y[ind],
            'distance': distances[ind],
            'fingerprint': self._fingerprints[ind]
        } for ind in inds[:n]]

    def cutout_point(self, cutout):
        """
        Given a cutout (and therefore a fingerprint), find the point in the
        tSNE plot that it corresponds to.

        Parameters
        -----------
        cutout : Cutout
            The cutout we want to find.

        Return
        ------
        tSNE point: tuple
            Point in the tSNE space the cutout corresponds to
        """
        log.info('cutout {}'.format(cutout))

        index = [
            fingerprint.cutout.uuid for fingerprint in self._fingerprints
        ].index(cutout.uuid)
        return self._Y[index]

    def closest_cutout(self, data, point):
        """
        Given a cutout (and therefore a fingerprint), find the point in the
        tSNE plot that it corresponds to.

        Parameters
        -----------
        cutout : Cutout
            The cutout we want to find.

        Return
        ------
        tSNE point: tuple
            Point in the tSNE space the cutout corresponds to
        """
        log.info('data {}  point {}'.format(data, point))

        #
        # Get the cutouts associated with the data passed in.
        #

        cutouts = [
            fingerprint.cutout for fingerprint in self._fingerprints
            if fingerprint.cutout.data == data
        ]

        #
        # Compute distance between cutout bounding boxes centers and the point.
        #

        distances = [c.bounding_box.distance(point) for c in cutouts]

        #
        # Find the smallest.
        #

        index = np.argsort(distances)[0]

        log.debug('Closest cutout is with bb {} and dist {}'.format(
            cutouts[index].bounding_box, distances[index]))

        return cutouts[index]