示例#1
0
        def coords_func(lib, opts, args):
            """Compute 2-D t-SNE coordinates for library items and persist them.

            Per-item feature vectors come either from the time-trimmed mean of
            the sequence features ("mean") or from an LSTM seq2seq encoder
            ("lstm"). Coordinates are written to the `coords` table; items with
            no sequence features get fallback coordinates via self.fill.
            """
            if opts.type:
                seq_features_file = config["blackbird"]["seq_features"].get("unicode")
                # Close the pickle file deterministically (was a leaked handle).
                with open(seq_features_file, "rb") as fh:
                    seq_features = cPickle.load(fh)

                # Materialize the key order once; iterating `keys` below keeps
                # row idx in `features` aligned with keys[idx]. (dict.keys()
                # views are not positionally indexable on Python 3.)
                keys = list(seq_features.keys())

                if opts.type == "mean":
                    features = np.empty((len(seq_features), 20))

                    # Trim the first/last 10% of frames, then average over time.
                    for idx, key in enumerate(keys):
                        length = seq_features[key].shape[1]
                        features[idx, :] = seq_features[key][:, int(0.1 * length):int(0.9 * length)].mean(axis=1)
                elif opts.type == "lstm":
                    print("Loading network...")
                    model = LSTMSeq2Seq(config["blackbird"]["lstm"]["arch"].get("unicode"),
                                        config["blackbird"]["lstm"]["weights"].get("unicode"),
                                        config["blackbird"]["lstm"]["output"].get())
                    # Pad sequences to a fixed length for a batched forward pass.
                    maxlen = 150
                    padded_seq_features = np.empty((len(seq_features), maxlen, 20))
                    for idx, key in enumerate(keys):
                        padded_seq_features[idx, :, :] = sequence.pad_sequences(seq_features[key], maxlen=maxlen, dtype="float32").T

                    print("Getting vectors...")
                    features = model.predict(padded_seq_features)
                else:
                    print("Provide a valid --type [mean, lstm]")
                    sys.exit(1)

                print("Reducing dimensions...")
                features_2d = TSNE(n_components=2).fit_transform(features)

                print("Writing to db...")
                conn = sqlite3.connect(config["blackbird"]["db"].get("unicode"))
                try:
                    cur = conn.cursor()
                    cur.execute("DELETE FROM coords")

                    to_insert = [(keys[idx],
                                  features_2d[idx, 0],
                                  features_2d[idx, 1])
                                 for idx in range(features_2d.shape[0])]
                    cur.executemany("INSERT INTO coords VALUES (?, ?, ?)", to_insert)
                    conn.commit()
                finally:
                    # Close the connection even if the insert fails.
                    conn.close()

                # Fill leftovers: items without sequence features get synthetic
                # coordinates inside the t-SNE bounding box.
                known_ids = set(keys)  # O(1) membership instead of O(n) list scan
                ids_to_fill = [item.id for item in lib.items()
                               if item.id not in known_ids]

                self.fill(ids_to_fill, features_2d.min(axis=0), features_2d.max(axis=0))
            else:
                print("Provide a valid --type [mean, lstm]")
示例#2
0
"""## t-SNE"""

import matplotlib.pyplot as plt
from matplotlib import offsetbox

"""Classify restaurants according to TF-IDF values"""

from sklearn.manifold import TSNE
# X = w2v_for_each_restaurant
X = pd.read_csv("tf_idf_values.csv")
rating = pd.read_csv("lratings_w2v.csv")
co = rating['0']
# X = whole
X_tsne = TSNE(n_components=2,perplexity=5,n_iter=100000,early_exaggeration=70,learning_rate=10,method="exact").fit_transform(X)
x_min, x_max = X_tsne.min(0), X_tsne.max(0)
X_norm = (X_tsne - x_min) / (x_max - x_min)
y = reviews["restaurant"]
plt.figure(figsize=(12, 12))
for i in range(X_norm.shape[0]):
  plt.text(X_norm[i, 0], X_norm[i, 1], y[i], color=plt.cm.Set1(co[i]+1), 
             fontdict={'weight': 'bold', 'size': 12})
plt.xticks([])
plt.yticks([])
plt.show()

"""Classify restaurants based on the whole reviews"""

# X = w2v_for_each_restaurant
X = whole
X_tsne = TSNE(n_components=2,perplexity=5,n_iter=100000,early_exaggeration=60,learning_rate=10,method="exact",init="pca").fit_transform(X)
示例#3
0
    def plot_transfer_embeddings(self):
        """Plot a joint t-SNE of speaker embeddings for source utterances and
        their voice-converted counterparts, saving the figure as transfer.png.
        """

        # One output directory per (val_set, load_iteration) pair.
        output_image_path = os.path.join(self.args.output_image_path,self.args.val_set,self.args.load_iteration)
        os.makedirs(output_image_path,exist_ok=True)

        speakers = []
        utts = []
        # in_test
        # self.samples = ['252', '240', '237', '341', '274', '236', '272', '329', '271', '301']
        # out_test
        # self.samples = ['232', '305', '227', '238', '263', '339', '376', '318', '286', '312']
        # Expand each sampled speaker into (speaker, utterance) pairs.
        for speaker in self.samples:
            speakers += [speaker] * len(self.indexes[speaker])
            utts += self.indexes[speaker]

        # The 'AdaVAEd' model variant consumes 'dmel' spectrograms; others use 'mel'.
        use_spec = 'dmel' if self.args.model_type == 'AdaVAEd' else 'mel'
        dataset = TransferDateset(os.path.join(self.args.data_dir, self.args.dataset), 
                                  speakers, utts, self.indexes, segment_size=None, 
                                  load_spectrogram=use_spec)

        dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=4, pin_memory=True)
        embs = []       # speaker embeddings of the source utterances
        embs_tran = []  # speaker embeddings of the converted utterances

        for data in dataloader:
            spec = cc(data['spectrogram'])
            spec_d = cc(data['dmel'])
            spec_src = cc(data['source'])

            # Pick the spectrogram type matching the model variant (see above).
            use_spec = spec_d if self.args.model_type == 'AdaVAEd' else spec
            emb = self.model.get_speaker_embeddings(use_spec)

            with torch.no_grad():
                # Convert the source content to this speaker, then re-embed the result.
                spec_tran = self.model.inference(spec_src, spec)
                emb_tran = self.model.get_speaker_embeddings(spec_tran)

            embs += emb.detach().cpu().numpy().tolist() 
            embs_tran += emb_tran.detach().cpu().numpy().tolist() 

            print('Evaluate: {}/{}'.format(len(embs),len(dataloader)),end='\r') 

        # Stack source + transferred embeddings, then L2-normalize each row.
        embs_all = embs + embs_tran

        embs_all = np.array(embs_all)
        norms = np.sqrt(np.sum(embs_all ** 2, axis=1, keepdims=True))
        embs_all = embs_all / norms 

        # t-SNE
        print('\nt-SNE...')
        embs_2d = TSNE(n_components=2, init='pca', perplexity=50).fit_transform(embs_all)
        # Min-max scale the 2-D embedding to [0, 1] for plotting.
        x_min, x_max = embs_2d.min(0), embs_2d.max(0)
        embs_2d = (embs_2d - x_min) / (x_max - x_min)

        # First half of the rows are source embeddings, second half transferred.
        embs_2d_src = embs_2d[:len(embs)]
        embs_2d_tran = embs_2d[len(embs):]
        # plot to figure
        # Indices split by gender; color encodes speaker identity.
        female_cluster = [i for i, speaker in enumerate(speakers) if self.speaker_infos[speaker][1] == 'F']
        male_cluster = [i for i, speaker in enumerate(speakers) if self.speaker_infos[speaker][1] == 'M']
        colors = np.array([self.samples_index[speaker] for speaker in speakers])
        # Markers: x/o = source (F/M), s/+ = transferred (F/M).
        plt.scatter(embs_2d_src[female_cluster, 0], embs_2d_src[female_cluster, 1],  c=colors[female_cluster], marker='x') 
        plt.scatter(embs_2d_src[male_cluster, 0], embs_2d_src[male_cluster, 1], c=colors[male_cluster], marker='o') 
        plt.scatter(embs_2d_tran[female_cluster, 0], embs_2d_tran[female_cluster, 1],  c=colors[female_cluster], marker='s') 
        plt.scatter(embs_2d_tran[male_cluster, 0], embs_2d_tran[male_cluster, 1], c=colors[male_cluster], marker='+') 
        plt.savefig(os.path.join(output_image_path,'transfer.png'))
        plt.clf()
        plt.cla()
        plt.close()
        return
        alpha = 0.5  # NOTE(review): unreachable — follows the return above; likely leftover
    hit = np.append(train.rowmeta['Tissue'] == tissue,
                    valid.rowmeta['Tissue'] == tissue)
    ax.plot(T[hit, 0],
            T[hit, 1],
            linestyle='None',
            linewidth=0,
            marker='o',
            markerfacecolor=color,
            markeredgecolor=color,
            markersize=2,
            markeredgewidth=0,
            alpha=alpha,
            zorder=zorder,
            label=tissue)
ax.set_xlim(T.min(0)[0], 1.5 * (T.max(0)[0] - T.min(0)[0]) - T.max(0)[0])
#ax.set_ylim(T.min(0)[1], T.max(0)[1]+1*(T.max(0)[1]-T.min(0)[1]))
ax.legend(loc='best',
          ncol=2,
          numpoints=1,
          markerscale=2,
          fontsize=8,
          labelspacing=0.1)
ax.tick_params(axis='both',
               which='major',
               bottom='off',
               top='off',
               labelbottom='off',
               labeltop='off',
               left='off',
               right='off',
示例#5
0
    def main_train(self):
        """Embed face images with a pretrained FaceNet model, project the
        embeddings to 2-D with t-SNE, assign each image to a cell of an
        out_dim x out_dim grid via linear assignment, and save the resulting
        montage as "obrazek.jpg".
        """
        with tf.Graph().as_default():
            with tf.Session() as sess:
                img_data = facenet.get_dataset(self.datadir)
                path, label = facenet.get_image_paths_and_labels(img_data)
                print("label")
                print(label)
                print('Classes: %d' % len(img_data))
                print('Images: %d' % len(path))

                # Load the pretrained graph and grab its I/O tensors by name.
                facenet.load_model(self.modeldir)
                images_placeholder = tf.get_default_graph().get_tensor_by_name(
                    "input:0")
                embeddings = tf.get_default_graph().get_tensor_by_name(
                    "embeddings:0")
                phase_train_placeholder = tf.get_default_graph(
                ).get_tensor_by_name("phase_train:0")
                embedding_size = embeddings.get_shape()[1]

                print('Extracting features of images for model')
                batch_size = 1000
                image_size = 160
                nrof_images = len(path)
                nrof_batches_per_epoch = int(
                    math.ceil(1.0 * nrof_images / batch_size))
                emb_array = np.zeros((nrof_images, embedding_size))
                #print(nrof_batches_per_epoch)
                #for i in range(nrof_batches_per_epoch):
                # NOTE(review): only the first batch is processed (the batch
                # loop is commented out); images beyond batch_size keep their
                # zero embeddings — confirm this is intentional.
                start_index = 0 * batch_size
                end_index = min((0 + 1) * batch_size, nrof_images)
                paths_batch = path[start_index:end_index]
                images = facenet.load_data(paths_batch, False, False,
                                           image_size)
                feed_dict = {
                    images_placeholder: images,
                    phase_train_placeholder: False
                }
                emb_array[start_index:end_index, :] = sess.run(
                    embeddings, feed_dict=feed_dict)

                class_names = [cls.name.replace('_', ' ') for cls in img_data]
                classifier_file_name = os.path.expanduser(
                    self.classifier_filename)
                print('emb_array')
                print(emb_array)
                # Project embeddings to 2-D and min-max normalize to [0, 1].
                X_embedded = TSNE(n_components=2).fit_transform(emb_array)
                X_embedded -= X_embedded.min(axis=0)
                X_embedded /= X_embedded.max(axis=0)
                print("X_embedded")
                print(X_embedded)

                #for i in range(0, nrof_images-1):
                #    plt.plot(X_embedded[i, 0], X_embedded[i, 1],'bo')
                plt.legend(bbox_to_anchor=(1, 1))
                plt.show()
                # Grid layout: out_dim^2 cells, each out_res pixels square.
                out_dim = 8
                out_res = 160
                to_plot = np.square(out_dim)
                grid = np.dstack(
                    np.meshgrid(np.linspace(0, 1, out_dim),
                                np.linspace(0, 1, out_dim))).reshape(-1, 2)
                # Cost = squared distance from every grid cell to every
                # embedding, rescaled onto a bounded range for the solver.
                cost_matrix = cdist(grid, X_embedded,
                                    "sqeuclidean").astype(np.float32)
                cost_matrix = cost_matrix * (100000 / cost_matrix.max())
                print(cost_matrix)
                #rids, cids = solve_dense(costs)
                #print(rids)
                # NOTE(review): two assignment solvers run, but only lapjv's
                # col_asses drives the layout; linear_sum_assignment's result
                # is printed only — consider removing one.
                row_ind, col_ind = linear_sum_assignment(cost_matrix)
                row_asses, col_asses, _ = lapjv(cost_matrix)
                print("To cos")
                print(col_asses)
                print("teraz to!")
                print(row_ind)
                print(col_ind)
                for r, c in zip(row_ind, col_asses):
                    print(r, c)  # Row/column pairings
                grid_jv = grid[col_asses]
                out = np.ones((out_dim * out_res, out_dim * out_res, 3))
                print(grid_jv)

                # Paste each image at its assigned grid position.
                for pos, img in zip(grid_jv, images[0:to_plot]):
                    h_range = int(np.floor(pos[0] * (out_dim - 1) * out_res))
                    w_range = int(np.floor(pos[1] * (out_dim - 1) * out_res))
                    out[h_range:h_range + out_res,
                        w_range:w_range + out_res] = image.img_to_array(img)
                print(out)
                im = image.array_to_img(out)
                im.save("obrazek.jpg", quality=95)
示例#6
0
from sklearn.manifold import TSNE
import numpy as np
import json

# Stream-parse the JSON file directly; json.load(f) avoids the redundant
# read-into-memory + loads round trip of json.loads(f.read()).
with open('mv_data.json') as f:
    j = json.load(f)


def distance(a, b):
    """Euclidean (Frobenius) distance between two flattened 40x40 view matrices."""
    diff = a.reshape((40, 40)) - b.reshape((40, 40))
    return np.linalg.norm(diff)


# Flatten each paper's view matrix into one feature row per paper.
data = np.array([np.array(p['viewMatrix']).flatten() for p in j['papers']])

# Embed with t-SNE under the custom matrix distance, then rescale the
# coordinates from [min, max] per axis onto [-1, 1].
embed = TSNE(metric=distance).fit_transform(data)
embed -= embed.min(axis=0)
embed /= embed.max(axis=0)
embed = embed * 2 - 1

# Persist the normalized 2-D coordinates for the front end.
with open('tsne.json', 'w') as f:
    json.dump(embed.tolist(), f)
    def plot_segment_embeddings(self):
        """Sample speech segments, embed them with the speaker encoder, and
        save a t-SNE scatter plot (color = speaker, marker = gender) as
        segment.png.
        """

        # One output directory per (val_set, load_iteration) pair.
        output_image_path = os.path.join(self.args.output_image_path,
                                         self.args.val_set,
                                         self.args.load_iteration)
        os.makedirs(output_image_path, exist_ok=True)

        # Expand each sampled speaker into (speaker, utterance) pairs.
        speakers = []
        utts = []
        for speaker in self.samples:
            speakers += [speaker] * len(self.indexes[speaker])
            utts += self.indexes[speaker]

        dataset = EvaluateDateset(
            os.path.join(self.args.data_dir, self.args.dataset),
            speakers,
            utts,
            segment_size=self.config['data_loader']['segment_size'],
            dset=self.args.val_set,
            load_spectrogram='dmel')

        dataloader = DataLoader(dataset,
                                batch_size=128,
                                shuffle=False,
                                num_workers=0,
                                pin_memory=True)
        batchiter = infinite_iter(dataloader)

        embs = []
        speakers = []  # rebuilt here to match the order segments come off the loader
        # run the model until n_segments embeddings are collected
        while (len(embs) < self.args.n_segments):
            data = next(batchiter)
            speakers += data['speaker']
            data = cc(data['spectrogram'].permute(0, 2, 1))
            emb = self.model.get_speaker_embeddings(data)
            embs += emb.detach().cpu().numpy().tolist()
            print('Evaluate: {}/{}'.format(len(embs), self.args.n_segments),
                  end='\r')

        # L2-normalize each embedding row.
        embs = np.array(embs)
        norms = np.sqrt(np.sum(embs**2, axis=1, keepdims=True))
        embs = embs / norms

        # t-SNE
        print('\nt-SNE...')
        embs_2d = TSNE(n_components=2, init='pca',
                       perplexity=50).fit_transform(embs)
        # Min-max scale the 2-D embedding to [0, 1] for plotting.
        x_min, x_max = embs_2d.min(0), embs_2d.max(0)
        embs_2d = (embs_2d - x_min) / (x_max - x_min)

        # plot to figure
        # Indices split by gender; color encodes speaker identity.
        female_cluster = [
            i for i, speaker in enumerate(speakers)
            if self.speaker_infos[speaker][1] == 'F'
        ]
        male_cluster = [
            i for i, speaker in enumerate(speakers)
            if self.speaker_infos[speaker][1] == 'M'
        ]
        colors = np.array(
            [self.samples_index[speaker] for speaker in speakers])
        plt.scatter(embs_2d[female_cluster, 0],
                    embs_2d[female_cluster, 1],
                    c=colors[female_cluster],
                    marker='x')
        plt.scatter(embs_2d[male_cluster, 0],
                    embs_2d[male_cluster, 1],
                    c=colors[male_cluster],
                    marker='o')
        plt.savefig(os.path.join(output_image_path, 'segment.png'))
        plt.clf()
        plt.cla()
        plt.close()
        return
示例#8
0
def convert_to_dict(clusters_to_filter, ru_idfs, fi_idfs, start_time):
    """Convert interesting clusters into a time-sorted list of JSON update events.

    Each surviving cluster produces one 'n' (new-cluster) event followed by
    hourly 'u' (update) events; a final 'u' with growth -1 marks the cluster's
    end. t-SNE coordinates, normalized to [-1, 1], give each cluster a 2-D
    position.

    Args:
        clusters_to_filter: iterable (or dict keyed by id) of cluster objects.
        ru_idfs / fi_idfs: per-language IDF tables (selected per cluster; kept
            for interface compatibility with callers).
        start_time: epoch milliseconds marking the visualization window start.

    Returns:
        List of update dicts, sorted ascending by timestamp key 't'.
    """
    print(start_time)
    if isinstance(clusters_to_filter, dict):
        clusters_to_filter = clusters_to_filter.values()

    clusters_to_save = filter_interesting_clusters(clusters_to_filter)
    json_formatted = []

    # Unit-normalize cluster centers; t-SNE needs at least a handful of points.
    cdata = [c.center / c.norm for c in clusters_to_save]
    if len(cdata) < 5:
        return json_formatted

    t_sne_space = TSNE(n_components=2, metric='cosine').fit_transform(cdata)
    # normalize T-SNE space to -1 to 1
    minimums = t_sne_space.min(axis=0)
    maximums = t_sne_space.max(axis=0)
    for v in t_sne_space:
        v[0] = 2 * (v[0] - minimums[0]) / (maximums[0] - minimums[0]) - 1
        v[1] = 2 * (v[1] - minimums[1]) / (maximums[1] - minimums[1]) - 1

    for cluster_index in range(len(clusters_to_save)):
        c = clusters_to_save[cluster_index]

        idfs = ru_idfs if c.lang == 'ru' else fi_idfs

        # TODO remove temporary filtering
        #
        if (c.created_at <
            (start_time - len(c.hourly_growth_rate) * 3600 * 1000)
            ):  #1405555200000): # 17/07/2014 00:00:00
            continue
        #if (c.created_at < 1503014400000): # 18/08/2017 00:00:00
        #continue

        if len(c.hourly_growth_rate) < 1:
            continue

        n_hours = len(c.hourly_growth_rate)
        start_idx = max(int((c.created_at - start_time) / 3600 / 1000), 1)
        if start_idx >= n_hours:
            # Cluster lies entirely outside the window: emit nothing, matching
            # the previous behavior where the hourly loop body never ran.
            continue

        # BUGFIX: iterate one step past the last hour so the end-of-cluster
        # marker (growth -1) is actually emitted; previously the branch
        # `i == len(c.hourly_growth_rate)` was unreachable because
        # range(start_idx, len(...)) stops at len(...) - 1.
        for i in range(start_idx, n_hours + 1):
            update = {}
            # timestamp
            update['t'] = int(c.first_growth_time / 1000) + i * 60 * 60

            if i == n_hours:
                # insert a negative number at the end to mark the end of the cluster
                update['u'] = {c.id: {'s': -1}}
            elif i == start_idx:
                # start with a new cluster event
                total_sentiment = c.hourly_accum_sentiment[
                    len(c.hourly_accum_sentiment) - 1]

                tags = c.hourly_tags[len(c.hourly_tags) - 1]

                if tags is not None:
                    tags = [
                        tag_label_overrides.get(t, t.title()) for t in tags
                    ]

                #get_keywords(c, idfs)[:4],
                update['n'] = {
                    c.id: {
                        's': round(c.hourly_growth_rate[i]),
                        'k': c.hourly_keywords[i],
                        'lang': c.lang,
                        'sentiment': round(c.hourly_sentiment[i], 3),
                        'sentiment_total': round(total_sentiment, 3),
                        'tags': tags if tags is not None else [],
                        't_sne': [float(t_sne_space[cluster_index][0]),
                                  float(t_sne_space[cluster_index][1])]
                    }
                }
            else:
                # hourly update for an already-announced cluster
                update['u'] = {
                    c.id: {
                        's': int(round(c.hourly_growth_rate[i])),
                        'sentiment': round(c.hourly_sentiment[i], 3),
                        'sentiment_accum': round(c.hourly_accum_sentiment[i],
                                                 3),
                        'k': c.hourly_keywords[i - 1]
                    }
                }

            json_formatted.append(update)

    # Chronological order across all clusters.
    json_formatted.sort(key=lambda update: update['t'])
    return json_formatted