Example #1
def tsne():
    corpus = load_hobbies()
    docs = TfidfVectorizer().fit_transform(corpus.data)

    oz = TSNEVisualizer(ax=newfig())
    oz.fit(docs, corpus.target)
    savefig(oz, "corpus_tsne")
Example #2
def perform_tsne(X,
                 Y,
                 vec=None,
                 outpath="",
                 clusterLabels=False,
                 savePlot=False):
    if vec is None:
        vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity)

    docs = vec.fit_transform(X)
    labels = Y

    # from yellowbrick.text import TSNEVisualizer
    tsne = TSNEVisualizer()

    if clusterLabels:
        tsne.fit(docs,
                 ["c{}".format(c) for c in Y])  # where Y=clusters.labels_
    else:
        tsne.fit(docs, labels)

    if savePlot:
        # tsne.finalize()
        tsne.poof(outpath=outpath)
    else:
        tsne.poof()
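The identity helper is not shown here; it is presumably a pass-through so the vectorizer accepts pre-tokenized input. A hedged usage sketch under that assumption (and assuming a yellowbrick release that still provides poof()):

from yellowbrick.datasets import load_hobbies


def identity(words):
    return words  # pass-through preprocessor/tokenizer (assumption; the original helper is not shown)


corpus = load_hobbies()
tokens = [doc.split() for doc in corpus.data]  # naive whitespace tokenization
perform_tsne(tokens, corpus.target, savePlot=True, outpath="hobbies_tsne.png")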
Example #3
File: plotters.py  Project: GA239/DS_course
 def clusters_tsne(self, labels: pd.Series, title: str = 'title'):
     tsne = TSNEVisualizer(random_state=42)
     tsne.fit(self.vectors, labels)
     f = tsne.show().figure
     f.set_figheight(15)
     f.set_figwidth(15)
     f.suptitle(title)
     return f
Example #4
def tsne_visualization(dataset_object, num_examples):
    """
    Produce and save T-SNE visualization of feature vectors for given dataset
    Parameters
    ----------
    dataset_object: tv_net.dataset.Dataset
        dataset object containing feature vectors and class names
    num_examples: int
        number of examples to plot
    """

    dataset_object.shuffle_examples()  # shuffle so that we don't get all one class
    feature_vectors = np.array(
        [item.feature_vector for item in dataset_object.items[:num_examples]])
    label_list = [
        item.class_name for item in dataset_object.items[:num_examples]
    ]

    title = 'T-SNE of feature vectors extracted from baseline classifier - using random sample of {} images'.format(
        num_examples)
    tsne = TSNEVisualizer(colormap='rainbow', title=title)
    tsne.fit(feature_vectors, label_list)
    output_path = os.path.join(dataset_object.config.OUTPUT_DIR,
                               'visualizations', 'feature_vector_tsne.png')
    tsne.show(outpath=output_path)
    tsne.show()  # have to repeat to show and save
Example #5
def text_cluster_tsne(text_vector,
                      TextVectorizer=TfidfVectorizer,
                      text_kwargs=text_kwargs,
                      n_clusters=10,
                      labels=None):
    '''Uses a TextVectorizer to transform the text contained in the text_vector
    arg (at the sentence or paragraph level) and produces a TSNE visualization.
    If labels are not passed, the plot is labeled with clusters produced by
    KMeans.

    ARGS:
        text_vector <np.array>: Vector of text units.  Must be type str.
    KWARGS:
        TextVectorizer <sklearn.feature_extraction.text>: Transformer.
        text_kwargs <dict>: kwargs to pass to TextVectorizer
        n_clusters <int>: If not using labels, number of clusters in KMeans
        labels <np.array>: True categorical labels.  Discrete.
    RETURNS:
        None; displays the visualization.
    '''
    txt_vctzr = TextVectorizer(**text_kwargs)
    docs = txt_vctzr.fit_transform(text_vector)
    tsne = TSNEVisualizer()

    if labels is None:
        # derive clusters if labels not provided
        clusters = KMeans(n_clusters=n_clusters)
        clusters.fit(docs)
        tsne.fit(docs, ["cluster_{}".format(c) for c in clusters.labels_])
    else:
        # otherwise use labels
        tsne.fit(docs, labels)
    sns.despine()
    tsne.poof()
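text_kwargs defaults to a dict defined elsewhere in that project; a possible invocation, assuming any dict of vectorizer options will do:

from yellowbrick.datasets import load_hobbies

corpus = load_hobbies()
sample_kwargs = {"stop_words": "english", "max_features": 5000}  # hypothetical vectorizer options

# without labels, KMeans clusters are derived and used to color the plot
text_cluster_tsne(corpus.data, text_kwargs=sample_kwargs, n_clusters=5)

# with true labels, the clustering step is skipped
text_cluster_tsne(corpus.data, text_kwargs=sample_kwargs, labels=corpus.target)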
Example #6
def visualize_yellowbrick(dim_reduction,
                          encoding,
                          corpus_data,
                          corpus_target,
                          labels=True,
                          alpha=0.7,
                          metric=None):
    # https://pypi.org/project/yellowbrick/
    # https://github.com/DistrictDataLabs/yellowbrick/tree/develop/examples
    # https://medium.com/@sangarshananveera/rapid-text-visualization-with-yellowbrick-51d3499c9333

    if 'tfidf' in encoding.lower():
        encode = TfidfVectorizer()
    if 'count' in encoding.lower():
        encode = CountVectorizer()
    docs = encode.fit_transform(corpus_data)
    if labels is True:
        labels = corpus_target
    else:
        labels = None
    if 'umap' in dim_reduction.lower():
        if metric is None:
            viz = UMAPVisualizer()
        else:
            viz = UMAPVisualizer(metric=metric)
    if 't-sne' in dim_reduction.lower():
        viz = TSNEVisualizer(alpha=alpha)

    viz.fit(docs, labels)

    return viz.poof()
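A possible call with the hobbies corpus (UMAPVisualizer additionally requires the optional umap-learn dependency):

from yellowbrick.datasets import load_hobbies

corpus = load_hobbies()
visualize_yellowbrick('t-sne', 'tfidf', corpus.data, corpus.target)
visualize_yellowbrick('umap', 'count', corpus.data, corpus.target, metric='cosine')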
Example #7
    def result(self):
        data_df = self.clean_data.data()
        all_data_df = self.clean_data.getSpambase_data()
        target_df = self.clean_data.target()

        # Defining Model
        model = TSNE(learning_rate=100)

        # Fitting Model
        transformed = model.fit_transform(all_data_df)

        # Plotting 2d t-Sne
        x_axis = transformed[:, 0]
        pprint.pprint(x_axis)
        y_axis = transformed[:, 1]
        pprint.pprint(y_axis)

        plt.scatter(x_axis, y_axis, c=target_df)
        #plt.show()
        plt.savefig(self.file_png)

        # Create the visualizer and draw the vectors
        tfidf = TfidfVectorizer()
        docs = tfidf.fit_transform(data_df)

        tsne = TSNEVisualizer()
        tsne.fit(docs, target_df)
        tsne.poof()
Example #8
def tsne_kmeans_clusters(tfidf, num_clusters=[3, 5, 7, 9, 11]):
    '''
    Vectorizer results are normalized, which makes KMeans behave as
    spherical k-means for better results. Since LSA/SVD results are
    not normalized, we have to redo the normalization.
    '''
    print(
        '\nUse sklearn tSNE to visualize viability of cluster estimates to inform n topic choices: {}'
        .format(num_clusters))

    for k in num_clusters:
        start = datetime.now()

        svd = TruncatedSVD(n_components=50, n_iter=10, random_state=0)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)

        reduced = lsa.fit_transform(tfidf)

        # next, apply kmeans to the corpus to get labels
        clusters = KMeans(n_clusters=k, init='k-means++')
        clusters.fit(reduced)

        tsne = TSNEVisualizer(decompose=None)
        tsne.fit(reduced, ["cluster {}".format(c) for c in clusters.labels_])

        tsne.finalize()
        filename = r'images/tsne_projections/tSNE_wKMeans_SVD_' + str(
            k) + '_clusters_' + str(tfidf.shape[0]) + '_docs.png'
        plt.savefig(filename)
        plt.close()

        end = datetime.now()
        print('            ' + filename)
        print("            Time taken: {}".format(end - start))
Example #9
def update_justifications_data(text, assignment_name, rating_type):
    if(assignment_name is None):
        return None
    
    global db

    db.close()
    db.connect()

    if(text=='Decision Point'):
        all_justifications, title, custom_stopwords = get_dp_justifications(assignment_name)
    elif(text=='Innovation Ratings'):
        if(rating_type is None):
            return None

        assignment = assignment_name.split()[0]
        pre_post = assignment_name.split()[1].split('-')[0]
        all_justifications, title, custom_stopwords = get_ratings_justifications(assignment, pre_post, rating_type)

    custom_stopwords.extend(['wa', 'could', 'also', 'would', 'ha', 'i', 'p', 'g', 'this', 'the'])

    all_justifications = all_justifications[[len(x) > 2 for x in all_justifications['justification']]]
    all_justifications['justification'].replace('', np.nan, inplace=True)
    all_justifications.dropna(inplace=True)
    all_justifications['original'] = all_justifications['justification']
    all_justifications['justification'] = all_justifications['justification'].apply(lambda x: lemmatize(x))
    all_justifications['sentiment'] = all_justifications['justification'].apply(lambda x: calcParagraphSentiment(x))

    tfidf = TfidfVectorizer(stop_words=custom_stopwords)
    docs = tfidf.fit_transform(list(all_justifications['justification']))

    tsne = TSNEVisualizer(random_state=14)
    transformer = tsne.make_transformer()
    data = transformer.fit_transform(docs)

    all_justifications['x'] = data[:, 0]
    all_justifications['y'] = data[:, 1]

    return {'data': all_justifications.to_dict(), 'title': title, 'custom_stopwords': custom_stopwords}
Example #10
def tsne_pack(c, l):
    my_title = "t-SNE Plot of " + c + " feature"
    data = df.filter(like=c)
    tfidf = TfidfVectorizer()
    new_values = tfidf.fit_transform(corpus)  # fit on the global corpus; note this result is not used below
    tsne = TSNEVisualizer(title=my_title)
    tsne.fit(data, l)
    tsne.poof()
Example #11
def tsne(docs, target, outpath, **kwargs):
    # Create a new figure and axes
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Visualize the frequency distribution
    visualizer = TSNEVisualizer(ax=ax, **kwargs)
    visualizer.fit(docs, target)
    visualizer.poof(outpath=outpath)
Example #12
def plot_tsne_clusters(corpus, fileids=None, labels=None):
    from yellowbrick.text import TSNEVisualizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    words = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    normed = (sent for title in normalizer.transform(words) for sent in title)
    # normed = (dd for dd in normalizer.transform(docs))
    tfidf = TfidfVectorizer()
    procd = tfidf.fit_transform(normed)

    tsne = TSNEVisualizer()
    if labels is None:
        tsne.fit(procd)
    else:
        tsne.fit(procd, ["c{}".format(c) for c in labels])
    tsne.poof()
Example #13
def analyse_2_step_model():
    X_test = np.load(
        "test_prepared.npy").item()  # this is our Single point of truth
    #test_silhouette(30, X_test)

    test = X_test[0:1000]
    prediction = test_entire_model()[0:1000]

    vis_shilouette(test, prediction)
    plt.savefig("silhouette.png")

    tsne = TSNEVisualizer(colormap=cm.get_cmap('jet', len(set(prediction))))
    tsne.fit(test[0:1000], ["c{}".format(c) for c in prediction])
    tsne.poof(outpath="tsne.png")
Example #14
def tsne(c, l):
    my_title = "t-SNE Plot of final model"
    data = c
    tfidf = TfidfVectorizer()
    new_values = tfidf.fit_transform(corpus)  # fit on the global corpus; note this result is not used below
    tsne = TSNEVisualizer(title=my_title)
    tsne.fit(data, l)
    tsne.poof()


# %%time
figure(figsize=(20, 10))
tsne(final, label_bias3)

# %%time
figure(figsize=(20, 10))
tsne(final, label_fact)
Example #15
def tsne(ax, classes=True):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from yellowbrick.text import TSNEVisualizer

    X, y = load_data("hobbies", text=True)
    if not classes:
        y = None

    freq = TfidfVectorizer(input='filename', stop_words='english')
    X = freq.fit_transform(X)

    visualizer = TSNEVisualizer(ax=ax)
    visualizer.title = "t-SNE Projection of the Hobbies Corpus"
    if not classes:
        visualizer.title = "Unlabeled " + visualizer.title
    visualizer.fit(X, y)
    return visualizer
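load_data is a helper from that project and is not shown; given input='filename' on the vectorizer, it presumably yields file paths plus labels. Assuming it is importable, a usage sketch:

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(9, 6))
viz = tsne(ax, classes=True)  # fit the visualizer on the provided axes
viz.show()                    # or viz.poof() on older yellowbrick releases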
Example #16
def generate_tsne(title, X, labels):

    fig, (ax1) = plt.subplots(1, 1, figsize=(4, 2))
    title_dic = {'fontsize': 7, 'fontweight': 'bold'}

    colors = resolve_colors(11, 'Spectral_r')
    colors2 = resolve_colors(10, 'BrBG_r')
    tsne = TSNEVisualizer(ax1, colors=colors + colors2, decompose=None)
    tsne.fit(X, labels)
    tsne.finalize()
    ax1 = tsne.ax
    ax1.set_title(title, title_dic)

    path = os.path.join(OUTPUT)
    filename = title
    filename = os.path.join(path, filename)
    plt.savefig(filename)
Example #17
def cluster(corpus, k):
    y = [i[0] for i in corpus]
    corpus = [i[1] for i in corpus]
    eng = list(set(stopwords.words('english')))

    trump = [
        'wall', 'president', 'trump', 'loss', 'yes', 'sorry', 'mr', 'build',
        'thank', 'people'
    ]

    s_w = eng + trump

    vectorizer = TfidfVectorizer(stop_words=s_w)
    vectorizer.fit(corpus)
    features = vectorizer.transform(corpus)

    tsne = TSNEVisualizer()
    tsne.fit(features, y)
    tsne.show()
Example #18
def analyse_results():
    rerun = False
    if ("rerun" in sys.argv):
        print("Redo everything")
        rerun = True

    X_test = np.load("test_prepared.npy").item()

    results = []
    names = []

    for filename in os.listdir("results"):
        if filename.endswith(".npy"):
            if filename[:-4] + "tsne.png" in os.listdir(
                    "results") and not rerun:
                continue

            results.append(np.load("results/" + filename))
            names.append(filename[:-4])

    for i in range(len(results)):
        print("iteration " + str(i + 1) + " of " + str(len(results)) + " : " +
              names[i])

        vis_shilouette(X_test, results[i])
        plt.savefig("results/" + names[i] + "silhouette.png")

        plt.close()
        plt.figure()

        tsne = TSNEVisualizer(colormap=cm.get_cmap(
            'jet', len(set(results[i][0:5000]))),
                              alpha=0.5,
                              random_state=45)  # make it deterministic
        tsne.fit(X_test[0:5000], ["c{}".format(c) for c in results[i][0:5000]])
        tsne.poof(outpath="results/" + names[i] + "tsne.png",
                  clear_figure=True)
Example #19
    def tsne_plot(self, outpath, sample_size=1000, tfidf=True):
        """
        Creates a png file at `outpath` with t-SNE visualization.
        `sample_size` determines the size of the random sample from each label.
        Uses TfidfVectorizer by default;
        if `tfidf` is set to False, CountVectorizer is used.
        -----------------------------------------------------------------------
        More info:
        https://www.scikit-yb.org/en/latest/api/text/tsne.html
        https://lvdmaaten.github.io/tsne/
        """

        if self.tokenizer is None:
            print('No tokenizer was loaded.')
            return None

        df = pd.DataFrame(columns=self.data.columns)
        for label in self.labels:
            samp_df = self.data \
                .query("Label == @label") \
                .sample(sample_size, random_state=19)
            df = df.append(samp_df, ignore_index=True)

        # vectorize
        if tfidf:
            vectorizer = TfidfVectorizer(tokenizer=self.tokenizer.tokenize)
        else:
            vectorizer = CountVectorizer(tokenizer=self.tokenizer.tokenize)
        X = vectorizer.fit_transform(df.Text)
        y = df.Label

        # create the visualizer and draw the vectors
        tsne = TSNEVisualizer()
        tsne.fit(X, y)
        tsne.show(outpath=outpath)

        return None
Example #20
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(tokenizer=lambda x: [i.strip() for i in x.split(',')], lowercase=False)
dummies = vect.fit_transform(df['ingredients'].apply(','.join)) 

df = pd.DataFrame(dummies.todense(),columns=vect.get_feature_names())
print("Vocab Length: ", len(vect.get_feature_names()))
print("All Data Shape: ", df.shape)
df.index= df_index

print("Number of Predictors: ", df.shape[0])
df.head()

# Create the visualizer and draw the vectors
plt.figure(figsize = [15,9])
tsne = TSNEVisualizer()
tsne.fit(df.loc[traindex,:][:7000], y[:7000])
tsne.poof()

X = df.loc[traindex,:]
print("Number of Cuisine Types: ", y.nunique())
print("X Shape: ", X.shape)
test_df = df.loc[testdex,:]
print("Test DF Shape: ", test_df.shape)
del df; gc.collect();

LogisticRegression().get_params().keys()

model = LogisticRegression(multi_class= 'ovr')
score = cross_validate(model, X, y, return_train_score=False)
score["test_score"].mean()
Example #21
# freq_dist_viz(vectorizer, df_train['Lyrics'],
#               "images/tfid_stopwords_train.png")
# freq_dist_viz(vectorizer, df_test['Lyrics'], "images/tfid_stopwords_test.png")


def get_sentence_embedding(w2v_model, sentence):
    embedding = np.zeros(3000)

    for word in sentence.split():
        try:
            vector = w2v_model.wv.get_vector(word)
        except KeyError:
            vector = np.zeros(3000)
        embedding += vector

    return embedding / len(sentence.split())


w2v_model = Word2Vec.load("word2vec_models/word2vec.model")
docs = np.array([
    get_sentence_embedding(w2v_model, sentence)
    for sentence in df_train['Lyrics']
])
# tfidf = TfidfVectorizer()
# docs = tfidf.fit_transform(X)
labels = df_train['Genre']

tsne = TSNEVisualizer()
tsne.fit(docs, labels)
tsne.poof("images/w2v_tsne.png")
Example #22
#!/usr/bin/env python3

import pickle
from yellowbrick.text import TSNEVisualizer

with open('data/agorb.csv', 'rb') as file:
    agora = pickle.load(file)

with open('data/tno/tfidf_vectors_webiq.pkl', 'rb') as file:
    X = pickle.load(file)

with open('data/tno/categorieen.pkl', 'rb') as file:
    c = pickle.load(file)

tsne = TSNEVisualizer()
tsne.fit(X, c)
tsne.show()
Example #23
def main(X_train_smart, X_test_smart, y_train_smart, y_test_smart,
         X_train_bank, X_test_bank, y_train_bank, y_test_bank, args):

    # em = KMeans(n_clusters=4, random_state=27)
    # em.fit(X_train_smart)
    # prediction = em.predict(X_train_smart)

    # viz = RadViz()
    # viz.fit_transform(X_train_smart, prediction)
    # viz.show()

    # umap = UMAPVisualizer()
    # umap.fit(X_train_smart, ["c{}".format(c) for c in prediction])
    # umap.show()

    # tsne = TSNEVisualizer(decompose_by=4)
    # tsne.fit(X_train_smart, ["c{}".format(c) for c in prediction])
    # tsne.show()
    # exit()
    sil_score_list_smart = []
    cal_har_score_list_smart = []
    davies_bouldin_score_list_smart = []
    sil_score_list_bank = []
    cal_har_score_list_bank = []
    davies_bouldin_score_list_bank = []
    num_clusters_list = np.arange(2, 25)
    for num_clusters in num_clusters_list:
        k_means = KMeans(n_clusters=num_clusters, random_state=27)
        k_means.fit(X_train_smart)
        prediction = k_means.predict(X_train_smart)
        # print(prediction)
        sil_score_list_smart.append(silhouette_score(X_train_smart,
                                                     prediction))
        cal_har_score_list_smart.append(
            calinski_harabasz_score(X_train_smart, prediction))
        davies_bouldin_score_list_smart.append(
            davies_bouldin_score(X_train_smart, prediction))

    for num_clusters in num_clusters_list:
        k_means = KMeans(n_clusters=num_clusters, random_state=27)
        k_means.fit(X_train_bank)
        prediction = k_means.predict(X_train_bank)
        # print(prediction)
        sil_score_list_bank.append(silhouette_score(X_train_bank, prediction))
        cal_har_score_list_bank.append(
            calinski_harabasz_score(X_train_bank, prediction))
        davies_bouldin_score_list_bank.append(
            davies_bouldin_score(X_train_bank, prediction))

    with open('experiment_best.json') as f:
        params = json.load(f)
    if args.dimensionality is None:
        num_clusters_smart = params['k_means']['smart']
        num_clusters_bank = params['k_means']['bank']
    else:
        num_clusters_smart = params[args.dimensionality[0]]['k_means']['smart']
        num_clusters_bank = params[args.dimensionality[0]]['k_means']['bank']

    # Scale these for plotting
    cal_har_score_list_smart = [x / 500 for x in cal_har_score_list_smart]
    cal_har_score_list_bank = [x / 500 for x in cal_har_score_list_bank]
    davies_bouldin_score_list_smart = [
        x / 5 for x in davies_bouldin_score_list_smart
    ]
    davies_bouldin_score_list_bank = [
        x / 5 for x in davies_bouldin_score_list_bank
    ]

    plt.rc("font", size=8)
    plt.rc("axes", titlesize=12)
    plt.rc("axes", labelsize=10)
    plt.rc("xtick", labelsize=8)
    plt.rc("ytick", labelsize=8)
    plt.rc("legend", fontsize=8)
    plt.rc("figure", titlesize=11)
    #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4))
    fig, ax = plt.subplots(1, 4, figsize=(15, 4))
    fig.suptitle(
        'K-Means Clusters - # of clusters Analysis (Left: Smart Grid, Right: Bank Loan)',
        fontsize=14)
    # ax[0].plot(problem_size_list, sa_fitness_list, 'b-', label='Simulated Annealing', linewidth=1)
    # ax[0].plot(problem_size_list, ga_fitness_list, 'g:', label='Genetic', linewidth=1)
    ax[0].plot(num_clusters_list,
               sil_score_list_smart,
               'b-',
               label='Silhouette',
               linewidth=1)
    ax[0].plot(num_clusters_list,
               cal_har_score_list_smart,
               'r--',
               label='Calinski-Harabasz / 500',
               linewidth=1)
    ax[0].plot(num_clusters_list,
               davies_bouldin_score_list_smart,
               'g-.',
               label='Davies-Bouldin / 5',
               linewidth=1)
    ax[0].set(xlabel='K (# of clusters)', ylabel='Scores')
    ax[0].set_title('Clustering Scores')
    ax[0].legend()

    k_means = KMeans(n_clusters=num_clusters_smart, random_state=27)
    k_means.fit(X_train_smart)
    prediction_smart = k_means.predict(X_train_smart)
    tsne = TSNEVisualizer(decompose_by=X_train_smart.shape[1] - 1,
                          ax=ax[1],
                          random_state=27)
    tsne.fit(X_train_smart, ["c{}".format(c) for c in prediction_smart])
    ax[1].set_title(
        'tSNE Projection (clusters = {0})'.format(num_clusters_smart))
    ax[1].set_xticklabels([])
    ax[1].set_yticklabels([])

    ax[2].plot(num_clusters_list,
               sil_score_list_bank,
               'b-',
               label='Silhouette',
               linewidth=1)
    ax[2].plot(num_clusters_list,
               cal_har_score_list_bank,
               'r--',
               label='Calinski-Harabasz / 500',
               linewidth=1)
    ax[2].plot(num_clusters_list,
               davies_bouldin_score_list_bank,
               'g-.',
               label='Davies-Bouldin / 5',
               linewidth=1)
    ax[2].set(xlabel='K (# of clusters)', ylabel='Scores')
    ax[2].set_title('Clustering Scores')
    ax[2].legend()

    k_means = KMeans(n_clusters=num_clusters_bank, random_state=27)
    k_means.fit(X_train_bank)
    prediction_bank = k_means.predict(X_train_bank)
    tsne_bank = TSNEVisualizer(decompose_by=X_train_bank.shape[1] - 1,
                               ax=ax[3],
                               random_state=27)
    tsne_bank.fit(X_train_bank, ["c{}".format(c) for c in prediction_bank])
    ax[3].set_title(
        'tSNE Projection (clusters = {0})'.format(num_clusters_bank))
    ax[3].set_xticklabels([])
    ax[3].set_yticklabels([])

    plt.show()

    # Boosting validation
    # Smart grid
    boosting_learner = AdaBoostClassifier(learning_rate=1, n_estimators=100)
    boost_fit_t = time()
    boosting_learner.fit(X_train_smart, y_train_smart)
    boost_fit_time = time() - boost_fit_t
    print('Boosting baseline fit time (smart): ' + str(boost_fit_time))
    boost_pred_t = time()
    boost_pred = boosting_learner.predict(X_test_smart)
    boost_pred_time = time() - boost_pred_t
    print('Boosting baseline predict time (smart): ' + str(boost_pred_time))
    boost_score = cross_val_score(boosting_learner,
                                  X_train_smart,
                                  y_train_smart,
                                  cv=10)
    print('Boosting baseline cross validation score (smart): ' +
          str(np.mean(boost_score)))
    # boost_accuracy = accuracy(boosting_learner, y_test, boost_pred)
    # print('Boosting baseline test set predict accuracy: ' + str(boost_accuracy))

    boosting_learner = AdaBoostClassifier(learning_rate=1, n_estimators=100)
    boost_fit_t = time()
    boosting_learner.fit(X_train_smart, prediction_smart)
    boost_fit_time = time() - boost_fit_t
    print('Boosting DR + cluster fit time (smart): ' + str(boost_fit_time))
    boost_pred_t = time()
    boost_pred = boosting_learner.predict(X_test_smart)
    boost_pred_time = time() - boost_pred_t
    print('Boosting DR + cluster predict time (smart): ' +
          str(boost_pred_time))
    boost_score = cross_val_score(boosting_learner,
                                  X_train_smart,
                                  prediction_smart,
                                  cv=10)
    print('Boosting DR + cluster cross validation score (smart): ' +
          str(np.mean(boost_score)))

    # Bank loan
    boosting_learner = AdaBoostClassifier(learning_rate=1, n_estimators=100)
    boost_fit_t = time()
    boosting_learner.fit(X_train_bank, y_train_bank)
    boost_fit_time = time() - boost_fit_t
    print('Boosting baseline fit time (bank): ' + str(boost_fit_time))
    boost_pred_t = time()
    boost_pred = boosting_learner.predict(X_test_bank)
    boost_pred_time = time() - boost_pred_t
    print('Boosting baseline predict time (bank): ' + str(boost_pred_time))
    boost_score = cross_val_score(boosting_learner,
                                  X_train_bank,
                                  y_train_bank,
                                  cv=10)
    print('Boosting baseline cross validation score (bank): ' +
          str(np.mean(boost_score)))

    boosting_learner = AdaBoostClassifier(learning_rate=1, n_estimators=100)
    boost_fit_t = time()
    boosting_learner.fit(X_train_bank, prediction_bank)
    boost_fit_time = time() - boost_fit_t
    print('Boosting DR + cluster fit time (bank): ' + str(boost_fit_time))
    boost_pred_t = time()
    boost_pred = boosting_learner.predict(X_test_bank)
    boost_pred_time = time() - boost_pred_t
    print('Boosting DR + cluster predict time (bank): ' + str(boost_pred_time))
    boost_score = cross_val_score(boosting_learner,
                                  X_train_bank,
                                  prediction_bank,
                                  cv=10)
    print('Boosting DR + cluster cross validation score (bank): ' +
          str(np.mean(boost_score)))

    return
Example #24
def load_corpus():
    c = Corpus("all_posts01.txt")
    return c


corpus = load_corpus()

#tfidf  = TfidfVectorizer(stop_words='english')
from sklearn.cluster import KMeans

vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=10000,
                             min_df=2,
                             use_idf=True)
#transformer =  TfidfTransformer()
#tfidf = make_pipeline(hasher,transformer)
docs = vectorizer.fit_transform(corpus.documents)

print(docs)

true_k = 500
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(docs)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

tsne = TSNEVisualizer(labels=["documents"])
tsne.fit(docs)
tsne.poof()
Example #25
           cbar=False,
           fmt='g')

####################visualisng Clusters

########### Dendrogram for TF-IDF features
from scipy.cluster.hierarchy import dendrogram, linkage

np.set_printoptions(precision=6, suppress=True)
H_cluster = linkage(tfidf_matrix, 'ward')
plt.title('Dendrogram')
plt.xlabel('Data')
plt.ylabel('Distance between data points')
dendrogram(
    H_cluster,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=13,  # show only the last p merged clusters
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,  # to get a distribution impression in truncated branches
)
plt.show()

#########Scatter plot to visualise k-means clusters
from yellowbrick.text import TSNEVisualizer

tsne = TSNEVisualizer()
tsne.fit(tfidf_matrix, ["c{}".format(c) for c in labels])
tsne.poof()
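tfidf_matrix and labels come from earlier cells of that notebook; a minimal stand-in for the t-SNE cell, assuming a plain list of documents and KMeans cluster labels:

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from yellowbrick.datasets import load_hobbies

documents = load_hobbies().data  # hypothetical stand-in corpus
tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(documents)
labels = KMeans(n_clusters=5).fit(tfidf_matrix).labels_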
Example #26
        color = "#000000"
        colormap.append(color)

for label in labels:
    big_colormap.append(mycolormap[label])

t6 = time.time()

tsne = TSNEVisualizer(colormap='RdYlGn')
tsne.fit(tfidf_matrix, labels)
tsne.poof()

t7 = time.time()

print("time for TSNE and vis: " + str(t7-t6))


tsne.poof()





Example #27
File: tsne.py  Project: yokeyong/atap
        categories=categories,
        files=files,
        data=data,
        target=target,
    )


# Load the data and create document vectors
corpus = load_corpus('hobbies')
tfidf  = TfidfVectorizer()

docs   = tfidf.fit_transform(corpus.data)
labels = corpus.target

# Create a visualizer to simply see the vectors plotted in 2D
tsne = TSNEVisualizer()
tsne.fit(docs)
tsne.poof()


# Create a visualizer to see how k-means clustering grouped the docs
from sklearn.cluster import KMeans

clusters = KMeans(n_clusters=5)
clusters.fit(docs)

tsne = TSNEVisualizer()
tsne.fit(docs, ["c{}".format(c) for c in clusters.labels_])
tsne.poof()

Example #28
    liste_galaxies = get_list_galaxie(path)

    matrix = np.zeros([len(t), len(liste_galaxies)])

    dirGalaxies = shelve.open(path + '/BDs/listeGalaxies')

    for galaxie in range(len(liste_galaxies)):
        for node in dirGalaxies[str(liste_galaxies[galaxie])]:
            matrix[index[node]][galaxie] += 1
        
        matrix[:,galaxie] = matrix[:,galaxie] / len(dirGalaxies[str(liste_galaxies[galaxie])])

    dirGalaxies.close()
    
    label = np.array([i for i in range(len(t))])
    tsne = TSNEVisualizer(decompose='svd',decompose_by=15)
    tsne.fit(matrix, label)
    print(tsne.transformer_)
    tsne.poof()

    svd = TruncatedSVD(n_components=15)
    svd_matrix = svd.fit_transform(matrix)
    tsne = ts.TSNE()
    y = tsne.fit_transform(svd_matrix)
    kmeans = Kmeans(5,200,0.1)
    kmeans.fit(y)
    for i in range(kmeans.nb_cluster):
        print("Cluster ",i)
        print((np.where(kmeans.which_cluster == i))[0])
        print()
    plt.scatter(y[:, 0], y[:, 1], c=kmeans.which_cluster.reshape(-1,1), s=50, cmap='viridis')
Example #29
    def initialization_layers_train(self, train_path, test_path, dataset,
                                    no_of_layers, filter_sizes):
        global n1, n2
        # y_pred_cl=tf.get_variable("y_pred_cl",0)

        x, x_image, y_true, y_true_cls = self.load_main(train_path, dataset, 0)
        self.load_main(test_path, dataset, 1)
        # print(len(x_image))
        # y_true_cls=tf.Variable(y_true_cls)
        if dataset == "Fashion-MNIST":
            layer_conv1, weights_conv1 = self.new_conv_layer(
                input=x_image,
                num_input_channels=1,
                filter_size=filter_sizes[0],
                num_filters=64,
                name="conv1")

        elif dataset == "CIFAR-10":
            layer_conv1, weights_conv1 = self.new_conv_layer(
                input=x_image,
                num_input_channels=3,
                filter_size=filter_sizes[0],
                num_filters=64,
                name="conv1")

        layer_pool1 = self.new_pool_layer(layer_conv1, name="pool1")
        layer_pool1 = tf.nn.local_response_normalization(layer_pool1)
        layer_relu1 = self.new_relu_layer(layer_pool1, name="relu1")

        layer_pools = []
        layer_relus = []
        layer_convs = []
        weight_convs = []

        layer_convs.append(layer_conv1)
        layer_pools.append(layer_pool1)
        layer_relus.append(layer_relu1)
        weight_convs.append(weights_conv1)
        n2 = 5
        for k1 in range(1, no_of_layers):
            namee = "conv" + str(k1 + 1)
            layer_conv1, weights_conv1 = self.new_conv_layer(
                input=layer_relus[k1 - 1],
                num_input_channels=64,
                filter_size=filter_sizes[k1],
                num_filters=64,
                name=namee)
            name2 = "pool" + str(k1 + 1)
            name1 = "relu" + str(k1 + 1)

            layer_pool1 = self.new_pool_layer(layer_conv1, name=name2)
            layer_pool1 = tf.nn.local_response_normalization(layer_pool1)
            layer_relu1 = self.new_relu_layer(layer_pool1, name=name1)

            layer_convs.append(layer_conv1)
            layer_pools.append(layer_pool1)
            layer_relus.append(layer_relu1)
            weight_convs.append(weights_conv1)
            n2 = 10

        num_features = layer_relu1.get_shape()[1:4].num_elements()
        layer_flat = tf.reshape(layer_relu1, [-1, num_features])

        layer_fc1 = self.new_fc_layer(layer_flat,
                                      num_inputs=num_features,
                                      num_outputs=512,
                                      name="fc1")
        layer_relu4 = self.new_relu_layer(layer_fc1,
                                          name="relu" + str(no_of_layers + 1))

        layer_fc3 = self.new_fc_layer(layer_relu4,
                                      num_inputs=512,
                                      num_outputs=192,
                                      name="fc3")
        layer_relu3 = self.new_relu_layer(layer_fc3,
                                          name="relu" + str(no_of_layers + 2))

        layer_fc2 = self.new_fc_layer(input=layer_relu3,
                                      num_inputs=192,
                                      num_outputs=10,
                                      name="fc2")

        with tf.variable_scope("Softmax"):
            y_pred = (tf.nn.softmax(layer_fc2))

            y_pred_cls = tf.argmax(y_pred, dimension=1)

            # y_pred_cl=y_pred_cls

        with tf.name_scope("cross_ent"):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=layer_fc2, labels=y_true)
            cost = tf.reduce_mean(cross_entropy)

        with tf.name_scope("optimizer"):
            optimizer = tf.train.AdamOptimizer(
                learning_rate=1e-3).minimize(cost)

        with tf.variable_scope("accuracy"):
            # print("-----------")
            # print(y_pred_cls)
            correct_prediction = tf.equal(y_pred_cls, y_true_cls)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        num_epochs = 20
        batch_size = 100

        trainset = self.trainset
        testset = self.testset
        trainlabel = self.trainlabel
        testlabel = self.testlabel
        f = open("CF_allembeddings.txt", "w")
        tsne = TSNEVisualizer()
        with tf.Session() as sess:
            global train_acc
            global f1_mi
            global f1_ma
            train_acc = []
            f1_mi = []
            f1_ma = []
            sess.run(tf.global_variables_initializer())
            trainset1 = self.trainset
            trainlabels11 = self.train_labels
            #                print(len(trainset1))
            trainlabel1 = self.trainlabel
            for i in range(0, 4):
                f.write("\n\n                   Percentage of traindata ::: ")
                f.write(str((1 + i) * 10))
                f.write("\n")
                # indx  = np.arange(trainlabel1.shape[0])
                # np.random.shuffle(indx)
                # trainset,trainlabel = trainset1[indx], trainlabel1[indx]
                # trainset1, trainlabel1=trainset1[indx], trainlabel1[indx]
                # trainlabels1=trainlabels11[indx]
                n = len(trainset1)
                n = int(n * (i + 1) / 10)
                # print(n)
                trainset, trainlabel = trainset1, trainlabel1
                trainlabels1 = trainlabels11
                testset, testlabel = trainset[n:, :], trainlabel[n:, :]
                testlabels1 = trainlabels11[n:, :]
                # print(len(testlabel))
                trainset, trainlabel = trainset[0:n, :], trainlabel[0:n, :]
                trainlabels1 = trainlabels1[0:n, :]
                print("----------------------------------------------")
                # print(len(trainlabel1),len(trainset),len(testset))
                #                   testset,testlabel =trainset1[n:,:],trainlabel1[n:,:]
                # print(trainlabel)
                # writer.add_graph(sess.graph)
                # print("testlabel:",str(testlabels1[4]))
                # print("testsset",str(testset[4]))
                # print("testlabel",str(testlabel[4]))
                # print("set",str(trainset1[n+4]))
                # print("lab",str(trainlabel1[n+4]))
                # break
                for epoch in range(num_epochs):
                    start_time = time.time()
                    train_accuracy = 0
                    k = 0
                    f1_macro = 0
                    f1_micro = 0
                    a1 = 0
                    # batch_size=int(len(trainlabel))

                    for batch in range(0, int(len(trainlabel) / batch_size)):
                        # print("--")
                        x_batch = trainset[k:k + batch_size]
                        y_true_batch = trainlabel[k:k + batch_size]
                        k = k + batch_size
                        feed_dict_train = {x: x_batch, y_true: y_true_batch}
                        sess.run(optimizer, feed_dict=feed_dict_train)
                        a, y_p, y, t = sess.run(
                            [accuracy, y_pred_cls, y_true_cls, layer_relu3],
                            feed_dict=feed_dict_train)
                        a1 += a


#                                 a1+=a
                    print("acc :", str(epoch), "---",
                          str(a1 / int(len(trainlabel) / batch_size)))

                vali_accuracy, l = sess.run([accuracy, layer_relu3],
                                            feed_dict={
                                                x: testset,
                                                y_true: testlabel
                                            })
                # print(np.array(l))
                kmeans = KMeans(n_clusters=10, random_state=0).fit(np.array(l))
                # yy=kmeans.predict(np.array(l))
                # plt.scatter()
                labels = kmeans.labels_
                # print(labels)
                f.write("\n\nEMBEDDINGS ARE : \n\n")
                f.write(str(np.array(l)))
                f.write("\n\n")
                mydict = {
                    i: np.where(kmeans.labels_ == i)[0]
                    for i in range(kmeans.n_clusters)
                }
                # print(mydict)
                # print(len(mydict[9]))

                # print(len(mydict[8]))

                # print(len(mydict[7]))

                # print(len(mydict[6]))
                dictlist = []
                #print(mydict)
                mydict1 = {}
                for key in mydict:
                    mydict1[key] = list(mydict[key])
                # print(len(mydict[0]))

                cluster_label = {}
                acc = 0.0
                #                print(testlabels1[0][0])
                lab = [i for i in range(10)]
                while (len(lab) != 0):
                    #iidx=max(mydict.items(), key=operator.itemgetter(1))[0]  #idx is the cluster no.
                    iidx = max(mydict1, key=mydict1.get)
                    cluster_label[iidx] = 0
                    keys1 = mydict1[iidx]
                    #                    print(keys1)
                    dict1 = {}
                    print(keys1)
                    for i in (keys1):
                        if testlabels1[i][0] not in dict1:
                            dict1[testlabels1[i][0]] = 0

                        dict1[testlabels1[i][0]] += 1
                    idx1 = max(
                        dict1.items(), key=operator.itemgetter(1)
                    )[0]  # idx1 is the most common label among samples in cluster iidx
                    while (idx1 not in lab):
                        del dict1[idx1]
                        idx1 = max(dict1.items(),
                                   key=operator.itemgetter(1))[0]
                    # print(dict1[idx1])
                    # print(dict1)
                    #print(cluster_label)
                    acc += (dict1[idx1])
                    cluster_label[iidx] = idx1
                    # print(idx1)
                    lab.remove(idx1)
                    del mydict1[iidx]
                print("Acc: ")
                print(acc)
                print(acc / len(testset))
                f.write("\nACCURACY AFTER CLUSTERING IS:   ")
                f.write(str(acc))
                f.write("\n\n")
Example #30
newdf_countvectorizer = vectorizer.fit_transform(newdf['newPreprocessed']) # The text has to be cleaned first.
newdf_countvectorizer.shape
print(vectorizer.get_feature_names())
print(len(vectorizer.get_feature_names()))

"""**Display TSNE**"""

from yellowbrick.text import TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer

data = newdf['newPreprocessed']
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(data)
labels = newdf['feedback']

tsne = TSNEVisualizer()
tsne.fit_transform(docs, labels)
tsne.poof() 
# show the distribution of negative and positive reviews

newdf.drop(['reviews.text'], axis=1, inplace=True) 
reviews = pd.DataFrame(newdf_countvectorizer.toarray())
newdf.head(1)

"""**Set Feature X and Target Y**"""

newdf.reset_index(drop=True, inplace=True)
newdf = pd.concat([newdf, reviews], axis=1)
X = newdf.drop(['reviews.rating','feedback','preprocessed','preprocessedStr','preprocessedStr','newPreprocessed','keepAdj','posTag'],axis=1)
y = newdf['feedback']
Example #31
    # Load the data from the files in the corpus
    for cat in categories:
        for name in os.listdir(os.path.join(path, cat)):
            files.append(os.path.join(path, cat, name))
            target.append(cat)

            with open(os.path.join(path, cat, name), 'r') as f:
                data.append(f.read())

    # Return the data bunch for use similar to the newsgroups example
    return Bunch(
        categories=categories,
        files=files,
        data=data,
        target=target,
    )


# Load the data and create document vectors
corpus = load_corpus('hobbies')
tfidf = TfidfVectorizer()

docs = tfidf.fit_transform(corpus.data)
labels = corpus.target

# Create the visualizer and draw the vectors
tsne = TSNEVisualizer()
tsne.fit(docs, labels)
tsne.poof()