Example #1
def plot(X_transform, model, figsize=(15, 6)):
    """
	Funcao responsavel por plotar um grafico dispersao para avaliar como estar a distribuicao
	dos dados apos treinado.

	----------
	parameters:
		X_transform: Dados transformado em versao numerica
		model: instancia do modelo kmeans apos o treino
		figsize: tamanho do grafico
	"""

    # Usando o SVD para diminuir a dimencionalidade dos dados para 2 dimenções.
    svd = SVD(n_components=2, random_state=0)
    vectorizer_2D = svd.fit_transform(X_transform)

    plt.figure(figsize=figsize)
    plt.scatter(vectorizer_2D[:, 0], vectorizer_2D[:, 1], c=model.labels_)
    # Project the centroids with the same SVD so they share the 2-D space of the points.
    centers_2D = svd.transform(model.cluster_centers_)
    plt.scatter(centers_2D[:, 0],
                centers_2D[:, 1],
                s=200,
                color='black',
                label='Centroids')
    plt.title('Cluster of Tweets')
    plt.legend()
    plt.show()
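A minimal usage sketch for this helper (not from the original source): it assumes SVD is sklearn's TruncatedSVD as in the rest of these examples, and the tweet list, vectorizer, and cluster count below are illustrative.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

tweets = ["great match today", "the match was boring",
          "loved the new phone", "the phone battery is bad"]

# TF-IDF representation of the raw text
X_transform = TfidfVectorizer().fit_transform(tweets)

# Fit k-means on the sparse matrix, then visualize with the helper above
model = KMeans(n_clusters=2, random_state=0, n_init=10).fit(X_transform)
plot(X_transform, model, figsize=(10, 4))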
Example #2
 def infer_topics(self, num_topics=10, **kwargs):
     self.nb_topics = num_topics
     nmf = SVD(n_components=num_topics)
     topic_document = nmf.fit_transform(self.corpus.sklearn_vector_space)
     self.topic_word_matrix = []
     self.document_topic_matrix = []
     vocabulary_size = len(self.corpus.vocabulary)
     row = []
     col = []
     data = []
     for topic_idx, topic in enumerate(nmf.components_):
         for i in range(vocabulary_size):
             row.append(topic_idx)
             col.append(i)
             data.append(topic[i])
     self.topic_word_matrix = coo_matrix((data, (row, col)),
                                         shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
     row = []
     col = []
     data = []
     doc_count = 0
     for doc in topic_document:
         topic_count = 0
         for topic_weight in doc:
             row.append(doc_count)
             col.append(topic_count)
             data.append(topic_weight)
             topic_count += 1
         doc_count += 1
     self.document_topic_matrix = coo_matrix((data, (row, col)),
                                             shape=(self.corpus.size, self.nb_topics)).tocsr()
Example #3
def main():
	if len(sys.argv) < 2:
		print('Expected arguments are not provided.')
		return
	actorid = int(sys.argv[1])
	imdb_actor_info = util.read_imdb_actor_info()
	input_actor_name = imdb_actor_info[imdb_actor_info['id'] == actorid]['name'].values[0]

	tf_idf_matrix = util.get_tf_idf_matrix()
	#print(tf_idf_matrix)
	actor_tf_idf = tf_idf_matrix.loc[actorid]
	#print(actor_tf_idf)

	svd = SVD(n_components=no_of_components)
	svd.fit(tf_idf_matrix)
	svd_df = pd.DataFrame(svd.transform(tf_idf_matrix), index=tf_idf_matrix.index)

	input_actor_row = svd_df.loc[actorid]

	actors = []
	for index, row in svd_df.iterrows():
		name = imdb_actor_info[imdb_actor_info['id'] == index]['name'].values[0]
		actors.append((index, name, 1 - cosine(row, input_actor_row)))
	other_actors = list(filter(lambda tup: tup[0] != actorid, actors))
	other_actors.sort(key=lambda tup: tup[2], reverse=True)
	util.print_output(actorid, input_actor_name, other_actors[:no_of_actors])
	util.write_output_file(actorid, input_actor_name, other_actors[:no_of_actors], output_file)
Example #4
    def test_grid_search_model(self):
        X, y = make_classification(random_state=42)
        param_grid = [{
            'pca': [PCA(2)],
            'lr__fit_intercept': [False, True]
        }, {
            'pca': [SVD(2)],
            'lr__fit_intercept': [False, True]
        }]
        pipe = Pipeline([('pca', 'passthrough'), ('lr', LogisticRegression())])
        grid0 = GridSearchCV(pipe, param_grid, error_score='raise')
        grid0.fit(X, y)

        pipe = PipelineCache([('pca', 'passthrough'),
                              ('lr', LogisticRegression())], 'cache__3')
        grid = GridSearchCV(pipe, param_grid, error_score='raise')

        grid.fit(X, y)
        cache = MLCache.get_cache('cache__3')
        # 0.22 increases the number of cached results
        self.assertIn(len(cache), (7, 11))
        key = list(cache.keys())[0]
        self.assertIn("[('X',", key)
        self.assertIn("('copy', 'True')", key)
        MLCache.remove_cache('cache__3')
        self.assertEqual(grid0.best_params_, grid.best_params_)
Example #5
def get_lsa():
    print("lsa")
    train = get_tfidf("lr")
    lsa = SVD(n_components=400)
    train = lsa.fit_transform(train)

    return train
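get_tfidf is defined elsewhere; a self-contained sketch of the same LSA step over a toy corpus (documents and component count below are illustrative) could look like this:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD as SVD

docs = ["the cat sat on the mat",
        "dogs and cats play outside",
        "the dog barked at the cat"]

train = TfidfVectorizer().fit_transform(docs)

# n_components must be smaller than the number of TF-IDF features
lsa = SVD(n_components=2, random_state=0)
train = lsa.fit_transform(train)
print(train.shape)  # (3, 2)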
Example #6
def run_SVD(X, y, title):

    dims = list(np.arange(1, X.shape[1]))
    svd = SVD(random_state=10)

    # A single fit with the maximum number of components already yields the
    # explained-variance ratio of every component.
    svd.set_params(n_components=dims[-1])
    svd.fit(X)
    ev = svd.explained_variance_ratio_
    cum_var = np.cumsum(ev)

    fig, ax1 = plt.subplots()
    ax1.plot(dims, cum_var, 'b-')
    ax1.set_xlabel('Principal Components')
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('Cumulative Explained Variance Ratio', color='b')
    ax1.tick_params('y', colors='b')
    plt.grid(False)

    ax2 = ax1.twinx()
    ax2.plot(dims, ev, 'm-')
    ax2.set_ylabel('Explained Variance Ratio', color='m')
    ax2.tick_params('y', colors='m')
    plt.grid(False)

    plt.title("SVD: "+ title)
    fig.tight_layout()
    plt.show()
Example #7
def main():
    if len(sys.argv) < 2:
        print('Expected arguments are not provided.')
        return
    movieid = int(sys.argv[1])
    mlmovies = util.read_mlmovies()
    movie_actors = util.read_movie_actor()
    imdb_actor_info = util.read_imdb_actor_info()

    input_movie = mlmovies[mlmovies['movieid'] ==
                           movieid]['moviename'].values[0]
    actors_of_movie = movie_actors.where(
        movie_actors['movieid'] == movieid).dropna().loc[:,
                                                         'actorid'].unique()
    #print (actors_of_movie)

    movie_matrix = util.get_movie_tf_idf_matrix()
    actor_matrix = util.get_actor_tf_idf_matrix()
    #print(actor_matrix.shape)
    input_movie_vector = pd.DataFrame(movie_matrix.loc[movieid])  #.transpose()
    #print(input_movie_vector.shape)
    #similarity_matrix = actor_matrix.dot(input_movie_vector)
    #similarity_matrix = similarity_matrix[~similarity_matrix.index.isin(actors_of_movie)]
    #print(similarity_matrix)

    svd = SVD(n_components=no_of_components)
    svd.fit(movie_matrix)
    svd_movie_df = pd.DataFrame(svd.transform(movie_matrix),
                                index=movie_matrix.index)

    input_movie_vector = pd.DataFrame(svd_movie_df.loc[movieid])
    #print(input_movie_vector)

    df_components = pd.DataFrame(svd.components_,
                                 columns=actor_matrix.columns).transpose()
    #print(df_components.shape)
    #print(df_components)
    #print(actor_matrix.shape)
    projected_matrix = actor_matrix.dot(df_components)
    #print(projected_matrix)

    similarity_matrix = projected_matrix.dot(input_movie_vector)
    similarity_matrix = similarity_matrix[~similarity_matrix.index.
                                          isin(actors_of_movie)]
    #print(similarity_matrix)

    actors = []
    for index, row in similarity_matrix.iterrows():
        actor_name = imdb_actor_info[imdb_actor_info['id'] ==
                                     index]['name'].values[0]
        actors.append(
            (index, actor_name, similarity_matrix.loc[index][movieid]))
    actors.sort(key=lambda tup: tup[2], reverse=True)
    #print (actors)

    util.print_output(movieid, input_movie, actors[:no_of_actors])
    util.write_output_file(movieid, input_movie, actors[:no_of_actors],
                           output_file)
Example #8
 def svd(table, k):
     """
     Decompose table into matrices U . S . V = table.
     :param DataFrame table: table to decompose.
     :param int k: number of components to get from decomposition.
     :return DataFrame reduced matrix.
     """
     matrix = SVD(n_components=k)
     out = matrix.fit_transform(table)
     return out, matrix.components_
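A hedged usage sketch for this helper (not from the original source), assuming it is callable as a plain function and letting a small random DataFrame stand in for the real table:

import numpy as np
import pandas as pd

# 6 rows, 5 feature columns of toy data
table = pd.DataFrame(np.random.rand(6, 5),
                     columns=["f%d" % i for i in range(5)])

reduced, components = svd(table, k=2)
print(reduced.shape)     # (6, 2) -- rows projected onto 2 latent components
print(components.shape)  # (2, 5) -- latent components in the original feature space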
Example #9
def decompose_into_topics(document_concept_matrix,
                          k_estimator=mdu.estimate_k_singular,
                          l1_ratio=1,
                          alpha=1,
                          sparsity_scale=0.15,
                          decompo='NMF',
                          freq_threshold=0.85,
                          normalize=True,
                          mink=4):

    # We ignore concepts that are present in too many documents in this matrix,
    # or not present at all. Presence is binary.
    binz = np.zeros_like(document_concept_matrix)
    binz[document_concept_matrix > 0] = 1
    freqs = binz.sum(axis=0)
    num_docs = document_concept_matrix.shape[0]
    bad_cpts = np.nonzero(freqs > freq_threshold * num_docs)[0].tolist()
    bad_cpts += np.nonzero(freqs <= 0)[0].tolist()
    good_cpts = range(document_concept_matrix.shape[1])
    good_cpts = [c for c in good_cpts if (c not in bad_cpts)]

    m_clean = document_concept_matrix[:, good_cpts]

    if isinstance(k_estimator, int):
        k = k_estimator
    else:
        k, gaps, deltas = k_estimator(m_clean, exp=10)
    k = max([k, np.ceil(num_docs / 100), mink])
    print("\tk=" + str(k), end=" ")
    if decompo == 'NMF':
        model = NMF(n_components=int(k),
                    init='random',
                    random_state=0,
                    l1_ratio=l1_ratio,
                    alpha=alpha)
    else:
        model = SVD(n_components=int(k))

    # print("\tC:", m_clean.shape)
    tdm = model.fit_transform(m_clean)
    topic_concept_matrix, threshold = mdu.remove_excess_nonzero(
        m_clean,
        model,
        good_cpts,
        numorigcpts=document_concept_matrix.shape[1],
        scale=sparsity_scale)
    tcm, tdm = mdu.remove_blank_topics(topic_concept_matrix, tdm)

    if normalize:
        rowsums = tcm.sum(axis=1)
        for top in range(tcm.shape[0]):
            tcm[top, :] = tcm[top, :] / rowsums[top]

    return tdm, tcm
Example #10
def main():
	err, input_movie_ids = util.parse_input(sys.argv)
	if err:
		return

	# *** Write your code here ***
	#process movie list to get matrix
	matrix = util.get_movie_matrix_from_hd5()
	#print(matrix)

	#perform SVD
	svd = SVD(n_components=no_of_components)
	svd.fit(matrix)
	svd_df = pd.DataFrame(svd.transform(matrix), index=matrix.index)

	input_movie_df = svd_df.loc[input_movie_ids]

	output_movies = []
	for index, movie in svd_df.iterrows():
		cosine_sum = 0
		order = 1
		for j, input_movie in input_movie_df.iterrows():
			cosine_sum += (1 - cosine(movie, input_movie))*order
			order -= order_factor
		output_movies.append((index, cosine_sum))
	other_movies = list(filter(lambda tup: tup[0] not in input_movie_ids, output_movies))
	other_movies.sort(key=lambda tup: tup[1], reverse=True)
	output_movie_ids = [t[0] for t in other_movies][:5]

	#print output and log them
	feedback = util.process_output(input_movie_ids, output_movie_ids, output_file)

	#process feedback to get relevant movies and movies to be excluded
	relevant_movies, movie_to_exclude = util.process_feedback(feedback, input_movie_ids)

	relevant_movie_count = len(relevant_movies)
	#if all recommended movies are relevant then return
	if relevant_movie_count==5:
		print "\nAll the movies were relevant hence no modification to the suggestion"
		return

	#fetch data frames for relevant and feedback movies	
	relevant_movies_df = svd_df.loc[relevant_movies]
	feedback_movies_df = svd_df.loc[list(feedback.keys())]

	modified_query = util.probabilistic_feedback_query(feedback_movies_df, relevant_movies_df, svd_df.index, relevant_movie_count)

	revised_movie_ids = util.get_revised_movies(svd_df, modified_query, movie_to_exclude)

	util.print_revised(revised_movie_ids, output_file)
Example #11
def main():
    store = pd.HDFStore('../task1/movie_final.h5')
    movie_matrix = store['df']
    print(movie_matrix)

    svd = SVD(n_components=500)
    svd.fit(movie_matrix)
    movie_matrix_svd = pd.DataFrame(svd.transform(movie_matrix),
                                    index=movie_matrix.index)

    print(movie_matrix.shape)
    print(movie_matrix_svd.shape)

    movie_matrix_svd.to_pickle("movie_matrix_svd.pkl")
Example #12
def main():
    scores = load('scores', {})

    PREPROCESSING = {
        'raw': lambda: get_featureset(dataset, tf_idf=False),
        'tfidf': lambda: get_featureset(dataset),
        'nostem': lambda: get_featureset(dataset, stem=False),
        'svd50': lambda: SVD(50).fit_transform(get_featureset(dataset)),
        'svd100': lambda: SVD(100).fit_transform(get_featureset(dataset)),
        'lda': lambda: get_lda(dataset, 100),
    }

    for dataset in DATASETS:
        for method in PREPROCESSING:
            # name of feature set
            name = method + ':' + dataset

            # check if we calculated this score before
            if name in scores:
                continue

            # LDA features are only available for some datasets
            if method == 'lda':
                if dataset not in LDA_DATASETS:
                    continue

            print(name)

            # get the data
            data = PREPROCESSING[method]()

            # train model
            scores[name] = get_scores(data)

            # save it
            save('scores', scores)
Example #13
def do_svd(matrix, input_movie_ids):
    svd = SVD(n_components=no_of_components)
    svd.fit(matrix)
    svd_df = pd.DataFrame(svd.transform(matrix), index=matrix.index)

    input_movie_df = svd_df.loc[input_movie_ids]

    output_movies = {}
    for index, movie in svd_df.iterrows():
        cosine_sum = 0
        order = 1
        for j, input_movie in input_movie_df.iterrows():
            cosine_sum += (1 - cosine(movie, input_movie)) * order
            order -= order_factor
        output_movies[index] = cosine_sum
    return output_movies, svd_df
Example #14
    def pca(self, data, labels):
        #Calculate Mean
        mean = data.mean(axis=0)

        #Subtract mean from data
        data = np.subtract(data, mean)

        #Calculate Covariance
        cov = np.cov(data.T)

        #Calculate Eigen Values and Eigen Vectors
        evals, evecs = np.linalg.eig(cov)

        #Get Sorted Tuples
        tuples = self.sort(evals, evecs)
        # Choosing top 2 eigen vectors
        tuples = tuples[0:2]

        #Get the projection matrix
        projection_matrix = np.zeros((2, self.cols))
        count = 0
        for i in tuples:
            for k, v in i:
                for j in range(0, self.cols):
                    projection_matrix[count][j] = v[j]
            count += 1
        projection_matrix = projection_matrix.T

        #Get new_data
        pca_data = data.dot(projection_matrix)
        self.plot(pca_data, labels, self.labels, self.name, "PCA")

        #Using package TruncatedSVD
        svd = SVD(n_components=2)
        svd_data = svd.fit_transform(data)
        self.plot(svd_data, labels, self.labels, self.name, "SVD")

        tsne = TSNE(n_components=2, init="pca", learning_rate=100)
        tsne_data = tsne.fit_transform(data)
        self.plot(tsne_data, labels, self.labels, self.name, "tSNE")
Example #15
    def svd1(table, k):
        """
        Decompose table into matrices U . S . V = table.
        :param DataFrame table: table to decompose.
        :param int k: number of components to get from decomposition.
        :return DataFrame reduced matrix.
        """
        indexes = table.index

        matrix = SVD(n_components=k)
        out = matrix.fit_transform(table)
        temp = np.array(matrix.components_, dtype=float)
        for l, latent in enumerate(temp):
            print("LATENT SEMANTICS " + str(l))
            for j, comp in enumerate(latent):
                print(str(j) + "\t" + str(comp))

        return DataFrame(data=out, index=indexes, columns=range(k))
Example #16
if not os.path.exists(args.Output_Folder):
    os.makedirs(args.Output_Folder)
hasil = open('{}/{}'.format(args.Output_Folder, args.Output_File), 'w')
n_topics = eval(args.n_topics)
with open(args.Clean_Tweets_Location, 'rb') as handle:
    tfidf, tfidf_terms = pickle.load(handle)
komponen = 5
if dimred == "grp":
    dr = GRP(n_components=komponen, random_state=11)
elif dimred == 'srp':
    if UseKernel:
        dr = SRP(n_components=komponen, random_state=11)
    else:
        dr = SRP(n_components=komponen, random_state=11, dense_output=True)
elif dimred == 'svd':
    dr = SVD(n_components=komponen, random_state=11)
if dimred in ['lda', 'nmf']:
    if dimred == 'lda':
        dr = LDA(n_components=n_topics, random_state=11).fit(tfidf)
    else:
        dr = NMF(n_components=n_topics, random_state=11).fit(tfidf)
    cntr = dr.components_
else:
    tfile = open(
        "Time/{}".format(dimred.upper()) + UseKernel * "-Kernel" +
        "/{}".format(time_file), 'a+')
    timer = time()
    data = dr.fit_transform(tfidf)
    tfile.write("{}\n".format(time() - timer))
    tfile.close()
    cntr, u, d = huft.fcmeans(data.T,
Example #17
def svd(train, test, n_components):
    svd = SVD(n_components=n_components)
    new_train = svd.fit_transform(X=train)
    new_test = svd.transform(X=test)
    return new_train, new_test
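A short usage sketch (illustrative shapes, not from the original source); the key detail is that the test split is only transformed with the SVD fitted on the training split:

import numpy as np

rng = np.random.default_rng(0)
train = rng.random((100, 20))
test = rng.random((25, 20))

new_train, new_test = svd(train, test, n_components=5)
print(new_train.shape, new_test.shape)  # (100, 5) (25, 5)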
Example #18
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD as SVD
import sklearn.model_selection as k
r = pd.read_csv('bank_contacts.csv')
x = r.drop('credit_application', axis=1)
y = r['credit_application']
train_x, test_x, train_y, test_y = k.train_test_split(x,
                                                      y,
                                                      test_size=0.2,
                                                      random_state=42)
sc = StandardScaler()
train_x = sc.fit_transform(train_x)
test_x = sc.transform(test_x)
svd = SVD(n_components=3, random_state=42)
train_x = svd.fit_transform(train_x, train_y)
test_x = svd.transform(test_x)
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(max_depth=2, random_state=0)

classifier.fit(train_x, train_y)
pred = classifier.predict(test_x)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

print(confusion_matrix(test_y, pred))
print('Accuracy:', accuracy_score(test_y, pred))
plt.scatter(pred, test_x[:, 0], marker='o')
plt.scatter(pred, test_x[:, 1], marker='o')
Example #19
    def task6(self, *args):
        """
        Command:\ttask6 <k>
        Description:\tCreates a location - location similarity matrix, performs SVD, \
        and reports the top k latent semantics.
        Arguments:
        \tK - Number of latent semantics to identify.
        """
        if len(args) < 1:
            print("[ERROR] Not enough args were provided. Expected 1 but got " + str(len(args)))
            print("\targs = " + str(args))
            return
        if len(args) > 1:
            print("[ERROR] Too many arguments were provided. Expected 1 but got " + str(len(args)))
            print("\targs = " + str(args))
            return
        if not self.__database__:
            print("[ERROR] The Database must be loaded before this can be run.")
            print("\tCommand: load <filepath>")
            return

        try:
            k = int(args[0])
        except ValueError:
            print("[ERROR] One or more arguments could not be parsed: " + str(args))
            return

        # First get all the location tables containing all the visual descriptors.

        # IMP NOTE: Remember to change all the static values after the test data arrives.
        all_location_tables = dict()
        for id in range(1,36):
            all_location_tables[id] = Database.get_vis_table(self.__database__,locationid=id)
        # Start finding the similarity between each pair of locations and store the results into a dictionary.
        if isfile('./all_similarities.npy'):
            all_similarities = np.load('all_similarities.npy').item()
        else:
            all_similarities = dict()
            for i in range(1,36):
                for j in range(i,36):
                    cos_sim = cosine_similarity(all_location_tables[i], all_location_tables[j])
                    all_similarities[(i, j)] = Scoring.score_matrix(cos_sim)
                    all_similarities[(j, i)] = all_similarities[(i, j)]
            np.save('all_similarities.npy', all_similarities)

        similarity_matrix = df(index=range(1,36),columns=range(1,36))
        for i in range(1,36):
            sim_list = list()
            for j in range(1,36):
                sim_list.append(all_similarities[(i, j)])
            similarity_matrix.loc[i] = sim_list
        similarity_matrix.to_csv('Task6_SimilarityMatrix.csv')
        print("Location-Location Similarity matrix created...")

        reduced = SVD(n_components=k)
        # reduced_table = reduced.fit_transform(similarity_matrix)
        reduced.fit(similarity_matrix)
        VTranspose = df(data=reduced.components_, index=range(1,k+1),columns=range(1,36))
        VTranspose.to_csv('task6transposetable.csv')

        filename = 'devset_topics.xml'
        tree = ET.parse(filename)
        root = tree.getroot()
        location_name = dict()
        location_dict = dict()
        i = 0

        while i < len(root):
            v1 = int(root[i][0].text)
            v2 = root[i][1].text
            location_name[v1] = v2
            i += 1

        print("Top k latent semantics in form of their location-weight pair are:")
        for _, row in VTranspose.iterrows():
            for j in range(1, 36):
                loc = location_name[j]
                location_dict[loc] = row[j]
            sorted_location_dict = sorted(location_dict.items(), key=itemgetter(1), reverse=True)

            # print("latent semantic " + str(index) + " : " + str(sorted_location_dict))
            print(f"LATENT SEMANTIC {i}")
            for key, value in sorted_location_dict:
                print(f"{key} : {value}")
Example #20
 def svd2(table, k):
     indexes = table.index
     matrix = SVD(n_components=k)
     out = matrix.fit_transform(table)
     return DataFrame(data=out, index=indexes, columns=range(k))
Example #21
        dataY = data.iloc[:, 41]
        features = list(dataX.columns.values)
        dataset = "QSAR"
        k = 10
        k_em = 10
        comp = 39
    else:
        dataX = data.iloc[:, :20]
        dataY = data.iloc[:, 20]
        features = list(dataX.columns.values)
        dataset = "Voice"
        k = 10
        k_em = 15
        comp = 19
    print("Running SVD for {}...".format(datst))
    svd = SVD(n_components=comp, random_state=5)
    dataX_SVD = svd.fit_transform(dataX)

    #kmeans

    model = KMeans(n_clusters=k)
    labels = model.fit_predict(dataX_SVD)
    model = KMeans(n_clusters=k)
    labels_KM = model.fit_predict(dataX)

    accuracy = cluster_acc(dataY, labels)
    print("\nAccuracy for k-means on", dataset, "is", accuracy)
    accuracy_clusters = cluster_acc(labels_KM, labels)
    print("\nCluster alignment for k-means is", accuracy_clusters)

    #EM
Example #22

cv = sklearn.feature_extraction.text.CountVectorizer()
m = cv.fit_transform(['this is a document', 'this is a second document', 'third document document document']).todense()
print(cv.vocabulary_)
print(m)

count_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
td_count_matrix = count_vectorizer.fit_transform(df['tags'])

td_count_matrix.shape

list(count_vectorizer.vocabulary_.items())[:10]

from sklearn.decomposition import TruncatedSVD as SVD
svd = SVD(10)
svd.fit(td_count_matrix)

for i in range(len(svd.components_)):
    topk = sorted(zip(svd.components_[i], count_vectorizer.get_feature_names()), reverse=True)[:10]
    print(' + '.join('{:.3f} * {}'.format(v, word) for v, word in topk))

# vectorize the documents
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
td_count_matrix = count_vectorizer.fit_transform(df['description'])
svd = SVD(10)
svd.fit(td_count_matrix)
#show just the words, not the scores this time
for i in range(len(svd.components_)):
    topk = sorted(zip(svd.components_[i], count_vectorizer.get_feature_names()), reverse=True)[:10]
    print(' | '.join(w for _, w in topk))
Example #23
    (trainData, testData, trainTarget, testTarget) = split
    model = LinearSVC()
    model.fit(trainData, trainTarget)
    baseline = metrics.accuracy_score(model.predict(testData), testTarget)
    print("Running RP...")
    accuracies = []
    for comp in comps:
        # create the random projection
        #sp = SparseRandomProjection(n_components = comp)
        #X = sp.fit_transform(trainData)
        #sp = PCA(n_components = comp, random_state=5)
        #X = sp.fit_transform(trainData)
        sp = SVD(n_components=comp)
        X = sp.fit_transform(trainData)
        # train a classifier on the sparse random projection
        model = LinearSVC()
        model.fit(X, trainTarget)
    
        # evaluate the model and update the list of accuracies
        test = sp.transform(testData)
        accuracies.append(metrics.accuracy_score(model.predict(test), testTarget))

    plt.figure()
    plt.suptitle("Accuracy of Sparse Projection on {}".format(dataset))
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    if dataset =="QSAR":
        plt.xlim([2, 40])
Example #24
        try:
            with open(fname) as pearl:
                text = pearl.read()
                token_dict[f] = re.sub("[^A-Za-z]", " ", text)
        except UnicodeDecodeError as e:
            with open(fname, encoding="utf8") as pearl:
                text = pearl.read()
                token_dict[f] = re.sub("[^A-Za-z]", " ", text)

stopwords = stopwords.words("english")
add = ['search', 'engine', 'web', 'internet']
stopwords.extend(add)
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stopwords)
tfs = tfidf.fit_transform(token_dict.values())

lsa = SVD(n_components=4, n_iter=100)
doc_top = lsa.fit_transform(tfs)
doc_top = Normalizer(copy=False).fit_transform(doc_top)
terms = tfidf.get_feature_names()
for i, comp in enumerate(lsa.components_):
    termsInComp = zip(terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:5]
    print("Topic %d:" % i)
    for term in sortedTerms:
        print(term[0])
    print(" ")

##import umap
##X_topics = lsa.fit_transform(tfs)
##embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics)
##
Example #25
def run_EM_dr(X_norm,y,title):
    range_n_clusters = [2,3,4,5,6]
    loss_n = []
    loss_pca = []
    loss_ica = []
    loss_srp = []
    loss_svd = []
    silhouette_avg0 = []
    silhouette_avg1 = []
    silhouette_avg2 = []
    silhouette_avg3 = []
    silhouette_avg4 = []
    homo0 = []
    homo1 = []
    homo2 = []
    homo3 = []
    homo4 = []
    comp0 = []
    comp1 = []
    comp2 = []
    comp3 = []
    comp4 = []
    
    for index, n_clusters in enumerate(range_n_clusters):
    
        X_pca = PCA(n_components=3,random_state=5).fit_transform(X_norm)
        X_ica = FastICA(n_components=2,random_state=5).fit_transform(X_norm)
        X_srp = SRP(n_components=4,random_state=5).fit_transform(X_norm)
        X_svd = SVD(n_components=4,random_state=5).fit_transform(X_norm)
        clusterer_n =  GaussianMixture(n_components=n_clusters, random_state=10).fit(X_norm)
        clusterer_pca =  GaussianMixture(n_components=n_clusters, random_state=10).fit(X_pca)
        clusterer_ica =  GaussianMixture(n_components=n_clusters, random_state=10).fit(X_ica)
        clusterer_srp =  GaussianMixture(n_components=n_clusters, random_state=10).fit(X_srp)
        clusterer_svd =  GaussianMixture(n_components=n_clusters, random_state=10).fit(X_svd)
        loss_n.append(clusterer_n.bic(X_norm))
        loss_pca.append(clusterer_pca.bic(X_pca))
        loss_ica.append(clusterer_ica.bic(X_ica))
        loss_srp.append(clusterer_srp.bic(X_srp))
        loss_svd.append(clusterer_svd.bic(X_svd))
        cluster0_labels = clusterer_n.predict(X_norm)
        cluster1_labels = clusterer_pca.predict(X_pca)
        cluster2_labels = clusterer_ica.predict(X_ica)
        cluster3_labels = clusterer_srp.predict(X_srp)
        cluster4_labels = clusterer_svd.predict(X_svd)
        silhouette_avg0.append(silhouette_score(X_norm, cluster0_labels))
        silhouette_avg1.append(silhouette_score(X_pca, cluster1_labels))
        silhouette_avg2.append(silhouette_score(X_ica, cluster2_labels))
        silhouette_avg3.append(silhouette_score(X_srp, cluster3_labels))
        silhouette_avg4.append(silhouette_score(X_svd, cluster4_labels))
        homo0.append(metrics.homogeneity_score(y, cluster0_labels))
        homo1.append(metrics.homogeneity_score(y, cluster1_labels))
        homo2.append(metrics.homogeneity_score(y, cluster2_labels))
        homo3.append(metrics.homogeneity_score(y, cluster3_labels))
        homo4.append(metrics.homogeneity_score(y, cluster4_labels))
        comp0.append(metrics.completeness_score(y, cluster0_labels))
        comp1.append(metrics.completeness_score(y, cluster1_labels))
        comp2.append(metrics.completeness_score(y, cluster2_labels))
        comp3.append(metrics.completeness_score(y, cluster3_labels))
        comp4.append(metrics.completeness_score(y, cluster4_labels))

    
    #BIC
    plt.plot(range_n_clusters, loss_n, label="without rd")
    plt.plot(range_n_clusters, loss_pca, label="pca")
    plt.plot(range_n_clusters, loss_ica, label="ica")
    plt.plot(range_n_clusters, loss_srp, label="srp")
    plt.plot(range_n_clusters, loss_svd, label="svd")
    plt.xlabel('number of cluster')
    plt.title('EM:BIC')
    plt.legend(loc="best")
    plt.show()
    
    #silhouette
    plt.plot(range_n_clusters, silhouette_avg0, label="without rd")
    plt.plot(range_n_clusters, silhouette_avg1, label="pca")
    plt.plot(range_n_clusters, silhouette_avg2, label="ica")
    plt.plot(range_n_clusters, silhouette_avg3, label="srp")
    plt.plot(range_n_clusters, silhouette_avg4, label="svd")
    plt.xlabel('number of cluster')
    plt.title('EM:silhouette')
    plt.legend(loc="best")
    plt.show()
    
    #homogeneity
    plt.plot(range_n_clusters, homo0, label="without rd")
    plt.plot(range_n_clusters, homo1, label="pca")
    plt.plot(range_n_clusters, homo2, label="ica")
    plt.plot(range_n_clusters, homo3, label="srp")
    plt.plot(range_n_clusters, homo4, label="svd")
    plt.xlabel('number of cluster')
    plt.title('EM:homogeneity')
    plt.legend(loc="best")
    plt.show()
    
    #completeness
    plt.plot(range_n_clusters, comp0, label="without rd")
    plt.plot(range_n_clusters, comp1, label="pca")
    plt.plot(range_n_clusters, comp2, label="ica")
    plt.plot(range_n_clusters, comp3, label="srp")
    plt.plot(range_n_clusters, comp4, label="svd")
    plt.xlabel('number of cluster')
    plt.title('EM:completeness')
    plt.legend(loc="best")
    plt.show()
Example #26
        gs.fit(labels_EM_RP.reshape(-1, 1), dataY)
        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(out + 'QSAR NN EM RP.csv')
        best_indices = tmp.index[tmp['rank_test_score'] == 1].tolist()
        best_em = best_em.append(
            {
                'Layers': str(tmp.iloc[best_indices[0], 4]),
                'Iterations': tmp.iloc[best_indices[0], 5],
                'Score': tmp.iloc[best_indices[0], 12]
            },
            ignore_index=True)

        # Fit/transform with SVD
        print("Running SVD...")
        svd = SVD(n_components=3, random_state=5)
        dataX_SVD = svd.fit_transform(dataX)

        # Run KM
        print("Running k-means...")
        model = KMeans(n_clusters=km)
        labels_KM_SVD = model.fit_predict(dataX_SVD)

        grid = {
            'NN__hidden_layer_sizes': nn_arch,
            'NN__max_iter': nn_iter,
            'NN__learning_rate_init': [0.016],
            'NN__alpha': [0.316227766]
        }
        mlp = MLPClassifier(activation='relu',
                            early_stopping=True,
Example #27
		print("{0}: {1}, with distance of {2}:".format(i, us_canada_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))


### collaborative filtering using matrix factorization ###
us_canada_user_rating_pivot2 = us_canada_user_rating.pivot(index = 'userID', columns = 'bookTitle', values = 'bookRating').fillna(0)
# print(us_canada_user_rating_pivot2.head())
# print(us_canada_user_rating_pivot2.shape)

X = us_canada_user_rating_pivot2.values.T

import sklearn
from sklearn.decomposition import TruncatedSVD as SVD

# use a lowercase name so the TruncatedSVD class is not shadowed
svd = SVD(n_components=12, random_state=17)
matrix = svd.fit_transform(X)
# print(matrix.shape)

import warnings
warnings.filterwarnings("ignore", category = RuntimeWarning)
corr = np.corrcoef(matrix)
# print(corr.shape)

us_canada_book_title = us_canada_user_rating_pivot2.columns
us_canada_book_list = list(us_canada_book_title)
coffey_hands = us_canada_book_list.index("The Green Mile: Coffey's Hands (Green Mile Series)")
# print(coffey_hands)