def to_vector(self, title_list):
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=self.MAX_DF)
    vectorizer.max_features = self.MAX_FEATURES
    vectorizer.fit(title_list)
    tf = vectorizer.transform(title_list)
    lsa = TruncatedSVD(self.LSA_DIM)
    lsa.fit(tf)
    tf = lsa.transform(tf)
    return tf, vectorizer, lsa
def get_keywords(docs, max_feature, stopwords=None):
    vectorizer = TfidfVectorizer(max_features=max_feature, min_df=3, stop_words=stopwords)
    try:
        vectorizer.fit(docs)
    except ValueError:
        # too few documents for min_df=3: relax the thresholds and retry
        vectorizer.min_df = 1
        vectorizer.max_features = 30
        vectorizer.fit(docs)
    return vectorizer.vocabulary_
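# A minimal usage sketch for get_keywords above; the document strings and the
# max_feature value are illustrative placeholders, not taken from the source.
docs = [
    "python pandas dataframe filtering",
    "pandas groupby aggregation example",
    "plotting a pandas dataframe with matplotlib",
]
keywords = get_keywords(docs, max_feature=20)
print(sorted(keywords))  # vocabulary terms retained by the vectorizer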
def to_vector(self, text_set, MAX_DF, MAX_FEATURES, LSA_DIM):
    '''Convert to a bag-of-words (TF-IDF) matrix and reduce its dimensionality.'''
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF, stop_words=stopwords)
    vectorizer.max_features = MAX_FEATURES
    X = vectorizer.fit_transform(text_set)
    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    return X, lsa, vectorizer
def transform_data(filename, MAX_DF=0.9, MAX_FEATURES=500, LSA_DIM=100):
    '''Read the CSV file, morphologically analyse each title with the MeCab
    template, reduce dimensionality with LSA and normalise.
    Returns the dataset and the title matrix.'''
    data = pd.read_csv(filename)
    title = []
    for i in data.index:
        title.append(data.loc[i, 'Title'])
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF, stop_words=stopwords)
    vectorizer.max_features = MAX_FEATURES
    X = vectorizer.fit_transform(title)
    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    X = Normalizer(copy=False).fit_transform(X)
    return data, X
def main(filename):
    # load tweets
    tweets = get_tweets_from_csv(filename)

    # feature extraction
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF)
    vectorizer.max_features = MAX_FEATURES
    X = vectorizer.fit_transform(tweets)

    # dimensionality reduction by LSA
    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    X = Normalizer(copy=False).fit_transform(X)

    # clustering by KMeans
    if MINIBATCH:
        km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, init='k-means++', batch_size=1000,
                             n_init=10, max_no_improvement=10, verbose=True)
    else:
        km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1, verbose=True)
    km.fit(X)
    labels = km.labels_
    transformed = km.transform(X)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]

    # sort by distance
    clusters = []
    for i in range(NUM_CLUSTERS):
        cluster = []
        ii = np.where(labels == i)[0]
        dd = dists[ii]
        di = np.vstack([dd, ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(tweets[int(j)])
        clusters.append(cluster)

    return clusters
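# Hypothetical driver for main() above, assuming the module defines
# get_tweets_from_csv and the constants MAX_DF, MAX_FEATURES, LSA_DIM,
# NUM_CLUSTERS and MINIBATCH elsewhere; 'tweets.csv' is a placeholder path.
if __name__ == '__main__':
    clusters = main('tweets.csv')
    for i, cluster in enumerate(clusters):
        # each cluster is already sorted by distance to its centroid
        for tweet in cluster[:5]:
            print('%d: %s' % (i, tweet))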
def handle(self, *args, **options):
    # tweets
    ret = Timeline.objects.all()[:100]
    tweets = [r.body for r in ret]

    # feature extraction
    vectorizer = TfidfVectorizer(analyzer=self.__analyzer, max_df=MAX_DF)
    vectorizer.max_features = MAX_FEATURES
    x = vectorizer.fit_transform(tweets)

    # dimensionality reduction by LSA
    lsa = TruncatedSVD(LSA_DIM)
    x = lsa.fit_transform(x)
    x = Normalizer(copy=False).fit_transform(x)

    # clustering by KMeans
    if MINIBATCH:
        km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, init='k-means++', batch_size=1000,
                             n_init=10, max_no_improvement=10)
    else:
        km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1)
    km.fit(x)
    labels = km.labels_
    transformed = km.transform(x)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]

    # sort by distance
    clusters = []
    for i in range(NUM_CLUSTERS):
        cluster = []
        ii = np.where(labels == i)[0]
        dd = dists[ii]
        di = np.vstack([dd, ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(tweets[int(j)])
        clusters.append(cluster)

    for i, cluster in enumerate(clusters):
        for c in cluster:
            print("%s: %s" % (i, c))
def main(filename):
    # load tweets
    tweets = get_tweets_from_csv(filename)
    # print tweets

    # feature extraction
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF)
    vectorizer.max_features = MAX_FEATURES
    X = vectorizer.fit_transform(tweets)

    # dimensionality reduction by LSA
    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    X = Normalizer(copy=False).fit_transform(X)

    # clustering by KMeans
    if MINIBATCH:
        km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, init='k-means++', batch_size=1000,
                             n_init=10, max_no_improvement=10, verbose=True)
    else:
        km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1, verbose=True)
    km.fit(X)
    labels = km.labels_
    transformed = km.transform(X)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]

    # sort by distance
    clusters = []
    for i in range(NUM_CLUSTERS):
        cluster = []
        ii = np.where(labels == i)[0]
        dd = dists[ii]
        di = np.vstack([dd, ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(tweets[int(j)])
        clusters.append(cluster)

    return clusters
# preprocess the sentences
train['sentence'] = preprocess(train)
test['sentence'] = preprocess(test)

# if there are enough training samples, even the label ratios out
if train.shape[0] > 1000:
    train = training_sample(train)

list_tokens = train['sentence'].apply(lambda x: x.split(' '))
test_tokens = test['sentence'].apply(lambda x: x.split(' '))

# if there are more than 1000 training samples, limit max_features to 1000,
# as otherwise it will exceed memory

# try tfidf
vectorizer = TfidfVectorizer()
if train.shape[0] > 1000:
    vectorizer.max_features = 1000
vectorizer.fit(train['sentence'])
selected_features = vectorizer.get_feature_names()

# try bow
# tokenizer = Tokenizer(num_words=1000, lower=True)
# tokenizer.fit_on_texts(train['sentence'].values)
# selected_features = list(tokenizer.word_index.keys())[:1000]

# w2v
model = Word2Vec(list_tokens, size=100, window=5, min_count=1)

# fast text
# model = FastText(size=100, window=3, min_count=1)
# model.build_vocab(sentences=list_tokens)

model.train(list_tokens, total_examples=len(list_tokens), epochs=30)
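# A hedged sketch of one common next step, not taken from the source: average
# the trained Word2Vec vectors of each token list to obtain fixed-length
# document features. Uses the gensim < 4.0 API implied by Word2Vec(..., size=100).
import numpy as np

def average_vector(tokens, model, dim=100):
    # mean of the word vectors for all in-vocabulary tokens
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

train_vectors = np.vstack([average_vector(t, model) for t in list_tokens])
test_vectors = np.vstack([average_vector(t, model) for t in test_tokens])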
X = dataset.iloc[:, 1].values
y = dataset.iloc[:, 0].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Applying TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer()
v.max_features = 5000
X_train = v.fit_transform(X_train).toarray()
X_test = v.transform(X_test).toarray()

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import MultinomialNB  # GaussianNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
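# Hedged follow-up completing the evaluation hinted at by the final import
# above; accuracy_score is standard scikit-learn, the rest reuses the
# variables defined in the snippet.
from sklearn.metrics import accuracy_score

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('accuracy: %.3f' % accuracy_score(y_test, y_pred))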