def comparison_test(text):
    import sklearn.feature_extraction.text as txt
    # Hashing trick: fixed number of features, no vocabulary stored
    h_trick = txt.HashingVectorizer(n_features=20, binary=True, norm=None)
    # Count (one-hot-style) encoding: vocabulary learned from the input text
    oh_encoder = txt.CountVectorizer()
    oh_encoded = oh_encoder.fit_transform(text)
    hashing = h_trick.transform(text)
    return oh_encoded, hashing
import sklearn.cluster
import sklearn.feature_extraction.text as txtvectorizer
import matplotlib.pyplot as plt


def QuickClusterParameterFinder(data):
    # Elbow-method sweep: fit MiniBatchKMeans for a range of cluster counts
    # and record the inertia so a sensible n_clusters can be chosen.
    Cost = list()
    vectorizer = txtvectorizer.HashingVectorizer(analyzer='word', ngram_range=(1, 10))
    vectors = vectorizer.transform(data['content'].dropna())
    for c in range(1, 100, 5):
        kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=c)
        kmeans.fit_transform(vectors)
        Cost.append(kmeans.inertia_)
        print(str(c) + " / 100")
    plt.plot(range(1, 100, 5), Cost)
    return Cost
import sklearn.feature_extraction.text as skyfe


def build_vectors(sentences, vocabulary_size):
    # Plain bag-of-words counts
    vectorizer = skyfe.CountVectorizer()
    trans = vectorizer.fit_transform(sentences)
    fname = vectorizer.get_feature_names()
    print(trans)
    print(trans.toarray())
    print(fname)
    # enable tf-idf
    transformer = skyfe.TfidfTransformer()
    tfidf = transformer.fit_transform(trans)
    print(tfidf.toarray())
    print(fname)  # feature names come from the CountVectorizer; the tf-idf matrix carries none
    # hashed
    vectorizer2 = skyfe.HashingVectorizer(n_features=6, norm=None)
    trans = vectorizer2.fit_transform(sentences)
    # HashingVectorizer has no get_feature_names(): the hashing is one-way
    print(trans.toarray())
def cluster_topics():
    #model = cluster.Birch(
    #    branching_factor=2,
    #    threshold=0.002  # Lower = more clusters, higher = fewer clusters
    #)
    #model = cluster.KMeans(
    #    branching_factor=10,
    #    threshold=0.1  # Lower = more clusters, higher = fewer clusters
    #)
    model = cluster.DBSCAN(min_samples=2, eps=0.2)
    #model = cluster.AffinityPropagation()
    vectorizer = text.HashingVectorizer(
        analyzer='char_wb',   # Character n-grams taken only from inside word boundaries
        norm='l2',            # L2-normalize each document vector
        lowercase=True,       # Convert everything to lowercase
        stop_words=stopwords)
    num_samples = 10000
    offset = 0
    while True:
        log.debug(u"Loading topics...")
        topic_rows = db.session.query(
            models.TopicModel.id,
            models.TopicModel.topic).filter_by(clustered=False).order_by(
                models.TopicModel.id.asc()).limit(num_samples).offset(offset).all()
        if not topic_rows:
            break
        log.debug(u"Loaded {} topics".format(len(topic_rows)))
        offset += len(topic_rows)
        go_cluster(vectorizer, model, topic_rows)
#!/usr/bin/python
# -*- coding: utf-8 -*-
#[email protected]
"""
==============================
Text vectorization, method 3
==============================
Tf-idf vectorization can run into memory problems when the vocabulary is very large.
Hash encoding reduces the number of features: each word is hashed to a bucket, so a
single bucket may correspond to several words, which compresses the memory footprint.
"""
from sklearn.datasets import fetch_20newsgroups
import sklearn.feature_extraction.text as t2v
import numpy as np

# fetch the data
newsgroups_train = fetch_20newsgroups(data_home="data", subset='train')
newsgroups_test = fetch_20newsgroups(data_home="data", subset='test')

# vectorize the words
vectorizer = t2v.HashingVectorizer(n_features=6)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

print("class:", newsgroups_train.target_names[newsgroups_train.target[1]])
print("data:", newsgroups_train.data[1])
print(vectors[1])
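# A minimal sketch (the sentence below is illustrative and not part of the original
# script) that makes the collision point above concrete: with only a handful of hash
# buckets, distinct words are forced to share columns, which is exactly the
# memory-for-precision trade-off the hashing trick accepts.
import sklearn.feature_extraction.text as t2v

tiny = t2v.HashingVectorizer(n_features=4, norm=None, alternate_sign=False)
X = tiny.transform(['alpha beta gamma delta epsilon zeta'])
print(X.toarray())  # six distinct words squeezed into 4 columns, so at least one column counts 2+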
# take document stream from stream_docs and return specific number of documents
def get_minibatch(doc_stream, size):
    docs, labels = [], []
    try:
        for _ in range(size):
            doc, label = next(doc_stream)
            docs.append(doc)
            labels.append(label)
    except StopIteration:
        return None, None
    return docs, labels


# use a data-independent hasher: no vocabulary has to be held in memory
vect = text.HashingVectorizer(decode_error='ignore',
                              n_features=2**21,
                              preprocessor=None,
                              tokenizer=preprocessor_two)
clf = lm.SGDClassifier(loss='log', random_state=1, max_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')

# perform out-of-core learning: 45 minibatches of 1,000 documents each
classes = np.array([0, 1])
for _ in range(45):
    data_train, target_train = get_minibatch(doc_stream, size=1000)
    if not data_train:
        break
    data_train = vect.transform(data_train)
    clf.partial_fit(data_train, target_train, classes=classes)

# use last 5000 documents for evaluation
data_test, target_test = get_minibatch(doc_stream, size=5000)
data_test = vect.transform(data_test)
print('Accuracy: %.3f' % clf.score(data_test, target_test))
@author: Administrator
"""
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'))

#from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.naive_bayes import MultinomialNB

#Bernoulli = BernoulliNB(alpha=0.01)
Multinomial = MultinomialNB(alpha=0.01)

import sklearn.feature_extraction.text as txt
# alternate_sign=False keeps the hashed counts non-negative, as MultinomialNB
# requires (it replaces the removed non_negative=True argument)
multinomial_hashing_trick = txt.HashingVectorizer(stop_words='english',
                                                  binary=False, norm=None,
                                                  alternate_sign=False)
#binary_hashing_trick = txt.HashingVectorizer(stop_words='english',
#                                             binary=True, norm=None,
#                                             alternate_sign=False)

Multinomial.fit(multinomial_hashing_trick.transform(newsgroups_train.data),
                newsgroups_train.target)
#Bernoulli.fit(binary_hashing_trick.transform(newsgroups_train.data),
#              newsgroups_train.target)

from sklearn.metrics import accuracy_score
for m, h in [(Multinomial, multinomial_hashing_trick)]:
    print('Accuracy for %s: %.3f' % (m, accuracy_score(
        y_true=newsgroups_test.target,
        y_pred=m.predict(h.transform(newsgroups_test.data)))))
import sklearn.feature_extraction.text as txtvectorizer


def vectorizeStrings(documents, ngramRange):
    # Character n-grams hashed into a fixed-size feature space
    vectorizer = txtvectorizer.HashingVectorizer(analyzer='char', ngram_range=ngramRange)
    vectors = vectorizer.transform(documents.fillna(""))
    return vectors
test_data['Position_Extra'] = test_data['Position_Extra'].apply(clean)
test_data['Program_Description'] = test_data['Program_Description'].apply(clean)
test_data['SubFund_Description'] = test_data['SubFund_Description'].apply(clean)
test_data['Sub_Object_Description'] = test_data['Sub_Object_Description'].apply(clean)
test_data['Text_1'] = test_data['Text_1'].apply(clean)
test_data['Text_2'] = test_data['Text_2'].apply(clean)
test_data['Text_3'] = test_data['Text_3'].apply(clean)
test_data['Text_4'] = test_data['Text_4'].apply(clean)

# create a single new column for cleaned text data
training_data["combined"] = [' '.join(row) for row in training_data[training_data.columns].values]
test_data["combined"] = [' '.join(row) for row in test_data[test_data.columns].values]

# initialize the TF-IDF vectorizer and the hashing vectorizer
tfidf = txt.TfidfVectorizer(ngram_range=(1, 2), max_df=1.0, min_df=10)
hsv = txt.HashingVectorizer()

# fit the TF-IDF vectorizer to the training data
# (HashingVectorizer is stateless, so its fit call is a no-op)
tfidf.fit(training_data['combined'])
hsv.fit(test_data['combined'])

# transform the training and test datasets to obtain sparse matrices
X_tfidf = tfidf.transform(training_data['combined'])
X_test_tfidf = tfidf.transform(test_data['combined'])
X_hsv = hsv.transform(training_data['combined'])
X_test_hsv = hsv.transform(test_data['combined'])

X = sparse.hstack((X_hsv, X_tfidf))
X_test = sparse.hstack((X_test_hsv, X_test_tfidf))
import sklearn.feature_extraction.text as txt


def hash_vector(text, features):
    # Hash the text into a fixed number of binary features
    hash_vectorizer = txt.HashingVectorizer(n_features=features, binary=True, norm=None)
    text_vector = hash_vectorizer.transform(text)
    return text_vector
""" __author__ = "Adrian Langseth" import pickle import sklearn.feature_extraction.text as skt import time from sklearn.naive_bayes import BernoulliNB from sklearn.metrics import accuracy_score from sklearn.tree import DecisionTreeClassifier t0 = time.time() k = pickle.load(open("sklearn-data.pickle", "rb")) vectorizer = skt.HashingVectorizer(stop_words='english', analyzer='word') xtrain_transformed = vectorizer.fit_transform(k['x_train']) xtest_transformed = vectorizer.fit_transform(k['x_test']) # NB classifier part NBclassifier = BernoulliNB() # Build Model NBclassifier.fit(xtrain_transformed, k['y_train']) # Fitting model NBpredicted_y = NBclassifier.predict(xtest_transformed) # Make prediction NBaccuracy_score = accuracy_score(k['y_test'], NBpredicted_y) # Evaluate prediction # Decision Tree part DTclassifier = DecisionTreeClassifier(max_depth=64, criterion="entropy") # Create Model DTclassifier.fit(xtrain_transformed, k['y_train']) # Fit model
if __name__ == "__main__": input_files = list(glob(str(base_dir / "data" / "*" / "*"))) data = [] for file_path in input_files: with open(file_path, "rt") as fh: data.append(fh.read()) dataset_size = 91 # MB for 20 newsgroup dataset print("# vectorizing {} documents:".format(len(data))) for label, vect in [ ("HashingVectorizer (vtext)", vtext.HashingVectorizer(norm=None)), ( "HashingVectorizer (scikit-learn)", skt.HashingVectorizer(lowercase=False, norm=None), ), ("CountVectorizer (vtext)", vtext.CountVectorizer(lowercase=False)), ("CountVectorizer (scikit-learn)", skt.CountVectorizer(lowercase=False)), ]: t0 = time() X = vect.fit_transform(data) dt = time() - t0 print("{:>40}: {:.2f}s [{:.1f} MB/s], shape={}, nnz={}".format( label, dt, dataset_size / dt, X.shape, X.nnz))
    for word in input_string.split(' '):
        index = abs(hash(word)) % vector_size
        # set the position hash(word) % vector_size in the vector to 1
        feature_vector[index] = 1
    return feature_vector
# if vector_size is too small, collisions (different words sharing a slot) become likely

from scipy.sparse import csc_matrix
print(csc_matrix([1, 0, 0, 0, 0, 1, 1, 0, 1, 0]))
# A sparse CSC matrix records only the positions of the non-zero entries,
# eliminating the zeros and saving a lot of memory.

# ------------ Using the built-in HashingVectorizer -------------------
import sklearn.feature_extraction.text as txt

sklearn_hashing_trick = txt.HashingVectorizer(n_features=20, binary=True, norm=None)
text_vector = sklearn_hashing_trick.transform(
    [' Python for data science', 'Python for machine learning'])
text_vector  # the text is transformed into a compact, sparse vector representation

# CountVectorizer: optimally encodes text into a data matrix but cannot address
#                  subsequent novelties in the text.
# HashingVectorizer: provides flexibility in situations when it is likely that the
#                    application will receive new data, but is less optimal than
#                    vocabulary-based techniques such as CountVectorizer.
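# A minimal sketch (with made-up sentences) of the contrast described above:
# CountVectorizer freezes its vocabulary at fit time, so a word it has never seen
# is silently dropped, while HashingVectorizer needs no fit at all and still
# assigns the new word a column.
import sklearn.feature_extraction.text as txt

train_texts = ['Python for data science', 'Python for machine learning']
new_text = ['Julia for data science']  # 'Julia' was never seen during fitting

count_vec = txt.CountVectorizer().fit(train_texts)
print(sorted(count_vec.vocabulary_))            # no column exists for 'julia'
print(count_vec.transform(new_text).toarray())  # the unseen word simply disappears

hash_vec = txt.HashingVectorizer(n_features=20, binary=True, norm=None)
print(hash_vec.transform(new_text).toarray())   # 'Julia' still hashes to a slot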