def vectorize(train_words, test_words):
    # Stop-word list
    with open('dict/stopwords.txt', 'r') as f:
        stopwords = set([w.strip() for w in f])
    v = HashingVectorizer(non_negative=True, stop_words=stopwords, n_features=30000)
    train_data = v.fit_transform(train_words)
    test_data = v.fit_transform(test_words)
    return train_data, test_data
def feature_extraction(self, test):
    """
    Feature extraction.
    :param test: test documents
    :return: training features, test features
    """
    train = self.load_train_set()
    vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=25000)
    fea_train = vectorizer.fit_transform(train)  # extract features
    fea_test = vectorizer.fit_transform(test)    # extract features
    return fea_train, fea_test
def main(new):
    with open("trainingdata.txt", "r") as f:
        int(f.readline())  # skip the leading document count
        training_set = [r.split(" ") for r in f]
    y = [int(doc[0]) for doc in training_set]
    corpus = [reduce(lambda x, y: x + " " + y, doc[1::]) for doc in training_set]
    # vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', lowercase=True)
    vectorizer = HashingVectorizer()
    X_train = vectorizer.fit_transform(corpus)
    y_train = np.array(y)
    data = vectorizer.fit_transform(new)
    clf = (LinearSVC(), "SVM")
    # print corpus
    test(clf, X_train, y_train, data)
def get_hashing(data):
    t0 = time.time()
    print("* Making hashing vectorizer with the data ...")
    hasher = HashingVectorizer(stop_words='english', ngram_range=(1, 3),
                               norm='l2', non_negative=True)  # l2: projected on the euclidean unit sphere
    hX = hasher.fit_transform(data)
    print("done in %0.3fs." % (time.time() - t0))
    return hX, hasher
def do_training():
    global X_train, X_test, feature_names, ch2
    print("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train_data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.25, stop_words='english')
        X_train = vectorizer.fit_transform(data_train_data)
    duration = time() - t0
    # print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test_data)
    duration = time() - t0
    # print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()

    if True:  # opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" % 20000)
        t0 = time()
        ch2 = SelectKBest(chi2, k=20000)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
        print("done in %fs" % (time() - t0))
        print()

    if feature_names:
        feature_names = np.asarray(feature_names)

    results = []
    # for penalty in ["l2", "l1"]:
    penalty = 'l2'
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    results.append(benchmark(clf))

    joblib.dump(vectorizer, 'vectorizer.pkl', compress=9)
    joblib.dump(ch2, 'feature_selector.pkl', compress=9)
    joblib.dump(clf, 'linearsvc_classifier.pkl', compress=9)
def sim_char10(text1, text2):
    vect = HashingVectorizer(analyzer='char_wb', tokenizer=normalize,
                             stop_words='english', ngram_range=(10, 10))
    texts = [text1, text2]
    matrix = vect.fit_transform(texts)
    cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten()
    simmax = max(cosine_similarities[1:])
    return simmax
def trainOnModel(x_VariableList, y_VariableList, testSetList, classifier,
                 hashing=False, chi_squared=False):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.linear_model import RidgeClassifier
    from sklearn.svm import LinearSVC
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    from sklearn.utils.extmath import density

    y_train = y_VariableList
    if hashing == True:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=2 ** 16)
        X_train = vectorizer.transform(x_VariableList)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
        X_train = vectorizer.fit_transform(x_VariableList)

    X_test = vectorizer.transform(testSetList)

    if chi_squared == True:
        print("Extracting best features by a chi-squared test")
        ch2 = SelectKBest(chi2, k=2 * 16)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)

    classifierObject = ""
    print "Using :", classifier
    if classifier == "LinearSVC":
        classifierObject = LinearSVC(penalty='l2', dual=False, tol=1e-3)
    elif classifier == "PassiveAggressiveClassifier":
        classifierObject = PassiveAggressiveClassifier(C=1.0, fit_intercept=True, loss='hinge',
                                                       n_iter=50, n_jobs=1, random_state=None,
                                                       shuffle=True, verbose=0, warm_start=False)
    elif classifier == "RidgeClassifier":
        classifierObject = RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True,
                                           fit_intercept=True, max_iter=None, normalize=False,
                                           solver='lsqr', tol=0.01)
    elif classifier == "Perceptron":
        classifierObject = Perceptron(alpha=0.0001, class_weight=None, eta0=1.0,
                                      fit_intercept=True, n_iter=50, n_jobs=1, penalty=None,
                                      random_state=0, shuffle=True, verbose=0, warm_start=False)
    elif classifier == "SGDClassifier":
        classifierObject = SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                                         epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                                         learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
                                         penalty='l2', power_t=0.5, random_state=None, shuffle=True,
                                         verbose=0, warm_start=False)

    classifierObject.fit(X_train, y_train)
    pred = classifierObject.predict(X_test)
    return pred[0]
class Featurizer:
    def __init__(self):
        self.vectorizer = HashingVectorizer(stop_words="english")

    def train_feature(self, examples):
        return self.vectorizer.fit_transform(examples)

    def test_feature(self, examples):
        return self.vectorizer.transform(examples)
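# A minimal usage sketch for the Featurizer class above (the document lists are
# hypothetical). Because HashingVectorizer is stateless, fit_transform and
# transform map train and test documents into the same hashed feature space.
feats = Featurizer()
X_train = feats.train_feature(["a first training document", "another document"])
X_test = feats.test_feature(["an unseen test document"])
print(X_train.shape, X_test.shape)  # both have 2**20 hashed columns by default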
def ngrams_hashing_vectorizer(strings, n, n_features):
    """ Return a sparse matrix with the hashed counts of every
    character n-gram in the strings. """
    hv = HashingVectorizer(analyzer='char', ngram_range=(n, n),
                           n_features=n_features, norm=None, alternate_sign=False)
    hash_matrix = hv.fit_transform(strings)
    return hash_matrix
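# Hedged usage sketch for ngrams_hashing_vectorizer above: hashing the character
# 3-grams of two example strings into a small feature space. The strings and the
# n_features value are illustrative only.
m = ngrams_hashing_vectorizer(['Paris', 'Parris'], n=3, n_features=128)
print(m.shape)        # (2, 128)
print(m.sum(axis=1))  # total number of character 3-grams per string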
def get_x(text, ngram_range):
    hash_vect_object = HashingVectorizer(ngram_range=ngram_range, stop_words="english",
                                         strip_accents="unicode")
    tfidf_transformer_object = TfidfTransformer(use_idf=True)
    x_train_counts = hash_vect_object.fit_transform(text)
    x_train_tfidf = tfidf_transformer_object.fit_transform(x_train_counts)
    return x_train_tfidf
def vectorize(docs):
    """
    Vectorize documents.
    :param docs list: iterable over raw text documents
    :return:
    """
    v = HashingVectorizer(tokenizer=comma_tokenizer, n_features=30000, non_negative=True)
    train_data = v.fit_transform(docs)
    return train_data
def vectorize_data(train_data, test_data):
    global app_vocabulary
    # vectorize=CountVectorizer(vocabulary=list(app_vocabulary))
    # counts_train=vectorize.fit_transform(train_data)
    # counts_test=vectorize.fit_transform(test_data)
    # tfidftransformer = TfidfTransformer();
    # counts_train=tfidftransformer.fit(counts_train).transform(counts_train);
    # counts_test=tfidftransformer.fit(counts_test).transform(counts_test);
    # f=open('model/vector.pkl','w')
    # pickle.dump(vectorize, f)
    vectorizer = HashingVectorizer()
    counts_train = vectorizer.fit_transform(train_data)
    counts_test = vectorizer.fit_transform(test_data)
    return counts_train, counts_test
def vector_func_char(l):
    vectorizer = HashingVectorizer(
        analyzer="char",
        input="content",
        decode_error="ignore",
        strip_accents="ascii",
        ngram_range=(2, 2),
        n_features=524288,
    )
    return str(l).split(" ")[0], vectorizer.fit_transform(str(l).replace(str(l).split(" ")[0], ""))
def vectorize_2(test_words):
    input_words = jieba.lcut(test_words[0])
    print check_neg(input_words)
    # if len(jieba.lcut(test_words[0])) < 2:
    if len(jieba.lcut(test_words[0])) < 2:
        return None, False
    else:
        v = HashingVectorizer(tokenizer=comma_tokenizer, stop_words=stopwords,
                              n_features=100000, non_negative=True)
        test_data = v.fit_transform(test_words)
        print test_data
        return test_data, check_neg(input_words)
def vector_func_word(l):
    vectorizer = HashingVectorizer(
        non_negative=True,
        stop_words="english",
        input="content",
        decode_error="ignore",
        strip_accents="ascii",
        n_features=262144,
    )
    # return str(l).split(" ")[0], vectorizer.fit_transform(str(l).replace(str(l).split(" ")[0], ""))
    return vectorizer.fit_transform(l).shape
def tfidfVectorizeData(listOfSentences, useHashTable=False, nFeatures=100):
    if useHashTable:
        from sklearn.feature_extraction.text import HashingVectorizer
        vec = HashingVectorizer(stop_words='english', non_negative=True, n_features=nFeatures)
        X_noProcess = vec.transform(listOfSentences).toarray()
    else:
        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
        X_noProcess = vec.fit_transform(listOfSentences).toarray()
    return vec, X_noProcess
def get_x(text, ngram_range):
    hash_vect_object = HashingVectorizer(ngram_range=ngram_range, stop_words="english",
                                         strip_accents="unicode",
                                         token_pattern=r"(?u)\b[a-zA-Z_][a-zA-Z_]+\b")
    # tokens are character strings of 2 or more characters
    tfidf_transformer_object = TfidfTransformer(use_idf=True)
    x_train_counts = hash_vect_object.fit_transform(text)
    x_train_tfidf = tfidf_transformer_object.fit_transform(x_train_counts)
    return x_train_tfidf
def trainFeatureExtract(self, opts, trainData, trainDataSize):
    print 'Extracting features from the training dataset using a sparse vectorizer'
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        dataTrain = vectorizer.transform(trainData.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
        dataTrain = vectorizer.fit_transform(trainData.data)
    duration = time() - t0
    print 'done in %fs at %0.3fMB/s' % (duration, trainDataSize / duration)
    print 'n_samples: %d, n_features: %d' % dataTrain.shape
    print
    return dataTrain, vectorizer
class MultiNBClass:
    def __init__(self, corpus, classes, method):
        # Set up vectorizer
        if method == 'count':
            self.vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 3))
        elif method == 'tfidf':
            self.vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 3))
        elif method == 'hashing':
            self.vectorizer = HashingVectorizer(non_negative=True)
        else:
            print 'Method must be count, tfidf, or hashing'
        # vectorize and set up classifier.
        self.X = self.vectorizer.fit_transform(corpus)
        classifier = MultinomialNB()
        self.classifier = classifier.fit(self.X, classes)
class CompanyPrefix(BaseEstimator, TransformerMixin):
    def __init__(self, filepath, key=""):
        self.key = key
        self.vect = HashingVectorizer(decode_error='strict', n_features=2 ** 18, binary=True)
        self.gcp_length_table = pd.read_csv(filepath, sep="\t", dtype=str)
        lens = self.gcp_length_table['prefix'].str.len()
        self.max_key_len = lens.max()
        self.min_key_len = lens.min()

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        # Pad GTIN's with zeroes so that they are all 13 chars long
        data_dict[self.key] = data_dict[self.key].fillna(0).astype(int).astype(str).str.pad(13, fillchar="0")

        # Create a row for keeping track of company prefix lengths
        data_dict['gcp_length'] = np.nan

        # Iterate through the different lengths of prefixes in the
        # company prefix lookup table. Then extract possible prefixes from the
        # GTIN's for each prefix length.
        # Join these possible prefixes with the lookup table prefixes
        # to get the length of this gcp prefix
        for length in range(self.min_key_len, self.max_key_len + 1):
            # Generate column gtin_<length> with the first <length> digits of each
            # row's GTIN
            data_dict['gtin_' + str(length)] = data_dict[self.key].str[0:length]
            # Join this GTIN prefix with the lookup table to see if it exists.
            # If it doesn't, the gcp_len will be NaN
            data_dict = data_dict.merge(self.gcp_length_table, how="left",
                                        left_on="gtin_" + str(length),
                                        right_on='prefix', sort=False)
            is_not_nan = pd.notnull(data_dict['gcp_len'])
            # set gcp_length column for rows that aren't NaN
            data_dict.ix[is_not_nan, 'gcp_length'] = data_dict.ix[is_not_nan, 'gcp_len']
            # Drop temporary rows that were created
            data_dict.drop(['gtin_' + str(length), 'prefix', 'gcp_len'], axis=1, inplace=True)

        # Fill NaN's with 0's
        data_dict['gcp_length'] = data_dict['gcp_length'].fillna('0').astype(int)

        # Create string columns for storing actual gcp's
        data_dict['gcp'] = ''

        # Only apply to strings that actually have a GTIN
        # (would have been padded with 13 zeroes in earlier steps)
        isvalid = data_dict[self.key] != '0000000000000'

        # lambda to substring gtins based on gcp_length column value
        substring_GTIN = lambda row: row[self.key][0:row['gcp_length']]

        # apply substring
        data_dict.ix[isvalid, 'gcp'] = data_dict.ix[isvalid].apply(substring_GTIN, axis=1)

        # apply HashingVectorizer to result
        return self.vect.fit_transform(data_dict['gcp'])
def test_dummy_analyzer(self):
    X, X_rdd = self.generate_text_dataset()

    def splitter(x):
        return x.split()

    X = map(splitter, X)
    X_rdd = X_rdd.map(lambda x: map(splitter, x))

    local = HashingVectorizer(analyzer=lambda x: x)
    dist = SparkHashingVectorizer(analyzer=lambda x: x)

    result_local = local.transform(X)
    result_dist = sp.vstack(dist.transform(X_rdd).collect())
    assert_array_equal(result_local.toarray(), result_dist.toarray())

    result_local = local.fit_transform(X)
    result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())
    assert_array_equal(result_local.toarray(), result_dist.toarray())
def test_chunked_hashing_vectorizer(self):
    # results should not depend on chunk size
    _, X = _extract_reads(Artifact.import_data(
        'FeatureData[Sequence]',
        self.get_data_path('se-dna-sequences.fasta')).view(DNAIterator))
    params = {'analyzer': 'char', 'n_features': 8192,
              'ngram_range': [8, 8], 'alternate_sign': False}
    hv = HashingVectorizer(**params)
    unchunked = hv.fit_transform(X)
    for chunk_size in (-1, 3, 13):
        chv = ChunkedHashingVectorizer(chunk_size=chunk_size, **params)
        chunked = chv.fit_transform(X)
        for x1, x2 in zip(chunked, unchunked):
            self.assertTrue((x1.todense() == x2.todense()).all())
def test_dummy_analyzer(self):
    X, X_rdd = self.make_text_rdd()

    def splitter(x):
        return x.split()

    X = list(map(splitter, X))
    X_rdd = X_rdd.map(lambda x: list(map(splitter, x)))

    local = HashingVectorizer(analyzer=lambda x: x)
    dist = SparkHashingVectorizer(analyzer=lambda x: x)

    result_local = local.transform(X).toarray()
    result_dist = dist.transform(X_rdd).toarray()
    assert_array_equal(result_local, result_dist)

    result_local = local.fit_transform(X).toarray()
    result_dist = dist.fit_transform(X_rdd).toarray()
    assert_array_equal(result_local, result_dist)
def vectorize(self, wsl):
    print("loading wiki documents dataset")
    # wsl = WikiSampleLoader()
    data = wsl.load_dataset()
    self._cluster_list = data.target_names
    self._labels = data.target

    print("%d documents" % len(data.data))
    print("%d categories" % len(data.target_names))
    print()

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if self._use_hashing:
        if self._use_idf:
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(
                n_features=self._n_features,
                stop_words=self._stop_words,
                non_negative=self._non_negative,
                norm=self._norm,
                binary=self._binary,
            )
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(
                n_features=self._n_features,
                stop_words=self._stop_words,
                non_negative=self._non_negative,
                norm="l2",
                binary=self._binary,
            )
    else:
        vectorizer = TfidfVectorizer(
            max_df=self._max_df,
            max_features=self._n_features,
            min_df=self._min_df,
            stop_words=self._stop_words,
            use_idf=self._use_idf,
        )

    self._X = vectorizer.fit_transform(data.data)
    self._vectorizer = vectorizer

    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % self._X.shape)
    print()
class svm_text(SVC):
    # svm_ = SVC(C=500, kernel='poly', gamma=.01, shrinking=True, probability=False, degree=10, coef0=2,
    #            tol=0.001, cache_size=20000, class_weight=None, verbose=False, max_iter=-1)

    def __init__(self, train_data, C=5, kernel='poly', gamma=.001, degree=10, coef0=2,
                 n_features=10000000, ngram_range=(1, 10), tfidf=False, dfrange=(2, 1.0),
                 probability=False, class_weight=None):
        self.conn = None
        self.is_tfidf = tfidf
        if tfidf:
            self.vectorizer = TfidfVectorizer(stop_words=None, min_df=dfrange[0], max_df=dfrange[1],
                                              max_features=n_features, strip_accents='unicode',
                                              ngram_range=ngram_range, analyzer='word', norm='l2')
        else:
            self.vectorizer = HashingVectorizer(stop_words=None, non_negative=True,
                                                n_features=n_features, strip_accents='unicode',
                                                ngram_range=ngram_range, analyzer='word', norm='l2')
        self.param_set = {'C': str(C), 'kernel': str(kernel), 'gamma': str(gamma),
                          'degree': str(degree), 'coef0': str(coef0), 'n_features': str(n_features)}
        if class_weight == 'auto':
            class_weight = {}
            for item in train_data.target:
                if class_weight.get(item):
                    class_weight.update({item: class_weight[item] + 1.0})
                else:
                    class_weight.update({item: 1.0})
            for key in class_weight:
                class_weight.update({key: 1.0 / class_weight[key]})
        self.class_weight_dict = class_weight
        super(svm_text, self).__init__(C=C, kernel=kernel, gamma=gamma, shrinking=True,
                                       probability=probability, degree=degree, coef0=coef0,
                                       tol=0.001, cache_size=20000, class_weight=class_weight,
                                       verbose=False, max_iter=-1)
        if self.is_tfidf:
            train_x = self.vectorizer.fit_transform(train_data.data)
        else:
            train_x = self.vectorizer.transform(train_data.data)
        self.fit(train_x, train_data.target)

    def test_data(self, test_data):
        test_x = self.vectorizer.transform(test_data.data)
        predicted_values = self.predict(test_x)
        test_y = test_data.target
        self.score = metrics.f1_score(test_y, predicted_values)
        self.accuracy = metrics.accuracy_score(test_y, predicted_values)

    def guess_text(self, text_text):
        text_x = self.vectorizer.transform([pre_proc(text_text, removestop=False, alwayskeep=True,
                                                     word_punc=True, unquote=True), ])
        return self.predict(text_x)
def build_sentiment_classifier(X, y, bids, dates):
    '''
    Train and pickle the sentiment classifier
    '''
    n_train_samples = y.shape[0]

    tfidf = HashingVectorizer(tokenizer=word_tokenize, stop_words='english',
                              ngram_range=(1, 3), n_features=10000)
    X_tfidf = tfidf.fit_transform(X)  # .todense()

    '''
    X1 = X[:n_train_samples]
    X2 = X[n_train_samples:]
    '''
    X1_tfidf = X_tfidf[:n_train_samples, :]
    X2_tfidf = X_tfidf[n_train_samples:, :]

    # Uncomment the section below to enable Grid Search for optimal parameter
    # search
    '''
    clf_SVM = Pipeline([('clf_SVM', LinearSVC())])
    params = {
        'clf_SVM__C': [0.01, 0.5, 1, 10],
        'clf_SVM__tol': [1e-2, 1e-3, 1e-4],
        'clf_SVM__dual': [True, False]
    }
    gs = GridSearchCV(clf_SVM, params, cv=5, scoring='f1')
    gs.fit(X1_tfidf, y)
    print gs.best_score_
    print gs.best_estimator_.get_params()
    '''
    clf_SVM = LinearSVC(C=0.5, tol=1e-2, dual=False)
    clf_SVM.fit(X1_tfidf, y)

    y2 = clf_SVM.predict(X2_tfidf)
    y2 = np.vstack((dates, bids, y2))

    return y2
def calssify(text):
    # Multinomial Naive Bayes Classifier
    clf = MultinomialNB()
    clf = joblib.load('model/' + str(type(clf))[8:-2] + '.model')
    with open('dict/stopwords.txt', 'r') as f:
        stopwords = set([w.strip() for w in f])
    v = HashingVectorizer(non_negative=True, stop_words=stopwords, n_features=30000)
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = ' '.join(jieba.cut(text, cut_all=False))
    text = re.sub(u'[$^()-=~!@#¥%……&*()——+·{}|:“”《》?【】、;‘’,。、]+', u'', text)
    text = text.encode('utf-8')
    test_data = v.fit_transform([text])
    pred = clf.predict(test_data)
    return pred[0][0]
def extractFeatures():
    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" % opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print()
    return X_train, X_test
def main():
    vectorizer = HashingVectorizer(stop_words="english", binary=True,
                                   tokenizer=lambda text: text.split(),
                                   token_pattern=r"(?u)\b(?:\w|\?)(?:\w|\?)+\b",
                                   ngram_range=(1, 2))
    (X, Y) = get_train()
    (Xcv, Ycv) = get_cv()

    Xt = vectorizer.fit_transform(X)
    # selector = SelectPercentile(f_classif, percentile=40)
    # Xtt = selector.fit_transform(Xt, Y)
    Xtt = Xt

    Xcvt = vectorizer.transform(Xcv)
    # Xcvtt = selector.transform(Xcvt)
    Xcvtt = Xcvt

    # model = LinearSVC()
    # model = SVC(kernel='rbf', gamma=1.0, cache_size=1000)
    model = MultinomialNB(fit_prior=False)
    model.fit(Xtt, Y)

    Pcv = model.predict(Xcvtt)
    print_stats(Ycv, Pcv)
class svm_multi_label_text(OneVsRestClassifier):
    # svm_ = SVC(C=500, kernel='poly', gamma=.01, shrinking=True, probability=False, degree=10, coef0=2,
    #            tol=0.001, cache_size=20000, class_weight=None, verbose=False, max_iter=-1)

    def __init__(self, train_data, C=None, n_features=10000000, loss='l2', penalty='l1',
                 ngram_range=(1, 10), tfidf=False, dfrange=(2, 1.0), dual=True, tol=1e-4):
        self.conn = None
        self.is_tfidf = tfidf
        if tfidf:
            self.vectorizer = TfidfVectorizer(stop_words=None, min_df=dfrange[0], max_df=dfrange[1],
                                              max_features=n_features, strip_accents='unicode',
                                              ngram_range=ngram_range, analyzer='word')
        else:
            self.vectorizer = HashingVectorizer(stop_words=None, non_negative=True,
                                                n_features=n_features, strip_accents='unicode',
                                                ngram_range=ngram_range, analyzer='word')
        self.param_set = {'C': str(), 'kernel': str(), 'gamma': str(),
                          'degree': str(), 'coef0': str(), 'n_features': str(n_features)}
        super(svm_multi_label_text, self).__init__(
            LinearSVC(C=C, loss=loss, penalty=penalty,
                      dual=(False if penalty == 'l1' else dual), tol=tol))
        if self.is_tfidf:
            train_x = self.vectorizer.fit_transform(train_data.data)
        else:
            train_x = self.vectorizer.transform(train_data.data)
        train_y = train_data.target
        self.fit(train_x, train_y)

    def test_data(self, test_data):
        test_x = self.vectorizer.transform(test_data.data)
        predicted_values = self.predict(test_x)
        test_y = test_data.target
        try:
            self.score = metrics.f1_score(test_y, predicted_values)
        except ZeroDivisionError:
            self.score = -0.1
        try:
            self.accuracy = metrics.accuracy_score(test_y, predicted_values)
        except ZeroDivisionError:
            self.accuracy = -0.1

    def guess_text(self, text_text):
        text_x = self.vectorizer.transform([pre_proc(text_text, removestop=False, alwayskeep=True,
                                                     word_punc=True, unquote=True), ])
        return self.predict(text_x)
from sklearn.feature_extraction.text import HashingVectorizer
from bin2op import parse, unique, counts, nextIndex
import numpy as np
import math
import sys

np.set_printoptions(threshold=sys.maxsize)

file = './a.exe'
syntax = "intel"
shellcode, code, opcodes, operands, instructions = parse(file, syntax, None)

sentences = instructions
ops = unique(operands + opcodes)
ops.sort()
unique_ops_count = len(ops)

vectorizer = HashingVectorizer(norm=None, n_features=unique_ops_count)
sentence_vectors = vectorizer.fit_transform(sentences)
vector2array = sentence_vectors.toarray()
arr = np.array(vector2array)
print(arr[0:3])
# Build tokenizer (removes upper case)
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
# Make a callable function for the vectorizer
tok_func = lambda s: tokenizer.tokenize(s)

#############################
# VECTORIZER and CLASSIFIER #
#############################
vectorizer = HashingVectorizer(tokenizer=tok_func, ngram_range=(1, 1))

# Vectorize the tweets
train_vectors = vectorizer.fit_transform(train_tweets)
dev_vectors = vectorizer.transform(dev_tweets)
test_vectors = vectorizer.transform(test_tweets)

# Add lexicon information
train_vectors = hstack((train_vectors, train_polarities))
dev_vectors = hstack((dev_vectors, dev_polarities))
test_vectors = hstack((test_vectors, test_polarities))

classifier = LinearSVC(C=0.1)

#########
# TRAIN #
#########
classifier.fit(train_vectors, train_labels)
import codecs
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.cluster import KMeans

fr = open('weibo_fenci_result.txt', 'r', encoding='utf-8')
id_list = []
data_list = []
for line in fr.readlines():
    term = line.strip().split("\t")
    if len(term) == 2 and term[1] != " ":
        id_list.append(term[0])
        data_list.append(term[1])

hv = HashingVectorizer(n_features=10000, non_negative=True)
post_tfidf = hv.fit_transform(data_list)
print('Size of fea_train:' + repr(post_tfidf.shape))
print(post_tfidf.nnz)
print("tfidf has done!!!")

id = id_list
tfidf_vec = post_tfidf
kmean = KMeans(n_clusters=300)
kmean.fit(tfidf_vec)
pred = kmean.predict(tfidf_vec)
print(pred)

fo = open("cluster.txt", "a+", encoding="utf-8")
count = 0
for i in range(len(pred)):
    count += 1
    fo.write(id[i] + "\t" + str(pred[i]) + "\n")
# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
if opts.use_hashing:
])

# Import HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Get text data: text_data
text_data = combine_text_columns(X_train)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate the HashingVectorizer: hashing_vec
hashing_vec = HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Fit and transform the Hashing Vectorizer
hashed_text = hashing_vec.fit_transform(text_data)

# Create DataFrame and print the head
hashed_df = pd.DataFrame(hashed_text.data)
print(hashed_df.head())

# Import the hashing vectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Instantiate the winning model pipeline: pl
pl = Pipeline([
    ('union', FeatureUnion(transformer_list=[
        ('numeric_features', Pipeline([('selector', get_numeric_data),
                                       ('imputer', Imputer())])),
        ('text_features',
                    default=False, action='store_true',
                    help="Use dictionary features")
parser.add_argument('--limit', default=-1, type=int,
                    help="How many sentences to use")

flags = parser.parse_args()

analyzer = Analyzer(flags.word, flags.all_before, flags.all_after,
                    flags.one_before, flags.one_after, flags.characters,
                    flags.dictionary)

vectorizer = HashingVectorizer(analyzer=analyzer)

x_train = vectorizer.fit_transform(
    ex for ex, tgt in all_examples(flags.limit))
x_test = vectorizer.fit_transform(
    ex for ex, tgt in all_examples(flags.limit, train=False))

for ex, tgt in all_examples(1):
    print(" ".join(analyzer(ex)))

y_train = array(list(tgt for ex, tgt in all_examples(flags.limit)))
y_test = array(
    list(tgt for ex, tgt in all_examples(flags.limit, train=False)))

lr = SGDClassifier(loss='log', penalty='l2', shuffle=True)
lr.fit(x_train, y_train)

print("TRAIN\n-------------------------")
accuracy(lr, x_train, y_train, all_examples(flags.limit))
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english', alternate_sign=False,
                                   norm=None, binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False, norm='l2',
                                       binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=opts.use_idf)
X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(opts.n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
cosine_list = clean_concat(lm_total)
cosine_list['concat_data'] = cosine_list['concat_data'].str.lower()
del lm_total

# # a = cosine_list.concat_data.str.split(expand=True).stack().value_counts().head(2000)
# # keep the stop-words, not the digits
# ind_word_a = [ind for ind in a.index if ~ind.isdigit()]
# a = a[a.index.isin(ind_word_a)]

'''CONCAT_DATA '''
# vectorizer = TfidfVectorizer(analyzer=ngrams, min_df=1)
stop_words = get_stop_words('de')
vectorizer = HashingVectorizer(stop_words=stop_words)
vector_1 = vectorizer.fit_transform(cosine_sign_ups.concat_data)
vector_2 = vectorizer.fit_transform(cosine_list.concat_data)

t1 = time.time()
matches = awesome_cossim_top(vector_1, vector_2.transpose(), 1, 0.1)
t = time.time() - t1
print("SELFTIMED:", t)

matches_df = get_matches_df(matches, name_vector_1=cosine_sign_ups, name_vector_2=cosine_list,
                            col_name='concat_data', top=cosine_list.shape[0])
matches_df.sort_values('similarity', inplace=True)

''''''
matches_df = pd.read_csv('matches_df_left_overs.csv')
matches_df = matches_df.query('similarity > 0.35')
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english', non_negative=True,
                                   norm=None, binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       non_negative=False, norm='l2',
                                       binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=opts.use_idf)
X = vectorizer.fit_transform(corpus)
# print(X)

print("done in %fs" % (time() - t0))
# n_samples: how many articles are there
# n_features: how many different words in all articles are there
print("n_samples: %d, n_features: %d" % X.shape)
print()

if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
class Reddit:
    # initialize class variables
    text_matrix = None
    unstemmed_text_matrix = None
    vectorized_text_matrix = None
    text_matrix_reduced = None
    sub_list = None
    sub_to_index = None
    index_to_sub = None
    commonality_matrix = None

    def __init__(self, from_db=True, encoding_type='tfidf', distance_method='cosine'):
        # check if everything already exists
        # get it from the db or from the web if not
        try:
            print("Checking if information is available...", end="")
            Reddit.text_matrix = np.load('text_matrix.npy')
            Reddit.unstemmed_text_matrix = np.load('unstemmed_text_matrix.npy')
            Reddit.sub_list = pickle.load(open("sub_list.p", "rb"))
            Reddit.sub_to_index = pickle.load(open("sub_to_index.p", "rb"))
            print("Done.\n")
        except FileNotFoundError:
            print("Not available.\n")
            print("Loading from database.\n")
            print("This will take a minute...\n")
            if from_db:
                Reddit.text_matrix, Reddit.sub_list, Reddit.sub_to_index = self.data_from_db()
            else:
                Reddit.text_matrix, Reddit.sub_list, Reddit.sub_to_index = self.data_from_scrape()

        Reddit.index_to_sub = {value: key for key, value in Reddit.sub_to_index.items()}

        if encoding_type == 'tfidf':
            self.vectorizer = TfidfVectorizer()
        elif encoding_type == 'count':
            self.vectorizer = CountVectorizer()
        elif encoding_type == 'hash':
            self.vectorizer = HashingVectorizer()

        if distance_method == 'cosine':
            self.distance = self.cosine_distance

        try:
            Reddit.vectorized_text_matrix = np.load('vectorized_text_matrix.npy')
            Reddit.text_matrix_reduced = np.load('text_matrix_reduced.npy')
        except FileNotFoundError:
            print("Vectorizing and reducing text matrix.\n")
            Reddit.vectorized_text_matrix, Reddit.text_matrix_reduced = self.process_text_matrix()
            print("Done\n")

        # check if the commonality matrix already exists, build it if it doesn't
        try:
            Reddit.commonality_matrix = np.load('commonality_matrix.npy')
        except FileNotFoundError:
            Reddit.commonality_matrix = self.build_matrix()

    def data_from_db(self):
        """
        get subreddit corpus from database reddit.db
        :return:
            text_matrix: matrix of text in subreddits. rows are subreddits.
            sub_list: list of subreddits included in the matrix
            sub_to_index: dictionary for converting from subreddit name to index in the matrix
        """
        sub_list = []
        text_matrix = []
        unstemmed_text_matrix = []  # used for word cloud later
        connecting_to_db = True
        sql_command = "SELECT subreddit, GROUP_CONCAT(body, ' ') as all_comments FROM comments GROUP BY subreddit"
        while connecting_to_db:
            try:
                print("Connecting to DB.\n")
                pwd = os.getcwd()
                db_conn = sqlite3.connect(pwd + '/../db/reddit.db')
                c = db_conn.cursor()
                results = c.execute(sql_command)
            except sqlite3.OperationalError:
                print("Table does not exist yet. Creating from CSV.\n")
                create_db(db_conn)
                continue
            print("Done.")
            break

        english_stop_words = stopwords.words('english')
        r = praw.Reddit(user_agent='daniel_scraper')
        for i, row in enumerate(list(results)):
            print("Loading subreddit {}: {}....".format(i, row[0]), end="")
            '''
            try:
                if r.get_subreddit(row[0]).subscribers < 50000:
                    print("Done")
                    continue
            except:
                print("Something went wrong. Continuing.")
                continue
            '''
            sub_list.append(row[0].lower())
            text_matrix.append(process_text(row[1], punctuation, english_stop_words))
            unstemmed_text_matrix.append(process_text(row[1], punctuation, english_stop_words, stem=False))
            print("Done")

        sub_to_index = {sub_name: index for sub_name, index in zip(sub_list, range(len(sub_list)))}
        print("Done.\n")
        text_matrix = np.array(text_matrix)
        unstemmed_text_matrix = np.array(unstemmed_text_matrix)
        np.save('unstemmed_text_matrix.npy', unstemmed_text_matrix)
        np.save('text_matrix.npy', text_matrix)
        pickle.dump(sub_list, open("sub_list.p", "wb"))
        pickle.dump(sub_to_index, open("sub_to_index.p", "wb"))
        return text_matrix, sub_list, sub_to_index

    def data_from_scrape(self):
        """
        get subreddit corpus from web scrape if database is not available
        :return:
            text_matrix: matrix of text in subreddits. rows are subreddits.
            sub_list: list of subreddits included in the matrix.
            sub_to_index: dictionary for converting from subreddit name to index in the matrix.
        """
        text_matrix = []
        response = requests.get('http://redditlist.com/sfw')
        sub_list = re.findall('/r/(\w+)\\\'', response.text)
        sub_list = set(sub_list)
        r = praw.Reddit(user_agent='daniel_scraper')
        for sub in self.sub_list:
            if r.get_subreddit(sub).subscribers < 50000:
                self.sub_list.pop(sub)
        sub_list = list(sub_list)
        for sub in sub_list:
            # instantiate string of submission and comments for this specific subreddit
            this_subs_submissions = ''
            this_subs_comments = ''
            submissions = r.get_subreddit(sub).get_hot(limit=25)  # get the top 25 submissions
            for submission in submissions:
                this_subs_submissions += " "
                this_subs_submissions += submission.title.lower()  # add submission to all submissions
                for comment in submission.comments:
                    this_subs_comments += " "
                    this_subs_comments += comment.body.lower()  # add comment to all comments
            text_matrix.append(this_subs_submissions + this_subs_comments)
        text_matrix = np.array(text_matrix)
        sub_to_index = {sub_name: index for sub_name, index in zip(sub_list, range(len(sub_list)))}
        np.save('text_matrix.npy', text_matrix)
        return text_matrix, sub_list, sub_to_index

    def process_text_matrix(self, n_components=100):
        """
        :param n_components: number of singular values to retain
        :return: reduced dimension text matrix using truncated SVD
        """
        vectorized_text_matrix = self.vectorizer.fit_transform(self.text_matrix)
        reducer = TruncatedSVD(n_components=n_components)
        text_matrix_reduced = reducer.fit_transform(vectorized_text_matrix)
        np.save('vectorized_text_matrix.npy', vectorized_text_matrix)
        np.save('text_matrix_reduced.npy', text_matrix_reduced)
        return vectorized_text_matrix, text_matrix_reduced

    @staticmethod
    def cosine_distance(vec1, vec2):
        """
        :param vec1: 1D numpy array
        :param vec2: 1D numpy array
        :return: cosine distance between the two vectors
        """
        # confirm they're numpy arrays
        vec1 = np.array(vec1)
        vec2 = np.array(vec2)
        return vec1.dot(vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

    def build_matrix(self):
        """
        :return: Reddit "commonality matrix" C
            C[i,j] corresponds to the similarity between subreddit i and subreddit j
            Distance measure is a parameter to the class, defaults to cosine distance
        """
        # initialize a commonality matrix
        commonality_matrix = np.zeros((Reddit.text_matrix.shape[0], Reddit.text_matrix.shape[0]))
        for i in range(len(commonality_matrix)):
            for j in range(i, len(commonality_matrix)):
                commonality = self.distance(Reddit.text_matrix_reduced[i], Reddit.text_matrix_reduced[j])
                commonality_matrix[i, j] = commonality
            commonality_matrix[(i+1):, i] = commonality_matrix[i, (i+1):]
        # save commonality matrix for later use
        np.save('commonality_matrix.npy', commonality_matrix)
        return commonality_matrix
def convert(x):
    if x < -0.05:
        return 0
    elif -0.05 < x < 0.05:
        return 1
    else:
        return 2

# Labeling based on returned values:
data_df['label_stemmed'] = data_df['sentiment_stemmed'].apply(lambda x: convert(x['compound']))

# importing HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split

# hashing vectorization
X = data_df['tweet_stemmed']
hashing_vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False)
hash_stem = hashing_vectorizer.fit_transform(X)
y = data_df['label_stemmed']
# print("Data vectorized")

# vectorization time
Vectorizing_time = time.time()
# print("Vectorizing_time :", Vectorizing_time - start_time)

# train and test set formed
hashing_trainset = hash_stem[:319685, :]
hashing_testset = hash_stem[319685:, :]
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
x_train = hashing_trainset[y_train.index]
x_test = hashing_trainset[y_test.index]
print("Data split into train and test set")
take = .98
filtered_df = films[films['opening_wknd'] < films['opening_wknd'].quantile(.97)].reset_index(drop=True)
filtered_df = filtered_df[filtered_df['pct_profit'] < filtered_df['pct_profit'].quantile(take)]
filtered_df = filtered_df[filtered_df['pct_profit'] > filtered_df['pct_profit'].quantile(1 - take)].reset_index(drop=True)

# ******this filters the films we suspect reported bad data**********
# filtered_df = filtered_df.drop(filtered_df[(filtered_df['budget'] != filtered_df['opening_wknd']) &
#                                            (filtered_df['budget'] < 150000)].index).reset_index(drop=True)

# Make the vector from the strings
vectorizer = HashingVectorizer(n_features=1000)
vector = vectorizer.fit_transform(filtered_df['train_string'].to_numpy())
vec_df = pd.DataFrame.sparse.from_spmatrix(vector)

# make dummies from our curated columns
dum = pd.get_dummies(filtered_df[[
    'release_month', 'actor1_class', 'actor2_class', 'actor3_class', 'rating'
]])  # 'actor1_class', 'actor2_class', 'actor3_class',
dum.head(1)

# pull the columns we want from the main DF
use_cols = filtered_df[[
    'budget', 'action', 'adventure', 'animated', 'biography', 'drama',
    'documentary', 'comedy', 'crime', 'fantasy', 'family', 'musical',
    'horror', 'war', 'mystery', 'sci-fi', 'thriller', 'romance'
]]
print()
print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    print('using hashing...')
    vectorizer = HashingVectorizer(non_negative=True, n_features=opts.n_features,
                                   tokenizer=jieba_tokenizer)
    X_train = vectorizer.transform(x_train)
else:
    print('using tfidf...')
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, tokenizer=jieba_tokenizer)
    X_train = vectorizer.fit_transform(x_train)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(x_test)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
if opts.use_hashing:
def plot():
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    # parse commandline arguments
    op = OptionParser()
    op.add_option("--report",
                  action="store_true", dest="print_report",
                  help="Print a detailed classification report.")
    op.add_option("--chi2_select",
                  action="store", type="int", dest="select_chi2",
                  help="Select some number of features using a chi-squared test")
    op.add_option("--confusion_matrix",
                  action="store_true", dest="print_cm",
                  help="Print the confusion matrix.")
    op.add_option("--top10",
                  action="store_true", dest="print_top10",
                  help="Print ten most discriminative terms per class"
                       " for every classifier.")
    op.add_option("--all_categories",
                  action="store_true", dest="all_categories",
                  help="Whether to use all categories or not.")
    op.add_option("--use_hashing",
                  action="store_true",
                  help="Use a hashing vectorizer.")
    op.add_option("--n_features",
                  action="store", type=int, default=2 ** 16,
                  help="n_features when using the hashing vectorizer.")
    op.add_option("--filtered",
                  action="store_true",
                  help="Remove newsgroup information that is easily overfit: "
                       "headers, signatures, and quoting.")

    (opts, args) = op.parse_args()
    if len(args) > 0:
        op.error("this script takes no arguments.")
        sys.exit(1)

    print(__doc__)
    op.print_help()
    print()

    ###########################################################################
    # Load some categories from the training set
    if opts.all_categories:
        categories = None
    else:
        categories = [
            'alt.atheism',
            'talk.religion.misc',
            'comp.graphics',
            'sci.space',
        ]

    if opts.filtered:
        remove = ('headers', 'footers', 'quotes')
    else:
        remove = ()

    print("Loading 20 newsgroups dataset for categories:")
    print(categories if categories else "all")

    data_train = fetch_20newsgroups(subset='train', categories=categories,
                                    shuffle=True, random_state=42, remove=remove)
    data_test = fetch_20newsgroups(subset='test', categories=categories,
                                   shuffle=True, random_state=42, remove=remove)
    print('data loaded')

    categories = data_train.target_names  # for case categories == None

    def size_mb(docs):
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()

    # split a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" % opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print()

    def trim(s):
        """Trim string to fit on terminal (assuming 80-column display)"""
        return s if len(s) <= 80 else s[:77] + "..."

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    ###########################################################################
    # Benchmark classifiers
    def benchmark(clf):
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)

        score = metrics.f1_score(y_test, pred)
        print("f1-score: %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

            if opts.print_top10 and feature_names is not None:
                print("top 10 keywords per class:")
                for i, category in enumerate(categories):
                    top10 = np.argsort(clf.coef_[i])[-10:]
                    print(trim("%s: %s" % (category, " ".join(feature_names[top10]))))
            print()

        if opts.print_report:
            print("classification report:")
            print(metrics.classification_report(y_test, pred,
                                                target_names=categories))

        if opts.print_cm:
            print("confusion matrix:")
            print(metrics.confusion_matrix(y_test, pred))

        print()
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time

    results = []
    for clf, name in (
            (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
            (Perceptron(n_iter=50), "Perceptron"),
            (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
            (KNeighborsClassifier(n_neighbors=10), "kNN")):
        print('=' * 80)
        print(name)
        results.append(benchmark(clf))

    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                           dual=False, tol=1e-3)))
        # Train SGD model
        results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                               penalty=penalty)))

    # Train SGD with Elastic Net penalty
    print('=' * 80)
    print("Elastic-Net penalty")
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty="elasticnet")))

    # Train NearestCentroid without threshold
    print('=' * 80)
    print("NearestCentroid (aka Rocchio classifier)")
    results.append(benchmark(NearestCentroid()))

    # Train sparse Naive Bayes classifiers
    print('=' * 80)
    print("Naive Bayes")
    results.append(benchmark(MultinomialNB(alpha=.01)))
    results.append(benchmark(BernoulliNB(alpha=.01)))

    class L1LinearSVC(LinearSVC):

        def fit(self, X, y):
            # The smaller C, the stronger the regularization.
            # The more regularization, the more sparsity.
            self.transformer_ = LinearSVC(penalty="l1", dual=False, tol=1e-3)
            X = self.transformer_.fit_transform(X, y)
            return LinearSVC.fit(self, X, y)

        def predict(self, X):
            X = self.transformer_.transform(X)
            return LinearSVC.predict(self, X)

    print('=' * 80)
    print("LinearSVC with L1-based feature selection")
    results.append(benchmark(L1LinearSVC()))

    # make some plots
    indices = np.arange(len(results))

    results = [[x[i] for x in results] for i in range(4)]

    clf_names, score, training_time, test_time = results
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)

    pl.figure(figsize=(12, 8))
    pl.title("Score")
    pl.barh(indices, score, .2, label="score", color='r')
    pl.barh(indices + .3, training_time, .2, label="training time", color='g')
    pl.barh(indices + .6, test_time, .2, label="test time", color='b')
    pl.yticks(())
    pl.legend(loc='best')
    pl.subplots_adjust(left=.25)
    pl.subplots_adjust(top=.95)
    pl.subplots_adjust(bottom=.05)

    for i, c in zip(indices, clf_names):
        pl.text(-.3, i, c)

    pl.show()
### Most real
sorted(zip(clf.coef_[0], feature_names), reverse=True)[:20]

### Most fake
sorted(zip(clf.coef_[0], feature_names))[:20]

# clearly there are certain words which might show political intent and source
# in the top fake features (such as the words corporate and establishment).
tokens_with_weights = sorted(list(zip(feature_names, clf.coef_[0])))
# print(tokens_with_weights)

# --------------------------------------------------------------
# HashingVectorizer: requires less memory and is faster (because it is sparse
# and uses hashes rather than tokens)
# --------------------------------------------------------------
hash_vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
hash_train = hash_vectorizer.fit_transform(X_train)
hash_test = hash_vectorizer.transform(X_test)

# --------------------------------------------------------------
# Naive Bayes classifier for Multinomial model
# --------------------------------------------------------------
clf = MultinomialNB(alpha=.01)
clf.fit(hash_train, y_train)
pred = clf.predict(hash_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)
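# A small illustrative sketch (not part of the original script): because
# HashingVectorizer keeps no vocabulary_, there are no feature names to inspect,
# which is why the coef_/feature_names analysis above only works for
# vocabulary-based vectorizers such as CountVectorizer or TfidfVectorizer.
from sklearn.feature_extraction.text import HashingVectorizer

hv = HashingVectorizer(n_features=2 ** 10)
X_demo = hv.transform(["fake news example", "real news example"])
print(X_demo.shape)                # (2, 1024): fixed width, no fitting needed
print(hasattr(hv, "vocabulary_"))  # False: hashed features cannot be mapped back to tokens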
        if word != ' ' and word not in stopwords:
            words.append(word)
        sentences.append(' '.join(words))
    return sentences


# Whatever vocabulary CountVectorizer used at training time must also be used at test time
# vectorizer = CountVectorizer()
# FLAG = 'countvectorizer'
# vectorizer = TfidfVectorizer()
# FLAG = 'tfidfvectorizer'
vectorizer = HashingVectorizer()
FLAG = 'hashingvectorizer'

reviews = pd.read_csv('./data/train.csv')
vectorizer.fit_transform(get_sentences(reviews.review.values, stopwords))

print('模型加载中...')  # loading model...
model = joblib.load('./lr_weibo_output/' + 'weibo_lr_' + FLAG + '_model.pkl')
print('模型加载结束...')  # model loaded


def predict():
    print('请输入文本:')  # please enter a text
    review = str(input())
    if review == 'exit':
        exit(0)
    else:
        try:
            sentences = get_sentences([review], stopwords)
            print(sentences)
            review_ids = vectorizer.transform(sentences)
# y.ravel()
# y = np.array([y])
# data_train_size_mb = size_mb(X_train.data)
# data_test_size_mb = size_mb(X_test.data)

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(bunch.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    X_train = vectorizer.fit_transform(bunch.data)
# y_train = y_train.reshape(y_train.shape[0],1)
duration = time() - t0
# print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
# y_train = np.array([y_train]).T
# y_train = y_train.reshape(y_train.shape[0],1)
np.transpose(X_train)
print("samples %d features %d" % X_train.shape)
print(X_train.shape)
# print(y_train.shape)

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(bunch.data)
# y_test = y_test.reshape(y_test.shape[0],1)
# y_test = np.array([y_test]).T
def vectorize(df_test):
    v = HashingVectorizer()
    train_vectors = v.fit_transform(df_test)
    return train_vectors
  (0, 3)    1
  (0, 15)   2
  (0, 4)    1
  (1, 5)    1
  (1, 9)    1
  (1, 2)    1
  (1, 6)    1
  (1, 14)   1
  (1, 3)    1
  (2, 1)    1
  (2, 0)    1
  (2, 12)   1
  (2, 7)    1
  (3, 10)   1
  (3, 8)    1
  (3, 11)   1
  (3, 18)   1
  (3, 17)   1
  (3, 13)   1
  (3, 5)    1
  (3, 6)    1
  (3, 15)   1
In the parentheses on the left, the first number is the document index and the
second is the word index (note that word indices are based on all documents);
the third number is the term frequency.
'''

# Use the hashing trick to reduce the dimensionality
vectorizer2 = HashingVectorizer(n_features=6, norm=None)
print(vectorizer2.fit_transform(corpus))
def main(args=None): args = args.split(' ') if isinstance(args, str) else args args = args or sys.argv[1:] import logging import numpy as np from optparse import OptionParser from time import time from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer from sklearn.feature_selection import SelectKBest, chi2 import dim_reduction as dr # Display progress logs on stdout logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') # parse commandline arguments op = OptionParser() op.add_option( "--chi2_select", default=-1, action="store", type="int", dest="select_chi2", help= "Select some number of features using a chi-squared test; all set -1") op.add_option('-f', "--filename", default="data.tsv", dest="fname", help="data filename") op.add_option("-d", "--dataset", default='news', dest="dataset", help="dataset to load (%s)" % list(_load_map.keys())) op.add_option('-n', "--n_features", action="store", type=int, default=1000, help="n_features when using the hashing vectorizer.") op.add_option("--use_hashing", default=False, action="store_true", help="Use a hashing vectorizer.") op.add_option("--hack", default=False, action="store_true", dest="hack", help="use test instead on train to speedup process") op.add_option("--no-text", default=True, action="store_false", dest="text", help="features are not text (default = text)") op.add_option("--class_sample", default=2, type=int, dest="n_sample_by_class", help="show only [%default%] sample by class") op.add_option("--lnob", default=True, action='store_true', dest='legend_outside_box', help="legend not outside of the box") op.add_option("--legend", default=False, action='store_true', dest='enable_legend_picking', help='set legend picking not points') op.add_option( "--noX", default=False, action='store_true', dest='nox', help= "if you just want to generate graph and don't have acess to the X server " ) op.add_option( "-m", "--methods", default=dr.METHODS, dest="methods", help="dimension reduction method to try (split by ','); default = %s" % dr.METHODS) op.add_option("-e", dest='exclude', default=None, help="exclude class (separarated by ,)") op.add_option("-o", dest='only', default=None, help="include only class (separarated by ,)") op.add_option("-v", dest='verbose', default=False, action='store_true', help="verbose") (opts, args) = op.parse_args(args) if len(args) > 0: op.error("this script takes no arguments.") sys.exit(1) if opts.nox: matplotlib.use('Agg') # warning: pylab should be import after call to matplotlib.use(...) 
import pylab

# load data
data_train, data_test, legend_labels = _load_map[opts.dataset](opts.fname)
if opts.hack:
    print("hack: working on test dataset")
    data_train = data_test
    opts.dataset += '_test'

if opts.verbose:
    print("----------example data loaded--------------")
    print("data:", data_train.data[0].strip())
    print("target:", data_train.target[0])
    print("-------------------------------------------")

y_train, y_test = data_train.target, data_test.target

data_train_size_mb = size_mb(data_train.data)
data_test_size_mb = size_mb(data_test.data)
print("%d documents - %0.3fMB (training set)" % (len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (len(data_test.data), data_test_size_mb))

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
if not opts.text:
    print("std features")
    X_train = np.array(data_train.data, ndmin=2)
    feature_names = data_train.features
else:
    # text features: the raw documents have to be vectorized first
    print("features are extracted from text -> word vectorization is required")
    if opts.use_hashing:
        print("Use feature hashing %s" % opts.n_features)
        vectorizer = HashingVectorizer(stop_words='english',
                                       non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
        # mapping from integer feature name to original token string
        feature_names = vectorizer.get_feature_names()

if opts.verbose:
    print("----------example data transformed--------------")
    print("data:", X_train[0])
    print("target:", y_train[0])
    print("-------------------------------------------")

duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test dataset using the same vectorizer")
t0 = time()
if not opts.text:
    X_test = np.array(data_test.data, ndmin=2)
else:
    X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

if opts.select_chi2 != -1:
    print("Extracting %d best features by a chi-squared test" % opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    print("data:", X_train[0])
    print("target:", y_train[0])
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    print("done in %fs" % (time() - t0))
    print()

X = X_train.todense() if "todense" in dir(X_train) else X_train
X_test = X_test.todense() if "todense" in dir(X_test) else X_test
print("data shape: (%i,%i)" % X.shape)

if opts.only:
    idx = opts.only.split(',')
    X, y_train = filter_classes(X, y_train, idx, False)
    X_test, y_test = filter_classes(X_test, y_test, idx, False)
if opts.exclude:
    idx = opts.exclude.split(',')
    X, y_train = filter_classes(X, y_train, idx, True)
    X_test, y_test = filter_classes(X_test, y_test, idx, True)

# run every requested dimensionality-reduction method
for method in opts.methods.split(','):
    t0 = time()
    try:
        resdr = dr.dim_reduce(method, X=X, Y=y_train)
        if resdr is None:
            continue
        trans, X_trans, title = resdr
        print('Projecting {} on test set'.format(method))
        if hasattr(trans, "transform"):
            X_trans_test = trans.transform(X_test)
        elif hasattr(trans, "fit_transform"):
            warnings.warn("the method has no transform (falling back to fit_transform)", Warning)
            X_trans_test = trans.fit_transform(X_test)
        title = "%s (time %.2fs)" % (title, (time() - t0))
        print('Rendering plot {}'.format(title))
        has_plot = dr.plot_embedding(X=X_trans_test,
                                     Y=y_test,
                                     title=title,
                                     n_sample_by_class=opts.n_sample_by_class,
                                     source=data_test.data,
                                     legend_outside_box=opts.legend_outside_box,
                                     enable_legend_picking=opts.enable_legend_picking,
                                     legend_labels=legend_labels)
        if has_plot:
            fname = "%s_%s.png" % (opts.dataset, method)
            print("saving %s" % fname)
            pylab.savefig(fname, bbox_inches=0)
        else:
            print('Nothing to plot.')
    except Exception as ex:
        print(method, ex)
        print(traceback.format_exc())

pylab.show()
]
newsgroup_train = fetch_20newsgroups(subset='train', categories=categories)

# print category names
from pprint import pprint
pprint(list(newsgroup_train.target_names))

# newsgroup_train.data holds the original documents; we still need to extract
# feature vectors in order to model the text data
from sklearn.feature_extraction.text import HashingVectorizer
vectorizer = HashingVectorizer(stop_words='english',
                               non_negative=True,
                               n_features=10000)
fea_train = vectorizer.fit_transform(newsgroup_train.data)
fea_test = vectorizer.fit_transform(newsgroups_test.data)

# return feature vector 'fea_train' [n_samples, n_features]
print 'Size of fea_train:' + repr(fea_train.shape)
print 'Size of fea_test:' + repr(fea_test.shape)
# 11314 documents, 130107 vectors for all categories
print 'The average feature sparsity is {0:.3f}%'.format(
    fea_train.nnz / float(fea_train.shape[0] * fea_train.shape[1]) * 100)

# ----------------------------------------------------
# method 1: CountVectorizer + TfidfTransformer
print '*************************\nCountVectorizer+TfidfTransformer\n*************************'
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
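The snippet above breaks off right after building count_v1. As a hedged sketch of where that path usually goes (not the original author's continuation), the vocabulary is learned on the training split only, the same vectorizer transforms the test split, and the TfidfTransformer's IDF statistics also come from the training counts:

# Hedged sketch of the CountVectorizer + TfidfTransformer path; it fetches all
# 20newsgroups categories because the original `categories` list is truncated above.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
counts_train = count_v1.fit_transform(train.data)   # vocabulary learned on the training set only
counts_test = count_v1.transform(test.data)         # same columns for the test set

tfidf = TfidfTransformer()
tfidf_train = tfidf.fit_transform(counts_train)     # IDF statistics come from the training counts
tfidf_test = tfidf.transform(counts_test)
print('Size of tfidf_train:' + repr(tfidf_train.shape))
print('Size of tfidf_test:' + repr(tfidf_test.shape))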
    u'连衣裙': 12,   # dress
    u'其它': 13      # other
}
# column names: u'二级类目' = secondary category (label), u'产品标题' = product title (text)
data[u'二级类目'] = data[u'二级类目'].map(secondtype_mapping)
data = data.fillna(0)   # fillna returns a new frame, so assign it back

# print(vectorizer.fit_transform(data[u'产品标题']))
print(vectorizer.fit_transform(data[u'产品标题']).toarray())
# print(vectorizer.get_feature_names())

from sklearn.feature_extraction.text import HashingVectorizer
vectorizer2 = HashingVectorizer(n_features=100, norm=None)
data_Y = data[u'二级类目']
# data_X = vectorizer.fit_transform(data[u'产品标题']).toarray()
data_X = vectorizer2.fit_transform(data[u'产品标题']).toarray()
'''
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import learning_curve
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import recall_score
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer


def func():
    hv = HashingVectorizer()
    # np.nan is not a valid document, so this call raises:
    # scikit-learn expects byte or unicode strings
    hv.fit_transform(['hello world', np.nan, 'hello hello'])
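If the intent is to vectorize a column that may contain missing values, one option is to filter out non-string entries first. A minimal sketch; clean_docs is an illustrative helper, not part of the snippet above:

# Hedged sketch: drop NaN / non-string entries before hashing.
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer


def clean_docs(docs):
    # keep only real strings; NaN and other non-text entries are skipped
    return [d for d in docs if isinstance(d, str)]


hv = HashingVectorizer()
X = hv.fit_transform(clean_docs(['hello world', np.nan, 'hello hello']))
print(X.shape)  # (2, 1048576): the NaN document was dropped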
if __name__ == '__main__':
    use_hashing = True
    select_chi2 = True

    X_train, X_test, y_train, y_test, target_names = get_data()
    print("%d rows: " % len(y_train) + "\n")
    print("%d features:" % len(X_train[0]) + "\n")

    if use_hashing:
        vectorizer = HashingVectorizer(stop_words='english',
                                       non_negative=True,
                                       n_features=400)
        X_train = vectorizer.transform(X_train)
        X_test = vectorizer.transform(X_test)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)  # reuse the vocabulary fitted on the training set

    # feature selection
    if select_chi2:
        ch2 = SelectKBest(chi2, k=30)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)

    # classify, train model
    mnb = MultinomialNB(alpha=1)
    mnb.fit(X_train, y_train)
    mnb_result = mnb.predict(X_test)
    scoremnb = metrics.accuracy_score(y_test, mnb_result)

    nb = multiNBayes.multiNByes(X_train.toarray(), y_train)
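The same hashing, chi-squared selection, and multinomial naive Bayes steps can also be chained in a single Pipeline, which makes it impossible to accidentally refit on the test split. A hedged sketch with a toy corpus standing in for get_data(), and alternate_sign=False standing in for the older non_negative=True:

# Hedged sketch: hashing -> chi2 -> MultinomialNB as one Pipeline.
# Note: with such a tiny corpus most hash buckets stay empty, so chi2 may emit
# a harmless divide warning for the all-zero columns.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB

X_train = ["good movie", "bad movie", "great film", "terrible film"]
y_train = [1, 0, 1, 0]

pipe = Pipeline([
    ("hash", HashingVectorizer(stop_words='english',
                               alternate_sign=False,  # non-negative counts for chi2 / NB
                               n_features=400)),
    ("chi2", SelectKBest(chi2, k=30)),
    ("nb", MultinomialNB(alpha=1.0)),
])
pipe.fit(X_train, y_train)
print(pipe.predict(["great movie"]))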
file = file.drop("label", axis=1)
"""
Chooses random rows in the dataset to be training and testing data.
test_size sets what portion of the data becomes test data.
random_state seeds the random number generator so the split is reproducible.
"""
X_train, X_test, y_train, y_test = train_test_split(file['text'], y,
                                                    test_size=0.4,
                                                    random_state=53)

# Stores tokens as numerical indexes
hash_vect = HashingVectorizer(stop_words='english', non_negative=True)
# hash the training documents; the same (stateless) vectorizer transforms the test set
hash_train = hash_vect.fit_transform(X_train)
hash_test = hash_vect.transform(X_test)

# Creates instance of passive aggressive classifier
classifier = PassiveAggressiveClassifier()
# fit classifier onto training data
classifier.fit(hash_train, y_train)
# using 'learned' features from training data, predicts whether news is fake or real
prediction = classifier.predict(hash_test)
accuracy = accuracy_score(y_test, prediction) * 100

# print out total accuracy of classifier
print("The accuracy is %0.5f" % accuracy + " percent.")

# creates confusion matrix
matrix = confusion_matrix(y_test, prediction, labels=['FAKE', 'REAL'])
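Beyond a single accuracy number, per-class precision and recall show which of FAKE and REAL the classifier actually confuses. A hedged, self-contained sketch with toy label arrays standing in for the snippet's y_test and prediction:

# Hedged sketch: per-class metrics and an explicitly labeled confusion matrix;
# y_true / y_pred are toy stand-ins for the arrays produced above.
from sklearn.metrics import classification_report, confusion_matrix

y_true = ['FAKE', 'REAL', 'REAL', 'FAKE', 'REAL']
y_pred = ['FAKE', 'REAL', 'FAKE', 'FAKE', 'REAL']

print(classification_report(y_true, y_pred, labels=['FAKE', 'REAL']))
cm = confusion_matrix(y_true, y_pred, labels=['FAKE', 'REAL'])
print("true FAKE -> predicted FAKE: %d, -> REAL: %d" % (cm[0, 0], cm[0, 1]))
print("true REAL -> predicted FAKE: %d, -> REAL: %d" % (cm[1, 0], cm[1, 1]))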
def create_sentence_vectors(self):
    vectorizer = HashingVectorizer(norm=None, n_features=17)
    return vectorizer.fit_transform(self.formatted_article_text).toarray()
f = open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'snippetCollection_text.pkl'), 'rb')
snippetCollection_text = pickle_zloads(f.read())
f.close()
'''
from nltk.stem.snowball import SnowballStemmer
import re
from sklearn.feature_extraction.text import HashingVectorizer
import numpy
import sys


def stemTokenize(doc):
    stemmer = SnowballStemmer('english')
    return [stemmer.stem(word) for word in re.findall(r'\b\w+\b', doc)]


vectorizer = HashingVectorizer(tokenizer=stemTokenize,
                               ngram_range=(1, 3),
                               token_pattern=r'\b\w+\b',   # ignored when a custom tokenizer is given
                               stop_words='english',
                               binary=False,
                               norm='l2',
                               n_features=2**19)
trainedVectorArray = vectorizer.fit_transform(snippetCollection_text)
anchorVector = vectorizer.transform([sys.argv[1]]).toarray()

# rows are L2-normalized, so this dot product is the cosine similarity
# between the query and every snippet
distances = (anchorVector * trainedVectorArray.T)[0]
nonzeroIndices = numpy.nonzero(distances)[0]
sortedIndices = nonzeroIndices[numpy.argsort(distances[nonzeroIndices])][::-1]
for i in sortedIndices[:int(sys.argv[2])]:
    print(snippetCollection_text[i] + '\n')
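Because the hashed vectors are L2-normalized, the ranking above can be written more explicitly with scikit-learn's linear_kernel, which keeps the matrices sparse until the final score vector. A hedged sketch with a toy corpus standing in for snippetCollection_text:

# Hedged sketch: cosine-similarity ranking via linear_kernel; the corpus and
# query are illustrative, not the pickled snippet collection.
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics.pairwise import linear_kernel

corpus = ["hashing tricks for text", "cosine similarity of documents",
          "stemming and tokenization", "similarity search over snippets"]
query = "document similarity"

vect = HashingVectorizer(ngram_range=(1, 3), stop_words='english',
                         norm='l2', n_features=2 ** 19)
X = vect.transform(corpus)
q = vect.transform([query])

scores = linear_kernel(q, X).ravel()   # equals cosine similarity: rows are L2-normalized
top = np.argsort(scores)[::-1][:2]     # indices of the two best matches
for i in top:
    print("%.3f  %s" % (scores[i], corpus[i]))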
import numpy as np
import pickle
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

data = pickle.load(open('sklearn-data.pickle', 'rb'))
x_train = data["x_train"]
y_train = data["y_train"]
x_test = data["x_test"]
y_test = data["y_test"]

##### Vectorizing data for sklearn
vectorizer = HashingVectorizer(stop_words="english",
                               lowercase=True,
                               binary=True,
                               n_features=2**18)
x_train_hash = vectorizer.fit_transform(x_train)
x_test_hash = vectorizer.transform(x_test)

classifier_DT = DecisionTreeClassifier()
classifier_DT.fit(x_train_hash, y_train)
y_DT = classifier_DT.predict(x_test_hash)

acc_DT = accuracy_score(y_test, y_DT)
print("\nDecision tree accuracy: ", round(acc_DT, 4) * 100, "%")
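HashingVectorizer keeps no fitted state, so calling transform on the test split is enough and makes the intent clear. As a hedged aside (not the original author's choice), a linear model is a common alternative baseline for this kind of very high-dimensional sparse binary input; a self-contained sketch with toy reviews standing in for the pickled data:

# Hedged sketch: LogisticRegression on the same hashed, binary features as above;
# the toy reviews stand in for the pickled x_train / x_test arrays.
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

x_train = ["loved this film", "awful and boring", "great acting", "terrible plot"]
y_train = [1, 0, 1, 0]
x_test = ["boring film", "great plot"]
y_test = [0, 1]

vectorizer = HashingVectorizer(stop_words="english", lowercase=True,
                               binary=True, n_features=2 ** 18)
X_train = vectorizer.transform(x_train)   # transform is enough: the vectorizer is stateless
X_test = vectorizer.transform(x_test)

clf = LogisticRegression()
clf.fit(X_train, y_train)
print("accuracy:", accuracy_score(y_test, clf.predict(X_test)))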
def predict_and_cluster(opts, mode):
    n_digits = 3
    # n_samples, n_features = (25, 1927)
    n_samples, n_features = (25, 491)
    labels = array([0, 1, 2, 1, 1, 2, 2, 1, 2, 0, 0, 0, 1,
                    1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1])
    true_k = np.unique(labels).shape[0]
    corpus, news = jieba_tokenizer()

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        if opts.use_idf:
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       non_negative=True,
                                       norm=None,
                                       binary=False)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           non_negative=False,
                                           norm='l2',
                                           binary=False)
    else:
        vectorizer = TfidfVectorizer(max_df=0.5,
                                     max_features=opts.n_features,
                                     min_df=2,
                                     stop_words='english',
                                     use_idf=opts.use_idf)
    X = vectorizer.fit_transform(corpus)
    print("done in %fs" % (time() - t0))
    # n_samples: how many articles there are
    # n_features: how many different words appear across all articles
    print("n_samples: %d, n_features: %d" % X.shape)
    print()

    if opts.n_components:
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        # Vectorizer results are normalized, which makes KMeans behave as
        # spherical k-means for better results. Since LSA/SVD results are
        # not normalized, we have to redo the normalization.
        svd = TruncatedSVD(opts.n_components)
        lsa = make_pipeline(svd, Normalizer(copy=False))
        X = lsa.fit_transform(X)
        print("done in %fs" % (time() - t0))

        svd = TruncatedSVD().fit(X)
        X_proj = svd.transform(X)
        explained_variances = np.var(X_proj, axis=0) / np.var(X, axis=0).sum()
        print("Explained variance of the SVD step: {}%".format(
            int(explained_variances[0] * 100)))
        print()

    # =================================================
    # KMeans clustering
    # if opts.minibatch:
    #     km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
    #                          init_size=1000, batch_size=1000, verbose=True)
    # else:
    print('*' * 80)
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100,
                n_init=1, verbose=True)  # always better
    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, sample_size=None))
    print("labels ", labels)
    print("my_labels ", km.labels_)

    if not (opts.n_components or opts.use_hashing):
        print("Top terms per cluster:")
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()

    for i in range(len(news)):
        news[i].category = labels[i]

    from sklearn.metrics.pairwise import cosine_similarity
    FG = nx.Graph()
    for i in range(len(news)):
        news[i].similarity = cosine_similarity(X[i:i + 1], X)[0]
        cs = news[i].similarity
        # print(cs)
        for j in range(len(news)):
            if i != j:
                FG.add_weighted_edges_from([(i, j, cs[j])])

    print()
    print('*' * 80)
    print(X.shape[0])
    print(X.shape)
    gmm(X)
    print()
    print('*' * 80)
    best_part(FG)
    print()
    print('*' * 80)
def numberize_hash(filename, number_of_features):
    vectorizer = HashingVectorizer(n_features=number_of_features)
    return vectorizer, vectorizer.fit_transform(read_file(filename))
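A hedged usage sketch: read_file and the file names are assumptions carried over from the snippet above, and the point is that the returned vectorizer maps any later file into the same hashed feature space, because HashingVectorizer keeps no fitted vocabulary.

# Hedged sketch: 'corpus.txt' and 'new_documents.txt' are illustrative names.
vectorizer, X = numberize_hash('corpus.txt', number_of_features=2 ** 16)
X_new = vectorizer.transform(read_file('new_documents.txt'))
print(X.shape, X_new.shape)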